diff --git "a/checkpoints/trainer_state.json" "b/checkpoints/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoints/trainer_state.json" @@ -0,0 +1,39113 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 5520, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00018115942028985507, + "grad_norm": 155.45926107494319, + "learning_rate": 6.024096385542168e-09, + "loss": 3.2778, + "step": 1 + }, + { + "epoch": 0.00036231884057971015, + "grad_norm": 146.53525241554215, + "learning_rate": 1.2048192771084337e-08, + "loss": 2.96, + "step": 2 + }, + { + "epoch": 0.0005434782608695652, + "grad_norm": 166.08084514994493, + "learning_rate": 1.8072289156626504e-08, + "loss": 3.2065, + "step": 3 + }, + { + "epoch": 0.0007246376811594203, + "grad_norm": 167.59524060451398, + "learning_rate": 2.4096385542168673e-08, + "loss": 3.436, + "step": 4 + }, + { + "epoch": 0.0009057971014492754, + "grad_norm": 162.79759647875963, + "learning_rate": 3.012048192771084e-08, + "loss": 3.1548, + "step": 5 + }, + { + "epoch": 0.0010869565217391304, + "grad_norm": 155.23379157235624, + "learning_rate": 3.614457831325301e-08, + "loss": 3.3721, + "step": 6 + }, + { + "epoch": 0.0012681159420289854, + "grad_norm": 165.5495705406923, + "learning_rate": 4.2168674698795174e-08, + "loss": 3.2246, + "step": 7 + }, + { + "epoch": 0.0014492753623188406, + "grad_norm": 156.2303696546355, + "learning_rate": 4.8192771084337347e-08, + "loss": 3.3623, + "step": 8 + }, + { + "epoch": 0.0016304347826086956, + "grad_norm": 161.74945993469353, + "learning_rate": 5.421686746987952e-08, + "loss": 3.2163, + "step": 9 + }, + { + "epoch": 0.0018115942028985507, + "grad_norm": 155.0623773701399, + "learning_rate": 6.024096385542168e-08, + "loss": 3.4561, + "step": 10 + }, + { + "epoch": 0.0019927536231884057, + "grad_norm": 161.30079568597247, + "learning_rate": 6.626506024096386e-08, + "loss": 3.6455, + "step": 11 + }, + { + "epoch": 0.002173913043478261, + "grad_norm": 156.25269673838352, + "learning_rate": 7.228915662650602e-08, + "loss": 3.2124, + "step": 12 + }, + { + "epoch": 0.002355072463768116, + "grad_norm": 162.03906983237863, + "learning_rate": 7.83132530120482e-08, + "loss": 3.3145, + "step": 13 + }, + { + "epoch": 0.002536231884057971, + "grad_norm": 166.9528940623657, + "learning_rate": 8.433734939759035e-08, + "loss": 3.2222, + "step": 14 + }, + { + "epoch": 0.002717391304347826, + "grad_norm": 167.9591489096079, + "learning_rate": 9.036144578313253e-08, + "loss": 3.105, + "step": 15 + }, + { + "epoch": 0.002898550724637681, + "grad_norm": 168.23589443984898, + "learning_rate": 9.638554216867469e-08, + "loss": 2.9004, + "step": 16 + }, + { + "epoch": 0.0030797101449275364, + "grad_norm": 165.85795575431166, + "learning_rate": 1.0240963855421686e-07, + "loss": 3.2441, + "step": 17 + }, + { + "epoch": 0.003260869565217391, + "grad_norm": 165.04155349261808, + "learning_rate": 1.0843373493975904e-07, + "loss": 3.0679, + "step": 18 + }, + { + "epoch": 0.0034420289855072463, + "grad_norm": 170.34587941618398, + "learning_rate": 1.1445783132530119e-07, + "loss": 3.3999, + "step": 19 + }, + { + "epoch": 0.0036231884057971015, + "grad_norm": 160.2615797007729, + "learning_rate": 1.2048192771084337e-07, + "loss": 2.8306, + "step": 20 + }, + { + "epoch": 0.0038043478260869567, + "grad_norm": 162.039582212009, + "learning_rate": 1.2650602409638554e-07, + "loss": 2.9673, + "step": 21 + }, + { + "epoch": 0.003985507246376811, + "grad_norm": 176.08714589279865, + "learning_rate": 1.3253012048192773e-07, + "loss": 3.3125, + "step": 22 + }, + { + "epoch": 0.004166666666666667, + "grad_norm": 174.85115283084608, + "learning_rate": 1.3855421686746987e-07, + "loss": 3.0752, + "step": 23 + }, + { + "epoch": 0.004347826086956522, + "grad_norm": 191.75937464009905, + "learning_rate": 1.4457831325301203e-07, + "loss": 3.3062, + "step": 24 + }, + { + "epoch": 0.004528985507246377, + "grad_norm": 180.97084764424042, + "learning_rate": 1.5060240963855423e-07, + "loss": 2.9961, + "step": 25 + }, + { + "epoch": 0.004710144927536232, + "grad_norm": 186.813096280298, + "learning_rate": 1.566265060240964e-07, + "loss": 2.9917, + "step": 26 + }, + { + "epoch": 0.004891304347826087, + "grad_norm": 171.4489308986305, + "learning_rate": 1.6265060240963853e-07, + "loss": 2.645, + "step": 27 + }, + { + "epoch": 0.005072463768115942, + "grad_norm": 169.187919981961, + "learning_rate": 1.686746987951807e-07, + "loss": 2.7319, + "step": 28 + }, + { + "epoch": 0.005253623188405797, + "grad_norm": 171.130930708941, + "learning_rate": 1.746987951807229e-07, + "loss": 2.5078, + "step": 29 + }, + { + "epoch": 0.005434782608695652, + "grad_norm": 164.74902247102506, + "learning_rate": 1.8072289156626505e-07, + "loss": 2.4355, + "step": 30 + }, + { + "epoch": 0.005615942028985507, + "grad_norm": 172.31334333571056, + "learning_rate": 1.8674698795180722e-07, + "loss": 2.478, + "step": 31 + }, + { + "epoch": 0.005797101449275362, + "grad_norm": 149.52183581815365, + "learning_rate": 1.9277108433734939e-07, + "loss": 2.4834, + "step": 32 + }, + { + "epoch": 0.005978260869565218, + "grad_norm": 173.68692872709474, + "learning_rate": 1.9879518072289155e-07, + "loss": 2.5942, + "step": 33 + }, + { + "epoch": 0.006159420289855073, + "grad_norm": 171.72660128591411, + "learning_rate": 2.0481927710843372e-07, + "loss": 2.5977, + "step": 34 + }, + { + "epoch": 0.006340579710144928, + "grad_norm": 177.23977883062634, + "learning_rate": 2.108433734939759e-07, + "loss": 2.666, + "step": 35 + }, + { + "epoch": 0.006521739130434782, + "grad_norm": 191.88320532811127, + "learning_rate": 2.1686746987951808e-07, + "loss": 2.9033, + "step": 36 + }, + { + "epoch": 0.0067028985507246374, + "grad_norm": 170.88276296688394, + "learning_rate": 2.2289156626506022e-07, + "loss": 2.6719, + "step": 37 + }, + { + "epoch": 0.006884057971014493, + "grad_norm": 149.19705339355804, + "learning_rate": 2.2891566265060238e-07, + "loss": 1.8291, + "step": 38 + }, + { + "epoch": 0.007065217391304348, + "grad_norm": 153.45683505992974, + "learning_rate": 2.3493975903614457e-07, + "loss": 1.7681, + "step": 39 + }, + { + "epoch": 0.007246376811594203, + "grad_norm": 149.0156445006739, + "learning_rate": 2.4096385542168674e-07, + "loss": 1.6543, + "step": 40 + }, + { + "epoch": 0.007427536231884058, + "grad_norm": 135.04326091666607, + "learning_rate": 2.469879518072289e-07, + "loss": 1.5229, + "step": 41 + }, + { + "epoch": 0.007608695652173913, + "grad_norm": 133.04893540033066, + "learning_rate": 2.5301204819277107e-07, + "loss": 1.4431, + "step": 42 + }, + { + "epoch": 0.0077898550724637685, + "grad_norm": 132.1471804123453, + "learning_rate": 2.5903614457831324e-07, + "loss": 1.4153, + "step": 43 + }, + { + "epoch": 0.007971014492753623, + "grad_norm": 131.80069966825823, + "learning_rate": 2.6506024096385546e-07, + "loss": 1.5205, + "step": 44 + }, + { + "epoch": 0.008152173913043478, + "grad_norm": 138.90404312299518, + "learning_rate": 2.7108433734939757e-07, + "loss": 1.4946, + "step": 45 + }, + { + "epoch": 0.008333333333333333, + "grad_norm": 126.72977971234664, + "learning_rate": 2.7710843373493974e-07, + "loss": 1.4854, + "step": 46 + }, + { + "epoch": 0.008514492753623188, + "grad_norm": 133.6903705502816, + "learning_rate": 2.8313253012048195e-07, + "loss": 1.4331, + "step": 47 + }, + { + "epoch": 0.008695652173913044, + "grad_norm": 107.52537178733128, + "learning_rate": 2.8915662650602407e-07, + "loss": 1.2329, + "step": 48 + }, + { + "epoch": 0.008876811594202899, + "grad_norm": 116.93755567385571, + "learning_rate": 2.9518072289156623e-07, + "loss": 1.2612, + "step": 49 + }, + { + "epoch": 0.009057971014492754, + "grad_norm": 109.63231545954613, + "learning_rate": 3.0120481927710845e-07, + "loss": 1.2361, + "step": 50 + }, + { + "epoch": 0.00923913043478261, + "grad_norm": 79.48921062100135, + "learning_rate": 3.0722891566265056e-07, + "loss": 1.0171, + "step": 51 + }, + { + "epoch": 0.009420289855072464, + "grad_norm": 38.81750692142498, + "learning_rate": 3.132530120481928e-07, + "loss": 0.8708, + "step": 52 + }, + { + "epoch": 0.00960144927536232, + "grad_norm": 68.41113509995378, + "learning_rate": 3.192771084337349e-07, + "loss": 1.0142, + "step": 53 + }, + { + "epoch": 0.009782608695652175, + "grad_norm": 69.43876122106217, + "learning_rate": 3.2530120481927706e-07, + "loss": 0.9929, + "step": 54 + }, + { + "epoch": 0.009963768115942028, + "grad_norm": 72.7311351452876, + "learning_rate": 3.313253012048193e-07, + "loss": 1.0474, + "step": 55 + }, + { + "epoch": 0.010144927536231883, + "grad_norm": 67.84585170203647, + "learning_rate": 3.373493975903614e-07, + "loss": 1.0488, + "step": 56 + }, + { + "epoch": 0.010326086956521738, + "grad_norm": 55.83187351478142, + "learning_rate": 3.433734939759036e-07, + "loss": 0.8875, + "step": 57 + }, + { + "epoch": 0.010507246376811594, + "grad_norm": 67.99864051190457, + "learning_rate": 3.493975903614458e-07, + "loss": 1.0031, + "step": 58 + }, + { + "epoch": 0.010688405797101449, + "grad_norm": 54.07576863846796, + "learning_rate": 3.554216867469879e-07, + "loss": 0.8564, + "step": 59 + }, + { + "epoch": 0.010869565217391304, + "grad_norm": 52.809527375230566, + "learning_rate": 3.614457831325301e-07, + "loss": 0.7832, + "step": 60 + }, + { + "epoch": 0.01105072463768116, + "grad_norm": 39.022030058357664, + "learning_rate": 3.674698795180723e-07, + "loss": 0.7292, + "step": 61 + }, + { + "epoch": 0.011231884057971014, + "grad_norm": 34.14652534748319, + "learning_rate": 3.7349397590361444e-07, + "loss": 0.7489, + "step": 62 + }, + { + "epoch": 0.01141304347826087, + "grad_norm": 18.239066099044848, + "learning_rate": 3.795180722891566e-07, + "loss": 0.692, + "step": 63 + }, + { + "epoch": 0.011594202898550725, + "grad_norm": 13.40286627581432, + "learning_rate": 3.8554216867469877e-07, + "loss": 0.6915, + "step": 64 + }, + { + "epoch": 0.01177536231884058, + "grad_norm": 19.66059978230837, + "learning_rate": 3.9156626506024094e-07, + "loss": 0.7246, + "step": 65 + }, + { + "epoch": 0.011956521739130435, + "grad_norm": 13.357821616072009, + "learning_rate": 3.975903614457831e-07, + "loss": 0.676, + "step": 66 + }, + { + "epoch": 0.01213768115942029, + "grad_norm": 23.978021954471387, + "learning_rate": 4.036144578313253e-07, + "loss": 0.6628, + "step": 67 + }, + { + "epoch": 0.012318840579710146, + "grad_norm": 34.6756530637303, + "learning_rate": 4.0963855421686744e-07, + "loss": 0.6841, + "step": 68 + }, + { + "epoch": 0.0125, + "grad_norm": 28.61333355137964, + "learning_rate": 4.156626506024096e-07, + "loss": 0.6216, + "step": 69 + }, + { + "epoch": 0.012681159420289856, + "grad_norm": 26.677689329855557, + "learning_rate": 4.216867469879518e-07, + "loss": 0.6619, + "step": 70 + }, + { + "epoch": 0.01286231884057971, + "grad_norm": 22.19223137338174, + "learning_rate": 4.2771084337349393e-07, + "loss": 0.6655, + "step": 71 + }, + { + "epoch": 0.013043478260869565, + "grad_norm": 21.79590029398801, + "learning_rate": 4.3373493975903615e-07, + "loss": 0.6466, + "step": 72 + }, + { + "epoch": 0.01322463768115942, + "grad_norm": 9.856574571433779, + "learning_rate": 4.3975903614457827e-07, + "loss": 0.649, + "step": 73 + }, + { + "epoch": 0.013405797101449275, + "grad_norm": 9.445511619298856, + "learning_rate": 4.4578313253012043e-07, + "loss": 0.6648, + "step": 74 + }, + { + "epoch": 0.01358695652173913, + "grad_norm": 10.053471572311627, + "learning_rate": 4.5180722891566265e-07, + "loss": 0.6207, + "step": 75 + }, + { + "epoch": 0.013768115942028985, + "grad_norm": 9.903442777888861, + "learning_rate": 4.5783132530120476e-07, + "loss": 0.6351, + "step": 76 + }, + { + "epoch": 0.01394927536231884, + "grad_norm": 8.29476593406457, + "learning_rate": 4.63855421686747e-07, + "loss": 0.5902, + "step": 77 + }, + { + "epoch": 0.014130434782608696, + "grad_norm": 13.48145931778345, + "learning_rate": 4.6987951807228915e-07, + "loss": 0.5927, + "step": 78 + }, + { + "epoch": 0.01431159420289855, + "grad_norm": 8.231585996611404, + "learning_rate": 4.7590361445783126e-07, + "loss": 0.5648, + "step": 79 + }, + { + "epoch": 0.014492753623188406, + "grad_norm": 9.1626932531575, + "learning_rate": 4.819277108433735e-07, + "loss": 0.5475, + "step": 80 + }, + { + "epoch": 0.014673913043478261, + "grad_norm": 10.168836300766785, + "learning_rate": 4.879518072289156e-07, + "loss": 0.5139, + "step": 81 + }, + { + "epoch": 0.014855072463768116, + "grad_norm": 9.158146838509008, + "learning_rate": 4.939759036144578e-07, + "loss": 0.5251, + "step": 82 + }, + { + "epoch": 0.015036231884057972, + "grad_norm": 7.381762153704901, + "learning_rate": 5e-07, + "loss": 0.6141, + "step": 83 + }, + { + "epoch": 0.015217391304347827, + "grad_norm": 6.28495104475644, + "learning_rate": 5.060240963855421e-07, + "loss": 0.5575, + "step": 84 + }, + { + "epoch": 0.015398550724637682, + "grad_norm": 6.046197754805433, + "learning_rate": 5.120481927710843e-07, + "loss": 0.5264, + "step": 85 + }, + { + "epoch": 0.015579710144927537, + "grad_norm": 11.499601951009717, + "learning_rate": 5.180722891566265e-07, + "loss": 0.4857, + "step": 86 + }, + { + "epoch": 0.01576086956521739, + "grad_norm": 7.747865070902684, + "learning_rate": 5.240963855421686e-07, + "loss": 0.5394, + "step": 87 + }, + { + "epoch": 0.015942028985507246, + "grad_norm": 6.741894867585539, + "learning_rate": 5.301204819277109e-07, + "loss": 0.5691, + "step": 88 + }, + { + "epoch": 0.0161231884057971, + "grad_norm": 16.55724194012545, + "learning_rate": 5.36144578313253e-07, + "loss": 0.6082, + "step": 89 + }, + { + "epoch": 0.016304347826086956, + "grad_norm": 7.216912783357828, + "learning_rate": 5.421686746987951e-07, + "loss": 0.558, + "step": 90 + }, + { + "epoch": 0.01648550724637681, + "grad_norm": 6.683798937927288, + "learning_rate": 5.481927710843374e-07, + "loss": 0.5511, + "step": 91 + }, + { + "epoch": 0.016666666666666666, + "grad_norm": 6.3427743897580715, + "learning_rate": 5.542168674698795e-07, + "loss": 0.5498, + "step": 92 + }, + { + "epoch": 0.01684782608695652, + "grad_norm": 7.035954184887222, + "learning_rate": 5.602409638554216e-07, + "loss": 0.5524, + "step": 93 + }, + { + "epoch": 0.017028985507246377, + "grad_norm": 8.266621192800468, + "learning_rate": 5.662650602409639e-07, + "loss": 0.522, + "step": 94 + }, + { + "epoch": 0.017210144927536232, + "grad_norm": 7.569481363939274, + "learning_rate": 5.72289156626506e-07, + "loss": 0.5659, + "step": 95 + }, + { + "epoch": 0.017391304347826087, + "grad_norm": 23.300962393338285, + "learning_rate": 5.783132530120481e-07, + "loss": 0.5211, + "step": 96 + }, + { + "epoch": 0.017572463768115942, + "grad_norm": 7.741231260250192, + "learning_rate": 5.843373493975904e-07, + "loss": 0.5302, + "step": 97 + }, + { + "epoch": 0.017753623188405798, + "grad_norm": 9.211361854353234, + "learning_rate": 5.903614457831325e-07, + "loss": 0.5004, + "step": 98 + }, + { + "epoch": 0.017934782608695653, + "grad_norm": 8.848024831725077, + "learning_rate": 5.963855421686746e-07, + "loss": 0.5242, + "step": 99 + }, + { + "epoch": 0.018115942028985508, + "grad_norm": 9.209307129604856, + "learning_rate": 6.024096385542169e-07, + "loss": 0.4996, + "step": 100 + }, + { + "epoch": 0.018115942028985508, + "eval_loss": 0.5314062237739563, + "eval_runtime": 9.6882, + "eval_samples_per_second": 51.609, + "eval_steps_per_second": 0.103, + "step": 100 + }, + { + "epoch": 0.018297101449275363, + "grad_norm": 9.374402909979962, + "learning_rate": 6.084337349397591e-07, + "loss": 0.4619, + "step": 101 + }, + { + "epoch": 0.01847826086956522, + "grad_norm": 18.94663963081601, + "learning_rate": 6.144578313253011e-07, + "loss": 0.5286, + "step": 102 + }, + { + "epoch": 0.018659420289855073, + "grad_norm": 11.927461303799193, + "learning_rate": 6.204819277108434e-07, + "loss": 0.5277, + "step": 103 + }, + { + "epoch": 0.01884057971014493, + "grad_norm": 11.418326931268298, + "learning_rate": 6.265060240963856e-07, + "loss": 0.5803, + "step": 104 + }, + { + "epoch": 0.019021739130434784, + "grad_norm": 7.289335842869234, + "learning_rate": 6.325301204819276e-07, + "loss": 0.4612, + "step": 105 + }, + { + "epoch": 0.01920289855072464, + "grad_norm": 13.189116520311075, + "learning_rate": 6.385542168674698e-07, + "loss": 0.5441, + "step": 106 + }, + { + "epoch": 0.019384057971014494, + "grad_norm": 11.451763312237627, + "learning_rate": 6.445783132530121e-07, + "loss": 0.5208, + "step": 107 + }, + { + "epoch": 0.01956521739130435, + "grad_norm": 6.2912922072094535, + "learning_rate": 6.506024096385541e-07, + "loss": 0.5115, + "step": 108 + }, + { + "epoch": 0.019746376811594205, + "grad_norm": 13.702007675182921, + "learning_rate": 6.566265060240963e-07, + "loss": 0.5343, + "step": 109 + }, + { + "epoch": 0.019927536231884056, + "grad_norm": 5.401659969002201, + "learning_rate": 6.626506024096386e-07, + "loss": 0.4936, + "step": 110 + }, + { + "epoch": 0.02010869565217391, + "grad_norm": 6.777443622517859, + "learning_rate": 6.686746987951807e-07, + "loss": 0.5105, + "step": 111 + }, + { + "epoch": 0.020289855072463767, + "grad_norm": 5.894655570480427, + "learning_rate": 6.746987951807228e-07, + "loss": 0.4784, + "step": 112 + }, + { + "epoch": 0.020471014492753622, + "grad_norm": 10.45271767331002, + "learning_rate": 6.807228915662651e-07, + "loss": 0.5002, + "step": 113 + }, + { + "epoch": 0.020652173913043477, + "grad_norm": 15.563610610032606, + "learning_rate": 6.867469879518072e-07, + "loss": 0.5154, + "step": 114 + }, + { + "epoch": 0.020833333333333332, + "grad_norm": 7.9179192602020505, + "learning_rate": 6.927710843373493e-07, + "loss": 0.4873, + "step": 115 + }, + { + "epoch": 0.021014492753623187, + "grad_norm": 8.436961625904862, + "learning_rate": 6.987951807228916e-07, + "loss": 0.554, + "step": 116 + }, + { + "epoch": 0.021195652173913043, + "grad_norm": 8.404752827874326, + "learning_rate": 7.048192771084337e-07, + "loss": 0.4906, + "step": 117 + }, + { + "epoch": 0.021376811594202898, + "grad_norm": 6.01808853532882, + "learning_rate": 7.108433734939758e-07, + "loss": 0.5338, + "step": 118 + }, + { + "epoch": 0.021557971014492753, + "grad_norm": 8.583538396465919, + "learning_rate": 7.168674698795181e-07, + "loss": 0.5269, + "step": 119 + }, + { + "epoch": 0.021739130434782608, + "grad_norm": 8.179015465558333, + "learning_rate": 7.228915662650602e-07, + "loss": 0.505, + "step": 120 + }, + { + "epoch": 0.021920289855072463, + "grad_norm": 7.816411195873641, + "learning_rate": 7.289156626506024e-07, + "loss": 0.4822, + "step": 121 + }, + { + "epoch": 0.02210144927536232, + "grad_norm": 7.723181238831617, + "learning_rate": 7.349397590361446e-07, + "loss": 0.4962, + "step": 122 + }, + { + "epoch": 0.022282608695652174, + "grad_norm": 6.877138873147807, + "learning_rate": 7.409638554216867e-07, + "loss": 0.4381, + "step": 123 + }, + { + "epoch": 0.02246376811594203, + "grad_norm": 6.81832569687752, + "learning_rate": 7.469879518072289e-07, + "loss": 0.4794, + "step": 124 + }, + { + "epoch": 0.022644927536231884, + "grad_norm": 5.676680125317034, + "learning_rate": 7.53012048192771e-07, + "loss": 0.5306, + "step": 125 + }, + { + "epoch": 0.02282608695652174, + "grad_norm": 5.00121652261405, + "learning_rate": 7.590361445783132e-07, + "loss": 0.5134, + "step": 126 + }, + { + "epoch": 0.023007246376811594, + "grad_norm": 5.01429666835957, + "learning_rate": 7.650602409638554e-07, + "loss": 0.4814, + "step": 127 + }, + { + "epoch": 0.02318840579710145, + "grad_norm": 6.608334533851778, + "learning_rate": 7.710843373493975e-07, + "loss": 0.5015, + "step": 128 + }, + { + "epoch": 0.023369565217391305, + "grad_norm": 6.914300961015702, + "learning_rate": 7.771084337349397e-07, + "loss": 0.4758, + "step": 129 + }, + { + "epoch": 0.02355072463768116, + "grad_norm": 6.24984997664612, + "learning_rate": 7.831325301204819e-07, + "loss": 0.4501, + "step": 130 + }, + { + "epoch": 0.023731884057971015, + "grad_norm": 5.112118214192944, + "learning_rate": 7.891566265060241e-07, + "loss": 0.4746, + "step": 131 + }, + { + "epoch": 0.02391304347826087, + "grad_norm": 8.387817660357083, + "learning_rate": 7.951807228915662e-07, + "loss": 0.5029, + "step": 132 + }, + { + "epoch": 0.024094202898550725, + "grad_norm": 7.980049913886308, + "learning_rate": 8.012048192771084e-07, + "loss": 0.4542, + "step": 133 + }, + { + "epoch": 0.02427536231884058, + "grad_norm": 14.421928860935134, + "learning_rate": 8.072289156626506e-07, + "loss": 0.4762, + "step": 134 + }, + { + "epoch": 0.024456521739130436, + "grad_norm": 6.524127827626026, + "learning_rate": 8.132530120481927e-07, + "loss": 0.4901, + "step": 135 + }, + { + "epoch": 0.02463768115942029, + "grad_norm": 8.971696806473568, + "learning_rate": 8.192771084337349e-07, + "loss": 0.5189, + "step": 136 + }, + { + "epoch": 0.024818840579710146, + "grad_norm": 7.043008284400751, + "learning_rate": 8.253012048192771e-07, + "loss": 0.4894, + "step": 137 + }, + { + "epoch": 0.025, + "grad_norm": 4.834709782757821, + "learning_rate": 8.313253012048192e-07, + "loss": 0.442, + "step": 138 + }, + { + "epoch": 0.025181159420289857, + "grad_norm": 9.895461132716854, + "learning_rate": 8.373493975903614e-07, + "loss": 0.4325, + "step": 139 + }, + { + "epoch": 0.025362318840579712, + "grad_norm": 7.4036240261540245, + "learning_rate": 8.433734939759036e-07, + "loss": 0.4438, + "step": 140 + }, + { + "epoch": 0.025543478260869567, + "grad_norm": 4.548397494814773, + "learning_rate": 8.493975903614458e-07, + "loss": 0.4884, + "step": 141 + }, + { + "epoch": 0.02572463768115942, + "grad_norm": 4.152457111490965, + "learning_rate": 8.554216867469879e-07, + "loss": 0.4636, + "step": 142 + }, + { + "epoch": 0.025905797101449274, + "grad_norm": 5.924137019684747, + "learning_rate": 8.614457831325301e-07, + "loss": 0.4912, + "step": 143 + }, + { + "epoch": 0.02608695652173913, + "grad_norm": 12.975084883779902, + "learning_rate": 8.674698795180723e-07, + "loss": 0.5563, + "step": 144 + }, + { + "epoch": 0.026268115942028984, + "grad_norm": 6.1368116621228115, + "learning_rate": 8.734939759036144e-07, + "loss": 0.4487, + "step": 145 + }, + { + "epoch": 0.02644927536231884, + "grad_norm": 5.155799288498959, + "learning_rate": 8.795180722891565e-07, + "loss": 0.5308, + "step": 146 + }, + { + "epoch": 0.026630434782608695, + "grad_norm": 5.118362961694278, + "learning_rate": 8.855421686746988e-07, + "loss": 0.4598, + "step": 147 + }, + { + "epoch": 0.02681159420289855, + "grad_norm": 8.632679047110537, + "learning_rate": 8.915662650602409e-07, + "loss": 0.5283, + "step": 148 + }, + { + "epoch": 0.026992753623188405, + "grad_norm": 7.180786024663195, + "learning_rate": 8.97590361445783e-07, + "loss": 0.4962, + "step": 149 + }, + { + "epoch": 0.02717391304347826, + "grad_norm": 4.547074967752511, + "learning_rate": 9.036144578313253e-07, + "loss": 0.4877, + "step": 150 + }, + { + "epoch": 0.027355072463768115, + "grad_norm": 6.909285053575947, + "learning_rate": 9.096385542168675e-07, + "loss": 0.5031, + "step": 151 + }, + { + "epoch": 0.02753623188405797, + "grad_norm": 8.46572825149969, + "learning_rate": 9.156626506024095e-07, + "loss": 0.5369, + "step": 152 + }, + { + "epoch": 0.027717391304347826, + "grad_norm": 17.21557975553529, + "learning_rate": 9.216867469879518e-07, + "loss": 0.444, + "step": 153 + }, + { + "epoch": 0.02789855072463768, + "grad_norm": 7.958798793828718, + "learning_rate": 9.27710843373494e-07, + "loss": 0.5594, + "step": 154 + }, + { + "epoch": 0.028079710144927536, + "grad_norm": 20.089943951856156, + "learning_rate": 9.33734939759036e-07, + "loss": 0.4508, + "step": 155 + }, + { + "epoch": 0.02826086956521739, + "grad_norm": 8.348366765620753, + "learning_rate": 9.397590361445783e-07, + "loss": 0.5292, + "step": 156 + }, + { + "epoch": 0.028442028985507246, + "grad_norm": 5.368109540960397, + "learning_rate": 9.457831325301205e-07, + "loss": 0.5422, + "step": 157 + }, + { + "epoch": 0.0286231884057971, + "grad_norm": 5.388950045227162, + "learning_rate": 9.518072289156625e-07, + "loss": 0.4666, + "step": 158 + }, + { + "epoch": 0.028804347826086957, + "grad_norm": 5.053872505552171, + "learning_rate": 9.57831325301205e-07, + "loss": 0.4701, + "step": 159 + }, + { + "epoch": 0.028985507246376812, + "grad_norm": 5.421594424612676, + "learning_rate": 9.63855421686747e-07, + "loss": 0.52, + "step": 160 + }, + { + "epoch": 0.029166666666666667, + "grad_norm": 4.1264872603776785, + "learning_rate": 9.69879518072289e-07, + "loss": 0.4995, + "step": 161 + }, + { + "epoch": 0.029347826086956522, + "grad_norm": 5.044524575984205, + "learning_rate": 9.759036144578313e-07, + "loss": 0.5116, + "step": 162 + }, + { + "epoch": 0.029528985507246377, + "grad_norm": 5.499238571136398, + "learning_rate": 9.819277108433734e-07, + "loss": 0.5065, + "step": 163 + }, + { + "epoch": 0.029710144927536233, + "grad_norm": 4.698303937944457, + "learning_rate": 9.879518072289156e-07, + "loss": 0.4603, + "step": 164 + }, + { + "epoch": 0.029891304347826088, + "grad_norm": 5.9825894126873065, + "learning_rate": 9.93975903614458e-07, + "loss": 0.5424, + "step": 165 + }, + { + "epoch": 0.030072463768115943, + "grad_norm": 7.165804565484212, + "learning_rate": 1e-06, + "loss": 0.5096, + "step": 166 + }, + { + "epoch": 0.030253623188405798, + "grad_norm": 4.681956950030581, + "learning_rate": 9.999999139238154e-07, + "loss": 0.4768, + "step": 167 + }, + { + "epoch": 0.030434782608695653, + "grad_norm": 5.049888174912839, + "learning_rate": 9.999996556952915e-07, + "loss": 0.4945, + "step": 168 + }, + { + "epoch": 0.03061594202898551, + "grad_norm": 4.8280242174132315, + "learning_rate": 9.99999225314517e-07, + "loss": 0.501, + "step": 169 + }, + { + "epoch": 0.030797101449275364, + "grad_norm": 6.038857119417974, + "learning_rate": 9.999986227816403e-07, + "loss": 0.4701, + "step": 170 + }, + { + "epoch": 0.03097826086956522, + "grad_norm": 9.780202990037095, + "learning_rate": 9.999978480968688e-07, + "loss": 0.515, + "step": 171 + }, + { + "epoch": 0.031159420289855074, + "grad_norm": 7.9245280562881755, + "learning_rate": 9.999969012604688e-07, + "loss": 0.4884, + "step": 172 + }, + { + "epoch": 0.03134057971014493, + "grad_norm": 6.229806982396466, + "learning_rate": 9.99995782272767e-07, + "loss": 0.5007, + "step": 173 + }, + { + "epoch": 0.03152173913043478, + "grad_norm": 5.74944160350927, + "learning_rate": 9.999944911341482e-07, + "loss": 0.4702, + "step": 174 + }, + { + "epoch": 0.03170289855072464, + "grad_norm": 8.494426280655142, + "learning_rate": 9.999930278450572e-07, + "loss": 0.4605, + "step": 175 + }, + { + "epoch": 0.03188405797101449, + "grad_norm": 17.83791204432153, + "learning_rate": 9.999913924059976e-07, + "loss": 0.5116, + "step": 176 + }, + { + "epoch": 0.03206521739130435, + "grad_norm": 3.6510004514342236, + "learning_rate": 9.999895848175326e-07, + "loss": 0.4397, + "step": 177 + }, + { + "epoch": 0.0322463768115942, + "grad_norm": 6.016683146970344, + "learning_rate": 9.999876050802845e-07, + "loss": 0.4323, + "step": 178 + }, + { + "epoch": 0.03242753623188406, + "grad_norm": 5.310990129162454, + "learning_rate": 9.99985453194935e-07, + "loss": 0.462, + "step": 179 + }, + { + "epoch": 0.03260869565217391, + "grad_norm": 7.131501884340848, + "learning_rate": 9.999831291622249e-07, + "loss": 0.4573, + "step": 180 + }, + { + "epoch": 0.03278985507246377, + "grad_norm": 5.1447709167957365, + "learning_rate": 9.999806329829546e-07, + "loss": 0.4775, + "step": 181 + }, + { + "epoch": 0.03297101449275362, + "grad_norm": 3.4309908977880976, + "learning_rate": 9.999779646579833e-07, + "loss": 0.4683, + "step": 182 + }, + { + "epoch": 0.03315217391304348, + "grad_norm": 3.8563425479027713, + "learning_rate": 9.9997512418823e-07, + "loss": 0.4828, + "step": 183 + }, + { + "epoch": 0.03333333333333333, + "grad_norm": 7.324398994673771, + "learning_rate": 9.999721115746724e-07, + "loss": 0.4888, + "step": 184 + }, + { + "epoch": 0.03351449275362319, + "grad_norm": 5.109586000472538, + "learning_rate": 9.999689268183479e-07, + "loss": 0.432, + "step": 185 + }, + { + "epoch": 0.03369565217391304, + "grad_norm": 8.132923394746593, + "learning_rate": 9.999655699203529e-07, + "loss": 0.5387, + "step": 186 + }, + { + "epoch": 0.0338768115942029, + "grad_norm": 12.603147876277006, + "learning_rate": 9.999620408818434e-07, + "loss": 0.4939, + "step": 187 + }, + { + "epoch": 0.034057971014492754, + "grad_norm": 4.1141953695608, + "learning_rate": 9.999583397040342e-07, + "loss": 0.4778, + "step": 188 + }, + { + "epoch": 0.034239130434782605, + "grad_norm": 5.898156883814606, + "learning_rate": 9.999544663881998e-07, + "loss": 0.5166, + "step": 189 + }, + { + "epoch": 0.034420289855072464, + "grad_norm": 5.274643328578065, + "learning_rate": 9.99950420935674e-07, + "loss": 0.4391, + "step": 190 + }, + { + "epoch": 0.034601449275362316, + "grad_norm": 3.9149201160873317, + "learning_rate": 9.999462033478495e-07, + "loss": 0.4356, + "step": 191 + }, + { + "epoch": 0.034782608695652174, + "grad_norm": 3.8837631578712903, + "learning_rate": 9.999418136261781e-07, + "loss": 0.4534, + "step": 192 + }, + { + "epoch": 0.034963768115942026, + "grad_norm": 7.8220231109937695, + "learning_rate": 9.999372517721716e-07, + "loss": 0.4681, + "step": 193 + }, + { + "epoch": 0.035144927536231885, + "grad_norm": 4.092954063858875, + "learning_rate": 9.999325177874004e-07, + "loss": 0.4421, + "step": 194 + }, + { + "epoch": 0.035326086956521736, + "grad_norm": 6.792191106956772, + "learning_rate": 9.99927611673495e-07, + "loss": 0.4451, + "step": 195 + }, + { + "epoch": 0.035507246376811595, + "grad_norm": 4.001072586649773, + "learning_rate": 9.99922533432144e-07, + "loss": 0.4827, + "step": 196 + }, + { + "epoch": 0.03568840579710145, + "grad_norm": 12.554730407999608, + "learning_rate": 9.99917283065096e-07, + "loss": 0.4956, + "step": 197 + }, + { + "epoch": 0.035869565217391305, + "grad_norm": 11.02545191723406, + "learning_rate": 9.999118605741587e-07, + "loss": 0.5145, + "step": 198 + }, + { + "epoch": 0.03605072463768116, + "grad_norm": 5.486883030122579, + "learning_rate": 9.999062659611993e-07, + "loss": 0.4768, + "step": 199 + }, + { + "epoch": 0.036231884057971016, + "grad_norm": 3.863833273779658, + "learning_rate": 9.99900499228144e-07, + "loss": 0.4772, + "step": 200 + }, + { + "epoch": 0.036231884057971016, + "eval_loss": 0.4799531102180481, + "eval_runtime": 9.7837, + "eval_samples_per_second": 51.106, + "eval_steps_per_second": 0.102, + "step": 200 + }, + { + "epoch": 0.03641304347826087, + "grad_norm": 4.203418376242059, + "learning_rate": 9.998945603769783e-07, + "loss": 0.4934, + "step": 201 + }, + { + "epoch": 0.036594202898550726, + "grad_norm": 11.39032778557563, + "learning_rate": 9.998884494097466e-07, + "loss": 0.4871, + "step": 202 + }, + { + "epoch": 0.03677536231884058, + "grad_norm": 9.118931686244139, + "learning_rate": 9.998821663285535e-07, + "loss": 0.4519, + "step": 203 + }, + { + "epoch": 0.03695652173913044, + "grad_norm": 4.48001367048963, + "learning_rate": 9.998757111355617e-07, + "loss": 0.4495, + "step": 204 + }, + { + "epoch": 0.03713768115942029, + "grad_norm": 6.399610205483096, + "learning_rate": 9.998690838329946e-07, + "loss": 0.4878, + "step": 205 + }, + { + "epoch": 0.03731884057971015, + "grad_norm": 6.947629181854651, + "learning_rate": 9.998622844231333e-07, + "loss": 0.4778, + "step": 206 + }, + { + "epoch": 0.0375, + "grad_norm": 4.04023671442893, + "learning_rate": 9.99855312908319e-07, + "loss": 0.4459, + "step": 207 + }, + { + "epoch": 0.03768115942028986, + "grad_norm": 5.948010825711479, + "learning_rate": 9.998481692909519e-07, + "loss": 0.488, + "step": 208 + }, + { + "epoch": 0.03786231884057971, + "grad_norm": 5.4261911391284725, + "learning_rate": 9.998408535734921e-07, + "loss": 0.4771, + "step": 209 + }, + { + "epoch": 0.03804347826086957, + "grad_norm": 4.358401844220579, + "learning_rate": 9.99833365758458e-07, + "loss": 0.4382, + "step": 210 + }, + { + "epoch": 0.03822463768115942, + "grad_norm": 3.5567647760960823, + "learning_rate": 9.99825705848428e-07, + "loss": 0.5084, + "step": 211 + }, + { + "epoch": 0.03840579710144928, + "grad_norm": 8.107530278422496, + "learning_rate": 9.99817873846039e-07, + "loss": 0.5029, + "step": 212 + }, + { + "epoch": 0.03858695652173913, + "grad_norm": 3.6972001074897296, + "learning_rate": 9.99809869753988e-07, + "loss": 0.4844, + "step": 213 + }, + { + "epoch": 0.03876811594202899, + "grad_norm": 4.530244950059987, + "learning_rate": 9.998016935750306e-07, + "loss": 0.4925, + "step": 214 + }, + { + "epoch": 0.03894927536231884, + "grad_norm": 3.4162458991098856, + "learning_rate": 9.99793345311982e-07, + "loss": 0.4503, + "step": 215 + }, + { + "epoch": 0.0391304347826087, + "grad_norm": 8.119593317890072, + "learning_rate": 9.997848249677165e-07, + "loss": 0.4741, + "step": 216 + }, + { + "epoch": 0.03931159420289855, + "grad_norm": 4.235818762951556, + "learning_rate": 9.99776132545168e-07, + "loss": 0.4564, + "step": 217 + }, + { + "epoch": 0.03949275362318841, + "grad_norm": 4.100374620065906, + "learning_rate": 9.997672680473288e-07, + "loss": 0.5018, + "step": 218 + }, + { + "epoch": 0.03967391304347826, + "grad_norm": 3.5455145575604514, + "learning_rate": 9.997582314772513e-07, + "loss": 0.45, + "step": 219 + }, + { + "epoch": 0.03985507246376811, + "grad_norm": 7.115455836069538, + "learning_rate": 9.997490228380469e-07, + "loss": 0.4753, + "step": 220 + }, + { + "epoch": 0.04003623188405797, + "grad_norm": 4.038268060372091, + "learning_rate": 9.99739642132886e-07, + "loss": 0.452, + "step": 221 + }, + { + "epoch": 0.04021739130434782, + "grad_norm": 4.58804149646508, + "learning_rate": 9.997300893649985e-07, + "loss": 0.4331, + "step": 222 + }, + { + "epoch": 0.04039855072463768, + "grad_norm": 4.889877840343678, + "learning_rate": 9.997203645376735e-07, + "loss": 0.4084, + "step": 223 + }, + { + "epoch": 0.04057971014492753, + "grad_norm": 3.7746173579882063, + "learning_rate": 9.997104676542592e-07, + "loss": 0.4845, + "step": 224 + }, + { + "epoch": 0.04076086956521739, + "grad_norm": 4.878456578233542, + "learning_rate": 9.997003987181633e-07, + "loss": 0.486, + "step": 225 + }, + { + "epoch": 0.040942028985507244, + "grad_norm": 8.464404261598398, + "learning_rate": 9.996901577328524e-07, + "loss": 0.4651, + "step": 226 + }, + { + "epoch": 0.0411231884057971, + "grad_norm": 3.239182945202472, + "learning_rate": 9.996797447018527e-07, + "loss": 0.4742, + "step": 227 + }, + { + "epoch": 0.041304347826086954, + "grad_norm": 6.575912040753638, + "learning_rate": 9.996691596287494e-07, + "loss": 0.4789, + "step": 228 + }, + { + "epoch": 0.04148550724637681, + "grad_norm": 4.654026176911899, + "learning_rate": 9.99658402517187e-07, + "loss": 0.4991, + "step": 229 + }, + { + "epoch": 0.041666666666666664, + "grad_norm": 13.698454104683812, + "learning_rate": 9.996474733708688e-07, + "loss": 0.4452, + "step": 230 + }, + { + "epoch": 0.04184782608695652, + "grad_norm": 11.32330868708894, + "learning_rate": 9.996363721935584e-07, + "loss": 0.4279, + "step": 231 + }, + { + "epoch": 0.042028985507246375, + "grad_norm": 4.4802404094538115, + "learning_rate": 9.996250989890777e-07, + "loss": 0.4308, + "step": 232 + }, + { + "epoch": 0.04221014492753623, + "grad_norm": 5.584918400826132, + "learning_rate": 9.996136537613081e-07, + "loss": 0.4337, + "step": 233 + }, + { + "epoch": 0.042391304347826085, + "grad_norm": 3.523418864145712, + "learning_rate": 9.996020365141904e-07, + "loss": 0.4679, + "step": 234 + }, + { + "epoch": 0.042572463768115944, + "grad_norm": 4.150982567104567, + "learning_rate": 9.995902472517244e-07, + "loss": 0.4253, + "step": 235 + }, + { + "epoch": 0.042753623188405795, + "grad_norm": 5.031011676263364, + "learning_rate": 9.99578285977969e-07, + "loss": 0.4541, + "step": 236 + }, + { + "epoch": 0.042934782608695654, + "grad_norm": 10.171250203654354, + "learning_rate": 9.995661526970429e-07, + "loss": 0.4103, + "step": 237 + }, + { + "epoch": 0.043115942028985506, + "grad_norm": 3.145143864333559, + "learning_rate": 9.995538474131233e-07, + "loss": 0.4147, + "step": 238 + }, + { + "epoch": 0.043297101449275364, + "grad_norm": 3.3572508567730774, + "learning_rate": 9.995413701304472e-07, + "loss": 0.4592, + "step": 239 + }, + { + "epoch": 0.043478260869565216, + "grad_norm": 5.400613603597964, + "learning_rate": 9.995287208533102e-07, + "loss": 0.4142, + "step": 240 + }, + { + "epoch": 0.043659420289855075, + "grad_norm": 6.903415810833302, + "learning_rate": 9.995158995860681e-07, + "loss": 0.4605, + "step": 241 + }, + { + "epoch": 0.04384057971014493, + "grad_norm": 3.2645045771492427, + "learning_rate": 9.995029063331348e-07, + "loss": 0.4274, + "step": 242 + }, + { + "epoch": 0.044021739130434785, + "grad_norm": 3.7995981788280875, + "learning_rate": 9.994897410989843e-07, + "loss": 0.4455, + "step": 243 + }, + { + "epoch": 0.04420289855072464, + "grad_norm": 3.49231316391393, + "learning_rate": 9.994764038881494e-07, + "loss": 0.476, + "step": 244 + }, + { + "epoch": 0.044384057971014496, + "grad_norm": 5.007822367640591, + "learning_rate": 9.994628947052218e-07, + "loss": 0.436, + "step": 245 + }, + { + "epoch": 0.04456521739130435, + "grad_norm": 11.82610182564281, + "learning_rate": 9.994492135548532e-07, + "loss": 0.5, + "step": 246 + }, + { + "epoch": 0.044746376811594206, + "grad_norm": 7.8903958784074275, + "learning_rate": 9.99435360441754e-07, + "loss": 0.4843, + "step": 247 + }, + { + "epoch": 0.04492753623188406, + "grad_norm": 6.512054775710009, + "learning_rate": 9.994213353706937e-07, + "loss": 0.4386, + "step": 248 + }, + { + "epoch": 0.045108695652173916, + "grad_norm": 3.593400698737775, + "learning_rate": 9.994071383465015e-07, + "loss": 0.4912, + "step": 249 + }, + { + "epoch": 0.04528985507246377, + "grad_norm": 3.70358783415324, + "learning_rate": 9.99392769374065e-07, + "loss": 0.4406, + "step": 250 + }, + { + "epoch": 0.04547101449275362, + "grad_norm": 5.803109189517661, + "learning_rate": 9.99378228458332e-07, + "loss": 0.4911, + "step": 251 + }, + { + "epoch": 0.04565217391304348, + "grad_norm": 3.6893076225191366, + "learning_rate": 9.99363515604309e-07, + "loss": 0.4008, + "step": 252 + }, + { + "epoch": 0.04583333333333333, + "grad_norm": 11.30397560941352, + "learning_rate": 9.993486308170612e-07, + "loss": 0.4109, + "step": 253 + }, + { + "epoch": 0.04601449275362319, + "grad_norm": 12.350789542199472, + "learning_rate": 9.993335741017142e-07, + "loss": 0.469, + "step": 254 + }, + { + "epoch": 0.04619565217391304, + "grad_norm": 5.098644742786573, + "learning_rate": 9.993183454634518e-07, + "loss": 0.4467, + "step": 255 + }, + { + "epoch": 0.0463768115942029, + "grad_norm": 13.048604609016728, + "learning_rate": 9.99302944907517e-07, + "loss": 0.4725, + "step": 256 + }, + { + "epoch": 0.04655797101449275, + "grad_norm": 3.6081675063214647, + "learning_rate": 9.992873724392125e-07, + "loss": 0.4821, + "step": 257 + }, + { + "epoch": 0.04673913043478261, + "grad_norm": 8.134011732152958, + "learning_rate": 9.992716280639e-07, + "loss": 0.5192, + "step": 258 + }, + { + "epoch": 0.04692028985507246, + "grad_norm": 4.312079872375456, + "learning_rate": 9.992557117870004e-07, + "loss": 0.4406, + "step": 259 + }, + { + "epoch": 0.04710144927536232, + "grad_norm": 3.0720013735899614, + "learning_rate": 9.992396236139938e-07, + "loss": 0.3787, + "step": 260 + }, + { + "epoch": 0.04728260869565217, + "grad_norm": 3.197982546937872, + "learning_rate": 9.992233635504192e-07, + "loss": 0.405, + "step": 261 + }, + { + "epoch": 0.04746376811594203, + "grad_norm": 4.466568402155062, + "learning_rate": 9.992069316018753e-07, + "loss": 0.476, + "step": 262 + }, + { + "epoch": 0.04764492753623188, + "grad_norm": 9.687932325929705, + "learning_rate": 9.991903277740194e-07, + "loss": 0.3793, + "step": 263 + }, + { + "epoch": 0.04782608695652174, + "grad_norm": 4.472993031097122, + "learning_rate": 9.991735520725686e-07, + "loss": 0.4799, + "step": 264 + }, + { + "epoch": 0.04800724637681159, + "grad_norm": 5.149212903988031, + "learning_rate": 9.991566045032987e-07, + "loss": 0.4429, + "step": 265 + }, + { + "epoch": 0.04818840579710145, + "grad_norm": 10.002158108751402, + "learning_rate": 9.991394850720447e-07, + "loss": 0.4584, + "step": 266 + }, + { + "epoch": 0.0483695652173913, + "grad_norm": 5.019567737873869, + "learning_rate": 9.991221937847009e-07, + "loss": 0.3803, + "step": 267 + }, + { + "epoch": 0.04855072463768116, + "grad_norm": 3.2146775101458225, + "learning_rate": 9.991047306472212e-07, + "loss": 0.3998, + "step": 268 + }, + { + "epoch": 0.04873188405797101, + "grad_norm": 3.589919460415493, + "learning_rate": 9.990870956656177e-07, + "loss": 0.4702, + "step": 269 + }, + { + "epoch": 0.04891304347826087, + "grad_norm": 3.2269059369290596, + "learning_rate": 9.990692888459624e-07, + "loss": 0.4026, + "step": 270 + }, + { + "epoch": 0.04909420289855072, + "grad_norm": 4.6381853298897076, + "learning_rate": 9.990513101943865e-07, + "loss": 0.3651, + "step": 271 + }, + { + "epoch": 0.04927536231884058, + "grad_norm": 3.5029065849997374, + "learning_rate": 9.990331597170799e-07, + "loss": 0.4736, + "step": 272 + }, + { + "epoch": 0.049456521739130434, + "grad_norm": 3.468095053203953, + "learning_rate": 9.990148374202918e-07, + "loss": 0.4406, + "step": 273 + }, + { + "epoch": 0.04963768115942029, + "grad_norm": 5.608119264163545, + "learning_rate": 9.98996343310331e-07, + "loss": 0.4315, + "step": 274 + }, + { + "epoch": 0.049818840579710144, + "grad_norm": 4.481775347083576, + "learning_rate": 9.989776773935647e-07, + "loss": 0.4048, + "step": 275 + }, + { + "epoch": 0.05, + "grad_norm": 8.42617168385361, + "learning_rate": 9.9895883967642e-07, + "loss": 0.4717, + "step": 276 + }, + { + "epoch": 0.050181159420289854, + "grad_norm": 2.9730415433744795, + "learning_rate": 9.989398301653827e-07, + "loss": 0.4453, + "step": 277 + }, + { + "epoch": 0.05036231884057971, + "grad_norm": 6.367426887077506, + "learning_rate": 9.989206488669977e-07, + "loss": 0.5167, + "step": 278 + }, + { + "epoch": 0.050543478260869565, + "grad_norm": 3.3475584911089813, + "learning_rate": 9.989012957878696e-07, + "loss": 0.4429, + "step": 279 + }, + { + "epoch": 0.050724637681159424, + "grad_norm": 9.549679012326568, + "learning_rate": 9.988817709346613e-07, + "loss": 0.4263, + "step": 280 + }, + { + "epoch": 0.050905797101449275, + "grad_norm": 14.673903217076143, + "learning_rate": 9.988620743140954e-07, + "loss": 0.4167, + "step": 281 + }, + { + "epoch": 0.051086956521739134, + "grad_norm": 8.83576126339579, + "learning_rate": 9.98842205932954e-07, + "loss": 0.4107, + "step": 282 + }, + { + "epoch": 0.051268115942028986, + "grad_norm": 3.721884330371865, + "learning_rate": 9.988221657980773e-07, + "loss": 0.3893, + "step": 283 + }, + { + "epoch": 0.05144927536231884, + "grad_norm": 3.1958082516461084, + "learning_rate": 9.988019539163656e-07, + "loss": 0.4677, + "step": 284 + }, + { + "epoch": 0.051630434782608696, + "grad_norm": 4.774367249930716, + "learning_rate": 9.987815702947778e-07, + "loss": 0.4857, + "step": 285 + }, + { + "epoch": 0.05181159420289855, + "grad_norm": 4.653595638505687, + "learning_rate": 9.987610149403318e-07, + "loss": 0.3866, + "step": 286 + }, + { + "epoch": 0.051992753623188406, + "grad_norm": 4.4555986130321665, + "learning_rate": 9.987402878601054e-07, + "loss": 0.4225, + "step": 287 + }, + { + "epoch": 0.05217391304347826, + "grad_norm": 5.074289715952272, + "learning_rate": 9.98719389061235e-07, + "loss": 0.394, + "step": 288 + }, + { + "epoch": 0.05235507246376812, + "grad_norm": 21.000755364790056, + "learning_rate": 9.986983185509154e-07, + "loss": 0.5396, + "step": 289 + }, + { + "epoch": 0.05253623188405797, + "grad_norm": 8.756603860656229, + "learning_rate": 9.986770763364022e-07, + "loss": 0.4343, + "step": 290 + }, + { + "epoch": 0.05271739130434783, + "grad_norm": 13.006270890305858, + "learning_rate": 9.98655662425009e-07, + "loss": 0.5182, + "step": 291 + }, + { + "epoch": 0.05289855072463768, + "grad_norm": 4.322906945976464, + "learning_rate": 9.986340768241082e-07, + "loss": 0.4838, + "step": 292 + }, + { + "epoch": 0.05307971014492754, + "grad_norm": 7.013753420131705, + "learning_rate": 9.986123195411325e-07, + "loss": 0.4889, + "step": 293 + }, + { + "epoch": 0.05326086956521739, + "grad_norm": 5.939365992478394, + "learning_rate": 9.985903905835724e-07, + "loss": 0.4141, + "step": 294 + }, + { + "epoch": 0.05344202898550725, + "grad_norm": 6.907585093152365, + "learning_rate": 9.985682899589786e-07, + "loss": 0.3719, + "step": 295 + }, + { + "epoch": 0.0536231884057971, + "grad_norm": 4.789625418427414, + "learning_rate": 9.985460176749603e-07, + "loss": 0.4395, + "step": 296 + }, + { + "epoch": 0.05380434782608696, + "grad_norm": 9.530957504007826, + "learning_rate": 9.985235737391859e-07, + "loss": 0.47, + "step": 297 + }, + { + "epoch": 0.05398550724637681, + "grad_norm": 14.37981639887827, + "learning_rate": 9.985009581593832e-07, + "loss": 0.4323, + "step": 298 + }, + { + "epoch": 0.05416666666666667, + "grad_norm": 6.161679840080418, + "learning_rate": 9.984781709433385e-07, + "loss": 0.4067, + "step": 299 + }, + { + "epoch": 0.05434782608695652, + "grad_norm": 11.887943982222478, + "learning_rate": 9.984552120988977e-07, + "loss": 0.4812, + "step": 300 + }, + { + "epoch": 0.05434782608695652, + "eval_loss": 0.47120311856269836, + "eval_runtime": 9.8004, + "eval_samples_per_second": 51.018, + "eval_steps_per_second": 0.102, + "step": 300 + }, + { + "epoch": 0.05452898550724638, + "grad_norm": 6.5873591367134985, + "learning_rate": 9.984320816339657e-07, + "loss": 0.3944, + "step": 301 + }, + { + "epoch": 0.05471014492753623, + "grad_norm": 12.355267798919755, + "learning_rate": 9.984087795565062e-07, + "loss": 0.4695, + "step": 302 + }, + { + "epoch": 0.05489130434782609, + "grad_norm": 12.589460413890302, + "learning_rate": 9.983853058745427e-07, + "loss": 0.4464, + "step": 303 + }, + { + "epoch": 0.05507246376811594, + "grad_norm": 3.258325196650175, + "learning_rate": 9.983616605961567e-07, + "loss": 0.4586, + "step": 304 + }, + { + "epoch": 0.0552536231884058, + "grad_norm": 3.1999294224444133, + "learning_rate": 9.983378437294898e-07, + "loss": 0.4337, + "step": 305 + }, + { + "epoch": 0.05543478260869565, + "grad_norm": 4.610310287741306, + "learning_rate": 9.983138552827421e-07, + "loss": 0.4576, + "step": 306 + }, + { + "epoch": 0.05561594202898551, + "grad_norm": 17.25045959911115, + "learning_rate": 9.982896952641729e-07, + "loss": 0.4327, + "step": 307 + }, + { + "epoch": 0.05579710144927536, + "grad_norm": 3.2771407388093863, + "learning_rate": 9.982653636821009e-07, + "loss": 0.4128, + "step": 308 + }, + { + "epoch": 0.05597826086956522, + "grad_norm": 12.474300239271598, + "learning_rate": 9.98240860544903e-07, + "loss": 0.4292, + "step": 309 + }, + { + "epoch": 0.05615942028985507, + "grad_norm": 9.765697228028497, + "learning_rate": 9.982161858610164e-07, + "loss": 0.4678, + "step": 310 + }, + { + "epoch": 0.05634057971014493, + "grad_norm": 6.223457294323614, + "learning_rate": 9.981913396389363e-07, + "loss": 0.4363, + "step": 311 + }, + { + "epoch": 0.05652173913043478, + "grad_norm": 11.23907673982082, + "learning_rate": 9.981663218872176e-07, + "loss": 0.4471, + "step": 312 + }, + { + "epoch": 0.05670289855072464, + "grad_norm": 6.29386904352475, + "learning_rate": 9.981411326144739e-07, + "loss": 0.3714, + "step": 313 + }, + { + "epoch": 0.05688405797101449, + "grad_norm": 4.454392684265817, + "learning_rate": 9.981157718293778e-07, + "loss": 0.431, + "step": 314 + }, + { + "epoch": 0.057065217391304345, + "grad_norm": 6.7689194707428815, + "learning_rate": 9.980902395406614e-07, + "loss": 0.3973, + "step": 315 + }, + { + "epoch": 0.0572463768115942, + "grad_norm": 3.8662898008650397, + "learning_rate": 9.980645357571155e-07, + "loss": 0.4624, + "step": 316 + }, + { + "epoch": 0.057427536231884055, + "grad_norm": 11.61237583189014, + "learning_rate": 9.980386604875901e-07, + "loss": 0.4101, + "step": 317 + }, + { + "epoch": 0.057608695652173914, + "grad_norm": 18.647937963415455, + "learning_rate": 9.980126137409943e-07, + "loss": 0.4664, + "step": 318 + }, + { + "epoch": 0.057789855072463765, + "grad_norm": 20.235344240487514, + "learning_rate": 9.979863955262958e-07, + "loss": 0.485, + "step": 319 + }, + { + "epoch": 0.057971014492753624, + "grad_norm": 8.673295240220018, + "learning_rate": 9.979600058525218e-07, + "loss": 0.5213, + "step": 320 + }, + { + "epoch": 0.058152173913043476, + "grad_norm": 5.0332827313105, + "learning_rate": 9.979334447287583e-07, + "loss": 0.4302, + "step": 321 + }, + { + "epoch": 0.058333333333333334, + "grad_norm": 2.938752515020906, + "learning_rate": 9.979067121641508e-07, + "loss": 0.4212, + "step": 322 + }, + { + "epoch": 0.058514492753623186, + "grad_norm": 3.923096115868624, + "learning_rate": 9.97879808167903e-07, + "loss": 0.4178, + "step": 323 + }, + { + "epoch": 0.058695652173913045, + "grad_norm": 13.5987032976378, + "learning_rate": 9.978527327492782e-07, + "loss": 0.5078, + "step": 324 + }, + { + "epoch": 0.058876811594202896, + "grad_norm": 5.213367702287492, + "learning_rate": 9.978254859175989e-07, + "loss": 0.3997, + "step": 325 + }, + { + "epoch": 0.059057971014492755, + "grad_norm": 18.126153719639184, + "learning_rate": 9.977980676822457e-07, + "loss": 0.5056, + "step": 326 + }, + { + "epoch": 0.05923913043478261, + "grad_norm": 8.946691114858897, + "learning_rate": 9.977704780526595e-07, + "loss": 0.4235, + "step": 327 + }, + { + "epoch": 0.059420289855072465, + "grad_norm": 6.093794338617538, + "learning_rate": 9.97742717038339e-07, + "loss": 0.4073, + "step": 328 + }, + { + "epoch": 0.05960144927536232, + "grad_norm": 3.4112452861358578, + "learning_rate": 9.977147846488427e-07, + "loss": 0.4564, + "step": 329 + }, + { + "epoch": 0.059782608695652176, + "grad_norm": 7.300564120321209, + "learning_rate": 9.976866808937879e-07, + "loss": 0.4362, + "step": 330 + }, + { + "epoch": 0.05996376811594203, + "grad_norm": 5.495126326007882, + "learning_rate": 9.976584057828507e-07, + "loss": 0.4456, + "step": 331 + }, + { + "epoch": 0.060144927536231886, + "grad_norm": 12.94818234024048, + "learning_rate": 9.976299593257665e-07, + "loss": 0.3757, + "step": 332 + }, + { + "epoch": 0.06032608695652174, + "grad_norm": 5.127082909766672, + "learning_rate": 9.976013415323294e-07, + "loss": 0.4197, + "step": 333 + }, + { + "epoch": 0.060507246376811596, + "grad_norm": 7.707380300916236, + "learning_rate": 9.975725524123928e-07, + "loss": 0.5027, + "step": 334 + }, + { + "epoch": 0.06068840579710145, + "grad_norm": 2.991295798355372, + "learning_rate": 9.975435919758688e-07, + "loss": 0.4419, + "step": 335 + }, + { + "epoch": 0.06086956521739131, + "grad_norm": 11.079079182236235, + "learning_rate": 9.975144602327288e-07, + "loss": 0.4434, + "step": 336 + }, + { + "epoch": 0.06105072463768116, + "grad_norm": 18.89291545985812, + "learning_rate": 9.974851571930028e-07, + "loss": 0.467, + "step": 337 + }, + { + "epoch": 0.06123188405797102, + "grad_norm": 9.552257156822742, + "learning_rate": 9.9745568286678e-07, + "loss": 0.4831, + "step": 338 + }, + { + "epoch": 0.06141304347826087, + "grad_norm": 14.262346669578724, + "learning_rate": 9.974260372642085e-07, + "loss": 0.485, + "step": 339 + }, + { + "epoch": 0.06159420289855073, + "grad_norm": 4.106830280026516, + "learning_rate": 9.973962203954958e-07, + "loss": 0.4163, + "step": 340 + }, + { + "epoch": 0.06177536231884058, + "grad_norm": 6.518748191177843, + "learning_rate": 9.973662322709075e-07, + "loss": 0.4545, + "step": 341 + }, + { + "epoch": 0.06195652173913044, + "grad_norm": 6.564197662443871, + "learning_rate": 9.973360729007689e-07, + "loss": 0.4459, + "step": 342 + }, + { + "epoch": 0.06213768115942029, + "grad_norm": 5.044300770936432, + "learning_rate": 9.97305742295464e-07, + "loss": 0.5336, + "step": 343 + }, + { + "epoch": 0.06231884057971015, + "grad_norm": 3.7795514206422443, + "learning_rate": 9.972752404654356e-07, + "loss": 0.394, + "step": 344 + }, + { + "epoch": 0.0625, + "grad_norm": 3.8497984211877716, + "learning_rate": 9.972445674211858e-07, + "loss": 0.4195, + "step": 345 + }, + { + "epoch": 0.06268115942028986, + "grad_norm": 3.8332303461266717, + "learning_rate": 9.972137231732755e-07, + "loss": 0.3574, + "step": 346 + }, + { + "epoch": 0.0628623188405797, + "grad_norm": 4.271922254758665, + "learning_rate": 9.971827077323246e-07, + "loss": 0.4225, + "step": 347 + }, + { + "epoch": 0.06304347826086956, + "grad_norm": 7.1585652686951535, + "learning_rate": 9.971515211090116e-07, + "loss": 0.4087, + "step": 348 + }, + { + "epoch": 0.06322463768115942, + "grad_norm": 15.57564963359638, + "learning_rate": 9.971201633140745e-07, + "loss": 0.4055, + "step": 349 + }, + { + "epoch": 0.06340579710144928, + "grad_norm": 5.448346024226663, + "learning_rate": 9.970886343583096e-07, + "loss": 0.4016, + "step": 350 + }, + { + "epoch": 0.06358695652173912, + "grad_norm": 13.117456422021675, + "learning_rate": 9.970569342525725e-07, + "loss": 0.4028, + "step": 351 + }, + { + "epoch": 0.06376811594202898, + "grad_norm": 6.0202611923497455, + "learning_rate": 9.97025063007778e-07, + "loss": 0.4211, + "step": 352 + }, + { + "epoch": 0.06394927536231884, + "grad_norm": 6.870526620490523, + "learning_rate": 9.969930206348993e-07, + "loss": 0.4077, + "step": 353 + }, + { + "epoch": 0.0641304347826087, + "grad_norm": 10.515467615290369, + "learning_rate": 9.969608071449688e-07, + "loss": 0.4629, + "step": 354 + }, + { + "epoch": 0.06431159420289854, + "grad_norm": 11.80631388821243, + "learning_rate": 9.969284225490778e-07, + "loss": 0.4268, + "step": 355 + }, + { + "epoch": 0.0644927536231884, + "grad_norm": 7.6392809525616725, + "learning_rate": 9.968958668583764e-07, + "loss": 0.4094, + "step": 356 + }, + { + "epoch": 0.06467391304347826, + "grad_norm": 9.29797531345794, + "learning_rate": 9.968631400840736e-07, + "loss": 0.4138, + "step": 357 + }, + { + "epoch": 0.06485507246376812, + "grad_norm": 6.846604831154082, + "learning_rate": 9.968302422374377e-07, + "loss": 0.4102, + "step": 358 + }, + { + "epoch": 0.06503623188405797, + "grad_norm": 6.0701576869940315, + "learning_rate": 9.967971733297954e-07, + "loss": 0.5007, + "step": 359 + }, + { + "epoch": 0.06521739130434782, + "grad_norm": 3.411764228446719, + "learning_rate": 9.967639333725321e-07, + "loss": 0.4359, + "step": 360 + }, + { + "epoch": 0.06539855072463768, + "grad_norm": 6.689856685176495, + "learning_rate": 9.96730522377093e-07, + "loss": 0.465, + "step": 361 + }, + { + "epoch": 0.06557971014492754, + "grad_norm": 4.401029651137588, + "learning_rate": 9.966969403549816e-07, + "loss": 0.4526, + "step": 362 + }, + { + "epoch": 0.06576086956521739, + "grad_norm": 8.21724962171842, + "learning_rate": 9.9666318731776e-07, + "loss": 0.4962, + "step": 363 + }, + { + "epoch": 0.06594202898550725, + "grad_norm": 3.065975264812797, + "learning_rate": 9.9662926327705e-07, + "loss": 0.3774, + "step": 364 + }, + { + "epoch": 0.0661231884057971, + "grad_norm": 3.1515456547780785, + "learning_rate": 9.965951682445316e-07, + "loss": 0.4674, + "step": 365 + }, + { + "epoch": 0.06630434782608696, + "grad_norm": 8.873169062696592, + "learning_rate": 9.965609022319436e-07, + "loss": 0.4257, + "step": 366 + }, + { + "epoch": 0.06648550724637681, + "grad_norm": 7.077801766150442, + "learning_rate": 9.965264652510844e-07, + "loss": 0.4055, + "step": 367 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 2.9996172220762496, + "learning_rate": 9.964918573138104e-07, + "loss": 0.3944, + "step": 368 + }, + { + "epoch": 0.06684782608695652, + "grad_norm": 10.369047989122329, + "learning_rate": 9.964570784320377e-07, + "loss": 0.4819, + "step": 369 + }, + { + "epoch": 0.06702898550724638, + "grad_norm": 4.9460142568376355, + "learning_rate": 9.964221286177406e-07, + "loss": 0.394, + "step": 370 + }, + { + "epoch": 0.06721014492753623, + "grad_norm": 3.893525051628319, + "learning_rate": 9.963870078829525e-07, + "loss": 0.4529, + "step": 371 + }, + { + "epoch": 0.06739130434782609, + "grad_norm": 4.304044234640589, + "learning_rate": 9.963517162397657e-07, + "loss": 0.3751, + "step": 372 + }, + { + "epoch": 0.06757246376811595, + "grad_norm": 7.732223667642914, + "learning_rate": 9.963162537003312e-07, + "loss": 0.3972, + "step": 373 + }, + { + "epoch": 0.0677536231884058, + "grad_norm": 6.0130996151181595, + "learning_rate": 9.96280620276859e-07, + "loss": 0.3921, + "step": 374 + }, + { + "epoch": 0.06793478260869565, + "grad_norm": 8.460173035380654, + "learning_rate": 9.962448159816177e-07, + "loss": 0.3807, + "step": 375 + }, + { + "epoch": 0.06811594202898551, + "grad_norm": 4.819996735574024, + "learning_rate": 9.962088408269352e-07, + "loss": 0.3499, + "step": 376 + }, + { + "epoch": 0.06829710144927537, + "grad_norm": 3.3718245778120006, + "learning_rate": 9.961726948251974e-07, + "loss": 0.4545, + "step": 377 + }, + { + "epoch": 0.06847826086956521, + "grad_norm": 7.821014083891689, + "learning_rate": 9.9613637798885e-07, + "loss": 0.4446, + "step": 378 + }, + { + "epoch": 0.06865942028985507, + "grad_norm": 6.309749251328278, + "learning_rate": 9.960998903303972e-07, + "loss": 0.4193, + "step": 379 + }, + { + "epoch": 0.06884057971014493, + "grad_norm": 6.135018571957307, + "learning_rate": 9.960632318624013e-07, + "loss": 0.4197, + "step": 380 + }, + { + "epoch": 0.06902173913043479, + "grad_norm": 2.797929551699773, + "learning_rate": 9.960264025974843e-07, + "loss": 0.4111, + "step": 381 + }, + { + "epoch": 0.06920289855072463, + "grad_norm": 4.3193744861577255, + "learning_rate": 9.959894025483267e-07, + "loss": 0.4464, + "step": 382 + }, + { + "epoch": 0.06938405797101449, + "grad_norm": 3.112028280208757, + "learning_rate": 9.959522317276677e-07, + "loss": 0.3816, + "step": 383 + }, + { + "epoch": 0.06956521739130435, + "grad_norm": 5.335998953603253, + "learning_rate": 9.959148901483054e-07, + "loss": 0.4342, + "step": 384 + }, + { + "epoch": 0.06974637681159421, + "grad_norm": 9.249965647740233, + "learning_rate": 9.95877377823097e-07, + "loss": 0.4338, + "step": 385 + }, + { + "epoch": 0.06992753623188405, + "grad_norm": 3.925470899440518, + "learning_rate": 9.958396947649576e-07, + "loss": 0.3627, + "step": 386 + }, + { + "epoch": 0.07010869565217391, + "grad_norm": 3.9316397537793297, + "learning_rate": 9.95801840986862e-07, + "loss": 0.428, + "step": 387 + }, + { + "epoch": 0.07028985507246377, + "grad_norm": 4.057789454179014, + "learning_rate": 9.957638165018436e-07, + "loss": 0.4597, + "step": 388 + }, + { + "epoch": 0.07047101449275363, + "grad_norm": 5.255851869069346, + "learning_rate": 9.957256213229941e-07, + "loss": 0.4545, + "step": 389 + }, + { + "epoch": 0.07065217391304347, + "grad_norm": 13.445809799619013, + "learning_rate": 9.956872554634643e-07, + "loss": 0.3859, + "step": 390 + }, + { + "epoch": 0.07083333333333333, + "grad_norm": 7.045721748771448, + "learning_rate": 9.95648718936464e-07, + "loss": 0.4236, + "step": 391 + }, + { + "epoch": 0.07101449275362319, + "grad_norm": 10.569858991413426, + "learning_rate": 9.95610011755261e-07, + "loss": 0.3715, + "step": 392 + }, + { + "epoch": 0.07119565217391305, + "grad_norm": 5.582768581984532, + "learning_rate": 9.95571133933183e-07, + "loss": 0.4978, + "step": 393 + }, + { + "epoch": 0.0713768115942029, + "grad_norm": 8.552602155127621, + "learning_rate": 9.955320854836154e-07, + "loss": 0.3998, + "step": 394 + }, + { + "epoch": 0.07155797101449275, + "grad_norm": 4.5124503893903185, + "learning_rate": 9.954928664200028e-07, + "loss": 0.4288, + "step": 395 + }, + { + "epoch": 0.07173913043478261, + "grad_norm": 5.905639449788327, + "learning_rate": 9.954534767558488e-07, + "loss": 0.4193, + "step": 396 + }, + { + "epoch": 0.07192028985507247, + "grad_norm": 4.068999111492898, + "learning_rate": 9.954139165047153e-07, + "loss": 0.4124, + "step": 397 + }, + { + "epoch": 0.07210144927536231, + "grad_norm": 7.630135633731, + "learning_rate": 9.953741856802226e-07, + "loss": 0.4202, + "step": 398 + }, + { + "epoch": 0.07228260869565217, + "grad_norm": 5.424395768705924, + "learning_rate": 9.95334284296051e-07, + "loss": 0.3836, + "step": 399 + }, + { + "epoch": 0.07246376811594203, + "grad_norm": 7.323502213352483, + "learning_rate": 9.952942123659383e-07, + "loss": 0.4171, + "step": 400 + }, + { + "epoch": 0.07246376811594203, + "eval_loss": 0.439328134059906, + "eval_runtime": 9.758, + "eval_samples_per_second": 51.24, + "eval_steps_per_second": 0.102, + "step": 400 + }, + { + "epoch": 0.07264492753623189, + "grad_norm": 9.340926912974801, + "learning_rate": 9.952539699036817e-07, + "loss": 0.4515, + "step": 401 + }, + { + "epoch": 0.07282608695652174, + "grad_norm": 7.860902436954148, + "learning_rate": 9.952135569231364e-07, + "loss": 0.4265, + "step": 402 + }, + { + "epoch": 0.0730072463768116, + "grad_norm": 12.500932961139199, + "learning_rate": 9.951729734382173e-07, + "loss": 0.45, + "step": 403 + }, + { + "epoch": 0.07318840579710145, + "grad_norm": 6.523900108438604, + "learning_rate": 9.95132219462897e-07, + "loss": 0.4467, + "step": 404 + }, + { + "epoch": 0.07336956521739131, + "grad_norm": 6.27576837518242, + "learning_rate": 9.950912950112078e-07, + "loss": 0.457, + "step": 405 + }, + { + "epoch": 0.07355072463768116, + "grad_norm": 7.767324684551404, + "learning_rate": 9.9505020009724e-07, + "loss": 0.4264, + "step": 406 + }, + { + "epoch": 0.07373188405797101, + "grad_norm": 6.471263780811469, + "learning_rate": 9.950089347351424e-07, + "loss": 0.3779, + "step": 407 + }, + { + "epoch": 0.07391304347826087, + "grad_norm": 3.4477486851014305, + "learning_rate": 9.949674989391235e-07, + "loss": 0.3806, + "step": 408 + }, + { + "epoch": 0.07409420289855072, + "grad_norm": 3.253994445320859, + "learning_rate": 9.949258927234493e-07, + "loss": 0.4117, + "step": 409 + }, + { + "epoch": 0.07427536231884058, + "grad_norm": 3.0371762069799297, + "learning_rate": 9.948841161024452e-07, + "loss": 0.4017, + "step": 410 + }, + { + "epoch": 0.07445652173913044, + "grad_norm": 3.53851217966538, + "learning_rate": 9.948421690904953e-07, + "loss": 0.3984, + "step": 411 + }, + { + "epoch": 0.0746376811594203, + "grad_norm": 10.225663070573427, + "learning_rate": 9.94800051702042e-07, + "loss": 0.4229, + "step": 412 + }, + { + "epoch": 0.07481884057971014, + "grad_norm": 11.11255647953521, + "learning_rate": 9.947577639515862e-07, + "loss": 0.4225, + "step": 413 + }, + { + "epoch": 0.075, + "grad_norm": 6.892373928834471, + "learning_rate": 9.947153058536882e-07, + "loss": 0.4576, + "step": 414 + }, + { + "epoch": 0.07518115942028986, + "grad_norm": 3.0793916374965153, + "learning_rate": 9.946726774229664e-07, + "loss": 0.4047, + "step": 415 + }, + { + "epoch": 0.07536231884057971, + "grad_norm": 5.650987984564611, + "learning_rate": 9.94629878674098e-07, + "loss": 0.4232, + "step": 416 + }, + { + "epoch": 0.07554347826086956, + "grad_norm": 5.125240624974368, + "learning_rate": 9.945869096218188e-07, + "loss": 0.3933, + "step": 417 + }, + { + "epoch": 0.07572463768115942, + "grad_norm": 6.004904831867239, + "learning_rate": 9.94543770280923e-07, + "loss": 0.4163, + "step": 418 + }, + { + "epoch": 0.07590579710144928, + "grad_norm": 12.202913838238823, + "learning_rate": 9.945004606662642e-07, + "loss": 0.4258, + "step": 419 + }, + { + "epoch": 0.07608695652173914, + "grad_norm": 19.806067834435986, + "learning_rate": 9.944569807927534e-07, + "loss": 0.4641, + "step": 420 + }, + { + "epoch": 0.07626811594202898, + "grad_norm": 21.45097326185226, + "learning_rate": 9.944133306753616e-07, + "loss": 0.4027, + "step": 421 + }, + { + "epoch": 0.07644927536231884, + "grad_norm": 15.711150416386833, + "learning_rate": 9.943695103291175e-07, + "loss": 0.408, + "step": 422 + }, + { + "epoch": 0.0766304347826087, + "grad_norm": 5.636549116664538, + "learning_rate": 9.943255197691085e-07, + "loss": 0.4625, + "step": 423 + }, + { + "epoch": 0.07681159420289856, + "grad_norm": 6.729199348614858, + "learning_rate": 9.94281359010481e-07, + "loss": 0.4002, + "step": 424 + }, + { + "epoch": 0.0769927536231884, + "grad_norm": 3.8159797431684983, + "learning_rate": 9.942370280684396e-07, + "loss": 0.4036, + "step": 425 + }, + { + "epoch": 0.07717391304347826, + "grad_norm": 6.753286773647914, + "learning_rate": 9.941925269582477e-07, + "loss": 0.3701, + "step": 426 + }, + { + "epoch": 0.07735507246376812, + "grad_norm": 11.310994484797527, + "learning_rate": 9.94147855695227e-07, + "loss": 0.4587, + "step": 427 + }, + { + "epoch": 0.07753623188405798, + "grad_norm": 11.5073236225397, + "learning_rate": 9.941030142947586e-07, + "loss": 0.4494, + "step": 428 + }, + { + "epoch": 0.07771739130434782, + "grad_norm": 5.733456623019978, + "learning_rate": 9.94058002772281e-07, + "loss": 0.4662, + "step": 429 + }, + { + "epoch": 0.07789855072463768, + "grad_norm": 3.0184337786908553, + "learning_rate": 9.940128211432923e-07, + "loss": 0.3815, + "step": 430 + }, + { + "epoch": 0.07807971014492754, + "grad_norm": 3.7515337294113276, + "learning_rate": 9.939674694233487e-07, + "loss": 0.4439, + "step": 431 + }, + { + "epoch": 0.0782608695652174, + "grad_norm": 3.6615852173475627, + "learning_rate": 9.939219476280648e-07, + "loss": 0.4047, + "step": 432 + }, + { + "epoch": 0.07844202898550724, + "grad_norm": 3.7865592767186, + "learning_rate": 9.93876255773114e-07, + "loss": 0.4135, + "step": 433 + }, + { + "epoch": 0.0786231884057971, + "grad_norm": 4.890425711346586, + "learning_rate": 9.938303938742284e-07, + "loss": 0.4216, + "step": 434 + }, + { + "epoch": 0.07880434782608696, + "grad_norm": 8.781852176695326, + "learning_rate": 9.937843619471984e-07, + "loss": 0.4204, + "step": 435 + }, + { + "epoch": 0.07898550724637682, + "grad_norm": 5.451014653615925, + "learning_rate": 9.93738160007873e-07, + "loss": 0.4648, + "step": 436 + }, + { + "epoch": 0.07916666666666666, + "grad_norm": 6.440770007617795, + "learning_rate": 9.936917880721596e-07, + "loss": 0.4434, + "step": 437 + }, + { + "epoch": 0.07934782608695652, + "grad_norm": 5.793893214872514, + "learning_rate": 9.936452461560242e-07, + "loss": 0.4867, + "step": 438 + }, + { + "epoch": 0.07952898550724638, + "grad_norm": 3.1069520285809995, + "learning_rate": 9.93598534275492e-07, + "loss": 0.3624, + "step": 439 + }, + { + "epoch": 0.07971014492753623, + "grad_norm": 5.282723070651829, + "learning_rate": 9.935516524466456e-07, + "loss": 0.4459, + "step": 440 + }, + { + "epoch": 0.07989130434782608, + "grad_norm": 3.8561441294534524, + "learning_rate": 9.935046006856269e-07, + "loss": 0.4055, + "step": 441 + }, + { + "epoch": 0.08007246376811594, + "grad_norm": 11.283086680324008, + "learning_rate": 9.934573790086355e-07, + "loss": 0.3904, + "step": 442 + }, + { + "epoch": 0.0802536231884058, + "grad_norm": 5.699597939314377, + "learning_rate": 9.93409987431931e-07, + "loss": 0.4396, + "step": 443 + }, + { + "epoch": 0.08043478260869565, + "grad_norm": 6.436041850723825, + "learning_rate": 9.933624259718295e-07, + "loss": 0.3696, + "step": 444 + }, + { + "epoch": 0.0806159420289855, + "grad_norm": 5.295092388071156, + "learning_rate": 9.933146946447075e-07, + "loss": 0.3792, + "step": 445 + }, + { + "epoch": 0.08079710144927536, + "grad_norm": 4.352238066478703, + "learning_rate": 9.932667934669985e-07, + "loss": 0.4491, + "step": 446 + }, + { + "epoch": 0.08097826086956522, + "grad_norm": 7.410705472114366, + "learning_rate": 9.932187224551955e-07, + "loss": 0.3228, + "step": 447 + }, + { + "epoch": 0.08115942028985507, + "grad_norm": 11.511784915461405, + "learning_rate": 9.931704816258494e-07, + "loss": 0.395, + "step": 448 + }, + { + "epoch": 0.08134057971014493, + "grad_norm": 11.001562864344784, + "learning_rate": 9.931220709955698e-07, + "loss": 0.4397, + "step": 449 + }, + { + "epoch": 0.08152173913043478, + "grad_norm": 3.949317430969973, + "learning_rate": 9.930734905810248e-07, + "loss": 0.3653, + "step": 450 + }, + { + "epoch": 0.08170289855072464, + "grad_norm": 6.613952449363454, + "learning_rate": 9.930247403989407e-07, + "loss": 0.4393, + "step": 451 + }, + { + "epoch": 0.08188405797101449, + "grad_norm": 13.77915475688408, + "learning_rate": 9.929758204661026e-07, + "loss": 0.426, + "step": 452 + }, + { + "epoch": 0.08206521739130435, + "grad_norm": 19.507550047347742, + "learning_rate": 9.929267307993535e-07, + "loss": 0.4501, + "step": 453 + }, + { + "epoch": 0.0822463768115942, + "grad_norm": 5.963643178822908, + "learning_rate": 9.928774714155956e-07, + "loss": 0.3923, + "step": 454 + }, + { + "epoch": 0.08242753623188406, + "grad_norm": 15.061735947604319, + "learning_rate": 9.928280423317889e-07, + "loss": 0.4199, + "step": 455 + }, + { + "epoch": 0.08260869565217391, + "grad_norm": 6.680897612163597, + "learning_rate": 9.927784435649522e-07, + "loss": 0.4019, + "step": 456 + }, + { + "epoch": 0.08278985507246377, + "grad_norm": 4.3251429861266795, + "learning_rate": 9.927286751321625e-07, + "loss": 0.4664, + "step": 457 + }, + { + "epoch": 0.08297101449275363, + "grad_norm": 10.960962341505919, + "learning_rate": 9.926787370505555e-07, + "loss": 0.4075, + "step": 458 + }, + { + "epoch": 0.08315217391304348, + "grad_norm": 7.749145911295322, + "learning_rate": 9.926286293373247e-07, + "loss": 0.407, + "step": 459 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 3.729712142380696, + "learning_rate": 9.925783520097232e-07, + "loss": 0.4122, + "step": 460 + }, + { + "epoch": 0.08351449275362319, + "grad_norm": 3.8514934585283225, + "learning_rate": 9.925279050850607e-07, + "loss": 0.3975, + "step": 461 + }, + { + "epoch": 0.08369565217391305, + "grad_norm": 4.7496888468099785, + "learning_rate": 9.92477288580707e-07, + "loss": 0.4564, + "step": 462 + }, + { + "epoch": 0.0838768115942029, + "grad_norm": 3.709577026722062, + "learning_rate": 9.924265025140895e-07, + "loss": 0.4185, + "step": 463 + }, + { + "epoch": 0.08405797101449275, + "grad_norm": 9.222652780805598, + "learning_rate": 9.92375546902694e-07, + "loss": 0.4202, + "step": 464 + }, + { + "epoch": 0.08423913043478261, + "grad_norm": 5.765529327125465, + "learning_rate": 9.923244217640648e-07, + "loss": 0.4355, + "step": 465 + }, + { + "epoch": 0.08442028985507247, + "grad_norm": 5.914602480754362, + "learning_rate": 9.922731271158043e-07, + "loss": 0.3654, + "step": 466 + }, + { + "epoch": 0.08460144927536233, + "grad_norm": 3.079931421155299, + "learning_rate": 9.922216629755738e-07, + "loss": 0.4221, + "step": 467 + }, + { + "epoch": 0.08478260869565217, + "grad_norm": 4.747874203838646, + "learning_rate": 9.921700293610927e-07, + "loss": 0.4155, + "step": 468 + }, + { + "epoch": 0.08496376811594203, + "grad_norm": 3.88032264069236, + "learning_rate": 9.921182262901385e-07, + "loss": 0.4491, + "step": 469 + }, + { + "epoch": 0.08514492753623189, + "grad_norm": 5.187752169289457, + "learning_rate": 9.92066253780547e-07, + "loss": 0.3927, + "step": 470 + }, + { + "epoch": 0.08532608695652173, + "grad_norm": 8.771015796895298, + "learning_rate": 9.920141118502132e-07, + "loss": 0.4647, + "step": 471 + }, + { + "epoch": 0.08550724637681159, + "grad_norm": 6.553980309219007, + "learning_rate": 9.919618005170894e-07, + "loss": 0.4016, + "step": 472 + }, + { + "epoch": 0.08568840579710145, + "grad_norm": 3.4027620169061112, + "learning_rate": 9.919093197991866e-07, + "loss": 0.4774, + "step": 473 + }, + { + "epoch": 0.08586956521739131, + "grad_norm": 4.949986845985805, + "learning_rate": 9.918566697145744e-07, + "loss": 0.3714, + "step": 474 + }, + { + "epoch": 0.08605072463768115, + "grad_norm": 3.6963389796294885, + "learning_rate": 9.918038502813803e-07, + "loss": 0.4652, + "step": 475 + }, + { + "epoch": 0.08623188405797101, + "grad_norm": 6.434617160328624, + "learning_rate": 9.917508615177903e-07, + "loss": 0.392, + "step": 476 + }, + { + "epoch": 0.08641304347826087, + "grad_norm": 3.4434552084107035, + "learning_rate": 9.91697703442049e-07, + "loss": 0.4035, + "step": 477 + }, + { + "epoch": 0.08659420289855073, + "grad_norm": 6.467189772046257, + "learning_rate": 9.916443760724582e-07, + "loss": 0.3875, + "step": 478 + }, + { + "epoch": 0.08677536231884057, + "grad_norm": 4.818885264761668, + "learning_rate": 9.915908794273796e-07, + "loss": 0.451, + "step": 479 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 5.608456207161903, + "learning_rate": 9.915372135252317e-07, + "loss": 0.439, + "step": 480 + }, + { + "epoch": 0.08713768115942029, + "grad_norm": 4.436576067151408, + "learning_rate": 9.914833783844926e-07, + "loss": 0.4002, + "step": 481 + }, + { + "epoch": 0.08731884057971015, + "grad_norm": 7.373402547890791, + "learning_rate": 9.914293740236974e-07, + "loss": 0.4265, + "step": 482 + }, + { + "epoch": 0.0875, + "grad_norm": 6.5813453782574145, + "learning_rate": 9.913752004614404e-07, + "loss": 0.4747, + "step": 483 + }, + { + "epoch": 0.08768115942028985, + "grad_norm": 4.546582872445812, + "learning_rate": 9.913208577163736e-07, + "loss": 0.3824, + "step": 484 + }, + { + "epoch": 0.08786231884057971, + "grad_norm": 3.514233704435464, + "learning_rate": 9.912663458072077e-07, + "loss": 0.3755, + "step": 485 + }, + { + "epoch": 0.08804347826086957, + "grad_norm": 6.692949143335827, + "learning_rate": 9.91211664752711e-07, + "loss": 0.4484, + "step": 486 + }, + { + "epoch": 0.08822463768115942, + "grad_norm": 8.375534585210136, + "learning_rate": 9.91156814571711e-07, + "loss": 0.4218, + "step": 487 + }, + { + "epoch": 0.08840579710144927, + "grad_norm": 3.356234671918528, + "learning_rate": 9.911017952830926e-07, + "loss": 0.3743, + "step": 488 + }, + { + "epoch": 0.08858695652173913, + "grad_norm": 3.0297919611763287, + "learning_rate": 9.91046606905799e-07, + "loss": 0.4025, + "step": 489 + }, + { + "epoch": 0.08876811594202899, + "grad_norm": 12.139794903176407, + "learning_rate": 9.90991249458832e-07, + "loss": 0.4568, + "step": 490 + }, + { + "epoch": 0.08894927536231884, + "grad_norm": 14.89649793798213, + "learning_rate": 9.909357229612516e-07, + "loss": 0.4219, + "step": 491 + }, + { + "epoch": 0.0891304347826087, + "grad_norm": 9.39213140678959, + "learning_rate": 9.908800274321757e-07, + "loss": 0.4205, + "step": 492 + }, + { + "epoch": 0.08931159420289855, + "grad_norm": 11.847798168359438, + "learning_rate": 9.908241628907806e-07, + "loss": 0.3959, + "step": 493 + }, + { + "epoch": 0.08949275362318841, + "grad_norm": 4.464857190048925, + "learning_rate": 9.907681293563004e-07, + "loss": 0.414, + "step": 494 + }, + { + "epoch": 0.08967391304347826, + "grad_norm": 3.6050304449575177, + "learning_rate": 9.90711926848028e-07, + "loss": 0.4686, + "step": 495 + }, + { + "epoch": 0.08985507246376812, + "grad_norm": 4.346200294431187, + "learning_rate": 9.906555553853142e-07, + "loss": 0.3239, + "step": 496 + }, + { + "epoch": 0.09003623188405797, + "grad_norm": 5.373120289089078, + "learning_rate": 9.90599014987568e-07, + "loss": 0.4218, + "step": 497 + }, + { + "epoch": 0.09021739130434783, + "grad_norm": 13.948418387535892, + "learning_rate": 9.905423056742561e-07, + "loss": 0.4207, + "step": 498 + }, + { + "epoch": 0.09039855072463768, + "grad_norm": 9.299621889157287, + "learning_rate": 9.904854274649045e-07, + "loss": 0.4248, + "step": 499 + }, + { + "epoch": 0.09057971014492754, + "grad_norm": 3.7046528030287362, + "learning_rate": 9.90428380379096e-07, + "loss": 0.4031, + "step": 500 + }, + { + "epoch": 0.09057971014492754, + "eval_loss": 0.4216562509536743, + "eval_runtime": 9.762, + "eval_samples_per_second": 51.219, + "eval_steps_per_second": 0.102, + "step": 500 + }, + { + "epoch": 0.0907608695652174, + "grad_norm": 4.909743322977319, + "learning_rate": 9.903711644364726e-07, + "loss": 0.3991, + "step": 501 + }, + { + "epoch": 0.09094202898550724, + "grad_norm": 3.2100999274972066, + "learning_rate": 9.903137796567338e-07, + "loss": 0.3815, + "step": 502 + }, + { + "epoch": 0.0911231884057971, + "grad_norm": 8.237659705884834, + "learning_rate": 9.902562260596374e-07, + "loss": 0.3948, + "step": 503 + }, + { + "epoch": 0.09130434782608696, + "grad_norm": 4.880081596683351, + "learning_rate": 9.901985036649997e-07, + "loss": 0.3484, + "step": 504 + }, + { + "epoch": 0.09148550724637682, + "grad_norm": 8.958099383680779, + "learning_rate": 9.901406124926945e-07, + "loss": 0.3733, + "step": 505 + }, + { + "epoch": 0.09166666666666666, + "grad_norm": 8.110912513386152, + "learning_rate": 9.90082552562654e-07, + "loss": 0.3782, + "step": 506 + }, + { + "epoch": 0.09184782608695652, + "grad_norm": 6.328915402744954, + "learning_rate": 9.900243238948686e-07, + "loss": 0.4155, + "step": 507 + }, + { + "epoch": 0.09202898550724638, + "grad_norm": 7.345956243956656, + "learning_rate": 9.899659265093867e-07, + "loss": 0.4436, + "step": 508 + }, + { + "epoch": 0.09221014492753624, + "grad_norm": 7.499421591266258, + "learning_rate": 9.89907360426315e-07, + "loss": 0.3823, + "step": 509 + }, + { + "epoch": 0.09239130434782608, + "grad_norm": 8.795202176113806, + "learning_rate": 9.898486256658176e-07, + "loss": 0.4067, + "step": 510 + }, + { + "epoch": 0.09257246376811594, + "grad_norm": 8.561886093558764, + "learning_rate": 9.897897222481176e-07, + "loss": 0.4363, + "step": 511 + }, + { + "epoch": 0.0927536231884058, + "grad_norm": 3.4906015491203295, + "learning_rate": 9.897306501934954e-07, + "loss": 0.4357, + "step": 512 + }, + { + "epoch": 0.09293478260869566, + "grad_norm": 3.8086664027306103, + "learning_rate": 9.8967140952229e-07, + "loss": 0.4391, + "step": 513 + }, + { + "epoch": 0.0931159420289855, + "grad_norm": 4.021423759166613, + "learning_rate": 9.896120002548984e-07, + "loss": 0.428, + "step": 514 + }, + { + "epoch": 0.09329710144927536, + "grad_norm": 3.5751648538740204, + "learning_rate": 9.895524224117751e-07, + "loss": 0.4229, + "step": 515 + }, + { + "epoch": 0.09347826086956522, + "grad_norm": 4.377740980156112, + "learning_rate": 9.894926760134332e-07, + "loss": 0.4141, + "step": 516 + }, + { + "epoch": 0.09365942028985508, + "grad_norm": 6.746029498524841, + "learning_rate": 9.894327610804437e-07, + "loss": 0.4437, + "step": 517 + }, + { + "epoch": 0.09384057971014492, + "grad_norm": 4.662321750690026, + "learning_rate": 9.893726776334357e-07, + "loss": 0.3762, + "step": 518 + }, + { + "epoch": 0.09402173913043478, + "grad_norm": 6.028252363662997, + "learning_rate": 9.89312425693096e-07, + "loss": 0.381, + "step": 519 + }, + { + "epoch": 0.09420289855072464, + "grad_norm": 10.820973639970484, + "learning_rate": 9.892520052801696e-07, + "loss": 0.4092, + "step": 520 + }, + { + "epoch": 0.0943840579710145, + "grad_norm": 4.994053097632154, + "learning_rate": 9.891914164154597e-07, + "loss": 0.3728, + "step": 521 + }, + { + "epoch": 0.09456521739130434, + "grad_norm": 4.335073532593652, + "learning_rate": 9.891306591198273e-07, + "loss": 0.4326, + "step": 522 + }, + { + "epoch": 0.0947463768115942, + "grad_norm": 4.839108409033974, + "learning_rate": 9.890697334141917e-07, + "loss": 0.3738, + "step": 523 + }, + { + "epoch": 0.09492753623188406, + "grad_norm": 3.822368363444714, + "learning_rate": 9.890086393195293e-07, + "loss": 0.3946, + "step": 524 + }, + { + "epoch": 0.09510869565217392, + "grad_norm": 7.7936536704105, + "learning_rate": 9.889473768568756e-07, + "loss": 0.3494, + "step": 525 + }, + { + "epoch": 0.09528985507246376, + "grad_norm": 3.274470581444187, + "learning_rate": 9.888859460473233e-07, + "loss": 0.349, + "step": 526 + }, + { + "epoch": 0.09547101449275362, + "grad_norm": 3.521029986233931, + "learning_rate": 9.888243469120232e-07, + "loss": 0.3986, + "step": 527 + }, + { + "epoch": 0.09565217391304348, + "grad_norm": 4.716810008159726, + "learning_rate": 9.887625794721847e-07, + "loss": 0.3723, + "step": 528 + }, + { + "epoch": 0.09583333333333334, + "grad_norm": 3.1238151651325814, + "learning_rate": 9.88700643749074e-07, + "loss": 0.3514, + "step": 529 + }, + { + "epoch": 0.09601449275362318, + "grad_norm": 9.71183723127655, + "learning_rate": 9.886385397640164e-07, + "loss": 0.4037, + "step": 530 + }, + { + "epoch": 0.09619565217391304, + "grad_norm": 5.939099150935516, + "learning_rate": 9.885762675383942e-07, + "loss": 0.4487, + "step": 531 + }, + { + "epoch": 0.0963768115942029, + "grad_norm": 3.8273230631587127, + "learning_rate": 9.88513827093648e-07, + "loss": 0.4105, + "step": 532 + }, + { + "epoch": 0.09655797101449276, + "grad_norm": 5.485473164960536, + "learning_rate": 9.884512184512767e-07, + "loss": 0.4277, + "step": 533 + }, + { + "epoch": 0.0967391304347826, + "grad_norm": 5.031933316795367, + "learning_rate": 9.883884416328366e-07, + "loss": 0.39, + "step": 534 + }, + { + "epoch": 0.09692028985507246, + "grad_norm": 3.7205972464354407, + "learning_rate": 9.883254966599419e-07, + "loss": 0.3806, + "step": 535 + }, + { + "epoch": 0.09710144927536232, + "grad_norm": 3.7845619210949937, + "learning_rate": 9.882623835542648e-07, + "loss": 0.4013, + "step": 536 + }, + { + "epoch": 0.09728260869565217, + "grad_norm": 4.897714959472158, + "learning_rate": 9.881991023375361e-07, + "loss": 0.3812, + "step": 537 + }, + { + "epoch": 0.09746376811594203, + "grad_norm": 4.205051144146041, + "learning_rate": 9.88135653031543e-07, + "loss": 0.391, + "step": 538 + }, + { + "epoch": 0.09764492753623188, + "grad_norm": 5.9123946130328635, + "learning_rate": 9.88072035658132e-07, + "loss": 0.4523, + "step": 539 + }, + { + "epoch": 0.09782608695652174, + "grad_norm": 4.035804154739938, + "learning_rate": 9.88008250239206e-07, + "loss": 0.4333, + "step": 540 + }, + { + "epoch": 0.09800724637681159, + "grad_norm": 3.6662815956216366, + "learning_rate": 9.879442967967277e-07, + "loss": 0.4395, + "step": 541 + }, + { + "epoch": 0.09818840579710145, + "grad_norm": 3.853913747890982, + "learning_rate": 9.87880175352716e-07, + "loss": 0.3721, + "step": 542 + }, + { + "epoch": 0.0983695652173913, + "grad_norm": 4.9243426979249, + "learning_rate": 9.87815885929248e-07, + "loss": 0.4215, + "step": 543 + }, + { + "epoch": 0.09855072463768116, + "grad_norm": 4.836214182798005, + "learning_rate": 9.877514285484596e-07, + "loss": 0.4345, + "step": 544 + }, + { + "epoch": 0.09873188405797101, + "grad_norm": 2.6599651591981437, + "learning_rate": 9.876868032325431e-07, + "loss": 0.3733, + "step": 545 + }, + { + "epoch": 0.09891304347826087, + "grad_norm": 3.89849000510434, + "learning_rate": 9.876220100037495e-07, + "loss": 0.4503, + "step": 546 + }, + { + "epoch": 0.09909420289855073, + "grad_norm": 5.7650794262787395, + "learning_rate": 9.875570488843877e-07, + "loss": 0.415, + "step": 547 + }, + { + "epoch": 0.09927536231884058, + "grad_norm": 5.045400385597417, + "learning_rate": 9.874919198968238e-07, + "loss": 0.3721, + "step": 548 + }, + { + "epoch": 0.09945652173913043, + "grad_norm": 10.845790348299131, + "learning_rate": 9.874266230634817e-07, + "loss": 0.3974, + "step": 549 + }, + { + "epoch": 0.09963768115942029, + "grad_norm": 5.820459631963228, + "learning_rate": 9.87361158406844e-07, + "loss": 0.4004, + "step": 550 + }, + { + "epoch": 0.09981884057971015, + "grad_norm": 4.982527003568789, + "learning_rate": 9.872955259494507e-07, + "loss": 0.4426, + "step": 551 + }, + { + "epoch": 0.1, + "grad_norm": 4.09022491278035, + "learning_rate": 9.872297257138986e-07, + "loss": 0.4067, + "step": 552 + }, + { + "epoch": 0.10018115942028985, + "grad_norm": 4.0414344654466925, + "learning_rate": 9.871637577228435e-07, + "loss": 0.403, + "step": 553 + }, + { + "epoch": 0.10036231884057971, + "grad_norm": 5.408776258682077, + "learning_rate": 9.870976219989983e-07, + "loss": 0.392, + "step": 554 + }, + { + "epoch": 0.10054347826086957, + "grad_norm": 6.118416413008888, + "learning_rate": 9.87031318565134e-07, + "loss": 0.3667, + "step": 555 + }, + { + "epoch": 0.10072463768115943, + "grad_norm": 3.713821558799002, + "learning_rate": 9.86964847444079e-07, + "loss": 0.4355, + "step": 556 + }, + { + "epoch": 0.10090579710144927, + "grad_norm": 5.084458387294805, + "learning_rate": 9.868982086587198e-07, + "loss": 0.3884, + "step": 557 + }, + { + "epoch": 0.10108695652173913, + "grad_norm": 5.474832946669612, + "learning_rate": 9.868314022320004e-07, + "loss": 0.4503, + "step": 558 + }, + { + "epoch": 0.10126811594202899, + "grad_norm": 8.965183447895956, + "learning_rate": 9.867644281869225e-07, + "loss": 0.4244, + "step": 559 + }, + { + "epoch": 0.10144927536231885, + "grad_norm": 6.668905041203743, + "learning_rate": 9.866972865465458e-07, + "loss": 0.4434, + "step": 560 + }, + { + "epoch": 0.10163043478260869, + "grad_norm": 9.970457044149805, + "learning_rate": 9.866299773339872e-07, + "loss": 0.3823, + "step": 561 + }, + { + "epoch": 0.10181159420289855, + "grad_norm": 4.516665327587777, + "learning_rate": 9.865625005724218e-07, + "loss": 0.4406, + "step": 562 + }, + { + "epoch": 0.10199275362318841, + "grad_norm": 4.297869573489257, + "learning_rate": 9.86494856285082e-07, + "loss": 0.3742, + "step": 563 + }, + { + "epoch": 0.10217391304347827, + "grad_norm": 2.9766661533359096, + "learning_rate": 9.86427044495258e-07, + "loss": 0.402, + "step": 564 + }, + { + "epoch": 0.10235507246376811, + "grad_norm": 2.9515335020993776, + "learning_rate": 9.86359065226298e-07, + "loss": 0.3802, + "step": 565 + }, + { + "epoch": 0.10253623188405797, + "grad_norm": 6.602180704091807, + "learning_rate": 9.862909185016075e-07, + "loss": 0.4214, + "step": 566 + }, + { + "epoch": 0.10271739130434783, + "grad_norm": 6.746499100917246, + "learning_rate": 9.862226043446495e-07, + "loss": 0.3825, + "step": 567 + }, + { + "epoch": 0.10289855072463767, + "grad_norm": 3.6099067207943136, + "learning_rate": 9.861541227789453e-07, + "loss": 0.3969, + "step": 568 + }, + { + "epoch": 0.10307971014492753, + "grad_norm": 13.761913336067417, + "learning_rate": 9.860854738280729e-07, + "loss": 0.4191, + "step": 569 + }, + { + "epoch": 0.10326086956521739, + "grad_norm": 6.489451433994243, + "learning_rate": 9.860166575156687e-07, + "loss": 0.3791, + "step": 570 + }, + { + "epoch": 0.10344202898550725, + "grad_norm": 4.314055565227107, + "learning_rate": 9.859476738654267e-07, + "loss": 0.4182, + "step": 571 + }, + { + "epoch": 0.1036231884057971, + "grad_norm": 4.69387661324399, + "learning_rate": 9.85878522901098e-07, + "loss": 0.4029, + "step": 572 + }, + { + "epoch": 0.10380434782608695, + "grad_norm": 10.001203485806512, + "learning_rate": 9.858092046464918e-07, + "loss": 0.4319, + "step": 573 + }, + { + "epoch": 0.10398550724637681, + "grad_norm": 10.494122030255477, + "learning_rate": 9.857397191254745e-07, + "loss": 0.3797, + "step": 574 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 5.345093980495571, + "learning_rate": 9.856700663619706e-07, + "loss": 0.4002, + "step": 575 + }, + { + "epoch": 0.10434782608695652, + "grad_norm": 4.908460973089521, + "learning_rate": 9.856002463799618e-07, + "loss": 0.3571, + "step": 576 + }, + { + "epoch": 0.10452898550724637, + "grad_norm": 2.7921889624281984, + "learning_rate": 9.85530259203487e-07, + "loss": 0.3745, + "step": 577 + }, + { + "epoch": 0.10471014492753623, + "grad_norm": 14.436192603938819, + "learning_rate": 9.854601048566439e-07, + "loss": 0.4241, + "step": 578 + }, + { + "epoch": 0.10489130434782609, + "grad_norm": 2.92397445519826, + "learning_rate": 9.853897833635862e-07, + "loss": 0.4128, + "step": 579 + }, + { + "epoch": 0.10507246376811594, + "grad_norm": 10.900420049048746, + "learning_rate": 9.853192947485264e-07, + "loss": 0.3661, + "step": 580 + }, + { + "epoch": 0.1052536231884058, + "grad_norm": 14.626108636879042, + "learning_rate": 9.852486390357338e-07, + "loss": 0.3942, + "step": 581 + }, + { + "epoch": 0.10543478260869565, + "grad_norm": 8.30319118243175, + "learning_rate": 9.851778162495356e-07, + "loss": 0.3694, + "step": 582 + }, + { + "epoch": 0.10561594202898551, + "grad_norm": 8.169036934257266, + "learning_rate": 9.851068264143167e-07, + "loss": 0.4099, + "step": 583 + }, + { + "epoch": 0.10579710144927536, + "grad_norm": 2.9906093830436498, + "learning_rate": 9.850356695545188e-07, + "loss": 0.3566, + "step": 584 + }, + { + "epoch": 0.10597826086956522, + "grad_norm": 3.6001680612909825, + "learning_rate": 9.849643456946418e-07, + "loss": 0.4521, + "step": 585 + }, + { + "epoch": 0.10615942028985507, + "grad_norm": 6.085782026961147, + "learning_rate": 9.848928548592427e-07, + "loss": 0.3563, + "step": 586 + }, + { + "epoch": 0.10634057971014493, + "grad_norm": 7.688400779311696, + "learning_rate": 9.848211970729363e-07, + "loss": 0.4225, + "step": 587 + }, + { + "epoch": 0.10652173913043478, + "grad_norm": 5.300904388674724, + "learning_rate": 9.847493723603945e-07, + "loss": 0.4058, + "step": 588 + }, + { + "epoch": 0.10670289855072464, + "grad_norm": 11.21739541512806, + "learning_rate": 9.846773807463472e-07, + "loss": 0.4172, + "step": 589 + }, + { + "epoch": 0.1068840579710145, + "grad_norm": 8.766216878129292, + "learning_rate": 9.84605222255581e-07, + "loss": 0.3654, + "step": 590 + }, + { + "epoch": 0.10706521739130435, + "grad_norm": 12.331651847214227, + "learning_rate": 9.845328969129408e-07, + "loss": 0.4144, + "step": 591 + }, + { + "epoch": 0.1072463768115942, + "grad_norm": 6.724710853490635, + "learning_rate": 9.844604047433284e-07, + "loss": 0.3847, + "step": 592 + }, + { + "epoch": 0.10742753623188406, + "grad_norm": 3.7769319461712776, + "learning_rate": 9.84387745771703e-07, + "loss": 0.3605, + "step": 593 + }, + { + "epoch": 0.10760869565217392, + "grad_norm": 9.274282451844847, + "learning_rate": 9.84314920023082e-07, + "loss": 0.4225, + "step": 594 + }, + { + "epoch": 0.10778985507246377, + "grad_norm": 9.08703773612549, + "learning_rate": 9.842419275225393e-07, + "loss": 0.3561, + "step": 595 + }, + { + "epoch": 0.10797101449275362, + "grad_norm": 7.237535536958595, + "learning_rate": 9.841687682952065e-07, + "loss": 0.3491, + "step": 596 + }, + { + "epoch": 0.10815217391304348, + "grad_norm": 5.977113474938249, + "learning_rate": 9.840954423662725e-07, + "loss": 0.4388, + "step": 597 + }, + { + "epoch": 0.10833333333333334, + "grad_norm": 8.476019821951454, + "learning_rate": 9.840219497609843e-07, + "loss": 0.4062, + "step": 598 + }, + { + "epoch": 0.10851449275362318, + "grad_norm": 3.530986362196853, + "learning_rate": 9.839482905046454e-07, + "loss": 0.4208, + "step": 599 + }, + { + "epoch": 0.10869565217391304, + "grad_norm": 5.094473008842111, + "learning_rate": 9.83874464622617e-07, + "loss": 0.3959, + "step": 600 + }, + { + "epoch": 0.10869565217391304, + "eval_loss": 0.3958437442779541, + "eval_runtime": 9.7363, + "eval_samples_per_second": 51.354, + "eval_steps_per_second": 0.103, + "step": 600 + }, + { + "epoch": 0.1088768115942029, + "grad_norm": 3.2452463718213727, + "learning_rate": 9.83800472140318e-07, + "loss": 0.3601, + "step": 601 + }, + { + "epoch": 0.10905797101449276, + "grad_norm": 3.0100855619960862, + "learning_rate": 9.837263130832238e-07, + "loss": 0.3766, + "step": 602 + }, + { + "epoch": 0.1092391304347826, + "grad_norm": 2.8678179697818305, + "learning_rate": 9.836519874768683e-07, + "loss": 0.3745, + "step": 603 + }, + { + "epoch": 0.10942028985507246, + "grad_norm": 5.053696469470357, + "learning_rate": 9.835774953468418e-07, + "loss": 0.496, + "step": 604 + }, + { + "epoch": 0.10960144927536232, + "grad_norm": 4.415771420539539, + "learning_rate": 9.835028367187925e-07, + "loss": 0.4315, + "step": 605 + }, + { + "epoch": 0.10978260869565218, + "grad_norm": 3.9164017173426124, + "learning_rate": 9.834280116184256e-07, + "loss": 0.4481, + "step": 606 + }, + { + "epoch": 0.10996376811594202, + "grad_norm": 8.193746453862184, + "learning_rate": 9.833530200715036e-07, + "loss": 0.4083, + "step": 607 + }, + { + "epoch": 0.11014492753623188, + "grad_norm": 9.998514741027849, + "learning_rate": 9.832778621038468e-07, + "loss": 0.4246, + "step": 608 + }, + { + "epoch": 0.11032608695652174, + "grad_norm": 3.60639495102559, + "learning_rate": 9.832025377413323e-07, + "loss": 0.3658, + "step": 609 + }, + { + "epoch": 0.1105072463768116, + "grad_norm": 3.8901053542554647, + "learning_rate": 9.831270470098945e-07, + "loss": 0.4045, + "step": 610 + }, + { + "epoch": 0.11068840579710144, + "grad_norm": 5.487699115996111, + "learning_rate": 9.830513899355256e-07, + "loss": 0.3884, + "step": 611 + }, + { + "epoch": 0.1108695652173913, + "grad_norm": 7.847929804798982, + "learning_rate": 9.82975566544274e-07, + "loss": 0.3587, + "step": 612 + }, + { + "epoch": 0.11105072463768116, + "grad_norm": 4.105159274722128, + "learning_rate": 9.828995768622466e-07, + "loss": 0.3392, + "step": 613 + }, + { + "epoch": 0.11123188405797102, + "grad_norm": 6.811306174094375, + "learning_rate": 9.828234209156068e-07, + "loss": 0.4077, + "step": 614 + }, + { + "epoch": 0.11141304347826086, + "grad_norm": 7.662751539071657, + "learning_rate": 9.827470987305755e-07, + "loss": 0.3582, + "step": 615 + }, + { + "epoch": 0.11159420289855072, + "grad_norm": 3.7154738441060866, + "learning_rate": 9.82670610333431e-07, + "loss": 0.4308, + "step": 616 + }, + { + "epoch": 0.11177536231884058, + "grad_norm": 3.5686291897697036, + "learning_rate": 9.825939557505084e-07, + "loss": 0.3893, + "step": 617 + }, + { + "epoch": 0.11195652173913044, + "grad_norm": 6.22065233119125, + "learning_rate": 9.825171350082e-07, + "loss": 0.4276, + "step": 618 + }, + { + "epoch": 0.11213768115942029, + "grad_norm": 10.433056372259445, + "learning_rate": 9.82440148132956e-07, + "loss": 0.4078, + "step": 619 + }, + { + "epoch": 0.11231884057971014, + "grad_norm": 2.9469994737947705, + "learning_rate": 9.82362995151283e-07, + "loss": 0.3905, + "step": 620 + }, + { + "epoch": 0.1125, + "grad_norm": 3.082673000388614, + "learning_rate": 9.822856760897455e-07, + "loss": 0.4229, + "step": 621 + }, + { + "epoch": 0.11268115942028986, + "grad_norm": 14.433871663160714, + "learning_rate": 9.822081909749645e-07, + "loss": 0.3834, + "step": 622 + }, + { + "epoch": 0.1128623188405797, + "grad_norm": 3.179853208322323, + "learning_rate": 9.821305398336185e-07, + "loss": 0.3336, + "step": 623 + }, + { + "epoch": 0.11304347826086956, + "grad_norm": 4.095122390888196, + "learning_rate": 9.820527226924434e-07, + "loss": 0.3553, + "step": 624 + }, + { + "epoch": 0.11322463768115942, + "grad_norm": 3.8474876482605014, + "learning_rate": 9.81974739578232e-07, + "loss": 0.3805, + "step": 625 + }, + { + "epoch": 0.11340579710144928, + "grad_norm": 2.877984972194894, + "learning_rate": 9.818965905178338e-07, + "loss": 0.3715, + "step": 626 + }, + { + "epoch": 0.11358695652173913, + "grad_norm": 3.0809366208593034, + "learning_rate": 9.818182755381564e-07, + "loss": 0.3597, + "step": 627 + }, + { + "epoch": 0.11376811594202899, + "grad_norm": 9.166389852537947, + "learning_rate": 9.817397946661637e-07, + "loss": 0.4059, + "step": 628 + }, + { + "epoch": 0.11394927536231884, + "grad_norm": 7.8759684402183465, + "learning_rate": 9.816611479288771e-07, + "loss": 0.4109, + "step": 629 + }, + { + "epoch": 0.11413043478260869, + "grad_norm": 11.728571521660111, + "learning_rate": 9.815823353533754e-07, + "loss": 0.4954, + "step": 630 + }, + { + "epoch": 0.11431159420289855, + "grad_norm": 3.206979668453135, + "learning_rate": 9.815033569667936e-07, + "loss": 0.3672, + "step": 631 + }, + { + "epoch": 0.1144927536231884, + "grad_norm": 7.017852524469355, + "learning_rate": 9.814242127963249e-07, + "loss": 0.4023, + "step": 632 + }, + { + "epoch": 0.11467391304347826, + "grad_norm": 4.072937382431754, + "learning_rate": 9.813449028692183e-07, + "loss": 0.4221, + "step": 633 + }, + { + "epoch": 0.11485507246376811, + "grad_norm": 6.597252865370013, + "learning_rate": 9.812654272127811e-07, + "loss": 0.3888, + "step": 634 + }, + { + "epoch": 0.11503623188405797, + "grad_norm": 10.301603098662682, + "learning_rate": 9.811857858543774e-07, + "loss": 0.4195, + "step": 635 + }, + { + "epoch": 0.11521739130434783, + "grad_norm": 4.660264186912435, + "learning_rate": 9.811059788214272e-07, + "loss": 0.4183, + "step": 636 + }, + { + "epoch": 0.11539855072463769, + "grad_norm": 5.642509998661616, + "learning_rate": 9.81026006141409e-07, + "loss": 0.4374, + "step": 637 + }, + { + "epoch": 0.11557971014492753, + "grad_norm": 6.220978507427946, + "learning_rate": 9.80945867841858e-07, + "loss": 0.3696, + "step": 638 + }, + { + "epoch": 0.11576086956521739, + "grad_norm": 6.348755432588962, + "learning_rate": 9.80865563950366e-07, + "loss": 0.3975, + "step": 639 + }, + { + "epoch": 0.11594202898550725, + "grad_norm": 2.584135286875402, + "learning_rate": 9.807850944945816e-07, + "loss": 0.3565, + "step": 640 + }, + { + "epoch": 0.1161231884057971, + "grad_norm": 11.783637917498103, + "learning_rate": 9.807044595022115e-07, + "loss": 0.351, + "step": 641 + }, + { + "epoch": 0.11630434782608695, + "grad_norm": 8.196457156054365, + "learning_rate": 9.806236590010183e-07, + "loss": 0.4321, + "step": 642 + }, + { + "epoch": 0.11648550724637681, + "grad_norm": 4.471093443157233, + "learning_rate": 9.805426930188219e-07, + "loss": 0.3306, + "step": 643 + }, + { + "epoch": 0.11666666666666667, + "grad_norm": 3.377125441232469, + "learning_rate": 9.804615615834994e-07, + "loss": 0.3226, + "step": 644 + }, + { + "epoch": 0.11684782608695653, + "grad_norm": 13.3291823205937, + "learning_rate": 9.803802647229851e-07, + "loss": 0.4011, + "step": 645 + }, + { + "epoch": 0.11702898550724637, + "grad_norm": 8.118372845401215, + "learning_rate": 9.802988024652691e-07, + "loss": 0.3384, + "step": 646 + }, + { + "epoch": 0.11721014492753623, + "grad_norm": 3.806946572896715, + "learning_rate": 9.802171748384e-07, + "loss": 0.3943, + "step": 647 + }, + { + "epoch": 0.11739130434782609, + "grad_norm": 3.446292255646783, + "learning_rate": 9.801353818704825e-07, + "loss": 0.4333, + "step": 648 + }, + { + "epoch": 0.11757246376811595, + "grad_norm": 10.789095146819566, + "learning_rate": 9.800534235896777e-07, + "loss": 0.3832, + "step": 649 + }, + { + "epoch": 0.11775362318840579, + "grad_norm": 5.649727144288448, + "learning_rate": 9.799713000242048e-07, + "loss": 0.3973, + "step": 650 + }, + { + "epoch": 0.11793478260869565, + "grad_norm": 4.459102989692808, + "learning_rate": 9.79889011202339e-07, + "loss": 0.353, + "step": 651 + }, + { + "epoch": 0.11811594202898551, + "grad_norm": 4.759071213763818, + "learning_rate": 9.79806557152413e-07, + "loss": 0.3931, + "step": 652 + }, + { + "epoch": 0.11829710144927537, + "grad_norm": 6.921810591021103, + "learning_rate": 9.797239379028162e-07, + "loss": 0.3712, + "step": 653 + }, + { + "epoch": 0.11847826086956521, + "grad_norm": 3.440311046248852, + "learning_rate": 9.796411534819944e-07, + "loss": 0.3821, + "step": 654 + }, + { + "epoch": 0.11865942028985507, + "grad_norm": 13.185666667653342, + "learning_rate": 9.795582039184508e-07, + "loss": 0.3495, + "step": 655 + }, + { + "epoch": 0.11884057971014493, + "grad_norm": 11.383458188725191, + "learning_rate": 9.794750892407455e-07, + "loss": 0.4305, + "step": 656 + }, + { + "epoch": 0.11902173913043479, + "grad_norm": 3.580592974471932, + "learning_rate": 9.79391809477495e-07, + "loss": 0.3491, + "step": 657 + }, + { + "epoch": 0.11920289855072463, + "grad_norm": 9.90590132292987, + "learning_rate": 9.79308364657373e-07, + "loss": 0.4762, + "step": 658 + }, + { + "epoch": 0.11938405797101449, + "grad_norm": 14.952403412799473, + "learning_rate": 9.792247548091105e-07, + "loss": 0.4676, + "step": 659 + }, + { + "epoch": 0.11956521739130435, + "grad_norm": 3.556782685122642, + "learning_rate": 9.79140979961494e-07, + "loss": 0.3681, + "step": 660 + }, + { + "epoch": 0.1197463768115942, + "grad_norm": 3.356889662500131, + "learning_rate": 9.79057040143368e-07, + "loss": 0.4087, + "step": 661 + }, + { + "epoch": 0.11992753623188405, + "grad_norm": 11.935608592160225, + "learning_rate": 9.789729353836333e-07, + "loss": 0.4225, + "step": 662 + }, + { + "epoch": 0.12010869565217391, + "grad_norm": 13.74587458686005, + "learning_rate": 9.788886657112473e-07, + "loss": 0.3547, + "step": 663 + }, + { + "epoch": 0.12028985507246377, + "grad_norm": 12.814818226606935, + "learning_rate": 9.78804231155225e-07, + "loss": 0.428, + "step": 664 + }, + { + "epoch": 0.12047101449275362, + "grad_norm": 16.426466256408123, + "learning_rate": 9.787196317446368e-07, + "loss": 0.3838, + "step": 665 + }, + { + "epoch": 0.12065217391304348, + "grad_norm": 6.089270057942982, + "learning_rate": 9.786348675086115e-07, + "loss": 0.4162, + "step": 666 + }, + { + "epoch": 0.12083333333333333, + "grad_norm": 6.513792221842381, + "learning_rate": 9.785499384763336e-07, + "loss": 0.3676, + "step": 667 + }, + { + "epoch": 0.12101449275362319, + "grad_norm": 5.519976041785401, + "learning_rate": 9.784648446770442e-07, + "loss": 0.3644, + "step": 668 + }, + { + "epoch": 0.12119565217391304, + "grad_norm": 9.847255679651372, + "learning_rate": 9.78379586140042e-07, + "loss": 0.4197, + "step": 669 + }, + { + "epoch": 0.1213768115942029, + "grad_norm": 9.779224060063939, + "learning_rate": 9.782941628946817e-07, + "loss": 0.3552, + "step": 670 + }, + { + "epoch": 0.12155797101449275, + "grad_norm": 5.233732751639689, + "learning_rate": 9.782085749703747e-07, + "loss": 0.3907, + "step": 671 + }, + { + "epoch": 0.12173913043478261, + "grad_norm": 3.2881841013666295, + "learning_rate": 9.781228223965897e-07, + "loss": 0.3704, + "step": 672 + }, + { + "epoch": 0.12192028985507246, + "grad_norm": 3.1476049070256273, + "learning_rate": 9.780369052028514e-07, + "loss": 0.3734, + "step": 673 + }, + { + "epoch": 0.12210144927536232, + "grad_norm": 4.262774040056997, + "learning_rate": 9.779508234187418e-07, + "loss": 0.4518, + "step": 674 + }, + { + "epoch": 0.12228260869565218, + "grad_norm": 8.404520746520697, + "learning_rate": 9.778645770738989e-07, + "loss": 0.3849, + "step": 675 + }, + { + "epoch": 0.12246376811594203, + "grad_norm": 3.435724116865664, + "learning_rate": 9.777781661980183e-07, + "loss": 0.3946, + "step": 676 + }, + { + "epoch": 0.12264492753623188, + "grad_norm": 3.3178740566243046, + "learning_rate": 9.77691590820851e-07, + "loss": 0.4237, + "step": 677 + }, + { + "epoch": 0.12282608695652174, + "grad_norm": 7.137062377583599, + "learning_rate": 9.776048509722058e-07, + "loss": 0.4645, + "step": 678 + }, + { + "epoch": 0.1230072463768116, + "grad_norm": 3.3447094611975925, + "learning_rate": 9.775179466819473e-07, + "loss": 0.3505, + "step": 679 + }, + { + "epoch": 0.12318840579710146, + "grad_norm": 9.45090635641566, + "learning_rate": 9.774308779799973e-07, + "loss": 0.3764, + "step": 680 + }, + { + "epoch": 0.1233695652173913, + "grad_norm": 8.990675798337948, + "learning_rate": 9.77343644896334e-07, + "loss": 0.3374, + "step": 681 + }, + { + "epoch": 0.12355072463768116, + "grad_norm": 6.0427782176287135, + "learning_rate": 9.77256247460992e-07, + "loss": 0.4135, + "step": 682 + }, + { + "epoch": 0.12373188405797102, + "grad_norm": 4.548460165953483, + "learning_rate": 9.771686857040628e-07, + "loss": 0.3986, + "step": 683 + }, + { + "epoch": 0.12391304347826088, + "grad_norm": 4.814153264451756, + "learning_rate": 9.770809596556941e-07, + "loss": 0.463, + "step": 684 + }, + { + "epoch": 0.12409420289855072, + "grad_norm": 4.316694317097807, + "learning_rate": 9.769930693460905e-07, + "loss": 0.3973, + "step": 685 + }, + { + "epoch": 0.12427536231884058, + "grad_norm": 7.435108970514274, + "learning_rate": 9.769050148055132e-07, + "loss": 0.3527, + "step": 686 + }, + { + "epoch": 0.12445652173913044, + "grad_norm": 6.476662969387697, + "learning_rate": 9.768167960642797e-07, + "loss": 0.3514, + "step": 687 + }, + { + "epoch": 0.1246376811594203, + "grad_norm": 2.9049206699213035, + "learning_rate": 9.76728413152764e-07, + "loss": 0.4276, + "step": 688 + }, + { + "epoch": 0.12481884057971014, + "grad_norm": 4.764847311211572, + "learning_rate": 9.766398661013971e-07, + "loss": 0.4558, + "step": 689 + }, + { + "epoch": 0.125, + "grad_norm": 6.407684827808415, + "learning_rate": 9.765511549406656e-07, + "loss": 0.3802, + "step": 690 + }, + { + "epoch": 0.12518115942028984, + "grad_norm": 3.7028837829026564, + "learning_rate": 9.764622797011137e-07, + "loss": 0.4103, + "step": 691 + }, + { + "epoch": 0.12536231884057972, + "grad_norm": 7.71378838626683, + "learning_rate": 9.763732404133413e-07, + "loss": 0.363, + "step": 692 + }, + { + "epoch": 0.12554347826086956, + "grad_norm": 9.643257822231876, + "learning_rate": 9.762840371080053e-07, + "loss": 0.4159, + "step": 693 + }, + { + "epoch": 0.1257246376811594, + "grad_norm": 4.696104032789181, + "learning_rate": 9.761946698158184e-07, + "loss": 0.3883, + "step": 694 + }, + { + "epoch": 0.12590579710144928, + "grad_norm": 3.2731710565805168, + "learning_rate": 9.761051385675505e-07, + "loss": 0.4235, + "step": 695 + }, + { + "epoch": 0.12608695652173912, + "grad_norm": 2.7622798539691713, + "learning_rate": 9.760154433940277e-07, + "loss": 0.3998, + "step": 696 + }, + { + "epoch": 0.126268115942029, + "grad_norm": 7.475118099531793, + "learning_rate": 9.759255843261321e-07, + "loss": 0.3657, + "step": 697 + }, + { + "epoch": 0.12644927536231884, + "grad_norm": 3.0033639726547863, + "learning_rate": 9.75835561394803e-07, + "loss": 0.3854, + "step": 698 + }, + { + "epoch": 0.1266304347826087, + "grad_norm": 3.8583730298714145, + "learning_rate": 9.757453746310356e-07, + "loss": 0.4194, + "step": 699 + }, + { + "epoch": 0.12681159420289856, + "grad_norm": 5.064421534590891, + "learning_rate": 9.756550240658813e-07, + "loss": 0.386, + "step": 700 + }, + { + "epoch": 0.12681159420289856, + "eval_loss": 0.3960312604904175, + "eval_runtime": 9.762, + "eval_samples_per_second": 51.219, + "eval_steps_per_second": 0.102, + "step": 700 + }, + { + "epoch": 0.1269927536231884, + "grad_norm": 6.6325477871274945, + "learning_rate": 9.755645097304487e-07, + "loss": 0.3879, + "step": 701 + }, + { + "epoch": 0.12717391304347825, + "grad_norm": 13.7923980110677, + "learning_rate": 9.754738316559021e-07, + "loss": 0.4185, + "step": 702 + }, + { + "epoch": 0.12735507246376812, + "grad_norm": 3.700581849731215, + "learning_rate": 9.753829898734625e-07, + "loss": 0.3622, + "step": 703 + }, + { + "epoch": 0.12753623188405797, + "grad_norm": 3.0346633543449992, + "learning_rate": 9.752919844144072e-07, + "loss": 0.3396, + "step": 704 + }, + { + "epoch": 0.12771739130434784, + "grad_norm": 6.3041659774770595, + "learning_rate": 9.752008153100694e-07, + "loss": 0.4014, + "step": 705 + }, + { + "epoch": 0.12789855072463768, + "grad_norm": 6.063651744567198, + "learning_rate": 9.751094825918396e-07, + "loss": 0.4123, + "step": 706 + }, + { + "epoch": 0.12807971014492753, + "grad_norm": 9.294585763947483, + "learning_rate": 9.750179862911636e-07, + "loss": 0.3395, + "step": 707 + }, + { + "epoch": 0.1282608695652174, + "grad_norm": 3.7473147595140923, + "learning_rate": 9.749263264395442e-07, + "loss": 0.3564, + "step": 708 + }, + { + "epoch": 0.12844202898550725, + "grad_norm": 6.465478677553752, + "learning_rate": 9.748345030685405e-07, + "loss": 0.2896, + "step": 709 + }, + { + "epoch": 0.1286231884057971, + "grad_norm": 4.18430460390063, + "learning_rate": 9.747425162097675e-07, + "loss": 0.3494, + "step": 710 + }, + { + "epoch": 0.12880434782608696, + "grad_norm": 8.4926540296536, + "learning_rate": 9.74650365894897e-07, + "loss": 0.4205, + "step": 711 + }, + { + "epoch": 0.1289855072463768, + "grad_norm": 4.976275165228336, + "learning_rate": 9.745580521556565e-07, + "loss": 0.362, + "step": 712 + }, + { + "epoch": 0.12916666666666668, + "grad_norm": 3.5694099166930733, + "learning_rate": 9.7446557502383e-07, + "loss": 0.4035, + "step": 713 + }, + { + "epoch": 0.12934782608695652, + "grad_norm": 8.431100018244468, + "learning_rate": 9.74372934531258e-07, + "loss": 0.4276, + "step": 714 + }, + { + "epoch": 0.12952898550724637, + "grad_norm": 5.324249127684661, + "learning_rate": 9.74280130709837e-07, + "loss": 0.369, + "step": 715 + }, + { + "epoch": 0.12971014492753624, + "grad_norm": 6.61440037267252, + "learning_rate": 9.741871635915198e-07, + "loss": 0.4131, + "step": 716 + }, + { + "epoch": 0.1298913043478261, + "grad_norm": 8.301797080913259, + "learning_rate": 9.740940332083157e-07, + "loss": 0.4225, + "step": 717 + }, + { + "epoch": 0.13007246376811593, + "grad_norm": 8.86277659785471, + "learning_rate": 9.740007395922894e-07, + "loss": 0.3135, + "step": 718 + }, + { + "epoch": 0.1302536231884058, + "grad_norm": 4.698305190045604, + "learning_rate": 9.739072827755625e-07, + "loss": 0.4027, + "step": 719 + }, + { + "epoch": 0.13043478260869565, + "grad_norm": 2.789284161666489, + "learning_rate": 9.738136627903128e-07, + "loss": 0.3113, + "step": 720 + }, + { + "epoch": 0.13061594202898552, + "grad_norm": 3.926952487330974, + "learning_rate": 9.737198796687741e-07, + "loss": 0.354, + "step": 721 + }, + { + "epoch": 0.13079710144927537, + "grad_norm": 7.832881188785057, + "learning_rate": 9.736259334432365e-07, + "loss": 0.3597, + "step": 722 + }, + { + "epoch": 0.1309782608695652, + "grad_norm": 3.475532568110265, + "learning_rate": 9.735318241460455e-07, + "loss": 0.3256, + "step": 723 + }, + { + "epoch": 0.13115942028985508, + "grad_norm": 6.701946998152024, + "learning_rate": 9.73437551809604e-07, + "loss": 0.3619, + "step": 724 + }, + { + "epoch": 0.13134057971014493, + "grad_norm": 4.738607910166096, + "learning_rate": 9.733431164663704e-07, + "loss": 0.4333, + "step": 725 + }, + { + "epoch": 0.13152173913043477, + "grad_norm": 3.7634470648572504, + "learning_rate": 9.732485181488587e-07, + "loss": 0.421, + "step": 726 + }, + { + "epoch": 0.13170289855072465, + "grad_norm": 4.2927344771472855, + "learning_rate": 9.731537568896402e-07, + "loss": 0.3489, + "step": 727 + }, + { + "epoch": 0.1318840579710145, + "grad_norm": 13.340906086846939, + "learning_rate": 9.730588327213413e-07, + "loss": 0.3699, + "step": 728 + }, + { + "epoch": 0.13206521739130433, + "grad_norm": 4.351896014543642, + "learning_rate": 9.729637456766448e-07, + "loss": 0.3864, + "step": 729 + }, + { + "epoch": 0.1322463768115942, + "grad_norm": 5.070168107936124, + "learning_rate": 9.728684957882897e-07, + "loss": 0.3193, + "step": 730 + }, + { + "epoch": 0.13242753623188405, + "grad_norm": 9.97750769002732, + "learning_rate": 9.727730830890711e-07, + "loss": 0.3527, + "step": 731 + }, + { + "epoch": 0.13260869565217392, + "grad_norm": 3.5442634744989308, + "learning_rate": 9.7267750761184e-07, + "loss": 0.3431, + "step": 732 + }, + { + "epoch": 0.13278985507246377, + "grad_norm": 3.1279346551227056, + "learning_rate": 9.725817693895033e-07, + "loss": 0.3572, + "step": 733 + }, + { + "epoch": 0.13297101449275361, + "grad_norm": 5.618630634188929, + "learning_rate": 9.724858684550242e-07, + "loss": 0.4297, + "step": 734 + }, + { + "epoch": 0.1331521739130435, + "grad_norm": 3.2992168776312347, + "learning_rate": 9.72389804841422e-07, + "loss": 0.3508, + "step": 735 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 3.4024123655505205, + "learning_rate": 9.722935785817719e-07, + "loss": 0.3826, + "step": 736 + }, + { + "epoch": 0.13351449275362318, + "grad_norm": 3.3014334098875877, + "learning_rate": 9.721971897092047e-07, + "loss": 0.3617, + "step": 737 + }, + { + "epoch": 0.13369565217391305, + "grad_norm": 3.6500039654226373, + "learning_rate": 9.721006382569078e-07, + "loss": 0.3816, + "step": 738 + }, + { + "epoch": 0.1338768115942029, + "grad_norm": 14.450200674257177, + "learning_rate": 9.720039242581244e-07, + "loss": 0.3958, + "step": 739 + }, + { + "epoch": 0.13405797101449277, + "grad_norm": 6.404906931479888, + "learning_rate": 9.719070477461534e-07, + "loss": 0.3394, + "step": 740 + }, + { + "epoch": 0.1342391304347826, + "grad_norm": 5.9014248150966155, + "learning_rate": 9.7181000875435e-07, + "loss": 0.3561, + "step": 741 + }, + { + "epoch": 0.13442028985507246, + "grad_norm": 5.323203958593394, + "learning_rate": 9.717128073161251e-07, + "loss": 0.3605, + "step": 742 + }, + { + "epoch": 0.13460144927536233, + "grad_norm": 5.167294648938972, + "learning_rate": 9.716154434649454e-07, + "loss": 0.4105, + "step": 743 + }, + { + "epoch": 0.13478260869565217, + "grad_norm": 8.858319730185437, + "learning_rate": 9.715179172343342e-07, + "loss": 0.3992, + "step": 744 + }, + { + "epoch": 0.13496376811594202, + "grad_norm": 4.038283642494259, + "learning_rate": 9.7142022865787e-07, + "loss": 0.4045, + "step": 745 + }, + { + "epoch": 0.1351449275362319, + "grad_norm": 6.217163933503129, + "learning_rate": 9.713223777691873e-07, + "loss": 0.345, + "step": 746 + }, + { + "epoch": 0.13532608695652174, + "grad_norm": 6.6243645812344765, + "learning_rate": 9.712243646019768e-07, + "loss": 0.3281, + "step": 747 + }, + { + "epoch": 0.1355072463768116, + "grad_norm": 3.6858113189654027, + "learning_rate": 9.71126189189985e-07, + "loss": 0.3591, + "step": 748 + }, + { + "epoch": 0.13568840579710145, + "grad_norm": 5.855006578912425, + "learning_rate": 9.710278515670138e-07, + "loss": 0.363, + "step": 749 + }, + { + "epoch": 0.1358695652173913, + "grad_norm": 4.718939998893757, + "learning_rate": 9.709293517669216e-07, + "loss": 0.4333, + "step": 750 + }, + { + "epoch": 0.13605072463768117, + "grad_norm": 3.990509621420453, + "learning_rate": 9.708306898236224e-07, + "loss": 0.3584, + "step": 751 + }, + { + "epoch": 0.13623188405797101, + "grad_norm": 6.355724151961903, + "learning_rate": 9.707318657710856e-07, + "loss": 0.3691, + "step": 752 + }, + { + "epoch": 0.13641304347826086, + "grad_norm": 5.630874686294518, + "learning_rate": 9.706328796433372e-07, + "loss": 0.3609, + "step": 753 + }, + { + "epoch": 0.13659420289855073, + "grad_norm": 15.440653376058837, + "learning_rate": 9.705337314744584e-07, + "loss": 0.3926, + "step": 754 + }, + { + "epoch": 0.13677536231884058, + "grad_norm": 8.163687939741473, + "learning_rate": 9.704344212985864e-07, + "loss": 0.3619, + "step": 755 + }, + { + "epoch": 0.13695652173913042, + "grad_norm": 8.200076470458544, + "learning_rate": 9.703349491499141e-07, + "loss": 0.3517, + "step": 756 + }, + { + "epoch": 0.1371376811594203, + "grad_norm": 9.886028220333985, + "learning_rate": 9.702353150626905e-07, + "loss": 0.3894, + "step": 757 + }, + { + "epoch": 0.13731884057971014, + "grad_norm": 11.179868795711787, + "learning_rate": 9.701355190712198e-07, + "loss": 0.3624, + "step": 758 + }, + { + "epoch": 0.1375, + "grad_norm": 3.7661974639531945, + "learning_rate": 9.700355612098625e-07, + "loss": 0.3106, + "step": 759 + }, + { + "epoch": 0.13768115942028986, + "grad_norm": 10.171930780588786, + "learning_rate": 9.699354415130342e-07, + "loss": 0.3062, + "step": 760 + }, + { + "epoch": 0.1378623188405797, + "grad_norm": 5.922694462163911, + "learning_rate": 9.698351600152069e-07, + "loss": 0.3886, + "step": 761 + }, + { + "epoch": 0.13804347826086957, + "grad_norm": 11.546057025043572, + "learning_rate": 9.69734716750908e-07, + "loss": 0.451, + "step": 762 + }, + { + "epoch": 0.13822463768115942, + "grad_norm": 8.915262455626223, + "learning_rate": 9.696341117547203e-07, + "loss": 0.4023, + "step": 763 + }, + { + "epoch": 0.13840579710144926, + "grad_norm": 4.161229591208603, + "learning_rate": 9.695333450612826e-07, + "loss": 0.3854, + "step": 764 + }, + { + "epoch": 0.13858695652173914, + "grad_norm": 11.600860504527866, + "learning_rate": 9.694324167052897e-07, + "loss": 0.3711, + "step": 765 + }, + { + "epoch": 0.13876811594202898, + "grad_norm": 4.0145484078765765, + "learning_rate": 9.693313267214916e-07, + "loss": 0.3898, + "step": 766 + }, + { + "epoch": 0.13894927536231885, + "grad_norm": 5.804797337669499, + "learning_rate": 9.692300751446939e-07, + "loss": 0.3209, + "step": 767 + }, + { + "epoch": 0.1391304347826087, + "grad_norm": 4.193754548751051, + "learning_rate": 9.691286620097578e-07, + "loss": 0.3179, + "step": 768 + }, + { + "epoch": 0.13931159420289854, + "grad_norm": 3.475967173310117, + "learning_rate": 9.69027087351601e-07, + "loss": 0.3642, + "step": 769 + }, + { + "epoch": 0.13949275362318841, + "grad_norm": 7.257326955939186, + "learning_rate": 9.689253512051953e-07, + "loss": 0.3701, + "step": 770 + }, + { + "epoch": 0.13967391304347826, + "grad_norm": 5.009573319900192, + "learning_rate": 9.688234536055697e-07, + "loss": 0.4319, + "step": 771 + }, + { + "epoch": 0.1398550724637681, + "grad_norm": 4.11774493284656, + "learning_rate": 9.687213945878077e-07, + "loss": 0.3671, + "step": 772 + }, + { + "epoch": 0.14003623188405798, + "grad_norm": 4.00577694651643, + "learning_rate": 9.686191741870484e-07, + "loss": 0.4042, + "step": 773 + }, + { + "epoch": 0.14021739130434782, + "grad_norm": 4.933949750613941, + "learning_rate": 9.685167924384874e-07, + "loss": 0.3898, + "step": 774 + }, + { + "epoch": 0.1403985507246377, + "grad_norm": 6.733188882480495, + "learning_rate": 9.684142493773746e-07, + "loss": 0.3627, + "step": 775 + }, + { + "epoch": 0.14057971014492754, + "grad_norm": 6.449572526510951, + "learning_rate": 9.683115450390166e-07, + "loss": 0.406, + "step": 776 + }, + { + "epoch": 0.14076086956521738, + "grad_norm": 5.138868832834836, + "learning_rate": 9.682086794587746e-07, + "loss": 0.4077, + "step": 777 + }, + { + "epoch": 0.14094202898550726, + "grad_norm": 5.662777115952582, + "learning_rate": 9.681056526720659e-07, + "loss": 0.3893, + "step": 778 + }, + { + "epoch": 0.1411231884057971, + "grad_norm": 4.011876697773631, + "learning_rate": 9.68002464714363e-07, + "loss": 0.3967, + "step": 779 + }, + { + "epoch": 0.14130434782608695, + "grad_norm": 5.1476725760419635, + "learning_rate": 9.67899115621194e-07, + "loss": 0.3622, + "step": 780 + }, + { + "epoch": 0.14148550724637682, + "grad_norm": 6.817405599928272, + "learning_rate": 9.677956054281427e-07, + "loss": 0.3489, + "step": 781 + }, + { + "epoch": 0.14166666666666666, + "grad_norm": 7.3995868163142395, + "learning_rate": 9.676919341708478e-07, + "loss": 0.4357, + "step": 782 + }, + { + "epoch": 0.14184782608695654, + "grad_norm": 6.939671348813399, + "learning_rate": 9.675881018850042e-07, + "loss": 0.3524, + "step": 783 + }, + { + "epoch": 0.14202898550724638, + "grad_norm": 12.293898606053197, + "learning_rate": 9.674841086063615e-07, + "loss": 0.3965, + "step": 784 + }, + { + "epoch": 0.14221014492753623, + "grad_norm": 10.188664348028176, + "learning_rate": 9.673799543707253e-07, + "loss": 0.3198, + "step": 785 + }, + { + "epoch": 0.1423913043478261, + "grad_norm": 16.162971647431426, + "learning_rate": 9.672756392139562e-07, + "loss": 0.4337, + "step": 786 + }, + { + "epoch": 0.14257246376811594, + "grad_norm": 3.8420235768561644, + "learning_rate": 9.671711631719705e-07, + "loss": 0.3297, + "step": 787 + }, + { + "epoch": 0.1427536231884058, + "grad_norm": 6.522246431494951, + "learning_rate": 9.670665262807401e-07, + "loss": 0.3169, + "step": 788 + }, + { + "epoch": 0.14293478260869566, + "grad_norm": 6.425771496097432, + "learning_rate": 9.669617285762915e-07, + "loss": 0.295, + "step": 789 + }, + { + "epoch": 0.1431159420289855, + "grad_norm": 3.7260951294235203, + "learning_rate": 9.668567700947073e-07, + "loss": 0.4178, + "step": 790 + }, + { + "epoch": 0.14329710144927535, + "grad_norm": 5.104163619378291, + "learning_rate": 9.66751650872125e-07, + "loss": 0.4174, + "step": 791 + }, + { + "epoch": 0.14347826086956522, + "grad_norm": 7.84633879459535, + "learning_rate": 9.666463709447378e-07, + "loss": 0.3659, + "step": 792 + }, + { + "epoch": 0.14365942028985507, + "grad_norm": 7.278148656240213, + "learning_rate": 9.665409303487942e-07, + "loss": 0.4153, + "step": 793 + }, + { + "epoch": 0.14384057971014494, + "grad_norm": 6.103875019351923, + "learning_rate": 9.664353291205977e-07, + "loss": 0.35, + "step": 794 + }, + { + "epoch": 0.14402173913043478, + "grad_norm": 3.403446085823657, + "learning_rate": 9.663295672965072e-07, + "loss": 0.3642, + "step": 795 + }, + { + "epoch": 0.14420289855072463, + "grad_norm": 3.5099159116141974, + "learning_rate": 9.662236449129376e-07, + "loss": 0.34, + "step": 796 + }, + { + "epoch": 0.1443840579710145, + "grad_norm": 3.9585793014698125, + "learning_rate": 9.661175620063577e-07, + "loss": 0.4159, + "step": 797 + }, + { + "epoch": 0.14456521739130435, + "grad_norm": 3.279449429804172, + "learning_rate": 9.660113186132929e-07, + "loss": 0.3469, + "step": 798 + }, + { + "epoch": 0.1447463768115942, + "grad_norm": 10.946868427677378, + "learning_rate": 9.659049147703229e-07, + "loss": 0.3452, + "step": 799 + }, + { + "epoch": 0.14492753623188406, + "grad_norm": 5.565521881030789, + "learning_rate": 9.657983505140832e-07, + "loss": 0.3445, + "step": 800 + }, + { + "epoch": 0.14492753623188406, + "eval_loss": 0.38487499952316284, + "eval_runtime": 9.836, + "eval_samples_per_second": 50.834, + "eval_steps_per_second": 0.102, + "step": 800 + }, + { + "epoch": 0.1451086956521739, + "grad_norm": 4.548090245672361, + "learning_rate": 9.656916258812644e-07, + "loss": 0.3264, + "step": 801 + }, + { + "epoch": 0.14528985507246378, + "grad_norm": 3.60462194109557, + "learning_rate": 9.655847409086127e-07, + "loss": 0.3765, + "step": 802 + }, + { + "epoch": 0.14547101449275363, + "grad_norm": 6.9962945480349745, + "learning_rate": 9.654776956329282e-07, + "loss": 0.417, + "step": 803 + }, + { + "epoch": 0.14565217391304347, + "grad_norm": 4.42724840563812, + "learning_rate": 9.65370490091068e-07, + "loss": 0.3067, + "step": 804 + }, + { + "epoch": 0.14583333333333334, + "grad_norm": 5.433112866664569, + "learning_rate": 9.65263124319943e-07, + "loss": 0.3631, + "step": 805 + }, + { + "epoch": 0.1460144927536232, + "grad_norm": 4.506366773451285, + "learning_rate": 9.651555983565197e-07, + "loss": 0.3542, + "step": 806 + }, + { + "epoch": 0.14619565217391303, + "grad_norm": 3.549703730721839, + "learning_rate": 9.650479122378202e-07, + "loss": 0.3972, + "step": 807 + }, + { + "epoch": 0.1463768115942029, + "grad_norm": 3.9561570455500323, + "learning_rate": 9.649400660009209e-07, + "loss": 0.4058, + "step": 808 + }, + { + "epoch": 0.14655797101449275, + "grad_norm": 13.666271787920849, + "learning_rate": 9.648320596829538e-07, + "loss": 0.43, + "step": 809 + }, + { + "epoch": 0.14673913043478262, + "grad_norm": 7.1884870677029244, + "learning_rate": 9.647238933211064e-07, + "loss": 0.4365, + "step": 810 + }, + { + "epoch": 0.14692028985507247, + "grad_norm": 9.467113610615664, + "learning_rate": 9.646155669526204e-07, + "loss": 0.3666, + "step": 811 + }, + { + "epoch": 0.1471014492753623, + "grad_norm": 4.560384820550004, + "learning_rate": 9.645070806147936e-07, + "loss": 0.4078, + "step": 812 + }, + { + "epoch": 0.14728260869565218, + "grad_norm": 7.542947790041231, + "learning_rate": 9.643984343449777e-07, + "loss": 0.3817, + "step": 813 + }, + { + "epoch": 0.14746376811594203, + "grad_norm": 4.889270994146063, + "learning_rate": 9.642896281805805e-07, + "loss": 0.412, + "step": 814 + }, + { + "epoch": 0.14764492753623187, + "grad_norm": 5.810871439198549, + "learning_rate": 9.641806621590647e-07, + "loss": 0.3763, + "step": 815 + }, + { + "epoch": 0.14782608695652175, + "grad_norm": 3.2506554566136145, + "learning_rate": 9.640715363179477e-07, + "loss": 0.3561, + "step": 816 + }, + { + "epoch": 0.1480072463768116, + "grad_norm": 5.478664739910058, + "learning_rate": 9.639622506948017e-07, + "loss": 0.3551, + "step": 817 + }, + { + "epoch": 0.14818840579710144, + "grad_norm": 5.054596974684607, + "learning_rate": 9.638528053272544e-07, + "loss": 0.4258, + "step": 818 + }, + { + "epoch": 0.1483695652173913, + "grad_norm": 6.284451731268105, + "learning_rate": 9.637432002529886e-07, + "loss": 0.3923, + "step": 819 + }, + { + "epoch": 0.14855072463768115, + "grad_norm": 5.097815918738833, + "learning_rate": 9.636334355097417e-07, + "loss": 0.372, + "step": 820 + }, + { + "epoch": 0.14873188405797103, + "grad_norm": 8.42513399020794, + "learning_rate": 9.635235111353061e-07, + "loss": 0.3941, + "step": 821 + }, + { + "epoch": 0.14891304347826087, + "grad_norm": 5.125452534191936, + "learning_rate": 9.634134271675294e-07, + "loss": 0.3848, + "step": 822 + }, + { + "epoch": 0.14909420289855072, + "grad_norm": 4.9518559143653205, + "learning_rate": 9.633031836443142e-07, + "loss": 0.3407, + "step": 823 + }, + { + "epoch": 0.1492753623188406, + "grad_norm": 7.988320363279841, + "learning_rate": 9.631927806036175e-07, + "loss": 0.4225, + "step": 824 + }, + { + "epoch": 0.14945652173913043, + "grad_norm": 10.514387579134466, + "learning_rate": 9.630822180834518e-07, + "loss": 0.42, + "step": 825 + }, + { + "epoch": 0.14963768115942028, + "grad_norm": 4.008870447693685, + "learning_rate": 9.629714961218845e-07, + "loss": 0.3538, + "step": 826 + }, + { + "epoch": 0.14981884057971015, + "grad_norm": 3.2600794804770348, + "learning_rate": 9.628606147570374e-07, + "loss": 0.3674, + "step": 827 + }, + { + "epoch": 0.15, + "grad_norm": 4.660418037004727, + "learning_rate": 9.627495740270874e-07, + "loss": 0.3875, + "step": 828 + }, + { + "epoch": 0.15018115942028987, + "grad_norm": 14.658238136033177, + "learning_rate": 9.626383739702668e-07, + "loss": 0.4232, + "step": 829 + }, + { + "epoch": 0.1503623188405797, + "grad_norm": 3.427152235192319, + "learning_rate": 9.625270146248616e-07, + "loss": 0.3716, + "step": 830 + }, + { + "epoch": 0.15054347826086956, + "grad_norm": 7.104916016105561, + "learning_rate": 9.624154960292141e-07, + "loss": 0.3761, + "step": 831 + }, + { + "epoch": 0.15072463768115943, + "grad_norm": 5.268346352114768, + "learning_rate": 9.623038182217202e-07, + "loss": 0.3954, + "step": 832 + }, + { + "epoch": 0.15090579710144927, + "grad_norm": 3.909921849942659, + "learning_rate": 9.621919812408313e-07, + "loss": 0.3466, + "step": 833 + }, + { + "epoch": 0.15108695652173912, + "grad_norm": 5.313064486280673, + "learning_rate": 9.620799851250534e-07, + "loss": 0.4255, + "step": 834 + }, + { + "epoch": 0.151268115942029, + "grad_norm": 5.658689697252792, + "learning_rate": 9.61967829912947e-07, + "loss": 0.411, + "step": 835 + }, + { + "epoch": 0.15144927536231884, + "grad_norm": 7.133698128780928, + "learning_rate": 9.618555156431283e-07, + "loss": 0.4258, + "step": 836 + }, + { + "epoch": 0.1516304347826087, + "grad_norm": 10.710861565044695, + "learning_rate": 9.61743042354267e-07, + "loss": 0.3623, + "step": 837 + }, + { + "epoch": 0.15181159420289855, + "grad_norm": 5.549993138938895, + "learning_rate": 9.616304100850883e-07, + "loss": 0.3879, + "step": 838 + }, + { + "epoch": 0.1519927536231884, + "grad_norm": 2.988074991973997, + "learning_rate": 9.615176188743724e-07, + "loss": 0.3518, + "step": 839 + }, + { + "epoch": 0.15217391304347827, + "grad_norm": 4.49008750002056, + "learning_rate": 9.614046687609537e-07, + "loss": 0.3054, + "step": 840 + }, + { + "epoch": 0.15235507246376812, + "grad_norm": 6.63240051746998, + "learning_rate": 9.61291559783721e-07, + "loss": 0.3423, + "step": 841 + }, + { + "epoch": 0.15253623188405796, + "grad_norm": 9.673311143114644, + "learning_rate": 9.611782919816188e-07, + "loss": 0.3448, + "step": 842 + }, + { + "epoch": 0.15271739130434783, + "grad_norm": 3.749240635946091, + "learning_rate": 9.610648653936456e-07, + "loss": 0.3544, + "step": 843 + }, + { + "epoch": 0.15289855072463768, + "grad_norm": 4.295774831111311, + "learning_rate": 9.609512800588547e-07, + "loss": 0.3632, + "step": 844 + }, + { + "epoch": 0.15307971014492755, + "grad_norm": 6.363918337016414, + "learning_rate": 9.608375360163539e-07, + "loss": 0.3277, + "step": 845 + }, + { + "epoch": 0.1532608695652174, + "grad_norm": 8.437863753748266, + "learning_rate": 9.60723633305306e-07, + "loss": 0.3885, + "step": 846 + }, + { + "epoch": 0.15344202898550724, + "grad_norm": 3.670816263383733, + "learning_rate": 9.606095719649283e-07, + "loss": 0.3575, + "step": 847 + }, + { + "epoch": 0.1536231884057971, + "grad_norm": 4.570738702853096, + "learning_rate": 9.604953520344925e-07, + "loss": 0.3544, + "step": 848 + }, + { + "epoch": 0.15380434782608696, + "grad_norm": 13.828584658268664, + "learning_rate": 9.603809735533252e-07, + "loss": 0.4651, + "step": 849 + }, + { + "epoch": 0.1539855072463768, + "grad_norm": 11.632307488888198, + "learning_rate": 9.602664365608073e-07, + "loss": 0.3982, + "step": 850 + }, + { + "epoch": 0.15416666666666667, + "grad_norm": 10.83300354975015, + "learning_rate": 9.601517410963744e-07, + "loss": 0.3533, + "step": 851 + }, + { + "epoch": 0.15434782608695652, + "grad_norm": 6.58319396975539, + "learning_rate": 9.600368871995171e-07, + "loss": 0.3647, + "step": 852 + }, + { + "epoch": 0.15452898550724636, + "grad_norm": 6.30529673226947, + "learning_rate": 9.599218749097795e-07, + "loss": 0.3892, + "step": 853 + }, + { + "epoch": 0.15471014492753624, + "grad_norm": 3.516847429110062, + "learning_rate": 9.598067042667615e-07, + "loss": 0.3497, + "step": 854 + }, + { + "epoch": 0.15489130434782608, + "grad_norm": 7.011751073550093, + "learning_rate": 9.596913753101164e-07, + "loss": 0.3427, + "step": 855 + }, + { + "epoch": 0.15507246376811595, + "grad_norm": 15.093237775139633, + "learning_rate": 9.595758880795528e-07, + "loss": 0.3937, + "step": 856 + }, + { + "epoch": 0.1552536231884058, + "grad_norm": 7.4426286572501406, + "learning_rate": 9.594602426148333e-07, + "loss": 0.3569, + "step": 857 + }, + { + "epoch": 0.15543478260869564, + "grad_norm": 3.432755358759048, + "learning_rate": 9.593444389557754e-07, + "loss": 0.3318, + "step": 858 + }, + { + "epoch": 0.15561594202898552, + "grad_norm": 4.003942739573997, + "learning_rate": 9.592284771422508e-07, + "loss": 0.3702, + "step": 859 + }, + { + "epoch": 0.15579710144927536, + "grad_norm": 4.348993560705254, + "learning_rate": 9.591123572141855e-07, + "loss": 0.3199, + "step": 860 + }, + { + "epoch": 0.1559782608695652, + "grad_norm": 4.255913600894319, + "learning_rate": 9.589960792115604e-07, + "loss": 0.3644, + "step": 861 + }, + { + "epoch": 0.15615942028985508, + "grad_norm": 3.7138425667155657, + "learning_rate": 9.588796431744104e-07, + "loss": 0.416, + "step": 862 + }, + { + "epoch": 0.15634057971014492, + "grad_norm": 2.9089075514409983, + "learning_rate": 9.587630491428251e-07, + "loss": 0.3494, + "step": 863 + }, + { + "epoch": 0.1565217391304348, + "grad_norm": 9.313295612231036, + "learning_rate": 9.586462971569484e-07, + "loss": 0.3903, + "step": 864 + }, + { + "epoch": 0.15670289855072464, + "grad_norm": 18.45482805203893, + "learning_rate": 9.585293872569784e-07, + "loss": 0.3486, + "step": 865 + }, + { + "epoch": 0.15688405797101448, + "grad_norm": 9.596823852595307, + "learning_rate": 9.584123194831676e-07, + "loss": 0.3402, + "step": 866 + }, + { + "epoch": 0.15706521739130436, + "grad_norm": 20.850007743178377, + "learning_rate": 9.582950938758235e-07, + "loss": 0.4471, + "step": 867 + }, + { + "epoch": 0.1572463768115942, + "grad_norm": 7.402207466198938, + "learning_rate": 9.58177710475307e-07, + "loss": 0.3485, + "step": 868 + }, + { + "epoch": 0.15742753623188405, + "grad_norm": 3.2430225734636577, + "learning_rate": 9.58060169322034e-07, + "loss": 0.3873, + "step": 869 + }, + { + "epoch": 0.15760869565217392, + "grad_norm": 3.14898584025622, + "learning_rate": 9.579424704564742e-07, + "loss": 0.3349, + "step": 870 + }, + { + "epoch": 0.15778985507246376, + "grad_norm": 6.113644444204073, + "learning_rate": 9.57824613919152e-07, + "loss": 0.3682, + "step": 871 + }, + { + "epoch": 0.15797101449275364, + "grad_norm": 10.038499417889854, + "learning_rate": 9.577065997506462e-07, + "loss": 0.4148, + "step": 872 + }, + { + "epoch": 0.15815217391304348, + "grad_norm": 3.632432549337236, + "learning_rate": 9.575884279915893e-07, + "loss": 0.3898, + "step": 873 + }, + { + "epoch": 0.15833333333333333, + "grad_norm": 3.457209725662949, + "learning_rate": 9.574700986826686e-07, + "loss": 0.34, + "step": 874 + }, + { + "epoch": 0.1585144927536232, + "grad_norm": 3.77759273533897, + "learning_rate": 9.573516118646255e-07, + "loss": 0.3785, + "step": 875 + }, + { + "epoch": 0.15869565217391304, + "grad_norm": 7.5671753374998145, + "learning_rate": 9.572329675782554e-07, + "loss": 0.4427, + "step": 876 + }, + { + "epoch": 0.1588768115942029, + "grad_norm": 6.527966930844897, + "learning_rate": 9.571141658644079e-07, + "loss": 0.4471, + "step": 877 + }, + { + "epoch": 0.15905797101449276, + "grad_norm": 8.468264485855864, + "learning_rate": 9.569952067639876e-07, + "loss": 0.3697, + "step": 878 + }, + { + "epoch": 0.1592391304347826, + "grad_norm": 6.4731431205325425, + "learning_rate": 9.568760903179522e-07, + "loss": 0.3671, + "step": 879 + }, + { + "epoch": 0.15942028985507245, + "grad_norm": 4.506365643974041, + "learning_rate": 9.56756816567314e-07, + "loss": 0.3502, + "step": 880 + }, + { + "epoch": 0.15960144927536232, + "grad_norm": 5.409892690271166, + "learning_rate": 9.5663738555314e-07, + "loss": 0.3485, + "step": 881 + }, + { + "epoch": 0.15978260869565217, + "grad_norm": 3.655959457052786, + "learning_rate": 9.565177973165503e-07, + "loss": 0.3249, + "step": 882 + }, + { + "epoch": 0.15996376811594204, + "grad_norm": 10.263410742979417, + "learning_rate": 9.5639805189872e-07, + "loss": 0.3403, + "step": 883 + }, + { + "epoch": 0.16014492753623188, + "grad_norm": 3.960020975701844, + "learning_rate": 9.562781493408781e-07, + "loss": 0.335, + "step": 884 + }, + { + "epoch": 0.16032608695652173, + "grad_norm": 11.049851127898126, + "learning_rate": 9.561580896843075e-07, + "loss": 0.4307, + "step": 885 + }, + { + "epoch": 0.1605072463768116, + "grad_norm": 8.49611291863617, + "learning_rate": 9.560378729703453e-07, + "loss": 0.3697, + "step": 886 + }, + { + "epoch": 0.16068840579710145, + "grad_norm": 5.791306770435816, + "learning_rate": 9.559174992403825e-07, + "loss": 0.3317, + "step": 887 + }, + { + "epoch": 0.1608695652173913, + "grad_norm": 7.657815216224209, + "learning_rate": 9.557969685358646e-07, + "loss": 0.3938, + "step": 888 + }, + { + "epoch": 0.16105072463768116, + "grad_norm": 6.810366563771853, + "learning_rate": 9.55676280898291e-07, + "loss": 0.373, + "step": 889 + }, + { + "epoch": 0.161231884057971, + "grad_norm": 3.072323261181124, + "learning_rate": 9.555554363692146e-07, + "loss": 0.3734, + "step": 890 + }, + { + "epoch": 0.16141304347826088, + "grad_norm": 9.5510734788038, + "learning_rate": 9.55434434990243e-07, + "loss": 0.3297, + "step": 891 + }, + { + "epoch": 0.16159420289855073, + "grad_norm": 5.637895209116408, + "learning_rate": 9.553132768030377e-07, + "loss": 0.3337, + "step": 892 + }, + { + "epoch": 0.16177536231884057, + "grad_norm": 5.00893520226379, + "learning_rate": 9.551919618493137e-07, + "loss": 0.3373, + "step": 893 + }, + { + "epoch": 0.16195652173913044, + "grad_norm": 3.3584080604749693, + "learning_rate": 9.55070490170841e-07, + "loss": 0.3539, + "step": 894 + }, + { + "epoch": 0.1621376811594203, + "grad_norm": 5.186086911355004, + "learning_rate": 9.549488618094417e-07, + "loss": 0.3447, + "step": 895 + }, + { + "epoch": 0.16231884057971013, + "grad_norm": 4.71172306752271, + "learning_rate": 9.54827076806994e-07, + "loss": 0.3913, + "step": 896 + }, + { + "epoch": 0.1625, + "grad_norm": 6.159846189423231, + "learning_rate": 9.547051352054288e-07, + "loss": 0.3284, + "step": 897 + }, + { + "epoch": 0.16268115942028985, + "grad_norm": 3.5153420547325296, + "learning_rate": 9.54583037046731e-07, + "loss": 0.3806, + "step": 898 + }, + { + "epoch": 0.16286231884057972, + "grad_norm": 3.5131824704383665, + "learning_rate": 9.544607823729397e-07, + "loss": 0.3962, + "step": 899 + }, + { + "epoch": 0.16304347826086957, + "grad_norm": 9.511103902268252, + "learning_rate": 9.543383712261477e-07, + "loss": 0.3837, + "step": 900 + }, + { + "epoch": 0.16304347826086957, + "eval_loss": 0.37748438119888306, + "eval_runtime": 9.8032, + "eval_samples_per_second": 51.004, + "eval_steps_per_second": 0.102, + "step": 900 + }, + { + "epoch": 0.1632246376811594, + "grad_norm": 5.314558362677815, + "learning_rate": 9.542158036485017e-07, + "loss": 0.408, + "step": 901 + }, + { + "epoch": 0.16340579710144928, + "grad_norm": 5.2986074149954625, + "learning_rate": 9.540930796822025e-07, + "loss": 0.3248, + "step": 902 + }, + { + "epoch": 0.16358695652173913, + "grad_norm": 8.373249151123064, + "learning_rate": 9.539701993695047e-07, + "loss": 0.3659, + "step": 903 + }, + { + "epoch": 0.16376811594202897, + "grad_norm": 3.9471347645813473, + "learning_rate": 9.538471627527159e-07, + "loss": 0.3451, + "step": 904 + }, + { + "epoch": 0.16394927536231885, + "grad_norm": 8.60106479616005, + "learning_rate": 9.537239698741989e-07, + "loss": 0.3569, + "step": 905 + }, + { + "epoch": 0.1641304347826087, + "grad_norm": 6.160742255884759, + "learning_rate": 9.536006207763689e-07, + "loss": 0.3965, + "step": 906 + }, + { + "epoch": 0.16431159420289856, + "grad_norm": 5.851640936244484, + "learning_rate": 9.534771155016963e-07, + "loss": 0.3629, + "step": 907 + }, + { + "epoch": 0.1644927536231884, + "grad_norm": 6.746005514478953, + "learning_rate": 9.533534540927039e-07, + "loss": 0.4047, + "step": 908 + }, + { + "epoch": 0.16467391304347825, + "grad_norm": 7.654547234650914, + "learning_rate": 9.532296365919695e-07, + "loss": 0.4122, + "step": 909 + }, + { + "epoch": 0.16485507246376813, + "grad_norm": 3.908851215894925, + "learning_rate": 9.531056630421237e-07, + "loss": 0.395, + "step": 910 + }, + { + "epoch": 0.16503623188405797, + "grad_norm": 11.687031888304489, + "learning_rate": 9.529815334858513e-07, + "loss": 0.404, + "step": 911 + }, + { + "epoch": 0.16521739130434782, + "grad_norm": 7.180870776017081, + "learning_rate": 9.528572479658906e-07, + "loss": 0.3594, + "step": 912 + }, + { + "epoch": 0.1653985507246377, + "grad_norm": 7.414159772457074, + "learning_rate": 9.527328065250337e-07, + "loss": 0.3004, + "step": 913 + }, + { + "epoch": 0.16557971014492753, + "grad_norm": 10.31741860357107, + "learning_rate": 9.526082092061265e-07, + "loss": 0.3714, + "step": 914 + }, + { + "epoch": 0.16576086956521738, + "grad_norm": 5.360065023395594, + "learning_rate": 9.524834560520683e-07, + "loss": 0.4095, + "step": 915 + }, + { + "epoch": 0.16594202898550725, + "grad_norm": 11.373999458413381, + "learning_rate": 9.523585471058122e-07, + "loss": 0.3418, + "step": 916 + }, + { + "epoch": 0.1661231884057971, + "grad_norm": 6.350662250121829, + "learning_rate": 9.522334824103652e-07, + "loss": 0.4067, + "step": 917 + }, + { + "epoch": 0.16630434782608697, + "grad_norm": 3.4483211330563015, + "learning_rate": 9.521082620087874e-07, + "loss": 0.3484, + "step": 918 + }, + { + "epoch": 0.1664855072463768, + "grad_norm": 7.433083990164779, + "learning_rate": 9.519828859441927e-07, + "loss": 0.3609, + "step": 919 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 4.742072955737512, + "learning_rate": 9.51857354259749e-07, + "loss": 0.3688, + "step": 920 + }, + { + "epoch": 0.16684782608695653, + "grad_norm": 3.486922317106545, + "learning_rate": 9.517316669986773e-07, + "loss": 0.3536, + "step": 921 + }, + { + "epoch": 0.16702898550724637, + "grad_norm": 10.190529498816694, + "learning_rate": 9.516058242042523e-07, + "loss": 0.347, + "step": 922 + }, + { + "epoch": 0.16721014492753622, + "grad_norm": 5.361601627973088, + "learning_rate": 9.514798259198023e-07, + "loss": 0.3692, + "step": 923 + }, + { + "epoch": 0.1673913043478261, + "grad_norm": 9.072925983194018, + "learning_rate": 9.51353672188709e-07, + "loss": 0.383, + "step": 924 + }, + { + "epoch": 0.16757246376811594, + "grad_norm": 3.944725667584065, + "learning_rate": 9.512273630544076e-07, + "loss": 0.3909, + "step": 925 + }, + { + "epoch": 0.1677536231884058, + "grad_norm": 6.363206727826493, + "learning_rate": 9.511008985603874e-07, + "loss": 0.3341, + "step": 926 + }, + { + "epoch": 0.16793478260869565, + "grad_norm": 9.491061479422129, + "learning_rate": 9.509742787501905e-07, + "loss": 0.4163, + "step": 927 + }, + { + "epoch": 0.1681159420289855, + "grad_norm": 4.787527839018586, + "learning_rate": 9.508475036674126e-07, + "loss": 0.3746, + "step": 928 + }, + { + "epoch": 0.16829710144927537, + "grad_norm": 3.197899797504528, + "learning_rate": 9.50720573355703e-07, + "loss": 0.3116, + "step": 929 + }, + { + "epoch": 0.16847826086956522, + "grad_norm": 5.140509743780928, + "learning_rate": 9.505934878587645e-07, + "loss": 0.4061, + "step": 930 + }, + { + "epoch": 0.16865942028985506, + "grad_norm": 15.002553681035751, + "learning_rate": 9.504662472203531e-07, + "loss": 0.3604, + "step": 931 + }, + { + "epoch": 0.16884057971014493, + "grad_norm": 5.96837292583248, + "learning_rate": 9.503388514842785e-07, + "loss": 0.3799, + "step": 932 + }, + { + "epoch": 0.16902173913043478, + "grad_norm": 7.788969683101942, + "learning_rate": 9.502113006944035e-07, + "loss": 0.3005, + "step": 933 + }, + { + "epoch": 0.16920289855072465, + "grad_norm": 4.044286329813409, + "learning_rate": 9.500835948946445e-07, + "loss": 0.3247, + "step": 934 + }, + { + "epoch": 0.1693840579710145, + "grad_norm": 8.016480224087877, + "learning_rate": 9.499557341289712e-07, + "loss": 0.3712, + "step": 935 + }, + { + "epoch": 0.16956521739130434, + "grad_norm": 4.700508069097732, + "learning_rate": 9.498277184414069e-07, + "loss": 0.3466, + "step": 936 + }, + { + "epoch": 0.1697463768115942, + "grad_norm": 5.442230755902823, + "learning_rate": 9.496995478760277e-07, + "loss": 0.3927, + "step": 937 + }, + { + "epoch": 0.16992753623188406, + "grad_norm": 8.915551465170877, + "learning_rate": 9.495712224769634e-07, + "loss": 0.4575, + "step": 938 + }, + { + "epoch": 0.1701086956521739, + "grad_norm": 4.151027924138207, + "learning_rate": 9.494427422883973e-07, + "loss": 0.356, + "step": 939 + }, + { + "epoch": 0.17028985507246377, + "grad_norm": 5.718246214997718, + "learning_rate": 9.493141073545653e-07, + "loss": 0.3824, + "step": 940 + }, + { + "epoch": 0.17047101449275362, + "grad_norm": 3.685446969118357, + "learning_rate": 9.491853177197573e-07, + "loss": 0.335, + "step": 941 + }, + { + "epoch": 0.17065217391304346, + "grad_norm": 6.412322728638104, + "learning_rate": 9.490563734283162e-07, + "loss": 0.3681, + "step": 942 + }, + { + "epoch": 0.17083333333333334, + "grad_norm": 3.4483154072954916, + "learning_rate": 9.48927274524638e-07, + "loss": 0.3582, + "step": 943 + }, + { + "epoch": 0.17101449275362318, + "grad_norm": 3.198315352212821, + "learning_rate": 9.487980210531721e-07, + "loss": 0.3226, + "step": 944 + }, + { + "epoch": 0.17119565217391305, + "grad_norm": 4.118986349729483, + "learning_rate": 9.486686130584211e-07, + "loss": 0.3557, + "step": 945 + }, + { + "epoch": 0.1713768115942029, + "grad_norm": 5.52801025546669, + "learning_rate": 9.485390505849409e-07, + "loss": 0.338, + "step": 946 + }, + { + "epoch": 0.17155797101449274, + "grad_norm": 3.5767430582142485, + "learning_rate": 9.484093336773402e-07, + "loss": 0.3743, + "step": 947 + }, + { + "epoch": 0.17173913043478262, + "grad_norm": 2.8978836810437807, + "learning_rate": 9.482794623802813e-07, + "loss": 0.3447, + "step": 948 + }, + { + "epoch": 0.17192028985507246, + "grad_norm": 3.1757229252687202, + "learning_rate": 9.481494367384797e-07, + "loss": 0.3636, + "step": 949 + }, + { + "epoch": 0.1721014492753623, + "grad_norm": 4.031839709657643, + "learning_rate": 9.480192567967035e-07, + "loss": 0.343, + "step": 950 + }, + { + "epoch": 0.17228260869565218, + "grad_norm": 12.512170983104669, + "learning_rate": 9.478889225997744e-07, + "loss": 0.3585, + "step": 951 + }, + { + "epoch": 0.17246376811594202, + "grad_norm": 4.353936774623856, + "learning_rate": 9.477584341925672e-07, + "loss": 0.4444, + "step": 952 + }, + { + "epoch": 0.1726449275362319, + "grad_norm": 9.019787013891907, + "learning_rate": 9.476277916200095e-07, + "loss": 0.3483, + "step": 953 + }, + { + "epoch": 0.17282608695652174, + "grad_norm": 4.660307011902687, + "learning_rate": 9.474969949270824e-07, + "loss": 0.4307, + "step": 954 + }, + { + "epoch": 0.17300724637681159, + "grad_norm": 3.3725580448123145, + "learning_rate": 9.473660441588195e-07, + "loss": 0.3358, + "step": 955 + }, + { + "epoch": 0.17318840579710146, + "grad_norm": 3.777973545569461, + "learning_rate": 9.472349393603079e-07, + "loss": 0.4268, + "step": 956 + }, + { + "epoch": 0.1733695652173913, + "grad_norm": 8.176022737349475, + "learning_rate": 9.471036805766879e-07, + "loss": 0.353, + "step": 957 + }, + { + "epoch": 0.17355072463768115, + "grad_norm": 7.009417024695021, + "learning_rate": 9.46972267853152e-07, + "loss": 0.3792, + "step": 958 + }, + { + "epoch": 0.17373188405797102, + "grad_norm": 3.5393012585686616, + "learning_rate": 9.468407012349465e-07, + "loss": 0.4186, + "step": 959 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 3.0345096292351688, + "learning_rate": 9.467089807673705e-07, + "loss": 0.3476, + "step": 960 + }, + { + "epoch": 0.17409420289855074, + "grad_norm": 3.81454820468709, + "learning_rate": 9.465771064957758e-07, + "loss": 0.4319, + "step": 961 + }, + { + "epoch": 0.17427536231884058, + "grad_norm": 5.993156035622376, + "learning_rate": 9.464450784655674e-07, + "loss": 0.3436, + "step": 962 + }, + { + "epoch": 0.17445652173913043, + "grad_norm": 3.6909003706862955, + "learning_rate": 9.463128967222032e-07, + "loss": 0.3018, + "step": 963 + }, + { + "epoch": 0.1746376811594203, + "grad_norm": 14.570507151586428, + "learning_rate": 9.461805613111939e-07, + "loss": 0.4432, + "step": 964 + }, + { + "epoch": 0.17481884057971014, + "grad_norm": 5.674599814277299, + "learning_rate": 9.460480722781035e-07, + "loss": 0.3471, + "step": 965 + }, + { + "epoch": 0.175, + "grad_norm": 6.035949212202583, + "learning_rate": 9.459154296685484e-07, + "loss": 0.308, + "step": 966 + }, + { + "epoch": 0.17518115942028986, + "grad_norm": 3.208638660816449, + "learning_rate": 9.457826335281978e-07, + "loss": 0.3614, + "step": 967 + }, + { + "epoch": 0.1753623188405797, + "grad_norm": 3.5617227454126246, + "learning_rate": 9.456496839027745e-07, + "loss": 0.4059, + "step": 968 + }, + { + "epoch": 0.17554347826086958, + "grad_norm": 5.793243700113749, + "learning_rate": 9.455165808380534e-07, + "loss": 0.3929, + "step": 969 + }, + { + "epoch": 0.17572463768115942, + "grad_norm": 7.775133784202099, + "learning_rate": 9.453833243798628e-07, + "loss": 0.3406, + "step": 970 + }, + { + "epoch": 0.17590579710144927, + "grad_norm": 5.260090966172478, + "learning_rate": 9.452499145740831e-07, + "loss": 0.3907, + "step": 971 + }, + { + "epoch": 0.17608695652173914, + "grad_norm": 4.829127408307273, + "learning_rate": 9.451163514666483e-07, + "loss": 0.3336, + "step": 972 + }, + { + "epoch": 0.17626811594202899, + "grad_norm": 3.8242458029708852, + "learning_rate": 9.449826351035448e-07, + "loss": 0.3755, + "step": 973 + }, + { + "epoch": 0.17644927536231883, + "grad_norm": 8.500633066793206, + "learning_rate": 9.448487655308115e-07, + "loss": 0.3964, + "step": 974 + }, + { + "epoch": 0.1766304347826087, + "grad_norm": 3.442260634987885, + "learning_rate": 9.447147427945406e-07, + "loss": 0.3905, + "step": 975 + }, + { + "epoch": 0.17681159420289855, + "grad_norm": 3.5481962394012223, + "learning_rate": 9.445805669408765e-07, + "loss": 0.3429, + "step": 976 + }, + { + "epoch": 0.1769927536231884, + "grad_norm": 3.9489484074927526, + "learning_rate": 9.444462380160168e-07, + "loss": 0.3096, + "step": 977 + }, + { + "epoch": 0.17717391304347826, + "grad_norm": 3.774687773569502, + "learning_rate": 9.443117560662115e-07, + "loss": 0.3696, + "step": 978 + }, + { + "epoch": 0.1773550724637681, + "grad_norm": 3.557360706695933, + "learning_rate": 9.441771211377636e-07, + "loss": 0.3287, + "step": 979 + }, + { + "epoch": 0.17753623188405798, + "grad_norm": 3.8417257399064124, + "learning_rate": 9.440423332770281e-07, + "loss": 0.319, + "step": 980 + }, + { + "epoch": 0.17771739130434783, + "grad_norm": 4.566565366730673, + "learning_rate": 9.439073925304134e-07, + "loss": 0.3058, + "step": 981 + }, + { + "epoch": 0.17789855072463767, + "grad_norm": 7.3976658023800965, + "learning_rate": 9.437722989443802e-07, + "loss": 0.4134, + "step": 982 + }, + { + "epoch": 0.17807971014492754, + "grad_norm": 4.627771585188553, + "learning_rate": 9.436370525654418e-07, + "loss": 0.4205, + "step": 983 + }, + { + "epoch": 0.1782608695652174, + "grad_norm": 3.3030755440117505, + "learning_rate": 9.435016534401643e-07, + "loss": 0.3332, + "step": 984 + }, + { + "epoch": 0.17844202898550723, + "grad_norm": 8.834543020429823, + "learning_rate": 9.43366101615166e-07, + "loss": 0.3401, + "step": 985 + }, + { + "epoch": 0.1786231884057971, + "grad_norm": 8.402093992430165, + "learning_rate": 9.432303971371183e-07, + "loss": 0.3159, + "step": 986 + }, + { + "epoch": 0.17880434782608695, + "grad_norm": 10.264054501425852, + "learning_rate": 9.430945400527448e-07, + "loss": 0.3946, + "step": 987 + }, + { + "epoch": 0.17898550724637682, + "grad_norm": 3.954948520536284, + "learning_rate": 9.429585304088217e-07, + "loss": 0.3259, + "step": 988 + }, + { + "epoch": 0.17916666666666667, + "grad_norm": 3.8447824092246874, + "learning_rate": 9.428223682521778e-07, + "loss": 0.4027, + "step": 989 + }, + { + "epoch": 0.1793478260869565, + "grad_norm": 5.012065500188494, + "learning_rate": 9.426860536296946e-07, + "loss": 0.3368, + "step": 990 + }, + { + "epoch": 0.17952898550724639, + "grad_norm": 4.421023883537979, + "learning_rate": 9.425495865883053e-07, + "loss": 0.3147, + "step": 991 + }, + { + "epoch": 0.17971014492753623, + "grad_norm": 7.9101491709946625, + "learning_rate": 9.424129671749966e-07, + "loss": 0.4053, + "step": 992 + }, + { + "epoch": 0.17989130434782608, + "grad_norm": 4.053598227442787, + "learning_rate": 9.422761954368073e-07, + "loss": 0.3586, + "step": 993 + }, + { + "epoch": 0.18007246376811595, + "grad_norm": 5.957033916844329, + "learning_rate": 9.421392714208281e-07, + "loss": 0.3587, + "step": 994 + }, + { + "epoch": 0.1802536231884058, + "grad_norm": 4.195549867293666, + "learning_rate": 9.42002195174203e-07, + "loss": 0.4107, + "step": 995 + }, + { + "epoch": 0.18043478260869567, + "grad_norm": 4.509708808919222, + "learning_rate": 9.418649667441278e-07, + "loss": 0.36, + "step": 996 + }, + { + "epoch": 0.1806159420289855, + "grad_norm": 4.679967495725039, + "learning_rate": 9.417275861778509e-07, + "loss": 0.3694, + "step": 997 + }, + { + "epoch": 0.18079710144927535, + "grad_norm": 6.081344390427841, + "learning_rate": 9.415900535226733e-07, + "loss": 0.3435, + "step": 998 + }, + { + "epoch": 0.18097826086956523, + "grad_norm": 4.464610848455029, + "learning_rate": 9.414523688259477e-07, + "loss": 0.343, + "step": 999 + }, + { + "epoch": 0.18115942028985507, + "grad_norm": 5.465069141893093, + "learning_rate": 9.413145321350801e-07, + "loss": 0.3831, + "step": 1000 + }, + { + "epoch": 0.18115942028985507, + "eval_loss": 0.3557968735694885, + "eval_runtime": 9.7981, + "eval_samples_per_second": 51.03, + "eval_steps_per_second": 0.102, + "step": 1000 + }, + { + "epoch": 0.18134057971014492, + "grad_norm": 4.1988974120391225, + "learning_rate": 9.411765434975281e-07, + "loss": 0.3582, + "step": 1001 + }, + { + "epoch": 0.1815217391304348, + "grad_norm": 5.676730742098176, + "learning_rate": 9.410384029608016e-07, + "loss": 0.3672, + "step": 1002 + }, + { + "epoch": 0.18170289855072463, + "grad_norm": 5.374866950407323, + "learning_rate": 9.409001105724634e-07, + "loss": 0.3848, + "step": 1003 + }, + { + "epoch": 0.18188405797101448, + "grad_norm": 5.128296812499538, + "learning_rate": 9.407616663801283e-07, + "loss": 0.4016, + "step": 1004 + }, + { + "epoch": 0.18206521739130435, + "grad_norm": 3.31719734897798, + "learning_rate": 9.406230704314628e-07, + "loss": 0.3253, + "step": 1005 + }, + { + "epoch": 0.1822463768115942, + "grad_norm": 6.785966995929203, + "learning_rate": 9.404843227741867e-07, + "loss": 0.4005, + "step": 1006 + }, + { + "epoch": 0.18242753623188407, + "grad_norm": 6.838749556127302, + "learning_rate": 9.403454234560711e-07, + "loss": 0.3651, + "step": 1007 + }, + { + "epoch": 0.1826086956521739, + "grad_norm": 6.560158606171007, + "learning_rate": 9.402063725249396e-07, + "loss": 0.3732, + "step": 1008 + }, + { + "epoch": 0.18278985507246376, + "grad_norm": 3.86840262552788, + "learning_rate": 9.400671700286685e-07, + "loss": 0.3217, + "step": 1009 + }, + { + "epoch": 0.18297101449275363, + "grad_norm": 4.2179882987082165, + "learning_rate": 9.399278160151858e-07, + "loss": 0.3052, + "step": 1010 + }, + { + "epoch": 0.18315217391304348, + "grad_norm": 3.64792183603231, + "learning_rate": 9.397883105324713e-07, + "loss": 0.3477, + "step": 1011 + }, + { + "epoch": 0.18333333333333332, + "grad_norm": 6.892344215352792, + "learning_rate": 9.396486536285579e-07, + "loss": 0.4508, + "step": 1012 + }, + { + "epoch": 0.1835144927536232, + "grad_norm": 14.641369583306055, + "learning_rate": 9.395088453515301e-07, + "loss": 0.4146, + "step": 1013 + }, + { + "epoch": 0.18369565217391304, + "grad_norm": 15.832537940991099, + "learning_rate": 9.393688857495243e-07, + "loss": 0.34, + "step": 1014 + }, + { + "epoch": 0.1838768115942029, + "grad_norm": 4.3144035402251575, + "learning_rate": 9.392287748707292e-07, + "loss": 0.4022, + "step": 1015 + }, + { + "epoch": 0.18405797101449275, + "grad_norm": 6.261068339056287, + "learning_rate": 9.39088512763386e-07, + "loss": 0.3387, + "step": 1016 + }, + { + "epoch": 0.1842391304347826, + "grad_norm": 4.056070089493664, + "learning_rate": 9.389480994757873e-07, + "loss": 0.3511, + "step": 1017 + }, + { + "epoch": 0.18442028985507247, + "grad_norm": 6.733169082851925, + "learning_rate": 9.388075350562783e-07, + "loss": 0.405, + "step": 1018 + }, + { + "epoch": 0.18460144927536232, + "grad_norm": 3.8826643978883304, + "learning_rate": 9.386668195532557e-07, + "loss": 0.3667, + "step": 1019 + }, + { + "epoch": 0.18478260869565216, + "grad_norm": 3.5398140355329204, + "learning_rate": 9.385259530151688e-07, + "loss": 0.355, + "step": 1020 + }, + { + "epoch": 0.18496376811594203, + "grad_norm": 3.2992884292362694, + "learning_rate": 9.383849354905184e-07, + "loss": 0.3199, + "step": 1021 + }, + { + "epoch": 0.18514492753623188, + "grad_norm": 3.158293490792938, + "learning_rate": 9.382437670278578e-07, + "loss": 0.3208, + "step": 1022 + }, + { + "epoch": 0.18532608695652175, + "grad_norm": 7.364388737807517, + "learning_rate": 9.381024476757915e-07, + "loss": 0.3758, + "step": 1023 + }, + { + "epoch": 0.1855072463768116, + "grad_norm": 8.152792736078732, + "learning_rate": 9.379609774829769e-07, + "loss": 0.3214, + "step": 1024 + }, + { + "epoch": 0.18568840579710144, + "grad_norm": 4.680932173783523, + "learning_rate": 9.378193564981225e-07, + "loss": 0.3177, + "step": 1025 + }, + { + "epoch": 0.1858695652173913, + "grad_norm": 3.2460016757868186, + "learning_rate": 9.376775847699894e-07, + "loss": 0.3685, + "step": 1026 + }, + { + "epoch": 0.18605072463768116, + "grad_norm": 3.181593046712226, + "learning_rate": 9.375356623473899e-07, + "loss": 0.3076, + "step": 1027 + }, + { + "epoch": 0.186231884057971, + "grad_norm": 4.144203924103408, + "learning_rate": 9.373935892791889e-07, + "loss": 0.3441, + "step": 1028 + }, + { + "epoch": 0.18641304347826088, + "grad_norm": 8.878600569479174, + "learning_rate": 9.372513656143026e-07, + "loss": 0.3803, + "step": 1029 + }, + { + "epoch": 0.18659420289855072, + "grad_norm": 3.300176819811975, + "learning_rate": 9.371089914016995e-07, + "loss": 0.334, + "step": 1030 + }, + { + "epoch": 0.1867753623188406, + "grad_norm": 4.256776263086497, + "learning_rate": 9.369664666903996e-07, + "loss": 0.3546, + "step": 1031 + }, + { + "epoch": 0.18695652173913044, + "grad_norm": 6.922059186282991, + "learning_rate": 9.368237915294748e-07, + "loss": 0.3782, + "step": 1032 + }, + { + "epoch": 0.18713768115942028, + "grad_norm": 7.345206267770565, + "learning_rate": 9.366809659680488e-07, + "loss": 0.3101, + "step": 1033 + }, + { + "epoch": 0.18731884057971016, + "grad_norm": 3.7060120071990137, + "learning_rate": 9.365379900552972e-07, + "loss": 0.3377, + "step": 1034 + }, + { + "epoch": 0.1875, + "grad_norm": 9.258034061223405, + "learning_rate": 9.363948638404472e-07, + "loss": 0.3306, + "step": 1035 + }, + { + "epoch": 0.18768115942028984, + "grad_norm": 3.756822751714663, + "learning_rate": 9.36251587372778e-07, + "loss": 0.3321, + "step": 1036 + }, + { + "epoch": 0.18786231884057972, + "grad_norm": 3.873198801213808, + "learning_rate": 9.361081607016202e-07, + "loss": 0.3748, + "step": 1037 + }, + { + "epoch": 0.18804347826086956, + "grad_norm": 4.118267251180074, + "learning_rate": 9.359645838763564e-07, + "loss": 0.3533, + "step": 1038 + }, + { + "epoch": 0.1882246376811594, + "grad_norm": 3.516923362026741, + "learning_rate": 9.358208569464208e-07, + "loss": 0.32, + "step": 1039 + }, + { + "epoch": 0.18840579710144928, + "grad_norm": 7.744720378017167, + "learning_rate": 9.35676979961299e-07, + "loss": 0.3836, + "step": 1040 + }, + { + "epoch": 0.18858695652173912, + "grad_norm": 5.030928842442586, + "learning_rate": 9.355329529705288e-07, + "loss": 0.3219, + "step": 1041 + }, + { + "epoch": 0.188768115942029, + "grad_norm": 3.538556785759717, + "learning_rate": 9.353887760236994e-07, + "loss": 0.3073, + "step": 1042 + }, + { + "epoch": 0.18894927536231884, + "grad_norm": 4.596728290588321, + "learning_rate": 9.352444491704513e-07, + "loss": 0.3112, + "step": 1043 + }, + { + "epoch": 0.1891304347826087, + "grad_norm": 3.870053971875424, + "learning_rate": 9.350999724604772e-07, + "loss": 0.3183, + "step": 1044 + }, + { + "epoch": 0.18931159420289856, + "grad_norm": 7.761349154182568, + "learning_rate": 9.349553459435211e-07, + "loss": 0.3843, + "step": 1045 + }, + { + "epoch": 0.1894927536231884, + "grad_norm": 8.152976149577444, + "learning_rate": 9.348105696693785e-07, + "loss": 0.3494, + "step": 1046 + }, + { + "epoch": 0.18967391304347825, + "grad_norm": 12.263657947687328, + "learning_rate": 9.346656436878965e-07, + "loss": 0.3679, + "step": 1047 + }, + { + "epoch": 0.18985507246376812, + "grad_norm": 3.4252123995475636, + "learning_rate": 9.34520568048974e-07, + "loss": 0.3684, + "step": 1048 + }, + { + "epoch": 0.19003623188405797, + "grad_norm": 7.322953184670889, + "learning_rate": 9.343753428025611e-07, + "loss": 0.3858, + "step": 1049 + }, + { + "epoch": 0.19021739130434784, + "grad_norm": 3.720919985019423, + "learning_rate": 9.342299679986596e-07, + "loss": 0.352, + "step": 1050 + }, + { + "epoch": 0.19039855072463768, + "grad_norm": 5.309245134565914, + "learning_rate": 9.340844436873226e-07, + "loss": 0.3127, + "step": 1051 + }, + { + "epoch": 0.19057971014492753, + "grad_norm": 6.088834503363983, + "learning_rate": 9.33938769918655e-07, + "loss": 0.3327, + "step": 1052 + }, + { + "epoch": 0.1907608695652174, + "grad_norm": 5.244885531702647, + "learning_rate": 9.337929467428128e-07, + "loss": 0.3776, + "step": 1053 + }, + { + "epoch": 0.19094202898550725, + "grad_norm": 5.222800044527979, + "learning_rate": 9.336469742100037e-07, + "loss": 0.3658, + "step": 1054 + }, + { + "epoch": 0.1911231884057971, + "grad_norm": 4.565827218524475, + "learning_rate": 9.335008523704867e-07, + "loss": 0.315, + "step": 1055 + }, + { + "epoch": 0.19130434782608696, + "grad_norm": 4.38386894383268, + "learning_rate": 9.333545812745723e-07, + "loss": 0.3693, + "step": 1056 + }, + { + "epoch": 0.1914855072463768, + "grad_norm": 3.3435317732837455, + "learning_rate": 9.332081609726224e-07, + "loss": 0.3228, + "step": 1057 + }, + { + "epoch": 0.19166666666666668, + "grad_norm": 6.5740815579589205, + "learning_rate": 9.330615915150498e-07, + "loss": 0.3694, + "step": 1058 + }, + { + "epoch": 0.19184782608695652, + "grad_norm": 9.56767830970739, + "learning_rate": 9.329148729523195e-07, + "loss": 0.3828, + "step": 1059 + }, + { + "epoch": 0.19202898550724637, + "grad_norm": 3.3478168389268785, + "learning_rate": 9.327680053349474e-07, + "loss": 0.329, + "step": 1060 + }, + { + "epoch": 0.19221014492753624, + "grad_norm": 4.839261044548201, + "learning_rate": 9.326209887135004e-07, + "loss": 0.2738, + "step": 1061 + }, + { + "epoch": 0.1923913043478261, + "grad_norm": 5.4335721948293365, + "learning_rate": 9.324738231385971e-07, + "loss": 0.336, + "step": 1062 + }, + { + "epoch": 0.19257246376811593, + "grad_norm": 3.43564934617691, + "learning_rate": 9.323265086609076e-07, + "loss": 0.3337, + "step": 1063 + }, + { + "epoch": 0.1927536231884058, + "grad_norm": 5.187357822085098, + "learning_rate": 9.321790453311527e-07, + "loss": 0.3221, + "step": 1064 + }, + { + "epoch": 0.19293478260869565, + "grad_norm": 3.687748786376032, + "learning_rate": 9.320314332001047e-07, + "loss": 0.3907, + "step": 1065 + }, + { + "epoch": 0.19311594202898552, + "grad_norm": 3.601776619294207, + "learning_rate": 9.318836723185872e-07, + "loss": 0.3178, + "step": 1066 + }, + { + "epoch": 0.19329710144927537, + "grad_norm": 3.4853584536899236, + "learning_rate": 9.317357627374751e-07, + "loss": 0.3132, + "step": 1067 + }, + { + "epoch": 0.1934782608695652, + "grad_norm": 6.302551817302995, + "learning_rate": 9.315877045076943e-07, + "loss": 0.3544, + "step": 1068 + }, + { + "epoch": 0.19365942028985508, + "grad_norm": 3.759177455225637, + "learning_rate": 9.31439497680222e-07, + "loss": 0.3145, + "step": 1069 + }, + { + "epoch": 0.19384057971014493, + "grad_norm": 3.36728008038697, + "learning_rate": 9.312911423060863e-07, + "loss": 0.3589, + "step": 1070 + }, + { + "epoch": 0.19402173913043477, + "grad_norm": 4.154088418490633, + "learning_rate": 9.31142638436367e-07, + "loss": 0.342, + "step": 1071 + }, + { + "epoch": 0.19420289855072465, + "grad_norm": 4.255679049260611, + "learning_rate": 9.309939861221943e-07, + "loss": 0.3596, + "step": 1072 + }, + { + "epoch": 0.1943840579710145, + "grad_norm": 3.6152782589923005, + "learning_rate": 9.308451854147501e-07, + "loss": 0.3298, + "step": 1073 + }, + { + "epoch": 0.19456521739130433, + "grad_norm": 6.809714669231455, + "learning_rate": 9.306962363652673e-07, + "loss": 0.3673, + "step": 1074 + }, + { + "epoch": 0.1947463768115942, + "grad_norm": 7.817390681594236, + "learning_rate": 9.305471390250294e-07, + "loss": 0.3857, + "step": 1075 + }, + { + "epoch": 0.19492753623188405, + "grad_norm": 8.462995843106821, + "learning_rate": 9.303978934453718e-07, + "loss": 0.382, + "step": 1076 + }, + { + "epoch": 0.19510869565217392, + "grad_norm": 6.192447694817275, + "learning_rate": 9.302484996776801e-07, + "loss": 0.348, + "step": 1077 + }, + { + "epoch": 0.19528985507246377, + "grad_norm": 4.879352825100935, + "learning_rate": 9.300989577733915e-07, + "loss": 0.3686, + "step": 1078 + }, + { + "epoch": 0.19547101449275361, + "grad_norm": 9.443225979075029, + "learning_rate": 9.299492677839938e-07, + "loss": 0.3484, + "step": 1079 + }, + { + "epoch": 0.1956521739130435, + "grad_norm": 7.902031406191935, + "learning_rate": 9.297994297610261e-07, + "loss": 0.3103, + "step": 1080 + }, + { + "epoch": 0.19583333333333333, + "grad_norm": 16.296221093472447, + "learning_rate": 9.296494437560782e-07, + "loss": 0.3459, + "step": 1081 + }, + { + "epoch": 0.19601449275362318, + "grad_norm": 11.415756234070233, + "learning_rate": 9.294993098207913e-07, + "loss": 0.3062, + "step": 1082 + }, + { + "epoch": 0.19619565217391305, + "grad_norm": 10.317470423482591, + "learning_rate": 9.293490280068567e-07, + "loss": 0.3843, + "step": 1083 + }, + { + "epoch": 0.1963768115942029, + "grad_norm": 4.599444632856712, + "learning_rate": 9.291985983660178e-07, + "loss": 0.3362, + "step": 1084 + }, + { + "epoch": 0.19655797101449277, + "grad_norm": 5.607029721056904, + "learning_rate": 9.290480209500678e-07, + "loss": 0.403, + "step": 1085 + }, + { + "epoch": 0.1967391304347826, + "grad_norm": 6.211688839085225, + "learning_rate": 9.288972958108512e-07, + "loss": 0.3273, + "step": 1086 + }, + { + "epoch": 0.19692028985507246, + "grad_norm": 3.478190361462845, + "learning_rate": 9.287464230002635e-07, + "loss": 0.3742, + "step": 1087 + }, + { + "epoch": 0.19710144927536233, + "grad_norm": 5.812794593798648, + "learning_rate": 9.28595402570251e-07, + "loss": 0.3475, + "step": 1088 + }, + { + "epoch": 0.19728260869565217, + "grad_norm": 3.72849361710505, + "learning_rate": 9.284442345728107e-07, + "loss": 0.2964, + "step": 1089 + }, + { + "epoch": 0.19746376811594202, + "grad_norm": 5.949450789308594, + "learning_rate": 9.282929190599904e-07, + "loss": 0.3248, + "step": 1090 + }, + { + "epoch": 0.1976449275362319, + "grad_norm": 5.259186209578336, + "learning_rate": 9.281414560838888e-07, + "loss": 0.3251, + "step": 1091 + }, + { + "epoch": 0.19782608695652174, + "grad_norm": 7.072859407675492, + "learning_rate": 9.279898456966551e-07, + "loss": 0.2894, + "step": 1092 + }, + { + "epoch": 0.1980072463768116, + "grad_norm": 15.20510906931546, + "learning_rate": 9.278380879504899e-07, + "loss": 0.3811, + "step": 1093 + }, + { + "epoch": 0.19818840579710145, + "grad_norm": 7.818422015322425, + "learning_rate": 9.276861828976437e-07, + "loss": 0.3401, + "step": 1094 + }, + { + "epoch": 0.1983695652173913, + "grad_norm": 3.3437589662257152, + "learning_rate": 9.275341305904185e-07, + "loss": 0.3343, + "step": 1095 + }, + { + "epoch": 0.19855072463768117, + "grad_norm": 6.749822156460748, + "learning_rate": 9.273819310811664e-07, + "loss": 0.3954, + "step": 1096 + }, + { + "epoch": 0.19873188405797101, + "grad_norm": 5.763810971844492, + "learning_rate": 9.272295844222902e-07, + "loss": 0.3429, + "step": 1097 + }, + { + "epoch": 0.19891304347826086, + "grad_norm": 3.767825869199621, + "learning_rate": 9.27077090666244e-07, + "loss": 0.3553, + "step": 1098 + }, + { + "epoch": 0.19909420289855073, + "grad_norm": 11.354287594239699, + "learning_rate": 9.26924449865532e-07, + "loss": 0.3663, + "step": 1099 + }, + { + "epoch": 0.19927536231884058, + "grad_norm": 3.7643877260071297, + "learning_rate": 9.267716620727091e-07, + "loss": 0.3576, + "step": 1100 + }, + { + "epoch": 0.19927536231884058, + "eval_loss": 0.3427187502384186, + "eval_runtime": 9.7492, + "eval_samples_per_second": 51.286, + "eval_steps_per_second": 0.103, + "step": 1100 + }, + { + "epoch": 0.19945652173913042, + "grad_norm": 6.128790262274886, + "learning_rate": 9.266187273403808e-07, + "loss": 0.3613, + "step": 1101 + }, + { + "epoch": 0.1996376811594203, + "grad_norm": 11.424083402715372, + "learning_rate": 9.264656457212034e-07, + "loss": 0.4133, + "step": 1102 + }, + { + "epoch": 0.19981884057971014, + "grad_norm": 3.842995725930491, + "learning_rate": 9.263124172678835e-07, + "loss": 0.3736, + "step": 1103 + }, + { + "epoch": 0.2, + "grad_norm": 5.506934004661734, + "learning_rate": 9.261590420331784e-07, + "loss": 0.3165, + "step": 1104 + }, + { + "epoch": 0.20018115942028986, + "grad_norm": 3.8890109901948615, + "learning_rate": 9.26005520069896e-07, + "loss": 0.3175, + "step": 1105 + }, + { + "epoch": 0.2003623188405797, + "grad_norm": 3.589317743829576, + "learning_rate": 9.258518514308944e-07, + "loss": 0.3052, + "step": 1106 + }, + { + "epoch": 0.20054347826086957, + "grad_norm": 5.520151848132921, + "learning_rate": 9.256980361690827e-07, + "loss": 0.3307, + "step": 1107 + }, + { + "epoch": 0.20072463768115942, + "grad_norm": 10.784318716483138, + "learning_rate": 9.255440743374203e-07, + "loss": 0.3625, + "step": 1108 + }, + { + "epoch": 0.20090579710144926, + "grad_norm": 4.166893125999619, + "learning_rate": 9.253899659889168e-07, + "loss": 0.3052, + "step": 1109 + }, + { + "epoch": 0.20108695652173914, + "grad_norm": 11.740812290864191, + "learning_rate": 9.252357111766323e-07, + "loss": 0.3665, + "step": 1110 + }, + { + "epoch": 0.20126811594202898, + "grad_norm": 6.716346974200295, + "learning_rate": 9.250813099536778e-07, + "loss": 0.3451, + "step": 1111 + }, + { + "epoch": 0.20144927536231885, + "grad_norm": 4.498535643674493, + "learning_rate": 9.249267623732141e-07, + "loss": 0.3234, + "step": 1112 + }, + { + "epoch": 0.2016304347826087, + "grad_norm": 8.40625660849219, + "learning_rate": 9.247720684884528e-07, + "loss": 0.3078, + "step": 1113 + }, + { + "epoch": 0.20181159420289854, + "grad_norm": 5.2609762947034335, + "learning_rate": 9.246172283526557e-07, + "loss": 0.3471, + "step": 1114 + }, + { + "epoch": 0.20199275362318841, + "grad_norm": 11.903469401576388, + "learning_rate": 9.24462242019135e-07, + "loss": 0.3791, + "step": 1115 + }, + { + "epoch": 0.20217391304347826, + "grad_norm": 4.398252956752326, + "learning_rate": 9.243071095412534e-07, + "loss": 0.3436, + "step": 1116 + }, + { + "epoch": 0.2023550724637681, + "grad_norm": 8.785207159846003, + "learning_rate": 9.241518309724233e-07, + "loss": 0.2999, + "step": 1117 + }, + { + "epoch": 0.20253623188405798, + "grad_norm": 4.357553444883357, + "learning_rate": 9.239964063661083e-07, + "loss": 0.3042, + "step": 1118 + }, + { + "epoch": 0.20271739130434782, + "grad_norm": 12.05948814062256, + "learning_rate": 9.238408357758216e-07, + "loss": 0.317, + "step": 1119 + }, + { + "epoch": 0.2028985507246377, + "grad_norm": 6.7722231202873795, + "learning_rate": 9.236851192551269e-07, + "loss": 0.338, + "step": 1120 + }, + { + "epoch": 0.20307971014492754, + "grad_norm": 6.220502547650929, + "learning_rate": 9.235292568576383e-07, + "loss": 0.3484, + "step": 1121 + }, + { + "epoch": 0.20326086956521738, + "grad_norm": 4.044774034395157, + "learning_rate": 9.233732486370198e-07, + "loss": 0.3544, + "step": 1122 + }, + { + "epoch": 0.20344202898550726, + "grad_norm": 11.663325350898504, + "learning_rate": 9.232170946469858e-07, + "loss": 0.3819, + "step": 1123 + }, + { + "epoch": 0.2036231884057971, + "grad_norm": 16.325034677045284, + "learning_rate": 9.230607949413007e-07, + "loss": 0.4426, + "step": 1124 + }, + { + "epoch": 0.20380434782608695, + "grad_norm": 4.667412193785131, + "learning_rate": 9.229043495737796e-07, + "loss": 0.3227, + "step": 1125 + }, + { + "epoch": 0.20398550724637682, + "grad_norm": 10.525774925483608, + "learning_rate": 9.227477585982871e-07, + "loss": 0.3619, + "step": 1126 + }, + { + "epoch": 0.20416666666666666, + "grad_norm": 5.219206604324908, + "learning_rate": 9.225910220687383e-07, + "loss": 0.3131, + "step": 1127 + }, + { + "epoch": 0.20434782608695654, + "grad_norm": 6.393631117302292, + "learning_rate": 9.224341400390983e-07, + "loss": 0.373, + "step": 1128 + }, + { + "epoch": 0.20452898550724638, + "grad_norm": 4.014103010525397, + "learning_rate": 9.222771125633825e-07, + "loss": 0.3151, + "step": 1129 + }, + { + "epoch": 0.20471014492753623, + "grad_norm": 3.6917550993065613, + "learning_rate": 9.221199396956558e-07, + "loss": 0.3845, + "step": 1130 + }, + { + "epoch": 0.2048913043478261, + "grad_norm": 3.346092054285078, + "learning_rate": 9.21962621490034e-07, + "loss": 0.3389, + "step": 1131 + }, + { + "epoch": 0.20507246376811594, + "grad_norm": 8.044549538911658, + "learning_rate": 9.218051580006823e-07, + "loss": 0.3457, + "step": 1132 + }, + { + "epoch": 0.2052536231884058, + "grad_norm": 3.5806257550617664, + "learning_rate": 9.216475492818161e-07, + "loss": 0.3435, + "step": 1133 + }, + { + "epoch": 0.20543478260869566, + "grad_norm": 3.568645542258272, + "learning_rate": 9.21489795387701e-07, + "loss": 0.3588, + "step": 1134 + }, + { + "epoch": 0.2056159420289855, + "grad_norm": 4.71861334120439, + "learning_rate": 9.213318963726522e-07, + "loss": 0.3979, + "step": 1135 + }, + { + "epoch": 0.20579710144927535, + "grad_norm": 3.6290599462987667, + "learning_rate": 9.211738522910351e-07, + "loss": 0.2969, + "step": 1136 + }, + { + "epoch": 0.20597826086956522, + "grad_norm": 5.7895554874818, + "learning_rate": 9.210156631972652e-07, + "loss": 0.3373, + "step": 1137 + }, + { + "epoch": 0.20615942028985507, + "grad_norm": 3.6089215727395283, + "learning_rate": 9.208573291458075e-07, + "loss": 0.3846, + "step": 1138 + }, + { + "epoch": 0.20634057971014494, + "grad_norm": 9.414747750175813, + "learning_rate": 9.206988501911774e-07, + "loss": 0.3043, + "step": 1139 + }, + { + "epoch": 0.20652173913043478, + "grad_norm": 5.021675194345018, + "learning_rate": 9.205402263879399e-07, + "loss": 0.2858, + "step": 1140 + }, + { + "epoch": 0.20670289855072463, + "grad_norm": 8.691479291707761, + "learning_rate": 9.203814577907099e-07, + "loss": 0.3541, + "step": 1141 + }, + { + "epoch": 0.2068840579710145, + "grad_norm": 6.702702835093648, + "learning_rate": 9.20222544454152e-07, + "loss": 0.3394, + "step": 1142 + }, + { + "epoch": 0.20706521739130435, + "grad_norm": 4.306005758339415, + "learning_rate": 9.200634864329813e-07, + "loss": 0.3832, + "step": 1143 + }, + { + "epoch": 0.2072463768115942, + "grad_norm": 5.707737634342709, + "learning_rate": 9.199042837819617e-07, + "loss": 0.312, + "step": 1144 + }, + { + "epoch": 0.20742753623188406, + "grad_norm": 6.739596517891878, + "learning_rate": 9.197449365559076e-07, + "loss": 0.3229, + "step": 1145 + }, + { + "epoch": 0.2076086956521739, + "grad_norm": 5.032932932874201, + "learning_rate": 9.195854448096831e-07, + "loss": 0.4449, + "step": 1146 + }, + { + "epoch": 0.20778985507246378, + "grad_norm": 14.342779741181634, + "learning_rate": 9.19425808598202e-07, + "loss": 0.387, + "step": 1147 + }, + { + "epoch": 0.20797101449275363, + "grad_norm": 7.487756939525193, + "learning_rate": 9.192660279764278e-07, + "loss": 0.3715, + "step": 1148 + }, + { + "epoch": 0.20815217391304347, + "grad_norm": 5.961515436167527, + "learning_rate": 9.191061029993734e-07, + "loss": 0.3369, + "step": 1149 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 3.510495429788573, + "learning_rate": 9.189460337221021e-07, + "loss": 0.3005, + "step": 1150 + }, + { + "epoch": 0.2085144927536232, + "grad_norm": 3.714024418691465, + "learning_rate": 9.187858201997264e-07, + "loss": 0.285, + "step": 1151 + }, + { + "epoch": 0.20869565217391303, + "grad_norm": 6.725784531538931, + "learning_rate": 9.186254624874085e-07, + "loss": 0.3585, + "step": 1152 + }, + { + "epoch": 0.2088768115942029, + "grad_norm": 6.605587077891203, + "learning_rate": 9.184649606403604e-07, + "loss": 0.3703, + "step": 1153 + }, + { + "epoch": 0.20905797101449275, + "grad_norm": 3.5370045653615296, + "learning_rate": 9.183043147138436e-07, + "loss": 0.3839, + "step": 1154 + }, + { + "epoch": 0.20923913043478262, + "grad_norm": 6.352120373584736, + "learning_rate": 9.181435247631693e-07, + "loss": 0.3477, + "step": 1155 + }, + { + "epoch": 0.20942028985507247, + "grad_norm": 7.113565388564078, + "learning_rate": 9.179825908436983e-07, + "loss": 0.3512, + "step": 1156 + }, + { + "epoch": 0.2096014492753623, + "grad_norm": 6.148093784747192, + "learning_rate": 9.178215130108407e-07, + "loss": 0.3801, + "step": 1157 + }, + { + "epoch": 0.20978260869565218, + "grad_norm": 4.117767830767374, + "learning_rate": 9.176602913200567e-07, + "loss": 0.3722, + "step": 1158 + }, + { + "epoch": 0.20996376811594203, + "grad_norm": 7.290988329095725, + "learning_rate": 9.174989258268551e-07, + "loss": 0.3242, + "step": 1159 + }, + { + "epoch": 0.21014492753623187, + "grad_norm": 3.4110712136513355, + "learning_rate": 9.173374165867955e-07, + "loss": 0.3235, + "step": 1160 + }, + { + "epoch": 0.21032608695652175, + "grad_norm": 4.201425450924089, + "learning_rate": 9.171757636554859e-07, + "loss": 0.3658, + "step": 1161 + }, + { + "epoch": 0.2105072463768116, + "grad_norm": 7.851814670231926, + "learning_rate": 9.170139670885841e-07, + "loss": 0.3036, + "step": 1162 + }, + { + "epoch": 0.21068840579710144, + "grad_norm": 7.337011470424802, + "learning_rate": 9.168520269417978e-07, + "loss": 0.3761, + "step": 1163 + }, + { + "epoch": 0.2108695652173913, + "grad_norm": 9.634430631387612, + "learning_rate": 9.166899432708835e-07, + "loss": 0.3451, + "step": 1164 + }, + { + "epoch": 0.21105072463768115, + "grad_norm": 4.375900947739495, + "learning_rate": 9.165277161316473e-07, + "loss": 0.3838, + "step": 1165 + }, + { + "epoch": 0.21123188405797103, + "grad_norm": 5.230338603681001, + "learning_rate": 9.16365345579945e-07, + "loss": 0.3036, + "step": 1166 + }, + { + "epoch": 0.21141304347826087, + "grad_norm": 6.95064796883076, + "learning_rate": 9.162028316716815e-07, + "loss": 0.3689, + "step": 1167 + }, + { + "epoch": 0.21159420289855072, + "grad_norm": 5.169954445208204, + "learning_rate": 9.160401744628108e-07, + "loss": 0.3853, + "step": 1168 + }, + { + "epoch": 0.2117753623188406, + "grad_norm": 6.078275299631071, + "learning_rate": 9.15877374009337e-07, + "loss": 0.3278, + "step": 1169 + }, + { + "epoch": 0.21195652173913043, + "grad_norm": 4.771621676358347, + "learning_rate": 9.157144303673128e-07, + "loss": 0.3604, + "step": 1170 + }, + { + "epoch": 0.21213768115942028, + "grad_norm": 11.24121228358597, + "learning_rate": 9.155513435928405e-07, + "loss": 0.3915, + "step": 1171 + }, + { + "epoch": 0.21231884057971015, + "grad_norm": 3.498503875826368, + "learning_rate": 9.153881137420717e-07, + "loss": 0.3715, + "step": 1172 + }, + { + "epoch": 0.2125, + "grad_norm": 3.6629310625294367, + "learning_rate": 9.152247408712073e-07, + "loss": 0.3061, + "step": 1173 + }, + { + "epoch": 0.21268115942028987, + "grad_norm": 6.637769743333661, + "learning_rate": 9.15061225036497e-07, + "loss": 0.3265, + "step": 1174 + }, + { + "epoch": 0.2128623188405797, + "grad_norm": 3.595438248559532, + "learning_rate": 9.148975662942404e-07, + "loss": 0.3375, + "step": 1175 + }, + { + "epoch": 0.21304347826086956, + "grad_norm": 4.042233150619379, + "learning_rate": 9.14733764700786e-07, + "loss": 0.3159, + "step": 1176 + }, + { + "epoch": 0.21322463768115943, + "grad_norm": 8.690905380385374, + "learning_rate": 9.145698203125313e-07, + "loss": 0.3509, + "step": 1177 + }, + { + "epoch": 0.21340579710144927, + "grad_norm": 3.5465642413707124, + "learning_rate": 9.144057331859232e-07, + "loss": 0.3578, + "step": 1178 + }, + { + "epoch": 0.21358695652173912, + "grad_norm": 3.077868937570869, + "learning_rate": 9.142415033774577e-07, + "loss": 0.3231, + "step": 1179 + }, + { + "epoch": 0.213768115942029, + "grad_norm": 3.1355293571853315, + "learning_rate": 9.140771309436798e-07, + "loss": 0.2754, + "step": 1180 + }, + { + "epoch": 0.21394927536231884, + "grad_norm": 5.896185940355914, + "learning_rate": 9.139126159411838e-07, + "loss": 0.3365, + "step": 1181 + }, + { + "epoch": 0.2141304347826087, + "grad_norm": 5.491191469508045, + "learning_rate": 9.137479584266132e-07, + "loss": 0.3892, + "step": 1182 + }, + { + "epoch": 0.21431159420289855, + "grad_norm": 6.511034906567878, + "learning_rate": 9.135831584566597e-07, + "loss": 0.3632, + "step": 1183 + }, + { + "epoch": 0.2144927536231884, + "grad_norm": 6.513532927129578, + "learning_rate": 9.134182160880655e-07, + "loss": 0.427, + "step": 1184 + }, + { + "epoch": 0.21467391304347827, + "grad_norm": 3.1819913425535016, + "learning_rate": 9.132531313776207e-07, + "loss": 0.2913, + "step": 1185 + }, + { + "epoch": 0.21485507246376812, + "grad_norm": 9.642877373841666, + "learning_rate": 9.130879043821645e-07, + "loss": 0.3699, + "step": 1186 + }, + { + "epoch": 0.21503623188405796, + "grad_norm": 4.5689952729653065, + "learning_rate": 9.129225351585856e-07, + "loss": 0.361, + "step": 1187 + }, + { + "epoch": 0.21521739130434783, + "grad_norm": 3.5447655527158717, + "learning_rate": 9.127570237638214e-07, + "loss": 0.3415, + "step": 1188 + }, + { + "epoch": 0.21539855072463768, + "grad_norm": 5.129574453628034, + "learning_rate": 9.125913702548583e-07, + "loss": 0.3459, + "step": 1189 + }, + { + "epoch": 0.21557971014492755, + "grad_norm": 3.5775440093323105, + "learning_rate": 9.124255746887314e-07, + "loss": 0.285, + "step": 1190 + }, + { + "epoch": 0.2157608695652174, + "grad_norm": 4.653005461091044, + "learning_rate": 9.122596371225253e-07, + "loss": 0.2984, + "step": 1191 + }, + { + "epoch": 0.21594202898550724, + "grad_norm": 4.472719753949903, + "learning_rate": 9.120935576133726e-07, + "loss": 0.3945, + "step": 1192 + }, + { + "epoch": 0.2161231884057971, + "grad_norm": 4.606381116902868, + "learning_rate": 9.119273362184554e-07, + "loss": 0.365, + "step": 1193 + }, + { + "epoch": 0.21630434782608696, + "grad_norm": 6.334844303794688, + "learning_rate": 9.117609729950047e-07, + "loss": 0.3936, + "step": 1194 + }, + { + "epoch": 0.2164855072463768, + "grad_norm": 5.284068988281052, + "learning_rate": 9.115944680003001e-07, + "loss": 0.301, + "step": 1195 + }, + { + "epoch": 0.21666666666666667, + "grad_norm": 7.116168200697639, + "learning_rate": 9.1142782129167e-07, + "loss": 0.3527, + "step": 1196 + }, + { + "epoch": 0.21684782608695652, + "grad_norm": 10.075623742982382, + "learning_rate": 9.112610329264915e-07, + "loss": 0.3031, + "step": 1197 + }, + { + "epoch": 0.21702898550724636, + "grad_norm": 4.8289485313476685, + "learning_rate": 9.110941029621908e-07, + "loss": 0.3308, + "step": 1198 + }, + { + "epoch": 0.21721014492753624, + "grad_norm": 3.954519819691023, + "learning_rate": 9.109270314562427e-07, + "loss": 0.3737, + "step": 1199 + }, + { + "epoch": 0.21739130434782608, + "grad_norm": 4.387893849234945, + "learning_rate": 9.107598184661707e-07, + "loss": 0.4003, + "step": 1200 + }, + { + "epoch": 0.21739130434782608, + "eval_loss": 0.34571874141693115, + "eval_runtime": 9.8889, + "eval_samples_per_second": 50.562, + "eval_steps_per_second": 0.101, + "step": 1200 + }, + { + "epoch": 0.21757246376811595, + "grad_norm": 3.5971048581940814, + "learning_rate": 9.105924640495468e-07, + "loss": 0.3813, + "step": 1201 + }, + { + "epoch": 0.2177536231884058, + "grad_norm": 5.389401264316595, + "learning_rate": 9.104249682639922e-07, + "loss": 0.3159, + "step": 1202 + }, + { + "epoch": 0.21793478260869564, + "grad_norm": 4.8130975732853045, + "learning_rate": 9.102573311671764e-07, + "loss": 0.3953, + "step": 1203 + }, + { + "epoch": 0.21811594202898552, + "grad_norm": 6.5490419402611, + "learning_rate": 9.100895528168177e-07, + "loss": 0.3469, + "step": 1204 + }, + { + "epoch": 0.21829710144927536, + "grad_norm": 5.080686226241223, + "learning_rate": 9.099216332706828e-07, + "loss": 0.3453, + "step": 1205 + }, + { + "epoch": 0.2184782608695652, + "grad_norm": 3.7277484103384673, + "learning_rate": 9.097535725865875e-07, + "loss": 0.3433, + "step": 1206 + }, + { + "epoch": 0.21865942028985508, + "grad_norm": 4.003709876317981, + "learning_rate": 9.095853708223955e-07, + "loss": 0.3447, + "step": 1207 + }, + { + "epoch": 0.21884057971014492, + "grad_norm": 3.7323889864612765, + "learning_rate": 9.094170280360198e-07, + "loss": 0.4109, + "step": 1208 + }, + { + "epoch": 0.2190217391304348, + "grad_norm": 9.602557538583484, + "learning_rate": 9.092485442854214e-07, + "loss": 0.4127, + "step": 1209 + }, + { + "epoch": 0.21920289855072464, + "grad_norm": 7.298048008276583, + "learning_rate": 9.0907991962861e-07, + "loss": 0.3828, + "step": 1210 + }, + { + "epoch": 0.21938405797101448, + "grad_norm": 7.943662289260129, + "learning_rate": 9.089111541236444e-07, + "loss": 0.3148, + "step": 1211 + }, + { + "epoch": 0.21956521739130436, + "grad_norm": 6.083209857329283, + "learning_rate": 9.087422478286308e-07, + "loss": 0.372, + "step": 1212 + }, + { + "epoch": 0.2197463768115942, + "grad_norm": 19.988960217370533, + "learning_rate": 9.085732008017245e-07, + "loss": 0.3387, + "step": 1213 + }, + { + "epoch": 0.21992753623188405, + "grad_norm": 6.151747595618726, + "learning_rate": 9.084040131011295e-07, + "loss": 0.3114, + "step": 1214 + }, + { + "epoch": 0.22010869565217392, + "grad_norm": 4.490543143811417, + "learning_rate": 9.082346847850974e-07, + "loss": 0.3727, + "step": 1215 + }, + { + "epoch": 0.22028985507246376, + "grad_norm": 5.233150505420256, + "learning_rate": 9.080652159119294e-07, + "loss": 0.3642, + "step": 1216 + }, + { + "epoch": 0.22047101449275364, + "grad_norm": 9.732369632834613, + "learning_rate": 9.078956065399739e-07, + "loss": 0.3257, + "step": 1217 + }, + { + "epoch": 0.22065217391304348, + "grad_norm": 10.272121018431124, + "learning_rate": 9.077258567276286e-07, + "loss": 0.3812, + "step": 1218 + }, + { + "epoch": 0.22083333333333333, + "grad_norm": 3.5933635529642474, + "learning_rate": 9.075559665333389e-07, + "loss": 0.4311, + "step": 1219 + }, + { + "epoch": 0.2210144927536232, + "grad_norm": 4.327113704145465, + "learning_rate": 9.073859360155989e-07, + "loss": 0.3447, + "step": 1220 + }, + { + "epoch": 0.22119565217391304, + "grad_norm": 8.95674866628867, + "learning_rate": 9.072157652329509e-07, + "loss": 0.3631, + "step": 1221 + }, + { + "epoch": 0.2213768115942029, + "grad_norm": 3.2292604916952086, + "learning_rate": 9.070454542439854e-07, + "loss": 0.3502, + "step": 1222 + }, + { + "epoch": 0.22155797101449276, + "grad_norm": 7.583161740693873, + "learning_rate": 9.068750031073414e-07, + "loss": 0.3164, + "step": 1223 + }, + { + "epoch": 0.2217391304347826, + "grad_norm": 3.0738902557289536, + "learning_rate": 9.067044118817062e-07, + "loss": 0.3077, + "step": 1224 + }, + { + "epoch": 0.22192028985507245, + "grad_norm": 4.161213231321821, + "learning_rate": 9.065336806258148e-07, + "loss": 0.361, + "step": 1225 + }, + { + "epoch": 0.22210144927536232, + "grad_norm": 6.967930997922288, + "learning_rate": 9.06362809398451e-07, + "loss": 0.3038, + "step": 1226 + }, + { + "epoch": 0.22228260869565217, + "grad_norm": 5.623370037391552, + "learning_rate": 9.061917982584466e-07, + "loss": 0.4154, + "step": 1227 + }, + { + "epoch": 0.22246376811594204, + "grad_norm": 7.558402566314015, + "learning_rate": 9.060206472646814e-07, + "loss": 0.4089, + "step": 1228 + }, + { + "epoch": 0.22264492753623188, + "grad_norm": 9.141692572595682, + "learning_rate": 9.058493564760836e-07, + "loss": 0.4023, + "step": 1229 + }, + { + "epoch": 0.22282608695652173, + "grad_norm": 4.196488676053457, + "learning_rate": 9.056779259516294e-07, + "loss": 0.3033, + "step": 1230 + }, + { + "epoch": 0.2230072463768116, + "grad_norm": 4.294348077674157, + "learning_rate": 9.055063557503433e-07, + "loss": 0.3704, + "step": 1231 + }, + { + "epoch": 0.22318840579710145, + "grad_norm": 6.530010340490466, + "learning_rate": 9.053346459312974e-07, + "loss": 0.3223, + "step": 1232 + }, + { + "epoch": 0.2233695652173913, + "grad_norm": 3.8651163122606316, + "learning_rate": 9.051627965536123e-07, + "loss": 0.4018, + "step": 1233 + }, + { + "epoch": 0.22355072463768116, + "grad_norm": 4.103493321918278, + "learning_rate": 9.049908076764569e-07, + "loss": 0.3248, + "step": 1234 + }, + { + "epoch": 0.223731884057971, + "grad_norm": 10.96942520906358, + "learning_rate": 9.048186793590475e-07, + "loss": 0.3412, + "step": 1235 + }, + { + "epoch": 0.22391304347826088, + "grad_norm": 6.126306015723247, + "learning_rate": 9.046464116606487e-07, + "loss": 0.3866, + "step": 1236 + }, + { + "epoch": 0.22409420289855073, + "grad_norm": 4.143063120786398, + "learning_rate": 9.04474004640573e-07, + "loss": 0.3171, + "step": 1237 + }, + { + "epoch": 0.22427536231884057, + "grad_norm": 3.492663278134232, + "learning_rate": 9.043014583581812e-07, + "loss": 0.2607, + "step": 1238 + }, + { + "epoch": 0.22445652173913044, + "grad_norm": 3.939432702923544, + "learning_rate": 9.041287728728816e-07, + "loss": 0.3498, + "step": 1239 + }, + { + "epoch": 0.2246376811594203, + "grad_norm": 5.335678788094838, + "learning_rate": 9.039559482441307e-07, + "loss": 0.2717, + "step": 1240 + }, + { + "epoch": 0.22481884057971013, + "grad_norm": 4.165989925810196, + "learning_rate": 9.037829845314328e-07, + "loss": 0.3733, + "step": 1241 + }, + { + "epoch": 0.225, + "grad_norm": 11.20480507288977, + "learning_rate": 9.036098817943402e-07, + "loss": 0.3663, + "step": 1242 + }, + { + "epoch": 0.22518115942028985, + "grad_norm": 5.938461493367334, + "learning_rate": 9.034366400924529e-07, + "loss": 0.3672, + "step": 1243 + }, + { + "epoch": 0.22536231884057972, + "grad_norm": 8.216875696420953, + "learning_rate": 9.03263259485419e-07, + "loss": 0.3799, + "step": 1244 + }, + { + "epoch": 0.22554347826086957, + "grad_norm": 7.1361281645982, + "learning_rate": 9.03089740032934e-07, + "loss": 0.294, + "step": 1245 + }, + { + "epoch": 0.2257246376811594, + "grad_norm": 3.2356137509866305, + "learning_rate": 9.029160817947419e-07, + "loss": 0.2933, + "step": 1246 + }, + { + "epoch": 0.22590579710144928, + "grad_norm": 5.649658822053839, + "learning_rate": 9.027422848306336e-07, + "loss": 0.3068, + "step": 1247 + }, + { + "epoch": 0.22608695652173913, + "grad_norm": 5.312495746898288, + "learning_rate": 9.025683492004483e-07, + "loss": 0.3898, + "step": 1248 + }, + { + "epoch": 0.22626811594202897, + "grad_norm": 5.50819495308927, + "learning_rate": 9.023942749640731e-07, + "loss": 0.3415, + "step": 1249 + }, + { + "epoch": 0.22644927536231885, + "grad_norm": 3.9040870967515784, + "learning_rate": 9.022200621814425e-07, + "loss": 0.3728, + "step": 1250 + }, + { + "epoch": 0.2266304347826087, + "grad_norm": 4.93548118832667, + "learning_rate": 9.020457109125386e-07, + "loss": 0.3264, + "step": 1251 + }, + { + "epoch": 0.22681159420289856, + "grad_norm": 3.4831176329452336, + "learning_rate": 9.018712212173915e-07, + "loss": 0.3494, + "step": 1252 + }, + { + "epoch": 0.2269927536231884, + "grad_norm": 3.762130987872424, + "learning_rate": 9.01696593156079e-07, + "loss": 0.3581, + "step": 1253 + }, + { + "epoch": 0.22717391304347825, + "grad_norm": 3.960631701449595, + "learning_rate": 9.01521826788726e-07, + "loss": 0.3625, + "step": 1254 + }, + { + "epoch": 0.22735507246376813, + "grad_norm": 6.354525962480676, + "learning_rate": 9.013469221755057e-07, + "loss": 0.3446, + "step": 1255 + }, + { + "epoch": 0.22753623188405797, + "grad_norm": 7.427381252438664, + "learning_rate": 9.011718793766384e-07, + "loss": 0.3044, + "step": 1256 + }, + { + "epoch": 0.22771739130434782, + "grad_norm": 11.772935437533649, + "learning_rate": 9.009966984523923e-07, + "loss": 0.3279, + "step": 1257 + }, + { + "epoch": 0.2278985507246377, + "grad_norm": 5.047475807930224, + "learning_rate": 9.008213794630829e-07, + "loss": 0.2904, + "step": 1258 + }, + { + "epoch": 0.22807971014492753, + "grad_norm": 5.431777163692616, + "learning_rate": 9.006459224690734e-07, + "loss": 0.3078, + "step": 1259 + }, + { + "epoch": 0.22826086956521738, + "grad_norm": 8.175128606886476, + "learning_rate": 9.004703275307746e-07, + "loss": 0.3635, + "step": 1260 + }, + { + "epoch": 0.22844202898550725, + "grad_norm": 6.837442307314524, + "learning_rate": 9.002945947086445e-07, + "loss": 0.307, + "step": 1261 + }, + { + "epoch": 0.2286231884057971, + "grad_norm": 9.695243546120613, + "learning_rate": 9.001187240631889e-07, + "loss": 0.326, + "step": 1262 + }, + { + "epoch": 0.22880434782608697, + "grad_norm": 3.7129745044453446, + "learning_rate": 8.999427156549606e-07, + "loss": 0.2894, + "step": 1263 + }, + { + "epoch": 0.2289855072463768, + "grad_norm": 8.327224382648378, + "learning_rate": 8.997665695445606e-07, + "loss": 0.3881, + "step": 1264 + }, + { + "epoch": 0.22916666666666666, + "grad_norm": 7.117905841961984, + "learning_rate": 8.995902857926363e-07, + "loss": 0.3668, + "step": 1265 + }, + { + "epoch": 0.22934782608695653, + "grad_norm": 3.4377048315831282, + "learning_rate": 8.994138644598834e-07, + "loss": 0.2948, + "step": 1266 + }, + { + "epoch": 0.22952898550724637, + "grad_norm": 4.12329980743616, + "learning_rate": 8.992373056070446e-07, + "loss": 0.3214, + "step": 1267 + }, + { + "epoch": 0.22971014492753622, + "grad_norm": 3.8999094160745753, + "learning_rate": 8.990606092949098e-07, + "loss": 0.3579, + "step": 1268 + }, + { + "epoch": 0.2298913043478261, + "grad_norm": 4.181327819322105, + "learning_rate": 8.988837755843164e-07, + "loss": 0.2867, + "step": 1269 + }, + { + "epoch": 0.23007246376811594, + "grad_norm": 9.358893804692244, + "learning_rate": 8.987068045361492e-07, + "loss": 0.3965, + "step": 1270 + }, + { + "epoch": 0.2302536231884058, + "grad_norm": 10.158978678505177, + "learning_rate": 8.9852969621134e-07, + "loss": 0.2748, + "step": 1271 + }, + { + "epoch": 0.23043478260869565, + "grad_norm": 7.262494787791029, + "learning_rate": 8.983524506708681e-07, + "loss": 0.3918, + "step": 1272 + }, + { + "epoch": 0.2306159420289855, + "grad_norm": 3.3771311901426024, + "learning_rate": 8.9817506797576e-07, + "loss": 0.3373, + "step": 1273 + }, + { + "epoch": 0.23079710144927537, + "grad_norm": 5.646504676227212, + "learning_rate": 8.979975481870895e-07, + "loss": 0.3661, + "step": 1274 + }, + { + "epoch": 0.23097826086956522, + "grad_norm": 6.455236003510512, + "learning_rate": 8.978198913659774e-07, + "loss": 0.3181, + "step": 1275 + }, + { + "epoch": 0.23115942028985506, + "grad_norm": 8.34846904885038, + "learning_rate": 8.976420975735917e-07, + "loss": 0.3246, + "step": 1276 + }, + { + "epoch": 0.23134057971014493, + "grad_norm": 9.142471943183933, + "learning_rate": 8.974641668711478e-07, + "loss": 0.4117, + "step": 1277 + }, + { + "epoch": 0.23152173913043478, + "grad_norm": 4.302318983661124, + "learning_rate": 8.972860993199081e-07, + "loss": 0.3284, + "step": 1278 + }, + { + "epoch": 0.23170289855072465, + "grad_norm": 4.896572809261174, + "learning_rate": 8.971078949811819e-07, + "loss": 0.3486, + "step": 1279 + }, + { + "epoch": 0.2318840579710145, + "grad_norm": 7.513370874886547, + "learning_rate": 8.969295539163258e-07, + "loss": 0.3539, + "step": 1280 + }, + { + "epoch": 0.23206521739130434, + "grad_norm": 5.216826327162976, + "learning_rate": 8.967510761867439e-07, + "loss": 0.3019, + "step": 1281 + }, + { + "epoch": 0.2322463768115942, + "grad_norm": 10.561040987414916, + "learning_rate": 8.965724618538864e-07, + "loss": 0.3461, + "step": 1282 + }, + { + "epoch": 0.23242753623188406, + "grad_norm": 6.928651931060982, + "learning_rate": 8.963937109792514e-07, + "loss": 0.3109, + "step": 1283 + }, + { + "epoch": 0.2326086956521739, + "grad_norm": 3.5977461335210723, + "learning_rate": 8.962148236243834e-07, + "loss": 0.3292, + "step": 1284 + }, + { + "epoch": 0.23278985507246377, + "grad_norm": 4.195109345427405, + "learning_rate": 8.960357998508745e-07, + "loss": 0.4093, + "step": 1285 + }, + { + "epoch": 0.23297101449275362, + "grad_norm": 5.683806680561408, + "learning_rate": 8.958566397203632e-07, + "loss": 0.3856, + "step": 1286 + }, + { + "epoch": 0.23315217391304346, + "grad_norm": 4.826928924661649, + "learning_rate": 8.956773432945353e-07, + "loss": 0.3077, + "step": 1287 + }, + { + "epoch": 0.23333333333333334, + "grad_norm": 4.606270305212832, + "learning_rate": 8.954979106351232e-07, + "loss": 0.3164, + "step": 1288 + }, + { + "epoch": 0.23351449275362318, + "grad_norm": 4.93460270541993, + "learning_rate": 8.953183418039065e-07, + "loss": 0.314, + "step": 1289 + }, + { + "epoch": 0.23369565217391305, + "grad_norm": 3.246901983250382, + "learning_rate": 8.951386368627118e-07, + "loss": 0.3032, + "step": 1290 + }, + { + "epoch": 0.2338768115942029, + "grad_norm": 5.779498570238692, + "learning_rate": 8.949587958734122e-07, + "loss": 0.3142, + "step": 1291 + }, + { + "epoch": 0.23405797101449274, + "grad_norm": 8.125378740331204, + "learning_rate": 8.947788188979279e-07, + "loss": 0.3669, + "step": 1292 + }, + { + "epoch": 0.23423913043478262, + "grad_norm": 4.227889042202217, + "learning_rate": 8.945987059982256e-07, + "loss": 0.3128, + "step": 1293 + }, + { + "epoch": 0.23442028985507246, + "grad_norm": 16.149319615221966, + "learning_rate": 8.944184572363193e-07, + "loss": 0.3649, + "step": 1294 + }, + { + "epoch": 0.2346014492753623, + "grad_norm": 5.333794372543125, + "learning_rate": 8.942380726742693e-07, + "loss": 0.3146, + "step": 1295 + }, + { + "epoch": 0.23478260869565218, + "grad_norm": 9.186710127409288, + "learning_rate": 8.940575523741832e-07, + "loss": 0.3411, + "step": 1296 + }, + { + "epoch": 0.23496376811594202, + "grad_norm": 9.451298148970924, + "learning_rate": 8.938768963982144e-07, + "loss": 0.356, + "step": 1297 + }, + { + "epoch": 0.2351449275362319, + "grad_norm": 7.616715147955477, + "learning_rate": 8.936961048085641e-07, + "loss": 0.3349, + "step": 1298 + }, + { + "epoch": 0.23532608695652174, + "grad_norm": 7.3813470652580895, + "learning_rate": 8.935151776674794e-07, + "loss": 0.4008, + "step": 1299 + }, + { + "epoch": 0.23550724637681159, + "grad_norm": 16.072109056540143, + "learning_rate": 8.933341150372546e-07, + "loss": 0.3997, + "step": 1300 + }, + { + "epoch": 0.23550724637681159, + "eval_loss": 0.34568750858306885, + "eval_runtime": 9.768, + "eval_samples_per_second": 51.188, + "eval_steps_per_second": 0.102, + "step": 1300 + }, + { + "epoch": 0.23568840579710146, + "grad_norm": 8.18685560719828, + "learning_rate": 8.931529169802304e-07, + "loss": 0.2744, + "step": 1301 + }, + { + "epoch": 0.2358695652173913, + "grad_norm": 6.465716870823023, + "learning_rate": 8.929715835587941e-07, + "loss": 0.333, + "step": 1302 + }, + { + "epoch": 0.23605072463768115, + "grad_norm": 8.482431902401542, + "learning_rate": 8.927901148353796e-07, + "loss": 0.3171, + "step": 1303 + }, + { + "epoch": 0.23623188405797102, + "grad_norm": 5.230415572187631, + "learning_rate": 8.926085108724674e-07, + "loss": 0.3434, + "step": 1304 + }, + { + "epoch": 0.23641304347826086, + "grad_norm": 5.573864729548042, + "learning_rate": 8.924267717325848e-07, + "loss": 0.3569, + "step": 1305 + }, + { + "epoch": 0.23659420289855074, + "grad_norm": 5.622625663198217, + "learning_rate": 8.922448974783052e-07, + "loss": 0.4255, + "step": 1306 + }, + { + "epoch": 0.23677536231884058, + "grad_norm": 5.2447050884088515, + "learning_rate": 8.92062888172249e-07, + "loss": 0.3503, + "step": 1307 + }, + { + "epoch": 0.23695652173913043, + "grad_norm": 6.259870194613778, + "learning_rate": 8.918807438770828e-07, + "loss": 0.3027, + "step": 1308 + }, + { + "epoch": 0.2371376811594203, + "grad_norm": 13.288604019758068, + "learning_rate": 8.916984646555197e-07, + "loss": 0.395, + "step": 1309 + }, + { + "epoch": 0.23731884057971014, + "grad_norm": 6.59668440470176, + "learning_rate": 8.915160505703192e-07, + "loss": 0.3228, + "step": 1310 + }, + { + "epoch": 0.2375, + "grad_norm": 9.884336737057396, + "learning_rate": 8.913335016842876e-07, + "loss": 0.3704, + "step": 1311 + }, + { + "epoch": 0.23768115942028986, + "grad_norm": 2.8813813729617523, + "learning_rate": 8.911508180602771e-07, + "loss": 0.3084, + "step": 1312 + }, + { + "epoch": 0.2378623188405797, + "grad_norm": 4.282031086280285, + "learning_rate": 8.909679997611868e-07, + "loss": 0.3296, + "step": 1313 + }, + { + "epoch": 0.23804347826086958, + "grad_norm": 3.2672018028003778, + "learning_rate": 8.907850468499614e-07, + "loss": 0.3535, + "step": 1314 + }, + { + "epoch": 0.23822463768115942, + "grad_norm": 7.19297103709532, + "learning_rate": 8.90601959389593e-07, + "loss": 0.3431, + "step": 1315 + }, + { + "epoch": 0.23840579710144927, + "grad_norm": 6.197478698744432, + "learning_rate": 8.904187374431193e-07, + "loss": 0.3391, + "step": 1316 + }, + { + "epoch": 0.23858695652173914, + "grad_norm": 6.2460577721160595, + "learning_rate": 8.902353810736245e-07, + "loss": 0.32, + "step": 1317 + }, + { + "epoch": 0.23876811594202899, + "grad_norm": 5.8844281610349105, + "learning_rate": 8.900518903442389e-07, + "loss": 0.3134, + "step": 1318 + }, + { + "epoch": 0.23894927536231883, + "grad_norm": 6.4498932788279575, + "learning_rate": 8.898682653181393e-07, + "loss": 0.3516, + "step": 1319 + }, + { + "epoch": 0.2391304347826087, + "grad_norm": 3.490721115063493, + "learning_rate": 8.89684506058549e-07, + "loss": 0.3572, + "step": 1320 + }, + { + "epoch": 0.23931159420289855, + "grad_norm": 8.669454284194035, + "learning_rate": 8.895006126287366e-07, + "loss": 0.3188, + "step": 1321 + }, + { + "epoch": 0.2394927536231884, + "grad_norm": 9.137855564274608, + "learning_rate": 8.893165850920179e-07, + "loss": 0.3631, + "step": 1322 + }, + { + "epoch": 0.23967391304347826, + "grad_norm": 7.586913383241209, + "learning_rate": 8.891324235117543e-07, + "loss": 0.4019, + "step": 1323 + }, + { + "epoch": 0.2398550724637681, + "grad_norm": 3.4782370530772093, + "learning_rate": 8.889481279513536e-07, + "loss": 0.3165, + "step": 1324 + }, + { + "epoch": 0.24003623188405798, + "grad_norm": 5.828463556759546, + "learning_rate": 8.887636984742694e-07, + "loss": 0.4064, + "step": 1325 + }, + { + "epoch": 0.24021739130434783, + "grad_norm": 7.846689692953546, + "learning_rate": 8.885791351440019e-07, + "loss": 0.351, + "step": 1326 + }, + { + "epoch": 0.24039855072463767, + "grad_norm": 5.331304685578559, + "learning_rate": 8.883944380240971e-07, + "loss": 0.3246, + "step": 1327 + }, + { + "epoch": 0.24057971014492754, + "grad_norm": 11.152378530603334, + "learning_rate": 8.882096071781471e-07, + "loss": 0.3244, + "step": 1328 + }, + { + "epoch": 0.2407608695652174, + "grad_norm": 3.961230665090909, + "learning_rate": 8.880246426697899e-07, + "loss": 0.3408, + "step": 1329 + }, + { + "epoch": 0.24094202898550723, + "grad_norm": 3.0672571747120023, + "learning_rate": 8.878395445627096e-07, + "loss": 0.2731, + "step": 1330 + }, + { + "epoch": 0.2411231884057971, + "grad_norm": 5.452977269629534, + "learning_rate": 8.876543129206367e-07, + "loss": 0.3356, + "step": 1331 + }, + { + "epoch": 0.24130434782608695, + "grad_norm": 5.2650355665238955, + "learning_rate": 8.874689478073469e-07, + "loss": 0.3108, + "step": 1332 + }, + { + "epoch": 0.24148550724637682, + "grad_norm": 9.959647030914498, + "learning_rate": 8.872834492866628e-07, + "loss": 0.3843, + "step": 1333 + }, + { + "epoch": 0.24166666666666667, + "grad_norm": 6.934439359648218, + "learning_rate": 8.870978174224518e-07, + "loss": 0.3543, + "step": 1334 + }, + { + "epoch": 0.2418478260869565, + "grad_norm": 9.316571712068209, + "learning_rate": 8.869120522786284e-07, + "loss": 0.388, + "step": 1335 + }, + { + "epoch": 0.24202898550724639, + "grad_norm": 3.301519857759256, + "learning_rate": 8.867261539191521e-07, + "loss": 0.3263, + "step": 1336 + }, + { + "epoch": 0.24221014492753623, + "grad_norm": 4.572883545706603, + "learning_rate": 8.865401224080285e-07, + "loss": 0.3425, + "step": 1337 + }, + { + "epoch": 0.24239130434782608, + "grad_norm": 3.460192979791663, + "learning_rate": 8.863539578093095e-07, + "loss": 0.3369, + "step": 1338 + }, + { + "epoch": 0.24257246376811595, + "grad_norm": 4.543863120297665, + "learning_rate": 8.861676601870922e-07, + "loss": 0.2846, + "step": 1339 + }, + { + "epoch": 0.2427536231884058, + "grad_norm": 6.8038943778211465, + "learning_rate": 8.859812296055198e-07, + "loss": 0.3261, + "step": 1340 + }, + { + "epoch": 0.24293478260869567, + "grad_norm": 4.1486455698856615, + "learning_rate": 8.857946661287812e-07, + "loss": 0.3614, + "step": 1341 + }, + { + "epoch": 0.2431159420289855, + "grad_norm": 5.066788304426683, + "learning_rate": 8.856079698211109e-07, + "loss": 0.3289, + "step": 1342 + }, + { + "epoch": 0.24329710144927535, + "grad_norm": 2.930288002248603, + "learning_rate": 8.854211407467898e-07, + "loss": 0.2845, + "step": 1343 + }, + { + "epoch": 0.24347826086956523, + "grad_norm": 4.994460981204123, + "learning_rate": 8.852341789701439e-07, + "loss": 0.4155, + "step": 1344 + }, + { + "epoch": 0.24365942028985507, + "grad_norm": 6.28643347535595, + "learning_rate": 8.850470845555447e-07, + "loss": 0.3741, + "step": 1345 + }, + { + "epoch": 0.24384057971014492, + "grad_norm": 7.528701749209174, + "learning_rate": 8.848598575674099e-07, + "loss": 0.4038, + "step": 1346 + }, + { + "epoch": 0.2440217391304348, + "grad_norm": 5.138227496202631, + "learning_rate": 8.846724980702026e-07, + "loss": 0.3188, + "step": 1347 + }, + { + "epoch": 0.24420289855072463, + "grad_norm": 3.773449850025705, + "learning_rate": 8.844850061284317e-07, + "loss": 0.3418, + "step": 1348 + }, + { + "epoch": 0.24438405797101448, + "grad_norm": 6.675721193558435, + "learning_rate": 8.842973818066515e-07, + "loss": 0.3306, + "step": 1349 + }, + { + "epoch": 0.24456521739130435, + "grad_norm": 10.627721576771867, + "learning_rate": 8.841096251694618e-07, + "loss": 0.3078, + "step": 1350 + }, + { + "epoch": 0.2447463768115942, + "grad_norm": 6.955005601683814, + "learning_rate": 8.839217362815081e-07, + "loss": 0.3235, + "step": 1351 + }, + { + "epoch": 0.24492753623188407, + "grad_norm": 4.15898697119472, + "learning_rate": 8.837337152074818e-07, + "loss": 0.317, + "step": 1352 + }, + { + "epoch": 0.2451086956521739, + "grad_norm": 4.054679389135684, + "learning_rate": 8.83545562012119e-07, + "loss": 0.3682, + "step": 1353 + }, + { + "epoch": 0.24528985507246376, + "grad_norm": 7.587696624323504, + "learning_rate": 8.83357276760202e-07, + "loss": 0.3259, + "step": 1354 + }, + { + "epoch": 0.24547101449275363, + "grad_norm": 6.120132345277472, + "learning_rate": 8.831688595165583e-07, + "loss": 0.3007, + "step": 1355 + }, + { + "epoch": 0.24565217391304348, + "grad_norm": 3.7569121835614046, + "learning_rate": 8.829803103460607e-07, + "loss": 0.3274, + "step": 1356 + }, + { + "epoch": 0.24583333333333332, + "grad_norm": 4.661618593778217, + "learning_rate": 8.827916293136275e-07, + "loss": 0.3701, + "step": 1357 + }, + { + "epoch": 0.2460144927536232, + "grad_norm": 7.192015103330489, + "learning_rate": 8.826028164842228e-07, + "loss": 0.3249, + "step": 1358 + }, + { + "epoch": 0.24619565217391304, + "grad_norm": 5.832425782161701, + "learning_rate": 8.824138719228556e-07, + "loss": 0.3143, + "step": 1359 + }, + { + "epoch": 0.2463768115942029, + "grad_norm": 4.257475375062232, + "learning_rate": 8.822247956945803e-07, + "loss": 0.2422, + "step": 1360 + }, + { + "epoch": 0.24655797101449275, + "grad_norm": 10.83770867029943, + "learning_rate": 8.820355878644968e-07, + "loss": 0.4091, + "step": 1361 + }, + { + "epoch": 0.2467391304347826, + "grad_norm": 4.778022238706081, + "learning_rate": 8.818462484977502e-07, + "loss": 0.3737, + "step": 1362 + }, + { + "epoch": 0.24692028985507247, + "grad_norm": 4.56655115812147, + "learning_rate": 8.816567776595312e-07, + "loss": 0.3578, + "step": 1363 + }, + { + "epoch": 0.24710144927536232, + "grad_norm": 6.3888702589878426, + "learning_rate": 8.814671754150754e-07, + "loss": 0.3076, + "step": 1364 + }, + { + "epoch": 0.24728260869565216, + "grad_norm": 8.767249853037171, + "learning_rate": 8.812774418296633e-07, + "loss": 0.3739, + "step": 1365 + }, + { + "epoch": 0.24746376811594203, + "grad_norm": 6.242333960535613, + "learning_rate": 8.810875769686217e-07, + "loss": 0.3472, + "step": 1366 + }, + { + "epoch": 0.24764492753623188, + "grad_norm": 9.666806036687468, + "learning_rate": 8.808975808973218e-07, + "loss": 0.3757, + "step": 1367 + }, + { + "epoch": 0.24782608695652175, + "grad_norm": 6.506336349139156, + "learning_rate": 8.807074536811798e-07, + "loss": 0.356, + "step": 1368 + }, + { + "epoch": 0.2480072463768116, + "grad_norm": 4.092291263586452, + "learning_rate": 8.805171953856578e-07, + "loss": 0.3098, + "step": 1369 + }, + { + "epoch": 0.24818840579710144, + "grad_norm": 4.800265389206893, + "learning_rate": 8.803268060762626e-07, + "loss": 0.3216, + "step": 1370 + }, + { + "epoch": 0.2483695652173913, + "grad_norm": 9.205497769879106, + "learning_rate": 8.80136285818546e-07, + "loss": 0.3859, + "step": 1371 + }, + { + "epoch": 0.24855072463768116, + "grad_norm": 3.7409099979489184, + "learning_rate": 8.799456346781051e-07, + "loss": 0.2962, + "step": 1372 + }, + { + "epoch": 0.248731884057971, + "grad_norm": 4.667003183725607, + "learning_rate": 8.797548527205818e-07, + "loss": 0.329, + "step": 1373 + }, + { + "epoch": 0.24891304347826088, + "grad_norm": 9.636347282381372, + "learning_rate": 8.795639400116636e-07, + "loss": 0.3747, + "step": 1374 + }, + { + "epoch": 0.24909420289855072, + "grad_norm": 10.690178820862956, + "learning_rate": 8.793728966170824e-07, + "loss": 0.3871, + "step": 1375 + }, + { + "epoch": 0.2492753623188406, + "grad_norm": 4.02348810073237, + "learning_rate": 8.791817226026152e-07, + "loss": 0.3476, + "step": 1376 + }, + { + "epoch": 0.24945652173913044, + "grad_norm": 7.015420659830804, + "learning_rate": 8.789904180340843e-07, + "loss": 0.3312, + "step": 1377 + }, + { + "epoch": 0.24963768115942028, + "grad_norm": 7.122428647887501, + "learning_rate": 8.78798982977357e-07, + "loss": 0.3777, + "step": 1378 + }, + { + "epoch": 0.24981884057971016, + "grad_norm": 3.781332296034064, + "learning_rate": 8.786074174983451e-07, + "loss": 0.3469, + "step": 1379 + }, + { + "epoch": 0.25, + "grad_norm": 6.51224209405785, + "learning_rate": 8.784157216630053e-07, + "loss": 0.2774, + "step": 1380 + }, + { + "epoch": 0.25018115942028984, + "grad_norm": 3.8409980872379723, + "learning_rate": 8.782238955373396e-07, + "loss": 0.3129, + "step": 1381 + }, + { + "epoch": 0.2503623188405797, + "grad_norm": 6.62076412471385, + "learning_rate": 8.780319391873947e-07, + "loss": 0.2931, + "step": 1382 + }, + { + "epoch": 0.2505434782608696, + "grad_norm": 4.753718622113229, + "learning_rate": 8.778398526792619e-07, + "loss": 0.2701, + "step": 1383 + }, + { + "epoch": 0.25072463768115943, + "grad_norm": 6.038729566210822, + "learning_rate": 8.776476360790775e-07, + "loss": 0.2765, + "step": 1384 + }, + { + "epoch": 0.2509057971014493, + "grad_norm": 7.094377821118904, + "learning_rate": 8.774552894530227e-07, + "loss": 0.3313, + "step": 1385 + }, + { + "epoch": 0.2510869565217391, + "grad_norm": 4.415781042475571, + "learning_rate": 8.772628128673233e-07, + "loss": 0.3672, + "step": 1386 + }, + { + "epoch": 0.25126811594202897, + "grad_norm": 4.518381347143787, + "learning_rate": 8.770702063882501e-07, + "loss": 0.2967, + "step": 1387 + }, + { + "epoch": 0.2514492753623188, + "grad_norm": 3.8771159358588765, + "learning_rate": 8.768774700821182e-07, + "loss": 0.3756, + "step": 1388 + }, + { + "epoch": 0.2516304347826087, + "grad_norm": 8.563823161926644, + "learning_rate": 8.766846040152875e-07, + "loss": 0.3777, + "step": 1389 + }, + { + "epoch": 0.25181159420289856, + "grad_norm": 5.497199992795406, + "learning_rate": 8.764916082541631e-07, + "loss": 0.3438, + "step": 1390 + }, + { + "epoch": 0.2519927536231884, + "grad_norm": 4.20332765962466, + "learning_rate": 8.76298482865194e-07, + "loss": 0.3364, + "step": 1391 + }, + { + "epoch": 0.25217391304347825, + "grad_norm": 5.173376962279245, + "learning_rate": 8.761052279148742e-07, + "loss": 0.3392, + "step": 1392 + }, + { + "epoch": 0.2523550724637681, + "grad_norm": 8.350085792266798, + "learning_rate": 8.759118434697426e-07, + "loss": 0.313, + "step": 1393 + }, + { + "epoch": 0.252536231884058, + "grad_norm": 6.651744425298431, + "learning_rate": 8.757183295963822e-07, + "loss": 0.2971, + "step": 1394 + }, + { + "epoch": 0.25271739130434784, + "grad_norm": 3.918365506340284, + "learning_rate": 8.755246863614205e-07, + "loss": 0.3379, + "step": 1395 + }, + { + "epoch": 0.2528985507246377, + "grad_norm": 3.777983201187088, + "learning_rate": 8.753309138315301e-07, + "loss": 0.3045, + "step": 1396 + }, + { + "epoch": 0.2530797101449275, + "grad_norm": 4.360621862342518, + "learning_rate": 8.751370120734278e-07, + "loss": 0.3767, + "step": 1397 + }, + { + "epoch": 0.2532608695652174, + "grad_norm": 3.742192485163964, + "learning_rate": 8.749429811538747e-07, + "loss": 0.2821, + "step": 1398 + }, + { + "epoch": 0.2534420289855073, + "grad_norm": 4.352832978771017, + "learning_rate": 8.747488211396767e-07, + "loss": 0.3097, + "step": 1399 + }, + { + "epoch": 0.2536231884057971, + "grad_norm": 5.536884522151105, + "learning_rate": 8.745545320976842e-07, + "loss": 0.3435, + "step": 1400 + }, + { + "epoch": 0.2536231884057971, + "eval_loss": 0.33445313572883606, + "eval_runtime": 9.7801, + "eval_samples_per_second": 51.124, + "eval_steps_per_second": 0.102, + "step": 1400 + }, + { + "epoch": 0.25380434782608696, + "grad_norm": 5.400364690622893, + "learning_rate": 8.743601140947913e-07, + "loss": 0.3466, + "step": 1401 + }, + { + "epoch": 0.2539855072463768, + "grad_norm": 3.2657391415928276, + "learning_rate": 8.741655671979376e-07, + "loss": 0.2774, + "step": 1402 + }, + { + "epoch": 0.25416666666666665, + "grad_norm": 7.22819625773207, + "learning_rate": 8.739708914741061e-07, + "loss": 0.3492, + "step": 1403 + }, + { + "epoch": 0.2543478260869565, + "grad_norm": 6.45319667061321, + "learning_rate": 8.737760869903247e-07, + "loss": 0.3256, + "step": 1404 + }, + { + "epoch": 0.2545289855072464, + "grad_norm": 6.030233811979014, + "learning_rate": 8.735811538136658e-07, + "loss": 0.3067, + "step": 1405 + }, + { + "epoch": 0.25471014492753624, + "grad_norm": 10.12705551702608, + "learning_rate": 8.733860920112454e-07, + "loss": 0.3061, + "step": 1406 + }, + { + "epoch": 0.2548913043478261, + "grad_norm": 4.538490626905695, + "learning_rate": 8.731909016502246e-07, + "loss": 0.2935, + "step": 1407 + }, + { + "epoch": 0.25507246376811593, + "grad_norm": 8.729687840643468, + "learning_rate": 8.72995582797808e-07, + "loss": 0.4079, + "step": 1408 + }, + { + "epoch": 0.2552536231884058, + "grad_norm": 3.8079831634083083, + "learning_rate": 8.728001355212449e-07, + "loss": 0.3504, + "step": 1409 + }, + { + "epoch": 0.2554347826086957, + "grad_norm": 10.674909153311495, + "learning_rate": 8.726045598878288e-07, + "loss": 0.3547, + "step": 1410 + }, + { + "epoch": 0.2556159420289855, + "grad_norm": 3.860182279531041, + "learning_rate": 8.724088559648974e-07, + "loss": 0.3016, + "step": 1411 + }, + { + "epoch": 0.25579710144927537, + "grad_norm": 5.922315898188411, + "learning_rate": 8.722130238198322e-07, + "loss": 0.3296, + "step": 1412 + }, + { + "epoch": 0.2559782608695652, + "grad_norm": 4.622691813487052, + "learning_rate": 8.720170635200594e-07, + "loss": 0.2657, + "step": 1413 + }, + { + "epoch": 0.25615942028985506, + "grad_norm": 16.349822303303963, + "learning_rate": 8.718209751330491e-07, + "loss": 0.3943, + "step": 1414 + }, + { + "epoch": 0.2563405797101449, + "grad_norm": 5.153830498466211, + "learning_rate": 8.716247587263153e-07, + "loss": 0.347, + "step": 1415 + }, + { + "epoch": 0.2565217391304348, + "grad_norm": 4.988211826887706, + "learning_rate": 8.714284143674162e-07, + "loss": 0.3018, + "step": 1416 + }, + { + "epoch": 0.25670289855072465, + "grad_norm": 4.837196743173714, + "learning_rate": 8.712319421239541e-07, + "loss": 0.3177, + "step": 1417 + }, + { + "epoch": 0.2568840579710145, + "grad_norm": 3.7804238674886537, + "learning_rate": 8.710353420635754e-07, + "loss": 0.3129, + "step": 1418 + }, + { + "epoch": 0.25706521739130433, + "grad_norm": 4.151605183838069, + "learning_rate": 8.708386142539705e-07, + "loss": 0.3663, + "step": 1419 + }, + { + "epoch": 0.2572463768115942, + "grad_norm": 4.875564503388854, + "learning_rate": 8.706417587628737e-07, + "loss": 0.3683, + "step": 1420 + }, + { + "epoch": 0.2574275362318841, + "grad_norm": 3.8614735515668897, + "learning_rate": 8.704447756580631e-07, + "loss": 0.3386, + "step": 1421 + }, + { + "epoch": 0.2576086956521739, + "grad_norm": 8.498932272802971, + "learning_rate": 8.702476650073611e-07, + "loss": 0.3494, + "step": 1422 + }, + { + "epoch": 0.25778985507246377, + "grad_norm": 4.193900444081362, + "learning_rate": 8.700504268786338e-07, + "loss": 0.3281, + "step": 1423 + }, + { + "epoch": 0.2579710144927536, + "grad_norm": 4.631827800441487, + "learning_rate": 8.698530613397912e-07, + "loss": 0.312, + "step": 1424 + }, + { + "epoch": 0.25815217391304346, + "grad_norm": 6.276391739029858, + "learning_rate": 8.696555684587872e-07, + "loss": 0.3261, + "step": 1425 + }, + { + "epoch": 0.25833333333333336, + "grad_norm": 4.120197921507397, + "learning_rate": 8.694579483036194e-07, + "loss": 0.3121, + "step": 1426 + }, + { + "epoch": 0.2585144927536232, + "grad_norm": 4.9825472713501835, + "learning_rate": 8.692602009423296e-07, + "loss": 0.3355, + "step": 1427 + }, + { + "epoch": 0.25869565217391305, + "grad_norm": 5.043230537479501, + "learning_rate": 8.690623264430028e-07, + "loss": 0.3173, + "step": 1428 + }, + { + "epoch": 0.2588768115942029, + "grad_norm": 5.093064812812383, + "learning_rate": 8.688643248737686e-07, + "loss": 0.3303, + "step": 1429 + }, + { + "epoch": 0.25905797101449274, + "grad_norm": 8.815620314056591, + "learning_rate": 8.686661963027995e-07, + "loss": 0.296, + "step": 1430 + }, + { + "epoch": 0.2592391304347826, + "grad_norm": 4.862947599875481, + "learning_rate": 8.684679407983122e-07, + "loss": 0.307, + "step": 1431 + }, + { + "epoch": 0.2594202898550725, + "grad_norm": 3.535632912298651, + "learning_rate": 8.682695584285671e-07, + "loss": 0.3596, + "step": 1432 + }, + { + "epoch": 0.25960144927536233, + "grad_norm": 4.97525373354084, + "learning_rate": 8.680710492618682e-07, + "loss": 0.2819, + "step": 1433 + }, + { + "epoch": 0.2597826086956522, + "grad_norm": 5.947201326926296, + "learning_rate": 8.678724133665629e-07, + "loss": 0.3759, + "step": 1434 + }, + { + "epoch": 0.259963768115942, + "grad_norm": 3.101348659685582, + "learning_rate": 8.676736508110428e-07, + "loss": 0.3224, + "step": 1435 + }, + { + "epoch": 0.26014492753623186, + "grad_norm": 4.420082967897086, + "learning_rate": 8.674747616637426e-07, + "loss": 0.392, + "step": 1436 + }, + { + "epoch": 0.26032608695652176, + "grad_norm": 4.660980675047265, + "learning_rate": 8.67275745993141e-07, + "loss": 0.3455, + "step": 1437 + }, + { + "epoch": 0.2605072463768116, + "grad_norm": 7.488751325359456, + "learning_rate": 8.670766038677597e-07, + "loss": 0.386, + "step": 1438 + }, + { + "epoch": 0.26068840579710145, + "grad_norm": 3.1463366824675285, + "learning_rate": 8.668773353561645e-07, + "loss": 0.3355, + "step": 1439 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 6.845061398395753, + "learning_rate": 8.666779405269644e-07, + "loss": 0.3263, + "step": 1440 + }, + { + "epoch": 0.26105072463768114, + "grad_norm": 3.499376675959107, + "learning_rate": 8.66478419448812e-07, + "loss": 0.2943, + "step": 1441 + }, + { + "epoch": 0.26123188405797104, + "grad_norm": 8.645841686681946, + "learning_rate": 8.662787721904034e-07, + "loss": 0.3257, + "step": 1442 + }, + { + "epoch": 0.2614130434782609, + "grad_norm": 3.343405746176046, + "learning_rate": 8.66078998820478e-07, + "loss": 0.3149, + "step": 1443 + }, + { + "epoch": 0.26159420289855073, + "grad_norm": 8.822584109198377, + "learning_rate": 8.658790994078189e-07, + "loss": 0.2464, + "step": 1444 + }, + { + "epoch": 0.2617753623188406, + "grad_norm": 5.3212478975305295, + "learning_rate": 8.656790740212523e-07, + "loss": 0.35, + "step": 1445 + }, + { + "epoch": 0.2619565217391304, + "grad_norm": 5.783678413914947, + "learning_rate": 8.654789227296478e-07, + "loss": 0.3188, + "step": 1446 + }, + { + "epoch": 0.26213768115942027, + "grad_norm": 4.678612259950098, + "learning_rate": 8.652786456019186e-07, + "loss": 0.2831, + "step": 1447 + }, + { + "epoch": 0.26231884057971017, + "grad_norm": 7.487509081249481, + "learning_rate": 8.65078242707021e-07, + "loss": 0.3231, + "step": 1448 + }, + { + "epoch": 0.2625, + "grad_norm": 5.755727898886248, + "learning_rate": 8.648777141139547e-07, + "loss": 0.4016, + "step": 1449 + }, + { + "epoch": 0.26268115942028986, + "grad_norm": 4.898748510106668, + "learning_rate": 8.646770598917625e-07, + "loss": 0.2902, + "step": 1450 + }, + { + "epoch": 0.2628623188405797, + "grad_norm": 5.939272544142747, + "learning_rate": 8.644762801095307e-07, + "loss": 0.3433, + "step": 1451 + }, + { + "epoch": 0.26304347826086955, + "grad_norm": 4.626659024000853, + "learning_rate": 8.642753748363888e-07, + "loss": 0.2995, + "step": 1452 + }, + { + "epoch": 0.26322463768115945, + "grad_norm": 4.784240922151674, + "learning_rate": 8.640743441415094e-07, + "loss": 0.3231, + "step": 1453 + }, + { + "epoch": 0.2634057971014493, + "grad_norm": 3.2423440213722174, + "learning_rate": 8.638731880941082e-07, + "loss": 0.3152, + "step": 1454 + }, + { + "epoch": 0.26358695652173914, + "grad_norm": 6.95278933074151, + "learning_rate": 8.636719067634443e-07, + "loss": 0.3665, + "step": 1455 + }, + { + "epoch": 0.263768115942029, + "grad_norm": 9.038201053118094, + "learning_rate": 8.634705002188198e-07, + "loss": 0.3295, + "step": 1456 + }, + { + "epoch": 0.2639492753623188, + "grad_norm": 3.421819507641361, + "learning_rate": 8.6326896852958e-07, + "loss": 0.3324, + "step": 1457 + }, + { + "epoch": 0.26413043478260867, + "grad_norm": 5.2937695358451915, + "learning_rate": 8.63067311765113e-07, + "loss": 0.3452, + "step": 1458 + }, + { + "epoch": 0.26431159420289857, + "grad_norm": 8.100207003002685, + "learning_rate": 8.628655299948503e-07, + "loss": 0.3225, + "step": 1459 + }, + { + "epoch": 0.2644927536231884, + "grad_norm": 7.159499700433832, + "learning_rate": 8.626636232882664e-07, + "loss": 0.3453, + "step": 1460 + }, + { + "epoch": 0.26467391304347826, + "grad_norm": 9.163800296896483, + "learning_rate": 8.624615917148787e-07, + "loss": 0.4613, + "step": 1461 + }, + { + "epoch": 0.2648550724637681, + "grad_norm": 5.506078709741328, + "learning_rate": 8.622594353442474e-07, + "loss": 0.3197, + "step": 1462 + }, + { + "epoch": 0.26503623188405795, + "grad_norm": 3.6101260134565663, + "learning_rate": 8.620571542459762e-07, + "loss": 0.296, + "step": 1463 + }, + { + "epoch": 0.26521739130434785, + "grad_norm": 4.552938679140964, + "learning_rate": 8.618547484897114e-07, + "loss": 0.316, + "step": 1464 + }, + { + "epoch": 0.2653985507246377, + "grad_norm": 9.398824399031948, + "learning_rate": 8.616522181451422e-07, + "loss": 0.3369, + "step": 1465 + }, + { + "epoch": 0.26557971014492754, + "grad_norm": 7.186287709111952, + "learning_rate": 8.614495632820007e-07, + "loss": 0.3726, + "step": 1466 + }, + { + "epoch": 0.2657608695652174, + "grad_norm": 6.8184139732106654, + "learning_rate": 8.61246783970062e-07, + "loss": 0.2958, + "step": 1467 + }, + { + "epoch": 0.26594202898550723, + "grad_norm": 8.371268031885386, + "learning_rate": 8.61043880279144e-07, + "loss": 0.3099, + "step": 1468 + }, + { + "epoch": 0.26612318840579713, + "grad_norm": 4.732164096957703, + "learning_rate": 8.608408522791071e-07, + "loss": 0.3632, + "step": 1469 + }, + { + "epoch": 0.266304347826087, + "grad_norm": 4.443176508975629, + "learning_rate": 8.606377000398553e-07, + "loss": 0.374, + "step": 1470 + }, + { + "epoch": 0.2664855072463768, + "grad_norm": 3.681126511164227, + "learning_rate": 8.604344236313345e-07, + "loss": 0.3141, + "step": 1471 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 4.274297686313551, + "learning_rate": 8.602310231235342e-07, + "loss": 0.2937, + "step": 1472 + }, + { + "epoch": 0.2668478260869565, + "grad_norm": 4.3333896107038035, + "learning_rate": 8.600274985864855e-07, + "loss": 0.3642, + "step": 1473 + }, + { + "epoch": 0.26702898550724635, + "grad_norm": 4.320485459535167, + "learning_rate": 8.598238500902632e-07, + "loss": 0.346, + "step": 1474 + }, + { + "epoch": 0.26721014492753625, + "grad_norm": 4.591650893445888, + "learning_rate": 8.596200777049845e-07, + "loss": 0.26, + "step": 1475 + }, + { + "epoch": 0.2673913043478261, + "grad_norm": 4.558259956964872, + "learning_rate": 8.594161815008092e-07, + "loss": 0.368, + "step": 1476 + }, + { + "epoch": 0.26757246376811594, + "grad_norm": 5.87948073014778, + "learning_rate": 8.592121615479397e-07, + "loss": 0.3641, + "step": 1477 + }, + { + "epoch": 0.2677536231884058, + "grad_norm": 3.9598472438298113, + "learning_rate": 8.590080179166209e-07, + "loss": 0.326, + "step": 1478 + }, + { + "epoch": 0.26793478260869563, + "grad_norm": 3.5104250005481608, + "learning_rate": 8.588037506771404e-07, + "loss": 0.2586, + "step": 1479 + }, + { + "epoch": 0.26811594202898553, + "grad_norm": 7.862466418120342, + "learning_rate": 8.585993598998286e-07, + "loss": 0.2453, + "step": 1480 + }, + { + "epoch": 0.2682971014492754, + "grad_norm": 4.068702686785692, + "learning_rate": 8.583948456550583e-07, + "loss": 0.3664, + "step": 1481 + }, + { + "epoch": 0.2684782608695652, + "grad_norm": 3.0069357984330343, + "learning_rate": 8.581902080132442e-07, + "loss": 0.2846, + "step": 1482 + }, + { + "epoch": 0.26865942028985507, + "grad_norm": 10.261968846160247, + "learning_rate": 8.579854470448446e-07, + "loss": 0.3463, + "step": 1483 + }, + { + "epoch": 0.2688405797101449, + "grad_norm": 4.023365986941748, + "learning_rate": 8.577805628203592e-07, + "loss": 0.2749, + "step": 1484 + }, + { + "epoch": 0.26902173913043476, + "grad_norm": 6.422016011862551, + "learning_rate": 8.57575555410331e-07, + "loss": 0.3655, + "step": 1485 + }, + { + "epoch": 0.26920289855072466, + "grad_norm": 3.630918413404799, + "learning_rate": 8.573704248853447e-07, + "loss": 0.3401, + "step": 1486 + }, + { + "epoch": 0.2693840579710145, + "grad_norm": 6.10577633680755, + "learning_rate": 8.57165171316028e-07, + "loss": 0.3, + "step": 1487 + }, + { + "epoch": 0.26956521739130435, + "grad_norm": 2.8549903913483243, + "learning_rate": 8.569597947730505e-07, + "loss": 0.2357, + "step": 1488 + }, + { + "epoch": 0.2697463768115942, + "grad_norm": 5.195580665775834, + "learning_rate": 8.567542953271241e-07, + "loss": 0.3193, + "step": 1489 + }, + { + "epoch": 0.26992753623188404, + "grad_norm": 4.181596565359129, + "learning_rate": 8.565486730490037e-07, + "loss": 0.3762, + "step": 1490 + }, + { + "epoch": 0.27010869565217394, + "grad_norm": 3.913500666841408, + "learning_rate": 8.563429280094859e-07, + "loss": 0.3608, + "step": 1491 + }, + { + "epoch": 0.2702898550724638, + "grad_norm": 3.767915834614901, + "learning_rate": 8.561370602794095e-07, + "loss": 0.3436, + "step": 1492 + }, + { + "epoch": 0.2704710144927536, + "grad_norm": 3.9869985309541494, + "learning_rate": 8.559310699296558e-07, + "loss": 0.3287, + "step": 1493 + }, + { + "epoch": 0.27065217391304347, + "grad_norm": 4.448264441443902, + "learning_rate": 8.557249570311482e-07, + "loss": 0.3796, + "step": 1494 + }, + { + "epoch": 0.2708333333333333, + "grad_norm": 4.664663939809492, + "learning_rate": 8.555187216548528e-07, + "loss": 0.3147, + "step": 1495 + }, + { + "epoch": 0.2710144927536232, + "grad_norm": 3.873535547826977, + "learning_rate": 8.553123638717766e-07, + "loss": 0.3565, + "step": 1496 + }, + { + "epoch": 0.27119565217391306, + "grad_norm": 3.481105199991194, + "learning_rate": 8.551058837529702e-07, + "loss": 0.2922, + "step": 1497 + }, + { + "epoch": 0.2713768115942029, + "grad_norm": 3.618053192370526, + "learning_rate": 8.548992813695255e-07, + "loss": 0.3515, + "step": 1498 + }, + { + "epoch": 0.27155797101449275, + "grad_norm": 3.9211018799211153, + "learning_rate": 8.546925567925767e-07, + "loss": 0.3641, + "step": 1499 + }, + { + "epoch": 0.2717391304347826, + "grad_norm": 6.608175380270077, + "learning_rate": 8.544857100933e-07, + "loss": 0.3319, + "step": 1500 + }, + { + "epoch": 0.2717391304347826, + "eval_loss": 0.3291953206062317, + "eval_runtime": 9.8043, + "eval_samples_per_second": 50.998, + "eval_steps_per_second": 0.102, + "step": 1500 + }, + { + "epoch": 0.27192028985507244, + "grad_norm": 7.94603320231093, + "learning_rate": 8.542787413429138e-07, + "loss": 0.3598, + "step": 1501 + }, + { + "epoch": 0.27210144927536234, + "grad_norm": 5.200603219014856, + "learning_rate": 8.540716506126783e-07, + "loss": 0.3063, + "step": 1502 + }, + { + "epoch": 0.2722826086956522, + "grad_norm": 8.450787840873383, + "learning_rate": 8.538644379738958e-07, + "loss": 0.3376, + "step": 1503 + }, + { + "epoch": 0.27246376811594203, + "grad_norm": 4.554374448992296, + "learning_rate": 8.536571034979108e-07, + "loss": 0.3246, + "step": 1504 + }, + { + "epoch": 0.2726449275362319, + "grad_norm": 3.4097748206696927, + "learning_rate": 8.534496472561093e-07, + "loss": 0.3821, + "step": 1505 + }, + { + "epoch": 0.2728260869565217, + "grad_norm": 3.9630753217047574, + "learning_rate": 8.532420693199194e-07, + "loss": 0.3277, + "step": 1506 + }, + { + "epoch": 0.2730072463768116, + "grad_norm": 3.1081510117355036, + "learning_rate": 8.530343697608116e-07, + "loss": 0.337, + "step": 1507 + }, + { + "epoch": 0.27318840579710146, + "grad_norm": 4.175615263612925, + "learning_rate": 8.528265486502974e-07, + "loss": 0.3282, + "step": 1508 + }, + { + "epoch": 0.2733695652173913, + "grad_norm": 5.250703913777366, + "learning_rate": 8.52618606059931e-07, + "loss": 0.3531, + "step": 1509 + }, + { + "epoch": 0.27355072463768115, + "grad_norm": 3.339386059847261, + "learning_rate": 8.524105420613077e-07, + "loss": 0.3207, + "step": 1510 + }, + { + "epoch": 0.273731884057971, + "grad_norm": 3.600122407000914, + "learning_rate": 8.52202356726065e-07, + "loss": 0.2744, + "step": 1511 + }, + { + "epoch": 0.27391304347826084, + "grad_norm": 5.473886600729019, + "learning_rate": 8.51994050125882e-07, + "loss": 0.3938, + "step": 1512 + }, + { + "epoch": 0.27409420289855074, + "grad_norm": 5.937355561170181, + "learning_rate": 8.5178562233248e-07, + "loss": 0.3317, + "step": 1513 + }, + { + "epoch": 0.2742753623188406, + "grad_norm": 4.362263198446328, + "learning_rate": 8.515770734176211e-07, + "loss": 0.3368, + "step": 1514 + }, + { + "epoch": 0.27445652173913043, + "grad_norm": 3.8500946795265607, + "learning_rate": 8.513684034531104e-07, + "loss": 0.3235, + "step": 1515 + }, + { + "epoch": 0.2746376811594203, + "grad_norm": 11.298926079489457, + "learning_rate": 8.511596125107932e-07, + "loss": 0.3558, + "step": 1516 + }, + { + "epoch": 0.2748188405797101, + "grad_norm": 4.827466966068262, + "learning_rate": 8.509507006625578e-07, + "loss": 0.338, + "step": 1517 + }, + { + "epoch": 0.275, + "grad_norm": 6.087367129538768, + "learning_rate": 8.507416679803332e-07, + "loss": 0.3025, + "step": 1518 + }, + { + "epoch": 0.27518115942028987, + "grad_norm": 3.9381710819720124, + "learning_rate": 8.505325145360907e-07, + "loss": 0.3145, + "step": 1519 + }, + { + "epoch": 0.2753623188405797, + "grad_norm": 5.254791838868577, + "learning_rate": 8.503232404018423e-07, + "loss": 0.3689, + "step": 1520 + }, + { + "epoch": 0.27554347826086956, + "grad_norm": 3.7853871113110213, + "learning_rate": 8.501138456496426e-07, + "loss": 0.3392, + "step": 1521 + }, + { + "epoch": 0.2757246376811594, + "grad_norm": 6.789476258260733, + "learning_rate": 8.499043303515867e-07, + "loss": 0.303, + "step": 1522 + }, + { + "epoch": 0.2759057971014493, + "grad_norm": 7.540076645851566, + "learning_rate": 8.496946945798123e-07, + "loss": 0.2841, + "step": 1523 + }, + { + "epoch": 0.27608695652173915, + "grad_norm": 3.7575288732410073, + "learning_rate": 8.494849384064973e-07, + "loss": 0.3352, + "step": 1524 + }, + { + "epoch": 0.276268115942029, + "grad_norm": 3.852559693797952, + "learning_rate": 8.492750619038624e-07, + "loss": 0.3135, + "step": 1525 + }, + { + "epoch": 0.27644927536231884, + "grad_norm": 5.145294947713052, + "learning_rate": 8.490650651441688e-07, + "loss": 0.3342, + "step": 1526 + }, + { + "epoch": 0.2766304347826087, + "grad_norm": 4.2364768313691625, + "learning_rate": 8.488549481997191e-07, + "loss": 0.3107, + "step": 1527 + }, + { + "epoch": 0.2768115942028985, + "grad_norm": 6.861576809365711, + "learning_rate": 8.48644711142858e-07, + "loss": 0.3089, + "step": 1528 + }, + { + "epoch": 0.2769927536231884, + "grad_norm": 3.22061685004266, + "learning_rate": 8.484343540459711e-07, + "loss": 0.2904, + "step": 1529 + }, + { + "epoch": 0.27717391304347827, + "grad_norm": 5.229224561068187, + "learning_rate": 8.48223876981485e-07, + "loss": 0.2915, + "step": 1530 + }, + { + "epoch": 0.2773550724637681, + "grad_norm": 3.686300755968237, + "learning_rate": 8.480132800218681e-07, + "loss": 0.3597, + "step": 1531 + }, + { + "epoch": 0.27753623188405796, + "grad_norm": 5.085514079163521, + "learning_rate": 8.478025632396301e-07, + "loss": 0.3779, + "step": 1532 + }, + { + "epoch": 0.2777173913043478, + "grad_norm": 7.165670049387968, + "learning_rate": 8.475917267073215e-07, + "loss": 0.3167, + "step": 1533 + }, + { + "epoch": 0.2778985507246377, + "grad_norm": 5.1829820708715415, + "learning_rate": 8.473807704975346e-07, + "loss": 0.3327, + "step": 1534 + }, + { + "epoch": 0.27807971014492755, + "grad_norm": 3.88074841558038, + "learning_rate": 8.471696946829024e-07, + "loss": 0.3427, + "step": 1535 + }, + { + "epoch": 0.2782608695652174, + "grad_norm": 5.907455533941769, + "learning_rate": 8.469584993360994e-07, + "loss": 0.3848, + "step": 1536 + }, + { + "epoch": 0.27844202898550724, + "grad_norm": 7.617246279236586, + "learning_rate": 8.467471845298413e-07, + "loss": 0.3052, + "step": 1537 + }, + { + "epoch": 0.2786231884057971, + "grad_norm": 5.092561470231212, + "learning_rate": 8.465357503368845e-07, + "loss": 0.322, + "step": 1538 + }, + { + "epoch": 0.27880434782608693, + "grad_norm": 6.562172416925117, + "learning_rate": 8.46324196830027e-07, + "loss": 0.3582, + "step": 1539 + }, + { + "epoch": 0.27898550724637683, + "grad_norm": 4.394735908966694, + "learning_rate": 8.461125240821076e-07, + "loss": 0.3876, + "step": 1540 + }, + { + "epoch": 0.2791666666666667, + "grad_norm": 12.079006453677533, + "learning_rate": 8.459007321660061e-07, + "loss": 0.364, + "step": 1541 + }, + { + "epoch": 0.2793478260869565, + "grad_norm": 9.01054595163063, + "learning_rate": 8.456888211546438e-07, + "loss": 0.3115, + "step": 1542 + }, + { + "epoch": 0.27952898550724636, + "grad_norm": 3.529579067834169, + "learning_rate": 8.454767911209824e-07, + "loss": 0.3229, + "step": 1543 + }, + { + "epoch": 0.2797101449275362, + "grad_norm": 3.949927686450316, + "learning_rate": 8.452646421380249e-07, + "loss": 0.3369, + "step": 1544 + }, + { + "epoch": 0.2798913043478261, + "grad_norm": 4.419598461268908, + "learning_rate": 8.450523742788153e-07, + "loss": 0.3423, + "step": 1545 + }, + { + "epoch": 0.28007246376811595, + "grad_norm": 3.424894427169211, + "learning_rate": 8.448399876164382e-07, + "loss": 0.2834, + "step": 1546 + }, + { + "epoch": 0.2802536231884058, + "grad_norm": 7.784078738924731, + "learning_rate": 8.446274822240196e-07, + "loss": 0.342, + "step": 1547 + }, + { + "epoch": 0.28043478260869564, + "grad_norm": 3.4297566168299993, + "learning_rate": 8.444148581747259e-07, + "loss": 0.3533, + "step": 1548 + }, + { + "epoch": 0.2806159420289855, + "grad_norm": 3.4617069123342343, + "learning_rate": 8.442021155417647e-07, + "loss": 0.2811, + "step": 1549 + }, + { + "epoch": 0.2807971014492754, + "grad_norm": 3.6861467071640037, + "learning_rate": 8.439892543983844e-07, + "loss": 0.3497, + "step": 1550 + }, + { + "epoch": 0.28097826086956523, + "grad_norm": 3.202756169770153, + "learning_rate": 8.437762748178738e-07, + "loss": 0.2899, + "step": 1551 + }, + { + "epoch": 0.2811594202898551, + "grad_norm": 3.436221346362621, + "learning_rate": 8.43563176873563e-07, + "loss": 0.347, + "step": 1552 + }, + { + "epoch": 0.2813405797101449, + "grad_norm": 4.833697246701201, + "learning_rate": 8.433499606388224e-07, + "loss": 0.3221, + "step": 1553 + }, + { + "epoch": 0.28152173913043477, + "grad_norm": 4.719470145881154, + "learning_rate": 8.431366261870637e-07, + "loss": 0.3654, + "step": 1554 + }, + { + "epoch": 0.2817028985507246, + "grad_norm": 4.001786646781353, + "learning_rate": 8.429231735917387e-07, + "loss": 0.3574, + "step": 1555 + }, + { + "epoch": 0.2818840579710145, + "grad_norm": 4.539308156263115, + "learning_rate": 8.427096029263403e-07, + "loss": 0.3565, + "step": 1556 + }, + { + "epoch": 0.28206521739130436, + "grad_norm": 5.301696453308421, + "learning_rate": 8.424959142644017e-07, + "loss": 0.2933, + "step": 1557 + }, + { + "epoch": 0.2822463768115942, + "grad_norm": 8.737154332858287, + "learning_rate": 8.422821076794971e-07, + "loss": 0.3705, + "step": 1558 + }, + { + "epoch": 0.28242753623188405, + "grad_norm": 9.03406094631612, + "learning_rate": 8.420681832452411e-07, + "loss": 0.3431, + "step": 1559 + }, + { + "epoch": 0.2826086956521739, + "grad_norm": 3.494253860898528, + "learning_rate": 8.418541410352888e-07, + "loss": 0.3123, + "step": 1560 + }, + { + "epoch": 0.2827898550724638, + "grad_norm": 4.224727291054771, + "learning_rate": 8.416399811233361e-07, + "loss": 0.248, + "step": 1561 + }, + { + "epoch": 0.28297101449275364, + "grad_norm": 4.412263190343274, + "learning_rate": 8.41425703583119e-07, + "loss": 0.3516, + "step": 1562 + }, + { + "epoch": 0.2831521739130435, + "grad_norm": 5.191574023375802, + "learning_rate": 8.412113084884146e-07, + "loss": 0.3244, + "step": 1563 + }, + { + "epoch": 0.2833333333333333, + "grad_norm": 5.575472293051576, + "learning_rate": 8.4099679591304e-07, + "loss": 0.3617, + "step": 1564 + }, + { + "epoch": 0.28351449275362317, + "grad_norm": 4.455283192473403, + "learning_rate": 8.407821659308528e-07, + "loss": 0.3508, + "step": 1565 + }, + { + "epoch": 0.28369565217391307, + "grad_norm": 4.303689466574255, + "learning_rate": 8.405674186157511e-07, + "loss": 0.3636, + "step": 1566 + }, + { + "epoch": 0.2838768115942029, + "grad_norm": 4.309155995585994, + "learning_rate": 8.403525540416738e-07, + "loss": 0.3191, + "step": 1567 + }, + { + "epoch": 0.28405797101449276, + "grad_norm": 3.152108543112069, + "learning_rate": 8.401375722825995e-07, + "loss": 0.2996, + "step": 1568 + }, + { + "epoch": 0.2842391304347826, + "grad_norm": 5.603000767800198, + "learning_rate": 8.399224734125473e-07, + "loss": 0.2968, + "step": 1569 + }, + { + "epoch": 0.28442028985507245, + "grad_norm": 3.958979161088896, + "learning_rate": 8.397072575055771e-07, + "loss": 0.3631, + "step": 1570 + }, + { + "epoch": 0.2846014492753623, + "grad_norm": 6.141595615360145, + "learning_rate": 8.394919246357883e-07, + "loss": 0.3457, + "step": 1571 + }, + { + "epoch": 0.2847826086956522, + "grad_norm": 4.820277840732952, + "learning_rate": 8.392764748773214e-07, + "loss": 0.3448, + "step": 1572 + }, + { + "epoch": 0.28496376811594204, + "grad_norm": 3.9357781528625426, + "learning_rate": 8.390609083043568e-07, + "loss": 0.2584, + "step": 1573 + }, + { + "epoch": 0.2851449275362319, + "grad_norm": 3.5161519053212777, + "learning_rate": 8.388452249911149e-07, + "loss": 0.3204, + "step": 1574 + }, + { + "epoch": 0.28532608695652173, + "grad_norm": 7.282372463882652, + "learning_rate": 8.386294250118565e-07, + "loss": 0.325, + "step": 1575 + }, + { + "epoch": 0.2855072463768116, + "grad_norm": 5.918162653341962, + "learning_rate": 8.384135084408826e-07, + "loss": 0.3262, + "step": 1576 + }, + { + "epoch": 0.2856884057971015, + "grad_norm": 6.923760556303854, + "learning_rate": 8.381974753525345e-07, + "loss": 0.2956, + "step": 1577 + }, + { + "epoch": 0.2858695652173913, + "grad_norm": 7.543933793802687, + "learning_rate": 8.379813258211929e-07, + "loss": 0.3173, + "step": 1578 + }, + { + "epoch": 0.28605072463768116, + "grad_norm": 4.583192971574929, + "learning_rate": 8.377650599212798e-07, + "loss": 0.35, + "step": 1579 + }, + { + "epoch": 0.286231884057971, + "grad_norm": 3.673374292807425, + "learning_rate": 8.37548677727256e-07, + "loss": 0.3445, + "step": 1580 + }, + { + "epoch": 0.28641304347826085, + "grad_norm": 3.5752070642664178, + "learning_rate": 8.373321793136232e-07, + "loss": 0.2583, + "step": 1581 + }, + { + "epoch": 0.2865942028985507, + "grad_norm": 9.241637813586905, + "learning_rate": 8.371155647549226e-07, + "loss": 0.335, + "step": 1582 + }, + { + "epoch": 0.2867753623188406, + "grad_norm": 8.44960724751869, + "learning_rate": 8.368988341257359e-07, + "loss": 0.3398, + "step": 1583 + }, + { + "epoch": 0.28695652173913044, + "grad_norm": 6.7555221206061535, + "learning_rate": 8.366819875006843e-07, + "loss": 0.2912, + "step": 1584 + }, + { + "epoch": 0.2871376811594203, + "grad_norm": 5.143185181351389, + "learning_rate": 8.364650249544291e-07, + "loss": 0.2969, + "step": 1585 + }, + { + "epoch": 0.28731884057971013, + "grad_norm": 3.6029792463124086, + "learning_rate": 8.362479465616717e-07, + "loss": 0.3134, + "step": 1586 + }, + { + "epoch": 0.2875, + "grad_norm": 3.354229431447784, + "learning_rate": 8.360307523971532e-07, + "loss": 0.2607, + "step": 1587 + }, + { + "epoch": 0.2876811594202899, + "grad_norm": 4.48795155613157, + "learning_rate": 8.358134425356543e-07, + "loss": 0.3406, + "step": 1588 + }, + { + "epoch": 0.2878623188405797, + "grad_norm": 3.8339057208163276, + "learning_rate": 8.355960170519962e-07, + "loss": 0.3412, + "step": 1589 + }, + { + "epoch": 0.28804347826086957, + "grad_norm": 4.661123094207706, + "learning_rate": 8.353784760210392e-07, + "loss": 0.3792, + "step": 1590 + }, + { + "epoch": 0.2882246376811594, + "grad_norm": 12.802215822779953, + "learning_rate": 8.351608195176839e-07, + "loss": 0.3747, + "step": 1591 + }, + { + "epoch": 0.28840579710144926, + "grad_norm": 8.422644914479166, + "learning_rate": 8.349430476168704e-07, + "loss": 0.431, + "step": 1592 + }, + { + "epoch": 0.28858695652173916, + "grad_norm": 3.8533377070126567, + "learning_rate": 8.347251603935788e-07, + "loss": 0.3596, + "step": 1593 + }, + { + "epoch": 0.288768115942029, + "grad_norm": 3.8344946606723975, + "learning_rate": 8.345071579228282e-07, + "loss": 0.3595, + "step": 1594 + }, + { + "epoch": 0.28894927536231885, + "grad_norm": 5.176216822170738, + "learning_rate": 8.342890402796783e-07, + "loss": 0.3087, + "step": 1595 + }, + { + "epoch": 0.2891304347826087, + "grad_norm": 4.634830361021857, + "learning_rate": 8.340708075392281e-07, + "loss": 0.2932, + "step": 1596 + }, + { + "epoch": 0.28931159420289854, + "grad_norm": 2.980558253197547, + "learning_rate": 8.338524597766159e-07, + "loss": 0.2413, + "step": 1597 + }, + { + "epoch": 0.2894927536231884, + "grad_norm": 3.331210909978132, + "learning_rate": 8.336339970670198e-07, + "loss": 0.3071, + "step": 1598 + }, + { + "epoch": 0.2896739130434783, + "grad_norm": 5.5771896280633, + "learning_rate": 8.33415419485658e-07, + "loss": 0.3151, + "step": 1599 + }, + { + "epoch": 0.2898550724637681, + "grad_norm": 4.64277194305046, + "learning_rate": 8.331967271077874e-07, + "loss": 0.3495, + "step": 1600 + }, + { + "epoch": 0.2898550724637681, + "eval_loss": 0.3165625035762787, + "eval_runtime": 9.7974, + "eval_samples_per_second": 51.034, + "eval_steps_per_second": 0.102, + "step": 1600 + }, + { + "epoch": 0.29003623188405797, + "grad_norm": 6.378395618638732, + "learning_rate": 8.32977920008705e-07, + "loss": 0.3236, + "step": 1601 + }, + { + "epoch": 0.2902173913043478, + "grad_norm": 3.305310306727567, + "learning_rate": 8.327589982637469e-07, + "loss": 0.3272, + "step": 1602 + }, + { + "epoch": 0.29039855072463766, + "grad_norm": 3.2262682005236267, + "learning_rate": 8.325399619482892e-07, + "loss": 0.317, + "step": 1603 + }, + { + "epoch": 0.29057971014492756, + "grad_norm": 4.148213141193882, + "learning_rate": 8.32320811137747e-07, + "loss": 0.3322, + "step": 1604 + }, + { + "epoch": 0.2907608695652174, + "grad_norm": 10.652562906333356, + "learning_rate": 8.321015459075749e-07, + "loss": 0.3375, + "step": 1605 + }, + { + "epoch": 0.29094202898550725, + "grad_norm": 5.3521498911331715, + "learning_rate": 8.318821663332669e-07, + "loss": 0.3535, + "step": 1606 + }, + { + "epoch": 0.2911231884057971, + "grad_norm": 7.128114754485076, + "learning_rate": 8.316626724903567e-07, + "loss": 0.3358, + "step": 1607 + }, + { + "epoch": 0.29130434782608694, + "grad_norm": 7.274594250395883, + "learning_rate": 8.314430644544169e-07, + "loss": 0.3214, + "step": 1608 + }, + { + "epoch": 0.2914855072463768, + "grad_norm": 3.4403681115590103, + "learning_rate": 8.312233423010595e-07, + "loss": 0.3372, + "step": 1609 + }, + { + "epoch": 0.2916666666666667, + "grad_norm": 6.147402016937184, + "learning_rate": 8.310035061059362e-07, + "loss": 0.2874, + "step": 1610 + }, + { + "epoch": 0.29184782608695653, + "grad_norm": 7.155103900182853, + "learning_rate": 8.307835559447371e-07, + "loss": 0.3665, + "step": 1611 + }, + { + "epoch": 0.2920289855072464, + "grad_norm": 7.400319149805051, + "learning_rate": 8.305634918931927e-07, + "loss": 0.3191, + "step": 1612 + }, + { + "epoch": 0.2922101449275362, + "grad_norm": 3.997353071823464, + "learning_rate": 8.303433140270717e-07, + "loss": 0.3461, + "step": 1613 + }, + { + "epoch": 0.29239130434782606, + "grad_norm": 9.75431790097079, + "learning_rate": 8.301230224221825e-07, + "loss": 0.2663, + "step": 1614 + }, + { + "epoch": 0.29257246376811596, + "grad_norm": 3.448463681822264, + "learning_rate": 8.299026171543723e-07, + "loss": 0.3371, + "step": 1615 + }, + { + "epoch": 0.2927536231884058, + "grad_norm": 10.201464537825748, + "learning_rate": 8.29682098299528e-07, + "loss": 0.3842, + "step": 1616 + }, + { + "epoch": 0.29293478260869565, + "grad_norm": 5.533583721142776, + "learning_rate": 8.294614659335754e-07, + "loss": 0.3392, + "step": 1617 + }, + { + "epoch": 0.2931159420289855, + "grad_norm": 7.100245978645747, + "learning_rate": 8.29240720132479e-07, + "loss": 0.4021, + "step": 1618 + }, + { + "epoch": 0.29329710144927534, + "grad_norm": 3.29047254076525, + "learning_rate": 8.290198609722425e-07, + "loss": 0.3055, + "step": 1619 + }, + { + "epoch": 0.29347826086956524, + "grad_norm": 12.697452764155445, + "learning_rate": 8.28798888528909e-07, + "loss": 0.2784, + "step": 1620 + }, + { + "epoch": 0.2936594202898551, + "grad_norm": 8.577476673394132, + "learning_rate": 8.285778028785604e-07, + "loss": 0.2752, + "step": 1621 + }, + { + "epoch": 0.29384057971014493, + "grad_norm": 8.891110236058598, + "learning_rate": 8.283566040973173e-07, + "loss": 0.3295, + "step": 1622 + }, + { + "epoch": 0.2940217391304348, + "grad_norm": 6.278714573875438, + "learning_rate": 8.281352922613397e-07, + "loss": 0.3491, + "step": 1623 + }, + { + "epoch": 0.2942028985507246, + "grad_norm": 3.6375562155142918, + "learning_rate": 8.279138674468263e-07, + "loss": 0.3343, + "step": 1624 + }, + { + "epoch": 0.29438405797101447, + "grad_norm": 4.537387375519442, + "learning_rate": 8.276923297300146e-07, + "loss": 0.3178, + "step": 1625 + }, + { + "epoch": 0.29456521739130437, + "grad_norm": 11.942034259448368, + "learning_rate": 8.27470679187181e-07, + "loss": 0.39, + "step": 1626 + }, + { + "epoch": 0.2947463768115942, + "grad_norm": 14.695645928480674, + "learning_rate": 8.272489158946412e-07, + "loss": 0.3695, + "step": 1627 + }, + { + "epoch": 0.29492753623188406, + "grad_norm": 11.078467592004337, + "learning_rate": 8.27027039928749e-07, + "loss": 0.3189, + "step": 1628 + }, + { + "epoch": 0.2951086956521739, + "grad_norm": 11.260718473857697, + "learning_rate": 8.268050513658976e-07, + "loss": 0.3262, + "step": 1629 + }, + { + "epoch": 0.29528985507246375, + "grad_norm": 4.7968953279817015, + "learning_rate": 8.265829502825182e-07, + "loss": 0.3589, + "step": 1630 + }, + { + "epoch": 0.29547101449275365, + "grad_norm": 8.480942280628483, + "learning_rate": 8.26360736755082e-07, + "loss": 0.3444, + "step": 1631 + }, + { + "epoch": 0.2956521739130435, + "grad_norm": 4.599040676192968, + "learning_rate": 8.261384108600977e-07, + "loss": 0.3277, + "step": 1632 + }, + { + "epoch": 0.29583333333333334, + "grad_norm": 5.803326829663677, + "learning_rate": 8.259159726741132e-07, + "loss": 0.2633, + "step": 1633 + }, + { + "epoch": 0.2960144927536232, + "grad_norm": 3.944907576685592, + "learning_rate": 8.25693422273715e-07, + "loss": 0.2755, + "step": 1634 + }, + { + "epoch": 0.296195652173913, + "grad_norm": 7.6415934521684195, + "learning_rate": 8.254707597355286e-07, + "loss": 0.3148, + "step": 1635 + }, + { + "epoch": 0.29637681159420287, + "grad_norm": 7.1530593049049855, + "learning_rate": 8.252479851362176e-07, + "loss": 0.3329, + "step": 1636 + }, + { + "epoch": 0.29655797101449277, + "grad_norm": 7.3869089157210315, + "learning_rate": 8.250250985524839e-07, + "loss": 0.3277, + "step": 1637 + }, + { + "epoch": 0.2967391304347826, + "grad_norm": 3.9304977458400305, + "learning_rate": 8.24802100061069e-07, + "loss": 0.3284, + "step": 1638 + }, + { + "epoch": 0.29692028985507246, + "grad_norm": 4.319350273583407, + "learning_rate": 8.245789897387521e-07, + "loss": 0.2677, + "step": 1639 + }, + { + "epoch": 0.2971014492753623, + "grad_norm": 5.486958289995254, + "learning_rate": 8.243557676623509e-07, + "loss": 0.3007, + "step": 1640 + }, + { + "epoch": 0.29728260869565215, + "grad_norm": 4.093664414283139, + "learning_rate": 8.241324339087224e-07, + "loss": 0.2998, + "step": 1641 + }, + { + "epoch": 0.29746376811594205, + "grad_norm": 4.432355806367156, + "learning_rate": 8.239089885547608e-07, + "loss": 0.2794, + "step": 1642 + }, + { + "epoch": 0.2976449275362319, + "grad_norm": 4.1344808743219925, + "learning_rate": 8.236854316774e-07, + "loss": 0.3036, + "step": 1643 + }, + { + "epoch": 0.29782608695652174, + "grad_norm": 6.009139098621791, + "learning_rate": 8.234617633536113e-07, + "loss": 0.3314, + "step": 1644 + }, + { + "epoch": 0.2980072463768116, + "grad_norm": 7.239377871455703, + "learning_rate": 8.232379836604048e-07, + "loss": 0.4076, + "step": 1645 + }, + { + "epoch": 0.29818840579710143, + "grad_norm": 4.582783674139785, + "learning_rate": 8.230140926748291e-07, + "loss": 0.3508, + "step": 1646 + }, + { + "epoch": 0.29836956521739133, + "grad_norm": 7.506638240825678, + "learning_rate": 8.227900904739709e-07, + "loss": 0.3915, + "step": 1647 + }, + { + "epoch": 0.2985507246376812, + "grad_norm": 10.230292626773988, + "learning_rate": 8.225659771349551e-07, + "loss": 0.2514, + "step": 1648 + }, + { + "epoch": 0.298731884057971, + "grad_norm": 4.597719450326029, + "learning_rate": 8.22341752734945e-07, + "loss": 0.3474, + "step": 1649 + }, + { + "epoch": 0.29891304347826086, + "grad_norm": 7.992863045230097, + "learning_rate": 8.221174173511421e-07, + "loss": 0.3326, + "step": 1650 + }, + { + "epoch": 0.2990942028985507, + "grad_norm": 3.6998288338167984, + "learning_rate": 8.218929710607863e-07, + "loss": 0.3412, + "step": 1651 + }, + { + "epoch": 0.29927536231884055, + "grad_norm": 9.78677207193033, + "learning_rate": 8.216684139411551e-07, + "loss": 0.3571, + "step": 1652 + }, + { + "epoch": 0.29945652173913045, + "grad_norm": 4.754946772389717, + "learning_rate": 8.214437460695651e-07, + "loss": 0.3102, + "step": 1653 + }, + { + "epoch": 0.2996376811594203, + "grad_norm": 3.6399284203865063, + "learning_rate": 8.212189675233703e-07, + "loss": 0.3074, + "step": 1654 + }, + { + "epoch": 0.29981884057971014, + "grad_norm": 7.290067679634445, + "learning_rate": 8.209940783799632e-07, + "loss": 0.3187, + "step": 1655 + }, + { + "epoch": 0.3, + "grad_norm": 4.814851215421622, + "learning_rate": 8.207690787167736e-07, + "loss": 0.2526, + "step": 1656 + }, + { + "epoch": 0.30018115942028983, + "grad_norm": 4.5703032648245925, + "learning_rate": 8.205439686112706e-07, + "loss": 0.3178, + "step": 1657 + }, + { + "epoch": 0.30036231884057973, + "grad_norm": 3.483001123055395, + "learning_rate": 8.203187481409604e-07, + "loss": 0.3109, + "step": 1658 + }, + { + "epoch": 0.3005434782608696, + "grad_norm": 6.539740426528362, + "learning_rate": 8.200934173833876e-07, + "loss": 0.3038, + "step": 1659 + }, + { + "epoch": 0.3007246376811594, + "grad_norm": 4.035415377972808, + "learning_rate": 8.198679764161344e-07, + "loss": 0.2564, + "step": 1660 + }, + { + "epoch": 0.30090579710144927, + "grad_norm": 11.928054378232504, + "learning_rate": 8.196424253168215e-07, + "loss": 0.2992, + "step": 1661 + }, + { + "epoch": 0.3010869565217391, + "grad_norm": 4.761747541137394, + "learning_rate": 8.19416764163107e-07, + "loss": 0.2985, + "step": 1662 + }, + { + "epoch": 0.30126811594202896, + "grad_norm": 7.882641673254881, + "learning_rate": 8.191909930326873e-07, + "loss": 0.2681, + "step": 1663 + }, + { + "epoch": 0.30144927536231886, + "grad_norm": 5.16498571414151, + "learning_rate": 8.189651120032961e-07, + "loss": 0.2982, + "step": 1664 + }, + { + "epoch": 0.3016304347826087, + "grad_norm": 3.791597801034975, + "learning_rate": 8.187391211527057e-07, + "loss": 0.2672, + "step": 1665 + }, + { + "epoch": 0.30181159420289855, + "grad_norm": 4.391914652669149, + "learning_rate": 8.185130205587256e-07, + "loss": 0.3719, + "step": 1666 + }, + { + "epoch": 0.3019927536231884, + "grad_norm": 3.825751565978178, + "learning_rate": 8.182868102992034e-07, + "loss": 0.3201, + "step": 1667 + }, + { + "epoch": 0.30217391304347824, + "grad_norm": 3.8365459038452423, + "learning_rate": 8.180604904520243e-07, + "loss": 0.3198, + "step": 1668 + }, + { + "epoch": 0.30235507246376814, + "grad_norm": 9.954952894096023, + "learning_rate": 8.178340610951113e-07, + "loss": 0.3356, + "step": 1669 + }, + { + "epoch": 0.302536231884058, + "grad_norm": 4.237608552840032, + "learning_rate": 8.176075223064254e-07, + "loss": 0.3057, + "step": 1670 + }, + { + "epoch": 0.3027173913043478, + "grad_norm": 4.937242080309297, + "learning_rate": 8.173808741639645e-07, + "loss": 0.3831, + "step": 1671 + }, + { + "epoch": 0.30289855072463767, + "grad_norm": 8.638907774574463, + "learning_rate": 8.171541167457648e-07, + "loss": 0.3585, + "step": 1672 + }, + { + "epoch": 0.3030797101449275, + "grad_norm": 4.9368554415733295, + "learning_rate": 8.169272501299e-07, + "loss": 0.2738, + "step": 1673 + }, + { + "epoch": 0.3032608695652174, + "grad_norm": 3.105638821588064, + "learning_rate": 8.167002743944815e-07, + "loss": 0.3069, + "step": 1674 + }, + { + "epoch": 0.30344202898550726, + "grad_norm": 6.184566997979668, + "learning_rate": 8.16473189617658e-07, + "loss": 0.3126, + "step": 1675 + }, + { + "epoch": 0.3036231884057971, + "grad_norm": 8.577425247463461, + "learning_rate": 8.162459958776157e-07, + "loss": 0.3297, + "step": 1676 + }, + { + "epoch": 0.30380434782608695, + "grad_norm": 6.075670646524077, + "learning_rate": 8.160186932525786e-07, + "loss": 0.2979, + "step": 1677 + }, + { + "epoch": 0.3039855072463768, + "grad_norm": 8.17065404475752, + "learning_rate": 8.157912818208082e-07, + "loss": 0.2567, + "step": 1678 + }, + { + "epoch": 0.30416666666666664, + "grad_norm": 12.172849250228765, + "learning_rate": 8.15563761660603e-07, + "loss": 0.3384, + "step": 1679 + }, + { + "epoch": 0.30434782608695654, + "grad_norm": 11.514993580298441, + "learning_rate": 8.153361328502997e-07, + "loss": 0.4114, + "step": 1680 + }, + { + "epoch": 0.3045289855072464, + "grad_norm": 4.422004935489591, + "learning_rate": 8.151083954682716e-07, + "loss": 0.3861, + "step": 1681 + }, + { + "epoch": 0.30471014492753623, + "grad_norm": 3.0950820256059632, + "learning_rate": 8.148805495929301e-07, + "loss": 0.3081, + "step": 1682 + }, + { + "epoch": 0.3048913043478261, + "grad_norm": 4.2933449575939955, + "learning_rate": 8.146525953027234e-07, + "loss": 0.3385, + "step": 1683 + }, + { + "epoch": 0.3050724637681159, + "grad_norm": 6.563174574341766, + "learning_rate": 8.144245326761372e-07, + "loss": 0.3494, + "step": 1684 + }, + { + "epoch": 0.3052536231884058, + "grad_norm": 14.968938020714559, + "learning_rate": 8.141963617916947e-07, + "loss": 0.3132, + "step": 1685 + }, + { + "epoch": 0.30543478260869567, + "grad_norm": 8.119941435405739, + "learning_rate": 8.139680827279561e-07, + "loss": 0.302, + "step": 1686 + }, + { + "epoch": 0.3056159420289855, + "grad_norm": 7.514167445732151, + "learning_rate": 8.13739695563519e-07, + "loss": 0.2958, + "step": 1687 + }, + { + "epoch": 0.30579710144927535, + "grad_norm": 4.8600159321965455, + "learning_rate": 8.135112003770183e-07, + "loss": 0.3024, + "step": 1688 + }, + { + "epoch": 0.3059782608695652, + "grad_norm": 4.362134270632265, + "learning_rate": 8.132825972471257e-07, + "loss": 0.3058, + "step": 1689 + }, + { + "epoch": 0.3061594202898551, + "grad_norm": 11.88542025847849, + "learning_rate": 8.130538862525507e-07, + "loss": 0.3456, + "step": 1690 + }, + { + "epoch": 0.30634057971014494, + "grad_norm": 7.380431182586247, + "learning_rate": 8.128250674720391e-07, + "loss": 0.3011, + "step": 1691 + }, + { + "epoch": 0.3065217391304348, + "grad_norm": 4.484294932241263, + "learning_rate": 8.125961409843747e-07, + "loss": 0.3658, + "step": 1692 + }, + { + "epoch": 0.30670289855072463, + "grad_norm": 3.320116234268537, + "learning_rate": 8.123671068683779e-07, + "loss": 0.3053, + "step": 1693 + }, + { + "epoch": 0.3068840579710145, + "grad_norm": 6.242270244530482, + "learning_rate": 8.121379652029062e-07, + "loss": 0.289, + "step": 1694 + }, + { + "epoch": 0.3070652173913043, + "grad_norm": 8.685104064895292, + "learning_rate": 8.119087160668541e-07, + "loss": 0.3122, + "step": 1695 + }, + { + "epoch": 0.3072463768115942, + "grad_norm": 5.625449806615163, + "learning_rate": 8.116793595391531e-07, + "loss": 0.3004, + "step": 1696 + }, + { + "epoch": 0.30742753623188407, + "grad_norm": 4.83672194009164, + "learning_rate": 8.11449895698772e-07, + "loss": 0.3246, + "step": 1697 + }, + { + "epoch": 0.3076086956521739, + "grad_norm": 4.951062129123521, + "learning_rate": 8.11220324624716e-07, + "loss": 0.3808, + "step": 1698 + }, + { + "epoch": 0.30778985507246376, + "grad_norm": 7.803992820412519, + "learning_rate": 8.109906463960278e-07, + "loss": 0.3244, + "step": 1699 + }, + { + "epoch": 0.3079710144927536, + "grad_norm": 6.018782723368235, + "learning_rate": 8.107608610917864e-07, + "loss": 0.4355, + "step": 1700 + }, + { + "epoch": 0.3079710144927536, + "eval_loss": 0.3173125088214874, + "eval_runtime": 9.8026, + "eval_samples_per_second": 51.007, + "eval_steps_per_second": 0.102, + "step": 1700 + }, + { + "epoch": 0.3081521739130435, + "grad_norm": 5.492281426920849, + "learning_rate": 8.105309687911081e-07, + "loss": 0.3196, + "step": 1701 + }, + { + "epoch": 0.30833333333333335, + "grad_norm": 5.719478406761022, + "learning_rate": 8.10300969573146e-07, + "loss": 0.3062, + "step": 1702 + }, + { + "epoch": 0.3085144927536232, + "grad_norm": 7.930820493815263, + "learning_rate": 8.100708635170899e-07, + "loss": 0.282, + "step": 1703 + }, + { + "epoch": 0.30869565217391304, + "grad_norm": 11.007865923193712, + "learning_rate": 8.098406507021662e-07, + "loss": 0.3151, + "step": 1704 + }, + { + "epoch": 0.3088768115942029, + "grad_norm": 10.859344678807489, + "learning_rate": 8.096103312076385e-07, + "loss": 0.3986, + "step": 1705 + }, + { + "epoch": 0.3090579710144927, + "grad_norm": 3.776643484700144, + "learning_rate": 8.093799051128068e-07, + "loss": 0.3228, + "step": 1706 + }, + { + "epoch": 0.3092391304347826, + "grad_norm": 5.930200032897158, + "learning_rate": 8.091493724970078e-07, + "loss": 0.3097, + "step": 1707 + }, + { + "epoch": 0.3094202898550725, + "grad_norm": 7.995811284308179, + "learning_rate": 8.089187334396152e-07, + "loss": 0.2922, + "step": 1708 + }, + { + "epoch": 0.3096014492753623, + "grad_norm": 9.038408354147903, + "learning_rate": 8.086879880200389e-07, + "loss": 0.321, + "step": 1709 + }, + { + "epoch": 0.30978260869565216, + "grad_norm": 5.328408142403583, + "learning_rate": 8.084571363177257e-07, + "loss": 0.319, + "step": 1710 + }, + { + "epoch": 0.309963768115942, + "grad_norm": 3.5597320281458615, + "learning_rate": 8.082261784121591e-07, + "loss": 0.2508, + "step": 1711 + }, + { + "epoch": 0.3101449275362319, + "grad_norm": 6.336031122568708, + "learning_rate": 8.079951143828587e-07, + "loss": 0.2932, + "step": 1712 + }, + { + "epoch": 0.31032608695652175, + "grad_norm": 4.919364404287858, + "learning_rate": 8.077639443093813e-07, + "loss": 0.2613, + "step": 1713 + }, + { + "epoch": 0.3105072463768116, + "grad_norm": 4.1022156965988765, + "learning_rate": 8.075326682713195e-07, + "loss": 0.3666, + "step": 1714 + }, + { + "epoch": 0.31068840579710144, + "grad_norm": 3.8694318989791263, + "learning_rate": 8.07301286348303e-07, + "loss": 0.3367, + "step": 1715 + }, + { + "epoch": 0.3108695652173913, + "grad_norm": 7.122481586009249, + "learning_rate": 8.070697986199975e-07, + "loss": 0.3001, + "step": 1716 + }, + { + "epoch": 0.3110507246376812, + "grad_norm": 3.3830720501581975, + "learning_rate": 8.068382051661054e-07, + "loss": 0.3013, + "step": 1717 + }, + { + "epoch": 0.31123188405797103, + "grad_norm": 10.629590540589108, + "learning_rate": 8.066065060663655e-07, + "loss": 0.3384, + "step": 1718 + }, + { + "epoch": 0.3114130434782609, + "grad_norm": 3.894935965033015, + "learning_rate": 8.063747014005528e-07, + "loss": 0.3307, + "step": 1719 + }, + { + "epoch": 0.3115942028985507, + "grad_norm": 8.494647196669545, + "learning_rate": 8.061427912484787e-07, + "loss": 0.3248, + "step": 1720 + }, + { + "epoch": 0.31177536231884057, + "grad_norm": 7.12551551259516, + "learning_rate": 8.059107756899912e-07, + "loss": 0.3502, + "step": 1721 + }, + { + "epoch": 0.3119565217391304, + "grad_norm": 5.32475208946155, + "learning_rate": 8.056786548049741e-07, + "loss": 0.2877, + "step": 1722 + }, + { + "epoch": 0.3121376811594203, + "grad_norm": 6.967031117789461, + "learning_rate": 8.054464286733478e-07, + "loss": 0.3152, + "step": 1723 + }, + { + "epoch": 0.31231884057971016, + "grad_norm": 7.555447942790169, + "learning_rate": 8.05214097375069e-07, + "loss": 0.3465, + "step": 1724 + }, + { + "epoch": 0.3125, + "grad_norm": 4.070719397505282, + "learning_rate": 8.049816609901303e-07, + "loss": 0.3354, + "step": 1725 + }, + { + "epoch": 0.31268115942028984, + "grad_norm": 8.384575921616161, + "learning_rate": 8.047491195985605e-07, + "loss": 0.2967, + "step": 1726 + }, + { + "epoch": 0.3128623188405797, + "grad_norm": 4.611983231322097, + "learning_rate": 8.04516473280425e-07, + "loss": 0.3364, + "step": 1727 + }, + { + "epoch": 0.3130434782608696, + "grad_norm": 5.853952889529148, + "learning_rate": 8.04283722115825e-07, + "loss": 0.3113, + "step": 1728 + }, + { + "epoch": 0.31322463768115943, + "grad_norm": 5.027119828842899, + "learning_rate": 8.040508661848977e-07, + "loss": 0.3289, + "step": 1729 + }, + { + "epoch": 0.3134057971014493, + "grad_norm": 3.270780677236944, + "learning_rate": 8.038179055678165e-07, + "loss": 0.3124, + "step": 1730 + }, + { + "epoch": 0.3135869565217391, + "grad_norm": 4.127668886158838, + "learning_rate": 8.035848403447909e-07, + "loss": 0.3549, + "step": 1731 + }, + { + "epoch": 0.31376811594202897, + "grad_norm": 4.087005380805331, + "learning_rate": 8.033516705960663e-07, + "loss": 0.3385, + "step": 1732 + }, + { + "epoch": 0.3139492753623188, + "grad_norm": 3.327602869466973, + "learning_rate": 8.031183964019243e-07, + "loss": 0.3244, + "step": 1733 + }, + { + "epoch": 0.3141304347826087, + "grad_norm": 10.313271954335411, + "learning_rate": 8.028850178426822e-07, + "loss": 0.3574, + "step": 1734 + }, + { + "epoch": 0.31431159420289856, + "grad_norm": 5.520779553007381, + "learning_rate": 8.026515349986935e-07, + "loss": 0.3619, + "step": 1735 + }, + { + "epoch": 0.3144927536231884, + "grad_norm": 3.953922544297295, + "learning_rate": 8.02417947950347e-07, + "loss": 0.3014, + "step": 1736 + }, + { + "epoch": 0.31467391304347825, + "grad_norm": 6.692317222783169, + "learning_rate": 8.021842567780684e-07, + "loss": 0.3732, + "step": 1737 + }, + { + "epoch": 0.3148550724637681, + "grad_norm": 8.65083240899913, + "learning_rate": 8.019504615623183e-07, + "loss": 0.3563, + "step": 1738 + }, + { + "epoch": 0.315036231884058, + "grad_norm": 10.892243907432828, + "learning_rate": 8.017165623835935e-07, + "loss": 0.3206, + "step": 1739 + }, + { + "epoch": 0.31521739130434784, + "grad_norm": 6.3045245615993535, + "learning_rate": 8.014825593224268e-07, + "loss": 0.2876, + "step": 1740 + }, + { + "epoch": 0.3153985507246377, + "grad_norm": 3.8197231707198473, + "learning_rate": 8.012484524593866e-07, + "loss": 0.3541, + "step": 1741 + }, + { + "epoch": 0.3155797101449275, + "grad_norm": 5.755044329091152, + "learning_rate": 8.010142418750768e-07, + "loss": 0.3152, + "step": 1742 + }, + { + "epoch": 0.3157608695652174, + "grad_norm": 4.137010640554019, + "learning_rate": 8.007799276501372e-07, + "loss": 0.311, + "step": 1743 + }, + { + "epoch": 0.3159420289855073, + "grad_norm": 5.767085388663738, + "learning_rate": 8.005455098652435e-07, + "loss": 0.3432, + "step": 1744 + }, + { + "epoch": 0.3161231884057971, + "grad_norm": 12.840668638797792, + "learning_rate": 8.003109886011066e-07, + "loss": 0.413, + "step": 1745 + }, + { + "epoch": 0.31630434782608696, + "grad_norm": 9.072168907881776, + "learning_rate": 8.000763639384735e-07, + "loss": 0.2955, + "step": 1746 + }, + { + "epoch": 0.3164855072463768, + "grad_norm": 3.228234268362536, + "learning_rate": 7.998416359581266e-07, + "loss": 0.3198, + "step": 1747 + }, + { + "epoch": 0.31666666666666665, + "grad_norm": 3.6393868339760074, + "learning_rate": 7.996068047408837e-07, + "loss": 0.2711, + "step": 1748 + }, + { + "epoch": 0.3168478260869565, + "grad_norm": 2.909269469768695, + "learning_rate": 7.993718703675983e-07, + "loss": 0.3151, + "step": 1749 + }, + { + "epoch": 0.3170289855072464, + "grad_norm": 12.590518173027503, + "learning_rate": 7.991368329191595e-07, + "loss": 0.3557, + "step": 1750 + }, + { + "epoch": 0.31721014492753624, + "grad_norm": 9.625837073942225, + "learning_rate": 7.989016924764918e-07, + "loss": 0.3373, + "step": 1751 + }, + { + "epoch": 0.3173913043478261, + "grad_norm": 12.204503330163293, + "learning_rate": 7.986664491205552e-07, + "loss": 0.297, + "step": 1752 + }, + { + "epoch": 0.31757246376811593, + "grad_norm": 3.9730453944517707, + "learning_rate": 7.98431102932345e-07, + "loss": 0.2768, + "step": 1753 + }, + { + "epoch": 0.3177536231884058, + "grad_norm": 4.878760706265584, + "learning_rate": 7.98195653992892e-07, + "loss": 0.3432, + "step": 1754 + }, + { + "epoch": 0.3179347826086957, + "grad_norm": 4.7797206233418335, + "learning_rate": 7.979601023832625e-07, + "loss": 0.3926, + "step": 1755 + }, + { + "epoch": 0.3181159420289855, + "grad_norm": 7.197706234302545, + "learning_rate": 7.97724448184558e-07, + "loss": 0.2906, + "step": 1756 + }, + { + "epoch": 0.31829710144927537, + "grad_norm": 13.018178014540316, + "learning_rate": 7.974886914779153e-07, + "loss": 0.3726, + "step": 1757 + }, + { + "epoch": 0.3184782608695652, + "grad_norm": 9.80128062782683, + "learning_rate": 7.972528323445067e-07, + "loss": 0.3029, + "step": 1758 + }, + { + "epoch": 0.31865942028985506, + "grad_norm": 8.733887067577676, + "learning_rate": 7.970168708655394e-07, + "loss": 0.3651, + "step": 1759 + }, + { + "epoch": 0.3188405797101449, + "grad_norm": 5.75649463387482, + "learning_rate": 7.967808071222564e-07, + "loss": 0.2818, + "step": 1760 + }, + { + "epoch": 0.3190217391304348, + "grad_norm": 3.7894055734976093, + "learning_rate": 7.96544641195935e-07, + "loss": 0.3068, + "step": 1761 + }, + { + "epoch": 0.31920289855072465, + "grad_norm": 8.2874706703031, + "learning_rate": 7.963083731678888e-07, + "loss": 0.3273, + "step": 1762 + }, + { + "epoch": 0.3193840579710145, + "grad_norm": 4.744385932326128, + "learning_rate": 7.960720031194656e-07, + "loss": 0.3093, + "step": 1763 + }, + { + "epoch": 0.31956521739130433, + "grad_norm": 6.046360752192413, + "learning_rate": 7.95835531132049e-07, + "loss": 0.3463, + "step": 1764 + }, + { + "epoch": 0.3197463768115942, + "grad_norm": 6.25156012306365, + "learning_rate": 7.955989572870574e-07, + "loss": 0.4005, + "step": 1765 + }, + { + "epoch": 0.3199275362318841, + "grad_norm": 4.033191064466662, + "learning_rate": 7.953622816659442e-07, + "loss": 0.3305, + "step": 1766 + }, + { + "epoch": 0.3201086956521739, + "grad_norm": 3.4091109789621843, + "learning_rate": 7.951255043501978e-07, + "loss": 0.325, + "step": 1767 + }, + { + "epoch": 0.32028985507246377, + "grad_norm": 6.969332580688918, + "learning_rate": 7.948886254213423e-07, + "loss": 0.3218, + "step": 1768 + }, + { + "epoch": 0.3204710144927536, + "grad_norm": 3.1964983579917146, + "learning_rate": 7.946516449609355e-07, + "loss": 0.2884, + "step": 1769 + }, + { + "epoch": 0.32065217391304346, + "grad_norm": 3.665297902530676, + "learning_rate": 7.944145630505714e-07, + "loss": 0.2778, + "step": 1770 + }, + { + "epoch": 0.32083333333333336, + "grad_norm": 15.983651726363597, + "learning_rate": 7.941773797718783e-07, + "loss": 0.3463, + "step": 1771 + }, + { + "epoch": 0.3210144927536232, + "grad_norm": 10.265141581605715, + "learning_rate": 7.939400952065193e-07, + "loss": 0.3379, + "step": 1772 + }, + { + "epoch": 0.32119565217391305, + "grad_norm": 15.058386227801867, + "learning_rate": 7.93702709436193e-07, + "loss": 0.3262, + "step": 1773 + }, + { + "epoch": 0.3213768115942029, + "grad_norm": 12.753059074024279, + "learning_rate": 7.934652225426321e-07, + "loss": 0.3325, + "step": 1774 + }, + { + "epoch": 0.32155797101449274, + "grad_norm": 8.206695925369717, + "learning_rate": 7.932276346076047e-07, + "loss": 0.3304, + "step": 1775 + }, + { + "epoch": 0.3217391304347826, + "grad_norm": 5.127611689367168, + "learning_rate": 7.929899457129135e-07, + "loss": 0.3076, + "step": 1776 + }, + { + "epoch": 0.3219202898550725, + "grad_norm": 3.616167030994503, + "learning_rate": 7.927521559403956e-07, + "loss": 0.2938, + "step": 1777 + }, + { + "epoch": 0.32210144927536233, + "grad_norm": 5.936154514974325, + "learning_rate": 7.925142653719235e-07, + "loss": 0.3397, + "step": 1778 + }, + { + "epoch": 0.3222826086956522, + "grad_norm": 7.531415104527346, + "learning_rate": 7.922762740894036e-07, + "loss": 0.3241, + "step": 1779 + }, + { + "epoch": 0.322463768115942, + "grad_norm": 12.151271145029897, + "learning_rate": 7.92038182174778e-07, + "loss": 0.3487, + "step": 1780 + }, + { + "epoch": 0.32264492753623186, + "grad_norm": 5.623020406797692, + "learning_rate": 7.917999897100222e-07, + "loss": 0.3139, + "step": 1781 + }, + { + "epoch": 0.32282608695652176, + "grad_norm": 10.863941871658541, + "learning_rate": 7.915616967771477e-07, + "loss": 0.3318, + "step": 1782 + }, + { + "epoch": 0.3230072463768116, + "grad_norm": 9.024418909062998, + "learning_rate": 7.913233034581994e-07, + "loss": 0.2895, + "step": 1783 + }, + { + "epoch": 0.32318840579710145, + "grad_norm": 3.9634898278216077, + "learning_rate": 7.910848098352574e-07, + "loss": 0.3504, + "step": 1784 + }, + { + "epoch": 0.3233695652173913, + "grad_norm": 4.518014284501169, + "learning_rate": 7.908462159904362e-07, + "loss": 0.3021, + "step": 1785 + }, + { + "epoch": 0.32355072463768114, + "grad_norm": 9.238802669532143, + "learning_rate": 7.906075220058847e-07, + "loss": 0.4299, + "step": 1786 + }, + { + "epoch": 0.32373188405797104, + "grad_norm": 5.21929560669184, + "learning_rate": 7.903687279637867e-07, + "loss": 0.3364, + "step": 1787 + }, + { + "epoch": 0.3239130434782609, + "grad_norm": 11.012728402839075, + "learning_rate": 7.901298339463597e-07, + "loss": 0.3175, + "step": 1788 + }, + { + "epoch": 0.32409420289855073, + "grad_norm": 5.696682544109489, + "learning_rate": 7.898908400358561e-07, + "loss": 0.2815, + "step": 1789 + }, + { + "epoch": 0.3242753623188406, + "grad_norm": 6.464259691848639, + "learning_rate": 7.896517463145629e-07, + "loss": 0.3231, + "step": 1790 + }, + { + "epoch": 0.3244565217391304, + "grad_norm": 5.970634926153257, + "learning_rate": 7.894125528648011e-07, + "loss": 0.3132, + "step": 1791 + }, + { + "epoch": 0.32463768115942027, + "grad_norm": 3.9456199455399763, + "learning_rate": 7.891732597689259e-07, + "loss": 0.3501, + "step": 1792 + }, + { + "epoch": 0.32481884057971017, + "grad_norm": 4.132907835092769, + "learning_rate": 7.889338671093273e-07, + "loss": 0.3908, + "step": 1793 + }, + { + "epoch": 0.325, + "grad_norm": 6.286989059880983, + "learning_rate": 7.886943749684293e-07, + "loss": 0.3026, + "step": 1794 + }, + { + "epoch": 0.32518115942028986, + "grad_norm": 7.261166467637907, + "learning_rate": 7.884547834286901e-07, + "loss": 0.3568, + "step": 1795 + }, + { + "epoch": 0.3253623188405797, + "grad_norm": 3.406809395394441, + "learning_rate": 7.882150925726023e-07, + "loss": 0.3052, + "step": 1796 + }, + { + "epoch": 0.32554347826086955, + "grad_norm": 6.114238799476201, + "learning_rate": 7.879753024826925e-07, + "loss": 0.2653, + "step": 1797 + }, + { + "epoch": 0.32572463768115945, + "grad_norm": 8.117039676780227, + "learning_rate": 7.877354132415215e-07, + "loss": 0.2944, + "step": 1798 + }, + { + "epoch": 0.3259057971014493, + "grad_norm": 7.043234745915229, + "learning_rate": 7.874954249316846e-07, + "loss": 0.3492, + "step": 1799 + }, + { + "epoch": 0.32608695652173914, + "grad_norm": 5.58362755560085, + "learning_rate": 7.872553376358104e-07, + "loss": 0.35, + "step": 1800 + }, + { + "epoch": 0.32608695652173914, + "eval_loss": 0.3110625147819519, + "eval_runtime": 9.8069, + "eval_samples_per_second": 50.984, + "eval_steps_per_second": 0.102, + "step": 1800 + }, + { + "epoch": 0.326268115942029, + "grad_norm": 6.9229344023976935, + "learning_rate": 7.870151514365626e-07, + "loss": 0.3014, + "step": 1801 + }, + { + "epoch": 0.3264492753623188, + "grad_norm": 7.5092665291583485, + "learning_rate": 7.867748664166383e-07, + "loss": 0.3281, + "step": 1802 + }, + { + "epoch": 0.32663043478260867, + "grad_norm": 4.530904318891568, + "learning_rate": 7.865344826587688e-07, + "loss": 0.2728, + "step": 1803 + }, + { + "epoch": 0.32681159420289857, + "grad_norm": 9.976531480698771, + "learning_rate": 7.86294000245719e-07, + "loss": 0.3434, + "step": 1804 + }, + { + "epoch": 0.3269927536231884, + "grad_norm": 3.712576034526324, + "learning_rate": 7.860534192602887e-07, + "loss": 0.3596, + "step": 1805 + }, + { + "epoch": 0.32717391304347826, + "grad_norm": 6.78122623043467, + "learning_rate": 7.858127397853107e-07, + "loss": 0.3255, + "step": 1806 + }, + { + "epoch": 0.3273550724637681, + "grad_norm": 13.949449697003933, + "learning_rate": 7.855719619036522e-07, + "loss": 0.2817, + "step": 1807 + }, + { + "epoch": 0.32753623188405795, + "grad_norm": 4.218279967689233, + "learning_rate": 7.85331085698214e-07, + "loss": 0.3364, + "step": 1808 + }, + { + "epoch": 0.32771739130434785, + "grad_norm": 5.533274062204578, + "learning_rate": 7.850901112519312e-07, + "loss": 0.2823, + "step": 1809 + }, + { + "epoch": 0.3278985507246377, + "grad_norm": 3.543552759773686, + "learning_rate": 7.848490386477724e-07, + "loss": 0.3599, + "step": 1810 + }, + { + "epoch": 0.32807971014492754, + "grad_norm": 3.9866252451118527, + "learning_rate": 7.846078679687398e-07, + "loss": 0.337, + "step": 1811 + }, + { + "epoch": 0.3282608695652174, + "grad_norm": 5.734366563873238, + "learning_rate": 7.843665992978699e-07, + "loss": 0.277, + "step": 1812 + }, + { + "epoch": 0.32844202898550723, + "grad_norm": 5.419092947234047, + "learning_rate": 7.841252327182324e-07, + "loss": 0.3199, + "step": 1813 + }, + { + "epoch": 0.32862318840579713, + "grad_norm": 9.236887425051977, + "learning_rate": 7.838837683129311e-07, + "loss": 0.3139, + "step": 1814 + }, + { + "epoch": 0.328804347826087, + "grad_norm": 3.9756656590256862, + "learning_rate": 7.836422061651031e-07, + "loss": 0.329, + "step": 1815 + }, + { + "epoch": 0.3289855072463768, + "grad_norm": 5.01291542310867, + "learning_rate": 7.834005463579199e-07, + "loss": 0.3097, + "step": 1816 + }, + { + "epoch": 0.32916666666666666, + "grad_norm": 4.769191561751831, + "learning_rate": 7.831587889745856e-07, + "loss": 0.3507, + "step": 1817 + }, + { + "epoch": 0.3293478260869565, + "grad_norm": 4.0984766101069985, + "learning_rate": 7.829169340983388e-07, + "loss": 0.266, + "step": 1818 + }, + { + "epoch": 0.32952898550724635, + "grad_norm": 11.098333700488112, + "learning_rate": 7.826749818124509e-07, + "loss": 0.304, + "step": 1819 + }, + { + "epoch": 0.32971014492753625, + "grad_norm": 4.8360526383389395, + "learning_rate": 7.824329322002276e-07, + "loss": 0.2941, + "step": 1820 + }, + { + "epoch": 0.3298913043478261, + "grad_norm": 4.354136907590092, + "learning_rate": 7.821907853450074e-07, + "loss": 0.2926, + "step": 1821 + }, + { + "epoch": 0.33007246376811594, + "grad_norm": 5.682014557605183, + "learning_rate": 7.819485413301629e-07, + "loss": 0.2977, + "step": 1822 + }, + { + "epoch": 0.3302536231884058, + "grad_norm": 5.29047262698395, + "learning_rate": 7.817062002390997e-07, + "loss": 0.3418, + "step": 1823 + }, + { + "epoch": 0.33043478260869563, + "grad_norm": 3.437535128783487, + "learning_rate": 7.814637621552569e-07, + "loss": 0.2943, + "step": 1824 + }, + { + "epoch": 0.33061594202898553, + "grad_norm": 8.494373244559128, + "learning_rate": 7.812212271621072e-07, + "loss": 0.3455, + "step": 1825 + }, + { + "epoch": 0.3307971014492754, + "grad_norm": 6.117527539614069, + "learning_rate": 7.809785953431566e-07, + "loss": 0.4307, + "step": 1826 + }, + { + "epoch": 0.3309782608695652, + "grad_norm": 7.0242481198031, + "learning_rate": 7.807358667819444e-07, + "loss": 0.3699, + "step": 1827 + }, + { + "epoch": 0.33115942028985507, + "grad_norm": 6.494935737259366, + "learning_rate": 7.80493041562043e-07, + "loss": 0.306, + "step": 1828 + }, + { + "epoch": 0.3313405797101449, + "grad_norm": 11.040990824622417, + "learning_rate": 7.802501197670584e-07, + "loss": 0.3092, + "step": 1829 + }, + { + "epoch": 0.33152173913043476, + "grad_norm": 6.140918427273407, + "learning_rate": 7.800071014806298e-07, + "loss": 0.2845, + "step": 1830 + }, + { + "epoch": 0.33170289855072466, + "grad_norm": 5.782933828184184, + "learning_rate": 7.797639867864292e-07, + "loss": 0.2868, + "step": 1831 + }, + { + "epoch": 0.3318840579710145, + "grad_norm": 4.55085839386319, + "learning_rate": 7.795207757681625e-07, + "loss": 0.32, + "step": 1832 + }, + { + "epoch": 0.33206521739130435, + "grad_norm": 4.988593286291903, + "learning_rate": 7.792774685095685e-07, + "loss": 0.2611, + "step": 1833 + }, + { + "epoch": 0.3322463768115942, + "grad_norm": 10.675872523095858, + "learning_rate": 7.790340650944187e-07, + "loss": 0.3535, + "step": 1834 + }, + { + "epoch": 0.33242753623188404, + "grad_norm": 6.438853883030144, + "learning_rate": 7.787905656065181e-07, + "loss": 0.3236, + "step": 1835 + }, + { + "epoch": 0.33260869565217394, + "grad_norm": 11.587803589616485, + "learning_rate": 7.785469701297051e-07, + "loss": 0.3597, + "step": 1836 + }, + { + "epoch": 0.3327898550724638, + "grad_norm": 4.93510083383632, + "learning_rate": 7.783032787478503e-07, + "loss": 0.345, + "step": 1837 + }, + { + "epoch": 0.3329710144927536, + "grad_norm": 5.870644635889845, + "learning_rate": 7.78059491544858e-07, + "loss": 0.3492, + "step": 1838 + }, + { + "epoch": 0.33315217391304347, + "grad_norm": 6.756904392385173, + "learning_rate": 7.778156086046653e-07, + "loss": 0.4325, + "step": 1839 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 3.6573303948484868, + "learning_rate": 7.775716300112422e-07, + "loss": 0.3203, + "step": 1840 + }, + { + "epoch": 0.3335144927536232, + "grad_norm": 3.046149780740665, + "learning_rate": 7.773275558485919e-07, + "loss": 0.2639, + "step": 1841 + }, + { + "epoch": 0.33369565217391306, + "grad_norm": 5.529985241268639, + "learning_rate": 7.7708338620075e-07, + "loss": 0.3153, + "step": 1842 + }, + { + "epoch": 0.3338768115942029, + "grad_norm": 3.634591019962169, + "learning_rate": 7.768391211517854e-07, + "loss": 0.2825, + "step": 1843 + }, + { + "epoch": 0.33405797101449275, + "grad_norm": 4.064523349135702, + "learning_rate": 7.765947607857996e-07, + "loss": 0.3234, + "step": 1844 + }, + { + "epoch": 0.3342391304347826, + "grad_norm": 4.898462953096211, + "learning_rate": 7.763503051869274e-07, + "loss": 0.3493, + "step": 1845 + }, + { + "epoch": 0.33442028985507244, + "grad_norm": 4.535616035004989, + "learning_rate": 7.761057544393354e-07, + "loss": 0.3633, + "step": 1846 + }, + { + "epoch": 0.33460144927536234, + "grad_norm": 6.399007789665967, + "learning_rate": 7.758611086272242e-07, + "loss": 0.2986, + "step": 1847 + }, + { + "epoch": 0.3347826086956522, + "grad_norm": 11.827980252735243, + "learning_rate": 7.756163678348259e-07, + "loss": 0.3161, + "step": 1848 + }, + { + "epoch": 0.33496376811594203, + "grad_norm": 5.764356486799095, + "learning_rate": 7.753715321464067e-07, + "loss": 0.3814, + "step": 1849 + }, + { + "epoch": 0.3351449275362319, + "grad_norm": 8.655970632077477, + "learning_rate": 7.75126601646264e-07, + "loss": 0.2574, + "step": 1850 + }, + { + "epoch": 0.3353260869565217, + "grad_norm": 7.392371604385812, + "learning_rate": 7.748815764187289e-07, + "loss": 0.3307, + "step": 1851 + }, + { + "epoch": 0.3355072463768116, + "grad_norm": 3.961886234950718, + "learning_rate": 7.746364565481644e-07, + "loss": 0.3492, + "step": 1852 + }, + { + "epoch": 0.33568840579710146, + "grad_norm": 4.312124096310197, + "learning_rate": 7.743912421189669e-07, + "loss": 0.3745, + "step": 1853 + }, + { + "epoch": 0.3358695652173913, + "grad_norm": 3.576929001616148, + "learning_rate": 7.741459332155644e-07, + "loss": 0.3312, + "step": 1854 + }, + { + "epoch": 0.33605072463768115, + "grad_norm": 3.1350013576812588, + "learning_rate": 7.739005299224184e-07, + "loss": 0.3079, + "step": 1855 + }, + { + "epoch": 0.336231884057971, + "grad_norm": 3.100110304948787, + "learning_rate": 7.736550323240221e-07, + "loss": 0.2841, + "step": 1856 + }, + { + "epoch": 0.33641304347826084, + "grad_norm": 5.412568495165048, + "learning_rate": 7.734094405049016e-07, + "loss": 0.297, + "step": 1857 + }, + { + "epoch": 0.33659420289855074, + "grad_norm": 4.829316799924912, + "learning_rate": 7.731637545496152e-07, + "loss": 0.3187, + "step": 1858 + }, + { + "epoch": 0.3367753623188406, + "grad_norm": 4.069996412320471, + "learning_rate": 7.729179745427539e-07, + "loss": 0.3193, + "step": 1859 + }, + { + "epoch": 0.33695652173913043, + "grad_norm": 6.896132936551222, + "learning_rate": 7.726721005689407e-07, + "loss": 0.2841, + "step": 1860 + }, + { + "epoch": 0.3371376811594203, + "grad_norm": 3.4484648851852304, + "learning_rate": 7.724261327128316e-07, + "loss": 0.3051, + "step": 1861 + }, + { + "epoch": 0.3373188405797101, + "grad_norm": 4.8216965517130195, + "learning_rate": 7.72180071059114e-07, + "loss": 0.3949, + "step": 1862 + }, + { + "epoch": 0.3375, + "grad_norm": 3.5955961823535145, + "learning_rate": 7.719339156925085e-07, + "loss": 0.2961, + "step": 1863 + }, + { + "epoch": 0.33768115942028987, + "grad_norm": 4.568770172107434, + "learning_rate": 7.716876666977672e-07, + "loss": 0.3697, + "step": 1864 + }, + { + "epoch": 0.3378623188405797, + "grad_norm": 3.4887417154003866, + "learning_rate": 7.714413241596752e-07, + "loss": 0.337, + "step": 1865 + }, + { + "epoch": 0.33804347826086956, + "grad_norm": 8.078526845128401, + "learning_rate": 7.711948881630488e-07, + "loss": 0.3098, + "step": 1866 + }, + { + "epoch": 0.3382246376811594, + "grad_norm": 4.666920406554297, + "learning_rate": 7.709483587927377e-07, + "loss": 0.3748, + "step": 1867 + }, + { + "epoch": 0.3384057971014493, + "grad_norm": 11.889240970106998, + "learning_rate": 7.707017361336229e-07, + "loss": 0.301, + "step": 1868 + }, + { + "epoch": 0.33858695652173915, + "grad_norm": 7.052726826122178, + "learning_rate": 7.704550202706177e-07, + "loss": 0.2922, + "step": 1869 + }, + { + "epoch": 0.338768115942029, + "grad_norm": 4.541414767469716, + "learning_rate": 7.702082112886675e-07, + "loss": 0.3466, + "step": 1870 + }, + { + "epoch": 0.33894927536231884, + "grad_norm": 5.045076435203539, + "learning_rate": 7.699613092727501e-07, + "loss": 0.3337, + "step": 1871 + }, + { + "epoch": 0.3391304347826087, + "grad_norm": 5.174490533205886, + "learning_rate": 7.697143143078746e-07, + "loss": 0.4119, + "step": 1872 + }, + { + "epoch": 0.3393115942028985, + "grad_norm": 5.66541132872301, + "learning_rate": 7.694672264790829e-07, + "loss": 0.3312, + "step": 1873 + }, + { + "epoch": 0.3394927536231884, + "grad_norm": 6.865984013490447, + "learning_rate": 7.692200458714482e-07, + "loss": 0.3042, + "step": 1874 + }, + { + "epoch": 0.33967391304347827, + "grad_norm": 13.355950382047805, + "learning_rate": 7.689727725700762e-07, + "loss": 0.3695, + "step": 1875 + }, + { + "epoch": 0.3398550724637681, + "grad_norm": 4.346972351647204, + "learning_rate": 7.687254066601042e-07, + "loss": 0.3828, + "step": 1876 + }, + { + "epoch": 0.34003623188405796, + "grad_norm": 3.2301297780422895, + "learning_rate": 7.684779482267015e-07, + "loss": 0.2957, + "step": 1877 + }, + { + "epoch": 0.3402173913043478, + "grad_norm": 3.729963802113808, + "learning_rate": 7.68230397355069e-07, + "loss": 0.2879, + "step": 1878 + }, + { + "epoch": 0.3403985507246377, + "grad_norm": 11.284445167538793, + "learning_rate": 7.679827541304399e-07, + "loss": 0.3391, + "step": 1879 + }, + { + "epoch": 0.34057971014492755, + "grad_norm": 7.993766447744876, + "learning_rate": 7.677350186380787e-07, + "loss": 0.2724, + "step": 1880 + }, + { + "epoch": 0.3407608695652174, + "grad_norm": 7.059520062316174, + "learning_rate": 7.67487190963282e-07, + "loss": 0.323, + "step": 1881 + }, + { + "epoch": 0.34094202898550724, + "grad_norm": 9.917544819971706, + "learning_rate": 7.672392711913783e-07, + "loss": 0.2579, + "step": 1882 + }, + { + "epoch": 0.3411231884057971, + "grad_norm": 8.212302852980963, + "learning_rate": 7.669912594077272e-07, + "loss": 0.3157, + "step": 1883 + }, + { + "epoch": 0.34130434782608693, + "grad_norm": 6.432886292746912, + "learning_rate": 7.667431556977205e-07, + "loss": 0.3038, + "step": 1884 + }, + { + "epoch": 0.34148550724637683, + "grad_norm": 4.5740339102545216, + "learning_rate": 7.664949601467814e-07, + "loss": 0.2946, + "step": 1885 + }, + { + "epoch": 0.3416666666666667, + "grad_norm": 6.278337582897395, + "learning_rate": 7.66246672840365e-07, + "loss": 0.2569, + "step": 1886 + }, + { + "epoch": 0.3418478260869565, + "grad_norm": 5.553927723122158, + "learning_rate": 7.659982938639573e-07, + "loss": 0.3451, + "step": 1887 + }, + { + "epoch": 0.34202898550724636, + "grad_norm": 9.631497175645482, + "learning_rate": 7.657498233030769e-07, + "loss": 0.3146, + "step": 1888 + }, + { + "epoch": 0.3422101449275362, + "grad_norm": 4.92189232450789, + "learning_rate": 7.655012612432732e-07, + "loss": 0.2783, + "step": 1889 + }, + { + "epoch": 0.3423913043478261, + "grad_norm": 3.8638210356351808, + "learning_rate": 7.652526077701273e-07, + "loss": 0.3392, + "step": 1890 + }, + { + "epoch": 0.34257246376811595, + "grad_norm": 9.769706653601604, + "learning_rate": 7.650038629692517e-07, + "loss": 0.345, + "step": 1891 + }, + { + "epoch": 0.3427536231884058, + "grad_norm": 2.985434391402221, + "learning_rate": 7.647550269262904e-07, + "loss": 0.2556, + "step": 1892 + }, + { + "epoch": 0.34293478260869564, + "grad_norm": 7.128359932100153, + "learning_rate": 7.64506099726919e-07, + "loss": 0.3926, + "step": 1893 + }, + { + "epoch": 0.3431159420289855, + "grad_norm": 4.712568135716269, + "learning_rate": 7.642570814568442e-07, + "loss": 0.2929, + "step": 1894 + }, + { + "epoch": 0.3432971014492754, + "grad_norm": 4.870888807304561, + "learning_rate": 7.64007972201804e-07, + "loss": 0.2495, + "step": 1895 + }, + { + "epoch": 0.34347826086956523, + "grad_norm": 11.396376975111291, + "learning_rate": 7.637587720475683e-07, + "loss": 0.3848, + "step": 1896 + }, + { + "epoch": 0.3436594202898551, + "grad_norm": 3.87365561517141, + "learning_rate": 7.635094810799376e-07, + "loss": 0.3206, + "step": 1897 + }, + { + "epoch": 0.3438405797101449, + "grad_norm": 3.1454057233550703, + "learning_rate": 7.63260099384744e-07, + "loss": 0.2676, + "step": 1898 + }, + { + "epoch": 0.34402173913043477, + "grad_norm": 3.6565351839349693, + "learning_rate": 7.63010627047851e-07, + "loss": 0.3404, + "step": 1899 + }, + { + "epoch": 0.3442028985507246, + "grad_norm": 3.3259769279543416, + "learning_rate": 7.627610641551527e-07, + "loss": 0.3035, + "step": 1900 + }, + { + "epoch": 0.3442028985507246, + "eval_loss": 0.31725001335144043, + "eval_runtime": 9.7375, + "eval_samples_per_second": 51.348, + "eval_steps_per_second": 0.103, + "step": 1900 + }, + { + "epoch": 0.3443840579710145, + "grad_norm": 4.282098781605039, + "learning_rate": 7.62511410792575e-07, + "loss": 0.3334, + "step": 1901 + }, + { + "epoch": 0.34456521739130436, + "grad_norm": 7.7128900444262545, + "learning_rate": 7.62261667046075e-07, + "loss": 0.3718, + "step": 1902 + }, + { + "epoch": 0.3447463768115942, + "grad_norm": 5.686317933797614, + "learning_rate": 7.620118330016402e-07, + "loss": 0.3582, + "step": 1903 + }, + { + "epoch": 0.34492753623188405, + "grad_norm": 9.517294360438747, + "learning_rate": 7.6176190874529e-07, + "loss": 0.3749, + "step": 1904 + }, + { + "epoch": 0.3451086956521739, + "grad_norm": 7.779011917835557, + "learning_rate": 7.615118943630743e-07, + "loss": 0.3279, + "step": 1905 + }, + { + "epoch": 0.3452898550724638, + "grad_norm": 5.391162533515678, + "learning_rate": 7.612617899410743e-07, + "loss": 0.3345, + "step": 1906 + }, + { + "epoch": 0.34547101449275364, + "grad_norm": 8.667519242476674, + "learning_rate": 7.61011595565402e-07, + "loss": 0.2916, + "step": 1907 + }, + { + "epoch": 0.3456521739130435, + "grad_norm": 5.248028694855879, + "learning_rate": 7.607613113222009e-07, + "loss": 0.291, + "step": 1908 + }, + { + "epoch": 0.3458333333333333, + "grad_norm": 13.70835393462969, + "learning_rate": 7.605109372976446e-07, + "loss": 0.3652, + "step": 1909 + }, + { + "epoch": 0.34601449275362317, + "grad_norm": 7.095388012091221, + "learning_rate": 7.602604735779384e-07, + "loss": 0.299, + "step": 1910 + }, + { + "epoch": 0.34619565217391307, + "grad_norm": 3.3656288451090948, + "learning_rate": 7.60009920249318e-07, + "loss": 0.2843, + "step": 1911 + }, + { + "epoch": 0.3463768115942029, + "grad_norm": 9.18839271346281, + "learning_rate": 7.597592773980501e-07, + "loss": 0.3467, + "step": 1912 + }, + { + "epoch": 0.34655797101449276, + "grad_norm": 3.363641644804981, + "learning_rate": 7.595085451104322e-07, + "loss": 0.271, + "step": 1913 + }, + { + "epoch": 0.3467391304347826, + "grad_norm": 5.443643354170259, + "learning_rate": 7.592577234727927e-07, + "loss": 0.3369, + "step": 1914 + }, + { + "epoch": 0.34692028985507245, + "grad_norm": 3.9042244073527077, + "learning_rate": 7.590068125714904e-07, + "loss": 0.3802, + "step": 1915 + }, + { + "epoch": 0.3471014492753623, + "grad_norm": 5.439914170267567, + "learning_rate": 7.587558124929155e-07, + "loss": 0.3301, + "step": 1916 + }, + { + "epoch": 0.3472826086956522, + "grad_norm": 8.900399166788967, + "learning_rate": 7.585047233234883e-07, + "loss": 0.368, + "step": 1917 + }, + { + "epoch": 0.34746376811594204, + "grad_norm": 6.041870917497792, + "learning_rate": 7.582535451496601e-07, + "loss": 0.3347, + "step": 1918 + }, + { + "epoch": 0.3476449275362319, + "grad_norm": 4.568070340817429, + "learning_rate": 7.580022780579127e-07, + "loss": 0.3172, + "step": 1919 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 4.441333780159891, + "learning_rate": 7.577509221347584e-07, + "loss": 0.3246, + "step": 1920 + }, + { + "epoch": 0.3480072463768116, + "grad_norm": 7.3331851376781145, + "learning_rate": 7.574994774667405e-07, + "loss": 0.2966, + "step": 1921 + }, + { + "epoch": 0.3481884057971015, + "grad_norm": 8.145608524730086, + "learning_rate": 7.572479441404323e-07, + "loss": 0.2691, + "step": 1922 + }, + { + "epoch": 0.3483695652173913, + "grad_norm": 4.073117607302022, + "learning_rate": 7.569963222424382e-07, + "loss": 0.2984, + "step": 1923 + }, + { + "epoch": 0.34855072463768116, + "grad_norm": 4.092988409608099, + "learning_rate": 7.567446118593927e-07, + "loss": 0.3324, + "step": 1924 + }, + { + "epoch": 0.348731884057971, + "grad_norm": 3.619394052885244, + "learning_rate": 7.564928130779608e-07, + "loss": 0.3246, + "step": 1925 + }, + { + "epoch": 0.34891304347826085, + "grad_norm": 4.843311112410277, + "learning_rate": 7.56240925984838e-07, + "loss": 0.3395, + "step": 1926 + }, + { + "epoch": 0.3490942028985507, + "grad_norm": 3.299196083780203, + "learning_rate": 7.559889506667502e-07, + "loss": 0.3041, + "step": 1927 + }, + { + "epoch": 0.3492753623188406, + "grad_norm": 4.51999885419715, + "learning_rate": 7.557368872104539e-07, + "loss": 0.3693, + "step": 1928 + }, + { + "epoch": 0.34945652173913044, + "grad_norm": 6.250648374945436, + "learning_rate": 7.554847357027358e-07, + "loss": 0.2881, + "step": 1929 + }, + { + "epoch": 0.3496376811594203, + "grad_norm": 9.058991677860087, + "learning_rate": 7.552324962304126e-07, + "loss": 0.3195, + "step": 1930 + }, + { + "epoch": 0.34981884057971013, + "grad_norm": 9.306681764335439, + "learning_rate": 7.549801688803314e-07, + "loss": 0.3363, + "step": 1931 + }, + { + "epoch": 0.35, + "grad_norm": 4.41249544527736, + "learning_rate": 7.547277537393701e-07, + "loss": 0.3355, + "step": 1932 + }, + { + "epoch": 0.3501811594202899, + "grad_norm": 5.059114323913898, + "learning_rate": 7.544752508944363e-07, + "loss": 0.2837, + "step": 1933 + }, + { + "epoch": 0.3503623188405797, + "grad_norm": 3.9975470441371685, + "learning_rate": 7.54222660432468e-07, + "loss": 0.3284, + "step": 1934 + }, + { + "epoch": 0.35054347826086957, + "grad_norm": 3.667456386660029, + "learning_rate": 7.53969982440433e-07, + "loss": 0.3544, + "step": 1935 + }, + { + "epoch": 0.3507246376811594, + "grad_norm": 5.467054414616333, + "learning_rate": 7.537172170053296e-07, + "loss": 0.354, + "step": 1936 + }, + { + "epoch": 0.35090579710144926, + "grad_norm": 3.898467393603716, + "learning_rate": 7.534643642141864e-07, + "loss": 0.3335, + "step": 1937 + }, + { + "epoch": 0.35108695652173916, + "grad_norm": 5.351133640830385, + "learning_rate": 7.532114241540617e-07, + "loss": 0.3429, + "step": 1938 + }, + { + "epoch": 0.351268115942029, + "grad_norm": 6.713417884803827, + "learning_rate": 7.529583969120439e-07, + "loss": 0.3193, + "step": 1939 + }, + { + "epoch": 0.35144927536231885, + "grad_norm": 3.6311873813636546, + "learning_rate": 7.527052825752514e-07, + "loss": 0.2878, + "step": 1940 + }, + { + "epoch": 0.3516304347826087, + "grad_norm": 4.598739660373051, + "learning_rate": 7.524520812308329e-07, + "loss": 0.304, + "step": 1941 + }, + { + "epoch": 0.35181159420289854, + "grad_norm": 4.7275618672068385, + "learning_rate": 7.521987929659666e-07, + "loss": 0.2905, + "step": 1942 + }, + { + "epoch": 0.3519927536231884, + "grad_norm": 11.364381926193575, + "learning_rate": 7.51945417867861e-07, + "loss": 0.314, + "step": 1943 + }, + { + "epoch": 0.3521739130434783, + "grad_norm": 3.45285174488261, + "learning_rate": 7.516919560237543e-07, + "loss": 0.3115, + "step": 1944 + }, + { + "epoch": 0.3523550724637681, + "grad_norm": 6.88704782510566, + "learning_rate": 7.514384075209145e-07, + "loss": 0.3046, + "step": 1945 + }, + { + "epoch": 0.35253623188405797, + "grad_norm": 3.461509491621105, + "learning_rate": 7.511847724466398e-07, + "loss": 0.3596, + "step": 1946 + }, + { + "epoch": 0.3527173913043478, + "grad_norm": 7.224756575119816, + "learning_rate": 7.509310508882576e-07, + "loss": 0.2902, + "step": 1947 + }, + { + "epoch": 0.35289855072463766, + "grad_norm": 6.139441570882219, + "learning_rate": 7.506772429331258e-07, + "loss": 0.3176, + "step": 1948 + }, + { + "epoch": 0.35307971014492756, + "grad_norm": 5.864063586915534, + "learning_rate": 7.504233486686315e-07, + "loss": 0.2741, + "step": 1949 + }, + { + "epoch": 0.3532608695652174, + "grad_norm": 4.737651456180436, + "learning_rate": 7.501693681821917e-07, + "loss": 0.3732, + "step": 1950 + }, + { + "epoch": 0.35344202898550725, + "grad_norm": 7.098250544261399, + "learning_rate": 7.499153015612531e-07, + "loss": 0.3041, + "step": 1951 + }, + { + "epoch": 0.3536231884057971, + "grad_norm": 11.461618543071948, + "learning_rate": 7.49661148893292e-07, + "loss": 0.3358, + "step": 1952 + }, + { + "epoch": 0.35380434782608694, + "grad_norm": 13.918541046966165, + "learning_rate": 7.494069102658144e-07, + "loss": 0.3547, + "step": 1953 + }, + { + "epoch": 0.3539855072463768, + "grad_norm": 3.946662794690365, + "learning_rate": 7.491525857663561e-07, + "loss": 0.2751, + "step": 1954 + }, + { + "epoch": 0.3541666666666667, + "grad_norm": 3.320597497624449, + "learning_rate": 7.488981754824818e-07, + "loss": 0.3018, + "step": 1955 + }, + { + "epoch": 0.35434782608695653, + "grad_norm": 5.79081248013009, + "learning_rate": 7.486436795017865e-07, + "loss": 0.3255, + "step": 1956 + }, + { + "epoch": 0.3545289855072464, + "grad_norm": 5.377384462939565, + "learning_rate": 7.483890979118941e-07, + "loss": 0.3199, + "step": 1957 + }, + { + "epoch": 0.3547101449275362, + "grad_norm": 3.9946503786788456, + "learning_rate": 7.481344308004586e-07, + "loss": 0.3405, + "step": 1958 + }, + { + "epoch": 0.35489130434782606, + "grad_norm": 4.459599253562778, + "learning_rate": 7.478796782551627e-07, + "loss": 0.3222, + "step": 1959 + }, + { + "epoch": 0.35507246376811596, + "grad_norm": 4.321830940291255, + "learning_rate": 7.476248403637193e-07, + "loss": 0.3451, + "step": 1960 + }, + { + "epoch": 0.3552536231884058, + "grad_norm": 6.154671052614726, + "learning_rate": 7.473699172138699e-07, + "loss": 0.2844, + "step": 1961 + }, + { + "epoch": 0.35543478260869565, + "grad_norm": 9.313270188556709, + "learning_rate": 7.471149088933861e-07, + "loss": 0.3265, + "step": 1962 + }, + { + "epoch": 0.3556159420289855, + "grad_norm": 3.312867911782334, + "learning_rate": 7.468598154900681e-07, + "loss": 0.3312, + "step": 1963 + }, + { + "epoch": 0.35579710144927534, + "grad_norm": 6.432823713029236, + "learning_rate": 7.466046370917462e-07, + "loss": 0.2874, + "step": 1964 + }, + { + "epoch": 0.35597826086956524, + "grad_norm": 7.2127414323412005, + "learning_rate": 7.463493737862792e-07, + "loss": 0.3055, + "step": 1965 + }, + { + "epoch": 0.3561594202898551, + "grad_norm": 4.439994348162171, + "learning_rate": 7.460940256615556e-07, + "loss": 0.3275, + "step": 1966 + }, + { + "epoch": 0.35634057971014493, + "grad_norm": 8.970028985119088, + "learning_rate": 7.458385928054929e-07, + "loss": 0.3383, + "step": 1967 + }, + { + "epoch": 0.3565217391304348, + "grad_norm": 3.712195228807908, + "learning_rate": 7.455830753060379e-07, + "loss": 0.2984, + "step": 1968 + }, + { + "epoch": 0.3567028985507246, + "grad_norm": 4.318077828598915, + "learning_rate": 7.453274732511666e-07, + "loss": 0.3431, + "step": 1969 + }, + { + "epoch": 0.35688405797101447, + "grad_norm": 3.8396660276512757, + "learning_rate": 7.450717867288838e-07, + "loss": 0.3336, + "step": 1970 + }, + { + "epoch": 0.35706521739130437, + "grad_norm": 6.4177151398265195, + "learning_rate": 7.448160158272235e-07, + "loss": 0.3145, + "step": 1971 + }, + { + "epoch": 0.3572463768115942, + "grad_norm": 3.787539417337925, + "learning_rate": 7.445601606342493e-07, + "loss": 0.3432, + "step": 1972 + }, + { + "epoch": 0.35742753623188406, + "grad_norm": 3.486054169161485, + "learning_rate": 7.443042212380527e-07, + "loss": 0.314, + "step": 1973 + }, + { + "epoch": 0.3576086956521739, + "grad_norm": 3.5757932929300806, + "learning_rate": 7.440481977267555e-07, + "loss": 0.3143, + "step": 1974 + }, + { + "epoch": 0.35778985507246375, + "grad_norm": 3.3658739895347063, + "learning_rate": 7.437920901885073e-07, + "loss": 0.3223, + "step": 1975 + }, + { + "epoch": 0.35797101449275365, + "grad_norm": 4.100429319835068, + "learning_rate": 7.435358987114874e-07, + "loss": 0.3652, + "step": 1976 + }, + { + "epoch": 0.3581521739130435, + "grad_norm": 4.129137635609264, + "learning_rate": 7.432796233839036e-07, + "loss": 0.3182, + "step": 1977 + }, + { + "epoch": 0.35833333333333334, + "grad_norm": 7.570761725939312, + "learning_rate": 7.430232642939929e-07, + "loss": 0.3354, + "step": 1978 + }, + { + "epoch": 0.3585144927536232, + "grad_norm": 4.619318360142829, + "learning_rate": 7.427668215300206e-07, + "loss": 0.3604, + "step": 1979 + }, + { + "epoch": 0.358695652173913, + "grad_norm": 7.679296847995771, + "learning_rate": 7.425102951802817e-07, + "loss": 0.2763, + "step": 1980 + }, + { + "epoch": 0.35887681159420287, + "grad_norm": 5.04311052514982, + "learning_rate": 7.422536853330991e-07, + "loss": 0.3366, + "step": 1981 + }, + { + "epoch": 0.35905797101449277, + "grad_norm": 3.981794977926401, + "learning_rate": 7.419969920768248e-07, + "loss": 0.2789, + "step": 1982 + }, + { + "epoch": 0.3592391304347826, + "grad_norm": 4.216962557200483, + "learning_rate": 7.417402154998393e-07, + "loss": 0.2929, + "step": 1983 + }, + { + "epoch": 0.35942028985507246, + "grad_norm": 4.1488859722145355, + "learning_rate": 7.414833556905524e-07, + "loss": 0.338, + "step": 1984 + }, + { + "epoch": 0.3596014492753623, + "grad_norm": 4.934740187518044, + "learning_rate": 7.41226412737402e-07, + "loss": 0.286, + "step": 1985 + }, + { + "epoch": 0.35978260869565215, + "grad_norm": 3.9615401399550736, + "learning_rate": 7.409693867288547e-07, + "loss": 0.3377, + "step": 1986 + }, + { + "epoch": 0.35996376811594205, + "grad_norm": 10.247830981070214, + "learning_rate": 7.407122777534058e-07, + "loss": 0.3495, + "step": 1987 + }, + { + "epoch": 0.3601449275362319, + "grad_norm": 4.051189792203562, + "learning_rate": 7.40455085899579e-07, + "loss": 0.364, + "step": 1988 + }, + { + "epoch": 0.36032608695652174, + "grad_norm": 5.035068085203559, + "learning_rate": 7.401978112559271e-07, + "loss": 0.32, + "step": 1989 + }, + { + "epoch": 0.3605072463768116, + "grad_norm": 7.652800777619602, + "learning_rate": 7.399404539110304e-07, + "loss": 0.3494, + "step": 1990 + }, + { + "epoch": 0.36068840579710143, + "grad_norm": 12.323705836928221, + "learning_rate": 7.396830139534988e-07, + "loss": 0.4155, + "step": 1991 + }, + { + "epoch": 0.36086956521739133, + "grad_norm": 3.422811390821941, + "learning_rate": 7.394254914719697e-07, + "loss": 0.3353, + "step": 1992 + }, + { + "epoch": 0.3610507246376812, + "grad_norm": 5.878923700656869, + "learning_rate": 7.391678865551096e-07, + "loss": 0.3138, + "step": 1993 + }, + { + "epoch": 0.361231884057971, + "grad_norm": 3.078172283599551, + "learning_rate": 7.389101992916129e-07, + "loss": 0.2726, + "step": 1994 + }, + { + "epoch": 0.36141304347826086, + "grad_norm": 7.297137082836424, + "learning_rate": 7.386524297702025e-07, + "loss": 0.3261, + "step": 1995 + }, + { + "epoch": 0.3615942028985507, + "grad_norm": 5.497105598108665, + "learning_rate": 7.3839457807963e-07, + "loss": 0.2926, + "step": 1996 + }, + { + "epoch": 0.36177536231884055, + "grad_norm": 4.306504039927819, + "learning_rate": 7.381366443086746e-07, + "loss": 0.3333, + "step": 1997 + }, + { + "epoch": 0.36195652173913045, + "grad_norm": 4.7975678003784745, + "learning_rate": 7.378786285461441e-07, + "loss": 0.3125, + "step": 1998 + }, + { + "epoch": 0.3621376811594203, + "grad_norm": 3.8304666735680795, + "learning_rate": 7.376205308808751e-07, + "loss": 0.3451, + "step": 1999 + }, + { + "epoch": 0.36231884057971014, + "grad_norm": 4.245393270464726, + "learning_rate": 7.37362351401731e-07, + "loss": 0.3298, + "step": 2000 + }, + { + "epoch": 0.36231884057971014, + "eval_loss": 0.3128125071525574, + "eval_runtime": 9.7582, + "eval_samples_per_second": 51.239, + "eval_steps_per_second": 0.102, + "step": 2000 + }, + { + "epoch": 0.3625, + "grad_norm": 3.0270586053648496, + "learning_rate": 7.371040901976049e-07, + "loss": 0.2622, + "step": 2001 + }, + { + "epoch": 0.36268115942028983, + "grad_norm": 3.8414369889881725, + "learning_rate": 7.368457473574171e-07, + "loss": 0.2963, + "step": 2002 + }, + { + "epoch": 0.36286231884057973, + "grad_norm": 5.497425488685455, + "learning_rate": 7.365873229701163e-07, + "loss": 0.3499, + "step": 2003 + }, + { + "epoch": 0.3630434782608696, + "grad_norm": 5.631438760538479, + "learning_rate": 7.363288171246792e-07, + "loss": 0.3811, + "step": 2004 + }, + { + "epoch": 0.3632246376811594, + "grad_norm": 5.966784931572397, + "learning_rate": 7.360702299101107e-07, + "loss": 0.2722, + "step": 2005 + }, + { + "epoch": 0.36340579710144927, + "grad_norm": 3.687933646458537, + "learning_rate": 7.358115614154433e-07, + "loss": 0.3185, + "step": 2006 + }, + { + "epoch": 0.3635869565217391, + "grad_norm": 6.581873337166921, + "learning_rate": 7.355528117297383e-07, + "loss": 0.3166, + "step": 2007 + }, + { + "epoch": 0.36376811594202896, + "grad_norm": 6.170943611267692, + "learning_rate": 7.352939809420839e-07, + "loss": 0.3138, + "step": 2008 + }, + { + "epoch": 0.36394927536231886, + "grad_norm": 8.952168700024632, + "learning_rate": 7.350350691415971e-07, + "loss": 0.3372, + "step": 2009 + }, + { + "epoch": 0.3641304347826087, + "grad_norm": 11.028699099346055, + "learning_rate": 7.347760764174224e-07, + "loss": 0.3206, + "step": 2010 + }, + { + "epoch": 0.36431159420289855, + "grad_norm": 14.133079621169708, + "learning_rate": 7.345170028587322e-07, + "loss": 0.4056, + "step": 2011 + }, + { + "epoch": 0.3644927536231884, + "grad_norm": 3.2778698499876784, + "learning_rate": 7.342578485547266e-07, + "loss": 0.3339, + "step": 2012 + }, + { + "epoch": 0.36467391304347824, + "grad_norm": 6.553533744716087, + "learning_rate": 7.339986135946341e-07, + "loss": 0.3341, + "step": 2013 + }, + { + "epoch": 0.36485507246376814, + "grad_norm": 4.9691193087276, + "learning_rate": 7.337392980677099e-07, + "loss": 0.3122, + "step": 2014 + }, + { + "epoch": 0.365036231884058, + "grad_norm": 5.36577648784817, + "learning_rate": 7.334799020632381e-07, + "loss": 0.2962, + "step": 2015 + }, + { + "epoch": 0.3652173913043478, + "grad_norm": 5.238857688780358, + "learning_rate": 7.332204256705298e-07, + "loss": 0.2906, + "step": 2016 + }, + { + "epoch": 0.36539855072463767, + "grad_norm": 4.505102622026442, + "learning_rate": 7.329608689789239e-07, + "loss": 0.3501, + "step": 2017 + }, + { + "epoch": 0.3655797101449275, + "grad_norm": 6.525304426787631, + "learning_rate": 7.327012320777869e-07, + "loss": 0.3028, + "step": 2018 + }, + { + "epoch": 0.3657608695652174, + "grad_norm": 4.958368122970147, + "learning_rate": 7.324415150565132e-07, + "loss": 0.2903, + "step": 2019 + }, + { + "epoch": 0.36594202898550726, + "grad_norm": 3.309716557522381, + "learning_rate": 7.321817180045244e-07, + "loss": 0.3021, + "step": 2020 + }, + { + "epoch": 0.3661231884057971, + "grad_norm": 4.354441746693979, + "learning_rate": 7.319218410112703e-07, + "loss": 0.3676, + "step": 2021 + }, + { + "epoch": 0.36630434782608695, + "grad_norm": 5.469423312657493, + "learning_rate": 7.316618841662272e-07, + "loss": 0.2757, + "step": 2022 + }, + { + "epoch": 0.3664855072463768, + "grad_norm": 4.344599867273378, + "learning_rate": 7.314018475588999e-07, + "loss": 0.3359, + "step": 2023 + }, + { + "epoch": 0.36666666666666664, + "grad_norm": 4.28121711090224, + "learning_rate": 7.311417312788199e-07, + "loss": 0.3145, + "step": 2024 + }, + { + "epoch": 0.36684782608695654, + "grad_norm": 3.693167465141958, + "learning_rate": 7.308815354155467e-07, + "loss": 0.3191, + "step": 2025 + }, + { + "epoch": 0.3670289855072464, + "grad_norm": 3.327928016514394, + "learning_rate": 7.306212600586672e-07, + "loss": 0.3119, + "step": 2026 + }, + { + "epoch": 0.36721014492753623, + "grad_norm": 2.960953077757847, + "learning_rate": 7.303609052977949e-07, + "loss": 0.251, + "step": 2027 + }, + { + "epoch": 0.3673913043478261, + "grad_norm": 5.6379815879269755, + "learning_rate": 7.301004712225715e-07, + "loss": 0.3292, + "step": 2028 + }, + { + "epoch": 0.3675724637681159, + "grad_norm": 4.153505492287318, + "learning_rate": 7.298399579226656e-07, + "loss": 0.2939, + "step": 2029 + }, + { + "epoch": 0.3677536231884058, + "grad_norm": 3.6907010172366044, + "learning_rate": 7.295793654877731e-07, + "loss": 0.3217, + "step": 2030 + }, + { + "epoch": 0.36793478260869567, + "grad_norm": 6.660776135019311, + "learning_rate": 7.293186940076175e-07, + "loss": 0.3063, + "step": 2031 + }, + { + "epoch": 0.3681159420289855, + "grad_norm": 8.058889248374719, + "learning_rate": 7.290579435719489e-07, + "loss": 0.2703, + "step": 2032 + }, + { + "epoch": 0.36829710144927535, + "grad_norm": 4.568875412446538, + "learning_rate": 7.287971142705449e-07, + "loss": 0.3111, + "step": 2033 + }, + { + "epoch": 0.3684782608695652, + "grad_norm": 5.285773026662832, + "learning_rate": 7.285362061932106e-07, + "loss": 0.3599, + "step": 2034 + }, + { + "epoch": 0.3686594202898551, + "grad_norm": 4.559084104330743, + "learning_rate": 7.282752194297774e-07, + "loss": 0.3152, + "step": 2035 + }, + { + "epoch": 0.36884057971014494, + "grad_norm": 4.003909428970958, + "learning_rate": 7.280141540701048e-07, + "loss": 0.3255, + "step": 2036 + }, + { + "epoch": 0.3690217391304348, + "grad_norm": 6.536513211069695, + "learning_rate": 7.277530102040787e-07, + "loss": 0.3642, + "step": 2037 + }, + { + "epoch": 0.36920289855072463, + "grad_norm": 3.4631942393304374, + "learning_rate": 7.274917879216119e-07, + "loss": 0.2774, + "step": 2038 + }, + { + "epoch": 0.3693840579710145, + "grad_norm": 3.9895088509639742, + "learning_rate": 7.272304873126446e-07, + "loss": 0.3041, + "step": 2039 + }, + { + "epoch": 0.3695652173913043, + "grad_norm": 3.8860915610883393, + "learning_rate": 7.26969108467144e-07, + "loss": 0.3497, + "step": 2040 + }, + { + "epoch": 0.3697463768115942, + "grad_norm": 3.8852716416302155, + "learning_rate": 7.267076514751038e-07, + "loss": 0.33, + "step": 2041 + }, + { + "epoch": 0.36992753623188407, + "grad_norm": 9.728558434089113, + "learning_rate": 7.264461164265452e-07, + "loss": 0.3249, + "step": 2042 + }, + { + "epoch": 0.3701086956521739, + "grad_norm": 3.790370008020601, + "learning_rate": 7.261845034115157e-07, + "loss": 0.2957, + "step": 2043 + }, + { + "epoch": 0.37028985507246376, + "grad_norm": 5.449015758986728, + "learning_rate": 7.2592281252009e-07, + "loss": 0.2465, + "step": 2044 + }, + { + "epoch": 0.3704710144927536, + "grad_norm": 7.623878282007046, + "learning_rate": 7.256610438423695e-07, + "loss": 0.3091, + "step": 2045 + }, + { + "epoch": 0.3706521739130435, + "grad_norm": 5.066301414351345, + "learning_rate": 7.253991974684824e-07, + "loss": 0.2812, + "step": 2046 + }, + { + "epoch": 0.37083333333333335, + "grad_norm": 4.117911446775097, + "learning_rate": 7.251372734885836e-07, + "loss": 0.3156, + "step": 2047 + }, + { + "epoch": 0.3710144927536232, + "grad_norm": 5.99825679691614, + "learning_rate": 7.248752719928551e-07, + "loss": 0.2898, + "step": 2048 + }, + { + "epoch": 0.37119565217391304, + "grad_norm": 5.49201968945519, + "learning_rate": 7.246131930715045e-07, + "loss": 0.3816, + "step": 2049 + }, + { + "epoch": 0.3713768115942029, + "grad_norm": 4.145233219363318, + "learning_rate": 7.243510368147676e-07, + "loss": 0.2585, + "step": 2050 + }, + { + "epoch": 0.3715579710144927, + "grad_norm": 7.717990554087374, + "learning_rate": 7.240888033129056e-07, + "loss": 0.3192, + "step": 2051 + }, + { + "epoch": 0.3717391304347826, + "grad_norm": 8.169066793495674, + "learning_rate": 7.23826492656207e-07, + "loss": 0.3848, + "step": 2052 + }, + { + "epoch": 0.3719202898550725, + "grad_norm": 6.400391417922713, + "learning_rate": 7.235641049349865e-07, + "loss": 0.3492, + "step": 2053 + }, + { + "epoch": 0.3721014492753623, + "grad_norm": 3.931827679268894, + "learning_rate": 7.233016402395852e-07, + "loss": 0.3179, + "step": 2054 + }, + { + "epoch": 0.37228260869565216, + "grad_norm": 4.5048953586220355, + "learning_rate": 7.230390986603712e-07, + "loss": 0.3062, + "step": 2055 + }, + { + "epoch": 0.372463768115942, + "grad_norm": 4.645952094260839, + "learning_rate": 7.227764802877389e-07, + "loss": 0.3712, + "step": 2056 + }, + { + "epoch": 0.3726449275362319, + "grad_norm": 9.515911650410436, + "learning_rate": 7.225137852121086e-07, + "loss": 0.3331, + "step": 2057 + }, + { + "epoch": 0.37282608695652175, + "grad_norm": 4.442014684996327, + "learning_rate": 7.22251013523928e-07, + "loss": 0.355, + "step": 2058 + }, + { + "epoch": 0.3730072463768116, + "grad_norm": 9.383218319731174, + "learning_rate": 7.219881653136704e-07, + "loss": 0.3008, + "step": 2059 + }, + { + "epoch": 0.37318840579710144, + "grad_norm": 7.457139813990069, + "learning_rate": 7.217252406718355e-07, + "loss": 0.309, + "step": 2060 + }, + { + "epoch": 0.3733695652173913, + "grad_norm": 4.133345550164097, + "learning_rate": 7.214622396889499e-07, + "loss": 0.2967, + "step": 2061 + }, + { + "epoch": 0.3735507246376812, + "grad_norm": 4.823880286761828, + "learning_rate": 7.211991624555657e-07, + "loss": 0.2686, + "step": 2062 + }, + { + "epoch": 0.37373188405797103, + "grad_norm": 4.739082202303997, + "learning_rate": 7.209360090622618e-07, + "loss": 0.3135, + "step": 2063 + }, + { + "epoch": 0.3739130434782609, + "grad_norm": 11.611386841223215, + "learning_rate": 7.206727795996433e-07, + "loss": 0.3756, + "step": 2064 + }, + { + "epoch": 0.3740942028985507, + "grad_norm": 11.806555837798529, + "learning_rate": 7.204094741583412e-07, + "loss": 0.3278, + "step": 2065 + }, + { + "epoch": 0.37427536231884057, + "grad_norm": 11.27800619214121, + "learning_rate": 7.201460928290128e-07, + "loss": 0.3615, + "step": 2066 + }, + { + "epoch": 0.3744565217391304, + "grad_norm": 8.62118823112314, + "learning_rate": 7.198826357023415e-07, + "loss": 0.3564, + "step": 2067 + }, + { + "epoch": 0.3746376811594203, + "grad_norm": 11.727212544956975, + "learning_rate": 7.196191028690369e-07, + "loss": 0.3348, + "step": 2068 + }, + { + "epoch": 0.37481884057971016, + "grad_norm": 8.065242909197162, + "learning_rate": 7.193554944198347e-07, + "loss": 0.2963, + "step": 2069 + }, + { + "epoch": 0.375, + "grad_norm": 4.153451890912459, + "learning_rate": 7.190918104454963e-07, + "loss": 0.3575, + "step": 2070 + }, + { + "epoch": 0.37518115942028984, + "grad_norm": 3.568962249415379, + "learning_rate": 7.188280510368096e-07, + "loss": 0.3228, + "step": 2071 + }, + { + "epoch": 0.3753623188405797, + "grad_norm": 4.0733749588043615, + "learning_rate": 7.18564216284588e-07, + "loss": 0.2936, + "step": 2072 + }, + { + "epoch": 0.3755434782608696, + "grad_norm": 4.372754958251993, + "learning_rate": 7.183003062796713e-07, + "loss": 0.3703, + "step": 2073 + }, + { + "epoch": 0.37572463768115943, + "grad_norm": 6.049029820514382, + "learning_rate": 7.180363211129248e-07, + "loss": 0.344, + "step": 2074 + }, + { + "epoch": 0.3759057971014493, + "grad_norm": 15.136135414834653, + "learning_rate": 7.177722608752398e-07, + "loss": 0.3454, + "step": 2075 + }, + { + "epoch": 0.3760869565217391, + "grad_norm": 3.398013820032196, + "learning_rate": 7.175081256575335e-07, + "loss": 0.2539, + "step": 2076 + }, + { + "epoch": 0.37626811594202897, + "grad_norm": 8.8830238308733, + "learning_rate": 7.17243915550749e-07, + "loss": 0.3383, + "step": 2077 + }, + { + "epoch": 0.3764492753623188, + "grad_norm": 9.92106288135896, + "learning_rate": 7.169796306458551e-07, + "loss": 0.3244, + "step": 2078 + }, + { + "epoch": 0.3766304347826087, + "grad_norm": 7.768034758252296, + "learning_rate": 7.167152710338462e-07, + "loss": 0.3178, + "step": 2079 + }, + { + "epoch": 0.37681159420289856, + "grad_norm": 8.926891587456375, + "learning_rate": 7.164508368057428e-07, + "loss": 0.2653, + "step": 2080 + }, + { + "epoch": 0.3769927536231884, + "grad_norm": 4.380748384571386, + "learning_rate": 7.161863280525907e-07, + "loss": 0.3743, + "step": 2081 + }, + { + "epoch": 0.37717391304347825, + "grad_norm": 5.274089189912098, + "learning_rate": 7.159217448654614e-07, + "loss": 0.2967, + "step": 2082 + }, + { + "epoch": 0.3773550724637681, + "grad_norm": 4.3655903354976315, + "learning_rate": 7.156570873354525e-07, + "loss": 0.2751, + "step": 2083 + }, + { + "epoch": 0.377536231884058, + "grad_norm": 3.696656201101535, + "learning_rate": 7.153923555536865e-07, + "loss": 0.2997, + "step": 2084 + }, + { + "epoch": 0.37771739130434784, + "grad_norm": 8.018624634293852, + "learning_rate": 7.151275496113119e-07, + "loss": 0.3689, + "step": 2085 + }, + { + "epoch": 0.3778985507246377, + "grad_norm": 8.433384008571464, + "learning_rate": 7.148626695995027e-07, + "loss": 0.3384, + "step": 2086 + }, + { + "epoch": 0.3780797101449275, + "grad_norm": 7.272546899288065, + "learning_rate": 7.145977156094584e-07, + "loss": 0.2984, + "step": 2087 + }, + { + "epoch": 0.3782608695652174, + "grad_norm": 10.852528127690967, + "learning_rate": 7.143326877324037e-07, + "loss": 0.3497, + "step": 2088 + }, + { + "epoch": 0.3784420289855073, + "grad_norm": 13.87037781322862, + "learning_rate": 7.140675860595892e-07, + "loss": 0.3098, + "step": 2089 + }, + { + "epoch": 0.3786231884057971, + "grad_norm": 12.301017897790347, + "learning_rate": 7.138024106822904e-07, + "loss": 0.3711, + "step": 2090 + }, + { + "epoch": 0.37880434782608696, + "grad_norm": 7.917130556191393, + "learning_rate": 7.135371616918088e-07, + "loss": 0.2796, + "step": 2091 + }, + { + "epoch": 0.3789855072463768, + "grad_norm": 3.9873882203685094, + "learning_rate": 7.132718391794704e-07, + "loss": 0.2603, + "step": 2092 + }, + { + "epoch": 0.37916666666666665, + "grad_norm": 5.311367551899578, + "learning_rate": 7.130064432366274e-07, + "loss": 0.2735, + "step": 2093 + }, + { + "epoch": 0.3793478260869565, + "grad_norm": 5.228149985285873, + "learning_rate": 7.127409739546568e-07, + "loss": 0.3029, + "step": 2094 + }, + { + "epoch": 0.3795289855072464, + "grad_norm": 4.001570027615623, + "learning_rate": 7.124754314249608e-07, + "loss": 0.3132, + "step": 2095 + }, + { + "epoch": 0.37971014492753624, + "grad_norm": 4.687844615547635, + "learning_rate": 7.122098157389671e-07, + "loss": 0.334, + "step": 2096 + }, + { + "epoch": 0.3798913043478261, + "grad_norm": 6.1324919335590575, + "learning_rate": 7.119441269881283e-07, + "loss": 0.2968, + "step": 2097 + }, + { + "epoch": 0.38007246376811593, + "grad_norm": 6.693705083215806, + "learning_rate": 7.116783652639224e-07, + "loss": 0.2996, + "step": 2098 + }, + { + "epoch": 0.3802536231884058, + "grad_norm": 6.938585015310847, + "learning_rate": 7.114125306578525e-07, + "loss": 0.3004, + "step": 2099 + }, + { + "epoch": 0.3804347826086957, + "grad_norm": 3.3305523385032423, + "learning_rate": 7.111466232614465e-07, + "loss": 0.3071, + "step": 2100 + }, + { + "epoch": 0.3804347826086957, + "eval_loss": 0.2997343838214874, + "eval_runtime": 9.772, + "eval_samples_per_second": 51.167, + "eval_steps_per_second": 0.102, + "step": 2100 + }, + { + "epoch": 0.3806159420289855, + "grad_norm": 3.5937342311261893, + "learning_rate": 7.108806431662577e-07, + "loss": 0.3115, + "step": 2101 + }, + { + "epoch": 0.38079710144927537, + "grad_norm": 5.733302048934132, + "learning_rate": 7.106145904638642e-07, + "loss": 0.3015, + "step": 2102 + }, + { + "epoch": 0.3809782608695652, + "grad_norm": 3.275610822661634, + "learning_rate": 7.103484652458693e-07, + "loss": 0.3269, + "step": 2103 + }, + { + "epoch": 0.38115942028985506, + "grad_norm": 3.4731681017381923, + "learning_rate": 7.100822676039013e-07, + "loss": 0.3345, + "step": 2104 + }, + { + "epoch": 0.3813405797101449, + "grad_norm": 9.474435185831744, + "learning_rate": 7.09815997629613e-07, + "loss": 0.2991, + "step": 2105 + }, + { + "epoch": 0.3815217391304348, + "grad_norm": 4.873748219443486, + "learning_rate": 7.095496554146827e-07, + "loss": 0.2983, + "step": 2106 + }, + { + "epoch": 0.38170289855072465, + "grad_norm": 3.5377240835984316, + "learning_rate": 7.092832410508132e-07, + "loss": 0.3148, + "step": 2107 + }, + { + "epoch": 0.3818840579710145, + "grad_norm": 4.693249675105682, + "learning_rate": 7.090167546297321e-07, + "loss": 0.3024, + "step": 2108 + }, + { + "epoch": 0.38206521739130433, + "grad_norm": 3.6605558938989775, + "learning_rate": 7.087501962431921e-07, + "loss": 0.2876, + "step": 2109 + }, + { + "epoch": 0.3822463768115942, + "grad_norm": 4.596755121653797, + "learning_rate": 7.084835659829705e-07, + "loss": 0.3019, + "step": 2110 + }, + { + "epoch": 0.3824275362318841, + "grad_norm": 3.686897769046088, + "learning_rate": 7.082168639408691e-07, + "loss": 0.2493, + "step": 2111 + }, + { + "epoch": 0.3826086956521739, + "grad_norm": 4.072969934127878, + "learning_rate": 7.079500902087152e-07, + "loss": 0.3324, + "step": 2112 + }, + { + "epoch": 0.38278985507246377, + "grad_norm": 6.399363765959017, + "learning_rate": 7.076832448783596e-07, + "loss": 0.3882, + "step": 2113 + }, + { + "epoch": 0.3829710144927536, + "grad_norm": 3.4042653262831815, + "learning_rate": 7.07416328041679e-07, + "loss": 0.2881, + "step": 2114 + }, + { + "epoch": 0.38315217391304346, + "grad_norm": 4.078562472526981, + "learning_rate": 7.071493397905739e-07, + "loss": 0.2971, + "step": 2115 + }, + { + "epoch": 0.38333333333333336, + "grad_norm": 4.152659854568918, + "learning_rate": 7.068822802169696e-07, + "loss": 0.2765, + "step": 2116 + }, + { + "epoch": 0.3835144927536232, + "grad_norm": 5.370460060434913, + "learning_rate": 7.066151494128158e-07, + "loss": 0.3091, + "step": 2117 + }, + { + "epoch": 0.38369565217391305, + "grad_norm": 4.093914646720985, + "learning_rate": 7.063479474700875e-07, + "loss": 0.3264, + "step": 2118 + }, + { + "epoch": 0.3838768115942029, + "grad_norm": 5.681256537201822, + "learning_rate": 7.060806744807828e-07, + "loss": 0.2866, + "step": 2119 + }, + { + "epoch": 0.38405797101449274, + "grad_norm": 6.630432321189139, + "learning_rate": 7.058133305369256e-07, + "loss": 0.3312, + "step": 2120 + }, + { + "epoch": 0.3842391304347826, + "grad_norm": 3.9654397095708735, + "learning_rate": 7.055459157305637e-07, + "loss": 0.344, + "step": 2121 + }, + { + "epoch": 0.3844202898550725, + "grad_norm": 11.058387699729824, + "learning_rate": 7.052784301537688e-07, + "loss": 0.3214, + "step": 2122 + }, + { + "epoch": 0.38460144927536233, + "grad_norm": 9.341950315149177, + "learning_rate": 7.05010873898638e-07, + "loss": 0.3778, + "step": 2123 + }, + { + "epoch": 0.3847826086956522, + "grad_norm": 10.937001758481303, + "learning_rate": 7.047432470572918e-07, + "loss": 0.3831, + "step": 2124 + }, + { + "epoch": 0.384963768115942, + "grad_norm": 4.436601013218096, + "learning_rate": 7.044755497218756e-07, + "loss": 0.2963, + "step": 2125 + }, + { + "epoch": 0.38514492753623186, + "grad_norm": 3.4628776872665328, + "learning_rate": 7.042077819845588e-07, + "loss": 0.2726, + "step": 2126 + }, + { + "epoch": 0.38532608695652176, + "grad_norm": 10.3383413703353, + "learning_rate": 7.039399439375352e-07, + "loss": 0.3159, + "step": 2127 + }, + { + "epoch": 0.3855072463768116, + "grad_norm": 5.6294194173317, + "learning_rate": 7.036720356730225e-07, + "loss": 0.3329, + "step": 2128 + }, + { + "epoch": 0.38568840579710145, + "grad_norm": 2.9282256130093156, + "learning_rate": 7.03404057283263e-07, + "loss": 0.2479, + "step": 2129 + }, + { + "epoch": 0.3858695652173913, + "grad_norm": 4.029948418252385, + "learning_rate": 7.031360088605227e-07, + "loss": 0.3576, + "step": 2130 + }, + { + "epoch": 0.38605072463768114, + "grad_norm": 4.766907554519569, + "learning_rate": 7.028678904970923e-07, + "loss": 0.4047, + "step": 2131 + }, + { + "epoch": 0.38623188405797104, + "grad_norm": 7.871698129751128, + "learning_rate": 7.025997022852856e-07, + "loss": 0.3535, + "step": 2132 + }, + { + "epoch": 0.3864130434782609, + "grad_norm": 4.269659727960499, + "learning_rate": 7.023314443174418e-07, + "loss": 0.2599, + "step": 2133 + }, + { + "epoch": 0.38659420289855073, + "grad_norm": 4.980339697472714, + "learning_rate": 7.02063116685923e-07, + "loss": 0.3186, + "step": 2134 + }, + { + "epoch": 0.3867753623188406, + "grad_norm": 4.826318513402168, + "learning_rate": 7.017947194831156e-07, + "loss": 0.3271, + "step": 2135 + }, + { + "epoch": 0.3869565217391304, + "grad_norm": 8.225636588927411, + "learning_rate": 7.015262528014303e-07, + "loss": 0.3381, + "step": 2136 + }, + { + "epoch": 0.38713768115942027, + "grad_norm": 4.149261097093593, + "learning_rate": 7.012577167333013e-07, + "loss": 0.3179, + "step": 2137 + }, + { + "epoch": 0.38731884057971017, + "grad_norm": 4.762094949943608, + "learning_rate": 7.009891113711868e-07, + "loss": 0.343, + "step": 2138 + }, + { + "epoch": 0.3875, + "grad_norm": 10.001299244383034, + "learning_rate": 7.00720436807569e-07, + "loss": 0.3435, + "step": 2139 + }, + { + "epoch": 0.38768115942028986, + "grad_norm": 6.122362279332189, + "learning_rate": 7.004516931349535e-07, + "loss": 0.3444, + "step": 2140 + }, + { + "epoch": 0.3878623188405797, + "grad_norm": 5.673363310153358, + "learning_rate": 7.001828804458707e-07, + "loss": 0.2896, + "step": 2141 + }, + { + "epoch": 0.38804347826086955, + "grad_norm": 4.365146468695218, + "learning_rate": 6.999139988328735e-07, + "loss": 0.2904, + "step": 2142 + }, + { + "epoch": 0.38822463768115945, + "grad_norm": 4.952228499391306, + "learning_rate": 6.996450483885392e-07, + "loss": 0.3213, + "step": 2143 + }, + { + "epoch": 0.3884057971014493, + "grad_norm": 4.283859416825511, + "learning_rate": 6.993760292054689e-07, + "loss": 0.3162, + "step": 2144 + }, + { + "epoch": 0.38858695652173914, + "grad_norm": 5.565955989672552, + "learning_rate": 6.991069413762871e-07, + "loss": 0.2863, + "step": 2145 + }, + { + "epoch": 0.388768115942029, + "grad_norm": 5.840585925510722, + "learning_rate": 6.988377849936419e-07, + "loss": 0.3013, + "step": 2146 + }, + { + "epoch": 0.3889492753623188, + "grad_norm": 2.985844384205347, + "learning_rate": 6.985685601502054e-07, + "loss": 0.234, + "step": 2147 + }, + { + "epoch": 0.38913043478260867, + "grad_norm": 7.52349361444468, + "learning_rate": 6.982992669386726e-07, + "loss": 0.3636, + "step": 2148 + }, + { + "epoch": 0.38931159420289857, + "grad_norm": 8.179994075387192, + "learning_rate": 6.980299054517627e-07, + "loss": 0.3079, + "step": 2149 + }, + { + "epoch": 0.3894927536231884, + "grad_norm": 7.3953456035424745, + "learning_rate": 6.977604757822181e-07, + "loss": 0.3696, + "step": 2150 + }, + { + "epoch": 0.38967391304347826, + "grad_norm": 10.231994205623613, + "learning_rate": 6.974909780228046e-07, + "loss": 0.278, + "step": 2151 + }, + { + "epoch": 0.3898550724637681, + "grad_norm": 3.6122227075145936, + "learning_rate": 6.972214122663117e-07, + "loss": 0.3023, + "step": 2152 + }, + { + "epoch": 0.39003623188405795, + "grad_norm": 4.98461583138587, + "learning_rate": 6.969517786055522e-07, + "loss": 0.3162, + "step": 2153 + }, + { + "epoch": 0.39021739130434785, + "grad_norm": 4.528627814814437, + "learning_rate": 6.966820771333619e-07, + "loss": 0.3128, + "step": 2154 + }, + { + "epoch": 0.3903985507246377, + "grad_norm": 6.153493328734538, + "learning_rate": 6.964123079426008e-07, + "loss": 0.3225, + "step": 2155 + }, + { + "epoch": 0.39057971014492754, + "grad_norm": 6.464676154023411, + "learning_rate": 6.961424711261514e-07, + "loss": 0.3465, + "step": 2156 + }, + { + "epoch": 0.3907608695652174, + "grad_norm": 7.425681674557852, + "learning_rate": 6.958725667769197e-07, + "loss": 0.346, + "step": 2157 + }, + { + "epoch": 0.39094202898550723, + "grad_norm": 3.821427599782369, + "learning_rate": 6.956025949878353e-07, + "loss": 0.2852, + "step": 2158 + }, + { + "epoch": 0.39112318840579713, + "grad_norm": 9.394635561319111, + "learning_rate": 6.953325558518507e-07, + "loss": 0.2609, + "step": 2159 + }, + { + "epoch": 0.391304347826087, + "grad_norm": 4.747575940249641, + "learning_rate": 6.950624494619415e-07, + "loss": 0.2659, + "step": 2160 + }, + { + "epoch": 0.3914855072463768, + "grad_norm": 7.480201764047443, + "learning_rate": 6.947922759111069e-07, + "loss": 0.3117, + "step": 2161 + }, + { + "epoch": 0.39166666666666666, + "grad_norm": 7.366875731216558, + "learning_rate": 6.945220352923685e-07, + "loss": 0.3016, + "step": 2162 + }, + { + "epoch": 0.3918478260869565, + "grad_norm": 9.656088497588994, + "learning_rate": 6.942517276987719e-07, + "loss": 0.2977, + "step": 2163 + }, + { + "epoch": 0.39202898550724635, + "grad_norm": 5.201203908319206, + "learning_rate": 6.939813532233849e-07, + "loss": 0.2704, + "step": 2164 + }, + { + "epoch": 0.39221014492753625, + "grad_norm": 3.9137337075989955, + "learning_rate": 6.93710911959299e-07, + "loss": 0.2855, + "step": 2165 + }, + { + "epoch": 0.3923913043478261, + "grad_norm": 3.540516257074366, + "learning_rate": 6.934404039996283e-07, + "loss": 0.2849, + "step": 2166 + }, + { + "epoch": 0.39257246376811594, + "grad_norm": 4.534143848571846, + "learning_rate": 6.931698294375099e-07, + "loss": 0.2901, + "step": 2167 + }, + { + "epoch": 0.3927536231884058, + "grad_norm": 9.226494214506207, + "learning_rate": 6.928991883661039e-07, + "loss": 0.3448, + "step": 2168 + }, + { + "epoch": 0.39293478260869563, + "grad_norm": 5.088294842948125, + "learning_rate": 6.926284808785936e-07, + "loss": 0.348, + "step": 2169 + }, + { + "epoch": 0.39311594202898553, + "grad_norm": 3.7941952287354437, + "learning_rate": 6.923577070681845e-07, + "loss": 0.2458, + "step": 2170 + }, + { + "epoch": 0.3932971014492754, + "grad_norm": 4.128119822105309, + "learning_rate": 6.920868670281055e-07, + "loss": 0.317, + "step": 2171 + }, + { + "epoch": 0.3934782608695652, + "grad_norm": 8.80245311259596, + "learning_rate": 6.91815960851608e-07, + "loss": 0.2327, + "step": 2172 + }, + { + "epoch": 0.39365942028985507, + "grad_norm": 6.295775077737116, + "learning_rate": 6.915449886319663e-07, + "loss": 0.3115, + "step": 2173 + }, + { + "epoch": 0.3938405797101449, + "grad_norm": 5.805682623689149, + "learning_rate": 6.912739504624776e-07, + "loss": 0.3759, + "step": 2174 + }, + { + "epoch": 0.39402173913043476, + "grad_norm": 7.582295689095818, + "learning_rate": 6.910028464364612e-07, + "loss": 0.3541, + "step": 2175 + }, + { + "epoch": 0.39420289855072466, + "grad_norm": 4.403919544543604, + "learning_rate": 6.9073167664726e-07, + "loss": 0.3095, + "step": 2176 + }, + { + "epoch": 0.3943840579710145, + "grad_norm": 4.216083694991703, + "learning_rate": 6.904604411882388e-07, + "loss": 0.3014, + "step": 2177 + }, + { + "epoch": 0.39456521739130435, + "grad_norm": 4.176160918755515, + "learning_rate": 6.901891401527854e-07, + "loss": 0.3115, + "step": 2178 + }, + { + "epoch": 0.3947463768115942, + "grad_norm": 4.3890765587182665, + "learning_rate": 6.899177736343098e-07, + "loss": 0.2708, + "step": 2179 + }, + { + "epoch": 0.39492753623188404, + "grad_norm": 4.103807331881485, + "learning_rate": 6.896463417262448e-07, + "loss": 0.2887, + "step": 2180 + }, + { + "epoch": 0.39510869565217394, + "grad_norm": 4.867160521532571, + "learning_rate": 6.893748445220457e-07, + "loss": 0.3401, + "step": 2181 + }, + { + "epoch": 0.3952898550724638, + "grad_norm": 10.806779871556616, + "learning_rate": 6.891032821151907e-07, + "loss": 0.314, + "step": 2182 + }, + { + "epoch": 0.3954710144927536, + "grad_norm": 4.679841284115658, + "learning_rate": 6.888316545991795e-07, + "loss": 0.2952, + "step": 2183 + }, + { + "epoch": 0.39565217391304347, + "grad_norm": 7.174450230651045, + "learning_rate": 6.885599620675349e-07, + "loss": 0.3337, + "step": 2184 + }, + { + "epoch": 0.3958333333333333, + "grad_norm": 6.805732276950488, + "learning_rate": 6.882882046138019e-07, + "loss": 0.3029, + "step": 2185 + }, + { + "epoch": 0.3960144927536232, + "grad_norm": 6.916376239794475, + "learning_rate": 6.88016382331548e-07, + "loss": 0.2755, + "step": 2186 + }, + { + "epoch": 0.39619565217391306, + "grad_norm": 3.1640859016069918, + "learning_rate": 6.877444953143628e-07, + "loss": 0.2586, + "step": 2187 + }, + { + "epoch": 0.3963768115942029, + "grad_norm": 5.337632469953887, + "learning_rate": 6.874725436558583e-07, + "loss": 0.3503, + "step": 2188 + }, + { + "epoch": 0.39655797101449275, + "grad_norm": 3.7614417732880985, + "learning_rate": 6.872005274496686e-07, + "loss": 0.2927, + "step": 2189 + }, + { + "epoch": 0.3967391304347826, + "grad_norm": 5.941380756939097, + "learning_rate": 6.869284467894506e-07, + "loss": 0.3042, + "step": 2190 + }, + { + "epoch": 0.39692028985507244, + "grad_norm": 7.354207197148562, + "learning_rate": 6.866563017688825e-07, + "loss": 0.2971, + "step": 2191 + }, + { + "epoch": 0.39710144927536234, + "grad_norm": 4.92479046090298, + "learning_rate": 6.863840924816654e-07, + "loss": 0.2868, + "step": 2192 + }, + { + "epoch": 0.3972826086956522, + "grad_norm": 4.140477044611321, + "learning_rate": 6.861118190215221e-07, + "loss": 0.2681, + "step": 2193 + }, + { + "epoch": 0.39746376811594203, + "grad_norm": 9.608005362186999, + "learning_rate": 6.858394814821979e-07, + "loss": 0.3176, + "step": 2194 + }, + { + "epoch": 0.3976449275362319, + "grad_norm": 3.5331177111360743, + "learning_rate": 6.855670799574593e-07, + "loss": 0.3329, + "step": 2195 + }, + { + "epoch": 0.3978260869565217, + "grad_norm": 11.36791909861226, + "learning_rate": 6.852946145410963e-07, + "loss": 0.2798, + "step": 2196 + }, + { + "epoch": 0.3980072463768116, + "grad_norm": 5.905235439283369, + "learning_rate": 6.850220853269192e-07, + "loss": 0.2823, + "step": 2197 + }, + { + "epoch": 0.39818840579710146, + "grad_norm": 6.802293435441138, + "learning_rate": 6.847494924087615e-07, + "loss": 0.2319, + "step": 2198 + }, + { + "epoch": 0.3983695652173913, + "grad_norm": 3.4871520154116094, + "learning_rate": 6.844768358804784e-07, + "loss": 0.2877, + "step": 2199 + }, + { + "epoch": 0.39855072463768115, + "grad_norm": 6.606776141129715, + "learning_rate": 6.842041158359465e-07, + "loss": 0.2839, + "step": 2200 + }, + { + "epoch": 0.39855072463768115, + "eval_loss": 0.31165623664855957, + "eval_runtime": 9.7812, + "eval_samples_per_second": 51.119, + "eval_steps_per_second": 0.102, + "step": 2200 + }, + { + "epoch": 0.398731884057971, + "grad_norm": 4.004580365436502, + "learning_rate": 6.839313323690647e-07, + "loss": 0.3235, + "step": 2201 + }, + { + "epoch": 0.39891304347826084, + "grad_norm": 9.017065743390921, + "learning_rate": 6.836584855737537e-07, + "loss": 0.3066, + "step": 2202 + }, + { + "epoch": 0.39909420289855074, + "grad_norm": 5.334253719988122, + "learning_rate": 6.833855755439561e-07, + "loss": 0.2645, + "step": 2203 + }, + { + "epoch": 0.3992753623188406, + "grad_norm": 8.777738221400382, + "learning_rate": 6.831126023736358e-07, + "loss": 0.2669, + "step": 2204 + }, + { + "epoch": 0.39945652173913043, + "grad_norm": 3.8197175856336165, + "learning_rate": 6.828395661567789e-07, + "loss": 0.2713, + "step": 2205 + }, + { + "epoch": 0.3996376811594203, + "grad_norm": 4.48874189922189, + "learning_rate": 6.825664669873931e-07, + "loss": 0.3084, + "step": 2206 + }, + { + "epoch": 0.3998188405797101, + "grad_norm": 5.080293220585723, + "learning_rate": 6.822933049595077e-07, + "loss": 0.36, + "step": 2207 + }, + { + "epoch": 0.4, + "grad_norm": 4.696484334787702, + "learning_rate": 6.820200801671736e-07, + "loss": 0.2526, + "step": 2208 + }, + { + "epoch": 0.40018115942028987, + "grad_norm": 3.5370597557875625, + "learning_rate": 6.817467927044635e-07, + "loss": 0.2644, + "step": 2209 + }, + { + "epoch": 0.4003623188405797, + "grad_norm": 4.425550127312261, + "learning_rate": 6.814734426654716e-07, + "loss": 0.3272, + "step": 2210 + }, + { + "epoch": 0.40054347826086956, + "grad_norm": 5.760714598972042, + "learning_rate": 6.812000301443135e-07, + "loss": 0.2659, + "step": 2211 + }, + { + "epoch": 0.4007246376811594, + "grad_norm": 7.58433814276553, + "learning_rate": 6.809265552351264e-07, + "loss": 0.3356, + "step": 2212 + }, + { + "epoch": 0.4009057971014493, + "grad_norm": 4.332188856177377, + "learning_rate": 6.806530180320693e-07, + "loss": 0.2903, + "step": 2213 + }, + { + "epoch": 0.40108695652173915, + "grad_norm": 7.1163274548067275, + "learning_rate": 6.80379418629322e-07, + "loss": 0.3761, + "step": 2214 + }, + { + "epoch": 0.401268115942029, + "grad_norm": 4.49422129917352, + "learning_rate": 6.801057571210862e-07, + "loss": 0.3243, + "step": 2215 + }, + { + "epoch": 0.40144927536231884, + "grad_norm": 4.773996975382843, + "learning_rate": 6.798320336015848e-07, + "loss": 0.3078, + "step": 2216 + }, + { + "epoch": 0.4016304347826087, + "grad_norm": 6.104874246248235, + "learning_rate": 6.795582481650623e-07, + "loss": 0.3101, + "step": 2217 + }, + { + "epoch": 0.4018115942028985, + "grad_norm": 3.962353351560333, + "learning_rate": 6.792844009057842e-07, + "loss": 0.2927, + "step": 2218 + }, + { + "epoch": 0.4019927536231884, + "grad_norm": 4.43873212167092, + "learning_rate": 6.790104919180373e-07, + "loss": 0.2672, + "step": 2219 + }, + { + "epoch": 0.40217391304347827, + "grad_norm": 8.851462258056083, + "learning_rate": 6.7873652129613e-07, + "loss": 0.3768, + "step": 2220 + }, + { + "epoch": 0.4023550724637681, + "grad_norm": 4.13758685057565, + "learning_rate": 6.784624891343914e-07, + "loss": 0.331, + "step": 2221 + }, + { + "epoch": 0.40253623188405796, + "grad_norm": 4.07969786302646, + "learning_rate": 6.781883955271722e-07, + "loss": 0.3206, + "step": 2222 + }, + { + "epoch": 0.4027173913043478, + "grad_norm": 3.4425626225475225, + "learning_rate": 6.779142405688443e-07, + "loss": 0.3099, + "step": 2223 + }, + { + "epoch": 0.4028985507246377, + "grad_norm": 4.909572420801204, + "learning_rate": 6.776400243538003e-07, + "loss": 0.2478, + "step": 2224 + }, + { + "epoch": 0.40307971014492755, + "grad_norm": 5.291819558569912, + "learning_rate": 6.773657469764542e-07, + "loss": 0.3564, + "step": 2225 + }, + { + "epoch": 0.4032608695652174, + "grad_norm": 4.373416755523693, + "learning_rate": 6.770914085312412e-07, + "loss": 0.293, + "step": 2226 + }, + { + "epoch": 0.40344202898550724, + "grad_norm": 4.09980830065468, + "learning_rate": 6.76817009112617e-07, + "loss": 0.2836, + "step": 2227 + }, + { + "epoch": 0.4036231884057971, + "grad_norm": 6.344705788752141, + "learning_rate": 6.765425488150589e-07, + "loss": 0.3275, + "step": 2228 + }, + { + "epoch": 0.40380434782608693, + "grad_norm": 4.770346520763357, + "learning_rate": 6.762680277330648e-07, + "loss": 0.3109, + "step": 2229 + }, + { + "epoch": 0.40398550724637683, + "grad_norm": 8.881697922793602, + "learning_rate": 6.759934459611534e-07, + "loss": 0.3075, + "step": 2230 + }, + { + "epoch": 0.4041666666666667, + "grad_norm": 4.1903094693543785, + "learning_rate": 6.757188035938648e-07, + "loss": 0.2944, + "step": 2231 + }, + { + "epoch": 0.4043478260869565, + "grad_norm": 4.356995256903603, + "learning_rate": 6.754441007257594e-07, + "loss": 0.3127, + "step": 2232 + }, + { + "epoch": 0.40452898550724636, + "grad_norm": 6.098313538479057, + "learning_rate": 6.751693374514192e-07, + "loss": 0.3044, + "step": 2233 + }, + { + "epoch": 0.4047101449275362, + "grad_norm": 4.91282076148066, + "learning_rate": 6.748945138654458e-07, + "loss": 0.3127, + "step": 2234 + }, + { + "epoch": 0.4048913043478261, + "grad_norm": 8.409449416572015, + "learning_rate": 6.746196300624627e-07, + "loss": 0.3011, + "step": 2235 + }, + { + "epoch": 0.40507246376811595, + "grad_norm": 3.5367485660500972, + "learning_rate": 6.743446861371137e-07, + "loss": 0.3378, + "step": 2236 + }, + { + "epoch": 0.4052536231884058, + "grad_norm": 3.5167666854013504, + "learning_rate": 6.74069682184063e-07, + "loss": 0.2645, + "step": 2237 + }, + { + "epoch": 0.40543478260869564, + "grad_norm": 4.661991601268904, + "learning_rate": 6.737946182979961e-07, + "loss": 0.403, + "step": 2238 + }, + { + "epoch": 0.4056159420289855, + "grad_norm": 4.491830335469685, + "learning_rate": 6.735194945736186e-07, + "loss": 0.3082, + "step": 2239 + }, + { + "epoch": 0.4057971014492754, + "grad_norm": 4.013795874324127, + "learning_rate": 6.732443111056572e-07, + "loss": 0.2737, + "step": 2240 + }, + { + "epoch": 0.40597826086956523, + "grad_norm": 7.642900286388456, + "learning_rate": 6.729690679888584e-07, + "loss": 0.3472, + "step": 2241 + }, + { + "epoch": 0.4061594202898551, + "grad_norm": 4.434087916359773, + "learning_rate": 6.7269376531799e-07, + "loss": 0.2744, + "step": 2242 + }, + { + "epoch": 0.4063405797101449, + "grad_norm": 5.641102829094799, + "learning_rate": 6.724184031878399e-07, + "loss": 0.3148, + "step": 2243 + }, + { + "epoch": 0.40652173913043477, + "grad_norm": 3.441437848483509, + "learning_rate": 6.721429816932169e-07, + "loss": 0.2956, + "step": 2244 + }, + { + "epoch": 0.4067028985507246, + "grad_norm": 8.894075312929761, + "learning_rate": 6.718675009289494e-07, + "loss": 0.3626, + "step": 2245 + }, + { + "epoch": 0.4068840579710145, + "grad_norm": 5.939315500932609, + "learning_rate": 6.71591960989887e-07, + "loss": 0.3075, + "step": 2246 + }, + { + "epoch": 0.40706521739130436, + "grad_norm": 6.991336955854468, + "learning_rate": 6.713163619708995e-07, + "loss": 0.3081, + "step": 2247 + }, + { + "epoch": 0.4072463768115942, + "grad_norm": 6.329886474887556, + "learning_rate": 6.71040703966877e-07, + "loss": 0.2896, + "step": 2248 + }, + { + "epoch": 0.40742753623188405, + "grad_norm": 5.583220967094046, + "learning_rate": 6.707649870727296e-07, + "loss": 0.3637, + "step": 2249 + }, + { + "epoch": 0.4076086956521739, + "grad_norm": 7.386632320792021, + "learning_rate": 6.70489211383388e-07, + "loss": 0.2961, + "step": 2250 + }, + { + "epoch": 0.4077898550724638, + "grad_norm": 3.375989649154391, + "learning_rate": 6.702133769938031e-07, + "loss": 0.3148, + "step": 2251 + }, + { + "epoch": 0.40797101449275364, + "grad_norm": 8.362479882284925, + "learning_rate": 6.699374839989462e-07, + "loss": 0.2897, + "step": 2252 + }, + { + "epoch": 0.4081521739130435, + "grad_norm": 5.169666061134393, + "learning_rate": 6.696615324938082e-07, + "loss": 0.321, + "step": 2253 + }, + { + "epoch": 0.4083333333333333, + "grad_norm": 5.569690016903751, + "learning_rate": 6.693855225734006e-07, + "loss": 0.3489, + "step": 2254 + }, + { + "epoch": 0.40851449275362317, + "grad_norm": 4.426887493077836, + "learning_rate": 6.691094543327553e-07, + "loss": 0.3335, + "step": 2255 + }, + { + "epoch": 0.40869565217391307, + "grad_norm": 7.528814173632632, + "learning_rate": 6.688333278669233e-07, + "loss": 0.2668, + "step": 2256 + }, + { + "epoch": 0.4088768115942029, + "grad_norm": 8.447861692222983, + "learning_rate": 6.685571432709768e-07, + "loss": 0.3514, + "step": 2257 + }, + { + "epoch": 0.40905797101449276, + "grad_norm": 5.551127199752848, + "learning_rate": 6.682809006400073e-07, + "loss": 0.2867, + "step": 2258 + }, + { + "epoch": 0.4092391304347826, + "grad_norm": 6.078553925398269, + "learning_rate": 6.680046000691262e-07, + "loss": 0.2964, + "step": 2259 + }, + { + "epoch": 0.40942028985507245, + "grad_norm": 4.200727215066275, + "learning_rate": 6.677282416534653e-07, + "loss": 0.2908, + "step": 2260 + }, + { + "epoch": 0.4096014492753623, + "grad_norm": 5.614888855289529, + "learning_rate": 6.67451825488176e-07, + "loss": 0.2991, + "step": 2261 + }, + { + "epoch": 0.4097826086956522, + "grad_norm": 3.685666096999919, + "learning_rate": 6.6717535166843e-07, + "loss": 0.3232, + "step": 2262 + }, + { + "epoch": 0.40996376811594204, + "grad_norm": 3.8855236972948353, + "learning_rate": 6.668988202894181e-07, + "loss": 0.3747, + "step": 2263 + }, + { + "epoch": 0.4101449275362319, + "grad_norm": 9.250525232590523, + "learning_rate": 6.666222314463518e-07, + "loss": 0.3065, + "step": 2264 + }, + { + "epoch": 0.41032608695652173, + "grad_norm": 7.635126726567272, + "learning_rate": 6.663455852344615e-07, + "loss": 0.2876, + "step": 2265 + }, + { + "epoch": 0.4105072463768116, + "grad_norm": 5.114922133890042, + "learning_rate": 6.660688817489984e-07, + "loss": 0.3087, + "step": 2266 + }, + { + "epoch": 0.4106884057971015, + "grad_norm": 8.285470410124132, + "learning_rate": 6.657921210852321e-07, + "loss": 0.2542, + "step": 2267 + }, + { + "epoch": 0.4108695652173913, + "grad_norm": 4.170803308744904, + "learning_rate": 6.655153033384531e-07, + "loss": 0.366, + "step": 2268 + }, + { + "epoch": 0.41105072463768116, + "grad_norm": 4.8761931590322565, + "learning_rate": 6.65238428603971e-07, + "loss": 0.3199, + "step": 2269 + }, + { + "epoch": 0.411231884057971, + "grad_norm": 4.143135332840739, + "learning_rate": 6.64961496977115e-07, + "loss": 0.3059, + "step": 2270 + }, + { + "epoch": 0.41141304347826085, + "grad_norm": 3.237703349654009, + "learning_rate": 6.646845085532339e-07, + "loss": 0.2816, + "step": 2271 + }, + { + "epoch": 0.4115942028985507, + "grad_norm": 8.299850469933856, + "learning_rate": 6.644074634276963e-07, + "loss": 0.3042, + "step": 2272 + }, + { + "epoch": 0.4117753623188406, + "grad_norm": 3.2974559364063927, + "learning_rate": 6.6413036169589e-07, + "loss": 0.3103, + "step": 2273 + }, + { + "epoch": 0.41195652173913044, + "grad_norm": 3.76438256733284, + "learning_rate": 6.638532034532225e-07, + "loss": 0.3385, + "step": 2274 + }, + { + "epoch": 0.4121376811594203, + "grad_norm": 5.685197955516119, + "learning_rate": 6.635759887951208e-07, + "loss": 0.3176, + "step": 2275 + }, + { + "epoch": 0.41231884057971013, + "grad_norm": 5.367368509701561, + "learning_rate": 6.63298717817031e-07, + "loss": 0.3161, + "step": 2276 + }, + { + "epoch": 0.4125, + "grad_norm": 8.23828116247501, + "learning_rate": 6.630213906144191e-07, + "loss": 0.2477, + "step": 2277 + }, + { + "epoch": 0.4126811594202899, + "grad_norm": 4.245331880732722, + "learning_rate": 6.627440072827697e-07, + "loss": 0.2964, + "step": 2278 + }, + { + "epoch": 0.4128623188405797, + "grad_norm": 5.027613811697914, + "learning_rate": 6.624665679175878e-07, + "loss": 0.3002, + "step": 2279 + }, + { + "epoch": 0.41304347826086957, + "grad_norm": 5.254985319598645, + "learning_rate": 6.621890726143966e-07, + "loss": 0.3574, + "step": 2280 + }, + { + "epoch": 0.4132246376811594, + "grad_norm": 6.872207546488963, + "learning_rate": 6.619115214687393e-07, + "loss": 0.2712, + "step": 2281 + }, + { + "epoch": 0.41340579710144926, + "grad_norm": 3.2641773382908377, + "learning_rate": 6.61633914576178e-07, + "loss": 0.2719, + "step": 2282 + }, + { + "epoch": 0.41358695652173916, + "grad_norm": 6.1828766544893785, + "learning_rate": 6.61356252032294e-07, + "loss": 0.304, + "step": 2283 + }, + { + "epoch": 0.413768115942029, + "grad_norm": 4.321040540507069, + "learning_rate": 6.610785339326881e-07, + "loss": 0.2975, + "step": 2284 + }, + { + "epoch": 0.41394927536231885, + "grad_norm": 7.069091130381335, + "learning_rate": 6.608007603729796e-07, + "loss": 0.3124, + "step": 2285 + }, + { + "epoch": 0.4141304347826087, + "grad_norm": 5.060448386586392, + "learning_rate": 6.605229314488073e-07, + "loss": 0.2866, + "step": 2286 + }, + { + "epoch": 0.41431159420289854, + "grad_norm": 6.203290330828651, + "learning_rate": 6.602450472558294e-07, + "loss": 0.2976, + "step": 2287 + }, + { + "epoch": 0.4144927536231884, + "grad_norm": 3.50982587550599, + "learning_rate": 6.599671078897223e-07, + "loss": 0.3137, + "step": 2288 + }, + { + "epoch": 0.4146739130434783, + "grad_norm": 3.5430731875103385, + "learning_rate": 6.59689113446182e-07, + "loss": 0.2619, + "step": 2289 + }, + { + "epoch": 0.4148550724637681, + "grad_norm": 4.771418239074733, + "learning_rate": 6.594110640209235e-07, + "loss": 0.3391, + "step": 2290 + }, + { + "epoch": 0.41503623188405797, + "grad_norm": 5.825738070816933, + "learning_rate": 6.591329597096802e-07, + "loss": 0.364, + "step": 2291 + }, + { + "epoch": 0.4152173913043478, + "grad_norm": 6.74872906751537, + "learning_rate": 6.588548006082049e-07, + "loss": 0.3171, + "step": 2292 + }, + { + "epoch": 0.41539855072463766, + "grad_norm": 4.878259741149326, + "learning_rate": 6.585765868122691e-07, + "loss": 0.3227, + "step": 2293 + }, + { + "epoch": 0.41557971014492756, + "grad_norm": 6.212747632634399, + "learning_rate": 6.58298318417663e-07, + "loss": 0.2941, + "step": 2294 + }, + { + "epoch": 0.4157608695652174, + "grad_norm": 14.253826190308022, + "learning_rate": 6.580199955201961e-07, + "loss": 0.3592, + "step": 2295 + }, + { + "epoch": 0.41594202898550725, + "grad_norm": 5.10730921894822, + "learning_rate": 6.577416182156958e-07, + "loss": 0.2499, + "step": 2296 + }, + { + "epoch": 0.4161231884057971, + "grad_norm": 16.19235589648736, + "learning_rate": 6.574631866000089e-07, + "loss": 0.382, + "step": 2297 + }, + { + "epoch": 0.41630434782608694, + "grad_norm": 9.27566753712969, + "learning_rate": 6.57184700769001e-07, + "loss": 0.2907, + "step": 2298 + }, + { + "epoch": 0.4164855072463768, + "grad_norm": 3.6538757269226343, + "learning_rate": 6.569061608185557e-07, + "loss": 0.2971, + "step": 2299 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 4.554385376545483, + "learning_rate": 6.566275668445758e-07, + "loss": 0.283, + "step": 2300 + }, + { + "epoch": 0.4166666666666667, + "eval_loss": 0.3081875145435333, + "eval_runtime": 9.7833, + "eval_samples_per_second": 51.108, + "eval_steps_per_second": 0.102, + "step": 2300 + }, + { + "epoch": 0.41684782608695653, + "grad_norm": 5.252070927566908, + "learning_rate": 6.563489189429828e-07, + "loss": 0.3335, + "step": 2301 + }, + { + "epoch": 0.4170289855072464, + "grad_norm": 6.5788475189753886, + "learning_rate": 6.560702172097158e-07, + "loss": 0.2588, + "step": 2302 + }, + { + "epoch": 0.4172101449275362, + "grad_norm": 4.44138324373938, + "learning_rate": 6.557914617407339e-07, + "loss": 0.3036, + "step": 2303 + }, + { + "epoch": 0.41739130434782606, + "grad_norm": 8.79844273409732, + "learning_rate": 6.555126526320134e-07, + "loss": 0.3117, + "step": 2304 + }, + { + "epoch": 0.41757246376811596, + "grad_norm": 9.321045294902802, + "learning_rate": 6.552337899795497e-07, + "loss": 0.293, + "step": 2305 + }, + { + "epoch": 0.4177536231884058, + "grad_norm": 6.777456768486296, + "learning_rate": 6.549548738793566e-07, + "loss": 0.3115, + "step": 2306 + }, + { + "epoch": 0.41793478260869565, + "grad_norm": 5.490405975247144, + "learning_rate": 6.546759044274663e-07, + "loss": 0.2958, + "step": 2307 + }, + { + "epoch": 0.4181159420289855, + "grad_norm": 6.1963416416821735, + "learning_rate": 6.543968817199292e-07, + "loss": 0.3203, + "step": 2308 + }, + { + "epoch": 0.41829710144927534, + "grad_norm": 8.25763006972259, + "learning_rate": 6.541178058528143e-07, + "loss": 0.2866, + "step": 2309 + }, + { + "epoch": 0.41847826086956524, + "grad_norm": 7.2752239577071895, + "learning_rate": 6.538386769222085e-07, + "loss": 0.3369, + "step": 2310 + }, + { + "epoch": 0.4186594202898551, + "grad_norm": 6.863002245502321, + "learning_rate": 6.535594950242174e-07, + "loss": 0.3251, + "step": 2311 + }, + { + "epoch": 0.41884057971014493, + "grad_norm": 4.411816407905571, + "learning_rate": 6.532802602549646e-07, + "loss": 0.3568, + "step": 2312 + }, + { + "epoch": 0.4190217391304348, + "grad_norm": 5.093038580306019, + "learning_rate": 6.530009727105916e-07, + "loss": 0.3051, + "step": 2313 + }, + { + "epoch": 0.4192028985507246, + "grad_norm": 11.348804991938719, + "learning_rate": 6.527216324872592e-07, + "loss": 0.3459, + "step": 2314 + }, + { + "epoch": 0.41938405797101447, + "grad_norm": 10.843852583075352, + "learning_rate": 6.524422396811448e-07, + "loss": 0.2824, + "step": 2315 + }, + { + "epoch": 0.41956521739130437, + "grad_norm": 9.116085155890177, + "learning_rate": 6.521627943884452e-07, + "loss": 0.3583, + "step": 2316 + }, + { + "epoch": 0.4197463768115942, + "grad_norm": 8.427956674746454, + "learning_rate": 6.518832967053746e-07, + "loss": 0.2868, + "step": 2317 + }, + { + "epoch": 0.41992753623188406, + "grad_norm": 5.639861262365254, + "learning_rate": 6.516037467281652e-07, + "loss": 0.2633, + "step": 2318 + }, + { + "epoch": 0.4201086956521739, + "grad_norm": 5.14891423125128, + "learning_rate": 6.513241445530676e-07, + "loss": 0.271, + "step": 2319 + }, + { + "epoch": 0.42028985507246375, + "grad_norm": 4.579124761321766, + "learning_rate": 6.510444902763498e-07, + "loss": 0.2664, + "step": 2320 + }, + { + "epoch": 0.42047101449275365, + "grad_norm": 4.534131870437726, + "learning_rate": 6.507647839942983e-07, + "loss": 0.2937, + "step": 2321 + }, + { + "epoch": 0.4206521739130435, + "grad_norm": 3.40644285543595, + "learning_rate": 6.504850258032176e-07, + "loss": 0.2928, + "step": 2322 + }, + { + "epoch": 0.42083333333333334, + "grad_norm": 3.1704861579416894, + "learning_rate": 6.502052157994294e-07, + "loss": 0.2768, + "step": 2323 + }, + { + "epoch": 0.4210144927536232, + "grad_norm": 7.494768894456705, + "learning_rate": 6.499253540792736e-07, + "loss": 0.2935, + "step": 2324 + }, + { + "epoch": 0.421195652173913, + "grad_norm": 6.9556993549617445, + "learning_rate": 6.496454407391082e-07, + "loss": 0.2941, + "step": 2325 + }, + { + "epoch": 0.42137681159420287, + "grad_norm": 4.136855338022511, + "learning_rate": 6.493654758753084e-07, + "loss": 0.3093, + "step": 2326 + }, + { + "epoch": 0.42155797101449277, + "grad_norm": 9.212047863217336, + "learning_rate": 6.490854595842675e-07, + "loss": 0.33, + "step": 2327 + }, + { + "epoch": 0.4217391304347826, + "grad_norm": 7.667652790684813, + "learning_rate": 6.488053919623968e-07, + "loss": 0.3428, + "step": 2328 + }, + { + "epoch": 0.42192028985507246, + "grad_norm": 3.5609158476656435, + "learning_rate": 6.485252731061242e-07, + "loss": 0.3044, + "step": 2329 + }, + { + "epoch": 0.4221014492753623, + "grad_norm": 5.096558513035923, + "learning_rate": 6.482451031118965e-07, + "loss": 0.2994, + "step": 2330 + }, + { + "epoch": 0.42228260869565215, + "grad_norm": 9.849021584451869, + "learning_rate": 6.479648820761776e-07, + "loss": 0.2762, + "step": 2331 + }, + { + "epoch": 0.42246376811594205, + "grad_norm": 4.593447913335792, + "learning_rate": 6.476846100954484e-07, + "loss": 0.3464, + "step": 2332 + }, + { + "epoch": 0.4226449275362319, + "grad_norm": 3.4143861457939044, + "learning_rate": 6.474042872662084e-07, + "loss": 0.2854, + "step": 2333 + }, + { + "epoch": 0.42282608695652174, + "grad_norm": 5.95878172037489, + "learning_rate": 6.471239136849738e-07, + "loss": 0.2922, + "step": 2334 + }, + { + "epoch": 0.4230072463768116, + "grad_norm": 6.95448046727009, + "learning_rate": 6.468434894482786e-07, + "loss": 0.3218, + "step": 2335 + }, + { + "epoch": 0.42318840579710143, + "grad_norm": 5.737354145804355, + "learning_rate": 6.465630146526744e-07, + "loss": 0.3313, + "step": 2336 + }, + { + "epoch": 0.42336956521739133, + "grad_norm": 3.2418882401072415, + "learning_rate": 6.462824893947296e-07, + "loss": 0.2754, + "step": 2337 + }, + { + "epoch": 0.4235507246376812, + "grad_norm": 6.638313730445387, + "learning_rate": 6.460019137710307e-07, + "loss": 0.3213, + "step": 2338 + }, + { + "epoch": 0.423731884057971, + "grad_norm": 4.075967396282302, + "learning_rate": 6.457212878781812e-07, + "loss": 0.2939, + "step": 2339 + }, + { + "epoch": 0.42391304347826086, + "grad_norm": 6.545210924283952, + "learning_rate": 6.454406118128017e-07, + "loss": 0.2839, + "step": 2340 + }, + { + "epoch": 0.4240942028985507, + "grad_norm": 4.525793572017759, + "learning_rate": 6.451598856715304e-07, + "loss": 0.283, + "step": 2341 + }, + { + "epoch": 0.42427536231884055, + "grad_norm": 6.156340652832883, + "learning_rate": 6.448791095510229e-07, + "loss": 0.2988, + "step": 2342 + }, + { + "epoch": 0.42445652173913045, + "grad_norm": 6.568764572585965, + "learning_rate": 6.445982835479513e-07, + "loss": 0.3907, + "step": 2343 + }, + { + "epoch": 0.4246376811594203, + "grad_norm": 5.951589934703833, + "learning_rate": 6.443174077590056e-07, + "loss": 0.2634, + "step": 2344 + }, + { + "epoch": 0.42481884057971014, + "grad_norm": 7.108996064150201, + "learning_rate": 6.440364822808928e-07, + "loss": 0.3592, + "step": 2345 + }, + { + "epoch": 0.425, + "grad_norm": 8.727790706486658, + "learning_rate": 6.437555072103365e-07, + "loss": 0.274, + "step": 2346 + }, + { + "epoch": 0.42518115942028983, + "grad_norm": 4.802534038712147, + "learning_rate": 6.434744826440781e-07, + "loss": 0.3236, + "step": 2347 + }, + { + "epoch": 0.42536231884057973, + "grad_norm": 3.187665598838089, + "learning_rate": 6.431934086788753e-07, + "loss": 0.3055, + "step": 2348 + }, + { + "epoch": 0.4255434782608696, + "grad_norm": 4.149065953139308, + "learning_rate": 6.429122854115036e-07, + "loss": 0.3238, + "step": 2349 + }, + { + "epoch": 0.4257246376811594, + "grad_norm": 7.3621213118183135, + "learning_rate": 6.42631112938755e-07, + "loss": 0.2961, + "step": 2350 + }, + { + "epoch": 0.42590579710144927, + "grad_norm": 3.2486055671494616, + "learning_rate": 6.423498913574383e-07, + "loss": 0.2145, + "step": 2351 + }, + { + "epoch": 0.4260869565217391, + "grad_norm": 4.526894388022784, + "learning_rate": 6.420686207643794e-07, + "loss": 0.3409, + "step": 2352 + }, + { + "epoch": 0.42626811594202896, + "grad_norm": 4.392681220850965, + "learning_rate": 6.417873012564215e-07, + "loss": 0.3176, + "step": 2353 + }, + { + "epoch": 0.42644927536231886, + "grad_norm": 6.110794389623651, + "learning_rate": 6.415059329304238e-07, + "loss": 0.2667, + "step": 2354 + }, + { + "epoch": 0.4266304347826087, + "grad_norm": 7.266934307401165, + "learning_rate": 6.412245158832629e-07, + "loss": 0.3011, + "step": 2355 + }, + { + "epoch": 0.42681159420289855, + "grad_norm": 4.446239473597966, + "learning_rate": 6.40943050211832e-07, + "loss": 0.3144, + "step": 2356 + }, + { + "epoch": 0.4269927536231884, + "grad_norm": 4.243355332455165, + "learning_rate": 6.406615360130414e-07, + "loss": 0.2814, + "step": 2357 + }, + { + "epoch": 0.42717391304347824, + "grad_norm": 7.42819065267284, + "learning_rate": 6.403799733838171e-07, + "loss": 0.3367, + "step": 2358 + }, + { + "epoch": 0.42735507246376814, + "grad_norm": 3.117263154175289, + "learning_rate": 6.400983624211031e-07, + "loss": 0.2473, + "step": 2359 + }, + { + "epoch": 0.427536231884058, + "grad_norm": 5.654648596519151, + "learning_rate": 6.398167032218591e-07, + "loss": 0.3391, + "step": 2360 + }, + { + "epoch": 0.4277173913043478, + "grad_norm": 5.712715697561089, + "learning_rate": 6.395349958830616e-07, + "loss": 0.3131, + "step": 2361 + }, + { + "epoch": 0.42789855072463767, + "grad_norm": 6.014980975626511, + "learning_rate": 6.392532405017039e-07, + "loss": 0.2879, + "step": 2362 + }, + { + "epoch": 0.4280797101449275, + "grad_norm": 5.2166066553112245, + "learning_rate": 6.389714371747958e-07, + "loss": 0.271, + "step": 2363 + }, + { + "epoch": 0.4282608695652174, + "grad_norm": 3.5281287010252895, + "learning_rate": 6.386895859993633e-07, + "loss": 0.2781, + "step": 2364 + }, + { + "epoch": 0.42844202898550726, + "grad_norm": 6.075133336028407, + "learning_rate": 6.384076870724493e-07, + "loss": 0.3413, + "step": 2365 + }, + { + "epoch": 0.4286231884057971, + "grad_norm": 3.311171515576892, + "learning_rate": 6.38125740491113e-07, + "loss": 0.2614, + "step": 2366 + }, + { + "epoch": 0.42880434782608695, + "grad_norm": 4.038095763657474, + "learning_rate": 6.378437463524295e-07, + "loss": 0.2852, + "step": 2367 + }, + { + "epoch": 0.4289855072463768, + "grad_norm": 5.270364823459333, + "learning_rate": 6.375617047534911e-07, + "loss": 0.3007, + "step": 2368 + }, + { + "epoch": 0.42916666666666664, + "grad_norm": 7.889787936372453, + "learning_rate": 6.372796157914059e-07, + "loss": 0.2674, + "step": 2369 + }, + { + "epoch": 0.42934782608695654, + "grad_norm": 10.987924033374092, + "learning_rate": 6.369974795632988e-07, + "loss": 0.3345, + "step": 2370 + }, + { + "epoch": 0.4295289855072464, + "grad_norm": 7.329870667955026, + "learning_rate": 6.367152961663102e-07, + "loss": 0.3263, + "step": 2371 + }, + { + "epoch": 0.42971014492753623, + "grad_norm": 4.220140572988842, + "learning_rate": 6.364330656975973e-07, + "loss": 0.3106, + "step": 2372 + }, + { + "epoch": 0.4298913043478261, + "grad_norm": 4.163053856611312, + "learning_rate": 6.361507882543335e-07, + "loss": 0.3107, + "step": 2373 + }, + { + "epoch": 0.4300724637681159, + "grad_norm": 4.999521026328111, + "learning_rate": 6.358684639337084e-07, + "loss": 0.3176, + "step": 2374 + }, + { + "epoch": 0.4302536231884058, + "grad_norm": 5.589645442204374, + "learning_rate": 6.355860928329271e-07, + "loss": 0.3328, + "step": 2375 + }, + { + "epoch": 0.43043478260869567, + "grad_norm": 5.463617192861189, + "learning_rate": 6.35303675049212e-07, + "loss": 0.2833, + "step": 2376 + }, + { + "epoch": 0.4306159420289855, + "grad_norm": 4.594530499190523, + "learning_rate": 6.350212106798002e-07, + "loss": 0.3701, + "step": 2377 + }, + { + "epoch": 0.43079710144927535, + "grad_norm": 5.351475130390271, + "learning_rate": 6.347386998219458e-07, + "loss": 0.2662, + "step": 2378 + }, + { + "epoch": 0.4309782608695652, + "grad_norm": 3.443919946262373, + "learning_rate": 6.344561425729186e-07, + "loss": 0.2779, + "step": 2379 + }, + { + "epoch": 0.4311594202898551, + "grad_norm": 3.7981303231880696, + "learning_rate": 6.341735390300047e-07, + "loss": 0.2863, + "step": 2380 + }, + { + "epoch": 0.43134057971014494, + "grad_norm": 4.804880059405027, + "learning_rate": 6.338908892905055e-07, + "loss": 0.3002, + "step": 2381 + }, + { + "epoch": 0.4315217391304348, + "grad_norm": 6.016134838775121, + "learning_rate": 6.336081934517388e-07, + "loss": 0.2338, + "step": 2382 + }, + { + "epoch": 0.43170289855072463, + "grad_norm": 7.220698799639642, + "learning_rate": 6.333254516110377e-07, + "loss": 0.2667, + "step": 2383 + }, + { + "epoch": 0.4318840579710145, + "grad_norm": 3.783084763616544, + "learning_rate": 6.330426638657524e-07, + "loss": 0.2923, + "step": 2384 + }, + { + "epoch": 0.4320652173913043, + "grad_norm": 5.821360737081035, + "learning_rate": 6.32759830313247e-07, + "loss": 0.302, + "step": 2385 + }, + { + "epoch": 0.4322463768115942, + "grad_norm": 5.911176517191124, + "learning_rate": 6.324769510509034e-07, + "loss": 0.3166, + "step": 2386 + }, + { + "epoch": 0.43242753623188407, + "grad_norm": 10.33246031124787, + "learning_rate": 6.321940261761178e-07, + "loss": 0.3128, + "step": 2387 + }, + { + "epoch": 0.4326086956521739, + "grad_norm": 5.866156582196534, + "learning_rate": 6.319110557863025e-07, + "loss": 0.2822, + "step": 2388 + }, + { + "epoch": 0.43278985507246376, + "grad_norm": 4.568627246411997, + "learning_rate": 6.316280399788859e-07, + "loss": 0.4158, + "step": 2389 + }, + { + "epoch": 0.4329710144927536, + "grad_norm": 3.9881536953107584, + "learning_rate": 6.313449788513114e-07, + "loss": 0.2872, + "step": 2390 + }, + { + "epoch": 0.4331521739130435, + "grad_norm": 7.191773234053636, + "learning_rate": 6.310618725010381e-07, + "loss": 0.3354, + "step": 2391 + }, + { + "epoch": 0.43333333333333335, + "grad_norm": 3.3151817463531836, + "learning_rate": 6.307787210255414e-07, + "loss": 0.2838, + "step": 2392 + }, + { + "epoch": 0.4335144927536232, + "grad_norm": 3.1034187918020972, + "learning_rate": 6.304955245223113e-07, + "loss": 0.2724, + "step": 2393 + }, + { + "epoch": 0.43369565217391304, + "grad_norm": 3.30721134824641, + "learning_rate": 6.302122830888539e-07, + "loss": 0.2397, + "step": 2394 + }, + { + "epoch": 0.4338768115942029, + "grad_norm": 3.5254479200838014, + "learning_rate": 6.299289968226904e-07, + "loss": 0.2768, + "step": 2395 + }, + { + "epoch": 0.4340579710144927, + "grad_norm": 11.329931826072082, + "learning_rate": 6.296456658213577e-07, + "loss": 0.3801, + "step": 2396 + }, + { + "epoch": 0.4342391304347826, + "grad_norm": 4.763436588760162, + "learning_rate": 6.29362290182408e-07, + "loss": 0.3279, + "step": 2397 + }, + { + "epoch": 0.4344202898550725, + "grad_norm": 5.514605367037099, + "learning_rate": 6.290788700034088e-07, + "loss": 0.2995, + "step": 2398 + }, + { + "epoch": 0.4346014492753623, + "grad_norm": 4.884851347962377, + "learning_rate": 6.287954053819431e-07, + "loss": 0.2985, + "step": 2399 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 5.8128705795798306, + "learning_rate": 6.285118964156091e-07, + "loss": 0.3162, + "step": 2400 + }, + { + "epoch": 0.43478260869565216, + "eval_loss": 0.3111875057220459, + "eval_runtime": 9.7334, + "eval_samples_per_second": 51.369, + "eval_steps_per_second": 0.103, + "step": 2400 + }, + { + "epoch": 0.434963768115942, + "grad_norm": 6.032919533461052, + "learning_rate": 6.282283432020202e-07, + "loss": 0.3033, + "step": 2401 + }, + { + "epoch": 0.4351449275362319, + "grad_norm": 9.8388685666013, + "learning_rate": 6.279447458388051e-07, + "loss": 0.2837, + "step": 2402 + }, + { + "epoch": 0.43532608695652175, + "grad_norm": 8.627197461456078, + "learning_rate": 6.27661104423608e-07, + "loss": 0.3109, + "step": 2403 + }, + { + "epoch": 0.4355072463768116, + "grad_norm": 4.6613897030136995, + "learning_rate": 6.273774190540878e-07, + "loss": 0.3176, + "step": 2404 + }, + { + "epoch": 0.43568840579710144, + "grad_norm": 5.168306745523437, + "learning_rate": 6.270936898279185e-07, + "loss": 0.3404, + "step": 2405 + }, + { + "epoch": 0.4358695652173913, + "grad_norm": 5.646349520336148, + "learning_rate": 6.268099168427898e-07, + "loss": 0.3678, + "step": 2406 + }, + { + "epoch": 0.4360507246376812, + "grad_norm": 3.575771991391126, + "learning_rate": 6.265261001964057e-07, + "loss": 0.3228, + "step": 2407 + }, + { + "epoch": 0.43623188405797103, + "grad_norm": 4.803310136645171, + "learning_rate": 6.262422399864859e-07, + "loss": 0.285, + "step": 2408 + }, + { + "epoch": 0.4364130434782609, + "grad_norm": 4.033472385962535, + "learning_rate": 6.259583363107648e-07, + "loss": 0.2501, + "step": 2409 + }, + { + "epoch": 0.4365942028985507, + "grad_norm": 4.6880235105969215, + "learning_rate": 6.256743892669916e-07, + "loss": 0.277, + "step": 2410 + }, + { + "epoch": 0.43677536231884057, + "grad_norm": 7.729944298397247, + "learning_rate": 6.253903989529307e-07, + "loss": 0.3259, + "step": 2411 + }, + { + "epoch": 0.4369565217391304, + "grad_norm": 5.974530213953839, + "learning_rate": 6.251063654663614e-07, + "loss": 0.2841, + "step": 2412 + }, + { + "epoch": 0.4371376811594203, + "grad_norm": 4.166502550490707, + "learning_rate": 6.248222889050776e-07, + "loss": 0.3132, + "step": 2413 + }, + { + "epoch": 0.43731884057971016, + "grad_norm": 3.3637687861488783, + "learning_rate": 6.245381693668885e-07, + "loss": 0.2901, + "step": 2414 + }, + { + "epoch": 0.4375, + "grad_norm": 5.353592628541287, + "learning_rate": 6.242540069496173e-07, + "loss": 0.3437, + "step": 2415 + }, + { + "epoch": 0.43768115942028984, + "grad_norm": 3.8199879977948545, + "learning_rate": 6.23969801751103e-07, + "loss": 0.2813, + "step": 2416 + }, + { + "epoch": 0.4378623188405797, + "grad_norm": 4.033829725970013, + "learning_rate": 6.236855538691986e-07, + "loss": 0.3056, + "step": 2417 + }, + { + "epoch": 0.4380434782608696, + "grad_norm": 3.8833235801396766, + "learning_rate": 6.234012634017718e-07, + "loss": 0.2981, + "step": 2418 + }, + { + "epoch": 0.43822463768115943, + "grad_norm": 3.2405427516316188, + "learning_rate": 6.231169304467056e-07, + "loss": 0.2632, + "step": 2419 + }, + { + "epoch": 0.4384057971014493, + "grad_norm": 3.652979531153594, + "learning_rate": 6.228325551018967e-07, + "loss": 0.2841, + "step": 2420 + }, + { + "epoch": 0.4385869565217391, + "grad_norm": 3.8590877440846967, + "learning_rate": 6.225481374652572e-07, + "loss": 0.2858, + "step": 2421 + }, + { + "epoch": 0.43876811594202897, + "grad_norm": 6.556473631494257, + "learning_rate": 6.222636776347132e-07, + "loss": 0.3133, + "step": 2422 + }, + { + "epoch": 0.4389492753623188, + "grad_norm": 3.7270987312082293, + "learning_rate": 6.219791757082058e-07, + "loss": 0.2819, + "step": 2423 + }, + { + "epoch": 0.4391304347826087, + "grad_norm": 4.533409871199603, + "learning_rate": 6.2169463178369e-07, + "loss": 0.299, + "step": 2424 + }, + { + "epoch": 0.43931159420289856, + "grad_norm": 3.258767091980891, + "learning_rate": 6.214100459591363e-07, + "loss": 0.2942, + "step": 2425 + }, + { + "epoch": 0.4394927536231884, + "grad_norm": 4.322071867009134, + "learning_rate": 6.211254183325281e-07, + "loss": 0.3331, + "step": 2426 + }, + { + "epoch": 0.43967391304347825, + "grad_norm": 4.694868290410619, + "learning_rate": 6.208407490018648e-07, + "loss": 0.3338, + "step": 2427 + }, + { + "epoch": 0.4398550724637681, + "grad_norm": 8.944845316507115, + "learning_rate": 6.205560380651589e-07, + "loss": 0.2864, + "step": 2428 + }, + { + "epoch": 0.440036231884058, + "grad_norm": 4.489572713454659, + "learning_rate": 6.202712856204379e-07, + "loss": 0.3502, + "step": 2429 + }, + { + "epoch": 0.44021739130434784, + "grad_norm": 3.2292276395752584, + "learning_rate": 6.199864917657434e-07, + "loss": 0.2912, + "step": 2430 + }, + { + "epoch": 0.4403985507246377, + "grad_norm": 6.383876568796987, + "learning_rate": 6.197016565991314e-07, + "loss": 0.3878, + "step": 2431 + }, + { + "epoch": 0.4405797101449275, + "grad_norm": 4.309347998381022, + "learning_rate": 6.194167802186718e-07, + "loss": 0.3552, + "step": 2432 + }, + { + "epoch": 0.4407608695652174, + "grad_norm": 4.349599489988224, + "learning_rate": 6.191318627224489e-07, + "loss": 0.29, + "step": 2433 + }, + { + "epoch": 0.4409420289855073, + "grad_norm": 10.140908604499579, + "learning_rate": 6.188469042085612e-07, + "loss": 0.3605, + "step": 2434 + }, + { + "epoch": 0.4411231884057971, + "grad_norm": 4.177803360097879, + "learning_rate": 6.185619047751214e-07, + "loss": 0.2773, + "step": 2435 + }, + { + "epoch": 0.44130434782608696, + "grad_norm": 4.767036299494666, + "learning_rate": 6.182768645202558e-07, + "loss": 0.2799, + "step": 2436 + }, + { + "epoch": 0.4414855072463768, + "grad_norm": 3.766462633636971, + "learning_rate": 6.179917835421055e-07, + "loss": 0.2733, + "step": 2437 + }, + { + "epoch": 0.44166666666666665, + "grad_norm": 3.818063907046883, + "learning_rate": 6.177066619388251e-07, + "loss": 0.3255, + "step": 2438 + }, + { + "epoch": 0.4418478260869565, + "grad_norm": 3.5420938549509158, + "learning_rate": 6.174214998085832e-07, + "loss": 0.3177, + "step": 2439 + }, + { + "epoch": 0.4420289855072464, + "grad_norm": 4.705024431833968, + "learning_rate": 6.171362972495626e-07, + "loss": 0.3486, + "step": 2440 + }, + { + "epoch": 0.44221014492753624, + "grad_norm": 3.713483054306353, + "learning_rate": 6.1685105435996e-07, + "loss": 0.3105, + "step": 2441 + }, + { + "epoch": 0.4423913043478261, + "grad_norm": 6.860656832518109, + "learning_rate": 6.165657712379854e-07, + "loss": 0.3324, + "step": 2442 + }, + { + "epoch": 0.44257246376811593, + "grad_norm": 5.170289831277415, + "learning_rate": 6.162804479818637e-07, + "loss": 0.3116, + "step": 2443 + }, + { + "epoch": 0.4427536231884058, + "grad_norm": 6.865661144465947, + "learning_rate": 6.159950846898328e-07, + "loss": 0.286, + "step": 2444 + }, + { + "epoch": 0.4429347826086957, + "grad_norm": 3.6141622065675576, + "learning_rate": 6.157096814601447e-07, + "loss": 0.3246, + "step": 2445 + }, + { + "epoch": 0.4431159420289855, + "grad_norm": 4.711299127220832, + "learning_rate": 6.154242383910649e-07, + "loss": 0.2776, + "step": 2446 + }, + { + "epoch": 0.44329710144927537, + "grad_norm": 4.800757471753301, + "learning_rate": 6.151387555808729e-07, + "loss": 0.3115, + "step": 2447 + }, + { + "epoch": 0.4434782608695652, + "grad_norm": 7.2176308367961, + "learning_rate": 6.148532331278619e-07, + "loss": 0.2622, + "step": 2448 + }, + { + "epoch": 0.44365942028985506, + "grad_norm": 4.580417800697855, + "learning_rate": 6.145676711303386e-07, + "loss": 0.2483, + "step": 2449 + }, + { + "epoch": 0.4438405797101449, + "grad_norm": 7.090839506145054, + "learning_rate": 6.142820696866231e-07, + "loss": 0.3002, + "step": 2450 + }, + { + "epoch": 0.4440217391304348, + "grad_norm": 4.170397544873171, + "learning_rate": 6.139964288950497e-07, + "loss": 0.3181, + "step": 2451 + }, + { + "epoch": 0.44420289855072465, + "grad_norm": 8.087550735356297, + "learning_rate": 6.137107488539657e-07, + "loss": 0.3585, + "step": 2452 + }, + { + "epoch": 0.4443840579710145, + "grad_norm": 5.713435797652091, + "learning_rate": 6.13425029661732e-07, + "loss": 0.3129, + "step": 2453 + }, + { + "epoch": 0.44456521739130433, + "grad_norm": 3.844583853942515, + "learning_rate": 6.131392714167233e-07, + "loss": 0.3107, + "step": 2454 + }, + { + "epoch": 0.4447463768115942, + "grad_norm": 3.0519902543496387, + "learning_rate": 6.128534742173273e-07, + "loss": 0.2734, + "step": 2455 + }, + { + "epoch": 0.4449275362318841, + "grad_norm": 4.734232598715959, + "learning_rate": 6.125676381619454e-07, + "loss": 0.288, + "step": 2456 + }, + { + "epoch": 0.4451086956521739, + "grad_norm": 7.043243332262786, + "learning_rate": 6.122817633489923e-07, + "loss": 0.3175, + "step": 2457 + }, + { + "epoch": 0.44528985507246377, + "grad_norm": 8.200428475635034, + "learning_rate": 6.119958498768962e-07, + "loss": 0.3144, + "step": 2458 + }, + { + "epoch": 0.4454710144927536, + "grad_norm": 4.47371989403032, + "learning_rate": 6.117098978440981e-07, + "loss": 0.3111, + "step": 2459 + }, + { + "epoch": 0.44565217391304346, + "grad_norm": 3.2942870807097124, + "learning_rate": 6.114239073490533e-07, + "loss": 0.2837, + "step": 2460 + }, + { + "epoch": 0.44583333333333336, + "grad_norm": 3.2136242867192517, + "learning_rate": 6.111378784902288e-07, + "loss": 0.3165, + "step": 2461 + }, + { + "epoch": 0.4460144927536232, + "grad_norm": 3.566430781317973, + "learning_rate": 6.108518113661064e-07, + "loss": 0.3129, + "step": 2462 + }, + { + "epoch": 0.44619565217391305, + "grad_norm": 4.43212040990493, + "learning_rate": 6.105657060751801e-07, + "loss": 0.3618, + "step": 2463 + }, + { + "epoch": 0.4463768115942029, + "grad_norm": 7.100989076740194, + "learning_rate": 6.102795627159572e-07, + "loss": 0.3349, + "step": 2464 + }, + { + "epoch": 0.44655797101449274, + "grad_norm": 4.798284331614351, + "learning_rate": 6.099933813869585e-07, + "loss": 0.2486, + "step": 2465 + }, + { + "epoch": 0.4467391304347826, + "grad_norm": 4.114344426352763, + "learning_rate": 6.097071621867175e-07, + "loss": 0.3625, + "step": 2466 + }, + { + "epoch": 0.4469202898550725, + "grad_norm": 5.591657104607873, + "learning_rate": 6.094209052137805e-07, + "loss": 0.3869, + "step": 2467 + }, + { + "epoch": 0.44710144927536233, + "grad_norm": 4.5381586589400715, + "learning_rate": 6.091346105667077e-07, + "loss": 0.3456, + "step": 2468 + }, + { + "epoch": 0.4472826086956522, + "grad_norm": 6.732564754578744, + "learning_rate": 6.08848278344071e-07, + "loss": 0.256, + "step": 2469 + }, + { + "epoch": 0.447463768115942, + "grad_norm": 5.522757347127664, + "learning_rate": 6.085619086444566e-07, + "loss": 0.2551, + "step": 2470 + }, + { + "epoch": 0.44764492753623186, + "grad_norm": 3.7862866477335535, + "learning_rate": 6.082755015664626e-07, + "loss": 0.3159, + "step": 2471 + }, + { + "epoch": 0.44782608695652176, + "grad_norm": 5.773950369934294, + "learning_rate": 6.079890572087005e-07, + "loss": 0.2975, + "step": 2472 + }, + { + "epoch": 0.4480072463768116, + "grad_norm": 4.245890011685441, + "learning_rate": 6.077025756697942e-07, + "loss": 0.2986, + "step": 2473 + }, + { + "epoch": 0.44818840579710145, + "grad_norm": 6.326689728606212, + "learning_rate": 6.074160570483809e-07, + "loss": 0.3101, + "step": 2474 + }, + { + "epoch": 0.4483695652173913, + "grad_norm": 8.499362818011361, + "learning_rate": 6.0712950144311e-07, + "loss": 0.3405, + "step": 2475 + }, + { + "epoch": 0.44855072463768114, + "grad_norm": 3.7230462980396717, + "learning_rate": 6.068429089526446e-07, + "loss": 0.2809, + "step": 2476 + }, + { + "epoch": 0.44873188405797104, + "grad_norm": 11.311724816719947, + "learning_rate": 6.06556279675659e-07, + "loss": 0.3452, + "step": 2477 + }, + { + "epoch": 0.4489130434782609, + "grad_norm": 9.75145005318127, + "learning_rate": 6.062696137108415e-07, + "loss": 0.2696, + "step": 2478 + }, + { + "epoch": 0.44909420289855073, + "grad_norm": 7.14584372660994, + "learning_rate": 6.059829111568926e-07, + "loss": 0.3522, + "step": 2479 + }, + { + "epoch": 0.4492753623188406, + "grad_norm": 10.218413451985699, + "learning_rate": 6.056961721125252e-07, + "loss": 0.3456, + "step": 2480 + }, + { + "epoch": 0.4494565217391304, + "grad_norm": 3.3258361918376496, + "learning_rate": 6.054093966764649e-07, + "loss": 0.2685, + "step": 2481 + }, + { + "epoch": 0.44963768115942027, + "grad_norm": 6.63057783981375, + "learning_rate": 6.0512258494745e-07, + "loss": 0.2621, + "step": 2482 + }, + { + "epoch": 0.44981884057971017, + "grad_norm": 3.5014204089310494, + "learning_rate": 6.048357370242308e-07, + "loss": 0.3149, + "step": 2483 + }, + { + "epoch": 0.45, + "grad_norm": 4.371632426519727, + "learning_rate": 6.045488530055709e-07, + "loss": 0.2606, + "step": 2484 + }, + { + "epoch": 0.45018115942028986, + "grad_norm": 4.303240966821157, + "learning_rate": 6.042619329902453e-07, + "loss": 0.3337, + "step": 2485 + }, + { + "epoch": 0.4503623188405797, + "grad_norm": 3.529163527135913, + "learning_rate": 6.039749770770422e-07, + "loss": 0.3257, + "step": 2486 + }, + { + "epoch": 0.45054347826086955, + "grad_norm": 3.300415660889749, + "learning_rate": 6.03687985364762e-07, + "loss": 0.2918, + "step": 2487 + }, + { + "epoch": 0.45072463768115945, + "grad_norm": 7.841796250440804, + "learning_rate": 6.03400957952217e-07, + "loss": 0.2842, + "step": 2488 + }, + { + "epoch": 0.4509057971014493, + "grad_norm": 3.76475456465956, + "learning_rate": 6.031138949382323e-07, + "loss": 0.3086, + "step": 2489 + }, + { + "epoch": 0.45108695652173914, + "grad_norm": 3.4730194512582906, + "learning_rate": 6.02826796421645e-07, + "loss": 0.3077, + "step": 2490 + }, + { + "epoch": 0.451268115942029, + "grad_norm": 3.689315498803631, + "learning_rate": 6.025396625013046e-07, + "loss": 0.2664, + "step": 2491 + }, + { + "epoch": 0.4514492753623188, + "grad_norm": 6.188874922621899, + "learning_rate": 6.022524932760724e-07, + "loss": 0.3626, + "step": 2492 + }, + { + "epoch": 0.45163043478260867, + "grad_norm": 4.635190863041126, + "learning_rate": 6.019652888448225e-07, + "loss": 0.2876, + "step": 2493 + }, + { + "epoch": 0.45181159420289857, + "grad_norm": 3.311049094059225, + "learning_rate": 6.016780493064403e-07, + "loss": 0.3193, + "step": 2494 + }, + { + "epoch": 0.4519927536231884, + "grad_norm": 9.044556119574287, + "learning_rate": 6.013907747598241e-07, + "loss": 0.3398, + "step": 2495 + }, + { + "epoch": 0.45217391304347826, + "grad_norm": 3.354433975091886, + "learning_rate": 6.011034653038837e-07, + "loss": 0.2773, + "step": 2496 + }, + { + "epoch": 0.4523550724637681, + "grad_norm": 5.041896221104646, + "learning_rate": 6.008161210375411e-07, + "loss": 0.4211, + "step": 2497 + }, + { + "epoch": 0.45253623188405795, + "grad_norm": 5.482637424516314, + "learning_rate": 6.005287420597305e-07, + "loss": 0.3891, + "step": 2498 + }, + { + "epoch": 0.45271739130434785, + "grad_norm": 6.77500890492817, + "learning_rate": 6.002413284693976e-07, + "loss": 0.2743, + "step": 2499 + }, + { + "epoch": 0.4528985507246377, + "grad_norm": 8.011139241455309, + "learning_rate": 5.999538803655003e-07, + "loss": 0.3303, + "step": 2500 + }, + { + "epoch": 0.4528985507246377, + "eval_loss": 0.30396875739097595, + "eval_runtime": 9.7609, + "eval_samples_per_second": 51.225, + "eval_steps_per_second": 0.102, + "step": 2500 + }, + { + "epoch": 0.45307971014492754, + "grad_norm": 6.9092246240073525, + "learning_rate": 5.996663978470084e-07, + "loss": 0.2473, + "step": 2501 + }, + { + "epoch": 0.4532608695652174, + "grad_norm": 3.8900354175610254, + "learning_rate": 5.993788810129036e-07, + "loss": 0.2997, + "step": 2502 + }, + { + "epoch": 0.45344202898550723, + "grad_norm": 3.993330457172059, + "learning_rate": 5.990913299621792e-07, + "loss": 0.3003, + "step": 2503 + }, + { + "epoch": 0.45362318840579713, + "grad_norm": 6.952833218191655, + "learning_rate": 5.988037447938402e-07, + "loss": 0.3267, + "step": 2504 + }, + { + "epoch": 0.453804347826087, + "grad_norm": 7.3550149817202515, + "learning_rate": 5.985161256069039e-07, + "loss": 0.2859, + "step": 2505 + }, + { + "epoch": 0.4539855072463768, + "grad_norm": 7.708170270294903, + "learning_rate": 5.982284725003988e-07, + "loss": 0.2684, + "step": 2506 + }, + { + "epoch": 0.45416666666666666, + "grad_norm": 3.6160747934822424, + "learning_rate": 5.979407855733651e-07, + "loss": 0.2888, + "step": 2507 + }, + { + "epoch": 0.4543478260869565, + "grad_norm": 3.7935639790069198, + "learning_rate": 5.976530649248551e-07, + "loss": 0.3668, + "step": 2508 + }, + { + "epoch": 0.45452898550724635, + "grad_norm": 5.813067893712035, + "learning_rate": 5.973653106539318e-07, + "loss": 0.3615, + "step": 2509 + }, + { + "epoch": 0.45471014492753625, + "grad_norm": 3.457242965645761, + "learning_rate": 5.970775228596708e-07, + "loss": 0.2869, + "step": 2510 + }, + { + "epoch": 0.4548913043478261, + "grad_norm": 3.3273154708971755, + "learning_rate": 5.967897016411589e-07, + "loss": 0.2926, + "step": 2511 + }, + { + "epoch": 0.45507246376811594, + "grad_norm": 4.769305794513921, + "learning_rate": 5.965018470974941e-07, + "loss": 0.3344, + "step": 2512 + }, + { + "epoch": 0.4552536231884058, + "grad_norm": 3.723894121100625, + "learning_rate": 5.96213959327786e-07, + "loss": 0.2911, + "step": 2513 + }, + { + "epoch": 0.45543478260869563, + "grad_norm": 4.444093860444105, + "learning_rate": 5.959260384311559e-07, + "loss": 0.3074, + "step": 2514 + }, + { + "epoch": 0.45561594202898553, + "grad_norm": 3.7273155036014654, + "learning_rate": 5.956380845067361e-07, + "loss": 0.2979, + "step": 2515 + }, + { + "epoch": 0.4557971014492754, + "grad_norm": 9.428175187359235, + "learning_rate": 5.95350097653671e-07, + "loss": 0.2889, + "step": 2516 + }, + { + "epoch": 0.4559782608695652, + "grad_norm": 5.184841240186356, + "learning_rate": 5.950620779711152e-07, + "loss": 0.3802, + "step": 2517 + }, + { + "epoch": 0.45615942028985507, + "grad_norm": 6.6449863043115505, + "learning_rate": 5.947740255582355e-07, + "loss": 0.3157, + "step": 2518 + }, + { + "epoch": 0.4563405797101449, + "grad_norm": 6.853326562915106, + "learning_rate": 5.944859405142101e-07, + "loss": 0.3018, + "step": 2519 + }, + { + "epoch": 0.45652173913043476, + "grad_norm": 4.965577971452306, + "learning_rate": 5.941978229382274e-07, + "loss": 0.2693, + "step": 2520 + }, + { + "epoch": 0.45670289855072466, + "grad_norm": 3.708603031043961, + "learning_rate": 5.93909672929488e-07, + "loss": 0.2856, + "step": 2521 + }, + { + "epoch": 0.4568840579710145, + "grad_norm": 3.220568144515752, + "learning_rate": 5.936214905872032e-07, + "loss": 0.3139, + "step": 2522 + }, + { + "epoch": 0.45706521739130435, + "grad_norm": 4.092866477931799, + "learning_rate": 5.933332760105956e-07, + "loss": 0.3007, + "step": 2523 + }, + { + "epoch": 0.4572463768115942, + "grad_norm": 4.225827040897612, + "learning_rate": 5.930450292988991e-07, + "loss": 0.301, + "step": 2524 + }, + { + "epoch": 0.45742753623188404, + "grad_norm": 3.3557582860384136, + "learning_rate": 5.92756750551358e-07, + "loss": 0.366, + "step": 2525 + }, + { + "epoch": 0.45760869565217394, + "grad_norm": 5.518169821067675, + "learning_rate": 5.924684398672281e-07, + "loss": 0.3443, + "step": 2526 + }, + { + "epoch": 0.4577898550724638, + "grad_norm": 3.785659288877862, + "learning_rate": 5.921800973457764e-07, + "loss": 0.2654, + "step": 2527 + }, + { + "epoch": 0.4579710144927536, + "grad_norm": 4.009671498013421, + "learning_rate": 5.918917230862803e-07, + "loss": 0.307, + "step": 2528 + }, + { + "epoch": 0.45815217391304347, + "grad_norm": 4.9690540897524, + "learning_rate": 5.916033171880284e-07, + "loss": 0.3379, + "step": 2529 + }, + { + "epoch": 0.4583333333333333, + "grad_norm": 9.41697339851124, + "learning_rate": 5.913148797503208e-07, + "loss": 0.3092, + "step": 2530 + }, + { + "epoch": 0.4585144927536232, + "grad_norm": 9.47462224952637, + "learning_rate": 5.91026410872467e-07, + "loss": 0.3121, + "step": 2531 + }, + { + "epoch": 0.45869565217391306, + "grad_norm": 5.690768341547824, + "learning_rate": 5.907379106537889e-07, + "loss": 0.2885, + "step": 2532 + }, + { + "epoch": 0.4588768115942029, + "grad_norm": 4.628047691863953, + "learning_rate": 5.904493791936183e-07, + "loss": 0.354, + "step": 2533 + }, + { + "epoch": 0.45905797101449275, + "grad_norm": 4.848851639431173, + "learning_rate": 5.901608165912976e-07, + "loss": 0.2851, + "step": 2534 + }, + { + "epoch": 0.4592391304347826, + "grad_norm": 5.1550511103204135, + "learning_rate": 5.898722229461809e-07, + "loss": 0.3049, + "step": 2535 + }, + { + "epoch": 0.45942028985507244, + "grad_norm": 4.297388301767464, + "learning_rate": 5.895835983576319e-07, + "loss": 0.2299, + "step": 2536 + }, + { + "epoch": 0.45960144927536234, + "grad_norm": 5.199239870062037, + "learning_rate": 5.892949429250253e-07, + "loss": 0.3433, + "step": 2537 + }, + { + "epoch": 0.4597826086956522, + "grad_norm": 7.774860038661607, + "learning_rate": 5.89006256747747e-07, + "loss": 0.3519, + "step": 2538 + }, + { + "epoch": 0.45996376811594203, + "grad_norm": 4.235904750735014, + "learning_rate": 5.887175399251927e-07, + "loss": 0.3299, + "step": 2539 + }, + { + "epoch": 0.4601449275362319, + "grad_norm": 4.407687118864259, + "learning_rate": 5.88428792556769e-07, + "loss": 0.2474, + "step": 2540 + }, + { + "epoch": 0.4603260869565217, + "grad_norm": 9.510498571741651, + "learning_rate": 5.881400147418931e-07, + "loss": 0.3548, + "step": 2541 + }, + { + "epoch": 0.4605072463768116, + "grad_norm": 5.929562229013172, + "learning_rate": 5.878512065799925e-07, + "loss": 0.2903, + "step": 2542 + }, + { + "epoch": 0.46068840579710146, + "grad_norm": 5.254605174575904, + "learning_rate": 5.875623681705053e-07, + "loss": 0.2956, + "step": 2543 + }, + { + "epoch": 0.4608695652173913, + "grad_norm": 4.496975943263152, + "learning_rate": 5.872734996128798e-07, + "loss": 0.3078, + "step": 2544 + }, + { + "epoch": 0.46105072463768115, + "grad_norm": 3.2626975670614464, + "learning_rate": 5.869846010065748e-07, + "loss": 0.2976, + "step": 2545 + }, + { + "epoch": 0.461231884057971, + "grad_norm": 4.7579484396135285, + "learning_rate": 5.866956724510597e-07, + "loss": 0.318, + "step": 2546 + }, + { + "epoch": 0.46141304347826084, + "grad_norm": 4.759287212212859, + "learning_rate": 5.864067140458136e-07, + "loss": 0.2473, + "step": 2547 + }, + { + "epoch": 0.46159420289855074, + "grad_norm": 3.8593186038718255, + "learning_rate": 5.861177258903266e-07, + "loss": 0.2975, + "step": 2548 + }, + { + "epoch": 0.4617753623188406, + "grad_norm": 4.504714612409603, + "learning_rate": 5.858287080840984e-07, + "loss": 0.3153, + "step": 2549 + }, + { + "epoch": 0.46195652173913043, + "grad_norm": 5.811938849443222, + "learning_rate": 5.855396607266395e-07, + "loss": 0.282, + "step": 2550 + }, + { + "epoch": 0.4621376811594203, + "grad_norm": 3.82343735772419, + "learning_rate": 5.852505839174701e-07, + "loss": 0.2822, + "step": 2551 + }, + { + "epoch": 0.4623188405797101, + "grad_norm": 6.042649419091215, + "learning_rate": 5.849614777561207e-07, + "loss": 0.2831, + "step": 2552 + }, + { + "epoch": 0.4625, + "grad_norm": 3.3205333473392162, + "learning_rate": 5.846723423421318e-07, + "loss": 0.2746, + "step": 2553 + }, + { + "epoch": 0.46268115942028987, + "grad_norm": 3.068165194143659, + "learning_rate": 5.843831777750546e-07, + "loss": 0.2639, + "step": 2554 + }, + { + "epoch": 0.4628623188405797, + "grad_norm": 3.077256674128147, + "learning_rate": 5.840939841544491e-07, + "loss": 0.2482, + "step": 2555 + }, + { + "epoch": 0.46304347826086956, + "grad_norm": 4.5863200275723495, + "learning_rate": 5.838047615798865e-07, + "loss": 0.354, + "step": 2556 + }, + { + "epoch": 0.4632246376811594, + "grad_norm": 5.246053590489951, + "learning_rate": 5.835155101509476e-07, + "loss": 0.2958, + "step": 2557 + }, + { + "epoch": 0.4634057971014493, + "grad_norm": 3.927843424714349, + "learning_rate": 5.832262299672226e-07, + "loss": 0.33, + "step": 2558 + }, + { + "epoch": 0.46358695652173915, + "grad_norm": 6.396070289297395, + "learning_rate": 5.829369211283125e-07, + "loss": 0.2932, + "step": 2559 + }, + { + "epoch": 0.463768115942029, + "grad_norm": 4.140828888321739, + "learning_rate": 5.826475837338274e-07, + "loss": 0.2795, + "step": 2560 + }, + { + "epoch": 0.46394927536231884, + "grad_norm": 6.938652882119639, + "learning_rate": 5.823582178833876e-07, + "loss": 0.2945, + "step": 2561 + }, + { + "epoch": 0.4641304347826087, + "grad_norm": 11.615666670262163, + "learning_rate": 5.820688236766232e-07, + "loss": 0.2376, + "step": 2562 + }, + { + "epoch": 0.4643115942028985, + "grad_norm": 4.953730084152967, + "learning_rate": 5.817794012131741e-07, + "loss": 0.3398, + "step": 2563 + }, + { + "epoch": 0.4644927536231884, + "grad_norm": 3.6463775335092397, + "learning_rate": 5.814899505926893e-07, + "loss": 0.2933, + "step": 2564 + }, + { + "epoch": 0.46467391304347827, + "grad_norm": 5.865073224835817, + "learning_rate": 5.812004719148288e-07, + "loss": 0.3032, + "step": 2565 + }, + { + "epoch": 0.4648550724637681, + "grad_norm": 3.495056803371781, + "learning_rate": 5.809109652792608e-07, + "loss": 0.2777, + "step": 2566 + }, + { + "epoch": 0.46503623188405796, + "grad_norm": 3.8229105722497416, + "learning_rate": 5.806214307856643e-07, + "loss": 0.3315, + "step": 2567 + }, + { + "epoch": 0.4652173913043478, + "grad_norm": 4.098903819095269, + "learning_rate": 5.803318685337271e-07, + "loss": 0.2664, + "step": 2568 + }, + { + "epoch": 0.4653985507246377, + "grad_norm": 3.461546788856099, + "learning_rate": 5.800422786231469e-07, + "loss": 0.3271, + "step": 2569 + }, + { + "epoch": 0.46557971014492755, + "grad_norm": 3.8846026275711814, + "learning_rate": 5.797526611536311e-07, + "loss": 0.3003, + "step": 2570 + }, + { + "epoch": 0.4657608695652174, + "grad_norm": 3.4201127324348732, + "learning_rate": 5.79463016224896e-07, + "loss": 0.2542, + "step": 2571 + }, + { + "epoch": 0.46594202898550724, + "grad_norm": 4.100625254758706, + "learning_rate": 5.79173343936668e-07, + "loss": 0.3068, + "step": 2572 + }, + { + "epoch": 0.4661231884057971, + "grad_norm": 7.772267251134122, + "learning_rate": 5.788836443886825e-07, + "loss": 0.2599, + "step": 2573 + }, + { + "epoch": 0.46630434782608693, + "grad_norm": 4.882826856368822, + "learning_rate": 5.785939176806845e-07, + "loss": 0.3243, + "step": 2574 + }, + { + "epoch": 0.46648550724637683, + "grad_norm": 3.2698182786469556, + "learning_rate": 5.783041639124282e-07, + "loss": 0.2491, + "step": 2575 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 5.122200296028673, + "learning_rate": 5.780143831836774e-07, + "loss": 0.2838, + "step": 2576 + }, + { + "epoch": 0.4668478260869565, + "grad_norm": 4.605204533068787, + "learning_rate": 5.777245755942047e-07, + "loss": 0.2925, + "step": 2577 + }, + { + "epoch": 0.46702898550724636, + "grad_norm": 3.225029378647811, + "learning_rate": 5.774347412437924e-07, + "loss": 0.2727, + "step": 2578 + }, + { + "epoch": 0.4672101449275362, + "grad_norm": 5.430634093165232, + "learning_rate": 5.771448802322319e-07, + "loss": 0.3149, + "step": 2579 + }, + { + "epoch": 0.4673913043478261, + "grad_norm": 4.743721177857558, + "learning_rate": 5.768549926593234e-07, + "loss": 0.3106, + "step": 2580 + }, + { + "epoch": 0.46757246376811595, + "grad_norm": 4.010257606889404, + "learning_rate": 5.765650786248769e-07, + "loss": 0.323, + "step": 2581 + }, + { + "epoch": 0.4677536231884058, + "grad_norm": 3.925544492714965, + "learning_rate": 5.76275138228711e-07, + "loss": 0.299, + "step": 2582 + }, + { + "epoch": 0.46793478260869564, + "grad_norm": 3.6602035527498558, + "learning_rate": 5.759851715706536e-07, + "loss": 0.2757, + "step": 2583 + }, + { + "epoch": 0.4681159420289855, + "grad_norm": 5.012508221820364, + "learning_rate": 5.756951787505417e-07, + "loss": 0.3045, + "step": 2584 + }, + { + "epoch": 0.4682971014492754, + "grad_norm": 6.155301065533482, + "learning_rate": 5.754051598682212e-07, + "loss": 0.4303, + "step": 2585 + }, + { + "epoch": 0.46847826086956523, + "grad_norm": 3.053830558166979, + "learning_rate": 5.751151150235467e-07, + "loss": 0.2546, + "step": 2586 + }, + { + "epoch": 0.4686594202898551, + "grad_norm": 4.508932676087459, + "learning_rate": 5.748250443163823e-07, + "loss": 0.3448, + "step": 2587 + }, + { + "epoch": 0.4688405797101449, + "grad_norm": 3.57702351859665, + "learning_rate": 5.745349478466004e-07, + "loss": 0.2932, + "step": 2588 + }, + { + "epoch": 0.46902173913043477, + "grad_norm": 4.067593840822912, + "learning_rate": 5.742448257140831e-07, + "loss": 0.3339, + "step": 2589 + }, + { + "epoch": 0.4692028985507246, + "grad_norm": 8.400370210807065, + "learning_rate": 5.739546780187202e-07, + "loss": 0.3749, + "step": 2590 + }, + { + "epoch": 0.4693840579710145, + "grad_norm": 5.184628643464647, + "learning_rate": 5.736645048604115e-07, + "loss": 0.3138, + "step": 2591 + }, + { + "epoch": 0.46956521739130436, + "grad_norm": 4.549921201725895, + "learning_rate": 5.733743063390647e-07, + "loss": 0.2975, + "step": 2592 + }, + { + "epoch": 0.4697463768115942, + "grad_norm": 6.4597389096253, + "learning_rate": 5.730840825545965e-07, + "loss": 0.3349, + "step": 2593 + }, + { + "epoch": 0.46992753623188405, + "grad_norm": 5.414429841894115, + "learning_rate": 5.727938336069325e-07, + "loss": 0.3463, + "step": 2594 + }, + { + "epoch": 0.4701086956521739, + "grad_norm": 6.182784881613846, + "learning_rate": 5.725035595960066e-07, + "loss": 0.3101, + "step": 2595 + }, + { + "epoch": 0.4702898550724638, + "grad_norm": 7.2311545179767425, + "learning_rate": 5.722132606217616e-07, + "loss": 0.3177, + "step": 2596 + }, + { + "epoch": 0.47047101449275364, + "grad_norm": 10.54098617581661, + "learning_rate": 5.719229367841489e-07, + "loss": 0.3, + "step": 2597 + }, + { + "epoch": 0.4706521739130435, + "grad_norm": 6.62541623108199, + "learning_rate": 5.716325881831282e-07, + "loss": 0.3197, + "step": 2598 + }, + { + "epoch": 0.4708333333333333, + "grad_norm": 4.690670094677277, + "learning_rate": 5.71342214918668e-07, + "loss": 0.2814, + "step": 2599 + }, + { + "epoch": 0.47101449275362317, + "grad_norm": 3.5949207240586953, + "learning_rate": 5.710518170907452e-07, + "loss": 0.2644, + "step": 2600 + }, + { + "epoch": 0.47101449275362317, + "eval_loss": 0.310484379529953, + "eval_runtime": 9.849, + "eval_samples_per_second": 50.767, + "eval_steps_per_second": 0.102, + "step": 2600 + }, + { + "epoch": 0.47119565217391307, + "grad_norm": 3.595624114374465, + "learning_rate": 5.707613947993451e-07, + "loss": 0.2267, + "step": 2601 + }, + { + "epoch": 0.4713768115942029, + "grad_norm": 5.846735160575425, + "learning_rate": 5.704709481444615e-07, + "loss": 0.2739, + "step": 2602 + }, + { + "epoch": 0.47155797101449276, + "grad_norm": 4.312101455072667, + "learning_rate": 5.701804772260967e-07, + "loss": 0.2902, + "step": 2603 + }, + { + "epoch": 0.4717391304347826, + "grad_norm": 5.677494926181608, + "learning_rate": 5.698899821442608e-07, + "loss": 0.2375, + "step": 2604 + }, + { + "epoch": 0.47192028985507245, + "grad_norm": 4.710343177338405, + "learning_rate": 5.695994629989731e-07, + "loss": 0.3088, + "step": 2605 + }, + { + "epoch": 0.4721014492753623, + "grad_norm": 6.474847578010311, + "learning_rate": 5.693089198902605e-07, + "loss": 0.2696, + "step": 2606 + }, + { + "epoch": 0.4722826086956522, + "grad_norm": 7.754153874184792, + "learning_rate": 5.690183529181583e-07, + "loss": 0.324, + "step": 2607 + }, + { + "epoch": 0.47246376811594204, + "grad_norm": 5.39865584055263, + "learning_rate": 5.687277621827104e-07, + "loss": 0.2482, + "step": 2608 + }, + { + "epoch": 0.4726449275362319, + "grad_norm": 7.302495389596663, + "learning_rate": 5.68437147783968e-07, + "loss": 0.2247, + "step": 2609 + }, + { + "epoch": 0.47282608695652173, + "grad_norm": 5.040916731038546, + "learning_rate": 5.681465098219915e-07, + "loss": 0.2831, + "step": 2610 + }, + { + "epoch": 0.4730072463768116, + "grad_norm": 6.064501599866838, + "learning_rate": 5.678558483968489e-07, + "loss": 0.2481, + "step": 2611 + }, + { + "epoch": 0.4731884057971015, + "grad_norm": 3.573436779516617, + "learning_rate": 5.675651636086162e-07, + "loss": 0.2868, + "step": 2612 + }, + { + "epoch": 0.4733695652173913, + "grad_norm": 9.570513209943377, + "learning_rate": 5.672744555573774e-07, + "loss": 0.3184, + "step": 2613 + }, + { + "epoch": 0.47355072463768116, + "grad_norm": 8.493863799379548, + "learning_rate": 5.669837243432248e-07, + "loss": 0.2867, + "step": 2614 + }, + { + "epoch": 0.473731884057971, + "grad_norm": 5.503225370995322, + "learning_rate": 5.666929700662584e-07, + "loss": 0.295, + "step": 2615 + }, + { + "epoch": 0.47391304347826085, + "grad_norm": 11.850025654481273, + "learning_rate": 5.664021928265868e-07, + "loss": 0.4279, + "step": 2616 + }, + { + "epoch": 0.4740942028985507, + "grad_norm": 4.6253460668145925, + "learning_rate": 5.661113927243253e-07, + "loss": 0.2358, + "step": 2617 + }, + { + "epoch": 0.4742753623188406, + "grad_norm": 6.264961553023649, + "learning_rate": 5.658205698595982e-07, + "loss": 0.2819, + "step": 2618 + }, + { + "epoch": 0.47445652173913044, + "grad_norm": 4.075884242549915, + "learning_rate": 5.655297243325368e-07, + "loss": 0.2966, + "step": 2619 + }, + { + "epoch": 0.4746376811594203, + "grad_norm": 6.867910903811688, + "learning_rate": 5.65238856243281e-07, + "loss": 0.381, + "step": 2620 + }, + { + "epoch": 0.47481884057971013, + "grad_norm": 3.883815606795304, + "learning_rate": 5.64947965691978e-07, + "loss": 0.2253, + "step": 2621 + }, + { + "epoch": 0.475, + "grad_norm": 4.537925629994537, + "learning_rate": 5.646570527787826e-07, + "loss": 0.3035, + "step": 2622 + }, + { + "epoch": 0.4751811594202899, + "grad_norm": 6.288214030219309, + "learning_rate": 5.643661176038574e-07, + "loss": 0.3158, + "step": 2623 + }, + { + "epoch": 0.4753623188405797, + "grad_norm": 5.479975975318324, + "learning_rate": 5.640751602673731e-07, + "loss": 0.2769, + "step": 2624 + }, + { + "epoch": 0.47554347826086957, + "grad_norm": 5.212645691924527, + "learning_rate": 5.637841808695074e-07, + "loss": 0.3199, + "step": 2625 + }, + { + "epoch": 0.4757246376811594, + "grad_norm": 4.169349248093498, + "learning_rate": 5.634931795104461e-07, + "loss": 0.2965, + "step": 2626 + }, + { + "epoch": 0.47590579710144926, + "grad_norm": 4.996569224274965, + "learning_rate": 5.632021562903822e-07, + "loss": 0.2849, + "step": 2627 + }, + { + "epoch": 0.47608695652173916, + "grad_norm": 7.0275331389235305, + "learning_rate": 5.629111113095166e-07, + "loss": 0.291, + "step": 2628 + }, + { + "epoch": 0.476268115942029, + "grad_norm": 4.8035532703178, + "learning_rate": 5.626200446680571e-07, + "loss": 0.3569, + "step": 2629 + }, + { + "epoch": 0.47644927536231885, + "grad_norm": 13.915672788863972, + "learning_rate": 5.623289564662195e-07, + "loss": 0.3025, + "step": 2630 + }, + { + "epoch": 0.4766304347826087, + "grad_norm": 14.618706013861567, + "learning_rate": 5.62037846804227e-07, + "loss": 0.3708, + "step": 2631 + }, + { + "epoch": 0.47681159420289854, + "grad_norm": 15.877577208065015, + "learning_rate": 5.617467157823099e-07, + "loss": 0.382, + "step": 2632 + }, + { + "epoch": 0.4769927536231884, + "grad_norm": 10.108425075493251, + "learning_rate": 5.61455563500706e-07, + "loss": 0.2957, + "step": 2633 + }, + { + "epoch": 0.4771739130434783, + "grad_norm": 3.492586410796111, + "learning_rate": 5.611643900596604e-07, + "loss": 0.3059, + "step": 2634 + }, + { + "epoch": 0.4773550724637681, + "grad_norm": 4.880506232723462, + "learning_rate": 5.608731955594255e-07, + "loss": 0.2765, + "step": 2635 + }, + { + "epoch": 0.47753623188405797, + "grad_norm": 2.938274308863043, + "learning_rate": 5.605819801002608e-07, + "loss": 0.2414, + "step": 2636 + }, + { + "epoch": 0.4777173913043478, + "grad_norm": 5.748849209465617, + "learning_rate": 5.602907437824335e-07, + "loss": 0.3214, + "step": 2637 + }, + { + "epoch": 0.47789855072463766, + "grad_norm": 7.629622573469101, + "learning_rate": 5.599994867062173e-07, + "loss": 0.31, + "step": 2638 + }, + { + "epoch": 0.47807971014492756, + "grad_norm": 4.807859565732472, + "learning_rate": 5.597082089718937e-07, + "loss": 0.3092, + "step": 2639 + }, + { + "epoch": 0.4782608695652174, + "grad_norm": 5.942727586030775, + "learning_rate": 5.594169106797507e-07, + "loss": 0.3688, + "step": 2640 + }, + { + "epoch": 0.47844202898550725, + "grad_norm": 12.050696095031252, + "learning_rate": 5.591255919300839e-07, + "loss": 0.3134, + "step": 2641 + }, + { + "epoch": 0.4786231884057971, + "grad_norm": 5.415935043959021, + "learning_rate": 5.588342528231956e-07, + "loss": 0.3516, + "step": 2642 + }, + { + "epoch": 0.47880434782608694, + "grad_norm": 3.415654005211583, + "learning_rate": 5.585428934593954e-07, + "loss": 0.2853, + "step": 2643 + }, + { + "epoch": 0.4789855072463768, + "grad_norm": 9.16039898259828, + "learning_rate": 5.582515139389995e-07, + "loss": 0.3593, + "step": 2644 + }, + { + "epoch": 0.4791666666666667, + "grad_norm": 6.434642097986956, + "learning_rate": 5.579601143623314e-07, + "loss": 0.3035, + "step": 2645 + }, + { + "epoch": 0.47934782608695653, + "grad_norm": 13.360594188126193, + "learning_rate": 5.576686948297213e-07, + "loss": 0.3365, + "step": 2646 + }, + { + "epoch": 0.4795289855072464, + "grad_norm": 9.596711866869823, + "learning_rate": 5.573772554415064e-07, + "loss": 0.2995, + "step": 2647 + }, + { + "epoch": 0.4797101449275362, + "grad_norm": 9.047410781720341, + "learning_rate": 5.570857962980305e-07, + "loss": 0.2821, + "step": 2648 + }, + { + "epoch": 0.47989130434782606, + "grad_norm": 7.026168666018671, + "learning_rate": 5.567943174996444e-07, + "loss": 0.2991, + "step": 2649 + }, + { + "epoch": 0.48007246376811596, + "grad_norm": 3.7208179823506784, + "learning_rate": 5.565028191467057e-07, + "loss": 0.3339, + "step": 2650 + }, + { + "epoch": 0.4802536231884058, + "grad_norm": 10.732073571208474, + "learning_rate": 5.562113013395789e-07, + "loss": 0.3353, + "step": 2651 + }, + { + "epoch": 0.48043478260869565, + "grad_norm": 3.7077811782930774, + "learning_rate": 5.559197641786344e-07, + "loss": 0.3303, + "step": 2652 + }, + { + "epoch": 0.4806159420289855, + "grad_norm": 4.754087645243929, + "learning_rate": 5.556282077642504e-07, + "loss": 0.2982, + "step": 2653 + }, + { + "epoch": 0.48079710144927534, + "grad_norm": 7.1514453187911, + "learning_rate": 5.553366321968107e-07, + "loss": 0.2945, + "step": 2654 + }, + { + "epoch": 0.48097826086956524, + "grad_norm": 6.644085111264358, + "learning_rate": 5.550450375767064e-07, + "loss": 0.2902, + "step": 2655 + }, + { + "epoch": 0.4811594202898551, + "grad_norm": 7.535968497184428, + "learning_rate": 5.547534240043349e-07, + "loss": 0.3392, + "step": 2656 + }, + { + "epoch": 0.48134057971014493, + "grad_norm": 3.5069523855932556, + "learning_rate": 5.544617915801e-07, + "loss": 0.2854, + "step": 2657 + }, + { + "epoch": 0.4815217391304348, + "grad_norm": 6.992167105541255, + "learning_rate": 5.541701404044122e-07, + "loss": 0.3161, + "step": 2658 + }, + { + "epoch": 0.4817028985507246, + "grad_norm": 4.306855989262285, + "learning_rate": 5.538784705776886e-07, + "loss": 0.2746, + "step": 2659 + }, + { + "epoch": 0.48188405797101447, + "grad_norm": 3.849789750717037, + "learning_rate": 5.535867822003521e-07, + "loss": 0.3391, + "step": 2660 + }, + { + "epoch": 0.48206521739130437, + "grad_norm": 5.490840859226864, + "learning_rate": 5.532950753728325e-07, + "loss": 0.2808, + "step": 2661 + }, + { + "epoch": 0.4822463768115942, + "grad_norm": 8.980019169542674, + "learning_rate": 5.530033501955662e-07, + "loss": 0.2883, + "step": 2662 + }, + { + "epoch": 0.48242753623188406, + "grad_norm": 2.9911500066400216, + "learning_rate": 5.527116067689951e-07, + "loss": 0.2689, + "step": 2663 + }, + { + "epoch": 0.4826086956521739, + "grad_norm": 4.509846198833672, + "learning_rate": 5.524198451935682e-07, + "loss": 0.2955, + "step": 2664 + }, + { + "epoch": 0.48278985507246375, + "grad_norm": 5.2834549244991305, + "learning_rate": 5.5212806556974e-07, + "loss": 0.338, + "step": 2665 + }, + { + "epoch": 0.48297101449275365, + "grad_norm": 3.817141454906933, + "learning_rate": 5.51836267997972e-07, + "loss": 0.3278, + "step": 2666 + }, + { + "epoch": 0.4831521739130435, + "grad_norm": 7.783635165410847, + "learning_rate": 5.515444525787313e-07, + "loss": 0.3436, + "step": 2667 + }, + { + "epoch": 0.48333333333333334, + "grad_norm": 8.629770650834422, + "learning_rate": 5.512526194124914e-07, + "loss": 0.3251, + "step": 2668 + }, + { + "epoch": 0.4835144927536232, + "grad_norm": 4.83390122724991, + "learning_rate": 5.509607685997316e-07, + "loss": 0.3123, + "step": 2669 + }, + { + "epoch": 0.483695652173913, + "grad_norm": 4.894794140233111, + "learning_rate": 5.50668900240938e-07, + "loss": 0.2605, + "step": 2670 + }, + { + "epoch": 0.48387681159420287, + "grad_norm": 3.4730321680561484, + "learning_rate": 5.503770144366018e-07, + "loss": 0.2723, + "step": 2671 + }, + { + "epoch": 0.48405797101449277, + "grad_norm": 3.0638806487271766, + "learning_rate": 5.500851112872207e-07, + "loss": 0.2955, + "step": 2672 + }, + { + "epoch": 0.4842391304347826, + "grad_norm": 3.7593535571090975, + "learning_rate": 5.497931908932988e-07, + "loss": 0.3083, + "step": 2673 + }, + { + "epoch": 0.48442028985507246, + "grad_norm": 11.062331823031327, + "learning_rate": 5.495012533553452e-07, + "loss": 0.3174, + "step": 2674 + }, + { + "epoch": 0.4846014492753623, + "grad_norm": 5.575073713322108, + "learning_rate": 5.492092987738756e-07, + "loss": 0.2735, + "step": 2675 + }, + { + "epoch": 0.48478260869565215, + "grad_norm": 6.716159322376339, + "learning_rate": 5.489173272494112e-07, + "loss": 0.3295, + "step": 2676 + }, + { + "epoch": 0.48496376811594205, + "grad_norm": 3.8029670925725387, + "learning_rate": 5.486253388824791e-07, + "loss": 0.3206, + "step": 2677 + }, + { + "epoch": 0.4851449275362319, + "grad_norm": 7.328418064770815, + "learning_rate": 5.483333337736127e-07, + "loss": 0.2246, + "step": 2678 + }, + { + "epoch": 0.48532608695652174, + "grad_norm": 4.1349575904903295, + "learning_rate": 5.480413120233503e-07, + "loss": 0.3096, + "step": 2679 + }, + { + "epoch": 0.4855072463768116, + "grad_norm": 7.1317055532675155, + "learning_rate": 5.477492737322366e-07, + "loss": 0.3323, + "step": 2680 + }, + { + "epoch": 0.48568840579710143, + "grad_norm": 5.435014203979758, + "learning_rate": 5.474572190008217e-07, + "loss": 0.2565, + "step": 2681 + }, + { + "epoch": 0.48586956521739133, + "grad_norm": 4.490403980577365, + "learning_rate": 5.471651479296616e-07, + "loss": 0.2511, + "step": 2682 + }, + { + "epoch": 0.4860507246376812, + "grad_norm": 3.496916476105908, + "learning_rate": 5.468730606193174e-07, + "loss": 0.297, + "step": 2683 + }, + { + "epoch": 0.486231884057971, + "grad_norm": 4.435615242357741, + "learning_rate": 5.465809571703564e-07, + "loss": 0.3972, + "step": 2684 + }, + { + "epoch": 0.48641304347826086, + "grad_norm": 4.05157110261667, + "learning_rate": 5.462888376833509e-07, + "loss": 0.2606, + "step": 2685 + }, + { + "epoch": 0.4865942028985507, + "grad_norm": 8.997093826612808, + "learning_rate": 5.459967022588797e-07, + "loss": 0.3108, + "step": 2686 + }, + { + "epoch": 0.48677536231884055, + "grad_norm": 6.653451282453933, + "learning_rate": 5.457045509975256e-07, + "loss": 0.3045, + "step": 2687 + }, + { + "epoch": 0.48695652173913045, + "grad_norm": 4.380012813074611, + "learning_rate": 5.454123839998784e-07, + "loss": 0.3346, + "step": 2688 + }, + { + "epoch": 0.4871376811594203, + "grad_norm": 5.08012002303374, + "learning_rate": 5.451202013665319e-07, + "loss": 0.2602, + "step": 2689 + }, + { + "epoch": 0.48731884057971014, + "grad_norm": 4.677514818883332, + "learning_rate": 5.448280031980865e-07, + "loss": 0.297, + "step": 2690 + }, + { + "epoch": 0.4875, + "grad_norm": 3.939079055787215, + "learning_rate": 5.445357895951471e-07, + "loss": 0.2489, + "step": 2691 + }, + { + "epoch": 0.48768115942028983, + "grad_norm": 3.6384946991565017, + "learning_rate": 5.442435606583243e-07, + "loss": 0.2826, + "step": 2692 + }, + { + "epoch": 0.48786231884057973, + "grad_norm": 3.4208001965138912, + "learning_rate": 5.439513164882339e-07, + "loss": 0.3057, + "step": 2693 + }, + { + "epoch": 0.4880434782608696, + "grad_norm": 4.241873975138517, + "learning_rate": 5.436590571854973e-07, + "loss": 0.3235, + "step": 2694 + }, + { + "epoch": 0.4882246376811594, + "grad_norm": 3.523296992085853, + "learning_rate": 5.4336678285074e-07, + "loss": 0.2848, + "step": 2695 + }, + { + "epoch": 0.48840579710144927, + "grad_norm": 3.9136402428348176, + "learning_rate": 5.430744935845941e-07, + "loss": 0.3137, + "step": 2696 + }, + { + "epoch": 0.4885869565217391, + "grad_norm": 4.002356665702272, + "learning_rate": 5.42782189487696e-07, + "loss": 0.2593, + "step": 2697 + }, + { + "epoch": 0.48876811594202896, + "grad_norm": 4.317955103919563, + "learning_rate": 5.424898706606874e-07, + "loss": 0.3594, + "step": 2698 + }, + { + "epoch": 0.48894927536231886, + "grad_norm": 3.321739899539395, + "learning_rate": 5.421975372042149e-07, + "loss": 0.2896, + "step": 2699 + }, + { + "epoch": 0.4891304347826087, + "grad_norm": 5.904949321834864, + "learning_rate": 5.419051892189305e-07, + "loss": 0.367, + "step": 2700 + }, + { + "epoch": 0.4891304347826087, + "eval_loss": 0.3039218783378601, + "eval_runtime": 9.7551, + "eval_samples_per_second": 51.255, + "eval_steps_per_second": 0.103, + "step": 2700 + }, + { + "epoch": 0.48931159420289855, + "grad_norm": 4.45579202305939, + "learning_rate": 5.416128268054907e-07, + "loss": 0.2626, + "step": 2701 + }, + { + "epoch": 0.4894927536231884, + "grad_norm": 6.308609420590981, + "learning_rate": 5.413204500645576e-07, + "loss": 0.2723, + "step": 2702 + }, + { + "epoch": 0.48967391304347824, + "grad_norm": 3.5778652091022485, + "learning_rate": 5.410280590967978e-07, + "loss": 0.3099, + "step": 2703 + }, + { + "epoch": 0.48985507246376814, + "grad_norm": 8.508044392263377, + "learning_rate": 5.407356540028828e-07, + "loss": 0.3034, + "step": 2704 + }, + { + "epoch": 0.490036231884058, + "grad_norm": 5.734873014248094, + "learning_rate": 5.404432348834892e-07, + "loss": 0.2523, + "step": 2705 + }, + { + "epoch": 0.4902173913043478, + "grad_norm": 4.525846647957305, + "learning_rate": 5.40150801839298e-07, + "loss": 0.2657, + "step": 2706 + }, + { + "epoch": 0.49039855072463767, + "grad_norm": 5.521035150051582, + "learning_rate": 5.398583549709957e-07, + "loss": 0.2695, + "step": 2707 + }, + { + "epoch": 0.4905797101449275, + "grad_norm": 4.882680052252439, + "learning_rate": 5.395658943792729e-07, + "loss": 0.2752, + "step": 2708 + }, + { + "epoch": 0.4907608695652174, + "grad_norm": 6.149246453969823, + "learning_rate": 5.392734201648251e-07, + "loss": 0.2829, + "step": 2709 + }, + { + "epoch": 0.49094202898550726, + "grad_norm": 4.624701767646452, + "learning_rate": 5.389809324283528e-07, + "loss": 0.3101, + "step": 2710 + }, + { + "epoch": 0.4911231884057971, + "grad_norm": 3.2727847471012192, + "learning_rate": 5.386884312705607e-07, + "loss": 0.29, + "step": 2711 + }, + { + "epoch": 0.49130434782608695, + "grad_norm": 4.455545016255246, + "learning_rate": 5.383959167921584e-07, + "loss": 0.4018, + "step": 2712 + }, + { + "epoch": 0.4914855072463768, + "grad_norm": 6.838452853084961, + "learning_rate": 5.3810338909386e-07, + "loss": 0.3067, + "step": 2713 + }, + { + "epoch": 0.49166666666666664, + "grad_norm": 7.775671180613394, + "learning_rate": 5.378108482763841e-07, + "loss": 0.3201, + "step": 2714 + }, + { + "epoch": 0.49184782608695654, + "grad_norm": 4.462140202625592, + "learning_rate": 5.375182944404542e-07, + "loss": 0.3137, + "step": 2715 + }, + { + "epoch": 0.4920289855072464, + "grad_norm": 4.687554638196434, + "learning_rate": 5.372257276867977e-07, + "loss": 0.2755, + "step": 2716 + }, + { + "epoch": 0.49221014492753623, + "grad_norm": 7.577957605634497, + "learning_rate": 5.369331481161468e-07, + "loss": 0.2793, + "step": 2717 + }, + { + "epoch": 0.4923913043478261, + "grad_norm": 4.467344434683192, + "learning_rate": 5.366405558292379e-07, + "loss": 0.266, + "step": 2718 + }, + { + "epoch": 0.4925724637681159, + "grad_norm": 4.072289834852524, + "learning_rate": 5.36347950926812e-07, + "loss": 0.3357, + "step": 2719 + }, + { + "epoch": 0.4927536231884058, + "grad_norm": 4.842289431907543, + "learning_rate": 5.360553335096144e-07, + "loss": 0.3132, + "step": 2720 + }, + { + "epoch": 0.49293478260869567, + "grad_norm": 4.192826672463043, + "learning_rate": 5.357627036783949e-07, + "loss": 0.2427, + "step": 2721 + }, + { + "epoch": 0.4931159420289855, + "grad_norm": 5.315459785000078, + "learning_rate": 5.354700615339067e-07, + "loss": 0.3047, + "step": 2722 + }, + { + "epoch": 0.49329710144927535, + "grad_norm": 7.950173376612212, + "learning_rate": 5.351774071769084e-07, + "loss": 0.2732, + "step": 2723 + }, + { + "epoch": 0.4934782608695652, + "grad_norm": 4.555633720750855, + "learning_rate": 5.348847407081622e-07, + "loss": 0.2816, + "step": 2724 + }, + { + "epoch": 0.4936594202898551, + "grad_norm": 6.59457212048181, + "learning_rate": 5.345920622284344e-07, + "loss": 0.2854, + "step": 2725 + }, + { + "epoch": 0.49384057971014494, + "grad_norm": 4.483224534729175, + "learning_rate": 5.342993718384958e-07, + "loss": 0.2434, + "step": 2726 + }, + { + "epoch": 0.4940217391304348, + "grad_norm": 8.111939583998177, + "learning_rate": 5.340066696391209e-07, + "loss": 0.3217, + "step": 2727 + }, + { + "epoch": 0.49420289855072463, + "grad_norm": 4.168379596516225, + "learning_rate": 5.337139557310883e-07, + "loss": 0.2947, + "step": 2728 + }, + { + "epoch": 0.4943840579710145, + "grad_norm": 3.7397990657870896, + "learning_rate": 5.334212302151814e-07, + "loss": 0.2913, + "step": 2729 + }, + { + "epoch": 0.4945652173913043, + "grad_norm": 3.537244043476381, + "learning_rate": 5.331284931921862e-07, + "loss": 0.2637, + "step": 2730 + }, + { + "epoch": 0.4947463768115942, + "grad_norm": 10.28278894777417, + "learning_rate": 5.328357447628941e-07, + "loss": 0.2918, + "step": 2731 + }, + { + "epoch": 0.49492753623188407, + "grad_norm": 4.893977234583997, + "learning_rate": 5.325429850280993e-07, + "loss": 0.3189, + "step": 2732 + }, + { + "epoch": 0.4951086956521739, + "grad_norm": 7.677152587780667, + "learning_rate": 5.322502140886007e-07, + "loss": 0.3112, + "step": 2733 + }, + { + "epoch": 0.49528985507246376, + "grad_norm": 7.244766671748888, + "learning_rate": 5.319574320452007e-07, + "loss": 0.2805, + "step": 2734 + }, + { + "epoch": 0.4954710144927536, + "grad_norm": 7.375741734044889, + "learning_rate": 5.316646389987052e-07, + "loss": 0.2301, + "step": 2735 + }, + { + "epoch": 0.4956521739130435, + "grad_norm": 14.714999307588787, + "learning_rate": 5.313718350499245e-07, + "loss": 0.3243, + "step": 2736 + }, + { + "epoch": 0.49583333333333335, + "grad_norm": 8.895261043189805, + "learning_rate": 5.310790202996723e-07, + "loss": 0.3119, + "step": 2737 + }, + { + "epoch": 0.4960144927536232, + "grad_norm": 4.718628149017444, + "learning_rate": 5.307861948487663e-07, + "loss": 0.3616, + "step": 2738 + }, + { + "epoch": 0.49619565217391304, + "grad_norm": 5.052879601517405, + "learning_rate": 5.304933587980274e-07, + "loss": 0.3123, + "step": 2739 + }, + { + "epoch": 0.4963768115942029, + "grad_norm": 5.537096583386094, + "learning_rate": 5.302005122482808e-07, + "loss": 0.34, + "step": 2740 + }, + { + "epoch": 0.4965579710144927, + "grad_norm": 5.390605984417862, + "learning_rate": 5.299076553003545e-07, + "loss": 0.2796, + "step": 2741 + }, + { + "epoch": 0.4967391304347826, + "grad_norm": 11.504103419548, + "learning_rate": 5.29614788055081e-07, + "loss": 0.2965, + "step": 2742 + }, + { + "epoch": 0.4969202898550725, + "grad_norm": 4.423570427125164, + "learning_rate": 5.293219106132956e-07, + "loss": 0.3015, + "step": 2743 + }, + { + "epoch": 0.4971014492753623, + "grad_norm": 3.7265284481310266, + "learning_rate": 5.290290230758373e-07, + "loss": 0.3188, + "step": 2744 + }, + { + "epoch": 0.49728260869565216, + "grad_norm": 3.7518614117350824, + "learning_rate": 5.28736125543549e-07, + "loss": 0.2914, + "step": 2745 + }, + { + "epoch": 0.497463768115942, + "grad_norm": 3.5353363783121328, + "learning_rate": 5.284432181172763e-07, + "loss": 0.2986, + "step": 2746 + }, + { + "epoch": 0.4976449275362319, + "grad_norm": 8.074210758954466, + "learning_rate": 5.281503008978689e-07, + "loss": 0.285, + "step": 2747 + }, + { + "epoch": 0.49782608695652175, + "grad_norm": 4.265055425624807, + "learning_rate": 5.278573739861798e-07, + "loss": 0.2608, + "step": 2748 + }, + { + "epoch": 0.4980072463768116, + "grad_norm": 4.28218710607402, + "learning_rate": 5.275644374830645e-07, + "loss": 0.3041, + "step": 2749 + }, + { + "epoch": 0.49818840579710144, + "grad_norm": 8.163648954334318, + "learning_rate": 5.272714914893829e-07, + "loss": 0.301, + "step": 2750 + }, + { + "epoch": 0.4983695652173913, + "grad_norm": 4.2666614306728885, + "learning_rate": 5.269785361059976e-07, + "loss": 0.3238, + "step": 2751 + }, + { + "epoch": 0.4985507246376812, + "grad_norm": 9.276078803223166, + "learning_rate": 5.266855714337745e-07, + "loss": 0.3673, + "step": 2752 + }, + { + "epoch": 0.49873188405797103, + "grad_norm": 4.20801313210396, + "learning_rate": 5.263925975735826e-07, + "loss": 0.2893, + "step": 2753 + }, + { + "epoch": 0.4989130434782609, + "grad_norm": 4.800150450879598, + "learning_rate": 5.260996146262944e-07, + "loss": 0.311, + "step": 2754 + }, + { + "epoch": 0.4990942028985507, + "grad_norm": 6.266777583724696, + "learning_rate": 5.258066226927851e-07, + "loss": 0.2688, + "step": 2755 + }, + { + "epoch": 0.49927536231884057, + "grad_norm": 6.216070769103604, + "learning_rate": 5.255136218739336e-07, + "loss": 0.3215, + "step": 2756 + }, + { + "epoch": 0.4994565217391304, + "grad_norm": 3.6440029964968574, + "learning_rate": 5.25220612270621e-07, + "loss": 0.284, + "step": 2757 + }, + { + "epoch": 0.4996376811594203, + "grad_norm": 7.325266596129861, + "learning_rate": 5.249275939837321e-07, + "loss": 0.2799, + "step": 2758 + }, + { + "epoch": 0.49981884057971016, + "grad_norm": 8.400060048053327, + "learning_rate": 5.246345671141546e-07, + "loss": 0.3279, + "step": 2759 + }, + { + "epoch": 0.5, + "grad_norm": 6.1671206027410035, + "learning_rate": 5.243415317627788e-07, + "loss": 0.3204, + "step": 2760 + }, + { + "epoch": 0.5001811594202898, + "grad_norm": 8.764162152695576, + "learning_rate": 5.240484880304983e-07, + "loss": 0.3597, + "step": 2761 + }, + { + "epoch": 0.5003623188405797, + "grad_norm": 3.8247283686088114, + "learning_rate": 5.237554360182095e-07, + "loss": 0.2886, + "step": 2762 + }, + { + "epoch": 0.5005434782608695, + "grad_norm": 4.47621729177752, + "learning_rate": 5.234623758268113e-07, + "loss": 0.3073, + "step": 2763 + }, + { + "epoch": 0.5007246376811594, + "grad_norm": 3.5599895353965145, + "learning_rate": 5.231693075572062e-07, + "loss": 0.2818, + "step": 2764 + }, + { + "epoch": 0.5009057971014492, + "grad_norm": 5.229752367628947, + "learning_rate": 5.228762313102985e-07, + "loss": 0.3219, + "step": 2765 + }, + { + "epoch": 0.5010869565217392, + "grad_norm": 7.554121663949298, + "learning_rate": 5.225831471869961e-07, + "loss": 0.3141, + "step": 2766 + }, + { + "epoch": 0.501268115942029, + "grad_norm": 6.040960590596064, + "learning_rate": 5.222900552882092e-07, + "loss": 0.3264, + "step": 2767 + }, + { + "epoch": 0.5014492753623189, + "grad_norm": 4.87642454573085, + "learning_rate": 5.219969557148506e-07, + "loss": 0.2887, + "step": 2768 + }, + { + "epoch": 0.5016304347826087, + "grad_norm": 3.7610477736395014, + "learning_rate": 5.217038485678359e-07, + "loss": 0.287, + "step": 2769 + }, + { + "epoch": 0.5018115942028986, + "grad_norm": 3.477214852855607, + "learning_rate": 5.214107339480833e-07, + "loss": 0.3, + "step": 2770 + }, + { + "epoch": 0.5019927536231884, + "grad_norm": 3.4792708443862193, + "learning_rate": 5.211176119565135e-07, + "loss": 0.2627, + "step": 2771 + }, + { + "epoch": 0.5021739130434782, + "grad_norm": 5.198722844529025, + "learning_rate": 5.2082448269405e-07, + "loss": 0.3126, + "step": 2772 + }, + { + "epoch": 0.5023550724637681, + "grad_norm": 7.767051608977658, + "learning_rate": 5.205313462616183e-07, + "loss": 0.3113, + "step": 2773 + }, + { + "epoch": 0.5025362318840579, + "grad_norm": 5.481460204094765, + "learning_rate": 5.202382027601467e-07, + "loss": 0.321, + "step": 2774 + }, + { + "epoch": 0.5027173913043478, + "grad_norm": 5.298026483632371, + "learning_rate": 5.199450522905663e-07, + "loss": 0.296, + "step": 2775 + }, + { + "epoch": 0.5028985507246376, + "grad_norm": 6.528779965297598, + "learning_rate": 5.196518949538097e-07, + "loss": 0.2764, + "step": 2776 + }, + { + "epoch": 0.5030797101449276, + "grad_norm": 7.486347283018093, + "learning_rate": 5.193587308508126e-07, + "loss": 0.2903, + "step": 2777 + }, + { + "epoch": 0.5032608695652174, + "grad_norm": 7.771786703073877, + "learning_rate": 5.190655600825128e-07, + "loss": 0.2868, + "step": 2778 + }, + { + "epoch": 0.5034420289855073, + "grad_norm": 3.5659050589334194, + "learning_rate": 5.187723827498502e-07, + "loss": 0.1864, + "step": 2779 + }, + { + "epoch": 0.5036231884057971, + "grad_norm": 6.008823588538369, + "learning_rate": 5.184791989537674e-07, + "loss": 0.2789, + "step": 2780 + }, + { + "epoch": 0.503804347826087, + "grad_norm": 7.613597510972579, + "learning_rate": 5.181860087952088e-07, + "loss": 0.2598, + "step": 2781 + }, + { + "epoch": 0.5039855072463768, + "grad_norm": 3.3576839466132835, + "learning_rate": 5.178928123751211e-07, + "loss": 0.2599, + "step": 2782 + }, + { + "epoch": 0.5041666666666667, + "grad_norm": 7.31152567780526, + "learning_rate": 5.175996097944535e-07, + "loss": 0.2998, + "step": 2783 + }, + { + "epoch": 0.5043478260869565, + "grad_norm": 4.578935790793429, + "learning_rate": 5.173064011541566e-07, + "loss": 0.2965, + "step": 2784 + }, + { + "epoch": 0.5045289855072463, + "grad_norm": 4.169471430311392, + "learning_rate": 5.170131865551841e-07, + "loss": 0.3135, + "step": 2785 + }, + { + "epoch": 0.5047101449275362, + "grad_norm": 7.580570026181968, + "learning_rate": 5.167199660984906e-07, + "loss": 0.2832, + "step": 2786 + }, + { + "epoch": 0.5048913043478261, + "grad_norm": 6.040073670942348, + "learning_rate": 5.164267398850339e-07, + "loss": 0.2553, + "step": 2787 + }, + { + "epoch": 0.505072463768116, + "grad_norm": 6.67835437416186, + "learning_rate": 5.161335080157725e-07, + "loss": 0.3483, + "step": 2788 + }, + { + "epoch": 0.5052536231884058, + "grad_norm": 6.219598029669925, + "learning_rate": 5.158402705916679e-07, + "loss": 0.3111, + "step": 2789 + }, + { + "epoch": 0.5054347826086957, + "grad_norm": 6.481701755817264, + "learning_rate": 5.155470277136831e-07, + "loss": 0.2825, + "step": 2790 + }, + { + "epoch": 0.5056159420289855, + "grad_norm": 5.70283411828738, + "learning_rate": 5.152537794827832e-07, + "loss": 0.3415, + "step": 2791 + }, + { + "epoch": 0.5057971014492754, + "grad_norm": 3.405505719167638, + "learning_rate": 5.149605259999344e-07, + "loss": 0.2909, + "step": 2792 + }, + { + "epoch": 0.5059782608695652, + "grad_norm": 4.18580993332098, + "learning_rate": 5.146672673661058e-07, + "loss": 0.3708, + "step": 2793 + }, + { + "epoch": 0.506159420289855, + "grad_norm": 7.709719581792903, + "learning_rate": 5.143740036822676e-07, + "loss": 0.2595, + "step": 2794 + }, + { + "epoch": 0.5063405797101449, + "grad_norm": 4.388332583582546, + "learning_rate": 5.140807350493916e-07, + "loss": 0.3013, + "step": 2795 + }, + { + "epoch": 0.5065217391304347, + "grad_norm": 6.201275091791021, + "learning_rate": 5.137874615684521e-07, + "loss": 0.333, + "step": 2796 + }, + { + "epoch": 0.5067028985507246, + "grad_norm": 3.939595196659426, + "learning_rate": 5.134941833404241e-07, + "loss": 0.287, + "step": 2797 + }, + { + "epoch": 0.5068840579710145, + "grad_norm": 9.636058615048986, + "learning_rate": 5.132009004662848e-07, + "loss": 0.316, + "step": 2798 + }, + { + "epoch": 0.5070652173913044, + "grad_norm": 8.779208618325898, + "learning_rate": 5.129076130470132e-07, + "loss": 0.276, + "step": 2799 + }, + { + "epoch": 0.5072463768115942, + "grad_norm": 4.381185710594406, + "learning_rate": 5.126143211835888e-07, + "loss": 0.2783, + "step": 2800 + }, + { + "epoch": 0.5072463768115942, + "eval_loss": 0.30992186069488525, + "eval_runtime": 9.8138, + "eval_samples_per_second": 50.948, + "eval_steps_per_second": 0.102, + "step": 2800 + }, + { + "epoch": 0.5074275362318841, + "grad_norm": 4.444935383307834, + "learning_rate": 5.123210249769941e-07, + "loss": 0.2879, + "step": 2801 + }, + { + "epoch": 0.5076086956521739, + "grad_norm": 7.0702367080838044, + "learning_rate": 5.120277245282122e-07, + "loss": 0.3459, + "step": 2802 + }, + { + "epoch": 0.5077898550724638, + "grad_norm": 9.285804903120146, + "learning_rate": 5.117344199382277e-07, + "loss": 0.3477, + "step": 2803 + }, + { + "epoch": 0.5079710144927536, + "grad_norm": 4.512259842940229, + "learning_rate": 5.114411113080269e-07, + "loss": 0.2756, + "step": 2804 + }, + { + "epoch": 0.5081521739130435, + "grad_norm": 3.2881963983268223, + "learning_rate": 5.111477987385971e-07, + "loss": 0.2763, + "step": 2805 + }, + { + "epoch": 0.5083333333333333, + "grad_norm": 7.287858779094714, + "learning_rate": 5.108544823309273e-07, + "loss": 0.2863, + "step": 2806 + }, + { + "epoch": 0.5085144927536231, + "grad_norm": 6.12396532048185, + "learning_rate": 5.105611621860081e-07, + "loss": 0.2384, + "step": 2807 + }, + { + "epoch": 0.508695652173913, + "grad_norm": 9.270361915799064, + "learning_rate": 5.102678384048304e-07, + "loss": 0.2773, + "step": 2808 + }, + { + "epoch": 0.508876811594203, + "grad_norm": 3.9602652508454104, + "learning_rate": 5.099745110883874e-07, + "loss": 0.2974, + "step": 2809 + }, + { + "epoch": 0.5090579710144928, + "grad_norm": 4.000995661088271, + "learning_rate": 5.096811803376732e-07, + "loss": 0.342, + "step": 2810 + }, + { + "epoch": 0.5092391304347826, + "grad_norm": 8.50366055980769, + "learning_rate": 5.093878462536825e-07, + "loss": 0.2723, + "step": 2811 + }, + { + "epoch": 0.5094202898550725, + "grad_norm": 3.44748965070376, + "learning_rate": 5.090945089374119e-07, + "loss": 0.296, + "step": 2812 + }, + { + "epoch": 0.5096014492753623, + "grad_norm": 4.992219720206116, + "learning_rate": 5.088011684898587e-07, + "loss": 0.2431, + "step": 2813 + }, + { + "epoch": 0.5097826086956522, + "grad_norm": 5.490216340891855, + "learning_rate": 5.085078250120217e-07, + "loss": 0.2874, + "step": 2814 + }, + { + "epoch": 0.509963768115942, + "grad_norm": 9.770944949097256, + "learning_rate": 5.082144786049001e-07, + "loss": 0.3555, + "step": 2815 + }, + { + "epoch": 0.5101449275362319, + "grad_norm": 5.034012903524801, + "learning_rate": 5.079211293694946e-07, + "loss": 0.2713, + "step": 2816 + }, + { + "epoch": 0.5103260869565217, + "grad_norm": 6.360553422122106, + "learning_rate": 5.076277774068067e-07, + "loss": 0.29, + "step": 2817 + }, + { + "epoch": 0.5105072463768116, + "grad_norm": 3.674034260627281, + "learning_rate": 5.073344228178391e-07, + "loss": 0.3521, + "step": 2818 + }, + { + "epoch": 0.5106884057971014, + "grad_norm": 4.268537459452379, + "learning_rate": 5.070410657035948e-07, + "loss": 0.2903, + "step": 2819 + }, + { + "epoch": 0.5108695652173914, + "grad_norm": 3.52560398046829, + "learning_rate": 5.067477061650781e-07, + "loss": 0.3023, + "step": 2820 + }, + { + "epoch": 0.5110507246376812, + "grad_norm": 6.909141459924591, + "learning_rate": 5.064543443032944e-07, + "loss": 0.2351, + "step": 2821 + }, + { + "epoch": 0.511231884057971, + "grad_norm": 3.3244077410951816, + "learning_rate": 5.061609802192492e-07, + "loss": 0.2926, + "step": 2822 + }, + { + "epoch": 0.5114130434782609, + "grad_norm": 3.3025160531589783, + "learning_rate": 5.058676140139495e-07, + "loss": 0.2822, + "step": 2823 + }, + { + "epoch": 0.5115942028985507, + "grad_norm": 3.846351024786474, + "learning_rate": 5.055742457884024e-07, + "loss": 0.2941, + "step": 2824 + }, + { + "epoch": 0.5117753623188406, + "grad_norm": 4.159220113731831, + "learning_rate": 5.05280875643616e-07, + "loss": 0.2748, + "step": 2825 + }, + { + "epoch": 0.5119565217391304, + "grad_norm": 4.263021700051534, + "learning_rate": 5.049875036805994e-07, + "loss": 0.3434, + "step": 2826 + }, + { + "epoch": 0.5121376811594203, + "grad_norm": 3.427479635258739, + "learning_rate": 5.046941300003614e-07, + "loss": 0.2494, + "step": 2827 + }, + { + "epoch": 0.5123188405797101, + "grad_norm": 6.250932982838778, + "learning_rate": 5.044007547039121e-07, + "loss": 0.3705, + "step": 2828 + }, + { + "epoch": 0.5125, + "grad_norm": 6.1214318923029625, + "learning_rate": 5.041073778922622e-07, + "loss": 0.2844, + "step": 2829 + }, + { + "epoch": 0.5126811594202898, + "grad_norm": 4.41172827489127, + "learning_rate": 5.038139996664227e-07, + "loss": 0.3126, + "step": 2830 + }, + { + "epoch": 0.5128623188405798, + "grad_norm": 4.791392182353196, + "learning_rate": 5.035206201274051e-07, + "loss": 0.2914, + "step": 2831 + }, + { + "epoch": 0.5130434782608696, + "grad_norm": 3.847998232525494, + "learning_rate": 5.032272393762211e-07, + "loss": 0.3172, + "step": 2832 + }, + { + "epoch": 0.5132246376811594, + "grad_norm": 4.6133980823835605, + "learning_rate": 5.029338575138834e-07, + "loss": 0.2607, + "step": 2833 + }, + { + "epoch": 0.5134057971014493, + "grad_norm": 3.991569445253998, + "learning_rate": 5.026404746414048e-07, + "loss": 0.3232, + "step": 2834 + }, + { + "epoch": 0.5135869565217391, + "grad_norm": 6.100076834062837, + "learning_rate": 5.02347090859798e-07, + "loss": 0.2892, + "step": 2835 + }, + { + "epoch": 0.513768115942029, + "grad_norm": 5.290232373801101, + "learning_rate": 5.020537062700768e-07, + "loss": 0.277, + "step": 2836 + }, + { + "epoch": 0.5139492753623188, + "grad_norm": 3.793760745375536, + "learning_rate": 5.017603209732549e-07, + "loss": 0.3287, + "step": 2837 + }, + { + "epoch": 0.5141304347826087, + "grad_norm": 4.965000473966615, + "learning_rate": 5.014669350703461e-07, + "loss": 0.2634, + "step": 2838 + }, + { + "epoch": 0.5143115942028985, + "grad_norm": 7.688074758279249, + "learning_rate": 5.011735486623646e-07, + "loss": 0.2652, + "step": 2839 + }, + { + "epoch": 0.5144927536231884, + "grad_norm": 5.197366458962598, + "learning_rate": 5.008801618503248e-07, + "loss": 0.2589, + "step": 2840 + }, + { + "epoch": 0.5146739130434783, + "grad_norm": 4.162405717601701, + "learning_rate": 5.005867747352408e-07, + "loss": 0.2465, + "step": 2841 + }, + { + "epoch": 0.5148550724637682, + "grad_norm": 7.387248082860555, + "learning_rate": 5.002933874181279e-07, + "loss": 0.3106, + "step": 2842 + }, + { + "epoch": 0.515036231884058, + "grad_norm": 3.8299961115052894, + "learning_rate": 5e-07, + "loss": 0.2891, + "step": 2843 + }, + { + "epoch": 0.5152173913043478, + "grad_norm": 7.082791106352679, + "learning_rate": 4.997066125818723e-07, + "loss": 0.3214, + "step": 2844 + }, + { + "epoch": 0.5153985507246377, + "grad_norm": 4.721037829850546, + "learning_rate": 4.99413225264759e-07, + "loss": 0.3017, + "step": 2845 + }, + { + "epoch": 0.5155797101449275, + "grad_norm": 4.907335920373676, + "learning_rate": 4.991198381496753e-07, + "loss": 0.2975, + "step": 2846 + }, + { + "epoch": 0.5157608695652174, + "grad_norm": 5.846007352970598, + "learning_rate": 4.988264513376354e-07, + "loss": 0.2214, + "step": 2847 + }, + { + "epoch": 0.5159420289855072, + "grad_norm": 6.454427299303954, + "learning_rate": 4.98533064929654e-07, + "loss": 0.3337, + "step": 2848 + }, + { + "epoch": 0.5161231884057971, + "grad_norm": 6.5084427526921695, + "learning_rate": 4.982396790267451e-07, + "loss": 0.2596, + "step": 2849 + }, + { + "epoch": 0.5163043478260869, + "grad_norm": 4.860629882289485, + "learning_rate": 4.979462937299232e-07, + "loss": 0.3135, + "step": 2850 + }, + { + "epoch": 0.5164855072463768, + "grad_norm": 9.222953468180826, + "learning_rate": 4.976529091402019e-07, + "loss": 0.3661, + "step": 2851 + }, + { + "epoch": 0.5166666666666667, + "grad_norm": 4.652435896338946, + "learning_rate": 4.973595253585953e-07, + "loss": 0.3105, + "step": 2852 + }, + { + "epoch": 0.5168478260869566, + "grad_norm": 3.357892891036832, + "learning_rate": 4.970661424861165e-07, + "loss": 0.2791, + "step": 2853 + }, + { + "epoch": 0.5170289855072464, + "grad_norm": 3.7458053853401903, + "learning_rate": 4.967727606237788e-07, + "loss": 0.276, + "step": 2854 + }, + { + "epoch": 0.5172101449275363, + "grad_norm": 3.730847214688724, + "learning_rate": 4.96479379872595e-07, + "loss": 0.2246, + "step": 2855 + }, + { + "epoch": 0.5173913043478261, + "grad_norm": 7.150438128393571, + "learning_rate": 4.961860003335774e-07, + "loss": 0.3039, + "step": 2856 + }, + { + "epoch": 0.5175724637681159, + "grad_norm": 3.611701708090848, + "learning_rate": 4.958926221077376e-07, + "loss": 0.305, + "step": 2857 + }, + { + "epoch": 0.5177536231884058, + "grad_norm": 4.26438471115421, + "learning_rate": 4.955992452960879e-07, + "loss": 0.3383, + "step": 2858 + }, + { + "epoch": 0.5179347826086956, + "grad_norm": 3.5519538361101097, + "learning_rate": 4.953058699996388e-07, + "loss": 0.3162, + "step": 2859 + }, + { + "epoch": 0.5181159420289855, + "grad_norm": 5.21780459865734, + "learning_rate": 4.950124963194008e-07, + "loss": 0.3235, + "step": 2860 + }, + { + "epoch": 0.5182971014492753, + "grad_norm": 4.694891216443285, + "learning_rate": 4.947191243563838e-07, + "loss": 0.223, + "step": 2861 + }, + { + "epoch": 0.5184782608695652, + "grad_norm": 3.4368356884789777, + "learning_rate": 4.944257542115975e-07, + "loss": 0.2693, + "step": 2862 + }, + { + "epoch": 0.5186594202898551, + "grad_norm": 4.490206472701619, + "learning_rate": 4.941323859860505e-07, + "loss": 0.3028, + "step": 2863 + }, + { + "epoch": 0.518840579710145, + "grad_norm": 10.41683550303921, + "learning_rate": 4.938390197807508e-07, + "loss": 0.2414, + "step": 2864 + }, + { + "epoch": 0.5190217391304348, + "grad_norm": 5.534953077913409, + "learning_rate": 4.935456556967055e-07, + "loss": 0.3218, + "step": 2865 + }, + { + "epoch": 0.5192028985507247, + "grad_norm": 5.4908036609999735, + "learning_rate": 4.932522938349219e-07, + "loss": 0.2693, + "step": 2866 + }, + { + "epoch": 0.5193840579710145, + "grad_norm": 7.792525436579251, + "learning_rate": 4.929589342964053e-07, + "loss": 0.2638, + "step": 2867 + }, + { + "epoch": 0.5195652173913043, + "grad_norm": 4.582075008658208, + "learning_rate": 4.92665577182161e-07, + "loss": 0.286, + "step": 2868 + }, + { + "epoch": 0.5197463768115942, + "grad_norm": 3.7679544064073807, + "learning_rate": 4.923722225931932e-07, + "loss": 0.2639, + "step": 2869 + }, + { + "epoch": 0.519927536231884, + "grad_norm": 7.423896020144309, + "learning_rate": 4.920788706305053e-07, + "loss": 0.3192, + "step": 2870 + }, + { + "epoch": 0.5201086956521739, + "grad_norm": 3.5172256413182192, + "learning_rate": 4.917855213950999e-07, + "loss": 0.233, + "step": 2871 + }, + { + "epoch": 0.5202898550724637, + "grad_norm": 5.646715112284041, + "learning_rate": 4.914921749879784e-07, + "loss": 0.2809, + "step": 2872 + }, + { + "epoch": 0.5204710144927536, + "grad_norm": 3.960817594061969, + "learning_rate": 4.911988315101411e-07, + "loss": 0.3662, + "step": 2873 + }, + { + "epoch": 0.5206521739130435, + "grad_norm": 10.285735948656155, + "learning_rate": 4.909054910625882e-07, + "loss": 0.3069, + "step": 2874 + }, + { + "epoch": 0.5208333333333334, + "grad_norm": 5.050756744713645, + "learning_rate": 4.906121537463176e-07, + "loss": 0.3036, + "step": 2875 + }, + { + "epoch": 0.5210144927536232, + "grad_norm": 5.570303055947805, + "learning_rate": 4.90318819662327e-07, + "loss": 0.3015, + "step": 2876 + }, + { + "epoch": 0.5211956521739131, + "grad_norm": 6.726832132065173, + "learning_rate": 4.900254889116125e-07, + "loss": 0.3845, + "step": 2877 + }, + { + "epoch": 0.5213768115942029, + "grad_norm": 7.751050382260064, + "learning_rate": 4.897321615951695e-07, + "loss": 0.2859, + "step": 2878 + }, + { + "epoch": 0.5215579710144927, + "grad_norm": 4.4863994835212875, + "learning_rate": 4.894388378139921e-07, + "loss": 0.3872, + "step": 2879 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 8.630366966978777, + "learning_rate": 4.891455176690725e-07, + "loss": 0.296, + "step": 2880 + }, + { + "epoch": 0.5219202898550724, + "grad_norm": 10.85056205126875, + "learning_rate": 4.888522012614029e-07, + "loss": 0.2988, + "step": 2881 + }, + { + "epoch": 0.5221014492753623, + "grad_norm": 5.694373442398479, + "learning_rate": 4.885588886919732e-07, + "loss": 0.2819, + "step": 2882 + }, + { + "epoch": 0.5222826086956521, + "grad_norm": 5.260728137507764, + "learning_rate": 4.882655800617724e-07, + "loss": 0.2535, + "step": 2883 + }, + { + "epoch": 0.5224637681159421, + "grad_norm": 4.626474486270258, + "learning_rate": 4.879722754717878e-07, + "loss": 0.2746, + "step": 2884 + }, + { + "epoch": 0.5226449275362319, + "grad_norm": 6.315266110750342, + "learning_rate": 4.876789750230059e-07, + "loss": 0.2878, + "step": 2885 + }, + { + "epoch": 0.5228260869565218, + "grad_norm": 3.415097083621458, + "learning_rate": 4.873856788164111e-07, + "loss": 0.2577, + "step": 2886 + }, + { + "epoch": 0.5230072463768116, + "grad_norm": 4.426905421836901, + "learning_rate": 4.87092386952987e-07, + "loss": 0.3109, + "step": 2887 + }, + { + "epoch": 0.5231884057971015, + "grad_norm": 4.83881863171699, + "learning_rate": 4.867990995337151e-07, + "loss": 0.324, + "step": 2888 + }, + { + "epoch": 0.5233695652173913, + "grad_norm": 3.3212060803866437, + "learning_rate": 4.865058166595759e-07, + "loss": 0.2217, + "step": 2889 + }, + { + "epoch": 0.5235507246376812, + "grad_norm": 3.593184184582708, + "learning_rate": 4.862125384315479e-07, + "loss": 0.2856, + "step": 2890 + }, + { + "epoch": 0.523731884057971, + "grad_norm": 4.872786154993526, + "learning_rate": 4.859192649506084e-07, + "loss": 0.3417, + "step": 2891 + }, + { + "epoch": 0.5239130434782608, + "grad_norm": 3.3607488085082746, + "learning_rate": 4.856259963177324e-07, + "loss": 0.2466, + "step": 2892 + }, + { + "epoch": 0.5240942028985507, + "grad_norm": 5.05014734722463, + "learning_rate": 4.853327326338942e-07, + "loss": 0.2898, + "step": 2893 + }, + { + "epoch": 0.5242753623188405, + "grad_norm": 4.841080865206514, + "learning_rate": 4.850394740000656e-07, + "loss": 0.2924, + "step": 2894 + }, + { + "epoch": 0.5244565217391305, + "grad_norm": 4.7657042041267985, + "learning_rate": 4.84746220517217e-07, + "loss": 0.3357, + "step": 2895 + }, + { + "epoch": 0.5246376811594203, + "grad_norm": 9.302743520382093, + "learning_rate": 4.844529722863168e-07, + "loss": 0.2906, + "step": 2896 + }, + { + "epoch": 0.5248188405797102, + "grad_norm": 4.685193581166995, + "learning_rate": 4.84159729408332e-07, + "loss": 0.2639, + "step": 2897 + }, + { + "epoch": 0.525, + "grad_norm": 10.081003204801586, + "learning_rate": 4.838664919842275e-07, + "loss": 0.3782, + "step": 2898 + }, + { + "epoch": 0.5251811594202899, + "grad_norm": 3.865197310034983, + "learning_rate": 4.835732601149663e-07, + "loss": 0.3392, + "step": 2899 + }, + { + "epoch": 0.5253623188405797, + "grad_norm": 3.8486182918014973, + "learning_rate": 4.832800339015092e-07, + "loss": 0.2517, + "step": 2900 + }, + { + "epoch": 0.5253623188405797, + "eval_loss": 0.30259373784065247, + "eval_runtime": 9.848, + "eval_samples_per_second": 50.772, + "eval_steps_per_second": 0.102, + "step": 2900 + }, + { + "epoch": 0.5255434782608696, + "grad_norm": 4.706585507984769, + "learning_rate": 4.82986813444816e-07, + "loss": 0.2884, + "step": 2901 + }, + { + "epoch": 0.5257246376811594, + "grad_norm": 4.369058975566071, + "learning_rate": 4.826935988458433e-07, + "loss": 0.291, + "step": 2902 + }, + { + "epoch": 0.5259057971014492, + "grad_norm": 6.651098391671914, + "learning_rate": 4.824003902055466e-07, + "loss": 0.3423, + "step": 2903 + }, + { + "epoch": 0.5260869565217391, + "grad_norm": 3.4080392294999386, + "learning_rate": 4.821071876248788e-07, + "loss": 0.2741, + "step": 2904 + }, + { + "epoch": 0.5262681159420289, + "grad_norm": 3.8671285957757413, + "learning_rate": 4.818139912047912e-07, + "loss": 0.29, + "step": 2905 + }, + { + "epoch": 0.5264492753623189, + "grad_norm": 6.825199707495563, + "learning_rate": 4.815208010462326e-07, + "loss": 0.3319, + "step": 2906 + }, + { + "epoch": 0.5266304347826087, + "grad_norm": 4.097937362276843, + "learning_rate": 4.8122761725015e-07, + "loss": 0.352, + "step": 2907 + }, + { + "epoch": 0.5268115942028986, + "grad_norm": 7.949303661327839, + "learning_rate": 4.809344399174872e-07, + "loss": 0.3299, + "step": 2908 + }, + { + "epoch": 0.5269927536231884, + "grad_norm": 8.239374923375, + "learning_rate": 4.806412691491875e-07, + "loss": 0.2705, + "step": 2909 + }, + { + "epoch": 0.5271739130434783, + "grad_norm": 4.134447589834429, + "learning_rate": 4.803481050461903e-07, + "loss": 0.2774, + "step": 2910 + }, + { + "epoch": 0.5273550724637681, + "grad_norm": 3.4807326975889135, + "learning_rate": 4.800549477094337e-07, + "loss": 0.2727, + "step": 2911 + }, + { + "epoch": 0.527536231884058, + "grad_norm": 9.416456224930261, + "learning_rate": 4.797617972398531e-07, + "loss": 0.2947, + "step": 2912 + }, + { + "epoch": 0.5277173913043478, + "grad_norm": 7.037133986338564, + "learning_rate": 4.794686537383817e-07, + "loss": 0.285, + "step": 2913 + }, + { + "epoch": 0.5278985507246376, + "grad_norm": 4.595600062062957, + "learning_rate": 4.791755173059501e-07, + "loss": 0.2542, + "step": 2914 + }, + { + "epoch": 0.5280797101449275, + "grad_norm": 3.3442524936434714, + "learning_rate": 4.788823880434864e-07, + "loss": 0.2851, + "step": 2915 + }, + { + "epoch": 0.5282608695652173, + "grad_norm": 4.7399865498045015, + "learning_rate": 4.785892660519166e-07, + "loss": 0.3278, + "step": 2916 + }, + { + "epoch": 0.5284420289855073, + "grad_norm": 8.944313544806079, + "learning_rate": 4.782961514321641e-07, + "loss": 0.2914, + "step": 2917 + }, + { + "epoch": 0.5286231884057971, + "grad_norm": 9.43697220477717, + "learning_rate": 4.780030442851495e-07, + "loss": 0.2931, + "step": 2918 + }, + { + "epoch": 0.528804347826087, + "grad_norm": 10.456610248000333, + "learning_rate": 4.777099447117907e-07, + "loss": 0.3064, + "step": 2919 + }, + { + "epoch": 0.5289855072463768, + "grad_norm": 5.106965218301173, + "learning_rate": 4.774168528130038e-07, + "loss": 0.3267, + "step": 2920 + }, + { + "epoch": 0.5291666666666667, + "grad_norm": 3.435840283644493, + "learning_rate": 4.771237686897014e-07, + "loss": 0.3195, + "step": 2921 + }, + { + "epoch": 0.5293478260869565, + "grad_norm": 6.460835795089601, + "learning_rate": 4.7683069244279387e-07, + "loss": 0.2497, + "step": 2922 + }, + { + "epoch": 0.5295289855072464, + "grad_norm": 4.364890048962088, + "learning_rate": 4.765376241731886e-07, + "loss": 0.2616, + "step": 2923 + }, + { + "epoch": 0.5297101449275362, + "grad_norm": 3.6488425646980183, + "learning_rate": 4.7624456398179056e-07, + "loss": 0.2552, + "step": 2924 + }, + { + "epoch": 0.529891304347826, + "grad_norm": 8.556067994898077, + "learning_rate": 4.7595151196950173e-07, + "loss": 0.3494, + "step": 2925 + }, + { + "epoch": 0.5300724637681159, + "grad_norm": 8.903265099327312, + "learning_rate": 4.756584682372214e-07, + "loss": 0.3016, + "step": 2926 + }, + { + "epoch": 0.5302536231884057, + "grad_norm": 3.3351674195301415, + "learning_rate": 4.7536543288584543e-07, + "loss": 0.2672, + "step": 2927 + }, + { + "epoch": 0.5304347826086957, + "grad_norm": 6.522383373178969, + "learning_rate": 4.75072406016268e-07, + "loss": 0.32, + "step": 2928 + }, + { + "epoch": 0.5306159420289855, + "grad_norm": 3.912768172411499, + "learning_rate": 4.747793877293791e-07, + "loss": 0.3214, + "step": 2929 + }, + { + "epoch": 0.5307971014492754, + "grad_norm": 3.9593986244641233, + "learning_rate": 4.7448637812606656e-07, + "loss": 0.3168, + "step": 2930 + }, + { + "epoch": 0.5309782608695652, + "grad_norm": 4.013660105745786, + "learning_rate": 4.741933773072148e-07, + "loss": 0.2811, + "step": 2931 + }, + { + "epoch": 0.5311594202898551, + "grad_norm": 6.328394689392958, + "learning_rate": 4.739003853737056e-07, + "loss": 0.3249, + "step": 2932 + }, + { + "epoch": 0.5313405797101449, + "grad_norm": 4.128936545453159, + "learning_rate": 4.7360740242641737e-07, + "loss": 0.2949, + "step": 2933 + }, + { + "epoch": 0.5315217391304348, + "grad_norm": 7.178584594619719, + "learning_rate": 4.7331442856622566e-07, + "loss": 0.2908, + "step": 2934 + }, + { + "epoch": 0.5317028985507246, + "grad_norm": 3.294286165671993, + "learning_rate": 4.7302146389400235e-07, + "loss": 0.2663, + "step": 2935 + }, + { + "epoch": 0.5318840579710145, + "grad_norm": 3.842315395751036, + "learning_rate": 4.7272850851061715e-07, + "loss": 0.2726, + "step": 2936 + }, + { + "epoch": 0.5320652173913043, + "grad_norm": 4.841177790154437, + "learning_rate": 4.7243556251693553e-07, + "loss": 0.2632, + "step": 2937 + }, + { + "epoch": 0.5322463768115943, + "grad_norm": 4.827651120836476, + "learning_rate": 4.721426260138204e-07, + "loss": 0.3129, + "step": 2938 + }, + { + "epoch": 0.5324275362318841, + "grad_norm": 4.382323862941937, + "learning_rate": 4.7184969910213094e-07, + "loss": 0.2488, + "step": 2939 + }, + { + "epoch": 0.532608695652174, + "grad_norm": 5.221230383386048, + "learning_rate": 4.715567818827236e-07, + "loss": 0.3083, + "step": 2940 + }, + { + "epoch": 0.5327898550724638, + "grad_norm": 10.395773574737106, + "learning_rate": 4.712638744564511e-07, + "loss": 0.2769, + "step": 2941 + }, + { + "epoch": 0.5329710144927536, + "grad_norm": 5.955584361676708, + "learning_rate": 4.709709769241628e-07, + "loss": 0.3167, + "step": 2942 + }, + { + "epoch": 0.5331521739130435, + "grad_norm": 4.997364715528131, + "learning_rate": 4.706780893867044e-07, + "loss": 0.3279, + "step": 2943 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 3.3899341303151282, + "learning_rate": 4.703852119449191e-07, + "loss": 0.2329, + "step": 2944 + }, + { + "epoch": 0.5335144927536232, + "grad_norm": 3.837514775848834, + "learning_rate": 4.700923446996455e-07, + "loss": 0.2893, + "step": 2945 + }, + { + "epoch": 0.533695652173913, + "grad_norm": 3.3874379575637934, + "learning_rate": 4.6979948775171927e-07, + "loss": 0.2746, + "step": 2946 + }, + { + "epoch": 0.5338768115942029, + "grad_norm": 7.888294680935468, + "learning_rate": 4.695066412019725e-07, + "loss": 0.2999, + "step": 2947 + }, + { + "epoch": 0.5340579710144927, + "grad_norm": 3.9539066164778367, + "learning_rate": 4.692138051512337e-07, + "loss": 0.2564, + "step": 2948 + }, + { + "epoch": 0.5342391304347827, + "grad_norm": 15.137564404946195, + "learning_rate": 4.689209797003277e-07, + "loss": 0.3544, + "step": 2949 + }, + { + "epoch": 0.5344202898550725, + "grad_norm": 3.699822810136642, + "learning_rate": 4.6862816495007566e-07, + "loss": 0.2782, + "step": 2950 + }, + { + "epoch": 0.5346014492753624, + "grad_norm": 7.664313700060354, + "learning_rate": 4.6833536100129477e-07, + "loss": 0.2747, + "step": 2951 + }, + { + "epoch": 0.5347826086956522, + "grad_norm": 4.057222464136469, + "learning_rate": 4.680425679547994e-07, + "loss": 0.2543, + "step": 2952 + }, + { + "epoch": 0.534963768115942, + "grad_norm": 8.476141520893258, + "learning_rate": 4.6774978591139936e-07, + "loss": 0.31, + "step": 2953 + }, + { + "epoch": 0.5351449275362319, + "grad_norm": 5.740854409432257, + "learning_rate": 4.6745701497190053e-07, + "loss": 0.3149, + "step": 2954 + }, + { + "epoch": 0.5353260869565217, + "grad_norm": 6.889381913163191, + "learning_rate": 4.6716425523710595e-07, + "loss": 0.2866, + "step": 2955 + }, + { + "epoch": 0.5355072463768116, + "grad_norm": 4.4349145120754185, + "learning_rate": 4.668715068078138e-07, + "loss": 0.3289, + "step": 2956 + }, + { + "epoch": 0.5356884057971014, + "grad_norm": 4.993115450699712, + "learning_rate": 4.6657876978481877e-07, + "loss": 0.4318, + "step": 2957 + }, + { + "epoch": 0.5358695652173913, + "grad_norm": 5.732098456324761, + "learning_rate": 4.6628604426891154e-07, + "loss": 0.3144, + "step": 2958 + }, + { + "epoch": 0.5360507246376811, + "grad_norm": 6.32482046003519, + "learning_rate": 4.659933303608792e-07, + "loss": 0.2964, + "step": 2959 + }, + { + "epoch": 0.5362318840579711, + "grad_norm": 5.784427743641433, + "learning_rate": 4.6570062816150424e-07, + "loss": 0.2787, + "step": 2960 + }, + { + "epoch": 0.5364130434782609, + "grad_norm": 3.1823569216573877, + "learning_rate": 4.6540793777156566e-07, + "loss": 0.2928, + "step": 2961 + }, + { + "epoch": 0.5365942028985508, + "grad_norm": 4.198969777840339, + "learning_rate": 4.651152592918377e-07, + "loss": 0.3306, + "step": 2962 + }, + { + "epoch": 0.5367753623188406, + "grad_norm": 4.743275689858857, + "learning_rate": 4.648225928230916e-07, + "loss": 0.2926, + "step": 2963 + }, + { + "epoch": 0.5369565217391304, + "grad_norm": 3.315411269492071, + "learning_rate": 4.6452993846609336e-07, + "loss": 0.219, + "step": 2964 + }, + { + "epoch": 0.5371376811594203, + "grad_norm": 3.9691781573552305, + "learning_rate": 4.642372963216053e-07, + "loss": 0.3098, + "step": 2965 + }, + { + "epoch": 0.5373188405797101, + "grad_norm": 5.76370586122385, + "learning_rate": 4.639446664903855e-07, + "loss": 0.2178, + "step": 2966 + }, + { + "epoch": 0.5375, + "grad_norm": 5.660178666093768, + "learning_rate": 4.6365204907318796e-07, + "loss": 0.3028, + "step": 2967 + }, + { + "epoch": 0.5376811594202898, + "grad_norm": 3.278949890067449, + "learning_rate": 4.633594441707621e-07, + "loss": 0.2673, + "step": 2968 + }, + { + "epoch": 0.5378623188405797, + "grad_norm": 6.324043358926048, + "learning_rate": 4.630668518838534e-07, + "loss": 0.269, + "step": 2969 + }, + { + "epoch": 0.5380434782608695, + "grad_norm": 4.201414182476533, + "learning_rate": 4.627742723132023e-07, + "loss": 0.3378, + "step": 2970 + }, + { + "epoch": 0.5382246376811595, + "grad_norm": 4.2934541855468, + "learning_rate": 4.624817055595458e-07, + "loss": 0.2958, + "step": 2971 + }, + { + "epoch": 0.5384057971014493, + "grad_norm": 12.725856977736433, + "learning_rate": 4.6218915172361583e-07, + "loss": 0.2684, + "step": 2972 + }, + { + "epoch": 0.5385869565217392, + "grad_norm": 9.13372364217117, + "learning_rate": 4.618966109061401e-07, + "loss": 0.3318, + "step": 2973 + }, + { + "epoch": 0.538768115942029, + "grad_norm": 5.194467951361341, + "learning_rate": 4.616040832078416e-07, + "loss": 0.2922, + "step": 2974 + }, + { + "epoch": 0.5389492753623188, + "grad_norm": 3.48682987762208, + "learning_rate": 4.6131156872943933e-07, + "loss": 0.3163, + "step": 2975 + }, + { + "epoch": 0.5391304347826087, + "grad_norm": 3.6981277256158083, + "learning_rate": 4.610190675716472e-07, + "loss": 0.2395, + "step": 2976 + }, + { + "epoch": 0.5393115942028985, + "grad_norm": 5.070913587930658, + "learning_rate": 4.607265798351749e-07, + "loss": 0.3273, + "step": 2977 + }, + { + "epoch": 0.5394927536231884, + "grad_norm": 7.719395187329499, + "learning_rate": 4.60434105620727e-07, + "loss": 0.3058, + "step": 2978 + }, + { + "epoch": 0.5396739130434782, + "grad_norm": 13.266257504656563, + "learning_rate": 4.6014164502900434e-07, + "loss": 0.3113, + "step": 2979 + }, + { + "epoch": 0.5398550724637681, + "grad_norm": 5.977656331543663, + "learning_rate": 4.5984919816070193e-07, + "loss": 0.2737, + "step": 2980 + }, + { + "epoch": 0.5400362318840579, + "grad_norm": 4.61219466513511, + "learning_rate": 4.595567651165109e-07, + "loss": 0.2808, + "step": 2981 + }, + { + "epoch": 0.5402173913043479, + "grad_norm": 3.9956499930743594, + "learning_rate": 4.5926434599711716e-07, + "loss": 0.3135, + "step": 2982 + }, + { + "epoch": 0.5403985507246377, + "grad_norm": 4.042843240138853, + "learning_rate": 4.5897194090320217e-07, + "loss": 0.2914, + "step": 2983 + }, + { + "epoch": 0.5405797101449276, + "grad_norm": 3.949287588623843, + "learning_rate": 4.586795499354424e-07, + "loss": 0.2986, + "step": 2984 + }, + { + "epoch": 0.5407608695652174, + "grad_norm": 3.456017304253609, + "learning_rate": 4.5838717319450945e-07, + "loss": 0.3401, + "step": 2985 + }, + { + "epoch": 0.5409420289855073, + "grad_norm": 3.1839016554398185, + "learning_rate": 4.5809481078106954e-07, + "loss": 0.2473, + "step": 2986 + }, + { + "epoch": 0.5411231884057971, + "grad_norm": 8.448942119415877, + "learning_rate": 4.578024627957851e-07, + "loss": 0.2843, + "step": 2987 + }, + { + "epoch": 0.5413043478260869, + "grad_norm": 10.884188441341491, + "learning_rate": 4.575101293393128e-07, + "loss": 0.2931, + "step": 2988 + }, + { + "epoch": 0.5414855072463768, + "grad_norm": 5.436363730614316, + "learning_rate": 4.5721781051230385e-07, + "loss": 0.3016, + "step": 2989 + }, + { + "epoch": 0.5416666666666666, + "grad_norm": 3.903275862626155, + "learning_rate": 4.569255064154058e-07, + "loss": 0.3121, + "step": 2990 + }, + { + "epoch": 0.5418478260869565, + "grad_norm": 3.568864628172722, + "learning_rate": 4.5663321714925997e-07, + "loss": 0.2786, + "step": 2991 + }, + { + "epoch": 0.5420289855072464, + "grad_norm": 5.482816712868959, + "learning_rate": 4.563409428145029e-07, + "loss": 0.3961, + "step": 2992 + }, + { + "epoch": 0.5422101449275363, + "grad_norm": 7.425900947266244, + "learning_rate": 4.56048683511766e-07, + "loss": 0.2738, + "step": 2993 + }, + { + "epoch": 0.5423913043478261, + "grad_norm": 4.399882730754699, + "learning_rate": 4.5575643934167567e-07, + "loss": 0.2886, + "step": 2994 + }, + { + "epoch": 0.542572463768116, + "grad_norm": 3.722852451943934, + "learning_rate": 4.5546421040485295e-07, + "loss": 0.327, + "step": 2995 + }, + { + "epoch": 0.5427536231884058, + "grad_norm": 4.917695344786367, + "learning_rate": 4.551719968019136e-07, + "loss": 0.2651, + "step": 2996 + }, + { + "epoch": 0.5429347826086957, + "grad_norm": 7.009359764797192, + "learning_rate": 4.54879798633468e-07, + "loss": 0.3207, + "step": 2997 + }, + { + "epoch": 0.5431159420289855, + "grad_norm": 9.242907991912164, + "learning_rate": 4.5458761600012174e-07, + "loss": 0.3321, + "step": 2998 + }, + { + "epoch": 0.5432971014492753, + "grad_norm": 3.51820867373752, + "learning_rate": 4.5429544900247434e-07, + "loss": 0.32, + "step": 2999 + }, + { + "epoch": 0.5434782608695652, + "grad_norm": 4.33423858985368, + "learning_rate": 4.5400329774112043e-07, + "loss": 0.3243, + "step": 3000 + }, + { + "epoch": 0.5434782608695652, + "eval_loss": 0.2858281135559082, + "eval_runtime": 9.7799, + "eval_samples_per_second": 51.125, + "eval_steps_per_second": 0.102, + "step": 3000 + }, + { + "epoch": 0.543659420289855, + "grad_norm": 3.1264835143066825, + "learning_rate": 4.537111623166489e-07, + "loss": 0.2181, + "step": 3001 + }, + { + "epoch": 0.5438405797101449, + "grad_norm": 3.5655919537808867, + "learning_rate": 4.5341904282964364e-07, + "loss": 0.2283, + "step": 3002 + }, + { + "epoch": 0.5440217391304348, + "grad_norm": 7.914218311548956, + "learning_rate": 4.531269393806827e-07, + "loss": 0.3013, + "step": 3003 + }, + { + "epoch": 0.5442028985507247, + "grad_norm": 3.9144643785650532, + "learning_rate": 4.5283485207033866e-07, + "loss": 0.3562, + "step": 3004 + }, + { + "epoch": 0.5443840579710145, + "grad_norm": 4.100928593273769, + "learning_rate": 4.525427809991782e-07, + "loss": 0.3179, + "step": 3005 + }, + { + "epoch": 0.5445652173913044, + "grad_norm": 4.668630208425053, + "learning_rate": 4.5225072626776345e-07, + "loss": 0.3104, + "step": 3006 + }, + { + "epoch": 0.5447463768115942, + "grad_norm": 4.179568114727412, + "learning_rate": 4.519586879766498e-07, + "loss": 0.2754, + "step": 3007 + }, + { + "epoch": 0.5449275362318841, + "grad_norm": 4.029226018148599, + "learning_rate": 4.516666662263874e-07, + "loss": 0.3084, + "step": 3008 + }, + { + "epoch": 0.5451086956521739, + "grad_norm": 6.272860036723506, + "learning_rate": 4.513746611175208e-07, + "loss": 0.2531, + "step": 3009 + }, + { + "epoch": 0.5452898550724637, + "grad_norm": 3.4541112540288053, + "learning_rate": 4.5108267275058887e-07, + "loss": 0.2635, + "step": 3010 + }, + { + "epoch": 0.5454710144927536, + "grad_norm": 3.3428132709717384, + "learning_rate": 4.507907012261244e-07, + "loss": 0.2941, + "step": 3011 + }, + { + "epoch": 0.5456521739130434, + "grad_norm": 4.753350387936348, + "learning_rate": 4.5049874664465493e-07, + "loss": 0.3719, + "step": 3012 + }, + { + "epoch": 0.5458333333333333, + "grad_norm": 3.514733852595906, + "learning_rate": 4.5020680910670114e-07, + "loss": 0.2769, + "step": 3013 + }, + { + "epoch": 0.5460144927536232, + "grad_norm": 7.147529483288572, + "learning_rate": 4.4991488871277916e-07, + "loss": 0.2406, + "step": 3014 + }, + { + "epoch": 0.5461956521739131, + "grad_norm": 8.423306894007123, + "learning_rate": 4.496229855633983e-07, + "loss": 0.3859, + "step": 3015 + }, + { + "epoch": 0.5463768115942029, + "grad_norm": 7.700930285396802, + "learning_rate": 4.4933109975906215e-07, + "loss": 0.3139, + "step": 3016 + }, + { + "epoch": 0.5465579710144928, + "grad_norm": 9.1331181380497, + "learning_rate": 4.490392314002683e-07, + "loss": 0.3053, + "step": 3017 + }, + { + "epoch": 0.5467391304347826, + "grad_norm": 4.895184243789536, + "learning_rate": 4.487473805875086e-07, + "loss": 0.3299, + "step": 3018 + }, + { + "epoch": 0.5469202898550725, + "grad_norm": 3.6798840734975613, + "learning_rate": 4.484555474212687e-07, + "loss": 0.333, + "step": 3019 + }, + { + "epoch": 0.5471014492753623, + "grad_norm": 3.5009996865176602, + "learning_rate": 4.481637320020281e-07, + "loss": 0.2326, + "step": 3020 + }, + { + "epoch": 0.5472826086956522, + "grad_norm": 5.293161848635642, + "learning_rate": 4.478719344302599e-07, + "loss": 0.3563, + "step": 3021 + }, + { + "epoch": 0.547463768115942, + "grad_norm": 8.159729543469046, + "learning_rate": 4.4758015480643187e-07, + "loss": 0.3024, + "step": 3022 + }, + { + "epoch": 0.5476449275362318, + "grad_norm": 6.678209885498957, + "learning_rate": 4.4728839323100497e-07, + "loss": 0.3936, + "step": 3023 + }, + { + "epoch": 0.5478260869565217, + "grad_norm": 4.914223685488828, + "learning_rate": 4.4699664980443374e-07, + "loss": 0.3192, + "step": 3024 + }, + { + "epoch": 0.5480072463768116, + "grad_norm": 4.741715684114004, + "learning_rate": 4.467049246271674e-07, + "loss": 0.2931, + "step": 3025 + }, + { + "epoch": 0.5481884057971015, + "grad_norm": 4.64421152835889, + "learning_rate": 4.46413217799648e-07, + "loss": 0.2894, + "step": 3026 + }, + { + "epoch": 0.5483695652173913, + "grad_norm": 4.778274076162452, + "learning_rate": 4.4612152942231153e-07, + "loss": 0.3031, + "step": 3027 + }, + { + "epoch": 0.5485507246376812, + "grad_norm": 3.8725403096508915, + "learning_rate": 4.458298595955877e-07, + "loss": 0.2834, + "step": 3028 + }, + { + "epoch": 0.548731884057971, + "grad_norm": 6.526856152642174, + "learning_rate": 4.455382084199e-07, + "loss": 0.339, + "step": 3029 + }, + { + "epoch": 0.5489130434782609, + "grad_norm": 7.448732740194804, + "learning_rate": 4.452465759956651e-07, + "loss": 0.3245, + "step": 3030 + }, + { + "epoch": 0.5490942028985507, + "grad_norm": 4.325473892031156, + "learning_rate": 4.4495496242329373e-07, + "loss": 0.3671, + "step": 3031 + }, + { + "epoch": 0.5492753623188406, + "grad_norm": 3.313219598344092, + "learning_rate": 4.4466336780318925e-07, + "loss": 0.2812, + "step": 3032 + }, + { + "epoch": 0.5494565217391304, + "grad_norm": 4.170847141322727, + "learning_rate": 4.4437179223574975e-07, + "loss": 0.2792, + "step": 3033 + }, + { + "epoch": 0.5496376811594202, + "grad_norm": 3.83450749289122, + "learning_rate": 4.440802358213656e-07, + "loss": 0.2888, + "step": 3034 + }, + { + "epoch": 0.5498188405797102, + "grad_norm": 11.602192690816686, + "learning_rate": 4.4378869866042123e-07, + "loss": 0.3203, + "step": 3035 + }, + { + "epoch": 0.55, + "grad_norm": 4.080561661575524, + "learning_rate": 4.434971808532941e-07, + "loss": 0.2663, + "step": 3036 + }, + { + "epoch": 0.5501811594202899, + "grad_norm": 3.714301638662152, + "learning_rate": 4.432056825003555e-07, + "loss": 0.2521, + "step": 3037 + }, + { + "epoch": 0.5503623188405797, + "grad_norm": 4.083970659729972, + "learning_rate": 4.4291420370196954e-07, + "loss": 0.3152, + "step": 3038 + }, + { + "epoch": 0.5505434782608696, + "grad_norm": 8.1406301605279, + "learning_rate": 4.4262274455849374e-07, + "loss": 0.3028, + "step": 3039 + }, + { + "epoch": 0.5507246376811594, + "grad_norm": 3.6107795952953894, + "learning_rate": 4.4233130517027854e-07, + "loss": 0.2984, + "step": 3040 + }, + { + "epoch": 0.5509057971014493, + "grad_norm": 4.940447745353018, + "learning_rate": 4.420398856376686e-07, + "loss": 0.2968, + "step": 3041 + }, + { + "epoch": 0.5510869565217391, + "grad_norm": 3.2577852404309553, + "learning_rate": 4.417484860610005e-07, + "loss": 0.2059, + "step": 3042 + }, + { + "epoch": 0.551268115942029, + "grad_norm": 3.449398679342599, + "learning_rate": 4.4145710654060466e-07, + "loss": 0.2029, + "step": 3043 + }, + { + "epoch": 0.5514492753623188, + "grad_norm": 5.680281589249258, + "learning_rate": 4.411657471768043e-07, + "loss": 0.3632, + "step": 3044 + }, + { + "epoch": 0.5516304347826086, + "grad_norm": 5.594986464110723, + "learning_rate": 4.4087440806991606e-07, + "loss": 0.2917, + "step": 3045 + }, + { + "epoch": 0.5518115942028986, + "grad_norm": 4.151506151568021, + "learning_rate": 4.405830893202493e-07, + "loss": 0.2303, + "step": 3046 + }, + { + "epoch": 0.5519927536231884, + "grad_norm": 4.81399760256334, + "learning_rate": 4.402917910281065e-07, + "loss": 0.3193, + "step": 3047 + }, + { + "epoch": 0.5521739130434783, + "grad_norm": 9.003512088687303, + "learning_rate": 4.4000051329378256e-07, + "loss": 0.2758, + "step": 3048 + }, + { + "epoch": 0.5523550724637681, + "grad_norm": 10.180209402771974, + "learning_rate": 4.397092562175666e-07, + "loss": 0.222, + "step": 3049 + }, + { + "epoch": 0.552536231884058, + "grad_norm": 8.975281306131343, + "learning_rate": 4.394180198997392e-07, + "loss": 0.2719, + "step": 3050 + }, + { + "epoch": 0.5527173913043478, + "grad_norm": 9.774435292322675, + "learning_rate": 4.3912680444057465e-07, + "loss": 0.2654, + "step": 3051 + }, + { + "epoch": 0.5528985507246377, + "grad_norm": 5.981068924724844, + "learning_rate": 4.3883560994033965e-07, + "loss": 0.2547, + "step": 3052 + }, + { + "epoch": 0.5530797101449275, + "grad_norm": 4.057776427574125, + "learning_rate": 4.38544436499294e-07, + "loss": 0.2628, + "step": 3053 + }, + { + "epoch": 0.5532608695652174, + "grad_norm": 7.213901341063897, + "learning_rate": 4.382532842176901e-07, + "loss": 0.3431, + "step": 3054 + }, + { + "epoch": 0.5534420289855072, + "grad_norm": 5.568143023767482, + "learning_rate": 4.3796215319577304e-07, + "loss": 0.3236, + "step": 3055 + }, + { + "epoch": 0.553623188405797, + "grad_norm": 9.110566168045363, + "learning_rate": 4.376710435337803e-07, + "loss": 0.3753, + "step": 3056 + }, + { + "epoch": 0.553804347826087, + "grad_norm": 7.904957367726735, + "learning_rate": 4.3737995533194285e-07, + "loss": 0.2792, + "step": 3057 + }, + { + "epoch": 0.5539855072463769, + "grad_norm": 9.819636699779448, + "learning_rate": 4.3708888869048357e-07, + "loss": 0.3073, + "step": 3058 + }, + { + "epoch": 0.5541666666666667, + "grad_norm": 3.5955363064912467, + "learning_rate": 4.3679784370961763e-07, + "loss": 0.2136, + "step": 3059 + }, + { + "epoch": 0.5543478260869565, + "grad_norm": 4.364748613602733, + "learning_rate": 4.365068204895539e-07, + "loss": 0.2978, + "step": 3060 + }, + { + "epoch": 0.5545289855072464, + "grad_norm": 4.952137117510628, + "learning_rate": 4.362158191304926e-07, + "loss": 0.286, + "step": 3061 + }, + { + "epoch": 0.5547101449275362, + "grad_norm": 4.250720821427454, + "learning_rate": 4.35924839732627e-07, + "loss": 0.2574, + "step": 3062 + }, + { + "epoch": 0.5548913043478261, + "grad_norm": 4.096297202278202, + "learning_rate": 4.356338823961426e-07, + "loss": 0.2784, + "step": 3063 + }, + { + "epoch": 0.5550724637681159, + "grad_norm": 7.13673607899764, + "learning_rate": 4.353429472212175e-07, + "loss": 0.2777, + "step": 3064 + }, + { + "epoch": 0.5552536231884058, + "grad_norm": 4.546895754919055, + "learning_rate": 4.350520343080221e-07, + "loss": 0.2878, + "step": 3065 + }, + { + "epoch": 0.5554347826086956, + "grad_norm": 4.661956689875289, + "learning_rate": 4.3476114375671904e-07, + "loss": 0.2853, + "step": 3066 + }, + { + "epoch": 0.5556159420289855, + "grad_norm": 3.245026121304084, + "learning_rate": 4.3447027566746296e-07, + "loss": 0.2377, + "step": 3067 + }, + { + "epoch": 0.5557971014492754, + "grad_norm": 3.816033137196711, + "learning_rate": 4.341794301404019e-07, + "loss": 0.3168, + "step": 3068 + }, + { + "epoch": 0.5559782608695653, + "grad_norm": 5.170592258852042, + "learning_rate": 4.338886072756747e-07, + "loss": 0.2686, + "step": 3069 + }, + { + "epoch": 0.5561594202898551, + "grad_norm": 4.193429030165531, + "learning_rate": 4.335978071734133e-07, + "loss": 0.2838, + "step": 3070 + }, + { + "epoch": 0.556340579710145, + "grad_norm": 4.252985585484878, + "learning_rate": 4.3330702993374136e-07, + "loss": 0.3035, + "step": 3071 + }, + { + "epoch": 0.5565217391304348, + "grad_norm": 4.415642859341208, + "learning_rate": 4.330162756567752e-07, + "loss": 0.3315, + "step": 3072 + }, + { + "epoch": 0.5567028985507246, + "grad_norm": 3.373147831717073, + "learning_rate": 4.3272554444262265e-07, + "loss": 0.3083, + "step": 3073 + }, + { + "epoch": 0.5568840579710145, + "grad_norm": 3.861033259888325, + "learning_rate": 4.32434836391384e-07, + "loss": 0.3135, + "step": 3074 + }, + { + "epoch": 0.5570652173913043, + "grad_norm": 4.139382322849648, + "learning_rate": 4.3214415160315096e-07, + "loss": 0.2689, + "step": 3075 + }, + { + "epoch": 0.5572463768115942, + "grad_norm": 3.9494392571429415, + "learning_rate": 4.318534901780084e-07, + "loss": 0.2547, + "step": 3076 + }, + { + "epoch": 0.557427536231884, + "grad_norm": 5.836651239042293, + "learning_rate": 4.3156285221603195e-07, + "loss": 0.3206, + "step": 3077 + }, + { + "epoch": 0.5576086956521739, + "grad_norm": 5.241781109855, + "learning_rate": 4.312722378172898e-07, + "loss": 0.316, + "step": 3078 + }, + { + "epoch": 0.5577898550724638, + "grad_norm": 5.5655640008768055, + "learning_rate": 4.309816470818416e-07, + "loss": 0.3806, + "step": 3079 + }, + { + "epoch": 0.5579710144927537, + "grad_norm": 5.1015152634940755, + "learning_rate": 4.3069108010973947e-07, + "loss": 0.3045, + "step": 3080 + }, + { + "epoch": 0.5581521739130435, + "grad_norm": 9.555409191887346, + "learning_rate": 4.3040053700102685e-07, + "loss": 0.3007, + "step": 3081 + }, + { + "epoch": 0.5583333333333333, + "grad_norm": 4.0169729126319735, + "learning_rate": 4.3011001785573927e-07, + "loss": 0.3313, + "step": 3082 + }, + { + "epoch": 0.5585144927536232, + "grad_norm": 4.061341405939489, + "learning_rate": 4.298195227739033e-07, + "loss": 0.351, + "step": 3083 + }, + { + "epoch": 0.558695652173913, + "grad_norm": 3.748759139505214, + "learning_rate": 4.2952905185553844e-07, + "loss": 0.2306, + "step": 3084 + }, + { + "epoch": 0.5588768115942029, + "grad_norm": 5.1452429073871695, + "learning_rate": 4.292386052006549e-07, + "loss": 0.248, + "step": 3085 + }, + { + "epoch": 0.5590579710144927, + "grad_norm": 8.889189688543281, + "learning_rate": 4.2894818290925483e-07, + "loss": 0.2637, + "step": 3086 + }, + { + "epoch": 0.5592391304347826, + "grad_norm": 5.078518650418005, + "learning_rate": 4.28657785081332e-07, + "loss": 0.358, + "step": 3087 + }, + { + "epoch": 0.5594202898550724, + "grad_norm": 8.79729138579761, + "learning_rate": 4.283674118168718e-07, + "loss": 0.2508, + "step": 3088 + }, + { + "epoch": 0.5596014492753624, + "grad_norm": 3.944535166883078, + "learning_rate": 4.2807706321585115e-07, + "loss": 0.3259, + "step": 3089 + }, + { + "epoch": 0.5597826086956522, + "grad_norm": 3.9694116321997117, + "learning_rate": 4.277867393782385e-07, + "loss": 0.2987, + "step": 3090 + }, + { + "epoch": 0.5599637681159421, + "grad_norm": 9.300774582156349, + "learning_rate": 4.2749644040399336e-07, + "loss": 0.304, + "step": 3091 + }, + { + "epoch": 0.5601449275362319, + "grad_norm": 4.456884646682889, + "learning_rate": 4.272061663930675e-07, + "loss": 0.3054, + "step": 3092 + }, + { + "epoch": 0.5603260869565218, + "grad_norm": 3.6323857273192024, + "learning_rate": 4.2691591744540357e-07, + "loss": 0.2633, + "step": 3093 + }, + { + "epoch": 0.5605072463768116, + "grad_norm": 4.924186217588339, + "learning_rate": 4.2662569366093525e-07, + "loss": 0.2698, + "step": 3094 + }, + { + "epoch": 0.5606884057971014, + "grad_norm": 4.035031136056453, + "learning_rate": 4.2633549513958855e-07, + "loss": 0.3078, + "step": 3095 + }, + { + "epoch": 0.5608695652173913, + "grad_norm": 6.817369011577607, + "learning_rate": 4.260453219812798e-07, + "loss": 0.2662, + "step": 3096 + }, + { + "epoch": 0.5610507246376811, + "grad_norm": 9.477026212375621, + "learning_rate": 4.2575517428591707e-07, + "loss": 0.2849, + "step": 3097 + }, + { + "epoch": 0.561231884057971, + "grad_norm": 4.0656429152612, + "learning_rate": 4.254650521533996e-07, + "loss": 0.271, + "step": 3098 + }, + { + "epoch": 0.5614130434782608, + "grad_norm": 6.312032905329363, + "learning_rate": 4.2517495568361776e-07, + "loss": 0.3003, + "step": 3099 + }, + { + "epoch": 0.5615942028985508, + "grad_norm": 4.185878879962339, + "learning_rate": 4.2488488497645335e-07, + "loss": 0.2839, + "step": 3100 + }, + { + "epoch": 0.5615942028985508, + "eval_loss": 0.2846718728542328, + "eval_runtime": 9.777, + "eval_samples_per_second": 51.14, + "eval_steps_per_second": 0.102, + "step": 3100 + }, + { + "epoch": 0.5617753623188406, + "grad_norm": 6.949084863457091, + "learning_rate": 4.2459484013177906e-07, + "loss": 0.3165, + "step": 3101 + }, + { + "epoch": 0.5619565217391305, + "grad_norm": 4.592299253828998, + "learning_rate": 4.2430482124945816e-07, + "loss": 0.2896, + "step": 3102 + }, + { + "epoch": 0.5621376811594203, + "grad_norm": 3.806262327518691, + "learning_rate": 4.240148284293463e-07, + "loss": 0.3167, + "step": 3103 + }, + { + "epoch": 0.5623188405797102, + "grad_norm": 5.257043951174633, + "learning_rate": 4.2372486177128903e-07, + "loss": 0.3397, + "step": 3104 + }, + { + "epoch": 0.5625, + "grad_norm": 6.609651408730121, + "learning_rate": 4.2343492137512314e-07, + "loss": 0.3698, + "step": 3105 + }, + { + "epoch": 0.5626811594202898, + "grad_norm": 6.137139695731701, + "learning_rate": 4.231450073406766e-07, + "loss": 0.3414, + "step": 3106 + }, + { + "epoch": 0.5628623188405797, + "grad_norm": 3.6846949971123313, + "learning_rate": 4.2285511976776823e-07, + "loss": 0.2878, + "step": 3107 + }, + { + "epoch": 0.5630434782608695, + "grad_norm": 4.623271243801174, + "learning_rate": 4.225652587562076e-07, + "loss": 0.2772, + "step": 3108 + }, + { + "epoch": 0.5632246376811594, + "grad_norm": 5.648686512173944, + "learning_rate": 4.2227542440579545e-07, + "loss": 0.2966, + "step": 3109 + }, + { + "epoch": 0.5634057971014492, + "grad_norm": 3.9955644324813777, + "learning_rate": 4.2198561681632256e-07, + "loss": 0.2767, + "step": 3110 + }, + { + "epoch": 0.5635869565217392, + "grad_norm": 3.635889960840437, + "learning_rate": 4.2169583608757183e-07, + "loss": 0.2879, + "step": 3111 + }, + { + "epoch": 0.563768115942029, + "grad_norm": 3.7507716297553766, + "learning_rate": 4.214060823193156e-07, + "loss": 0.3352, + "step": 3112 + }, + { + "epoch": 0.5639492753623189, + "grad_norm": 5.607021049512822, + "learning_rate": 4.2111635561131756e-07, + "loss": 0.3185, + "step": 3113 + }, + { + "epoch": 0.5641304347826087, + "grad_norm": 3.431461168725288, + "learning_rate": 4.20826656063332e-07, + "loss": 0.2626, + "step": 3114 + }, + { + "epoch": 0.5643115942028986, + "grad_norm": 4.102314089864365, + "learning_rate": 4.20536983775104e-07, + "loss": 0.3115, + "step": 3115 + }, + { + "epoch": 0.5644927536231884, + "grad_norm": 5.666883550192323, + "learning_rate": 4.20247338846369e-07, + "loss": 0.2644, + "step": 3116 + }, + { + "epoch": 0.5646739130434782, + "grad_norm": 4.139260839997685, + "learning_rate": 4.1995772137685317e-07, + "loss": 0.2957, + "step": 3117 + }, + { + "epoch": 0.5648550724637681, + "grad_norm": 4.124176552616789, + "learning_rate": 4.196681314662728e-07, + "loss": 0.3167, + "step": 3118 + }, + { + "epoch": 0.5650362318840579, + "grad_norm": 3.479069189641907, + "learning_rate": 4.1937856921433574e-07, + "loss": 0.2617, + "step": 3119 + }, + { + "epoch": 0.5652173913043478, + "grad_norm": 4.679231527721626, + "learning_rate": 4.190890347207392e-07, + "loss": 0.3134, + "step": 3120 + }, + { + "epoch": 0.5653985507246376, + "grad_norm": 3.2858164223322244, + "learning_rate": 4.1879952808517133e-07, + "loss": 0.2576, + "step": 3121 + }, + { + "epoch": 0.5655797101449276, + "grad_norm": 5.965198139648401, + "learning_rate": 4.1851004940731054e-07, + "loss": 0.219, + "step": 3122 + }, + { + "epoch": 0.5657608695652174, + "grad_norm": 7.409424627993025, + "learning_rate": 4.1822059878682605e-07, + "loss": 0.3167, + "step": 3123 + }, + { + "epoch": 0.5659420289855073, + "grad_norm": 4.864908478326171, + "learning_rate": 4.179311763233768e-07, + "loss": 0.2312, + "step": 3124 + }, + { + "epoch": 0.5661231884057971, + "grad_norm": 5.406560086172158, + "learning_rate": 4.176417821166125e-07, + "loss": 0.3193, + "step": 3125 + }, + { + "epoch": 0.566304347826087, + "grad_norm": 5.54760518102799, + "learning_rate": 4.173524162661726e-07, + "loss": 0.2976, + "step": 3126 + }, + { + "epoch": 0.5664855072463768, + "grad_norm": 12.752484752986526, + "learning_rate": 4.170630788716875e-07, + "loss": 0.3015, + "step": 3127 + }, + { + "epoch": 0.5666666666666667, + "grad_norm": 4.5778004716882235, + "learning_rate": 4.1677377003277743e-07, + "loss": 0.3035, + "step": 3128 + }, + { + "epoch": 0.5668478260869565, + "grad_norm": 3.919913317470607, + "learning_rate": 4.1648448984905234e-07, + "loss": 0.2591, + "step": 3129 + }, + { + "epoch": 0.5670289855072463, + "grad_norm": 3.2804008284376933, + "learning_rate": 4.1619523842011343e-07, + "loss": 0.2717, + "step": 3130 + }, + { + "epoch": 0.5672101449275362, + "grad_norm": 5.0904713266390145, + "learning_rate": 4.15906015845551e-07, + "loss": 0.2885, + "step": 3131 + }, + { + "epoch": 0.5673913043478261, + "grad_norm": 6.787433047325534, + "learning_rate": 4.156168222249456e-07, + "loss": 0.289, + "step": 3132 + }, + { + "epoch": 0.567572463768116, + "grad_norm": 3.135895726086536, + "learning_rate": 4.1532765765786807e-07, + "loss": 0.2587, + "step": 3133 + }, + { + "epoch": 0.5677536231884058, + "grad_norm": 5.3886413582760655, + "learning_rate": 4.1503852224387934e-07, + "loss": 0.2865, + "step": 3134 + }, + { + "epoch": 0.5679347826086957, + "grad_norm": 6.891292318722531, + "learning_rate": 4.1474941608252994e-07, + "loss": 0.3004, + "step": 3135 + }, + { + "epoch": 0.5681159420289855, + "grad_norm": 4.0631171289460575, + "learning_rate": 4.144603392733606e-07, + "loss": 0.2924, + "step": 3136 + }, + { + "epoch": 0.5682971014492754, + "grad_norm": 3.971983785321238, + "learning_rate": 4.141712919159014e-07, + "loss": 0.3422, + "step": 3137 + }, + { + "epoch": 0.5684782608695652, + "grad_norm": 4.421868693202735, + "learning_rate": 4.1388227410967344e-07, + "loss": 0.2748, + "step": 3138 + }, + { + "epoch": 0.568659420289855, + "grad_norm": 3.78304635392667, + "learning_rate": 4.135932859541864e-07, + "loss": 0.2799, + "step": 3139 + }, + { + "epoch": 0.5688405797101449, + "grad_norm": 9.240160227832156, + "learning_rate": 4.133043275489404e-07, + "loss": 0.3167, + "step": 3140 + }, + { + "epoch": 0.5690217391304347, + "grad_norm": 4.299978197339649, + "learning_rate": 4.1301539899342513e-07, + "loss": 0.306, + "step": 3141 + }, + { + "epoch": 0.5692028985507246, + "grad_norm": 6.495885024418353, + "learning_rate": 4.127265003871202e-07, + "loss": 0.3121, + "step": 3142 + }, + { + "epoch": 0.5693840579710145, + "grad_norm": 5.930882769486175, + "learning_rate": 4.1243763182949475e-07, + "loss": 0.2737, + "step": 3143 + }, + { + "epoch": 0.5695652173913044, + "grad_norm": 3.5745530243430226, + "learning_rate": 4.121487934200076e-07, + "loss": 0.2837, + "step": 3144 + }, + { + "epoch": 0.5697463768115942, + "grad_norm": 5.313532692298856, + "learning_rate": 4.118599852581068e-07, + "loss": 0.3122, + "step": 3145 + }, + { + "epoch": 0.5699275362318841, + "grad_norm": 3.4633376632436788, + "learning_rate": 4.11571207443231e-07, + "loss": 0.2672, + "step": 3146 + }, + { + "epoch": 0.5701086956521739, + "grad_norm": 6.337617029583747, + "learning_rate": 4.112824600748074e-07, + "loss": 0.2706, + "step": 3147 + }, + { + "epoch": 0.5702898550724638, + "grad_norm": 3.9319648502173057, + "learning_rate": 4.109937432522531e-07, + "loss": 0.3105, + "step": 3148 + }, + { + "epoch": 0.5704710144927536, + "grad_norm": 3.7605660949503164, + "learning_rate": 4.1070505707497466e-07, + "loss": 0.2592, + "step": 3149 + }, + { + "epoch": 0.5706521739130435, + "grad_norm": 4.973335691821945, + "learning_rate": 4.1041640164236825e-07, + "loss": 0.3108, + "step": 3150 + }, + { + "epoch": 0.5708333333333333, + "grad_norm": 3.956145442108406, + "learning_rate": 4.1012777705381917e-07, + "loss": 0.2471, + "step": 3151 + }, + { + "epoch": 0.5710144927536231, + "grad_norm": 4.520781047221492, + "learning_rate": 4.098391834087024e-07, + "loss": 0.3473, + "step": 3152 + }, + { + "epoch": 0.571195652173913, + "grad_norm": 5.322092816443009, + "learning_rate": 4.095506208063817e-07, + "loss": 0.358, + "step": 3153 + }, + { + "epoch": 0.571376811594203, + "grad_norm": 3.8052133702527535, + "learning_rate": 4.0926208934621107e-07, + "loss": 0.2829, + "step": 3154 + }, + { + "epoch": 0.5715579710144928, + "grad_norm": 7.80563711340646, + "learning_rate": 4.089735891275329e-07, + "loss": 0.2911, + "step": 3155 + }, + { + "epoch": 0.5717391304347826, + "grad_norm": 4.352110278183092, + "learning_rate": 4.086851202496794e-07, + "loss": 0.2012, + "step": 3156 + }, + { + "epoch": 0.5719202898550725, + "grad_norm": 3.5658827225666867, + "learning_rate": 4.083966828119714e-07, + "loss": 0.2701, + "step": 3157 + }, + { + "epoch": 0.5721014492753623, + "grad_norm": 4.411114675989049, + "learning_rate": 4.0810827691371976e-07, + "loss": 0.2751, + "step": 3158 + }, + { + "epoch": 0.5722826086956522, + "grad_norm": 4.123892333270912, + "learning_rate": 4.0781990265422365e-07, + "loss": 0.2978, + "step": 3159 + }, + { + "epoch": 0.572463768115942, + "grad_norm": 3.813536335277768, + "learning_rate": 4.0753156013277204e-07, + "loss": 0.2974, + "step": 3160 + }, + { + "epoch": 0.5726449275362319, + "grad_norm": 3.7620965430692532, + "learning_rate": 4.07243249448642e-07, + "loss": 0.2796, + "step": 3161 + }, + { + "epoch": 0.5728260869565217, + "grad_norm": 3.565521747323397, + "learning_rate": 4.069549707011009e-07, + "loss": 0.2118, + "step": 3162 + }, + { + "epoch": 0.5730072463768116, + "grad_norm": 4.857204202840505, + "learning_rate": 4.066667239894043e-07, + "loss": 0.3456, + "step": 3163 + }, + { + "epoch": 0.5731884057971014, + "grad_norm": 3.5528962088365987, + "learning_rate": 4.0637850941279686e-07, + "loss": 0.2639, + "step": 3164 + }, + { + "epoch": 0.5733695652173914, + "grad_norm": 4.776114143366503, + "learning_rate": 4.06090327070512e-07, + "loss": 0.3894, + "step": 3165 + }, + { + "epoch": 0.5735507246376812, + "grad_norm": 4.018969363195631, + "learning_rate": 4.058021770617727e-07, + "loss": 0.2462, + "step": 3166 + }, + { + "epoch": 0.573731884057971, + "grad_norm": 3.288804497553654, + "learning_rate": 4.055140594857901e-07, + "loss": 0.273, + "step": 3167 + }, + { + "epoch": 0.5739130434782609, + "grad_norm": 3.9788793824218405, + "learning_rate": 4.052259744417643e-07, + "loss": 0.2915, + "step": 3168 + }, + { + "epoch": 0.5740942028985507, + "grad_norm": 8.572059514244884, + "learning_rate": 4.049379220288848e-07, + "loss": 0.2836, + "step": 3169 + }, + { + "epoch": 0.5742753623188406, + "grad_norm": 5.01916765722816, + "learning_rate": 4.0464990234632914e-07, + "loss": 0.3712, + "step": 3170 + }, + { + "epoch": 0.5744565217391304, + "grad_norm": 3.7980771704558327, + "learning_rate": 4.0436191549326393e-07, + "loss": 0.2504, + "step": 3171 + }, + { + "epoch": 0.5746376811594203, + "grad_norm": 8.559109239460847, + "learning_rate": 4.0407396156884405e-07, + "loss": 0.321, + "step": 3172 + }, + { + "epoch": 0.5748188405797101, + "grad_norm": 4.741865725066212, + "learning_rate": 4.0378604067221406e-07, + "loss": 0.2537, + "step": 3173 + }, + { + "epoch": 0.575, + "grad_norm": 7.071076993744153, + "learning_rate": 4.03498152902506e-07, + "loss": 0.3358, + "step": 3174 + }, + { + "epoch": 0.5751811594202898, + "grad_norm": 6.473387412681447, + "learning_rate": 4.032102983588411e-07, + "loss": 0.2787, + "step": 3175 + }, + { + "epoch": 0.5753623188405798, + "grad_norm": 6.521075587914448, + "learning_rate": 4.0292247714032906e-07, + "loss": 0.3716, + "step": 3176 + }, + { + "epoch": 0.5755434782608696, + "grad_norm": 4.5883114766243045, + "learning_rate": 4.0263468934606814e-07, + "loss": 0.2586, + "step": 3177 + }, + { + "epoch": 0.5757246376811594, + "grad_norm": 6.354345728235465, + "learning_rate": 4.0234693507514506e-07, + "loss": 0.327, + "step": 3178 + }, + { + "epoch": 0.5759057971014493, + "grad_norm": 7.045053993053714, + "learning_rate": 4.02059214426635e-07, + "loss": 0.3047, + "step": 3179 + }, + { + "epoch": 0.5760869565217391, + "grad_norm": 4.349362756969652, + "learning_rate": 4.0177152749960106e-07, + "loss": 0.3257, + "step": 3180 + }, + { + "epoch": 0.576268115942029, + "grad_norm": 3.7961314109792093, + "learning_rate": 4.0148387439309607e-07, + "loss": 0.2623, + "step": 3181 + }, + { + "epoch": 0.5764492753623188, + "grad_norm": 6.670507682263265, + "learning_rate": 4.0119625520615976e-07, + "loss": 0.2667, + "step": 3182 + }, + { + "epoch": 0.5766304347826087, + "grad_norm": 6.710426180940357, + "learning_rate": 4.009086700378209e-07, + "loss": 0.2349, + "step": 3183 + }, + { + "epoch": 0.5768115942028985, + "grad_norm": 7.508029539245563, + "learning_rate": 4.006211189870964e-07, + "loss": 0.3401, + "step": 3184 + }, + { + "epoch": 0.5769927536231884, + "grad_norm": 7.076363750359712, + "learning_rate": 4.003336021529915e-07, + "loss": 0.2695, + "step": 3185 + }, + { + "epoch": 0.5771739130434783, + "grad_norm": 3.7809178451605123, + "learning_rate": 4.0004611963449966e-07, + "loss": 0.3062, + "step": 3186 + }, + { + "epoch": 0.5773550724637682, + "grad_norm": 3.5979546910937352, + "learning_rate": 3.997586715306026e-07, + "loss": 0.2785, + "step": 3187 + }, + { + "epoch": 0.577536231884058, + "grad_norm": 5.830730515411123, + "learning_rate": 3.994712579402695e-07, + "loss": 0.2812, + "step": 3188 + }, + { + "epoch": 0.5777173913043478, + "grad_norm": 7.476645754214058, + "learning_rate": 3.991838789624589e-07, + "loss": 0.3339, + "step": 3189 + }, + { + "epoch": 0.5778985507246377, + "grad_norm": 5.417749254434929, + "learning_rate": 3.988965346961164e-07, + "loss": 0.2789, + "step": 3190 + }, + { + "epoch": 0.5780797101449275, + "grad_norm": 3.657765745586712, + "learning_rate": 3.98609225240176e-07, + "loss": 0.2615, + "step": 3191 + }, + { + "epoch": 0.5782608695652174, + "grad_norm": 5.675044204409349, + "learning_rate": 3.983219506935597e-07, + "loss": 0.2599, + "step": 3192 + }, + { + "epoch": 0.5784420289855072, + "grad_norm": 8.459184805088174, + "learning_rate": 3.9803471115517756e-07, + "loss": 0.2801, + "step": 3193 + }, + { + "epoch": 0.5786231884057971, + "grad_norm": 4.157611675203298, + "learning_rate": 3.9774750672392754e-07, + "loss": 0.2852, + "step": 3194 + }, + { + "epoch": 0.5788043478260869, + "grad_norm": 8.947731097209754, + "learning_rate": 3.974603374986956e-07, + "loss": 0.3367, + "step": 3195 + }, + { + "epoch": 0.5789855072463768, + "grad_norm": 8.917631421416088, + "learning_rate": 3.9717320357835486e-07, + "loss": 0.2678, + "step": 3196 + }, + { + "epoch": 0.5791666666666667, + "grad_norm": 3.5654023616724633, + "learning_rate": 3.968861050617676e-07, + "loss": 0.2662, + "step": 3197 + }, + { + "epoch": 0.5793478260869566, + "grad_norm": 4.001834116139316, + "learning_rate": 3.9659904204778304e-07, + "loss": 0.3232, + "step": 3198 + }, + { + "epoch": 0.5795289855072464, + "grad_norm": 3.2428858988458837, + "learning_rate": 3.963120146352381e-07, + "loss": 0.2375, + "step": 3199 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 5.317994623227954, + "learning_rate": 3.960250229229577e-07, + "loss": 0.2979, + "step": 3200 + }, + { + "epoch": 0.5797101449275363, + "eval_loss": 0.2931874990463257, + "eval_runtime": 9.7575, + "eval_samples_per_second": 51.243, + "eval_steps_per_second": 0.102, + "step": 3200 + }, + { + "epoch": 0.5798913043478261, + "grad_norm": 10.632587403830119, + "learning_rate": 3.9573806700975475e-07, + "loss": 0.363, + "step": 3201 + }, + { + "epoch": 0.5800724637681159, + "grad_norm": 8.201436610766304, + "learning_rate": 3.9545114699442927e-07, + "loss": 0.2855, + "step": 3202 + }, + { + "epoch": 0.5802536231884058, + "grad_norm": 4.847513761989736, + "learning_rate": 3.951642629757691e-07, + "loss": 0.311, + "step": 3203 + }, + { + "epoch": 0.5804347826086956, + "grad_norm": 5.402295087246476, + "learning_rate": 3.9487741505255e-07, + "loss": 0.2665, + "step": 3204 + }, + { + "epoch": 0.5806159420289855, + "grad_norm": 5.398358882498308, + "learning_rate": 3.9459060332353504e-07, + "loss": 0.2705, + "step": 3205 + }, + { + "epoch": 0.5807971014492753, + "grad_norm": 6.826953785054319, + "learning_rate": 3.943038278874749e-07, + "loss": 0.3216, + "step": 3206 + }, + { + "epoch": 0.5809782608695652, + "grad_norm": 4.312309292767122, + "learning_rate": 3.940170888431073e-07, + "loss": 0.2814, + "step": 3207 + }, + { + "epoch": 0.5811594202898551, + "grad_norm": 4.571435328470545, + "learning_rate": 3.9373038628915846e-07, + "loss": 0.3012, + "step": 3208 + }, + { + "epoch": 0.581340579710145, + "grad_norm": 4.6779913648386975, + "learning_rate": 3.9344372032434104e-07, + "loss": 0.3028, + "step": 3209 + }, + { + "epoch": 0.5815217391304348, + "grad_norm": 4.379491450121226, + "learning_rate": 3.931570910473556e-07, + "loss": 0.3233, + "step": 3210 + }, + { + "epoch": 0.5817028985507247, + "grad_norm": 8.267627570898787, + "learning_rate": 3.928704985568898e-07, + "loss": 0.3013, + "step": 3211 + }, + { + "epoch": 0.5818840579710145, + "grad_norm": 4.332113991406994, + "learning_rate": 3.925839429516191e-07, + "loss": 0.2283, + "step": 3212 + }, + { + "epoch": 0.5820652173913043, + "grad_norm": 4.032941534913446, + "learning_rate": 3.9229742433020575e-07, + "loss": 0.2905, + "step": 3213 + }, + { + "epoch": 0.5822463768115942, + "grad_norm": 3.821066720226336, + "learning_rate": 3.9201094279129967e-07, + "loss": 0.2689, + "step": 3214 + }, + { + "epoch": 0.582427536231884, + "grad_norm": 7.352123234074928, + "learning_rate": 3.917244984335372e-07, + "loss": 0.2378, + "step": 3215 + }, + { + "epoch": 0.5826086956521739, + "grad_norm": 3.591410503349207, + "learning_rate": 3.914380913555434e-07, + "loss": 0.3025, + "step": 3216 + }, + { + "epoch": 0.5827898550724637, + "grad_norm": 4.678012854640022, + "learning_rate": 3.911517216559289e-07, + "loss": 0.2707, + "step": 3217 + }, + { + "epoch": 0.5829710144927536, + "grad_norm": 5.782072710550837, + "learning_rate": 3.908653894332925e-07, + "loss": 0.2901, + "step": 3218 + }, + { + "epoch": 0.5831521739130435, + "grad_norm": 4.0049765729710085, + "learning_rate": 3.905790947862194e-07, + "loss": 0.2679, + "step": 3219 + }, + { + "epoch": 0.5833333333333334, + "grad_norm": 5.717771141483069, + "learning_rate": 3.9029283781328255e-07, + "loss": 0.3343, + "step": 3220 + }, + { + "epoch": 0.5835144927536232, + "grad_norm": 4.610237268426266, + "learning_rate": 3.9000661861304147e-07, + "loss": 0.4015, + "step": 3221 + }, + { + "epoch": 0.5836956521739131, + "grad_norm": 3.7547520008586686, + "learning_rate": 3.897204372840428e-07, + "loss": 0.2813, + "step": 3222 + }, + { + "epoch": 0.5838768115942029, + "grad_norm": 5.66949126602403, + "learning_rate": 3.8943429392481987e-07, + "loss": 0.3884, + "step": 3223 + }, + { + "epoch": 0.5840579710144927, + "grad_norm": 5.677565741953224, + "learning_rate": 3.8914818863389363e-07, + "loss": 0.3382, + "step": 3224 + }, + { + "epoch": 0.5842391304347826, + "grad_norm": 4.440918277352746, + "learning_rate": 3.8886212150977124e-07, + "loss": 0.2991, + "step": 3225 + }, + { + "epoch": 0.5844202898550724, + "grad_norm": 3.7331164079927954, + "learning_rate": 3.8857609265094695e-07, + "loss": 0.3155, + "step": 3226 + }, + { + "epoch": 0.5846014492753623, + "grad_norm": 4.1338565102813885, + "learning_rate": 3.882901021559018e-07, + "loss": 0.3377, + "step": 3227 + }, + { + "epoch": 0.5847826086956521, + "grad_norm": 3.49638612370747, + "learning_rate": 3.8800415012310385e-07, + "loss": 0.3029, + "step": 3228 + }, + { + "epoch": 0.5849637681159421, + "grad_norm": 4.659768814843623, + "learning_rate": 3.8771823665100765e-07, + "loss": 0.3031, + "step": 3229 + }, + { + "epoch": 0.5851449275362319, + "grad_norm": 4.260517216937475, + "learning_rate": 3.8743236183805467e-07, + "loss": 0.2606, + "step": 3230 + }, + { + "epoch": 0.5853260869565218, + "grad_norm": 5.120662909136054, + "learning_rate": 3.871465257826727e-07, + "loss": 0.3626, + "step": 3231 + }, + { + "epoch": 0.5855072463768116, + "grad_norm": 3.3822814763757405, + "learning_rate": 3.8686072858327674e-07, + "loss": 0.2247, + "step": 3232 + }, + { + "epoch": 0.5856884057971015, + "grad_norm": 6.219499803703084, + "learning_rate": 3.865749703382681e-07, + "loss": 0.3559, + "step": 3233 + }, + { + "epoch": 0.5858695652173913, + "grad_norm": 3.901230265365265, + "learning_rate": 3.8628925114603445e-07, + "loss": 0.3047, + "step": 3234 + }, + { + "epoch": 0.5860507246376812, + "grad_norm": 5.789195524685199, + "learning_rate": 3.860035711049503e-07, + "loss": 0.2838, + "step": 3235 + }, + { + "epoch": 0.586231884057971, + "grad_norm": 5.212309378126797, + "learning_rate": 3.8571793031337683e-07, + "loss": 0.2679, + "step": 3236 + }, + { + "epoch": 0.5864130434782608, + "grad_norm": 6.4838493888046, + "learning_rate": 3.854323288696615e-07, + "loss": 0.3177, + "step": 3237 + }, + { + "epoch": 0.5865942028985507, + "grad_norm": 4.238989543633254, + "learning_rate": 3.8514676687213805e-07, + "loss": 0.2897, + "step": 3238 + }, + { + "epoch": 0.5867753623188405, + "grad_norm": 4.0237297775096605, + "learning_rate": 3.84861244419127e-07, + "loss": 0.3091, + "step": 3239 + }, + { + "epoch": 0.5869565217391305, + "grad_norm": 11.250363171204638, + "learning_rate": 3.845757616089351e-07, + "loss": 0.3068, + "step": 3240 + }, + { + "epoch": 0.5871376811594203, + "grad_norm": 4.271114958381544, + "learning_rate": 3.842903185398555e-07, + "loss": 0.2757, + "step": 3241 + }, + { + "epoch": 0.5873188405797102, + "grad_norm": 3.363792699061715, + "learning_rate": 3.840049153101671e-07, + "loss": 0.2961, + "step": 3242 + }, + { + "epoch": 0.5875, + "grad_norm": 3.4954472309060542, + "learning_rate": 3.8371955201813626e-07, + "loss": 0.2579, + "step": 3243 + }, + { + "epoch": 0.5876811594202899, + "grad_norm": 3.586913567068485, + "learning_rate": 3.8343422876201453e-07, + "loss": 0.2827, + "step": 3244 + }, + { + "epoch": 0.5878623188405797, + "grad_norm": 5.0360083602088626, + "learning_rate": 3.8314894564004014e-07, + "loss": 0.2249, + "step": 3245 + }, + { + "epoch": 0.5880434782608696, + "grad_norm": 9.006627240002917, + "learning_rate": 3.8286370275043727e-07, + "loss": 0.3461, + "step": 3246 + }, + { + "epoch": 0.5882246376811594, + "grad_norm": 6.204042962760713, + "learning_rate": 3.825785001914167e-07, + "loss": 0.341, + "step": 3247 + }, + { + "epoch": 0.5884057971014492, + "grad_norm": 4.098854608581227, + "learning_rate": 3.8229333806117484e-07, + "loss": 0.2946, + "step": 3248 + }, + { + "epoch": 0.5885869565217391, + "grad_norm": 4.243210178727484, + "learning_rate": 3.8200821645789453e-07, + "loss": 0.2633, + "step": 3249 + }, + { + "epoch": 0.5887681159420289, + "grad_norm": 5.574360299927119, + "learning_rate": 3.81723135479744e-07, + "loss": 0.2849, + "step": 3250 + }, + { + "epoch": 0.5889492753623189, + "grad_norm": 5.278660862481296, + "learning_rate": 3.814380952248787e-07, + "loss": 0.2842, + "step": 3251 + }, + { + "epoch": 0.5891304347826087, + "grad_norm": 6.588640102490535, + "learning_rate": 3.8115309579143884e-07, + "loss": 0.2468, + "step": 3252 + }, + { + "epoch": 0.5893115942028986, + "grad_norm": 3.50813118412626, + "learning_rate": 3.8086813727755116e-07, + "loss": 0.2671, + "step": 3253 + }, + { + "epoch": 0.5894927536231884, + "grad_norm": 3.6360011048964713, + "learning_rate": 3.8058321978132825e-07, + "loss": 0.2677, + "step": 3254 + }, + { + "epoch": 0.5896739130434783, + "grad_norm": 3.4689192795021886, + "learning_rate": 3.802983434008686e-07, + "loss": 0.3187, + "step": 3255 + }, + { + "epoch": 0.5898550724637681, + "grad_norm": 14.163157871751594, + "learning_rate": 3.8001350823425654e-07, + "loss": 0.3575, + "step": 3256 + }, + { + "epoch": 0.590036231884058, + "grad_norm": 4.752879779040167, + "learning_rate": 3.7972871437956215e-07, + "loss": 0.2869, + "step": 3257 + }, + { + "epoch": 0.5902173913043478, + "grad_norm": 4.166900134651269, + "learning_rate": 3.7944396193484097e-07, + "loss": 0.372, + "step": 3258 + }, + { + "epoch": 0.5903985507246376, + "grad_norm": 3.3734948533573212, + "learning_rate": 3.791592509981353e-07, + "loss": 0.2589, + "step": 3259 + }, + { + "epoch": 0.5905797101449275, + "grad_norm": 4.183294737934078, + "learning_rate": 3.788745816674718e-07, + "loss": 0.3133, + "step": 3260 + }, + { + "epoch": 0.5907608695652173, + "grad_norm": 5.080898492769002, + "learning_rate": 3.785899540408639e-07, + "loss": 0.2875, + "step": 3261 + }, + { + "epoch": 0.5909420289855073, + "grad_norm": 4.528968696106944, + "learning_rate": 3.7830536821630986e-07, + "loss": 0.3152, + "step": 3262 + }, + { + "epoch": 0.5911231884057971, + "grad_norm": 4.029363255046118, + "learning_rate": 3.780208242917943e-07, + "loss": 0.2794, + "step": 3263 + }, + { + "epoch": 0.591304347826087, + "grad_norm": 3.6929004444166282, + "learning_rate": 3.7773632236528687e-07, + "loss": 0.258, + "step": 3264 + }, + { + "epoch": 0.5914855072463768, + "grad_norm": 5.120903405267128, + "learning_rate": 3.774518625347429e-07, + "loss": 0.3018, + "step": 3265 + }, + { + "epoch": 0.5916666666666667, + "grad_norm": 6.583937887940884, + "learning_rate": 3.7716744489810324e-07, + "loss": 0.2676, + "step": 3266 + }, + { + "epoch": 0.5918478260869565, + "grad_norm": 3.8061779805510025, + "learning_rate": 3.768830695532944e-07, + "loss": 0.2682, + "step": 3267 + }, + { + "epoch": 0.5920289855072464, + "grad_norm": 6.667825440545987, + "learning_rate": 3.765987365982282e-07, + "loss": 0.2851, + "step": 3268 + }, + { + "epoch": 0.5922101449275362, + "grad_norm": 6.5544563472066955, + "learning_rate": 3.7631444613080147e-07, + "loss": 0.2962, + "step": 3269 + }, + { + "epoch": 0.592391304347826, + "grad_norm": 7.032115641658924, + "learning_rate": 3.760301982488969e-07, + "loss": 0.2695, + "step": 3270 + }, + { + "epoch": 0.5925724637681159, + "grad_norm": 6.59219998684683, + "learning_rate": 3.757459930503826e-07, + "loss": 0.2904, + "step": 3271 + }, + { + "epoch": 0.5927536231884057, + "grad_norm": 4.694711776980366, + "learning_rate": 3.754618306331117e-07, + "loss": 0.2942, + "step": 3272 + }, + { + "epoch": 0.5929347826086957, + "grad_norm": 5.809791734644619, + "learning_rate": 3.7517771109492233e-07, + "loss": 0.2739, + "step": 3273 + }, + { + "epoch": 0.5931159420289855, + "grad_norm": 6.521801647853962, + "learning_rate": 3.7489363453363864e-07, + "loss": 0.2362, + "step": 3274 + }, + { + "epoch": 0.5932971014492754, + "grad_norm": 4.334760103721602, + "learning_rate": 3.746096010470693e-07, + "loss": 0.3292, + "step": 3275 + }, + { + "epoch": 0.5934782608695652, + "grad_norm": 4.233674351733492, + "learning_rate": 3.743256107330086e-07, + "loss": 0.3026, + "step": 3276 + }, + { + "epoch": 0.5936594202898551, + "grad_norm": 3.6110189239777957, + "learning_rate": 3.740416636892352e-07, + "loss": 0.2613, + "step": 3277 + }, + { + "epoch": 0.5938405797101449, + "grad_norm": 6.528239288578401, + "learning_rate": 3.737577600135141e-07, + "loss": 0.3016, + "step": 3278 + }, + { + "epoch": 0.5940217391304348, + "grad_norm": 6.194466142336035, + "learning_rate": 3.734738998035943e-07, + "loss": 0.315, + "step": 3279 + }, + { + "epoch": 0.5942028985507246, + "grad_norm": 5.2522975418429425, + "learning_rate": 3.731900831572103e-07, + "loss": 0.3537, + "step": 3280 + }, + { + "epoch": 0.5943840579710145, + "grad_norm": 5.053300408112575, + "learning_rate": 3.729063101720814e-07, + "loss": 0.3765, + "step": 3281 + }, + { + "epoch": 0.5945652173913043, + "grad_norm": 4.08815108337814, + "learning_rate": 3.7262258094591224e-07, + "loss": 0.2597, + "step": 3282 + }, + { + "epoch": 0.5947463768115943, + "grad_norm": 6.285926564167949, + "learning_rate": 3.723388955763919e-07, + "loss": 0.2876, + "step": 3283 + }, + { + "epoch": 0.5949275362318841, + "grad_norm": 5.681103637436708, + "learning_rate": 3.7205525416119487e-07, + "loss": 0.3243, + "step": 3284 + }, + { + "epoch": 0.595108695652174, + "grad_norm": 4.778912246133184, + "learning_rate": 3.7177165679797965e-07, + "loss": 0.3038, + "step": 3285 + }, + { + "epoch": 0.5952898550724638, + "grad_norm": 4.792956483475528, + "learning_rate": 3.7148810358439095e-07, + "loss": 0.2946, + "step": 3286 + }, + { + "epoch": 0.5954710144927536, + "grad_norm": 3.72172770604181, + "learning_rate": 3.71204594618057e-07, + "loss": 0.2953, + "step": 3287 + }, + { + "epoch": 0.5956521739130435, + "grad_norm": 4.1948615320251905, + "learning_rate": 3.7092112999659123e-07, + "loss": 0.2538, + "step": 3288 + }, + { + "epoch": 0.5958333333333333, + "grad_norm": 5.343797727190462, + "learning_rate": 3.7063770981759203e-07, + "loss": 0.2542, + "step": 3289 + }, + { + "epoch": 0.5960144927536232, + "grad_norm": 3.9454674952969877, + "learning_rate": 3.7035433417864224e-07, + "loss": 0.2883, + "step": 3290 + }, + { + "epoch": 0.596195652173913, + "grad_norm": 3.184036996587978, + "learning_rate": 3.7007100317730953e-07, + "loss": 0.2418, + "step": 3291 + }, + { + "epoch": 0.5963768115942029, + "grad_norm": 4.508573683342913, + "learning_rate": 3.697877169111462e-07, + "loss": 0.2541, + "step": 3292 + }, + { + "epoch": 0.5965579710144927, + "grad_norm": 14.260662825127268, + "learning_rate": 3.695044754776885e-07, + "loss": 0.3101, + "step": 3293 + }, + { + "epoch": 0.5967391304347827, + "grad_norm": 5.394422598058214, + "learning_rate": 3.6922127897445857e-07, + "loss": 0.2617, + "step": 3294 + }, + { + "epoch": 0.5969202898550725, + "grad_norm": 4.25312159106331, + "learning_rate": 3.689381274989618e-07, + "loss": 0.261, + "step": 3295 + }, + { + "epoch": 0.5971014492753624, + "grad_norm": 6.529957747875226, + "learning_rate": 3.6865502114868876e-07, + "loss": 0.241, + "step": 3296 + }, + { + "epoch": 0.5972826086956522, + "grad_norm": 8.190660379147307, + "learning_rate": 3.683719600211141e-07, + "loss": 0.3252, + "step": 3297 + }, + { + "epoch": 0.597463768115942, + "grad_norm": 3.1977277274683624, + "learning_rate": 3.680889442136974e-07, + "loss": 0.2196, + "step": 3298 + }, + { + "epoch": 0.5976449275362319, + "grad_norm": 7.144997125269342, + "learning_rate": 3.678059738238822e-07, + "loss": 0.3522, + "step": 3299 + }, + { + "epoch": 0.5978260869565217, + "grad_norm": 4.022176567756693, + "learning_rate": 3.675230489490966e-07, + "loss": 0.3139, + "step": 3300 + }, + { + "epoch": 0.5978260869565217, + "eval_loss": 0.2955937385559082, + "eval_runtime": 9.7532, + "eval_samples_per_second": 51.265, + "eval_steps_per_second": 0.103, + "step": 3300 + }, + { + "epoch": 0.5980072463768116, + "grad_norm": 3.651167428835501, + "learning_rate": 3.6724016968675274e-07, + "loss": 0.3181, + "step": 3301 + }, + { + "epoch": 0.5981884057971014, + "grad_norm": 4.020400213111246, + "learning_rate": 3.669573361342477e-07, + "loss": 0.2268, + "step": 3302 + }, + { + "epoch": 0.5983695652173913, + "grad_norm": 3.563789639177328, + "learning_rate": 3.6667454838896226e-07, + "loss": 0.279, + "step": 3303 + }, + { + "epoch": 0.5985507246376811, + "grad_norm": 3.679358010953408, + "learning_rate": 3.663918065482614e-07, + "loss": 0.2967, + "step": 3304 + }, + { + "epoch": 0.5987318840579711, + "grad_norm": 5.381985935358111, + "learning_rate": 3.6610911070949453e-07, + "loss": 0.2859, + "step": 3305 + }, + { + "epoch": 0.5989130434782609, + "grad_norm": 10.081364661882816, + "learning_rate": 3.6582646096999525e-07, + "loss": 0.3456, + "step": 3306 + }, + { + "epoch": 0.5990942028985508, + "grad_norm": 6.863005718495925, + "learning_rate": 3.6554385742708126e-07, + "loss": 0.3032, + "step": 3307 + }, + { + "epoch": 0.5992753623188406, + "grad_norm": 5.2518483261676945, + "learning_rate": 3.6526130017805414e-07, + "loss": 0.2938, + "step": 3308 + }, + { + "epoch": 0.5994565217391304, + "grad_norm": 4.557840109538176, + "learning_rate": 3.649787893201998e-07, + "loss": 0.3504, + "step": 3309 + }, + { + "epoch": 0.5996376811594203, + "grad_norm": 3.8251551994823965, + "learning_rate": 3.646963249507881e-07, + "loss": 0.2773, + "step": 3310 + }, + { + "epoch": 0.5998188405797101, + "grad_norm": 4.748903552679971, + "learning_rate": 3.6441390716707286e-07, + "loss": 0.3156, + "step": 3311 + }, + { + "epoch": 0.6, + "grad_norm": 8.469180151323103, + "learning_rate": 3.6413153606629153e-07, + "loss": 0.2674, + "step": 3312 + }, + { + "epoch": 0.6001811594202898, + "grad_norm": 5.932796807598475, + "learning_rate": 3.638492117456664e-07, + "loss": 0.3092, + "step": 3313 + }, + { + "epoch": 0.6003623188405797, + "grad_norm": 4.780136478246534, + "learning_rate": 3.635669343024027e-07, + "loss": 0.2219, + "step": 3314 + }, + { + "epoch": 0.6005434782608695, + "grad_norm": 3.916191481404587, + "learning_rate": 3.6328470383368987e-07, + "loss": 0.2697, + "step": 3315 + }, + { + "epoch": 0.6007246376811595, + "grad_norm": 4.0423793769398815, + "learning_rate": 3.630025204367012e-07, + "loss": 0.2955, + "step": 3316 + }, + { + "epoch": 0.6009057971014493, + "grad_norm": 4.246249678323975, + "learning_rate": 3.6272038420859396e-07, + "loss": 0.3051, + "step": 3317 + }, + { + "epoch": 0.6010869565217392, + "grad_norm": 4.452423326475648, + "learning_rate": 3.6243829524650895e-07, + "loss": 0.2737, + "step": 3318 + }, + { + "epoch": 0.601268115942029, + "grad_norm": 3.6839003826844325, + "learning_rate": 3.6215625364757063e-07, + "loss": 0.2668, + "step": 3319 + }, + { + "epoch": 0.6014492753623188, + "grad_norm": 5.179098262317639, + "learning_rate": 3.6187425950888706e-07, + "loss": 0.2559, + "step": 3320 + }, + { + "epoch": 0.6016304347826087, + "grad_norm": 4.89579337971018, + "learning_rate": 3.615923129275507e-07, + "loss": 0.2967, + "step": 3321 + }, + { + "epoch": 0.6018115942028985, + "grad_norm": 4.7918255542702655, + "learning_rate": 3.613104140006367e-07, + "loss": 0.3235, + "step": 3322 + }, + { + "epoch": 0.6019927536231884, + "grad_norm": 5.507709040721174, + "learning_rate": 3.6102856282520435e-07, + "loss": 0.2935, + "step": 3323 + }, + { + "epoch": 0.6021739130434782, + "grad_norm": 5.366217628216412, + "learning_rate": 3.6074675949829603e-07, + "loss": 0.3015, + "step": 3324 + }, + { + "epoch": 0.6023550724637681, + "grad_norm": 7.153041854058611, + "learning_rate": 3.604650041169384e-07, + "loss": 0.3326, + "step": 3325 + }, + { + "epoch": 0.6025362318840579, + "grad_norm": 4.36631190057768, + "learning_rate": 3.60183296778141e-07, + "loss": 0.2353, + "step": 3326 + }, + { + "epoch": 0.6027173913043479, + "grad_norm": 3.549837351966586, + "learning_rate": 3.5990163757889704e-07, + "loss": 0.2206, + "step": 3327 + }, + { + "epoch": 0.6028985507246377, + "grad_norm": 10.07892491028872, + "learning_rate": 3.596200266161827e-07, + "loss": 0.2715, + "step": 3328 + }, + { + "epoch": 0.6030797101449276, + "grad_norm": 3.8983570432557104, + "learning_rate": 3.593384639869587e-07, + "loss": 0.3248, + "step": 3329 + }, + { + "epoch": 0.6032608695652174, + "grad_norm": 4.489422865055295, + "learning_rate": 3.59056949788168e-07, + "loss": 0.3404, + "step": 3330 + }, + { + "epoch": 0.6034420289855073, + "grad_norm": 3.4483135395784945, + "learning_rate": 3.587754841167372e-07, + "loss": 0.2872, + "step": 3331 + }, + { + "epoch": 0.6036231884057971, + "grad_norm": 4.235392034039292, + "learning_rate": 3.584940670695763e-07, + "loss": 0.3454, + "step": 3332 + }, + { + "epoch": 0.6038043478260869, + "grad_norm": 5.96991656199851, + "learning_rate": 3.5821269874357864e-07, + "loss": 0.3012, + "step": 3333 + }, + { + "epoch": 0.6039855072463768, + "grad_norm": 5.049135852776992, + "learning_rate": 3.5793137923562053e-07, + "loss": 0.262, + "step": 3334 + }, + { + "epoch": 0.6041666666666666, + "grad_norm": 6.559784801461435, + "learning_rate": 3.5765010864256184e-07, + "loss": 0.3138, + "step": 3335 + }, + { + "epoch": 0.6043478260869565, + "grad_norm": 3.6045134388981657, + "learning_rate": 3.57368887061245e-07, + "loss": 0.3234, + "step": 3336 + }, + { + "epoch": 0.6045289855072464, + "grad_norm": 4.19751242627348, + "learning_rate": 3.570877145884963e-07, + "loss": 0.244, + "step": 3337 + }, + { + "epoch": 0.6047101449275363, + "grad_norm": 3.4859578467470445, + "learning_rate": 3.568065913211247e-07, + "loss": 0.2568, + "step": 3338 + }, + { + "epoch": 0.6048913043478261, + "grad_norm": 6.3285829932738435, + "learning_rate": 3.5652551735592205e-07, + "loss": 0.3026, + "step": 3339 + }, + { + "epoch": 0.605072463768116, + "grad_norm": 9.85991638979906, + "learning_rate": 3.5624449278966347e-07, + "loss": 0.3299, + "step": 3340 + }, + { + "epoch": 0.6052536231884058, + "grad_norm": 7.714432474088771, + "learning_rate": 3.559635177191073e-07, + "loss": 0.279, + "step": 3341 + }, + { + "epoch": 0.6054347826086957, + "grad_norm": 3.713120356597635, + "learning_rate": 3.556825922409943e-07, + "loss": 0.3166, + "step": 3342 + }, + { + "epoch": 0.6056159420289855, + "grad_norm": 5.902214459588551, + "learning_rate": 3.554017164520486e-07, + "loss": 0.2979, + "step": 3343 + }, + { + "epoch": 0.6057971014492753, + "grad_norm": 6.898580287480659, + "learning_rate": 3.5512089044897714e-07, + "loss": 0.3271, + "step": 3344 + }, + { + "epoch": 0.6059782608695652, + "grad_norm": 5.063178511010489, + "learning_rate": 3.548401143284695e-07, + "loss": 0.2301, + "step": 3345 + }, + { + "epoch": 0.606159420289855, + "grad_norm": 4.25370324712661, + "learning_rate": 3.5455938818719843e-07, + "loss": 0.3126, + "step": 3346 + }, + { + "epoch": 0.6063405797101449, + "grad_norm": 5.713186328910256, + "learning_rate": 3.542787121218188e-07, + "loss": 0.3062, + "step": 3347 + }, + { + "epoch": 0.6065217391304348, + "grad_norm": 3.3805392598126005, + "learning_rate": 3.539980862289693e-07, + "loss": 0.2588, + "step": 3348 + }, + { + "epoch": 0.6067028985507247, + "grad_norm": 4.178821876849478, + "learning_rate": 3.5371751060527046e-07, + "loss": 0.3023, + "step": 3349 + }, + { + "epoch": 0.6068840579710145, + "grad_norm": 4.505841378383071, + "learning_rate": 3.534369853473258e-07, + "loss": 0.2392, + "step": 3350 + }, + { + "epoch": 0.6070652173913044, + "grad_norm": 5.967333772132723, + "learning_rate": 3.5315651055172133e-07, + "loss": 0.2878, + "step": 3351 + }, + { + "epoch": 0.6072463768115942, + "grad_norm": 4.2825410858985515, + "learning_rate": 3.528760863150262e-07, + "loss": 0.2711, + "step": 3352 + }, + { + "epoch": 0.6074275362318841, + "grad_norm": 3.6390844389142467, + "learning_rate": 3.525957127337916e-07, + "loss": 0.2304, + "step": 3353 + }, + { + "epoch": 0.6076086956521739, + "grad_norm": 4.747535560337662, + "learning_rate": 3.523153899045517e-07, + "loss": 0.3466, + "step": 3354 + }, + { + "epoch": 0.6077898550724637, + "grad_norm": 4.37513776994546, + "learning_rate": 3.5203511792382246e-07, + "loss": 0.2843, + "step": 3355 + }, + { + "epoch": 0.6079710144927536, + "grad_norm": 4.205680129712524, + "learning_rate": 3.5175489688810344e-07, + "loss": 0.3276, + "step": 3356 + }, + { + "epoch": 0.6081521739130434, + "grad_norm": 5.806257279452617, + "learning_rate": 3.5147472689387583e-07, + "loss": 0.2974, + "step": 3357 + }, + { + "epoch": 0.6083333333333333, + "grad_norm": 3.9912106927406046, + "learning_rate": 3.511946080376034e-07, + "loss": 0.2292, + "step": 3358 + }, + { + "epoch": 0.6085144927536232, + "grad_norm": 5.014533378817344, + "learning_rate": 3.5091454041573236e-07, + "loss": 0.3342, + "step": 3359 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 5.9651598388958105, + "learning_rate": 3.5063452412469157e-07, + "loss": 0.3226, + "step": 3360 + }, + { + "epoch": 0.6088768115942029, + "grad_norm": 8.104090091646354, + "learning_rate": 3.5035455926089184e-07, + "loss": 0.2911, + "step": 3361 + }, + { + "epoch": 0.6090579710144928, + "grad_norm": 4.41459225803029, + "learning_rate": 3.5007464592072643e-07, + "loss": 0.3134, + "step": 3362 + }, + { + "epoch": 0.6092391304347826, + "grad_norm": 3.3807735209487197, + "learning_rate": 3.4979478420057057e-07, + "loss": 0.2551, + "step": 3363 + }, + { + "epoch": 0.6094202898550725, + "grad_norm": 4.782780075566693, + "learning_rate": 3.495149741967824e-07, + "loss": 0.2216, + "step": 3364 + }, + { + "epoch": 0.6096014492753623, + "grad_norm": 5.241905344890328, + "learning_rate": 3.4923521600570173e-07, + "loss": 0.2244, + "step": 3365 + }, + { + "epoch": 0.6097826086956522, + "grad_norm": 3.5897214172554928, + "learning_rate": 3.4895550972365035e-07, + "loss": 0.2721, + "step": 3366 + }, + { + "epoch": 0.609963768115942, + "grad_norm": 5.61174062325696, + "learning_rate": 3.4867585544693256e-07, + "loss": 0.2862, + "step": 3367 + }, + { + "epoch": 0.6101449275362318, + "grad_norm": 4.371578527908705, + "learning_rate": 3.483962532718349e-07, + "loss": 0.2991, + "step": 3368 + }, + { + "epoch": 0.6103260869565217, + "grad_norm": 3.6397045144993094, + "learning_rate": 3.4811670329462546e-07, + "loss": 0.2785, + "step": 3369 + }, + { + "epoch": 0.6105072463768116, + "grad_norm": 4.0070612839986754, + "learning_rate": 3.478372056115547e-07, + "loss": 0.3022, + "step": 3370 + }, + { + "epoch": 0.6106884057971015, + "grad_norm": 3.41215812125486, + "learning_rate": 3.4755776031885496e-07, + "loss": 0.2204, + "step": 3371 + }, + { + "epoch": 0.6108695652173913, + "grad_norm": 3.6570079575541383, + "learning_rate": 3.4727836751274073e-07, + "loss": 0.3153, + "step": 3372 + }, + { + "epoch": 0.6110507246376812, + "grad_norm": 4.92218841189247, + "learning_rate": 3.4699902728940833e-07, + "loss": 0.3091, + "step": 3373 + }, + { + "epoch": 0.611231884057971, + "grad_norm": 3.918209467061928, + "learning_rate": 3.4671973974503565e-07, + "loss": 0.2362, + "step": 3374 + }, + { + "epoch": 0.6114130434782609, + "grad_norm": 4.02918279825564, + "learning_rate": 3.464405049757827e-07, + "loss": 0.2953, + "step": 3375 + }, + { + "epoch": 0.6115942028985507, + "grad_norm": 4.6206769906014085, + "learning_rate": 3.4616132307779156e-07, + "loss": 0.3171, + "step": 3376 + }, + { + "epoch": 0.6117753623188406, + "grad_norm": 4.665061766891564, + "learning_rate": 3.458821941471858e-07, + "loss": 0.2837, + "step": 3377 + }, + { + "epoch": 0.6119565217391304, + "grad_norm": 5.39750229597707, + "learning_rate": 3.456031182800708e-07, + "loss": 0.302, + "step": 3378 + }, + { + "epoch": 0.6121376811594202, + "grad_norm": 6.686040640006759, + "learning_rate": 3.453240955725336e-07, + "loss": 0.3274, + "step": 3379 + }, + { + "epoch": 0.6123188405797102, + "grad_norm": 7.965186976185531, + "learning_rate": 3.450451261206433e-07, + "loss": 0.3138, + "step": 3380 + }, + { + "epoch": 0.6125, + "grad_norm": 7.29154606731725, + "learning_rate": 3.4476621002045034e-07, + "loss": 0.2838, + "step": 3381 + }, + { + "epoch": 0.6126811594202899, + "grad_norm": 3.627494328076497, + "learning_rate": 3.444873473679866e-07, + "loss": 0.2866, + "step": 3382 + }, + { + "epoch": 0.6128623188405797, + "grad_norm": 4.26359221396759, + "learning_rate": 3.442085382592662e-07, + "loss": 0.2896, + "step": 3383 + }, + { + "epoch": 0.6130434782608696, + "grad_norm": 3.2586955889005846, + "learning_rate": 3.439297827902841e-07, + "loss": 0.2314, + "step": 3384 + }, + { + "epoch": 0.6132246376811594, + "grad_norm": 3.5385499510280893, + "learning_rate": 3.436510810570173e-07, + "loss": 0.2329, + "step": 3385 + }, + { + "epoch": 0.6134057971014493, + "grad_norm": 5.964639570819032, + "learning_rate": 3.43372433155424e-07, + "loss": 0.2726, + "step": 3386 + }, + { + "epoch": 0.6135869565217391, + "grad_norm": 3.8224831558381838, + "learning_rate": 3.430938391814442e-07, + "loss": 0.2848, + "step": 3387 + }, + { + "epoch": 0.613768115942029, + "grad_norm": 4.646803586653227, + "learning_rate": 3.4281529923099895e-07, + "loss": 0.2561, + "step": 3388 + }, + { + "epoch": 0.6139492753623188, + "grad_norm": 4.008871311678018, + "learning_rate": 3.4253681339999106e-07, + "loss": 0.3173, + "step": 3389 + }, + { + "epoch": 0.6141304347826086, + "grad_norm": 5.398176800269643, + "learning_rate": 3.422583817843041e-07, + "loss": 0.2813, + "step": 3390 + }, + { + "epoch": 0.6143115942028986, + "grad_norm": 4.081504817136662, + "learning_rate": 3.41980004479804e-07, + "loss": 0.3089, + "step": 3391 + }, + { + "epoch": 0.6144927536231884, + "grad_norm": 5.69908946404749, + "learning_rate": 3.417016815823369e-07, + "loss": 0.2864, + "step": 3392 + }, + { + "epoch": 0.6146739130434783, + "grad_norm": 3.7981437707990136, + "learning_rate": 3.414234131877309e-07, + "loss": 0.3279, + "step": 3393 + }, + { + "epoch": 0.6148550724637681, + "grad_norm": 4.090170855014332, + "learning_rate": 3.411451993917951e-07, + "loss": 0.3269, + "step": 3394 + }, + { + "epoch": 0.615036231884058, + "grad_norm": 7.195674493614941, + "learning_rate": 3.408670402903198e-07, + "loss": 0.2845, + "step": 3395 + }, + { + "epoch": 0.6152173913043478, + "grad_norm": 3.713785620643085, + "learning_rate": 3.4058893597907655e-07, + "loss": 0.2922, + "step": 3396 + }, + { + "epoch": 0.6153985507246377, + "grad_norm": 5.029608827363032, + "learning_rate": 3.4031088655381803e-07, + "loss": 0.2916, + "step": 3397 + }, + { + "epoch": 0.6155797101449275, + "grad_norm": 3.9481893696522627, + "learning_rate": 3.400328921102776e-07, + "loss": 0.2956, + "step": 3398 + }, + { + "epoch": 0.6157608695652174, + "grad_norm": 3.926948761101867, + "learning_rate": 3.3975495274417056e-07, + "loss": 0.2933, + "step": 3399 + }, + { + "epoch": 0.6159420289855072, + "grad_norm": 3.448641775407215, + "learning_rate": 3.3947706855119263e-07, + "loss": 0.2419, + "step": 3400 + }, + { + "epoch": 0.6159420289855072, + "eval_loss": 0.2915000021457672, + "eval_runtime": 9.7516, + "eval_samples_per_second": 51.274, + "eval_steps_per_second": 0.103, + "step": 3400 + }, + { + "epoch": 0.616123188405797, + "grad_norm": 5.416204808194034, + "learning_rate": 3.391992396270205e-07, + "loss": 0.3008, + "step": 3401 + }, + { + "epoch": 0.616304347826087, + "grad_norm": 3.879247725695068, + "learning_rate": 3.3892146606731195e-07, + "loss": 0.2998, + "step": 3402 + }, + { + "epoch": 0.6164855072463769, + "grad_norm": 4.792584029000788, + "learning_rate": 3.386437479677059e-07, + "loss": 0.2835, + "step": 3403 + }, + { + "epoch": 0.6166666666666667, + "grad_norm": 10.50501637472616, + "learning_rate": 3.3836608542382206e-07, + "loss": 0.2781, + "step": 3404 + }, + { + "epoch": 0.6168478260869565, + "grad_norm": 4.774879211806365, + "learning_rate": 3.380884785312608e-07, + "loss": 0.3511, + "step": 3405 + }, + { + "epoch": 0.6170289855072464, + "grad_norm": 4.770318371124587, + "learning_rate": 3.3781092738560334e-07, + "loss": 0.3533, + "step": 3406 + }, + { + "epoch": 0.6172101449275362, + "grad_norm": 5.767018152305727, + "learning_rate": 3.375334320824122e-07, + "loss": 0.227, + "step": 3407 + }, + { + "epoch": 0.6173913043478261, + "grad_norm": 4.197197355550296, + "learning_rate": 3.3725599271723024e-07, + "loss": 0.2906, + "step": 3408 + }, + { + "epoch": 0.6175724637681159, + "grad_norm": 4.090042387105566, + "learning_rate": 3.3697860938558107e-07, + "loss": 0.2596, + "step": 3409 + }, + { + "epoch": 0.6177536231884058, + "grad_norm": 6.108399253355486, + "learning_rate": 3.36701282182969e-07, + "loss": 0.312, + "step": 3410 + }, + { + "epoch": 0.6179347826086956, + "grad_norm": 3.949792855294822, + "learning_rate": 3.3642401120487925e-07, + "loss": 0.2729, + "step": 3411 + }, + { + "epoch": 0.6181159420289855, + "grad_norm": 4.068161218770359, + "learning_rate": 3.361467965467775e-07, + "loss": 0.2581, + "step": 3412 + }, + { + "epoch": 0.6182971014492754, + "grad_norm": 7.26254582304241, + "learning_rate": 3.3586963830411004e-07, + "loss": 0.2687, + "step": 3413 + }, + { + "epoch": 0.6184782608695653, + "grad_norm": 7.939674765239618, + "learning_rate": 3.355925365723037e-07, + "loss": 0.2827, + "step": 3414 + }, + { + "epoch": 0.6186594202898551, + "grad_norm": 4.768702238108111, + "learning_rate": 3.3531549144676606e-07, + "loss": 0.3147, + "step": 3415 + }, + { + "epoch": 0.618840579710145, + "grad_norm": 5.081040663047155, + "learning_rate": 3.3503850302288517e-07, + "loss": 0.2643, + "step": 3416 + }, + { + "epoch": 0.6190217391304348, + "grad_norm": 5.307761754284609, + "learning_rate": 3.347615713960289e-07, + "loss": 0.3767, + "step": 3417 + }, + { + "epoch": 0.6192028985507246, + "grad_norm": 5.7265332134745, + "learning_rate": 3.3448469666154687e-07, + "loss": 0.2711, + "step": 3418 + }, + { + "epoch": 0.6193840579710145, + "grad_norm": 5.98983013767756, + "learning_rate": 3.3420787891476785e-07, + "loss": 0.306, + "step": 3419 + }, + { + "epoch": 0.6195652173913043, + "grad_norm": 10.665162947850796, + "learning_rate": 3.3393111825100176e-07, + "loss": 0.2927, + "step": 3420 + }, + { + "epoch": 0.6197463768115942, + "grad_norm": 10.369888656993997, + "learning_rate": 3.3365441476553837e-07, + "loss": 0.2486, + "step": 3421 + }, + { + "epoch": 0.619927536231884, + "grad_norm": 5.319507481340947, + "learning_rate": 3.333777685536482e-07, + "loss": 0.319, + "step": 3422 + }, + { + "epoch": 0.6201086956521739, + "grad_norm": 5.341075825958292, + "learning_rate": 3.3310117971058184e-07, + "loss": 0.2625, + "step": 3423 + }, + { + "epoch": 0.6202898550724638, + "grad_norm": 5.119010067014903, + "learning_rate": 3.3282464833157016e-07, + "loss": 0.3404, + "step": 3424 + }, + { + "epoch": 0.6204710144927537, + "grad_norm": 5.518815813722564, + "learning_rate": 3.3254817451182383e-07, + "loss": 0.2431, + "step": 3425 + }, + { + "epoch": 0.6206521739130435, + "grad_norm": 3.8941872483903084, + "learning_rate": 3.3227175834653475e-07, + "loss": 0.2717, + "step": 3426 + }, + { + "epoch": 0.6208333333333333, + "grad_norm": 5.739700907654499, + "learning_rate": 3.319953999308739e-07, + "loss": 0.3448, + "step": 3427 + }, + { + "epoch": 0.6210144927536232, + "grad_norm": 4.1154490717111045, + "learning_rate": 3.317190993599929e-07, + "loss": 0.2578, + "step": 3428 + }, + { + "epoch": 0.621195652173913, + "grad_norm": 7.147933098720609, + "learning_rate": 3.3144285672902314e-07, + "loss": 0.3057, + "step": 3429 + }, + { + "epoch": 0.6213768115942029, + "grad_norm": 5.835403803071247, + "learning_rate": 3.3116667213307657e-07, + "loss": 0.3485, + "step": 3430 + }, + { + "epoch": 0.6215579710144927, + "grad_norm": 4.378248901854491, + "learning_rate": 3.3089054566724474e-07, + "loss": 0.2889, + "step": 3431 + }, + { + "epoch": 0.6217391304347826, + "grad_norm": 3.45538849605132, + "learning_rate": 3.306144774265994e-07, + "loss": 0.2964, + "step": 3432 + }, + { + "epoch": 0.6219202898550724, + "grad_norm": 4.898633954388818, + "learning_rate": 3.303384675061918e-07, + "loss": 0.2666, + "step": 3433 + }, + { + "epoch": 0.6221014492753624, + "grad_norm": 4.157915349439017, + "learning_rate": 3.300625160010538e-07, + "loss": 0.3556, + "step": 3434 + }, + { + "epoch": 0.6222826086956522, + "grad_norm": 4.15883329519004, + "learning_rate": 3.297866230061969e-07, + "loss": 0.3005, + "step": 3435 + }, + { + "epoch": 0.6224637681159421, + "grad_norm": 4.612893971422, + "learning_rate": 3.295107886166121e-07, + "loss": 0.2969, + "step": 3436 + }, + { + "epoch": 0.6226449275362319, + "grad_norm": 6.407804537293918, + "learning_rate": 3.292350129272704e-07, + "loss": 0.2571, + "step": 3437 + }, + { + "epoch": 0.6228260869565218, + "grad_norm": 11.807591402410461, + "learning_rate": 3.28959296033123e-07, + "loss": 0.2889, + "step": 3438 + }, + { + "epoch": 0.6230072463768116, + "grad_norm": 6.707847759024104, + "learning_rate": 3.2868363802910036e-07, + "loss": 0.2947, + "step": 3439 + }, + { + "epoch": 0.6231884057971014, + "grad_norm": 5.553636979851655, + "learning_rate": 3.2840803901011293e-07, + "loss": 0.3348, + "step": 3440 + }, + { + "epoch": 0.6233695652173913, + "grad_norm": 5.267889379745383, + "learning_rate": 3.281324990710506e-07, + "loss": 0.268, + "step": 3441 + }, + { + "epoch": 0.6235507246376811, + "grad_norm": 3.4546733257721374, + "learning_rate": 3.2785701830678317e-07, + "loss": 0.2639, + "step": 3442 + }, + { + "epoch": 0.623731884057971, + "grad_norm": 4.319401349102515, + "learning_rate": 3.2758159681216006e-07, + "loss": 0.2735, + "step": 3443 + }, + { + "epoch": 0.6239130434782608, + "grad_norm": 6.245917005495754, + "learning_rate": 3.2730623468201005e-07, + "loss": 0.2809, + "step": 3444 + }, + { + "epoch": 0.6240942028985508, + "grad_norm": 7.218059252158506, + "learning_rate": 3.2703093201114164e-07, + "loss": 0.3095, + "step": 3445 + }, + { + "epoch": 0.6242753623188406, + "grad_norm": 6.223554817332476, + "learning_rate": 3.2675568889434283e-07, + "loss": 0.3175, + "step": 3446 + }, + { + "epoch": 0.6244565217391305, + "grad_norm": 5.208981718445207, + "learning_rate": 3.2648050542638127e-07, + "loss": 0.3013, + "step": 3447 + }, + { + "epoch": 0.6246376811594203, + "grad_norm": 3.643399611839414, + "learning_rate": 3.2620538170200384e-07, + "loss": 0.3114, + "step": 3448 + }, + { + "epoch": 0.6248188405797102, + "grad_norm": 9.026695166408295, + "learning_rate": 3.259303178159369e-07, + "loss": 0.2425, + "step": 3449 + }, + { + "epoch": 0.625, + "grad_norm": 3.8624549332886042, + "learning_rate": 3.2565531386288634e-07, + "loss": 0.3045, + "step": 3450 + }, + { + "epoch": 0.6251811594202898, + "grad_norm": 3.357615249587839, + "learning_rate": 3.253803699375374e-07, + "loss": 0.2858, + "step": 3451 + }, + { + "epoch": 0.6253623188405797, + "grad_norm": 4.7049179864879775, + "learning_rate": 3.251054861345541e-07, + "loss": 0.35, + "step": 3452 + }, + { + "epoch": 0.6255434782608695, + "grad_norm": 5.497095510356356, + "learning_rate": 3.2483066254858094e-07, + "loss": 0.343, + "step": 3453 + }, + { + "epoch": 0.6257246376811594, + "grad_norm": 5.123134396124013, + "learning_rate": 3.2455589927424056e-07, + "loss": 0.2871, + "step": 3454 + }, + { + "epoch": 0.6259057971014492, + "grad_norm": 4.461689299945907, + "learning_rate": 3.242811964061353e-07, + "loss": 0.2745, + "step": 3455 + }, + { + "epoch": 0.6260869565217392, + "grad_norm": 7.422599470726544, + "learning_rate": 3.2400655403884657e-07, + "loss": 0.2746, + "step": 3456 + }, + { + "epoch": 0.626268115942029, + "grad_norm": 10.878100702500891, + "learning_rate": 3.237319722669353e-07, + "loss": 0.2689, + "step": 3457 + }, + { + "epoch": 0.6264492753623189, + "grad_norm": 5.8444935654169905, + "learning_rate": 3.2345745118494105e-07, + "loss": 0.2494, + "step": 3458 + }, + { + "epoch": 0.6266304347826087, + "grad_norm": 3.3332809025468886, + "learning_rate": 3.2318299088738306e-07, + "loss": 0.249, + "step": 3459 + }, + { + "epoch": 0.6268115942028986, + "grad_norm": 11.392629740906484, + "learning_rate": 3.229085914687587e-07, + "loss": 0.281, + "step": 3460 + }, + { + "epoch": 0.6269927536231884, + "grad_norm": 4.492616175317283, + "learning_rate": 3.2263425302354576e-07, + "loss": 0.3051, + "step": 3461 + }, + { + "epoch": 0.6271739130434782, + "grad_norm": 8.144427767289047, + "learning_rate": 3.223599756461997e-07, + "loss": 0.3134, + "step": 3462 + }, + { + "epoch": 0.6273550724637681, + "grad_norm": 5.289769844162745, + "learning_rate": 3.2208575943115577e-07, + "loss": 0.2764, + "step": 3463 + }, + { + "epoch": 0.6275362318840579, + "grad_norm": 5.204015174587078, + "learning_rate": 3.218116044728277e-07, + "loss": 0.3087, + "step": 3464 + }, + { + "epoch": 0.6277173913043478, + "grad_norm": 5.366672131118816, + "learning_rate": 3.2153751086560856e-07, + "loss": 0.2218, + "step": 3465 + }, + { + "epoch": 0.6278985507246376, + "grad_norm": 6.319604205684525, + "learning_rate": 3.2126347870387006e-07, + "loss": 0.2805, + "step": 3466 + }, + { + "epoch": 0.6280797101449276, + "grad_norm": 5.190052431153651, + "learning_rate": 3.209895080819628e-07, + "loss": 0.3307, + "step": 3467 + }, + { + "epoch": 0.6282608695652174, + "grad_norm": 5.973216655062214, + "learning_rate": 3.2071559909421574e-07, + "loss": 0.2201, + "step": 3468 + }, + { + "epoch": 0.6284420289855073, + "grad_norm": 4.256350548736856, + "learning_rate": 3.204417518349376e-07, + "loss": 0.2855, + "step": 3469 + }, + { + "epoch": 0.6286231884057971, + "grad_norm": 4.68663346579806, + "learning_rate": 3.2016796639841515e-07, + "loss": 0.3165, + "step": 3470 + }, + { + "epoch": 0.628804347826087, + "grad_norm": 6.720386983039846, + "learning_rate": 3.1989424287891386e-07, + "loss": 0.3184, + "step": 3471 + }, + { + "epoch": 0.6289855072463768, + "grad_norm": 4.595726189265466, + "learning_rate": 3.19620581370678e-07, + "loss": 0.2801, + "step": 3472 + }, + { + "epoch": 0.6291666666666667, + "grad_norm": 6.091865827108255, + "learning_rate": 3.1934698196793077e-07, + "loss": 0.2778, + "step": 3473 + }, + { + "epoch": 0.6293478260869565, + "grad_norm": 3.576356064987536, + "learning_rate": 3.190734447648735e-07, + "loss": 0.2841, + "step": 3474 + }, + { + "epoch": 0.6295289855072463, + "grad_norm": 3.394416153346299, + "learning_rate": 3.187999698556865e-07, + "loss": 0.2241, + "step": 3475 + }, + { + "epoch": 0.6297101449275362, + "grad_norm": 6.186491700325118, + "learning_rate": 3.185265573345284e-07, + "loss": 0.3525, + "step": 3476 + }, + { + "epoch": 0.6298913043478261, + "grad_norm": 5.144816350253735, + "learning_rate": 3.182532072955364e-07, + "loss": 0.262, + "step": 3477 + }, + { + "epoch": 0.630072463768116, + "grad_norm": 3.876315392290179, + "learning_rate": 3.179799198328265e-07, + "loss": 0.3162, + "step": 3478 + }, + { + "epoch": 0.6302536231884058, + "grad_norm": 3.1545542978646237, + "learning_rate": 3.177066950404924e-07, + "loss": 0.239, + "step": 3479 + }, + { + "epoch": 0.6304347826086957, + "grad_norm": 3.8527715428261757, + "learning_rate": 3.174335330126069e-07, + "loss": 0.3144, + "step": 3480 + }, + { + "epoch": 0.6306159420289855, + "grad_norm": 10.807349738611567, + "learning_rate": 3.171604338432211e-07, + "loss": 0.2858, + "step": 3481 + }, + { + "epoch": 0.6307971014492754, + "grad_norm": 3.90341742411574, + "learning_rate": 3.1688739762636425e-07, + "loss": 0.2615, + "step": 3482 + }, + { + "epoch": 0.6309782608695652, + "grad_norm": 5.956069805012847, + "learning_rate": 3.1661442445604395e-07, + "loss": 0.3372, + "step": 3483 + }, + { + "epoch": 0.631159420289855, + "grad_norm": 7.589601830606784, + "learning_rate": 3.163415144262461e-07, + "loss": 0.2803, + "step": 3484 + }, + { + "epoch": 0.6313405797101449, + "grad_norm": 6.597118343210361, + "learning_rate": 3.160686676309352e-07, + "loss": 0.3384, + "step": 3485 + }, + { + "epoch": 0.6315217391304347, + "grad_norm": 6.012869394286747, + "learning_rate": 3.157958841640536e-07, + "loss": 0.3165, + "step": 3486 + }, + { + "epoch": 0.6317028985507246, + "grad_norm": 6.555759175967989, + "learning_rate": 3.1552316411952154e-07, + "loss": 0.3617, + "step": 3487 + }, + { + "epoch": 0.6318840579710145, + "grad_norm": 5.300184139611348, + "learning_rate": 3.1525050759123843e-07, + "loss": 0.2909, + "step": 3488 + }, + { + "epoch": 0.6320652173913044, + "grad_norm": 4.005959374094459, + "learning_rate": 3.149779146730809e-07, + "loss": 0.299, + "step": 3489 + }, + { + "epoch": 0.6322463768115942, + "grad_norm": 5.95346829250653, + "learning_rate": 3.147053854589039e-07, + "loss": 0.2699, + "step": 3490 + }, + { + "epoch": 0.6324275362318841, + "grad_norm": 4.272464390805704, + "learning_rate": 3.144329200425406e-07, + "loss": 0.3195, + "step": 3491 + }, + { + "epoch": 0.6326086956521739, + "grad_norm": 9.057414868259817, + "learning_rate": 3.1416051851780223e-07, + "loss": 0.286, + "step": 3492 + }, + { + "epoch": 0.6327898550724638, + "grad_norm": 8.477771706292318, + "learning_rate": 3.138881809784778e-07, + "loss": 0.2807, + "step": 3493 + }, + { + "epoch": 0.6329710144927536, + "grad_norm": 3.7911143503502793, + "learning_rate": 3.1361590751833467e-07, + "loss": 0.3135, + "step": 3494 + }, + { + "epoch": 0.6331521739130435, + "grad_norm": 5.681542121834925, + "learning_rate": 3.133436982311174e-07, + "loss": 0.2736, + "step": 3495 + }, + { + "epoch": 0.6333333333333333, + "grad_norm": 9.349125398278641, + "learning_rate": 3.1307155321054947e-07, + "loss": 0.3322, + "step": 3496 + }, + { + "epoch": 0.6335144927536231, + "grad_norm": 7.240340442453464, + "learning_rate": 3.127994725503313e-07, + "loss": 0.2888, + "step": 3497 + }, + { + "epoch": 0.633695652173913, + "grad_norm": 3.9439792548592645, + "learning_rate": 3.125274563441418e-07, + "loss": 0.3116, + "step": 3498 + }, + { + "epoch": 0.633876811594203, + "grad_norm": 6.244013027880077, + "learning_rate": 3.1225550468563724e-07, + "loss": 0.2688, + "step": 3499 + }, + { + "epoch": 0.6340579710144928, + "grad_norm": 5.060186724217018, + "learning_rate": 3.1198361766845205e-07, + "loss": 0.2377, + "step": 3500 + }, + { + "epoch": 0.6340579710144928, + "eval_loss": 0.2820156216621399, + "eval_runtime": 9.7492, + "eval_samples_per_second": 51.286, + "eval_steps_per_second": 0.103, + "step": 3500 + }, + { + "epoch": 0.6342391304347826, + "grad_norm": 6.2382579177319295, + "learning_rate": 3.117117953861981e-07, + "loss": 0.3268, + "step": 3501 + }, + { + "epoch": 0.6344202898550725, + "grad_norm": 8.029668359470788, + "learning_rate": 3.1144003793246524e-07, + "loss": 0.3033, + "step": 3502 + }, + { + "epoch": 0.6346014492753623, + "grad_norm": 5.564703030577194, + "learning_rate": 3.1116834540082047e-07, + "loss": 0.2782, + "step": 3503 + }, + { + "epoch": 0.6347826086956522, + "grad_norm": 4.421979813804192, + "learning_rate": 3.108967178848093e-07, + "loss": 0.3212, + "step": 3504 + }, + { + "epoch": 0.634963768115942, + "grad_norm": 6.855121681408441, + "learning_rate": 3.106251554779542e-07, + "loss": 0.3108, + "step": 3505 + }, + { + "epoch": 0.6351449275362319, + "grad_norm": 4.696215758109863, + "learning_rate": 3.103536582737553e-07, + "loss": 0.3031, + "step": 3506 + }, + { + "epoch": 0.6353260869565217, + "grad_norm": 3.549072914840705, + "learning_rate": 3.1008222636569023e-07, + "loss": 0.2772, + "step": 3507 + }, + { + "epoch": 0.6355072463768116, + "grad_norm": 3.7165036252945227, + "learning_rate": 3.098108598472147e-07, + "loss": 0.2446, + "step": 3508 + }, + { + "epoch": 0.6356884057971014, + "grad_norm": 8.03291885166229, + "learning_rate": 3.0953955881176116e-07, + "loss": 0.2819, + "step": 3509 + }, + { + "epoch": 0.6358695652173914, + "grad_norm": 5.192305453981132, + "learning_rate": 3.0926832335273996e-07, + "loss": 0.2822, + "step": 3510 + }, + { + "epoch": 0.6360507246376812, + "grad_norm": 4.5139104730937305, + "learning_rate": 3.089971535635386e-07, + "loss": 0.3029, + "step": 3511 + }, + { + "epoch": 0.636231884057971, + "grad_norm": 5.713649470606083, + "learning_rate": 3.087260495375224e-07, + "loss": 0.3518, + "step": 3512 + }, + { + "epoch": 0.6364130434782609, + "grad_norm": 8.90754021622772, + "learning_rate": 3.0845501136803376e-07, + "loss": 0.3116, + "step": 3513 + }, + { + "epoch": 0.6365942028985507, + "grad_norm": 5.7333201268731155, + "learning_rate": 3.0818403914839206e-07, + "loss": 0.2771, + "step": 3514 + }, + { + "epoch": 0.6367753623188406, + "grad_norm": 5.417018044112498, + "learning_rate": 3.0791313297189454e-07, + "loss": 0.3217, + "step": 3515 + }, + { + "epoch": 0.6369565217391304, + "grad_norm": 6.228573519493017, + "learning_rate": 3.076422929318155e-07, + "loss": 0.2775, + "step": 3516 + }, + { + "epoch": 0.6371376811594203, + "grad_norm": 3.797721447614127, + "learning_rate": 3.073715191214065e-07, + "loss": 0.2864, + "step": 3517 + }, + { + "epoch": 0.6373188405797101, + "grad_norm": 4.1832251869510175, + "learning_rate": 3.07100811633896e-07, + "loss": 0.291, + "step": 3518 + }, + { + "epoch": 0.6375, + "grad_norm": 5.4771009462827696, + "learning_rate": 3.0683017056249005e-07, + "loss": 0.2808, + "step": 3519 + }, + { + "epoch": 0.6376811594202898, + "grad_norm": 4.604587667290546, + "learning_rate": 3.0655959600037167e-07, + "loss": 0.3158, + "step": 3520 + }, + { + "epoch": 0.6378623188405798, + "grad_norm": 7.029354005414273, + "learning_rate": 3.062890880407011e-07, + "loss": 0.3228, + "step": 3521 + }, + { + "epoch": 0.6380434782608696, + "grad_norm": 6.918926416341621, + "learning_rate": 3.060186467766149e-07, + "loss": 0.2932, + "step": 3522 + }, + { + "epoch": 0.6382246376811594, + "grad_norm": 3.674184108477483, + "learning_rate": 3.057482723012282e-07, + "loss": 0.2091, + "step": 3523 + }, + { + "epoch": 0.6384057971014493, + "grad_norm": 3.2998305727648702, + "learning_rate": 3.054779647076315e-07, + "loss": 0.2574, + "step": 3524 + }, + { + "epoch": 0.6385869565217391, + "grad_norm": 6.642957874434674, + "learning_rate": 3.0520772408889327e-07, + "loss": 0.302, + "step": 3525 + }, + { + "epoch": 0.638768115942029, + "grad_norm": 3.380285563687606, + "learning_rate": 3.049375505380585e-07, + "loss": 0.2422, + "step": 3526 + }, + { + "epoch": 0.6389492753623188, + "grad_norm": 4.673713098123802, + "learning_rate": 3.046674441481494e-07, + "loss": 0.306, + "step": 3527 + }, + { + "epoch": 0.6391304347826087, + "grad_norm": 3.9190718489309124, + "learning_rate": 3.043974050121647e-07, + "loss": 0.344, + "step": 3528 + }, + { + "epoch": 0.6393115942028985, + "grad_norm": 11.003515385389463, + "learning_rate": 3.0412743322308044e-07, + "loss": 0.3078, + "step": 3529 + }, + { + "epoch": 0.6394927536231884, + "grad_norm": 4.157619027974557, + "learning_rate": 3.0385752887384864e-07, + "loss": 0.2899, + "step": 3530 + }, + { + "epoch": 0.6396739130434783, + "grad_norm": 6.931307811011895, + "learning_rate": 3.0358769205739927e-07, + "loss": 0.3147, + "step": 3531 + }, + { + "epoch": 0.6398550724637682, + "grad_norm": 4.0321045784591645, + "learning_rate": 3.03317922866638e-07, + "loss": 0.2841, + "step": 3532 + }, + { + "epoch": 0.640036231884058, + "grad_norm": 5.4472268314775665, + "learning_rate": 3.0304822139444795e-07, + "loss": 0.3238, + "step": 3533 + }, + { + "epoch": 0.6402173913043478, + "grad_norm": 6.201566239732925, + "learning_rate": 3.0277858773368826e-07, + "loss": 0.319, + "step": 3534 + }, + { + "epoch": 0.6403985507246377, + "grad_norm": 6.427192834854141, + "learning_rate": 3.025090219771953e-07, + "loss": 0.3573, + "step": 3535 + }, + { + "epoch": 0.6405797101449275, + "grad_norm": 4.734874658914441, + "learning_rate": 3.0223952421778186e-07, + "loss": 0.325, + "step": 3536 + }, + { + "epoch": 0.6407608695652174, + "grad_norm": 3.9676697086942867, + "learning_rate": 3.019700945482374e-07, + "loss": 0.2262, + "step": 3537 + }, + { + "epoch": 0.6409420289855072, + "grad_norm": 4.37865260270185, + "learning_rate": 3.017007330613273e-07, + "loss": 0.2751, + "step": 3538 + }, + { + "epoch": 0.6411231884057971, + "grad_norm": 4.287482247035775, + "learning_rate": 3.0143143984979464e-07, + "loss": 0.261, + "step": 3539 + }, + { + "epoch": 0.6413043478260869, + "grad_norm": 4.486192162549001, + "learning_rate": 3.0116221500635806e-07, + "loss": 0.2871, + "step": 3540 + }, + { + "epoch": 0.6414855072463768, + "grad_norm": 7.8293707971430155, + "learning_rate": 3.0089305862371294e-07, + "loss": 0.308, + "step": 3541 + }, + { + "epoch": 0.6416666666666667, + "grad_norm": 8.106392472119104, + "learning_rate": 3.0062397079453104e-07, + "loss": 0.2628, + "step": 3542 + }, + { + "epoch": 0.6418478260869566, + "grad_norm": 4.069627385869626, + "learning_rate": 3.003549516114607e-07, + "loss": 0.2195, + "step": 3543 + }, + { + "epoch": 0.6420289855072464, + "grad_norm": 4.6315966071872845, + "learning_rate": 3.0008600116712657e-07, + "loss": 0.2597, + "step": 3544 + }, + { + "epoch": 0.6422101449275363, + "grad_norm": 8.120672814882225, + "learning_rate": 2.9981711955412935e-07, + "loss": 0.3131, + "step": 3545 + }, + { + "epoch": 0.6423913043478261, + "grad_norm": 8.400669278395796, + "learning_rate": 2.9954830686504627e-07, + "loss": 0.3154, + "step": 3546 + }, + { + "epoch": 0.6425724637681159, + "grad_norm": 9.726823509023543, + "learning_rate": 2.9927956319243103e-07, + "loss": 0.2377, + "step": 3547 + }, + { + "epoch": 0.6427536231884058, + "grad_norm": 5.26665970832048, + "learning_rate": 2.990108886288133e-07, + "loss": 0.35, + "step": 3548 + }, + { + "epoch": 0.6429347826086956, + "grad_norm": 7.755263541017699, + "learning_rate": 2.987422832666988e-07, + "loss": 0.3204, + "step": 3549 + }, + { + "epoch": 0.6431159420289855, + "grad_norm": 7.935360862257978, + "learning_rate": 2.9847374719856965e-07, + "loss": 0.3047, + "step": 3550 + }, + { + "epoch": 0.6432971014492753, + "grad_norm": 4.980652920524264, + "learning_rate": 2.9820528051688433e-07, + "loss": 0.2729, + "step": 3551 + }, + { + "epoch": 0.6434782608695652, + "grad_norm": 4.958848830089449, + "learning_rate": 2.9793688331407707e-07, + "loss": 0.3193, + "step": 3552 + }, + { + "epoch": 0.6436594202898551, + "grad_norm": 6.451077083156074, + "learning_rate": 2.976685556825582e-07, + "loss": 0.2958, + "step": 3553 + }, + { + "epoch": 0.643840579710145, + "grad_norm": 4.456700058175346, + "learning_rate": 2.974002977147142e-07, + "loss": 0.3391, + "step": 3554 + }, + { + "epoch": 0.6440217391304348, + "grad_norm": 4.717785655282856, + "learning_rate": 2.971321095029078e-07, + "loss": 0.3224, + "step": 3555 + }, + { + "epoch": 0.6442028985507247, + "grad_norm": 6.604709962169939, + "learning_rate": 2.9686399113947737e-07, + "loss": 0.3217, + "step": 3556 + }, + { + "epoch": 0.6443840579710145, + "grad_norm": 4.142883397303934, + "learning_rate": 2.9659594271673693e-07, + "loss": 0.2995, + "step": 3557 + }, + { + "epoch": 0.6445652173913043, + "grad_norm": 4.520706467154874, + "learning_rate": 2.9632796432697746e-07, + "loss": 0.2965, + "step": 3558 + }, + { + "epoch": 0.6447463768115942, + "grad_norm": 6.921158413824603, + "learning_rate": 2.960600560624648e-07, + "loss": 0.2851, + "step": 3559 + }, + { + "epoch": 0.644927536231884, + "grad_norm": 5.663681867486989, + "learning_rate": 2.957922180154412e-07, + "loss": 0.3085, + "step": 3560 + }, + { + "epoch": 0.6451086956521739, + "grad_norm": 6.446251091437465, + "learning_rate": 2.955244502781243e-07, + "loss": 0.2506, + "step": 3561 + }, + { + "epoch": 0.6452898550724637, + "grad_norm": 4.661010702875182, + "learning_rate": 2.952567529427081e-07, + "loss": 0.2925, + "step": 3562 + }, + { + "epoch": 0.6454710144927536, + "grad_norm": 3.5723092201246724, + "learning_rate": 2.9498912610136203e-07, + "loss": 0.2347, + "step": 3563 + }, + { + "epoch": 0.6456521739130435, + "grad_norm": 4.453548900575511, + "learning_rate": 2.9472156984623124e-07, + "loss": 0.3224, + "step": 3564 + }, + { + "epoch": 0.6458333333333334, + "grad_norm": 7.88515240431393, + "learning_rate": 2.944540842694363e-07, + "loss": 0.2769, + "step": 3565 + }, + { + "epoch": 0.6460144927536232, + "grad_norm": 6.2753908255033295, + "learning_rate": 2.9418666946307434e-07, + "loss": 0.2767, + "step": 3566 + }, + { + "epoch": 0.6461956521739131, + "grad_norm": 4.967052897771148, + "learning_rate": 2.939193255192172e-07, + "loss": 0.2951, + "step": 3567 + }, + { + "epoch": 0.6463768115942029, + "grad_norm": 6.29695871870701, + "learning_rate": 2.9365205252991267e-07, + "loss": 0.3206, + "step": 3568 + }, + { + "epoch": 0.6465579710144927, + "grad_norm": 5.461688541890273, + "learning_rate": 2.93384850587184e-07, + "loss": 0.2759, + "step": 3569 + }, + { + "epoch": 0.6467391304347826, + "grad_norm": 5.176528427497427, + "learning_rate": 2.9311771978303035e-07, + "loss": 0.3406, + "step": 3570 + }, + { + "epoch": 0.6469202898550724, + "grad_norm": 4.850940843561452, + "learning_rate": 2.928506602094261e-07, + "loss": 0.2776, + "step": 3571 + }, + { + "epoch": 0.6471014492753623, + "grad_norm": 4.781029397025524, + "learning_rate": 2.925836719583211e-07, + "loss": 0.2465, + "step": 3572 + }, + { + "epoch": 0.6472826086956521, + "grad_norm": 6.414965132211462, + "learning_rate": 2.923167551216402e-07, + "loss": 0.3738, + "step": 3573 + }, + { + "epoch": 0.6474637681159421, + "grad_norm": 3.314440794934911, + "learning_rate": 2.9204990979128485e-07, + "loss": 0.285, + "step": 3574 + }, + { + "epoch": 0.6476449275362319, + "grad_norm": 4.0654746745282475, + "learning_rate": 2.917831360591309e-07, + "loss": 0.2924, + "step": 3575 + }, + { + "epoch": 0.6478260869565218, + "grad_norm": 4.648658864332499, + "learning_rate": 2.915164340170297e-07, + "loss": 0.2477, + "step": 3576 + }, + { + "epoch": 0.6480072463768116, + "grad_norm": 3.813850364481976, + "learning_rate": 2.9124980375680784e-07, + "loss": 0.2574, + "step": 3577 + }, + { + "epoch": 0.6481884057971015, + "grad_norm": 7.338703093723526, + "learning_rate": 2.9098324537026785e-07, + "loss": 0.2906, + "step": 3578 + }, + { + "epoch": 0.6483695652173913, + "grad_norm": 7.98600963714465, + "learning_rate": 2.90716758949187e-07, + "loss": 0.2627, + "step": 3579 + }, + { + "epoch": 0.6485507246376812, + "grad_norm": 6.683950177557102, + "learning_rate": 2.904503445853175e-07, + "loss": 0.3596, + "step": 3580 + }, + { + "epoch": 0.648731884057971, + "grad_norm": 4.624608572412256, + "learning_rate": 2.9018400237038695e-07, + "loss": 0.2997, + "step": 3581 + }, + { + "epoch": 0.6489130434782608, + "grad_norm": 4.100339115439525, + "learning_rate": 2.8991773239609873e-07, + "loss": 0.3193, + "step": 3582 + }, + { + "epoch": 0.6490942028985507, + "grad_norm": 5.473573855639002, + "learning_rate": 2.8965153475413065e-07, + "loss": 0.3152, + "step": 3583 + }, + { + "epoch": 0.6492753623188405, + "grad_norm": 5.123033095878148, + "learning_rate": 2.8938540953613575e-07, + "loss": 0.3493, + "step": 3584 + }, + { + "epoch": 0.6494565217391305, + "grad_norm": 4.6530707252908385, + "learning_rate": 2.891193568337424e-07, + "loss": 0.3193, + "step": 3585 + }, + { + "epoch": 0.6496376811594203, + "grad_norm": 3.804542653115263, + "learning_rate": 2.888533767385536e-07, + "loss": 0.2352, + "step": 3586 + }, + { + "epoch": 0.6498188405797102, + "grad_norm": 5.914939404188481, + "learning_rate": 2.8858746934214757e-07, + "loss": 0.3182, + "step": 3587 + }, + { + "epoch": 0.65, + "grad_norm": 9.73199331068314, + "learning_rate": 2.883216347360776e-07, + "loss": 0.3198, + "step": 3588 + }, + { + "epoch": 0.6501811594202899, + "grad_norm": 4.132394173945082, + "learning_rate": 2.880558730118717e-07, + "loss": 0.3091, + "step": 3589 + }, + { + "epoch": 0.6503623188405797, + "grad_norm": 3.989013391867055, + "learning_rate": 2.8779018426103294e-07, + "loss": 0.2842, + "step": 3590 + }, + { + "epoch": 0.6505434782608696, + "grad_norm": 5.056871858922117, + "learning_rate": 2.8752456857503926e-07, + "loss": 0.2593, + "step": 3591 + }, + { + "epoch": 0.6507246376811594, + "grad_norm": 4.065174857509861, + "learning_rate": 2.8725902604534327e-07, + "loss": 0.3019, + "step": 3592 + }, + { + "epoch": 0.6509057971014492, + "grad_norm": 7.30139705704604, + "learning_rate": 2.8699355676337244e-07, + "loss": 0.3196, + "step": 3593 + }, + { + "epoch": 0.6510869565217391, + "grad_norm": 5.435898779179223, + "learning_rate": 2.8672816082052947e-07, + "loss": 0.2925, + "step": 3594 + }, + { + "epoch": 0.6512681159420289, + "grad_norm": 5.442550149053178, + "learning_rate": 2.8646283830819147e-07, + "loss": 0.2471, + "step": 3595 + }, + { + "epoch": 0.6514492753623189, + "grad_norm": 4.723932616589107, + "learning_rate": 2.8619758931770956e-07, + "loss": 0.2673, + "step": 3596 + }, + { + "epoch": 0.6516304347826087, + "grad_norm": 4.394625475998589, + "learning_rate": 2.8593241394041085e-07, + "loss": 0.3923, + "step": 3597 + }, + { + "epoch": 0.6518115942028986, + "grad_norm": 3.718928857222592, + "learning_rate": 2.856673122675963e-07, + "loss": 0.3085, + "step": 3598 + }, + { + "epoch": 0.6519927536231884, + "grad_norm": 9.255569009093893, + "learning_rate": 2.854022843905417e-07, + "loss": 0.2863, + "step": 3599 + }, + { + "epoch": 0.6521739130434783, + "grad_norm": 3.488786308451071, + "learning_rate": 2.851373304004973e-07, + "loss": 0.265, + "step": 3600 + }, + { + "epoch": 0.6521739130434783, + "eval_loss": 0.279484361410141, + "eval_runtime": 9.7579, + "eval_samples_per_second": 51.241, + "eval_steps_per_second": 0.102, + "step": 3600 + }, + { + "epoch": 0.6523550724637681, + "grad_norm": 3.588964815454868, + "learning_rate": 2.8487245038868815e-07, + "loss": 0.2914, + "step": 3601 + }, + { + "epoch": 0.652536231884058, + "grad_norm": 6.321461042729916, + "learning_rate": 2.8460764444631355e-07, + "loss": 0.267, + "step": 3602 + }, + { + "epoch": 0.6527173913043478, + "grad_norm": 3.5278027809007564, + "learning_rate": 2.843429126645476e-07, + "loss": 0.2185, + "step": 3603 + }, + { + "epoch": 0.6528985507246376, + "grad_norm": 3.642436917467338, + "learning_rate": 2.8407825513453843e-07, + "loss": 0.2671, + "step": 3604 + }, + { + "epoch": 0.6530797101449275, + "grad_norm": 4.776068447729105, + "learning_rate": 2.838136719474094e-07, + "loss": 0.2625, + "step": 3605 + }, + { + "epoch": 0.6532608695652173, + "grad_norm": 4.181266998541342, + "learning_rate": 2.8354916319425727e-07, + "loss": 0.2915, + "step": 3606 + }, + { + "epoch": 0.6534420289855073, + "grad_norm": 5.154781344835548, + "learning_rate": 2.8328472896615387e-07, + "loss": 0.2954, + "step": 3607 + }, + { + "epoch": 0.6536231884057971, + "grad_norm": 4.765559229241035, + "learning_rate": 2.8302036935414486e-07, + "loss": 0.2863, + "step": 3608 + }, + { + "epoch": 0.653804347826087, + "grad_norm": 4.0211848337348375, + "learning_rate": 2.8275608444925093e-07, + "loss": 0.2562, + "step": 3609 + }, + { + "epoch": 0.6539855072463768, + "grad_norm": 4.335879578395847, + "learning_rate": 2.8249187434246644e-07, + "loss": 0.2533, + "step": 3610 + }, + { + "epoch": 0.6541666666666667, + "grad_norm": 7.899859825786745, + "learning_rate": 2.822277391247604e-07, + "loss": 0.3308, + "step": 3611 + }, + { + "epoch": 0.6543478260869565, + "grad_norm": 3.5302127760530886, + "learning_rate": 2.8196367888707523e-07, + "loss": 0.261, + "step": 3612 + }, + { + "epoch": 0.6545289855072464, + "grad_norm": 4.249023241291757, + "learning_rate": 2.816996937203287e-07, + "loss": 0.2715, + "step": 3613 + }, + { + "epoch": 0.6547101449275362, + "grad_norm": 5.75142314526588, + "learning_rate": 2.8143578371541193e-07, + "loss": 0.2981, + "step": 3614 + }, + { + "epoch": 0.654891304347826, + "grad_norm": 5.824776336629638, + "learning_rate": 2.8117194896319043e-07, + "loss": 0.2747, + "step": 3615 + }, + { + "epoch": 0.6550724637681159, + "grad_norm": 3.73281373204958, + "learning_rate": 2.809081895545037e-07, + "loss": 0.2938, + "step": 3616 + }, + { + "epoch": 0.6552536231884057, + "grad_norm": 4.2455903901890455, + "learning_rate": 2.806445055801654e-07, + "loss": 0.3126, + "step": 3617 + }, + { + "epoch": 0.6554347826086957, + "grad_norm": 6.411375057907996, + "learning_rate": 2.8038089713096315e-07, + "loss": 0.3749, + "step": 3618 + }, + { + "epoch": 0.6556159420289855, + "grad_norm": 5.820503794110793, + "learning_rate": 2.801173642976586e-07, + "loss": 0.3274, + "step": 3619 + }, + { + "epoch": 0.6557971014492754, + "grad_norm": 4.913970927173583, + "learning_rate": 2.7985390717098715e-07, + "loss": 0.3523, + "step": 3620 + }, + { + "epoch": 0.6559782608695652, + "grad_norm": 4.619482463634723, + "learning_rate": 2.7959052584165876e-07, + "loss": 0.2765, + "step": 3621 + }, + { + "epoch": 0.6561594202898551, + "grad_norm": 3.417401039290704, + "learning_rate": 2.793272204003568e-07, + "loss": 0.248, + "step": 3622 + }, + { + "epoch": 0.6563405797101449, + "grad_norm": 4.011142184815913, + "learning_rate": 2.7906399093773824e-07, + "loss": 0.2491, + "step": 3623 + }, + { + "epoch": 0.6565217391304348, + "grad_norm": 3.836371639593736, + "learning_rate": 2.7880083754443424e-07, + "loss": 0.2418, + "step": 3624 + }, + { + "epoch": 0.6567028985507246, + "grad_norm": 3.4113225116294426, + "learning_rate": 2.785377603110501e-07, + "loss": 0.2578, + "step": 3625 + }, + { + "epoch": 0.6568840579710145, + "grad_norm": 5.503691518951337, + "learning_rate": 2.782747593281644e-07, + "loss": 0.3781, + "step": 3626 + }, + { + "epoch": 0.6570652173913043, + "grad_norm": 4.927070530792558, + "learning_rate": 2.7801183468632964e-07, + "loss": 0.2755, + "step": 3627 + }, + { + "epoch": 0.6572463768115943, + "grad_norm": 8.859870219546806, + "learning_rate": 2.7774898647607205e-07, + "loss": 0.3488, + "step": 3628 + }, + { + "epoch": 0.6574275362318841, + "grad_norm": 3.8815894478064683, + "learning_rate": 2.7748621478789137e-07, + "loss": 0.261, + "step": 3629 + }, + { + "epoch": 0.657608695652174, + "grad_norm": 6.688821754510899, + "learning_rate": 2.772235197122612e-07, + "loss": 0.2667, + "step": 3630 + }, + { + "epoch": 0.6577898550724638, + "grad_norm": 7.288455264644119, + "learning_rate": 2.7696090133962866e-07, + "loss": 0.3488, + "step": 3631 + }, + { + "epoch": 0.6579710144927536, + "grad_norm": 3.909989091520067, + "learning_rate": 2.766983597604149e-07, + "loss": 0.2975, + "step": 3632 + }, + { + "epoch": 0.6581521739130435, + "grad_norm": 6.100025961693236, + "learning_rate": 2.764358950650137e-07, + "loss": 0.3118, + "step": 3633 + }, + { + "epoch": 0.6583333333333333, + "grad_norm": 4.105225728555756, + "learning_rate": 2.761735073437931e-07, + "loss": 0.3026, + "step": 3634 + }, + { + "epoch": 0.6585144927536232, + "grad_norm": 4.367941485720895, + "learning_rate": 2.7591119668709426e-07, + "loss": 0.3368, + "step": 3635 + }, + { + "epoch": 0.658695652173913, + "grad_norm": 4.747645896093567, + "learning_rate": 2.7564896318523235e-07, + "loss": 0.2986, + "step": 3636 + }, + { + "epoch": 0.6588768115942029, + "grad_norm": 6.123179664376465, + "learning_rate": 2.753868069284954e-07, + "loss": 0.248, + "step": 3637 + }, + { + "epoch": 0.6590579710144927, + "grad_norm": 6.171803416165604, + "learning_rate": 2.7512472800714524e-07, + "loss": 0.2244, + "step": 3638 + }, + { + "epoch": 0.6592391304347827, + "grad_norm": 4.101033123111261, + "learning_rate": 2.7486272651141626e-07, + "loss": 0.3093, + "step": 3639 + }, + { + "epoch": 0.6594202898550725, + "grad_norm": 4.439942192889116, + "learning_rate": 2.746008025315176e-07, + "loss": 0.2698, + "step": 3640 + }, + { + "epoch": 0.6596014492753624, + "grad_norm": 4.242412112289066, + "learning_rate": 2.743389561576305e-07, + "loss": 0.2428, + "step": 3641 + }, + { + "epoch": 0.6597826086956522, + "grad_norm": 5.087518885612564, + "learning_rate": 2.7407718747991006e-07, + "loss": 0.2472, + "step": 3642 + }, + { + "epoch": 0.659963768115942, + "grad_norm": 7.055745367515722, + "learning_rate": 2.7381549658848437e-07, + "loss": 0.3213, + "step": 3643 + }, + { + "epoch": 0.6601449275362319, + "grad_norm": 5.169329899691847, + "learning_rate": 2.7355388357345487e-07, + "loss": 0.2875, + "step": 3644 + }, + { + "epoch": 0.6603260869565217, + "grad_norm": 6.595283603259727, + "learning_rate": 2.7329234852489623e-07, + "loss": 0.2546, + "step": 3645 + }, + { + "epoch": 0.6605072463768116, + "grad_norm": 3.8489407852792468, + "learning_rate": 2.7303089153285615e-07, + "loss": 0.2702, + "step": 3646 + }, + { + "epoch": 0.6606884057971014, + "grad_norm": 3.892946075740103, + "learning_rate": 2.727695126873553e-07, + "loss": 0.2726, + "step": 3647 + }, + { + "epoch": 0.6608695652173913, + "grad_norm": 4.533870632461735, + "learning_rate": 2.7250821207838805e-07, + "loss": 0.2774, + "step": 3648 + }, + { + "epoch": 0.6610507246376811, + "grad_norm": 3.7080499511784577, + "learning_rate": 2.7224698979592155e-07, + "loss": 0.257, + "step": 3649 + }, + { + "epoch": 0.6612318840579711, + "grad_norm": 6.809284094972931, + "learning_rate": 2.7198584592989527e-07, + "loss": 0.3, + "step": 3650 + }, + { + "epoch": 0.6614130434782609, + "grad_norm": 5.690296168129186, + "learning_rate": 2.717247805702224e-07, + "loss": 0.2307, + "step": 3651 + }, + { + "epoch": 0.6615942028985508, + "grad_norm": 5.037475673626697, + "learning_rate": 2.714637938067894e-07, + "loss": 0.2774, + "step": 3652 + }, + { + "epoch": 0.6617753623188406, + "grad_norm": 3.400094922068276, + "learning_rate": 2.7120288572945506e-07, + "loss": 0.257, + "step": 3653 + }, + { + "epoch": 0.6619565217391304, + "grad_norm": 3.878530967935302, + "learning_rate": 2.7094205642805123e-07, + "loss": 0.308, + "step": 3654 + }, + { + "epoch": 0.6621376811594203, + "grad_norm": 5.782688264320758, + "learning_rate": 2.706813059923826e-07, + "loss": 0.3242, + "step": 3655 + }, + { + "epoch": 0.6623188405797101, + "grad_norm": 4.816636776378942, + "learning_rate": 2.7042063451222685e-07, + "loss": 0.3356, + "step": 3656 + }, + { + "epoch": 0.6625, + "grad_norm": 5.812525310932098, + "learning_rate": 2.7016004207733445e-07, + "loss": 0.279, + "step": 3657 + }, + { + "epoch": 0.6626811594202898, + "grad_norm": 3.489803123765879, + "learning_rate": 2.6989952877742863e-07, + "loss": 0.2511, + "step": 3658 + }, + { + "epoch": 0.6628623188405797, + "grad_norm": 4.209224942182741, + "learning_rate": 2.6963909470220516e-07, + "loss": 0.2585, + "step": 3659 + }, + { + "epoch": 0.6630434782608695, + "grad_norm": 4.781288714945626, + "learning_rate": 2.6937873994133294e-07, + "loss": 0.298, + "step": 3660 + }, + { + "epoch": 0.6632246376811595, + "grad_norm": 4.22864320769655, + "learning_rate": 2.691184645844532e-07, + "loss": 0.3086, + "step": 3661 + }, + { + "epoch": 0.6634057971014493, + "grad_norm": 5.6425340265076365, + "learning_rate": 2.688582687211801e-07, + "loss": 0.3309, + "step": 3662 + }, + { + "epoch": 0.6635869565217392, + "grad_norm": 4.3399493955212405, + "learning_rate": 2.6859815244110006e-07, + "loss": 0.27, + "step": 3663 + }, + { + "epoch": 0.663768115942029, + "grad_norm": 7.329606499573125, + "learning_rate": 2.683381158337727e-07, + "loss": 0.3699, + "step": 3664 + }, + { + "epoch": 0.6639492753623188, + "grad_norm": 4.548447538524371, + "learning_rate": 2.680781589887299e-07, + "loss": 0.284, + "step": 3665 + }, + { + "epoch": 0.6641304347826087, + "grad_norm": 5.257102677918078, + "learning_rate": 2.6781828199547545e-07, + "loss": 0.3345, + "step": 3666 + }, + { + "epoch": 0.6643115942028985, + "grad_norm": 5.057863993945306, + "learning_rate": 2.675584849434868e-07, + "loss": 0.275, + "step": 3667 + }, + { + "epoch": 0.6644927536231884, + "grad_norm": 5.083540153822678, + "learning_rate": 2.672987679222131e-07, + "loss": 0.3251, + "step": 3668 + }, + { + "epoch": 0.6646739130434782, + "grad_norm": 5.36873637733001, + "learning_rate": 2.670391310210762e-07, + "loss": 0.318, + "step": 3669 + }, + { + "epoch": 0.6648550724637681, + "grad_norm": 3.701325257273936, + "learning_rate": 2.667795743294703e-07, + "loss": 0.3444, + "step": 3670 + }, + { + "epoch": 0.6650362318840579, + "grad_norm": 5.710167866377029, + "learning_rate": 2.665200979367619e-07, + "loss": 0.2747, + "step": 3671 + }, + { + "epoch": 0.6652173913043479, + "grad_norm": 3.2670581234439897, + "learning_rate": 2.662607019322901e-07, + "loss": 0.2459, + "step": 3672 + }, + { + "epoch": 0.6653985507246377, + "grad_norm": 4.739820353040369, + "learning_rate": 2.6600138640536606e-07, + "loss": 0.3047, + "step": 3673 + }, + { + "epoch": 0.6655797101449276, + "grad_norm": 4.646381608458436, + "learning_rate": 2.657421514452732e-07, + "loss": 0.1967, + "step": 3674 + }, + { + "epoch": 0.6657608695652174, + "grad_norm": 4.424479068892569, + "learning_rate": 2.65482997141268e-07, + "loss": 0.2993, + "step": 3675 + }, + { + "epoch": 0.6659420289855073, + "grad_norm": 11.823343439874979, + "learning_rate": 2.652239235825777e-07, + "loss": 0.3167, + "step": 3676 + }, + { + "epoch": 0.6661231884057971, + "grad_norm": 8.421222300347122, + "learning_rate": 2.64964930858403e-07, + "loss": 0.2632, + "step": 3677 + }, + { + "epoch": 0.6663043478260869, + "grad_norm": 6.684647609976163, + "learning_rate": 2.64706019057916e-07, + "loss": 0.2806, + "step": 3678 + }, + { + "epoch": 0.6664855072463768, + "grad_norm": 5.69973139210881, + "learning_rate": 2.644471882702617e-07, + "loss": 0.2846, + "step": 3679 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 4.44474700518741, + "learning_rate": 2.6418843858455664e-07, + "loss": 0.3004, + "step": 3680 + }, + { + "epoch": 0.6668478260869565, + "grad_norm": 4.37476347752842, + "learning_rate": 2.6392977008988954e-07, + "loss": 0.2831, + "step": 3681 + }, + { + "epoch": 0.6670289855072464, + "grad_norm": 8.191837727313821, + "learning_rate": 2.6367118287532075e-07, + "loss": 0.3109, + "step": 3682 + }, + { + "epoch": 0.6672101449275363, + "grad_norm": 4.537798478846723, + "learning_rate": 2.6341267702988366e-07, + "loss": 0.319, + "step": 3683 + }, + { + "epoch": 0.6673913043478261, + "grad_norm": 6.0466095915037315, + "learning_rate": 2.6315425264258285e-07, + "loss": 0.2448, + "step": 3684 + }, + { + "epoch": 0.667572463768116, + "grad_norm": 6.750126278221356, + "learning_rate": 2.6289590980239504e-07, + "loss": 0.3277, + "step": 3685 + }, + { + "epoch": 0.6677536231884058, + "grad_norm": 4.75955449751252, + "learning_rate": 2.6263764859826897e-07, + "loss": 0.3595, + "step": 3686 + }, + { + "epoch": 0.6679347826086957, + "grad_norm": 6.82483244022603, + "learning_rate": 2.6237946911912505e-07, + "loss": 0.3149, + "step": 3687 + }, + { + "epoch": 0.6681159420289855, + "grad_norm": 3.5489601048998285, + "learning_rate": 2.6212137145385583e-07, + "loss": 0.2621, + "step": 3688 + }, + { + "epoch": 0.6682971014492753, + "grad_norm": 6.608472992586457, + "learning_rate": 2.618633556913255e-07, + "loss": 0.3195, + "step": 3689 + }, + { + "epoch": 0.6684782608695652, + "grad_norm": 9.622788416228062, + "learning_rate": 2.6160542192036994e-07, + "loss": 0.3537, + "step": 3690 + }, + { + "epoch": 0.668659420289855, + "grad_norm": 5.998937943780035, + "learning_rate": 2.613475702297973e-07, + "loss": 0.2273, + "step": 3691 + }, + { + "epoch": 0.6688405797101449, + "grad_norm": 5.507125327866316, + "learning_rate": 2.610898007083872e-07, + "loss": 0.2766, + "step": 3692 + }, + { + "epoch": 0.6690217391304348, + "grad_norm": 4.998722513971953, + "learning_rate": 2.6083211344489053e-07, + "loss": 0.2771, + "step": 3693 + }, + { + "epoch": 0.6692028985507247, + "grad_norm": 4.815733635290072, + "learning_rate": 2.605745085280302e-07, + "loss": 0.2745, + "step": 3694 + }, + { + "epoch": 0.6693840579710145, + "grad_norm": 5.169377603311988, + "learning_rate": 2.603169860465012e-07, + "loss": 0.2627, + "step": 3695 + }, + { + "epoch": 0.6695652173913044, + "grad_norm": 3.51605881908899, + "learning_rate": 2.6005954608896954e-07, + "loss": 0.2889, + "step": 3696 + }, + { + "epoch": 0.6697463768115942, + "grad_norm": 3.3417235649319146, + "learning_rate": 2.59802188744073e-07, + "loss": 0.2629, + "step": 3697 + }, + { + "epoch": 0.6699275362318841, + "grad_norm": 3.3583454007999216, + "learning_rate": 2.5954491410042094e-07, + "loss": 0.268, + "step": 3698 + }, + { + "epoch": 0.6701086956521739, + "grad_norm": 3.482215764461388, + "learning_rate": 2.5928772224659436e-07, + "loss": 0.2585, + "step": 3699 + }, + { + "epoch": 0.6702898550724637, + "grad_norm": 6.939062804970206, + "learning_rate": 2.5903061327114537e-07, + "loss": 0.3046, + "step": 3700 + }, + { + "epoch": 0.6702898550724637, + "eval_loss": 0.27495312690734863, + "eval_runtime": 9.8384, + "eval_samples_per_second": 50.821, + "eval_steps_per_second": 0.102, + "step": 3700 + }, + { + "epoch": 0.6704710144927536, + "grad_norm": 3.823398836964224, + "learning_rate": 2.587735872625979e-07, + "loss": 0.2745, + "step": 3701 + }, + { + "epoch": 0.6706521739130434, + "grad_norm": 4.892659381418445, + "learning_rate": 2.585166443094476e-07, + "loss": 0.29, + "step": 3702 + }, + { + "epoch": 0.6708333333333333, + "grad_norm": 3.846838461143228, + "learning_rate": 2.582597845001607e-07, + "loss": 0.2869, + "step": 3703 + }, + { + "epoch": 0.6710144927536232, + "grad_norm": 4.82868446969758, + "learning_rate": 2.580030079231754e-07, + "loss": 0.2871, + "step": 3704 + }, + { + "epoch": 0.6711956521739131, + "grad_norm": 3.5194616192698893, + "learning_rate": 2.5774631466690087e-07, + "loss": 0.2928, + "step": 3705 + }, + { + "epoch": 0.6713768115942029, + "grad_norm": 5.433930905167059, + "learning_rate": 2.574897048197182e-07, + "loss": 0.3018, + "step": 3706 + }, + { + "epoch": 0.6715579710144928, + "grad_norm": 3.923716063835157, + "learning_rate": 2.5723317846997916e-07, + "loss": 0.2405, + "step": 3707 + }, + { + "epoch": 0.6717391304347826, + "grad_norm": 4.16140983937233, + "learning_rate": 2.569767357060073e-07, + "loss": 0.2783, + "step": 3708 + }, + { + "epoch": 0.6719202898550725, + "grad_norm": 3.7676758780958037, + "learning_rate": 2.567203766160964e-07, + "loss": 0.2922, + "step": 3709 + }, + { + "epoch": 0.6721014492753623, + "grad_norm": 4.343592744738559, + "learning_rate": 2.564641012885126e-07, + "loss": 0.2432, + "step": 3710 + }, + { + "epoch": 0.6722826086956522, + "grad_norm": 6.377767425993752, + "learning_rate": 2.5620790981149274e-07, + "loss": 0.2734, + "step": 3711 + }, + { + "epoch": 0.672463768115942, + "grad_norm": 3.9623108479923697, + "learning_rate": 2.559518022732446e-07, + "loss": 0.3119, + "step": 3712 + }, + { + "epoch": 0.6726449275362318, + "grad_norm": 7.791476134644975, + "learning_rate": 2.5569577876194725e-07, + "loss": 0.2958, + "step": 3713 + }, + { + "epoch": 0.6728260869565217, + "grad_norm": 7.555931885836235, + "learning_rate": 2.5543983936575085e-07, + "loss": 0.2807, + "step": 3714 + }, + { + "epoch": 0.6730072463768116, + "grad_norm": 4.441798355122445, + "learning_rate": 2.551839841727765e-07, + "loss": 0.321, + "step": 3715 + }, + { + "epoch": 0.6731884057971015, + "grad_norm": 4.497291138388753, + "learning_rate": 2.5492821327111634e-07, + "loss": 0.312, + "step": 3716 + }, + { + "epoch": 0.6733695652173913, + "grad_norm": 6.19309082208371, + "learning_rate": 2.5467252674883334e-07, + "loss": 0.2809, + "step": 3717 + }, + { + "epoch": 0.6735507246376812, + "grad_norm": 3.9176046880543294, + "learning_rate": 2.544169246939619e-07, + "loss": 0.2253, + "step": 3718 + }, + { + "epoch": 0.673731884057971, + "grad_norm": 3.8381856072852116, + "learning_rate": 2.5416140719450716e-07, + "loss": 0.2464, + "step": 3719 + }, + { + "epoch": 0.6739130434782609, + "grad_norm": 3.9871109924332573, + "learning_rate": 2.5390597433844454e-07, + "loss": 0.2563, + "step": 3720 + }, + { + "epoch": 0.6740942028985507, + "grad_norm": 7.1776908292991575, + "learning_rate": 2.536506262137208e-07, + "loss": 0.3132, + "step": 3721 + }, + { + "epoch": 0.6742753623188406, + "grad_norm": 5.935203294853653, + "learning_rate": 2.5339536290825383e-07, + "loss": 0.2835, + "step": 3722 + }, + { + "epoch": 0.6744565217391304, + "grad_norm": 7.16900488574676, + "learning_rate": 2.531401845099318e-07, + "loss": 0.336, + "step": 3723 + }, + { + "epoch": 0.6746376811594202, + "grad_norm": 3.484536818051787, + "learning_rate": 2.5288509110661394e-07, + "loss": 0.2505, + "step": 3724 + }, + { + "epoch": 0.6748188405797102, + "grad_norm": 4.152149688453602, + "learning_rate": 2.5263008278613005e-07, + "loss": 0.3403, + "step": 3725 + }, + { + "epoch": 0.675, + "grad_norm": 3.923635820170825, + "learning_rate": 2.523751596362808e-07, + "loss": 0.2747, + "step": 3726 + }, + { + "epoch": 0.6751811594202899, + "grad_norm": 3.5615440570905417, + "learning_rate": 2.521203217448372e-07, + "loss": 0.2621, + "step": 3727 + }, + { + "epoch": 0.6753623188405797, + "grad_norm": 4.477573613988831, + "learning_rate": 2.518655691995415e-07, + "loss": 0.3018, + "step": 3728 + }, + { + "epoch": 0.6755434782608696, + "grad_norm": 4.937307739699936, + "learning_rate": 2.516109020881059e-07, + "loss": 0.2702, + "step": 3729 + }, + { + "epoch": 0.6757246376811594, + "grad_norm": 3.578465065191748, + "learning_rate": 2.5135632049821354e-07, + "loss": 0.2735, + "step": 3730 + }, + { + "epoch": 0.6759057971014493, + "grad_norm": 9.165499789617822, + "learning_rate": 2.5110182451751825e-07, + "loss": 0.2474, + "step": 3731 + }, + { + "epoch": 0.6760869565217391, + "grad_norm": 3.6574739974542902, + "learning_rate": 2.5084741423364397e-07, + "loss": 0.2176, + "step": 3732 + }, + { + "epoch": 0.676268115942029, + "grad_norm": 3.4945780501329633, + "learning_rate": 2.5059308973418535e-07, + "loss": 0.2553, + "step": 3733 + }, + { + "epoch": 0.6764492753623188, + "grad_norm": 4.7676407893054416, + "learning_rate": 2.503388511067079e-07, + "loss": 0.2938, + "step": 3734 + }, + { + "epoch": 0.6766304347826086, + "grad_norm": 3.714527861368929, + "learning_rate": 2.5008469843874705e-07, + "loss": 0.2833, + "step": 3735 + }, + { + "epoch": 0.6768115942028986, + "grad_norm": 5.2619405581981376, + "learning_rate": 2.4983063181780827e-07, + "loss": 0.3008, + "step": 3736 + }, + { + "epoch": 0.6769927536231884, + "grad_norm": 5.806052309007467, + "learning_rate": 2.495766513313685e-07, + "loss": 0.3541, + "step": 3737 + }, + { + "epoch": 0.6771739130434783, + "grad_norm": 8.334073080872042, + "learning_rate": 2.4932275706687416e-07, + "loss": 0.3122, + "step": 3738 + }, + { + "epoch": 0.6773550724637681, + "grad_norm": 5.036014008178167, + "learning_rate": 2.490689491117424e-07, + "loss": 0.2791, + "step": 3739 + }, + { + "epoch": 0.677536231884058, + "grad_norm": 4.894421293528215, + "learning_rate": 2.4881522755336024e-07, + "loss": 0.3398, + "step": 3740 + }, + { + "epoch": 0.6777173913043478, + "grad_norm": 3.8092840533788253, + "learning_rate": 2.485615924790855e-07, + "loss": 0.29, + "step": 3741 + }, + { + "epoch": 0.6778985507246377, + "grad_norm": 3.5658172235659524, + "learning_rate": 2.483080439762458e-07, + "loss": 0.2381, + "step": 3742 + }, + { + "epoch": 0.6780797101449275, + "grad_norm": 5.211964956085503, + "learning_rate": 2.4805458213213904e-07, + "loss": 0.328, + "step": 3743 + }, + { + "epoch": 0.6782608695652174, + "grad_norm": 3.8336922392325277, + "learning_rate": 2.478012070340332e-07, + "loss": 0.2769, + "step": 3744 + }, + { + "epoch": 0.6784420289855072, + "grad_norm": 4.0706889002832485, + "learning_rate": 2.475479187691672e-07, + "loss": 0.3317, + "step": 3745 + }, + { + "epoch": 0.678623188405797, + "grad_norm": 4.8548088776608145, + "learning_rate": 2.472947174247486e-07, + "loss": 0.3067, + "step": 3746 + }, + { + "epoch": 0.678804347826087, + "grad_norm": 7.700384039520854, + "learning_rate": 2.4704160308795623e-07, + "loss": 0.2748, + "step": 3747 + }, + { + "epoch": 0.6789855072463769, + "grad_norm": 4.377439947690035, + "learning_rate": 2.4678857584593823e-07, + "loss": 0.2952, + "step": 3748 + }, + { + "epoch": 0.6791666666666667, + "grad_norm": 4.191160381355023, + "learning_rate": 2.465356357858135e-07, + "loss": 0.3186, + "step": 3749 + }, + { + "epoch": 0.6793478260869565, + "grad_norm": 3.8901980606507256, + "learning_rate": 2.4628278299467025e-07, + "loss": 0.2575, + "step": 3750 + }, + { + "epoch": 0.6795289855072464, + "grad_norm": 5.933138967534731, + "learning_rate": 2.4603001755956706e-07, + "loss": 0.3002, + "step": 3751 + }, + { + "epoch": 0.6797101449275362, + "grad_norm": 5.885488032232096, + "learning_rate": 2.4577733956753204e-07, + "loss": 0.2572, + "step": 3752 + }, + { + "epoch": 0.6798913043478261, + "grad_norm": 5.063156531520064, + "learning_rate": 2.4552474910556366e-07, + "loss": 0.2988, + "step": 3753 + }, + { + "epoch": 0.6800724637681159, + "grad_norm": 3.6891494181801194, + "learning_rate": 2.4527224626062983e-07, + "loss": 0.31, + "step": 3754 + }, + { + "epoch": 0.6802536231884058, + "grad_norm": 4.048423715052501, + "learning_rate": 2.450198311196685e-07, + "loss": 0.2791, + "step": 3755 + }, + { + "epoch": 0.6804347826086956, + "grad_norm": 3.5643477289797474, + "learning_rate": 2.447675037695875e-07, + "loss": 0.2852, + "step": 3756 + }, + { + "epoch": 0.6806159420289855, + "grad_norm": 7.374217912480681, + "learning_rate": 2.4451526429726425e-07, + "loss": 0.27, + "step": 3757 + }, + { + "epoch": 0.6807971014492754, + "grad_norm": 4.320903535185061, + "learning_rate": 2.4426311278954604e-07, + "loss": 0.3285, + "step": 3758 + }, + { + "epoch": 0.6809782608695653, + "grad_norm": 5.493615002209071, + "learning_rate": 2.4401104933324973e-07, + "loss": 0.3208, + "step": 3759 + }, + { + "epoch": 0.6811594202898551, + "grad_norm": 3.348509491773087, + "learning_rate": 2.437590740151619e-07, + "loss": 0.2755, + "step": 3760 + }, + { + "epoch": 0.681340579710145, + "grad_norm": 6.1246952309264735, + "learning_rate": 2.435071869220392e-07, + "loss": 0.2782, + "step": 3761 + }, + { + "epoch": 0.6815217391304348, + "grad_norm": 5.360414590507508, + "learning_rate": 2.432553881406075e-07, + "loss": 0.2525, + "step": 3762 + }, + { + "epoch": 0.6817028985507246, + "grad_norm": 3.154309192637602, + "learning_rate": 2.430036777575619e-07, + "loss": 0.2622, + "step": 3763 + }, + { + "epoch": 0.6818840579710145, + "grad_norm": 8.186132924282555, + "learning_rate": 2.4275205585956757e-07, + "loss": 0.2174, + "step": 3764 + }, + { + "epoch": 0.6820652173913043, + "grad_norm": 6.183140415361262, + "learning_rate": 2.425005225332595e-07, + "loss": 0.3044, + "step": 3765 + }, + { + "epoch": 0.6822463768115942, + "grad_norm": 4.440254661755383, + "learning_rate": 2.4224907786524154e-07, + "loss": 0.2992, + "step": 3766 + }, + { + "epoch": 0.682427536231884, + "grad_norm": 3.7596045205174278, + "learning_rate": 2.4199772194208726e-07, + "loss": 0.2947, + "step": 3767 + }, + { + "epoch": 0.6826086956521739, + "grad_norm": 4.609703182758899, + "learning_rate": 2.417464548503398e-07, + "loss": 0.262, + "step": 3768 + }, + { + "epoch": 0.6827898550724638, + "grad_norm": 3.9194289858445086, + "learning_rate": 2.4149527667651166e-07, + "loss": 0.3021, + "step": 3769 + }, + { + "epoch": 0.6829710144927537, + "grad_norm": 5.309697719592353, + "learning_rate": 2.412441875070845e-07, + "loss": 0.2904, + "step": 3770 + }, + { + "epoch": 0.6831521739130435, + "grad_norm": 5.481643647046267, + "learning_rate": 2.409931874285096e-07, + "loss": 0.3001, + "step": 3771 + }, + { + "epoch": 0.6833333333333333, + "grad_norm": 4.742414949513573, + "learning_rate": 2.4074227652720746e-07, + "loss": 0.2664, + "step": 3772 + }, + { + "epoch": 0.6835144927536232, + "grad_norm": 10.468248103439224, + "learning_rate": 2.404914548895679e-07, + "loss": 0.2708, + "step": 3773 + }, + { + "epoch": 0.683695652173913, + "grad_norm": 4.70162866668075, + "learning_rate": 2.4024072260195e-07, + "loss": 0.2391, + "step": 3774 + }, + { + "epoch": 0.6838768115942029, + "grad_norm": 7.271222561201524, + "learning_rate": 2.3999007975068193e-07, + "loss": 0.2966, + "step": 3775 + }, + { + "epoch": 0.6840579710144927, + "grad_norm": 4.935387864568864, + "learning_rate": 2.3973952642206146e-07, + "loss": 0.3543, + "step": 3776 + }, + { + "epoch": 0.6842391304347826, + "grad_norm": 3.9182071596776384, + "learning_rate": 2.3948906270235524e-07, + "loss": 0.2336, + "step": 3777 + }, + { + "epoch": 0.6844202898550724, + "grad_norm": 5.98865061652781, + "learning_rate": 2.3923868867779923e-07, + "loss": 0.2335, + "step": 3778 + }, + { + "epoch": 0.6846014492753624, + "grad_norm": 4.016137179708794, + "learning_rate": 2.3898840443459787e-07, + "loss": 0.3028, + "step": 3779 + }, + { + "epoch": 0.6847826086956522, + "grad_norm": 3.359287852461111, + "learning_rate": 2.3873821005892575e-07, + "loss": 0.2885, + "step": 3780 + }, + { + "epoch": 0.6849637681159421, + "grad_norm": 5.74270091761096, + "learning_rate": 2.384881056369257e-07, + "loss": 0.2542, + "step": 3781 + }, + { + "epoch": 0.6851449275362319, + "grad_norm": 5.5518861750157535, + "learning_rate": 2.3823809125471006e-07, + "loss": 0.3141, + "step": 3782 + }, + { + "epoch": 0.6853260869565218, + "grad_norm": 8.20629876964488, + "learning_rate": 2.3798816699835982e-07, + "loss": 0.2784, + "step": 3783 + }, + { + "epoch": 0.6855072463768116, + "grad_norm": 4.634983024752152, + "learning_rate": 2.3773833295392514e-07, + "loss": 0.2207, + "step": 3784 + }, + { + "epoch": 0.6856884057971014, + "grad_norm": 6.191718181565424, + "learning_rate": 2.3748858920742498e-07, + "loss": 0.2737, + "step": 3785 + }, + { + "epoch": 0.6858695652173913, + "grad_norm": 5.284665196778069, + "learning_rate": 2.3723893584484744e-07, + "loss": 0.3055, + "step": 3786 + }, + { + "epoch": 0.6860507246376811, + "grad_norm": 3.6539431978292742, + "learning_rate": 2.3698937295214905e-07, + "loss": 0.2796, + "step": 3787 + }, + { + "epoch": 0.686231884057971, + "grad_norm": 5.738723518638768, + "learning_rate": 2.367399006152559e-07, + "loss": 0.2868, + "step": 3788 + }, + { + "epoch": 0.6864130434782608, + "grad_norm": 3.93371117987035, + "learning_rate": 2.364905189200625e-07, + "loss": 0.2718, + "step": 3789 + }, + { + "epoch": 0.6865942028985508, + "grad_norm": 5.756673818183177, + "learning_rate": 2.3624122795243183e-07, + "loss": 0.3469, + "step": 3790 + }, + { + "epoch": 0.6867753623188406, + "grad_norm": 3.6251648800464054, + "learning_rate": 2.359920277981959e-07, + "loss": 0.2632, + "step": 3791 + }, + { + "epoch": 0.6869565217391305, + "grad_norm": 3.729185749355193, + "learning_rate": 2.3574291854315582e-07, + "loss": 0.3147, + "step": 3792 + }, + { + "epoch": 0.6871376811594203, + "grad_norm": 4.260088636735624, + "learning_rate": 2.3549390027308103e-07, + "loss": 0.3128, + "step": 3793 + }, + { + "epoch": 0.6873188405797102, + "grad_norm": 5.971931144202631, + "learning_rate": 2.3524497307370954e-07, + "loss": 0.2944, + "step": 3794 + }, + { + "epoch": 0.6875, + "grad_norm": 7.096882484653727, + "learning_rate": 2.3499613703074834e-07, + "loss": 0.283, + "step": 3795 + }, + { + "epoch": 0.6876811594202898, + "grad_norm": 7.532401824587726, + "learning_rate": 2.3474739222987277e-07, + "loss": 0.2555, + "step": 3796 + }, + { + "epoch": 0.6878623188405797, + "grad_norm": 4.837707478698999, + "learning_rate": 2.344987387567268e-07, + "loss": 0.2844, + "step": 3797 + }, + { + "epoch": 0.6880434782608695, + "grad_norm": 4.5302550670805015, + "learning_rate": 2.342501766969231e-07, + "loss": 0.2845, + "step": 3798 + }, + { + "epoch": 0.6882246376811594, + "grad_norm": 4.107654960421121, + "learning_rate": 2.340017061360427e-07, + "loss": 0.299, + "step": 3799 + }, + { + "epoch": 0.6884057971014492, + "grad_norm": 5.1077384860866335, + "learning_rate": 2.337533271596352e-07, + "loss": 0.3354, + "step": 3800 + }, + { + "epoch": 0.6884057971014492, + "eval_loss": 0.28162500262260437, + "eval_runtime": 9.7581, + "eval_samples_per_second": 51.239, + "eval_steps_per_second": 0.102, + "step": 3800 + }, + { + "epoch": 0.6885869565217392, + "grad_norm": 3.895599508421316, + "learning_rate": 2.3350503985321863e-07, + "loss": 0.2299, + "step": 3801 + }, + { + "epoch": 0.688768115942029, + "grad_norm": 6.36223034603808, + "learning_rate": 2.3325684430227953e-07, + "loss": 0.2933, + "step": 3802 + }, + { + "epoch": 0.6889492753623189, + "grad_norm": 5.793919492476561, + "learning_rate": 2.3300874059227265e-07, + "loss": 0.2676, + "step": 3803 + }, + { + "epoch": 0.6891304347826087, + "grad_norm": 3.5537249304223066, + "learning_rate": 2.3276072880862159e-07, + "loss": 0.2871, + "step": 3804 + }, + { + "epoch": 0.6893115942028986, + "grad_norm": 7.85421289156873, + "learning_rate": 2.3251280903671793e-07, + "loss": 0.2793, + "step": 3805 + }, + { + "epoch": 0.6894927536231884, + "grad_norm": 5.2015059804309836, + "learning_rate": 2.322649813619214e-07, + "loss": 0.2999, + "step": 3806 + }, + { + "epoch": 0.6896739130434782, + "grad_norm": 3.89391652983408, + "learning_rate": 2.3201724586956013e-07, + "loss": 0.264, + "step": 3807 + }, + { + "epoch": 0.6898550724637681, + "grad_norm": 4.940018783241108, + "learning_rate": 2.31769602644931e-07, + "loss": 0.3566, + "step": 3808 + }, + { + "epoch": 0.6900362318840579, + "grad_norm": 3.577183307372419, + "learning_rate": 2.315220517732986e-07, + "loss": 0.2273, + "step": 3809 + }, + { + "epoch": 0.6902173913043478, + "grad_norm": 8.334791736171915, + "learning_rate": 2.3127459333989578e-07, + "loss": 0.2323, + "step": 3810 + }, + { + "epoch": 0.6903985507246376, + "grad_norm": 5.026998604234912, + "learning_rate": 2.310272274299238e-07, + "loss": 0.2943, + "step": 3811 + }, + { + "epoch": 0.6905797101449276, + "grad_norm": 7.631456430315683, + "learning_rate": 2.3077995412855183e-07, + "loss": 0.3053, + "step": 3812 + }, + { + "epoch": 0.6907608695652174, + "grad_norm": 5.904942862059751, + "learning_rate": 2.305327735209172e-07, + "loss": 0.227, + "step": 3813 + }, + { + "epoch": 0.6909420289855073, + "grad_norm": 4.466594245637513, + "learning_rate": 2.3028568569212526e-07, + "loss": 0.279, + "step": 3814 + }, + { + "epoch": 0.6911231884057971, + "grad_norm": 4.2423410047423715, + "learning_rate": 2.3003869072724986e-07, + "loss": 0.2641, + "step": 3815 + }, + { + "epoch": 0.691304347826087, + "grad_norm": 4.085576865913926, + "learning_rate": 2.2979178871133255e-07, + "loss": 0.2751, + "step": 3816 + }, + { + "epoch": 0.6914855072463768, + "grad_norm": 5.300833733691943, + "learning_rate": 2.295449797293824e-07, + "loss": 0.3022, + "step": 3817 + }, + { + "epoch": 0.6916666666666667, + "grad_norm": 4.350403258352893, + "learning_rate": 2.2929826386637703e-07, + "loss": 0.2694, + "step": 3818 + }, + { + "epoch": 0.6918478260869565, + "grad_norm": 7.237121567159541, + "learning_rate": 2.290516412072622e-07, + "loss": 0.2486, + "step": 3819 + }, + { + "epoch": 0.6920289855072463, + "grad_norm": 5.413961554673111, + "learning_rate": 2.288051118369511e-07, + "loss": 0.3938, + "step": 3820 + }, + { + "epoch": 0.6922101449275362, + "grad_norm": 8.626655468906, + "learning_rate": 2.2855867584032496e-07, + "loss": 0.291, + "step": 3821 + }, + { + "epoch": 0.6923913043478261, + "grad_norm": 4.157941098726383, + "learning_rate": 2.2831233330223282e-07, + "loss": 0.3305, + "step": 3822 + }, + { + "epoch": 0.692572463768116, + "grad_norm": 7.99926831495356, + "learning_rate": 2.2806608430749158e-07, + "loss": 0.2769, + "step": 3823 + }, + { + "epoch": 0.6927536231884058, + "grad_norm": 7.928550603128422, + "learning_rate": 2.2781992894088599e-07, + "loss": 0.3252, + "step": 3824 + }, + { + "epoch": 0.6929347826086957, + "grad_norm": 4.941603073940312, + "learning_rate": 2.2757386728716849e-07, + "loss": 0.266, + "step": 3825 + }, + { + "epoch": 0.6931159420289855, + "grad_norm": 5.025470239545978, + "learning_rate": 2.2732789943105924e-07, + "loss": 0.2733, + "step": 3826 + }, + { + "epoch": 0.6932971014492754, + "grad_norm": 3.5713345321840717, + "learning_rate": 2.270820254572462e-07, + "loss": 0.2453, + "step": 3827 + }, + { + "epoch": 0.6934782608695652, + "grad_norm": 3.723252805044335, + "learning_rate": 2.2683624545038488e-07, + "loss": 0.2809, + "step": 3828 + }, + { + "epoch": 0.693659420289855, + "grad_norm": 3.98260193173211, + "learning_rate": 2.2659055949509852e-07, + "loss": 0.2941, + "step": 3829 + }, + { + "epoch": 0.6938405797101449, + "grad_norm": 4.697418403635444, + "learning_rate": 2.2634496767597784e-07, + "loss": 0.3369, + "step": 3830 + }, + { + "epoch": 0.6940217391304347, + "grad_norm": 5.99252702407414, + "learning_rate": 2.2609947007758152e-07, + "loss": 0.2697, + "step": 3831 + }, + { + "epoch": 0.6942028985507246, + "grad_norm": 6.180970861163161, + "learning_rate": 2.2585406678443558e-07, + "loss": 0.3008, + "step": 3832 + }, + { + "epoch": 0.6943840579710145, + "grad_norm": 3.9945378122709516, + "learning_rate": 2.2560875788103323e-07, + "loss": 0.2973, + "step": 3833 + }, + { + "epoch": 0.6945652173913044, + "grad_norm": 3.142306917182887, + "learning_rate": 2.2536354345183545e-07, + "loss": 0.2305, + "step": 3834 + }, + { + "epoch": 0.6947463768115942, + "grad_norm": 3.5138372491509857, + "learning_rate": 2.2511842358127114e-07, + "loss": 0.2518, + "step": 3835 + }, + { + "epoch": 0.6949275362318841, + "grad_norm": 2.8847508407145437, + "learning_rate": 2.2487339835373593e-07, + "loss": 0.2212, + "step": 3836 + }, + { + "epoch": 0.6951086956521739, + "grad_norm": 10.539855375468377, + "learning_rate": 2.246284678535933e-07, + "loss": 0.2988, + "step": 3837 + }, + { + "epoch": 0.6952898550724638, + "grad_norm": 4.358687419164801, + "learning_rate": 2.243836321651739e-07, + "loss": 0.2793, + "step": 3838 + }, + { + "epoch": 0.6954710144927536, + "grad_norm": 3.5529227307144238, + "learning_rate": 2.2413889137277586e-07, + "loss": 0.2875, + "step": 3839 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 6.488059008046831, + "learning_rate": 2.2389424556066455e-07, + "loss": 0.2217, + "step": 3840 + }, + { + "epoch": 0.6958333333333333, + "grad_norm": 3.83985159372148, + "learning_rate": 2.2364969481307272e-07, + "loss": 0.2709, + "step": 3841 + }, + { + "epoch": 0.6960144927536231, + "grad_norm": 5.792415216894456, + "learning_rate": 2.2340523921420034e-07, + "loss": 0.2792, + "step": 3842 + }, + { + "epoch": 0.696195652173913, + "grad_norm": 3.633719525565086, + "learning_rate": 2.2316087884821467e-07, + "loss": 0.2446, + "step": 3843 + }, + { + "epoch": 0.696376811594203, + "grad_norm": 5.443811432586644, + "learning_rate": 2.2291661379925008e-07, + "loss": 0.3033, + "step": 3844 + }, + { + "epoch": 0.6965579710144928, + "grad_norm": 3.793910056797811, + "learning_rate": 2.22672444151408e-07, + "loss": 0.3072, + "step": 3845 + }, + { + "epoch": 0.6967391304347826, + "grad_norm": 4.581990500799826, + "learning_rate": 2.2242836998875763e-07, + "loss": 0.2587, + "step": 3846 + }, + { + "epoch": 0.6969202898550725, + "grad_norm": 3.9969392833061437, + "learning_rate": 2.2218439139533462e-07, + "loss": 0.252, + "step": 3847 + }, + { + "epoch": 0.6971014492753623, + "grad_norm": 3.813600695419182, + "learning_rate": 2.2194050845514212e-07, + "loss": 0.292, + "step": 3848 + }, + { + "epoch": 0.6972826086956522, + "grad_norm": 8.126997778204192, + "learning_rate": 2.2169672125214971e-07, + "loss": 0.2962, + "step": 3849 + }, + { + "epoch": 0.697463768115942, + "grad_norm": 3.817536113558924, + "learning_rate": 2.2145302987029495e-07, + "loss": 0.3088, + "step": 3850 + }, + { + "epoch": 0.6976449275362319, + "grad_norm": 6.393869814890446, + "learning_rate": 2.2120943439348184e-07, + "loss": 0.2606, + "step": 3851 + }, + { + "epoch": 0.6978260869565217, + "grad_norm": 4.84019102550198, + "learning_rate": 2.2096593490558134e-07, + "loss": 0.2768, + "step": 3852 + }, + { + "epoch": 0.6980072463768116, + "grad_norm": 7.712696443643765, + "learning_rate": 2.2072253149043158e-07, + "loss": 0.3077, + "step": 3853 + }, + { + "epoch": 0.6981884057971014, + "grad_norm": 5.247110955916728, + "learning_rate": 2.2047922423183746e-07, + "loss": 0.3003, + "step": 3854 + }, + { + "epoch": 0.6983695652173914, + "grad_norm": 6.554265872852244, + "learning_rate": 2.2023601321357082e-07, + "loss": 0.235, + "step": 3855 + }, + { + "epoch": 0.6985507246376812, + "grad_norm": 3.8444108134514385, + "learning_rate": 2.199928985193704e-07, + "loss": 0.2716, + "step": 3856 + }, + { + "epoch": 0.698731884057971, + "grad_norm": 8.164588563505138, + "learning_rate": 2.1974988023294156e-07, + "loss": 0.3179, + "step": 3857 + }, + { + "epoch": 0.6989130434782609, + "grad_norm": 6.013200076195491, + "learning_rate": 2.1950695843795697e-07, + "loss": 0.2845, + "step": 3858 + }, + { + "epoch": 0.6990942028985507, + "grad_norm": 4.48517778665608, + "learning_rate": 2.1926413321805574e-07, + "loss": 0.3093, + "step": 3859 + }, + { + "epoch": 0.6992753623188406, + "grad_norm": 4.817871914286493, + "learning_rate": 2.190214046568435e-07, + "loss": 0.2713, + "step": 3860 + }, + { + "epoch": 0.6994565217391304, + "grad_norm": 5.423984674913027, + "learning_rate": 2.187787728378927e-07, + "loss": 0.3008, + "step": 3861 + }, + { + "epoch": 0.6996376811594203, + "grad_norm": 4.036100771245654, + "learning_rate": 2.18536237844743e-07, + "loss": 0.2845, + "step": 3862 + }, + { + "epoch": 0.6998188405797101, + "grad_norm": 4.0802816512856666, + "learning_rate": 2.1829379976090028e-07, + "loss": 0.2702, + "step": 3863 + }, + { + "epoch": 0.7, + "grad_norm": 8.461585122702223, + "learning_rate": 2.180514586698371e-07, + "loss": 0.2683, + "step": 3864 + }, + { + "epoch": 0.7001811594202898, + "grad_norm": 6.164450221428653, + "learning_rate": 2.1780921465499252e-07, + "loss": 0.3211, + "step": 3865 + }, + { + "epoch": 0.7003623188405798, + "grad_norm": 3.234841078108479, + "learning_rate": 2.175670677997724e-07, + "loss": 0.2292, + "step": 3866 + }, + { + "epoch": 0.7005434782608696, + "grad_norm": 3.893588969357438, + "learning_rate": 2.1732501818754906e-07, + "loss": 0.298, + "step": 3867 + }, + { + "epoch": 0.7007246376811594, + "grad_norm": 8.101903589515599, + "learning_rate": 2.1708306590166126e-07, + "loss": 0.2877, + "step": 3868 + }, + { + "epoch": 0.7009057971014493, + "grad_norm": 8.238000132321554, + "learning_rate": 2.1684121102541435e-07, + "loss": 0.3257, + "step": 3869 + }, + { + "epoch": 0.7010869565217391, + "grad_norm": 6.974761142208887, + "learning_rate": 2.1659945364208017e-07, + "loss": 0.3155, + "step": 3870 + }, + { + "epoch": 0.701268115942029, + "grad_norm": 4.785887971435475, + "learning_rate": 2.1635779383489687e-07, + "loss": 0.2602, + "step": 3871 + }, + { + "epoch": 0.7014492753623188, + "grad_norm": 3.6665107106925614, + "learning_rate": 2.1611623168706905e-07, + "loss": 0.3032, + "step": 3872 + }, + { + "epoch": 0.7016304347826087, + "grad_norm": 4.266989471594792, + "learning_rate": 2.1587476728176757e-07, + "loss": 0.3152, + "step": 3873 + }, + { + "epoch": 0.7018115942028985, + "grad_norm": 3.477226931322972, + "learning_rate": 2.156334007021301e-07, + "loss": 0.2901, + "step": 3874 + }, + { + "epoch": 0.7019927536231884, + "grad_norm": 5.14395421208077, + "learning_rate": 2.1539213203126034e-07, + "loss": 0.3104, + "step": 3875 + }, + { + "epoch": 0.7021739130434783, + "grad_norm": 5.257762792478757, + "learning_rate": 2.1515096135222775e-07, + "loss": 0.3226, + "step": 3876 + }, + { + "epoch": 0.7023550724637682, + "grad_norm": 3.9327063916925433, + "learning_rate": 2.149098887480687e-07, + "loss": 0.2662, + "step": 3877 + }, + { + "epoch": 0.702536231884058, + "grad_norm": 4.931721664454086, + "learning_rate": 2.146689143017859e-07, + "loss": 0.3296, + "step": 3878 + }, + { + "epoch": 0.7027173913043478, + "grad_norm": 7.7016531339644505, + "learning_rate": 2.1442803809634785e-07, + "loss": 0.2648, + "step": 3879 + }, + { + "epoch": 0.7028985507246377, + "grad_norm": 3.651930880124436, + "learning_rate": 2.1418726021468937e-07, + "loss": 0.2563, + "step": 3880 + }, + { + "epoch": 0.7030797101449275, + "grad_norm": 5.508046338388239, + "learning_rate": 2.1394658073971135e-07, + "loss": 0.3018, + "step": 3881 + }, + { + "epoch": 0.7032608695652174, + "grad_norm": 3.7365230202239585, + "learning_rate": 2.13705999754281e-07, + "loss": 0.248, + "step": 3882 + }, + { + "epoch": 0.7034420289855072, + "grad_norm": 3.683401828315749, + "learning_rate": 2.1346551734123136e-07, + "loss": 0.295, + "step": 3883 + }, + { + "epoch": 0.7036231884057971, + "grad_norm": 3.6852567553988553, + "learning_rate": 2.1322513358336158e-07, + "loss": 0.2618, + "step": 3884 + }, + { + "epoch": 0.7038043478260869, + "grad_norm": 5.8496857708182075, + "learning_rate": 2.1298484856343724e-07, + "loss": 0.3161, + "step": 3885 + }, + { + "epoch": 0.7039855072463768, + "grad_norm": 4.110668816683411, + "learning_rate": 2.1274466236418963e-07, + "loss": 0.3302, + "step": 3886 + }, + { + "epoch": 0.7041666666666667, + "grad_norm": 3.443120256525678, + "learning_rate": 2.1250457506831565e-07, + "loss": 0.257, + "step": 3887 + }, + { + "epoch": 0.7043478260869566, + "grad_norm": 5.992851148463246, + "learning_rate": 2.1226458675847847e-07, + "loss": 0.322, + "step": 3888 + }, + { + "epoch": 0.7045289855072464, + "grad_norm": 5.735196706510981, + "learning_rate": 2.1202469751730757e-07, + "loss": 0.2557, + "step": 3889 + }, + { + "epoch": 0.7047101449275363, + "grad_norm": 4.543808253257551, + "learning_rate": 2.1178490742739773e-07, + "loss": 0.3094, + "step": 3890 + }, + { + "epoch": 0.7048913043478261, + "grad_norm": 4.176058969502284, + "learning_rate": 2.1154521657130985e-07, + "loss": 0.2956, + "step": 3891 + }, + { + "epoch": 0.7050724637681159, + "grad_norm": 5.820421759039495, + "learning_rate": 2.1130562503157068e-07, + "loss": 0.263, + "step": 3892 + }, + { + "epoch": 0.7052536231884058, + "grad_norm": 3.864294812391932, + "learning_rate": 2.1106613289067266e-07, + "loss": 0.2593, + "step": 3893 + }, + { + "epoch": 0.7054347826086956, + "grad_norm": 4.013625144244584, + "learning_rate": 2.108267402310741e-07, + "loss": 0.2783, + "step": 3894 + }, + { + "epoch": 0.7056159420289855, + "grad_norm": 3.4819882548369487, + "learning_rate": 2.10587447135199e-07, + "loss": 0.2769, + "step": 3895 + }, + { + "epoch": 0.7057971014492753, + "grad_norm": 4.494904366891253, + "learning_rate": 2.1034825368543713e-07, + "loss": 0.2751, + "step": 3896 + }, + { + "epoch": 0.7059782608695652, + "grad_norm": 3.788228229461717, + "learning_rate": 2.1010915996414387e-07, + "loss": 0.2715, + "step": 3897 + }, + { + "epoch": 0.7061594202898551, + "grad_norm": 4.558146418442677, + "learning_rate": 2.0987016605364038e-07, + "loss": 0.2485, + "step": 3898 + }, + { + "epoch": 0.706340579710145, + "grad_norm": 9.011667071487581, + "learning_rate": 2.096312720362134e-07, + "loss": 0.3279, + "step": 3899 + }, + { + "epoch": 0.7065217391304348, + "grad_norm": 4.033816335560063, + "learning_rate": 2.093924779941151e-07, + "loss": 0.2674, + "step": 3900 + }, + { + "epoch": 0.7065217391304348, + "eval_loss": 0.2748437523841858, + "eval_runtime": 9.7949, + "eval_samples_per_second": 51.047, + "eval_steps_per_second": 0.102, + "step": 3900 + }, + { + "epoch": 0.7067028985507247, + "grad_norm": 6.570008043991807, + "learning_rate": 2.091537840095637e-07, + "loss": 0.3518, + "step": 3901 + }, + { + "epoch": 0.7068840579710145, + "grad_norm": 8.488845064009187, + "learning_rate": 2.0891519016474268e-07, + "loss": 0.2664, + "step": 3902 + }, + { + "epoch": 0.7070652173913043, + "grad_norm": 7.184044904634563, + "learning_rate": 2.086766965418007e-07, + "loss": 0.3019, + "step": 3903 + }, + { + "epoch": 0.7072463768115942, + "grad_norm": 6.153126550834495, + "learning_rate": 2.0843830322285227e-07, + "loss": 0.3337, + "step": 3904 + }, + { + "epoch": 0.707427536231884, + "grad_norm": 3.18806571497542, + "learning_rate": 2.0820001028997763e-07, + "loss": 0.2554, + "step": 3905 + }, + { + "epoch": 0.7076086956521739, + "grad_norm": 7.231120940129814, + "learning_rate": 2.079618178252221e-07, + "loss": 0.292, + "step": 3906 + }, + { + "epoch": 0.7077898550724637, + "grad_norm": 6.9688938353825645, + "learning_rate": 2.0772372591059633e-07, + "loss": 0.2545, + "step": 3907 + }, + { + "epoch": 0.7079710144927536, + "grad_norm": 4.725859624784996, + "learning_rate": 2.0748573462807657e-07, + "loss": 0.2582, + "step": 3908 + }, + { + "epoch": 0.7081521739130435, + "grad_norm": 4.493747199323723, + "learning_rate": 2.0724784405960438e-07, + "loss": 0.249, + "step": 3909 + }, + { + "epoch": 0.7083333333333334, + "grad_norm": 4.05813342984372, + "learning_rate": 2.070100542870865e-07, + "loss": 0.3141, + "step": 3910 + }, + { + "epoch": 0.7085144927536232, + "grad_norm": 3.646999104315953, + "learning_rate": 2.0677236539239523e-07, + "loss": 0.2483, + "step": 3911 + }, + { + "epoch": 0.7086956521739131, + "grad_norm": 7.400528883186729, + "learning_rate": 2.0653477745736786e-07, + "loss": 0.2589, + "step": 3912 + }, + { + "epoch": 0.7088768115942029, + "grad_norm": 3.923596584462551, + "learning_rate": 2.0629729056380708e-07, + "loss": 0.2462, + "step": 3913 + }, + { + "epoch": 0.7090579710144927, + "grad_norm": 10.302374494290001, + "learning_rate": 2.0605990479348072e-07, + "loss": 0.2842, + "step": 3914 + }, + { + "epoch": 0.7092391304347826, + "grad_norm": 4.363029422198087, + "learning_rate": 2.058226202281217e-07, + "loss": 0.2841, + "step": 3915 + }, + { + "epoch": 0.7094202898550724, + "grad_norm": 3.1212162356638316, + "learning_rate": 2.0558543694942853e-07, + "loss": 0.1978, + "step": 3916 + }, + { + "epoch": 0.7096014492753623, + "grad_norm": 4.278531571620851, + "learning_rate": 2.0534835503906446e-07, + "loss": 0.3357, + "step": 3917 + }, + { + "epoch": 0.7097826086956521, + "grad_norm": 9.792183444454981, + "learning_rate": 2.0511137457865797e-07, + "loss": 0.3221, + "step": 3918 + }, + { + "epoch": 0.7099637681159421, + "grad_norm": 3.365095405079111, + "learning_rate": 2.0487449564980202e-07, + "loss": 0.2504, + "step": 3919 + }, + { + "epoch": 0.7101449275362319, + "grad_norm": 8.133772695587682, + "learning_rate": 2.046377183340558e-07, + "loss": 0.3301, + "step": 3920 + }, + { + "epoch": 0.7103260869565218, + "grad_norm": 9.7820690195649, + "learning_rate": 2.0440104271294257e-07, + "loss": 0.2816, + "step": 3921 + }, + { + "epoch": 0.7105072463768116, + "grad_norm": 7.165834345146065, + "learning_rate": 2.041644688679509e-07, + "loss": 0.286, + "step": 3922 + }, + { + "epoch": 0.7106884057971015, + "grad_norm": 7.163950001306182, + "learning_rate": 2.0392799688053435e-07, + "loss": 0.2729, + "step": 3923 + }, + { + "epoch": 0.7108695652173913, + "grad_norm": 3.328188233958173, + "learning_rate": 2.0369162683211129e-07, + "loss": 0.247, + "step": 3924 + }, + { + "epoch": 0.7110507246376812, + "grad_norm": 3.438127332975321, + "learning_rate": 2.03455358804065e-07, + "loss": 0.2189, + "step": 3925 + }, + { + "epoch": 0.711231884057971, + "grad_norm": 6.8672147674006, + "learning_rate": 2.0321919287774374e-07, + "loss": 0.2694, + "step": 3926 + }, + { + "epoch": 0.7114130434782608, + "grad_norm": 4.262116419446079, + "learning_rate": 2.0298312913446042e-07, + "loss": 0.3133, + "step": 3927 + }, + { + "epoch": 0.7115942028985507, + "grad_norm": 3.8195989168364486, + "learning_rate": 2.027471676554932e-07, + "loss": 0.265, + "step": 3928 + }, + { + "epoch": 0.7117753623188405, + "grad_norm": 4.205182147749765, + "learning_rate": 2.025113085220847e-07, + "loss": 0.2808, + "step": 3929 + }, + { + "epoch": 0.7119565217391305, + "grad_norm": 8.755210542150248, + "learning_rate": 2.022755518154421e-07, + "loss": 0.2946, + "step": 3930 + }, + { + "epoch": 0.7121376811594203, + "grad_norm": 5.157590356896721, + "learning_rate": 2.020398976167374e-07, + "loss": 0.2964, + "step": 3931 + }, + { + "epoch": 0.7123188405797102, + "grad_norm": 4.19122884140391, + "learning_rate": 2.0180434600710794e-07, + "loss": 0.2569, + "step": 3932 + }, + { + "epoch": 0.7125, + "grad_norm": 5.382443262970783, + "learning_rate": 2.0156889706765506e-07, + "loss": 0.3063, + "step": 3933 + }, + { + "epoch": 0.7126811594202899, + "grad_norm": 3.7882097904317646, + "learning_rate": 2.0133355087944488e-07, + "loss": 0.2797, + "step": 3934 + }, + { + "epoch": 0.7128623188405797, + "grad_norm": 5.14816199052616, + "learning_rate": 2.010983075235082e-07, + "loss": 0.2382, + "step": 3935 + }, + { + "epoch": 0.7130434782608696, + "grad_norm": 5.251671940574674, + "learning_rate": 2.0086316708084055e-07, + "loss": 0.3133, + "step": 3936 + }, + { + "epoch": 0.7132246376811594, + "grad_norm": 4.655460168551347, + "learning_rate": 2.0062812963240177e-07, + "loss": 0.2487, + "step": 3937 + }, + { + "epoch": 0.7134057971014492, + "grad_norm": 8.264429332214254, + "learning_rate": 2.003931952591164e-07, + "loss": 0.2998, + "step": 3938 + }, + { + "epoch": 0.7135869565217391, + "grad_norm": 4.289325963492413, + "learning_rate": 2.0015836404187348e-07, + "loss": 0.2999, + "step": 3939 + }, + { + "epoch": 0.7137681159420289, + "grad_norm": 4.135300883105886, + "learning_rate": 1.999236360615265e-07, + "loss": 0.334, + "step": 3940 + }, + { + "epoch": 0.7139492753623189, + "grad_norm": 6.914956714424375, + "learning_rate": 1.9968901139889343e-07, + "loss": 0.289, + "step": 3941 + }, + { + "epoch": 0.7141304347826087, + "grad_norm": 3.7373158418014905, + "learning_rate": 1.9945449013475663e-07, + "loss": 0.2569, + "step": 3942 + }, + { + "epoch": 0.7143115942028986, + "grad_norm": 4.7759805108158355, + "learning_rate": 1.992200723498627e-07, + "loss": 0.2658, + "step": 3943 + }, + { + "epoch": 0.7144927536231884, + "grad_norm": 3.8138379019022457, + "learning_rate": 1.9898575812492317e-07, + "loss": 0.2783, + "step": 3944 + }, + { + "epoch": 0.7146739130434783, + "grad_norm": 4.250834228868454, + "learning_rate": 1.9875154754061351e-07, + "loss": 0.302, + "step": 3945 + }, + { + "epoch": 0.7148550724637681, + "grad_norm": 5.5105589011747655, + "learning_rate": 1.9851744067757324e-07, + "loss": 0.2726, + "step": 3946 + }, + { + "epoch": 0.715036231884058, + "grad_norm": 4.4198010547508675, + "learning_rate": 1.9828343761640642e-07, + "loss": 0.2381, + "step": 3947 + }, + { + "epoch": 0.7152173913043478, + "grad_norm": 5.537535517595483, + "learning_rate": 1.9804953843768174e-07, + "loss": 0.3453, + "step": 3948 + }, + { + "epoch": 0.7153985507246376, + "grad_norm": 7.963945946747752, + "learning_rate": 1.9781574322193168e-07, + "loss": 0.3154, + "step": 3949 + }, + { + "epoch": 0.7155797101449275, + "grad_norm": 3.9181213151800827, + "learning_rate": 1.9758205204965294e-07, + "loss": 0.2582, + "step": 3950 + }, + { + "epoch": 0.7157608695652173, + "grad_norm": 4.562695087166056, + "learning_rate": 1.9734846500130665e-07, + "loss": 0.2763, + "step": 3951 + }, + { + "epoch": 0.7159420289855073, + "grad_norm": 5.2157578413710715, + "learning_rate": 1.971149821573178e-07, + "loss": 0.3168, + "step": 3952 + }, + { + "epoch": 0.7161231884057971, + "grad_norm": 6.969112931137719, + "learning_rate": 1.9688160359807571e-07, + "loss": 0.3432, + "step": 3953 + }, + { + "epoch": 0.716304347826087, + "grad_norm": 4.125750770221318, + "learning_rate": 1.9664832940393355e-07, + "loss": 0.265, + "step": 3954 + }, + { + "epoch": 0.7164855072463768, + "grad_norm": 7.183623564070895, + "learning_rate": 1.9641515965520905e-07, + "loss": 0.2924, + "step": 3955 + }, + { + "epoch": 0.7166666666666667, + "grad_norm": 4.4769166730914725, + "learning_rate": 1.9618209443218363e-07, + "loss": 0.305, + "step": 3956 + }, + { + "epoch": 0.7168478260869565, + "grad_norm": 4.150501688576831, + "learning_rate": 1.9594913381510246e-07, + "loss": 0.2535, + "step": 3957 + }, + { + "epoch": 0.7170289855072464, + "grad_norm": 5.819210510466341, + "learning_rate": 1.95716277884175e-07, + "loss": 0.2811, + "step": 3958 + }, + { + "epoch": 0.7172101449275362, + "grad_norm": 7.3890368754842655, + "learning_rate": 1.9548352671957496e-07, + "loss": 0.2869, + "step": 3959 + }, + { + "epoch": 0.717391304347826, + "grad_norm": 3.7529413710727035, + "learning_rate": 1.952508804014395e-07, + "loss": 0.2644, + "step": 3960 + }, + { + "epoch": 0.7175724637681159, + "grad_norm": 4.702756802345117, + "learning_rate": 1.950183390098698e-07, + "loss": 0.3067, + "step": 3961 + }, + { + "epoch": 0.7177536231884057, + "grad_norm": 6.550127858133256, + "learning_rate": 1.9478590262493106e-07, + "loss": 0.2847, + "step": 3962 + }, + { + "epoch": 0.7179347826086957, + "grad_norm": 6.313935907248912, + "learning_rate": 1.9455357132665218e-07, + "loss": 0.2468, + "step": 3963 + }, + { + "epoch": 0.7181159420289855, + "grad_norm": 4.057518227244305, + "learning_rate": 1.943213451950259e-07, + "loss": 0.313, + "step": 3964 + }, + { + "epoch": 0.7182971014492754, + "grad_norm": 4.204988904175218, + "learning_rate": 1.9408922431000885e-07, + "loss": 0.2439, + "step": 3965 + }, + { + "epoch": 0.7184782608695652, + "grad_norm": 5.061574208602043, + "learning_rate": 1.9385720875152122e-07, + "loss": 0.3147, + "step": 3966 + }, + { + "epoch": 0.7186594202898551, + "grad_norm": 4.5808412053038445, + "learning_rate": 1.9362529859944727e-07, + "loss": 0.2769, + "step": 3967 + }, + { + "epoch": 0.7188405797101449, + "grad_norm": 4.0086085354532965, + "learning_rate": 1.9339349393363458e-07, + "loss": 0.2761, + "step": 3968 + }, + { + "epoch": 0.7190217391304348, + "grad_norm": 5.049972341909374, + "learning_rate": 1.931617948338946e-07, + "loss": 0.312, + "step": 3969 + }, + { + "epoch": 0.7192028985507246, + "grad_norm": 7.540720873630649, + "learning_rate": 1.9293020138000244e-07, + "loss": 0.2534, + "step": 3970 + }, + { + "epoch": 0.7193840579710145, + "grad_norm": 4.634285839162342, + "learning_rate": 1.9269871365169692e-07, + "loss": 0.3176, + "step": 3971 + }, + { + "epoch": 0.7195652173913043, + "grad_norm": 4.619762711532391, + "learning_rate": 1.9246733172868056e-07, + "loss": 0.3036, + "step": 3972 + }, + { + "epoch": 0.7197463768115943, + "grad_norm": 5.05537239517053, + "learning_rate": 1.9223605569061885e-07, + "loss": 0.3593, + "step": 3973 + }, + { + "epoch": 0.7199275362318841, + "grad_norm": 3.7669551631783085, + "learning_rate": 1.9200488561714118e-07, + "loss": 0.2793, + "step": 3974 + }, + { + "epoch": 0.720108695652174, + "grad_norm": 7.9264972886021035, + "learning_rate": 1.9177382158784088e-07, + "loss": 0.2951, + "step": 3975 + }, + { + "epoch": 0.7202898550724638, + "grad_norm": 3.966875002065434, + "learning_rate": 1.9154286368227423e-07, + "loss": 0.2998, + "step": 3976 + }, + { + "epoch": 0.7204710144927536, + "grad_norm": 7.076021952740293, + "learning_rate": 1.913120119799611e-07, + "loss": 0.297, + "step": 3977 + }, + { + "epoch": 0.7206521739130435, + "grad_norm": 4.144593857369443, + "learning_rate": 1.9108126656038482e-07, + "loss": 0.2999, + "step": 3978 + }, + { + "epoch": 0.7208333333333333, + "grad_norm": 4.238234717643975, + "learning_rate": 1.908506275029922e-07, + "loss": 0.2742, + "step": 3979 + }, + { + "epoch": 0.7210144927536232, + "grad_norm": 4.055815296261728, + "learning_rate": 1.9062009488719326e-07, + "loss": 0.2943, + "step": 3980 + }, + { + "epoch": 0.721195652173913, + "grad_norm": 3.5371083302642146, + "learning_rate": 1.903896687923615e-07, + "loss": 0.2992, + "step": 3981 + }, + { + "epoch": 0.7213768115942029, + "grad_norm": 7.77354011345013, + "learning_rate": 1.9015934929783383e-07, + "loss": 0.2567, + "step": 3982 + }, + { + "epoch": 0.7215579710144927, + "grad_norm": 5.88584406704857, + "learning_rate": 1.899291364829102e-07, + "loss": 0.3499, + "step": 3983 + }, + { + "epoch": 0.7217391304347827, + "grad_norm": 5.695720052748647, + "learning_rate": 1.8969903042685403e-07, + "loss": 0.3163, + "step": 3984 + }, + { + "epoch": 0.7219202898550725, + "grad_norm": 4.028077446196472, + "learning_rate": 1.894690312088919e-07, + "loss": 0.2874, + "step": 3985 + }, + { + "epoch": 0.7221014492753624, + "grad_norm": 6.088330308383606, + "learning_rate": 1.8923913890821352e-07, + "loss": 0.3097, + "step": 3986 + }, + { + "epoch": 0.7222826086956522, + "grad_norm": 3.721897113873162, + "learning_rate": 1.890093536039722e-07, + "loss": 0.2999, + "step": 3987 + }, + { + "epoch": 0.722463768115942, + "grad_norm": 4.939991099997167, + "learning_rate": 1.8877967537528405e-07, + "loss": 0.2817, + "step": 3988 + }, + { + "epoch": 0.7226449275362319, + "grad_norm": 5.561433227333897, + "learning_rate": 1.8855010430122798e-07, + "loss": 0.2955, + "step": 3989 + }, + { + "epoch": 0.7228260869565217, + "grad_norm": 8.647609171184131, + "learning_rate": 1.8832064046084683e-07, + "loss": 0.3036, + "step": 3990 + }, + { + "epoch": 0.7230072463768116, + "grad_norm": 6.130374411993332, + "learning_rate": 1.8809128393314595e-07, + "loss": 0.2071, + "step": 3991 + }, + { + "epoch": 0.7231884057971014, + "grad_norm": 3.0488322090344013, + "learning_rate": 1.8786203479709383e-07, + "loss": 0.2276, + "step": 3992 + }, + { + "epoch": 0.7233695652173913, + "grad_norm": 8.082891288733899, + "learning_rate": 1.8763289313162212e-07, + "loss": 0.2735, + "step": 3993 + }, + { + "epoch": 0.7235507246376811, + "grad_norm": 3.9796336873473774, + "learning_rate": 1.874038590156253e-07, + "loss": 0.2992, + "step": 3994 + }, + { + "epoch": 0.7237318840579711, + "grad_norm": 5.486330471952747, + "learning_rate": 1.871749325279609e-07, + "loss": 0.2946, + "step": 3995 + }, + { + "epoch": 0.7239130434782609, + "grad_norm": 4.123125049435668, + "learning_rate": 1.869461137474495e-07, + "loss": 0.2979, + "step": 3996 + }, + { + "epoch": 0.7240942028985508, + "grad_norm": 7.711667669591236, + "learning_rate": 1.8671740275287416e-07, + "loss": 0.316, + "step": 3997 + }, + { + "epoch": 0.7242753623188406, + "grad_norm": 4.361146302612137, + "learning_rate": 1.864887996229817e-07, + "loss": 0.2792, + "step": 3998 + }, + { + "epoch": 0.7244565217391304, + "grad_norm": 6.372879497140796, + "learning_rate": 1.8626030443648105e-07, + "loss": 0.3497, + "step": 3999 + }, + { + "epoch": 0.7246376811594203, + "grad_norm": 5.290022841495765, + "learning_rate": 1.8603191727204398e-07, + "loss": 0.3032, + "step": 4000 + }, + { + "epoch": 0.7246376811594203, + "eval_loss": 0.2775624990463257, + "eval_runtime": 9.8232, + "eval_samples_per_second": 50.9, + "eval_steps_per_second": 0.102, + "step": 4000 + }, + { + "epoch": 0.7248188405797101, + "grad_norm": 3.7837835685083743, + "learning_rate": 1.8580363820830525e-07, + "loss": 0.2328, + "step": 4001 + }, + { + "epoch": 0.725, + "grad_norm": 11.243154586682063, + "learning_rate": 1.855754673238627e-07, + "loss": 0.3296, + "step": 4002 + }, + { + "epoch": 0.7251811594202898, + "grad_norm": 4.283002776116116, + "learning_rate": 1.8534740469727655e-07, + "loss": 0.3029, + "step": 4003 + }, + { + "epoch": 0.7253623188405797, + "grad_norm": 4.460776168875511, + "learning_rate": 1.8511945040706984e-07, + "loss": 0.3147, + "step": 4004 + }, + { + "epoch": 0.7255434782608695, + "grad_norm": 7.1945108217623694, + "learning_rate": 1.848916045317283e-07, + "loss": 0.276, + "step": 4005 + }, + { + "epoch": 0.7257246376811595, + "grad_norm": 4.454930156948343, + "learning_rate": 1.8466386714970027e-07, + "loss": 0.3214, + "step": 4006 + }, + { + "epoch": 0.7259057971014493, + "grad_norm": 6.783122275683271, + "learning_rate": 1.8443623833939693e-07, + "loss": 0.2924, + "step": 4007 + }, + { + "epoch": 0.7260869565217392, + "grad_norm": 4.581667747058475, + "learning_rate": 1.8420871817919187e-07, + "loss": 0.2937, + "step": 4008 + }, + { + "epoch": 0.726268115942029, + "grad_norm": 6.188306253550094, + "learning_rate": 1.839813067474214e-07, + "loss": 0.2735, + "step": 4009 + }, + { + "epoch": 0.7264492753623188, + "grad_norm": 5.070865441582673, + "learning_rate": 1.837540041223844e-07, + "loss": 0.3237, + "step": 4010 + }, + { + "epoch": 0.7266304347826087, + "grad_norm": 4.001889637709018, + "learning_rate": 1.8352681038234212e-07, + "loss": 0.2719, + "step": 4011 + }, + { + "epoch": 0.7268115942028985, + "grad_norm": 5.72533614339234, + "learning_rate": 1.8329972560551854e-07, + "loss": 0.2997, + "step": 4012 + }, + { + "epoch": 0.7269927536231884, + "grad_norm": 4.695592503012075, + "learning_rate": 1.830727498700998e-07, + "loss": 0.2615, + "step": 4013 + }, + { + "epoch": 0.7271739130434782, + "grad_norm": 7.1601274393712, + "learning_rate": 1.8284588325423505e-07, + "loss": 0.2591, + "step": 4014 + }, + { + "epoch": 0.7273550724637681, + "grad_norm": 5.915948575911907, + "learning_rate": 1.826191258360356e-07, + "loss": 0.2983, + "step": 4015 + }, + { + "epoch": 0.7275362318840579, + "grad_norm": 3.3690810921377534, + "learning_rate": 1.823924776935748e-07, + "loss": 0.2572, + "step": 4016 + }, + { + "epoch": 0.7277173913043479, + "grad_norm": 3.6344854353821767, + "learning_rate": 1.821659389048885e-07, + "loss": 0.3033, + "step": 4017 + }, + { + "epoch": 0.7278985507246377, + "grad_norm": 4.010645014504151, + "learning_rate": 1.8193950954797565e-07, + "loss": 0.3045, + "step": 4018 + }, + { + "epoch": 0.7280797101449276, + "grad_norm": 3.784089514278222, + "learning_rate": 1.8171318970079658e-07, + "loss": 0.2926, + "step": 4019 + }, + { + "epoch": 0.7282608695652174, + "grad_norm": 5.136532169035658, + "learning_rate": 1.8148697944127438e-07, + "loss": 0.309, + "step": 4020 + }, + { + "epoch": 0.7284420289855073, + "grad_norm": 4.9724965037391735, + "learning_rate": 1.8126087884729434e-07, + "loss": 0.2961, + "step": 4021 + }, + { + "epoch": 0.7286231884057971, + "grad_norm": 15.440337642479328, + "learning_rate": 1.8103488799670395e-07, + "loss": 0.3103, + "step": 4022 + }, + { + "epoch": 0.7288043478260869, + "grad_norm": 4.229264400754731, + "learning_rate": 1.8080900696731288e-07, + "loss": 0.2177, + "step": 4023 + }, + { + "epoch": 0.7289855072463768, + "grad_norm": 7.591407787228796, + "learning_rate": 1.8058323583689288e-07, + "loss": 0.312, + "step": 4024 + }, + { + "epoch": 0.7291666666666666, + "grad_norm": 3.9192300374901547, + "learning_rate": 1.8035757468317842e-07, + "loss": 0.2862, + "step": 4025 + }, + { + "epoch": 0.7293478260869565, + "grad_norm": 5.0747843670043835, + "learning_rate": 1.8013202358386565e-07, + "loss": 0.2777, + "step": 4026 + }, + { + "epoch": 0.7295289855072464, + "grad_norm": 9.622895749432057, + "learning_rate": 1.799065826166125e-07, + "loss": 0.3371, + "step": 4027 + }, + { + "epoch": 0.7297101449275363, + "grad_norm": 5.199294203354176, + "learning_rate": 1.796812518590395e-07, + "loss": 0.3027, + "step": 4028 + }, + { + "epoch": 0.7298913043478261, + "grad_norm": 4.161505912391221, + "learning_rate": 1.7945603138872933e-07, + "loss": 0.29, + "step": 4029 + }, + { + "epoch": 0.730072463768116, + "grad_norm": 6.124467623153883, + "learning_rate": 1.792309212832263e-07, + "loss": 0.2486, + "step": 4030 + }, + { + "epoch": 0.7302536231884058, + "grad_norm": 3.8038601359536233, + "learning_rate": 1.7900592162003692e-07, + "loss": 0.2899, + "step": 4031 + }, + { + "epoch": 0.7304347826086957, + "grad_norm": 4.321741200410663, + "learning_rate": 1.7878103247662962e-07, + "loss": 0.2809, + "step": 4032 + }, + { + "epoch": 0.7306159420289855, + "grad_norm": 3.700894881311305, + "learning_rate": 1.7855625393043482e-07, + "loss": 0.2821, + "step": 4033 + }, + { + "epoch": 0.7307971014492753, + "grad_norm": 4.112404728514255, + "learning_rate": 1.7833158605884485e-07, + "loss": 0.2992, + "step": 4034 + }, + { + "epoch": 0.7309782608695652, + "grad_norm": 6.260070397947784, + "learning_rate": 1.7810702893921387e-07, + "loss": 0.3093, + "step": 4035 + }, + { + "epoch": 0.731159420289855, + "grad_norm": 3.778333428132222, + "learning_rate": 1.7788258264885797e-07, + "loss": 0.2587, + "step": 4036 + }, + { + "epoch": 0.7313405797101449, + "grad_norm": 4.904834391024922, + "learning_rate": 1.7765824726505512e-07, + "loss": 0.277, + "step": 4037 + }, + { + "epoch": 0.7315217391304348, + "grad_norm": 5.272550025120003, + "learning_rate": 1.77434022865045e-07, + "loss": 0.2838, + "step": 4038 + }, + { + "epoch": 0.7317028985507247, + "grad_norm": 7.441763426474568, + "learning_rate": 1.7720990952602916e-07, + "loss": 0.2766, + "step": 4039 + }, + { + "epoch": 0.7318840579710145, + "grad_norm": 4.397405024936907, + "learning_rate": 1.769859073251707e-07, + "loss": 0.3033, + "step": 4040 + }, + { + "epoch": 0.7320652173913044, + "grad_norm": 7.211549355034018, + "learning_rate": 1.7676201633959503e-07, + "loss": 0.2489, + "step": 4041 + }, + { + "epoch": 0.7322463768115942, + "grad_norm": 10.135333894179256, + "learning_rate": 1.7653823664638884e-07, + "loss": 0.2521, + "step": 4042 + }, + { + "epoch": 0.7324275362318841, + "grad_norm": 4.204932706072341, + "learning_rate": 1.7631456832260017e-07, + "loss": 0.2693, + "step": 4043 + }, + { + "epoch": 0.7326086956521739, + "grad_norm": 5.00090250074175, + "learning_rate": 1.7609101144523908e-07, + "loss": 0.2872, + "step": 4044 + }, + { + "epoch": 0.7327898550724637, + "grad_norm": 3.409662996576817, + "learning_rate": 1.7586756609127768e-07, + "loss": 0.2715, + "step": 4045 + }, + { + "epoch": 0.7329710144927536, + "grad_norm": 3.3456447784627414, + "learning_rate": 1.7564423233764901e-07, + "loss": 0.2756, + "step": 4046 + }, + { + "epoch": 0.7331521739130434, + "grad_norm": 4.044345182147048, + "learning_rate": 1.75421010261248e-07, + "loss": 0.2793, + "step": 4047 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 6.100970187970424, + "learning_rate": 1.7519789993893103e-07, + "loss": 0.2995, + "step": 4048 + }, + { + "epoch": 0.7335144927536232, + "grad_norm": 3.8182739620184853, + "learning_rate": 1.749749014475161e-07, + "loss": 0.2834, + "step": 4049 + }, + { + "epoch": 0.7336956521739131, + "grad_norm": 4.831301384608513, + "learning_rate": 1.7475201486378255e-07, + "loss": 0.2715, + "step": 4050 + }, + { + "epoch": 0.7338768115942029, + "grad_norm": 3.0211303095125146, + "learning_rate": 1.7452924026447136e-07, + "loss": 0.2327, + "step": 4051 + }, + { + "epoch": 0.7340579710144928, + "grad_norm": 5.447840050625781, + "learning_rate": 1.7430657772628488e-07, + "loss": 0.2921, + "step": 4052 + }, + { + "epoch": 0.7342391304347826, + "grad_norm": 7.112618612472881, + "learning_rate": 1.7408402732588685e-07, + "loss": 0.2825, + "step": 4053 + }, + { + "epoch": 0.7344202898550725, + "grad_norm": 7.231675239000602, + "learning_rate": 1.738615891399024e-07, + "loss": 0.3269, + "step": 4054 + }, + { + "epoch": 0.7346014492753623, + "grad_norm": 4.143875646341541, + "learning_rate": 1.7363926324491813e-07, + "loss": 0.2305, + "step": 4055 + }, + { + "epoch": 0.7347826086956522, + "grad_norm": 10.373148091535601, + "learning_rate": 1.734170497174816e-07, + "loss": 0.2494, + "step": 4056 + }, + { + "epoch": 0.734963768115942, + "grad_norm": 13.364313613074309, + "learning_rate": 1.731949486341025e-07, + "loss": 0.3307, + "step": 4057 + }, + { + "epoch": 0.7351449275362318, + "grad_norm": 4.382852748350924, + "learning_rate": 1.7297296007125112e-07, + "loss": 0.292, + "step": 4058 + }, + { + "epoch": 0.7353260869565217, + "grad_norm": 5.327589801474438, + "learning_rate": 1.7275108410535877e-07, + "loss": 0.2167, + "step": 4059 + }, + { + "epoch": 0.7355072463768116, + "grad_norm": 6.394193193401852, + "learning_rate": 1.725293208128189e-07, + "loss": 0.3071, + "step": 4060 + }, + { + "epoch": 0.7356884057971015, + "grad_norm": 8.234081461035954, + "learning_rate": 1.7230767026998543e-07, + "loss": 0.333, + "step": 4061 + }, + { + "epoch": 0.7358695652173913, + "grad_norm": 5.857308893031953, + "learning_rate": 1.720861325531737e-07, + "loss": 0.3001, + "step": 4062 + }, + { + "epoch": 0.7360507246376812, + "grad_norm": 2.933292943525002, + "learning_rate": 1.7186470773866025e-07, + "loss": 0.2111, + "step": 4063 + }, + { + "epoch": 0.736231884057971, + "grad_norm": 4.109004963026581, + "learning_rate": 1.7164339590268267e-07, + "loss": 0.2748, + "step": 4064 + }, + { + "epoch": 0.7364130434782609, + "grad_norm": 8.67488643719478, + "learning_rate": 1.7142219712143968e-07, + "loss": 0.3017, + "step": 4065 + }, + { + "epoch": 0.7365942028985507, + "grad_norm": 3.9605724776002447, + "learning_rate": 1.71201111471091e-07, + "loss": 0.2592, + "step": 4066 + }, + { + "epoch": 0.7367753623188406, + "grad_norm": 4.891730972876826, + "learning_rate": 1.7098013902775736e-07, + "loss": 0.2716, + "step": 4067 + }, + { + "epoch": 0.7369565217391304, + "grad_norm": 10.637408114660769, + "learning_rate": 1.7075927986752104e-07, + "loss": 0.3638, + "step": 4068 + }, + { + "epoch": 0.7371376811594202, + "grad_norm": 3.7711968309292203, + "learning_rate": 1.7053853406642471e-07, + "loss": 0.2878, + "step": 4069 + }, + { + "epoch": 0.7373188405797102, + "grad_norm": 4.7111630448958435, + "learning_rate": 1.70317901700472e-07, + "loss": 0.2631, + "step": 4070 + }, + { + "epoch": 0.7375, + "grad_norm": 4.766524430188335, + "learning_rate": 1.700973828456276e-07, + "loss": 0.3398, + "step": 4071 + }, + { + "epoch": 0.7376811594202899, + "grad_norm": 4.878166475157653, + "learning_rate": 1.698769775778176e-07, + "loss": 0.3311, + "step": 4072 + }, + { + "epoch": 0.7378623188405797, + "grad_norm": 4.5110435891535055, + "learning_rate": 1.6965668597292832e-07, + "loss": 0.3185, + "step": 4073 + }, + { + "epoch": 0.7380434782608696, + "grad_norm": 6.105604566424603, + "learning_rate": 1.694365081068073e-07, + "loss": 0.3055, + "step": 4074 + }, + { + "epoch": 0.7382246376811594, + "grad_norm": 9.808078236948342, + "learning_rate": 1.692164440552628e-07, + "loss": 0.3059, + "step": 4075 + }, + { + "epoch": 0.7384057971014493, + "grad_norm": 6.478678720347312, + "learning_rate": 1.6899649389406384e-07, + "loss": 0.3526, + "step": 4076 + }, + { + "epoch": 0.7385869565217391, + "grad_norm": 7.119297399782839, + "learning_rate": 1.6877665769894038e-07, + "loss": 0.3319, + "step": 4077 + }, + { + "epoch": 0.738768115942029, + "grad_norm": 5.277490930196286, + "learning_rate": 1.685569355455831e-07, + "loss": 0.2769, + "step": 4078 + }, + { + "epoch": 0.7389492753623188, + "grad_norm": 5.556273927347214, + "learning_rate": 1.683373275096433e-07, + "loss": 0.2494, + "step": 4079 + }, + { + "epoch": 0.7391304347826086, + "grad_norm": 4.3268188693427465, + "learning_rate": 1.6811783366673304e-07, + "loss": 0.3031, + "step": 4080 + }, + { + "epoch": 0.7393115942028986, + "grad_norm": 3.8310131069680864, + "learning_rate": 1.678984540924252e-07, + "loss": 0.252, + "step": 4081 + }, + { + "epoch": 0.7394927536231884, + "grad_norm": 5.287240234109758, + "learning_rate": 1.6767918886225307e-07, + "loss": 0.2477, + "step": 4082 + }, + { + "epoch": 0.7396739130434783, + "grad_norm": 3.4545222855171565, + "learning_rate": 1.6746003805171067e-07, + "loss": 0.2697, + "step": 4083 + }, + { + "epoch": 0.7398550724637681, + "grad_norm": 3.609924065994071, + "learning_rate": 1.6724100173625293e-07, + "loss": 0.2983, + "step": 4084 + }, + { + "epoch": 0.740036231884058, + "grad_norm": 4.6170587748580765, + "learning_rate": 1.6702207999129515e-07, + "loss": 0.3274, + "step": 4085 + }, + { + "epoch": 0.7402173913043478, + "grad_norm": 4.300091182191505, + "learning_rate": 1.6680327289221267e-07, + "loss": 0.3186, + "step": 4086 + }, + { + "epoch": 0.7403985507246377, + "grad_norm": 3.9430702549675587, + "learning_rate": 1.6658458051434193e-07, + "loss": 0.2607, + "step": 4087 + }, + { + "epoch": 0.7405797101449275, + "grad_norm": 6.253163173036239, + "learning_rate": 1.6636600293298003e-07, + "loss": 0.284, + "step": 4088 + }, + { + "epoch": 0.7407608695652174, + "grad_norm": 3.676676767813719, + "learning_rate": 1.6614754022338412e-07, + "loss": 0.2747, + "step": 4089 + }, + { + "epoch": 0.7409420289855072, + "grad_norm": 5.784409718586341, + "learning_rate": 1.659291924607719e-07, + "loss": 0.2615, + "step": 4090 + }, + { + "epoch": 0.741123188405797, + "grad_norm": 4.577076389372262, + "learning_rate": 1.6571095972032161e-07, + "loss": 0.2761, + "step": 4091 + }, + { + "epoch": 0.741304347826087, + "grad_norm": 5.024957066205122, + "learning_rate": 1.654928420771718e-07, + "loss": 0.2974, + "step": 4092 + }, + { + "epoch": 0.7414855072463769, + "grad_norm": 4.326184341671534, + "learning_rate": 1.6527483960642135e-07, + "loss": 0.2399, + "step": 4093 + }, + { + "epoch": 0.7416666666666667, + "grad_norm": 11.610959921859596, + "learning_rate": 1.6505695238312944e-07, + "loss": 0.3054, + "step": 4094 + }, + { + "epoch": 0.7418478260869565, + "grad_norm": 5.813119547517348, + "learning_rate": 1.64839180482316e-07, + "loss": 0.3284, + "step": 4095 + }, + { + "epoch": 0.7420289855072464, + "grad_norm": 10.051182128455602, + "learning_rate": 1.646215239789609e-07, + "loss": 0.3024, + "step": 4096 + }, + { + "epoch": 0.7422101449275362, + "grad_norm": 6.03298777596573, + "learning_rate": 1.6440398294800394e-07, + "loss": 0.3374, + "step": 4097 + }, + { + "epoch": 0.7423913043478261, + "grad_norm": 4.861400252041628, + "learning_rate": 1.6418655746434563e-07, + "loss": 0.2641, + "step": 4098 + }, + { + "epoch": 0.7425724637681159, + "grad_norm": 4.274509348863295, + "learning_rate": 1.639692476028468e-07, + "loss": 0.2832, + "step": 4099 + }, + { + "epoch": 0.7427536231884058, + "grad_norm": 8.73539491030266, + "learning_rate": 1.6375205343832825e-07, + "loss": 0.2552, + "step": 4100 + }, + { + "epoch": 0.7427536231884058, + "eval_loss": 0.2782343626022339, + "eval_runtime": 9.7517, + "eval_samples_per_second": 51.273, + "eval_steps_per_second": 0.103, + "step": 4100 + }, + { + "epoch": 0.7429347826086956, + "grad_norm": 6.997640893148164, + "learning_rate": 1.6353497504557085e-07, + "loss": 0.3903, + "step": 4101 + }, + { + "epoch": 0.7431159420289855, + "grad_norm": 3.405098061964635, + "learning_rate": 1.633180124993157e-07, + "loss": 0.2123, + "step": 4102 + }, + { + "epoch": 0.7432971014492754, + "grad_norm": 5.5333548242744435, + "learning_rate": 1.6310116587426415e-07, + "loss": 0.3348, + "step": 4103 + }, + { + "epoch": 0.7434782608695653, + "grad_norm": 6.414892445450211, + "learning_rate": 1.628844352450774e-07, + "loss": 0.2959, + "step": 4104 + }, + { + "epoch": 0.7436594202898551, + "grad_norm": 3.7926079170974045, + "learning_rate": 1.6266782068637692e-07, + "loss": 0.294, + "step": 4105 + }, + { + "epoch": 0.743840579710145, + "grad_norm": 7.616807850241395, + "learning_rate": 1.6245132227274406e-07, + "loss": 0.3031, + "step": 4106 + }, + { + "epoch": 0.7440217391304348, + "grad_norm": 9.835890341953727, + "learning_rate": 1.622349400787203e-07, + "loss": 0.3392, + "step": 4107 + }, + { + "epoch": 0.7442028985507246, + "grad_norm": 8.789924256009957, + "learning_rate": 1.6201867417880699e-07, + "loss": 0.2862, + "step": 4108 + }, + { + "epoch": 0.7443840579710145, + "grad_norm": 7.889504668661902, + "learning_rate": 1.6180252464746558e-07, + "loss": 0.2928, + "step": 4109 + }, + { + "epoch": 0.7445652173913043, + "grad_norm": 8.359257683253054, + "learning_rate": 1.615864915591172e-07, + "loss": 0.2426, + "step": 4110 + }, + { + "epoch": 0.7447463768115942, + "grad_norm": 7.616949080245882, + "learning_rate": 1.6137057498814338e-07, + "loss": 0.3115, + "step": 4111 + }, + { + "epoch": 0.744927536231884, + "grad_norm": 3.9817073099023292, + "learning_rate": 1.6115477500888518e-07, + "loss": 0.3411, + "step": 4112 + }, + { + "epoch": 0.7451086956521739, + "grad_norm": 6.522618790958368, + "learning_rate": 1.609390916956433e-07, + "loss": 0.343, + "step": 4113 + }, + { + "epoch": 0.7452898550724638, + "grad_norm": 5.04808828468488, + "learning_rate": 1.6072352512267846e-07, + "loss": 0.3392, + "step": 4114 + }, + { + "epoch": 0.7454710144927537, + "grad_norm": 5.154141129836924, + "learning_rate": 1.6050807536421163e-07, + "loss": 0.2626, + "step": 4115 + }, + { + "epoch": 0.7456521739130435, + "grad_norm": 5.605569529132893, + "learning_rate": 1.6029274249442299e-07, + "loss": 0.2807, + "step": 4116 + }, + { + "epoch": 0.7458333333333333, + "grad_norm": 4.878872963968999, + "learning_rate": 1.6007752658745267e-07, + "loss": 0.3379, + "step": 4117 + }, + { + "epoch": 0.7460144927536232, + "grad_norm": 4.022436621642571, + "learning_rate": 1.5986242771740056e-07, + "loss": 0.2585, + "step": 4118 + }, + { + "epoch": 0.746195652173913, + "grad_norm": 4.520911370530145, + "learning_rate": 1.5964744595832614e-07, + "loss": 0.2227, + "step": 4119 + }, + { + "epoch": 0.7463768115942029, + "grad_norm": 5.066824652835237, + "learning_rate": 1.5943258138424875e-07, + "loss": 0.2846, + "step": 4120 + }, + { + "epoch": 0.7465579710144927, + "grad_norm": 6.506074663360499, + "learning_rate": 1.5921783406914724e-07, + "loss": 0.2517, + "step": 4121 + }, + { + "epoch": 0.7467391304347826, + "grad_norm": 4.407061276506215, + "learning_rate": 1.5900320408696007e-07, + "loss": 0.2575, + "step": 4122 + }, + { + "epoch": 0.7469202898550724, + "grad_norm": 4.093206575359533, + "learning_rate": 1.5878869151158542e-07, + "loss": 0.2934, + "step": 4123 + }, + { + "epoch": 0.7471014492753624, + "grad_norm": 3.5818064242370404, + "learning_rate": 1.5857429641688097e-07, + "loss": 0.2489, + "step": 4124 + }, + { + "epoch": 0.7472826086956522, + "grad_norm": 8.332450278959207, + "learning_rate": 1.58360018876664e-07, + "loss": 0.2742, + "step": 4125 + }, + { + "epoch": 0.7474637681159421, + "grad_norm": 4.117337045630649, + "learning_rate": 1.581458589647111e-07, + "loss": 0.3022, + "step": 4126 + }, + { + "epoch": 0.7476449275362319, + "grad_norm": 7.547400067842371, + "learning_rate": 1.5793181675475885e-07, + "loss": 0.3691, + "step": 4127 + }, + { + "epoch": 0.7478260869565218, + "grad_norm": 3.9307806662900324, + "learning_rate": 1.57717892320503e-07, + "loss": 0.2103, + "step": 4128 + }, + { + "epoch": 0.7480072463768116, + "grad_norm": 4.483912691908728, + "learning_rate": 1.5750408573559827e-07, + "loss": 0.3202, + "step": 4129 + }, + { + "epoch": 0.7481884057971014, + "grad_norm": 3.648510635186452, + "learning_rate": 1.5729039707365977e-07, + "loss": 0.2698, + "step": 4130 + }, + { + "epoch": 0.7483695652173913, + "grad_norm": 7.938871952174254, + "learning_rate": 1.570768264082613e-07, + "loss": 0.3275, + "step": 4131 + }, + { + "epoch": 0.7485507246376811, + "grad_norm": 8.708073803465622, + "learning_rate": 1.5686337381293635e-07, + "loss": 0.3507, + "step": 4132 + }, + { + "epoch": 0.748731884057971, + "grad_norm": 3.471292359775845, + "learning_rate": 1.566500393611776e-07, + "loss": 0.2842, + "step": 4133 + }, + { + "epoch": 0.7489130434782608, + "grad_norm": 4.23335422597505, + "learning_rate": 1.5643682312643714e-07, + "loss": 0.2565, + "step": 4134 + }, + { + "epoch": 0.7490942028985508, + "grad_norm": 4.407386327956635, + "learning_rate": 1.562237251821263e-07, + "loss": 0.2333, + "step": 4135 + }, + { + "epoch": 0.7492753623188406, + "grad_norm": 4.846907412454899, + "learning_rate": 1.560107456016157e-07, + "loss": 0.261, + "step": 4136 + }, + { + "epoch": 0.7494565217391305, + "grad_norm": 3.6805621141324236, + "learning_rate": 1.5579788445823512e-07, + "loss": 0.2783, + "step": 4137 + }, + { + "epoch": 0.7496376811594203, + "grad_norm": 4.3644613723940076, + "learning_rate": 1.5558514182527392e-07, + "loss": 0.2745, + "step": 4138 + }, + { + "epoch": 0.7498188405797102, + "grad_norm": 4.645181489364351, + "learning_rate": 1.5537251777598043e-07, + "loss": 0.2913, + "step": 4139 + }, + { + "epoch": 0.75, + "grad_norm": 4.069215011046552, + "learning_rate": 1.551600123835618e-07, + "loss": 0.2611, + "step": 4140 + }, + { + "epoch": 0.7501811594202898, + "grad_norm": 8.70871320591032, + "learning_rate": 1.5494762572118464e-07, + "loss": 0.2652, + "step": 4141 + }, + { + "epoch": 0.7503623188405797, + "grad_norm": 3.677680153020506, + "learning_rate": 1.54735357861975e-07, + "loss": 0.324, + "step": 4142 + }, + { + "epoch": 0.7505434782608695, + "grad_norm": 4.291613794269187, + "learning_rate": 1.5452320887901755e-07, + "loss": 0.2715, + "step": 4143 + }, + { + "epoch": 0.7507246376811594, + "grad_norm": 8.488776567402915, + "learning_rate": 1.543111788453561e-07, + "loss": 0.2785, + "step": 4144 + }, + { + "epoch": 0.7509057971014492, + "grad_norm": 4.370941696431717, + "learning_rate": 1.540992678339938e-07, + "loss": 0.3212, + "step": 4145 + }, + { + "epoch": 0.7510869565217392, + "grad_norm": 4.909632496990874, + "learning_rate": 1.5388747591789242e-07, + "loss": 0.2714, + "step": 4146 + }, + { + "epoch": 0.751268115942029, + "grad_norm": 3.9703608273347317, + "learning_rate": 1.5367580316997302e-07, + "loss": 0.3184, + "step": 4147 + }, + { + "epoch": 0.7514492753623189, + "grad_norm": 4.362969990390846, + "learning_rate": 1.534642496631155e-07, + "loss": 0.3372, + "step": 4148 + }, + { + "epoch": 0.7516304347826087, + "grad_norm": 5.1601994634830595, + "learning_rate": 1.5325281547015879e-07, + "loss": 0.264, + "step": 4149 + }, + { + "epoch": 0.7518115942028986, + "grad_norm": 5.3739980521431905, + "learning_rate": 1.530415006639006e-07, + "loss": 0.2723, + "step": 4150 + }, + { + "epoch": 0.7519927536231884, + "grad_norm": 4.5117810317927285, + "learning_rate": 1.5283030531709763e-07, + "loss": 0.3054, + "step": 4151 + }, + { + "epoch": 0.7521739130434782, + "grad_norm": 4.2660004626055255, + "learning_rate": 1.5261922950246548e-07, + "loss": 0.2832, + "step": 4152 + }, + { + "epoch": 0.7523550724637681, + "grad_norm": 5.480095079200267, + "learning_rate": 1.5240827329267835e-07, + "loss": 0.2521, + "step": 4153 + }, + { + "epoch": 0.7525362318840579, + "grad_norm": 7.851072070959068, + "learning_rate": 1.521974367603699e-07, + "loss": 0.2841, + "step": 4154 + }, + { + "epoch": 0.7527173913043478, + "grad_norm": 3.8152780757210905, + "learning_rate": 1.5198671997813195e-07, + "loss": 0.236, + "step": 4155 + }, + { + "epoch": 0.7528985507246376, + "grad_norm": 5.010142250718976, + "learning_rate": 1.517761230185151e-07, + "loss": 0.3008, + "step": 4156 + }, + { + "epoch": 0.7530797101449276, + "grad_norm": 3.626894823200263, + "learning_rate": 1.5156564595402894e-07, + "loss": 0.2288, + "step": 4157 + }, + { + "epoch": 0.7532608695652174, + "grad_norm": 5.786197465607927, + "learning_rate": 1.513552888571419e-07, + "loss": 0.3024, + "step": 4158 + }, + { + "epoch": 0.7534420289855073, + "grad_norm": 4.253387001983209, + "learning_rate": 1.5114505180028075e-07, + "loss": 0.2839, + "step": 4159 + }, + { + "epoch": 0.7536231884057971, + "grad_norm": 4.481154963364705, + "learning_rate": 1.5093493485583126e-07, + "loss": 0.2959, + "step": 4160 + }, + { + "epoch": 0.753804347826087, + "grad_norm": 7.810375492123668, + "learning_rate": 1.5072493809613756e-07, + "loss": 0.3178, + "step": 4161 + }, + { + "epoch": 0.7539855072463768, + "grad_norm": 6.590657181605559, + "learning_rate": 1.5051506159350257e-07, + "loss": 0.3199, + "step": 4162 + }, + { + "epoch": 0.7541666666666667, + "grad_norm": 5.370073227049868, + "learning_rate": 1.5030530542018784e-07, + "loss": 0.2837, + "step": 4163 + }, + { + "epoch": 0.7543478260869565, + "grad_norm": 3.219210745592578, + "learning_rate": 1.5009566964841313e-07, + "loss": 0.181, + "step": 4164 + }, + { + "epoch": 0.7545289855072463, + "grad_norm": 4.216171026106692, + "learning_rate": 1.498861543503574e-07, + "loss": 0.2444, + "step": 4165 + }, + { + "epoch": 0.7547101449275362, + "grad_norm": 4.0956290985620685, + "learning_rate": 1.4967675959815772e-07, + "loss": 0.2893, + "step": 4166 + }, + { + "epoch": 0.7548913043478261, + "grad_norm": 3.579762785926032, + "learning_rate": 1.4946748546390947e-07, + "loss": 0.2187, + "step": 4167 + }, + { + "epoch": 0.755072463768116, + "grad_norm": 4.42191594003051, + "learning_rate": 1.492583320196667e-07, + "loss": 0.2939, + "step": 4168 + }, + { + "epoch": 0.7552536231884058, + "grad_norm": 6.8527628806133905, + "learning_rate": 1.4904929933744215e-07, + "loss": 0.2354, + "step": 4169 + }, + { + "epoch": 0.7554347826086957, + "grad_norm": 8.513229368530343, + "learning_rate": 1.4884038748920674e-07, + "loss": 0.2917, + "step": 4170 + }, + { + "epoch": 0.7556159420289855, + "grad_norm": 3.474828642165609, + "learning_rate": 1.4863159654688973e-07, + "loss": 0.252, + "step": 4171 + }, + { + "epoch": 0.7557971014492754, + "grad_norm": 4.808820151260308, + "learning_rate": 1.4842292658237883e-07, + "loss": 0.2598, + "step": 4172 + }, + { + "epoch": 0.7559782608695652, + "grad_norm": 5.058108048547361, + "learning_rate": 1.482143776675201e-07, + "loss": 0.2723, + "step": 4173 + }, + { + "epoch": 0.756159420289855, + "grad_norm": 7.044222693881663, + "learning_rate": 1.4800594987411797e-07, + "loss": 0.2658, + "step": 4174 + }, + { + "epoch": 0.7563405797101449, + "grad_norm": 4.213877280511019, + "learning_rate": 1.4779764327393507e-07, + "loss": 0.2941, + "step": 4175 + }, + { + "epoch": 0.7565217391304347, + "grad_norm": 4.361878436395466, + "learning_rate": 1.4758945793869237e-07, + "loss": 0.2592, + "step": 4176 + }, + { + "epoch": 0.7567028985507246, + "grad_norm": 6.568963758929252, + "learning_rate": 1.4738139394006905e-07, + "loss": 0.317, + "step": 4177 + }, + { + "epoch": 0.7568840579710145, + "grad_norm": 3.803413229713382, + "learning_rate": 1.471734513497025e-07, + "loss": 0.2423, + "step": 4178 + }, + { + "epoch": 0.7570652173913044, + "grad_norm": 3.8079128979911303, + "learning_rate": 1.469656302391884e-07, + "loss": 0.2382, + "step": 4179 + }, + { + "epoch": 0.7572463768115942, + "grad_norm": 3.8940522193023743, + "learning_rate": 1.467579306800804e-07, + "loss": 0.2844, + "step": 4180 + }, + { + "epoch": 0.7574275362318841, + "grad_norm": 6.788700170594187, + "learning_rate": 1.465503527438907e-07, + "loss": 0.2817, + "step": 4181 + }, + { + "epoch": 0.7576086956521739, + "grad_norm": 6.452652737045557, + "learning_rate": 1.4634289650208936e-07, + "loss": 0.3367, + "step": 4182 + }, + { + "epoch": 0.7577898550724638, + "grad_norm": 3.9382792088577037, + "learning_rate": 1.4613556202610426e-07, + "loss": 0.2291, + "step": 4183 + }, + { + "epoch": 0.7579710144927536, + "grad_norm": 3.5800344995715885, + "learning_rate": 1.4592834938732167e-07, + "loss": 0.2698, + "step": 4184 + }, + { + "epoch": 0.7581521739130435, + "grad_norm": 4.390570948316386, + "learning_rate": 1.4572125865708617e-07, + "loss": 0.2936, + "step": 4185 + }, + { + "epoch": 0.7583333333333333, + "grad_norm": 6.913478110394954, + "learning_rate": 1.4551428990669994e-07, + "loss": 0.2905, + "step": 4186 + }, + { + "epoch": 0.7585144927536231, + "grad_norm": 7.941113618589702, + "learning_rate": 1.4530744320742327e-07, + "loss": 0.4074, + "step": 4187 + }, + { + "epoch": 0.758695652173913, + "grad_norm": 4.638788236343474, + "learning_rate": 1.4510071863047445e-07, + "loss": 0.2895, + "step": 4188 + }, + { + "epoch": 0.758876811594203, + "grad_norm": 3.8351442733602714, + "learning_rate": 1.4489411624702975e-07, + "loss": 0.2842, + "step": 4189 + }, + { + "epoch": 0.7590579710144928, + "grad_norm": 8.03330761938926, + "learning_rate": 1.4468763612822338e-07, + "loss": 0.3365, + "step": 4190 + }, + { + "epoch": 0.7592391304347826, + "grad_norm": 3.6270601815734165, + "learning_rate": 1.4448127834514738e-07, + "loss": 0.2734, + "step": 4191 + }, + { + "epoch": 0.7594202898550725, + "grad_norm": 6.530119672654416, + "learning_rate": 1.4427504296885172e-07, + "loss": 0.3629, + "step": 4192 + }, + { + "epoch": 0.7596014492753623, + "grad_norm": 4.8994945308352, + "learning_rate": 1.4406893007034426e-07, + "loss": 0.2542, + "step": 4193 + }, + { + "epoch": 0.7597826086956522, + "grad_norm": 5.781770686188912, + "learning_rate": 1.438629397205906e-07, + "loss": 0.239, + "step": 4194 + }, + { + "epoch": 0.759963768115942, + "grad_norm": 6.9974053713430155, + "learning_rate": 1.4365707199051418e-07, + "loss": 0.3015, + "step": 4195 + }, + { + "epoch": 0.7601449275362319, + "grad_norm": 3.685187860278038, + "learning_rate": 1.4345132695099615e-07, + "loss": 0.2604, + "step": 4196 + }, + { + "epoch": 0.7603260869565217, + "grad_norm": 4.616105592637974, + "learning_rate": 1.4324570467287572e-07, + "loss": 0.2524, + "step": 4197 + }, + { + "epoch": 0.7605072463768116, + "grad_norm": 3.754075937797778, + "learning_rate": 1.430402052269497e-07, + "loss": 0.2279, + "step": 4198 + }, + { + "epoch": 0.7606884057971014, + "grad_norm": 4.67240179142241, + "learning_rate": 1.4283482868397218e-07, + "loss": 0.3127, + "step": 4199 + }, + { + "epoch": 0.7608695652173914, + "grad_norm": 4.336504431516033, + "learning_rate": 1.4262957511465522e-07, + "loss": 0.2823, + "step": 4200 + }, + { + "epoch": 0.7608695652173914, + "eval_loss": 0.27192187309265137, + "eval_runtime": 9.8168, + "eval_samples_per_second": 50.933, + "eval_steps_per_second": 0.102, + "step": 4200 + }, + { + "epoch": 0.7610507246376812, + "grad_norm": 4.7301263936488525, + "learning_rate": 1.42424444589669e-07, + "loss": 0.3142, + "step": 4201 + }, + { + "epoch": 0.761231884057971, + "grad_norm": 7.178449701857179, + "learning_rate": 1.4221943717964074e-07, + "loss": 0.2501, + "step": 4202 + }, + { + "epoch": 0.7614130434782609, + "grad_norm": 4.371466132811722, + "learning_rate": 1.4201455295515547e-07, + "loss": 0.2727, + "step": 4203 + }, + { + "epoch": 0.7615942028985507, + "grad_norm": 6.684248286835398, + "learning_rate": 1.4180979198675575e-07, + "loss": 0.229, + "step": 4204 + }, + { + "epoch": 0.7617753623188406, + "grad_norm": 5.705066448837432, + "learning_rate": 1.416051543449418e-07, + "loss": 0.3169, + "step": 4205 + }, + { + "epoch": 0.7619565217391304, + "grad_norm": 3.8661632846651193, + "learning_rate": 1.4140064010017134e-07, + "loss": 0.3196, + "step": 4206 + }, + { + "epoch": 0.7621376811594203, + "grad_norm": 4.329415578700389, + "learning_rate": 1.4119624932285939e-07, + "loss": 0.2486, + "step": 4207 + }, + { + "epoch": 0.7623188405797101, + "grad_norm": 5.556767223400084, + "learning_rate": 1.4099198208337905e-07, + "loss": 0.306, + "step": 4208 + }, + { + "epoch": 0.7625, + "grad_norm": 3.652407110653765, + "learning_rate": 1.4078783845206045e-07, + "loss": 0.1817, + "step": 4209 + }, + { + "epoch": 0.7626811594202898, + "grad_norm": 4.486455444170379, + "learning_rate": 1.4058381849919083e-07, + "loss": 0.3275, + "step": 4210 + }, + { + "epoch": 0.7628623188405798, + "grad_norm": 8.492892926974811, + "learning_rate": 1.4037992229501533e-07, + "loss": 0.3127, + "step": 4211 + }, + { + "epoch": 0.7630434782608696, + "grad_norm": 4.980890710303177, + "learning_rate": 1.4017614990973663e-07, + "loss": 0.2785, + "step": 4212 + }, + { + "epoch": 0.7632246376811594, + "grad_norm": 4.2119146247255355, + "learning_rate": 1.3997250141351447e-07, + "loss": 0.2526, + "step": 4213 + }, + { + "epoch": 0.7634057971014493, + "grad_norm": 10.343217546623658, + "learning_rate": 1.3976897687646584e-07, + "loss": 0.2785, + "step": 4214 + }, + { + "epoch": 0.7635869565217391, + "grad_norm": 4.545859529884203, + "learning_rate": 1.3956557636866534e-07, + "loss": 0.256, + "step": 4215 + }, + { + "epoch": 0.763768115942029, + "grad_norm": 3.624383069864175, + "learning_rate": 1.3936229996014464e-07, + "loss": 0.2658, + "step": 4216 + }, + { + "epoch": 0.7639492753623188, + "grad_norm": 4.112704435755177, + "learning_rate": 1.3915914772089281e-07, + "loss": 0.2633, + "step": 4217 + }, + { + "epoch": 0.7641304347826087, + "grad_norm": 5.52640967342109, + "learning_rate": 1.3895611972085609e-07, + "loss": 0.2656, + "step": 4218 + }, + { + "epoch": 0.7643115942028985, + "grad_norm": 6.179450699028498, + "learning_rate": 1.3875321602993805e-07, + "loss": 0.31, + "step": 4219 + }, + { + "epoch": 0.7644927536231884, + "grad_norm": 8.567682621644742, + "learning_rate": 1.385504367179993e-07, + "loss": 0.2313, + "step": 4220 + }, + { + "epoch": 0.7646739130434783, + "grad_norm": 5.199140701796199, + "learning_rate": 1.3834778185485785e-07, + "loss": 0.2846, + "step": 4221 + }, + { + "epoch": 0.7648550724637682, + "grad_norm": 4.19093577587622, + "learning_rate": 1.381452515102886e-07, + "loss": 0.3342, + "step": 4222 + }, + { + "epoch": 0.765036231884058, + "grad_norm": 6.177618278834126, + "learning_rate": 1.3794284575402365e-07, + "loss": 0.2689, + "step": 4223 + }, + { + "epoch": 0.7652173913043478, + "grad_norm": 4.5386623077575665, + "learning_rate": 1.3774056465575247e-07, + "loss": 0.2402, + "step": 4224 + }, + { + "epoch": 0.7653985507246377, + "grad_norm": 6.323743985541725, + "learning_rate": 1.3753840828512148e-07, + "loss": 0.2587, + "step": 4225 + }, + { + "epoch": 0.7655797101449275, + "grad_norm": 6.098581096792941, + "learning_rate": 1.3733637671173375e-07, + "loss": 0.2495, + "step": 4226 + }, + { + "epoch": 0.7657608695652174, + "grad_norm": 5.466505544348531, + "learning_rate": 1.3713447000514967e-07, + "loss": 0.2788, + "step": 4227 + }, + { + "epoch": 0.7659420289855072, + "grad_norm": 8.223041813063077, + "learning_rate": 1.36932688234887e-07, + "loss": 0.3181, + "step": 4228 + }, + { + "epoch": 0.7661231884057971, + "grad_norm": 6.39944566226906, + "learning_rate": 1.367310314704201e-07, + "loss": 0.2869, + "step": 4229 + }, + { + "epoch": 0.7663043478260869, + "grad_norm": 3.844702541479501, + "learning_rate": 1.3652949978118021e-07, + "loss": 0.2424, + "step": 4230 + }, + { + "epoch": 0.7664855072463768, + "grad_norm": 4.2510147854212565, + "learning_rate": 1.363280932365557e-07, + "loss": 0.2825, + "step": 4231 + }, + { + "epoch": 0.7666666666666667, + "grad_norm": 6.29174491356392, + "learning_rate": 1.3612681190589183e-07, + "loss": 0.2799, + "step": 4232 + }, + { + "epoch": 0.7668478260869566, + "grad_norm": 9.719189928359546, + "learning_rate": 1.359256558584907e-07, + "loss": 0.2976, + "step": 4233 + }, + { + "epoch": 0.7670289855072464, + "grad_norm": 4.6877384723755995, + "learning_rate": 1.357246251636112e-07, + "loss": 0.2967, + "step": 4234 + }, + { + "epoch": 0.7672101449275363, + "grad_norm": 5.024022407328368, + "learning_rate": 1.3552371989046917e-07, + "loss": 0.3116, + "step": 4235 + }, + { + "epoch": 0.7673913043478261, + "grad_norm": 7.6677217400183215, + "learning_rate": 1.3532294010823757e-07, + "loss": 0.2378, + "step": 4236 + }, + { + "epoch": 0.7675724637681159, + "grad_norm": 6.981662881687111, + "learning_rate": 1.3512228588604542e-07, + "loss": 0.2849, + "step": 4237 + }, + { + "epoch": 0.7677536231884058, + "grad_norm": 4.880698556120381, + "learning_rate": 1.349217572929789e-07, + "loss": 0.2538, + "step": 4238 + }, + { + "epoch": 0.7679347826086956, + "grad_norm": 5.136854827223748, + "learning_rate": 1.347213543980813e-07, + "loss": 0.3288, + "step": 4239 + }, + { + "epoch": 0.7681159420289855, + "grad_norm": 3.766387750092391, + "learning_rate": 1.3452107727035212e-07, + "loss": 0.2618, + "step": 4240 + }, + { + "epoch": 0.7682971014492753, + "grad_norm": 4.636147779595781, + "learning_rate": 1.3432092597874768e-07, + "loss": 0.2407, + "step": 4241 + }, + { + "epoch": 0.7684782608695652, + "grad_norm": 4.165515611198434, + "learning_rate": 1.34120900592181e-07, + "loss": 0.246, + "step": 4242 + }, + { + "epoch": 0.7686594202898551, + "grad_norm": 4.966108961627089, + "learning_rate": 1.3392100117952189e-07, + "loss": 0.2666, + "step": 4243 + }, + { + "epoch": 0.768840579710145, + "grad_norm": 3.782537784382739, + "learning_rate": 1.3372122780959654e-07, + "loss": 0.2381, + "step": 4244 + }, + { + "epoch": 0.7690217391304348, + "grad_norm": 6.500482548869603, + "learning_rate": 1.33521580551188e-07, + "loss": 0.2378, + "step": 4245 + }, + { + "epoch": 0.7692028985507247, + "grad_norm": 5.397272885702307, + "learning_rate": 1.3332205947303564e-07, + "loss": 0.2235, + "step": 4246 + }, + { + "epoch": 0.7693840579710145, + "grad_norm": 4.6291956763175035, + "learning_rate": 1.3312266464383553e-07, + "loss": 0.3353, + "step": 4247 + }, + { + "epoch": 0.7695652173913043, + "grad_norm": 5.04456172886357, + "learning_rate": 1.3292339613224036e-07, + "loss": 0.2473, + "step": 4248 + }, + { + "epoch": 0.7697463768115942, + "grad_norm": 4.610438720089957, + "learning_rate": 1.3272425400685905e-07, + "loss": 0.229, + "step": 4249 + }, + { + "epoch": 0.769927536231884, + "grad_norm": 3.5357954968155334, + "learning_rate": 1.3252523833625717e-07, + "loss": 0.26, + "step": 4250 + }, + { + "epoch": 0.7701086956521739, + "grad_norm": 5.518110853842271, + "learning_rate": 1.3232634918895702e-07, + "loss": 0.286, + "step": 4251 + }, + { + "epoch": 0.7702898550724637, + "grad_norm": 5.276677152342091, + "learning_rate": 1.3212758663343708e-07, + "loss": 0.2625, + "step": 4252 + }, + { + "epoch": 0.7704710144927536, + "grad_norm": 5.4594377001446555, + "learning_rate": 1.3192895073813193e-07, + "loss": 0.2956, + "step": 4253 + }, + { + "epoch": 0.7706521739130435, + "grad_norm": 5.219013116171993, + "learning_rate": 1.3173044157143282e-07, + "loss": 0.2343, + "step": 4254 + }, + { + "epoch": 0.7708333333333334, + "grad_norm": 8.742759598778866, + "learning_rate": 1.3153205920168775e-07, + "loss": 0.3239, + "step": 4255 + }, + { + "epoch": 0.7710144927536232, + "grad_norm": 3.616740810665911, + "learning_rate": 1.3133380369720055e-07, + "loss": 0.2605, + "step": 4256 + }, + { + "epoch": 0.7711956521739131, + "grad_norm": 3.917254595375453, + "learning_rate": 1.3113567512623147e-07, + "loss": 0.2941, + "step": 4257 + }, + { + "epoch": 0.7713768115942029, + "grad_norm": 4.03465354171425, + "learning_rate": 1.3093767355699715e-07, + "loss": 0.2528, + "step": 4258 + }, + { + "epoch": 0.7715579710144927, + "grad_norm": 4.118308765400935, + "learning_rate": 1.307397990576705e-07, + "loss": 0.2932, + "step": 4259 + }, + { + "epoch": 0.7717391304347826, + "grad_norm": 5.919875582433207, + "learning_rate": 1.3054205169638065e-07, + "loss": 0.2896, + "step": 4260 + }, + { + "epoch": 0.7719202898550724, + "grad_norm": 3.9386329365764245, + "learning_rate": 1.303444315412129e-07, + "loss": 0.2594, + "step": 4261 + }, + { + "epoch": 0.7721014492753623, + "grad_norm": 8.446124708946124, + "learning_rate": 1.3014693866020881e-07, + "loss": 0.2993, + "step": 4262 + }, + { + "epoch": 0.7722826086956521, + "grad_norm": 3.7518634564241, + "learning_rate": 1.2994957312136622e-07, + "loss": 0.2457, + "step": 4263 + }, + { + "epoch": 0.7724637681159421, + "grad_norm": 5.277053300889531, + "learning_rate": 1.2975233499263887e-07, + "loss": 0.3564, + "step": 4264 + }, + { + "epoch": 0.7726449275362319, + "grad_norm": 3.4737252317609815, + "learning_rate": 1.295552243419369e-07, + "loss": 0.2466, + "step": 4265 + }, + { + "epoch": 0.7728260869565218, + "grad_norm": 4.955076209571368, + "learning_rate": 1.293582412371262e-07, + "loss": 0.2694, + "step": 4266 + }, + { + "epoch": 0.7730072463768116, + "grad_norm": 3.829758219547843, + "learning_rate": 1.2916138574602937e-07, + "loss": 0.2077, + "step": 4267 + }, + { + "epoch": 0.7731884057971015, + "grad_norm": 7.940474059004807, + "learning_rate": 1.2896465793642459e-07, + "loss": 0.3445, + "step": 4268 + }, + { + "epoch": 0.7733695652173913, + "grad_norm": 3.771841398545769, + "learning_rate": 1.28768057876046e-07, + "loss": 0.3044, + "step": 4269 + }, + { + "epoch": 0.7735507246376812, + "grad_norm": 7.247392127189758, + "learning_rate": 1.285715856325838e-07, + "loss": 0.2571, + "step": 4270 + }, + { + "epoch": 0.773731884057971, + "grad_norm": 3.9908414542369544, + "learning_rate": 1.2837524127368477e-07, + "loss": 0.2574, + "step": 4271 + }, + { + "epoch": 0.7739130434782608, + "grad_norm": 6.57559591721725, + "learning_rate": 1.2817902486695088e-07, + "loss": 0.2859, + "step": 4272 + }, + { + "epoch": 0.7740942028985507, + "grad_norm": 4.360375468246992, + "learning_rate": 1.279829364799405e-07, + "loss": 0.2895, + "step": 4273 + }, + { + "epoch": 0.7742753623188405, + "grad_norm": 12.665574219810782, + "learning_rate": 1.2778697618016772e-07, + "loss": 0.2782, + "step": 4274 + }, + { + "epoch": 0.7744565217391305, + "grad_norm": 4.2619959146898445, + "learning_rate": 1.2759114403510262e-07, + "loss": 0.2907, + "step": 4275 + }, + { + "epoch": 0.7746376811594203, + "grad_norm": 4.312343028709694, + "learning_rate": 1.273954401121712e-07, + "loss": 0.2841, + "step": 4276 + }, + { + "epoch": 0.7748188405797102, + "grad_norm": 9.039239924794582, + "learning_rate": 1.2719986447875497e-07, + "loss": 0.3053, + "step": 4277 + }, + { + "epoch": 0.775, + "grad_norm": 4.791390608295223, + "learning_rate": 1.270044172021919e-07, + "loss": 0.2412, + "step": 4278 + }, + { + "epoch": 0.7751811594202899, + "grad_norm": 3.637247407813648, + "learning_rate": 1.268090983497755e-07, + "loss": 0.262, + "step": 4279 + }, + { + "epoch": 0.7753623188405797, + "grad_norm": 3.7610351208892916, + "learning_rate": 1.2661390798875453e-07, + "loss": 0.2452, + "step": 4280 + }, + { + "epoch": 0.7755434782608696, + "grad_norm": 5.234613384236787, + "learning_rate": 1.2641884618633408e-07, + "loss": 0.2431, + "step": 4281 + }, + { + "epoch": 0.7757246376811594, + "grad_norm": 6.967885374647666, + "learning_rate": 1.262239130096751e-07, + "loss": 0.2687, + "step": 4282 + }, + { + "epoch": 0.7759057971014492, + "grad_norm": 4.134985338581212, + "learning_rate": 1.2602910852589382e-07, + "loss": 0.2769, + "step": 4283 + }, + { + "epoch": 0.7760869565217391, + "grad_norm": 5.074025185329541, + "learning_rate": 1.258344328020624e-07, + "loss": 0.2731, + "step": 4284 + }, + { + "epoch": 0.7762681159420289, + "grad_norm": 7.451443342150492, + "learning_rate": 1.2563988590520864e-07, + "loss": 0.3038, + "step": 4285 + }, + { + "epoch": 0.7764492753623189, + "grad_norm": 3.5768365120156003, + "learning_rate": 1.2544546790231587e-07, + "loss": 0.2547, + "step": 4286 + }, + { + "epoch": 0.7766304347826087, + "grad_norm": 4.183702271956826, + "learning_rate": 1.252511788603232e-07, + "loss": 0.2826, + "step": 4287 + }, + { + "epoch": 0.7768115942028986, + "grad_norm": 5.8009154904728595, + "learning_rate": 1.2505701884612524e-07, + "loss": 0.226, + "step": 4288 + }, + { + "epoch": 0.7769927536231884, + "grad_norm": 4.7817198402104815, + "learning_rate": 1.2486298792657223e-07, + "loss": 0.333, + "step": 4289 + }, + { + "epoch": 0.7771739130434783, + "grad_norm": 5.260721823023794, + "learning_rate": 1.246690861684699e-07, + "loss": 0.2512, + "step": 4290 + }, + { + "epoch": 0.7773550724637681, + "grad_norm": 5.547021387034899, + "learning_rate": 1.2447531363857955e-07, + "loss": 0.2807, + "step": 4291 + }, + { + "epoch": 0.777536231884058, + "grad_norm": 3.609036974458032, + "learning_rate": 1.2428167040361797e-07, + "loss": 0.2164, + "step": 4292 + }, + { + "epoch": 0.7777173913043478, + "grad_norm": 6.372754101043129, + "learning_rate": 1.2408815653025734e-07, + "loss": 0.2881, + "step": 4293 + }, + { + "epoch": 0.7778985507246376, + "grad_norm": 6.880907363032734, + "learning_rate": 1.2389477208512565e-07, + "loss": 0.3198, + "step": 4294 + }, + { + "epoch": 0.7780797101449275, + "grad_norm": 5.1045788428220895, + "learning_rate": 1.2370151713480614e-07, + "loss": 0.3468, + "step": 4295 + }, + { + "epoch": 0.7782608695652173, + "grad_norm": 6.342279152312474, + "learning_rate": 1.2350839174583706e-07, + "loss": 0.2995, + "step": 4296 + }, + { + "epoch": 0.7784420289855073, + "grad_norm": 5.960982001109151, + "learning_rate": 1.2331539598471235e-07, + "loss": 0.404, + "step": 4297 + }, + { + "epoch": 0.7786231884057971, + "grad_norm": 6.714653132956193, + "learning_rate": 1.231225299178818e-07, + "loss": 0.3202, + "step": 4298 + }, + { + "epoch": 0.778804347826087, + "grad_norm": 8.308904574850047, + "learning_rate": 1.2292979361174982e-07, + "loss": 0.2458, + "step": 4299 + }, + { + "epoch": 0.7789855072463768, + "grad_norm": 6.034433055648776, + "learning_rate": 1.2273718713267655e-07, + "loss": 0.2302, + "step": 4300 + }, + { + "epoch": 0.7789855072463768, + "eval_loss": 0.2721562385559082, + "eval_runtime": 9.7999, + "eval_samples_per_second": 51.021, + "eval_steps_per_second": 0.102, + "step": 4300 + }, + { + "epoch": 0.7791666666666667, + "grad_norm": 3.9641529430639197, + "learning_rate": 1.2254471054697724e-07, + "loss": 0.2313, + "step": 4301 + }, + { + "epoch": 0.7793478260869565, + "grad_norm": 4.2347563972297255, + "learning_rate": 1.2235236392092247e-07, + "loss": 0.2908, + "step": 4302 + }, + { + "epoch": 0.7795289855072464, + "grad_norm": 5.560438683980006, + "learning_rate": 1.2216014732073822e-07, + "loss": 0.2839, + "step": 4303 + }, + { + "epoch": 0.7797101449275362, + "grad_norm": 5.047560533143979, + "learning_rate": 1.219680608126054e-07, + "loss": 0.2195, + "step": 4304 + }, + { + "epoch": 0.779891304347826, + "grad_norm": 4.251025729695162, + "learning_rate": 1.217761044626603e-07, + "loss": 0.2908, + "step": 4305 + }, + { + "epoch": 0.7800724637681159, + "grad_norm": 4.081770458735122, + "learning_rate": 1.2158427833699475e-07, + "loss": 0.3004, + "step": 4306 + }, + { + "epoch": 0.7802536231884057, + "grad_norm": 7.901593041828544, + "learning_rate": 1.2139258250165502e-07, + "loss": 0.3129, + "step": 4307 + }, + { + "epoch": 0.7804347826086957, + "grad_norm": 4.896613534680306, + "learning_rate": 1.2120101702264284e-07, + "loss": 0.2588, + "step": 4308 + }, + { + "epoch": 0.7806159420289855, + "grad_norm": 8.82432248622185, + "learning_rate": 1.210095819659155e-07, + "loss": 0.3273, + "step": 4309 + }, + { + "epoch": 0.7807971014492754, + "grad_norm": 4.169852595347561, + "learning_rate": 1.208182773973847e-07, + "loss": 0.2874, + "step": 4310 + }, + { + "epoch": 0.7809782608695652, + "grad_norm": 6.537371092731516, + "learning_rate": 1.2062710338291764e-07, + "loss": 0.2168, + "step": 4311 + }, + { + "epoch": 0.7811594202898551, + "grad_norm": 4.538257206402436, + "learning_rate": 1.204360599883364e-07, + "loss": 0.2805, + "step": 4312 + }, + { + "epoch": 0.7813405797101449, + "grad_norm": 5.3269107160191, + "learning_rate": 1.202451472794181e-07, + "loss": 0.2737, + "step": 4313 + }, + { + "epoch": 0.7815217391304348, + "grad_norm": 7.211691377268884, + "learning_rate": 1.2005436532189494e-07, + "loss": 0.316, + "step": 4314 + }, + { + "epoch": 0.7817028985507246, + "grad_norm": 6.396467829044089, + "learning_rate": 1.1986371418145398e-07, + "loss": 0.386, + "step": 4315 + }, + { + "epoch": 0.7818840579710145, + "grad_norm": 11.572665625743616, + "learning_rate": 1.1967319392373737e-07, + "loss": 0.2558, + "step": 4316 + }, + { + "epoch": 0.7820652173913043, + "grad_norm": 6.864690424074091, + "learning_rate": 1.1948280461434208e-07, + "loss": 0.2794, + "step": 4317 + }, + { + "epoch": 0.7822463768115943, + "grad_norm": 6.459327350400014, + "learning_rate": 1.1929254631882013e-07, + "loss": 0.2517, + "step": 4318 + }, + { + "epoch": 0.7824275362318841, + "grad_norm": 4.002285778979529, + "learning_rate": 1.1910241910267831e-07, + "loss": 0.3039, + "step": 4319 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 3.3428263625272843, + "learning_rate": 1.1891242303137816e-07, + "loss": 0.1966, + "step": 4320 + }, + { + "epoch": 0.7827898550724638, + "grad_norm": 4.269009188343704, + "learning_rate": 1.1872255817033655e-07, + "loss": 0.2655, + "step": 4321 + }, + { + "epoch": 0.7829710144927536, + "grad_norm": 3.6777823157651395, + "learning_rate": 1.1853282458492481e-07, + "loss": 0.2174, + "step": 4322 + }, + { + "epoch": 0.7831521739130435, + "grad_norm": 5.215493351557534, + "learning_rate": 1.1834322234046889e-07, + "loss": 0.3128, + "step": 4323 + }, + { + "epoch": 0.7833333333333333, + "grad_norm": 3.3498642280783435, + "learning_rate": 1.181537515022497e-07, + "loss": 0.2614, + "step": 4324 + }, + { + "epoch": 0.7835144927536232, + "grad_norm": 6.5430837453714545, + "learning_rate": 1.1796441213550323e-07, + "loss": 0.2397, + "step": 4325 + }, + { + "epoch": 0.783695652173913, + "grad_norm": 6.09958769098372, + "learning_rate": 1.1777520430541976e-07, + "loss": 0.2855, + "step": 4326 + }, + { + "epoch": 0.7838768115942029, + "grad_norm": 7.624891119846719, + "learning_rate": 1.1758612807714446e-07, + "loss": 0.274, + "step": 4327 + }, + { + "epoch": 0.7840579710144927, + "grad_norm": 4.417815433473692, + "learning_rate": 1.1739718351577721e-07, + "loss": 0.319, + "step": 4328 + }, + { + "epoch": 0.7842391304347827, + "grad_norm": 5.344259716517517, + "learning_rate": 1.1720837068637245e-07, + "loss": 0.2053, + "step": 4329 + }, + { + "epoch": 0.7844202898550725, + "grad_norm": 6.780602735357904, + "learning_rate": 1.1701968965393943e-07, + "loss": 0.2965, + "step": 4330 + }, + { + "epoch": 0.7846014492753624, + "grad_norm": 7.368422017759983, + "learning_rate": 1.1683114048344183e-07, + "loss": 0.269, + "step": 4331 + }, + { + "epoch": 0.7847826086956522, + "grad_norm": 5.265692290999796, + "learning_rate": 1.1664272323979801e-07, + "loss": 0.3048, + "step": 4332 + }, + { + "epoch": 0.784963768115942, + "grad_norm": 5.773739533904738, + "learning_rate": 1.1645443798788102e-07, + "loss": 0.2999, + "step": 4333 + }, + { + "epoch": 0.7851449275362319, + "grad_norm": 5.997367717396313, + "learning_rate": 1.1626628479251827e-07, + "loss": 0.3083, + "step": 4334 + }, + { + "epoch": 0.7853260869565217, + "grad_norm": 5.242432066263883, + "learning_rate": 1.1607826371849189e-07, + "loss": 0.2283, + "step": 4335 + }, + { + "epoch": 0.7855072463768116, + "grad_norm": 3.772756701272312, + "learning_rate": 1.1589037483053815e-07, + "loss": 0.2817, + "step": 4336 + }, + { + "epoch": 0.7856884057971014, + "grad_norm": 4.221995209084304, + "learning_rate": 1.1570261819334854e-07, + "loss": 0.2899, + "step": 4337 + }, + { + "epoch": 0.7858695652173913, + "grad_norm": 4.289148856890564, + "learning_rate": 1.1551499387156838e-07, + "loss": 0.2249, + "step": 4338 + }, + { + "epoch": 0.7860507246376811, + "grad_norm": 3.9027639821573583, + "learning_rate": 1.1532750192979745e-07, + "loss": 0.28, + "step": 4339 + }, + { + "epoch": 0.7862318840579711, + "grad_norm": 10.274929003346609, + "learning_rate": 1.1514014243259007e-07, + "loss": 0.3168, + "step": 4340 + }, + { + "epoch": 0.7864130434782609, + "grad_norm": 3.723581129625457, + "learning_rate": 1.1495291544445535e-07, + "loss": 0.2489, + "step": 4341 + }, + { + "epoch": 0.7865942028985508, + "grad_norm": 4.9455984601475595, + "learning_rate": 1.1476582102985616e-07, + "loss": 0.3217, + "step": 4342 + }, + { + "epoch": 0.7867753623188406, + "grad_norm": 4.8970597242938405, + "learning_rate": 1.145788592532101e-07, + "loss": 0.2731, + "step": 4343 + }, + { + "epoch": 0.7869565217391304, + "grad_norm": 9.31684561437788, + "learning_rate": 1.1439203017888899e-07, + "loss": 0.2532, + "step": 4344 + }, + { + "epoch": 0.7871376811594203, + "grad_norm": 5.143484648090279, + "learning_rate": 1.1420533387121889e-07, + "loss": 0.3276, + "step": 4345 + }, + { + "epoch": 0.7873188405797101, + "grad_norm": 6.421568313001237, + "learning_rate": 1.1401877039448033e-07, + "loss": 0.2794, + "step": 4346 + }, + { + "epoch": 0.7875, + "grad_norm": 4.376016615786184, + "learning_rate": 1.1383233981290775e-07, + "loss": 0.2724, + "step": 4347 + }, + { + "epoch": 0.7876811594202898, + "grad_norm": 7.5639722310396555, + "learning_rate": 1.136460421906904e-07, + "loss": 0.298, + "step": 4348 + }, + { + "epoch": 0.7878623188405797, + "grad_norm": 3.7635990191806683, + "learning_rate": 1.134598775919715e-07, + "loss": 0.2614, + "step": 4349 + }, + { + "epoch": 0.7880434782608695, + "grad_norm": 4.867646163487617, + "learning_rate": 1.1327384608084801e-07, + "loss": 0.2857, + "step": 4350 + }, + { + "epoch": 0.7882246376811595, + "grad_norm": 4.345151585157378, + "learning_rate": 1.1308794772137159e-07, + "loss": 0.3154, + "step": 4351 + }, + { + "epoch": 0.7884057971014493, + "grad_norm": 4.5041696021371145, + "learning_rate": 1.1290218257754808e-07, + "loss": 0.2754, + "step": 4352 + }, + { + "epoch": 0.7885869565217392, + "grad_norm": 4.721472794674815, + "learning_rate": 1.1271655071333724e-07, + "loss": 0.2739, + "step": 4353 + }, + { + "epoch": 0.788768115942029, + "grad_norm": 6.334531397734939, + "learning_rate": 1.1253105219265297e-07, + "loss": 0.2805, + "step": 4354 + }, + { + "epoch": 0.7889492753623188, + "grad_norm": 5.409477526323355, + "learning_rate": 1.1234568707936332e-07, + "loss": 0.2499, + "step": 4355 + }, + { + "epoch": 0.7891304347826087, + "grad_norm": 5.742348091227902, + "learning_rate": 1.121604554372903e-07, + "loss": 0.267, + "step": 4356 + }, + { + "epoch": 0.7893115942028985, + "grad_norm": 3.717893113109431, + "learning_rate": 1.1197535733021012e-07, + "loss": 0.2898, + "step": 4357 + }, + { + "epoch": 0.7894927536231884, + "grad_norm": 4.930278931744944, + "learning_rate": 1.1179039282185292e-07, + "loss": 0.263, + "step": 4358 + }, + { + "epoch": 0.7896739130434782, + "grad_norm": 5.787542121585203, + "learning_rate": 1.1160556197590288e-07, + "loss": 0.271, + "step": 4359 + }, + { + "epoch": 0.7898550724637681, + "grad_norm": 7.03213955596468, + "learning_rate": 1.1142086485599805e-07, + "loss": 0.2353, + "step": 4360 + }, + { + "epoch": 0.7900362318840579, + "grad_norm": 7.639632036770238, + "learning_rate": 1.112363015257306e-07, + "loss": 0.247, + "step": 4361 + }, + { + "epoch": 0.7902173913043479, + "grad_norm": 4.991909473610004, + "learning_rate": 1.1105187204864652e-07, + "loss": 0.3652, + "step": 4362 + }, + { + "epoch": 0.7903985507246377, + "grad_norm": 5.0503056546086045, + "learning_rate": 1.1086757648824568e-07, + "loss": 0.2707, + "step": 4363 + }, + { + "epoch": 0.7905797101449276, + "grad_norm": 4.87105653339573, + "learning_rate": 1.1068341490798211e-07, + "loss": 0.2922, + "step": 4364 + }, + { + "epoch": 0.7907608695652174, + "grad_norm": 5.611936823840022, + "learning_rate": 1.1049938737126352e-07, + "loss": 0.2849, + "step": 4365 + }, + { + "epoch": 0.7909420289855073, + "grad_norm": 6.987743600677044, + "learning_rate": 1.1031549394145123e-07, + "loss": 0.2533, + "step": 4366 + }, + { + "epoch": 0.7911231884057971, + "grad_norm": 4.936783841785507, + "learning_rate": 1.101317346818606e-07, + "loss": 0.287, + "step": 4367 + }, + { + "epoch": 0.7913043478260869, + "grad_norm": 6.360574917374179, + "learning_rate": 1.0994810965576113e-07, + "loss": 0.2127, + "step": 4368 + }, + { + "epoch": 0.7914855072463768, + "grad_norm": 3.7163582835044195, + "learning_rate": 1.0976461892637556e-07, + "loss": 0.2467, + "step": 4369 + }, + { + "epoch": 0.7916666666666666, + "grad_norm": 5.275052750252274, + "learning_rate": 1.0958126255688066e-07, + "loss": 0.2879, + "step": 4370 + }, + { + "epoch": 0.7918478260869565, + "grad_norm": 4.897464727012649, + "learning_rate": 1.0939804061040692e-07, + "loss": 0.2687, + "step": 4371 + }, + { + "epoch": 0.7920289855072464, + "grad_norm": 5.8651730444391506, + "learning_rate": 1.0921495315003854e-07, + "loss": 0.3102, + "step": 4372 + }, + { + "epoch": 0.7922101449275363, + "grad_norm": 6.034431392103668, + "learning_rate": 1.0903200023881331e-07, + "loss": 0.226, + "step": 4373 + }, + { + "epoch": 0.7923913043478261, + "grad_norm": 6.093578570390563, + "learning_rate": 1.0884918193972292e-07, + "loss": 0.2888, + "step": 4374 + }, + { + "epoch": 0.792572463768116, + "grad_norm": 5.301588109567178, + "learning_rate": 1.0866649831571228e-07, + "loss": 0.2769, + "step": 4375 + }, + { + "epoch": 0.7927536231884058, + "grad_norm": 3.841455526864192, + "learning_rate": 1.0848394942968082e-07, + "loss": 0.2262, + "step": 4376 + }, + { + "epoch": 0.7929347826086957, + "grad_norm": 5.244813046494392, + "learning_rate": 1.0830153534448039e-07, + "loss": 0.2725, + "step": 4377 + }, + { + "epoch": 0.7931159420289855, + "grad_norm": 4.855975519598137, + "learning_rate": 1.0811925612291711e-07, + "loss": 0.2708, + "step": 4378 + }, + { + "epoch": 0.7932971014492753, + "grad_norm": 6.4299517809467055, + "learning_rate": 1.0793711182775089e-07, + "loss": 0.2932, + "step": 4379 + }, + { + "epoch": 0.7934782608695652, + "grad_norm": 9.412166418445324, + "learning_rate": 1.0775510252169473e-07, + "loss": 0.295, + "step": 4380 + }, + { + "epoch": 0.793659420289855, + "grad_norm": 4.66989288450747, + "learning_rate": 1.0757322826741522e-07, + "loss": 0.3268, + "step": 4381 + }, + { + "epoch": 0.7938405797101449, + "grad_norm": 7.045510131562466, + "learning_rate": 1.0739148912753254e-07, + "loss": 0.2671, + "step": 4382 + }, + { + "epoch": 0.7940217391304348, + "grad_norm": 4.359004547017069, + "learning_rate": 1.0720988516462043e-07, + "loss": 0.2749, + "step": 4383 + }, + { + "epoch": 0.7942028985507247, + "grad_norm": 9.067450917670207, + "learning_rate": 1.0702841644120592e-07, + "loss": 0.2484, + "step": 4384 + }, + { + "epoch": 0.7943840579710145, + "grad_norm": 7.562776868430158, + "learning_rate": 1.0684708301976958e-07, + "loss": 0.338, + "step": 4385 + }, + { + "epoch": 0.7945652173913044, + "grad_norm": 6.365375507040643, + "learning_rate": 1.0666588496274536e-07, + "loss": 0.3289, + "step": 4386 + }, + { + "epoch": 0.7947463768115942, + "grad_norm": 8.467988587426978, + "learning_rate": 1.0648482233252059e-07, + "loss": 0.3213, + "step": 4387 + }, + { + "epoch": 0.7949275362318841, + "grad_norm": 5.051029289686665, + "learning_rate": 1.0630389519143596e-07, + "loss": 0.3028, + "step": 4388 + }, + { + "epoch": 0.7951086956521739, + "grad_norm": 3.927823722496376, + "learning_rate": 1.0612310360178567e-07, + "loss": 0.2188, + "step": 4389 + }, + { + "epoch": 0.7952898550724637, + "grad_norm": 5.931690158092268, + "learning_rate": 1.0594244762581684e-07, + "loss": 0.3128, + "step": 4390 + }, + { + "epoch": 0.7954710144927536, + "grad_norm": 4.1543030349305345, + "learning_rate": 1.0576192732573052e-07, + "loss": 0.2016, + "step": 4391 + }, + { + "epoch": 0.7956521739130434, + "grad_norm": 4.82124142941218, + "learning_rate": 1.055815427636807e-07, + "loss": 0.3058, + "step": 4392 + }, + { + "epoch": 0.7958333333333333, + "grad_norm": 4.102743339663563, + "learning_rate": 1.0540129400177439e-07, + "loss": 0.3086, + "step": 4393 + }, + { + "epoch": 0.7960144927536232, + "grad_norm": 10.320209982991518, + "learning_rate": 1.0522118110207207e-07, + "loss": 0.2979, + "step": 4394 + }, + { + "epoch": 0.7961956521739131, + "grad_norm": 4.30696671470187, + "learning_rate": 1.0504120412658768e-07, + "loss": 0.2893, + "step": 4395 + }, + { + "epoch": 0.7963768115942029, + "grad_norm": 3.82243937732715, + "learning_rate": 1.048613631372881e-07, + "loss": 0.3101, + "step": 4396 + }, + { + "epoch": 0.7965579710144928, + "grad_norm": 4.704501285624889, + "learning_rate": 1.0468165819609342e-07, + "loss": 0.2772, + "step": 4397 + }, + { + "epoch": 0.7967391304347826, + "grad_norm": 3.7848537954282704, + "learning_rate": 1.0450208936487682e-07, + "loss": 0.226, + "step": 4398 + }, + { + "epoch": 0.7969202898550725, + "grad_norm": 4.452202943100958, + "learning_rate": 1.0432265670546481e-07, + "loss": 0.3208, + "step": 4399 + }, + { + "epoch": 0.7971014492753623, + "grad_norm": 4.701155658158093, + "learning_rate": 1.0414336027963683e-07, + "loss": 0.3186, + "step": 4400 + }, + { + "epoch": 0.7971014492753623, + "eval_loss": 0.26926562190055847, + "eval_runtime": 9.7489, + "eval_samples_per_second": 51.288, + "eval_steps_per_second": 0.103, + "step": 4400 + }, + { + "epoch": 0.7972826086956522, + "grad_norm": 4.483976895035428, + "learning_rate": 1.0396420014912555e-07, + "loss": 0.2769, + "step": 4401 + }, + { + "epoch": 0.797463768115942, + "grad_norm": 3.9281427091930574, + "learning_rate": 1.0378517637561656e-07, + "loss": 0.2526, + "step": 4402 + }, + { + "epoch": 0.7976449275362318, + "grad_norm": 8.835697258547217, + "learning_rate": 1.0360628902074869e-07, + "loss": 0.232, + "step": 4403 + }, + { + "epoch": 0.7978260869565217, + "grad_norm": 4.840737002780634, + "learning_rate": 1.0342753814611366e-07, + "loss": 0.2697, + "step": 4404 + }, + { + "epoch": 0.7980072463768116, + "grad_norm": 3.882108337877771, + "learning_rate": 1.0324892381325622e-07, + "loss": 0.2794, + "step": 4405 + }, + { + "epoch": 0.7981884057971015, + "grad_norm": 3.6751993075289087, + "learning_rate": 1.0307044608367404e-07, + "loss": 0.2745, + "step": 4406 + }, + { + "epoch": 0.7983695652173913, + "grad_norm": 9.78329844589078, + "learning_rate": 1.0289210501881812e-07, + "loss": 0.2809, + "step": 4407 + }, + { + "epoch": 0.7985507246376812, + "grad_norm": 7.940037743288601, + "learning_rate": 1.0271390068009205e-07, + "loss": 0.2606, + "step": 4408 + }, + { + "epoch": 0.798731884057971, + "grad_norm": 4.335585900794439, + "learning_rate": 1.0253583312885228e-07, + "loss": 0.2849, + "step": 4409 + }, + { + "epoch": 0.7989130434782609, + "grad_norm": 4.800262715348598, + "learning_rate": 1.0235790242640824e-07, + "loss": 0.3176, + "step": 4410 + }, + { + "epoch": 0.7990942028985507, + "grad_norm": 3.3199641010341474, + "learning_rate": 1.021801086340226e-07, + "loss": 0.2716, + "step": 4411 + }, + { + "epoch": 0.7992753623188406, + "grad_norm": 5.394266166423357, + "learning_rate": 1.0200245181291045e-07, + "loss": 0.3093, + "step": 4412 + }, + { + "epoch": 0.7994565217391304, + "grad_norm": 3.4182170503118177, + "learning_rate": 1.0182493202423992e-07, + "loss": 0.2184, + "step": 4413 + }, + { + "epoch": 0.7996376811594202, + "grad_norm": 5.90931191839386, + "learning_rate": 1.0164754932913189e-07, + "loss": 0.2704, + "step": 4414 + }, + { + "epoch": 0.7998188405797102, + "grad_norm": 5.174370325166519, + "learning_rate": 1.0147030378866001e-07, + "loss": 0.3286, + "step": 4415 + }, + { + "epoch": 0.8, + "grad_norm": 4.065339598191638, + "learning_rate": 1.0129319546385084e-07, + "loss": 0.2577, + "step": 4416 + }, + { + "epoch": 0.8001811594202899, + "grad_norm": 5.253527984832621, + "learning_rate": 1.0111622441568347e-07, + "loss": 0.2521, + "step": 4417 + }, + { + "epoch": 0.8003623188405797, + "grad_norm": 4.030342323593917, + "learning_rate": 1.009393907050901e-07, + "loss": 0.2462, + "step": 4418 + }, + { + "epoch": 0.8005434782608696, + "grad_norm": 8.545226693843421, + "learning_rate": 1.0076269439295542e-07, + "loss": 0.3408, + "step": 4419 + }, + { + "epoch": 0.8007246376811594, + "grad_norm": 9.194804076339683, + "learning_rate": 1.0058613554011658e-07, + "loss": 0.3095, + "step": 4420 + }, + { + "epoch": 0.8009057971014493, + "grad_norm": 5.415262708466341, + "learning_rate": 1.0040971420736354e-07, + "loss": 0.2368, + "step": 4421 + }, + { + "epoch": 0.8010869565217391, + "grad_norm": 7.295542387729058, + "learning_rate": 1.0023343045543942e-07, + "loss": 0.2711, + "step": 4422 + }, + { + "epoch": 0.801268115942029, + "grad_norm": 5.972930590199982, + "learning_rate": 1.000572843450393e-07, + "loss": 0.3103, + "step": 4423 + }, + { + "epoch": 0.8014492753623188, + "grad_norm": 3.5939071024882954, + "learning_rate": 9.988127593681112e-08, + "loss": 0.2411, + "step": 4424 + }, + { + "epoch": 0.8016304347826086, + "grad_norm": 3.971706738585476, + "learning_rate": 9.970540529135546e-08, + "loss": 0.2788, + "step": 4425 + }, + { + "epoch": 0.8018115942028986, + "grad_norm": 6.617494185028553, + "learning_rate": 9.952967246922533e-08, + "loss": 0.2982, + "step": 4426 + }, + { + "epoch": 0.8019927536231884, + "grad_norm": 3.823629960705914, + "learning_rate": 9.93540775309265e-08, + "loss": 0.2871, + "step": 4427 + }, + { + "epoch": 0.8021739130434783, + "grad_norm": 4.443748685179726, + "learning_rate": 9.917862053691711e-08, + "loss": 0.3027, + "step": 4428 + }, + { + "epoch": 0.8023550724637681, + "grad_norm": 3.691182788639689, + "learning_rate": 9.900330154760772e-08, + "loss": 0.2379, + "step": 4429 + }, + { + "epoch": 0.802536231884058, + "grad_norm": 3.765337784084508, + "learning_rate": 9.88281206233616e-08, + "loss": 0.3134, + "step": 4430 + }, + { + "epoch": 0.8027173913043478, + "grad_norm": 7.517591554219634, + "learning_rate": 9.865307782449438e-08, + "loss": 0.2959, + "step": 4431 + }, + { + "epoch": 0.8028985507246377, + "grad_norm": 5.206635904310933, + "learning_rate": 9.847817321127405e-08, + "loss": 0.2965, + "step": 4432 + }, + { + "epoch": 0.8030797101449275, + "grad_norm": 4.315714455320872, + "learning_rate": 9.830340684392102e-08, + "loss": 0.308, + "step": 4433 + }, + { + "epoch": 0.8032608695652174, + "grad_norm": 4.573085504573862, + "learning_rate": 9.81287787826084e-08, + "loss": 0.3131, + "step": 4434 + }, + { + "epoch": 0.8034420289855072, + "grad_norm": 6.0129171159902395, + "learning_rate": 9.79542890874615e-08, + "loss": 0.2848, + "step": 4435 + }, + { + "epoch": 0.803623188405797, + "grad_norm": 4.446642005512727, + "learning_rate": 9.777993781855765e-08, + "loss": 0.2756, + "step": 4436 + }, + { + "epoch": 0.803804347826087, + "grad_norm": 6.6918027311394646, + "learning_rate": 9.760572503592684e-08, + "loss": 0.3088, + "step": 4437 + }, + { + "epoch": 0.8039855072463769, + "grad_norm": 3.775716611132495, + "learning_rate": 9.743165079955162e-08, + "loss": 0.2742, + "step": 4438 + }, + { + "epoch": 0.8041666666666667, + "grad_norm": 3.8840903495932158, + "learning_rate": 9.725771516936643e-08, + "loss": 0.3095, + "step": 4439 + }, + { + "epoch": 0.8043478260869565, + "grad_norm": 5.679148174501449, + "learning_rate": 9.708391820525819e-08, + "loss": 0.3871, + "step": 4440 + }, + { + "epoch": 0.8045289855072464, + "grad_norm": 5.146405362566784, + "learning_rate": 9.691025996706592e-08, + "loss": 0.2687, + "step": 4441 + }, + { + "epoch": 0.8047101449275362, + "grad_norm": 3.8738716557708237, + "learning_rate": 9.673674051458102e-08, + "loss": 0.2836, + "step": 4442 + }, + { + "epoch": 0.8048913043478261, + "grad_norm": 4.892873410120914, + "learning_rate": 9.656335990754705e-08, + "loss": 0.2916, + "step": 4443 + }, + { + "epoch": 0.8050724637681159, + "grad_norm": 4.20176314788988, + "learning_rate": 9.639011820565985e-08, + "loss": 0.2564, + "step": 4444 + }, + { + "epoch": 0.8052536231884058, + "grad_norm": 3.844048241526155, + "learning_rate": 9.621701546856708e-08, + "loss": 0.287, + "step": 4445 + }, + { + "epoch": 0.8054347826086956, + "grad_norm": 4.26916196506891, + "learning_rate": 9.604405175586938e-08, + "loss": 0.321, + "step": 4446 + }, + { + "epoch": 0.8056159420289855, + "grad_norm": 3.755143878960769, + "learning_rate": 9.587122712711848e-08, + "loss": 0.2868, + "step": 4447 + }, + { + "epoch": 0.8057971014492754, + "grad_norm": 4.368650120206068, + "learning_rate": 9.569854164181889e-08, + "loss": 0.2901, + "step": 4448 + }, + { + "epoch": 0.8059782608695653, + "grad_norm": 10.91322396602523, + "learning_rate": 9.55259953594269e-08, + "loss": 0.2773, + "step": 4449 + }, + { + "epoch": 0.8061594202898551, + "grad_norm": 6.516672511642825, + "learning_rate": 9.53535883393513e-08, + "loss": 0.3225, + "step": 4450 + }, + { + "epoch": 0.806340579710145, + "grad_norm": 5.427710697957984, + "learning_rate": 9.518132064095247e-08, + "loss": 0.2355, + "step": 4451 + }, + { + "epoch": 0.8065217391304348, + "grad_norm": 7.559695412580726, + "learning_rate": 9.500919232354298e-08, + "loss": 0.2981, + "step": 4452 + }, + { + "epoch": 0.8067028985507246, + "grad_norm": 6.955825651938101, + "learning_rate": 9.483720344638751e-08, + "loss": 0.2448, + "step": 4453 + }, + { + "epoch": 0.8068840579710145, + "grad_norm": 3.713207678670163, + "learning_rate": 9.466535406870257e-08, + "loss": 0.3129, + "step": 4454 + }, + { + "epoch": 0.8070652173913043, + "grad_norm": 4.157397748525312, + "learning_rate": 9.449364424965679e-08, + "loss": 0.2848, + "step": 4455 + }, + { + "epoch": 0.8072463768115942, + "grad_norm": 3.5746065286249262, + "learning_rate": 9.432207404837056e-08, + "loss": 0.2544, + "step": 4456 + }, + { + "epoch": 0.807427536231884, + "grad_norm": 4.6564568125611485, + "learning_rate": 9.415064352391638e-08, + "loss": 0.271, + "step": 4457 + }, + { + "epoch": 0.8076086956521739, + "grad_norm": 10.2790428533922, + "learning_rate": 9.39793527353186e-08, + "loss": 0.2902, + "step": 4458 + }, + { + "epoch": 0.8077898550724638, + "grad_norm": 4.998771791064756, + "learning_rate": 9.380820174155346e-08, + "loss": 0.3238, + "step": 4459 + }, + { + "epoch": 0.8079710144927537, + "grad_norm": 6.616505634224275, + "learning_rate": 9.36371906015489e-08, + "loss": 0.2136, + "step": 4460 + }, + { + "epoch": 0.8081521739130435, + "grad_norm": 7.35628141094853, + "learning_rate": 9.346631937418515e-08, + "loss": 0.2473, + "step": 4461 + }, + { + "epoch": 0.8083333333333333, + "grad_norm": 3.4954802861571497, + "learning_rate": 9.329558811829391e-08, + "loss": 0.2547, + "step": 4462 + }, + { + "epoch": 0.8085144927536232, + "grad_norm": 4.182033931861492, + "learning_rate": 9.312499689265862e-08, + "loss": 0.3068, + "step": 4463 + }, + { + "epoch": 0.808695652173913, + "grad_norm": 4.230426910270004, + "learning_rate": 9.295454575601452e-08, + "loss": 0.3209, + "step": 4464 + }, + { + "epoch": 0.8088768115942029, + "grad_norm": 7.807013171989984, + "learning_rate": 9.27842347670491e-08, + "loss": 0.2917, + "step": 4465 + }, + { + "epoch": 0.8090579710144927, + "grad_norm": 3.970589920750272, + "learning_rate": 9.261406398440109e-08, + "loss": 0.3092, + "step": 4466 + }, + { + "epoch": 0.8092391304347826, + "grad_norm": 8.422901506396375, + "learning_rate": 9.244403346666108e-08, + "loss": 0.2765, + "step": 4467 + }, + { + "epoch": 0.8094202898550724, + "grad_norm": 5.601125621354369, + "learning_rate": 9.227414327237137e-08, + "loss": 0.2999, + "step": 4468 + }, + { + "epoch": 0.8096014492753624, + "grad_norm": 9.239891652940003, + "learning_rate": 9.210439346002601e-08, + "loss": 0.2802, + "step": 4469 + }, + { + "epoch": 0.8097826086956522, + "grad_norm": 3.928911000826045, + "learning_rate": 9.193478408807059e-08, + "loss": 0.3155, + "step": 4470 + }, + { + "epoch": 0.8099637681159421, + "grad_norm": 5.046923233095308, + "learning_rate": 9.17653152149025e-08, + "loss": 0.2582, + "step": 4471 + }, + { + "epoch": 0.8101449275362319, + "grad_norm": 9.222433672154175, + "learning_rate": 9.159598689887066e-08, + "loss": 0.3009, + "step": 4472 + }, + { + "epoch": 0.8103260869565218, + "grad_norm": 5.154128135295395, + "learning_rate": 9.142679919827551e-08, + "loss": 0.2422, + "step": 4473 + }, + { + "epoch": 0.8105072463768116, + "grad_norm": 3.9716287840363274, + "learning_rate": 9.125775217136933e-08, + "loss": 0.3397, + "step": 4474 + }, + { + "epoch": 0.8106884057971014, + "grad_norm": 11.52519174148552, + "learning_rate": 9.108884587635568e-08, + "loss": 0.4065, + "step": 4475 + }, + { + "epoch": 0.8108695652173913, + "grad_norm": 4.832587139333683, + "learning_rate": 9.092008037138976e-08, + "loss": 0.2925, + "step": 4476 + }, + { + "epoch": 0.8110507246376811, + "grad_norm": 3.738426851167183, + "learning_rate": 9.075145571457854e-08, + "loss": 0.2787, + "step": 4477 + }, + { + "epoch": 0.811231884057971, + "grad_norm": 5.813514122224783, + "learning_rate": 9.058297196398035e-08, + "loss": 0.2907, + "step": 4478 + }, + { + "epoch": 0.8114130434782608, + "grad_norm": 4.928152994891378, + "learning_rate": 9.041462917760462e-08, + "loss": 0.344, + "step": 4479 + }, + { + "epoch": 0.8115942028985508, + "grad_norm": 4.7269464698545995, + "learning_rate": 9.024642741341254e-08, + "loss": 0.2448, + "step": 4480 + }, + { + "epoch": 0.8117753623188406, + "grad_norm": 4.609460154469483, + "learning_rate": 9.007836672931718e-08, + "loss": 0.287, + "step": 4481 + }, + { + "epoch": 0.8119565217391305, + "grad_norm": 5.455383492329131, + "learning_rate": 8.99104471831823e-08, + "loss": 0.3176, + "step": 4482 + }, + { + "epoch": 0.8121376811594203, + "grad_norm": 5.620968383856849, + "learning_rate": 8.974266883282356e-08, + "loss": 0.3005, + "step": 4483 + }, + { + "epoch": 0.8123188405797102, + "grad_norm": 4.85414070159053, + "learning_rate": 8.957503173600777e-08, + "loss": 0.3463, + "step": 4484 + }, + { + "epoch": 0.8125, + "grad_norm": 4.365315467978919, + "learning_rate": 8.940753595045319e-08, + "loss": 0.2781, + "step": 4485 + }, + { + "epoch": 0.8126811594202898, + "grad_norm": 5.696896947712273, + "learning_rate": 8.924018153382944e-08, + "loss": 0.2685, + "step": 4486 + }, + { + "epoch": 0.8128623188405797, + "grad_norm": 3.813575653169163, + "learning_rate": 8.907296854375723e-08, + "loss": 0.2249, + "step": 4487 + }, + { + "epoch": 0.8130434782608695, + "grad_norm": 9.336856657595598, + "learning_rate": 8.89058970378091e-08, + "loss": 0.304, + "step": 4488 + }, + { + "epoch": 0.8132246376811594, + "grad_norm": 4.801517466990611, + "learning_rate": 8.873896707350859e-08, + "loss": 0.3719, + "step": 4489 + }, + { + "epoch": 0.8134057971014492, + "grad_norm": 3.8749576849356604, + "learning_rate": 8.857217870833017e-08, + "loss": 0.225, + "step": 4490 + }, + { + "epoch": 0.8135869565217392, + "grad_norm": 4.339821124136354, + "learning_rate": 8.840553199969986e-08, + "loss": 0.265, + "step": 4491 + }, + { + "epoch": 0.813768115942029, + "grad_norm": 6.635825756875962, + "learning_rate": 8.82390270049952e-08, + "loss": 0.3177, + "step": 4492 + }, + { + "epoch": 0.8139492753623189, + "grad_norm": 3.889016573529078, + "learning_rate": 8.807266378154448e-08, + "loss": 0.3056, + "step": 4493 + }, + { + "epoch": 0.8141304347826087, + "grad_norm": 4.436326612962354, + "learning_rate": 8.790644238662736e-08, + "loss": 0.3503, + "step": 4494 + }, + { + "epoch": 0.8143115942028986, + "grad_norm": 4.943557798377172, + "learning_rate": 8.774036287747472e-08, + "loss": 0.2885, + "step": 4495 + }, + { + "epoch": 0.8144927536231884, + "grad_norm": 4.076109356131406, + "learning_rate": 8.757442531126847e-08, + "loss": 0.3089, + "step": 4496 + }, + { + "epoch": 0.8146739130434782, + "grad_norm": 4.510012709219583, + "learning_rate": 8.740862974514168e-08, + "loss": 0.2659, + "step": 4497 + }, + { + "epoch": 0.8148550724637681, + "grad_norm": 4.791410314064984, + "learning_rate": 8.724297623617854e-08, + "loss": 0.2907, + "step": 4498 + }, + { + "epoch": 0.8150362318840579, + "grad_norm": 8.087254925325885, + "learning_rate": 8.707746484141437e-08, + "loss": 0.3569, + "step": 4499 + }, + { + "epoch": 0.8152173913043478, + "grad_norm": 4.752153835035208, + "learning_rate": 8.691209561783558e-08, + "loss": 0.3065, + "step": 4500 + }, + { + "epoch": 0.8152173913043478, + "eval_loss": 0.2705000042915344, + "eval_runtime": 9.9574, + "eval_samples_per_second": 50.214, + "eval_steps_per_second": 0.1, + "step": 4500 + }, + { + "epoch": 0.8153985507246376, + "grad_norm": 3.7412207252349905, + "learning_rate": 8.674686862237945e-08, + "loss": 0.2535, + "step": 4501 + }, + { + "epoch": 0.8155797101449276, + "grad_norm": 4.239432844108309, + "learning_rate": 8.658178391193455e-08, + "loss": 0.2935, + "step": 4502 + }, + { + "epoch": 0.8157608695652174, + "grad_norm": 4.807420571566828, + "learning_rate": 8.641684154334011e-08, + "loss": 0.2274, + "step": 4503 + }, + { + "epoch": 0.8159420289855073, + "grad_norm": 3.4584932735012015, + "learning_rate": 8.625204157338683e-08, + "loss": 0.2878, + "step": 4504 + }, + { + "epoch": 0.8161231884057971, + "grad_norm": 3.274783382714847, + "learning_rate": 8.608738405881615e-08, + "loss": 0.2406, + "step": 4505 + }, + { + "epoch": 0.816304347826087, + "grad_norm": 3.8227126282928836, + "learning_rate": 8.592286905632018e-08, + "loss": 0.2698, + "step": 4506 + }, + { + "epoch": 0.8164855072463768, + "grad_norm": 5.4628156362565665, + "learning_rate": 8.57584966225422e-08, + "loss": 0.2831, + "step": 4507 + }, + { + "epoch": 0.8166666666666667, + "grad_norm": 3.9440184370132405, + "learning_rate": 8.559426681407673e-08, + "loss": 0.2774, + "step": 4508 + }, + { + "epoch": 0.8168478260869565, + "grad_norm": 3.2194696444855793, + "learning_rate": 8.543017968746863e-08, + "loss": 0.2469, + "step": 4509 + }, + { + "epoch": 0.8170289855072463, + "grad_norm": 8.468175360267214, + "learning_rate": 8.526623529921395e-08, + "loss": 0.3587, + "step": 4510 + }, + { + "epoch": 0.8172101449275362, + "grad_norm": 5.483904636675657, + "learning_rate": 8.510243370575948e-08, + "loss": 0.3037, + "step": 4511 + }, + { + "epoch": 0.8173913043478261, + "grad_norm": 3.6498687641484002, + "learning_rate": 8.493877496350293e-08, + "loss": 0.2844, + "step": 4512 + }, + { + "epoch": 0.817572463768116, + "grad_norm": 5.538497802151831, + "learning_rate": 8.477525912879279e-08, + "loss": 0.314, + "step": 4513 + }, + { + "epoch": 0.8177536231884058, + "grad_norm": 4.2583938931388685, + "learning_rate": 8.461188625792831e-08, + "loss": 0.295, + "step": 4514 + }, + { + "epoch": 0.8179347826086957, + "grad_norm": 4.856622093857845, + "learning_rate": 8.444865640715937e-08, + "loss": 0.3069, + "step": 4515 + }, + { + "epoch": 0.8181159420289855, + "grad_norm": 3.1976385386297332, + "learning_rate": 8.428556963268724e-08, + "loss": 0.2473, + "step": 4516 + }, + { + "epoch": 0.8182971014492754, + "grad_norm": 3.5014865815986282, + "learning_rate": 8.412262599066305e-08, + "loss": 0.2619, + "step": 4517 + }, + { + "epoch": 0.8184782608695652, + "grad_norm": 4.042521641949122, + "learning_rate": 8.395982553718916e-08, + "loss": 0.2372, + "step": 4518 + }, + { + "epoch": 0.818659420289855, + "grad_norm": 7.272152537808116, + "learning_rate": 8.379716832831851e-08, + "loss": 0.3098, + "step": 4519 + }, + { + "epoch": 0.8188405797101449, + "grad_norm": 4.400055328962682, + "learning_rate": 8.363465442005491e-08, + "loss": 0.2678, + "step": 4520 + }, + { + "epoch": 0.8190217391304347, + "grad_norm": 5.800760413180088, + "learning_rate": 8.347228386835258e-08, + "loss": 0.2325, + "step": 4521 + }, + { + "epoch": 0.8192028985507246, + "grad_norm": 7.017407354096239, + "learning_rate": 8.331005672911645e-08, + "loss": 0.2915, + "step": 4522 + }, + { + "epoch": 0.8193840579710145, + "grad_norm": 4.879020770616914, + "learning_rate": 8.314797305820215e-08, + "loss": 0.2935, + "step": 4523 + }, + { + "epoch": 0.8195652173913044, + "grad_norm": 4.8002026579078025, + "learning_rate": 8.298603291141576e-08, + "loss": 0.2384, + "step": 4524 + }, + { + "epoch": 0.8197463768115942, + "grad_norm": 5.626022687975708, + "learning_rate": 8.282423634451413e-08, + "loss": 0.297, + "step": 4525 + }, + { + "epoch": 0.8199275362318841, + "grad_norm": 3.9870916742008027, + "learning_rate": 8.266258341320454e-08, + "loss": 0.2549, + "step": 4526 + }, + { + "epoch": 0.8201086956521739, + "grad_norm": 4.516704214771563, + "learning_rate": 8.250107417314483e-08, + "loss": 0.2635, + "step": 4527 + }, + { + "epoch": 0.8202898550724638, + "grad_norm": 4.036721398397296, + "learning_rate": 8.23397086799435e-08, + "loss": 0.2828, + "step": 4528 + }, + { + "epoch": 0.8204710144927536, + "grad_norm": 4.146590494641051, + "learning_rate": 8.21784869891593e-08, + "loss": 0.2511, + "step": 4529 + }, + { + "epoch": 0.8206521739130435, + "grad_norm": 4.186689228368776, + "learning_rate": 8.201740915630168e-08, + "loss": 0.2846, + "step": 4530 + }, + { + "epoch": 0.8208333333333333, + "grad_norm": 6.510141393049845, + "learning_rate": 8.185647523683059e-08, + "loss": 0.269, + "step": 4531 + }, + { + "epoch": 0.8210144927536231, + "grad_norm": 3.8618111825172337, + "learning_rate": 8.169568528615645e-08, + "loss": 0.28, + "step": 4532 + }, + { + "epoch": 0.821195652173913, + "grad_norm": 4.9055214640138, + "learning_rate": 8.153503935963969e-08, + "loss": 0.2966, + "step": 4533 + }, + { + "epoch": 0.821376811594203, + "grad_norm": 6.939194869769065, + "learning_rate": 8.137453751259144e-08, + "loss": 0.2386, + "step": 4534 + }, + { + "epoch": 0.8215579710144928, + "grad_norm": 7.356681094644599, + "learning_rate": 8.121417980027357e-08, + "loss": 0.2611, + "step": 4535 + }, + { + "epoch": 0.8217391304347826, + "grad_norm": 8.240959360183206, + "learning_rate": 8.105396627789784e-08, + "loss": 0.2979, + "step": 4536 + }, + { + "epoch": 0.8219202898550725, + "grad_norm": 3.7187400594726188, + "learning_rate": 8.089389700062655e-08, + "loss": 0.2864, + "step": 4537 + }, + { + "epoch": 0.8221014492753623, + "grad_norm": 3.5286679365410216, + "learning_rate": 8.07339720235723e-08, + "loss": 0.2773, + "step": 4538 + }, + { + "epoch": 0.8222826086956522, + "grad_norm": 7.8396160310582195, + "learning_rate": 8.057419140179794e-08, + "loss": 0.3129, + "step": 4539 + }, + { + "epoch": 0.822463768115942, + "grad_norm": 5.448547614255271, + "learning_rate": 8.041455519031681e-08, + "loss": 0.3088, + "step": 4540 + }, + { + "epoch": 0.8226449275362319, + "grad_norm": 5.906614222823553, + "learning_rate": 8.025506344409239e-08, + "loss": 0.2906, + "step": 4541 + }, + { + "epoch": 0.8228260869565217, + "grad_norm": 4.421292587900288, + "learning_rate": 8.009571621803834e-08, + "loss": 0.2994, + "step": 4542 + }, + { + "epoch": 0.8230072463768116, + "grad_norm": 5.653466883853886, + "learning_rate": 7.99365135670188e-08, + "loss": 0.256, + "step": 4543 + }, + { + "epoch": 0.8231884057971014, + "grad_norm": 4.033538522947047, + "learning_rate": 7.977745554584792e-08, + "loss": 0.223, + "step": 4544 + }, + { + "epoch": 0.8233695652173914, + "grad_norm": 5.321304918514272, + "learning_rate": 7.961854220929021e-08, + "loss": 0.296, + "step": 4545 + }, + { + "epoch": 0.8235507246376812, + "grad_norm": 3.392112387466615, + "learning_rate": 7.945977361206002e-08, + "loss": 0.2178, + "step": 4546 + }, + { + "epoch": 0.823731884057971, + "grad_norm": 7.444897929950792, + "learning_rate": 7.930114980882252e-08, + "loss": 0.3275, + "step": 4547 + }, + { + "epoch": 0.8239130434782609, + "grad_norm": 3.9537372422156927, + "learning_rate": 7.914267085419252e-08, + "loss": 0.254, + "step": 4548 + }, + { + "epoch": 0.8240942028985507, + "grad_norm": 4.06748188882281, + "learning_rate": 7.898433680273491e-08, + "loss": 0.3094, + "step": 4549 + }, + { + "epoch": 0.8242753623188406, + "grad_norm": 4.033843170165374, + "learning_rate": 7.88261477089649e-08, + "loss": 0.338, + "step": 4550 + }, + { + "epoch": 0.8244565217391304, + "grad_norm": 3.7284617171729724, + "learning_rate": 7.866810362734782e-08, + "loss": 0.2702, + "step": 4551 + }, + { + "epoch": 0.8246376811594203, + "grad_norm": 4.892225452540291, + "learning_rate": 7.851020461229901e-08, + "loss": 0.2605, + "step": 4552 + }, + { + "epoch": 0.8248188405797101, + "grad_norm": 8.963708769452413, + "learning_rate": 7.835245071818381e-08, + "loss": 0.2347, + "step": 4553 + }, + { + "epoch": 0.825, + "grad_norm": 3.762664293270453, + "learning_rate": 7.819484199931764e-08, + "loss": 0.2467, + "step": 4554 + }, + { + "epoch": 0.8251811594202898, + "grad_norm": 4.478347655001307, + "learning_rate": 7.803737850996595e-08, + "loss": 0.3044, + "step": 4555 + }, + { + "epoch": 0.8253623188405798, + "grad_norm": 4.657945259550352, + "learning_rate": 7.788006030434413e-08, + "loss": 0.2756, + "step": 4556 + }, + { + "epoch": 0.8255434782608696, + "grad_norm": 5.996282107904425, + "learning_rate": 7.772288743661743e-08, + "loss": 0.3226, + "step": 4557 + }, + { + "epoch": 0.8257246376811594, + "grad_norm": 5.08425677589519, + "learning_rate": 7.756585996090153e-08, + "loss": 0.2513, + "step": 4558 + }, + { + "epoch": 0.8259057971014493, + "grad_norm": 4.074022136387155, + "learning_rate": 7.740897793126172e-08, + "loss": 0.303, + "step": 4559 + }, + { + "epoch": 0.8260869565217391, + "grad_norm": 4.575120505710971, + "learning_rate": 7.725224140171294e-08, + "loss": 0.269, + "step": 4560 + }, + { + "epoch": 0.826268115942029, + "grad_norm": 5.009299759414409, + "learning_rate": 7.709565042622035e-08, + "loss": 0.267, + "step": 4561 + }, + { + "epoch": 0.8264492753623188, + "grad_norm": 8.676478870797638, + "learning_rate": 7.693920505869916e-08, + "loss": 0.2768, + "step": 4562 + }, + { + "epoch": 0.8266304347826087, + "grad_norm": 3.7248881268132514, + "learning_rate": 7.678290535301424e-08, + "loss": 0.2294, + "step": 4563 + }, + { + "epoch": 0.8268115942028985, + "grad_norm": 4.337018339811962, + "learning_rate": 7.662675136298019e-08, + "loss": 0.2517, + "step": 4564 + }, + { + "epoch": 0.8269927536231884, + "grad_norm": 5.732066420693558, + "learning_rate": 7.647074314236168e-08, + "loss": 0.3009, + "step": 4565 + }, + { + "epoch": 0.8271739130434783, + "grad_norm": 3.115914620329363, + "learning_rate": 7.631488074487303e-08, + "loss": 0.2008, + "step": 4566 + }, + { + "epoch": 0.8273550724637682, + "grad_norm": 4.1053472125710195, + "learning_rate": 7.615916422417835e-08, + "loss": 0.2404, + "step": 4567 + }, + { + "epoch": 0.827536231884058, + "grad_norm": 3.516506100886446, + "learning_rate": 7.60035936338917e-08, + "loss": 0.2543, + "step": 4568 + }, + { + "epoch": 0.8277173913043478, + "grad_norm": 5.764692271925027, + "learning_rate": 7.584816902757662e-08, + "loss": 0.2623, + "step": 4569 + }, + { + "epoch": 0.8278985507246377, + "grad_norm": 4.99289911434981, + "learning_rate": 7.569289045874666e-08, + "loss": 0.2876, + "step": 4570 + }, + { + "epoch": 0.8280797101449275, + "grad_norm": 4.147310360008969, + "learning_rate": 7.553775798086492e-08, + "loss": 0.3152, + "step": 4571 + }, + { + "epoch": 0.8282608695652174, + "grad_norm": 5.805804651292067, + "learning_rate": 7.538277164734424e-08, + "loss": 0.3445, + "step": 4572 + }, + { + "epoch": 0.8284420289855072, + "grad_norm": 3.787622481562338, + "learning_rate": 7.52279315115471e-08, + "loss": 0.2699, + "step": 4573 + }, + { + "epoch": 0.8286231884057971, + "grad_norm": 5.060317865165723, + "learning_rate": 7.50732376267858e-08, + "loss": 0.2759, + "step": 4574 + }, + { + "epoch": 0.8288043478260869, + "grad_norm": 3.8006891740553925, + "learning_rate": 7.491869004632228e-08, + "loss": 0.2354, + "step": 4575 + }, + { + "epoch": 0.8289855072463768, + "grad_norm": 6.383636251670721, + "learning_rate": 7.476428882336771e-08, + "loss": 0.3014, + "step": 4576 + }, + { + "epoch": 0.8291666666666667, + "grad_norm": 4.504065411112429, + "learning_rate": 7.461003401108324e-08, + "loss": 0.3342, + "step": 4577 + }, + { + "epoch": 0.8293478260869566, + "grad_norm": 7.821486454771593, + "learning_rate": 7.445592566257969e-08, + "loss": 0.3291, + "step": 4578 + }, + { + "epoch": 0.8295289855072464, + "grad_norm": 6.540642944379602, + "learning_rate": 7.430196383091719e-08, + "loss": 0.2833, + "step": 4579 + }, + { + "epoch": 0.8297101449275363, + "grad_norm": 4.801700366893205, + "learning_rate": 7.414814856910556e-08, + "loss": 0.2848, + "step": 4580 + }, + { + "epoch": 0.8298913043478261, + "grad_norm": 8.770380136268663, + "learning_rate": 7.39944799301041e-08, + "loss": 0.2853, + "step": 4581 + }, + { + "epoch": 0.8300724637681159, + "grad_norm": 4.488806416950078, + "learning_rate": 7.384095796682167e-08, + "loss": 0.2527, + "step": 4582 + }, + { + "epoch": 0.8302536231884058, + "grad_norm": 4.8756132096210845, + "learning_rate": 7.368758273211657e-08, + "loss": 0.3311, + "step": 4583 + }, + { + "epoch": 0.8304347826086956, + "grad_norm": 9.874067423348928, + "learning_rate": 7.353435427879667e-08, + "loss": 0.2852, + "step": 4584 + }, + { + "epoch": 0.8306159420289855, + "grad_norm": 3.949042840801302, + "learning_rate": 7.338127265961908e-08, + "loss": 0.2869, + "step": 4585 + }, + { + "epoch": 0.8307971014492753, + "grad_norm": 5.683093956796709, + "learning_rate": 7.322833792729094e-08, + "loss": 0.3038, + "step": 4586 + }, + { + "epoch": 0.8309782608695652, + "grad_norm": 6.472196198784984, + "learning_rate": 7.307555013446804e-08, + "loss": 0.3348, + "step": 4587 + }, + { + "epoch": 0.8311594202898551, + "grad_norm": 5.910602016287594, + "learning_rate": 7.292290933375599e-08, + "loss": 0.2895, + "step": 4588 + }, + { + "epoch": 0.831340579710145, + "grad_norm": 8.730732847824768, + "learning_rate": 7.277041557770968e-08, + "loss": 0.2997, + "step": 4589 + }, + { + "epoch": 0.8315217391304348, + "grad_norm": 5.319609501163069, + "learning_rate": 7.261806891883366e-08, + "loss": 0.2154, + "step": 4590 + }, + { + "epoch": 0.8317028985507247, + "grad_norm": 3.8090097531533353, + "learning_rate": 7.24658694095815e-08, + "loss": 0.2503, + "step": 4591 + }, + { + "epoch": 0.8318840579710145, + "grad_norm": 3.4929689277882083, + "learning_rate": 7.231381710235624e-08, + "loss": 0.2675, + "step": 4592 + }, + { + "epoch": 0.8320652173913043, + "grad_norm": 4.505221488581019, + "learning_rate": 7.216191204951011e-08, + "loss": 0.3536, + "step": 4593 + }, + { + "epoch": 0.8322463768115942, + "grad_norm": 6.8389613716428865, + "learning_rate": 7.201015430334484e-08, + "loss": 0.3765, + "step": 4594 + }, + { + "epoch": 0.832427536231884, + "grad_norm": 7.644920094477968, + "learning_rate": 7.18585439161113e-08, + "loss": 0.3227, + "step": 4595 + }, + { + "epoch": 0.8326086956521739, + "grad_norm": 7.017516603300654, + "learning_rate": 7.17070809400096e-08, + "loss": 0.2935, + "step": 4596 + }, + { + "epoch": 0.8327898550724637, + "grad_norm": 3.9358900637729213, + "learning_rate": 7.155576542718933e-08, + "loss": 0.2968, + "step": 4597 + }, + { + "epoch": 0.8329710144927536, + "grad_norm": 5.354095877087518, + "learning_rate": 7.140459742974897e-08, + "loss": 0.2567, + "step": 4598 + }, + { + "epoch": 0.8331521739130435, + "grad_norm": 8.183165741916955, + "learning_rate": 7.125357699973644e-08, + "loss": 0.2357, + "step": 4599 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 3.440220246863262, + "learning_rate": 7.110270418914871e-08, + "loss": 0.2675, + "step": 4600 + }, + { + "epoch": 0.8333333333333334, + "eval_loss": 0.2700468897819519, + "eval_runtime": 9.7193, + "eval_samples_per_second": 51.444, + "eval_steps_per_second": 0.103, + "step": 4600 + }, + { + "epoch": 0.8335144927536232, + "grad_norm": 3.566197792932106, + "learning_rate": 7.095197904993217e-08, + "loss": 0.2826, + "step": 4601 + }, + { + "epoch": 0.8336956521739131, + "grad_norm": 3.9282213710588105, + "learning_rate": 7.080140163398224e-08, + "loss": 0.2444, + "step": 4602 + }, + { + "epoch": 0.8338768115942029, + "grad_norm": 4.1879570600795955, + "learning_rate": 7.065097199314318e-08, + "loss": 0.2952, + "step": 4603 + }, + { + "epoch": 0.8340579710144927, + "grad_norm": 3.5402998322927712, + "learning_rate": 7.050069017920867e-08, + "loss": 0.2438, + "step": 4604 + }, + { + "epoch": 0.8342391304347826, + "grad_norm": 4.80708840985529, + "learning_rate": 7.035055624392166e-08, + "loss": 0.3354, + "step": 4605 + }, + { + "epoch": 0.8344202898550724, + "grad_norm": 4.252489328602519, + "learning_rate": 7.020057023897385e-08, + "loss": 0.3183, + "step": 4606 + }, + { + "epoch": 0.8346014492753623, + "grad_norm": 4.024121953989593, + "learning_rate": 7.005073221600616e-08, + "loss": 0.3135, + "step": 4607 + }, + { + "epoch": 0.8347826086956521, + "grad_norm": 9.099100334525772, + "learning_rate": 6.990104222660848e-08, + "loss": 0.3462, + "step": 4608 + }, + { + "epoch": 0.8349637681159421, + "grad_norm": 4.463826660760659, + "learning_rate": 6.975150032231986e-08, + "loss": 0.3124, + "step": 4609 + }, + { + "epoch": 0.8351449275362319, + "grad_norm": 4.364691720656996, + "learning_rate": 6.960210655462817e-08, + "loss": 0.2889, + "step": 4610 + }, + { + "epoch": 0.8353260869565218, + "grad_norm": 11.89761114431437, + "learning_rate": 6.94528609749705e-08, + "loss": 0.34, + "step": 4611 + }, + { + "epoch": 0.8355072463768116, + "grad_norm": 6.221055677129767, + "learning_rate": 6.930376363473278e-08, + "loss": 0.2609, + "step": 4612 + }, + { + "epoch": 0.8356884057971015, + "grad_norm": 4.844396894405488, + "learning_rate": 6.915481458524991e-08, + "loss": 0.3019, + "step": 4613 + }, + { + "epoch": 0.8358695652173913, + "grad_norm": 4.223033504683856, + "learning_rate": 6.900601387780574e-08, + "loss": 0.2987, + "step": 4614 + }, + { + "epoch": 0.8360507246376812, + "grad_norm": 6.979081728781871, + "learning_rate": 6.885736156363315e-08, + "loss": 0.2936, + "step": 4615 + }, + { + "epoch": 0.836231884057971, + "grad_norm": 5.105494733356094, + "learning_rate": 6.87088576939136e-08, + "loss": 0.3668, + "step": 4616 + }, + { + "epoch": 0.8364130434782608, + "grad_norm": 4.168987919542068, + "learning_rate": 6.856050231977795e-08, + "loss": 0.2586, + "step": 4617 + }, + { + "epoch": 0.8365942028985507, + "grad_norm": 5.130383147489327, + "learning_rate": 6.84122954923056e-08, + "loss": 0.287, + "step": 4618 + }, + { + "epoch": 0.8367753623188405, + "grad_norm": 5.891940624600407, + "learning_rate": 6.82642372625249e-08, + "loss": 0.2816, + "step": 4619 + }, + { + "epoch": 0.8369565217391305, + "grad_norm": 4.997358400898634, + "learning_rate": 6.811632768141268e-08, + "loss": 0.2674, + "step": 4620 + }, + { + "epoch": 0.8371376811594203, + "grad_norm": 3.629416400595771, + "learning_rate": 6.796856679989526e-08, + "loss": 0.275, + "step": 4621 + }, + { + "epoch": 0.8373188405797102, + "grad_norm": 4.240732717298964, + "learning_rate": 6.782095466884735e-08, + "loss": 0.2987, + "step": 4622 + }, + { + "epoch": 0.8375, + "grad_norm": 4.893261770793716, + "learning_rate": 6.767349133909244e-08, + "loss": 0.2912, + "step": 4623 + }, + { + "epoch": 0.8376811594202899, + "grad_norm": 4.127331232718756, + "learning_rate": 6.752617686140283e-08, + "loss": 0.3145, + "step": 4624 + }, + { + "epoch": 0.8378623188405797, + "grad_norm": 3.641417688451911, + "learning_rate": 6.737901128649964e-08, + "loss": 0.2688, + "step": 4625 + }, + { + "epoch": 0.8380434782608696, + "grad_norm": 4.664216618294672, + "learning_rate": 6.723199466505269e-08, + "loss": 0.2458, + "step": 4626 + }, + { + "epoch": 0.8382246376811594, + "grad_norm": 5.015554223029727, + "learning_rate": 6.708512704768043e-08, + "loss": 0.3441, + "step": 4627 + }, + { + "epoch": 0.8384057971014492, + "grad_norm": 4.105985011543899, + "learning_rate": 6.693840848495008e-08, + "loss": 0.2369, + "step": 4628 + }, + { + "epoch": 0.8385869565217391, + "grad_norm": 3.625188684504257, + "learning_rate": 6.679183902737772e-08, + "loss": 0.2729, + "step": 4629 + }, + { + "epoch": 0.8387681159420289, + "grad_norm": 3.3713080053839373, + "learning_rate": 6.664541872542773e-08, + "loss": 0.2684, + "step": 4630 + }, + { + "epoch": 0.8389492753623189, + "grad_norm": 5.198538967945334, + "learning_rate": 6.64991476295132e-08, + "loss": 0.2855, + "step": 4631 + }, + { + "epoch": 0.8391304347826087, + "grad_norm": 3.9928089077361526, + "learning_rate": 6.635302578999625e-08, + "loss": 0.2829, + "step": 4632 + }, + { + "epoch": 0.8393115942028986, + "grad_norm": 5.963949663512671, + "learning_rate": 6.620705325718711e-08, + "loss": 0.2856, + "step": 4633 + }, + { + "epoch": 0.8394927536231884, + "grad_norm": 4.4067839508621365, + "learning_rate": 6.606123008134495e-08, + "loss": 0.3211, + "step": 4634 + }, + { + "epoch": 0.8396739130434783, + "grad_norm": 4.336194818451433, + "learning_rate": 6.591555631267731e-08, + "loss": 0.3331, + "step": 4635 + }, + { + "epoch": 0.8398550724637681, + "grad_norm": 4.232302513571592, + "learning_rate": 6.577003200134035e-08, + "loss": 0.2977, + "step": 4636 + }, + { + "epoch": 0.840036231884058, + "grad_norm": 5.002681776950905, + "learning_rate": 6.562465719743882e-08, + "loss": 0.3017, + "step": 4637 + }, + { + "epoch": 0.8402173913043478, + "grad_norm": 9.41011178129296, + "learning_rate": 6.54794319510259e-08, + "loss": 0.3365, + "step": 4638 + }, + { + "epoch": 0.8403985507246376, + "grad_norm": 4.729062019404165, + "learning_rate": 6.533435631210337e-08, + "loss": 0.3012, + "step": 4639 + }, + { + "epoch": 0.8405797101449275, + "grad_norm": 4.294091482165877, + "learning_rate": 6.518943033062152e-08, + "loss": 0.2812, + "step": 4640 + }, + { + "epoch": 0.8407608695652173, + "grad_norm": 4.722192168190877, + "learning_rate": 6.504465405647891e-08, + "loss": 0.2344, + "step": 4641 + }, + { + "epoch": 0.8409420289855073, + "grad_norm": 3.7628341864837873, + "learning_rate": 6.490002753952278e-08, + "loss": 0.2495, + "step": 4642 + }, + { + "epoch": 0.8411231884057971, + "grad_norm": 8.745142473889473, + "learning_rate": 6.475555082954858e-08, + "loss": 0.3187, + "step": 4643 + }, + { + "epoch": 0.841304347826087, + "grad_norm": 6.64893913796071, + "learning_rate": 6.461122397630059e-08, + "loss": 0.2415, + "step": 4644 + }, + { + "epoch": 0.8414855072463768, + "grad_norm": 4.266242376017027, + "learning_rate": 6.446704702947126e-08, + "loss": 0.2647, + "step": 4645 + }, + { + "epoch": 0.8416666666666667, + "grad_norm": 8.999463129996617, + "learning_rate": 6.432302003870105e-08, + "loss": 0.2688, + "step": 4646 + }, + { + "epoch": 0.8418478260869565, + "grad_norm": 3.860153262136091, + "learning_rate": 6.417914305357924e-08, + "loss": 0.2763, + "step": 4647 + }, + { + "epoch": 0.8420289855072464, + "grad_norm": 4.7412639784227855, + "learning_rate": 6.40354161236436e-08, + "loss": 0.2893, + "step": 4648 + }, + { + "epoch": 0.8422101449275362, + "grad_norm": 4.367860154244886, + "learning_rate": 6.389183929837977e-08, + "loss": 0.3115, + "step": 4649 + }, + { + "epoch": 0.842391304347826, + "grad_norm": 5.3354416006066225, + "learning_rate": 6.374841262722203e-08, + "loss": 0.3115, + "step": 4650 + }, + { + "epoch": 0.8425724637681159, + "grad_norm": 4.6763182073943135, + "learning_rate": 6.360513615955276e-08, + "loss": 0.3606, + "step": 4651 + }, + { + "epoch": 0.8427536231884057, + "grad_norm": 4.463426545162006, + "learning_rate": 6.346200994470286e-08, + "loss": 0.2901, + "step": 4652 + }, + { + "epoch": 0.8429347826086957, + "grad_norm": 3.829588436633688, + "learning_rate": 6.331903403195126e-08, + "loss": 0.2979, + "step": 4653 + }, + { + "epoch": 0.8431159420289855, + "grad_norm": 6.322096952904681, + "learning_rate": 6.317620847052529e-08, + "loss": 0.3029, + "step": 4654 + }, + { + "epoch": 0.8432971014492754, + "grad_norm": 6.606225194438366, + "learning_rate": 6.303353330960037e-08, + "loss": 0.3021, + "step": 4655 + }, + { + "epoch": 0.8434782608695652, + "grad_norm": 5.308279302902277, + "learning_rate": 6.289100859830054e-08, + "loss": 0.2612, + "step": 4656 + }, + { + "epoch": 0.8436594202898551, + "grad_norm": 6.929234085370544, + "learning_rate": 6.274863438569739e-08, + "loss": 0.2616, + "step": 4657 + }, + { + "epoch": 0.8438405797101449, + "grad_norm": 4.388890278145771, + "learning_rate": 6.260641072081113e-08, + "loss": 0.2467, + "step": 4658 + }, + { + "epoch": 0.8440217391304348, + "grad_norm": 3.872824440031464, + "learning_rate": 6.246433765261e-08, + "loss": 0.2393, + "step": 4659 + }, + { + "epoch": 0.8442028985507246, + "grad_norm": 3.370403125573209, + "learning_rate": 6.232241523001064e-08, + "loss": 0.2651, + "step": 4660 + }, + { + "epoch": 0.8443840579710145, + "grad_norm": 6.484148287846074, + "learning_rate": 6.21806435018774e-08, + "loss": 0.2365, + "step": 4661 + }, + { + "epoch": 0.8445652173913043, + "grad_norm": 4.697103225670785, + "learning_rate": 6.20390225170232e-08, + "loss": 0.2861, + "step": 4662 + }, + { + "epoch": 0.8447463768115943, + "grad_norm": 4.712302106970035, + "learning_rate": 6.189755232420846e-08, + "loss": 0.2513, + "step": 4663 + }, + { + "epoch": 0.8449275362318841, + "grad_norm": 6.829499641931987, + "learning_rate": 6.175623297214228e-08, + "loss": 0.2568, + "step": 4664 + }, + { + "epoch": 0.845108695652174, + "grad_norm": 7.255597424418193, + "learning_rate": 6.161506450948156e-08, + "loss": 0.2821, + "step": 4665 + }, + { + "epoch": 0.8452898550724638, + "grad_norm": 5.826200964460358, + "learning_rate": 6.147404698483121e-08, + "loss": 0.3091, + "step": 4666 + }, + { + "epoch": 0.8454710144927536, + "grad_norm": 6.528620863404902, + "learning_rate": 6.133318044674429e-08, + "loss": 0.2889, + "step": 4667 + }, + { + "epoch": 0.8456521739130435, + "grad_norm": 3.6383405578413472, + "learning_rate": 6.119246494372177e-08, + "loss": 0.2276, + "step": 4668 + }, + { + "epoch": 0.8458333333333333, + "grad_norm": 7.1429190119245884, + "learning_rate": 6.105190052421267e-08, + "loss": 0.2999, + "step": 4669 + }, + { + "epoch": 0.8460144927536232, + "grad_norm": 5.0583545694082925, + "learning_rate": 6.091148723661393e-08, + "loss": 0.2669, + "step": 4670 + }, + { + "epoch": 0.846195652173913, + "grad_norm": 3.4760269165341042, + "learning_rate": 6.077122512927069e-08, + "loss": 0.2548, + "step": 4671 + }, + { + "epoch": 0.8463768115942029, + "grad_norm": 7.465285976872562, + "learning_rate": 6.06311142504758e-08, + "loss": 0.2528, + "step": 4672 + }, + { + "epoch": 0.8465579710144927, + "grad_norm": 4.009623575510082, + "learning_rate": 6.049115464846999e-08, + "loss": 0.2544, + "step": 4673 + }, + { + "epoch": 0.8467391304347827, + "grad_norm": 5.362324956894773, + "learning_rate": 6.035134637144195e-08, + "loss": 0.2807, + "step": 4674 + }, + { + "epoch": 0.8469202898550725, + "grad_norm": 4.733962155504348, + "learning_rate": 6.021168946752852e-08, + "loss": 0.237, + "step": 4675 + }, + { + "epoch": 0.8471014492753624, + "grad_norm": 5.8951886543697585, + "learning_rate": 6.007218398481422e-08, + "loss": 0.2189, + "step": 4676 + }, + { + "epoch": 0.8472826086956522, + "grad_norm": 3.6510152757623606, + "learning_rate": 5.993282997133142e-08, + "loss": 0.2348, + "step": 4677 + }, + { + "epoch": 0.847463768115942, + "grad_norm": 8.862483351831589, + "learning_rate": 5.979362747506028e-08, + "loss": 0.2478, + "step": 4678 + }, + { + "epoch": 0.8476449275362319, + "grad_norm": 9.531432168149545, + "learning_rate": 5.965457654392897e-08, + "loss": 0.3038, + "step": 4679 + }, + { + "epoch": 0.8478260869565217, + "grad_norm": 4.7306617601216345, + "learning_rate": 5.9515677225813354e-08, + "loss": 0.3643, + "step": 4680 + }, + { + "epoch": 0.8480072463768116, + "grad_norm": 9.342600104866074, + "learning_rate": 5.937692956853713e-08, + "loss": 0.2897, + "step": 4681 + }, + { + "epoch": 0.8481884057971014, + "grad_norm": 5.807126660613538, + "learning_rate": 5.923833361987168e-08, + "loss": 0.2376, + "step": 4682 + }, + { + "epoch": 0.8483695652173913, + "grad_norm": 4.38295916468315, + "learning_rate": 5.909988942753652e-08, + "loss": 0.2624, + "step": 4683 + }, + { + "epoch": 0.8485507246376811, + "grad_norm": 5.323071904588808, + "learning_rate": 5.896159703919834e-08, + "loss": 0.2485, + "step": 4684 + }, + { + "epoch": 0.8487318840579711, + "grad_norm": 4.31720216281445, + "learning_rate": 5.8823456502472004e-08, + "loss": 0.2649, + "step": 4685 + }, + { + "epoch": 0.8489130434782609, + "grad_norm": 5.184987202632363, + "learning_rate": 5.868546786491985e-08, + "loss": 0.269, + "step": 4686 + }, + { + "epoch": 0.8490942028985508, + "grad_norm": 4.8085882817430265, + "learning_rate": 5.85476311740522e-08, + "loss": 0.2983, + "step": 4687 + }, + { + "epoch": 0.8492753623188406, + "grad_norm": 5.925885020558899, + "learning_rate": 5.8409946477326724e-08, + "loss": 0.2866, + "step": 4688 + }, + { + "epoch": 0.8494565217391304, + "grad_norm": 4.639274597320364, + "learning_rate": 5.827241382214915e-08, + "loss": 0.2687, + "step": 4689 + }, + { + "epoch": 0.8496376811594203, + "grad_norm": 5.653284157072135, + "learning_rate": 5.8135033255872214e-08, + "loss": 0.2754, + "step": 4690 + }, + { + "epoch": 0.8498188405797101, + "grad_norm": 4.995066341662183, + "learning_rate": 5.799780482579703e-08, + "loss": 0.271, + "step": 4691 + }, + { + "epoch": 0.85, + "grad_norm": 6.596674435450166, + "learning_rate": 5.7860728579171904e-08, + "loss": 0.29, + "step": 4692 + }, + { + "epoch": 0.8501811594202898, + "grad_norm": 6.288817020917897, + "learning_rate": 5.7723804563192814e-08, + "loss": 0.265, + "step": 4693 + }, + { + "epoch": 0.8503623188405797, + "grad_norm": 6.463946918667353, + "learning_rate": 5.758703282500332e-08, + "loss": 0.3008, + "step": 4694 + }, + { + "epoch": 0.8505434782608695, + "grad_norm": 11.56470820524333, + "learning_rate": 5.745041341169471e-08, + "loss": 0.2816, + "step": 4695 + }, + { + "epoch": 0.8507246376811595, + "grad_norm": 6.651580681379385, + "learning_rate": 5.731394637030551e-08, + "loss": 0.2881, + "step": 4696 + }, + { + "epoch": 0.8509057971014493, + "grad_norm": 4.186837179879975, + "learning_rate": 5.7177631747822144e-08, + "loss": 0.2469, + "step": 4697 + }, + { + "epoch": 0.8510869565217392, + "grad_norm": 4.385237409060419, + "learning_rate": 5.704146959117817e-08, + "loss": 0.2703, + "step": 4698 + }, + { + "epoch": 0.851268115942029, + "grad_norm": 4.485860860883113, + "learning_rate": 5.6905459947255206e-08, + "loss": 0.2814, + "step": 4699 + }, + { + "epoch": 0.8514492753623188, + "grad_norm": 9.119240206569785, + "learning_rate": 5.6769602862881685e-08, + "loss": 0.3212, + "step": 4700 + }, + { + "epoch": 0.8514492753623188, + "eval_loss": 0.2677968740463257, + "eval_runtime": 9.8105, + "eval_samples_per_second": 50.966, + "eval_steps_per_second": 0.102, + "step": 4700 + }, + { + "epoch": 0.8516304347826087, + "grad_norm": 4.111293348208835, + "learning_rate": 5.66338983848339e-08, + "loss": 0.3243, + "step": 4701 + }, + { + "epoch": 0.8518115942028985, + "grad_norm": 5.833876041662731, + "learning_rate": 5.6498346559835705e-08, + "loss": 0.3138, + "step": 4702 + }, + { + "epoch": 0.8519927536231884, + "grad_norm": 6.790067269751507, + "learning_rate": 5.636294743455816e-08, + "loss": 0.2774, + "step": 4703 + }, + { + "epoch": 0.8521739130434782, + "grad_norm": 7.446225616700585, + "learning_rate": 5.622770105561975e-08, + "loss": 0.2301, + "step": 4704 + }, + { + "epoch": 0.8523550724637681, + "grad_norm": 4.417469034169004, + "learning_rate": 5.609260746958655e-08, + "loss": 0.3073, + "step": 4705 + }, + { + "epoch": 0.8525362318840579, + "grad_norm": 3.7395050534082213, + "learning_rate": 5.5957666722971866e-08, + "loss": 0.2215, + "step": 4706 + }, + { + "epoch": 0.8527173913043479, + "grad_norm": 3.3868280170156653, + "learning_rate": 5.5822878862236465e-08, + "loss": 0.2477, + "step": 4707 + }, + { + "epoch": 0.8528985507246377, + "grad_norm": 4.512889420209454, + "learning_rate": 5.56882439337884e-08, + "loss": 0.3072, + "step": 4708 + }, + { + "epoch": 0.8530797101449276, + "grad_norm": 4.724107102209065, + "learning_rate": 5.555376198398315e-08, + "loss": 0.2813, + "step": 4709 + }, + { + "epoch": 0.8532608695652174, + "grad_norm": 4.12087526145894, + "learning_rate": 5.54194330591235e-08, + "loss": 0.2514, + "step": 4710 + }, + { + "epoch": 0.8534420289855073, + "grad_norm": 5.13964821194713, + "learning_rate": 5.5285257205459515e-08, + "loss": 0.2763, + "step": 4711 + }, + { + "epoch": 0.8536231884057971, + "grad_norm": 4.970108016520217, + "learning_rate": 5.5151234469188514e-08, + "loss": 0.2092, + "step": 4712 + }, + { + "epoch": 0.8538043478260869, + "grad_norm": 4.262464901136843, + "learning_rate": 5.50173648964552e-08, + "loss": 0.2497, + "step": 4713 + }, + { + "epoch": 0.8539855072463768, + "grad_norm": 4.618533712913622, + "learning_rate": 5.488364853335159e-08, + "loss": 0.2919, + "step": 4714 + }, + { + "epoch": 0.8541666666666666, + "grad_norm": 3.845424524180444, + "learning_rate": 5.475008542591686e-08, + "loss": 0.2914, + "step": 4715 + }, + { + "epoch": 0.8543478260869565, + "grad_norm": 4.469559295410864, + "learning_rate": 5.461667562013733e-08, + "loss": 0.2626, + "step": 4716 + }, + { + "epoch": 0.8545289855072464, + "grad_norm": 9.738853515626896, + "learning_rate": 5.448341916194649e-08, + "loss": 0.2697, + "step": 4717 + }, + { + "epoch": 0.8547101449275363, + "grad_norm": 3.244657519826543, + "learning_rate": 5.4350316097225454e-08, + "loss": 0.1809, + "step": 4718 + }, + { + "epoch": 0.8548913043478261, + "grad_norm": 6.693330699637032, + "learning_rate": 5.421736647180214e-08, + "loss": 0.2318, + "step": 4719 + }, + { + "epoch": 0.855072463768116, + "grad_norm": 4.211542645489069, + "learning_rate": 5.4084570331451694e-08, + "loss": 0.2398, + "step": 4720 + }, + { + "epoch": 0.8552536231884058, + "grad_norm": 3.6129928871791104, + "learning_rate": 5.3951927721896494e-08, + "loss": 0.2245, + "step": 4721 + }, + { + "epoch": 0.8554347826086957, + "grad_norm": 4.787883134533534, + "learning_rate": 5.381943868880595e-08, + "loss": 0.2961, + "step": 4722 + }, + { + "epoch": 0.8556159420289855, + "grad_norm": 3.986756509430605, + "learning_rate": 5.3687103277796796e-08, + "loss": 0.2698, + "step": 4723 + }, + { + "epoch": 0.8557971014492753, + "grad_norm": 5.345254776169634, + "learning_rate": 5.355492153443258e-08, + "loss": 0.2896, + "step": 4724 + }, + { + "epoch": 0.8559782608695652, + "grad_norm": 5.112378029932426, + "learning_rate": 5.342289350422413e-08, + "loss": 0.2886, + "step": 4725 + }, + { + "epoch": 0.856159420289855, + "grad_norm": 3.98823730267497, + "learning_rate": 5.329101923262952e-08, + "loss": 0.2609, + "step": 4726 + }, + { + "epoch": 0.8563405797101449, + "grad_norm": 4.692040697273998, + "learning_rate": 5.315929876505348e-08, + "loss": 0.3154, + "step": 4727 + }, + { + "epoch": 0.8565217391304348, + "grad_norm": 7.547529256722531, + "learning_rate": 5.302773214684803e-08, + "loss": 0.2705, + "step": 4728 + }, + { + "epoch": 0.8567028985507247, + "grad_norm": 5.106064010638053, + "learning_rate": 5.289631942331213e-08, + "loss": 0.2718, + "step": 4729 + }, + { + "epoch": 0.8568840579710145, + "grad_norm": 7.947185915904829, + "learning_rate": 5.2765060639691937e-08, + "loss": 0.2701, + "step": 4730 + }, + { + "epoch": 0.8570652173913044, + "grad_norm": 5.49692550338986, + "learning_rate": 5.263395584118047e-08, + "loss": 0.3084, + "step": 4731 + }, + { + "epoch": 0.8572463768115942, + "grad_norm": 4.050460746928886, + "learning_rate": 5.2503005072917713e-08, + "loss": 0.226, + "step": 4732 + }, + { + "epoch": 0.8574275362318841, + "grad_norm": 9.278539825170677, + "learning_rate": 5.23722083799904e-08, + "loss": 0.2383, + "step": 4733 + }, + { + "epoch": 0.8576086956521739, + "grad_norm": 4.928670358788241, + "learning_rate": 5.2241565807432776e-08, + "loss": 0.2817, + "step": 4734 + }, + { + "epoch": 0.8577898550724637, + "grad_norm": 3.5898406792671156, + "learning_rate": 5.211107740022558e-08, + "loss": 0.2701, + "step": 4735 + }, + { + "epoch": 0.8579710144927536, + "grad_norm": 4.975485937192667, + "learning_rate": 5.198074320329654e-08, + "loss": 0.2805, + "step": 4736 + }, + { + "epoch": 0.8581521739130434, + "grad_norm": 4.43262879083396, + "learning_rate": 5.185056326152037e-08, + "loss": 0.2542, + "step": 4737 + }, + { + "epoch": 0.8583333333333333, + "grad_norm": 8.369412908749364, + "learning_rate": 5.172053761971867e-08, + "loss": 0.3017, + "step": 4738 + }, + { + "epoch": 0.8585144927536232, + "grad_norm": 7.415489230243173, + "learning_rate": 5.1590666322659845e-08, + "loss": 0.2787, + "step": 4739 + }, + { + "epoch": 0.8586956521739131, + "grad_norm": 6.9000582139541375, + "learning_rate": 5.146094941505913e-08, + "loss": 0.2208, + "step": 4740 + }, + { + "epoch": 0.8588768115942029, + "grad_norm": 6.186041763846137, + "learning_rate": 5.1331386941578846e-08, + "loss": 0.247, + "step": 4741 + }, + { + "epoch": 0.8590579710144928, + "grad_norm": 3.924576031889005, + "learning_rate": 5.120197894682793e-08, + "loss": 0.2699, + "step": 4742 + }, + { + "epoch": 0.8592391304347826, + "grad_norm": 7.8848981890728105, + "learning_rate": 5.107272547536207e-08, + "loss": 0.2734, + "step": 4743 + }, + { + "epoch": 0.8594202898550725, + "grad_norm": 6.519843239454956, + "learning_rate": 5.0943626571683774e-08, + "loss": 0.2563, + "step": 4744 + }, + { + "epoch": 0.8596014492753623, + "grad_norm": 4.9993217366247364, + "learning_rate": 5.0814682280242604e-08, + "loss": 0.3061, + "step": 4745 + }, + { + "epoch": 0.8597826086956522, + "grad_norm": 6.383583032520059, + "learning_rate": 5.068589264543466e-08, + "loss": 0.2708, + "step": 4746 + }, + { + "epoch": 0.859963768115942, + "grad_norm": 5.9074673356650145, + "learning_rate": 5.055725771160274e-08, + "loss": 0.2956, + "step": 4747 + }, + { + "epoch": 0.8601449275362318, + "grad_norm": 3.9232002430942012, + "learning_rate": 5.042877752303648e-08, + "loss": 0.3127, + "step": 4748 + }, + { + "epoch": 0.8603260869565217, + "grad_norm": 5.816423456020595, + "learning_rate": 5.030045212397227e-08, + "loss": 0.2979, + "step": 4749 + }, + { + "epoch": 0.8605072463768116, + "grad_norm": 5.964388887545941, + "learning_rate": 5.0172281558593044e-08, + "loss": 0.2415, + "step": 4750 + }, + { + "epoch": 0.8606884057971015, + "grad_norm": 8.207513057690512, + "learning_rate": 5.0044265871028666e-08, + "loss": 0.2746, + "step": 4751 + }, + { + "epoch": 0.8608695652173913, + "grad_norm": 4.4646718280336755, + "learning_rate": 4.991640510535538e-08, + "loss": 0.2393, + "step": 4752 + }, + { + "epoch": 0.8610507246376812, + "grad_norm": 4.934126995471067, + "learning_rate": 4.978869930559654e-08, + "loss": 0.2518, + "step": 4753 + }, + { + "epoch": 0.861231884057971, + "grad_norm": 4.75033009967884, + "learning_rate": 4.966114851572156e-08, + "loss": 0.2834, + "step": 4754 + }, + { + "epoch": 0.8614130434782609, + "grad_norm": 5.824520280458007, + "learning_rate": 4.953375277964694e-08, + "loss": 0.3035, + "step": 4755 + }, + { + "epoch": 0.8615942028985507, + "grad_norm": 4.2230720683193, + "learning_rate": 4.940651214123548e-08, + "loss": 0.2676, + "step": 4756 + }, + { + "epoch": 0.8617753623188406, + "grad_norm": 4.356967722732345, + "learning_rate": 4.9279426644296906e-08, + "loss": 0.2866, + "step": 4757 + }, + { + "epoch": 0.8619565217391304, + "grad_norm": 10.629175468892866, + "learning_rate": 4.9152496332587336e-08, + "loss": 0.3048, + "step": 4758 + }, + { + "epoch": 0.8621376811594202, + "grad_norm": 3.7226318177999107, + "learning_rate": 4.902572124980952e-08, + "loss": 0.2728, + "step": 4759 + }, + { + "epoch": 0.8623188405797102, + "grad_norm": 5.177982449770165, + "learning_rate": 4.889910143961246e-08, + "loss": 0.2422, + "step": 4760 + }, + { + "epoch": 0.8625, + "grad_norm": 4.786627664604113, + "learning_rate": 4.877263694559225e-08, + "loss": 0.2783, + "step": 4761 + }, + { + "epoch": 0.8626811594202899, + "grad_norm": 6.884433016024381, + "learning_rate": 4.8646327811291074e-08, + "loss": 0.2831, + "step": 4762 + }, + { + "epoch": 0.8628623188405797, + "grad_norm": 5.554165733102009, + "learning_rate": 4.8520174080197764e-08, + "loss": 0.3451, + "step": 4763 + }, + { + "epoch": 0.8630434782608696, + "grad_norm": 4.168758882239681, + "learning_rate": 4.8394175795747725e-08, + "loss": 0.2355, + "step": 4764 + }, + { + "epoch": 0.8632246376811594, + "grad_norm": 5.941932261378795, + "learning_rate": 4.826833300132266e-08, + "loss": 0.2834, + "step": 4765 + }, + { + "epoch": 0.8634057971014493, + "grad_norm": 5.599059540455458, + "learning_rate": 4.814264574025095e-08, + "loss": 0.2906, + "step": 4766 + }, + { + "epoch": 0.8635869565217391, + "grad_norm": 4.992164502682655, + "learning_rate": 4.801711405580722e-08, + "loss": 0.3453, + "step": 4767 + }, + { + "epoch": 0.863768115942029, + "grad_norm": 4.19583737550248, + "learning_rate": 4.78917379912126e-08, + "loss": 0.2665, + "step": 4768 + }, + { + "epoch": 0.8639492753623188, + "grad_norm": 3.5818416605333, + "learning_rate": 4.776651758963485e-08, + "loss": 0.305, + "step": 4769 + }, + { + "epoch": 0.8641304347826086, + "grad_norm": 5.740539453117691, + "learning_rate": 4.764145289418775e-08, + "loss": 0.2852, + "step": 4770 + }, + { + "epoch": 0.8643115942028986, + "grad_norm": 7.302862362273101, + "learning_rate": 4.7516543947931695e-08, + "loss": 0.266, + "step": 4771 + }, + { + "epoch": 0.8644927536231884, + "grad_norm": 8.665570781591065, + "learning_rate": 4.739179079387351e-08, + "loss": 0.2543, + "step": 4772 + }, + { + "epoch": 0.8646739130434783, + "grad_norm": 6.737153509525896, + "learning_rate": 4.726719347496627e-08, + "loss": 0.324, + "step": 4773 + }, + { + "epoch": 0.8648550724637681, + "grad_norm": 4.011633541562273, + "learning_rate": 4.714275203410939e-08, + "loss": 0.2929, + "step": 4774 + }, + { + "epoch": 0.865036231884058, + "grad_norm": 5.739907818725988, + "learning_rate": 4.70184665141487e-08, + "loss": 0.2704, + "step": 4775 + }, + { + "epoch": 0.8652173913043478, + "grad_norm": 7.08178737126034, + "learning_rate": 4.689433695787626e-08, + "loss": 0.3541, + "step": 4776 + }, + { + "epoch": 0.8653985507246377, + "grad_norm": 4.893928876851549, + "learning_rate": 4.6770363408030476e-08, + "loss": 0.2469, + "step": 4777 + }, + { + "epoch": 0.8655797101449275, + "grad_norm": 3.469906626602484, + "learning_rate": 4.6646545907296e-08, + "loss": 0.2394, + "step": 4778 + }, + { + "epoch": 0.8657608695652174, + "grad_norm": 5.023344079291013, + "learning_rate": 4.6522884498303774e-08, + "loss": 0.2817, + "step": 4779 + }, + { + "epoch": 0.8659420289855072, + "grad_norm": 7.366778179396263, + "learning_rate": 4.6399379223631074e-08, + "loss": 0.276, + "step": 4780 + }, + { + "epoch": 0.866123188405797, + "grad_norm": 4.6909101921856875, + "learning_rate": 4.627603012580128e-08, + "loss": 0.2822, + "step": 4781 + }, + { + "epoch": 0.866304347826087, + "grad_norm": 3.6207284062663496, + "learning_rate": 4.6152837247284136e-08, + "loss": 0.2505, + "step": 4782 + }, + { + "epoch": 0.8664855072463769, + "grad_norm": 3.641406937992566, + "learning_rate": 4.6029800630495385e-08, + "loss": 0.2873, + "step": 4783 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 5.3715093702827765, + "learning_rate": 4.59069203177973e-08, + "loss": 0.2204, + "step": 4784 + }, + { + "epoch": 0.8668478260869565, + "grad_norm": 4.068329719988691, + "learning_rate": 4.5784196351498216e-08, + "loss": 0.2654, + "step": 4785 + }, + { + "epoch": 0.8670289855072464, + "grad_norm": 6.627004008280092, + "learning_rate": 4.566162877385238e-08, + "loss": 0.3206, + "step": 4786 + }, + { + "epoch": 0.8672101449275362, + "grad_norm": 9.649125062706997, + "learning_rate": 4.553921762706031e-08, + "loss": 0.2735, + "step": 4787 + }, + { + "epoch": 0.8673913043478261, + "grad_norm": 4.496906073500851, + "learning_rate": 4.541696295326902e-08, + "loss": 0.2132, + "step": 4788 + }, + { + "epoch": 0.8675724637681159, + "grad_norm": 6.412039995766007, + "learning_rate": 4.529486479457123e-08, + "loss": 0.2972, + "step": 4789 + }, + { + "epoch": 0.8677536231884058, + "grad_norm": 6.992352001193379, + "learning_rate": 4.5172923193005975e-08, + "loss": 0.2576, + "step": 4790 + }, + { + "epoch": 0.8679347826086956, + "grad_norm": 3.645832732654844, + "learning_rate": 4.505113819055822e-08, + "loss": 0.2805, + "step": 4791 + }, + { + "epoch": 0.8681159420289855, + "grad_norm": 5.947747610274793, + "learning_rate": 4.492950982915922e-08, + "loss": 0.2874, + "step": 4792 + }, + { + "epoch": 0.8682971014492754, + "grad_norm": 5.180417435140503, + "learning_rate": 4.480803815068612e-08, + "loss": 0.2882, + "step": 4793 + }, + { + "epoch": 0.8684782608695653, + "grad_norm": 6.196152999866419, + "learning_rate": 4.468672319696221e-08, + "loss": 0.3124, + "step": 4794 + }, + { + "epoch": 0.8686594202898551, + "grad_norm": 5.241817376967998, + "learning_rate": 4.456556500975678e-08, + "loss": 0.2569, + "step": 4795 + }, + { + "epoch": 0.868840579710145, + "grad_norm": 5.969026236850944, + "learning_rate": 4.444456363078536e-08, + "loss": 0.3378, + "step": 4796 + }, + { + "epoch": 0.8690217391304348, + "grad_norm": 8.659986363062528, + "learning_rate": 4.432371910170907e-08, + "loss": 0.2789, + "step": 4797 + }, + { + "epoch": 0.8692028985507246, + "grad_norm": 4.458566167266721, + "learning_rate": 4.4203031464135345e-08, + "loss": 0.2671, + "step": 4798 + }, + { + "epoch": 0.8693840579710145, + "grad_norm": 4.721670555075104, + "learning_rate": 4.408250075961739e-08, + "loss": 0.2736, + "step": 4799 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 3.93480745081534, + "learning_rate": 4.396212702965474e-08, + "loss": 0.3227, + "step": 4800 + }, + { + "epoch": 0.8695652173913043, + "eval_loss": 0.26845312118530273, + "eval_runtime": 9.7816, + "eval_samples_per_second": 51.116, + "eval_steps_per_second": 0.102, + "step": 4800 + }, + { + "epoch": 0.8697463768115942, + "grad_norm": 5.449914182625196, + "learning_rate": 4.3841910315692456e-08, + "loss": 0.218, + "step": 4801 + }, + { + "epoch": 0.869927536231884, + "grad_norm": 4.445718265230896, + "learning_rate": 4.3721850659121875e-08, + "loss": 0.3426, + "step": 4802 + }, + { + "epoch": 0.8701086956521739, + "grad_norm": 4.329150571611649, + "learning_rate": 4.3601948101279814e-08, + "loss": 0.2693, + "step": 4803 + }, + { + "epoch": 0.8702898550724638, + "grad_norm": 3.545951424301417, + "learning_rate": 4.34822026834496e-08, + "loss": 0.2513, + "step": 4804 + }, + { + "epoch": 0.8704710144927537, + "grad_norm": 3.484189392788549, + "learning_rate": 4.336261444686001e-08, + "loss": 0.2346, + "step": 4805 + }, + { + "epoch": 0.8706521739130435, + "grad_norm": 6.097123904837969, + "learning_rate": 4.324318343268585e-08, + "loss": 0.3472, + "step": 4806 + }, + { + "epoch": 0.8708333333333333, + "grad_norm": 4.026122015491682, + "learning_rate": 4.312390968204782e-08, + "loss": 0.2438, + "step": 4807 + }, + { + "epoch": 0.8710144927536232, + "grad_norm": 4.471671504660214, + "learning_rate": 4.3004793236012414e-08, + "loss": 0.2803, + "step": 4808 + }, + { + "epoch": 0.871195652173913, + "grad_norm": 5.533011614134167, + "learning_rate": 4.2885834135591933e-08, + "loss": 0.3653, + "step": 4809 + }, + { + "epoch": 0.8713768115942029, + "grad_norm": 5.921194508936267, + "learning_rate": 4.2767032421744565e-08, + "loss": 0.2462, + "step": 4810 + }, + { + "epoch": 0.8715579710144927, + "grad_norm": 7.571181168152904, + "learning_rate": 4.264838813537447e-08, + "loss": 0.2946, + "step": 4811 + }, + { + "epoch": 0.8717391304347826, + "grad_norm": 6.735517852130179, + "learning_rate": 4.252990131733136e-08, + "loss": 0.2751, + "step": 4812 + }, + { + "epoch": 0.8719202898550724, + "grad_norm": 6.10618697706934, + "learning_rate": 4.2411572008410665e-08, + "loss": 0.2744, + "step": 4813 + }, + { + "epoch": 0.8721014492753624, + "grad_norm": 5.375193949200026, + "learning_rate": 4.2293400249353737e-08, + "loss": 0.2868, + "step": 4814 + }, + { + "epoch": 0.8722826086956522, + "grad_norm": 4.562098702692266, + "learning_rate": 4.217538608084792e-08, + "loss": 0.3376, + "step": 4815 + }, + { + "epoch": 0.8724637681159421, + "grad_norm": 4.0965578662631446, + "learning_rate": 4.205752954352581e-08, + "loss": 0.2745, + "step": 4816 + }, + { + "epoch": 0.8726449275362319, + "grad_norm": 11.553300530492404, + "learning_rate": 4.193983067796608e-08, + "loss": 0.2716, + "step": 4817 + }, + { + "epoch": 0.8728260869565218, + "grad_norm": 4.9190342539145195, + "learning_rate": 4.182228952469302e-08, + "loss": 0.3053, + "step": 4818 + }, + { + "epoch": 0.8730072463768116, + "grad_norm": 7.173060887853867, + "learning_rate": 4.1704906124176544e-08, + "loss": 0.2352, + "step": 4819 + }, + { + "epoch": 0.8731884057971014, + "grad_norm": 3.4336738510071534, + "learning_rate": 4.158768051683231e-08, + "loss": 0.2391, + "step": 4820 + }, + { + "epoch": 0.8733695652173913, + "grad_norm": 5.013687030619245, + "learning_rate": 4.147061274302172e-08, + "loss": 0.2604, + "step": 4821 + }, + { + "epoch": 0.8735507246376811, + "grad_norm": 4.967742778617009, + "learning_rate": 4.135370284305162e-08, + "loss": 0.2849, + "step": 4822 + }, + { + "epoch": 0.873731884057971, + "grad_norm": 3.87426816123235, + "learning_rate": 4.123695085717488e-08, + "loss": 0.2939, + "step": 4823 + }, + { + "epoch": 0.8739130434782608, + "grad_norm": 4.443053351641189, + "learning_rate": 4.112035682558956e-08, + "loss": 0.2355, + "step": 4824 + }, + { + "epoch": 0.8740942028985508, + "grad_norm": 8.136016356071698, + "learning_rate": 4.100392078843962e-08, + "loss": 0.3054, + "step": 4825 + }, + { + "epoch": 0.8742753623188406, + "grad_norm": 5.446284062484525, + "learning_rate": 4.0887642785814416e-08, + "loss": 0.3049, + "step": 4826 + }, + { + "epoch": 0.8744565217391305, + "grad_norm": 3.972469992526076, + "learning_rate": 4.077152285774921e-08, + "loss": 0.2898, + "step": 4827 + }, + { + "epoch": 0.8746376811594203, + "grad_norm": 3.8042995471976133, + "learning_rate": 4.0655561044224575e-08, + "loss": 0.2824, + "step": 4828 + }, + { + "epoch": 0.8748188405797102, + "grad_norm": 4.462456674296698, + "learning_rate": 4.053975738516674e-08, + "loss": 0.3095, + "step": 4829 + }, + { + "epoch": 0.875, + "grad_norm": 3.6003349523794435, + "learning_rate": 4.0424111920447256e-08, + "loss": 0.2736, + "step": 4830 + }, + { + "epoch": 0.8751811594202898, + "grad_norm": 5.293492608681053, + "learning_rate": 4.0308624689883665e-08, + "loss": 0.3599, + "step": 4831 + }, + { + "epoch": 0.8753623188405797, + "grad_norm": 4.559790756344651, + "learning_rate": 4.0193295733238597e-08, + "loss": 0.3252, + "step": 4832 + }, + { + "epoch": 0.8755434782608695, + "grad_norm": 5.825058697053931, + "learning_rate": 4.007812509022046e-08, + "loss": 0.3034, + "step": 4833 + }, + { + "epoch": 0.8757246376811594, + "grad_norm": 7.078450517220368, + "learning_rate": 3.9963112800483035e-08, + "loss": 0.3009, + "step": 4834 + }, + { + "epoch": 0.8759057971014492, + "grad_norm": 3.5727324426677285, + "learning_rate": 3.984825890362553e-08, + "loss": 0.209, + "step": 4835 + }, + { + "epoch": 0.8760869565217392, + "grad_norm": 5.363655439938031, + "learning_rate": 3.973356343919271e-08, + "loss": 0.3563, + "step": 4836 + }, + { + "epoch": 0.876268115942029, + "grad_norm": 3.872962487747958, + "learning_rate": 3.9619026446674865e-08, + "loss": 0.2441, + "step": 4837 + }, + { + "epoch": 0.8764492753623189, + "grad_norm": 3.6838392317210786, + "learning_rate": 3.95046479655074e-08, + "loss": 0.2673, + "step": 4838 + }, + { + "epoch": 0.8766304347826087, + "grad_norm": 8.453286777806182, + "learning_rate": 3.93904280350717e-08, + "loss": 0.2264, + "step": 4839 + }, + { + "epoch": 0.8768115942028986, + "grad_norm": 5.844697872745354, + "learning_rate": 3.927636669469392e-08, + "loss": 0.2577, + "step": 4840 + }, + { + "epoch": 0.8769927536231884, + "grad_norm": 3.6777999015594083, + "learning_rate": 3.916246398364609e-08, + "loss": 0.2621, + "step": 4841 + }, + { + "epoch": 0.8771739130434782, + "grad_norm": 6.001265432192033, + "learning_rate": 3.9048719941145283e-08, + "loss": 0.309, + "step": 4842 + }, + { + "epoch": 0.8773550724637681, + "grad_norm": 6.462604414134679, + "learning_rate": 3.893513460635434e-08, + "loss": 0.3062, + "step": 4843 + }, + { + "epoch": 0.8775362318840579, + "grad_norm": 3.4540402710718636, + "learning_rate": 3.882170801838114e-08, + "loss": 0.2057, + "step": 4844 + }, + { + "epoch": 0.8777173913043478, + "grad_norm": 4.842282418840915, + "learning_rate": 3.870844021627895e-08, + "loss": 0.2336, + "step": 4845 + }, + { + "epoch": 0.8778985507246376, + "grad_norm": 3.7689894636285373, + "learning_rate": 3.85953312390464e-08, + "loss": 0.2488, + "step": 4846 + }, + { + "epoch": 0.8780797101449276, + "grad_norm": 7.61909799344492, + "learning_rate": 3.848238112562757e-08, + "loss": 0.3136, + "step": 4847 + }, + { + "epoch": 0.8782608695652174, + "grad_norm": 4.224094208865548, + "learning_rate": 3.836958991491157e-08, + "loss": 0.2879, + "step": 4848 + }, + { + "epoch": 0.8784420289855073, + "grad_norm": 6.87142070956398, + "learning_rate": 3.825695764573306e-08, + "loss": 0.2994, + "step": 4849 + }, + { + "epoch": 0.8786231884057971, + "grad_norm": 6.84381033527655, + "learning_rate": 3.8144484356871785e-08, + "loss": 0.2286, + "step": 4850 + }, + { + "epoch": 0.878804347826087, + "grad_norm": 5.619098974696006, + "learning_rate": 3.803217008705289e-08, + "loss": 0.2885, + "step": 4851 + }, + { + "epoch": 0.8789855072463768, + "grad_norm": 4.279187710623164, + "learning_rate": 3.792001487494667e-08, + "loss": 0.3192, + "step": 4852 + }, + { + "epoch": 0.8791666666666667, + "grad_norm": 6.9355359843170685, + "learning_rate": 3.7808018759168614e-08, + "loss": 0.2565, + "step": 4853 + }, + { + "epoch": 0.8793478260869565, + "grad_norm": 6.059950578905525, + "learning_rate": 3.769618177827971e-08, + "loss": 0.2973, + "step": 4854 + }, + { + "epoch": 0.8795289855072463, + "grad_norm": 4.691597441259821, + "learning_rate": 3.758450397078594e-08, + "loss": 0.3328, + "step": 4855 + }, + { + "epoch": 0.8797101449275362, + "grad_norm": 4.2106759387131305, + "learning_rate": 3.7472985375138325e-08, + "loss": 0.2847, + "step": 4856 + }, + { + "epoch": 0.8798913043478261, + "grad_norm": 9.681248745490038, + "learning_rate": 3.736162602973325e-08, + "loss": 0.3246, + "step": 4857 + }, + { + "epoch": 0.880072463768116, + "grad_norm": 5.34928426402028, + "learning_rate": 3.7250425972912505e-08, + "loss": 0.3119, + "step": 4858 + }, + { + "epoch": 0.8802536231884058, + "grad_norm": 7.115363303078974, + "learning_rate": 3.713938524296256e-08, + "loss": 0.2783, + "step": 4859 + }, + { + "epoch": 0.8804347826086957, + "grad_norm": 3.9243422982401293, + "learning_rate": 3.702850387811546e-08, + "loss": 0.2657, + "step": 4860 + }, + { + "epoch": 0.8806159420289855, + "grad_norm": 3.662297261711184, + "learning_rate": 3.691778191654799e-08, + "loss": 0.2921, + "step": 4861 + }, + { + "epoch": 0.8807971014492754, + "grad_norm": 7.683679150255554, + "learning_rate": 3.680721939638237e-08, + "loss": 0.2434, + "step": 4862 + }, + { + "epoch": 0.8809782608695652, + "grad_norm": 5.255328396636565, + "learning_rate": 3.669681635568578e-08, + "loss": 0.2537, + "step": 4863 + }, + { + "epoch": 0.881159420289855, + "grad_norm": 5.3053784017465775, + "learning_rate": 3.6586572832470464e-08, + "loss": 0.2754, + "step": 4864 + }, + { + "epoch": 0.8813405797101449, + "grad_norm": 5.288496246351217, + "learning_rate": 3.647648886469379e-08, + "loss": 0.2188, + "step": 4865 + }, + { + "epoch": 0.8815217391304347, + "grad_norm": 5.606372318197591, + "learning_rate": 3.636656449025832e-08, + "loss": 0.2742, + "step": 4866 + }, + { + "epoch": 0.8817028985507246, + "grad_norm": 4.892399152430679, + "learning_rate": 3.62567997470114e-08, + "loss": 0.2994, + "step": 4867 + }, + { + "epoch": 0.8818840579710145, + "grad_norm": 3.807216904071993, + "learning_rate": 3.614719467274557e-08, + "loss": 0.2669, + "step": 4868 + }, + { + "epoch": 0.8820652173913044, + "grad_norm": 4.5879566164909695, + "learning_rate": 3.6037749305198317e-08, + "loss": 0.2427, + "step": 4869 + }, + { + "epoch": 0.8822463768115942, + "grad_norm": 4.618097294151533, + "learning_rate": 3.592846368205238e-08, + "loss": 0.2825, + "step": 4870 + }, + { + "epoch": 0.8824275362318841, + "grad_norm": 4.708085248529916, + "learning_rate": 3.581933784093516e-08, + "loss": 0.2697, + "step": 4871 + }, + { + "epoch": 0.8826086956521739, + "grad_norm": 4.412138029641981, + "learning_rate": 3.5710371819419385e-08, + "loss": 0.2245, + "step": 4872 + }, + { + "epoch": 0.8827898550724638, + "grad_norm": 3.2617556890915003, + "learning_rate": 3.560156565502226e-08, + "loss": 0.2177, + "step": 4873 + }, + { + "epoch": 0.8829710144927536, + "grad_norm": 7.6055981263454004, + "learning_rate": 3.5492919385206546e-08, + "loss": 0.2711, + "step": 4874 + }, + { + "epoch": 0.8831521739130435, + "grad_norm": 5.815422259527724, + "learning_rate": 3.538443304737954e-08, + "loss": 0.2972, + "step": 4875 + }, + { + "epoch": 0.8833333333333333, + "grad_norm": 4.712542839093594, + "learning_rate": 3.5276106678893637e-08, + "loss": 0.2962, + "step": 4876 + }, + { + "epoch": 0.8835144927536231, + "grad_norm": 4.0208079557661, + "learning_rate": 3.516794031704612e-08, + "loss": 0.2825, + "step": 4877 + }, + { + "epoch": 0.883695652173913, + "grad_norm": 6.862615966791953, + "learning_rate": 3.505993399907919e-08, + "loss": 0.2507, + "step": 4878 + }, + { + "epoch": 0.883876811594203, + "grad_norm": 3.412905342786199, + "learning_rate": 3.495208776217989e-08, + "loss": 0.2336, + "step": 4879 + }, + { + "epoch": 0.8840579710144928, + "grad_norm": 5.4957707027751805, + "learning_rate": 3.484440164348018e-08, + "loss": 0.2782, + "step": 4880 + }, + { + "epoch": 0.8842391304347826, + "grad_norm": 3.813333636561732, + "learning_rate": 3.473687568005696e-08, + "loss": 0.2486, + "step": 4881 + }, + { + "epoch": 0.8844202898550725, + "grad_norm": 4.6979226299210035, + "learning_rate": 3.462950990893199e-08, + "loss": 0.2738, + "step": 4882 + }, + { + "epoch": 0.8846014492753623, + "grad_norm": 4.958394268257525, + "learning_rate": 3.452230436707171e-08, + "loss": 0.2748, + "step": 4883 + }, + { + "epoch": 0.8847826086956522, + "grad_norm": 8.42221285391293, + "learning_rate": 3.441525909138737e-08, + "loss": 0.2933, + "step": 4884 + }, + { + "epoch": 0.884963768115942, + "grad_norm": 4.762954284955818, + "learning_rate": 3.4308374118735436e-08, + "loss": 0.2499, + "step": 4885 + }, + { + "epoch": 0.8851449275362319, + "grad_norm": 5.154770682313216, + "learning_rate": 3.420164948591675e-08, + "loss": 0.2362, + "step": 4886 + }, + { + "epoch": 0.8853260869565217, + "grad_norm": 4.491915351344419, + "learning_rate": 3.409508522967719e-08, + "loss": 0.2784, + "step": 4887 + }, + { + "epoch": 0.8855072463768116, + "grad_norm": 10.463644419930143, + "learning_rate": 3.398868138670724e-08, + "loss": 0.2869, + "step": 4888 + }, + { + "epoch": 0.8856884057971014, + "grad_norm": 3.322646623941776, + "learning_rate": 3.388243799364232e-08, + "loss": 0.23, + "step": 4889 + }, + { + "epoch": 0.8858695652173914, + "grad_norm": 4.972555851864783, + "learning_rate": 3.37763550870625e-08, + "loss": 0.3171, + "step": 4890 + }, + { + "epoch": 0.8860507246376812, + "grad_norm": 5.429904523745769, + "learning_rate": 3.367043270349262e-08, + "loss": 0.2591, + "step": 4891 + }, + { + "epoch": 0.886231884057971, + "grad_norm": 3.558324423323521, + "learning_rate": 3.3564670879402236e-08, + "loss": 0.245, + "step": 4892 + }, + { + "epoch": 0.8864130434782609, + "grad_norm": 7.184105641762762, + "learning_rate": 3.345906965120582e-08, + "loss": 0.2751, + "step": 4893 + }, + { + "epoch": 0.8865942028985507, + "grad_norm": 5.618966803832959, + "learning_rate": 3.3353629055262176e-08, + "loss": 0.2921, + "step": 4894 + }, + { + "epoch": 0.8867753623188406, + "grad_norm": 4.009092763053742, + "learning_rate": 3.324834912787505e-08, + "loss": 0.2785, + "step": 4895 + }, + { + "epoch": 0.8869565217391304, + "grad_norm": 4.367515994731803, + "learning_rate": 3.314322990529278e-08, + "loss": 0.2892, + "step": 4896 + }, + { + "epoch": 0.8871376811594203, + "grad_norm": 4.633884973302452, + "learning_rate": 3.3038271423708517e-08, + "loss": 0.246, + "step": 4897 + }, + { + "epoch": 0.8873188405797101, + "grad_norm": 6.72248643922981, + "learning_rate": 3.2933473719259976e-08, + "loss": 0.2435, + "step": 4898 + }, + { + "epoch": 0.8875, + "grad_norm": 4.839274439927255, + "learning_rate": 3.282883682802945e-08, + "loss": 0.2688, + "step": 4899 + }, + { + "epoch": 0.8876811594202898, + "grad_norm": 5.282350163828013, + "learning_rate": 3.2724360786043794e-08, + "loss": 0.3119, + "step": 4900 + }, + { + "epoch": 0.8876811594202898, + "eval_loss": 0.2679218649864197, + "eval_runtime": 9.8305, + "eval_samples_per_second": 50.862, + "eval_steps_per_second": 0.102, + "step": 4900 + }, + { + "epoch": 0.8878623188405798, + "grad_norm": 4.423994719852854, + "learning_rate": 3.262004562927473e-08, + "loss": 0.2878, + "step": 4901 + }, + { + "epoch": 0.8880434782608696, + "grad_norm": 8.32233694874011, + "learning_rate": 3.251589139363853e-08, + "loss": 0.3172, + "step": 4902 + }, + { + "epoch": 0.8882246376811594, + "grad_norm": 3.8348645691430967, + "learning_rate": 3.241189811499584e-08, + "loss": 0.2498, + "step": 4903 + }, + { + "epoch": 0.8884057971014493, + "grad_norm": 3.5383964242941133, + "learning_rate": 3.230806582915213e-08, + "loss": 0.2691, + "step": 4904 + }, + { + "epoch": 0.8885869565217391, + "grad_norm": 4.283264010759309, + "learning_rate": 3.220439457185736e-08, + "loss": 0.28, + "step": 4905 + }, + { + "epoch": 0.888768115942029, + "grad_norm": 6.662747446368292, + "learning_rate": 3.210088437880598e-08, + "loss": 0.2807, + "step": 4906 + }, + { + "epoch": 0.8889492753623188, + "grad_norm": 5.462035865433335, + "learning_rate": 3.199753528563703e-08, + "loss": 0.3171, + "step": 4907 + }, + { + "epoch": 0.8891304347826087, + "grad_norm": 5.120504887617094, + "learning_rate": 3.1894347327934115e-08, + "loss": 0.2391, + "step": 4908 + }, + { + "epoch": 0.8893115942028985, + "grad_norm": 5.2963822327425545, + "learning_rate": 3.1791320541225465e-08, + "loss": 0.2938, + "step": 4909 + }, + { + "epoch": 0.8894927536231884, + "grad_norm": 4.074254359372084, + "learning_rate": 3.1688454960983434e-08, + "loss": 0.2362, + "step": 4910 + }, + { + "epoch": 0.8896739130434783, + "grad_norm": 6.002395734156921, + "learning_rate": 3.158575062262536e-08, + "loss": 0.2568, + "step": 4911 + }, + { + "epoch": 0.8898550724637682, + "grad_norm": 4.221303060480557, + "learning_rate": 3.148320756151263e-08, + "loss": 0.2667, + "step": 4912 + }, + { + "epoch": 0.890036231884058, + "grad_norm": 4.9094016326471115, + "learning_rate": 3.138082581295149e-08, + "loss": 0.246, + "step": 4913 + }, + { + "epoch": 0.8902173913043478, + "grad_norm": 9.638882921892828, + "learning_rate": 3.1278605412192346e-08, + "loss": 0.2878, + "step": 4914 + }, + { + "epoch": 0.8903985507246377, + "grad_norm": 7.482394468200364, + "learning_rate": 3.117654639443024e-08, + "loss": 0.29, + "step": 4915 + }, + { + "epoch": 0.8905797101449275, + "grad_norm": 4.330414179407386, + "learning_rate": 3.107464879480454e-08, + "loss": 0.2974, + "step": 4916 + }, + { + "epoch": 0.8907608695652174, + "grad_norm": 7.0903282291563485, + "learning_rate": 3.0972912648399076e-08, + "loss": 0.2452, + "step": 4917 + }, + { + "epoch": 0.8909420289855072, + "grad_norm": 3.5933345467530304, + "learning_rate": 3.087133799024211e-08, + "loss": 0.272, + "step": 4918 + }, + { + "epoch": 0.8911231884057971, + "grad_norm": 8.770212444775352, + "learning_rate": 3.0769924855306215e-08, + "loss": 0.2745, + "step": 4919 + }, + { + "epoch": 0.8913043478260869, + "grad_norm": 4.159004302858452, + "learning_rate": 3.066867327850847e-08, + "loss": 0.2747, + "step": 4920 + }, + { + "epoch": 0.8914855072463768, + "grad_norm": 3.973331775126609, + "learning_rate": 3.0567583294710267e-08, + "loss": 0.3397, + "step": 4921 + }, + { + "epoch": 0.8916666666666667, + "grad_norm": 3.567657693090071, + "learning_rate": 3.046665493871736e-08, + "loss": 0.2499, + "step": 4922 + }, + { + "epoch": 0.8918478260869566, + "grad_norm": 5.520179583929899, + "learning_rate": 3.036588824527975e-08, + "loss": 0.3479, + "step": 4923 + }, + { + "epoch": 0.8920289855072464, + "grad_norm": 5.060652974172302, + "learning_rate": 3.0265283249092055e-08, + "loss": 0.3237, + "step": 4924 + }, + { + "epoch": 0.8922101449275363, + "grad_norm": 6.761030305735268, + "learning_rate": 3.016483998479308e-08, + "loss": 0.2614, + "step": 4925 + }, + { + "epoch": 0.8923913043478261, + "grad_norm": 6.2654758109735, + "learning_rate": 3.006455848696576e-08, + "loss": 0.2674, + "step": 4926 + }, + { + "epoch": 0.8925724637681159, + "grad_norm": 8.002422344419415, + "learning_rate": 2.9964438790137437e-08, + "loss": 0.2798, + "step": 4927 + }, + { + "epoch": 0.8927536231884058, + "grad_norm": 5.857452216529343, + "learning_rate": 2.986448092878002e-08, + "loss": 0.3051, + "step": 4928 + }, + { + "epoch": 0.8929347826086956, + "grad_norm": 3.2551135069878083, + "learning_rate": 2.9764684937309382e-08, + "loss": 0.2422, + "step": 4929 + }, + { + "epoch": 0.8931159420289855, + "grad_norm": 6.007800271505982, + "learning_rate": 2.9665050850085694e-08, + "loss": 0.2614, + "step": 4930 + }, + { + "epoch": 0.8932971014492753, + "grad_norm": 3.805841564633844, + "learning_rate": 2.9565578701413517e-08, + "loss": 0.2437, + "step": 4931 + }, + { + "epoch": 0.8934782608695652, + "grad_norm": 6.717074900683175, + "learning_rate": 2.9466268525541572e-08, + "loss": 0.2426, + "step": 4932 + }, + { + "epoch": 0.8936594202898551, + "grad_norm": 5.522394967042641, + "learning_rate": 2.9367120356662776e-08, + "loss": 0.3172, + "step": 4933 + }, + { + "epoch": 0.893840579710145, + "grad_norm": 4.4627192239062, + "learning_rate": 2.9268134228914387e-08, + "loss": 0.2798, + "step": 4934 + }, + { + "epoch": 0.8940217391304348, + "grad_norm": 8.960653802495912, + "learning_rate": 2.916931017637769e-08, + "loss": 0.3389, + "step": 4935 + }, + { + "epoch": 0.8942028985507247, + "grad_norm": 5.2878398968896185, + "learning_rate": 2.9070648233078476e-08, + "loss": 0.2716, + "step": 4936 + }, + { + "epoch": 0.8943840579710145, + "grad_norm": 3.411084187465577, + "learning_rate": 2.8972148432986286e-08, + "loss": 0.2378, + "step": 4937 + }, + { + "epoch": 0.8945652173913043, + "grad_norm": 6.973036498122388, + "learning_rate": 2.8873810810015166e-08, + "loss": 0.3407, + "step": 4938 + }, + { + "epoch": 0.8947463768115942, + "grad_norm": 4.404505645813442, + "learning_rate": 2.877563539802319e-08, + "loss": 0.2634, + "step": 4939 + }, + { + "epoch": 0.894927536231884, + "grad_norm": 4.807986854374034, + "learning_rate": 2.8677622230812715e-08, + "loss": 0.3073, + "step": 4940 + }, + { + "epoch": 0.8951086956521739, + "grad_norm": 3.7893067280954384, + "learning_rate": 2.8579771342130023e-08, + "loss": 0.2964, + "step": 4941 + }, + { + "epoch": 0.8952898550724637, + "grad_norm": 7.07325661497837, + "learning_rate": 2.8482082765665827e-08, + "loss": 0.2997, + "step": 4942 + }, + { + "epoch": 0.8954710144927536, + "grad_norm": 9.4814809879375, + "learning_rate": 2.8384556535054503e-08, + "loss": 0.2734, + "step": 4943 + }, + { + "epoch": 0.8956521739130435, + "grad_norm": 3.315429785228442, + "learning_rate": 2.828719268387497e-08, + "loss": 0.2289, + "step": 4944 + }, + { + "epoch": 0.8958333333333334, + "grad_norm": 4.610959759278451, + "learning_rate": 2.8189991245650024e-08, + "loss": 0.2247, + "step": 4945 + }, + { + "epoch": 0.8960144927536232, + "grad_norm": 6.368134462803115, + "learning_rate": 2.809295225384656e-08, + "loss": 0.2459, + "step": 4946 + }, + { + "epoch": 0.8961956521739131, + "grad_norm": 5.0630745725534, + "learning_rate": 2.799607574187557e-08, + "loss": 0.2957, + "step": 4947 + }, + { + "epoch": 0.8963768115942029, + "grad_norm": 6.373281658782514, + "learning_rate": 2.7899361743092098e-08, + "loss": 0.3298, + "step": 4948 + }, + { + "epoch": 0.8965579710144927, + "grad_norm": 3.842168621171289, + "learning_rate": 2.7802810290795288e-08, + "loss": 0.2625, + "step": 4949 + }, + { + "epoch": 0.8967391304347826, + "grad_norm": 4.754489989802223, + "learning_rate": 2.7706421418228098e-08, + "loss": 0.2701, + "step": 4950 + }, + { + "epoch": 0.8969202898550724, + "grad_norm": 7.130775245119226, + "learning_rate": 2.7610195158577875e-08, + "loss": 0.2725, + "step": 4951 + }, + { + "epoch": 0.8971014492753623, + "grad_norm": 4.404462866332119, + "learning_rate": 2.7514131544975783e-08, + "loss": 0.2342, + "step": 4952 + }, + { + "epoch": 0.8972826086956521, + "grad_norm": 4.014695321327235, + "learning_rate": 2.7418230610496752e-08, + "loss": 0.333, + "step": 4953 + }, + { + "epoch": 0.8974637681159421, + "grad_norm": 3.9185498671598946, + "learning_rate": 2.7322492388160034e-08, + "loss": 0.2718, + "step": 4954 + }, + { + "epoch": 0.8976449275362319, + "grad_norm": 4.045161987136994, + "learning_rate": 2.722691691092882e-08, + "loss": 0.3177, + "step": 4955 + }, + { + "epoch": 0.8978260869565218, + "grad_norm": 4.927213514518188, + "learning_rate": 2.7131504211710176e-08, + "loss": 0.2991, + "step": 4956 + }, + { + "epoch": 0.8980072463768116, + "grad_norm": 4.845381208621474, + "learning_rate": 2.7036254323355158e-08, + "loss": 0.2947, + "step": 4957 + }, + { + "epoch": 0.8981884057971015, + "grad_norm": 4.805877299472574, + "learning_rate": 2.6941167278658695e-08, + "loss": 0.254, + "step": 4958 + }, + { + "epoch": 0.8983695652173913, + "grad_norm": 4.471561337672168, + "learning_rate": 2.6846243110359766e-08, + "loss": 0.3454, + "step": 4959 + }, + { + "epoch": 0.8985507246376812, + "grad_norm": 4.605199813315009, + "learning_rate": 2.6751481851141176e-08, + "loss": 0.3346, + "step": 4960 + }, + { + "epoch": 0.898731884057971, + "grad_norm": 6.0535188107746505, + "learning_rate": 2.665688353362966e-08, + "loss": 0.3691, + "step": 4961 + }, + { + "epoch": 0.8989130434782608, + "grad_norm": 3.7750320846092156, + "learning_rate": 2.6562448190395825e-08, + "loss": 0.2136, + "step": 4962 + }, + { + "epoch": 0.8990942028985507, + "grad_norm": 7.198297028466607, + "learning_rate": 2.646817585395439e-08, + "loss": 0.257, + "step": 4963 + }, + { + "epoch": 0.8992753623188405, + "grad_norm": 6.476111737923261, + "learning_rate": 2.6374066556763607e-08, + "loss": 0.3098, + "step": 4964 + }, + { + "epoch": 0.8994565217391305, + "grad_norm": 5.0366910329858925, + "learning_rate": 2.6280120331225785e-08, + "loss": 0.241, + "step": 4965 + }, + { + "epoch": 0.8996376811594203, + "grad_norm": 4.470253039260638, + "learning_rate": 2.6186337209687048e-08, + "loss": 0.2919, + "step": 4966 + }, + { + "epoch": 0.8998188405797102, + "grad_norm": 3.880286171090008, + "learning_rate": 2.60927172244374e-08, + "loss": 0.2949, + "step": 4967 + }, + { + "epoch": 0.9, + "grad_norm": 4.514510867495617, + "learning_rate": 2.5999260407710667e-08, + "loss": 0.297, + "step": 4968 + }, + { + "epoch": 0.9001811594202899, + "grad_norm": 6.161267685873009, + "learning_rate": 2.5905966791684442e-08, + "loss": 0.306, + "step": 4969 + }, + { + "epoch": 0.9003623188405797, + "grad_norm": 4.762647472063687, + "learning_rate": 2.581283640848009e-08, + "loss": 0.3131, + "step": 4970 + }, + { + "epoch": 0.9005434782608696, + "grad_norm": 4.418157380169589, + "learning_rate": 2.5719869290162954e-08, + "loss": 0.2644, + "step": 4971 + }, + { + "epoch": 0.9007246376811594, + "grad_norm": 8.689256920679696, + "learning_rate": 2.5627065468741994e-08, + "loss": 0.3402, + "step": 4972 + }, + { + "epoch": 0.9009057971014492, + "grad_norm": 10.44592708668007, + "learning_rate": 2.5534424976169977e-08, + "loss": 0.285, + "step": 4973 + }, + { + "epoch": 0.9010869565217391, + "grad_norm": 9.902636234972858, + "learning_rate": 2.5441947844343558e-08, + "loss": 0.2535, + "step": 4974 + }, + { + "epoch": 0.9012681159420289, + "grad_norm": 3.7938542421793806, + "learning_rate": 2.534963410510299e-08, + "loss": 0.2135, + "step": 4975 + }, + { + "epoch": 0.9014492753623189, + "grad_norm": 4.875091412007716, + "learning_rate": 2.5257483790232348e-08, + "loss": 0.3093, + "step": 4976 + }, + { + "epoch": 0.9016304347826087, + "grad_norm": 3.819686483849816, + "learning_rate": 2.5165496931459418e-08, + "loss": 0.2527, + "step": 4977 + }, + { + "epoch": 0.9018115942028986, + "grad_norm": 3.842126850766311, + "learning_rate": 2.5073673560455643e-08, + "loss": 0.2332, + "step": 4978 + }, + { + "epoch": 0.9019927536231884, + "grad_norm": 5.73557598392169, + "learning_rate": 2.4982013708836457e-08, + "loss": 0.2531, + "step": 4979 + }, + { + "epoch": 0.9021739130434783, + "grad_norm": 12.47029981212199, + "learning_rate": 2.4890517408160507e-08, + "loss": 0.2736, + "step": 4980 + }, + { + "epoch": 0.9023550724637681, + "grad_norm": 5.459747676317857, + "learning_rate": 2.4799184689930585e-08, + "loss": 0.2807, + "step": 4981 + }, + { + "epoch": 0.902536231884058, + "grad_norm": 3.5772999663310396, + "learning_rate": 2.4708015585592824e-08, + "loss": 0.2598, + "step": 4982 + }, + { + "epoch": 0.9027173913043478, + "grad_norm": 4.544660272019451, + "learning_rate": 2.4617010126537386e-08, + "loss": 0.2944, + "step": 4983 + }, + { + "epoch": 0.9028985507246376, + "grad_norm": 4.293635149468965, + "learning_rate": 2.452616834409771e-08, + "loss": 0.3198, + "step": 4984 + }, + { + "epoch": 0.9030797101449275, + "grad_norm": 4.825322656862586, + "learning_rate": 2.4435490269551163e-08, + "loss": 0.2246, + "step": 4985 + }, + { + "epoch": 0.9032608695652173, + "grad_norm": 6.462231909797903, + "learning_rate": 2.434497593411855e-08, + "loss": 0.3259, + "step": 4986 + }, + { + "epoch": 0.9034420289855073, + "grad_norm": 6.363823868240444, + "learning_rate": 2.4254625368964442e-08, + "loss": 0.2075, + "step": 4987 + }, + { + "epoch": 0.9036231884057971, + "grad_norm": 6.283403785336436, + "learning_rate": 2.416443860519696e-08, + "loss": 0.2563, + "step": 4988 + }, + { + "epoch": 0.903804347826087, + "grad_norm": 7.870698654080862, + "learning_rate": 2.407441567386781e-08, + "loss": 0.317, + "step": 4989 + }, + { + "epoch": 0.9039855072463768, + "grad_norm": 5.638600354645924, + "learning_rate": 2.3984556605972373e-08, + "loss": 0.2985, + "step": 4990 + }, + { + "epoch": 0.9041666666666667, + "grad_norm": 4.326858378065611, + "learning_rate": 2.3894861432449454e-08, + "loss": 0.3215, + "step": 4991 + }, + { + "epoch": 0.9043478260869565, + "grad_norm": 4.86545450598657, + "learning_rate": 2.380533018418157e-08, + "loss": 0.2892, + "step": 4992 + }, + { + "epoch": 0.9045289855072464, + "grad_norm": 3.6975460578264685, + "learning_rate": 2.3715962891994733e-08, + "loss": 0.2842, + "step": 4993 + }, + { + "epoch": 0.9047101449275362, + "grad_norm": 6.772915684266903, + "learning_rate": 2.3626759586658606e-08, + "loss": 0.3147, + "step": 4994 + }, + { + "epoch": 0.904891304347826, + "grad_norm": 3.441492913350605, + "learning_rate": 2.3537720298886288e-08, + "loss": 0.2209, + "step": 4995 + }, + { + "epoch": 0.9050724637681159, + "grad_norm": 3.583978866285919, + "learning_rate": 2.3448845059334364e-08, + "loss": 0.217, + "step": 4996 + }, + { + "epoch": 0.9052536231884057, + "grad_norm": 5.09401151709153, + "learning_rate": 2.3360133898602975e-08, + "loss": 0.227, + "step": 4997 + }, + { + "epoch": 0.9054347826086957, + "grad_norm": 5.19084400479948, + "learning_rate": 2.3271586847235903e-08, + "loss": 0.276, + "step": 4998 + }, + { + "epoch": 0.9056159420289855, + "grad_norm": 6.008678684248983, + "learning_rate": 2.3183203935720263e-08, + "loss": 0.3193, + "step": 4999 + }, + { + "epoch": 0.9057971014492754, + "grad_norm": 6.562682545796406, + "learning_rate": 2.3094985194486717e-08, + "loss": 0.3159, + "step": 5000 + }, + { + "epoch": 0.9057971014492754, + "eval_loss": 0.26676562428474426, + "eval_runtime": 9.817, + "eval_samples_per_second": 50.932, + "eval_steps_per_second": 0.102, + "step": 5000 + }, + { + "epoch": 0.9059782608695652, + "grad_norm": 3.229423738318707, + "learning_rate": 2.3006930653909405e-08, + "loss": 0.24, + "step": 5001 + }, + { + "epoch": 0.9061594202898551, + "grad_norm": 4.6667148214844945, + "learning_rate": 2.291904034430586e-08, + "loss": 0.2508, + "step": 5002 + }, + { + "epoch": 0.9063405797101449, + "grad_norm": 13.05844569406825, + "learning_rate": 2.283131429593721e-08, + "loss": 0.281, + "step": 5003 + }, + { + "epoch": 0.9065217391304348, + "grad_norm": 5.216466468381175, + "learning_rate": 2.2743752539007956e-08, + "loss": 0.2722, + "step": 5004 + }, + { + "epoch": 0.9067028985507246, + "grad_norm": 3.6511003886628357, + "learning_rate": 2.265635510366587e-08, + "loss": 0.264, + "step": 5005 + }, + { + "epoch": 0.9068840579710145, + "grad_norm": 6.532822920264964, + "learning_rate": 2.2569122020002608e-08, + "loss": 0.2844, + "step": 5006 + }, + { + "epoch": 0.9070652173913043, + "grad_norm": 4.279531197117882, + "learning_rate": 2.248205331805264e-08, + "loss": 0.3023, + "step": 5007 + }, + { + "epoch": 0.9072463768115943, + "grad_norm": 3.6591732478307035, + "learning_rate": 2.239514902779427e-08, + "loss": 0.2346, + "step": 5008 + }, + { + "epoch": 0.9074275362318841, + "grad_norm": 4.509440199602259, + "learning_rate": 2.2308409179148946e-08, + "loss": 0.2748, + "step": 5009 + }, + { + "epoch": 0.907608695652174, + "grad_norm": 3.7171940871450784, + "learning_rate": 2.222183380198178e-08, + "loss": 0.2786, + "step": 5010 + }, + { + "epoch": 0.9077898550724638, + "grad_norm": 3.876903831363149, + "learning_rate": 2.213542292610099e-08, + "loss": 0.2753, + "step": 5011 + }, + { + "epoch": 0.9079710144927536, + "grad_norm": 5.446000205318678, + "learning_rate": 2.2049176581258266e-08, + "loss": 0.3321, + "step": 5012 + }, + { + "epoch": 0.9081521739130435, + "grad_norm": 4.296970278371665, + "learning_rate": 2.1963094797148586e-08, + "loss": 0.2487, + "step": 5013 + }, + { + "epoch": 0.9083333333333333, + "grad_norm": 5.545879698786914, + "learning_rate": 2.1877177603410345e-08, + "loss": 0.2685, + "step": 5014 + }, + { + "epoch": 0.9085144927536232, + "grad_norm": 4.288822339549759, + "learning_rate": 2.1791425029625278e-08, + "loss": 0.3315, + "step": 5015 + }, + { + "epoch": 0.908695652173913, + "grad_norm": 6.126726498296205, + "learning_rate": 2.170583710531837e-08, + "loss": 0.2769, + "step": 5016 + }, + { + "epoch": 0.9088768115942029, + "grad_norm": 4.067313778600806, + "learning_rate": 2.1620413859957942e-08, + "loss": 0.2445, + "step": 5017 + }, + { + "epoch": 0.9090579710144927, + "grad_norm": 7.796096841756646, + "learning_rate": 2.1535155322955634e-08, + "loss": 0.2668, + "step": 5018 + }, + { + "epoch": 0.9092391304347827, + "grad_norm": 4.883764368102144, + "learning_rate": 2.145006152366635e-08, + "loss": 0.2594, + "step": 5019 + }, + { + "epoch": 0.9094202898550725, + "grad_norm": 4.576816422415214, + "learning_rate": 2.1365132491388326e-08, + "loss": 0.274, + "step": 5020 + }, + { + "epoch": 0.9096014492753624, + "grad_norm": 4.518333094509602, + "learning_rate": 2.1280368255362944e-08, + "loss": 0.2534, + "step": 5021 + }, + { + "epoch": 0.9097826086956522, + "grad_norm": 3.6022373485406947, + "learning_rate": 2.119576884477514e-08, + "loss": 0.2592, + "step": 5022 + }, + { + "epoch": 0.909963768115942, + "grad_norm": 8.729549762123135, + "learning_rate": 2.1111334288752612e-08, + "loss": 0.3268, + "step": 5023 + }, + { + "epoch": 0.9101449275362319, + "grad_norm": 4.694266552021978, + "learning_rate": 2.102706461636672e-08, + "loss": 0.2753, + "step": 5024 + }, + { + "epoch": 0.9103260869565217, + "grad_norm": 3.953792605102031, + "learning_rate": 2.0942959856631913e-08, + "loss": 0.2793, + "step": 5025 + }, + { + "epoch": 0.9105072463768116, + "grad_norm": 3.799497583559223, + "learning_rate": 2.085902003850587e-08, + "loss": 0.2816, + "step": 5026 + }, + { + "epoch": 0.9106884057971014, + "grad_norm": 6.345808495963679, + "learning_rate": 2.0775245190889467e-08, + "loss": 0.2639, + "step": 5027 + }, + { + "epoch": 0.9108695652173913, + "grad_norm": 3.7361030805644218, + "learning_rate": 2.069163534262669e-08, + "loss": 0.3098, + "step": 5028 + }, + { + "epoch": 0.9110507246376811, + "grad_norm": 6.29533855797318, + "learning_rate": 2.0608190522504952e-08, + "loss": 0.3438, + "step": 5029 + }, + { + "epoch": 0.9112318840579711, + "grad_norm": 4.36788833095191, + "learning_rate": 2.0524910759254554e-08, + "loss": 0.3239, + "step": 5030 + }, + { + "epoch": 0.9114130434782609, + "grad_norm": 6.3593487647630536, + "learning_rate": 2.0441796081549222e-08, + "loss": 0.2438, + "step": 5031 + }, + { + "epoch": 0.9115942028985508, + "grad_norm": 3.319905367057363, + "learning_rate": 2.0358846518005624e-08, + "loss": 0.2296, + "step": 5032 + }, + { + "epoch": 0.9117753623188406, + "grad_norm": 7.6219929520052245, + "learning_rate": 2.027606209718391e-08, + "loss": 0.2351, + "step": 5033 + }, + { + "epoch": 0.9119565217391304, + "grad_norm": 3.895857690945518, + "learning_rate": 2.0193442847586893e-08, + "loss": 0.2766, + "step": 5034 + }, + { + "epoch": 0.9121376811594203, + "grad_norm": 4.117482932198145, + "learning_rate": 2.0110988797660876e-08, + "loss": 0.2654, + "step": 5035 + }, + { + "epoch": 0.9123188405797101, + "grad_norm": 5.438219935022428, + "learning_rate": 2.002869997579515e-08, + "loss": 0.2645, + "step": 5036 + }, + { + "epoch": 0.9125, + "grad_norm": 4.262290777751656, + "learning_rate": 1.9946576410322212e-08, + "loss": 0.2953, + "step": 5037 + }, + { + "epoch": 0.9126811594202898, + "grad_norm": 4.801393012434805, + "learning_rate": 1.986461812951756e-08, + "loss": 0.2266, + "step": 5038 + }, + { + "epoch": 0.9128623188405797, + "grad_norm": 3.766270719758786, + "learning_rate": 1.9782825161599903e-08, + "loss": 0.2585, + "step": 5039 + }, + { + "epoch": 0.9130434782608695, + "grad_norm": 4.615851033768331, + "learning_rate": 1.9701197534730707e-08, + "loss": 0.2607, + "step": 5040 + }, + { + "epoch": 0.9132246376811595, + "grad_norm": 3.766218146499011, + "learning_rate": 1.9619735277014937e-08, + "loss": 0.2604, + "step": 5041 + }, + { + "epoch": 0.9134057971014493, + "grad_norm": 5.399669782098153, + "learning_rate": 1.9538438416500437e-08, + "loss": 0.2994, + "step": 5042 + }, + { + "epoch": 0.9135869565217392, + "grad_norm": 6.038002054653043, + "learning_rate": 1.945730698117809e-08, + "loss": 0.3291, + "step": 5043 + }, + { + "epoch": 0.913768115942029, + "grad_norm": 5.018315155132307, + "learning_rate": 1.937634099898172e-08, + "loss": 0.3091, + "step": 5044 + }, + { + "epoch": 0.9139492753623188, + "grad_norm": 4.756131607605878, + "learning_rate": 1.9295540497788477e-08, + "loss": 0.2685, + "step": 5045 + }, + { + "epoch": 0.9141304347826087, + "grad_norm": 3.4968442666410557, + "learning_rate": 1.921490550541821e-08, + "loss": 0.2428, + "step": 5046 + }, + { + "epoch": 0.9143115942028985, + "grad_norm": 5.277557304063835, + "learning_rate": 1.913443604963405e-08, + "loss": 0.2838, + "step": 5047 + }, + { + "epoch": 0.9144927536231884, + "grad_norm": 3.8340585382415413, + "learning_rate": 1.9054132158141834e-08, + "loss": 0.2415, + "step": 5048 + }, + { + "epoch": 0.9146739130434782, + "grad_norm": 4.955260026857148, + "learning_rate": 1.8973993858590774e-08, + "loss": 0.2588, + "step": 5049 + }, + { + "epoch": 0.9148550724637681, + "grad_norm": 4.183827352373052, + "learning_rate": 1.8894021178572807e-08, + "loss": 0.3065, + "step": 5050 + }, + { + "epoch": 0.9150362318840579, + "grad_norm": 5.089752794858959, + "learning_rate": 1.8814214145622785e-08, + "loss": 0.2816, + "step": 5051 + }, + { + "epoch": 0.9152173913043479, + "grad_norm": 4.334815454747086, + "learning_rate": 1.8734572787218738e-08, + "loss": 0.2606, + "step": 5052 + }, + { + "epoch": 0.9153985507246377, + "grad_norm": 5.958979549328401, + "learning_rate": 1.8655097130781618e-08, + "loss": 0.2757, + "step": 5053 + }, + { + "epoch": 0.9155797101449276, + "grad_norm": 5.382098103782952, + "learning_rate": 1.85757872036752e-08, + "loss": 0.2805, + "step": 5054 + }, + { + "epoch": 0.9157608695652174, + "grad_norm": 8.874010798483143, + "learning_rate": 1.849664303320636e-08, + "loss": 0.2847, + "step": 5055 + }, + { + "epoch": 0.9159420289855073, + "grad_norm": 5.67527329704592, + "learning_rate": 1.8417664646624587e-08, + "loss": 0.2693, + "step": 5056 + }, + { + "epoch": 0.9161231884057971, + "grad_norm": 6.475145950974189, + "learning_rate": 1.8338852071122735e-08, + "loss": 0.3095, + "step": 5057 + }, + { + "epoch": 0.9163043478260869, + "grad_norm": 6.454985407522784, + "learning_rate": 1.8260205333836263e-08, + "loss": 0.2964, + "step": 5058 + }, + { + "epoch": 0.9164855072463768, + "grad_norm": 5.080559793921507, + "learning_rate": 1.8181724461843628e-08, + "loss": 0.2381, + "step": 5059 + }, + { + "epoch": 0.9166666666666666, + "grad_norm": 4.563514151399476, + "learning_rate": 1.810340948216621e-08, + "loss": 0.2857, + "step": 5060 + }, + { + "epoch": 0.9168478260869565, + "grad_norm": 4.355115903306195, + "learning_rate": 1.8025260421768106e-08, + "loss": 0.294, + "step": 5061 + }, + { + "epoch": 0.9170289855072464, + "grad_norm": 4.643713540221931, + "learning_rate": 1.794727730755652e-08, + "loss": 0.229, + "step": 5062 + }, + { + "epoch": 0.9172101449275363, + "grad_norm": 4.199254981250338, + "learning_rate": 1.7869460166381355e-08, + "loss": 0.3133, + "step": 5063 + }, + { + "epoch": 0.9173913043478261, + "grad_norm": 4.870083561785064, + "learning_rate": 1.779180902503541e-08, + "loss": 0.3428, + "step": 5064 + }, + { + "epoch": 0.917572463768116, + "grad_norm": 7.541688996464984, + "learning_rate": 1.7714323910254513e-08, + "loss": 0.3094, + "step": 5065 + }, + { + "epoch": 0.9177536231884058, + "grad_norm": 4.676154314433912, + "learning_rate": 1.7637004848716884e-08, + "loss": 0.2651, + "step": 5066 + }, + { + "epoch": 0.9179347826086957, + "grad_norm": 3.9416739081153005, + "learning_rate": 1.755985186704395e-08, + "loss": 0.2463, + "step": 5067 + }, + { + "epoch": 0.9181159420289855, + "grad_norm": 4.023576344996432, + "learning_rate": 1.7482864991799906e-08, + "loss": 0.3121, + "step": 5068 + }, + { + "epoch": 0.9182971014492753, + "grad_norm": 5.051948611056018, + "learning_rate": 1.7406044249491657e-08, + "loss": 0.3416, + "step": 5069 + }, + { + "epoch": 0.9184782608695652, + "grad_norm": 6.522263640665446, + "learning_rate": 1.7329389666568995e-08, + "loss": 0.2732, + "step": 5070 + }, + { + "epoch": 0.918659420289855, + "grad_norm": 6.2663681174231325, + "learning_rate": 1.725290126942436e-08, + "loss": 0.2924, + "step": 5071 + }, + { + "epoch": 0.9188405797101449, + "grad_norm": 4.6754627376987195, + "learning_rate": 1.717657908439313e-08, + "loss": 0.2655, + "step": 5072 + }, + { + "epoch": 0.9190217391304348, + "grad_norm": 7.482888573489373, + "learning_rate": 1.7100423137753395e-08, + "loss": 0.3546, + "step": 5073 + }, + { + "epoch": 0.9192028985507247, + "grad_norm": 6.048004571895161, + "learning_rate": 1.7024433455726016e-08, + "loss": 0.3192, + "step": 5074 + }, + { + "epoch": 0.9193840579710145, + "grad_norm": 4.839161050960119, + "learning_rate": 1.694861006447451e-08, + "loss": 0.2652, + "step": 5075 + }, + { + "epoch": 0.9195652173913044, + "grad_norm": 3.7481570921489067, + "learning_rate": 1.687295299010538e-08, + "loss": 0.2605, + "step": 5076 + }, + { + "epoch": 0.9197463768115942, + "grad_norm": 4.862934612934118, + "learning_rate": 1.6797462258667626e-08, + "loss": 0.2938, + "step": 5077 + }, + { + "epoch": 0.9199275362318841, + "grad_norm": 11.048120492288488, + "learning_rate": 1.6722137896153066e-08, + "loss": 0.2985, + "step": 5078 + }, + { + "epoch": 0.9201086956521739, + "grad_norm": 4.461946790516098, + "learning_rate": 1.664697992849623e-08, + "loss": 0.2741, + "step": 5079 + }, + { + "epoch": 0.9202898550724637, + "grad_norm": 4.93336853506006, + "learning_rate": 1.6571988381574364e-08, + "loss": 0.2665, + "step": 5080 + }, + { + "epoch": 0.9204710144927536, + "grad_norm": 6.847480275289616, + "learning_rate": 1.6497163281207482e-08, + "loss": 0.2283, + "step": 5081 + }, + { + "epoch": 0.9206521739130434, + "grad_norm": 3.550784214808077, + "learning_rate": 1.6422504653158198e-08, + "loss": 0.265, + "step": 5082 + }, + { + "epoch": 0.9208333333333333, + "grad_norm": 4.057937731729632, + "learning_rate": 1.6348012523131726e-08, + "loss": 0.2822, + "step": 5083 + }, + { + "epoch": 0.9210144927536232, + "grad_norm": 4.061184475967941, + "learning_rate": 1.6273686916776164e-08, + "loss": 0.2913, + "step": 5084 + }, + { + "epoch": 0.9211956521739131, + "grad_norm": 4.492671861756009, + "learning_rate": 1.6199527859682148e-08, + "loss": 0.3105, + "step": 5085 + }, + { + "epoch": 0.9213768115942029, + "grad_norm": 4.01204606905016, + "learning_rate": 1.6125535377382926e-08, + "loss": 0.1968, + "step": 5086 + }, + { + "epoch": 0.9215579710144928, + "grad_norm": 5.193794014494544, + "learning_rate": 1.6051709495354615e-08, + "loss": 0.3356, + "step": 5087 + }, + { + "epoch": 0.9217391304347826, + "grad_norm": 4.566021644013141, + "learning_rate": 1.597805023901566e-08, + "loss": 0.3008, + "step": 5088 + }, + { + "epoch": 0.9219202898550725, + "grad_norm": 8.130145227303329, + "learning_rate": 1.5904557633727334e-08, + "loss": 0.2669, + "step": 5089 + }, + { + "epoch": 0.9221014492753623, + "grad_norm": 3.736318496486843, + "learning_rate": 1.583123170479356e-08, + "loss": 0.2539, + "step": 5090 + }, + { + "epoch": 0.9222826086956522, + "grad_norm": 5.068552662849525, + "learning_rate": 1.5758072477460638e-08, + "loss": 0.3506, + "step": 5091 + }, + { + "epoch": 0.922463768115942, + "grad_norm": 3.6260657931632045, + "learning_rate": 1.5685079976917926e-08, + "loss": 0.2527, + "step": 5092 + }, + { + "epoch": 0.9226449275362318, + "grad_norm": 4.80250953741809, + "learning_rate": 1.5612254228296816e-08, + "loss": 0.2701, + "step": 5093 + }, + { + "epoch": 0.9228260869565217, + "grad_norm": 3.747919780124867, + "learning_rate": 1.553959525667159e-08, + "loss": 0.276, + "step": 5094 + }, + { + "epoch": 0.9230072463768116, + "grad_norm": 3.6454171276463168, + "learning_rate": 1.546710308705923e-08, + "loss": 0.2492, + "step": 5095 + }, + { + "epoch": 0.9231884057971015, + "grad_norm": 3.475438057973087, + "learning_rate": 1.5394777744418997e-08, + "loss": 0.2345, + "step": 5096 + }, + { + "epoch": 0.9233695652173913, + "grad_norm": 5.1466940642142, + "learning_rate": 1.5322619253652912e-08, + "loss": 0.2461, + "step": 5097 + }, + { + "epoch": 0.9235507246376812, + "grad_norm": 3.837391521056926, + "learning_rate": 1.525062763960544e-08, + "loss": 0.2072, + "step": 5098 + }, + { + "epoch": 0.923731884057971, + "grad_norm": 4.4926669852053225, + "learning_rate": 1.5178802927063693e-08, + "loss": 0.2855, + "step": 5099 + }, + { + "epoch": 0.9239130434782609, + "grad_norm": 6.664191297941805, + "learning_rate": 1.5107145140757226e-08, + "loss": 0.2456, + "step": 5100 + }, + { + "epoch": 0.9239130434782609, + "eval_loss": 0.2668437361717224, + "eval_runtime": 9.7476, + "eval_samples_per_second": 51.295, + "eval_steps_per_second": 0.103, + "step": 5100 + }, + { + "epoch": 0.9240942028985507, + "grad_norm": 5.996202108686227, + "learning_rate": 1.5035654305358192e-08, + "loss": 0.312, + "step": 5101 + }, + { + "epoch": 0.9242753623188406, + "grad_norm": 4.11531538231226, + "learning_rate": 1.496433044548112e-08, + "loss": 0.2408, + "step": 5102 + }, + { + "epoch": 0.9244565217391304, + "grad_norm": 4.59986698816013, + "learning_rate": 1.4893173585683261e-08, + "loss": 0.2579, + "step": 5103 + }, + { + "epoch": 0.9246376811594202, + "grad_norm": 6.818235046598892, + "learning_rate": 1.4822183750464234e-08, + "loss": 0.313, + "step": 5104 + }, + { + "epoch": 0.9248188405797102, + "grad_norm": 4.956228855141052, + "learning_rate": 1.4751360964266157e-08, + "loss": 0.292, + "step": 5105 + }, + { + "epoch": 0.925, + "grad_norm": 5.021762857267433, + "learning_rate": 1.468070525147358e-08, + "loss": 0.2688, + "step": 5106 + }, + { + "epoch": 0.9251811594202899, + "grad_norm": 3.765696568104086, + "learning_rate": 1.4610216636413764e-08, + "loss": 0.2701, + "step": 5107 + }, + { + "epoch": 0.9253623188405797, + "grad_norm": 8.847734337942263, + "learning_rate": 1.4539895143356185e-08, + "loss": 0.3277, + "step": 5108 + }, + { + "epoch": 0.9255434782608696, + "grad_norm": 5.898594802793108, + "learning_rate": 1.4469740796512863e-08, + "loss": 0.2574, + "step": 5109 + }, + { + "epoch": 0.9257246376811594, + "grad_norm": 7.290441196622495, + "learning_rate": 1.4399753620038201e-08, + "loss": 0.272, + "step": 5110 + }, + { + "epoch": 0.9259057971014493, + "grad_norm": 6.587318004773873, + "learning_rate": 1.4329933638029257e-08, + "loss": 0.3222, + "step": 5111 + }, + { + "epoch": 0.9260869565217391, + "grad_norm": 5.234551020115269, + "learning_rate": 1.4260280874525299e-08, + "loss": 0.2845, + "step": 5112 + }, + { + "epoch": 0.926268115942029, + "grad_norm": 5.325414020813716, + "learning_rate": 1.4190795353508145e-08, + "loss": 0.2658, + "step": 5113 + }, + { + "epoch": 0.9264492753623188, + "grad_norm": 4.241550296385165, + "learning_rate": 1.4121477098901935e-08, + "loss": 0.2872, + "step": 5114 + }, + { + "epoch": 0.9266304347826086, + "grad_norm": 4.018722430337241, + "learning_rate": 1.40523261345733e-08, + "loss": 0.2512, + "step": 5115 + }, + { + "epoch": 0.9268115942028986, + "grad_norm": 4.691758106098034, + "learning_rate": 1.398334248433125e-08, + "loss": 0.2219, + "step": 5116 + }, + { + "epoch": 0.9269927536231884, + "grad_norm": 5.447034736047103, + "learning_rate": 1.3914526171927176e-08, + "loss": 0.3033, + "step": 5117 + }, + { + "epoch": 0.9271739130434783, + "grad_norm": 8.689800691959029, + "learning_rate": 1.3845877221054792e-08, + "loss": 0.3181, + "step": 5118 + }, + { + "epoch": 0.9273550724637681, + "grad_norm": 6.034756808020308, + "learning_rate": 1.3777395655350465e-08, + "loss": 0.3094, + "step": 5119 + }, + { + "epoch": 0.927536231884058, + "grad_norm": 4.033737130482553, + "learning_rate": 1.3709081498392505e-08, + "loss": 0.3133, + "step": 5120 + }, + { + "epoch": 0.9277173913043478, + "grad_norm": 4.7490448919508585, + "learning_rate": 1.3640934773701928e-08, + "loss": 0.2878, + "step": 5121 + }, + { + "epoch": 0.9278985507246377, + "grad_norm": 4.475110452629174, + "learning_rate": 1.3572955504741857e-08, + "loss": 0.2625, + "step": 5122 + }, + { + "epoch": 0.9280797101449275, + "grad_norm": 4.171413591876194, + "learning_rate": 1.3505143714917955e-08, + "loss": 0.2176, + "step": 5123 + }, + { + "epoch": 0.9282608695652174, + "grad_norm": 4.532596633812448, + "learning_rate": 1.3437499427578159e-08, + "loss": 0.3316, + "step": 5124 + }, + { + "epoch": 0.9284420289855072, + "grad_norm": 4.63419965649599, + "learning_rate": 1.3370022666012726e-08, + "loss": 0.3172, + "step": 5125 + }, + { + "epoch": 0.928623188405797, + "grad_norm": 4.989870724738706, + "learning_rate": 1.3302713453454128e-08, + "loss": 0.2568, + "step": 5126 + }, + { + "epoch": 0.928804347826087, + "grad_norm": 5.59649172880547, + "learning_rate": 1.3235571813077328e-08, + "loss": 0.3048, + "step": 5127 + }, + { + "epoch": 0.9289855072463769, + "grad_norm": 3.802912816327616, + "learning_rate": 1.31685977679995e-08, + "loss": 0.2674, + "step": 5128 + }, + { + "epoch": 0.9291666666666667, + "grad_norm": 4.522374770300389, + "learning_rate": 1.3101791341280088e-08, + "loss": 0.2617, + "step": 5129 + }, + { + "epoch": 0.9293478260869565, + "grad_norm": 4.3658050539372635, + "learning_rate": 1.3035152555920915e-08, + "loss": 0.3161, + "step": 5130 + }, + { + "epoch": 0.9295289855072464, + "grad_norm": 4.004384942366896, + "learning_rate": 1.2968681434866013e-08, + "loss": 0.2636, + "step": 5131 + }, + { + "epoch": 0.9297101449275362, + "grad_norm": 3.8775817461579978, + "learning_rate": 1.2902378001001691e-08, + "loss": 0.295, + "step": 5132 + }, + { + "epoch": 0.9298913043478261, + "grad_norm": 5.296565105031837, + "learning_rate": 1.2836242277156517e-08, + "loss": 0.2923, + "step": 5133 + }, + { + "epoch": 0.9300724637681159, + "grad_norm": 9.626141623922942, + "learning_rate": 1.277027428610139e-08, + "loss": 0.3247, + "step": 5134 + }, + { + "epoch": 0.9302536231884058, + "grad_norm": 3.833925962328562, + "learning_rate": 1.2704474050549363e-08, + "loss": 0.2736, + "step": 5135 + }, + { + "epoch": 0.9304347826086956, + "grad_norm": 8.143596855845226, + "learning_rate": 1.2638841593155758e-08, + "loss": 0.3136, + "step": 5136 + }, + { + "epoch": 0.9306159420289855, + "grad_norm": 6.425821858079505, + "learning_rate": 1.2573376936518165e-08, + "loss": 0.2686, + "step": 5137 + }, + { + "epoch": 0.9307971014492754, + "grad_norm": 7.227079439326006, + "learning_rate": 1.2508080103176333e-08, + "loss": 0.2377, + "step": 5138 + }, + { + "epoch": 0.9309782608695653, + "grad_norm": 4.969244802282517, + "learning_rate": 1.2442951115612387e-08, + "loss": 0.2693, + "step": 5139 + }, + { + "epoch": 0.9311594202898551, + "grad_norm": 5.780637981934778, + "learning_rate": 1.2377989996250394e-08, + "loss": 0.234, + "step": 5140 + }, + { + "epoch": 0.931340579710145, + "grad_norm": 5.727817731378993, + "learning_rate": 1.2313196767456902e-08, + "loss": 0.2651, + "step": 5141 + }, + { + "epoch": 0.9315217391304348, + "grad_norm": 5.628729170796921, + "learning_rate": 1.2248571451540401e-08, + "loss": 0.2806, + "step": 5142 + }, + { + "epoch": 0.9317028985507246, + "grad_norm": 3.9721827221516235, + "learning_rate": 1.2184114070751817e-08, + "loss": 0.2784, + "step": 5143 + }, + { + "epoch": 0.9318840579710145, + "grad_norm": 3.902570172692182, + "learning_rate": 1.2119824647284116e-08, + "loss": 0.2786, + "step": 5144 + }, + { + "epoch": 0.9320652173913043, + "grad_norm": 9.144841875299965, + "learning_rate": 1.2055703203272317e-08, + "loss": 0.3035, + "step": 5145 + }, + { + "epoch": 0.9322463768115942, + "grad_norm": 7.11284579862854, + "learning_rate": 1.1991749760793924e-08, + "loss": 0.2806, + "step": 5146 + }, + { + "epoch": 0.932427536231884, + "grad_norm": 6.261817635323582, + "learning_rate": 1.1927964341868269e-08, + "loss": 0.2509, + "step": 5147 + }, + { + "epoch": 0.9326086956521739, + "grad_norm": 6.103494741894521, + "learning_rate": 1.1864346968457007e-08, + "loss": 0.2385, + "step": 5148 + }, + { + "epoch": 0.9327898550724638, + "grad_norm": 3.529692060934503, + "learning_rate": 1.1800897662463948e-08, + "loss": 0.2329, + "step": 5149 + }, + { + "epoch": 0.9329710144927537, + "grad_norm": 6.840440921411112, + "learning_rate": 1.1737616445734954e-08, + "loss": 0.2751, + "step": 5150 + }, + { + "epoch": 0.9331521739130435, + "grad_norm": 8.95527016855722, + "learning_rate": 1.167450334005804e-08, + "loss": 0.3406, + "step": 5151 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 4.728909246655847, + "learning_rate": 1.1611558367163433e-08, + "loss": 0.2968, + "step": 5152 + }, + { + "epoch": 0.9335144927536232, + "grad_norm": 7.263384673038895, + "learning_rate": 1.1548781548723186e-08, + "loss": 0.2949, + "step": 5153 + }, + { + "epoch": 0.933695652173913, + "grad_norm": 4.542923852915622, + "learning_rate": 1.1486172906351898e-08, + "loss": 0.3085, + "step": 5154 + }, + { + "epoch": 0.9338768115942029, + "grad_norm": 8.489658178267135, + "learning_rate": 1.1423732461605829e-08, + "loss": 0.2824, + "step": 5155 + }, + { + "epoch": 0.9340579710144927, + "grad_norm": 5.8764754767589835, + "learning_rate": 1.1361460235983666e-08, + "loss": 0.2451, + "step": 5156 + }, + { + "epoch": 0.9342391304347826, + "grad_norm": 4.165217434363591, + "learning_rate": 1.1299356250925929e-08, + "loss": 0.2914, + "step": 5157 + }, + { + "epoch": 0.9344202898550724, + "grad_norm": 7.990978544947612, + "learning_rate": 1.1237420527815288e-08, + "loss": 0.2505, + "step": 5158 + }, + { + "epoch": 0.9346014492753624, + "grad_norm": 3.6882036061929044, + "learning_rate": 1.1175653087976633e-08, + "loss": 0.2792, + "step": 5159 + }, + { + "epoch": 0.9347826086956522, + "grad_norm": 9.097928287582794, + "learning_rate": 1.1114053952676728e-08, + "loss": 0.2818, + "step": 5160 + }, + { + "epoch": 0.9349637681159421, + "grad_norm": 4.336478351026284, + "learning_rate": 1.1052623143124385e-08, + "loss": 0.2999, + "step": 5161 + }, + { + "epoch": 0.9351449275362319, + "grad_norm": 3.7031826051401264, + "learning_rate": 1.099136068047063e-08, + "loss": 0.2857, + "step": 5162 + }, + { + "epoch": 0.9353260869565218, + "grad_norm": 8.12979280754706, + "learning_rate": 1.0930266585808312e-08, + "loss": 0.2515, + "step": 5163 + }, + { + "epoch": 0.9355072463768116, + "grad_norm": 6.514235409319526, + "learning_rate": 1.0869340880172495e-08, + "loss": 0.3394, + "step": 5164 + }, + { + "epoch": 0.9356884057971014, + "grad_norm": 5.657832353776019, + "learning_rate": 1.080858358454012e-08, + "loss": 0.3709, + "step": 5165 + }, + { + "epoch": 0.9358695652173913, + "grad_norm": 6.783729641059947, + "learning_rate": 1.0747994719830345e-08, + "loss": 0.2923, + "step": 5166 + }, + { + "epoch": 0.9360507246376811, + "grad_norm": 5.183126493072396, + "learning_rate": 1.0687574306904034e-08, + "loss": 0.231, + "step": 5167 + }, + { + "epoch": 0.936231884057971, + "grad_norm": 5.303427453622968, + "learning_rate": 1.0627322366564329e-08, + "loss": 0.327, + "step": 5168 + }, + { + "epoch": 0.9364130434782608, + "grad_norm": 4.044958135680238, + "learning_rate": 1.0567238919556243e-08, + "loss": 0.2185, + "step": 5169 + }, + { + "epoch": 0.9365942028985508, + "grad_norm": 7.617891447867524, + "learning_rate": 1.0507323986566785e-08, + "loss": 0.3349, + "step": 5170 + }, + { + "epoch": 0.9367753623188406, + "grad_norm": 4.489700405684392, + "learning_rate": 1.0447577588224898e-08, + "loss": 0.2772, + "step": 5171 + }, + { + "epoch": 0.9369565217391305, + "grad_norm": 4.022225323000435, + "learning_rate": 1.0387999745101573e-08, + "loss": 0.2778, + "step": 5172 + }, + { + "epoch": 0.9371376811594203, + "grad_norm": 4.315314558687485, + "learning_rate": 1.0328590477709897e-08, + "loss": 0.3119, + "step": 5173 + }, + { + "epoch": 0.9373188405797102, + "grad_norm": 4.145775309360231, + "learning_rate": 1.0269349806504513e-08, + "loss": 0.2559, + "step": 5174 + }, + { + "epoch": 0.9375, + "grad_norm": 3.5666409942919426, + "learning_rate": 1.0210277751882435e-08, + "loss": 0.2314, + "step": 5175 + }, + { + "epoch": 0.9376811594202898, + "grad_norm": 7.868011153820809, + "learning_rate": 1.0151374334182338e-08, + "loss": 0.2843, + "step": 5176 + }, + { + "epoch": 0.9378623188405797, + "grad_norm": 5.971769457271549, + "learning_rate": 1.0092639573685058e-08, + "loss": 0.3155, + "step": 5177 + }, + { + "epoch": 0.9380434782608695, + "grad_norm": 3.6056420253172483, + "learning_rate": 1.0034073490613193e-08, + "loss": 0.3041, + "step": 5178 + }, + { + "epoch": 0.9382246376811594, + "grad_norm": 3.4142709258500785, + "learning_rate": 9.975676105131392e-09, + "loss": 0.2557, + "step": 5179 + }, + { + "epoch": 0.9384057971014492, + "grad_norm": 5.229571866439575, + "learning_rate": 9.91744743734596e-09, + "loss": 0.3055, + "step": 5180 + }, + { + "epoch": 0.9385869565217392, + "grad_norm": 5.0343397214127465, + "learning_rate": 9.859387507305527e-09, + "loss": 0.2674, + "step": 5181 + }, + { + "epoch": 0.938768115942029, + "grad_norm": 4.7813600296960255, + "learning_rate": 9.801496335000325e-09, + "loss": 0.3187, + "step": 5182 + }, + { + "epoch": 0.9389492753623189, + "grad_norm": 4.111992620134856, + "learning_rate": 9.743773940362521e-09, + "loss": 0.291, + "step": 5183 + }, + { + "epoch": 0.9391304347826087, + "grad_norm": 3.9882803679770453, + "learning_rate": 9.686220343266215e-09, + "loss": 0.1941, + "step": 5184 + }, + { + "epoch": 0.9393115942028986, + "grad_norm": 5.012742363516596, + "learning_rate": 9.628835563527394e-09, + "loss": 0.3228, + "step": 5185 + }, + { + "epoch": 0.9394927536231884, + "grad_norm": 6.935891231578678, + "learning_rate": 9.571619620903915e-09, + "loss": 0.3055, + "step": 5186 + }, + { + "epoch": 0.9396739130434782, + "grad_norm": 4.0205122552600665, + "learning_rate": 9.514572535095522e-09, + "loss": 0.2887, + "step": 5187 + }, + { + "epoch": 0.9398550724637681, + "grad_norm": 3.606659696119982, + "learning_rate": 9.457694325743726e-09, + "loss": 0.2052, + "step": 5188 + }, + { + "epoch": 0.9400362318840579, + "grad_norm": 4.8674593317218005, + "learning_rate": 9.400985012432139e-09, + "loss": 0.3139, + "step": 5189 + }, + { + "epoch": 0.9402173913043478, + "grad_norm": 5.227936482617197, + "learning_rate": 9.34444461468581e-09, + "loss": 0.2906, + "step": 5190 + }, + { + "epoch": 0.9403985507246376, + "grad_norm": 4.3523672079135745, + "learning_rate": 9.288073151971998e-09, + "loss": 0.3335, + "step": 5191 + }, + { + "epoch": 0.9405797101449276, + "grad_norm": 5.84180683534299, + "learning_rate": 9.231870643699624e-09, + "loss": 0.2399, + "step": 5192 + }, + { + "epoch": 0.9407608695652174, + "grad_norm": 4.607898246114629, + "learning_rate": 9.175837109219487e-09, + "loss": 0.3359, + "step": 5193 + }, + { + "epoch": 0.9409420289855073, + "grad_norm": 8.875740206969539, + "learning_rate": 9.119972567824263e-09, + "loss": 0.3195, + "step": 5194 + }, + { + "epoch": 0.9411231884057971, + "grad_norm": 4.251070548163476, + "learning_rate": 9.064277038748291e-09, + "loss": 0.2402, + "step": 5195 + }, + { + "epoch": 0.941304347826087, + "grad_norm": 6.061949092248013, + "learning_rate": 9.008750541167842e-09, + "loss": 0.1948, + "step": 5196 + }, + { + "epoch": 0.9414855072463768, + "grad_norm": 6.802492719264378, + "learning_rate": 8.9533930942009e-09, + "loss": 0.2428, + "step": 5197 + }, + { + "epoch": 0.9416666666666667, + "grad_norm": 3.541856869927257, + "learning_rate": 8.898204716907387e-09, + "loss": 0.2743, + "step": 5198 + }, + { + "epoch": 0.9418478260869565, + "grad_norm": 3.6694553682078546, + "learning_rate": 8.84318542828888e-09, + "loss": 0.2513, + "step": 5199 + }, + { + "epoch": 0.9420289855072463, + "grad_norm": 9.365049154771608, + "learning_rate": 8.78833524728878e-09, + "loss": 0.3543, + "step": 5200 + }, + { + "epoch": 0.9420289855072463, + "eval_loss": 0.2675468623638153, + "eval_runtime": 9.7634, + "eval_samples_per_second": 51.212, + "eval_steps_per_second": 0.102, + "step": 5200 + }, + { + "epoch": 0.9422101449275362, + "grad_norm": 6.625773480054826, + "learning_rate": 8.733654192792262e-09, + "loss": 0.2614, + "step": 5201 + }, + { + "epoch": 0.9423913043478261, + "grad_norm": 4.63804473090142, + "learning_rate": 8.679142283626317e-09, + "loss": 0.3326, + "step": 5202 + }, + { + "epoch": 0.942572463768116, + "grad_norm": 5.027940785910337, + "learning_rate": 8.624799538559491e-09, + "loss": 0.287, + "step": 5203 + }, + { + "epoch": 0.9427536231884058, + "grad_norm": 7.027324082011557, + "learning_rate": 8.570625976302481e-09, + "loss": 0.3077, + "step": 5204 + }, + { + "epoch": 0.9429347826086957, + "grad_norm": 7.9237692435664, + "learning_rate": 8.51662161550737e-09, + "loss": 0.3299, + "step": 5205 + }, + { + "epoch": 0.9431159420289855, + "grad_norm": 3.891689271742777, + "learning_rate": 8.462786474768114e-09, + "loss": 0.2624, + "step": 5206 + }, + { + "epoch": 0.9432971014492754, + "grad_norm": 4.239166626895789, + "learning_rate": 8.409120572620388e-09, + "loss": 0.3155, + "step": 5207 + }, + { + "epoch": 0.9434782608695652, + "grad_norm": 4.306638520026201, + "learning_rate": 8.355623927541688e-09, + "loss": 0.2761, + "step": 5208 + }, + { + "epoch": 0.943659420289855, + "grad_norm": 3.996107002635442, + "learning_rate": 8.302296557951171e-09, + "loss": 0.2578, + "step": 5209 + }, + { + "epoch": 0.9438405797101449, + "grad_norm": 8.06081440133288, + "learning_rate": 8.249138482209594e-09, + "loss": 0.3304, + "step": 5210 + }, + { + "epoch": 0.9440217391304347, + "grad_norm": 3.4038023088830642, + "learning_rate": 8.19614971861965e-09, + "loss": 0.2391, + "step": 5211 + }, + { + "epoch": 0.9442028985507246, + "grad_norm": 6.0568719774199655, + "learning_rate": 8.143330285425576e-09, + "loss": 0.3187, + "step": 5212 + }, + { + "epoch": 0.9443840579710145, + "grad_norm": 8.94340875743482, + "learning_rate": 8.090680200813327e-09, + "loss": 0.2848, + "step": 5213 + }, + { + "epoch": 0.9445652173913044, + "grad_norm": 6.3563296169192105, + "learning_rate": 8.038199482910624e-09, + "loss": 0.2873, + "step": 5214 + }, + { + "epoch": 0.9447463768115942, + "grad_norm": 3.5769701521760204, + "learning_rate": 7.985888149786734e-09, + "loss": 0.2636, + "step": 5215 + }, + { + "epoch": 0.9449275362318841, + "grad_norm": 7.784593493016849, + "learning_rate": 7.933746219452863e-09, + "loss": 0.364, + "step": 5216 + }, + { + "epoch": 0.9451086956521739, + "grad_norm": 8.32632421718918, + "learning_rate": 7.881773709861594e-09, + "loss": 0.2504, + "step": 5217 + }, + { + "epoch": 0.9452898550724638, + "grad_norm": 3.5305623131540873, + "learning_rate": 7.829970638907335e-09, + "loss": 0.2366, + "step": 5218 + }, + { + "epoch": 0.9454710144927536, + "grad_norm": 8.011139312629501, + "learning_rate": 7.778337024426096e-09, + "loss": 0.2681, + "step": 5219 + }, + { + "epoch": 0.9456521739130435, + "grad_norm": 7.020375540565217, + "learning_rate": 7.7268728841956e-09, + "loss": 0.308, + "step": 5220 + }, + { + "epoch": 0.9458333333333333, + "grad_norm": 6.483658683535193, + "learning_rate": 7.675578235935287e-09, + "loss": 0.3494, + "step": 5221 + }, + { + "epoch": 0.9460144927536231, + "grad_norm": 8.261431112546184, + "learning_rate": 7.624453097306083e-09, + "loss": 0.2719, + "step": 5222 + }, + { + "epoch": 0.946195652173913, + "grad_norm": 3.523566791558715, + "learning_rate": 7.573497485910518e-09, + "loss": 0.2686, + "step": 5223 + }, + { + "epoch": 0.946376811594203, + "grad_norm": 6.529275284799194, + "learning_rate": 7.522711419292948e-09, + "loss": 0.2671, + "step": 5224 + }, + { + "epoch": 0.9465579710144928, + "grad_norm": 3.6375153146588364, + "learning_rate": 7.472094914939275e-09, + "loss": 0.2528, + "step": 5225 + }, + { + "epoch": 0.9467391304347826, + "grad_norm": 5.400021225952358, + "learning_rate": 7.421647990277003e-09, + "loss": 0.275, + "step": 5226 + }, + { + "epoch": 0.9469202898550725, + "grad_norm": 4.0549937205945215, + "learning_rate": 7.371370662675125e-09, + "loss": 0.2816, + "step": 5227 + }, + { + "epoch": 0.9471014492753623, + "grad_norm": 8.86014557399874, + "learning_rate": 7.321262949444518e-09, + "loss": 0.3084, + "step": 5228 + }, + { + "epoch": 0.9472826086956522, + "grad_norm": 5.932963213703208, + "learning_rate": 7.2713248678374364e-09, + "loss": 0.3603, + "step": 5229 + }, + { + "epoch": 0.947463768115942, + "grad_norm": 5.161157132834427, + "learning_rate": 7.221556435047793e-09, + "loss": 0.2878, + "step": 5230 + }, + { + "epoch": 0.9476449275362319, + "grad_norm": 4.707156404513242, + "learning_rate": 7.171957668211048e-09, + "loss": 0.2058, + "step": 5231 + }, + { + "epoch": 0.9478260869565217, + "grad_norm": 5.301462569719585, + "learning_rate": 7.12252858440443e-09, + "loss": 0.3037, + "step": 5232 + }, + { + "epoch": 0.9480072463768116, + "grad_norm": 4.158442985000619, + "learning_rate": 7.073269200646493e-09, + "loss": 0.2609, + "step": 5233 + }, + { + "epoch": 0.9481884057971014, + "grad_norm": 6.888317241230673, + "learning_rate": 7.024179533897501e-09, + "loss": 0.2957, + "step": 5234 + }, + { + "epoch": 0.9483695652173914, + "grad_norm": 4.664569634794076, + "learning_rate": 6.9752596010592135e-09, + "loss": 0.2808, + "step": 5235 + }, + { + "epoch": 0.9485507246376812, + "grad_norm": 4.496352837949094, + "learning_rate": 6.926509418975102e-09, + "loss": 0.2798, + "step": 5236 + }, + { + "epoch": 0.948731884057971, + "grad_norm": 4.987760796161818, + "learning_rate": 6.877929004430016e-09, + "loss": 0.2402, + "step": 5237 + }, + { + "epoch": 0.9489130434782609, + "grad_norm": 5.070773683749353, + "learning_rate": 6.829518374150412e-09, + "loss": 0.3138, + "step": 5238 + }, + { + "epoch": 0.9490942028985507, + "grad_norm": 4.318789816820521, + "learning_rate": 6.7812775448043425e-09, + "loss": 0.3044, + "step": 5239 + }, + { + "epoch": 0.9492753623188406, + "grad_norm": 4.346974441858678, + "learning_rate": 6.733206533001357e-09, + "loss": 0.3256, + "step": 5240 + }, + { + "epoch": 0.9494565217391304, + "grad_norm": 3.260933116258712, + "learning_rate": 6.685305355292492e-09, + "loss": 0.1912, + "step": 5241 + }, + { + "epoch": 0.9496376811594203, + "grad_norm": 4.619320148338823, + "learning_rate": 6.637574028170334e-09, + "loss": 0.322, + "step": 5242 + }, + { + "epoch": 0.9498188405797101, + "grad_norm": 3.1745478993421443, + "learning_rate": 6.59001256806907e-09, + "loss": 0.2245, + "step": 5243 + }, + { + "epoch": 0.95, + "grad_norm": 3.987966966510248, + "learning_rate": 6.542620991364322e-09, + "loss": 0.2685, + "step": 5244 + }, + { + "epoch": 0.9501811594202898, + "grad_norm": 5.347811060700223, + "learning_rate": 6.4953993143732065e-09, + "loss": 0.2819, + "step": 5245 + }, + { + "epoch": 0.9503623188405798, + "grad_norm": 6.808749209868338, + "learning_rate": 6.448347553354327e-09, + "loss": 0.2953, + "step": 5246 + }, + { + "epoch": 0.9505434782608696, + "grad_norm": 3.9586405492281047, + "learning_rate": 6.401465724507949e-09, + "loss": 0.2686, + "step": 5247 + }, + { + "epoch": 0.9507246376811594, + "grad_norm": 7.088376003581387, + "learning_rate": 6.354753843975602e-09, + "loss": 0.3029, + "step": 5248 + }, + { + "epoch": 0.9509057971014493, + "grad_norm": 3.913350536822113, + "learning_rate": 6.308211927840479e-09, + "loss": 0.2422, + "step": 5249 + }, + { + "epoch": 0.9510869565217391, + "grad_norm": 4.05105707054459, + "learning_rate": 6.261839992127149e-09, + "loss": 0.2711, + "step": 5250 + }, + { + "epoch": 0.951268115942029, + "grad_norm": 5.161519366178647, + "learning_rate": 6.215638052801675e-09, + "loss": 0.2412, + "step": 5251 + }, + { + "epoch": 0.9514492753623188, + "grad_norm": 4.564154072342774, + "learning_rate": 6.1696061257716095e-09, + "loss": 0.3124, + "step": 5252 + }, + { + "epoch": 0.9516304347826087, + "grad_norm": 4.266185334387586, + "learning_rate": 6.123744226885941e-09, + "loss": 0.2614, + "step": 5253 + }, + { + "epoch": 0.9518115942028985, + "grad_norm": 4.5724293724969485, + "learning_rate": 6.078052371935261e-09, + "loss": 0.2836, + "step": 5254 + }, + { + "epoch": 0.9519927536231884, + "grad_norm": 7.873857112998623, + "learning_rate": 6.032530576651318e-09, + "loss": 0.3303, + "step": 5255 + }, + { + "epoch": 0.9521739130434783, + "grad_norm": 5.54396943809504, + "learning_rate": 5.987178856707631e-09, + "loss": 0.2902, + "step": 5256 + }, + { + "epoch": 0.9523550724637682, + "grad_norm": 3.6892455901495556, + "learning_rate": 5.9419972277188756e-09, + "loss": 0.2596, + "step": 5257 + }, + { + "epoch": 0.952536231884058, + "grad_norm": 5.363885630238747, + "learning_rate": 5.896985705241386e-09, + "loss": 0.2485, + "step": 5258 + }, + { + "epoch": 0.9527173913043478, + "grad_norm": 4.966115630858673, + "learning_rate": 5.8521443047728765e-09, + "loss": 0.2482, + "step": 5259 + }, + { + "epoch": 0.9528985507246377, + "grad_norm": 4.389787298202812, + "learning_rate": 5.807473041752386e-09, + "loss": 0.2873, + "step": 5260 + }, + { + "epoch": 0.9530797101449275, + "grad_norm": 4.221893453112242, + "learning_rate": 5.762971931560445e-09, + "loss": 0.3035, + "step": 5261 + }, + { + "epoch": 0.9532608695652174, + "grad_norm": 5.8723598191839965, + "learning_rate": 5.7186409895189635e-09, + "loss": 0.2631, + "step": 5262 + }, + { + "epoch": 0.9534420289855072, + "grad_norm": 5.933176438288503, + "learning_rate": 5.674480230891398e-09, + "loss": 0.2467, + "step": 5263 + }, + { + "epoch": 0.9536231884057971, + "grad_norm": 5.046570716069905, + "learning_rate": 5.630489670882477e-09, + "loss": 0.25, + "step": 5264 + }, + { + "epoch": 0.9538043478260869, + "grad_norm": 4.12051128888061, + "learning_rate": 5.58666932463836e-09, + "loss": 0.2689, + "step": 5265 + }, + { + "epoch": 0.9539855072463768, + "grad_norm": 4.015646219143732, + "learning_rate": 5.54301920724648e-09, + "loss": 0.2784, + "step": 5266 + }, + { + "epoch": 0.9541666666666667, + "grad_norm": 7.32827706516708, + "learning_rate": 5.499539333735925e-09, + "loss": 0.2778, + "step": 5267 + }, + { + "epoch": 0.9543478260869566, + "grad_norm": 5.0592087296365, + "learning_rate": 5.456229719076944e-09, + "loss": 0.2328, + "step": 5268 + }, + { + "epoch": 0.9545289855072464, + "grad_norm": 4.173909970806722, + "learning_rate": 5.4130903781812734e-09, + "loss": 0.2777, + "step": 5269 + }, + { + "epoch": 0.9547101449275363, + "grad_norm": 3.9241170686678974, + "learning_rate": 5.370121325901977e-09, + "loss": 0.2274, + "step": 5270 + }, + { + "epoch": 0.9548913043478261, + "grad_norm": 3.7221589650692377, + "learning_rate": 5.327322577033555e-09, + "loss": 0.2596, + "step": 5271 + }, + { + "epoch": 0.9550724637681159, + "grad_norm": 3.566411005545428, + "learning_rate": 5.2846941463117745e-09, + "loss": 0.2253, + "step": 5272 + }, + { + "epoch": 0.9552536231884058, + "grad_norm": 5.995192068391682, + "learning_rate": 5.242236048413729e-09, + "loss": 0.3454, + "step": 5273 + }, + { + "epoch": 0.9554347826086956, + "grad_norm": 3.3625303578624886, + "learning_rate": 5.199948297958112e-09, + "loss": 0.2513, + "step": 5274 + }, + { + "epoch": 0.9556159420289855, + "grad_norm": 3.160437305148599, + "learning_rate": 5.1578309095047234e-09, + "loss": 0.227, + "step": 5275 + }, + { + "epoch": 0.9557971014492753, + "grad_norm": 4.484128792821842, + "learning_rate": 5.115883897554685e-09, + "loss": 0.2813, + "step": 5276 + }, + { + "epoch": 0.9559782608695652, + "grad_norm": 3.7893548393419008, + "learning_rate": 5.074107276550665e-09, + "loss": 0.2438, + "step": 5277 + }, + { + "epoch": 0.9561594202898551, + "grad_norm": 5.28260982444716, + "learning_rate": 5.032501060876493e-09, + "loss": 0.2268, + "step": 5278 + }, + { + "epoch": 0.956340579710145, + "grad_norm": 3.99639591974172, + "learning_rate": 4.9910652648574856e-09, + "loss": 0.2862, + "step": 5279 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 7.454676001194611, + "learning_rate": 4.9497999027600655e-09, + "loss": 0.2739, + "step": 5280 + }, + { + "epoch": 0.9567028985507247, + "grad_norm": 6.8225393795100615, + "learning_rate": 4.9087049887920896e-09, + "loss": 0.2651, + "step": 5281 + }, + { + "epoch": 0.9568840579710145, + "grad_norm": 5.785219789631884, + "learning_rate": 4.86778053710285e-09, + "loss": 0.2949, + "step": 5282 + }, + { + "epoch": 0.9570652173913043, + "grad_norm": 4.2164614909978475, + "learning_rate": 4.827026561782743e-09, + "loss": 0.2816, + "step": 5283 + }, + { + "epoch": 0.9572463768115942, + "grad_norm": 3.7441271739791087, + "learning_rate": 4.7864430768635445e-09, + "loss": 0.2578, + "step": 5284 + }, + { + "epoch": 0.957427536231884, + "grad_norm": 7.34184115492016, + "learning_rate": 4.746030096318354e-09, + "loss": 0.2874, + "step": 5285 + }, + { + "epoch": 0.9576086956521739, + "grad_norm": 3.3537842398614575, + "learning_rate": 4.705787634061598e-09, + "loss": 0.2207, + "step": 5286 + }, + { + "epoch": 0.9577898550724637, + "grad_norm": 6.224869312879529, + "learning_rate": 4.665715703948914e-09, + "loss": 0.2655, + "step": 5287 + }, + { + "epoch": 0.9579710144927536, + "grad_norm": 4.584916125920998, + "learning_rate": 4.62581431977721e-09, + "loss": 0.2872, + "step": 5288 + }, + { + "epoch": 0.9581521739130435, + "grad_norm": 3.8269892319901433, + "learning_rate": 4.586083495284776e-09, + "loss": 0.227, + "step": 5289 + }, + { + "epoch": 0.9583333333333334, + "grad_norm": 3.993504772745209, + "learning_rate": 4.546523244151168e-09, + "loss": 0.3121, + "step": 5290 + }, + { + "epoch": 0.9585144927536232, + "grad_norm": 4.3220758292666455, + "learning_rate": 4.507133579997046e-09, + "loss": 0.2895, + "step": 5291 + }, + { + "epoch": 0.9586956521739131, + "grad_norm": 5.337093426651146, + "learning_rate": 4.467914516384619e-09, + "loss": 0.218, + "step": 5292 + }, + { + "epoch": 0.9588768115942029, + "grad_norm": 6.41035892268632, + "learning_rate": 4.4288660668170275e-09, + "loss": 0.2631, + "step": 5293 + }, + { + "epoch": 0.9590579710144927, + "grad_norm": 5.492670490114389, + "learning_rate": 4.389988244738907e-09, + "loss": 0.2876, + "step": 5294 + }, + { + "epoch": 0.9592391304347826, + "grad_norm": 8.88771176249773, + "learning_rate": 4.35128106353616e-09, + "loss": 0.305, + "step": 5295 + }, + { + "epoch": 0.9594202898550724, + "grad_norm": 5.199139540470069, + "learning_rate": 4.312744536535684e-09, + "loss": 0.2639, + "step": 5296 + }, + { + "epoch": 0.9596014492753623, + "grad_norm": 4.725924075487749, + "learning_rate": 4.274378677005919e-09, + "loss": 0.2927, + "step": 5297 + }, + { + "epoch": 0.9597826086956521, + "grad_norm": 4.380283488850347, + "learning_rate": 4.236183498156409e-09, + "loss": 0.263, + "step": 5298 + }, + { + "epoch": 0.9599637681159421, + "grad_norm": 5.686965306264911, + "learning_rate": 4.198159013137858e-09, + "loss": 0.2609, + "step": 5299 + }, + { + "epoch": 0.9601449275362319, + "grad_norm": 6.897693185979009, + "learning_rate": 4.16030523504235e-09, + "loss": 0.2424, + "step": 5300 + }, + { + "epoch": 0.9601449275362319, + "eval_loss": 0.265687495470047, + "eval_runtime": 9.7821, + "eval_samples_per_second": 51.114, + "eval_steps_per_second": 0.102, + "step": 5300 + }, + { + "epoch": 0.9603260869565218, + "grad_norm": 3.92293501921209, + "learning_rate": 4.122622176903012e-09, + "loss": 0.2617, + "step": 5301 + }, + { + "epoch": 0.9605072463768116, + "grad_norm": 3.9961230210837373, + "learning_rate": 4.085109851694468e-09, + "loss": 0.2824, + "step": 5302 + }, + { + "epoch": 0.9606884057971015, + "grad_norm": 4.603723190223892, + "learning_rate": 4.047768272332275e-09, + "loss": 0.3134, + "step": 5303 + }, + { + "epoch": 0.9608695652173913, + "grad_norm": 3.5847924378981872, + "learning_rate": 4.010597451673315e-09, + "loss": 0.242, + "step": 5304 + }, + { + "epoch": 0.9610507246376812, + "grad_norm": 4.470248813825341, + "learning_rate": 3.9735974025156825e-09, + "loss": 0.2637, + "step": 5305 + }, + { + "epoch": 0.961231884057971, + "grad_norm": 5.604232429894755, + "learning_rate": 3.9367681375986895e-09, + "loss": 0.2623, + "step": 5306 + }, + { + "epoch": 0.9614130434782608, + "grad_norm": 9.454709939249124, + "learning_rate": 3.900109669602858e-09, + "loss": 0.3171, + "step": 5307 + }, + { + "epoch": 0.9615942028985507, + "grad_norm": 3.9611452214911633, + "learning_rate": 3.863622011149814e-09, + "loss": 0.2791, + "step": 5308 + }, + { + "epoch": 0.9617753623188405, + "grad_norm": 3.7451395602609288, + "learning_rate": 3.827305174802453e-09, + "loss": 0.2532, + "step": 5309 + }, + { + "epoch": 0.9619565217391305, + "grad_norm": 8.570247117055857, + "learning_rate": 3.791159173064829e-09, + "loss": 0.3173, + "step": 5310 + }, + { + "epoch": 0.9621376811594203, + "grad_norm": 4.029694462241031, + "learning_rate": 3.755184018382207e-09, + "loss": 0.2695, + "step": 5311 + }, + { + "epoch": 0.9623188405797102, + "grad_norm": 6.757196905855348, + "learning_rate": 3.7193797231409587e-09, + "loss": 0.325, + "step": 5312 + }, + { + "epoch": 0.9625, + "grad_norm": 5.285739203511072, + "learning_rate": 3.683746299668722e-09, + "loss": 0.2545, + "step": 5313 + }, + { + "epoch": 0.9626811594202899, + "grad_norm": 7.099790587122954, + "learning_rate": 3.648283760234239e-09, + "loss": 0.3342, + "step": 5314 + }, + { + "epoch": 0.9628623188405797, + "grad_norm": 6.877196420782656, + "learning_rate": 3.612992117047409e-09, + "loss": 0.2479, + "step": 5315 + }, + { + "epoch": 0.9630434782608696, + "grad_norm": 6.331353055333265, + "learning_rate": 3.5778713822592897e-09, + "loss": 0.3105, + "step": 5316 + }, + { + "epoch": 0.9632246376811594, + "grad_norm": 4.385160533531568, + "learning_rate": 3.5429215679622093e-09, + "loss": 0.286, + "step": 5317 + }, + { + "epoch": 0.9634057971014492, + "grad_norm": 3.92321524492969, + "learning_rate": 3.508142686189486e-09, + "loss": 0.2476, + "step": 5318 + }, + { + "epoch": 0.9635869565217391, + "grad_norm": 6.340009409255978, + "learning_rate": 3.4735347489156518e-09, + "loss": 0.2918, + "step": 5319 + }, + { + "epoch": 0.9637681159420289, + "grad_norm": 10.430866106595952, + "learning_rate": 3.439097768056398e-09, + "loss": 0.2798, + "step": 5320 + }, + { + "epoch": 0.9639492753623189, + "grad_norm": 4.018759683538263, + "learning_rate": 3.404831755468518e-09, + "loss": 0.2095, + "step": 5321 + }, + { + "epoch": 0.9641304347826087, + "grad_norm": 4.408173689557219, + "learning_rate": 3.370736722949963e-09, + "loss": 0.2395, + "step": 5322 + }, + { + "epoch": 0.9643115942028986, + "grad_norm": 5.462553308557146, + "learning_rate": 3.3368126822398977e-09, + "loss": 0.2967, + "step": 5323 + }, + { + "epoch": 0.9644927536231884, + "grad_norm": 4.230050254888981, + "learning_rate": 3.3030596450184246e-09, + "loss": 0.2747, + "step": 5324 + }, + { + "epoch": 0.9646739130434783, + "grad_norm": 4.915607063222736, + "learning_rate": 3.269477622906913e-09, + "loss": 0.271, + "step": 5325 + }, + { + "epoch": 0.9648550724637681, + "grad_norm": 4.452257379206557, + "learning_rate": 3.2360666274678372e-09, + "loss": 0.2669, + "step": 5326 + }, + { + "epoch": 0.965036231884058, + "grad_norm": 4.891015437746063, + "learning_rate": 3.202826670204717e-09, + "loss": 0.3355, + "step": 5327 + }, + { + "epoch": 0.9652173913043478, + "grad_norm": 5.810263111482196, + "learning_rate": 3.169757762562231e-09, + "loss": 0.2556, + "step": 5328 + }, + { + "epoch": 0.9653985507246376, + "grad_norm": 4.932339215750199, + "learning_rate": 3.136859915926271e-09, + "loss": 0.3235, + "step": 5329 + }, + { + "epoch": 0.9655797101449275, + "grad_norm": 6.889390842868626, + "learning_rate": 3.104133141623555e-09, + "loss": 0.3049, + "step": 5330 + }, + { + "epoch": 0.9657608695652173, + "grad_norm": 3.9181870647126145, + "learning_rate": 3.0715774509221247e-09, + "loss": 0.2643, + "step": 5331 + }, + { + "epoch": 0.9659420289855073, + "grad_norm": 4.153760902456737, + "learning_rate": 3.03919285503107e-09, + "loss": 0.2556, + "step": 5332 + }, + { + "epoch": 0.9661231884057971, + "grad_norm": 5.610690194105614, + "learning_rate": 3.0069793651005813e-09, + "loss": 0.3047, + "step": 5333 + }, + { + "epoch": 0.966304347826087, + "grad_norm": 4.3715740666469705, + "learning_rate": 2.974936992221955e-09, + "loss": 0.2967, + "step": 5334 + }, + { + "epoch": 0.9664855072463768, + "grad_norm": 4.299056585111778, + "learning_rate": 2.94306574742742e-09, + "loss": 0.2882, + "step": 5335 + }, + { + "epoch": 0.9666666666666667, + "grad_norm": 3.626239209744918, + "learning_rate": 2.9113656416904777e-09, + "loss": 0.251, + "step": 5336 + }, + { + "epoch": 0.9668478260869565, + "grad_norm": 3.8339972300395266, + "learning_rate": 2.879836685925563e-09, + "loss": 0.258, + "step": 5337 + }, + { + "epoch": 0.9670289855072464, + "grad_norm": 3.406643574058657, + "learning_rate": 2.848478890988326e-09, + "loss": 0.2384, + "step": 5338 + }, + { + "epoch": 0.9672101449275362, + "grad_norm": 4.297831596776021, + "learning_rate": 2.817292267675353e-09, + "loss": 0.3227, + "step": 5339 + }, + { + "epoch": 0.967391304347826, + "grad_norm": 4.1032232118488885, + "learning_rate": 2.786276826724332e-09, + "loss": 0.2624, + "step": 5340 + }, + { + "epoch": 0.9675724637681159, + "grad_norm": 5.652900636528436, + "learning_rate": 2.7554325788140544e-09, + "loss": 0.2628, + "step": 5341 + }, + { + "epoch": 0.9677536231884057, + "grad_norm": 5.903433931780544, + "learning_rate": 2.724759534564358e-09, + "loss": 0.2658, + "step": 5342 + }, + { + "epoch": 0.9679347826086957, + "grad_norm": 4.28748291226025, + "learning_rate": 2.694257704536018e-09, + "loss": 0.3366, + "step": 5343 + }, + { + "epoch": 0.9681159420289855, + "grad_norm": 5.836648975444641, + "learning_rate": 2.663927099231078e-09, + "loss": 0.2606, + "step": 5344 + }, + { + "epoch": 0.9682971014492754, + "grad_norm": 3.9110715218946335, + "learning_rate": 2.6337677290925176e-09, + "loss": 0.2691, + "step": 5345 + }, + { + "epoch": 0.9684782608695652, + "grad_norm": 4.166385712527874, + "learning_rate": 2.6037796045042525e-09, + "loss": 0.2972, + "step": 5346 + }, + { + "epoch": 0.9686594202898551, + "grad_norm": 6.179035052173582, + "learning_rate": 2.5739627357913574e-09, + "loss": 0.2561, + "step": 5347 + }, + { + "epoch": 0.9688405797101449, + "grad_norm": 3.8827170465724246, + "learning_rate": 2.5443171332200086e-09, + "loss": 0.2542, + "step": 5348 + }, + { + "epoch": 0.9690217391304348, + "grad_norm": 4.870344068581291, + "learning_rate": 2.514842806997208e-09, + "loss": 0.2982, + "step": 5349 + }, + { + "epoch": 0.9692028985507246, + "grad_norm": 4.416705638710597, + "learning_rate": 2.4855397672711718e-09, + "loss": 0.2463, + "step": 5350 + }, + { + "epoch": 0.9693840579710145, + "grad_norm": 4.053104386533474, + "learning_rate": 2.4564080241311067e-09, + "loss": 0.2722, + "step": 5351 + }, + { + "epoch": 0.9695652173913043, + "grad_norm": 4.817134153346506, + "learning_rate": 2.4274475876071565e-09, + "loss": 0.2784, + "step": 5352 + }, + { + "epoch": 0.9697463768115943, + "grad_norm": 4.628430218577455, + "learning_rate": 2.3986584676705114e-09, + "loss": 0.2639, + "step": 5353 + }, + { + "epoch": 0.9699275362318841, + "grad_norm": 4.62442616196145, + "learning_rate": 2.3700406742334646e-09, + "loss": 0.3144, + "step": 5354 + }, + { + "epoch": 0.970108695652174, + "grad_norm": 5.794690268446705, + "learning_rate": 2.3415942171492454e-09, + "loss": 0.2912, + "step": 5355 + }, + { + "epoch": 0.9702898550724638, + "grad_norm": 8.525358047871158, + "learning_rate": 2.313319106212075e-09, + "loss": 0.3158, + "step": 5356 + }, + { + "epoch": 0.9704710144927536, + "grad_norm": 4.286941355940873, + "learning_rate": 2.2852153511572214e-09, + "loss": 0.3297, + "step": 5357 + }, + { + "epoch": 0.9706521739130435, + "grad_norm": 3.5505002243232013, + "learning_rate": 2.257282961661e-09, + "loss": 0.2566, + "step": 5358 + }, + { + "epoch": 0.9708333333333333, + "grad_norm": 7.7463865282986175, + "learning_rate": 2.229521947340496e-09, + "loss": 0.2687, + "step": 5359 + }, + { + "epoch": 0.9710144927536232, + "grad_norm": 6.979732451728607, + "learning_rate": 2.2019323177541737e-09, + "loss": 0.277, + "step": 5360 + }, + { + "epoch": 0.971195652173913, + "grad_norm": 4.045830369807944, + "learning_rate": 2.174514082401102e-09, + "loss": 0.3198, + "step": 5361 + }, + { + "epoch": 0.9713768115942029, + "grad_norm": 5.614473530109136, + "learning_rate": 2.147267250721674e-09, + "loss": 0.3076, + "step": 5362 + }, + { + "epoch": 0.9715579710144927, + "grad_norm": 5.466830402189616, + "learning_rate": 2.1201918320969405e-09, + "loss": 0.3121, + "step": 5363 + }, + { + "epoch": 0.9717391304347827, + "grad_norm": 4.607817489277193, + "learning_rate": 2.093287835849167e-09, + "loss": 0.2664, + "step": 5364 + }, + { + "epoch": 0.9719202898550725, + "grad_norm": 4.5466711311238726, + "learning_rate": 2.066555271241499e-09, + "loss": 0.3068, + "step": 5365 + }, + { + "epoch": 0.9721014492753624, + "grad_norm": 4.844353536563658, + "learning_rate": 2.03999414747813e-09, + "loss": 0.3075, + "step": 5366 + }, + { + "epoch": 0.9722826086956522, + "grad_norm": 4.0221347980556885, + "learning_rate": 2.0136044737041892e-09, + "loss": 0.2792, + "step": 5367 + }, + { + "epoch": 0.972463768115942, + "grad_norm": 3.830436147135888, + "learning_rate": 1.9873862590056855e-09, + "loss": 0.2385, + "step": 5368 + }, + { + "epoch": 0.9726449275362319, + "grad_norm": 6.055466069082649, + "learning_rate": 1.9613395124097875e-09, + "loss": 0.2952, + "step": 5369 + }, + { + "epoch": 0.9728260869565217, + "grad_norm": 4.65234318075288, + "learning_rate": 1.935464242884377e-09, + "loss": 0.2841, + "step": 5370 + }, + { + "epoch": 0.9730072463768116, + "grad_norm": 3.5272328187023114, + "learning_rate": 1.9097604593385498e-09, + "loss": 0.2579, + "step": 5371 + }, + { + "epoch": 0.9731884057971014, + "grad_norm": 5.0692889021067975, + "learning_rate": 1.884228170622226e-09, + "loss": 0.2742, + "step": 5372 + }, + { + "epoch": 0.9733695652173913, + "grad_norm": 10.365122437017554, + "learning_rate": 1.8588673855262083e-09, + "loss": 0.3455, + "step": 5373 + }, + { + "epoch": 0.9735507246376811, + "grad_norm": 3.8885618036197633, + "learning_rate": 1.8336781127824551e-09, + "loss": 0.1974, + "step": 5374 + }, + { + "epoch": 0.9737318840579711, + "grad_norm": 5.205476668740088, + "learning_rate": 1.808660361063641e-09, + "loss": 0.2156, + "step": 5375 + }, + { + "epoch": 0.9739130434782609, + "grad_norm": 4.378081512286682, + "learning_rate": 1.7838141389835415e-09, + "loss": 0.3248, + "step": 5376 + }, + { + "epoch": 0.9740942028985508, + "grad_norm": 3.506761683127004, + "learning_rate": 1.7591394550968142e-09, + "loss": 0.2646, + "step": 5377 + }, + { + "epoch": 0.9742753623188406, + "grad_norm": 15.107223794431473, + "learning_rate": 1.734636317899163e-09, + "loss": 0.3262, + "step": 5378 + }, + { + "epoch": 0.9744565217391304, + "grad_norm": 7.41229487072595, + "learning_rate": 1.7103047358270062e-09, + "loss": 0.2687, + "step": 5379 + }, + { + "epoch": 0.9746376811594203, + "grad_norm": 7.698617950518482, + "learning_rate": 1.6861447172578647e-09, + "loss": 0.3011, + "step": 5380 + }, + { + "epoch": 0.9748188405797101, + "grad_norm": 6.716326450497823, + "learning_rate": 1.6621562705101954e-09, + "loss": 0.2363, + "step": 5381 + }, + { + "epoch": 0.975, + "grad_norm": 3.710004508605813, + "learning_rate": 1.6383394038432252e-09, + "loss": 0.2564, + "step": 5382 + }, + { + "epoch": 0.9751811594202898, + "grad_norm": 3.781146427523997, + "learning_rate": 1.6146941254573943e-09, + "loss": 0.2713, + "step": 5383 + }, + { + "epoch": 0.9753623188405797, + "grad_norm": 4.525821925330528, + "learning_rate": 1.5912204434936905e-09, + "loss": 0.2687, + "step": 5384 + }, + { + "epoch": 0.9755434782608695, + "grad_norm": 4.031518798232069, + "learning_rate": 1.5679183660343153e-09, + "loss": 0.308, + "step": 5385 + }, + { + "epoch": 0.9757246376811595, + "grad_norm": 5.415305598671766, + "learning_rate": 1.5447879011022957e-09, + "loss": 0.2961, + "step": 5386 + }, + { + "epoch": 0.9759057971014493, + "grad_norm": 3.0763232948411168, + "learning_rate": 1.5218290566614834e-09, + "loss": 0.1875, + "step": 5387 + }, + { + "epoch": 0.9760869565217392, + "grad_norm": 4.4429645427529145, + "learning_rate": 1.4990418406168327e-09, + "loss": 0.2615, + "step": 5388 + }, + { + "epoch": 0.976268115942029, + "grad_norm": 3.806457330306627, + "learning_rate": 1.476426260814012e-09, + "loss": 0.2879, + "step": 5389 + }, + { + "epoch": 0.9764492753623188, + "grad_norm": 6.668742748691466, + "learning_rate": 1.4539823250396266e-09, + "loss": 0.2675, + "step": 5390 + }, + { + "epoch": 0.9766304347826087, + "grad_norm": 3.6137435470625086, + "learning_rate": 1.431710041021328e-09, + "loss": 0.2544, + "step": 5391 + }, + { + "epoch": 0.9768115942028985, + "grad_norm": 5.294527710341784, + "learning_rate": 1.409609416427482e-09, + "loss": 0.2939, + "step": 5392 + }, + { + "epoch": 0.9769927536231884, + "grad_norm": 6.247914122452138, + "learning_rate": 1.3876804588675572e-09, + "loss": 0.2603, + "step": 5393 + }, + { + "epoch": 0.9771739130434782, + "grad_norm": 5.003101428863767, + "learning_rate": 1.3659231758916812e-09, + "loss": 0.2929, + "step": 5394 + }, + { + "epoch": 0.9773550724637681, + "grad_norm": 4.841999496179696, + "learning_rate": 1.344337574991028e-09, + "loss": 0.2664, + "step": 5395 + }, + { + "epoch": 0.9775362318840579, + "grad_norm": 5.545665157081709, + "learning_rate": 1.3229236635976527e-09, + "loss": 0.3151, + "step": 5396 + }, + { + "epoch": 0.9777173913043479, + "grad_norm": 4.892995303869446, + "learning_rate": 1.3016814490844352e-09, + "loss": 0.3325, + "step": 5397 + }, + { + "epoch": 0.9778985507246377, + "grad_norm": 4.536290409219323, + "learning_rate": 1.2806109387651364e-09, + "loss": 0.2897, + "step": 5398 + }, + { + "epoch": 0.9780797101449276, + "grad_norm": 3.461072078436241, + "learning_rate": 1.2597121398945643e-09, + "loss": 0.2342, + "step": 5399 + }, + { + "epoch": 0.9782608695652174, + "grad_norm": 3.7211699810280576, + "learning_rate": 1.238985059668074e-09, + "loss": 0.2442, + "step": 5400 + }, + { + "epoch": 0.9782608695652174, + "eval_loss": 0.2665937542915344, + "eval_runtime": 9.7749, + "eval_samples_per_second": 51.152, + "eval_steps_per_second": 0.102, + "step": 5400 + }, + { + "epoch": 0.9784420289855073, + "grad_norm": 5.422118211327256, + "learning_rate": 1.2184297052222902e-09, + "loss": 0.2554, + "step": 5401 + }, + { + "epoch": 0.9786231884057971, + "grad_norm": 5.0091135585910465, + "learning_rate": 1.1980460836343852e-09, + "loss": 0.3259, + "step": 5402 + }, + { + "epoch": 0.9788043478260869, + "grad_norm": 5.541540763611145, + "learning_rate": 1.177834201922634e-09, + "loss": 0.2812, + "step": 5403 + }, + { + "epoch": 0.9789855072463768, + "grad_norm": 3.9651602429877033, + "learning_rate": 1.1577940670459696e-09, + "loss": 0.2695, + "step": 5404 + }, + { + "epoch": 0.9791666666666666, + "grad_norm": 8.697458753403248, + "learning_rate": 1.1379256859044285e-09, + "loss": 0.2721, + "step": 5405 + }, + { + "epoch": 0.9793478260869565, + "grad_norm": 7.292452380824839, + "learning_rate": 1.118229065338705e-09, + "loss": 0.3269, + "step": 5406 + }, + { + "epoch": 0.9795289855072464, + "grad_norm": 4.317707391023914, + "learning_rate": 1.0987042121304856e-09, + "loss": 0.3478, + "step": 5407 + }, + { + "epoch": 0.9797101449275363, + "grad_norm": 3.8484598246853974, + "learning_rate": 1.0793511330022265e-09, + "loss": 0.2611, + "step": 5408 + }, + { + "epoch": 0.9798913043478261, + "grad_norm": 4.572601236962431, + "learning_rate": 1.0601698346173194e-09, + "loss": 0.2671, + "step": 5409 + }, + { + "epoch": 0.980072463768116, + "grad_norm": 4.86510860319875, + "learning_rate": 1.0411603235799816e-09, + "loss": 0.2558, + "step": 5410 + }, + { + "epoch": 0.9802536231884058, + "grad_norm": 6.706864546338535, + "learning_rate": 1.0223226064352553e-09, + "loss": 0.2526, + "step": 5411 + }, + { + "epoch": 0.9804347826086957, + "grad_norm": 4.133209617955311, + "learning_rate": 1.0036566896690634e-09, + "loss": 0.2185, + "step": 5412 + }, + { + "epoch": 0.9806159420289855, + "grad_norm": 4.5515383124251425, + "learning_rate": 9.851625797080988e-10, + "loss": 0.2473, + "step": 5413 + }, + { + "epoch": 0.9807971014492753, + "grad_norm": 3.361838207279218, + "learning_rate": 9.668402829201005e-10, + "loss": 0.2391, + "step": 5414 + }, + { + "epoch": 0.9809782608695652, + "grad_norm": 3.889868020065799, + "learning_rate": 9.486898056134674e-10, + "loss": 0.269, + "step": 5415 + }, + { + "epoch": 0.981159420289855, + "grad_norm": 3.7282981107979767, + "learning_rate": 9.307111540374779e-10, + "loss": 0.2563, + "step": 5416 + }, + { + "epoch": 0.9813405797101449, + "grad_norm": 3.8754572835249577, + "learning_rate": 9.129043343822917e-10, + "loss": 0.2398, + "step": 5417 + }, + { + "epoch": 0.9815217391304348, + "grad_norm": 4.007719066054778, + "learning_rate": 8.952693527788379e-10, + "loss": 0.274, + "step": 5418 + }, + { + "epoch": 0.9817028985507247, + "grad_norm": 6.050332604115417, + "learning_rate": 8.778062152989818e-10, + "loss": 0.2845, + "step": 5419 + }, + { + "epoch": 0.9818840579710145, + "grad_norm": 7.580077735401974, + "learning_rate": 8.605149279553026e-10, + "loss": 0.2422, + "step": 5420 + }, + { + "epoch": 0.9820652173913044, + "grad_norm": 4.402840921667809, + "learning_rate": 8.433954967013712e-10, + "loss": 0.2901, + "step": 5421 + }, + { + "epoch": 0.9822463768115942, + "grad_norm": 3.673598356625748, + "learning_rate": 8.264479274313618e-10, + "loss": 0.2521, + "step": 5422 + }, + { + "epoch": 0.9824275362318841, + "grad_norm": 6.365360275780007, + "learning_rate": 8.096722259804956e-10, + "loss": 0.2447, + "step": 5423 + }, + { + "epoch": 0.9826086956521739, + "grad_norm": 4.03821607486304, + "learning_rate": 7.930683981246522e-10, + "loss": 0.3603, + "step": 5424 + }, + { + "epoch": 0.9827898550724637, + "grad_norm": 6.739572399971401, + "learning_rate": 7.766364495807032e-10, + "loss": 0.3262, + "step": 5425 + }, + { + "epoch": 0.9829710144927536, + "grad_norm": 4.571141720022084, + "learning_rate": 7.603763860061785e-10, + "loss": 0.2794, + "step": 5426 + }, + { + "epoch": 0.9831521739130434, + "grad_norm": 5.065904520178715, + "learning_rate": 7.442882129994887e-10, + "loss": 0.2453, + "step": 5427 + }, + { + "epoch": 0.9833333333333333, + "grad_norm": 4.855145685520532, + "learning_rate": 7.283719360999252e-10, + "loss": 0.2557, + "step": 5428 + }, + { + "epoch": 0.9835144927536232, + "grad_norm": 4.441041132766409, + "learning_rate": 7.126275607874932e-10, + "loss": 0.3177, + "step": 5429 + }, + { + "epoch": 0.9836956521739131, + "grad_norm": 4.179488390084364, + "learning_rate": 6.970550924830232e-10, + "loss": 0.2663, + "step": 5430 + }, + { + "epoch": 0.9838768115942029, + "grad_norm": 4.465427232747172, + "learning_rate": 6.816545365482818e-10, + "loss": 0.2136, + "step": 5431 + }, + { + "epoch": 0.9840579710144928, + "grad_norm": 4.456559508536437, + "learning_rate": 6.664258982856941e-10, + "loss": 0.2659, + "step": 5432 + }, + { + "epoch": 0.9842391304347826, + "grad_norm": 7.829687893352149, + "learning_rate": 6.513691829385659e-10, + "loss": 0.3099, + "step": 5433 + }, + { + "epoch": 0.9844202898550725, + "grad_norm": 4.70524836610605, + "learning_rate": 6.364843956909727e-10, + "loss": 0.2732, + "step": 5434 + }, + { + "epoch": 0.9846014492753623, + "grad_norm": 4.064049887970608, + "learning_rate": 6.217715416678149e-10, + "loss": 0.2644, + "step": 5435 + }, + { + "epoch": 0.9847826086956522, + "grad_norm": 4.278682606974456, + "learning_rate": 6.072306259348736e-10, + "loss": 0.2378, + "step": 5436 + }, + { + "epoch": 0.984963768115942, + "grad_norm": 5.105688634048712, + "learning_rate": 5.928616534985331e-10, + "loss": 0.3339, + "step": 5437 + }, + { + "epoch": 0.9851449275362318, + "grad_norm": 5.825569143195145, + "learning_rate": 5.786646293062247e-10, + "loss": 0.3376, + "step": 5438 + }, + { + "epoch": 0.9853260869565217, + "grad_norm": 4.949625180464565, + "learning_rate": 5.646395582459829e-10, + "loss": 0.2317, + "step": 5439 + }, + { + "epoch": 0.9855072463768116, + "grad_norm": 6.800571368945507, + "learning_rate": 5.507864451467226e-10, + "loss": 0.3033, + "step": 5440 + }, + { + "epoch": 0.9856884057971015, + "grad_norm": 4.629772350923752, + "learning_rate": 5.37105294778073e-10, + "loss": 0.2796, + "step": 5441 + }, + { + "epoch": 0.9858695652173913, + "grad_norm": 8.381900124221994, + "learning_rate": 5.235961118506549e-10, + "loss": 0.3246, + "step": 5442 + }, + { + "epoch": 0.9860507246376812, + "grad_norm": 4.041833176217775, + "learning_rate": 5.102589010155811e-10, + "loss": 0.3084, + "step": 5443 + }, + { + "epoch": 0.986231884057971, + "grad_norm": 4.510579425868753, + "learning_rate": 4.970936668650672e-10, + "loss": 0.2239, + "step": 5444 + }, + { + "epoch": 0.9864130434782609, + "grad_norm": 5.226292915931263, + "learning_rate": 4.84100413931876e-10, + "loss": 0.2804, + "step": 5445 + }, + { + "epoch": 0.9865942028985507, + "grad_norm": 6.873202770859247, + "learning_rate": 4.712791466896515e-10, + "loss": 0.2946, + "step": 5446 + }, + { + "epoch": 0.9867753623188406, + "grad_norm": 5.264889708568753, + "learning_rate": 4.5862986955286234e-10, + "loss": 0.2559, + "step": 5447 + }, + { + "epoch": 0.9869565217391304, + "grad_norm": 4.2741344049337435, + "learning_rate": 4.4615258687669134e-10, + "loss": 0.2449, + "step": 5448 + }, + { + "epoch": 0.9871376811594202, + "grad_norm": 4.183967611620681, + "learning_rate": 4.3384730295709105e-10, + "loss": 0.3258, + "step": 5449 + }, + { + "epoch": 0.9873188405797102, + "grad_norm": 4.638276899178078, + "learning_rate": 4.2171402203083904e-10, + "loss": 0.2821, + "step": 5450 + }, + { + "epoch": 0.9875, + "grad_norm": 8.410577616809684, + "learning_rate": 4.0975274827553807e-10, + "loss": 0.2704, + "step": 5451 + }, + { + "epoch": 0.9876811594202899, + "grad_norm": 4.754367755517139, + "learning_rate": 3.979634858094494e-10, + "loss": 0.3379, + "step": 5452 + }, + { + "epoch": 0.9878623188405797, + "grad_norm": 4.6351726551871515, + "learning_rate": 3.8634623869171487e-10, + "loss": 0.2816, + "step": 5453 + }, + { + "epoch": 0.9880434782608696, + "grad_norm": 3.6255123351982994, + "learning_rate": 3.749010109221906e-10, + "loss": 0.2753, + "step": 5454 + }, + { + "epoch": 0.9882246376811594, + "grad_norm": 3.9850490392522833, + "learning_rate": 3.6362780644150217e-10, + "loss": 0.2491, + "step": 5455 + }, + { + "epoch": 0.9884057971014493, + "grad_norm": 4.8421733803156135, + "learning_rate": 3.525266291311002e-10, + "loss": 0.2438, + "step": 5456 + }, + { + "epoch": 0.9885869565217391, + "grad_norm": 5.628931566659291, + "learning_rate": 3.415974828132051e-10, + "loss": 0.3735, + "step": 5457 + }, + { + "epoch": 0.988768115942029, + "grad_norm": 5.429817581893199, + "learning_rate": 3.3084037125064023e-10, + "loss": 0.3151, + "step": 5458 + }, + { + "epoch": 0.9889492753623188, + "grad_norm": 7.646579950171526, + "learning_rate": 3.2025529814727615e-10, + "loss": 0.2749, + "step": 5459 + }, + { + "epoch": 0.9891304347826086, + "grad_norm": 6.334711858303099, + "learning_rate": 3.098422671475309e-10, + "loss": 0.3431, + "step": 5460 + }, + { + "epoch": 0.9893115942028986, + "grad_norm": 5.414762033517194, + "learning_rate": 2.996012818367033e-10, + "loss": 0.2212, + "step": 5461 + }, + { + "epoch": 0.9894927536231884, + "grad_norm": 4.086616243696165, + "learning_rate": 2.8953234574075056e-10, + "loss": 0.3023, + "step": 5462 + }, + { + "epoch": 0.9896739130434783, + "grad_norm": 3.8956174718852687, + "learning_rate": 2.7963546232645516e-10, + "loss": 0.2601, + "step": 5463 + }, + { + "epoch": 0.9898550724637681, + "grad_norm": 5.930213521427466, + "learning_rate": 2.6991063500142465e-10, + "loss": 0.3049, + "step": 5464 + }, + { + "epoch": 0.990036231884058, + "grad_norm": 8.909291520141176, + "learning_rate": 2.6035786711398053e-10, + "loss": 0.2707, + "step": 5465 + }, + { + "epoch": 0.9902173913043478, + "grad_norm": 4.854253565918032, + "learning_rate": 2.5097716195310315e-10, + "loss": 0.3038, + "step": 5466 + }, + { + "epoch": 0.9903985507246377, + "grad_norm": 4.937258246366047, + "learning_rate": 2.4176852274865325e-10, + "loss": 0.3134, + "step": 5467 + }, + { + "epoch": 0.9905797101449275, + "grad_norm": 6.691044884794042, + "learning_rate": 2.327319526711502e-10, + "loss": 0.2869, + "step": 5468 + }, + { + "epoch": 0.9907608695652174, + "grad_norm": 4.093884814767755, + "learning_rate": 2.2386745483204962e-10, + "loss": 0.2447, + "step": 5469 + }, + { + "epoch": 0.9909420289855072, + "grad_norm": 4.250586372863622, + "learning_rate": 2.1517503228335455e-10, + "loss": 0.3075, + "step": 5470 + }, + { + "epoch": 0.991123188405797, + "grad_norm": 5.539273136072462, + "learning_rate": 2.0665468801794873e-10, + "loss": 0.3143, + "step": 5471 + }, + { + "epoch": 0.991304347826087, + "grad_norm": 4.238100541488654, + "learning_rate": 1.9830642496937445e-10, + "loss": 0.2263, + "step": 5472 + }, + { + "epoch": 0.9914855072463769, + "grad_norm": 4.095263786767813, + "learning_rate": 1.9013024601199913e-10, + "loss": 0.2975, + "step": 5473 + }, + { + "epoch": 0.9916666666666667, + "grad_norm": 5.329225075674741, + "learning_rate": 1.8212615396095975e-10, + "loss": 0.2876, + "step": 5474 + }, + { + "epoch": 0.9918478260869565, + "grad_norm": 4.959822526288037, + "learning_rate": 1.7429415157205195e-10, + "loss": 0.3141, + "step": 5475 + }, + { + "epoch": 0.9920289855072464, + "grad_norm": 5.153386631951864, + "learning_rate": 1.6663424154189642e-10, + "loss": 0.2689, + "step": 5476 + }, + { + "epoch": 0.9922101449275362, + "grad_norm": 7.4229996497174096, + "learning_rate": 1.5914642650782795e-10, + "loss": 0.3066, + "step": 5477 + }, + { + "epoch": 0.9923913043478261, + "grad_norm": 5.439655998064373, + "learning_rate": 1.5183070904795093e-10, + "loss": 0.2675, + "step": 5478 + }, + { + "epoch": 0.9925724637681159, + "grad_norm": 4.203423533849288, + "learning_rate": 1.4468709168108384e-10, + "loss": 0.3242, + "step": 5479 + }, + { + "epoch": 0.9927536231884058, + "grad_norm": 5.179510812789131, + "learning_rate": 1.3771557686681479e-10, + "loss": 0.3707, + "step": 5480 + }, + { + "epoch": 0.9929347826086956, + "grad_norm": 4.930300349517559, + "learning_rate": 1.3091616700544594e-10, + "loss": 0.3034, + "step": 5481 + }, + { + "epoch": 0.9931159420289855, + "grad_norm": 7.220794549135853, + "learning_rate": 1.2428886443810461e-10, + "loss": 0.2843, + "step": 5482 + }, + { + "epoch": 0.9932971014492754, + "grad_norm": 4.214810346606237, + "learning_rate": 1.1783367144652112e-10, + "loss": 0.298, + "step": 5483 + }, + { + "epoch": 0.9934782608695653, + "grad_norm": 5.508075277282163, + "learning_rate": 1.1155059025336199e-10, + "loss": 0.2578, + "step": 5484 + }, + { + "epoch": 0.9936594202898551, + "grad_norm": 3.614860085170085, + "learning_rate": 1.0543962302184129e-10, + "loss": 0.2495, + "step": 5485 + }, + { + "epoch": 0.993840579710145, + "grad_norm": 3.6827084359797553, + "learning_rate": 9.950077185594263e-11, + "loss": 0.24, + "step": 5486 + }, + { + "epoch": 0.9940217391304348, + "grad_norm": 7.467738509768066, + "learning_rate": 9.373403880058584e-11, + "loss": 0.2676, + "step": 5487 + }, + { + "epoch": 0.9942028985507246, + "grad_norm": 5.411381852990324, + "learning_rate": 8.813942584118273e-11, + "loss": 0.3359, + "step": 5488 + }, + { + "epoch": 0.9943840579710145, + "grad_norm": 7.004289629299719, + "learning_rate": 8.271693490397025e-11, + "loss": 0.3007, + "step": 5489 + }, + { + "epoch": 0.9945652173913043, + "grad_norm": 3.7988064531036145, + "learning_rate": 7.746656785601046e-11, + "loss": 0.2605, + "step": 5490 + }, + { + "epoch": 0.9947463768115942, + "grad_norm": 4.464541104464458, + "learning_rate": 7.238832650502402e-11, + "loss": 0.3206, + "step": 5491 + }, + { + "epoch": 0.994927536231884, + "grad_norm": 4.6955437267846785, + "learning_rate": 6.748221259939014e-11, + "loss": 0.2586, + "step": 5492 + }, + { + "epoch": 0.9951086956521739, + "grad_norm": 6.495103498924206, + "learning_rate": 6.274822782836864e-11, + "loss": 0.3049, + "step": 5493 + }, + { + "epoch": 0.9952898550724638, + "grad_norm": 3.3438673053709094, + "learning_rate": 5.8186373821877966e-11, + "loss": 0.1991, + "step": 5494 + }, + { + "epoch": 0.9954710144927537, + "grad_norm": 3.979301586623832, + "learning_rate": 5.3796652150606135e-11, + "loss": 0.2931, + "step": 5495 + }, + { + "epoch": 0.9956521739130435, + "grad_norm": 5.001210512934167, + "learning_rate": 4.9579064325955265e-11, + "loss": 0.2877, + "step": 5496 + }, + { + "epoch": 0.9958333333333333, + "grad_norm": 4.259152831377175, + "learning_rate": 4.553361180004156e-11, + "loss": 0.2743, + "step": 5497 + }, + { + "epoch": 0.9960144927536232, + "grad_norm": 8.745702706044188, + "learning_rate": 4.1660295965750824e-11, + "loss": 0.2693, + "step": 5498 + }, + { + "epoch": 0.996195652173913, + "grad_norm": 3.377521619414664, + "learning_rate": 3.795911815662744e-11, + "loss": 0.2079, + "step": 5499 + }, + { + "epoch": 0.9963768115942029, + "grad_norm": 9.176887964146367, + "learning_rate": 3.443007964709643e-11, + "loss": 0.2444, + "step": 5500 + }, + { + "epoch": 0.9963768115942029, + "eval_loss": 0.26609376072883606, + "eval_runtime": 9.9093, + "eval_samples_per_second": 50.458, + "eval_steps_per_second": 0.101, + "step": 5500 + }, + { + "epoch": 0.9965579710144927, + "grad_norm": 10.880353098499464, + "learning_rate": 3.1073181652130354e-11, + "loss": 0.2796, + "step": 5501 + }, + { + "epoch": 0.9967391304347826, + "grad_norm": 4.910476782688724, + "learning_rate": 2.7888425327582398e-11, + "loss": 0.3411, + "step": 5502 + }, + { + "epoch": 0.9969202898550724, + "grad_norm": 4.660099329650357, + "learning_rate": 2.487581177001985e-11, + "loss": 0.2875, + "step": 5503 + }, + { + "epoch": 0.9971014492753624, + "grad_norm": 3.97791689712913, + "learning_rate": 2.2035342016613055e-11, + "loss": 0.2749, + "step": 5504 + }, + { + "epoch": 0.9972826086956522, + "grad_norm": 3.333737148729066, + "learning_rate": 1.936701704535748e-11, + "loss": 0.1992, + "step": 5505 + }, + { + "epoch": 0.9974637681159421, + "grad_norm": 8.32486290836786, + "learning_rate": 1.6870837775018187e-11, + "loss": 0.241, + "step": 5506 + }, + { + "epoch": 0.9976449275362319, + "grad_norm": 5.152901366150169, + "learning_rate": 1.454680506501882e-11, + "loss": 0.3158, + "step": 5507 + }, + { + "epoch": 0.9978260869565218, + "grad_norm": 3.950925255581313, + "learning_rate": 1.239491971549711e-11, + "loss": 0.3058, + "step": 5508 + }, + { + "epoch": 0.9980072463768116, + "grad_norm": 4.680356667952785, + "learning_rate": 1.0415182467471417e-11, + "loss": 0.2732, + "step": 5509 + }, + { + "epoch": 0.9981884057971014, + "grad_norm": 3.1183893730487924, + "learning_rate": 8.607594002452145e-12, + "loss": 0.2336, + "step": 5510 + }, + { + "epoch": 0.9983695652173913, + "grad_norm": 6.70328003220962, + "learning_rate": 6.972154942830322e-12, + "loss": 0.2884, + "step": 5511 + }, + { + "epoch": 0.9985507246376811, + "grad_norm": 3.726600456585857, + "learning_rate": 5.508865851766575e-12, + "loss": 0.2332, + "step": 5512 + }, + { + "epoch": 0.998731884057971, + "grad_norm": 4.185692384356067, + "learning_rate": 4.217727232969093e-12, + "loss": 0.2789, + "step": 5513 + }, + { + "epoch": 0.9989130434782608, + "grad_norm": 5.320448399575653, + "learning_rate": 3.098739531082195e-12, + "loss": 0.2807, + "step": 5514 + }, + { + "epoch": 0.9990942028985508, + "grad_norm": 4.351928942124165, + "learning_rate": 2.151903131297761e-12, + "loss": 0.2525, + "step": 5515 + }, + { + "epoch": 0.9992753623188406, + "grad_norm": 6.602538203595945, + "learning_rate": 1.3772183596882925e-12, + "loss": 0.3251, + "step": 5516 + }, + { + "epoch": 0.9994565217391305, + "grad_norm": 5.979729564355757, + "learning_rate": 7.746854829293603e-13, + "loss": 0.3123, + "step": 5517 + }, + { + "epoch": 0.9996376811594203, + "grad_norm": 11.580413662837362, + "learning_rate": 3.4430470846613656e-13, + "loss": 0.3552, + "step": 5518 + }, + { + "epoch": 0.9998188405797102, + "grad_norm": 4.705549896670449, + "learning_rate": 8.607618451339505e-14, + "loss": 0.273, + "step": 5519 + }, + { + "epoch": 1.0, + "grad_norm": 4.435987266997774, + "learning_rate": 0.0, + "loss": 0.3062, + "step": 5520 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3744526270464000.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}