{ "best_metric": 1.2292229623575261, "best_model_checkpoint": "train/Large-20241118-Compress:128x-Lr:5e-5-Llama3-8B-instruct-GPT2-Large-RAG-no-ft_token-onlySquad-everymem/checkpoint-800", "epoch": 0.5896987008200497, "eval_steps": 800, "global_step": 1600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003685616880125311, "grad_norm": 12.896270070771674, "learning_rate": 0.0, "loss": 4.8, "step": 1 }, { "epoch": 0.0007371233760250622, "grad_norm": 18.877003948618857, "learning_rate": 7.525749891599529e-06, "loss": 4.4021, "step": 2 }, { "epoch": 0.0011056850640375933, "grad_norm": 13.809141277157678, "learning_rate": 1.192803136799156e-05, "loss": 4.0912, "step": 3 }, { "epoch": 0.0014742467520501245, "grad_norm": 15.250628348991476, "learning_rate": 1.5051499783199057e-05, "loss": 4.399, "step": 4 }, { "epoch": 0.0018428084400626554, "grad_norm": 15.171449067946314, "learning_rate": 1.7474250108400467e-05, "loss": 3.9989, "step": 5 }, { "epoch": 0.0022113701280751866, "grad_norm": 18.915619450065268, "learning_rate": 1.945378125959109e-05, "loss": 3.6823, "step": 6 }, { "epoch": 0.0025799318160877175, "grad_norm": 18.087501619730403, "learning_rate": 2.1127451000356418e-05, "loss": 3.6053, "step": 7 }, { "epoch": 0.002948493504100249, "grad_norm": 14.263011680471472, "learning_rate": 2.2577249674798584e-05, "loss": 3.0096, "step": 8 }, { "epoch": 0.00331705519211278, "grad_norm": 15.50245794680889, "learning_rate": 2.385606273598312e-05, "loss": 2.9869, "step": 9 }, { "epoch": 0.003685616880125311, "grad_norm": 32.60693981983072, "learning_rate": 2.4999999999999998e-05, "loss": 2.2911, "step": 10 }, { "epoch": 0.004054178568137842, "grad_norm": 13.57563672899567, "learning_rate": 2.6034817128955623e-05, "loss": 2.3186, "step": 11 }, { "epoch": 0.004422740256150373, "grad_norm": 12.788842965434021, "learning_rate": 2.6979531151190617e-05, "loss": 2.2492, "step": 12 }, { "epoch": 0.0047913019441629045, "grad_norm": 12.147698304480553, "learning_rate": 2.7848583807670913e-05, "loss": 2.3967, "step": 13 }, { "epoch": 0.005159863632175435, "grad_norm": 13.45873141019892, "learning_rate": 2.8653200891955945e-05, "loss": 2.0692, "step": 14 }, { "epoch": 0.005528425320187966, "grad_norm": 10.21726003627811, "learning_rate": 2.940228147639203e-05, "loss": 1.8092, "step": 15 }, { "epoch": 0.005896987008200498, "grad_norm": 9.78948491456921, "learning_rate": 3.0102999566398115e-05, "loss": 1.7192, "step": 16 }, { "epoch": 0.006265548696213028, "grad_norm": 7.983116931676763, "learning_rate": 3.076122303445685e-05, "loss": 2.0458, "step": 17 }, { "epoch": 0.00663411038422556, "grad_norm": 8.95343943737763, "learning_rate": 3.1381812627582646e-05, "loss": 1.7167, "step": 18 }, { "epoch": 0.007002672072238091, "grad_norm": 9.31969609104783, "learning_rate": 3.1968840023820715e-05, "loss": 1.5482, "step": 19 }, { "epoch": 0.007371233760250622, "grad_norm": 12.912404068685575, "learning_rate": 3.2525749891599525e-05, "loss": 1.8441, "step": 20 }, { "epoch": 0.007739795448263153, "grad_norm": 8.440545330780294, "learning_rate": 3.305548236834798e-05, "loss": 1.637, "step": 21 }, { "epoch": 0.008108357136275684, "grad_norm": 8.68116144803874, "learning_rate": 3.3560567020555153e-05, "loss": 1.3644, "step": 22 }, { "epoch": 0.008476918824288216, "grad_norm": 9.08421344751217, "learning_rate": 3.404319590043982e-05, "loss": 1.7252, "step": 23 }, { "epoch": 0.008845480512300746, "grad_norm": 9.850779902376187, "learning_rate": 3.450528104279015e-05, "loss": 1.442, "step": 24 }, { "epoch": 0.009214042200313277, "grad_norm": 9.530609609634109, "learning_rate": 3.4948500216800935e-05, "loss": 1.3857, "step": 25 }, { "epoch": 0.009582603888325809, "grad_norm": 6.950202104662446, "learning_rate": 3.537433369927044e-05, "loss": 1.7229, "step": 26 }, { "epoch": 0.00995116557633834, "grad_norm": 10.707029824881335, "learning_rate": 3.578409410397468e-05, "loss": 1.7879, "step": 27 }, { "epoch": 0.01031972726435087, "grad_norm": 6.991325998778158, "learning_rate": 3.6178950783555475e-05, "loss": 1.8019, "step": 28 }, { "epoch": 0.010688288952363402, "grad_norm": 6.7098116302201145, "learning_rate": 3.65599499474739e-05, "loss": 1.4029, "step": 29 }, { "epoch": 0.011056850640375933, "grad_norm": 6.62638552590622, "learning_rate": 3.6928031367991554e-05, "loss": 1.3335, "step": 30 }, { "epoch": 0.011425412328388463, "grad_norm": 8.902163776153579, "learning_rate": 3.728404234585681e-05, "loss": 1.2416, "step": 31 }, { "epoch": 0.011793974016400996, "grad_norm": 7.219531260567715, "learning_rate": 3.762874945799765e-05, "loss": 1.679, "step": 32 }, { "epoch": 0.012162535704413526, "grad_norm": 7.798472024959265, "learning_rate": 3.796284849694718e-05, "loss": 1.8278, "step": 33 }, { "epoch": 0.012531097392426057, "grad_norm": 7.345505005554151, "learning_rate": 3.8286972926056376e-05, "loss": 1.6604, "step": 34 }, { "epoch": 0.012899659080438589, "grad_norm": 6.285130151631645, "learning_rate": 3.8601701108756885e-05, "loss": 1.5006, "step": 35 }, { "epoch": 0.01326822076845112, "grad_norm": 8.035105451498394, "learning_rate": 3.890756251918218e-05, "loss": 1.4861, "step": 36 }, { "epoch": 0.01363678245646365, "grad_norm": 6.425750083036117, "learning_rate": 3.920504310167487e-05, "loss": 1.5534, "step": 37 }, { "epoch": 0.014005344144476182, "grad_norm": 5.491950473581536, "learning_rate": 3.949458991542025e-05, "loss": 1.4161, "step": 38 }, { "epoch": 0.014373905832488713, "grad_norm": 5.936164143516378, "learning_rate": 3.977661517566247e-05, "loss": 1.1817, "step": 39 }, { "epoch": 0.014742467520501243, "grad_norm": 9.030311403703173, "learning_rate": 4.005149978319905e-05, "loss": 1.3424, "step": 40 }, { "epoch": 0.015111029208513775, "grad_norm": 6.444348294355671, "learning_rate": 4.031959641799338e-05, "loss": 1.6785, "step": 41 }, { "epoch": 0.015479590896526306, "grad_norm": 6.269465232810524, "learning_rate": 4.058123225994751e-05, "loss": 1.1862, "step": 42 }, { "epoch": 0.015848152584538838, "grad_norm": 6.713123325269203, "learning_rate": 4.0836711389489654e-05, "loss": 1.3913, "step": 43 }, { "epoch": 0.016216714272551367, "grad_norm": 8.067728875373971, "learning_rate": 4.108631691215468e-05, "loss": 1.332, "step": 44 }, { "epoch": 0.0165852759605639, "grad_norm": 8.01298708573356, "learning_rate": 4.133031284438358e-05, "loss": 1.5009, "step": 45 }, { "epoch": 0.01695383764857643, "grad_norm": 7.887492610002976, "learning_rate": 4.156894579203935e-05, "loss": 1.4764, "step": 46 }, { "epoch": 0.01732239933658896, "grad_norm": 6.551337639888985, "learning_rate": 4.180244644839293e-05, "loss": 1.2348, "step": 47 }, { "epoch": 0.017690961024601493, "grad_norm": 10.849875706958366, "learning_rate": 4.203103093438968e-05, "loss": 1.3009, "step": 48 }, { "epoch": 0.018059522712614025, "grad_norm": 5.622759902802231, "learning_rate": 4.2254902000712836e-05, "loss": 1.4009, "step": 49 }, { "epoch": 0.018428084400626554, "grad_norm": 6.179210002984712, "learning_rate": 4.247425010840046e-05, "loss": 1.3582, "step": 50 }, { "epoch": 0.018796646088639086, "grad_norm": 5.3121326992741125, "learning_rate": 4.2689254402448405e-05, "loss": 1.2138, "step": 51 }, { "epoch": 0.019165207776651618, "grad_norm": 5.954361594417181, "learning_rate": 4.290008359086998e-05, "loss": 1.3145, "step": 52 }, { "epoch": 0.019533769464664147, "grad_norm": 5.374896381177821, "learning_rate": 4.310689674001973e-05, "loss": 1.3602, "step": 53 }, { "epoch": 0.01990233115267668, "grad_norm": 6.972429475916616, "learning_rate": 4.330984399557421e-05, "loss": 1.6309, "step": 54 }, { "epoch": 0.02027089284068921, "grad_norm": 6.247904006455041, "learning_rate": 4.350906723735609e-05, "loss": 1.326, "step": 55 }, { "epoch": 0.02063945452870174, "grad_norm": 6.492689570030528, "learning_rate": 4.370470067515501e-05, "loss": 1.0703, "step": 56 }, { "epoch": 0.021008016216714272, "grad_norm": 6.087590787551319, "learning_rate": 4.3896871391812285e-05, "loss": 1.3541, "step": 57 }, { "epoch": 0.021376577904726805, "grad_norm": 6.295398169810634, "learning_rate": 4.408569983907343e-05, "loss": 1.0475, "step": 58 }, { "epoch": 0.021745139592739333, "grad_norm": 6.911997617156007, "learning_rate": 4.42713002910536e-05, "loss": 1.4733, "step": 59 }, { "epoch": 0.022113701280751866, "grad_norm": 5.494744998197354, "learning_rate": 4.445378125959108e-05, "loss": 1.319, "step": 60 }, { "epoch": 0.022482262968764398, "grad_norm": 7.122526843849549, "learning_rate": 4.463324587526917e-05, "loss": 1.7259, "step": 61 }, { "epoch": 0.022850824656776927, "grad_norm": 6.332925515680585, "learning_rate": 4.4809792237456346e-05, "loss": 1.1279, "step": 62 }, { "epoch": 0.02321938634478946, "grad_norm": 5.536222991774125, "learning_rate": 4.498351373633954e-05, "loss": 1.1275, "step": 63 }, { "epoch": 0.02358794803280199, "grad_norm": 6.668778212254347, "learning_rate": 4.515449934959717e-05, "loss": 1.4898, "step": 64 }, { "epoch": 0.02395650972081452, "grad_norm": 5.0639341230401325, "learning_rate": 4.532283391607138e-05, "loss": 1.348, "step": 65 }, { "epoch": 0.024325071408827052, "grad_norm": 6.281730367786747, "learning_rate": 4.548859838854671e-05, "loss": 1.4306, "step": 66 }, { "epoch": 0.024693633096839585, "grad_norm": 6.834738176042927, "learning_rate": 4.565187006752065e-05, "loss": 1.3899, "step": 67 }, { "epoch": 0.025062194784852113, "grad_norm": 5.802680796937942, "learning_rate": 4.581272281765591e-05, "loss": 1.2185, "step": 68 }, { "epoch": 0.025430756472864646, "grad_norm": 6.077391278617657, "learning_rate": 4.597122726843138e-05, "loss": 1.3475, "step": 69 }, { "epoch": 0.025799318160877178, "grad_norm": 6.042202980528721, "learning_rate": 4.612745100035642e-05, "loss": 1.4218, "step": 70 }, { "epoch": 0.026167879848889707, "grad_norm": 5.491391467471923, "learning_rate": 4.628145871797688e-05, "loss": 1.23, "step": 71 }, { "epoch": 0.02653644153690224, "grad_norm": 5.470718116835045, "learning_rate": 4.643331241078171e-05, "loss": 1.1512, "step": 72 }, { "epoch": 0.02690500322491477, "grad_norm": 7.372062486024624, "learning_rate": 4.658307150301139e-05, "loss": 1.2542, "step": 73 }, { "epoch": 0.0272735649129273, "grad_norm": 6.558139187735565, "learning_rate": 4.67307929932744e-05, "loss": 1.4261, "step": 74 }, { "epoch": 0.027642126600939832, "grad_norm": 5.525953957894997, "learning_rate": 4.687653158479249e-05, "loss": 1.3646, "step": 75 }, { "epoch": 0.028010688288952364, "grad_norm": 11.041044474411716, "learning_rate": 4.702033980701978e-05, "loss": 1.6158, "step": 76 }, { "epoch": 0.028379249976964893, "grad_norm": 6.497371435533685, "learning_rate": 4.716226812931204e-05, "loss": 1.2474, "step": 77 }, { "epoch": 0.028747811664977425, "grad_norm": 7.42491405707048, "learning_rate": 4.7302365067262006e-05, "loss": 1.5185, "step": 78 }, { "epoch": 0.029116373352989958, "grad_norm": 10.950644306056764, "learning_rate": 4.744067728226103e-05, "loss": 1.324, "step": 79 }, { "epoch": 0.029484935041002486, "grad_norm": 7.762113395069036, "learning_rate": 4.757724967479858e-05, "loss": 1.4988, "step": 80 }, { "epoch": 0.02985349672901502, "grad_norm": 5.095532813247198, "learning_rate": 4.771212547196624e-05, "loss": 1.0123, "step": 81 }, { "epoch": 0.03022205841702755, "grad_norm": 6.466487470418397, "learning_rate": 4.7845346309592914e-05, "loss": 1.2968, "step": 82 }, { "epoch": 0.03059062010504008, "grad_norm": 8.714969813633267, "learning_rate": 4.7976952309401844e-05, "loss": 1.3373, "step": 83 }, { "epoch": 0.030959181793052612, "grad_norm": 9.558435311697734, "learning_rate": 4.810698215154703e-05, "loss": 1.0918, "step": 84 }, { "epoch": 0.031327743481065144, "grad_norm": 7.453071769988114, "learning_rate": 4.823547314285732e-05, "loss": 1.1226, "step": 85 }, { "epoch": 0.031696305169077676, "grad_norm": 6.639004857510415, "learning_rate": 4.836246128108918e-05, "loss": 1.2868, "step": 86 }, { "epoch": 0.03206486685709021, "grad_norm": 4.724710905727481, "learning_rate": 4.8487981315465456e-05, "loss": 1.1106, "step": 87 }, { "epoch": 0.032433428545102734, "grad_norm": 6.837736370457331, "learning_rate": 4.8612066803754214e-05, "loss": 1.6252, "step": 88 }, { "epoch": 0.032801990233115266, "grad_norm": 6.019233394236917, "learning_rate": 4.873475016612281e-05, "loss": 1.0721, "step": 89 }, { "epoch": 0.0331705519211278, "grad_norm": 9.239152476096631, "learning_rate": 4.885606273598312e-05, "loss": 1.4812, "step": 90 }, { "epoch": 0.03353911360914033, "grad_norm": 6.304085741786427, "learning_rate": 4.897603480802733e-05, "loss": 1.1924, "step": 91 }, { "epoch": 0.03390767529715286, "grad_norm": 5.496624344168719, "learning_rate": 4.909469568363888e-05, "loss": 1.3279, "step": 92 }, { "epoch": 0.034276236985165395, "grad_norm": 7.253992494102278, "learning_rate": 4.9212073713848375e-05, "loss": 1.2015, "step": 93 }, { "epoch": 0.03464479867317792, "grad_norm": 6.2800631943943, "learning_rate": 4.932819633999246e-05, "loss": 1.3519, "step": 94 }, { "epoch": 0.03501336036119045, "grad_norm": 12.403908611906937, "learning_rate": 4.9443090132221186e-05, "loss": 1.3798, "step": 95 }, { "epoch": 0.035381922049202985, "grad_norm": 6.129520849820863, "learning_rate": 4.9556780825989205e-05, "loss": 1.4534, "step": 96 }, { "epoch": 0.03575048373721552, "grad_norm": 5.87801462300011, "learning_rate": 4.9669293356656114e-05, "loss": 1.422, "step": 97 }, { "epoch": 0.03611904542522805, "grad_norm": 9.118639257541647, "learning_rate": 4.978065189231237e-05, "loss": 1.2414, "step": 98 }, { "epoch": 0.03648760711324058, "grad_norm": 13.095668606637952, "learning_rate": 4.989087986493874e-05, "loss": 1.4445, "step": 99 }, { "epoch": 0.03685616880125311, "grad_norm": 8.545076561099883, "learning_rate": 4.9999999999999996e-05, "loss": 1.4495, "step": 100 }, { "epoch": 0.03722473048926564, "grad_norm": 6.611604210908213, "learning_rate": 5e-05, "loss": 1.2762, "step": 101 }, { "epoch": 0.03759329217727817, "grad_norm": 8.442047024210678, "learning_rate": 4.9996909383112874e-05, "loss": 1.5386, "step": 102 }, { "epoch": 0.037961853865290704, "grad_norm": 6.2775880377949, "learning_rate": 4.9993818766225745e-05, "loss": 1.2173, "step": 103 }, { "epoch": 0.038330415553303236, "grad_norm": 7.0201621560172915, "learning_rate": 4.999072814933861e-05, "loss": 1.2283, "step": 104 }, { "epoch": 0.03869897724131577, "grad_norm": 10.250879389558602, "learning_rate": 4.998763753245148e-05, "loss": 1.3552, "step": 105 }, { "epoch": 0.039067538929328294, "grad_norm": 6.910859515866813, "learning_rate": 4.998454691556435e-05, "loss": 0.9422, "step": 106 }, { "epoch": 0.039436100617340826, "grad_norm": 6.418197581320716, "learning_rate": 4.998145629867722e-05, "loss": 1.3491, "step": 107 }, { "epoch": 0.03980466230535336, "grad_norm": 15.825695939753773, "learning_rate": 4.997836568179009e-05, "loss": 1.3226, "step": 108 }, { "epoch": 0.04017322399336589, "grad_norm": 5.754672598321053, "learning_rate": 4.997527506490295e-05, "loss": 1.1386, "step": 109 }, { "epoch": 0.04054178568137842, "grad_norm": 5.521963308564077, "learning_rate": 4.997218444801582e-05, "loss": 1.2448, "step": 110 }, { "epoch": 0.040910347369390955, "grad_norm": 5.99041363696967, "learning_rate": 4.9969093831128694e-05, "loss": 1.2734, "step": 111 }, { "epoch": 0.04127890905740348, "grad_norm": 5.729272053067182, "learning_rate": 4.9966003214241565e-05, "loss": 1.3659, "step": 112 }, { "epoch": 0.04164747074541601, "grad_norm": 7.177540654273579, "learning_rate": 4.9962912597354436e-05, "loss": 1.2557, "step": 113 }, { "epoch": 0.042016032433428545, "grad_norm": 6.46522433264204, "learning_rate": 4.99598219804673e-05, "loss": 1.1916, "step": 114 }, { "epoch": 0.04238459412144108, "grad_norm": 4.342757077247405, "learning_rate": 4.995673136358017e-05, "loss": 1.0334, "step": 115 }, { "epoch": 0.04275315580945361, "grad_norm": 5.121311255869164, "learning_rate": 4.995364074669304e-05, "loss": 1.3723, "step": 116 }, { "epoch": 0.04312171749746614, "grad_norm": 3.8131571969344136, "learning_rate": 4.9950550129805914e-05, "loss": 0.8755, "step": 117 }, { "epoch": 0.04349027918547867, "grad_norm": 7.8174622912301075, "learning_rate": 4.994745951291878e-05, "loss": 1.4283, "step": 118 }, { "epoch": 0.0438588408734912, "grad_norm": 5.278775871442899, "learning_rate": 4.994436889603165e-05, "loss": 1.1676, "step": 119 }, { "epoch": 0.04422740256150373, "grad_norm": 6.909196734188335, "learning_rate": 4.994127827914452e-05, "loss": 1.2821, "step": 120 }, { "epoch": 0.044595964249516264, "grad_norm": 6.463831114045171, "learning_rate": 4.993818766225739e-05, "loss": 1.186, "step": 121 }, { "epoch": 0.044964525937528796, "grad_norm": 7.19514364071337, "learning_rate": 4.9935097045370264e-05, "loss": 1.3743, "step": 122 }, { "epoch": 0.04533308762554133, "grad_norm": 7.69720714501383, "learning_rate": 4.993200642848313e-05, "loss": 1.2491, "step": 123 }, { "epoch": 0.04570164931355385, "grad_norm": 8.017544581613754, "learning_rate": 4.992891581159599e-05, "loss": 1.3167, "step": 124 }, { "epoch": 0.046070211001566386, "grad_norm": 6.222617935283015, "learning_rate": 4.9925825194708864e-05, "loss": 1.6164, "step": 125 }, { "epoch": 0.04643877268957892, "grad_norm": 7.450710329332439, "learning_rate": 4.9922734577821735e-05, "loss": 1.4724, "step": 126 }, { "epoch": 0.04680733437759145, "grad_norm": 9.3893473385973, "learning_rate": 4.9919643960934606e-05, "loss": 1.3712, "step": 127 }, { "epoch": 0.04717589606560398, "grad_norm": 5.248348157871243, "learning_rate": 4.991655334404747e-05, "loss": 1.1371, "step": 128 }, { "epoch": 0.047544457753616515, "grad_norm": 6.112784524548096, "learning_rate": 4.991346272716034e-05, "loss": 1.1942, "step": 129 }, { "epoch": 0.04791301944162904, "grad_norm": 5.137303540262627, "learning_rate": 4.991037211027321e-05, "loss": 1.4038, "step": 130 }, { "epoch": 0.04828158112964157, "grad_norm": 6.0469601647345685, "learning_rate": 4.9907281493386084e-05, "loss": 1.2442, "step": 131 }, { "epoch": 0.048650142817654105, "grad_norm": 3.9979171098261923, "learning_rate": 4.9904190876498955e-05, "loss": 0.8527, "step": 132 }, { "epoch": 0.04901870450566664, "grad_norm": 6.227755213362082, "learning_rate": 4.990110025961182e-05, "loss": 1.3631, "step": 133 }, { "epoch": 0.04938726619367917, "grad_norm": 6.369496681534124, "learning_rate": 4.989800964272469e-05, "loss": 1.4197, "step": 134 }, { "epoch": 0.0497558278816917, "grad_norm": 7.707935742271236, "learning_rate": 4.989491902583756e-05, "loss": 1.5846, "step": 135 }, { "epoch": 0.05012438956970423, "grad_norm": 6.682224938368441, "learning_rate": 4.989182840895043e-05, "loss": 1.1206, "step": 136 }, { "epoch": 0.05049295125771676, "grad_norm": 7.102074935100943, "learning_rate": 4.98887377920633e-05, "loss": 1.2502, "step": 137 }, { "epoch": 0.05086151294572929, "grad_norm": 8.672377783144263, "learning_rate": 4.988564717517617e-05, "loss": 1.4722, "step": 138 }, { "epoch": 0.05123007463374182, "grad_norm": 9.926028750357803, "learning_rate": 4.988255655828903e-05, "loss": 1.5996, "step": 139 }, { "epoch": 0.051598636321754356, "grad_norm": 14.549701616826095, "learning_rate": 4.9879465941401904e-05, "loss": 1.3695, "step": 140 }, { "epoch": 0.05196719800976689, "grad_norm": 5.92643847809694, "learning_rate": 4.9876375324514776e-05, "loss": 1.1861, "step": 141 }, { "epoch": 0.05233575969777941, "grad_norm": 6.472679529510792, "learning_rate": 4.987328470762764e-05, "loss": 1.4636, "step": 142 }, { "epoch": 0.052704321385791945, "grad_norm": 8.894873490615957, "learning_rate": 4.987019409074051e-05, "loss": 1.2821, "step": 143 }, { "epoch": 0.05307288307380448, "grad_norm": 11.999076171918077, "learning_rate": 4.986710347385338e-05, "loss": 1.323, "step": 144 }, { "epoch": 0.05344144476181701, "grad_norm": 6.911220505241481, "learning_rate": 4.9864012856966254e-05, "loss": 1.1951, "step": 145 }, { "epoch": 0.05381000644982954, "grad_norm": 5.720003918199597, "learning_rate": 4.9860922240079125e-05, "loss": 1.4091, "step": 146 }, { "epoch": 0.054178568137842074, "grad_norm": 5.165568491401064, "learning_rate": 4.985783162319199e-05, "loss": 1.2144, "step": 147 }, { "epoch": 0.0545471298258546, "grad_norm": 14.503254492324839, "learning_rate": 4.985474100630486e-05, "loss": 1.632, "step": 148 }, { "epoch": 0.05491569151386713, "grad_norm": 17.155139101954898, "learning_rate": 4.985165038941773e-05, "loss": 1.4753, "step": 149 }, { "epoch": 0.055284253201879664, "grad_norm": 5.80418966836016, "learning_rate": 4.98485597725306e-05, "loss": 1.2038, "step": 150 }, { "epoch": 0.055652814889892196, "grad_norm": 6.083616938142048, "learning_rate": 4.9845469155643474e-05, "loss": 1.0644, "step": 151 }, { "epoch": 0.05602137657790473, "grad_norm": 4.575967013193491, "learning_rate": 4.984237853875634e-05, "loss": 1.28, "step": 152 }, { "epoch": 0.05638993826591726, "grad_norm": 12.647913921374544, "learning_rate": 4.983928792186921e-05, "loss": 1.2258, "step": 153 }, { "epoch": 0.056758499953929786, "grad_norm": 6.154330656345718, "learning_rate": 4.9836197304982074e-05, "loss": 1.1833, "step": 154 }, { "epoch": 0.05712706164194232, "grad_norm": 4.93816709237119, "learning_rate": 4.9833106688094945e-05, "loss": 1.3088, "step": 155 }, { "epoch": 0.05749562332995485, "grad_norm": 4.907568104563373, "learning_rate": 4.9830016071207816e-05, "loss": 1.0939, "step": 156 }, { "epoch": 0.05786418501796738, "grad_norm": 6.189850640043338, "learning_rate": 4.982692545432068e-05, "loss": 1.2326, "step": 157 }, { "epoch": 0.058232746705979915, "grad_norm": 7.058363521771088, "learning_rate": 4.982383483743355e-05, "loss": 1.1745, "step": 158 }, { "epoch": 0.05860130839399245, "grad_norm": 4.919486880333302, "learning_rate": 4.982074422054642e-05, "loss": 1.0849, "step": 159 }, { "epoch": 0.05896987008200497, "grad_norm": 5.8735534835124845, "learning_rate": 4.9817653603659294e-05, "loss": 1.1559, "step": 160 }, { "epoch": 0.059338431770017505, "grad_norm": 5.040765708099335, "learning_rate": 4.981456298677216e-05, "loss": 1.0756, "step": 161 }, { "epoch": 0.05970699345803004, "grad_norm": 8.16915772521581, "learning_rate": 4.981147236988503e-05, "loss": 1.2112, "step": 162 }, { "epoch": 0.06007555514604257, "grad_norm": 5.043000142112789, "learning_rate": 4.98083817529979e-05, "loss": 1.3525, "step": 163 }, { "epoch": 0.0604441168340551, "grad_norm": 5.544478465561763, "learning_rate": 4.980529113611077e-05, "loss": 0.939, "step": 164 }, { "epoch": 0.060812678522067634, "grad_norm": 6.673261940517381, "learning_rate": 4.9802200519223643e-05, "loss": 1.0678, "step": 165 }, { "epoch": 0.06118124021008016, "grad_norm": 4.979940229309986, "learning_rate": 4.979910990233651e-05, "loss": 1.1876, "step": 166 }, { "epoch": 0.06154980189809269, "grad_norm": 5.472248811006362, "learning_rate": 4.979601928544938e-05, "loss": 1.1034, "step": 167 }, { "epoch": 0.061918363586105224, "grad_norm": 5.4912989020997705, "learning_rate": 4.979292866856225e-05, "loss": 1.0243, "step": 168 }, { "epoch": 0.062286925274117756, "grad_norm": 5.554134121356154, "learning_rate": 4.978983805167512e-05, "loss": 1.1394, "step": 169 }, { "epoch": 0.06265548696213029, "grad_norm": 5.551975537864596, "learning_rate": 4.9786747434787986e-05, "loss": 1.1851, "step": 170 }, { "epoch": 0.06302404865014281, "grad_norm": 6.409577189529365, "learning_rate": 4.978365681790085e-05, "loss": 1.2277, "step": 171 }, { "epoch": 0.06339261033815535, "grad_norm": 5.809053116818065, "learning_rate": 4.978056620101372e-05, "loss": 1.1191, "step": 172 }, { "epoch": 0.06376117202616788, "grad_norm": 6.705922643345594, "learning_rate": 4.977747558412659e-05, "loss": 1.2595, "step": 173 }, { "epoch": 0.06412973371418042, "grad_norm": 4.664891972737372, "learning_rate": 4.9774384967239464e-05, "loss": 0.8822, "step": 174 }, { "epoch": 0.06449829540219294, "grad_norm": 7.165478215999602, "learning_rate": 4.9771294350352335e-05, "loss": 1.4933, "step": 175 }, { "epoch": 0.06486685709020547, "grad_norm": 6.629950634986907, "learning_rate": 4.97682037334652e-05, "loss": 1.0551, "step": 176 }, { "epoch": 0.06523541877821801, "grad_norm": 7.122760085997266, "learning_rate": 4.976511311657807e-05, "loss": 1.1245, "step": 177 }, { "epoch": 0.06560398046623053, "grad_norm": 5.391997320225443, "learning_rate": 4.976202249969094e-05, "loss": 1.2946, "step": 178 }, { "epoch": 0.06597254215424307, "grad_norm": 6.253593327391423, "learning_rate": 4.975893188280381e-05, "loss": 1.2977, "step": 179 }, { "epoch": 0.0663411038422556, "grad_norm": 7.135770988484542, "learning_rate": 4.975584126591668e-05, "loss": 1.2895, "step": 180 }, { "epoch": 0.06670966553026812, "grad_norm": 8.224157838694593, "learning_rate": 4.975275064902955e-05, "loss": 1.2163, "step": 181 }, { "epoch": 0.06707822721828066, "grad_norm": 8.124482123669916, "learning_rate": 4.974966003214242e-05, "loss": 1.2168, "step": 182 }, { "epoch": 0.06744678890629319, "grad_norm": 5.794531829327866, "learning_rate": 4.974656941525529e-05, "loss": 1.2742, "step": 183 }, { "epoch": 0.06781535059430573, "grad_norm": 6.210911089763031, "learning_rate": 4.974347879836816e-05, "loss": 1.1788, "step": 184 }, { "epoch": 0.06818391228231825, "grad_norm": 4.824126232665497, "learning_rate": 4.974038818148103e-05, "loss": 0.9266, "step": 185 }, { "epoch": 0.06855247397033079, "grad_norm": 5.962317710665053, "learning_rate": 4.973729756459389e-05, "loss": 1.2913, "step": 186 }, { "epoch": 0.06892103565834332, "grad_norm": 4.334739725819832, "learning_rate": 4.973420694770676e-05, "loss": 1.2704, "step": 187 }, { "epoch": 0.06928959734635584, "grad_norm": 4.647682859569354, "learning_rate": 4.9731116330819633e-05, "loss": 1.2808, "step": 188 }, { "epoch": 0.06965815903436838, "grad_norm": 3.9537533950429276, "learning_rate": 4.9728025713932505e-05, "loss": 0.9646, "step": 189 }, { "epoch": 0.0700267207223809, "grad_norm": 6.864154931585147, "learning_rate": 4.972493509704537e-05, "loss": 1.1167, "step": 190 }, { "epoch": 0.07039528241039344, "grad_norm": 6.205265319709406, "learning_rate": 4.972184448015824e-05, "loss": 1.3182, "step": 191 }, { "epoch": 0.07076384409840597, "grad_norm": 8.98720488720327, "learning_rate": 4.971875386327111e-05, "loss": 0.8958, "step": 192 }, { "epoch": 0.0711324057864185, "grad_norm": 12.947311440417128, "learning_rate": 4.971566324638398e-05, "loss": 1.152, "step": 193 }, { "epoch": 0.07150096747443103, "grad_norm": 7.38681945247988, "learning_rate": 4.9712572629496854e-05, "loss": 1.1188, "step": 194 }, { "epoch": 0.07186952916244356, "grad_norm": 4.761322981399511, "learning_rate": 4.970948201260972e-05, "loss": 1.0161, "step": 195 }, { "epoch": 0.0722380908504561, "grad_norm": 7.897568108680671, "learning_rate": 4.970639139572259e-05, "loss": 1.6362, "step": 196 }, { "epoch": 0.07260665253846862, "grad_norm": 5.909825126010036, "learning_rate": 4.970330077883546e-05, "loss": 1.1799, "step": 197 }, { "epoch": 0.07297521422648116, "grad_norm": 6.645506018667098, "learning_rate": 4.970021016194833e-05, "loss": 1.2349, "step": 198 }, { "epoch": 0.07334377591449369, "grad_norm": 6.234362150480179, "learning_rate": 4.9697119545061196e-05, "loss": 1.3767, "step": 199 }, { "epoch": 0.07371233760250621, "grad_norm": 4.376515589186514, "learning_rate": 4.969402892817406e-05, "loss": 1.1567, "step": 200 }, { "epoch": 0.07408089929051875, "grad_norm": 7.4786612535605475, "learning_rate": 4.969093831128693e-05, "loss": 1.2555, "step": 201 }, { "epoch": 0.07444946097853128, "grad_norm": 5.703519959033693, "learning_rate": 4.96878476943998e-05, "loss": 1.2568, "step": 202 }, { "epoch": 0.07481802266654382, "grad_norm": 7.094840482675377, "learning_rate": 4.9684757077512674e-05, "loss": 1.2392, "step": 203 }, { "epoch": 0.07518658435455634, "grad_norm": 4.758156449236422, "learning_rate": 4.9681666460625545e-05, "loss": 1.2573, "step": 204 }, { "epoch": 0.07555514604256887, "grad_norm": 6.469796948319957, "learning_rate": 4.967857584373841e-05, "loss": 1.3726, "step": 205 }, { "epoch": 0.07592370773058141, "grad_norm": 5.760716279316423, "learning_rate": 4.967548522685128e-05, "loss": 1.1956, "step": 206 }, { "epoch": 0.07629226941859393, "grad_norm": 6.810579921772419, "learning_rate": 4.967239460996415e-05, "loss": 1.3474, "step": 207 }, { "epoch": 0.07666083110660647, "grad_norm": 8.514060564438209, "learning_rate": 4.9669303993077023e-05, "loss": 1.3017, "step": 208 }, { "epoch": 0.077029392794619, "grad_norm": 5.138862841199323, "learning_rate": 4.966621337618989e-05, "loss": 0.9937, "step": 209 }, { "epoch": 0.07739795448263154, "grad_norm": 11.098318615286992, "learning_rate": 4.966312275930276e-05, "loss": 1.0504, "step": 210 }, { "epoch": 0.07776651617064406, "grad_norm": 4.403603482696018, "learning_rate": 4.966003214241563e-05, "loss": 1.3824, "step": 211 }, { "epoch": 0.07813507785865659, "grad_norm": 7.087153791956938, "learning_rate": 4.96569415255285e-05, "loss": 1.0629, "step": 212 }, { "epoch": 0.07850363954666913, "grad_norm": 5.957409695970601, "learning_rate": 4.965385090864137e-05, "loss": 1.4202, "step": 213 }, { "epoch": 0.07887220123468165, "grad_norm": 4.535327418577536, "learning_rate": 4.965076029175424e-05, "loss": 1.0709, "step": 214 }, { "epoch": 0.07924076292269419, "grad_norm": 5.264646173164278, "learning_rate": 4.96476696748671e-05, "loss": 1.1433, "step": 215 }, { "epoch": 0.07960932461070672, "grad_norm": 7.135663802793272, "learning_rate": 4.964457905797997e-05, "loss": 1.114, "step": 216 }, { "epoch": 0.07997788629871924, "grad_norm": 5.314778994740934, "learning_rate": 4.9641488441092844e-05, "loss": 1.556, "step": 217 }, { "epoch": 0.08034644798673178, "grad_norm": 6.804325343763999, "learning_rate": 4.9638397824205715e-05, "loss": 1.3433, "step": 218 }, { "epoch": 0.0807150096747443, "grad_norm": 5.257173632596843, "learning_rate": 4.963530720731858e-05, "loss": 1.0987, "step": 219 }, { "epoch": 0.08108357136275685, "grad_norm": 4.640412913398541, "learning_rate": 4.963221659043145e-05, "loss": 0.9935, "step": 220 }, { "epoch": 0.08145213305076937, "grad_norm": 5.071417219429436, "learning_rate": 4.962912597354432e-05, "loss": 1.0438, "step": 221 }, { "epoch": 0.08182069473878191, "grad_norm": 6.844003333014563, "learning_rate": 4.962603535665719e-05, "loss": 1.0724, "step": 222 }, { "epoch": 0.08218925642679444, "grad_norm": 6.29477825231752, "learning_rate": 4.9622944739770064e-05, "loss": 1.7419, "step": 223 }, { "epoch": 0.08255781811480696, "grad_norm": 8.439493579586562, "learning_rate": 4.961985412288293e-05, "loss": 1.1482, "step": 224 }, { "epoch": 0.0829263798028195, "grad_norm": 7.85983972711204, "learning_rate": 4.96167635059958e-05, "loss": 1.2495, "step": 225 }, { "epoch": 0.08329494149083203, "grad_norm": 6.089432815673522, "learning_rate": 4.961367288910867e-05, "loss": 1.5158, "step": 226 }, { "epoch": 0.08366350317884456, "grad_norm": 6.670746826271766, "learning_rate": 4.961058227222154e-05, "loss": 1.3538, "step": 227 }, { "epoch": 0.08403206486685709, "grad_norm": 5.294827135702605, "learning_rate": 4.960749165533441e-05, "loss": 1.1472, "step": 228 }, { "epoch": 0.08440062655486961, "grad_norm": 7.517374893735761, "learning_rate": 4.960440103844728e-05, "loss": 1.0343, "step": 229 }, { "epoch": 0.08476918824288215, "grad_norm": 5.223629434969229, "learning_rate": 4.960131042156014e-05, "loss": 0.9479, "step": 230 }, { "epoch": 0.08513774993089468, "grad_norm": 5.496239503664357, "learning_rate": 4.9598219804673013e-05, "loss": 1.1869, "step": 231 }, { "epoch": 0.08550631161890722, "grad_norm": 6.066651800507716, "learning_rate": 4.9595129187785885e-05, "loss": 1.2192, "step": 232 }, { "epoch": 0.08587487330691974, "grad_norm": 5.262957794501501, "learning_rate": 4.959203857089875e-05, "loss": 1.4365, "step": 233 }, { "epoch": 0.08624343499493228, "grad_norm": 5.953367854093374, "learning_rate": 4.958894795401162e-05, "loss": 1.0485, "step": 234 }, { "epoch": 0.08661199668294481, "grad_norm": 5.396611259479184, "learning_rate": 4.958585733712449e-05, "loss": 1.1604, "step": 235 }, { "epoch": 0.08698055837095733, "grad_norm": 5.449937298615269, "learning_rate": 4.958276672023736e-05, "loss": 1.1667, "step": 236 }, { "epoch": 0.08734912005896987, "grad_norm": 4.494984905145439, "learning_rate": 4.9579676103350234e-05, "loss": 1.0759, "step": 237 }, { "epoch": 0.0877176817469824, "grad_norm": 4.429934647812169, "learning_rate": 4.95765854864631e-05, "loss": 1.051, "step": 238 }, { "epoch": 0.08808624343499494, "grad_norm": 6.006389712357081, "learning_rate": 4.957349486957597e-05, "loss": 1.1266, "step": 239 }, { "epoch": 0.08845480512300746, "grad_norm": 4.526927912443812, "learning_rate": 4.957040425268884e-05, "loss": 1.0413, "step": 240 }, { "epoch": 0.08882336681101999, "grad_norm": 7.153823907721349, "learning_rate": 4.956731363580171e-05, "loss": 1.5595, "step": 241 }, { "epoch": 0.08919192849903253, "grad_norm": 7.181414607114489, "learning_rate": 4.9564223018914576e-05, "loss": 1.2553, "step": 242 }, { "epoch": 0.08956049018704505, "grad_norm": 6.8342403027064185, "learning_rate": 4.956113240202745e-05, "loss": 1.4703, "step": 243 }, { "epoch": 0.08992905187505759, "grad_norm": 11.377615491716345, "learning_rate": 4.955804178514032e-05, "loss": 1.1635, "step": 244 }, { "epoch": 0.09029761356307012, "grad_norm": 7.499810025670156, "learning_rate": 4.955495116825318e-05, "loss": 1.2663, "step": 245 }, { "epoch": 0.09066617525108266, "grad_norm": 6.161553127132522, "learning_rate": 4.9551860551366054e-05, "loss": 1.0974, "step": 246 }, { "epoch": 0.09103473693909518, "grad_norm": 5.4264473667557045, "learning_rate": 4.9548769934478925e-05, "loss": 1.1905, "step": 247 }, { "epoch": 0.0914032986271077, "grad_norm": 8.324216458675188, "learning_rate": 4.954567931759179e-05, "loss": 1.3127, "step": 248 }, { "epoch": 0.09177186031512025, "grad_norm": 7.775008429127138, "learning_rate": 4.954258870070466e-05, "loss": 1.1795, "step": 249 }, { "epoch": 0.09214042200313277, "grad_norm": 8.349949051935699, "learning_rate": 4.953949808381753e-05, "loss": 1.2673, "step": 250 }, { "epoch": 0.09250898369114531, "grad_norm": 11.227871982389614, "learning_rate": 4.95364074669304e-05, "loss": 1.0281, "step": 251 }, { "epoch": 0.09287754537915784, "grad_norm": 6.352852440507285, "learning_rate": 4.953331685004327e-05, "loss": 1.3931, "step": 252 }, { "epoch": 0.09324610706717036, "grad_norm": 10.777498427328775, "learning_rate": 4.953022623315614e-05, "loss": 1.1933, "step": 253 }, { "epoch": 0.0936146687551829, "grad_norm": 7.8495192265133245, "learning_rate": 4.952713561626901e-05, "loss": 1.0155, "step": 254 }, { "epoch": 0.09398323044319543, "grad_norm": 5.263969546024153, "learning_rate": 4.952404499938188e-05, "loss": 0.8821, "step": 255 }, { "epoch": 0.09435179213120796, "grad_norm": 9.632740029760848, "learning_rate": 4.952095438249475e-05, "loss": 1.1622, "step": 256 }, { "epoch": 0.09472035381922049, "grad_norm": 4.946448799709688, "learning_rate": 4.951786376560762e-05, "loss": 1.002, "step": 257 }, { "epoch": 0.09508891550723303, "grad_norm": 7.045397009292996, "learning_rate": 4.951477314872049e-05, "loss": 1.3641, "step": 258 }, { "epoch": 0.09545747719524555, "grad_norm": 15.74541006872866, "learning_rate": 4.951168253183336e-05, "loss": 1.5834, "step": 259 }, { "epoch": 0.09582603888325808, "grad_norm": 8.034764096227795, "learning_rate": 4.9508591914946224e-05, "loss": 1.0922, "step": 260 }, { "epoch": 0.09619460057127062, "grad_norm": 8.99528379848209, "learning_rate": 4.9505501298059095e-05, "loss": 1.3052, "step": 261 }, { "epoch": 0.09656316225928314, "grad_norm": 4.846496984422824, "learning_rate": 4.950241068117196e-05, "loss": 1.0523, "step": 262 }, { "epoch": 0.09693172394729568, "grad_norm": 14.311201411476388, "learning_rate": 4.949932006428483e-05, "loss": 1.2673, "step": 263 }, { "epoch": 0.09730028563530821, "grad_norm": 8.497513351227926, "learning_rate": 4.94962294473977e-05, "loss": 1.3028, "step": 264 }, { "epoch": 0.09766884732332073, "grad_norm": 5.064250125487098, "learning_rate": 4.949313883051057e-05, "loss": 1.1588, "step": 265 }, { "epoch": 0.09803740901133327, "grad_norm": 7.2247147130905365, "learning_rate": 4.9490048213623444e-05, "loss": 1.0166, "step": 266 }, { "epoch": 0.0984059706993458, "grad_norm": 7.337994336920128, "learning_rate": 4.948695759673631e-05, "loss": 1.0936, "step": 267 }, { "epoch": 0.09877453238735834, "grad_norm": 8.710508327432196, "learning_rate": 4.948386697984918e-05, "loss": 1.2697, "step": 268 }, { "epoch": 0.09914309407537086, "grad_norm": 6.540373202533474, "learning_rate": 4.948077636296205e-05, "loss": 1.0899, "step": 269 }, { "epoch": 0.0995116557633834, "grad_norm": 11.924066624611088, "learning_rate": 4.947768574607492e-05, "loss": 1.2645, "step": 270 }, { "epoch": 0.09988021745139593, "grad_norm": 9.016728008106265, "learning_rate": 4.9474595129187787e-05, "loss": 1.4268, "step": 271 }, { "epoch": 0.10024877913940845, "grad_norm": 5.251132298026489, "learning_rate": 4.947150451230066e-05, "loss": 1.0732, "step": 272 }, { "epoch": 0.10061734082742099, "grad_norm": 6.48642016411288, "learning_rate": 4.946841389541353e-05, "loss": 1.233, "step": 273 }, { "epoch": 0.10098590251543352, "grad_norm": 6.451714386910927, "learning_rate": 4.94653232785264e-05, "loss": 1.2511, "step": 274 }, { "epoch": 0.10135446420344606, "grad_norm": 18.65367918975914, "learning_rate": 4.9462232661639265e-05, "loss": 1.2991, "step": 275 }, { "epoch": 0.10172302589145858, "grad_norm": 5.773892726682605, "learning_rate": 4.9459142044752136e-05, "loss": 1.0407, "step": 276 }, { "epoch": 0.10209158757947111, "grad_norm": 5.52234687451281, "learning_rate": 4.9456051427865e-05, "loss": 1.2793, "step": 277 }, { "epoch": 0.10246014926748365, "grad_norm": 5.638011210538362, "learning_rate": 4.945296081097787e-05, "loss": 1.286, "step": 278 }, { "epoch": 0.10282871095549617, "grad_norm": 6.65580927146395, "learning_rate": 4.944987019409074e-05, "loss": 1.2434, "step": 279 }, { "epoch": 0.10319727264350871, "grad_norm": 7.187394912614918, "learning_rate": 4.9446779577203614e-05, "loss": 1.462, "step": 280 }, { "epoch": 0.10356583433152124, "grad_norm": 4.994672225568875, "learning_rate": 4.944368896031648e-05, "loss": 1.4349, "step": 281 }, { "epoch": 0.10393439601953378, "grad_norm": 7.800419370674787, "learning_rate": 4.944059834342935e-05, "loss": 1.2802, "step": 282 }, { "epoch": 0.1043029577075463, "grad_norm": 4.525541932758578, "learning_rate": 4.943750772654222e-05, "loss": 1.086, "step": 283 }, { "epoch": 0.10467151939555883, "grad_norm": 22.700186385007118, "learning_rate": 4.943441710965509e-05, "loss": 1.1017, "step": 284 }, { "epoch": 0.10504008108357137, "grad_norm": 5.0829000236229795, "learning_rate": 4.943132649276796e-05, "loss": 0.9921, "step": 285 }, { "epoch": 0.10540864277158389, "grad_norm": 6.361878598189037, "learning_rate": 4.942823587588083e-05, "loss": 1.2393, "step": 286 }, { "epoch": 0.10577720445959643, "grad_norm": 5.3318527471265575, "learning_rate": 4.94251452589937e-05, "loss": 1.4581, "step": 287 }, { "epoch": 0.10614576614760896, "grad_norm": 7.774525873870159, "learning_rate": 4.942205464210657e-05, "loss": 1.3379, "step": 288 }, { "epoch": 0.10651432783562148, "grad_norm": 5.500047510115289, "learning_rate": 4.941896402521944e-05, "loss": 1.169, "step": 289 }, { "epoch": 0.10688288952363402, "grad_norm": 5.850595636284942, "learning_rate": 4.9415873408332305e-05, "loss": 1.2542, "step": 290 }, { "epoch": 0.10725145121164655, "grad_norm": 5.12028770592105, "learning_rate": 4.941278279144517e-05, "loss": 1.253, "step": 291 }, { "epoch": 0.10762001289965908, "grad_norm": 7.0639058292112615, "learning_rate": 4.940969217455804e-05, "loss": 1.386, "step": 292 }, { "epoch": 0.10798857458767161, "grad_norm": 10.8320745666056, "learning_rate": 4.940660155767091e-05, "loss": 1.4457, "step": 293 }, { "epoch": 0.10835713627568415, "grad_norm": 7.212328940863742, "learning_rate": 4.940351094078378e-05, "loss": 1.1138, "step": 294 }, { "epoch": 0.10872569796369667, "grad_norm": 5.444422031970614, "learning_rate": 4.9400420323896655e-05, "loss": 1.2433, "step": 295 }, { "epoch": 0.1090942596517092, "grad_norm": 4.966729671070118, "learning_rate": 4.939732970700952e-05, "loss": 1.128, "step": 296 }, { "epoch": 0.10946282133972174, "grad_norm": 4.692820873315264, "learning_rate": 4.939423909012239e-05, "loss": 1.0707, "step": 297 }, { "epoch": 0.10983138302773426, "grad_norm": 6.228030540259127, "learning_rate": 4.939114847323526e-05, "loss": 1.284, "step": 298 }, { "epoch": 0.1101999447157468, "grad_norm": 4.043591909251532, "learning_rate": 4.938805785634813e-05, "loss": 1.0759, "step": 299 }, { "epoch": 0.11056850640375933, "grad_norm": 5.735835666318869, "learning_rate": 4.9384967239461e-05, "loss": 1.2186, "step": 300 }, { "epoch": 0.11093706809177185, "grad_norm": 7.582990142546219, "learning_rate": 4.938187662257387e-05, "loss": 1.2869, "step": 301 }, { "epoch": 0.11130562977978439, "grad_norm": 5.438022281512693, "learning_rate": 4.937878600568674e-05, "loss": 1.1445, "step": 302 }, { "epoch": 0.11167419146779692, "grad_norm": 6.986633891302986, "learning_rate": 4.937569538879961e-05, "loss": 1.1048, "step": 303 }, { "epoch": 0.11204275315580946, "grad_norm": 6.0089661044991365, "learning_rate": 4.937260477191248e-05, "loss": 1.2594, "step": 304 }, { "epoch": 0.11241131484382198, "grad_norm": 5.059235073367108, "learning_rate": 4.9369514155025346e-05, "loss": 0.9074, "step": 305 }, { "epoch": 0.11277987653183452, "grad_norm": 5.493444176885804, "learning_rate": 4.936642353813821e-05, "loss": 1.0724, "step": 306 }, { "epoch": 0.11314843821984705, "grad_norm": 5.58906708659303, "learning_rate": 4.936333292125108e-05, "loss": 1.2242, "step": 307 }, { "epoch": 0.11351699990785957, "grad_norm": 8.144229142328568, "learning_rate": 4.936024230436395e-05, "loss": 1.1608, "step": 308 }, { "epoch": 0.11388556159587211, "grad_norm": 5.8621803180262795, "learning_rate": 4.9357151687476824e-05, "loss": 1.0479, "step": 309 }, { "epoch": 0.11425412328388464, "grad_norm": 6.965515207047456, "learning_rate": 4.935406107058969e-05, "loss": 0.9608, "step": 310 }, { "epoch": 0.11462268497189718, "grad_norm": 4.768489389560347, "learning_rate": 4.935097045370256e-05, "loss": 1.0909, "step": 311 }, { "epoch": 0.1149912466599097, "grad_norm": 7.333076674853953, "learning_rate": 4.934787983681543e-05, "loss": 1.159, "step": 312 }, { "epoch": 0.11535980834792223, "grad_norm": 4.624896692720277, "learning_rate": 4.93447892199283e-05, "loss": 1.0488, "step": 313 }, { "epoch": 0.11572837003593477, "grad_norm": 6.680810737979278, "learning_rate": 4.9341698603041167e-05, "loss": 1.4569, "step": 314 }, { "epoch": 0.11609693172394729, "grad_norm": 5.897308847936423, "learning_rate": 4.933860798615404e-05, "loss": 1.3843, "step": 315 }, { "epoch": 0.11646549341195983, "grad_norm": 7.011097966077881, "learning_rate": 4.933551736926691e-05, "loss": 1.3934, "step": 316 }, { "epoch": 0.11683405509997236, "grad_norm": 14.806988813550491, "learning_rate": 4.933242675237978e-05, "loss": 1.1728, "step": 317 }, { "epoch": 0.1172026167879849, "grad_norm": 6.212539101675236, "learning_rate": 4.932933613549265e-05, "loss": 1.0055, "step": 318 }, { "epoch": 0.11757117847599742, "grad_norm": 6.786713398970493, "learning_rate": 4.9326245518605516e-05, "loss": 1.4222, "step": 319 }, { "epoch": 0.11793974016400995, "grad_norm": 4.8353131065252395, "learning_rate": 4.932315490171839e-05, "loss": 1.2391, "step": 320 }, { "epoch": 0.11830830185202248, "grad_norm": 7.886122342326711, "learning_rate": 4.932006428483125e-05, "loss": 1.4725, "step": 321 }, { "epoch": 0.11867686354003501, "grad_norm": 5.34049629962594, "learning_rate": 4.931697366794412e-05, "loss": 1.2032, "step": 322 }, { "epoch": 0.11904542522804755, "grad_norm": 5.385090073422816, "learning_rate": 4.9313883051056994e-05, "loss": 1.3013, "step": 323 }, { "epoch": 0.11941398691606007, "grad_norm": 7.1768792007568205, "learning_rate": 4.931079243416986e-05, "loss": 1.1563, "step": 324 }, { "epoch": 0.1197825486040726, "grad_norm": 6.524805128907287, "learning_rate": 4.930770181728273e-05, "loss": 1.3036, "step": 325 }, { "epoch": 0.12015111029208514, "grad_norm": 6.919309118985943, "learning_rate": 4.93046112003956e-05, "loss": 1.0923, "step": 326 }, { "epoch": 0.12051967198009766, "grad_norm": 7.489783640284377, "learning_rate": 4.930152058350847e-05, "loss": 1.1384, "step": 327 }, { "epoch": 0.1208882336681102, "grad_norm": 4.5639136227751225, "learning_rate": 4.929842996662134e-05, "loss": 1.4168, "step": 328 }, { "epoch": 0.12125679535612273, "grad_norm": 6.21170703271282, "learning_rate": 4.929533934973421e-05, "loss": 1.0462, "step": 329 }, { "epoch": 0.12162535704413527, "grad_norm": 5.359274638045348, "learning_rate": 4.929224873284708e-05, "loss": 1.3818, "step": 330 }, { "epoch": 0.1219939187321478, "grad_norm": 6.491790575731043, "learning_rate": 4.928915811595995e-05, "loss": 1.4171, "step": 331 }, { "epoch": 0.12236248042016032, "grad_norm": 7.9225582753593455, "learning_rate": 4.928606749907282e-05, "loss": 1.3357, "step": 332 }, { "epoch": 0.12273104210817286, "grad_norm": 6.039142089950955, "learning_rate": 4.9282976882185685e-05, "loss": 1.4001, "step": 333 }, { "epoch": 0.12309960379618538, "grad_norm": 4.808801592279125, "learning_rate": 4.9279886265298556e-05, "loss": 1.1165, "step": 334 }, { "epoch": 0.12346816548419792, "grad_norm": 5.18991767891515, "learning_rate": 4.927679564841143e-05, "loss": 1.0876, "step": 335 }, { "epoch": 0.12383672717221045, "grad_norm": 12.077264634217546, "learning_rate": 4.927370503152429e-05, "loss": 1.1044, "step": 336 }, { "epoch": 0.12420528886022297, "grad_norm": 6.956176362586154, "learning_rate": 4.927061441463716e-05, "loss": 1.1425, "step": 337 }, { "epoch": 0.12457385054823551, "grad_norm": 8.637331343465782, "learning_rate": 4.9267523797750034e-05, "loss": 1.4653, "step": 338 }, { "epoch": 0.12494241223624804, "grad_norm": 7.724964188674831, "learning_rate": 4.92644331808629e-05, "loss": 1.3254, "step": 339 }, { "epoch": 0.12531097392426058, "grad_norm": 6.557654926478339, "learning_rate": 4.926134256397577e-05, "loss": 0.7767, "step": 340 }, { "epoch": 0.1256795356122731, "grad_norm": 4.741387841865732, "learning_rate": 4.925825194708864e-05, "loss": 1.1564, "step": 341 }, { "epoch": 0.12604809730028563, "grad_norm": 7.658139614551111, "learning_rate": 4.925516133020151e-05, "loss": 1.2498, "step": 342 }, { "epoch": 0.12641665898829815, "grad_norm": 6.231000785473729, "learning_rate": 4.925207071331438e-05, "loss": 1.1597, "step": 343 }, { "epoch": 0.1267852206763107, "grad_norm": 5.154369959398018, "learning_rate": 4.924898009642725e-05, "loss": 0.7791, "step": 344 }, { "epoch": 0.12715378236432323, "grad_norm": 8.102806415172326, "learning_rate": 4.924588947954012e-05, "loss": 1.3813, "step": 345 }, { "epoch": 0.12752234405233576, "grad_norm": 7.1378267176686805, "learning_rate": 4.924279886265299e-05, "loss": 1.125, "step": 346 }, { "epoch": 0.12789090574034828, "grad_norm": 8.176325270063257, "learning_rate": 4.923970824576586e-05, "loss": 0.9251, "step": 347 }, { "epoch": 0.12825946742836083, "grad_norm": 5.04486100488346, "learning_rate": 4.9236617628878726e-05, "loss": 1.2059, "step": 348 }, { "epoch": 0.12862802911637336, "grad_norm": 7.420892718714898, "learning_rate": 4.92335270119916e-05, "loss": 0.9747, "step": 349 }, { "epoch": 0.12899659080438589, "grad_norm": 5.540172212864268, "learning_rate": 4.923043639510447e-05, "loss": 0.8319, "step": 350 }, { "epoch": 0.1293651524923984, "grad_norm": 9.255737999375627, "learning_rate": 4.922734577821733e-05, "loss": 0.9942, "step": 351 }, { "epoch": 0.12973371418041094, "grad_norm": 12.23673885109758, "learning_rate": 4.9224255161330204e-05, "loss": 1.3623, "step": 352 }, { "epoch": 0.1301022758684235, "grad_norm": 14.597863738657855, "learning_rate": 4.922116454444307e-05, "loss": 1.3028, "step": 353 }, { "epoch": 0.13047083755643601, "grad_norm": 5.7107638268736105, "learning_rate": 4.921807392755594e-05, "loss": 0.9463, "step": 354 }, { "epoch": 0.13083939924444854, "grad_norm": 5.73374698536546, "learning_rate": 4.921498331066881e-05, "loss": 1.4821, "step": 355 }, { "epoch": 0.13120796093246107, "grad_norm": 8.587049944993478, "learning_rate": 4.921189269378168e-05, "loss": 1.0356, "step": 356 }, { "epoch": 0.1315765226204736, "grad_norm": 6.536146163071865, "learning_rate": 4.920880207689455e-05, "loss": 1.3828, "step": 357 }, { "epoch": 0.13194508430848614, "grad_norm": 7.328506036868761, "learning_rate": 4.920571146000742e-05, "loss": 1.1721, "step": 358 }, { "epoch": 0.13231364599649867, "grad_norm": 6.324856891445499, "learning_rate": 4.920262084312029e-05, "loss": 0.8959, "step": 359 }, { "epoch": 0.1326822076845112, "grad_norm": 5.529483836471785, "learning_rate": 4.919953022623316e-05, "loss": 0.8657, "step": 360 }, { "epoch": 0.13305076937252372, "grad_norm": 5.868832536630908, "learning_rate": 4.919643960934603e-05, "loss": 0.8185, "step": 361 }, { "epoch": 0.13341933106053624, "grad_norm": 6.226258312343095, "learning_rate": 4.9193348992458896e-05, "loss": 1.0831, "step": 362 }, { "epoch": 0.1337878927485488, "grad_norm": 7.060891297356584, "learning_rate": 4.919025837557177e-05, "loss": 1.029, "step": 363 }, { "epoch": 0.13415645443656132, "grad_norm": 6.770160661196894, "learning_rate": 4.918716775868464e-05, "loss": 1.0122, "step": 364 }, { "epoch": 0.13452501612457385, "grad_norm": 9.534993821588683, "learning_rate": 4.918407714179751e-05, "loss": 1.6194, "step": 365 }, { "epoch": 0.13489357781258637, "grad_norm": 10.446648340393754, "learning_rate": 4.9180986524910374e-05, "loss": 1.2542, "step": 366 }, { "epoch": 0.1352621395005989, "grad_norm": 6.5623366562632794, "learning_rate": 4.9177895908023245e-05, "loss": 1.192, "step": 367 }, { "epoch": 0.13563070118861145, "grad_norm": 5.150789455903141, "learning_rate": 4.917480529113611e-05, "loss": 0.9876, "step": 368 }, { "epoch": 0.13599926287662398, "grad_norm": 9.50080546929731, "learning_rate": 4.917171467424898e-05, "loss": 1.0272, "step": 369 }, { "epoch": 0.1363678245646365, "grad_norm": 6.825848581847145, "learning_rate": 4.916862405736185e-05, "loss": 1.3901, "step": 370 }, { "epoch": 0.13673638625264903, "grad_norm": 18.12197071937559, "learning_rate": 4.916553344047472e-05, "loss": 1.4128, "step": 371 }, { "epoch": 0.13710494794066158, "grad_norm": 8.668510045345164, "learning_rate": 4.916244282358759e-05, "loss": 1.3129, "step": 372 }, { "epoch": 0.1374735096286741, "grad_norm": 8.950826459692056, "learning_rate": 4.915935220670046e-05, "loss": 1.2541, "step": 373 }, { "epoch": 0.13784207131668663, "grad_norm": 17.56845393682861, "learning_rate": 4.915626158981333e-05, "loss": 1.4439, "step": 374 }, { "epoch": 0.13821063300469916, "grad_norm": 5.2520508166583575, "learning_rate": 4.91531709729262e-05, "loss": 1.1273, "step": 375 }, { "epoch": 0.13857919469271168, "grad_norm": 8.899634758816344, "learning_rate": 4.915008035603907e-05, "loss": 0.8192, "step": 376 }, { "epoch": 0.13894775638072424, "grad_norm": 42.06029143068763, "learning_rate": 4.9146989739151936e-05, "loss": 1.1443, "step": 377 }, { "epoch": 0.13931631806873676, "grad_norm": 12.90058386612459, "learning_rate": 4.914389912226481e-05, "loss": 1.0708, "step": 378 }, { "epoch": 0.1396848797567493, "grad_norm": 4.7205338295822505, "learning_rate": 4.914080850537768e-05, "loss": 0.8594, "step": 379 }, { "epoch": 0.1400534414447618, "grad_norm": 5.848048297019899, "learning_rate": 4.913771788849055e-05, "loss": 1.0465, "step": 380 }, { "epoch": 0.14042200313277434, "grad_norm": 5.327216912798909, "learning_rate": 4.9134627271603414e-05, "loss": 1.1288, "step": 381 }, { "epoch": 0.1407905648207869, "grad_norm": 5.675002567778019, "learning_rate": 4.913153665471628e-05, "loss": 1.2288, "step": 382 }, { "epoch": 0.14115912650879942, "grad_norm": 4.9315311714339005, "learning_rate": 4.912844603782915e-05, "loss": 0.9363, "step": 383 }, { "epoch": 0.14152768819681194, "grad_norm": 4.85635134715173, "learning_rate": 4.912535542094202e-05, "loss": 1.2016, "step": 384 }, { "epoch": 0.14189624988482447, "grad_norm": 4.88702630676845, "learning_rate": 4.912226480405489e-05, "loss": 1.0376, "step": 385 }, { "epoch": 0.142264811572837, "grad_norm": 4.1737275885762575, "learning_rate": 4.9119174187167764e-05, "loss": 0.9387, "step": 386 }, { "epoch": 0.14263337326084954, "grad_norm": 4.805125345268345, "learning_rate": 4.911608357028063e-05, "loss": 0.8343, "step": 387 }, { "epoch": 0.14300193494886207, "grad_norm": 7.669155656676172, "learning_rate": 4.91129929533935e-05, "loss": 1.3994, "step": 388 }, { "epoch": 0.1433704966368746, "grad_norm": 5.151602205095131, "learning_rate": 4.910990233650637e-05, "loss": 1.2654, "step": 389 }, { "epoch": 0.14373905832488712, "grad_norm": 9.997024856498323, "learning_rate": 4.910681171961924e-05, "loss": 0.9663, "step": 390 }, { "epoch": 0.14410762001289965, "grad_norm": 5.189532881279288, "learning_rate": 4.9103721102732106e-05, "loss": 1.1273, "step": 391 }, { "epoch": 0.1444761817009122, "grad_norm": 7.415294934694496, "learning_rate": 4.910063048584498e-05, "loss": 1.0589, "step": 392 }, { "epoch": 0.14484474338892472, "grad_norm": 7.925965586662424, "learning_rate": 4.909753986895785e-05, "loss": 1.5007, "step": 393 }, { "epoch": 0.14521330507693725, "grad_norm": 5.933367766948323, "learning_rate": 4.909444925207072e-05, "loss": 1.3108, "step": 394 }, { "epoch": 0.14558186676494977, "grad_norm": 11.667764739177297, "learning_rate": 4.909135863518359e-05, "loss": 1.2522, "step": 395 }, { "epoch": 0.14595042845296233, "grad_norm": 5.931729695063066, "learning_rate": 4.908826801829645e-05, "loss": 1.177, "step": 396 }, { "epoch": 0.14631899014097485, "grad_norm": 5.628439466300199, "learning_rate": 4.908517740140932e-05, "loss": 1.0566, "step": 397 }, { "epoch": 0.14668755182898738, "grad_norm": 5.897221522012506, "learning_rate": 4.908208678452219e-05, "loss": 1.1677, "step": 398 }, { "epoch": 0.1470561135169999, "grad_norm": 9.324573856874355, "learning_rate": 4.907899616763506e-05, "loss": 1.3099, "step": 399 }, { "epoch": 0.14742467520501243, "grad_norm": 7.466723562205482, "learning_rate": 4.907590555074793e-05, "loss": 1.1938, "step": 400 }, { "epoch": 0.14779323689302498, "grad_norm": 5.293443316478924, "learning_rate": 4.90728149338608e-05, "loss": 1.2967, "step": 401 }, { "epoch": 0.1481617985810375, "grad_norm": 6.45036970158373, "learning_rate": 4.906972431697367e-05, "loss": 1.1567, "step": 402 }, { "epoch": 0.14853036026905003, "grad_norm": 9.354671563765239, "learning_rate": 4.906663370008654e-05, "loss": 1.4757, "step": 403 }, { "epoch": 0.14889892195706256, "grad_norm": 6.450244325253792, "learning_rate": 4.906354308319941e-05, "loss": 0.9676, "step": 404 }, { "epoch": 0.14926748364507508, "grad_norm": 6.692959847282365, "learning_rate": 4.9060452466312276e-05, "loss": 1.0651, "step": 405 }, { "epoch": 0.14963604533308764, "grad_norm": 9.226097360073684, "learning_rate": 4.905736184942515e-05, "loss": 0.9877, "step": 406 }, { "epoch": 0.15000460702110016, "grad_norm": 7.460429005495925, "learning_rate": 4.905427123253802e-05, "loss": 1.2667, "step": 407 }, { "epoch": 0.1503731687091127, "grad_norm": 13.315356597308108, "learning_rate": 4.905118061565089e-05, "loss": 1.3171, "step": 408 }, { "epoch": 0.1507417303971252, "grad_norm": 5.383891713066612, "learning_rate": 4.904808999876376e-05, "loss": 1.1255, "step": 409 }, { "epoch": 0.15111029208513774, "grad_norm": 7.102314488513586, "learning_rate": 4.9044999381876625e-05, "loss": 1.2151, "step": 410 }, { "epoch": 0.1514788537731503, "grad_norm": 10.916067868959342, "learning_rate": 4.904190876498949e-05, "loss": 1.0506, "step": 411 }, { "epoch": 0.15184741546116282, "grad_norm": 4.6797005425527765, "learning_rate": 4.903881814810236e-05, "loss": 0.8158, "step": 412 }, { "epoch": 0.15221597714917534, "grad_norm": 9.729567024657607, "learning_rate": 4.903572753121523e-05, "loss": 1.125, "step": 413 }, { "epoch": 0.15258453883718787, "grad_norm": 5.818812070652757, "learning_rate": 4.90326369143281e-05, "loss": 1.4306, "step": 414 }, { "epoch": 0.1529531005252004, "grad_norm": 7.212116314551446, "learning_rate": 4.902954629744097e-05, "loss": 1.6493, "step": 415 }, { "epoch": 0.15332166221321294, "grad_norm": 7.095127056078314, "learning_rate": 4.902645568055384e-05, "loss": 1.1238, "step": 416 }, { "epoch": 0.15369022390122547, "grad_norm": 4.6421783621796635, "learning_rate": 4.902336506366671e-05, "loss": 1.3398, "step": 417 }, { "epoch": 0.154058785589238, "grad_norm": 9.644922816322197, "learning_rate": 4.902027444677958e-05, "loss": 1.2469, "step": 418 }, { "epoch": 0.15442734727725052, "grad_norm": 7.12199070607591, "learning_rate": 4.901718382989245e-05, "loss": 0.9717, "step": 419 }, { "epoch": 0.15479590896526307, "grad_norm": 7.3734067068651346, "learning_rate": 4.9014093213005316e-05, "loss": 1.3293, "step": 420 }, { "epoch": 0.1551644706532756, "grad_norm": 5.325135049316311, "learning_rate": 4.901100259611819e-05, "loss": 1.1873, "step": 421 }, { "epoch": 0.15553303234128812, "grad_norm": 5.785535059445433, "learning_rate": 4.900791197923106e-05, "loss": 1.2497, "step": 422 }, { "epoch": 0.15590159402930065, "grad_norm": 11.542008340967735, "learning_rate": 4.900482136234393e-05, "loss": 0.9794, "step": 423 }, { "epoch": 0.15627015571731318, "grad_norm": 9.119240811521298, "learning_rate": 4.9001730745456794e-05, "loss": 1.1521, "step": 424 }, { "epoch": 0.15663871740532573, "grad_norm": 14.656722656957413, "learning_rate": 4.8998640128569666e-05, "loss": 1.101, "step": 425 }, { "epoch": 0.15700727909333825, "grad_norm": 14.821016205453143, "learning_rate": 4.899554951168254e-05, "loss": 1.3736, "step": 426 }, { "epoch": 0.15737584078135078, "grad_norm": 15.747951798244868, "learning_rate": 4.89924588947954e-05, "loss": 1.0672, "step": 427 }, { "epoch": 0.1577444024693633, "grad_norm": 19.206865609032555, "learning_rate": 4.898936827790827e-05, "loss": 1.2056, "step": 428 }, { "epoch": 0.15811296415737583, "grad_norm": 2741.8339665267845, "learning_rate": 4.8986277661021144e-05, "loss": 1.05, "step": 429 }, { "epoch": 0.15848152584538838, "grad_norm": 7.981266260093191, "learning_rate": 4.898318704413401e-05, "loss": 1.155, "step": 430 }, { "epoch": 0.1588500875334009, "grad_norm": 20.445337874936488, "learning_rate": 4.898009642724688e-05, "loss": 1.1892, "step": 431 }, { "epoch": 0.15921864922141343, "grad_norm": 5.621996437953714, "learning_rate": 4.897700581035975e-05, "loss": 1.1792, "step": 432 }, { "epoch": 0.15958721090942596, "grad_norm": 7.467540946365445, "learning_rate": 4.897391519347262e-05, "loss": 1.3794, "step": 433 }, { "epoch": 0.15995577259743848, "grad_norm": 34.002674670761195, "learning_rate": 4.8970824576585486e-05, "loss": 1.3727, "step": 434 }, { "epoch": 0.16032433428545104, "grad_norm": 6.216859156242378, "learning_rate": 4.896773395969836e-05, "loss": 1.3795, "step": 435 }, { "epoch": 0.16069289597346356, "grad_norm": 14.196521730622853, "learning_rate": 4.896464334281123e-05, "loss": 0.8898, "step": 436 }, { "epoch": 0.1610614576614761, "grad_norm": 4.832796768572339, "learning_rate": 4.89615527259241e-05, "loss": 1.084, "step": 437 }, { "epoch": 0.1614300193494886, "grad_norm": 15.751269576962132, "learning_rate": 4.895846210903697e-05, "loss": 1.4898, "step": 438 }, { "epoch": 0.16179858103750114, "grad_norm": 6.937277850469115, "learning_rate": 4.8955371492149835e-05, "loss": 0.8577, "step": 439 }, { "epoch": 0.1621671427255137, "grad_norm": 30.16231217402926, "learning_rate": 4.8952280875262706e-05, "loss": 1.0642, "step": 440 }, { "epoch": 0.16253570441352622, "grad_norm": 19.806246856756616, "learning_rate": 4.894919025837558e-05, "loss": 1.1952, "step": 441 }, { "epoch": 0.16290426610153874, "grad_norm": 6.670574695816017, "learning_rate": 4.894609964148844e-05, "loss": 1.1366, "step": 442 }, { "epoch": 0.16327282778955127, "grad_norm": 26.986632252144766, "learning_rate": 4.894300902460131e-05, "loss": 1.1214, "step": 443 }, { "epoch": 0.16364138947756382, "grad_norm": 21.59694090479743, "learning_rate": 4.893991840771418e-05, "loss": 1.3071, "step": 444 }, { "epoch": 0.16400995116557635, "grad_norm": 11.328545250825456, "learning_rate": 4.893682779082705e-05, "loss": 1.3066, "step": 445 }, { "epoch": 0.16437851285358887, "grad_norm": 12.444348160810229, "learning_rate": 4.893373717393992e-05, "loss": 0.8667, "step": 446 }, { "epoch": 0.1647470745416014, "grad_norm": 14.069484485110243, "learning_rate": 4.893064655705279e-05, "loss": 0.9907, "step": 447 }, { "epoch": 0.16511563622961392, "grad_norm": 9.88590845158329, "learning_rate": 4.892755594016566e-05, "loss": 0.8727, "step": 448 }, { "epoch": 0.16548419791762647, "grad_norm": 7.428158062133904, "learning_rate": 4.892446532327853e-05, "loss": 1.0044, "step": 449 }, { "epoch": 0.165852759605639, "grad_norm": 6.780033125444809, "learning_rate": 4.89213747063914e-05, "loss": 1.2852, "step": 450 }, { "epoch": 0.16622132129365152, "grad_norm": 6.372400389559143, "learning_rate": 4.891828408950427e-05, "loss": 1.2035, "step": 451 }, { "epoch": 0.16658988298166405, "grad_norm": 34.49929941544103, "learning_rate": 4.891519347261714e-05, "loss": 1.0931, "step": 452 }, { "epoch": 0.16695844466967658, "grad_norm": 15.766323773538055, "learning_rate": 4.8912102855730005e-05, "loss": 1.1139, "step": 453 }, { "epoch": 0.16732700635768913, "grad_norm": 7.418815158728178, "learning_rate": 4.8909012238842876e-05, "loss": 1.217, "step": 454 }, { "epoch": 0.16769556804570165, "grad_norm": 10.281681503934065, "learning_rate": 4.890592162195575e-05, "loss": 1.0925, "step": 455 }, { "epoch": 0.16806412973371418, "grad_norm": 6.650564711836742, "learning_rate": 4.890283100506862e-05, "loss": 1.4965, "step": 456 }, { "epoch": 0.1684326914217267, "grad_norm": 8.681820556698401, "learning_rate": 4.889974038818148e-05, "loss": 1.1963, "step": 457 }, { "epoch": 0.16880125310973923, "grad_norm": 15.67658574359136, "learning_rate": 4.8896649771294354e-05, "loss": 0.8114, "step": 458 }, { "epoch": 0.16916981479775178, "grad_norm": 9.71723244155671, "learning_rate": 4.889355915440722e-05, "loss": 1.3176, "step": 459 }, { "epoch": 0.1695383764857643, "grad_norm": 6.308618239600299, "learning_rate": 4.889046853752009e-05, "loss": 1.1728, "step": 460 }, { "epoch": 0.16990693817377683, "grad_norm": 20.73579747970019, "learning_rate": 4.888737792063296e-05, "loss": 1.2724, "step": 461 }, { "epoch": 0.17027549986178936, "grad_norm": 11.288364354493732, "learning_rate": 4.888428730374583e-05, "loss": 1.4436, "step": 462 }, { "epoch": 0.17064406154980188, "grad_norm": 15.275129781568141, "learning_rate": 4.8881196686858696e-05, "loss": 1.1254, "step": 463 }, { "epoch": 0.17101262323781444, "grad_norm": 6.677343624316279, "learning_rate": 4.887810606997157e-05, "loss": 1.1188, "step": 464 }, { "epoch": 0.17138118492582696, "grad_norm": 14.037328817898256, "learning_rate": 4.887501545308444e-05, "loss": 1.3342, "step": 465 }, { "epoch": 0.1717497466138395, "grad_norm": 17.31400267762204, "learning_rate": 4.887192483619731e-05, "loss": 0.9584, "step": 466 }, { "epoch": 0.172118308301852, "grad_norm": 7.5501098018518595, "learning_rate": 4.886883421931018e-05, "loss": 0.8223, "step": 467 }, { "epoch": 0.17248686998986457, "grad_norm": 8.086680974274149, "learning_rate": 4.8865743602423045e-05, "loss": 1.0781, "step": 468 }, { "epoch": 0.1728554316778771, "grad_norm": 7.094980813686675, "learning_rate": 4.886265298553592e-05, "loss": 1.217, "step": 469 }, { "epoch": 0.17322399336588962, "grad_norm": 6.78061233546048, "learning_rate": 4.885956236864879e-05, "loss": 0.9831, "step": 470 }, { "epoch": 0.17359255505390214, "grad_norm": 6.858460106803939, "learning_rate": 4.885647175176166e-05, "loss": 1.009, "step": 471 }, { "epoch": 0.17396111674191467, "grad_norm": 10.07442398128486, "learning_rate": 4.8853381134874523e-05, "loss": 0.9942, "step": 472 }, { "epoch": 0.17432967842992722, "grad_norm": 8.149200081915872, "learning_rate": 4.885029051798739e-05, "loss": 0.9108, "step": 473 }, { "epoch": 0.17469824011793975, "grad_norm": 5.86355254722946, "learning_rate": 4.884719990110026e-05, "loss": 0.9589, "step": 474 }, { "epoch": 0.17506680180595227, "grad_norm": 6.61972382813596, "learning_rate": 4.884410928421313e-05, "loss": 1.0758, "step": 475 }, { "epoch": 0.1754353634939648, "grad_norm": 6.533490394469035, "learning_rate": 4.8841018667326e-05, "loss": 0.9264, "step": 476 }, { "epoch": 0.17580392518197732, "grad_norm": 7.8574769778859554, "learning_rate": 4.8837928050438866e-05, "loss": 1.0821, "step": 477 }, { "epoch": 0.17617248686998987, "grad_norm": 9.970158875021204, "learning_rate": 4.883483743355174e-05, "loss": 0.8626, "step": 478 }, { "epoch": 0.1765410485580024, "grad_norm": 7.836926170648746, "learning_rate": 4.883174681666461e-05, "loss": 1.1195, "step": 479 }, { "epoch": 0.17690961024601493, "grad_norm": 7.837306807234399, "learning_rate": 4.882865619977748e-05, "loss": 1.165, "step": 480 }, { "epoch": 0.17727817193402745, "grad_norm": 6.760696977428101, "learning_rate": 4.882556558289035e-05, "loss": 0.93, "step": 481 }, { "epoch": 0.17764673362203998, "grad_norm": 6.009582179520645, "learning_rate": 4.8822474966003215e-05, "loss": 1.3213, "step": 482 }, { "epoch": 0.17801529531005253, "grad_norm": 9.259376928076284, "learning_rate": 4.8819384349116086e-05, "loss": 1.1025, "step": 483 }, { "epoch": 0.17838385699806505, "grad_norm": 5.995923882976602, "learning_rate": 4.881629373222896e-05, "loss": 0.9491, "step": 484 }, { "epoch": 0.17875241868607758, "grad_norm": 7.563571822451276, "learning_rate": 4.881320311534183e-05, "loss": 0.9716, "step": 485 }, { "epoch": 0.1791209803740901, "grad_norm": 8.979709641280124, "learning_rate": 4.88101124984547e-05, "loss": 1.1488, "step": 486 }, { "epoch": 0.17948954206210266, "grad_norm": 5.946943943989149, "learning_rate": 4.880702188156756e-05, "loss": 1.2037, "step": 487 }, { "epoch": 0.17985810375011518, "grad_norm": 7.6258059216445995, "learning_rate": 4.880393126468043e-05, "loss": 1.2469, "step": 488 }, { "epoch": 0.1802266654381277, "grad_norm": 5.940437313126989, "learning_rate": 4.88008406477933e-05, "loss": 1.0454, "step": 489 }, { "epoch": 0.18059522712614023, "grad_norm": 6.783850105465035, "learning_rate": 4.879775003090617e-05, "loss": 1.021, "step": 490 }, { "epoch": 0.18096378881415276, "grad_norm": 11.924250254864035, "learning_rate": 4.879465941401904e-05, "loss": 1.303, "step": 491 }, { "epoch": 0.1813323505021653, "grad_norm": 6.5795444578965725, "learning_rate": 4.879156879713191e-05, "loss": 1.3959, "step": 492 }, { "epoch": 0.18170091219017784, "grad_norm": 8.72176973285521, "learning_rate": 4.878847818024478e-05, "loss": 1.0626, "step": 493 }, { "epoch": 0.18206947387819036, "grad_norm": 9.92965983965463, "learning_rate": 4.878538756335765e-05, "loss": 1.1706, "step": 494 }, { "epoch": 0.1824380355662029, "grad_norm": 6.642619185866182, "learning_rate": 4.878229694647052e-05, "loss": 1.1006, "step": 495 }, { "epoch": 0.1828065972542154, "grad_norm": 5.48482448550916, "learning_rate": 4.8779206329583385e-05, "loss": 1.1172, "step": 496 }, { "epoch": 0.18317515894222797, "grad_norm": 6.654428438767652, "learning_rate": 4.8776115712696256e-05, "loss": 1.3921, "step": 497 }, { "epoch": 0.1835437206302405, "grad_norm": 5.789494095542738, "learning_rate": 4.877302509580913e-05, "loss": 1.1215, "step": 498 }, { "epoch": 0.18391228231825302, "grad_norm": 6.542642249316331, "learning_rate": 4.8769934478922e-05, "loss": 1.0752, "step": 499 }, { "epoch": 0.18428084400626554, "grad_norm": 6.8280434330797455, "learning_rate": 4.876684386203487e-05, "loss": 1.2777, "step": 500 }, { "epoch": 0.18464940569427807, "grad_norm": 4.816705228833773, "learning_rate": 4.8763753245147734e-05, "loss": 1.4133, "step": 501 }, { "epoch": 0.18501796738229062, "grad_norm": 7.815878663947566, "learning_rate": 4.87606626282606e-05, "loss": 1.1358, "step": 502 }, { "epoch": 0.18538652907030315, "grad_norm": 5.229645097992037, "learning_rate": 4.875757201137347e-05, "loss": 0.9047, "step": 503 }, { "epoch": 0.18575509075831567, "grad_norm": 13.23219802047736, "learning_rate": 4.875448139448634e-05, "loss": 1.122, "step": 504 }, { "epoch": 0.1861236524463282, "grad_norm": 7.68031136199487, "learning_rate": 4.875139077759921e-05, "loss": 1.0236, "step": 505 }, { "epoch": 0.18649221413434072, "grad_norm": 5.718500183986012, "learning_rate": 4.8748300160712076e-05, "loss": 1.1515, "step": 506 }, { "epoch": 0.18686077582235328, "grad_norm": 10.475015251494527, "learning_rate": 4.874520954382495e-05, "loss": 0.9798, "step": 507 }, { "epoch": 0.1872293375103658, "grad_norm": 5.708806299074401, "learning_rate": 4.874211892693782e-05, "loss": 1.1428, "step": 508 }, { "epoch": 0.18759789919837833, "grad_norm": 5.464094698262653, "learning_rate": 4.873902831005069e-05, "loss": 1.3537, "step": 509 }, { "epoch": 0.18796646088639085, "grad_norm": 5.689041431852125, "learning_rate": 4.873593769316356e-05, "loss": 1.5163, "step": 510 }, { "epoch": 0.1883350225744034, "grad_norm": 5.131515083481871, "learning_rate": 4.8732847076276425e-05, "loss": 1.0061, "step": 511 }, { "epoch": 0.18870358426241593, "grad_norm": 4.97150138635663, "learning_rate": 4.87297564593893e-05, "loss": 1.085, "step": 512 }, { "epoch": 0.18907214595042846, "grad_norm": 5.09271828904105, "learning_rate": 4.872666584250217e-05, "loss": 1.6388, "step": 513 }, { "epoch": 0.18944070763844098, "grad_norm": 5.657749040611732, "learning_rate": 4.872357522561504e-05, "loss": 1.1245, "step": 514 }, { "epoch": 0.1898092693264535, "grad_norm": 5.79634359139803, "learning_rate": 4.8720484608727903e-05, "loss": 0.9376, "step": 515 }, { "epoch": 0.19017783101446606, "grad_norm": 11.859652268456733, "learning_rate": 4.8717393991840775e-05, "loss": 0.9774, "step": 516 }, { "epoch": 0.19054639270247858, "grad_norm": 9.012380773255668, "learning_rate": 4.871430337495364e-05, "loss": 1.1763, "step": 517 }, { "epoch": 0.1909149543904911, "grad_norm": 7.214032105300655, "learning_rate": 4.871121275806651e-05, "loss": 1.2164, "step": 518 }, { "epoch": 0.19128351607850363, "grad_norm": 11.410915810476785, "learning_rate": 4.870812214117938e-05, "loss": 0.8214, "step": 519 }, { "epoch": 0.19165207776651616, "grad_norm": 5.406978139187937, "learning_rate": 4.870503152429225e-05, "loss": 1.1722, "step": 520 }, { "epoch": 0.1920206394545287, "grad_norm": 6.57784025975929, "learning_rate": 4.870194090740512e-05, "loss": 1.2808, "step": 521 }, { "epoch": 0.19238920114254124, "grad_norm": 5.452649814483748, "learning_rate": 4.869885029051799e-05, "loss": 1.0169, "step": 522 }, { "epoch": 0.19275776283055376, "grad_norm": 6.770319554325023, "learning_rate": 4.869575967363086e-05, "loss": 1.4829, "step": 523 }, { "epoch": 0.1931263245185663, "grad_norm": 7.801645780210917, "learning_rate": 4.869266905674373e-05, "loss": 1.1344, "step": 524 }, { "epoch": 0.19349488620657881, "grad_norm": 8.158003227665313, "learning_rate": 4.8689578439856595e-05, "loss": 1.1069, "step": 525 }, { "epoch": 0.19386344789459137, "grad_norm": 8.054644875917774, "learning_rate": 4.8686487822969466e-05, "loss": 1.0546, "step": 526 }, { "epoch": 0.1942320095826039, "grad_norm": 4.806435467101796, "learning_rate": 4.868339720608234e-05, "loss": 1.3673, "step": 527 }, { "epoch": 0.19460057127061642, "grad_norm": 4.483719515328615, "learning_rate": 4.868030658919521e-05, "loss": 1.3511, "step": 528 }, { "epoch": 0.19496913295862894, "grad_norm": 5.537544419086801, "learning_rate": 4.867721597230808e-05, "loss": 1.0598, "step": 529 }, { "epoch": 0.19533769464664147, "grad_norm": 11.103255033496582, "learning_rate": 4.8674125355420944e-05, "loss": 1.2132, "step": 530 }, { "epoch": 0.19570625633465402, "grad_norm": 7.0194880463507685, "learning_rate": 4.8671034738533815e-05, "loss": 1.157, "step": 531 }, { "epoch": 0.19607481802266655, "grad_norm": 6.9716834492959805, "learning_rate": 4.866794412164668e-05, "loss": 0.9043, "step": 532 }, { "epoch": 0.19644337971067907, "grad_norm": 29.283777280542314, "learning_rate": 4.866485350475955e-05, "loss": 1.2764, "step": 533 }, { "epoch": 0.1968119413986916, "grad_norm": 6.829646100416448, "learning_rate": 4.866176288787242e-05, "loss": 0.9792, "step": 534 }, { "epoch": 0.19718050308670415, "grad_norm": 7.294840559704092, "learning_rate": 4.8658672270985287e-05, "loss": 1.0519, "step": 535 }, { "epoch": 0.19754906477471668, "grad_norm": 8.208445540175994, "learning_rate": 4.865558165409816e-05, "loss": 1.12, "step": 536 }, { "epoch": 0.1979176264627292, "grad_norm": 6.395485966928066, "learning_rate": 4.865249103721103e-05, "loss": 0.8189, "step": 537 }, { "epoch": 0.19828618815074173, "grad_norm": 6.794421084053567, "learning_rate": 4.86494004203239e-05, "loss": 1.3169, "step": 538 }, { "epoch": 0.19865474983875425, "grad_norm": 6.163599574663889, "learning_rate": 4.864630980343677e-05, "loss": 1.117, "step": 539 }, { "epoch": 0.1990233115267668, "grad_norm": 5.3960075595766, "learning_rate": 4.8643219186549636e-05, "loss": 0.8632, "step": 540 }, { "epoch": 0.19939187321477933, "grad_norm": 6.248889977111529, "learning_rate": 4.864012856966251e-05, "loss": 1.3206, "step": 541 }, { "epoch": 0.19976043490279186, "grad_norm": 6.015148748396172, "learning_rate": 4.863703795277538e-05, "loss": 1.3382, "step": 542 }, { "epoch": 0.20012899659080438, "grad_norm": 7.224228930659529, "learning_rate": 4.863394733588825e-05, "loss": 1.2876, "step": 543 }, { "epoch": 0.2004975582788169, "grad_norm": 4.661090766721896, "learning_rate": 4.8630856719001114e-05, "loss": 0.8935, "step": 544 }, { "epoch": 0.20086611996682946, "grad_norm": 5.617939735521221, "learning_rate": 4.8627766102113985e-05, "loss": 1.1207, "step": 545 }, { "epoch": 0.20123468165484198, "grad_norm": 6.245904419816587, "learning_rate": 4.8624675485226856e-05, "loss": 1.2043, "step": 546 }, { "epoch": 0.2016032433428545, "grad_norm": 4.624189976046649, "learning_rate": 4.862158486833973e-05, "loss": 0.9727, "step": 547 }, { "epoch": 0.20197180503086704, "grad_norm": 4.308522477170022, "learning_rate": 4.861849425145259e-05, "loss": 0.9264, "step": 548 }, { "epoch": 0.20234036671887956, "grad_norm": 7.5323910936952085, "learning_rate": 4.8615403634565456e-05, "loss": 1.34, "step": 549 }, { "epoch": 0.2027089284068921, "grad_norm": 5.940748751423432, "learning_rate": 4.861231301767833e-05, "loss": 1.4971, "step": 550 }, { "epoch": 0.20307749009490464, "grad_norm": 5.782130818668773, "learning_rate": 4.86092224007912e-05, "loss": 1.3022, "step": 551 }, { "epoch": 0.20344605178291716, "grad_norm": 5.66778063576962, "learning_rate": 4.860613178390407e-05, "loss": 1.2333, "step": 552 }, { "epoch": 0.2038146134709297, "grad_norm": 6.435990536205667, "learning_rate": 4.860304116701694e-05, "loss": 1.3409, "step": 553 }, { "epoch": 0.20418317515894222, "grad_norm": 5.413631625734281, "learning_rate": 4.8599950550129805e-05, "loss": 1.0959, "step": 554 }, { "epoch": 0.20455173684695477, "grad_norm": 4.682458226088355, "learning_rate": 4.8596859933242677e-05, "loss": 1.1051, "step": 555 }, { "epoch": 0.2049202985349673, "grad_norm": 4.589108983740573, "learning_rate": 4.859376931635555e-05, "loss": 1.1899, "step": 556 }, { "epoch": 0.20528886022297982, "grad_norm": 6.442292734822933, "learning_rate": 4.859067869946842e-05, "loss": 1.0333, "step": 557 }, { "epoch": 0.20565742191099234, "grad_norm": 6.7594865610736985, "learning_rate": 4.858758808258129e-05, "loss": 1.2384, "step": 558 }, { "epoch": 0.2060259835990049, "grad_norm": 5.674134700194142, "learning_rate": 4.8584497465694155e-05, "loss": 1.2085, "step": 559 }, { "epoch": 0.20639454528701742, "grad_norm": 4.506036100985409, "learning_rate": 4.8581406848807026e-05, "loss": 0.897, "step": 560 }, { "epoch": 0.20676310697502995, "grad_norm": 4.428047105188865, "learning_rate": 4.85783162319199e-05, "loss": 1.1156, "step": 561 }, { "epoch": 0.20713166866304247, "grad_norm": 6.008668994742422, "learning_rate": 4.857522561503277e-05, "loss": 1.2385, "step": 562 }, { "epoch": 0.207500230351055, "grad_norm": 5.562705046914211, "learning_rate": 4.857213499814563e-05, "loss": 1.0523, "step": 563 }, { "epoch": 0.20786879203906755, "grad_norm": 6.5417437052796865, "learning_rate": 4.85690443812585e-05, "loss": 1.0807, "step": 564 }, { "epoch": 0.20823735372708008, "grad_norm": 5.4078354467716885, "learning_rate": 4.856595376437137e-05, "loss": 1.1401, "step": 565 }, { "epoch": 0.2086059154150926, "grad_norm": 5.64675308374591, "learning_rate": 4.856286314748424e-05, "loss": 0.9375, "step": 566 }, { "epoch": 0.20897447710310513, "grad_norm": 22.992346775697044, "learning_rate": 4.855977253059711e-05, "loss": 1.425, "step": 567 }, { "epoch": 0.20934303879111765, "grad_norm": 8.561512242724659, "learning_rate": 4.8556681913709975e-05, "loss": 1.2089, "step": 568 }, { "epoch": 0.2097116004791302, "grad_norm": 5.57616075825293, "learning_rate": 4.8553591296822846e-05, "loss": 1.2694, "step": 569 }, { "epoch": 0.21008016216714273, "grad_norm": 6.879757708785375, "learning_rate": 4.855050067993572e-05, "loss": 1.2773, "step": 570 }, { "epoch": 0.21044872385515526, "grad_norm": 6.381289801253967, "learning_rate": 4.854741006304859e-05, "loss": 1.1543, "step": 571 }, { "epoch": 0.21081728554316778, "grad_norm": 10.26254658287711, "learning_rate": 4.854431944616146e-05, "loss": 1.5938, "step": 572 }, { "epoch": 0.2111858472311803, "grad_norm": 5.702497557522904, "learning_rate": 4.8541228829274324e-05, "loss": 1.2831, "step": 573 }, { "epoch": 0.21155440891919286, "grad_norm": 12.843201711418976, "learning_rate": 4.8538138212387195e-05, "loss": 1.3748, "step": 574 }, { "epoch": 0.21192297060720539, "grad_norm": 14.450533445866876, "learning_rate": 4.8535047595500067e-05, "loss": 1.229, "step": 575 }, { "epoch": 0.2122915322952179, "grad_norm": 20.47530533745735, "learning_rate": 4.853195697861294e-05, "loss": 0.9115, "step": 576 }, { "epoch": 0.21266009398323044, "grad_norm": 10.968119806950021, "learning_rate": 4.85288663617258e-05, "loss": 1.2568, "step": 577 }, { "epoch": 0.21302865567124296, "grad_norm": 6.2746054962284905, "learning_rate": 4.8525775744838667e-05, "loss": 1.2777, "step": 578 }, { "epoch": 0.21339721735925551, "grad_norm": 6.51965821944217, "learning_rate": 4.852268512795154e-05, "loss": 1.212, "step": 579 }, { "epoch": 0.21376577904726804, "grad_norm": 6.732410542672505, "learning_rate": 4.851959451106441e-05, "loss": 0.9607, "step": 580 }, { "epoch": 0.21413434073528057, "grad_norm": 9.840399665469482, "learning_rate": 4.851650389417728e-05, "loss": 1.3767, "step": 581 }, { "epoch": 0.2145029024232931, "grad_norm": 9.408434034495077, "learning_rate": 4.851341327729015e-05, "loss": 0.8693, "step": 582 }, { "epoch": 0.21487146411130564, "grad_norm": 9.639407781240863, "learning_rate": 4.8510322660403016e-05, "loss": 0.8713, "step": 583 }, { "epoch": 0.21524002579931817, "grad_norm": 7.20041299271264, "learning_rate": 4.850723204351589e-05, "loss": 1.0637, "step": 584 }, { "epoch": 0.2156085874873307, "grad_norm": 8.27913169399094, "learning_rate": 4.850414142662876e-05, "loss": 1.3151, "step": 585 }, { "epoch": 0.21597714917534322, "grad_norm": 7.863737257021049, "learning_rate": 4.850105080974163e-05, "loss": 1.0589, "step": 586 }, { "epoch": 0.21634571086335574, "grad_norm": 5.024388153632179, "learning_rate": 4.8497960192854494e-05, "loss": 1.3165, "step": 587 }, { "epoch": 0.2167142725513683, "grad_norm": 7.91759921822083, "learning_rate": 4.8494869575967365e-05, "loss": 0.9896, "step": 588 }, { "epoch": 0.21708283423938082, "grad_norm": 5.024904977080316, "learning_rate": 4.8491778959080236e-05, "loss": 1.1191, "step": 589 }, { "epoch": 0.21745139592739335, "grad_norm": 4.1539261136415, "learning_rate": 4.848868834219311e-05, "loss": 0.8577, "step": 590 }, { "epoch": 0.21781995761540587, "grad_norm": 7.715295675054562, "learning_rate": 4.848559772530598e-05, "loss": 1.1856, "step": 591 }, { "epoch": 0.2181885193034184, "grad_norm": 4.987587207614966, "learning_rate": 4.848250710841884e-05, "loss": 1.1159, "step": 592 }, { "epoch": 0.21855708099143095, "grad_norm": 5.295381498632979, "learning_rate": 4.847941649153171e-05, "loss": 0.8546, "step": 593 }, { "epoch": 0.21892564267944348, "grad_norm": 4.893181763223632, "learning_rate": 4.847632587464458e-05, "loss": 0.8954, "step": 594 }, { "epoch": 0.219294204367456, "grad_norm": 5.845933149396, "learning_rate": 4.847323525775745e-05, "loss": 1.089, "step": 595 }, { "epoch": 0.21966276605546853, "grad_norm": 6.642335487427525, "learning_rate": 4.847014464087032e-05, "loss": 1.2089, "step": 596 }, { "epoch": 0.22003132774348105, "grad_norm": 7.4496921654921895, "learning_rate": 4.8467054023983185e-05, "loss": 1.2268, "step": 597 }, { "epoch": 0.2203998894314936, "grad_norm": 5.305582007607942, "learning_rate": 4.8463963407096057e-05, "loss": 0.9129, "step": 598 }, { "epoch": 0.22076845111950613, "grad_norm": 8.144483007201245, "learning_rate": 4.846087279020893e-05, "loss": 1.4724, "step": 599 }, { "epoch": 0.22113701280751866, "grad_norm": 7.225328230744413, "learning_rate": 4.84577821733218e-05, "loss": 1.0364, "step": 600 }, { "epoch": 0.22150557449553118, "grad_norm": 11.129663411523666, "learning_rate": 4.845469155643467e-05, "loss": 1.0654, "step": 601 }, { "epoch": 0.2218741361835437, "grad_norm": 6.170111991076165, "learning_rate": 4.8451600939547534e-05, "loss": 1.0267, "step": 602 }, { "epoch": 0.22224269787155626, "grad_norm": 5.859751615500552, "learning_rate": 4.8448510322660406e-05, "loss": 1.2007, "step": 603 }, { "epoch": 0.22261125955956879, "grad_norm": 6.742517350527766, "learning_rate": 4.844541970577328e-05, "loss": 1.0587, "step": 604 }, { "epoch": 0.2229798212475813, "grad_norm": 4.453502117049838, "learning_rate": 4.844232908888615e-05, "loss": 1.0871, "step": 605 }, { "epoch": 0.22334838293559384, "grad_norm": 7.873336374065298, "learning_rate": 4.843923847199901e-05, "loss": 1.3497, "step": 606 }, { "epoch": 0.2237169446236064, "grad_norm": 5.682576490257474, "learning_rate": 4.8436147855111884e-05, "loss": 1.3846, "step": 607 }, { "epoch": 0.22408550631161891, "grad_norm": 8.780352020506006, "learning_rate": 4.843305723822475e-05, "loss": 1.1671, "step": 608 }, { "epoch": 0.22445406799963144, "grad_norm": 8.19539488831919, "learning_rate": 4.842996662133762e-05, "loss": 1.5214, "step": 609 }, { "epoch": 0.22482262968764397, "grad_norm": 6.0965475849905975, "learning_rate": 4.842687600445049e-05, "loss": 1.2942, "step": 610 }, { "epoch": 0.2251911913756565, "grad_norm": 6.035349663162908, "learning_rate": 4.842378538756336e-05, "loss": 1.1766, "step": 611 }, { "epoch": 0.22555975306366904, "grad_norm": 7.921315431112417, "learning_rate": 4.8420694770676226e-05, "loss": 1.2485, "step": 612 }, { "epoch": 0.22592831475168157, "grad_norm": 5.74204068158393, "learning_rate": 4.84176041537891e-05, "loss": 1.261, "step": 613 }, { "epoch": 0.2262968764396941, "grad_norm": 5.7354236444314655, "learning_rate": 4.841451353690197e-05, "loss": 1.2115, "step": 614 }, { "epoch": 0.22666543812770662, "grad_norm": 4.071161280489012, "learning_rate": 4.841142292001484e-05, "loss": 0.932, "step": 615 }, { "epoch": 0.22703399981571915, "grad_norm": 4.1296231343120535, "learning_rate": 4.8408332303127704e-05, "loss": 1.0151, "step": 616 }, { "epoch": 0.2274025615037317, "grad_norm": 5.074611915820894, "learning_rate": 4.8405241686240575e-05, "loss": 1.1631, "step": 617 }, { "epoch": 0.22777112319174422, "grad_norm": 6.273117646344823, "learning_rate": 4.8402151069353446e-05, "loss": 1.4778, "step": 618 }, { "epoch": 0.22813968487975675, "grad_norm": 4.788457486890717, "learning_rate": 4.839906045246632e-05, "loss": 1.1453, "step": 619 }, { "epoch": 0.22850824656776927, "grad_norm": 10.320386056110884, "learning_rate": 4.839596983557919e-05, "loss": 0.9597, "step": 620 }, { "epoch": 0.2288768082557818, "grad_norm": 5.371167080465326, "learning_rate": 4.839287921869205e-05, "loss": 1.2418, "step": 621 }, { "epoch": 0.22924536994379435, "grad_norm": 7.8343673456471885, "learning_rate": 4.8389788601804924e-05, "loss": 1.2434, "step": 622 }, { "epoch": 0.22961393163180688, "grad_norm": 8.670194445830822, "learning_rate": 4.838669798491779e-05, "loss": 1.2701, "step": 623 }, { "epoch": 0.2299824933198194, "grad_norm": 5.870795185875756, "learning_rate": 4.838360736803066e-05, "loss": 1.1277, "step": 624 }, { "epoch": 0.23035105500783193, "grad_norm": 5.372133067058703, "learning_rate": 4.838051675114353e-05, "loss": 1.2452, "step": 625 }, { "epoch": 0.23071961669584445, "grad_norm": 6.218693565227597, "learning_rate": 4.8377426134256396e-05, "loss": 1.0221, "step": 626 }, { "epoch": 0.231088178383857, "grad_norm": 6.6295878521301255, "learning_rate": 4.837433551736927e-05, "loss": 1.391, "step": 627 }, { "epoch": 0.23145674007186953, "grad_norm": 5.169779649831085, "learning_rate": 4.837124490048214e-05, "loss": 1.1892, "step": 628 }, { "epoch": 0.23182530175988206, "grad_norm": 10.23305692848244, "learning_rate": 4.836815428359501e-05, "loss": 0.9228, "step": 629 }, { "epoch": 0.23219386344789458, "grad_norm": 6.48094730093375, "learning_rate": 4.836506366670788e-05, "loss": 1.092, "step": 630 }, { "epoch": 0.23256242513590714, "grad_norm": 6.822133383792052, "learning_rate": 4.8361973049820745e-05, "loss": 1.0791, "step": 631 }, { "epoch": 0.23293098682391966, "grad_norm": 7.09520178907791, "learning_rate": 4.8358882432933616e-05, "loss": 1.0015, "step": 632 }, { "epoch": 0.2332995485119322, "grad_norm": 6.78211118793605, "learning_rate": 4.835579181604649e-05, "loss": 1.1589, "step": 633 }, { "epoch": 0.2336681101999447, "grad_norm": 5.723421450547385, "learning_rate": 4.835270119915936e-05, "loss": 1.082, "step": 634 }, { "epoch": 0.23403667188795724, "grad_norm": 5.301563269909961, "learning_rate": 4.834961058227222e-05, "loss": 1.0628, "step": 635 }, { "epoch": 0.2344052335759698, "grad_norm": 5.296408483615797, "learning_rate": 4.8346519965385094e-05, "loss": 1.1048, "step": 636 }, { "epoch": 0.23477379526398232, "grad_norm": 4.641882112846097, "learning_rate": 4.8343429348497965e-05, "loss": 1.2066, "step": 637 }, { "epoch": 0.23514235695199484, "grad_norm": 6.1044421171808745, "learning_rate": 4.834033873161083e-05, "loss": 1.1136, "step": 638 }, { "epoch": 0.23551091864000737, "grad_norm": 6.118884088841345, "learning_rate": 4.83372481147237e-05, "loss": 1.0831, "step": 639 }, { "epoch": 0.2358794803280199, "grad_norm": 5.777954481960331, "learning_rate": 4.8334157497836565e-05, "loss": 0.8248, "step": 640 }, { "epoch": 0.23624804201603244, "grad_norm": 7.184287771497702, "learning_rate": 4.8331066880949436e-05, "loss": 1.1711, "step": 641 }, { "epoch": 0.23661660370404497, "grad_norm": 6.488231275206395, "learning_rate": 4.832797626406231e-05, "loss": 1.2087, "step": 642 }, { "epoch": 0.2369851653920575, "grad_norm": 4.8700501181277565, "learning_rate": 4.832488564717518e-05, "loss": 1.0907, "step": 643 }, { "epoch": 0.23735372708007002, "grad_norm": 6.185230802332683, "learning_rate": 4.832179503028805e-05, "loss": 1.1192, "step": 644 }, { "epoch": 0.23772228876808255, "grad_norm": 4.7350759459450265, "learning_rate": 4.8318704413400914e-05, "loss": 0.8844, "step": 645 }, { "epoch": 0.2380908504560951, "grad_norm": 5.463196468690978, "learning_rate": 4.8315613796513786e-05, "loss": 1.2228, "step": 646 }, { "epoch": 0.23845941214410762, "grad_norm": 4.687980118641017, "learning_rate": 4.831252317962666e-05, "loss": 0.9556, "step": 647 }, { "epoch": 0.23882797383212015, "grad_norm": 5.043804545621647, "learning_rate": 4.830943256273953e-05, "loss": 1.1661, "step": 648 }, { "epoch": 0.23919653552013267, "grad_norm": 6.127972562723514, "learning_rate": 4.830634194585239e-05, "loss": 1.2003, "step": 649 }, { "epoch": 0.2395650972081452, "grad_norm": 10.945134403105829, "learning_rate": 4.8303251328965264e-05, "loss": 1.1633, "step": 650 }, { "epoch": 0.23993365889615775, "grad_norm": 5.638890726290666, "learning_rate": 4.8300160712078135e-05, "loss": 1.0523, "step": 651 }, { "epoch": 0.24030222058417028, "grad_norm": 8.041638257333913, "learning_rate": 4.8297070095191006e-05, "loss": 1.2191, "step": 652 }, { "epoch": 0.2406707822721828, "grad_norm": 4.390144729140243, "learning_rate": 4.829397947830387e-05, "loss": 1.0579, "step": 653 }, { "epoch": 0.24103934396019533, "grad_norm": 6.342562860366093, "learning_rate": 4.829088886141674e-05, "loss": 1.2356, "step": 654 }, { "epoch": 0.24140790564820788, "grad_norm": 6.853342240521967, "learning_rate": 4.8287798244529606e-05, "loss": 1.1162, "step": 655 }, { "epoch": 0.2417764673362204, "grad_norm": 7.975421341913836, "learning_rate": 4.828470762764248e-05, "loss": 0.9331, "step": 656 }, { "epoch": 0.24214502902423293, "grad_norm": 6.551335019639186, "learning_rate": 4.828161701075535e-05, "loss": 1.134, "step": 657 }, { "epoch": 0.24251359071224546, "grad_norm": 5.388714349461093, "learning_rate": 4.827852639386822e-05, "loss": 1.3405, "step": 658 }, { "epoch": 0.24288215240025798, "grad_norm": 4.865773787088662, "learning_rate": 4.8275435776981084e-05, "loss": 1.0025, "step": 659 }, { "epoch": 0.24325071408827054, "grad_norm": 7.285251549309656, "learning_rate": 4.8272345160093955e-05, "loss": 1.0225, "step": 660 }, { "epoch": 0.24361927577628306, "grad_norm": 4.955935670522906, "learning_rate": 4.8269254543206826e-05, "loss": 1.2447, "step": 661 }, { "epoch": 0.2439878374642956, "grad_norm": 5.560583234426261, "learning_rate": 4.82661639263197e-05, "loss": 1.3603, "step": 662 }, { "epoch": 0.2443563991523081, "grad_norm": 5.523999996221212, "learning_rate": 4.826307330943257e-05, "loss": 1.1863, "step": 663 }, { "epoch": 0.24472496084032064, "grad_norm": 4.645960556591953, "learning_rate": 4.825998269254543e-05, "loss": 1.0466, "step": 664 }, { "epoch": 0.2450935225283332, "grad_norm": 5.177773422617757, "learning_rate": 4.8256892075658304e-05, "loss": 0.9531, "step": 665 }, { "epoch": 0.24546208421634572, "grad_norm": 4.213743502856021, "learning_rate": 4.8253801458771176e-05, "loss": 1.0685, "step": 666 }, { "epoch": 0.24583064590435824, "grad_norm": 8.337260376376408, "learning_rate": 4.825071084188405e-05, "loss": 0.9665, "step": 667 }, { "epoch": 0.24619920759237077, "grad_norm": 4.635857904700407, "learning_rate": 4.824762022499691e-05, "loss": 1.3689, "step": 668 }, { "epoch": 0.2465677692803833, "grad_norm": 5.742785696413513, "learning_rate": 4.8244529608109776e-05, "loss": 1.0097, "step": 669 }, { "epoch": 0.24693633096839585, "grad_norm": 5.883978942086551, "learning_rate": 4.824143899122265e-05, "loss": 1.1255, "step": 670 }, { "epoch": 0.24730489265640837, "grad_norm": 6.734940861341079, "learning_rate": 4.823834837433552e-05, "loss": 1.2461, "step": 671 }, { "epoch": 0.2476734543444209, "grad_norm": 13.071510237260746, "learning_rate": 4.823525775744839e-05, "loss": 1.322, "step": 672 }, { "epoch": 0.24804201603243342, "grad_norm": 8.285070610413122, "learning_rate": 4.823216714056126e-05, "loss": 1.1619, "step": 673 }, { "epoch": 0.24841057772044595, "grad_norm": 6.505201752544543, "learning_rate": 4.8229076523674125e-05, "loss": 1.2544, "step": 674 }, { "epoch": 0.2487791394084585, "grad_norm": 5.366070449281335, "learning_rate": 4.8225985906786996e-05, "loss": 1.2441, "step": 675 }, { "epoch": 0.24914770109647102, "grad_norm": 4.632120579324481, "learning_rate": 4.822289528989987e-05, "loss": 1.0069, "step": 676 }, { "epoch": 0.24951626278448355, "grad_norm": 5.126751484130014, "learning_rate": 4.821980467301274e-05, "loss": 0.8789, "step": 677 }, { "epoch": 0.24988482447249608, "grad_norm": 5.453889722133763, "learning_rate": 4.82167140561256e-05, "loss": 1.1648, "step": 678 }, { "epoch": 0.25025338616050863, "grad_norm": 4.946986681893834, "learning_rate": 4.8213623439238474e-05, "loss": 1.1006, "step": 679 }, { "epoch": 0.25062194784852115, "grad_norm": 4.385909946030299, "learning_rate": 4.8210532822351345e-05, "loss": 0.9372, "step": 680 }, { "epoch": 0.2509905095365337, "grad_norm": 5.625548781010117, "learning_rate": 4.8207442205464216e-05, "loss": 1.1726, "step": 681 }, { "epoch": 0.2513590712245462, "grad_norm": 4.809773254760253, "learning_rate": 4.820435158857709e-05, "loss": 1.0222, "step": 682 }, { "epoch": 0.25172763291255873, "grad_norm": 5.734331094433416, "learning_rate": 4.820126097168995e-05, "loss": 1.3505, "step": 683 }, { "epoch": 0.25209619460057126, "grad_norm": 6.865627767000609, "learning_rate": 4.8198170354802816e-05, "loss": 1.1087, "step": 684 }, { "epoch": 0.2524647562885838, "grad_norm": 4.173095067528052, "learning_rate": 4.819507973791569e-05, "loss": 1.1715, "step": 685 }, { "epoch": 0.2528333179765963, "grad_norm": 6.258548231367676, "learning_rate": 4.819198912102856e-05, "loss": 0.851, "step": 686 }, { "epoch": 0.2532018796646089, "grad_norm": 3.806018886220622, "learning_rate": 4.818889850414143e-05, "loss": 1.0843, "step": 687 }, { "epoch": 0.2535704413526214, "grad_norm": 4.1293353793375545, "learning_rate": 4.8185807887254294e-05, "loss": 1.2097, "step": 688 }, { "epoch": 0.25393900304063394, "grad_norm": 5.389686215242897, "learning_rate": 4.8182717270367166e-05, "loss": 1.3031, "step": 689 }, { "epoch": 0.25430756472864646, "grad_norm": 5.377728856252162, "learning_rate": 4.817962665348004e-05, "loss": 0.9634, "step": 690 }, { "epoch": 0.254676126416659, "grad_norm": 7.107999094192885, "learning_rate": 4.817653603659291e-05, "loss": 1.1607, "step": 691 }, { "epoch": 0.2550446881046715, "grad_norm": 4.701102091037944, "learning_rate": 4.817344541970578e-05, "loss": 1.108, "step": 692 }, { "epoch": 0.25541324979268404, "grad_norm": 9.687896917271958, "learning_rate": 4.8170354802818644e-05, "loss": 1.4576, "step": 693 }, { "epoch": 0.25578181148069656, "grad_norm": 6.005832380528765, "learning_rate": 4.8167264185931515e-05, "loss": 1.0727, "step": 694 }, { "epoch": 0.2561503731687091, "grad_norm": 5.372676258346668, "learning_rate": 4.8164173569044386e-05, "loss": 1.2288, "step": 695 }, { "epoch": 0.25651893485672167, "grad_norm": 9.396011204244754, "learning_rate": 4.816108295215726e-05, "loss": 1.3482, "step": 696 }, { "epoch": 0.2568874965447342, "grad_norm": 4.917074326678622, "learning_rate": 4.815799233527012e-05, "loss": 1.1037, "step": 697 }, { "epoch": 0.2572560582327467, "grad_norm": 8.432527560413241, "learning_rate": 4.815490171838299e-05, "loss": 1.1642, "step": 698 }, { "epoch": 0.25762461992075925, "grad_norm": 6.182920668713571, "learning_rate": 4.815181110149586e-05, "loss": 1.2761, "step": 699 }, { "epoch": 0.25799318160877177, "grad_norm": 6.238350960396355, "learning_rate": 4.814872048460873e-05, "loss": 1.2275, "step": 700 }, { "epoch": 0.2583617432967843, "grad_norm": 6.902982603428911, "learning_rate": 4.81456298677216e-05, "loss": 1.4815, "step": 701 }, { "epoch": 0.2587303049847968, "grad_norm": 5.491848887699138, "learning_rate": 4.814253925083447e-05, "loss": 1.0133, "step": 702 }, { "epoch": 0.25909886667280935, "grad_norm": 5.081019682784716, "learning_rate": 4.8139448633947335e-05, "loss": 1.1623, "step": 703 }, { "epoch": 0.2594674283608219, "grad_norm": 9.513872157420153, "learning_rate": 4.8136358017060206e-05, "loss": 1.0352, "step": 704 }, { "epoch": 0.2598359900488344, "grad_norm": 4.053110628242277, "learning_rate": 4.813326740017308e-05, "loss": 1.1101, "step": 705 }, { "epoch": 0.260204551736847, "grad_norm": 5.526070103645562, "learning_rate": 4.813017678328595e-05, "loss": 1.1776, "step": 706 }, { "epoch": 0.2605731134248595, "grad_norm": 4.668193885130514, "learning_rate": 4.812708616639881e-05, "loss": 0.8914, "step": 707 }, { "epoch": 0.26094167511287203, "grad_norm": 4.720710195678921, "learning_rate": 4.8123995549511684e-05, "loss": 1.0378, "step": 708 }, { "epoch": 0.26131023680088455, "grad_norm": 6.562141045335505, "learning_rate": 4.8120904932624556e-05, "loss": 1.034, "step": 709 }, { "epoch": 0.2616787984888971, "grad_norm": 5.0775416582014845, "learning_rate": 4.811781431573743e-05, "loss": 0.9835, "step": 710 }, { "epoch": 0.2620473601769096, "grad_norm": 4.699671291463131, "learning_rate": 4.81147236988503e-05, "loss": 1.0025, "step": 711 }, { "epoch": 0.26241592186492213, "grad_norm": 9.534734170425708, "learning_rate": 4.811163308196316e-05, "loss": 0.8228, "step": 712 }, { "epoch": 0.26278448355293466, "grad_norm": 4.47906115503756, "learning_rate": 4.8108542465076034e-05, "loss": 1.0384, "step": 713 }, { "epoch": 0.2631530452409472, "grad_norm": 5.699716045601595, "learning_rate": 4.81054518481889e-05, "loss": 0.8551, "step": 714 }, { "epoch": 0.26352160692895976, "grad_norm": 5.740217428841097, "learning_rate": 4.810236123130177e-05, "loss": 1.2485, "step": 715 }, { "epoch": 0.2638901686169723, "grad_norm": 4.30183795416977, "learning_rate": 4.809927061441464e-05, "loss": 0.9598, "step": 716 }, { "epoch": 0.2642587303049848, "grad_norm": 5.318128409187891, "learning_rate": 4.8096179997527505e-05, "loss": 1.2778, "step": 717 }, { "epoch": 0.26462729199299734, "grad_norm": 4.98829807407108, "learning_rate": 4.8093089380640376e-05, "loss": 1.3837, "step": 718 }, { "epoch": 0.26499585368100986, "grad_norm": 6.636374443871996, "learning_rate": 4.808999876375325e-05, "loss": 1.1828, "step": 719 }, { "epoch": 0.2653644153690224, "grad_norm": 6.703533595865574, "learning_rate": 4.808690814686612e-05, "loss": 1.0231, "step": 720 }, { "epoch": 0.2657329770570349, "grad_norm": 4.524118849807938, "learning_rate": 4.808381752997899e-05, "loss": 1.11, "step": 721 }, { "epoch": 0.26610153874504744, "grad_norm": 6.069169927461122, "learning_rate": 4.8080726913091854e-05, "loss": 0.9488, "step": 722 }, { "epoch": 0.26647010043305996, "grad_norm": 5.331665294790533, "learning_rate": 4.8077636296204725e-05, "loss": 0.7603, "step": 723 }, { "epoch": 0.2668386621210725, "grad_norm": 10.345415701536485, "learning_rate": 4.8074545679317596e-05, "loss": 1.3889, "step": 724 }, { "epoch": 0.26720722380908507, "grad_norm": 5.695250208635688, "learning_rate": 4.807145506243047e-05, "loss": 1.2903, "step": 725 }, { "epoch": 0.2675757854970976, "grad_norm": 4.930532440465672, "learning_rate": 4.806836444554333e-05, "loss": 0.9526, "step": 726 }, { "epoch": 0.2679443471851101, "grad_norm": 4.420964555054236, "learning_rate": 4.80652738286562e-05, "loss": 1.0921, "step": 727 }, { "epoch": 0.26831290887312265, "grad_norm": 4.601192129155162, "learning_rate": 4.8062183211769074e-05, "loss": 1.1706, "step": 728 }, { "epoch": 0.26868147056113517, "grad_norm": 5.20469334881745, "learning_rate": 4.805909259488194e-05, "loss": 1.1379, "step": 729 }, { "epoch": 0.2690500322491477, "grad_norm": 4.661235214377717, "learning_rate": 4.805600197799481e-05, "loss": 1.1819, "step": 730 }, { "epoch": 0.2694185939371602, "grad_norm": 5.419950907861429, "learning_rate": 4.8052911361107674e-05, "loss": 1.1893, "step": 731 }, { "epoch": 0.26978715562517275, "grad_norm": 4.678117440930453, "learning_rate": 4.8049820744220546e-05, "loss": 0.9917, "step": 732 }, { "epoch": 0.2701557173131853, "grad_norm": 6.068163556443902, "learning_rate": 4.804673012733342e-05, "loss": 1.259, "step": 733 }, { "epoch": 0.2705242790011978, "grad_norm": 5.5724834903628215, "learning_rate": 4.804363951044629e-05, "loss": 1.3638, "step": 734 }, { "epoch": 0.2708928406892104, "grad_norm": 7.56777366685243, "learning_rate": 4.804054889355916e-05, "loss": 1.1625, "step": 735 }, { "epoch": 0.2712614023772229, "grad_norm": 4.703125811098352, "learning_rate": 4.8037458276672024e-05, "loss": 1.3413, "step": 736 }, { "epoch": 0.27162996406523543, "grad_norm": 4.644957501203674, "learning_rate": 4.8034367659784895e-05, "loss": 1.2684, "step": 737 }, { "epoch": 0.27199852575324796, "grad_norm": 9.303440269840625, "learning_rate": 4.8031277042897766e-05, "loss": 1.1977, "step": 738 }, { "epoch": 0.2723670874412605, "grad_norm": 7.418252611077477, "learning_rate": 4.802818642601064e-05, "loss": 1.0246, "step": 739 }, { "epoch": 0.272735649129273, "grad_norm": 4.858604443653813, "learning_rate": 4.80250958091235e-05, "loss": 1.142, "step": 740 }, { "epoch": 0.27310421081728553, "grad_norm": 4.203429937823099, "learning_rate": 4.802200519223637e-05, "loss": 0.9833, "step": 741 }, { "epoch": 0.27347277250529806, "grad_norm": 5.276622663657042, "learning_rate": 4.8018914575349244e-05, "loss": 1.2035, "step": 742 }, { "epoch": 0.2738413341933106, "grad_norm": 5.790556803598694, "learning_rate": 4.8015823958462115e-05, "loss": 1.2865, "step": 743 }, { "epoch": 0.27420989588132316, "grad_norm": 5.80398986649368, "learning_rate": 4.801273334157498e-05, "loss": 1.3874, "step": 744 }, { "epoch": 0.2745784575693357, "grad_norm": 7.831222682701258, "learning_rate": 4.800964272468785e-05, "loss": 1.0925, "step": 745 }, { "epoch": 0.2749470192573482, "grad_norm": 7.819956402669291, "learning_rate": 4.8006552107800715e-05, "loss": 1.2402, "step": 746 }, { "epoch": 0.27531558094536074, "grad_norm": 5.824989600438343, "learning_rate": 4.8003461490913586e-05, "loss": 1.3954, "step": 747 }, { "epoch": 0.27568414263337326, "grad_norm": 7.644911985121471, "learning_rate": 4.800037087402646e-05, "loss": 1.1342, "step": 748 }, { "epoch": 0.2760527043213858, "grad_norm": 5.451943693120738, "learning_rate": 4.799728025713933e-05, "loss": 1.2115, "step": 749 }, { "epoch": 0.2764212660093983, "grad_norm": 5.575305386377251, "learning_rate": 4.799418964025219e-05, "loss": 1.4268, "step": 750 }, { "epoch": 0.27678982769741084, "grad_norm": 5.336192000702315, "learning_rate": 4.7991099023365064e-05, "loss": 1.1369, "step": 751 }, { "epoch": 0.27715838938542336, "grad_norm": 5.2248758693716155, "learning_rate": 4.7988008406477935e-05, "loss": 1.1355, "step": 752 }, { "epoch": 0.2775269510734359, "grad_norm": 7.285284799050171, "learning_rate": 4.798491778959081e-05, "loss": 1.2719, "step": 753 }, { "epoch": 0.27789551276144847, "grad_norm": 6.901039332743316, "learning_rate": 4.798182717270368e-05, "loss": 1.1771, "step": 754 }, { "epoch": 0.278264074449461, "grad_norm": 7.325330384390763, "learning_rate": 4.797873655581654e-05, "loss": 0.9152, "step": 755 }, { "epoch": 0.2786326361374735, "grad_norm": 5.754648734342007, "learning_rate": 4.7975645938929413e-05, "loss": 1.1813, "step": 756 }, { "epoch": 0.27900119782548605, "grad_norm": 5.28802525141194, "learning_rate": 4.7972555322042285e-05, "loss": 0.8032, "step": 757 }, { "epoch": 0.2793697595134986, "grad_norm": 4.575615204684805, "learning_rate": 4.7969464705155156e-05, "loss": 1.2471, "step": 758 }, { "epoch": 0.2797383212015111, "grad_norm": 10.61354650363231, "learning_rate": 4.796637408826802e-05, "loss": 0.935, "step": 759 }, { "epoch": 0.2801068828895236, "grad_norm": 6.8978092862313, "learning_rate": 4.7963283471380885e-05, "loss": 1.1981, "step": 760 }, { "epoch": 0.28047544457753615, "grad_norm": 4.114987330179653, "learning_rate": 4.7960192854493756e-05, "loss": 0.9284, "step": 761 }, { "epoch": 0.2808440062655487, "grad_norm": 6.967972647927489, "learning_rate": 4.795710223760663e-05, "loss": 0.8976, "step": 762 }, { "epoch": 0.28121256795356125, "grad_norm": 8.563087164801733, "learning_rate": 4.79540116207195e-05, "loss": 1.2078, "step": 763 }, { "epoch": 0.2815811296415738, "grad_norm": 6.111962592404917, "learning_rate": 4.795092100383237e-05, "loss": 1.33, "step": 764 }, { "epoch": 0.2819496913295863, "grad_norm": 5.004437194338648, "learning_rate": 4.7947830386945234e-05, "loss": 0.8961, "step": 765 }, { "epoch": 0.28231825301759883, "grad_norm": 5.638576991700389, "learning_rate": 4.7944739770058105e-05, "loss": 1.0346, "step": 766 }, { "epoch": 0.28268681470561136, "grad_norm": 5.4402387014078615, "learning_rate": 4.7941649153170976e-05, "loss": 1.1741, "step": 767 }, { "epoch": 0.2830553763936239, "grad_norm": 9.721536807165132, "learning_rate": 4.793855853628385e-05, "loss": 1.2149, "step": 768 }, { "epoch": 0.2834239380816364, "grad_norm": 6.536128945935843, "learning_rate": 4.793546791939671e-05, "loss": 1.2145, "step": 769 }, { "epoch": 0.28379249976964893, "grad_norm": 5.580677634490942, "learning_rate": 4.793237730250958e-05, "loss": 0.8417, "step": 770 }, { "epoch": 0.28416106145766146, "grad_norm": 8.119324565287705, "learning_rate": 4.7929286685622454e-05, "loss": 1.239, "step": 771 }, { "epoch": 0.284529623145674, "grad_norm": 10.72208168144116, "learning_rate": 4.7926196068735325e-05, "loss": 0.7368, "step": 772 }, { "epoch": 0.28489818483368656, "grad_norm": 5.22665774681991, "learning_rate": 4.79231054518482e-05, "loss": 1.1039, "step": 773 }, { "epoch": 0.2852667465216991, "grad_norm": 4.8618660089380485, "learning_rate": 4.792001483496106e-05, "loss": 1.2236, "step": 774 }, { "epoch": 0.2856353082097116, "grad_norm": 6.018756477455464, "learning_rate": 4.7916924218073925e-05, "loss": 0.9775, "step": 775 }, { "epoch": 0.28600386989772414, "grad_norm": 4.690109137137612, "learning_rate": 4.79138336011868e-05, "loss": 1.2573, "step": 776 }, { "epoch": 0.28637243158573666, "grad_norm": 5.537490686275322, "learning_rate": 4.791074298429967e-05, "loss": 1.114, "step": 777 }, { "epoch": 0.2867409932737492, "grad_norm": 5.132437171161277, "learning_rate": 4.790765236741254e-05, "loss": 1.2209, "step": 778 }, { "epoch": 0.2871095549617617, "grad_norm": 6.103243118916656, "learning_rate": 4.7904561750525403e-05, "loss": 1.0298, "step": 779 }, { "epoch": 0.28747811664977424, "grad_norm": 5.105650311184429, "learning_rate": 4.7901471133638275e-05, "loss": 0.9833, "step": 780 }, { "epoch": 0.28784667833778677, "grad_norm": 4.945842021537076, "learning_rate": 4.7898380516751146e-05, "loss": 1.1058, "step": 781 }, { "epoch": 0.2882152400257993, "grad_norm": 7.588338045850073, "learning_rate": 4.789528989986402e-05, "loss": 1.2795, "step": 782 }, { "epoch": 0.28858380171381187, "grad_norm": 4.196904340401443, "learning_rate": 4.789219928297689e-05, "loss": 1.0276, "step": 783 }, { "epoch": 0.2889523634018244, "grad_norm": 4.532130504429422, "learning_rate": 4.788910866608975e-05, "loss": 1.0417, "step": 784 }, { "epoch": 0.2893209250898369, "grad_norm": 5.487087350680153, "learning_rate": 4.7886018049202624e-05, "loss": 1.2825, "step": 785 }, { "epoch": 0.28968948677784945, "grad_norm": 5.132587491798061, "learning_rate": 4.7882927432315495e-05, "loss": 1.3217, "step": 786 }, { "epoch": 0.290058048465862, "grad_norm": 6.340888820403244, "learning_rate": 4.7879836815428366e-05, "loss": 1.1663, "step": 787 }, { "epoch": 0.2904266101538745, "grad_norm": 6.62677381268976, "learning_rate": 4.787674619854123e-05, "loss": 1.2102, "step": 788 }, { "epoch": 0.290795171841887, "grad_norm": 6.338377834220356, "learning_rate": 4.7873655581654095e-05, "loss": 1.1854, "step": 789 }, { "epoch": 0.29116373352989955, "grad_norm": 7.535307618198404, "learning_rate": 4.7870564964766966e-05, "loss": 1.3216, "step": 790 }, { "epoch": 0.2915322952179121, "grad_norm": 6.419930787946353, "learning_rate": 4.786747434787984e-05, "loss": 1.1384, "step": 791 }, { "epoch": 0.29190085690592465, "grad_norm": 5.9038151965074, "learning_rate": 4.786438373099271e-05, "loss": 1.2352, "step": 792 }, { "epoch": 0.2922694185939372, "grad_norm": 6.274617047421849, "learning_rate": 4.786129311410558e-05, "loss": 1.3188, "step": 793 }, { "epoch": 0.2926379802819497, "grad_norm": 6.93123997239539, "learning_rate": 4.7858202497218444e-05, "loss": 1.2498, "step": 794 }, { "epoch": 0.29300654196996223, "grad_norm": 6.5088417365882485, "learning_rate": 4.7855111880331315e-05, "loss": 1.1757, "step": 795 }, { "epoch": 0.29337510365797476, "grad_norm": 5.843896874198717, "learning_rate": 4.785202126344419e-05, "loss": 1.182, "step": 796 }, { "epoch": 0.2937436653459873, "grad_norm": 5.265621921074304, "learning_rate": 4.784893064655706e-05, "loss": 1.1618, "step": 797 }, { "epoch": 0.2941122270339998, "grad_norm": 4.747994652119222, "learning_rate": 4.784584002966992e-05, "loss": 1.3625, "step": 798 }, { "epoch": 0.29448078872201233, "grad_norm": 5.129294108406839, "learning_rate": 4.7842749412782793e-05, "loss": 0.7386, "step": 799 }, { "epoch": 0.29484935041002486, "grad_norm": 5.630766561843477, "learning_rate": 4.7839658795895665e-05, "loss": 1.1316, "step": 800 }, { "epoch": 0.29484935041002486, "eval_bleu": 0.029706287692983103, "eval_bleu_1gram": 0.34749794196097433, "eval_bleu_2gram": 0.13702612603313277, "eval_bleu_3gram": 0.050977083734064864, "eval_bleu_4gram": 0.022524601243577004, "eval_rag_val_loss": 1.2292229623575261, "eval_rouge1": 0.33337731268541965, "eval_rouge2": 0.1300609138416409, "eval_rougeL": 0.3287182606569555, "step": 800 }, { "epoch": 0.2952179120980374, "grad_norm": 4.5702450999971385, "learning_rate": 4.7836568179008536e-05, "loss": 0.9699, "step": 801 }, { "epoch": 0.29558647378604996, "grad_norm": 5.584208785168016, "learning_rate": 4.783347756212141e-05, "loss": 0.845, "step": 802 }, { "epoch": 0.2959550354740625, "grad_norm": 7.315693215138096, "learning_rate": 4.783038694523427e-05, "loss": 1.0475, "step": 803 }, { "epoch": 0.296323597162075, "grad_norm": 7.0399279131233055, "learning_rate": 4.782729632834714e-05, "loss": 1.4035, "step": 804 }, { "epoch": 0.29669215885008754, "grad_norm": 4.300192699993813, "learning_rate": 4.782420571146001e-05, "loss": 0.9524, "step": 805 }, { "epoch": 0.29706072053810006, "grad_norm": 9.79880807595866, "learning_rate": 4.782111509457288e-05, "loss": 1.0851, "step": 806 }, { "epoch": 0.2974292822261126, "grad_norm": 6.937105459730581, "learning_rate": 4.781802447768575e-05, "loss": 1.4259, "step": 807 }, { "epoch": 0.2977978439141251, "grad_norm": 9.699411858100799, "learning_rate": 4.7814933860798614e-05, "loss": 1.0154, "step": 808 }, { "epoch": 0.29816640560213764, "grad_norm": 7.78975399701328, "learning_rate": 4.7811843243911485e-05, "loss": 1.0738, "step": 809 }, { "epoch": 0.29853496729015017, "grad_norm": 6.022635677419786, "learning_rate": 4.7808752627024356e-05, "loss": 0.9371, "step": 810 }, { "epoch": 0.29890352897816275, "grad_norm": 6.253083955934139, "learning_rate": 4.780566201013723e-05, "loss": 1.2798, "step": 811 }, { "epoch": 0.29927209066617527, "grad_norm": 4.945084553025587, "learning_rate": 4.780257139325009e-05, "loss": 1.2642, "step": 812 }, { "epoch": 0.2996406523541878, "grad_norm": 5.5510980568019574, "learning_rate": 4.779948077636296e-05, "loss": 0.8573, "step": 813 }, { "epoch": 0.3000092140422003, "grad_norm": 4.174282793585062, "learning_rate": 4.7796390159475834e-05, "loss": 0.9555, "step": 814 }, { "epoch": 0.30037777573021285, "grad_norm": 5.373373273380792, "learning_rate": 4.7793299542588705e-05, "loss": 1.2372, "step": 815 }, { "epoch": 0.3007463374182254, "grad_norm": 4.002891687388245, "learning_rate": 4.7790208925701577e-05, "loss": 0.9406, "step": 816 }, { "epoch": 0.3011148991062379, "grad_norm": 10.02899125477627, "learning_rate": 4.778711830881444e-05, "loss": 1.2015, "step": 817 }, { "epoch": 0.3014834607942504, "grad_norm": 7.789390870789542, "learning_rate": 4.778402769192731e-05, "loss": 1.3565, "step": 818 }, { "epoch": 0.30185202248226295, "grad_norm": 5.837276688088443, "learning_rate": 4.778093707504018e-05, "loss": 1.1856, "step": 819 }, { "epoch": 0.3022205841702755, "grad_norm": 6.2265131815604855, "learning_rate": 4.777784645815305e-05, "loss": 1.4108, "step": 820 }, { "epoch": 0.30258914585828806, "grad_norm": 4.8885083965624645, "learning_rate": 4.777475584126592e-05, "loss": 0.9768, "step": 821 }, { "epoch": 0.3029577075463006, "grad_norm": 5.185641771730554, "learning_rate": 4.7771665224378783e-05, "loss": 1.1993, "step": 822 }, { "epoch": 0.3033262692343131, "grad_norm": 5.798818915241928, "learning_rate": 4.7768574607491655e-05, "loss": 1.177, "step": 823 }, { "epoch": 0.30369483092232563, "grad_norm": 10.68597865291279, "learning_rate": 4.7765483990604526e-05, "loss": 1.32, "step": 824 }, { "epoch": 0.30406339261033816, "grad_norm": 3.935478175772909, "learning_rate": 4.77623933737174e-05, "loss": 1.0147, "step": 825 }, { "epoch": 0.3044319542983507, "grad_norm": 6.6070711345535535, "learning_rate": 4.775930275683027e-05, "loss": 1.1935, "step": 826 }, { "epoch": 0.3048005159863632, "grad_norm": 4.779356338386723, "learning_rate": 4.775621213994313e-05, "loss": 1.0937, "step": 827 }, { "epoch": 0.30516907767437573, "grad_norm": 4.4464972179407996, "learning_rate": 4.7753121523056004e-05, "loss": 1.0402, "step": 828 }, { "epoch": 0.30553763936238826, "grad_norm": 5.840254610337996, "learning_rate": 4.7750030906168875e-05, "loss": 0.9905, "step": 829 }, { "epoch": 0.3059062010504008, "grad_norm": 7.027842870250839, "learning_rate": 4.7746940289281746e-05, "loss": 1.3655, "step": 830 }, { "epoch": 0.30627476273841336, "grad_norm": 6.065649728464212, "learning_rate": 4.774384967239461e-05, "loss": 1.2803, "step": 831 }, { "epoch": 0.3066433244264259, "grad_norm": 3.9316401398720515, "learning_rate": 4.774075905550748e-05, "loss": 1.19, "step": 832 }, { "epoch": 0.3070118861144384, "grad_norm": 5.64353854943942, "learning_rate": 4.773766843862035e-05, "loss": 1.2271, "step": 833 }, { "epoch": 0.30738044780245094, "grad_norm": 12.859789706063614, "learning_rate": 4.7734577821733224e-05, "loss": 0.9038, "step": 834 }, { "epoch": 0.30774900949046347, "grad_norm": 5.777852807821229, "learning_rate": 4.773148720484609e-05, "loss": 0.9298, "step": 835 }, { "epoch": 0.308117571178476, "grad_norm": 6.545891080132558, "learning_rate": 4.772839658795896e-05, "loss": 1.0828, "step": 836 }, { "epoch": 0.3084861328664885, "grad_norm": 6.047326044022667, "learning_rate": 4.7725305971071824e-05, "loss": 0.9603, "step": 837 }, { "epoch": 0.30885469455450104, "grad_norm": 7.719424075450229, "learning_rate": 4.7722215354184695e-05, "loss": 1.0381, "step": 838 }, { "epoch": 0.30922325624251357, "grad_norm": 8.431691975246112, "learning_rate": 4.7719124737297567e-05, "loss": 1.2123, "step": 839 }, { "epoch": 0.30959181793052615, "grad_norm": 6.698613709467214, "learning_rate": 4.771603412041044e-05, "loss": 1.1645, "step": 840 }, { "epoch": 0.3099603796185387, "grad_norm": 8.310125714577959, "learning_rate": 4.77129435035233e-05, "loss": 1.0411, "step": 841 }, { "epoch": 0.3103289413065512, "grad_norm": 4.814550186014509, "learning_rate": 4.770985288663617e-05, "loss": 1.0944, "step": 842 }, { "epoch": 0.3106975029945637, "grad_norm": 8.349485790386566, "learning_rate": 4.7706762269749045e-05, "loss": 1.0747, "step": 843 }, { "epoch": 0.31106606468257625, "grad_norm": 8.841838940216372, "learning_rate": 4.7703671652861916e-05, "loss": 1.2053, "step": 844 }, { "epoch": 0.3114346263705888, "grad_norm": 11.792331492595093, "learning_rate": 4.770058103597479e-05, "loss": 0.9904, "step": 845 }, { "epoch": 0.3118031880586013, "grad_norm": 5.674378402209783, "learning_rate": 4.769749041908765e-05, "loss": 0.9703, "step": 846 }, { "epoch": 0.3121717497466138, "grad_norm": 5.3177686707164975, "learning_rate": 4.769439980220052e-05, "loss": 0.8903, "step": 847 }, { "epoch": 0.31254031143462635, "grad_norm": 5.419866624155323, "learning_rate": 4.7691309185313394e-05, "loss": 1.2181, "step": 848 }, { "epoch": 0.3129088731226389, "grad_norm": 6.391436163945704, "learning_rate": 4.7688218568426265e-05, "loss": 1.3884, "step": 849 }, { "epoch": 0.31327743481065146, "grad_norm": 7.697451221674522, "learning_rate": 4.768512795153913e-05, "loss": 1.0459, "step": 850 }, { "epoch": 0.313645996498664, "grad_norm": 10.255609977507051, "learning_rate": 4.7682037334651994e-05, "loss": 1.0832, "step": 851 }, { "epoch": 0.3140145581866765, "grad_norm": 5.259563092264298, "learning_rate": 4.7678946717764865e-05, "loss": 1.1868, "step": 852 }, { "epoch": 0.31438311987468903, "grad_norm": 13.229445727351154, "learning_rate": 4.7675856100877736e-05, "loss": 0.8677, "step": 853 }, { "epoch": 0.31475168156270156, "grad_norm": 7.478635749639185, "learning_rate": 4.767276548399061e-05, "loss": 1.2106, "step": 854 }, { "epoch": 0.3151202432507141, "grad_norm": 5.012341336524092, "learning_rate": 4.766967486710348e-05, "loss": 0.7909, "step": 855 }, { "epoch": 0.3154888049387266, "grad_norm": 6.293933721018225, "learning_rate": 4.766658425021634e-05, "loss": 1.0496, "step": 856 }, { "epoch": 0.31585736662673913, "grad_norm": 9.239693338168667, "learning_rate": 4.7663493633329214e-05, "loss": 0.9167, "step": 857 }, { "epoch": 0.31622592831475166, "grad_norm": 7.263445258548162, "learning_rate": 4.7660403016442085e-05, "loss": 0.8936, "step": 858 }, { "epoch": 0.31659449000276424, "grad_norm": 8.168586609073898, "learning_rate": 4.7657312399554957e-05, "loss": 1.2561, "step": 859 }, { "epoch": 0.31696305169077676, "grad_norm": 5.721525925593305, "learning_rate": 4.765422178266782e-05, "loss": 1.1656, "step": 860 }, { "epoch": 0.3173316133787893, "grad_norm": 6.999892642696554, "learning_rate": 4.765113116578069e-05, "loss": 1.2123, "step": 861 }, { "epoch": 0.3177001750668018, "grad_norm": 9.808895542449992, "learning_rate": 4.764804054889356e-05, "loss": 1.0698, "step": 862 }, { "epoch": 0.31806873675481434, "grad_norm": 11.473960254245943, "learning_rate": 4.7644949932006435e-05, "loss": 1.1665, "step": 863 }, { "epoch": 0.31843729844282687, "grad_norm": 5.967295204742839, "learning_rate": 4.7641859315119306e-05, "loss": 1.1284, "step": 864 }, { "epoch": 0.3188058601308394, "grad_norm": 8.781192657174872, "learning_rate": 4.763876869823217e-05, "loss": 0.9764, "step": 865 }, { "epoch": 0.3191744218188519, "grad_norm": 8.69375100043241, "learning_rate": 4.7635678081345035e-05, "loss": 1.1837, "step": 866 }, { "epoch": 0.31954298350686444, "grad_norm": 4.690192301372359, "learning_rate": 4.7632587464457906e-05, "loss": 1.2884, "step": 867 }, { "epoch": 0.31991154519487697, "grad_norm": 7.625264647847118, "learning_rate": 4.762949684757078e-05, "loss": 1.1583, "step": 868 }, { "epoch": 0.32028010688288955, "grad_norm": 4.844127812571254, "learning_rate": 4.762640623068365e-05, "loss": 0.8952, "step": 869 }, { "epoch": 0.3206486685709021, "grad_norm": 6.150345347562778, "learning_rate": 4.762331561379651e-05, "loss": 1.1837, "step": 870 }, { "epoch": 0.3210172302589146, "grad_norm": 6.4865621899577475, "learning_rate": 4.7620224996909384e-05, "loss": 1.709, "step": 871 }, { "epoch": 0.3213857919469271, "grad_norm": 6.541633784055575, "learning_rate": 4.7617134380022255e-05, "loss": 1.2724, "step": 872 }, { "epoch": 0.32175435363493965, "grad_norm": 5.605097167562084, "learning_rate": 4.7614043763135126e-05, "loss": 1.0786, "step": 873 }, { "epoch": 0.3221229153229522, "grad_norm": 4.914520873022817, "learning_rate": 4.7610953146248e-05, "loss": 1.1722, "step": 874 }, { "epoch": 0.3224914770109647, "grad_norm": 5.621721117374394, "learning_rate": 4.760786252936086e-05, "loss": 1.2221, "step": 875 }, { "epoch": 0.3228600386989772, "grad_norm": 4.321209943661296, "learning_rate": 4.760477191247373e-05, "loss": 0.9247, "step": 876 }, { "epoch": 0.32322860038698975, "grad_norm": 4.843998465779547, "learning_rate": 4.7601681295586604e-05, "loss": 1.2594, "step": 877 }, { "epoch": 0.3235971620750023, "grad_norm": 5.544370101773461, "learning_rate": 4.7598590678699475e-05, "loss": 1.0467, "step": 878 }, { "epoch": 0.32396572376301486, "grad_norm": 7.742256972600765, "learning_rate": 4.759550006181234e-05, "loss": 0.8665, "step": 879 }, { "epoch": 0.3243342854510274, "grad_norm": 7.876082815173708, "learning_rate": 4.7592409444925204e-05, "loss": 1.206, "step": 880 }, { "epoch": 0.3247028471390399, "grad_norm": 6.918036022753089, "learning_rate": 4.7589318828038075e-05, "loss": 1.1073, "step": 881 }, { "epoch": 0.32507140882705243, "grad_norm": 5.185470735669837, "learning_rate": 4.7586228211150947e-05, "loss": 1.3207, "step": 882 }, { "epoch": 0.32543997051506496, "grad_norm": 10.367796085895543, "learning_rate": 4.758313759426382e-05, "loss": 1.3698, "step": 883 }, { "epoch": 0.3258085322030775, "grad_norm": 7.100332289965709, "learning_rate": 4.758004697737668e-05, "loss": 1.1728, "step": 884 }, { "epoch": 0.32617709389109, "grad_norm": 4.887416579371202, "learning_rate": 4.757695636048955e-05, "loss": 1.1437, "step": 885 }, { "epoch": 0.32654565557910253, "grad_norm": 5.350147009995562, "learning_rate": 4.7573865743602424e-05, "loss": 1.2246, "step": 886 }, { "epoch": 0.32691421726711506, "grad_norm": 8.761595100554162, "learning_rate": 4.7570775126715296e-05, "loss": 0.9493, "step": 887 }, { "epoch": 0.32728277895512764, "grad_norm": 5.225715420308758, "learning_rate": 4.756768450982817e-05, "loss": 1.2121, "step": 888 }, { "epoch": 0.32765134064314017, "grad_norm": 4.98482556832787, "learning_rate": 4.756459389294103e-05, "loss": 1.2855, "step": 889 }, { "epoch": 0.3280199023311527, "grad_norm": 4.1272692362961925, "learning_rate": 4.75615032760539e-05, "loss": 1.0122, "step": 890 }, { "epoch": 0.3283884640191652, "grad_norm": 5.239593000257711, "learning_rate": 4.7558412659166774e-05, "loss": 1.1326, "step": 891 }, { "epoch": 0.32875702570717774, "grad_norm": 6.716408112931578, "learning_rate": 4.7555322042279645e-05, "loss": 1.1535, "step": 892 }, { "epoch": 0.32912558739519027, "grad_norm": 5.000546425525215, "learning_rate": 4.7552231425392516e-05, "loss": 1.1662, "step": 893 }, { "epoch": 0.3294941490832028, "grad_norm": 5.62298853195679, "learning_rate": 4.754914080850538e-05, "loss": 1.5371, "step": 894 }, { "epoch": 0.3298627107712153, "grad_norm": 4.645305700518485, "learning_rate": 4.7546050191618245e-05, "loss": 1.1725, "step": 895 }, { "epoch": 0.33023127245922784, "grad_norm": 4.993279327249919, "learning_rate": 4.7542959574731116e-05, "loss": 1.4147, "step": 896 }, { "epoch": 0.33059983414724037, "grad_norm": 3.917835845670802, "learning_rate": 4.753986895784399e-05, "loss": 1.0227, "step": 897 }, { "epoch": 0.33096839583525295, "grad_norm": 7.155724918475187, "learning_rate": 4.753677834095686e-05, "loss": 1.528, "step": 898 }, { "epoch": 0.3313369575232655, "grad_norm": 7.880979175437014, "learning_rate": 4.753368772406972e-05, "loss": 1.0303, "step": 899 }, { "epoch": 0.331705519211278, "grad_norm": 6.7891577394607125, "learning_rate": 4.7530597107182594e-05, "loss": 1.4549, "step": 900 }, { "epoch": 0.3320740808992905, "grad_norm": 4.430872521510846, "learning_rate": 4.7527506490295465e-05, "loss": 1.1053, "step": 901 }, { "epoch": 0.33244264258730305, "grad_norm": 10.946002554532322, "learning_rate": 4.7524415873408336e-05, "loss": 1.2007, "step": 902 }, { "epoch": 0.3328112042753156, "grad_norm": 5.055693017863045, "learning_rate": 4.75213252565212e-05, "loss": 1.4705, "step": 903 }, { "epoch": 0.3331797659633281, "grad_norm": 5.216667262230657, "learning_rate": 4.751823463963407e-05, "loss": 1.1234, "step": 904 }, { "epoch": 0.3335483276513406, "grad_norm": 5.264864812928542, "learning_rate": 4.751514402274694e-05, "loss": 1.143, "step": 905 }, { "epoch": 0.33391688933935315, "grad_norm": 9.714897545211933, "learning_rate": 4.7512053405859814e-05, "loss": 1.2114, "step": 906 }, { "epoch": 0.33428545102736573, "grad_norm": 9.663676051413363, "learning_rate": 4.7508962788972686e-05, "loss": 1.1775, "step": 907 }, { "epoch": 0.33465401271537826, "grad_norm": 4.486505407767361, "learning_rate": 4.750587217208555e-05, "loss": 1.2147, "step": 908 }, { "epoch": 0.3350225744033908, "grad_norm": 8.024162044330353, "learning_rate": 4.750278155519842e-05, "loss": 0.9803, "step": 909 }, { "epoch": 0.3353911360914033, "grad_norm": 4.2798464068367315, "learning_rate": 4.7499690938311286e-05, "loss": 1.137, "step": 910 }, { "epoch": 0.33575969777941583, "grad_norm": 5.715458824449233, "learning_rate": 4.749660032142416e-05, "loss": 1.573, "step": 911 }, { "epoch": 0.33612825946742836, "grad_norm": 6.74292044783056, "learning_rate": 4.749350970453703e-05, "loss": 1.2633, "step": 912 }, { "epoch": 0.3364968211554409, "grad_norm": 4.295314436072909, "learning_rate": 4.749041908764989e-05, "loss": 1.1515, "step": 913 }, { "epoch": 0.3368653828434534, "grad_norm": 4.919869925787665, "learning_rate": 4.7487328470762764e-05, "loss": 1.219, "step": 914 }, { "epoch": 0.33723394453146593, "grad_norm": 5.936192418725063, "learning_rate": 4.7484237853875635e-05, "loss": 1.0811, "step": 915 }, { "epoch": 0.33760250621947846, "grad_norm": 7.589945275633374, "learning_rate": 4.7481147236988506e-05, "loss": 0.9747, "step": 916 }, { "epoch": 0.33797106790749104, "grad_norm": 4.220405762315276, "learning_rate": 4.747805662010138e-05, "loss": 1.0365, "step": 917 }, { "epoch": 0.33833962959550357, "grad_norm": 4.580447080021869, "learning_rate": 4.747496600321424e-05, "loss": 0.8848, "step": 918 }, { "epoch": 0.3387081912835161, "grad_norm": 5.790363778132205, "learning_rate": 4.747187538632711e-05, "loss": 1.0983, "step": 919 }, { "epoch": 0.3390767529715286, "grad_norm": 5.499263020470156, "learning_rate": 4.7468784769439984e-05, "loss": 0.8851, "step": 920 }, { "epoch": 0.33944531465954114, "grad_norm": 6.203637243687733, "learning_rate": 4.7465694152552855e-05, "loss": 1.299, "step": 921 }, { "epoch": 0.33981387634755367, "grad_norm": 7.53019282728625, "learning_rate": 4.746260353566572e-05, "loss": 0.8827, "step": 922 }, { "epoch": 0.3401824380355662, "grad_norm": 5.999705307399253, "learning_rate": 4.745951291877859e-05, "loss": 1.313, "step": 923 }, { "epoch": 0.3405509997235787, "grad_norm": 5.767976606937965, "learning_rate": 4.745642230189146e-05, "loss": 1.4471, "step": 924 }, { "epoch": 0.34091956141159124, "grad_norm": 6.473983343131834, "learning_rate": 4.745333168500433e-05, "loss": 1.2997, "step": 925 }, { "epoch": 0.34128812309960377, "grad_norm": 4.700047090984377, "learning_rate": 4.74502410681172e-05, "loss": 1.3902, "step": 926 }, { "epoch": 0.34165668478761635, "grad_norm": 5.9961204702173365, "learning_rate": 4.744715045123007e-05, "loss": 0.9745, "step": 927 }, { "epoch": 0.3420252464756289, "grad_norm": 10.367781368399397, "learning_rate": 4.744405983434293e-05, "loss": 1.1483, "step": 928 }, { "epoch": 0.3423938081636414, "grad_norm": 5.020373986891984, "learning_rate": 4.7440969217455804e-05, "loss": 1.2569, "step": 929 }, { "epoch": 0.3427623698516539, "grad_norm": 5.908197314046729, "learning_rate": 4.7437878600568676e-05, "loss": 1.0721, "step": 930 }, { "epoch": 0.34313093153966645, "grad_norm": 3.8631215308395532, "learning_rate": 4.743478798368155e-05, "loss": 1.0458, "step": 931 }, { "epoch": 0.343499493227679, "grad_norm": 10.482278310049583, "learning_rate": 4.743169736679441e-05, "loss": 1.2537, "step": 932 }, { "epoch": 0.3438680549156915, "grad_norm": 4.372828353322045, "learning_rate": 4.742860674990728e-05, "loss": 1.2439, "step": 933 }, { "epoch": 0.344236616603704, "grad_norm": 5.730130571139255, "learning_rate": 4.7425516133020154e-05, "loss": 1.1897, "step": 934 }, { "epoch": 0.34460517829171655, "grad_norm": 7.417628565215041, "learning_rate": 4.7422425516133025e-05, "loss": 0.874, "step": 935 }, { "epoch": 0.34497373997972913, "grad_norm": 6.940330683940169, "learning_rate": 4.7419334899245896e-05, "loss": 1.4085, "step": 936 }, { "epoch": 0.34534230166774166, "grad_norm": 6.348073604068089, "learning_rate": 4.741624428235876e-05, "loss": 1.1477, "step": 937 }, { "epoch": 0.3457108633557542, "grad_norm": 6.306155198200331, "learning_rate": 4.741315366547163e-05, "loss": 1.3654, "step": 938 }, { "epoch": 0.3460794250437667, "grad_norm": 4.967035729608655, "learning_rate": 4.74100630485845e-05, "loss": 1.0143, "step": 939 }, { "epoch": 0.34644798673177923, "grad_norm": 4.999068364133196, "learning_rate": 4.7406972431697374e-05, "loss": 1.0416, "step": 940 }, { "epoch": 0.34681654841979176, "grad_norm": 5.25038327225882, "learning_rate": 4.740388181481024e-05, "loss": 1.1947, "step": 941 }, { "epoch": 0.3471851101078043, "grad_norm": 8.38226504284702, "learning_rate": 4.74007911979231e-05, "loss": 0.998, "step": 942 }, { "epoch": 0.3475536717958168, "grad_norm": 8.5974769972227, "learning_rate": 4.7397700581035974e-05, "loss": 1.0285, "step": 943 }, { "epoch": 0.34792223348382934, "grad_norm": 7.69491889063156, "learning_rate": 4.7394609964148845e-05, "loss": 1.3363, "step": 944 }, { "epoch": 0.34829079517184186, "grad_norm": 6.568253674253584, "learning_rate": 4.7391519347261716e-05, "loss": 0.9319, "step": 945 }, { "epoch": 0.34865935685985444, "grad_norm": 8.363614214238327, "learning_rate": 4.738842873037459e-05, "loss": 1.379, "step": 946 }, { "epoch": 0.34902791854786697, "grad_norm": 7.136795720028286, "learning_rate": 4.738533811348745e-05, "loss": 1.1785, "step": 947 }, { "epoch": 0.3493964802358795, "grad_norm": 4.529433103209517, "learning_rate": 4.738224749660032e-05, "loss": 1.1816, "step": 948 }, { "epoch": 0.349765041923892, "grad_norm": 4.235205312028471, "learning_rate": 4.7379156879713194e-05, "loss": 1.2251, "step": 949 }, { "epoch": 0.35013360361190454, "grad_norm": 4.37393088220478, "learning_rate": 4.7376066262826066e-05, "loss": 0.6988, "step": 950 }, { "epoch": 0.35050216529991707, "grad_norm": 4.10045255628959, "learning_rate": 4.737297564593893e-05, "loss": 1.1939, "step": 951 }, { "epoch": 0.3508707269879296, "grad_norm": 4.730886451934114, "learning_rate": 4.73698850290518e-05, "loss": 1.3771, "step": 952 }, { "epoch": 0.3512392886759421, "grad_norm": 5.1584101920289855, "learning_rate": 4.736679441216467e-05, "loss": 1.2602, "step": 953 }, { "epoch": 0.35160785036395464, "grad_norm": 4.655177869830095, "learning_rate": 4.7363703795277544e-05, "loss": 1.0143, "step": 954 }, { "epoch": 0.3519764120519672, "grad_norm": 5.38440715051795, "learning_rate": 4.7360613178390415e-05, "loss": 1.221, "step": 955 }, { "epoch": 0.35234497373997975, "grad_norm": 3.9749512771403435, "learning_rate": 4.735752256150328e-05, "loss": 0.9819, "step": 956 }, { "epoch": 0.3527135354279923, "grad_norm": 5.233287368054208, "learning_rate": 4.7354431944616144e-05, "loss": 1.1364, "step": 957 }, { "epoch": 0.3530820971160048, "grad_norm": 8.33579106010079, "learning_rate": 4.7351341327729015e-05, "loss": 1.0368, "step": 958 }, { "epoch": 0.3534506588040173, "grad_norm": 6.844378995513655, "learning_rate": 4.7348250710841886e-05, "loss": 1.4082, "step": 959 }, { "epoch": 0.35381922049202985, "grad_norm": 6.818764981848659, "learning_rate": 4.734516009395476e-05, "loss": 1.1101, "step": 960 }, { "epoch": 0.3541877821800424, "grad_norm": 4.528980397052923, "learning_rate": 4.734206947706762e-05, "loss": 0.653, "step": 961 }, { "epoch": 0.3545563438680549, "grad_norm": 6.978811074682656, "learning_rate": 4.733897886018049e-05, "loss": 1.0145, "step": 962 }, { "epoch": 0.3549249055560674, "grad_norm": 7.525190139345369, "learning_rate": 4.7335888243293364e-05, "loss": 1.0948, "step": 963 }, { "epoch": 0.35529346724407995, "grad_norm": 9.723488783911842, "learning_rate": 4.7332797626406235e-05, "loss": 1.3272, "step": 964 }, { "epoch": 0.35566202893209253, "grad_norm": 7.132145561132062, "learning_rate": 4.7329707009519106e-05, "loss": 0.9832, "step": 965 }, { "epoch": 0.35603059062010506, "grad_norm": 6.635239084243922, "learning_rate": 4.732661639263197e-05, "loss": 0.848, "step": 966 }, { "epoch": 0.3563991523081176, "grad_norm": 10.2546677544616, "learning_rate": 4.732352577574484e-05, "loss": 1.1726, "step": 967 }, { "epoch": 0.3567677139961301, "grad_norm": 10.898185130926581, "learning_rate": 4.732043515885771e-05, "loss": 1.3139, "step": 968 }, { "epoch": 0.35713627568414263, "grad_norm": 8.673803276605367, "learning_rate": 4.7317344541970584e-05, "loss": 1.0735, "step": 969 }, { "epoch": 0.35750483737215516, "grad_norm": 11.55024028049241, "learning_rate": 4.731425392508345e-05, "loss": 1.0472, "step": 970 }, { "epoch": 0.3578733990601677, "grad_norm": 7.0408544429366735, "learning_rate": 4.731116330819631e-05, "loss": 1.1578, "step": 971 }, { "epoch": 0.3582419607481802, "grad_norm": 8.339221196757945, "learning_rate": 4.7308072691309184e-05, "loss": 1.4005, "step": 972 }, { "epoch": 0.35861052243619274, "grad_norm": 7.838002808426541, "learning_rate": 4.7304982074422056e-05, "loss": 1.0698, "step": 973 }, { "epoch": 0.3589790841242053, "grad_norm": 10.538774377547865, "learning_rate": 4.730189145753493e-05, "loss": 1.1248, "step": 974 }, { "epoch": 0.35934764581221784, "grad_norm": 6.818794352442258, "learning_rate": 4.729880084064779e-05, "loss": 1.3875, "step": 975 }, { "epoch": 0.35971620750023037, "grad_norm": 8.982381742204865, "learning_rate": 4.729571022376066e-05, "loss": 1.0141, "step": 976 }, { "epoch": 0.3600847691882429, "grad_norm": 8.677808007016555, "learning_rate": 4.7292619606873534e-05, "loss": 0.9983, "step": 977 }, { "epoch": 0.3604533308762554, "grad_norm": 10.260219782507175, "learning_rate": 4.7289528989986405e-05, "loss": 1.8314, "step": 978 }, { "epoch": 0.36082189256426794, "grad_norm": 7.823180199733354, "learning_rate": 4.7286438373099276e-05, "loss": 1.0102, "step": 979 }, { "epoch": 0.36119045425228047, "grad_norm": 8.833349359845672, "learning_rate": 4.728334775621214e-05, "loss": 1.4493, "step": 980 }, { "epoch": 0.361559015940293, "grad_norm": 6.117489316468757, "learning_rate": 4.728025713932501e-05, "loss": 0.7661, "step": 981 }, { "epoch": 0.3619275776283055, "grad_norm": 6.42184648716298, "learning_rate": 4.727716652243788e-05, "loss": 1.0766, "step": 982 }, { "epoch": 0.36229613931631804, "grad_norm": 3.99121380482219, "learning_rate": 4.7274075905550754e-05, "loss": 0.8369, "step": 983 }, { "epoch": 0.3626647010043306, "grad_norm": 8.609503051027076, "learning_rate": 4.7270985288663625e-05, "loss": 0.8298, "step": 984 }, { "epoch": 0.36303326269234315, "grad_norm": 8.122064148071635, "learning_rate": 4.726789467177649e-05, "loss": 1.3074, "step": 985 }, { "epoch": 0.3634018243803557, "grad_norm": 6.404667654237062, "learning_rate": 4.7264804054889354e-05, "loss": 1.1537, "step": 986 }, { "epoch": 0.3637703860683682, "grad_norm": 5.165012012626512, "learning_rate": 4.7261713438002225e-05, "loss": 1.0269, "step": 987 }, { "epoch": 0.3641389477563807, "grad_norm": 7.292713751316623, "learning_rate": 4.7258622821115096e-05, "loss": 1.3841, "step": 988 }, { "epoch": 0.36450750944439325, "grad_norm": 4.857042936691747, "learning_rate": 4.725553220422797e-05, "loss": 1.0782, "step": 989 }, { "epoch": 0.3648760711324058, "grad_norm": 5.41060255943387, "learning_rate": 4.725244158734083e-05, "loss": 1.0797, "step": 990 }, { "epoch": 0.3652446328204183, "grad_norm": 4.685639686190012, "learning_rate": 4.72493509704537e-05, "loss": 1.0163, "step": 991 }, { "epoch": 0.3656131945084308, "grad_norm": 7.7885168988324365, "learning_rate": 4.7246260353566574e-05, "loss": 1.165, "step": 992 }, { "epoch": 0.36598175619644335, "grad_norm": 4.92716477738773, "learning_rate": 4.7243169736679446e-05, "loss": 1.1255, "step": 993 }, { "epoch": 0.36635031788445593, "grad_norm": 6.976340500737228, "learning_rate": 4.724007911979231e-05, "loss": 1.1921, "step": 994 }, { "epoch": 0.36671887957246846, "grad_norm": 5.6301842429333515, "learning_rate": 4.723698850290518e-05, "loss": 1.2671, "step": 995 }, { "epoch": 0.367087441260481, "grad_norm": 4.548444444290044, "learning_rate": 4.723389788601805e-05, "loss": 1.0388, "step": 996 }, { "epoch": 0.3674560029484935, "grad_norm": 8.71233698119099, "learning_rate": 4.7230807269130924e-05, "loss": 1.1237, "step": 997 }, { "epoch": 0.36782456463650604, "grad_norm": 6.090028222931608, "learning_rate": 4.7227716652243795e-05, "loss": 1.3744, "step": 998 }, { "epoch": 0.36819312632451856, "grad_norm": 6.209532380857356, "learning_rate": 4.722462603535666e-05, "loss": 1.2114, "step": 999 }, { "epoch": 0.3685616880125311, "grad_norm": 9.299115489612985, "learning_rate": 4.722153541846953e-05, "loss": 1.5144, "step": 1000 }, { "epoch": 0.3689302497005436, "grad_norm": 6.7486144162566175, "learning_rate": 4.7218444801582395e-05, "loss": 1.2359, "step": 1001 }, { "epoch": 0.36929881138855614, "grad_norm": 4.516499470974559, "learning_rate": 4.7215354184695266e-05, "loss": 1.0927, "step": 1002 }, { "epoch": 0.3696673730765687, "grad_norm": 5.530972058915495, "learning_rate": 4.721226356780814e-05, "loss": 1.3929, "step": 1003 }, { "epoch": 0.37003593476458124, "grad_norm": 5.112337985654453, "learning_rate": 4.7209172950921e-05, "loss": 0.9383, "step": 1004 }, { "epoch": 0.37040449645259377, "grad_norm": 5.3807712886959695, "learning_rate": 4.720608233403387e-05, "loss": 1.1229, "step": 1005 }, { "epoch": 0.3707730581406063, "grad_norm": 5.86331247909738, "learning_rate": 4.7202991717146744e-05, "loss": 0.9278, "step": 1006 }, { "epoch": 0.3711416198286188, "grad_norm": 11.056870574527652, "learning_rate": 4.7199901100259615e-05, "loss": 1.0022, "step": 1007 }, { "epoch": 0.37151018151663134, "grad_norm": 5.7283331617292985, "learning_rate": 4.7196810483372486e-05, "loss": 1.3716, "step": 1008 }, { "epoch": 0.37187874320464387, "grad_norm": 6.9165007176848015, "learning_rate": 4.719371986648535e-05, "loss": 1.317, "step": 1009 }, { "epoch": 0.3722473048926564, "grad_norm": 5.311026694257819, "learning_rate": 4.719062924959822e-05, "loss": 1.3528, "step": 1010 }, { "epoch": 0.3726158665806689, "grad_norm": 6.711801457775233, "learning_rate": 4.718753863271109e-05, "loss": 1.1772, "step": 1011 }, { "epoch": 0.37298442826868144, "grad_norm": 7.82664637201766, "learning_rate": 4.7184448015823964e-05, "loss": 1.4763, "step": 1012 }, { "epoch": 0.373352989956694, "grad_norm": 4.5455702125397766, "learning_rate": 4.718135739893683e-05, "loss": 1.2133, "step": 1013 }, { "epoch": 0.37372155164470655, "grad_norm": 5.31863838553845, "learning_rate": 4.71782667820497e-05, "loss": 1.2828, "step": 1014 }, { "epoch": 0.3740901133327191, "grad_norm": 4.411597936133534, "learning_rate": 4.717517616516257e-05, "loss": 0.8584, "step": 1015 }, { "epoch": 0.3744586750207316, "grad_norm": 5.8095291913414835, "learning_rate": 4.7172085548275436e-05, "loss": 1.226, "step": 1016 }, { "epoch": 0.3748272367087441, "grad_norm": 5.381555507374711, "learning_rate": 4.716899493138831e-05, "loss": 1.06, "step": 1017 }, { "epoch": 0.37519579839675665, "grad_norm": 7.066969276042959, "learning_rate": 4.716590431450118e-05, "loss": 1.3013, "step": 1018 }, { "epoch": 0.3755643600847692, "grad_norm": 5.127066218990741, "learning_rate": 4.716281369761404e-05, "loss": 1.2634, "step": 1019 }, { "epoch": 0.3759329217727817, "grad_norm": 6.357244561179359, "learning_rate": 4.7159723080726914e-05, "loss": 1.1756, "step": 1020 }, { "epoch": 0.37630148346079423, "grad_norm": 6.867432469500121, "learning_rate": 4.7156632463839785e-05, "loss": 1.3066, "step": 1021 }, { "epoch": 0.3766700451488068, "grad_norm": 5.592550820472303, "learning_rate": 4.7153541846952656e-05, "loss": 1.3438, "step": 1022 }, { "epoch": 0.37703860683681933, "grad_norm": 6.872622824549984, "learning_rate": 4.715045123006552e-05, "loss": 1.0282, "step": 1023 }, { "epoch": 0.37740716852483186, "grad_norm": 5.610639934250082, "learning_rate": 4.714736061317839e-05, "loss": 1.0051, "step": 1024 }, { "epoch": 0.3777757302128444, "grad_norm": 7.618688253601561, "learning_rate": 4.714426999629126e-05, "loss": 1.1389, "step": 1025 }, { "epoch": 0.3781442919008569, "grad_norm": 7.229596069364586, "learning_rate": 4.7141179379404134e-05, "loss": 1.0351, "step": 1026 }, { "epoch": 0.37851285358886944, "grad_norm": 4.232103477510959, "learning_rate": 4.7138088762517005e-05, "loss": 0.9767, "step": 1027 }, { "epoch": 0.37888141527688196, "grad_norm": 10.994241767888372, "learning_rate": 4.713499814562987e-05, "loss": 1.2759, "step": 1028 }, { "epoch": 0.3792499769648945, "grad_norm": 6.868147330532904, "learning_rate": 4.713190752874274e-05, "loss": 1.053, "step": 1029 }, { "epoch": 0.379618538652907, "grad_norm": 4.400851358064852, "learning_rate": 4.712881691185561e-05, "loss": 0.9053, "step": 1030 }, { "epoch": 0.37998710034091954, "grad_norm": 5.538366879363418, "learning_rate": 4.7125726294968476e-05, "loss": 1.2485, "step": 1031 }, { "epoch": 0.3803556620289321, "grad_norm": 4.875943997268831, "learning_rate": 4.712263567808135e-05, "loss": 1.3192, "step": 1032 }, { "epoch": 0.38072422371694464, "grad_norm": 9.178478976298816, "learning_rate": 4.711954506119421e-05, "loss": 1.595, "step": 1033 }, { "epoch": 0.38109278540495717, "grad_norm": 3.7134633673518396, "learning_rate": 4.711645444430708e-05, "loss": 1.0071, "step": 1034 }, { "epoch": 0.3814613470929697, "grad_norm": 4.611596971726678, "learning_rate": 4.7113363827419954e-05, "loss": 1.1807, "step": 1035 }, { "epoch": 0.3818299087809822, "grad_norm": 4.956097694548621, "learning_rate": 4.7110273210532825e-05, "loss": 1.085, "step": 1036 }, { "epoch": 0.38219847046899474, "grad_norm": 3.930871864654584, "learning_rate": 4.71071825936457e-05, "loss": 0.9229, "step": 1037 }, { "epoch": 0.38256703215700727, "grad_norm": 14.642882279962636, "learning_rate": 4.710409197675856e-05, "loss": 1.1848, "step": 1038 }, { "epoch": 0.3829355938450198, "grad_norm": 8.353611294241158, "learning_rate": 4.710100135987143e-05, "loss": 1.4632, "step": 1039 }, { "epoch": 0.3833041555330323, "grad_norm": 5.199397771147846, "learning_rate": 4.7097910742984303e-05, "loss": 1.3044, "step": 1040 }, { "epoch": 0.38367271722104485, "grad_norm": 5.083480499305216, "learning_rate": 4.7094820126097175e-05, "loss": 1.268, "step": 1041 }, { "epoch": 0.3840412789090574, "grad_norm": 6.811945061435796, "learning_rate": 4.709172950921004e-05, "loss": 1.1201, "step": 1042 }, { "epoch": 0.38440984059706995, "grad_norm": 6.808595991106918, "learning_rate": 4.708863889232291e-05, "loss": 1.363, "step": 1043 }, { "epoch": 0.3847784022850825, "grad_norm": 6.488332988204313, "learning_rate": 4.708554827543578e-05, "loss": 1.0223, "step": 1044 }, { "epoch": 0.385146963973095, "grad_norm": 7.882978000926169, "learning_rate": 4.708245765854865e-05, "loss": 1.4796, "step": 1045 }, { "epoch": 0.3855155256611075, "grad_norm": 11.0004570605949, "learning_rate": 4.707936704166152e-05, "loss": 1.2165, "step": 1046 }, { "epoch": 0.38588408734912005, "grad_norm": 4.911697764812217, "learning_rate": 4.707627642477438e-05, "loss": 1.0966, "step": 1047 }, { "epoch": 0.3862526490371326, "grad_norm": 6.825930174994925, "learning_rate": 4.707318580788725e-05, "loss": 0.8968, "step": 1048 }, { "epoch": 0.3866212107251451, "grad_norm": 5.370664755595175, "learning_rate": 4.7070095191000124e-05, "loss": 1.1584, "step": 1049 }, { "epoch": 0.38698977241315763, "grad_norm": 5.298512073600644, "learning_rate": 4.7067004574112995e-05, "loss": 1.2907, "step": 1050 }, { "epoch": 0.3873583341011702, "grad_norm": 6.253445400434924, "learning_rate": 4.7063913957225866e-05, "loss": 1.0049, "step": 1051 }, { "epoch": 0.38772689578918274, "grad_norm": 5.92760984990703, "learning_rate": 4.706082334033873e-05, "loss": 0.9233, "step": 1052 }, { "epoch": 0.38809545747719526, "grad_norm": 10.49117889442014, "learning_rate": 4.70577327234516e-05, "loss": 1.3079, "step": 1053 }, { "epoch": 0.3884640191652078, "grad_norm": 5.671518866337368, "learning_rate": 4.705464210656447e-05, "loss": 1.1362, "step": 1054 }, { "epoch": 0.3888325808532203, "grad_norm": 6.36714016949433, "learning_rate": 4.7051551489677344e-05, "loss": 0.9573, "step": 1055 }, { "epoch": 0.38920114254123284, "grad_norm": 6.542785969928332, "learning_rate": 4.7048460872790215e-05, "loss": 1.2893, "step": 1056 }, { "epoch": 0.38956970422924536, "grad_norm": 5.310980725428133, "learning_rate": 4.704537025590308e-05, "loss": 1.2936, "step": 1057 }, { "epoch": 0.3899382659172579, "grad_norm": 5.395396368334355, "learning_rate": 4.704227963901595e-05, "loss": 1.2508, "step": 1058 }, { "epoch": 0.3903068276052704, "grad_norm": 9.577393745026129, "learning_rate": 4.703918902212882e-05, "loss": 1.0524, "step": 1059 }, { "epoch": 0.39067538929328294, "grad_norm": 5.619118900939932, "learning_rate": 4.7036098405241693e-05, "loss": 1.1922, "step": 1060 }, { "epoch": 0.3910439509812955, "grad_norm": 7.82314314093227, "learning_rate": 4.703300778835456e-05, "loss": 1.4206, "step": 1061 }, { "epoch": 0.39141251266930804, "grad_norm": 6.355069589301386, "learning_rate": 4.702991717146742e-05, "loss": 1.098, "step": 1062 }, { "epoch": 0.39178107435732057, "grad_norm": 9.991746166465505, "learning_rate": 4.7026826554580293e-05, "loss": 1.2562, "step": 1063 }, { "epoch": 0.3921496360453331, "grad_norm": 13.157468906940627, "learning_rate": 4.7023735937693165e-05, "loss": 1.4482, "step": 1064 }, { "epoch": 0.3925181977333456, "grad_norm": 6.6502064952483515, "learning_rate": 4.7020645320806036e-05, "loss": 1.1488, "step": 1065 }, { "epoch": 0.39288675942135814, "grad_norm": 6.643978932438565, "learning_rate": 4.70175547039189e-05, "loss": 1.0342, "step": 1066 }, { "epoch": 0.39325532110937067, "grad_norm": 13.233917264442503, "learning_rate": 4.701446408703177e-05, "loss": 1.121, "step": 1067 }, { "epoch": 0.3936238827973832, "grad_norm": 9.833257211509078, "learning_rate": 4.701137347014464e-05, "loss": 0.8879, "step": 1068 }, { "epoch": 0.3939924444853957, "grad_norm": 6.698527433353589, "learning_rate": 4.7008282853257514e-05, "loss": 1.2387, "step": 1069 }, { "epoch": 0.3943610061734083, "grad_norm": 4.864425536815489, "learning_rate": 4.7005192236370385e-05, "loss": 0.9638, "step": 1070 }, { "epoch": 0.3947295678614208, "grad_norm": 4.805976310249327, "learning_rate": 4.700210161948325e-05, "loss": 0.9403, "step": 1071 }, { "epoch": 0.39509812954943335, "grad_norm": 7.23866938761357, "learning_rate": 4.699901100259612e-05, "loss": 1.2882, "step": 1072 }, { "epoch": 0.3954666912374459, "grad_norm": 7.728619277640781, "learning_rate": 4.699592038570899e-05, "loss": 1.1152, "step": 1073 }, { "epoch": 0.3958352529254584, "grad_norm": 4.92781140013622, "learning_rate": 4.699282976882186e-05, "loss": 1.123, "step": 1074 }, { "epoch": 0.39620381461347093, "grad_norm": 5.693223023131386, "learning_rate": 4.698973915193473e-05, "loss": 0.8648, "step": 1075 }, { "epoch": 0.39657237630148345, "grad_norm": 4.812900675157918, "learning_rate": 4.69866485350476e-05, "loss": 1.1479, "step": 1076 }, { "epoch": 0.396940937989496, "grad_norm": 7.7648139391909385, "learning_rate": 4.698355791816046e-05, "loss": 1.1865, "step": 1077 }, { "epoch": 0.3973094996775085, "grad_norm": 5.680839579144496, "learning_rate": 4.6980467301273334e-05, "loss": 1.3449, "step": 1078 }, { "epoch": 0.39767806136552103, "grad_norm": 4.23963337086885, "learning_rate": 4.6977376684386205e-05, "loss": 1.0933, "step": 1079 }, { "epoch": 0.3980466230535336, "grad_norm": 6.213549716003099, "learning_rate": 4.6974286067499077e-05, "loss": 1.0472, "step": 1080 }, { "epoch": 0.39841518474154614, "grad_norm": 5.252911441598958, "learning_rate": 4.697119545061194e-05, "loss": 1.098, "step": 1081 }, { "epoch": 0.39878374642955866, "grad_norm": 8.609064833909263, "learning_rate": 4.696810483372481e-05, "loss": 1.2271, "step": 1082 }, { "epoch": 0.3991523081175712, "grad_norm": 6.887849763549897, "learning_rate": 4.6965014216837683e-05, "loss": 0.9606, "step": 1083 }, { "epoch": 0.3995208698055837, "grad_norm": 5.698693965729259, "learning_rate": 4.6961923599950555e-05, "loss": 0.9664, "step": 1084 }, { "epoch": 0.39988943149359624, "grad_norm": 7.8849620478072895, "learning_rate": 4.695883298306342e-05, "loss": 1.1946, "step": 1085 }, { "epoch": 0.40025799318160876, "grad_norm": 5.419176117049312, "learning_rate": 4.695574236617629e-05, "loss": 0.9888, "step": 1086 }, { "epoch": 0.4006265548696213, "grad_norm": 4.938392268814173, "learning_rate": 4.695265174928916e-05, "loss": 1.1591, "step": 1087 }, { "epoch": 0.4009951165576338, "grad_norm": 5.763155607384867, "learning_rate": 4.694956113240203e-05, "loss": 1.1367, "step": 1088 }, { "epoch": 0.40136367824564634, "grad_norm": 5.6067793001948765, "learning_rate": 4.6946470515514904e-05, "loss": 1.3895, "step": 1089 }, { "epoch": 0.4017322399336589, "grad_norm": 5.83582049435083, "learning_rate": 4.694337989862777e-05, "loss": 1.2332, "step": 1090 }, { "epoch": 0.40210080162167144, "grad_norm": 5.9580832831281745, "learning_rate": 4.694028928174064e-05, "loss": 1.1917, "step": 1091 }, { "epoch": 0.40246936330968397, "grad_norm": 5.585052377426173, "learning_rate": 4.6937198664853504e-05, "loss": 1.6024, "step": 1092 }, { "epoch": 0.4028379249976965, "grad_norm": 6.824040708091751, "learning_rate": 4.6934108047966375e-05, "loss": 1.3413, "step": 1093 }, { "epoch": 0.403206486685709, "grad_norm": 7.478877013255033, "learning_rate": 4.6931017431079246e-05, "loss": 0.9701, "step": 1094 }, { "epoch": 0.40357504837372155, "grad_norm": 5.072968197455752, "learning_rate": 4.692792681419211e-05, "loss": 1.1993, "step": 1095 }, { "epoch": 0.40394361006173407, "grad_norm": 6.067483957314236, "learning_rate": 4.692483619730498e-05, "loss": 1.1427, "step": 1096 }, { "epoch": 0.4043121717497466, "grad_norm": 26.430791341861696, "learning_rate": 4.692174558041785e-05, "loss": 1.2943, "step": 1097 }, { "epoch": 0.4046807334377591, "grad_norm": 8.524329428154864, "learning_rate": 4.6918654963530724e-05, "loss": 1.3079, "step": 1098 }, { "epoch": 0.4050492951257717, "grad_norm": 6.410157440204033, "learning_rate": 4.6915564346643595e-05, "loss": 1.1496, "step": 1099 }, { "epoch": 0.4054178568137842, "grad_norm": 4.275593382940562, "learning_rate": 4.691247372975646e-05, "loss": 1.0458, "step": 1100 }, { "epoch": 0.40578641850179675, "grad_norm": 5.195251556268489, "learning_rate": 4.690938311286933e-05, "loss": 1.343, "step": 1101 }, { "epoch": 0.4061549801898093, "grad_norm": 7.301162930181509, "learning_rate": 4.69062924959822e-05, "loss": 1.0825, "step": 1102 }, { "epoch": 0.4065235418778218, "grad_norm": 10.44365907877183, "learning_rate": 4.690320187909507e-05, "loss": 1.1703, "step": 1103 }, { "epoch": 0.40689210356583433, "grad_norm": 5.324086378639242, "learning_rate": 4.690011126220794e-05, "loss": 0.9876, "step": 1104 }, { "epoch": 0.40726066525384685, "grad_norm": 4.674219468180623, "learning_rate": 4.689702064532081e-05, "loss": 1.0732, "step": 1105 }, { "epoch": 0.4076292269418594, "grad_norm": 6.245598425677416, "learning_rate": 4.689393002843368e-05, "loss": 1.3049, "step": 1106 }, { "epoch": 0.4079977886298719, "grad_norm": 5.85451067618536, "learning_rate": 4.6890839411546545e-05, "loss": 1.2297, "step": 1107 }, { "epoch": 0.40836635031788443, "grad_norm": 11.864989870165951, "learning_rate": 4.6887748794659416e-05, "loss": 1.2661, "step": 1108 }, { "epoch": 0.408734912005897, "grad_norm": 7.656880006365828, "learning_rate": 4.688465817777229e-05, "loss": 1.5392, "step": 1109 }, { "epoch": 0.40910347369390954, "grad_norm": 6.398675376187185, "learning_rate": 4.688156756088515e-05, "loss": 1.1853, "step": 1110 }, { "epoch": 0.40947203538192206, "grad_norm": 10.393887528216704, "learning_rate": 4.687847694399802e-05, "loss": 1.1996, "step": 1111 }, { "epoch": 0.4098405970699346, "grad_norm": 11.413291785024914, "learning_rate": 4.6875386327110894e-05, "loss": 1.4932, "step": 1112 }, { "epoch": 0.4102091587579471, "grad_norm": 7.085836559110833, "learning_rate": 4.6872295710223765e-05, "loss": 1.2745, "step": 1113 }, { "epoch": 0.41057772044595964, "grad_norm": 6.836438876702736, "learning_rate": 4.686920509333663e-05, "loss": 1.2008, "step": 1114 }, { "epoch": 0.41094628213397216, "grad_norm": 7.545550559896417, "learning_rate": 4.68661144764495e-05, "loss": 1.3554, "step": 1115 }, { "epoch": 0.4113148438219847, "grad_norm": 9.413746834048938, "learning_rate": 4.686302385956237e-05, "loss": 1.1374, "step": 1116 }, { "epoch": 0.4116834055099972, "grad_norm": 7.830499772337032, "learning_rate": 4.685993324267524e-05, "loss": 1.2509, "step": 1117 }, { "epoch": 0.4120519671980098, "grad_norm": 7.329554827027515, "learning_rate": 4.6856842625788114e-05, "loss": 1.3417, "step": 1118 }, { "epoch": 0.4124205288860223, "grad_norm": 11.10440454472768, "learning_rate": 4.685375200890098e-05, "loss": 0.9119, "step": 1119 }, { "epoch": 0.41278909057403484, "grad_norm": 8.086828151431268, "learning_rate": 4.685066139201385e-05, "loss": 1.0392, "step": 1120 }, { "epoch": 0.41315765226204737, "grad_norm": 7.113693904062316, "learning_rate": 4.684757077512672e-05, "loss": 1.3752, "step": 1121 }, { "epoch": 0.4135262139500599, "grad_norm": 8.510073807034336, "learning_rate": 4.6844480158239585e-05, "loss": 1.1207, "step": 1122 }, { "epoch": 0.4138947756380724, "grad_norm": 3.76120178536585, "learning_rate": 4.6841389541352457e-05, "loss": 0.8952, "step": 1123 }, { "epoch": 0.41426333732608495, "grad_norm": 4.928326547355912, "learning_rate": 4.683829892446532e-05, "loss": 1.0164, "step": 1124 }, { "epoch": 0.41463189901409747, "grad_norm": 5.688784946710666, "learning_rate": 4.683520830757819e-05, "loss": 1.1541, "step": 1125 }, { "epoch": 0.41500046070211, "grad_norm": 7.370250142173136, "learning_rate": 4.683211769069106e-05, "loss": 0.8708, "step": 1126 }, { "epoch": 0.4153690223901225, "grad_norm": 5.9455858285560526, "learning_rate": 4.6829027073803935e-05, "loss": 1.1587, "step": 1127 }, { "epoch": 0.4157375840781351, "grad_norm": 15.149264458152642, "learning_rate": 4.6825936456916806e-05, "loss": 1.163, "step": 1128 }, { "epoch": 0.41610614576614763, "grad_norm": 5.662425005755112, "learning_rate": 4.682284584002967e-05, "loss": 1.254, "step": 1129 }, { "epoch": 0.41647470745416015, "grad_norm": 22.084986825135445, "learning_rate": 4.681975522314254e-05, "loss": 0.8963, "step": 1130 }, { "epoch": 0.4168432691421727, "grad_norm": 8.85397427218973, "learning_rate": 4.681666460625541e-05, "loss": 1.2122, "step": 1131 }, { "epoch": 0.4172118308301852, "grad_norm": 25.07239152722634, "learning_rate": 4.6813573989368284e-05, "loss": 1.5466, "step": 1132 }, { "epoch": 0.41758039251819773, "grad_norm": 10.115159515616508, "learning_rate": 4.681048337248115e-05, "loss": 1.0739, "step": 1133 }, { "epoch": 0.41794895420621025, "grad_norm": 5.6183021511199716, "learning_rate": 4.680739275559402e-05, "loss": 0.9985, "step": 1134 }, { "epoch": 0.4183175158942228, "grad_norm": 4.522458723311107, "learning_rate": 4.680430213870689e-05, "loss": 1.0793, "step": 1135 }, { "epoch": 0.4186860775822353, "grad_norm": 4.979903077557357, "learning_rate": 4.680121152181976e-05, "loss": 0.9835, "step": 1136 }, { "epoch": 0.41905463927024783, "grad_norm": 4.767374086682631, "learning_rate": 4.6798120904932626e-05, "loss": 1.0259, "step": 1137 }, { "epoch": 0.4194232009582604, "grad_norm": 14.134395209194043, "learning_rate": 4.679503028804549e-05, "loss": 1.0891, "step": 1138 }, { "epoch": 0.41979176264627294, "grad_norm": 12.012242747716936, "learning_rate": 4.679193967115836e-05, "loss": 0.8313, "step": 1139 }, { "epoch": 0.42016032433428546, "grad_norm": 4.671794571710498, "learning_rate": 4.678884905427123e-05, "loss": 1.3584, "step": 1140 }, { "epoch": 0.420528886022298, "grad_norm": 5.156345620858164, "learning_rate": 4.6785758437384104e-05, "loss": 1.2447, "step": 1141 }, { "epoch": 0.4208974477103105, "grad_norm": 4.913253935988038, "learning_rate": 4.6782667820496975e-05, "loss": 1.2203, "step": 1142 }, { "epoch": 0.42126600939832304, "grad_norm": 9.828508475726206, "learning_rate": 4.677957720360984e-05, "loss": 1.1779, "step": 1143 }, { "epoch": 0.42163457108633556, "grad_norm": 5.843003822055124, "learning_rate": 4.677648658672271e-05, "loss": 1.1248, "step": 1144 }, { "epoch": 0.4220031327743481, "grad_norm": 6.480539975686042, "learning_rate": 4.677339596983558e-05, "loss": 0.8592, "step": 1145 }, { "epoch": 0.4223716944623606, "grad_norm": 10.214054627143401, "learning_rate": 4.677030535294845e-05, "loss": 1.308, "step": 1146 }, { "epoch": 0.4227402561503732, "grad_norm": 5.267948351202071, "learning_rate": 4.676721473606132e-05, "loss": 1.0818, "step": 1147 }, { "epoch": 0.4231088178383857, "grad_norm": 6.145794505196025, "learning_rate": 4.676412411917419e-05, "loss": 1.2029, "step": 1148 }, { "epoch": 0.42347737952639825, "grad_norm": 5.024583463343259, "learning_rate": 4.676103350228706e-05, "loss": 0.9379, "step": 1149 }, { "epoch": 0.42384594121441077, "grad_norm": 6.37198163551802, "learning_rate": 4.675794288539993e-05, "loss": 1.0829, "step": 1150 }, { "epoch": 0.4242145029024233, "grad_norm": 6.158470411758102, "learning_rate": 4.67548522685128e-05, "loss": 1.3506, "step": 1151 }, { "epoch": 0.4245830645904358, "grad_norm": 4.866740345506032, "learning_rate": 4.675176165162567e-05, "loss": 1.024, "step": 1152 }, { "epoch": 0.42495162627844835, "grad_norm": 5.496214170566885, "learning_rate": 4.674867103473853e-05, "loss": 1.1978, "step": 1153 }, { "epoch": 0.42532018796646087, "grad_norm": 4.079051880720012, "learning_rate": 4.67455804178514e-05, "loss": 1.0954, "step": 1154 }, { "epoch": 0.4256887496544734, "grad_norm": 5.354735743736931, "learning_rate": 4.6742489800964274e-05, "loss": 1.0137, "step": 1155 }, { "epoch": 0.4260573113424859, "grad_norm": 6.339610586150635, "learning_rate": 4.6739399184077145e-05, "loss": 1.0208, "step": 1156 }, { "epoch": 0.4264258730304985, "grad_norm": 4.373877463062911, "learning_rate": 4.673630856719001e-05, "loss": 1.1526, "step": 1157 }, { "epoch": 0.42679443471851103, "grad_norm": 7.1565186562562495, "learning_rate": 4.673321795030288e-05, "loss": 1.0138, "step": 1158 }, { "epoch": 0.42716299640652355, "grad_norm": 8.352234373553086, "learning_rate": 4.673012733341575e-05, "loss": 1.2139, "step": 1159 }, { "epoch": 0.4275315580945361, "grad_norm": 5.661899506178682, "learning_rate": 4.672703671652862e-05, "loss": 1.1941, "step": 1160 }, { "epoch": 0.4279001197825486, "grad_norm": 6.529747018864597, "learning_rate": 4.6723946099641494e-05, "loss": 1.2003, "step": 1161 }, { "epoch": 0.42826868147056113, "grad_norm": 5.040500642646088, "learning_rate": 4.672085548275436e-05, "loss": 1.4841, "step": 1162 }, { "epoch": 0.42863724315857366, "grad_norm": 6.861186351011795, "learning_rate": 4.671776486586723e-05, "loss": 1.2556, "step": 1163 }, { "epoch": 0.4290058048465862, "grad_norm": 6.277854495109581, "learning_rate": 4.67146742489801e-05, "loss": 1.2795, "step": 1164 }, { "epoch": 0.4293743665345987, "grad_norm": 4.964036157026026, "learning_rate": 4.671158363209297e-05, "loss": 1.117, "step": 1165 }, { "epoch": 0.4297429282226113, "grad_norm": 4.755686919357831, "learning_rate": 4.6708493015205836e-05, "loss": 1.0267, "step": 1166 }, { "epoch": 0.4301114899106238, "grad_norm": 5.811947375863333, "learning_rate": 4.67054023983187e-05, "loss": 1.486, "step": 1167 }, { "epoch": 0.43048005159863634, "grad_norm": 5.156497278930929, "learning_rate": 4.670231178143157e-05, "loss": 1.171, "step": 1168 }, { "epoch": 0.43084861328664886, "grad_norm": 4.6253214543984456, "learning_rate": 4.669922116454444e-05, "loss": 1.2923, "step": 1169 }, { "epoch": 0.4312171749746614, "grad_norm": 4.768049980170439, "learning_rate": 4.6696130547657314e-05, "loss": 1.1558, "step": 1170 }, { "epoch": 0.4315857366626739, "grad_norm": 6.631021893718982, "learning_rate": 4.6693039930770186e-05, "loss": 1.1474, "step": 1171 }, { "epoch": 0.43195429835068644, "grad_norm": 5.820336094750833, "learning_rate": 4.668994931388305e-05, "loss": 0.9716, "step": 1172 }, { "epoch": 0.43232286003869896, "grad_norm": 5.17809205490802, "learning_rate": 4.668685869699592e-05, "loss": 1.3464, "step": 1173 }, { "epoch": 0.4326914217267115, "grad_norm": 5.006984029193974, "learning_rate": 4.668376808010879e-05, "loss": 1.0179, "step": 1174 }, { "epoch": 0.433059983414724, "grad_norm": 5.011254423638606, "learning_rate": 4.6680677463221664e-05, "loss": 1.3763, "step": 1175 }, { "epoch": 0.4334285451027366, "grad_norm": 7.154087589503028, "learning_rate": 4.667758684633453e-05, "loss": 1.06, "step": 1176 }, { "epoch": 0.4337971067907491, "grad_norm": 7.322445879183024, "learning_rate": 4.66744962294474e-05, "loss": 0.9474, "step": 1177 }, { "epoch": 0.43416566847876165, "grad_norm": 7.446404776252556, "learning_rate": 4.667140561256027e-05, "loss": 1.5706, "step": 1178 }, { "epoch": 0.43453423016677417, "grad_norm": 4.757583186694722, "learning_rate": 4.666831499567314e-05, "loss": 1.0276, "step": 1179 }, { "epoch": 0.4349027918547867, "grad_norm": 5.001822711595486, "learning_rate": 4.666522437878601e-05, "loss": 0.9421, "step": 1180 }, { "epoch": 0.4352713535427992, "grad_norm": 9.968269743158201, "learning_rate": 4.666213376189888e-05, "loss": 1.3707, "step": 1181 }, { "epoch": 0.43563991523081175, "grad_norm": 6.519416565797774, "learning_rate": 4.665904314501175e-05, "loss": 1.1428, "step": 1182 }, { "epoch": 0.4360084769188243, "grad_norm": 14.943512412830588, "learning_rate": 4.665595252812461e-05, "loss": 1.2346, "step": 1183 }, { "epoch": 0.4363770386068368, "grad_norm": 6.554253910532492, "learning_rate": 4.6652861911237484e-05, "loss": 1.3066, "step": 1184 }, { "epoch": 0.4367456002948493, "grad_norm": 14.675823878697736, "learning_rate": 4.6649771294350355e-05, "loss": 1.0941, "step": 1185 }, { "epoch": 0.4371141619828619, "grad_norm": 10.584859622822746, "learning_rate": 4.664668067746322e-05, "loss": 1.3355, "step": 1186 }, { "epoch": 0.43748272367087443, "grad_norm": 9.16249095713337, "learning_rate": 4.664359006057609e-05, "loss": 1.2638, "step": 1187 }, { "epoch": 0.43785128535888695, "grad_norm": 5.501583218284973, "learning_rate": 4.664049944368896e-05, "loss": 1.2336, "step": 1188 }, { "epoch": 0.4382198470468995, "grad_norm": 5.189646816694875, "learning_rate": 4.663740882680183e-05, "loss": 0.8693, "step": 1189 }, { "epoch": 0.438588408734912, "grad_norm": 10.09614992693591, "learning_rate": 4.6634318209914704e-05, "loss": 1.3382, "step": 1190 }, { "epoch": 0.43895697042292453, "grad_norm": 7.47442232789546, "learning_rate": 4.663122759302757e-05, "loss": 1.2773, "step": 1191 }, { "epoch": 0.43932553211093706, "grad_norm": 8.514743360324344, "learning_rate": 4.662813697614044e-05, "loss": 0.9897, "step": 1192 }, { "epoch": 0.4396940937989496, "grad_norm": 6.448629295726431, "learning_rate": 4.662504635925331e-05, "loss": 1.1815, "step": 1193 }, { "epoch": 0.4400626554869621, "grad_norm": 36.99043438955705, "learning_rate": 4.662195574236618e-05, "loss": 1.0529, "step": 1194 }, { "epoch": 0.4404312171749747, "grad_norm": 32.5185212399459, "learning_rate": 4.661886512547905e-05, "loss": 1.5799, "step": 1195 }, { "epoch": 0.4407997788629872, "grad_norm": 9.102329886506695, "learning_rate": 4.661577450859192e-05, "loss": 1.0655, "step": 1196 }, { "epoch": 0.44116834055099974, "grad_norm": 4.319433416016548, "learning_rate": 4.661268389170479e-05, "loss": 0.9367, "step": 1197 }, { "epoch": 0.44153690223901226, "grad_norm": 8.896768870656954, "learning_rate": 4.6609593274817654e-05, "loss": 0.8532, "step": 1198 }, { "epoch": 0.4419054639270248, "grad_norm": 31.359736038965533, "learning_rate": 4.6606502657930525e-05, "loss": 1.3511, "step": 1199 }, { "epoch": 0.4422740256150373, "grad_norm": 18.164836173039365, "learning_rate": 4.6603412041043396e-05, "loss": 1.1009, "step": 1200 }, { "epoch": 0.44264258730304984, "grad_norm": 16.661234669815325, "learning_rate": 4.660032142415626e-05, "loss": 1.1034, "step": 1201 }, { "epoch": 0.44301114899106236, "grad_norm": 5.988700399143634, "learning_rate": 4.659723080726913e-05, "loss": 1.1399, "step": 1202 }, { "epoch": 0.4433797106790749, "grad_norm": 6.236434252248411, "learning_rate": 4.6594140190382e-05, "loss": 0.9856, "step": 1203 }, { "epoch": 0.4437482723670874, "grad_norm": 17.107591231215896, "learning_rate": 4.6591049573494874e-05, "loss": 1.2533, "step": 1204 }, { "epoch": 0.4441168340551, "grad_norm": 9.4035149840459, "learning_rate": 4.658795895660774e-05, "loss": 1.0118, "step": 1205 }, { "epoch": 0.4444853957431125, "grad_norm": 9.318176587562192, "learning_rate": 4.658486833972061e-05, "loss": 1.1267, "step": 1206 }, { "epoch": 0.44485395743112505, "grad_norm": 8.395511390691485, "learning_rate": 4.658177772283348e-05, "loss": 1.2092, "step": 1207 }, { "epoch": 0.44522251911913757, "grad_norm": 5.58658998319913, "learning_rate": 4.657868710594635e-05, "loss": 1.1255, "step": 1208 }, { "epoch": 0.4455910808071501, "grad_norm": 8.018545113352127, "learning_rate": 4.657559648905922e-05, "loss": 1.3503, "step": 1209 }, { "epoch": 0.4459596424951626, "grad_norm": 4.556085964018177, "learning_rate": 4.657250587217209e-05, "loss": 1.0395, "step": 1210 }, { "epoch": 0.44632820418317515, "grad_norm": 5.224636390117071, "learning_rate": 4.656941525528496e-05, "loss": 1.0318, "step": 1211 }, { "epoch": 0.4466967658711877, "grad_norm": 13.126050925187856, "learning_rate": 4.656632463839783e-05, "loss": 1.2538, "step": 1212 }, { "epoch": 0.4470653275592002, "grad_norm": 16.37518170670892, "learning_rate": 4.6563234021510694e-05, "loss": 1.323, "step": 1213 }, { "epoch": 0.4474338892472128, "grad_norm": 5.242672484202244, "learning_rate": 4.6560143404623566e-05, "loss": 1.094, "step": 1214 }, { "epoch": 0.4478024509352253, "grad_norm": 5.567634184735835, "learning_rate": 4.655705278773643e-05, "loss": 0.9849, "step": 1215 }, { "epoch": 0.44817101262323783, "grad_norm": 13.32310385891917, "learning_rate": 4.65539621708493e-05, "loss": 1.0779, "step": 1216 }, { "epoch": 0.44853957431125036, "grad_norm": 5.345850838295316, "learning_rate": 4.655087155396217e-05, "loss": 1.0853, "step": 1217 }, { "epoch": 0.4489081359992629, "grad_norm": 6.22236934366833, "learning_rate": 4.6547780937075044e-05, "loss": 1.3036, "step": 1218 }, { "epoch": 0.4492766976872754, "grad_norm": 8.842504622587894, "learning_rate": 4.6544690320187915e-05, "loss": 1.1582, "step": 1219 }, { "epoch": 0.44964525937528793, "grad_norm": 6.124211786117517, "learning_rate": 4.654159970330078e-05, "loss": 1.3742, "step": 1220 }, { "epoch": 0.45001382106330046, "grad_norm": 11.183371352137275, "learning_rate": 4.653850908641365e-05, "loss": 0.7412, "step": 1221 }, { "epoch": 0.450382382751313, "grad_norm": 5.682333811164556, "learning_rate": 4.653541846952652e-05, "loss": 0.9229, "step": 1222 }, { "epoch": 0.4507509444393255, "grad_norm": 5.306638827865909, "learning_rate": 4.653232785263939e-05, "loss": 1.0158, "step": 1223 }, { "epoch": 0.4511195061273381, "grad_norm": 8.822936459233507, "learning_rate": 4.652923723575226e-05, "loss": 1.329, "step": 1224 }, { "epoch": 0.4514880678153506, "grad_norm": 7.007256969602093, "learning_rate": 4.652614661886513e-05, "loss": 1.0267, "step": 1225 }, { "epoch": 0.45185662950336314, "grad_norm": 10.12099839242516, "learning_rate": 4.6523056001978e-05, "loss": 1.0282, "step": 1226 }, { "epoch": 0.45222519119137566, "grad_norm": 12.386272190192305, "learning_rate": 4.651996538509087e-05, "loss": 0.843, "step": 1227 }, { "epoch": 0.4525937528793882, "grad_norm": 4.986424518016587, "learning_rate": 4.6516874768203735e-05, "loss": 1.096, "step": 1228 }, { "epoch": 0.4529623145674007, "grad_norm": 11.0442247030741, "learning_rate": 4.65137841513166e-05, "loss": 1.1938, "step": 1229 }, { "epoch": 0.45333087625541324, "grad_norm": 12.311087125470987, "learning_rate": 4.651069353442947e-05, "loss": 1.0314, "step": 1230 }, { "epoch": 0.45369943794342577, "grad_norm": 7.8914951562287134, "learning_rate": 4.650760291754234e-05, "loss": 1.2361, "step": 1231 }, { "epoch": 0.4540679996314383, "grad_norm": 7.673372514869399, "learning_rate": 4.650451230065521e-05, "loss": 1.2924, "step": 1232 }, { "epoch": 0.4544365613194508, "grad_norm": 6.835330330178363, "learning_rate": 4.6501421683768084e-05, "loss": 1.2354, "step": 1233 }, { "epoch": 0.4548051230074634, "grad_norm": 13.36520454863872, "learning_rate": 4.649833106688095e-05, "loss": 0.9497, "step": 1234 }, { "epoch": 0.4551736846954759, "grad_norm": 7.826415341624119, "learning_rate": 4.649524044999382e-05, "loss": 1.3615, "step": 1235 }, { "epoch": 0.45554224638348845, "grad_norm": 12.083700731326385, "learning_rate": 4.649214983310669e-05, "loss": 1.1162, "step": 1236 }, { "epoch": 0.455910808071501, "grad_norm": 16.816617213397976, "learning_rate": 4.648905921621956e-05, "loss": 1.2199, "step": 1237 }, { "epoch": 0.4562793697595135, "grad_norm": 9.786680426453662, "learning_rate": 4.648596859933243e-05, "loss": 0.9474, "step": 1238 }, { "epoch": 0.456647931447526, "grad_norm": 6.635812249561148, "learning_rate": 4.64828779824453e-05, "loss": 1.0187, "step": 1239 }, { "epoch": 0.45701649313553855, "grad_norm": 9.45318633761092, "learning_rate": 4.647978736555817e-05, "loss": 0.8198, "step": 1240 }, { "epoch": 0.4573850548235511, "grad_norm": 8.815128015723523, "learning_rate": 4.647669674867104e-05, "loss": 1.0844, "step": 1241 }, { "epoch": 0.4577536165115636, "grad_norm": 6.228750497786203, "learning_rate": 4.647360613178391e-05, "loss": 1.0344, "step": 1242 }, { "epoch": 0.4581221781995762, "grad_norm": 5.139885660821965, "learning_rate": 4.6470515514896776e-05, "loss": 1.0732, "step": 1243 }, { "epoch": 0.4584907398875887, "grad_norm": 4.307617519588799, "learning_rate": 4.646742489800964e-05, "loss": 1.0996, "step": 1244 }, { "epoch": 0.45885930157560123, "grad_norm": 5.913766427735915, "learning_rate": 4.646433428112251e-05, "loss": 1.1944, "step": 1245 }, { "epoch": 0.45922786326361376, "grad_norm": 20.41293005972065, "learning_rate": 4.646124366423538e-05, "loss": 1.2875, "step": 1246 }, { "epoch": 0.4595964249516263, "grad_norm": 6.581644670583806, "learning_rate": 4.6458153047348254e-05, "loss": 1.1819, "step": 1247 }, { "epoch": 0.4599649866396388, "grad_norm": 9.128339600513595, "learning_rate": 4.645506243046112e-05, "loss": 1.4247, "step": 1248 }, { "epoch": 0.46033354832765133, "grad_norm": 7.396106582393537, "learning_rate": 4.645197181357399e-05, "loss": 1.438, "step": 1249 }, { "epoch": 0.46070211001566386, "grad_norm": 27.406662001606698, "learning_rate": 4.644888119668686e-05, "loss": 1.0878, "step": 1250 }, { "epoch": 0.4610706717036764, "grad_norm": 5.324121307802379, "learning_rate": 4.644579057979973e-05, "loss": 1.0896, "step": 1251 }, { "epoch": 0.4614392333916889, "grad_norm": 6.052551916719909, "learning_rate": 4.64426999629126e-05, "loss": 0.8428, "step": 1252 }, { "epoch": 0.4618077950797015, "grad_norm": 5.680584066744851, "learning_rate": 4.643960934602547e-05, "loss": 1.0685, "step": 1253 }, { "epoch": 0.462176356767714, "grad_norm": 10.591557188591262, "learning_rate": 4.643651872913834e-05, "loss": 1.283, "step": 1254 }, { "epoch": 0.46254491845572654, "grad_norm": 6.537450156377419, "learning_rate": 4.643342811225121e-05, "loss": 1.1056, "step": 1255 }, { "epoch": 0.46291348014373906, "grad_norm": 9.444361737456358, "learning_rate": 4.643033749536408e-05, "loss": 1.1043, "step": 1256 }, { "epoch": 0.4632820418317516, "grad_norm": 9.171324315346482, "learning_rate": 4.6427246878476946e-05, "loss": 1.03, "step": 1257 }, { "epoch": 0.4636506035197641, "grad_norm": 6.473971558422065, "learning_rate": 4.642415626158981e-05, "loss": 1.3609, "step": 1258 }, { "epoch": 0.46401916520777664, "grad_norm": 6.938768425533713, "learning_rate": 4.642106564470268e-05, "loss": 1.3194, "step": 1259 }, { "epoch": 0.46438772689578917, "grad_norm": 66.62624954855444, "learning_rate": 4.641797502781555e-05, "loss": 0.8231, "step": 1260 }, { "epoch": 0.4647562885838017, "grad_norm": 32.4977838787701, "learning_rate": 4.6414884410928424e-05, "loss": 1.1665, "step": 1261 }, { "epoch": 0.46512485027181427, "grad_norm": 10.703752975418904, "learning_rate": 4.6411793794041295e-05, "loss": 1.1175, "step": 1262 }, { "epoch": 0.4654934119598268, "grad_norm": 7.616791856712153, "learning_rate": 4.640870317715416e-05, "loss": 1.1696, "step": 1263 }, { "epoch": 0.4658619736478393, "grad_norm": 34.45006186780094, "learning_rate": 4.640561256026703e-05, "loss": 1.0616, "step": 1264 }, { "epoch": 0.46623053533585185, "grad_norm": 10.805650250287716, "learning_rate": 4.64025219433799e-05, "loss": 1.3244, "step": 1265 }, { "epoch": 0.4665990970238644, "grad_norm": 8.13282657149359, "learning_rate": 4.639943132649277e-05, "loss": 1.2998, "step": 1266 }, { "epoch": 0.4669676587118769, "grad_norm": 11.685721685398542, "learning_rate": 4.639634070960564e-05, "loss": 1.0007, "step": 1267 }, { "epoch": 0.4673362203998894, "grad_norm": 7.963184043450414, "learning_rate": 4.639325009271851e-05, "loss": 1.0205, "step": 1268 }, { "epoch": 0.46770478208790195, "grad_norm": 8.370877290277688, "learning_rate": 4.639015947583138e-05, "loss": 1.1129, "step": 1269 }, { "epoch": 0.4680733437759145, "grad_norm": 10.46528153537907, "learning_rate": 4.638706885894425e-05, "loss": 1.1187, "step": 1270 }, { "epoch": 0.468441905463927, "grad_norm": 11.469218314689105, "learning_rate": 4.638397824205712e-05, "loss": 1.0041, "step": 1271 }, { "epoch": 0.4688104671519396, "grad_norm": 12.708857557813948, "learning_rate": 4.6380887625169986e-05, "loss": 1.1409, "step": 1272 }, { "epoch": 0.4691790288399521, "grad_norm": 7.144951442502787, "learning_rate": 4.637779700828285e-05, "loss": 1.3481, "step": 1273 }, { "epoch": 0.46954759052796463, "grad_norm": 11.954901828721765, "learning_rate": 4.637470639139572e-05, "loss": 0.9403, "step": 1274 }, { "epoch": 0.46991615221597716, "grad_norm": 15.687708636716447, "learning_rate": 4.637161577450859e-05, "loss": 0.99, "step": 1275 }, { "epoch": 0.4702847139039897, "grad_norm": 17.117712544720217, "learning_rate": 4.6368525157621464e-05, "loss": 0.6808, "step": 1276 }, { "epoch": 0.4706532755920022, "grad_norm": 13.553715405524247, "learning_rate": 4.636543454073433e-05, "loss": 1.208, "step": 1277 }, { "epoch": 0.47102183728001473, "grad_norm": 8.583171756544532, "learning_rate": 4.63623439238472e-05, "loss": 1.2442, "step": 1278 }, { "epoch": 0.47139039896802726, "grad_norm": 8.71656033491799, "learning_rate": 4.635925330696007e-05, "loss": 1.2792, "step": 1279 }, { "epoch": 0.4717589606560398, "grad_norm": 16.529439845070275, "learning_rate": 4.635616269007294e-05, "loss": 1.3671, "step": 1280 }, { "epoch": 0.4721275223440523, "grad_norm": 5.547425836425804, "learning_rate": 4.6353072073185814e-05, "loss": 0.9341, "step": 1281 }, { "epoch": 0.4724960840320649, "grad_norm": 16.96552865659235, "learning_rate": 4.634998145629868e-05, "loss": 0.765, "step": 1282 }, { "epoch": 0.4728646457200774, "grad_norm": 8.330218025275977, "learning_rate": 4.634689083941155e-05, "loss": 0.9801, "step": 1283 }, { "epoch": 0.47323320740808994, "grad_norm": 10.623839550599161, "learning_rate": 4.634380022252442e-05, "loss": 1.218, "step": 1284 }, { "epoch": 0.47360176909610247, "grad_norm": 5.978523600839992, "learning_rate": 4.634070960563729e-05, "loss": 1.5289, "step": 1285 }, { "epoch": 0.473970330784115, "grad_norm": 8.811903913101327, "learning_rate": 4.6337618988750156e-05, "loss": 1.0358, "step": 1286 }, { "epoch": 0.4743388924721275, "grad_norm": 8.705693576445972, "learning_rate": 4.633452837186303e-05, "loss": 0.7216, "step": 1287 }, { "epoch": 0.47470745416014004, "grad_norm": 13.954831966207193, "learning_rate": 4.633143775497589e-05, "loss": 1.2255, "step": 1288 }, { "epoch": 0.47507601584815257, "grad_norm": 8.069877620180339, "learning_rate": 4.632834713808876e-05, "loss": 1.351, "step": 1289 }, { "epoch": 0.4754445775361651, "grad_norm": 11.912813715291378, "learning_rate": 4.6325256521201634e-05, "loss": 1.0469, "step": 1290 }, { "epoch": 0.4758131392241777, "grad_norm": 9.33583453133593, "learning_rate": 4.6322165904314505e-05, "loss": 1.3929, "step": 1291 }, { "epoch": 0.4761817009121902, "grad_norm": 7.813342727985311, "learning_rate": 4.631907528742737e-05, "loss": 1.2098, "step": 1292 }, { "epoch": 0.4765502626002027, "grad_norm": 12.176589542468047, "learning_rate": 4.631598467054024e-05, "loss": 1.0419, "step": 1293 }, { "epoch": 0.47691882428821525, "grad_norm": 156.77624506163235, "learning_rate": 4.631289405365311e-05, "loss": 1.0115, "step": 1294 }, { "epoch": 0.4772873859762278, "grad_norm": 10.661866975800626, "learning_rate": 4.630980343676598e-05, "loss": 1.3579, "step": 1295 }, { "epoch": 0.4776559476642403, "grad_norm": 15.348157323520601, "learning_rate": 4.630671281987885e-05, "loss": 1.1924, "step": 1296 }, { "epoch": 0.4780245093522528, "grad_norm": 12.488672846339222, "learning_rate": 4.630362220299172e-05, "loss": 1.3406, "step": 1297 }, { "epoch": 0.47839307104026535, "grad_norm": 18.550160631990664, "learning_rate": 4.630053158610459e-05, "loss": 1.1332, "step": 1298 }, { "epoch": 0.4787616327282779, "grad_norm": 14.990181316070592, "learning_rate": 4.629744096921746e-05, "loss": 1.0598, "step": 1299 }, { "epoch": 0.4791301944162904, "grad_norm": 7.883612872833809, "learning_rate": 4.629435035233033e-05, "loss": 1.1195, "step": 1300 }, { "epoch": 0.479498756104303, "grad_norm": 14.43376342508523, "learning_rate": 4.62912597354432e-05, "loss": 1.0255, "step": 1301 }, { "epoch": 0.4798673177923155, "grad_norm": 10.727900655674778, "learning_rate": 4.628816911855607e-05, "loss": 1.1066, "step": 1302 }, { "epoch": 0.48023587948032803, "grad_norm": 11.43795158453625, "learning_rate": 4.628507850166894e-05, "loss": 1.063, "step": 1303 }, { "epoch": 0.48060444116834056, "grad_norm": 12.512379124197613, "learning_rate": 4.6281987884781804e-05, "loss": 0.9197, "step": 1304 }, { "epoch": 0.4809730028563531, "grad_norm": 4.934624715988693, "learning_rate": 4.6278897267894675e-05, "loss": 0.9761, "step": 1305 }, { "epoch": 0.4813415645443656, "grad_norm": 10.337976417495419, "learning_rate": 4.627580665100754e-05, "loss": 1.5531, "step": 1306 }, { "epoch": 0.48171012623237813, "grad_norm": 8.785352515593338, "learning_rate": 4.627271603412041e-05, "loss": 0.8896, "step": 1307 }, { "epoch": 0.48207868792039066, "grad_norm": 13.13538639913163, "learning_rate": 4.626962541723328e-05, "loss": 1.308, "step": 1308 }, { "epoch": 0.4824472496084032, "grad_norm": 4.788545913589295, "learning_rate": 4.626653480034615e-05, "loss": 0.9854, "step": 1309 }, { "epoch": 0.48281581129641576, "grad_norm": 6.207847968805985, "learning_rate": 4.626344418345902e-05, "loss": 1.2679, "step": 1310 }, { "epoch": 0.4831843729844283, "grad_norm": 6.155591280788003, "learning_rate": 4.626035356657189e-05, "loss": 1.4201, "step": 1311 }, { "epoch": 0.4835529346724408, "grad_norm": 6.268880478783703, "learning_rate": 4.625726294968476e-05, "loss": 1.1393, "step": 1312 }, { "epoch": 0.48392149636045334, "grad_norm": 4.357774009821839, "learning_rate": 4.625417233279763e-05, "loss": 1.2133, "step": 1313 }, { "epoch": 0.48429005804846587, "grad_norm": 18.28889980838574, "learning_rate": 4.62510817159105e-05, "loss": 0.9582, "step": 1314 }, { "epoch": 0.4846586197364784, "grad_norm": 5.453052771671997, "learning_rate": 4.6247991099023366e-05, "loss": 1.2876, "step": 1315 }, { "epoch": 0.4850271814244909, "grad_norm": 7.235521262793437, "learning_rate": 4.624490048213624e-05, "loss": 1.1955, "step": 1316 }, { "epoch": 0.48539574311250344, "grad_norm": 5.564871636047286, "learning_rate": 4.624180986524911e-05, "loss": 0.9287, "step": 1317 }, { "epoch": 0.48576430480051597, "grad_norm": 7.125381727198956, "learning_rate": 4.623871924836198e-05, "loss": 1.0375, "step": 1318 }, { "epoch": 0.4861328664885285, "grad_norm": 67.42364025788729, "learning_rate": 4.6235628631474844e-05, "loss": 0.7767, "step": 1319 }, { "epoch": 0.4865014281765411, "grad_norm": 6.40837664777638, "learning_rate": 4.623253801458771e-05, "loss": 1.1819, "step": 1320 }, { "epoch": 0.4868699898645536, "grad_norm": 5.871716170017928, "learning_rate": 4.622944739770058e-05, "loss": 0.8566, "step": 1321 }, { "epoch": 0.4872385515525661, "grad_norm": 5.100503575015902, "learning_rate": 4.622635678081345e-05, "loss": 1.2345, "step": 1322 }, { "epoch": 0.48760711324057865, "grad_norm": 7.6929783103296945, "learning_rate": 4.622326616392632e-05, "loss": 1.0857, "step": 1323 }, { "epoch": 0.4879756749285912, "grad_norm": 7.354283087856227, "learning_rate": 4.6220175547039193e-05, "loss": 0.904, "step": 1324 }, { "epoch": 0.4883442366166037, "grad_norm": 11.82658845906452, "learning_rate": 4.621708493015206e-05, "loss": 0.9724, "step": 1325 }, { "epoch": 0.4887127983046162, "grad_norm": 10.020367289681412, "learning_rate": 4.621399431326493e-05, "loss": 1.2988, "step": 1326 }, { "epoch": 0.48908135999262875, "grad_norm": 10.372750279663432, "learning_rate": 4.62109036963778e-05, "loss": 1.2283, "step": 1327 }, { "epoch": 0.4894499216806413, "grad_norm": 5.620664663684953, "learning_rate": 4.620781307949067e-05, "loss": 1.0589, "step": 1328 }, { "epoch": 0.4898184833686538, "grad_norm": 7.134595867385761, "learning_rate": 4.6204722462603536e-05, "loss": 1.1462, "step": 1329 }, { "epoch": 0.4901870450566664, "grad_norm": 6.235542971929963, "learning_rate": 4.620163184571641e-05, "loss": 1.2251, "step": 1330 }, { "epoch": 0.4905556067446789, "grad_norm": 5.883730630544918, "learning_rate": 4.619854122882928e-05, "loss": 0.9641, "step": 1331 }, { "epoch": 0.49092416843269143, "grad_norm": 5.497155060608712, "learning_rate": 4.619545061194215e-05, "loss": 0.9805, "step": 1332 }, { "epoch": 0.49129273012070396, "grad_norm": 5.757139334800965, "learning_rate": 4.619235999505502e-05, "loss": 1.0501, "step": 1333 }, { "epoch": 0.4916612918087165, "grad_norm": 8.304263515899764, "learning_rate": 4.6189269378167885e-05, "loss": 1.2334, "step": 1334 }, { "epoch": 0.492029853496729, "grad_norm": 6.66114619888059, "learning_rate": 4.618617876128075e-05, "loss": 1.0482, "step": 1335 }, { "epoch": 0.49239841518474153, "grad_norm": 6.448326709961154, "learning_rate": 4.618308814439362e-05, "loss": 1.3755, "step": 1336 }, { "epoch": 0.49276697687275406, "grad_norm": 5.817888940678931, "learning_rate": 4.617999752750649e-05, "loss": 0.9427, "step": 1337 }, { "epoch": 0.4931355385607666, "grad_norm": 5.901538725813412, "learning_rate": 4.617690691061936e-05, "loss": 1.0096, "step": 1338 }, { "epoch": 0.49350410024877917, "grad_norm": 5.5230141225202924, "learning_rate": 4.617381629373223e-05, "loss": 1.1417, "step": 1339 }, { "epoch": 0.4938726619367917, "grad_norm": 6.586168024954244, "learning_rate": 4.61707256768451e-05, "loss": 1.1861, "step": 1340 }, { "epoch": 0.4942412236248042, "grad_norm": 6.694176879150675, "learning_rate": 4.616763505995797e-05, "loss": 0.9246, "step": 1341 }, { "epoch": 0.49460978531281674, "grad_norm": 9.01075441172155, "learning_rate": 4.616454444307084e-05, "loss": 1.3423, "step": 1342 }, { "epoch": 0.49497834700082927, "grad_norm": 5.2706709322832905, "learning_rate": 4.616145382618371e-05, "loss": 1.2365, "step": 1343 }, { "epoch": 0.4953469086888418, "grad_norm": 5.3023158573908145, "learning_rate": 4.615836320929658e-05, "loss": 1.0818, "step": 1344 }, { "epoch": 0.4957154703768543, "grad_norm": 5.806013199788909, "learning_rate": 4.615527259240945e-05, "loss": 1.2948, "step": 1345 }, { "epoch": 0.49608403206486684, "grad_norm": 5.4047955447335, "learning_rate": 4.615218197552232e-05, "loss": 1.0738, "step": 1346 }, { "epoch": 0.49645259375287937, "grad_norm": 5.199673994555639, "learning_rate": 4.614909135863519e-05, "loss": 1.0257, "step": 1347 }, { "epoch": 0.4968211554408919, "grad_norm": 9.324691677365514, "learning_rate": 4.6146000741748055e-05, "loss": 1.0197, "step": 1348 }, { "epoch": 0.4971897171289045, "grad_norm": 4.828525366124083, "learning_rate": 4.614291012486092e-05, "loss": 1.0874, "step": 1349 }, { "epoch": 0.497558278816917, "grad_norm": 6.576023083128872, "learning_rate": 4.613981950797379e-05, "loss": 1.2664, "step": 1350 }, { "epoch": 0.4979268405049295, "grad_norm": 5.99027417607824, "learning_rate": 4.613672889108666e-05, "loss": 1.5817, "step": 1351 }, { "epoch": 0.49829540219294205, "grad_norm": 7.700531621638818, "learning_rate": 4.613363827419953e-05, "loss": 1.1818, "step": 1352 }, { "epoch": 0.4986639638809546, "grad_norm": 8.913387182734516, "learning_rate": 4.6130547657312404e-05, "loss": 1.0617, "step": 1353 }, { "epoch": 0.4990325255689671, "grad_norm": 6.050818446262365, "learning_rate": 4.612745704042527e-05, "loss": 0.8806, "step": 1354 }, { "epoch": 0.4994010872569796, "grad_norm": 12.657124656312337, "learning_rate": 4.612436642353814e-05, "loss": 1.4306, "step": 1355 }, { "epoch": 0.49976964894499215, "grad_norm": 21.5416041753909, "learning_rate": 4.612127580665101e-05, "loss": 1.3281, "step": 1356 }, { "epoch": 0.5001382106330047, "grad_norm": 25.309384453325805, "learning_rate": 4.611818518976388e-05, "loss": 1.2849, "step": 1357 }, { "epoch": 0.5005067723210173, "grad_norm": 6.692426061553125, "learning_rate": 4.6115094572876746e-05, "loss": 0.9255, "step": 1358 }, { "epoch": 0.5008753340090297, "grad_norm": 12.865441783520543, "learning_rate": 4.611200395598962e-05, "loss": 1.0414, "step": 1359 }, { "epoch": 0.5012438956970423, "grad_norm": 6.1927135350938345, "learning_rate": 4.610891333910249e-05, "loss": 1.1785, "step": 1360 }, { "epoch": 0.5016124573850548, "grad_norm": 6.177665215586641, "learning_rate": 4.610582272221536e-05, "loss": 0.7057, "step": 1361 }, { "epoch": 0.5019810190730674, "grad_norm": 4.910380380351031, "learning_rate": 4.610273210532823e-05, "loss": 0.8855, "step": 1362 }, { "epoch": 0.5023495807610799, "grad_norm": 6.153999033276046, "learning_rate": 4.6099641488441095e-05, "loss": 1.4304, "step": 1363 }, { "epoch": 0.5027181424490924, "grad_norm": 6.872794612538105, "learning_rate": 4.609655087155396e-05, "loss": 0.9176, "step": 1364 }, { "epoch": 0.503086704137105, "grad_norm": 6.33310742560922, "learning_rate": 4.609346025466683e-05, "loss": 0.9205, "step": 1365 }, { "epoch": 0.5034552658251175, "grad_norm": 6.277977845521392, "learning_rate": 4.60903696377797e-05, "loss": 1.1705, "step": 1366 }, { "epoch": 0.50382382751313, "grad_norm": 5.428329978678698, "learning_rate": 4.6087279020892573e-05, "loss": 1.2238, "step": 1367 }, { "epoch": 0.5041923892011425, "grad_norm": 4.841351315092961, "learning_rate": 4.608418840400544e-05, "loss": 1.2653, "step": 1368 }, { "epoch": 0.5045609508891551, "grad_norm": 4.8576370516775, "learning_rate": 4.608109778711831e-05, "loss": 1.3166, "step": 1369 }, { "epoch": 0.5049295125771676, "grad_norm": 13.897812793366196, "learning_rate": 4.607800717023118e-05, "loss": 1.1008, "step": 1370 }, { "epoch": 0.5052980742651801, "grad_norm": 7.399176845647363, "learning_rate": 4.607491655334405e-05, "loss": 1.1062, "step": 1371 }, { "epoch": 0.5056666359531926, "grad_norm": 4.118186641248573, "learning_rate": 4.607182593645692e-05, "loss": 0.9248, "step": 1372 }, { "epoch": 0.5060351976412052, "grad_norm": 4.427736959637534, "learning_rate": 4.606873531956979e-05, "loss": 0.8761, "step": 1373 }, { "epoch": 0.5064037593292178, "grad_norm": 4.723656535015117, "learning_rate": 4.606564470268266e-05, "loss": 0.9552, "step": 1374 }, { "epoch": 0.5067723210172302, "grad_norm": 6.425831116827783, "learning_rate": 4.606255408579553e-05, "loss": 1.213, "step": 1375 }, { "epoch": 0.5071408827052428, "grad_norm": 35.99689215914601, "learning_rate": 4.60594634689084e-05, "loss": 1.1288, "step": 1376 }, { "epoch": 0.5075094443932553, "grad_norm": 5.915099281129205, "learning_rate": 4.6056372852021265e-05, "loss": 1.2045, "step": 1377 }, { "epoch": 0.5078780060812679, "grad_norm": 8.701927883472727, "learning_rate": 4.6053282235134136e-05, "loss": 1.0841, "step": 1378 }, { "epoch": 0.5082465677692803, "grad_norm": 4.583172512122897, "learning_rate": 4.6050191618247e-05, "loss": 1.0118, "step": 1379 }, { "epoch": 0.5086151294572929, "grad_norm": 6.332037274441921, "learning_rate": 4.604710100135987e-05, "loss": 1.4617, "step": 1380 }, { "epoch": 0.5089836911453054, "grad_norm": 4.644256917360744, "learning_rate": 4.604401038447274e-05, "loss": 1.136, "step": 1381 }, { "epoch": 0.509352252833318, "grad_norm": 7.742006917593173, "learning_rate": 4.604091976758561e-05, "loss": 1.3107, "step": 1382 }, { "epoch": 0.5097208145213306, "grad_norm": 9.053631672426317, "learning_rate": 4.603782915069848e-05, "loss": 0.8659, "step": 1383 }, { "epoch": 0.510089376209343, "grad_norm": 6.987381870315327, "learning_rate": 4.603473853381135e-05, "loss": 1.2735, "step": 1384 }, { "epoch": 0.5104579378973556, "grad_norm": 5.609683227571437, "learning_rate": 4.603164791692422e-05, "loss": 1.1862, "step": 1385 }, { "epoch": 0.5108264995853681, "grad_norm": 4.408412105286785, "learning_rate": 4.602855730003709e-05, "loss": 1.0824, "step": 1386 }, { "epoch": 0.5111950612733807, "grad_norm": 4.560622364538497, "learning_rate": 4.6025466683149957e-05, "loss": 0.9481, "step": 1387 }, { "epoch": 0.5115636229613931, "grad_norm": 6.03158584213108, "learning_rate": 4.602237606626283e-05, "loss": 1.3159, "step": 1388 }, { "epoch": 0.5119321846494057, "grad_norm": 9.104498838974429, "learning_rate": 4.60192854493757e-05, "loss": 1.0252, "step": 1389 }, { "epoch": 0.5123007463374182, "grad_norm": 6.42844146112731, "learning_rate": 4.601619483248857e-05, "loss": 1.2013, "step": 1390 }, { "epoch": 0.5126693080254308, "grad_norm": 7.410268474044774, "learning_rate": 4.601310421560144e-05, "loss": 1.1017, "step": 1391 }, { "epoch": 0.5130378697134433, "grad_norm": 8.300279856180632, "learning_rate": 4.6010013598714306e-05, "loss": 1.3743, "step": 1392 }, { "epoch": 0.5134064314014558, "grad_norm": 9.339936872183195, "learning_rate": 4.600692298182718e-05, "loss": 1.2565, "step": 1393 }, { "epoch": 0.5137749930894684, "grad_norm": 6.514349723444078, "learning_rate": 4.600383236494004e-05, "loss": 1.2742, "step": 1394 }, { "epoch": 0.5141435547774809, "grad_norm": 6.4332762259417695, "learning_rate": 4.600074174805291e-05, "loss": 1.1505, "step": 1395 }, { "epoch": 0.5145121164654934, "grad_norm": 7.334176564142933, "learning_rate": 4.5997651131165784e-05, "loss": 1.1882, "step": 1396 }, { "epoch": 0.5148806781535059, "grad_norm": 5.537603834569397, "learning_rate": 4.599456051427865e-05, "loss": 1.1976, "step": 1397 }, { "epoch": 0.5152492398415185, "grad_norm": 4.9445954536685255, "learning_rate": 4.599146989739152e-05, "loss": 1.0965, "step": 1398 }, { "epoch": 0.515617801529531, "grad_norm": 4.6781290608543245, "learning_rate": 4.598837928050439e-05, "loss": 0.8955, "step": 1399 }, { "epoch": 0.5159863632175435, "grad_norm": 6.585124226921188, "learning_rate": 4.598528866361726e-05, "loss": 1.0804, "step": 1400 }, { "epoch": 0.516354924905556, "grad_norm": 5.444987086201076, "learning_rate": 4.5982198046730126e-05, "loss": 1.3405, "step": 1401 }, { "epoch": 0.5167234865935686, "grad_norm": 6.051344843426335, "learning_rate": 4.5979107429843e-05, "loss": 1.2345, "step": 1402 }, { "epoch": 0.5170920482815812, "grad_norm": 4.034272709265526, "learning_rate": 4.597601681295587e-05, "loss": 0.8754, "step": 1403 }, { "epoch": 0.5174606099695936, "grad_norm": 8.916639619114815, "learning_rate": 4.597292619606874e-05, "loss": 0.9393, "step": 1404 }, { "epoch": 0.5178291716576062, "grad_norm": 4.64264385865328, "learning_rate": 4.596983557918161e-05, "loss": 1.0408, "step": 1405 }, { "epoch": 0.5181977333456187, "grad_norm": 4.968259391315486, "learning_rate": 4.5966744962294475e-05, "loss": 1.0215, "step": 1406 }, { "epoch": 0.5185662950336313, "grad_norm": 5.007202682619706, "learning_rate": 4.5963654345407347e-05, "loss": 1.098, "step": 1407 }, { "epoch": 0.5189348567216437, "grad_norm": 4.82833693943453, "learning_rate": 4.596056372852022e-05, "loss": 1.0053, "step": 1408 }, { "epoch": 0.5193034184096563, "grad_norm": 7.228115331475916, "learning_rate": 4.595747311163308e-05, "loss": 1.1636, "step": 1409 }, { "epoch": 0.5196719800976688, "grad_norm": 10.133161104980626, "learning_rate": 4.595438249474595e-05, "loss": 1.0347, "step": 1410 }, { "epoch": 0.5200405417856814, "grad_norm": 14.859520816187807, "learning_rate": 4.595129187785882e-05, "loss": 1.0652, "step": 1411 }, { "epoch": 0.520409103473694, "grad_norm": 8.190711869263438, "learning_rate": 4.594820126097169e-05, "loss": 0.9945, "step": 1412 }, { "epoch": 0.5207776651617064, "grad_norm": 4.4634643453030955, "learning_rate": 4.594511064408456e-05, "loss": 0.8387, "step": 1413 }, { "epoch": 0.521146226849719, "grad_norm": 8.088248839341516, "learning_rate": 4.594202002719743e-05, "loss": 1.0924, "step": 1414 }, { "epoch": 0.5215147885377315, "grad_norm": 9.458516347405688, "learning_rate": 4.59389294103103e-05, "loss": 1.3342, "step": 1415 }, { "epoch": 0.5218833502257441, "grad_norm": 5.494933829494887, "learning_rate": 4.593583879342317e-05, "loss": 1.5148, "step": 1416 }, { "epoch": 0.5222519119137565, "grad_norm": 3.967231520070122, "learning_rate": 4.593274817653604e-05, "loss": 1.133, "step": 1417 }, { "epoch": 0.5226204736017691, "grad_norm": 5.980311196691803, "learning_rate": 4.592965755964891e-05, "loss": 1.362, "step": 1418 }, { "epoch": 0.5229890352897816, "grad_norm": 6.10162783305691, "learning_rate": 4.592656694276178e-05, "loss": 1.1722, "step": 1419 }, { "epoch": 0.5233575969777942, "grad_norm": 9.272957022093532, "learning_rate": 4.5923476325874645e-05, "loss": 1.1589, "step": 1420 }, { "epoch": 0.5237261586658067, "grad_norm": 5.841412571465751, "learning_rate": 4.5920385708987516e-05, "loss": 1.1105, "step": 1421 }, { "epoch": 0.5240947203538192, "grad_norm": 5.8765399619017415, "learning_rate": 4.591729509210039e-05, "loss": 1.2054, "step": 1422 }, { "epoch": 0.5244632820418318, "grad_norm": 7.193500170441955, "learning_rate": 4.591420447521326e-05, "loss": 1.0401, "step": 1423 }, { "epoch": 0.5248318437298443, "grad_norm": 4.842146675796969, "learning_rate": 4.591111385832612e-05, "loss": 1.131, "step": 1424 }, { "epoch": 0.5252004054178568, "grad_norm": 5.959911893154959, "learning_rate": 4.5908023241438994e-05, "loss": 1.168, "step": 1425 }, { "epoch": 0.5255689671058693, "grad_norm": 5.074669798082463, "learning_rate": 4.590493262455186e-05, "loss": 1.0764, "step": 1426 }, { "epoch": 0.5259375287938819, "grad_norm": 4.550449290467611, "learning_rate": 4.590184200766473e-05, "loss": 0.8647, "step": 1427 }, { "epoch": 0.5263060904818944, "grad_norm": 8.148569049303658, "learning_rate": 4.58987513907776e-05, "loss": 1.0592, "step": 1428 }, { "epoch": 0.5266746521699069, "grad_norm": 11.650110177104416, "learning_rate": 4.589566077389047e-05, "loss": 1.0185, "step": 1429 }, { "epoch": 0.5270432138579195, "grad_norm": 24.535690780377724, "learning_rate": 4.5892570157003337e-05, "loss": 1.1862, "step": 1430 }, { "epoch": 0.527411775545932, "grad_norm": 6.096200303639132, "learning_rate": 4.588947954011621e-05, "loss": 1.1664, "step": 1431 }, { "epoch": 0.5277803372339446, "grad_norm": 3.7612507213060646, "learning_rate": 4.588638892322908e-05, "loss": 1.0868, "step": 1432 }, { "epoch": 0.528148898921957, "grad_norm": 10.031132112089608, "learning_rate": 4.588329830634195e-05, "loss": 1.3362, "step": 1433 }, { "epoch": 0.5285174606099696, "grad_norm": 5.187570962075822, "learning_rate": 4.588020768945482e-05, "loss": 1.0313, "step": 1434 }, { "epoch": 0.5288860222979821, "grad_norm": 10.863202590959379, "learning_rate": 4.5877117072567686e-05, "loss": 1.5821, "step": 1435 }, { "epoch": 0.5292545839859947, "grad_norm": 4.631438027296014, "learning_rate": 4.587402645568056e-05, "loss": 1.2106, "step": 1436 }, { "epoch": 0.5296231456740071, "grad_norm": 7.482919953730332, "learning_rate": 4.587093583879343e-05, "loss": 1.1348, "step": 1437 }, { "epoch": 0.5299917073620197, "grad_norm": 6.670176662867872, "learning_rate": 4.58678452219063e-05, "loss": 1.1949, "step": 1438 }, { "epoch": 0.5303602690500322, "grad_norm": 6.35947097593424, "learning_rate": 4.5864754605019164e-05, "loss": 1.2655, "step": 1439 }, { "epoch": 0.5307288307380448, "grad_norm": 17.765418864239454, "learning_rate": 4.586166398813203e-05, "loss": 1.1305, "step": 1440 }, { "epoch": 0.5310973924260574, "grad_norm": 25.51079375763387, "learning_rate": 4.58585733712449e-05, "loss": 0.995, "step": 1441 }, { "epoch": 0.5314659541140698, "grad_norm": 7.3111764084909225, "learning_rate": 4.585548275435777e-05, "loss": 1.3609, "step": 1442 }, { "epoch": 0.5318345158020824, "grad_norm": 7.572373179610806, "learning_rate": 4.585239213747064e-05, "loss": 0.9312, "step": 1443 }, { "epoch": 0.5322030774900949, "grad_norm": 5.809563664188707, "learning_rate": 4.584930152058351e-05, "loss": 1.2291, "step": 1444 }, { "epoch": 0.5325716391781075, "grad_norm": 7.50796695990957, "learning_rate": 4.584621090369638e-05, "loss": 1.0787, "step": 1445 }, { "epoch": 0.5329402008661199, "grad_norm": 5.656323174272015, "learning_rate": 4.584312028680925e-05, "loss": 1.0015, "step": 1446 }, { "epoch": 0.5333087625541325, "grad_norm": 5.074413081464485, "learning_rate": 4.584002966992212e-05, "loss": 1.3996, "step": 1447 }, { "epoch": 0.533677324242145, "grad_norm": 6.500872039986131, "learning_rate": 4.583693905303499e-05, "loss": 1.2366, "step": 1448 }, { "epoch": 0.5340458859301576, "grad_norm": 8.679534553670454, "learning_rate": 4.5833848436147855e-05, "loss": 1.1094, "step": 1449 }, { "epoch": 0.5344144476181701, "grad_norm": 6.819370829626652, "learning_rate": 4.5830757819260726e-05, "loss": 1.0398, "step": 1450 }, { "epoch": 0.5347830093061826, "grad_norm": 6.969591851126511, "learning_rate": 4.58276672023736e-05, "loss": 1.2969, "step": 1451 }, { "epoch": 0.5351515709941952, "grad_norm": 6.44971618027897, "learning_rate": 4.582457658548647e-05, "loss": 1.1877, "step": 1452 }, { "epoch": 0.5355201326822077, "grad_norm": 6.909901753898905, "learning_rate": 4.582148596859934e-05, "loss": 1.1349, "step": 1453 }, { "epoch": 0.5358886943702202, "grad_norm": 6.410568344938454, "learning_rate": 4.5818395351712204e-05, "loss": 1.2423, "step": 1454 }, { "epoch": 0.5362572560582327, "grad_norm": 6.189035244452893, "learning_rate": 4.581530473482507e-05, "loss": 1.174, "step": 1455 }, { "epoch": 0.5366258177462453, "grad_norm": 5.334545037588176, "learning_rate": 4.581221411793794e-05, "loss": 1.4126, "step": 1456 }, { "epoch": 0.5369943794342578, "grad_norm": 13.726439664895754, "learning_rate": 4.580912350105081e-05, "loss": 0.906, "step": 1457 }, { "epoch": 0.5373629411222703, "grad_norm": 17.528786661272544, "learning_rate": 4.580603288416368e-05, "loss": 1.2245, "step": 1458 }, { "epoch": 0.5377315028102829, "grad_norm": 7.6108562429903275, "learning_rate": 4.580294226727655e-05, "loss": 1.2157, "step": 1459 }, { "epoch": 0.5381000644982954, "grad_norm": 7.571701630966378, "learning_rate": 4.579985165038942e-05, "loss": 1.3238, "step": 1460 }, { "epoch": 0.538468626186308, "grad_norm": 6.626751650157212, "learning_rate": 4.579676103350229e-05, "loss": 1.0022, "step": 1461 }, { "epoch": 0.5388371878743204, "grad_norm": 3.9713876925027853, "learning_rate": 4.579367041661516e-05, "loss": 0.8486, "step": 1462 }, { "epoch": 0.539205749562333, "grad_norm": 5.587657487599833, "learning_rate": 4.579057979972803e-05, "loss": 0.9946, "step": 1463 }, { "epoch": 0.5395743112503455, "grad_norm": 10.76458177093402, "learning_rate": 4.5787489182840896e-05, "loss": 1.2839, "step": 1464 }, { "epoch": 0.5399428729383581, "grad_norm": 4.487499889490663, "learning_rate": 4.578439856595377e-05, "loss": 1.2291, "step": 1465 }, { "epoch": 0.5403114346263705, "grad_norm": 5.860989686372069, "learning_rate": 4.578130794906664e-05, "loss": 1.0207, "step": 1466 }, { "epoch": 0.5406799963143831, "grad_norm": 7.339650669776165, "learning_rate": 4.577821733217951e-05, "loss": 0.9484, "step": 1467 }, { "epoch": 0.5410485580023956, "grad_norm": 4.923697191833108, "learning_rate": 4.5775126715292374e-05, "loss": 1.2113, "step": 1468 }, { "epoch": 0.5414171196904082, "grad_norm": 5.484256927871206, "learning_rate": 4.5772036098405245e-05, "loss": 1.1605, "step": 1469 }, { "epoch": 0.5417856813784208, "grad_norm": 5.52731338330881, "learning_rate": 4.576894548151811e-05, "loss": 1.0387, "step": 1470 }, { "epoch": 0.5421542430664332, "grad_norm": 6.479007567714364, "learning_rate": 4.576585486463098e-05, "loss": 1.3177, "step": 1471 }, { "epoch": 0.5425228047544458, "grad_norm": 4.5506609593812115, "learning_rate": 4.576276424774385e-05, "loss": 1.3162, "step": 1472 }, { "epoch": 0.5428913664424583, "grad_norm": 5.524510647605615, "learning_rate": 4.5759673630856716e-05, "loss": 1.2093, "step": 1473 }, { "epoch": 0.5432599281304709, "grad_norm": 5.619237873083145, "learning_rate": 4.575658301396959e-05, "loss": 1.123, "step": 1474 }, { "epoch": 0.5436284898184833, "grad_norm": 4.851186642546369, "learning_rate": 4.575349239708246e-05, "loss": 1.1258, "step": 1475 }, { "epoch": 0.5439970515064959, "grad_norm": 5.954251400571774, "learning_rate": 4.575040178019533e-05, "loss": 0.8755, "step": 1476 }, { "epoch": 0.5443656131945084, "grad_norm": 7.711027289798271, "learning_rate": 4.57473111633082e-05, "loss": 1.1447, "step": 1477 }, { "epoch": 0.544734174882521, "grad_norm": 4.462963920008087, "learning_rate": 4.5744220546421066e-05, "loss": 0.8634, "step": 1478 }, { "epoch": 0.5451027365705335, "grad_norm": 5.93474185526817, "learning_rate": 4.574112992953394e-05, "loss": 0.9113, "step": 1479 }, { "epoch": 0.545471298258546, "grad_norm": 5.559858059066157, "learning_rate": 4.573803931264681e-05, "loss": 1.2044, "step": 1480 }, { "epoch": 0.5458398599465586, "grad_norm": 7.782482371348589, "learning_rate": 4.573494869575968e-05, "loss": 1.0595, "step": 1481 }, { "epoch": 0.5462084216345711, "grad_norm": 3.737071576011543, "learning_rate": 4.5731858078872544e-05, "loss": 0.7751, "step": 1482 }, { "epoch": 0.5465769833225836, "grad_norm": 4.322417644821547, "learning_rate": 4.5728767461985415e-05, "loss": 0.9703, "step": 1483 }, { "epoch": 0.5469455450105961, "grad_norm": 4.252637605677752, "learning_rate": 4.5725676845098286e-05, "loss": 1.4304, "step": 1484 }, { "epoch": 0.5473141066986087, "grad_norm": 4.19738150215219, "learning_rate": 4.572258622821115e-05, "loss": 1.1913, "step": 1485 }, { "epoch": 0.5476826683866212, "grad_norm": 5.160774234380166, "learning_rate": 4.571949561132402e-05, "loss": 1.2322, "step": 1486 }, { "epoch": 0.5480512300746337, "grad_norm": 4.256463465330045, "learning_rate": 4.571640499443689e-05, "loss": 1.0023, "step": 1487 }, { "epoch": 0.5484197917626463, "grad_norm": 5.765084424819204, "learning_rate": 4.571331437754976e-05, "loss": 1.3717, "step": 1488 }, { "epoch": 0.5487883534506588, "grad_norm": 5.1771442043888545, "learning_rate": 4.571022376066263e-05, "loss": 1.2058, "step": 1489 }, { "epoch": 0.5491569151386714, "grad_norm": 6.103965295935453, "learning_rate": 4.57071331437755e-05, "loss": 1.3445, "step": 1490 }, { "epoch": 0.5495254768266838, "grad_norm": 8.645766758183687, "learning_rate": 4.570404252688837e-05, "loss": 0.9779, "step": 1491 }, { "epoch": 0.5498940385146964, "grad_norm": 6.483220965172477, "learning_rate": 4.5700951910001235e-05, "loss": 1.5738, "step": 1492 }, { "epoch": 0.5502626002027089, "grad_norm": 4.867459848294166, "learning_rate": 4.5697861293114106e-05, "loss": 0.9382, "step": 1493 }, { "epoch": 0.5506311618907215, "grad_norm": 8.714852942201562, "learning_rate": 4.569477067622698e-05, "loss": 1.1368, "step": 1494 }, { "epoch": 0.550999723578734, "grad_norm": 14.426307997084972, "learning_rate": 4.569168005933985e-05, "loss": 1.1823, "step": 1495 }, { "epoch": 0.5513682852667465, "grad_norm": 12.194059983326026, "learning_rate": 4.568858944245272e-05, "loss": 0.9648, "step": 1496 }, { "epoch": 0.5517368469547591, "grad_norm": 12.705199275219845, "learning_rate": 4.5685498825565584e-05, "loss": 1.298, "step": 1497 }, { "epoch": 0.5521054086427716, "grad_norm": 8.491254907969102, "learning_rate": 4.5682408208678456e-05, "loss": 1.2874, "step": 1498 }, { "epoch": 0.5524739703307842, "grad_norm": 8.682754647661326, "learning_rate": 4.567931759179133e-05, "loss": 0.9937, "step": 1499 }, { "epoch": 0.5528425320187966, "grad_norm": 7.019681509677946, "learning_rate": 4.567622697490419e-05, "loss": 1.5617, "step": 1500 }, { "epoch": 0.5532110937068092, "grad_norm": 7.081341482111938, "learning_rate": 4.567313635801706e-05, "loss": 0.7658, "step": 1501 }, { "epoch": 0.5535796553948217, "grad_norm": 6.0449606849521516, "learning_rate": 4.567004574112993e-05, "loss": 0.8354, "step": 1502 }, { "epoch": 0.5539482170828343, "grad_norm": 5.6437553532470455, "learning_rate": 4.56669551242428e-05, "loss": 1.2921, "step": 1503 }, { "epoch": 0.5543167787708467, "grad_norm": 11.468564398404517, "learning_rate": 4.566386450735567e-05, "loss": 1.1782, "step": 1504 }, { "epoch": 0.5546853404588593, "grad_norm": 7.071031127153371, "learning_rate": 4.566077389046854e-05, "loss": 0.9699, "step": 1505 }, { "epoch": 0.5550539021468718, "grad_norm": 4.2461476537516845, "learning_rate": 4.565768327358141e-05, "loss": 1.263, "step": 1506 }, { "epoch": 0.5554224638348844, "grad_norm": 6.919464587187353, "learning_rate": 4.5654592656694276e-05, "loss": 1.4148, "step": 1507 }, { "epoch": 0.5557910255228969, "grad_norm": 13.766587118761043, "learning_rate": 4.565150203980715e-05, "loss": 0.91, "step": 1508 }, { "epoch": 0.5561595872109094, "grad_norm": 14.086741800560699, "learning_rate": 4.564841142292002e-05, "loss": 1.0927, "step": 1509 }, { "epoch": 0.556528148898922, "grad_norm": 6.478056621420658, "learning_rate": 4.564532080603289e-05, "loss": 1.1978, "step": 1510 }, { "epoch": 0.5568967105869345, "grad_norm": 7.518448075394439, "learning_rate": 4.5642230189145754e-05, "loss": 1.2405, "step": 1511 }, { "epoch": 0.557265272274947, "grad_norm": 8.376889143859767, "learning_rate": 4.5639139572258625e-05, "loss": 1.2012, "step": 1512 }, { "epoch": 0.5576338339629595, "grad_norm": 13.76836154162926, "learning_rate": 4.5636048955371496e-05, "loss": 1.2526, "step": 1513 }, { "epoch": 0.5580023956509721, "grad_norm": 16.940820347402564, "learning_rate": 4.563295833848437e-05, "loss": 0.9873, "step": 1514 }, { "epoch": 0.5583709573389846, "grad_norm": 15.841516461463787, "learning_rate": 4.562986772159723e-05, "loss": 0.9772, "step": 1515 }, { "epoch": 0.5587395190269971, "grad_norm": 9.750470272362588, "learning_rate": 4.56267771047101e-05, "loss": 0.8959, "step": 1516 }, { "epoch": 0.5591080807150097, "grad_norm": 9.466760810419752, "learning_rate": 4.562368648782297e-05, "loss": 1.0199, "step": 1517 }, { "epoch": 0.5594766424030222, "grad_norm": 7.861547939473414, "learning_rate": 4.562059587093584e-05, "loss": 1.3673, "step": 1518 }, { "epoch": 0.5598452040910348, "grad_norm": 17.447785874820728, "learning_rate": 4.561750525404871e-05, "loss": 1.3013, "step": 1519 }, { "epoch": 0.5602137657790472, "grad_norm": 7.127273999782017, "learning_rate": 4.561441463716158e-05, "loss": 1.5435, "step": 1520 }, { "epoch": 0.5605823274670598, "grad_norm": 5.134458791707285, "learning_rate": 4.5611324020274446e-05, "loss": 1.0442, "step": 1521 }, { "epoch": 0.5609508891550723, "grad_norm": 4.845840495206221, "learning_rate": 4.560823340338732e-05, "loss": 1.0011, "step": 1522 }, { "epoch": 0.5613194508430849, "grad_norm": 4.4819522919198835, "learning_rate": 4.560514278650019e-05, "loss": 0.9647, "step": 1523 }, { "epoch": 0.5616880125310973, "grad_norm": 5.08180042771833, "learning_rate": 4.560205216961306e-05, "loss": 0.9923, "step": 1524 }, { "epoch": 0.5620565742191099, "grad_norm": 5.668677403364736, "learning_rate": 4.559896155272593e-05, "loss": 1.0914, "step": 1525 }, { "epoch": 0.5624251359071225, "grad_norm": 11.7326649157551, "learning_rate": 4.5595870935838795e-05, "loss": 0.9958, "step": 1526 }, { "epoch": 0.562793697595135, "grad_norm": 4.187905363133987, "learning_rate": 4.5592780318951666e-05, "loss": 0.9853, "step": 1527 }, { "epoch": 0.5631622592831476, "grad_norm": 5.047692200249726, "learning_rate": 4.558968970206454e-05, "loss": 1.3233, "step": 1528 }, { "epoch": 0.56353082097116, "grad_norm": 4.689826489718155, "learning_rate": 4.558659908517741e-05, "loss": 1.3296, "step": 1529 }, { "epoch": 0.5638993826591726, "grad_norm": 4.92193816613733, "learning_rate": 4.558350846829027e-05, "loss": 0.8592, "step": 1530 }, { "epoch": 0.5642679443471851, "grad_norm": 6.39127889333066, "learning_rate": 4.558041785140314e-05, "loss": 1.136, "step": 1531 }, { "epoch": 0.5646365060351977, "grad_norm": 5.460804775300552, "learning_rate": 4.557732723451601e-05, "loss": 1.2733, "step": 1532 }, { "epoch": 0.5650050677232101, "grad_norm": 4.834478746816245, "learning_rate": 4.557423661762888e-05, "loss": 0.7645, "step": 1533 }, { "epoch": 0.5653736294112227, "grad_norm": 8.455160463548605, "learning_rate": 4.557114600074175e-05, "loss": 1.2449, "step": 1534 }, { "epoch": 0.5657421910992352, "grad_norm": 7.789167795646109, "learning_rate": 4.556805538385462e-05, "loss": 1.4991, "step": 1535 }, { "epoch": 0.5661107527872478, "grad_norm": 7.301956792397972, "learning_rate": 4.5564964766967486e-05, "loss": 0.9844, "step": 1536 }, { "epoch": 0.5664793144752603, "grad_norm": 5.073845289434647, "learning_rate": 4.556187415008036e-05, "loss": 1.011, "step": 1537 }, { "epoch": 0.5668478761632728, "grad_norm": 6.705373675852264, "learning_rate": 4.555878353319323e-05, "loss": 1.3919, "step": 1538 }, { "epoch": 0.5672164378512854, "grad_norm": 3.9837950041245467, "learning_rate": 4.55556929163061e-05, "loss": 1.1489, "step": 1539 }, { "epoch": 0.5675849995392979, "grad_norm": 5.804835362559888, "learning_rate": 4.5552602299418964e-05, "loss": 1.2964, "step": 1540 }, { "epoch": 0.5679535612273104, "grad_norm": 5.895674021515161, "learning_rate": 4.5549511682531836e-05, "loss": 1.0633, "step": 1541 }, { "epoch": 0.5683221229153229, "grad_norm": 4.994469444958202, "learning_rate": 4.554642106564471e-05, "loss": 1.2499, "step": 1542 }, { "epoch": 0.5686906846033355, "grad_norm": 4.685894093403077, "learning_rate": 4.554333044875758e-05, "loss": 1.0887, "step": 1543 }, { "epoch": 0.569059246291348, "grad_norm": 3.6403711582474396, "learning_rate": 4.554023983187045e-05, "loss": 1.0355, "step": 1544 }, { "epoch": 0.5694278079793605, "grad_norm": 5.364576479219175, "learning_rate": 4.553714921498331e-05, "loss": 0.969, "step": 1545 }, { "epoch": 0.5697963696673731, "grad_norm": 5.481068366944097, "learning_rate": 4.553405859809618e-05, "loss": 1.0584, "step": 1546 }, { "epoch": 0.5701649313553856, "grad_norm": 5.436018292472776, "learning_rate": 4.553096798120905e-05, "loss": 0.931, "step": 1547 }, { "epoch": 0.5705334930433982, "grad_norm": 5.060808440750806, "learning_rate": 4.552787736432192e-05, "loss": 1.0207, "step": 1548 }, { "epoch": 0.5709020547314106, "grad_norm": 5.224831332902086, "learning_rate": 4.552478674743479e-05, "loss": 1.1395, "step": 1549 }, { "epoch": 0.5712706164194232, "grad_norm": 5.383302353493251, "learning_rate": 4.5521696130547656e-05, "loss": 1.4673, "step": 1550 }, { "epoch": 0.5716391781074357, "grad_norm": 4.921509162991191, "learning_rate": 4.551860551366053e-05, "loss": 0.9877, "step": 1551 }, { "epoch": 0.5720077397954483, "grad_norm": 6.247986736288413, "learning_rate": 4.55155148967734e-05, "loss": 1.2353, "step": 1552 }, { "epoch": 0.5723763014834607, "grad_norm": 9.484858040237452, "learning_rate": 4.551242427988627e-05, "loss": 0.947, "step": 1553 }, { "epoch": 0.5727448631714733, "grad_norm": 3.661450982482101, "learning_rate": 4.550933366299914e-05, "loss": 0.8722, "step": 1554 }, { "epoch": 0.5731134248594859, "grad_norm": 4.594021328548997, "learning_rate": 4.5506243046112005e-05, "loss": 0.9151, "step": 1555 }, { "epoch": 0.5734819865474984, "grad_norm": 10.948486739154475, "learning_rate": 4.5503152429224876e-05, "loss": 1.1317, "step": 1556 }, { "epoch": 0.573850548235511, "grad_norm": 6.005578308972027, "learning_rate": 4.550006181233775e-05, "loss": 1.1839, "step": 1557 }, { "epoch": 0.5742191099235234, "grad_norm": 5.698716055837367, "learning_rate": 4.549697119545062e-05, "loss": 1.0963, "step": 1558 }, { "epoch": 0.574587671611536, "grad_norm": 4.204866296772166, "learning_rate": 4.549388057856348e-05, "loss": 1.0152, "step": 1559 }, { "epoch": 0.5749562332995485, "grad_norm": 4.4933048882041575, "learning_rate": 4.5490789961676354e-05, "loss": 1.0074, "step": 1560 }, { "epoch": 0.5753247949875611, "grad_norm": 6.940352944405632, "learning_rate": 4.548769934478922e-05, "loss": 1.1578, "step": 1561 }, { "epoch": 0.5756933566755735, "grad_norm": 4.584945649435374, "learning_rate": 4.548460872790209e-05, "loss": 1.4376, "step": 1562 }, { "epoch": 0.5760619183635861, "grad_norm": 5.522949197165646, "learning_rate": 4.548151811101496e-05, "loss": 1.2882, "step": 1563 }, { "epoch": 0.5764304800515986, "grad_norm": 7.556984647845557, "learning_rate": 4.5478427494127826e-05, "loss": 1.1961, "step": 1564 }, { "epoch": 0.5767990417396112, "grad_norm": 4.763191977864546, "learning_rate": 4.54753368772407e-05, "loss": 1.3392, "step": 1565 }, { "epoch": 0.5771676034276237, "grad_norm": 5.131303214053965, "learning_rate": 4.547224626035357e-05, "loss": 1.0501, "step": 1566 }, { "epoch": 0.5775361651156362, "grad_norm": 5.268205412281417, "learning_rate": 4.546915564346644e-05, "loss": 1.0654, "step": 1567 }, { "epoch": 0.5779047268036488, "grad_norm": 7.6570934843086125, "learning_rate": 4.546606502657931e-05, "loss": 0.9361, "step": 1568 }, { "epoch": 0.5782732884916613, "grad_norm": 5.840367608192837, "learning_rate": 4.5462974409692175e-05, "loss": 1.1718, "step": 1569 }, { "epoch": 0.5786418501796738, "grad_norm": 5.5765818109495955, "learning_rate": 4.5459883792805046e-05, "loss": 1.0469, "step": 1570 }, { "epoch": 0.5790104118676863, "grad_norm": 6.107432180895526, "learning_rate": 4.545679317591792e-05, "loss": 0.7529, "step": 1571 }, { "epoch": 0.5793789735556989, "grad_norm": 5.414655028227286, "learning_rate": 4.545370255903079e-05, "loss": 1.4273, "step": 1572 }, { "epoch": 0.5797475352437114, "grad_norm": 8.34310030461525, "learning_rate": 4.545061194214365e-05, "loss": 1.0345, "step": 1573 }, { "epoch": 0.580116096931724, "grad_norm": 6.225760490812736, "learning_rate": 4.5447521325256524e-05, "loss": 1.029, "step": 1574 }, { "epoch": 0.5804846586197365, "grad_norm": 5.21227644113899, "learning_rate": 4.5444430708369395e-05, "loss": 1.1135, "step": 1575 }, { "epoch": 0.580853220307749, "grad_norm": 5.49172073052074, "learning_rate": 4.544134009148226e-05, "loss": 1.4142, "step": 1576 }, { "epoch": 0.5812217819957616, "grad_norm": 5.110953084373214, "learning_rate": 4.543824947459513e-05, "loss": 0.8747, "step": 1577 }, { "epoch": 0.581590343683774, "grad_norm": 4.747002308226387, "learning_rate": 4.5435158857708e-05, "loss": 0.9918, "step": 1578 }, { "epoch": 0.5819589053717866, "grad_norm": 5.384378103121385, "learning_rate": 4.5432068240820866e-05, "loss": 1.0443, "step": 1579 }, { "epoch": 0.5823274670597991, "grad_norm": 7.026963754317521, "learning_rate": 4.542897762393374e-05, "loss": 1.5403, "step": 1580 }, { "epoch": 0.5826960287478117, "grad_norm": 14.943522623806047, "learning_rate": 4.542588700704661e-05, "loss": 1.4397, "step": 1581 }, { "epoch": 0.5830645904358241, "grad_norm": 4.584569775812871, "learning_rate": 4.542279639015948e-05, "loss": 0.9964, "step": 1582 }, { "epoch": 0.5834331521238367, "grad_norm": 5.695579407433185, "learning_rate": 4.5419705773272344e-05, "loss": 0.7834, "step": 1583 }, { "epoch": 0.5838017138118493, "grad_norm": 6.883111226313911, "learning_rate": 4.5416615156385216e-05, "loss": 1.3168, "step": 1584 }, { "epoch": 0.5841702754998618, "grad_norm": 5.901846399566178, "learning_rate": 4.541352453949809e-05, "loss": 1.1255, "step": 1585 }, { "epoch": 0.5845388371878744, "grad_norm": 4.471100302502625, "learning_rate": 4.541043392261096e-05, "loss": 1.0721, "step": 1586 }, { "epoch": 0.5849073988758868, "grad_norm": 5.4931650173610285, "learning_rate": 4.540734330572383e-05, "loss": 1.0567, "step": 1587 }, { "epoch": 0.5852759605638994, "grad_norm": 8.598316435920847, "learning_rate": 4.5404252688836694e-05, "loss": 0.9566, "step": 1588 }, { "epoch": 0.5856445222519119, "grad_norm": 10.455127564157813, "learning_rate": 4.5401162071949565e-05, "loss": 0.9641, "step": 1589 }, { "epoch": 0.5860130839399245, "grad_norm": 11.66814202562165, "learning_rate": 4.5398071455062436e-05, "loss": 1.3543, "step": 1590 }, { "epoch": 0.5863816456279369, "grad_norm": 7.880390320613859, "learning_rate": 4.53949808381753e-05, "loss": 0.9353, "step": 1591 }, { "epoch": 0.5867502073159495, "grad_norm": 5.123153353830552, "learning_rate": 4.539189022128817e-05, "loss": 1.1042, "step": 1592 }, { "epoch": 0.5871187690039621, "grad_norm": 43.612567343294955, "learning_rate": 4.5388799604401036e-05, "loss": 1.1332, "step": 1593 }, { "epoch": 0.5874873306919746, "grad_norm": 8.878810870308204, "learning_rate": 4.538570898751391e-05, "loss": 1.1816, "step": 1594 }, { "epoch": 0.5878558923799871, "grad_norm": 8.84236743485744, "learning_rate": 4.538261837062678e-05, "loss": 1.1374, "step": 1595 }, { "epoch": 0.5882244540679996, "grad_norm": 7.399872814838188, "learning_rate": 4.537952775373965e-05, "loss": 1.0953, "step": 1596 }, { "epoch": 0.5885930157560122, "grad_norm": 6.29435069788837, "learning_rate": 4.537643713685252e-05, "loss": 1.3777, "step": 1597 }, { "epoch": 0.5889615774440247, "grad_norm": 15.162026222921867, "learning_rate": 4.5373346519965385e-05, "loss": 1.3326, "step": 1598 }, { "epoch": 0.5893301391320372, "grad_norm": 7.501258744468481, "learning_rate": 4.5370255903078256e-05, "loss": 1.2691, "step": 1599 }, { "epoch": 0.5896987008200497, "grad_norm": 10.263546807337425, "learning_rate": 4.536716528619113e-05, "loss": 1.3502, "step": 1600 }, { "epoch": 0.5896987008200497, "eval_bleu": 0.029805686942628605, "eval_bleu_1gram": 0.3512203643274078, "eval_bleu_2gram": 0.13509958248402254, "eval_bleu_3gram": 0.05079623730554678, "eval_bleu_4gram": 0.02235174338456145, "eval_rag_val_loss": 1.2014313310613496, "eval_rouge1": 0.3311540259800937, "eval_rouge2": 0.12637517713471055, "eval_rougeL": 0.3260063671644828, "step": 1600 } ], "logging_steps": 1, "max_steps": 16278, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 800, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }