{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.994882292732855, "eval_steps": 500, "global_step": 2440, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020470829068577278, "grad_norm": 5.914718943943769, "learning_rate": 1.639344262295082e-07, "loss": 0.89, "step": 1 }, { "epoch": 0.0040941658137154556, "grad_norm": 5.711215790032282, "learning_rate": 3.278688524590164e-07, "loss": 0.8602, "step": 2 }, { "epoch": 0.006141248720573183, "grad_norm": 6.070030223088026, "learning_rate": 4.918032786885246e-07, "loss": 0.8902, "step": 3 }, { "epoch": 0.008188331627430911, "grad_norm": 5.739027819285582, "learning_rate": 6.557377049180328e-07, "loss": 0.9194, "step": 4 }, { "epoch": 0.01023541453428864, "grad_norm": 5.558991142229951, "learning_rate": 8.196721311475409e-07, "loss": 0.8523, "step": 5 }, { "epoch": 0.012282497441146366, "grad_norm": 5.622988925862499, "learning_rate": 9.836065573770493e-07, "loss": 0.9102, "step": 6 }, { "epoch": 0.014329580348004094, "grad_norm": 5.360893456655671, "learning_rate": 1.1475409836065575e-06, "loss": 0.8682, "step": 7 }, { "epoch": 0.016376663254861822, "grad_norm": 5.095538528802894, "learning_rate": 1.3114754098360657e-06, "loss": 0.8689, "step": 8 }, { "epoch": 0.01842374616171955, "grad_norm": 4.59455985194371, "learning_rate": 1.4754098360655739e-06, "loss": 0.852, "step": 9 }, { "epoch": 0.02047082906857728, "grad_norm": 4.139437078984008, "learning_rate": 1.6393442622950819e-06, "loss": 0.8102, "step": 10 }, { "epoch": 0.022517911975435005, "grad_norm": 4.171848417796509, "learning_rate": 1.8032786885245903e-06, "loss": 0.8102, "step": 11 }, { "epoch": 0.02456499488229273, "grad_norm": 2.4194756147420646, "learning_rate": 1.9672131147540985e-06, "loss": 0.7481, "step": 12 }, { "epoch": 0.02661207778915046, "grad_norm": 2.28600629836227, "learning_rate": 2.1311475409836067e-06, "loss": 0.7631, "step": 13 }, { "epoch": 0.028659160696008188, "grad_norm": 2.075915465619067, "learning_rate": 2.295081967213115e-06, "loss": 0.7506, "step": 14 }, { "epoch": 0.030706243602865915, "grad_norm": 1.952947905310399, "learning_rate": 2.459016393442623e-06, "loss": 0.7781, "step": 15 }, { "epoch": 0.032753326509723645, "grad_norm": 3.070659688365803, "learning_rate": 2.6229508196721314e-06, "loss": 0.7488, "step": 16 }, { "epoch": 0.03480040941658137, "grad_norm": 3.5264876012722652, "learning_rate": 2.786885245901639e-06, "loss": 0.7746, "step": 17 }, { "epoch": 0.0368474923234391, "grad_norm": 3.580605082704591, "learning_rate": 2.9508196721311478e-06, "loss": 0.7618, "step": 18 }, { "epoch": 0.038894575230296824, "grad_norm": 3.194230800018389, "learning_rate": 3.114754098360656e-06, "loss": 0.7207, "step": 19 }, { "epoch": 0.04094165813715456, "grad_norm": 3.116544112683001, "learning_rate": 3.2786885245901638e-06, "loss": 0.7352, "step": 20 }, { "epoch": 0.042988741044012284, "grad_norm": 2.6314247686078094, "learning_rate": 3.4426229508196724e-06, "loss": 0.7432, "step": 21 }, { "epoch": 0.04503582395087001, "grad_norm": 2.2137046535068037, "learning_rate": 3.6065573770491806e-06, "loss": 0.7444, "step": 22 }, { "epoch": 0.04708290685772774, "grad_norm": 1.6649733496725676, "learning_rate": 3.7704918032786884e-06, "loss": 0.6646, "step": 23 }, { "epoch": 0.04912998976458546, "grad_norm": 1.3074947984675702, "learning_rate": 3.934426229508197e-06, "loss": 0.6388, "step": 24 }, { "epoch": 0.0511770726714432, "grad_norm": 1.190645815657471, "learning_rate": 4.098360655737705e-06, "loss": 0.6575, "step": 25 }, { "epoch": 0.05322415557830092, "grad_norm": 1.2707845894729972, "learning_rate": 4.2622950819672135e-06, "loss": 0.671, "step": 26 }, { "epoch": 0.05527123848515865, "grad_norm": 1.4412640499522065, "learning_rate": 4.426229508196722e-06, "loss": 0.6526, "step": 27 }, { "epoch": 0.057318321392016376, "grad_norm": 1.3167145329441712, "learning_rate": 4.59016393442623e-06, "loss": 0.6533, "step": 28 }, { "epoch": 0.0593654042988741, "grad_norm": 1.338399336461372, "learning_rate": 4.754098360655738e-06, "loss": 0.6525, "step": 29 }, { "epoch": 0.06141248720573183, "grad_norm": 1.1005314901400522, "learning_rate": 4.918032786885246e-06, "loss": 0.6513, "step": 30 }, { "epoch": 0.06345957011258956, "grad_norm": 1.0341365252216648, "learning_rate": 5.0819672131147545e-06, "loss": 0.6517, "step": 31 }, { "epoch": 0.06550665301944729, "grad_norm": 0.8191371315934305, "learning_rate": 5.245901639344263e-06, "loss": 0.6562, "step": 32 }, { "epoch": 0.06755373592630501, "grad_norm": 1.0105078954121915, "learning_rate": 5.409836065573772e-06, "loss": 0.6318, "step": 33 }, { "epoch": 0.06960081883316274, "grad_norm": 1.083317612523659, "learning_rate": 5.573770491803278e-06, "loss": 0.6528, "step": 34 }, { "epoch": 0.07164790174002048, "grad_norm": 1.054453013530755, "learning_rate": 5.737704918032787e-06, "loss": 0.6233, "step": 35 }, { "epoch": 0.0736949846468782, "grad_norm": 0.693669694861969, "learning_rate": 5.9016393442622956e-06, "loss": 0.5959, "step": 36 }, { "epoch": 0.07574206755373593, "grad_norm": 0.6429960279803676, "learning_rate": 6.065573770491804e-06, "loss": 0.6031, "step": 37 }, { "epoch": 0.07778915046059365, "grad_norm": 0.8326921183964671, "learning_rate": 6.229508196721312e-06, "loss": 0.6252, "step": 38 }, { "epoch": 0.07983623336745138, "grad_norm": 0.7826089791231328, "learning_rate": 6.393442622950821e-06, "loss": 0.5891, "step": 39 }, { "epoch": 0.08188331627430911, "grad_norm": 0.6837753928275052, "learning_rate": 6.5573770491803276e-06, "loss": 0.6151, "step": 40 }, { "epoch": 0.08393039918116683, "grad_norm": 0.557265922988386, "learning_rate": 6.721311475409837e-06, "loss": 0.5832, "step": 41 }, { "epoch": 0.08597748208802457, "grad_norm": 0.697548950763872, "learning_rate": 6.885245901639345e-06, "loss": 0.629, "step": 42 }, { "epoch": 0.08802456499488229, "grad_norm": 0.7262822585766031, "learning_rate": 7.049180327868853e-06, "loss": 0.5531, "step": 43 }, { "epoch": 0.09007164790174002, "grad_norm": 0.7701295257257081, "learning_rate": 7.213114754098361e-06, "loss": 0.612, "step": 44 }, { "epoch": 0.09211873080859775, "grad_norm": 0.4681801446847226, "learning_rate": 7.3770491803278695e-06, "loss": 0.5675, "step": 45 }, { "epoch": 0.09416581371545547, "grad_norm": 0.5678252337287879, "learning_rate": 7.540983606557377e-06, "loss": 0.6391, "step": 46 }, { "epoch": 0.09621289662231321, "grad_norm": 0.5704556041531674, "learning_rate": 7.704918032786886e-06, "loss": 0.5773, "step": 47 }, { "epoch": 0.09825997952917093, "grad_norm": 0.5739689196908508, "learning_rate": 7.868852459016394e-06, "loss": 0.5928, "step": 48 }, { "epoch": 0.10030706243602866, "grad_norm": 0.5604483444656064, "learning_rate": 8.032786885245902e-06, "loss": 0.5747, "step": 49 }, { "epoch": 0.1023541453428864, "grad_norm": 0.5266605058896955, "learning_rate": 8.19672131147541e-06, "loss": 0.5506, "step": 50 }, { "epoch": 0.10440122824974411, "grad_norm": 0.6855806410785544, "learning_rate": 8.360655737704919e-06, "loss": 0.5974, "step": 51 }, { "epoch": 0.10644831115660185, "grad_norm": 0.6492542095140276, "learning_rate": 8.524590163934427e-06, "loss": 0.6027, "step": 52 }, { "epoch": 0.10849539406345957, "grad_norm": 0.5702781382301559, "learning_rate": 8.688524590163935e-06, "loss": 0.5997, "step": 53 }, { "epoch": 0.1105424769703173, "grad_norm": 0.5828822136164438, "learning_rate": 8.852459016393443e-06, "loss": 0.5633, "step": 54 }, { "epoch": 0.11258955987717502, "grad_norm": 0.5923389847320444, "learning_rate": 9.016393442622952e-06, "loss": 0.5842, "step": 55 }, { "epoch": 0.11463664278403275, "grad_norm": 0.5636209619702331, "learning_rate": 9.18032786885246e-06, "loss": 0.5645, "step": 56 }, { "epoch": 0.11668372569089049, "grad_norm": 0.4912175098948749, "learning_rate": 9.344262295081968e-06, "loss": 0.5635, "step": 57 }, { "epoch": 0.1187308085977482, "grad_norm": 0.754994674395481, "learning_rate": 9.508196721311476e-06, "loss": 0.5658, "step": 58 }, { "epoch": 0.12077789150460594, "grad_norm": 0.5456693243174974, "learning_rate": 9.672131147540984e-06, "loss": 0.5559, "step": 59 }, { "epoch": 0.12282497441146366, "grad_norm": 0.5877326554557568, "learning_rate": 9.836065573770493e-06, "loss": 0.5755, "step": 60 }, { "epoch": 0.12487205731832139, "grad_norm": 0.4917195015950769, "learning_rate": 1e-05, "loss": 0.5422, "step": 61 }, { "epoch": 0.1269191402251791, "grad_norm": 0.5000347167895204, "learning_rate": 1.0163934426229509e-05, "loss": 0.5638, "step": 62 }, { "epoch": 0.12896622313203684, "grad_norm": 0.5165061332900804, "learning_rate": 1.0327868852459017e-05, "loss": 0.5874, "step": 63 }, { "epoch": 0.13101330603889458, "grad_norm": 0.5647596683341366, "learning_rate": 1.0491803278688525e-05, "loss": 0.5549, "step": 64 }, { "epoch": 0.1330603889457523, "grad_norm": 0.556985780595292, "learning_rate": 1.0655737704918034e-05, "loss": 0.5287, "step": 65 }, { "epoch": 0.13510747185261002, "grad_norm": 0.5870046275772329, "learning_rate": 1.0819672131147544e-05, "loss": 0.5726, "step": 66 }, { "epoch": 0.13715455475946775, "grad_norm": 0.510016021993097, "learning_rate": 1.0983606557377052e-05, "loss": 0.641, "step": 67 }, { "epoch": 0.13920163766632548, "grad_norm": 0.5757122682850916, "learning_rate": 1.1147540983606557e-05, "loss": 0.5493, "step": 68 }, { "epoch": 0.14124872057318322, "grad_norm": 0.5572446165702184, "learning_rate": 1.1311475409836066e-05, "loss": 0.5231, "step": 69 }, { "epoch": 0.14329580348004095, "grad_norm": 0.5000459199001162, "learning_rate": 1.1475409836065575e-05, "loss": 0.5221, "step": 70 }, { "epoch": 0.14534288638689866, "grad_norm": 0.49933135856571004, "learning_rate": 1.1639344262295083e-05, "loss": 0.5254, "step": 71 }, { "epoch": 0.1473899692937564, "grad_norm": 0.542314402205445, "learning_rate": 1.1803278688524591e-05, "loss": 0.5855, "step": 72 }, { "epoch": 0.14943705220061412, "grad_norm": 0.4908515642706825, "learning_rate": 1.19672131147541e-05, "loss": 0.5277, "step": 73 }, { "epoch": 0.15148413510747186, "grad_norm": 0.5541036948715271, "learning_rate": 1.2131147540983608e-05, "loss": 0.5226, "step": 74 }, { "epoch": 0.1535312180143296, "grad_norm": 0.5925205612588014, "learning_rate": 1.2295081967213116e-05, "loss": 0.5588, "step": 75 }, { "epoch": 0.1555783009211873, "grad_norm": 0.5472033034034951, "learning_rate": 1.2459016393442624e-05, "loss": 0.5601, "step": 76 }, { "epoch": 0.15762538382804503, "grad_norm": 0.5831899459875634, "learning_rate": 1.2622950819672132e-05, "loss": 0.5208, "step": 77 }, { "epoch": 0.15967246673490276, "grad_norm": 0.6725600970137904, "learning_rate": 1.2786885245901642e-05, "loss": 0.5646, "step": 78 }, { "epoch": 0.1617195496417605, "grad_norm": 0.4997581515870628, "learning_rate": 1.295081967213115e-05, "loss": 0.5345, "step": 79 }, { "epoch": 0.16376663254861823, "grad_norm": 0.6060573387394406, "learning_rate": 1.3114754098360655e-05, "loss": 0.5516, "step": 80 }, { "epoch": 0.16581371545547594, "grad_norm": 0.5707723342314415, "learning_rate": 1.3278688524590165e-05, "loss": 0.5494, "step": 81 }, { "epoch": 0.16786079836233367, "grad_norm": 0.5820968760684135, "learning_rate": 1.3442622950819673e-05, "loss": 0.5374, "step": 82 }, { "epoch": 0.1699078812691914, "grad_norm": 0.801732853136766, "learning_rate": 1.3606557377049181e-05, "loss": 0.546, "step": 83 }, { "epoch": 0.17195496417604914, "grad_norm": 0.5288994895334571, "learning_rate": 1.377049180327869e-05, "loss": 0.5761, "step": 84 }, { "epoch": 0.17400204708290687, "grad_norm": 0.7883330207931984, "learning_rate": 1.3934426229508198e-05, "loss": 0.5326, "step": 85 }, { "epoch": 0.17604912998976457, "grad_norm": 0.4807106883512578, "learning_rate": 1.4098360655737706e-05, "loss": 0.5311, "step": 86 }, { "epoch": 0.1780962128966223, "grad_norm": 0.6954942427041093, "learning_rate": 1.4262295081967214e-05, "loss": 0.5206, "step": 87 }, { "epoch": 0.18014329580348004, "grad_norm": 0.5706344929187627, "learning_rate": 1.4426229508196722e-05, "loss": 0.5575, "step": 88 }, { "epoch": 0.18219037871033777, "grad_norm": 0.5660731771723676, "learning_rate": 1.459016393442623e-05, "loss": 0.5243, "step": 89 }, { "epoch": 0.1842374616171955, "grad_norm": 0.7077259968257474, "learning_rate": 1.4754098360655739e-05, "loss": 0.5375, "step": 90 }, { "epoch": 0.1862845445240532, "grad_norm": 0.5640871854454458, "learning_rate": 1.4918032786885249e-05, "loss": 0.5678, "step": 91 }, { "epoch": 0.18833162743091095, "grad_norm": 0.5686460669441292, "learning_rate": 1.5081967213114754e-05, "loss": 0.5317, "step": 92 }, { "epoch": 0.19037871033776868, "grad_norm": 0.5667037000506248, "learning_rate": 1.5245901639344264e-05, "loss": 0.5208, "step": 93 }, { "epoch": 0.19242579324462641, "grad_norm": 0.6528802562895782, "learning_rate": 1.5409836065573772e-05, "loss": 0.5538, "step": 94 }, { "epoch": 0.19447287615148415, "grad_norm": 0.6104791131678972, "learning_rate": 1.5573770491803278e-05, "loss": 0.4945, "step": 95 }, { "epoch": 0.19651995905834185, "grad_norm": 0.770297526442582, "learning_rate": 1.5737704918032788e-05, "loss": 0.5542, "step": 96 }, { "epoch": 0.1985670419651996, "grad_norm": 0.639855439582837, "learning_rate": 1.5901639344262295e-05, "loss": 0.5604, "step": 97 }, { "epoch": 0.20061412487205732, "grad_norm": 0.6516313518294781, "learning_rate": 1.6065573770491805e-05, "loss": 0.492, "step": 98 }, { "epoch": 0.20266120777891505, "grad_norm": 0.7323248049827676, "learning_rate": 1.6229508196721314e-05, "loss": 0.5561, "step": 99 }, { "epoch": 0.2047082906857728, "grad_norm": 0.614575250954447, "learning_rate": 1.639344262295082e-05, "loss": 0.5319, "step": 100 }, { "epoch": 0.2067553735926305, "grad_norm": 0.6867103983793634, "learning_rate": 1.655737704918033e-05, "loss": 0.5087, "step": 101 }, { "epoch": 0.20880245649948823, "grad_norm": 0.5125936025327898, "learning_rate": 1.6721311475409837e-05, "loss": 0.5187, "step": 102 }, { "epoch": 0.21084953940634596, "grad_norm": 0.7791039154279353, "learning_rate": 1.6885245901639347e-05, "loss": 0.5831, "step": 103 }, { "epoch": 0.2128966223132037, "grad_norm": 0.5610382412593208, "learning_rate": 1.7049180327868854e-05, "loss": 0.5979, "step": 104 }, { "epoch": 0.21494370522006143, "grad_norm": 0.6872141481595387, "learning_rate": 1.721311475409836e-05, "loss": 0.5337, "step": 105 }, { "epoch": 0.21699078812691913, "grad_norm": 0.621726895260148, "learning_rate": 1.737704918032787e-05, "loss": 0.5014, "step": 106 }, { "epoch": 0.21903787103377687, "grad_norm": 0.6593450481678657, "learning_rate": 1.7540983606557377e-05, "loss": 0.5298, "step": 107 }, { "epoch": 0.2210849539406346, "grad_norm": 0.6085504680048398, "learning_rate": 1.7704918032786887e-05, "loss": 0.5597, "step": 108 }, { "epoch": 0.22313203684749233, "grad_norm": 0.5818331349760811, "learning_rate": 1.7868852459016393e-05, "loss": 0.5671, "step": 109 }, { "epoch": 0.22517911975435004, "grad_norm": 0.5705507858388099, "learning_rate": 1.8032786885245903e-05, "loss": 0.5608, "step": 110 }, { "epoch": 0.22722620266120777, "grad_norm": 0.566454711636141, "learning_rate": 1.8196721311475413e-05, "loss": 0.536, "step": 111 }, { "epoch": 0.2292732855680655, "grad_norm": 0.6584736492952961, "learning_rate": 1.836065573770492e-05, "loss": 0.5107, "step": 112 }, { "epoch": 0.23132036847492324, "grad_norm": 0.5431857503415616, "learning_rate": 1.852459016393443e-05, "loss": 0.5216, "step": 113 }, { "epoch": 0.23336745138178097, "grad_norm": 0.7429344164092465, "learning_rate": 1.8688524590163936e-05, "loss": 0.5673, "step": 114 }, { "epoch": 0.23541453428863868, "grad_norm": 0.6303478113244917, "learning_rate": 1.8852459016393446e-05, "loss": 0.5173, "step": 115 }, { "epoch": 0.2374616171954964, "grad_norm": 0.6632339341308846, "learning_rate": 1.9016393442622952e-05, "loss": 0.526, "step": 116 }, { "epoch": 0.23950870010235414, "grad_norm": 0.7178085764934704, "learning_rate": 1.918032786885246e-05, "loss": 0.5501, "step": 117 }, { "epoch": 0.24155578300921188, "grad_norm": 0.7261199143030841, "learning_rate": 1.934426229508197e-05, "loss": 0.5564, "step": 118 }, { "epoch": 0.2436028659160696, "grad_norm": 0.6467257123886485, "learning_rate": 1.9508196721311475e-05, "loss": 0.5307, "step": 119 }, { "epoch": 0.24564994882292732, "grad_norm": 0.7743559427761539, "learning_rate": 1.9672131147540985e-05, "loss": 0.4867, "step": 120 }, { "epoch": 0.24769703172978505, "grad_norm": 0.5777069325137312, "learning_rate": 1.9836065573770492e-05, "loss": 0.5235, "step": 121 }, { "epoch": 0.24974411463664278, "grad_norm": 0.7434807781935519, "learning_rate": 2e-05, "loss": 0.524, "step": 122 }, { "epoch": 0.2517911975435005, "grad_norm": 0.6635906710416195, "learning_rate": 2.0163934426229508e-05, "loss": 0.4759, "step": 123 }, { "epoch": 0.2538382804503582, "grad_norm": 0.6164296619684109, "learning_rate": 2.0327868852459018e-05, "loss": 0.4925, "step": 124 }, { "epoch": 0.25588536335721596, "grad_norm": 0.5800314323412163, "learning_rate": 2.0491803278688525e-05, "loss": 0.536, "step": 125 }, { "epoch": 0.2579324462640737, "grad_norm": 0.6910504700298034, "learning_rate": 2.0655737704918034e-05, "loss": 0.5937, "step": 126 }, { "epoch": 0.2599795291709314, "grad_norm": 0.5724201379088258, "learning_rate": 2.081967213114754e-05, "loss": 0.5019, "step": 127 }, { "epoch": 0.26202661207778916, "grad_norm": 0.7529480886936586, "learning_rate": 2.098360655737705e-05, "loss": 0.5643, "step": 128 }, { "epoch": 0.2640736949846469, "grad_norm": 0.6170979184951091, "learning_rate": 2.1147540983606557e-05, "loss": 0.5449, "step": 129 }, { "epoch": 0.2661207778915046, "grad_norm": 0.6541686452612759, "learning_rate": 2.1311475409836067e-05, "loss": 0.5377, "step": 130 }, { "epoch": 0.26816786079836236, "grad_norm": 0.7744689267947779, "learning_rate": 2.1475409836065574e-05, "loss": 0.5846, "step": 131 }, { "epoch": 0.27021494370522003, "grad_norm": 0.5945036689673261, "learning_rate": 2.1639344262295087e-05, "loss": 0.5283, "step": 132 }, { "epoch": 0.27226202661207777, "grad_norm": 0.661953998440321, "learning_rate": 2.180327868852459e-05, "loss": 0.557, "step": 133 }, { "epoch": 0.2743091095189355, "grad_norm": 0.5763477769339282, "learning_rate": 2.1967213114754104e-05, "loss": 0.511, "step": 134 }, { "epoch": 0.27635619242579323, "grad_norm": 0.5514346276124723, "learning_rate": 2.213114754098361e-05, "loss": 0.5182, "step": 135 }, { "epoch": 0.27840327533265097, "grad_norm": 0.5695095404976926, "learning_rate": 2.2295081967213113e-05, "loss": 0.5146, "step": 136 }, { "epoch": 0.2804503582395087, "grad_norm": 0.5986561008583392, "learning_rate": 2.2459016393442626e-05, "loss": 0.5083, "step": 137 }, { "epoch": 0.28249744114636643, "grad_norm": 0.5610678921247985, "learning_rate": 2.2622950819672133e-05, "loss": 0.5313, "step": 138 }, { "epoch": 0.28454452405322417, "grad_norm": 0.6049817345717909, "learning_rate": 2.2786885245901643e-05, "loss": 0.5052, "step": 139 }, { "epoch": 0.2865916069600819, "grad_norm": 0.5464694050777636, "learning_rate": 2.295081967213115e-05, "loss": 0.5371, "step": 140 }, { "epoch": 0.28863868986693964, "grad_norm": 0.6288012925738132, "learning_rate": 2.311475409836066e-05, "loss": 0.4964, "step": 141 }, { "epoch": 0.2906857727737973, "grad_norm": 0.6292583801203022, "learning_rate": 2.3278688524590166e-05, "loss": 0.524, "step": 142 }, { "epoch": 0.29273285568065505, "grad_norm": 0.5989675634399306, "learning_rate": 2.3442622950819676e-05, "loss": 0.5347, "step": 143 }, { "epoch": 0.2947799385875128, "grad_norm": 0.711475328293837, "learning_rate": 2.3606557377049182e-05, "loss": 0.5212, "step": 144 }, { "epoch": 0.2968270214943705, "grad_norm": 0.6147945346885277, "learning_rate": 2.3770491803278692e-05, "loss": 0.536, "step": 145 }, { "epoch": 0.29887410440122825, "grad_norm": 0.7665801712563535, "learning_rate": 2.39344262295082e-05, "loss": 0.506, "step": 146 }, { "epoch": 0.300921187308086, "grad_norm": 0.5675791883302023, "learning_rate": 2.4098360655737705e-05, "loss": 0.5165, "step": 147 }, { "epoch": 0.3029682702149437, "grad_norm": 0.6749613199444409, "learning_rate": 2.4262295081967215e-05, "loss": 0.5353, "step": 148 }, { "epoch": 0.30501535312180145, "grad_norm": 0.5449017130362618, "learning_rate": 2.442622950819672e-05, "loss": 0.5378, "step": 149 }, { "epoch": 0.3070624360286592, "grad_norm": 0.6201210295020051, "learning_rate": 2.459016393442623e-05, "loss": 0.5094, "step": 150 }, { "epoch": 0.3091095189355169, "grad_norm": 0.577151136698421, "learning_rate": 2.4754098360655738e-05, "loss": 0.5196, "step": 151 }, { "epoch": 0.3111566018423746, "grad_norm": 0.5313033735643246, "learning_rate": 2.4918032786885248e-05, "loss": 0.4937, "step": 152 }, { "epoch": 0.3132036847492323, "grad_norm": 0.5101509817549174, "learning_rate": 2.5081967213114754e-05, "loss": 0.4788, "step": 153 }, { "epoch": 0.31525076765609006, "grad_norm": 0.6444658936554309, "learning_rate": 2.5245901639344264e-05, "loss": 0.5275, "step": 154 }, { "epoch": 0.3172978505629478, "grad_norm": 0.44154622012146943, "learning_rate": 2.540983606557377e-05, "loss": 0.4861, "step": 155 }, { "epoch": 0.3193449334698055, "grad_norm": 0.608818705758696, "learning_rate": 2.5573770491803284e-05, "loss": 0.5071, "step": 156 }, { "epoch": 0.32139201637666326, "grad_norm": 0.616456605045568, "learning_rate": 2.5737704918032787e-05, "loss": 0.5195, "step": 157 }, { "epoch": 0.323439099283521, "grad_norm": 0.67099003512138, "learning_rate": 2.59016393442623e-05, "loss": 0.5523, "step": 158 }, { "epoch": 0.3254861821903787, "grad_norm": 0.6144728851907352, "learning_rate": 2.6065573770491807e-05, "loss": 0.5451, "step": 159 }, { "epoch": 0.32753326509723646, "grad_norm": 0.6323429696875369, "learning_rate": 2.622950819672131e-05, "loss": 0.5309, "step": 160 }, { "epoch": 0.3295803480040942, "grad_norm": 0.5465209069864494, "learning_rate": 2.6393442622950824e-05, "loss": 0.5342, "step": 161 }, { "epoch": 0.33162743091095187, "grad_norm": 0.7352587986716179, "learning_rate": 2.655737704918033e-05, "loss": 0.5181, "step": 162 }, { "epoch": 0.3336745138178096, "grad_norm": 0.5130967329382123, "learning_rate": 2.672131147540984e-05, "loss": 0.5074, "step": 163 }, { "epoch": 0.33572159672466734, "grad_norm": 0.6870511656785425, "learning_rate": 2.6885245901639346e-05, "loss": 0.5603, "step": 164 }, { "epoch": 0.33776867963152507, "grad_norm": 0.7048410066657461, "learning_rate": 2.7049180327868856e-05, "loss": 0.5272, "step": 165 }, { "epoch": 0.3398157625383828, "grad_norm": 0.8082743167202432, "learning_rate": 2.7213114754098363e-05, "loss": 0.5196, "step": 166 }, { "epoch": 0.34186284544524054, "grad_norm": 0.6901700346930703, "learning_rate": 2.7377049180327873e-05, "loss": 0.5439, "step": 167 }, { "epoch": 0.34390992835209827, "grad_norm": 0.6543817136595013, "learning_rate": 2.754098360655738e-05, "loss": 0.5375, "step": 168 }, { "epoch": 0.345957011258956, "grad_norm": 0.6015689406714214, "learning_rate": 2.770491803278689e-05, "loss": 0.5014, "step": 169 }, { "epoch": 0.34800409416581374, "grad_norm": 0.658713714501858, "learning_rate": 2.7868852459016396e-05, "loss": 0.5262, "step": 170 }, { "epoch": 0.3500511770726714, "grad_norm": 0.5872370736842677, "learning_rate": 2.8032786885245902e-05, "loss": 0.5454, "step": 171 }, { "epoch": 0.35209825997952915, "grad_norm": 0.5434995669507285, "learning_rate": 2.8196721311475412e-05, "loss": 0.562, "step": 172 }, { "epoch": 0.3541453428863869, "grad_norm": 0.7617505474169058, "learning_rate": 2.836065573770492e-05, "loss": 0.5147, "step": 173 }, { "epoch": 0.3561924257932446, "grad_norm": 0.6940517866702995, "learning_rate": 2.852459016393443e-05, "loss": 0.561, "step": 174 }, { "epoch": 0.35823950870010235, "grad_norm": 0.6026045944898959, "learning_rate": 2.8688524590163935e-05, "loss": 0.5294, "step": 175 }, { "epoch": 0.3602865916069601, "grad_norm": 0.685003134185584, "learning_rate": 2.8852459016393445e-05, "loss": 0.5086, "step": 176 }, { "epoch": 0.3623336745138178, "grad_norm": 0.7007920129863516, "learning_rate": 2.901639344262295e-05, "loss": 0.5021, "step": 177 }, { "epoch": 0.36438075742067555, "grad_norm": 0.7355272145052204, "learning_rate": 2.918032786885246e-05, "loss": 0.5007, "step": 178 }, { "epoch": 0.3664278403275333, "grad_norm": 0.6487776776109249, "learning_rate": 2.9344262295081968e-05, "loss": 0.5524, "step": 179 }, { "epoch": 0.368474923234391, "grad_norm": 0.6902433390057133, "learning_rate": 2.9508196721311478e-05, "loss": 0.5142, "step": 180 }, { "epoch": 0.3705220061412487, "grad_norm": 0.6867948754438059, "learning_rate": 2.9672131147540984e-05, "loss": 0.5187, "step": 181 }, { "epoch": 0.3725690890481064, "grad_norm": 0.6159722804543436, "learning_rate": 2.9836065573770498e-05, "loss": 0.5508, "step": 182 }, { "epoch": 0.37461617195496416, "grad_norm": 0.7696599736408124, "learning_rate": 3.0000000000000004e-05, "loss": 0.5348, "step": 183 }, { "epoch": 0.3766632548618219, "grad_norm": 0.6954090228635682, "learning_rate": 3.0163934426229507e-05, "loss": 0.5447, "step": 184 }, { "epoch": 0.37871033776867963, "grad_norm": 0.6435121902381328, "learning_rate": 3.032786885245902e-05, "loss": 0.5474, "step": 185 }, { "epoch": 0.38075742067553736, "grad_norm": 0.8038288638214993, "learning_rate": 3.0491803278688527e-05, "loss": 0.5559, "step": 186 }, { "epoch": 0.3828045035823951, "grad_norm": 0.688956722367706, "learning_rate": 3.065573770491804e-05, "loss": 0.4875, "step": 187 }, { "epoch": 0.38485158648925283, "grad_norm": 0.7159386414570686, "learning_rate": 3.0819672131147544e-05, "loss": 0.5499, "step": 188 }, { "epoch": 0.38689866939611056, "grad_norm": 0.9285947357940207, "learning_rate": 3.098360655737705e-05, "loss": 0.5144, "step": 189 }, { "epoch": 0.3889457523029683, "grad_norm": 0.6303078015541651, "learning_rate": 3.1147540983606557e-05, "loss": 0.6274, "step": 190 }, { "epoch": 0.390992835209826, "grad_norm": 0.8868869883770375, "learning_rate": 3.131147540983607e-05, "loss": 0.5261, "step": 191 }, { "epoch": 0.3930399181166837, "grad_norm": 0.8238279905547772, "learning_rate": 3.1475409836065576e-05, "loss": 0.5284, "step": 192 }, { "epoch": 0.39508700102354144, "grad_norm": 0.7109735935815616, "learning_rate": 3.163934426229509e-05, "loss": 0.4981, "step": 193 }, { "epoch": 0.3971340839303992, "grad_norm": 0.8555381934750876, "learning_rate": 3.180327868852459e-05, "loss": 0.5184, "step": 194 }, { "epoch": 0.3991811668372569, "grad_norm": 0.7950926694758862, "learning_rate": 3.19672131147541e-05, "loss": 0.5286, "step": 195 }, { "epoch": 0.40122824974411464, "grad_norm": 0.6893785135479137, "learning_rate": 3.213114754098361e-05, "loss": 0.5048, "step": 196 }, { "epoch": 0.4032753326509724, "grad_norm": 0.8285127939060417, "learning_rate": 3.2295081967213116e-05, "loss": 0.5696, "step": 197 }, { "epoch": 0.4053224155578301, "grad_norm": 0.6476667354794096, "learning_rate": 3.245901639344263e-05, "loss": 0.5175, "step": 198 }, { "epoch": 0.40736949846468784, "grad_norm": 0.9133932587744947, "learning_rate": 3.2622950819672136e-05, "loss": 0.5426, "step": 199 }, { "epoch": 0.4094165813715456, "grad_norm": 0.7532290387705938, "learning_rate": 3.278688524590164e-05, "loss": 0.5087, "step": 200 }, { "epoch": 0.41146366427840325, "grad_norm": 0.9389363029784095, "learning_rate": 3.295081967213115e-05, "loss": 0.509, "step": 201 }, { "epoch": 0.413510747185261, "grad_norm": 0.5686252927682621, "learning_rate": 3.311475409836066e-05, "loss": 0.5193, "step": 202 }, { "epoch": 0.4155578300921187, "grad_norm": 0.8087812079712846, "learning_rate": 3.327868852459017e-05, "loss": 0.5197, "step": 203 }, { "epoch": 0.41760491299897645, "grad_norm": 0.4881097207653879, "learning_rate": 3.3442622950819675e-05, "loss": 0.5044, "step": 204 }, { "epoch": 0.4196519959058342, "grad_norm": 0.7274067002262758, "learning_rate": 3.360655737704918e-05, "loss": 0.4974, "step": 205 }, { "epoch": 0.4216990788126919, "grad_norm": 0.5458714268885722, "learning_rate": 3.3770491803278695e-05, "loss": 0.5017, "step": 206 }, { "epoch": 0.42374616171954965, "grad_norm": 0.7035504070202385, "learning_rate": 3.39344262295082e-05, "loss": 0.4983, "step": 207 }, { "epoch": 0.4257932446264074, "grad_norm": 0.5269572768909501, "learning_rate": 3.409836065573771e-05, "loss": 0.5484, "step": 208 }, { "epoch": 0.4278403275332651, "grad_norm": 0.5550435466705637, "learning_rate": 3.4262295081967214e-05, "loss": 0.5152, "step": 209 }, { "epoch": 0.42988741044012285, "grad_norm": 0.5658118678239209, "learning_rate": 3.442622950819672e-05, "loss": 0.529, "step": 210 }, { "epoch": 0.43193449334698053, "grad_norm": 0.4274306239416562, "learning_rate": 3.4590163934426234e-05, "loss": 0.547, "step": 211 }, { "epoch": 0.43398157625383826, "grad_norm": 0.6498291018145316, "learning_rate": 3.475409836065574e-05, "loss": 0.5255, "step": 212 }, { "epoch": 0.436028659160696, "grad_norm": 0.4862132754376562, "learning_rate": 3.491803278688525e-05, "loss": 0.5407, "step": 213 }, { "epoch": 0.43807574206755373, "grad_norm": 0.7345334765463367, "learning_rate": 3.5081967213114754e-05, "loss": 0.5467, "step": 214 }, { "epoch": 0.44012282497441146, "grad_norm": 0.6199950312514801, "learning_rate": 3.524590163934427e-05, "loss": 0.5599, "step": 215 }, { "epoch": 0.4421699078812692, "grad_norm": 0.7074691063675598, "learning_rate": 3.5409836065573773e-05, "loss": 0.5127, "step": 216 }, { "epoch": 0.44421699078812693, "grad_norm": 0.6229358703039948, "learning_rate": 3.557377049180329e-05, "loss": 0.5186, "step": 217 }, { "epoch": 0.44626407369498466, "grad_norm": 0.6395887726511317, "learning_rate": 3.5737704918032786e-05, "loss": 0.4989, "step": 218 }, { "epoch": 0.4483111566018424, "grad_norm": 0.5458826332923155, "learning_rate": 3.59016393442623e-05, "loss": 0.5008, "step": 219 }, { "epoch": 0.4503582395087001, "grad_norm": 0.6729015313127311, "learning_rate": 3.6065573770491806e-05, "loss": 0.5122, "step": 220 }, { "epoch": 0.4524053224155578, "grad_norm": 0.7752024606600724, "learning_rate": 3.622950819672131e-05, "loss": 0.5018, "step": 221 }, { "epoch": 0.45445240532241554, "grad_norm": 0.6141567713992134, "learning_rate": 3.6393442622950826e-05, "loss": 0.4921, "step": 222 }, { "epoch": 0.4564994882292733, "grad_norm": 0.6807359145513986, "learning_rate": 3.655737704918033e-05, "loss": 0.5041, "step": 223 }, { "epoch": 0.458546571136131, "grad_norm": 0.5856826658124886, "learning_rate": 3.672131147540984e-05, "loss": 0.5281, "step": 224 }, { "epoch": 0.46059365404298874, "grad_norm": 0.6528386749398997, "learning_rate": 3.6885245901639346e-05, "loss": 0.4862, "step": 225 }, { "epoch": 0.4626407369498465, "grad_norm": 0.8760949897414592, "learning_rate": 3.704918032786886e-05, "loss": 0.5313, "step": 226 }, { "epoch": 0.4646878198567042, "grad_norm": 0.7133493187489152, "learning_rate": 3.7213114754098365e-05, "loss": 0.5611, "step": 227 }, { "epoch": 0.46673490276356194, "grad_norm": 0.6514391258645618, "learning_rate": 3.737704918032787e-05, "loss": 0.5017, "step": 228 }, { "epoch": 0.4687819856704197, "grad_norm": 0.712386378258888, "learning_rate": 3.754098360655738e-05, "loss": 0.4801, "step": 229 }, { "epoch": 0.47082906857727735, "grad_norm": 0.7933509165191774, "learning_rate": 3.770491803278689e-05, "loss": 0.5265, "step": 230 }, { "epoch": 0.4728761514841351, "grad_norm": 0.6393959136262052, "learning_rate": 3.78688524590164e-05, "loss": 0.533, "step": 231 }, { "epoch": 0.4749232343909928, "grad_norm": 0.8336237173823177, "learning_rate": 3.8032786885245905e-05, "loss": 0.5067, "step": 232 }, { "epoch": 0.47697031729785055, "grad_norm": 0.7946625026406952, "learning_rate": 3.819672131147541e-05, "loss": 0.5572, "step": 233 }, { "epoch": 0.4790174002047083, "grad_norm": 0.7448514333498657, "learning_rate": 3.836065573770492e-05, "loss": 0.5267, "step": 234 }, { "epoch": 0.481064483111566, "grad_norm": 0.6216449013018147, "learning_rate": 3.852459016393443e-05, "loss": 0.5007, "step": 235 }, { "epoch": 0.48311156601842375, "grad_norm": 0.5847497681971316, "learning_rate": 3.868852459016394e-05, "loss": 0.5465, "step": 236 }, { "epoch": 0.4851586489252815, "grad_norm": 0.5930045841712915, "learning_rate": 3.8852459016393444e-05, "loss": 0.5498, "step": 237 }, { "epoch": 0.4872057318321392, "grad_norm": 0.6242247074949386, "learning_rate": 3.901639344262295e-05, "loss": 0.5032, "step": 238 }, { "epoch": 0.48925281473899696, "grad_norm": 0.5315372089751544, "learning_rate": 3.9180327868852464e-05, "loss": 0.5351, "step": 239 }, { "epoch": 0.49129989764585463, "grad_norm": 0.5976493830078852, "learning_rate": 3.934426229508197e-05, "loss": 0.5511, "step": 240 }, { "epoch": 0.49334698055271237, "grad_norm": 0.5263959833035169, "learning_rate": 3.950819672131148e-05, "loss": 0.5095, "step": 241 }, { "epoch": 0.4953940634595701, "grad_norm": 0.5164961935996712, "learning_rate": 3.9672131147540983e-05, "loss": 0.4903, "step": 242 }, { "epoch": 0.49744114636642783, "grad_norm": 0.5030461231666816, "learning_rate": 3.98360655737705e-05, "loss": 0.5246, "step": 243 }, { "epoch": 0.49948822927328557, "grad_norm": 0.5199256237042372, "learning_rate": 4e-05, "loss": 0.527, "step": 244 }, { "epoch": 0.5015353121801432, "grad_norm": 0.5720072000502151, "learning_rate": 3.999997953390434e-05, "loss": 0.4698, "step": 245 }, { "epoch": 0.503582395087001, "grad_norm": 0.4723958838410465, "learning_rate": 3.999991813565924e-05, "loss": 0.5021, "step": 246 }, { "epoch": 0.5056294779938587, "grad_norm": 0.6664032925560375, "learning_rate": 3.999981580539036e-05, "loss": 0.5195, "step": 247 }, { "epoch": 0.5076765609007164, "grad_norm": 0.46934376217932583, "learning_rate": 3.999967254330713e-05, "loss": 0.4915, "step": 248 }, { "epoch": 0.5097236438075742, "grad_norm": 0.5471916125552302, "learning_rate": 3.999948834970275e-05, "loss": 0.5395, "step": 249 }, { "epoch": 0.5117707267144319, "grad_norm": 0.5322293661429813, "learning_rate": 3.9999263224954204e-05, "loss": 0.5156, "step": 250 }, { "epoch": 0.5138178096212896, "grad_norm": 0.48934414004740173, "learning_rate": 3.999899716952221e-05, "loss": 0.505, "step": 251 }, { "epoch": 0.5158648925281474, "grad_norm": 0.6841239054987143, "learning_rate": 3.9998690183951304e-05, "loss": 0.517, "step": 252 }, { "epoch": 0.5179119754350051, "grad_norm": 0.6081072200654224, "learning_rate": 3.999834226886976e-05, "loss": 0.5209, "step": 253 }, { "epoch": 0.5199590583418628, "grad_norm": 0.591682811543655, "learning_rate": 3.999795342498961e-05, "loss": 0.5144, "step": 254 }, { "epoch": 0.5220061412487206, "grad_norm": 0.644975243350573, "learning_rate": 3.999752365310668e-05, "loss": 0.5285, "step": 255 }, { "epoch": 0.5240532241555783, "grad_norm": 0.5648625378625047, "learning_rate": 3.999705295410054e-05, "loss": 0.493, "step": 256 }, { "epoch": 0.526100307062436, "grad_norm": 0.6130167811037579, "learning_rate": 3.999654132893453e-05, "loss": 0.5257, "step": 257 }, { "epoch": 0.5281473899692938, "grad_norm": 0.5037937329537826, "learning_rate": 3.999598877865575e-05, "loss": 0.4947, "step": 258 }, { "epoch": 0.5301944728761515, "grad_norm": 0.6388452684007601, "learning_rate": 3.999539530439504e-05, "loss": 0.5319, "step": 259 }, { "epoch": 0.5322415557830092, "grad_norm": 0.5304888345319132, "learning_rate": 3.9994760907367025e-05, "loss": 0.5239, "step": 260 }, { "epoch": 0.534288638689867, "grad_norm": 0.548729152916564, "learning_rate": 3.999408558887006e-05, "loss": 0.5182, "step": 261 }, { "epoch": 0.5363357215967247, "grad_norm": 0.4946265440615839, "learning_rate": 3.9993369350286265e-05, "loss": 0.5211, "step": 262 }, { "epoch": 0.5383828045035824, "grad_norm": 0.5513131572470374, "learning_rate": 3.999261219308149e-05, "loss": 0.4922, "step": 263 }, { "epoch": 0.5404298874104401, "grad_norm": 0.5628821503706624, "learning_rate": 3.999181411880536e-05, "loss": 0.4833, "step": 264 }, { "epoch": 0.5424769703172978, "grad_norm": 0.6321595406631201, "learning_rate": 3.99909751290912e-05, "loss": 0.5156, "step": 265 }, { "epoch": 0.5445240532241555, "grad_norm": 0.49452183826279106, "learning_rate": 3.9990095225656104e-05, "loss": 0.4918, "step": 266 }, { "epoch": 0.5465711361310133, "grad_norm": 0.5983835992691791, "learning_rate": 3.998917441030089e-05, "loss": 0.534, "step": 267 }, { "epoch": 0.548618219037871, "grad_norm": 0.5392668568126767, "learning_rate": 3.9988212684910107e-05, "loss": 0.4919, "step": 268 }, { "epoch": 0.5506653019447287, "grad_norm": 0.5271122702751097, "learning_rate": 3.998721005145204e-05, "loss": 0.5023, "step": 269 }, { "epoch": 0.5527123848515865, "grad_norm": 0.613359179011921, "learning_rate": 3.998616651197867e-05, "loss": 0.5123, "step": 270 }, { "epoch": 0.5547594677584442, "grad_norm": 0.5885866628158287, "learning_rate": 3.9985082068625724e-05, "loss": 0.5192, "step": 271 }, { "epoch": 0.5568065506653019, "grad_norm": 0.5458709549332362, "learning_rate": 3.998395672361264e-05, "loss": 0.5159, "step": 272 }, { "epoch": 0.5588536335721597, "grad_norm": 0.47718089897345783, "learning_rate": 3.998279047924255e-05, "loss": 0.4571, "step": 273 }, { "epoch": 0.5609007164790174, "grad_norm": 0.5918739989559765, "learning_rate": 3.998158333790231e-05, "loss": 0.5093, "step": 274 }, { "epoch": 0.5629477993858751, "grad_norm": 0.6171028421078789, "learning_rate": 3.998033530206246e-05, "loss": 0.5174, "step": 275 }, { "epoch": 0.5649948822927329, "grad_norm": 0.6049929559700232, "learning_rate": 3.9979046374277246e-05, "loss": 0.5292, "step": 276 }, { "epoch": 0.5670419651995906, "grad_norm": 0.6353516580703451, "learning_rate": 3.99777165571846e-05, "loss": 0.5202, "step": 277 }, { "epoch": 0.5690890481064483, "grad_norm": 0.542126012081921, "learning_rate": 3.997634585350614e-05, "loss": 0.515, "step": 278 }, { "epoch": 0.5711361310133061, "grad_norm": 0.6577209989909311, "learning_rate": 3.997493426604715e-05, "loss": 0.4827, "step": 279 }, { "epoch": 0.5731832139201638, "grad_norm": 0.5786055028419322, "learning_rate": 3.997348179769661e-05, "loss": 0.4984, "step": 280 }, { "epoch": 0.5752302968270215, "grad_norm": 0.5693454757360409, "learning_rate": 3.9971988451427155e-05, "loss": 0.4795, "step": 281 }, { "epoch": 0.5772773797338793, "grad_norm": 0.523219696074873, "learning_rate": 3.997045423029508e-05, "loss": 0.5288, "step": 282 }, { "epoch": 0.579324462640737, "grad_norm": 0.5914665747076296, "learning_rate": 3.996887913744033e-05, "loss": 0.5412, "step": 283 }, { "epoch": 0.5813715455475946, "grad_norm": 0.504747034534071, "learning_rate": 3.996726317608652e-05, "loss": 0.5119, "step": 284 }, { "epoch": 0.5834186284544524, "grad_norm": 0.6330512274369553, "learning_rate": 3.996560634954088e-05, "loss": 0.5504, "step": 285 }, { "epoch": 0.5854657113613101, "grad_norm": 0.5570244494354821, "learning_rate": 3.9963908661194285e-05, "loss": 0.5323, "step": 286 }, { "epoch": 0.5875127942681678, "grad_norm": 0.7110270240629256, "learning_rate": 3.9962170114521246e-05, "loss": 0.5086, "step": 287 }, { "epoch": 0.5895598771750256, "grad_norm": 0.5503143307537267, "learning_rate": 3.996039071307989e-05, "loss": 0.5451, "step": 288 }, { "epoch": 0.5916069600818833, "grad_norm": 0.6551750805922822, "learning_rate": 3.995857046051196e-05, "loss": 0.5375, "step": 289 }, { "epoch": 0.593654042988741, "grad_norm": 0.625258001738531, "learning_rate": 3.995670936054279e-05, "loss": 0.5241, "step": 290 }, { "epoch": 0.5957011258955988, "grad_norm": 0.6762376840884802, "learning_rate": 3.9954807416981335e-05, "loss": 0.5049, "step": 291 }, { "epoch": 0.5977482088024565, "grad_norm": 0.6711303804476403, "learning_rate": 3.995286463372013e-05, "loss": 0.5117, "step": 292 }, { "epoch": 0.5997952917093142, "grad_norm": 0.5151264552712931, "learning_rate": 3.9950881014735295e-05, "loss": 0.5053, "step": 293 }, { "epoch": 0.601842374616172, "grad_norm": 0.663266741433607, "learning_rate": 3.994885656408651e-05, "loss": 0.4948, "step": 294 }, { "epoch": 0.6038894575230297, "grad_norm": 0.567833717196775, "learning_rate": 3.994679128591706e-05, "loss": 0.5566, "step": 295 }, { "epoch": 0.6059365404298874, "grad_norm": 0.5899154422863364, "learning_rate": 3.9944685184453746e-05, "loss": 0.53, "step": 296 }, { "epoch": 0.6079836233367452, "grad_norm": 0.5758493539965378, "learning_rate": 3.994253826400693e-05, "loss": 0.5314, "step": 297 }, { "epoch": 0.6100307062436029, "grad_norm": 0.5296826695926912, "learning_rate": 3.9940350528970535e-05, "loss": 0.5116, "step": 298 }, { "epoch": 0.6120777891504606, "grad_norm": 0.5290847709355387, "learning_rate": 3.993812198382199e-05, "loss": 0.5028, "step": 299 }, { "epoch": 0.6141248720573184, "grad_norm": 0.552336105606733, "learning_rate": 3.993585263312227e-05, "loss": 0.5202, "step": 300 }, { "epoch": 0.6161719549641761, "grad_norm": 0.5293716014410816, "learning_rate": 3.993354248151583e-05, "loss": 0.4912, "step": 301 }, { "epoch": 0.6182190378710338, "grad_norm": 0.6609476283232325, "learning_rate": 3.993119153373067e-05, "loss": 0.5438, "step": 302 }, { "epoch": 0.6202661207778914, "grad_norm": 0.5469203904739622, "learning_rate": 3.992879979457824e-05, "loss": 0.5123, "step": 303 }, { "epoch": 0.6223132036847492, "grad_norm": 0.6036405520109454, "learning_rate": 3.9926367268953514e-05, "loss": 0.5047, "step": 304 }, { "epoch": 0.6243602865916069, "grad_norm": 0.5523585337922061, "learning_rate": 3.9923893961834914e-05, "loss": 0.5126, "step": 305 }, { "epoch": 0.6264073694984647, "grad_norm": 0.5060340859748851, "learning_rate": 3.992137987828434e-05, "loss": 0.512, "step": 306 }, { "epoch": 0.6284544524053224, "grad_norm": 0.4931821446334137, "learning_rate": 3.991882502344712e-05, "loss": 0.5086, "step": 307 }, { "epoch": 0.6305015353121801, "grad_norm": 0.49318765086607474, "learning_rate": 3.991622940255208e-05, "loss": 0.4928, "step": 308 }, { "epoch": 0.6325486182190379, "grad_norm": 0.47444503281787836, "learning_rate": 3.991359302091141e-05, "loss": 0.4823, "step": 309 }, { "epoch": 0.6345957011258956, "grad_norm": 0.5837217244432208, "learning_rate": 3.991091588392077e-05, "loss": 0.553, "step": 310 }, { "epoch": 0.6366427840327533, "grad_norm": 0.5943267226346076, "learning_rate": 3.99081979970592e-05, "loss": 0.5191, "step": 311 }, { "epoch": 0.638689866939611, "grad_norm": 0.4692619037725303, "learning_rate": 3.9905439365889176e-05, "loss": 0.4833, "step": 312 }, { "epoch": 0.6407369498464688, "grad_norm": 0.5017562232149083, "learning_rate": 3.990263999605652e-05, "loss": 0.4932, "step": 313 }, { "epoch": 0.6427840327533265, "grad_norm": 0.4800449124898955, "learning_rate": 3.989979989329046e-05, "loss": 0.5475, "step": 314 }, { "epoch": 0.6448311156601843, "grad_norm": 0.7077261651500286, "learning_rate": 3.9896919063403567e-05, "loss": 0.5656, "step": 315 }, { "epoch": 0.646878198567042, "grad_norm": 0.4705479904386771, "learning_rate": 3.989399751229179e-05, "loss": 0.4812, "step": 316 }, { "epoch": 0.6489252814738997, "grad_norm": 0.6188237485091818, "learning_rate": 3.989103524593439e-05, "loss": 0.5448, "step": 317 }, { "epoch": 0.6509723643807575, "grad_norm": 0.5875009833381106, "learning_rate": 3.9888032270393966e-05, "loss": 0.5391, "step": 318 }, { "epoch": 0.6530194472876152, "grad_norm": 0.6367560756626307, "learning_rate": 3.988498859181645e-05, "loss": 0.5857, "step": 319 }, { "epoch": 0.6550665301944729, "grad_norm": 0.485432763118475, "learning_rate": 3.988190421643105e-05, "loss": 0.4775, "step": 320 }, { "epoch": 0.6571136131013307, "grad_norm": 0.6730462808178248, "learning_rate": 3.9878779150550306e-05, "loss": 0.4953, "step": 321 }, { "epoch": 0.6591606960081884, "grad_norm": 0.6111893546543505, "learning_rate": 3.9875613400569975e-05, "loss": 0.4593, "step": 322 }, { "epoch": 0.661207778915046, "grad_norm": 0.7191482927871957, "learning_rate": 3.987240697296912e-05, "loss": 0.4943, "step": 323 }, { "epoch": 0.6632548618219037, "grad_norm": 0.6827606638555512, "learning_rate": 3.986915987431006e-05, "loss": 0.5312, "step": 324 }, { "epoch": 0.6653019447287615, "grad_norm": 0.6385480452392281, "learning_rate": 3.986587211123833e-05, "loss": 0.5066, "step": 325 }, { "epoch": 0.6673490276356192, "grad_norm": 0.5586591391727767, "learning_rate": 3.986254369048268e-05, "loss": 0.519, "step": 326 }, { "epoch": 0.6693961105424769, "grad_norm": 0.7119031753948682, "learning_rate": 3.985917461885512e-05, "loss": 0.526, "step": 327 }, { "epoch": 0.6714431934493347, "grad_norm": 0.403690119122138, "learning_rate": 3.98557649032508e-05, "loss": 0.4941, "step": 328 }, { "epoch": 0.6734902763561924, "grad_norm": 0.711228959963807, "learning_rate": 3.985231455064809e-05, "loss": 0.5161, "step": 329 }, { "epoch": 0.6755373592630501, "grad_norm": 0.6152056684018806, "learning_rate": 3.9848823568108515e-05, "loss": 0.5252, "step": 330 }, { "epoch": 0.6775844421699079, "grad_norm": 0.6811387482564264, "learning_rate": 3.984529196277674e-05, "loss": 0.5153, "step": 331 }, { "epoch": 0.6796315250767656, "grad_norm": 0.715472760713951, "learning_rate": 3.9841719741880583e-05, "loss": 0.5136, "step": 332 }, { "epoch": 0.6816786079836233, "grad_norm": 0.6117081033404244, "learning_rate": 3.9838106912731e-05, "loss": 0.5056, "step": 333 }, { "epoch": 0.6837256908904811, "grad_norm": 0.5640133468287892, "learning_rate": 3.983445348272203e-05, "loss": 0.5022, "step": 334 }, { "epoch": 0.6857727737973388, "grad_norm": 0.5708874168642846, "learning_rate": 3.983075945933083e-05, "loss": 0.5063, "step": 335 }, { "epoch": 0.6878198567041965, "grad_norm": 0.5178591256000558, "learning_rate": 3.9827024850117606e-05, "loss": 0.5067, "step": 336 }, { "epoch": 0.6898669396110543, "grad_norm": 0.5425146505052523, "learning_rate": 3.982324966272566e-05, "loss": 0.5112, "step": 337 }, { "epoch": 0.691914022517912, "grad_norm": 0.4545857800127313, "learning_rate": 3.9819433904881324e-05, "loss": 0.4987, "step": 338 }, { "epoch": 0.6939611054247697, "grad_norm": 0.5538231712121938, "learning_rate": 3.981557758439396e-05, "loss": 0.5174, "step": 339 }, { "epoch": 0.6960081883316275, "grad_norm": 0.516910525115318, "learning_rate": 3.981168070915594e-05, "loss": 0.5205, "step": 340 }, { "epoch": 0.6980552712384852, "grad_norm": 0.5341261157883666, "learning_rate": 3.980774328714267e-05, "loss": 0.5406, "step": 341 }, { "epoch": 0.7001023541453428, "grad_norm": 0.5219326420859508, "learning_rate": 3.9803765326412506e-05, "loss": 0.5491, "step": 342 }, { "epoch": 0.7021494370522006, "grad_norm": 0.5126045854291318, "learning_rate": 3.979974683510677e-05, "loss": 0.5507, "step": 343 }, { "epoch": 0.7041965199590583, "grad_norm": 0.5442853681992302, "learning_rate": 3.9795687821449754e-05, "loss": 0.5136, "step": 344 }, { "epoch": 0.706243602865916, "grad_norm": 0.4580135182917517, "learning_rate": 3.9791588293748676e-05, "loss": 0.4908, "step": 345 }, { "epoch": 0.7082906857727738, "grad_norm": 0.5632074896697251, "learning_rate": 3.978744826039366e-05, "loss": 0.5046, "step": 346 }, { "epoch": 0.7103377686796315, "grad_norm": 0.4381200556890962, "learning_rate": 3.9783267729857756e-05, "loss": 0.4994, "step": 347 }, { "epoch": 0.7123848515864892, "grad_norm": 0.5608491796041225, "learning_rate": 3.9779046710696854e-05, "loss": 0.4813, "step": 348 }, { "epoch": 0.714431934493347, "grad_norm": 0.4726444333140026, "learning_rate": 3.977478521154974e-05, "loss": 0.487, "step": 349 }, { "epoch": 0.7164790174002047, "grad_norm": 0.5138973186415984, "learning_rate": 3.977048324113805e-05, "loss": 0.5418, "step": 350 }, { "epoch": 0.7185261003070624, "grad_norm": 0.48636953697879376, "learning_rate": 3.976614080826623e-05, "loss": 0.5313, "step": 351 }, { "epoch": 0.7205731832139202, "grad_norm": 0.5627556793214199, "learning_rate": 3.9761757921821544e-05, "loss": 0.5136, "step": 352 }, { "epoch": 0.7226202661207779, "grad_norm": 0.42092677165552256, "learning_rate": 3.975733459077405e-05, "loss": 0.5396, "step": 353 }, { "epoch": 0.7246673490276356, "grad_norm": 0.48567860686645814, "learning_rate": 3.9752870824176585e-05, "loss": 0.4912, "step": 354 }, { "epoch": 0.7267144319344934, "grad_norm": 0.48215074888552417, "learning_rate": 3.974836663116472e-05, "loss": 0.5246, "step": 355 }, { "epoch": 0.7287615148413511, "grad_norm": 0.4041796943755226, "learning_rate": 3.97438220209568e-05, "loss": 0.4697, "step": 356 }, { "epoch": 0.7308085977482088, "grad_norm": 0.5033023757557246, "learning_rate": 3.973923700285386e-05, "loss": 0.5033, "step": 357 }, { "epoch": 0.7328556806550666, "grad_norm": 0.47849562907550447, "learning_rate": 3.973461158623963e-05, "loss": 0.51, "step": 358 }, { "epoch": 0.7349027635619243, "grad_norm": 0.4728712624318383, "learning_rate": 3.972994578058055e-05, "loss": 0.5183, "step": 359 }, { "epoch": 0.736949846468782, "grad_norm": 0.46994907616226134, "learning_rate": 3.972523959542569e-05, "loss": 0.4791, "step": 360 }, { "epoch": 0.7389969293756398, "grad_norm": 0.4716759611852859, "learning_rate": 3.9720493040406786e-05, "loss": 0.5053, "step": 361 }, { "epoch": 0.7410440122824974, "grad_norm": 0.4739511859859272, "learning_rate": 3.9715706125238164e-05, "loss": 0.4902, "step": 362 }, { "epoch": 0.7430910951893551, "grad_norm": 0.5524351394525074, "learning_rate": 3.971087885971679e-05, "loss": 0.5002, "step": 363 }, { "epoch": 0.7451381780962129, "grad_norm": 0.5160577267882994, "learning_rate": 3.970601125372218e-05, "loss": 0.5077, "step": 364 }, { "epoch": 0.7471852610030706, "grad_norm": 0.6036728391773393, "learning_rate": 3.970110331721643e-05, "loss": 0.5152, "step": 365 }, { "epoch": 0.7492323439099283, "grad_norm": 0.44920724019503216, "learning_rate": 3.9696155060244166e-05, "loss": 0.5016, "step": 366 }, { "epoch": 0.7512794268167861, "grad_norm": 0.5748889938625247, "learning_rate": 3.9691166492932535e-05, "loss": 0.5484, "step": 367 }, { "epoch": 0.7533265097236438, "grad_norm": 0.456775282424986, "learning_rate": 3.968613762549119e-05, "loss": 0.4839, "step": 368 }, { "epoch": 0.7553735926305015, "grad_norm": 0.5722901317947914, "learning_rate": 3.968106846821226e-05, "loss": 0.4961, "step": 369 }, { "epoch": 0.7574206755373593, "grad_norm": 0.513431808850321, "learning_rate": 3.9675959031470336e-05, "loss": 0.5347, "step": 370 }, { "epoch": 0.759467758444217, "grad_norm": 0.4971842267477506, "learning_rate": 3.9670809325722425e-05, "loss": 0.5025, "step": 371 }, { "epoch": 0.7615148413510747, "grad_norm": 0.5489460962549015, "learning_rate": 3.966561936150797e-05, "loss": 0.527, "step": 372 }, { "epoch": 0.7635619242579325, "grad_norm": 0.46580059626935816, "learning_rate": 3.966038914944881e-05, "loss": 0.5024, "step": 373 }, { "epoch": 0.7656090071647902, "grad_norm": 0.48329497303274316, "learning_rate": 3.9655118700249146e-05, "loss": 0.4543, "step": 374 }, { "epoch": 0.7676560900716479, "grad_norm": 0.4294347901743354, "learning_rate": 3.964980802469552e-05, "loss": 0.4918, "step": 375 }, { "epoch": 0.7697031729785057, "grad_norm": 0.46898699035993047, "learning_rate": 3.964445713365682e-05, "loss": 0.5219, "step": 376 }, { "epoch": 0.7717502558853634, "grad_norm": 0.5368649938570568, "learning_rate": 3.963906603808422e-05, "loss": 0.5491, "step": 377 }, { "epoch": 0.7737973387922211, "grad_norm": 0.4589070525461554, "learning_rate": 3.96336347490112e-05, "loss": 0.5193, "step": 378 }, { "epoch": 0.7758444216990789, "grad_norm": 0.5198618447541961, "learning_rate": 3.9628163277553486e-05, "loss": 0.5059, "step": 379 }, { "epoch": 0.7778915046059366, "grad_norm": 0.5201151329391878, "learning_rate": 3.962265163490903e-05, "loss": 0.5242, "step": 380 }, { "epoch": 0.7799385875127943, "grad_norm": 0.5286097719017859, "learning_rate": 3.9617099832358035e-05, "loss": 0.5131, "step": 381 }, { "epoch": 0.781985670419652, "grad_norm": 0.4908828849732809, "learning_rate": 3.961150788126286e-05, "loss": 0.5247, "step": 382 }, { "epoch": 0.7840327533265097, "grad_norm": 0.5105242176184168, "learning_rate": 3.960587579306805e-05, "loss": 0.5082, "step": 383 }, { "epoch": 0.7860798362333674, "grad_norm": 0.4827529985137035, "learning_rate": 3.960020357930028e-05, "loss": 0.4886, "step": 384 }, { "epoch": 0.7881269191402251, "grad_norm": 0.5255080063950176, "learning_rate": 3.9594491251568376e-05, "loss": 0.5323, "step": 385 }, { "epoch": 0.7901740020470829, "grad_norm": 0.42324786024329186, "learning_rate": 3.958873882156322e-05, "loss": 0.4913, "step": 386 }, { "epoch": 0.7922210849539406, "grad_norm": 0.4856526176027495, "learning_rate": 3.9582946301057806e-05, "loss": 0.5083, "step": 387 }, { "epoch": 0.7942681678607983, "grad_norm": 0.4869309440084628, "learning_rate": 3.957711370190716e-05, "loss": 0.503, "step": 388 }, { "epoch": 0.7963152507676561, "grad_norm": 0.5268029487052006, "learning_rate": 3.957124103604833e-05, "loss": 0.5082, "step": 389 }, { "epoch": 0.7983623336745138, "grad_norm": 0.507634587065165, "learning_rate": 3.9565328315500375e-05, "loss": 0.5166, "step": 390 }, { "epoch": 0.8004094165813715, "grad_norm": 0.5687757643916819, "learning_rate": 3.9559375552364325e-05, "loss": 0.5143, "step": 391 }, { "epoch": 0.8024564994882293, "grad_norm": 0.4695087735006217, "learning_rate": 3.955338275882316e-05, "loss": 0.4713, "step": 392 }, { "epoch": 0.804503582395087, "grad_norm": 0.46001610899852224, "learning_rate": 3.9547349947141787e-05, "loss": 0.4701, "step": 393 }, { "epoch": 0.8065506653019447, "grad_norm": 0.4942091923144858, "learning_rate": 3.954127712966702e-05, "loss": 0.4916, "step": 394 }, { "epoch": 0.8085977482088025, "grad_norm": 0.4750963479134321, "learning_rate": 3.953516431882754e-05, "loss": 0.535, "step": 395 }, { "epoch": 0.8106448311156602, "grad_norm": 0.45303577566833647, "learning_rate": 3.952901152713389e-05, "loss": 0.4756, "step": 396 }, { "epoch": 0.812691914022518, "grad_norm": 0.45075457713091993, "learning_rate": 3.952281876717843e-05, "loss": 0.5013, "step": 397 }, { "epoch": 0.8147389969293757, "grad_norm": 0.4866908528841161, "learning_rate": 3.951658605163533e-05, "loss": 0.5159, "step": 398 }, { "epoch": 0.8167860798362334, "grad_norm": 0.47273974659050305, "learning_rate": 3.9510313393260507e-05, "loss": 0.4876, "step": 399 }, { "epoch": 0.8188331627430911, "grad_norm": 0.4599999323212464, "learning_rate": 3.950400080489165e-05, "loss": 0.486, "step": 400 }, { "epoch": 0.8208802456499488, "grad_norm": 0.41263568282343643, "learning_rate": 3.9497648299448174e-05, "loss": 0.6514, "step": 401 }, { "epoch": 0.8229273285568065, "grad_norm": 0.49637310910399085, "learning_rate": 3.949125588993117e-05, "loss": 0.5247, "step": 402 }, { "epoch": 0.8249744114636642, "grad_norm": 0.407642872768924, "learning_rate": 3.94848235894234e-05, "loss": 0.5023, "step": 403 }, { "epoch": 0.827021494370522, "grad_norm": 0.5125499802657086, "learning_rate": 3.947835141108928e-05, "loss": 0.5187, "step": 404 }, { "epoch": 0.8290685772773797, "grad_norm": 0.46889549178993806, "learning_rate": 3.947183936817483e-05, "loss": 0.5089, "step": 405 }, { "epoch": 0.8311156601842374, "grad_norm": 0.4793952039316711, "learning_rate": 3.9465287474007654e-05, "loss": 0.4946, "step": 406 }, { "epoch": 0.8331627430910952, "grad_norm": 0.4616324812446371, "learning_rate": 3.945869574199693e-05, "loss": 0.4905, "step": 407 }, { "epoch": 0.8352098259979529, "grad_norm": 0.522723518377932, "learning_rate": 3.9452064185633345e-05, "loss": 0.4873, "step": 408 }, { "epoch": 0.8372569089048106, "grad_norm": 0.4444650907824343, "learning_rate": 3.944539281848912e-05, "loss": 0.4995, "step": 409 }, { "epoch": 0.8393039918116684, "grad_norm": 0.5394910784111964, "learning_rate": 3.943868165421793e-05, "loss": 0.4597, "step": 410 }, { "epoch": 0.8413510747185261, "grad_norm": 0.47150833451005714, "learning_rate": 3.943193070655492e-05, "loss": 0.4768, "step": 411 }, { "epoch": 0.8433981576253838, "grad_norm": 0.5217385743079497, "learning_rate": 3.942513998931663e-05, "loss": 0.4936, "step": 412 }, { "epoch": 0.8454452405322416, "grad_norm": 0.47838287732742774, "learning_rate": 3.9418309516401015e-05, "loss": 0.4998, "step": 413 }, { "epoch": 0.8474923234390993, "grad_norm": 0.5219985412359689, "learning_rate": 3.9411439301787383e-05, "loss": 0.4922, "step": 414 }, { "epoch": 0.849539406345957, "grad_norm": 0.5360559638934609, "learning_rate": 3.940452935953639e-05, "loss": 0.4932, "step": 415 }, { "epoch": 0.8515864892528148, "grad_norm": 0.45167095248901046, "learning_rate": 3.939757970378997e-05, "loss": 0.5325, "step": 416 }, { "epoch": 0.8536335721596725, "grad_norm": 0.5392146495002762, "learning_rate": 3.9390590348771374e-05, "loss": 0.5161, "step": 417 }, { "epoch": 0.8556806550665302, "grad_norm": 0.4694690399172554, "learning_rate": 3.9383561308785075e-05, "loss": 0.4872, "step": 418 }, { "epoch": 0.857727737973388, "grad_norm": 0.5450902044992034, "learning_rate": 3.937649259821677e-05, "loss": 0.5033, "step": 419 }, { "epoch": 0.8597748208802457, "grad_norm": 0.4513747246745581, "learning_rate": 3.9369384231533365e-05, "loss": 0.5022, "step": 420 }, { "epoch": 0.8618219037871033, "grad_norm": 0.47998170745321456, "learning_rate": 3.9362236223282885e-05, "loss": 0.488, "step": 421 }, { "epoch": 0.8638689866939611, "grad_norm": 0.5703617109539353, "learning_rate": 3.935504858809454e-05, "loss": 0.5364, "step": 422 }, { "epoch": 0.8659160696008188, "grad_norm": 0.46673348103204154, "learning_rate": 3.9347821340678597e-05, "loss": 0.519, "step": 423 }, { "epoch": 0.8679631525076765, "grad_norm": 0.5424695733970012, "learning_rate": 3.934055449582641e-05, "loss": 0.5568, "step": 424 }, { "epoch": 0.8700102354145343, "grad_norm": 0.4765444623240975, "learning_rate": 3.9333248068410375e-05, "loss": 0.4743, "step": 425 }, { "epoch": 0.872057318321392, "grad_norm": 0.49864798719594966, "learning_rate": 3.932590207338391e-05, "loss": 0.4982, "step": 426 }, { "epoch": 0.8741044012282497, "grad_norm": 0.4988338996845499, "learning_rate": 3.931851652578137e-05, "loss": 0.4963, "step": 427 }, { "epoch": 0.8761514841351075, "grad_norm": 0.4432676805703767, "learning_rate": 3.931109144071811e-05, "loss": 0.4719, "step": 428 }, { "epoch": 0.8781985670419652, "grad_norm": 0.48146006351463866, "learning_rate": 3.930362683339037e-05, "loss": 0.4863, "step": 429 }, { "epoch": 0.8802456499488229, "grad_norm": 0.44888291145823134, "learning_rate": 3.92961227190753e-05, "loss": 0.5167, "step": 430 }, { "epoch": 0.8822927328556807, "grad_norm": 0.4602238267927309, "learning_rate": 3.928857911313088e-05, "loss": 0.5031, "step": 431 }, { "epoch": 0.8843398157625384, "grad_norm": 0.49233832361099084, "learning_rate": 3.928099603099591e-05, "loss": 0.5013, "step": 432 }, { "epoch": 0.8863868986693961, "grad_norm": 0.4576541292218355, "learning_rate": 3.9273373488190036e-05, "loss": 0.5574, "step": 433 }, { "epoch": 0.8884339815762539, "grad_norm": 0.4638871631773895, "learning_rate": 3.92657115003136e-05, "loss": 0.5253, "step": 434 }, { "epoch": 0.8904810644831116, "grad_norm": 0.5178780585024065, "learning_rate": 3.9258010083047715e-05, "loss": 0.5485, "step": 435 }, { "epoch": 0.8925281473899693, "grad_norm": 0.4598091842235503, "learning_rate": 3.925026925215417e-05, "loss": 0.4636, "step": 436 }, { "epoch": 0.8945752302968271, "grad_norm": 0.5965207792237605, "learning_rate": 3.924248902347541e-05, "loss": 0.5464, "step": 437 }, { "epoch": 0.8966223132036848, "grad_norm": 0.49770158650370516, "learning_rate": 3.9234669412934546e-05, "loss": 0.5461, "step": 438 }, { "epoch": 0.8986693961105425, "grad_norm": 0.5031859956447141, "learning_rate": 3.922681043653526e-05, "loss": 0.5146, "step": 439 }, { "epoch": 0.9007164790174002, "grad_norm": 0.4561558037016599, "learning_rate": 3.92189121103618e-05, "loss": 0.4898, "step": 440 }, { "epoch": 0.9027635619242579, "grad_norm": 0.4247526477385047, "learning_rate": 3.921097445057896e-05, "loss": 0.4976, "step": 441 }, { "epoch": 0.9048106448311156, "grad_norm": 0.523620896917421, "learning_rate": 3.920299747343204e-05, "loss": 0.4894, "step": 442 }, { "epoch": 0.9068577277379734, "grad_norm": 0.4267971609347695, "learning_rate": 3.919498119524679e-05, "loss": 0.5029, "step": 443 }, { "epoch": 0.9089048106448311, "grad_norm": 0.5817422656960313, "learning_rate": 3.9186925632429396e-05, "loss": 0.5477, "step": 444 }, { "epoch": 0.9109518935516888, "grad_norm": 0.3812610872657855, "learning_rate": 3.9178830801466465e-05, "loss": 0.5147, "step": 445 }, { "epoch": 0.9129989764585466, "grad_norm": 0.5210417707070376, "learning_rate": 3.917069671892494e-05, "loss": 0.5052, "step": 446 }, { "epoch": 0.9150460593654043, "grad_norm": 0.4195818289044568, "learning_rate": 3.9162523401452125e-05, "loss": 0.476, "step": 447 }, { "epoch": 0.917093142272262, "grad_norm": 0.4640561681219655, "learning_rate": 3.915431086577561e-05, "loss": 0.4811, "step": 448 }, { "epoch": 0.9191402251791198, "grad_norm": 0.46049336960827775, "learning_rate": 3.914605912870324e-05, "loss": 0.5082, "step": 449 }, { "epoch": 0.9211873080859775, "grad_norm": 0.5244765933281279, "learning_rate": 3.913776820712309e-05, "loss": 0.4982, "step": 450 }, { "epoch": 0.9232343909928352, "grad_norm": 0.4207300716892893, "learning_rate": 3.912943811800347e-05, "loss": 0.477, "step": 451 }, { "epoch": 0.925281473899693, "grad_norm": 0.4532179957617135, "learning_rate": 3.912106887839278e-05, "loss": 0.4953, "step": 452 }, { "epoch": 0.9273285568065507, "grad_norm": 0.42360404758505654, "learning_rate": 3.9112660505419626e-05, "loss": 0.5248, "step": 453 }, { "epoch": 0.9293756397134084, "grad_norm": 0.4345837890396033, "learning_rate": 3.910421301629264e-05, "loss": 0.4887, "step": 454 }, { "epoch": 0.9314227226202662, "grad_norm": 0.43091075968385034, "learning_rate": 3.909572642830053e-05, "loss": 0.4987, "step": 455 }, { "epoch": 0.9334698055271239, "grad_norm": 0.4111596649311637, "learning_rate": 3.9087200758812054e-05, "loss": 0.5429, "step": 456 }, { "epoch": 0.9355168884339816, "grad_norm": 0.4114767207964879, "learning_rate": 3.9078636025275904e-05, "loss": 0.4868, "step": 457 }, { "epoch": 0.9375639713408394, "grad_norm": 0.4511575697019446, "learning_rate": 3.907003224522075e-05, "loss": 0.5151, "step": 458 }, { "epoch": 0.9396110542476971, "grad_norm": 0.47837137672489904, "learning_rate": 3.906138943625519e-05, "loss": 0.4812, "step": 459 }, { "epoch": 0.9416581371545547, "grad_norm": 0.394915101475589, "learning_rate": 3.9052707616067654e-05, "loss": 0.5405, "step": 460 }, { "epoch": 0.9437052200614124, "grad_norm": 0.5432929178652014, "learning_rate": 3.9043986802426453e-05, "loss": 0.4816, "step": 461 }, { "epoch": 0.9457523029682702, "grad_norm": 0.3617390720179231, "learning_rate": 3.903522701317968e-05, "loss": 0.4864, "step": 462 }, { "epoch": 0.9477993858751279, "grad_norm": 0.5239560704020834, "learning_rate": 3.9026428266255205e-05, "loss": 0.4979, "step": 463 }, { "epoch": 0.9498464687819856, "grad_norm": 0.39348645216036965, "learning_rate": 3.901759057966064e-05, "loss": 0.4741, "step": 464 }, { "epoch": 0.9518935516888434, "grad_norm": 0.47748699997048394, "learning_rate": 3.9008713971483257e-05, "loss": 0.4729, "step": 465 }, { "epoch": 0.9539406345957011, "grad_norm": 0.4339281681486082, "learning_rate": 3.899979845989003e-05, "loss": 0.5072, "step": 466 }, { "epoch": 0.9559877175025588, "grad_norm": 0.4360289845901765, "learning_rate": 3.899084406312751e-05, "loss": 0.4844, "step": 467 }, { "epoch": 0.9580348004094166, "grad_norm": 0.49803606731717964, "learning_rate": 3.8981850799521856e-05, "loss": 0.5021, "step": 468 }, { "epoch": 0.9600818833162743, "grad_norm": 0.4633434375152389, "learning_rate": 3.897281868747878e-05, "loss": 0.5003, "step": 469 }, { "epoch": 0.962128966223132, "grad_norm": 0.4887467757571554, "learning_rate": 3.896374774548348e-05, "loss": 0.5054, "step": 470 }, { "epoch": 0.9641760491299898, "grad_norm": 0.44664870237749776, "learning_rate": 3.895463799210063e-05, "loss": 0.5186, "step": 471 }, { "epoch": 0.9662231320368475, "grad_norm": 0.5203094189929391, "learning_rate": 3.894548944597434e-05, "loss": 0.5227, "step": 472 }, { "epoch": 0.9682702149437052, "grad_norm": 0.37329019363463944, "learning_rate": 3.8936302125828114e-05, "loss": 0.4836, "step": 473 }, { "epoch": 0.970317297850563, "grad_norm": 0.46252269585391337, "learning_rate": 3.892707605046482e-05, "loss": 0.465, "step": 474 }, { "epoch": 0.9723643807574207, "grad_norm": 0.4466031989756924, "learning_rate": 3.8917811238766606e-05, "loss": 0.5103, "step": 475 }, { "epoch": 0.9744114636642784, "grad_norm": 0.49858281748974287, "learning_rate": 3.8908507709694945e-05, "loss": 0.4857, "step": 476 }, { "epoch": 0.9764585465711362, "grad_norm": 0.5999408245173112, "learning_rate": 3.8899165482290524e-05, "loss": 0.5135, "step": 477 }, { "epoch": 0.9785056294779939, "grad_norm": 0.44390557227435046, "learning_rate": 3.888978457567323e-05, "loss": 0.4826, "step": 478 }, { "epoch": 0.9805527123848515, "grad_norm": 0.6043545995456223, "learning_rate": 3.888036500904212e-05, "loss": 0.5154, "step": 479 }, { "epoch": 0.9825997952917093, "grad_norm": 0.4818312706910111, "learning_rate": 3.887090680167537e-05, "loss": 0.5087, "step": 480 }, { "epoch": 0.984646878198567, "grad_norm": 0.6429627680182052, "learning_rate": 3.886140997293024e-05, "loss": 0.5033, "step": 481 }, { "epoch": 0.9866939611054247, "grad_norm": 0.46009297651512515, "learning_rate": 3.8851874542243024e-05, "loss": 0.4737, "step": 482 }, { "epoch": 0.9887410440122825, "grad_norm": 0.5257256010671767, "learning_rate": 3.8842300529129026e-05, "loss": 0.4922, "step": 483 }, { "epoch": 0.9907881269191402, "grad_norm": 0.4952147402880594, "learning_rate": 3.883268795318252e-05, "loss": 0.5093, "step": 484 }, { "epoch": 0.9928352098259979, "grad_norm": 0.40718187065435846, "learning_rate": 3.882303683407669e-05, "loss": 0.4898, "step": 485 }, { "epoch": 0.9948822927328557, "grad_norm": 0.6342262964756523, "learning_rate": 3.8813347191563615e-05, "loss": 0.4809, "step": 486 }, { "epoch": 0.9969293756397134, "grad_norm": 0.36521858199635876, "learning_rate": 3.88036190454742e-05, "loss": 0.4792, "step": 487 }, { "epoch": 0.9989764585465711, "grad_norm": 0.49465426774165494, "learning_rate": 3.879385241571817e-05, "loss": 0.4945, "step": 488 }, { "epoch": 1.0010235414534288, "grad_norm": 0.6823091472624835, "learning_rate": 3.8784047322284e-05, "loss": 0.7219, "step": 489 }, { "epoch": 1.0030706243602865, "grad_norm": 0.5576204983958845, "learning_rate": 3.8774203785238886e-05, "loss": 0.5356, "step": 490 }, { "epoch": 1.0051177072671442, "grad_norm": 0.4049517959021661, "learning_rate": 3.8764321824728715e-05, "loss": 0.3433, "step": 491 }, { "epoch": 1.007164790174002, "grad_norm": 0.6379294758690475, "learning_rate": 3.875440146097798e-05, "loss": 0.4658, "step": 492 }, { "epoch": 1.0092118730808597, "grad_norm": 0.48729117155574453, "learning_rate": 3.8744442714289816e-05, "loss": 0.423, "step": 493 }, { "epoch": 1.0112589559877174, "grad_norm": 0.6905667947564962, "learning_rate": 3.873444560504588e-05, "loss": 0.4378, "step": 494 }, { "epoch": 1.0133060388945752, "grad_norm": 0.5821283432486261, "learning_rate": 3.872441015370635e-05, "loss": 0.4592, "step": 495 }, { "epoch": 1.015353121801433, "grad_norm": 0.6506243911845556, "learning_rate": 3.8714336380809874e-05, "loss": 0.4401, "step": 496 }, { "epoch": 1.0174002047082906, "grad_norm": 0.5026499066925773, "learning_rate": 3.870422430697354e-05, "loss": 0.4082, "step": 497 }, { "epoch": 1.0194472876151484, "grad_norm": 0.7706353109115344, "learning_rate": 3.869407395289281e-05, "loss": 0.4851, "step": 498 }, { "epoch": 1.021494370522006, "grad_norm": 0.40094293966181355, "learning_rate": 3.86838853393415e-05, "loss": 0.3689, "step": 499 }, { "epoch": 1.0235414534288638, "grad_norm": 0.7044751532962212, "learning_rate": 3.867365848717171e-05, "loss": 0.4298, "step": 500 }, { "epoch": 1.0255885363357216, "grad_norm": 0.5123771630826552, "learning_rate": 3.866339341731384e-05, "loss": 0.4313, "step": 501 }, { "epoch": 1.0276356192425793, "grad_norm": 0.44854290574972205, "learning_rate": 3.865309015077645e-05, "loss": 0.3686, "step": 502 }, { "epoch": 1.029682702149437, "grad_norm": 0.5101255948957101, "learning_rate": 3.8642748708646324e-05, "loss": 0.4471, "step": 503 }, { "epoch": 1.0317297850562948, "grad_norm": 0.44907848191294825, "learning_rate": 3.863236911208835e-05, "loss": 0.342, "step": 504 }, { "epoch": 1.0337768679631525, "grad_norm": 0.5031368605626068, "learning_rate": 3.862195138234551e-05, "loss": 0.4214, "step": 505 }, { "epoch": 1.0358239508700102, "grad_norm": 0.5833077927402516, "learning_rate": 3.8611495540738835e-05, "loss": 0.4858, "step": 506 }, { "epoch": 1.037871033776868, "grad_norm": 0.41256204125595797, "learning_rate": 3.860100160866733e-05, "loss": 0.4171, "step": 507 }, { "epoch": 1.0399181166837257, "grad_norm": 0.5124022037131871, "learning_rate": 3.859046960760801e-05, "loss": 0.4517, "step": 508 }, { "epoch": 1.0419651995905834, "grad_norm": 0.5225640900487728, "learning_rate": 3.857989955911574e-05, "loss": 0.4341, "step": 509 }, { "epoch": 1.0440122824974412, "grad_norm": 0.38566215827923084, "learning_rate": 3.85692914848233e-05, "loss": 0.3913, "step": 510 }, { "epoch": 1.046059365404299, "grad_norm": 0.4633891939003197, "learning_rate": 3.855864540644126e-05, "loss": 0.397, "step": 511 }, { "epoch": 1.0481064483111566, "grad_norm": 0.47380208701961085, "learning_rate": 3.8547961345758e-05, "loss": 0.4476, "step": 512 }, { "epoch": 1.0501535312180144, "grad_norm": 0.5577403954270109, "learning_rate": 3.853723932463962e-05, "loss": 0.4559, "step": 513 }, { "epoch": 1.052200614124872, "grad_norm": 0.5358423915305728, "learning_rate": 3.8526479365029906e-05, "loss": 0.4322, "step": 514 }, { "epoch": 1.0542476970317298, "grad_norm": 0.463025872995783, "learning_rate": 3.8515681488950286e-05, "loss": 0.4725, "step": 515 }, { "epoch": 1.0562947799385876, "grad_norm": 0.4349348376778485, "learning_rate": 3.850484571849982e-05, "loss": 0.3743, "step": 516 }, { "epoch": 1.0583418628454453, "grad_norm": 0.49093997411971185, "learning_rate": 3.849397207585508e-05, "loss": 0.4704, "step": 517 }, { "epoch": 1.060388945752303, "grad_norm": 0.5752884046292331, "learning_rate": 3.848306058327016e-05, "loss": 0.4772, "step": 518 }, { "epoch": 1.0624360286591608, "grad_norm": 0.36504458066426737, "learning_rate": 3.847211126307666e-05, "loss": 0.4277, "step": 519 }, { "epoch": 1.0644831115660185, "grad_norm": 0.5134952699143726, "learning_rate": 3.846112413768353e-05, "loss": 0.4094, "step": 520 }, { "epoch": 1.0665301944728762, "grad_norm": 0.42686550744637813, "learning_rate": 3.845009922957713e-05, "loss": 0.3999, "step": 521 }, { "epoch": 1.068577277379734, "grad_norm": 0.45121329177961284, "learning_rate": 3.843903656132116e-05, "loss": 0.4782, "step": 522 }, { "epoch": 1.0706243602865917, "grad_norm": 0.4799180332218181, "learning_rate": 3.842793615555657e-05, "loss": 0.4344, "step": 523 }, { "epoch": 1.0726714431934494, "grad_norm": 0.4906577800070559, "learning_rate": 3.8416798035001545e-05, "loss": 0.3999, "step": 524 }, { "epoch": 1.0747185261003072, "grad_norm": 0.4381457628189835, "learning_rate": 3.8405622222451496e-05, "loss": 0.4867, "step": 525 }, { "epoch": 1.076765609007165, "grad_norm": 0.4802227337203006, "learning_rate": 3.8394408740778934e-05, "loss": 0.4067, "step": 526 }, { "epoch": 1.0788126919140226, "grad_norm": 0.4490101903364367, "learning_rate": 3.838315761293348e-05, "loss": 0.4346, "step": 527 }, { "epoch": 1.0808597748208801, "grad_norm": 0.4633513621013708, "learning_rate": 3.8371868861941795e-05, "loss": 0.4177, "step": 528 }, { "epoch": 1.0829068577277379, "grad_norm": 0.38946131051973665, "learning_rate": 3.836054251090755e-05, "loss": 0.3554, "step": 529 }, { "epoch": 1.0849539406345956, "grad_norm": 0.4671283243776551, "learning_rate": 3.8349178583011356e-05, "loss": 0.4613, "step": 530 }, { "epoch": 1.0870010235414533, "grad_norm": 0.42447494229810095, "learning_rate": 3.833777710151075e-05, "loss": 0.4056, "step": 531 }, { "epoch": 1.089048106448311, "grad_norm": 0.42416375790936595, "learning_rate": 3.83263380897401e-05, "loss": 0.3751, "step": 532 }, { "epoch": 1.0910951893551688, "grad_norm": 0.4073852074265662, "learning_rate": 3.8314861571110604e-05, "loss": 0.4637, "step": 533 }, { "epoch": 1.0931422722620265, "grad_norm": 0.4548496033565697, "learning_rate": 3.830334756911021e-05, "loss": 0.4299, "step": 534 }, { "epoch": 1.0951893551688843, "grad_norm": 0.376272980920323, "learning_rate": 3.829179610730359e-05, "loss": 0.408, "step": 535 }, { "epoch": 1.097236438075742, "grad_norm": 0.427450221785291, "learning_rate": 3.828020720933207e-05, "loss": 0.4095, "step": 536 }, { "epoch": 1.0992835209825997, "grad_norm": 0.440976279211047, "learning_rate": 3.826858089891361e-05, "loss": 0.494, "step": 537 }, { "epoch": 1.1013306038894575, "grad_norm": 0.4209945160038491, "learning_rate": 3.8256917199842715e-05, "loss": 0.4586, "step": 538 }, { "epoch": 1.1033776867963152, "grad_norm": 0.3962229334467954, "learning_rate": 3.824521613599043e-05, "loss": 0.405, "step": 539 }, { "epoch": 1.105424769703173, "grad_norm": 0.40571779964971894, "learning_rate": 3.823347773130427e-05, "loss": 0.4275, "step": 540 }, { "epoch": 1.1074718526100307, "grad_norm": 0.3618547256384093, "learning_rate": 3.822170200980815e-05, "loss": 0.3858, "step": 541 }, { "epoch": 1.1095189355168884, "grad_norm": 0.4080407212884249, "learning_rate": 3.820988899560239e-05, "loss": 0.4645, "step": 542 }, { "epoch": 1.1115660184237461, "grad_norm": 0.3661867427781835, "learning_rate": 3.819803871286361e-05, "loss": 0.4359, "step": 543 }, { "epoch": 1.1136131013306039, "grad_norm": 0.3672357843320578, "learning_rate": 3.818615118584472e-05, "loss": 0.4266, "step": 544 }, { "epoch": 1.1156601842374616, "grad_norm": 0.42948959266814246, "learning_rate": 3.817422643887484e-05, "loss": 0.3994, "step": 545 }, { "epoch": 1.1177072671443193, "grad_norm": 0.4671751141730952, "learning_rate": 3.816226449635927e-05, "loss": 0.4408, "step": 546 }, { "epoch": 1.119754350051177, "grad_norm": 0.41461516441504426, "learning_rate": 3.815026538277943e-05, "loss": 0.3956, "step": 547 }, { "epoch": 1.1218014329580348, "grad_norm": 0.5835686601708205, "learning_rate": 3.813822912269284e-05, "loss": 0.4505, "step": 548 }, { "epoch": 1.1238485158648925, "grad_norm": 0.4319406426533076, "learning_rate": 3.812615574073301e-05, "loss": 0.3822, "step": 549 }, { "epoch": 1.1258955987717503, "grad_norm": 0.4516791429478303, "learning_rate": 3.811404526160943e-05, "loss": 0.4144, "step": 550 }, { "epoch": 1.127942681678608, "grad_norm": 0.5360130807078863, "learning_rate": 3.810189771010755e-05, "loss": 0.4786, "step": 551 }, { "epoch": 1.1299897645854657, "grad_norm": 0.4708744253041833, "learning_rate": 3.808971311108865e-05, "loss": 0.4241, "step": 552 }, { "epoch": 1.1320368474923235, "grad_norm": 0.43422914565208964, "learning_rate": 3.8077491489489835e-05, "loss": 0.4204, "step": 553 }, { "epoch": 1.1340839303991812, "grad_norm": 0.4721868771642081, "learning_rate": 3.806523287032401e-05, "loss": 0.3909, "step": 554 }, { "epoch": 1.136131013306039, "grad_norm": 0.4196801932672472, "learning_rate": 3.805293727867978e-05, "loss": 0.4427, "step": 555 }, { "epoch": 1.1381780962128967, "grad_norm": 0.43191530611285084, "learning_rate": 3.8040604739721415e-05, "loss": 0.4247, "step": 556 }, { "epoch": 1.1402251791197544, "grad_norm": 0.4098386090267591, "learning_rate": 3.8028235278688814e-05, "loss": 0.4136, "step": 557 }, { "epoch": 1.1422722620266121, "grad_norm": 0.42481716204053743, "learning_rate": 3.8015828920897425e-05, "loss": 0.425, "step": 558 }, { "epoch": 1.1443193449334699, "grad_norm": 0.3947721096519682, "learning_rate": 3.8003385691738227e-05, "loss": 0.4169, "step": 559 }, { "epoch": 1.1463664278403276, "grad_norm": 0.4434721832169034, "learning_rate": 3.7990905616677644e-05, "loss": 0.4804, "step": 560 }, { "epoch": 1.1484135107471853, "grad_norm": 0.44596996028550917, "learning_rate": 3.797838872125752e-05, "loss": 0.4587, "step": 561 }, { "epoch": 1.150460593654043, "grad_norm": 0.40781008639677546, "learning_rate": 3.7965835031095065e-05, "loss": 0.4614, "step": 562 }, { "epoch": 1.1525076765609008, "grad_norm": 0.4036300851691864, "learning_rate": 3.795324457188276e-05, "loss": 0.3913, "step": 563 }, { "epoch": 1.1545547594677585, "grad_norm": 0.3583696642752529, "learning_rate": 3.794061736938837e-05, "loss": 0.3828, "step": 564 }, { "epoch": 1.156601842374616, "grad_norm": 0.5138599201629513, "learning_rate": 3.792795344945485e-05, "loss": 0.4861, "step": 565 }, { "epoch": 1.158648925281474, "grad_norm": 0.40729637695646664, "learning_rate": 3.79152528380003e-05, "loss": 0.4136, "step": 566 }, { "epoch": 1.1606960081883315, "grad_norm": 0.4688730059593128, "learning_rate": 3.790251556101791e-05, "loss": 0.3991, "step": 567 }, { "epoch": 1.1627430910951895, "grad_norm": 0.41555333567521613, "learning_rate": 3.7889741644575914e-05, "loss": 0.4859, "step": 568 }, { "epoch": 1.164790174002047, "grad_norm": 0.368128010733192, "learning_rate": 3.787693111481753e-05, "loss": 0.412, "step": 569 }, { "epoch": 1.1668372569089047, "grad_norm": 0.40448293578209554, "learning_rate": 3.786408399796091e-05, "loss": 0.4812, "step": 570 }, { "epoch": 1.1688843398157625, "grad_norm": 0.35706355299321096, "learning_rate": 3.78512003202991e-05, "loss": 0.4098, "step": 571 }, { "epoch": 1.1709314227226202, "grad_norm": 0.3794649872224116, "learning_rate": 3.783828010819993e-05, "loss": 0.4184, "step": 572 }, { "epoch": 1.172978505629478, "grad_norm": 0.4084126099380887, "learning_rate": 3.782532338810605e-05, "loss": 0.4279, "step": 573 }, { "epoch": 1.1750255885363357, "grad_norm": 0.40248846798503674, "learning_rate": 3.7812330186534815e-05, "loss": 0.374, "step": 574 }, { "epoch": 1.1770726714431934, "grad_norm": 0.42082880622868263, "learning_rate": 3.779930053007821e-05, "loss": 0.4294, "step": 575 }, { "epoch": 1.1791197543500511, "grad_norm": 0.4741074023730061, "learning_rate": 3.778623444540287e-05, "loss": 0.4655, "step": 576 }, { "epoch": 1.1811668372569089, "grad_norm": 0.40443469906849566, "learning_rate": 3.777313195924998e-05, "loss": 0.4313, "step": 577 }, { "epoch": 1.1832139201637666, "grad_norm": 0.5301327875202605, "learning_rate": 3.775999309843519e-05, "loss": 0.4872, "step": 578 }, { "epoch": 1.1852610030706243, "grad_norm": 0.4031400337502115, "learning_rate": 3.774681788984863e-05, "loss": 0.44, "step": 579 }, { "epoch": 1.187308085977482, "grad_norm": 0.32666462235868937, "learning_rate": 3.773360636045481e-05, "loss": 0.3297, "step": 580 }, { "epoch": 1.1893551688843398, "grad_norm": 0.48740976479218406, "learning_rate": 3.7720358537292566e-05, "loss": 0.502, "step": 581 }, { "epoch": 1.1914022517911975, "grad_norm": 0.4129431656165589, "learning_rate": 3.770707444747502e-05, "loss": 0.3901, "step": 582 }, { "epoch": 1.1934493346980553, "grad_norm": 0.37634568229429827, "learning_rate": 3.7693754118189525e-05, "loss": 0.406, "step": 583 }, { "epoch": 1.195496417604913, "grad_norm": 0.413600210538619, "learning_rate": 3.768039757669759e-05, "loss": 0.4389, "step": 584 }, { "epoch": 1.1975435005117707, "grad_norm": 0.41522152600174966, "learning_rate": 3.766700485033484e-05, "loss": 0.4213, "step": 585 }, { "epoch": 1.1995905834186285, "grad_norm": 0.3994741162417736, "learning_rate": 3.765357596651095e-05, "loss": 0.4192, "step": 586 }, { "epoch": 1.2016376663254862, "grad_norm": 0.44961799715640177, "learning_rate": 3.764011095270962e-05, "loss": 0.4448, "step": 587 }, { "epoch": 1.203684749232344, "grad_norm": 0.4405333877210868, "learning_rate": 3.762660983648846e-05, "loss": 0.4425, "step": 588 }, { "epoch": 1.2057318321392017, "grad_norm": 0.3647533882869788, "learning_rate": 3.761307264547899e-05, "loss": 0.3798, "step": 589 }, { "epoch": 1.2077789150460594, "grad_norm": 0.5134897309426143, "learning_rate": 3.759949940738655e-05, "loss": 0.4862, "step": 590 }, { "epoch": 1.2098259979529171, "grad_norm": 0.42730289018233486, "learning_rate": 3.7585890149990265e-05, "loss": 0.3887, "step": 591 }, { "epoch": 1.2118730808597749, "grad_norm": 0.4053414794034375, "learning_rate": 3.757224490114297e-05, "loss": 0.4327, "step": 592 }, { "epoch": 1.2139201637666326, "grad_norm": 0.4351870322953961, "learning_rate": 3.755856368877116e-05, "loss": 0.379, "step": 593 }, { "epoch": 1.2159672466734903, "grad_norm": 0.4319979039337228, "learning_rate": 3.7544846540874934e-05, "loss": 0.45, "step": 594 }, { "epoch": 1.218014329580348, "grad_norm": 0.3683791838302071, "learning_rate": 3.7531093485527943e-05, "loss": 0.4189, "step": 595 }, { "epoch": 1.2200614124872058, "grad_norm": 0.47932608544530164, "learning_rate": 3.7517304550877315e-05, "loss": 0.4327, "step": 596 }, { "epoch": 1.2221084953940635, "grad_norm": 0.46656194426712394, "learning_rate": 3.750347976514362e-05, "loss": 0.4774, "step": 597 }, { "epoch": 1.2241555783009213, "grad_norm": 0.3933033098379041, "learning_rate": 3.7489619156620796e-05, "loss": 0.4224, "step": 598 }, { "epoch": 1.226202661207779, "grad_norm": 0.40838601128257357, "learning_rate": 3.74757227536761e-05, "loss": 0.4361, "step": 599 }, { "epoch": 1.2282497441146367, "grad_norm": 0.4011956657411684, "learning_rate": 3.7461790584750036e-05, "loss": 0.373, "step": 600 }, { "epoch": 1.2302968270214945, "grad_norm": 0.4783960660544416, "learning_rate": 3.744782267835632e-05, "loss": 0.4497, "step": 601 }, { "epoch": 1.2323439099283522, "grad_norm": 0.4426728718136347, "learning_rate": 3.74338190630818e-05, "loss": 0.4396, "step": 602 }, { "epoch": 1.23439099283521, "grad_norm": 0.4507239891416186, "learning_rate": 3.7419779767586406e-05, "loss": 0.4312, "step": 603 }, { "epoch": 1.2364380757420674, "grad_norm": 0.45738216770554924, "learning_rate": 3.740570482060311e-05, "loss": 0.4186, "step": 604 }, { "epoch": 1.2384851586489254, "grad_norm": 0.4112804317839281, "learning_rate": 3.7391594250937813e-05, "loss": 0.4075, "step": 605 }, { "epoch": 1.240532241555783, "grad_norm": 0.3910005461879264, "learning_rate": 3.737744808746935e-05, "loss": 0.4063, "step": 606 }, { "epoch": 1.2425793244626409, "grad_norm": 0.37923695698945686, "learning_rate": 3.73632663591494e-05, "loss": 0.3753, "step": 607 }, { "epoch": 1.2446264073694984, "grad_norm": 0.3993349892342946, "learning_rate": 3.7349049095002414e-05, "loss": 0.415, "step": 608 }, { "epoch": 1.246673490276356, "grad_norm": 0.49340730052744397, "learning_rate": 3.733479632412559e-05, "loss": 0.4137, "step": 609 }, { "epoch": 1.2487205731832138, "grad_norm": 0.4033663663707718, "learning_rate": 3.732050807568878e-05, "loss": 0.4078, "step": 610 }, { "epoch": 1.2507676560900716, "grad_norm": 0.4885064983059752, "learning_rate": 3.730618437893444e-05, "loss": 0.479, "step": 611 }, { "epoch": 1.2528147389969293, "grad_norm": 0.43160732650352407, "learning_rate": 3.729182526317761e-05, "loss": 0.455, "step": 612 }, { "epoch": 1.254861821903787, "grad_norm": 0.45988754618419736, "learning_rate": 3.727743075780578e-05, "loss": 0.3783, "step": 613 }, { "epoch": 1.2569089048106448, "grad_norm": 0.44846875859935137, "learning_rate": 3.726300089227887e-05, "loss": 0.4124, "step": 614 }, { "epoch": 1.2589559877175025, "grad_norm": 0.5009012146609582, "learning_rate": 3.72485356961292e-05, "loss": 0.4256, "step": 615 }, { "epoch": 1.2610030706243602, "grad_norm": 0.40882732802753774, "learning_rate": 3.723403519896136e-05, "loss": 0.3877, "step": 616 }, { "epoch": 1.263050153531218, "grad_norm": 0.509277319060574, "learning_rate": 3.721949943045223e-05, "loss": 0.4603, "step": 617 }, { "epoch": 1.2650972364380757, "grad_norm": 0.4550675308253649, "learning_rate": 3.720492842035084e-05, "loss": 0.4958, "step": 618 }, { "epoch": 1.2671443193449334, "grad_norm": 0.5039050269537974, "learning_rate": 3.7190322198478355e-05, "loss": 0.5296, "step": 619 }, { "epoch": 1.2691914022517912, "grad_norm": 0.42034576410985675, "learning_rate": 3.7175680794728015e-05, "loss": 0.4171, "step": 620 }, { "epoch": 1.271238485158649, "grad_norm": 0.6194106838370393, "learning_rate": 3.716100423906505e-05, "loss": 0.524, "step": 621 }, { "epoch": 1.2732855680655066, "grad_norm": 0.3686755041504862, "learning_rate": 3.7146292561526654e-05, "loss": 0.3836, "step": 622 }, { "epoch": 1.2753326509723644, "grad_norm": 0.5059037426173506, "learning_rate": 3.7131545792221864e-05, "loss": 0.485, "step": 623 }, { "epoch": 1.277379733879222, "grad_norm": 0.40419199499915215, "learning_rate": 3.711676396133158e-05, "loss": 0.3979, "step": 624 }, { "epoch": 1.2794268167860798, "grad_norm": 0.48892649990931586, "learning_rate": 3.7101947099108425e-05, "loss": 0.5072, "step": 625 }, { "epoch": 1.2814738996929376, "grad_norm": 0.4545220525315462, "learning_rate": 3.708709523587674e-05, "loss": 0.4275, "step": 626 }, { "epoch": 1.2835209825997953, "grad_norm": 0.40189284591007923, "learning_rate": 3.707220840203249e-05, "loss": 0.3881, "step": 627 }, { "epoch": 1.285568065506653, "grad_norm": 0.43194988622551983, "learning_rate": 3.70572866280432e-05, "loss": 0.4747, "step": 628 }, { "epoch": 1.2876151484135108, "grad_norm": 0.44678464462286466, "learning_rate": 3.7042329944447925e-05, "loss": 0.391, "step": 629 }, { "epoch": 1.2896622313203685, "grad_norm": 0.4623510609405332, "learning_rate": 3.702733838185716e-05, "loss": 0.423, "step": 630 }, { "epoch": 1.2917093142272262, "grad_norm": 0.4314971051525567, "learning_rate": 3.701231197095277e-05, "loss": 0.4285, "step": 631 }, { "epoch": 1.293756397134084, "grad_norm": 0.39807090722917615, "learning_rate": 3.6997250742487955e-05, "loss": 0.3975, "step": 632 }, { "epoch": 1.2958034800409417, "grad_norm": 0.4538564539433799, "learning_rate": 3.698215472728718e-05, "loss": 0.4566, "step": 633 }, { "epoch": 1.2978505629477994, "grad_norm": 0.387005647622149, "learning_rate": 3.696702395624608e-05, "loss": 0.4376, "step": 634 }, { "epoch": 1.2998976458546572, "grad_norm": 0.4089593023659839, "learning_rate": 3.6951858460331446e-05, "loss": 0.4297, "step": 635 }, { "epoch": 1.301944728761515, "grad_norm": 0.423980436557396, "learning_rate": 3.693665827058111e-05, "loss": 0.4407, "step": 636 }, { "epoch": 1.3039918116683726, "grad_norm": 0.3939340668431382, "learning_rate": 3.692142341810395e-05, "loss": 0.3762, "step": 637 }, { "epoch": 1.3060388945752304, "grad_norm": 0.3791993183647926, "learning_rate": 3.690615393407975e-05, "loss": 0.412, "step": 638 }, { "epoch": 1.308085977482088, "grad_norm": 0.44464627005705176, "learning_rate": 3.689084984975918e-05, "loss": 0.3946, "step": 639 }, { "epoch": 1.3101330603889458, "grad_norm": 0.4356188955070128, "learning_rate": 3.6875511196463715e-05, "loss": 0.4583, "step": 640 }, { "epoch": 1.3121801432958033, "grad_norm": 0.43588824508846175, "learning_rate": 3.686013800558561e-05, "loss": 0.4674, "step": 641 }, { "epoch": 1.3142272262026613, "grad_norm": 0.5156297864126925, "learning_rate": 3.6844730308587776e-05, "loss": 0.4052, "step": 642 }, { "epoch": 1.3162743091095188, "grad_norm": 0.40684908578032264, "learning_rate": 3.682928813700375e-05, "loss": 0.4329, "step": 643 }, { "epoch": 1.3183213920163768, "grad_norm": 0.42989019879872664, "learning_rate": 3.681381152243763e-05, "loss": 0.4264, "step": 644 }, { "epoch": 1.3203684749232343, "grad_norm": 0.37812994433882907, "learning_rate": 3.6798300496564e-05, "loss": 0.386, "step": 645 }, { "epoch": 1.3224155578300922, "grad_norm": 0.4191444772159957, "learning_rate": 3.678275509112788e-05, "loss": 0.4587, "step": 646 }, { "epoch": 1.3244626407369497, "grad_norm": 0.34057419650855425, "learning_rate": 3.6767175337944646e-05, "loss": 0.4069, "step": 647 }, { "epoch": 1.3265097236438077, "grad_norm": 0.4210596650083103, "learning_rate": 3.675156126889996e-05, "loss": 0.4614, "step": 648 }, { "epoch": 1.3285568065506652, "grad_norm": 0.380672360295006, "learning_rate": 3.6735912915949745e-05, "loss": 0.4919, "step": 649 }, { "epoch": 1.330603889457523, "grad_norm": 0.46378857138786805, "learning_rate": 3.672023031112005e-05, "loss": 0.4398, "step": 650 }, { "epoch": 1.3326509723643807, "grad_norm": 0.3731856457734196, "learning_rate": 3.670451348650705e-05, "loss": 0.3786, "step": 651 }, { "epoch": 1.3346980552712384, "grad_norm": 0.4439716102781381, "learning_rate": 3.6688762474276945e-05, "loss": 0.4175, "step": 652 }, { "epoch": 1.3367451381780961, "grad_norm": 0.5191394349356976, "learning_rate": 3.667297730666592e-05, "loss": 0.5042, "step": 653 }, { "epoch": 1.3387922210849539, "grad_norm": 0.41690574322071583, "learning_rate": 3.665715801598004e-05, "loss": 0.4098, "step": 654 }, { "epoch": 1.3408393039918116, "grad_norm": 0.42158819550590015, "learning_rate": 3.6641304634595216e-05, "loss": 0.4271, "step": 655 }, { "epoch": 1.3428863868986693, "grad_norm": 0.4596791286598413, "learning_rate": 3.662541719495714e-05, "loss": 0.4609, "step": 656 }, { "epoch": 1.344933469805527, "grad_norm": 0.4858116688501969, "learning_rate": 3.6609495729581186e-05, "loss": 0.4909, "step": 657 }, { "epoch": 1.3469805527123848, "grad_norm": 0.3811091658108338, "learning_rate": 3.659354027105238e-05, "loss": 0.4224, "step": 658 }, { "epoch": 1.3490276356192425, "grad_norm": 0.40653713538788, "learning_rate": 3.657755085202532e-05, "loss": 0.4144, "step": 659 }, { "epoch": 1.3510747185261003, "grad_norm": 0.4900473769452504, "learning_rate": 3.6561527505224104e-05, "loss": 0.4582, "step": 660 }, { "epoch": 1.353121801432958, "grad_norm": 0.45088745908937394, "learning_rate": 3.6545470263442265e-05, "loss": 0.4345, "step": 661 }, { "epoch": 1.3551688843398157, "grad_norm": 0.44409465950161797, "learning_rate": 3.65293791595427e-05, "loss": 0.4867, "step": 662 }, { "epoch": 1.3572159672466735, "grad_norm": 0.3857032013105052, "learning_rate": 3.651325422645763e-05, "loss": 0.4072, "step": 663 }, { "epoch": 1.3592630501535312, "grad_norm": 0.4196509296621015, "learning_rate": 3.649709549718849e-05, "loss": 0.4109, "step": 664 }, { "epoch": 1.361310133060389, "grad_norm": 0.3711033202233833, "learning_rate": 3.648090300480589e-05, "loss": 0.4498, "step": 665 }, { "epoch": 1.3633572159672467, "grad_norm": 0.4738363974307569, "learning_rate": 3.646467678244954e-05, "loss": 0.4268, "step": 666 }, { "epoch": 1.3654042988741044, "grad_norm": 0.4028954103082967, "learning_rate": 3.6448416863328186e-05, "loss": 0.4346, "step": 667 }, { "epoch": 1.3674513817809621, "grad_norm": 0.4635908886319793, "learning_rate": 3.643212328071953e-05, "loss": 0.4453, "step": 668 }, { "epoch": 1.3694984646878199, "grad_norm": 0.38319546452013337, "learning_rate": 3.641579606797017e-05, "loss": 0.4054, "step": 669 }, { "epoch": 1.3715455475946776, "grad_norm": 0.3700643765726001, "learning_rate": 3.639943525849555e-05, "loss": 0.4692, "step": 670 }, { "epoch": 1.3735926305015353, "grad_norm": 0.34905600329178454, "learning_rate": 3.638304088577984e-05, "loss": 0.4131, "step": 671 }, { "epoch": 1.375639713408393, "grad_norm": 0.37865539989560054, "learning_rate": 3.6366612983375936e-05, "loss": 0.4696, "step": 672 }, { "epoch": 1.3776867963152508, "grad_norm": 0.34634916188478093, "learning_rate": 3.635015158490533e-05, "loss": 0.4206, "step": 673 }, { "epoch": 1.3797338792221086, "grad_norm": 0.3790168529129947, "learning_rate": 3.6333656724058075e-05, "loss": 0.4311, "step": 674 }, { "epoch": 1.3817809621289663, "grad_norm": 0.39245574822155366, "learning_rate": 3.6317128434592725e-05, "loss": 0.453, "step": 675 }, { "epoch": 1.383828045035824, "grad_norm": 0.3723108820988336, "learning_rate": 3.6300566750336225e-05, "loss": 0.4055, "step": 676 }, { "epoch": 1.3858751279426818, "grad_norm": 0.37587879714211964, "learning_rate": 3.6283971705183884e-05, "loss": 0.4468, "step": 677 }, { "epoch": 1.3879222108495395, "grad_norm": 0.3410504468464227, "learning_rate": 3.626734333309927e-05, "loss": 0.3782, "step": 678 }, { "epoch": 1.3899692937563972, "grad_norm": 0.4332726859166056, "learning_rate": 3.625068166811418e-05, "loss": 0.4419, "step": 679 }, { "epoch": 1.3920163766632547, "grad_norm": 0.3869800586992885, "learning_rate": 3.623398674432853e-05, "loss": 0.4211, "step": 680 }, { "epoch": 1.3940634595701127, "grad_norm": 0.38121810828451136, "learning_rate": 3.621725859591031e-05, "loss": 0.4592, "step": 681 }, { "epoch": 1.3961105424769702, "grad_norm": 0.39222808854449187, "learning_rate": 3.6200497257095504e-05, "loss": 0.4664, "step": 682 }, { "epoch": 1.3981576253838282, "grad_norm": 0.3565067379505055, "learning_rate": 3.6183702762188045e-05, "loss": 0.4218, "step": 683 }, { "epoch": 1.4002047082906857, "grad_norm": 0.3730858903859956, "learning_rate": 3.6166875145559684e-05, "loss": 0.4338, "step": 684 }, { "epoch": 1.4022517911975436, "grad_norm": 0.36655424256782704, "learning_rate": 3.615001444165001e-05, "loss": 0.4372, "step": 685 }, { "epoch": 1.4042988741044011, "grad_norm": 0.4487119389407511, "learning_rate": 3.613312068496627e-05, "loss": 0.462, "step": 686 }, { "epoch": 1.406345957011259, "grad_norm": 0.3515839553061697, "learning_rate": 3.611619391008341e-05, "loss": 0.3974, "step": 687 }, { "epoch": 1.4083930399181166, "grad_norm": 0.3677425891572493, "learning_rate": 3.6099234151643924e-05, "loss": 0.4634, "step": 688 }, { "epoch": 1.4104401228249743, "grad_norm": 0.3826249798074036, "learning_rate": 3.608224144435781e-05, "loss": 0.4338, "step": 689 }, { "epoch": 1.412487205731832, "grad_norm": 0.37018522079183763, "learning_rate": 3.606521582300252e-05, "loss": 0.4089, "step": 690 }, { "epoch": 1.4145342886386898, "grad_norm": 0.39718163395641504, "learning_rate": 3.604815732242283e-05, "loss": 0.4481, "step": 691 }, { "epoch": 1.4165813715455475, "grad_norm": 0.4177328745486512, "learning_rate": 3.6031065977530854e-05, "loss": 0.4061, "step": 692 }, { "epoch": 1.4186284544524053, "grad_norm": 0.43120876742640946, "learning_rate": 3.6013941823305884e-05, "loss": 0.4105, "step": 693 }, { "epoch": 1.420675537359263, "grad_norm": 0.4330368740835022, "learning_rate": 3.5996784894794394e-05, "loss": 0.4329, "step": 694 }, { "epoch": 1.4227226202661207, "grad_norm": 0.42455998302592957, "learning_rate": 3.5979595227109906e-05, "loss": 0.4045, "step": 695 }, { "epoch": 1.4247697031729785, "grad_norm": 0.4072147719505155, "learning_rate": 3.5962372855432956e-05, "loss": 0.452, "step": 696 }, { "epoch": 1.4268167860798362, "grad_norm": 0.4188081245089065, "learning_rate": 3.594511781501103e-05, "loss": 0.4199, "step": 697 }, { "epoch": 1.428863868986694, "grad_norm": 0.38251894827311167, "learning_rate": 3.592783014115845e-05, "loss": 0.3955, "step": 698 }, { "epoch": 1.4309109518935517, "grad_norm": 0.4441091071016442, "learning_rate": 3.5910509869256326e-05, "loss": 0.4398, "step": 699 }, { "epoch": 1.4329580348004094, "grad_norm": 0.38058511976389736, "learning_rate": 3.58931570347525e-05, "loss": 0.4019, "step": 700 }, { "epoch": 1.4350051177072671, "grad_norm": 0.3617807382285515, "learning_rate": 3.587577167316146e-05, "loss": 0.4363, "step": 701 }, { "epoch": 1.4370522006141249, "grad_norm": 0.43987641158494933, "learning_rate": 3.585835382006424e-05, "loss": 0.4328, "step": 702 }, { "epoch": 1.4390992835209826, "grad_norm": 0.3609144606393506, "learning_rate": 3.584090351110838e-05, "loss": 0.4104, "step": 703 }, { "epoch": 1.4411463664278403, "grad_norm": 0.4091808525809948, "learning_rate": 3.582342078200786e-05, "loss": 0.4138, "step": 704 }, { "epoch": 1.443193449334698, "grad_norm": 0.41531638710969015, "learning_rate": 3.5805905668543e-05, "loss": 0.4697, "step": 705 }, { "epoch": 1.4452405322415558, "grad_norm": 0.4154315464780426, "learning_rate": 3.57883582065604e-05, "loss": 0.4844, "step": 706 }, { "epoch": 1.4472876151484135, "grad_norm": 0.426486587951999, "learning_rate": 3.577077843197285e-05, "loss": 0.4088, "step": 707 }, { "epoch": 1.4493346980552713, "grad_norm": 0.4965404868194584, "learning_rate": 3.57531663807593e-05, "loss": 0.4369, "step": 708 }, { "epoch": 1.451381780962129, "grad_norm": 0.4695819928605211, "learning_rate": 3.573552208896474e-05, "loss": 0.4382, "step": 709 }, { "epoch": 1.4534288638689867, "grad_norm": 0.46954254869967305, "learning_rate": 3.571784559270014e-05, "loss": 0.4456, "step": 710 }, { "epoch": 1.4554759467758445, "grad_norm": 0.4760159462479475, "learning_rate": 3.570013692814239e-05, "loss": 0.4477, "step": 711 }, { "epoch": 1.4575230296827022, "grad_norm": 0.3992032479219894, "learning_rate": 3.568239613153421e-05, "loss": 0.396, "step": 712 }, { "epoch": 1.45957011258956, "grad_norm": 0.5000911965095803, "learning_rate": 3.566462323918409e-05, "loss": 0.4535, "step": 713 }, { "epoch": 1.4616171954964177, "grad_norm": 0.43927830807739277, "learning_rate": 3.564681828746619e-05, "loss": 0.4579, "step": 714 }, { "epoch": 1.4636642784032754, "grad_norm": 0.3840584961797973, "learning_rate": 3.5628981312820315e-05, "loss": 0.3861, "step": 715 }, { "epoch": 1.4657113613101331, "grad_norm": 0.46545692391278676, "learning_rate": 3.561111235175177e-05, "loss": 0.4142, "step": 716 }, { "epoch": 1.4677584442169909, "grad_norm": 0.32920533353338466, "learning_rate": 3.5593211440831345e-05, "loss": 0.3932, "step": 717 }, { "epoch": 1.4698055271238486, "grad_norm": 0.5879408329975259, "learning_rate": 3.557527861669522e-05, "loss": 0.4518, "step": 718 }, { "epoch": 1.471852610030706, "grad_norm": 0.3711931671196681, "learning_rate": 3.555731391604488e-05, "loss": 0.4135, "step": 719 }, { "epoch": 1.473899692937564, "grad_norm": 0.451950604877235, "learning_rate": 3.553931737564705e-05, "loss": 0.3778, "step": 720 }, { "epoch": 1.4759467758444216, "grad_norm": 0.3802803607853056, "learning_rate": 3.552128903233363e-05, "loss": 0.4528, "step": 721 }, { "epoch": 1.4779938587512795, "grad_norm": 0.5401764017146911, "learning_rate": 3.55032289230016e-05, "loss": 0.4431, "step": 722 }, { "epoch": 1.480040941658137, "grad_norm": 0.39166017929307784, "learning_rate": 3.5485137084612945e-05, "loss": 0.4384, "step": 723 }, { "epoch": 1.482088024564995, "grad_norm": 0.44680536201066046, "learning_rate": 3.54670135541946e-05, "loss": 0.4017, "step": 724 }, { "epoch": 1.4841351074718525, "grad_norm": 0.4126986303099099, "learning_rate": 3.544885836883836e-05, "loss": 0.4267, "step": 725 }, { "epoch": 1.4861821903787105, "grad_norm": 0.49002764481757854, "learning_rate": 3.5430671565700786e-05, "loss": 0.4451, "step": 726 }, { "epoch": 1.488229273285568, "grad_norm": 0.43094421524683524, "learning_rate": 3.541245318200318e-05, "loss": 0.4157, "step": 727 }, { "epoch": 1.4902763561924257, "grad_norm": 0.4077891821232257, "learning_rate": 3.5394203255031445e-05, "loss": 0.4184, "step": 728 }, { "epoch": 1.4923234390992834, "grad_norm": 0.4214423887733812, "learning_rate": 3.537592182213607e-05, "loss": 0.4404, "step": 729 }, { "epoch": 1.4943705220061412, "grad_norm": 0.39779568039250524, "learning_rate": 3.5357608920732e-05, "loss": 0.3862, "step": 730 }, { "epoch": 1.496417604912999, "grad_norm": 0.3868845645927947, "learning_rate": 3.5339264588298606e-05, "loss": 0.4859, "step": 731 }, { "epoch": 1.4984646878198566, "grad_norm": 0.401554933480186, "learning_rate": 3.532088886237956e-05, "loss": 0.4605, "step": 732 }, { "epoch": 1.5005117707267144, "grad_norm": 0.39625088885583626, "learning_rate": 3.530248178058282e-05, "loss": 0.4371, "step": 733 }, { "epoch": 1.5025588536335721, "grad_norm": 0.3433544773175932, "learning_rate": 3.528404338058046e-05, "loss": 0.359, "step": 734 }, { "epoch": 1.5046059365404298, "grad_norm": 0.3790208864361706, "learning_rate": 3.526557370010872e-05, "loss": 0.4668, "step": 735 }, { "epoch": 1.5066530194472876, "grad_norm": 0.3463058143561582, "learning_rate": 3.5247072776967805e-05, "loss": 0.428, "step": 736 }, { "epoch": 1.5087001023541453, "grad_norm": 0.4092673650520593, "learning_rate": 3.522854064902189e-05, "loss": 0.4787, "step": 737 }, { "epoch": 1.510747185261003, "grad_norm": 0.3701835656091217, "learning_rate": 3.520997735419901e-05, "loss": 0.4335, "step": 738 }, { "epoch": 1.5127942681678608, "grad_norm": 0.4036995525874622, "learning_rate": 3.519138293049097e-05, "loss": 0.4371, "step": 739 }, { "epoch": 1.5148413510747185, "grad_norm": 0.3520959626432674, "learning_rate": 3.51727574159533e-05, "loss": 0.442, "step": 740 }, { "epoch": 1.5168884339815762, "grad_norm": 0.3474665477504301, "learning_rate": 3.515410084870516e-05, "loss": 0.3833, "step": 741 }, { "epoch": 1.518935516888434, "grad_norm": 0.37700117041246284, "learning_rate": 3.513541326692925e-05, "loss": 0.421, "step": 742 }, { "epoch": 1.5209825997952917, "grad_norm": 0.3354789534984794, "learning_rate": 3.511669470887177e-05, "loss": 0.3646, "step": 743 }, { "epoch": 1.5230296827021494, "grad_norm": 0.36336403413161944, "learning_rate": 3.509794521284228e-05, "loss": 0.4479, "step": 744 }, { "epoch": 1.5250767656090072, "grad_norm": 0.3640226182102435, "learning_rate": 3.5079164817213684e-05, "loss": 0.3994, "step": 745 }, { "epoch": 1.527123848515865, "grad_norm": 0.39095484712326395, "learning_rate": 3.5060353560422137e-05, "loss": 0.4906, "step": 746 }, { "epoch": 1.5291709314227226, "grad_norm": 0.42138462191208187, "learning_rate": 3.504151148096691e-05, "loss": 0.4876, "step": 747 }, { "epoch": 1.5312180143295804, "grad_norm": 0.36356347432593744, "learning_rate": 3.5022638617410396e-05, "loss": 0.4171, "step": 748 }, { "epoch": 1.5332650972364381, "grad_norm": 0.402770920410115, "learning_rate": 3.500373500837799e-05, "loss": 0.4427, "step": 749 }, { "epoch": 1.5353121801432958, "grad_norm": 0.35789791445115465, "learning_rate": 3.4984800692557974e-05, "loss": 0.4306, "step": 750 }, { "epoch": 1.5373592630501536, "grad_norm": 0.3511318223180355, "learning_rate": 3.496583570870152e-05, "loss": 0.4051, "step": 751 }, { "epoch": 1.5394063459570113, "grad_norm": 0.415831551492267, "learning_rate": 3.494684009562254e-05, "loss": 0.4738, "step": 752 }, { "epoch": 1.541453428863869, "grad_norm": 0.3774452021561776, "learning_rate": 3.492781389219763e-05, "loss": 0.472, "step": 753 }, { "epoch": 1.5435005117707266, "grad_norm": 0.38222406103253237, "learning_rate": 3.4908757137366006e-05, "loss": 0.3984, "step": 754 }, { "epoch": 1.5455475946775845, "grad_norm": 0.4097364486307627, "learning_rate": 3.488966987012941e-05, "loss": 0.4436, "step": 755 }, { "epoch": 1.547594677584442, "grad_norm": 0.3733136430359242, "learning_rate": 3.487055212955201e-05, "loss": 0.4102, "step": 756 }, { "epoch": 1.5496417604913, "grad_norm": 0.49818552627556745, "learning_rate": 3.485140395476038e-05, "loss": 0.485, "step": 757 }, { "epoch": 1.5516888433981575, "grad_norm": 0.40160664320174677, "learning_rate": 3.4832225384943335e-05, "loss": 0.4662, "step": 758 }, { "epoch": 1.5537359263050154, "grad_norm": 0.36914648750243184, "learning_rate": 3.481301645935193e-05, "loss": 0.4173, "step": 759 }, { "epoch": 1.555783009211873, "grad_norm": 0.40716553946286355, "learning_rate": 3.4793777217299346e-05, "loss": 0.417, "step": 760 }, { "epoch": 1.557830092118731, "grad_norm": 0.42779403061287685, "learning_rate": 3.477450769816077e-05, "loss": 0.4848, "step": 761 }, { "epoch": 1.5598771750255884, "grad_norm": 0.3422957268296126, "learning_rate": 3.475520794137341e-05, "loss": 0.4039, "step": 762 }, { "epoch": 1.5619242579324464, "grad_norm": 0.4032099351305834, "learning_rate": 3.473587798643633e-05, "loss": 0.4378, "step": 763 }, { "epoch": 1.563971340839304, "grad_norm": 0.4010793632381822, "learning_rate": 3.4716517872910405e-05, "loss": 0.4461, "step": 764 }, { "epoch": 1.5660184237461618, "grad_norm": 0.41602009577806304, "learning_rate": 3.4697127640418204e-05, "loss": 0.5198, "step": 765 }, { "epoch": 1.5680655066530194, "grad_norm": 0.3840607306535795, "learning_rate": 3.467770732864399e-05, "loss": 0.4072, "step": 766 }, { "epoch": 1.5701125895598773, "grad_norm": 0.45544304460334517, "learning_rate": 3.4658256977333536e-05, "loss": 0.4263, "step": 767 }, { "epoch": 1.5721596724667348, "grad_norm": 0.4489200382867973, "learning_rate": 3.4638776626294134e-05, "loss": 0.4189, "step": 768 }, { "epoch": 1.5742067553735928, "grad_norm": 0.40708118209847305, "learning_rate": 3.461926631539445e-05, "loss": 0.4344, "step": 769 }, { "epoch": 1.5762538382804503, "grad_norm": 0.47396385986382383, "learning_rate": 3.459972608456448e-05, "loss": 0.4318, "step": 770 }, { "epoch": 1.5783009211873082, "grad_norm": 0.44275905450120784, "learning_rate": 3.4580155973795434e-05, "loss": 0.4024, "step": 771 }, { "epoch": 1.5803480040941658, "grad_norm": 0.37512733188763275, "learning_rate": 3.4560556023139695e-05, "loss": 0.4646, "step": 772 }, { "epoch": 1.5823950870010235, "grad_norm": 0.4718852007159965, "learning_rate": 3.454092627271072e-05, "loss": 0.4386, "step": 773 }, { "epoch": 1.5844421699078812, "grad_norm": 0.36977406003434726, "learning_rate": 3.4521266762682924e-05, "loss": 0.4429, "step": 774 }, { "epoch": 1.586489252814739, "grad_norm": 0.36917719793736264, "learning_rate": 3.450157753329166e-05, "loss": 0.4753, "step": 775 }, { "epoch": 1.5885363357215967, "grad_norm": 0.466057931195573, "learning_rate": 3.448185862483309e-05, "loss": 0.4542, "step": 776 }, { "epoch": 1.5905834186284544, "grad_norm": 0.34642675856704536, "learning_rate": 3.446211007766412e-05, "loss": 0.3954, "step": 777 }, { "epoch": 1.5926305015353122, "grad_norm": 0.4330447451756299, "learning_rate": 3.4442331932202326e-05, "loss": 0.4278, "step": 778 }, { "epoch": 1.59467758444217, "grad_norm": 0.3730822908863033, "learning_rate": 3.4422524228925836e-05, "loss": 0.4061, "step": 779 }, { "epoch": 1.5967246673490276, "grad_norm": 0.37973943757397915, "learning_rate": 3.440268700837329e-05, "loss": 0.394, "step": 780 }, { "epoch": 1.5987717502558854, "grad_norm": 0.49428851704529125, "learning_rate": 3.438282031114374e-05, "loss": 0.4486, "step": 781 }, { "epoch": 1.600818833162743, "grad_norm": 0.41805483229206175, "learning_rate": 3.4362924177896545e-05, "loss": 0.4393, "step": 782 }, { "epoch": 1.6028659160696008, "grad_norm": 0.47689234144727466, "learning_rate": 3.434299864935133e-05, "loss": 0.4354, "step": 783 }, { "epoch": 1.6049129989764586, "grad_norm": 0.42155977412917267, "learning_rate": 3.432304376628787e-05, "loss": 0.4602, "step": 784 }, { "epoch": 1.6069600818833163, "grad_norm": 0.3865188335568589, "learning_rate": 3.430305956954602e-05, "loss": 0.4152, "step": 785 }, { "epoch": 1.609007164790174, "grad_norm": 0.4271955733661172, "learning_rate": 3.428304610002563e-05, "loss": 0.4408, "step": 786 }, { "epoch": 1.6110542476970318, "grad_norm": 0.44963813575609907, "learning_rate": 3.4263003398686464e-05, "loss": 0.4958, "step": 787 }, { "epoch": 1.6131013306038895, "grad_norm": 0.3733726588143654, "learning_rate": 3.424293150654809e-05, "loss": 0.4287, "step": 788 }, { "epoch": 1.6151484135107472, "grad_norm": 0.3812115032343838, "learning_rate": 3.422283046468985e-05, "loss": 0.405, "step": 789 }, { "epoch": 1.617195496417605, "grad_norm": 0.37005120300102184, "learning_rate": 3.420270031425072e-05, "loss": 0.4516, "step": 790 }, { "epoch": 1.6192425793244627, "grad_norm": 0.45666000218850494, "learning_rate": 3.4182541096429265e-05, "loss": 0.4523, "step": 791 }, { "epoch": 1.6212896622313204, "grad_norm": 0.34898311295474405, "learning_rate": 3.416235285248352e-05, "loss": 0.4007, "step": 792 }, { "epoch": 1.623336745138178, "grad_norm": 0.3839127190696324, "learning_rate": 3.4142135623730954e-05, "loss": 0.4617, "step": 793 }, { "epoch": 1.625383828045036, "grad_norm": 0.4387438648695345, "learning_rate": 3.412188945154833e-05, "loss": 0.4723, "step": 794 }, { "epoch": 1.6274309109518934, "grad_norm": 0.3332803619102528, "learning_rate": 3.410161437737166e-05, "loss": 0.362, "step": 795 }, { "epoch": 1.6294779938587514, "grad_norm": 0.3807927986402262, "learning_rate": 3.4081310442696114e-05, "loss": 0.4339, "step": 796 }, { "epoch": 1.6315250767656089, "grad_norm": 0.3937568675037694, "learning_rate": 3.4060977689075914e-05, "loss": 0.4184, "step": 797 }, { "epoch": 1.6335721596724668, "grad_norm": 0.40435872407490453, "learning_rate": 3.404061615812425e-05, "loss": 0.4906, "step": 798 }, { "epoch": 1.6356192425793243, "grad_norm": 0.37907378547306464, "learning_rate": 3.402022589151325e-05, "loss": 0.4273, "step": 799 }, { "epoch": 1.6376663254861823, "grad_norm": 0.3703864227511636, "learning_rate": 3.399980693097383e-05, "loss": 0.4154, "step": 800 }, { "epoch": 1.6397134083930398, "grad_norm": 0.4191952674222777, "learning_rate": 3.3979359318295605e-05, "loss": 0.3581, "step": 801 }, { "epoch": 1.6417604912998978, "grad_norm": 0.3966331751054016, "learning_rate": 3.395888309532687e-05, "loss": 0.4669, "step": 802 }, { "epoch": 1.6438075742067553, "grad_norm": 0.445008569764165, "learning_rate": 3.393837830397446e-05, "loss": 0.4267, "step": 803 }, { "epoch": 1.6458546571136132, "grad_norm": 0.46681180810571704, "learning_rate": 3.391784498620369e-05, "loss": 0.4895, "step": 804 }, { "epoch": 1.6479017400204707, "grad_norm": 0.3385167810574236, "learning_rate": 3.3897283184038215e-05, "loss": 0.404, "step": 805 }, { "epoch": 1.6499488229273287, "grad_norm": 0.5219129958909896, "learning_rate": 3.387669293956003e-05, "loss": 0.4305, "step": 806 }, { "epoch": 1.6519959058341862, "grad_norm": 0.3668653050867526, "learning_rate": 3.385607429490934e-05, "loss": 0.3746, "step": 807 }, { "epoch": 1.6540429887410442, "grad_norm": 0.5158311002393402, "learning_rate": 3.3835427292284445e-05, "loss": 0.486, "step": 808 }, { "epoch": 1.6560900716479017, "grad_norm": 0.36978068539278974, "learning_rate": 3.38147519739417e-05, "loss": 0.4487, "step": 809 }, { "epoch": 1.6581371545547596, "grad_norm": 0.3772021211105363, "learning_rate": 3.37940483821954e-05, "loss": 0.4017, "step": 810 }, { "epoch": 1.6601842374616171, "grad_norm": 0.41401208686983515, "learning_rate": 3.3773316559417734e-05, "loss": 0.491, "step": 811 }, { "epoch": 1.6622313203684749, "grad_norm": 0.3524038676017696, "learning_rate": 3.375255654803864e-05, "loss": 0.3938, "step": 812 }, { "epoch": 1.6642784032753326, "grad_norm": 0.4096084241414413, "learning_rate": 3.373176839054576e-05, "loss": 0.4157, "step": 813 }, { "epoch": 1.6663254861821903, "grad_norm": 0.4186774343886114, "learning_rate": 3.371095212948431e-05, "loss": 0.4681, "step": 814 }, { "epoch": 1.668372569089048, "grad_norm": 0.40515797591205743, "learning_rate": 3.3690107807457085e-05, "loss": 0.4459, "step": 815 }, { "epoch": 1.6704196519959058, "grad_norm": 0.32867520587713756, "learning_rate": 3.366923546712426e-05, "loss": 0.3888, "step": 816 }, { "epoch": 1.6724667349027635, "grad_norm": 0.347589875409667, "learning_rate": 3.364833515120336e-05, "loss": 0.4083, "step": 817 }, { "epoch": 1.6745138178096213, "grad_norm": 0.3591052938957919, "learning_rate": 3.362740690246918e-05, "loss": 0.3938, "step": 818 }, { "epoch": 1.676560900716479, "grad_norm": 0.35854368626658006, "learning_rate": 3.360645076375368e-05, "loss": 0.4217, "step": 819 }, { "epoch": 1.6786079836233367, "grad_norm": 0.44210711469086067, "learning_rate": 3.358546677794586e-05, "loss": 0.4752, "step": 820 }, { "epoch": 1.6806550665301945, "grad_norm": 0.3418341105649144, "learning_rate": 3.356445498799179e-05, "loss": 0.3828, "step": 821 }, { "epoch": 1.6827021494370522, "grad_norm": 0.40574252833368174, "learning_rate": 3.354341543689438e-05, "loss": 0.4138, "step": 822 }, { "epoch": 1.68474923234391, "grad_norm": 0.41618079643123923, "learning_rate": 3.352234816771337e-05, "loss": 0.4559, "step": 823 }, { "epoch": 1.6867963152507677, "grad_norm": 0.42761098792857727, "learning_rate": 3.350125322356525e-05, "loss": 0.4466, "step": 824 }, { "epoch": 1.6888433981576254, "grad_norm": 0.33049589320031353, "learning_rate": 3.348013064762312e-05, "loss": 0.3768, "step": 825 }, { "epoch": 1.6908904810644831, "grad_norm": 0.3768698742983625, "learning_rate": 3.3458980483116664e-05, "loss": 0.4278, "step": 826 }, { "epoch": 1.6929375639713409, "grad_norm": 0.3675413592176129, "learning_rate": 3.343780277333199e-05, "loss": 0.4573, "step": 827 }, { "epoch": 1.6949846468781986, "grad_norm": 0.39935534592316574, "learning_rate": 3.3416597561611616e-05, "loss": 0.4607, "step": 828 }, { "epoch": 1.6970317297850563, "grad_norm": 0.3422867133996467, "learning_rate": 3.3395364891354316e-05, "loss": 0.3925, "step": 829 }, { "epoch": 1.699078812691914, "grad_norm": 0.4421095089073154, "learning_rate": 3.33741048060151e-05, "loss": 0.5186, "step": 830 }, { "epoch": 1.7011258955987718, "grad_norm": 0.40513609641554227, "learning_rate": 3.3352817349105046e-05, "loss": 0.4388, "step": 831 }, { "epoch": 1.7031729785056293, "grad_norm": 0.47309198797804985, "learning_rate": 3.333150256419127e-05, "loss": 0.5152, "step": 832 }, { "epoch": 1.7052200614124873, "grad_norm": 0.37396555660516145, "learning_rate": 3.331016049489681e-05, "loss": 0.3778, "step": 833 }, { "epoch": 1.7072671443193448, "grad_norm": 0.46343295183867955, "learning_rate": 3.328879118490055e-05, "loss": 0.4181, "step": 834 }, { "epoch": 1.7093142272262027, "grad_norm": 0.36985219342112263, "learning_rate": 3.3267394677937134e-05, "loss": 0.4519, "step": 835 }, { "epoch": 1.7113613101330603, "grad_norm": 0.47168961292482453, "learning_rate": 3.3245971017796854e-05, "loss": 0.4754, "step": 836 }, { "epoch": 1.7134083930399182, "grad_norm": 0.45545209130289294, "learning_rate": 3.322452024832557e-05, "loss": 0.4054, "step": 837 }, { "epoch": 1.7154554759467757, "grad_norm": 0.44300545126776186, "learning_rate": 3.320304241342464e-05, "loss": 0.4707, "step": 838 }, { "epoch": 1.7175025588536337, "grad_norm": 0.40914423787668386, "learning_rate": 3.31815375570508e-05, "loss": 0.4008, "step": 839 }, { "epoch": 1.7195496417604912, "grad_norm": 0.42788343961512754, "learning_rate": 3.3160005723216105e-05, "loss": 0.4063, "step": 840 }, { "epoch": 1.7215967246673491, "grad_norm": 0.3836547816282569, "learning_rate": 3.31384469559878e-05, "loss": 0.4471, "step": 841 }, { "epoch": 1.7236438075742067, "grad_norm": 0.3712650340512891, "learning_rate": 3.311686129948827e-05, "loss": 0.3813, "step": 842 }, { "epoch": 1.7256908904810646, "grad_norm": 0.3661541112584301, "learning_rate": 3.3095248797894925e-05, "loss": 0.4373, "step": 843 }, { "epoch": 1.7277379733879221, "grad_norm": 0.40769069558582477, "learning_rate": 3.307360949544012e-05, "loss": 0.433, "step": 844 }, { "epoch": 1.72978505629478, "grad_norm": 0.4384390897805754, "learning_rate": 3.305194343641106e-05, "loss": 0.4414, "step": 845 }, { "epoch": 1.7318321392016376, "grad_norm": 0.39419405263041707, "learning_rate": 3.30302506651497e-05, "loss": 0.3903, "step": 846 }, { "epoch": 1.7338792221084955, "grad_norm": 0.5193517899563321, "learning_rate": 3.300853122605268e-05, "loss": 0.5126, "step": 847 }, { "epoch": 1.735926305015353, "grad_norm": 0.37470876760021676, "learning_rate": 3.2986785163571216e-05, "loss": 0.4088, "step": 848 }, { "epoch": 1.737973387922211, "grad_norm": 0.3911340678248973, "learning_rate": 3.2965012522211e-05, "loss": 0.4231, "step": 849 }, { "epoch": 1.7400204708290685, "grad_norm": 0.3718531256618771, "learning_rate": 3.294321334653213e-05, "loss": 0.4087, "step": 850 }, { "epoch": 1.7420675537359263, "grad_norm": 0.44651626014948276, "learning_rate": 3.2921387681149e-05, "loss": 0.4623, "step": 851 }, { "epoch": 1.744114636642784, "grad_norm": 0.37467196684362686, "learning_rate": 3.289953557073024e-05, "loss": 0.4148, "step": 852 }, { "epoch": 1.7461617195496417, "grad_norm": 0.42577176527999205, "learning_rate": 3.2877657059998584e-05, "loss": 0.4103, "step": 853 }, { "epoch": 1.7482088024564995, "grad_norm": 0.3833747846094608, "learning_rate": 3.285575219373079e-05, "loss": 0.4432, "step": 854 }, { "epoch": 1.7502558853633572, "grad_norm": 0.4116483874002876, "learning_rate": 3.2833821016757586e-05, "loss": 0.4203, "step": 855 }, { "epoch": 1.752302968270215, "grad_norm": 0.330063754223896, "learning_rate": 3.281186357396351e-05, "loss": 0.3895, "step": 856 }, { "epoch": 1.7543500511770727, "grad_norm": 0.398313432569639, "learning_rate": 3.278987991028688e-05, "loss": 0.4367, "step": 857 }, { "epoch": 1.7563971340839304, "grad_norm": 0.3618880492014595, "learning_rate": 3.276787007071968e-05, "loss": 0.3917, "step": 858 }, { "epoch": 1.7584442169907881, "grad_norm": 0.4604401121085883, "learning_rate": 3.274583410030745e-05, "loss": 0.4577, "step": 859 }, { "epoch": 1.7604912998976459, "grad_norm": 0.33679127692517397, "learning_rate": 3.2723772044149224e-05, "loss": 0.4072, "step": 860 }, { "epoch": 1.7625383828045036, "grad_norm": 0.3624660321439512, "learning_rate": 3.270168394739741e-05, "loss": 0.4098, "step": 861 }, { "epoch": 1.7645854657113613, "grad_norm": 0.343429438960415, "learning_rate": 3.267956985525774e-05, "loss": 0.4262, "step": 862 }, { "epoch": 1.766632548618219, "grad_norm": 0.3866589432412809, "learning_rate": 3.26574298129891e-05, "loss": 0.4311, "step": 863 }, { "epoch": 1.7686796315250768, "grad_norm": 0.3484380413267947, "learning_rate": 3.263526386590351e-05, "loss": 0.4265, "step": 864 }, { "epoch": 1.7707267144319345, "grad_norm": 0.3727255169733587, "learning_rate": 3.261307205936603e-05, "loss": 0.4936, "step": 865 }, { "epoch": 1.7727737973387923, "grad_norm": 0.35604276467584445, "learning_rate": 3.2590854438794604e-05, "loss": 0.3855, "step": 866 }, { "epoch": 1.77482088024565, "grad_norm": 0.37701838111479336, "learning_rate": 3.2568611049660046e-05, "loss": 0.4308, "step": 867 }, { "epoch": 1.7768679631525077, "grad_norm": 0.3530904170276449, "learning_rate": 3.2546341937485884e-05, "loss": 0.4198, "step": 868 }, { "epoch": 1.7789150460593655, "grad_norm": 0.37564040686995553, "learning_rate": 3.2524047147848284e-05, "loss": 0.3702, "step": 869 }, { "epoch": 1.7809621289662232, "grad_norm": 0.366044908240742, "learning_rate": 3.250172672637598e-05, "loss": 0.4561, "step": 870 }, { "epoch": 1.7830092118730807, "grad_norm": 0.3683215486290299, "learning_rate": 3.247938071875017e-05, "loss": 0.4467, "step": 871 }, { "epoch": 1.7850562947799387, "grad_norm": 0.38716018556798415, "learning_rate": 3.24570091707044e-05, "loss": 0.4151, "step": 872 }, { "epoch": 1.7871033776867962, "grad_norm": 0.7367692001662189, "learning_rate": 3.24346121280245e-05, "loss": 0.438, "step": 873 }, { "epoch": 1.7891504605936541, "grad_norm": 0.41338889728660405, "learning_rate": 3.2412189636548456e-05, "loss": 0.4629, "step": 874 }, { "epoch": 1.7911975435005116, "grad_norm": 0.35357448274069847, "learning_rate": 3.238974174216637e-05, "loss": 0.3559, "step": 875 }, { "epoch": 1.7932446264073696, "grad_norm": 0.3796163326385031, "learning_rate": 3.236726849082032e-05, "loss": 0.4281, "step": 876 }, { "epoch": 1.795291709314227, "grad_norm": 0.3944707372507435, "learning_rate": 3.234476992850425e-05, "loss": 0.4537, "step": 877 }, { "epoch": 1.797338792221085, "grad_norm": 0.3318296802336291, "learning_rate": 3.232224610126396e-05, "loss": 0.3985, "step": 878 }, { "epoch": 1.7993858751279426, "grad_norm": 0.40919288715091856, "learning_rate": 3.229969705519693e-05, "loss": 0.4616, "step": 879 }, { "epoch": 1.8014329580348005, "grad_norm": 0.39543379019412783, "learning_rate": 3.227712283645224e-05, "loss": 0.4883, "step": 880 }, { "epoch": 1.803480040941658, "grad_norm": 0.3624564100802025, "learning_rate": 3.225452349123051e-05, "loss": 0.4264, "step": 881 }, { "epoch": 1.805527123848516, "grad_norm": 0.35250394717776745, "learning_rate": 3.2231899065783766e-05, "loss": 0.3975, "step": 882 }, { "epoch": 1.8075742067553735, "grad_norm": 0.41454217155818623, "learning_rate": 3.2209249606415394e-05, "loss": 0.4668, "step": 883 }, { "epoch": 1.8096212896622315, "grad_norm": 0.33248312085973525, "learning_rate": 3.2186575159479966e-05, "loss": 0.3853, "step": 884 }, { "epoch": 1.811668372569089, "grad_norm": 0.41602712353842625, "learning_rate": 3.2163875771383246e-05, "loss": 0.4615, "step": 885 }, { "epoch": 1.813715455475947, "grad_norm": 0.3941084429766098, "learning_rate": 3.214115148858201e-05, "loss": 0.5111, "step": 886 }, { "epoch": 1.8157625383828044, "grad_norm": 0.34750451682083505, "learning_rate": 3.211840235758399e-05, "loss": 0.4055, "step": 887 }, { "epoch": 1.8178096212896624, "grad_norm": 0.45072414952251894, "learning_rate": 3.209562842494778e-05, "loss": 0.4673, "step": 888 }, { "epoch": 1.81985670419652, "grad_norm": 0.36513950471383816, "learning_rate": 3.207282973728273e-05, "loss": 0.4526, "step": 889 }, { "epoch": 1.8219037871033776, "grad_norm": 0.3898093803880879, "learning_rate": 3.205000634124884e-05, "loss": 0.4045, "step": 890 }, { "epoch": 1.8239508700102354, "grad_norm": 0.36982373792147444, "learning_rate": 3.20271582835567e-05, "loss": 0.4079, "step": 891 }, { "epoch": 1.825997952917093, "grad_norm": 0.35532947918298324, "learning_rate": 3.200428561096737e-05, "loss": 0.444, "step": 892 }, { "epoch": 1.8280450358239508, "grad_norm": 0.3777203606733099, "learning_rate": 3.198138837029227e-05, "loss": 0.46, "step": 893 }, { "epoch": 1.8300921187308086, "grad_norm": 0.3489126911275309, "learning_rate": 3.195846660839311e-05, "loss": 0.3887, "step": 894 }, { "epoch": 1.8321392016376663, "grad_norm": 0.3633804424526499, "learning_rate": 3.193552037218179e-05, "loss": 0.5416, "step": 895 }, { "epoch": 1.834186284544524, "grad_norm": 0.38704595278722626, "learning_rate": 3.1912549708620314e-05, "loss": 0.47, "step": 896 }, { "epoch": 1.8362333674513818, "grad_norm": 0.36865192383072315, "learning_rate": 3.188955466472063e-05, "loss": 0.4084, "step": 897 }, { "epoch": 1.8382804503582395, "grad_norm": 0.42348322893722706, "learning_rate": 3.186653528754464e-05, "loss": 0.4354, "step": 898 }, { "epoch": 1.8403275332650972, "grad_norm": 0.37712189200316865, "learning_rate": 3.184349162420401e-05, "loss": 0.4011, "step": 899 }, { "epoch": 1.842374616171955, "grad_norm": 0.40519216196971364, "learning_rate": 3.182042372186013e-05, "loss": 0.4523, "step": 900 }, { "epoch": 1.8444216990788127, "grad_norm": 0.3270302439955843, "learning_rate": 3.179733162772398e-05, "loss": 0.3863, "step": 901 }, { "epoch": 1.8464687819856704, "grad_norm": 0.36562347535937434, "learning_rate": 3.177421538905606e-05, "loss": 0.414, "step": 902 }, { "epoch": 1.8485158648925282, "grad_norm": 0.34059679657441694, "learning_rate": 3.17510750531663e-05, "loss": 0.4319, "step": 903 }, { "epoch": 1.850562947799386, "grad_norm": 0.3971233949934265, "learning_rate": 3.172791066741392e-05, "loss": 0.4051, "step": 904 }, { "epoch": 1.8526100307062436, "grad_norm": 0.37489566812190156, "learning_rate": 3.170472227920737e-05, "loss": 0.4488, "step": 905 }, { "epoch": 1.8546571136131014, "grad_norm": 0.3619651650704685, "learning_rate": 3.168150993600424e-05, "loss": 0.4097, "step": 906 }, { "epoch": 1.856704196519959, "grad_norm": 0.4144444046604467, "learning_rate": 3.165827368531113e-05, "loss": 0.4838, "step": 907 }, { "epoch": 1.8587512794268168, "grad_norm": 0.34162161820675707, "learning_rate": 3.1635013574683564e-05, "loss": 0.403, "step": 908 }, { "epoch": 1.8607983623336746, "grad_norm": 0.3876122467545283, "learning_rate": 3.161172965172591e-05, "loss": 0.4564, "step": 909 }, { "epoch": 1.862845445240532, "grad_norm": 0.34340552862244217, "learning_rate": 3.1588421964091276e-05, "loss": 0.397, "step": 910 }, { "epoch": 1.86489252814739, "grad_norm": 0.383808899187042, "learning_rate": 3.1565090559481396e-05, "loss": 0.4265, "step": 911 }, { "epoch": 1.8669396110542475, "grad_norm": 0.31690081389294966, "learning_rate": 3.1541735485646536e-05, "loss": 0.4047, "step": 912 }, { "epoch": 1.8689866939611055, "grad_norm": 0.3992971993499514, "learning_rate": 3.151835679038542e-05, "loss": 0.439, "step": 913 }, { "epoch": 1.871033776867963, "grad_norm": 0.31545614214320866, "learning_rate": 3.149495452154512e-05, "loss": 0.3986, "step": 914 }, { "epoch": 1.873080859774821, "grad_norm": 0.33181454924376286, "learning_rate": 3.147152872702092e-05, "loss": 0.4364, "step": 915 }, { "epoch": 1.8751279426816785, "grad_norm": 0.38596107764783966, "learning_rate": 3.14480794547563e-05, "loss": 0.4666, "step": 916 }, { "epoch": 1.8771750255885364, "grad_norm": 0.3743307900880147, "learning_rate": 3.142460675274275e-05, "loss": 0.4136, "step": 917 }, { "epoch": 1.879222108495394, "grad_norm": 0.39267653351820997, "learning_rate": 3.1401110669019724e-05, "loss": 0.4308, "step": 918 }, { "epoch": 1.881269191402252, "grad_norm": 0.3865400215305747, "learning_rate": 3.137759125167455e-05, "loss": 0.4663, "step": 919 }, { "epoch": 1.8833162743091094, "grad_norm": 0.36025373309076514, "learning_rate": 3.135404854884226e-05, "loss": 0.4202, "step": 920 }, { "epoch": 1.8853633572159674, "grad_norm": 0.3774704107912035, "learning_rate": 3.133048260870561e-05, "loss": 0.4047, "step": 921 }, { "epoch": 1.8874104401228249, "grad_norm": 0.3281213257339426, "learning_rate": 3.130689347949486e-05, "loss": 0.4088, "step": 922 }, { "epoch": 1.8894575230296828, "grad_norm": 0.3593342361422244, "learning_rate": 3.1283281209487755e-05, "loss": 0.4475, "step": 923 }, { "epoch": 1.8915046059365404, "grad_norm": 0.348849346649699, "learning_rate": 3.1259645847009384e-05, "loss": 0.4133, "step": 924 }, { "epoch": 1.8935516888433983, "grad_norm": 0.3661633848236013, "learning_rate": 3.123598744043211e-05, "loss": 0.4345, "step": 925 }, { "epoch": 1.8955987717502558, "grad_norm": 0.39376453806042766, "learning_rate": 3.121230603817545e-05, "loss": 0.4802, "step": 926 }, { "epoch": 1.8976458546571138, "grad_norm": 0.4054577347062655, "learning_rate": 3.1188601688706e-05, "loss": 0.4861, "step": 927 }, { "epoch": 1.8996929375639713, "grad_norm": 0.3244003846213942, "learning_rate": 3.1164874440537295e-05, "loss": 0.3988, "step": 928 }, { "epoch": 1.901740020470829, "grad_norm": 0.3598304561011502, "learning_rate": 3.114112434222976e-05, "loss": 0.4083, "step": 929 }, { "epoch": 1.9037871033776868, "grad_norm": 0.3818801716604425, "learning_rate": 3.111735144239057e-05, "loss": 0.424, "step": 930 }, { "epoch": 1.9058341862845445, "grad_norm": 0.351513366570262, "learning_rate": 3.109355578967356e-05, "loss": 0.4529, "step": 931 }, { "epoch": 1.9078812691914022, "grad_norm": 0.3240275031967729, "learning_rate": 3.106973743277916e-05, "loss": 0.4211, "step": 932 }, { "epoch": 1.90992835209826, "grad_norm": 0.361341366033979, "learning_rate": 3.104589642045422e-05, "loss": 0.4776, "step": 933 }, { "epoch": 1.9119754350051177, "grad_norm": 0.29231193321874205, "learning_rate": 3.1022032801492e-05, "loss": 0.3741, "step": 934 }, { "epoch": 1.9140225179119754, "grad_norm": 0.3401516128458725, "learning_rate": 3.099814662473202e-05, "loss": 0.411, "step": 935 }, { "epoch": 1.9160696008188332, "grad_norm": 0.3813395041273339, "learning_rate": 3.0974237939059947e-05, "loss": 0.4652, "step": 936 }, { "epoch": 1.9181166837256909, "grad_norm": 0.33396626631772636, "learning_rate": 3.095030679340751e-05, "loss": 0.438, "step": 937 }, { "epoch": 1.9201637666325486, "grad_norm": 0.27154874124436096, "learning_rate": 3.092635323675245e-05, "loss": 0.3297, "step": 938 }, { "epoch": 1.9222108495394064, "grad_norm": 0.3729869848757238, "learning_rate": 3.0902377318118336e-05, "loss": 0.3925, "step": 939 }, { "epoch": 1.924257932446264, "grad_norm": 0.3729082740901169, "learning_rate": 3.0878379086574494e-05, "loss": 0.4632, "step": 940 }, { "epoch": 1.9263050153531218, "grad_norm": 0.33442817830860083, "learning_rate": 3.085435859123596e-05, "loss": 0.4246, "step": 941 }, { "epoch": 1.9283520982599796, "grad_norm": 0.3455333402015141, "learning_rate": 3.083031588126329e-05, "loss": 0.4291, "step": 942 }, { "epoch": 1.9303991811668373, "grad_norm": 0.33030857885870823, "learning_rate": 3.0806251005862535e-05, "loss": 0.4293, "step": 943 }, { "epoch": 1.932446264073695, "grad_norm": 0.3313566999610316, "learning_rate": 3.07821640142851e-05, "loss": 0.4691, "step": 944 }, { "epoch": 1.9344933469805528, "grad_norm": 0.3371774148631356, "learning_rate": 3.0758054955827655e-05, "loss": 0.4283, "step": 945 }, { "epoch": 1.9365404298874105, "grad_norm": 0.35055569911779827, "learning_rate": 3.073392387983202e-05, "loss": 0.4157, "step": 946 }, { "epoch": 1.9385875127942682, "grad_norm": 0.3713512962937003, "learning_rate": 3.070977083568508e-05, "loss": 0.4709, "step": 947 }, { "epoch": 1.940634595701126, "grad_norm": 0.3121880967121788, "learning_rate": 3.06855958728187e-05, "loss": 0.3584, "step": 948 }, { "epoch": 1.9426816786079835, "grad_norm": 0.3923276029776799, "learning_rate": 3.0661399040709584e-05, "loss": 0.4273, "step": 949 }, { "epoch": 1.9447287615148414, "grad_norm": 0.39501548252234137, "learning_rate": 3.0637180388879207e-05, "loss": 0.4292, "step": 950 }, { "epoch": 1.946775844421699, "grad_norm": 0.3918727723912462, "learning_rate": 3.061293996689369e-05, "loss": 0.4422, "step": 951 }, { "epoch": 1.9488229273285569, "grad_norm": 0.3650853100344803, "learning_rate": 3.05886778243637e-05, "loss": 0.4355, "step": 952 }, { "epoch": 1.9508700102354144, "grad_norm": 0.40314024207731314, "learning_rate": 3.0564394010944396e-05, "loss": 0.3964, "step": 953 }, { "epoch": 1.9529170931422724, "grad_norm": 0.3321329959720213, "learning_rate": 3.054008857633524e-05, "loss": 0.3802, "step": 954 }, { "epoch": 1.9549641760491299, "grad_norm": 0.3578935619719161, "learning_rate": 3.051576157027998e-05, "loss": 0.4187, "step": 955 }, { "epoch": 1.9570112589559878, "grad_norm": 0.36008706778157745, "learning_rate": 3.0491413042566492e-05, "loss": 0.421, "step": 956 }, { "epoch": 1.9590583418628453, "grad_norm": 0.35653968726696394, "learning_rate": 3.0467043043026705e-05, "loss": 0.3773, "step": 957 }, { "epoch": 1.9611054247697033, "grad_norm": 0.398663136011159, "learning_rate": 3.0442651621536502e-05, "loss": 0.4867, "step": 958 }, { "epoch": 1.9631525076765608, "grad_norm": 0.39803480683406417, "learning_rate": 3.041823882801559e-05, "loss": 0.4572, "step": 959 }, { "epoch": 1.9651995905834188, "grad_norm": 0.31571236419876386, "learning_rate": 3.039380471242743e-05, "loss": 0.3953, "step": 960 }, { "epoch": 1.9672466734902763, "grad_norm": 0.356704934998217, "learning_rate": 3.0369349324779115e-05, "loss": 0.4116, "step": 961 }, { "epoch": 1.9692937563971342, "grad_norm": 0.38456016779817315, "learning_rate": 3.0344872715121276e-05, "loss": 0.4181, "step": 962 }, { "epoch": 1.9713408393039917, "grad_norm": 0.42725505346001535, "learning_rate": 3.0320374933547982e-05, "loss": 0.4509, "step": 963 }, { "epoch": 1.9733879222108497, "grad_norm": 0.3124541183454085, "learning_rate": 3.0295856030196618e-05, "loss": 0.3635, "step": 964 }, { "epoch": 1.9754350051177072, "grad_norm": 0.38135606484411233, "learning_rate": 3.0271316055247812e-05, "loss": 0.4322, "step": 965 }, { "epoch": 1.9774820880245652, "grad_norm": 0.3564421243258828, "learning_rate": 3.024675505892531e-05, "loss": 0.4706, "step": 966 }, { "epoch": 1.9795291709314227, "grad_norm": 0.33614285440354186, "learning_rate": 3.022217309149588e-05, "loss": 0.3916, "step": 967 }, { "epoch": 1.9815762538382804, "grad_norm": 0.3642272706560808, "learning_rate": 3.019757020326921e-05, "loss": 0.4001, "step": 968 }, { "epoch": 1.9836233367451381, "grad_norm": 0.3677389037382802, "learning_rate": 3.017294644459782e-05, "loss": 0.4067, "step": 969 }, { "epoch": 1.9856704196519959, "grad_norm": 0.38491158269193576, "learning_rate": 3.0148301865876913e-05, "loss": 0.4266, "step": 970 }, { "epoch": 1.9877175025588536, "grad_norm": 0.4027798698020095, "learning_rate": 3.0123636517544326e-05, "loss": 0.5046, "step": 971 }, { "epoch": 1.9897645854657113, "grad_norm": 0.36103993797318307, "learning_rate": 3.0098950450080404e-05, "loss": 0.3863, "step": 972 }, { "epoch": 1.991811668372569, "grad_norm": 0.3202323840951822, "learning_rate": 3.0074243714007875e-05, "loss": 0.3562, "step": 973 }, { "epoch": 1.9938587512794268, "grad_norm": 0.3949919429980765, "learning_rate": 3.004951635989179e-05, "loss": 0.4732, "step": 974 }, { "epoch": 1.9959058341862845, "grad_norm": 0.3475495093601558, "learning_rate": 3.0024768438339388e-05, "loss": 0.391, "step": 975 }, { "epoch": 1.9979529170931423, "grad_norm": 0.39216689067501626, "learning_rate": 3.0000000000000004e-05, "loss": 0.4564, "step": 976 }, { "epoch": 2.0, "grad_norm": 0.6533349441460323, "learning_rate": 2.9975211095564955e-05, "loss": 0.5882, "step": 977 }, { "epoch": 2.0020470829068575, "grad_norm": 0.4211988149390974, "learning_rate": 2.995040177576745e-05, "loss": 0.3174, "step": 978 }, { "epoch": 2.0040941658137155, "grad_norm": 0.6108187632741519, "learning_rate": 2.992557209138249e-05, "loss": 0.3367, "step": 979 }, { "epoch": 2.006141248720573, "grad_norm": 0.38089369292387826, "learning_rate": 2.9900722093226737e-05, "loss": 0.3232, "step": 980 }, { "epoch": 2.008188331627431, "grad_norm": 0.47508227914348694, "learning_rate": 2.9875851832158428e-05, "loss": 0.3522, "step": 981 }, { "epoch": 2.0102354145342884, "grad_norm": 0.4326113199153507, "learning_rate": 2.9850961359077293e-05, "loss": 0.2981, "step": 982 }, { "epoch": 2.0122824974411464, "grad_norm": 0.42932267247803857, "learning_rate": 2.98260507249244e-05, "loss": 0.3202, "step": 983 }, { "epoch": 2.014329580348004, "grad_norm": 0.3951716605003508, "learning_rate": 2.9801119980682095e-05, "loss": 0.3509, "step": 984 }, { "epoch": 2.016376663254862, "grad_norm": 0.35789828953920616, "learning_rate": 2.977616917737388e-05, "loss": 0.3281, "step": 985 }, { "epoch": 2.0184237461617194, "grad_norm": 0.3460413679418126, "learning_rate": 2.9751198366064304e-05, "loss": 0.3161, "step": 986 }, { "epoch": 2.0204708290685773, "grad_norm": 0.3703844711305289, "learning_rate": 2.9726207597858872e-05, "loss": 0.3251, "step": 987 }, { "epoch": 2.022517911975435, "grad_norm": 0.39238883821417286, "learning_rate": 2.9701196923903927e-05, "loss": 0.3391, "step": 988 }, { "epoch": 2.024564994882293, "grad_norm": 0.34303403649979675, "learning_rate": 2.9676166395386553e-05, "loss": 0.361, "step": 989 }, { "epoch": 2.0266120777891503, "grad_norm": 0.38788579378291205, "learning_rate": 2.965111606353447e-05, "loss": 0.312, "step": 990 }, { "epoch": 2.0286591606960083, "grad_norm": 0.4013869555066414, "learning_rate": 2.9626045979615928e-05, "loss": 0.3209, "step": 991 }, { "epoch": 2.030706243602866, "grad_norm": 0.37089151639897344, "learning_rate": 2.9600956194939598e-05, "loss": 0.3498, "step": 992 }, { "epoch": 2.0327533265097237, "grad_norm": 0.371332657755311, "learning_rate": 2.957584676085447e-05, "loss": 0.3422, "step": 993 }, { "epoch": 2.0348004094165812, "grad_norm": 0.3762690812523185, "learning_rate": 2.9550717728749768e-05, "loss": 0.3199, "step": 994 }, { "epoch": 2.036847492323439, "grad_norm": 0.3431217144511242, "learning_rate": 2.9525569150054796e-05, "loss": 0.2871, "step": 995 }, { "epoch": 2.0388945752302967, "grad_norm": 0.42824685102967736, "learning_rate": 2.950040107623887e-05, "loss": 0.3268, "step": 996 }, { "epoch": 2.0409416581371547, "grad_norm": 0.3459406390165573, "learning_rate": 2.947521355881122e-05, "loss": 0.3268, "step": 997 }, { "epoch": 2.042988741044012, "grad_norm": 0.3484085391017404, "learning_rate": 2.9450006649320862e-05, "loss": 0.325, "step": 998 }, { "epoch": 2.04503582395087, "grad_norm": 0.3537579300316069, "learning_rate": 2.9424780399356497e-05, "loss": 0.3144, "step": 999 }, { "epoch": 2.0470829068577276, "grad_norm": 0.3540499619721327, "learning_rate": 2.9399534860546404e-05, "loss": 0.3324, "step": 1000 }, { "epoch": 2.0491299897645856, "grad_norm": 0.3146082628297793, "learning_rate": 2.937427008455835e-05, "loss": 0.3458, "step": 1001 }, { "epoch": 2.051177072671443, "grad_norm": 0.41424472646515875, "learning_rate": 2.9348986123099462e-05, "loss": 0.3638, "step": 1002 }, { "epoch": 2.053224155578301, "grad_norm": 0.3640670691803175, "learning_rate": 2.932368302791614e-05, "loss": 0.3596, "step": 1003 }, { "epoch": 2.0552712384851586, "grad_norm": 0.38223463417283743, "learning_rate": 2.9298360850793944e-05, "loss": 0.3048, "step": 1004 }, { "epoch": 2.0573183213920165, "grad_norm": 0.35066959498537137, "learning_rate": 2.9273019643557474e-05, "loss": 0.3154, "step": 1005 }, { "epoch": 2.059365404298874, "grad_norm": 0.3912112568074411, "learning_rate": 2.92476594580703e-05, "loss": 0.3397, "step": 1006 }, { "epoch": 2.061412487205732, "grad_norm": 0.40688373618091567, "learning_rate": 2.9222280346234816e-05, "loss": 0.3315, "step": 1007 }, { "epoch": 2.0634595701125895, "grad_norm": 0.3408605767984647, "learning_rate": 2.919688235999215e-05, "loss": 0.3334, "step": 1008 }, { "epoch": 2.0655066530194475, "grad_norm": 0.35811936384341014, "learning_rate": 2.917146555132206e-05, "loss": 0.3386, "step": 1009 }, { "epoch": 2.067553735926305, "grad_norm": 0.3364760604746556, "learning_rate": 2.914602997224285e-05, "loss": 0.3199, "step": 1010 }, { "epoch": 2.069600818833163, "grad_norm": 0.4118145924928188, "learning_rate": 2.912057567481119e-05, "loss": 0.3507, "step": 1011 }, { "epoch": 2.0716479017400204, "grad_norm": 0.3612606023218023, "learning_rate": 2.909510271112212e-05, "loss": 0.3462, "step": 1012 }, { "epoch": 2.0736949846468784, "grad_norm": 0.3222144557445549, "learning_rate": 2.906961113330883e-05, "loss": 0.3271, "step": 1013 }, { "epoch": 2.075742067553736, "grad_norm": 0.4762234619066632, "learning_rate": 2.904410099354263e-05, "loss": 0.311, "step": 1014 }, { "epoch": 2.0777891504605934, "grad_norm": 0.3407988463597387, "learning_rate": 2.9018572344032823e-05, "loss": 0.3242, "step": 1015 }, { "epoch": 2.0798362333674514, "grad_norm": 0.38477863373629423, "learning_rate": 2.8993025237026578e-05, "loss": 0.3472, "step": 1016 }, { "epoch": 2.0818833162743093, "grad_norm": 0.4212662322514699, "learning_rate": 2.8967459724808856e-05, "loss": 0.3055, "step": 1017 }, { "epoch": 2.083930399181167, "grad_norm": 0.34644422983431267, "learning_rate": 2.8941875859702283e-05, "loss": 0.3099, "step": 1018 }, { "epoch": 2.0859774820880244, "grad_norm": 0.38660401266521843, "learning_rate": 2.891627369406703e-05, "loss": 0.3301, "step": 1019 }, { "epoch": 2.0880245649948823, "grad_norm": 0.33570902053280804, "learning_rate": 2.889065328030074e-05, "loss": 0.3559, "step": 1020 }, { "epoch": 2.09007164790174, "grad_norm": 0.3938573985227862, "learning_rate": 2.88650146708384e-05, "loss": 0.3301, "step": 1021 }, { "epoch": 2.092118730808598, "grad_norm": 0.37756710702648894, "learning_rate": 2.883935791815222e-05, "loss": 0.3413, "step": 1022 }, { "epoch": 2.0941658137154553, "grad_norm": 0.36055751260111174, "learning_rate": 2.8813683074751578e-05, "loss": 0.3469, "step": 1023 }, { "epoch": 2.0962128966223132, "grad_norm": 0.34337336038390814, "learning_rate": 2.878799019318283e-05, "loss": 0.3339, "step": 1024 }, { "epoch": 2.0982599795291708, "grad_norm": 0.37844750997650106, "learning_rate": 2.8762279326029293e-05, "loss": 0.3278, "step": 1025 }, { "epoch": 2.1003070624360287, "grad_norm": 0.3641190698552649, "learning_rate": 2.8736550525911066e-05, "loss": 0.3548, "step": 1026 }, { "epoch": 2.1023541453428862, "grad_norm": 0.36462562710150886, "learning_rate": 2.8710803845484955e-05, "loss": 0.321, "step": 1027 }, { "epoch": 2.104401228249744, "grad_norm": 0.3706428526021544, "learning_rate": 2.8685039337444368e-05, "loss": 0.3174, "step": 1028 }, { "epoch": 2.1064483111566017, "grad_norm": 0.3276491863527004, "learning_rate": 2.8659257054519182e-05, "loss": 0.3046, "step": 1029 }, { "epoch": 2.1084953940634596, "grad_norm": 0.3247212720041268, "learning_rate": 2.8633457049475678e-05, "loss": 0.3526, "step": 1030 }, { "epoch": 2.110542476970317, "grad_norm": 0.3326121138866842, "learning_rate": 2.8607639375116388e-05, "loss": 0.3179, "step": 1031 }, { "epoch": 2.112589559877175, "grad_norm": 0.36702351353785473, "learning_rate": 2.858180408428001e-05, "loss": 0.3393, "step": 1032 }, { "epoch": 2.1146366427840326, "grad_norm": 0.31780331970447684, "learning_rate": 2.855595122984129e-05, "loss": 0.3248, "step": 1033 }, { "epoch": 2.1166837256908906, "grad_norm": 0.32446381302593813, "learning_rate": 2.853008086471094e-05, "loss": 0.3283, "step": 1034 }, { "epoch": 2.118730808597748, "grad_norm": 0.3428912531231067, "learning_rate": 2.8504193041835497e-05, "loss": 0.3048, "step": 1035 }, { "epoch": 2.120777891504606, "grad_norm": 0.4004823401099236, "learning_rate": 2.847828781419722e-05, "loss": 0.3667, "step": 1036 }, { "epoch": 2.1228249744114636, "grad_norm": 0.36467750504991164, "learning_rate": 2.8452365234813992e-05, "loss": 0.3601, "step": 1037 }, { "epoch": 2.1248720573183215, "grad_norm": 0.3853260717775913, "learning_rate": 2.842642535673922e-05, "loss": 0.3289, "step": 1038 }, { "epoch": 2.126919140225179, "grad_norm": 0.3693312153767782, "learning_rate": 2.8400468233061708e-05, "loss": 0.3147, "step": 1039 }, { "epoch": 2.128966223132037, "grad_norm": 0.38807123819436246, "learning_rate": 2.8374493916905544e-05, "loss": 0.3269, "step": 1040 }, { "epoch": 2.1310133060388945, "grad_norm": 0.3324036292794174, "learning_rate": 2.834850246143002e-05, "loss": 0.3076, "step": 1041 }, { "epoch": 2.1330603889457525, "grad_norm": 0.36633069839093463, "learning_rate": 2.832249391982949e-05, "loss": 0.3315, "step": 1042 }, { "epoch": 2.13510747185261, "grad_norm": 0.348911620524954, "learning_rate": 2.8296468345333298e-05, "loss": 0.2945, "step": 1043 }, { "epoch": 2.137154554759468, "grad_norm": 0.4313978670199707, "learning_rate": 2.827042579120562e-05, "loss": 0.3556, "step": 1044 }, { "epoch": 2.1392016376663254, "grad_norm": 0.33673070921204956, "learning_rate": 2.8244366310745398e-05, "loss": 0.3301, "step": 1045 }, { "epoch": 2.1412487205731834, "grad_norm": 0.4048102626926484, "learning_rate": 2.8218289957286226e-05, "loss": 0.3672, "step": 1046 }, { "epoch": 2.143295803480041, "grad_norm": 0.41846672934257156, "learning_rate": 2.8192196784196198e-05, "loss": 0.3148, "step": 1047 }, { "epoch": 2.145342886386899, "grad_norm": 0.3744830737938391, "learning_rate": 2.816608684487787e-05, "loss": 0.3252, "step": 1048 }, { "epoch": 2.1473899692937564, "grad_norm": 0.40525534049659034, "learning_rate": 2.813996019276809e-05, "loss": 0.3354, "step": 1049 }, { "epoch": 2.1494370522006143, "grad_norm": 0.3733330999629505, "learning_rate": 2.8113816881337902e-05, "loss": 0.3146, "step": 1050 }, { "epoch": 2.151484135107472, "grad_norm": 0.39634148645678874, "learning_rate": 2.8087656964092472e-05, "loss": 0.3041, "step": 1051 }, { "epoch": 2.15353121801433, "grad_norm": 0.3852615546043453, "learning_rate": 2.806148049457093e-05, "loss": 0.3639, "step": 1052 }, { "epoch": 2.1555783009211873, "grad_norm": 0.3896444126363705, "learning_rate": 2.803528752634629e-05, "loss": 0.3317, "step": 1053 }, { "epoch": 2.1576253838280453, "grad_norm": 0.38474286304985633, "learning_rate": 2.8009078113025335e-05, "loss": 0.3363, "step": 1054 }, { "epoch": 2.1596724667349028, "grad_norm": 0.36285165391849056, "learning_rate": 2.798285230824849e-05, "loss": 0.3088, "step": 1055 }, { "epoch": 2.1617195496417603, "grad_norm": 0.39881869082842497, "learning_rate": 2.795661016568975e-05, "loss": 0.3472, "step": 1056 }, { "epoch": 2.1637666325486182, "grad_norm": 0.36428100096066146, "learning_rate": 2.7930351739056533e-05, "loss": 0.347, "step": 1057 }, { "epoch": 2.1658137154554757, "grad_norm": 0.4335731332830444, "learning_rate": 2.7904077082089574e-05, "loss": 0.325, "step": 1058 }, { "epoch": 2.1678607983623337, "grad_norm": 0.3984251614786555, "learning_rate": 2.787778624856286e-05, "loss": 0.3066, "step": 1059 }, { "epoch": 2.169907881269191, "grad_norm": 0.42180327285845043, "learning_rate": 2.7851479292283442e-05, "loss": 0.3415, "step": 1060 }, { "epoch": 2.171954964176049, "grad_norm": 0.41842751411141604, "learning_rate": 2.782515626709139e-05, "loss": 0.3498, "step": 1061 }, { "epoch": 2.1740020470829067, "grad_norm": 0.3982687240967601, "learning_rate": 2.7798817226859678e-05, "loss": 0.3311, "step": 1062 }, { "epoch": 2.1760491299897646, "grad_norm": 0.3653496091806209, "learning_rate": 2.7772462225494013e-05, "loss": 0.3393, "step": 1063 }, { "epoch": 2.178096212896622, "grad_norm": 0.35694917670185916, "learning_rate": 2.7746091316932807e-05, "loss": 0.2938, "step": 1064 }, { "epoch": 2.18014329580348, "grad_norm": 0.36076433028006755, "learning_rate": 2.7719704555147012e-05, "loss": 0.3176, "step": 1065 }, { "epoch": 2.1821903787103376, "grad_norm": 0.35991273790931844, "learning_rate": 2.7693301994140026e-05, "loss": 0.3369, "step": 1066 }, { "epoch": 2.1842374616171956, "grad_norm": 0.3605864665846731, "learning_rate": 2.7666883687947588e-05, "loss": 0.308, "step": 1067 }, { "epoch": 2.186284544524053, "grad_norm": 0.3868682601764087, "learning_rate": 2.7640449690637642e-05, "loss": 0.335, "step": 1068 }, { "epoch": 2.188331627430911, "grad_norm": 0.37080898315589017, "learning_rate": 2.761400005631028e-05, "loss": 0.3339, "step": 1069 }, { "epoch": 2.1903787103377685, "grad_norm": 0.35560785487130986, "learning_rate": 2.7587534839097556e-05, "loss": 0.3348, "step": 1070 }, { "epoch": 2.1924257932446265, "grad_norm": 0.36780049563494116, "learning_rate": 2.756105409316345e-05, "loss": 0.3042, "step": 1071 }, { "epoch": 2.194472876151484, "grad_norm": 0.3750046262445992, "learning_rate": 2.7534557872703705e-05, "loss": 0.3286, "step": 1072 }, { "epoch": 2.196519959058342, "grad_norm": 0.3643852622155405, "learning_rate": 2.750804623194574e-05, "loss": 0.3202, "step": 1073 }, { "epoch": 2.1985670419651995, "grad_norm": 0.35562711733903674, "learning_rate": 2.7481519225148537e-05, "loss": 0.3158, "step": 1074 }, { "epoch": 2.2006141248720574, "grad_norm": 0.34699724820587735, "learning_rate": 2.7454976906602513e-05, "loss": 0.3635, "step": 1075 }, { "epoch": 2.202661207778915, "grad_norm": 0.3632869672814209, "learning_rate": 2.742841933062944e-05, "loss": 0.3378, "step": 1076 }, { "epoch": 2.204708290685773, "grad_norm": 0.35456528308188723, "learning_rate": 2.7401846551582304e-05, "loss": 0.3078, "step": 1077 }, { "epoch": 2.2067553735926304, "grad_norm": 0.4081395245932041, "learning_rate": 2.7375258623845207e-05, "loss": 0.3429, "step": 1078 }, { "epoch": 2.2088024564994884, "grad_norm": 0.34135550119349445, "learning_rate": 2.7348655601833255e-05, "loss": 0.3338, "step": 1079 }, { "epoch": 2.210849539406346, "grad_norm": 0.42118938931480804, "learning_rate": 2.7322037539992457e-05, "loss": 0.3327, "step": 1080 }, { "epoch": 2.212896622313204, "grad_norm": 0.37577320629275823, "learning_rate": 2.7295404492799575e-05, "loss": 0.3002, "step": 1081 }, { "epoch": 2.2149437052200613, "grad_norm": 0.3303091254072807, "learning_rate": 2.726875651476207e-05, "loss": 0.3106, "step": 1082 }, { "epoch": 2.2169907881269193, "grad_norm": 0.35777928959386923, "learning_rate": 2.7242093660417954e-05, "loss": 0.364, "step": 1083 }, { "epoch": 2.219037871033777, "grad_norm": 0.3963714636770776, "learning_rate": 2.721541598433567e-05, "loss": 0.2969, "step": 1084 }, { "epoch": 2.2210849539406348, "grad_norm": 0.3409183879753277, "learning_rate": 2.718872354111401e-05, "loss": 0.3346, "step": 1085 }, { "epoch": 2.2231320368474923, "grad_norm": 0.35338149712684697, "learning_rate": 2.7162016385381975e-05, "loss": 0.3648, "step": 1086 }, { "epoch": 2.2251791197543502, "grad_norm": 0.3315730951687613, "learning_rate": 2.7135294571798706e-05, "loss": 0.3063, "step": 1087 }, { "epoch": 2.2272262026612077, "grad_norm": 0.3449763315274245, "learning_rate": 2.7108558155053296e-05, "loss": 0.3403, "step": 1088 }, { "epoch": 2.2292732855680657, "grad_norm": 0.37034261955083203, "learning_rate": 2.7081807189864764e-05, "loss": 0.3583, "step": 1089 }, { "epoch": 2.231320368474923, "grad_norm": 0.3621281223181069, "learning_rate": 2.70550417309819e-05, "loss": 0.3144, "step": 1090 }, { "epoch": 2.233367451381781, "grad_norm": 0.39746428647523574, "learning_rate": 2.7028261833183132e-05, "loss": 0.3376, "step": 1091 }, { "epoch": 2.2354145342886387, "grad_norm": 0.34536466760948237, "learning_rate": 2.7001467551276464e-05, "loss": 0.2973, "step": 1092 }, { "epoch": 2.237461617195496, "grad_norm": 0.3255654437979655, "learning_rate": 2.6974658940099337e-05, "loss": 0.3222, "step": 1093 }, { "epoch": 2.239508700102354, "grad_norm": 0.34570256484501904, "learning_rate": 2.6947836054518484e-05, "loss": 0.3585, "step": 1094 }, { "epoch": 2.241555783009212, "grad_norm": 0.3325137281496525, "learning_rate": 2.6920998949429913e-05, "loss": 0.309, "step": 1095 }, { "epoch": 2.2436028659160696, "grad_norm": 0.34364314604723273, "learning_rate": 2.6894147679758678e-05, "loss": 0.332, "step": 1096 }, { "epoch": 2.245649948822927, "grad_norm": 0.3323431139976392, "learning_rate": 2.6867282300458853e-05, "loss": 0.3365, "step": 1097 }, { "epoch": 2.247697031729785, "grad_norm": 0.36326221718241064, "learning_rate": 2.684040286651338e-05, "loss": 0.3361, "step": 1098 }, { "epoch": 2.2497441146366426, "grad_norm": 0.3309544898142775, "learning_rate": 2.6813509432933957e-05, "loss": 0.3142, "step": 1099 }, { "epoch": 2.2517911975435005, "grad_norm": 0.3686682735992276, "learning_rate": 2.6786602054760952e-05, "loss": 0.3078, "step": 1100 }, { "epoch": 2.253838280450358, "grad_norm": 0.3515712454533351, "learning_rate": 2.675968078706326e-05, "loss": 0.3151, "step": 1101 }, { "epoch": 2.255885363357216, "grad_norm": 0.3495717010136385, "learning_rate": 2.673274568493821e-05, "loss": 0.3243, "step": 1102 }, { "epoch": 2.2579324462640735, "grad_norm": 0.34663490722766044, "learning_rate": 2.670579680351143e-05, "loss": 0.3284, "step": 1103 }, { "epoch": 2.2599795291709315, "grad_norm": 0.34744444318737083, "learning_rate": 2.667883419793676e-05, "loss": 0.313, "step": 1104 }, { "epoch": 2.262026612077789, "grad_norm": 0.38465408810908674, "learning_rate": 2.6651857923396132e-05, "loss": 0.3759, "step": 1105 }, { "epoch": 2.264073694984647, "grad_norm": 0.3414863910860875, "learning_rate": 2.6624868035099445e-05, "loss": 0.3336, "step": 1106 }, { "epoch": 2.2661207778915045, "grad_norm": 0.33861934140219296, "learning_rate": 2.659786458828446e-05, "loss": 0.3587, "step": 1107 }, { "epoch": 2.2681678607983624, "grad_norm": 0.32698315646381576, "learning_rate": 2.6570847638216698e-05, "loss": 0.3506, "step": 1108 }, { "epoch": 2.27021494370522, "grad_norm": 0.3303135891963801, "learning_rate": 2.65438172401893e-05, "loss": 0.3451, "step": 1109 }, { "epoch": 2.272262026612078, "grad_norm": 0.3769317246092125, "learning_rate": 2.6516773449522936e-05, "loss": 0.3257, "step": 1110 }, { "epoch": 2.2743091095189354, "grad_norm": 0.3511492114236981, "learning_rate": 2.648971632156569e-05, "loss": 0.3147, "step": 1111 }, { "epoch": 2.2763561924257933, "grad_norm": 0.3675620800860699, "learning_rate": 2.6462645911692938e-05, "loss": 0.2979, "step": 1112 }, { "epoch": 2.278403275332651, "grad_norm": 0.3792036243843211, "learning_rate": 2.643556227530724e-05, "loss": 0.3065, "step": 1113 }, { "epoch": 2.280450358239509, "grad_norm": 0.32407850095411667, "learning_rate": 2.6408465467838225e-05, "loss": 0.332, "step": 1114 }, { "epoch": 2.2824974411463663, "grad_norm": 0.31664427206527734, "learning_rate": 2.6381355544742482e-05, "loss": 0.3963, "step": 1115 }, { "epoch": 2.2845445240532243, "grad_norm": 0.34686617702001726, "learning_rate": 2.6354232561503433e-05, "loss": 0.3357, "step": 1116 }, { "epoch": 2.286591606960082, "grad_norm": 0.33621332629180944, "learning_rate": 2.632709657363124e-05, "loss": 0.3089, "step": 1117 }, { "epoch": 2.2886386898669397, "grad_norm": 0.33153383111460555, "learning_rate": 2.6299947636662673e-05, "loss": 0.3054, "step": 1118 }, { "epoch": 2.2906857727737973, "grad_norm": 0.3416358925904056, "learning_rate": 2.6272785806161005e-05, "loss": 0.3278, "step": 1119 }, { "epoch": 2.292732855680655, "grad_norm": 0.40808146160514075, "learning_rate": 2.6245611137715897e-05, "loss": 0.3519, "step": 1120 }, { "epoch": 2.2947799385875127, "grad_norm": 0.32524026497753233, "learning_rate": 2.621842368694329e-05, "loss": 0.3239, "step": 1121 }, { "epoch": 2.2968270214943707, "grad_norm": 0.3658777033819554, "learning_rate": 2.6191223509485273e-05, "loss": 0.3286, "step": 1122 }, { "epoch": 2.298874104401228, "grad_norm": 0.36322923087428066, "learning_rate": 2.6164010661010007e-05, "loss": 0.3364, "step": 1123 }, { "epoch": 2.300921187308086, "grad_norm": 0.3278681613056945, "learning_rate": 2.613678519721155e-05, "loss": 0.3086, "step": 1124 }, { "epoch": 2.3029682702149437, "grad_norm": 0.3547085800078984, "learning_rate": 2.61095471738098e-05, "loss": 0.3215, "step": 1125 }, { "epoch": 2.3050153531218016, "grad_norm": 0.31393691776472127, "learning_rate": 2.6082296646550364e-05, "loss": 0.3114, "step": 1126 }, { "epoch": 2.307062436028659, "grad_norm": 0.35347094972995313, "learning_rate": 2.605503367120442e-05, "loss": 0.3621, "step": 1127 }, { "epoch": 2.309109518935517, "grad_norm": 0.3314912682101731, "learning_rate": 2.6027758303568643e-05, "loss": 0.3196, "step": 1128 }, { "epoch": 2.3111566018423746, "grad_norm": 0.30778697872934085, "learning_rate": 2.6000470599465065e-05, "loss": 0.3068, "step": 1129 }, { "epoch": 2.313203684749232, "grad_norm": 0.37088270704077186, "learning_rate": 2.5973170614740946e-05, "loss": 0.373, "step": 1130 }, { "epoch": 2.31525076765609, "grad_norm": 0.34268176472285344, "learning_rate": 2.5945858405268714e-05, "loss": 0.3413, "step": 1131 }, { "epoch": 2.317297850562948, "grad_norm": 0.3248279403276539, "learning_rate": 2.5918534026945787e-05, "loss": 0.3559, "step": 1132 }, { "epoch": 2.3193449334698055, "grad_norm": 0.30464581495860377, "learning_rate": 2.5891197535694507e-05, "loss": 0.3367, "step": 1133 }, { "epoch": 2.321392016376663, "grad_norm": 0.35318148922449216, "learning_rate": 2.5863848987461993e-05, "loss": 0.3529, "step": 1134 }, { "epoch": 2.323439099283521, "grad_norm": 0.37807352066757405, "learning_rate": 2.5836488438220044e-05, "loss": 0.3347, "step": 1135 }, { "epoch": 2.325486182190379, "grad_norm": 0.3608071303513835, "learning_rate": 2.5809115943965027e-05, "loss": 0.3366, "step": 1136 }, { "epoch": 2.3275332650972365, "grad_norm": 0.3499879431406946, "learning_rate": 2.5781731560717745e-05, "loss": 0.3106, "step": 1137 }, { "epoch": 2.329580348004094, "grad_norm": 0.3591909142140177, "learning_rate": 2.575433534452334e-05, "loss": 0.3396, "step": 1138 }, { "epoch": 2.331627430910952, "grad_norm": 0.33212351658248346, "learning_rate": 2.5726927351451178e-05, "loss": 0.3439, "step": 1139 }, { "epoch": 2.3336745138178094, "grad_norm": 0.36284882910248367, "learning_rate": 2.5699507637594706e-05, "loss": 0.304, "step": 1140 }, { "epoch": 2.3357215967246674, "grad_norm": 0.34302795456823765, "learning_rate": 2.5672076259071385e-05, "loss": 0.3276, "step": 1141 }, { "epoch": 2.337768679631525, "grad_norm": 0.36338586044821625, "learning_rate": 2.5644633272022536e-05, "loss": 0.36, "step": 1142 }, { "epoch": 2.339815762538383, "grad_norm": 0.3428090299047305, "learning_rate": 2.561717873261323e-05, "loss": 0.3062, "step": 1143 }, { "epoch": 2.3418628454452404, "grad_norm": 0.38038284417246715, "learning_rate": 2.558971269703219e-05, "loss": 0.3555, "step": 1144 }, { "epoch": 2.3439099283520983, "grad_norm": 0.35045801763083695, "learning_rate": 2.556223522149168e-05, "loss": 0.3433, "step": 1145 }, { "epoch": 2.345957011258956, "grad_norm": 0.36682906880387844, "learning_rate": 2.5534746362227355e-05, "loss": 0.3507, "step": 1146 }, { "epoch": 2.348004094165814, "grad_norm": 0.34210491558572537, "learning_rate": 2.5507246175498174e-05, "loss": 0.3178, "step": 1147 }, { "epoch": 2.3500511770726713, "grad_norm": 0.3421821678613336, "learning_rate": 2.5479734717586285e-05, "loss": 0.3124, "step": 1148 }, { "epoch": 2.3520982599795293, "grad_norm": 0.37370205112447163, "learning_rate": 2.5452212044796912e-05, "loss": 0.3179, "step": 1149 }, { "epoch": 2.3541453428863868, "grad_norm": 0.3575054301368409, "learning_rate": 2.5424678213458202e-05, "loss": 0.2982, "step": 1150 }, { "epoch": 2.3561924257932447, "grad_norm": 0.4005011767035321, "learning_rate": 2.539713327992117e-05, "loss": 0.329, "step": 1151 }, { "epoch": 2.3582395087001022, "grad_norm": 0.3796694210246127, "learning_rate": 2.5369577300559544e-05, "loss": 0.3495, "step": 1152 }, { "epoch": 2.36028659160696, "grad_norm": 0.38531974828237286, "learning_rate": 2.5342010331769635e-05, "loss": 0.3218, "step": 1153 }, { "epoch": 2.3623336745138177, "grad_norm": 0.4150417947081143, "learning_rate": 2.531443242997029e-05, "loss": 0.3714, "step": 1154 }, { "epoch": 2.3643807574206757, "grad_norm": 0.3665969184427945, "learning_rate": 2.5286843651602688e-05, "loss": 0.33, "step": 1155 }, { "epoch": 2.366427840327533, "grad_norm": 0.3668439390982461, "learning_rate": 2.5259244053130295e-05, "loss": 0.3338, "step": 1156 }, { "epoch": 2.368474923234391, "grad_norm": 0.4105022406361347, "learning_rate": 2.5231633691038716e-05, "loss": 0.3303, "step": 1157 }, { "epoch": 2.3705220061412486, "grad_norm": 0.3610884307782202, "learning_rate": 2.5204012621835575e-05, "loss": 0.3108, "step": 1158 }, { "epoch": 2.3725690890481066, "grad_norm": 0.39634461411533756, "learning_rate": 2.5176380902050418e-05, "loss": 0.3398, "step": 1159 }, { "epoch": 2.374616171954964, "grad_norm": 0.3956297902185592, "learning_rate": 2.5148738588234593e-05, "loss": 0.3199, "step": 1160 }, { "epoch": 2.376663254861822, "grad_norm": 0.33517128768421744, "learning_rate": 2.5121085736961112e-05, "loss": 0.3288, "step": 1161 }, { "epoch": 2.3787103377686796, "grad_norm": 0.3543225737911076, "learning_rate": 2.5093422404824574e-05, "loss": 0.3483, "step": 1162 }, { "epoch": 2.3807574206755375, "grad_norm": 0.35329611481723555, "learning_rate": 2.506574864844102e-05, "loss": 0.3439, "step": 1163 }, { "epoch": 2.382804503582395, "grad_norm": 0.3415016772188138, "learning_rate": 2.5038064524447827e-05, "loss": 0.3461, "step": 1164 }, { "epoch": 2.384851586489253, "grad_norm": 0.3618115220444992, "learning_rate": 2.5010370089503578e-05, "loss": 0.3243, "step": 1165 }, { "epoch": 2.3868986693961105, "grad_norm": 0.36205472648695425, "learning_rate": 2.4982665400287972e-05, "loss": 0.3411, "step": 1166 }, { "epoch": 2.3889457523029685, "grad_norm": 0.43026672506995006, "learning_rate": 2.4954950513501697e-05, "loss": 0.3319, "step": 1167 }, { "epoch": 2.390992835209826, "grad_norm": 0.350243563017003, "learning_rate": 2.4927225485866297e-05, "loss": 0.3479, "step": 1168 }, { "epoch": 2.393039918116684, "grad_norm": 0.3746353739097793, "learning_rate": 2.4899490374124085e-05, "loss": 0.3429, "step": 1169 }, { "epoch": 2.3950870010235414, "grad_norm": 0.3260898872775567, "learning_rate": 2.4871745235038006e-05, "loss": 0.3472, "step": 1170 }, { "epoch": 2.397134083930399, "grad_norm": 0.33558457363741073, "learning_rate": 2.4843990125391516e-05, "loss": 0.328, "step": 1171 }, { "epoch": 2.399181166837257, "grad_norm": 0.38996942202586005, "learning_rate": 2.4816225101988506e-05, "loss": 0.3391, "step": 1172 }, { "epoch": 2.401228249744115, "grad_norm": 0.3566387434855101, "learning_rate": 2.478845022165313e-05, "loss": 0.3667, "step": 1173 }, { "epoch": 2.4032753326509724, "grad_norm": 0.3444970227104489, "learning_rate": 2.4760665541229712e-05, "loss": 0.3301, "step": 1174 }, { "epoch": 2.40532241555783, "grad_norm": 0.3092553635632143, "learning_rate": 2.473287111758267e-05, "loss": 0.3401, "step": 1175 }, { "epoch": 2.407369498464688, "grad_norm": 0.3394352190867357, "learning_rate": 2.470506700759631e-05, "loss": 0.3218, "step": 1176 }, { "epoch": 2.409416581371546, "grad_norm": 0.3393116279567721, "learning_rate": 2.467725326817481e-05, "loss": 0.3113, "step": 1177 }, { "epoch": 2.4114636642784033, "grad_norm": 0.399667522842199, "learning_rate": 2.464942995624203e-05, "loss": 0.3269, "step": 1178 }, { "epoch": 2.413510747185261, "grad_norm": 0.3505124387280659, "learning_rate": 2.462159712874142e-05, "loss": 0.3411, "step": 1179 }, { "epoch": 2.4155578300921188, "grad_norm": 0.3519791912196212, "learning_rate": 2.4593754842635917e-05, "loss": 0.3036, "step": 1180 }, { "epoch": 2.4176049129989763, "grad_norm": 0.36680184492885926, "learning_rate": 2.4565903154907807e-05, "loss": 0.3898, "step": 1181 }, { "epoch": 2.4196519959058342, "grad_norm": 0.3778349635443584, "learning_rate": 2.453804212255862e-05, "loss": 0.3427, "step": 1182 }, { "epoch": 2.4216990788126918, "grad_norm": 0.3389030241399457, "learning_rate": 2.451017180260902e-05, "loss": 0.3271, "step": 1183 }, { "epoch": 2.4237461617195497, "grad_norm": 0.42513595322376974, "learning_rate": 2.448229225209865e-05, "loss": 0.3298, "step": 1184 }, { "epoch": 2.425793244626407, "grad_norm": 0.3712368776548631, "learning_rate": 2.4454403528086088e-05, "loss": 0.3323, "step": 1185 }, { "epoch": 2.427840327533265, "grad_norm": 0.3200914239556184, "learning_rate": 2.4426505687648653e-05, "loss": 0.3387, "step": 1186 }, { "epoch": 2.4298874104401227, "grad_norm": 0.346315254552554, "learning_rate": 2.4398598787882334e-05, "loss": 0.3449, "step": 1187 }, { "epoch": 2.4319344933469806, "grad_norm": 0.3184464070741258, "learning_rate": 2.4370682885901657e-05, "loss": 0.3006, "step": 1188 }, { "epoch": 2.433981576253838, "grad_norm": 0.34720894372009287, "learning_rate": 2.4342758038839573e-05, "loss": 0.3354, "step": 1189 }, { "epoch": 2.436028659160696, "grad_norm": 0.3963098146391974, "learning_rate": 2.4314824303847342e-05, "loss": 0.3273, "step": 1190 }, { "epoch": 2.4380757420675536, "grad_norm": 0.3419114471543989, "learning_rate": 2.4286881738094418e-05, "loss": 0.3334, "step": 1191 }, { "epoch": 2.4401228249744116, "grad_norm": 0.3476204838650686, "learning_rate": 2.4258930398768317e-05, "loss": 0.3405, "step": 1192 }, { "epoch": 2.442169907881269, "grad_norm": 0.3398409708251329, "learning_rate": 2.423097034307452e-05, "loss": 0.2964, "step": 1193 }, { "epoch": 2.444216990788127, "grad_norm": 0.33597865186885006, "learning_rate": 2.4203001628236346e-05, "loss": 0.3122, "step": 1194 }, { "epoch": 2.4462640736949846, "grad_norm": 0.7573334739644073, "learning_rate": 2.4175024311494835e-05, "loss": 0.37, "step": 1195 }, { "epoch": 2.4483111566018425, "grad_norm": 0.3096397452376292, "learning_rate": 2.4147038450108627e-05, "loss": 0.3462, "step": 1196 }, { "epoch": 2.4503582395087, "grad_norm": 0.32363631370662416, "learning_rate": 2.4119044101353853e-05, "loss": 0.3089, "step": 1197 }, { "epoch": 2.452405322415558, "grad_norm": 0.44476946502851955, "learning_rate": 2.4091041322524023e-05, "loss": 0.3891, "step": 1198 }, { "epoch": 2.4544524053224155, "grad_norm": 0.34282272355962545, "learning_rate": 2.406303017092988e-05, "loss": 0.3672, "step": 1199 }, { "epoch": 2.4564994882292734, "grad_norm": 0.3458362218172388, "learning_rate": 2.403501070389932e-05, "loss": 0.3446, "step": 1200 }, { "epoch": 2.458546571136131, "grad_norm": 0.330522133423883, "learning_rate": 2.4006982978777263e-05, "loss": 0.3547, "step": 1201 }, { "epoch": 2.460593654042989, "grad_norm": 0.33086606944472513, "learning_rate": 2.39789470529255e-05, "loss": 0.3145, "step": 1202 }, { "epoch": 2.4626407369498464, "grad_norm": 0.3611588885012983, "learning_rate": 2.3950902983722645e-05, "loss": 0.3512, "step": 1203 }, { "epoch": 2.4646878198567044, "grad_norm": 0.35160175468657195, "learning_rate": 2.392285082856394e-05, "loss": 0.331, "step": 1204 }, { "epoch": 2.466734902763562, "grad_norm": 0.3329347867861998, "learning_rate": 2.389479064486121e-05, "loss": 0.3697, "step": 1205 }, { "epoch": 2.46878198567042, "grad_norm": 0.35641819087178006, "learning_rate": 2.3866722490042685e-05, "loss": 0.3606, "step": 1206 }, { "epoch": 2.4708290685772774, "grad_norm": 0.3604571656733162, "learning_rate": 2.3838646421552917e-05, "loss": 0.3377, "step": 1207 }, { "epoch": 2.472876151484135, "grad_norm": 0.3352126815168747, "learning_rate": 2.3810562496852666e-05, "loss": 0.3262, "step": 1208 }, { "epoch": 2.474923234390993, "grad_norm": 0.35666331619036534, "learning_rate": 2.3782470773418756e-05, "loss": 0.339, "step": 1209 }, { "epoch": 2.4769703172978508, "grad_norm": 0.39672935648110513, "learning_rate": 2.3754371308743975e-05, "loss": 0.33, "step": 1210 }, { "epoch": 2.4790174002047083, "grad_norm": 0.3591230233614195, "learning_rate": 2.372626416033696e-05, "loss": 0.3209, "step": 1211 }, { "epoch": 2.481064483111566, "grad_norm": 0.34331298511904995, "learning_rate": 2.3698149385722067e-05, "loss": 0.3376, "step": 1212 }, { "epoch": 2.4831115660184238, "grad_norm": 0.3934047338427704, "learning_rate": 2.367002704243927e-05, "loss": 0.3198, "step": 1213 }, { "epoch": 2.4851586489252817, "grad_norm": 0.34587155423784893, "learning_rate": 2.3641897188044018e-05, "loss": 0.3442, "step": 1214 }, { "epoch": 2.487205731832139, "grad_norm": 0.3681467355523078, "learning_rate": 2.3613759880107133e-05, "loss": 0.343, "step": 1215 }, { "epoch": 2.4892528147389967, "grad_norm": 0.3249289712237619, "learning_rate": 2.3585615176214716e-05, "loss": 0.3066, "step": 1216 }, { "epoch": 2.4912998976458547, "grad_norm": 0.37477832639872893, "learning_rate": 2.3557463133967976e-05, "loss": 0.3325, "step": 1217 }, { "epoch": 2.493346980552712, "grad_norm": 0.36178361521501945, "learning_rate": 2.3529303810983154e-05, "loss": 0.3127, "step": 1218 }, { "epoch": 2.49539406345957, "grad_norm": 0.32831527326831483, "learning_rate": 2.3501137264891396e-05, "loss": 0.3248, "step": 1219 }, { "epoch": 2.4974411463664277, "grad_norm": 0.4027644606883135, "learning_rate": 2.3472963553338614e-05, "loss": 0.3023, "step": 1220 }, { "epoch": 2.4994882292732856, "grad_norm": 0.3439537192088776, "learning_rate": 2.3444782733985396e-05, "loss": 0.3252, "step": 1221 }, { "epoch": 2.501535312180143, "grad_norm": 0.32953719599686015, "learning_rate": 2.3416594864506887e-05, "loss": 0.3043, "step": 1222 }, { "epoch": 2.503582395087001, "grad_norm": 0.3490669906957581, "learning_rate": 2.338840000259264e-05, "loss": 0.3133, "step": 1223 }, { "epoch": 2.5056294779938586, "grad_norm": 0.36299216934576684, "learning_rate": 2.3360198205946542e-05, "loss": 0.3141, "step": 1224 }, { "epoch": 2.5076765609007166, "grad_norm": 0.3180197591727672, "learning_rate": 2.333198953228664e-05, "loss": 0.3535, "step": 1225 }, { "epoch": 2.509723643807574, "grad_norm": 0.5171123946314492, "learning_rate": 2.3303774039345098e-05, "loss": 0.3367, "step": 1226 }, { "epoch": 2.511770726714432, "grad_norm": 0.3840513193856834, "learning_rate": 2.3275551784867997e-05, "loss": 0.3188, "step": 1227 }, { "epoch": 2.5138178096212895, "grad_norm": 0.3346463848714001, "learning_rate": 2.3247322826615276e-05, "loss": 0.3596, "step": 1228 }, { "epoch": 2.5158648925281475, "grad_norm": 0.3505848734275376, "learning_rate": 2.3219087222360603e-05, "loss": 0.3395, "step": 1229 }, { "epoch": 2.517911975435005, "grad_norm": 0.3415846729855278, "learning_rate": 2.3190845029891218e-05, "loss": 0.3326, "step": 1230 }, { "epoch": 2.519959058341863, "grad_norm": 0.3273579902423157, "learning_rate": 2.316259630700787e-05, "loss": 0.3344, "step": 1231 }, { "epoch": 2.5220061412487205, "grad_norm": 0.3247531510829717, "learning_rate": 2.313434111152467e-05, "loss": 0.3346, "step": 1232 }, { "epoch": 2.5240532241555784, "grad_norm": 0.3503893886731608, "learning_rate": 2.310607950126896e-05, "loss": 0.3448, "step": 1233 }, { "epoch": 2.526100307062436, "grad_norm": 0.35713820818797615, "learning_rate": 2.307781153408124e-05, "loss": 0.3359, "step": 1234 }, { "epoch": 2.528147389969294, "grad_norm": 0.34251280908459203, "learning_rate": 2.3049537267814984e-05, "loss": 0.3431, "step": 1235 }, { "epoch": 2.5301944728761514, "grad_norm": 0.34661535982225733, "learning_rate": 2.3021256760336583e-05, "loss": 0.3604, "step": 1236 }, { "epoch": 2.5322415557830094, "grad_norm": 0.3630834914867728, "learning_rate": 2.2992970069525202e-05, "loss": 0.3472, "step": 1237 }, { "epoch": 2.534288638689867, "grad_norm": 0.318836844861738, "learning_rate": 2.296467725327264e-05, "loss": 0.3174, "step": 1238 }, { "epoch": 2.536335721596725, "grad_norm": 0.3365825133835808, "learning_rate": 2.293637836948325e-05, "loss": 0.3093, "step": 1239 }, { "epoch": 2.5383828045035823, "grad_norm": 0.35403182852053533, "learning_rate": 2.29080734760738e-05, "loss": 0.3382, "step": 1240 }, { "epoch": 2.54042988741044, "grad_norm": 0.3562520797769209, "learning_rate": 2.2879762630973355e-05, "loss": 0.3315, "step": 1241 }, { "epoch": 2.542476970317298, "grad_norm": 0.3188066365019869, "learning_rate": 2.285144589212316e-05, "loss": 0.3079, "step": 1242 }, { "epoch": 2.5445240532241558, "grad_norm": 0.3556195009937043, "learning_rate": 2.2823123317476522e-05, "loss": 0.3422, "step": 1243 }, { "epoch": 2.5465711361310133, "grad_norm": 0.3474070822067162, "learning_rate": 2.2794794964998705e-05, "loss": 0.3049, "step": 1244 }, { "epoch": 2.548618219037871, "grad_norm": 0.3464039443053579, "learning_rate": 2.276646089266677e-05, "loss": 0.2992, "step": 1245 }, { "epoch": 2.5506653019447287, "grad_norm": 0.3767709540930306, "learning_rate": 2.273812115846951e-05, "loss": 0.3726, "step": 1246 }, { "epoch": 2.5527123848515867, "grad_norm": 0.3206160362666913, "learning_rate": 2.2709775820407292e-05, "loss": 0.2909, "step": 1247 }, { "epoch": 2.554759467758444, "grad_norm": 0.3673509781890066, "learning_rate": 2.2681424936491954e-05, "loss": 0.3669, "step": 1248 }, { "epoch": 2.5568065506653017, "grad_norm": 0.3401943174394405, "learning_rate": 2.2653068564746692e-05, "loss": 0.3403, "step": 1249 }, { "epoch": 2.5588536335721597, "grad_norm": 0.3490447254588359, "learning_rate": 2.2624706763205935e-05, "loss": 0.3603, "step": 1250 }, { "epoch": 2.5609007164790176, "grad_norm": 0.3482076297056933, "learning_rate": 2.2596339589915197e-05, "loss": 0.3554, "step": 1251 }, { "epoch": 2.562947799385875, "grad_norm": 0.3115817757213185, "learning_rate": 2.2567967102931025e-05, "loss": 0.3136, "step": 1252 }, { "epoch": 2.5649948822927326, "grad_norm": 0.32599919557680007, "learning_rate": 2.2539589360320802e-05, "loss": 0.3256, "step": 1253 }, { "epoch": 2.5670419651995906, "grad_norm": 0.3039164892771023, "learning_rate": 2.2511206420162716e-05, "loss": 0.3414, "step": 1254 }, { "epoch": 2.5690890481064486, "grad_norm": 0.31157751749513196, "learning_rate": 2.2482818340545534e-05, "loss": 0.3102, "step": 1255 }, { "epoch": 2.571136131013306, "grad_norm": 0.33852273483094864, "learning_rate": 2.2454425179568594e-05, "loss": 0.3434, "step": 1256 }, { "epoch": 2.5731832139201636, "grad_norm": 0.30672315678878886, "learning_rate": 2.2426026995341602e-05, "loss": 0.318, "step": 1257 }, { "epoch": 2.5752302968270215, "grad_norm": 0.31206788352544473, "learning_rate": 2.2397623845984548e-05, "loss": 0.3749, "step": 1258 }, { "epoch": 2.5772773797338795, "grad_norm": 0.30841479866018223, "learning_rate": 2.2369215789627593e-05, "loss": 0.298, "step": 1259 }, { "epoch": 2.579324462640737, "grad_norm": 0.32527208718455825, "learning_rate": 2.234080288441095e-05, "loss": 0.3419, "step": 1260 }, { "epoch": 2.5813715455475945, "grad_norm": 0.3124189339174951, "learning_rate": 2.2312385188484718e-05, "loss": 0.3501, "step": 1261 }, { "epoch": 2.5834186284544525, "grad_norm": 0.3352181407147001, "learning_rate": 2.2283962760008845e-05, "loss": 0.339, "step": 1262 }, { "epoch": 2.58546571136131, "grad_norm": 0.3548208328356842, "learning_rate": 2.225553565715294e-05, "loss": 0.3334, "step": 1263 }, { "epoch": 2.587512794268168, "grad_norm": 0.32210075431253954, "learning_rate": 2.2227103938096176e-05, "loss": 0.3139, "step": 1264 }, { "epoch": 2.5895598771750254, "grad_norm": 0.3342742670935017, "learning_rate": 2.2198667661027193e-05, "loss": 0.3232, "step": 1265 }, { "epoch": 2.5916069600818834, "grad_norm": 0.3332141930024874, "learning_rate": 2.2170226884143942e-05, "loss": 0.3089, "step": 1266 }, { "epoch": 2.593654042988741, "grad_norm": 0.3259468299127148, "learning_rate": 2.2141781665653584e-05, "loss": 0.3069, "step": 1267 }, { "epoch": 2.595701125895599, "grad_norm": 0.3756435234426335, "learning_rate": 2.2113332063772387e-05, "loss": 0.3343, "step": 1268 }, { "epoch": 2.5977482088024564, "grad_norm": 0.38118415339444334, "learning_rate": 2.208487813672557e-05, "loss": 0.3246, "step": 1269 }, { "epoch": 2.5997952917093143, "grad_norm": 0.33475197046166133, "learning_rate": 2.205641994274721e-05, "loss": 0.3543, "step": 1270 }, { "epoch": 2.601842374616172, "grad_norm": 0.3609114565434103, "learning_rate": 2.2027957540080125e-05, "loss": 0.3412, "step": 1271 }, { "epoch": 2.60388945752303, "grad_norm": 0.35268666756991185, "learning_rate": 2.199949098697574e-05, "loss": 0.321, "step": 1272 }, { "epoch": 2.6059365404298873, "grad_norm": 0.3218634099552252, "learning_rate": 2.1971020341693973e-05, "loss": 0.321, "step": 1273 }, { "epoch": 2.6079836233367453, "grad_norm": 0.3886771490605891, "learning_rate": 2.1942545662503115e-05, "loss": 0.3366, "step": 1274 }, { "epoch": 2.610030706243603, "grad_norm": 0.3542254238953694, "learning_rate": 2.1914067007679733e-05, "loss": 0.346, "step": 1275 }, { "epoch": 2.6120777891504607, "grad_norm": 0.33442059420132036, "learning_rate": 2.188558443550849e-05, "loss": 0.3471, "step": 1276 }, { "epoch": 2.6141248720573182, "grad_norm": 0.31543081486502833, "learning_rate": 2.185709800428211e-05, "loss": 0.3523, "step": 1277 }, { "epoch": 2.616171954964176, "grad_norm": 0.32475295410492505, "learning_rate": 2.1828607772301187e-05, "loss": 0.3456, "step": 1278 }, { "epoch": 2.6182190378710337, "grad_norm": 0.35139374027023634, "learning_rate": 2.180011379787411e-05, "loss": 0.3309, "step": 1279 }, { "epoch": 2.6202661207778917, "grad_norm": 0.3057519944170325, "learning_rate": 2.1771616139316903e-05, "loss": 0.3351, "step": 1280 }, { "epoch": 2.622313203684749, "grad_norm": 0.363923325870162, "learning_rate": 2.174311485495317e-05, "loss": 0.3046, "step": 1281 }, { "epoch": 2.6243602865916067, "grad_norm": 0.3291114320223632, "learning_rate": 2.1714610003113887e-05, "loss": 0.303, "step": 1282 }, { "epoch": 2.6264073694984647, "grad_norm": 0.3289632021661567, "learning_rate": 2.168610164213738e-05, "loss": 0.3213, "step": 1283 }, { "epoch": 2.6284544524053226, "grad_norm": 0.33306777417969263, "learning_rate": 2.1657589830369113e-05, "loss": 0.351, "step": 1284 }, { "epoch": 2.63050153531218, "grad_norm": 0.3612382108372884, "learning_rate": 2.1629074626161647e-05, "loss": 0.3868, "step": 1285 }, { "epoch": 2.6325486182190376, "grad_norm": 0.3580608267992191, "learning_rate": 2.1600556087874472e-05, "loss": 0.3175, "step": 1286 }, { "epoch": 2.6345957011258956, "grad_norm": 0.31536303959925943, "learning_rate": 2.1572034273873893e-05, "loss": 0.3262, "step": 1287 }, { "epoch": 2.6366427840327535, "grad_norm": 0.3578677663211092, "learning_rate": 2.1543509242532932e-05, "loss": 0.3716, "step": 1288 }, { "epoch": 2.638689866939611, "grad_norm": 0.31607653936815944, "learning_rate": 2.1514981052231187e-05, "loss": 0.3166, "step": 1289 }, { "epoch": 2.6407369498464686, "grad_norm": 0.35206333188454375, "learning_rate": 2.1486449761354727e-05, "loss": 0.3315, "step": 1290 }, { "epoch": 2.6427840327533265, "grad_norm": 0.34365776192029646, "learning_rate": 2.145791542829597e-05, "loss": 0.3225, "step": 1291 }, { "epoch": 2.6448311156601845, "grad_norm": 0.40791136566579844, "learning_rate": 2.142937811145354e-05, "loss": 0.3839, "step": 1292 }, { "epoch": 2.646878198567042, "grad_norm": 0.3334669459335626, "learning_rate": 2.140083786923221e-05, "loss": 0.3277, "step": 1293 }, { "epoch": 2.6489252814738995, "grad_norm": 0.33721029481105136, "learning_rate": 2.1372294760042686e-05, "loss": 0.3396, "step": 1294 }, { "epoch": 2.6509723643807575, "grad_norm": 0.33022887238565724, "learning_rate": 2.1343748842301575e-05, "loss": 0.3199, "step": 1295 }, { "epoch": 2.6530194472876154, "grad_norm": 0.3578342406147216, "learning_rate": 2.1315200174431235e-05, "loss": 0.3264, "step": 1296 }, { "epoch": 2.655066530194473, "grad_norm": 0.2843605578164525, "learning_rate": 2.1286648814859636e-05, "loss": 0.3196, "step": 1297 }, { "epoch": 2.6571136131013304, "grad_norm": 0.3320141991189249, "learning_rate": 2.1258094822020263e-05, "loss": 0.3132, "step": 1298 }, { "epoch": 2.6591606960081884, "grad_norm": 0.37866233560469814, "learning_rate": 2.1229538254351995e-05, "loss": 0.3238, "step": 1299 }, { "epoch": 2.661207778915046, "grad_norm": 0.327131535806631, "learning_rate": 2.120097917029897e-05, "loss": 0.3843, "step": 1300 }, { "epoch": 2.663254861821904, "grad_norm": 0.3036315122393342, "learning_rate": 2.1172417628310487e-05, "loss": 0.3292, "step": 1301 }, { "epoch": 2.6653019447287614, "grad_norm": 0.3515024908888374, "learning_rate": 2.1143853686840874e-05, "loss": 0.3102, "step": 1302 }, { "epoch": 2.6673490276356193, "grad_norm": 0.34581222342499085, "learning_rate": 2.1115287404349357e-05, "loss": 0.3156, "step": 1303 }, { "epoch": 2.669396110542477, "grad_norm": 0.35858989267221897, "learning_rate": 2.1086718839299972e-05, "loss": 0.3461, "step": 1304 }, { "epoch": 2.671443193449335, "grad_norm": 0.3475194620828264, "learning_rate": 2.1058148050161412e-05, "loss": 0.3357, "step": 1305 }, { "epoch": 2.6734902763561923, "grad_norm": 0.3585594367622285, "learning_rate": 2.1029575095406933e-05, "loss": 0.3454, "step": 1306 }, { "epoch": 2.6755373592630503, "grad_norm": 0.3384161065196781, "learning_rate": 2.1001000033514215e-05, "loss": 0.3403, "step": 1307 }, { "epoch": 2.6775844421699078, "grad_norm": 0.33004176854360945, "learning_rate": 2.097242292296525e-05, "loss": 0.3643, "step": 1308 }, { "epoch": 2.6796315250767657, "grad_norm": 0.33026393179643815, "learning_rate": 2.0943843822246234e-05, "loss": 0.3224, "step": 1309 }, { "epoch": 2.6816786079836232, "grad_norm": 0.34661171805934476, "learning_rate": 2.0915262789847414e-05, "loss": 0.3368, "step": 1310 }, { "epoch": 2.683725690890481, "grad_norm": 0.3157977388993276, "learning_rate": 2.088667988426302e-05, "loss": 0.311, "step": 1311 }, { "epoch": 2.6857727737973387, "grad_norm": 0.32541710739849666, "learning_rate": 2.0858095163991094e-05, "loss": 0.3145, "step": 1312 }, { "epoch": 2.6878198567041967, "grad_norm": 0.3317095591959358, "learning_rate": 2.0829508687533387e-05, "loss": 0.343, "step": 1313 }, { "epoch": 2.689866939611054, "grad_norm": 0.33508091050613437, "learning_rate": 2.0800920513395276e-05, "loss": 0.3102, "step": 1314 }, { "epoch": 2.691914022517912, "grad_norm": 0.3092939306017529, "learning_rate": 2.077233070008557e-05, "loss": 0.3628, "step": 1315 }, { "epoch": 2.6939611054247696, "grad_norm": 0.3644518282154973, "learning_rate": 2.074373930611647e-05, "loss": 0.3611, "step": 1316 }, { "epoch": 2.6960081883316276, "grad_norm": 0.3268481968948613, "learning_rate": 2.0715146390003395e-05, "loss": 0.4382, "step": 1317 }, { "epoch": 2.698055271238485, "grad_norm": 0.32366874488936354, "learning_rate": 2.0686552010264872e-05, "loss": 0.3316, "step": 1318 }, { "epoch": 2.7001023541453426, "grad_norm": 0.3655397054326707, "learning_rate": 2.0657956225422438e-05, "loss": 0.3241, "step": 1319 }, { "epoch": 2.7021494370522006, "grad_norm": 0.3511986539192374, "learning_rate": 2.0629359094000502e-05, "loss": 0.3634, "step": 1320 }, { "epoch": 2.7041965199590585, "grad_norm": 0.33590806443682913, "learning_rate": 2.060076067452622e-05, "loss": 0.3434, "step": 1321 }, { "epoch": 2.706243602865916, "grad_norm": 0.34940605685458204, "learning_rate": 2.0572161025529396e-05, "loss": 0.3592, "step": 1322 }, { "epoch": 2.7082906857727735, "grad_norm": 0.33025728634487234, "learning_rate": 2.0543560205542338e-05, "loss": 0.3273, "step": 1323 }, { "epoch": 2.7103377686796315, "grad_norm": 0.3111565817907451, "learning_rate": 2.0514958273099778e-05, "loss": 0.3528, "step": 1324 }, { "epoch": 2.7123848515864895, "grad_norm": 0.34384380691800237, "learning_rate": 2.0486355286738675e-05, "loss": 0.3279, "step": 1325 }, { "epoch": 2.714431934493347, "grad_norm": 0.34483761426361903, "learning_rate": 2.0457751304998196e-05, "loss": 0.3154, "step": 1326 }, { "epoch": 2.7164790174002045, "grad_norm": 0.34739319472868174, "learning_rate": 2.042914638641952e-05, "loss": 0.3122, "step": 1327 }, { "epoch": 2.7185261003070624, "grad_norm": 0.34204787000879766, "learning_rate": 2.0400540589545738e-05, "loss": 0.2987, "step": 1328 }, { "epoch": 2.7205731832139204, "grad_norm": 0.31149494456320415, "learning_rate": 2.0371933972921756e-05, "loss": 0.3651, "step": 1329 }, { "epoch": 2.722620266120778, "grad_norm": 0.3206059172994117, "learning_rate": 2.0343326595094154e-05, "loss": 0.3056, "step": 1330 }, { "epoch": 2.7246673490276354, "grad_norm": 0.3460394891552501, "learning_rate": 2.031471851461105e-05, "loss": 0.3078, "step": 1331 }, { "epoch": 2.7267144319344934, "grad_norm": 0.31434160088392427, "learning_rate": 2.0286109790022023e-05, "loss": 0.3019, "step": 1332 }, { "epoch": 2.7287615148413513, "grad_norm": 0.30988899965990013, "learning_rate": 2.0257500479877965e-05, "loss": 0.3606, "step": 1333 }, { "epoch": 2.730808597748209, "grad_norm": 0.3038902310146715, "learning_rate": 2.0228890642730967e-05, "loss": 0.3188, "step": 1334 }, { "epoch": 2.7328556806550663, "grad_norm": 0.33615481049383383, "learning_rate": 2.020028033713418e-05, "loss": 0.3233, "step": 1335 }, { "epoch": 2.7349027635619243, "grad_norm": 0.3416566631514737, "learning_rate": 2.0171669621641743e-05, "loss": 0.3563, "step": 1336 }, { "epoch": 2.7369498464687823, "grad_norm": 0.33625693354138464, "learning_rate": 2.0143058554808622e-05, "loss": 0.3107, "step": 1337 }, { "epoch": 2.7389969293756398, "grad_norm": 0.32608262090854195, "learning_rate": 2.0114447195190486e-05, "loss": 0.3445, "step": 1338 }, { "epoch": 2.7410440122824973, "grad_norm": 0.3157455564515132, "learning_rate": 2.0085835601343627e-05, "loss": 0.3426, "step": 1339 }, { "epoch": 2.7430910951893552, "grad_norm": 0.3108979356348658, "learning_rate": 2.005722383182481e-05, "loss": 0.3216, "step": 1340 }, { "epoch": 2.7451381780962127, "grad_norm": 0.31525583618025826, "learning_rate": 2.002861194519114e-05, "loss": 0.3888, "step": 1341 }, { "epoch": 2.7471852610030707, "grad_norm": 0.35286260637825495, "learning_rate": 2e-05, "loss": 0.3448, "step": 1342 }, { "epoch": 2.749232343909928, "grad_norm": 0.33612012963227933, "learning_rate": 1.9971388054808863e-05, "loss": 0.3303, "step": 1343 }, { "epoch": 2.751279426816786, "grad_norm": 0.3293365486688113, "learning_rate": 1.99427761681752e-05, "loss": 0.3378, "step": 1344 }, { "epoch": 2.7533265097236437, "grad_norm": 0.3702026149273537, "learning_rate": 1.9914164398656383e-05, "loss": 0.3204, "step": 1345 }, { "epoch": 2.7553735926305016, "grad_norm": 0.3008607535549162, "learning_rate": 1.988555280480952e-05, "loss": 0.3245, "step": 1346 }, { "epoch": 2.757420675537359, "grad_norm": 0.3346226482861992, "learning_rate": 1.9856941445191388e-05, "loss": 0.2973, "step": 1347 }, { "epoch": 2.759467758444217, "grad_norm": 0.333107377703825, "learning_rate": 1.9828330378358264e-05, "loss": 0.3462, "step": 1348 }, { "epoch": 2.7615148413510746, "grad_norm": 0.33602464320692405, "learning_rate": 1.9799719662865828e-05, "loss": 0.3348, "step": 1349 }, { "epoch": 2.7635619242579326, "grad_norm": 0.3460744761311549, "learning_rate": 1.9771109357269047e-05, "loss": 0.3041, "step": 1350 }, { "epoch": 2.76560900716479, "grad_norm": 0.3398214327127035, "learning_rate": 1.974249952012204e-05, "loss": 0.3442, "step": 1351 }, { "epoch": 2.767656090071648, "grad_norm": 0.32323094867372437, "learning_rate": 1.9713890209977977e-05, "loss": 0.3133, "step": 1352 }, { "epoch": 2.7697031729785055, "grad_norm": 0.3439688117561741, "learning_rate": 1.9685281485388955e-05, "loss": 0.3091, "step": 1353 }, { "epoch": 2.7717502558853635, "grad_norm": 0.29746767666002316, "learning_rate": 1.9656673404905852e-05, "loss": 0.2957, "step": 1354 }, { "epoch": 2.773797338792221, "grad_norm": 0.32765073618687846, "learning_rate": 1.9628066027078247e-05, "loss": 0.3413, "step": 1355 }, { "epoch": 2.775844421699079, "grad_norm": 0.3473220628635939, "learning_rate": 1.9599459410454266e-05, "loss": 0.2997, "step": 1356 }, { "epoch": 2.7778915046059365, "grad_norm": 0.34784120186994494, "learning_rate": 1.957085361358049e-05, "loss": 0.3397, "step": 1357 }, { "epoch": 2.7799385875127944, "grad_norm": 0.32647960906928786, "learning_rate": 1.9542248695001808e-05, "loss": 0.3269, "step": 1358 }, { "epoch": 2.781985670419652, "grad_norm": 0.3386091012629272, "learning_rate": 1.9513644713261328e-05, "loss": 0.3398, "step": 1359 }, { "epoch": 2.7840327533265095, "grad_norm": 0.31112512902574707, "learning_rate": 1.9485041726900232e-05, "loss": 0.3247, "step": 1360 }, { "epoch": 2.7860798362333674, "grad_norm": 0.3267971495507797, "learning_rate": 1.9456439794457665e-05, "loss": 0.3699, "step": 1361 }, { "epoch": 2.7881269191402254, "grad_norm": 0.34461638842492454, "learning_rate": 1.942783897447061e-05, "loss": 0.3315, "step": 1362 }, { "epoch": 2.790174002047083, "grad_norm": 0.30853412340590924, "learning_rate": 1.939923932547379e-05, "loss": 0.3325, "step": 1363 }, { "epoch": 2.7922210849539404, "grad_norm": 0.32960376681405235, "learning_rate": 1.93706409059995e-05, "loss": 0.3048, "step": 1364 }, { "epoch": 2.7942681678607983, "grad_norm": 0.3216608733849923, "learning_rate": 1.9342043774577562e-05, "loss": 0.3478, "step": 1365 }, { "epoch": 2.7963152507676563, "grad_norm": 0.369356903841249, "learning_rate": 1.931344798973513e-05, "loss": 0.3428, "step": 1366 }, { "epoch": 2.798362333674514, "grad_norm": 0.3021279849899095, "learning_rate": 1.928485360999661e-05, "loss": 0.3641, "step": 1367 }, { "epoch": 2.8004094165813713, "grad_norm": 0.3573666945556479, "learning_rate": 1.9256260693883534e-05, "loss": 0.2993, "step": 1368 }, { "epoch": 2.8024564994882293, "grad_norm": 0.31061810018877584, "learning_rate": 1.922766929991443e-05, "loss": 0.3298, "step": 1369 }, { "epoch": 2.8045035823950872, "grad_norm": 0.32086151227350046, "learning_rate": 1.9199079486604727e-05, "loss": 0.293, "step": 1370 }, { "epoch": 2.8065506653019447, "grad_norm": 0.3339285911403012, "learning_rate": 1.9170491312466616e-05, "loss": 0.3239, "step": 1371 }, { "epoch": 2.8085977482088023, "grad_norm": 0.3273867205248011, "learning_rate": 1.914190483600891e-05, "loss": 0.3502, "step": 1372 }, { "epoch": 2.81064483111566, "grad_norm": 0.3707945367816578, "learning_rate": 1.9113320115736986e-05, "loss": 0.3357, "step": 1373 }, { "epoch": 2.812691914022518, "grad_norm": 0.34848249764459066, "learning_rate": 1.9084737210152593e-05, "loss": 0.3185, "step": 1374 }, { "epoch": 2.8147389969293757, "grad_norm": 0.32760861423269866, "learning_rate": 1.9056156177753776e-05, "loss": 0.3228, "step": 1375 }, { "epoch": 2.816786079836233, "grad_norm": 0.3391917065894771, "learning_rate": 1.902757707703475e-05, "loss": 0.3475, "step": 1376 }, { "epoch": 2.818833162743091, "grad_norm": 0.3257314300037398, "learning_rate": 1.899899996648579e-05, "loss": 0.3325, "step": 1377 }, { "epoch": 2.8208802456499487, "grad_norm": 0.3453324225261829, "learning_rate": 1.897042490459307e-05, "loss": 0.3301, "step": 1378 }, { "epoch": 2.8229273285568066, "grad_norm": 0.35448623951366837, "learning_rate": 1.8941851949838595e-05, "loss": 0.3261, "step": 1379 }, { "epoch": 2.824974411463664, "grad_norm": 0.322963697569429, "learning_rate": 1.8913281160700038e-05, "loss": 0.3602, "step": 1380 }, { "epoch": 2.827021494370522, "grad_norm": 0.36959087755198944, "learning_rate": 1.8884712595650653e-05, "loss": 0.3173, "step": 1381 }, { "epoch": 2.8290685772773796, "grad_norm": 0.3404440796977021, "learning_rate": 1.885614631315914e-05, "loss": 0.3549, "step": 1382 }, { "epoch": 2.8311156601842375, "grad_norm": 0.31042921976093146, "learning_rate": 1.8827582371689516e-05, "loss": 0.3202, "step": 1383 }, { "epoch": 2.833162743091095, "grad_norm": 0.3733330139685621, "learning_rate": 1.8799020829701036e-05, "loss": 0.3704, "step": 1384 }, { "epoch": 2.835209825997953, "grad_norm": 0.35311166747172257, "learning_rate": 1.8770461745648012e-05, "loss": 0.3159, "step": 1385 }, { "epoch": 2.8372569089048105, "grad_norm": 0.30009262162818595, "learning_rate": 1.8741905177979743e-05, "loss": 0.347, "step": 1386 }, { "epoch": 2.8393039918116685, "grad_norm": 0.3431225449790101, "learning_rate": 1.871335118514037e-05, "loss": 0.3259, "step": 1387 }, { "epoch": 2.841351074718526, "grad_norm": 0.3379879693204325, "learning_rate": 1.8684799825568775e-05, "loss": 0.3354, "step": 1388 }, { "epoch": 2.843398157625384, "grad_norm": 0.32284443139667096, "learning_rate": 1.8656251157698425e-05, "loss": 0.3332, "step": 1389 }, { "epoch": 2.8454452405322415, "grad_norm": 0.3366299858528211, "learning_rate": 1.862770523995732e-05, "loss": 0.3379, "step": 1390 }, { "epoch": 2.8474923234390994, "grad_norm": 0.3306626186125099, "learning_rate": 1.85991621307678e-05, "loss": 0.3304, "step": 1391 }, { "epoch": 2.849539406345957, "grad_norm": 0.3154237186664138, "learning_rate": 1.8570621888546464e-05, "loss": 0.3162, "step": 1392 }, { "epoch": 2.851586489252815, "grad_norm": 0.3070396971440444, "learning_rate": 1.854208457170404e-05, "loss": 0.3341, "step": 1393 }, { "epoch": 2.8536335721596724, "grad_norm": 0.31391792776294475, "learning_rate": 1.8513550238645283e-05, "loss": 0.3385, "step": 1394 }, { "epoch": 2.8556806550665303, "grad_norm": 0.3604543553956966, "learning_rate": 1.8485018947768817e-05, "loss": 0.3242, "step": 1395 }, { "epoch": 2.857727737973388, "grad_norm": 0.33119690178793554, "learning_rate": 1.8456490757467075e-05, "loss": 0.3172, "step": 1396 }, { "epoch": 2.859774820880246, "grad_norm": 0.3634627791673887, "learning_rate": 1.8427965726126114e-05, "loss": 0.3256, "step": 1397 }, { "epoch": 2.8618219037871033, "grad_norm": 0.30347193226396807, "learning_rate": 1.839944391212553e-05, "loss": 0.3375, "step": 1398 }, { "epoch": 2.8638689866939613, "grad_norm": 0.3273120420881456, "learning_rate": 1.8370925373838356e-05, "loss": 0.3388, "step": 1399 }, { "epoch": 2.865916069600819, "grad_norm": 0.31525363609472923, "learning_rate": 1.834241016963089e-05, "loss": 0.361, "step": 1400 }, { "epoch": 2.8679631525076763, "grad_norm": 0.32941174535440465, "learning_rate": 1.8313898357862623e-05, "loss": 0.3292, "step": 1401 }, { "epoch": 2.8700102354145343, "grad_norm": 0.33198094596750977, "learning_rate": 1.8285389996886113e-05, "loss": 0.3239, "step": 1402 }, { "epoch": 2.872057318321392, "grad_norm": 0.31365437902540855, "learning_rate": 1.8256885145046837e-05, "loss": 0.3442, "step": 1403 }, { "epoch": 2.8741044012282497, "grad_norm": 0.30554897801238856, "learning_rate": 1.82283838606831e-05, "loss": 0.3387, "step": 1404 }, { "epoch": 2.8761514841351072, "grad_norm": 0.32230864173441504, "learning_rate": 1.8199886202125897e-05, "loss": 0.364, "step": 1405 }, { "epoch": 2.878198567041965, "grad_norm": 0.3283203936597496, "learning_rate": 1.817139222769882e-05, "loss": 0.3135, "step": 1406 }, { "epoch": 2.880245649948823, "grad_norm": 0.3352622595322451, "learning_rate": 1.8142901995717894e-05, "loss": 0.3293, "step": 1407 }, { "epoch": 2.8822927328556807, "grad_norm": 0.3199512613271915, "learning_rate": 1.8114415564491513e-05, "loss": 0.3382, "step": 1408 }, { "epoch": 2.884339815762538, "grad_norm": 0.3243396061475919, "learning_rate": 1.8085932992320273e-05, "loss": 0.3346, "step": 1409 }, { "epoch": 2.886386898669396, "grad_norm": 0.3177758192621551, "learning_rate": 1.805745433749689e-05, "loss": 0.3193, "step": 1410 }, { "epoch": 2.888433981576254, "grad_norm": 0.36128950608945015, "learning_rate": 1.8028979658306033e-05, "loss": 0.3352, "step": 1411 }, { "epoch": 2.8904810644831116, "grad_norm": 0.34067398643062763, "learning_rate": 1.8000509013024266e-05, "loss": 0.3704, "step": 1412 }, { "epoch": 2.892528147389969, "grad_norm": 0.36758322105150537, "learning_rate": 1.7972042459919878e-05, "loss": 0.3548, "step": 1413 }, { "epoch": 2.894575230296827, "grad_norm": 0.3446360571011566, "learning_rate": 1.794358005725279e-05, "loss": 0.3379, "step": 1414 }, { "epoch": 2.896622313203685, "grad_norm": 0.33967697044665596, "learning_rate": 1.791512186327444e-05, "loss": 0.3685, "step": 1415 }, { "epoch": 2.8986693961105425, "grad_norm": 0.32938434012320356, "learning_rate": 1.7886667936227616e-05, "loss": 0.3224, "step": 1416 }, { "epoch": 2.9007164790174, "grad_norm": 0.3492413938668141, "learning_rate": 1.785821833434642e-05, "loss": 0.3234, "step": 1417 }, { "epoch": 2.902763561924258, "grad_norm": 0.3556331967459014, "learning_rate": 1.7829773115856065e-05, "loss": 0.305, "step": 1418 }, { "epoch": 2.9048106448311155, "grad_norm": 0.3322076227286167, "learning_rate": 1.7801332338972813e-05, "loss": 0.3463, "step": 1419 }, { "epoch": 2.9068577277379735, "grad_norm": 0.44401272792536567, "learning_rate": 1.7772896061903824e-05, "loss": 0.3441, "step": 1420 }, { "epoch": 2.908904810644831, "grad_norm": 0.3346724756218441, "learning_rate": 1.7744464342847062e-05, "loss": 0.3243, "step": 1421 }, { "epoch": 2.910951893551689, "grad_norm": 0.37288189845084224, "learning_rate": 1.771603723999116e-05, "loss": 0.328, "step": 1422 }, { "epoch": 2.9129989764585464, "grad_norm": 0.3543864660540506, "learning_rate": 1.768761481151529e-05, "loss": 0.3239, "step": 1423 }, { "epoch": 2.9150460593654044, "grad_norm": 0.33338686726894085, "learning_rate": 1.765919711558906e-05, "loss": 0.3373, "step": 1424 }, { "epoch": 2.917093142272262, "grad_norm": 0.32542482524106037, "learning_rate": 1.7630784210372413e-05, "loss": 0.3473, "step": 1425 }, { "epoch": 2.91914022517912, "grad_norm": 0.333470225286717, "learning_rate": 1.7602376154015456e-05, "loss": 0.3285, "step": 1426 }, { "epoch": 2.9211873080859774, "grad_norm": 0.33573772915244177, "learning_rate": 1.7573973004658404e-05, "loss": 0.3024, "step": 1427 }, { "epoch": 2.9232343909928353, "grad_norm": 0.34259215508028346, "learning_rate": 1.7545574820431412e-05, "loss": 0.3315, "step": 1428 }, { "epoch": 2.925281473899693, "grad_norm": 0.33102210911596164, "learning_rate": 1.751718165945447e-05, "loss": 0.3519, "step": 1429 }, { "epoch": 2.927328556806551, "grad_norm": 0.32436438467262657, "learning_rate": 1.7488793579837297e-05, "loss": 0.3661, "step": 1430 }, { "epoch": 2.9293756397134083, "grad_norm": 0.31331782808616476, "learning_rate": 1.74604106396792e-05, "loss": 0.3141, "step": 1431 }, { "epoch": 2.9314227226202663, "grad_norm": 0.4141533412404601, "learning_rate": 1.743203289706898e-05, "loss": 0.3557, "step": 1432 }, { "epoch": 2.9334698055271238, "grad_norm": 0.4622088658579696, "learning_rate": 1.7403660410084806e-05, "loss": 0.399, "step": 1433 }, { "epoch": 2.9355168884339817, "grad_norm": 0.3236744034505386, "learning_rate": 1.737529323679407e-05, "loss": 0.3286, "step": 1434 }, { "epoch": 2.9375639713408392, "grad_norm": 0.2993041210404907, "learning_rate": 1.734693143525331e-05, "loss": 0.3259, "step": 1435 }, { "epoch": 2.939611054247697, "grad_norm": 0.31919342750079005, "learning_rate": 1.731857506350805e-05, "loss": 0.3438, "step": 1436 }, { "epoch": 2.9416581371545547, "grad_norm": 0.3479704658589382, "learning_rate": 1.7290224179592718e-05, "loss": 0.3561, "step": 1437 }, { "epoch": 2.943705220061412, "grad_norm": 0.32794236732374565, "learning_rate": 1.7261878841530494e-05, "loss": 0.2956, "step": 1438 }, { "epoch": 2.94575230296827, "grad_norm": 0.3348567062695728, "learning_rate": 1.7233539107333234e-05, "loss": 0.3207, "step": 1439 }, { "epoch": 2.947799385875128, "grad_norm": 0.3107019732230144, "learning_rate": 1.72052050350013e-05, "loss": 0.3324, "step": 1440 }, { "epoch": 2.9498464687819856, "grad_norm": 0.334794121032537, "learning_rate": 1.717687668252348e-05, "loss": 0.3296, "step": 1441 }, { "epoch": 2.951893551688843, "grad_norm": 0.34219772695899153, "learning_rate": 1.7148554107876847e-05, "loss": 0.3504, "step": 1442 }, { "epoch": 2.953940634595701, "grad_norm": 0.33351281779847475, "learning_rate": 1.7120237369026655e-05, "loss": 0.378, "step": 1443 }, { "epoch": 2.955987717502559, "grad_norm": 0.3475392701501788, "learning_rate": 1.7091926523926205e-05, "loss": 0.3437, "step": 1444 }, { "epoch": 2.9580348004094166, "grad_norm": 0.33100597785259966, "learning_rate": 1.7063621630516755e-05, "loss": 0.3289, "step": 1445 }, { "epoch": 2.960081883316274, "grad_norm": 0.3615110501855717, "learning_rate": 1.7035322746727366e-05, "loss": 0.3148, "step": 1446 }, { "epoch": 2.962128966223132, "grad_norm": 0.31694424871677895, "learning_rate": 1.7007029930474804e-05, "loss": 0.3389, "step": 1447 }, { "epoch": 2.96417604912999, "grad_norm": 0.3428230537054886, "learning_rate": 1.697874323966342e-05, "loss": 0.3286, "step": 1448 }, { "epoch": 2.9662231320368475, "grad_norm": 0.33741669479723163, "learning_rate": 1.6950462732185023e-05, "loss": 0.3197, "step": 1449 }, { "epoch": 2.968270214943705, "grad_norm": 0.3203471822470365, "learning_rate": 1.6922188465918763e-05, "loss": 0.3297, "step": 1450 }, { "epoch": 2.970317297850563, "grad_norm": 0.34824872554820474, "learning_rate": 1.689392049873104e-05, "loss": 0.3577, "step": 1451 }, { "epoch": 2.972364380757421, "grad_norm": 0.33484691097376085, "learning_rate": 1.6865658888475334e-05, "loss": 0.3252, "step": 1452 }, { "epoch": 2.9744114636642784, "grad_norm": 0.3184369996466899, "learning_rate": 1.6837403692992136e-05, "loss": 0.3267, "step": 1453 }, { "epoch": 2.976458546571136, "grad_norm": 0.34524241797219873, "learning_rate": 1.680915497010879e-05, "loss": 0.362, "step": 1454 }, { "epoch": 2.978505629477994, "grad_norm": 0.3517522038568365, "learning_rate": 1.6780912777639407e-05, "loss": 0.3455, "step": 1455 }, { "epoch": 2.9805527123848514, "grad_norm": 0.35084133720055133, "learning_rate": 1.6752677173384734e-05, "loss": 0.3476, "step": 1456 }, { "epoch": 2.9825997952917094, "grad_norm": 0.33510673606401725, "learning_rate": 1.6724448215132006e-05, "loss": 0.373, "step": 1457 }, { "epoch": 2.984646878198567, "grad_norm": 0.32173623731436524, "learning_rate": 1.669622596065491e-05, "loss": 0.3416, "step": 1458 }, { "epoch": 2.986693961105425, "grad_norm": 0.3405913760888214, "learning_rate": 1.6668010467713363e-05, "loss": 0.3336, "step": 1459 }, { "epoch": 2.9887410440122824, "grad_norm": 0.33253196052599165, "learning_rate": 1.6639801794053468e-05, "loss": 0.3281, "step": 1460 }, { "epoch": 2.9907881269191403, "grad_norm": 0.3127274815254226, "learning_rate": 1.6611599997407366e-05, "loss": 0.3219, "step": 1461 }, { "epoch": 2.992835209825998, "grad_norm": 0.35589359153787264, "learning_rate": 1.658340513549312e-05, "loss": 0.3253, "step": 1462 }, { "epoch": 2.9948822927328558, "grad_norm": 0.33592684199714334, "learning_rate": 1.6555217266014604e-05, "loss": 0.3679, "step": 1463 }, { "epoch": 2.9969293756397133, "grad_norm": 0.3293658658288109, "learning_rate": 1.6527036446661396e-05, "loss": 0.3107, "step": 1464 }, { "epoch": 2.9989764585465712, "grad_norm": 0.32358581638845413, "learning_rate": 1.649886273510861e-05, "loss": 0.3537, "step": 1465 }, { "epoch": 3.0010235414534288, "grad_norm": 0.6530375339161546, "learning_rate": 1.6470696189016853e-05, "loss": 0.4219, "step": 1466 }, { "epoch": 3.0030706243602867, "grad_norm": 0.41349403660281325, "learning_rate": 1.6442536866032027e-05, "loss": 0.2489, "step": 1467 }, { "epoch": 3.0051177072671442, "grad_norm": 0.49742564287488156, "learning_rate": 1.641438482378529e-05, "loss": 0.2308, "step": 1468 }, { "epoch": 3.007164790174002, "grad_norm": 0.5603085497770336, "learning_rate": 1.6386240119892867e-05, "loss": 0.2283, "step": 1469 }, { "epoch": 3.0092118730808597, "grad_norm": 0.37112551523365644, "learning_rate": 1.6358102811955985e-05, "loss": 0.2255, "step": 1470 }, { "epoch": 3.0112589559877176, "grad_norm": 0.4964687759956586, "learning_rate": 1.6329972957560736e-05, "loss": 0.2499, "step": 1471 }, { "epoch": 3.013306038894575, "grad_norm": 0.5159224089011558, "learning_rate": 1.6301850614277936e-05, "loss": 0.2899, "step": 1472 }, { "epoch": 3.015353121801433, "grad_norm": 0.3865653836943384, "learning_rate": 1.6273735839663044e-05, "loss": 0.23, "step": 1473 }, { "epoch": 3.0174002047082906, "grad_norm": 0.36109737314160684, "learning_rate": 1.6245628691256032e-05, "loss": 0.2087, "step": 1474 }, { "epoch": 3.0194472876151486, "grad_norm": 0.44209392274842507, "learning_rate": 1.6217529226581247e-05, "loss": 0.2523, "step": 1475 }, { "epoch": 3.021494370522006, "grad_norm": 0.4472352934034512, "learning_rate": 1.6189437503147338e-05, "loss": 0.284, "step": 1476 }, { "epoch": 3.023541453428864, "grad_norm": 0.3324235418829143, "learning_rate": 1.616135357844709e-05, "loss": 0.203, "step": 1477 }, { "epoch": 3.0255885363357216, "grad_norm": 0.36268455499664337, "learning_rate": 1.613327750995732e-05, "loss": 0.2223, "step": 1478 }, { "epoch": 3.0276356192425795, "grad_norm": 0.38026408150755925, "learning_rate": 1.61052093551388e-05, "loss": 0.2122, "step": 1479 }, { "epoch": 3.029682702149437, "grad_norm": 0.3766577760390122, "learning_rate": 1.6077149171436063e-05, "loss": 0.263, "step": 1480 }, { "epoch": 3.031729785056295, "grad_norm": 0.36393521520091426, "learning_rate": 1.6049097016277358e-05, "loss": 0.2729, "step": 1481 }, { "epoch": 3.0337768679631525, "grad_norm": 0.36192400255610996, "learning_rate": 1.60210529470745e-05, "loss": 0.2318, "step": 1482 }, { "epoch": 3.0358239508700104, "grad_norm": 0.35713778733325363, "learning_rate": 1.599301702122274e-05, "loss": 0.2356, "step": 1483 }, { "epoch": 3.037871033776868, "grad_norm": 0.3518563642429611, "learning_rate": 1.5964989296100682e-05, "loss": 0.2367, "step": 1484 }, { "epoch": 3.039918116683726, "grad_norm": 0.40298228736514674, "learning_rate": 1.5936969829070125e-05, "loss": 0.3027, "step": 1485 }, { "epoch": 3.0419651995905834, "grad_norm": 0.31807307761105635, "learning_rate": 1.590895867747599e-05, "loss": 0.2541, "step": 1486 }, { "epoch": 3.044012282497441, "grad_norm": 0.3409445938787892, "learning_rate": 1.588095589864615e-05, "loss": 0.2289, "step": 1487 }, { "epoch": 3.046059365404299, "grad_norm": 0.3600773888625388, "learning_rate": 1.5852961549891376e-05, "loss": 0.2341, "step": 1488 }, { "epoch": 3.0481064483111564, "grad_norm": 0.32453736981408465, "learning_rate": 1.582497568850517e-05, "loss": 0.2797, "step": 1489 }, { "epoch": 3.0501535312180144, "grad_norm": 0.3331181801930424, "learning_rate": 1.579699837176366e-05, "loss": 0.2036, "step": 1490 }, { "epoch": 3.052200614124872, "grad_norm": 0.4136442196087828, "learning_rate": 1.5769029656925486e-05, "loss": 0.2599, "step": 1491 }, { "epoch": 3.05424769703173, "grad_norm": 0.35863640915527445, "learning_rate": 1.574106960123169e-05, "loss": 0.2526, "step": 1492 }, { "epoch": 3.0562947799385873, "grad_norm": 0.35564688451602927, "learning_rate": 1.571311826190559e-05, "loss": 0.2503, "step": 1493 }, { "epoch": 3.0583418628454453, "grad_norm": 0.33302867243814094, "learning_rate": 1.5685175696152657e-05, "loss": 0.2283, "step": 1494 }, { "epoch": 3.060388945752303, "grad_norm": 0.36946655881677004, "learning_rate": 1.5657241961160434e-05, "loss": 0.272, "step": 1495 }, { "epoch": 3.0624360286591608, "grad_norm": 0.33870469772598516, "learning_rate": 1.562931711409835e-05, "loss": 0.2527, "step": 1496 }, { "epoch": 3.0644831115660183, "grad_norm": 0.29057383945914955, "learning_rate": 1.5601401212117676e-05, "loss": 0.2468, "step": 1497 }, { "epoch": 3.0665301944728762, "grad_norm": 0.3538607726440218, "learning_rate": 1.557349431235135e-05, "loss": 0.2527, "step": 1498 }, { "epoch": 3.0685772773797337, "grad_norm": 0.36881305364134004, "learning_rate": 1.554559647191392e-05, "loss": 0.2572, "step": 1499 }, { "epoch": 3.0706243602865917, "grad_norm": 0.31303360312121764, "learning_rate": 1.5517707747901352e-05, "loss": 0.2015, "step": 1500 }, { "epoch": 3.072671443193449, "grad_norm": 0.3468523227469292, "learning_rate": 1.5489828197390988e-05, "loss": 0.2522, "step": 1501 }, { "epoch": 3.074718526100307, "grad_norm": 0.3189025943422717, "learning_rate": 1.5461957877441387e-05, "loss": 0.2467, "step": 1502 }, { "epoch": 3.0767656090071647, "grad_norm": 0.36693872460390387, "learning_rate": 1.5434096845092203e-05, "loss": 0.2266, "step": 1503 }, { "epoch": 3.0788126919140226, "grad_norm": 0.33990261356941326, "learning_rate": 1.5406245157364093e-05, "loss": 0.2511, "step": 1504 }, { "epoch": 3.08085977482088, "grad_norm": 0.3224675841478402, "learning_rate": 1.537840287125859e-05, "loss": 0.2013, "step": 1505 }, { "epoch": 3.082906857727738, "grad_norm": 0.3392490287207665, "learning_rate": 1.5350570043757976e-05, "loss": 0.2222, "step": 1506 }, { "epoch": 3.0849539406345956, "grad_norm": 0.33119892853448313, "learning_rate": 1.5322746731825195e-05, "loss": 0.2403, "step": 1507 }, { "epoch": 3.0870010235414536, "grad_norm": 0.3189197704594123, "learning_rate": 1.5294932992403695e-05, "loss": 0.2156, "step": 1508 }, { "epoch": 3.089048106448311, "grad_norm": 0.33202442707430363, "learning_rate": 1.526712888241734e-05, "loss": 0.2449, "step": 1509 }, { "epoch": 3.091095189355169, "grad_norm": 0.3126714573879982, "learning_rate": 1.5239334458770291e-05, "loss": 0.2345, "step": 1510 }, { "epoch": 3.0931422722620265, "grad_norm": 0.3199987863370429, "learning_rate": 1.5211549778346882e-05, "loss": 0.2547, "step": 1511 }, { "epoch": 3.0951893551688845, "grad_norm": 0.2865206932540882, "learning_rate": 1.5183774898011496e-05, "loss": 0.2262, "step": 1512 }, { "epoch": 3.097236438075742, "grad_norm": 0.2978122286180525, "learning_rate": 1.5156009874608484e-05, "loss": 0.2454, "step": 1513 }, { "epoch": 3.0992835209826, "grad_norm": 0.33009627047245504, "learning_rate": 1.5128254764962e-05, "loss": 0.2762, "step": 1514 }, { "epoch": 3.1013306038894575, "grad_norm": 0.328880261572203, "learning_rate": 1.5100509625875921e-05, "loss": 0.3284, "step": 1515 }, { "epoch": 3.1033776867963154, "grad_norm": 0.31916303688606795, "learning_rate": 1.5072774514133708e-05, "loss": 0.2299, "step": 1516 }, { "epoch": 3.105424769703173, "grad_norm": 0.3226893313357512, "learning_rate": 1.5045049486498311e-05, "loss": 0.2338, "step": 1517 }, { "epoch": 3.107471852610031, "grad_norm": 0.31484899101790875, "learning_rate": 1.5017334599712028e-05, "loss": 0.2039, "step": 1518 }, { "epoch": 3.1095189355168884, "grad_norm": 0.32998837828193894, "learning_rate": 1.4989629910496424e-05, "loss": 0.2345, "step": 1519 }, { "epoch": 3.1115660184237464, "grad_norm": 0.3027257775351982, "learning_rate": 1.4961935475552178e-05, "loss": 0.2285, "step": 1520 }, { "epoch": 3.113613101330604, "grad_norm": 0.3071518323366728, "learning_rate": 1.4934251351558983e-05, "loss": 0.2384, "step": 1521 }, { "epoch": 3.115660184237462, "grad_norm": 0.30152633341709373, "learning_rate": 1.4906577595175428e-05, "loss": 0.2201, "step": 1522 }, { "epoch": 3.1177072671443193, "grad_norm": 0.33663282454591203, "learning_rate": 1.4878914263038895e-05, "loss": 0.2324, "step": 1523 }, { "epoch": 3.119754350051177, "grad_norm": 0.33606866012096387, "learning_rate": 1.4851261411765414e-05, "loss": 0.2629, "step": 1524 }, { "epoch": 3.121801432958035, "grad_norm": 0.32073469626766654, "learning_rate": 1.4823619097949584e-05, "loss": 0.2309, "step": 1525 }, { "epoch": 3.1238485158648923, "grad_norm": 0.34825982979835035, "learning_rate": 1.4795987378164432e-05, "loss": 0.2361, "step": 1526 }, { "epoch": 3.1258955987717503, "grad_norm": 0.32245272558823557, "learning_rate": 1.4768366308961288e-05, "loss": 0.2281, "step": 1527 }, { "epoch": 3.127942681678608, "grad_norm": 0.35652651564540333, "learning_rate": 1.4740755946869708e-05, "loss": 0.2508, "step": 1528 }, { "epoch": 3.1299897645854657, "grad_norm": 0.3215049499878307, "learning_rate": 1.4713156348397317e-05, "loss": 0.2144, "step": 1529 }, { "epoch": 3.1320368474923233, "grad_norm": 0.33346781365441597, "learning_rate": 1.468556757002972e-05, "loss": 0.2425, "step": 1530 }, { "epoch": 3.134083930399181, "grad_norm": 0.35639968613552386, "learning_rate": 1.4657989668230363e-05, "loss": 0.2227, "step": 1531 }, { "epoch": 3.1361310133060387, "grad_norm": 0.3010242077787357, "learning_rate": 1.4630422699440461e-05, "loss": 0.2742, "step": 1532 }, { "epoch": 3.1381780962128967, "grad_norm": 0.36016875940221305, "learning_rate": 1.4602866720078832e-05, "loss": 0.2747, "step": 1533 }, { "epoch": 3.140225179119754, "grad_norm": 0.35237386827329403, "learning_rate": 1.4575321786541801e-05, "loss": 0.2408, "step": 1534 }, { "epoch": 3.142272262026612, "grad_norm": 0.319112209857982, "learning_rate": 1.45477879552031e-05, "loss": 0.2488, "step": 1535 }, { "epoch": 3.1443193449334697, "grad_norm": 0.2967734684654477, "learning_rate": 1.4520265282413722e-05, "loss": 0.213, "step": 1536 }, { "epoch": 3.1463664278403276, "grad_norm": 0.3524504353601547, "learning_rate": 1.4492753824501833e-05, "loss": 0.222, "step": 1537 }, { "epoch": 3.148413510747185, "grad_norm": 0.33500821048449647, "learning_rate": 1.4465253637772651e-05, "loss": 0.2513, "step": 1538 }, { "epoch": 3.150460593654043, "grad_norm": 0.34077100422476553, "learning_rate": 1.443776477850833e-05, "loss": 0.2701, "step": 1539 }, { "epoch": 3.1525076765609006, "grad_norm": 0.34275143758140053, "learning_rate": 1.4410287302967813e-05, "loss": 0.3137, "step": 1540 }, { "epoch": 3.1545547594677585, "grad_norm": 0.31464152797503897, "learning_rate": 1.4382821267386781e-05, "loss": 0.2329, "step": 1541 }, { "epoch": 3.156601842374616, "grad_norm": 0.29997611145802033, "learning_rate": 1.4355366727977473e-05, "loss": 0.2355, "step": 1542 }, { "epoch": 3.158648925281474, "grad_norm": 0.3098740828854044, "learning_rate": 1.4327923740928613e-05, "loss": 0.2364, "step": 1543 }, { "epoch": 3.1606960081883315, "grad_norm": 0.3182081076670239, "learning_rate": 1.4300492362405296e-05, "loss": 0.2307, "step": 1544 }, { "epoch": 3.1627430910951895, "grad_norm": 0.33289523643049107, "learning_rate": 1.4273072648548827e-05, "loss": 0.2658, "step": 1545 }, { "epoch": 3.164790174002047, "grad_norm": 0.3483349343296588, "learning_rate": 1.4245664655476663e-05, "loss": 0.239, "step": 1546 }, { "epoch": 3.166837256908905, "grad_norm": 0.3344228654090122, "learning_rate": 1.4218268439282259e-05, "loss": 0.2136, "step": 1547 }, { "epoch": 3.1688843398157625, "grad_norm": 0.31443022141500176, "learning_rate": 1.4190884056034983e-05, "loss": 0.2642, "step": 1548 }, { "epoch": 3.1709314227226204, "grad_norm": 0.32146571298562293, "learning_rate": 1.4163511561779956e-05, "loss": 0.2532, "step": 1549 }, { "epoch": 3.172978505629478, "grad_norm": 0.3550608808689066, "learning_rate": 1.4136151012538008e-05, "loss": 0.2358, "step": 1550 }, { "epoch": 3.175025588536336, "grad_norm": 0.3166912892735251, "learning_rate": 1.4108802464305496e-05, "loss": 0.2128, "step": 1551 }, { "epoch": 3.1770726714431934, "grad_norm": 0.3332651741225492, "learning_rate": 1.4081465973054216e-05, "loss": 0.2423, "step": 1552 }, { "epoch": 3.1791197543500513, "grad_norm": 0.3369578277421136, "learning_rate": 1.4054141594731289e-05, "loss": 0.205, "step": 1553 }, { "epoch": 3.181166837256909, "grad_norm": 0.3549410163798208, "learning_rate": 1.402682938525906e-05, "loss": 0.2587, "step": 1554 }, { "epoch": 3.183213920163767, "grad_norm": 0.31908575274366446, "learning_rate": 1.3999529400534941e-05, "loss": 0.2669, "step": 1555 }, { "epoch": 3.1852610030706243, "grad_norm": 0.33778438982482334, "learning_rate": 1.3972241696431357e-05, "loss": 0.244, "step": 1556 }, { "epoch": 3.1873080859774823, "grad_norm": 0.3536190099748012, "learning_rate": 1.3944966328795584e-05, "loss": 0.243, "step": 1557 }, { "epoch": 3.18935516888434, "grad_norm": 0.3395539666321202, "learning_rate": 1.3917703353449646e-05, "loss": 0.2231, "step": 1558 }, { "epoch": 3.1914022517911977, "grad_norm": 0.3107794197012843, "learning_rate": 1.3890452826190208e-05, "loss": 0.203, "step": 1559 }, { "epoch": 3.1934493346980553, "grad_norm": 0.33383730315910054, "learning_rate": 1.3863214802788459e-05, "loss": 0.2239, "step": 1560 }, { "epoch": 3.1954964176049128, "grad_norm": 0.3236027308800084, "learning_rate": 1.3835989338989996e-05, "loss": 0.2602, "step": 1561 }, { "epoch": 3.1975435005117707, "grad_norm": 0.3383450416141259, "learning_rate": 1.3808776490514727e-05, "loss": 0.2775, "step": 1562 }, { "epoch": 3.1995905834186287, "grad_norm": 0.29556678254047786, "learning_rate": 1.3781576313056713e-05, "loss": 0.2305, "step": 1563 }, { "epoch": 3.201637666325486, "grad_norm": 0.34240014338764224, "learning_rate": 1.375438886228411e-05, "loss": 0.2284, "step": 1564 }, { "epoch": 3.2036847492323437, "grad_norm": 0.35927171229864285, "learning_rate": 1.3727214193839002e-05, "loss": 0.2669, "step": 1565 }, { "epoch": 3.2057318321392017, "grad_norm": 0.295141784625082, "learning_rate": 1.3700052363337337e-05, "loss": 0.2237, "step": 1566 }, { "epoch": 3.207778915046059, "grad_norm": 0.31352782892999664, "learning_rate": 1.3672903426368773e-05, "loss": 0.2105, "step": 1567 }, { "epoch": 3.209825997952917, "grad_norm": 0.31282387528552047, "learning_rate": 1.3645767438496567e-05, "loss": 0.2252, "step": 1568 }, { "epoch": 3.2118730808597746, "grad_norm": 0.3487407042028981, "learning_rate": 1.3618644455257521e-05, "loss": 0.2717, "step": 1569 }, { "epoch": 3.2139201637666326, "grad_norm": 0.34749040489450855, "learning_rate": 1.3591534532161781e-05, "loss": 0.2463, "step": 1570 }, { "epoch": 3.21596724667349, "grad_norm": 0.32899297696851715, "learning_rate": 1.3564437724692766e-05, "loss": 0.2275, "step": 1571 }, { "epoch": 3.218014329580348, "grad_norm": 0.2725890794790335, "learning_rate": 1.353735408830707e-05, "loss": 0.2153, "step": 1572 }, { "epoch": 3.2200614124872056, "grad_norm": 0.37569994527165246, "learning_rate": 1.3510283678434317e-05, "loss": 0.2445, "step": 1573 }, { "epoch": 3.2221084953940635, "grad_norm": 0.3527821656094053, "learning_rate": 1.348322655047707e-05, "loss": 0.2088, "step": 1574 }, { "epoch": 3.224155578300921, "grad_norm": 0.3319266141186732, "learning_rate": 1.3456182759810708e-05, "loss": 0.2336, "step": 1575 }, { "epoch": 3.226202661207779, "grad_norm": 0.3545486115689844, "learning_rate": 1.3429152361783307e-05, "loss": 0.2681, "step": 1576 }, { "epoch": 3.2282497441146365, "grad_norm": 0.3385783041962988, "learning_rate": 1.3402135411715545e-05, "loss": 0.2315, "step": 1577 }, { "epoch": 3.2302968270214945, "grad_norm": 0.32938060023769156, "learning_rate": 1.337513196490056e-05, "loss": 0.2498, "step": 1578 }, { "epoch": 3.232343909928352, "grad_norm": 0.30316278108042816, "learning_rate": 1.3348142076603876e-05, "loss": 0.1928, "step": 1579 }, { "epoch": 3.23439099283521, "grad_norm": 0.34764955355779054, "learning_rate": 1.3321165802063243e-05, "loss": 0.28, "step": 1580 }, { "epoch": 3.2364380757420674, "grad_norm": 0.3268398726034169, "learning_rate": 1.3294203196488576e-05, "loss": 0.2804, "step": 1581 }, { "epoch": 3.2384851586489254, "grad_norm": 0.31999282008130525, "learning_rate": 1.3267254315061797e-05, "loss": 0.26, "step": 1582 }, { "epoch": 3.240532241555783, "grad_norm": 0.31712302189517827, "learning_rate": 1.324031921293674e-05, "loss": 0.23, "step": 1583 }, { "epoch": 3.242579324462641, "grad_norm": 0.3137795221764201, "learning_rate": 1.3213397945239053e-05, "loss": 0.243, "step": 1584 }, { "epoch": 3.2446264073694984, "grad_norm": 0.3236833391733087, "learning_rate": 1.318649056706605e-05, "loss": 0.2621, "step": 1585 }, { "epoch": 3.2466734902763563, "grad_norm": 0.28341768420158997, "learning_rate": 1.3159597133486628e-05, "loss": 0.2105, "step": 1586 }, { "epoch": 3.248720573183214, "grad_norm": 0.34778024409649644, "learning_rate": 1.313271769954115e-05, "loss": 0.2899, "step": 1587 }, { "epoch": 3.250767656090072, "grad_norm": 0.3072274493404741, "learning_rate": 1.3105852320241326e-05, "loss": 0.2141, "step": 1588 }, { "epoch": 3.2528147389969293, "grad_norm": 0.32380150432128973, "learning_rate": 1.307900105057009e-05, "loss": 0.2218, "step": 1589 }, { "epoch": 3.2548618219037873, "grad_norm": 0.3234313545922506, "learning_rate": 1.3052163945481517e-05, "loss": 0.2301, "step": 1590 }, { "epoch": 3.2569089048106448, "grad_norm": 0.3409588063960833, "learning_rate": 1.3025341059900675e-05, "loss": 0.2331, "step": 1591 }, { "epoch": 3.2589559877175027, "grad_norm": 0.33214628078621744, "learning_rate": 1.2998532448723536e-05, "loss": 0.2253, "step": 1592 }, { "epoch": 3.2610030706243602, "grad_norm": 0.3495327746489498, "learning_rate": 1.2971738166816871e-05, "loss": 0.2369, "step": 1593 }, { "epoch": 3.263050153531218, "grad_norm": 0.35093939156017323, "learning_rate": 1.2944958269018103e-05, "loss": 0.2329, "step": 1594 }, { "epoch": 3.2650972364380757, "grad_norm": 0.3204843846779589, "learning_rate": 1.291819281013524e-05, "loss": 0.2144, "step": 1595 }, { "epoch": 3.2671443193449337, "grad_norm": 0.3484251329111784, "learning_rate": 1.289144184494671e-05, "loss": 0.2531, "step": 1596 }, { "epoch": 3.269191402251791, "grad_norm": 0.29945051356789365, "learning_rate": 1.2864705428201307e-05, "loss": 0.2293, "step": 1597 }, { "epoch": 3.2712384851586487, "grad_norm": 0.3521856225970276, "learning_rate": 1.2837983614618023e-05, "loss": 0.3006, "step": 1598 }, { "epoch": 3.2732855680655066, "grad_norm": 0.3221884922752649, "learning_rate": 1.2811276458885993e-05, "loss": 0.2331, "step": 1599 }, { "epoch": 3.2753326509723646, "grad_norm": 0.30627973138617387, "learning_rate": 1.2784584015664337e-05, "loss": 0.222, "step": 1600 }, { "epoch": 3.277379733879222, "grad_norm": 0.3650912580402862, "learning_rate": 1.2757906339582053e-05, "loss": 0.269, "step": 1601 }, { "epoch": 3.2794268167860796, "grad_norm": 0.2847330580470038, "learning_rate": 1.2731243485237932e-05, "loss": 0.2161, "step": 1602 }, { "epoch": 3.2814738996929376, "grad_norm": 0.3140163871014674, "learning_rate": 1.2704595507200435e-05, "loss": 0.271, "step": 1603 }, { "epoch": 3.2835209825997955, "grad_norm": 0.324180451733573, "learning_rate": 1.2677962460007555e-05, "loss": 0.2124, "step": 1604 }, { "epoch": 3.285568065506653, "grad_norm": 0.32841827310726324, "learning_rate": 1.2651344398166745e-05, "loss": 0.2905, "step": 1605 }, { "epoch": 3.2876151484135105, "grad_norm": 0.2992524116524214, "learning_rate": 1.26247413761548e-05, "loss": 0.1951, "step": 1606 }, { "epoch": 3.2896622313203685, "grad_norm": 0.3536568449695457, "learning_rate": 1.2598153448417701e-05, "loss": 0.3008, "step": 1607 }, { "epoch": 3.291709314227226, "grad_norm": 0.2745017989746043, "learning_rate": 1.2571580669370565e-05, "loss": 0.218, "step": 1608 }, { "epoch": 3.293756397134084, "grad_norm": 0.336785618203528, "learning_rate": 1.254502309339749e-05, "loss": 0.2163, "step": 1609 }, { "epoch": 3.2958034800409415, "grad_norm": 0.29711300837975474, "learning_rate": 1.2518480774851472e-05, "loss": 0.2217, "step": 1610 }, { "epoch": 3.2978505629477994, "grad_norm": 0.32116672616384045, "learning_rate": 1.2491953768054263e-05, "loss": 0.2595, "step": 1611 }, { "epoch": 3.299897645854657, "grad_norm": 0.3367433516635412, "learning_rate": 1.2465442127296297e-05, "loss": 0.2293, "step": 1612 }, { "epoch": 3.301944728761515, "grad_norm": 0.3300984778131218, "learning_rate": 1.2438945906836557e-05, "loss": 0.2443, "step": 1613 }, { "epoch": 3.3039918116683724, "grad_norm": 0.31907195158009316, "learning_rate": 1.241246516090245e-05, "loss": 0.2321, "step": 1614 }, { "epoch": 3.3060388945752304, "grad_norm": 0.3096668811954514, "learning_rate": 1.2385999943689732e-05, "loss": 0.2216, "step": 1615 }, { "epoch": 3.308085977482088, "grad_norm": 0.3776462069541184, "learning_rate": 1.2359550309362368e-05, "loss": 0.2104, "step": 1616 }, { "epoch": 3.310133060388946, "grad_norm": 0.3196294164019087, "learning_rate": 1.2333116312052416e-05, "loss": 0.2403, "step": 1617 }, { "epoch": 3.3121801432958033, "grad_norm": 0.34176563178349245, "learning_rate": 1.2306698005859975e-05, "loss": 0.2409, "step": 1618 }, { "epoch": 3.3142272262026613, "grad_norm": 0.34417655045127915, "learning_rate": 1.2280295444852994e-05, "loss": 0.2899, "step": 1619 }, { "epoch": 3.316274309109519, "grad_norm": 0.34668338984896196, "learning_rate": 1.22539086830672e-05, "loss": 0.216, "step": 1620 }, { "epoch": 3.3183213920163768, "grad_norm": 0.307563527231833, "learning_rate": 1.2227537774505996e-05, "loss": 0.204, "step": 1621 }, { "epoch": 3.3203684749232343, "grad_norm": 0.36360020699523277, "learning_rate": 1.2201182773140334e-05, "loss": 0.2321, "step": 1622 }, { "epoch": 3.3224155578300922, "grad_norm": 0.34959023754084967, "learning_rate": 1.2174843732908609e-05, "loss": 0.2417, "step": 1623 }, { "epoch": 3.3244626407369497, "grad_norm": 0.35791334099991956, "learning_rate": 1.2148520707716567e-05, "loss": 0.267, "step": 1624 }, { "epoch": 3.3265097236438077, "grad_norm": 0.31882901909973, "learning_rate": 1.2122213751437147e-05, "loss": 0.2368, "step": 1625 }, { "epoch": 3.328556806550665, "grad_norm": 0.31734552594196086, "learning_rate": 1.2095922917910427e-05, "loss": 0.2437, "step": 1626 }, { "epoch": 3.330603889457523, "grad_norm": 0.3303738558908603, "learning_rate": 1.2069648260943473e-05, "loss": 0.2326, "step": 1627 }, { "epoch": 3.3326509723643807, "grad_norm": 0.32163501618546503, "learning_rate": 1.2043389834310257e-05, "loss": 0.2579, "step": 1628 }, { "epoch": 3.3346980552712386, "grad_norm": 0.33400441651552865, "learning_rate": 1.2017147691751512e-05, "loss": 0.2572, "step": 1629 }, { "epoch": 3.336745138178096, "grad_norm": 0.3062361134778084, "learning_rate": 1.1990921886974669e-05, "loss": 0.1971, "step": 1630 }, { "epoch": 3.338792221084954, "grad_norm": 0.3617880140065343, "learning_rate": 1.1964712473653713e-05, "loss": 0.2752, "step": 1631 }, { "epoch": 3.3408393039918116, "grad_norm": 0.31208903084798195, "learning_rate": 1.1938519505429072e-05, "loss": 0.1891, "step": 1632 }, { "epoch": 3.3428863868986696, "grad_norm": 0.3846438154154162, "learning_rate": 1.1912343035907535e-05, "loss": 0.2663, "step": 1633 }, { "epoch": 3.344933469805527, "grad_norm": 0.3360119347175398, "learning_rate": 1.1886183118662108e-05, "loss": 0.225, "step": 1634 }, { "epoch": 3.346980552712385, "grad_norm": 0.34434286466245134, "learning_rate": 1.1860039807231923e-05, "loss": 0.2311, "step": 1635 }, { "epoch": 3.3490276356192425, "grad_norm": 0.3193089695495996, "learning_rate": 1.1833913155122132e-05, "loss": 0.1958, "step": 1636 }, { "epoch": 3.3510747185261005, "grad_norm": 0.34254339904952036, "learning_rate": 1.1807803215803806e-05, "loss": 0.2301, "step": 1637 }, { "epoch": 3.353121801432958, "grad_norm": 0.37284073286468844, "learning_rate": 1.1781710042713783e-05, "loss": 0.2321, "step": 1638 }, { "epoch": 3.3551688843398155, "grad_norm": 0.3400024012806843, "learning_rate": 1.1755633689254609e-05, "loss": 0.2481, "step": 1639 }, { "epoch": 3.3572159672466735, "grad_norm": 0.3381525204885461, "learning_rate": 1.1729574208794388e-05, "loss": 0.2486, "step": 1640 }, { "epoch": 3.3592630501535314, "grad_norm": 0.3197266149899627, "learning_rate": 1.1703531654666714e-05, "loss": 0.2368, "step": 1641 }, { "epoch": 3.361310133060389, "grad_norm": 0.3130217545123078, "learning_rate": 1.1677506080170512e-05, "loss": 0.2342, "step": 1642 }, { "epoch": 3.3633572159672465, "grad_norm": 0.32947434661394787, "learning_rate": 1.1651497538569984e-05, "loss": 0.2124, "step": 1643 }, { "epoch": 3.3654042988741044, "grad_norm": 0.344686988795317, "learning_rate": 1.162550608309446e-05, "loss": 0.2464, "step": 1644 }, { "epoch": 3.3674513817809624, "grad_norm": 0.30351992821717133, "learning_rate": 1.1599531766938306e-05, "loss": 0.2378, "step": 1645 }, { "epoch": 3.36949846468782, "grad_norm": 0.31537723127663564, "learning_rate": 1.1573574643260787e-05, "loss": 0.2683, "step": 1646 }, { "epoch": 3.3715455475946774, "grad_norm": 0.3179109407008773, "learning_rate": 1.1547634765186016e-05, "loss": 0.21, "step": 1647 }, { "epoch": 3.3735926305015353, "grad_norm": 0.3414400925992681, "learning_rate": 1.1521712185802789e-05, "loss": 0.2467, "step": 1648 }, { "epoch": 3.375639713408393, "grad_norm": 0.3015513958093698, "learning_rate": 1.1495806958164508e-05, "loss": 0.2333, "step": 1649 }, { "epoch": 3.377686796315251, "grad_norm": 0.3221671987582906, "learning_rate": 1.1469919135289058e-05, "loss": 0.2697, "step": 1650 }, { "epoch": 3.3797338792221083, "grad_norm": 0.29450678032670125, "learning_rate": 1.1444048770158718e-05, "loss": 0.2255, "step": 1651 }, { "epoch": 3.3817809621289663, "grad_norm": 0.39235810212847905, "learning_rate": 1.1418195915720002e-05, "loss": 0.2599, "step": 1652 }, { "epoch": 3.383828045035824, "grad_norm": 0.2945867088296247, "learning_rate": 1.139236062488362e-05, "loss": 0.2336, "step": 1653 }, { "epoch": 3.3858751279426818, "grad_norm": 0.3178722215887932, "learning_rate": 1.136654295052433e-05, "loss": 0.266, "step": 1654 }, { "epoch": 3.3879222108495393, "grad_norm": 0.32262451442066525, "learning_rate": 1.134074294548082e-05, "loss": 0.2282, "step": 1655 }, { "epoch": 3.389969293756397, "grad_norm": 0.3260714484305257, "learning_rate": 1.1314960662555639e-05, "loss": 0.2294, "step": 1656 }, { "epoch": 3.3920163766632547, "grad_norm": 0.29552572043468256, "learning_rate": 1.1289196154515048e-05, "loss": 0.1852, "step": 1657 }, { "epoch": 3.3940634595701127, "grad_norm": 0.32257518359378057, "learning_rate": 1.1263449474088944e-05, "loss": 0.2122, "step": 1658 }, { "epoch": 3.39611054247697, "grad_norm": 0.31678272789529893, "learning_rate": 1.1237720673970713e-05, "loss": 0.2391, "step": 1659 }, { "epoch": 3.398157625383828, "grad_norm": 0.3294063880106706, "learning_rate": 1.1212009806817163e-05, "loss": 0.2582, "step": 1660 }, { "epoch": 3.4002047082906857, "grad_norm": 0.33445038877415256, "learning_rate": 1.118631692524843e-05, "loss": 0.2325, "step": 1661 }, { "epoch": 3.4022517911975436, "grad_norm": 0.31780769894415345, "learning_rate": 1.1160642081847782e-05, "loss": 0.2114, "step": 1662 }, { "epoch": 3.404298874104401, "grad_norm": 0.34936980398469303, "learning_rate": 1.1134985329161608e-05, "loss": 0.2633, "step": 1663 }, { "epoch": 3.406345957011259, "grad_norm": 0.25267194298381324, "learning_rate": 1.1109346719699263e-05, "loss": 0.1672, "step": 1664 }, { "epoch": 3.4083930399181166, "grad_norm": 0.39414125375432546, "learning_rate": 1.108372630593298e-05, "loss": 0.3388, "step": 1665 }, { "epoch": 3.4104401228249746, "grad_norm": 0.31231427401371586, "learning_rate": 1.1058124140297718e-05, "loss": 0.247, "step": 1666 }, { "epoch": 3.412487205731832, "grad_norm": 0.2819687858861257, "learning_rate": 1.1032540275191148e-05, "loss": 0.2171, "step": 1667 }, { "epoch": 3.41453428863869, "grad_norm": 0.32854044476275013, "learning_rate": 1.1006974762973425e-05, "loss": 0.2265, "step": 1668 }, { "epoch": 3.4165813715455475, "grad_norm": 0.34353435192327664, "learning_rate": 1.0981427655967183e-05, "loss": 0.2469, "step": 1669 }, { "epoch": 3.4186284544524055, "grad_norm": 0.32710285779281467, "learning_rate": 1.0955899006457373e-05, "loss": 0.2437, "step": 1670 }, { "epoch": 3.420675537359263, "grad_norm": 0.3122882795616273, "learning_rate": 1.0930388866691181e-05, "loss": 0.2433, "step": 1671 }, { "epoch": 3.422722620266121, "grad_norm": 0.29841602691342367, "learning_rate": 1.0904897288877891e-05, "loss": 0.2373, "step": 1672 }, { "epoch": 3.4247697031729785, "grad_norm": 0.32318146749811455, "learning_rate": 1.0879424325188805e-05, "loss": 0.2477, "step": 1673 }, { "epoch": 3.4268167860798364, "grad_norm": 0.32962691544440564, "learning_rate": 1.085397002775716e-05, "loss": 0.2577, "step": 1674 }, { "epoch": 3.428863868986694, "grad_norm": 0.3028821115102944, "learning_rate": 1.0828534448677942e-05, "loss": 0.256, "step": 1675 }, { "epoch": 3.4309109518935514, "grad_norm": 0.31035238566507123, "learning_rate": 1.080311764000786e-05, "loss": 0.223, "step": 1676 }, { "epoch": 3.4329580348004094, "grad_norm": 0.30168173543643445, "learning_rate": 1.0777719653765191e-05, "loss": 0.2389, "step": 1677 }, { "epoch": 3.4350051177072674, "grad_norm": 0.28500495929546144, "learning_rate": 1.0752340541929711e-05, "loss": 0.226, "step": 1678 }, { "epoch": 3.437052200614125, "grad_norm": 0.3141146687901098, "learning_rate": 1.0726980356442524e-05, "loss": 0.2684, "step": 1679 }, { "epoch": 3.4390992835209824, "grad_norm": 0.4682196146944551, "learning_rate": 1.0701639149206061e-05, "loss": 0.2647, "step": 1680 }, { "epoch": 3.4411463664278403, "grad_norm": 0.3365183925944955, "learning_rate": 1.0676316972083867e-05, "loss": 0.2432, "step": 1681 }, { "epoch": 3.4431934493346983, "grad_norm": 0.28313743388646095, "learning_rate": 1.0651013876900546e-05, "loss": 0.2099, "step": 1682 }, { "epoch": 3.445240532241556, "grad_norm": 0.34192081983514505, "learning_rate": 1.0625729915441659e-05, "loss": 0.2482, "step": 1683 }, { "epoch": 3.4472876151484133, "grad_norm": 0.31305659538484126, "learning_rate": 1.060046513945361e-05, "loss": 0.2137, "step": 1684 }, { "epoch": 3.4493346980552713, "grad_norm": 0.3191034853933784, "learning_rate": 1.0575219600643508e-05, "loss": 0.2329, "step": 1685 }, { "epoch": 3.4513817809621288, "grad_norm": 0.3245403446485034, "learning_rate": 1.0549993350679138e-05, "loss": 0.235, "step": 1686 }, { "epoch": 3.4534288638689867, "grad_norm": 0.3088632467059862, "learning_rate": 1.0524786441188786e-05, "loss": 0.2155, "step": 1687 }, { "epoch": 3.4554759467758442, "grad_norm": 0.3354339197261192, "learning_rate": 1.0499598923761139e-05, "loss": 0.2341, "step": 1688 }, { "epoch": 3.457523029682702, "grad_norm": 0.3090008978999402, "learning_rate": 1.0474430849945214e-05, "loss": 0.2081, "step": 1689 }, { "epoch": 3.4595701125895597, "grad_norm": 0.30640868139315103, "learning_rate": 1.0449282271250239e-05, "loss": 0.2258, "step": 1690 }, { "epoch": 3.4616171954964177, "grad_norm": 0.325313731285426, "learning_rate": 1.0424153239145527e-05, "loss": 0.267, "step": 1691 }, { "epoch": 3.463664278403275, "grad_norm": 0.3226758771977029, "learning_rate": 1.0399043805060406e-05, "loss": 0.2761, "step": 1692 }, { "epoch": 3.465711361310133, "grad_norm": 0.2751548117016905, "learning_rate": 1.0373954020384073e-05, "loss": 0.2102, "step": 1693 }, { "epoch": 3.4677584442169906, "grad_norm": 0.3197825524327039, "learning_rate": 1.0348883936465537e-05, "loss": 0.2558, "step": 1694 }, { "epoch": 3.4698055271238486, "grad_norm": 0.3009633405603159, "learning_rate": 1.0323833604613454e-05, "loss": 0.2214, "step": 1695 }, { "epoch": 3.471852610030706, "grad_norm": 0.3104955641697045, "learning_rate": 1.029880307609608e-05, "loss": 0.2686, "step": 1696 }, { "epoch": 3.473899692937564, "grad_norm": 0.2692226200882447, "learning_rate": 1.0273792402141134e-05, "loss": 0.2312, "step": 1697 }, { "epoch": 3.4759467758444216, "grad_norm": 0.31761762398562665, "learning_rate": 1.0248801633935699e-05, "loss": 0.2536, "step": 1698 }, { "epoch": 3.4779938587512795, "grad_norm": 0.3196706676343915, "learning_rate": 1.0223830822626124e-05, "loss": 0.2556, "step": 1699 }, { "epoch": 3.480040941658137, "grad_norm": 0.2983026635325646, "learning_rate": 1.0198880019317913e-05, "loss": 0.2468, "step": 1700 }, { "epoch": 3.482088024564995, "grad_norm": 0.304844430252478, "learning_rate": 1.017394927507561e-05, "loss": 0.2252, "step": 1701 }, { "epoch": 3.4841351074718525, "grad_norm": 0.32219494329823706, "learning_rate": 1.0149038640922715e-05, "loss": 0.2576, "step": 1702 }, { "epoch": 3.4861821903787105, "grad_norm": 0.27799343218915756, "learning_rate": 1.0124148167841577e-05, "loss": 0.1703, "step": 1703 }, { "epoch": 3.488229273285568, "grad_norm": 0.32497007511792664, "learning_rate": 1.009927790677327e-05, "loss": 0.2366, "step": 1704 }, { "epoch": 3.490276356192426, "grad_norm": 0.3095778446432627, "learning_rate": 1.0074427908617515e-05, "loss": 0.2806, "step": 1705 }, { "epoch": 3.4923234390992834, "grad_norm": 0.3249340961496839, "learning_rate": 1.004959822423255e-05, "loss": 0.2406, "step": 1706 }, { "epoch": 3.4943705220061414, "grad_norm": 0.3092490415262631, "learning_rate": 1.0024788904435054e-05, "loss": 0.225, "step": 1707 }, { "epoch": 3.496417604912999, "grad_norm": 0.3376859697600405, "learning_rate": 1.0000000000000006e-05, "loss": 0.2859, "step": 1708 }, { "epoch": 3.498464687819857, "grad_norm": 0.2986922022588596, "learning_rate": 9.975231561660617e-06, "loss": 0.2241, "step": 1709 }, { "epoch": 3.5005117707267144, "grad_norm": 0.3304776655435591, "learning_rate": 9.950483640108215e-06, "loss": 0.2865, "step": 1710 }, { "epoch": 3.5025588536335723, "grad_norm": 0.30674425880777934, "learning_rate": 9.92575628599213e-06, "loss": 0.204, "step": 1711 }, { "epoch": 3.50460593654043, "grad_norm": 0.3372001770402179, "learning_rate": 9.901049549919601e-06, "loss": 0.2705, "step": 1712 }, { "epoch": 3.5066530194472874, "grad_norm": 0.28957256516814167, "learning_rate": 9.876363482455675e-06, "loss": 0.215, "step": 1713 }, { "epoch": 3.5087001023541453, "grad_norm": 0.36096631037593485, "learning_rate": 9.851698134123095e-06, "loss": 0.2522, "step": 1714 }, { "epoch": 3.5107471852610033, "grad_norm": 0.3128276504087451, "learning_rate": 9.827053555402191e-06, "loss": 0.2661, "step": 1715 }, { "epoch": 3.512794268167861, "grad_norm": 0.3126033507485691, "learning_rate": 9.802429796730792e-06, "loss": 0.2329, "step": 1716 }, { "epoch": 3.5148413510747183, "grad_norm": 0.33709412127432403, "learning_rate": 9.777826908504126e-06, "loss": 0.2673, "step": 1717 }, { "epoch": 3.5168884339815762, "grad_norm": 0.31504566512999976, "learning_rate": 9.753244941074696e-06, "loss": 0.1942, "step": 1718 }, { "epoch": 3.518935516888434, "grad_norm": 0.3081794499055162, "learning_rate": 9.728683944752193e-06, "loss": 0.23, "step": 1719 }, { "epoch": 3.5209825997952917, "grad_norm": 0.3205819253659646, "learning_rate": 9.704143969803392e-06, "loss": 0.2599, "step": 1720 }, { "epoch": 3.5230296827021492, "grad_norm": 0.3179566007920735, "learning_rate": 9.679625066452028e-06, "loss": 0.2468, "step": 1721 }, { "epoch": 3.525076765609007, "grad_norm": 0.33546341224030785, "learning_rate": 9.655127284878723e-06, "loss": 0.2285, "step": 1722 }, { "epoch": 3.527123848515865, "grad_norm": 0.3406356093850999, "learning_rate": 9.630650675220892e-06, "loss": 0.2277, "step": 1723 }, { "epoch": 3.5291709314227226, "grad_norm": 0.297528564327505, "learning_rate": 9.606195287572577e-06, "loss": 0.199, "step": 1724 }, { "epoch": 3.53121801432958, "grad_norm": 0.3221532376937218, "learning_rate": 9.581761171984416e-06, "loss": 0.2157, "step": 1725 }, { "epoch": 3.533265097236438, "grad_norm": 0.35339763123663803, "learning_rate": 9.557348378463503e-06, "loss": 0.2377, "step": 1726 }, { "epoch": 3.535312180143296, "grad_norm": 0.3011217732360328, "learning_rate": 9.532956956973302e-06, "loss": 0.2316, "step": 1727 }, { "epoch": 3.5373592630501536, "grad_norm": 0.31883004437994317, "learning_rate": 9.50858695743351e-06, "loss": 0.2358, "step": 1728 }, { "epoch": 3.539406345957011, "grad_norm": 0.4148367793712331, "learning_rate": 9.484238429720018e-06, "loss": 0.2412, "step": 1729 }, { "epoch": 3.541453428863869, "grad_norm": 0.3651190389759453, "learning_rate": 9.459911423664763e-06, "loss": 0.2496, "step": 1730 }, { "epoch": 3.5435005117707266, "grad_norm": 0.2782188953479953, "learning_rate": 9.435605989055607e-06, "loss": 0.2028, "step": 1731 }, { "epoch": 3.5455475946775845, "grad_norm": 0.31391987453704123, "learning_rate": 9.411322175636298e-06, "loss": 0.2561, "step": 1732 }, { "epoch": 3.547594677584442, "grad_norm": 0.3158415427773136, "learning_rate": 9.387060033106321e-06, "loss": 0.2956, "step": 1733 }, { "epoch": 3.5496417604913, "grad_norm": 0.3175891657245625, "learning_rate": 9.362819611120793e-06, "loss": 0.2566, "step": 1734 }, { "epoch": 3.5516888433981575, "grad_norm": 0.3312219072193364, "learning_rate": 9.338600959290414e-06, "loss": 0.2317, "step": 1735 }, { "epoch": 3.5537359263050154, "grad_norm": 0.3341176350083145, "learning_rate": 9.314404127181307e-06, "loss": 0.293, "step": 1736 }, { "epoch": 3.555783009211873, "grad_norm": 0.29940835888683603, "learning_rate": 9.290229164314928e-06, "loss": 0.2221, "step": 1737 }, { "epoch": 3.557830092118731, "grad_norm": 0.3046910176714613, "learning_rate": 9.266076120167992e-06, "loss": 0.2472, "step": 1738 }, { "epoch": 3.5598771750255884, "grad_norm": 0.2841913350596404, "learning_rate": 9.241945044172353e-06, "loss": 0.2277, "step": 1739 }, { "epoch": 3.5619242579324464, "grad_norm": 0.323279705340557, "learning_rate": 9.217835985714898e-06, "loss": 0.2709, "step": 1740 }, { "epoch": 3.563971340839304, "grad_norm": 0.29421984064136286, "learning_rate": 9.193748994137462e-06, "loss": 0.2064, "step": 1741 }, { "epoch": 3.566018423746162, "grad_norm": 0.3431113829919363, "learning_rate": 9.169684118736708e-06, "loss": 0.2581, "step": 1742 }, { "epoch": 3.5680655066530194, "grad_norm": 0.31454166876772843, "learning_rate": 9.145641408764048e-06, "loss": 0.2135, "step": 1743 }, { "epoch": 3.5701125895598773, "grad_norm": 0.3294672393878416, "learning_rate": 9.121620913425508e-06, "loss": 0.2607, "step": 1744 }, { "epoch": 3.572159672466735, "grad_norm": 0.29027976941333605, "learning_rate": 9.097622681881673e-06, "loss": 0.1969, "step": 1745 }, { "epoch": 3.574206755373593, "grad_norm": 0.3177811895638415, "learning_rate": 9.073646763247558e-06, "loss": 0.2103, "step": 1746 }, { "epoch": 3.5762538382804503, "grad_norm": 0.3196047544970432, "learning_rate": 9.04969320659249e-06, "loss": 0.239, "step": 1747 }, { "epoch": 3.5783009211873082, "grad_norm": 0.2925767290546702, "learning_rate": 9.025762060940062e-06, "loss": 0.198, "step": 1748 }, { "epoch": 3.5803480040941658, "grad_norm": 0.33053764466506, "learning_rate": 9.001853375267989e-06, "loss": 0.2366, "step": 1749 }, { "epoch": 3.5823950870010233, "grad_norm": 0.3174304755008507, "learning_rate": 8.977967198508001e-06, "loss": 0.2256, "step": 1750 }, { "epoch": 3.5844421699078812, "grad_norm": 0.343170859287156, "learning_rate": 8.954103579545785e-06, "loss": 0.2341, "step": 1751 }, { "epoch": 3.586489252814739, "grad_norm": 0.3215320683527329, "learning_rate": 8.93026256722085e-06, "loss": 0.2045, "step": 1752 }, { "epoch": 3.5885363357215967, "grad_norm": 0.345488281284008, "learning_rate": 8.906444210326441e-06, "loss": 0.2708, "step": 1753 }, { "epoch": 3.590583418628454, "grad_norm": 0.29374650212717546, "learning_rate": 8.882648557609434e-06, "loss": 0.2144, "step": 1754 }, { "epoch": 3.592630501535312, "grad_norm": 0.30791903954132194, "learning_rate": 8.858875657770241e-06, "loss": 0.2196, "step": 1755 }, { "epoch": 3.59467758444217, "grad_norm": 0.37604604250081547, "learning_rate": 8.83512555946271e-06, "loss": 0.2842, "step": 1756 }, { "epoch": 3.5967246673490276, "grad_norm": 0.2815977468791065, "learning_rate": 8.811398311294008e-06, "loss": 0.2128, "step": 1757 }, { "epoch": 3.598771750255885, "grad_norm": 0.32463396760239493, "learning_rate": 8.787693961824555e-06, "loss": 0.2635, "step": 1758 }, { "epoch": 3.600818833162743, "grad_norm": 0.3178563263559145, "learning_rate": 8.764012559567899e-06, "loss": 0.2749, "step": 1759 }, { "epoch": 3.602865916069601, "grad_norm": 0.32193828000242514, "learning_rate": 8.740354152990624e-06, "loss": 0.232, "step": 1760 }, { "epoch": 3.6049129989764586, "grad_norm": 0.35115161370306397, "learning_rate": 8.716718790512251e-06, "loss": 0.2421, "step": 1761 }, { "epoch": 3.606960081883316, "grad_norm": 0.3150133989578825, "learning_rate": 8.693106520505147e-06, "loss": 0.2078, "step": 1762 }, { "epoch": 3.609007164790174, "grad_norm": 0.34022251620199256, "learning_rate": 8.669517391294397e-06, "loss": 0.248, "step": 1763 }, { "epoch": 3.611054247697032, "grad_norm": 0.3039508397036727, "learning_rate": 8.645951451157741e-06, "loss": 0.187, "step": 1764 }, { "epoch": 3.6131013306038895, "grad_norm": 0.3287212950737981, "learning_rate": 8.622408748325461e-06, "loss": 0.2774, "step": 1765 }, { "epoch": 3.615148413510747, "grad_norm": 0.3159497126746659, "learning_rate": 8.598889330980277e-06, "loss": 0.2251, "step": 1766 }, { "epoch": 3.617195496417605, "grad_norm": 0.2931231286120822, "learning_rate": 8.575393247257256e-06, "loss": 0.2267, "step": 1767 }, { "epoch": 3.619242579324463, "grad_norm": 0.3287345655736662, "learning_rate": 8.551920545243704e-06, "loss": 0.24, "step": 1768 }, { "epoch": 3.6212896622313204, "grad_norm": 0.30400959536179484, "learning_rate": 8.528471272979083e-06, "loss": 0.2133, "step": 1769 }, { "epoch": 3.623336745138178, "grad_norm": 0.31203010646854484, "learning_rate": 8.50504547845489e-06, "loss": 0.2404, "step": 1770 }, { "epoch": 3.625383828045036, "grad_norm": 0.30015537192427894, "learning_rate": 8.481643209614576e-06, "loss": 0.2059, "step": 1771 }, { "epoch": 3.6274309109518934, "grad_norm": 0.295949459736467, "learning_rate": 8.45826451435347e-06, "loss": 0.2209, "step": 1772 }, { "epoch": 3.6294779938587514, "grad_norm": 0.32651774554006335, "learning_rate": 8.434909440518613e-06, "loss": 0.258, "step": 1773 }, { "epoch": 3.631525076765609, "grad_norm": 0.28180464315900705, "learning_rate": 8.411578035908728e-06, "loss": 0.205, "step": 1774 }, { "epoch": 3.633572159672467, "grad_norm": 0.3413634143023636, "learning_rate": 8.388270348274092e-06, "loss": 0.2769, "step": 1775 }, { "epoch": 3.6356192425793243, "grad_norm": 0.33236322758981973, "learning_rate": 8.364986425316448e-06, "loss": 0.2234, "step": 1776 }, { "epoch": 3.6376663254861823, "grad_norm": 0.3020595253520059, "learning_rate": 8.341726314688875e-06, "loss": 0.2509, "step": 1777 }, { "epoch": 3.63971340839304, "grad_norm": 0.31852165056270737, "learning_rate": 8.318490063995761e-06, "loss": 0.2537, "step": 1778 }, { "epoch": 3.6417604912998978, "grad_norm": 0.2944306330035562, "learning_rate": 8.295277720792634e-06, "loss": 0.2222, "step": 1779 }, { "epoch": 3.6438075742067553, "grad_norm": 0.3191896831407699, "learning_rate": 8.272089332586089e-06, "loss": 0.2437, "step": 1780 }, { "epoch": 3.6458546571136132, "grad_norm": 0.3341784697234189, "learning_rate": 8.248924946833705e-06, "loss": 0.25, "step": 1781 }, { "epoch": 3.6479017400204707, "grad_norm": 0.28526708897000397, "learning_rate": 8.225784610943948e-06, "loss": 0.2586, "step": 1782 }, { "epoch": 3.6499488229273287, "grad_norm": 0.30819644727048756, "learning_rate": 8.20266837227603e-06, "loss": 0.2482, "step": 1783 }, { "epoch": 3.651995905834186, "grad_norm": 0.3391130155927512, "learning_rate": 8.179576278139872e-06, "loss": 0.2703, "step": 1784 }, { "epoch": 3.654042988741044, "grad_norm": 0.3457772155494689, "learning_rate": 8.156508375795995e-06, "loss": 0.2138, "step": 1785 }, { "epoch": 3.6560900716479017, "grad_norm": 0.3545988817947354, "learning_rate": 8.133464712455364e-06, "loss": 0.2381, "step": 1786 }, { "epoch": 3.6581371545547596, "grad_norm": 0.3051355101331329, "learning_rate": 8.11044533527937e-06, "loss": 0.2212, "step": 1787 }, { "epoch": 3.660184237461617, "grad_norm": 0.31093454380049773, "learning_rate": 8.087450291379693e-06, "loss": 0.2782, "step": 1788 }, { "epoch": 3.662231320368475, "grad_norm": 0.3076008520431847, "learning_rate": 8.064479627818213e-06, "loss": 0.2563, "step": 1789 }, { "epoch": 3.6642784032753326, "grad_norm": 0.2914759795466477, "learning_rate": 8.041533391606892e-06, "loss": 0.237, "step": 1790 }, { "epoch": 3.66632548618219, "grad_norm": 0.32811153076900396, "learning_rate": 8.018611629707735e-06, "loss": 0.2192, "step": 1791 }, { "epoch": 3.668372569089048, "grad_norm": 0.3349979236041854, "learning_rate": 7.995714389032638e-06, "loss": 0.2544, "step": 1792 }, { "epoch": 3.670419651995906, "grad_norm": 0.30481283300379364, "learning_rate": 7.972841716443304e-06, "loss": 0.2021, "step": 1793 }, { "epoch": 3.6724667349027635, "grad_norm": 0.31038423151434946, "learning_rate": 7.949993658751168e-06, "loss": 0.2714, "step": 1794 }, { "epoch": 3.674513817809621, "grad_norm": 0.3046182836563986, "learning_rate": 7.927170262717284e-06, "loss": 0.2486, "step": 1795 }, { "epoch": 3.676560900716479, "grad_norm": 0.321493522566335, "learning_rate": 7.904371575052224e-06, "loss": 0.257, "step": 1796 }, { "epoch": 3.678607983623337, "grad_norm": 0.32116628196410996, "learning_rate": 7.881597642416012e-06, "loss": 0.2351, "step": 1797 }, { "epoch": 3.6806550665301945, "grad_norm": 0.3225460384671046, "learning_rate": 7.858848511417998e-06, "loss": 0.2787, "step": 1798 }, { "epoch": 3.682702149437052, "grad_norm": 0.3065477845223782, "learning_rate": 7.836124228616762e-06, "loss": 0.2059, "step": 1799 }, { "epoch": 3.68474923234391, "grad_norm": 0.2820736922840727, "learning_rate": 7.81342484052004e-06, "loss": 0.2065, "step": 1800 }, { "epoch": 3.686796315250768, "grad_norm": 0.3199543711113544, "learning_rate": 7.790750393584616e-06, "loss": 0.2482, "step": 1801 }, { "epoch": 3.6888433981576254, "grad_norm": 0.3041948190106927, "learning_rate": 7.768100934216234e-06, "loss": 0.2278, "step": 1802 }, { "epoch": 3.690890481064483, "grad_norm": 0.3053390429608892, "learning_rate": 7.745476508769494e-06, "loss": 0.2356, "step": 1803 }, { "epoch": 3.692937563971341, "grad_norm": 0.3110750944178115, "learning_rate": 7.72287716354776e-06, "loss": 0.2402, "step": 1804 }, { "epoch": 3.694984646878199, "grad_norm": 0.28367962348790254, "learning_rate": 7.700302944803076e-06, "loss": 0.1827, "step": 1805 }, { "epoch": 3.6970317297850563, "grad_norm": 0.33323381054790285, "learning_rate": 7.67775389873604e-06, "loss": 0.2293, "step": 1806 }, { "epoch": 3.699078812691914, "grad_norm": 0.3521970989279458, "learning_rate": 7.65523007149575e-06, "loss": 0.2657, "step": 1807 }, { "epoch": 3.701125895598772, "grad_norm": 0.30699833196534315, "learning_rate": 7.63273150917969e-06, "loss": 0.2421, "step": 1808 }, { "epoch": 3.7031729785056293, "grad_norm": 0.31416536601357237, "learning_rate": 7.6102582578336315e-06, "loss": 0.1997, "step": 1809 }, { "epoch": 3.7052200614124873, "grad_norm": 0.3803922818493654, "learning_rate": 7.587810363451544e-06, "loss": 0.2428, "step": 1810 }, { "epoch": 3.707267144319345, "grad_norm": 0.32978409370696515, "learning_rate": 7.565387871975511e-06, "loss": 0.2037, "step": 1811 }, { "epoch": 3.7093142272262027, "grad_norm": 0.33553313939376517, "learning_rate": 7.5429908292956045e-06, "loss": 0.2675, "step": 1812 }, { "epoch": 3.7113613101330603, "grad_norm": 0.3164776677964802, "learning_rate": 7.5206192812498345e-06, "loss": 0.262, "step": 1813 }, { "epoch": 3.713408393039918, "grad_norm": 0.29867646878594656, "learning_rate": 7.498273273624022e-06, "loss": 0.2468, "step": 1814 }, { "epoch": 3.7154554759467757, "grad_norm": 0.31216831365580633, "learning_rate": 7.475952852151722e-06, "loss": 0.2225, "step": 1815 }, { "epoch": 3.7175025588536337, "grad_norm": 0.2854940367151503, "learning_rate": 7.4536580625141244e-06, "loss": 0.2302, "step": 1816 }, { "epoch": 3.719549641760491, "grad_norm": 0.3133497229188554, "learning_rate": 7.431388950339955e-06, "loss": 0.2188, "step": 1817 }, { "epoch": 3.721596724667349, "grad_norm": 0.3115695729233575, "learning_rate": 7.409145561205402e-06, "loss": 0.2251, "step": 1818 }, { "epoch": 3.7236438075742067, "grad_norm": 0.3211680619051747, "learning_rate": 7.386927940633981e-06, "loss": 0.244, "step": 1819 }, { "epoch": 3.7256908904810646, "grad_norm": 0.3048950617764936, "learning_rate": 7.364736134096497e-06, "loss": 0.264, "step": 1820 }, { "epoch": 3.727737973387922, "grad_norm": 0.2939363289247235, "learning_rate": 7.342570187010913e-06, "loss": 0.235, "step": 1821 }, { "epoch": 3.72978505629478, "grad_norm": 0.3233466134307953, "learning_rate": 7.32043014474227e-06, "loss": 0.306, "step": 1822 }, { "epoch": 3.7318321392016376, "grad_norm": 0.30575032489620757, "learning_rate": 7.2983160526025854e-06, "loss": 0.2719, "step": 1823 }, { "epoch": 3.7338792221084955, "grad_norm": 0.297249466993302, "learning_rate": 7.276227955850774e-06, "loss": 0.235, "step": 1824 }, { "epoch": 3.735926305015353, "grad_norm": 0.29688130457735584, "learning_rate": 7.254165899692554e-06, "loss": 0.2313, "step": 1825 }, { "epoch": 3.737973387922211, "grad_norm": 0.315867334334272, "learning_rate": 7.2321299292803275e-06, "loss": 0.2554, "step": 1826 }, { "epoch": 3.7400204708290685, "grad_norm": 0.35865826681363316, "learning_rate": 7.210120089713117e-06, "loss": 0.2657, "step": 1827 }, { "epoch": 3.742067553735926, "grad_norm": 0.29556390069528465, "learning_rate": 7.188136426036498e-06, "loss": 0.2309, "step": 1828 }, { "epoch": 3.744114636642784, "grad_norm": 0.3062750470379352, "learning_rate": 7.166178983242425e-06, "loss": 0.25, "step": 1829 }, { "epoch": 3.746161719549642, "grad_norm": 0.3388760658555746, "learning_rate": 7.1442478062692135e-06, "loss": 0.2837, "step": 1830 }, { "epoch": 3.7482088024564995, "grad_norm": 0.3198475086242932, "learning_rate": 7.12234294000143e-06, "loss": 0.2334, "step": 1831 }, { "epoch": 3.750255885363357, "grad_norm": 0.3089175153993069, "learning_rate": 7.100464429269769e-06, "loss": 0.2647, "step": 1832 }, { "epoch": 3.752302968270215, "grad_norm": 0.28196699295142263, "learning_rate": 7.078612318850999e-06, "loss": 0.2168, "step": 1833 }, { "epoch": 3.754350051177073, "grad_norm": 0.297420997662474, "learning_rate": 7.056786653467882e-06, "loss": 0.2028, "step": 1834 }, { "epoch": 3.7563971340839304, "grad_norm": 0.28045674846802615, "learning_rate": 7.034987477789008e-06, "loss": 0.1939, "step": 1835 }, { "epoch": 3.758444216990788, "grad_norm": 0.311062703079177, "learning_rate": 7.01321483642879e-06, "loss": 0.2611, "step": 1836 }, { "epoch": 3.760491299897646, "grad_norm": 0.297760397539936, "learning_rate": 6.991468773947321e-06, "loss": 0.2575, "step": 1837 }, { "epoch": 3.762538382804504, "grad_norm": 0.2972100075369938, "learning_rate": 6.969749334850308e-06, "loss": 0.2088, "step": 1838 }, { "epoch": 3.7645854657113613, "grad_norm": 0.3155194168733677, "learning_rate": 6.948056563588943e-06, "loss": 0.2469, "step": 1839 }, { "epoch": 3.766632548618219, "grad_norm": 0.33253411719664255, "learning_rate": 6.926390504559879e-06, "loss": 0.2066, "step": 1840 }, { "epoch": 3.768679631525077, "grad_norm": 0.31443745809454954, "learning_rate": 6.90475120210508e-06, "loss": 0.2462, "step": 1841 }, { "epoch": 3.7707267144319347, "grad_norm": 0.2916854017686706, "learning_rate": 6.883138700511735e-06, "loss": 0.206, "step": 1842 }, { "epoch": 3.7727737973387923, "grad_norm": 0.29896805378427554, "learning_rate": 6.861553044012206e-06, "loss": 0.2458, "step": 1843 }, { "epoch": 3.7748208802456498, "grad_norm": 0.3070231834047609, "learning_rate": 6.8399942767839075e-06, "loss": 0.2375, "step": 1844 }, { "epoch": 3.7768679631525077, "grad_norm": 0.35269614289266477, "learning_rate": 6.818462442949203e-06, "loss": 0.2354, "step": 1845 }, { "epoch": 3.7789150460593657, "grad_norm": 0.2885923676811397, "learning_rate": 6.796957586575364e-06, "loss": 0.2669, "step": 1846 }, { "epoch": 3.780962128966223, "grad_norm": 0.31946737683114335, "learning_rate": 6.775479751674439e-06, "loss": 0.2292, "step": 1847 }, { "epoch": 3.7830092118730807, "grad_norm": 0.31027789747129964, "learning_rate": 6.754028982203154e-06, "loss": 0.2204, "step": 1848 }, { "epoch": 3.7850562947799387, "grad_norm": 0.28163894937364203, "learning_rate": 6.732605322062869e-06, "loss": 0.2131, "step": 1849 }, { "epoch": 3.787103377686796, "grad_norm": 0.32382848241213846, "learning_rate": 6.711208815099451e-06, "loss": 0.2349, "step": 1850 }, { "epoch": 3.789150460593654, "grad_norm": 0.3138159392771698, "learning_rate": 6.689839505103195e-06, "loss": 0.2214, "step": 1851 }, { "epoch": 3.7911975435005116, "grad_norm": 0.3336295131707369, "learning_rate": 6.668497435808736e-06, "loss": 0.2318, "step": 1852 }, { "epoch": 3.7932446264073696, "grad_norm": 0.2805986894444167, "learning_rate": 6.647182650894956e-06, "loss": 0.1898, "step": 1853 }, { "epoch": 3.795291709314227, "grad_norm": 0.311652087268774, "learning_rate": 6.6258951939849055e-06, "loss": 0.2244, "step": 1854 }, { "epoch": 3.797338792221085, "grad_norm": 0.33194907539363816, "learning_rate": 6.604635108645683e-06, "loss": 0.272, "step": 1855 }, { "epoch": 3.7993858751279426, "grad_norm": 0.2974799213005746, "learning_rate": 6.583402438388391e-06, "loss": 0.2967, "step": 1856 }, { "epoch": 3.8014329580348005, "grad_norm": 0.31133998039092214, "learning_rate": 6.562197226668015e-06, "loss": 0.2587, "step": 1857 }, { "epoch": 3.803480040941658, "grad_norm": 0.30821960779378943, "learning_rate": 6.5410195168833425e-06, "loss": 0.242, "step": 1858 }, { "epoch": 3.805527123848516, "grad_norm": 0.30511264324428783, "learning_rate": 6.519869352376878e-06, "loss": 0.2318, "step": 1859 }, { "epoch": 3.8075742067553735, "grad_norm": 0.2885137726399915, "learning_rate": 6.498746776434759e-06, "loss": 0.2412, "step": 1860 }, { "epoch": 3.8096212896622315, "grad_norm": 0.30572665106073443, "learning_rate": 6.477651832286633e-06, "loss": 0.2714, "step": 1861 }, { "epoch": 3.811668372569089, "grad_norm": 0.2999528878346436, "learning_rate": 6.456584563105628e-06, "loss": 0.225, "step": 1862 }, { "epoch": 3.813715455475947, "grad_norm": 0.306996947567515, "learning_rate": 6.435545012008213e-06, "loss": 0.2228, "step": 1863 }, { "epoch": 3.8157625383828044, "grad_norm": 0.30156526722849053, "learning_rate": 6.414533222054138e-06, "loss": 0.2271, "step": 1864 }, { "epoch": 3.8178096212896624, "grad_norm": 0.3027861977559771, "learning_rate": 6.393549236246333e-06, "loss": 0.2358, "step": 1865 }, { "epoch": 3.81985670419652, "grad_norm": 0.3100676988224775, "learning_rate": 6.372593097530822e-06, "loss": 0.2224, "step": 1866 }, { "epoch": 3.821903787103378, "grad_norm": 0.30881142890916535, "learning_rate": 6.3516648487966456e-06, "loss": 0.212, "step": 1867 }, { "epoch": 3.8239508700102354, "grad_norm": 0.3074430788928016, "learning_rate": 6.330764532875748e-06, "loss": 0.2559, "step": 1868 }, { "epoch": 3.825997952917093, "grad_norm": 0.29993534561417257, "learning_rate": 6.309892192542919e-06, "loss": 0.2414, "step": 1869 }, { "epoch": 3.828045035823951, "grad_norm": 0.2948226082874635, "learning_rate": 6.289047870515692e-06, "loss": 0.2634, "step": 1870 }, { "epoch": 3.830092118730809, "grad_norm": 0.35876524638729923, "learning_rate": 6.268231609454254e-06, "loss": 0.3221, "step": 1871 }, { "epoch": 3.8321392016376663, "grad_norm": 0.29964527729201973, "learning_rate": 6.247443451961366e-06, "loss": 0.2046, "step": 1872 }, { "epoch": 3.834186284544524, "grad_norm": 0.32365319216721733, "learning_rate": 6.226683440582268e-06, "loss": 0.2233, "step": 1873 }, { "epoch": 3.8362333674513818, "grad_norm": 0.3064343796948702, "learning_rate": 6.2059516178046064e-06, "loss": 0.2548, "step": 1874 }, { "epoch": 3.8382804503582397, "grad_norm": 0.31074022748353647, "learning_rate": 6.185248026058312e-06, "loss": 0.2682, "step": 1875 }, { "epoch": 3.8403275332650972, "grad_norm": 0.3001350448200465, "learning_rate": 6.164572707715564e-06, "loss": 0.2287, "step": 1876 }, { "epoch": 3.8423746161719547, "grad_norm": 0.3230753170136733, "learning_rate": 6.143925705090666e-06, "loss": 0.3244, "step": 1877 }, { "epoch": 3.8444216990788127, "grad_norm": 0.2998343099705012, "learning_rate": 6.123307060439967e-06, "loss": 0.2163, "step": 1878 }, { "epoch": 3.8464687819856707, "grad_norm": 0.3171615427794975, "learning_rate": 6.102716815961787e-06, "loss": 0.2126, "step": 1879 }, { "epoch": 3.848515864892528, "grad_norm": 0.30038516037265045, "learning_rate": 6.082155013796323e-06, "loss": 0.2197, "step": 1880 }, { "epoch": 3.8505629477993857, "grad_norm": 0.2706760721698934, "learning_rate": 6.061621696025539e-06, "loss": 0.203, "step": 1881 }, { "epoch": 3.8526100307062436, "grad_norm": 0.3019695460717632, "learning_rate": 6.041116904673125e-06, "loss": 0.243, "step": 1882 }, { "epoch": 3.8546571136131016, "grad_norm": 0.3238748626818905, "learning_rate": 6.020640681704402e-06, "loss": 0.2387, "step": 1883 }, { "epoch": 3.856704196519959, "grad_norm": 0.2845226343722909, "learning_rate": 6.000193069026181e-06, "loss": 0.1939, "step": 1884 }, { "epoch": 3.8587512794268166, "grad_norm": 0.30062466940994315, "learning_rate": 5.979774108486751e-06, "loss": 0.2417, "step": 1885 }, { "epoch": 3.8607983623336746, "grad_norm": 0.29657229767791343, "learning_rate": 5.95938384187575e-06, "loss": 0.2513, "step": 1886 }, { "epoch": 3.862845445240532, "grad_norm": 0.2965578800271511, "learning_rate": 5.939022310924099e-06, "loss": 0.238, "step": 1887 }, { "epoch": 3.86489252814739, "grad_norm": 0.28944984642689814, "learning_rate": 5.918689557303885e-06, "loss": 0.233, "step": 1888 }, { "epoch": 3.8669396110542475, "grad_norm": 0.3405343709910588, "learning_rate": 5.898385622628336e-06, "loss": 0.2568, "step": 1889 }, { "epoch": 3.8689866939611055, "grad_norm": 0.31192910196034185, "learning_rate": 5.878110548451675e-06, "loss": 0.255, "step": 1890 }, { "epoch": 3.871033776867963, "grad_norm": 0.3090737941594832, "learning_rate": 5.857864376269051e-06, "loss": 0.2356, "step": 1891 }, { "epoch": 3.873080859774821, "grad_norm": 0.27438554656264486, "learning_rate": 5.837647147516483e-06, "loss": 0.19, "step": 1892 }, { "epoch": 3.8751279426816785, "grad_norm": 0.3021169809249445, "learning_rate": 5.817458903570747e-06, "loss": 0.2248, "step": 1893 }, { "epoch": 3.8771750255885364, "grad_norm": 0.3213447653550202, "learning_rate": 5.7972996857492896e-06, "loss": 0.2269, "step": 1894 }, { "epoch": 3.879222108495394, "grad_norm": 0.33058912837962084, "learning_rate": 5.777169535310152e-06, "loss": 0.2359, "step": 1895 }, { "epoch": 3.881269191402252, "grad_norm": 0.28572806654091426, "learning_rate": 5.7570684934519135e-06, "loss": 0.2147, "step": 1896 }, { "epoch": 3.8833162743091094, "grad_norm": 0.32405838839318063, "learning_rate": 5.736996601313545e-06, "loss": 0.25, "step": 1897 }, { "epoch": 3.8853633572159674, "grad_norm": 0.31407112083239963, "learning_rate": 5.716953899974371e-06, "loss": 0.2628, "step": 1898 }, { "epoch": 3.887410440122825, "grad_norm": 0.24848541316900655, "learning_rate": 5.696940430453981e-06, "loss": 0.1664, "step": 1899 }, { "epoch": 3.889457523029683, "grad_norm": 0.3315987651736212, "learning_rate": 5.676956233712139e-06, "loss": 0.2637, "step": 1900 }, { "epoch": 3.8915046059365404, "grad_norm": 0.2855594139469349, "learning_rate": 5.657001350648674e-06, "loss": 0.207, "step": 1901 }, { "epoch": 3.8935516888433983, "grad_norm": 0.3282465345015928, "learning_rate": 5.6370758221034595e-06, "loss": 0.2535, "step": 1902 }, { "epoch": 3.895598771750256, "grad_norm": 0.30475560401896684, "learning_rate": 5.617179688856271e-06, "loss": 0.2432, "step": 1903 }, { "epoch": 3.8976458546571138, "grad_norm": 0.30871123340361434, "learning_rate": 5.597312991626713e-06, "loss": 0.2134, "step": 1904 }, { "epoch": 3.8996929375639713, "grad_norm": 0.33995724036296693, "learning_rate": 5.577475771074168e-06, "loss": 0.2485, "step": 1905 }, { "epoch": 3.901740020470829, "grad_norm": 0.2811551664838514, "learning_rate": 5.557668067797677e-06, "loss": 0.2453, "step": 1906 }, { "epoch": 3.9037871033776868, "grad_norm": 0.3085482770411968, "learning_rate": 5.537889922335877e-06, "loss": 0.2841, "step": 1907 }, { "epoch": 3.9058341862845447, "grad_norm": 0.291552651807712, "learning_rate": 5.5181413751669125e-06, "loss": 0.2519, "step": 1908 }, { "epoch": 3.907881269191402, "grad_norm": 0.301795002796971, "learning_rate": 5.498422466708349e-06, "loss": 0.2157, "step": 1909 }, { "epoch": 3.9099283520982597, "grad_norm": 0.33010524936332025, "learning_rate": 5.478733237317084e-06, "loss": 0.2272, "step": 1910 }, { "epoch": 3.9119754350051177, "grad_norm": 0.30279565810368547, "learning_rate": 5.459073727289291e-06, "loss": 0.2462, "step": 1911 }, { "epoch": 3.9140225179119756, "grad_norm": 0.3180781810526128, "learning_rate": 5.439443976860306e-06, "loss": 0.2358, "step": 1912 }, { "epoch": 3.916069600818833, "grad_norm": 0.3040038712809124, "learning_rate": 5.419844026204568e-06, "loss": 0.2266, "step": 1913 }, { "epoch": 3.9181166837256907, "grad_norm": 0.31115682004986517, "learning_rate": 5.400273915435526e-06, "loss": 0.2706, "step": 1914 }, { "epoch": 3.9201637666325486, "grad_norm": 0.2723072282278328, "learning_rate": 5.38073368460555e-06, "loss": 0.2172, "step": 1915 }, { "epoch": 3.9222108495394066, "grad_norm": 0.3236989253933042, "learning_rate": 5.361223373705873e-06, "loss": 0.2671, "step": 1916 }, { "epoch": 3.924257932446264, "grad_norm": 0.2949286813519759, "learning_rate": 5.341743022666468e-06, "loss": 0.2575, "step": 1917 }, { "epoch": 3.9263050153531216, "grad_norm": 0.2929617720522718, "learning_rate": 5.32229267135602e-06, "loss": 0.2477, "step": 1918 }, { "epoch": 3.9283520982599796, "grad_norm": 0.29944445931066044, "learning_rate": 5.302872359581799e-06, "loss": 0.3016, "step": 1919 }, { "epoch": 3.9303991811668375, "grad_norm": 0.2792530693391496, "learning_rate": 5.283482127089603e-06, "loss": 0.2116, "step": 1920 }, { "epoch": 3.932446264073695, "grad_norm": 0.3148045265972349, "learning_rate": 5.2641220135636685e-06, "loss": 0.251, "step": 1921 }, { "epoch": 3.9344933469805525, "grad_norm": 0.29519836172160036, "learning_rate": 5.244792058626587e-06, "loss": 0.2379, "step": 1922 }, { "epoch": 3.9365404298874105, "grad_norm": 0.27640580348684785, "learning_rate": 5.2254923018392344e-06, "loss": 0.2094, "step": 1923 }, { "epoch": 3.9385875127942684, "grad_norm": 0.3216119033954627, "learning_rate": 5.206222782700667e-06, "loss": 0.2226, "step": 1924 }, { "epoch": 3.940634595701126, "grad_norm": 0.28724430058767236, "learning_rate": 5.186983540648074e-06, "loss": 0.229, "step": 1925 }, { "epoch": 3.9426816786079835, "grad_norm": 0.30838073757141554, "learning_rate": 5.167774615056669e-06, "loss": 0.2471, "step": 1926 }, { "epoch": 3.9447287615148414, "grad_norm": 0.3050237811038075, "learning_rate": 5.1485960452396266e-06, "loss": 0.2516, "step": 1927 }, { "epoch": 3.946775844421699, "grad_norm": 0.3269582811451654, "learning_rate": 5.1294478704479896e-06, "loss": 0.2757, "step": 1928 }, { "epoch": 3.948822927328557, "grad_norm": 0.32559352766006666, "learning_rate": 5.1103301298705995e-06, "loss": 0.2234, "step": 1929 }, { "epoch": 3.9508700102354144, "grad_norm": 0.3018696607635989, "learning_rate": 5.091242862634e-06, "loss": 0.2112, "step": 1930 }, { "epoch": 3.9529170931422724, "grad_norm": 0.28807273157869273, "learning_rate": 5.072186107802377e-06, "loss": 0.2698, "step": 1931 }, { "epoch": 3.95496417604913, "grad_norm": 0.324456956245341, "learning_rate": 5.05315990437747e-06, "loss": 0.2451, "step": 1932 }, { "epoch": 3.957011258955988, "grad_norm": 0.314429989537044, "learning_rate": 5.0341642912984844e-06, "loss": 0.2311, "step": 1933 }, { "epoch": 3.9590583418628453, "grad_norm": 0.2926963524885184, "learning_rate": 5.015199307442027e-06, "loss": 0.2418, "step": 1934 }, { "epoch": 3.9611054247697033, "grad_norm": 0.30442791024762783, "learning_rate": 4.996264991622015e-06, "loss": 0.2513, "step": 1935 }, { "epoch": 3.963152507676561, "grad_norm": 0.30348062095693085, "learning_rate": 4.977361382589607e-06, "loss": 0.217, "step": 1936 }, { "epoch": 3.9651995905834188, "grad_norm": 0.2876460419928197, "learning_rate": 4.958488519033096e-06, "loss": 0.2143, "step": 1937 }, { "epoch": 3.9672466734902763, "grad_norm": 0.3075426518738084, "learning_rate": 4.939646439577868e-06, "loss": 0.2275, "step": 1938 }, { "epoch": 3.969293756397134, "grad_norm": 0.3060349427123557, "learning_rate": 4.920835182786316e-06, "loss": 0.2371, "step": 1939 }, { "epoch": 3.9713408393039917, "grad_norm": 0.31280082710142615, "learning_rate": 4.9020547871577265e-06, "loss": 0.2234, "step": 1940 }, { "epoch": 3.9733879222108497, "grad_norm": 0.3134880106721245, "learning_rate": 4.8833052911282375e-06, "loss": 0.2492, "step": 1941 }, { "epoch": 3.975435005117707, "grad_norm": 0.2953871360194199, "learning_rate": 4.864586733070755e-06, "loss": 0.221, "step": 1942 }, { "epoch": 3.977482088024565, "grad_norm": 0.31671009607399275, "learning_rate": 4.845899151294848e-06, "loss": 0.2335, "step": 1943 }, { "epoch": 3.9795291709314227, "grad_norm": 0.2908681144449216, "learning_rate": 4.827242584046698e-06, "loss": 0.2767, "step": 1944 }, { "epoch": 3.9815762538382806, "grad_norm": 0.2931420599842891, "learning_rate": 4.808617069509034e-06, "loss": 0.2475, "step": 1945 }, { "epoch": 3.983623336745138, "grad_norm": 0.2955583206381109, "learning_rate": 4.790022645800994e-06, "loss": 0.2737, "step": 1946 }, { "epoch": 3.9856704196519956, "grad_norm": 0.2873161139594403, "learning_rate": 4.77145935097811e-06, "loss": 0.2116, "step": 1947 }, { "epoch": 3.9877175025588536, "grad_norm": 0.30419560839821447, "learning_rate": 4.752927223032196e-06, "loss": 0.2261, "step": 1948 }, { "epoch": 3.9897645854657116, "grad_norm": 0.3199774705798064, "learning_rate": 4.7344262998912885e-06, "loss": 0.2478, "step": 1949 }, { "epoch": 3.991811668372569, "grad_norm": 0.31331370811795206, "learning_rate": 4.715956619419539e-06, "loss": 0.2427, "step": 1950 }, { "epoch": 3.9938587512794266, "grad_norm": 0.3035495734347721, "learning_rate": 4.697518219417188e-06, "loss": 0.2412, "step": 1951 }, { "epoch": 3.9959058341862845, "grad_norm": 0.2835191629580971, "learning_rate": 4.679111137620442e-06, "loss": 0.2054, "step": 1952 }, { "epoch": 3.9979529170931425, "grad_norm": 0.29245602269669696, "learning_rate": 4.660735411701398e-06, "loss": 0.2047, "step": 1953 }, { "epoch": 4.0, "grad_norm": 0.7074262659570464, "learning_rate": 4.6423910792680005e-06, "loss": 0.2856, "step": 1954 }, { "epoch": 4.0020470829068575, "grad_norm": 0.4712597575605154, "learning_rate": 4.62407817786394e-06, "loss": 0.1814, "step": 1955 }, { "epoch": 4.004094165813715, "grad_norm": 0.38151984418088236, "learning_rate": 4.605796744968556e-06, "loss": 0.1724, "step": 1956 }, { "epoch": 4.006141248720573, "grad_norm": 0.34020967642434874, "learning_rate": 4.587546817996826e-06, "loss": 0.149, "step": 1957 }, { "epoch": 4.008188331627431, "grad_norm": 0.31262949437875953, "learning_rate": 4.56932843429922e-06, "loss": 0.161, "step": 1958 }, { "epoch": 4.0102354145342884, "grad_norm": 0.4303302435498056, "learning_rate": 4.551141631161651e-06, "loss": 0.1656, "step": 1959 }, { "epoch": 4.012282497441146, "grad_norm": 0.4285520812331891, "learning_rate": 4.532986445805405e-06, "loss": 0.1627, "step": 1960 }, { "epoch": 4.014329580348004, "grad_norm": 0.4192066755161973, "learning_rate": 4.514862915387059e-06, "loss": 0.1827, "step": 1961 }, { "epoch": 4.016376663254862, "grad_norm": 0.3658567404594509, "learning_rate": 4.496771076998405e-06, "loss": 0.2017, "step": 1962 }, { "epoch": 4.018423746161719, "grad_norm": 0.3317640866707838, "learning_rate": 4.478710967666371e-06, "loss": 0.1817, "step": 1963 }, { "epoch": 4.020470829068577, "grad_norm": 0.38931119420729465, "learning_rate": 4.460682624352952e-06, "loss": 0.1582, "step": 1964 }, { "epoch": 4.022517911975435, "grad_norm": 0.4025748777608708, "learning_rate": 4.442686083955132e-06, "loss": 0.1692, "step": 1965 }, { "epoch": 4.024564994882293, "grad_norm": 0.3465330209700674, "learning_rate": 4.424721383304791e-06, "loss": 0.1623, "step": 1966 }, { "epoch": 4.02661207778915, "grad_norm": 0.33474185562540504, "learning_rate": 4.4067885591686625e-06, "loss": 0.1629, "step": 1967 }, { "epoch": 4.028659160696008, "grad_norm": 0.2994162000223648, "learning_rate": 4.388887648248237e-06, "loss": 0.1943, "step": 1968 }, { "epoch": 4.030706243602866, "grad_norm": 0.30244570971558177, "learning_rate": 4.371018687179689e-06, "loss": 0.2009, "step": 1969 }, { "epoch": 4.032753326509724, "grad_norm": 0.31048799941390864, "learning_rate": 4.353181712533807e-06, "loss": 0.1763, "step": 1970 }, { "epoch": 4.034800409416581, "grad_norm": 0.31183964535945935, "learning_rate": 4.3353767608159125e-06, "loss": 0.1695, "step": 1971 }, { "epoch": 4.036847492323439, "grad_norm": 0.31495339654929116, "learning_rate": 4.317603868465794e-06, "loss": 0.156, "step": 1972 }, { "epoch": 4.038894575230297, "grad_norm": 0.2985528590711877, "learning_rate": 4.299863071857617e-06, "loss": 0.1687, "step": 1973 }, { "epoch": 4.040941658137155, "grad_norm": 0.2898165046108191, "learning_rate": 4.2821544072998655e-06, "loss": 0.1689, "step": 1974 }, { "epoch": 4.042988741044012, "grad_norm": 0.2806552847148077, "learning_rate": 4.264477911035265e-06, "loss": 0.1463, "step": 1975 }, { "epoch": 4.04503582395087, "grad_norm": 0.2973917660709716, "learning_rate": 4.246833619240702e-06, "loss": 0.1452, "step": 1976 }, { "epoch": 4.047082906857728, "grad_norm": 0.3137432130391123, "learning_rate": 4.229221568027151e-06, "loss": 0.1821, "step": 1977 }, { "epoch": 4.049129989764586, "grad_norm": 0.29853330078810963, "learning_rate": 4.211641793439609e-06, "loss": 0.168, "step": 1978 }, { "epoch": 4.051177072671443, "grad_norm": 0.3165013554197021, "learning_rate": 4.194094331457004e-06, "loss": 0.1753, "step": 1979 }, { "epoch": 4.053224155578301, "grad_norm": 0.30677661034631387, "learning_rate": 4.176579217992143e-06, "loss": 0.1525, "step": 1980 }, { "epoch": 4.055271238485159, "grad_norm": 0.29846779652611327, "learning_rate": 4.159096488891623e-06, "loss": 0.1559, "step": 1981 }, { "epoch": 4.0573183213920165, "grad_norm": 0.3014892204744538, "learning_rate": 4.1416461799357675e-06, "loss": 0.1732, "step": 1982 }, { "epoch": 4.059365404298874, "grad_norm": 0.29810838609292906, "learning_rate": 4.124228326838544e-06, "loss": 0.1442, "step": 1983 }, { "epoch": 4.061412487205732, "grad_norm": 0.28754100275614525, "learning_rate": 4.106842965247497e-06, "loss": 0.1587, "step": 1984 }, { "epoch": 4.06345957011259, "grad_norm": 0.28330676946462274, "learning_rate": 4.0894901307436805e-06, "loss": 0.1697, "step": 1985 }, { "epoch": 4.0655066530194475, "grad_norm": 0.285301416935174, "learning_rate": 4.072169858841561e-06, "loss": 0.1595, "step": 1986 }, { "epoch": 4.067553735926305, "grad_norm": 0.2837041098864574, "learning_rate": 4.054882184988971e-06, "loss": 0.1871, "step": 1987 }, { "epoch": 4.0696008188331625, "grad_norm": 0.28867050594006555, "learning_rate": 4.0376271445670465e-06, "loss": 0.1805, "step": 1988 }, { "epoch": 4.071647901740021, "grad_norm": 0.27159571765823515, "learning_rate": 4.020404772890101e-06, "loss": 0.1782, "step": 1989 }, { "epoch": 4.073694984646878, "grad_norm": 0.29070033660022687, "learning_rate": 4.003215105205613e-06, "loss": 0.1971, "step": 1990 }, { "epoch": 4.075742067553736, "grad_norm": 0.29431870068785426, "learning_rate": 3.986058176694123e-06, "loss": 0.1836, "step": 1991 }, { "epoch": 4.077789150460593, "grad_norm": 0.30901872319472756, "learning_rate": 3.968934022469157e-06, "loss": 0.1615, "step": 1992 }, { "epoch": 4.079836233367452, "grad_norm": 0.26872512623266737, "learning_rate": 3.951842677577171e-06, "loss": 0.1571, "step": 1993 }, { "epoch": 4.081883316274309, "grad_norm": 0.277601430071484, "learning_rate": 3.9347841769974925e-06, "loss": 0.1993, "step": 1994 }, { "epoch": 4.083930399181167, "grad_norm": 0.2815481603482528, "learning_rate": 3.917758555642195e-06, "loss": 0.1776, "step": 1995 }, { "epoch": 4.085977482088024, "grad_norm": 0.31199877087236383, "learning_rate": 3.900765848356083e-06, "loss": 0.1807, "step": 1996 }, { "epoch": 4.088024564994882, "grad_norm": 0.3049249771323251, "learning_rate": 3.883806089916593e-06, "loss": 0.1738, "step": 1997 }, { "epoch": 4.09007164790174, "grad_norm": 0.29235847173840074, "learning_rate": 3.866879315033738e-06, "loss": 0.178, "step": 1998 }, { "epoch": 4.092118730808598, "grad_norm": 0.2849050552270467, "learning_rate": 3.849985558349998e-06, "loss": 0.1679, "step": 1999 }, { "epoch": 4.094165813715455, "grad_norm": 0.2977669293917896, "learning_rate": 3.8331248544403135e-06, "loss": 0.1556, "step": 2000 }, { "epoch": 4.096212896622313, "grad_norm": 0.2927932662986805, "learning_rate": 3.8162972378119635e-06, "loss": 0.1894, "step": 2001 }, { "epoch": 4.098259979529171, "grad_norm": 0.29328702976060467, "learning_rate": 3.799502742904497e-06, "loss": 0.173, "step": 2002 }, { "epoch": 4.100307062436029, "grad_norm": 0.2969906241666511, "learning_rate": 3.7827414040896958e-06, "loss": 0.1812, "step": 2003 }, { "epoch": 4.102354145342886, "grad_norm": 0.32948789795871786, "learning_rate": 3.766013255671479e-06, "loss": 0.1825, "step": 2004 }, { "epoch": 4.104401228249744, "grad_norm": 0.28692228882067206, "learning_rate": 3.749318331885825e-06, "loss": 0.1654, "step": 2005 }, { "epoch": 4.106448311156602, "grad_norm": 0.27400445663207174, "learning_rate": 3.7326566669007268e-06, "loss": 0.1913, "step": 2006 }, { "epoch": 4.10849539406346, "grad_norm": 0.30606472444023525, "learning_rate": 3.716028294816119e-06, "loss": 0.1545, "step": 2007 }, { "epoch": 4.110542476970317, "grad_norm": 0.3123611845587673, "learning_rate": 3.699433249663775e-06, "loss": 0.201, "step": 2008 }, { "epoch": 4.112589559877175, "grad_norm": 0.31139724775462246, "learning_rate": 3.6828715654072776e-06, "loss": 0.1618, "step": 2009 }, { "epoch": 4.114636642784033, "grad_norm": 0.29336426025625717, "learning_rate": 3.666343275941926e-06, "loss": 0.1903, "step": 2010 }, { "epoch": 4.116683725690891, "grad_norm": 0.2893384058971807, "learning_rate": 3.649848415094681e-06, "loss": 0.17, "step": 2011 }, { "epoch": 4.118730808597748, "grad_norm": 0.29125235390587384, "learning_rate": 3.6333870166240703e-06, "loss": 0.1996, "step": 2012 }, { "epoch": 4.120777891504606, "grad_norm": 0.29871612249436, "learning_rate": 3.616959114220162e-06, "loss": 0.1985, "step": 2013 }, { "epoch": 4.122824974411464, "grad_norm": 0.271555858016362, "learning_rate": 3.60056474150446e-06, "loss": 0.1818, "step": 2014 }, { "epoch": 4.1248720573183215, "grad_norm": 0.3012910833873169, "learning_rate": 3.5842039320298327e-06, "loss": 0.1414, "step": 2015 }, { "epoch": 4.126919140225179, "grad_norm": 0.27667654273766706, "learning_rate": 3.5678767192804764e-06, "loss": 0.1882, "step": 2016 }, { "epoch": 4.1289662231320365, "grad_norm": 0.2869593416102505, "learning_rate": 3.551583136671817e-06, "loss": 0.1906, "step": 2017 }, { "epoch": 4.131013306038895, "grad_norm": 0.2822576095654785, "learning_rate": 3.5353232175504614e-06, "loss": 0.1828, "step": 2018 }, { "epoch": 4.1330603889457525, "grad_norm": 0.3121511458643644, "learning_rate": 3.5190969951941113e-06, "loss": 0.161, "step": 2019 }, { "epoch": 4.13510747185261, "grad_norm": 0.2829822535321191, "learning_rate": 3.5029045028115105e-06, "loss": 0.1514, "step": 2020 }, { "epoch": 4.1371545547594675, "grad_norm": 0.2837746021156371, "learning_rate": 3.486745773542375e-06, "loss": 0.1935, "step": 2021 }, { "epoch": 4.139201637666326, "grad_norm": 0.3025896496647199, "learning_rate": 3.470620840457304e-06, "loss": 0.208, "step": 2022 }, { "epoch": 4.141248720573183, "grad_norm": 0.28695976072627655, "learning_rate": 3.4545297365577437e-06, "loss": 0.18, "step": 2023 }, { "epoch": 4.143295803480041, "grad_norm": 0.29084046593921437, "learning_rate": 3.438472494775902e-06, "loss": 0.1797, "step": 2024 }, { "epoch": 4.145342886386898, "grad_norm": 0.2964669445537806, "learning_rate": 3.4224491479746822e-06, "loss": 0.2066, "step": 2025 }, { "epoch": 4.147389969293757, "grad_norm": 0.2735467093182229, "learning_rate": 3.406459728947622e-06, "loss": 0.1805, "step": 2026 }, { "epoch": 4.149437052200614, "grad_norm": 0.2883440394869004, "learning_rate": 3.390504270418822e-06, "loss": 0.1935, "step": 2027 }, { "epoch": 4.151484135107472, "grad_norm": 0.28347253308933157, "learning_rate": 3.3745828050428675e-06, "loss": 0.2042, "step": 2028 }, { "epoch": 4.153531218014329, "grad_norm": 0.287714374967884, "learning_rate": 3.358695365404785e-06, "loss": 0.1965, "step": 2029 }, { "epoch": 4.155578300921187, "grad_norm": 0.29340681821156966, "learning_rate": 3.3428419840199623e-06, "loss": 0.1603, "step": 2030 }, { "epoch": 4.157625383828045, "grad_norm": 0.2727874360906274, "learning_rate": 3.327022693334083e-06, "loss": 0.155, "step": 2031 }, { "epoch": 4.159672466734903, "grad_norm": 0.30308908503638726, "learning_rate": 3.3112375257230547e-06, "loss": 0.1909, "step": 2032 }, { "epoch": 4.16171954964176, "grad_norm": 0.2782743108107041, "learning_rate": 3.295486513492954e-06, "loss": 0.1912, "step": 2033 }, { "epoch": 4.163766632548619, "grad_norm": 0.2960778023128933, "learning_rate": 3.279769688879959e-06, "loss": 0.1706, "step": 2034 }, { "epoch": 4.165813715455476, "grad_norm": 0.2855041966453197, "learning_rate": 3.2640870840502646e-06, "loss": 0.1812, "step": 2035 }, { "epoch": 4.167860798362334, "grad_norm": 0.2982604591341762, "learning_rate": 3.2484387311000364e-06, "loss": 0.1651, "step": 2036 }, { "epoch": 4.169907881269191, "grad_norm": 0.2957570058273647, "learning_rate": 3.2328246620553605e-06, "loss": 0.1632, "step": 2037 }, { "epoch": 4.171954964176049, "grad_norm": 0.2951616591774616, "learning_rate": 3.2172449088721235e-06, "loss": 0.1624, "step": 2038 }, { "epoch": 4.174002047082907, "grad_norm": 0.2851463082934784, "learning_rate": 3.2016995034360045e-06, "loss": 0.1808, "step": 2039 }, { "epoch": 4.176049129989765, "grad_norm": 0.2845084237530719, "learning_rate": 3.186188477562382e-06, "loss": 0.1786, "step": 2040 }, { "epoch": 4.178096212896622, "grad_norm": 0.30650921370752987, "learning_rate": 3.1707118629962607e-06, "loss": 0.1472, "step": 2041 }, { "epoch": 4.18014329580348, "grad_norm": 0.2777664159680818, "learning_rate": 3.1552696914122327e-06, "loss": 0.1531, "step": 2042 }, { "epoch": 4.182190378710338, "grad_norm": 0.28828599604321864, "learning_rate": 3.139861994414397e-06, "loss": 0.1845, "step": 2043 }, { "epoch": 4.184237461617196, "grad_norm": 0.30773699136825977, "learning_rate": 3.1244888035362875e-06, "loss": 0.1769, "step": 2044 }, { "epoch": 4.186284544524053, "grad_norm": 0.3155552426235748, "learning_rate": 3.1091501502408293e-06, "loss": 0.1643, "step": 2045 }, { "epoch": 4.188331627430911, "grad_norm": 0.2898415631190067, "learning_rate": 3.093846065920254e-06, "loss": 0.168, "step": 2046 }, { "epoch": 4.190378710337769, "grad_norm": 0.3080290062858047, "learning_rate": 3.0785765818960534e-06, "loss": 0.215, "step": 2047 }, { "epoch": 4.1924257932446265, "grad_norm": 0.26531974258516, "learning_rate": 3.0633417294188896e-06, "loss": 0.1902, "step": 2048 }, { "epoch": 4.194472876151484, "grad_norm": 0.29101494010309975, "learning_rate": 3.0481415396685564e-06, "loss": 0.1851, "step": 2049 }, { "epoch": 4.1965199590583415, "grad_norm": 0.2805440252695212, "learning_rate": 3.0329760437539233e-06, "loss": 0.2106, "step": 2050 }, { "epoch": 4.1985670419652, "grad_norm": 0.2954059617809206, "learning_rate": 3.017845272712825e-06, "loss": 0.178, "step": 2051 }, { "epoch": 4.200614124872057, "grad_norm": 0.2750668596805512, "learning_rate": 3.0027492575120453e-06, "loss": 0.1639, "step": 2052 }, { "epoch": 4.202661207778915, "grad_norm": 0.2954652665384484, "learning_rate": 2.9876880290472376e-06, "loss": 0.1654, "step": 2053 }, { "epoch": 4.2047082906857725, "grad_norm": 0.2844611490184932, "learning_rate": 2.9726616181428515e-06, "loss": 0.1824, "step": 2054 }, { "epoch": 4.206755373592631, "grad_norm": 0.28973919117898683, "learning_rate": 2.957670055552078e-06, "loss": 0.1785, "step": 2055 }, { "epoch": 4.208802456499488, "grad_norm": 0.3065738121168008, "learning_rate": 2.942713371956809e-06, "loss": 0.1466, "step": 2056 }, { "epoch": 4.210849539406346, "grad_norm": 0.27595990092382205, "learning_rate": 2.927791597967522e-06, "loss": 0.1674, "step": 2057 }, { "epoch": 4.212896622313203, "grad_norm": 0.2787886162407018, "learning_rate": 2.9129047641232653e-06, "loss": 0.1739, "step": 2058 }, { "epoch": 4.214943705220062, "grad_norm": 0.2835601286496118, "learning_rate": 2.8980529008915793e-06, "loss": 0.1851, "step": 2059 }, { "epoch": 4.216990788126919, "grad_norm": 0.2734235721997136, "learning_rate": 2.8832360386684287e-06, "loss": 0.1894, "step": 2060 }, { "epoch": 4.219037871033777, "grad_norm": 0.313282561765329, "learning_rate": 2.8684542077781376e-06, "loss": 0.1844, "step": 2061 }, { "epoch": 4.221084953940634, "grad_norm": 0.29657107157085116, "learning_rate": 2.853707438473352e-06, "loss": 0.1861, "step": 2062 }, { "epoch": 4.223132036847493, "grad_norm": 0.2835879428086791, "learning_rate": 2.838995760934953e-06, "loss": 0.1992, "step": 2063 }, { "epoch": 4.22517911975435, "grad_norm": 0.28406764855143096, "learning_rate": 2.8243192052719902e-06, "loss": 0.1743, "step": 2064 }, { "epoch": 4.227226202661208, "grad_norm": 0.2735358759641007, "learning_rate": 2.8096778015216484e-06, "loss": 0.1663, "step": 2065 }, { "epoch": 4.229273285568065, "grad_norm": 0.28994987108300085, "learning_rate": 2.7950715796491623e-06, "loss": 0.1693, "step": 2066 }, { "epoch": 4.231320368474924, "grad_norm": 0.2918242884998421, "learning_rate": 2.7805005695477704e-06, "loss": 0.1659, "step": 2067 }, { "epoch": 4.233367451381781, "grad_norm": 0.2914555807372611, "learning_rate": 2.7659648010386365e-06, "loss": 0.2082, "step": 2068 }, { "epoch": 4.235414534288639, "grad_norm": 0.31049832846223957, "learning_rate": 2.75146430387081e-06, "loss": 0.1745, "step": 2069 }, { "epoch": 4.237461617195496, "grad_norm": 0.2786106859690272, "learning_rate": 2.736999107721137e-06, "loss": 0.1689, "step": 2070 }, { "epoch": 4.239508700102354, "grad_norm": 0.2813229103286878, "learning_rate": 2.7225692421942306e-06, "loss": 0.1938, "step": 2071 }, { "epoch": 4.241555783009212, "grad_norm": 0.2911201694179184, "learning_rate": 2.7081747368223953e-06, "loss": 0.1835, "step": 2072 }, { "epoch": 4.24360286591607, "grad_norm": 0.29816674195659204, "learning_rate": 2.6938156210655584e-06, "loss": 0.1842, "step": 2073 }, { "epoch": 4.245649948822927, "grad_norm": 0.2996092573579164, "learning_rate": 2.679491924311226e-06, "loss": 0.1847, "step": 2074 }, { "epoch": 4.247697031729785, "grad_norm": 0.2895248563688147, "learning_rate": 2.6652036758744148e-06, "loss": 0.1827, "step": 2075 }, { "epoch": 4.249744114636643, "grad_norm": 0.28226805129886656, "learning_rate": 2.6509509049975913e-06, "loss": 0.1765, "step": 2076 }, { "epoch": 4.2517911975435005, "grad_norm": 0.2949640489147984, "learning_rate": 2.6367336408506063e-06, "loss": 0.1705, "step": 2077 }, { "epoch": 4.253838280450358, "grad_norm": 0.2753720710190184, "learning_rate": 2.622551912530653e-06, "loss": 0.186, "step": 2078 }, { "epoch": 4.255885363357216, "grad_norm": 0.2936131556811062, "learning_rate": 2.608405749062193e-06, "loss": 0.1855, "step": 2079 }, { "epoch": 4.257932446264074, "grad_norm": 0.28222624495514065, "learning_rate": 2.594295179396895e-06, "loss": 0.1743, "step": 2080 }, { "epoch": 4.2599795291709315, "grad_norm": 0.29367758291953516, "learning_rate": 2.5802202324135926e-06, "loss": 0.1926, "step": 2081 }, { "epoch": 4.262026612077789, "grad_norm": 0.29329088174637774, "learning_rate": 2.566180936918203e-06, "loss": 0.1985, "step": 2082 }, { "epoch": 4.2640736949846465, "grad_norm": 0.27495980250181845, "learning_rate": 2.5521773216436875e-06, "loss": 0.1694, "step": 2083 }, { "epoch": 4.266120777891505, "grad_norm": 0.2745597317735457, "learning_rate": 2.5382094152499705e-06, "loss": 0.1623, "step": 2084 }, { "epoch": 4.268167860798362, "grad_norm": 0.2860214870247153, "learning_rate": 2.5242772463239075e-06, "loss": 0.1759, "step": 2085 }, { "epoch": 4.27021494370522, "grad_norm": 0.2829523123556956, "learning_rate": 2.5103808433792075e-06, "loss": 0.1953, "step": 2086 }, { "epoch": 4.272262026612077, "grad_norm": 0.29162058098697563, "learning_rate": 2.4965202348563834e-06, "loss": 0.1851, "step": 2087 }, { "epoch": 4.274309109518936, "grad_norm": 0.28401444187787156, "learning_rate": 2.4826954491226875e-06, "loss": 0.1625, "step": 2088 }, { "epoch": 4.276356192425793, "grad_norm": 0.29640280698764426, "learning_rate": 2.468906514472065e-06, "loss": 0.1733, "step": 2089 }, { "epoch": 4.278403275332651, "grad_norm": 0.28541928385296134, "learning_rate": 2.4551534591250725e-06, "loss": 0.2083, "step": 2090 }, { "epoch": 4.280450358239508, "grad_norm": 0.2927844886794109, "learning_rate": 2.4414363112288464e-06, "loss": 0.1518, "step": 2091 }, { "epoch": 4.282497441146367, "grad_norm": 0.29701394558657035, "learning_rate": 2.4277550988570362e-06, "loss": 0.1621, "step": 2092 }, { "epoch": 4.284544524053224, "grad_norm": 0.290771795085261, "learning_rate": 2.4141098500097403e-06, "loss": 0.1648, "step": 2093 }, { "epoch": 4.286591606960082, "grad_norm": 0.27774161175473394, "learning_rate": 2.400500592613455e-06, "loss": 0.1711, "step": 2094 }, { "epoch": 4.288638689866939, "grad_norm": 0.30883225453627294, "learning_rate": 2.3869273545210158e-06, "loss": 0.134, "step": 2095 }, { "epoch": 4.290685772773798, "grad_norm": 0.2797146271613822, "learning_rate": 2.3733901635115486e-06, "loss": 0.1692, "step": 2096 }, { "epoch": 4.292732855680655, "grad_norm": 0.2969804226131455, "learning_rate": 2.359889047290389e-06, "loss": 0.1671, "step": 2097 }, { "epoch": 4.294779938587513, "grad_norm": 0.3217916741414342, "learning_rate": 2.3464240334890496e-06, "loss": 0.1575, "step": 2098 }, { "epoch": 4.29682702149437, "grad_norm": 0.27919309597905084, "learning_rate": 2.332995149665169e-06, "loss": 0.2121, "step": 2099 }, { "epoch": 4.298874104401229, "grad_norm": 0.2800459009067448, "learning_rate": 2.3196024233024185e-06, "loss": 0.1837, "step": 2100 }, { "epoch": 4.300921187308086, "grad_norm": 0.2671979507192257, "learning_rate": 2.3062458818104804e-06, "loss": 0.1952, "step": 2101 }, { "epoch": 4.302968270214944, "grad_norm": 0.28262802438305323, "learning_rate": 2.2929255525249894e-06, "loss": 0.1713, "step": 2102 }, { "epoch": 4.305015353121801, "grad_norm": 0.3005757045715249, "learning_rate": 2.279641462707445e-06, "loss": 0.1668, "step": 2103 }, { "epoch": 4.30706243602866, "grad_norm": 0.286800196637234, "learning_rate": 2.266393639545197e-06, "loss": 0.1896, "step": 2104 }, { "epoch": 4.309109518935517, "grad_norm": 0.2833256809700336, "learning_rate": 2.2531821101513796e-06, "loss": 0.1417, "step": 2105 }, { "epoch": 4.311156601842375, "grad_norm": 0.31197523215491646, "learning_rate": 2.2400069015648173e-06, "loss": 0.1952, "step": 2106 }, { "epoch": 4.313203684749232, "grad_norm": 0.2678003946049261, "learning_rate": 2.22686804075003e-06, "loss": 0.1807, "step": 2107 }, { "epoch": 4.3152507676560905, "grad_norm": 0.30659909160474746, "learning_rate": 2.213765554597129e-06, "loss": 0.1873, "step": 2108 }, { "epoch": 4.317297850562948, "grad_norm": 0.27320285898761, "learning_rate": 2.2006994699217963e-06, "loss": 0.1783, "step": 2109 }, { "epoch": 4.3193449334698055, "grad_norm": 0.2952250752859734, "learning_rate": 2.187669813465192e-06, "loss": 0.1666, "step": 2110 }, { "epoch": 4.321392016376663, "grad_norm": 0.2648240745330133, "learning_rate": 2.174676611893947e-06, "loss": 0.1671, "step": 2111 }, { "epoch": 4.3234390992835205, "grad_norm": 0.2811036034280304, "learning_rate": 2.1617198918000737e-06, "loss": 0.1765, "step": 2112 }, { "epoch": 4.325486182190379, "grad_norm": 0.29793182339880603, "learning_rate": 2.1487996797009103e-06, "loss": 0.167, "step": 2113 }, { "epoch": 4.3275332650972365, "grad_norm": 0.2752900542965174, "learning_rate": 2.135916002039089e-06, "loss": 0.1821, "step": 2114 }, { "epoch": 4.329580348004094, "grad_norm": 0.27908851781361865, "learning_rate": 2.123068885182471e-06, "loss": 0.1875, "step": 2115 }, { "epoch": 4.3316274309109515, "grad_norm": 0.30369045741991585, "learning_rate": 2.110258355424093e-06, "loss": 0.1565, "step": 2116 }, { "epoch": 4.33367451381781, "grad_norm": 0.27387260120165463, "learning_rate": 2.0974844389820914e-06, "loss": 0.2037, "step": 2117 }, { "epoch": 4.335721596724667, "grad_norm": 0.2695259971121429, "learning_rate": 2.084747161999703e-06, "loss": 0.1883, "step": 2118 }, { "epoch": 4.337768679631525, "grad_norm": 0.27646790728531134, "learning_rate": 2.0720465505451524e-06, "loss": 0.1674, "step": 2119 }, { "epoch": 4.339815762538382, "grad_norm": 0.27363984511519157, "learning_rate": 2.0593826306116328e-06, "loss": 0.1987, "step": 2120 }, { "epoch": 4.341862845445241, "grad_norm": 0.3058041319945061, "learning_rate": 2.0467554281172443e-06, "loss": 0.1668, "step": 2121 }, { "epoch": 4.343909928352098, "grad_norm": 0.28711732195110085, "learning_rate": 2.0341649689049458e-06, "loss": 0.1589, "step": 2122 }, { "epoch": 4.345957011258956, "grad_norm": 0.29676194594555955, "learning_rate": 2.021611278742479e-06, "loss": 0.2006, "step": 2123 }, { "epoch": 4.348004094165813, "grad_norm": 0.2894509415136224, "learning_rate": 2.009094383322356e-06, "loss": 0.1759, "step": 2124 }, { "epoch": 4.350051177072672, "grad_norm": 0.27051010203429876, "learning_rate": 1.9966143082617797e-06, "loss": 0.2051, "step": 2125 }, { "epoch": 4.352098259979529, "grad_norm": 0.29097733973563933, "learning_rate": 1.9841710791025793e-06, "loss": 0.1718, "step": 2126 }, { "epoch": 4.354145342886387, "grad_norm": 0.321019324398248, "learning_rate": 1.971764721311191e-06, "loss": 0.2109, "step": 2127 }, { "epoch": 4.356192425793244, "grad_norm": 0.30390810271040936, "learning_rate": 1.959395260278587e-06, "loss": 0.1489, "step": 2128 }, { "epoch": 4.358239508700103, "grad_norm": 0.29799569461187914, "learning_rate": 1.947062721320221e-06, "loss": 0.1661, "step": 2129 }, { "epoch": 4.36028659160696, "grad_norm": 0.2832110200623686, "learning_rate": 1.9347671296759896e-06, "loss": 0.1714, "step": 2130 }, { "epoch": 4.362333674513818, "grad_norm": 0.2951503600181551, "learning_rate": 1.922508510510166e-06, "loss": 0.157, "step": 2131 }, { "epoch": 4.364380757420675, "grad_norm": 0.28017904966905216, "learning_rate": 1.9102868889113613e-06, "loss": 0.1822, "step": 2132 }, { "epoch": 4.366427840327534, "grad_norm": 0.27123684319205094, "learning_rate": 1.8981022898924562e-06, "loss": 0.2035, "step": 2133 }, { "epoch": 4.368474923234391, "grad_norm": 0.3005307169297444, "learning_rate": 1.885954738390572e-06, "loss": 0.178, "step": 2134 }, { "epoch": 4.370522006141249, "grad_norm": 0.30373485662664135, "learning_rate": 1.8738442592670014e-06, "loss": 0.1791, "step": 2135 }, { "epoch": 4.372569089048106, "grad_norm": 0.29560692277511486, "learning_rate": 1.8617708773071698e-06, "loss": 0.1702, "step": 2136 }, { "epoch": 4.3746161719549645, "grad_norm": 0.2799036916467131, "learning_rate": 1.8497346172205733e-06, "loss": 0.1757, "step": 2137 }, { "epoch": 4.376663254861822, "grad_norm": 0.2914915460350074, "learning_rate": 1.8377355036407408e-06, "loss": 0.1537, "step": 2138 }, { "epoch": 4.37871033776868, "grad_norm": 0.28848134194988, "learning_rate": 1.8257735611251704e-06, "loss": 0.1794, "step": 2139 }, { "epoch": 4.380757420675537, "grad_norm": 0.2663575035541803, "learning_rate": 1.8138488141552856e-06, "loss": 0.1895, "step": 2140 }, { "epoch": 4.3828045035823955, "grad_norm": 0.28041126513615366, "learning_rate": 1.801961287136391e-06, "loss": 0.1705, "step": 2141 }, { "epoch": 4.384851586489253, "grad_norm": 0.2878226194974552, "learning_rate": 1.7901110043976122e-06, "loss": 0.1869, "step": 2142 }, { "epoch": 4.3868986693961105, "grad_norm": 0.2845987197363273, "learning_rate": 1.7782979901918507e-06, "loss": 0.1822, "step": 2143 }, { "epoch": 4.388945752302968, "grad_norm": 0.2750503273666346, "learning_rate": 1.7665222686957362e-06, "loss": 0.1778, "step": 2144 }, { "epoch": 4.3909928352098255, "grad_norm": 0.3136573248950073, "learning_rate": 1.754783864009575e-06, "loss": 0.1569, "step": 2145 }, { "epoch": 4.393039918116684, "grad_norm": 0.2957114276760941, "learning_rate": 1.7430828001572897e-06, "loss": 0.2259, "step": 2146 }, { "epoch": 4.395087001023541, "grad_norm": 0.306095495611476, "learning_rate": 1.7314191010863933e-06, "loss": 0.2185, "step": 2147 }, { "epoch": 4.397134083930399, "grad_norm": 0.2870140982518843, "learning_rate": 1.7197927906679335e-06, "loss": 0.2054, "step": 2148 }, { "epoch": 4.399181166837257, "grad_norm": 0.2679693208920003, "learning_rate": 1.7082038926964162e-06, "loss": 0.1553, "step": 2149 }, { "epoch": 4.401228249744115, "grad_norm": 0.30186153363388674, "learning_rate": 1.6966524308897935e-06, "loss": 0.1927, "step": 2150 }, { "epoch": 4.403275332650972, "grad_norm": 0.2784578317629372, "learning_rate": 1.6851384288894058e-06, "loss": 0.1776, "step": 2151 }, { "epoch": 4.40532241555783, "grad_norm": 0.2843049715212868, "learning_rate": 1.6736619102599073e-06, "loss": 0.1672, "step": 2152 }, { "epoch": 4.407369498464687, "grad_norm": 0.31694116691154817, "learning_rate": 1.6622228984892585e-06, "loss": 0.1539, "step": 2153 }, { "epoch": 4.409416581371546, "grad_norm": 0.26686752025138655, "learning_rate": 1.6508214169886483e-06, "loss": 0.1754, "step": 2154 }, { "epoch": 4.411463664278403, "grad_norm": 0.2654670971586521, "learning_rate": 1.6394574890924574e-06, "loss": 0.2013, "step": 2155 }, { "epoch": 4.413510747185261, "grad_norm": 0.29492009598586416, "learning_rate": 1.6281311380582087e-06, "loss": 0.2082, "step": 2156 }, { "epoch": 4.415557830092118, "grad_norm": 0.30045313874805496, "learning_rate": 1.616842387066524e-06, "loss": 0.1767, "step": 2157 }, { "epoch": 4.417604912998977, "grad_norm": 0.2910283873769807, "learning_rate": 1.605591259221071e-06, "loss": 0.1766, "step": 2158 }, { "epoch": 4.419651995905834, "grad_norm": 0.28823745158894704, "learning_rate": 1.5943777775485058e-06, "loss": 0.1868, "step": 2159 }, { "epoch": 4.421699078812692, "grad_norm": 0.26486227426645287, "learning_rate": 1.583201964998451e-06, "loss": 0.2016, "step": 2160 }, { "epoch": 4.423746161719549, "grad_norm": 0.28343110323179144, "learning_rate": 1.572063844443441e-06, "loss": 0.1855, "step": 2161 }, { "epoch": 4.425793244626408, "grad_norm": 0.2863936086901088, "learning_rate": 1.5609634386788485e-06, "loss": 0.1952, "step": 2162 }, { "epoch": 4.427840327533265, "grad_norm": 0.29827769122199016, "learning_rate": 1.5499007704228742e-06, "loss": 0.1679, "step": 2163 }, { "epoch": 4.429887410440123, "grad_norm": 0.2492563784192527, "learning_rate": 1.5388758623164802e-06, "loss": 0.1679, "step": 2164 }, { "epoch": 4.43193449334698, "grad_norm": 0.2802794951076192, "learning_rate": 1.5278887369233509e-06, "loss": 0.1792, "step": 2165 }, { "epoch": 4.433981576253839, "grad_norm": 0.2859554141734814, "learning_rate": 1.5169394167298367e-06, "loss": 0.1771, "step": 2166 }, { "epoch": 4.436028659160696, "grad_norm": 0.3090415351876153, "learning_rate": 1.5060279241449304e-06, "loss": 0.1803, "step": 2167 }, { "epoch": 4.438075742067554, "grad_norm": 0.2838386004821354, "learning_rate": 1.4951542815001886e-06, "loss": 0.159, "step": 2168 }, { "epoch": 4.440122824974411, "grad_norm": 0.28615497006671264, "learning_rate": 1.4843185110497139e-06, "loss": 0.1654, "step": 2169 }, { "epoch": 4.4421699078812695, "grad_norm": 0.2828928258486327, "learning_rate": 1.4735206349701003e-06, "loss": 0.166, "step": 2170 }, { "epoch": 4.444216990788127, "grad_norm": 0.294693410920466, "learning_rate": 1.4627606753603886e-06, "loss": 0.1708, "step": 2171 }, { "epoch": 4.4462640736949846, "grad_norm": 0.28832385318223086, "learning_rate": 1.4520386542420006e-06, "loss": 0.173, "step": 2172 }, { "epoch": 4.448311156601842, "grad_norm": 0.2750702995509294, "learning_rate": 1.4413545935587415e-06, "loss": 0.1612, "step": 2173 }, { "epoch": 4.4503582395087005, "grad_norm": 0.2827730533019191, "learning_rate": 1.4307085151767086e-06, "loss": 0.1568, "step": 2174 }, { "epoch": 4.452405322415558, "grad_norm": 0.2724089303161446, "learning_rate": 1.4201004408842644e-06, "loss": 0.1577, "step": 2175 }, { "epoch": 4.4544524053224155, "grad_norm": 0.27196453176116103, "learning_rate": 1.4095303923919956e-06, "loss": 0.1773, "step": 2176 }, { "epoch": 4.456499488229273, "grad_norm": 0.3178765316382206, "learning_rate": 1.3989983913326665e-06, "loss": 0.159, "step": 2177 }, { "epoch": 4.458546571136131, "grad_norm": 0.29327822729767095, "learning_rate": 1.3885044592611706e-06, "loss": 0.1431, "step": 2178 }, { "epoch": 4.460593654042989, "grad_norm": 0.29769593027711694, "learning_rate": 1.3780486176544905e-06, "loss": 0.1985, "step": 2179 }, { "epoch": 4.462640736949846, "grad_norm": 0.2844568268296535, "learning_rate": 1.3676308879116507e-06, "loss": 0.1652, "step": 2180 }, { "epoch": 4.464687819856704, "grad_norm": 0.2824834673707818, "learning_rate": 1.3572512913536783e-06, "loss": 0.1957, "step": 2181 }, { "epoch": 4.466734902763562, "grad_norm": 0.2944499941869074, "learning_rate": 1.3469098492235521e-06, "loss": 0.1663, "step": 2182 }, { "epoch": 4.46878198567042, "grad_norm": 0.2860326191078485, "learning_rate": 1.3366065826861685e-06, "loss": 0.1508, "step": 2183 }, { "epoch": 4.470829068577277, "grad_norm": 0.29391486499526526, "learning_rate": 1.3263415128282908e-06, "loss": 0.1643, "step": 2184 }, { "epoch": 4.472876151484135, "grad_norm": 0.28228277616536795, "learning_rate": 1.316114660658505e-06, "loss": 0.165, "step": 2185 }, { "epoch": 4.474923234390992, "grad_norm": 0.28104222228106285, "learning_rate": 1.305926047107191e-06, "loss": 0.1787, "step": 2186 }, { "epoch": 4.476970317297851, "grad_norm": 0.3067397142838367, "learning_rate": 1.2957756930264642e-06, "loss": 0.1708, "step": 2187 }, { "epoch": 4.479017400204708, "grad_norm": 0.287797881234288, "learning_rate": 1.2856636191901296e-06, "loss": 0.1778, "step": 2188 }, { "epoch": 4.481064483111566, "grad_norm": 0.29131274478787034, "learning_rate": 1.2755898462936544e-06, "loss": 0.1754, "step": 2189 }, { "epoch": 4.483111566018424, "grad_norm": 0.2774665709211303, "learning_rate": 1.265554394954125e-06, "loss": 0.1702, "step": 2190 }, { "epoch": 4.485158648925282, "grad_norm": 0.2895032215404063, "learning_rate": 1.255557285710185e-06, "loss": 0.1572, "step": 2191 }, { "epoch": 4.487205731832139, "grad_norm": 0.29001265695476014, "learning_rate": 1.2455985390220193e-06, "loss": 0.2107, "step": 2192 }, { "epoch": 4.489252814738997, "grad_norm": 0.28177925431653544, "learning_rate": 1.2356781752712932e-06, "loss": 0.1821, "step": 2193 }, { "epoch": 4.491299897645854, "grad_norm": 0.3086719314394619, "learning_rate": 1.225796214761117e-06, "loss": 0.1513, "step": 2194 }, { "epoch": 4.493346980552713, "grad_norm": 0.29208730292486546, "learning_rate": 1.2159526777160036e-06, "loss": 0.1387, "step": 2195 }, { "epoch": 4.49539406345957, "grad_norm": 0.3397441507088236, "learning_rate": 1.2061475842818337e-06, "loss": 0.1566, "step": 2196 }, { "epoch": 4.497441146366428, "grad_norm": 0.2873481591587359, "learning_rate": 1.196380954525802e-06, "loss": 0.1726, "step": 2197 }, { "epoch": 4.499488229273285, "grad_norm": 0.27576099473692967, "learning_rate": 1.1866528084363881e-06, "loss": 0.1549, "step": 2198 }, { "epoch": 4.501535312180144, "grad_norm": 0.3686313423170628, "learning_rate": 1.1769631659233104e-06, "loss": 0.1567, "step": 2199 }, { "epoch": 4.503582395087001, "grad_norm": 0.29827782847327655, "learning_rate": 1.1673120468174837e-06, "loss": 0.1872, "step": 2200 }, { "epoch": 4.505629477993859, "grad_norm": 0.28857294210571843, "learning_rate": 1.1576994708709766e-06, "loss": 0.182, "step": 2201 }, { "epoch": 4.507676560900716, "grad_norm": 0.27379258065859163, "learning_rate": 1.148125457756981e-06, "loss": 0.1732, "step": 2202 }, { "epoch": 4.5097236438075745, "grad_norm": 0.2746324365416677, "learning_rate": 1.1385900270697658e-06, "loss": 0.1962, "step": 2203 }, { "epoch": 4.511770726714432, "grad_norm": 0.29241951735417715, "learning_rate": 1.1290931983246334e-06, "loss": 0.1652, "step": 2204 }, { "epoch": 4.5138178096212895, "grad_norm": 0.3084497347616217, "learning_rate": 1.119634990957883e-06, "loss": 0.1849, "step": 2205 }, { "epoch": 4.515864892528147, "grad_norm": 0.3071333917103865, "learning_rate": 1.110215424326775e-06, "loss": 0.1584, "step": 2206 }, { "epoch": 4.5179119754350054, "grad_norm": 0.2857404999852703, "learning_rate": 1.1008345177094859e-06, "loss": 0.195, "step": 2207 }, { "epoch": 4.519959058341863, "grad_norm": 0.2705119565200689, "learning_rate": 1.091492290305063e-06, "loss": 0.1665, "step": 2208 }, { "epoch": 4.5220061412487205, "grad_norm": 0.3090579005503219, "learning_rate": 1.0821887612333959e-06, "loss": 0.1802, "step": 2209 }, { "epoch": 4.524053224155578, "grad_norm": 0.27905532018019086, "learning_rate": 1.0729239495351917e-06, "loss": 0.1786, "step": 2210 }, { "epoch": 4.526100307062436, "grad_norm": 0.2818996052768231, "learning_rate": 1.0636978741718873e-06, "loss": 0.1951, "step": 2211 }, { "epoch": 4.528147389969294, "grad_norm": 0.28519212105139324, "learning_rate": 1.0545105540256628e-06, "loss": 0.1718, "step": 2212 }, { "epoch": 4.530194472876151, "grad_norm": 0.2752841091314753, "learning_rate": 1.0453620078993755e-06, "loss": 0.1904, "step": 2213 }, { "epoch": 4.532241555783009, "grad_norm": 0.27656267859889894, "learning_rate": 1.0362522545165276e-06, "loss": 0.1563, "step": 2214 }, { "epoch": 4.534288638689867, "grad_norm": 0.3034310152471821, "learning_rate": 1.0271813125212237e-06, "loss": 0.1967, "step": 2215 }, { "epoch": 4.536335721596725, "grad_norm": 0.28706993728461716, "learning_rate": 1.0181492004781467e-06, "loss": 0.159, "step": 2216 }, { "epoch": 4.538382804503582, "grad_norm": 0.2782931986242475, "learning_rate": 1.009155936872499e-06, "loss": 0.1926, "step": 2217 }, { "epoch": 4.54042988741044, "grad_norm": 0.2836768256218939, "learning_rate": 1.0002015401099797e-06, "loss": 0.1697, "step": 2218 }, { "epoch": 4.542476970317297, "grad_norm": 0.2708987463201031, "learning_rate": 9.91286028516747e-07, "loss": 0.1936, "step": 2219 }, { "epoch": 4.544524053224156, "grad_norm": 0.28619987569766664, "learning_rate": 9.824094203393697e-07, "loss": 0.1849, "step": 2220 }, { "epoch": 4.546571136131013, "grad_norm": 0.2707540756476124, "learning_rate": 9.735717337447981e-07, "loss": 0.1751, "step": 2221 }, { "epoch": 4.548618219037871, "grad_norm": 0.27410601443733645, "learning_rate": 9.647729868203238e-07, "loss": 0.1868, "step": 2222 }, { "epoch": 4.550665301944729, "grad_norm": 0.2957604221652705, "learning_rate": 9.56013197573553e-07, "loss": 0.1462, "step": 2223 }, { "epoch": 4.552712384851587, "grad_norm": 0.26338269002342407, "learning_rate": 9.4729238393235e-07, "loss": 0.193, "step": 2224 }, { "epoch": 4.554759467758444, "grad_norm": 0.2833184625486789, "learning_rate": 9.386105637448151e-07, "loss": 0.1621, "step": 2225 }, { "epoch": 4.556806550665302, "grad_norm": 0.2789058820952134, "learning_rate": 9.299677547792463e-07, "loss": 0.1593, "step": 2226 }, { "epoch": 4.558853633572159, "grad_norm": 0.2720509751943245, "learning_rate": 9.21363974724101e-07, "loss": 0.1802, "step": 2227 }, { "epoch": 4.560900716479018, "grad_norm": 0.2861949558256626, "learning_rate": 9.127992411879494e-07, "loss": 0.2003, "step": 2228 }, { "epoch": 4.562947799385875, "grad_norm": 0.28455385830088115, "learning_rate": 9.042735716994678e-07, "loss": 0.1772, "step": 2229 }, { "epoch": 4.564994882292733, "grad_norm": 0.2762848657480311, "learning_rate": 8.957869837073673e-07, "loss": 0.153, "step": 2230 }, { "epoch": 4.567041965199591, "grad_norm": 0.28684432410227595, "learning_rate": 8.873394945803793e-07, "loss": 0.139, "step": 2231 }, { "epoch": 4.569089048106449, "grad_norm": 0.27952234592413544, "learning_rate": 8.789311216072183e-07, "loss": 0.1655, "step": 2232 }, { "epoch": 4.571136131013306, "grad_norm": 0.2712591839850155, "learning_rate": 8.705618819965411e-07, "loss": 0.1687, "step": 2233 }, { "epoch": 4.573183213920164, "grad_norm": 0.31661374577038626, "learning_rate": 8.622317928769086e-07, "loss": 0.1797, "step": 2234 }, { "epoch": 4.575230296827021, "grad_norm": 0.30388013921486445, "learning_rate": 8.539408712967679e-07, "loss": 0.205, "step": 2235 }, { "epoch": 4.5772773797338795, "grad_norm": 0.29761363560711057, "learning_rate": 8.456891342243945e-07, "loss": 0.1323, "step": 2236 }, { "epoch": 4.579324462640737, "grad_norm": 0.2749824129298373, "learning_rate": 8.374765985478728e-07, "loss": 0.1662, "step": 2237 }, { "epoch": 4.5813715455475945, "grad_norm": 0.288719742941428, "learning_rate": 8.293032810750579e-07, "loss": 0.185, "step": 2238 }, { "epoch": 4.583418628454452, "grad_norm": 0.29201488163729705, "learning_rate": 8.211691985335357e-07, "loss": 0.1763, "step": 2239 }, { "epoch": 4.58546571136131, "grad_norm": 0.29009215527637505, "learning_rate": 8.130743675706032e-07, "loss": 0.1485, "step": 2240 }, { "epoch": 4.587512794268168, "grad_norm": 0.2855149297724901, "learning_rate": 8.050188047532148e-07, "loss": 0.172, "step": 2241 }, { "epoch": 4.5895598771750254, "grad_norm": 0.3379024479854454, "learning_rate": 7.970025265679648e-07, "loss": 0.173, "step": 2242 }, { "epoch": 4.591606960081883, "grad_norm": 0.2783179411940068, "learning_rate": 7.890255494210453e-07, "loss": 0.1801, "step": 2243 }, { "epoch": 4.593654042988741, "grad_norm": 0.30188617728523903, "learning_rate": 7.810878896382101e-07, "loss": 0.1668, "step": 2244 }, { "epoch": 4.595701125895599, "grad_norm": 0.27939878533555546, "learning_rate": 7.731895634647513e-07, "loss": 0.1544, "step": 2245 }, { "epoch": 4.597748208802456, "grad_norm": 0.29721546644084723, "learning_rate": 7.653305870654604e-07, "loss": 0.1459, "step": 2246 }, { "epoch": 4.599795291709314, "grad_norm": 0.2761040647920302, "learning_rate": 7.575109765245936e-07, "loss": 0.1991, "step": 2247 }, { "epoch": 4.601842374616172, "grad_norm": 0.26790523775903885, "learning_rate": 7.497307478458382e-07, "loss": 0.1881, "step": 2248 }, { "epoch": 4.60388945752303, "grad_norm": 0.26902868723836015, "learning_rate": 7.419899169522903e-07, "loss": 0.1956, "step": 2249 }, { "epoch": 4.605936540429887, "grad_norm": 0.27200231013008247, "learning_rate": 7.342884996863997e-07, "loss": 0.1656, "step": 2250 }, { "epoch": 4.607983623336745, "grad_norm": 0.3124482032601389, "learning_rate": 7.266265118099669e-07, "loss": 0.1753, "step": 2251 }, { "epoch": 4.610030706243603, "grad_norm": 0.2938080759180912, "learning_rate": 7.190039690040884e-07, "loss": 0.1864, "step": 2252 }, { "epoch": 4.612077789150461, "grad_norm": 0.28057707746789584, "learning_rate": 7.114208868691319e-07, "loss": 0.1655, "step": 2253 }, { "epoch": 4.614124872057318, "grad_norm": 0.3670804523977461, "learning_rate": 7.038772809247075e-07, "loss": 0.2006, "step": 2254 }, { "epoch": 4.616171954964176, "grad_norm": 0.3156535130425976, "learning_rate": 6.963731666096318e-07, "loss": 0.1873, "step": 2255 }, { "epoch": 4.618219037871034, "grad_norm": 0.29444479967748544, "learning_rate": 6.889085592818956e-07, "loss": 0.1698, "step": 2256 }, { "epoch": 4.620266120777892, "grad_norm": 0.2743736628864778, "learning_rate": 6.814834742186361e-07, "loss": 0.1851, "step": 2257 }, { "epoch": 4.622313203684749, "grad_norm": 0.2788944245236952, "learning_rate": 6.740979266161018e-07, "loss": 0.1649, "step": 2258 }, { "epoch": 4.624360286591607, "grad_norm": 0.28277515238178413, "learning_rate": 6.667519315896264e-07, "loss": 0.1707, "step": 2259 }, { "epoch": 4.626407369498464, "grad_norm": 0.29229311294327376, "learning_rate": 6.594455041735925e-07, "loss": 0.1577, "step": 2260 }, { "epoch": 4.628454452405323, "grad_norm": 0.2826367197666537, "learning_rate": 6.521786593214075e-07, "loss": 0.1694, "step": 2261 }, { "epoch": 4.63050153531218, "grad_norm": 0.27183167715863765, "learning_rate": 6.449514119054634e-07, "loss": 0.1821, "step": 2262 }, { "epoch": 4.632548618219038, "grad_norm": 0.2705799558624456, "learning_rate": 6.377637767171152e-07, "loss": 0.16, "step": 2263 }, { "epoch": 4.634595701125896, "grad_norm": 0.312451093844958, "learning_rate": 6.306157684666425e-07, "loss": 0.146, "step": 2264 }, { "epoch": 4.6366427840327535, "grad_norm": 0.30534769600917144, "learning_rate": 6.235074017832299e-07, "loss": 0.2026, "step": 2265 }, { "epoch": 4.638689866939611, "grad_norm": 0.2749885611375992, "learning_rate": 6.164386912149289e-07, "loss": 0.149, "step": 2266 }, { "epoch": 4.640736949846469, "grad_norm": 0.3043305615254814, "learning_rate": 6.094096512286297e-07, "loss": 0.1931, "step": 2267 }, { "epoch": 4.642784032753326, "grad_norm": 0.2926074707796185, "learning_rate": 6.024202962100312e-07, "loss": 0.1704, "step": 2268 }, { "epoch": 4.6448311156601845, "grad_norm": 0.27826474993716993, "learning_rate": 5.954706404636179e-07, "loss": 0.1868, "step": 2269 }, { "epoch": 4.646878198567042, "grad_norm": 0.2676299230869566, "learning_rate": 5.88560698212619e-07, "loss": 0.1656, "step": 2270 }, { "epoch": 4.6489252814738995, "grad_norm": 0.3004879943235446, "learning_rate": 5.816904835989867e-07, "loss": 0.1462, "step": 2271 }, { "epoch": 4.650972364380758, "grad_norm": 0.28512914647917514, "learning_rate": 5.748600106833735e-07, "loss": 0.1784, "step": 2272 }, { "epoch": 4.653019447287615, "grad_norm": 0.2810502748083855, "learning_rate": 5.680692934450837e-07, "loss": 0.2186, "step": 2273 }, { "epoch": 4.655066530194473, "grad_norm": 0.2855042987462666, "learning_rate": 5.613183457820714e-07, "loss": 0.1911, "step": 2274 }, { "epoch": 4.65711361310133, "grad_norm": 0.30832919924707025, "learning_rate": 5.546071815108845e-07, "loss": 0.1853, "step": 2275 }, { "epoch": 4.659160696008188, "grad_norm": 0.27863470748148456, "learning_rate": 5.479358143666602e-07, "loss": 0.1446, "step": 2276 }, { "epoch": 4.661207778915046, "grad_norm": 0.2954741773503375, "learning_rate": 5.413042580030792e-07, "loss": 0.1861, "step": 2277 }, { "epoch": 4.663254861821904, "grad_norm": 0.292054066538448, "learning_rate": 5.347125259923491e-07, "loss": 0.1693, "step": 2278 }, { "epoch": 4.665301944728761, "grad_norm": 0.2736497121658198, "learning_rate": 5.281606318251764e-07, "loss": 0.1548, "step": 2279 }, { "epoch": 4.667349027635619, "grad_norm": 0.30678020029811015, "learning_rate": 5.216485889107214e-07, "loss": 0.1982, "step": 2280 }, { "epoch": 4.669396110542477, "grad_norm": 0.3007630538627819, "learning_rate": 5.151764105766011e-07, "loss": 0.2082, "step": 2281 }, { "epoch": 4.671443193449335, "grad_norm": 0.30472720971722367, "learning_rate": 5.087441100688351e-07, "loss": 0.1913, "step": 2282 }, { "epoch": 4.673490276356192, "grad_norm": 0.28472393369780435, "learning_rate": 5.023517005518264e-07, "loss": 0.1795, "step": 2283 }, { "epoch": 4.67553735926305, "grad_norm": 0.30834546215081027, "learning_rate": 4.959991951083498e-07, "loss": 0.1677, "step": 2284 }, { "epoch": 4.677584442169908, "grad_norm": 0.2707613967004538, "learning_rate": 4.89686606739499e-07, "loss": 0.1885, "step": 2285 }, { "epoch": 4.679631525076766, "grad_norm": 0.2925354294386155, "learning_rate": 4.834139483646793e-07, "loss": 0.1369, "step": 2286 }, { "epoch": 4.681678607983623, "grad_norm": 0.295052171708121, "learning_rate": 4.771812328215708e-07, "loss": 0.1684, "step": 2287 }, { "epoch": 4.683725690890481, "grad_norm": 0.2816846080734314, "learning_rate": 4.709884728661118e-07, "loss": 0.1634, "step": 2288 }, { "epoch": 4.685772773797339, "grad_norm": 0.29537583958782165, "learning_rate": 4.648356811724619e-07, "loss": 0.1501, "step": 2289 }, { "epoch": 4.687819856704197, "grad_norm": 0.2776645155887449, "learning_rate": 4.587228703329838e-07, "loss": 0.1731, "step": 2290 }, { "epoch": 4.689866939611054, "grad_norm": 0.27540787792841026, "learning_rate": 4.5265005285821674e-07, "loss": 0.1688, "step": 2291 }, { "epoch": 4.691914022517912, "grad_norm": 0.2774319968994503, "learning_rate": 4.4661724117684545e-07, "loss": 0.1736, "step": 2292 }, { "epoch": 4.69396110542477, "grad_norm": 0.27404159260754235, "learning_rate": 4.40624447635678e-07, "loss": 0.1473, "step": 2293 }, { "epoch": 4.696008188331628, "grad_norm": 0.2944590643348067, "learning_rate": 4.346716844996279e-07, "loss": 0.1594, "step": 2294 }, { "epoch": 4.698055271238485, "grad_norm": 0.2881991440998457, "learning_rate": 4.2875896395167427e-07, "loss": 0.1988, "step": 2295 }, { "epoch": 4.700102354145343, "grad_norm": 0.28026776497774936, "learning_rate": 4.228862980928439e-07, "loss": 0.1784, "step": 2296 }, { "epoch": 4.702149437052201, "grad_norm": 0.2955331369567261, "learning_rate": 4.1705369894219584e-07, "loss": 0.1786, "step": 2297 }, { "epoch": 4.7041965199590585, "grad_norm": 0.292319027121268, "learning_rate": 4.112611784367837e-07, "loss": 0.1677, "step": 2298 }, { "epoch": 4.706243602865916, "grad_norm": 0.31348235722088247, "learning_rate": 4.0550874843163337e-07, "loss": 0.1796, "step": 2299 }, { "epoch": 4.7082906857727735, "grad_norm": 0.2903513190722839, "learning_rate": 3.997964206997207e-07, "loss": 0.1804, "step": 2300 }, { "epoch": 4.710337768679631, "grad_norm": 0.2879799305264078, "learning_rate": 3.941242069319562e-07, "loss": 0.1895, "step": 2301 }, { "epoch": 4.7123848515864895, "grad_norm": 0.3577665248128591, "learning_rate": 3.8849211873714266e-07, "loss": 0.1765, "step": 2302 }, { "epoch": 4.714431934493347, "grad_norm": 0.2903669946216332, "learning_rate": 3.8290016764196637e-07, "loss": 0.1716, "step": 2303 }, { "epoch": 4.7164790174002045, "grad_norm": 0.3014373493296439, "learning_rate": 3.7734836509096596e-07, "loss": 0.1388, "step": 2304 }, { "epoch": 4.718526100307063, "grad_norm": 0.3083437231349992, "learning_rate": 3.7183672244652135e-07, "loss": 0.1903, "step": 2305 }, { "epoch": 4.72057318321392, "grad_norm": 0.2957154629258413, "learning_rate": 3.663652509888027e-07, "loss": 0.1718, "step": 2306 }, { "epoch": 4.722620266120778, "grad_norm": 0.3046648653682266, "learning_rate": 3.6093396191578366e-07, "loss": 0.1979, "step": 2307 }, { "epoch": 4.724667349027635, "grad_norm": 0.28017443242244167, "learning_rate": 3.5554286634318814e-07, "loss": 0.1728, "step": 2308 }, { "epoch": 4.726714431934493, "grad_norm": 0.2748026640801257, "learning_rate": 3.501919753044836e-07, "loss": 0.2096, "step": 2309 }, { "epoch": 4.728761514841351, "grad_norm": 0.2900862431852642, "learning_rate": 3.448812997508588e-07, "loss": 0.1655, "step": 2310 }, { "epoch": 4.730808597748209, "grad_norm": 0.2833002934703294, "learning_rate": 3.3961085055119083e-07, "loss": 0.1527, "step": 2311 }, { "epoch": 4.732855680655066, "grad_norm": 0.27909247839420187, "learning_rate": 3.3438063849203116e-07, "loss": 0.1449, "step": 2312 }, { "epoch": 4.734902763561925, "grad_norm": 0.2997312028695653, "learning_rate": 3.2919067427758186e-07, "loss": 0.153, "step": 2313 }, { "epoch": 4.736949846468782, "grad_norm": 0.2788661977637299, "learning_rate": 3.2404096852967305e-07, "loss": 0.1686, "step": 2314 }, { "epoch": 4.73899692937564, "grad_norm": 0.2844108440502707, "learning_rate": 3.189315317877428e-07, "loss": 0.1575, "step": 2315 }, { "epoch": 4.741044012282497, "grad_norm": 0.29542954551351697, "learning_rate": 3.138623745088132e-07, "loss": 0.1489, "step": 2316 }, { "epoch": 4.743091095189355, "grad_norm": 0.2931309390733155, "learning_rate": 3.0883350706746973e-07, "loss": 0.1793, "step": 2317 }, { "epoch": 4.745138178096213, "grad_norm": 0.274362618181376, "learning_rate": 3.038449397558396e-07, "loss": 0.1635, "step": 2318 }, { "epoch": 4.747185261003071, "grad_norm": 0.27428372927016115, "learning_rate": 2.9889668278357376e-07, "loss": 0.1588, "step": 2319 }, { "epoch": 4.749232343909928, "grad_norm": 0.29732674571188733, "learning_rate": 2.9398874627782014e-07, "loss": 0.1708, "step": 2320 }, { "epoch": 4.751279426816786, "grad_norm": 0.3003256230614381, "learning_rate": 2.891211402832128e-07, "loss": 0.1725, "step": 2321 }, { "epoch": 4.753326509723644, "grad_norm": 0.3084720969154894, "learning_rate": 2.8429387476183624e-07, "loss": 0.1483, "step": 2322 }, { "epoch": 4.755373592630502, "grad_norm": 0.3036705097265483, "learning_rate": 2.7950695959322093e-07, "loss": 0.1623, "step": 2323 }, { "epoch": 4.757420675537359, "grad_norm": 0.26881070505536797, "learning_rate": 2.747604045743102e-07, "loss": 0.171, "step": 2324 }, { "epoch": 4.759467758444217, "grad_norm": 0.2880629649819299, "learning_rate": 2.7005421941945555e-07, "loss": 0.1646, "step": 2325 }, { "epoch": 4.761514841351075, "grad_norm": 0.28081087918300657, "learning_rate": 2.653884137603702e-07, "loss": 0.1427, "step": 2326 }, { "epoch": 4.763561924257933, "grad_norm": 0.29698506829638605, "learning_rate": 2.6076299714614673e-07, "loss": 0.1612, "step": 2327 }, { "epoch": 4.76560900716479, "grad_norm": 0.2830061118167646, "learning_rate": 2.5617797904320396e-07, "loss": 0.1731, "step": 2328 }, { "epoch": 4.767656090071648, "grad_norm": 0.3123860044528765, "learning_rate": 2.516333688352801e-07, "loss": 0.1561, "step": 2329 }, { "epoch": 4.769703172978506, "grad_norm": 0.2798082098938455, "learning_rate": 2.471291758234218e-07, "loss": 0.1902, "step": 2330 }, { "epoch": 4.7717502558853635, "grad_norm": 0.2987315382953632, "learning_rate": 2.426654092259528e-07, "loss": 0.1551, "step": 2331 }, { "epoch": 4.773797338792221, "grad_norm": 0.2808063055566694, "learning_rate": 2.382420781784589e-07, "loss": 0.1749, "step": 2332 }, { "epoch": 4.7758444216990785, "grad_norm": 0.2796699754953347, "learning_rate": 2.338591917337696e-07, "loss": 0.1727, "step": 2333 }, { "epoch": 4.777891504605937, "grad_norm": 0.277747972615087, "learning_rate": 2.295167588619518e-07, "loss": 0.1507, "step": 2334 }, { "epoch": 4.779938587512794, "grad_norm": 0.2987700878833016, "learning_rate": 2.2521478845025867e-07, "loss": 0.1798, "step": 2335 }, { "epoch": 4.781985670419652, "grad_norm": 0.2728906304255605, "learning_rate": 2.2095328930315184e-07, "loss": 0.171, "step": 2336 }, { "epoch": 4.7840327533265095, "grad_norm": 0.300885418994469, "learning_rate": 2.167322701422525e-07, "loss": 0.163, "step": 2337 }, { "epoch": 4.786079836233368, "grad_norm": 0.2843769306835108, "learning_rate": 2.1255173960634146e-07, "loss": 0.1788, "step": 2338 }, { "epoch": 4.788126919140225, "grad_norm": 0.29006108226073735, "learning_rate": 2.08411706251328e-07, "loss": 0.177, "step": 2339 }, { "epoch": 4.790174002047083, "grad_norm": 0.28369183368200673, "learning_rate": 2.0431217855025e-07, "loss": 0.18, "step": 2340 }, { "epoch": 4.79222108495394, "grad_norm": 0.28761647181705274, "learning_rate": 2.0025316489323597e-07, "loss": 0.1656, "step": 2341 }, { "epoch": 4.794268167860798, "grad_norm": 0.2839547733000503, "learning_rate": 1.9623467358750315e-07, "loss": 0.1743, "step": 2342 }, { "epoch": 4.796315250767656, "grad_norm": 0.2792802390584324, "learning_rate": 1.9225671285733272e-07, "loss": 0.1793, "step": 2343 }, { "epoch": 4.798362333674514, "grad_norm": 0.2953632540852749, "learning_rate": 1.8831929084406119e-07, "loss": 0.1697, "step": 2344 }, { "epoch": 4.800409416581371, "grad_norm": 0.26668806789326216, "learning_rate": 1.8442241560604922e-07, "loss": 0.1517, "step": 2345 }, { "epoch": 4.80245649948823, "grad_norm": 0.27943039216113114, "learning_rate": 1.8056609511868163e-07, "loss": 0.2131, "step": 2346 }, { "epoch": 4.804503582395087, "grad_norm": 0.26345026664491444, "learning_rate": 1.7675033727434288e-07, "loss": 0.1688, "step": 2347 }, { "epoch": 4.806550665301945, "grad_norm": 0.28602587264323004, "learning_rate": 1.7297514988239505e-07, "loss": 0.1958, "step": 2348 }, { "epoch": 4.808597748208802, "grad_norm": 0.27558338217328315, "learning_rate": 1.692405406691755e-07, "loss": 0.1796, "step": 2349 }, { "epoch": 4.81064483111566, "grad_norm": 0.2966627143037965, "learning_rate": 1.655465172779702e-07, "loss": 0.192, "step": 2350 }, { "epoch": 4.812691914022518, "grad_norm": 0.2944395579726434, "learning_rate": 1.6189308726900277e-07, "loss": 0.172, "step": 2351 }, { "epoch": 4.814738996929376, "grad_norm": 0.2711839256534087, "learning_rate": 1.5828025811941872e-07, "loss": 0.1901, "step": 2352 }, { "epoch": 4.816786079836233, "grad_norm": 0.29155015705326437, "learning_rate": 1.547080372232679e-07, "loss": 0.167, "step": 2353 }, { "epoch": 4.818833162743092, "grad_norm": 0.2815618639741477, "learning_rate": 1.5117643189149546e-07, "loss": 0.1516, "step": 2354 }, { "epoch": 4.820880245649949, "grad_norm": 0.29038125469870796, "learning_rate": 1.4768544935191088e-07, "loss": 0.1657, "step": 2355 }, { "epoch": 4.822927328556807, "grad_norm": 0.2867465831992582, "learning_rate": 1.44235096749199e-07, "loss": 0.1824, "step": 2356 }, { "epoch": 4.824974411463664, "grad_norm": 0.2708843561414753, "learning_rate": 1.408253811448823e-07, "loss": 0.1735, "step": 2357 }, { "epoch": 4.827021494370522, "grad_norm": 0.2891677780067326, "learning_rate": 1.374563095173187e-07, "loss": 0.1594, "step": 2358 }, { "epoch": 4.82906857727738, "grad_norm": 0.29721811114798363, "learning_rate": 1.3412788876167925e-07, "loss": 0.1681, "step": 2359 }, { "epoch": 4.8311156601842375, "grad_norm": 0.2819368121687383, "learning_rate": 1.3084012568994608e-07, "loss": 0.2588, "step": 2360 }, { "epoch": 4.833162743091095, "grad_norm": 0.2637703601334377, "learning_rate": 1.2759302703088117e-07, "loss": 0.1686, "step": 2361 }, { "epoch": 4.835209825997953, "grad_norm": 0.2790376537691832, "learning_rate": 1.2438659943003306e-07, "loss": 0.1824, "step": 2362 }, { "epoch": 4.837256908904811, "grad_norm": 0.2855413561087752, "learning_rate": 1.212208494497036e-07, "loss": 0.18, "step": 2363 }, { "epoch": 4.8393039918116685, "grad_norm": 0.2966645113711715, "learning_rate": 1.180957835689478e-07, "loss": 0.1462, "step": 2364 }, { "epoch": 4.841351074718526, "grad_norm": 0.28073303467220956, "learning_rate": 1.1501140818355627e-07, "loss": 0.2037, "step": 2365 }, { "epoch": 4.8433981576253835, "grad_norm": 0.27323730338581254, "learning_rate": 1.1196772960603952e-07, "loss": 0.1552, "step": 2366 }, { "epoch": 4.845445240532242, "grad_norm": 0.27979495089452416, "learning_rate": 1.0896475406562135e-07, "loss": 0.1911, "step": 2367 }, { "epoch": 4.847492323439099, "grad_norm": 0.27315088792468245, "learning_rate": 1.0600248770821886e-07, "loss": 0.1945, "step": 2368 }, { "epoch": 4.849539406345957, "grad_norm": 0.29508373388397574, "learning_rate": 1.0308093659643582e-07, "loss": 0.158, "step": 2369 }, { "epoch": 4.851586489252814, "grad_norm": 0.2790657492666217, "learning_rate": 1.0020010670954483e-07, "loss": 0.1798, "step": 2370 }, { "epoch": 4.853633572159673, "grad_norm": 0.2863828008417254, "learning_rate": 9.736000394348299e-08, "loss": 0.1688, "step": 2371 }, { "epoch": 4.85568065506653, "grad_norm": 0.2837857995206189, "learning_rate": 9.456063411082738e-08, "loss": 0.1818, "step": 2372 }, { "epoch": 4.857727737973388, "grad_norm": 0.26660018975441796, "learning_rate": 9.180200294079955e-08, "loss": 0.1681, "step": 2373 }, { "epoch": 4.859774820880245, "grad_norm": 0.30489283035917186, "learning_rate": 8.908411607923884e-08, "loss": 0.1724, "step": 2374 }, { "epoch": 4.861821903787103, "grad_norm": 0.27668845133173764, "learning_rate": 8.640697908859575e-08, "loss": 0.1871, "step": 2375 }, { "epoch": 4.863868986693961, "grad_norm": 0.27941980307619757, "learning_rate": 8.377059744792748e-08, "loss": 0.1875, "step": 2376 }, { "epoch": 4.865916069600819, "grad_norm": 0.2991779658323062, "learning_rate": 8.117497655287798e-08, "loss": 0.1607, "step": 2377 }, { "epoch": 4.867963152507676, "grad_norm": 0.2960605235453732, "learning_rate": 7.862012171566902e-08, "loss": 0.2025, "step": 2378 }, { "epoch": 4.870010235414535, "grad_norm": 0.2909596199436068, "learning_rate": 7.61060381650891e-08, "loss": 0.1976, "step": 2379 }, { "epoch": 4.872057318321392, "grad_norm": 0.3022247341372479, "learning_rate": 7.363273104648904e-08, "loss": 0.1873, "step": 2380 }, { "epoch": 4.87410440122825, "grad_norm": 0.27635276227359323, "learning_rate": 7.120020542176198e-08, "loss": 0.1815, "step": 2381 }, { "epoch": 4.876151484135107, "grad_norm": 0.3235414628513566, "learning_rate": 6.880846626933668e-08, "loss": 0.1526, "step": 2382 }, { "epoch": 4.878198567041965, "grad_norm": 0.3061161602814933, "learning_rate": 6.645751848417093e-08, "loss": 0.1672, "step": 2383 }, { "epoch": 4.880245649948823, "grad_norm": 0.2857967006307856, "learning_rate": 6.414736687773371e-08, "loss": 0.1662, "step": 2384 }, { "epoch": 4.882292732855681, "grad_norm": 0.27457282899975494, "learning_rate": 6.187801617800748e-08, "loss": 0.1564, "step": 2385 }, { "epoch": 4.884339815762538, "grad_norm": 0.2990684370251101, "learning_rate": 5.964947102946594e-08, "loss": 0.193, "step": 2386 }, { "epoch": 4.886386898669397, "grad_norm": 0.27143331999482234, "learning_rate": 5.746173599307181e-08, "loss": 0.172, "step": 2387 }, { "epoch": 4.888433981576254, "grad_norm": 0.2720324730535623, "learning_rate": 5.531481554626128e-08, "loss": 0.1466, "step": 2388 }, { "epoch": 4.890481064483112, "grad_norm": 0.29144216168390374, "learning_rate": 5.320871408294403e-08, "loss": 0.1622, "step": 2389 }, { "epoch": 4.892528147389969, "grad_norm": 0.27957166407177925, "learning_rate": 5.114343591348769e-08, "loss": 0.1744, "step": 2390 }, { "epoch": 4.894575230296827, "grad_norm": 0.2743372447650692, "learning_rate": 4.9118985264711147e-08, "loss": 0.1779, "step": 2391 }, { "epoch": 4.896622313203685, "grad_norm": 0.2864112361999076, "learning_rate": 4.713536627987347e-08, "loss": 0.1783, "step": 2392 }, { "epoch": 4.8986693961105425, "grad_norm": 0.2820613867682801, "learning_rate": 4.519258301866947e-08, "loss": 0.1764, "step": 2393 }, { "epoch": 4.9007164790174, "grad_norm": 0.2790446443208781, "learning_rate": 4.3290639457214125e-08, "loss": 0.1983, "step": 2394 }, { "epoch": 4.9027635619242576, "grad_norm": 0.3009212756955286, "learning_rate": 4.1429539488047066e-08, "loss": 0.1632, "step": 2395 }, { "epoch": 4.904810644831116, "grad_norm": 0.3048977634842906, "learning_rate": 3.960928692011257e-08, "loss": 0.1775, "step": 2396 }, { "epoch": 4.9068577277379735, "grad_norm": 0.2999700691804944, "learning_rate": 3.7829885478757324e-08, "loss": 0.1811, "step": 2397 }, { "epoch": 4.908904810644831, "grad_norm": 0.2867226352655605, "learning_rate": 3.6091338805719356e-08, "loss": 0.1749, "step": 2398 }, { "epoch": 4.9109518935516885, "grad_norm": 0.27853817331380126, "learning_rate": 3.439365045912801e-08, "loss": 0.1715, "step": 2399 }, { "epoch": 4.912998976458547, "grad_norm": 0.26671396491728905, "learning_rate": 3.273682391348398e-08, "loss": 0.1736, "step": 2400 }, { "epoch": 4.915046059365404, "grad_norm": 0.2981072341365868, "learning_rate": 3.1120862559670396e-08, "loss": 0.1608, "step": 2401 }, { "epoch": 4.917093142272262, "grad_norm": 0.3006161506174685, "learning_rate": 2.9545769704923954e-08, "loss": 0.174, "step": 2402 }, { "epoch": 4.919140225179119, "grad_norm": 0.2700128965804573, "learning_rate": 2.8011548572846047e-08, "loss": 0.1666, "step": 2403 }, { "epoch": 4.921187308085978, "grad_norm": 0.27640788721099124, "learning_rate": 2.651820230338942e-08, "loss": 0.1786, "step": 2404 }, { "epoch": 4.923234390992835, "grad_norm": 0.30195005723908613, "learning_rate": 2.50657339528515e-08, "loss": 0.1683, "step": 2405 }, { "epoch": 4.925281473899693, "grad_norm": 0.27803519346079886, "learning_rate": 2.365414649386555e-08, "loss": 0.2196, "step": 2406 }, { "epoch": 4.92732855680655, "grad_norm": 0.27873854091593087, "learning_rate": 2.2283442815402845e-08, "loss": 0.1772, "step": 2407 }, { "epoch": 4.929375639713409, "grad_norm": 0.31272442814047646, "learning_rate": 2.0953625722754943e-08, "loss": 0.1917, "step": 2408 }, { "epoch": 4.931422722620266, "grad_norm": 0.27074887610829274, "learning_rate": 1.9664697937542554e-08, "loss": 0.177, "step": 2409 }, { "epoch": 4.933469805527124, "grad_norm": 0.2809859827116871, "learning_rate": 1.8416662097693326e-08, "loss": 0.1735, "step": 2410 }, { "epoch": 4.935516888433981, "grad_norm": 0.2922562803880012, "learning_rate": 1.720952075745075e-08, "loss": 0.164, "step": 2411 }, { "epoch": 4.93756397134084, "grad_norm": 0.27866762770170544, "learning_rate": 1.604327638736525e-08, "loss": 0.1509, "step": 2412 }, { "epoch": 4.939611054247697, "grad_norm": 0.2757657750150201, "learning_rate": 1.491793137427866e-08, "loss": 0.1694, "step": 2413 }, { "epoch": 4.941658137154555, "grad_norm": 0.3690880615237033, "learning_rate": 1.3833488021335328e-08, "loss": 0.1459, "step": 2414 }, { "epoch": 4.943705220061412, "grad_norm": 0.28975968339512975, "learning_rate": 1.2789948547968779e-08, "loss": 0.1723, "step": 2415 }, { "epoch": 4.94575230296827, "grad_norm": 0.2676510309996025, "learning_rate": 1.1787315089895057e-08, "loss": 0.1587, "step": 2416 }, { "epoch": 4.947799385875128, "grad_norm": 0.29535503732393725, "learning_rate": 1.0825589699112737e-08, "loss": 0.1429, "step": 2417 }, { "epoch": 4.949846468781986, "grad_norm": 0.2851419625396727, "learning_rate": 9.904774343898471e-09, "loss": 0.1529, "step": 2418 }, { "epoch": 4.951893551688843, "grad_norm": 0.25772957787571077, "learning_rate": 9.024870908802552e-09, "loss": 0.187, "step": 2419 }, { "epoch": 4.9539406345957016, "grad_norm": 0.2900373956230706, "learning_rate": 8.185881194644474e-09, "loss": 0.1541, "step": 2420 }, { "epoch": 4.955987717502559, "grad_norm": 0.291463457270851, "learning_rate": 7.387806918508489e-09, "loss": 0.1585, "step": 2421 }, { "epoch": 4.958034800409417, "grad_norm": 0.2796933717518039, "learning_rate": 6.630649713739168e-09, "loss": 0.2045, "step": 2422 }, { "epoch": 4.960081883316274, "grad_norm": 0.2809689443794419, "learning_rate": 5.9144111299414e-09, "loss": 0.169, "step": 2423 }, { "epoch": 4.962128966223132, "grad_norm": 0.3074472408506574, "learning_rate": 5.239092632980391e-09, "loss": 0.1593, "step": 2424 }, { "epoch": 4.96417604912999, "grad_norm": 0.28551605159632193, "learning_rate": 4.6046956049639045e-09, "loss": 0.1518, "step": 2425 }, { "epoch": 4.9662231320368475, "grad_norm": 0.2889141054976907, "learning_rate": 4.011221344257799e-09, "loss": 0.1781, "step": 2426 }, { "epoch": 4.968270214943705, "grad_norm": 0.25486448368902137, "learning_rate": 3.4586710654727074e-09, "loss": 0.1688, "step": 2427 }, { "epoch": 4.970317297850563, "grad_norm": 0.3024410370860751, "learning_rate": 2.94704589946182e-09, "loss": 0.2105, "step": 2428 }, { "epoch": 4.972364380757421, "grad_norm": 0.29484058319269135, "learning_rate": 2.4763468933231005e-09, "loss": 0.178, "step": 2429 }, { "epoch": 4.974411463664278, "grad_norm": 0.28427681285996675, "learning_rate": 2.0465750103926263e-09, "loss": 0.2151, "step": 2430 }, { "epoch": 4.976458546571136, "grad_norm": 0.2758137593263903, "learning_rate": 1.657731130246809e-09, "loss": 0.1934, "step": 2431 }, { "epoch": 4.9785056294779935, "grad_norm": 0.3024473446528619, "learning_rate": 1.309816048697954e-09, "loss": 0.1732, "step": 2432 }, { "epoch": 4.980552712384852, "grad_norm": 0.26746264280387694, "learning_rate": 1.0028304777875975e-09, "loss": 0.1807, "step": 2433 }, { "epoch": 4.982599795291709, "grad_norm": 0.2689810857541332, "learning_rate": 7.367750458020518e-10, "loss": 0.1713, "step": 2434 }, { "epoch": 4.984646878198567, "grad_norm": 0.30444153746664904, "learning_rate": 5.116502972479787e-10, "loss": 0.172, "step": 2435 }, { "epoch": 4.986693961105424, "grad_norm": 0.2724209053996081, "learning_rate": 3.2745669287237435e-10, "loss": 0.1994, "step": 2436 }, { "epoch": 4.988741044012283, "grad_norm": 0.2666986989444724, "learning_rate": 1.8419460964258505e-10, "loss": 0.1735, "step": 2437 }, { "epoch": 4.99078812691914, "grad_norm": 0.2846154192735168, "learning_rate": 8.186434076185024e-11, "loss": 0.1539, "step": 2438 }, { "epoch": 4.992835209825998, "grad_norm": 0.28158014093768946, "learning_rate": 2.046609566264124e-11, "loss": 0.1756, "step": 2439 }, { "epoch": 4.994882292732855, "grad_norm": 0.3150094705291199, "learning_rate": 0.0, "loss": 0.2073, "step": 2440 }, { "epoch": 4.994882292732855, "step": 2440, "total_flos": 2619216084533248.0, "train_loss": 0.34357271391715183, "train_runtime": 45861.8772, "train_samples_per_second": 6.814, "train_steps_per_second": 0.053 } ], "logging_steps": 1, "max_steps": 2440, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2619216084533248.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }