diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4592 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.984202967927238, + "eval_steps": 500, + "global_step": 650, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007659167065581618, + "grad_norm": 5.921403529288989, + "learning_rate": 6.153846153846155e-07, + "loss": 0.9112, + "step": 1 + }, + { + "epoch": 0.015318334131163236, + "grad_norm": 6.161237327775742, + "learning_rate": 1.230769230769231e-06, + "loss": 0.9517, + "step": 2 + }, + { + "epoch": 0.022977501196744854, + "grad_norm": 6.058434736522049, + "learning_rate": 1.8461538461538465e-06, + "loss": 0.9295, + "step": 3 + }, + { + "epoch": 0.030636668262326472, + "grad_norm": 6.016130530129469, + "learning_rate": 2.461538461538462e-06, + "loss": 0.9235, + "step": 4 + }, + { + "epoch": 0.03829583532790809, + "grad_norm": 5.729487082326455, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.9035, + "step": 5 + }, + { + "epoch": 0.04595500239348971, + "grad_norm": 4.266411955281039, + "learning_rate": 3.692307692307693e-06, + "loss": 0.8544, + "step": 6 + }, + { + "epoch": 0.053614169459071326, + "grad_norm": 2.4685484993069466, + "learning_rate": 4.307692307692308e-06, + "loss": 0.8081, + "step": 7 + }, + { + "epoch": 0.061273336524652944, + "grad_norm": 2.2509637870140136, + "learning_rate": 4.923076923076924e-06, + "loss": 0.836, + "step": 8 + }, + { + "epoch": 0.06893250359023456, + "grad_norm": 3.6503885662831137, + "learning_rate": 5.538461538461539e-06, + "loss": 0.7788, + "step": 9 + }, + { + "epoch": 0.07659167065581618, + "grad_norm": 4.501975701703904, + "learning_rate": 6.153846153846155e-06, + "loss": 0.8309, + "step": 10 + }, + { + "epoch": 0.0842508377213978, + "grad_norm": 4.370930073810917, + "learning_rate": 6.76923076923077e-06, + "loss": 0.8224, + "step": 11 + }, + { + "epoch": 0.09191000478697942, + "grad_norm": 3.546183001564244, + "learning_rate": 7.384615384615386e-06, + "loss": 0.7941, + "step": 12 + }, + { + "epoch": 0.09956917185256103, + "grad_norm": 3.2331489380759555, + "learning_rate": 8.000000000000001e-06, + "loss": 0.7635, + "step": 13 + }, + { + "epoch": 0.10722833891814265, + "grad_norm": 2.5015396395106966, + "learning_rate": 8.615384615384617e-06, + "loss": 0.7327, + "step": 14 + }, + { + "epoch": 0.11488750598372427, + "grad_norm": 1.8441949964024125, + "learning_rate": 9.230769230769232e-06, + "loss": 0.7094, + "step": 15 + }, + { + "epoch": 0.12254667304930589, + "grad_norm": 1.732314176486123, + "learning_rate": 9.846153846153848e-06, + "loss": 0.6849, + "step": 16 + }, + { + "epoch": 0.1302058401148875, + "grad_norm": 1.8608940893991852, + "learning_rate": 1.0461538461538463e-05, + "loss": 0.6811, + "step": 17 + }, + { + "epoch": 0.13786500718046912, + "grad_norm": 1.8077531019190372, + "learning_rate": 1.1076923076923079e-05, + "loss": 0.6745, + "step": 18 + }, + { + "epoch": 0.14552417424605074, + "grad_norm": 1.2577918457624666, + "learning_rate": 1.1692307692307694e-05, + "loss": 0.6578, + "step": 19 + }, + { + "epoch": 0.15318334131163236, + "grad_norm": 1.289096433457239, + "learning_rate": 1.230769230769231e-05, + "loss": 0.6754, + "step": 20 + }, + { + "epoch": 0.16084250837721398, + "grad_norm": 1.4570529134788588, + "learning_rate": 1.2923076923076925e-05, + "loss": 0.6655, + "step": 21 + }, + { + "epoch": 0.1685016754427956, + "grad_norm": 1.1952396245104895, + "learning_rate": 1.353846153846154e-05, + "loss": 0.64, + "step": 22 + }, + { + "epoch": 0.1761608425083772, + "grad_norm": 1.0573908695788548, + "learning_rate": 1.4153846153846156e-05, + "loss": 0.6226, + "step": 23 + }, + { + "epoch": 0.18382000957395883, + "grad_norm": 1.0531289868486842, + "learning_rate": 1.4769230769230772e-05, + "loss": 0.5999, + "step": 24 + }, + { + "epoch": 0.19147917663954045, + "grad_norm": 0.8767976703429683, + "learning_rate": 1.5384615384615387e-05, + "loss": 0.6224, + "step": 25 + }, + { + "epoch": 0.19913834370512207, + "grad_norm": 1.0191023286379297, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.6382, + "step": 26 + }, + { + "epoch": 0.20679751077070369, + "grad_norm": 0.8038756547216995, + "learning_rate": 1.6615384615384618e-05, + "loss": 0.6216, + "step": 27 + }, + { + "epoch": 0.2144566778362853, + "grad_norm": 0.958933004581782, + "learning_rate": 1.7230769230769234e-05, + "loss": 0.6154, + "step": 28 + }, + { + "epoch": 0.22211584490186692, + "grad_norm": 0.9664195885794775, + "learning_rate": 1.784615384615385e-05, + "loss": 0.6128, + "step": 29 + }, + { + "epoch": 0.22977501196744854, + "grad_norm": 0.9729481103689919, + "learning_rate": 1.8461538461538465e-05, + "loss": 0.5937, + "step": 30 + }, + { + "epoch": 0.23743417903303016, + "grad_norm": 0.9246652793706185, + "learning_rate": 1.907692307692308e-05, + "loss": 0.6247, + "step": 31 + }, + { + "epoch": 0.24509334609861178, + "grad_norm": 0.9530006352752787, + "learning_rate": 1.9692307692307696e-05, + "loss": 0.5816, + "step": 32 + }, + { + "epoch": 0.2527525131641934, + "grad_norm": 0.8884934035836074, + "learning_rate": 2.0307692307692308e-05, + "loss": 0.6311, + "step": 33 + }, + { + "epoch": 0.260411680229775, + "grad_norm": 0.8458697710494548, + "learning_rate": 2.0923076923076927e-05, + "loss": 0.6026, + "step": 34 + }, + { + "epoch": 0.26807084729535663, + "grad_norm": 0.8241647035938812, + "learning_rate": 2.153846153846154e-05, + "loss": 0.5813, + "step": 35 + }, + { + "epoch": 0.27573001436093825, + "grad_norm": 0.6637335260284863, + "learning_rate": 2.2153846153846158e-05, + "loss": 0.5826, + "step": 36 + }, + { + "epoch": 0.28338918142651986, + "grad_norm": 0.9863751042251347, + "learning_rate": 2.276923076923077e-05, + "loss": 0.6061, + "step": 37 + }, + { + "epoch": 0.2910483484921015, + "grad_norm": 0.7725648338774505, + "learning_rate": 2.338461538461539e-05, + "loss": 0.5535, + "step": 38 + }, + { + "epoch": 0.2987075155576831, + "grad_norm": 0.9197682556877761, + "learning_rate": 2.4e-05, + "loss": 0.5891, + "step": 39 + }, + { + "epoch": 0.3063666826232647, + "grad_norm": 0.9877497296915406, + "learning_rate": 2.461538461538462e-05, + "loss": 0.575, + "step": 40 + }, + { + "epoch": 0.31402584968884634, + "grad_norm": 0.720541520642535, + "learning_rate": 2.523076923076923e-05, + "loss": 0.5779, + "step": 41 + }, + { + "epoch": 0.32168501675442795, + "grad_norm": 0.9225926491059669, + "learning_rate": 2.584615384615385e-05, + "loss": 0.5528, + "step": 42 + }, + { + "epoch": 0.3293441838200096, + "grad_norm": 0.8068833036305191, + "learning_rate": 2.6461538461538463e-05, + "loss": 0.5888, + "step": 43 + }, + { + "epoch": 0.3370033508855912, + "grad_norm": 0.957481417916251, + "learning_rate": 2.707692307692308e-05, + "loss": 0.5559, + "step": 44 + }, + { + "epoch": 0.3446625179511728, + "grad_norm": 0.78572658067857, + "learning_rate": 2.7692307692307694e-05, + "loss": 0.5612, + "step": 45 + }, + { + "epoch": 0.3523216850167544, + "grad_norm": 0.9209329658610131, + "learning_rate": 2.8307692307692312e-05, + "loss": 0.584, + "step": 46 + }, + { + "epoch": 0.35998085208233604, + "grad_norm": 1.044590316803605, + "learning_rate": 2.8923076923076925e-05, + "loss": 0.5804, + "step": 47 + }, + { + "epoch": 0.36764001914791766, + "grad_norm": 1.172192022892308, + "learning_rate": 2.9538461538461543e-05, + "loss": 0.5518, + "step": 48 + }, + { + "epoch": 0.3752991862134993, + "grad_norm": 1.0766200831878052, + "learning_rate": 3.0153846153846155e-05, + "loss": 0.5846, + "step": 49 + }, + { + "epoch": 0.3829583532790809, + "grad_norm": 1.1041677201522513, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.5487, + "step": 50 + }, + { + "epoch": 0.3906175203446625, + "grad_norm": 1.0469042529413854, + "learning_rate": 3.1384615384615386e-05, + "loss": 0.5395, + "step": 51 + }, + { + "epoch": 0.39827668741024413, + "grad_norm": 1.0791323635301755, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.5657, + "step": 52 + }, + { + "epoch": 0.40593585447582575, + "grad_norm": 0.8078486400788046, + "learning_rate": 3.261538461538462e-05, + "loss": 0.5399, + "step": 53 + }, + { + "epoch": 0.41359502154140737, + "grad_norm": 1.191539346367779, + "learning_rate": 3.3230769230769236e-05, + "loss": 0.5377, + "step": 54 + }, + { + "epoch": 0.421254188606989, + "grad_norm": 0.8698447542598498, + "learning_rate": 3.384615384615385e-05, + "loss": 0.5551, + "step": 55 + }, + { + "epoch": 0.4289133556725706, + "grad_norm": 1.1800967382678582, + "learning_rate": 3.446153846153847e-05, + "loss": 0.5702, + "step": 56 + }, + { + "epoch": 0.4365725227381522, + "grad_norm": 1.220687522421987, + "learning_rate": 3.507692307692308e-05, + "loss": 0.5552, + "step": 57 + }, + { + "epoch": 0.44423168980373384, + "grad_norm": 0.9041645568135938, + "learning_rate": 3.56923076923077e-05, + "loss": 0.5688, + "step": 58 + }, + { + "epoch": 0.45189085686931546, + "grad_norm": 1.5855877495279507, + "learning_rate": 3.630769230769231e-05, + "loss": 0.5641, + "step": 59 + }, + { + "epoch": 0.4595500239348971, + "grad_norm": 0.8216109564331809, + "learning_rate": 3.692307692307693e-05, + "loss": 0.5536, + "step": 60 + }, + { + "epoch": 0.4672091910004787, + "grad_norm": 1.5857353518922366, + "learning_rate": 3.753846153846154e-05, + "loss": 0.5497, + "step": 61 + }, + { + "epoch": 0.4748683580660603, + "grad_norm": 1.0645534816694728, + "learning_rate": 3.815384615384616e-05, + "loss": 0.5611, + "step": 62 + }, + { + "epoch": 0.48252752513164193, + "grad_norm": 1.8572686093355073, + "learning_rate": 3.876923076923077e-05, + "loss": 0.5703, + "step": 63 + }, + { + "epoch": 0.49018669219722355, + "grad_norm": 1.6301218272572524, + "learning_rate": 3.938461538461539e-05, + "loss": 0.5468, + "step": 64 + }, + { + "epoch": 0.49784585926280517, + "grad_norm": 1.1854602191040988, + "learning_rate": 4e-05, + "loss": 0.564, + "step": 65 + }, + { + "epoch": 0.5055050263283868, + "grad_norm": 1.5458244211037195, + "learning_rate": 3.999971160550277e-05, + "loss": 0.5677, + "step": 66 + }, + { + "epoch": 0.5131641933939685, + "grad_norm": 1.2355216531245246, + "learning_rate": 3.999884643032821e-05, + "loss": 0.5606, + "step": 67 + }, + { + "epoch": 0.52082336045955, + "grad_norm": 1.0712313677482361, + "learning_rate": 3.9997404499427494e-05, + "loss": 0.551, + "step": 68 + }, + { + "epoch": 0.5284825275251317, + "grad_norm": 1.4808309855361168, + "learning_rate": 3.9995385854385124e-05, + "loss": 0.542, + "step": 69 + }, + { + "epoch": 0.5361416945907133, + "grad_norm": 0.8373088986931322, + "learning_rate": 3.999279055341771e-05, + "loss": 0.5255, + "step": 70 + }, + { + "epoch": 0.5438008616562949, + "grad_norm": 1.683135657978014, + "learning_rate": 3.9989618671372304e-05, + "loss": 0.5578, + "step": 71 + }, + { + "epoch": 0.5514600287218765, + "grad_norm": 1.1420245332567331, + "learning_rate": 3.998587029972423e-05, + "loss": 0.5729, + "step": 72 + }, + { + "epoch": 0.5591191957874582, + "grad_norm": 1.6242190582145548, + "learning_rate": 3.998154554657448e-05, + "loss": 0.5621, + "step": 73 + }, + { + "epoch": 0.5667783628530397, + "grad_norm": 1.4494865534227146, + "learning_rate": 3.997664453664654e-05, + "loss": 0.539, + "step": 74 + }, + { + "epoch": 0.5744375299186214, + "grad_norm": 1.2416402505339934, + "learning_rate": 3.9971167411282835e-05, + "loss": 0.5451, + "step": 75 + }, + { + "epoch": 0.582096696984203, + "grad_norm": 1.6591387835608409, + "learning_rate": 3.996511432844067e-05, + "loss": 0.5649, + "step": 76 + }, + { + "epoch": 0.5897558640497846, + "grad_norm": 1.2982089198354367, + "learning_rate": 3.9958485462687606e-05, + "loss": 0.5518, + "step": 77 + }, + { + "epoch": 0.5974150311153662, + "grad_norm": 1.3123456841661971, + "learning_rate": 3.9951281005196486e-05, + "loss": 0.5514, + "step": 78 + }, + { + "epoch": 0.6050741981809479, + "grad_norm": 1.189744218079718, + "learning_rate": 3.994350116373991e-05, + "loss": 0.5408, + "step": 79 + }, + { + "epoch": 0.6127333652465294, + "grad_norm": 1.4721204904424363, + "learning_rate": 3.9935146162684206e-05, + "loss": 0.5507, + "step": 80 + }, + { + "epoch": 0.6203925323121111, + "grad_norm": 1.1623645549927197, + "learning_rate": 3.9926216242983017e-05, + "loss": 0.5569, + "step": 81 + }, + { + "epoch": 0.6280516993776927, + "grad_norm": 1.325416617911315, + "learning_rate": 3.991671166217031e-05, + "loss": 0.5312, + "step": 82 + }, + { + "epoch": 0.6357108664432743, + "grad_norm": 1.1877951056068266, + "learning_rate": 3.990663269435298e-05, + "loss": 0.5644, + "step": 83 + }, + { + "epoch": 0.6433700335088559, + "grad_norm": 1.2286620885326376, + "learning_rate": 3.989597963020289e-05, + "loss": 0.5756, + "step": 84 + }, + { + "epoch": 0.6510292005744376, + "grad_norm": 1.1060602604243368, + "learning_rate": 3.9884752776948564e-05, + "loss": 0.5311, + "step": 85 + }, + { + "epoch": 0.6586883676400191, + "grad_norm": 1.0698164056511321, + "learning_rate": 3.9872952458366267e-05, + "loss": 0.5574, + "step": 86 + }, + { + "epoch": 0.6663475347056008, + "grad_norm": 1.1820578926976948, + "learning_rate": 3.986057901477069e-05, + "loss": 0.5513, + "step": 87 + }, + { + "epoch": 0.6740067017711824, + "grad_norm": 0.8169595363079514, + "learning_rate": 3.984763280300514e-05, + "loss": 0.5483, + "step": 88 + }, + { + "epoch": 0.681665868836764, + "grad_norm": 1.379920176811677, + "learning_rate": 3.983411419643125e-05, + "loss": 0.5892, + "step": 89 + }, + { + "epoch": 0.6893250359023456, + "grad_norm": 0.9681470926974101, + "learning_rate": 3.982002358491817e-05, + "loss": 0.5342, + "step": 90 + }, + { + "epoch": 0.6969842029679273, + "grad_norm": 1.119318426323717, + "learning_rate": 3.980536137483141e-05, + "loss": 0.5552, + "step": 91 + }, + { + "epoch": 0.7046433700335089, + "grad_norm": 1.4254561226777611, + "learning_rate": 3.9790127989021024e-05, + "loss": 0.5336, + "step": 92 + }, + { + "epoch": 0.7123025370990905, + "grad_norm": 1.0062543492174385, + "learning_rate": 3.9774323866809485e-05, + "loss": 0.5601, + "step": 93 + }, + { + "epoch": 0.7199617041646721, + "grad_norm": 1.705345700534379, + "learning_rate": 3.9757949463978975e-05, + "loss": 0.5659, + "step": 94 + }, + { + "epoch": 0.7276208712302538, + "grad_norm": 1.1181524317220572, + "learning_rate": 3.9741005252758255e-05, + "loss": 0.5204, + "step": 95 + }, + { + "epoch": 0.7352800382958353, + "grad_norm": 1.6689670685483973, + "learning_rate": 3.9723491721809076e-05, + "loss": 0.5498, + "step": 96 + }, + { + "epoch": 0.742939205361417, + "grad_norm": 1.4295009254142887, + "learning_rate": 3.970540937621201e-05, + "loss": 0.565, + "step": 97 + }, + { + "epoch": 0.7505983724269986, + "grad_norm": 1.4548186946761517, + "learning_rate": 3.9686758737451955e-05, + "loss": 0.55, + "step": 98 + }, + { + "epoch": 0.7582575394925802, + "grad_norm": 1.213694031644175, + "learning_rate": 3.966754034340308e-05, + "loss": 0.5348, + "step": 99 + }, + { + "epoch": 0.7659167065581618, + "grad_norm": 1.0580342997486925, + "learning_rate": 3.9647754748313294e-05, + "loss": 0.5242, + "step": 100 + }, + { + "epoch": 0.7735758736237435, + "grad_norm": 1.0821077174340752, + "learning_rate": 3.962740252278827e-05, + "loss": 0.542, + "step": 101 + }, + { + "epoch": 0.781235040689325, + "grad_norm": 1.2618844489347851, + "learning_rate": 3.960648425377499e-05, + "loss": 0.5465, + "step": 102 + }, + { + "epoch": 0.7888942077549067, + "grad_norm": 0.9576457209595916, + "learning_rate": 3.958500054454482e-05, + "loss": 0.5206, + "step": 103 + }, + { + "epoch": 0.7965533748204883, + "grad_norm": 1.6879319110317588, + "learning_rate": 3.9562952014676116e-05, + "loss": 0.5554, + "step": 104 + }, + { + "epoch": 0.8042125418860699, + "grad_norm": 1.3708595612461836, + "learning_rate": 3.954033930003634e-05, + "loss": 0.5745, + "step": 105 + }, + { + "epoch": 0.8118717089516515, + "grad_norm": 1.2986325728462578, + "learning_rate": 3.9517163052763756e-05, + "loss": 0.5413, + "step": 106 + }, + { + "epoch": 0.8195308760172332, + "grad_norm": 1.1532883579115862, + "learning_rate": 3.9493423941248564e-05, + "loss": 0.5367, + "step": 107 + }, + { + "epoch": 0.8271900430828147, + "grad_norm": 1.0800362940814041, + "learning_rate": 3.946912265011368e-05, + "loss": 0.5643, + "step": 108 + }, + { + "epoch": 0.8348492101483964, + "grad_norm": 1.0542424383185787, + "learning_rate": 3.944425988019498e-05, + "loss": 0.5434, + "step": 109 + }, + { + "epoch": 0.842508377213978, + "grad_norm": 0.9223839325666725, + "learning_rate": 3.9418836348521045e-05, + "loss": 0.5313, + "step": 110 + }, + { + "epoch": 0.8501675442795597, + "grad_norm": 0.9786970098640274, + "learning_rate": 3.9392852788292556e-05, + "loss": 0.5257, + "step": 111 + }, + { + "epoch": 0.8578267113451412, + "grad_norm": 0.8883004076978934, + "learning_rate": 3.936630994886109e-05, + "loss": 0.5315, + "step": 112 + }, + { + "epoch": 0.8654858784107229, + "grad_norm": 0.9210396895221461, + "learning_rate": 3.933920859570753e-05, + "loss": 0.5332, + "step": 113 + }, + { + "epoch": 0.8731450454763044, + "grad_norm": 1.0300033230225123, + "learning_rate": 3.931154951041998e-05, + "loss": 0.5364, + "step": 114 + }, + { + "epoch": 0.8808042125418861, + "grad_norm": 1.07072986303729, + "learning_rate": 3.928333349067125e-05, + "loss": 0.5581, + "step": 115 + }, + { + "epoch": 0.8884633796074677, + "grad_norm": 0.7856322271677167, + "learning_rate": 3.925456135019582e-05, + "loss": 0.5153, + "step": 116 + }, + { + "epoch": 0.8961225466730494, + "grad_norm": 1.3208925243839957, + "learning_rate": 3.922523391876638e-05, + "loss": 0.5397, + "step": 117 + }, + { + "epoch": 0.9037817137386309, + "grad_norm": 1.0321781910738392, + "learning_rate": 3.9195352042169924e-05, + "loss": 0.5434, + "step": 118 + }, + { + "epoch": 0.9114408808042126, + "grad_norm": 1.2263516987444574, + "learning_rate": 3.916491658218333e-05, + "loss": 0.5412, + "step": 119 + }, + { + "epoch": 0.9191000478697942, + "grad_norm": 1.0824446357123727, + "learning_rate": 3.913392841654851e-05, + "loss": 0.5345, + "step": 120 + }, + { + "epoch": 0.9267592149353758, + "grad_norm": 0.9492587544198882, + "learning_rate": 3.9102388438947104e-05, + "loss": 0.5374, + "step": 121 + }, + { + "epoch": 0.9344183820009574, + "grad_norm": 1.200654956363631, + "learning_rate": 3.907029755897473e-05, + "loss": 0.528, + "step": 122 + }, + { + "epoch": 0.9420775490665391, + "grad_norm": 0.8940904112935483, + "learning_rate": 3.903765670211469e-05, + "loss": 0.5432, + "step": 123 + }, + { + "epoch": 0.9497367161321206, + "grad_norm": 1.513206646677525, + "learning_rate": 3.9004466809711343e-05, + "loss": 0.55, + "step": 124 + }, + { + "epoch": 0.9573958831977023, + "grad_norm": 0.9181110535142029, + "learning_rate": 3.897072883894291e-05, + "loss": 0.5322, + "step": 125 + }, + { + "epoch": 0.9650550502632839, + "grad_norm": 1.493130538955249, + "learning_rate": 3.893644376279392e-05, + "loss": 0.5118, + "step": 126 + }, + { + "epoch": 0.9727142173288655, + "grad_norm": 1.199837488596111, + "learning_rate": 3.89016125700271e-05, + "loss": 0.5279, + "step": 127 + }, + { + "epoch": 0.9803733843944471, + "grad_norm": 1.7015944265286984, + "learning_rate": 3.8866236265154864e-05, + "loss": 0.5464, + "step": 128 + }, + { + "epoch": 0.9880325514600288, + "grad_norm": 1.4588454807007443, + "learning_rate": 3.88303158684104e-05, + "loss": 0.5544, + "step": 129 + }, + { + "epoch": 0.9956917185256103, + "grad_norm": 1.4514845707429012, + "learning_rate": 3.879385241571817e-05, + "loss": 0.5686, + "step": 130 + }, + { + "epoch": 1.0047869794159885, + "grad_norm": 2.2360445683138384, + "learning_rate": 3.875684695866409e-05, + "loss": 0.8604, + "step": 131 + }, + { + "epoch": 1.0124461464815702, + "grad_norm": 1.085982951452546, + "learning_rate": 3.871930056446518e-05, + "loss": 0.4109, + "step": 132 + }, + { + "epoch": 1.0201053135471518, + "grad_norm": 0.9862931565568734, + "learning_rate": 3.8681214315938786e-05, + "loss": 0.475, + "step": 133 + }, + { + "epoch": 1.0277644806127333, + "grad_norm": 1.33743772292278, + "learning_rate": 3.864258931147136e-05, + "loss": 0.4306, + "step": 134 + }, + { + "epoch": 1.035423647678315, + "grad_norm": 1.0921525549828976, + "learning_rate": 3.860342666498677e-05, + "loss": 0.4848, + "step": 135 + }, + { + "epoch": 1.0430828147438966, + "grad_norm": 1.2645353404433375, + "learning_rate": 3.856372750591419e-05, + "loss": 0.4485, + "step": 136 + }, + { + "epoch": 1.0507419818094783, + "grad_norm": 1.1984966128303223, + "learning_rate": 3.8523492979155534e-05, + "loss": 0.4808, + "step": 137 + }, + { + "epoch": 1.0584011488750598, + "grad_norm": 1.0500948118376512, + "learning_rate": 3.84827242450524e-05, + "loss": 0.4881, + "step": 138 + }, + { + "epoch": 1.0660603159406414, + "grad_norm": 0.746032471414876, + "learning_rate": 3.844142247935265e-05, + "loss": 0.4028, + "step": 139 + }, + { + "epoch": 1.073719483006223, + "grad_norm": 0.9623091350387484, + "learning_rate": 3.839958887317649e-05, + "loss": 0.4715, + "step": 140 + }, + { + "epoch": 1.0813786500718048, + "grad_norm": 0.7201641889161347, + "learning_rate": 3.835722463298208e-05, + "loss": 0.4606, + "step": 141 + }, + { + "epoch": 1.0890378171373862, + "grad_norm": 0.8943620531687253, + "learning_rate": 3.831433098053082e-05, + "loss": 0.4712, + "step": 142 + }, + { + "epoch": 1.096696984202968, + "grad_norm": 0.7046882034899636, + "learning_rate": 3.827090915285202e-05, + "loss": 0.4458, + "step": 143 + }, + { + "epoch": 1.1043561512685496, + "grad_norm": 0.6548369768890453, + "learning_rate": 3.8226960402207316e-05, + "loss": 0.3904, + "step": 144 + }, + { + "epoch": 1.1120153183341313, + "grad_norm": 0.8530160058379846, + "learning_rate": 3.818248599605448e-05, + "loss": 0.546, + "step": 145 + }, + { + "epoch": 1.1196744853997127, + "grad_norm": 0.8081896528154348, + "learning_rate": 3.813748721701091e-05, + "loss": 0.4651, + "step": 146 + }, + { + "epoch": 1.1273336524652944, + "grad_norm": 0.7600777048606053, + "learning_rate": 3.809196536281665e-05, + "loss": 0.3961, + "step": 147 + }, + { + "epoch": 1.134992819530876, + "grad_norm": 0.8170599343972957, + "learning_rate": 3.80459217462969e-05, + "loss": 0.4859, + "step": 148 + }, + { + "epoch": 1.1426519865964577, + "grad_norm": 0.5981126204379706, + "learning_rate": 3.799935769532425e-05, + "loss": 0.4112, + "step": 149 + }, + { + "epoch": 1.1503111536620392, + "grad_norm": 0.6918899342824092, + "learning_rate": 3.795227455278029e-05, + "loss": 0.445, + "step": 150 + }, + { + "epoch": 1.1579703207276208, + "grad_norm": 0.7615214743583981, + "learning_rate": 3.790467367651694e-05, + "loss": 0.5105, + "step": 151 + }, + { + "epoch": 1.1656294877932025, + "grad_norm": 0.5625993273766481, + "learning_rate": 3.785655643931728e-05, + "loss": 0.3587, + "step": 152 + }, + { + "epoch": 1.1732886548587842, + "grad_norm": 0.9402358775980364, + "learning_rate": 3.780792422885597e-05, + "loss": 0.5268, + "step": 153 + }, + { + "epoch": 1.1809478219243656, + "grad_norm": 1.0131842602903653, + "learning_rate": 3.7758778447659184e-05, + "loss": 0.4696, + "step": 154 + }, + { + "epoch": 1.1886069889899473, + "grad_norm": 0.7498386211674599, + "learning_rate": 3.7709120513064196e-05, + "loss": 0.4471, + "step": 155 + }, + { + "epoch": 1.196266156055529, + "grad_norm": 0.7182193922598371, + "learning_rate": 3.7658951857178544e-05, + "loss": 0.4344, + "step": 156 + }, + { + "epoch": 1.2039253231211107, + "grad_norm": 0.6854353371665008, + "learning_rate": 3.760827392683863e-05, + "loss": 0.4195, + "step": 157 + }, + { + "epoch": 1.2115844901866921, + "grad_norm": 1.1427693762223539, + "learning_rate": 3.755708818356809e-05, + "loss": 0.5111, + "step": 158 + }, + { + "epoch": 1.2192436572522738, + "grad_norm": 0.6625293413220111, + "learning_rate": 3.75053961035356e-05, + "loss": 0.4288, + "step": 159 + }, + { + "epoch": 1.2269028243178555, + "grad_norm": 0.9123675189917038, + "learning_rate": 3.745319917751229e-05, + "loss": 0.4656, + "step": 160 + }, + { + "epoch": 1.2345619913834371, + "grad_norm": 0.7341125406296108, + "learning_rate": 3.740049891082879e-05, + "loss": 0.425, + "step": 161 + }, + { + "epoch": 1.2422211584490186, + "grad_norm": 0.8631406479236612, + "learning_rate": 3.734729682333179e-05, + "loss": 0.4911, + "step": 162 + }, + { + "epoch": 1.2498803255146003, + "grad_norm": 0.7496345502817138, + "learning_rate": 3.729359444934022e-05, + "loss": 0.3874, + "step": 163 + }, + { + "epoch": 1.257539492580182, + "grad_norm": 0.7894969941658941, + "learning_rate": 3.723939333760099e-05, + "loss": 0.4578, + "step": 164 + }, + { + "epoch": 1.2651986596457636, + "grad_norm": 0.9223288305001206, + "learning_rate": 3.718469505124434e-05, + "loss": 0.523, + "step": 165 + }, + { + "epoch": 1.272857826711345, + "grad_norm": 0.7992363424810383, + "learning_rate": 3.712950116773875e-05, + "loss": 0.46, + "step": 166 + }, + { + "epoch": 1.2805169937769267, + "grad_norm": 0.8478230607138995, + "learning_rate": 3.707381327884545e-05, + "loss": 0.4749, + "step": 167 + }, + { + "epoch": 1.2881761608425084, + "grad_norm": 0.5454929496812748, + "learning_rate": 3.70176329905725e-05, + "loss": 0.4463, + "step": 168 + }, + { + "epoch": 1.2958353279080899, + "grad_norm": 0.750962890324772, + "learning_rate": 3.696096192312852e-05, + "loss": 0.4562, + "step": 169 + }, + { + "epoch": 1.3034944949736715, + "grad_norm": 0.6464014219567544, + "learning_rate": 3.69038017108759e-05, + "loss": 0.458, + "step": 170 + }, + { + "epoch": 1.3111536620392532, + "grad_norm": 0.6483521349636168, + "learning_rate": 3.6846154002283696e-05, + "loss": 0.4709, + "step": 171 + }, + { + "epoch": 1.3188128291048349, + "grad_norm": 0.7936957658021746, + "learning_rate": 3.678802045988012e-05, + "loss": 0.4581, + "step": 172 + }, + { + "epoch": 1.3264719961704166, + "grad_norm": 0.9116293942129717, + "learning_rate": 3.6729402760204535e-05, + "loss": 0.4367, + "step": 173 + }, + { + "epoch": 1.334131163235998, + "grad_norm": 0.8432492215433733, + "learning_rate": 3.667030259375915e-05, + "loss": 0.5148, + "step": 174 + }, + { + "epoch": 1.3417903303015797, + "grad_norm": 0.7438248026517662, + "learning_rate": 3.6610721664960236e-05, + "loss": 0.4653, + "step": 175 + }, + { + "epoch": 1.3494494973671614, + "grad_norm": 0.6889892205036577, + "learning_rate": 3.6550661692089e-05, + "loss": 0.4231, + "step": 176 + }, + { + "epoch": 1.3571086644327428, + "grad_norm": 0.7784374117922759, + "learning_rate": 3.6490124407242007e-05, + "loss": 0.5052, + "step": 177 + }, + { + "epoch": 1.3647678314983245, + "grad_norm": 0.8790420908019635, + "learning_rate": 3.642911155628124e-05, + "loss": 0.4415, + "step": 178 + }, + { + "epoch": 1.3724269985639062, + "grad_norm": 0.7174987210830999, + "learning_rate": 3.636762489878374e-05, + "loss": 0.4421, + "step": 179 + }, + { + "epoch": 1.3800861656294878, + "grad_norm": 0.8020599752892646, + "learning_rate": 3.6305666207990886e-05, + "loss": 0.4863, + "step": 180 + }, + { + "epoch": 1.3877453326950695, + "grad_norm": 0.7802012248909737, + "learning_rate": 3.624323727075723e-05, + "loss": 0.4426, + "step": 181 + }, + { + "epoch": 1.395404499760651, + "grad_norm": 0.671987752263627, + "learning_rate": 3.6180339887498953e-05, + "loss": 0.4077, + "step": 182 + }, + { + "epoch": 1.4030636668262326, + "grad_norm": 0.9587427415971348, + "learning_rate": 3.6116975872141984e-05, + "loss": 0.5101, + "step": 183 + }, + { + "epoch": 1.4107228338918143, + "grad_norm": 0.9118291287887855, + "learning_rate": 3.605314705206966e-05, + "loss": 0.4823, + "step": 184 + }, + { + "epoch": 1.4183820009573957, + "grad_norm": 0.8693454740841087, + "learning_rate": 3.598885526807003e-05, + "loss": 0.4712, + "step": 185 + }, + { + "epoch": 1.4260411680229774, + "grad_norm": 0.7409071346511643, + "learning_rate": 3.5924102374282754e-05, + "loss": 0.4422, + "step": 186 + }, + { + "epoch": 1.433700335088559, + "grad_norm": 0.9303869755499337, + "learning_rate": 3.5858890238145674e-05, + "loss": 0.4207, + "step": 187 + }, + { + "epoch": 1.4413595021541408, + "grad_norm": 0.852020974421498, + "learning_rate": 3.5793220740340904e-05, + "loss": 0.4927, + "step": 188 + }, + { + "epoch": 1.4490186692197224, + "grad_norm": 0.7715289615754756, + "learning_rate": 3.572709577474062e-05, + "loss": 0.4546, + "step": 189 + }, + { + "epoch": 1.456677836285304, + "grad_norm": 0.7648123067206034, + "learning_rate": 3.566051724835245e-05, + "loss": 0.4775, + "step": 190 + }, + { + "epoch": 1.4643370033508856, + "grad_norm": 0.6233854224617866, + "learning_rate": 3.559348708126445e-05, + "loss": 0.4428, + "step": 191 + }, + { + "epoch": 1.4719961704164672, + "grad_norm": 0.7266153094875722, + "learning_rate": 3.552600720658976e-05, + "loss": 0.4775, + "step": 192 + }, + { + "epoch": 1.4796553374820487, + "grad_norm": 0.6034805115192634, + "learning_rate": 3.545807957041084e-05, + "loss": 0.4353, + "step": 193 + }, + { + "epoch": 1.4873145045476304, + "grad_norm": 0.733938783383156, + "learning_rate": 3.538970613172332e-05, + "loss": 0.492, + "step": 194 + }, + { + "epoch": 1.494973671613212, + "grad_norm": 0.6587084782665951, + "learning_rate": 3.532088886237956e-05, + "loss": 0.4471, + "step": 195 + }, + { + "epoch": 1.5026328386787937, + "grad_norm": 0.7660206322795097, + "learning_rate": 3.525162974703174e-05, + "loss": 0.4278, + "step": 196 + }, + { + "epoch": 1.5102920057443754, + "grad_norm": 0.6137621338422651, + "learning_rate": 3.518193078307463e-05, + "loss": 0.4325, + "step": 197 + }, + { + "epoch": 1.517951172809957, + "grad_norm": 0.5768603184148955, + "learning_rate": 3.5111793980588006e-05, + "loss": 0.4529, + "step": 198 + }, + { + "epoch": 1.5256103398755385, + "grad_norm": 0.7888259317826345, + "learning_rate": 3.5041221362278644e-05, + "loss": 0.44, + "step": 199 + }, + { + "epoch": 1.5332695069411202, + "grad_norm": 0.5650383799794519, + "learning_rate": 3.497021496342203e-05, + "loss": 0.4585, + "step": 200 + }, + { + "epoch": 1.5409286740067016, + "grad_norm": 0.705765230853562, + "learning_rate": 3.489877683180362e-05, + "loss": 0.4616, + "step": 201 + }, + { + "epoch": 1.5485878410722833, + "grad_norm": 0.6873526187506311, + "learning_rate": 3.482690902765984e-05, + "loss": 0.4433, + "step": 202 + }, + { + "epoch": 1.556247008137865, + "grad_norm": 0.8007803368306625, + "learning_rate": 3.475461362361861e-05, + "loss": 0.5263, + "step": 203 + }, + { + "epoch": 1.5639061752034467, + "grad_norm": 0.6918325743424758, + "learning_rate": 3.468189270463959e-05, + "loss": 0.4325, + "step": 204 + }, + { + "epoch": 1.5715653422690283, + "grad_norm": 0.5677928436218839, + "learning_rate": 3.4608748367954064e-05, + "loss": 0.4654, + "step": 205 + }, + { + "epoch": 1.57922450933461, + "grad_norm": 0.7751869494065315, + "learning_rate": 3.4535182723004466e-05, + "loss": 0.461, + "step": 206 + }, + { + "epoch": 1.5868836764001915, + "grad_norm": 0.5725206135864617, + "learning_rate": 3.446119789138351e-05, + "loss": 0.4952, + "step": 207 + }, + { + "epoch": 1.5945428434657731, + "grad_norm": 0.7296776133168584, + "learning_rate": 3.438679600677303e-05, + "loss": 0.4313, + "step": 208 + }, + { + "epoch": 1.6022020105313546, + "grad_norm": 0.587167588865063, + "learning_rate": 3.431197921488242e-05, + "loss": 0.433, + "step": 209 + }, + { + "epoch": 1.6098611775969363, + "grad_norm": 0.8056341147184974, + "learning_rate": 3.423674967338681e-05, + "loss": 0.4514, + "step": 210 + }, + { + "epoch": 1.617520344662518, + "grad_norm": 0.7051970461447443, + "learning_rate": 3.416110955186477e-05, + "loss": 0.5431, + "step": 211 + }, + { + "epoch": 1.6251795117280996, + "grad_norm": 0.5607663565934765, + "learning_rate": 3.4085061031735794e-05, + "loss": 0.4408, + "step": 212 + }, + { + "epoch": 1.6328386787936813, + "grad_norm": 0.8246463721021458, + "learning_rate": 3.4008606306197336e-05, + "loss": 0.4864, + "step": 213 + }, + { + "epoch": 1.640497845859263, + "grad_norm": 0.6500034356382017, + "learning_rate": 3.393174758016161e-05, + "loss": 0.4215, + "step": 214 + }, + { + "epoch": 1.6481570129248444, + "grad_norm": 0.7062452977875294, + "learning_rate": 3.385448707019199e-05, + "loss": 0.4731, + "step": 215 + }, + { + "epoch": 1.655816179990426, + "grad_norm": 0.5942221349239608, + "learning_rate": 3.377682700443907e-05, + "loss": 0.4164, + "step": 216 + }, + { + "epoch": 1.6634753470560075, + "grad_norm": 0.7905876229344655, + "learning_rate": 3.3698769622576404e-05, + "loss": 0.5147, + "step": 217 + }, + { + "epoch": 1.6711345141215892, + "grad_norm": 0.5859170654910125, + "learning_rate": 3.3620317175735945e-05, + "loss": 0.4275, + "step": 218 + }, + { + "epoch": 1.6787936811871709, + "grad_norm": 0.6690808146840328, + "learning_rate": 3.3541471926443084e-05, + "loss": 0.4789, + "step": 219 + }, + { + "epoch": 1.6864528482527525, + "grad_norm": 0.6561200573249233, + "learning_rate": 3.34622361485514e-05, + "loss": 0.4991, + "step": 220 + }, + { + "epoch": 1.6941120153183342, + "grad_norm": 0.5562256326390935, + "learning_rate": 3.3382612127177166e-05, + "loss": 0.4343, + "step": 221 + }, + { + "epoch": 1.701771182383916, + "grad_norm": 0.7170323098042807, + "learning_rate": 3.330260215863332e-05, + "loss": 0.4486, + "step": 222 + }, + { + "epoch": 1.7094303494494973, + "grad_norm": 0.6014886240988541, + "learning_rate": 3.322220855036333e-05, + "loss": 0.4957, + "step": 223 + }, + { + "epoch": 1.717089516515079, + "grad_norm": 0.5278357930923132, + "learning_rate": 3.314143362087462e-05, + "loss": 0.442, + "step": 224 + }, + { + "epoch": 1.7247486835806605, + "grad_norm": 0.6035858404605409, + "learning_rate": 3.30602796996717e-05, + "loss": 0.4506, + "step": 225 + }, + { + "epoch": 1.7324078506462421, + "grad_norm": 0.6363716300860364, + "learning_rate": 3.297874912718902e-05, + "loss": 0.4663, + "step": 226 + }, + { + "epoch": 1.7400670177118238, + "grad_norm": 0.47737502299597623, + "learning_rate": 3.2896844254723414e-05, + "loss": 0.418, + "step": 227 + }, + { + "epoch": 1.7477261847774055, + "grad_norm": 0.8067471733349434, + "learning_rate": 3.281456744436634e-05, + "loss": 0.4957, + "step": 228 + }, + { + "epoch": 1.7553853518429872, + "grad_norm": 0.6089284826032468, + "learning_rate": 3.273192106893572e-05, + "loss": 0.4418, + "step": 229 + }, + { + "epoch": 1.7630445189085688, + "grad_norm": 0.5962202659179022, + "learning_rate": 3.2648907511907544e-05, + "loss": 0.4669, + "step": 230 + }, + { + "epoch": 1.7707036859741503, + "grad_norm": 0.5728200467314639, + "learning_rate": 3.256552916734713e-05, + "loss": 0.4678, + "step": 231 + }, + { + "epoch": 1.778362853039732, + "grad_norm": 0.541716837396919, + "learning_rate": 3.248178843984006e-05, + "loss": 0.3918, + "step": 232 + }, + { + "epoch": 1.7860220201053134, + "grad_norm": 0.6534786961268279, + "learning_rate": 3.239768774442281e-05, + "loss": 0.4922, + "step": 233 + }, + { + "epoch": 1.793681187170895, + "grad_norm": 0.8358591875149838, + "learning_rate": 3.2313229506513167e-05, + "loss": 0.4902, + "step": 234 + }, + { + "epoch": 1.8013403542364768, + "grad_norm": 0.591595847556571, + "learning_rate": 3.222841616184025e-05, + "loss": 0.4138, + "step": 235 + }, + { + "epoch": 1.8089995213020584, + "grad_norm": 0.6277241932218716, + "learning_rate": 3.2143250156374226e-05, + "loss": 0.4474, + "step": 236 + }, + { + "epoch": 1.81665868836764, + "grad_norm": 0.7278025713494423, + "learning_rate": 3.2057733946255844e-05, + "loss": 0.4709, + "step": 237 + }, + { + "epoch": 1.8243178554332218, + "grad_norm": 0.6410859956043007, + "learning_rate": 3.197186999772555e-05, + "loss": 0.4583, + "step": 238 + }, + { + "epoch": 1.8319770224988032, + "grad_norm": 0.6757146265800846, + "learning_rate": 3.188566078705235e-05, + "loss": 0.4704, + "step": 239 + }, + { + "epoch": 1.839636189564385, + "grad_norm": 0.7572042816661453, + "learning_rate": 3.1799108800462466e-05, + "loss": 0.4119, + "step": 240 + }, + { + "epoch": 1.8472953566299664, + "grad_norm": 0.590906796662085, + "learning_rate": 3.1712216534067536e-05, + "loss": 0.47, + "step": 241 + }, + { + "epoch": 1.854954523695548, + "grad_norm": 0.8241586342465295, + "learning_rate": 3.1624986493792735e-05, + "loss": 0.4624, + "step": 242 + }, + { + "epoch": 1.8626136907611297, + "grad_norm": 0.5604470218245483, + "learning_rate": 3.153742119530441e-05, + "loss": 0.4831, + "step": 243 + }, + { + "epoch": 1.8702728578267114, + "grad_norm": 0.8158955643638057, + "learning_rate": 3.144952316393758e-05, + "loss": 0.4204, + "step": 244 + }, + { + "epoch": 1.877932024892293, + "grad_norm": 0.5828232072086235, + "learning_rate": 3.136129493462312e-05, + "loss": 0.4646, + "step": 245 + }, + { + "epoch": 1.8855911919578747, + "grad_norm": 0.7063142478848118, + "learning_rate": 3.1272739051814594e-05, + "loss": 0.4576, + "step": 246 + }, + { + "epoch": 1.8932503590234562, + "grad_norm": 0.6560131266274781, + "learning_rate": 3.1183858069414936e-05, + "loss": 0.4537, + "step": 247 + }, + { + "epoch": 1.9009095260890378, + "grad_norm": 0.5709968540583878, + "learning_rate": 3.109465455070278e-05, + "loss": 0.4559, + "step": 248 + }, + { + "epoch": 1.9085686931546193, + "grad_norm": 0.6268646820645886, + "learning_rate": 3.1005131068258506e-05, + "loss": 0.3999, + "step": 249 + }, + { + "epoch": 1.916227860220201, + "grad_norm": 0.48362971083345346, + "learning_rate": 3.091529020389009e-05, + "loss": 0.4246, + "step": 250 + }, + { + "epoch": 1.9238870272857826, + "grad_norm": 0.6630981220865457, + "learning_rate": 3.082513454855863e-05, + "loss": 0.5091, + "step": 251 + }, + { + "epoch": 1.9315461943513643, + "grad_norm": 0.670450600935426, + "learning_rate": 3.073466670230361e-05, + "loss": 0.3812, + "step": 252 + }, + { + "epoch": 1.939205361416946, + "grad_norm": 0.552225236659151, + "learning_rate": 3.0643889274167926e-05, + "loss": 0.4504, + "step": 253 + }, + { + "epoch": 1.9468645284825277, + "grad_norm": 0.570779364405288, + "learning_rate": 3.055280488212266e-05, + "loss": 0.4444, + "step": 254 + }, + { + "epoch": 1.9545236955481091, + "grad_norm": 0.5961792678109743, + "learning_rate": 3.0461416152991555e-05, + "loss": 0.4763, + "step": 255 + }, + { + "epoch": 1.9621828626136908, + "grad_norm": 0.5968250026211483, + "learning_rate": 3.0369725722375274e-05, + "loss": 0.4754, + "step": 256 + }, + { + "epoch": 1.9698420296792722, + "grad_norm": 0.649954877349215, + "learning_rate": 3.0277736234575378e-05, + "loss": 0.4345, + "step": 257 + }, + { + "epoch": 1.977501196744854, + "grad_norm": 0.6643346772869196, + "learning_rate": 3.0185450342518075e-05, + "loss": 0.4532, + "step": 258 + }, + { + "epoch": 1.9851603638104356, + "grad_norm": 0.47332581564622644, + "learning_rate": 3.009287070767771e-05, + "loss": 0.4881, + "step": 259 + }, + { + "epoch": 1.9928195308760173, + "grad_norm": 0.5015320907864502, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.4242, + "step": 260 + }, + { + "epoch": 2.0019147917663953, + "grad_norm": 0.8833235426774388, + "learning_rate": 2.990684089782507e-05, + "loss": 0.6896, + "step": 261 + }, + { + "epoch": 2.009573958831977, + "grad_norm": 0.8871508493864276, + "learning_rate": 2.9813396087810134e-05, + "loss": 0.3589, + "step": 262 + }, + { + "epoch": 2.0172331258975587, + "grad_norm": 1.0765160264598035, + "learning_rate": 2.971966826485212e-05, + "loss": 0.3823, + "step": 263 + }, + { + "epoch": 2.0248922929631403, + "grad_norm": 1.2741275889363188, + "learning_rate": 2.962566013200986e-05, + "loss": 0.3751, + "step": 264 + }, + { + "epoch": 2.032551460028722, + "grad_norm": 0.7162387782791261, + "learning_rate": 2.9531374400426158e-05, + "loss": 0.3232, + "step": 265 + }, + { + "epoch": 2.0402106270943037, + "grad_norm": 0.972447687697029, + "learning_rate": 2.943681378924964e-05, + "loss": 0.3447, + "step": 266 + }, + { + "epoch": 2.047869794159885, + "grad_norm": 0.7472088308314776, + "learning_rate": 2.934198102555631e-05, + "loss": 0.3502, + "step": 267 + }, + { + "epoch": 2.0555289612254666, + "grad_norm": 0.9904131833727945, + "learning_rate": 2.924687884427087e-05, + "loss": 0.3328, + "step": 268 + }, + { + "epoch": 2.0631881282910483, + "grad_norm": 0.8268717893678047, + "learning_rate": 2.9151509988087912e-05, + "loss": 0.323, + "step": 269 + }, + { + "epoch": 2.07084729535663, + "grad_norm": 1.0133251834372121, + "learning_rate": 2.9055877207392752e-05, + "loss": 0.3314, + "step": 270 + }, + { + "epoch": 2.0785064624222116, + "grad_norm": 0.9295212140684717, + "learning_rate": 2.8959983260182166e-05, + "loss": 0.3433, + "step": 271 + }, + { + "epoch": 2.0861656294877933, + "grad_norm": 0.7493423793907141, + "learning_rate": 2.886383091198483e-05, + "loss": 0.3174, + "step": 272 + }, + { + "epoch": 2.093824796553375, + "grad_norm": 1.1449594495047748, + "learning_rate": 2.876742293578155e-05, + "loss": 0.4027, + "step": 273 + }, + { + "epoch": 2.1014839636189566, + "grad_norm": 0.8495410462496846, + "learning_rate": 2.8670762111925313e-05, + "loss": 0.3762, + "step": 274 + }, + { + "epoch": 2.109143130684538, + "grad_norm": 0.7356070365983308, + "learning_rate": 2.8573851228061084e-05, + "loss": 0.3116, + "step": 275 + }, + { + "epoch": 2.1168022977501195, + "grad_norm": 0.7959446709248563, + "learning_rate": 2.8476693079045432e-05, + "loss": 0.3379, + "step": 276 + }, + { + "epoch": 2.124461464815701, + "grad_norm": 0.8100378872218936, + "learning_rate": 2.8379290466865906e-05, + "loss": 0.3305, + "step": 277 + }, + { + "epoch": 2.132120631881283, + "grad_norm": 0.8085735074400692, + "learning_rate": 2.828164620056024e-05, + "loss": 0.3598, + "step": 278 + }, + { + "epoch": 2.1397797989468645, + "grad_norm": 0.6344360858538197, + "learning_rate": 2.818376309613535e-05, + "loss": 0.302, + "step": 279 + }, + { + "epoch": 2.147438966012446, + "grad_norm": 0.57409154364413, + "learning_rate": 2.80856439764861e-05, + "loss": 0.3686, + "step": 280 + }, + { + "epoch": 2.155098133078028, + "grad_norm": 0.5845504651635646, + "learning_rate": 2.798729167131391e-05, + "loss": 0.3526, + "step": 281 + }, + { + "epoch": 2.1627573001436096, + "grad_norm": 0.4776801469885686, + "learning_rate": 2.7888709017045146e-05, + "loss": 0.3326, + "step": 282 + }, + { + "epoch": 2.170416467209191, + "grad_norm": 0.48684077839230894, + "learning_rate": 2.7789898856749297e-05, + "loss": 0.3342, + "step": 283 + }, + { + "epoch": 2.1780756342747725, + "grad_norm": 0.49773085707991915, + "learning_rate": 2.7690864040057023e-05, + "loss": 0.3051, + "step": 284 + }, + { + "epoch": 2.185734801340354, + "grad_norm": 0.4525197933418408, + "learning_rate": 2.7591607423077932e-05, + "loss": 0.3087, + "step": 285 + }, + { + "epoch": 2.193393968405936, + "grad_norm": 0.5639448536870705, + "learning_rate": 2.7492131868318247e-05, + "loss": 0.3503, + "step": 286 + }, + { + "epoch": 2.2010531354715175, + "grad_norm": 0.5607287965131402, + "learning_rate": 2.739244024459822e-05, + "loss": 0.3654, + "step": 287 + }, + { + "epoch": 2.208712302537099, + "grad_norm": 0.4515128468379325, + "learning_rate": 2.7292535426969436e-05, + "loss": 0.3149, + "step": 288 + }, + { + "epoch": 2.216371469602681, + "grad_norm": 0.6011149807368861, + "learning_rate": 2.7192420296631835e-05, + "loss": 0.3911, + "step": 289 + }, + { + "epoch": 2.2240306366682625, + "grad_norm": 0.4554658307562261, + "learning_rate": 2.7092097740850712e-05, + "loss": 0.3068, + "step": 290 + }, + { + "epoch": 2.231689803733844, + "grad_norm": 0.5520804650288836, + "learning_rate": 2.6991570652873357e-05, + "loss": 0.3681, + "step": 291 + }, + { + "epoch": 2.2393489707994254, + "grad_norm": 0.48661650666410344, + "learning_rate": 2.6890841931845674e-05, + "loss": 0.3022, + "step": 292 + }, + { + "epoch": 2.247008137865007, + "grad_norm": 0.5416624941038123, + "learning_rate": 2.6789914482728546e-05, + "loss": 0.359, + "step": 293 + }, + { + "epoch": 2.2546673049305888, + "grad_norm": 0.582685239343103, + "learning_rate": 2.6688791216214064e-05, + "loss": 0.3693, + "step": 294 + }, + { + "epoch": 2.2623264719961704, + "grad_norm": 0.4568416200895835, + "learning_rate": 2.6587475048641596e-05, + "loss": 0.32, + "step": 295 + }, + { + "epoch": 2.269985639061752, + "grad_norm": 0.6028222179058447, + "learning_rate": 2.6485968901913658e-05, + "loss": 0.3726, + "step": 296 + }, + { + "epoch": 2.2776448061273338, + "grad_norm": 0.5497203840683457, + "learning_rate": 2.6384275703411666e-05, + "loss": 0.3584, + "step": 297 + }, + { + "epoch": 2.2853039731929154, + "grad_norm": 0.4601068271433548, + "learning_rate": 2.6282398385911503e-05, + "loss": 0.3263, + "step": 298 + }, + { + "epoch": 2.2929631402584967, + "grad_norm": 0.5843782768173683, + "learning_rate": 2.618033988749895e-05, + "loss": 0.3854, + "step": 299 + }, + { + "epoch": 2.3006223073240784, + "grad_norm": 0.4668138387272922, + "learning_rate": 2.607810315148494e-05, + "loss": 0.3445, + "step": 300 + }, + { + "epoch": 2.30828147438966, + "grad_norm": 0.4966561419101829, + "learning_rate": 2.5975691126320678e-05, + "loss": 0.3486, + "step": 301 + }, + { + "epoch": 2.3159406414552417, + "grad_norm": 0.552195612826531, + "learning_rate": 2.587310676551262e-05, + "loss": 0.3478, + "step": 302 + }, + { + "epoch": 2.3235998085208234, + "grad_norm": 0.49951171616233425, + "learning_rate": 2.5770353027537276e-05, + "loss": 0.3494, + "step": 303 + }, + { + "epoch": 2.331258975586405, + "grad_norm": 0.4487518241170282, + "learning_rate": 2.5667432875755904e-05, + "loss": 0.3267, + "step": 304 + }, + { + "epoch": 2.3389181426519867, + "grad_norm": 0.47306289952945507, + "learning_rate": 2.5564349278329056e-05, + "loss": 0.3246, + "step": 305 + }, + { + "epoch": 2.3465773097175684, + "grad_norm": 0.4974553141384758, + "learning_rate": 2.5461105208130953e-05, + "loss": 0.2918, + "step": 306 + }, + { + "epoch": 2.35423647678315, + "grad_norm": 0.4809251554335802, + "learning_rate": 2.5357703642663766e-05, + "loss": 0.357, + "step": 307 + }, + { + "epoch": 2.3618956438487313, + "grad_norm": 0.5596662616004654, + "learning_rate": 2.525414756397174e-05, + "loss": 0.3287, + "step": 308 + }, + { + "epoch": 2.369554810914313, + "grad_norm": 0.46194307500162646, + "learning_rate": 2.5150439958555205e-05, + "loss": 0.323, + "step": 309 + }, + { + "epoch": 2.3772139779798946, + "grad_norm": 0.5354975145364615, + "learning_rate": 2.5046583817284437e-05, + "loss": 0.3641, + "step": 310 + }, + { + "epoch": 2.3848731450454763, + "grad_norm": 0.5273653633413932, + "learning_rate": 2.4942582135313393e-05, + "loss": 0.3553, + "step": 311 + }, + { + "epoch": 2.392532312111058, + "grad_norm": 0.49817920339367244, + "learning_rate": 2.4838437911993355e-05, + "loss": 0.3455, + "step": 312 + }, + { + "epoch": 2.4001914791766397, + "grad_norm": 0.515170607443925, + "learning_rate": 2.473415415078642e-05, + "loss": 0.3619, + "step": 313 + }, + { + "epoch": 2.4078506462422213, + "grad_norm": 0.4926042303192558, + "learning_rate": 2.4629733859178867e-05, + "loss": 0.3485, + "step": 314 + }, + { + "epoch": 2.4155098133078026, + "grad_norm": 0.42905408480479645, + "learning_rate": 2.4525180048594452e-05, + "loss": 0.3057, + "step": 315 + }, + { + "epoch": 2.4231689803733842, + "grad_norm": 0.4867004151742306, + "learning_rate": 2.4420495734307527e-05, + "loss": 0.3747, + "step": 316 + }, + { + "epoch": 2.430828147438966, + "grad_norm": 0.41338427949555484, + "learning_rate": 2.4315683935356127e-05, + "loss": 0.3475, + "step": 317 + }, + { + "epoch": 2.4384873145045476, + "grad_norm": 0.44888986646854223, + "learning_rate": 2.421074767445485e-05, + "loss": 0.303, + "step": 318 + }, + { + "epoch": 2.4461464815701293, + "grad_norm": 0.5491552096723435, + "learning_rate": 2.4105689977907722e-05, + "loss": 0.3979, + "step": 319 + }, + { + "epoch": 2.453805648635711, + "grad_norm": 0.4927516444394549, + "learning_rate": 2.4000513875520892e-05, + "loss": 0.3407, + "step": 320 + }, + { + "epoch": 2.4614648157012926, + "grad_norm": 0.466811974445456, + "learning_rate": 2.3895222400515282e-05, + "loss": 0.3425, + "step": 321 + }, + { + "epoch": 2.4691239827668743, + "grad_norm": 0.47821175171844904, + "learning_rate": 2.3789818589439094e-05, + "loss": 0.3098, + "step": 322 + }, + { + "epoch": 2.476783149832456, + "grad_norm": 0.4912242879477096, + "learning_rate": 2.3684305482080233e-05, + "loss": 0.3591, + "step": 323 + }, + { + "epoch": 2.484442316898037, + "grad_norm": 0.4811573112209665, + "learning_rate": 2.357868612137866e-05, + "loss": 0.3773, + "step": 324 + }, + { + "epoch": 2.492101483963619, + "grad_norm": 0.4870566155608247, + "learning_rate": 2.3472963553338614e-05, + "loss": 0.3228, + "step": 325 + }, + { + "epoch": 2.4997606510292005, + "grad_norm": 0.5310392825279873, + "learning_rate": 2.3367140826940768e-05, + "loss": 0.3546, + "step": 326 + }, + { + "epoch": 2.507419818094782, + "grad_norm": 0.5483105250492419, + "learning_rate": 2.326122099405435e-05, + "loss": 0.3375, + "step": 327 + }, + { + "epoch": 2.515078985160364, + "grad_norm": 0.5003667793105148, + "learning_rate": 2.315520710934903e-05, + "loss": 0.3687, + "step": 328 + }, + { + "epoch": 2.5227381522259456, + "grad_norm": 0.533844789442384, + "learning_rate": 2.304910223020691e-05, + "loss": 0.3548, + "step": 329 + }, + { + "epoch": 2.5303973192915272, + "grad_norm": 0.5149064035903343, + "learning_rate": 2.2942909416634326e-05, + "loss": 0.3446, + "step": 330 + }, + { + "epoch": 2.5380564863571085, + "grad_norm": 0.4151169766555542, + "learning_rate": 2.2836631731173577e-05, + "loss": 0.3281, + "step": 331 + }, + { + "epoch": 2.54571565342269, + "grad_norm": 0.5156566437281681, + "learning_rate": 2.2730272238814636e-05, + "loss": 0.3479, + "step": 332 + }, + { + "epoch": 2.553374820488272, + "grad_norm": 0.536538863268432, + "learning_rate": 2.2623834006906732e-05, + "loss": 0.3468, + "step": 333 + }, + { + "epoch": 2.5610339875538535, + "grad_norm": 0.41537339391659595, + "learning_rate": 2.25173201050699e-05, + "loss": 0.311, + "step": 334 + }, + { + "epoch": 2.568693154619435, + "grad_norm": 0.5021990261148753, + "learning_rate": 2.2410733605106462e-05, + "loss": 0.3803, + "step": 335 + }, + { + "epoch": 2.576352321685017, + "grad_norm": 0.39621064090295866, + "learning_rate": 2.2304077580912423e-05, + "loss": 0.2936, + "step": 336 + }, + { + "epoch": 2.5840114887505985, + "grad_norm": 0.5372701642790138, + "learning_rate": 2.2197355108388835e-05, + "loss": 0.3184, + "step": 337 + }, + { + "epoch": 2.5916706558161797, + "grad_norm": 0.41383864714124363, + "learning_rate": 2.209056926535307e-05, + "loss": 0.3287, + "step": 338 + }, + { + "epoch": 2.599329822881762, + "grad_norm": 0.5114947611293079, + "learning_rate": 2.1983723131450088e-05, + "loss": 0.3683, + "step": 339 + }, + { + "epoch": 2.606988989947343, + "grad_norm": 0.4173762346729875, + "learning_rate": 2.1876819788063586e-05, + "loss": 0.3086, + "step": 340 + }, + { + "epoch": 2.6146481570129247, + "grad_norm": 0.5192212848345572, + "learning_rate": 2.176986231822717e-05, + "loss": 0.375, + "step": 341 + }, + { + "epoch": 2.6223073240785064, + "grad_norm": 0.5669533936852463, + "learning_rate": 2.166285380653541e-05, + "loss": 0.3432, + "step": 342 + }, + { + "epoch": 2.629966491144088, + "grad_norm": 0.4671450099921592, + "learning_rate": 2.1555797339054898e-05, + "loss": 0.3207, + "step": 343 + }, + { + "epoch": 2.6376256582096698, + "grad_norm": 0.5035698279484611, + "learning_rate": 2.1448696003235252e-05, + "loss": 0.3586, + "step": 344 + }, + { + "epoch": 2.6452848252752514, + "grad_norm": 0.49482528595990216, + "learning_rate": 2.1341552887820048e-05, + "loss": 0.3419, + "step": 345 + }, + { + "epoch": 2.652943992340833, + "grad_norm": 0.5124832493661444, + "learning_rate": 2.123437108275779e-05, + "loss": 0.3605, + "step": 346 + }, + { + "epoch": 2.6606031594064143, + "grad_norm": 0.3916326803311756, + "learning_rate": 2.112715367911275e-05, + "loss": 0.3102, + "step": 347 + }, + { + "epoch": 2.668262326471996, + "grad_norm": 0.41173103844867237, + "learning_rate": 2.1019903768975852e-05, + "loss": 0.3645, + "step": 348 + }, + { + "epoch": 2.6759214935375777, + "grad_norm": 0.48815468984239224, + "learning_rate": 2.0912624445375483e-05, + "loss": 0.3641, + "step": 349 + }, + { + "epoch": 2.6835806606031594, + "grad_norm": 0.40031092411352465, + "learning_rate": 2.0805318802188307e-05, + "loss": 0.3233, + "step": 350 + }, + { + "epoch": 2.691239827668741, + "grad_norm": 0.4801007606225588, + "learning_rate": 2.0697989934050025e-05, + "loss": 0.3492, + "step": 351 + }, + { + "epoch": 2.6988989947343227, + "grad_norm": 0.465959760136513, + "learning_rate": 2.0590640936266132e-05, + "loss": 0.3492, + "step": 352 + }, + { + "epoch": 2.7065581617999044, + "grad_norm": 0.432367828293715, + "learning_rate": 2.0483274904722647e-05, + "loss": 0.3478, + "step": 353 + }, + { + "epoch": 2.7142173288654856, + "grad_norm": 0.4510586932875561, + "learning_rate": 2.037589493579685e-05, + "loss": 0.3553, + "step": 354 + }, + { + "epoch": 2.7218764959310677, + "grad_norm": 0.467950300228232, + "learning_rate": 2.0268504126267952e-05, + "loss": 0.3653, + "step": 355 + }, + { + "epoch": 2.729535662996649, + "grad_norm": 0.5009397300600292, + "learning_rate": 2.0161105573227798e-05, + "loss": 0.36, + "step": 356 + }, + { + "epoch": 2.7371948300622306, + "grad_norm": 0.49408159207397456, + "learning_rate": 2.005370237399157e-05, + "loss": 0.3515, + "step": 357 + }, + { + "epoch": 2.7448539971278123, + "grad_norm": 0.35262515813271533, + "learning_rate": 1.9946297626008432e-05, + "loss": 0.2689, + "step": 358 + }, + { + "epoch": 2.752513164193394, + "grad_norm": 0.4827674543765931, + "learning_rate": 1.9838894426772205e-05, + "loss": 0.3706, + "step": 359 + }, + { + "epoch": 2.7601723312589757, + "grad_norm": 0.3780483362953392, + "learning_rate": 1.9731495873732055e-05, + "loss": 0.3135, + "step": 360 + }, + { + "epoch": 2.7678314983245573, + "grad_norm": 0.3811396663153698, + "learning_rate": 1.9624105064203157e-05, + "loss": 0.3576, + "step": 361 + }, + { + "epoch": 2.775490665390139, + "grad_norm": 0.45864928956878004, + "learning_rate": 1.951672509527736e-05, + "loss": 0.3321, + "step": 362 + }, + { + "epoch": 2.7831498324557202, + "grad_norm": 0.4507638328421475, + "learning_rate": 1.940935906373388e-05, + "loss": 0.3715, + "step": 363 + }, + { + "epoch": 2.790808999521302, + "grad_norm": 0.43134511596968567, + "learning_rate": 1.930201006594999e-05, + "loss": 0.3412, + "step": 364 + }, + { + "epoch": 2.7984681665868836, + "grad_norm": 0.52057087403904, + "learning_rate": 1.9194681197811703e-05, + "loss": 0.3849, + "step": 365 + }, + { + "epoch": 2.8061273336524652, + "grad_norm": 0.37380617464599, + "learning_rate": 1.9087375554624527e-05, + "loss": 0.3072, + "step": 366 + }, + { + "epoch": 2.813786500718047, + "grad_norm": 0.3665120285057917, + "learning_rate": 1.898009623102415e-05, + "loss": 0.3261, + "step": 367 + }, + { + "epoch": 2.8214456677836286, + "grad_norm": 0.45855248957112604, + "learning_rate": 1.887284632088725e-05, + "loss": 0.4029, + "step": 368 + }, + { + "epoch": 2.8291048348492103, + "grad_norm": 0.42598192016232783, + "learning_rate": 1.8765628917242213e-05, + "loss": 0.3588, + "step": 369 + }, + { + "epoch": 2.8367640019147915, + "grad_norm": 0.4155845765749022, + "learning_rate": 1.8658447112179952e-05, + "loss": 0.3039, + "step": 370 + }, + { + "epoch": 2.8444231689803736, + "grad_norm": 0.4601308365256824, + "learning_rate": 1.8551303996764755e-05, + "loss": 0.3836, + "step": 371 + }, + { + "epoch": 2.852082336045955, + "grad_norm": 0.4914229018003968, + "learning_rate": 1.8444202660945105e-05, + "loss": 0.3679, + "step": 372 + }, + { + "epoch": 2.8597415031115365, + "grad_norm": 0.46287822342404056, + "learning_rate": 1.8337146193464595e-05, + "loss": 0.288, + "step": 373 + }, + { + "epoch": 2.867400670177118, + "grad_norm": 0.49183098229163497, + "learning_rate": 1.8230137681772836e-05, + "loss": 0.3741, + "step": 374 + }, + { + "epoch": 2.8750598372427, + "grad_norm": 0.40911104249164226, + "learning_rate": 1.8123180211936417e-05, + "loss": 0.3425, + "step": 375 + }, + { + "epoch": 2.8827190043082815, + "grad_norm": 0.4656657585294656, + "learning_rate": 1.801627686854992e-05, + "loss": 0.3436, + "step": 376 + }, + { + "epoch": 2.890378171373863, + "grad_norm": 0.3955542489042361, + "learning_rate": 1.7909430734646936e-05, + "loss": 0.3142, + "step": 377 + }, + { + "epoch": 2.898037338439445, + "grad_norm": 0.3919921799832172, + "learning_rate": 1.780264489161117e-05, + "loss": 0.327, + "step": 378 + }, + { + "epoch": 2.905696505505026, + "grad_norm": 0.40565680454375597, + "learning_rate": 1.769592241908758e-05, + "loss": 0.3469, + "step": 379 + }, + { + "epoch": 2.913355672570608, + "grad_norm": 0.4612451296187882, + "learning_rate": 1.758926639489354e-05, + "loss": 0.398, + "step": 380 + }, + { + "epoch": 2.9210148396361895, + "grad_norm": 0.374323134300942, + "learning_rate": 1.748267989493011e-05, + "loss": 0.3077, + "step": 381 + }, + { + "epoch": 2.928674006701771, + "grad_norm": 0.447528401086559, + "learning_rate": 1.7376165993093278e-05, + "loss": 0.3791, + "step": 382 + }, + { + "epoch": 2.936333173767353, + "grad_norm": 0.3684289290106699, + "learning_rate": 1.7269727761185374e-05, + "loss": 0.3203, + "step": 383 + }, + { + "epoch": 2.9439923408329345, + "grad_norm": 0.3713011538957981, + "learning_rate": 1.7163368268826433e-05, + "loss": 0.3247, + "step": 384 + }, + { + "epoch": 2.951651507898516, + "grad_norm": 0.4288353235268095, + "learning_rate": 1.7057090583365678e-05, + "loss": 0.3714, + "step": 385 + }, + { + "epoch": 2.9593106749640974, + "grad_norm": 0.35222146097913287, + "learning_rate": 1.6950897769793093e-05, + "loss": 0.2943, + "step": 386 + }, + { + "epoch": 2.9669698420296795, + "grad_norm": 0.38643823068088784, + "learning_rate": 1.6844792890650976e-05, + "loss": 0.3443, + "step": 387 + }, + { + "epoch": 2.9746290090952607, + "grad_norm": 0.38938210740713053, + "learning_rate": 1.673877900594566e-05, + "loss": 0.3399, + "step": 388 + }, + { + "epoch": 2.9822881761608424, + "grad_norm": 0.43159107146354114, + "learning_rate": 1.6632859173059232e-05, + "loss": 0.344, + "step": 389 + }, + { + "epoch": 2.989947343226424, + "grad_norm": 0.38084254373242143, + "learning_rate": 1.6527036446661396e-05, + "loss": 0.3455, + "step": 390 + }, + { + "epoch": 2.9976065102920058, + "grad_norm": 0.5938797732009524, + "learning_rate": 1.6421313878621344e-05, + "loss": 0.4699, + "step": 391 + }, + { + "epoch": 3.006701771182384, + "grad_norm": 0.6492454140958951, + "learning_rate": 1.631569451791977e-05, + "loss": 0.3313, + "step": 392 + }, + { + "epoch": 3.0143609382479655, + "grad_norm": 0.4219633362824025, + "learning_rate": 1.6210181410560912e-05, + "loss": 0.2324, + "step": 393 + }, + { + "epoch": 3.022020105313547, + "grad_norm": 0.5182168578367924, + "learning_rate": 1.610477759948472e-05, + "loss": 0.245, + "step": 394 + }, + { + "epoch": 3.029679272379129, + "grad_norm": 0.6377564861463809, + "learning_rate": 1.5999486124479115e-05, + "loss": 0.2515, + "step": 395 + }, + { + "epoch": 3.0373384394447105, + "grad_norm": 0.5098950195718823, + "learning_rate": 1.5894310022092288e-05, + "loss": 0.2364, + "step": 396 + }, + { + "epoch": 3.044997606510292, + "grad_norm": 0.5168454095271555, + "learning_rate": 1.5789252325545157e-05, + "loss": 0.2334, + "step": 397 + }, + { + "epoch": 3.052656773575874, + "grad_norm": 0.5240320586315429, + "learning_rate": 1.568431606464388e-05, + "loss": 0.2675, + "step": 398 + }, + { + "epoch": 3.060315940641455, + "grad_norm": 0.44117429526871266, + "learning_rate": 1.557950426569248e-05, + "loss": 0.2315, + "step": 399 + }, + { + "epoch": 3.0679751077070367, + "grad_norm": 0.48765308835045107, + "learning_rate": 1.547481995140556e-05, + "loss": 0.2437, + "step": 400 + }, + { + "epoch": 3.0756342747726184, + "grad_norm": 0.4520916642343294, + "learning_rate": 1.5370266140821143e-05, + "loss": 0.2341, + "step": 401 + }, + { + "epoch": 3.0832934418382, + "grad_norm": 0.4647186437171434, + "learning_rate": 1.5265845849213588e-05, + "loss": 0.2468, + "step": 402 + }, + { + "epoch": 3.0909526089037818, + "grad_norm": 0.46140225100955573, + "learning_rate": 1.5161562088006649e-05, + "loss": 0.2345, + "step": 403 + }, + { + "epoch": 3.0986117759693634, + "grad_norm": 0.4382412999538287, + "learning_rate": 1.5057417864686607e-05, + "loss": 0.2386, + "step": 404 + }, + { + "epoch": 3.106270943034945, + "grad_norm": 0.45590994359507603, + "learning_rate": 1.4953416182715566e-05, + "loss": 0.2443, + "step": 405 + }, + { + "epoch": 3.113930110100527, + "grad_norm": 0.44632890994369173, + "learning_rate": 1.4849560041444795e-05, + "loss": 0.2216, + "step": 406 + }, + { + "epoch": 3.121589277166108, + "grad_norm": 0.3690202710694795, + "learning_rate": 1.4745852436028262e-05, + "loss": 0.2289, + "step": 407 + }, + { + "epoch": 3.1292484442316897, + "grad_norm": 0.3987565576025537, + "learning_rate": 1.464229635733624e-05, + "loss": 0.2097, + "step": 408 + }, + { + "epoch": 3.1369076112972714, + "grad_norm": 0.3922785950430538, + "learning_rate": 1.4538894791869052e-05, + "loss": 0.2205, + "step": 409 + }, + { + "epoch": 3.144566778362853, + "grad_norm": 0.4019487118753477, + "learning_rate": 1.443565072167095e-05, + "loss": 0.2536, + "step": 410 + }, + { + "epoch": 3.1522259454284347, + "grad_norm": 0.37368100647725233, + "learning_rate": 1.43325671242441e-05, + "loss": 0.2514, + "step": 411 + }, + { + "epoch": 3.1598851124940164, + "grad_norm": 0.3884417771257955, + "learning_rate": 1.4229646972462732e-05, + "loss": 0.2456, + "step": 412 + }, + { + "epoch": 3.167544279559598, + "grad_norm": 0.3767796290523846, + "learning_rate": 1.412689323448739e-05, + "loss": 0.2237, + "step": 413 + }, + { + "epoch": 3.1752034466251793, + "grad_norm": 0.3674660595771459, + "learning_rate": 1.4024308873679327e-05, + "loss": 0.2591, + "step": 414 + }, + { + "epoch": 3.182862613690761, + "grad_norm": 0.3572633760098233, + "learning_rate": 1.3921896848515064e-05, + "loss": 0.2163, + "step": 415 + }, + { + "epoch": 3.1905217807563426, + "grad_norm": 0.381550127572975, + "learning_rate": 1.3819660112501054e-05, + "loss": 0.2442, + "step": 416 + }, + { + "epoch": 3.1981809478219243, + "grad_norm": 0.37631597527744115, + "learning_rate": 1.37176016140885e-05, + "loss": 0.2323, + "step": 417 + }, + { + "epoch": 3.205840114887506, + "grad_norm": 0.35956143405057833, + "learning_rate": 1.3615724296588342e-05, + "loss": 0.2044, + "step": 418 + }, + { + "epoch": 3.2134992819530876, + "grad_norm": 0.38268492786329705, + "learning_rate": 1.3514031098086349e-05, + "loss": 0.2402, + "step": 419 + }, + { + "epoch": 3.2211584490186693, + "grad_norm": 0.38994977473417125, + "learning_rate": 1.341252495135841e-05, + "loss": 0.2319, + "step": 420 + }, + { + "epoch": 3.228817616084251, + "grad_norm": 0.35677922752266494, + "learning_rate": 1.3311208783785945e-05, + "loss": 0.2341, + "step": 421 + }, + { + "epoch": 3.2364767831498327, + "grad_norm": 0.37528548994142574, + "learning_rate": 1.3210085517271459e-05, + "loss": 0.25, + "step": 422 + }, + { + "epoch": 3.244135950215414, + "grad_norm": 0.382737915829752, + "learning_rate": 1.3109158068154329e-05, + "loss": 0.2501, + "step": 423 + }, + { + "epoch": 3.2517951172809956, + "grad_norm": 0.3540547128374988, + "learning_rate": 1.3008429347126641e-05, + "loss": 0.2482, + "step": 424 + }, + { + "epoch": 3.2594542843465772, + "grad_norm": 0.3474058116435946, + "learning_rate": 1.2907902259149287e-05, + "loss": 0.2359, + "step": 425 + }, + { + "epoch": 3.267113451412159, + "grad_norm": 0.3903729184634132, + "learning_rate": 1.2807579703368162e-05, + "loss": 0.2122, + "step": 426 + }, + { + "epoch": 3.2747726184777406, + "grad_norm": 0.40188660683701266, + "learning_rate": 1.2707464573030572e-05, + "loss": 0.2884, + "step": 427 + }, + { + "epoch": 3.2824317855433223, + "grad_norm": 0.3471695792330223, + "learning_rate": 1.260755975540178e-05, + "loss": 0.217, + "step": 428 + }, + { + "epoch": 3.290090952608904, + "grad_norm": 0.36589717504907115, + "learning_rate": 1.250786813168176e-05, + "loss": 0.2379, + "step": 429 + }, + { + "epoch": 3.297750119674485, + "grad_norm": 0.3688472677382392, + "learning_rate": 1.2408392576922075e-05, + "loss": 0.2317, + "step": 430 + }, + { + "epoch": 3.305409286740067, + "grad_norm": 0.3822664640855971, + "learning_rate": 1.2309135959942986e-05, + "loss": 0.2376, + "step": 431 + }, + { + "epoch": 3.3130684538056485, + "grad_norm": 0.35469891768647493, + "learning_rate": 1.2210101143250708e-05, + "loss": 0.2183, + "step": 432 + }, + { + "epoch": 3.32072762087123, + "grad_norm": 0.3609442874261945, + "learning_rate": 1.211129098295486e-05, + "loss": 0.2372, + "step": 433 + }, + { + "epoch": 3.328386787936812, + "grad_norm": 0.40218147421255007, + "learning_rate": 1.2012708328686093e-05, + "loss": 0.2722, + "step": 434 + }, + { + "epoch": 3.3360459550023935, + "grad_norm": 0.34284386522492444, + "learning_rate": 1.1914356023513904e-05, + "loss": 0.2249, + "step": 435 + }, + { + "epoch": 3.343705122067975, + "grad_norm": 0.3398937321234149, + "learning_rate": 1.1816236903864656e-05, + "loss": 0.2314, + "step": 436 + }, + { + "epoch": 3.351364289133557, + "grad_norm": 0.3645220719375267, + "learning_rate": 1.1718353799439766e-05, + "loss": 0.2415, + "step": 437 + }, + { + "epoch": 3.3590234561991386, + "grad_norm": 0.35966116512097646, + "learning_rate": 1.1620709533134104e-05, + "loss": 0.2539, + "step": 438 + }, + { + "epoch": 3.36668262326472, + "grad_norm": 0.35249135244554275, + "learning_rate": 1.1523306920954571e-05, + "loss": 0.207, + "step": 439 + }, + { + "epoch": 3.3743417903303015, + "grad_norm": 0.350527349824925, + "learning_rate": 1.1426148771938915e-05, + "loss": 0.215, + "step": 440 + }, + { + "epoch": 3.382000957395883, + "grad_norm": 0.3645807698031509, + "learning_rate": 1.1329237888074691e-05, + "loss": 0.2294, + "step": 441 + }, + { + "epoch": 3.389660124461465, + "grad_norm": 0.3526979558756505, + "learning_rate": 1.123257706421845e-05, + "loss": 0.2357, + "step": 442 + }, + { + "epoch": 3.3973192915270465, + "grad_norm": 0.3298007139758826, + "learning_rate": 1.1136169088015177e-05, + "loss": 0.2196, + "step": 443 + }, + { + "epoch": 3.404978458592628, + "grad_norm": 0.3901583239274404, + "learning_rate": 1.1040016739817836e-05, + "loss": 0.2593, + "step": 444 + }, + { + "epoch": 3.41263762565821, + "grad_norm": 0.3312336548111748, + "learning_rate": 1.094412279260726e-05, + "loss": 0.2201, + "step": 445 + }, + { + "epoch": 3.420296792723791, + "grad_norm": 0.37712815006038436, + "learning_rate": 1.0848490011912096e-05, + "loss": 0.2353, + "step": 446 + }, + { + "epoch": 3.4279559597893727, + "grad_norm": 0.33427689171156727, + "learning_rate": 1.0753121155729133e-05, + "loss": 0.2247, + "step": 447 + }, + { + "epoch": 3.4356151268549544, + "grad_norm": 0.3403837414551559, + "learning_rate": 1.0658018974443692e-05, + "loss": 0.2334, + "step": 448 + }, + { + "epoch": 3.443274293920536, + "grad_norm": 0.3427488817401635, + "learning_rate": 1.056318621075036e-05, + "loss": 0.2394, + "step": 449 + }, + { + "epoch": 3.4509334609861178, + "grad_norm": 0.35721031062542025, + "learning_rate": 1.0468625599573842e-05, + "loss": 0.238, + "step": 450 + }, + { + "epoch": 3.4585926280516994, + "grad_norm": 0.35498625884545276, + "learning_rate": 1.037433986799015e-05, + "loss": 0.248, + "step": 451 + }, + { + "epoch": 3.466251795117281, + "grad_norm": 0.3686856880290374, + "learning_rate": 1.028033173514788e-05, + "loss": 0.2602, + "step": 452 + }, + { + "epoch": 3.4739109621828628, + "grad_norm": 0.32569736084610973, + "learning_rate": 1.0186603912189867e-05, + "loss": 0.2261, + "step": 453 + }, + { + "epoch": 3.4815701292484444, + "grad_norm": 0.35179935045391797, + "learning_rate": 1.0093159102174938e-05, + "loss": 0.228, + "step": 454 + }, + { + "epoch": 3.4892292963140257, + "grad_norm": 0.37412117087411056, + "learning_rate": 1.0000000000000006e-05, + "loss": 0.2558, + "step": 455 + }, + { + "epoch": 3.4968884633796073, + "grad_norm": 0.3196396284614333, + "learning_rate": 9.907129292322298e-06, + "loss": 0.2016, + "step": 456 + }, + { + "epoch": 3.504547630445189, + "grad_norm": 0.367949651744729, + "learning_rate": 9.814549657481935e-06, + "loss": 0.2345, + "step": 457 + }, + { + "epoch": 3.5122067975107707, + "grad_norm": 0.36477967052612537, + "learning_rate": 9.722263765424628e-06, + "loss": 0.2276, + "step": 458 + }, + { + "epoch": 3.5198659645763524, + "grad_norm": 0.3332594226268887, + "learning_rate": 9.630274277624729e-06, + "loss": 0.23, + "step": 459 + }, + { + "epoch": 3.527525131641934, + "grad_norm": 0.3477521018845692, + "learning_rate": 9.538583847008452e-06, + "loss": 0.2352, + "step": 460 + }, + { + "epoch": 3.5351842987075157, + "grad_norm": 0.32086860442292786, + "learning_rate": 9.447195117877343e-06, + "loss": 0.2125, + "step": 461 + }, + { + "epoch": 3.542843465773097, + "grad_norm": 0.3678326552292364, + "learning_rate": 9.356110725832081e-06, + "loss": 0.2576, + "step": 462 + }, + { + "epoch": 3.550502632838679, + "grad_norm": 0.3656606530951392, + "learning_rate": 9.265333297696395e-06, + "loss": 0.2461, + "step": 463 + }, + { + "epoch": 3.5581617999042603, + "grad_norm": 0.3229274366638359, + "learning_rate": 9.174865451441375e-06, + "loss": 0.2298, + "step": 464 + }, + { + "epoch": 3.565820966969842, + "grad_norm": 0.32293232206607697, + "learning_rate": 9.084709796109907e-06, + "loss": 0.2166, + "step": 465 + }, + { + "epoch": 3.5734801340354236, + "grad_norm": 0.3461993603233461, + "learning_rate": 8.994868931741499e-06, + "loss": 0.2342, + "step": 466 + }, + { + "epoch": 3.5811393011010053, + "grad_norm": 0.31931497478868687, + "learning_rate": 8.905345449297223e-06, + "loss": 0.2245, + "step": 467 + }, + { + "epoch": 3.588798468166587, + "grad_norm": 0.34882777970263285, + "learning_rate": 8.816141930585067e-06, + "loss": 0.2412, + "step": 468 + }, + { + "epoch": 3.5964576352321687, + "grad_norm": 0.31810617065052815, + "learning_rate": 8.72726094818541e-06, + "loss": 0.2202, + "step": 469 + }, + { + "epoch": 3.6041168022977503, + "grad_norm": 0.36687350590070866, + "learning_rate": 8.638705065376887e-06, + "loss": 0.225, + "step": 470 + }, + { + "epoch": 3.6117759693633316, + "grad_norm": 0.35726654000876984, + "learning_rate": 8.550476836062419e-06, + "loss": 0.2343, + "step": 471 + }, + { + "epoch": 3.6194351364289132, + "grad_norm": 0.33724954427539455, + "learning_rate": 8.462578804695595e-06, + "loss": 0.2328, + "step": 472 + }, + { + "epoch": 3.627094303494495, + "grad_norm": 0.33977950988225286, + "learning_rate": 8.375013506207275e-06, + "loss": 0.2593, + "step": 473 + }, + { + "epoch": 3.6347534705600766, + "grad_norm": 0.3357461336976667, + "learning_rate": 8.287783465932466e-06, + "loss": 0.2162, + "step": 474 + }, + { + "epoch": 3.6424126376256583, + "grad_norm": 0.33938581871332074, + "learning_rate": 8.200891199537549e-06, + "loss": 0.2159, + "step": 475 + }, + { + "epoch": 3.65007180469124, + "grad_norm": 0.30850555682034936, + "learning_rate": 8.114339212947655e-06, + "loss": 0.2308, + "step": 476 + }, + { + "epoch": 3.6577309717568216, + "grad_norm": 0.33712030493831774, + "learning_rate": 8.028130002274459e-06, + "loss": 0.2617, + "step": 477 + }, + { + "epoch": 3.665390138822403, + "grad_norm": 0.33952468314921025, + "learning_rate": 7.942266053744155e-06, + "loss": 0.2277, + "step": 478 + }, + { + "epoch": 3.673049305887985, + "grad_norm": 0.32790624314094596, + "learning_rate": 7.856749843625777e-06, + "loss": 0.2459, + "step": 479 + }, + { + "epoch": 3.680708472953566, + "grad_norm": 0.30426795240880533, + "learning_rate": 7.771583838159756e-06, + "loss": 0.2373, + "step": 480 + }, + { + "epoch": 3.688367640019148, + "grad_norm": 0.32139830397764274, + "learning_rate": 7.686770493486835e-06, + "loss": 0.265, + "step": 481 + }, + { + "epoch": 3.6960268070847295, + "grad_norm": 0.30384977031133725, + "learning_rate": 7.602312255577193e-06, + "loss": 0.2166, + "step": 482 + }, + { + "epoch": 3.703685974150311, + "grad_norm": 0.33085512515068155, + "learning_rate": 7.518211560159949e-06, + "loss": 0.2511, + "step": 483 + }, + { + "epoch": 3.711345141215893, + "grad_norm": 0.3176611144031733, + "learning_rate": 7.434470832652865e-06, + "loss": 0.2084, + "step": 484 + }, + { + "epoch": 3.7190043082814745, + "grad_norm": 0.33574434095691075, + "learning_rate": 7.3510924880924575e-06, + "loss": 0.2308, + "step": 485 + }, + { + "epoch": 3.726663475347056, + "grad_norm": 0.2971615945594848, + "learning_rate": 7.268078931064293e-06, + "loss": 0.2217, + "step": 486 + }, + { + "epoch": 3.7343226424126374, + "grad_norm": 0.32961234977638115, + "learning_rate": 7.185432555633672e-06, + "loss": 0.2548, + "step": 487 + }, + { + "epoch": 3.741981809478219, + "grad_norm": 0.32814885991409504, + "learning_rate": 7.1031557452765934e-06, + "loss": 0.2439, + "step": 488 + }, + { + "epoch": 3.749640976543801, + "grad_norm": 0.34360104917258844, + "learning_rate": 7.021250872810983e-06, + "loss": 0.224, + "step": 489 + }, + { + "epoch": 3.7573001436093825, + "grad_norm": 0.3505861571637595, + "learning_rate": 6.939720300328303e-06, + "loss": 0.2459, + "step": 490 + }, + { + "epoch": 3.764959310674964, + "grad_norm": 0.33976097846578596, + "learning_rate": 6.858566379125389e-06, + "loss": 0.2329, + "step": 491 + }, + { + "epoch": 3.772618477740546, + "grad_norm": 0.32593443522992444, + "learning_rate": 6.777791449636681e-06, + "loss": 0.2295, + "step": 492 + }, + { + "epoch": 3.7802776448061275, + "grad_norm": 0.33560750807739526, + "learning_rate": 6.697397841366686e-06, + "loss": 0.2148, + "step": 493 + }, + { + "epoch": 3.7879368118717087, + "grad_norm": 0.3347449057734125, + "learning_rate": 6.617387872822842e-06, + "loss": 0.2197, + "step": 494 + }, + { + "epoch": 3.795595978937291, + "grad_norm": 0.3467626472846674, + "learning_rate": 6.537763851448593e-06, + "loss": 0.2391, + "step": 495 + }, + { + "epoch": 3.803255146002872, + "grad_norm": 0.32356879416863116, + "learning_rate": 6.458528073556925e-06, + "loss": 0.2268, + "step": 496 + }, + { + "epoch": 3.8109143130684537, + "grad_norm": 0.3283370951000147, + "learning_rate": 6.379682824264055e-06, + "loss": 0.2337, + "step": 497 + }, + { + "epoch": 3.8185734801340354, + "grad_norm": 0.30770811712569635, + "learning_rate": 6.301230377423595e-06, + "loss": 0.2224, + "step": 498 + }, + { + "epoch": 3.826232647199617, + "grad_norm": 0.3253183469854635, + "learning_rate": 6.223172995560935e-06, + "loss": 0.2193, + "step": 499 + }, + { + "epoch": 3.8338918142651988, + "grad_norm": 0.3348399208299024, + "learning_rate": 6.145512929808013e-06, + "loss": 0.241, + "step": 500 + }, + { + "epoch": 3.8415509813307804, + "grad_norm": 0.33895071979833463, + "learning_rate": 6.068252419838399e-06, + "loss": 0.2427, + "step": 501 + }, + { + "epoch": 3.849210148396362, + "grad_norm": 0.3343502843045349, + "learning_rate": 5.991393693802674e-06, + "loss": 0.2319, + "step": 502 + }, + { + "epoch": 3.8568693154619433, + "grad_norm": 0.32674599015773287, + "learning_rate": 5.9149389682642165e-06, + "loss": 0.2438, + "step": 503 + }, + { + "epoch": 3.864528482527525, + "grad_norm": 0.324625635133392, + "learning_rate": 5.838890448135228e-06, + "loss": 0.2464, + "step": 504 + }, + { + "epoch": 3.8721876495931067, + "grad_norm": 0.33427787078954374, + "learning_rate": 5.7632503266131925e-06, + "loss": 0.216, + "step": 505 + }, + { + "epoch": 3.8798468166586884, + "grad_norm": 0.2975052588803942, + "learning_rate": 5.688020785117581e-06, + "loss": 0.2193, + "step": 506 + }, + { + "epoch": 3.88750598372427, + "grad_norm": 0.35541630429900906, + "learning_rate": 5.613203993226981e-06, + "loss": 0.2416, + "step": 507 + }, + { + "epoch": 3.8951651507898517, + "grad_norm": 0.3667627536084296, + "learning_rate": 5.538802108616494e-06, + "loss": 0.2339, + "step": 508 + }, + { + "epoch": 3.9028243178554334, + "grad_norm": 0.32563123367734753, + "learning_rate": 5.46481727699554e-06, + "loss": 0.2259, + "step": 509 + }, + { + "epoch": 3.9104834849210146, + "grad_norm": 0.3147349932477261, + "learning_rate": 5.39125163204594e-06, + "loss": 0.2166, + "step": 510 + }, + { + "epoch": 3.9181426519865967, + "grad_norm": 0.3264794954593056, + "learning_rate": 5.318107295360424e-06, + "loss": 0.2191, + "step": 511 + }, + { + "epoch": 3.925801819052178, + "grad_norm": 0.3493610273060971, + "learning_rate": 5.245386376381398e-06, + "loss": 0.223, + "step": 512 + }, + { + "epoch": 3.9334609861177596, + "grad_norm": 0.31947742082366326, + "learning_rate": 5.17309097234016e-06, + "loss": 0.2286, + "step": 513 + }, + { + "epoch": 3.9411201531833413, + "grad_norm": 0.3273845550020943, + "learning_rate": 5.101223168196381e-06, + "loss": 0.2425, + "step": 514 + }, + { + "epoch": 3.948779320248923, + "grad_norm": 0.33071553976584445, + "learning_rate": 5.029785036577976e-06, + "loss": 0.2142, + "step": 515 + }, + { + "epoch": 3.9564384873145046, + "grad_norm": 0.29920602227346177, + "learning_rate": 4.958778637721364e-06, + "loss": 0.2255, + "step": 516 + }, + { + "epoch": 3.9640976543800863, + "grad_norm": 0.3220643902328496, + "learning_rate": 4.8882060194119985e-06, + "loss": 0.2316, + "step": 517 + }, + { + "epoch": 3.971756821445668, + "grad_norm": 0.3348880545917222, + "learning_rate": 4.8180692169253714e-06, + "loss": 0.2319, + "step": 518 + }, + { + "epoch": 3.9794159885112492, + "grad_norm": 0.3183864890443684, + "learning_rate": 4.74837025296826e-06, + "loss": 0.2398, + "step": 519 + }, + { + "epoch": 3.987075155576831, + "grad_norm": 0.3170966409917256, + "learning_rate": 4.679111137620442e-06, + "loss": 0.2235, + "step": 520 + }, + { + "epoch": 3.9947343226424126, + "grad_norm": 0.2929870337825558, + "learning_rate": 4.610293868276681e-06, + "loss": 0.2118, + "step": 521 + }, + { + "epoch": 4.003829583532791, + "grad_norm": 0.6366855376673984, + "learning_rate": 4.541920429589168e-06, + "loss": 0.3375, + "step": 522 + }, + { + "epoch": 4.011488750598373, + "grad_norm": 0.39332713233031513, + "learning_rate": 4.47399279341024e-06, + "loss": 0.1091, + "step": 523 + }, + { + "epoch": 4.019147917663954, + "grad_norm": 0.46392007812335173, + "learning_rate": 4.406512918735555e-06, + "loss": 0.2075, + "step": 524 + }, + { + "epoch": 4.026807084729536, + "grad_norm": 0.32163834207187486, + "learning_rate": 4.339482751647557e-06, + "loss": 0.1486, + "step": 525 + }, + { + "epoch": 4.034466251795117, + "grad_norm": 0.346990014252883, + "learning_rate": 4.272904225259387e-06, + "loss": 0.1689, + "step": 526 + }, + { + "epoch": 4.0421254188606985, + "grad_norm": 0.4470904005969031, + "learning_rate": 4.206779259659102e-06, + "loss": 0.1619, + "step": 527 + }, + { + "epoch": 4.049784585926281, + "grad_norm": 0.5423468490222553, + "learning_rate": 4.141109761854332e-06, + "loss": 0.1583, + "step": 528 + }, + { + "epoch": 4.057443752991862, + "grad_norm": 0.4455453744472251, + "learning_rate": 4.075897625717249e-06, + "loss": 0.1561, + "step": 529 + }, + { + "epoch": 4.065102920057444, + "grad_norm": 0.40070748734343326, + "learning_rate": 4.011144731929981e-06, + "loss": 0.1711, + "step": 530 + }, + { + "epoch": 4.072762087123025, + "grad_norm": 0.3857460690221848, + "learning_rate": 3.9468529479303445e-06, + "loss": 0.1874, + "step": 531 + }, + { + "epoch": 4.080421254188607, + "grad_norm": 0.3386030886136809, + "learning_rate": 3.883024127858017e-06, + "loss": 0.1676, + "step": 532 + }, + { + "epoch": 4.088080421254189, + "grad_norm": 0.3665495913561285, + "learning_rate": 3.819660112501053e-06, + "loss": 0.1605, + "step": 533 + }, + { + "epoch": 4.09573958831977, + "grad_norm": 0.35417549475087723, + "learning_rate": 3.756762729242773e-06, + "loss": 0.1504, + "step": 534 + }, + { + "epoch": 4.103398755385352, + "grad_norm": 0.3396123769965615, + "learning_rate": 3.694333792009115e-06, + "loss": 0.1375, + "step": 535 + }, + { + "epoch": 4.111057922450933, + "grad_norm": 0.32569257575060306, + "learning_rate": 3.632375101216259e-06, + "loss": 0.1518, + "step": 536 + }, + { + "epoch": 4.118717089516515, + "grad_norm": 0.3219593442840899, + "learning_rate": 3.5708884437187673e-06, + "loss": 0.168, + "step": 537 + }, + { + "epoch": 4.1263762565820965, + "grad_norm": 0.31324123070534954, + "learning_rate": 3.509875592757999e-06, + "loss": 0.1681, + "step": 538 + }, + { + "epoch": 4.134035423647679, + "grad_norm": 0.3836751303057977, + "learning_rate": 3.4493383079110054e-06, + "loss": 0.1898, + "step": 539 + }, + { + "epoch": 4.14169459071326, + "grad_norm": 0.3500886642993717, + "learning_rate": 3.3892783350397675e-06, + "loss": 0.1793, + "step": 540 + }, + { + "epoch": 4.149353757778842, + "grad_norm": 0.3188272991338731, + "learning_rate": 3.329697406240855e-06, + "loss": 0.1721, + "step": 541 + }, + { + "epoch": 4.157012924844423, + "grad_norm": 0.3012590925590462, + "learning_rate": 3.2705972397954655e-06, + "loss": 0.1537, + "step": 542 + }, + { + "epoch": 4.164672091910004, + "grad_norm": 0.3367792622931613, + "learning_rate": 3.211979540119883e-06, + "loss": 0.1797, + "step": 543 + }, + { + "epoch": 4.1723312589755865, + "grad_norm": 0.33133930012065166, + "learning_rate": 3.153845997716303e-06, + "loss": 0.1925, + "step": 544 + }, + { + "epoch": 4.179990426041168, + "grad_norm": 0.3094187164114957, + "learning_rate": 3.0961982891241083e-06, + "loss": 0.1481, + "step": 545 + }, + { + "epoch": 4.18764959310675, + "grad_norm": 0.317376469580326, + "learning_rate": 3.039038076871481e-06, + "loss": 0.1727, + "step": 546 + }, + { + "epoch": 4.195308760172331, + "grad_norm": 0.27650160786044625, + "learning_rate": 2.9823670094275e-06, + "loss": 0.1446, + "step": 547 + }, + { + "epoch": 4.202967927237913, + "grad_norm": 0.30890083357655423, + "learning_rate": 2.9261867211545603e-06, + "loss": 0.1622, + "step": 548 + }, + { + "epoch": 4.2106270943034945, + "grad_norm": 0.33112327548553416, + "learning_rate": 2.870498832261257e-06, + "loss": 0.1663, + "step": 549 + }, + { + "epoch": 4.218286261369076, + "grad_norm": 0.3120284953069467, + "learning_rate": 2.815304948755664e-06, + "loss": 0.1617, + "step": 550 + }, + { + "epoch": 4.225945428434658, + "grad_norm": 0.31255801847483555, + "learning_rate": 2.7606066623990145e-06, + "loss": 0.1673, + "step": 551 + }, + { + "epoch": 4.233604595500239, + "grad_norm": 0.3087850852760736, + "learning_rate": 2.7064055506597875e-06, + "loss": 0.1608, + "step": 552 + }, + { + "epoch": 4.241263762565821, + "grad_norm": 0.29550399939819244, + "learning_rate": 2.6527031766682142e-06, + "loss": 0.149, + "step": 553 + }, + { + "epoch": 4.248922929631402, + "grad_norm": 0.28827764887901935, + "learning_rate": 2.599501089171217e-06, + "loss": 0.1545, + "step": 554 + }, + { + "epoch": 4.2565820966969845, + "grad_norm": 0.31764263264427883, + "learning_rate": 2.546800822487714e-06, + "loss": 0.1581, + "step": 555 + }, + { + "epoch": 4.264241263762566, + "grad_norm": 0.2896451556809014, + "learning_rate": 2.494603896464405e-06, + "loss": 0.1432, + "step": 556 + }, + { + "epoch": 4.271900430828148, + "grad_norm": 0.2846656674626245, + "learning_rate": 2.4429118164319076e-06, + "loss": 0.1506, + "step": 557 + }, + { + "epoch": 4.279559597893729, + "grad_norm": 0.3167572946509413, + "learning_rate": 2.3917260731613733e-06, + "loss": 0.1681, + "step": 558 + }, + { + "epoch": 4.28721876495931, + "grad_norm": 0.2994943543117942, + "learning_rate": 2.3410481428214602e-06, + "loss": 0.1582, + "step": 559 + }, + { + "epoch": 4.294877932024892, + "grad_norm": 0.2958413618672273, + "learning_rate": 2.2908794869358044e-06, + "loss": 0.1629, + "step": 560 + }, + { + "epoch": 4.302537099090474, + "grad_norm": 0.2854040506798909, + "learning_rate": 2.2412215523408266e-06, + "loss": 0.1504, + "step": 561 + }, + { + "epoch": 4.310196266156056, + "grad_norm": 0.27706245243194494, + "learning_rate": 2.1920757711440354e-06, + "loss": 0.1335, + "step": 562 + }, + { + "epoch": 4.317855433221637, + "grad_norm": 0.3002281927963276, + "learning_rate": 2.143443560682721e-06, + "loss": 0.1656, + "step": 563 + }, + { + "epoch": 4.325514600287219, + "grad_norm": 0.3053710633138099, + "learning_rate": 2.0953263234830667e-06, + "loss": 0.1793, + "step": 564 + }, + { + "epoch": 4.3331737673528, + "grad_norm": 0.29373337337239025, + "learning_rate": 2.0477254472197237e-06, + "loss": 0.1607, + "step": 565 + }, + { + "epoch": 4.340832934418382, + "grad_norm": 0.29361686795928127, + "learning_rate": 2.0006423046757596e-06, + "loss": 0.1417, + "step": 566 + }, + { + "epoch": 4.348492101483964, + "grad_norm": 0.30321625536955094, + "learning_rate": 1.9540782537031045e-06, + "loss": 0.1561, + "step": 567 + }, + { + "epoch": 4.356151268549545, + "grad_norm": 0.294608355859984, + "learning_rate": 1.908034637183356e-06, + "loss": 0.1436, + "step": 568 + }, + { + "epoch": 4.363810435615127, + "grad_norm": 0.2981183902576262, + "learning_rate": 1.8625127829890922e-06, + "loss": 0.1829, + "step": 569 + }, + { + "epoch": 4.371469602680708, + "grad_norm": 0.2662376876124655, + "learning_rate": 1.817514003945524e-06, + "loss": 0.1321, + "step": 570 + }, + { + "epoch": 4.37912876974629, + "grad_norm": 0.2857595944219623, + "learning_rate": 1.7730395977926917e-06, + "loss": 0.1568, + "step": 571 + }, + { + "epoch": 4.386787936811872, + "grad_norm": 0.29444711124099954, + "learning_rate": 1.7290908471479805e-06, + "loss": 0.1619, + "step": 572 + }, + { + "epoch": 4.394447103877454, + "grad_norm": 0.299861629149527, + "learning_rate": 1.6856690194691872e-06, + "loss": 0.1765, + "step": 573 + }, + { + "epoch": 4.402106270943035, + "grad_norm": 0.2885973088406149, + "learning_rate": 1.6427753670179214e-06, + "loss": 0.1599, + "step": 574 + }, + { + "epoch": 4.409765438008616, + "grad_norm": 0.284486199205028, + "learning_rate": 1.6004111268235156e-06, + "loss": 0.1595, + "step": 575 + }, + { + "epoch": 4.417424605074198, + "grad_norm": 0.2941729164644181, + "learning_rate": 1.5585775206473508e-06, + "loss": 0.161, + "step": 576 + }, + { + "epoch": 4.4250837721397795, + "grad_norm": 0.29517993856011077, + "learning_rate": 1.5172757549476024e-06, + "loss": 0.1608, + "step": 577 + }, + { + "epoch": 4.432742939205362, + "grad_norm": 0.3009878509805784, + "learning_rate": 1.4765070208444732e-06, + "loss": 0.1657, + "step": 578 + }, + { + "epoch": 4.440402106270943, + "grad_norm": 0.2832295303483787, + "learning_rate": 1.4362724940858109e-06, + "loss": 0.1481, + "step": 579 + }, + { + "epoch": 4.448061273336525, + "grad_norm": 0.30017167488484514, + "learning_rate": 1.396573335013236e-06, + "loss": 0.1758, + "step": 580 + }, + { + "epoch": 4.455720440402106, + "grad_norm": 0.26948255003997973, + "learning_rate": 1.3574106885286465e-06, + "loss": 0.1506, + "step": 581 + }, + { + "epoch": 4.463379607467688, + "grad_norm": 0.2989296760767762, + "learning_rate": 1.3187856840612167e-06, + "loss": 0.1745, + "step": 582 + }, + { + "epoch": 4.47103877453327, + "grad_norm": 0.3057384139973513, + "learning_rate": 1.2806994355348224e-06, + "loss": 0.1721, + "step": 583 + }, + { + "epoch": 4.478697941598851, + "grad_norm": 0.2636716603605, + "learning_rate": 1.2431530413359138e-06, + "loss": 0.1367, + "step": 584 + }, + { + "epoch": 4.486357108664433, + "grad_norm": 0.29595905590077376, + "learning_rate": 1.2061475842818337e-06, + "loss": 0.1651, + "step": 585 + }, + { + "epoch": 4.494016275730014, + "grad_norm": 0.28723857409594045, + "learning_rate": 1.169684131589608e-06, + "loss": 0.1686, + "step": 586 + }, + { + "epoch": 4.501675442795596, + "grad_norm": 0.28232920958455504, + "learning_rate": 1.1337637348451369e-06, + "loss": 0.1478, + "step": 587 + }, + { + "epoch": 4.5093346098611775, + "grad_norm": 0.2945977053685175, + "learning_rate": 1.0983874299729092e-06, + "loss": 0.1698, + "step": 588 + }, + { + "epoch": 4.516993776926759, + "grad_norm": 0.29635940814476264, + "learning_rate": 1.0635562372060825e-06, + "loss": 0.1684, + "step": 589 + }, + { + "epoch": 4.524652943992341, + "grad_norm": 0.28161502295558277, + "learning_rate": 1.0292711610570904e-06, + "loss": 0.1435, + "step": 590 + }, + { + "epoch": 4.532312111057922, + "grad_norm": 0.29973282189790135, + "learning_rate": 9.955331902886645e-07, + "loss": 0.1601, + "step": 591 + }, + { + "epoch": 4.539971278123504, + "grad_norm": 0.2912268259487591, + "learning_rate": 9.62343297885313e-07, + "loss": 0.1676, + "step": 592 + }, + { + "epoch": 4.547630445189085, + "grad_norm": 0.2804062252790609, + "learning_rate": 9.297024410252753e-07, + "loss": 0.1572, + "step": 593 + }, + { + "epoch": 4.5552896122546676, + "grad_norm": 0.2902402528058867, + "learning_rate": 8.976115610528957e-07, + "loss": 0.1665, + "step": 594 + }, + { + "epoch": 4.562948779320249, + "grad_norm": 0.2948360443924776, + "learning_rate": 8.660715834514977e-07, + "loss": 0.1798, + "step": 595 + }, + { + "epoch": 4.570607946385831, + "grad_norm": 0.30613400428544785, + "learning_rate": 8.350834178166755e-07, + "loss": 0.1708, + "step": 596 + }, + { + "epoch": 4.578267113451412, + "grad_norm": 0.29386467356756857, + "learning_rate": 8.046479578300803e-07, + "loss": 0.1674, + "step": 597 + }, + { + "epoch": 4.585926280516993, + "grad_norm": 0.26855112533508585, + "learning_rate": 7.747660812336221e-07, + "loss": 0.127, + "step": 598 + }, + { + "epoch": 4.5935854475825755, + "grad_norm": 0.30897844150919035, + "learning_rate": 7.454386498041865e-07, + "loss": 0.1832, + "step": 599 + }, + { + "epoch": 4.601244614648157, + "grad_norm": 0.26907534103375785, + "learning_rate": 7.166665093287539e-07, + "loss": 0.1514, + "step": 600 + }, + { + "epoch": 4.608903781713739, + "grad_norm": 0.2779300127385892, + "learning_rate": 6.884504895800237e-07, + "loss": 0.1596, + "step": 601 + }, + { + "epoch": 4.61656294877932, + "grad_norm": 0.283076309215598, + "learning_rate": 6.607914042924756e-07, + "loss": 0.1493, + "step": 602 + }, + { + "epoch": 4.624222115844902, + "grad_norm": 0.30512219679760166, + "learning_rate": 6.336900511389133e-07, + "loss": 0.1839, + "step": 603 + }, + { + "epoch": 4.631881282910483, + "grad_norm": 0.2784336379714144, + "learning_rate": 6.071472117074462e-07, + "loss": 0.1553, + "step": 604 + }, + { + "epoch": 4.6395404499760655, + "grad_norm": 0.2908490571653058, + "learning_rate": 5.811636514789598e-07, + "loss": 0.1735, + "step": 605 + }, + { + "epoch": 4.647199617041647, + "grad_norm": 0.2689077917871218, + "learning_rate": 5.557401198050327e-07, + "loss": 0.1406, + "step": 606 + }, + { + "epoch": 4.654858784107228, + "grad_norm": 0.30361924900865483, + "learning_rate": 5.308773498863251e-07, + "loss": 0.1806, + "step": 607 + }, + { + "epoch": 4.66251795117281, + "grad_norm": 0.2738298236426758, + "learning_rate": 5.065760587514446e-07, + "loss": 0.1555, + "step": 608 + }, + { + "epoch": 4.670177118238391, + "grad_norm": 0.28066674173939915, + "learning_rate": 4.828369472362493e-07, + "loss": 0.1456, + "step": 609 + }, + { + "epoch": 4.677836285303973, + "grad_norm": 0.28254726890674403, + "learning_rate": 4.5966069996365993e-07, + "loss": 0.1666, + "step": 610 + }, + { + "epoch": 4.685495452369555, + "grad_norm": 0.28356791946970256, + "learning_rate": 4.3704798532388624e-07, + "loss": 0.1623, + "step": 611 + }, + { + "epoch": 4.693154619435137, + "grad_norm": 0.27978608677096656, + "learning_rate": 4.1499945545518283e-07, + "loss": 0.1573, + "step": 612 + }, + { + "epoch": 4.700813786500718, + "grad_norm": 0.2807595614948837, + "learning_rate": 3.935157462250128e-07, + "loss": 0.1645, + "step": 613 + }, + { + "epoch": 4.7084729535663, + "grad_norm": 0.2903423678415902, + "learning_rate": 3.7259747721173134e-07, + "loss": 0.1697, + "step": 614 + }, + { + "epoch": 4.716132120631881, + "grad_norm": 0.27761326426161137, + "learning_rate": 3.522452516867048e-07, + "loss": 0.154, + "step": 615 + }, + { + "epoch": 4.723791287697463, + "grad_norm": 0.2867472262572132, + "learning_rate": 3.324596565969174e-07, + "loss": 0.1565, + "step": 616 + }, + { + "epoch": 4.731450454763045, + "grad_norm": 0.28465179062440776, + "learning_rate": 3.1324126254804524e-07, + "loss": 0.1401, + "step": 617 + }, + { + "epoch": 4.739109621828626, + "grad_norm": 0.2947996164689416, + "learning_rate": 2.9459062378799806e-07, + "loss": 0.1754, + "step": 618 + }, + { + "epoch": 4.746768788894208, + "grad_norm": 0.2863382862190152, + "learning_rate": 2.7650827819093005e-07, + "loss": 0.1599, + "step": 619 + }, + { + "epoch": 4.754427955959789, + "grad_norm": 0.2709108924581573, + "learning_rate": 2.5899474724174313e-07, + "loss": 0.1398, + "step": 620 + }, + { + "epoch": 4.7620871230253705, + "grad_norm": 0.30022047839665467, + "learning_rate": 2.4205053602103015e-07, + "loss": 0.1792, + "step": 621 + }, + { + "epoch": 4.769746290090953, + "grad_norm": 0.2981470620905006, + "learning_rate": 2.2567613319051997e-07, + "loss": 0.1449, + "step": 622 + }, + { + "epoch": 4.777405457156534, + "grad_norm": 0.2875561082245844, + "learning_rate": 2.0987201097897757e-07, + "loss": 0.1395, + "step": 623 + }, + { + "epoch": 4.785064624222116, + "grad_norm": 0.27525535566929177, + "learning_rate": 1.9463862516859277e-07, + "loss": 0.1436, + "step": 624 + }, + { + "epoch": 4.792723791287697, + "grad_norm": 0.2940141095677646, + "learning_rate": 1.799764150818306e-07, + "loss": 0.1655, + "step": 625 + }, + { + "epoch": 4.800382958353279, + "grad_norm": 0.3081953681996578, + "learning_rate": 1.658858035687594e-07, + "loss": 0.1667, + "step": 626 + }, + { + "epoch": 4.808042125418861, + "grad_norm": 0.26637485304480285, + "learning_rate": 1.5236719699486256e-07, + "loss": 0.1309, + "step": 627 + }, + { + "epoch": 4.815701292484443, + "grad_norm": 0.2705718541638546, + "learning_rate": 1.3942098522931491e-07, + "loss": 0.1613, + "step": 628 + }, + { + "epoch": 4.823360459550024, + "grad_norm": 0.3013254867336271, + "learning_rate": 1.2704754163374022e-07, + "loss": 0.1871, + "step": 629 + }, + { + "epoch": 4.831019626615605, + "grad_norm": 0.2781156531330604, + "learning_rate": 1.1524722305144231e-07, + "loss": 0.1583, + "step": 630 + }, + { + "epoch": 4.838678793681187, + "grad_norm": 0.27865348896247544, + "learning_rate": 1.0402036979711317e-07, + "loss": 0.1541, + "step": 631 + }, + { + "epoch": 4.8463379607467685, + "grad_norm": 0.3006123678070983, + "learning_rate": 9.336730564702745e-08, + "loss": 0.177, + "step": 632 + }, + { + "epoch": 4.853997127812351, + "grad_norm": 0.2933355244436529, + "learning_rate": 8.328833782969003e-08, + "loss": 0.1735, + "step": 633 + }, + { + "epoch": 4.861656294877932, + "grad_norm": 0.2612274384851061, + "learning_rate": 7.378375701698748e-08, + "loss": 0.142, + "step": 634 + }, + { + "epoch": 4.869315461943514, + "grad_norm": 0.27506364744141204, + "learning_rate": 6.485383731580142e-08, + "loss": 0.1599, + "step": 635 + }, + { + "epoch": 4.876974629009095, + "grad_norm": 0.2740746508966746, + "learning_rate": 5.649883626009933e-08, + "loss": 0.1515, + "step": 636 + }, + { + "epoch": 4.884633796074677, + "grad_norm": 0.2923776048680673, + "learning_rate": 4.871899480351605e-08, + "loss": 0.1614, + "step": 637 + }, + { + "epoch": 4.8922929631402585, + "grad_norm": 0.2563881864024306, + "learning_rate": 4.151453731239707e-08, + "loss": 0.1431, + "step": 638 + }, + { + "epoch": 4.89995213020584, + "grad_norm": 0.31290090906865226, + "learning_rate": 3.4885671559332645e-08, + "loss": 0.1929, + "step": 639 + }, + { + "epoch": 4.907611297271422, + "grad_norm": 0.2833641820727045, + "learning_rate": 2.8832588717164766e-08, + "loss": 0.1354, + "step": 640 + }, + { + "epoch": 4.915270464337003, + "grad_norm": 0.2886325657329948, + "learning_rate": 2.3355463353467168e-08, + "loss": 0.1543, + "step": 641 + }, + { + "epoch": 4.922929631402585, + "grad_norm": 0.27914356344167157, + "learning_rate": 1.8454453425527098e-08, + "loss": 0.1511, + "step": 642 + }, + { + "epoch": 4.9305887984681664, + "grad_norm": 0.3094563610476224, + "learning_rate": 1.4129700275771208e-08, + "loss": 0.1843, + "step": 643 + }, + { + "epoch": 4.938247965533749, + "grad_norm": 0.25323095165276716, + "learning_rate": 1.0381328627702136e-08, + "loss": 0.1128, + "step": 644 + }, + { + "epoch": 4.94590713259933, + "grad_norm": 0.2790763770971488, + "learning_rate": 7.209446582292501e-09, + "loss": 0.1528, + "step": 645 + }, + { + "epoch": 4.953566299664912, + "grad_norm": 0.294849164666762, + "learning_rate": 4.614145614876275e-09, + "loss": 0.1657, + "step": 646 + }, + { + "epoch": 4.961225466730493, + "grad_norm": 0.30506766088706516, + "learning_rate": 2.5955005725064597e-09, + "loss": 0.178, + "step": 647 + }, + { + "epoch": 4.968884633796074, + "grad_norm": 0.281891446902876, + "learning_rate": 1.1535696717945855e-09, + "loss": 0.1473, + "step": 648 + }, + { + "epoch": 4.9765438008616565, + "grad_norm": 0.2745560443089173, + "learning_rate": 2.8839449723205847e-10, + "loss": 0.1509, + "step": 649 + }, + { + "epoch": 4.984202967927238, + "grad_norm": 0.2816379591846223, + "learning_rate": 0.0, + "loss": 0.1471, + "step": 650 + }, + { + "epoch": 4.984202967927238, + "step": 650, + "total_flos": 1.3070689797343805e+18, + "train_loss": 0.36027532513325033, + "train_runtime": 68390.8162, + "train_samples_per_second": 1.222, + "train_steps_per_second": 0.01 + } + ], + "logging_steps": 1.0, + "max_steps": 650, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3070689797343805e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}