{ "best_global_step": 957, "best_metric": 0.34439337253570557, "best_model_checkpoint": "outputs/checkpoint-957", "epoch": 3.0, "eval_steps": 500, "global_step": 957, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003144036156415799, "grad_norm": 0.6504561901092529, "learning_rate": 0.0, "loss": 1.3418, "step": 1 }, { "epoch": 0.006288072312831598, "grad_norm": 0.5969553589820862, "learning_rate": 2e-05, "loss": 1.2269, "step": 2 }, { "epoch": 0.009432108469247396, "grad_norm": 0.5801867246627808, "learning_rate": 4e-05, "loss": 1.2115, "step": 3 }, { "epoch": 0.012576144625663196, "grad_norm": 0.5902700424194336, "learning_rate": 6e-05, "loss": 1.227, "step": 4 }, { "epoch": 0.015720180782078996, "grad_norm": 0.4828220307826996, "learning_rate": 8e-05, "loss": 1.1231, "step": 5 }, { "epoch": 0.018864216938494792, "grad_norm": 0.48476874828338623, "learning_rate": 0.0001, "loss": 1.0001, "step": 6 }, { "epoch": 0.02200825309491059, "grad_norm": 0.5598082542419434, "learning_rate": 9.995497523638001e-05, "loss": 0.9222, "step": 7 }, { "epoch": 0.02515228925132639, "grad_norm": 0.6134396195411682, "learning_rate": 9.990995047276002e-05, "loss": 0.8076, "step": 8 }, { "epoch": 0.028296325407742188, "grad_norm": 0.596885621547699, "learning_rate": 9.986492570914003e-05, "loss": 0.699, "step": 9 }, { "epoch": 0.03144036156415799, "grad_norm": 0.601597785949707, "learning_rate": 9.981990094552004e-05, "loss": 0.6339, "step": 10 }, { "epoch": 0.034584397720573784, "grad_norm": 0.43536919355392456, "learning_rate": 9.977487618190005e-05, "loss": 0.586, "step": 11 }, { "epoch": 0.037728433876989584, "grad_norm": 0.28295570611953735, "learning_rate": 9.972985141828006e-05, "loss": 0.5222, "step": 12 }, { "epoch": 0.040872470033405384, "grad_norm": 0.21935400366783142, "learning_rate": 9.968482665466006e-05, "loss": 0.5207, "step": 13 }, { "epoch": 0.04401650618982118, "grad_norm": 0.17207279801368713, "learning_rate": 9.963980189104007e-05, "loss": 0.5183, "step": 14 }, { "epoch": 0.04716054234623698, "grad_norm": 0.17337629199028015, "learning_rate": 9.95947771274201e-05, "loss": 0.4736, "step": 15 }, { "epoch": 0.05030457850265278, "grad_norm": 0.1346622258424759, "learning_rate": 9.954975236380009e-05, "loss": 0.4952, "step": 16 }, { "epoch": 0.053448614659068576, "grad_norm": 0.14799199998378754, "learning_rate": 9.95047276001801e-05, "loss": 0.4865, "step": 17 }, { "epoch": 0.056592650815484376, "grad_norm": 0.17467562854290009, "learning_rate": 9.945970283656011e-05, "loss": 0.4931, "step": 18 }, { "epoch": 0.059736686971900176, "grad_norm": 0.14084218442440033, "learning_rate": 9.941467807294013e-05, "loss": 0.4986, "step": 19 }, { "epoch": 0.06288072312831598, "grad_norm": 0.14518290758132935, "learning_rate": 9.936965330932014e-05, "loss": 0.4732, "step": 20 }, { "epoch": 0.06602475928473177, "grad_norm": 0.16654722392559052, "learning_rate": 9.932462854570013e-05, "loss": 0.5111, "step": 21 }, { "epoch": 0.06916879544114757, "grad_norm": 0.13385848701000214, "learning_rate": 9.927960378208014e-05, "loss": 0.4491, "step": 22 }, { "epoch": 0.07231283159756337, "grad_norm": 0.12942947447299957, "learning_rate": 9.923457901846016e-05, "loss": 0.4477, "step": 23 }, { "epoch": 0.07545686775397917, "grad_norm": 0.134714275598526, "learning_rate": 9.918955425484017e-05, "loss": 0.4669, "step": 24 }, { "epoch": 0.07860090391039497, "grad_norm": 0.15754961967468262, "learning_rate": 9.914452949122017e-05, "loss": 0.4729, "step": 25 }, { "epoch": 0.08174494006681077, "grad_norm": 0.13226869702339172, "learning_rate": 9.909950472760019e-05, "loss": 0.449, "step": 26 }, { "epoch": 0.08488897622322657, "grad_norm": 0.13787904381752014, "learning_rate": 9.90544799639802e-05, "loss": 0.4169, "step": 27 }, { "epoch": 0.08803301237964237, "grad_norm": 0.13542614877223969, "learning_rate": 9.900945520036021e-05, "loss": 0.4474, "step": 28 }, { "epoch": 0.09117704853605817, "grad_norm": 0.1803930401802063, "learning_rate": 9.89644304367402e-05, "loss": 0.4614, "step": 29 }, { "epoch": 0.09432108469247397, "grad_norm": 0.13009008765220642, "learning_rate": 9.891940567312022e-05, "loss": 0.4447, "step": 30 }, { "epoch": 0.09746512084888977, "grad_norm": 0.12229945510625839, "learning_rate": 9.887438090950023e-05, "loss": 0.4443, "step": 31 }, { "epoch": 0.10060915700530557, "grad_norm": 0.11478310078382492, "learning_rate": 9.882935614588024e-05, "loss": 0.4434, "step": 32 }, { "epoch": 0.10375319316172137, "grad_norm": 0.11311618983745575, "learning_rate": 9.878433138226025e-05, "loss": 0.4778, "step": 33 }, { "epoch": 0.10689722931813715, "grad_norm": 0.11936776340007782, "learning_rate": 9.873930661864026e-05, "loss": 0.4356, "step": 34 }, { "epoch": 0.11004126547455295, "grad_norm": 0.1237996444106102, "learning_rate": 9.869428185502027e-05, "loss": 0.4292, "step": 35 }, { "epoch": 0.11318530163096875, "grad_norm": 0.14266110956668854, "learning_rate": 9.864925709140028e-05, "loss": 0.3987, "step": 36 }, { "epoch": 0.11632933778738455, "grad_norm": 0.11767125129699707, "learning_rate": 9.860423232778028e-05, "loss": 0.4081, "step": 37 }, { "epoch": 0.11947337394380035, "grad_norm": 0.11443518847227097, "learning_rate": 9.855920756416029e-05, "loss": 0.4235, "step": 38 }, { "epoch": 0.12261741010021615, "grad_norm": 0.13285037875175476, "learning_rate": 9.85141828005403e-05, "loss": 0.4552, "step": 39 }, { "epoch": 0.12576144625663196, "grad_norm": 0.1199844554066658, "learning_rate": 9.846915803692031e-05, "loss": 0.4384, "step": 40 }, { "epoch": 0.12890548241304775, "grad_norm": 0.1192149817943573, "learning_rate": 9.842413327330032e-05, "loss": 0.4361, "step": 41 }, { "epoch": 0.13204951856946354, "grad_norm": 0.1373969316482544, "learning_rate": 9.837910850968033e-05, "loss": 0.4231, "step": 42 }, { "epoch": 0.13519355472587935, "grad_norm": 0.1313702017068863, "learning_rate": 9.833408374606034e-05, "loss": 0.4267, "step": 43 }, { "epoch": 0.13833759088229514, "grad_norm": 0.12744949758052826, "learning_rate": 9.828905898244036e-05, "loss": 0.4392, "step": 44 }, { "epoch": 0.14148162703871095, "grad_norm": 0.1385061889886856, "learning_rate": 9.824403421882035e-05, "loss": 0.4382, "step": 45 }, { "epoch": 0.14462566319512674, "grad_norm": 0.1346711665391922, "learning_rate": 9.819900945520036e-05, "loss": 0.3872, "step": 46 }, { "epoch": 0.14776969935154255, "grad_norm": 0.11777765303850174, "learning_rate": 9.815398469158037e-05, "loss": 0.3946, "step": 47 }, { "epoch": 0.15091373550795834, "grad_norm": 0.1113889142870903, "learning_rate": 9.810895992796039e-05, "loss": 0.4144, "step": 48 }, { "epoch": 0.15405777166437415, "grad_norm": 0.11616387218236923, "learning_rate": 9.806393516434039e-05, "loss": 0.4313, "step": 49 }, { "epoch": 0.15720180782078993, "grad_norm": 0.12321295589208603, "learning_rate": 9.80189104007204e-05, "loss": 0.4169, "step": 50 }, { "epoch": 0.16034584397720575, "grad_norm": 0.1254836469888687, "learning_rate": 9.797388563710042e-05, "loss": 0.4606, "step": 51 }, { "epoch": 0.16348988013362153, "grad_norm": 0.12028036266565323, "learning_rate": 9.792886087348043e-05, "loss": 0.4232, "step": 52 }, { "epoch": 0.16663391629003735, "grad_norm": 0.13416439294815063, "learning_rate": 9.788383610986042e-05, "loss": 0.4515, "step": 53 }, { "epoch": 0.16977795244645313, "grad_norm": 0.1263149082660675, "learning_rate": 9.783881134624043e-05, "loss": 0.4293, "step": 54 }, { "epoch": 0.17292198860286892, "grad_norm": 0.1367005556821823, "learning_rate": 9.779378658262045e-05, "loss": 0.4479, "step": 55 }, { "epoch": 0.17606602475928473, "grad_norm": 0.12966613471508026, "learning_rate": 9.774876181900046e-05, "loss": 0.4244, "step": 56 }, { "epoch": 0.17921006091570052, "grad_norm": 0.1284017562866211, "learning_rate": 9.770373705538046e-05, "loss": 0.4331, "step": 57 }, { "epoch": 0.18235409707211633, "grad_norm": 0.1253034770488739, "learning_rate": 9.765871229176046e-05, "loss": 0.3945, "step": 58 }, { "epoch": 0.18549813322853212, "grad_norm": 0.14342078566551208, "learning_rate": 9.761368752814049e-05, "loss": 0.4589, "step": 59 }, { "epoch": 0.18864216938494793, "grad_norm": 0.12838789820671082, "learning_rate": 9.75686627645205e-05, "loss": 0.4166, "step": 60 }, { "epoch": 0.19178620554136372, "grad_norm": 0.1271761655807495, "learning_rate": 9.752363800090049e-05, "loss": 0.4242, "step": 61 }, { "epoch": 0.19493024169777953, "grad_norm": 0.13261282444000244, "learning_rate": 9.747861323728051e-05, "loss": 0.4098, "step": 62 }, { "epoch": 0.19807427785419532, "grad_norm": 0.13815303146839142, "learning_rate": 9.743358847366052e-05, "loss": 0.4189, "step": 63 }, { "epoch": 0.20121831401061113, "grad_norm": 0.14853087067604065, "learning_rate": 9.738856371004053e-05, "loss": 0.405, "step": 64 }, { "epoch": 0.20436235016702692, "grad_norm": 0.13879799842834473, "learning_rate": 9.734353894642053e-05, "loss": 0.425, "step": 65 }, { "epoch": 0.20750638632344273, "grad_norm": 0.13774444162845612, "learning_rate": 9.729851418280055e-05, "loss": 0.3908, "step": 66 }, { "epoch": 0.21065042247985852, "grad_norm": 0.17253898084163666, "learning_rate": 9.725348941918056e-05, "loss": 0.4289, "step": 67 }, { "epoch": 0.2137944586362743, "grad_norm": 0.15715059638023376, "learning_rate": 9.720846465556056e-05, "loss": 0.403, "step": 68 }, { "epoch": 0.21693849479269012, "grad_norm": 0.13471247255802155, "learning_rate": 9.716343989194057e-05, "loss": 0.4142, "step": 69 }, { "epoch": 0.2200825309491059, "grad_norm": 0.15236668288707733, "learning_rate": 9.711841512832058e-05, "loss": 0.4109, "step": 70 }, { "epoch": 0.22322656710552172, "grad_norm": 0.14000849425792694, "learning_rate": 9.707339036470059e-05, "loss": 0.3894, "step": 71 }, { "epoch": 0.2263706032619375, "grad_norm": 0.14730283617973328, "learning_rate": 9.70283656010806e-05, "loss": 0.3821, "step": 72 }, { "epoch": 0.22951463941835332, "grad_norm": 0.14098961651325226, "learning_rate": 9.698334083746061e-05, "loss": 0.398, "step": 73 }, { "epoch": 0.2326586755747691, "grad_norm": 0.14531590044498444, "learning_rate": 9.693831607384062e-05, "loss": 0.4117, "step": 74 }, { "epoch": 0.23580271173118492, "grad_norm": 0.15122880041599274, "learning_rate": 9.689329131022062e-05, "loss": 0.394, "step": 75 }, { "epoch": 0.2389467478876007, "grad_norm": 0.14702145755290985, "learning_rate": 9.684826654660063e-05, "loss": 0.3868, "step": 76 }, { "epoch": 0.24209078404401652, "grad_norm": 0.1624772995710373, "learning_rate": 9.680324178298064e-05, "loss": 0.4086, "step": 77 }, { "epoch": 0.2452348202004323, "grad_norm": 0.1514746993780136, "learning_rate": 9.675821701936065e-05, "loss": 0.3993, "step": 78 }, { "epoch": 0.24837885635684812, "grad_norm": 0.1850007027387619, "learning_rate": 9.671319225574066e-05, "loss": 0.4172, "step": 79 }, { "epoch": 0.25152289251326393, "grad_norm": 0.1626746505498886, "learning_rate": 9.666816749212068e-05, "loss": 0.4239, "step": 80 }, { "epoch": 0.2546669286696797, "grad_norm": 0.16106756031513214, "learning_rate": 9.662314272850068e-05, "loss": 0.403, "step": 81 }, { "epoch": 0.2578109648260955, "grad_norm": 0.15971069037914276, "learning_rate": 9.657811796488068e-05, "loss": 0.4058, "step": 82 }, { "epoch": 0.2609550009825113, "grad_norm": 0.1679072380065918, "learning_rate": 9.653309320126069e-05, "loss": 0.3923, "step": 83 }, { "epoch": 0.2640990371389271, "grad_norm": 0.167164146900177, "learning_rate": 9.648806843764072e-05, "loss": 0.408, "step": 84 }, { "epoch": 0.2672430732953429, "grad_norm": 0.19360136985778809, "learning_rate": 9.644304367402071e-05, "loss": 0.3752, "step": 85 }, { "epoch": 0.2703871094517587, "grad_norm": 0.18764442205429077, "learning_rate": 9.639801891040072e-05, "loss": 0.4016, "step": 86 }, { "epoch": 0.2735311456081745, "grad_norm": 0.14717476069927216, "learning_rate": 9.635299414678074e-05, "loss": 0.3657, "step": 87 }, { "epoch": 0.27667518176459027, "grad_norm": 0.16398710012435913, "learning_rate": 9.630796938316075e-05, "loss": 0.3836, "step": 88 }, { "epoch": 0.2798192179210061, "grad_norm": 0.18360593914985657, "learning_rate": 9.626294461954074e-05, "loss": 0.402, "step": 89 }, { "epoch": 0.2829632540774219, "grad_norm": 0.16090594232082367, "learning_rate": 9.621791985592075e-05, "loss": 0.4183, "step": 90 }, { "epoch": 0.2861072902338377, "grad_norm": 0.15203045308589935, "learning_rate": 9.617289509230078e-05, "loss": 0.3868, "step": 91 }, { "epoch": 0.28925132639025347, "grad_norm": 0.14797526597976685, "learning_rate": 9.612787032868078e-05, "loss": 0.3815, "step": 92 }, { "epoch": 0.2923953625466693, "grad_norm": 0.17348520457744598, "learning_rate": 9.608284556506079e-05, "loss": 0.3836, "step": 93 }, { "epoch": 0.2955393987030851, "grad_norm": 0.15546481311321259, "learning_rate": 9.603782080144079e-05, "loss": 0.388, "step": 94 }, { "epoch": 0.2986834348595009, "grad_norm": 0.16100765764713287, "learning_rate": 9.599279603782081e-05, "loss": 0.4028, "step": 95 }, { "epoch": 0.30182747101591667, "grad_norm": 0.13970224559307098, "learning_rate": 9.594777127420082e-05, "loss": 0.4114, "step": 96 }, { "epoch": 0.30497150717233246, "grad_norm": 0.168798565864563, "learning_rate": 9.590274651058083e-05, "loss": 0.3957, "step": 97 }, { "epoch": 0.3081155433287483, "grad_norm": 0.18150930106639862, "learning_rate": 9.585772174696084e-05, "loss": 0.3952, "step": 98 }, { "epoch": 0.3112595794851641, "grad_norm": 0.1578027904033661, "learning_rate": 9.581269698334084e-05, "loss": 0.3852, "step": 99 }, { "epoch": 0.31440361564157987, "grad_norm": 0.16330371797084808, "learning_rate": 9.576767221972085e-05, "loss": 0.3918, "step": 100 }, { "epoch": 0.31754765179799566, "grad_norm": 0.19642502069473267, "learning_rate": 9.572264745610086e-05, "loss": 0.3673, "step": 101 }, { "epoch": 0.3206916879544115, "grad_norm": 0.15316760540008545, "learning_rate": 9.567762269248087e-05, "loss": 0.3654, "step": 102 }, { "epoch": 0.3238357241108273, "grad_norm": 0.15903666615486145, "learning_rate": 9.563259792886088e-05, "loss": 0.4156, "step": 103 }, { "epoch": 0.32697976026724307, "grad_norm": 0.1616946905851364, "learning_rate": 9.558757316524089e-05, "loss": 0.3945, "step": 104 }, { "epoch": 0.33012379642365886, "grad_norm": 0.17533862590789795, "learning_rate": 9.55425484016209e-05, "loss": 0.4097, "step": 105 }, { "epoch": 0.3332678325800747, "grad_norm": 0.16686199605464935, "learning_rate": 9.54975236380009e-05, "loss": 0.3642, "step": 106 }, { "epoch": 0.3364118687364905, "grad_norm": 0.16722401976585388, "learning_rate": 9.545249887438091e-05, "loss": 0.3922, "step": 107 }, { "epoch": 0.33955590489290627, "grad_norm": 0.1776529848575592, "learning_rate": 9.540747411076092e-05, "loss": 0.3747, "step": 108 }, { "epoch": 0.34269994104932205, "grad_norm": 0.16103170812129974, "learning_rate": 9.536244934714093e-05, "loss": 0.3419, "step": 109 }, { "epoch": 0.34584397720573784, "grad_norm": 0.16695958375930786, "learning_rate": 9.531742458352094e-05, "loss": 0.3835, "step": 110 }, { "epoch": 0.3489880133621537, "grad_norm": 0.16470766067504883, "learning_rate": 9.527239981990095e-05, "loss": 0.3832, "step": 111 }, { "epoch": 0.35213204951856947, "grad_norm": 0.2102184295654297, "learning_rate": 9.522737505628096e-05, "loss": 0.4077, "step": 112 }, { "epoch": 0.35527608567498525, "grad_norm": 0.16832542419433594, "learning_rate": 9.518235029266098e-05, "loss": 0.3865, "step": 113 }, { "epoch": 0.35842012183140104, "grad_norm": 0.1614522486925125, "learning_rate": 9.513732552904097e-05, "loss": 0.3815, "step": 114 }, { "epoch": 0.3615641579878169, "grad_norm": 0.15935182571411133, "learning_rate": 9.509230076542098e-05, "loss": 0.3938, "step": 115 }, { "epoch": 0.36470819414423267, "grad_norm": 0.15866465866565704, "learning_rate": 9.5047276001801e-05, "loss": 0.3453, "step": 116 }, { "epoch": 0.36785223030064845, "grad_norm": 0.22919830679893494, "learning_rate": 9.500225123818101e-05, "loss": 0.3805, "step": 117 }, { "epoch": 0.37099626645706424, "grad_norm": 0.157642662525177, "learning_rate": 9.495722647456101e-05, "loss": 0.3727, "step": 118 }, { "epoch": 0.3741403026134801, "grad_norm": 0.15399090945720673, "learning_rate": 9.491220171094102e-05, "loss": 0.3858, "step": 119 }, { "epoch": 0.37728433876989587, "grad_norm": 0.16368268430233002, "learning_rate": 9.486717694732104e-05, "loss": 0.3669, "step": 120 }, { "epoch": 0.38042837492631165, "grad_norm": 0.15748444199562073, "learning_rate": 9.482215218370105e-05, "loss": 0.3478, "step": 121 }, { "epoch": 0.38357241108272744, "grad_norm": 0.15478475391864777, "learning_rate": 9.477712742008104e-05, "loss": 0.3897, "step": 122 }, { "epoch": 0.3867164472391432, "grad_norm": 0.17049477994441986, "learning_rate": 9.473210265646106e-05, "loss": 0.4014, "step": 123 }, { "epoch": 0.38986048339555907, "grad_norm": 0.1540895700454712, "learning_rate": 9.468707789284107e-05, "loss": 0.3864, "step": 124 }, { "epoch": 0.39300451955197485, "grad_norm": 0.15698757767677307, "learning_rate": 9.464205312922108e-05, "loss": 0.3494, "step": 125 }, { "epoch": 0.39614855570839064, "grad_norm": 0.1802845299243927, "learning_rate": 9.459702836560108e-05, "loss": 0.4063, "step": 126 }, { "epoch": 0.3992925918648064, "grad_norm": 0.16863752901554108, "learning_rate": 9.45520036019811e-05, "loss": 0.3996, "step": 127 }, { "epoch": 0.40243662802122226, "grad_norm": 0.15374375879764557, "learning_rate": 9.45069788383611e-05, "loss": 0.3606, "step": 128 }, { "epoch": 0.40558066417763805, "grad_norm": 0.1838078647851944, "learning_rate": 9.446195407474112e-05, "loss": 0.4202, "step": 129 }, { "epoch": 0.40872470033405384, "grad_norm": 0.1697828471660614, "learning_rate": 9.441692931112111e-05, "loss": 0.4473, "step": 130 }, { "epoch": 0.4118687364904696, "grad_norm": 0.16546648740768433, "learning_rate": 9.437190454750113e-05, "loss": 0.3716, "step": 131 }, { "epoch": 0.41501277264688546, "grad_norm": 0.16032598912715912, "learning_rate": 9.432687978388114e-05, "loss": 0.3454, "step": 132 }, { "epoch": 0.41815680880330125, "grad_norm": 0.16294580698013306, "learning_rate": 9.428185502026115e-05, "loss": 0.3505, "step": 133 }, { "epoch": 0.42130084495971704, "grad_norm": 0.1657985895872116, "learning_rate": 9.423683025664116e-05, "loss": 0.3742, "step": 134 }, { "epoch": 0.4244448811161328, "grad_norm": 0.17439191043376923, "learning_rate": 9.419180549302117e-05, "loss": 0.3715, "step": 135 }, { "epoch": 0.4275889172725486, "grad_norm": 0.16780824959278107, "learning_rate": 9.414678072940118e-05, "loss": 0.3638, "step": 136 }, { "epoch": 0.43073295342896445, "grad_norm": 0.15180569887161255, "learning_rate": 9.410175596578118e-05, "loss": 0.3481, "step": 137 }, { "epoch": 0.43387698958538023, "grad_norm": 0.16709467768669128, "learning_rate": 9.405673120216119e-05, "loss": 0.3941, "step": 138 }, { "epoch": 0.437021025741796, "grad_norm": 0.1620159149169922, "learning_rate": 9.40117064385412e-05, "loss": 0.3709, "step": 139 }, { "epoch": 0.4401650618982118, "grad_norm": 0.14973655343055725, "learning_rate": 9.396668167492121e-05, "loss": 0.3337, "step": 140 }, { "epoch": 0.44330909805462765, "grad_norm": 0.1614982783794403, "learning_rate": 9.392165691130123e-05, "loss": 0.3835, "step": 141 }, { "epoch": 0.44645313421104343, "grad_norm": 0.15263184905052185, "learning_rate": 9.387663214768123e-05, "loss": 0.3561, "step": 142 }, { "epoch": 0.4495971703674592, "grad_norm": 0.1567995399236679, "learning_rate": 9.383160738406124e-05, "loss": 0.3833, "step": 143 }, { "epoch": 0.452741206523875, "grad_norm": 0.15289440751075745, "learning_rate": 9.378658262044124e-05, "loss": 0.3553, "step": 144 }, { "epoch": 0.45588524268029085, "grad_norm": 0.15743780136108398, "learning_rate": 9.374155785682127e-05, "loss": 0.3785, "step": 145 }, { "epoch": 0.45902927883670663, "grad_norm": 0.16213831305503845, "learning_rate": 9.369653309320126e-05, "loss": 0.3697, "step": 146 }, { "epoch": 0.4621733149931224, "grad_norm": 0.1589861363172531, "learning_rate": 9.365150832958127e-05, "loss": 0.3866, "step": 147 }, { "epoch": 0.4653173511495382, "grad_norm": 0.15693925321102142, "learning_rate": 9.360648356596128e-05, "loss": 0.3283, "step": 148 }, { "epoch": 0.468461387305954, "grad_norm": 0.15275168418884277, "learning_rate": 9.35614588023413e-05, "loss": 0.3743, "step": 149 }, { "epoch": 0.47160542346236983, "grad_norm": 0.15678094327449799, "learning_rate": 9.35164340387213e-05, "loss": 0.3902, "step": 150 }, { "epoch": 0.4747494596187856, "grad_norm": 0.16155363619327545, "learning_rate": 9.34714092751013e-05, "loss": 0.356, "step": 151 }, { "epoch": 0.4778934957752014, "grad_norm": 0.18230900168418884, "learning_rate": 9.342638451148133e-05, "loss": 0.4123, "step": 152 }, { "epoch": 0.4810375319316172, "grad_norm": 0.16941802203655243, "learning_rate": 9.338135974786133e-05, "loss": 0.3823, "step": 153 }, { "epoch": 0.48418156808803303, "grad_norm": 0.17373739182949066, "learning_rate": 9.333633498424133e-05, "loss": 0.4141, "step": 154 }, { "epoch": 0.4873256042444488, "grad_norm": 0.15033993124961853, "learning_rate": 9.329131022062134e-05, "loss": 0.3572, "step": 155 }, { "epoch": 0.4904696404008646, "grad_norm": 0.1643500179052353, "learning_rate": 9.324628545700136e-05, "loss": 0.3643, "step": 156 }, { "epoch": 0.4936136765572804, "grad_norm": 0.16949360072612762, "learning_rate": 9.320126069338137e-05, "loss": 0.3698, "step": 157 }, { "epoch": 0.49675771271369623, "grad_norm": 0.14246416091918945, "learning_rate": 9.315623592976136e-05, "loss": 0.336, "step": 158 }, { "epoch": 0.499901748870112, "grad_norm": 0.19311994314193726, "learning_rate": 9.311121116614139e-05, "loss": 0.3697, "step": 159 }, { "epoch": 0.5030457850265279, "grad_norm": 0.16809864342212677, "learning_rate": 9.30661864025214e-05, "loss": 0.3705, "step": 160 }, { "epoch": 0.5061898211829436, "grad_norm": 0.18551750481128693, "learning_rate": 9.30211616389014e-05, "loss": 0.3921, "step": 161 }, { "epoch": 0.5093338573393594, "grad_norm": 0.16373604536056519, "learning_rate": 9.29761368752814e-05, "loss": 0.3773, "step": 162 }, { "epoch": 0.5124778934957752, "grad_norm": 0.16230268776416779, "learning_rate": 9.293111211166142e-05, "loss": 0.3728, "step": 163 }, { "epoch": 0.515621929652191, "grad_norm": 0.15303924679756165, "learning_rate": 9.288608734804143e-05, "loss": 0.3586, "step": 164 }, { "epoch": 0.5187659658086068, "grad_norm": 0.15790314972400665, "learning_rate": 9.284106258442144e-05, "loss": 0.3746, "step": 165 }, { "epoch": 0.5219100019650226, "grad_norm": 0.15940716862678528, "learning_rate": 9.279603782080145e-05, "loss": 0.3638, "step": 166 }, { "epoch": 0.5250540381214384, "grad_norm": 0.1546989381313324, "learning_rate": 9.275101305718146e-05, "loss": 0.3599, "step": 167 }, { "epoch": 0.5281980742778541, "grad_norm": 0.17521463334560394, "learning_rate": 9.270598829356146e-05, "loss": 0.3563, "step": 168 }, { "epoch": 0.53134211043427, "grad_norm": 0.15796169638633728, "learning_rate": 9.266096352994147e-05, "loss": 0.3831, "step": 169 }, { "epoch": 0.5344861465906858, "grad_norm": 0.16639868915081024, "learning_rate": 9.261593876632148e-05, "loss": 0.3705, "step": 170 }, { "epoch": 0.5376301827471016, "grad_norm": 0.17908984422683716, "learning_rate": 9.257091400270149e-05, "loss": 0.3627, "step": 171 }, { "epoch": 0.5407742189035174, "grad_norm": 0.17089813947677612, "learning_rate": 9.25258892390815e-05, "loss": 0.3966, "step": 172 }, { "epoch": 0.5439182550599332, "grad_norm": 0.16613918542861938, "learning_rate": 9.24808644754615e-05, "loss": 0.3441, "step": 173 }, { "epoch": 0.547062291216349, "grad_norm": 0.15668480098247528, "learning_rate": 9.243583971184152e-05, "loss": 0.3656, "step": 174 }, { "epoch": 0.5502063273727648, "grad_norm": 0.1610320508480072, "learning_rate": 9.239081494822152e-05, "loss": 0.3615, "step": 175 }, { "epoch": 0.5533503635291805, "grad_norm": 0.1715031862258911, "learning_rate": 9.234579018460153e-05, "loss": 0.3728, "step": 176 }, { "epoch": 0.5564943996855963, "grad_norm": 0.16907502710819244, "learning_rate": 9.230076542098155e-05, "loss": 0.379, "step": 177 }, { "epoch": 0.5596384358420122, "grad_norm": 0.1507793366909027, "learning_rate": 9.225574065736155e-05, "loss": 0.3488, "step": 178 }, { "epoch": 0.562782471998428, "grad_norm": 0.17530782520771027, "learning_rate": 9.221071589374156e-05, "loss": 0.3721, "step": 179 }, { "epoch": 0.5659265081548438, "grad_norm": 0.16093795001506805, "learning_rate": 9.216569113012157e-05, "loss": 0.3707, "step": 180 }, { "epoch": 0.5690705443112596, "grad_norm": 0.1704872101545334, "learning_rate": 9.212066636650159e-05, "loss": 0.3862, "step": 181 }, { "epoch": 0.5722145804676754, "grad_norm": 0.157040536403656, "learning_rate": 9.207564160288158e-05, "loss": 0.3744, "step": 182 }, { "epoch": 0.5753586166240912, "grad_norm": 0.17162206768989563, "learning_rate": 9.203061683926159e-05, "loss": 0.3586, "step": 183 }, { "epoch": 0.5785026527805069, "grad_norm": 0.17531578242778778, "learning_rate": 9.19855920756416e-05, "loss": 0.3529, "step": 184 }, { "epoch": 0.5816466889369227, "grad_norm": 0.1650618463754654, "learning_rate": 9.194056731202162e-05, "loss": 0.3882, "step": 185 }, { "epoch": 0.5847907250933386, "grad_norm": 0.16133129596710205, "learning_rate": 9.189554254840163e-05, "loss": 0.37, "step": 186 }, { "epoch": 0.5879347612497544, "grad_norm": 0.1555107831954956, "learning_rate": 9.185051778478163e-05, "loss": 0.3706, "step": 187 }, { "epoch": 0.5910787974061702, "grad_norm": 0.15559113025665283, "learning_rate": 9.180549302116165e-05, "loss": 0.3759, "step": 188 }, { "epoch": 0.594222833562586, "grad_norm": 0.17183296382427216, "learning_rate": 9.176046825754166e-05, "loss": 0.3779, "step": 189 }, { "epoch": 0.5973668697190018, "grad_norm": 0.17152062058448792, "learning_rate": 9.171544349392167e-05, "loss": 0.3693, "step": 190 }, { "epoch": 0.6005109058754176, "grad_norm": 0.15135037899017334, "learning_rate": 9.167041873030166e-05, "loss": 0.3693, "step": 191 }, { "epoch": 0.6036549420318333, "grad_norm": 0.15813538432121277, "learning_rate": 9.162539396668168e-05, "loss": 0.3555, "step": 192 }, { "epoch": 0.6067989781882491, "grad_norm": 0.1635793000459671, "learning_rate": 9.158036920306169e-05, "loss": 0.3776, "step": 193 }, { "epoch": 0.6099430143446649, "grad_norm": 0.17420870065689087, "learning_rate": 9.15353444394417e-05, "loss": 0.3824, "step": 194 }, { "epoch": 0.6130870505010808, "grad_norm": 0.15110236406326294, "learning_rate": 9.149031967582171e-05, "loss": 0.3637, "step": 195 }, { "epoch": 0.6162310866574966, "grad_norm": 0.16803759336471558, "learning_rate": 9.144529491220172e-05, "loss": 0.3549, "step": 196 }, { "epoch": 0.6193751228139124, "grad_norm": 0.171513631939888, "learning_rate": 9.140027014858173e-05, "loss": 0.3823, "step": 197 }, { "epoch": 0.6225191589703282, "grad_norm": 0.16594447195529938, "learning_rate": 9.135524538496173e-05, "loss": 0.3629, "step": 198 }, { "epoch": 0.625663195126744, "grad_norm": 0.17563681304454803, "learning_rate": 9.131022062134174e-05, "loss": 0.3756, "step": 199 }, { "epoch": 0.6288072312831597, "grad_norm": 0.1667756289243698, "learning_rate": 9.126519585772175e-05, "loss": 0.3817, "step": 200 }, { "epoch": 0.6319512674395755, "grad_norm": 0.19527959823608398, "learning_rate": 9.122017109410176e-05, "loss": 0.3798, "step": 201 }, { "epoch": 0.6350953035959913, "grad_norm": 0.1836497187614441, "learning_rate": 9.117514633048177e-05, "loss": 0.3541, "step": 202 }, { "epoch": 0.6382393397524071, "grad_norm": 0.15115787088871002, "learning_rate": 9.113012156686178e-05, "loss": 0.3823, "step": 203 }, { "epoch": 0.641383375908823, "grad_norm": 0.1637955754995346, "learning_rate": 9.108509680324179e-05, "loss": 0.3953, "step": 204 }, { "epoch": 0.6445274120652388, "grad_norm": 0.15728555619716644, "learning_rate": 9.10400720396218e-05, "loss": 0.3483, "step": 205 }, { "epoch": 0.6476714482216546, "grad_norm": 0.16356363892555237, "learning_rate": 9.09950472760018e-05, "loss": 0.3583, "step": 206 }, { "epoch": 0.6508154843780704, "grad_norm": 0.15497809648513794, "learning_rate": 9.095002251238181e-05, "loss": 0.3676, "step": 207 }, { "epoch": 0.6539595205344861, "grad_norm": 0.16058075428009033, "learning_rate": 9.090499774876182e-05, "loss": 0.3641, "step": 208 }, { "epoch": 0.6571035566909019, "grad_norm": 0.1717848926782608, "learning_rate": 9.085997298514183e-05, "loss": 0.3416, "step": 209 }, { "epoch": 0.6602475928473177, "grad_norm": 0.17602519690990448, "learning_rate": 9.081494822152185e-05, "loss": 0.3606, "step": 210 }, { "epoch": 0.6633916290037335, "grad_norm": 0.17341384291648865, "learning_rate": 9.076992345790185e-05, "loss": 0.3585, "step": 211 }, { "epoch": 0.6665356651601494, "grad_norm": 0.16207322478294373, "learning_rate": 9.072489869428186e-05, "loss": 0.3726, "step": 212 }, { "epoch": 0.6696797013165652, "grad_norm": 0.15573233366012573, "learning_rate": 9.067987393066188e-05, "loss": 0.3528, "step": 213 }, { "epoch": 0.672823737472981, "grad_norm": 0.15792664885520935, "learning_rate": 9.063484916704189e-05, "loss": 0.3544, "step": 214 }, { "epoch": 0.6759677736293968, "grad_norm": 0.17947156727313995, "learning_rate": 9.058982440342188e-05, "loss": 0.3908, "step": 215 }, { "epoch": 0.6791118097858125, "grad_norm": 0.20065437257289886, "learning_rate": 9.054479963980189e-05, "loss": 0.3387, "step": 216 }, { "epoch": 0.6822558459422283, "grad_norm": 0.1582382172346115, "learning_rate": 9.049977487618191e-05, "loss": 0.3727, "step": 217 }, { "epoch": 0.6853998820986441, "grad_norm": 0.16093921661376953, "learning_rate": 9.045475011256192e-05, "loss": 0.365, "step": 218 }, { "epoch": 0.6885439182550599, "grad_norm": 0.16429083049297333, "learning_rate": 9.040972534894192e-05, "loss": 0.3417, "step": 219 }, { "epoch": 0.6916879544114757, "grad_norm": 0.164768248796463, "learning_rate": 9.036470058532192e-05, "loss": 0.3558, "step": 220 }, { "epoch": 0.6948319905678916, "grad_norm": 0.18584826588630676, "learning_rate": 9.031967582170195e-05, "loss": 0.3887, "step": 221 }, { "epoch": 0.6979760267243074, "grad_norm": 0.1719428300857544, "learning_rate": 9.027465105808195e-05, "loss": 0.3832, "step": 222 }, { "epoch": 0.7011200628807231, "grad_norm": 0.16765046119689941, "learning_rate": 9.022962629446195e-05, "loss": 0.3645, "step": 223 }, { "epoch": 0.7042640990371389, "grad_norm": 0.15755492448806763, "learning_rate": 9.018460153084197e-05, "loss": 0.3931, "step": 224 }, { "epoch": 0.7074081351935547, "grad_norm": 0.147786945104599, "learning_rate": 9.013957676722198e-05, "loss": 0.3519, "step": 225 }, { "epoch": 0.7105521713499705, "grad_norm": 0.15629728138446808, "learning_rate": 9.009455200360199e-05, "loss": 0.3561, "step": 226 }, { "epoch": 0.7136962075063863, "grad_norm": 0.17266887426376343, "learning_rate": 9.004952723998198e-05, "loss": 0.3481, "step": 227 }, { "epoch": 0.7168402436628021, "grad_norm": 0.16593407094478607, "learning_rate": 9.0004502476362e-05, "loss": 0.4037, "step": 228 }, { "epoch": 0.719984279819218, "grad_norm": 0.1608094871044159, "learning_rate": 8.995947771274201e-05, "loss": 0.3427, "step": 229 }, { "epoch": 0.7231283159756338, "grad_norm": 0.16763266921043396, "learning_rate": 8.991445294912202e-05, "loss": 0.3567, "step": 230 }, { "epoch": 0.7262723521320495, "grad_norm": 0.17378269135951996, "learning_rate": 8.986942818550203e-05, "loss": 0.3607, "step": 231 }, { "epoch": 0.7294163882884653, "grad_norm": 0.15659885108470917, "learning_rate": 8.982440342188204e-05, "loss": 0.3912, "step": 232 }, { "epoch": 0.7325604244448811, "grad_norm": 0.16094735264778137, "learning_rate": 8.977937865826205e-05, "loss": 0.3565, "step": 233 }, { "epoch": 0.7357044606012969, "grad_norm": 0.16130982339382172, "learning_rate": 8.973435389464206e-05, "loss": 0.3402, "step": 234 }, { "epoch": 0.7388484967577127, "grad_norm": 0.1688772737979889, "learning_rate": 8.968932913102207e-05, "loss": 0.4084, "step": 235 }, { "epoch": 0.7419925329141285, "grad_norm": 0.15843011438846588, "learning_rate": 8.964430436740207e-05, "loss": 0.3274, "step": 236 }, { "epoch": 0.7451365690705443, "grad_norm": 0.17376118898391724, "learning_rate": 8.959927960378208e-05, "loss": 0.3594, "step": 237 }, { "epoch": 0.7482806052269602, "grad_norm": 0.1474812924861908, "learning_rate": 8.955425484016209e-05, "loss": 0.3401, "step": 238 }, { "epoch": 0.751424641383376, "grad_norm": 0.17633631825447083, "learning_rate": 8.95092300765421e-05, "loss": 0.3446, "step": 239 }, { "epoch": 0.7545686775397917, "grad_norm": 0.1765802502632141, "learning_rate": 8.946420531292211e-05, "loss": 0.4047, "step": 240 }, { "epoch": 0.7577127136962075, "grad_norm": 0.16155122220516205, "learning_rate": 8.941918054930212e-05, "loss": 0.3768, "step": 241 }, { "epoch": 0.7608567498526233, "grad_norm": 0.15938226878643036, "learning_rate": 8.937415578568214e-05, "loss": 0.361, "step": 242 }, { "epoch": 0.7640007860090391, "grad_norm": 0.15787923336029053, "learning_rate": 8.932913102206213e-05, "loss": 0.3695, "step": 243 }, { "epoch": 0.7671448221654549, "grad_norm": 0.15216825902462006, "learning_rate": 8.928410625844214e-05, "loss": 0.3098, "step": 244 }, { "epoch": 0.7702888583218707, "grad_norm": 0.17453248798847198, "learning_rate": 8.923908149482215e-05, "loss": 0.3717, "step": 245 }, { "epoch": 0.7734328944782864, "grad_norm": 0.16575738787651062, "learning_rate": 8.919405673120217e-05, "loss": 0.3589, "step": 246 }, { "epoch": 0.7765769306347023, "grad_norm": 0.18751981854438782, "learning_rate": 8.914903196758217e-05, "loss": 0.3655, "step": 247 }, { "epoch": 0.7797209667911181, "grad_norm": 0.17968574166297913, "learning_rate": 8.910400720396218e-05, "loss": 0.3748, "step": 248 }, { "epoch": 0.7828650029475339, "grad_norm": 0.15899331867694855, "learning_rate": 8.90589824403422e-05, "loss": 0.3338, "step": 249 }, { "epoch": 0.7860090391039497, "grad_norm": 0.17183594405651093, "learning_rate": 8.901395767672221e-05, "loss": 0.3626, "step": 250 }, { "epoch": 0.7891530752603655, "grad_norm": 0.18076513707637787, "learning_rate": 8.89689329131022e-05, "loss": 0.3809, "step": 251 }, { "epoch": 0.7922971114167813, "grad_norm": 0.19676664471626282, "learning_rate": 8.892390814948221e-05, "loss": 0.3605, "step": 252 }, { "epoch": 0.7954411475731971, "grad_norm": 0.16450554132461548, "learning_rate": 8.887888338586223e-05, "loss": 0.3875, "step": 253 }, { "epoch": 0.7985851837296128, "grad_norm": 0.16082732379436493, "learning_rate": 8.883385862224224e-05, "loss": 0.3546, "step": 254 }, { "epoch": 0.8017292198860287, "grad_norm": 0.16340506076812744, "learning_rate": 8.878883385862224e-05, "loss": 0.347, "step": 255 }, { "epoch": 0.8048732560424445, "grad_norm": 0.1688399314880371, "learning_rate": 8.874380909500225e-05, "loss": 0.3509, "step": 256 }, { "epoch": 0.8080172921988603, "grad_norm": 0.17438183724880219, "learning_rate": 8.869878433138227e-05, "loss": 0.3711, "step": 257 }, { "epoch": 0.8111613283552761, "grad_norm": 0.16867169737815857, "learning_rate": 8.865375956776228e-05, "loss": 0.328, "step": 258 }, { "epoch": 0.8143053645116919, "grad_norm": 0.15355893969535828, "learning_rate": 8.860873480414229e-05, "loss": 0.3742, "step": 259 }, { "epoch": 0.8174494006681077, "grad_norm": 0.16060909628868103, "learning_rate": 8.85637100405223e-05, "loss": 0.335, "step": 260 }, { "epoch": 0.8205934368245235, "grad_norm": 0.15643706917762756, "learning_rate": 8.85186852769023e-05, "loss": 0.3396, "step": 261 }, { "epoch": 0.8237374729809392, "grad_norm": 0.16040891408920288, "learning_rate": 8.847366051328231e-05, "loss": 0.3315, "step": 262 }, { "epoch": 0.826881509137355, "grad_norm": 0.16379357874393463, "learning_rate": 8.842863574966232e-05, "loss": 0.3753, "step": 263 }, { "epoch": 0.8300255452937709, "grad_norm": 0.16337694227695465, "learning_rate": 8.838361098604233e-05, "loss": 0.3671, "step": 264 }, { "epoch": 0.8331695814501867, "grad_norm": 0.1569366157054901, "learning_rate": 8.833858622242234e-05, "loss": 0.3489, "step": 265 }, { "epoch": 0.8363136176066025, "grad_norm": 0.18730495870113373, "learning_rate": 8.829356145880235e-05, "loss": 0.3665, "step": 266 }, { "epoch": 0.8394576537630183, "grad_norm": 0.1649472713470459, "learning_rate": 8.824853669518235e-05, "loss": 0.3555, "step": 267 }, { "epoch": 0.8426016899194341, "grad_norm": 0.16628509759902954, "learning_rate": 8.820351193156236e-05, "loss": 0.369, "step": 268 }, { "epoch": 0.8457457260758499, "grad_norm": 0.14760325849056244, "learning_rate": 8.815848716794237e-05, "loss": 0.3259, "step": 269 }, { "epoch": 0.8488897622322656, "grad_norm": 0.17367328703403473, "learning_rate": 8.811346240432238e-05, "loss": 0.3893, "step": 270 }, { "epoch": 0.8520337983886814, "grad_norm": 0.16746094822883606, "learning_rate": 8.806843764070239e-05, "loss": 0.3764, "step": 271 }, { "epoch": 0.8551778345450972, "grad_norm": 0.15180997550487518, "learning_rate": 8.80234128770824e-05, "loss": 0.349, "step": 272 }, { "epoch": 0.8583218707015131, "grad_norm": 0.1864422708749771, "learning_rate": 8.79783881134624e-05, "loss": 0.3786, "step": 273 }, { "epoch": 0.8614659068579289, "grad_norm": 0.16122880578041077, "learning_rate": 8.793336334984241e-05, "loss": 0.3657, "step": 274 }, { "epoch": 0.8646099430143447, "grad_norm": 0.17343121767044067, "learning_rate": 8.788833858622242e-05, "loss": 0.3827, "step": 275 }, { "epoch": 0.8677539791707605, "grad_norm": 0.18036895990371704, "learning_rate": 8.784331382260243e-05, "loss": 0.3133, "step": 276 }, { "epoch": 0.8708980153271763, "grad_norm": 0.18427862226963043, "learning_rate": 8.779828905898244e-05, "loss": 0.3565, "step": 277 }, { "epoch": 0.874042051483592, "grad_norm": 0.16638705134391785, "learning_rate": 8.775326429536246e-05, "loss": 0.3478, "step": 278 }, { "epoch": 0.8771860876400078, "grad_norm": 0.16496343910694122, "learning_rate": 8.770823953174246e-05, "loss": 0.3568, "step": 279 }, { "epoch": 0.8803301237964236, "grad_norm": 0.17348064482212067, "learning_rate": 8.766321476812247e-05, "loss": 0.3574, "step": 280 }, { "epoch": 0.8834741599528395, "grad_norm": 0.15693892538547516, "learning_rate": 8.761819000450247e-05, "loss": 0.3198, "step": 281 }, { "epoch": 0.8866181961092553, "grad_norm": 0.15095357596874237, "learning_rate": 8.75731652408825e-05, "loss": 0.3264, "step": 282 }, { "epoch": 0.8897622322656711, "grad_norm": 0.1656336635351181, "learning_rate": 8.75281404772625e-05, "loss": 0.3674, "step": 283 }, { "epoch": 0.8929062684220869, "grad_norm": 0.17691737413406372, "learning_rate": 8.74831157136425e-05, "loss": 0.3664, "step": 284 }, { "epoch": 0.8960503045785027, "grad_norm": 0.1854889690876007, "learning_rate": 8.743809095002252e-05, "loss": 0.351, "step": 285 }, { "epoch": 0.8991943407349184, "grad_norm": 0.16093221306800842, "learning_rate": 8.739306618640253e-05, "loss": 0.3675, "step": 286 }, { "epoch": 0.9023383768913342, "grad_norm": 0.19017969071865082, "learning_rate": 8.734804142278254e-05, "loss": 0.368, "step": 287 }, { "epoch": 0.90548241304775, "grad_norm": 0.1654754877090454, "learning_rate": 8.730301665916253e-05, "loss": 0.355, "step": 288 }, { "epoch": 0.9086264492041658, "grad_norm": 0.1598285436630249, "learning_rate": 8.725799189554256e-05, "loss": 0.3518, "step": 289 }, { "epoch": 0.9117704853605817, "grad_norm": 0.1645420342683792, "learning_rate": 8.721296713192257e-05, "loss": 0.3415, "step": 290 }, { "epoch": 0.9149145215169975, "grad_norm": 0.16022057831287384, "learning_rate": 8.716794236830257e-05, "loss": 0.3283, "step": 291 }, { "epoch": 0.9180585576734133, "grad_norm": 0.15258321166038513, "learning_rate": 8.712291760468257e-05, "loss": 0.3411, "step": 292 }, { "epoch": 0.921202593829829, "grad_norm": 0.1652868241071701, "learning_rate": 8.707789284106259e-05, "loss": 0.3719, "step": 293 }, { "epoch": 0.9243466299862448, "grad_norm": 0.15822020173072815, "learning_rate": 8.70328680774426e-05, "loss": 0.3435, "step": 294 }, { "epoch": 0.9274906661426606, "grad_norm": 0.17401742935180664, "learning_rate": 8.698784331382261e-05, "loss": 0.3488, "step": 295 }, { "epoch": 0.9306347022990764, "grad_norm": 0.16339954733848572, "learning_rate": 8.694281855020262e-05, "loss": 0.3521, "step": 296 }, { "epoch": 0.9337787384554922, "grad_norm": 0.17316532135009766, "learning_rate": 8.689779378658263e-05, "loss": 0.3173, "step": 297 }, { "epoch": 0.936922774611908, "grad_norm": 0.19332842528820038, "learning_rate": 8.685276902296263e-05, "loss": 0.3646, "step": 298 }, { "epoch": 0.9400668107683239, "grad_norm": 0.17178936302661896, "learning_rate": 8.680774425934264e-05, "loss": 0.3505, "step": 299 }, { "epoch": 0.9432108469247397, "grad_norm": 0.18378332257270813, "learning_rate": 8.676271949572265e-05, "loss": 0.3695, "step": 300 }, { "epoch": 0.9463548830811555, "grad_norm": 0.15688535571098328, "learning_rate": 8.671769473210266e-05, "loss": 0.3625, "step": 301 }, { "epoch": 0.9494989192375712, "grad_norm": 0.1592821180820465, "learning_rate": 8.667266996848267e-05, "loss": 0.3401, "step": 302 }, { "epoch": 0.952642955393987, "grad_norm": 0.1847924143075943, "learning_rate": 8.662764520486268e-05, "loss": 0.3515, "step": 303 }, { "epoch": 0.9557869915504028, "grad_norm": 0.18507392704486847, "learning_rate": 8.658262044124269e-05, "loss": 0.3605, "step": 304 }, { "epoch": 0.9589310277068186, "grad_norm": 0.15528923273086548, "learning_rate": 8.65375956776227e-05, "loss": 0.354, "step": 305 }, { "epoch": 0.9620750638632344, "grad_norm": 0.15168017148971558, "learning_rate": 8.64925709140027e-05, "loss": 0.3654, "step": 306 }, { "epoch": 0.9652191000196503, "grad_norm": 0.177875354886055, "learning_rate": 8.644754615038273e-05, "loss": 0.3109, "step": 307 }, { "epoch": 0.9683631361760661, "grad_norm": 0.16372695565223694, "learning_rate": 8.640252138676272e-05, "loss": 0.3938, "step": 308 }, { "epoch": 0.9715071723324818, "grad_norm": 0.1749984472990036, "learning_rate": 8.635749662314273e-05, "loss": 0.3609, "step": 309 }, { "epoch": 0.9746512084888976, "grad_norm": 0.1631542593240738, "learning_rate": 8.631247185952274e-05, "loss": 0.3647, "step": 310 }, { "epoch": 0.9777952446453134, "grad_norm": 0.18512563407421112, "learning_rate": 8.626744709590276e-05, "loss": 0.3435, "step": 311 }, { "epoch": 0.9809392808017292, "grad_norm": 0.15601268410682678, "learning_rate": 8.622242233228275e-05, "loss": 0.3424, "step": 312 }, { "epoch": 0.984083316958145, "grad_norm": 0.15293952822685242, "learning_rate": 8.617739756866276e-05, "loss": 0.3667, "step": 313 }, { "epoch": 0.9872273531145608, "grad_norm": 0.13952770829200745, "learning_rate": 8.613237280504279e-05, "loss": 0.3192, "step": 314 }, { "epoch": 0.9903713892709766, "grad_norm": 0.16453564167022705, "learning_rate": 8.60873480414228e-05, "loss": 0.3547, "step": 315 }, { "epoch": 0.9935154254273925, "grad_norm": 0.16414062678813934, "learning_rate": 8.604232327780279e-05, "loss": 0.3525, "step": 316 }, { "epoch": 0.9966594615838082, "grad_norm": 0.17270039021968842, "learning_rate": 8.59972985141828e-05, "loss": 0.3524, "step": 317 }, { "epoch": 0.999803497740224, "grad_norm": 0.16230063140392303, "learning_rate": 8.595227375056282e-05, "loss": 0.3647, "step": 318 }, { "epoch": 1.0, "grad_norm": 0.5973103046417236, "learning_rate": 8.590724898694283e-05, "loss": 0.3986, "step": 319 }, { "epoch": 1.0, "eval_loss": 0.36474910378456116, "eval_runtime": 127.5964, "eval_samples_per_second": 9.969, "eval_steps_per_second": 9.969, "step": 319 }, { "epoch": 1.003144036156416, "grad_norm": 0.16523776948451996, "learning_rate": 8.586222422332282e-05, "loss": 0.3198, "step": 320 }, { "epoch": 1.0062880723128316, "grad_norm": 0.1900419145822525, "learning_rate": 8.581719945970285e-05, "loss": 0.3841, "step": 321 }, { "epoch": 1.0094321084692475, "grad_norm": 0.18057386577129364, "learning_rate": 8.577217469608285e-05, "loss": 0.3279, "step": 322 }, { "epoch": 1.0125761446256631, "grad_norm": 0.17099525034427643, "learning_rate": 8.572714993246286e-05, "loss": 0.3251, "step": 323 }, { "epoch": 1.015720180782079, "grad_norm": 0.17648081481456757, "learning_rate": 8.568212516884286e-05, "loss": 0.3501, "step": 324 }, { "epoch": 1.0188642169384947, "grad_norm": 0.1697852462530136, "learning_rate": 8.563710040522288e-05, "loss": 0.3295, "step": 325 }, { "epoch": 1.0220082530949106, "grad_norm": 0.2016778141260147, "learning_rate": 8.559207564160289e-05, "loss": 0.3784, "step": 326 }, { "epoch": 1.0251522892513263, "grad_norm": 0.1521521508693695, "learning_rate": 8.55470508779829e-05, "loss": 0.3447, "step": 327 }, { "epoch": 1.0282963254077422, "grad_norm": 0.16746023297309875, "learning_rate": 8.550202611436289e-05, "loss": 0.3573, "step": 328 }, { "epoch": 1.031440361564158, "grad_norm": 0.16330087184906006, "learning_rate": 8.545700135074291e-05, "loss": 0.3227, "step": 329 }, { "epoch": 1.0345843977205738, "grad_norm": 0.15202394127845764, "learning_rate": 8.541197658712292e-05, "loss": 0.3186, "step": 330 }, { "epoch": 1.0377284338769897, "grad_norm": 0.1556456834077835, "learning_rate": 8.536695182350293e-05, "loss": 0.2919, "step": 331 }, { "epoch": 1.0408724700334053, "grad_norm": 0.16662093997001648, "learning_rate": 8.532192705988294e-05, "loss": 0.3461, "step": 332 }, { "epoch": 1.0440165061898212, "grad_norm": 0.16684111952781677, "learning_rate": 8.527690229626295e-05, "loss": 0.2995, "step": 333 }, { "epoch": 1.047160542346237, "grad_norm": 0.16788020730018616, "learning_rate": 8.523187753264296e-05, "loss": 0.3466, "step": 334 }, { "epoch": 1.0503045785026528, "grad_norm": 0.16468633711338043, "learning_rate": 8.518685276902297e-05, "loss": 0.3322, "step": 335 }, { "epoch": 1.0534486146590685, "grad_norm": 0.1772635132074356, "learning_rate": 8.514182800540297e-05, "loss": 0.3457, "step": 336 }, { "epoch": 1.0565926508154844, "grad_norm": 0.1517447978258133, "learning_rate": 8.509680324178298e-05, "loss": 0.3155, "step": 337 }, { "epoch": 1.0597366869719003, "grad_norm": 0.1648634970188141, "learning_rate": 8.505177847816299e-05, "loss": 0.3419, "step": 338 }, { "epoch": 1.062880723128316, "grad_norm": 0.18393363058567047, "learning_rate": 8.500675371454301e-05, "loss": 0.3218, "step": 339 }, { "epoch": 1.0660247592847318, "grad_norm": 0.16214938461780548, "learning_rate": 8.496172895092301e-05, "loss": 0.3316, "step": 340 }, { "epoch": 1.0691687954411475, "grad_norm": 0.15870095789432526, "learning_rate": 8.491670418730302e-05, "loss": 0.3547, "step": 341 }, { "epoch": 1.0723128315975634, "grad_norm": 0.18420541286468506, "learning_rate": 8.487167942368303e-05, "loss": 0.314, "step": 342 }, { "epoch": 1.075456867753979, "grad_norm": 0.1758188009262085, "learning_rate": 8.482665466006305e-05, "loss": 0.3361, "step": 343 }, { "epoch": 1.078600903910395, "grad_norm": 0.14741818606853485, "learning_rate": 8.478162989644304e-05, "loss": 0.328, "step": 344 }, { "epoch": 1.0817449400668107, "grad_norm": 0.18498258292675018, "learning_rate": 8.473660513282305e-05, "loss": 0.3846, "step": 345 }, { "epoch": 1.0848889762232266, "grad_norm": 0.33076727390289307, "learning_rate": 8.469158036920306e-05, "loss": 0.3456, "step": 346 }, { "epoch": 1.0880330123796425, "grad_norm": 0.14659199118614197, "learning_rate": 8.464655560558308e-05, "loss": 0.3178, "step": 347 }, { "epoch": 1.0911770485360581, "grad_norm": 0.17345285415649414, "learning_rate": 8.460153084196308e-05, "loss": 0.3508, "step": 348 }, { "epoch": 1.094321084692474, "grad_norm": 0.14883090555667877, "learning_rate": 8.455650607834309e-05, "loss": 0.3345, "step": 349 }, { "epoch": 1.0974651208488897, "grad_norm": 0.16995102167129517, "learning_rate": 8.451148131472311e-05, "loss": 0.32, "step": 350 }, { "epoch": 1.1006091570053056, "grad_norm": 0.17145007848739624, "learning_rate": 8.446645655110312e-05, "loss": 0.3687, "step": 351 }, { "epoch": 1.1037531931617213, "grad_norm": 0.1666383445262909, "learning_rate": 8.442143178748311e-05, "loss": 0.3398, "step": 352 }, { "epoch": 1.1068972293181372, "grad_norm": 0.1655489206314087, "learning_rate": 8.437640702386312e-05, "loss": 0.342, "step": 353 }, { "epoch": 1.1100412654745528, "grad_norm": 0.17032459378242493, "learning_rate": 8.433138226024314e-05, "loss": 0.3517, "step": 354 }, { "epoch": 1.1131853016309687, "grad_norm": 0.17960140109062195, "learning_rate": 8.428635749662315e-05, "loss": 0.3579, "step": 355 }, { "epoch": 1.1163293377873846, "grad_norm": 0.16178424656391144, "learning_rate": 8.424133273300316e-05, "loss": 0.355, "step": 356 }, { "epoch": 1.1194733739438003, "grad_norm": 0.15782013535499573, "learning_rate": 8.419630796938317e-05, "loss": 0.3226, "step": 357 }, { "epoch": 1.1226174101002162, "grad_norm": 0.17023907601833344, "learning_rate": 8.415128320576318e-05, "loss": 0.3392, "step": 358 }, { "epoch": 1.1257614462566319, "grad_norm": 0.1831522434949875, "learning_rate": 8.410625844214319e-05, "loss": 0.3522, "step": 359 }, { "epoch": 1.1289054824130478, "grad_norm": 0.1575043648481369, "learning_rate": 8.40612336785232e-05, "loss": 0.3441, "step": 360 }, { "epoch": 1.1320495185694635, "grad_norm": 0.16423583030700684, "learning_rate": 8.40162089149032e-05, "loss": 0.3388, "step": 361 }, { "epoch": 1.1351935547258794, "grad_norm": 0.1563112735748291, "learning_rate": 8.397118415128321e-05, "loss": 0.3417, "step": 362 }, { "epoch": 1.138337590882295, "grad_norm": 0.15290412306785583, "learning_rate": 8.392615938766322e-05, "loss": 0.3008, "step": 363 }, { "epoch": 1.141481627038711, "grad_norm": 0.1714605838060379, "learning_rate": 8.388113462404323e-05, "loss": 0.3183, "step": 364 }, { "epoch": 1.1446256631951268, "grad_norm": 0.16895225644111633, "learning_rate": 8.383610986042324e-05, "loss": 0.3214, "step": 365 }, { "epoch": 1.1477696993515425, "grad_norm": 0.18005424737930298, "learning_rate": 8.379108509680325e-05, "loss": 0.3544, "step": 366 }, { "epoch": 1.1509137355079584, "grad_norm": 0.1656702160835266, "learning_rate": 8.374606033318325e-05, "loss": 0.3307, "step": 367 }, { "epoch": 1.154057771664374, "grad_norm": 0.15588630735874176, "learning_rate": 8.370103556956326e-05, "loss": 0.3143, "step": 368 }, { "epoch": 1.15720180782079, "grad_norm": 0.18243834376335144, "learning_rate": 8.365601080594327e-05, "loss": 0.3667, "step": 369 }, { "epoch": 1.1603458439772059, "grad_norm": 0.17501448094844818, "learning_rate": 8.361098604232328e-05, "loss": 0.329, "step": 370 }, { "epoch": 1.1634898801336215, "grad_norm": 0.15979626774787903, "learning_rate": 8.356596127870329e-05, "loss": 0.3451, "step": 371 }, { "epoch": 1.1666339162900374, "grad_norm": 0.1676332652568817, "learning_rate": 8.35209365150833e-05, "loss": 0.3422, "step": 372 }, { "epoch": 1.169777952446453, "grad_norm": 0.16612845659255981, "learning_rate": 8.34759117514633e-05, "loss": 0.2995, "step": 373 }, { "epoch": 1.172921988602869, "grad_norm": 0.17852109670639038, "learning_rate": 8.343088698784331e-05, "loss": 0.3401, "step": 374 }, { "epoch": 1.1760660247592847, "grad_norm": 0.16424892842769623, "learning_rate": 8.338586222422334e-05, "loss": 0.3337, "step": 375 }, { "epoch": 1.1792100609157006, "grad_norm": 0.15577007830142975, "learning_rate": 8.334083746060334e-05, "loss": 0.325, "step": 376 }, { "epoch": 1.1823540970721163, "grad_norm": 0.1749369204044342, "learning_rate": 8.329581269698334e-05, "loss": 0.352, "step": 377 }, { "epoch": 1.1854981332285321, "grad_norm": 0.17166905105113983, "learning_rate": 8.325078793336335e-05, "loss": 0.3125, "step": 378 }, { "epoch": 1.188642169384948, "grad_norm": 0.18739891052246094, "learning_rate": 8.320576316974337e-05, "loss": 0.335, "step": 379 }, { "epoch": 1.1917862055413637, "grad_norm": 0.1669483631849289, "learning_rate": 8.316073840612338e-05, "loss": 0.3242, "step": 380 }, { "epoch": 1.1949302416977796, "grad_norm": 0.16702710092067719, "learning_rate": 8.311571364250337e-05, "loss": 0.3471, "step": 381 }, { "epoch": 1.1980742778541953, "grad_norm": 0.17773807048797607, "learning_rate": 8.307068887888338e-05, "loss": 0.3358, "step": 382 }, { "epoch": 1.2012183140106112, "grad_norm": 0.18235358595848083, "learning_rate": 8.30256641152634e-05, "loss": 0.3345, "step": 383 }, { "epoch": 1.2043623501670269, "grad_norm": 0.1792757213115692, "learning_rate": 8.298063935164341e-05, "loss": 0.3833, "step": 384 }, { "epoch": 1.2075063863234428, "grad_norm": 0.17335045337677002, "learning_rate": 8.293561458802341e-05, "loss": 0.346, "step": 385 }, { "epoch": 1.2106504224798584, "grad_norm": 0.16977883875370026, "learning_rate": 8.289058982440343e-05, "loss": 0.33, "step": 386 }, { "epoch": 1.2137944586362743, "grad_norm": 0.18401476740837097, "learning_rate": 8.284556506078344e-05, "loss": 0.3328, "step": 387 }, { "epoch": 1.2169384947926902, "grad_norm": 0.1789979636669159, "learning_rate": 8.280054029716345e-05, "loss": 0.3185, "step": 388 }, { "epoch": 1.220082530949106, "grad_norm": 0.1774056851863861, "learning_rate": 8.275551553354344e-05, "loss": 0.3099, "step": 389 }, { "epoch": 1.2232265671055218, "grad_norm": 0.17615292966365814, "learning_rate": 8.271049076992347e-05, "loss": 0.2997, "step": 390 }, { "epoch": 1.2263706032619375, "grad_norm": 0.1617576628923416, "learning_rate": 8.266546600630347e-05, "loss": 0.3286, "step": 391 }, { "epoch": 1.2295146394183534, "grad_norm": 0.18003027141094208, "learning_rate": 8.262044124268348e-05, "loss": 0.3683, "step": 392 }, { "epoch": 1.232658675574769, "grad_norm": 0.16327126324176788, "learning_rate": 8.257541647906349e-05, "loss": 0.3374, "step": 393 }, { "epoch": 1.235802711731185, "grad_norm": 0.1615103781223297, "learning_rate": 8.25303917154435e-05, "loss": 0.2987, "step": 394 }, { "epoch": 1.2389467478876006, "grad_norm": 0.17990683019161224, "learning_rate": 8.248536695182351e-05, "loss": 0.3461, "step": 395 }, { "epoch": 1.2420907840440165, "grad_norm": 0.1706767976284027, "learning_rate": 8.244034218820352e-05, "loss": 0.3611, "step": 396 }, { "epoch": 1.2452348202004324, "grad_norm": 0.16595344245433807, "learning_rate": 8.239531742458353e-05, "loss": 0.3383, "step": 397 }, { "epoch": 1.248378856356848, "grad_norm": 0.17336389422416687, "learning_rate": 8.235029266096353e-05, "loss": 0.3407, "step": 398 }, { "epoch": 1.251522892513264, "grad_norm": 0.1728227734565735, "learning_rate": 8.230526789734354e-05, "loss": 0.3511, "step": 399 }, { "epoch": 1.2546669286696797, "grad_norm": 0.17600099742412567, "learning_rate": 8.226024313372355e-05, "loss": 0.3589, "step": 400 }, { "epoch": 1.2578109648260956, "grad_norm": 0.15924879908561707, "learning_rate": 8.221521837010356e-05, "loss": 0.3262, "step": 401 }, { "epoch": 1.2609550009825112, "grad_norm": 0.17619210481643677, "learning_rate": 8.217019360648357e-05, "loss": 0.3132, "step": 402 }, { "epoch": 1.2640990371389271, "grad_norm": 0.1946687251329422, "learning_rate": 8.212516884286358e-05, "loss": 0.3675, "step": 403 }, { "epoch": 1.2672430732953428, "grad_norm": 0.17818918824195862, "learning_rate": 8.20801440792436e-05, "loss": 0.3134, "step": 404 }, { "epoch": 1.2703871094517587, "grad_norm": 0.16201846301555634, "learning_rate": 8.20351193156236e-05, "loss": 0.3293, "step": 405 }, { "epoch": 1.2735311456081746, "grad_norm": 0.1574605256319046, "learning_rate": 8.19900945520036e-05, "loss": 0.3198, "step": 406 }, { "epoch": 1.2766751817645903, "grad_norm": 0.19238558411598206, "learning_rate": 8.194506978838361e-05, "loss": 0.3801, "step": 407 }, { "epoch": 1.2798192179210062, "grad_norm": 0.19049645960330963, "learning_rate": 8.190004502476363e-05, "loss": 0.3485, "step": 408 }, { "epoch": 1.2829632540774218, "grad_norm": 0.16655929386615753, "learning_rate": 8.185502026114363e-05, "loss": 0.329, "step": 409 }, { "epoch": 1.2861072902338377, "grad_norm": 0.17940281331539154, "learning_rate": 8.180999549752364e-05, "loss": 0.3396, "step": 410 }, { "epoch": 1.2892513263902534, "grad_norm": 0.15868957340717316, "learning_rate": 8.176497073390366e-05, "loss": 0.3079, "step": 411 }, { "epoch": 1.2923953625466693, "grad_norm": 0.1568639725446701, "learning_rate": 8.171994597028367e-05, "loss": 0.3286, "step": 412 }, { "epoch": 1.295539398703085, "grad_norm": 0.16856297850608826, "learning_rate": 8.167492120666366e-05, "loss": 0.329, "step": 413 }, { "epoch": 1.2986834348595009, "grad_norm": 0.17020520567893982, "learning_rate": 8.162989644304367e-05, "loss": 0.3191, "step": 414 }, { "epoch": 1.3018274710159168, "grad_norm": 0.16772745549678802, "learning_rate": 8.15848716794237e-05, "loss": 0.3425, "step": 415 }, { "epoch": 1.3049715071723325, "grad_norm": 0.17022623121738434, "learning_rate": 8.15398469158037e-05, "loss": 0.3055, "step": 416 }, { "epoch": 1.3081155433287484, "grad_norm": 0.16747280955314636, "learning_rate": 8.14948221521837e-05, "loss": 0.348, "step": 417 }, { "epoch": 1.311259579485164, "grad_norm": 0.17487798631191254, "learning_rate": 8.14497973885637e-05, "loss": 0.3286, "step": 418 }, { "epoch": 1.31440361564158, "grad_norm": 0.18908794224262238, "learning_rate": 8.140477262494373e-05, "loss": 0.3402, "step": 419 }, { "epoch": 1.3175476517979956, "grad_norm": 0.1633966714143753, "learning_rate": 8.135974786132374e-05, "loss": 0.3385, "step": 420 }, { "epoch": 1.3206916879544115, "grad_norm": 0.17941975593566895, "learning_rate": 8.131472309770373e-05, "loss": 0.3506, "step": 421 }, { "epoch": 1.3238357241108272, "grad_norm": 0.1576979160308838, "learning_rate": 8.126969833408375e-05, "loss": 0.3233, "step": 422 }, { "epoch": 1.326979760267243, "grad_norm": 0.16325940191745758, "learning_rate": 8.122467357046376e-05, "loss": 0.321, "step": 423 }, { "epoch": 1.330123796423659, "grad_norm": 0.1745273768901825, "learning_rate": 8.117964880684377e-05, "loss": 0.3257, "step": 424 }, { "epoch": 1.3332678325800746, "grad_norm": 0.16034242510795593, "learning_rate": 8.113462404322378e-05, "loss": 0.3083, "step": 425 }, { "epoch": 1.3364118687364905, "grad_norm": 0.1664658933877945, "learning_rate": 8.108959927960379e-05, "loss": 0.3368, "step": 426 }, { "epoch": 1.3395559048929062, "grad_norm": 0.17093968391418457, "learning_rate": 8.10445745159838e-05, "loss": 0.3288, "step": 427 }, { "epoch": 1.342699941049322, "grad_norm": 0.17588087916374207, "learning_rate": 8.09995497523638e-05, "loss": 0.3344, "step": 428 }, { "epoch": 1.3458439772057378, "grad_norm": 0.16316810250282288, "learning_rate": 8.095452498874381e-05, "loss": 0.3205, "step": 429 }, { "epoch": 1.3489880133621537, "grad_norm": 0.1559893637895584, "learning_rate": 8.090950022512382e-05, "loss": 0.3098, "step": 430 }, { "epoch": 1.3521320495185694, "grad_norm": 0.17345042526721954, "learning_rate": 8.086447546150383e-05, "loss": 0.3523, "step": 431 }, { "epoch": 1.3552760856749853, "grad_norm": 0.18487606942653656, "learning_rate": 8.081945069788384e-05, "loss": 0.3334, "step": 432 }, { "epoch": 1.3584201218314012, "grad_norm": 0.1806030422449112, "learning_rate": 8.077442593426385e-05, "loss": 0.3668, "step": 433 }, { "epoch": 1.3615641579878168, "grad_norm": 0.1625894010066986, "learning_rate": 8.072940117064386e-05, "loss": 0.3043, "step": 434 }, { "epoch": 1.3647081941442327, "grad_norm": 0.15658941864967346, "learning_rate": 8.068437640702387e-05, "loss": 0.3079, "step": 435 }, { "epoch": 1.3678522303006484, "grad_norm": 0.17467805743217468, "learning_rate": 8.063935164340387e-05, "loss": 0.3187, "step": 436 }, { "epoch": 1.3709962664570643, "grad_norm": 0.17728790640830994, "learning_rate": 8.059432687978388e-05, "loss": 0.3338, "step": 437 }, { "epoch": 1.3741403026134802, "grad_norm": 0.17277754843235016, "learning_rate": 8.054930211616389e-05, "loss": 0.3461, "step": 438 }, { "epoch": 1.3772843387698959, "grad_norm": 0.17730361223220825, "learning_rate": 8.05042773525439e-05, "loss": 0.3405, "step": 439 }, { "epoch": 1.3804283749263115, "grad_norm": 0.16203267872333527, "learning_rate": 8.045925258892392e-05, "loss": 0.3207, "step": 440 }, { "epoch": 1.3835724110827274, "grad_norm": 0.15925423800945282, "learning_rate": 8.041422782530392e-05, "loss": 0.322, "step": 441 }, { "epoch": 1.3867164472391433, "grad_norm": 0.1678253710269928, "learning_rate": 8.036920306168393e-05, "loss": 0.3166, "step": 442 }, { "epoch": 1.389860483395559, "grad_norm": 0.16990883648395538, "learning_rate": 8.032417829806393e-05, "loss": 0.3331, "step": 443 }, { "epoch": 1.393004519551975, "grad_norm": 0.159279003739357, "learning_rate": 8.027915353444396e-05, "loss": 0.343, "step": 444 }, { "epoch": 1.3961485557083906, "grad_norm": 0.16234956681728363, "learning_rate": 8.023412877082395e-05, "loss": 0.3369, "step": 445 }, { "epoch": 1.3992925918648065, "grad_norm": 0.15107199549674988, "learning_rate": 8.018910400720396e-05, "loss": 0.317, "step": 446 }, { "epoch": 1.4024366280212224, "grad_norm": 0.17709723114967346, "learning_rate": 8.014407924358398e-05, "loss": 0.3047, "step": 447 }, { "epoch": 1.405580664177638, "grad_norm": 0.1607123166322708, "learning_rate": 8.009905447996399e-05, "loss": 0.3266, "step": 448 }, { "epoch": 1.4087247003340537, "grad_norm": 0.1606593281030655, "learning_rate": 8.0054029716344e-05, "loss": 0.3457, "step": 449 }, { "epoch": 1.4118687364904696, "grad_norm": 0.16384440660476685, "learning_rate": 8.0009004952724e-05, "loss": 0.3561, "step": 450 }, { "epoch": 1.4150127726468855, "grad_norm": 0.16856162250041962, "learning_rate": 7.996398018910402e-05, "loss": 0.3179, "step": 451 }, { "epoch": 1.4181568088033012, "grad_norm": 0.16696326434612274, "learning_rate": 7.991895542548402e-05, "loss": 0.318, "step": 452 }, { "epoch": 1.421300844959717, "grad_norm": 0.1651603877544403, "learning_rate": 7.987393066186403e-05, "loss": 0.3252, "step": 453 }, { "epoch": 1.4244448811161328, "grad_norm": 0.15790776908397675, "learning_rate": 7.982890589824403e-05, "loss": 0.3415, "step": 454 }, { "epoch": 1.4275889172725487, "grad_norm": 0.160242959856987, "learning_rate": 7.978388113462405e-05, "loss": 0.328, "step": 455 }, { "epoch": 1.4307329534289646, "grad_norm": 0.18906033039093018, "learning_rate": 7.973885637100406e-05, "loss": 0.3346, "step": 456 }, { "epoch": 1.4338769895853802, "grad_norm": 0.17506416141986847, "learning_rate": 7.969383160738407e-05, "loss": 0.3486, "step": 457 }, { "epoch": 1.437021025741796, "grad_norm": 0.1868247091770172, "learning_rate": 7.964880684376408e-05, "loss": 0.3456, "step": 458 }, { "epoch": 1.4401650618982118, "grad_norm": 0.1670251041650772, "learning_rate": 7.960378208014408e-05, "loss": 0.3099, "step": 459 }, { "epoch": 1.4433090980546277, "grad_norm": 0.16608436405658722, "learning_rate": 7.95587573165241e-05, "loss": 0.3304, "step": 460 }, { "epoch": 1.4464531342110434, "grad_norm": 0.1757041960954666, "learning_rate": 7.95137325529041e-05, "loss": 0.3414, "step": 461 }, { "epoch": 1.4495971703674593, "grad_norm": 0.18956035375595093, "learning_rate": 7.946870778928411e-05, "loss": 0.3475, "step": 462 }, { "epoch": 1.452741206523875, "grad_norm": 0.1749645173549652, "learning_rate": 7.942368302566412e-05, "loss": 0.3453, "step": 463 }, { "epoch": 1.4558852426802908, "grad_norm": 0.17661790549755096, "learning_rate": 7.937865826204413e-05, "loss": 0.3122, "step": 464 }, { "epoch": 1.4590292788367067, "grad_norm": 0.16606219112873077, "learning_rate": 7.933363349842414e-05, "loss": 0.3275, "step": 465 }, { "epoch": 1.4621733149931224, "grad_norm": 0.17825940251350403, "learning_rate": 7.928860873480414e-05, "loss": 0.3329, "step": 466 }, { "epoch": 1.465317351149538, "grad_norm": 0.1581132709980011, "learning_rate": 7.924358397118415e-05, "loss": 0.33, "step": 467 }, { "epoch": 1.468461387305954, "grad_norm": 0.17290616035461426, "learning_rate": 7.919855920756416e-05, "loss": 0.3066, "step": 468 }, { "epoch": 1.4716054234623699, "grad_norm": 0.1645108461380005, "learning_rate": 7.915353444394417e-05, "loss": 0.339, "step": 469 }, { "epoch": 1.4747494596187856, "grad_norm": 0.18261072039604187, "learning_rate": 7.910850968032418e-05, "loss": 0.3947, "step": 470 }, { "epoch": 1.4778934957752015, "grad_norm": 0.15812186896800995, "learning_rate": 7.906348491670419e-05, "loss": 0.31, "step": 471 }, { "epoch": 1.4810375319316171, "grad_norm": 0.1644250452518463, "learning_rate": 7.90184601530842e-05, "loss": 0.3567, "step": 472 }, { "epoch": 1.484181568088033, "grad_norm": 0.1638532429933548, "learning_rate": 7.897343538946422e-05, "loss": 0.3285, "step": 473 }, { "epoch": 1.487325604244449, "grad_norm": 0.16853764653205872, "learning_rate": 7.892841062584421e-05, "loss": 0.3515, "step": 474 }, { "epoch": 1.4904696404008646, "grad_norm": 0.1728491187095642, "learning_rate": 7.888338586222422e-05, "loss": 0.3513, "step": 475 }, { "epoch": 1.4936136765572803, "grad_norm": 0.17471279203891754, "learning_rate": 7.883836109860424e-05, "loss": 0.3419, "step": 476 }, { "epoch": 1.4967577127136962, "grad_norm": 0.15975187718868256, "learning_rate": 7.879333633498425e-05, "loss": 0.318, "step": 477 }, { "epoch": 1.499901748870112, "grad_norm": 0.18203528225421906, "learning_rate": 7.874831157136425e-05, "loss": 0.3434, "step": 478 }, { "epoch": 1.503045785026528, "grad_norm": 0.16927766799926758, "learning_rate": 7.870328680774426e-05, "loss": 0.3272, "step": 479 }, { "epoch": 1.5061898211829436, "grad_norm": 0.16967304050922394, "learning_rate": 7.865826204412428e-05, "loss": 0.3319, "step": 480 }, { "epoch": 1.5093338573393593, "grad_norm": 0.16304318606853485, "learning_rate": 7.861323728050429e-05, "loss": 0.3298, "step": 481 }, { "epoch": 1.5124778934957752, "grad_norm": 0.1505865454673767, "learning_rate": 7.856821251688428e-05, "loss": 0.3066, "step": 482 }, { "epoch": 1.5156219296521911, "grad_norm": 0.16117632389068604, "learning_rate": 7.85231877532643e-05, "loss": 0.3203, "step": 483 }, { "epoch": 1.5187659658086068, "grad_norm": 0.17701475322246552, "learning_rate": 7.847816298964431e-05, "loss": 0.3548, "step": 484 }, { "epoch": 1.5219100019650225, "grad_norm": 0.17042210698127747, "learning_rate": 7.843313822602432e-05, "loss": 0.3231, "step": 485 }, { "epoch": 1.5250540381214384, "grad_norm": 0.16220158338546753, "learning_rate": 7.838811346240432e-05, "loss": 0.3622, "step": 486 }, { "epoch": 1.5281980742778543, "grad_norm": 0.15978193283081055, "learning_rate": 7.834308869878434e-05, "loss": 0.3288, "step": 487 }, { "epoch": 1.5313421104342702, "grad_norm": 0.1720646172761917, "learning_rate": 7.829806393516435e-05, "loss": 0.3243, "step": 488 }, { "epoch": 1.5344861465906858, "grad_norm": 0.17222121357917786, "learning_rate": 7.825303917154436e-05, "loss": 0.3162, "step": 489 }, { "epoch": 1.5376301827471015, "grad_norm": 0.1751534342765808, "learning_rate": 7.820801440792435e-05, "loss": 0.337, "step": 490 }, { "epoch": 1.5407742189035174, "grad_norm": 0.17199626564979553, "learning_rate": 7.816298964430437e-05, "loss": 0.3386, "step": 491 }, { "epoch": 1.5439182550599333, "grad_norm": 0.1767290085554123, "learning_rate": 7.811796488068438e-05, "loss": 0.3394, "step": 492 }, { "epoch": 1.547062291216349, "grad_norm": 0.1636359840631485, "learning_rate": 7.807294011706439e-05, "loss": 0.3455, "step": 493 }, { "epoch": 1.5502063273727646, "grad_norm": 0.17175675928592682, "learning_rate": 7.80279153534444e-05, "loss": 0.2961, "step": 494 }, { "epoch": 1.5533503635291805, "grad_norm": 0.1556348353624344, "learning_rate": 7.798289058982441e-05, "loss": 0.3021, "step": 495 }, { "epoch": 1.5564943996855964, "grad_norm": 0.16222867369651794, "learning_rate": 7.793786582620442e-05, "loss": 0.2764, "step": 496 }, { "epoch": 1.5596384358420123, "grad_norm": 0.19224032759666443, "learning_rate": 7.789284106258442e-05, "loss": 0.3863, "step": 497 }, { "epoch": 1.562782471998428, "grad_norm": 0.16556838154792786, "learning_rate": 7.784781629896443e-05, "loss": 0.3244, "step": 498 }, { "epoch": 1.5659265081548437, "grad_norm": 0.16483165323734283, "learning_rate": 7.780279153534444e-05, "loss": 0.3234, "step": 499 }, { "epoch": 1.5690705443112596, "grad_norm": 0.16399100422859192, "learning_rate": 7.775776677172445e-05, "loss": 0.3178, "step": 500 }, { "epoch": 1.5722145804676755, "grad_norm": 0.16007336974143982, "learning_rate": 7.771274200810447e-05, "loss": 0.3027, "step": 501 }, { "epoch": 1.5753586166240912, "grad_norm": 0.20383314788341522, "learning_rate": 7.766771724448447e-05, "loss": 0.3464, "step": 502 }, { "epoch": 1.5785026527805068, "grad_norm": 0.17048481106758118, "learning_rate": 7.762269248086448e-05, "loss": 0.3195, "step": 503 }, { "epoch": 1.5816466889369227, "grad_norm": 0.1666378229856491, "learning_rate": 7.757766771724448e-05, "loss": 0.3205, "step": 504 }, { "epoch": 1.5847907250933386, "grad_norm": 0.1569521576166153, "learning_rate": 7.753264295362451e-05, "loss": 0.3115, "step": 505 }, { "epoch": 1.5879347612497545, "grad_norm": 0.15977101027965546, "learning_rate": 7.74876181900045e-05, "loss": 0.3028, "step": 506 }, { "epoch": 1.5910787974061702, "grad_norm": 0.1699342429637909, "learning_rate": 7.744259342638451e-05, "loss": 0.352, "step": 507 }, { "epoch": 1.5942228335625859, "grad_norm": 0.15689171850681305, "learning_rate": 7.739756866276452e-05, "loss": 0.3067, "step": 508 }, { "epoch": 1.5973668697190018, "grad_norm": 0.16112978756427765, "learning_rate": 7.735254389914454e-05, "loss": 0.3119, "step": 509 }, { "epoch": 1.6005109058754177, "grad_norm": 0.16471809148788452, "learning_rate": 7.730751913552454e-05, "loss": 0.3206, "step": 510 }, { "epoch": 1.6036549420318333, "grad_norm": 0.16362188756465912, "learning_rate": 7.726249437190454e-05, "loss": 0.3385, "step": 511 }, { "epoch": 1.606798978188249, "grad_norm": 0.17033949494361877, "learning_rate": 7.721746960828457e-05, "loss": 0.3212, "step": 512 }, { "epoch": 1.609943014344665, "grad_norm": 0.2101941555738449, "learning_rate": 7.717244484466458e-05, "loss": 0.3414, "step": 513 }, { "epoch": 1.6130870505010808, "grad_norm": 0.1708775758743286, "learning_rate": 7.712742008104457e-05, "loss": 0.3165, "step": 514 }, { "epoch": 1.6162310866574967, "grad_norm": 0.15954916179180145, "learning_rate": 7.708239531742458e-05, "loss": 0.2882, "step": 515 }, { "epoch": 1.6193751228139124, "grad_norm": 0.18500474095344543, "learning_rate": 7.70373705538046e-05, "loss": 0.3406, "step": 516 }, { "epoch": 1.622519158970328, "grad_norm": 0.15775783360004425, "learning_rate": 7.699234579018461e-05, "loss": 0.3435, "step": 517 }, { "epoch": 1.625663195126744, "grad_norm": 0.17533423006534576, "learning_rate": 7.69473210265646e-05, "loss": 0.3378, "step": 518 }, { "epoch": 1.6288072312831599, "grad_norm": 0.1671397089958191, "learning_rate": 7.690229626294463e-05, "loss": 0.3184, "step": 519 }, { "epoch": 1.6319512674395755, "grad_norm": 0.16599297523498535, "learning_rate": 7.685727149932464e-05, "loss": 0.32, "step": 520 }, { "epoch": 1.6350953035959912, "grad_norm": 0.16642846167087555, "learning_rate": 7.681224673570464e-05, "loss": 0.3193, "step": 521 }, { "epoch": 1.638239339752407, "grad_norm": 0.1668187826871872, "learning_rate": 7.676722197208465e-05, "loss": 0.3361, "step": 522 }, { "epoch": 1.641383375908823, "grad_norm": 0.165154829621315, "learning_rate": 7.672219720846466e-05, "loss": 0.3257, "step": 523 }, { "epoch": 1.644527412065239, "grad_norm": 0.1803310215473175, "learning_rate": 7.667717244484467e-05, "loss": 0.3592, "step": 524 }, { "epoch": 1.6476714482216546, "grad_norm": 0.1636355221271515, "learning_rate": 7.663214768122468e-05, "loss": 0.32, "step": 525 }, { "epoch": 1.6508154843780702, "grad_norm": 0.17002321779727936, "learning_rate": 7.658712291760469e-05, "loss": 0.3532, "step": 526 }, { "epoch": 1.6539595205344861, "grad_norm": 0.17365287244319916, "learning_rate": 7.65420981539847e-05, "loss": 0.3543, "step": 527 }, { "epoch": 1.657103556690902, "grad_norm": 0.18107831478118896, "learning_rate": 7.64970733903647e-05, "loss": 0.3296, "step": 528 }, { "epoch": 1.6602475928473177, "grad_norm": 0.15890681743621826, "learning_rate": 7.645204862674471e-05, "loss": 0.3489, "step": 529 }, { "epoch": 1.6633916290037334, "grad_norm": 0.16823096573352814, "learning_rate": 7.640702386312472e-05, "loss": 0.3429, "step": 530 }, { "epoch": 1.6665356651601493, "grad_norm": 0.16939134895801544, "learning_rate": 7.636199909950473e-05, "loss": 0.2871, "step": 531 }, { "epoch": 1.6696797013165652, "grad_norm": 0.18005231022834778, "learning_rate": 7.631697433588474e-05, "loss": 0.3297, "step": 532 }, { "epoch": 1.672823737472981, "grad_norm": 0.18011395633220673, "learning_rate": 7.627194957226475e-05, "loss": 0.3239, "step": 533 }, { "epoch": 1.6759677736293968, "grad_norm": 0.16882002353668213, "learning_rate": 7.622692480864476e-05, "loss": 0.3578, "step": 534 }, { "epoch": 1.6791118097858124, "grad_norm": 0.17064803838729858, "learning_rate": 7.618190004502476e-05, "loss": 0.3324, "step": 535 }, { "epoch": 1.6822558459422283, "grad_norm": 0.1657048612833023, "learning_rate": 7.613687528140477e-05, "loss": 0.3312, "step": 536 }, { "epoch": 1.6853998820986442, "grad_norm": 0.1642867475748062, "learning_rate": 7.60918505177848e-05, "loss": 0.2868, "step": 537 }, { "epoch": 1.68854391825506, "grad_norm": 0.16523368656635284, "learning_rate": 7.604682575416479e-05, "loss": 0.3333, "step": 538 }, { "epoch": 1.6916879544114756, "grad_norm": 0.16212032735347748, "learning_rate": 7.60018009905448e-05, "loss": 0.3211, "step": 539 }, { "epoch": 1.6948319905678915, "grad_norm": 0.15837180614471436, "learning_rate": 7.595677622692481e-05, "loss": 0.3132, "step": 540 }, { "epoch": 1.6979760267243074, "grad_norm": 0.1792554408311844, "learning_rate": 7.591175146330483e-05, "loss": 0.3682, "step": 541 }, { "epoch": 1.7011200628807233, "grad_norm": 0.1621055006980896, "learning_rate": 7.586672669968482e-05, "loss": 0.3041, "step": 542 }, { "epoch": 1.704264099037139, "grad_norm": 0.17493993043899536, "learning_rate": 7.582170193606483e-05, "loss": 0.32, "step": 543 }, { "epoch": 1.7074081351935546, "grad_norm": 0.1641577035188675, "learning_rate": 7.577667717244484e-05, "loss": 0.3394, "step": 544 }, { "epoch": 1.7105521713499705, "grad_norm": 0.16471892595291138, "learning_rate": 7.573165240882486e-05, "loss": 0.3385, "step": 545 }, { "epoch": 1.7136962075063864, "grad_norm": 0.17224904894828796, "learning_rate": 7.568662764520487e-05, "loss": 0.2871, "step": 546 }, { "epoch": 1.716840243662802, "grad_norm": 0.17789947986602783, "learning_rate": 7.564160288158487e-05, "loss": 0.3517, "step": 547 }, { "epoch": 1.719984279819218, "grad_norm": 0.15281309187412262, "learning_rate": 7.559657811796489e-05, "loss": 0.31, "step": 548 }, { "epoch": 1.7231283159756337, "grad_norm": 0.16906626522541046, "learning_rate": 7.55515533543449e-05, "loss": 0.3377, "step": 549 }, { "epoch": 1.7262723521320495, "grad_norm": 0.1630704551935196, "learning_rate": 7.550652859072491e-05, "loss": 0.3271, "step": 550 }, { "epoch": 1.7294163882884654, "grad_norm": 0.17200839519500732, "learning_rate": 7.54615038271049e-05, "loss": 0.3351, "step": 551 }, { "epoch": 1.7325604244448811, "grad_norm": 0.17246218025684357, "learning_rate": 7.541647906348492e-05, "loss": 0.3646, "step": 552 }, { "epoch": 1.7357044606012968, "grad_norm": 0.17717881500720978, "learning_rate": 7.537145429986493e-05, "loss": 0.3399, "step": 553 }, { "epoch": 1.7388484967577127, "grad_norm": 0.1677379459142685, "learning_rate": 7.532642953624494e-05, "loss": 0.3588, "step": 554 }, { "epoch": 1.7419925329141286, "grad_norm": 0.16207338869571686, "learning_rate": 7.528140477262495e-05, "loss": 0.3258, "step": 555 }, { "epoch": 1.7451365690705443, "grad_norm": 0.15249457955360413, "learning_rate": 7.523638000900496e-05, "loss": 0.3359, "step": 556 }, { "epoch": 1.7482806052269602, "grad_norm": 0.1653815507888794, "learning_rate": 7.519135524538497e-05, "loss": 0.3118, "step": 557 }, { "epoch": 1.7514246413833758, "grad_norm": 0.16432282328605652, "learning_rate": 7.514633048176498e-05, "loss": 0.3059, "step": 558 }, { "epoch": 1.7545686775397917, "grad_norm": 0.16189369559288025, "learning_rate": 7.510130571814498e-05, "loss": 0.3296, "step": 559 }, { "epoch": 1.7577127136962076, "grad_norm": 0.15887871384620667, "learning_rate": 7.505628095452499e-05, "loss": 0.3251, "step": 560 }, { "epoch": 1.7608567498526233, "grad_norm": 0.16255879402160645, "learning_rate": 7.5011256190905e-05, "loss": 0.2987, "step": 561 }, { "epoch": 1.764000786009039, "grad_norm": 0.15512339770793915, "learning_rate": 7.496623142728501e-05, "loss": 0.3204, "step": 562 }, { "epoch": 1.7671448221654549, "grad_norm": 0.185036301612854, "learning_rate": 7.492120666366502e-05, "loss": 0.3184, "step": 563 }, { "epoch": 1.7702888583218708, "grad_norm": 0.16788752377033234, "learning_rate": 7.487618190004503e-05, "loss": 0.318, "step": 564 }, { "epoch": 1.7734328944782864, "grad_norm": 0.16279074549674988, "learning_rate": 7.483115713642504e-05, "loss": 0.3254, "step": 565 }, { "epoch": 1.7765769306347023, "grad_norm": 0.16731205582618713, "learning_rate": 7.478613237280504e-05, "loss": 0.359, "step": 566 }, { "epoch": 1.779720966791118, "grad_norm": 0.18140964210033417, "learning_rate": 7.474110760918505e-05, "loss": 0.3386, "step": 567 }, { "epoch": 1.782865002947534, "grad_norm": 0.16301101446151733, "learning_rate": 7.469608284556506e-05, "loss": 0.33, "step": 568 }, { "epoch": 1.7860090391039498, "grad_norm": 0.15827955305576324, "learning_rate": 7.465105808194507e-05, "loss": 0.3171, "step": 569 }, { "epoch": 1.7891530752603655, "grad_norm": 0.1748654991388321, "learning_rate": 7.460603331832509e-05, "loss": 0.3536, "step": 570 }, { "epoch": 1.7922971114167812, "grad_norm": 0.1615212857723236, "learning_rate": 7.456100855470509e-05, "loss": 0.3183, "step": 571 }, { "epoch": 1.795441147573197, "grad_norm": 0.15701080858707428, "learning_rate": 7.45159837910851e-05, "loss": 0.3452, "step": 572 }, { "epoch": 1.798585183729613, "grad_norm": 0.14941848814487457, "learning_rate": 7.447095902746512e-05, "loss": 0.3309, "step": 573 }, { "epoch": 1.8017292198860289, "grad_norm": 0.16245996952056885, "learning_rate": 7.442593426384513e-05, "loss": 0.3408, "step": 574 }, { "epoch": 1.8048732560424445, "grad_norm": 0.1553889513015747, "learning_rate": 7.438090950022512e-05, "loss": 0.3117, "step": 575 }, { "epoch": 1.8080172921988602, "grad_norm": 0.1645231992006302, "learning_rate": 7.433588473660513e-05, "loss": 0.311, "step": 576 }, { "epoch": 1.811161328355276, "grad_norm": 0.17609386146068573, "learning_rate": 7.429085997298515e-05, "loss": 0.3275, "step": 577 }, { "epoch": 1.814305364511692, "grad_norm": 0.15462878346443176, "learning_rate": 7.424583520936516e-05, "loss": 0.3373, "step": 578 }, { "epoch": 1.8174494006681077, "grad_norm": 0.15583834052085876, "learning_rate": 7.420081044574516e-05, "loss": 0.325, "step": 579 }, { "epoch": 1.8205934368245233, "grad_norm": 0.1692819744348526, "learning_rate": 7.415578568212516e-05, "loss": 0.3074, "step": 580 }, { "epoch": 1.8237374729809392, "grad_norm": 0.17038291692733765, "learning_rate": 7.411076091850519e-05, "loss": 0.3401, "step": 581 }, { "epoch": 1.8268815091373551, "grad_norm": 0.16895835101604462, "learning_rate": 7.40657361548852e-05, "loss": 0.3199, "step": 582 }, { "epoch": 1.830025545293771, "grad_norm": 0.1672675460577011, "learning_rate": 7.402071139126519e-05, "loss": 0.315, "step": 583 }, { "epoch": 1.8331695814501867, "grad_norm": 0.17299684882164001, "learning_rate": 7.397568662764521e-05, "loss": 0.3516, "step": 584 }, { "epoch": 1.8363136176066024, "grad_norm": 0.166487455368042, "learning_rate": 7.393066186402522e-05, "loss": 0.3324, "step": 585 }, { "epoch": 1.8394576537630183, "grad_norm": 0.17135372757911682, "learning_rate": 7.388563710040523e-05, "loss": 0.3509, "step": 586 }, { "epoch": 1.8426016899194342, "grad_norm": 0.1614961177110672, "learning_rate": 7.384061233678522e-05, "loss": 0.3074, "step": 587 }, { "epoch": 1.8457457260758499, "grad_norm": 0.16727496683597565, "learning_rate": 7.379558757316525e-05, "loss": 0.3318, "step": 588 }, { "epoch": 1.8488897622322655, "grad_norm": 0.17604076862335205, "learning_rate": 7.375056280954526e-05, "loss": 0.3412, "step": 589 }, { "epoch": 1.8520337983886814, "grad_norm": 0.15882007777690887, "learning_rate": 7.370553804592526e-05, "loss": 0.3233, "step": 590 }, { "epoch": 1.8551778345450973, "grad_norm": 0.162909135222435, "learning_rate": 7.366051328230527e-05, "loss": 0.3081, "step": 591 }, { "epoch": 1.8583218707015132, "grad_norm": 0.16135378181934357, "learning_rate": 7.361548851868528e-05, "loss": 0.3022, "step": 592 }, { "epoch": 1.861465906857929, "grad_norm": 0.16229267418384552, "learning_rate": 7.357046375506529e-05, "loss": 0.2886, "step": 593 }, { "epoch": 1.8646099430143446, "grad_norm": 0.18666622042655945, "learning_rate": 7.35254389914453e-05, "loss": 0.3307, "step": 594 }, { "epoch": 1.8677539791707605, "grad_norm": 0.17704243957996368, "learning_rate": 7.348041422782531e-05, "loss": 0.3407, "step": 595 }, { "epoch": 1.8708980153271764, "grad_norm": 0.1568397730588913, "learning_rate": 7.343538946420532e-05, "loss": 0.3154, "step": 596 }, { "epoch": 1.874042051483592, "grad_norm": 0.1668362319469452, "learning_rate": 7.339036470058532e-05, "loss": 0.3098, "step": 597 }, { "epoch": 1.8771860876400077, "grad_norm": 0.171774223446846, "learning_rate": 7.334533993696533e-05, "loss": 0.3255, "step": 598 }, { "epoch": 1.8803301237964236, "grad_norm": 0.1613488346338272, "learning_rate": 7.330031517334534e-05, "loss": 0.3373, "step": 599 }, { "epoch": 1.8834741599528395, "grad_norm": 0.17803837358951569, "learning_rate": 7.325529040972535e-05, "loss": 0.3438, "step": 600 }, { "epoch": 1.8866181961092554, "grad_norm": 0.16244632005691528, "learning_rate": 7.321026564610536e-05, "loss": 0.3253, "step": 601 }, { "epoch": 1.889762232265671, "grad_norm": 0.17939813435077667, "learning_rate": 7.316524088248538e-05, "loss": 0.3535, "step": 602 }, { "epoch": 1.8929062684220868, "grad_norm": 0.17398670315742493, "learning_rate": 7.312021611886538e-05, "loss": 0.3512, "step": 603 }, { "epoch": 1.8960503045785027, "grad_norm": 0.1678583174943924, "learning_rate": 7.307519135524538e-05, "loss": 0.3625, "step": 604 }, { "epoch": 1.8991943407349186, "grad_norm": 0.16046324372291565, "learning_rate": 7.303016659162539e-05, "loss": 0.3297, "step": 605 }, { "epoch": 1.9023383768913342, "grad_norm": 0.1703193485736847, "learning_rate": 7.298514182800542e-05, "loss": 0.3515, "step": 606 }, { "epoch": 1.90548241304775, "grad_norm": 0.16937416791915894, "learning_rate": 7.294011706438541e-05, "loss": 0.2962, "step": 607 }, { "epoch": 1.9086264492041658, "grad_norm": 0.17036621272563934, "learning_rate": 7.289509230076542e-05, "loss": 0.3248, "step": 608 }, { "epoch": 1.9117704853605817, "grad_norm": 0.15427525341510773, "learning_rate": 7.285006753714544e-05, "loss": 0.305, "step": 609 }, { "epoch": 1.9149145215169976, "grad_norm": 0.17526637017726898, "learning_rate": 7.280504277352545e-05, "loss": 0.3188, "step": 610 }, { "epoch": 1.9180585576734133, "grad_norm": 0.17091768980026245, "learning_rate": 7.276001800990544e-05, "loss": 0.3546, "step": 611 }, { "epoch": 1.921202593829829, "grad_norm": 0.16977418959140778, "learning_rate": 7.271499324628545e-05, "loss": 0.3338, "step": 612 }, { "epoch": 1.9243466299862448, "grad_norm": 0.16019406914710999, "learning_rate": 7.266996848266548e-05, "loss": 0.3004, "step": 613 }, { "epoch": 1.9274906661426607, "grad_norm": 0.15521669387817383, "learning_rate": 7.262494371904548e-05, "loss": 0.3466, "step": 614 }, { "epoch": 1.9306347022990764, "grad_norm": 0.1689404398202896, "learning_rate": 7.257991895542548e-05, "loss": 0.3216, "step": 615 }, { "epoch": 1.933778738455492, "grad_norm": 0.15882880985736847, "learning_rate": 7.253489419180549e-05, "loss": 0.3476, "step": 616 }, { "epoch": 1.936922774611908, "grad_norm": 0.1685037910938263, "learning_rate": 7.248986942818551e-05, "loss": 0.3784, "step": 617 }, { "epoch": 1.9400668107683239, "grad_norm": 0.17059996724128723, "learning_rate": 7.244484466456552e-05, "loss": 0.3173, "step": 618 }, { "epoch": 1.9432108469247398, "grad_norm": 0.18009281158447266, "learning_rate": 7.239981990094553e-05, "loss": 0.3188, "step": 619 }, { "epoch": 1.9463548830811555, "grad_norm": 0.17357571423053741, "learning_rate": 7.235479513732554e-05, "loss": 0.3375, "step": 620 }, { "epoch": 1.9494989192375711, "grad_norm": 0.15662403404712677, "learning_rate": 7.230977037370554e-05, "loss": 0.3219, "step": 621 }, { "epoch": 1.952642955393987, "grad_norm": 0.1947060227394104, "learning_rate": 7.226474561008555e-05, "loss": 0.3263, "step": 622 }, { "epoch": 1.955786991550403, "grad_norm": 0.1692500114440918, "learning_rate": 7.221972084646556e-05, "loss": 0.3105, "step": 623 }, { "epoch": 1.9589310277068186, "grad_norm": 0.16130737960338593, "learning_rate": 7.217469608284557e-05, "loss": 0.3177, "step": 624 }, { "epoch": 1.9620750638632343, "grad_norm": 0.1715959906578064, "learning_rate": 7.212967131922558e-05, "loss": 0.2878, "step": 625 }, { "epoch": 1.9652191000196502, "grad_norm": 0.15491920709609985, "learning_rate": 7.208464655560559e-05, "loss": 0.3142, "step": 626 }, { "epoch": 1.968363136176066, "grad_norm": 0.16922113299369812, "learning_rate": 7.20396217919856e-05, "loss": 0.3512, "step": 627 }, { "epoch": 1.971507172332482, "grad_norm": 0.1589455008506775, "learning_rate": 7.19945970283656e-05, "loss": 0.3539, "step": 628 }, { "epoch": 1.9746512084888976, "grad_norm": 0.16388365626335144, "learning_rate": 7.194957226474561e-05, "loss": 0.3072, "step": 629 }, { "epoch": 1.9777952446453133, "grad_norm": 0.17265450954437256, "learning_rate": 7.190454750112562e-05, "loss": 0.3112, "step": 630 }, { "epoch": 1.9809392808017292, "grad_norm": 0.17898835241794586, "learning_rate": 7.185952273750563e-05, "loss": 0.3209, "step": 631 }, { "epoch": 1.984083316958145, "grad_norm": 0.16306431591510773, "learning_rate": 7.181449797388564e-05, "loss": 0.3551, "step": 632 }, { "epoch": 1.9872273531145608, "grad_norm": 0.1657238006591797, "learning_rate": 7.176947321026565e-05, "loss": 0.3328, "step": 633 }, { "epoch": 1.9903713892709765, "grad_norm": 0.15753699839115143, "learning_rate": 7.172444844664566e-05, "loss": 0.3164, "step": 634 }, { "epoch": 1.9935154254273924, "grad_norm": 0.16978983581066132, "learning_rate": 7.167942368302566e-05, "loss": 0.3112, "step": 635 }, { "epoch": 1.9966594615838082, "grad_norm": 0.1637469083070755, "learning_rate": 7.163439891940567e-05, "loss": 0.3197, "step": 636 }, { "epoch": 1.9998034977402241, "grad_norm": 0.16330580413341522, "learning_rate": 7.158937415578568e-05, "loss": 0.3213, "step": 637 }, { "epoch": 2.0, "grad_norm": 0.7975326776504517, "learning_rate": 7.15443493921657e-05, "loss": 0.3876, "step": 638 }, { "epoch": 2.0, "eval_loss": 0.35074591636657715, "eval_runtime": 127.1939, "eval_samples_per_second": 10.0, "eval_steps_per_second": 10.0, "step": 638 }, { "epoch": 2.003144036156416, "grad_norm": 0.16744080185890198, "learning_rate": 7.149932462854571e-05, "loss": 0.3039, "step": 639 }, { "epoch": 2.006288072312832, "grad_norm": 0.18377025425434113, "learning_rate": 7.145429986492571e-05, "loss": 0.3023, "step": 640 }, { "epoch": 2.0094321084692472, "grad_norm": 0.20248235762119293, "learning_rate": 7.140927510130572e-05, "loss": 0.3189, "step": 641 }, { "epoch": 2.012576144625663, "grad_norm": 0.17356827855110168, "learning_rate": 7.136425033768574e-05, "loss": 0.2769, "step": 642 }, { "epoch": 2.015720180782079, "grad_norm": 0.1943397969007492, "learning_rate": 7.131922557406575e-05, "loss": 0.3244, "step": 643 }, { "epoch": 2.018864216938495, "grad_norm": 0.16790719330310822, "learning_rate": 7.127420081044574e-05, "loss": 0.2692, "step": 644 }, { "epoch": 2.0220082530949104, "grad_norm": 0.19615918397903442, "learning_rate": 7.122917604682576e-05, "loss": 0.3239, "step": 645 }, { "epoch": 2.0251522892513263, "grad_norm": 0.18600516021251678, "learning_rate": 7.118415128320577e-05, "loss": 0.2961, "step": 646 }, { "epoch": 2.028296325407742, "grad_norm": 0.18322791159152985, "learning_rate": 7.113912651958578e-05, "loss": 0.2753, "step": 647 }, { "epoch": 2.031440361564158, "grad_norm": 0.16769424080848694, "learning_rate": 7.109410175596578e-05, "loss": 0.2914, "step": 648 }, { "epoch": 2.034584397720574, "grad_norm": 0.17448030412197113, "learning_rate": 7.10490769923458e-05, "loss": 0.2789, "step": 649 }, { "epoch": 2.0377284338769894, "grad_norm": 0.16985324025154114, "learning_rate": 7.10040522287258e-05, "loss": 0.3006, "step": 650 }, { "epoch": 2.0408724700334053, "grad_norm": 0.1814301609992981, "learning_rate": 7.095902746510582e-05, "loss": 0.3212, "step": 651 }, { "epoch": 2.0440165061898212, "grad_norm": 0.17596714198589325, "learning_rate": 7.091400270148581e-05, "loss": 0.3112, "step": 652 }, { "epoch": 2.047160542346237, "grad_norm": 0.18193334341049194, "learning_rate": 7.086897793786583e-05, "loss": 0.3277, "step": 653 }, { "epoch": 2.0503045785026526, "grad_norm": 0.1632920652627945, "learning_rate": 7.082395317424584e-05, "loss": 0.3024, "step": 654 }, { "epoch": 2.0534486146590685, "grad_norm": 0.1734163463115692, "learning_rate": 7.077892841062585e-05, "loss": 0.3019, "step": 655 }, { "epoch": 2.0565926508154844, "grad_norm": 0.17051835358142853, "learning_rate": 7.073390364700586e-05, "loss": 0.2949, "step": 656 }, { "epoch": 2.0597366869719003, "grad_norm": 0.1792338490486145, "learning_rate": 7.068887888338587e-05, "loss": 0.3164, "step": 657 }, { "epoch": 2.062880723128316, "grad_norm": 0.17848795652389526, "learning_rate": 7.064385411976588e-05, "loss": 0.2954, "step": 658 }, { "epoch": 2.0660247592847316, "grad_norm": 0.17348197102546692, "learning_rate": 7.059882935614588e-05, "loss": 0.3103, "step": 659 }, { "epoch": 2.0691687954411475, "grad_norm": 0.17320173978805542, "learning_rate": 7.055380459252589e-05, "loss": 0.2976, "step": 660 }, { "epoch": 2.0723128315975634, "grad_norm": 0.17929640412330627, "learning_rate": 7.05087798289059e-05, "loss": 0.3134, "step": 661 }, { "epoch": 2.0754568677539793, "grad_norm": 0.1745566874742508, "learning_rate": 7.046375506528591e-05, "loss": 0.311, "step": 662 }, { "epoch": 2.0786009039103948, "grad_norm": 0.16974468529224396, "learning_rate": 7.041873030166593e-05, "loss": 0.2996, "step": 663 }, { "epoch": 2.0817449400668107, "grad_norm": 0.1687924563884735, "learning_rate": 7.037370553804593e-05, "loss": 0.2945, "step": 664 }, { "epoch": 2.0848889762232266, "grad_norm": 0.1747463196516037, "learning_rate": 7.032868077442594e-05, "loss": 0.3259, "step": 665 }, { "epoch": 2.0880330123796425, "grad_norm": 0.1635727733373642, "learning_rate": 7.028365601080594e-05, "loss": 0.2994, "step": 666 }, { "epoch": 2.0911770485360583, "grad_norm": 0.15798041224479675, "learning_rate": 7.023863124718597e-05, "loss": 0.2849, "step": 667 }, { "epoch": 2.094321084692474, "grad_norm": 0.1802162379026413, "learning_rate": 7.019360648356596e-05, "loss": 0.3135, "step": 668 }, { "epoch": 2.0974651208488897, "grad_norm": 0.16208456456661224, "learning_rate": 7.014858171994597e-05, "loss": 0.2836, "step": 669 }, { "epoch": 2.1006091570053056, "grad_norm": 0.17748893797397614, "learning_rate": 7.010355695632598e-05, "loss": 0.2782, "step": 670 }, { "epoch": 2.1037531931617215, "grad_norm": 0.17442883551120758, "learning_rate": 7.0058532192706e-05, "loss": 0.3365, "step": 671 }, { "epoch": 2.106897229318137, "grad_norm": 0.17609655857086182, "learning_rate": 7.0013507429086e-05, "loss": 0.3, "step": 672 }, { "epoch": 2.110041265474553, "grad_norm": 0.1712694615125656, "learning_rate": 6.9968482665466e-05, "loss": 0.303, "step": 673 }, { "epoch": 2.1131853016309687, "grad_norm": 0.1624165028333664, "learning_rate": 6.992345790184603e-05, "loss": 0.2699, "step": 674 }, { "epoch": 2.1163293377873846, "grad_norm": 0.18254142999649048, "learning_rate": 6.987843313822603e-05, "loss": 0.2932, "step": 675 }, { "epoch": 2.1194733739438005, "grad_norm": 0.1647147387266159, "learning_rate": 6.983340837460603e-05, "loss": 0.2695, "step": 676 }, { "epoch": 2.122617410100216, "grad_norm": 0.18291249871253967, "learning_rate": 6.978838361098604e-05, "loss": 0.3003, "step": 677 }, { "epoch": 2.125761446256632, "grad_norm": 0.16896897554397583, "learning_rate": 6.974335884736606e-05, "loss": 0.3078, "step": 678 }, { "epoch": 2.128905482413048, "grad_norm": 0.18863414227962494, "learning_rate": 6.969833408374607e-05, "loss": 0.3076, "step": 679 }, { "epoch": 2.1320495185694637, "grad_norm": 0.16777677834033966, "learning_rate": 6.965330932012606e-05, "loss": 0.3082, "step": 680 }, { "epoch": 2.1351935547258796, "grad_norm": 0.19012515246868134, "learning_rate": 6.960828455650609e-05, "loss": 0.3193, "step": 681 }, { "epoch": 2.138337590882295, "grad_norm": 0.17942240834236145, "learning_rate": 6.95632597928861e-05, "loss": 0.2888, "step": 682 }, { "epoch": 2.141481627038711, "grad_norm": 0.17384488880634308, "learning_rate": 6.95182350292661e-05, "loss": 0.3272, "step": 683 }, { "epoch": 2.144625663195127, "grad_norm": 0.19098258018493652, "learning_rate": 6.94732102656461e-05, "loss": 0.3404, "step": 684 }, { "epoch": 2.1477696993515427, "grad_norm": 0.1641370952129364, "learning_rate": 6.942818550202612e-05, "loss": 0.2996, "step": 685 }, { "epoch": 2.150913735507958, "grad_norm": 0.1653837412595749, "learning_rate": 6.938316073840613e-05, "loss": 0.2864, "step": 686 }, { "epoch": 2.154057771664374, "grad_norm": 0.1699887216091156, "learning_rate": 6.933813597478614e-05, "loss": 0.2979, "step": 687 }, { "epoch": 2.15720180782079, "grad_norm": 0.1814461499452591, "learning_rate": 6.929311121116615e-05, "loss": 0.291, "step": 688 }, { "epoch": 2.160345843977206, "grad_norm": 0.16752971708774567, "learning_rate": 6.924808644754615e-05, "loss": 0.283, "step": 689 }, { "epoch": 2.1634898801336213, "grad_norm": 0.17824186384677887, "learning_rate": 6.920306168392616e-05, "loss": 0.3333, "step": 690 }, { "epoch": 2.166633916290037, "grad_norm": 0.16603407263755798, "learning_rate": 6.915803692030617e-05, "loss": 0.2995, "step": 691 }, { "epoch": 2.169777952446453, "grad_norm": 0.17203231155872345, "learning_rate": 6.911301215668618e-05, "loss": 0.3122, "step": 692 }, { "epoch": 2.172921988602869, "grad_norm": 0.16511327028274536, "learning_rate": 6.906798739306619e-05, "loss": 0.3074, "step": 693 }, { "epoch": 2.176066024759285, "grad_norm": 0.1745661348104477, "learning_rate": 6.90229626294462e-05, "loss": 0.2976, "step": 694 }, { "epoch": 2.1792100609157004, "grad_norm": 0.1613238900899887, "learning_rate": 6.89779378658262e-05, "loss": 0.3032, "step": 695 }, { "epoch": 2.1823540970721163, "grad_norm": 0.15707257390022278, "learning_rate": 6.893291310220622e-05, "loss": 0.2769, "step": 696 }, { "epoch": 2.185498133228532, "grad_norm": 0.1752636432647705, "learning_rate": 6.888788833858622e-05, "loss": 0.3081, "step": 697 }, { "epoch": 2.188642169384948, "grad_norm": 0.1744633913040161, "learning_rate": 6.884286357496623e-05, "loss": 0.2985, "step": 698 }, { "epoch": 2.191786205541364, "grad_norm": 0.17221374809741974, "learning_rate": 6.879783881134625e-05, "loss": 0.2967, "step": 699 }, { "epoch": 2.1949302416977794, "grad_norm": 0.17623651027679443, "learning_rate": 6.875281404772625e-05, "loss": 0.2849, "step": 700 }, { "epoch": 2.1980742778541953, "grad_norm": 0.16893146932125092, "learning_rate": 6.870778928410626e-05, "loss": 0.3118, "step": 701 }, { "epoch": 2.201218314010611, "grad_norm": 0.16851678490638733, "learning_rate": 6.866276452048627e-05, "loss": 0.2931, "step": 702 }, { "epoch": 2.204362350167027, "grad_norm": 0.18565109372138977, "learning_rate": 6.861773975686629e-05, "loss": 0.3055, "step": 703 }, { "epoch": 2.2075063863234425, "grad_norm": 0.2212684601545334, "learning_rate": 6.857271499324628e-05, "loss": 0.2964, "step": 704 }, { "epoch": 2.2106504224798584, "grad_norm": 0.18393367528915405, "learning_rate": 6.852769022962629e-05, "loss": 0.324, "step": 705 }, { "epoch": 2.2137944586362743, "grad_norm": 0.1786426156759262, "learning_rate": 6.84826654660063e-05, "loss": 0.3215, "step": 706 }, { "epoch": 2.2169384947926902, "grad_norm": 0.18496280908584595, "learning_rate": 6.843764070238632e-05, "loss": 0.2856, "step": 707 }, { "epoch": 2.2200825309491057, "grad_norm": 0.19118329882621765, "learning_rate": 6.839261593876632e-05, "loss": 0.323, "step": 708 }, { "epoch": 2.2232265671055216, "grad_norm": 0.17779740691184998, "learning_rate": 6.834759117514633e-05, "loss": 0.3111, "step": 709 }, { "epoch": 2.2263706032619375, "grad_norm": 0.18645815551280975, "learning_rate": 6.830256641152635e-05, "loss": 0.2887, "step": 710 }, { "epoch": 2.2295146394183534, "grad_norm": 0.18136370182037354, "learning_rate": 6.825754164790636e-05, "loss": 0.3317, "step": 711 }, { "epoch": 2.2326586755747693, "grad_norm": 0.16164925694465637, "learning_rate": 6.821251688428637e-05, "loss": 0.2845, "step": 712 }, { "epoch": 2.2358027117311847, "grad_norm": 0.188680961728096, "learning_rate": 6.816749212066636e-05, "loss": 0.306, "step": 713 }, { "epoch": 2.2389467478876006, "grad_norm": 0.1774417608976364, "learning_rate": 6.812246735704638e-05, "loss": 0.2909, "step": 714 }, { "epoch": 2.2420907840440165, "grad_norm": 0.21063882112503052, "learning_rate": 6.807744259342639e-05, "loss": 0.3127, "step": 715 }, { "epoch": 2.2452348202004324, "grad_norm": 0.17486001551151276, "learning_rate": 6.80324178298064e-05, "loss": 0.3187, "step": 716 }, { "epoch": 2.2483788563568483, "grad_norm": 0.17519284784793854, "learning_rate": 6.798739306618641e-05, "loss": 0.2972, "step": 717 }, { "epoch": 2.2515228925132638, "grad_norm": 0.1834704726934433, "learning_rate": 6.794236830256642e-05, "loss": 0.3071, "step": 718 }, { "epoch": 2.2546669286696797, "grad_norm": 0.1916733831167221, "learning_rate": 6.789734353894643e-05, "loss": 0.2882, "step": 719 }, { "epoch": 2.2578109648260956, "grad_norm": 0.17214199900627136, "learning_rate": 6.785231877532643e-05, "loss": 0.3197, "step": 720 }, { "epoch": 2.2609550009825115, "grad_norm": 0.18873214721679688, "learning_rate": 6.780729401170644e-05, "loss": 0.2971, "step": 721 }, { "epoch": 2.264099037138927, "grad_norm": 0.17362910509109497, "learning_rate": 6.776226924808645e-05, "loss": 0.3009, "step": 722 }, { "epoch": 2.267243073295343, "grad_norm": 0.17229104042053223, "learning_rate": 6.771724448446646e-05, "loss": 0.2928, "step": 723 }, { "epoch": 2.2703871094517587, "grad_norm": 0.19510141015052795, "learning_rate": 6.767221972084647e-05, "loss": 0.3005, "step": 724 }, { "epoch": 2.2735311456081746, "grad_norm": 0.17983099818229675, "learning_rate": 6.762719495722648e-05, "loss": 0.3036, "step": 725 }, { "epoch": 2.27667518176459, "grad_norm": 0.16379517316818237, "learning_rate": 6.758217019360649e-05, "loss": 0.2939, "step": 726 }, { "epoch": 2.279819217921006, "grad_norm": 0.1834723949432373, "learning_rate": 6.75371454299865e-05, "loss": 0.3018, "step": 727 }, { "epoch": 2.282963254077422, "grad_norm": 0.1845984309911728, "learning_rate": 6.74921206663665e-05, "loss": 0.3261, "step": 728 }, { "epoch": 2.2861072902338377, "grad_norm": 0.19279064238071442, "learning_rate": 6.744709590274651e-05, "loss": 0.319, "step": 729 }, { "epoch": 2.2892513263902536, "grad_norm": 0.1857377290725708, "learning_rate": 6.740207113912652e-05, "loss": 0.2763, "step": 730 }, { "epoch": 2.292395362546669, "grad_norm": 0.18448881804943085, "learning_rate": 6.735704637550653e-05, "loss": 0.3089, "step": 731 }, { "epoch": 2.295539398703085, "grad_norm": 0.206381157040596, "learning_rate": 6.731202161188654e-05, "loss": 0.3062, "step": 732 }, { "epoch": 2.298683434859501, "grad_norm": 0.17306199669837952, "learning_rate": 6.726699684826655e-05, "loss": 0.3199, "step": 733 }, { "epoch": 2.301827471015917, "grad_norm": 0.18375885486602783, "learning_rate": 6.722197208464655e-05, "loss": 0.3053, "step": 734 }, { "epoch": 2.3049715071723327, "grad_norm": 0.18296293914318085, "learning_rate": 6.717694732102658e-05, "loss": 0.2863, "step": 735 }, { "epoch": 2.308115543328748, "grad_norm": 0.2020559459924698, "learning_rate": 6.713192255740659e-05, "loss": 0.303, "step": 736 }, { "epoch": 2.311259579485164, "grad_norm": 0.17968842387199402, "learning_rate": 6.708689779378658e-05, "loss": 0.2918, "step": 737 }, { "epoch": 2.31440361564158, "grad_norm": 0.18758021295070648, "learning_rate": 6.704187303016659e-05, "loss": 0.293, "step": 738 }, { "epoch": 2.317547651797996, "grad_norm": 0.17563396692276, "learning_rate": 6.699684826654661e-05, "loss": 0.2952, "step": 739 }, { "epoch": 2.3206916879544117, "grad_norm": 0.17301028966903687, "learning_rate": 6.695182350292662e-05, "loss": 0.2728, "step": 740 }, { "epoch": 2.323835724110827, "grad_norm": 0.19841277599334717, "learning_rate": 6.690679873930662e-05, "loss": 0.3135, "step": 741 }, { "epoch": 2.326979760267243, "grad_norm": 0.1739703118801117, "learning_rate": 6.686177397568662e-05, "loss": 0.2696, "step": 742 }, { "epoch": 2.330123796423659, "grad_norm": 0.17310677468776703, "learning_rate": 6.681674921206665e-05, "loss": 0.2724, "step": 743 }, { "epoch": 2.333267832580075, "grad_norm": 0.179913729429245, "learning_rate": 6.677172444844665e-05, "loss": 0.3146, "step": 744 }, { "epoch": 2.3364118687364903, "grad_norm": 0.1745288372039795, "learning_rate": 6.672669968482665e-05, "loss": 0.3064, "step": 745 }, { "epoch": 2.339555904892906, "grad_norm": 0.18219201266765594, "learning_rate": 6.668167492120667e-05, "loss": 0.2887, "step": 746 }, { "epoch": 2.342699941049322, "grad_norm": 0.1766086369752884, "learning_rate": 6.663665015758668e-05, "loss": 0.2895, "step": 747 }, { "epoch": 2.345843977205738, "grad_norm": 0.20296621322631836, "learning_rate": 6.659162539396669e-05, "loss": 0.3017, "step": 748 }, { "epoch": 2.3489880133621535, "grad_norm": 0.19249823689460754, "learning_rate": 6.654660063034668e-05, "loss": 0.3151, "step": 749 }, { "epoch": 2.3521320495185694, "grad_norm": 0.1770361214876175, "learning_rate": 6.65015758667267e-05, "loss": 0.2883, "step": 750 }, { "epoch": 2.3552760856749853, "grad_norm": 0.16738583147525787, "learning_rate": 6.645655110310671e-05, "loss": 0.295, "step": 751 }, { "epoch": 2.358420121831401, "grad_norm": 0.1856423318386078, "learning_rate": 6.641152633948672e-05, "loss": 0.3184, "step": 752 }, { "epoch": 2.361564157987817, "grad_norm": 0.1807333081960678, "learning_rate": 6.636650157586673e-05, "loss": 0.3085, "step": 753 }, { "epoch": 2.3647081941442325, "grad_norm": 0.1712781935930252, "learning_rate": 6.632147681224674e-05, "loss": 0.2964, "step": 754 }, { "epoch": 2.3678522303006484, "grad_norm": 0.15793918073177338, "learning_rate": 6.627645204862675e-05, "loss": 0.2839, "step": 755 }, { "epoch": 2.3709962664570643, "grad_norm": 0.16997607052326202, "learning_rate": 6.623142728500676e-05, "loss": 0.2997, "step": 756 }, { "epoch": 2.37414030261348, "grad_norm": 0.17945247888565063, "learning_rate": 6.618640252138677e-05, "loss": 0.2898, "step": 757 }, { "epoch": 2.377284338769896, "grad_norm": 0.1655392050743103, "learning_rate": 6.614137775776677e-05, "loss": 0.3101, "step": 758 }, { "epoch": 2.3804283749263115, "grad_norm": 0.1839962750673294, "learning_rate": 6.609635299414678e-05, "loss": 0.3124, "step": 759 }, { "epoch": 2.3835724110827274, "grad_norm": 0.18653053045272827, "learning_rate": 6.605132823052679e-05, "loss": 0.2953, "step": 760 }, { "epoch": 2.3867164472391433, "grad_norm": 0.2000558078289032, "learning_rate": 6.60063034669068e-05, "loss": 0.3015, "step": 761 }, { "epoch": 2.3898604833955592, "grad_norm": 0.1844789832830429, "learning_rate": 6.596127870328681e-05, "loss": 0.2934, "step": 762 }, { "epoch": 2.3930045195519747, "grad_norm": 0.19168713688850403, "learning_rate": 6.591625393966682e-05, "loss": 0.3006, "step": 763 }, { "epoch": 2.3961485557083906, "grad_norm": 0.18218077719211578, "learning_rate": 6.587122917604684e-05, "loss": 0.3039, "step": 764 }, { "epoch": 2.3992925918648065, "grad_norm": 0.17066723108291626, "learning_rate": 6.582620441242683e-05, "loss": 0.2972, "step": 765 }, { "epoch": 2.4024366280212224, "grad_norm": 0.20712348818778992, "learning_rate": 6.578117964880684e-05, "loss": 0.2978, "step": 766 }, { "epoch": 2.405580664177638, "grad_norm": 0.18007948994636536, "learning_rate": 6.573615488518685e-05, "loss": 0.321, "step": 767 }, { "epoch": 2.4087247003340537, "grad_norm": 0.17139753699302673, "learning_rate": 6.569113012156687e-05, "loss": 0.2915, "step": 768 }, { "epoch": 2.4118687364904696, "grad_norm": 0.17160378396511078, "learning_rate": 6.564610535794687e-05, "loss": 0.2867, "step": 769 }, { "epoch": 2.4150127726468855, "grad_norm": 0.18733811378479004, "learning_rate": 6.560108059432688e-05, "loss": 0.3057, "step": 770 }, { "epoch": 2.4181568088033014, "grad_norm": 0.18095803260803223, "learning_rate": 6.55560558307069e-05, "loss": 0.2844, "step": 771 }, { "epoch": 2.421300844959717, "grad_norm": 0.1770097315311432, "learning_rate": 6.551103106708691e-05, "loss": 0.2741, "step": 772 }, { "epoch": 2.4244448811161328, "grad_norm": 0.1827918291091919, "learning_rate": 6.54660063034669e-05, "loss": 0.3304, "step": 773 }, { "epoch": 2.4275889172725487, "grad_norm": 0.17145848274230957, "learning_rate": 6.542098153984691e-05, "loss": 0.2881, "step": 774 }, { "epoch": 2.4307329534289646, "grad_norm": 0.1687670797109604, "learning_rate": 6.537595677622693e-05, "loss": 0.2701, "step": 775 }, { "epoch": 2.4338769895853805, "grad_norm": 0.17930062115192413, "learning_rate": 6.533093201260694e-05, "loss": 0.3076, "step": 776 }, { "epoch": 2.437021025741796, "grad_norm": 0.17635630071163177, "learning_rate": 6.528590724898694e-05, "loss": 0.2754, "step": 777 }, { "epoch": 2.440165061898212, "grad_norm": 0.1941213458776474, "learning_rate": 6.524088248536695e-05, "loss": 0.3096, "step": 778 }, { "epoch": 2.4433090980546277, "grad_norm": 0.19597280025482178, "learning_rate": 6.519585772174697e-05, "loss": 0.3185, "step": 779 }, { "epoch": 2.4464531342110436, "grad_norm": 0.17912910878658295, "learning_rate": 6.515083295812698e-05, "loss": 0.3032, "step": 780 }, { "epoch": 2.449597170367459, "grad_norm": 0.17373843491077423, "learning_rate": 6.510580819450697e-05, "loss": 0.312, "step": 781 }, { "epoch": 2.452741206523875, "grad_norm": 0.17602920532226562, "learning_rate": 6.5060783430887e-05, "loss": 0.3009, "step": 782 }, { "epoch": 2.455885242680291, "grad_norm": 0.17300525307655334, "learning_rate": 6.5015758667267e-05, "loss": 0.288, "step": 783 }, { "epoch": 2.4590292788367067, "grad_norm": 0.17836594581604004, "learning_rate": 6.497073390364701e-05, "loss": 0.2983, "step": 784 }, { "epoch": 2.462173314993122, "grad_norm": 0.178053617477417, "learning_rate": 6.492570914002702e-05, "loss": 0.3247, "step": 785 }, { "epoch": 2.465317351149538, "grad_norm": 0.18728157877922058, "learning_rate": 6.488068437640703e-05, "loss": 0.3181, "step": 786 }, { "epoch": 2.468461387305954, "grad_norm": 0.18057219684123993, "learning_rate": 6.483565961278704e-05, "loss": 0.2981, "step": 787 }, { "epoch": 2.47160542346237, "grad_norm": 0.16796505451202393, "learning_rate": 6.479063484916705e-05, "loss": 0.3078, "step": 788 }, { "epoch": 2.474749459618786, "grad_norm": 0.190389946103096, "learning_rate": 6.474561008554705e-05, "loss": 0.3039, "step": 789 }, { "epoch": 2.4778934957752012, "grad_norm": 0.193536639213562, "learning_rate": 6.470058532192706e-05, "loss": 0.3242, "step": 790 }, { "epoch": 2.481037531931617, "grad_norm": 0.1886732131242752, "learning_rate": 6.465556055830707e-05, "loss": 0.3287, "step": 791 }, { "epoch": 2.484181568088033, "grad_norm": 0.1902109682559967, "learning_rate": 6.461053579468708e-05, "loss": 0.287, "step": 792 }, { "epoch": 2.487325604244449, "grad_norm": 0.1784544289112091, "learning_rate": 6.456551103106709e-05, "loss": 0.3389, "step": 793 }, { "epoch": 2.490469640400865, "grad_norm": 0.1953764259815216, "learning_rate": 6.45204862674471e-05, "loss": 0.2629, "step": 794 }, { "epoch": 2.4936136765572803, "grad_norm": 0.181540846824646, "learning_rate": 6.44754615038271e-05, "loss": 0.2758, "step": 795 }, { "epoch": 2.496757712713696, "grad_norm": 0.18179424107074738, "learning_rate": 6.443043674020711e-05, "loss": 0.2878, "step": 796 }, { "epoch": 2.499901748870112, "grad_norm": 0.18012291193008423, "learning_rate": 6.438541197658712e-05, "loss": 0.293, "step": 797 }, { "epoch": 2.503045785026528, "grad_norm": 0.1753896027803421, "learning_rate": 6.434038721296713e-05, "loss": 0.2952, "step": 798 }, { "epoch": 2.506189821182944, "grad_norm": 0.18676632642745972, "learning_rate": 6.429536244934714e-05, "loss": 0.2952, "step": 799 }, { "epoch": 2.5093338573393593, "grad_norm": 0.19325347244739532, "learning_rate": 6.425033768572716e-05, "loss": 0.3255, "step": 800 }, { "epoch": 2.512477893495775, "grad_norm": 0.18933020532131195, "learning_rate": 6.420531292210716e-05, "loss": 0.2995, "step": 801 }, { "epoch": 2.515621929652191, "grad_norm": 0.1744452267885208, "learning_rate": 6.416028815848717e-05, "loss": 0.2682, "step": 802 }, { "epoch": 2.5187659658086066, "grad_norm": 0.1780654489994049, "learning_rate": 6.411526339486717e-05, "loss": 0.2941, "step": 803 }, { "epoch": 2.5219100019650225, "grad_norm": 0.1746615618467331, "learning_rate": 6.40702386312472e-05, "loss": 0.2946, "step": 804 }, { "epoch": 2.5250540381214384, "grad_norm": 0.19993247091770172, "learning_rate": 6.402521386762719e-05, "loss": 0.302, "step": 805 }, { "epoch": 2.5281980742778543, "grad_norm": 0.1867285519838333, "learning_rate": 6.39801891040072e-05, "loss": 0.3026, "step": 806 }, { "epoch": 2.53134211043427, "grad_norm": 0.18069183826446533, "learning_rate": 6.393516434038722e-05, "loss": 0.2913, "step": 807 }, { "epoch": 2.5344861465906856, "grad_norm": 0.17721295356750488, "learning_rate": 6.389013957676723e-05, "loss": 0.2959, "step": 808 }, { "epoch": 2.5376301827471015, "grad_norm": 0.18728205561637878, "learning_rate": 6.384511481314724e-05, "loss": 0.3032, "step": 809 }, { "epoch": 2.5407742189035174, "grad_norm": 0.18671870231628418, "learning_rate": 6.380009004952723e-05, "loss": 0.2981, "step": 810 }, { "epoch": 2.5439182550599333, "grad_norm": 0.172725647687912, "learning_rate": 6.375506528590726e-05, "loss": 0.3198, "step": 811 }, { "epoch": 2.547062291216349, "grad_norm": 0.17702126502990723, "learning_rate": 6.371004052228727e-05, "loss": 0.3225, "step": 812 }, { "epoch": 2.5502063273727646, "grad_norm": 0.1773066371679306, "learning_rate": 6.366501575866727e-05, "loss": 0.2894, "step": 813 }, { "epoch": 2.5533503635291805, "grad_norm": 0.1835213303565979, "learning_rate": 6.361999099504727e-05, "loss": 0.3321, "step": 814 }, { "epoch": 2.5564943996855964, "grad_norm": 0.18341873586177826, "learning_rate": 6.357496623142729e-05, "loss": 0.2961, "step": 815 }, { "epoch": 2.5596384358420123, "grad_norm": 0.17506669461727142, "learning_rate": 6.35299414678073e-05, "loss": 0.2889, "step": 816 }, { "epoch": 2.5627824719984282, "grad_norm": 0.18534137308597565, "learning_rate": 6.348491670418731e-05, "loss": 0.3099, "step": 817 }, { "epoch": 2.5659265081548437, "grad_norm": 0.1681559979915619, "learning_rate": 6.343989194056732e-05, "loss": 0.2732, "step": 818 }, { "epoch": 2.5690705443112596, "grad_norm": 0.18323373794555664, "learning_rate": 6.339486717694733e-05, "loss": 0.3096, "step": 819 }, { "epoch": 2.5722145804676755, "grad_norm": 0.1837071180343628, "learning_rate": 6.334984241332733e-05, "loss": 0.2851, "step": 820 }, { "epoch": 2.575358616624091, "grad_norm": 0.17143307626247406, "learning_rate": 6.330481764970734e-05, "loss": 0.2896, "step": 821 }, { "epoch": 2.578502652780507, "grad_norm": 0.18958891928195953, "learning_rate": 6.325979288608735e-05, "loss": 0.3074, "step": 822 }, { "epoch": 2.5816466889369227, "grad_norm": 0.1758703589439392, "learning_rate": 6.321476812246736e-05, "loss": 0.2492, "step": 823 }, { "epoch": 2.5847907250933386, "grad_norm": 0.1919006109237671, "learning_rate": 6.316974335884737e-05, "loss": 0.3212, "step": 824 }, { "epoch": 2.5879347612497545, "grad_norm": 0.19297993183135986, "learning_rate": 6.312471859522738e-05, "loss": 0.2914, "step": 825 }, { "epoch": 2.59107879740617, "grad_norm": 0.17817267775535583, "learning_rate": 6.307969383160739e-05, "loss": 0.3057, "step": 826 }, { "epoch": 2.594222833562586, "grad_norm": 0.18181775510311127, "learning_rate": 6.30346690679874e-05, "loss": 0.2883, "step": 827 }, { "epoch": 2.5973668697190018, "grad_norm": 0.1789119690656662, "learning_rate": 6.29896443043674e-05, "loss": 0.2952, "step": 828 }, { "epoch": 2.6005109058754177, "grad_norm": 0.1905726045370102, "learning_rate": 6.294461954074741e-05, "loss": 0.3231, "step": 829 }, { "epoch": 2.6036549420318336, "grad_norm": 0.18004848062992096, "learning_rate": 6.289959477712742e-05, "loss": 0.3071, "step": 830 }, { "epoch": 2.606798978188249, "grad_norm": 0.1865493506193161, "learning_rate": 6.285457001350743e-05, "loss": 0.3151, "step": 831 }, { "epoch": 2.609943014344665, "grad_norm": 0.16819460690021515, "learning_rate": 6.280954524988744e-05, "loss": 0.2916, "step": 832 }, { "epoch": 2.613087050501081, "grad_norm": 0.17601875960826874, "learning_rate": 6.276452048626746e-05, "loss": 0.2911, "step": 833 }, { "epoch": 2.6162310866574967, "grad_norm": 0.17802783846855164, "learning_rate": 6.271949572264745e-05, "loss": 0.2896, "step": 834 }, { "epoch": 2.6193751228139126, "grad_norm": 0.1776685267686844, "learning_rate": 6.267447095902746e-05, "loss": 0.2863, "step": 835 }, { "epoch": 2.622519158970328, "grad_norm": 0.189042329788208, "learning_rate": 6.262944619540749e-05, "loss": 0.3027, "step": 836 }, { "epoch": 2.625663195126744, "grad_norm": 0.19149142503738403, "learning_rate": 6.25844214317875e-05, "loss": 0.31, "step": 837 }, { "epoch": 2.62880723128316, "grad_norm": 0.17297279834747314, "learning_rate": 6.253939666816749e-05, "loss": 0.2946, "step": 838 }, { "epoch": 2.6319512674395753, "grad_norm": 0.19117750227451324, "learning_rate": 6.24943719045475e-05, "loss": 0.3151, "step": 839 }, { "epoch": 2.635095303595991, "grad_norm": 0.17469459772109985, "learning_rate": 6.244934714092752e-05, "loss": 0.3149, "step": 840 }, { "epoch": 2.638239339752407, "grad_norm": 0.19314974546432495, "learning_rate": 6.240432237730753e-05, "loss": 0.3268, "step": 841 }, { "epoch": 2.641383375908823, "grad_norm": 0.17507623136043549, "learning_rate": 6.235929761368752e-05, "loss": 0.2974, "step": 842 }, { "epoch": 2.644527412065239, "grad_norm": 0.18038702011108398, "learning_rate": 6.231427285006755e-05, "loss": 0.3041, "step": 843 }, { "epoch": 2.6476714482216543, "grad_norm": 0.18314650654792786, "learning_rate": 6.226924808644755e-05, "loss": 0.2839, "step": 844 }, { "epoch": 2.6508154843780702, "grad_norm": 0.18236756324768066, "learning_rate": 6.222422332282756e-05, "loss": 0.3024, "step": 845 }, { "epoch": 2.653959520534486, "grad_norm": 0.18618281185626984, "learning_rate": 6.217919855920756e-05, "loss": 0.2799, "step": 846 }, { "epoch": 2.657103556690902, "grad_norm": 0.20062251389026642, "learning_rate": 6.213417379558758e-05, "loss": 0.2849, "step": 847 }, { "epoch": 2.660247592847318, "grad_norm": 0.18864601850509644, "learning_rate": 6.208914903196759e-05, "loss": 0.308, "step": 848 }, { "epoch": 2.6633916290037334, "grad_norm": 0.1816050410270691, "learning_rate": 6.20441242683476e-05, "loss": 0.2964, "step": 849 }, { "epoch": 2.6665356651601493, "grad_norm": 0.18332047760486603, "learning_rate": 6.199909950472759e-05, "loss": 0.3083, "step": 850 }, { "epoch": 2.669679701316565, "grad_norm": 0.17861318588256836, "learning_rate": 6.195407474110761e-05, "loss": 0.2996, "step": 851 }, { "epoch": 2.672823737472981, "grad_norm": 0.17434978485107422, "learning_rate": 6.190904997748762e-05, "loss": 0.3027, "step": 852 }, { "epoch": 2.675967773629397, "grad_norm": 0.1839853823184967, "learning_rate": 6.186402521386763e-05, "loss": 0.2963, "step": 853 }, { "epoch": 2.6791118097858124, "grad_norm": 0.19816765189170837, "learning_rate": 6.181900045024764e-05, "loss": 0.3027, "step": 854 }, { "epoch": 2.6822558459422283, "grad_norm": 0.19105638563632965, "learning_rate": 6.177397568662765e-05, "loss": 0.3136, "step": 855 }, { "epoch": 2.685399882098644, "grad_norm": 0.17210789024829865, "learning_rate": 6.172895092300766e-05, "loss": 0.2926, "step": 856 }, { "epoch": 2.6885439182550597, "grad_norm": 0.19876505434513092, "learning_rate": 6.168392615938767e-05, "loss": 0.2997, "step": 857 }, { "epoch": 2.6916879544114756, "grad_norm": 0.18161223828792572, "learning_rate": 6.163890139576767e-05, "loss": 0.2815, "step": 858 }, { "epoch": 2.6948319905678915, "grad_norm": 0.21193601191043854, "learning_rate": 6.159387663214768e-05, "loss": 0.3131, "step": 859 }, { "epoch": 2.6979760267243074, "grad_norm": 0.19129090011119843, "learning_rate": 6.154885186852769e-05, "loss": 0.3091, "step": 860 }, { "epoch": 2.7011200628807233, "grad_norm": 0.1943592131137848, "learning_rate": 6.150382710490771e-05, "loss": 0.2814, "step": 861 }, { "epoch": 2.7042640990371387, "grad_norm": 0.1883707195520401, "learning_rate": 6.145880234128771e-05, "loss": 0.3112, "step": 862 }, { "epoch": 2.7074081351935546, "grad_norm": 0.18551179766654968, "learning_rate": 6.141377757766772e-05, "loss": 0.282, "step": 863 }, { "epoch": 2.7105521713499705, "grad_norm": 0.1946878731250763, "learning_rate": 6.136875281404773e-05, "loss": 0.3037, "step": 864 }, { "epoch": 2.7136962075063864, "grad_norm": 0.1834578663110733, "learning_rate": 6.132372805042775e-05, "loss": 0.3135, "step": 865 }, { "epoch": 2.7168402436628023, "grad_norm": 0.1829901486635208, "learning_rate": 6.127870328680774e-05, "loss": 0.2865, "step": 866 }, { "epoch": 2.7199842798192178, "grad_norm": 0.17749235033988953, "learning_rate": 6.123367852318775e-05, "loss": 0.2928, "step": 867 }, { "epoch": 2.7231283159756337, "grad_norm": 0.17715714871883392, "learning_rate": 6.118865375956776e-05, "loss": 0.2845, "step": 868 }, { "epoch": 2.7262723521320495, "grad_norm": 0.18381257355213165, "learning_rate": 6.114362899594778e-05, "loss": 0.2984, "step": 869 }, { "epoch": 2.7294163882884654, "grad_norm": 0.19029419124126434, "learning_rate": 6.109860423232778e-05, "loss": 0.3378, "step": 870 }, { "epoch": 2.7325604244448813, "grad_norm": 0.17867213487625122, "learning_rate": 6.105357946870779e-05, "loss": 0.3032, "step": 871 }, { "epoch": 2.735704460601297, "grad_norm": 0.18541932106018066, "learning_rate": 6.100855470508781e-05, "loss": 0.3035, "step": 872 }, { "epoch": 2.7388484967577127, "grad_norm": 0.18637771904468536, "learning_rate": 6.096352994146781e-05, "loss": 0.3293, "step": 873 }, { "epoch": 2.7419925329141286, "grad_norm": 0.17814940214157104, "learning_rate": 6.091850517784782e-05, "loss": 0.2902, "step": 874 }, { "epoch": 2.745136569070544, "grad_norm": 0.16907231509685516, "learning_rate": 6.087348041422783e-05, "loss": 0.2503, "step": 875 }, { "epoch": 2.7482806052269604, "grad_norm": 0.17249789834022522, "learning_rate": 6.082845565060784e-05, "loss": 0.2937, "step": 876 }, { "epoch": 2.751424641383376, "grad_norm": 0.1802680343389511, "learning_rate": 6.0783430886987844e-05, "loss": 0.3068, "step": 877 }, { "epoch": 2.7545686775397917, "grad_norm": 0.16548851132392883, "learning_rate": 6.073840612336785e-05, "loss": 0.2782, "step": 878 }, { "epoch": 2.7577127136962076, "grad_norm": 0.18068614602088928, "learning_rate": 6.069338135974787e-05, "loss": 0.3193, "step": 879 }, { "epoch": 2.760856749852623, "grad_norm": 0.17812545597553253, "learning_rate": 6.0648356596127876e-05, "loss": 0.302, "step": 880 }, { "epoch": 2.764000786009039, "grad_norm": 0.17754609882831573, "learning_rate": 6.060333183250788e-05, "loss": 0.2939, "step": 881 }, { "epoch": 2.767144822165455, "grad_norm": 0.17532262206077576, "learning_rate": 6.055830706888789e-05, "loss": 0.3057, "step": 882 }, { "epoch": 2.7702888583218708, "grad_norm": 0.19197304546833038, "learning_rate": 6.05132823052679e-05, "loss": 0.3052, "step": 883 }, { "epoch": 2.7734328944782867, "grad_norm": 0.17263777554035187, "learning_rate": 6.046825754164791e-05, "loss": 0.2859, "step": 884 }, { "epoch": 2.776576930634702, "grad_norm": 0.17361198365688324, "learning_rate": 6.042323277802792e-05, "loss": 0.3017, "step": 885 }, { "epoch": 2.779720966791118, "grad_norm": 0.17638853192329407, "learning_rate": 6.037820801440792e-05, "loss": 0.2659, "step": 886 }, { "epoch": 2.782865002947534, "grad_norm": 0.18898698687553406, "learning_rate": 6.033318325078794e-05, "loss": 0.3115, "step": 887 }, { "epoch": 2.78600903910395, "grad_norm": 0.1923007071018219, "learning_rate": 6.0288158487167945e-05, "loss": 0.3476, "step": 888 }, { "epoch": 2.7891530752603657, "grad_norm": 0.18406765162944794, "learning_rate": 6.0243133723547954e-05, "loss": 0.3148, "step": 889 }, { "epoch": 2.792297111416781, "grad_norm": 0.1878264844417572, "learning_rate": 6.019810895992797e-05, "loss": 0.2948, "step": 890 }, { "epoch": 2.795441147573197, "grad_norm": 0.19613507390022278, "learning_rate": 6.015308419630797e-05, "loss": 0.3094, "step": 891 }, { "epoch": 2.798585183729613, "grad_norm": 0.18215800821781158, "learning_rate": 6.010805943268798e-05, "loss": 0.3093, "step": 892 }, { "epoch": 2.801729219886029, "grad_norm": 0.1736704409122467, "learning_rate": 6.006303466906799e-05, "loss": 0.2924, "step": 893 }, { "epoch": 2.8048732560424448, "grad_norm": 0.18090717494487762, "learning_rate": 6.0018009905448004e-05, "loss": 0.3152, "step": 894 }, { "epoch": 2.80801729219886, "grad_norm": 0.17692336440086365, "learning_rate": 5.9972985141828005e-05, "loss": 0.3008, "step": 895 }, { "epoch": 2.811161328355276, "grad_norm": 0.16891434788703918, "learning_rate": 5.9927960378208014e-05, "loss": 0.311, "step": 896 }, { "epoch": 2.814305364511692, "grad_norm": 0.17044934630393982, "learning_rate": 5.988293561458803e-05, "loss": 0.2952, "step": 897 }, { "epoch": 2.8174494006681075, "grad_norm": 0.17588554322719574, "learning_rate": 5.983791085096804e-05, "loss": 0.3036, "step": 898 }, { "epoch": 2.8205934368245233, "grad_norm": 0.16531124711036682, "learning_rate": 5.9792886087348046e-05, "loss": 0.275, "step": 899 }, { "epoch": 2.8237374729809392, "grad_norm": 0.18077024817466736, "learning_rate": 5.974786132372805e-05, "loss": 0.333, "step": 900 }, { "epoch": 2.826881509137355, "grad_norm": 0.16825056076049805, "learning_rate": 5.9702836560108064e-05, "loss": 0.2946, "step": 901 }, { "epoch": 2.830025545293771, "grad_norm": 0.17718370258808136, "learning_rate": 5.965781179648807e-05, "loss": 0.3275, "step": 902 }, { "epoch": 2.8331695814501865, "grad_norm": 0.18112193048000336, "learning_rate": 5.961278703286808e-05, "loss": 0.2944, "step": 903 }, { "epoch": 2.8363136176066024, "grad_norm": 0.19248449802398682, "learning_rate": 5.956776226924808e-05, "loss": 0.3181, "step": 904 }, { "epoch": 2.8394576537630183, "grad_norm": 0.1759863793849945, "learning_rate": 5.95227375056281e-05, "loss": 0.3063, "step": 905 }, { "epoch": 2.842601689919434, "grad_norm": 0.17761097848415375, "learning_rate": 5.9477712742008107e-05, "loss": 0.2673, "step": 906 }, { "epoch": 2.84574572607585, "grad_norm": 0.18235954642295837, "learning_rate": 5.9432687978388115e-05, "loss": 0.2866, "step": 907 }, { "epoch": 2.8488897622322655, "grad_norm": 0.18456900119781494, "learning_rate": 5.938766321476813e-05, "loss": 0.3212, "step": 908 }, { "epoch": 2.8520337983886814, "grad_norm": 0.16696888208389282, "learning_rate": 5.934263845114814e-05, "loss": 0.2579, "step": 909 }, { "epoch": 2.8551778345450973, "grad_norm": 0.17061343789100647, "learning_rate": 5.929761368752814e-05, "loss": 0.2803, "step": 910 }, { "epoch": 2.8583218707015132, "grad_norm": 0.17469722032546997, "learning_rate": 5.925258892390815e-05, "loss": 0.3067, "step": 911 }, { "epoch": 2.861465906857929, "grad_norm": 0.18299564719200134, "learning_rate": 5.9207564160288165e-05, "loss": 0.2998, "step": 912 }, { "epoch": 2.8646099430143446, "grad_norm": 0.1751292496919632, "learning_rate": 5.9162539396668173e-05, "loss": 0.3201, "step": 913 }, { "epoch": 2.8677539791707605, "grad_norm": 0.18247152864933014, "learning_rate": 5.9117514633048175e-05, "loss": 0.2793, "step": 914 }, { "epoch": 2.8708980153271764, "grad_norm": 0.16854503750801086, "learning_rate": 5.907248986942819e-05, "loss": 0.2996, "step": 915 }, { "epoch": 2.874042051483592, "grad_norm": 0.19287988543510437, "learning_rate": 5.90274651058082e-05, "loss": 0.3109, "step": 916 }, { "epoch": 2.8771860876400077, "grad_norm": 0.18206001818180084, "learning_rate": 5.898244034218821e-05, "loss": 0.3212, "step": 917 }, { "epoch": 2.8803301237964236, "grad_norm": 0.1863626092672348, "learning_rate": 5.893741557856821e-05, "loss": 0.2971, "step": 918 }, { "epoch": 2.8834741599528395, "grad_norm": 0.17158293724060059, "learning_rate": 5.889239081494823e-05, "loss": 0.2839, "step": 919 }, { "epoch": 2.8866181961092554, "grad_norm": 0.1785619556903839, "learning_rate": 5.8847366051328234e-05, "loss": 0.2911, "step": 920 }, { "epoch": 2.889762232265671, "grad_norm": 0.18220217525959015, "learning_rate": 5.880234128770824e-05, "loss": 0.3096, "step": 921 }, { "epoch": 2.8929062684220868, "grad_norm": 0.1800495982170105, "learning_rate": 5.8757316524088244e-05, "loss": 0.2788, "step": 922 }, { "epoch": 2.8960503045785027, "grad_norm": 0.18722409009933472, "learning_rate": 5.8712291760468266e-05, "loss": 0.2904, "step": 923 }, { "epoch": 2.8991943407349186, "grad_norm": 0.16829417645931244, "learning_rate": 5.866726699684827e-05, "loss": 0.2629, "step": 924 }, { "epoch": 2.9023383768913344, "grad_norm": 0.1962534487247467, "learning_rate": 5.8622242233228277e-05, "loss": 0.338, "step": 925 }, { "epoch": 2.90548241304775, "grad_norm": 0.1857425421476364, "learning_rate": 5.857721746960829e-05, "loss": 0.3026, "step": 926 }, { "epoch": 2.908626449204166, "grad_norm": 0.20022393763065338, "learning_rate": 5.85321927059883e-05, "loss": 0.3094, "step": 927 }, { "epoch": 2.9117704853605817, "grad_norm": 0.19462057948112488, "learning_rate": 5.84871679423683e-05, "loss": 0.3156, "step": 928 }, { "epoch": 2.9149145215169976, "grad_norm": 0.18324823677539825, "learning_rate": 5.844214317874831e-05, "loss": 0.3029, "step": 929 }, { "epoch": 2.9180585576734135, "grad_norm": 0.18704599142074585, "learning_rate": 5.8397118415128326e-05, "loss": 0.3326, "step": 930 }, { "epoch": 2.921202593829829, "grad_norm": 0.17740800976753235, "learning_rate": 5.8352093651508335e-05, "loss": 0.317, "step": 931 }, { "epoch": 2.924346629986245, "grad_norm": 0.16319014132022858, "learning_rate": 5.830706888788834e-05, "loss": 0.2893, "step": 932 }, { "epoch": 2.9274906661426607, "grad_norm": 0.18747149407863617, "learning_rate": 5.826204412426836e-05, "loss": 0.3022, "step": 933 }, { "epoch": 2.930634702299076, "grad_norm": 0.17464233934879303, "learning_rate": 5.821701936064836e-05, "loss": 0.3031, "step": 934 }, { "epoch": 2.933778738455492, "grad_norm": 0.1764432042837143, "learning_rate": 5.817199459702837e-05, "loss": 0.3281, "step": 935 }, { "epoch": 2.936922774611908, "grad_norm": 0.19287651777267456, "learning_rate": 5.812696983340837e-05, "loss": 0.2972, "step": 936 }, { "epoch": 2.940066810768324, "grad_norm": 0.17478767037391663, "learning_rate": 5.808194506978839e-05, "loss": 0.2863, "step": 937 }, { "epoch": 2.9432108469247398, "grad_norm": 0.1827182024717331, "learning_rate": 5.8036920306168395e-05, "loss": 0.3128, "step": 938 }, { "epoch": 2.9463548830811552, "grad_norm": 0.19342662394046783, "learning_rate": 5.7991895542548404e-05, "loss": 0.2875, "step": 939 }, { "epoch": 2.949498919237571, "grad_norm": 0.186715766787529, "learning_rate": 5.7946870778928405e-05, "loss": 0.272, "step": 940 }, { "epoch": 2.952642955393987, "grad_norm": 0.18234872817993164, "learning_rate": 5.790184601530843e-05, "loss": 0.2563, "step": 941 }, { "epoch": 2.955786991550403, "grad_norm": 0.20079456269741058, "learning_rate": 5.785682125168843e-05, "loss": 0.3312, "step": 942 }, { "epoch": 2.958931027706819, "grad_norm": 0.18754902482032776, "learning_rate": 5.781179648806844e-05, "loss": 0.2765, "step": 943 }, { "epoch": 2.9620750638632343, "grad_norm": 0.17483817040920258, "learning_rate": 5.776677172444845e-05, "loss": 0.2822, "step": 944 }, { "epoch": 2.96521910001965, "grad_norm": 0.19734202325344086, "learning_rate": 5.772174696082846e-05, "loss": 0.3156, "step": 945 }, { "epoch": 2.968363136176066, "grad_norm": 0.18706603348255157, "learning_rate": 5.7676722197208464e-05, "loss": 0.3057, "step": 946 }, { "epoch": 2.971507172332482, "grad_norm": 0.1804167777299881, "learning_rate": 5.763169743358847e-05, "loss": 0.3098, "step": 947 }, { "epoch": 2.974651208488898, "grad_norm": 0.17411871254444122, "learning_rate": 5.758667266996849e-05, "loss": 0.2892, "step": 948 }, { "epoch": 2.9777952446453133, "grad_norm": 0.1795538067817688, "learning_rate": 5.7541647906348496e-05, "loss": 0.2949, "step": 949 }, { "epoch": 2.980939280801729, "grad_norm": 0.18205681443214417, "learning_rate": 5.74966231427285e-05, "loss": 0.2957, "step": 950 }, { "epoch": 2.984083316958145, "grad_norm": 0.18035931885242462, "learning_rate": 5.745159837910852e-05, "loss": 0.3048, "step": 951 }, { "epoch": 2.9872273531145606, "grad_norm": 0.1858542114496231, "learning_rate": 5.740657361548852e-05, "loss": 0.2855, "step": 952 }, { "epoch": 2.9903713892709765, "grad_norm": 0.17347067594528198, "learning_rate": 5.736154885186853e-05, "loss": 0.2921, "step": 953 }, { "epoch": 2.9935154254273924, "grad_norm": 0.1727266162633896, "learning_rate": 5.731652408824853e-05, "loss": 0.2945, "step": 954 }, { "epoch": 2.9966594615838082, "grad_norm": 0.1809038370847702, "learning_rate": 5.7271499324628554e-05, "loss": 0.3087, "step": 955 }, { "epoch": 2.999803497740224, "grad_norm": 0.19076983630657196, "learning_rate": 5.7226474561008556e-05, "loss": 0.3062, "step": 956 }, { "epoch": 3.0, "grad_norm": 0.835304856300354, "learning_rate": 5.7181449797388565e-05, "loss": 0.419, "step": 957 }, { "epoch": 3.0, "eval_loss": 0.34439337253570557, "eval_runtime": 128.3444, "eval_samples_per_second": 9.911, "eval_steps_per_second": 9.911, "step": 957 } ], "logging_steps": 1, "max_steps": 2226, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.1381612357676032e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }