diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,45192 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999534955820803, + "eval_steps": 500, + "global_step": 6450, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004650441791970237, + "grad_norm": 6.0558061599731445, + "learning_rate": 1.550387596899225e-08, + "loss": 0.8113, + "step": 1 + }, + { + "epoch": 0.0009300883583940474, + "grad_norm": 6.12605619430542, + "learning_rate": 3.10077519379845e-08, + "loss": 0.825, + "step": 2 + }, + { + "epoch": 0.0013951325375910712, + "grad_norm": 6.057300567626953, + "learning_rate": 4.6511627906976744e-08, + "loss": 0.8147, + "step": 3 + }, + { + "epoch": 0.0018601767167880949, + "grad_norm": 6.23735237121582, + "learning_rate": 6.2015503875969e-08, + "loss": 0.8659, + "step": 4 + }, + { + "epoch": 0.002325220895985119, + "grad_norm": 5.970339298248291, + "learning_rate": 7.751937984496124e-08, + "loss": 0.7994, + "step": 5 + }, + { + "epoch": 0.0027902650751821423, + "grad_norm": 6.014130592346191, + "learning_rate": 9.302325581395349e-08, + "loss": 0.7927, + "step": 6 + }, + { + "epoch": 0.0032553092543791662, + "grad_norm": 6.405108451843262, + "learning_rate": 1.0852713178294575e-07, + "loss": 0.8445, + "step": 7 + }, + { + "epoch": 0.0037203534335761897, + "grad_norm": 5.915679931640625, + "learning_rate": 1.24031007751938e-07, + "loss": 0.7874, + "step": 8 + }, + { + "epoch": 0.004185397612773214, + "grad_norm": 6.164096355438232, + "learning_rate": 1.3953488372093024e-07, + "loss": 0.841, + "step": 9 + }, + { + "epoch": 0.004650441791970238, + "grad_norm": 6.06751012802124, + "learning_rate": 1.5503875968992249e-07, + "loss": 0.8139, + "step": 10 + }, + { + "epoch": 0.005115485971167261, + "grad_norm": 6.07175350189209, + "learning_rate": 1.7054263565891473e-07, + "loss": 0.8137, + "step": 11 + }, + { + "epoch": 0.005580530150364285, + "grad_norm": 6.381070613861084, + "learning_rate": 1.8604651162790698e-07, + "loss": 0.8233, + "step": 12 + }, + { + "epoch": 0.0060455743295613085, + "grad_norm": 6.221768379211426, + "learning_rate": 2.0155038759689925e-07, + "loss": 0.8265, + "step": 13 + }, + { + "epoch": 0.0065106185087583325, + "grad_norm": 5.883142471313477, + "learning_rate": 2.170542635658915e-07, + "loss": 0.7789, + "step": 14 + }, + { + "epoch": 0.0069756626879553555, + "grad_norm": 5.981773853302002, + "learning_rate": 2.3255813953488374e-07, + "loss": 0.7948, + "step": 15 + }, + { + "epoch": 0.0074407068671523795, + "grad_norm": 5.9482035636901855, + "learning_rate": 2.48062015503876e-07, + "loss": 0.782, + "step": 16 + }, + { + "epoch": 0.007905751046349403, + "grad_norm": 6.413148403167725, + "learning_rate": 2.6356589147286826e-07, + "loss": 0.8528, + "step": 17 + }, + { + "epoch": 0.008370795225546427, + "grad_norm": 5.644394397735596, + "learning_rate": 2.790697674418605e-07, + "loss": 0.8007, + "step": 18 + }, + { + "epoch": 0.00883583940474345, + "grad_norm": 5.608232021331787, + "learning_rate": 2.9457364341085275e-07, + "loss": 0.7642, + "step": 19 + }, + { + "epoch": 0.009300883583940475, + "grad_norm": 5.738401889801025, + "learning_rate": 3.1007751937984497e-07, + "loss": 0.8086, + "step": 20 + }, + { + "epoch": 0.009765927763137498, + "grad_norm": 5.834144115447998, + "learning_rate": 3.2558139534883724e-07, + "loss": 0.8128, + "step": 21 + }, + { + "epoch": 0.010230971942334521, + "grad_norm": 5.556512355804443, + "learning_rate": 3.4108527131782946e-07, + "loss": 0.7919, + "step": 22 + }, + { + "epoch": 0.010696016121531546, + "grad_norm": 5.6250200271606445, + "learning_rate": 3.5658914728682174e-07, + "loss": 0.7924, + "step": 23 + }, + { + "epoch": 0.01116106030072857, + "grad_norm": 4.8199992179870605, + "learning_rate": 3.7209302325581396e-07, + "loss": 0.7686, + "step": 24 + }, + { + "epoch": 0.011626104479925592, + "grad_norm": 4.794100761413574, + "learning_rate": 3.8759689922480623e-07, + "loss": 0.7635, + "step": 25 + }, + { + "epoch": 0.012091148659122617, + "grad_norm": 4.6750874519348145, + "learning_rate": 4.031007751937985e-07, + "loss": 0.748, + "step": 26 + }, + { + "epoch": 0.01255619283831964, + "grad_norm": 4.775611877441406, + "learning_rate": 4.186046511627907e-07, + "loss": 0.7594, + "step": 27 + }, + { + "epoch": 0.013021237017516665, + "grad_norm": 4.403256893157959, + "learning_rate": 4.34108527131783e-07, + "loss": 0.7302, + "step": 28 + }, + { + "epoch": 0.013486281196713688, + "grad_norm": 4.565180778503418, + "learning_rate": 4.496124031007752e-07, + "loss": 0.7729, + "step": 29 + }, + { + "epoch": 0.013951325375910711, + "grad_norm": 4.50689697265625, + "learning_rate": 4.651162790697675e-07, + "loss": 0.7524, + "step": 30 + }, + { + "epoch": 0.014416369555107736, + "grad_norm": 4.400961875915527, + "learning_rate": 4.806201550387598e-07, + "loss": 0.7466, + "step": 31 + }, + { + "epoch": 0.014881413734304759, + "grad_norm": 4.230681419372559, + "learning_rate": 4.96124031007752e-07, + "loss": 0.7352, + "step": 32 + }, + { + "epoch": 0.015346457913501782, + "grad_norm": 3.3713021278381348, + "learning_rate": 5.116279069767442e-07, + "loss": 0.7095, + "step": 33 + }, + { + "epoch": 0.015811502092698805, + "grad_norm": 2.6536169052124023, + "learning_rate": 5.271317829457365e-07, + "loss": 0.7099, + "step": 34 + }, + { + "epoch": 0.01627654627189583, + "grad_norm": 2.5625736713409424, + "learning_rate": 5.426356589147287e-07, + "loss": 0.7209, + "step": 35 + }, + { + "epoch": 0.016741590451092855, + "grad_norm": 2.7283709049224854, + "learning_rate": 5.58139534883721e-07, + "loss": 0.7052, + "step": 36 + }, + { + "epoch": 0.017206634630289878, + "grad_norm": 2.3967738151550293, + "learning_rate": 5.736434108527132e-07, + "loss": 0.6705, + "step": 37 + }, + { + "epoch": 0.0176716788094869, + "grad_norm": 2.436204195022583, + "learning_rate": 5.891472868217055e-07, + "loss": 0.726, + "step": 38 + }, + { + "epoch": 0.018136722988683924, + "grad_norm": 2.3361918926239014, + "learning_rate": 6.046511627906977e-07, + "loss": 0.6694, + "step": 39 + }, + { + "epoch": 0.01860176716788095, + "grad_norm": 2.221872568130493, + "learning_rate": 6.201550387596899e-07, + "loss": 0.6728, + "step": 40 + }, + { + "epoch": 0.019066811347077973, + "grad_norm": 2.210745096206665, + "learning_rate": 6.356589147286822e-07, + "loss": 0.6902, + "step": 41 + }, + { + "epoch": 0.019531855526274997, + "grad_norm": 2.1230080127716064, + "learning_rate": 6.511627906976745e-07, + "loss": 0.697, + "step": 42 + }, + { + "epoch": 0.01999689970547202, + "grad_norm": 2.061232089996338, + "learning_rate": 6.666666666666667e-07, + "loss": 0.6746, + "step": 43 + }, + { + "epoch": 0.020461943884669043, + "grad_norm": 1.6226105690002441, + "learning_rate": 6.821705426356589e-07, + "loss": 0.6732, + "step": 44 + }, + { + "epoch": 0.020926988063866066, + "grad_norm": 1.4576574563980103, + "learning_rate": 6.976744186046513e-07, + "loss": 0.6821, + "step": 45 + }, + { + "epoch": 0.021392032243063092, + "grad_norm": 1.4970301389694214, + "learning_rate": 7.131782945736435e-07, + "loss": 0.6661, + "step": 46 + }, + { + "epoch": 0.021857076422260115, + "grad_norm": 1.6816171407699585, + "learning_rate": 7.286821705426357e-07, + "loss": 0.671, + "step": 47 + }, + { + "epoch": 0.02232212060145714, + "grad_norm": 1.7516964673995972, + "learning_rate": 7.441860465116279e-07, + "loss": 0.5983, + "step": 48 + }, + { + "epoch": 0.02278716478065416, + "grad_norm": 1.8953578472137451, + "learning_rate": 7.596899224806202e-07, + "loss": 0.6441, + "step": 49 + }, + { + "epoch": 0.023252208959851185, + "grad_norm": 1.8705769777297974, + "learning_rate": 7.751937984496125e-07, + "loss": 0.6332, + "step": 50 + }, + { + "epoch": 0.02371725313904821, + "grad_norm": 1.9592753648757935, + "learning_rate": 7.906976744186047e-07, + "loss": 0.6445, + "step": 51 + }, + { + "epoch": 0.024182297318245234, + "grad_norm": 1.7867804765701294, + "learning_rate": 8.06201550387597e-07, + "loss": 0.6449, + "step": 52 + }, + { + "epoch": 0.024647341497442257, + "grad_norm": 1.7160060405731201, + "learning_rate": 8.217054263565892e-07, + "loss": 0.6299, + "step": 53 + }, + { + "epoch": 0.02511238567663928, + "grad_norm": 1.3905798196792603, + "learning_rate": 8.372093023255814e-07, + "loss": 0.5903, + "step": 54 + }, + { + "epoch": 0.025577429855836303, + "grad_norm": 1.2955713272094727, + "learning_rate": 8.527131782945737e-07, + "loss": 0.6332, + "step": 55 + }, + { + "epoch": 0.02604247403503333, + "grad_norm": 1.2509273290634155, + "learning_rate": 8.68217054263566e-07, + "loss": 0.6233, + "step": 56 + }, + { + "epoch": 0.026507518214230353, + "grad_norm": 1.0419048070907593, + "learning_rate": 8.837209302325582e-07, + "loss": 0.5845, + "step": 57 + }, + { + "epoch": 0.026972562393427376, + "grad_norm": 1.083308458328247, + "learning_rate": 8.992248062015504e-07, + "loss": 0.6179, + "step": 58 + }, + { + "epoch": 0.0274376065726244, + "grad_norm": 1.011205792427063, + "learning_rate": 9.147286821705427e-07, + "loss": 0.6055, + "step": 59 + }, + { + "epoch": 0.027902650751821422, + "grad_norm": 0.9679791927337646, + "learning_rate": 9.30232558139535e-07, + "loss": 0.549, + "step": 60 + }, + { + "epoch": 0.028367694931018445, + "grad_norm": 0.9358096718788147, + "learning_rate": 9.457364341085272e-07, + "loss": 0.5544, + "step": 61 + }, + { + "epoch": 0.028832739110215472, + "grad_norm": 0.8703131079673767, + "learning_rate": 9.612403100775195e-07, + "loss": 0.5859, + "step": 62 + }, + { + "epoch": 0.029297783289412495, + "grad_norm": 0.7809782028198242, + "learning_rate": 9.767441860465117e-07, + "loss": 0.5716, + "step": 63 + }, + { + "epoch": 0.029762827468609518, + "grad_norm": 0.7614837884902954, + "learning_rate": 9.92248062015504e-07, + "loss": 0.5784, + "step": 64 + }, + { + "epoch": 0.03022787164780654, + "grad_norm": 0.7258252501487732, + "learning_rate": 1.0077519379844962e-06, + "loss": 0.5985, + "step": 65 + }, + { + "epoch": 0.030692915827003564, + "grad_norm": 0.7365714907646179, + "learning_rate": 1.0232558139534884e-06, + "loss": 0.5962, + "step": 66 + }, + { + "epoch": 0.03115796000620059, + "grad_norm": 0.7005903124809265, + "learning_rate": 1.0387596899224806e-06, + "loss": 0.5533, + "step": 67 + }, + { + "epoch": 0.03162300418539761, + "grad_norm": 0.7548089623451233, + "learning_rate": 1.054263565891473e-06, + "loss": 0.6215, + "step": 68 + }, + { + "epoch": 0.03208804836459463, + "grad_norm": 0.6190081238746643, + "learning_rate": 1.0697674418604653e-06, + "loss": 0.5489, + "step": 69 + }, + { + "epoch": 0.03255309254379166, + "grad_norm": 0.7042703628540039, + "learning_rate": 1.0852713178294575e-06, + "loss": 0.5756, + "step": 70 + }, + { + "epoch": 0.033018136722988686, + "grad_norm": 0.6290520429611206, + "learning_rate": 1.1007751937984497e-06, + "loss": 0.5925, + "step": 71 + }, + { + "epoch": 0.03348318090218571, + "grad_norm": 0.5906243920326233, + "learning_rate": 1.116279069767442e-06, + "loss": 0.5618, + "step": 72 + }, + { + "epoch": 0.03394822508138273, + "grad_norm": 0.5369678735733032, + "learning_rate": 1.1317829457364341e-06, + "loss": 0.555, + "step": 73 + }, + { + "epoch": 0.034413269260579755, + "grad_norm": 0.5198186039924622, + "learning_rate": 1.1472868217054264e-06, + "loss": 0.534, + "step": 74 + }, + { + "epoch": 0.03487831343977678, + "grad_norm": 0.5380486845970154, + "learning_rate": 1.1627906976744188e-06, + "loss": 0.5528, + "step": 75 + }, + { + "epoch": 0.0353433576189738, + "grad_norm": 0.5507330298423767, + "learning_rate": 1.178294573643411e-06, + "loss": 0.5608, + "step": 76 + }, + { + "epoch": 0.035808401798170825, + "grad_norm": 0.5470713376998901, + "learning_rate": 1.1937984496124032e-06, + "loss": 0.549, + "step": 77 + }, + { + "epoch": 0.03627344597736785, + "grad_norm": 0.5553979277610779, + "learning_rate": 1.2093023255813954e-06, + "loss": 0.5203, + "step": 78 + }, + { + "epoch": 0.03673849015656487, + "grad_norm": 0.5427577495574951, + "learning_rate": 1.2248062015503877e-06, + "loss": 0.5348, + "step": 79 + }, + { + "epoch": 0.0372035343357619, + "grad_norm": 0.5340808629989624, + "learning_rate": 1.2403100775193799e-06, + "loss": 0.5373, + "step": 80 + }, + { + "epoch": 0.037668578514958924, + "grad_norm": 0.5276907682418823, + "learning_rate": 1.2558139534883723e-06, + "loss": 0.5403, + "step": 81 + }, + { + "epoch": 0.03813362269415595, + "grad_norm": 0.4982495605945587, + "learning_rate": 1.2713178294573643e-06, + "loss": 0.5283, + "step": 82 + }, + { + "epoch": 0.03859866687335297, + "grad_norm": 0.48218581080436707, + "learning_rate": 1.2868217054263568e-06, + "loss": 0.5499, + "step": 83 + }, + { + "epoch": 0.03906371105254999, + "grad_norm": 0.4612601399421692, + "learning_rate": 1.302325581395349e-06, + "loss": 0.5276, + "step": 84 + }, + { + "epoch": 0.039528755231747016, + "grad_norm": 0.48117271065711975, + "learning_rate": 1.3178294573643414e-06, + "loss": 0.5462, + "step": 85 + }, + { + "epoch": 0.03999379941094404, + "grad_norm": 0.464334636926651, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.5255, + "step": 86 + }, + { + "epoch": 0.04045884359014106, + "grad_norm": 0.5120902061462402, + "learning_rate": 1.3488372093023258e-06, + "loss": 0.553, + "step": 87 + }, + { + "epoch": 0.040923887769338085, + "grad_norm": 0.5096696019172668, + "learning_rate": 1.3643410852713179e-06, + "loss": 0.5242, + "step": 88 + }, + { + "epoch": 0.04138893194853511, + "grad_norm": 0.44623059034347534, + "learning_rate": 1.3798449612403103e-06, + "loss": 0.523, + "step": 89 + }, + { + "epoch": 0.04185397612773213, + "grad_norm": 0.47654956579208374, + "learning_rate": 1.3953488372093025e-06, + "loss": 0.5335, + "step": 90 + }, + { + "epoch": 0.04231902030692916, + "grad_norm": 0.4882866144180298, + "learning_rate": 1.4108527131782947e-06, + "loss": 0.5176, + "step": 91 + }, + { + "epoch": 0.042784064486126185, + "grad_norm": 0.46192339062690735, + "learning_rate": 1.426356589147287e-06, + "loss": 0.545, + "step": 92 + }, + { + "epoch": 0.04324910866532321, + "grad_norm": 0.4416024684906006, + "learning_rate": 1.4418604651162794e-06, + "loss": 0.526, + "step": 93 + }, + { + "epoch": 0.04371415284452023, + "grad_norm": 0.45274990797042847, + "learning_rate": 1.4573643410852714e-06, + "loss": 0.5233, + "step": 94 + }, + { + "epoch": 0.044179197023717254, + "grad_norm": 0.4544108808040619, + "learning_rate": 1.4728682170542638e-06, + "loss": 0.5118, + "step": 95 + }, + { + "epoch": 0.04464424120291428, + "grad_norm": 0.4827730655670166, + "learning_rate": 1.4883720930232558e-06, + "loss": 0.5087, + "step": 96 + }, + { + "epoch": 0.0451092853821113, + "grad_norm": 0.45739415287971497, + "learning_rate": 1.5038759689922483e-06, + "loss": 0.5058, + "step": 97 + }, + { + "epoch": 0.04557432956130832, + "grad_norm": 0.4581778645515442, + "learning_rate": 1.5193798449612405e-06, + "loss": 0.5272, + "step": 98 + }, + { + "epoch": 0.046039373740505346, + "grad_norm": 0.43706098198890686, + "learning_rate": 1.534883720930233e-06, + "loss": 0.5205, + "step": 99 + }, + { + "epoch": 0.04650441791970237, + "grad_norm": 0.43774181604385376, + "learning_rate": 1.550387596899225e-06, + "loss": 0.5033, + "step": 100 + }, + { + "epoch": 0.04696946209889939, + "grad_norm": 0.42946186661720276, + "learning_rate": 1.5658914728682173e-06, + "loss": 0.5567, + "step": 101 + }, + { + "epoch": 0.04743450627809642, + "grad_norm": 0.4620288908481598, + "learning_rate": 1.5813953488372093e-06, + "loss": 0.4879, + "step": 102 + }, + { + "epoch": 0.047899550457293445, + "grad_norm": 0.4660142958164215, + "learning_rate": 1.5968992248062018e-06, + "loss": 0.5373, + "step": 103 + }, + { + "epoch": 0.04836459463649047, + "grad_norm": 0.4969952404499054, + "learning_rate": 1.612403100775194e-06, + "loss": 0.5156, + "step": 104 + }, + { + "epoch": 0.04882963881568749, + "grad_norm": 0.4132183790206909, + "learning_rate": 1.6279069767441862e-06, + "loss": 0.496, + "step": 105 + }, + { + "epoch": 0.049294682994884514, + "grad_norm": 0.433496356010437, + "learning_rate": 1.6434108527131784e-06, + "loss": 0.5313, + "step": 106 + }, + { + "epoch": 0.04975972717408154, + "grad_norm": 0.4335452914237976, + "learning_rate": 1.6589147286821709e-06, + "loss": 0.5167, + "step": 107 + }, + { + "epoch": 0.05022477135327856, + "grad_norm": 0.43790504336357117, + "learning_rate": 1.6744186046511629e-06, + "loss": 0.4973, + "step": 108 + }, + { + "epoch": 0.050689815532475584, + "grad_norm": 0.4186980724334717, + "learning_rate": 1.6899224806201553e-06, + "loss": 0.4954, + "step": 109 + }, + { + "epoch": 0.05115485971167261, + "grad_norm": 0.40744200348854065, + "learning_rate": 1.7054263565891473e-06, + "loss": 0.4927, + "step": 110 + }, + { + "epoch": 0.05161990389086963, + "grad_norm": 0.43091249465942383, + "learning_rate": 1.7209302325581397e-06, + "loss": 0.5062, + "step": 111 + }, + { + "epoch": 0.05208494807006666, + "grad_norm": 0.4677499830722809, + "learning_rate": 1.736434108527132e-06, + "loss": 0.5107, + "step": 112 + }, + { + "epoch": 0.05254999224926368, + "grad_norm": 0.42137694358825684, + "learning_rate": 1.7519379844961242e-06, + "loss": 0.5132, + "step": 113 + }, + { + "epoch": 0.053015036428460706, + "grad_norm": 0.4344700872898102, + "learning_rate": 1.7674418604651164e-06, + "loss": 0.4793, + "step": 114 + }, + { + "epoch": 0.05348008060765773, + "grad_norm": 0.4326712489128113, + "learning_rate": 1.7829457364341088e-06, + "loss": 0.4715, + "step": 115 + }, + { + "epoch": 0.05394512478685475, + "grad_norm": 0.4487822949886322, + "learning_rate": 1.7984496124031008e-06, + "loss": 0.5165, + "step": 116 + }, + { + "epoch": 0.054410168966051775, + "grad_norm": 0.46336719393730164, + "learning_rate": 1.8139534883720933e-06, + "loss": 0.512, + "step": 117 + }, + { + "epoch": 0.0548752131452488, + "grad_norm": 0.4590426981449127, + "learning_rate": 1.8294573643410855e-06, + "loss": 0.5036, + "step": 118 + }, + { + "epoch": 0.05534025732444582, + "grad_norm": 0.4003012478351593, + "learning_rate": 1.8449612403100777e-06, + "loss": 0.4894, + "step": 119 + }, + { + "epoch": 0.055805301503642844, + "grad_norm": 0.45950761437416077, + "learning_rate": 1.86046511627907e-06, + "loss": 0.5109, + "step": 120 + }, + { + "epoch": 0.05627034568283987, + "grad_norm": 0.43410784006118774, + "learning_rate": 1.8759689922480624e-06, + "loss": 0.4966, + "step": 121 + }, + { + "epoch": 0.05673538986203689, + "grad_norm": 0.45534461736679077, + "learning_rate": 1.8914728682170544e-06, + "loss": 0.5311, + "step": 122 + }, + { + "epoch": 0.05720043404123392, + "grad_norm": 0.44469308853149414, + "learning_rate": 1.9069767441860468e-06, + "loss": 0.4967, + "step": 123 + }, + { + "epoch": 0.057665478220430944, + "grad_norm": 0.4573187232017517, + "learning_rate": 1.922480620155039e-06, + "loss": 0.4864, + "step": 124 + }, + { + "epoch": 0.05813052239962797, + "grad_norm": 0.45671388506889343, + "learning_rate": 1.9379844961240315e-06, + "loss": 0.5046, + "step": 125 + }, + { + "epoch": 0.05859556657882499, + "grad_norm": 0.4924532473087311, + "learning_rate": 1.9534883720930235e-06, + "loss": 0.5474, + "step": 126 + }, + { + "epoch": 0.05906061075802201, + "grad_norm": 0.45947206020355225, + "learning_rate": 1.968992248062016e-06, + "loss": 0.463, + "step": 127 + }, + { + "epoch": 0.059525654937219036, + "grad_norm": 0.4114924967288971, + "learning_rate": 1.984496124031008e-06, + "loss": 0.5045, + "step": 128 + }, + { + "epoch": 0.05999069911641606, + "grad_norm": 0.424427330493927, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.4865, + "step": 129 + }, + { + "epoch": 0.06045574329561308, + "grad_norm": 0.457966685295105, + "learning_rate": 2.0155038759689923e-06, + "loss": 0.492, + "step": 130 + }, + { + "epoch": 0.060920787474810105, + "grad_norm": 0.4255077540874481, + "learning_rate": 2.0310077519379848e-06, + "loss": 0.4961, + "step": 131 + }, + { + "epoch": 0.06138583165400713, + "grad_norm": 0.39936286211013794, + "learning_rate": 2.0465116279069768e-06, + "loss": 0.4595, + "step": 132 + }, + { + "epoch": 0.06185087583320415, + "grad_norm": 0.39685145020484924, + "learning_rate": 2.062015503875969e-06, + "loss": 0.4645, + "step": 133 + }, + { + "epoch": 0.06231592001240118, + "grad_norm": 0.44578003883361816, + "learning_rate": 2.0775193798449612e-06, + "loss": 0.4848, + "step": 134 + }, + { + "epoch": 0.0627809641915982, + "grad_norm": 0.4608939290046692, + "learning_rate": 2.0930232558139536e-06, + "loss": 0.444, + "step": 135 + }, + { + "epoch": 0.06324600837079522, + "grad_norm": 0.43158158659935, + "learning_rate": 2.108527131782946e-06, + "loss": 0.4781, + "step": 136 + }, + { + "epoch": 0.06371105254999225, + "grad_norm": 0.43560659885406494, + "learning_rate": 2.124031007751938e-06, + "loss": 0.4768, + "step": 137 + }, + { + "epoch": 0.06417609672918927, + "grad_norm": 0.39161673188209534, + "learning_rate": 2.1395348837209305e-06, + "loss": 0.4824, + "step": 138 + }, + { + "epoch": 0.0646411409083863, + "grad_norm": 0.492993026971817, + "learning_rate": 2.155038759689923e-06, + "loss": 0.4708, + "step": 139 + }, + { + "epoch": 0.06510618508758333, + "grad_norm": 0.4824112057685852, + "learning_rate": 2.170542635658915e-06, + "loss": 0.5064, + "step": 140 + }, + { + "epoch": 0.06557122926678034, + "grad_norm": 0.42813244462013245, + "learning_rate": 2.1860465116279074e-06, + "loss": 0.4815, + "step": 141 + }, + { + "epoch": 0.06603627344597737, + "grad_norm": 0.43404629826545715, + "learning_rate": 2.2015503875968994e-06, + "loss": 0.4808, + "step": 142 + }, + { + "epoch": 0.06650131762517439, + "grad_norm": 0.44562390446662903, + "learning_rate": 2.217054263565892e-06, + "loss": 0.495, + "step": 143 + }, + { + "epoch": 0.06696636180437142, + "grad_norm": 0.43140050768852234, + "learning_rate": 2.232558139534884e-06, + "loss": 0.4634, + "step": 144 + }, + { + "epoch": 0.06743140598356843, + "grad_norm": 0.43824145197868347, + "learning_rate": 2.2480620155038763e-06, + "loss": 0.5077, + "step": 145 + }, + { + "epoch": 0.06789645016276546, + "grad_norm": 0.45454517006874084, + "learning_rate": 2.2635658914728683e-06, + "loss": 0.4605, + "step": 146 + }, + { + "epoch": 0.06836149434196248, + "grad_norm": 0.4356607496738434, + "learning_rate": 2.2790697674418607e-06, + "loss": 0.5132, + "step": 147 + }, + { + "epoch": 0.06882653852115951, + "grad_norm": 0.4299354553222656, + "learning_rate": 2.2945736434108527e-06, + "loss": 0.4709, + "step": 148 + }, + { + "epoch": 0.06929158270035653, + "grad_norm": 0.4432096481323242, + "learning_rate": 2.310077519379845e-06, + "loss": 0.4743, + "step": 149 + }, + { + "epoch": 0.06975662687955356, + "grad_norm": 0.4662824273109436, + "learning_rate": 2.3255813953488376e-06, + "loss": 0.5184, + "step": 150 + }, + { + "epoch": 0.07022167105875059, + "grad_norm": 0.4367971420288086, + "learning_rate": 2.3410852713178296e-06, + "loss": 0.4778, + "step": 151 + }, + { + "epoch": 0.0706867152379476, + "grad_norm": 0.4500759243965149, + "learning_rate": 2.356589147286822e-06, + "loss": 0.4534, + "step": 152 + }, + { + "epoch": 0.07115175941714463, + "grad_norm": 0.4312923848628998, + "learning_rate": 2.3720930232558144e-06, + "loss": 0.5134, + "step": 153 + }, + { + "epoch": 0.07161680359634165, + "grad_norm": 0.4794353246688843, + "learning_rate": 2.3875968992248065e-06, + "loss": 0.4641, + "step": 154 + }, + { + "epoch": 0.07208184777553868, + "grad_norm": 0.4282940924167633, + "learning_rate": 2.403100775193799e-06, + "loss": 0.4693, + "step": 155 + }, + { + "epoch": 0.0725468919547357, + "grad_norm": 0.4426284730434418, + "learning_rate": 2.418604651162791e-06, + "loss": 0.4785, + "step": 156 + }, + { + "epoch": 0.07301193613393273, + "grad_norm": 0.45532897114753723, + "learning_rate": 2.4341085271317833e-06, + "loss": 0.4934, + "step": 157 + }, + { + "epoch": 0.07347698031312974, + "grad_norm": 0.5064568519592285, + "learning_rate": 2.4496124031007753e-06, + "loss": 0.495, + "step": 158 + }, + { + "epoch": 0.07394202449232677, + "grad_norm": 0.4576449692249298, + "learning_rate": 2.4651162790697678e-06, + "loss": 0.4551, + "step": 159 + }, + { + "epoch": 0.0744070686715238, + "grad_norm": 0.4424917697906494, + "learning_rate": 2.4806201550387598e-06, + "loss": 0.4861, + "step": 160 + }, + { + "epoch": 0.07487211285072082, + "grad_norm": 0.48563066124916077, + "learning_rate": 2.496124031007752e-06, + "loss": 0.4614, + "step": 161 + }, + { + "epoch": 0.07533715702991785, + "grad_norm": 0.43007153272628784, + "learning_rate": 2.5116279069767446e-06, + "loss": 0.4822, + "step": 162 + }, + { + "epoch": 0.07580220120911486, + "grad_norm": 0.41486161947250366, + "learning_rate": 2.5271317829457366e-06, + "loss": 0.4713, + "step": 163 + }, + { + "epoch": 0.0762672453883119, + "grad_norm": 0.39590543508529663, + "learning_rate": 2.5426356589147286e-06, + "loss": 0.5051, + "step": 164 + }, + { + "epoch": 0.07673228956750891, + "grad_norm": 0.4512157738208771, + "learning_rate": 2.558139534883721e-06, + "loss": 0.4597, + "step": 165 + }, + { + "epoch": 0.07719733374670594, + "grad_norm": 0.5571392178535461, + "learning_rate": 2.5736434108527135e-06, + "loss": 0.4783, + "step": 166 + }, + { + "epoch": 0.07766237792590296, + "grad_norm": 0.41649627685546875, + "learning_rate": 2.5891472868217055e-06, + "loss": 0.4648, + "step": 167 + }, + { + "epoch": 0.07812742210509999, + "grad_norm": 0.45857080817222595, + "learning_rate": 2.604651162790698e-06, + "loss": 0.4793, + "step": 168 + }, + { + "epoch": 0.078592466284297, + "grad_norm": 0.42742466926574707, + "learning_rate": 2.6201550387596904e-06, + "loss": 0.4894, + "step": 169 + }, + { + "epoch": 0.07905751046349403, + "grad_norm": 0.48390820622444153, + "learning_rate": 2.635658914728683e-06, + "loss": 0.4707, + "step": 170 + }, + { + "epoch": 0.07952255464269106, + "grad_norm": 0.5031663775444031, + "learning_rate": 2.6511627906976744e-06, + "loss": 0.4591, + "step": 171 + }, + { + "epoch": 0.07998759882188808, + "grad_norm": 0.4769449830055237, + "learning_rate": 2.666666666666667e-06, + "loss": 0.4735, + "step": 172 + }, + { + "epoch": 0.08045264300108511, + "grad_norm": 0.4525051712989807, + "learning_rate": 2.6821705426356593e-06, + "loss": 0.4655, + "step": 173 + }, + { + "epoch": 0.08091768718028212, + "grad_norm": 0.4477654695510864, + "learning_rate": 2.6976744186046517e-06, + "loss": 0.461, + "step": 174 + }, + { + "epoch": 0.08138273135947915, + "grad_norm": 0.45438140630722046, + "learning_rate": 2.7131782945736433e-06, + "loss": 0.4807, + "step": 175 + }, + { + "epoch": 0.08184777553867617, + "grad_norm": 0.47995445132255554, + "learning_rate": 2.7286821705426357e-06, + "loss": 0.4768, + "step": 176 + }, + { + "epoch": 0.0823128197178732, + "grad_norm": 0.40142083168029785, + "learning_rate": 2.744186046511628e-06, + "loss": 0.4814, + "step": 177 + }, + { + "epoch": 0.08277786389707022, + "grad_norm": 0.4523696005344391, + "learning_rate": 2.7596899224806206e-06, + "loss": 0.4561, + "step": 178 + }, + { + "epoch": 0.08324290807626725, + "grad_norm": 0.4552527368068695, + "learning_rate": 2.7751937984496126e-06, + "loss": 0.4796, + "step": 179 + }, + { + "epoch": 0.08370795225546426, + "grad_norm": 0.4601724445819855, + "learning_rate": 2.790697674418605e-06, + "loss": 0.4733, + "step": 180 + }, + { + "epoch": 0.08417299643466129, + "grad_norm": 0.4687198996543884, + "learning_rate": 2.8062015503875974e-06, + "loss": 0.4648, + "step": 181 + }, + { + "epoch": 0.08463804061385832, + "grad_norm": 0.4488019347190857, + "learning_rate": 2.8217054263565894e-06, + "loss": 0.4794, + "step": 182 + }, + { + "epoch": 0.08510308479305534, + "grad_norm": 0.462455153465271, + "learning_rate": 2.8372093023255815e-06, + "loss": 0.4699, + "step": 183 + }, + { + "epoch": 0.08556812897225237, + "grad_norm": 0.43786558508872986, + "learning_rate": 2.852713178294574e-06, + "loss": 0.4807, + "step": 184 + }, + { + "epoch": 0.08603317315144939, + "grad_norm": 0.4396842122077942, + "learning_rate": 2.8682170542635663e-06, + "loss": 0.4744, + "step": 185 + }, + { + "epoch": 0.08649821733064642, + "grad_norm": 0.40669193863868713, + "learning_rate": 2.8837209302325587e-06, + "loss": 0.5, + "step": 186 + }, + { + "epoch": 0.08696326150984343, + "grad_norm": 0.43069615960121155, + "learning_rate": 2.8992248062015503e-06, + "loss": 0.4666, + "step": 187 + }, + { + "epoch": 0.08742830568904046, + "grad_norm": 0.3929974138736725, + "learning_rate": 2.9147286821705428e-06, + "loss": 0.4599, + "step": 188 + }, + { + "epoch": 0.08789334986823748, + "grad_norm": 0.44300031661987305, + "learning_rate": 2.930232558139535e-06, + "loss": 0.4441, + "step": 189 + }, + { + "epoch": 0.08835839404743451, + "grad_norm": 0.40551915764808655, + "learning_rate": 2.9457364341085276e-06, + "loss": 0.4536, + "step": 190 + }, + { + "epoch": 0.08882343822663152, + "grad_norm": 0.45988717675209045, + "learning_rate": 2.9612403100775196e-06, + "loss": 0.4442, + "step": 191 + }, + { + "epoch": 0.08928848240582855, + "grad_norm": 0.41964244842529297, + "learning_rate": 2.9767441860465116e-06, + "loss": 0.4294, + "step": 192 + }, + { + "epoch": 0.08975352658502558, + "grad_norm": 0.4278603196144104, + "learning_rate": 2.992248062015504e-06, + "loss": 0.4512, + "step": 193 + }, + { + "epoch": 0.0902185707642226, + "grad_norm": 0.4759548008441925, + "learning_rate": 3.0077519379844965e-06, + "loss": 0.4856, + "step": 194 + }, + { + "epoch": 0.09068361494341963, + "grad_norm": 0.44720789790153503, + "learning_rate": 3.0232558139534885e-06, + "loss": 0.439, + "step": 195 + }, + { + "epoch": 0.09114865912261665, + "grad_norm": 0.4375242590904236, + "learning_rate": 3.038759689922481e-06, + "loss": 0.4559, + "step": 196 + }, + { + "epoch": 0.09161370330181368, + "grad_norm": 0.4382531940937042, + "learning_rate": 3.0542635658914734e-06, + "loss": 0.4651, + "step": 197 + }, + { + "epoch": 0.09207874748101069, + "grad_norm": 0.46348533034324646, + "learning_rate": 3.069767441860466e-06, + "loss": 0.4594, + "step": 198 + }, + { + "epoch": 0.09254379166020772, + "grad_norm": 0.5095227360725403, + "learning_rate": 3.0852713178294574e-06, + "loss": 0.4621, + "step": 199 + }, + { + "epoch": 0.09300883583940474, + "grad_norm": 0.5287566781044006, + "learning_rate": 3.10077519379845e-06, + "loss": 0.4805, + "step": 200 + }, + { + "epoch": 0.09347388001860177, + "grad_norm": 0.49240946769714355, + "learning_rate": 3.1162790697674423e-06, + "loss": 0.4836, + "step": 201 + }, + { + "epoch": 0.09393892419779878, + "grad_norm": 0.4462595283985138, + "learning_rate": 3.1317829457364347e-06, + "loss": 0.4466, + "step": 202 + }, + { + "epoch": 0.09440396837699581, + "grad_norm": 0.5251316428184509, + "learning_rate": 3.1472868217054263e-06, + "loss": 0.4754, + "step": 203 + }, + { + "epoch": 0.09486901255619284, + "grad_norm": 0.516490638256073, + "learning_rate": 3.1627906976744187e-06, + "loss": 0.4768, + "step": 204 + }, + { + "epoch": 0.09533405673538986, + "grad_norm": 0.455197274684906, + "learning_rate": 3.178294573643411e-06, + "loss": 0.449, + "step": 205 + }, + { + "epoch": 0.09579910091458689, + "grad_norm": 0.4805566966533661, + "learning_rate": 3.1937984496124036e-06, + "loss": 0.4775, + "step": 206 + }, + { + "epoch": 0.0962641450937839, + "grad_norm": 0.49571681022644043, + "learning_rate": 3.2093023255813956e-06, + "loss": 0.4297, + "step": 207 + }, + { + "epoch": 0.09672918927298094, + "grad_norm": 0.43034252524375916, + "learning_rate": 3.224806201550388e-06, + "loss": 0.4941, + "step": 208 + }, + { + "epoch": 0.09719423345217795, + "grad_norm": 0.539557933807373, + "learning_rate": 3.24031007751938e-06, + "loss": 0.4412, + "step": 209 + }, + { + "epoch": 0.09765927763137498, + "grad_norm": 0.5281848907470703, + "learning_rate": 3.2558139534883724e-06, + "loss": 0.468, + "step": 210 + }, + { + "epoch": 0.098124321810572, + "grad_norm": 0.4832194745540619, + "learning_rate": 3.2713178294573644e-06, + "loss": 0.4691, + "step": 211 + }, + { + "epoch": 0.09858936598976903, + "grad_norm": 0.4533843696117401, + "learning_rate": 3.286821705426357e-06, + "loss": 0.4746, + "step": 212 + }, + { + "epoch": 0.09905441016896605, + "grad_norm": 0.509421169757843, + "learning_rate": 3.3023255813953493e-06, + "loss": 0.4646, + "step": 213 + }, + { + "epoch": 0.09951945434816308, + "grad_norm": 0.4463287889957428, + "learning_rate": 3.3178294573643417e-06, + "loss": 0.4763, + "step": 214 + }, + { + "epoch": 0.0999844985273601, + "grad_norm": 0.443873792886734, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.443, + "step": 215 + }, + { + "epoch": 0.10044954270655712, + "grad_norm": 0.5233954191207886, + "learning_rate": 3.3488372093023258e-06, + "loss": 0.4574, + "step": 216 + }, + { + "epoch": 0.10091458688575415, + "grad_norm": 0.4142616093158722, + "learning_rate": 3.364341085271318e-06, + "loss": 0.4304, + "step": 217 + }, + { + "epoch": 0.10137963106495117, + "grad_norm": 0.4118039309978485, + "learning_rate": 3.3798449612403106e-06, + "loss": 0.4272, + "step": 218 + }, + { + "epoch": 0.1018446752441482, + "grad_norm": 0.4856218099594116, + "learning_rate": 3.3953488372093026e-06, + "loss": 0.4834, + "step": 219 + }, + { + "epoch": 0.10230971942334521, + "grad_norm": 0.4576835036277771, + "learning_rate": 3.4108527131782946e-06, + "loss": 0.4216, + "step": 220 + }, + { + "epoch": 0.10277476360254224, + "grad_norm": 0.5164148211479187, + "learning_rate": 3.426356589147287e-06, + "loss": 0.4545, + "step": 221 + }, + { + "epoch": 0.10323980778173926, + "grad_norm": 0.4515243470668793, + "learning_rate": 3.4418604651162795e-06, + "loss": 0.4515, + "step": 222 + }, + { + "epoch": 0.10370485196093629, + "grad_norm": 0.4391280710697174, + "learning_rate": 3.4573643410852715e-06, + "loss": 0.4466, + "step": 223 + }, + { + "epoch": 0.10416989614013332, + "grad_norm": 0.41174498200416565, + "learning_rate": 3.472868217054264e-06, + "loss": 0.4424, + "step": 224 + }, + { + "epoch": 0.10463494031933034, + "grad_norm": 0.45546606183052063, + "learning_rate": 3.4883720930232564e-06, + "loss": 0.481, + "step": 225 + }, + { + "epoch": 0.10509998449852737, + "grad_norm": 0.4433421194553375, + "learning_rate": 3.5038759689922484e-06, + "loss": 0.4586, + "step": 226 + }, + { + "epoch": 0.10556502867772438, + "grad_norm": 0.46707531809806824, + "learning_rate": 3.5193798449612404e-06, + "loss": 0.4627, + "step": 227 + }, + { + "epoch": 0.10603007285692141, + "grad_norm": 0.48355457186698914, + "learning_rate": 3.534883720930233e-06, + "loss": 0.4834, + "step": 228 + }, + { + "epoch": 0.10649511703611843, + "grad_norm": 0.5134936571121216, + "learning_rate": 3.5503875968992252e-06, + "loss": 0.4496, + "step": 229 + }, + { + "epoch": 0.10696016121531546, + "grad_norm": 0.4306149184703827, + "learning_rate": 3.5658914728682177e-06, + "loss": 0.4689, + "step": 230 + }, + { + "epoch": 0.10742520539451247, + "grad_norm": 0.4603032171726227, + "learning_rate": 3.5813953488372093e-06, + "loss": 0.4489, + "step": 231 + }, + { + "epoch": 0.1078902495737095, + "grad_norm": 0.46613025665283203, + "learning_rate": 3.5968992248062017e-06, + "loss": 0.4648, + "step": 232 + }, + { + "epoch": 0.10835529375290652, + "grad_norm": 0.497821182012558, + "learning_rate": 3.612403100775194e-06, + "loss": 0.4454, + "step": 233 + }, + { + "epoch": 0.10882033793210355, + "grad_norm": 0.44097408652305603, + "learning_rate": 3.6279069767441866e-06, + "loss": 0.4443, + "step": 234 + }, + { + "epoch": 0.10928538211130058, + "grad_norm": 0.4849492013454437, + "learning_rate": 3.6434108527131786e-06, + "loss": 0.4546, + "step": 235 + }, + { + "epoch": 0.1097504262904976, + "grad_norm": 0.4576839506626129, + "learning_rate": 3.658914728682171e-06, + "loss": 0.44, + "step": 236 + }, + { + "epoch": 0.11021547046969463, + "grad_norm": 0.44836199283599854, + "learning_rate": 3.674418604651163e-06, + "loss": 0.4779, + "step": 237 + }, + { + "epoch": 0.11068051464889164, + "grad_norm": 0.46997904777526855, + "learning_rate": 3.6899224806201554e-06, + "loss": 0.4755, + "step": 238 + }, + { + "epoch": 0.11114555882808867, + "grad_norm": 0.4699132740497589, + "learning_rate": 3.7054263565891474e-06, + "loss": 0.4279, + "step": 239 + }, + { + "epoch": 0.11161060300728569, + "grad_norm": 0.4158969819545746, + "learning_rate": 3.72093023255814e-06, + "loss": 0.4462, + "step": 240 + }, + { + "epoch": 0.11207564718648272, + "grad_norm": 0.4518444240093231, + "learning_rate": 3.7364341085271323e-06, + "loss": 0.4419, + "step": 241 + }, + { + "epoch": 0.11254069136567973, + "grad_norm": 0.43959298729896545, + "learning_rate": 3.7519379844961247e-06, + "loss": 0.4277, + "step": 242 + }, + { + "epoch": 0.11300573554487676, + "grad_norm": 0.46462079882621765, + "learning_rate": 3.7674418604651163e-06, + "loss": 0.4666, + "step": 243 + }, + { + "epoch": 0.11347077972407378, + "grad_norm": 0.44643181562423706, + "learning_rate": 3.7829457364341087e-06, + "loss": 0.4542, + "step": 244 + }, + { + "epoch": 0.11393582390327081, + "grad_norm": 0.4474817216396332, + "learning_rate": 3.798449612403101e-06, + "loss": 0.4457, + "step": 245 + }, + { + "epoch": 0.11440086808246784, + "grad_norm": 0.440893292427063, + "learning_rate": 3.8139534883720936e-06, + "loss": 0.4567, + "step": 246 + }, + { + "epoch": 0.11486591226166486, + "grad_norm": 0.5157288908958435, + "learning_rate": 3.829457364341085e-06, + "loss": 0.4668, + "step": 247 + }, + { + "epoch": 0.11533095644086189, + "grad_norm": 0.41228559613227844, + "learning_rate": 3.844961240310078e-06, + "loss": 0.4532, + "step": 248 + }, + { + "epoch": 0.1157960006200589, + "grad_norm": 0.49492067098617554, + "learning_rate": 3.86046511627907e-06, + "loss": 0.4717, + "step": 249 + }, + { + "epoch": 0.11626104479925593, + "grad_norm": 0.3908706605434418, + "learning_rate": 3.875968992248063e-06, + "loss": 0.4363, + "step": 250 + }, + { + "epoch": 0.11672608897845295, + "grad_norm": 0.4512355625629425, + "learning_rate": 3.891472868217054e-06, + "loss": 0.461, + "step": 251 + }, + { + "epoch": 0.11719113315764998, + "grad_norm": 0.45557087659835815, + "learning_rate": 3.906976744186047e-06, + "loss": 0.4479, + "step": 252 + }, + { + "epoch": 0.117656177336847, + "grad_norm": 0.42376312613487244, + "learning_rate": 3.922480620155039e-06, + "loss": 0.4614, + "step": 253 + }, + { + "epoch": 0.11812122151604403, + "grad_norm": 0.494684636592865, + "learning_rate": 3.937984496124032e-06, + "loss": 0.4528, + "step": 254 + }, + { + "epoch": 0.11858626569524104, + "grad_norm": 0.46942609548568726, + "learning_rate": 3.953488372093024e-06, + "loss": 0.4363, + "step": 255 + }, + { + "epoch": 0.11905130987443807, + "grad_norm": 0.4552278518676758, + "learning_rate": 3.968992248062016e-06, + "loss": 0.4462, + "step": 256 + }, + { + "epoch": 0.1195163540536351, + "grad_norm": 0.4802659749984741, + "learning_rate": 3.984496124031008e-06, + "loss": 0.4694, + "step": 257 + }, + { + "epoch": 0.11998139823283212, + "grad_norm": 0.49204587936401367, + "learning_rate": 4.000000000000001e-06, + "loss": 0.449, + "step": 258 + }, + { + "epoch": 0.12044644241202915, + "grad_norm": 0.4670722484588623, + "learning_rate": 4.015503875968993e-06, + "loss": 0.4624, + "step": 259 + }, + { + "epoch": 0.12091148659122616, + "grad_norm": 0.4534282386302948, + "learning_rate": 4.031007751937985e-06, + "loss": 0.4338, + "step": 260 + }, + { + "epoch": 0.1213765307704232, + "grad_norm": 0.4467426538467407, + "learning_rate": 4.0465116279069775e-06, + "loss": 0.4513, + "step": 261 + }, + { + "epoch": 0.12184157494962021, + "grad_norm": 0.4626162648200989, + "learning_rate": 4.0620155038759695e-06, + "loss": 0.4714, + "step": 262 + }, + { + "epoch": 0.12230661912881724, + "grad_norm": 0.45661380887031555, + "learning_rate": 4.0775193798449616e-06, + "loss": 0.4559, + "step": 263 + }, + { + "epoch": 0.12277166330801426, + "grad_norm": 0.5042157769203186, + "learning_rate": 4.0930232558139536e-06, + "loss": 0.4717, + "step": 264 + }, + { + "epoch": 0.12323670748721129, + "grad_norm": 0.5059670805931091, + "learning_rate": 4.108527131782946e-06, + "loss": 0.4517, + "step": 265 + }, + { + "epoch": 0.1237017516664083, + "grad_norm": 0.4609099328517914, + "learning_rate": 4.124031007751938e-06, + "loss": 0.4458, + "step": 266 + }, + { + "epoch": 0.12416679584560533, + "grad_norm": 0.5168102979660034, + "learning_rate": 4.1395348837209304e-06, + "loss": 0.4068, + "step": 267 + }, + { + "epoch": 0.12463184002480236, + "grad_norm": 0.4813331961631775, + "learning_rate": 4.1550387596899224e-06, + "loss": 0.4364, + "step": 268 + }, + { + "epoch": 0.1250968842039994, + "grad_norm": 0.4824882745742798, + "learning_rate": 4.170542635658915e-06, + "loss": 0.4726, + "step": 269 + }, + { + "epoch": 0.1255619283831964, + "grad_norm": 0.4692811965942383, + "learning_rate": 4.186046511627907e-06, + "loss": 0.462, + "step": 270 + }, + { + "epoch": 0.12602697256239342, + "grad_norm": 0.473664253950119, + "learning_rate": 4.201550387596899e-06, + "loss": 0.4681, + "step": 271 + }, + { + "epoch": 0.12649201674159044, + "grad_norm": 0.42069149017333984, + "learning_rate": 4.217054263565892e-06, + "loss": 0.4428, + "step": 272 + }, + { + "epoch": 0.12695706092078748, + "grad_norm": 0.4619064927101135, + "learning_rate": 4.232558139534884e-06, + "loss": 0.4677, + "step": 273 + }, + { + "epoch": 0.1274221050999845, + "grad_norm": 0.5394362807273865, + "learning_rate": 4.248062015503876e-06, + "loss": 0.4262, + "step": 274 + }, + { + "epoch": 0.12788714927918152, + "grad_norm": 0.44251418113708496, + "learning_rate": 4.263565891472868e-06, + "loss": 0.4826, + "step": 275 + }, + { + "epoch": 0.12835219345837853, + "grad_norm": 0.40653759241104126, + "learning_rate": 4.279069767441861e-06, + "loss": 0.4229, + "step": 276 + }, + { + "epoch": 0.12881723763757558, + "grad_norm": 0.5286734104156494, + "learning_rate": 4.294573643410853e-06, + "loss": 0.4283, + "step": 277 + }, + { + "epoch": 0.1292822818167726, + "grad_norm": 0.47520795464515686, + "learning_rate": 4.310077519379846e-06, + "loss": 0.4462, + "step": 278 + }, + { + "epoch": 0.1297473259959696, + "grad_norm": 0.40976548194885254, + "learning_rate": 4.325581395348837e-06, + "loss": 0.4408, + "step": 279 + }, + { + "epoch": 0.13021237017516665, + "grad_norm": 0.46813562512397766, + "learning_rate": 4.34108527131783e-06, + "loss": 0.4555, + "step": 280 + }, + { + "epoch": 0.13067741435436367, + "grad_norm": 0.4154888689517975, + "learning_rate": 4.356589147286822e-06, + "loss": 0.4492, + "step": 281 + }, + { + "epoch": 0.13114245853356069, + "grad_norm": 0.5551090240478516, + "learning_rate": 4.372093023255815e-06, + "loss": 0.4811, + "step": 282 + }, + { + "epoch": 0.1316075027127577, + "grad_norm": 0.47705549001693726, + "learning_rate": 4.387596899224806e-06, + "loss": 0.436, + "step": 283 + }, + { + "epoch": 0.13207254689195475, + "grad_norm": 0.4338889718055725, + "learning_rate": 4.403100775193799e-06, + "loss": 0.4636, + "step": 284 + }, + { + "epoch": 0.13253759107115176, + "grad_norm": 0.49717769026756287, + "learning_rate": 4.418604651162791e-06, + "loss": 0.4257, + "step": 285 + }, + { + "epoch": 0.13300263525034878, + "grad_norm": 0.4574042558670044, + "learning_rate": 4.434108527131784e-06, + "loss": 0.4135, + "step": 286 + }, + { + "epoch": 0.1334676794295458, + "grad_norm": 0.4649648368358612, + "learning_rate": 4.449612403100776e-06, + "loss": 0.4557, + "step": 287 + }, + { + "epoch": 0.13393272360874284, + "grad_norm": 0.43360039591789246, + "learning_rate": 4.465116279069768e-06, + "loss": 0.423, + "step": 288 + }, + { + "epoch": 0.13439776778793985, + "grad_norm": 0.43359556794166565, + "learning_rate": 4.4806201550387605e-06, + "loss": 0.4346, + "step": 289 + }, + { + "epoch": 0.13486281196713687, + "grad_norm": 0.4726753532886505, + "learning_rate": 4.4961240310077525e-06, + "loss": 0.4376, + "step": 290 + }, + { + "epoch": 0.1353278561463339, + "grad_norm": 0.43972247838974, + "learning_rate": 4.5116279069767445e-06, + "loss": 0.4422, + "step": 291 + }, + { + "epoch": 0.13579290032553093, + "grad_norm": 0.4216838479042053, + "learning_rate": 4.5271317829457366e-06, + "loss": 0.4275, + "step": 292 + }, + { + "epoch": 0.13625794450472795, + "grad_norm": 0.4523601830005646, + "learning_rate": 4.542635658914729e-06, + "loss": 0.4524, + "step": 293 + }, + { + "epoch": 0.13672298868392496, + "grad_norm": 0.4428309500217438, + "learning_rate": 4.558139534883721e-06, + "loss": 0.4247, + "step": 294 + }, + { + "epoch": 0.137188032863122, + "grad_norm": 0.42991116642951965, + "learning_rate": 4.573643410852713e-06, + "loss": 0.4232, + "step": 295 + }, + { + "epoch": 0.13765307704231902, + "grad_norm": 0.40712928771972656, + "learning_rate": 4.5891472868217054e-06, + "loss": 0.4426, + "step": 296 + }, + { + "epoch": 0.13811812122151604, + "grad_norm": 0.45721739530563354, + "learning_rate": 4.604651162790698e-06, + "loss": 0.4666, + "step": 297 + }, + { + "epoch": 0.13858316540071305, + "grad_norm": 0.41279730200767517, + "learning_rate": 4.62015503875969e-06, + "loss": 0.4258, + "step": 298 + }, + { + "epoch": 0.1390482095799101, + "grad_norm": 0.47948822379112244, + "learning_rate": 4.635658914728682e-06, + "loss": 0.4285, + "step": 299 + }, + { + "epoch": 0.13951325375910711, + "grad_norm": 0.45233485102653503, + "learning_rate": 4.651162790697675e-06, + "loss": 0.4522, + "step": 300 + }, + { + "epoch": 0.13997829793830413, + "grad_norm": 0.4481063783168793, + "learning_rate": 4.666666666666667e-06, + "loss": 0.4332, + "step": 301 + }, + { + "epoch": 0.14044334211750117, + "grad_norm": 0.4825620651245117, + "learning_rate": 4.682170542635659e-06, + "loss": 0.4643, + "step": 302 + }, + { + "epoch": 0.1409083862966982, + "grad_norm": 0.4766821265220642, + "learning_rate": 4.697674418604651e-06, + "loss": 0.4433, + "step": 303 + }, + { + "epoch": 0.1413734304758952, + "grad_norm": 0.46264582872390747, + "learning_rate": 4.713178294573644e-06, + "loss": 0.4409, + "step": 304 + }, + { + "epoch": 0.14183847465509222, + "grad_norm": 0.4554003179073334, + "learning_rate": 4.728682170542636e-06, + "loss": 0.4215, + "step": 305 + }, + { + "epoch": 0.14230351883428927, + "grad_norm": 0.45970794558525085, + "learning_rate": 4.744186046511629e-06, + "loss": 0.4259, + "step": 306 + }, + { + "epoch": 0.14276856301348628, + "grad_norm": 0.4238308370113373, + "learning_rate": 4.75968992248062e-06, + "loss": 0.4723, + "step": 307 + }, + { + "epoch": 0.1432336071926833, + "grad_norm": 0.4310249090194702, + "learning_rate": 4.775193798449613e-06, + "loss": 0.4526, + "step": 308 + }, + { + "epoch": 0.14369865137188031, + "grad_norm": 0.4667305648326874, + "learning_rate": 4.790697674418605e-06, + "loss": 0.4631, + "step": 309 + }, + { + "epoch": 0.14416369555107736, + "grad_norm": 0.465160071849823, + "learning_rate": 4.806201550387598e-06, + "loss": 0.4399, + "step": 310 + }, + { + "epoch": 0.14462873973027437, + "grad_norm": 0.44416457414627075, + "learning_rate": 4.821705426356589e-06, + "loss": 0.4398, + "step": 311 + }, + { + "epoch": 0.1450937839094714, + "grad_norm": 0.44952088594436646, + "learning_rate": 4.837209302325582e-06, + "loss": 0.4311, + "step": 312 + }, + { + "epoch": 0.14555882808866843, + "grad_norm": 0.44717299938201904, + "learning_rate": 4.852713178294574e-06, + "loss": 0.4541, + "step": 313 + }, + { + "epoch": 0.14602387226786545, + "grad_norm": 0.4240880310535431, + "learning_rate": 4.868217054263567e-06, + "loss": 0.4496, + "step": 314 + }, + { + "epoch": 0.14648891644706247, + "grad_norm": 0.46114540100097656, + "learning_rate": 4.883720930232559e-06, + "loss": 0.4493, + "step": 315 + }, + { + "epoch": 0.14695396062625948, + "grad_norm": 0.4437052309513092, + "learning_rate": 4.899224806201551e-06, + "loss": 0.435, + "step": 316 + }, + { + "epoch": 0.14741900480545653, + "grad_norm": 0.458609938621521, + "learning_rate": 4.9147286821705435e-06, + "loss": 0.4451, + "step": 317 + }, + { + "epoch": 0.14788404898465354, + "grad_norm": 0.4837147295475006, + "learning_rate": 4.9302325581395355e-06, + "loss": 0.4383, + "step": 318 + }, + { + "epoch": 0.14834909316385056, + "grad_norm": 0.4129205048084259, + "learning_rate": 4.9457364341085275e-06, + "loss": 0.4556, + "step": 319 + }, + { + "epoch": 0.1488141373430476, + "grad_norm": 0.5567089319229126, + "learning_rate": 4.9612403100775195e-06, + "loss": 0.4308, + "step": 320 + }, + { + "epoch": 0.14927918152224462, + "grad_norm": 0.47763490676879883, + "learning_rate": 4.976744186046512e-06, + "loss": 0.4492, + "step": 321 + }, + { + "epoch": 0.14974422570144164, + "grad_norm": 0.4176934063434601, + "learning_rate": 4.992248062015504e-06, + "loss": 0.4239, + "step": 322 + }, + { + "epoch": 0.15020926988063865, + "grad_norm": 0.5204504728317261, + "learning_rate": 5.007751937984496e-06, + "loss": 0.4357, + "step": 323 + }, + { + "epoch": 0.1506743140598357, + "grad_norm": 0.4915800988674164, + "learning_rate": 5.023255813953489e-06, + "loss": 0.4803, + "step": 324 + }, + { + "epoch": 0.1511393582390327, + "grad_norm": 0.4427052438259125, + "learning_rate": 5.038759689922481e-06, + "loss": 0.4279, + "step": 325 + }, + { + "epoch": 0.15160440241822973, + "grad_norm": 0.5173365473747253, + "learning_rate": 5.054263565891473e-06, + "loss": 0.4304, + "step": 326 + }, + { + "epoch": 0.15206944659742674, + "grad_norm": 0.4465036988258362, + "learning_rate": 5.069767441860466e-06, + "loss": 0.4288, + "step": 327 + }, + { + "epoch": 0.1525344907766238, + "grad_norm": 0.4534618556499481, + "learning_rate": 5.085271317829457e-06, + "loss": 0.4308, + "step": 328 + }, + { + "epoch": 0.1529995349558208, + "grad_norm": 0.42236584424972534, + "learning_rate": 5.100775193798449e-06, + "loss": 0.4221, + "step": 329 + }, + { + "epoch": 0.15346457913501782, + "grad_norm": 0.44343680143356323, + "learning_rate": 5.116279069767442e-06, + "loss": 0.4384, + "step": 330 + }, + { + "epoch": 0.15392962331421486, + "grad_norm": 0.45353367924690247, + "learning_rate": 5.131782945736434e-06, + "loss": 0.4425, + "step": 331 + }, + { + "epoch": 0.15439466749341188, + "grad_norm": 0.4323454797267914, + "learning_rate": 5.147286821705427e-06, + "loss": 0.4398, + "step": 332 + }, + { + "epoch": 0.1548597116726089, + "grad_norm": 0.4849368929862976, + "learning_rate": 5.162790697674419e-06, + "loss": 0.4586, + "step": 333 + }, + { + "epoch": 0.1553247558518059, + "grad_norm": 0.430549681186676, + "learning_rate": 5.178294573643411e-06, + "loss": 0.4308, + "step": 334 + }, + { + "epoch": 0.15578980003100296, + "grad_norm": 0.4729780852794647, + "learning_rate": 5.193798449612404e-06, + "loss": 0.4485, + "step": 335 + }, + { + "epoch": 0.15625484421019997, + "grad_norm": 0.45753175020217896, + "learning_rate": 5.209302325581396e-06, + "loss": 0.4205, + "step": 336 + }, + { + "epoch": 0.156719888389397, + "grad_norm": 0.44031214714050293, + "learning_rate": 5.224806201550388e-06, + "loss": 0.4309, + "step": 337 + }, + { + "epoch": 0.157184932568594, + "grad_norm": 0.5231649279594421, + "learning_rate": 5.240310077519381e-06, + "loss": 0.429, + "step": 338 + }, + { + "epoch": 0.15764997674779105, + "grad_norm": 0.48316752910614014, + "learning_rate": 5.255813953488372e-06, + "loss": 0.4651, + "step": 339 + }, + { + "epoch": 0.15811502092698806, + "grad_norm": 0.4905528426170349, + "learning_rate": 5.271317829457366e-06, + "loss": 0.4233, + "step": 340 + }, + { + "epoch": 0.15858006510618508, + "grad_norm": 0.44905000925064087, + "learning_rate": 5.286821705426357e-06, + "loss": 0.4741, + "step": 341 + }, + { + "epoch": 0.15904510928538212, + "grad_norm": 0.4897783696651459, + "learning_rate": 5.302325581395349e-06, + "loss": 0.4568, + "step": 342 + }, + { + "epoch": 0.15951015346457914, + "grad_norm": 0.4445964992046356, + "learning_rate": 5.317829457364342e-06, + "loss": 0.4294, + "step": 343 + }, + { + "epoch": 0.15997519764377616, + "grad_norm": 0.48060405254364014, + "learning_rate": 5.333333333333334e-06, + "loss": 0.4612, + "step": 344 + }, + { + "epoch": 0.16044024182297317, + "grad_norm": 0.4548921585083008, + "learning_rate": 5.348837209302326e-06, + "loss": 0.4569, + "step": 345 + }, + { + "epoch": 0.16090528600217022, + "grad_norm": 0.47072821855545044, + "learning_rate": 5.3643410852713185e-06, + "loss": 0.4403, + "step": 346 + }, + { + "epoch": 0.16137033018136723, + "grad_norm": 0.40528589487075806, + "learning_rate": 5.3798449612403105e-06, + "loss": 0.4524, + "step": 347 + }, + { + "epoch": 0.16183537436056425, + "grad_norm": 0.4871981739997864, + "learning_rate": 5.395348837209303e-06, + "loss": 0.4402, + "step": 348 + }, + { + "epoch": 0.16230041853976127, + "grad_norm": 0.4808438718318939, + "learning_rate": 5.410852713178295e-06, + "loss": 0.475, + "step": 349 + }, + { + "epoch": 0.1627654627189583, + "grad_norm": 0.4704870283603668, + "learning_rate": 5.4263565891472865e-06, + "loss": 0.432, + "step": 350 + }, + { + "epoch": 0.16323050689815533, + "grad_norm": 0.5283604860305786, + "learning_rate": 5.44186046511628e-06, + "loss": 0.4388, + "step": 351 + }, + { + "epoch": 0.16369555107735234, + "grad_norm": 0.4891282618045807, + "learning_rate": 5.457364341085271e-06, + "loss": 0.4561, + "step": 352 + }, + { + "epoch": 0.16416059525654939, + "grad_norm": 0.5148476362228394, + "learning_rate": 5.472868217054263e-06, + "loss": 0.4419, + "step": 353 + }, + { + "epoch": 0.1646256394357464, + "grad_norm": 0.43338412046432495, + "learning_rate": 5.488372093023256e-06, + "loss": 0.4295, + "step": 354 + }, + { + "epoch": 0.16509068361494342, + "grad_norm": 0.5159112811088562, + "learning_rate": 5.503875968992248e-06, + "loss": 0.4322, + "step": 355 + }, + { + "epoch": 0.16555572779414043, + "grad_norm": 0.4641515910625458, + "learning_rate": 5.519379844961241e-06, + "loss": 0.4303, + "step": 356 + }, + { + "epoch": 0.16602077197333748, + "grad_norm": 0.47173258662223816, + "learning_rate": 5.534883720930233e-06, + "loss": 0.4437, + "step": 357 + }, + { + "epoch": 0.1664858161525345, + "grad_norm": 0.5051222443580627, + "learning_rate": 5.550387596899225e-06, + "loss": 0.4265, + "step": 358 + }, + { + "epoch": 0.1669508603317315, + "grad_norm": 0.4815625250339508, + "learning_rate": 5.565891472868218e-06, + "loss": 0.4446, + "step": 359 + }, + { + "epoch": 0.16741590451092853, + "grad_norm": 0.5002831816673279, + "learning_rate": 5.58139534883721e-06, + "loss": 0.4428, + "step": 360 + }, + { + "epoch": 0.16788094869012557, + "grad_norm": 0.47330793738365173, + "learning_rate": 5.596899224806201e-06, + "loss": 0.4442, + "step": 361 + }, + { + "epoch": 0.16834599286932259, + "grad_norm": 0.4468974173069, + "learning_rate": 5.612403100775195e-06, + "loss": 0.4482, + "step": 362 + }, + { + "epoch": 0.1688110370485196, + "grad_norm": 0.46247512102127075, + "learning_rate": 5.627906976744186e-06, + "loss": 0.4534, + "step": 363 + }, + { + "epoch": 0.16927608122771665, + "grad_norm": 0.4856452941894531, + "learning_rate": 5.643410852713179e-06, + "loss": 0.4304, + "step": 364 + }, + { + "epoch": 0.16974112540691366, + "grad_norm": 0.48045817017555237, + "learning_rate": 5.658914728682171e-06, + "loss": 0.4347, + "step": 365 + }, + { + "epoch": 0.17020616958611068, + "grad_norm": 0.4817154109477997, + "learning_rate": 5.674418604651163e-06, + "loss": 0.455, + "step": 366 + }, + { + "epoch": 0.1706712137653077, + "grad_norm": 0.4779435396194458, + "learning_rate": 5.689922480620156e-06, + "loss": 0.398, + "step": 367 + }, + { + "epoch": 0.17113625794450474, + "grad_norm": 0.5880154371261597, + "learning_rate": 5.705426356589148e-06, + "loss": 0.4537, + "step": 368 + }, + { + "epoch": 0.17160130212370175, + "grad_norm": 0.53252112865448, + "learning_rate": 5.72093023255814e-06, + "loss": 0.446, + "step": 369 + }, + { + "epoch": 0.17206634630289877, + "grad_norm": 0.4977021813392639, + "learning_rate": 5.736434108527133e-06, + "loss": 0.4353, + "step": 370 + }, + { + "epoch": 0.1725313904820958, + "grad_norm": 0.5705452561378479, + "learning_rate": 5.751937984496125e-06, + "loss": 0.444, + "step": 371 + }, + { + "epoch": 0.17299643466129283, + "grad_norm": 0.4966931641101837, + "learning_rate": 5.7674418604651175e-06, + "loss": 0.4264, + "step": 372 + }, + { + "epoch": 0.17346147884048985, + "grad_norm": 0.5850646495819092, + "learning_rate": 5.782945736434109e-06, + "loss": 0.4223, + "step": 373 + }, + { + "epoch": 0.17392652301968686, + "grad_norm": 0.469243586063385, + "learning_rate": 5.798449612403101e-06, + "loss": 0.4206, + "step": 374 + }, + { + "epoch": 0.1743915671988839, + "grad_norm": 0.5241683125495911, + "learning_rate": 5.8139534883720935e-06, + "loss": 0.4422, + "step": 375 + }, + { + "epoch": 0.17485661137808092, + "grad_norm": 0.5510844588279724, + "learning_rate": 5.8294573643410855e-06, + "loss": 0.4335, + "step": 376 + }, + { + "epoch": 0.17532165555727794, + "grad_norm": 0.4422820508480072, + "learning_rate": 5.8449612403100775e-06, + "loss": 0.444, + "step": 377 + }, + { + "epoch": 0.17578669973647496, + "grad_norm": 0.4676722586154938, + "learning_rate": 5.86046511627907e-06, + "loss": 0.4477, + "step": 378 + }, + { + "epoch": 0.176251743915672, + "grad_norm": 0.41679903864860535, + "learning_rate": 5.875968992248062e-06, + "loss": 0.4085, + "step": 379 + }, + { + "epoch": 0.17671678809486902, + "grad_norm": 0.4864595830440521, + "learning_rate": 5.891472868217055e-06, + "loss": 0.4067, + "step": 380 + }, + { + "epoch": 0.17718183227406603, + "grad_norm": 0.47440460324287415, + "learning_rate": 5.906976744186047e-06, + "loss": 0.4496, + "step": 381 + }, + { + "epoch": 0.17764687645326305, + "grad_norm": 0.49285659193992615, + "learning_rate": 5.922480620155039e-06, + "loss": 0.4476, + "step": 382 + }, + { + "epoch": 0.1781119206324601, + "grad_norm": 0.4602559804916382, + "learning_rate": 5.937984496124032e-06, + "loss": 0.4111, + "step": 383 + }, + { + "epoch": 0.1785769648116571, + "grad_norm": 0.4376397728919983, + "learning_rate": 5.953488372093023e-06, + "loss": 0.4232, + "step": 384 + }, + { + "epoch": 0.17904200899085412, + "grad_norm": 0.4615304470062256, + "learning_rate": 5.968992248062015e-06, + "loss": 0.4413, + "step": 385 + }, + { + "epoch": 0.17950705317005117, + "grad_norm": 0.49313583970069885, + "learning_rate": 5.984496124031008e-06, + "loss": 0.4426, + "step": 386 + }, + { + "epoch": 0.17997209734924818, + "grad_norm": 0.4725257158279419, + "learning_rate": 6e-06, + "loss": 0.4515, + "step": 387 + }, + { + "epoch": 0.1804371415284452, + "grad_norm": 0.44713687896728516, + "learning_rate": 6.015503875968993e-06, + "loss": 0.4232, + "step": 388 + }, + { + "epoch": 0.18090218570764222, + "grad_norm": 0.444200724363327, + "learning_rate": 6.031007751937985e-06, + "loss": 0.4353, + "step": 389 + }, + { + "epoch": 0.18136722988683926, + "grad_norm": 0.4371640980243683, + "learning_rate": 6.046511627906977e-06, + "loss": 0.4257, + "step": 390 + }, + { + "epoch": 0.18183227406603628, + "grad_norm": 0.5189304947853088, + "learning_rate": 6.06201550387597e-06, + "loss": 0.4365, + "step": 391 + }, + { + "epoch": 0.1822973182452333, + "grad_norm": 0.4122354984283447, + "learning_rate": 6.077519379844962e-06, + "loss": 0.4226, + "step": 392 + }, + { + "epoch": 0.1827623624244303, + "grad_norm": 0.47240152955055237, + "learning_rate": 6.093023255813954e-06, + "loss": 0.4387, + "step": 393 + }, + { + "epoch": 0.18322740660362735, + "grad_norm": 0.46310725808143616, + "learning_rate": 6.108527131782947e-06, + "loss": 0.4572, + "step": 394 + }, + { + "epoch": 0.18369245078282437, + "grad_norm": 0.4549012780189514, + "learning_rate": 6.124031007751938e-06, + "loss": 0.4321, + "step": 395 + }, + { + "epoch": 0.18415749496202138, + "grad_norm": 0.5056280493736267, + "learning_rate": 6.139534883720932e-06, + "loss": 0.4233, + "step": 396 + }, + { + "epoch": 0.18462253914121843, + "grad_norm": 0.4270991086959839, + "learning_rate": 6.155038759689923e-06, + "loss": 0.4204, + "step": 397 + }, + { + "epoch": 0.18508758332041544, + "grad_norm": 0.47168463468551636, + "learning_rate": 6.170542635658915e-06, + "loss": 0.3967, + "step": 398 + }, + { + "epoch": 0.18555262749961246, + "grad_norm": 0.5067258477210999, + "learning_rate": 6.186046511627908e-06, + "loss": 0.4273, + "step": 399 + }, + { + "epoch": 0.18601767167880948, + "grad_norm": 0.46395808458328247, + "learning_rate": 6.2015503875969e-06, + "loss": 0.4256, + "step": 400 + }, + { + "epoch": 0.18648271585800652, + "grad_norm": 0.46003836393356323, + "learning_rate": 6.217054263565892e-06, + "loss": 0.41, + "step": 401 + }, + { + "epoch": 0.18694776003720354, + "grad_norm": 0.5008445382118225, + "learning_rate": 6.2325581395348845e-06, + "loss": 0.3952, + "step": 402 + }, + { + "epoch": 0.18741280421640055, + "grad_norm": 0.5617318153381348, + "learning_rate": 6.2480620155038765e-06, + "loss": 0.4104, + "step": 403 + }, + { + "epoch": 0.18787784839559757, + "grad_norm": 0.49167153239250183, + "learning_rate": 6.263565891472869e-06, + "loss": 0.4252, + "step": 404 + }, + { + "epoch": 0.1883428925747946, + "grad_norm": 0.4648562967777252, + "learning_rate": 6.279069767441861e-06, + "loss": 0.4485, + "step": 405 + }, + { + "epoch": 0.18880793675399163, + "grad_norm": 0.48742571473121643, + "learning_rate": 6.2945736434108525e-06, + "loss": 0.4088, + "step": 406 + }, + { + "epoch": 0.18927298093318864, + "grad_norm": 0.4988233745098114, + "learning_rate": 6.310077519379845e-06, + "loss": 0.4159, + "step": 407 + }, + { + "epoch": 0.1897380251123857, + "grad_norm": 0.45389029383659363, + "learning_rate": 6.325581395348837e-06, + "loss": 0.45, + "step": 408 + }, + { + "epoch": 0.1902030692915827, + "grad_norm": 0.48481884598731995, + "learning_rate": 6.341085271317829e-06, + "loss": 0.4327, + "step": 409 + }, + { + "epoch": 0.19066811347077972, + "grad_norm": 0.5068842768669128, + "learning_rate": 6.356589147286822e-06, + "loss": 0.4068, + "step": 410 + }, + { + "epoch": 0.19113315764997674, + "grad_norm": 0.5590021014213562, + "learning_rate": 6.372093023255814e-06, + "loss": 0.4622, + "step": 411 + }, + { + "epoch": 0.19159820182917378, + "grad_norm": 0.4978815019130707, + "learning_rate": 6.387596899224807e-06, + "loss": 0.4285, + "step": 412 + }, + { + "epoch": 0.1920632460083708, + "grad_norm": 0.48937058448791504, + "learning_rate": 6.403100775193799e-06, + "loss": 0.432, + "step": 413 + }, + { + "epoch": 0.1925282901875678, + "grad_norm": 0.47953492403030396, + "learning_rate": 6.418604651162791e-06, + "loss": 0.428, + "step": 414 + }, + { + "epoch": 0.19299333436676483, + "grad_norm": 0.5264999270439148, + "learning_rate": 6.434108527131784e-06, + "loss": 0.4338, + "step": 415 + }, + { + "epoch": 0.19345837854596187, + "grad_norm": 0.5225833058357239, + "learning_rate": 6.449612403100776e-06, + "loss": 0.4423, + "step": 416 + }, + { + "epoch": 0.1939234227251589, + "grad_norm": 0.43435758352279663, + "learning_rate": 6.465116279069767e-06, + "loss": 0.4034, + "step": 417 + }, + { + "epoch": 0.1943884669043559, + "grad_norm": 0.4764443337917328, + "learning_rate": 6.48062015503876e-06, + "loss": 0.4623, + "step": 418 + }, + { + "epoch": 0.19485351108355295, + "grad_norm": 0.49896514415740967, + "learning_rate": 6.496124031007752e-06, + "loss": 0.4413, + "step": 419 + }, + { + "epoch": 0.19531855526274997, + "grad_norm": 0.46910813450813293, + "learning_rate": 6.511627906976745e-06, + "loss": 0.4089, + "step": 420 + }, + { + "epoch": 0.19578359944194698, + "grad_norm": 0.46757951378822327, + "learning_rate": 6.527131782945737e-06, + "loss": 0.42, + "step": 421 + }, + { + "epoch": 0.196248643621144, + "grad_norm": 0.4977341890335083, + "learning_rate": 6.542635658914729e-06, + "loss": 0.4167, + "step": 422 + }, + { + "epoch": 0.19671368780034104, + "grad_norm": 0.4749394357204437, + "learning_rate": 6.558139534883722e-06, + "loss": 0.4044, + "step": 423 + }, + { + "epoch": 0.19717873197953806, + "grad_norm": 0.461539626121521, + "learning_rate": 6.573643410852714e-06, + "loss": 0.4022, + "step": 424 + }, + { + "epoch": 0.19764377615873507, + "grad_norm": 0.5062358379364014, + "learning_rate": 6.589147286821706e-06, + "loss": 0.435, + "step": 425 + }, + { + "epoch": 0.1981088203379321, + "grad_norm": 0.42745712399482727, + "learning_rate": 6.604651162790699e-06, + "loss": 0.4393, + "step": 426 + }, + { + "epoch": 0.19857386451712913, + "grad_norm": 0.43482205271720886, + "learning_rate": 6.620155038759691e-06, + "loss": 0.4132, + "step": 427 + }, + { + "epoch": 0.19903890869632615, + "grad_norm": 0.452120840549469, + "learning_rate": 6.6356589147286835e-06, + "loss": 0.4207, + "step": 428 + }, + { + "epoch": 0.19950395287552317, + "grad_norm": 0.453553169965744, + "learning_rate": 6.651162790697675e-06, + "loss": 0.4211, + "step": 429 + }, + { + "epoch": 0.1999689970547202, + "grad_norm": 0.49294397234916687, + "learning_rate": 6.666666666666667e-06, + "loss": 0.4354, + "step": 430 + }, + { + "epoch": 0.20043404123391723, + "grad_norm": 0.46980249881744385, + "learning_rate": 6.6821705426356595e-06, + "loss": 0.4315, + "step": 431 + }, + { + "epoch": 0.20089908541311424, + "grad_norm": 0.48995155096054077, + "learning_rate": 6.6976744186046515e-06, + "loss": 0.4428, + "step": 432 + }, + { + "epoch": 0.20136412959231126, + "grad_norm": 0.4816022515296936, + "learning_rate": 6.7131782945736435e-06, + "loss": 0.4411, + "step": 433 + }, + { + "epoch": 0.2018291737715083, + "grad_norm": 0.49030253291130066, + "learning_rate": 6.728682170542636e-06, + "loss": 0.409, + "step": 434 + }, + { + "epoch": 0.20229421795070532, + "grad_norm": 0.5009063482284546, + "learning_rate": 6.744186046511628e-06, + "loss": 0.4306, + "step": 435 + }, + { + "epoch": 0.20275926212990233, + "grad_norm": 0.49570536613464355, + "learning_rate": 6.759689922480621e-06, + "loss": 0.4166, + "step": 436 + }, + { + "epoch": 0.20322430630909935, + "grad_norm": 0.5443177223205566, + "learning_rate": 6.775193798449613e-06, + "loss": 0.4188, + "step": 437 + }, + { + "epoch": 0.2036893504882964, + "grad_norm": 0.46004390716552734, + "learning_rate": 6.790697674418605e-06, + "loss": 0.4149, + "step": 438 + }, + { + "epoch": 0.2041543946674934, + "grad_norm": 0.5454478859901428, + "learning_rate": 6.806201550387598e-06, + "loss": 0.4153, + "step": 439 + }, + { + "epoch": 0.20461943884669043, + "grad_norm": 0.6096868515014648, + "learning_rate": 6.821705426356589e-06, + "loss": 0.4373, + "step": 440 + }, + { + "epoch": 0.20508448302588747, + "grad_norm": 0.4331457018852234, + "learning_rate": 6.837209302325581e-06, + "loss": 0.4273, + "step": 441 + }, + { + "epoch": 0.2055495272050845, + "grad_norm": 0.5101227760314941, + "learning_rate": 6.852713178294574e-06, + "loss": 0.4174, + "step": 442 + }, + { + "epoch": 0.2060145713842815, + "grad_norm": 0.47638994455337524, + "learning_rate": 6.868217054263566e-06, + "loss": 0.4332, + "step": 443 + }, + { + "epoch": 0.20647961556347852, + "grad_norm": 0.48688259720802307, + "learning_rate": 6.883720930232559e-06, + "loss": 0.4269, + "step": 444 + }, + { + "epoch": 0.20694465974267556, + "grad_norm": 0.5044205188751221, + "learning_rate": 6.899224806201551e-06, + "loss": 0.4269, + "step": 445 + }, + { + "epoch": 0.20740970392187258, + "grad_norm": 0.5487541556358337, + "learning_rate": 6.914728682170543e-06, + "loss": 0.4411, + "step": 446 + }, + { + "epoch": 0.2078747481010696, + "grad_norm": 0.47763657569885254, + "learning_rate": 6.930232558139536e-06, + "loss": 0.4382, + "step": 447 + }, + { + "epoch": 0.20833979228026664, + "grad_norm": 0.49321773648262024, + "learning_rate": 6.945736434108528e-06, + "loss": 0.4441, + "step": 448 + }, + { + "epoch": 0.20880483645946366, + "grad_norm": 0.4817297160625458, + "learning_rate": 6.961240310077519e-06, + "loss": 0.4286, + "step": 449 + }, + { + "epoch": 0.20926988063866067, + "grad_norm": 0.5427500605583191, + "learning_rate": 6.976744186046513e-06, + "loss": 0.4021, + "step": 450 + }, + { + "epoch": 0.2097349248178577, + "grad_norm": 0.5691028237342834, + "learning_rate": 6.992248062015504e-06, + "loss": 0.4075, + "step": 451 + }, + { + "epoch": 0.21019996899705473, + "grad_norm": 0.4561224579811096, + "learning_rate": 7.007751937984497e-06, + "loss": 0.4075, + "step": 452 + }, + { + "epoch": 0.21066501317625175, + "grad_norm": 0.5992524027824402, + "learning_rate": 7.023255813953489e-06, + "loss": 0.4214, + "step": 453 + }, + { + "epoch": 0.21113005735544876, + "grad_norm": 0.6167622208595276, + "learning_rate": 7.038759689922481e-06, + "loss": 0.3898, + "step": 454 + }, + { + "epoch": 0.21159510153464578, + "grad_norm": 0.5301486253738403, + "learning_rate": 7.054263565891474e-06, + "loss": 0.4306, + "step": 455 + }, + { + "epoch": 0.21206014571384282, + "grad_norm": 0.6576060652732849, + "learning_rate": 7.069767441860466e-06, + "loss": 0.4162, + "step": 456 + }, + { + "epoch": 0.21252518989303984, + "grad_norm": 0.5479761958122253, + "learning_rate": 7.085271317829458e-06, + "loss": 0.4585, + "step": 457 + }, + { + "epoch": 0.21299023407223686, + "grad_norm": 0.47175103425979614, + "learning_rate": 7.1007751937984505e-06, + "loss": 0.4103, + "step": 458 + }, + { + "epoch": 0.2134552782514339, + "grad_norm": 0.5980085730552673, + "learning_rate": 7.1162790697674425e-06, + "loss": 0.4308, + "step": 459 + }, + { + "epoch": 0.21392032243063092, + "grad_norm": 0.49691030383110046, + "learning_rate": 7.131782945736435e-06, + "loss": 0.4177, + "step": 460 + }, + { + "epoch": 0.21438536660982793, + "grad_norm": 0.4692418575286865, + "learning_rate": 7.147286821705427e-06, + "loss": 0.4184, + "step": 461 + }, + { + "epoch": 0.21485041078902495, + "grad_norm": 0.4812504053115845, + "learning_rate": 7.1627906976744185e-06, + "loss": 0.4077, + "step": 462 + }, + { + "epoch": 0.215315454968222, + "grad_norm": 0.47342851758003235, + "learning_rate": 7.178294573643411e-06, + "loss": 0.4117, + "step": 463 + }, + { + "epoch": 0.215780499147419, + "grad_norm": 0.5251373648643494, + "learning_rate": 7.193798449612403e-06, + "loss": 0.451, + "step": 464 + }, + { + "epoch": 0.21624554332661602, + "grad_norm": 0.4466230869293213, + "learning_rate": 7.209302325581395e-06, + "loss": 0.4364, + "step": 465 + }, + { + "epoch": 0.21671058750581304, + "grad_norm": 0.502120852470398, + "learning_rate": 7.224806201550388e-06, + "loss": 0.451, + "step": 466 + }, + { + "epoch": 0.21717563168501008, + "grad_norm": 0.5081861615180969, + "learning_rate": 7.24031007751938e-06, + "loss": 0.4521, + "step": 467 + }, + { + "epoch": 0.2176406758642071, + "grad_norm": 0.5037353038787842, + "learning_rate": 7.255813953488373e-06, + "loss": 0.4055, + "step": 468 + }, + { + "epoch": 0.21810572004340412, + "grad_norm": 0.48872482776641846, + "learning_rate": 7.271317829457365e-06, + "loss": 0.4353, + "step": 469 + }, + { + "epoch": 0.21857076422260116, + "grad_norm": 0.4901602268218994, + "learning_rate": 7.286821705426357e-06, + "loss": 0.4176, + "step": 470 + }, + { + "epoch": 0.21903580840179818, + "grad_norm": 0.4682338535785675, + "learning_rate": 7.30232558139535e-06, + "loss": 0.394, + "step": 471 + }, + { + "epoch": 0.2195008525809952, + "grad_norm": 0.5010263919830322, + "learning_rate": 7.317829457364342e-06, + "loss": 0.4417, + "step": 472 + }, + { + "epoch": 0.2199658967601922, + "grad_norm": 0.4669908285140991, + "learning_rate": 7.333333333333333e-06, + "loss": 0.395, + "step": 473 + }, + { + "epoch": 0.22043094093938925, + "grad_norm": 0.47183507680892944, + "learning_rate": 7.348837209302326e-06, + "loss": 0.4388, + "step": 474 + }, + { + "epoch": 0.22089598511858627, + "grad_norm": 0.49007242918014526, + "learning_rate": 7.364341085271318e-06, + "loss": 0.4062, + "step": 475 + }, + { + "epoch": 0.22136102929778328, + "grad_norm": 0.5357131361961365, + "learning_rate": 7.379844961240311e-06, + "loss": 0.4414, + "step": 476 + }, + { + "epoch": 0.2218260734769803, + "grad_norm": 0.5643349289894104, + "learning_rate": 7.395348837209303e-06, + "loss": 0.4158, + "step": 477 + }, + { + "epoch": 0.22229111765617734, + "grad_norm": 0.4729050099849701, + "learning_rate": 7.410852713178295e-06, + "loss": 0.4273, + "step": 478 + }, + { + "epoch": 0.22275616183537436, + "grad_norm": 0.485498309135437, + "learning_rate": 7.426356589147288e-06, + "loss": 0.408, + "step": 479 + }, + { + "epoch": 0.22322120601457138, + "grad_norm": 0.5941533446311951, + "learning_rate": 7.44186046511628e-06, + "loss": 0.4191, + "step": 480 + }, + { + "epoch": 0.22368625019376842, + "grad_norm": 0.4895523488521576, + "learning_rate": 7.457364341085272e-06, + "loss": 0.4311, + "step": 481 + }, + { + "epoch": 0.22415129437296544, + "grad_norm": 0.6243600249290466, + "learning_rate": 7.472868217054265e-06, + "loss": 0.4214, + "step": 482 + }, + { + "epoch": 0.22461633855216245, + "grad_norm": 0.45703691244125366, + "learning_rate": 7.488372093023256e-06, + "loss": 0.4056, + "step": 483 + }, + { + "epoch": 0.22508138273135947, + "grad_norm": 0.5641500353813171, + "learning_rate": 7.5038759689922495e-06, + "loss": 0.4293, + "step": 484 + }, + { + "epoch": 0.2255464269105565, + "grad_norm": 0.5574773550033569, + "learning_rate": 7.519379844961241e-06, + "loss": 0.4195, + "step": 485 + }, + { + "epoch": 0.22601147108975353, + "grad_norm": 0.5667125582695007, + "learning_rate": 7.534883720930233e-06, + "loss": 0.4228, + "step": 486 + }, + { + "epoch": 0.22647651526895055, + "grad_norm": 0.4847676455974579, + "learning_rate": 7.5503875968992255e-06, + "loss": 0.4179, + "step": 487 + }, + { + "epoch": 0.22694155944814756, + "grad_norm": 0.51683509349823, + "learning_rate": 7.5658914728682175e-06, + "loss": 0.4035, + "step": 488 + }, + { + "epoch": 0.2274066036273446, + "grad_norm": 0.5952682495117188, + "learning_rate": 7.5813953488372095e-06, + "loss": 0.4308, + "step": 489 + }, + { + "epoch": 0.22787164780654162, + "grad_norm": 0.5058827996253967, + "learning_rate": 7.596899224806202e-06, + "loss": 0.4276, + "step": 490 + }, + { + "epoch": 0.22833669198573864, + "grad_norm": 0.7033687233924866, + "learning_rate": 7.612403100775194e-06, + "loss": 0.4345, + "step": 491 + }, + { + "epoch": 0.22880173616493568, + "grad_norm": 0.5743710398674011, + "learning_rate": 7.627906976744187e-06, + "loss": 0.4216, + "step": 492 + }, + { + "epoch": 0.2292667803441327, + "grad_norm": 0.637058675289154, + "learning_rate": 7.643410852713178e-06, + "loss": 0.439, + "step": 493 + }, + { + "epoch": 0.22973182452332971, + "grad_norm": 0.49991941452026367, + "learning_rate": 7.65891472868217e-06, + "loss": 0.4383, + "step": 494 + }, + { + "epoch": 0.23019686870252673, + "grad_norm": 0.6461667418479919, + "learning_rate": 7.674418604651164e-06, + "loss": 0.4542, + "step": 495 + }, + { + "epoch": 0.23066191288172377, + "grad_norm": 0.5306419134140015, + "learning_rate": 7.689922480620156e-06, + "loss": 0.4266, + "step": 496 + }, + { + "epoch": 0.2311269570609208, + "grad_norm": 0.5668182969093323, + "learning_rate": 7.705426356589148e-06, + "loss": 0.4369, + "step": 497 + }, + { + "epoch": 0.2315920012401178, + "grad_norm": 0.546444833278656, + "learning_rate": 7.72093023255814e-06, + "loss": 0.4281, + "step": 498 + }, + { + "epoch": 0.23205704541931482, + "grad_norm": 0.5376360416412354, + "learning_rate": 7.736434108527132e-06, + "loss": 0.4138, + "step": 499 + }, + { + "epoch": 0.23252208959851187, + "grad_norm": 0.6407905220985413, + "learning_rate": 7.751937984496126e-06, + "loss": 0.4204, + "step": 500 + }, + { + "epoch": 0.23298713377770888, + "grad_norm": 0.5134121179580688, + "learning_rate": 7.767441860465116e-06, + "loss": 0.4251, + "step": 501 + }, + { + "epoch": 0.2334521779569059, + "grad_norm": 0.6588038206100464, + "learning_rate": 7.782945736434108e-06, + "loss": 0.423, + "step": 502 + }, + { + "epoch": 0.23391722213610294, + "grad_norm": 0.5844888687133789, + "learning_rate": 7.798449612403102e-06, + "loss": 0.4362, + "step": 503 + }, + { + "epoch": 0.23438226631529996, + "grad_norm": 0.5324265956878662, + "learning_rate": 7.813953488372094e-06, + "loss": 0.4184, + "step": 504 + }, + { + "epoch": 0.23484731049449697, + "grad_norm": 0.6014962196350098, + "learning_rate": 7.829457364341086e-06, + "loss": 0.4314, + "step": 505 + }, + { + "epoch": 0.235312354673694, + "grad_norm": 0.51766037940979, + "learning_rate": 7.844961240310078e-06, + "loss": 0.405, + "step": 506 + }, + { + "epoch": 0.23577739885289103, + "grad_norm": 0.5190640091896057, + "learning_rate": 7.86046511627907e-06, + "loss": 0.4085, + "step": 507 + }, + { + "epoch": 0.23624244303208805, + "grad_norm": 0.5045374631881714, + "learning_rate": 7.875968992248064e-06, + "loss": 0.4219, + "step": 508 + }, + { + "epoch": 0.23670748721128507, + "grad_norm": 0.557181179523468, + "learning_rate": 7.891472868217056e-06, + "loss": 0.4026, + "step": 509 + }, + { + "epoch": 0.23717253139048208, + "grad_norm": 0.5194858908653259, + "learning_rate": 7.906976744186048e-06, + "loss": 0.4232, + "step": 510 + }, + { + "epoch": 0.23763757556967913, + "grad_norm": 0.5417995452880859, + "learning_rate": 7.92248062015504e-06, + "loss": 0.416, + "step": 511 + }, + { + "epoch": 0.23810261974887614, + "grad_norm": 0.48320630192756653, + "learning_rate": 7.937984496124032e-06, + "loss": 0.3952, + "step": 512 + }, + { + "epoch": 0.23856766392807316, + "grad_norm": 0.5941821336746216, + "learning_rate": 7.953488372093024e-06, + "loss": 0.4101, + "step": 513 + }, + { + "epoch": 0.2390327081072702, + "grad_norm": 0.5255295634269714, + "learning_rate": 7.968992248062016e-06, + "loss": 0.4091, + "step": 514 + }, + { + "epoch": 0.23949775228646722, + "grad_norm": 0.4808565378189087, + "learning_rate": 7.984496124031008e-06, + "loss": 0.4147, + "step": 515 + }, + { + "epoch": 0.23996279646566424, + "grad_norm": 0.6180495023727417, + "learning_rate": 8.000000000000001e-06, + "loss": 0.4193, + "step": 516 + }, + { + "epoch": 0.24042784064486125, + "grad_norm": 0.6234874725341797, + "learning_rate": 8.015503875968993e-06, + "loss": 0.4413, + "step": 517 + }, + { + "epoch": 0.2408928848240583, + "grad_norm": 0.49461182951927185, + "learning_rate": 8.031007751937985e-06, + "loss": 0.3924, + "step": 518 + }, + { + "epoch": 0.2413579290032553, + "grad_norm": 0.6341349482536316, + "learning_rate": 8.046511627906977e-06, + "loss": 0.4284, + "step": 519 + }, + { + "epoch": 0.24182297318245233, + "grad_norm": 0.5244817137718201, + "learning_rate": 8.06201550387597e-06, + "loss": 0.4037, + "step": 520 + }, + { + "epoch": 0.24228801736164934, + "grad_norm": 0.5345357656478882, + "learning_rate": 8.077519379844961e-06, + "loss": 0.4568, + "step": 521 + }, + { + "epoch": 0.2427530615408464, + "grad_norm": 0.685580849647522, + "learning_rate": 8.093023255813955e-06, + "loss": 0.4431, + "step": 522 + }, + { + "epoch": 0.2432181057200434, + "grad_norm": 0.5502837896347046, + "learning_rate": 8.108527131782945e-06, + "loss": 0.4197, + "step": 523 + }, + { + "epoch": 0.24368314989924042, + "grad_norm": 0.5113924741744995, + "learning_rate": 8.124031007751939e-06, + "loss": 0.4108, + "step": 524 + }, + { + "epoch": 0.24414819407843746, + "grad_norm": 0.5599501729011536, + "learning_rate": 8.139534883720931e-06, + "loss": 0.4374, + "step": 525 + }, + { + "epoch": 0.24461323825763448, + "grad_norm": 0.5084413886070251, + "learning_rate": 8.155038759689923e-06, + "loss": 0.4296, + "step": 526 + }, + { + "epoch": 0.2450782824368315, + "grad_norm": 0.49575719237327576, + "learning_rate": 8.170542635658915e-06, + "loss": 0.4252, + "step": 527 + }, + { + "epoch": 0.2455433266160285, + "grad_norm": 0.4909055233001709, + "learning_rate": 8.186046511627907e-06, + "loss": 0.4206, + "step": 528 + }, + { + "epoch": 0.24600837079522556, + "grad_norm": 0.5348622798919678, + "learning_rate": 8.201550387596899e-06, + "loss": 0.4373, + "step": 529 + }, + { + "epoch": 0.24647341497442257, + "grad_norm": 0.41662895679473877, + "learning_rate": 8.217054263565893e-06, + "loss": 0.4021, + "step": 530 + }, + { + "epoch": 0.2469384591536196, + "grad_norm": 0.5400405526161194, + "learning_rate": 8.232558139534885e-06, + "loss": 0.4104, + "step": 531 + }, + { + "epoch": 0.2474035033328166, + "grad_norm": 0.4757499694824219, + "learning_rate": 8.248062015503877e-06, + "loss": 0.4034, + "step": 532 + }, + { + "epoch": 0.24786854751201365, + "grad_norm": 0.49683964252471924, + "learning_rate": 8.263565891472869e-06, + "loss": 0.4145, + "step": 533 + }, + { + "epoch": 0.24833359169121066, + "grad_norm": 0.529530942440033, + "learning_rate": 8.279069767441861e-06, + "loss": 0.4188, + "step": 534 + }, + { + "epoch": 0.24879863587040768, + "grad_norm": 0.5073703527450562, + "learning_rate": 8.294573643410853e-06, + "loss": 0.4071, + "step": 535 + }, + { + "epoch": 0.24926368004960472, + "grad_norm": 0.5136784315109253, + "learning_rate": 8.310077519379845e-06, + "loss": 0.4178, + "step": 536 + }, + { + "epoch": 0.24972872422880174, + "grad_norm": 0.4692307412624359, + "learning_rate": 8.325581395348837e-06, + "loss": 0.4155, + "step": 537 + }, + { + "epoch": 0.2501937684079988, + "grad_norm": 0.5519499778747559, + "learning_rate": 8.34108527131783e-06, + "loss": 0.4088, + "step": 538 + }, + { + "epoch": 0.2506588125871958, + "grad_norm": 0.5323079228401184, + "learning_rate": 8.356589147286823e-06, + "loss": 0.3887, + "step": 539 + }, + { + "epoch": 0.2511238567663928, + "grad_norm": 0.5130917429924011, + "learning_rate": 8.372093023255815e-06, + "loss": 0.4473, + "step": 540 + }, + { + "epoch": 0.25158890094558983, + "grad_norm": 0.5379475951194763, + "learning_rate": 8.387596899224807e-06, + "loss": 0.4092, + "step": 541 + }, + { + "epoch": 0.25205394512478685, + "grad_norm": 0.4682493507862091, + "learning_rate": 8.403100775193799e-06, + "loss": 0.407, + "step": 542 + }, + { + "epoch": 0.25251898930398387, + "grad_norm": 0.5762583017349243, + "learning_rate": 8.418604651162792e-06, + "loss": 0.4402, + "step": 543 + }, + { + "epoch": 0.2529840334831809, + "grad_norm": 0.4663183093070984, + "learning_rate": 8.434108527131784e-06, + "loss": 0.4079, + "step": 544 + }, + { + "epoch": 0.25344907766237795, + "grad_norm": 0.5599426627159119, + "learning_rate": 8.449612403100775e-06, + "loss": 0.4041, + "step": 545 + }, + { + "epoch": 0.25391412184157497, + "grad_norm": 0.4949679374694824, + "learning_rate": 8.465116279069768e-06, + "loss": 0.4055, + "step": 546 + }, + { + "epoch": 0.254379166020772, + "grad_norm": 0.4728296995162964, + "learning_rate": 8.48062015503876e-06, + "loss": 0.4044, + "step": 547 + }, + { + "epoch": 0.254844210199969, + "grad_norm": 0.5103499889373779, + "learning_rate": 8.496124031007752e-06, + "loss": 0.4158, + "step": 548 + }, + { + "epoch": 0.255309254379166, + "grad_norm": 0.5350103974342346, + "learning_rate": 8.511627906976744e-06, + "loss": 0.4333, + "step": 549 + }, + { + "epoch": 0.25577429855836303, + "grad_norm": 0.47756704688072205, + "learning_rate": 8.527131782945736e-06, + "loss": 0.4273, + "step": 550 + }, + { + "epoch": 0.25623934273756005, + "grad_norm": 0.6039769649505615, + "learning_rate": 8.54263565891473e-06, + "loss": 0.447, + "step": 551 + }, + { + "epoch": 0.25670438691675707, + "grad_norm": 0.5659769177436829, + "learning_rate": 8.558139534883722e-06, + "loss": 0.4298, + "step": 552 + }, + { + "epoch": 0.25716943109595414, + "grad_norm": 0.46599307656288147, + "learning_rate": 8.573643410852714e-06, + "loss": 0.3993, + "step": 553 + }, + { + "epoch": 0.25763447527515115, + "grad_norm": 0.5073681473731995, + "learning_rate": 8.589147286821706e-06, + "loss": 0.426, + "step": 554 + }, + { + "epoch": 0.25809951945434817, + "grad_norm": 0.4540097117424011, + "learning_rate": 8.604651162790698e-06, + "loss": 0.4197, + "step": 555 + }, + { + "epoch": 0.2585645636335452, + "grad_norm": 0.46219104528427124, + "learning_rate": 8.620155038759692e-06, + "loss": 0.4181, + "step": 556 + }, + { + "epoch": 0.2590296078127422, + "grad_norm": 0.5334506034851074, + "learning_rate": 8.635658914728682e-06, + "loss": 0.4244, + "step": 557 + }, + { + "epoch": 0.2594946519919392, + "grad_norm": 0.6203073859214783, + "learning_rate": 8.651162790697674e-06, + "loss": 0.4125, + "step": 558 + }, + { + "epoch": 0.25995969617113623, + "grad_norm": 0.5533168315887451, + "learning_rate": 8.666666666666668e-06, + "loss": 0.4251, + "step": 559 + }, + { + "epoch": 0.2604247403503333, + "grad_norm": 0.5441173911094666, + "learning_rate": 8.68217054263566e-06, + "loss": 0.4062, + "step": 560 + }, + { + "epoch": 0.2608897845295303, + "grad_norm": 0.6109142899513245, + "learning_rate": 8.697674418604652e-06, + "loss": 0.454, + "step": 561 + }, + { + "epoch": 0.26135482870872734, + "grad_norm": 0.5405747890472412, + "learning_rate": 8.713178294573644e-06, + "loss": 0.418, + "step": 562 + }, + { + "epoch": 0.26181987288792435, + "grad_norm": 0.48734837770462036, + "learning_rate": 8.728682170542636e-06, + "loss": 0.417, + "step": 563 + }, + { + "epoch": 0.26228491706712137, + "grad_norm": 0.6338199973106384, + "learning_rate": 8.74418604651163e-06, + "loss": 0.3976, + "step": 564 + }, + { + "epoch": 0.2627499612463184, + "grad_norm": 0.5382621884346008, + "learning_rate": 8.759689922480622e-06, + "loss": 0.446, + "step": 565 + }, + { + "epoch": 0.2632150054255154, + "grad_norm": 0.4548230767250061, + "learning_rate": 8.775193798449612e-06, + "loss": 0.4227, + "step": 566 + }, + { + "epoch": 0.2636800496047125, + "grad_norm": 0.6364620327949524, + "learning_rate": 8.790697674418606e-06, + "loss": 0.4114, + "step": 567 + }, + { + "epoch": 0.2641450937839095, + "grad_norm": 0.6153837442398071, + "learning_rate": 8.806201550387598e-06, + "loss": 0.4259, + "step": 568 + }, + { + "epoch": 0.2646101379631065, + "grad_norm": 0.5064429044723511, + "learning_rate": 8.82170542635659e-06, + "loss": 0.4256, + "step": 569 + }, + { + "epoch": 0.2650751821423035, + "grad_norm": 0.6010052561759949, + "learning_rate": 8.837209302325582e-06, + "loss": 0.411, + "step": 570 + }, + { + "epoch": 0.26554022632150054, + "grad_norm": 0.659740686416626, + "learning_rate": 8.852713178294574e-06, + "loss": 0.3959, + "step": 571 + }, + { + "epoch": 0.26600527050069755, + "grad_norm": 0.548054039478302, + "learning_rate": 8.868217054263567e-06, + "loss": 0.4324, + "step": 572 + }, + { + "epoch": 0.26647031467989457, + "grad_norm": 0.5880405306816101, + "learning_rate": 8.88372093023256e-06, + "loss": 0.3957, + "step": 573 + }, + { + "epoch": 0.2669353588590916, + "grad_norm": 0.5269092321395874, + "learning_rate": 8.899224806201551e-06, + "loss": 0.4376, + "step": 574 + }, + { + "epoch": 0.26740040303828866, + "grad_norm": 0.5422910451889038, + "learning_rate": 8.914728682170543e-06, + "loss": 0.3812, + "step": 575 + }, + { + "epoch": 0.2678654472174857, + "grad_norm": 0.5918363332748413, + "learning_rate": 8.930232558139535e-06, + "loss": 0.4285, + "step": 576 + }, + { + "epoch": 0.2683304913966827, + "grad_norm": 0.5592337250709534, + "learning_rate": 8.945736434108527e-06, + "loss": 0.3972, + "step": 577 + }, + { + "epoch": 0.2687955355758797, + "grad_norm": 0.5338684916496277, + "learning_rate": 8.961240310077521e-06, + "loss": 0.4245, + "step": 578 + }, + { + "epoch": 0.2692605797550767, + "grad_norm": 0.46233999729156494, + "learning_rate": 8.976744186046511e-06, + "loss": 0.416, + "step": 579 + }, + { + "epoch": 0.26972562393427374, + "grad_norm": 0.6775352954864502, + "learning_rate": 8.992248062015505e-06, + "loss": 0.4152, + "step": 580 + }, + { + "epoch": 0.27019066811347076, + "grad_norm": 0.6111975312232971, + "learning_rate": 9.007751937984497e-06, + "loss": 0.4496, + "step": 581 + }, + { + "epoch": 0.2706557122926678, + "grad_norm": 0.4907594621181488, + "learning_rate": 9.023255813953489e-06, + "loss": 0.4145, + "step": 582 + }, + { + "epoch": 0.27112075647186484, + "grad_norm": 0.6536867022514343, + "learning_rate": 9.038759689922481e-06, + "loss": 0.4421, + "step": 583 + }, + { + "epoch": 0.27158580065106186, + "grad_norm": 0.4851461946964264, + "learning_rate": 9.054263565891473e-06, + "loss": 0.3856, + "step": 584 + }, + { + "epoch": 0.2720508448302589, + "grad_norm": 0.552401602268219, + "learning_rate": 9.069767441860465e-06, + "loss": 0.4085, + "step": 585 + }, + { + "epoch": 0.2725158890094559, + "grad_norm": 0.5714566111564636, + "learning_rate": 9.085271317829459e-06, + "loss": 0.3893, + "step": 586 + }, + { + "epoch": 0.2729809331886529, + "grad_norm": 0.5839101076126099, + "learning_rate": 9.10077519379845e-06, + "loss": 0.4145, + "step": 587 + }, + { + "epoch": 0.2734459773678499, + "grad_norm": 0.5246545076370239, + "learning_rate": 9.116279069767443e-06, + "loss": 0.4316, + "step": 588 + }, + { + "epoch": 0.273911021547047, + "grad_norm": 0.49281787872314453, + "learning_rate": 9.131782945736435e-06, + "loss": 0.4272, + "step": 589 + }, + { + "epoch": 0.274376065726244, + "grad_norm": 0.5275765657424927, + "learning_rate": 9.147286821705427e-06, + "loss": 0.4141, + "step": 590 + }, + { + "epoch": 0.27484110990544103, + "grad_norm": 0.4734654724597931, + "learning_rate": 9.162790697674419e-06, + "loss": 0.4031, + "step": 591 + }, + { + "epoch": 0.27530615408463804, + "grad_norm": 0.5400969386100769, + "learning_rate": 9.178294573643411e-06, + "loss": 0.4517, + "step": 592 + }, + { + "epoch": 0.27577119826383506, + "grad_norm": 0.4679478406906128, + "learning_rate": 9.193798449612403e-06, + "loss": 0.4014, + "step": 593 + }, + { + "epoch": 0.2762362424430321, + "grad_norm": 0.6086196899414062, + "learning_rate": 9.209302325581397e-06, + "loss": 0.4185, + "step": 594 + }, + { + "epoch": 0.2767012866222291, + "grad_norm": 0.5482898950576782, + "learning_rate": 9.224806201550389e-06, + "loss": 0.4391, + "step": 595 + }, + { + "epoch": 0.2771663308014261, + "grad_norm": 0.5058215856552124, + "learning_rate": 9.24031007751938e-06, + "loss": 0.3826, + "step": 596 + }, + { + "epoch": 0.2776313749806232, + "grad_norm": 0.5932140350341797, + "learning_rate": 9.255813953488373e-06, + "loss": 0.4105, + "step": 597 + }, + { + "epoch": 0.2780964191598202, + "grad_norm": 0.4306205213069916, + "learning_rate": 9.271317829457365e-06, + "loss": 0.4083, + "step": 598 + }, + { + "epoch": 0.2785614633390172, + "grad_norm": 0.4987516701221466, + "learning_rate": 9.286821705426358e-06, + "loss": 0.4078, + "step": 599 + }, + { + "epoch": 0.27902650751821423, + "grad_norm": 0.5451740026473999, + "learning_rate": 9.30232558139535e-06, + "loss": 0.4184, + "step": 600 + }, + { + "epoch": 0.27949155169741124, + "grad_norm": 0.52000892162323, + "learning_rate": 9.31782945736434e-06, + "loss": 0.422, + "step": 601 + }, + { + "epoch": 0.27995659587660826, + "grad_norm": 0.524783194065094, + "learning_rate": 9.333333333333334e-06, + "loss": 0.4121, + "step": 602 + }, + { + "epoch": 0.2804216400558053, + "grad_norm": 0.5225769877433777, + "learning_rate": 9.348837209302326e-06, + "loss": 0.4339, + "step": 603 + }, + { + "epoch": 0.28088668423500235, + "grad_norm": 0.563931405544281, + "learning_rate": 9.364341085271318e-06, + "loss": 0.4212, + "step": 604 + }, + { + "epoch": 0.28135172841419936, + "grad_norm": 0.6426302790641785, + "learning_rate": 9.37984496124031e-06, + "loss": 0.4, + "step": 605 + }, + { + "epoch": 0.2818167725933964, + "grad_norm": 0.476382851600647, + "learning_rate": 9.395348837209302e-06, + "loss": 0.399, + "step": 606 + }, + { + "epoch": 0.2822818167725934, + "grad_norm": 0.6564604640007019, + "learning_rate": 9.410852713178296e-06, + "loss": 0.4106, + "step": 607 + }, + { + "epoch": 0.2827468609517904, + "grad_norm": 0.5618086457252502, + "learning_rate": 9.426356589147288e-06, + "loss": 0.4193, + "step": 608 + }, + { + "epoch": 0.28321190513098743, + "grad_norm": 0.6813092231750488, + "learning_rate": 9.44186046511628e-06, + "loss": 0.4184, + "step": 609 + }, + { + "epoch": 0.28367694931018445, + "grad_norm": 0.5115700960159302, + "learning_rate": 9.457364341085272e-06, + "loss": 0.4251, + "step": 610 + }, + { + "epoch": 0.2841419934893815, + "grad_norm": 0.620830774307251, + "learning_rate": 9.472868217054264e-06, + "loss": 0.3938, + "step": 611 + }, + { + "epoch": 0.28460703766857853, + "grad_norm": 0.4752020239830017, + "learning_rate": 9.488372093023258e-06, + "loss": 0.4005, + "step": 612 + }, + { + "epoch": 0.28507208184777555, + "grad_norm": 0.5442911386489868, + "learning_rate": 9.503875968992248e-06, + "loss": 0.4185, + "step": 613 + }, + { + "epoch": 0.28553712602697257, + "grad_norm": 0.5141138434410095, + "learning_rate": 9.51937984496124e-06, + "loss": 0.4421, + "step": 614 + }, + { + "epoch": 0.2860021702061696, + "grad_norm": 0.5211554169654846, + "learning_rate": 9.534883720930234e-06, + "loss": 0.402, + "step": 615 + }, + { + "epoch": 0.2864672143853666, + "grad_norm": 0.5350854396820068, + "learning_rate": 9.550387596899226e-06, + "loss": 0.4229, + "step": 616 + }, + { + "epoch": 0.2869322585645636, + "grad_norm": 0.5418005585670471, + "learning_rate": 9.565891472868218e-06, + "loss": 0.4264, + "step": 617 + }, + { + "epoch": 0.28739730274376063, + "grad_norm": 0.5651183128356934, + "learning_rate": 9.58139534883721e-06, + "loss": 0.412, + "step": 618 + }, + { + "epoch": 0.2878623469229577, + "grad_norm": 0.4958003759384155, + "learning_rate": 9.596899224806202e-06, + "loss": 0.3929, + "step": 619 + }, + { + "epoch": 0.2883273911021547, + "grad_norm": 0.6447129845619202, + "learning_rate": 9.612403100775196e-06, + "loss": 0.4177, + "step": 620 + }, + { + "epoch": 0.28879243528135173, + "grad_norm": 0.5777575969696045, + "learning_rate": 9.627906976744188e-06, + "loss": 0.4193, + "step": 621 + }, + { + "epoch": 0.28925747946054875, + "grad_norm": 0.7008224129676819, + "learning_rate": 9.643410852713178e-06, + "loss": 0.423, + "step": 622 + }, + { + "epoch": 0.28972252363974577, + "grad_norm": 0.4426723122596741, + "learning_rate": 9.658914728682172e-06, + "loss": 0.4038, + "step": 623 + }, + { + "epoch": 0.2901875678189428, + "grad_norm": 0.6383287310600281, + "learning_rate": 9.674418604651164e-06, + "loss": 0.4213, + "step": 624 + }, + { + "epoch": 0.2906526119981398, + "grad_norm": 0.4844573438167572, + "learning_rate": 9.689922480620156e-06, + "loss": 0.4355, + "step": 625 + }, + { + "epoch": 0.29111765617733687, + "grad_norm": 0.6763868927955627, + "learning_rate": 9.705426356589148e-06, + "loss": 0.4122, + "step": 626 + }, + { + "epoch": 0.2915827003565339, + "grad_norm": 0.508306622505188, + "learning_rate": 9.72093023255814e-06, + "loss": 0.4067, + "step": 627 + }, + { + "epoch": 0.2920477445357309, + "grad_norm": 0.5674280524253845, + "learning_rate": 9.736434108527133e-06, + "loss": 0.4359, + "step": 628 + }, + { + "epoch": 0.2925127887149279, + "grad_norm": 0.5861987471580505, + "learning_rate": 9.751937984496125e-06, + "loss": 0.3948, + "step": 629 + }, + { + "epoch": 0.29297783289412493, + "grad_norm": 0.5261598825454712, + "learning_rate": 9.767441860465117e-06, + "loss": 0.4339, + "step": 630 + }, + { + "epoch": 0.29344287707332195, + "grad_norm": 0.4901929199695587, + "learning_rate": 9.78294573643411e-06, + "loss": 0.3916, + "step": 631 + }, + { + "epoch": 0.29390792125251897, + "grad_norm": 0.5431584715843201, + "learning_rate": 9.798449612403101e-06, + "loss": 0.4372, + "step": 632 + }, + { + "epoch": 0.29437296543171604, + "grad_norm": 0.4933854043483734, + "learning_rate": 9.813953488372093e-06, + "loss": 0.4107, + "step": 633 + }, + { + "epoch": 0.29483800961091305, + "grad_norm": 0.5128764510154724, + "learning_rate": 9.829457364341087e-06, + "loss": 0.4222, + "step": 634 + }, + { + "epoch": 0.29530305379011007, + "grad_norm": 0.577727198600769, + "learning_rate": 9.844961240310077e-06, + "loss": 0.4417, + "step": 635 + }, + { + "epoch": 0.2957680979693071, + "grad_norm": 0.5306776762008667, + "learning_rate": 9.860465116279071e-06, + "loss": 0.4031, + "step": 636 + }, + { + "epoch": 0.2962331421485041, + "grad_norm": 0.5434820055961609, + "learning_rate": 9.875968992248063e-06, + "loss": 0.4148, + "step": 637 + }, + { + "epoch": 0.2966981863277011, + "grad_norm": 0.5308932065963745, + "learning_rate": 9.891472868217055e-06, + "loss": 0.4017, + "step": 638 + }, + { + "epoch": 0.29716323050689814, + "grad_norm": 0.6192681193351746, + "learning_rate": 9.906976744186047e-06, + "loss": 0.4236, + "step": 639 + }, + { + "epoch": 0.2976282746860952, + "grad_norm": 0.5245199799537659, + "learning_rate": 9.922480620155039e-06, + "loss": 0.4206, + "step": 640 + }, + { + "epoch": 0.2980933188652922, + "grad_norm": 0.6172415018081665, + "learning_rate": 9.937984496124031e-06, + "loss": 0.4147, + "step": 641 + }, + { + "epoch": 0.29855836304448924, + "grad_norm": 0.49070969223976135, + "learning_rate": 9.953488372093025e-06, + "loss": 0.4104, + "step": 642 + }, + { + "epoch": 0.29902340722368626, + "grad_norm": 0.5000258684158325, + "learning_rate": 9.968992248062017e-06, + "loss": 0.4026, + "step": 643 + }, + { + "epoch": 0.29948845140288327, + "grad_norm": 0.601202130317688, + "learning_rate": 9.984496124031009e-06, + "loss": 0.4187, + "step": 644 + }, + { + "epoch": 0.2999534955820803, + "grad_norm": 0.5487759113311768, + "learning_rate": 1e-05, + "loss": 0.3904, + "step": 645 + }, + { + "epoch": 0.3004185397612773, + "grad_norm": 0.4648313522338867, + "learning_rate": 9.99999926779061e-06, + "loss": 0.4065, + "step": 646 + }, + { + "epoch": 0.3008835839404743, + "grad_norm": 0.542242705821991, + "learning_rate": 9.999997071162647e-06, + "loss": 0.403, + "step": 647 + }, + { + "epoch": 0.3013486281196714, + "grad_norm": 0.5556986331939697, + "learning_rate": 9.999993410116758e-06, + "loss": 0.4012, + "step": 648 + }, + { + "epoch": 0.3018136722988684, + "grad_norm": 0.4553111493587494, + "learning_rate": 9.999988284654016e-06, + "loss": 0.423, + "step": 649 + }, + { + "epoch": 0.3022787164780654, + "grad_norm": 0.5509373545646667, + "learning_rate": 9.999981694775921e-06, + "loss": 0.4202, + "step": 650 + }, + { + "epoch": 0.30274376065726244, + "grad_norm": 0.4689308702945709, + "learning_rate": 9.999973640484402e-06, + "loss": 0.3928, + "step": 651 + }, + { + "epoch": 0.30320880483645946, + "grad_norm": 0.4984801411628723, + "learning_rate": 9.99996412178182e-06, + "loss": 0.4171, + "step": 652 + }, + { + "epoch": 0.30367384901565647, + "grad_norm": 0.5575225949287415, + "learning_rate": 9.999953138670961e-06, + "loss": 0.4134, + "step": 653 + }, + { + "epoch": 0.3041388931948535, + "grad_norm": 0.5620846748352051, + "learning_rate": 9.999940691155043e-06, + "loss": 0.4043, + "step": 654 + }, + { + "epoch": 0.30460393737405056, + "grad_norm": 0.5309882760047913, + "learning_rate": 9.999926779237713e-06, + "loss": 0.4286, + "step": 655 + }, + { + "epoch": 0.3050689815532476, + "grad_norm": 0.5248701572418213, + "learning_rate": 9.999911402923043e-06, + "loss": 0.4163, + "step": 656 + }, + { + "epoch": 0.3055340257324446, + "grad_norm": 0.5015712380409241, + "learning_rate": 9.999894562215538e-06, + "loss": 0.4136, + "step": 657 + }, + { + "epoch": 0.3059990699116416, + "grad_norm": 0.546489953994751, + "learning_rate": 9.999876257120127e-06, + "loss": 0.4217, + "step": 658 + }, + { + "epoch": 0.3064641140908386, + "grad_norm": 0.4728710651397705, + "learning_rate": 9.999856487642177e-06, + "loss": 0.3868, + "step": 659 + }, + { + "epoch": 0.30692915827003564, + "grad_norm": 0.5765323638916016, + "learning_rate": 9.999835253787472e-06, + "loss": 0.4518, + "step": 660 + }, + { + "epoch": 0.30739420244923266, + "grad_norm": 0.5118497014045715, + "learning_rate": 9.999812555562239e-06, + "loss": 0.432, + "step": 661 + }, + { + "epoch": 0.30785924662842973, + "grad_norm": 0.5273929238319397, + "learning_rate": 9.999788392973117e-06, + "loss": 0.3831, + "step": 662 + }, + { + "epoch": 0.30832429080762674, + "grad_norm": 0.5201013088226318, + "learning_rate": 9.99976276602719e-06, + "loss": 0.4127, + "step": 663 + }, + { + "epoch": 0.30878933498682376, + "grad_norm": 0.6540030837059021, + "learning_rate": 9.999735674731959e-06, + "loss": 0.4359, + "step": 664 + }, + { + "epoch": 0.3092543791660208, + "grad_norm": 0.5233065485954285, + "learning_rate": 9.999707119095361e-06, + "loss": 0.428, + "step": 665 + }, + { + "epoch": 0.3097194233452178, + "grad_norm": 0.5521329641342163, + "learning_rate": 9.99967709912576e-06, + "loss": 0.4054, + "step": 666 + }, + { + "epoch": 0.3101844675244148, + "grad_norm": 0.6145597696304321, + "learning_rate": 9.999645614831946e-06, + "loss": 0.3983, + "step": 667 + }, + { + "epoch": 0.3106495117036118, + "grad_norm": 0.6071444153785706, + "learning_rate": 9.999612666223139e-06, + "loss": 0.4286, + "step": 668 + }, + { + "epoch": 0.31111455588280884, + "grad_norm": 0.5950531363487244, + "learning_rate": 9.999578253308994e-06, + "loss": 0.4325, + "step": 669 + }, + { + "epoch": 0.3115796000620059, + "grad_norm": 0.608697772026062, + "learning_rate": 9.99954237609959e-06, + "loss": 0.3965, + "step": 670 + }, + { + "epoch": 0.31204464424120293, + "grad_norm": 0.5764162540435791, + "learning_rate": 9.999505034605428e-06, + "loss": 0.4166, + "step": 671 + }, + { + "epoch": 0.31250968842039994, + "grad_norm": 0.6588258147239685, + "learning_rate": 9.999466228837452e-06, + "loss": 0.4122, + "step": 672 + }, + { + "epoch": 0.31297473259959696, + "grad_norm": 0.5878240466117859, + "learning_rate": 9.999425958807023e-06, + "loss": 0.4289, + "step": 673 + }, + { + "epoch": 0.313439776778794, + "grad_norm": 0.6487173438072205, + "learning_rate": 9.999384224525938e-06, + "loss": 0.4348, + "step": 674 + }, + { + "epoch": 0.313904820957991, + "grad_norm": 0.5158074498176575, + "learning_rate": 9.99934102600642e-06, + "loss": 0.402, + "step": 675 + }, + { + "epoch": 0.314369865137188, + "grad_norm": 0.6373001933097839, + "learning_rate": 9.999296363261118e-06, + "loss": 0.4098, + "step": 676 + }, + { + "epoch": 0.3148349093163851, + "grad_norm": 0.5033225417137146, + "learning_rate": 9.999250236303118e-06, + "loss": 0.4035, + "step": 677 + }, + { + "epoch": 0.3152999534955821, + "grad_norm": 0.4822746217250824, + "learning_rate": 9.999202645145927e-06, + "loss": 0.4164, + "step": 678 + }, + { + "epoch": 0.3157649976747791, + "grad_norm": 0.4495120644569397, + "learning_rate": 9.999153589803484e-06, + "loss": 0.3747, + "step": 679 + }, + { + "epoch": 0.31623004185397613, + "grad_norm": 0.5534586310386658, + "learning_rate": 9.999103070290155e-06, + "loss": 0.4011, + "step": 680 + }, + { + "epoch": 0.31669508603317315, + "grad_norm": 0.4584212601184845, + "learning_rate": 9.999051086620738e-06, + "loss": 0.4116, + "step": 681 + }, + { + "epoch": 0.31716013021237016, + "grad_norm": 0.6077730655670166, + "learning_rate": 9.998997638810462e-06, + "loss": 0.4163, + "step": 682 + }, + { + "epoch": 0.3176251743915672, + "grad_norm": 0.4868007004261017, + "learning_rate": 9.998942726874974e-06, + "loss": 0.4075, + "step": 683 + }, + { + "epoch": 0.31809021857076425, + "grad_norm": 0.5131934285163879, + "learning_rate": 9.99888635083036e-06, + "loss": 0.4178, + "step": 684 + }, + { + "epoch": 0.31855526274996127, + "grad_norm": 0.5666374564170837, + "learning_rate": 9.998828510693133e-06, + "loss": 0.4147, + "step": 685 + }, + { + "epoch": 0.3190203069291583, + "grad_norm": 0.6439737677574158, + "learning_rate": 9.99876920648023e-06, + "loss": 0.4148, + "step": 686 + }, + { + "epoch": 0.3194853511083553, + "grad_norm": 0.5670461058616638, + "learning_rate": 9.998708438209022e-06, + "loss": 0.4088, + "step": 687 + }, + { + "epoch": 0.3199503952875523, + "grad_norm": 0.6633548736572266, + "learning_rate": 9.99864620589731e-06, + "loss": 0.3982, + "step": 688 + }, + { + "epoch": 0.32041543946674933, + "grad_norm": 0.6031489968299866, + "learning_rate": 9.998582509563315e-06, + "loss": 0.4095, + "step": 689 + }, + { + "epoch": 0.32088048364594635, + "grad_norm": 0.49488380551338196, + "learning_rate": 9.998517349225698e-06, + "loss": 0.3967, + "step": 690 + }, + { + "epoch": 0.32134552782514336, + "grad_norm": 0.6333311200141907, + "learning_rate": 9.99845072490354e-06, + "loss": 0.4425, + "step": 691 + }, + { + "epoch": 0.32181057200434043, + "grad_norm": 0.5096359252929688, + "learning_rate": 9.998382636616355e-06, + "loss": 0.4033, + "step": 692 + }, + { + "epoch": 0.32227561618353745, + "grad_norm": 0.502375602722168, + "learning_rate": 9.998313084384086e-06, + "loss": 0.3965, + "step": 693 + }, + { + "epoch": 0.32274066036273447, + "grad_norm": 0.6580330729484558, + "learning_rate": 9.998242068227103e-06, + "loss": 0.4052, + "step": 694 + }, + { + "epoch": 0.3232057045419315, + "grad_norm": 0.5036015510559082, + "learning_rate": 9.998169588166204e-06, + "loss": 0.4154, + "step": 695 + }, + { + "epoch": 0.3236707487211285, + "grad_norm": 0.5556278228759766, + "learning_rate": 9.99809564422262e-06, + "loss": 0.4296, + "step": 696 + }, + { + "epoch": 0.3241357929003255, + "grad_norm": 0.635485053062439, + "learning_rate": 9.998020236418008e-06, + "loss": 0.4379, + "step": 697 + }, + { + "epoch": 0.32460083707952253, + "grad_norm": 0.48889610171318054, + "learning_rate": 9.997943364774451e-06, + "loss": 0.414, + "step": 698 + }, + { + "epoch": 0.3250658812587196, + "grad_norm": 0.4967147707939148, + "learning_rate": 9.997865029314464e-06, + "loss": 0.4011, + "step": 699 + }, + { + "epoch": 0.3255309254379166, + "grad_norm": 0.595470666885376, + "learning_rate": 9.997785230060993e-06, + "loss": 0.438, + "step": 700 + }, + { + "epoch": 0.32599596961711363, + "grad_norm": 0.533093273639679, + "learning_rate": 9.997703967037406e-06, + "loss": 0.4152, + "step": 701 + }, + { + "epoch": 0.32646101379631065, + "grad_norm": 0.5301925539970398, + "learning_rate": 9.997621240267505e-06, + "loss": 0.4096, + "step": 702 + }, + { + "epoch": 0.32692605797550767, + "grad_norm": 0.5256016254425049, + "learning_rate": 9.997537049775522e-06, + "loss": 0.4057, + "step": 703 + }, + { + "epoch": 0.3273911021547047, + "grad_norm": 0.6002234816551208, + "learning_rate": 9.997451395586112e-06, + "loss": 0.4221, + "step": 704 + }, + { + "epoch": 0.3278561463339017, + "grad_norm": 0.5671415328979492, + "learning_rate": 9.997364277724362e-06, + "loss": 0.4038, + "step": 705 + }, + { + "epoch": 0.32832119051309877, + "grad_norm": 0.5628596544265747, + "learning_rate": 9.997275696215788e-06, + "loss": 0.3987, + "step": 706 + }, + { + "epoch": 0.3287862346922958, + "grad_norm": 0.6205607652664185, + "learning_rate": 9.997185651086336e-06, + "loss": 0.4259, + "step": 707 + }, + { + "epoch": 0.3292512788714928, + "grad_norm": 0.5086963176727295, + "learning_rate": 9.997094142362376e-06, + "loss": 0.4084, + "step": 708 + }, + { + "epoch": 0.3297163230506898, + "grad_norm": 0.5511040091514587, + "learning_rate": 9.99700117007071e-06, + "loss": 0.4085, + "step": 709 + }, + { + "epoch": 0.33018136722988684, + "grad_norm": 0.5336337089538574, + "learning_rate": 9.996906734238568e-06, + "loss": 0.4397, + "step": 710 + }, + { + "epoch": 0.33064641140908385, + "grad_norm": 0.5333123207092285, + "learning_rate": 9.99681083489361e-06, + "loss": 0.4234, + "step": 711 + }, + { + "epoch": 0.33111145558828087, + "grad_norm": 0.5140913724899292, + "learning_rate": 9.99671347206392e-06, + "loss": 0.4133, + "step": 712 + }, + { + "epoch": 0.3315764997674779, + "grad_norm": 0.5568680167198181, + "learning_rate": 9.996614645778019e-06, + "loss": 0.4144, + "step": 713 + }, + { + "epoch": 0.33204154394667496, + "grad_norm": 0.4299158751964569, + "learning_rate": 9.996514356064848e-06, + "loss": 0.4325, + "step": 714 + }, + { + "epoch": 0.33250658812587197, + "grad_norm": 0.5466892123222351, + "learning_rate": 9.996412602953782e-06, + "loss": 0.4002, + "step": 715 + }, + { + "epoch": 0.332971632305069, + "grad_norm": 0.5300124287605286, + "learning_rate": 9.99630938647462e-06, + "loss": 0.4113, + "step": 716 + }, + { + "epoch": 0.333436676484266, + "grad_norm": 0.4665888845920563, + "learning_rate": 9.996204706657597e-06, + "loss": 0.4264, + "step": 717 + }, + { + "epoch": 0.333901720663463, + "grad_norm": 0.6327192187309265, + "learning_rate": 9.99609856353337e-06, + "loss": 0.4139, + "step": 718 + }, + { + "epoch": 0.33436676484266004, + "grad_norm": 0.5009886622428894, + "learning_rate": 9.995990957133024e-06, + "loss": 0.3818, + "step": 719 + }, + { + "epoch": 0.33483180902185705, + "grad_norm": 0.5028725266456604, + "learning_rate": 9.99588188748808e-06, + "loss": 0.3934, + "step": 720 + }, + { + "epoch": 0.3352968532010541, + "grad_norm": 0.6734301447868347, + "learning_rate": 9.995771354630476e-06, + "loss": 0.4213, + "step": 721 + }, + { + "epoch": 0.33576189738025114, + "grad_norm": 0.5146905183792114, + "learning_rate": 9.995659358592592e-06, + "loss": 0.387, + "step": 722 + }, + { + "epoch": 0.33622694155944816, + "grad_norm": 0.4680449664592743, + "learning_rate": 9.995545899407226e-06, + "loss": 0.4296, + "step": 723 + }, + { + "epoch": 0.33669198573864517, + "grad_norm": 0.5770846605300903, + "learning_rate": 9.995430977107612e-06, + "loss": 0.4088, + "step": 724 + }, + { + "epoch": 0.3371570299178422, + "grad_norm": 0.5241069793701172, + "learning_rate": 9.995314591727404e-06, + "loss": 0.4065, + "step": 725 + }, + { + "epoch": 0.3376220740970392, + "grad_norm": 0.46266067028045654, + "learning_rate": 9.995196743300693e-06, + "loss": 0.4014, + "step": 726 + }, + { + "epoch": 0.3380871182762362, + "grad_norm": 0.4953598380088806, + "learning_rate": 9.995077431861992e-06, + "loss": 0.3707, + "step": 727 + }, + { + "epoch": 0.3385521624554333, + "grad_norm": 0.47356897592544556, + "learning_rate": 9.994956657446248e-06, + "loss": 0.4058, + "step": 728 + }, + { + "epoch": 0.3390172066346303, + "grad_norm": 0.47377198934555054, + "learning_rate": 9.994834420088832e-06, + "loss": 0.4042, + "step": 729 + }, + { + "epoch": 0.3394822508138273, + "grad_norm": 0.5034839510917664, + "learning_rate": 9.99471071982555e-06, + "loss": 0.4001, + "step": 730 + }, + { + "epoch": 0.33994729499302434, + "grad_norm": 0.462456613779068, + "learning_rate": 9.994585556692624e-06, + "loss": 0.3885, + "step": 731 + }, + { + "epoch": 0.34041233917222136, + "grad_norm": 0.5245911478996277, + "learning_rate": 9.994458930726717e-06, + "loss": 0.419, + "step": 732 + }, + { + "epoch": 0.3408773833514184, + "grad_norm": 0.4660412669181824, + "learning_rate": 9.994330841964916e-06, + "loss": 0.4144, + "step": 733 + }, + { + "epoch": 0.3413424275306154, + "grad_norm": 0.54203861951828, + "learning_rate": 9.994201290444734e-06, + "loss": 0.3974, + "step": 734 + }, + { + "epoch": 0.3418074717098124, + "grad_norm": 0.5267844200134277, + "learning_rate": 9.994070276204115e-06, + "loss": 0.428, + "step": 735 + }, + { + "epoch": 0.3422725158890095, + "grad_norm": 0.5129567384719849, + "learning_rate": 9.993937799281435e-06, + "loss": 0.4135, + "step": 736 + }, + { + "epoch": 0.3427375600682065, + "grad_norm": 0.5932128429412842, + "learning_rate": 9.993803859715488e-06, + "loss": 0.4103, + "step": 737 + }, + { + "epoch": 0.3432026042474035, + "grad_norm": 0.48848265409469604, + "learning_rate": 9.993668457545505e-06, + "loss": 0.4022, + "step": 738 + }, + { + "epoch": 0.3436676484266005, + "grad_norm": 0.5327843427658081, + "learning_rate": 9.993531592811146e-06, + "loss": 0.4113, + "step": 739 + }, + { + "epoch": 0.34413269260579754, + "grad_norm": 0.571151077747345, + "learning_rate": 9.993393265552494e-06, + "loss": 0.4191, + "step": 740 + }, + { + "epoch": 0.34459773678499456, + "grad_norm": 0.5364315509796143, + "learning_rate": 9.993253475810061e-06, + "loss": 0.4134, + "step": 741 + }, + { + "epoch": 0.3450627809641916, + "grad_norm": 0.5578524470329285, + "learning_rate": 9.993112223624793e-06, + "loss": 0.4269, + "step": 742 + }, + { + "epoch": 0.34552782514338864, + "grad_norm": 0.599855899810791, + "learning_rate": 9.992969509038057e-06, + "loss": 0.4095, + "step": 743 + }, + { + "epoch": 0.34599286932258566, + "grad_norm": 0.48594430088996887, + "learning_rate": 9.992825332091654e-06, + "loss": 0.4119, + "step": 744 + }, + { + "epoch": 0.3464579135017827, + "grad_norm": 0.6377593874931335, + "learning_rate": 9.992679692827812e-06, + "loss": 0.4411, + "step": 745 + }, + { + "epoch": 0.3469229576809797, + "grad_norm": 0.5200589895248413, + "learning_rate": 9.992532591289183e-06, + "loss": 0.4241, + "step": 746 + }, + { + "epoch": 0.3473880018601767, + "grad_norm": 0.5711746215820312, + "learning_rate": 9.992384027518853e-06, + "loss": 0.422, + "step": 747 + }, + { + "epoch": 0.3478530460393737, + "grad_norm": 0.5109982490539551, + "learning_rate": 9.992234001560333e-06, + "loss": 0.3909, + "step": 748 + }, + { + "epoch": 0.34831809021857074, + "grad_norm": 0.6026611328125, + "learning_rate": 9.992082513457564e-06, + "loss": 0.4277, + "step": 749 + }, + { + "epoch": 0.3487831343977678, + "grad_norm": 0.5200704336166382, + "learning_rate": 9.991929563254913e-06, + "loss": 0.4007, + "step": 750 + }, + { + "epoch": 0.34924817857696483, + "grad_norm": 0.5779603123664856, + "learning_rate": 9.99177515099718e-06, + "loss": 0.3991, + "step": 751 + }, + { + "epoch": 0.34971322275616185, + "grad_norm": 0.5946140885353088, + "learning_rate": 9.991619276729585e-06, + "loss": 0.439, + "step": 752 + }, + { + "epoch": 0.35017826693535886, + "grad_norm": 0.5260143280029297, + "learning_rate": 9.991461940497786e-06, + "loss": 0.4202, + "step": 753 + }, + { + "epoch": 0.3506433111145559, + "grad_norm": 0.5628990530967712, + "learning_rate": 9.99130314234786e-06, + "loss": 0.3997, + "step": 754 + }, + { + "epoch": 0.3511083552937529, + "grad_norm": 0.6147162318229675, + "learning_rate": 9.99114288232632e-06, + "loss": 0.4137, + "step": 755 + }, + { + "epoch": 0.3515733994729499, + "grad_norm": 0.5871340036392212, + "learning_rate": 9.990981160480098e-06, + "loss": 0.4044, + "step": 756 + }, + { + "epoch": 0.3520384436521469, + "grad_norm": 0.5283817648887634, + "learning_rate": 9.990817976856566e-06, + "loss": 0.3815, + "step": 757 + }, + { + "epoch": 0.352503487831344, + "grad_norm": 0.5750249624252319, + "learning_rate": 9.990653331503515e-06, + "loss": 0.4088, + "step": 758 + }, + { + "epoch": 0.352968532010541, + "grad_norm": 0.5326053500175476, + "learning_rate": 9.990487224469167e-06, + "loss": 0.4239, + "step": 759 + }, + { + "epoch": 0.35343357618973803, + "grad_norm": 0.50933837890625, + "learning_rate": 9.990319655802171e-06, + "loss": 0.4022, + "step": 760 + }, + { + "epoch": 0.35389862036893505, + "grad_norm": 0.4641541838645935, + "learning_rate": 9.990150625551609e-06, + "loss": 0.3847, + "step": 761 + }, + { + "epoch": 0.35436366454813206, + "grad_norm": 0.49445950984954834, + "learning_rate": 9.989980133766983e-06, + "loss": 0.395, + "step": 762 + }, + { + "epoch": 0.3548287087273291, + "grad_norm": 0.5578946471214294, + "learning_rate": 9.989808180498229e-06, + "loss": 0.4031, + "step": 763 + }, + { + "epoch": 0.3552937529065261, + "grad_norm": 0.47986650466918945, + "learning_rate": 9.98963476579571e-06, + "loss": 0.4222, + "step": 764 + }, + { + "epoch": 0.35575879708572317, + "grad_norm": 0.5039169788360596, + "learning_rate": 9.989459889710214e-06, + "loss": 0.3973, + "step": 765 + }, + { + "epoch": 0.3562238412649202, + "grad_norm": 0.47546157240867615, + "learning_rate": 9.98928355229296e-06, + "loss": 0.3891, + "step": 766 + }, + { + "epoch": 0.3566888854441172, + "grad_norm": 0.4875746965408325, + "learning_rate": 9.989105753595599e-06, + "loss": 0.3952, + "step": 767 + }, + { + "epoch": 0.3571539296233142, + "grad_norm": 0.5113381743431091, + "learning_rate": 9.988926493670198e-06, + "loss": 0.3624, + "step": 768 + }, + { + "epoch": 0.35761897380251123, + "grad_norm": 0.4663374722003937, + "learning_rate": 9.988745772569266e-06, + "loss": 0.417, + "step": 769 + }, + { + "epoch": 0.35808401798170825, + "grad_norm": 0.5435273051261902, + "learning_rate": 9.988563590345728e-06, + "loss": 0.4112, + "step": 770 + }, + { + "epoch": 0.35854906216090526, + "grad_norm": 0.46149757504463196, + "learning_rate": 9.988379947052944e-06, + "loss": 0.3915, + "step": 771 + }, + { + "epoch": 0.35901410634010233, + "grad_norm": 0.5259143710136414, + "learning_rate": 9.988194842744701e-06, + "loss": 0.4186, + "step": 772 + }, + { + "epoch": 0.35947915051929935, + "grad_norm": 0.4799407422542572, + "learning_rate": 9.988008277475214e-06, + "loss": 0.4059, + "step": 773 + }, + { + "epoch": 0.35994419469849637, + "grad_norm": 0.5095851421356201, + "learning_rate": 9.987820251299121e-06, + "loss": 0.4179, + "step": 774 + }, + { + "epoch": 0.3604092388776934, + "grad_norm": 0.4954157769680023, + "learning_rate": 9.987630764271497e-06, + "loss": 0.3825, + "step": 775 + }, + { + "epoch": 0.3608742830568904, + "grad_norm": 0.5038895010948181, + "learning_rate": 9.987439816447836e-06, + "loss": 0.4293, + "step": 776 + }, + { + "epoch": 0.3613393272360874, + "grad_norm": 0.5088568925857544, + "learning_rate": 9.987247407884064e-06, + "loss": 0.4113, + "step": 777 + }, + { + "epoch": 0.36180437141528443, + "grad_norm": 0.5863829851150513, + "learning_rate": 9.987053538636535e-06, + "loss": 0.3904, + "step": 778 + }, + { + "epoch": 0.3622694155944815, + "grad_norm": 0.44420042634010315, + "learning_rate": 9.986858208762032e-06, + "loss": 0.4165, + "step": 779 + }, + { + "epoch": 0.3627344597736785, + "grad_norm": 0.5741595029830933, + "learning_rate": 9.986661418317759e-06, + "loss": 0.4164, + "step": 780 + }, + { + "epoch": 0.36319950395287554, + "grad_norm": 0.5108224749565125, + "learning_rate": 9.986463167361358e-06, + "loss": 0.4025, + "step": 781 + }, + { + "epoch": 0.36366454813207255, + "grad_norm": 0.6276097893714905, + "learning_rate": 9.986263455950888e-06, + "loss": 0.4264, + "step": 782 + }, + { + "epoch": 0.36412959231126957, + "grad_norm": 0.5375624895095825, + "learning_rate": 9.986062284144848e-06, + "loss": 0.4184, + "step": 783 + }, + { + "epoch": 0.3645946364904666, + "grad_norm": 0.6147474050521851, + "learning_rate": 9.985859652002152e-06, + "loss": 0.3896, + "step": 784 + }, + { + "epoch": 0.3650596806696636, + "grad_norm": 0.586616039276123, + "learning_rate": 9.985655559582152e-06, + "loss": 0.4102, + "step": 785 + }, + { + "epoch": 0.3655247248488606, + "grad_norm": 0.5644299387931824, + "learning_rate": 9.985450006944621e-06, + "loss": 0.3909, + "step": 786 + }, + { + "epoch": 0.3659897690280577, + "grad_norm": 0.5941767692565918, + "learning_rate": 9.98524299414976e-06, + "loss": 0.4318, + "step": 787 + }, + { + "epoch": 0.3664548132072547, + "grad_norm": 0.5848608613014221, + "learning_rate": 9.985034521258206e-06, + "loss": 0.4178, + "step": 788 + }, + { + "epoch": 0.3669198573864517, + "grad_norm": 0.5143994688987732, + "learning_rate": 9.98482458833101e-06, + "loss": 0.395, + "step": 789 + }, + { + "epoch": 0.36738490156564874, + "grad_norm": 0.560663104057312, + "learning_rate": 9.984613195429662e-06, + "loss": 0.4118, + "step": 790 + }, + { + "epoch": 0.36784994574484575, + "grad_norm": 0.520053505897522, + "learning_rate": 9.984400342616076e-06, + "loss": 0.3849, + "step": 791 + }, + { + "epoch": 0.36831498992404277, + "grad_norm": 0.5158041715621948, + "learning_rate": 9.984186029952591e-06, + "loss": 0.4338, + "step": 792 + }, + { + "epoch": 0.3687800341032398, + "grad_norm": 0.5674142837524414, + "learning_rate": 9.983970257501978e-06, + "loss": 0.3954, + "step": 793 + }, + { + "epoch": 0.36924507828243686, + "grad_norm": 0.4826439321041107, + "learning_rate": 9.983753025327431e-06, + "loss": 0.3815, + "step": 794 + }, + { + "epoch": 0.36971012246163387, + "grad_norm": 0.4967092275619507, + "learning_rate": 9.983534333492575e-06, + "loss": 0.41, + "step": 795 + }, + { + "epoch": 0.3701751666408309, + "grad_norm": 0.5571979284286499, + "learning_rate": 9.983314182061461e-06, + "loss": 0.4217, + "step": 796 + }, + { + "epoch": 0.3706402108200279, + "grad_norm": 0.5594668984413147, + "learning_rate": 9.983092571098569e-06, + "loss": 0.3869, + "step": 797 + }, + { + "epoch": 0.3711052549992249, + "grad_norm": 0.50605309009552, + "learning_rate": 9.982869500668804e-06, + "loss": 0.435, + "step": 798 + }, + { + "epoch": 0.37157029917842194, + "grad_norm": 0.6257908344268799, + "learning_rate": 9.982644970837499e-06, + "loss": 0.408, + "step": 799 + }, + { + "epoch": 0.37203534335761895, + "grad_norm": 0.5521469712257385, + "learning_rate": 9.982418981670414e-06, + "loss": 0.4362, + "step": 800 + }, + { + "epoch": 0.372500387536816, + "grad_norm": 0.6889628171920776, + "learning_rate": 9.982191533233742e-06, + "loss": 0.4091, + "step": 801 + }, + { + "epoch": 0.37296543171601304, + "grad_norm": 0.5120216608047485, + "learning_rate": 9.981962625594094e-06, + "loss": 0.4025, + "step": 802 + }, + { + "epoch": 0.37343047589521006, + "grad_norm": 0.6625473499298096, + "learning_rate": 9.981732258818519e-06, + "loss": 0.4212, + "step": 803 + }, + { + "epoch": 0.3738955200744071, + "grad_norm": 0.5750365853309631, + "learning_rate": 9.981500432974482e-06, + "loss": 0.4146, + "step": 804 + }, + { + "epoch": 0.3743605642536041, + "grad_norm": 0.5535095930099487, + "learning_rate": 9.981267148129884e-06, + "loss": 0.4175, + "step": 805 + }, + { + "epoch": 0.3748256084328011, + "grad_norm": 0.6257491111755371, + "learning_rate": 9.981032404353052e-06, + "loss": 0.3994, + "step": 806 + }, + { + "epoch": 0.3752906526119981, + "grad_norm": 0.48537763953208923, + "learning_rate": 9.980796201712734e-06, + "loss": 0.4095, + "step": 807 + }, + { + "epoch": 0.37575569679119514, + "grad_norm": 0.5952275395393372, + "learning_rate": 9.980558540278113e-06, + "loss": 0.4032, + "step": 808 + }, + { + "epoch": 0.3762207409703922, + "grad_norm": 0.5576409101486206, + "learning_rate": 9.980319420118796e-06, + "loss": 0.4195, + "step": 809 + }, + { + "epoch": 0.3766857851495892, + "grad_norm": 0.5643303990364075, + "learning_rate": 9.980078841304817e-06, + "loss": 0.426, + "step": 810 + }, + { + "epoch": 0.37715082932878624, + "grad_norm": 0.5821006894111633, + "learning_rate": 9.979836803906636e-06, + "loss": 0.4227, + "step": 811 + }, + { + "epoch": 0.37761587350798326, + "grad_norm": 0.46532806754112244, + "learning_rate": 9.979593307995145e-06, + "loss": 0.3683, + "step": 812 + }, + { + "epoch": 0.3780809176871803, + "grad_norm": 0.534476637840271, + "learning_rate": 9.979348353641659e-06, + "loss": 0.4112, + "step": 813 + }, + { + "epoch": 0.3785459618663773, + "grad_norm": 0.5859639644622803, + "learning_rate": 9.979101940917918e-06, + "loss": 0.4257, + "step": 814 + }, + { + "epoch": 0.3790110060455743, + "grad_norm": 0.4468208849430084, + "learning_rate": 9.978854069896096e-06, + "loss": 0.4216, + "step": 815 + }, + { + "epoch": 0.3794760502247714, + "grad_norm": 0.597046434879303, + "learning_rate": 9.97860474064879e-06, + "loss": 0.4121, + "step": 816 + }, + { + "epoch": 0.3799410944039684, + "grad_norm": 0.5563921332359314, + "learning_rate": 9.978353953249023e-06, + "loss": 0.4184, + "step": 817 + }, + { + "epoch": 0.3804061385831654, + "grad_norm": 0.49227768182754517, + "learning_rate": 9.978101707770247e-06, + "loss": 0.4037, + "step": 818 + }, + { + "epoch": 0.3808711827623624, + "grad_norm": 0.48585137724876404, + "learning_rate": 9.977848004286342e-06, + "loss": 0.3985, + "step": 819 + }, + { + "epoch": 0.38133622694155944, + "grad_norm": 0.500493586063385, + "learning_rate": 9.977592842871612e-06, + "loss": 0.4261, + "step": 820 + }, + { + "epoch": 0.38180127112075646, + "grad_norm": 0.5106586217880249, + "learning_rate": 9.97733622360079e-06, + "loss": 0.4301, + "step": 821 + }, + { + "epoch": 0.3822663152999535, + "grad_norm": 0.4543416500091553, + "learning_rate": 9.977078146549036e-06, + "loss": 0.4127, + "step": 822 + }, + { + "epoch": 0.38273135947915055, + "grad_norm": 0.6294241547584534, + "learning_rate": 9.976818611791937e-06, + "loss": 0.4152, + "step": 823 + }, + { + "epoch": 0.38319640365834756, + "grad_norm": 0.5176417827606201, + "learning_rate": 9.976557619405503e-06, + "loss": 0.4029, + "step": 824 + }, + { + "epoch": 0.3836614478375446, + "grad_norm": 0.527394711971283, + "learning_rate": 9.97629516946618e-06, + "loss": 0.4007, + "step": 825 + }, + { + "epoch": 0.3841264920167416, + "grad_norm": 0.5030791163444519, + "learning_rate": 9.976031262050832e-06, + "loss": 0.3884, + "step": 826 + }, + { + "epoch": 0.3845915361959386, + "grad_norm": 0.5006067156791687, + "learning_rate": 9.975765897236754e-06, + "loss": 0.4097, + "step": 827 + }, + { + "epoch": 0.3850565803751356, + "grad_norm": 0.5824684500694275, + "learning_rate": 9.975499075101667e-06, + "loss": 0.4203, + "step": 828 + }, + { + "epoch": 0.38552162455433264, + "grad_norm": 0.48303505778312683, + "learning_rate": 9.975230795723717e-06, + "loss": 0.3836, + "step": 829 + }, + { + "epoch": 0.38598666873352966, + "grad_norm": 0.5456617474555969, + "learning_rate": 9.974961059181482e-06, + "loss": 0.427, + "step": 830 + }, + { + "epoch": 0.38645171291272673, + "grad_norm": 0.4804321229457855, + "learning_rate": 9.97468986555396e-06, + "loss": 0.409, + "step": 831 + }, + { + "epoch": 0.38691675709192375, + "grad_norm": 0.4723159372806549, + "learning_rate": 9.974417214920584e-06, + "loss": 0.4049, + "step": 832 + }, + { + "epoch": 0.38738180127112076, + "grad_norm": 0.5413050651550293, + "learning_rate": 9.974143107361205e-06, + "loss": 0.4057, + "step": 833 + }, + { + "epoch": 0.3878468454503178, + "grad_norm": 0.5091968774795532, + "learning_rate": 9.973867542956104e-06, + "loss": 0.4015, + "step": 834 + }, + { + "epoch": 0.3883118896295148, + "grad_norm": 0.4703305959701538, + "learning_rate": 9.973590521785992e-06, + "loss": 0.4009, + "step": 835 + }, + { + "epoch": 0.3887769338087118, + "grad_norm": 0.5493623614311218, + "learning_rate": 9.973312043932004e-06, + "loss": 0.424, + "step": 836 + }, + { + "epoch": 0.3892419779879088, + "grad_norm": 0.590499997138977, + "learning_rate": 9.9730321094757e-06, + "loss": 0.4294, + "step": 837 + }, + { + "epoch": 0.3897070221671059, + "grad_norm": 0.4965055584907532, + "learning_rate": 9.972750718499067e-06, + "loss": 0.3899, + "step": 838 + }, + { + "epoch": 0.3901720663463029, + "grad_norm": 0.5021485686302185, + "learning_rate": 9.972467871084524e-06, + "loss": 0.4507, + "step": 839 + }, + { + "epoch": 0.39063711052549993, + "grad_norm": 0.5755704045295715, + "learning_rate": 9.97218356731491e-06, + "loss": 0.3978, + "step": 840 + }, + { + "epoch": 0.39110215470469695, + "grad_norm": 0.5343739986419678, + "learning_rate": 9.971897807273492e-06, + "loss": 0.3725, + "step": 841 + }, + { + "epoch": 0.39156719888389396, + "grad_norm": 0.5048686265945435, + "learning_rate": 9.971610591043966e-06, + "loss": 0.4254, + "step": 842 + }, + { + "epoch": 0.392032243063091, + "grad_norm": 0.5324344038963318, + "learning_rate": 9.971321918710452e-06, + "loss": 0.3935, + "step": 843 + }, + { + "epoch": 0.392497287242288, + "grad_norm": 0.6189853549003601, + "learning_rate": 9.9710317903575e-06, + "loss": 0.4212, + "step": 844 + }, + { + "epoch": 0.39296233142148507, + "grad_norm": 0.48477646708488464, + "learning_rate": 9.97074020607008e-06, + "loss": 0.4003, + "step": 845 + }, + { + "epoch": 0.3934273756006821, + "grad_norm": 0.548075795173645, + "learning_rate": 9.970447165933594e-06, + "loss": 0.4232, + "step": 846 + }, + { + "epoch": 0.3938924197798791, + "grad_norm": 0.5070050954818726, + "learning_rate": 9.97015267003387e-06, + "loss": 0.4053, + "step": 847 + }, + { + "epoch": 0.3943574639590761, + "grad_norm": 0.5388278365135193, + "learning_rate": 9.96985671845716e-06, + "loss": 0.4132, + "step": 848 + }, + { + "epoch": 0.39482250813827313, + "grad_norm": 0.5621599555015564, + "learning_rate": 9.969559311290142e-06, + "loss": 0.4126, + "step": 849 + }, + { + "epoch": 0.39528755231747015, + "grad_norm": 0.500983476638794, + "learning_rate": 9.969260448619925e-06, + "loss": 0.4072, + "step": 850 + }, + { + "epoch": 0.39575259649666716, + "grad_norm": 0.5636142492294312, + "learning_rate": 9.968960130534036e-06, + "loss": 0.3949, + "step": 851 + }, + { + "epoch": 0.3962176406758642, + "grad_norm": 0.5257579684257507, + "learning_rate": 9.96865835712044e-06, + "loss": 0.3993, + "step": 852 + }, + { + "epoch": 0.39668268485506125, + "grad_norm": 0.6351518630981445, + "learning_rate": 9.968355128467515e-06, + "loss": 0.4175, + "step": 853 + }, + { + "epoch": 0.39714772903425827, + "grad_norm": 0.4981622099876404, + "learning_rate": 9.968050444664074e-06, + "loss": 0.4217, + "step": 854 + }, + { + "epoch": 0.3976127732134553, + "grad_norm": 0.5462387800216675, + "learning_rate": 9.967744305799358e-06, + "loss": 0.4094, + "step": 855 + }, + { + "epoch": 0.3980778173926523, + "grad_norm": 0.599955141544342, + "learning_rate": 9.967436711963026e-06, + "loss": 0.3897, + "step": 856 + }, + { + "epoch": 0.3985428615718493, + "grad_norm": 0.4772481918334961, + "learning_rate": 9.967127663245167e-06, + "loss": 0.426, + "step": 857 + }, + { + "epoch": 0.39900790575104633, + "grad_norm": 0.6691299676895142, + "learning_rate": 9.966817159736295e-06, + "loss": 0.3885, + "step": 858 + }, + { + "epoch": 0.39947294993024335, + "grad_norm": 0.6346452832221985, + "learning_rate": 9.966505201527357e-06, + "loss": 0.4217, + "step": 859 + }, + { + "epoch": 0.3999379941094404, + "grad_norm": 0.47586584091186523, + "learning_rate": 9.966191788709716e-06, + "loss": 0.4165, + "step": 860 + }, + { + "epoch": 0.40040303828863744, + "grad_norm": 0.6300751566886902, + "learning_rate": 9.965876921375165e-06, + "loss": 0.4087, + "step": 861 + }, + { + "epoch": 0.40086808246783445, + "grad_norm": 0.5367612242698669, + "learning_rate": 9.965560599615928e-06, + "loss": 0.3919, + "step": 862 + }, + { + "epoch": 0.40133312664703147, + "grad_norm": 0.4811550974845886, + "learning_rate": 9.965242823524648e-06, + "loss": 0.4112, + "step": 863 + }, + { + "epoch": 0.4017981708262285, + "grad_norm": 0.6218395233154297, + "learning_rate": 9.964923593194394e-06, + "loss": 0.3741, + "step": 864 + }, + { + "epoch": 0.4022632150054255, + "grad_norm": 0.5514636635780334, + "learning_rate": 9.964602908718667e-06, + "loss": 0.4253, + "step": 865 + }, + { + "epoch": 0.4027282591846225, + "grad_norm": 0.5390025973320007, + "learning_rate": 9.964280770191388e-06, + "loss": 0.3988, + "step": 866 + }, + { + "epoch": 0.4031933033638196, + "grad_norm": 0.64164799451828, + "learning_rate": 9.963957177706908e-06, + "loss": 0.4129, + "step": 867 + }, + { + "epoch": 0.4036583475430166, + "grad_norm": 0.4808669984340668, + "learning_rate": 9.96363213136e-06, + "loss": 0.4205, + "step": 868 + }, + { + "epoch": 0.4041233917222136, + "grad_norm": 0.5149733424186707, + "learning_rate": 9.963305631245866e-06, + "loss": 0.4049, + "step": 869 + }, + { + "epoch": 0.40458843590141064, + "grad_norm": 0.5427307486534119, + "learning_rate": 9.962977677460132e-06, + "loss": 0.402, + "step": 870 + }, + { + "epoch": 0.40505348008060765, + "grad_norm": 0.5210288763046265, + "learning_rate": 9.96264827009885e-06, + "loss": 0.4063, + "step": 871 + }, + { + "epoch": 0.40551852425980467, + "grad_norm": 0.5169207453727722, + "learning_rate": 9.962317409258501e-06, + "loss": 0.4032, + "step": 872 + }, + { + "epoch": 0.4059835684390017, + "grad_norm": 0.5519936084747314, + "learning_rate": 9.961985095035987e-06, + "loss": 0.3884, + "step": 873 + }, + { + "epoch": 0.4064486126181987, + "grad_norm": 0.5302849411964417, + "learning_rate": 9.961651327528636e-06, + "loss": 0.406, + "step": 874 + }, + { + "epoch": 0.4069136567973958, + "grad_norm": 0.5135921835899353, + "learning_rate": 9.961316106834202e-06, + "loss": 0.3885, + "step": 875 + }, + { + "epoch": 0.4073787009765928, + "grad_norm": 0.4829360842704773, + "learning_rate": 9.96097943305087e-06, + "loss": 0.4054, + "step": 876 + }, + { + "epoch": 0.4078437451557898, + "grad_norm": 0.48682668805122375, + "learning_rate": 9.960641306277244e-06, + "loss": 0.3872, + "step": 877 + }, + { + "epoch": 0.4083087893349868, + "grad_norm": 0.5358735918998718, + "learning_rate": 9.960301726612355e-06, + "loss": 0.4106, + "step": 878 + }, + { + "epoch": 0.40877383351418384, + "grad_norm": 0.491878867149353, + "learning_rate": 9.959960694155662e-06, + "loss": 0.4054, + "step": 879 + }, + { + "epoch": 0.40923887769338085, + "grad_norm": 0.4896703362464905, + "learning_rate": 9.959618209007045e-06, + "loss": 0.3981, + "step": 880 + }, + { + "epoch": 0.40970392187257787, + "grad_norm": 0.5020521283149719, + "learning_rate": 9.959274271266816e-06, + "loss": 0.3881, + "step": 881 + }, + { + "epoch": 0.41016896605177494, + "grad_norm": 0.5467438101768494, + "learning_rate": 9.958928881035708e-06, + "loss": 0.4188, + "step": 882 + }, + { + "epoch": 0.41063401023097196, + "grad_norm": 0.455102801322937, + "learning_rate": 9.958582038414878e-06, + "loss": 0.407, + "step": 883 + }, + { + "epoch": 0.411099054410169, + "grad_norm": 0.4519009292125702, + "learning_rate": 9.958233743505912e-06, + "loss": 0.4008, + "step": 884 + }, + { + "epoch": 0.411564098589366, + "grad_norm": 0.5280046463012695, + "learning_rate": 9.957883996410821e-06, + "loss": 0.3925, + "step": 885 + }, + { + "epoch": 0.412029142768563, + "grad_norm": 0.4995780885219574, + "learning_rate": 9.95753279723204e-06, + "loss": 0.397, + "step": 886 + }, + { + "epoch": 0.41249418694776, + "grad_norm": 0.4849075675010681, + "learning_rate": 9.957180146072426e-06, + "loss": 0.4214, + "step": 887 + }, + { + "epoch": 0.41295923112695704, + "grad_norm": 0.548945426940918, + "learning_rate": 9.956826043035268e-06, + "loss": 0.4044, + "step": 888 + }, + { + "epoch": 0.4134242753061541, + "grad_norm": 0.5245530009269714, + "learning_rate": 9.956470488224278e-06, + "loss": 0.4133, + "step": 889 + }, + { + "epoch": 0.4138893194853511, + "grad_norm": 0.5736802816390991, + "learning_rate": 9.95611348174359e-06, + "loss": 0.3939, + "step": 890 + }, + { + "epoch": 0.41435436366454814, + "grad_norm": 0.5248284935951233, + "learning_rate": 9.955755023697767e-06, + "loss": 0.4058, + "step": 891 + }, + { + "epoch": 0.41481940784374516, + "grad_norm": 0.4462113678455353, + "learning_rate": 9.955395114191792e-06, + "loss": 0.4168, + "step": 892 + }, + { + "epoch": 0.4152844520229422, + "grad_norm": 0.6396908164024353, + "learning_rate": 9.955033753331082e-06, + "loss": 0.4049, + "step": 893 + }, + { + "epoch": 0.4157494962021392, + "grad_norm": 0.5714189410209656, + "learning_rate": 9.954670941221469e-06, + "loss": 0.41, + "step": 894 + }, + { + "epoch": 0.4162145403813362, + "grad_norm": 0.5378727316856384, + "learning_rate": 9.954306677969218e-06, + "loss": 0.3946, + "step": 895 + }, + { + "epoch": 0.4166795845605333, + "grad_norm": 0.5027511715888977, + "learning_rate": 9.953940963681015e-06, + "loss": 0.3757, + "step": 896 + }, + { + "epoch": 0.4171446287397303, + "grad_norm": 0.5424126386642456, + "learning_rate": 9.953573798463972e-06, + "loss": 0.417, + "step": 897 + }, + { + "epoch": 0.4176096729189273, + "grad_norm": 0.5236470103263855, + "learning_rate": 9.953205182425623e-06, + "loss": 0.415, + "step": 898 + }, + { + "epoch": 0.4180747170981243, + "grad_norm": 0.5600041747093201, + "learning_rate": 9.952835115673933e-06, + "loss": 0.4227, + "step": 899 + }, + { + "epoch": 0.41853976127732134, + "grad_norm": 0.44945916533470154, + "learning_rate": 9.952463598317286e-06, + "loss": 0.405, + "step": 900 + }, + { + "epoch": 0.41900480545651836, + "grad_norm": 0.5260853171348572, + "learning_rate": 9.952090630464495e-06, + "loss": 0.4051, + "step": 901 + }, + { + "epoch": 0.4194698496357154, + "grad_norm": 0.5023328065872192, + "learning_rate": 9.951716212224798e-06, + "loss": 0.408, + "step": 902 + }, + { + "epoch": 0.4199348938149124, + "grad_norm": 0.5364269018173218, + "learning_rate": 9.951340343707852e-06, + "loss": 0.4036, + "step": 903 + }, + { + "epoch": 0.42039993799410946, + "grad_norm": 0.5100162625312805, + "learning_rate": 9.950963025023746e-06, + "loss": 0.4104, + "step": 904 + }, + { + "epoch": 0.4208649821733065, + "grad_norm": 0.5659197568893433, + "learning_rate": 9.950584256282988e-06, + "loss": 0.4216, + "step": 905 + }, + { + "epoch": 0.4213300263525035, + "grad_norm": 0.5025163292884827, + "learning_rate": 9.950204037596516e-06, + "loss": 0.3989, + "step": 906 + }, + { + "epoch": 0.4217950705317005, + "grad_norm": 0.6001001596450806, + "learning_rate": 9.949822369075687e-06, + "loss": 0.389, + "step": 907 + }, + { + "epoch": 0.4222601147108975, + "grad_norm": 0.5134057998657227, + "learning_rate": 9.949439250832287e-06, + "loss": 0.3975, + "step": 908 + }, + { + "epoch": 0.42272515889009454, + "grad_norm": 0.5745139122009277, + "learning_rate": 9.949054682978525e-06, + "loss": 0.4186, + "step": 909 + }, + { + "epoch": 0.42319020306929156, + "grad_norm": 0.6479323506355286, + "learning_rate": 9.948668665627034e-06, + "loss": 0.4201, + "step": 910 + }, + { + "epoch": 0.42365524724848863, + "grad_norm": 0.5262349843978882, + "learning_rate": 9.948281198890875e-06, + "loss": 0.4016, + "step": 911 + }, + { + "epoch": 0.42412029142768565, + "grad_norm": 0.6013730764389038, + "learning_rate": 9.947892282883527e-06, + "loss": 0.3814, + "step": 912 + }, + { + "epoch": 0.42458533560688266, + "grad_norm": 0.6294665336608887, + "learning_rate": 9.947501917718897e-06, + "loss": 0.391, + "step": 913 + }, + { + "epoch": 0.4250503797860797, + "grad_norm": 0.5530297756195068, + "learning_rate": 9.947110103511322e-06, + "loss": 0.3957, + "step": 914 + }, + { + "epoch": 0.4255154239652767, + "grad_norm": 0.5969755053520203, + "learning_rate": 9.946716840375552e-06, + "loss": 0.3933, + "step": 915 + }, + { + "epoch": 0.4259804681444737, + "grad_norm": 0.6482888460159302, + "learning_rate": 9.946322128426771e-06, + "loss": 0.4384, + "step": 916 + }, + { + "epoch": 0.42644551232367073, + "grad_norm": 0.6915735602378845, + "learning_rate": 9.945925967780581e-06, + "loss": 0.4165, + "step": 917 + }, + { + "epoch": 0.4269105565028678, + "grad_norm": 0.5215064883232117, + "learning_rate": 9.945528358553014e-06, + "loss": 0.4123, + "step": 918 + }, + { + "epoch": 0.4273756006820648, + "grad_norm": 0.6709214448928833, + "learning_rate": 9.945129300860521e-06, + "loss": 0.4214, + "step": 919 + }, + { + "epoch": 0.42784064486126183, + "grad_norm": 0.5739949345588684, + "learning_rate": 9.94472879481998e-06, + "loss": 0.3663, + "step": 920 + }, + { + "epoch": 0.42830568904045885, + "grad_norm": 0.5512825846672058, + "learning_rate": 9.944326840548693e-06, + "loss": 0.386, + "step": 921 + }, + { + "epoch": 0.42877073321965586, + "grad_norm": 0.6003051400184631, + "learning_rate": 9.943923438164387e-06, + "loss": 0.4002, + "step": 922 + }, + { + "epoch": 0.4292357773988529, + "grad_norm": 0.5427140593528748, + "learning_rate": 9.943518587785208e-06, + "loss": 0.3789, + "step": 923 + }, + { + "epoch": 0.4297008215780499, + "grad_norm": 0.4838401675224304, + "learning_rate": 9.943112289529737e-06, + "loss": 0.4048, + "step": 924 + }, + { + "epoch": 0.4301658657572469, + "grad_norm": 0.487448126077652, + "learning_rate": 9.942704543516966e-06, + "loss": 0.4082, + "step": 925 + }, + { + "epoch": 0.430630909936444, + "grad_norm": 0.5517868995666504, + "learning_rate": 9.94229534986632e-06, + "loss": 0.415, + "step": 926 + }, + { + "epoch": 0.431095954115641, + "grad_norm": 0.5240123867988586, + "learning_rate": 9.941884708697644e-06, + "loss": 0.3909, + "step": 927 + }, + { + "epoch": 0.431560998294838, + "grad_norm": 0.5013271570205688, + "learning_rate": 9.941472620131208e-06, + "loss": 0.3914, + "step": 928 + }, + { + "epoch": 0.43202604247403503, + "grad_norm": 0.49305182695388794, + "learning_rate": 9.941059084287708e-06, + "loss": 0.3725, + "step": 929 + }, + { + "epoch": 0.43249108665323205, + "grad_norm": 0.5340526103973389, + "learning_rate": 9.940644101288259e-06, + "loss": 0.3961, + "step": 930 + }, + { + "epoch": 0.43295613083242906, + "grad_norm": 0.5031039118766785, + "learning_rate": 9.940227671254406e-06, + "loss": 0.4108, + "step": 931 + }, + { + "epoch": 0.4334211750116261, + "grad_norm": 0.5576218962669373, + "learning_rate": 9.939809794308111e-06, + "loss": 0.3985, + "step": 932 + }, + { + "epoch": 0.43388621919082315, + "grad_norm": 0.48340535163879395, + "learning_rate": 9.939390470571769e-06, + "loss": 0.4029, + "step": 933 + }, + { + "epoch": 0.43435126337002017, + "grad_norm": 0.5440919399261475, + "learning_rate": 9.938969700168186e-06, + "loss": 0.3998, + "step": 934 + }, + { + "epoch": 0.4348163075492172, + "grad_norm": 0.44848132133483887, + "learning_rate": 9.938547483220602e-06, + "loss": 0.3841, + "step": 935 + }, + { + "epoch": 0.4352813517284142, + "grad_norm": 0.5585298538208008, + "learning_rate": 9.93812381985268e-06, + "loss": 0.4138, + "step": 936 + }, + { + "epoch": 0.4357463959076112, + "grad_norm": 0.5106562376022339, + "learning_rate": 9.9376987101885e-06, + "loss": 0.3974, + "step": 937 + }, + { + "epoch": 0.43621144008680823, + "grad_norm": 0.5067592263221741, + "learning_rate": 9.937272154352573e-06, + "loss": 0.3889, + "step": 938 + }, + { + "epoch": 0.43667648426600525, + "grad_norm": 0.5147528052330017, + "learning_rate": 9.936844152469828e-06, + "loss": 0.3935, + "step": 939 + }, + { + "epoch": 0.4371415284452023, + "grad_norm": 0.48401129245758057, + "learning_rate": 9.936414704665622e-06, + "loss": 0.3967, + "step": 940 + }, + { + "epoch": 0.43760657262439934, + "grad_norm": 0.6152265071868896, + "learning_rate": 9.935983811065732e-06, + "loss": 0.4419, + "step": 941 + }, + { + "epoch": 0.43807161680359635, + "grad_norm": 0.4920692443847656, + "learning_rate": 9.935551471796358e-06, + "loss": 0.3831, + "step": 942 + }, + { + "epoch": 0.43853666098279337, + "grad_norm": 0.4826490581035614, + "learning_rate": 9.935117686984128e-06, + "loss": 0.3911, + "step": 943 + }, + { + "epoch": 0.4390017051619904, + "grad_norm": 0.5583831667900085, + "learning_rate": 9.93468245675609e-06, + "loss": 0.4013, + "step": 944 + }, + { + "epoch": 0.4394667493411874, + "grad_norm": 0.5026276111602783, + "learning_rate": 9.934245781239714e-06, + "loss": 0.3954, + "step": 945 + }, + { + "epoch": 0.4399317935203844, + "grad_norm": 0.6956951022148132, + "learning_rate": 9.933807660562898e-06, + "loss": 0.4083, + "step": 946 + }, + { + "epoch": 0.44039683769958143, + "grad_norm": 0.43628400564193726, + "learning_rate": 9.933368094853958e-06, + "loss": 0.3529, + "step": 947 + }, + { + "epoch": 0.4408618818787785, + "grad_norm": 0.5511034727096558, + "learning_rate": 9.932927084241635e-06, + "loss": 0.4084, + "step": 948 + }, + { + "epoch": 0.4413269260579755, + "grad_norm": 0.5073305368423462, + "learning_rate": 9.932484628855097e-06, + "loss": 0.3879, + "step": 949 + }, + { + "epoch": 0.44179197023717254, + "grad_norm": 0.5978685617446899, + "learning_rate": 9.932040728823928e-06, + "loss": 0.4192, + "step": 950 + }, + { + "epoch": 0.44225701441636955, + "grad_norm": 0.5503034591674805, + "learning_rate": 9.931595384278143e-06, + "loss": 0.4073, + "step": 951 + }, + { + "epoch": 0.44272205859556657, + "grad_norm": 0.5104547739028931, + "learning_rate": 9.931148595348176e-06, + "loss": 0.3689, + "step": 952 + }, + { + "epoch": 0.4431871027747636, + "grad_norm": 0.594742476940155, + "learning_rate": 9.93070036216488e-06, + "loss": 0.418, + "step": 953 + }, + { + "epoch": 0.4436521469539606, + "grad_norm": 0.4962625205516815, + "learning_rate": 9.930250684859542e-06, + "loss": 0.4008, + "step": 954 + }, + { + "epoch": 0.4441171911331577, + "grad_norm": 0.5209368467330933, + "learning_rate": 9.929799563563858e-06, + "loss": 0.3777, + "step": 955 + }, + { + "epoch": 0.4445822353123547, + "grad_norm": 0.5389686226844788, + "learning_rate": 9.929346998409958e-06, + "loss": 0.4047, + "step": 956 + }, + { + "epoch": 0.4450472794915517, + "grad_norm": 0.5484662652015686, + "learning_rate": 9.92889298953039e-06, + "loss": 0.42, + "step": 957 + }, + { + "epoch": 0.4455123236707487, + "grad_norm": 0.5775797963142395, + "learning_rate": 9.928437537058126e-06, + "loss": 0.4016, + "step": 958 + }, + { + "epoch": 0.44597736784994574, + "grad_norm": 0.48525333404541016, + "learning_rate": 9.927980641126562e-06, + "loss": 0.385, + "step": 959 + }, + { + "epoch": 0.44644241202914275, + "grad_norm": 0.6398960947990417, + "learning_rate": 9.927522301869515e-06, + "loss": 0.4008, + "step": 960 + }, + { + "epoch": 0.44690745620833977, + "grad_norm": 0.5224960446357727, + "learning_rate": 9.927062519421223e-06, + "loss": 0.4061, + "step": 961 + }, + { + "epoch": 0.44737250038753684, + "grad_norm": 0.5396710634231567, + "learning_rate": 9.926601293916349e-06, + "loss": 0.4128, + "step": 962 + }, + { + "epoch": 0.44783754456673386, + "grad_norm": 0.540110170841217, + "learning_rate": 9.926138625489981e-06, + "loss": 0.4037, + "step": 963 + }, + { + "epoch": 0.4483025887459309, + "grad_norm": 0.5284585952758789, + "learning_rate": 9.925674514277625e-06, + "loss": 0.3929, + "step": 964 + }, + { + "epoch": 0.4487676329251279, + "grad_norm": 0.5301303863525391, + "learning_rate": 9.925208960415214e-06, + "loss": 0.4064, + "step": 965 + }, + { + "epoch": 0.4492326771043249, + "grad_norm": 0.5405663847923279, + "learning_rate": 9.924741964039098e-06, + "loss": 0.3813, + "step": 966 + }, + { + "epoch": 0.4496977212835219, + "grad_norm": 0.567683756351471, + "learning_rate": 9.924273525286053e-06, + "loss": 0.3801, + "step": 967 + }, + { + "epoch": 0.45016276546271894, + "grad_norm": 0.5406811237335205, + "learning_rate": 9.92380364429328e-06, + "loss": 0.394, + "step": 968 + }, + { + "epoch": 0.45062780964191596, + "grad_norm": 0.6923626065254211, + "learning_rate": 9.923332321198396e-06, + "loss": 0.389, + "step": 969 + }, + { + "epoch": 0.451092853821113, + "grad_norm": 0.5620383620262146, + "learning_rate": 9.922859556139447e-06, + "loss": 0.393, + "step": 970 + }, + { + "epoch": 0.45155789800031004, + "grad_norm": 0.5418940782546997, + "learning_rate": 9.922385349254895e-06, + "loss": 0.4071, + "step": 971 + }, + { + "epoch": 0.45202294217950706, + "grad_norm": 0.5649824738502502, + "learning_rate": 9.921909700683632e-06, + "loss": 0.3882, + "step": 972 + }, + { + "epoch": 0.4524879863587041, + "grad_norm": 0.5053907036781311, + "learning_rate": 9.921432610564962e-06, + "loss": 0.3893, + "step": 973 + }, + { + "epoch": 0.4529530305379011, + "grad_norm": 0.5486416816711426, + "learning_rate": 9.920954079038623e-06, + "loss": 0.4207, + "step": 974 + }, + { + "epoch": 0.4534180747170981, + "grad_norm": 0.4447232186794281, + "learning_rate": 9.920474106244764e-06, + "loss": 0.3837, + "step": 975 + }, + { + "epoch": 0.4538831188962951, + "grad_norm": 0.610753059387207, + "learning_rate": 9.919992692323965e-06, + "loss": 0.3905, + "step": 976 + }, + { + "epoch": 0.4543481630754922, + "grad_norm": 0.5709927678108215, + "learning_rate": 9.919509837417221e-06, + "loss": 0.3985, + "step": 977 + }, + { + "epoch": 0.4548132072546892, + "grad_norm": 0.6064723134040833, + "learning_rate": 9.919025541665955e-06, + "loss": 0.4376, + "step": 978 + }, + { + "epoch": 0.4552782514338862, + "grad_norm": 0.6571378111839294, + "learning_rate": 9.918539805212008e-06, + "loss": 0.3945, + "step": 979 + }, + { + "epoch": 0.45574329561308324, + "grad_norm": 0.528195321559906, + "learning_rate": 9.918052628197645e-06, + "loss": 0.406, + "step": 980 + }, + { + "epoch": 0.45620833979228026, + "grad_norm": 0.6316954493522644, + "learning_rate": 9.917564010765551e-06, + "loss": 0.3779, + "step": 981 + }, + { + "epoch": 0.4566733839714773, + "grad_norm": 0.5615218281745911, + "learning_rate": 9.917073953058836e-06, + "loss": 0.4308, + "step": 982 + }, + { + "epoch": 0.4571384281506743, + "grad_norm": 0.5495420098304749, + "learning_rate": 9.916582455221029e-06, + "loss": 0.4006, + "step": 983 + }, + { + "epoch": 0.45760347232987136, + "grad_norm": 0.5130931735038757, + "learning_rate": 9.916089517396081e-06, + "loss": 0.3881, + "step": 984 + }, + { + "epoch": 0.4580685165090684, + "grad_norm": 0.6450378894805908, + "learning_rate": 9.915595139728366e-06, + "loss": 0.4132, + "step": 985 + }, + { + "epoch": 0.4585335606882654, + "grad_norm": 0.5673415064811707, + "learning_rate": 9.915099322362681e-06, + "loss": 0.4007, + "step": 986 + }, + { + "epoch": 0.4589986048674624, + "grad_norm": 0.5420086979866028, + "learning_rate": 9.91460206544424e-06, + "loss": 0.4066, + "step": 987 + }, + { + "epoch": 0.45946364904665943, + "grad_norm": 0.5733571648597717, + "learning_rate": 9.914103369118682e-06, + "loss": 0.371, + "step": 988 + }, + { + "epoch": 0.45992869322585644, + "grad_norm": 0.5512123107910156, + "learning_rate": 9.913603233532067e-06, + "loss": 0.3742, + "step": 989 + }, + { + "epoch": 0.46039373740505346, + "grad_norm": 0.5796908140182495, + "learning_rate": 9.913101658830879e-06, + "loss": 0.4115, + "step": 990 + }, + { + "epoch": 0.4608587815842505, + "grad_norm": 0.5767021179199219, + "learning_rate": 9.91259864516202e-06, + "loss": 0.4203, + "step": 991 + }, + { + "epoch": 0.46132382576344755, + "grad_norm": 0.5332590937614441, + "learning_rate": 9.912094192672812e-06, + "loss": 0.3986, + "step": 992 + }, + { + "epoch": 0.46178886994264456, + "grad_norm": 0.4922131597995758, + "learning_rate": 9.911588301511004e-06, + "loss": 0.4085, + "step": 993 + }, + { + "epoch": 0.4622539141218416, + "grad_norm": 0.5280721783638, + "learning_rate": 9.911080971824762e-06, + "loss": 0.4059, + "step": 994 + }, + { + "epoch": 0.4627189583010386, + "grad_norm": 0.5713456869125366, + "learning_rate": 9.910572203762676e-06, + "loss": 0.4249, + "step": 995 + }, + { + "epoch": 0.4631840024802356, + "grad_norm": 0.5211708545684814, + "learning_rate": 9.910061997473753e-06, + "loss": 0.3874, + "step": 996 + }, + { + "epoch": 0.46364904665943263, + "grad_norm": 0.5675033330917358, + "learning_rate": 9.909550353107426e-06, + "loss": 0.4086, + "step": 997 + }, + { + "epoch": 0.46411409083862964, + "grad_norm": 0.6571573615074158, + "learning_rate": 9.909037270813547e-06, + "loss": 0.4148, + "step": 998 + }, + { + "epoch": 0.4645791350178267, + "grad_norm": 0.6051549911499023, + "learning_rate": 9.908522750742391e-06, + "loss": 0.4061, + "step": 999 + }, + { + "epoch": 0.46504417919702373, + "grad_norm": 0.5821381211280823, + "learning_rate": 9.90800679304465e-06, + "loss": 0.4033, + "step": 1000 + }, + { + "epoch": 0.46550922337622075, + "grad_norm": 0.6213275194168091, + "learning_rate": 9.907489397871441e-06, + "loss": 0.4018, + "step": 1001 + }, + { + "epoch": 0.46597426755541776, + "grad_norm": 0.5967691540718079, + "learning_rate": 9.9069705653743e-06, + "loss": 0.376, + "step": 1002 + }, + { + "epoch": 0.4664393117346148, + "grad_norm": 0.4699487090110779, + "learning_rate": 9.906450295705188e-06, + "loss": 0.4172, + "step": 1003 + }, + { + "epoch": 0.4669043559138118, + "grad_norm": 0.6863399147987366, + "learning_rate": 9.905928589016479e-06, + "loss": 0.4105, + "step": 1004 + }, + { + "epoch": 0.4673694000930088, + "grad_norm": 0.5886141657829285, + "learning_rate": 9.905405445460972e-06, + "loss": 0.3957, + "step": 1005 + }, + { + "epoch": 0.4678344442722059, + "grad_norm": 0.4847775399684906, + "learning_rate": 9.90488086519189e-06, + "loss": 0.4155, + "step": 1006 + }, + { + "epoch": 0.4682994884514029, + "grad_norm": 0.6540789604187012, + "learning_rate": 9.904354848362876e-06, + "loss": 0.4178, + "step": 1007 + }, + { + "epoch": 0.4687645326305999, + "grad_norm": 0.5532976388931274, + "learning_rate": 9.903827395127987e-06, + "loss": 0.3777, + "step": 1008 + }, + { + "epoch": 0.46922957680979693, + "grad_norm": 0.44451895356178284, + "learning_rate": 9.903298505641707e-06, + "loss": 0.4003, + "step": 1009 + }, + { + "epoch": 0.46969462098899395, + "grad_norm": 0.5525593757629395, + "learning_rate": 9.902768180058942e-06, + "loss": 0.3992, + "step": 1010 + }, + { + "epoch": 0.47015966516819097, + "grad_norm": 0.480257123708725, + "learning_rate": 9.902236418535012e-06, + "loss": 0.3906, + "step": 1011 + }, + { + "epoch": 0.470624709347388, + "grad_norm": 0.5119292140007019, + "learning_rate": 9.901703221225663e-06, + "loss": 0.42, + "step": 1012 + }, + { + "epoch": 0.47108975352658505, + "grad_norm": 0.500453531742096, + "learning_rate": 9.901168588287057e-06, + "loss": 0.4131, + "step": 1013 + }, + { + "epoch": 0.47155479770578207, + "grad_norm": 0.5501608848571777, + "learning_rate": 9.900632519875786e-06, + "loss": 0.3866, + "step": 1014 + }, + { + "epoch": 0.4720198418849791, + "grad_norm": 0.4848511219024658, + "learning_rate": 9.900095016148849e-06, + "loss": 0.4138, + "step": 1015 + }, + { + "epoch": 0.4724848860641761, + "grad_norm": 0.5268184542655945, + "learning_rate": 9.899556077263676e-06, + "loss": 0.3947, + "step": 1016 + }, + { + "epoch": 0.4729499302433731, + "grad_norm": 0.555121660232544, + "learning_rate": 9.899015703378115e-06, + "loss": 0.4129, + "step": 1017 + }, + { + "epoch": 0.47341497442257013, + "grad_norm": 0.5795762538909912, + "learning_rate": 9.898473894650425e-06, + "loss": 0.4121, + "step": 1018 + }, + { + "epoch": 0.47388001860176715, + "grad_norm": 0.515906810760498, + "learning_rate": 9.8979306512393e-06, + "loss": 0.3946, + "step": 1019 + }, + { + "epoch": 0.47434506278096417, + "grad_norm": 0.5714851021766663, + "learning_rate": 9.897385973303845e-06, + "loss": 0.3858, + "step": 1020 + }, + { + "epoch": 0.47481010696016124, + "grad_norm": 0.6117233037948608, + "learning_rate": 9.896839861003588e-06, + "loss": 0.411, + "step": 1021 + }, + { + "epoch": 0.47527515113935825, + "grad_norm": 0.5009344220161438, + "learning_rate": 9.896292314498475e-06, + "loss": 0.3939, + "step": 1022 + }, + { + "epoch": 0.47574019531855527, + "grad_norm": 0.46868664026260376, + "learning_rate": 9.895743333948875e-06, + "loss": 0.412, + "step": 1023 + }, + { + "epoch": 0.4762052394977523, + "grad_norm": 0.6705508828163147, + "learning_rate": 9.895192919515575e-06, + "loss": 0.3981, + "step": 1024 + }, + { + "epoch": 0.4766702836769493, + "grad_norm": 0.535101056098938, + "learning_rate": 9.894641071359784e-06, + "loss": 0.4057, + "step": 1025 + }, + { + "epoch": 0.4771353278561463, + "grad_norm": 0.4733811616897583, + "learning_rate": 9.894087789643123e-06, + "loss": 0.4066, + "step": 1026 + }, + { + "epoch": 0.47760037203534333, + "grad_norm": 0.514105498790741, + "learning_rate": 9.893533074527647e-06, + "loss": 0.3957, + "step": 1027 + }, + { + "epoch": 0.4780654162145404, + "grad_norm": 0.50275719165802, + "learning_rate": 9.892976926175819e-06, + "loss": 0.386, + "step": 1028 + }, + { + "epoch": 0.4785304603937374, + "grad_norm": 0.4467027187347412, + "learning_rate": 9.892419344750528e-06, + "loss": 0.3965, + "step": 1029 + }, + { + "epoch": 0.47899550457293444, + "grad_norm": 0.48868438601493835, + "learning_rate": 9.891860330415077e-06, + "loss": 0.408, + "step": 1030 + }, + { + "epoch": 0.47946054875213145, + "grad_norm": 0.4734288156032562, + "learning_rate": 9.891299883333197e-06, + "loss": 0.3778, + "step": 1031 + }, + { + "epoch": 0.47992559293132847, + "grad_norm": 0.49330592155456543, + "learning_rate": 9.890738003669029e-06, + "loss": 0.3766, + "step": 1032 + }, + { + "epoch": 0.4803906371105255, + "grad_norm": 0.5068243145942688, + "learning_rate": 9.890174691587142e-06, + "loss": 0.3832, + "step": 1033 + }, + { + "epoch": 0.4808556812897225, + "grad_norm": 0.4672917127609253, + "learning_rate": 9.889609947252519e-06, + "loss": 0.3933, + "step": 1034 + }, + { + "epoch": 0.4813207254689196, + "grad_norm": 0.5433964729309082, + "learning_rate": 9.889043770830566e-06, + "loss": 0.378, + "step": 1035 + }, + { + "epoch": 0.4817857696481166, + "grad_norm": 0.4645717740058899, + "learning_rate": 9.888476162487106e-06, + "loss": 0.3803, + "step": 1036 + }, + { + "epoch": 0.4822508138273136, + "grad_norm": 0.5179428458213806, + "learning_rate": 9.887907122388382e-06, + "loss": 0.4093, + "step": 1037 + }, + { + "epoch": 0.4827158580065106, + "grad_norm": 0.5166782140731812, + "learning_rate": 9.887336650701055e-06, + "loss": 0.4107, + "step": 1038 + }, + { + "epoch": 0.48318090218570764, + "grad_norm": 0.4443417489528656, + "learning_rate": 9.886764747592212e-06, + "loss": 0.3626, + "step": 1039 + }, + { + "epoch": 0.48364594636490466, + "grad_norm": 0.5102760195732117, + "learning_rate": 9.886191413229349e-06, + "loss": 0.4014, + "step": 1040 + }, + { + "epoch": 0.48411099054410167, + "grad_norm": 0.5018773674964905, + "learning_rate": 9.885616647780389e-06, + "loss": 0.3895, + "step": 1041 + }, + { + "epoch": 0.4845760347232987, + "grad_norm": 0.5214076638221741, + "learning_rate": 9.88504045141367e-06, + "loss": 0.389, + "step": 1042 + }, + { + "epoch": 0.48504107890249576, + "grad_norm": 0.4620245397090912, + "learning_rate": 9.884462824297952e-06, + "loss": 0.389, + "step": 1043 + }, + { + "epoch": 0.4855061230816928, + "grad_norm": 0.4827862083911896, + "learning_rate": 9.88388376660241e-06, + "loss": 0.4153, + "step": 1044 + }, + { + "epoch": 0.4859711672608898, + "grad_norm": 0.5498597025871277, + "learning_rate": 9.883303278496646e-06, + "loss": 0.4076, + "step": 1045 + }, + { + "epoch": 0.4864362114400868, + "grad_norm": 0.49216824769973755, + "learning_rate": 9.88272136015067e-06, + "loss": 0.4254, + "step": 1046 + }, + { + "epoch": 0.4869012556192838, + "grad_norm": 0.534773588180542, + "learning_rate": 9.882138011734919e-06, + "loss": 0.3948, + "step": 1047 + }, + { + "epoch": 0.48736629979848084, + "grad_norm": 0.5610633492469788, + "learning_rate": 9.881553233420244e-06, + "loss": 0.41, + "step": 1048 + }, + { + "epoch": 0.48783134397767786, + "grad_norm": 0.48247793316841125, + "learning_rate": 9.88096702537792e-06, + "loss": 0.394, + "step": 1049 + }, + { + "epoch": 0.4882963881568749, + "grad_norm": 0.5383715033531189, + "learning_rate": 9.880379387779637e-06, + "loss": 0.3893, + "step": 1050 + }, + { + "epoch": 0.48876143233607194, + "grad_norm": 0.5967577695846558, + "learning_rate": 9.879790320797504e-06, + "loss": 0.373, + "step": 1051 + }, + { + "epoch": 0.48922647651526896, + "grad_norm": 0.4835599660873413, + "learning_rate": 9.879199824604048e-06, + "loss": 0.4082, + "step": 1052 + }, + { + "epoch": 0.489691520694466, + "grad_norm": 0.5845022797584534, + "learning_rate": 9.878607899372217e-06, + "loss": 0.3954, + "step": 1053 + }, + { + "epoch": 0.490156564873663, + "grad_norm": 0.49885934591293335, + "learning_rate": 9.878014545275379e-06, + "loss": 0.4067, + "step": 1054 + }, + { + "epoch": 0.49062160905286, + "grad_norm": 0.5852560997009277, + "learning_rate": 9.877419762487312e-06, + "loss": 0.3932, + "step": 1055 + }, + { + "epoch": 0.491086653232057, + "grad_norm": 0.48509690165519714, + "learning_rate": 9.876823551182223e-06, + "loss": 0.433, + "step": 1056 + }, + { + "epoch": 0.4915516974112541, + "grad_norm": 0.44763508439064026, + "learning_rate": 9.876225911534729e-06, + "loss": 0.3651, + "step": 1057 + }, + { + "epoch": 0.4920167415904511, + "grad_norm": 0.5484299063682556, + "learning_rate": 9.875626843719871e-06, + "loss": 0.3959, + "step": 1058 + }, + { + "epoch": 0.49248178576964813, + "grad_norm": 0.481046199798584, + "learning_rate": 9.875026347913109e-06, + "loss": 0.3765, + "step": 1059 + }, + { + "epoch": 0.49294682994884514, + "grad_norm": 0.5341517925262451, + "learning_rate": 9.874424424290313e-06, + "loss": 0.3963, + "step": 1060 + }, + { + "epoch": 0.49341187412804216, + "grad_norm": 0.469200998544693, + "learning_rate": 9.87382107302778e-06, + "loss": 0.3995, + "step": 1061 + }, + { + "epoch": 0.4938769183072392, + "grad_norm": 0.5132390260696411, + "learning_rate": 9.87321629430222e-06, + "loss": 0.3837, + "step": 1062 + }, + { + "epoch": 0.4943419624864362, + "grad_norm": 0.5703272223472595, + "learning_rate": 9.872610088290766e-06, + "loss": 0.4025, + "step": 1063 + }, + { + "epoch": 0.4948070066656332, + "grad_norm": 0.5358856916427612, + "learning_rate": 9.87200245517096e-06, + "loss": 0.4287, + "step": 1064 + }, + { + "epoch": 0.4952720508448303, + "grad_norm": 0.41558536887168884, + "learning_rate": 9.871393395120774e-06, + "loss": 0.3621, + "step": 1065 + }, + { + "epoch": 0.4957370950240273, + "grad_norm": 0.5119854807853699, + "learning_rate": 9.870782908318591e-06, + "loss": 0.3781, + "step": 1066 + }, + { + "epoch": 0.4962021392032243, + "grad_norm": 0.4697974622249603, + "learning_rate": 9.87017099494321e-06, + "loss": 0.4138, + "step": 1067 + }, + { + "epoch": 0.49666718338242133, + "grad_norm": 0.4632170796394348, + "learning_rate": 9.869557655173849e-06, + "loss": 0.3952, + "step": 1068 + }, + { + "epoch": 0.49713222756161835, + "grad_norm": 0.4312477111816406, + "learning_rate": 9.86894288919015e-06, + "loss": 0.396, + "step": 1069 + }, + { + "epoch": 0.49759727174081536, + "grad_norm": 0.47128409147262573, + "learning_rate": 9.868326697172164e-06, + "loss": 0.4233, + "step": 1070 + }, + { + "epoch": 0.4980623159200124, + "grad_norm": 0.5045310258865356, + "learning_rate": 9.867709079300366e-06, + "loss": 0.3823, + "step": 1071 + }, + { + "epoch": 0.49852736009920945, + "grad_norm": 0.486145943403244, + "learning_rate": 9.867090035755648e-06, + "loss": 0.3714, + "step": 1072 + }, + { + "epoch": 0.49899240427840647, + "grad_norm": 0.4772482216358185, + "learning_rate": 9.866469566719314e-06, + "loss": 0.3799, + "step": 1073 + }, + { + "epoch": 0.4994574484576035, + "grad_norm": 0.546289324760437, + "learning_rate": 9.86584767237309e-06, + "loss": 0.4263, + "step": 1074 + }, + { + "epoch": 0.4999224926368005, + "grad_norm": 0.5823156833648682, + "learning_rate": 9.86522435289912e-06, + "loss": 0.4101, + "step": 1075 + }, + { + "epoch": 0.5003875368159976, + "grad_norm": 0.5479342341423035, + "learning_rate": 9.864599608479963e-06, + "loss": 0.3717, + "step": 1076 + }, + { + "epoch": 0.5008525809951946, + "grad_norm": 0.5319987535476685, + "learning_rate": 9.863973439298597e-06, + "loss": 0.3993, + "step": 1077 + }, + { + "epoch": 0.5013176251743916, + "grad_norm": 0.5460782647132874, + "learning_rate": 9.86334584553842e-06, + "loss": 0.4069, + "step": 1078 + }, + { + "epoch": 0.5017826693535886, + "grad_norm": 0.6070926189422607, + "learning_rate": 9.862716827383238e-06, + "loss": 0.3999, + "step": 1079 + }, + { + "epoch": 0.5022477135327856, + "grad_norm": 0.530796229839325, + "learning_rate": 9.862086385017283e-06, + "loss": 0.3975, + "step": 1080 + }, + { + "epoch": 0.5027127577119826, + "grad_norm": 0.4481984078884125, + "learning_rate": 9.861454518625202e-06, + "loss": 0.3962, + "step": 1081 + }, + { + "epoch": 0.5031778018911797, + "grad_norm": 0.4798201024532318, + "learning_rate": 9.86082122839206e-06, + "loss": 0.3859, + "step": 1082 + }, + { + "epoch": 0.5036428460703767, + "grad_norm": 0.5404430627822876, + "learning_rate": 9.86018651450333e-06, + "loss": 0.3999, + "step": 1083 + }, + { + "epoch": 0.5041078902495737, + "grad_norm": 0.4909595549106598, + "learning_rate": 9.85955037714492e-06, + "loss": 0.3951, + "step": 1084 + }, + { + "epoch": 0.5045729344287707, + "grad_norm": 0.5814231634140015, + "learning_rate": 9.858912816503136e-06, + "loss": 0.4005, + "step": 1085 + }, + { + "epoch": 0.5050379786079677, + "grad_norm": 0.5166906118392944, + "learning_rate": 9.858273832764712e-06, + "loss": 0.4184, + "step": 1086 + }, + { + "epoch": 0.5055030227871647, + "grad_norm": 0.5268685221672058, + "learning_rate": 9.8576334261168e-06, + "loss": 0.4257, + "step": 1087 + }, + { + "epoch": 0.5059680669663618, + "grad_norm": 0.4857880473136902, + "learning_rate": 9.856991596746957e-06, + "loss": 0.3995, + "step": 1088 + }, + { + "epoch": 0.5064331111455588, + "grad_norm": 0.6406244039535522, + "learning_rate": 9.85634834484317e-06, + "loss": 0.4282, + "step": 1089 + }, + { + "epoch": 0.5068981553247559, + "grad_norm": 0.5598058104515076, + "learning_rate": 9.855703670593834e-06, + "loss": 0.3901, + "step": 1090 + }, + { + "epoch": 0.5073631995039529, + "grad_norm": 0.5567111372947693, + "learning_rate": 9.855057574187766e-06, + "loss": 0.4043, + "step": 1091 + }, + { + "epoch": 0.5078282436831499, + "grad_norm": 0.534076452255249, + "learning_rate": 9.854410055814195e-06, + "loss": 0.4011, + "step": 1092 + }, + { + "epoch": 0.508293287862347, + "grad_norm": 0.42775899171829224, + "learning_rate": 9.85376111566277e-06, + "loss": 0.3986, + "step": 1093 + }, + { + "epoch": 0.508758332041544, + "grad_norm": 0.5820823907852173, + "learning_rate": 9.853110753923553e-06, + "loss": 0.4143, + "step": 1094 + }, + { + "epoch": 0.509223376220741, + "grad_norm": 0.46398279070854187, + "learning_rate": 9.852458970787027e-06, + "loss": 0.3816, + "step": 1095 + }, + { + "epoch": 0.509688420399938, + "grad_norm": 0.4692523777484894, + "learning_rate": 9.85180576644409e-06, + "loss": 0.3855, + "step": 1096 + }, + { + "epoch": 0.510153464579135, + "grad_norm": 0.45275288820266724, + "learning_rate": 9.851151141086049e-06, + "loss": 0.3736, + "step": 1097 + }, + { + "epoch": 0.510618508758332, + "grad_norm": 0.6101323962211609, + "learning_rate": 9.850495094904639e-06, + "loss": 0.3844, + "step": 1098 + }, + { + "epoch": 0.511083552937529, + "grad_norm": 0.5054144859313965, + "learning_rate": 9.849837628092003e-06, + "loss": 0.4187, + "step": 1099 + }, + { + "epoch": 0.5115485971167261, + "grad_norm": 0.4948999285697937, + "learning_rate": 9.849178740840701e-06, + "loss": 0.3826, + "step": 1100 + }, + { + "epoch": 0.5120136412959231, + "grad_norm": 0.6334494948387146, + "learning_rate": 9.848518433343714e-06, + "loss": 0.3986, + "step": 1101 + }, + { + "epoch": 0.5124786854751201, + "grad_norm": 0.5066748857498169, + "learning_rate": 9.847856705794432e-06, + "loss": 0.4185, + "step": 1102 + }, + { + "epoch": 0.5129437296543171, + "grad_norm": 0.5187157392501831, + "learning_rate": 9.847193558386666e-06, + "loss": 0.385, + "step": 1103 + }, + { + "epoch": 0.5134087738335141, + "grad_norm": 0.5060462951660156, + "learning_rate": 9.846528991314638e-06, + "loss": 0.415, + "step": 1104 + }, + { + "epoch": 0.5138738180127113, + "grad_norm": 0.418716162443161, + "learning_rate": 9.845863004772994e-06, + "loss": 0.3735, + "step": 1105 + }, + { + "epoch": 0.5143388621919083, + "grad_norm": 0.45739948749542236, + "learning_rate": 9.845195598956787e-06, + "loss": 0.3532, + "step": 1106 + }, + { + "epoch": 0.5148039063711053, + "grad_norm": 0.42475655674934387, + "learning_rate": 9.84452677406149e-06, + "loss": 0.3871, + "step": 1107 + }, + { + "epoch": 0.5152689505503023, + "grad_norm": 0.5018404722213745, + "learning_rate": 9.843856530282992e-06, + "loss": 0.396, + "step": 1108 + }, + { + "epoch": 0.5157339947294993, + "grad_norm": 0.4738996624946594, + "learning_rate": 9.843184867817596e-06, + "loss": 0.4255, + "step": 1109 + }, + { + "epoch": 0.5161990389086963, + "grad_norm": 0.47539880871772766, + "learning_rate": 9.842511786862018e-06, + "loss": 0.4214, + "step": 1110 + }, + { + "epoch": 0.5166640830878934, + "grad_norm": 0.43585407733917236, + "learning_rate": 9.841837287613399e-06, + "loss": 0.3895, + "step": 1111 + }, + { + "epoch": 0.5171291272670904, + "grad_norm": 0.4282119870185852, + "learning_rate": 9.841161370269284e-06, + "loss": 0.3917, + "step": 1112 + }, + { + "epoch": 0.5175941714462874, + "grad_norm": 0.42898568511009216, + "learning_rate": 9.84048403502764e-06, + "loss": 0.3847, + "step": 1113 + }, + { + "epoch": 0.5180592156254844, + "grad_norm": 0.5623050928115845, + "learning_rate": 9.839805282086844e-06, + "loss": 0.415, + "step": 1114 + }, + { + "epoch": 0.5185242598046814, + "grad_norm": 0.4873126447200775, + "learning_rate": 9.839125111645699e-06, + "loss": 0.4168, + "step": 1115 + }, + { + "epoch": 0.5189893039838784, + "grad_norm": 0.4611026644706726, + "learning_rate": 9.83844352390341e-06, + "loss": 0.3892, + "step": 1116 + }, + { + "epoch": 0.5194543481630755, + "grad_norm": 0.5437391996383667, + "learning_rate": 9.837760519059603e-06, + "loss": 0.4169, + "step": 1117 + }, + { + "epoch": 0.5199193923422725, + "grad_norm": 0.47111251950263977, + "learning_rate": 9.83707609731432e-06, + "loss": 0.398, + "step": 1118 + }, + { + "epoch": 0.5203844365214695, + "grad_norm": 0.4614557921886444, + "learning_rate": 9.83639025886802e-06, + "loss": 0.3906, + "step": 1119 + }, + { + "epoch": 0.5208494807006666, + "grad_norm": 0.5393845438957214, + "learning_rate": 9.835703003921569e-06, + "loss": 0.3886, + "step": 1120 + }, + { + "epoch": 0.5213145248798636, + "grad_norm": 0.4539320468902588, + "learning_rate": 9.835014332676256e-06, + "loss": 0.4078, + "step": 1121 + }, + { + "epoch": 0.5217795690590606, + "grad_norm": 0.5154919028282166, + "learning_rate": 9.834324245333782e-06, + "loss": 0.3883, + "step": 1122 + }, + { + "epoch": 0.5222446132382577, + "grad_norm": 0.5002031326293945, + "learning_rate": 9.833632742096259e-06, + "loss": 0.3834, + "step": 1123 + }, + { + "epoch": 0.5227096574174547, + "grad_norm": 0.551112949848175, + "learning_rate": 9.83293982316622e-06, + "loss": 0.4026, + "step": 1124 + }, + { + "epoch": 0.5231747015966517, + "grad_norm": 0.5185533761978149, + "learning_rate": 9.832245488746612e-06, + "loss": 0.3856, + "step": 1125 + }, + { + "epoch": 0.5236397457758487, + "grad_norm": 0.4748762249946594, + "learning_rate": 9.831549739040788e-06, + "loss": 0.3975, + "step": 1126 + }, + { + "epoch": 0.5241047899550457, + "grad_norm": 0.5362114906311035, + "learning_rate": 9.830852574252525e-06, + "loss": 0.4028, + "step": 1127 + }, + { + "epoch": 0.5245698341342427, + "grad_norm": 0.47415444254875183, + "learning_rate": 9.830153994586013e-06, + "loss": 0.3969, + "step": 1128 + }, + { + "epoch": 0.5250348783134398, + "grad_norm": 0.6344930529594421, + "learning_rate": 9.829454000245854e-06, + "loss": 0.4292, + "step": 1129 + }, + { + "epoch": 0.5254999224926368, + "grad_norm": 0.48250138759613037, + "learning_rate": 9.82875259143706e-06, + "loss": 0.3995, + "step": 1130 + }, + { + "epoch": 0.5259649666718338, + "grad_norm": 0.5570687055587769, + "learning_rate": 9.82804976836507e-06, + "loss": 0.3855, + "step": 1131 + }, + { + "epoch": 0.5264300108510308, + "grad_norm": 0.554617166519165, + "learning_rate": 9.827345531235722e-06, + "loss": 0.4045, + "step": 1132 + }, + { + "epoch": 0.5268950550302278, + "grad_norm": 0.544276773929596, + "learning_rate": 9.826639880255282e-06, + "loss": 0.4065, + "step": 1133 + }, + { + "epoch": 0.527360099209425, + "grad_norm": 0.5573837161064148, + "learning_rate": 9.825932815630418e-06, + "loss": 0.4231, + "step": 1134 + }, + { + "epoch": 0.527825143388622, + "grad_norm": 0.4598175883293152, + "learning_rate": 9.825224337568224e-06, + "loss": 0.4033, + "step": 1135 + }, + { + "epoch": 0.528290187567819, + "grad_norm": 0.5561431050300598, + "learning_rate": 9.824514446276197e-06, + "loss": 0.3891, + "step": 1136 + }, + { + "epoch": 0.528755231747016, + "grad_norm": 0.46380776166915894, + "learning_rate": 9.823803141962253e-06, + "loss": 0.3938, + "step": 1137 + }, + { + "epoch": 0.529220275926213, + "grad_norm": 0.5651960968971252, + "learning_rate": 9.823090424834725e-06, + "loss": 0.3797, + "step": 1138 + }, + { + "epoch": 0.52968532010541, + "grad_norm": 0.5517980456352234, + "learning_rate": 9.822376295102352e-06, + "loss": 0.3812, + "step": 1139 + }, + { + "epoch": 0.530150364284607, + "grad_norm": 0.5638952851295471, + "learning_rate": 9.821660752974294e-06, + "loss": 0.3835, + "step": 1140 + }, + { + "epoch": 0.5306154084638041, + "grad_norm": 0.5966677665710449, + "learning_rate": 9.82094379866012e-06, + "loss": 0.4018, + "step": 1141 + }, + { + "epoch": 0.5310804526430011, + "grad_norm": 0.5064060091972351, + "learning_rate": 9.820225432369814e-06, + "loss": 0.3956, + "step": 1142 + }, + { + "epoch": 0.5315454968221981, + "grad_norm": 0.5523810386657715, + "learning_rate": 9.819505654313775e-06, + "loss": 0.3872, + "step": 1143 + }, + { + "epoch": 0.5320105410013951, + "grad_norm": 0.5287060141563416, + "learning_rate": 9.818784464702813e-06, + "loss": 0.4018, + "step": 1144 + }, + { + "epoch": 0.5324755851805921, + "grad_norm": 0.5854330062866211, + "learning_rate": 9.818061863748153e-06, + "loss": 0.4165, + "step": 1145 + }, + { + "epoch": 0.5329406293597891, + "grad_norm": 0.4986901581287384, + "learning_rate": 9.817337851661436e-06, + "loss": 0.3944, + "step": 1146 + }, + { + "epoch": 0.5334056735389862, + "grad_norm": 0.51901775598526, + "learning_rate": 9.81661242865471e-06, + "loss": 0.4012, + "step": 1147 + }, + { + "epoch": 0.5338707177181832, + "grad_norm": 0.4892278015613556, + "learning_rate": 9.815885594940442e-06, + "loss": 0.3884, + "step": 1148 + }, + { + "epoch": 0.5343357618973803, + "grad_norm": 0.4971807599067688, + "learning_rate": 9.815157350731506e-06, + "loss": 0.3971, + "step": 1149 + }, + { + "epoch": 0.5348008060765773, + "grad_norm": 0.49850407242774963, + "learning_rate": 9.814427696241197e-06, + "loss": 0.3975, + "step": 1150 + }, + { + "epoch": 0.5352658502557743, + "grad_norm": 0.49632036685943604, + "learning_rate": 9.813696631683216e-06, + "loss": 0.3923, + "step": 1151 + }, + { + "epoch": 0.5357308944349713, + "grad_norm": 0.5661031007766724, + "learning_rate": 9.812964157271683e-06, + "loss": 0.376, + "step": 1152 + }, + { + "epoch": 0.5361959386141684, + "grad_norm": 0.48331817984580994, + "learning_rate": 9.812230273221124e-06, + "loss": 0.41, + "step": 1153 + }, + { + "epoch": 0.5366609827933654, + "grad_norm": 0.4824008047580719, + "learning_rate": 9.811494979746486e-06, + "loss": 0.397, + "step": 1154 + }, + { + "epoch": 0.5371260269725624, + "grad_norm": 0.47880852222442627, + "learning_rate": 9.81075827706312e-06, + "loss": 0.3718, + "step": 1155 + }, + { + "epoch": 0.5375910711517594, + "grad_norm": 0.4963204562664032, + "learning_rate": 9.810020165386797e-06, + "loss": 0.4233, + "step": 1156 + }, + { + "epoch": 0.5380561153309564, + "grad_norm": 0.5019689202308655, + "learning_rate": 9.809280644933698e-06, + "loss": 0.4058, + "step": 1157 + }, + { + "epoch": 0.5385211595101534, + "grad_norm": 0.5196408033370972, + "learning_rate": 9.808539715920415e-06, + "loss": 0.4112, + "step": 1158 + }, + { + "epoch": 0.5389862036893505, + "grad_norm": 0.48643895983695984, + "learning_rate": 9.807797378563957e-06, + "loss": 0.397, + "step": 1159 + }, + { + "epoch": 0.5394512478685475, + "grad_norm": 0.49545997381210327, + "learning_rate": 9.80705363308174e-06, + "loss": 0.3896, + "step": 1160 + }, + { + "epoch": 0.5399162920477445, + "grad_norm": 0.5133934617042542, + "learning_rate": 9.806308479691595e-06, + "loss": 0.3979, + "step": 1161 + }, + { + "epoch": 0.5403813362269415, + "grad_norm": 0.47291284799575806, + "learning_rate": 9.805561918611766e-06, + "loss": 0.3839, + "step": 1162 + }, + { + "epoch": 0.5408463804061386, + "grad_norm": 0.4970063865184784, + "learning_rate": 9.804813950060909e-06, + "loss": 0.383, + "step": 1163 + }, + { + "epoch": 0.5413114245853357, + "grad_norm": 0.46584850549697876, + "learning_rate": 9.804064574258092e-06, + "loss": 0.3885, + "step": 1164 + }, + { + "epoch": 0.5417764687645327, + "grad_norm": 0.5352959632873535, + "learning_rate": 9.803313791422793e-06, + "loss": 0.4041, + "step": 1165 + }, + { + "epoch": 0.5422415129437297, + "grad_norm": 0.5488241910934448, + "learning_rate": 9.802561601774905e-06, + "loss": 0.3859, + "step": 1166 + }, + { + "epoch": 0.5427065571229267, + "grad_norm": 0.45431554317474365, + "learning_rate": 9.801808005534734e-06, + "loss": 0.4022, + "step": 1167 + }, + { + "epoch": 0.5431716013021237, + "grad_norm": 0.5047218203544617, + "learning_rate": 9.801053002922994e-06, + "loss": 0.3866, + "step": 1168 + }, + { + "epoch": 0.5436366454813207, + "grad_norm": 0.5306795239448547, + "learning_rate": 9.800296594160814e-06, + "loss": 0.3917, + "step": 1169 + }, + { + "epoch": 0.5441016896605178, + "grad_norm": 0.48350954055786133, + "learning_rate": 9.799538779469734e-06, + "loss": 0.4058, + "step": 1170 + }, + { + "epoch": 0.5445667338397148, + "grad_norm": 0.5609748959541321, + "learning_rate": 9.798779559071706e-06, + "loss": 0.3784, + "step": 1171 + }, + { + "epoch": 0.5450317780189118, + "grad_norm": 0.4680517315864563, + "learning_rate": 9.798018933189089e-06, + "loss": 0.4218, + "step": 1172 + }, + { + "epoch": 0.5454968221981088, + "grad_norm": 0.4936869442462921, + "learning_rate": 9.797256902044666e-06, + "loss": 0.3879, + "step": 1173 + }, + { + "epoch": 0.5459618663773058, + "grad_norm": 0.48776108026504517, + "learning_rate": 9.796493465861613e-06, + "loss": 0.4006, + "step": 1174 + }, + { + "epoch": 0.5464269105565028, + "grad_norm": 0.49075883626937866, + "learning_rate": 9.795728624863539e-06, + "loss": 0.4062, + "step": 1175 + }, + { + "epoch": 0.5468919547356998, + "grad_norm": 0.4694967269897461, + "learning_rate": 9.794962379274448e-06, + "loss": 0.3955, + "step": 1176 + }, + { + "epoch": 0.5473569989148969, + "grad_norm": 0.5107150077819824, + "learning_rate": 9.79419472931876e-06, + "loss": 0.4148, + "step": 1177 + }, + { + "epoch": 0.547822043094094, + "grad_norm": 0.4907507002353668, + "learning_rate": 9.793425675221308e-06, + "loss": 0.3937, + "step": 1178 + }, + { + "epoch": 0.548287087273291, + "grad_norm": 0.6052386164665222, + "learning_rate": 9.79265521720734e-06, + "loss": 0.3849, + "step": 1179 + }, + { + "epoch": 0.548752131452488, + "grad_norm": 0.5238395929336548, + "learning_rate": 9.791883355502503e-06, + "loss": 0.4057, + "step": 1180 + }, + { + "epoch": 0.549217175631685, + "grad_norm": 0.5540690422058105, + "learning_rate": 9.791110090332866e-06, + "loss": 0.3914, + "step": 1181 + }, + { + "epoch": 0.5496822198108821, + "grad_norm": 0.5288317203521729, + "learning_rate": 9.79033542192491e-06, + "loss": 0.3932, + "step": 1182 + }, + { + "epoch": 0.5501472639900791, + "grad_norm": 0.5038856863975525, + "learning_rate": 9.789559350505515e-06, + "loss": 0.398, + "step": 1183 + }, + { + "epoch": 0.5506123081692761, + "grad_norm": 0.45208224654197693, + "learning_rate": 9.788781876301988e-06, + "loss": 0.4266, + "step": 1184 + }, + { + "epoch": 0.5510773523484731, + "grad_norm": 0.5108579993247986, + "learning_rate": 9.78800299954203e-06, + "loss": 0.4126, + "step": 1185 + }, + { + "epoch": 0.5515423965276701, + "grad_norm": 0.45797398686408997, + "learning_rate": 9.787222720453769e-06, + "loss": 0.4055, + "step": 1186 + }, + { + "epoch": 0.5520074407068671, + "grad_norm": 0.43965256214141846, + "learning_rate": 9.78644103926573e-06, + "loss": 0.3847, + "step": 1187 + }, + { + "epoch": 0.5524724848860642, + "grad_norm": 0.5082615613937378, + "learning_rate": 9.78565795620686e-06, + "loss": 0.3791, + "step": 1188 + }, + { + "epoch": 0.5529375290652612, + "grad_norm": 0.5475904941558838, + "learning_rate": 9.784873471506509e-06, + "loss": 0.379, + "step": 1189 + }, + { + "epoch": 0.5534025732444582, + "grad_norm": 0.5068783760070801, + "learning_rate": 9.784087585394437e-06, + "loss": 0.422, + "step": 1190 + }, + { + "epoch": 0.5538676174236552, + "grad_norm": 0.48961615562438965, + "learning_rate": 9.783300298100822e-06, + "loss": 0.4116, + "step": 1191 + }, + { + "epoch": 0.5543326616028522, + "grad_norm": 0.46741366386413574, + "learning_rate": 9.782511609856244e-06, + "loss": 0.4024, + "step": 1192 + }, + { + "epoch": 0.5547977057820493, + "grad_norm": 0.4937583804130554, + "learning_rate": 9.7817215208917e-06, + "loss": 0.3889, + "step": 1193 + }, + { + "epoch": 0.5552627499612464, + "grad_norm": 0.5000990033149719, + "learning_rate": 9.780930031438594e-06, + "loss": 0.4062, + "step": 1194 + }, + { + "epoch": 0.5557277941404434, + "grad_norm": 0.5129031538963318, + "learning_rate": 9.780137141728737e-06, + "loss": 0.3849, + "step": 1195 + }, + { + "epoch": 0.5561928383196404, + "grad_norm": 0.46762484312057495, + "learning_rate": 9.779342851994356e-06, + "loss": 0.3877, + "step": 1196 + }, + { + "epoch": 0.5566578824988374, + "grad_norm": 0.4732123911380768, + "learning_rate": 9.778547162468087e-06, + "loss": 0.3746, + "step": 1197 + }, + { + "epoch": 0.5571229266780344, + "grad_norm": 0.44343671202659607, + "learning_rate": 9.77775007338297e-06, + "loss": 0.3878, + "step": 1198 + }, + { + "epoch": 0.5575879708572314, + "grad_norm": 0.45024973154067993, + "learning_rate": 9.776951584972464e-06, + "loss": 0.3907, + "step": 1199 + }, + { + "epoch": 0.5580530150364285, + "grad_norm": 0.4816182255744934, + "learning_rate": 9.776151697470431e-06, + "loss": 0.4158, + "step": 1200 + }, + { + "epoch": 0.5585180592156255, + "grad_norm": 0.4887128472328186, + "learning_rate": 9.775350411111145e-06, + "loss": 0.4127, + "step": 1201 + }, + { + "epoch": 0.5589831033948225, + "grad_norm": 0.520602285861969, + "learning_rate": 9.77454772612929e-06, + "loss": 0.4024, + "step": 1202 + }, + { + "epoch": 0.5594481475740195, + "grad_norm": 0.4444476366043091, + "learning_rate": 9.773743642759961e-06, + "loss": 0.4007, + "step": 1203 + }, + { + "epoch": 0.5599131917532165, + "grad_norm": 0.5314255356788635, + "learning_rate": 9.77293816123866e-06, + "loss": 0.4009, + "step": 1204 + }, + { + "epoch": 0.5603782359324135, + "grad_norm": 0.5080925226211548, + "learning_rate": 9.7721312818013e-06, + "loss": 0.3741, + "step": 1205 + }, + { + "epoch": 0.5608432801116106, + "grad_norm": 0.4945761263370514, + "learning_rate": 9.7713230046842e-06, + "loss": 0.4204, + "step": 1206 + }, + { + "epoch": 0.5613083242908077, + "grad_norm": 0.5242900848388672, + "learning_rate": 9.770513330124094e-06, + "loss": 0.3796, + "step": 1207 + }, + { + "epoch": 0.5617733684700047, + "grad_norm": 0.48071667551994324, + "learning_rate": 9.769702258358123e-06, + "loss": 0.4133, + "step": 1208 + }, + { + "epoch": 0.5622384126492017, + "grad_norm": 0.497071772813797, + "learning_rate": 9.768889789623833e-06, + "loss": 0.3845, + "step": 1209 + }, + { + "epoch": 0.5627034568283987, + "grad_norm": 0.5164023041725159, + "learning_rate": 9.768075924159185e-06, + "loss": 0.3954, + "step": 1210 + }, + { + "epoch": 0.5631685010075957, + "grad_norm": 0.46989476680755615, + "learning_rate": 9.767260662202551e-06, + "loss": 0.4029, + "step": 1211 + }, + { + "epoch": 0.5636335451867928, + "grad_norm": 0.5252077579498291, + "learning_rate": 9.766444003992704e-06, + "loss": 0.3907, + "step": 1212 + }, + { + "epoch": 0.5640985893659898, + "grad_norm": 0.5021181106567383, + "learning_rate": 9.765625949768828e-06, + "loss": 0.3868, + "step": 1213 + }, + { + "epoch": 0.5645636335451868, + "grad_norm": 0.4650791585445404, + "learning_rate": 9.764806499770521e-06, + "loss": 0.3805, + "step": 1214 + }, + { + "epoch": 0.5650286777243838, + "grad_norm": 0.47574383020401, + "learning_rate": 9.763985654237785e-06, + "loss": 0.3812, + "step": 1215 + }, + { + "epoch": 0.5654937219035808, + "grad_norm": 0.44989126920700073, + "learning_rate": 9.763163413411034e-06, + "loss": 0.3736, + "step": 1216 + }, + { + "epoch": 0.5659587660827778, + "grad_norm": 0.5040990710258484, + "learning_rate": 9.762339777531088e-06, + "loss": 0.3744, + "step": 1217 + }, + { + "epoch": 0.5664238102619749, + "grad_norm": 0.5459234118461609, + "learning_rate": 9.761514746839176e-06, + "loss": 0.3916, + "step": 1218 + }, + { + "epoch": 0.5668888544411719, + "grad_norm": 0.48448291420936584, + "learning_rate": 9.760688321576938e-06, + "loss": 0.4425, + "step": 1219 + }, + { + "epoch": 0.5673538986203689, + "grad_norm": 0.5275092720985413, + "learning_rate": 9.759860501986417e-06, + "loss": 0.3998, + "step": 1220 + }, + { + "epoch": 0.5678189427995659, + "grad_norm": 0.6015114188194275, + "learning_rate": 9.759031288310072e-06, + "loss": 0.4155, + "step": 1221 + }, + { + "epoch": 0.568283986978763, + "grad_norm": 0.47165900468826294, + "learning_rate": 9.758200680790764e-06, + "loss": 0.387, + "step": 1222 + }, + { + "epoch": 0.56874903115796, + "grad_norm": 0.592989444732666, + "learning_rate": 9.757368679671764e-06, + "loss": 0.3973, + "step": 1223 + }, + { + "epoch": 0.5692140753371571, + "grad_norm": 0.4669657349586487, + "learning_rate": 9.756535285196754e-06, + "loss": 0.3762, + "step": 1224 + }, + { + "epoch": 0.5696791195163541, + "grad_norm": 0.48349782824516296, + "learning_rate": 9.755700497609819e-06, + "loss": 0.4043, + "step": 1225 + }, + { + "epoch": 0.5701441636955511, + "grad_norm": 0.5364663600921631, + "learning_rate": 9.754864317155455e-06, + "loss": 0.4216, + "step": 1226 + }, + { + "epoch": 0.5706092078747481, + "grad_norm": 0.5064231753349304, + "learning_rate": 9.754026744078569e-06, + "loss": 0.4004, + "step": 1227 + }, + { + "epoch": 0.5710742520539451, + "grad_norm": 0.48567453026771545, + "learning_rate": 9.753187778624467e-06, + "loss": 0.3985, + "step": 1228 + }, + { + "epoch": 0.5715392962331421, + "grad_norm": 0.5714061856269836, + "learning_rate": 9.752347421038873e-06, + "loss": 0.3836, + "step": 1229 + }, + { + "epoch": 0.5720043404123392, + "grad_norm": 0.4642956256866455, + "learning_rate": 9.751505671567914e-06, + "loss": 0.3919, + "step": 1230 + }, + { + "epoch": 0.5724693845915362, + "grad_norm": 0.4755660891532898, + "learning_rate": 9.750662530458121e-06, + "loss": 0.3881, + "step": 1231 + }, + { + "epoch": 0.5729344287707332, + "grad_norm": 0.48824259638786316, + "learning_rate": 9.749817997956438e-06, + "loss": 0.3871, + "step": 1232 + }, + { + "epoch": 0.5733994729499302, + "grad_norm": 0.49661746621131897, + "learning_rate": 9.748972074310217e-06, + "loss": 0.3709, + "step": 1233 + }, + { + "epoch": 0.5738645171291272, + "grad_norm": 0.4752178490161896, + "learning_rate": 9.748124759767215e-06, + "loss": 0.3907, + "step": 1234 + }, + { + "epoch": 0.5743295613083242, + "grad_norm": 0.47395604848861694, + "learning_rate": 9.747276054575593e-06, + "loss": 0.416, + "step": 1235 + }, + { + "epoch": 0.5747946054875213, + "grad_norm": 0.4589254558086395, + "learning_rate": 9.746425958983925e-06, + "loss": 0.4059, + "step": 1236 + }, + { + "epoch": 0.5752596496667184, + "grad_norm": 0.42140620946884155, + "learning_rate": 9.745574473241193e-06, + "loss": 0.3705, + "step": 1237 + }, + { + "epoch": 0.5757246938459154, + "grad_norm": 0.45815083384513855, + "learning_rate": 9.744721597596778e-06, + "loss": 0.4068, + "step": 1238 + }, + { + "epoch": 0.5761897380251124, + "grad_norm": 0.45956507325172424, + "learning_rate": 9.743867332300478e-06, + "loss": 0.3637, + "step": 1239 + }, + { + "epoch": 0.5766547822043094, + "grad_norm": 0.49071380496025085, + "learning_rate": 9.743011677602493e-06, + "loss": 0.375, + "step": 1240 + }, + { + "epoch": 0.5771198263835065, + "grad_norm": 0.4949245750904083, + "learning_rate": 9.742154633753428e-06, + "loss": 0.378, + "step": 1241 + }, + { + "epoch": 0.5775848705627035, + "grad_norm": 0.5535846948623657, + "learning_rate": 9.741296201004298e-06, + "loss": 0.3901, + "step": 1242 + }, + { + "epoch": 0.5780499147419005, + "grad_norm": 0.44817158579826355, + "learning_rate": 9.740436379606524e-06, + "loss": 0.4016, + "step": 1243 + }, + { + "epoch": 0.5785149589210975, + "grad_norm": 0.5253201723098755, + "learning_rate": 9.739575169811934e-06, + "loss": 0.38, + "step": 1244 + }, + { + "epoch": 0.5789800031002945, + "grad_norm": 0.6024049520492554, + "learning_rate": 9.738712571872765e-06, + "loss": 0.3948, + "step": 1245 + }, + { + "epoch": 0.5794450472794915, + "grad_norm": 0.41917502880096436, + "learning_rate": 9.737848586041652e-06, + "loss": 0.3869, + "step": 1246 + }, + { + "epoch": 0.5799100914586885, + "grad_norm": 0.5947693586349487, + "learning_rate": 9.736983212571646e-06, + "loss": 0.3991, + "step": 1247 + }, + { + "epoch": 0.5803751356378856, + "grad_norm": 0.6152756810188293, + "learning_rate": 9.736116451716203e-06, + "loss": 0.4075, + "step": 1248 + }, + { + "epoch": 0.5808401798170826, + "grad_norm": 0.48533540964126587, + "learning_rate": 9.735248303729178e-06, + "loss": 0.3881, + "step": 1249 + }, + { + "epoch": 0.5813052239962796, + "grad_norm": 0.6273863315582275, + "learning_rate": 9.734378768864843e-06, + "loss": 0.3723, + "step": 1250 + }, + { + "epoch": 0.5817702681754767, + "grad_norm": 0.5250769257545471, + "learning_rate": 9.733507847377866e-06, + "loss": 0.3853, + "step": 1251 + }, + { + "epoch": 0.5822353123546737, + "grad_norm": 0.49252286553382874, + "learning_rate": 9.73263553952333e-06, + "loss": 0.3889, + "step": 1252 + }, + { + "epoch": 0.5827003565338708, + "grad_norm": 0.5308325886726379, + "learning_rate": 9.731761845556713e-06, + "loss": 0.3936, + "step": 1253 + }, + { + "epoch": 0.5831654007130678, + "grad_norm": 0.5030363202095032, + "learning_rate": 9.730886765733914e-06, + "loss": 0.3786, + "step": 1254 + }, + { + "epoch": 0.5836304448922648, + "grad_norm": 0.46710026264190674, + "learning_rate": 9.730010300311226e-06, + "loss": 0.4055, + "step": 1255 + }, + { + "epoch": 0.5840954890714618, + "grad_norm": 0.5640740394592285, + "learning_rate": 9.72913244954535e-06, + "loss": 0.4313, + "step": 1256 + }, + { + "epoch": 0.5845605332506588, + "grad_norm": 0.43550801277160645, + "learning_rate": 9.728253213693395e-06, + "loss": 0.3745, + "step": 1257 + }, + { + "epoch": 0.5850255774298558, + "grad_norm": 0.5071004629135132, + "learning_rate": 9.727372593012875e-06, + "loss": 0.4029, + "step": 1258 + }, + { + "epoch": 0.5854906216090529, + "grad_norm": 0.5142107605934143, + "learning_rate": 9.72649058776171e-06, + "loss": 0.3912, + "step": 1259 + }, + { + "epoch": 0.5859556657882499, + "grad_norm": 0.4210042655467987, + "learning_rate": 9.725607198198227e-06, + "loss": 0.3884, + "step": 1260 + }, + { + "epoch": 0.5864207099674469, + "grad_norm": 0.47974395751953125, + "learning_rate": 9.724722424581154e-06, + "loss": 0.3834, + "step": 1261 + }, + { + "epoch": 0.5868857541466439, + "grad_norm": 0.49919867515563965, + "learning_rate": 9.723836267169626e-06, + "loss": 0.394, + "step": 1262 + }, + { + "epoch": 0.5873507983258409, + "grad_norm": 0.46105697751045227, + "learning_rate": 9.722948726223185e-06, + "loss": 0.383, + "step": 1263 + }, + { + "epoch": 0.5878158425050379, + "grad_norm": 0.5934450030326843, + "learning_rate": 9.72205980200178e-06, + "loss": 0.3716, + "step": 1264 + }, + { + "epoch": 0.588280886684235, + "grad_norm": 0.5127496719360352, + "learning_rate": 9.72116949476576e-06, + "loss": 0.3684, + "step": 1265 + }, + { + "epoch": 0.5887459308634321, + "grad_norm": 0.4598689675331116, + "learning_rate": 9.720277804775879e-06, + "loss": 0.4068, + "step": 1266 + }, + { + "epoch": 0.5892109750426291, + "grad_norm": 0.5856637358665466, + "learning_rate": 9.719384732293302e-06, + "loss": 0.3637, + "step": 1267 + }, + { + "epoch": 0.5896760192218261, + "grad_norm": 0.5395975708961487, + "learning_rate": 9.718490277579595e-06, + "loss": 0.3902, + "step": 1268 + }, + { + "epoch": 0.5901410634010231, + "grad_norm": 0.46271106600761414, + "learning_rate": 9.71759444089673e-06, + "loss": 0.374, + "step": 1269 + }, + { + "epoch": 0.5906061075802201, + "grad_norm": 0.5440768003463745, + "learning_rate": 9.716697222507081e-06, + "loss": 0.3957, + "step": 1270 + }, + { + "epoch": 0.5910711517594172, + "grad_norm": 0.5980409979820251, + "learning_rate": 9.715798622673429e-06, + "loss": 0.3879, + "step": 1271 + }, + { + "epoch": 0.5915361959386142, + "grad_norm": 0.5140041708946228, + "learning_rate": 9.71489864165896e-06, + "loss": 0.3859, + "step": 1272 + }, + { + "epoch": 0.5920012401178112, + "grad_norm": 0.6424972414970398, + "learning_rate": 9.713997279727265e-06, + "loss": 0.4169, + "step": 1273 + }, + { + "epoch": 0.5924662842970082, + "grad_norm": 0.5433067679405212, + "learning_rate": 9.713094537142336e-06, + "loss": 0.3957, + "step": 1274 + }, + { + "epoch": 0.5929313284762052, + "grad_norm": 0.5382039546966553, + "learning_rate": 9.712190414168573e-06, + "loss": 0.3878, + "step": 1275 + }, + { + "epoch": 0.5933963726554022, + "grad_norm": 0.5087778568267822, + "learning_rate": 9.711284911070777e-06, + "loss": 0.4115, + "step": 1276 + }, + { + "epoch": 0.5938614168345993, + "grad_norm": 0.5524824261665344, + "learning_rate": 9.71037802811416e-06, + "loss": 0.3898, + "step": 1277 + }, + { + "epoch": 0.5943264610137963, + "grad_norm": 0.579130232334137, + "learning_rate": 9.709469765564328e-06, + "loss": 0.3868, + "step": 1278 + }, + { + "epoch": 0.5947915051929933, + "grad_norm": 0.5298312306404114, + "learning_rate": 9.708560123687298e-06, + "loss": 0.4156, + "step": 1279 + }, + { + "epoch": 0.5952565493721904, + "grad_norm": 0.5209439396858215, + "learning_rate": 9.707649102749488e-06, + "loss": 0.384, + "step": 1280 + }, + { + "epoch": 0.5957215935513874, + "grad_norm": 0.5347442030906677, + "learning_rate": 9.706736703017725e-06, + "loss": 0.3886, + "step": 1281 + }, + { + "epoch": 0.5961866377305844, + "grad_norm": 0.5219368934631348, + "learning_rate": 9.705822924759235e-06, + "loss": 0.388, + "step": 1282 + }, + { + "epoch": 0.5966516819097815, + "grad_norm": 0.618898868560791, + "learning_rate": 9.704907768241648e-06, + "loss": 0.3944, + "step": 1283 + }, + { + "epoch": 0.5971167260889785, + "grad_norm": 0.5122677087783813, + "learning_rate": 9.703991233732995e-06, + "loss": 0.3902, + "step": 1284 + }, + { + "epoch": 0.5975817702681755, + "grad_norm": 0.5251182317733765, + "learning_rate": 9.70307332150172e-06, + "loss": 0.4144, + "step": 1285 + }, + { + "epoch": 0.5980468144473725, + "grad_norm": 0.5196467041969299, + "learning_rate": 9.702154031816659e-06, + "loss": 0.3837, + "step": 1286 + }, + { + "epoch": 0.5985118586265695, + "grad_norm": 0.5128290057182312, + "learning_rate": 9.701233364947062e-06, + "loss": 0.3989, + "step": 1287 + }, + { + "epoch": 0.5989769028057665, + "grad_norm": 0.5336976051330566, + "learning_rate": 9.700311321162577e-06, + "loss": 0.3915, + "step": 1288 + }, + { + "epoch": 0.5994419469849636, + "grad_norm": 0.6457362771034241, + "learning_rate": 9.69938790073325e-06, + "loss": 0.3883, + "step": 1289 + }, + { + "epoch": 0.5999069911641606, + "grad_norm": 0.5426515340805054, + "learning_rate": 9.698463103929542e-06, + "loss": 0.4055, + "step": 1290 + }, + { + "epoch": 0.6003720353433576, + "grad_norm": 0.49356475472450256, + "learning_rate": 9.697536931022308e-06, + "loss": 0.3999, + "step": 1291 + }, + { + "epoch": 0.6008370795225546, + "grad_norm": 0.5430606007575989, + "learning_rate": 9.69660938228281e-06, + "loss": 0.3807, + "step": 1292 + }, + { + "epoch": 0.6013021237017516, + "grad_norm": 0.5589854121208191, + "learning_rate": 9.695680457982713e-06, + "loss": 0.4153, + "step": 1293 + }, + { + "epoch": 0.6017671678809486, + "grad_norm": 0.6247249841690063, + "learning_rate": 9.694750158394081e-06, + "loss": 0.4118, + "step": 1294 + }, + { + "epoch": 0.6022322120601458, + "grad_norm": 0.6207286715507507, + "learning_rate": 9.693818483789386e-06, + "loss": 0.3833, + "step": 1295 + }, + { + "epoch": 0.6026972562393428, + "grad_norm": 0.5082651376724243, + "learning_rate": 9.692885434441498e-06, + "loss": 0.4056, + "step": 1296 + }, + { + "epoch": 0.6031623004185398, + "grad_norm": 0.5024476051330566, + "learning_rate": 9.691951010623696e-06, + "loss": 0.3943, + "step": 1297 + }, + { + "epoch": 0.6036273445977368, + "grad_norm": 0.5954005718231201, + "learning_rate": 9.691015212609654e-06, + "loss": 0.3939, + "step": 1298 + }, + { + "epoch": 0.6040923887769338, + "grad_norm": 0.6106828451156616, + "learning_rate": 9.690078040673454e-06, + "loss": 0.387, + "step": 1299 + }, + { + "epoch": 0.6045574329561308, + "grad_norm": 0.5513212084770203, + "learning_rate": 9.689139495089575e-06, + "loss": 0.3904, + "step": 1300 + }, + { + "epoch": 0.6050224771353279, + "grad_norm": 0.6389715671539307, + "learning_rate": 9.688199576132905e-06, + "loss": 0.3904, + "step": 1301 + }, + { + "epoch": 0.6054875213145249, + "grad_norm": 0.4722382426261902, + "learning_rate": 9.687258284078733e-06, + "loss": 0.393, + "step": 1302 + }, + { + "epoch": 0.6059525654937219, + "grad_norm": 0.5727136731147766, + "learning_rate": 9.686315619202743e-06, + "loss": 0.394, + "step": 1303 + }, + { + "epoch": 0.6064176096729189, + "grad_norm": 0.6134669780731201, + "learning_rate": 9.685371581781029e-06, + "loss": 0.3903, + "step": 1304 + }, + { + "epoch": 0.6068826538521159, + "grad_norm": 0.5710572600364685, + "learning_rate": 9.684426172090084e-06, + "loss": 0.4115, + "step": 1305 + }, + { + "epoch": 0.6073476980313129, + "grad_norm": 0.5541101098060608, + "learning_rate": 9.683479390406803e-06, + "loss": 0.3935, + "step": 1306 + }, + { + "epoch": 0.60781274221051, + "grad_norm": 0.5224672555923462, + "learning_rate": 9.682531237008483e-06, + "loss": 0.3989, + "step": 1307 + }, + { + "epoch": 0.608277786389707, + "grad_norm": 0.5431579351425171, + "learning_rate": 9.681581712172824e-06, + "loss": 0.3995, + "step": 1308 + }, + { + "epoch": 0.608742830568904, + "grad_norm": 0.5996741056442261, + "learning_rate": 9.680630816177924e-06, + "loss": 0.4012, + "step": 1309 + }, + { + "epoch": 0.6092078747481011, + "grad_norm": 0.5371108055114746, + "learning_rate": 9.679678549302287e-06, + "loss": 0.404, + "step": 1310 + }, + { + "epoch": 0.6096729189272981, + "grad_norm": 0.4972769320011139, + "learning_rate": 9.678724911824815e-06, + "loss": 0.3939, + "step": 1311 + }, + { + "epoch": 0.6101379631064952, + "grad_norm": 0.7065192461013794, + "learning_rate": 9.677769904024815e-06, + "loss": 0.3968, + "step": 1312 + }, + { + "epoch": 0.6106030072856922, + "grad_norm": 0.5725729465484619, + "learning_rate": 9.676813526181989e-06, + "loss": 0.4079, + "step": 1313 + }, + { + "epoch": 0.6110680514648892, + "grad_norm": 0.5286259651184082, + "learning_rate": 9.675855778576448e-06, + "loss": 0.3818, + "step": 1314 + }, + { + "epoch": 0.6115330956440862, + "grad_norm": 0.4823831617832184, + "learning_rate": 9.674896661488702e-06, + "loss": 0.3856, + "step": 1315 + }, + { + "epoch": 0.6119981398232832, + "grad_norm": 0.5325484871864319, + "learning_rate": 9.673936175199657e-06, + "loss": 0.3887, + "step": 1316 + }, + { + "epoch": 0.6124631840024802, + "grad_norm": 0.5285559296607971, + "learning_rate": 9.672974319990627e-06, + "loss": 0.4181, + "step": 1317 + }, + { + "epoch": 0.6129282281816772, + "grad_norm": 0.5194781422615051, + "learning_rate": 9.672011096143323e-06, + "loss": 0.4084, + "step": 1318 + }, + { + "epoch": 0.6133932723608743, + "grad_norm": 0.5809126496315002, + "learning_rate": 9.671046503939857e-06, + "loss": 0.3985, + "step": 1319 + }, + { + "epoch": 0.6138583165400713, + "grad_norm": 0.443920373916626, + "learning_rate": 9.670080543662742e-06, + "loss": 0.386, + "step": 1320 + }, + { + "epoch": 0.6143233607192683, + "grad_norm": 0.5228665471076965, + "learning_rate": 9.669113215594892e-06, + "loss": 0.391, + "step": 1321 + }, + { + "epoch": 0.6147884048984653, + "grad_norm": 0.504928469657898, + "learning_rate": 9.668144520019622e-06, + "loss": 0.402, + "step": 1322 + }, + { + "epoch": 0.6152534490776623, + "grad_norm": 0.5686684250831604, + "learning_rate": 9.667174457220648e-06, + "loss": 0.3837, + "step": 1323 + }, + { + "epoch": 0.6157184932568595, + "grad_norm": 0.5331105589866638, + "learning_rate": 9.666203027482086e-06, + "loss": 0.3919, + "step": 1324 + }, + { + "epoch": 0.6161835374360565, + "grad_norm": 0.5707592368125916, + "learning_rate": 9.665230231088451e-06, + "loss": 0.3985, + "step": 1325 + }, + { + "epoch": 0.6166485816152535, + "grad_norm": 0.5226622223854065, + "learning_rate": 9.664256068324657e-06, + "loss": 0.399, + "step": 1326 + }, + { + "epoch": 0.6171136257944505, + "grad_norm": 0.5462534427642822, + "learning_rate": 9.663280539476026e-06, + "loss": 0.399, + "step": 1327 + }, + { + "epoch": 0.6175786699736475, + "grad_norm": 0.4789917469024658, + "learning_rate": 9.66230364482827e-06, + "loss": 0.3907, + "step": 1328 + }, + { + "epoch": 0.6180437141528445, + "grad_norm": 0.45184510946273804, + "learning_rate": 9.661325384667508e-06, + "loss": 0.3753, + "step": 1329 + }, + { + "epoch": 0.6185087583320416, + "grad_norm": 0.5181789398193359, + "learning_rate": 9.660345759280254e-06, + "loss": 0.3864, + "step": 1330 + }, + { + "epoch": 0.6189738025112386, + "grad_norm": 0.504653811454773, + "learning_rate": 9.659364768953426e-06, + "loss": 0.3747, + "step": 1331 + }, + { + "epoch": 0.6194388466904356, + "grad_norm": 0.47670918703079224, + "learning_rate": 9.65838241397434e-06, + "loss": 0.3764, + "step": 1332 + }, + { + "epoch": 0.6199038908696326, + "grad_norm": 0.5914029479026794, + "learning_rate": 9.657398694630713e-06, + "loss": 0.4123, + "step": 1333 + }, + { + "epoch": 0.6203689350488296, + "grad_norm": 0.443935751914978, + "learning_rate": 9.656413611210657e-06, + "loss": 0.4088, + "step": 1334 + }, + { + "epoch": 0.6208339792280266, + "grad_norm": 0.4935535788536072, + "learning_rate": 9.655427164002692e-06, + "loss": 0.4064, + "step": 1335 + }, + { + "epoch": 0.6212990234072236, + "grad_norm": 0.41665172576904297, + "learning_rate": 9.654439353295728e-06, + "loss": 0.3886, + "step": 1336 + }, + { + "epoch": 0.6217640675864207, + "grad_norm": 0.524173378944397, + "learning_rate": 9.653450179379081e-06, + "loss": 0.4056, + "step": 1337 + }, + { + "epoch": 0.6222291117656177, + "grad_norm": 0.497357040643692, + "learning_rate": 9.65245964254246e-06, + "loss": 0.4009, + "step": 1338 + }, + { + "epoch": 0.6226941559448148, + "grad_norm": 0.46005746722221375, + "learning_rate": 9.651467743075984e-06, + "loss": 0.3666, + "step": 1339 + }, + { + "epoch": 0.6231592001240118, + "grad_norm": 0.5369196534156799, + "learning_rate": 9.650474481270159e-06, + "loss": 0.3952, + "step": 1340 + }, + { + "epoch": 0.6236242443032088, + "grad_norm": 0.5053161978721619, + "learning_rate": 9.649479857415896e-06, + "loss": 0.4066, + "step": 1341 + }, + { + "epoch": 0.6240892884824059, + "grad_norm": 0.50250643491745, + "learning_rate": 9.648483871804506e-06, + "loss": 0.3863, + "step": 1342 + }, + { + "epoch": 0.6245543326616029, + "grad_norm": 0.540744960308075, + "learning_rate": 9.647486524727696e-06, + "loss": 0.3592, + "step": 1343 + }, + { + "epoch": 0.6250193768407999, + "grad_norm": 0.4549930989742279, + "learning_rate": 9.646487816477575e-06, + "loss": 0.4106, + "step": 1344 + }, + { + "epoch": 0.6254844210199969, + "grad_norm": 0.5376303791999817, + "learning_rate": 9.645487747346643e-06, + "loss": 0.3856, + "step": 1345 + }, + { + "epoch": 0.6259494651991939, + "grad_norm": 0.5555692911148071, + "learning_rate": 9.644486317627808e-06, + "loss": 0.3952, + "step": 1346 + }, + { + "epoch": 0.6264145093783909, + "grad_norm": 0.48616769909858704, + "learning_rate": 9.643483527614372e-06, + "loss": 0.3639, + "step": 1347 + }, + { + "epoch": 0.626879553557588, + "grad_norm": 0.5379546284675598, + "learning_rate": 9.642479377600036e-06, + "loss": 0.3779, + "step": 1348 + }, + { + "epoch": 0.627344597736785, + "grad_norm": 0.5338390469551086, + "learning_rate": 9.641473867878898e-06, + "loss": 0.3819, + "step": 1349 + }, + { + "epoch": 0.627809641915982, + "grad_norm": 0.4833146929740906, + "learning_rate": 9.640466998745456e-06, + "loss": 0.38, + "step": 1350 + }, + { + "epoch": 0.628274686095179, + "grad_norm": 0.5547218918800354, + "learning_rate": 9.639458770494608e-06, + "loss": 0.3975, + "step": 1351 + }, + { + "epoch": 0.628739730274376, + "grad_norm": 0.4861772060394287, + "learning_rate": 9.638449183421644e-06, + "loss": 0.3833, + "step": 1352 + }, + { + "epoch": 0.629204774453573, + "grad_norm": 0.48040562868118286, + "learning_rate": 9.637438237822256e-06, + "loss": 0.3903, + "step": 1353 + }, + { + "epoch": 0.6296698186327702, + "grad_norm": 0.49921414256095886, + "learning_rate": 9.636425933992536e-06, + "loss": 0.4055, + "step": 1354 + }, + { + "epoch": 0.6301348628119672, + "grad_norm": 0.5730396509170532, + "learning_rate": 9.63541227222897e-06, + "loss": 0.3813, + "step": 1355 + }, + { + "epoch": 0.6305999069911642, + "grad_norm": 0.49284279346466064, + "learning_rate": 9.634397252828444e-06, + "loss": 0.3643, + "step": 1356 + }, + { + "epoch": 0.6310649511703612, + "grad_norm": 0.5882490277290344, + "learning_rate": 9.63338087608824e-06, + "loss": 0.3976, + "step": 1357 + }, + { + "epoch": 0.6315299953495582, + "grad_norm": 0.5045767426490784, + "learning_rate": 9.632363142306036e-06, + "loss": 0.4051, + "step": 1358 + }, + { + "epoch": 0.6319950395287552, + "grad_norm": 0.5159674882888794, + "learning_rate": 9.631344051779914e-06, + "loss": 0.3642, + "step": 1359 + }, + { + "epoch": 0.6324600837079523, + "grad_norm": 0.6393707990646362, + "learning_rate": 9.630323604808344e-06, + "loss": 0.4197, + "step": 1360 + }, + { + "epoch": 0.6329251278871493, + "grad_norm": 0.46538159251213074, + "learning_rate": 9.629301801690205e-06, + "loss": 0.3911, + "step": 1361 + }, + { + "epoch": 0.6333901720663463, + "grad_norm": 0.5735601782798767, + "learning_rate": 9.62827864272476e-06, + "loss": 0.401, + "step": 1362 + }, + { + "epoch": 0.6338552162455433, + "grad_norm": 0.44864973425865173, + "learning_rate": 9.62725412821168e-06, + "loss": 0.3685, + "step": 1363 + }, + { + "epoch": 0.6343202604247403, + "grad_norm": 0.46898770332336426, + "learning_rate": 9.626228258451027e-06, + "loss": 0.3885, + "step": 1364 + }, + { + "epoch": 0.6347853046039373, + "grad_norm": 0.4860677421092987, + "learning_rate": 9.625201033743262e-06, + "loss": 0.3992, + "step": 1365 + }, + { + "epoch": 0.6352503487831344, + "grad_norm": 0.47478920221328735, + "learning_rate": 9.62417245438924e-06, + "loss": 0.4055, + "step": 1366 + }, + { + "epoch": 0.6357153929623314, + "grad_norm": 0.5337481498718262, + "learning_rate": 9.623142520690219e-06, + "loss": 0.3892, + "step": 1367 + }, + { + "epoch": 0.6361804371415285, + "grad_norm": 0.5208576321601868, + "learning_rate": 9.622111232947847e-06, + "loss": 0.3763, + "step": 1368 + }, + { + "epoch": 0.6366454813207255, + "grad_norm": 0.490197092294693, + "learning_rate": 9.621078591464174e-06, + "loss": 0.4076, + "step": 1369 + }, + { + "epoch": 0.6371105254999225, + "grad_norm": 0.5205878615379333, + "learning_rate": 9.620044596541642e-06, + "loss": 0.3825, + "step": 1370 + }, + { + "epoch": 0.6375755696791195, + "grad_norm": 0.4583769142627716, + "learning_rate": 9.61900924848309e-06, + "loss": 0.3924, + "step": 1371 + }, + { + "epoch": 0.6380406138583166, + "grad_norm": 0.4924391806125641, + "learning_rate": 9.617972547591759e-06, + "loss": 0.4019, + "step": 1372 + }, + { + "epoch": 0.6385056580375136, + "grad_norm": 0.5124062895774841, + "learning_rate": 9.616934494171277e-06, + "loss": 0.3913, + "step": 1373 + }, + { + "epoch": 0.6389707022167106, + "grad_norm": 0.5022979974746704, + "learning_rate": 9.615895088525677e-06, + "loss": 0.3889, + "step": 1374 + }, + { + "epoch": 0.6394357463959076, + "grad_norm": 0.42661193013191223, + "learning_rate": 9.614854330959382e-06, + "loss": 0.3649, + "step": 1375 + }, + { + "epoch": 0.6399007905751046, + "grad_norm": 0.6055207848548889, + "learning_rate": 9.613812221777212e-06, + "loss": 0.3975, + "step": 1376 + }, + { + "epoch": 0.6403658347543016, + "grad_norm": 0.6047768592834473, + "learning_rate": 9.612768761284386e-06, + "loss": 0.3953, + "step": 1377 + }, + { + "epoch": 0.6408308789334987, + "grad_norm": 0.4532669186592102, + "learning_rate": 9.611723949786517e-06, + "loss": 0.3992, + "step": 1378 + }, + { + "epoch": 0.6412959231126957, + "grad_norm": 0.45877280831336975, + "learning_rate": 9.610677787589611e-06, + "loss": 0.4096, + "step": 1379 + }, + { + "epoch": 0.6417609672918927, + "grad_norm": 0.599404513835907, + "learning_rate": 9.609630275000072e-06, + "loss": 0.4022, + "step": 1380 + }, + { + "epoch": 0.6422260114710897, + "grad_norm": 0.5159070491790771, + "learning_rate": 9.608581412324701e-06, + "loss": 0.3832, + "step": 1381 + }, + { + "epoch": 0.6426910556502867, + "grad_norm": 0.45905065536499023, + "learning_rate": 9.607531199870692e-06, + "loss": 0.3963, + "step": 1382 + }, + { + "epoch": 0.6431560998294839, + "grad_norm": 0.6395869851112366, + "learning_rate": 9.606479637945635e-06, + "loss": 0.4031, + "step": 1383 + }, + { + "epoch": 0.6436211440086809, + "grad_norm": 0.5056988000869751, + "learning_rate": 9.60542672685752e-06, + "loss": 0.3899, + "step": 1384 + }, + { + "epoch": 0.6440861881878779, + "grad_norm": 0.56319260597229, + "learning_rate": 9.604372466914717e-06, + "loss": 0.388, + "step": 1385 + }, + { + "epoch": 0.6445512323670749, + "grad_norm": 0.6158375144004822, + "learning_rate": 9.603316858426014e-06, + "loss": 0.3957, + "step": 1386 + }, + { + "epoch": 0.6450162765462719, + "grad_norm": 0.4532202184200287, + "learning_rate": 9.60225990170057e-06, + "loss": 0.3801, + "step": 1387 + }, + { + "epoch": 0.6454813207254689, + "grad_norm": 0.6414798498153687, + "learning_rate": 9.60120159704796e-06, + "loss": 0.3984, + "step": 1388 + }, + { + "epoch": 0.645946364904666, + "grad_norm": 0.4911268651485443, + "learning_rate": 9.600141944778139e-06, + "loss": 0.3868, + "step": 1389 + }, + { + "epoch": 0.646411409083863, + "grad_norm": 0.4856494963169098, + "learning_rate": 9.599080945201462e-06, + "loss": 0.421, + "step": 1390 + }, + { + "epoch": 0.64687645326306, + "grad_norm": 0.49976852536201477, + "learning_rate": 9.598018598628682e-06, + "loss": 0.4045, + "step": 1391 + }, + { + "epoch": 0.647341497442257, + "grad_norm": 0.4958864748477936, + "learning_rate": 9.59695490537094e-06, + "loss": 0.3759, + "step": 1392 + }, + { + "epoch": 0.647806541621454, + "grad_norm": 0.4640512764453888, + "learning_rate": 9.595889865739774e-06, + "loss": 0.3725, + "step": 1393 + }, + { + "epoch": 0.648271585800651, + "grad_norm": 0.4767007529735565, + "learning_rate": 9.594823480047118e-06, + "loss": 0.3764, + "step": 1394 + }, + { + "epoch": 0.648736629979848, + "grad_norm": 0.509242594242096, + "learning_rate": 9.5937557486053e-06, + "loss": 0.374, + "step": 1395 + }, + { + "epoch": 0.6492016741590451, + "grad_norm": 0.49067145586013794, + "learning_rate": 9.59268667172704e-06, + "loss": 0.3969, + "step": 1396 + }, + { + "epoch": 0.6496667183382422, + "grad_norm": 0.5714993476867676, + "learning_rate": 9.591616249725456e-06, + "loss": 0.3665, + "step": 1397 + }, + { + "epoch": 0.6501317625174392, + "grad_norm": 0.5996623635292053, + "learning_rate": 9.590544482914052e-06, + "loss": 0.3722, + "step": 1398 + }, + { + "epoch": 0.6505968066966362, + "grad_norm": 0.4260701835155487, + "learning_rate": 9.589471371606735e-06, + "loss": 0.3707, + "step": 1399 + }, + { + "epoch": 0.6510618508758332, + "grad_norm": 0.6318243741989136, + "learning_rate": 9.5883969161178e-06, + "loss": 0.4004, + "step": 1400 + }, + { + "epoch": 0.6515268950550303, + "grad_norm": 0.5253607630729675, + "learning_rate": 9.587321116761938e-06, + "loss": 0.3856, + "step": 1401 + }, + { + "epoch": 0.6519919392342273, + "grad_norm": 0.4526579976081848, + "learning_rate": 9.586243973854234e-06, + "loss": 0.4028, + "step": 1402 + }, + { + "epoch": 0.6524569834134243, + "grad_norm": 0.5988115668296814, + "learning_rate": 9.585165487710167e-06, + "loss": 0.3829, + "step": 1403 + }, + { + "epoch": 0.6529220275926213, + "grad_norm": 0.47281304001808167, + "learning_rate": 9.584085658645604e-06, + "loss": 0.3911, + "step": 1404 + }, + { + "epoch": 0.6533870717718183, + "grad_norm": 0.48429980874061584, + "learning_rate": 9.583004486976813e-06, + "loss": 0.3721, + "step": 1405 + }, + { + "epoch": 0.6538521159510153, + "grad_norm": 0.5094161033630371, + "learning_rate": 9.58192197302045e-06, + "loss": 0.3976, + "step": 1406 + }, + { + "epoch": 0.6543171601302123, + "grad_norm": 0.41321924328804016, + "learning_rate": 9.580838117093564e-06, + "loss": 0.3871, + "step": 1407 + }, + { + "epoch": 0.6547822043094094, + "grad_norm": 0.5384671092033386, + "learning_rate": 9.579752919513602e-06, + "loss": 0.4158, + "step": 1408 + }, + { + "epoch": 0.6552472484886064, + "grad_norm": 0.5002989768981934, + "learning_rate": 9.5786663805984e-06, + "loss": 0.3833, + "step": 1409 + }, + { + "epoch": 0.6557122926678034, + "grad_norm": 0.4137031137943268, + "learning_rate": 9.577578500666187e-06, + "loss": 0.3849, + "step": 1410 + }, + { + "epoch": 0.6561773368470004, + "grad_norm": 0.5422747731208801, + "learning_rate": 9.576489280035584e-06, + "loss": 0.4073, + "step": 1411 + }, + { + "epoch": 0.6566423810261975, + "grad_norm": 0.49949032068252563, + "learning_rate": 9.57539871902561e-06, + "loss": 0.3724, + "step": 1412 + }, + { + "epoch": 0.6571074252053946, + "grad_norm": 0.4298678934574127, + "learning_rate": 9.574306817955669e-06, + "loss": 0.4176, + "step": 1413 + }, + { + "epoch": 0.6575724693845916, + "grad_norm": 0.48693087697029114, + "learning_rate": 9.57321357714556e-06, + "loss": 0.404, + "step": 1414 + }, + { + "epoch": 0.6580375135637886, + "grad_norm": 0.5552796721458435, + "learning_rate": 9.572118996915482e-06, + "loss": 0.4036, + "step": 1415 + }, + { + "epoch": 0.6585025577429856, + "grad_norm": 0.4444204866886139, + "learning_rate": 9.571023077586012e-06, + "loss": 0.391, + "step": 1416 + }, + { + "epoch": 0.6589676019221826, + "grad_norm": 0.46437713503837585, + "learning_rate": 9.569925819478132e-06, + "loss": 0.4061, + "step": 1417 + }, + { + "epoch": 0.6594326461013796, + "grad_norm": 0.47912266850471497, + "learning_rate": 9.56882722291321e-06, + "loss": 0.3957, + "step": 1418 + }, + { + "epoch": 0.6598976902805767, + "grad_norm": 0.4714813530445099, + "learning_rate": 9.567727288213005e-06, + "loss": 0.4099, + "step": 1419 + }, + { + "epoch": 0.6603627344597737, + "grad_norm": 0.4540778398513794, + "learning_rate": 9.566626015699673e-06, + "loss": 0.3901, + "step": 1420 + }, + { + "epoch": 0.6608277786389707, + "grad_norm": 0.47948190569877625, + "learning_rate": 9.565523405695756e-06, + "loss": 0.3996, + "step": 1421 + }, + { + "epoch": 0.6612928228181677, + "grad_norm": 0.4244531989097595, + "learning_rate": 9.564419458524193e-06, + "loss": 0.3919, + "step": 1422 + }, + { + "epoch": 0.6617578669973647, + "grad_norm": 0.4253584146499634, + "learning_rate": 9.563314174508312e-06, + "loss": 0.3817, + "step": 1423 + }, + { + "epoch": 0.6622229111765617, + "grad_norm": 0.4504033923149109, + "learning_rate": 9.56220755397183e-06, + "loss": 0.3648, + "step": 1424 + }, + { + "epoch": 0.6626879553557588, + "grad_norm": 0.4773210883140564, + "learning_rate": 9.561099597238862e-06, + "loss": 0.385, + "step": 1425 + }, + { + "epoch": 0.6631529995349558, + "grad_norm": 0.4333602488040924, + "learning_rate": 9.559990304633906e-06, + "loss": 0.3873, + "step": 1426 + }, + { + "epoch": 0.6636180437141529, + "grad_norm": 0.48123425245285034, + "learning_rate": 9.55887967648186e-06, + "loss": 0.3683, + "step": 1427 + }, + { + "epoch": 0.6640830878933499, + "grad_norm": 0.5246723890304565, + "learning_rate": 9.557767713108009e-06, + "loss": 0.3702, + "step": 1428 + }, + { + "epoch": 0.6645481320725469, + "grad_norm": 0.4679216146469116, + "learning_rate": 9.556654414838025e-06, + "loss": 0.3839, + "step": 1429 + }, + { + "epoch": 0.6650131762517439, + "grad_norm": 0.5367023944854736, + "learning_rate": 9.555539781997978e-06, + "loss": 0.3777, + "step": 1430 + }, + { + "epoch": 0.665478220430941, + "grad_norm": 0.48515602946281433, + "learning_rate": 9.554423814914324e-06, + "loss": 0.3749, + "step": 1431 + }, + { + "epoch": 0.665943264610138, + "grad_norm": 0.46489590406417847, + "learning_rate": 9.553306513913915e-06, + "loss": 0.3707, + "step": 1432 + }, + { + "epoch": 0.666408308789335, + "grad_norm": 0.4412256181240082, + "learning_rate": 9.552187879323987e-06, + "loss": 0.3971, + "step": 1433 + }, + { + "epoch": 0.666873352968532, + "grad_norm": 0.4211561977863312, + "learning_rate": 9.551067911472172e-06, + "loss": 0.39, + "step": 1434 + }, + { + "epoch": 0.667338397147729, + "grad_norm": 0.48854532837867737, + "learning_rate": 9.549946610686488e-06, + "loss": 0.4125, + "step": 1435 + }, + { + "epoch": 0.667803441326926, + "grad_norm": 0.47784653306007385, + "learning_rate": 9.548823977295348e-06, + "loss": 0.3656, + "step": 1436 + }, + { + "epoch": 0.6682684855061231, + "grad_norm": 0.4391965866088867, + "learning_rate": 9.547700011627552e-06, + "loss": 0.3929, + "step": 1437 + }, + { + "epoch": 0.6687335296853201, + "grad_norm": 0.4571762979030609, + "learning_rate": 9.546574714012291e-06, + "loss": 0.4102, + "step": 1438 + }, + { + "epoch": 0.6691985738645171, + "grad_norm": 0.4999452531337738, + "learning_rate": 9.545448084779148e-06, + "loss": 0.3953, + "step": 1439 + }, + { + "epoch": 0.6696636180437141, + "grad_norm": 0.5316603779792786, + "learning_rate": 9.544320124258093e-06, + "loss": 0.4101, + "step": 1440 + }, + { + "epoch": 0.6701286622229112, + "grad_norm": 0.502018928527832, + "learning_rate": 9.543190832779488e-06, + "loss": 0.394, + "step": 1441 + }, + { + "epoch": 0.6705937064021082, + "grad_norm": 0.49598726630210876, + "learning_rate": 9.542060210674084e-06, + "loss": 0.4067, + "step": 1442 + }, + { + "epoch": 0.6710587505813053, + "grad_norm": 0.5268638134002686, + "learning_rate": 9.540928258273021e-06, + "loss": 0.3724, + "step": 1443 + }, + { + "epoch": 0.6715237947605023, + "grad_norm": 0.63950514793396, + "learning_rate": 9.539794975907831e-06, + "loss": 0.393, + "step": 1444 + }, + { + "epoch": 0.6719888389396993, + "grad_norm": 0.45709770917892456, + "learning_rate": 9.538660363910433e-06, + "loss": 0.3779, + "step": 1445 + }, + { + "epoch": 0.6724538831188963, + "grad_norm": 0.6071939468383789, + "learning_rate": 9.537524422613135e-06, + "loss": 0.3989, + "step": 1446 + }, + { + "epoch": 0.6729189272980933, + "grad_norm": 0.5442386865615845, + "learning_rate": 9.53638715234864e-06, + "loss": 0.3666, + "step": 1447 + }, + { + "epoch": 0.6733839714772903, + "grad_norm": 0.5082783102989197, + "learning_rate": 9.535248553450031e-06, + "loss": 0.4001, + "step": 1448 + }, + { + "epoch": 0.6738490156564874, + "grad_norm": 0.5492592453956604, + "learning_rate": 9.53410862625079e-06, + "loss": 0.3681, + "step": 1449 + }, + { + "epoch": 0.6743140598356844, + "grad_norm": 0.5500537157058716, + "learning_rate": 9.532967371084778e-06, + "loss": 0.4094, + "step": 1450 + }, + { + "epoch": 0.6747791040148814, + "grad_norm": 0.528328001499176, + "learning_rate": 9.531824788286255e-06, + "loss": 0.4036, + "step": 1451 + }, + { + "epoch": 0.6752441481940784, + "grad_norm": 0.49432677030563354, + "learning_rate": 9.53068087818986e-06, + "loss": 0.4007, + "step": 1452 + }, + { + "epoch": 0.6757091923732754, + "grad_norm": 0.48525556921958923, + "learning_rate": 9.52953564113063e-06, + "loss": 0.3855, + "step": 1453 + }, + { + "epoch": 0.6761742365524724, + "grad_norm": 0.5153108835220337, + "learning_rate": 9.528389077443985e-06, + "loss": 0.3863, + "step": 1454 + }, + { + "epoch": 0.6766392807316695, + "grad_norm": 0.48874610662460327, + "learning_rate": 9.527241187465735e-06, + "loss": 0.3963, + "step": 1455 + }, + { + "epoch": 0.6771043249108666, + "grad_norm": 0.5944570302963257, + "learning_rate": 9.526091971532075e-06, + "loss": 0.3698, + "step": 1456 + }, + { + "epoch": 0.6775693690900636, + "grad_norm": 0.4809949994087219, + "learning_rate": 9.524941429979597e-06, + "loss": 0.3884, + "step": 1457 + }, + { + "epoch": 0.6780344132692606, + "grad_norm": 0.5726730227470398, + "learning_rate": 9.523789563145274e-06, + "loss": 0.3955, + "step": 1458 + }, + { + "epoch": 0.6784994574484576, + "grad_norm": 0.5116811394691467, + "learning_rate": 9.522636371366467e-06, + "loss": 0.3916, + "step": 1459 + }, + { + "epoch": 0.6789645016276546, + "grad_norm": 0.4806547462940216, + "learning_rate": 9.521481854980928e-06, + "loss": 0.3909, + "step": 1460 + }, + { + "epoch": 0.6794295458068517, + "grad_norm": 0.5462357997894287, + "learning_rate": 9.520326014326799e-06, + "loss": 0.3862, + "step": 1461 + }, + { + "epoch": 0.6798945899860487, + "grad_norm": 0.44127699732780457, + "learning_rate": 9.519168849742603e-06, + "loss": 0.395, + "step": 1462 + }, + { + "epoch": 0.6803596341652457, + "grad_norm": 0.5247107148170471, + "learning_rate": 9.518010361567259e-06, + "loss": 0.3771, + "step": 1463 + }, + { + "epoch": 0.6808246783444427, + "grad_norm": 0.49513915181159973, + "learning_rate": 9.516850550140064e-06, + "loss": 0.3896, + "step": 1464 + }, + { + "epoch": 0.6812897225236397, + "grad_norm": 0.4648475646972656, + "learning_rate": 9.515689415800713e-06, + "loss": 0.3866, + "step": 1465 + }, + { + "epoch": 0.6817547667028367, + "grad_norm": 0.5785578489303589, + "learning_rate": 9.514526958889279e-06, + "loss": 0.4013, + "step": 1466 + }, + { + "epoch": 0.6822198108820338, + "grad_norm": 0.5234367847442627, + "learning_rate": 9.51336317974623e-06, + "loss": 0.398, + "step": 1467 + }, + { + "epoch": 0.6826848550612308, + "grad_norm": 0.5856658220291138, + "learning_rate": 9.512198078712417e-06, + "loss": 0.3752, + "step": 1468 + }, + { + "epoch": 0.6831498992404278, + "grad_norm": 0.5868757367134094, + "learning_rate": 9.511031656129079e-06, + "loss": 0.3974, + "step": 1469 + }, + { + "epoch": 0.6836149434196248, + "grad_norm": 0.5716127157211304, + "learning_rate": 9.509863912337843e-06, + "loss": 0.4253, + "step": 1470 + }, + { + "epoch": 0.6840799875988219, + "grad_norm": 0.49978068470954895, + "learning_rate": 9.50869484768072e-06, + "loss": 0.4068, + "step": 1471 + }, + { + "epoch": 0.684545031778019, + "grad_norm": 0.5351032614707947, + "learning_rate": 9.507524462500112e-06, + "loss": 0.3921, + "step": 1472 + }, + { + "epoch": 0.685010075957216, + "grad_norm": 0.5618773102760315, + "learning_rate": 9.506352757138806e-06, + "loss": 0.3945, + "step": 1473 + }, + { + "epoch": 0.685475120136413, + "grad_norm": 0.6486148834228516, + "learning_rate": 9.505179731939975e-06, + "loss": 0.4109, + "step": 1474 + }, + { + "epoch": 0.68594016431561, + "grad_norm": 0.5250218510627747, + "learning_rate": 9.504005387247178e-06, + "loss": 0.413, + "step": 1475 + }, + { + "epoch": 0.686405208494807, + "grad_norm": 0.5226433277130127, + "learning_rate": 9.502829723404363e-06, + "loss": 0.3724, + "step": 1476 + }, + { + "epoch": 0.686870252674004, + "grad_norm": 0.5525023341178894, + "learning_rate": 9.50165274075586e-06, + "loss": 0.3952, + "step": 1477 + }, + { + "epoch": 0.687335296853201, + "grad_norm": 0.5429186820983887, + "learning_rate": 9.500474439646394e-06, + "loss": 0.3966, + "step": 1478 + }, + { + "epoch": 0.6878003410323981, + "grad_norm": 0.47291281819343567, + "learning_rate": 9.499294820421064e-06, + "loss": 0.415, + "step": 1479 + }, + { + "epoch": 0.6882653852115951, + "grad_norm": 0.6125563383102417, + "learning_rate": 9.498113883425364e-06, + "loss": 0.3928, + "step": 1480 + }, + { + "epoch": 0.6887304293907921, + "grad_norm": 0.48280781507492065, + "learning_rate": 9.496931629005171e-06, + "loss": 0.3946, + "step": 1481 + }, + { + "epoch": 0.6891954735699891, + "grad_norm": 0.5750291347503662, + "learning_rate": 9.49574805750675e-06, + "loss": 0.3947, + "step": 1482 + }, + { + "epoch": 0.6896605177491861, + "grad_norm": 0.5800142288208008, + "learning_rate": 9.494563169276747e-06, + "loss": 0.4352, + "step": 1483 + }, + { + "epoch": 0.6901255619283831, + "grad_norm": 0.469500869512558, + "learning_rate": 9.493376964662197e-06, + "loss": 0.3659, + "step": 1484 + }, + { + "epoch": 0.6905906061075803, + "grad_norm": 0.5604742765426636, + "learning_rate": 9.492189444010522e-06, + "loss": 0.3868, + "step": 1485 + }, + { + "epoch": 0.6910556502867773, + "grad_norm": 0.5544446706771851, + "learning_rate": 9.491000607669525e-06, + "loss": 0.4125, + "step": 1486 + }, + { + "epoch": 0.6915206944659743, + "grad_norm": 0.5978662371635437, + "learning_rate": 9.489810455987398e-06, + "loss": 0.374, + "step": 1487 + }, + { + "epoch": 0.6919857386451713, + "grad_norm": 0.4866108298301697, + "learning_rate": 9.488618989312719e-06, + "loss": 0.3797, + "step": 1488 + }, + { + "epoch": 0.6924507828243683, + "grad_norm": 0.5635210275650024, + "learning_rate": 9.487426207994445e-06, + "loss": 0.3627, + "step": 1489 + }, + { + "epoch": 0.6929158270035654, + "grad_norm": 0.534838855266571, + "learning_rate": 9.486232112381926e-06, + "loss": 0.397, + "step": 1490 + }, + { + "epoch": 0.6933808711827624, + "grad_norm": 0.5099875926971436, + "learning_rate": 9.485036702824892e-06, + "loss": 0.397, + "step": 1491 + }, + { + "epoch": 0.6938459153619594, + "grad_norm": 0.6172494888305664, + "learning_rate": 9.483839979673459e-06, + "loss": 0.4001, + "step": 1492 + }, + { + "epoch": 0.6943109595411564, + "grad_norm": 0.5031498670578003, + "learning_rate": 9.482641943278127e-06, + "loss": 0.395, + "step": 1493 + }, + { + "epoch": 0.6947760037203534, + "grad_norm": 0.5278390645980835, + "learning_rate": 9.481442593989781e-06, + "loss": 0.3882, + "step": 1494 + }, + { + "epoch": 0.6952410478995504, + "grad_norm": 0.564165472984314, + "learning_rate": 9.480241932159692e-06, + "loss": 0.3947, + "step": 1495 + }, + { + "epoch": 0.6957060920787475, + "grad_norm": 0.47055506706237793, + "learning_rate": 9.479039958139516e-06, + "loss": 0.3843, + "step": 1496 + }, + { + "epoch": 0.6961711362579445, + "grad_norm": 0.6169893741607666, + "learning_rate": 9.477836672281291e-06, + "loss": 0.3824, + "step": 1497 + }, + { + "epoch": 0.6966361804371415, + "grad_norm": 0.4662085473537445, + "learning_rate": 9.476632074937438e-06, + "loss": 0.4037, + "step": 1498 + }, + { + "epoch": 0.6971012246163385, + "grad_norm": 0.5007343888282776, + "learning_rate": 9.475426166460763e-06, + "loss": 0.3552, + "step": 1499 + }, + { + "epoch": 0.6975662687955356, + "grad_norm": 0.5537876486778259, + "learning_rate": 9.47421894720446e-06, + "loss": 0.3963, + "step": 1500 + }, + { + "epoch": 0.6980313129747326, + "grad_norm": 0.49283310770988464, + "learning_rate": 9.473010417522104e-06, + "loss": 0.3937, + "step": 1501 + }, + { + "epoch": 0.6984963571539297, + "grad_norm": 0.5946938395500183, + "learning_rate": 9.471800577767651e-06, + "loss": 0.3875, + "step": 1502 + }, + { + "epoch": 0.6989614013331267, + "grad_norm": 0.4660334885120392, + "learning_rate": 9.470589428295444e-06, + "loss": 0.3862, + "step": 1503 + }, + { + "epoch": 0.6994264455123237, + "grad_norm": 0.5642980337142944, + "learning_rate": 9.469376969460212e-06, + "loss": 0.3944, + "step": 1504 + }, + { + "epoch": 0.6998914896915207, + "grad_norm": 0.5256946682929993, + "learning_rate": 9.468163201617063e-06, + "loss": 0.3991, + "step": 1505 + }, + { + "epoch": 0.7003565338707177, + "grad_norm": 0.5281638503074646, + "learning_rate": 9.466948125121486e-06, + "loss": 0.4028, + "step": 1506 + }, + { + "epoch": 0.7008215780499147, + "grad_norm": 0.5681607723236084, + "learning_rate": 9.465731740329364e-06, + "loss": 0.3888, + "step": 1507 + }, + { + "epoch": 0.7012866222291118, + "grad_norm": 0.5455275177955627, + "learning_rate": 9.46451404759695e-06, + "loss": 0.3982, + "step": 1508 + }, + { + "epoch": 0.7017516664083088, + "grad_norm": 0.49493181705474854, + "learning_rate": 9.463295047280892e-06, + "loss": 0.4045, + "step": 1509 + }, + { + "epoch": 0.7022167105875058, + "grad_norm": 0.574228048324585, + "learning_rate": 9.462074739738212e-06, + "loss": 0.3897, + "step": 1510 + }, + { + "epoch": 0.7026817547667028, + "grad_norm": 0.5371845364570618, + "learning_rate": 9.460853125326317e-06, + "loss": 0.3884, + "step": 1511 + }, + { + "epoch": 0.7031467989458998, + "grad_norm": 0.5111741423606873, + "learning_rate": 9.459630204403001e-06, + "loss": 0.3557, + "step": 1512 + }, + { + "epoch": 0.7036118431250968, + "grad_norm": 0.5982129573822021, + "learning_rate": 9.458405977326436e-06, + "loss": 0.405, + "step": 1513 + }, + { + "epoch": 0.7040768873042939, + "grad_norm": 0.48484864830970764, + "learning_rate": 9.45718044445518e-06, + "loss": 0.3856, + "step": 1514 + }, + { + "epoch": 0.704541931483491, + "grad_norm": 0.4435729682445526, + "learning_rate": 9.455953606148172e-06, + "loss": 0.3944, + "step": 1515 + }, + { + "epoch": 0.705006975662688, + "grad_norm": 0.5365220308303833, + "learning_rate": 9.454725462764729e-06, + "loss": 0.3986, + "step": 1516 + }, + { + "epoch": 0.705472019841885, + "grad_norm": 0.5052123069763184, + "learning_rate": 9.453496014664557e-06, + "loss": 0.3782, + "step": 1517 + }, + { + "epoch": 0.705937064021082, + "grad_norm": 0.4664888381958008, + "learning_rate": 9.452265262207741e-06, + "loss": 0.3802, + "step": 1518 + }, + { + "epoch": 0.706402108200279, + "grad_norm": 0.5673739314079285, + "learning_rate": 9.451033205754749e-06, + "loss": 0.4167, + "step": 1519 + }, + { + "epoch": 0.7068671523794761, + "grad_norm": 0.5380971431732178, + "learning_rate": 9.44979984566643e-06, + "loss": 0.3781, + "step": 1520 + }, + { + "epoch": 0.7073321965586731, + "grad_norm": 0.4242795407772064, + "learning_rate": 9.448565182304015e-06, + "loss": 0.3824, + "step": 1521 + }, + { + "epoch": 0.7077972407378701, + "grad_norm": 0.6041889786720276, + "learning_rate": 9.447329216029117e-06, + "loss": 0.394, + "step": 1522 + }, + { + "epoch": 0.7082622849170671, + "grad_norm": 0.4233911633491516, + "learning_rate": 9.44609194720373e-06, + "loss": 0.3657, + "step": 1523 + }, + { + "epoch": 0.7087273290962641, + "grad_norm": 0.542805552482605, + "learning_rate": 9.44485337619023e-06, + "loss": 0.3966, + "step": 1524 + }, + { + "epoch": 0.7091923732754611, + "grad_norm": 0.5100137591362, + "learning_rate": 9.443613503351375e-06, + "loss": 0.3737, + "step": 1525 + }, + { + "epoch": 0.7096574174546582, + "grad_norm": 0.4733894169330597, + "learning_rate": 9.442372329050304e-06, + "loss": 0.3776, + "step": 1526 + }, + { + "epoch": 0.7101224616338552, + "grad_norm": 0.5760788917541504, + "learning_rate": 9.441129853650534e-06, + "loss": 0.3838, + "step": 1527 + }, + { + "epoch": 0.7105875058130522, + "grad_norm": 0.5053048729896545, + "learning_rate": 9.43988607751597e-06, + "loss": 0.3741, + "step": 1528 + }, + { + "epoch": 0.7110525499922493, + "grad_norm": 0.6125737428665161, + "learning_rate": 9.43864100101089e-06, + "loss": 0.3941, + "step": 1529 + }, + { + "epoch": 0.7115175941714463, + "grad_norm": 0.6082503795623779, + "learning_rate": 9.437394624499957e-06, + "loss": 0.3769, + "step": 1530 + }, + { + "epoch": 0.7119826383506433, + "grad_norm": 0.5114914178848267, + "learning_rate": 9.43614694834822e-06, + "loss": 0.3972, + "step": 1531 + }, + { + "epoch": 0.7124476825298404, + "grad_norm": 0.5447007417678833, + "learning_rate": 9.434897972921095e-06, + "loss": 0.3758, + "step": 1532 + }, + { + "epoch": 0.7129127267090374, + "grad_norm": 0.4863920509815216, + "learning_rate": 9.433647698584393e-06, + "loss": 0.385, + "step": 1533 + }, + { + "epoch": 0.7133777708882344, + "grad_norm": 0.5778424739837646, + "learning_rate": 9.432396125704294e-06, + "loss": 0.3576, + "step": 1534 + }, + { + "epoch": 0.7138428150674314, + "grad_norm": 0.541344940662384, + "learning_rate": 9.431143254647368e-06, + "loss": 0.4063, + "step": 1535 + }, + { + "epoch": 0.7143078592466284, + "grad_norm": 0.5227439999580383, + "learning_rate": 9.429889085780559e-06, + "loss": 0.4045, + "step": 1536 + }, + { + "epoch": 0.7147729034258254, + "grad_norm": 0.5893809795379639, + "learning_rate": 9.42863361947119e-06, + "loss": 0.3556, + "step": 1537 + }, + { + "epoch": 0.7152379476050225, + "grad_norm": 0.45113706588745117, + "learning_rate": 9.42737685608697e-06, + "loss": 0.3984, + "step": 1538 + }, + { + "epoch": 0.7157029917842195, + "grad_norm": 0.6279287338256836, + "learning_rate": 9.426118795995984e-06, + "loss": 0.368, + "step": 1539 + }, + { + "epoch": 0.7161680359634165, + "grad_norm": 0.45758605003356934, + "learning_rate": 9.424859439566696e-06, + "loss": 0.378, + "step": 1540 + }, + { + "epoch": 0.7166330801426135, + "grad_norm": 0.5116802453994751, + "learning_rate": 9.423598787167952e-06, + "loss": 0.3775, + "step": 1541 + }, + { + "epoch": 0.7170981243218105, + "grad_norm": 0.5365654826164246, + "learning_rate": 9.422336839168974e-06, + "loss": 0.3701, + "step": 1542 + }, + { + "epoch": 0.7175631685010075, + "grad_norm": 0.4998980760574341, + "learning_rate": 9.421073595939373e-06, + "loss": 0.3778, + "step": 1543 + }, + { + "epoch": 0.7180282126802047, + "grad_norm": 0.5392967462539673, + "learning_rate": 9.419809057849125e-06, + "loss": 0.4003, + "step": 1544 + }, + { + "epoch": 0.7184932568594017, + "grad_norm": 0.5534929633140564, + "learning_rate": 9.418543225268598e-06, + "loss": 0.3992, + "step": 1545 + }, + { + "epoch": 0.7189583010385987, + "grad_norm": 0.45041725039482117, + "learning_rate": 9.41727609856853e-06, + "loss": 0.3951, + "step": 1546 + }, + { + "epoch": 0.7194233452177957, + "grad_norm": 0.5060805082321167, + "learning_rate": 9.416007678120041e-06, + "loss": 0.3657, + "step": 1547 + }, + { + "epoch": 0.7198883893969927, + "grad_norm": 0.4890674352645874, + "learning_rate": 9.414737964294636e-06, + "loss": 0.3763, + "step": 1548 + }, + { + "epoch": 0.7203534335761898, + "grad_norm": 0.48126208782196045, + "learning_rate": 9.41346695746419e-06, + "loss": 0.403, + "step": 1549 + }, + { + "epoch": 0.7208184777553868, + "grad_norm": 0.42710211873054504, + "learning_rate": 9.41219465800096e-06, + "loss": 0.408, + "step": 1550 + }, + { + "epoch": 0.7212835219345838, + "grad_norm": 0.5024427771568298, + "learning_rate": 9.410921066277583e-06, + "loss": 0.3946, + "step": 1551 + }, + { + "epoch": 0.7217485661137808, + "grad_norm": 0.5658032298088074, + "learning_rate": 9.409646182667073e-06, + "loss": 0.4095, + "step": 1552 + }, + { + "epoch": 0.7222136102929778, + "grad_norm": 0.46872782707214355, + "learning_rate": 9.408370007542822e-06, + "loss": 0.3693, + "step": 1553 + }, + { + "epoch": 0.7226786544721748, + "grad_norm": 0.5501477718353271, + "learning_rate": 9.407092541278602e-06, + "loss": 0.3939, + "step": 1554 + }, + { + "epoch": 0.7231436986513718, + "grad_norm": 0.5214014649391174, + "learning_rate": 9.405813784248562e-06, + "loss": 0.3936, + "step": 1555 + }, + { + "epoch": 0.7236087428305689, + "grad_norm": 0.47471073269844055, + "learning_rate": 9.40453373682723e-06, + "loss": 0.3869, + "step": 1556 + }, + { + "epoch": 0.7240737870097659, + "grad_norm": 0.5597638487815857, + "learning_rate": 9.403252399389508e-06, + "loss": 0.3925, + "step": 1557 + }, + { + "epoch": 0.724538831188963, + "grad_norm": 0.43586617708206177, + "learning_rate": 9.401969772310681e-06, + "loss": 0.3636, + "step": 1558 + }, + { + "epoch": 0.72500387536816, + "grad_norm": 0.5254845023155212, + "learning_rate": 9.400685855966411e-06, + "loss": 0.3647, + "step": 1559 + }, + { + "epoch": 0.725468919547357, + "grad_norm": 0.5308626294136047, + "learning_rate": 9.399400650732735e-06, + "loss": 0.3843, + "step": 1560 + }, + { + "epoch": 0.725933963726554, + "grad_norm": 0.4976446032524109, + "learning_rate": 9.398114156986068e-06, + "loss": 0.4107, + "step": 1561 + }, + { + "epoch": 0.7263990079057511, + "grad_norm": 0.47500959038734436, + "learning_rate": 9.396826375103203e-06, + "loss": 0.3745, + "step": 1562 + }, + { + "epoch": 0.7268640520849481, + "grad_norm": 0.5488372445106506, + "learning_rate": 9.395537305461312e-06, + "loss": 0.3783, + "step": 1563 + }, + { + "epoch": 0.7273290962641451, + "grad_norm": 0.48007380962371826, + "learning_rate": 9.394246948437943e-06, + "loss": 0.3835, + "step": 1564 + }, + { + "epoch": 0.7277941404433421, + "grad_norm": 0.5635498762130737, + "learning_rate": 9.392955304411015e-06, + "loss": 0.38, + "step": 1565 + }, + { + "epoch": 0.7282591846225391, + "grad_norm": 0.48347046971321106, + "learning_rate": 9.391662373758836e-06, + "loss": 0.4036, + "step": 1566 + }, + { + "epoch": 0.7287242288017362, + "grad_norm": 0.448574423789978, + "learning_rate": 9.390368156860083e-06, + "loss": 0.3928, + "step": 1567 + }, + { + "epoch": 0.7291892729809332, + "grad_norm": 0.4473694860935211, + "learning_rate": 9.389072654093809e-06, + "loss": 0.3579, + "step": 1568 + }, + { + "epoch": 0.7296543171601302, + "grad_norm": 0.48732516169548035, + "learning_rate": 9.387775865839449e-06, + "loss": 0.3803, + "step": 1569 + }, + { + "epoch": 0.7301193613393272, + "grad_norm": 0.46211230754852295, + "learning_rate": 9.386477792476806e-06, + "loss": 0.4012, + "step": 1570 + }, + { + "epoch": 0.7305844055185242, + "grad_norm": 0.4076167643070221, + "learning_rate": 9.38517843438607e-06, + "loss": 0.3788, + "step": 1571 + }, + { + "epoch": 0.7310494496977212, + "grad_norm": 0.5187703967094421, + "learning_rate": 9.383877791947802e-06, + "loss": 0.4111, + "step": 1572 + }, + { + "epoch": 0.7315144938769184, + "grad_norm": 0.5166497230529785, + "learning_rate": 9.382575865542933e-06, + "loss": 0.3982, + "step": 1573 + }, + { + "epoch": 0.7319795380561154, + "grad_norm": 0.4667831063270569, + "learning_rate": 9.38127265555278e-06, + "loss": 0.4002, + "step": 1574 + }, + { + "epoch": 0.7324445822353124, + "grad_norm": 0.4515557289123535, + "learning_rate": 9.379968162359034e-06, + "loss": 0.3646, + "step": 1575 + }, + { + "epoch": 0.7329096264145094, + "grad_norm": 0.48344147205352783, + "learning_rate": 9.378662386343758e-06, + "loss": 0.4046, + "step": 1576 + }, + { + "epoch": 0.7333746705937064, + "grad_norm": 0.505967915058136, + "learning_rate": 9.377355327889391e-06, + "loss": 0.3664, + "step": 1577 + }, + { + "epoch": 0.7338397147729034, + "grad_norm": 0.43122488260269165, + "learning_rate": 9.37604698737875e-06, + "loss": 0.3962, + "step": 1578 + }, + { + "epoch": 0.7343047589521005, + "grad_norm": 0.4845030903816223, + "learning_rate": 9.374737365195028e-06, + "loss": 0.3937, + "step": 1579 + }, + { + "epoch": 0.7347698031312975, + "grad_norm": 0.4910258948802948, + "learning_rate": 9.37342646172179e-06, + "loss": 0.3892, + "step": 1580 + }, + { + "epoch": 0.7352348473104945, + "grad_norm": 0.4109004735946655, + "learning_rate": 9.372114277342981e-06, + "loss": 0.3735, + "step": 1581 + }, + { + "epoch": 0.7356998914896915, + "grad_norm": 0.4791964888572693, + "learning_rate": 9.370800812442917e-06, + "loss": 0.3956, + "step": 1582 + }, + { + "epoch": 0.7361649356688885, + "grad_norm": 0.4479603171348572, + "learning_rate": 9.36948606740629e-06, + "loss": 0.3587, + "step": 1583 + }, + { + "epoch": 0.7366299798480855, + "grad_norm": 0.49391624331474304, + "learning_rate": 9.36817004261817e-06, + "loss": 0.3785, + "step": 1584 + }, + { + "epoch": 0.7370950240272826, + "grad_norm": 0.4788481593132019, + "learning_rate": 9.366852738463995e-06, + "loss": 0.395, + "step": 1585 + }, + { + "epoch": 0.7375600682064796, + "grad_norm": 0.4550861120223999, + "learning_rate": 9.365534155329585e-06, + "loss": 0.3826, + "step": 1586 + }, + { + "epoch": 0.7380251123856766, + "grad_norm": 0.45728829503059387, + "learning_rate": 9.364214293601133e-06, + "loss": 0.3662, + "step": 1587 + }, + { + "epoch": 0.7384901565648737, + "grad_norm": 0.46696847677230835, + "learning_rate": 9.3628931536652e-06, + "loss": 0.3751, + "step": 1588 + }, + { + "epoch": 0.7389552007440707, + "grad_norm": 0.48484426736831665, + "learning_rate": 9.361570735908731e-06, + "loss": 0.3764, + "step": 1589 + }, + { + "epoch": 0.7394202449232677, + "grad_norm": 0.4298027455806732, + "learning_rate": 9.36024704071904e-06, + "loss": 0.3879, + "step": 1590 + }, + { + "epoch": 0.7398852891024648, + "grad_norm": 0.5281281471252441, + "learning_rate": 9.358922068483813e-06, + "loss": 0.3742, + "step": 1591 + }, + { + "epoch": 0.7403503332816618, + "grad_norm": 0.39582934975624084, + "learning_rate": 9.357595819591116e-06, + "loss": 0.3886, + "step": 1592 + }, + { + "epoch": 0.7408153774608588, + "grad_norm": 0.4835725724697113, + "learning_rate": 9.356268294429384e-06, + "loss": 0.3904, + "step": 1593 + }, + { + "epoch": 0.7412804216400558, + "grad_norm": 0.49166139960289, + "learning_rate": 9.354939493387428e-06, + "loss": 0.4038, + "step": 1594 + }, + { + "epoch": 0.7417454658192528, + "grad_norm": 0.4498700499534607, + "learning_rate": 9.353609416854432e-06, + "loss": 0.3906, + "step": 1595 + }, + { + "epoch": 0.7422105099984498, + "grad_norm": 0.5919501781463623, + "learning_rate": 9.352278065219955e-06, + "loss": 0.3753, + "step": 1596 + }, + { + "epoch": 0.7426755541776469, + "grad_norm": 0.5172655582427979, + "learning_rate": 9.350945438873927e-06, + "loss": 0.3858, + "step": 1597 + }, + { + "epoch": 0.7431405983568439, + "grad_norm": 0.4497489333152771, + "learning_rate": 9.349611538206654e-06, + "loss": 0.3979, + "step": 1598 + }, + { + "epoch": 0.7436056425360409, + "grad_norm": 0.4453744888305664, + "learning_rate": 9.348276363608812e-06, + "loss": 0.3854, + "step": 1599 + }, + { + "epoch": 0.7440706867152379, + "grad_norm": 0.5212786793708801, + "learning_rate": 9.346939915471453e-06, + "loss": 0.3838, + "step": 1600 + }, + { + "epoch": 0.7445357308944349, + "grad_norm": 0.5124901533126831, + "learning_rate": 9.345602194186001e-06, + "loss": 0.3859, + "step": 1601 + }, + { + "epoch": 0.745000775073632, + "grad_norm": 0.5541741847991943, + "learning_rate": 9.344263200144253e-06, + "loss": 0.3653, + "step": 1602 + }, + { + "epoch": 0.7454658192528291, + "grad_norm": 0.5633660554885864, + "learning_rate": 9.342922933738377e-06, + "loss": 0.3784, + "step": 1603 + }, + { + "epoch": 0.7459308634320261, + "grad_norm": 0.4769454300403595, + "learning_rate": 9.341581395360917e-06, + "loss": 0.389, + "step": 1604 + }, + { + "epoch": 0.7463959076112231, + "grad_norm": 0.48664960265159607, + "learning_rate": 9.340238585404787e-06, + "loss": 0.3782, + "step": 1605 + }, + { + "epoch": 0.7468609517904201, + "grad_norm": 0.45407751202583313, + "learning_rate": 9.338894504263276e-06, + "loss": 0.38, + "step": 1606 + }, + { + "epoch": 0.7473259959696171, + "grad_norm": 0.47645387053489685, + "learning_rate": 9.33754915233004e-06, + "loss": 0.3541, + "step": 1607 + }, + { + "epoch": 0.7477910401488141, + "grad_norm": 0.5349624752998352, + "learning_rate": 9.336202529999114e-06, + "loss": 0.3867, + "step": 1608 + }, + { + "epoch": 0.7482560843280112, + "grad_norm": 0.49598512053489685, + "learning_rate": 9.3348546376649e-06, + "loss": 0.3848, + "step": 1609 + }, + { + "epoch": 0.7487211285072082, + "grad_norm": 0.5270540714263916, + "learning_rate": 9.333505475722175e-06, + "loss": 0.3953, + "step": 1610 + }, + { + "epoch": 0.7491861726864052, + "grad_norm": 0.4383714199066162, + "learning_rate": 9.332155044566085e-06, + "loss": 0.3936, + "step": 1611 + }, + { + "epoch": 0.7496512168656022, + "grad_norm": 0.4920140206813812, + "learning_rate": 9.330803344592151e-06, + "loss": 0.3805, + "step": 1612 + }, + { + "epoch": 0.7501162610447992, + "grad_norm": 0.5141205787658691, + "learning_rate": 9.329450376196264e-06, + "loss": 0.3821, + "step": 1613 + }, + { + "epoch": 0.7505813052239962, + "grad_norm": 0.5176628828048706, + "learning_rate": 9.328096139774686e-06, + "loss": 0.3903, + "step": 1614 + }, + { + "epoch": 0.7510463494031933, + "grad_norm": 0.5027486085891724, + "learning_rate": 9.326740635724047e-06, + "loss": 0.3814, + "step": 1615 + }, + { + "epoch": 0.7515113935823903, + "grad_norm": 0.504041314125061, + "learning_rate": 9.32538386444136e-06, + "loss": 0.3701, + "step": 1616 + }, + { + "epoch": 0.7519764377615874, + "grad_norm": 0.4461967647075653, + "learning_rate": 9.324025826323995e-06, + "loss": 0.3603, + "step": 1617 + }, + { + "epoch": 0.7524414819407844, + "grad_norm": 0.4477754831314087, + "learning_rate": 9.3226665217697e-06, + "loss": 0.3585, + "step": 1618 + }, + { + "epoch": 0.7529065261199814, + "grad_norm": 0.48952674865722656, + "learning_rate": 9.321305951176597e-06, + "loss": 0.3974, + "step": 1619 + }, + { + "epoch": 0.7533715702991785, + "grad_norm": 0.4486616253852844, + "learning_rate": 9.319944114943171e-06, + "loss": 0.3745, + "step": 1620 + }, + { + "epoch": 0.7538366144783755, + "grad_norm": 0.5282906293869019, + "learning_rate": 9.318581013468285e-06, + "loss": 0.3743, + "step": 1621 + }, + { + "epoch": 0.7543016586575725, + "grad_norm": 0.5025021433830261, + "learning_rate": 9.317216647151166e-06, + "loss": 0.385, + "step": 1622 + }, + { + "epoch": 0.7547667028367695, + "grad_norm": 0.48723548650741577, + "learning_rate": 9.315851016391417e-06, + "loss": 0.3891, + "step": 1623 + }, + { + "epoch": 0.7552317470159665, + "grad_norm": 0.5467572212219238, + "learning_rate": 9.31448412158901e-06, + "loss": 0.3931, + "step": 1624 + }, + { + "epoch": 0.7556967911951635, + "grad_norm": 0.503555178642273, + "learning_rate": 9.313115963144281e-06, + "loss": 0.3466, + "step": 1625 + }, + { + "epoch": 0.7561618353743605, + "grad_norm": 0.570252537727356, + "learning_rate": 9.311746541457946e-06, + "loss": 0.3588, + "step": 1626 + }, + { + "epoch": 0.7566268795535576, + "grad_norm": 0.49849367141723633, + "learning_rate": 9.310375856931086e-06, + "loss": 0.4005, + "step": 1627 + }, + { + "epoch": 0.7570919237327546, + "grad_norm": 0.4657578766345978, + "learning_rate": 9.309003909965152e-06, + "loss": 0.3811, + "step": 1628 + }, + { + "epoch": 0.7575569679119516, + "grad_norm": 0.5395064353942871, + "learning_rate": 9.307630700961966e-06, + "loss": 0.4136, + "step": 1629 + }, + { + "epoch": 0.7580220120911486, + "grad_norm": 0.43396592140197754, + "learning_rate": 9.306256230323714e-06, + "loss": 0.3834, + "step": 1630 + }, + { + "epoch": 0.7584870562703456, + "grad_norm": 0.4718068838119507, + "learning_rate": 9.304880498452962e-06, + "loss": 0.3759, + "step": 1631 + }, + { + "epoch": 0.7589521004495428, + "grad_norm": 0.49737459421157837, + "learning_rate": 9.303503505752636e-06, + "loss": 0.3928, + "step": 1632 + }, + { + "epoch": 0.7594171446287398, + "grad_norm": 0.4730997681617737, + "learning_rate": 9.302125252626035e-06, + "loss": 0.3916, + "step": 1633 + }, + { + "epoch": 0.7598821888079368, + "grad_norm": 0.49301835894584656, + "learning_rate": 9.30074573947683e-06, + "loss": 0.3821, + "step": 1634 + }, + { + "epoch": 0.7603472329871338, + "grad_norm": 0.5187510251998901, + "learning_rate": 9.299364966709051e-06, + "loss": 0.3527, + "step": 1635 + }, + { + "epoch": 0.7608122771663308, + "grad_norm": 0.5241015553474426, + "learning_rate": 9.29798293472711e-06, + "loss": 0.378, + "step": 1636 + }, + { + "epoch": 0.7612773213455278, + "grad_norm": 0.4729224443435669, + "learning_rate": 9.296599643935782e-06, + "loss": 0.4126, + "step": 1637 + }, + { + "epoch": 0.7617423655247249, + "grad_norm": 0.6277045011520386, + "learning_rate": 9.295215094740208e-06, + "loss": 0.4036, + "step": 1638 + }, + { + "epoch": 0.7622074097039219, + "grad_norm": 0.4564441740512848, + "learning_rate": 9.293829287545902e-06, + "loss": 0.3894, + "step": 1639 + }, + { + "epoch": 0.7626724538831189, + "grad_norm": 0.48436111211776733, + "learning_rate": 9.292442222758741e-06, + "loss": 0.3695, + "step": 1640 + }, + { + "epoch": 0.7631374980623159, + "grad_norm": 0.5133222341537476, + "learning_rate": 9.291053900784977e-06, + "loss": 0.3843, + "step": 1641 + }, + { + "epoch": 0.7636025422415129, + "grad_norm": 0.4502391517162323, + "learning_rate": 9.289664322031225e-06, + "loss": 0.3745, + "step": 1642 + }, + { + "epoch": 0.7640675864207099, + "grad_norm": 0.5892338156700134, + "learning_rate": 9.28827348690447e-06, + "loss": 0.3855, + "step": 1643 + }, + { + "epoch": 0.764532630599907, + "grad_norm": 0.4681336283683777, + "learning_rate": 9.286881395812066e-06, + "loss": 0.3817, + "step": 1644 + }, + { + "epoch": 0.764997674779104, + "grad_norm": 0.5281332731246948, + "learning_rate": 9.285488049161735e-06, + "loss": 0.3903, + "step": 1645 + }, + { + "epoch": 0.7654627189583011, + "grad_norm": 0.5051922798156738, + "learning_rate": 9.284093447361563e-06, + "loss": 0.4035, + "step": 1646 + }, + { + "epoch": 0.7659277631374981, + "grad_norm": 0.4733637273311615, + "learning_rate": 9.282697590820008e-06, + "loss": 0.3933, + "step": 1647 + }, + { + "epoch": 0.7663928073166951, + "grad_norm": 0.43565672636032104, + "learning_rate": 9.281300479945894e-06, + "loss": 0.3654, + "step": 1648 + }, + { + "epoch": 0.7668578514958921, + "grad_norm": 0.49669885635375977, + "learning_rate": 9.27990211514841e-06, + "loss": 0.3514, + "step": 1649 + }, + { + "epoch": 0.7673228956750892, + "grad_norm": 0.4607134163379669, + "learning_rate": 9.278502496837116e-06, + "loss": 0.3954, + "step": 1650 + }, + { + "epoch": 0.7677879398542862, + "grad_norm": 0.5022844672203064, + "learning_rate": 9.277101625421938e-06, + "loss": 0.3917, + "step": 1651 + }, + { + "epoch": 0.7682529840334832, + "grad_norm": 0.5731377601623535, + "learning_rate": 9.275699501313164e-06, + "loss": 0.4014, + "step": 1652 + }, + { + "epoch": 0.7687180282126802, + "grad_norm": 0.44675296545028687, + "learning_rate": 9.27429612492146e-06, + "loss": 0.3754, + "step": 1653 + }, + { + "epoch": 0.7691830723918772, + "grad_norm": 0.6787543296813965, + "learning_rate": 9.27289149665785e-06, + "loss": 0.3779, + "step": 1654 + }, + { + "epoch": 0.7696481165710742, + "grad_norm": 0.6024219989776611, + "learning_rate": 9.271485616933725e-06, + "loss": 0.3736, + "step": 1655 + }, + { + "epoch": 0.7701131607502713, + "grad_norm": 0.49429091811180115, + "learning_rate": 9.270078486160843e-06, + "loss": 0.3944, + "step": 1656 + }, + { + "epoch": 0.7705782049294683, + "grad_norm": 0.6208419799804688, + "learning_rate": 9.268670104751334e-06, + "loss": 0.3796, + "step": 1657 + }, + { + "epoch": 0.7710432491086653, + "grad_norm": 0.5730566382408142, + "learning_rate": 9.267260473117687e-06, + "loss": 0.3936, + "step": 1658 + }, + { + "epoch": 0.7715082932878623, + "grad_norm": 0.4064892828464508, + "learning_rate": 9.265849591672762e-06, + "loss": 0.3665, + "step": 1659 + }, + { + "epoch": 0.7719733374670593, + "grad_norm": 0.6317027807235718, + "learning_rate": 9.264437460829783e-06, + "loss": 0.3973, + "step": 1660 + }, + { + "epoch": 0.7724383816462564, + "grad_norm": 0.4885081350803375, + "learning_rate": 9.263024081002338e-06, + "loss": 0.401, + "step": 1661 + }, + { + "epoch": 0.7729034258254535, + "grad_norm": 0.4672366976737976, + "learning_rate": 9.261609452604387e-06, + "loss": 0.3705, + "step": 1662 + }, + { + "epoch": 0.7733684700046505, + "grad_norm": 0.5605355501174927, + "learning_rate": 9.260193576050247e-06, + "loss": 0.4115, + "step": 1663 + }, + { + "epoch": 0.7738335141838475, + "grad_norm": 0.5084188580513, + "learning_rate": 9.25877645175461e-06, + "loss": 0.3905, + "step": 1664 + }, + { + "epoch": 0.7742985583630445, + "grad_norm": 0.4235016107559204, + "learning_rate": 9.257358080132524e-06, + "loss": 0.3808, + "step": 1665 + }, + { + "epoch": 0.7747636025422415, + "grad_norm": 0.5295476913452148, + "learning_rate": 9.25593846159941e-06, + "loss": 0.3704, + "step": 1666 + }, + { + "epoch": 0.7752286467214385, + "grad_norm": 0.4431585669517517, + "learning_rate": 9.25451759657105e-06, + "loss": 0.3784, + "step": 1667 + }, + { + "epoch": 0.7756936909006356, + "grad_norm": 0.43355417251586914, + "learning_rate": 9.253095485463594e-06, + "loss": 0.3708, + "step": 1668 + }, + { + "epoch": 0.7761587350798326, + "grad_norm": 0.5399253368377686, + "learning_rate": 9.251672128693553e-06, + "loss": 0.3887, + "step": 1669 + }, + { + "epoch": 0.7766237792590296, + "grad_norm": 0.47674787044525146, + "learning_rate": 9.250247526677806e-06, + "loss": 0.4136, + "step": 1670 + }, + { + "epoch": 0.7770888234382266, + "grad_norm": 0.39808011054992676, + "learning_rate": 9.248821679833596e-06, + "loss": 0.3894, + "step": 1671 + }, + { + "epoch": 0.7775538676174236, + "grad_norm": 0.5415627360343933, + "learning_rate": 9.24739458857853e-06, + "loss": 0.3698, + "step": 1672 + }, + { + "epoch": 0.7780189117966206, + "grad_norm": 0.4286542534828186, + "learning_rate": 9.245966253330581e-06, + "loss": 0.3781, + "step": 1673 + }, + { + "epoch": 0.7784839559758177, + "grad_norm": 0.42257702350616455, + "learning_rate": 9.244536674508085e-06, + "loss": 0.3868, + "step": 1674 + }, + { + "epoch": 0.7789490001550148, + "grad_norm": 0.41875237226486206, + "learning_rate": 9.243105852529739e-06, + "loss": 0.3594, + "step": 1675 + }, + { + "epoch": 0.7794140443342118, + "grad_norm": 0.45857277512550354, + "learning_rate": 9.241673787814612e-06, + "loss": 0.4015, + "step": 1676 + }, + { + "epoch": 0.7798790885134088, + "grad_norm": 0.4585971236228943, + "learning_rate": 9.24024048078213e-06, + "loss": 0.3895, + "step": 1677 + }, + { + "epoch": 0.7803441326926058, + "grad_norm": 0.4482724666595459, + "learning_rate": 9.238805931852088e-06, + "loss": 0.3787, + "step": 1678 + }, + { + "epoch": 0.7808091768718028, + "grad_norm": 0.4379400312900543, + "learning_rate": 9.237370141444636e-06, + "loss": 0.3712, + "step": 1679 + }, + { + "epoch": 0.7812742210509999, + "grad_norm": 0.452578604221344, + "learning_rate": 9.235933109980302e-06, + "loss": 0.4232, + "step": 1680 + }, + { + "epoch": 0.7817392652301969, + "grad_norm": 0.4528006613254547, + "learning_rate": 9.234494837879963e-06, + "loss": 0.3626, + "step": 1681 + }, + { + "epoch": 0.7822043094093939, + "grad_norm": 0.4689446985721588, + "learning_rate": 9.233055325564869e-06, + "loss": 0.386, + "step": 1682 + }, + { + "epoch": 0.7826693535885909, + "grad_norm": 0.4909113645553589, + "learning_rate": 9.231614573456628e-06, + "loss": 0.3899, + "step": 1683 + }, + { + "epoch": 0.7831343977677879, + "grad_norm": 0.45796748995780945, + "learning_rate": 9.230172581977212e-06, + "loss": 0.3907, + "step": 1684 + }, + { + "epoch": 0.7835994419469849, + "grad_norm": 0.4643401801586151, + "learning_rate": 9.22872935154896e-06, + "loss": 0.3749, + "step": 1685 + }, + { + "epoch": 0.784064486126182, + "grad_norm": 0.5391439199447632, + "learning_rate": 9.227284882594567e-06, + "loss": 0.3901, + "step": 1686 + }, + { + "epoch": 0.784529530305379, + "grad_norm": 0.4336642920970917, + "learning_rate": 9.225839175537096e-06, + "loss": 0.3952, + "step": 1687 + }, + { + "epoch": 0.784994574484576, + "grad_norm": 0.4696858823299408, + "learning_rate": 9.224392230799972e-06, + "loss": 0.3925, + "step": 1688 + }, + { + "epoch": 0.785459618663773, + "grad_norm": 0.43825915455818176, + "learning_rate": 9.222944048806982e-06, + "loss": 0.3605, + "step": 1689 + }, + { + "epoch": 0.7859246628429701, + "grad_norm": 0.5373666286468506, + "learning_rate": 9.221494629982274e-06, + "loss": 0.3634, + "step": 1690 + }, + { + "epoch": 0.7863897070221672, + "grad_norm": 0.4807939827442169, + "learning_rate": 9.22004397475036e-06, + "loss": 0.3908, + "step": 1691 + }, + { + "epoch": 0.7868547512013642, + "grad_norm": 0.462829053401947, + "learning_rate": 9.21859208353611e-06, + "loss": 0.351, + "step": 1692 + }, + { + "epoch": 0.7873197953805612, + "grad_norm": 0.49336567521095276, + "learning_rate": 9.217138956764764e-06, + "loss": 0.3722, + "step": 1693 + }, + { + "epoch": 0.7877848395597582, + "grad_norm": 0.5379830002784729, + "learning_rate": 9.215684594861915e-06, + "loss": 0.3967, + "step": 1694 + }, + { + "epoch": 0.7882498837389552, + "grad_norm": 0.4929734766483307, + "learning_rate": 9.214228998253526e-06, + "loss": 0.3666, + "step": 1695 + }, + { + "epoch": 0.7887149279181522, + "grad_norm": 0.5308467149734497, + "learning_rate": 9.212772167365915e-06, + "loss": 0.3863, + "step": 1696 + }, + { + "epoch": 0.7891799720973492, + "grad_norm": 0.43540507555007935, + "learning_rate": 9.211314102625768e-06, + "loss": 0.4, + "step": 1697 + }, + { + "epoch": 0.7896450162765463, + "grad_norm": 0.5050954222679138, + "learning_rate": 9.209854804460121e-06, + "loss": 0.3893, + "step": 1698 + }, + { + "epoch": 0.7901100604557433, + "grad_norm": 0.5224504470825195, + "learning_rate": 9.208394273296387e-06, + "loss": 0.3957, + "step": 1699 + }, + { + "epoch": 0.7905751046349403, + "grad_norm": 0.5353153944015503, + "learning_rate": 9.206932509562325e-06, + "loss": 0.3839, + "step": 1700 + }, + { + "epoch": 0.7910401488141373, + "grad_norm": 0.5350932478904724, + "learning_rate": 9.205469513686065e-06, + "loss": 0.4252, + "step": 1701 + }, + { + "epoch": 0.7915051929933343, + "grad_norm": 0.5694266557693481, + "learning_rate": 9.204005286096095e-06, + "loss": 0.3813, + "step": 1702 + }, + { + "epoch": 0.7919702371725313, + "grad_norm": 0.5682549476623535, + "learning_rate": 9.202539827221264e-06, + "loss": 0.3519, + "step": 1703 + }, + { + "epoch": 0.7924352813517284, + "grad_norm": 0.4279302656650543, + "learning_rate": 9.20107313749078e-06, + "loss": 0.3863, + "step": 1704 + }, + { + "epoch": 0.7929003255309255, + "grad_norm": 0.5173167586326599, + "learning_rate": 9.19960521733421e-06, + "loss": 0.3747, + "step": 1705 + }, + { + "epoch": 0.7933653697101225, + "grad_norm": 0.5256390571594238, + "learning_rate": 9.198136067181491e-06, + "loss": 0.3963, + "step": 1706 + }, + { + "epoch": 0.7938304138893195, + "grad_norm": 0.5231375098228455, + "learning_rate": 9.196665687462906e-06, + "loss": 0.3706, + "step": 1707 + }, + { + "epoch": 0.7942954580685165, + "grad_norm": 0.4618844985961914, + "learning_rate": 9.19519407860911e-06, + "loss": 0.3796, + "step": 1708 + }, + { + "epoch": 0.7947605022477136, + "grad_norm": 0.49417373538017273, + "learning_rate": 9.193721241051108e-06, + "loss": 0.3847, + "step": 1709 + }, + { + "epoch": 0.7952255464269106, + "grad_norm": 0.5576480031013489, + "learning_rate": 9.192247175220276e-06, + "loss": 0.3747, + "step": 1710 + }, + { + "epoch": 0.7956905906061076, + "grad_norm": 0.5018484592437744, + "learning_rate": 9.190771881548343e-06, + "loss": 0.4008, + "step": 1711 + }, + { + "epoch": 0.7961556347853046, + "grad_norm": 0.48867353796958923, + "learning_rate": 9.189295360467397e-06, + "loss": 0.3806, + "step": 1712 + }, + { + "epoch": 0.7966206789645016, + "grad_norm": 0.5056695342063904, + "learning_rate": 9.187817612409886e-06, + "loss": 0.3718, + "step": 1713 + }, + { + "epoch": 0.7970857231436986, + "grad_norm": 0.45721235871315, + "learning_rate": 9.18633863780862e-06, + "loss": 0.3794, + "step": 1714 + }, + { + "epoch": 0.7975507673228956, + "grad_norm": 0.487956702709198, + "learning_rate": 9.184858437096766e-06, + "loss": 0.3905, + "step": 1715 + }, + { + "epoch": 0.7980158115020927, + "grad_norm": 0.5465155243873596, + "learning_rate": 9.183377010707853e-06, + "loss": 0.3968, + "step": 1716 + }, + { + "epoch": 0.7984808556812897, + "grad_norm": 0.5411269068717957, + "learning_rate": 9.181894359075763e-06, + "loss": 0.3826, + "step": 1717 + }, + { + "epoch": 0.7989458998604867, + "grad_norm": 0.511201798915863, + "learning_rate": 9.180410482634744e-06, + "loss": 0.3873, + "step": 1718 + }, + { + "epoch": 0.7994109440396838, + "grad_norm": 0.5773457288742065, + "learning_rate": 9.178925381819396e-06, + "loss": 0.4048, + "step": 1719 + }, + { + "epoch": 0.7998759882188808, + "grad_norm": 0.5756568312644958, + "learning_rate": 9.177439057064684e-06, + "loss": 0.3772, + "step": 1720 + }, + { + "epoch": 0.8003410323980779, + "grad_norm": 0.5717048048973083, + "learning_rate": 9.175951508805924e-06, + "loss": 0.3796, + "step": 1721 + }, + { + "epoch": 0.8008060765772749, + "grad_norm": 0.5515778064727783, + "learning_rate": 9.174462737478801e-06, + "loss": 0.3781, + "step": 1722 + }, + { + "epoch": 0.8012711207564719, + "grad_norm": 0.6067471504211426, + "learning_rate": 9.172972743519348e-06, + "loss": 0.3922, + "step": 1723 + }, + { + "epoch": 0.8017361649356689, + "grad_norm": 0.4658670127391815, + "learning_rate": 9.17148152736396e-06, + "loss": 0.3948, + "step": 1724 + }, + { + "epoch": 0.8022012091148659, + "grad_norm": 0.5393667817115784, + "learning_rate": 9.16998908944939e-06, + "loss": 0.3883, + "step": 1725 + }, + { + "epoch": 0.8026662532940629, + "grad_norm": 0.5518912076950073, + "learning_rate": 9.168495430212752e-06, + "loss": 0.4006, + "step": 1726 + }, + { + "epoch": 0.80313129747326, + "grad_norm": 0.4799440801143646, + "learning_rate": 9.16700055009151e-06, + "loss": 0.3985, + "step": 1727 + }, + { + "epoch": 0.803596341652457, + "grad_norm": 0.48832663893699646, + "learning_rate": 9.165504449523492e-06, + "loss": 0.3831, + "step": 1728 + }, + { + "epoch": 0.804061385831654, + "grad_norm": 0.4758247435092926, + "learning_rate": 9.164007128946881e-06, + "loss": 0.3625, + "step": 1729 + }, + { + "epoch": 0.804526430010851, + "grad_norm": 0.4202597439289093, + "learning_rate": 9.162508588800221e-06, + "loss": 0.3416, + "step": 1730 + }, + { + "epoch": 0.804991474190048, + "grad_norm": 0.44846394658088684, + "learning_rate": 9.161008829522406e-06, + "loss": 0.3712, + "step": 1731 + }, + { + "epoch": 0.805456518369245, + "grad_norm": 0.4517316222190857, + "learning_rate": 9.159507851552693e-06, + "loss": 0.3893, + "step": 1732 + }, + { + "epoch": 0.805921562548442, + "grad_norm": 0.38894450664520264, + "learning_rate": 9.158005655330694e-06, + "loss": 0.3726, + "step": 1733 + }, + { + "epoch": 0.8063866067276392, + "grad_norm": 0.45114031434059143, + "learning_rate": 9.156502241296376e-06, + "loss": 0.3862, + "step": 1734 + }, + { + "epoch": 0.8068516509068362, + "grad_norm": 0.4183841943740845, + "learning_rate": 9.154997609890068e-06, + "loss": 0.3567, + "step": 1735 + }, + { + "epoch": 0.8073166950860332, + "grad_norm": 0.4706588387489319, + "learning_rate": 9.15349176155245e-06, + "loss": 0.3892, + "step": 1736 + }, + { + "epoch": 0.8077817392652302, + "grad_norm": 0.4675155282020569, + "learning_rate": 9.151984696724563e-06, + "loss": 0.4099, + "step": 1737 + }, + { + "epoch": 0.8082467834444272, + "grad_norm": 0.46057257056236267, + "learning_rate": 9.150476415847797e-06, + "loss": 0.3915, + "step": 1738 + }, + { + "epoch": 0.8087118276236243, + "grad_norm": 0.41681477427482605, + "learning_rate": 9.148966919363906e-06, + "loss": 0.3839, + "step": 1739 + }, + { + "epoch": 0.8091768718028213, + "grad_norm": 0.4783117473125458, + "learning_rate": 9.147456207714998e-06, + "loss": 0.3541, + "step": 1740 + }, + { + "epoch": 0.8096419159820183, + "grad_norm": 0.5050014853477478, + "learning_rate": 9.145944281343534e-06, + "loss": 0.36, + "step": 1741 + }, + { + "epoch": 0.8101069601612153, + "grad_norm": 0.4405074715614319, + "learning_rate": 9.144431140692332e-06, + "loss": 0.4016, + "step": 1742 + }, + { + "epoch": 0.8105720043404123, + "grad_norm": 0.41197729110717773, + "learning_rate": 9.142916786204568e-06, + "loss": 0.3787, + "step": 1743 + }, + { + "epoch": 0.8110370485196093, + "grad_norm": 0.4868341386318207, + "learning_rate": 9.141401218323772e-06, + "loss": 0.3678, + "step": 1744 + }, + { + "epoch": 0.8115020926988064, + "grad_norm": 0.4434308409690857, + "learning_rate": 9.139884437493828e-06, + "loss": 0.3633, + "step": 1745 + }, + { + "epoch": 0.8119671368780034, + "grad_norm": 0.4891102612018585, + "learning_rate": 9.138366444158977e-06, + "loss": 0.3761, + "step": 1746 + }, + { + "epoch": 0.8124321810572004, + "grad_norm": 0.5441362261772156, + "learning_rate": 9.136847238763814e-06, + "loss": 0.3794, + "step": 1747 + }, + { + "epoch": 0.8128972252363974, + "grad_norm": 0.44566887617111206, + "learning_rate": 9.135326821753291e-06, + "loss": 0.3944, + "step": 1748 + }, + { + "epoch": 0.8133622694155945, + "grad_norm": 0.4834020733833313, + "learning_rate": 9.133805193572713e-06, + "loss": 0.3902, + "step": 1749 + }, + { + "epoch": 0.8138273135947915, + "grad_norm": 0.46305882930755615, + "learning_rate": 9.132282354667741e-06, + "loss": 0.3725, + "step": 1750 + }, + { + "epoch": 0.8142923577739886, + "grad_norm": 0.4824235141277313, + "learning_rate": 9.130758305484387e-06, + "loss": 0.4076, + "step": 1751 + }, + { + "epoch": 0.8147574019531856, + "grad_norm": 0.5479027032852173, + "learning_rate": 9.129233046469021e-06, + "loss": 0.3881, + "step": 1752 + }, + { + "epoch": 0.8152224461323826, + "grad_norm": 0.42961257696151733, + "learning_rate": 9.127706578068369e-06, + "loss": 0.3726, + "step": 1753 + }, + { + "epoch": 0.8156874903115796, + "grad_norm": 0.4997014105319977, + "learning_rate": 9.126178900729507e-06, + "loss": 0.3848, + "step": 1754 + }, + { + "epoch": 0.8161525344907766, + "grad_norm": 0.5122492909431458, + "learning_rate": 9.124650014899868e-06, + "loss": 0.398, + "step": 1755 + }, + { + "epoch": 0.8166175786699736, + "grad_norm": 0.5152801275253296, + "learning_rate": 9.123119921027234e-06, + "loss": 0.383, + "step": 1756 + }, + { + "epoch": 0.8170826228491707, + "grad_norm": 0.4216311275959015, + "learning_rate": 9.121588619559752e-06, + "loss": 0.3729, + "step": 1757 + }, + { + "epoch": 0.8175476670283677, + "grad_norm": 0.5480169057846069, + "learning_rate": 9.120056110945907e-06, + "loss": 0.4093, + "step": 1758 + }, + { + "epoch": 0.8180127112075647, + "grad_norm": 0.6168447732925415, + "learning_rate": 9.118522395634552e-06, + "loss": 0.3907, + "step": 1759 + }, + { + "epoch": 0.8184777553867617, + "grad_norm": 0.45844393968582153, + "learning_rate": 9.116987474074885e-06, + "loss": 0.3886, + "step": 1760 + }, + { + "epoch": 0.8189427995659587, + "grad_norm": 0.543725311756134, + "learning_rate": 9.115451346716459e-06, + "loss": 0.3968, + "step": 1761 + }, + { + "epoch": 0.8194078437451557, + "grad_norm": 0.48329514265060425, + "learning_rate": 9.113914014009182e-06, + "loss": 0.3837, + "step": 1762 + }, + { + "epoch": 0.8198728879243529, + "grad_norm": 0.4946017265319824, + "learning_rate": 9.112375476403313e-06, + "loss": 0.3877, + "step": 1763 + }, + { + "epoch": 0.8203379321035499, + "grad_norm": 0.5260226130485535, + "learning_rate": 9.110835734349464e-06, + "loss": 0.3735, + "step": 1764 + }, + { + "epoch": 0.8208029762827469, + "grad_norm": 0.49095699191093445, + "learning_rate": 9.109294788298601e-06, + "loss": 0.3832, + "step": 1765 + }, + { + "epoch": 0.8212680204619439, + "grad_norm": 0.5328912138938904, + "learning_rate": 9.107752638702046e-06, + "loss": 0.3704, + "step": 1766 + }, + { + "epoch": 0.8217330646411409, + "grad_norm": 0.5146880745887756, + "learning_rate": 9.106209286011463e-06, + "loss": 0.3679, + "step": 1767 + }, + { + "epoch": 0.822198108820338, + "grad_norm": 0.6965378522872925, + "learning_rate": 9.104664730678878e-06, + "loss": 0.3939, + "step": 1768 + }, + { + "epoch": 0.822663152999535, + "grad_norm": 0.4589329659938812, + "learning_rate": 9.103118973156667e-06, + "loss": 0.3813, + "step": 1769 + }, + { + "epoch": 0.823128197178732, + "grad_norm": 0.4879869222640991, + "learning_rate": 9.101572013897555e-06, + "loss": 0.3608, + "step": 1770 + }, + { + "epoch": 0.823593241357929, + "grad_norm": 0.5170464515686035, + "learning_rate": 9.100023853354624e-06, + "loss": 0.3932, + "step": 1771 + }, + { + "epoch": 0.824058285537126, + "grad_norm": 0.48912927508354187, + "learning_rate": 9.098474491981305e-06, + "loss": 0.3588, + "step": 1772 + }, + { + "epoch": 0.824523329716323, + "grad_norm": 0.4621887803077698, + "learning_rate": 9.096923930231377e-06, + "loss": 0.3501, + "step": 1773 + }, + { + "epoch": 0.82498837389552, + "grad_norm": 0.6245998740196228, + "learning_rate": 9.095372168558977e-06, + "loss": 0.3937, + "step": 1774 + }, + { + "epoch": 0.8254534180747171, + "grad_norm": 0.43584007024765015, + "learning_rate": 9.09381920741859e-06, + "loss": 0.3947, + "step": 1775 + }, + { + "epoch": 0.8259184622539141, + "grad_norm": 0.5115625262260437, + "learning_rate": 9.092265047265057e-06, + "loss": 0.3878, + "step": 1776 + }, + { + "epoch": 0.8263835064331111, + "grad_norm": 0.5149562954902649, + "learning_rate": 9.090709688553561e-06, + "loss": 0.4063, + "step": 1777 + }, + { + "epoch": 0.8268485506123082, + "grad_norm": 0.4879224896430969, + "learning_rate": 9.089153131739642e-06, + "loss": 0.3646, + "step": 1778 + }, + { + "epoch": 0.8273135947915052, + "grad_norm": 0.4819486737251282, + "learning_rate": 9.087595377279192e-06, + "loss": 0.3915, + "step": 1779 + }, + { + "epoch": 0.8277786389707023, + "grad_norm": 0.5894208550453186, + "learning_rate": 9.086036425628453e-06, + "loss": 0.395, + "step": 1780 + }, + { + "epoch": 0.8282436831498993, + "grad_norm": 0.45492544770240784, + "learning_rate": 9.084476277244013e-06, + "loss": 0.3976, + "step": 1781 + }, + { + "epoch": 0.8287087273290963, + "grad_norm": 0.4946926236152649, + "learning_rate": 9.082914932582818e-06, + "loss": 0.3795, + "step": 1782 + }, + { + "epoch": 0.8291737715082933, + "grad_norm": 0.5285559892654419, + "learning_rate": 9.081352392102159e-06, + "loss": 0.3766, + "step": 1783 + }, + { + "epoch": 0.8296388156874903, + "grad_norm": 0.45280373096466064, + "learning_rate": 9.079788656259677e-06, + "loss": 0.3862, + "step": 1784 + }, + { + "epoch": 0.8301038598666873, + "grad_norm": 0.5158466100692749, + "learning_rate": 9.078223725513366e-06, + "loss": 0.3877, + "step": 1785 + }, + { + "epoch": 0.8305689040458843, + "grad_norm": 0.5172114968299866, + "learning_rate": 9.076657600321569e-06, + "loss": 0.395, + "step": 1786 + }, + { + "epoch": 0.8310339482250814, + "grad_norm": 0.4534607529640198, + "learning_rate": 9.07509028114298e-06, + "loss": 0.3724, + "step": 1787 + }, + { + "epoch": 0.8314989924042784, + "grad_norm": 0.45312753319740295, + "learning_rate": 9.073521768436638e-06, + "loss": 0.3771, + "step": 1788 + }, + { + "epoch": 0.8319640365834754, + "grad_norm": 0.4360067844390869, + "learning_rate": 9.071952062661938e-06, + "loss": 0.3927, + "step": 1789 + }, + { + "epoch": 0.8324290807626724, + "grad_norm": 0.4952300190925598, + "learning_rate": 9.070381164278622e-06, + "loss": 0.3784, + "step": 1790 + }, + { + "epoch": 0.8328941249418694, + "grad_norm": 0.4036773145198822, + "learning_rate": 9.068809073746776e-06, + "loss": 0.3878, + "step": 1791 + }, + { + "epoch": 0.8333591691210666, + "grad_norm": 0.4290243089199066, + "learning_rate": 9.067235791526844e-06, + "loss": 0.3467, + "step": 1792 + }, + { + "epoch": 0.8338242133002636, + "grad_norm": 0.49693888425827026, + "learning_rate": 9.065661318079613e-06, + "loss": 0.3583, + "step": 1793 + }, + { + "epoch": 0.8342892574794606, + "grad_norm": 0.5083548426628113, + "learning_rate": 9.064085653866222e-06, + "loss": 0.3691, + "step": 1794 + }, + { + "epoch": 0.8347543016586576, + "grad_norm": 0.47161993384361267, + "learning_rate": 9.062508799348155e-06, + "loss": 0.3803, + "step": 1795 + }, + { + "epoch": 0.8352193458378546, + "grad_norm": 0.4640964865684509, + "learning_rate": 9.06093075498725e-06, + "loss": 0.3636, + "step": 1796 + }, + { + "epoch": 0.8356843900170516, + "grad_norm": 0.4891905188560486, + "learning_rate": 9.059351521245688e-06, + "loss": 0.3904, + "step": 1797 + }, + { + "epoch": 0.8361494341962487, + "grad_norm": 0.5055262446403503, + "learning_rate": 9.057771098586003e-06, + "loss": 0.4009, + "step": 1798 + }, + { + "epoch": 0.8366144783754457, + "grad_norm": 0.44761812686920166, + "learning_rate": 9.056189487471074e-06, + "loss": 0.3927, + "step": 1799 + }, + { + "epoch": 0.8370795225546427, + "grad_norm": 0.49880966544151306, + "learning_rate": 9.05460668836413e-06, + "loss": 0.4054, + "step": 1800 + }, + { + "epoch": 0.8375445667338397, + "grad_norm": 0.427062451839447, + "learning_rate": 9.053022701728744e-06, + "loss": 0.3694, + "step": 1801 + }, + { + "epoch": 0.8380096109130367, + "grad_norm": 0.48414304852485657, + "learning_rate": 9.051437528028846e-06, + "loss": 0.3839, + "step": 1802 + }, + { + "epoch": 0.8384746550922337, + "grad_norm": 0.5623500347137451, + "learning_rate": 9.049851167728702e-06, + "loss": 0.405, + "step": 1803 + }, + { + "epoch": 0.8389396992714307, + "grad_norm": 0.4517265558242798, + "learning_rate": 9.048263621292934e-06, + "loss": 0.381, + "step": 1804 + }, + { + "epoch": 0.8394047434506278, + "grad_norm": 0.6233772039413452, + "learning_rate": 9.046674889186509e-06, + "loss": 0.4002, + "step": 1805 + }, + { + "epoch": 0.8398697876298248, + "grad_norm": 0.5412980914115906, + "learning_rate": 9.045084971874738e-06, + "loss": 0.3837, + "step": 1806 + }, + { + "epoch": 0.8403348318090219, + "grad_norm": 0.6208202838897705, + "learning_rate": 9.043493869823283e-06, + "loss": 0.4094, + "step": 1807 + }, + { + "epoch": 0.8407998759882189, + "grad_norm": 0.5141086578369141, + "learning_rate": 9.041901583498156e-06, + "loss": 0.4012, + "step": 1808 + }, + { + "epoch": 0.8412649201674159, + "grad_norm": 0.579380452632904, + "learning_rate": 9.040308113365706e-06, + "loss": 0.4096, + "step": 1809 + }, + { + "epoch": 0.841729964346613, + "grad_norm": 0.5328314304351807, + "learning_rate": 9.038713459892637e-06, + "loss": 0.3613, + "step": 1810 + }, + { + "epoch": 0.84219500852581, + "grad_norm": 0.4614507853984833, + "learning_rate": 9.037117623545998e-06, + "loss": 0.4189, + "step": 1811 + }, + { + "epoch": 0.842660052705007, + "grad_norm": 0.6282251477241516, + "learning_rate": 9.035520604793183e-06, + "loss": 0.389, + "step": 1812 + }, + { + "epoch": 0.843125096884204, + "grad_norm": 0.4729471206665039, + "learning_rate": 9.03392240410193e-06, + "loss": 0.3953, + "step": 1813 + }, + { + "epoch": 0.843590141063401, + "grad_norm": 0.5759819149971008, + "learning_rate": 9.03232302194033e-06, + "loss": 0.3916, + "step": 1814 + }, + { + "epoch": 0.844055185242598, + "grad_norm": 0.5700781345367432, + "learning_rate": 9.030722458776815e-06, + "loss": 0.4019, + "step": 1815 + }, + { + "epoch": 0.844520229421795, + "grad_norm": 0.429929256439209, + "learning_rate": 9.029120715080162e-06, + "loss": 0.3942, + "step": 1816 + }, + { + "epoch": 0.8449852736009921, + "grad_norm": 0.6056925654411316, + "learning_rate": 9.027517791319499e-06, + "loss": 0.3985, + "step": 1817 + }, + { + "epoch": 0.8454503177801891, + "grad_norm": 0.5345470905303955, + "learning_rate": 9.025913687964293e-06, + "loss": 0.3734, + "step": 1818 + }, + { + "epoch": 0.8459153619593861, + "grad_norm": 0.5462255477905273, + "learning_rate": 9.024308405484363e-06, + "loss": 0.3822, + "step": 1819 + }, + { + "epoch": 0.8463804061385831, + "grad_norm": 0.5566556453704834, + "learning_rate": 9.022701944349867e-06, + "loss": 0.3878, + "step": 1820 + }, + { + "epoch": 0.8468454503177801, + "grad_norm": 0.5047350525856018, + "learning_rate": 9.021094305031314e-06, + "loss": 0.3838, + "step": 1821 + }, + { + "epoch": 0.8473104944969773, + "grad_norm": 0.5499956011772156, + "learning_rate": 9.019485487999553e-06, + "loss": 0.4, + "step": 1822 + }, + { + "epoch": 0.8477755386761743, + "grad_norm": 0.49966034293174744, + "learning_rate": 9.017875493725783e-06, + "loss": 0.3779, + "step": 1823 + }, + { + "epoch": 0.8482405828553713, + "grad_norm": 0.5475969910621643, + "learning_rate": 9.016264322681543e-06, + "loss": 0.3703, + "step": 1824 + }, + { + "epoch": 0.8487056270345683, + "grad_norm": 0.5151212811470032, + "learning_rate": 9.01465197533872e-06, + "loss": 0.3803, + "step": 1825 + }, + { + "epoch": 0.8491706712137653, + "grad_norm": 0.5074899196624756, + "learning_rate": 9.013038452169544e-06, + "loss": 0.3724, + "step": 1826 + }, + { + "epoch": 0.8496357153929623, + "grad_norm": 0.49190855026245117, + "learning_rate": 9.01142375364659e-06, + "loss": 0.4028, + "step": 1827 + }, + { + "epoch": 0.8501007595721594, + "grad_norm": 0.48248571157455444, + "learning_rate": 9.009807880242777e-06, + "loss": 0.3861, + "step": 1828 + }, + { + "epoch": 0.8505658037513564, + "grad_norm": 0.5088501572608948, + "learning_rate": 9.008190832431367e-06, + "loss": 0.3925, + "step": 1829 + }, + { + "epoch": 0.8510308479305534, + "grad_norm": 0.5075740218162537, + "learning_rate": 9.006572610685969e-06, + "loss": 0.4187, + "step": 1830 + }, + { + "epoch": 0.8514958921097504, + "grad_norm": 0.6194763779640198, + "learning_rate": 9.004953215480532e-06, + "loss": 0.3997, + "step": 1831 + }, + { + "epoch": 0.8519609362889474, + "grad_norm": 0.526176929473877, + "learning_rate": 9.003332647289351e-06, + "loss": 0.3969, + "step": 1832 + }, + { + "epoch": 0.8524259804681444, + "grad_norm": 0.578385055065155, + "learning_rate": 9.001710906587064e-06, + "loss": 0.3834, + "step": 1833 + }, + { + "epoch": 0.8528910246473415, + "grad_norm": 0.47598031163215637, + "learning_rate": 9.000087993848655e-06, + "loss": 0.3935, + "step": 1834 + }, + { + "epoch": 0.8533560688265385, + "grad_norm": 0.4297819137573242, + "learning_rate": 8.998463909549445e-06, + "loss": 0.3873, + "step": 1835 + }, + { + "epoch": 0.8538211130057356, + "grad_norm": 0.49305081367492676, + "learning_rate": 8.996838654165103e-06, + "loss": 0.3863, + "step": 1836 + }, + { + "epoch": 0.8542861571849326, + "grad_norm": 0.5345546007156372, + "learning_rate": 8.99521222817164e-06, + "loss": 0.3716, + "step": 1837 + }, + { + "epoch": 0.8547512013641296, + "grad_norm": 0.43835559487342834, + "learning_rate": 8.993584632045412e-06, + "loss": 0.3649, + "step": 1838 + }, + { + "epoch": 0.8552162455433266, + "grad_norm": 0.5586456656455994, + "learning_rate": 8.991955866263112e-06, + "loss": 0.3804, + "step": 1839 + }, + { + "epoch": 0.8556812897225237, + "grad_norm": 0.5465202331542969, + "learning_rate": 8.990325931301783e-06, + "loss": 0.3674, + "step": 1840 + }, + { + "epoch": 0.8561463339017207, + "grad_norm": 0.40788722038269043, + "learning_rate": 8.988694827638803e-06, + "loss": 0.3849, + "step": 1841 + }, + { + "epoch": 0.8566113780809177, + "grad_norm": 0.47962188720703125, + "learning_rate": 8.987062555751896e-06, + "loss": 0.3805, + "step": 1842 + }, + { + "epoch": 0.8570764222601147, + "grad_norm": 0.4847969114780426, + "learning_rate": 8.985429116119132e-06, + "loss": 0.3694, + "step": 1843 + }, + { + "epoch": 0.8575414664393117, + "grad_norm": 0.4671127200126648, + "learning_rate": 8.983794509218912e-06, + "loss": 0.3918, + "step": 1844 + }, + { + "epoch": 0.8580065106185087, + "grad_norm": 0.473143070936203, + "learning_rate": 8.982158735529991e-06, + "loss": 0.382, + "step": 1845 + }, + { + "epoch": 0.8584715547977058, + "grad_norm": 0.4718664586544037, + "learning_rate": 8.980521795531461e-06, + "loss": 0.3806, + "step": 1846 + }, + { + "epoch": 0.8589365989769028, + "grad_norm": 0.4485674500465393, + "learning_rate": 8.97888368970275e-06, + "loss": 0.3918, + "step": 1847 + }, + { + "epoch": 0.8594016431560998, + "grad_norm": 0.4322403073310852, + "learning_rate": 8.977244418523638e-06, + "loss": 0.3745, + "step": 1848 + }, + { + "epoch": 0.8598666873352968, + "grad_norm": 0.44763296842575073, + "learning_rate": 8.97560398247424e-06, + "loss": 0.3607, + "step": 1849 + }, + { + "epoch": 0.8603317315144938, + "grad_norm": 0.5826919674873352, + "learning_rate": 8.97396238203501e-06, + "loss": 0.395, + "step": 1850 + }, + { + "epoch": 0.860796775693691, + "grad_norm": 0.42212069034576416, + "learning_rate": 8.97231961768675e-06, + "loss": 0.3603, + "step": 1851 + }, + { + "epoch": 0.861261819872888, + "grad_norm": 0.49117767810821533, + "learning_rate": 8.970675689910596e-06, + "loss": 0.3789, + "step": 1852 + }, + { + "epoch": 0.861726864052085, + "grad_norm": 0.5760953426361084, + "learning_rate": 8.969030599188027e-06, + "loss": 0.3962, + "step": 1853 + }, + { + "epoch": 0.862191908231282, + "grad_norm": 0.4554479420185089, + "learning_rate": 8.967384346000866e-06, + "loss": 0.3854, + "step": 1854 + }, + { + "epoch": 0.862656952410479, + "grad_norm": 0.4693755507469177, + "learning_rate": 8.965736930831272e-06, + "loss": 0.383, + "step": 1855 + }, + { + "epoch": 0.863121996589676, + "grad_norm": 0.487821102142334, + "learning_rate": 8.964088354161748e-06, + "loss": 0.3819, + "step": 1856 + }, + { + "epoch": 0.863587040768873, + "grad_norm": 0.46892300248146057, + "learning_rate": 8.962438616475136e-06, + "loss": 0.3768, + "step": 1857 + }, + { + "epoch": 0.8640520849480701, + "grad_norm": 0.44840800762176514, + "learning_rate": 8.960787718254615e-06, + "loss": 0.3779, + "step": 1858 + }, + { + "epoch": 0.8645171291272671, + "grad_norm": 0.4931188225746155, + "learning_rate": 8.959135659983706e-06, + "loss": 0.3753, + "step": 1859 + }, + { + "epoch": 0.8649821733064641, + "grad_norm": 0.4526351988315582, + "learning_rate": 8.957482442146271e-06, + "loss": 0.3664, + "step": 1860 + }, + { + "epoch": 0.8654472174856611, + "grad_norm": 0.46007290482521057, + "learning_rate": 8.955828065226512e-06, + "loss": 0.3647, + "step": 1861 + }, + { + "epoch": 0.8659122616648581, + "grad_norm": 0.4510989487171173, + "learning_rate": 8.954172529708967e-06, + "loss": 0.3839, + "step": 1862 + }, + { + "epoch": 0.8663773058440551, + "grad_norm": 0.46228083968162537, + "learning_rate": 8.952515836078516e-06, + "loss": 0.3681, + "step": 1863 + }, + { + "epoch": 0.8668423500232522, + "grad_norm": 0.5067192912101746, + "learning_rate": 8.950857984820378e-06, + "loss": 0.3969, + "step": 1864 + }, + { + "epoch": 0.8673073942024492, + "grad_norm": 0.5053953528404236, + "learning_rate": 8.94919897642011e-06, + "loss": 0.3643, + "step": 1865 + }, + { + "epoch": 0.8677724383816463, + "grad_norm": 0.5040225386619568, + "learning_rate": 8.947538811363612e-06, + "loss": 0.4229, + "step": 1866 + }, + { + "epoch": 0.8682374825608433, + "grad_norm": 0.5020115971565247, + "learning_rate": 8.945877490137113e-06, + "loss": 0.3804, + "step": 1867 + }, + { + "epoch": 0.8687025267400403, + "grad_norm": 0.518023669719696, + "learning_rate": 8.944215013227193e-06, + "loss": 0.4007, + "step": 1868 + }, + { + "epoch": 0.8691675709192374, + "grad_norm": 0.4483552873134613, + "learning_rate": 8.942551381120763e-06, + "loss": 0.3953, + "step": 1869 + }, + { + "epoch": 0.8696326150984344, + "grad_norm": 0.5058504939079285, + "learning_rate": 8.94088659430507e-06, + "loss": 0.3782, + "step": 1870 + }, + { + "epoch": 0.8700976592776314, + "grad_norm": 0.46518880128860474, + "learning_rate": 8.939220653267708e-06, + "loss": 0.3827, + "step": 1871 + }, + { + "epoch": 0.8705627034568284, + "grad_norm": 0.4913824200630188, + "learning_rate": 8.937553558496602e-06, + "loss": 0.3815, + "step": 1872 + }, + { + "epoch": 0.8710277476360254, + "grad_norm": 0.5250646471977234, + "learning_rate": 8.935885310480018e-06, + "loss": 0.3706, + "step": 1873 + }, + { + "epoch": 0.8714927918152224, + "grad_norm": 0.44793933629989624, + "learning_rate": 8.934215909706554e-06, + "loss": 0.3856, + "step": 1874 + }, + { + "epoch": 0.8719578359944195, + "grad_norm": 0.5750781297683716, + "learning_rate": 8.932545356665157e-06, + "loss": 0.3723, + "step": 1875 + }, + { + "epoch": 0.8724228801736165, + "grad_norm": 0.46956464648246765, + "learning_rate": 8.930873651845101e-06, + "loss": 0.3936, + "step": 1876 + }, + { + "epoch": 0.8728879243528135, + "grad_norm": 0.4354320168495178, + "learning_rate": 8.929200795736003e-06, + "loss": 0.3627, + "step": 1877 + }, + { + "epoch": 0.8733529685320105, + "grad_norm": 0.5778939127922058, + "learning_rate": 8.927526788827814e-06, + "loss": 0.3856, + "step": 1878 + }, + { + "epoch": 0.8738180127112075, + "grad_norm": 0.4760781526565552, + "learning_rate": 8.925851631610825e-06, + "loss": 0.3778, + "step": 1879 + }, + { + "epoch": 0.8742830568904046, + "grad_norm": 0.48736944794654846, + "learning_rate": 8.92417532457566e-06, + "loss": 0.3975, + "step": 1880 + }, + { + "epoch": 0.8747481010696017, + "grad_norm": 0.4641439914703369, + "learning_rate": 8.922497868213284e-06, + "loss": 0.3756, + "step": 1881 + }, + { + "epoch": 0.8752131452487987, + "grad_norm": 0.5082623362541199, + "learning_rate": 8.920819263014995e-06, + "loss": 0.3836, + "step": 1882 + }, + { + "epoch": 0.8756781894279957, + "grad_norm": 0.5371342301368713, + "learning_rate": 8.919139509472433e-06, + "loss": 0.3863, + "step": 1883 + }, + { + "epoch": 0.8761432336071927, + "grad_norm": 0.44863003492355347, + "learning_rate": 8.917458608077566e-06, + "loss": 0.3654, + "step": 1884 + }, + { + "epoch": 0.8766082777863897, + "grad_norm": 0.48676854372024536, + "learning_rate": 8.915776559322704e-06, + "loss": 0.3949, + "step": 1885 + }, + { + "epoch": 0.8770733219655867, + "grad_norm": 0.4752778708934784, + "learning_rate": 8.914093363700493e-06, + "loss": 0.3897, + "step": 1886 + }, + { + "epoch": 0.8775383661447838, + "grad_norm": 0.4733431935310364, + "learning_rate": 8.912409021703914e-06, + "loss": 0.3728, + "step": 1887 + }, + { + "epoch": 0.8780034103239808, + "grad_norm": 0.44890302419662476, + "learning_rate": 8.910723533826281e-06, + "loss": 0.3891, + "step": 1888 + }, + { + "epoch": 0.8784684545031778, + "grad_norm": 0.574249804019928, + "learning_rate": 8.909036900561248e-06, + "loss": 0.3874, + "step": 1889 + }, + { + "epoch": 0.8789334986823748, + "grad_norm": 0.5202791094779968, + "learning_rate": 8.907349122402803e-06, + "loss": 0.3842, + "step": 1890 + }, + { + "epoch": 0.8793985428615718, + "grad_norm": 0.46931275725364685, + "learning_rate": 8.905660199845265e-06, + "loss": 0.3718, + "step": 1891 + }, + { + "epoch": 0.8798635870407688, + "grad_norm": 0.5120000243186951, + "learning_rate": 8.903970133383297e-06, + "loss": 0.3927, + "step": 1892 + }, + { + "epoch": 0.8803286312199659, + "grad_norm": 0.44593966007232666, + "learning_rate": 8.902278923511888e-06, + "loss": 0.388, + "step": 1893 + }, + { + "epoch": 0.8807936753991629, + "grad_norm": 0.4737493097782135, + "learning_rate": 8.900586570726369e-06, + "loss": 0.3728, + "step": 1894 + }, + { + "epoch": 0.88125871957836, + "grad_norm": 0.4651558995246887, + "learning_rate": 8.8988930755224e-06, + "loss": 0.3921, + "step": 1895 + }, + { + "epoch": 0.881723763757557, + "grad_norm": 0.5006027817726135, + "learning_rate": 8.897198438395983e-06, + "loss": 0.3782, + "step": 1896 + }, + { + "epoch": 0.882188807936754, + "grad_norm": 0.444982647895813, + "learning_rate": 8.895502659843442e-06, + "loss": 0.3813, + "step": 1897 + }, + { + "epoch": 0.882653852115951, + "grad_norm": 0.4869557023048401, + "learning_rate": 8.89380574036145e-06, + "loss": 0.4084, + "step": 1898 + }, + { + "epoch": 0.8831188962951481, + "grad_norm": 0.496622771024704, + "learning_rate": 8.892107680447005e-06, + "loss": 0.392, + "step": 1899 + }, + { + "epoch": 0.8835839404743451, + "grad_norm": 0.49721208214759827, + "learning_rate": 8.890408480597437e-06, + "loss": 0.3591, + "step": 1900 + }, + { + "epoch": 0.8840489846535421, + "grad_norm": 0.5105478763580322, + "learning_rate": 8.88870814131042e-06, + "loss": 0.387, + "step": 1901 + }, + { + "epoch": 0.8845140288327391, + "grad_norm": 0.5221246480941772, + "learning_rate": 8.887006663083952e-06, + "loss": 0.373, + "step": 1902 + }, + { + "epoch": 0.8849790730119361, + "grad_norm": 0.4854063391685486, + "learning_rate": 8.885304046416369e-06, + "loss": 0.3778, + "step": 1903 + }, + { + "epoch": 0.8854441171911331, + "grad_norm": 0.4377305209636688, + "learning_rate": 8.883600291806344e-06, + "loss": 0.4064, + "step": 1904 + }, + { + "epoch": 0.8859091613703302, + "grad_norm": 0.4939655065536499, + "learning_rate": 8.881895399752873e-06, + "loss": 0.3839, + "step": 1905 + }, + { + "epoch": 0.8863742055495272, + "grad_norm": 0.5449526906013489, + "learning_rate": 8.880189370755293e-06, + "loss": 0.3848, + "step": 1906 + }, + { + "epoch": 0.8868392497287242, + "grad_norm": 0.46097972989082336, + "learning_rate": 8.878482205313275e-06, + "loss": 0.3829, + "step": 1907 + }, + { + "epoch": 0.8873042939079212, + "grad_norm": 0.5087864995002747, + "learning_rate": 8.876773903926816e-06, + "loss": 0.3827, + "step": 1908 + }, + { + "epoch": 0.8877693380871183, + "grad_norm": 0.5424960255622864, + "learning_rate": 8.875064467096252e-06, + "loss": 0.37, + "step": 1909 + }, + { + "epoch": 0.8882343822663153, + "grad_norm": 0.4373340904712677, + "learning_rate": 8.873353895322248e-06, + "loss": 0.3723, + "step": 1910 + }, + { + "epoch": 0.8886994264455124, + "grad_norm": 0.5504875183105469, + "learning_rate": 8.871642189105804e-06, + "loss": 0.4028, + "step": 1911 + }, + { + "epoch": 0.8891644706247094, + "grad_norm": 0.5143681168556213, + "learning_rate": 8.869929348948252e-06, + "loss": 0.3809, + "step": 1912 + }, + { + "epoch": 0.8896295148039064, + "grad_norm": 0.4348513185977936, + "learning_rate": 8.868215375351251e-06, + "loss": 0.3806, + "step": 1913 + }, + { + "epoch": 0.8900945589831034, + "grad_norm": 0.5529665350914001, + "learning_rate": 8.866500268816803e-06, + "loss": 0.3994, + "step": 1914 + }, + { + "epoch": 0.8905596031623004, + "grad_norm": 0.5090189576148987, + "learning_rate": 8.864784029847227e-06, + "loss": 0.3922, + "step": 1915 + }, + { + "epoch": 0.8910246473414974, + "grad_norm": 0.4400188624858856, + "learning_rate": 8.863066658945185e-06, + "loss": 0.3955, + "step": 1916 + }, + { + "epoch": 0.8914896915206945, + "grad_norm": 0.5315966606140137, + "learning_rate": 8.861348156613667e-06, + "loss": 0.3723, + "step": 1917 + }, + { + "epoch": 0.8919547356998915, + "grad_norm": 0.5052797198295593, + "learning_rate": 8.859628523355995e-06, + "loss": 0.3741, + "step": 1918 + }, + { + "epoch": 0.8924197798790885, + "grad_norm": 0.4738040864467621, + "learning_rate": 8.857907759675822e-06, + "loss": 0.368, + "step": 1919 + }, + { + "epoch": 0.8928848240582855, + "grad_norm": 0.5382641553878784, + "learning_rate": 8.85618586607713e-06, + "loss": 0.3906, + "step": 1920 + }, + { + "epoch": 0.8933498682374825, + "grad_norm": 0.5306910276412964, + "learning_rate": 8.854462843064233e-06, + "loss": 0.3685, + "step": 1921 + }, + { + "epoch": 0.8938149124166795, + "grad_norm": 0.4780616760253906, + "learning_rate": 8.85273869114178e-06, + "loss": 0.3719, + "step": 1922 + }, + { + "epoch": 0.8942799565958766, + "grad_norm": 0.5241276621818542, + "learning_rate": 8.851013410814745e-06, + "loss": 0.3668, + "step": 1923 + }, + { + "epoch": 0.8947450007750737, + "grad_norm": 0.547930896282196, + "learning_rate": 8.84928700258843e-06, + "loss": 0.3678, + "step": 1924 + }, + { + "epoch": 0.8952100449542707, + "grad_norm": 0.5266174077987671, + "learning_rate": 8.847559466968482e-06, + "loss": 0.3808, + "step": 1925 + }, + { + "epoch": 0.8956750891334677, + "grad_norm": 0.4367474317550659, + "learning_rate": 8.845830804460861e-06, + "loss": 0.3975, + "step": 1926 + }, + { + "epoch": 0.8961401333126647, + "grad_norm": 0.4753339886665344, + "learning_rate": 8.844101015571867e-06, + "loss": 0.3894, + "step": 1927 + }, + { + "epoch": 0.8966051774918617, + "grad_norm": 0.5285556316375732, + "learning_rate": 8.842370100808123e-06, + "loss": 0.3891, + "step": 1928 + }, + { + "epoch": 0.8970702216710588, + "grad_norm": 0.48880597949028015, + "learning_rate": 8.84063806067659e-06, + "loss": 0.3949, + "step": 1929 + }, + { + "epoch": 0.8975352658502558, + "grad_norm": 0.46564510464668274, + "learning_rate": 8.838904895684555e-06, + "loss": 0.4002, + "step": 1930 + }, + { + "epoch": 0.8980003100294528, + "grad_norm": 0.4695758521556854, + "learning_rate": 8.837170606339628e-06, + "loss": 0.3763, + "step": 1931 + }, + { + "epoch": 0.8984653542086498, + "grad_norm": 0.5208839178085327, + "learning_rate": 8.835435193149762e-06, + "loss": 0.3677, + "step": 1932 + }, + { + "epoch": 0.8989303983878468, + "grad_norm": 0.5239862203598022, + "learning_rate": 8.833698656623227e-06, + "loss": 0.3733, + "step": 1933 + }, + { + "epoch": 0.8993954425670438, + "grad_norm": 0.4721670150756836, + "learning_rate": 8.831960997268625e-06, + "loss": 0.3773, + "step": 1934 + }, + { + "epoch": 0.8998604867462409, + "grad_norm": 0.5257996320724487, + "learning_rate": 8.83022221559489e-06, + "loss": 0.3535, + "step": 1935 + }, + { + "epoch": 0.9003255309254379, + "grad_norm": 0.5932273268699646, + "learning_rate": 8.828482312111285e-06, + "loss": 0.3875, + "step": 1936 + }, + { + "epoch": 0.9007905751046349, + "grad_norm": 0.4747033417224884, + "learning_rate": 8.826741287327396e-06, + "loss": 0.3726, + "step": 1937 + }, + { + "epoch": 0.9012556192838319, + "grad_norm": 0.4879438877105713, + "learning_rate": 8.824999141753144e-06, + "loss": 0.3816, + "step": 1938 + }, + { + "epoch": 0.901720663463029, + "grad_norm": 0.5382803678512573, + "learning_rate": 8.82325587589877e-06, + "loss": 0.4059, + "step": 1939 + }, + { + "epoch": 0.902185707642226, + "grad_norm": 0.5141168236732483, + "learning_rate": 8.821511490274854e-06, + "loss": 0.3807, + "step": 1940 + }, + { + "epoch": 0.9026507518214231, + "grad_norm": 0.5135001540184021, + "learning_rate": 8.819765985392297e-06, + "loss": 0.4041, + "step": 1941 + }, + { + "epoch": 0.9031157960006201, + "grad_norm": 0.5456048250198364, + "learning_rate": 8.818019361762325e-06, + "loss": 0.4146, + "step": 1942 + }, + { + "epoch": 0.9035808401798171, + "grad_norm": 0.5352010726928711, + "learning_rate": 8.816271619896502e-06, + "loss": 0.385, + "step": 1943 + }, + { + "epoch": 0.9040458843590141, + "grad_norm": 0.43585771322250366, + "learning_rate": 8.814522760306708e-06, + "loss": 0.3692, + "step": 1944 + }, + { + "epoch": 0.9045109285382111, + "grad_norm": 0.47126439213752747, + "learning_rate": 8.812772783505158e-06, + "loss": 0.3653, + "step": 1945 + }, + { + "epoch": 0.9049759727174082, + "grad_norm": 0.5107633471488953, + "learning_rate": 8.811021690004389e-06, + "loss": 0.3952, + "step": 1946 + }, + { + "epoch": 0.9054410168966052, + "grad_norm": 0.482185423374176, + "learning_rate": 8.80926948031727e-06, + "loss": 0.3861, + "step": 1947 + }, + { + "epoch": 0.9059060610758022, + "grad_norm": 0.4702143669128418, + "learning_rate": 8.807516154956997e-06, + "loss": 0.3701, + "step": 1948 + }, + { + "epoch": 0.9063711052549992, + "grad_norm": 0.5154238343238831, + "learning_rate": 8.80576171443709e-06, + "loss": 0.3809, + "step": 1949 + }, + { + "epoch": 0.9068361494341962, + "grad_norm": 0.49696359038352966, + "learning_rate": 8.80400615927139e-06, + "loss": 0.3743, + "step": 1950 + }, + { + "epoch": 0.9073011936133932, + "grad_norm": 0.49918052554130554, + "learning_rate": 8.802249489974078e-06, + "loss": 0.4045, + "step": 1951 + }, + { + "epoch": 0.9077662377925902, + "grad_norm": 0.45246899127960205, + "learning_rate": 8.80049170705965e-06, + "loss": 0.3693, + "step": 1952 + }, + { + "epoch": 0.9082312819717874, + "grad_norm": 0.4851575791835785, + "learning_rate": 8.798732811042934e-06, + "loss": 0.4069, + "step": 1953 + }, + { + "epoch": 0.9086963261509844, + "grad_norm": 0.5256077647209167, + "learning_rate": 8.796972802439079e-06, + "loss": 0.3695, + "step": 1954 + }, + { + "epoch": 0.9091613703301814, + "grad_norm": 0.4606427848339081, + "learning_rate": 8.795211681763565e-06, + "loss": 0.3749, + "step": 1955 + }, + { + "epoch": 0.9096264145093784, + "grad_norm": 0.4951748847961426, + "learning_rate": 8.793449449532198e-06, + "loss": 0.4129, + "step": 1956 + }, + { + "epoch": 0.9100914586885754, + "grad_norm": 0.4839325547218323, + "learning_rate": 8.791686106261104e-06, + "loss": 0.3883, + "step": 1957 + }, + { + "epoch": 0.9105565028677725, + "grad_norm": 0.4781914949417114, + "learning_rate": 8.789921652466738e-06, + "loss": 0.3883, + "step": 1958 + }, + { + "epoch": 0.9110215470469695, + "grad_norm": 0.4980541467666626, + "learning_rate": 8.78815608866588e-06, + "loss": 0.3689, + "step": 1959 + }, + { + "epoch": 0.9114865912261665, + "grad_norm": 0.523271381855011, + "learning_rate": 8.786389415375636e-06, + "loss": 0.3942, + "step": 1960 + }, + { + "epoch": 0.9119516354053635, + "grad_norm": 0.5033456683158875, + "learning_rate": 8.784621633113434e-06, + "loss": 0.3624, + "step": 1961 + }, + { + "epoch": 0.9124166795845605, + "grad_norm": 0.441829651594162, + "learning_rate": 8.78285274239703e-06, + "loss": 0.371, + "step": 1962 + }, + { + "epoch": 0.9128817237637575, + "grad_norm": 0.49837011098861694, + "learning_rate": 8.781082743744505e-06, + "loss": 0.4078, + "step": 1963 + }, + { + "epoch": 0.9133467679429546, + "grad_norm": 0.46190333366394043, + "learning_rate": 8.779311637674259e-06, + "loss": 0.397, + "step": 1964 + }, + { + "epoch": 0.9138118121221516, + "grad_norm": 0.5549447536468506, + "learning_rate": 8.777539424705022e-06, + "loss": 0.3938, + "step": 1965 + }, + { + "epoch": 0.9142768563013486, + "grad_norm": 0.4479234516620636, + "learning_rate": 8.775766105355849e-06, + "loss": 0.3767, + "step": 1966 + }, + { + "epoch": 0.9147419004805456, + "grad_norm": 0.4744507372379303, + "learning_rate": 8.773991680146113e-06, + "loss": 0.3853, + "step": 1967 + }, + { + "epoch": 0.9152069446597427, + "grad_norm": 0.4301864206790924, + "learning_rate": 8.772216149595515e-06, + "loss": 0.3836, + "step": 1968 + }, + { + "epoch": 0.9156719888389397, + "grad_norm": 0.4343145489692688, + "learning_rate": 8.77043951422408e-06, + "loss": 0.3789, + "step": 1969 + }, + { + "epoch": 0.9161370330181368, + "grad_norm": 0.43467792868614197, + "learning_rate": 8.768661774552155e-06, + "loss": 0.3736, + "step": 1970 + }, + { + "epoch": 0.9166020771973338, + "grad_norm": 0.43103089928627014, + "learning_rate": 8.766882931100411e-06, + "loss": 0.3627, + "step": 1971 + }, + { + "epoch": 0.9170671213765308, + "grad_norm": 0.43862855434417725, + "learning_rate": 8.765102984389842e-06, + "loss": 0.3811, + "step": 1972 + }, + { + "epoch": 0.9175321655557278, + "grad_norm": 0.48002341389656067, + "learning_rate": 8.763321934941766e-06, + "loss": 0.4119, + "step": 1973 + }, + { + "epoch": 0.9179972097349248, + "grad_norm": 0.43228980898857117, + "learning_rate": 8.761539783277825e-06, + "loss": 0.3808, + "step": 1974 + }, + { + "epoch": 0.9184622539141218, + "grad_norm": 0.43084922432899475, + "learning_rate": 8.75975652991998e-06, + "loss": 0.3998, + "step": 1975 + }, + { + "epoch": 0.9189272980933189, + "grad_norm": 0.4248911738395691, + "learning_rate": 8.757972175390516e-06, + "loss": 0.4031, + "step": 1976 + }, + { + "epoch": 0.9193923422725159, + "grad_norm": 0.4406249523162842, + "learning_rate": 8.756186720212045e-06, + "loss": 0.3819, + "step": 1977 + }, + { + "epoch": 0.9198573864517129, + "grad_norm": 0.4917864501476288, + "learning_rate": 8.754400164907496e-06, + "loss": 0.3958, + "step": 1978 + }, + { + "epoch": 0.9203224306309099, + "grad_norm": 0.44568273425102234, + "learning_rate": 8.752612510000123e-06, + "loss": 0.3707, + "step": 1979 + }, + { + "epoch": 0.9207874748101069, + "grad_norm": 0.5774033069610596, + "learning_rate": 8.750823756013498e-06, + "loss": 0.3855, + "step": 1980 + }, + { + "epoch": 0.9212525189893039, + "grad_norm": 0.4670921564102173, + "learning_rate": 8.749033903471522e-06, + "loss": 0.4012, + "step": 1981 + }, + { + "epoch": 0.921717563168501, + "grad_norm": 0.45798271894454956, + "learning_rate": 8.74724295289841e-06, + "loss": 0.3566, + "step": 1982 + }, + { + "epoch": 0.9221826073476981, + "grad_norm": 0.47769033908843994, + "learning_rate": 8.745450904818705e-06, + "loss": 0.3623, + "step": 1983 + }, + { + "epoch": 0.9226476515268951, + "grad_norm": 0.42745545506477356, + "learning_rate": 8.743657759757267e-06, + "loss": 0.3977, + "step": 1984 + }, + { + "epoch": 0.9231126957060921, + "grad_norm": 0.4700676500797272, + "learning_rate": 8.741863518239283e-06, + "loss": 0.3815, + "step": 1985 + }, + { + "epoch": 0.9235777398852891, + "grad_norm": 0.4442138671875, + "learning_rate": 8.740068180790252e-06, + "loss": 0.3957, + "step": 1986 + }, + { + "epoch": 0.9240427840644861, + "grad_norm": 0.42371588945388794, + "learning_rate": 8.738271747936001e-06, + "loss": 0.3715, + "step": 1987 + }, + { + "epoch": 0.9245078282436832, + "grad_norm": 0.476406067609787, + "learning_rate": 8.736474220202675e-06, + "loss": 0.3946, + "step": 1988 + }, + { + "epoch": 0.9249728724228802, + "grad_norm": 0.4362446367740631, + "learning_rate": 8.734675598116743e-06, + "loss": 0.3665, + "step": 1989 + }, + { + "epoch": 0.9254379166020772, + "grad_norm": 0.4169759154319763, + "learning_rate": 8.732875882204993e-06, + "loss": 0.3747, + "step": 1990 + }, + { + "epoch": 0.9259029607812742, + "grad_norm": 0.4496033191680908, + "learning_rate": 8.73107507299453e-06, + "loss": 0.3798, + "step": 1991 + }, + { + "epoch": 0.9263680049604712, + "grad_norm": 0.47536760568618774, + "learning_rate": 8.729273171012782e-06, + "loss": 0.3914, + "step": 1992 + }, + { + "epoch": 0.9268330491396682, + "grad_norm": 0.4569942355155945, + "learning_rate": 8.727470176787498e-06, + "loss": 0.3665, + "step": 1993 + }, + { + "epoch": 0.9272980933188653, + "grad_norm": 0.46827104687690735, + "learning_rate": 8.725666090846746e-06, + "loss": 0.3673, + "step": 1994 + }, + { + "epoch": 0.9277631374980623, + "grad_norm": 0.5109233856201172, + "learning_rate": 8.72386091371891e-06, + "loss": 0.3717, + "step": 1995 + }, + { + "epoch": 0.9282281816772593, + "grad_norm": 0.4976223409175873, + "learning_rate": 8.7220546459327e-06, + "loss": 0.3748, + "step": 1996 + }, + { + "epoch": 0.9286932258564564, + "grad_norm": 0.42419055104255676, + "learning_rate": 8.720247288017143e-06, + "loss": 0.3478, + "step": 1997 + }, + { + "epoch": 0.9291582700356534, + "grad_norm": 0.5530095100402832, + "learning_rate": 8.718438840501585e-06, + "loss": 0.3746, + "step": 1998 + }, + { + "epoch": 0.9296233142148504, + "grad_norm": 0.44324877858161926, + "learning_rate": 8.716629303915689e-06, + "loss": 0.3711, + "step": 1999 + }, + { + "epoch": 0.9300883583940475, + "grad_norm": 0.4611654281616211, + "learning_rate": 8.71481867878944e-06, + "loss": 0.3642, + "step": 2000 + }, + { + "epoch": 0.9305534025732445, + "grad_norm": 0.5441577434539795, + "learning_rate": 8.71300696565314e-06, + "loss": 0.373, + "step": 2001 + }, + { + "epoch": 0.9310184467524415, + "grad_norm": 0.5150545239448547, + "learning_rate": 8.71119416503741e-06, + "loss": 0.366, + "step": 2002 + }, + { + "epoch": 0.9314834909316385, + "grad_norm": 0.5382184386253357, + "learning_rate": 8.709380277473191e-06, + "loss": 0.3885, + "step": 2003 + }, + { + "epoch": 0.9319485351108355, + "grad_norm": 0.5589922070503235, + "learning_rate": 8.707565303491741e-06, + "loss": 0.3871, + "step": 2004 + }, + { + "epoch": 0.9324135792900325, + "grad_norm": 0.5470889210700989, + "learning_rate": 8.705749243624635e-06, + "loss": 0.4045, + "step": 2005 + }, + { + "epoch": 0.9328786234692296, + "grad_norm": 0.5791128873825073, + "learning_rate": 8.70393209840377e-06, + "loss": 0.3931, + "step": 2006 + }, + { + "epoch": 0.9333436676484266, + "grad_norm": 0.4637211859226227, + "learning_rate": 8.702113868361357e-06, + "loss": 0.3823, + "step": 2007 + }, + { + "epoch": 0.9338087118276236, + "grad_norm": 0.5710762143135071, + "learning_rate": 8.700294554029926e-06, + "loss": 0.381, + "step": 2008 + }, + { + "epoch": 0.9342737560068206, + "grad_norm": 0.454738050699234, + "learning_rate": 8.698474155942325e-06, + "loss": 0.3765, + "step": 2009 + }, + { + "epoch": 0.9347388001860176, + "grad_norm": 0.5094452500343323, + "learning_rate": 8.696652674631716e-06, + "loss": 0.4076, + "step": 2010 + }, + { + "epoch": 0.9352038443652146, + "grad_norm": 0.46923717856407166, + "learning_rate": 8.694830110631587e-06, + "loss": 0.3796, + "step": 2011 + }, + { + "epoch": 0.9356688885444118, + "grad_norm": 0.5491566061973572, + "learning_rate": 8.693006464475732e-06, + "loss": 0.3716, + "step": 2012 + }, + { + "epoch": 0.9361339327236088, + "grad_norm": 0.5266678929328918, + "learning_rate": 8.691181736698272e-06, + "loss": 0.3616, + "step": 2013 + }, + { + "epoch": 0.9365989769028058, + "grad_norm": 0.4861200153827667, + "learning_rate": 8.689355927833636e-06, + "loss": 0.3638, + "step": 2014 + }, + { + "epoch": 0.9370640210820028, + "grad_norm": 0.5367874503135681, + "learning_rate": 8.687529038416575e-06, + "loss": 0.3787, + "step": 2015 + }, + { + "epoch": 0.9375290652611998, + "grad_norm": 0.47685980796813965, + "learning_rate": 8.685701068982158e-06, + "loss": 0.3904, + "step": 2016 + }, + { + "epoch": 0.9379941094403969, + "grad_norm": 0.4525419771671295, + "learning_rate": 8.683872020065763e-06, + "loss": 0.4035, + "step": 2017 + }, + { + "epoch": 0.9384591536195939, + "grad_norm": 0.45574548840522766, + "learning_rate": 8.682041892203093e-06, + "loss": 0.4004, + "step": 2018 + }, + { + "epoch": 0.9389241977987909, + "grad_norm": 0.4963834285736084, + "learning_rate": 8.68021068593016e-06, + "loss": 0.3706, + "step": 2019 + }, + { + "epoch": 0.9393892419779879, + "grad_norm": 0.46662190556526184, + "learning_rate": 8.678378401783293e-06, + "loss": 0.4034, + "step": 2020 + }, + { + "epoch": 0.9398542861571849, + "grad_norm": 0.45267191529273987, + "learning_rate": 8.676545040299145e-06, + "loss": 0.3866, + "step": 2021 + }, + { + "epoch": 0.9403193303363819, + "grad_norm": 0.4719865620136261, + "learning_rate": 8.674710602014672e-06, + "loss": 0.3831, + "step": 2022 + }, + { + "epoch": 0.940784374515579, + "grad_norm": 0.42862021923065186, + "learning_rate": 8.67287508746715e-06, + "loss": 0.3466, + "step": 2023 + }, + { + "epoch": 0.941249418694776, + "grad_norm": 0.48866480588912964, + "learning_rate": 8.671038497194175e-06, + "loss": 0.3776, + "step": 2024 + }, + { + "epoch": 0.941714462873973, + "grad_norm": 0.4378620684146881, + "learning_rate": 8.669200831733655e-06, + "loss": 0.3899, + "step": 2025 + }, + { + "epoch": 0.9421795070531701, + "grad_norm": 0.45231664180755615, + "learning_rate": 8.66736209162381e-06, + "loss": 0.3469, + "step": 2026 + }, + { + "epoch": 0.9426445512323671, + "grad_norm": 0.560242235660553, + "learning_rate": 8.665522277403177e-06, + "loss": 0.3633, + "step": 2027 + }, + { + "epoch": 0.9431095954115641, + "grad_norm": 0.431618332862854, + "learning_rate": 8.66368138961061e-06, + "loss": 0.3988, + "step": 2028 + }, + { + "epoch": 0.9435746395907612, + "grad_norm": 0.46893802285194397, + "learning_rate": 8.661839428785273e-06, + "loss": 0.3829, + "step": 2029 + }, + { + "epoch": 0.9440396837699582, + "grad_norm": 0.506652295589447, + "learning_rate": 8.659996395466648e-06, + "loss": 0.3853, + "step": 2030 + }, + { + "epoch": 0.9445047279491552, + "grad_norm": 0.5342637896537781, + "learning_rate": 8.658152290194526e-06, + "loss": 0.4173, + "step": 2031 + }, + { + "epoch": 0.9449697721283522, + "grad_norm": 0.43193793296813965, + "learning_rate": 8.656307113509021e-06, + "loss": 0.367, + "step": 2032 + }, + { + "epoch": 0.9454348163075492, + "grad_norm": 0.43902087211608887, + "learning_rate": 8.654460865950551e-06, + "loss": 0.3629, + "step": 2033 + }, + { + "epoch": 0.9458998604867462, + "grad_norm": 0.5017895698547363, + "learning_rate": 8.652613548059854e-06, + "loss": 0.3706, + "step": 2034 + }, + { + "epoch": 0.9463649046659433, + "grad_norm": 0.45130717754364014, + "learning_rate": 8.650765160377978e-06, + "loss": 0.3811, + "step": 2035 + }, + { + "epoch": 0.9468299488451403, + "grad_norm": 0.4452032744884491, + "learning_rate": 8.648915703446287e-06, + "loss": 0.3904, + "step": 2036 + }, + { + "epoch": 0.9472949930243373, + "grad_norm": 0.5498278141021729, + "learning_rate": 8.647065177806457e-06, + "loss": 0.3611, + "step": 2037 + }, + { + "epoch": 0.9477600372035343, + "grad_norm": 0.41174963116645813, + "learning_rate": 8.645213584000476e-06, + "loss": 0.3793, + "step": 2038 + }, + { + "epoch": 0.9482250813827313, + "grad_norm": 0.5462541580200195, + "learning_rate": 8.643360922570646e-06, + "loss": 0.3601, + "step": 2039 + }, + { + "epoch": 0.9486901255619283, + "grad_norm": 0.5303115844726562, + "learning_rate": 8.64150719405958e-06, + "loss": 0.3918, + "step": 2040 + }, + { + "epoch": 0.9491551697411255, + "grad_norm": 0.45511001348495483, + "learning_rate": 8.639652399010208e-06, + "loss": 0.4027, + "step": 2041 + }, + { + "epoch": 0.9496202139203225, + "grad_norm": 0.5173806548118591, + "learning_rate": 8.637796537965768e-06, + "loss": 0.372, + "step": 2042 + }, + { + "epoch": 0.9500852580995195, + "grad_norm": 0.45395195484161377, + "learning_rate": 8.63593961146981e-06, + "loss": 0.3794, + "step": 2043 + }, + { + "epoch": 0.9505503022787165, + "grad_norm": 0.5278088450431824, + "learning_rate": 8.634081620066199e-06, + "loss": 0.388, + "step": 2044 + }, + { + "epoch": 0.9510153464579135, + "grad_norm": 0.4131206274032593, + "learning_rate": 8.632222564299111e-06, + "loss": 0.3784, + "step": 2045 + }, + { + "epoch": 0.9514803906371105, + "grad_norm": 0.4544680118560791, + "learning_rate": 8.630362444713033e-06, + "loss": 0.3816, + "step": 2046 + }, + { + "epoch": 0.9519454348163076, + "grad_norm": 0.534386396408081, + "learning_rate": 8.628501261852765e-06, + "loss": 0.3543, + "step": 2047 + }, + { + "epoch": 0.9524104789955046, + "grad_norm": 0.44800546765327454, + "learning_rate": 8.626639016263413e-06, + "loss": 0.3832, + "step": 2048 + }, + { + "epoch": 0.9528755231747016, + "grad_norm": 0.5088453888893127, + "learning_rate": 8.624775708490403e-06, + "loss": 0.3755, + "step": 2049 + }, + { + "epoch": 0.9533405673538986, + "grad_norm": 0.5941973328590393, + "learning_rate": 8.622911339079464e-06, + "loss": 0.3948, + "step": 2050 + }, + { + "epoch": 0.9538056115330956, + "grad_norm": 0.45851653814315796, + "learning_rate": 8.621045908576642e-06, + "loss": 0.3968, + "step": 2051 + }, + { + "epoch": 0.9542706557122926, + "grad_norm": 0.5592479109764099, + "learning_rate": 8.619179417528293e-06, + "loss": 0.3982, + "step": 2052 + }, + { + "epoch": 0.9547356998914897, + "grad_norm": 0.5447689294815063, + "learning_rate": 8.617311866481076e-06, + "loss": 0.3877, + "step": 2053 + }, + { + "epoch": 0.9552007440706867, + "grad_norm": 0.4597291052341461, + "learning_rate": 8.61544325598197e-06, + "loss": 0.3644, + "step": 2054 + }, + { + "epoch": 0.9556657882498837, + "grad_norm": 0.5028977394104004, + "learning_rate": 8.613573586578262e-06, + "loss": 0.3604, + "step": 2055 + }, + { + "epoch": 0.9561308324290808, + "grad_norm": 0.5437200665473938, + "learning_rate": 8.611702858817545e-06, + "loss": 0.3916, + "step": 2056 + }, + { + "epoch": 0.9565958766082778, + "grad_norm": 0.4929006099700928, + "learning_rate": 8.609831073247728e-06, + "loss": 0.3633, + "step": 2057 + }, + { + "epoch": 0.9570609207874748, + "grad_norm": 0.4636727273464203, + "learning_rate": 8.607958230417024e-06, + "loss": 0.3587, + "step": 2058 + }, + { + "epoch": 0.9575259649666719, + "grad_norm": 0.39754369854927063, + "learning_rate": 8.606084330873958e-06, + "loss": 0.39, + "step": 2059 + }, + { + "epoch": 0.9579910091458689, + "grad_norm": 0.47713038325309753, + "learning_rate": 8.604209375167366e-06, + "loss": 0.3848, + "step": 2060 + }, + { + "epoch": 0.9584560533250659, + "grad_norm": 0.4594153165817261, + "learning_rate": 8.602333363846393e-06, + "loss": 0.3805, + "step": 2061 + }, + { + "epoch": 0.9589210975042629, + "grad_norm": 0.4612623453140259, + "learning_rate": 8.600456297460491e-06, + "loss": 0.3809, + "step": 2062 + }, + { + "epoch": 0.9593861416834599, + "grad_norm": 0.4625248312950134, + "learning_rate": 8.598578176559423e-06, + "loss": 0.3529, + "step": 2063 + }, + { + "epoch": 0.9598511858626569, + "grad_norm": 0.4562045633792877, + "learning_rate": 8.596699001693257e-06, + "loss": 0.367, + "step": 2064 + }, + { + "epoch": 0.960316230041854, + "grad_norm": 0.4903472065925598, + "learning_rate": 8.594818773412376e-06, + "loss": 0.3759, + "step": 2065 + }, + { + "epoch": 0.960781274221051, + "grad_norm": 0.4430171251296997, + "learning_rate": 8.592937492267466e-06, + "loss": 0.3993, + "step": 2066 + }, + { + "epoch": 0.961246318400248, + "grad_norm": 0.4540698826313019, + "learning_rate": 8.591055158809526e-06, + "loss": 0.3555, + "step": 2067 + }, + { + "epoch": 0.961711362579445, + "grad_norm": 0.4455225169658661, + "learning_rate": 8.589171773589861e-06, + "loss": 0.3937, + "step": 2068 + }, + { + "epoch": 0.962176406758642, + "grad_norm": 0.45158520340919495, + "learning_rate": 8.587287337160083e-06, + "loss": 0.3694, + "step": 2069 + }, + { + "epoch": 0.9626414509378391, + "grad_norm": 0.44573891162872314, + "learning_rate": 8.585401850072114e-06, + "loss": 0.364, + "step": 2070 + }, + { + "epoch": 0.9631064951170362, + "grad_norm": 0.44659656286239624, + "learning_rate": 8.58351531287818e-06, + "loss": 0.393, + "step": 2071 + }, + { + "epoch": 0.9635715392962332, + "grad_norm": 0.4550364911556244, + "learning_rate": 8.581627726130817e-06, + "loss": 0.3919, + "step": 2072 + }, + { + "epoch": 0.9640365834754302, + "grad_norm": 0.4669656455516815, + "learning_rate": 8.579739090382873e-06, + "loss": 0.4129, + "step": 2073 + }, + { + "epoch": 0.9645016276546272, + "grad_norm": 0.45073822140693665, + "learning_rate": 8.577849406187493e-06, + "loss": 0.3963, + "step": 2074 + }, + { + "epoch": 0.9649666718338242, + "grad_norm": 0.4639911949634552, + "learning_rate": 8.575958674098138e-06, + "loss": 0.3857, + "step": 2075 + }, + { + "epoch": 0.9654317160130212, + "grad_norm": 0.4226040244102478, + "learning_rate": 8.574066894668573e-06, + "loss": 0.3762, + "step": 2076 + }, + { + "epoch": 0.9658967601922183, + "grad_norm": 0.42542764544487, + "learning_rate": 8.572174068452867e-06, + "loss": 0.3502, + "step": 2077 + }, + { + "epoch": 0.9663618043714153, + "grad_norm": 0.46741747856140137, + "learning_rate": 8.570280196005403e-06, + "loss": 0.3717, + "step": 2078 + }, + { + "epoch": 0.9668268485506123, + "grad_norm": 0.4867299199104309, + "learning_rate": 8.568385277880859e-06, + "loss": 0.3835, + "step": 2079 + }, + { + "epoch": 0.9672918927298093, + "grad_norm": 0.4860565662384033, + "learning_rate": 8.56648931463423e-06, + "loss": 0.3879, + "step": 2080 + }, + { + "epoch": 0.9677569369090063, + "grad_norm": 0.44712382555007935, + "learning_rate": 8.564592306820813e-06, + "loss": 0.3715, + "step": 2081 + }, + { + "epoch": 0.9682219810882033, + "grad_norm": 0.4493529498577118, + "learning_rate": 8.562694254996208e-06, + "loss": 0.3594, + "step": 2082 + }, + { + "epoch": 0.9686870252674004, + "grad_norm": 0.44625014066696167, + "learning_rate": 8.560795159716327e-06, + "loss": 0.3816, + "step": 2083 + }, + { + "epoch": 0.9691520694465974, + "grad_norm": 0.44804733991622925, + "learning_rate": 8.55889502153738e-06, + "loss": 0.3942, + "step": 2084 + }, + { + "epoch": 0.9696171136257945, + "grad_norm": 0.43751975893974304, + "learning_rate": 8.55699384101589e-06, + "loss": 0.3754, + "step": 2085 + }, + { + "epoch": 0.9700821578049915, + "grad_norm": 0.5086488723754883, + "learning_rate": 8.555091618708681e-06, + "loss": 0.3861, + "step": 2086 + }, + { + "epoch": 0.9705472019841885, + "grad_norm": 0.4142938554286957, + "learning_rate": 8.553188355172882e-06, + "loss": 0.382, + "step": 2087 + }, + { + "epoch": 0.9710122461633856, + "grad_norm": 0.4268786311149597, + "learning_rate": 8.551284050965929e-06, + "loss": 0.3612, + "step": 2088 + }, + { + "epoch": 0.9714772903425826, + "grad_norm": 0.40908634662628174, + "learning_rate": 8.54937870664556e-06, + "loss": 0.3853, + "step": 2089 + }, + { + "epoch": 0.9719423345217796, + "grad_norm": 0.4633747935295105, + "learning_rate": 8.547472322769825e-06, + "loss": 0.3561, + "step": 2090 + }, + { + "epoch": 0.9724073787009766, + "grad_norm": 0.45280638337135315, + "learning_rate": 8.545564899897066e-06, + "loss": 0.4023, + "step": 2091 + }, + { + "epoch": 0.9728724228801736, + "grad_norm": 0.5002387166023254, + "learning_rate": 8.54365643858594e-06, + "loss": 0.3743, + "step": 2092 + }, + { + "epoch": 0.9733374670593706, + "grad_norm": 0.49016910791397095, + "learning_rate": 8.541746939395403e-06, + "loss": 0.3708, + "step": 2093 + }, + { + "epoch": 0.9738025112385676, + "grad_norm": 0.4835710823535919, + "learning_rate": 8.539836402884715e-06, + "loss": 0.3901, + "step": 2094 + }, + { + "epoch": 0.9742675554177647, + "grad_norm": 0.5034793615341187, + "learning_rate": 8.537924829613444e-06, + "loss": 0.3716, + "step": 2095 + }, + { + "epoch": 0.9747325995969617, + "grad_norm": 0.5456627011299133, + "learning_rate": 8.536012220141458e-06, + "loss": 0.3759, + "step": 2096 + }, + { + "epoch": 0.9751976437761587, + "grad_norm": 0.4535893499851227, + "learning_rate": 8.534098575028928e-06, + "loss": 0.3818, + "step": 2097 + }, + { + "epoch": 0.9756626879553557, + "grad_norm": 0.40453192591667175, + "learning_rate": 8.53218389483633e-06, + "loss": 0.3686, + "step": 2098 + }, + { + "epoch": 0.9761277321345527, + "grad_norm": 0.5098088979721069, + "learning_rate": 8.530268180124444e-06, + "loss": 0.385, + "step": 2099 + }, + { + "epoch": 0.9765927763137499, + "grad_norm": 0.47901231050491333, + "learning_rate": 8.528351431454352e-06, + "loss": 0.3932, + "step": 2100 + }, + { + "epoch": 0.9770578204929469, + "grad_norm": 0.4342050850391388, + "learning_rate": 8.526433649387435e-06, + "loss": 0.3771, + "step": 2101 + }, + { + "epoch": 0.9775228646721439, + "grad_norm": 0.424624502658844, + "learning_rate": 8.524514834485382e-06, + "loss": 0.3976, + "step": 2102 + }, + { + "epoch": 0.9779879088513409, + "grad_norm": 0.4495830237865448, + "learning_rate": 8.522594987310184e-06, + "loss": 0.3901, + "step": 2103 + }, + { + "epoch": 0.9784529530305379, + "grad_norm": 0.46972575783729553, + "learning_rate": 8.520674108424134e-06, + "loss": 0.4025, + "step": 2104 + }, + { + "epoch": 0.9789179972097349, + "grad_norm": 0.46081069111824036, + "learning_rate": 8.518752198389823e-06, + "loss": 0.3794, + "step": 2105 + }, + { + "epoch": 0.979383041388932, + "grad_norm": 0.47898101806640625, + "learning_rate": 8.51682925777015e-06, + "loss": 0.3701, + "step": 2106 + }, + { + "epoch": 0.979848085568129, + "grad_norm": 0.4954424500465393, + "learning_rate": 8.51490528712831e-06, + "loss": 0.3656, + "step": 2107 + }, + { + "epoch": 0.980313129747326, + "grad_norm": 0.5420515537261963, + "learning_rate": 8.512980287027805e-06, + "loss": 0.37, + "step": 2108 + }, + { + "epoch": 0.980778173926523, + "grad_norm": 0.42437291145324707, + "learning_rate": 8.511054258032436e-06, + "loss": 0.395, + "step": 2109 + }, + { + "epoch": 0.98124321810572, + "grad_norm": 0.4806736409664154, + "learning_rate": 8.509127200706305e-06, + "loss": 0.3452, + "step": 2110 + }, + { + "epoch": 0.981708262284917, + "grad_norm": 0.4276758134365082, + "learning_rate": 8.507199115613818e-06, + "loss": 0.3351, + "step": 2111 + }, + { + "epoch": 0.982173306464114, + "grad_norm": 0.484733909368515, + "learning_rate": 8.505270003319676e-06, + "loss": 0.3501, + "step": 2112 + }, + { + "epoch": 0.9826383506433111, + "grad_norm": 0.5105139017105103, + "learning_rate": 8.503339864388887e-06, + "loss": 0.4193, + "step": 2113 + }, + { + "epoch": 0.9831033948225082, + "grad_norm": 0.5405691266059875, + "learning_rate": 8.501408699386758e-06, + "loss": 0.3912, + "step": 2114 + }, + { + "epoch": 0.9835684390017052, + "grad_norm": 0.438722163438797, + "learning_rate": 8.499476508878894e-06, + "loss": 0.3935, + "step": 2115 + }, + { + "epoch": 0.9840334831809022, + "grad_norm": 0.5080367922782898, + "learning_rate": 8.497543293431202e-06, + "loss": 0.3764, + "step": 2116 + }, + { + "epoch": 0.9844985273600992, + "grad_norm": 0.49082526564598083, + "learning_rate": 8.495609053609893e-06, + "loss": 0.3737, + "step": 2117 + }, + { + "epoch": 0.9849635715392963, + "grad_norm": 0.4634253978729248, + "learning_rate": 8.49367378998147e-06, + "loss": 0.3979, + "step": 2118 + }, + { + "epoch": 0.9854286157184933, + "grad_norm": 0.4806537926197052, + "learning_rate": 8.491737503112744e-06, + "loss": 0.3773, + "step": 2119 + }, + { + "epoch": 0.9858936598976903, + "grad_norm": 0.5054664611816406, + "learning_rate": 8.489800193570818e-06, + "loss": 0.3704, + "step": 2120 + }, + { + "epoch": 0.9863587040768873, + "grad_norm": 0.47105222940444946, + "learning_rate": 8.487861861923103e-06, + "loss": 0.3684, + "step": 2121 + }, + { + "epoch": 0.9868237482560843, + "grad_norm": 0.4627211391925812, + "learning_rate": 8.485922508737302e-06, + "loss": 0.3776, + "step": 2122 + }, + { + "epoch": 0.9872887924352813, + "grad_norm": 0.4847998321056366, + "learning_rate": 8.483982134581419e-06, + "loss": 0.3859, + "step": 2123 + }, + { + "epoch": 0.9877538366144784, + "grad_norm": 0.5242019295692444, + "learning_rate": 8.48204074002376e-06, + "loss": 0.3859, + "step": 2124 + }, + { + "epoch": 0.9882188807936754, + "grad_norm": 0.48214638233184814, + "learning_rate": 8.480098325632928e-06, + "loss": 0.364, + "step": 2125 + }, + { + "epoch": 0.9886839249728724, + "grad_norm": 0.4689680337905884, + "learning_rate": 8.478154891977825e-06, + "loss": 0.3843, + "step": 2126 + }, + { + "epoch": 0.9891489691520694, + "grad_norm": 0.609089195728302, + "learning_rate": 8.47621043962765e-06, + "loss": 0.4067, + "step": 2127 + }, + { + "epoch": 0.9896140133312664, + "grad_norm": 0.4945160150527954, + "learning_rate": 8.474264969151902e-06, + "loss": 0.3718, + "step": 2128 + }, + { + "epoch": 0.9900790575104635, + "grad_norm": 0.5477257370948792, + "learning_rate": 8.472318481120377e-06, + "loss": 0.3698, + "step": 2129 + }, + { + "epoch": 0.9905441016896606, + "grad_norm": 0.5951197743415833, + "learning_rate": 8.470370976103171e-06, + "loss": 0.3752, + "step": 2130 + }, + { + "epoch": 0.9910091458688576, + "grad_norm": 0.4970504343509674, + "learning_rate": 8.468422454670674e-06, + "loss": 0.3917, + "step": 2131 + }, + { + "epoch": 0.9914741900480546, + "grad_norm": 0.48202067613601685, + "learning_rate": 8.46647291739358e-06, + "loss": 0.3855, + "step": 2132 + }, + { + "epoch": 0.9919392342272516, + "grad_norm": 0.49319443106651306, + "learning_rate": 8.464522364842874e-06, + "loss": 0.357, + "step": 2133 + }, + { + "epoch": 0.9924042784064486, + "grad_norm": 0.47641077637672424, + "learning_rate": 8.462570797589842e-06, + "loss": 0.3823, + "step": 2134 + }, + { + "epoch": 0.9928693225856456, + "grad_norm": 0.4628462791442871, + "learning_rate": 8.460618216206069e-06, + "loss": 0.36, + "step": 2135 + }, + { + "epoch": 0.9933343667648427, + "grad_norm": 0.4999821186065674, + "learning_rate": 8.458664621263428e-06, + "loss": 0.402, + "step": 2136 + }, + { + "epoch": 0.9937994109440397, + "grad_norm": 0.4732115864753723, + "learning_rate": 8.456710013334102e-06, + "loss": 0.376, + "step": 2137 + }, + { + "epoch": 0.9942644551232367, + "grad_norm": 0.42666521668434143, + "learning_rate": 8.45475439299056e-06, + "loss": 0.3759, + "step": 2138 + }, + { + "epoch": 0.9947294993024337, + "grad_norm": 0.512298583984375, + "learning_rate": 8.452797760805572e-06, + "loss": 0.3646, + "step": 2139 + }, + { + "epoch": 0.9951945434816307, + "grad_norm": 0.5457414984703064, + "learning_rate": 8.450840117352203e-06, + "loss": 0.3637, + "step": 2140 + }, + { + "epoch": 0.9956595876608277, + "grad_norm": 0.4568066895008087, + "learning_rate": 8.448881463203819e-06, + "loss": 0.4164, + "step": 2141 + }, + { + "epoch": 0.9961246318400248, + "grad_norm": 0.4378894865512848, + "learning_rate": 8.446921798934074e-06, + "loss": 0.3666, + "step": 2142 + }, + { + "epoch": 0.9965896760192218, + "grad_norm": 0.5727525949478149, + "learning_rate": 8.444961125116924e-06, + "loss": 0.376, + "step": 2143 + }, + { + "epoch": 0.9970547201984189, + "grad_norm": 0.5255966782569885, + "learning_rate": 8.442999442326617e-06, + "loss": 0.3902, + "step": 2144 + }, + { + "epoch": 0.9975197643776159, + "grad_norm": 0.4558255076408386, + "learning_rate": 8.441036751137697e-06, + "loss": 0.3963, + "step": 2145 + }, + { + "epoch": 0.9979848085568129, + "grad_norm": 0.49802669882774353, + "learning_rate": 8.439073052125006e-06, + "loss": 0.3579, + "step": 2146 + }, + { + "epoch": 0.99844985273601, + "grad_norm": 0.5205224752426147, + "learning_rate": 8.43710834586368e-06, + "loss": 0.3696, + "step": 2147 + }, + { + "epoch": 0.998914896915207, + "grad_norm": 0.4866522252559662, + "learning_rate": 8.435142632929149e-06, + "loss": 0.3896, + "step": 2148 + }, + { + "epoch": 0.999379941094404, + "grad_norm": 0.4538208842277527, + "learning_rate": 8.43317591389714e-06, + "loss": 0.3607, + "step": 2149 + }, + { + "epoch": 0.999844985273601, + "grad_norm": 0.5789209604263306, + "learning_rate": 8.43120818934367e-06, + "loss": 0.379, + "step": 2150 + }, + { + "epoch": 1.000310029452798, + "grad_norm": 0.8949771523475647, + "learning_rate": 8.429239459845053e-06, + "loss": 0.5959, + "step": 2151 + }, + { + "epoch": 1.0007750736319951, + "grad_norm": 0.4994892477989197, + "learning_rate": 8.427269725977902e-06, + "loss": 0.3726, + "step": 2152 + }, + { + "epoch": 1.001240117811192, + "grad_norm": 0.49586233496665955, + "learning_rate": 8.425298988319119e-06, + "loss": 0.3367, + "step": 2153 + }, + { + "epoch": 1.0017051619903892, + "grad_norm": 0.4944624900817871, + "learning_rate": 8.423327247445898e-06, + "loss": 0.3802, + "step": 2154 + }, + { + "epoch": 1.002170206169586, + "grad_norm": 0.45939287543296814, + "learning_rate": 8.421354503935733e-06, + "loss": 0.3271, + "step": 2155 + }, + { + "epoch": 1.0026352503487832, + "grad_norm": 0.5419012308120728, + "learning_rate": 8.419380758366407e-06, + "loss": 0.3778, + "step": 2156 + }, + { + "epoch": 1.00310029452798, + "grad_norm": 0.43736547231674194, + "learning_rate": 8.417406011316e-06, + "loss": 0.3852, + "step": 2157 + }, + { + "epoch": 1.0035653387071772, + "grad_norm": 0.5195603966712952, + "learning_rate": 8.415430263362878e-06, + "loss": 0.3837, + "step": 2158 + }, + { + "epoch": 1.0040303828863741, + "grad_norm": 0.5915653109550476, + "learning_rate": 8.413453515085712e-06, + "loss": 0.3953, + "step": 2159 + }, + { + "epoch": 1.0044954270655713, + "grad_norm": 0.4842241406440735, + "learning_rate": 8.411475767063454e-06, + "loss": 0.3372, + "step": 2160 + }, + { + "epoch": 1.0049604712447682, + "grad_norm": 0.46653223037719727, + "learning_rate": 8.409497019875362e-06, + "loss": 0.3541, + "step": 2161 + }, + { + "epoch": 1.0054255154239653, + "grad_norm": 0.4800226390361786, + "learning_rate": 8.40751727410097e-06, + "loss": 0.3579, + "step": 2162 + }, + { + "epoch": 1.0058905596031622, + "grad_norm": 0.4519990086555481, + "learning_rate": 8.405536530320118e-06, + "loss": 0.3372, + "step": 2163 + }, + { + "epoch": 1.0063556037823593, + "grad_norm": 0.4979685842990875, + "learning_rate": 8.403554789112934e-06, + "loss": 0.343, + "step": 2164 + }, + { + "epoch": 1.0068206479615562, + "grad_norm": 0.4970398545265198, + "learning_rate": 8.401572051059835e-06, + "loss": 0.3701, + "step": 2165 + }, + { + "epoch": 1.0072856921407534, + "grad_norm": 0.5320008993148804, + "learning_rate": 8.399588316741535e-06, + "loss": 0.3512, + "step": 2166 + }, + { + "epoch": 1.0077507363199505, + "grad_norm": 0.4425172209739685, + "learning_rate": 8.397603586739039e-06, + "loss": 0.3165, + "step": 2167 + }, + { + "epoch": 1.0082157804991474, + "grad_norm": 0.4745187759399414, + "learning_rate": 8.395617861633637e-06, + "loss": 0.3919, + "step": 2168 + }, + { + "epoch": 1.0086808246783445, + "grad_norm": 0.5007448196411133, + "learning_rate": 8.393631142006922e-06, + "loss": 0.3564, + "step": 2169 + }, + { + "epoch": 1.0091458688575414, + "grad_norm": 0.4433797001838684, + "learning_rate": 8.391643428440766e-06, + "loss": 0.367, + "step": 2170 + }, + { + "epoch": 1.0096109130367386, + "grad_norm": 0.37245190143585205, + "learning_rate": 8.389654721517341e-06, + "loss": 0.3361, + "step": 2171 + }, + { + "epoch": 1.0100759572159355, + "grad_norm": 0.5664768815040588, + "learning_rate": 8.38766502181911e-06, + "loss": 0.382, + "step": 2172 + }, + { + "epoch": 1.0105410013951326, + "grad_norm": 0.4073793888092041, + "learning_rate": 8.385674329928819e-06, + "loss": 0.3291, + "step": 2173 + }, + { + "epoch": 1.0110060455743295, + "grad_norm": 0.509306788444519, + "learning_rate": 8.383682646429509e-06, + "loss": 0.4038, + "step": 2174 + }, + { + "epoch": 1.0114710897535266, + "grad_norm": 0.39777812361717224, + "learning_rate": 8.381689971904514e-06, + "loss": 0.3413, + "step": 2175 + }, + { + "epoch": 1.0119361339327235, + "grad_norm": 0.5179665088653564, + "learning_rate": 8.379696306937457e-06, + "loss": 0.3944, + "step": 2176 + }, + { + "epoch": 1.0124011781119207, + "grad_norm": 0.4474855661392212, + "learning_rate": 8.377701652112249e-06, + "loss": 0.3663, + "step": 2177 + }, + { + "epoch": 1.0128662222911176, + "grad_norm": 0.4197114408016205, + "learning_rate": 8.37570600801309e-06, + "loss": 0.325, + "step": 2178 + }, + { + "epoch": 1.0133312664703147, + "grad_norm": 0.48934638500213623, + "learning_rate": 8.373709375224475e-06, + "loss": 0.3751, + "step": 2179 + }, + { + "epoch": 1.0137963106495118, + "grad_norm": 0.4234052002429962, + "learning_rate": 8.371711754331181e-06, + "loss": 0.3585, + "step": 2180 + }, + { + "epoch": 1.0142613548287087, + "grad_norm": 0.511944055557251, + "learning_rate": 8.369713145918284e-06, + "loss": 0.3847, + "step": 2181 + }, + { + "epoch": 1.0147263990079058, + "grad_norm": 0.46818193793296814, + "learning_rate": 8.36771355057114e-06, + "loss": 0.38, + "step": 2182 + }, + { + "epoch": 1.0151914431871027, + "grad_norm": 0.41124582290649414, + "learning_rate": 8.365712968875399e-06, + "loss": 0.3481, + "step": 2183 + }, + { + "epoch": 1.0156564873662999, + "grad_norm": 0.42587098479270935, + "learning_rate": 8.363711401417e-06, + "loss": 0.3544, + "step": 2184 + }, + { + "epoch": 1.0161215315454968, + "grad_norm": 0.41804370284080505, + "learning_rate": 8.36170884878217e-06, + "loss": 0.3613, + "step": 2185 + }, + { + "epoch": 1.016586575724694, + "grad_norm": 0.41786617040634155, + "learning_rate": 8.359705311557421e-06, + "loss": 0.3388, + "step": 2186 + }, + { + "epoch": 1.0170516199038908, + "grad_norm": 0.4178462028503418, + "learning_rate": 8.35770079032956e-06, + "loss": 0.3509, + "step": 2187 + }, + { + "epoch": 1.017516664083088, + "grad_norm": 0.3867464065551758, + "learning_rate": 8.355695285685675e-06, + "loss": 0.3443, + "step": 2188 + }, + { + "epoch": 1.0179817082622848, + "grad_norm": 0.5491018891334534, + "learning_rate": 8.35368879821315e-06, + "loss": 0.3935, + "step": 2189 + }, + { + "epoch": 1.018446752441482, + "grad_norm": 0.4546567499637604, + "learning_rate": 8.35168132849965e-06, + "loss": 0.3451, + "step": 2190 + }, + { + "epoch": 1.0189117966206789, + "grad_norm": 0.426011323928833, + "learning_rate": 8.349672877133131e-06, + "loss": 0.315, + "step": 2191 + }, + { + "epoch": 1.019376840799876, + "grad_norm": 0.43346622586250305, + "learning_rate": 8.347663444701835e-06, + "loss": 0.3507, + "step": 2192 + }, + { + "epoch": 1.019841884979073, + "grad_norm": 0.4720572233200073, + "learning_rate": 8.345653031794292e-06, + "loss": 0.3445, + "step": 2193 + }, + { + "epoch": 1.02030692915827, + "grad_norm": 0.4907239079475403, + "learning_rate": 8.34364163899932e-06, + "loss": 0.336, + "step": 2194 + }, + { + "epoch": 1.0207719733374672, + "grad_norm": 0.47017717361450195, + "learning_rate": 8.341629266906024e-06, + "loss": 0.4029, + "step": 2195 + }, + { + "epoch": 1.021237017516664, + "grad_norm": 0.4325951933860779, + "learning_rate": 8.339615916103795e-06, + "loss": 0.3129, + "step": 2196 + }, + { + "epoch": 1.0217020616958612, + "grad_norm": 0.4916847050189972, + "learning_rate": 8.33760158718231e-06, + "loss": 0.3647, + "step": 2197 + }, + { + "epoch": 1.022167105875058, + "grad_norm": 0.43246665596961975, + "learning_rate": 8.335586280731532e-06, + "loss": 0.3216, + "step": 2198 + }, + { + "epoch": 1.0226321500542552, + "grad_norm": 0.5200701355934143, + "learning_rate": 8.333569997341713e-06, + "loss": 0.355, + "step": 2199 + }, + { + "epoch": 1.0230971942334521, + "grad_norm": 0.4282546639442444, + "learning_rate": 8.33155273760339e-06, + "loss": 0.3829, + "step": 2200 + }, + { + "epoch": 1.0235622384126493, + "grad_norm": 0.42160195112228394, + "learning_rate": 8.329534502107386e-06, + "loss": 0.3202, + "step": 2201 + }, + { + "epoch": 1.0240272825918462, + "grad_norm": 0.42160677909851074, + "learning_rate": 8.327515291444807e-06, + "loss": 0.3368, + "step": 2202 + }, + { + "epoch": 1.0244923267710433, + "grad_norm": 0.4635396897792816, + "learning_rate": 8.325495106207049e-06, + "loss": 0.3864, + "step": 2203 + }, + { + "epoch": 1.0249573709502402, + "grad_norm": 0.4313600957393646, + "learning_rate": 8.32347394698579e-06, + "loss": 0.3538, + "step": 2204 + }, + { + "epoch": 1.0254224151294373, + "grad_norm": 0.4247414767742157, + "learning_rate": 8.321451814372998e-06, + "loss": 0.3499, + "step": 2205 + }, + { + "epoch": 1.0258874593086342, + "grad_norm": 0.5032163262367249, + "learning_rate": 8.319428708960917e-06, + "loss": 0.3768, + "step": 2206 + }, + { + "epoch": 1.0263525034878314, + "grad_norm": 0.44325828552246094, + "learning_rate": 8.317404631342088e-06, + "loss": 0.3473, + "step": 2207 + }, + { + "epoch": 1.0268175476670283, + "grad_norm": 0.3861435353755951, + "learning_rate": 8.315379582109326e-06, + "loss": 0.3206, + "step": 2208 + }, + { + "epoch": 1.0272825918462254, + "grad_norm": 0.4646325707435608, + "learning_rate": 8.313353561855737e-06, + "loss": 0.3673, + "step": 2209 + }, + { + "epoch": 1.0277476360254225, + "grad_norm": 0.4774647653102875, + "learning_rate": 8.31132657117471e-06, + "loss": 0.3534, + "step": 2210 + }, + { + "epoch": 1.0282126802046194, + "grad_norm": 0.4477936029434204, + "learning_rate": 8.309298610659917e-06, + "loss": 0.3531, + "step": 2211 + }, + { + "epoch": 1.0286777243838165, + "grad_norm": 0.5256711840629578, + "learning_rate": 8.307269680905312e-06, + "loss": 0.3825, + "step": 2212 + }, + { + "epoch": 1.0291427685630135, + "grad_norm": 0.43662089109420776, + "learning_rate": 8.305239782505142e-06, + "loss": 0.3622, + "step": 2213 + }, + { + "epoch": 1.0296078127422106, + "grad_norm": 0.4843900501728058, + "learning_rate": 8.303208916053924e-06, + "loss": 0.3555, + "step": 2214 + }, + { + "epoch": 1.0300728569214075, + "grad_norm": 0.5311547517776489, + "learning_rate": 8.30117708214647e-06, + "loss": 0.3647, + "step": 2215 + }, + { + "epoch": 1.0305379011006046, + "grad_norm": 0.4405294954776764, + "learning_rate": 8.299144281377869e-06, + "loss": 0.308, + "step": 2216 + }, + { + "epoch": 1.0310029452798015, + "grad_norm": 0.5497903823852539, + "learning_rate": 8.297110514343498e-06, + "loss": 0.3571, + "step": 2217 + }, + { + "epoch": 1.0314679894589986, + "grad_norm": 0.44999319314956665, + "learning_rate": 8.295075781639013e-06, + "loss": 0.3713, + "step": 2218 + }, + { + "epoch": 1.0319330336381956, + "grad_norm": 0.5399612188339233, + "learning_rate": 8.293040083860352e-06, + "loss": 0.3285, + "step": 2219 + }, + { + "epoch": 1.0323980778173927, + "grad_norm": 0.5000502467155457, + "learning_rate": 8.29100342160374e-06, + "loss": 0.351, + "step": 2220 + }, + { + "epoch": 1.0328631219965896, + "grad_norm": 0.5676979422569275, + "learning_rate": 8.288965795465684e-06, + "loss": 0.3698, + "step": 2221 + }, + { + "epoch": 1.0333281661757867, + "grad_norm": 0.5297927260398865, + "learning_rate": 8.28692720604297e-06, + "loss": 0.3449, + "step": 2222 + }, + { + "epoch": 1.0337932103549836, + "grad_norm": 0.5518194437026978, + "learning_rate": 8.284887653932665e-06, + "loss": 0.3581, + "step": 2223 + }, + { + "epoch": 1.0342582545341807, + "grad_norm": 0.49941563606262207, + "learning_rate": 8.282847139732125e-06, + "loss": 0.3357, + "step": 2224 + }, + { + "epoch": 1.0347232987133779, + "grad_norm": 0.5394763946533203, + "learning_rate": 8.28080566403898e-06, + "loss": 0.3671, + "step": 2225 + }, + { + "epoch": 1.0351883428925748, + "grad_norm": 0.5620566010475159, + "learning_rate": 8.278763227451148e-06, + "loss": 0.3658, + "step": 2226 + }, + { + "epoch": 1.035653387071772, + "grad_norm": 0.5896021723747253, + "learning_rate": 8.276719830566823e-06, + "loss": 0.3925, + "step": 2227 + }, + { + "epoch": 1.0361184312509688, + "grad_norm": 0.5375012159347534, + "learning_rate": 8.274675473984486e-06, + "loss": 0.3573, + "step": 2228 + }, + { + "epoch": 1.036583475430166, + "grad_norm": 0.48443183302879333, + "learning_rate": 8.272630158302892e-06, + "loss": 0.3385, + "step": 2229 + }, + { + "epoch": 1.0370485196093628, + "grad_norm": 0.5011869072914124, + "learning_rate": 8.270583884121083e-06, + "loss": 0.356, + "step": 2230 + }, + { + "epoch": 1.03751356378856, + "grad_norm": 0.5027603507041931, + "learning_rate": 8.268536652038379e-06, + "loss": 0.3276, + "step": 2231 + }, + { + "epoch": 1.0379786079677569, + "grad_norm": 0.593014657497406, + "learning_rate": 8.266488462654381e-06, + "loss": 0.3687, + "step": 2232 + }, + { + "epoch": 1.038443652146954, + "grad_norm": 0.6836045384407043, + "learning_rate": 8.264439316568969e-06, + "loss": 0.3779, + "step": 2233 + }, + { + "epoch": 1.038908696326151, + "grad_norm": 0.47743064165115356, + "learning_rate": 8.262389214382307e-06, + "loss": 0.3386, + "step": 2234 + }, + { + "epoch": 1.039373740505348, + "grad_norm": 0.7030071020126343, + "learning_rate": 8.260338156694836e-06, + "loss": 0.3694, + "step": 2235 + }, + { + "epoch": 1.039838784684545, + "grad_norm": 0.468199223279953, + "learning_rate": 8.258286144107277e-06, + "loss": 0.3337, + "step": 2236 + }, + { + "epoch": 1.040303828863742, + "grad_norm": 0.43109989166259766, + "learning_rate": 8.256233177220632e-06, + "loss": 0.3538, + "step": 2237 + }, + { + "epoch": 1.0407688730429392, + "grad_norm": 0.5127283930778503, + "learning_rate": 8.25417925663618e-06, + "loss": 0.3447, + "step": 2238 + }, + { + "epoch": 1.041233917222136, + "grad_norm": 0.5324409604072571, + "learning_rate": 8.25212438295548e-06, + "loss": 0.3476, + "step": 2239 + }, + { + "epoch": 1.0416989614013332, + "grad_norm": 0.4684849679470062, + "learning_rate": 8.250068556780376e-06, + "loss": 0.348, + "step": 2240 + }, + { + "epoch": 1.0421640055805301, + "grad_norm": 0.5129745006561279, + "learning_rate": 8.24801177871298e-06, + "loss": 0.3528, + "step": 2241 + }, + { + "epoch": 1.0426290497597273, + "grad_norm": 0.5319026112556458, + "learning_rate": 8.245954049355696e-06, + "loss": 0.3404, + "step": 2242 + }, + { + "epoch": 1.0430940939389242, + "grad_norm": 0.4605168104171753, + "learning_rate": 8.243895369311192e-06, + "loss": 0.3319, + "step": 2243 + }, + { + "epoch": 1.0435591381181213, + "grad_norm": 0.4982357621192932, + "learning_rate": 8.241835739182426e-06, + "loss": 0.3557, + "step": 2244 + }, + { + "epoch": 1.0440241822973182, + "grad_norm": 0.5314902067184448, + "learning_rate": 8.239775159572632e-06, + "loss": 0.352, + "step": 2245 + }, + { + "epoch": 1.0444892264765153, + "grad_norm": 0.4709586799144745, + "learning_rate": 8.237713631085316e-06, + "loss": 0.3741, + "step": 2246 + }, + { + "epoch": 1.0449542706557122, + "grad_norm": 0.4382578134536743, + "learning_rate": 8.235651154324269e-06, + "loss": 0.3414, + "step": 2247 + }, + { + "epoch": 1.0454193148349094, + "grad_norm": 0.43601369857788086, + "learning_rate": 8.233587729893555e-06, + "loss": 0.3409, + "step": 2248 + }, + { + "epoch": 1.0458843590141063, + "grad_norm": 0.4836721122264862, + "learning_rate": 8.23152335839752e-06, + "loss": 0.3506, + "step": 2249 + }, + { + "epoch": 1.0463494031933034, + "grad_norm": 0.5360286235809326, + "learning_rate": 8.229458040440783e-06, + "loss": 0.3817, + "step": 2250 + }, + { + "epoch": 1.0468144473725003, + "grad_norm": 0.4707900583744049, + "learning_rate": 8.227391776628242e-06, + "loss": 0.3439, + "step": 2251 + }, + { + "epoch": 1.0472794915516974, + "grad_norm": 0.48144766688346863, + "learning_rate": 8.225324567565071e-06, + "loss": 0.3562, + "step": 2252 + }, + { + "epoch": 1.0477445357308945, + "grad_norm": 0.5262990593910217, + "learning_rate": 8.223256413856726e-06, + "loss": 0.365, + "step": 2253 + }, + { + "epoch": 1.0482095799100914, + "grad_norm": 0.5710676312446594, + "learning_rate": 8.221187316108935e-06, + "loss": 0.3493, + "step": 2254 + }, + { + "epoch": 1.0486746240892886, + "grad_norm": 0.44150006771087646, + "learning_rate": 8.219117274927696e-06, + "loss": 0.3598, + "step": 2255 + }, + { + "epoch": 1.0491396682684855, + "grad_norm": 0.43997588753700256, + "learning_rate": 8.2170462909193e-06, + "loss": 0.307, + "step": 2256 + }, + { + "epoch": 1.0496047124476826, + "grad_norm": 0.582988440990448, + "learning_rate": 8.2149743646903e-06, + "loss": 0.3721, + "step": 2257 + }, + { + "epoch": 1.0500697566268795, + "grad_norm": 0.4914950728416443, + "learning_rate": 8.212901496847528e-06, + "loss": 0.3651, + "step": 2258 + }, + { + "epoch": 1.0505348008060766, + "grad_norm": 0.4819602966308594, + "learning_rate": 8.210827687998098e-06, + "loss": 0.4051, + "step": 2259 + }, + { + "epoch": 1.0509998449852735, + "grad_norm": 0.47203314304351807, + "learning_rate": 8.208752938749389e-06, + "loss": 0.3389, + "step": 2260 + }, + { + "epoch": 1.0514648891644707, + "grad_norm": 0.5789709687232971, + "learning_rate": 8.206677249709066e-06, + "loss": 0.3793, + "step": 2261 + }, + { + "epoch": 1.0519299333436676, + "grad_norm": 0.5130416750907898, + "learning_rate": 8.204600621485064e-06, + "loss": 0.3352, + "step": 2262 + }, + { + "epoch": 1.0523949775228647, + "grad_norm": 0.4451179802417755, + "learning_rate": 8.202523054685592e-06, + "loss": 0.3723, + "step": 2263 + }, + { + "epoch": 1.0528600217020616, + "grad_norm": 0.5486533641815186, + "learning_rate": 8.200444549919135e-06, + "loss": 0.3537, + "step": 2264 + }, + { + "epoch": 1.0533250658812587, + "grad_norm": 0.4916934072971344, + "learning_rate": 8.198365107794457e-06, + "loss": 0.3705, + "step": 2265 + }, + { + "epoch": 1.0537901100604556, + "grad_norm": 0.46391212940216064, + "learning_rate": 8.196284728920589e-06, + "loss": 0.3469, + "step": 2266 + }, + { + "epoch": 1.0542551542396528, + "grad_norm": 0.5530695915222168, + "learning_rate": 8.194203413906843e-06, + "loss": 0.3614, + "step": 2267 + }, + { + "epoch": 1.05472019841885, + "grad_norm": 0.4474685490131378, + "learning_rate": 8.1921211633628e-06, + "loss": 0.3722, + "step": 2268 + }, + { + "epoch": 1.0551852425980468, + "grad_norm": 0.4766155779361725, + "learning_rate": 8.190037977898319e-06, + "loss": 0.3416, + "step": 2269 + }, + { + "epoch": 1.055650286777244, + "grad_norm": 0.541581392288208, + "learning_rate": 8.187953858123529e-06, + "loss": 0.3799, + "step": 2270 + }, + { + "epoch": 1.0561153309564408, + "grad_norm": 0.5254855751991272, + "learning_rate": 8.185868804648838e-06, + "loss": 0.3685, + "step": 2271 + }, + { + "epoch": 1.056580375135638, + "grad_norm": 0.4539804458618164, + "learning_rate": 8.183782818084922e-06, + "loss": 0.3398, + "step": 2272 + }, + { + "epoch": 1.0570454193148349, + "grad_norm": 0.4451497197151184, + "learning_rate": 8.181695899042733e-06, + "loss": 0.35, + "step": 2273 + }, + { + "epoch": 1.057510463494032, + "grad_norm": 0.4841856360435486, + "learning_rate": 8.179608048133497e-06, + "loss": 0.3849, + "step": 2274 + }, + { + "epoch": 1.057975507673229, + "grad_norm": 0.3748435080051422, + "learning_rate": 8.17751926596871e-06, + "loss": 0.3153, + "step": 2275 + }, + { + "epoch": 1.058440551852426, + "grad_norm": 0.4471512734889984, + "learning_rate": 8.175429553160142e-06, + "loss": 0.3684, + "step": 2276 + }, + { + "epoch": 1.058905596031623, + "grad_norm": 0.47942283749580383, + "learning_rate": 8.17333891031984e-06, + "loss": 0.3909, + "step": 2277 + }, + { + "epoch": 1.05937064021082, + "grad_norm": 0.3868976831436157, + "learning_rate": 8.171247338060113e-06, + "loss": 0.3071, + "step": 2278 + }, + { + "epoch": 1.059835684390017, + "grad_norm": 0.42615851759910583, + "learning_rate": 8.16915483699355e-06, + "loss": 0.375, + "step": 2279 + }, + { + "epoch": 1.060300728569214, + "grad_norm": 0.42659878730773926, + "learning_rate": 8.167061407733018e-06, + "loss": 0.3678, + "step": 2280 + }, + { + "epoch": 1.060765772748411, + "grad_norm": 0.4303070306777954, + "learning_rate": 8.164967050891639e-06, + "loss": 0.3455, + "step": 2281 + }, + { + "epoch": 1.0612308169276081, + "grad_norm": 0.4534488618373871, + "learning_rate": 8.16287176708282e-06, + "loss": 0.3615, + "step": 2282 + }, + { + "epoch": 1.0616958611068052, + "grad_norm": 0.5386179685592651, + "learning_rate": 8.160775556920236e-06, + "loss": 0.3719, + "step": 2283 + }, + { + "epoch": 1.0621609052860022, + "grad_norm": 0.48542487621307373, + "learning_rate": 8.158678421017833e-06, + "loss": 0.34, + "step": 2284 + }, + { + "epoch": 1.0626259494651993, + "grad_norm": 0.43062224984169006, + "learning_rate": 8.156580359989827e-06, + "loss": 0.3546, + "step": 2285 + }, + { + "epoch": 1.0630909936443962, + "grad_norm": 0.5932109355926514, + "learning_rate": 8.154481374450707e-06, + "loss": 0.3409, + "step": 2286 + }, + { + "epoch": 1.0635560378235933, + "grad_norm": 0.5103998184204102, + "learning_rate": 8.15238146501523e-06, + "loss": 0.3665, + "step": 2287 + }, + { + "epoch": 1.0640210820027902, + "grad_norm": 0.41876837611198425, + "learning_rate": 8.150280632298426e-06, + "loss": 0.313, + "step": 2288 + }, + { + "epoch": 1.0644861261819873, + "grad_norm": 0.5669198036193848, + "learning_rate": 8.148178876915598e-06, + "loss": 0.3474, + "step": 2289 + }, + { + "epoch": 1.0649511703611843, + "grad_norm": 0.5304418206214905, + "learning_rate": 8.14607619948231e-06, + "loss": 0.3581, + "step": 2290 + }, + { + "epoch": 1.0654162145403814, + "grad_norm": 0.48082804679870605, + "learning_rate": 8.143972600614407e-06, + "loss": 0.3727, + "step": 2291 + }, + { + "epoch": 1.0658812587195783, + "grad_norm": 0.5468645691871643, + "learning_rate": 8.141868080927998e-06, + "loss": 0.3748, + "step": 2292 + }, + { + "epoch": 1.0663463028987754, + "grad_norm": 0.46219947934150696, + "learning_rate": 8.13976264103946e-06, + "loss": 0.392, + "step": 2293 + }, + { + "epoch": 1.0668113470779723, + "grad_norm": 0.508678674697876, + "learning_rate": 8.137656281565445e-06, + "loss": 0.345, + "step": 2294 + }, + { + "epoch": 1.0672763912571694, + "grad_norm": 0.5073127746582031, + "learning_rate": 8.135549003122871e-06, + "loss": 0.3621, + "step": 2295 + }, + { + "epoch": 1.0677414354363663, + "grad_norm": 0.4401457905769348, + "learning_rate": 8.133440806328925e-06, + "loss": 0.3351, + "step": 2296 + }, + { + "epoch": 1.0682064796155635, + "grad_norm": 0.4791904091835022, + "learning_rate": 8.131331691801066e-06, + "loss": 0.3336, + "step": 2297 + }, + { + "epoch": 1.0686715237947606, + "grad_norm": 0.4278305470943451, + "learning_rate": 8.129221660157014e-06, + "loss": 0.3651, + "step": 2298 + }, + { + "epoch": 1.0691365679739575, + "grad_norm": 0.47636422514915466, + "learning_rate": 8.127110712014767e-06, + "loss": 0.3022, + "step": 2299 + }, + { + "epoch": 1.0696016121531546, + "grad_norm": 0.4815133512020111, + "learning_rate": 8.124998847992587e-06, + "loss": 0.335, + "step": 2300 + }, + { + "epoch": 1.0700666563323515, + "grad_norm": 0.4794626235961914, + "learning_rate": 8.122886068709003e-06, + "loss": 0.374, + "step": 2301 + }, + { + "epoch": 1.0705317005115487, + "grad_norm": 0.4852800667285919, + "learning_rate": 8.120772374782818e-06, + "loss": 0.3387, + "step": 2302 + }, + { + "epoch": 1.0709967446907456, + "grad_norm": 0.4669995903968811, + "learning_rate": 8.118657766833093e-06, + "loss": 0.332, + "step": 2303 + }, + { + "epoch": 1.0714617888699427, + "grad_norm": 0.4429568946361542, + "learning_rate": 8.116542245479165e-06, + "loss": 0.3507, + "step": 2304 + }, + { + "epoch": 1.0719268330491396, + "grad_norm": 0.49815958738327026, + "learning_rate": 8.114425811340635e-06, + "loss": 0.3344, + "step": 2305 + }, + { + "epoch": 1.0723918772283367, + "grad_norm": 0.41795018315315247, + "learning_rate": 8.112308465037375e-06, + "loss": 0.3721, + "step": 2306 + }, + { + "epoch": 1.0728569214075336, + "grad_norm": 0.5189369916915894, + "learning_rate": 8.110190207189519e-06, + "loss": 0.3475, + "step": 2307 + }, + { + "epoch": 1.0733219655867308, + "grad_norm": 0.5295676589012146, + "learning_rate": 8.108071038417471e-06, + "loss": 0.378, + "step": 2308 + }, + { + "epoch": 1.0737870097659277, + "grad_norm": 0.4301398694515228, + "learning_rate": 8.1059509593419e-06, + "loss": 0.3511, + "step": 2309 + }, + { + "epoch": 1.0742520539451248, + "grad_norm": 0.47263363003730774, + "learning_rate": 8.103829970583742e-06, + "loss": 0.3233, + "step": 2310 + }, + { + "epoch": 1.0747170981243217, + "grad_norm": 0.519551157951355, + "learning_rate": 8.101708072764204e-06, + "loss": 0.3633, + "step": 2311 + }, + { + "epoch": 1.0751821423035188, + "grad_norm": 0.40797778964042664, + "learning_rate": 8.099585266504753e-06, + "loss": 0.3504, + "step": 2312 + }, + { + "epoch": 1.075647186482716, + "grad_norm": 0.4842034578323364, + "learning_rate": 8.097461552427123e-06, + "loss": 0.3747, + "step": 2313 + }, + { + "epoch": 1.0761122306619129, + "grad_norm": 0.5274269580841064, + "learning_rate": 8.095336931153318e-06, + "loss": 0.3811, + "step": 2314 + }, + { + "epoch": 1.07657727484111, + "grad_norm": 0.4279079735279083, + "learning_rate": 8.093211403305603e-06, + "loss": 0.339, + "step": 2315 + }, + { + "epoch": 1.077042319020307, + "grad_norm": 0.4614260792732239, + "learning_rate": 8.09108496950651e-06, + "loss": 0.3576, + "step": 2316 + }, + { + "epoch": 1.077507363199504, + "grad_norm": 0.5641934275627136, + "learning_rate": 8.088957630378842e-06, + "loss": 0.353, + "step": 2317 + }, + { + "epoch": 1.077972407378701, + "grad_norm": 0.5266045928001404, + "learning_rate": 8.086829386545655e-06, + "loss": 0.3438, + "step": 2318 + }, + { + "epoch": 1.078437451557898, + "grad_norm": 0.39795106649398804, + "learning_rate": 8.084700238630283e-06, + "loss": 0.2865, + "step": 2319 + }, + { + "epoch": 1.078902495737095, + "grad_norm": 0.47763675451278687, + "learning_rate": 8.082570187256315e-06, + "loss": 0.3863, + "step": 2320 + }, + { + "epoch": 1.079367539916292, + "grad_norm": 0.43691733479499817, + "learning_rate": 8.080439233047612e-06, + "loss": 0.3406, + "step": 2321 + }, + { + "epoch": 1.079832584095489, + "grad_norm": 0.42221641540527344, + "learning_rate": 8.078307376628292e-06, + "loss": 0.3743, + "step": 2322 + }, + { + "epoch": 1.0802976282746861, + "grad_norm": 0.4146184027194977, + "learning_rate": 8.076174618622744e-06, + "loss": 0.3584, + "step": 2323 + }, + { + "epoch": 1.080762672453883, + "grad_norm": 0.4470728039741516, + "learning_rate": 8.074040959655616e-06, + "loss": 0.379, + "step": 2324 + }, + { + "epoch": 1.0812277166330801, + "grad_norm": 0.512160062789917, + "learning_rate": 8.071906400351823e-06, + "loss": 0.3425, + "step": 2325 + }, + { + "epoch": 1.081692760812277, + "grad_norm": 0.39240655303001404, + "learning_rate": 8.069770941336542e-06, + "loss": 0.35, + "step": 2326 + }, + { + "epoch": 1.0821578049914742, + "grad_norm": 0.4476917088031769, + "learning_rate": 8.067634583235215e-06, + "loss": 0.3455, + "step": 2327 + }, + { + "epoch": 1.0826228491706713, + "grad_norm": 0.5166687965393066, + "learning_rate": 8.065497326673548e-06, + "loss": 0.4265, + "step": 2328 + }, + { + "epoch": 1.0830878933498682, + "grad_norm": 0.4078577160835266, + "learning_rate": 8.063359172277507e-06, + "loss": 0.3306, + "step": 2329 + }, + { + "epoch": 1.0835529375290653, + "grad_norm": 0.4202713072299957, + "learning_rate": 8.061220120673323e-06, + "loss": 0.3573, + "step": 2330 + }, + { + "epoch": 1.0840179817082622, + "grad_norm": 0.4990968108177185, + "learning_rate": 8.05908017248749e-06, + "loss": 0.3774, + "step": 2331 + }, + { + "epoch": 1.0844830258874594, + "grad_norm": 0.4540191888809204, + "learning_rate": 8.056939328346763e-06, + "loss": 0.3521, + "step": 2332 + }, + { + "epoch": 1.0849480700666563, + "grad_norm": 0.3652085065841675, + "learning_rate": 8.05479758887816e-06, + "loss": 0.2742, + "step": 2333 + }, + { + "epoch": 1.0854131142458534, + "grad_norm": 0.5675728917121887, + "learning_rate": 8.052654954708966e-06, + "loss": 0.3901, + "step": 2334 + }, + { + "epoch": 1.0858781584250503, + "grad_norm": 0.46835145354270935, + "learning_rate": 8.050511426466717e-06, + "loss": 0.3319, + "step": 2335 + }, + { + "epoch": 1.0863432026042474, + "grad_norm": 0.4239199459552765, + "learning_rate": 8.048367004779223e-06, + "loss": 0.3336, + "step": 2336 + }, + { + "epoch": 1.0868082467834443, + "grad_norm": 0.5129764080047607, + "learning_rate": 8.046221690274547e-06, + "loss": 0.3662, + "step": 2337 + }, + { + "epoch": 1.0872732909626415, + "grad_norm": 0.46359536051750183, + "learning_rate": 8.04407548358102e-06, + "loss": 0.3276, + "step": 2338 + }, + { + "epoch": 1.0877383351418384, + "grad_norm": 0.4142586290836334, + "learning_rate": 8.041928385327229e-06, + "loss": 0.3574, + "step": 2339 + }, + { + "epoch": 1.0882033793210355, + "grad_norm": 0.4258030951023102, + "learning_rate": 8.039780396142023e-06, + "loss": 0.334, + "step": 2340 + }, + { + "epoch": 1.0886684235002324, + "grad_norm": 0.43229323625564575, + "learning_rate": 8.037631516654516e-06, + "loss": 0.3329, + "step": 2341 + }, + { + "epoch": 1.0891334676794295, + "grad_norm": 0.4798569977283478, + "learning_rate": 8.035481747494078e-06, + "loss": 0.3859, + "step": 2342 + }, + { + "epoch": 1.0895985118586267, + "grad_norm": 0.4725603759288788, + "learning_rate": 8.033331089290342e-06, + "loss": 0.3613, + "step": 2343 + }, + { + "epoch": 1.0900635560378236, + "grad_norm": 0.4565368890762329, + "learning_rate": 8.0311795426732e-06, + "loss": 0.3214, + "step": 2344 + }, + { + "epoch": 1.0905286002170207, + "grad_norm": 0.436138778924942, + "learning_rate": 8.029027108272806e-06, + "loss": 0.3499, + "step": 2345 + }, + { + "epoch": 1.0909936443962176, + "grad_norm": 0.4970560371875763, + "learning_rate": 8.026873786719574e-06, + "loss": 0.3763, + "step": 2346 + }, + { + "epoch": 1.0914586885754147, + "grad_norm": 0.47304481267929077, + "learning_rate": 8.024719578644176e-06, + "loss": 0.3628, + "step": 2347 + }, + { + "epoch": 1.0919237327546116, + "grad_norm": 0.4886552393436432, + "learning_rate": 8.022564484677545e-06, + "loss": 0.3415, + "step": 2348 + }, + { + "epoch": 1.0923887769338088, + "grad_norm": 0.4249359965324402, + "learning_rate": 8.020408505450869e-06, + "loss": 0.3737, + "step": 2349 + }, + { + "epoch": 1.0928538211130057, + "grad_norm": 0.43461641669273376, + "learning_rate": 8.018251641595604e-06, + "loss": 0.3466, + "step": 2350 + }, + { + "epoch": 1.0933188652922028, + "grad_norm": 0.49044400453567505, + "learning_rate": 8.016093893743462e-06, + "loss": 0.3437, + "step": 2351 + }, + { + "epoch": 1.0937839094713997, + "grad_norm": 0.5193617343902588, + "learning_rate": 8.013935262526407e-06, + "loss": 0.4029, + "step": 2352 + }, + { + "epoch": 1.0942489536505968, + "grad_norm": 0.4232480823993683, + "learning_rate": 8.01177574857667e-06, + "loss": 0.3253, + "step": 2353 + }, + { + "epoch": 1.0947139978297937, + "grad_norm": 0.5477461814880371, + "learning_rate": 8.009615352526737e-06, + "loss": 0.3584, + "step": 2354 + }, + { + "epoch": 1.0951790420089909, + "grad_norm": 0.4962575137615204, + "learning_rate": 8.007454075009352e-06, + "loss": 0.387, + "step": 2355 + }, + { + "epoch": 1.0956440861881878, + "grad_norm": 0.4006698727607727, + "learning_rate": 8.00529191665752e-06, + "loss": 0.3545, + "step": 2356 + }, + { + "epoch": 1.0961091303673849, + "grad_norm": 0.44313761591911316, + "learning_rate": 8.0031288781045e-06, + "loss": 0.3291, + "step": 2357 + }, + { + "epoch": 1.096574174546582, + "grad_norm": 0.525131344795227, + "learning_rate": 8.000964959983815e-06, + "loss": 0.3665, + "step": 2358 + }, + { + "epoch": 1.097039218725779, + "grad_norm": 0.44491708278656006, + "learning_rate": 7.998800162929236e-06, + "loss": 0.3512, + "step": 2359 + }, + { + "epoch": 1.097504262904976, + "grad_norm": 0.5135394334793091, + "learning_rate": 7.9966344875748e-06, + "loss": 0.3638, + "step": 2360 + }, + { + "epoch": 1.097969307084173, + "grad_norm": 0.5230858325958252, + "learning_rate": 7.994467934554794e-06, + "loss": 0.3372, + "step": 2361 + }, + { + "epoch": 1.09843435126337, + "grad_norm": 0.5185748934745789, + "learning_rate": 7.992300504503774e-06, + "loss": 0.395, + "step": 2362 + }, + { + "epoch": 1.098899395442567, + "grad_norm": 0.440336138010025, + "learning_rate": 7.990132198056538e-06, + "loss": 0.3397, + "step": 2363 + }, + { + "epoch": 1.0993644396217641, + "grad_norm": 0.539335310459137, + "learning_rate": 7.987963015848152e-06, + "loss": 0.3714, + "step": 2364 + }, + { + "epoch": 1.099829483800961, + "grad_norm": 0.4946892559528351, + "learning_rate": 7.985792958513932e-06, + "loss": 0.3258, + "step": 2365 + }, + { + "epoch": 1.1002945279801581, + "grad_norm": 0.4519208073616028, + "learning_rate": 7.983622026689452e-06, + "loss": 0.3841, + "step": 2366 + }, + { + "epoch": 1.100759572159355, + "grad_norm": 0.4441494345664978, + "learning_rate": 7.981450221010547e-06, + "loss": 0.3603, + "step": 2367 + }, + { + "epoch": 1.1012246163385522, + "grad_norm": 0.545825183391571, + "learning_rate": 7.979277542113297e-06, + "loss": 0.3385, + "step": 2368 + }, + { + "epoch": 1.1016896605177493, + "grad_norm": 0.46227794885635376, + "learning_rate": 7.97710399063405e-06, + "loss": 0.3456, + "step": 2369 + }, + { + "epoch": 1.1021547046969462, + "grad_norm": 0.405261754989624, + "learning_rate": 7.974929567209399e-06, + "loss": 0.3543, + "step": 2370 + }, + { + "epoch": 1.1026197488761433, + "grad_norm": 0.5703674554824829, + "learning_rate": 7.972754272476203e-06, + "loss": 0.3407, + "step": 2371 + }, + { + "epoch": 1.1030847930553402, + "grad_norm": 0.5644888877868652, + "learning_rate": 7.970578107071566e-06, + "loss": 0.4069, + "step": 2372 + }, + { + "epoch": 1.1035498372345374, + "grad_norm": 0.391647070646286, + "learning_rate": 7.968401071632854e-06, + "loss": 0.3582, + "step": 2373 + }, + { + "epoch": 1.1040148814137343, + "grad_norm": 0.6647116541862488, + "learning_rate": 7.966223166797684e-06, + "loss": 0.3614, + "step": 2374 + }, + { + "epoch": 1.1044799255929314, + "grad_norm": 0.4719175696372986, + "learning_rate": 7.964044393203928e-06, + "loss": 0.3655, + "step": 2375 + }, + { + "epoch": 1.1049449697721283, + "grad_norm": 0.43519163131713867, + "learning_rate": 7.961864751489717e-06, + "loss": 0.3422, + "step": 2376 + }, + { + "epoch": 1.1054100139513254, + "grad_norm": 0.5003102421760559, + "learning_rate": 7.959684242293428e-06, + "loss": 0.326, + "step": 2377 + }, + { + "epoch": 1.1058750581305223, + "grad_norm": 0.5152011513710022, + "learning_rate": 7.957502866253699e-06, + "loss": 0.313, + "step": 2378 + }, + { + "epoch": 1.1063401023097195, + "grad_norm": 0.46767157316207886, + "learning_rate": 7.955320624009421e-06, + "loss": 0.374, + "step": 2379 + }, + { + "epoch": 1.1068051464889164, + "grad_norm": 0.4796226918697357, + "learning_rate": 7.953137516199737e-06, + "loss": 0.394, + "step": 2380 + }, + { + "epoch": 1.1072701906681135, + "grad_norm": 0.47181200981140137, + "learning_rate": 7.950953543464039e-06, + "loss": 0.3253, + "step": 2381 + }, + { + "epoch": 1.1077352348473104, + "grad_norm": 0.48946288228034973, + "learning_rate": 7.948768706441985e-06, + "loss": 0.3536, + "step": 2382 + }, + { + "epoch": 1.1082002790265075, + "grad_norm": 0.43955692648887634, + "learning_rate": 7.946583005773471e-06, + "loss": 0.3505, + "step": 2383 + }, + { + "epoch": 1.1086653232057047, + "grad_norm": 0.5506858825683594, + "learning_rate": 7.944396442098659e-06, + "loss": 0.3846, + "step": 2384 + }, + { + "epoch": 1.1091303673849016, + "grad_norm": 0.38817930221557617, + "learning_rate": 7.942209016057954e-06, + "loss": 0.3297, + "step": 2385 + }, + { + "epoch": 1.1095954115640987, + "grad_norm": 0.4331190288066864, + "learning_rate": 7.94002072829202e-06, + "loss": 0.3632, + "step": 2386 + }, + { + "epoch": 1.1100604557432956, + "grad_norm": 0.4732077419757843, + "learning_rate": 7.937831579441768e-06, + "loss": 0.3593, + "step": 2387 + }, + { + "epoch": 1.1105254999224927, + "grad_norm": 0.3843960464000702, + "learning_rate": 7.935641570148368e-06, + "loss": 0.3042, + "step": 2388 + }, + { + "epoch": 1.1109905441016896, + "grad_norm": 0.43816637992858887, + "learning_rate": 7.933450701053235e-06, + "loss": 0.3798, + "step": 2389 + }, + { + "epoch": 1.1114555882808868, + "grad_norm": 0.5748831033706665, + "learning_rate": 7.931258972798041e-06, + "loss": 0.3534, + "step": 2390 + }, + { + "epoch": 1.1119206324600837, + "grad_norm": 0.48593536019325256, + "learning_rate": 7.929066386024707e-06, + "loss": 0.3994, + "step": 2391 + }, + { + "epoch": 1.1123856766392808, + "grad_norm": 0.5419626832008362, + "learning_rate": 7.926872941375404e-06, + "loss": 0.331, + "step": 2392 + }, + { + "epoch": 1.1128507208184777, + "grad_norm": 0.5766515731811523, + "learning_rate": 7.924678639492559e-06, + "loss": 0.3805, + "step": 2393 + }, + { + "epoch": 1.1133157649976748, + "grad_norm": 0.4355737268924713, + "learning_rate": 7.922483481018848e-06, + "loss": 0.3525, + "step": 2394 + }, + { + "epoch": 1.1137808091768717, + "grad_norm": 0.5494896173477173, + "learning_rate": 7.920287466597193e-06, + "loss": 0.3321, + "step": 2395 + }, + { + "epoch": 1.1142458533560688, + "grad_norm": 0.5438748598098755, + "learning_rate": 7.918090596870776e-06, + "loss": 0.3795, + "step": 2396 + }, + { + "epoch": 1.1147108975352658, + "grad_norm": 0.4230845868587494, + "learning_rate": 7.915892872483023e-06, + "loss": 0.3506, + "step": 2397 + }, + { + "epoch": 1.1151759417144629, + "grad_norm": 0.6087936162948608, + "learning_rate": 7.913694294077607e-06, + "loss": 0.3583, + "step": 2398 + }, + { + "epoch": 1.11564098589366, + "grad_norm": 0.5080538988113403, + "learning_rate": 7.911494862298464e-06, + "loss": 0.3579, + "step": 2399 + }, + { + "epoch": 1.116106030072857, + "grad_norm": 0.5257863402366638, + "learning_rate": 7.909294577789765e-06, + "loss": 0.3751, + "step": 2400 + }, + { + "epoch": 1.116571074252054, + "grad_norm": 0.4901667833328247, + "learning_rate": 7.90709344119594e-06, + "loss": 0.3494, + "step": 2401 + }, + { + "epoch": 1.117036118431251, + "grad_norm": 0.5573136210441589, + "learning_rate": 7.90489145316167e-06, + "loss": 0.3318, + "step": 2402 + }, + { + "epoch": 1.117501162610448, + "grad_norm": 0.5585957765579224, + "learning_rate": 7.902688614331875e-06, + "loss": 0.3737, + "step": 2403 + }, + { + "epoch": 1.117966206789645, + "grad_norm": 0.5125075578689575, + "learning_rate": 7.900484925351734e-06, + "loss": 0.4047, + "step": 2404 + }, + { + "epoch": 1.118431250968842, + "grad_norm": 0.45521363615989685, + "learning_rate": 7.898280386866673e-06, + "loss": 0.321, + "step": 2405 + }, + { + "epoch": 1.118896295148039, + "grad_norm": 0.5232580304145813, + "learning_rate": 7.896074999522362e-06, + "loss": 0.3717, + "step": 2406 + }, + { + "epoch": 1.1193613393272361, + "grad_norm": 0.395935982465744, + "learning_rate": 7.893868763964724e-06, + "loss": 0.3292, + "step": 2407 + }, + { + "epoch": 1.119826383506433, + "grad_norm": 0.5463009476661682, + "learning_rate": 7.891661680839932e-06, + "loss": 0.373, + "step": 2408 + }, + { + "epoch": 1.1202914276856302, + "grad_norm": 0.44592776894569397, + "learning_rate": 7.889453750794405e-06, + "loss": 0.3653, + "step": 2409 + }, + { + "epoch": 1.120756471864827, + "grad_norm": 0.45497268438339233, + "learning_rate": 7.887244974474807e-06, + "loss": 0.3252, + "step": 2410 + }, + { + "epoch": 1.1212215160440242, + "grad_norm": 0.4712156653404236, + "learning_rate": 7.885035352528054e-06, + "loss": 0.3514, + "step": 2411 + }, + { + "epoch": 1.121686560223221, + "grad_norm": 0.4327942430973053, + "learning_rate": 7.882824885601308e-06, + "loss": 0.3591, + "step": 2412 + }, + { + "epoch": 1.1221516044024182, + "grad_norm": 0.3762139678001404, + "learning_rate": 7.88061357434198e-06, + "loss": 0.3263, + "step": 2413 + }, + { + "epoch": 1.1226166485816154, + "grad_norm": 0.5221520662307739, + "learning_rate": 7.878401419397725e-06, + "loss": 0.3907, + "step": 2414 + }, + { + "epoch": 1.1230816927608123, + "grad_norm": 0.41757580637931824, + "learning_rate": 7.87618842141645e-06, + "loss": 0.3556, + "step": 2415 + }, + { + "epoch": 1.1235467369400094, + "grad_norm": 0.4390140771865845, + "learning_rate": 7.873974581046303e-06, + "loss": 0.3543, + "step": 2416 + }, + { + "epoch": 1.1240117811192063, + "grad_norm": 0.40529686212539673, + "learning_rate": 7.871759898935685e-06, + "loss": 0.3468, + "step": 2417 + }, + { + "epoch": 1.1244768252984034, + "grad_norm": 0.4217506945133209, + "learning_rate": 7.86954437573324e-06, + "loss": 0.3467, + "step": 2418 + }, + { + "epoch": 1.1249418694776003, + "grad_norm": 0.418305367231369, + "learning_rate": 7.867328012087856e-06, + "loss": 0.3951, + "step": 2419 + }, + { + "epoch": 1.1254069136567975, + "grad_norm": 0.42547518014907837, + "learning_rate": 7.865110808648671e-06, + "loss": 0.3369, + "step": 2420 + }, + { + "epoch": 1.1258719578359944, + "grad_norm": 0.41389068961143494, + "learning_rate": 7.862892766065072e-06, + "loss": 0.3423, + "step": 2421 + }, + { + "epoch": 1.1263370020151915, + "grad_norm": 0.47316908836364746, + "learning_rate": 7.86067388498668e-06, + "loss": 0.375, + "step": 2422 + }, + { + "epoch": 1.1268020461943884, + "grad_norm": 0.4010957181453705, + "learning_rate": 7.858454166063376e-06, + "loss": 0.3348, + "step": 2423 + }, + { + "epoch": 1.1272670903735855, + "grad_norm": 0.4541621804237366, + "learning_rate": 7.856233609945276e-06, + "loss": 0.3627, + "step": 2424 + }, + { + "epoch": 1.1277321345527824, + "grad_norm": 0.4253348410129547, + "learning_rate": 7.854012217282747e-06, + "loss": 0.3391, + "step": 2425 + }, + { + "epoch": 1.1281971787319796, + "grad_norm": 0.4224770963191986, + "learning_rate": 7.851789988726397e-06, + "loss": 0.3227, + "step": 2426 + }, + { + "epoch": 1.1286622229111765, + "grad_norm": 0.4292997121810913, + "learning_rate": 7.849566924927082e-06, + "loss": 0.3815, + "step": 2427 + }, + { + "epoch": 1.1291272670903736, + "grad_norm": 0.39106887578964233, + "learning_rate": 7.8473430265359e-06, + "loss": 0.3313, + "step": 2428 + }, + { + "epoch": 1.1295923112695707, + "grad_norm": 0.41027796268463135, + "learning_rate": 7.845118294204195e-06, + "loss": 0.3478, + "step": 2429 + }, + { + "epoch": 1.1300573554487676, + "grad_norm": 0.454773873090744, + "learning_rate": 7.842892728583557e-06, + "loss": 0.3566, + "step": 2430 + }, + { + "epoch": 1.1305223996279647, + "grad_norm": 0.4108147919178009, + "learning_rate": 7.840666330325815e-06, + "loss": 0.338, + "step": 2431 + }, + { + "epoch": 1.1309874438071617, + "grad_norm": 0.41285791993141174, + "learning_rate": 7.838439100083048e-06, + "loss": 0.372, + "step": 2432 + }, + { + "epoch": 1.1314524879863588, + "grad_norm": 0.44633156061172485, + "learning_rate": 7.836211038507571e-06, + "loss": 0.341, + "step": 2433 + }, + { + "epoch": 1.1319175321655557, + "grad_norm": 0.3967339098453522, + "learning_rate": 7.833982146251952e-06, + "loss": 0.3189, + "step": 2434 + }, + { + "epoch": 1.1323825763447528, + "grad_norm": 0.4380015432834625, + "learning_rate": 7.831752423968995e-06, + "loss": 0.3506, + "step": 2435 + }, + { + "epoch": 1.1328476205239497, + "grad_norm": 0.5487619638442993, + "learning_rate": 7.829521872311747e-06, + "loss": 0.3675, + "step": 2436 + }, + { + "epoch": 1.1333126647031468, + "grad_norm": 0.4468625783920288, + "learning_rate": 7.827290491933506e-06, + "loss": 0.3792, + "step": 2437 + }, + { + "epoch": 1.1337777088823437, + "grad_norm": 0.3934735059738159, + "learning_rate": 7.825058283487803e-06, + "loss": 0.3421, + "step": 2438 + }, + { + "epoch": 1.1342427530615409, + "grad_norm": 0.49053192138671875, + "learning_rate": 7.822825247628416e-06, + "loss": 0.3482, + "step": 2439 + }, + { + "epoch": 1.1347077972407378, + "grad_norm": 0.512285053730011, + "learning_rate": 7.820591385009366e-06, + "loss": 0.3616, + "step": 2440 + }, + { + "epoch": 1.135172841419935, + "grad_norm": 0.49586769938468933, + "learning_rate": 7.818356696284916e-06, + "loss": 0.3796, + "step": 2441 + }, + { + "epoch": 1.1356378855991318, + "grad_norm": 0.4258926808834076, + "learning_rate": 7.816121182109567e-06, + "loss": 0.332, + "step": 2442 + }, + { + "epoch": 1.136102929778329, + "grad_norm": 0.45421749353408813, + "learning_rate": 7.813884843138067e-06, + "loss": 0.3504, + "step": 2443 + }, + { + "epoch": 1.136567973957526, + "grad_norm": 0.45513275265693665, + "learning_rate": 7.811647680025403e-06, + "loss": 0.3345, + "step": 2444 + }, + { + "epoch": 1.137033018136723, + "grad_norm": 0.4567432403564453, + "learning_rate": 7.809409693426803e-06, + "loss": 0.3494, + "step": 2445 + }, + { + "epoch": 1.13749806231592, + "grad_norm": 0.4656786620616913, + "learning_rate": 7.807170883997738e-06, + "loss": 0.3428, + "step": 2446 + }, + { + "epoch": 1.137963106495117, + "grad_norm": 0.44131651520729065, + "learning_rate": 7.804931252393918e-06, + "loss": 0.3475, + "step": 2447 + }, + { + "epoch": 1.1384281506743141, + "grad_norm": 0.42925527691841125, + "learning_rate": 7.802690799271295e-06, + "loss": 0.3882, + "step": 2448 + }, + { + "epoch": 1.138893194853511, + "grad_norm": 0.4417758584022522, + "learning_rate": 7.800449525286062e-06, + "loss": 0.3428, + "step": 2449 + }, + { + "epoch": 1.1393582390327082, + "grad_norm": 0.4203170835971832, + "learning_rate": 7.79820743109465e-06, + "loss": 0.3425, + "step": 2450 + }, + { + "epoch": 1.139823283211905, + "grad_norm": 0.45581138134002686, + "learning_rate": 7.795964517353734e-06, + "loss": 0.3596, + "step": 2451 + }, + { + "epoch": 1.1402883273911022, + "grad_norm": 0.44257980585098267, + "learning_rate": 7.793720784720227e-06, + "loss": 0.3394, + "step": 2452 + }, + { + "epoch": 1.140753371570299, + "grad_norm": 0.40145984292030334, + "learning_rate": 7.791476233851281e-06, + "loss": 0.3502, + "step": 2453 + }, + { + "epoch": 1.1412184157494962, + "grad_norm": 0.48211121559143066, + "learning_rate": 7.789230865404287e-06, + "loss": 0.3394, + "step": 2454 + }, + { + "epoch": 1.1416834599286931, + "grad_norm": 0.4565165340900421, + "learning_rate": 7.78698468003688e-06, + "loss": 0.3687, + "step": 2455 + }, + { + "epoch": 1.1421485041078903, + "grad_norm": 0.4464455842971802, + "learning_rate": 7.784737678406929e-06, + "loss": 0.3255, + "step": 2456 + }, + { + "epoch": 1.1426135482870872, + "grad_norm": 0.6133958101272583, + "learning_rate": 7.782489861172545e-06, + "loss": 0.3996, + "step": 2457 + }, + { + "epoch": 1.1430785924662843, + "grad_norm": 0.3757973909378052, + "learning_rate": 7.780241228992075e-06, + "loss": 0.3129, + "step": 2458 + }, + { + "epoch": 1.1435436366454814, + "grad_norm": 0.5780712366104126, + "learning_rate": 7.777991782524112e-06, + "loss": 0.3765, + "step": 2459 + }, + { + "epoch": 1.1440086808246783, + "grad_norm": 0.5257070660591125, + "learning_rate": 7.775741522427477e-06, + "loss": 0.3453, + "step": 2460 + }, + { + "epoch": 1.1444737250038755, + "grad_norm": 0.49410927295684814, + "learning_rate": 7.773490449361238e-06, + "loss": 0.3747, + "step": 2461 + }, + { + "epoch": 1.1449387691830724, + "grad_norm": 0.46261581778526306, + "learning_rate": 7.771238563984696e-06, + "loss": 0.3363, + "step": 2462 + }, + { + "epoch": 1.1454038133622695, + "grad_norm": 0.6092031598091125, + "learning_rate": 7.768985866957392e-06, + "loss": 0.3472, + "step": 2463 + }, + { + "epoch": 1.1458688575414664, + "grad_norm": 0.42954081296920776, + "learning_rate": 7.766732358939106e-06, + "loss": 0.3336, + "step": 2464 + }, + { + "epoch": 1.1463339017206635, + "grad_norm": 0.4083530604839325, + "learning_rate": 7.764478040589854e-06, + "loss": 0.3561, + "step": 2465 + }, + { + "epoch": 1.1467989458998604, + "grad_norm": 0.46050816774368286, + "learning_rate": 7.762222912569886e-06, + "loss": 0.3711, + "step": 2466 + }, + { + "epoch": 1.1472639900790575, + "grad_norm": 0.460467129945755, + "learning_rate": 7.759966975539693e-06, + "loss": 0.3107, + "step": 2467 + }, + { + "epoch": 1.1477290342582545, + "grad_norm": 0.5141720771789551, + "learning_rate": 7.757710230160003e-06, + "loss": 0.3846, + "step": 2468 + }, + { + "epoch": 1.1481940784374516, + "grad_norm": 0.45427727699279785, + "learning_rate": 7.755452677091783e-06, + "loss": 0.3422, + "step": 2469 + }, + { + "epoch": 1.1486591226166485, + "grad_norm": 0.47798460721969604, + "learning_rate": 7.75319431699623e-06, + "loss": 0.3462, + "step": 2470 + }, + { + "epoch": 1.1491241667958456, + "grad_norm": 0.4749840497970581, + "learning_rate": 7.750935150534781e-06, + "loss": 0.3547, + "step": 2471 + }, + { + "epoch": 1.1495892109750425, + "grad_norm": 0.46182170510292053, + "learning_rate": 7.748675178369112e-06, + "loss": 0.395, + "step": 2472 + }, + { + "epoch": 1.1500542551542396, + "grad_norm": 0.504657506942749, + "learning_rate": 7.74641440116113e-06, + "loss": 0.3707, + "step": 2473 + }, + { + "epoch": 1.1505192993334368, + "grad_norm": 0.4313875436782837, + "learning_rate": 7.74415281957298e-06, + "loss": 0.3357, + "step": 2474 + }, + { + "epoch": 1.1509843435126337, + "grad_norm": 0.5049738883972168, + "learning_rate": 7.741890434267043e-06, + "loss": 0.3556, + "step": 2475 + }, + { + "epoch": 1.1514493876918308, + "grad_norm": 0.4582679569721222, + "learning_rate": 7.739627245905935e-06, + "loss": 0.3233, + "step": 2476 + }, + { + "epoch": 1.1519144318710277, + "grad_norm": 0.463096559047699, + "learning_rate": 7.737363255152506e-06, + "loss": 0.3661, + "step": 2477 + }, + { + "epoch": 1.1523794760502248, + "grad_norm": 0.4227137267589569, + "learning_rate": 7.735098462669843e-06, + "loss": 0.3238, + "step": 2478 + }, + { + "epoch": 1.1528445202294217, + "grad_norm": 0.5194125771522522, + "learning_rate": 7.732832869121267e-06, + "loss": 0.3713, + "step": 2479 + }, + { + "epoch": 1.1533095644086189, + "grad_norm": 0.5065972805023193, + "learning_rate": 7.730566475170334e-06, + "loss": 0.3743, + "step": 2480 + }, + { + "epoch": 1.1537746085878158, + "grad_norm": 0.4301440715789795, + "learning_rate": 7.728299281480833e-06, + "loss": 0.3142, + "step": 2481 + }, + { + "epoch": 1.154239652767013, + "grad_norm": 0.4768223464488983, + "learning_rate": 7.726031288716789e-06, + "loss": 0.3369, + "step": 2482 + }, + { + "epoch": 1.1547046969462098, + "grad_norm": 0.5396889448165894, + "learning_rate": 7.723762497542459e-06, + "loss": 0.383, + "step": 2483 + }, + { + "epoch": 1.155169741125407, + "grad_norm": 0.4935142993927002, + "learning_rate": 7.72149290862234e-06, + "loss": 0.3244, + "step": 2484 + }, + { + "epoch": 1.155634785304604, + "grad_norm": 0.428690642118454, + "learning_rate": 7.719222522621149e-06, + "loss": 0.3658, + "step": 2485 + }, + { + "epoch": 1.156099829483801, + "grad_norm": 0.4382781982421875, + "learning_rate": 7.716951340203851e-06, + "loss": 0.3524, + "step": 2486 + }, + { + "epoch": 1.1565648736629979, + "grad_norm": 0.594252347946167, + "learning_rate": 7.714679362035638e-06, + "loss": 0.3808, + "step": 2487 + }, + { + "epoch": 1.157029917842195, + "grad_norm": 0.4175407290458679, + "learning_rate": 7.712406588781935e-06, + "loss": 0.3164, + "step": 2488 + }, + { + "epoch": 1.1574949620213921, + "grad_norm": 0.4126685559749603, + "learning_rate": 7.7101330211084e-06, + "loss": 0.3416, + "step": 2489 + }, + { + "epoch": 1.157960006200589, + "grad_norm": 0.5235841274261475, + "learning_rate": 7.707858659680924e-06, + "loss": 0.3892, + "step": 2490 + }, + { + "epoch": 1.1584250503797862, + "grad_norm": 0.3950689733028412, + "learning_rate": 7.70558350516563e-06, + "loss": 0.3272, + "step": 2491 + }, + { + "epoch": 1.158890094558983, + "grad_norm": 0.4256301820278168, + "learning_rate": 7.703307558228875e-06, + "loss": 0.3423, + "step": 2492 + }, + { + "epoch": 1.1593551387381802, + "grad_norm": 0.4690292775630951, + "learning_rate": 7.701030819537248e-06, + "loss": 0.333, + "step": 2493 + }, + { + "epoch": 1.159820182917377, + "grad_norm": 0.4405955672264099, + "learning_rate": 7.698753289757565e-06, + "loss": 0.3981, + "step": 2494 + }, + { + "epoch": 1.1602852270965742, + "grad_norm": 0.5139094591140747, + "learning_rate": 7.69647496955688e-06, + "loss": 0.35, + "step": 2495 + }, + { + "epoch": 1.1607502712757711, + "grad_norm": 0.39744675159454346, + "learning_rate": 7.694195859602475e-06, + "loss": 0.3465, + "step": 2496 + }, + { + "epoch": 1.1612153154549683, + "grad_norm": 0.4022337794303894, + "learning_rate": 7.691915960561869e-06, + "loss": 0.3574, + "step": 2497 + }, + { + "epoch": 1.1616803596341652, + "grad_norm": 0.3752380907535553, + "learning_rate": 7.6896352731028e-06, + "loss": 0.3333, + "step": 2498 + }, + { + "epoch": 1.1621454038133623, + "grad_norm": 0.48939090967178345, + "learning_rate": 7.687353797893249e-06, + "loss": 0.3831, + "step": 2499 + }, + { + "epoch": 1.1626104479925594, + "grad_norm": 0.4494510591030121, + "learning_rate": 7.68507153560142e-06, + "loss": 0.3882, + "step": 2500 + }, + { + "epoch": 1.1630754921717563, + "grad_norm": 0.38291284441947937, + "learning_rate": 7.682788486895754e-06, + "loss": 0.3589, + "step": 2501 + }, + { + "epoch": 1.1635405363509532, + "grad_norm": 0.383709192276001, + "learning_rate": 7.680504652444917e-06, + "loss": 0.37, + "step": 2502 + }, + { + "epoch": 1.1640055805301504, + "grad_norm": 0.507161557674408, + "learning_rate": 7.678220032917806e-06, + "loss": 0.3843, + "step": 2503 + }, + { + "epoch": 1.1644706247093475, + "grad_norm": 0.3985297977924347, + "learning_rate": 7.675934628983551e-06, + "loss": 0.3142, + "step": 2504 + }, + { + "epoch": 1.1649356688885444, + "grad_norm": 0.41284704208374023, + "learning_rate": 7.67364844131151e-06, + "loss": 0.3457, + "step": 2505 + }, + { + "epoch": 1.1654007130677415, + "grad_norm": 0.49253928661346436, + "learning_rate": 7.671361470571265e-06, + "loss": 0.3974, + "step": 2506 + }, + { + "epoch": 1.1658657572469384, + "grad_norm": 0.41742268204689026, + "learning_rate": 7.669073717432641e-06, + "loss": 0.346, + "step": 2507 + }, + { + "epoch": 1.1663308014261355, + "grad_norm": 0.4668549597263336, + "learning_rate": 7.666785182565676e-06, + "loss": 0.3568, + "step": 2508 + }, + { + "epoch": 1.1667958456053324, + "grad_norm": 0.43478819727897644, + "learning_rate": 7.66449586664065e-06, + "loss": 0.2967, + "step": 2509 + }, + { + "epoch": 1.1672608897845296, + "grad_norm": 0.5247063636779785, + "learning_rate": 7.662205770328064e-06, + "loss": 0.4326, + "step": 2510 + }, + { + "epoch": 1.1677259339637265, + "grad_norm": 0.3936578333377838, + "learning_rate": 7.65991489429865e-06, + "loss": 0.3177, + "step": 2511 + }, + { + "epoch": 1.1681909781429236, + "grad_norm": 0.43789342045783997, + "learning_rate": 7.65762323922337e-06, + "loss": 0.3701, + "step": 2512 + }, + { + "epoch": 1.1686560223221205, + "grad_norm": 0.46906614303588867, + "learning_rate": 7.655330805773411e-06, + "loss": 0.3368, + "step": 2513 + }, + { + "epoch": 1.1691210665013176, + "grad_norm": 0.4493185877799988, + "learning_rate": 7.653037594620189e-06, + "loss": 0.3654, + "step": 2514 + }, + { + "epoch": 1.1695861106805148, + "grad_norm": 0.40783825516700745, + "learning_rate": 7.650743606435352e-06, + "loss": 0.3143, + "step": 2515 + }, + { + "epoch": 1.1700511548597117, + "grad_norm": 0.5146582722663879, + "learning_rate": 7.648448841890765e-06, + "loss": 0.3812, + "step": 2516 + }, + { + "epoch": 1.1705161990389086, + "grad_norm": 0.4294111430644989, + "learning_rate": 7.646153301658534e-06, + "loss": 0.3192, + "step": 2517 + }, + { + "epoch": 1.1709812432181057, + "grad_norm": 0.4016132354736328, + "learning_rate": 7.643856986410983e-06, + "loss": 0.3582, + "step": 2518 + }, + { + "epoch": 1.1714462873973028, + "grad_norm": 0.4334622919559479, + "learning_rate": 7.641559896820664e-06, + "loss": 0.3611, + "step": 2519 + }, + { + "epoch": 1.1719113315764997, + "grad_norm": 0.4894271492958069, + "learning_rate": 7.63926203356036e-06, + "loss": 0.3673, + "step": 2520 + }, + { + "epoch": 1.1723763757556969, + "grad_norm": 0.45334476232528687, + "learning_rate": 7.636963397303074e-06, + "loss": 0.3622, + "step": 2521 + }, + { + "epoch": 1.1728414199348938, + "grad_norm": 0.4029448926448822, + "learning_rate": 7.634663988722044e-06, + "loss": 0.3332, + "step": 2522 + }, + { + "epoch": 1.173306464114091, + "grad_norm": 0.4322480261325836, + "learning_rate": 7.632363808490726e-06, + "loss": 0.3174, + "step": 2523 + }, + { + "epoch": 1.1737715082932878, + "grad_norm": 0.4665205180644989, + "learning_rate": 7.630062857282806e-06, + "loss": 0.3799, + "step": 2524 + }, + { + "epoch": 1.174236552472485, + "grad_norm": 0.438932329416275, + "learning_rate": 7.627761135772196e-06, + "loss": 0.3446, + "step": 2525 + }, + { + "epoch": 1.1747015966516818, + "grad_norm": 0.40907543897628784, + "learning_rate": 7.625458644633032e-06, + "loss": 0.3606, + "step": 2526 + }, + { + "epoch": 1.175166640830879, + "grad_norm": 0.39875200390815735, + "learning_rate": 7.623155384539678e-06, + "loss": 0.315, + "step": 2527 + }, + { + "epoch": 1.1756316850100759, + "grad_norm": 0.39767640829086304, + "learning_rate": 7.6208513561667184e-06, + "loss": 0.3272, + "step": 2528 + }, + { + "epoch": 1.176096729189273, + "grad_norm": 0.4325427711009979, + "learning_rate": 7.618546560188968e-06, + "loss": 0.3653, + "step": 2529 + }, + { + "epoch": 1.1765617733684701, + "grad_norm": 0.43984878063201904, + "learning_rate": 7.616240997281465e-06, + "loss": 0.3544, + "step": 2530 + }, + { + "epoch": 1.177026817547667, + "grad_norm": 0.42820581793785095, + "learning_rate": 7.613934668119467e-06, + "loss": 0.3906, + "step": 2531 + }, + { + "epoch": 1.177491861726864, + "grad_norm": 0.44489437341690063, + "learning_rate": 7.611627573378466e-06, + "loss": 0.3505, + "step": 2532 + }, + { + "epoch": 1.177956905906061, + "grad_norm": 0.4203685522079468, + "learning_rate": 7.609319713734169e-06, + "loss": 0.3437, + "step": 2533 + }, + { + "epoch": 1.1784219500852582, + "grad_norm": 0.47429367899894714, + "learning_rate": 7.607011089862512e-06, + "loss": 0.3635, + "step": 2534 + }, + { + "epoch": 1.178886994264455, + "grad_norm": 0.4002319276332855, + "learning_rate": 7.604701702439652e-06, + "loss": 0.3221, + "step": 2535 + }, + { + "epoch": 1.1793520384436522, + "grad_norm": 0.45908692479133606, + "learning_rate": 7.602391552141972e-06, + "loss": 0.3561, + "step": 2536 + }, + { + "epoch": 1.1798170826228491, + "grad_norm": 0.4550569951534271, + "learning_rate": 7.600080639646077e-06, + "loss": 0.3201, + "step": 2537 + }, + { + "epoch": 1.1802821268020462, + "grad_norm": 0.4702633321285248, + "learning_rate": 7.597768965628798e-06, + "loss": 0.36, + "step": 2538 + }, + { + "epoch": 1.1807471709812432, + "grad_norm": 0.5423824787139893, + "learning_rate": 7.595456530767185e-06, + "loss": 0.3632, + "step": 2539 + }, + { + "epoch": 1.1812122151604403, + "grad_norm": 0.44125810265541077, + "learning_rate": 7.593143335738511e-06, + "loss": 0.3491, + "step": 2540 + }, + { + "epoch": 1.1816772593396372, + "grad_norm": 0.4145257771015167, + "learning_rate": 7.590829381220275e-06, + "loss": 0.3175, + "step": 2541 + }, + { + "epoch": 1.1821423035188343, + "grad_norm": 0.5215808153152466, + "learning_rate": 7.5885146678901954e-06, + "loss": 0.3891, + "step": 2542 + }, + { + "epoch": 1.1826073476980312, + "grad_norm": 0.4379088878631592, + "learning_rate": 7.586199196426216e-06, + "loss": 0.3551, + "step": 2543 + }, + { + "epoch": 1.1830723918772283, + "grad_norm": 0.42644643783569336, + "learning_rate": 7.583882967506502e-06, + "loss": 0.3282, + "step": 2544 + }, + { + "epoch": 1.1835374360564255, + "grad_norm": 0.4398006200790405, + "learning_rate": 7.581565981809435e-06, + "loss": 0.3172, + "step": 2545 + }, + { + "epoch": 1.1840024802356224, + "grad_norm": 0.46293768286705017, + "learning_rate": 7.579248240013626e-06, + "loss": 0.3876, + "step": 2546 + }, + { + "epoch": 1.1844675244148193, + "grad_norm": 0.46180588006973267, + "learning_rate": 7.576929742797902e-06, + "loss": 0.3936, + "step": 2547 + }, + { + "epoch": 1.1849325685940164, + "grad_norm": 0.4307175874710083, + "learning_rate": 7.574610490841312e-06, + "loss": 0.3386, + "step": 2548 + }, + { + "epoch": 1.1853976127732135, + "grad_norm": 0.3494255542755127, + "learning_rate": 7.5722904848231315e-06, + "loss": 0.3124, + "step": 2549 + }, + { + "epoch": 1.1858626569524104, + "grad_norm": 0.4179413914680481, + "learning_rate": 7.5699697254228496e-06, + "loss": 0.3271, + "step": 2550 + }, + { + "epoch": 1.1863277011316076, + "grad_norm": 0.47171148657798767, + "learning_rate": 7.56764821332018e-06, + "loss": 0.3899, + "step": 2551 + }, + { + "epoch": 1.1867927453108045, + "grad_norm": 0.4213588535785675, + "learning_rate": 7.565325949195055e-06, + "loss": 0.3373, + "step": 2552 + }, + { + "epoch": 1.1872577894900016, + "grad_norm": 0.48638585209846497, + "learning_rate": 7.563002933727628e-06, + "loss": 0.3578, + "step": 2553 + }, + { + "epoch": 1.1877228336691985, + "grad_norm": 0.4090135097503662, + "learning_rate": 7.560679167598273e-06, + "loss": 0.3496, + "step": 2554 + }, + { + "epoch": 1.1881878778483956, + "grad_norm": 0.44248971343040466, + "learning_rate": 7.558354651487583e-06, + "loss": 0.3446, + "step": 2555 + }, + { + "epoch": 1.1886529220275925, + "grad_norm": 0.40973594784736633, + "learning_rate": 7.556029386076371e-06, + "loss": 0.3623, + "step": 2556 + }, + { + "epoch": 1.1891179662067897, + "grad_norm": 0.4441074728965759, + "learning_rate": 7.553703372045671e-06, + "loss": 0.3421, + "step": 2557 + }, + { + "epoch": 1.1895830103859866, + "grad_norm": 0.48949727416038513, + "learning_rate": 7.5513766100767334e-06, + "loss": 0.3372, + "step": 2558 + }, + { + "epoch": 1.1900480545651837, + "grad_norm": 0.4103635251522064, + "learning_rate": 7.549049100851029e-06, + "loss": 0.3492, + "step": 2559 + }, + { + "epoch": 1.1905130987443808, + "grad_norm": 0.395436555147171, + "learning_rate": 7.546720845050247e-06, + "loss": 0.3475, + "step": 2560 + }, + { + "epoch": 1.1909781429235777, + "grad_norm": 0.4113844335079193, + "learning_rate": 7.544391843356298e-06, + "loss": 0.3744, + "step": 2561 + }, + { + "epoch": 1.1914431871027749, + "grad_norm": 0.397686243057251, + "learning_rate": 7.542062096451306e-06, + "loss": 0.333, + "step": 2562 + }, + { + "epoch": 1.1919082312819718, + "grad_norm": 0.4004164934158325, + "learning_rate": 7.539731605017616e-06, + "loss": 0.3354, + "step": 2563 + }, + { + "epoch": 1.192373275461169, + "grad_norm": 0.43825533986091614, + "learning_rate": 7.537400369737793e-06, + "loss": 0.3887, + "step": 2564 + }, + { + "epoch": 1.1928383196403658, + "grad_norm": 0.433907151222229, + "learning_rate": 7.535068391294618e-06, + "loss": 0.3413, + "step": 2565 + }, + { + "epoch": 1.193303363819563, + "grad_norm": 0.3985256254673004, + "learning_rate": 7.532735670371088e-06, + "loss": 0.3353, + "step": 2566 + }, + { + "epoch": 1.1937684079987598, + "grad_norm": 0.4107392728328705, + "learning_rate": 7.530402207650418e-06, + "loss": 0.3753, + "step": 2567 + }, + { + "epoch": 1.194233452177957, + "grad_norm": 0.40422889590263367, + "learning_rate": 7.528068003816045e-06, + "loss": 0.3373, + "step": 2568 + }, + { + "epoch": 1.1946984963571539, + "grad_norm": 0.44744202494621277, + "learning_rate": 7.525733059551618e-06, + "loss": 0.3985, + "step": 2569 + }, + { + "epoch": 1.195163540536351, + "grad_norm": 0.4296942353248596, + "learning_rate": 7.523397375541003e-06, + "loss": 0.3075, + "step": 2570 + }, + { + "epoch": 1.195628584715548, + "grad_norm": 0.47275644540786743, + "learning_rate": 7.521060952468284e-06, + "loss": 0.3916, + "step": 2571 + }, + { + "epoch": 1.196093628894745, + "grad_norm": 0.4506266117095947, + "learning_rate": 7.518723791017762e-06, + "loss": 0.3457, + "step": 2572 + }, + { + "epoch": 1.196558673073942, + "grad_norm": 0.5241588354110718, + "learning_rate": 7.516385891873954e-06, + "loss": 0.398, + "step": 2573 + }, + { + "epoch": 1.197023717253139, + "grad_norm": 0.38779518008232117, + "learning_rate": 7.5140472557215945e-06, + "loss": 0.3554, + "step": 2574 + }, + { + "epoch": 1.1974887614323362, + "grad_norm": 0.418839693069458, + "learning_rate": 7.511707883245627e-06, + "loss": 0.3467, + "step": 2575 + }, + { + "epoch": 1.197953805611533, + "grad_norm": 0.5039485692977905, + "learning_rate": 7.50936777513122e-06, + "loss": 0.4058, + "step": 2576 + }, + { + "epoch": 1.1984188497907302, + "grad_norm": 0.4700038433074951, + "learning_rate": 7.50702693206375e-06, + "loss": 0.3266, + "step": 2577 + }, + { + "epoch": 1.1988838939699271, + "grad_norm": 0.43291565775871277, + "learning_rate": 7.5046853547288155e-06, + "loss": 0.3645, + "step": 2578 + }, + { + "epoch": 1.1993489381491242, + "grad_norm": 0.45220211148262024, + "learning_rate": 7.502343043812224e-06, + "loss": 0.3327, + "step": 2579 + }, + { + "epoch": 1.1998139823283211, + "grad_norm": 0.4677691161632538, + "learning_rate": 7.500000000000001e-06, + "loss": 0.3708, + "step": 2580 + }, + { + "epoch": 1.2002790265075183, + "grad_norm": 0.43373778462409973, + "learning_rate": 7.497656223978385e-06, + "loss": 0.339, + "step": 2581 + }, + { + "epoch": 1.2007440706867152, + "grad_norm": 0.44764167070388794, + "learning_rate": 7.495311716433833e-06, + "loss": 0.3613, + "step": 2582 + }, + { + "epoch": 1.2012091148659123, + "grad_norm": 0.4182588756084442, + "learning_rate": 7.492966478053009e-06, + "loss": 0.3325, + "step": 2583 + }, + { + "epoch": 1.2016741590451092, + "grad_norm": 0.35634884238243103, + "learning_rate": 7.490620509522797e-06, + "loss": 0.3005, + "step": 2584 + }, + { + "epoch": 1.2021392032243063, + "grad_norm": 0.48205626010894775, + "learning_rate": 7.488273811530294e-06, + "loss": 0.4052, + "step": 2585 + }, + { + "epoch": 1.2026042474035032, + "grad_norm": 0.41472601890563965, + "learning_rate": 7.48592638476281e-06, + "loss": 0.3258, + "step": 2586 + }, + { + "epoch": 1.2030692915827004, + "grad_norm": 0.4383096992969513, + "learning_rate": 7.483578229907866e-06, + "loss": 0.3553, + "step": 2587 + }, + { + "epoch": 1.2035343357618973, + "grad_norm": 0.45658400654792786, + "learning_rate": 7.481229347653201e-06, + "loss": 0.3551, + "step": 2588 + }, + { + "epoch": 1.2039993799410944, + "grad_norm": 0.4237629771232605, + "learning_rate": 7.4788797386867596e-06, + "loss": 0.3481, + "step": 2589 + }, + { + "epoch": 1.2044644241202915, + "grad_norm": 0.4323580861091614, + "learning_rate": 7.47652940369671e-06, + "loss": 0.3566, + "step": 2590 + }, + { + "epoch": 1.2049294682994884, + "grad_norm": 0.4833803176879883, + "learning_rate": 7.474178343371425e-06, + "loss": 0.3718, + "step": 2591 + }, + { + "epoch": 1.2053945124786856, + "grad_norm": 0.4594928026199341, + "learning_rate": 7.471826558399492e-06, + "loss": 0.3428, + "step": 2592 + }, + { + "epoch": 1.2058595566578825, + "grad_norm": 0.4109436571598053, + "learning_rate": 7.469474049469709e-06, + "loss": 0.3287, + "step": 2593 + }, + { + "epoch": 1.2063246008370796, + "grad_norm": 0.43107515573501587, + "learning_rate": 7.467120817271091e-06, + "loss": 0.3694, + "step": 2594 + }, + { + "epoch": 1.2067896450162765, + "grad_norm": 0.4271430969238281, + "learning_rate": 7.464766862492856e-06, + "loss": 0.3546, + "step": 2595 + }, + { + "epoch": 1.2072546891954736, + "grad_norm": 0.4466875195503235, + "learning_rate": 7.4624121858244455e-06, + "loss": 0.357, + "step": 2596 + }, + { + "epoch": 1.2077197333746705, + "grad_norm": 0.3861998915672302, + "learning_rate": 7.460056787955502e-06, + "loss": 0.3172, + "step": 2597 + }, + { + "epoch": 1.2081847775538677, + "grad_norm": 0.4079117774963379, + "learning_rate": 7.4577006695758845e-06, + "loss": 0.3402, + "step": 2598 + }, + { + "epoch": 1.2086498217330646, + "grad_norm": 0.521819531917572, + "learning_rate": 7.455343831375662e-06, + "loss": 0.4022, + "step": 2599 + }, + { + "epoch": 1.2091148659122617, + "grad_norm": 0.45120519399642944, + "learning_rate": 7.452986274045114e-06, + "loss": 0.3237, + "step": 2600 + }, + { + "epoch": 1.2095799100914586, + "grad_norm": 0.42835864424705505, + "learning_rate": 7.45062799827473e-06, + "loss": 0.3277, + "step": 2601 + }, + { + "epoch": 1.2100449542706557, + "grad_norm": 0.5196753144264221, + "learning_rate": 7.4482690047552105e-06, + "loss": 0.3841, + "step": 2602 + }, + { + "epoch": 1.2105099984498526, + "grad_norm": 0.616445004940033, + "learning_rate": 7.445909294177469e-06, + "loss": 0.4043, + "step": 2603 + }, + { + "epoch": 1.2109750426290498, + "grad_norm": 0.35232779383659363, + "learning_rate": 7.4435488672326235e-06, + "loss": 0.293, + "step": 2604 + }, + { + "epoch": 1.2114400868082469, + "grad_norm": 0.5146499276161194, + "learning_rate": 7.441187724612007e-06, + "loss": 0.3402, + "step": 2605 + }, + { + "epoch": 1.2119051309874438, + "grad_norm": 0.5325912833213806, + "learning_rate": 7.438825867007156e-06, + "loss": 0.3614, + "step": 2606 + }, + { + "epoch": 1.212370175166641, + "grad_norm": 0.4090646803379059, + "learning_rate": 7.436463295109824e-06, + "loss": 0.3651, + "step": 2607 + }, + { + "epoch": 1.2128352193458378, + "grad_norm": 0.4489936828613281, + "learning_rate": 7.434100009611969e-06, + "loss": 0.3855, + "step": 2608 + }, + { + "epoch": 1.213300263525035, + "grad_norm": 0.410040020942688, + "learning_rate": 7.431736011205759e-06, + "loss": 0.3056, + "step": 2609 + }, + { + "epoch": 1.2137653077042319, + "grad_norm": 0.4164040982723236, + "learning_rate": 7.42937130058357e-06, + "loss": 0.3439, + "step": 2610 + }, + { + "epoch": 1.214230351883429, + "grad_norm": 0.4618469774723053, + "learning_rate": 7.427005878437989e-06, + "loss": 0.3526, + "step": 2611 + }, + { + "epoch": 1.2146953960626259, + "grad_norm": 0.4020163416862488, + "learning_rate": 7.4246397454618054e-06, + "loss": 0.3325, + "step": 2612 + }, + { + "epoch": 1.215160440241823, + "grad_norm": 0.442120224237442, + "learning_rate": 7.422272902348027e-06, + "loss": 0.3265, + "step": 2613 + }, + { + "epoch": 1.21562548442102, + "grad_norm": 0.527863085269928, + "learning_rate": 7.41990534978986e-06, + "loss": 0.3838, + "step": 2614 + }, + { + "epoch": 1.216090528600217, + "grad_norm": 0.4838730990886688, + "learning_rate": 7.417537088480722e-06, + "loss": 0.3671, + "step": 2615 + }, + { + "epoch": 1.216555572779414, + "grad_norm": 0.3663950264453888, + "learning_rate": 7.41516811911424e-06, + "loss": 0.3008, + "step": 2616 + }, + { + "epoch": 1.217020616958611, + "grad_norm": 0.4481393098831177, + "learning_rate": 7.412798442384246e-06, + "loss": 0.375, + "step": 2617 + }, + { + "epoch": 1.217485661137808, + "grad_norm": 0.500136137008667, + "learning_rate": 7.4104280589847785e-06, + "loss": 0.3493, + "step": 2618 + }, + { + "epoch": 1.2179507053170051, + "grad_norm": 0.4056425988674164, + "learning_rate": 7.408056969610087e-06, + "loss": 0.3305, + "step": 2619 + }, + { + "epoch": 1.2184157494962022, + "grad_norm": 0.3582363724708557, + "learning_rate": 7.405685174954623e-06, + "loss": 0.3193, + "step": 2620 + }, + { + "epoch": 1.2188807936753991, + "grad_norm": 0.5270162224769592, + "learning_rate": 7.403312675713047e-06, + "loss": 0.3598, + "step": 2621 + }, + { + "epoch": 1.2193458378545963, + "grad_norm": 0.5136641263961792, + "learning_rate": 7.400939472580227e-06, + "loss": 0.3283, + "step": 2622 + }, + { + "epoch": 1.2198108820337932, + "grad_norm": 0.46304944157600403, + "learning_rate": 7.398565566251232e-06, + "loss": 0.3431, + "step": 2623 + }, + { + "epoch": 1.2202759262129903, + "grad_norm": 0.45918190479278564, + "learning_rate": 7.396190957421343e-06, + "loss": 0.3276, + "step": 2624 + }, + { + "epoch": 1.2207409703921872, + "grad_norm": 0.5070422291755676, + "learning_rate": 7.393815646786047e-06, + "loss": 0.3519, + "step": 2625 + }, + { + "epoch": 1.2212060145713843, + "grad_norm": 0.45554548501968384, + "learning_rate": 7.3914396350410285e-06, + "loss": 0.3358, + "step": 2626 + }, + { + "epoch": 1.2216710587505812, + "grad_norm": 0.43549612164497375, + "learning_rate": 7.389062922882187e-06, + "loss": 0.3329, + "step": 2627 + }, + { + "epoch": 1.2221361029297784, + "grad_norm": 0.4475855231285095, + "learning_rate": 7.3866855110056205e-06, + "loss": 0.3651, + "step": 2628 + }, + { + "epoch": 1.2226011471089753, + "grad_norm": 0.47854486107826233, + "learning_rate": 7.384307400107635e-06, + "loss": 0.3577, + "step": 2629 + }, + { + "epoch": 1.2230661912881724, + "grad_norm": 0.4717871844768524, + "learning_rate": 7.381928590884741e-06, + "loss": 0.4082, + "step": 2630 + }, + { + "epoch": 1.2235312354673693, + "grad_norm": 0.42780524492263794, + "learning_rate": 7.379549084033653e-06, + "loss": 0.3446, + "step": 2631 + }, + { + "epoch": 1.2239962796465664, + "grad_norm": 0.41821184754371643, + "learning_rate": 7.37716888025129e-06, + "loss": 0.3719, + "step": 2632 + }, + { + "epoch": 1.2244613238257633, + "grad_norm": 0.43905121088027954, + "learning_rate": 7.374787980234775e-06, + "loss": 0.3395, + "step": 2633 + }, + { + "epoch": 1.2249263680049605, + "grad_norm": 0.44092613458633423, + "learning_rate": 7.372406384681433e-06, + "loss": 0.3066, + "step": 2634 + }, + { + "epoch": 1.2253914121841576, + "grad_norm": 0.5179281234741211, + "learning_rate": 7.370024094288797e-06, + "loss": 0.3801, + "step": 2635 + }, + { + "epoch": 1.2258564563633545, + "grad_norm": 0.4214264452457428, + "learning_rate": 7.3676411097546e-06, + "loss": 0.3758, + "step": 2636 + }, + { + "epoch": 1.2263215005425516, + "grad_norm": 0.43435075879096985, + "learning_rate": 7.36525743177678e-06, + "loss": 0.3487, + "step": 2637 + }, + { + "epoch": 1.2267865447217485, + "grad_norm": 0.4535754323005676, + "learning_rate": 7.362873061053479e-06, + "loss": 0.3527, + "step": 2638 + }, + { + "epoch": 1.2272515889009457, + "grad_norm": 0.4143884778022766, + "learning_rate": 7.360487998283038e-06, + "loss": 0.3025, + "step": 2639 + }, + { + "epoch": 1.2277166330801426, + "grad_norm": 0.4507713317871094, + "learning_rate": 7.358102244164003e-06, + "loss": 0.3842, + "step": 2640 + }, + { + "epoch": 1.2281816772593397, + "grad_norm": 0.4342232942581177, + "learning_rate": 7.355715799395126e-06, + "loss": 0.37, + "step": 2641 + }, + { + "epoch": 1.2286467214385366, + "grad_norm": 0.38228335976600647, + "learning_rate": 7.353328664675353e-06, + "loss": 0.3279, + "step": 2642 + }, + { + "epoch": 1.2291117656177337, + "grad_norm": 0.42254742980003357, + "learning_rate": 7.350940840703842e-06, + "loss": 0.3522, + "step": 2643 + }, + { + "epoch": 1.2295768097969306, + "grad_norm": 0.48193877935409546, + "learning_rate": 7.348552328179945e-06, + "loss": 0.3823, + "step": 2644 + }, + { + "epoch": 1.2300418539761278, + "grad_norm": 0.4392937123775482, + "learning_rate": 7.3461631278032175e-06, + "loss": 0.3688, + "step": 2645 + }, + { + "epoch": 1.2305068981553249, + "grad_norm": 0.545441210269928, + "learning_rate": 7.34377324027342e-06, + "loss": 0.3537, + "step": 2646 + }, + { + "epoch": 1.2309719423345218, + "grad_norm": 0.43187499046325684, + "learning_rate": 7.3413826662905104e-06, + "loss": 0.314, + "step": 2647 + }, + { + "epoch": 1.2314369865137187, + "grad_norm": 0.4856451451778412, + "learning_rate": 7.33899140655465e-06, + "loss": 0.386, + "step": 2648 + }, + { + "epoch": 1.2319020306929158, + "grad_norm": 0.4500555396080017, + "learning_rate": 7.336599461766199e-06, + "loss": 0.33, + "step": 2649 + }, + { + "epoch": 1.232367074872113, + "grad_norm": 0.41082900762557983, + "learning_rate": 7.334206832625719e-06, + "loss": 0.3631, + "step": 2650 + }, + { + "epoch": 1.2328321190513098, + "grad_norm": 0.43610817193984985, + "learning_rate": 7.331813519833972e-06, + "loss": 0.3594, + "step": 2651 + }, + { + "epoch": 1.233297163230507, + "grad_norm": 0.3768427073955536, + "learning_rate": 7.329419524091923e-06, + "loss": 0.3397, + "step": 2652 + }, + { + "epoch": 1.2337622074097039, + "grad_norm": 0.4704180657863617, + "learning_rate": 7.32702484610073e-06, + "loss": 0.339, + "step": 2653 + }, + { + "epoch": 1.234227251588901, + "grad_norm": 0.5134731531143188, + "learning_rate": 7.32462948656176e-06, + "loss": 0.389, + "step": 2654 + }, + { + "epoch": 1.234692295768098, + "grad_norm": 0.454752117395401, + "learning_rate": 7.322233446176571e-06, + "loss": 0.3489, + "step": 2655 + }, + { + "epoch": 1.235157339947295, + "grad_norm": 0.4670364260673523, + "learning_rate": 7.319836725646927e-06, + "loss": 0.3759, + "step": 2656 + }, + { + "epoch": 1.235622384126492, + "grad_norm": 0.423016220331192, + "learning_rate": 7.317439325674788e-06, + "loss": 0.3212, + "step": 2657 + }, + { + "epoch": 1.236087428305689, + "grad_norm": 0.5560788512229919, + "learning_rate": 7.315041246962313e-06, + "loss": 0.3528, + "step": 2658 + }, + { + "epoch": 1.236552472484886, + "grad_norm": 0.3992140591144562, + "learning_rate": 7.312642490211857e-06, + "loss": 0.3611, + "step": 2659 + }, + { + "epoch": 1.237017516664083, + "grad_norm": 0.49364173412323, + "learning_rate": 7.310243056125984e-06, + "loss": 0.3515, + "step": 2660 + }, + { + "epoch": 1.2374825608432802, + "grad_norm": 0.5286840796470642, + "learning_rate": 7.307842945407444e-06, + "loss": 0.3108, + "step": 2661 + }, + { + "epoch": 1.2379476050224771, + "grad_norm": 0.4787932336330414, + "learning_rate": 7.3054421587591925e-06, + "loss": 0.3592, + "step": 2662 + }, + { + "epoch": 1.238412649201674, + "grad_norm": 0.4455261528491974, + "learning_rate": 7.30304069688438e-06, + "loss": 0.3157, + "step": 2663 + }, + { + "epoch": 1.2388776933808712, + "grad_norm": 0.4244392216205597, + "learning_rate": 7.300638560486357e-06, + "loss": 0.3612, + "step": 2664 + }, + { + "epoch": 1.2393427375600683, + "grad_norm": 0.4475732147693634, + "learning_rate": 7.298235750268667e-06, + "loss": 0.3858, + "step": 2665 + }, + { + "epoch": 1.2398077817392652, + "grad_norm": 0.4742658734321594, + "learning_rate": 7.295832266935059e-06, + "loss": 0.3213, + "step": 2666 + }, + { + "epoch": 1.2402728259184623, + "grad_norm": 0.4781046509742737, + "learning_rate": 7.29342811118947e-06, + "loss": 0.3711, + "step": 2667 + }, + { + "epoch": 1.2407378700976592, + "grad_norm": 0.4173920750617981, + "learning_rate": 7.29102328373604e-06, + "loss": 0.3644, + "step": 2668 + }, + { + "epoch": 1.2412029142768564, + "grad_norm": 0.4399764835834503, + "learning_rate": 7.288617785279104e-06, + "loss": 0.3529, + "step": 2669 + }, + { + "epoch": 1.2416679584560533, + "grad_norm": 0.41541317105293274, + "learning_rate": 7.286211616523193e-06, + "loss": 0.3791, + "step": 2670 + }, + { + "epoch": 1.2421330026352504, + "grad_norm": 0.4661194682121277, + "learning_rate": 7.283804778173035e-06, + "loss": 0.3629, + "step": 2671 + }, + { + "epoch": 1.2425980468144473, + "grad_norm": 0.45775488018989563, + "learning_rate": 7.281397270933553e-06, + "loss": 0.3493, + "step": 2672 + }, + { + "epoch": 1.2430630909936444, + "grad_norm": 0.45113879442214966, + "learning_rate": 7.278989095509867e-06, + "loss": 0.3641, + "step": 2673 + }, + { + "epoch": 1.2435281351728413, + "grad_norm": 0.4759120047092438, + "learning_rate": 7.2765802526072945e-06, + "loss": 0.3871, + "step": 2674 + }, + { + "epoch": 1.2439931793520385, + "grad_norm": 0.35728970170021057, + "learning_rate": 7.274170742931345e-06, + "loss": 0.3399, + "step": 2675 + }, + { + "epoch": 1.2444582235312356, + "grad_norm": 0.41665512323379517, + "learning_rate": 7.271760567187723e-06, + "loss": 0.3501, + "step": 2676 + }, + { + "epoch": 1.2449232677104325, + "grad_norm": 0.5279736518859863, + "learning_rate": 7.269349726082332e-06, + "loss": 0.3652, + "step": 2677 + }, + { + "epoch": 1.2453883118896294, + "grad_norm": 0.3690641522407532, + "learning_rate": 7.266938220321268e-06, + "loss": 0.3099, + "step": 2678 + }, + { + "epoch": 1.2458533560688265, + "grad_norm": 0.49263355135917664, + "learning_rate": 7.264526050610821e-06, + "loss": 0.3927, + "step": 2679 + }, + { + "epoch": 1.2463184002480237, + "grad_norm": 0.3883993327617645, + "learning_rate": 7.2621132176574774e-06, + "loss": 0.3367, + "step": 2680 + }, + { + "epoch": 1.2467834444272206, + "grad_norm": 0.4446150064468384, + "learning_rate": 7.2596997221679156e-06, + "loss": 0.3588, + "step": 2681 + }, + { + "epoch": 1.2472484886064177, + "grad_norm": 0.5633254051208496, + "learning_rate": 7.2572855648490105e-06, + "loss": 0.3633, + "step": 2682 + }, + { + "epoch": 1.2477135327856146, + "grad_norm": 0.4423445165157318, + "learning_rate": 7.25487074640783e-06, + "loss": 0.3665, + "step": 2683 + }, + { + "epoch": 1.2481785769648117, + "grad_norm": 0.48402294516563416, + "learning_rate": 7.252455267551631e-06, + "loss": 0.3508, + "step": 2684 + }, + { + "epoch": 1.2486436211440086, + "grad_norm": 0.53091961145401, + "learning_rate": 7.250039128987874e-06, + "loss": 0.3603, + "step": 2685 + }, + { + "epoch": 1.2491086653232057, + "grad_norm": 0.47006434202194214, + "learning_rate": 7.247622331424202e-06, + "loss": 0.3463, + "step": 2686 + }, + { + "epoch": 1.2495737095024027, + "grad_norm": 0.5498688817024231, + "learning_rate": 7.245204875568459e-06, + "loss": 0.3642, + "step": 2687 + }, + { + "epoch": 1.2500387536815998, + "grad_norm": 0.4160943627357483, + "learning_rate": 7.242786762128675e-06, + "loss": 0.3097, + "step": 2688 + }, + { + "epoch": 1.2505037978607967, + "grad_norm": 0.38064897060394287, + "learning_rate": 7.240367991813079e-06, + "loss": 0.3235, + "step": 2689 + }, + { + "epoch": 1.2509688420399938, + "grad_norm": 0.3919866383075714, + "learning_rate": 7.237948565330091e-06, + "loss": 0.3477, + "step": 2690 + }, + { + "epoch": 1.251433886219191, + "grad_norm": 0.4545346200466156, + "learning_rate": 7.235528483388318e-06, + "loss": 0.3706, + "step": 2691 + }, + { + "epoch": 1.2518989303983878, + "grad_norm": 0.43774187564849854, + "learning_rate": 7.233107746696564e-06, + "loss": 0.3314, + "step": 2692 + }, + { + "epoch": 1.2523639745775847, + "grad_norm": 0.44677144289016724, + "learning_rate": 7.230686355963824e-06, + "loss": 0.3369, + "step": 2693 + }, + { + "epoch": 1.2528290187567819, + "grad_norm": 0.4839126467704773, + "learning_rate": 7.228264311899284e-06, + "loss": 0.3949, + "step": 2694 + }, + { + "epoch": 1.253294062935979, + "grad_norm": 0.5056577920913696, + "learning_rate": 7.22584161521232e-06, + "loss": 0.3997, + "step": 2695 + }, + { + "epoch": 1.253759107115176, + "grad_norm": 0.35565152764320374, + "learning_rate": 7.223418266612503e-06, + "loss": 0.3188, + "step": 2696 + }, + { + "epoch": 1.254224151294373, + "grad_norm": 0.3856726884841919, + "learning_rate": 7.220994266809591e-06, + "loss": 0.3519, + "step": 2697 + }, + { + "epoch": 1.25468919547357, + "grad_norm": 0.46455973386764526, + "learning_rate": 7.218569616513533e-06, + "loss": 0.357, + "step": 2698 + }, + { + "epoch": 1.255154239652767, + "grad_norm": 0.37406978011131287, + "learning_rate": 7.2161443164344725e-06, + "loss": 0.3416, + "step": 2699 + }, + { + "epoch": 1.255619283831964, + "grad_norm": 0.43167954683303833, + "learning_rate": 7.213718367282737e-06, + "loss": 0.3567, + "step": 2700 + }, + { + "epoch": 1.256084328011161, + "grad_norm": 0.41151732206344604, + "learning_rate": 7.21129176976885e-06, + "loss": 0.3307, + "step": 2701 + }, + { + "epoch": 1.256549372190358, + "grad_norm": 0.4612469971179962, + "learning_rate": 7.208864524603523e-06, + "loss": 0.3684, + "step": 2702 + }, + { + "epoch": 1.2570144163695551, + "grad_norm": 0.4303331673145294, + "learning_rate": 7.206436632497656e-06, + "loss": 0.3358, + "step": 2703 + }, + { + "epoch": 1.257479460548752, + "grad_norm": 0.4344772696495056, + "learning_rate": 7.204008094162338e-06, + "loss": 0.3417, + "step": 2704 + }, + { + "epoch": 1.2579445047279492, + "grad_norm": 0.4836406409740448, + "learning_rate": 7.201578910308848e-06, + "loss": 0.3793, + "step": 2705 + }, + { + "epoch": 1.2584095489071463, + "grad_norm": 0.41068270802497864, + "learning_rate": 7.199149081648658e-06, + "loss": 0.3643, + "step": 2706 + }, + { + "epoch": 1.2588745930863432, + "grad_norm": 0.4286365211009979, + "learning_rate": 7.196718608893421e-06, + "loss": 0.3287, + "step": 2707 + }, + { + "epoch": 1.25933963726554, + "grad_norm": 0.4405948519706726, + "learning_rate": 7.1942874927549865e-06, + "loss": 0.3697, + "step": 2708 + }, + { + "epoch": 1.2598046814447372, + "grad_norm": 0.36979782581329346, + "learning_rate": 7.191855733945388e-06, + "loss": 0.3286, + "step": 2709 + }, + { + "epoch": 1.2602697256239344, + "grad_norm": 0.40628305077552795, + "learning_rate": 7.189423333176847e-06, + "loss": 0.3815, + "step": 2710 + }, + { + "epoch": 1.2607347698031313, + "grad_norm": 0.42872005701065063, + "learning_rate": 7.186990291161775e-06, + "loss": 0.3473, + "step": 2711 + }, + { + "epoch": 1.2611998139823284, + "grad_norm": 0.4314822852611542, + "learning_rate": 7.184556608612769e-06, + "loss": 0.3791, + "step": 2712 + }, + { + "epoch": 1.2616648581615253, + "grad_norm": 0.42616841197013855, + "learning_rate": 7.182122286242617e-06, + "loss": 0.3513, + "step": 2713 + }, + { + "epoch": 1.2621299023407224, + "grad_norm": 0.4406971037387848, + "learning_rate": 7.1796873247642925e-06, + "loss": 0.3609, + "step": 2714 + }, + { + "epoch": 1.2625949465199193, + "grad_norm": 0.38528046011924744, + "learning_rate": 7.177251724890957e-06, + "loss": 0.3372, + "step": 2715 + }, + { + "epoch": 1.2630599906991165, + "grad_norm": 0.4626089334487915, + "learning_rate": 7.1748154873359556e-06, + "loss": 0.3702, + "step": 2716 + }, + { + "epoch": 1.2635250348783134, + "grad_norm": 0.3866967260837555, + "learning_rate": 7.172378612812824e-06, + "loss": 0.3385, + "step": 2717 + }, + { + "epoch": 1.2639900790575105, + "grad_norm": 0.43559587001800537, + "learning_rate": 7.169941102035281e-06, + "loss": 0.3711, + "step": 2718 + }, + { + "epoch": 1.2644551232367074, + "grad_norm": 0.4723300635814667, + "learning_rate": 7.167502955717238e-06, + "loss": 0.3782, + "step": 2719 + }, + { + "epoch": 1.2649201674159045, + "grad_norm": 0.4137257933616638, + "learning_rate": 7.165064174572787e-06, + "loss": 0.3603, + "step": 2720 + }, + { + "epoch": 1.2653852115951016, + "grad_norm": 0.5321688652038574, + "learning_rate": 7.162624759316205e-06, + "loss": 0.3839, + "step": 2721 + }, + { + "epoch": 1.2658502557742985, + "grad_norm": 0.47028544545173645, + "learning_rate": 7.160184710661959e-06, + "loss": 0.3513, + "step": 2722 + }, + { + "epoch": 1.2663152999534955, + "grad_norm": 0.5355919599533081, + "learning_rate": 7.1577440293247e-06, + "loss": 0.3701, + "step": 2723 + }, + { + "epoch": 1.2667803441326926, + "grad_norm": 0.4389120936393738, + "learning_rate": 7.155302716019263e-06, + "loss": 0.3273, + "step": 2724 + }, + { + "epoch": 1.2672453883118897, + "grad_norm": 0.4850804805755615, + "learning_rate": 7.1528607714606706e-06, + "loss": 0.3639, + "step": 2725 + }, + { + "epoch": 1.2677104324910866, + "grad_norm": 0.4661966562271118, + "learning_rate": 7.1504181963641265e-06, + "loss": 0.3097, + "step": 2726 + }, + { + "epoch": 1.2681754766702837, + "grad_norm": 0.4354253113269806, + "learning_rate": 7.147974991445021e-06, + "loss": 0.3555, + "step": 2727 + }, + { + "epoch": 1.2686405208494806, + "grad_norm": 0.46909964084625244, + "learning_rate": 7.145531157418932e-06, + "loss": 0.3747, + "step": 2728 + }, + { + "epoch": 1.2691055650286778, + "grad_norm": 0.5229499340057373, + "learning_rate": 7.143086695001616e-06, + "loss": 0.3623, + "step": 2729 + }, + { + "epoch": 1.2695706092078747, + "grad_norm": 0.42865434288978577, + "learning_rate": 7.14064160490902e-06, + "loss": 0.3392, + "step": 2730 + }, + { + "epoch": 1.2700356533870718, + "grad_norm": 0.46530023217201233, + "learning_rate": 7.1381958878572665e-06, + "loss": 0.3611, + "step": 2731 + }, + { + "epoch": 1.270500697566269, + "grad_norm": 0.5126709938049316, + "learning_rate": 7.135749544562669e-06, + "loss": 0.3519, + "step": 2732 + }, + { + "epoch": 1.2709657417454658, + "grad_norm": 0.4154670536518097, + "learning_rate": 7.133302575741722e-06, + "loss": 0.3326, + "step": 2733 + }, + { + "epoch": 1.2714307859246627, + "grad_norm": 0.4609792232513428, + "learning_rate": 7.130854982111102e-06, + "loss": 0.375, + "step": 2734 + }, + { + "epoch": 1.2718958301038599, + "grad_norm": 0.4117323160171509, + "learning_rate": 7.128406764387667e-06, + "loss": 0.3381, + "step": 2735 + }, + { + "epoch": 1.272360874283057, + "grad_norm": 0.47705698013305664, + "learning_rate": 7.1259579232884655e-06, + "loss": 0.3487, + "step": 2736 + }, + { + "epoch": 1.272825918462254, + "grad_norm": 0.464181512594223, + "learning_rate": 7.12350845953072e-06, + "loss": 0.333, + "step": 2737 + }, + { + "epoch": 1.2732909626414508, + "grad_norm": 0.41154077649116516, + "learning_rate": 7.12105837383184e-06, + "loss": 0.3457, + "step": 2738 + }, + { + "epoch": 1.273756006820648, + "grad_norm": 0.4273618459701538, + "learning_rate": 7.118607666909413e-06, + "loss": 0.338, + "step": 2739 + }, + { + "epoch": 1.274221050999845, + "grad_norm": 0.4530821740627289, + "learning_rate": 7.116156339481215e-06, + "loss": 0.3423, + "step": 2740 + }, + { + "epoch": 1.274686095179042, + "grad_norm": 0.434441477060318, + "learning_rate": 7.113704392265196e-06, + "loss": 0.3672, + "step": 2741 + }, + { + "epoch": 1.275151139358239, + "grad_norm": 0.44967541098594666, + "learning_rate": 7.1112518259794946e-06, + "loss": 0.3257, + "step": 2742 + }, + { + "epoch": 1.275616183537436, + "grad_norm": 0.44372931122779846, + "learning_rate": 7.108798641342428e-06, + "loss": 0.313, + "step": 2743 + }, + { + "epoch": 1.2760812277166331, + "grad_norm": 0.4219914674758911, + "learning_rate": 7.106344839072492e-06, + "loss": 0.361, + "step": 2744 + }, + { + "epoch": 1.27654627189583, + "grad_norm": 0.4370742738246918, + "learning_rate": 7.103890419888367e-06, + "loss": 0.3304, + "step": 2745 + }, + { + "epoch": 1.2770113160750272, + "grad_norm": 0.41166791319847107, + "learning_rate": 7.1014353845089126e-06, + "loss": 0.3618, + "step": 2746 + }, + { + "epoch": 1.2774763602542243, + "grad_norm": 0.3665507137775421, + "learning_rate": 7.098979733653165e-06, + "loss": 0.3125, + "step": 2747 + }, + { + "epoch": 1.2779414044334212, + "grad_norm": 0.4135091006755829, + "learning_rate": 7.096523468040349e-06, + "loss": 0.3494, + "step": 2748 + }, + { + "epoch": 1.278406448612618, + "grad_norm": 0.5211425423622131, + "learning_rate": 7.094066588389864e-06, + "loss": 0.3392, + "step": 2749 + }, + { + "epoch": 1.2788714927918152, + "grad_norm": 0.4214104413986206, + "learning_rate": 7.09160909542129e-06, + "loss": 0.3606, + "step": 2750 + }, + { + "epoch": 1.2793365369710124, + "grad_norm": 0.4723619818687439, + "learning_rate": 7.089150989854385e-06, + "loss": 0.3315, + "step": 2751 + }, + { + "epoch": 1.2798015811502093, + "grad_norm": 0.5669287443161011, + "learning_rate": 7.08669227240909e-06, + "loss": 0.3577, + "step": 2752 + }, + { + "epoch": 1.2802666253294062, + "grad_norm": 0.43773379921913147, + "learning_rate": 7.084232943805522e-06, + "loss": 0.3266, + "step": 2753 + }, + { + "epoch": 1.2807316695086033, + "grad_norm": 0.4219588041305542, + "learning_rate": 7.081773004763981e-06, + "loss": 0.3465, + "step": 2754 + }, + { + "epoch": 1.2811967136878004, + "grad_norm": 0.45649266242980957, + "learning_rate": 7.079312456004941e-06, + "loss": 0.337, + "step": 2755 + }, + { + "epoch": 1.2816617578669973, + "grad_norm": 0.49470004439353943, + "learning_rate": 7.076851298249057e-06, + "loss": 0.3538, + "step": 2756 + }, + { + "epoch": 1.2821268020461944, + "grad_norm": 0.4066573679447174, + "learning_rate": 7.074389532217163e-06, + "loss": 0.3736, + "step": 2757 + }, + { + "epoch": 1.2825918462253914, + "grad_norm": 0.41550639271736145, + "learning_rate": 7.0719271586302675e-06, + "loss": 0.3284, + "step": 2758 + }, + { + "epoch": 1.2830568904045885, + "grad_norm": 0.45798173546791077, + "learning_rate": 7.069464178209563e-06, + "loss": 0.3366, + "step": 2759 + }, + { + "epoch": 1.2835219345837854, + "grad_norm": 0.44275960326194763, + "learning_rate": 7.067000591676416e-06, + "loss": 0.3668, + "step": 2760 + }, + { + "epoch": 1.2839869787629825, + "grad_norm": 0.40415266156196594, + "learning_rate": 7.06453639975237e-06, + "loss": 0.3625, + "step": 2761 + }, + { + "epoch": 1.2844520229421796, + "grad_norm": 0.4864114224910736, + "learning_rate": 7.062071603159147e-06, + "loss": 0.3692, + "step": 2762 + }, + { + "epoch": 1.2849170671213765, + "grad_norm": 0.4446645975112915, + "learning_rate": 7.059606202618646e-06, + "loss": 0.3724, + "step": 2763 + }, + { + "epoch": 1.2853821113005734, + "grad_norm": 0.406984806060791, + "learning_rate": 7.057140198852942e-06, + "loss": 0.3384, + "step": 2764 + }, + { + "epoch": 1.2858471554797706, + "grad_norm": 0.44974416494369507, + "learning_rate": 7.054673592584289e-06, + "loss": 0.3329, + "step": 2765 + }, + { + "epoch": 1.2863121996589677, + "grad_norm": 0.4763135313987732, + "learning_rate": 7.052206384535115e-06, + "loss": 0.3411, + "step": 2766 + }, + { + "epoch": 1.2867772438381646, + "grad_norm": 0.44069865345954895, + "learning_rate": 7.0497385754280255e-06, + "loss": 0.351, + "step": 2767 + }, + { + "epoch": 1.2872422880173615, + "grad_norm": 0.40506044030189514, + "learning_rate": 7.047270165985801e-06, + "loss": 0.3856, + "step": 2768 + }, + { + "epoch": 1.2877073321965586, + "grad_norm": 0.4121856689453125, + "learning_rate": 7.0448011569314e-06, + "loss": 0.3631, + "step": 2769 + }, + { + "epoch": 1.2881723763757558, + "grad_norm": 0.4210520088672638, + "learning_rate": 7.042331548987953e-06, + "loss": 0.3079, + "step": 2770 + }, + { + "epoch": 1.2886374205549527, + "grad_norm": 0.46315163373947144, + "learning_rate": 7.039861342878769e-06, + "loss": 0.3727, + "step": 2771 + }, + { + "epoch": 1.2891024647341498, + "grad_norm": 0.43790408968925476, + "learning_rate": 7.037390539327332e-06, + "loss": 0.3682, + "step": 2772 + }, + { + "epoch": 1.2895675089133467, + "grad_norm": 0.4229748845100403, + "learning_rate": 7.0349191390573e-06, + "loss": 0.3808, + "step": 2773 + }, + { + "epoch": 1.2900325530925438, + "grad_norm": 0.44016340374946594, + "learning_rate": 7.032447142792506e-06, + "loss": 0.3573, + "step": 2774 + }, + { + "epoch": 1.2904975972717407, + "grad_norm": 0.4395344853401184, + "learning_rate": 7.029974551256957e-06, + "loss": 0.3609, + "step": 2775 + }, + { + "epoch": 1.2909626414509379, + "grad_norm": 0.4566013514995575, + "learning_rate": 7.027501365174833e-06, + "loss": 0.3355, + "step": 2776 + }, + { + "epoch": 1.291427685630135, + "grad_norm": 0.44307535886764526, + "learning_rate": 7.025027585270495e-06, + "loss": 0.3421, + "step": 2777 + }, + { + "epoch": 1.291892729809332, + "grad_norm": 0.4520743489265442, + "learning_rate": 7.022553212268469e-06, + "loss": 0.3466, + "step": 2778 + }, + { + "epoch": 1.2923577739885288, + "grad_norm": 0.4748930335044861, + "learning_rate": 7.0200782468934605e-06, + "loss": 0.359, + "step": 2779 + }, + { + "epoch": 1.292822818167726, + "grad_norm": 0.4355354905128479, + "learning_rate": 7.017602689870345e-06, + "loss": 0.3346, + "step": 2780 + }, + { + "epoch": 1.293287862346923, + "grad_norm": 0.4658071994781494, + "learning_rate": 7.015126541924174e-06, + "loss": 0.3986, + "step": 2781 + }, + { + "epoch": 1.29375290652612, + "grad_norm": 0.45585206151008606, + "learning_rate": 7.012649803780171e-06, + "loss": 0.3595, + "step": 2782 + }, + { + "epoch": 1.294217950705317, + "grad_norm": 0.4219159781932831, + "learning_rate": 7.010172476163732e-06, + "loss": 0.3379, + "step": 2783 + }, + { + "epoch": 1.294682994884514, + "grad_norm": 0.40543264150619507, + "learning_rate": 7.007694559800427e-06, + "loss": 0.3518, + "step": 2784 + }, + { + "epoch": 1.2951480390637111, + "grad_norm": 0.4673357605934143, + "learning_rate": 7.005216055415996e-06, + "loss": 0.3548, + "step": 2785 + }, + { + "epoch": 1.295613083242908, + "grad_norm": 0.43243858218193054, + "learning_rate": 7.002736963736354e-06, + "loss": 0.3667, + "step": 2786 + }, + { + "epoch": 1.2960781274221052, + "grad_norm": 0.4363313913345337, + "learning_rate": 7.000257285487586e-06, + "loss": 0.3485, + "step": 2787 + }, + { + "epoch": 1.296543171601302, + "grad_norm": 0.44257044792175293, + "learning_rate": 6.997777021395949e-06, + "loss": 0.3625, + "step": 2788 + }, + { + "epoch": 1.2970082157804992, + "grad_norm": 0.448543906211853, + "learning_rate": 6.9952961721878734e-06, + "loss": 0.373, + "step": 2789 + }, + { + "epoch": 1.297473259959696, + "grad_norm": 0.41273754835128784, + "learning_rate": 6.992814738589958e-06, + "loss": 0.3761, + "step": 2790 + }, + { + "epoch": 1.2979383041388932, + "grad_norm": 0.540025532245636, + "learning_rate": 6.990332721328978e-06, + "loss": 0.3636, + "step": 2791 + }, + { + "epoch": 1.2984033483180903, + "grad_norm": 0.42229142785072327, + "learning_rate": 6.987850121131871e-06, + "loss": 0.3319, + "step": 2792 + }, + { + "epoch": 1.2988683924972872, + "grad_norm": 0.4574054777622223, + "learning_rate": 6.985366938725751e-06, + "loss": 0.3782, + "step": 2793 + }, + { + "epoch": 1.2993334366764842, + "grad_norm": 0.4920805096626282, + "learning_rate": 6.982883174837904e-06, + "loss": 0.376, + "step": 2794 + }, + { + "epoch": 1.2997984808556813, + "grad_norm": 0.40307578444480896, + "learning_rate": 6.980398830195785e-06, + "loss": 0.3285, + "step": 2795 + }, + { + "epoch": 1.3002635250348784, + "grad_norm": 0.41613009572029114, + "learning_rate": 6.977913905527016e-06, + "loss": 0.3433, + "step": 2796 + }, + { + "epoch": 1.3007285692140753, + "grad_norm": 0.44297316670417786, + "learning_rate": 6.975428401559392e-06, + "loss": 0.3558, + "step": 2797 + }, + { + "epoch": 1.3011936133932724, + "grad_norm": 0.3465144634246826, + "learning_rate": 6.972942319020876e-06, + "loss": 0.3213, + "step": 2798 + }, + { + "epoch": 1.3016586575724693, + "grad_norm": 0.40050673484802246, + "learning_rate": 6.9704556586396e-06, + "loss": 0.3431, + "step": 2799 + }, + { + "epoch": 1.3021237017516665, + "grad_norm": 0.45212817192077637, + "learning_rate": 6.967968421143869e-06, + "loss": 0.3546, + "step": 2800 + }, + { + "epoch": 1.3025887459308634, + "grad_norm": 0.4688425064086914, + "learning_rate": 6.965480607262154e-06, + "loss": 0.3559, + "step": 2801 + }, + { + "epoch": 1.3030537901100605, + "grad_norm": 0.4439482092857361, + "learning_rate": 6.962992217723094e-06, + "loss": 0.3234, + "step": 2802 + }, + { + "epoch": 1.3035188342892574, + "grad_norm": 0.43428176641464233, + "learning_rate": 6.960503253255499e-06, + "loss": 0.3435, + "step": 2803 + }, + { + "epoch": 1.3039838784684545, + "grad_norm": 0.3863378167152405, + "learning_rate": 6.958013714588348e-06, + "loss": 0.3557, + "step": 2804 + }, + { + "epoch": 1.3044489226476514, + "grad_norm": 0.40679073333740234, + "learning_rate": 6.95552360245078e-06, + "loss": 0.3292, + "step": 2805 + }, + { + "epoch": 1.3049139668268486, + "grad_norm": 0.45496124029159546, + "learning_rate": 6.953032917572116e-06, + "loss": 0.3961, + "step": 2806 + }, + { + "epoch": 1.3053790110060457, + "grad_norm": 0.3845018446445465, + "learning_rate": 6.950541660681833e-06, + "loss": 0.3522, + "step": 2807 + }, + { + "epoch": 1.3058440551852426, + "grad_norm": 0.4297010004520416, + "learning_rate": 6.94804983250958e-06, + "loss": 0.3595, + "step": 2808 + }, + { + "epoch": 1.3063090993644395, + "grad_norm": 0.3819620609283447, + "learning_rate": 6.945557433785175e-06, + "loss": 0.3066, + "step": 2809 + }, + { + "epoch": 1.3067741435436366, + "grad_norm": 0.42445626854896545, + "learning_rate": 6.943064465238598e-06, + "loss": 0.3727, + "step": 2810 + }, + { + "epoch": 1.3072391877228338, + "grad_norm": 0.4176945686340332, + "learning_rate": 6.940570927600001e-06, + "loss": 0.3371, + "step": 2811 + }, + { + "epoch": 1.3077042319020307, + "grad_norm": 0.4216483235359192, + "learning_rate": 6.9380768215996995e-06, + "loss": 0.337, + "step": 2812 + }, + { + "epoch": 1.3081692760812278, + "grad_norm": 0.4133038818836212, + "learning_rate": 6.93558214796818e-06, + "loss": 0.3371, + "step": 2813 + }, + { + "epoch": 1.3086343202604247, + "grad_norm": 0.47481316328048706, + "learning_rate": 6.933086907436087e-06, + "loss": 0.4007, + "step": 2814 + }, + { + "epoch": 1.3090993644396218, + "grad_norm": 0.41611889004707336, + "learning_rate": 6.93059110073424e-06, + "loss": 0.3471, + "step": 2815 + }, + { + "epoch": 1.3095644086188187, + "grad_norm": 0.45148274302482605, + "learning_rate": 6.928094728593617e-06, + "loss": 0.3308, + "step": 2816 + }, + { + "epoch": 1.3100294527980159, + "grad_norm": 0.4194914400577545, + "learning_rate": 6.9255977917453665e-06, + "loss": 0.3708, + "step": 2817 + }, + { + "epoch": 1.3104944969772128, + "grad_norm": 0.472514808177948, + "learning_rate": 6.923100290920801e-06, + "loss": 0.3499, + "step": 2818 + }, + { + "epoch": 1.31095954115641, + "grad_norm": 0.4619661569595337, + "learning_rate": 6.920602226851397e-06, + "loss": 0.3644, + "step": 2819 + }, + { + "epoch": 1.3114245853356068, + "grad_norm": 0.3975506126880646, + "learning_rate": 6.9181036002687985e-06, + "loss": 0.3624, + "step": 2820 + }, + { + "epoch": 1.311889629514804, + "grad_norm": 0.48552754521369934, + "learning_rate": 6.91560441190481e-06, + "loss": 0.3961, + "step": 2821 + }, + { + "epoch": 1.312354673694001, + "grad_norm": 0.4577177166938782, + "learning_rate": 6.913104662491406e-06, + "loss": 0.3915, + "step": 2822 + }, + { + "epoch": 1.312819717873198, + "grad_norm": 0.47331976890563965, + "learning_rate": 6.910604352760721e-06, + "loss": 0.3035, + "step": 2823 + }, + { + "epoch": 1.3132847620523949, + "grad_norm": 0.4498820900917053, + "learning_rate": 6.908103483445056e-06, + "loss": 0.3242, + "step": 2824 + }, + { + "epoch": 1.313749806231592, + "grad_norm": 0.5082352757453918, + "learning_rate": 6.905602055276872e-06, + "loss": 0.3974, + "step": 2825 + }, + { + "epoch": 1.3142148504107891, + "grad_norm": 0.4770672023296356, + "learning_rate": 6.9031000689888015e-06, + "loss": 0.3467, + "step": 2826 + }, + { + "epoch": 1.314679894589986, + "grad_norm": 0.5072925090789795, + "learning_rate": 6.9005975253136324e-06, + "loss": 0.3451, + "step": 2827 + }, + { + "epoch": 1.3151449387691831, + "grad_norm": 0.4119420051574707, + "learning_rate": 6.898094424984319e-06, + "loss": 0.3581, + "step": 2828 + }, + { + "epoch": 1.31560998294838, + "grad_norm": 0.453228235244751, + "learning_rate": 6.89559076873398e-06, + "loss": 0.3201, + "step": 2829 + }, + { + "epoch": 1.3160750271275772, + "grad_norm": 0.5295499563217163, + "learning_rate": 6.893086557295896e-06, + "loss": 0.387, + "step": 2830 + }, + { + "epoch": 1.316540071306774, + "grad_norm": 0.428733229637146, + "learning_rate": 6.890581791403509e-06, + "loss": 0.3681, + "step": 2831 + }, + { + "epoch": 1.3170051154859712, + "grad_norm": 0.4751463234424591, + "learning_rate": 6.888076471790423e-06, + "loss": 0.345, + "step": 2832 + }, + { + "epoch": 1.3174701596651681, + "grad_norm": 0.44530734419822693, + "learning_rate": 6.885570599190408e-06, + "loss": 0.3325, + "step": 2833 + }, + { + "epoch": 1.3179352038443652, + "grad_norm": 0.5325456857681274, + "learning_rate": 6.88306417433739e-06, + "loss": 0.3644, + "step": 2834 + }, + { + "epoch": 1.3184002480235621, + "grad_norm": 0.3941551446914673, + "learning_rate": 6.880557197965465e-06, + "loss": 0.3159, + "step": 2835 + }, + { + "epoch": 1.3188652922027593, + "grad_norm": 0.40330174565315247, + "learning_rate": 6.878049670808882e-06, + "loss": 0.3403, + "step": 2836 + }, + { + "epoch": 1.3193303363819564, + "grad_norm": 0.5025704503059387, + "learning_rate": 6.875541593602055e-06, + "loss": 0.3683, + "step": 2837 + }, + { + "epoch": 1.3197953805611533, + "grad_norm": 0.48347654938697815, + "learning_rate": 6.873032967079562e-06, + "loss": 0.3385, + "step": 2838 + }, + { + "epoch": 1.3202604247403502, + "grad_norm": 0.45224472880363464, + "learning_rate": 6.8705237919761344e-06, + "loss": 0.4031, + "step": 2839 + }, + { + "epoch": 1.3207254689195473, + "grad_norm": 0.4854915142059326, + "learning_rate": 6.868014069026672e-06, + "loss": 0.3297, + "step": 2840 + }, + { + "epoch": 1.3211905130987445, + "grad_norm": 0.5176591873168945, + "learning_rate": 6.865503798966232e-06, + "loss": 0.3377, + "step": 2841 + }, + { + "epoch": 1.3216555572779414, + "grad_norm": 0.43474873900413513, + "learning_rate": 6.86299298253003e-06, + "loss": 0.3378, + "step": 2842 + }, + { + "epoch": 1.3221206014571385, + "grad_norm": 0.5134937167167664, + "learning_rate": 6.860481620453445e-06, + "loss": 0.3775, + "step": 2843 + }, + { + "epoch": 1.3225856456363354, + "grad_norm": 0.47907009720802307, + "learning_rate": 6.857969713472015e-06, + "loss": 0.3326, + "step": 2844 + }, + { + "epoch": 1.3230506898155325, + "grad_norm": 0.4050777554512024, + "learning_rate": 6.855457262321433e-06, + "loss": 0.3613, + "step": 2845 + }, + { + "epoch": 1.3235157339947294, + "grad_norm": 0.4659248888492584, + "learning_rate": 6.852944267737557e-06, + "loss": 0.3669, + "step": 2846 + }, + { + "epoch": 1.3239807781739266, + "grad_norm": 0.42381158471107483, + "learning_rate": 6.850430730456403e-06, + "loss": 0.37, + "step": 2847 + }, + { + "epoch": 1.3244458223531235, + "grad_norm": 0.39930933713912964, + "learning_rate": 6.847916651214146e-06, + "loss": 0.3577, + "step": 2848 + }, + { + "epoch": 1.3249108665323206, + "grad_norm": 0.3815561532974243, + "learning_rate": 6.845402030747118e-06, + "loss": 0.3432, + "step": 2849 + }, + { + "epoch": 1.3253759107115175, + "grad_norm": 0.41214823722839355, + "learning_rate": 6.84288686979181e-06, + "loss": 0.3228, + "step": 2850 + }, + { + "epoch": 1.3258409548907146, + "grad_norm": 0.41365644335746765, + "learning_rate": 6.8403711690848715e-06, + "loss": 0.3858, + "step": 2851 + }, + { + "epoch": 1.3263059990699118, + "grad_norm": 0.4793543219566345, + "learning_rate": 6.83785492936311e-06, + "loss": 0.3466, + "step": 2852 + }, + { + "epoch": 1.3267710432491087, + "grad_norm": 0.39055493474006653, + "learning_rate": 6.8353381513634945e-06, + "loss": 0.3124, + "step": 2853 + }, + { + "epoch": 1.3272360874283056, + "grad_norm": 0.37652844190597534, + "learning_rate": 6.832820835823145e-06, + "loss": 0.3251, + "step": 2854 + }, + { + "epoch": 1.3277011316075027, + "grad_norm": 0.4325464069843292, + "learning_rate": 6.830302983479344e-06, + "loss": 0.3776, + "step": 2855 + }, + { + "epoch": 1.3281661757866998, + "grad_norm": 0.3975974917411804, + "learning_rate": 6.827784595069529e-06, + "loss": 0.3446, + "step": 2856 + }, + { + "epoch": 1.3286312199658967, + "grad_norm": 0.3837142288684845, + "learning_rate": 6.825265671331295e-06, + "loss": 0.3613, + "step": 2857 + }, + { + "epoch": 1.3290962641450939, + "grad_norm": 0.44566547870635986, + "learning_rate": 6.822746213002393e-06, + "loss": 0.3458, + "step": 2858 + }, + { + "epoch": 1.3295613083242908, + "grad_norm": 0.38540929555892944, + "learning_rate": 6.820226220820733e-06, + "loss": 0.3582, + "step": 2859 + }, + { + "epoch": 1.3300263525034879, + "grad_norm": 0.4004163444042206, + "learning_rate": 6.8177056955243794e-06, + "loss": 0.3327, + "step": 2860 + }, + { + "epoch": 1.3304913966826848, + "grad_norm": 0.3982474207878113, + "learning_rate": 6.815184637851553e-06, + "loss": 0.3696, + "step": 2861 + }, + { + "epoch": 1.330956440861882, + "grad_norm": 0.36453261971473694, + "learning_rate": 6.812663048540631e-06, + "loss": 0.3276, + "step": 2862 + }, + { + "epoch": 1.3314214850410788, + "grad_norm": 0.41965192556381226, + "learning_rate": 6.810140928330144e-06, + "loss": 0.3681, + "step": 2863 + }, + { + "epoch": 1.331886529220276, + "grad_norm": 0.4125366806983948, + "learning_rate": 6.807618277958783e-06, + "loss": 0.3285, + "step": 2864 + }, + { + "epoch": 1.3323515733994729, + "grad_norm": 0.3705972731113434, + "learning_rate": 6.805095098165388e-06, + "loss": 0.3291, + "step": 2865 + }, + { + "epoch": 1.33281661757867, + "grad_norm": 0.48080259561538696, + "learning_rate": 6.8025713896889615e-06, + "loss": 0.3379, + "step": 2866 + }, + { + "epoch": 1.333281661757867, + "grad_norm": 0.39710700511932373, + "learning_rate": 6.800047153268653e-06, + "loss": 0.3732, + "step": 2867 + }, + { + "epoch": 1.333746705937064, + "grad_norm": 0.3951427638530731, + "learning_rate": 6.797522389643772e-06, + "loss": 0.3569, + "step": 2868 + }, + { + "epoch": 1.334211750116261, + "grad_norm": 0.33918821811676025, + "learning_rate": 6.79499709955378e-06, + "loss": 0.3214, + "step": 2869 + }, + { + "epoch": 1.334676794295458, + "grad_norm": 0.39031147956848145, + "learning_rate": 6.792471283738293e-06, + "loss": 0.3722, + "step": 2870 + }, + { + "epoch": 1.3351418384746552, + "grad_norm": 0.3802085518836975, + "learning_rate": 6.789944942937084e-06, + "loss": 0.3364, + "step": 2871 + }, + { + "epoch": 1.335606882653852, + "grad_norm": 0.36190685629844666, + "learning_rate": 6.787418077890076e-06, + "loss": 0.3239, + "step": 2872 + }, + { + "epoch": 1.3360719268330492, + "grad_norm": 0.4449096620082855, + "learning_rate": 6.784890689337346e-06, + "loss": 0.3719, + "step": 2873 + }, + { + "epoch": 1.3365369710122461, + "grad_norm": 0.4153344929218292, + "learning_rate": 6.782362778019125e-06, + "loss": 0.3713, + "step": 2874 + }, + { + "epoch": 1.3370020151914432, + "grad_norm": 0.3987843692302704, + "learning_rate": 6.779834344675797e-06, + "loss": 0.3889, + "step": 2875 + }, + { + "epoch": 1.3374670593706401, + "grad_norm": 0.39704734086990356, + "learning_rate": 6.7773053900478995e-06, + "loss": 0.2937, + "step": 2876 + }, + { + "epoch": 1.3379321035498373, + "grad_norm": 0.42171093821525574, + "learning_rate": 6.774775914876123e-06, + "loss": 0.3604, + "step": 2877 + }, + { + "epoch": 1.3383971477290342, + "grad_norm": 0.42374035716056824, + "learning_rate": 6.7722459199013095e-06, + "loss": 0.3642, + "step": 2878 + }, + { + "epoch": 1.3388621919082313, + "grad_norm": 0.43570584058761597, + "learning_rate": 6.769715405864452e-06, + "loss": 0.391, + "step": 2879 + }, + { + "epoch": 1.3393272360874282, + "grad_norm": 0.361253947019577, + "learning_rate": 6.767184373506698e-06, + "loss": 0.3158, + "step": 2880 + }, + { + "epoch": 1.3397922802666253, + "grad_norm": 0.46323809027671814, + "learning_rate": 6.7646528235693445e-06, + "loss": 0.337, + "step": 2881 + }, + { + "epoch": 1.3402573244458225, + "grad_norm": 0.5015746355056763, + "learning_rate": 6.762120756793844e-06, + "loss": 0.3892, + "step": 2882 + }, + { + "epoch": 1.3407223686250194, + "grad_norm": 0.43376925587654114, + "learning_rate": 6.759588173921796e-06, + "loss": 0.3749, + "step": 2883 + }, + { + "epoch": 1.3411874128042163, + "grad_norm": 0.4221652150154114, + "learning_rate": 6.757055075694954e-06, + "loss": 0.3531, + "step": 2884 + }, + { + "epoch": 1.3416524569834134, + "grad_norm": 0.49050408601760864, + "learning_rate": 6.754521462855219e-06, + "loss": 0.3557, + "step": 2885 + }, + { + "epoch": 1.3421175011626105, + "grad_norm": 0.4941301941871643, + "learning_rate": 6.7519873361446475e-06, + "loss": 0.3587, + "step": 2886 + }, + { + "epoch": 1.3425825453418074, + "grad_norm": 0.378880113363266, + "learning_rate": 6.749452696305442e-06, + "loss": 0.3779, + "step": 2887 + }, + { + "epoch": 1.3430475895210046, + "grad_norm": 0.41799113154411316, + "learning_rate": 6.74691754407996e-06, + "loss": 0.339, + "step": 2888 + }, + { + "epoch": 1.3435126337002015, + "grad_norm": 0.39678072929382324, + "learning_rate": 6.744381880210703e-06, + "loss": 0.3487, + "step": 2889 + }, + { + "epoch": 1.3439776778793986, + "grad_norm": 0.41609761118888855, + "learning_rate": 6.741845705440329e-06, + "loss": 0.3937, + "step": 2890 + }, + { + "epoch": 1.3444427220585955, + "grad_norm": 0.4008464217185974, + "learning_rate": 6.739309020511641e-06, + "loss": 0.3155, + "step": 2891 + }, + { + "epoch": 1.3449077662377926, + "grad_norm": 0.45266151428222656, + "learning_rate": 6.736771826167592e-06, + "loss": 0.3634, + "step": 2892 + }, + { + "epoch": 1.3453728104169898, + "grad_norm": 0.423709511756897, + "learning_rate": 6.734234123151284e-06, + "loss": 0.3839, + "step": 2893 + }, + { + "epoch": 1.3458378545961867, + "grad_norm": 0.4130350649356842, + "learning_rate": 6.731695912205974e-06, + "loss": 0.3357, + "step": 2894 + }, + { + "epoch": 1.3463028987753836, + "grad_norm": 0.38522496819496155, + "learning_rate": 6.7291571940750575e-06, + "loss": 0.316, + "step": 2895 + }, + { + "epoch": 1.3467679429545807, + "grad_norm": 0.47391700744628906, + "learning_rate": 6.726617969502088e-06, + "loss": 0.3453, + "step": 2896 + }, + { + "epoch": 1.3472329871337778, + "grad_norm": 0.3721844255924225, + "learning_rate": 6.724078239230758e-06, + "loss": 0.3274, + "step": 2897 + }, + { + "epoch": 1.3476980313129747, + "grad_norm": 0.45813339948654175, + "learning_rate": 6.721538004004918e-06, + "loss": 0.3747, + "step": 2898 + }, + { + "epoch": 1.3481630754921716, + "grad_norm": 0.44626984000205994, + "learning_rate": 6.71899726456856e-06, + "loss": 0.3592, + "step": 2899 + }, + { + "epoch": 1.3486281196713688, + "grad_norm": 0.4165315330028534, + "learning_rate": 6.716456021665825e-06, + "loss": 0.3405, + "step": 2900 + }, + { + "epoch": 1.3490931638505659, + "grad_norm": 0.40565526485443115, + "learning_rate": 6.713914276041001e-06, + "loss": 0.351, + "step": 2901 + }, + { + "epoch": 1.3495582080297628, + "grad_norm": 0.4150947332382202, + "learning_rate": 6.7113720284385255e-06, + "loss": 0.3609, + "step": 2902 + }, + { + "epoch": 1.35002325220896, + "grad_norm": 0.35937878489494324, + "learning_rate": 6.708829279602982e-06, + "loss": 0.3121, + "step": 2903 + }, + { + "epoch": 1.3504882963881568, + "grad_norm": 0.4622684121131897, + "learning_rate": 6.7062860302790965e-06, + "loss": 0.3757, + "step": 2904 + }, + { + "epoch": 1.350953340567354, + "grad_norm": 0.3929741680622101, + "learning_rate": 6.70374228121175e-06, + "loss": 0.3468, + "step": 2905 + }, + { + "epoch": 1.3514183847465508, + "grad_norm": 0.3833123743534088, + "learning_rate": 6.7011980331459635e-06, + "loss": 0.3254, + "step": 2906 + }, + { + "epoch": 1.351883428925748, + "grad_norm": 0.4471578598022461, + "learning_rate": 6.698653286826906e-06, + "loss": 0.3819, + "step": 2907 + }, + { + "epoch": 1.352348473104945, + "grad_norm": 0.42805638909339905, + "learning_rate": 6.696108042999892e-06, + "loss": 0.3425, + "step": 2908 + }, + { + "epoch": 1.352813517284142, + "grad_norm": 0.40058985352516174, + "learning_rate": 6.693562302410384e-06, + "loss": 0.3844, + "step": 2909 + }, + { + "epoch": 1.353278561463339, + "grad_norm": 0.417976051568985, + "learning_rate": 6.6910160658039835e-06, + "loss": 0.3821, + "step": 2910 + }, + { + "epoch": 1.353743605642536, + "grad_norm": 0.36969244480133057, + "learning_rate": 6.6884693339264466e-06, + "loss": 0.3038, + "step": 2911 + }, + { + "epoch": 1.3542086498217332, + "grad_norm": 0.36144527792930603, + "learning_rate": 6.685922107523667e-06, + "loss": 0.3405, + "step": 2912 + }, + { + "epoch": 1.35467369400093, + "grad_norm": 0.4299263060092926, + "learning_rate": 6.683374387341688e-06, + "loss": 0.3538, + "step": 2913 + }, + { + "epoch": 1.355138738180127, + "grad_norm": 0.41073358058929443, + "learning_rate": 6.680826174126693e-06, + "loss": 0.3451, + "step": 2914 + }, + { + "epoch": 1.355603782359324, + "grad_norm": 0.42276236414909363, + "learning_rate": 6.678277468625014e-06, + "loss": 0.3641, + "step": 2915 + }, + { + "epoch": 1.3560688265385212, + "grad_norm": 0.39464181661605835, + "learning_rate": 6.675728271583124e-06, + "loss": 0.3313, + "step": 2916 + }, + { + "epoch": 1.3565338707177181, + "grad_norm": 0.3857058882713318, + "learning_rate": 6.673178583747644e-06, + "loss": 0.3363, + "step": 2917 + }, + { + "epoch": 1.3569989148969153, + "grad_norm": 0.40449684858322144, + "learning_rate": 6.670628405865334e-06, + "loss": 0.3499, + "step": 2918 + }, + { + "epoch": 1.3574639590761122, + "grad_norm": 0.43879038095474243, + "learning_rate": 6.6680777386831e-06, + "loss": 0.3781, + "step": 2919 + }, + { + "epoch": 1.3579290032553093, + "grad_norm": 0.4013797640800476, + "learning_rate": 6.665526582947991e-06, + "loss": 0.3759, + "step": 2920 + }, + { + "epoch": 1.3583940474345062, + "grad_norm": 0.5068005323410034, + "learning_rate": 6.6629749394071995e-06, + "loss": 0.3319, + "step": 2921 + }, + { + "epoch": 1.3588590916137033, + "grad_norm": 0.43432286381721497, + "learning_rate": 6.6604228088080605e-06, + "loss": 0.3545, + "step": 2922 + }, + { + "epoch": 1.3593241357929005, + "grad_norm": 0.5053911805152893, + "learning_rate": 6.657870191898051e-06, + "loss": 0.3545, + "step": 2923 + }, + { + "epoch": 1.3597891799720974, + "grad_norm": 0.37629908323287964, + "learning_rate": 6.655317089424791e-06, + "loss": 0.3084, + "step": 2924 + }, + { + "epoch": 1.3602542241512943, + "grad_norm": 0.40685826539993286, + "learning_rate": 6.652763502136044e-06, + "loss": 0.3445, + "step": 2925 + }, + { + "epoch": 1.3607192683304914, + "grad_norm": 0.46807217597961426, + "learning_rate": 6.6502094307797124e-06, + "loss": 0.3464, + "step": 2926 + }, + { + "epoch": 1.3611843125096885, + "grad_norm": 0.43712517619132996, + "learning_rate": 6.647654876103844e-06, + "loss": 0.3325, + "step": 2927 + }, + { + "epoch": 1.3616493566888854, + "grad_norm": 0.5149297118186951, + "learning_rate": 6.645099838856624e-06, + "loss": 0.405, + "step": 2928 + }, + { + "epoch": 1.3621144008680823, + "grad_norm": 0.41195258498191833, + "learning_rate": 6.6425443197863836e-06, + "loss": 0.3373, + "step": 2929 + }, + { + "epoch": 1.3625794450472795, + "grad_norm": 0.5708654522895813, + "learning_rate": 6.639988319641592e-06, + "loss": 0.3448, + "step": 2930 + }, + { + "epoch": 1.3630444892264766, + "grad_norm": 0.45768019556999207, + "learning_rate": 6.637431839170861e-06, + "loss": 0.3502, + "step": 2931 + }, + { + "epoch": 1.3635095334056735, + "grad_norm": 0.42099401354789734, + "learning_rate": 6.6348748791229416e-06, + "loss": 0.3665, + "step": 2932 + }, + { + "epoch": 1.3639745775848706, + "grad_norm": 0.45441335439682007, + "learning_rate": 6.632317440246725e-06, + "loss": 0.3322, + "step": 2933 + }, + { + "epoch": 1.3644396217640675, + "grad_norm": 0.5002641677856445, + "learning_rate": 6.629759523291242e-06, + "loss": 0.3342, + "step": 2934 + }, + { + "epoch": 1.3649046659432646, + "grad_norm": 0.4336529076099396, + "learning_rate": 6.627201129005672e-06, + "loss": 0.3839, + "step": 2935 + }, + { + "epoch": 1.3653697101224616, + "grad_norm": 0.47555792331695557, + "learning_rate": 6.624642258139318e-06, + "loss": 0.3358, + "step": 2936 + }, + { + "epoch": 1.3658347543016587, + "grad_norm": 0.47870880365371704, + "learning_rate": 6.622082911441637e-06, + "loss": 0.3573, + "step": 2937 + }, + { + "epoch": 1.3662997984808558, + "grad_norm": 0.44583725929260254, + "learning_rate": 6.619523089662219e-06, + "loss": 0.3536, + "step": 2938 + }, + { + "epoch": 1.3667648426600527, + "grad_norm": 0.44511839747428894, + "learning_rate": 6.616962793550794e-06, + "loss": 0.3496, + "step": 2939 + }, + { + "epoch": 1.3672298868392496, + "grad_norm": 0.4455788731575012, + "learning_rate": 6.614402023857231e-06, + "loss": 0.3345, + "step": 2940 + }, + { + "epoch": 1.3676949310184467, + "grad_norm": 0.42460909485816956, + "learning_rate": 6.61184078133154e-06, + "loss": 0.3473, + "step": 2941 + }, + { + "epoch": 1.3681599751976439, + "grad_norm": 0.44328975677490234, + "learning_rate": 6.609279066723865e-06, + "loss": 0.3733, + "step": 2942 + }, + { + "epoch": 1.3686250193768408, + "grad_norm": 0.4914466142654419, + "learning_rate": 6.606716880784491e-06, + "loss": 0.373, + "step": 2943 + }, + { + "epoch": 1.369090063556038, + "grad_norm": 0.4081343710422516, + "learning_rate": 6.604154224263839e-06, + "loss": 0.3409, + "step": 2944 + }, + { + "epoch": 1.3695551077352348, + "grad_norm": 0.44254809617996216, + "learning_rate": 6.601591097912472e-06, + "loss": 0.3659, + "step": 2945 + }, + { + "epoch": 1.370020151914432, + "grad_norm": 0.4107268452644348, + "learning_rate": 6.599027502481088e-06, + "loss": 0.3359, + "step": 2946 + }, + { + "epoch": 1.3704851960936288, + "grad_norm": 0.4272344410419464, + "learning_rate": 6.596463438720522e-06, + "loss": 0.3476, + "step": 2947 + }, + { + "epoch": 1.370950240272826, + "grad_norm": 0.47288623452186584, + "learning_rate": 6.593898907381746e-06, + "loss": 0.3282, + "step": 2948 + }, + { + "epoch": 1.3714152844520229, + "grad_norm": 0.4827193319797516, + "learning_rate": 6.59133390921587e-06, + "loss": 0.3738, + "step": 2949 + }, + { + "epoch": 1.37188032863122, + "grad_norm": 0.4004787504673004, + "learning_rate": 6.588768444974141e-06, + "loss": 0.3441, + "step": 2950 + }, + { + "epoch": 1.372345372810417, + "grad_norm": 0.44598275423049927, + "learning_rate": 6.58620251540794e-06, + "loss": 0.3349, + "step": 2951 + }, + { + "epoch": 1.372810416989614, + "grad_norm": 0.49700048565864563, + "learning_rate": 6.583636121268787e-06, + "loss": 0.3879, + "step": 2952 + }, + { + "epoch": 1.3732754611688112, + "grad_norm": 0.46100863814353943, + "learning_rate": 6.581069263308338e-06, + "loss": 0.3708, + "step": 2953 + }, + { + "epoch": 1.373740505348008, + "grad_norm": 0.47054529190063477, + "learning_rate": 6.5785019422783836e-06, + "loss": 0.352, + "step": 2954 + }, + { + "epoch": 1.374205549527205, + "grad_norm": 0.4136134386062622, + "learning_rate": 6.57593415893085e-06, + "loss": 0.3491, + "step": 2955 + }, + { + "epoch": 1.374670593706402, + "grad_norm": 0.49520325660705566, + "learning_rate": 6.5733659140178e-06, + "loss": 0.3456, + "step": 2956 + }, + { + "epoch": 1.3751356378855992, + "grad_norm": 0.3895581066608429, + "learning_rate": 6.5707972082914275e-06, + "loss": 0.3026, + "step": 2957 + }, + { + "epoch": 1.3756006820647961, + "grad_norm": 0.4688425660133362, + "learning_rate": 6.56822804250407e-06, + "loss": 0.3848, + "step": 2958 + }, + { + "epoch": 1.3760657262439933, + "grad_norm": 0.45133891701698303, + "learning_rate": 6.56565841740819e-06, + "loss": 0.3607, + "step": 2959 + }, + { + "epoch": 1.3765307704231902, + "grad_norm": 0.49955958127975464, + "learning_rate": 6.5630883337563935e-06, + "loss": 0.3493, + "step": 2960 + }, + { + "epoch": 1.3769958146023873, + "grad_norm": 0.43193817138671875, + "learning_rate": 6.560517792301412e-06, + "loss": 0.3393, + "step": 2961 + }, + { + "epoch": 1.3774608587815842, + "grad_norm": 0.4623286724090576, + "learning_rate": 6.557946793796116e-06, + "loss": 0.367, + "step": 2962 + }, + { + "epoch": 1.3779259029607813, + "grad_norm": 0.44610491394996643, + "learning_rate": 6.55537533899351e-06, + "loss": 0.3393, + "step": 2963 + }, + { + "epoch": 1.3783909471399782, + "grad_norm": 0.4373582601547241, + "learning_rate": 6.552803428646732e-06, + "loss": 0.3219, + "step": 2964 + }, + { + "epoch": 1.3788559913191754, + "grad_norm": 0.4308757185935974, + "learning_rate": 6.550231063509054e-06, + "loss": 0.3711, + "step": 2965 + }, + { + "epoch": 1.3793210354983723, + "grad_norm": 0.42997416853904724, + "learning_rate": 6.547658244333876e-06, + "loss": 0.3208, + "step": 2966 + }, + { + "epoch": 1.3797860796775694, + "grad_norm": 0.4055032432079315, + "learning_rate": 6.545084971874738e-06, + "loss": 0.3543, + "step": 2967 + }, + { + "epoch": 1.3802511238567665, + "grad_norm": 0.39875516295433044, + "learning_rate": 6.542511246885308e-06, + "loss": 0.3392, + "step": 2968 + }, + { + "epoch": 1.3807161680359634, + "grad_norm": 0.4401739239692688, + "learning_rate": 6.539937070119389e-06, + "loss": 0.3887, + "step": 2969 + }, + { + "epoch": 1.3811812122151603, + "grad_norm": 0.38288089632987976, + "learning_rate": 6.5373624423309165e-06, + "loss": 0.3448, + "step": 2970 + }, + { + "epoch": 1.3816462563943575, + "grad_norm": 0.44091928005218506, + "learning_rate": 6.534787364273957e-06, + "loss": 0.3297, + "step": 2971 + }, + { + "epoch": 1.3821113005735546, + "grad_norm": 0.5266174674034119, + "learning_rate": 6.532211836702708e-06, + "loss": 0.3979, + "step": 2972 + }, + { + "epoch": 1.3825763447527515, + "grad_norm": 0.40957939624786377, + "learning_rate": 6.529635860371497e-06, + "loss": 0.3353, + "step": 2973 + }, + { + "epoch": 1.3830413889319486, + "grad_norm": 0.4384903311729431, + "learning_rate": 6.527059436034791e-06, + "loss": 0.3386, + "step": 2974 + }, + { + "epoch": 1.3835064331111455, + "grad_norm": 0.3974456787109375, + "learning_rate": 6.524482564447181e-06, + "loss": 0.3046, + "step": 2975 + }, + { + "epoch": 1.3839714772903426, + "grad_norm": 0.43884211778640747, + "learning_rate": 6.521905246363389e-06, + "loss": 0.412, + "step": 2976 + }, + { + "epoch": 1.3844365214695395, + "grad_norm": 0.4392377436161041, + "learning_rate": 6.5193274825382724e-06, + "loss": 0.3413, + "step": 2977 + }, + { + "epoch": 1.3849015656487367, + "grad_norm": 0.4340946674346924, + "learning_rate": 6.516749273726814e-06, + "loss": 0.3686, + "step": 2978 + }, + { + "epoch": 1.3853666098279336, + "grad_norm": 0.3607385456562042, + "learning_rate": 6.514170620684128e-06, + "loss": 0.3132, + "step": 2979 + }, + { + "epoch": 1.3858316540071307, + "grad_norm": 0.5139884352684021, + "learning_rate": 6.511591524165465e-06, + "loss": 0.3757, + "step": 2980 + }, + { + "epoch": 1.3862966981863276, + "grad_norm": 0.43492573499679565, + "learning_rate": 6.509011984926197e-06, + "loss": 0.3282, + "step": 2981 + }, + { + "epoch": 1.3867617423655247, + "grad_norm": 0.3985266089439392, + "learning_rate": 6.50643200372183e-06, + "loss": 0.3391, + "step": 2982 + }, + { + "epoch": 1.3872267865447219, + "grad_norm": 0.5525508522987366, + "learning_rate": 6.503851581307997e-06, + "loss": 0.4244, + "step": 2983 + }, + { + "epoch": 1.3876918307239188, + "grad_norm": 0.3948424756526947, + "learning_rate": 6.5012707184404645e-06, + "loss": 0.3381, + "step": 2984 + }, + { + "epoch": 1.3881568749031157, + "grad_norm": 0.4646177589893341, + "learning_rate": 6.498689415875121e-06, + "loss": 0.3424, + "step": 2985 + }, + { + "epoch": 1.3886219190823128, + "grad_norm": 0.4283245801925659, + "learning_rate": 6.496107674367994e-06, + "loss": 0.3178, + "step": 2986 + }, + { + "epoch": 1.38908696326151, + "grad_norm": 0.45199066400527954, + "learning_rate": 6.49352549467523e-06, + "loss": 0.3601, + "step": 2987 + }, + { + "epoch": 1.3895520074407068, + "grad_norm": 0.3850160837173462, + "learning_rate": 6.4909428775531095e-06, + "loss": 0.3472, + "step": 2988 + }, + { + "epoch": 1.390017051619904, + "grad_norm": 0.4370267391204834, + "learning_rate": 6.488359823758036e-06, + "loss": 0.3827, + "step": 2989 + }, + { + "epoch": 1.3904820957991009, + "grad_norm": 0.4112876355648041, + "learning_rate": 6.485776334046546e-06, + "loss": 0.3564, + "step": 2990 + }, + { + "epoch": 1.390947139978298, + "grad_norm": 0.4564457833766937, + "learning_rate": 6.483192409175301e-06, + "loss": 0.3507, + "step": 2991 + }, + { + "epoch": 1.391412184157495, + "grad_norm": 0.4476487636566162, + "learning_rate": 6.4806080499010916e-06, + "loss": 0.3607, + "step": 2992 + }, + { + "epoch": 1.391877228336692, + "grad_norm": 0.4968775808811188, + "learning_rate": 6.478023256980835e-06, + "loss": 0.3816, + "step": 2993 + }, + { + "epoch": 1.392342272515889, + "grad_norm": 0.38123828172683716, + "learning_rate": 6.475438031171574e-06, + "loss": 0.3263, + "step": 2994 + }, + { + "epoch": 1.392807316695086, + "grad_norm": 0.46535784006118774, + "learning_rate": 6.472852373230478e-06, + "loss": 0.3571, + "step": 2995 + }, + { + "epoch": 1.393272360874283, + "grad_norm": 0.6109573841094971, + "learning_rate": 6.4702662839148476e-06, + "loss": 0.3365, + "step": 2996 + }, + { + "epoch": 1.39373740505348, + "grad_norm": 0.41744059324264526, + "learning_rate": 6.467679763982103e-06, + "loss": 0.3295, + "step": 2997 + }, + { + "epoch": 1.3942024492326772, + "grad_norm": 0.3972165286540985, + "learning_rate": 6.465092814189795e-06, + "loss": 0.3273, + "step": 2998 + }, + { + "epoch": 1.3946674934118741, + "grad_norm": 0.5437119007110596, + "learning_rate": 6.462505435295601e-06, + "loss": 0.3738, + "step": 2999 + }, + { + "epoch": 1.395132537591071, + "grad_norm": 0.4269973337650299, + "learning_rate": 6.459917628057319e-06, + "loss": 0.321, + "step": 3000 + }, + { + "epoch": 1.3955975817702682, + "grad_norm": 0.46754154562950134, + "learning_rate": 6.457329393232878e-06, + "loss": 0.3345, + "step": 3001 + }, + { + "epoch": 1.3960626259494653, + "grad_norm": 0.5135738253593445, + "learning_rate": 6.454740731580331e-06, + "loss": 0.3541, + "step": 3002 + }, + { + "epoch": 1.3965276701286622, + "grad_norm": 0.5451841950416565, + "learning_rate": 6.452151643857851e-06, + "loss": 0.3808, + "step": 3003 + }, + { + "epoch": 1.3969927143078593, + "grad_norm": 0.5181593298912048, + "learning_rate": 6.4495621308237435e-06, + "loss": 0.348, + "step": 3004 + }, + { + "epoch": 1.3974577584870562, + "grad_norm": 0.5118134617805481, + "learning_rate": 6.446972193236433e-06, + "loss": 0.3291, + "step": 3005 + }, + { + "epoch": 1.3979228026662534, + "grad_norm": 0.48063331842422485, + "learning_rate": 6.444381831854469e-06, + "loss": 0.3796, + "step": 3006 + }, + { + "epoch": 1.3983878468454503, + "grad_norm": 0.4797370433807373, + "learning_rate": 6.44179104743653e-06, + "loss": 0.3617, + "step": 3007 + }, + { + "epoch": 1.3988528910246474, + "grad_norm": 0.4799365699291229, + "learning_rate": 6.439199840741412e-06, + "loss": 0.3273, + "step": 3008 + }, + { + "epoch": 1.3993179352038443, + "grad_norm": 0.4450487494468689, + "learning_rate": 6.436608212528037e-06, + "loss": 0.3871, + "step": 3009 + }, + { + "epoch": 1.3997829793830414, + "grad_norm": 0.4422202706336975, + "learning_rate": 6.434016163555452e-06, + "loss": 0.3508, + "step": 3010 + }, + { + "epoch": 1.4002480235622383, + "grad_norm": 0.4393402636051178, + "learning_rate": 6.431423694582825e-06, + "loss": 0.3253, + "step": 3011 + }, + { + "epoch": 1.4007130677414354, + "grad_norm": 0.4574717879295349, + "learning_rate": 6.428830806369451e-06, + "loss": 0.3499, + "step": 3012 + }, + { + "epoch": 1.4011781119206326, + "grad_norm": 0.4212042987346649, + "learning_rate": 6.42623749967474e-06, + "loss": 0.353, + "step": 3013 + }, + { + "epoch": 1.4016431560998295, + "grad_norm": 0.3749128580093384, + "learning_rate": 6.423643775258232e-06, + "loss": 0.3304, + "step": 3014 + }, + { + "epoch": 1.4021082002790264, + "grad_norm": 0.4546009302139282, + "learning_rate": 6.421049633879588e-06, + "loss": 0.3276, + "step": 3015 + }, + { + "epoch": 1.4025732444582235, + "grad_norm": 0.4350774586200714, + "learning_rate": 6.418455076298587e-06, + "loss": 0.3725, + "step": 3016 + }, + { + "epoch": 1.4030382886374206, + "grad_norm": 0.4265797436237335, + "learning_rate": 6.415860103275136e-06, + "loss": 0.3663, + "step": 3017 + }, + { + "epoch": 1.4035033328166175, + "grad_norm": 0.3795028030872345, + "learning_rate": 6.413264715569259e-06, + "loss": 0.3185, + "step": 3018 + }, + { + "epoch": 1.4039683769958147, + "grad_norm": 0.39534875750541687, + "learning_rate": 6.4106689139411015e-06, + "loss": 0.3468, + "step": 3019 + }, + { + "epoch": 1.4044334211750116, + "grad_norm": 0.3997578024864197, + "learning_rate": 6.408072699150933e-06, + "loss": 0.371, + "step": 3020 + }, + { + "epoch": 1.4048984653542087, + "grad_norm": 0.3619006872177124, + "learning_rate": 6.405476071959142e-06, + "loss": 0.3284, + "step": 3021 + }, + { + "epoch": 1.4053635095334056, + "grad_norm": 0.41694512963294983, + "learning_rate": 6.402879033126239e-06, + "loss": 0.3559, + "step": 3022 + }, + { + "epoch": 1.4058285537126027, + "grad_norm": 0.41265085339546204, + "learning_rate": 6.400281583412855e-06, + "loss": 0.3171, + "step": 3023 + }, + { + "epoch": 1.4062935978917996, + "grad_norm": 0.5016796588897705, + "learning_rate": 6.397683723579741e-06, + "loss": 0.3731, + "step": 3024 + }, + { + "epoch": 1.4067586420709968, + "grad_norm": 0.39977139234542847, + "learning_rate": 6.395085454387766e-06, + "loss": 0.3065, + "step": 3025 + }, + { + "epoch": 1.4072236862501937, + "grad_norm": 0.41781142354011536, + "learning_rate": 6.392486776597921e-06, + "loss": 0.3491, + "step": 3026 + }, + { + "epoch": 1.4076887304293908, + "grad_norm": 0.4563649594783783, + "learning_rate": 6.389887690971319e-06, + "loss": 0.3464, + "step": 3027 + }, + { + "epoch": 1.408153774608588, + "grad_norm": 0.4177623987197876, + "learning_rate": 6.387288198269189e-06, + "loss": 0.3782, + "step": 3028 + }, + { + "epoch": 1.4086188187877848, + "grad_norm": 0.3898913264274597, + "learning_rate": 6.384688299252879e-06, + "loss": 0.3564, + "step": 3029 + }, + { + "epoch": 1.4090838629669817, + "grad_norm": 0.3614499568939209, + "learning_rate": 6.3820879946838585e-06, + "loss": 0.3215, + "step": 3030 + }, + { + "epoch": 1.4095489071461789, + "grad_norm": 0.45202839374542236, + "learning_rate": 6.379487285323713e-06, + "loss": 0.3829, + "step": 3031 + }, + { + "epoch": 1.410013951325376, + "grad_norm": 0.39168378710746765, + "learning_rate": 6.3768861719341475e-06, + "loss": 0.3619, + "step": 3032 + }, + { + "epoch": 1.410478995504573, + "grad_norm": 0.3808189630508423, + "learning_rate": 6.374284655276989e-06, + "loss": 0.3376, + "step": 3033 + }, + { + "epoch": 1.41094403968377, + "grad_norm": 0.43050652742385864, + "learning_rate": 6.371682736114178e-06, + "loss": 0.3206, + "step": 3034 + }, + { + "epoch": 1.411409083862967, + "grad_norm": 0.4489372968673706, + "learning_rate": 6.369080415207773e-06, + "loss": 0.3917, + "step": 3035 + }, + { + "epoch": 1.411874128042164, + "grad_norm": 0.37065914273262024, + "learning_rate": 6.366477693319953e-06, + "loss": 0.3446, + "step": 3036 + }, + { + "epoch": 1.412339172221361, + "grad_norm": 0.3754143714904785, + "learning_rate": 6.363874571213013e-06, + "loss": 0.356, + "step": 3037 + }, + { + "epoch": 1.412804216400558, + "grad_norm": 0.3956548273563385, + "learning_rate": 6.361271049649363e-06, + "loss": 0.3558, + "step": 3038 + }, + { + "epoch": 1.4132692605797552, + "grad_norm": 0.4007726013660431, + "learning_rate": 6.358667129391536e-06, + "loss": 0.3204, + "step": 3039 + }, + { + "epoch": 1.4137343047589521, + "grad_norm": 0.42144933342933655, + "learning_rate": 6.356062811202175e-06, + "loss": 0.3674, + "step": 3040 + }, + { + "epoch": 1.414199348938149, + "grad_norm": 0.4103334844112396, + "learning_rate": 6.3534580958440425e-06, + "loss": 0.3627, + "step": 3041 + }, + { + "epoch": 1.4146643931173462, + "grad_norm": 0.34175369143486023, + "learning_rate": 6.35085298408002e-06, + "loss": 0.3123, + "step": 3042 + }, + { + "epoch": 1.4151294372965433, + "grad_norm": 0.38748347759246826, + "learning_rate": 6.348247476673099e-06, + "loss": 0.3575, + "step": 3043 + }, + { + "epoch": 1.4155944814757402, + "grad_norm": 0.413284569978714, + "learning_rate": 6.345641574386393e-06, + "loss": 0.3341, + "step": 3044 + }, + { + "epoch": 1.416059525654937, + "grad_norm": 0.37702399492263794, + "learning_rate": 6.3430352779831275e-06, + "loss": 0.3339, + "step": 3045 + }, + { + "epoch": 1.4165245698341342, + "grad_norm": 0.4544568955898285, + "learning_rate": 6.340428588226643e-06, + "loss": 0.3564, + "step": 3046 + }, + { + "epoch": 1.4169896140133313, + "grad_norm": 0.3882623016834259, + "learning_rate": 6.337821505880399e-06, + "loss": 0.3699, + "step": 3047 + }, + { + "epoch": 1.4174546581925282, + "grad_norm": 0.3817591369152069, + "learning_rate": 6.335214031707966e-06, + "loss": 0.3165, + "step": 3048 + }, + { + "epoch": 1.4179197023717254, + "grad_norm": 0.39398717880249023, + "learning_rate": 6.33260616647303e-06, + "loss": 0.3486, + "step": 3049 + }, + { + "epoch": 1.4183847465509223, + "grad_norm": 0.4238983988761902, + "learning_rate": 6.329997910939394e-06, + "loss": 0.3892, + "step": 3050 + }, + { + "epoch": 1.4188497907301194, + "grad_norm": 0.43442922830581665, + "learning_rate": 6.327389265870974e-06, + "loss": 0.3235, + "step": 3051 + }, + { + "epoch": 1.4193148349093163, + "grad_norm": 0.4579649865627289, + "learning_rate": 6.324780232031799e-06, + "loss": 0.3464, + "step": 3052 + }, + { + "epoch": 1.4197798790885134, + "grad_norm": 0.3303820490837097, + "learning_rate": 6.322170810186013e-06, + "loss": 0.285, + "step": 3053 + }, + { + "epoch": 1.4202449232677106, + "grad_norm": 0.4255482256412506, + "learning_rate": 6.319561001097871e-06, + "loss": 0.3688, + "step": 3054 + }, + { + "epoch": 1.4207099674469075, + "grad_norm": 0.46313193440437317, + "learning_rate": 6.316950805531746e-06, + "loss": 0.3576, + "step": 3055 + }, + { + "epoch": 1.4211750116261044, + "grad_norm": 0.412056565284729, + "learning_rate": 6.314340224252124e-06, + "loss": 0.3525, + "step": 3056 + }, + { + "epoch": 1.4216400558053015, + "grad_norm": 0.4473661184310913, + "learning_rate": 6.311729258023597e-06, + "loss": 0.3644, + "step": 3057 + }, + { + "epoch": 1.4221050999844986, + "grad_norm": 0.3638128638267517, + "learning_rate": 6.309117907610878e-06, + "loss": 0.3013, + "step": 3058 + }, + { + "epoch": 1.4225701441636955, + "grad_norm": 0.471097856760025, + "learning_rate": 6.306506173778788e-06, + "loss": 0.3941, + "step": 3059 + }, + { + "epoch": 1.4230351883428924, + "grad_norm": 0.42378881573677063, + "learning_rate": 6.303894057292261e-06, + "loss": 0.3553, + "step": 3060 + }, + { + "epoch": 1.4235002325220896, + "grad_norm": 0.44823458790779114, + "learning_rate": 6.3012815589163435e-06, + "loss": 0.3624, + "step": 3061 + }, + { + "epoch": 1.4239652767012867, + "grad_norm": 0.39349788427352905, + "learning_rate": 6.2986686794161955e-06, + "loss": 0.335, + "step": 3062 + }, + { + "epoch": 1.4244303208804836, + "grad_norm": 0.39323508739471436, + "learning_rate": 6.296055419557086e-06, + "loss": 0.3689, + "step": 3063 + }, + { + "epoch": 1.4248953650596807, + "grad_norm": 0.39461150765419006, + "learning_rate": 6.293441780104394e-06, + "loss": 0.351, + "step": 3064 + }, + { + "epoch": 1.4253604092388776, + "grad_norm": 0.3934915065765381, + "learning_rate": 6.290827761823617e-06, + "loss": 0.3515, + "step": 3065 + }, + { + "epoch": 1.4258254534180748, + "grad_norm": 0.407262921333313, + "learning_rate": 6.2882133654803535e-06, + "loss": 0.3113, + "step": 3066 + }, + { + "epoch": 1.4262904975972717, + "grad_norm": 0.4266803562641144, + "learning_rate": 6.28559859184032e-06, + "loss": 0.3751, + "step": 3067 + }, + { + "epoch": 1.4267555417764688, + "grad_norm": 0.39150920510292053, + "learning_rate": 6.282983441669343e-06, + "loss": 0.3722, + "step": 3068 + }, + { + "epoch": 1.427220585955666, + "grad_norm": 0.42089784145355225, + "learning_rate": 6.280367915733354e-06, + "loss": 0.3574, + "step": 3069 + }, + { + "epoch": 1.4276856301348628, + "grad_norm": 0.3548668324947357, + "learning_rate": 6.277752014798401e-06, + "loss": 0.3241, + "step": 3070 + }, + { + "epoch": 1.4281506743140597, + "grad_norm": 0.4519556164741516, + "learning_rate": 6.275135739630636e-06, + "loss": 0.3671, + "step": 3071 + }, + { + "epoch": 1.4286157184932569, + "grad_norm": 0.4052821695804596, + "learning_rate": 6.272519090996326e-06, + "loss": 0.3497, + "step": 3072 + }, + { + "epoch": 1.429080762672454, + "grad_norm": 0.4167952835559845, + "learning_rate": 6.269902069661843e-06, + "loss": 0.3615, + "step": 3073 + }, + { + "epoch": 1.429545806851651, + "grad_norm": 0.4397052526473999, + "learning_rate": 6.267284676393672e-06, + "loss": 0.3694, + "step": 3074 + }, + { + "epoch": 1.4300108510308478, + "grad_norm": 0.42949673533439636, + "learning_rate": 6.264666911958404e-06, + "loss": 0.329, + "step": 3075 + }, + { + "epoch": 1.430475895210045, + "grad_norm": 0.39663806557655334, + "learning_rate": 6.26204877712274e-06, + "loss": 0.3553, + "step": 3076 + }, + { + "epoch": 1.430940939389242, + "grad_norm": 0.3915885388851166, + "learning_rate": 6.259430272653489e-06, + "loss": 0.349, + "step": 3077 + }, + { + "epoch": 1.431405983568439, + "grad_norm": 0.4367777705192566, + "learning_rate": 6.256811399317567e-06, + "loss": 0.343, + "step": 3078 + }, + { + "epoch": 1.431871027747636, + "grad_norm": 0.4142322242259979, + "learning_rate": 6.254192157882002e-06, + "loss": 0.3547, + "step": 3079 + }, + { + "epoch": 1.432336071926833, + "grad_norm": 0.41435766220092773, + "learning_rate": 6.251572549113925e-06, + "loss": 0.3363, + "step": 3080 + }, + { + "epoch": 1.4328011161060301, + "grad_norm": 0.3753543496131897, + "learning_rate": 6.248952573780578e-06, + "loss": 0.3099, + "step": 3081 + }, + { + "epoch": 1.433266160285227, + "grad_norm": 0.4042842388153076, + "learning_rate": 6.246332232649309e-06, + "loss": 0.3347, + "step": 3082 + }, + { + "epoch": 1.4337312044644241, + "grad_norm": 0.48114094138145447, + "learning_rate": 6.243711526487575e-06, + "loss": 0.4166, + "step": 3083 + }, + { + "epoch": 1.4341962486436213, + "grad_norm": 0.36960569024086, + "learning_rate": 6.241090456062934e-06, + "loss": 0.3309, + "step": 3084 + }, + { + "epoch": 1.4346612928228182, + "grad_norm": 0.4094170928001404, + "learning_rate": 6.238469022143059e-06, + "loss": 0.3325, + "step": 3085 + }, + { + "epoch": 1.435126337002015, + "grad_norm": 0.418476402759552, + "learning_rate": 6.235847225495724e-06, + "loss": 0.3791, + "step": 3086 + }, + { + "epoch": 1.4355913811812122, + "grad_norm": 0.3743997812271118, + "learning_rate": 6.23322506688881e-06, + "loss": 0.3432, + "step": 3087 + }, + { + "epoch": 1.4360564253604093, + "grad_norm": 0.44286203384399414, + "learning_rate": 6.230602547090307e-06, + "loss": 0.3745, + "step": 3088 + }, + { + "epoch": 1.4365214695396062, + "grad_norm": 0.40265190601348877, + "learning_rate": 6.227979666868307e-06, + "loss": 0.3441, + "step": 3089 + }, + { + "epoch": 1.4369865137188031, + "grad_norm": 0.4036969244480133, + "learning_rate": 6.225356426991007e-06, + "loss": 0.3609, + "step": 3090 + }, + { + "epoch": 1.4374515578980003, + "grad_norm": 0.3730718493461609, + "learning_rate": 6.222732828226714e-06, + "loss": 0.3337, + "step": 3091 + }, + { + "epoch": 1.4379166020771974, + "grad_norm": 0.4534870684146881, + "learning_rate": 6.2201088713438366e-06, + "loss": 0.369, + "step": 3092 + }, + { + "epoch": 1.4383816462563943, + "grad_norm": 0.4100678563117981, + "learning_rate": 6.2174845571108884e-06, + "loss": 0.3617, + "step": 3093 + }, + { + "epoch": 1.4388466904355914, + "grad_norm": 0.458183616399765, + "learning_rate": 6.214859886296491e-06, + "loss": 0.3556, + "step": 3094 + }, + { + "epoch": 1.4393117346147883, + "grad_norm": 0.4203883111476898, + "learning_rate": 6.212234859669366e-06, + "loss": 0.3503, + "step": 3095 + }, + { + "epoch": 1.4397767787939855, + "grad_norm": 0.43506595492362976, + "learning_rate": 6.209609477998339e-06, + "loss": 0.3604, + "step": 3096 + }, + { + "epoch": 1.4402418229731824, + "grad_norm": 0.37604156136512756, + "learning_rate": 6.206983742052345e-06, + "loss": 0.317, + "step": 3097 + }, + { + "epoch": 1.4407068671523795, + "grad_norm": 0.3998353183269501, + "learning_rate": 6.204357652600419e-06, + "loss": 0.3476, + "step": 3098 + }, + { + "epoch": 1.4411719113315766, + "grad_norm": 0.4210034906864166, + "learning_rate": 6.201731210411698e-06, + "loss": 0.368, + "step": 3099 + }, + { + "epoch": 1.4416369555107735, + "grad_norm": 0.44328659772872925, + "learning_rate": 6.199104416255426e-06, + "loss": 0.3929, + "step": 3100 + }, + { + "epoch": 1.4421019996899704, + "grad_norm": 0.44677838683128357, + "learning_rate": 6.196477270900947e-06, + "loss": 0.3404, + "step": 3101 + }, + { + "epoch": 1.4425670438691676, + "grad_norm": 0.42385387420654297, + "learning_rate": 6.193849775117709e-06, + "loss": 0.3779, + "step": 3102 + }, + { + "epoch": 1.4430320880483647, + "grad_norm": 0.4066379964351654, + "learning_rate": 6.191221929675266e-06, + "loss": 0.3221, + "step": 3103 + }, + { + "epoch": 1.4434971322275616, + "grad_norm": 0.40358468890190125, + "learning_rate": 6.188593735343269e-06, + "loss": 0.332, + "step": 3104 + }, + { + "epoch": 1.4439621764067587, + "grad_norm": 0.5439780950546265, + "learning_rate": 6.185965192891472e-06, + "loss": 0.3834, + "step": 3105 + }, + { + "epoch": 1.4444272205859556, + "grad_norm": 0.4213503897190094, + "learning_rate": 6.183336303089735e-06, + "loss": 0.3605, + "step": 3106 + }, + { + "epoch": 1.4448922647651528, + "grad_norm": 0.39889928698539734, + "learning_rate": 6.1807070667080145e-06, + "loss": 0.3439, + "step": 3107 + }, + { + "epoch": 1.4453573089443497, + "grad_norm": 0.5045921802520752, + "learning_rate": 6.1780774845163736e-06, + "loss": 0.3687, + "step": 3108 + }, + { + "epoch": 1.4458223531235468, + "grad_norm": 0.4484003186225891, + "learning_rate": 6.175447557284972e-06, + "loss": 0.349, + "step": 3109 + }, + { + "epoch": 1.4462873973027437, + "grad_norm": 0.4632101058959961, + "learning_rate": 6.172817285784076e-06, + "loss": 0.3897, + "step": 3110 + }, + { + "epoch": 1.4467524414819408, + "grad_norm": 0.43179023265838623, + "learning_rate": 6.170186670784047e-06, + "loss": 0.3275, + "step": 3111 + }, + { + "epoch": 1.4472174856611377, + "grad_norm": 0.48543015122413635, + "learning_rate": 6.1675557130553475e-06, + "loss": 0.3395, + "step": 3112 + }, + { + "epoch": 1.4476825298403349, + "grad_norm": 0.4830179214477539, + "learning_rate": 6.164924413368546e-06, + "loss": 0.3724, + "step": 3113 + }, + { + "epoch": 1.448147574019532, + "grad_norm": 0.4735562801361084, + "learning_rate": 6.162292772494305e-06, + "loss": 0.362, + "step": 3114 + }, + { + "epoch": 1.4486126181987289, + "grad_norm": 0.43904268741607666, + "learning_rate": 6.159660791203392e-06, + "loss": 0.3283, + "step": 3115 + }, + { + "epoch": 1.4490776623779258, + "grad_norm": 0.4425245523452759, + "learning_rate": 6.157028470266669e-06, + "loss": 0.3637, + "step": 3116 + }, + { + "epoch": 1.449542706557123, + "grad_norm": 0.40749219059944153, + "learning_rate": 6.1543958104551e-06, + "loss": 0.3578, + "step": 3117 + }, + { + "epoch": 1.45000775073632, + "grad_norm": 0.4425008296966553, + "learning_rate": 6.15176281253975e-06, + "loss": 0.3405, + "step": 3118 + }, + { + "epoch": 1.450472794915517, + "grad_norm": 0.38328611850738525, + "learning_rate": 6.1491294772917785e-06, + "loss": 0.3225, + "step": 3119 + }, + { + "epoch": 1.450937839094714, + "grad_norm": 0.47361764311790466, + "learning_rate": 6.146495805482451e-06, + "loss": 0.3796, + "step": 3120 + }, + { + "epoch": 1.451402883273911, + "grad_norm": 0.41950133442878723, + "learning_rate": 6.143861797883124e-06, + "loss": 0.355, + "step": 3121 + }, + { + "epoch": 1.451867927453108, + "grad_norm": 0.46674826741218567, + "learning_rate": 6.141227455265256e-06, + "loss": 0.37, + "step": 3122 + }, + { + "epoch": 1.452332971632305, + "grad_norm": 0.46177905797958374, + "learning_rate": 6.138592778400404e-06, + "loss": 0.3742, + "step": 3123 + }, + { + "epoch": 1.4527980158115021, + "grad_norm": 0.4034256637096405, + "learning_rate": 6.135957768060221e-06, + "loss": 0.3395, + "step": 3124 + }, + { + "epoch": 1.453263059990699, + "grad_norm": 0.47683438658714294, + "learning_rate": 6.133322425016459e-06, + "loss": 0.3755, + "step": 3125 + }, + { + "epoch": 1.4537281041698962, + "grad_norm": 0.46114102005958557, + "learning_rate": 6.1306867500409685e-06, + "loss": 0.3385, + "step": 3126 + }, + { + "epoch": 1.454193148349093, + "grad_norm": 0.40554434061050415, + "learning_rate": 6.128050743905695e-06, + "loss": 0.3636, + "step": 3127 + }, + { + "epoch": 1.4546581925282902, + "grad_norm": 0.4275938868522644, + "learning_rate": 6.12541440738268e-06, + "loss": 0.3578, + "step": 3128 + }, + { + "epoch": 1.4551232367074873, + "grad_norm": 0.42898625135421753, + "learning_rate": 6.122777741244067e-06, + "loss": 0.3635, + "step": 3129 + }, + { + "epoch": 1.4555882808866842, + "grad_norm": 0.4090706706047058, + "learning_rate": 6.120140746262091e-06, + "loss": 0.329, + "step": 3130 + }, + { + "epoch": 1.4560533250658811, + "grad_norm": 0.45951342582702637, + "learning_rate": 6.117503423209084e-06, + "loss": 0.3656, + "step": 3131 + }, + { + "epoch": 1.4565183692450783, + "grad_norm": 0.3707917332649231, + "learning_rate": 6.1148657728574765e-06, + "loss": 0.3696, + "step": 3132 + }, + { + "epoch": 1.4569834134242754, + "grad_norm": 0.4463897943496704, + "learning_rate": 6.1122277959797925e-06, + "loss": 0.335, + "step": 3133 + }, + { + "epoch": 1.4574484576034723, + "grad_norm": 0.48531046509742737, + "learning_rate": 6.109589493348655e-06, + "loss": 0.3419, + "step": 3134 + }, + { + "epoch": 1.4579135017826694, + "grad_norm": 0.3899511396884918, + "learning_rate": 6.106950865736777e-06, + "loss": 0.3556, + "step": 3135 + }, + { + "epoch": 1.4583785459618663, + "grad_norm": 0.3700244426727295, + "learning_rate": 6.10431191391697e-06, + "loss": 0.3633, + "step": 3136 + }, + { + "epoch": 1.4588435901410635, + "grad_norm": 0.43946826457977295, + "learning_rate": 6.101672638662141e-06, + "loss": 0.3294, + "step": 3137 + }, + { + "epoch": 1.4593086343202604, + "grad_norm": 0.407183438539505, + "learning_rate": 6.099033040745292e-06, + "loss": 0.3605, + "step": 3138 + }, + { + "epoch": 1.4597736784994575, + "grad_norm": 0.39797452092170715, + "learning_rate": 6.0963931209395165e-06, + "loss": 0.3581, + "step": 3139 + }, + { + "epoch": 1.4602387226786544, + "grad_norm": 0.429902583360672, + "learning_rate": 6.0937528800180056e-06, + "loss": 0.3423, + "step": 3140 + }, + { + "epoch": 1.4607037668578515, + "grad_norm": 0.43174222111701965, + "learning_rate": 6.0911123187540414e-06, + "loss": 0.3604, + "step": 3141 + }, + { + "epoch": 1.4611688110370484, + "grad_norm": 0.4118611514568329, + "learning_rate": 6.088471437921002e-06, + "loss": 0.3742, + "step": 3142 + }, + { + "epoch": 1.4616338552162456, + "grad_norm": 0.37793856859207153, + "learning_rate": 6.0858302382923585e-06, + "loss": 0.3462, + "step": 3143 + }, + { + "epoch": 1.4620988993954427, + "grad_norm": 0.4251405894756317, + "learning_rate": 6.083188720641676e-06, + "loss": 0.3283, + "step": 3144 + }, + { + "epoch": 1.4625639435746396, + "grad_norm": 0.3901570439338684, + "learning_rate": 6.080546885742611e-06, + "loss": 0.3649, + "step": 3145 + }, + { + "epoch": 1.4630289877538365, + "grad_norm": 0.39368346333503723, + "learning_rate": 6.077904734368915e-06, + "loss": 0.3652, + "step": 3146 + }, + { + "epoch": 1.4634940319330336, + "grad_norm": 0.4399242103099823, + "learning_rate": 6.075262267294432e-06, + "loss": 0.3585, + "step": 3147 + }, + { + "epoch": 1.4639590761122308, + "grad_norm": 0.37305113673210144, + "learning_rate": 6.072619485293095e-06, + "loss": 0.339, + "step": 3148 + }, + { + "epoch": 1.4644241202914277, + "grad_norm": 0.4052191376686096, + "learning_rate": 6.069976389138934e-06, + "loss": 0.3698, + "step": 3149 + }, + { + "epoch": 1.4648891644706248, + "grad_norm": 0.4240652620792389, + "learning_rate": 6.067332979606069e-06, + "loss": 0.3417, + "step": 3150 + }, + { + "epoch": 1.4653542086498217, + "grad_norm": 0.4290774464607239, + "learning_rate": 6.064689257468711e-06, + "loss": 0.3381, + "step": 3151 + }, + { + "epoch": 1.4658192528290188, + "grad_norm": 0.3692978024482727, + "learning_rate": 6.062045223501163e-06, + "loss": 0.3469, + "step": 3152 + }, + { + "epoch": 1.4662842970082157, + "grad_norm": 0.37645047903060913, + "learning_rate": 6.0594008784778206e-06, + "loss": 0.3889, + "step": 3153 + }, + { + "epoch": 1.4667493411874128, + "grad_norm": 0.4383949637413025, + "learning_rate": 6.056756223173167e-06, + "loss": 0.3384, + "step": 3154 + }, + { + "epoch": 1.4672143853666098, + "grad_norm": 0.4462020695209503, + "learning_rate": 6.054111258361782e-06, + "loss": 0.3641, + "step": 3155 + }, + { + "epoch": 1.4676794295458069, + "grad_norm": 0.38466379046440125, + "learning_rate": 6.051465984818332e-06, + "loss": 0.3538, + "step": 3156 + }, + { + "epoch": 1.4681444737250038, + "grad_norm": 0.42687302827835083, + "learning_rate": 6.048820403317575e-06, + "loss": 0.3286, + "step": 3157 + }, + { + "epoch": 1.468609517904201, + "grad_norm": 0.4158511161804199, + "learning_rate": 6.046174514634355e-06, + "loss": 0.3368, + "step": 3158 + }, + { + "epoch": 1.469074562083398, + "grad_norm": 0.409820020198822, + "learning_rate": 6.043528319543615e-06, + "loss": 0.3446, + "step": 3159 + }, + { + "epoch": 1.469539606262595, + "grad_norm": 0.4203595817089081, + "learning_rate": 6.04088181882038e-06, + "loss": 0.3366, + "step": 3160 + }, + { + "epoch": 1.4700046504417918, + "grad_norm": 0.4121120274066925, + "learning_rate": 6.038235013239767e-06, + "loss": 0.3271, + "step": 3161 + }, + { + "epoch": 1.470469694620989, + "grad_norm": 0.40760576725006104, + "learning_rate": 6.035587903576984e-06, + "loss": 0.3536, + "step": 3162 + }, + { + "epoch": 1.470934738800186, + "grad_norm": 0.4346151649951935, + "learning_rate": 6.032940490607324e-06, + "loss": 0.3356, + "step": 3163 + }, + { + "epoch": 1.471399782979383, + "grad_norm": 0.459246426820755, + "learning_rate": 6.030292775106173e-06, + "loss": 0.3679, + "step": 3164 + }, + { + "epoch": 1.4718648271585801, + "grad_norm": 0.42355459928512573, + "learning_rate": 6.027644757849004e-06, + "loss": 0.386, + "step": 3165 + }, + { + "epoch": 1.472329871337777, + "grad_norm": 0.4206094741821289, + "learning_rate": 6.024996439611376e-06, + "loss": 0.3344, + "step": 3166 + }, + { + "epoch": 1.4727949155169742, + "grad_norm": 0.36260485649108887, + "learning_rate": 6.022347821168941e-06, + "loss": 0.3362, + "step": 3167 + }, + { + "epoch": 1.473259959696171, + "grad_norm": 0.4596819281578064, + "learning_rate": 6.0196989032974366e-06, + "loss": 0.4029, + "step": 3168 + }, + { + "epoch": 1.4737250038753682, + "grad_norm": 0.4887857139110565, + "learning_rate": 6.017049686772685e-06, + "loss": 0.335, + "step": 3169 + }, + { + "epoch": 1.474190048054565, + "grad_norm": 0.4173000156879425, + "learning_rate": 6.0144001723706e-06, + "loss": 0.329, + "step": 3170 + }, + { + "epoch": 1.4746550922337622, + "grad_norm": 0.419015496969223, + "learning_rate": 6.011750360867183e-06, + "loss": 0.3812, + "step": 3171 + }, + { + "epoch": 1.4751201364129591, + "grad_norm": 0.5123944282531738, + "learning_rate": 6.009100253038518e-06, + "loss": 0.3391, + "step": 3172 + }, + { + "epoch": 1.4755851805921563, + "grad_norm": 0.44715067744255066, + "learning_rate": 6.00644984966078e-06, + "loss": 0.3153, + "step": 3173 + }, + { + "epoch": 1.4760502247713534, + "grad_norm": 0.4377215802669525, + "learning_rate": 6.003799151510229e-06, + "loss": 0.3815, + "step": 3174 + }, + { + "epoch": 1.4765152689505503, + "grad_norm": 0.5287955403327942, + "learning_rate": 6.001148159363213e-06, + "loss": 0.362, + "step": 3175 + }, + { + "epoch": 1.4769803131297472, + "grad_norm": 0.3864191770553589, + "learning_rate": 5.998496873996161e-06, + "loss": 0.3175, + "step": 3176 + }, + { + "epoch": 1.4774453573089443, + "grad_norm": 0.45076581835746765, + "learning_rate": 5.995845296185594e-06, + "loss": 0.3572, + "step": 3177 + }, + { + "epoch": 1.4779104014881415, + "grad_norm": 0.4349561929702759, + "learning_rate": 5.993193426708115e-06, + "loss": 0.3452, + "step": 3178 + }, + { + "epoch": 1.4783754456673384, + "grad_norm": 0.45883357524871826, + "learning_rate": 5.990541266340414e-06, + "loss": 0.3639, + "step": 3179 + }, + { + "epoch": 1.4788404898465355, + "grad_norm": 0.4192543923854828, + "learning_rate": 5.987888815859266e-06, + "loss": 0.3199, + "step": 3180 + }, + { + "epoch": 1.4793055340257324, + "grad_norm": 0.4741514325141907, + "learning_rate": 5.985236076041531e-06, + "loss": 0.3624, + "step": 3181 + }, + { + "epoch": 1.4797705782049295, + "grad_norm": 0.4844418168067932, + "learning_rate": 5.982583047664151e-06, + "loss": 0.3577, + "step": 3182 + }, + { + "epoch": 1.4802356223841264, + "grad_norm": 0.4832378923892975, + "learning_rate": 5.979929731504158e-06, + "loss": 0.3504, + "step": 3183 + }, + { + "epoch": 1.4807006665633236, + "grad_norm": 0.42179495096206665, + "learning_rate": 5.9772761283386626e-06, + "loss": 0.3384, + "step": 3184 + }, + { + "epoch": 1.4811657107425205, + "grad_norm": 0.4342862665653229, + "learning_rate": 5.9746222389448635e-06, + "loss": 0.3624, + "step": 3185 + }, + { + "epoch": 1.4816307549217176, + "grad_norm": 0.47698426246643066, + "learning_rate": 5.971968064100042e-06, + "loss": 0.3483, + "step": 3186 + }, + { + "epoch": 1.4820957991009145, + "grad_norm": 0.4859686493873596, + "learning_rate": 5.969313604581564e-06, + "loss": 0.3691, + "step": 3187 + }, + { + "epoch": 1.4825608432801116, + "grad_norm": 0.4531085789203644, + "learning_rate": 5.966658861166874e-06, + "loss": 0.3741, + "step": 3188 + }, + { + "epoch": 1.4830258874593087, + "grad_norm": 0.46530213952064514, + "learning_rate": 5.9640038346335045e-06, + "loss": 0.3501, + "step": 3189 + }, + { + "epoch": 1.4834909316385056, + "grad_norm": 0.4475176930427551, + "learning_rate": 5.961348525759072e-06, + "loss": 0.3515, + "step": 3190 + }, + { + "epoch": 1.4839559758177026, + "grad_norm": 0.4891159236431122, + "learning_rate": 5.958692935321271e-06, + "loss": 0.3485, + "step": 3191 + }, + { + "epoch": 1.4844210199968997, + "grad_norm": 0.4750014841556549, + "learning_rate": 5.956037064097881e-06, + "loss": 0.3773, + "step": 3192 + }, + { + "epoch": 1.4848860641760968, + "grad_norm": 0.37893229722976685, + "learning_rate": 5.953380912866764e-06, + "loss": 0.372, + "step": 3193 + }, + { + "epoch": 1.4853511083552937, + "grad_norm": 0.41365838050842285, + "learning_rate": 5.9507244824058644e-06, + "loss": 0.2967, + "step": 3194 + }, + { + "epoch": 1.4858161525344908, + "grad_norm": 0.5796143412590027, + "learning_rate": 5.948067773493205e-06, + "loss": 0.4209, + "step": 3195 + }, + { + "epoch": 1.4862811967136877, + "grad_norm": 0.388439804315567, + "learning_rate": 5.945410786906896e-06, + "loss": 0.3745, + "step": 3196 + }, + { + "epoch": 1.4867462408928849, + "grad_norm": 0.3897601068019867, + "learning_rate": 5.9427535234251235e-06, + "loss": 0.3073, + "step": 3197 + }, + { + "epoch": 1.4872112850720818, + "grad_norm": 0.5110495090484619, + "learning_rate": 5.940095983826157e-06, + "loss": 0.3641, + "step": 3198 + }, + { + "epoch": 1.487676329251279, + "grad_norm": 0.46713900566101074, + "learning_rate": 5.9374381688883475e-06, + "loss": 0.3414, + "step": 3199 + }, + { + "epoch": 1.488141373430476, + "grad_norm": 0.4882988929748535, + "learning_rate": 5.9347800793901245e-06, + "loss": 0.3823, + "step": 3200 + }, + { + "epoch": 1.488606417609673, + "grad_norm": 0.45495909452438354, + "learning_rate": 5.93212171611e-06, + "loss": 0.3497, + "step": 3201 + }, + { + "epoch": 1.4890714617888698, + "grad_norm": 0.38749927282333374, + "learning_rate": 5.929463079826565e-06, + "loss": 0.3585, + "step": 3202 + }, + { + "epoch": 1.489536505968067, + "grad_norm": 0.4609941244125366, + "learning_rate": 5.9268041713184934e-06, + "loss": 0.3501, + "step": 3203 + }, + { + "epoch": 1.490001550147264, + "grad_norm": 0.38551583886146545, + "learning_rate": 5.924144991364533e-06, + "loss": 0.3611, + "step": 3204 + }, + { + "epoch": 1.490466594326461, + "grad_norm": 0.37728172540664673, + "learning_rate": 5.921485540743516e-06, + "loss": 0.3233, + "step": 3205 + }, + { + "epoch": 1.490931638505658, + "grad_norm": 0.4780239760875702, + "learning_rate": 5.918825820234352e-06, + "loss": 0.3523, + "step": 3206 + }, + { + "epoch": 1.491396682684855, + "grad_norm": 0.43843451142311096, + "learning_rate": 5.9161658306160286e-06, + "loss": 0.3479, + "step": 3207 + }, + { + "epoch": 1.4918617268640522, + "grad_norm": 0.43270936608314514, + "learning_rate": 5.913505572667615e-06, + "loss": 0.3498, + "step": 3208 + }, + { + "epoch": 1.492326771043249, + "grad_norm": 0.4177435338497162, + "learning_rate": 5.910845047168259e-06, + "loss": 0.3481, + "step": 3209 + }, + { + "epoch": 1.4927918152224462, + "grad_norm": 0.4250778257846832, + "learning_rate": 5.908184254897183e-06, + "loss": 0.338, + "step": 3210 + }, + { + "epoch": 1.493256859401643, + "grad_norm": 0.40649059414863586, + "learning_rate": 5.90552319663369e-06, + "loss": 0.3486, + "step": 3211 + }, + { + "epoch": 1.4937219035808402, + "grad_norm": 0.40856921672821045, + "learning_rate": 5.902861873157162e-06, + "loss": 0.3573, + "step": 3212 + }, + { + "epoch": 1.4941869477600371, + "grad_norm": 0.5227980613708496, + "learning_rate": 5.900200285247055e-06, + "loss": 0.3883, + "step": 3213 + }, + { + "epoch": 1.4946519919392343, + "grad_norm": 0.3745517134666443, + "learning_rate": 5.897538433682909e-06, + "loss": 0.3327, + "step": 3214 + }, + { + "epoch": 1.4951170361184314, + "grad_norm": 0.38121265172958374, + "learning_rate": 5.894876319244334e-06, + "loss": 0.3519, + "step": 3215 + }, + { + "epoch": 1.4955820802976283, + "grad_norm": 0.45746102929115295, + "learning_rate": 5.892213942711019e-06, + "loss": 0.3917, + "step": 3216 + }, + { + "epoch": 1.4960471244768252, + "grad_norm": 0.47537440061569214, + "learning_rate": 5.889551304862735e-06, + "loss": 0.3296, + "step": 3217 + }, + { + "epoch": 1.4965121686560223, + "grad_norm": 0.427947461605072, + "learning_rate": 5.8868884064793215e-06, + "loss": 0.3772, + "step": 3218 + }, + { + "epoch": 1.4969772128352195, + "grad_norm": 0.40912094712257385, + "learning_rate": 5.884225248340699e-06, + "loss": 0.3814, + "step": 3219 + }, + { + "epoch": 1.4974422570144164, + "grad_norm": 0.40474238991737366, + "learning_rate": 5.881561831226865e-06, + "loss": 0.346, + "step": 3220 + }, + { + "epoch": 1.4979073011936133, + "grad_norm": 0.4801974296569824, + "learning_rate": 5.878898155917889e-06, + "loss": 0.3668, + "step": 3221 + }, + { + "epoch": 1.4983723453728104, + "grad_norm": 0.4175057113170624, + "learning_rate": 5.8762342231939205e-06, + "loss": 0.2951, + "step": 3222 + }, + { + "epoch": 1.4988373895520075, + "grad_norm": 0.41870221495628357, + "learning_rate": 5.873570033835181e-06, + "loss": 0.3757, + "step": 3223 + }, + { + "epoch": 1.4993024337312044, + "grad_norm": 0.3709333539009094, + "learning_rate": 5.8709055886219665e-06, + "loss": 0.3465, + "step": 3224 + }, + { + "epoch": 1.4997674779104015, + "grad_norm": 0.4033331871032715, + "learning_rate": 5.8682408883346535e-06, + "loss": 0.3314, + "step": 3225 + }, + { + "epoch": 1.5002325220895987, + "grad_norm": 0.45277902483940125, + "learning_rate": 5.865575933753686e-06, + "loss": 0.3526, + "step": 3226 + }, + { + "epoch": 1.5006975662687956, + "grad_norm": 0.476517915725708, + "learning_rate": 5.862910725659586e-06, + "loss": 0.3651, + "step": 3227 + }, + { + "epoch": 1.5011626104479925, + "grad_norm": 0.372768372297287, + "learning_rate": 5.860245264832952e-06, + "loss": 0.3189, + "step": 3228 + }, + { + "epoch": 1.5016276546271896, + "grad_norm": 0.4148904085159302, + "learning_rate": 5.857579552054454e-06, + "loss": 0.3241, + "step": 3229 + }, + { + "epoch": 1.5020926988063867, + "grad_norm": 0.38212111592292786, + "learning_rate": 5.854913588104832e-06, + "loss": 0.3015, + "step": 3230 + }, + { + "epoch": 1.5025577429855836, + "grad_norm": 0.5229579210281372, + "learning_rate": 5.85224737376491e-06, + "loss": 0.3802, + "step": 3231 + }, + { + "epoch": 1.5030227871647805, + "grad_norm": 0.37114113569259644, + "learning_rate": 5.849580909815573e-06, + "loss": 0.3221, + "step": 3232 + }, + { + "epoch": 1.5034878313439777, + "grad_norm": 0.4346405863761902, + "learning_rate": 5.84691419703779e-06, + "loss": 0.3551, + "step": 3233 + }, + { + "epoch": 1.5039528755231748, + "grad_norm": 0.463509202003479, + "learning_rate": 5.844247236212593e-06, + "loss": 0.3559, + "step": 3234 + }, + { + "epoch": 1.5044179197023717, + "grad_norm": 0.5213866829872131, + "learning_rate": 5.8415800281210945e-06, + "loss": 0.3356, + "step": 3235 + }, + { + "epoch": 1.5048829638815686, + "grad_norm": 0.4189200699329376, + "learning_rate": 5.838912573544475e-06, + "loss": 0.349, + "step": 3236 + }, + { + "epoch": 1.5053480080607657, + "grad_norm": 0.41886717081069946, + "learning_rate": 5.836244873263989e-06, + "loss": 0.3107, + "step": 3237 + }, + { + "epoch": 1.5058130522399629, + "grad_norm": 0.4343456029891968, + "learning_rate": 5.833576928060964e-06, + "loss": 0.3686, + "step": 3238 + }, + { + "epoch": 1.5062780964191598, + "grad_norm": 0.41297784447669983, + "learning_rate": 5.830908738716797e-06, + "loss": 0.342, + "step": 3239 + }, + { + "epoch": 1.5067431405983567, + "grad_norm": 0.44648993015289307, + "learning_rate": 5.828240306012957e-06, + "loss": 0.3716, + "step": 3240 + }, + { + "epoch": 1.507208184777554, + "grad_norm": 0.40866819024086, + "learning_rate": 5.825571630730984e-06, + "loss": 0.3111, + "step": 3241 + }, + { + "epoch": 1.507673228956751, + "grad_norm": 0.45804688334465027, + "learning_rate": 5.8229027136524896e-06, + "loss": 0.3408, + "step": 3242 + }, + { + "epoch": 1.5081382731359478, + "grad_norm": 0.4614756107330322, + "learning_rate": 5.820233555559157e-06, + "loss": 0.3852, + "step": 3243 + }, + { + "epoch": 1.508603317315145, + "grad_norm": 0.39953604340553284, + "learning_rate": 5.81756415723274e-06, + "loss": 0.298, + "step": 3244 + }, + { + "epoch": 1.509068361494342, + "grad_norm": 0.4138416349887848, + "learning_rate": 5.814894519455061e-06, + "loss": 0.3585, + "step": 3245 + }, + { + "epoch": 1.509533405673539, + "grad_norm": 0.4480072557926178, + "learning_rate": 5.812224643008014e-06, + "loss": 0.3583, + "step": 3246 + }, + { + "epoch": 1.509998449852736, + "grad_norm": 0.3700372576713562, + "learning_rate": 5.809554528673562e-06, + "loss": 0.2974, + "step": 3247 + }, + { + "epoch": 1.510463494031933, + "grad_norm": 0.4125301241874695, + "learning_rate": 5.806884177233737e-06, + "loss": 0.3842, + "step": 3248 + }, + { + "epoch": 1.5109285382111302, + "grad_norm": 0.4698508679866791, + "learning_rate": 5.804213589470644e-06, + "loss": 0.3631, + "step": 3249 + }, + { + "epoch": 1.511393582390327, + "grad_norm": 0.37476885318756104, + "learning_rate": 5.801542766166454e-06, + "loss": 0.3072, + "step": 3250 + }, + { + "epoch": 1.511858626569524, + "grad_norm": 0.43928253650665283, + "learning_rate": 5.7988717081034066e-06, + "loss": 0.3819, + "step": 3251 + }, + { + "epoch": 1.512323670748721, + "grad_norm": 0.4448581635951996, + "learning_rate": 5.796200416063813e-06, + "loss": 0.325, + "step": 3252 + }, + { + "epoch": 1.5127887149279182, + "grad_norm": 0.5041275024414062, + "learning_rate": 5.793528890830049e-06, + "loss": 0.3903, + "step": 3253 + }, + { + "epoch": 1.5132537591071151, + "grad_norm": 0.3757099211215973, + "learning_rate": 5.790857133184563e-06, + "loss": 0.3212, + "step": 3254 + }, + { + "epoch": 1.5137188032863123, + "grad_norm": 0.5224563479423523, + "learning_rate": 5.788185143909868e-06, + "loss": 0.3684, + "step": 3255 + }, + { + "epoch": 1.5141838474655094, + "grad_norm": 0.46224769949913025, + "learning_rate": 5.785512923788549e-06, + "loss": 0.354, + "step": 3256 + }, + { + "epoch": 1.5146488916447063, + "grad_norm": 0.4003010392189026, + "learning_rate": 5.7828404736032515e-06, + "loss": 0.3085, + "step": 3257 + }, + { + "epoch": 1.5151139358239032, + "grad_norm": 0.4108293950557709, + "learning_rate": 5.780167794136696e-06, + "loss": 0.3633, + "step": 3258 + }, + { + "epoch": 1.5155789800031003, + "grad_norm": 0.4604003131389618, + "learning_rate": 5.777494886171667e-06, + "loss": 0.3237, + "step": 3259 + }, + { + "epoch": 1.5160440241822974, + "grad_norm": 0.49842220544815063, + "learning_rate": 5.7748217504910145e-06, + "loss": 0.3905, + "step": 3260 + }, + { + "epoch": 1.5165090683614944, + "grad_norm": 0.4144563376903534, + "learning_rate": 5.772148387877656e-06, + "loss": 0.3754, + "step": 3261 + }, + { + "epoch": 1.5169741125406913, + "grad_norm": 0.4629983901977539, + "learning_rate": 5.7694747991145775e-06, + "loss": 0.3352, + "step": 3262 + }, + { + "epoch": 1.5174391567198884, + "grad_norm": 0.3761589825153351, + "learning_rate": 5.766800984984828e-06, + "loss": 0.3488, + "step": 3263 + }, + { + "epoch": 1.5179042008990855, + "grad_norm": 0.3842097222805023, + "learning_rate": 5.764126946271526e-06, + "loss": 0.3479, + "step": 3264 + }, + { + "epoch": 1.5183692450782824, + "grad_norm": 0.39669203758239746, + "learning_rate": 5.7614526837578525e-06, + "loss": 0.3421, + "step": 3265 + }, + { + "epoch": 1.5188342892574793, + "grad_norm": 0.42259714007377625, + "learning_rate": 5.758778198227057e-06, + "loss": 0.36, + "step": 3266 + }, + { + "epoch": 1.5192993334366764, + "grad_norm": 0.39835333824157715, + "learning_rate": 5.7561034904624525e-06, + "loss": 0.3514, + "step": 3267 + }, + { + "epoch": 1.5197643776158736, + "grad_norm": 0.383436918258667, + "learning_rate": 5.753428561247416e-06, + "loss": 0.3601, + "step": 3268 + }, + { + "epoch": 1.5202294217950705, + "grad_norm": 0.39197006821632385, + "learning_rate": 5.750753411365394e-06, + "loss": 0.3377, + "step": 3269 + }, + { + "epoch": 1.5206944659742676, + "grad_norm": 0.4436843991279602, + "learning_rate": 5.74807804159989e-06, + "loss": 0.3428, + "step": 3270 + }, + { + "epoch": 1.5211595101534647, + "grad_norm": 0.3785765469074249, + "learning_rate": 5.74540245273448e-06, + "loss": 0.3329, + "step": 3271 + }, + { + "epoch": 1.5216245543326616, + "grad_norm": 0.3833819329738617, + "learning_rate": 5.7427266455528e-06, + "loss": 0.3533, + "step": 3272 + }, + { + "epoch": 1.5220895985118585, + "grad_norm": 0.4789281189441681, + "learning_rate": 5.7400506208385486e-06, + "loss": 0.3841, + "step": 3273 + }, + { + "epoch": 1.5225546426910557, + "grad_norm": 0.4086497724056244, + "learning_rate": 5.737374379375491e-06, + "loss": 0.3316, + "step": 3274 + }, + { + "epoch": 1.5230196868702528, + "grad_norm": 0.39715808629989624, + "learning_rate": 5.734697921947456e-06, + "loss": 0.3326, + "step": 3275 + }, + { + "epoch": 1.5234847310494497, + "grad_norm": 0.45392605662345886, + "learning_rate": 5.732021249338333e-06, + "loss": 0.3432, + "step": 3276 + }, + { + "epoch": 1.5239497752286466, + "grad_norm": 0.4217411279678345, + "learning_rate": 5.729344362332075e-06, + "loss": 0.3766, + "step": 3277 + }, + { + "epoch": 1.5244148194078437, + "grad_norm": 0.4188506603240967, + "learning_rate": 5.7266672617127014e-06, + "loss": 0.3234, + "step": 3278 + }, + { + "epoch": 1.5248798635870409, + "grad_norm": 0.40585535764694214, + "learning_rate": 5.723989948264291e-06, + "loss": 0.3499, + "step": 3279 + }, + { + "epoch": 1.5253449077662378, + "grad_norm": 0.4009935259819031, + "learning_rate": 5.721312422770984e-06, + "loss": 0.359, + "step": 3280 + }, + { + "epoch": 1.5258099519454347, + "grad_norm": 0.4260950982570648, + "learning_rate": 5.718634686016985e-06, + "loss": 0.3418, + "step": 3281 + }, + { + "epoch": 1.5262749961246318, + "grad_norm": 0.40163952112197876, + "learning_rate": 5.715956738786559e-06, + "loss": 0.3451, + "step": 3282 + }, + { + "epoch": 1.526740040303829, + "grad_norm": 0.38455137610435486, + "learning_rate": 5.713278581864032e-06, + "loss": 0.3522, + "step": 3283 + }, + { + "epoch": 1.5272050844830258, + "grad_norm": 0.3814459443092346, + "learning_rate": 5.710600216033797e-06, + "loss": 0.3334, + "step": 3284 + }, + { + "epoch": 1.527670128662223, + "grad_norm": 0.4370444416999817, + "learning_rate": 5.7079216420803e-06, + "loss": 0.3619, + "step": 3285 + }, + { + "epoch": 1.52813517284142, + "grad_norm": 0.40159928798675537, + "learning_rate": 5.705242860788052e-06, + "loss": 0.3379, + "step": 3286 + }, + { + "epoch": 1.528600217020617, + "grad_norm": 0.4162231385707855, + "learning_rate": 5.7025638729416275e-06, + "loss": 0.3562, + "step": 3287 + }, + { + "epoch": 1.529065261199814, + "grad_norm": 0.36917373538017273, + "learning_rate": 5.699884679325656e-06, + "loss": 0.3496, + "step": 3288 + }, + { + "epoch": 1.529530305379011, + "grad_norm": 0.36086469888687134, + "learning_rate": 5.697205280724828e-06, + "loss": 0.33, + "step": 3289 + }, + { + "epoch": 1.5299953495582082, + "grad_norm": 0.434421181678772, + "learning_rate": 5.6945256779239e-06, + "loss": 0.373, + "step": 3290 + }, + { + "epoch": 1.530460393737405, + "grad_norm": 0.39509040117263794, + "learning_rate": 5.691845871707682e-06, + "loss": 0.3426, + "step": 3291 + }, + { + "epoch": 1.530925437916602, + "grad_norm": 0.38857322931289673, + "learning_rate": 5.689165862861046e-06, + "loss": 0.3389, + "step": 3292 + }, + { + "epoch": 1.531390482095799, + "grad_norm": 0.38726887106895447, + "learning_rate": 5.686485652168923e-06, + "loss": 0.3262, + "step": 3293 + }, + { + "epoch": 1.5318555262749962, + "grad_norm": 0.4127531051635742, + "learning_rate": 5.683805240416302e-06, + "loss": 0.3861, + "step": 3294 + }, + { + "epoch": 1.5323205704541931, + "grad_norm": 0.3922295868396759, + "learning_rate": 5.681124628388235e-06, + "loss": 0.3506, + "step": 3295 + }, + { + "epoch": 1.53278561463339, + "grad_norm": 0.38945940136909485, + "learning_rate": 5.678443816869828e-06, + "loss": 0.34, + "step": 3296 + }, + { + "epoch": 1.5332506588125872, + "grad_norm": 0.38314536213874817, + "learning_rate": 5.675762806646247e-06, + "loss": 0.3136, + "step": 3297 + }, + { + "epoch": 1.5337157029917843, + "grad_norm": 0.4257749021053314, + "learning_rate": 5.673081598502715e-06, + "loss": 0.3641, + "step": 3298 + }, + { + "epoch": 1.5341807471709812, + "grad_norm": 0.4407237470149994, + "learning_rate": 5.670400193224516e-06, + "loss": 0.3381, + "step": 3299 + }, + { + "epoch": 1.5346457913501783, + "grad_norm": 0.4940028190612793, + "learning_rate": 5.66771859159699e-06, + "loss": 0.336, + "step": 3300 + }, + { + "epoch": 1.5351108355293754, + "grad_norm": 0.40870407223701477, + "learning_rate": 5.665036794405535e-06, + "loss": 0.3687, + "step": 3301 + }, + { + "epoch": 1.5355758797085723, + "grad_norm": 0.4065590500831604, + "learning_rate": 5.662354802435606e-06, + "loss": 0.3937, + "step": 3302 + }, + { + "epoch": 1.5360409238877692, + "grad_norm": 0.39256197214126587, + "learning_rate": 5.659672616472712e-06, + "loss": 0.3364, + "step": 3303 + }, + { + "epoch": 1.5365059680669664, + "grad_norm": 0.4631505608558655, + "learning_rate": 5.656990237302426e-06, + "loss": 0.3388, + "step": 3304 + }, + { + "epoch": 1.5369710122461635, + "grad_norm": 0.3832097053527832, + "learning_rate": 5.6543076657103705e-06, + "loss": 0.3108, + "step": 3305 + }, + { + "epoch": 1.5374360564253604, + "grad_norm": 0.44715628027915955, + "learning_rate": 5.651624902482225e-06, + "loss": 0.3716, + "step": 3306 + }, + { + "epoch": 1.5379011006045573, + "grad_norm": 0.5109707713127136, + "learning_rate": 5.648941948403732e-06, + "loss": 0.3541, + "step": 3307 + }, + { + "epoch": 1.5383661447837544, + "grad_norm": 0.35396090149879456, + "learning_rate": 5.646258804260685e-06, + "loss": 0.3068, + "step": 3308 + }, + { + "epoch": 1.5388311889629516, + "grad_norm": 0.5051494836807251, + "learning_rate": 5.643575470838929e-06, + "loss": 0.4256, + "step": 3309 + }, + { + "epoch": 1.5392962331421485, + "grad_norm": 0.4141186475753784, + "learning_rate": 5.640891948924373e-06, + "loss": 0.3335, + "step": 3310 + }, + { + "epoch": 1.5397612773213454, + "grad_norm": 0.3828221559524536, + "learning_rate": 5.638208239302975e-06, + "loss": 0.3464, + "step": 3311 + }, + { + "epoch": 1.5402263215005425, + "grad_norm": 0.39551404118537903, + "learning_rate": 5.6355243427607475e-06, + "loss": 0.387, + "step": 3312 + }, + { + "epoch": 1.5406913656797396, + "grad_norm": 0.3690919280052185, + "learning_rate": 5.632840260083766e-06, + "loss": 0.2866, + "step": 3313 + }, + { + "epoch": 1.5411564098589365, + "grad_norm": 0.458450585603714, + "learning_rate": 5.630155992058151e-06, + "loss": 0.4064, + "step": 3314 + }, + { + "epoch": 1.5416214540381337, + "grad_norm": 0.4220149517059326, + "learning_rate": 5.6274715394700805e-06, + "loss": 0.377, + "step": 3315 + }, + { + "epoch": 1.5420864982173308, + "grad_norm": 0.41717949509620667, + "learning_rate": 5.62478690310579e-06, + "loss": 0.3553, + "step": 3316 + }, + { + "epoch": 1.5425515423965277, + "grad_norm": 0.42877861857414246, + "learning_rate": 5.622102083751563e-06, + "loss": 0.3538, + "step": 3317 + }, + { + "epoch": 1.5430165865757246, + "grad_norm": 0.44071346521377563, + "learning_rate": 5.61941708219374e-06, + "loss": 0.3461, + "step": 3318 + }, + { + "epoch": 1.5434816307549217, + "grad_norm": 0.47426027059555054, + "learning_rate": 5.6167318992187155e-06, + "loss": 0.3504, + "step": 3319 + }, + { + "epoch": 1.5439466749341189, + "grad_norm": 0.4373508393764496, + "learning_rate": 5.614046535612936e-06, + "loss": 0.3585, + "step": 3320 + }, + { + "epoch": 1.5444117191133158, + "grad_norm": 0.48637011647224426, + "learning_rate": 5.6113609921629e-06, + "loss": 0.3465, + "step": 3321 + }, + { + "epoch": 1.5448767632925127, + "grad_norm": 0.3798205852508545, + "learning_rate": 5.60867526965516e-06, + "loss": 0.3345, + "step": 3322 + }, + { + "epoch": 1.5453418074717098, + "grad_norm": 0.3805210590362549, + "learning_rate": 5.60598936887632e-06, + "loss": 0.3352, + "step": 3323 + }, + { + "epoch": 1.545806851650907, + "grad_norm": 0.44706982374191284, + "learning_rate": 5.603303290613036e-06, + "loss": 0.3934, + "step": 3324 + }, + { + "epoch": 1.5462718958301038, + "grad_norm": 0.38400477170944214, + "learning_rate": 5.600617035652019e-06, + "loss": 0.3168, + "step": 3325 + }, + { + "epoch": 1.5467369400093007, + "grad_norm": 0.3989894688129425, + "learning_rate": 5.597930604780028e-06, + "loss": 0.3661, + "step": 3326 + }, + { + "epoch": 1.5472019841884979, + "grad_norm": 0.40619131922721863, + "learning_rate": 5.595243998783876e-06, + "loss": 0.355, + "step": 3327 + }, + { + "epoch": 1.547667028367695, + "grad_norm": 0.4321429431438446, + "learning_rate": 5.592557218450424e-06, + "loss": 0.3404, + "step": 3328 + }, + { + "epoch": 1.548132072546892, + "grad_norm": 0.3868047893047333, + "learning_rate": 5.589870264566588e-06, + "loss": 0.3609, + "step": 3329 + }, + { + "epoch": 1.548597116726089, + "grad_norm": 0.4493039548397064, + "learning_rate": 5.587183137919332e-06, + "loss": 0.3628, + "step": 3330 + }, + { + "epoch": 1.5490621609052861, + "grad_norm": 0.4076448082923889, + "learning_rate": 5.584495839295674e-06, + "loss": 0.3429, + "step": 3331 + }, + { + "epoch": 1.549527205084483, + "grad_norm": 0.4267314374446869, + "learning_rate": 5.58180836948268e-06, + "loss": 0.3768, + "step": 3332 + }, + { + "epoch": 1.54999224926368, + "grad_norm": 0.4145706295967102, + "learning_rate": 5.579120729267463e-06, + "loss": 0.3438, + "step": 3333 + }, + { + "epoch": 1.550457293442877, + "grad_norm": 0.42286407947540283, + "learning_rate": 5.576432919437193e-06, + "loss": 0.3253, + "step": 3334 + }, + { + "epoch": 1.5509223376220742, + "grad_norm": 0.3949490785598755, + "learning_rate": 5.57374494077908e-06, + "loss": 0.346, + "step": 3335 + }, + { + "epoch": 1.5513873818012711, + "grad_norm": 0.44627898931503296, + "learning_rate": 5.571056794080396e-06, + "loss": 0.3721, + "step": 3336 + }, + { + "epoch": 1.551852425980468, + "grad_norm": 0.3899642527103424, + "learning_rate": 5.568368480128453e-06, + "loss": 0.3405, + "step": 3337 + }, + { + "epoch": 1.5523174701596651, + "grad_norm": 0.4098135530948639, + "learning_rate": 5.565679999710614e-06, + "loss": 0.3319, + "step": 3338 + }, + { + "epoch": 1.5527825143388623, + "grad_norm": 0.39709505438804626, + "learning_rate": 5.562991353614292e-06, + "loss": 0.3294, + "step": 3339 + }, + { + "epoch": 1.5532475585180592, + "grad_norm": 0.3686107099056244, + "learning_rate": 5.560302542626947e-06, + "loss": 0.359, + "step": 3340 + }, + { + "epoch": 1.553712602697256, + "grad_norm": 0.3666870892047882, + "learning_rate": 5.557613567536087e-06, + "loss": 0.3197, + "step": 3341 + }, + { + "epoch": 1.5541776468764532, + "grad_norm": 0.4867759048938751, + "learning_rate": 5.554924429129271e-06, + "loss": 0.3515, + "step": 3342 + }, + { + "epoch": 1.5546426910556503, + "grad_norm": 0.3786954879760742, + "learning_rate": 5.552235128194105e-06, + "loss": 0.3297, + "step": 3343 + }, + { + "epoch": 1.5551077352348472, + "grad_norm": 0.3850143849849701, + "learning_rate": 5.5495456655182376e-06, + "loss": 0.3379, + "step": 3344 + }, + { + "epoch": 1.5555727794140444, + "grad_norm": 0.4032987952232361, + "learning_rate": 5.546856041889374e-06, + "loss": 0.3273, + "step": 3345 + }, + { + "epoch": 1.5560378235932415, + "grad_norm": 0.4159895181655884, + "learning_rate": 5.544166258095256e-06, + "loss": 0.3851, + "step": 3346 + }, + { + "epoch": 1.5565028677724384, + "grad_norm": 0.41379231214523315, + "learning_rate": 5.54147631492368e-06, + "loss": 0.3465, + "step": 3347 + }, + { + "epoch": 1.5569679119516353, + "grad_norm": 0.36884400248527527, + "learning_rate": 5.538786213162487e-06, + "loss": 0.3264, + "step": 3348 + }, + { + "epoch": 1.5574329561308324, + "grad_norm": 0.3928481638431549, + "learning_rate": 5.536095953599565e-06, + "loss": 0.3351, + "step": 3349 + }, + { + "epoch": 1.5578980003100296, + "grad_norm": 0.4361253082752228, + "learning_rate": 5.533405537022846e-06, + "loss": 0.3428, + "step": 3350 + }, + { + "epoch": 1.5583630444892265, + "grad_norm": 0.36980322003364563, + "learning_rate": 5.530714964220308e-06, + "loss": 0.3544, + "step": 3351 + }, + { + "epoch": 1.5588280886684234, + "grad_norm": 0.3828973174095154, + "learning_rate": 5.528024235979978e-06, + "loss": 0.3581, + "step": 3352 + }, + { + "epoch": 1.5592931328476205, + "grad_norm": 0.3844209313392639, + "learning_rate": 5.525333353089926e-06, + "loss": 0.3084, + "step": 3353 + }, + { + "epoch": 1.5597581770268176, + "grad_norm": 0.4435608386993408, + "learning_rate": 5.522642316338268e-06, + "loss": 0.3475, + "step": 3354 + }, + { + "epoch": 1.5602232212060145, + "grad_norm": 0.36281177401542664, + "learning_rate": 5.519951126513164e-06, + "loss": 0.3257, + "step": 3355 + }, + { + "epoch": 1.5606882653852114, + "grad_norm": 0.4244144558906555, + "learning_rate": 5.517259784402823e-06, + "loss": 0.3716, + "step": 3356 + }, + { + "epoch": 1.5611533095644086, + "grad_norm": 0.39875200390815735, + "learning_rate": 5.514568290795492e-06, + "loss": 0.3374, + "step": 3357 + }, + { + "epoch": 1.5616183537436057, + "grad_norm": 0.42064738273620605, + "learning_rate": 5.511876646479466e-06, + "loss": 0.3643, + "step": 3358 + }, + { + "epoch": 1.5620833979228026, + "grad_norm": 0.4080541133880615, + "learning_rate": 5.509184852243084e-06, + "loss": 0.3452, + "step": 3359 + }, + { + "epoch": 1.5625484421019997, + "grad_norm": 0.3764135539531708, + "learning_rate": 5.5064929088747324e-06, + "loss": 0.3365, + "step": 3360 + }, + { + "epoch": 1.5630134862811969, + "grad_norm": 0.39057546854019165, + "learning_rate": 5.503800817162833e-06, + "loss": 0.3422, + "step": 3361 + }, + { + "epoch": 1.5634785304603938, + "grad_norm": 0.3925474286079407, + "learning_rate": 5.501108577895858e-06, + "loss": 0.3133, + "step": 3362 + }, + { + "epoch": 1.5639435746395907, + "grad_norm": 0.4536259174346924, + "learning_rate": 5.49841619186232e-06, + "loss": 0.3756, + "step": 3363 + }, + { + "epoch": 1.5644086188187878, + "grad_norm": 0.4168597161769867, + "learning_rate": 5.495723659850776e-06, + "loss": 0.3313, + "step": 3364 + }, + { + "epoch": 1.564873662997985, + "grad_norm": 0.35290852189064026, + "learning_rate": 5.493030982649823e-06, + "loss": 0.3392, + "step": 3365 + }, + { + "epoch": 1.5653387071771818, + "grad_norm": 0.36755627393722534, + "learning_rate": 5.4903381610481034e-06, + "loss": 0.3271, + "step": 3366 + }, + { + "epoch": 1.5658037513563787, + "grad_norm": 0.42110899090766907, + "learning_rate": 5.487645195834302e-06, + "loss": 0.3726, + "step": 3367 + }, + { + "epoch": 1.5662687955355759, + "grad_norm": 0.3488036096096039, + "learning_rate": 5.484952087797144e-06, + "loss": 0.3097, + "step": 3368 + }, + { + "epoch": 1.566733839714773, + "grad_norm": 0.4092177152633667, + "learning_rate": 5.482258837725397e-06, + "loss": 0.3727, + "step": 3369 + }, + { + "epoch": 1.5671988838939699, + "grad_norm": 0.3953334093093872, + "learning_rate": 5.479565446407867e-06, + "loss": 0.3146, + "step": 3370 + }, + { + "epoch": 1.5676639280731668, + "grad_norm": 0.3620423376560211, + "learning_rate": 5.47687191463341e-06, + "loss": 0.3622, + "step": 3371 + }, + { + "epoch": 1.5681289722523641, + "grad_norm": 0.3988654613494873, + "learning_rate": 5.4741782431909144e-06, + "loss": 0.345, + "step": 3372 + }, + { + "epoch": 1.568594016431561, + "grad_norm": 0.3772416114807129, + "learning_rate": 5.471484432869314e-06, + "loss": 0.3463, + "step": 3373 + }, + { + "epoch": 1.569059060610758, + "grad_norm": 0.4692382216453552, + "learning_rate": 5.4687904844575814e-06, + "loss": 0.3523, + "step": 3374 + }, + { + "epoch": 1.569524104789955, + "grad_norm": 0.38080427050590515, + "learning_rate": 5.46609639874473e-06, + "loss": 0.3665, + "step": 3375 + }, + { + "epoch": 1.5699891489691522, + "grad_norm": 0.3956538438796997, + "learning_rate": 5.4634021765198135e-06, + "loss": 0.3536, + "step": 3376 + }, + { + "epoch": 1.570454193148349, + "grad_norm": 0.46239396929740906, + "learning_rate": 5.460707818571928e-06, + "loss": 0.3549, + "step": 3377 + }, + { + "epoch": 1.570919237327546, + "grad_norm": 0.42876169085502625, + "learning_rate": 5.458013325690205e-06, + "loss": 0.3599, + "step": 3378 + }, + { + "epoch": 1.5713842815067431, + "grad_norm": 0.3865092992782593, + "learning_rate": 5.455318698663819e-06, + "loss": 0.3271, + "step": 3379 + }, + { + "epoch": 1.5718493256859403, + "grad_norm": 0.40359625220298767, + "learning_rate": 5.452623938281983e-06, + "loss": 0.3451, + "step": 3380 + }, + { + "epoch": 1.5723143698651372, + "grad_norm": 0.4457720220088959, + "learning_rate": 5.449929045333946e-06, + "loss": 0.3897, + "step": 3381 + }, + { + "epoch": 1.572779414044334, + "grad_norm": 0.4244859516620636, + "learning_rate": 5.447234020608999e-06, + "loss": 0.3683, + "step": 3382 + }, + { + "epoch": 1.5732444582235312, + "grad_norm": 0.37125489115715027, + "learning_rate": 5.444538864896472e-06, + "loss": 0.3132, + "step": 3383 + }, + { + "epoch": 1.5737095024027283, + "grad_norm": 0.3960449993610382, + "learning_rate": 5.441843578985735e-06, + "loss": 0.3133, + "step": 3384 + }, + { + "epoch": 1.5741745465819252, + "grad_norm": 0.4028528034687042, + "learning_rate": 5.439148163666188e-06, + "loss": 0.367, + "step": 3385 + }, + { + "epoch": 1.5746395907611221, + "grad_norm": 0.38724517822265625, + "learning_rate": 5.436452619727278e-06, + "loss": 0.3709, + "step": 3386 + }, + { + "epoch": 1.5751046349403195, + "grad_norm": 0.41816893219947815, + "learning_rate": 5.4337569479584866e-06, + "loss": 0.3545, + "step": 3387 + }, + { + "epoch": 1.5755696791195164, + "grad_norm": 0.3713048994541168, + "learning_rate": 5.431061149149327e-06, + "loss": 0.3023, + "step": 3388 + }, + { + "epoch": 1.5760347232987133, + "grad_norm": 0.37138456106185913, + "learning_rate": 5.428365224089362e-06, + "loss": 0.3382, + "step": 3389 + }, + { + "epoch": 1.5764997674779104, + "grad_norm": 0.39625900983810425, + "learning_rate": 5.425669173568179e-06, + "loss": 0.3549, + "step": 3390 + }, + { + "epoch": 1.5769648116571076, + "grad_norm": 0.4074704349040985, + "learning_rate": 5.42297299837541e-06, + "loss": 0.3267, + "step": 3391 + }, + { + "epoch": 1.5774298558363045, + "grad_norm": 0.41984236240386963, + "learning_rate": 5.42027669930072e-06, + "loss": 0.3526, + "step": 3392 + }, + { + "epoch": 1.5778949000155014, + "grad_norm": 0.38742825388908386, + "learning_rate": 5.417580277133812e-06, + "loss": 0.3161, + "step": 3393 + }, + { + "epoch": 1.5783599441946985, + "grad_norm": 0.41518786549568176, + "learning_rate": 5.414883732664422e-06, + "loss": 0.3263, + "step": 3394 + }, + { + "epoch": 1.5788249883738956, + "grad_norm": 0.41794711351394653, + "learning_rate": 5.412187066682327e-06, + "loss": 0.3718, + "step": 3395 + }, + { + "epoch": 1.5792900325530925, + "grad_norm": 0.42802178859710693, + "learning_rate": 5.409490279977335e-06, + "loss": 0.3883, + "step": 3396 + }, + { + "epoch": 1.5797550767322894, + "grad_norm": 0.3825167715549469, + "learning_rate": 5.406793373339292e-06, + "loss": 0.3426, + "step": 3397 + }, + { + "epoch": 1.5802201209114866, + "grad_norm": 0.3995634913444519, + "learning_rate": 5.404096347558078e-06, + "loss": 0.3749, + "step": 3398 + }, + { + "epoch": 1.5806851650906837, + "grad_norm": 0.35901325941085815, + "learning_rate": 5.4013992034236065e-06, + "loss": 0.3438, + "step": 3399 + }, + { + "epoch": 1.5811502092698806, + "grad_norm": 0.3686612546443939, + "learning_rate": 5.398701941725827e-06, + "loss": 0.3198, + "step": 3400 + }, + { + "epoch": 1.5816152534490775, + "grad_norm": 0.3947772979736328, + "learning_rate": 5.396004563254728e-06, + "loss": 0.3274, + "step": 3401 + }, + { + "epoch": 1.5820802976282748, + "grad_norm": 0.45323094725608826, + "learning_rate": 5.393307068800322e-06, + "loss": 0.4098, + "step": 3402 + }, + { + "epoch": 1.5825453418074718, + "grad_norm": 0.346132755279541, + "learning_rate": 5.390609459152666e-06, + "loss": 0.3316, + "step": 3403 + }, + { + "epoch": 1.5830103859866687, + "grad_norm": 0.39992961287498474, + "learning_rate": 5.387911735101845e-06, + "loss": 0.3548, + "step": 3404 + }, + { + "epoch": 1.5834754301658658, + "grad_norm": 0.40256255865097046, + "learning_rate": 5.385213897437975e-06, + "loss": 0.3207, + "step": 3405 + }, + { + "epoch": 1.583940474345063, + "grad_norm": 0.428801029920578, + "learning_rate": 5.3825159469512135e-06, + "loss": 0.3389, + "step": 3406 + }, + { + "epoch": 1.5844055185242598, + "grad_norm": 0.40780290961265564, + "learning_rate": 5.3798178844317435e-06, + "loss": 0.3221, + "step": 3407 + }, + { + "epoch": 1.5848705627034567, + "grad_norm": 0.38632360100746155, + "learning_rate": 5.377119710669785e-06, + "loss": 0.3096, + "step": 3408 + }, + { + "epoch": 1.5853356068826538, + "grad_norm": 0.45682859420776367, + "learning_rate": 5.374421426455589e-06, + "loss": 0.3737, + "step": 3409 + }, + { + "epoch": 1.585800651061851, + "grad_norm": 0.3837336599826813, + "learning_rate": 5.371723032579439e-06, + "loss": 0.3283, + "step": 3410 + }, + { + "epoch": 1.5862656952410479, + "grad_norm": 0.4327690303325653, + "learning_rate": 5.369024529831649e-06, + "loss": 0.3497, + "step": 3411 + }, + { + "epoch": 1.5867307394202448, + "grad_norm": 0.4153764843940735, + "learning_rate": 5.366325919002569e-06, + "loss": 0.3354, + "step": 3412 + }, + { + "epoch": 1.587195783599442, + "grad_norm": 0.399143785238266, + "learning_rate": 5.36362720088258e-06, + "loss": 0.3495, + "step": 3413 + }, + { + "epoch": 1.587660827778639, + "grad_norm": 0.40303295850753784, + "learning_rate": 5.3609283762620875e-06, + "loss": 0.3219, + "step": 3414 + }, + { + "epoch": 1.588125871957836, + "grad_norm": 0.40428194403648376, + "learning_rate": 5.358229445931538e-06, + "loss": 0.3199, + "step": 3415 + }, + { + "epoch": 1.588590916137033, + "grad_norm": 0.41889360547065735, + "learning_rate": 5.355530410681402e-06, + "loss": 0.3695, + "step": 3416 + }, + { + "epoch": 1.5890559603162302, + "grad_norm": 0.43096888065338135, + "learning_rate": 5.352831271302183e-06, + "loss": 0.3836, + "step": 3417 + }, + { + "epoch": 1.589521004495427, + "grad_norm": 0.40473613142967224, + "learning_rate": 5.350132028584416e-06, + "loss": 0.3579, + "step": 3418 + }, + { + "epoch": 1.589986048674624, + "grad_norm": 0.39574551582336426, + "learning_rate": 5.3474326833186656e-06, + "loss": 0.3357, + "step": 3419 + }, + { + "epoch": 1.5904510928538211, + "grad_norm": 0.38140177726745605, + "learning_rate": 5.344733236295525e-06, + "loss": 0.3498, + "step": 3420 + }, + { + "epoch": 1.5909161370330183, + "grad_norm": 0.4418228566646576, + "learning_rate": 5.34203368830562e-06, + "loss": 0.3608, + "step": 3421 + }, + { + "epoch": 1.5913811812122152, + "grad_norm": 0.34866416454315186, + "learning_rate": 5.339334040139603e-06, + "loss": 0.3159, + "step": 3422 + }, + { + "epoch": 1.591846225391412, + "grad_norm": 0.4378582239151001, + "learning_rate": 5.336634292588156e-06, + "loss": 0.4073, + "step": 3423 + }, + { + "epoch": 1.5923112695706092, + "grad_norm": 0.3770564794540405, + "learning_rate": 5.333934446441994e-06, + "loss": 0.3165, + "step": 3424 + }, + { + "epoch": 1.5927763137498063, + "grad_norm": 0.4754953682422638, + "learning_rate": 5.3312345024918575e-06, + "loss": 0.4054, + "step": 3425 + }, + { + "epoch": 1.5932413579290032, + "grad_norm": 0.3792072534561157, + "learning_rate": 5.328534461528515e-06, + "loss": 0.2908, + "step": 3426 + }, + { + "epoch": 1.5937064021082001, + "grad_norm": 0.4275760054588318, + "learning_rate": 5.325834324342765e-06, + "loss": 0.3807, + "step": 3427 + }, + { + "epoch": 1.5941714462873973, + "grad_norm": 0.4156563878059387, + "learning_rate": 5.323134091725434e-06, + "loss": 0.3607, + "step": 3428 + }, + { + "epoch": 1.5946364904665944, + "grad_norm": 0.3854392170906067, + "learning_rate": 5.320433764467375e-06, + "loss": 0.3535, + "step": 3429 + }, + { + "epoch": 1.5951015346457913, + "grad_norm": 0.40652233362197876, + "learning_rate": 5.3177333433594734e-06, + "loss": 0.338, + "step": 3430 + }, + { + "epoch": 1.5955665788249884, + "grad_norm": 0.4638773202896118, + "learning_rate": 5.315032829192636e-06, + "loss": 0.3495, + "step": 3431 + }, + { + "epoch": 1.5960316230041856, + "grad_norm": 0.40089505910873413, + "learning_rate": 5.312332222757799e-06, + "loss": 0.309, + "step": 3432 + }, + { + "epoch": 1.5964966671833825, + "grad_norm": 0.41283005475997925, + "learning_rate": 5.309631524845929e-06, + "loss": 0.3703, + "step": 3433 + }, + { + "epoch": 1.5969617113625794, + "grad_norm": 0.4164558947086334, + "learning_rate": 5.306930736248013e-06, + "loss": 0.3475, + "step": 3434 + }, + { + "epoch": 1.5974267555417765, + "grad_norm": 0.38702747225761414, + "learning_rate": 5.30422985775507e-06, + "loss": 0.3319, + "step": 3435 + }, + { + "epoch": 1.5978917997209736, + "grad_norm": 0.3754745423793793, + "learning_rate": 5.301528890158143e-06, + "loss": 0.3057, + "step": 3436 + }, + { + "epoch": 1.5983568439001705, + "grad_norm": 0.41013574600219727, + "learning_rate": 5.298827834248303e-06, + "loss": 0.3675, + "step": 3437 + }, + { + "epoch": 1.5988218880793674, + "grad_norm": 0.43470141291618347, + "learning_rate": 5.296126690816644e-06, + "loss": 0.3695, + "step": 3438 + }, + { + "epoch": 1.5992869322585646, + "grad_norm": 0.42870068550109863, + "learning_rate": 5.293425460654288e-06, + "loss": 0.3576, + "step": 3439 + }, + { + "epoch": 1.5997519764377617, + "grad_norm": 0.3700660765171051, + "learning_rate": 5.290724144552379e-06, + "loss": 0.3345, + "step": 3440 + }, + { + "epoch": 1.6002170206169586, + "grad_norm": 0.37624475359916687, + "learning_rate": 5.288022743302093e-06, + "loss": 0.3434, + "step": 3441 + }, + { + "epoch": 1.6006820647961555, + "grad_norm": 0.4745047092437744, + "learning_rate": 5.2853212576946225e-06, + "loss": 0.367, + "step": 3442 + }, + { + "epoch": 1.6011471089753526, + "grad_norm": 0.3658214807510376, + "learning_rate": 5.282619688521189e-06, + "loss": 0.3596, + "step": 3443 + }, + { + "epoch": 1.6016121531545497, + "grad_norm": 0.44426223635673523, + "learning_rate": 5.2799180365730405e-06, + "loss": 0.3377, + "step": 3444 + }, + { + "epoch": 1.6020771973337466, + "grad_norm": 0.3807421326637268, + "learning_rate": 5.2772163026414455e-06, + "loss": 0.3378, + "step": 3445 + }, + { + "epoch": 1.6025422415129438, + "grad_norm": 0.4405421316623688, + "learning_rate": 5.274514487517698e-06, + "loss": 0.3689, + "step": 3446 + }, + { + "epoch": 1.603007285692141, + "grad_norm": 0.36217546463012695, + "learning_rate": 5.271812591993116e-06, + "loss": 0.3242, + "step": 3447 + }, + { + "epoch": 1.6034723298713378, + "grad_norm": 0.4049311876296997, + "learning_rate": 5.269110616859041e-06, + "loss": 0.3661, + "step": 3448 + }, + { + "epoch": 1.6039373740505347, + "grad_norm": 0.4050389230251312, + "learning_rate": 5.266408562906838e-06, + "loss": 0.3447, + "step": 3449 + }, + { + "epoch": 1.6044024182297318, + "grad_norm": 0.40756580233573914, + "learning_rate": 5.263706430927895e-06, + "loss": 0.3566, + "step": 3450 + }, + { + "epoch": 1.604867462408929, + "grad_norm": 0.3964058458805084, + "learning_rate": 5.261004221713621e-06, + "loss": 0.3193, + "step": 3451 + }, + { + "epoch": 1.6053325065881259, + "grad_norm": 0.439263254404068, + "learning_rate": 5.258301936055449e-06, + "loss": 0.359, + "step": 3452 + }, + { + "epoch": 1.6057975507673228, + "grad_norm": 0.4526593089103699, + "learning_rate": 5.255599574744836e-06, + "loss": 0.3515, + "step": 3453 + }, + { + "epoch": 1.60626259494652, + "grad_norm": 0.43033599853515625, + "learning_rate": 5.252897138573261e-06, + "loss": 0.3492, + "step": 3454 + }, + { + "epoch": 1.606727639125717, + "grad_norm": 0.43363288044929504, + "learning_rate": 5.2501946283322204e-06, + "loss": 0.3523, + "step": 3455 + }, + { + "epoch": 1.607192683304914, + "grad_norm": 0.3905112147331238, + "learning_rate": 5.247492044813237e-06, + "loss": 0.3388, + "step": 3456 + }, + { + "epoch": 1.6076577274841108, + "grad_norm": 0.45133721828460693, + "learning_rate": 5.244789388807855e-06, + "loss": 0.3748, + "step": 3457 + }, + { + "epoch": 1.608122771663308, + "grad_norm": 0.5364686250686646, + "learning_rate": 5.242086661107635e-06, + "loss": 0.3814, + "step": 3458 + }, + { + "epoch": 1.608587815842505, + "grad_norm": 0.4719644784927368, + "learning_rate": 5.239383862504166e-06, + "loss": 0.3346, + "step": 3459 + }, + { + "epoch": 1.609052860021702, + "grad_norm": 0.4059002697467804, + "learning_rate": 5.236680993789052e-06, + "loss": 0.3241, + "step": 3460 + }, + { + "epoch": 1.6095179042008991, + "grad_norm": 0.4945448040962219, + "learning_rate": 5.2339780557539185e-06, + "loss": 0.3588, + "step": 3461 + }, + { + "epoch": 1.6099829483800963, + "grad_norm": 0.49811092019081116, + "learning_rate": 5.231275049190414e-06, + "loss": 0.3167, + "step": 3462 + }, + { + "epoch": 1.6104479925592932, + "grad_norm": 0.47977715730667114, + "learning_rate": 5.228571974890204e-06, + "loss": 0.3564, + "step": 3463 + }, + { + "epoch": 1.61091303673849, + "grad_norm": 0.39883333444595337, + "learning_rate": 5.225868833644973e-06, + "loss": 0.3644, + "step": 3464 + }, + { + "epoch": 1.6113780809176872, + "grad_norm": 0.38860994577407837, + "learning_rate": 5.223165626246432e-06, + "loss": 0.347, + "step": 3465 + }, + { + "epoch": 1.6118431250968843, + "grad_norm": 0.4335678815841675, + "learning_rate": 5.220462353486304e-06, + "loss": 0.3534, + "step": 3466 + }, + { + "epoch": 1.6123081692760812, + "grad_norm": 0.4631887376308441, + "learning_rate": 5.217759016156333e-06, + "loss": 0.3494, + "step": 3467 + }, + { + "epoch": 1.6127732134552781, + "grad_norm": 0.4094836711883545, + "learning_rate": 5.215055615048283e-06, + "loss": 0.3841, + "step": 3468 + }, + { + "epoch": 1.6132382576344753, + "grad_norm": 0.36185774207115173, + "learning_rate": 5.2123521509539375e-06, + "loss": 0.3246, + "step": 3469 + }, + { + "epoch": 1.6137033018136724, + "grad_norm": 0.46863463521003723, + "learning_rate": 5.209648624665095e-06, + "loss": 0.3801, + "step": 3470 + }, + { + "epoch": 1.6141683459928693, + "grad_norm": 0.4251573085784912, + "learning_rate": 5.206945036973577e-06, + "loss": 0.3449, + "step": 3471 + }, + { + "epoch": 1.6146333901720662, + "grad_norm": 0.4120125472545624, + "learning_rate": 5.2042413886712176e-06, + "loss": 0.3312, + "step": 3472 + }, + { + "epoch": 1.6150984343512633, + "grad_norm": 0.41511979699134827, + "learning_rate": 5.201537680549874e-06, + "loss": 0.3495, + "step": 3473 + }, + { + "epoch": 1.6155634785304605, + "grad_norm": 0.43771082162857056, + "learning_rate": 5.198833913401418e-06, + "loss": 0.3843, + "step": 3474 + }, + { + "epoch": 1.6160285227096574, + "grad_norm": 0.39838284254074097, + "learning_rate": 5.196130088017737e-06, + "loss": 0.308, + "step": 3475 + }, + { + "epoch": 1.6164935668888545, + "grad_norm": 0.4643276631832123, + "learning_rate": 5.19342620519074e-06, + "loss": 0.351, + "step": 3476 + }, + { + "epoch": 1.6169586110680516, + "grad_norm": 0.34968698024749756, + "learning_rate": 5.190722265712349e-06, + "loss": 0.3099, + "step": 3477 + }, + { + "epoch": 1.6174236552472485, + "grad_norm": 0.40132972598075867, + "learning_rate": 5.188018270374504e-06, + "loss": 0.3358, + "step": 3478 + }, + { + "epoch": 1.6178886994264454, + "grad_norm": 0.3743666410446167, + "learning_rate": 5.185314219969163e-06, + "loss": 0.3669, + "step": 3479 + }, + { + "epoch": 1.6183537436056425, + "grad_norm": 0.3797807991504669, + "learning_rate": 5.182610115288296e-06, + "loss": 0.3471, + "step": 3480 + }, + { + "epoch": 1.6188187877848397, + "grad_norm": 0.43480437994003296, + "learning_rate": 5.179905957123891e-06, + "loss": 0.3913, + "step": 3481 + }, + { + "epoch": 1.6192838319640366, + "grad_norm": 0.3873792886734009, + "learning_rate": 5.177201746267955e-06, + "loss": 0.354, + "step": 3482 + }, + { + "epoch": 1.6197488761432335, + "grad_norm": 0.38425004482269287, + "learning_rate": 5.174497483512506e-06, + "loss": 0.3294, + "step": 3483 + }, + { + "epoch": 1.6202139203224306, + "grad_norm": 0.4359165132045746, + "learning_rate": 5.171793169649578e-06, + "loss": 0.3585, + "step": 3484 + }, + { + "epoch": 1.6206789645016277, + "grad_norm": 0.3433784246444702, + "learning_rate": 5.1690888054712215e-06, + "loss": 0.3329, + "step": 3485 + }, + { + "epoch": 1.6211440086808246, + "grad_norm": 0.4633955657482147, + "learning_rate": 5.1663843917695e-06, + "loss": 0.3745, + "step": 3486 + }, + { + "epoch": 1.6216090528600215, + "grad_norm": 0.44534632563591003, + "learning_rate": 5.163679929336491e-06, + "loss": 0.3692, + "step": 3487 + }, + { + "epoch": 1.6220740970392187, + "grad_norm": 0.3649267852306366, + "learning_rate": 5.160975418964293e-06, + "loss": 0.362, + "step": 3488 + }, + { + "epoch": 1.6225391412184158, + "grad_norm": 0.3883490264415741, + "learning_rate": 5.158270861445007e-06, + "loss": 0.3561, + "step": 3489 + }, + { + "epoch": 1.6230041853976127, + "grad_norm": 0.3937935531139374, + "learning_rate": 5.155566257570758e-06, + "loss": 0.32, + "step": 3490 + }, + { + "epoch": 1.6234692295768098, + "grad_norm": 0.36953264474868774, + "learning_rate": 5.152861608133678e-06, + "loss": 0.362, + "step": 3491 + }, + { + "epoch": 1.623934273756007, + "grad_norm": 0.49353107810020447, + "learning_rate": 5.150156913925916e-06, + "loss": 0.3646, + "step": 3492 + }, + { + "epoch": 1.6243993179352039, + "grad_norm": 0.393684446811676, + "learning_rate": 5.147452175739633e-06, + "loss": 0.3571, + "step": 3493 + }, + { + "epoch": 1.6248643621144008, + "grad_norm": 0.369238942861557, + "learning_rate": 5.144747394367002e-06, + "loss": 0.3446, + "step": 3494 + }, + { + "epoch": 1.625329406293598, + "grad_norm": 0.3968108296394348, + "learning_rate": 5.142042570600212e-06, + "loss": 0.3458, + "step": 3495 + }, + { + "epoch": 1.625794450472795, + "grad_norm": 0.44049766659736633, + "learning_rate": 5.139337705231459e-06, + "loss": 0.3774, + "step": 3496 + }, + { + "epoch": 1.626259494651992, + "grad_norm": 0.37539348006248474, + "learning_rate": 5.136632799052957e-06, + "loss": 0.3373, + "step": 3497 + }, + { + "epoch": 1.6267245388311888, + "grad_norm": 0.4646461009979248, + "learning_rate": 5.133927852856927e-06, + "loss": 0.352, + "step": 3498 + }, + { + "epoch": 1.627189583010386, + "grad_norm": 0.4138670861721039, + "learning_rate": 5.131222867435602e-06, + "loss": 0.3393, + "step": 3499 + }, + { + "epoch": 1.627654627189583, + "grad_norm": 0.40055084228515625, + "learning_rate": 5.128517843581233e-06, + "loss": 0.3605, + "step": 3500 + }, + { + "epoch": 1.62811967136878, + "grad_norm": 0.4990515112876892, + "learning_rate": 5.125812782086075e-06, + "loss": 0.338, + "step": 3501 + }, + { + "epoch": 1.628584715547977, + "grad_norm": 0.49589017033576965, + "learning_rate": 5.123107683742397e-06, + "loss": 0.3181, + "step": 3502 + }, + { + "epoch": 1.629049759727174, + "grad_norm": 0.4138791263103485, + "learning_rate": 5.1204025493424766e-06, + "loss": 0.3751, + "step": 3503 + }, + { + "epoch": 1.6295148039063712, + "grad_norm": 0.38624855875968933, + "learning_rate": 5.117697379678606e-06, + "loss": 0.3404, + "step": 3504 + }, + { + "epoch": 1.629979848085568, + "grad_norm": 0.5305654406547546, + "learning_rate": 5.114992175543084e-06, + "loss": 0.3786, + "step": 3505 + }, + { + "epoch": 1.6304448922647652, + "grad_norm": 0.4678351581096649, + "learning_rate": 5.112286937728223e-06, + "loss": 0.3552, + "step": 3506 + }, + { + "epoch": 1.6309099364439623, + "grad_norm": 0.4201079308986664, + "learning_rate": 5.109581667026341e-06, + "loss": 0.388, + "step": 3507 + }, + { + "epoch": 1.6313749806231592, + "grad_norm": 0.39116141200065613, + "learning_rate": 5.106876364229768e-06, + "loss": 0.338, + "step": 3508 + }, + { + "epoch": 1.6318400248023561, + "grad_norm": 0.3895026743412018, + "learning_rate": 5.104171030130846e-06, + "loss": 0.3374, + "step": 3509 + }, + { + "epoch": 1.6323050689815533, + "grad_norm": 0.3727634847164154, + "learning_rate": 5.101465665521919e-06, + "loss": 0.372, + "step": 3510 + }, + { + "epoch": 1.6327701131607504, + "grad_norm": 0.3340446352958679, + "learning_rate": 5.098760271195348e-06, + "loss": 0.3047, + "step": 3511 + }, + { + "epoch": 1.6332351573399473, + "grad_norm": 0.42071983218193054, + "learning_rate": 5.096054847943498e-06, + "loss": 0.3951, + "step": 3512 + }, + { + "epoch": 1.6337002015191442, + "grad_norm": 0.3533824682235718, + "learning_rate": 5.093349396558744e-06, + "loss": 0.3344, + "step": 3513 + }, + { + "epoch": 1.6341652456983413, + "grad_norm": 0.44178301095962524, + "learning_rate": 5.090643917833465e-06, + "loss": 0.3444, + "step": 3514 + }, + { + "epoch": 1.6346302898775384, + "grad_norm": 0.42436400055885315, + "learning_rate": 5.0879384125600565e-06, + "loss": 0.3877, + "step": 3515 + }, + { + "epoch": 1.6350953340567354, + "grad_norm": 0.4048612415790558, + "learning_rate": 5.085232881530916e-06, + "loss": 0.3495, + "step": 3516 + }, + { + "epoch": 1.6355603782359323, + "grad_norm": 0.3634321689605713, + "learning_rate": 5.082527325538449e-06, + "loss": 0.3845, + "step": 3517 + }, + { + "epoch": 1.6360254224151294, + "grad_norm": 0.3633408546447754, + "learning_rate": 5.0798217453750675e-06, + "loss": 0.3352, + "step": 3518 + }, + { + "epoch": 1.6364904665943265, + "grad_norm": 0.4096944034099579, + "learning_rate": 5.077116141833195e-06, + "loss": 0.3372, + "step": 3519 + }, + { + "epoch": 1.6369555107735234, + "grad_norm": 0.37343043088912964, + "learning_rate": 5.074410515705256e-06, + "loss": 0.345, + "step": 3520 + }, + { + "epoch": 1.6374205549527205, + "grad_norm": 0.4189586639404297, + "learning_rate": 5.071704867783684e-06, + "loss": 0.3634, + "step": 3521 + }, + { + "epoch": 1.6378855991319177, + "grad_norm": 0.358604371547699, + "learning_rate": 5.068999198860924e-06, + "loss": 0.3367, + "step": 3522 + }, + { + "epoch": 1.6383506433111146, + "grad_norm": 0.3955291509628296, + "learning_rate": 5.066293509729418e-06, + "loss": 0.3635, + "step": 3523 + }, + { + "epoch": 1.6388156874903115, + "grad_norm": 0.3491485118865967, + "learning_rate": 5.063587801181621e-06, + "loss": 0.3198, + "step": 3524 + }, + { + "epoch": 1.6392807316695086, + "grad_norm": 0.39579418301582336, + "learning_rate": 5.060882074009988e-06, + "loss": 0.3553, + "step": 3525 + }, + { + "epoch": 1.6397457758487057, + "grad_norm": 0.404255747795105, + "learning_rate": 5.0581763290069865e-06, + "loss": 0.3354, + "step": 3526 + }, + { + "epoch": 1.6402108200279026, + "grad_norm": 0.38574931025505066, + "learning_rate": 5.055470566965082e-06, + "loss": 0.3344, + "step": 3527 + }, + { + "epoch": 1.6406758642070995, + "grad_norm": 0.3492719829082489, + "learning_rate": 5.052764788676749e-06, + "loss": 0.3031, + "step": 3528 + }, + { + "epoch": 1.6411409083862967, + "grad_norm": 0.4036017954349518, + "learning_rate": 5.050058994934467e-06, + "loss": 0.3784, + "step": 3529 + }, + { + "epoch": 1.6416059525654938, + "grad_norm": 0.4062303900718689, + "learning_rate": 5.047353186530718e-06, + "loss": 0.38, + "step": 3530 + }, + { + "epoch": 1.6420709967446907, + "grad_norm": 0.3702806532382965, + "learning_rate": 5.04464736425799e-06, + "loss": 0.3046, + "step": 3531 + }, + { + "epoch": 1.6425360409238876, + "grad_norm": 0.40208354592323303, + "learning_rate": 5.0419415289087755e-06, + "loss": 0.3609, + "step": 3532 + }, + { + "epoch": 1.643001085103085, + "grad_norm": 0.35900288820266724, + "learning_rate": 5.039235681275568e-06, + "loss": 0.305, + "step": 3533 + }, + { + "epoch": 1.6434661292822819, + "grad_norm": 0.4107772707939148, + "learning_rate": 5.036529822150865e-06, + "loss": 0.367, + "step": 3534 + }, + { + "epoch": 1.6439311734614788, + "grad_norm": 0.36534854769706726, + "learning_rate": 5.033823952327173e-06, + "loss": 0.3382, + "step": 3535 + }, + { + "epoch": 1.644396217640676, + "grad_norm": 0.3700648844242096, + "learning_rate": 5.031118072596993e-06, + "loss": 0.3338, + "step": 3536 + }, + { + "epoch": 1.644861261819873, + "grad_norm": 0.3885965943336487, + "learning_rate": 5.028412183752835e-06, + "loss": 0.3292, + "step": 3537 + }, + { + "epoch": 1.64532630599907, + "grad_norm": 0.49532386660575867, + "learning_rate": 5.025706286587211e-06, + "loss": 0.3665, + "step": 3538 + }, + { + "epoch": 1.6457913501782668, + "grad_norm": 0.42976298928260803, + "learning_rate": 5.023000381892633e-06, + "loss": 0.3094, + "step": 3539 + }, + { + "epoch": 1.646256394357464, + "grad_norm": 0.4339199662208557, + "learning_rate": 5.020294470461615e-06, + "loss": 0.3615, + "step": 3540 + }, + { + "epoch": 1.646721438536661, + "grad_norm": 0.4408279061317444, + "learning_rate": 5.017588553086677e-06, + "loss": 0.3422, + "step": 3541 + }, + { + "epoch": 1.647186482715858, + "grad_norm": 0.4677359163761139, + "learning_rate": 5.014882630560339e-06, + "loss": 0.3658, + "step": 3542 + }, + { + "epoch": 1.647651526895055, + "grad_norm": 0.416978657245636, + "learning_rate": 5.01217670367512e-06, + "loss": 0.3575, + "step": 3543 + }, + { + "epoch": 1.648116571074252, + "grad_norm": 0.42392852902412415, + "learning_rate": 5.009470773223541e-06, + "loss": 0.3341, + "step": 3544 + }, + { + "epoch": 1.6485816152534492, + "grad_norm": 0.48418161273002625, + "learning_rate": 5.006764839998128e-06, + "loss": 0.3392, + "step": 3545 + }, + { + "epoch": 1.649046659432646, + "grad_norm": 0.4479508697986603, + "learning_rate": 5.004058904791402e-06, + "loss": 0.3764, + "step": 3546 + }, + { + "epoch": 1.649511703611843, + "grad_norm": 0.43179401755332947, + "learning_rate": 5.0013529683958885e-06, + "loss": 0.3507, + "step": 3547 + }, + { + "epoch": 1.6499767477910403, + "grad_norm": 0.3624599874019623, + "learning_rate": 4.998647031604114e-06, + "loss": 0.3289, + "step": 3548 + }, + { + "epoch": 1.6504417919702372, + "grad_norm": 0.46299293637275696, + "learning_rate": 4.995941095208599e-06, + "loss": 0.3391, + "step": 3549 + }, + { + "epoch": 1.6509068361494341, + "grad_norm": 0.41883838176727295, + "learning_rate": 4.993235160001874e-06, + "loss": 0.3454, + "step": 3550 + }, + { + "epoch": 1.6513718803286312, + "grad_norm": 0.40758463740348816, + "learning_rate": 4.99052922677646e-06, + "loss": 0.3604, + "step": 3551 + }, + { + "epoch": 1.6518369245078284, + "grad_norm": 0.4133349359035492, + "learning_rate": 4.987823296324882e-06, + "loss": 0.3388, + "step": 3552 + }, + { + "epoch": 1.6523019686870253, + "grad_norm": 0.4033692479133606, + "learning_rate": 4.985117369439661e-06, + "loss": 0.3454, + "step": 3553 + }, + { + "epoch": 1.6527670128662222, + "grad_norm": 0.38803398609161377, + "learning_rate": 4.982411446913324e-06, + "loss": 0.3475, + "step": 3554 + }, + { + "epoch": 1.6532320570454193, + "grad_norm": 0.4113628566265106, + "learning_rate": 4.979705529538385e-06, + "loss": 0.337, + "step": 3555 + }, + { + "epoch": 1.6536971012246164, + "grad_norm": 0.4451198875904083, + "learning_rate": 4.976999618107369e-06, + "loss": 0.3973, + "step": 3556 + }, + { + "epoch": 1.6541621454038133, + "grad_norm": 0.36026668548583984, + "learning_rate": 4.974293713412791e-06, + "loss": 0.3212, + "step": 3557 + }, + { + "epoch": 1.6546271895830102, + "grad_norm": 0.38637468218803406, + "learning_rate": 4.971587816247166e-06, + "loss": 0.3623, + "step": 3558 + }, + { + "epoch": 1.6550922337622074, + "grad_norm": 0.3913023769855499, + "learning_rate": 4.9688819274030074e-06, + "loss": 0.3304, + "step": 3559 + }, + { + "epoch": 1.6555572779414045, + "grad_norm": 0.3982096016407013, + "learning_rate": 4.96617604767283e-06, + "loss": 0.3727, + "step": 3560 + }, + { + "epoch": 1.6560223221206014, + "grad_norm": 0.3954141139984131, + "learning_rate": 4.963470177849135e-06, + "loss": 0.328, + "step": 3561 + }, + { + "epoch": 1.6564873662997985, + "grad_norm": 0.40723100304603577, + "learning_rate": 4.960764318724434e-06, + "loss": 0.344, + "step": 3562 + }, + { + "epoch": 1.6569524104789957, + "grad_norm": 0.35724228620529175, + "learning_rate": 4.958058471091225e-06, + "loss": 0.3379, + "step": 3563 + }, + { + "epoch": 1.6574174546581926, + "grad_norm": 0.3636278808116913, + "learning_rate": 4.9553526357420104e-06, + "loss": 0.3522, + "step": 3564 + }, + { + "epoch": 1.6578824988373895, + "grad_norm": 0.5159657597541809, + "learning_rate": 4.952646813469282e-06, + "loss": 0.3819, + "step": 3565 + }, + { + "epoch": 1.6583475430165866, + "grad_norm": 0.40651264786720276, + "learning_rate": 4.949941005065534e-06, + "loss": 0.3147, + "step": 3566 + }, + { + "epoch": 1.6588125871957837, + "grad_norm": 0.4187847673892975, + "learning_rate": 4.947235211323253e-06, + "loss": 0.3342, + "step": 3567 + }, + { + "epoch": 1.6592776313749806, + "grad_norm": 0.43306782841682434, + "learning_rate": 4.944529433034919e-06, + "loss": 0.3691, + "step": 3568 + }, + { + "epoch": 1.6597426755541775, + "grad_norm": 0.4201146960258484, + "learning_rate": 4.941823670993016e-06, + "loss": 0.3509, + "step": 3569 + }, + { + "epoch": 1.6602077197333747, + "grad_norm": 0.4412671625614166, + "learning_rate": 4.939117925990013e-06, + "loss": 0.3418, + "step": 3570 + }, + { + "epoch": 1.6606727639125718, + "grad_norm": 0.3967730700969696, + "learning_rate": 4.936412198818382e-06, + "loss": 0.3747, + "step": 3571 + }, + { + "epoch": 1.6611378080917687, + "grad_norm": 0.3809349834918976, + "learning_rate": 4.933706490270583e-06, + "loss": 0.3397, + "step": 3572 + }, + { + "epoch": 1.6616028522709656, + "grad_norm": 0.3735617697238922, + "learning_rate": 4.9310008011390774e-06, + "loss": 0.3398, + "step": 3573 + }, + { + "epoch": 1.6620678964501627, + "grad_norm": 0.44322773814201355, + "learning_rate": 4.9282951322163166e-06, + "loss": 0.3258, + "step": 3574 + }, + { + "epoch": 1.6625329406293599, + "grad_norm": 0.438362181186676, + "learning_rate": 4.925589484294747e-06, + "loss": 0.3926, + "step": 3575 + }, + { + "epoch": 1.6629979848085568, + "grad_norm": 0.36551767587661743, + "learning_rate": 4.922883858166807e-06, + "loss": 0.3317, + "step": 3576 + }, + { + "epoch": 1.6634630289877539, + "grad_norm": 0.39981698989868164, + "learning_rate": 4.920178254624935e-06, + "loss": 0.3266, + "step": 3577 + }, + { + "epoch": 1.663928073166951, + "grad_norm": 0.4227448105812073, + "learning_rate": 4.917472674461553e-06, + "loss": 0.3333, + "step": 3578 + }, + { + "epoch": 1.664393117346148, + "grad_norm": 0.4146755337715149, + "learning_rate": 4.9147671184690855e-06, + "loss": 0.3662, + "step": 3579 + }, + { + "epoch": 1.6648581615253448, + "grad_norm": 0.4139578342437744, + "learning_rate": 4.912061587439944e-06, + "loss": 0.3087, + "step": 3580 + }, + { + "epoch": 1.665323205704542, + "grad_norm": 0.4043808579444885, + "learning_rate": 4.9093560821665365e-06, + "loss": 0.3842, + "step": 3581 + }, + { + "epoch": 1.665788249883739, + "grad_norm": 0.3665468990802765, + "learning_rate": 4.906650603441259e-06, + "loss": 0.3199, + "step": 3582 + }, + { + "epoch": 1.666253294062936, + "grad_norm": 0.46738430857658386, + "learning_rate": 4.903945152056505e-06, + "loss": 0.374, + "step": 3583 + }, + { + "epoch": 1.666718338242133, + "grad_norm": 0.4571022689342499, + "learning_rate": 4.901239728804653e-06, + "loss": 0.3221, + "step": 3584 + }, + { + "epoch": 1.66718338242133, + "grad_norm": 0.41317233443260193, + "learning_rate": 4.8985343344780815e-06, + "loss": 0.3583, + "step": 3585 + }, + { + "epoch": 1.6676484266005271, + "grad_norm": 0.36751988530158997, + "learning_rate": 4.895828969869157e-06, + "loss": 0.3409, + "step": 3586 + }, + { + "epoch": 1.668113470779724, + "grad_norm": 0.44614648818969727, + "learning_rate": 4.8931236357702326e-06, + "loss": 0.3393, + "step": 3587 + }, + { + "epoch": 1.668578514958921, + "grad_norm": 0.3438377380371094, + "learning_rate": 4.89041833297366e-06, + "loss": 0.3349, + "step": 3588 + }, + { + "epoch": 1.669043559138118, + "grad_norm": 0.393669992685318, + "learning_rate": 4.88771306227178e-06, + "loss": 0.3534, + "step": 3589 + }, + { + "epoch": 1.6695086033173152, + "grad_norm": 0.40612417459487915, + "learning_rate": 4.885007824456917e-06, + "loss": 0.3435, + "step": 3590 + }, + { + "epoch": 1.6699736474965121, + "grad_norm": 0.3694112300872803, + "learning_rate": 4.882302620321395e-06, + "loss": 0.3275, + "step": 3591 + }, + { + "epoch": 1.6704386916757092, + "grad_norm": 0.3994767665863037, + "learning_rate": 4.879597450657525e-06, + "loss": 0.3593, + "step": 3592 + }, + { + "epoch": 1.6709037358549064, + "grad_norm": 0.3501642346382141, + "learning_rate": 4.876892316257605e-06, + "loss": 0.3165, + "step": 3593 + }, + { + "epoch": 1.6713687800341033, + "grad_norm": 0.3949240744113922, + "learning_rate": 4.874187217913926e-06, + "loss": 0.3407, + "step": 3594 + }, + { + "epoch": 1.6718338242133002, + "grad_norm": 0.4457712471485138, + "learning_rate": 4.871482156418769e-06, + "loss": 0.3615, + "step": 3595 + }, + { + "epoch": 1.6722988683924973, + "grad_norm": 0.36873969435691833, + "learning_rate": 4.868777132564398e-06, + "loss": 0.3671, + "step": 3596 + }, + { + "epoch": 1.6727639125716944, + "grad_norm": 0.35475531220436096, + "learning_rate": 4.866072147143075e-06, + "loss": 0.3593, + "step": 3597 + }, + { + "epoch": 1.6732289567508913, + "grad_norm": 0.3828467130661011, + "learning_rate": 4.863367200947044e-06, + "loss": 0.344, + "step": 3598 + }, + { + "epoch": 1.6736940009300882, + "grad_norm": 0.44382408261299133, + "learning_rate": 4.8606622947685415e-06, + "loss": 0.3121, + "step": 3599 + }, + { + "epoch": 1.6741590451092854, + "grad_norm": 0.4533607065677643, + "learning_rate": 4.857957429399788e-06, + "loss": 0.3396, + "step": 3600 + }, + { + "epoch": 1.6746240892884825, + "grad_norm": 0.4441719651222229, + "learning_rate": 4.855252605632999e-06, + "loss": 0.3897, + "step": 3601 + }, + { + "epoch": 1.6750891334676794, + "grad_norm": 0.36047327518463135, + "learning_rate": 4.852547824260369e-06, + "loss": 0.3307, + "step": 3602 + }, + { + "epoch": 1.6755541776468763, + "grad_norm": 0.39020687341690063, + "learning_rate": 4.849843086074085e-06, + "loss": 0.3056, + "step": 3603 + }, + { + "epoch": 1.6760192218260734, + "grad_norm": 0.5388873219490051, + "learning_rate": 4.847138391866325e-06, + "loss": 0.3605, + "step": 3604 + }, + { + "epoch": 1.6764842660052706, + "grad_norm": 0.4391537010669708, + "learning_rate": 4.8444337424292445e-06, + "loss": 0.3831, + "step": 3605 + }, + { + "epoch": 1.6769493101844675, + "grad_norm": 0.3435923457145691, + "learning_rate": 4.841729138554996e-06, + "loss": 0.3155, + "step": 3606 + }, + { + "epoch": 1.6774143543636646, + "grad_norm": 0.3751065135002136, + "learning_rate": 4.839024581035709e-06, + "loss": 0.3387, + "step": 3607 + }, + { + "epoch": 1.6778793985428617, + "grad_norm": 0.47324469685554504, + "learning_rate": 4.83632007066351e-06, + "loss": 0.3561, + "step": 3608 + }, + { + "epoch": 1.6783444427220586, + "grad_norm": 0.4418434202671051, + "learning_rate": 4.833615608230501e-06, + "loss": 0.3445, + "step": 3609 + }, + { + "epoch": 1.6788094869012555, + "grad_norm": 0.4009036719799042, + "learning_rate": 4.830911194528781e-06, + "loss": 0.3336, + "step": 3610 + }, + { + "epoch": 1.6792745310804527, + "grad_norm": 0.42753738164901733, + "learning_rate": 4.828206830350423e-06, + "loss": 0.3613, + "step": 3611 + }, + { + "epoch": 1.6797395752596498, + "grad_norm": 0.3864332139492035, + "learning_rate": 4.825502516487497e-06, + "loss": 0.3251, + "step": 3612 + }, + { + "epoch": 1.6802046194388467, + "grad_norm": 0.38312116265296936, + "learning_rate": 4.822798253732046e-06, + "loss": 0.3146, + "step": 3613 + }, + { + "epoch": 1.6806696636180436, + "grad_norm": 0.4273233413696289, + "learning_rate": 4.82009404287611e-06, + "loss": 0.34, + "step": 3614 + }, + { + "epoch": 1.6811347077972407, + "grad_norm": 0.4202836751937866, + "learning_rate": 4.817389884711706e-06, + "loss": 0.3703, + "step": 3615 + }, + { + "epoch": 1.6815997519764379, + "grad_norm": 0.4047132730484009, + "learning_rate": 4.81468578003084e-06, + "loss": 0.3389, + "step": 3616 + }, + { + "epoch": 1.6820647961556348, + "grad_norm": 0.4253717362880707, + "learning_rate": 4.8119817296254965e-06, + "loss": 0.3647, + "step": 3617 + }, + { + "epoch": 1.6825298403348317, + "grad_norm": 0.4105294644832611, + "learning_rate": 4.809277734287654e-06, + "loss": 0.3298, + "step": 3618 + }, + { + "epoch": 1.6829948845140288, + "grad_norm": 0.367784321308136, + "learning_rate": 4.8065737948092615e-06, + "loss": 0.3189, + "step": 3619 + }, + { + "epoch": 1.683459928693226, + "grad_norm": 0.42748555541038513, + "learning_rate": 4.803869911982264e-06, + "loss": 0.3723, + "step": 3620 + }, + { + "epoch": 1.6839249728724228, + "grad_norm": 0.44452059268951416, + "learning_rate": 4.801166086598584e-06, + "loss": 0.324, + "step": 3621 + }, + { + "epoch": 1.68439001705162, + "grad_norm": 0.4104439914226532, + "learning_rate": 4.798462319450127e-06, + "loss": 0.316, + "step": 3622 + }, + { + "epoch": 1.684855061230817, + "grad_norm": 0.37423205375671387, + "learning_rate": 4.795758611328782e-06, + "loss": 0.3399, + "step": 3623 + }, + { + "epoch": 1.685320105410014, + "grad_norm": 0.39451467990875244, + "learning_rate": 4.793054963026425e-06, + "loss": 0.3372, + "step": 3624 + }, + { + "epoch": 1.6857851495892109, + "grad_norm": 0.42838355898857117, + "learning_rate": 4.790351375334906e-06, + "loss": 0.358, + "step": 3625 + }, + { + "epoch": 1.686250193768408, + "grad_norm": 0.4108628034591675, + "learning_rate": 4.787647849046064e-06, + "loss": 0.31, + "step": 3626 + }, + { + "epoch": 1.6867152379476051, + "grad_norm": 0.481794536113739, + "learning_rate": 4.784944384951718e-06, + "loss": 0.3697, + "step": 3627 + }, + { + "epoch": 1.687180282126802, + "grad_norm": 0.4808673858642578, + "learning_rate": 4.782240983843668e-06, + "loss": 0.3575, + "step": 3628 + }, + { + "epoch": 1.687645326305999, + "grad_norm": 0.40240907669067383, + "learning_rate": 4.779537646513697e-06, + "loss": 0.3278, + "step": 3629 + }, + { + "epoch": 1.688110370485196, + "grad_norm": 0.3663456439971924, + "learning_rate": 4.7768343737535694e-06, + "loss": 0.299, + "step": 3630 + }, + { + "epoch": 1.6885754146643932, + "grad_norm": 0.470187246799469, + "learning_rate": 4.774131166355027e-06, + "loss": 0.4006, + "step": 3631 + }, + { + "epoch": 1.68904045884359, + "grad_norm": 0.4300827383995056, + "learning_rate": 4.771428025109798e-06, + "loss": 0.3551, + "step": 3632 + }, + { + "epoch": 1.689505503022787, + "grad_norm": 0.45297130942344666, + "learning_rate": 4.768724950809587e-06, + "loss": 0.3158, + "step": 3633 + }, + { + "epoch": 1.6899705472019841, + "grad_norm": 0.39991265535354614, + "learning_rate": 4.766021944246082e-06, + "loss": 0.3333, + "step": 3634 + }, + { + "epoch": 1.6904355913811813, + "grad_norm": 0.42256665229797363, + "learning_rate": 4.763319006210949e-06, + "loss": 0.3593, + "step": 3635 + }, + { + "epoch": 1.6909006355603782, + "grad_norm": 0.4344233274459839, + "learning_rate": 4.7606161374958355e-06, + "loss": 0.402, + "step": 3636 + }, + { + "epoch": 1.6913656797395753, + "grad_norm": 0.39570462703704834, + "learning_rate": 4.757913338892365e-06, + "loss": 0.3526, + "step": 3637 + }, + { + "epoch": 1.6918307239187724, + "grad_norm": 0.38186073303222656, + "learning_rate": 4.755210611192146e-06, + "loss": 0.328, + "step": 3638 + }, + { + "epoch": 1.6922957680979693, + "grad_norm": 0.3946187198162079, + "learning_rate": 4.752507955186765e-06, + "loss": 0.3226, + "step": 3639 + }, + { + "epoch": 1.6927608122771662, + "grad_norm": 0.4827534258365631, + "learning_rate": 4.749805371667781e-06, + "loss": 0.3992, + "step": 3640 + }, + { + "epoch": 1.6932258564563634, + "grad_norm": 0.3777780830860138, + "learning_rate": 4.747102861426742e-06, + "loss": 0.3196, + "step": 3641 + }, + { + "epoch": 1.6936909006355605, + "grad_norm": 0.4654094874858856, + "learning_rate": 4.744400425255165e-06, + "loss": 0.3779, + "step": 3642 + }, + { + "epoch": 1.6941559448147574, + "grad_norm": 0.3912859559059143, + "learning_rate": 4.741698063944553e-06, + "loss": 0.3197, + "step": 3643 + }, + { + "epoch": 1.6946209889939543, + "grad_norm": 0.40966254472732544, + "learning_rate": 4.7389957782863806e-06, + "loss": 0.3348, + "step": 3644 + }, + { + "epoch": 1.6950860331731514, + "grad_norm": 0.41277340054512024, + "learning_rate": 4.736293569072108e-06, + "loss": 0.3528, + "step": 3645 + }, + { + "epoch": 1.6955510773523486, + "grad_norm": 0.42663225531578064, + "learning_rate": 4.733591437093163e-06, + "loss": 0.3359, + "step": 3646 + }, + { + "epoch": 1.6960161215315455, + "grad_norm": 0.36344727873802185, + "learning_rate": 4.730889383140961e-06, + "loss": 0.35, + "step": 3647 + }, + { + "epoch": 1.6964811657107424, + "grad_norm": 0.41764384508132935, + "learning_rate": 4.7281874080068855e-06, + "loss": 0.375, + "step": 3648 + }, + { + "epoch": 1.6969462098899395, + "grad_norm": 0.4017370641231537, + "learning_rate": 4.725485512482304e-06, + "loss": 0.315, + "step": 3649 + }, + { + "epoch": 1.6974112540691366, + "grad_norm": 0.4225184917449951, + "learning_rate": 4.722783697358555e-06, + "loss": 0.3786, + "step": 3650 + }, + { + "epoch": 1.6978762982483335, + "grad_norm": 0.39845162630081177, + "learning_rate": 4.720081963426962e-06, + "loss": 0.3394, + "step": 3651 + }, + { + "epoch": 1.6983413424275307, + "grad_norm": 0.3981609642505646, + "learning_rate": 4.717380311478813e-06, + "loss": 0.3866, + "step": 3652 + }, + { + "epoch": 1.6988063866067278, + "grad_norm": 0.3535573184490204, + "learning_rate": 4.714678742305381e-06, + "loss": 0.3103, + "step": 3653 + }, + { + "epoch": 1.6992714307859247, + "grad_norm": 0.43783578276634216, + "learning_rate": 4.711977256697909e-06, + "loss": 0.3763, + "step": 3654 + }, + { + "epoch": 1.6997364749651216, + "grad_norm": 0.39808762073516846, + "learning_rate": 4.7092758554476215e-06, + "loss": 0.3485, + "step": 3655 + }, + { + "epoch": 1.7002015191443187, + "grad_norm": 0.3975062668323517, + "learning_rate": 4.706574539345712e-06, + "loss": 0.3592, + "step": 3656 + }, + { + "epoch": 1.7006665633235158, + "grad_norm": 0.3854736089706421, + "learning_rate": 4.703873309183357e-06, + "loss": 0.3236, + "step": 3657 + }, + { + "epoch": 1.7011316075027128, + "grad_norm": 0.3952430486679077, + "learning_rate": 4.7011721657516966e-06, + "loss": 0.363, + "step": 3658 + }, + { + "epoch": 1.7015966516819097, + "grad_norm": 0.47998958826065063, + "learning_rate": 4.698471109841858e-06, + "loss": 0.4107, + "step": 3659 + }, + { + "epoch": 1.7020616958611068, + "grad_norm": 0.37458357214927673, + "learning_rate": 4.695770142244931e-06, + "loss": 0.3453, + "step": 3660 + }, + { + "epoch": 1.702526740040304, + "grad_norm": 0.3739306330680847, + "learning_rate": 4.693069263751989e-06, + "loss": 0.3587, + "step": 3661 + }, + { + "epoch": 1.7029917842195008, + "grad_norm": 0.38612034916877747, + "learning_rate": 4.690368475154072e-06, + "loss": 0.3531, + "step": 3662 + }, + { + "epoch": 1.7034568283986977, + "grad_norm": 0.41020193696022034, + "learning_rate": 4.687667777242203e-06, + "loss": 0.3569, + "step": 3663 + }, + { + "epoch": 1.7039218725778948, + "grad_norm": 0.3760784864425659, + "learning_rate": 4.684967170807365e-06, + "loss": 0.3417, + "step": 3664 + }, + { + "epoch": 1.704386916757092, + "grad_norm": 0.3778047263622284, + "learning_rate": 4.682266656640528e-06, + "loss": 0.3226, + "step": 3665 + }, + { + "epoch": 1.7048519609362889, + "grad_norm": 0.4011727273464203, + "learning_rate": 4.679566235532625e-06, + "loss": 0.3394, + "step": 3666 + }, + { + "epoch": 1.705317005115486, + "grad_norm": 0.38582319021224976, + "learning_rate": 4.676865908274567e-06, + "loss": 0.3972, + "step": 3667 + }, + { + "epoch": 1.7057820492946831, + "grad_norm": 0.3527704179286957, + "learning_rate": 4.674165675657236e-06, + "loss": 0.3193, + "step": 3668 + }, + { + "epoch": 1.70624709347388, + "grad_norm": 0.4227658808231354, + "learning_rate": 4.671465538471487e-06, + "loss": 0.3571, + "step": 3669 + }, + { + "epoch": 1.706712137653077, + "grad_norm": 0.36377352476119995, + "learning_rate": 4.668765497508143e-06, + "loss": 0.3433, + "step": 3670 + }, + { + "epoch": 1.707177181832274, + "grad_norm": 0.3703608512878418, + "learning_rate": 4.666065553558007e-06, + "loss": 0.3152, + "step": 3671 + }, + { + "epoch": 1.7076422260114712, + "grad_norm": 0.39327993988990784, + "learning_rate": 4.663365707411845e-06, + "loss": 0.3686, + "step": 3672 + }, + { + "epoch": 1.708107270190668, + "grad_norm": 0.41823717951774597, + "learning_rate": 4.660665959860399e-06, + "loss": 0.3777, + "step": 3673 + }, + { + "epoch": 1.708572314369865, + "grad_norm": 0.3737585246562958, + "learning_rate": 4.657966311694383e-06, + "loss": 0.3365, + "step": 3674 + }, + { + "epoch": 1.7090373585490621, + "grad_norm": 0.43027764558792114, + "learning_rate": 4.655266763704476e-06, + "loss": 0.3414, + "step": 3675 + }, + { + "epoch": 1.7095024027282593, + "grad_norm": 0.376740962266922, + "learning_rate": 4.652567316681337e-06, + "loss": 0.3267, + "step": 3676 + }, + { + "epoch": 1.7099674469074562, + "grad_norm": 0.40848153829574585, + "learning_rate": 4.649867971415585e-06, + "loss": 0.361, + "step": 3677 + }, + { + "epoch": 1.710432491086653, + "grad_norm": 0.40334048867225647, + "learning_rate": 4.647168728697819e-06, + "loss": 0.3619, + "step": 3678 + }, + { + "epoch": 1.7108975352658504, + "grad_norm": 0.38001689314842224, + "learning_rate": 4.6444695893185994e-06, + "loss": 0.3197, + "step": 3679 + }, + { + "epoch": 1.7113625794450473, + "grad_norm": 0.4063551425933838, + "learning_rate": 4.641770554068465e-06, + "loss": 0.3519, + "step": 3680 + }, + { + "epoch": 1.7118276236242442, + "grad_norm": 0.46857863664627075, + "learning_rate": 4.639071623737913e-06, + "loss": 0.3636, + "step": 3681 + }, + { + "epoch": 1.7122926678034414, + "grad_norm": 0.4151492416858673, + "learning_rate": 4.636372799117424e-06, + "loss": 0.3372, + "step": 3682 + }, + { + "epoch": 1.7127577119826385, + "grad_norm": 0.43509939312934875, + "learning_rate": 4.6336740809974315e-06, + "loss": 0.3731, + "step": 3683 + }, + { + "epoch": 1.7132227561618354, + "grad_norm": 0.4010016918182373, + "learning_rate": 4.630975470168352e-06, + "loss": 0.3233, + "step": 3684 + }, + { + "epoch": 1.7136878003410323, + "grad_norm": 0.39253363013267517, + "learning_rate": 4.628276967420563e-06, + "loss": 0.3614, + "step": 3685 + }, + { + "epoch": 1.7141528445202294, + "grad_norm": 0.36530962586402893, + "learning_rate": 4.625578573544414e-06, + "loss": 0.3289, + "step": 3686 + }, + { + "epoch": 1.7146178886994266, + "grad_norm": 0.4295608103275299, + "learning_rate": 4.622880289330217e-06, + "loss": 0.3404, + "step": 3687 + }, + { + "epoch": 1.7150829328786235, + "grad_norm": 0.39288264513015747, + "learning_rate": 4.620182115568259e-06, + "loss": 0.3217, + "step": 3688 + }, + { + "epoch": 1.7155479770578204, + "grad_norm": 0.43459996581077576, + "learning_rate": 4.617484053048788e-06, + "loss": 0.3493, + "step": 3689 + }, + { + "epoch": 1.7160130212370175, + "grad_norm": 0.39573171734809875, + "learning_rate": 4.614786102562026e-06, + "loss": 0.3547, + "step": 3690 + }, + { + "epoch": 1.7164780654162146, + "grad_norm": 0.39884433150291443, + "learning_rate": 4.6120882648981565e-06, + "loss": 0.35, + "step": 3691 + }, + { + "epoch": 1.7169431095954115, + "grad_norm": 0.4326457679271698, + "learning_rate": 4.609390540847336e-06, + "loss": 0.3687, + "step": 3692 + }, + { + "epoch": 1.7174081537746084, + "grad_norm": 0.3984185755252838, + "learning_rate": 4.606692931199678e-06, + "loss": 0.3789, + "step": 3693 + }, + { + "epoch": 1.7178731979538058, + "grad_norm": 0.3940320312976837, + "learning_rate": 4.603995436745274e-06, + "loss": 0.3263, + "step": 3694 + }, + { + "epoch": 1.7183382421330027, + "grad_norm": 0.4455493688583374, + "learning_rate": 4.6012980582741725e-06, + "loss": 0.3492, + "step": 3695 + }, + { + "epoch": 1.7188032863121996, + "grad_norm": 0.4288008511066437, + "learning_rate": 4.598600796576395e-06, + "loss": 0.3087, + "step": 3696 + }, + { + "epoch": 1.7192683304913967, + "grad_norm": 0.3839088976383209, + "learning_rate": 4.595903652441923e-06, + "loss": 0.3869, + "step": 3697 + }, + { + "epoch": 1.7197333746705938, + "grad_norm": 0.3795816898345947, + "learning_rate": 4.59320662666071e-06, + "loss": 0.3492, + "step": 3698 + }, + { + "epoch": 1.7201984188497907, + "grad_norm": 0.39537104964256287, + "learning_rate": 4.590509720022665e-06, + "loss": 0.3488, + "step": 3699 + }, + { + "epoch": 1.7206634630289876, + "grad_norm": 0.3996933102607727, + "learning_rate": 4.587812933317674e-06, + "loss": 0.3653, + "step": 3700 + }, + { + "epoch": 1.7211285072081848, + "grad_norm": 0.3842652142047882, + "learning_rate": 4.5851162673355785e-06, + "loss": 0.3643, + "step": 3701 + }, + { + "epoch": 1.721593551387382, + "grad_norm": 0.3390163779258728, + "learning_rate": 4.58241972286619e-06, + "loss": 0.3105, + "step": 3702 + }, + { + "epoch": 1.7220585955665788, + "grad_norm": 0.3907347619533539, + "learning_rate": 4.5797233006992805e-06, + "loss": 0.3597, + "step": 3703 + }, + { + "epoch": 1.7225236397457757, + "grad_norm": 0.38712698221206665, + "learning_rate": 4.5770270016245915e-06, + "loss": 0.3285, + "step": 3704 + }, + { + "epoch": 1.7229886839249728, + "grad_norm": 0.40449246764183044, + "learning_rate": 4.574330826431822e-06, + "loss": 0.3639, + "step": 3705 + }, + { + "epoch": 1.72345372810417, + "grad_norm": 0.39589884877204895, + "learning_rate": 4.571634775910641e-06, + "loss": 0.3156, + "step": 3706 + }, + { + "epoch": 1.7239187722833669, + "grad_norm": 0.4132598042488098, + "learning_rate": 4.568938850850673e-06, + "loss": 0.3397, + "step": 3707 + }, + { + "epoch": 1.7243838164625638, + "grad_norm": 0.3807767927646637, + "learning_rate": 4.566243052041516e-06, + "loss": 0.3297, + "step": 3708 + }, + { + "epoch": 1.7248488606417611, + "grad_norm": 0.3866475820541382, + "learning_rate": 4.5635473802727225e-06, + "loss": 0.3258, + "step": 3709 + }, + { + "epoch": 1.725313904820958, + "grad_norm": 0.38793426752090454, + "learning_rate": 4.560851836333813e-06, + "loss": 0.3257, + "step": 3710 + }, + { + "epoch": 1.725778949000155, + "grad_norm": 0.440531849861145, + "learning_rate": 4.558156421014268e-06, + "loss": 0.3547, + "step": 3711 + }, + { + "epoch": 1.726243993179352, + "grad_norm": 0.4038770794868469, + "learning_rate": 4.555461135103529e-06, + "loss": 0.35, + "step": 3712 + }, + { + "epoch": 1.7267090373585492, + "grad_norm": 0.32992175221443176, + "learning_rate": 4.5527659793910025e-06, + "loss": 0.3144, + "step": 3713 + }, + { + "epoch": 1.727174081537746, + "grad_norm": 0.4566981792449951, + "learning_rate": 4.550070954666056e-06, + "loss": 0.4192, + "step": 3714 + }, + { + "epoch": 1.727639125716943, + "grad_norm": 0.419821172952652, + "learning_rate": 4.547376061718021e-06, + "loss": 0.3432, + "step": 3715 + }, + { + "epoch": 1.7281041698961401, + "grad_norm": 0.4241842031478882, + "learning_rate": 4.544681301336182e-06, + "loss": 0.3418, + "step": 3716 + }, + { + "epoch": 1.7285692140753373, + "grad_norm": 0.3986935019493103, + "learning_rate": 4.541986674309798e-06, + "loss": 0.3934, + "step": 3717 + }, + { + "epoch": 1.7290342582545342, + "grad_norm": 0.360019713640213, + "learning_rate": 4.539292181428074e-06, + "loss": 0.3393, + "step": 3718 + }, + { + "epoch": 1.729499302433731, + "grad_norm": 0.39975255727767944, + "learning_rate": 4.536597823480188e-06, + "loss": 0.3076, + "step": 3719 + }, + { + "epoch": 1.7299643466129282, + "grad_norm": 0.42592859268188477, + "learning_rate": 4.533903601255272e-06, + "loss": 0.3379, + "step": 3720 + }, + { + "epoch": 1.7304293907921253, + "grad_norm": 0.3946399986743927, + "learning_rate": 4.531209515542422e-06, + "loss": 0.377, + "step": 3721 + }, + { + "epoch": 1.7308944349713222, + "grad_norm": 0.39706122875213623, + "learning_rate": 4.528515567130688e-06, + "loss": 0.3682, + "step": 3722 + }, + { + "epoch": 1.7313594791505194, + "grad_norm": 0.3509725332260132, + "learning_rate": 4.525821756809088e-06, + "loss": 0.28, + "step": 3723 + }, + { + "epoch": 1.7318245233297165, + "grad_norm": 0.4142434895038605, + "learning_rate": 4.523128085366592e-06, + "loss": 0.3259, + "step": 3724 + }, + { + "epoch": 1.7322895675089134, + "grad_norm": 0.39951425790786743, + "learning_rate": 4.520434553592134e-06, + "loss": 0.3618, + "step": 3725 + }, + { + "epoch": 1.7327546116881103, + "grad_norm": 0.3927379250526428, + "learning_rate": 4.517741162274605e-06, + "loss": 0.3323, + "step": 3726 + }, + { + "epoch": 1.7332196558673074, + "grad_norm": 0.3651019036769867, + "learning_rate": 4.515047912202858e-06, + "loss": 0.3219, + "step": 3727 + }, + { + "epoch": 1.7336847000465045, + "grad_norm": 0.40421250462532043, + "learning_rate": 4.5123548041656984e-06, + "loss": 0.4112, + "step": 3728 + }, + { + "epoch": 1.7341497442257015, + "grad_norm": 0.39693304896354675, + "learning_rate": 4.509661838951897e-06, + "loss": 0.3038, + "step": 3729 + }, + { + "epoch": 1.7346147884048984, + "grad_norm": 0.3815270960330963, + "learning_rate": 4.506969017350178e-06, + "loss": 0.3384, + "step": 3730 + }, + { + "epoch": 1.7350798325840955, + "grad_norm": 0.3915429711341858, + "learning_rate": 4.5042763401492256e-06, + "loss": 0.3403, + "step": 3731 + }, + { + "epoch": 1.7355448767632926, + "grad_norm": 0.40549951791763306, + "learning_rate": 4.50158380813768e-06, + "loss": 0.3575, + "step": 3732 + }, + { + "epoch": 1.7360099209424895, + "grad_norm": 0.40941810607910156, + "learning_rate": 4.498891422104143e-06, + "loss": 0.3286, + "step": 3733 + }, + { + "epoch": 1.7364749651216864, + "grad_norm": 0.38618361949920654, + "learning_rate": 4.496199182837167e-06, + "loss": 0.3338, + "step": 3734 + }, + { + "epoch": 1.7369400093008835, + "grad_norm": 0.3790382146835327, + "learning_rate": 4.493507091125269e-06, + "loss": 0.3541, + "step": 3735 + }, + { + "epoch": 1.7374050534800807, + "grad_norm": 0.39466914534568787, + "learning_rate": 4.490815147756915e-06, + "loss": 0.3831, + "step": 3736 + }, + { + "epoch": 1.7378700976592776, + "grad_norm": 0.4218112528324127, + "learning_rate": 4.4881233535205345e-06, + "loss": 0.3405, + "step": 3737 + }, + { + "epoch": 1.7383351418384747, + "grad_norm": 0.41723906993865967, + "learning_rate": 4.4854317092045085e-06, + "loss": 0.37, + "step": 3738 + }, + { + "epoch": 1.7388001860176718, + "grad_norm": 0.3990018367767334, + "learning_rate": 4.482740215597179e-06, + "loss": 0.3546, + "step": 3739 + }, + { + "epoch": 1.7392652301968687, + "grad_norm": 0.4663386046886444, + "learning_rate": 4.480048873486836e-06, + "loss": 0.3627, + "step": 3740 + }, + { + "epoch": 1.7397302743760656, + "grad_norm": 0.36172690987586975, + "learning_rate": 4.477357683661734e-06, + "loss": 0.2987, + "step": 3741 + }, + { + "epoch": 1.7401953185552628, + "grad_norm": 0.3599375784397125, + "learning_rate": 4.474666646910074e-06, + "loss": 0.3212, + "step": 3742 + }, + { + "epoch": 1.74066036273446, + "grad_norm": 0.43623727560043335, + "learning_rate": 4.471975764020023e-06, + "loss": 0.3696, + "step": 3743 + }, + { + "epoch": 1.7411254069136568, + "grad_norm": 0.40363821387290955, + "learning_rate": 4.469285035779693e-06, + "loss": 0.3287, + "step": 3744 + }, + { + "epoch": 1.7415904510928537, + "grad_norm": 0.40188169479370117, + "learning_rate": 4.466594462977156e-06, + "loss": 0.3471, + "step": 3745 + }, + { + "epoch": 1.7420554952720508, + "grad_norm": 0.4021664261817932, + "learning_rate": 4.463904046400438e-06, + "loss": 0.3482, + "step": 3746 + }, + { + "epoch": 1.742520539451248, + "grad_norm": 0.3717334270477295, + "learning_rate": 4.4612137868375136e-06, + "loss": 0.3174, + "step": 3747 + }, + { + "epoch": 1.7429855836304449, + "grad_norm": 0.4289903938770294, + "learning_rate": 4.458523685076321e-06, + "loss": 0.3474, + "step": 3748 + }, + { + "epoch": 1.7434506278096418, + "grad_norm": 0.44443613290786743, + "learning_rate": 4.455833741904746e-06, + "loss": 0.3668, + "step": 3749 + }, + { + "epoch": 1.743915671988839, + "grad_norm": 0.36988234519958496, + "learning_rate": 4.4531439581106295e-06, + "loss": 0.3533, + "step": 3750 + }, + { + "epoch": 1.744380716168036, + "grad_norm": 0.4682080149650574, + "learning_rate": 4.450454334481763e-06, + "loss": 0.3772, + "step": 3751 + }, + { + "epoch": 1.744845760347233, + "grad_norm": 0.4056577682495117, + "learning_rate": 4.447764871805899e-06, + "loss": 0.3237, + "step": 3752 + }, + { + "epoch": 1.74531080452643, + "grad_norm": 0.4110541045665741, + "learning_rate": 4.4450755708707305e-06, + "loss": 0.3519, + "step": 3753 + }, + { + "epoch": 1.7457758487056272, + "grad_norm": 0.39790642261505127, + "learning_rate": 4.442386432463915e-06, + "loss": 0.3489, + "step": 3754 + }, + { + "epoch": 1.746240892884824, + "grad_norm": 0.3980329632759094, + "learning_rate": 4.439697457373055e-06, + "loss": 0.3612, + "step": 3755 + }, + { + "epoch": 1.746705937064021, + "grad_norm": 0.3943917751312256, + "learning_rate": 4.437008646385711e-06, + "loss": 0.3486, + "step": 3756 + }, + { + "epoch": 1.7471709812432181, + "grad_norm": 0.34319841861724854, + "learning_rate": 4.434320000289387e-06, + "loss": 0.3213, + "step": 3757 + }, + { + "epoch": 1.7476360254224153, + "grad_norm": 0.4046364724636078, + "learning_rate": 4.431631519871549e-06, + "loss": 0.343, + "step": 3758 + }, + { + "epoch": 1.7481010696016122, + "grad_norm": 0.4628947377204895, + "learning_rate": 4.428943205919605e-06, + "loss": 0.3734, + "step": 3759 + }, + { + "epoch": 1.748566113780809, + "grad_norm": 0.3862752914428711, + "learning_rate": 4.426255059220921e-06, + "loss": 0.3305, + "step": 3760 + }, + { + "epoch": 1.7490311579600062, + "grad_norm": 0.37330740690231323, + "learning_rate": 4.42356708056281e-06, + "loss": 0.3254, + "step": 3761 + }, + { + "epoch": 1.7494962021392033, + "grad_norm": 0.4342025816440582, + "learning_rate": 4.420879270732539e-06, + "loss": 0.3663, + "step": 3762 + }, + { + "epoch": 1.7499612463184002, + "grad_norm": 0.42904359102249146, + "learning_rate": 4.418191630517322e-06, + "loss": 0.3377, + "step": 3763 + }, + { + "epoch": 1.7504262904975971, + "grad_norm": 0.40578049421310425, + "learning_rate": 4.415504160704327e-06, + "loss": 0.3785, + "step": 3764 + }, + { + "epoch": 1.7508913346767943, + "grad_norm": 0.34141576290130615, + "learning_rate": 4.412816862080668e-06, + "loss": 0.2916, + "step": 3765 + }, + { + "epoch": 1.7513563788559914, + "grad_norm": 0.4625285565853119, + "learning_rate": 4.4101297354334135e-06, + "loss": 0.3609, + "step": 3766 + }, + { + "epoch": 1.7518214230351883, + "grad_norm": 0.40238505601882935, + "learning_rate": 4.407442781549577e-06, + "loss": 0.3996, + "step": 3767 + }, + { + "epoch": 1.7522864672143854, + "grad_norm": 0.40571123361587524, + "learning_rate": 4.404756001216126e-06, + "loss": 0.3533, + "step": 3768 + }, + { + "epoch": 1.7527515113935825, + "grad_norm": 0.37566980719566345, + "learning_rate": 4.4020693952199726e-06, + "loss": 0.3394, + "step": 3769 + }, + { + "epoch": 1.7532165555727794, + "grad_norm": 0.4016037583351135, + "learning_rate": 4.3993829643479825e-06, + "loss": 0.324, + "step": 3770 + }, + { + "epoch": 1.7536815997519763, + "grad_norm": 0.41683268547058105, + "learning_rate": 4.396696709386964e-06, + "loss": 0.3341, + "step": 3771 + }, + { + "epoch": 1.7541466439311735, + "grad_norm": 0.3792789876461029, + "learning_rate": 4.394010631123681e-06, + "loss": 0.3193, + "step": 3772 + }, + { + "epoch": 1.7546116881103706, + "grad_norm": 0.38343918323516846, + "learning_rate": 4.39132473034484e-06, + "loss": 0.3722, + "step": 3773 + }, + { + "epoch": 1.7550767322895675, + "grad_norm": 0.3963862359523773, + "learning_rate": 4.388639007837101e-06, + "loss": 0.3609, + "step": 3774 + }, + { + "epoch": 1.7555417764687644, + "grad_norm": 0.3763202428817749, + "learning_rate": 4.385953464387064e-06, + "loss": 0.3183, + "step": 3775 + }, + { + "epoch": 1.7560068206479615, + "grad_norm": 0.3931805491447449, + "learning_rate": 4.383268100781285e-06, + "loss": 0.3821, + "step": 3776 + }, + { + "epoch": 1.7564718648271587, + "grad_norm": 0.3617267906665802, + "learning_rate": 4.38058291780626e-06, + "loss": 0.3264, + "step": 3777 + }, + { + "epoch": 1.7569369090063556, + "grad_norm": 0.38972967863082886, + "learning_rate": 4.377897916248438e-06, + "loss": 0.3538, + "step": 3778 + }, + { + "epoch": 1.7574019531855525, + "grad_norm": 0.4271766245365143, + "learning_rate": 4.37521309689421e-06, + "loss": 0.3718, + "step": 3779 + }, + { + "epoch": 1.7578669973647496, + "grad_norm": 0.3271983861923218, + "learning_rate": 4.37252846052992e-06, + "loss": 0.3391, + "step": 3780 + }, + { + "epoch": 1.7583320415439467, + "grad_norm": 0.41087400913238525, + "learning_rate": 4.36984400794185e-06, + "loss": 0.3266, + "step": 3781 + }, + { + "epoch": 1.7587970857231436, + "grad_norm": 0.44612687826156616, + "learning_rate": 4.367159739916236e-06, + "loss": 0.334, + "step": 3782 + }, + { + "epoch": 1.7592621299023408, + "grad_norm": 0.3791424632072449, + "learning_rate": 4.364475657239253e-06, + "loss": 0.3684, + "step": 3783 + }, + { + "epoch": 1.759727174081538, + "grad_norm": 0.4565580189228058, + "learning_rate": 4.361791760697027e-06, + "loss": 0.3547, + "step": 3784 + }, + { + "epoch": 1.7601922182607348, + "grad_norm": 0.48362359404563904, + "learning_rate": 4.35910805107563e-06, + "loss": 0.3483, + "step": 3785 + }, + { + "epoch": 1.7606572624399317, + "grad_norm": 0.37739720940589905, + "learning_rate": 4.356424529161072e-06, + "loss": 0.351, + "step": 3786 + }, + { + "epoch": 1.7611223066191288, + "grad_norm": 0.36957481503486633, + "learning_rate": 4.353741195739318e-06, + "loss": 0.362, + "step": 3787 + }, + { + "epoch": 1.761587350798326, + "grad_norm": 0.39713501930236816, + "learning_rate": 4.351058051596269e-06, + "loss": 0.3421, + "step": 3788 + }, + { + "epoch": 1.7620523949775229, + "grad_norm": 0.37897831201553345, + "learning_rate": 4.348375097517776e-06, + "loss": 0.3196, + "step": 3789 + }, + { + "epoch": 1.7625174391567198, + "grad_norm": 0.38132160902023315, + "learning_rate": 4.345692334289632e-06, + "loss": 0.3764, + "step": 3790 + }, + { + "epoch": 1.762982483335917, + "grad_norm": 0.39471349120140076, + "learning_rate": 4.343009762697577e-06, + "loss": 0.3472, + "step": 3791 + }, + { + "epoch": 1.763447527515114, + "grad_norm": 0.35921382904052734, + "learning_rate": 4.340327383527289e-06, + "loss": 0.3489, + "step": 3792 + }, + { + "epoch": 1.763912571694311, + "grad_norm": 0.3691664934158325, + "learning_rate": 4.337645197564398e-06, + "loss": 0.3357, + "step": 3793 + }, + { + "epoch": 1.7643776158735078, + "grad_norm": 0.38761186599731445, + "learning_rate": 4.334963205594467e-06, + "loss": 0.3535, + "step": 3794 + }, + { + "epoch": 1.764842660052705, + "grad_norm": 0.4036206305027008, + "learning_rate": 4.332281408403011e-06, + "loss": 0.3786, + "step": 3795 + }, + { + "epoch": 1.765307704231902, + "grad_norm": 0.37231314182281494, + "learning_rate": 4.3295998067754844e-06, + "loss": 0.3068, + "step": 3796 + }, + { + "epoch": 1.765772748411099, + "grad_norm": 0.38186076283454895, + "learning_rate": 4.326918401497287e-06, + "loss": 0.334, + "step": 3797 + }, + { + "epoch": 1.7662377925902961, + "grad_norm": 0.4085099697113037, + "learning_rate": 4.3242371933537554e-06, + "loss": 0.3273, + "step": 3798 + }, + { + "epoch": 1.7667028367694932, + "grad_norm": 0.4823872447013855, + "learning_rate": 4.321556183130175e-06, + "loss": 0.3616, + "step": 3799 + }, + { + "epoch": 1.7671678809486902, + "grad_norm": 0.38693398237228394, + "learning_rate": 4.318875371611766e-06, + "loss": 0.3751, + "step": 3800 + }, + { + "epoch": 1.767632925127887, + "grad_norm": 0.4294361472129822, + "learning_rate": 4.3161947595836985e-06, + "loss": 0.3404, + "step": 3801 + }, + { + "epoch": 1.7680979693070842, + "grad_norm": 0.46202561259269714, + "learning_rate": 4.313514347831077e-06, + "loss": 0.3245, + "step": 3802 + }, + { + "epoch": 1.7685630134862813, + "grad_norm": 0.42043882608413696, + "learning_rate": 4.310834137138956e-06, + "loss": 0.3401, + "step": 3803 + }, + { + "epoch": 1.7690280576654782, + "grad_norm": 0.4185798168182373, + "learning_rate": 4.308154128292318e-06, + "loss": 0.359, + "step": 3804 + }, + { + "epoch": 1.7694931018446751, + "grad_norm": 0.44950559735298157, + "learning_rate": 4.305474322076102e-06, + "loss": 0.3354, + "step": 3805 + }, + { + "epoch": 1.7699581460238722, + "grad_norm": 0.4571090638637543, + "learning_rate": 4.302794719275173e-06, + "loss": 0.3484, + "step": 3806 + }, + { + "epoch": 1.7704231902030694, + "grad_norm": 0.42826327681541443, + "learning_rate": 4.300115320674346e-06, + "loss": 0.3595, + "step": 3807 + }, + { + "epoch": 1.7708882343822663, + "grad_norm": 0.42987918853759766, + "learning_rate": 4.297436127058373e-06, + "loss": 0.3028, + "step": 3808 + }, + { + "epoch": 1.7713532785614632, + "grad_norm": 0.4052066206932068, + "learning_rate": 4.294757139211948e-06, + "loss": 0.3719, + "step": 3809 + }, + { + "epoch": 1.7718183227406603, + "grad_norm": 0.4018250107765198, + "learning_rate": 4.292078357919701e-06, + "loss": 0.3937, + "step": 3810 + }, + { + "epoch": 1.7722833669198574, + "grad_norm": 0.40564820170402527, + "learning_rate": 4.289399783966205e-06, + "loss": 0.344, + "step": 3811 + }, + { + "epoch": 1.7727484110990543, + "grad_norm": 0.4192334711551666, + "learning_rate": 4.286721418135968e-06, + "loss": 0.3571, + "step": 3812 + }, + { + "epoch": 1.7732134552782515, + "grad_norm": 0.36322298645973206, + "learning_rate": 4.284043261213442e-06, + "loss": 0.3576, + "step": 3813 + }, + { + "epoch": 1.7736784994574486, + "grad_norm": 0.35039764642715454, + "learning_rate": 4.281365313983016e-06, + "loss": 0.3473, + "step": 3814 + }, + { + "epoch": 1.7741435436366455, + "grad_norm": 0.3839752972126007, + "learning_rate": 4.278687577229018e-06, + "loss": 0.3502, + "step": 3815 + }, + { + "epoch": 1.7746085878158424, + "grad_norm": 0.4036378264427185, + "learning_rate": 4.2760100517357095e-06, + "loss": 0.3351, + "step": 3816 + }, + { + "epoch": 1.7750736319950395, + "grad_norm": 0.3815009891986847, + "learning_rate": 4.273332738287299e-06, + "loss": 0.3496, + "step": 3817 + }, + { + "epoch": 1.7755386761742367, + "grad_norm": 0.38633623719215393, + "learning_rate": 4.270655637667926e-06, + "loss": 0.3215, + "step": 3818 + }, + { + "epoch": 1.7760037203534336, + "grad_norm": 0.39606183767318726, + "learning_rate": 4.267978750661669e-06, + "loss": 0.3445, + "step": 3819 + }, + { + "epoch": 1.7764687645326305, + "grad_norm": 0.39145374298095703, + "learning_rate": 4.265302078052546e-06, + "loss": 0.3274, + "step": 3820 + }, + { + "epoch": 1.7769338087118276, + "grad_norm": 0.37993159890174866, + "learning_rate": 4.26262562062451e-06, + "loss": 0.3546, + "step": 3821 + }, + { + "epoch": 1.7773988528910247, + "grad_norm": 0.41166287660598755, + "learning_rate": 4.259949379161454e-06, + "loss": 0.3559, + "step": 3822 + }, + { + "epoch": 1.7778638970702216, + "grad_norm": 0.4474835991859436, + "learning_rate": 4.2572733544472025e-06, + "loss": 0.3525, + "step": 3823 + }, + { + "epoch": 1.7783289412494185, + "grad_norm": 0.38438984751701355, + "learning_rate": 4.2545975472655215e-06, + "loss": 0.3461, + "step": 3824 + }, + { + "epoch": 1.7787939854286157, + "grad_norm": 0.37471044063568115, + "learning_rate": 4.2519219584001106e-06, + "loss": 0.3681, + "step": 3825 + }, + { + "epoch": 1.7792590296078128, + "grad_norm": 0.3626500368118286, + "learning_rate": 4.249246588634609e-06, + "loss": 0.334, + "step": 3826 + }, + { + "epoch": 1.7797240737870097, + "grad_norm": 0.34971731901168823, + "learning_rate": 4.246571438752585e-06, + "loss": 0.3683, + "step": 3827 + }, + { + "epoch": 1.7801891179662068, + "grad_norm": 0.37381622195243835, + "learning_rate": 4.243896509537551e-06, + "loss": 0.3602, + "step": 3828 + }, + { + "epoch": 1.780654162145404, + "grad_norm": 0.4181731343269348, + "learning_rate": 4.241221801772945e-06, + "loss": 0.3684, + "step": 3829 + }, + { + "epoch": 1.7811192063246009, + "grad_norm": 0.3543993830680847, + "learning_rate": 4.238547316242149e-06, + "loss": 0.3383, + "step": 3830 + }, + { + "epoch": 1.7815842505037978, + "grad_norm": 0.4083957076072693, + "learning_rate": 4.235873053728475e-06, + "loss": 0.372, + "step": 3831 + }, + { + "epoch": 1.7820492946829949, + "grad_norm": 0.3555138409137726, + "learning_rate": 4.2331990150151745e-06, + "loss": 0.351, + "step": 3832 + }, + { + "epoch": 1.782514338862192, + "grad_norm": 0.40743115544319153, + "learning_rate": 4.230525200885425e-06, + "loss": 0.3407, + "step": 3833 + }, + { + "epoch": 1.782979383041389, + "grad_norm": 0.435773104429245, + "learning_rate": 4.227851612122347e-06, + "loss": 0.3665, + "step": 3834 + }, + { + "epoch": 1.7834444272205858, + "grad_norm": 0.3714430332183838, + "learning_rate": 4.225178249508988e-06, + "loss": 0.3557, + "step": 3835 + }, + { + "epoch": 1.783909471399783, + "grad_norm": 0.35675156116485596, + "learning_rate": 4.222505113828335e-06, + "loss": 0.3125, + "step": 3836 + }, + { + "epoch": 1.78437451557898, + "grad_norm": 0.3813643753528595, + "learning_rate": 4.219832205863303e-06, + "loss": 0.3653, + "step": 3837 + }, + { + "epoch": 1.784839559758177, + "grad_norm": 0.4000234603881836, + "learning_rate": 4.217159526396749e-06, + "loss": 0.3811, + "step": 3838 + }, + { + "epoch": 1.785304603937374, + "grad_norm": 0.3764249086380005, + "learning_rate": 4.214487076211452e-06, + "loss": 0.2882, + "step": 3839 + }, + { + "epoch": 1.7857696481165712, + "grad_norm": 0.47971102595329285, + "learning_rate": 4.2118148560901325e-06, + "loss": 0.3693, + "step": 3840 + }, + { + "epoch": 1.7862346922957681, + "grad_norm": 0.3611883521080017, + "learning_rate": 4.209142866815438e-06, + "loss": 0.3483, + "step": 3841 + }, + { + "epoch": 1.786699736474965, + "grad_norm": 0.3263775110244751, + "learning_rate": 4.206471109169952e-06, + "loss": 0.3338, + "step": 3842 + }, + { + "epoch": 1.7871647806541622, + "grad_norm": 0.4258721172809601, + "learning_rate": 4.2037995839361876e-06, + "loss": 0.3556, + "step": 3843 + }, + { + "epoch": 1.7876298248333593, + "grad_norm": 0.43471091985702515, + "learning_rate": 4.201128291896594e-06, + "loss": 0.349, + "step": 3844 + }, + { + "epoch": 1.7880948690125562, + "grad_norm": 0.4108128249645233, + "learning_rate": 4.198457233833546e-06, + "loss": 0.3643, + "step": 3845 + }, + { + "epoch": 1.7885599131917531, + "grad_norm": 0.3940763771533966, + "learning_rate": 4.195786410529357e-06, + "loss": 0.3659, + "step": 3846 + }, + { + "epoch": 1.7890249573709502, + "grad_norm": 0.4194324314594269, + "learning_rate": 4.193115822766263e-06, + "loss": 0.3359, + "step": 3847 + }, + { + "epoch": 1.7894900015501474, + "grad_norm": 0.37528660893440247, + "learning_rate": 4.19044547132644e-06, + "loss": 0.3255, + "step": 3848 + }, + { + "epoch": 1.7899550457293443, + "grad_norm": 0.4335568845272064, + "learning_rate": 4.1877753569919865e-06, + "loss": 0.3884, + "step": 3849 + }, + { + "epoch": 1.7904200899085412, + "grad_norm": 0.41178300976753235, + "learning_rate": 4.18510548054494e-06, + "loss": 0.3179, + "step": 3850 + }, + { + "epoch": 1.7908851340877383, + "grad_norm": 0.41776037216186523, + "learning_rate": 4.18243584276726e-06, + "loss": 0.3448, + "step": 3851 + }, + { + "epoch": 1.7913501782669354, + "grad_norm": 0.40628287196159363, + "learning_rate": 4.179766444440844e-06, + "loss": 0.3959, + "step": 3852 + }, + { + "epoch": 1.7918152224461323, + "grad_norm": 0.37497735023498535, + "learning_rate": 4.177097286347511e-06, + "loss": 0.3118, + "step": 3853 + }, + { + "epoch": 1.7922802666253292, + "grad_norm": 0.4827350378036499, + "learning_rate": 4.174428369269018e-06, + "loss": 0.3839, + "step": 3854 + }, + { + "epoch": 1.7927453108045266, + "grad_norm": 0.3956352472305298, + "learning_rate": 4.171759693987046e-06, + "loss": 0.3378, + "step": 3855 + }, + { + "epoch": 1.7932103549837235, + "grad_norm": 0.3712174892425537, + "learning_rate": 4.169091261283205e-06, + "loss": 0.3207, + "step": 3856 + }, + { + "epoch": 1.7936753991629204, + "grad_norm": 0.48411279916763306, + "learning_rate": 4.166423071939038e-06, + "loss": 0.3747, + "step": 3857 + }, + { + "epoch": 1.7941404433421175, + "grad_norm": 0.38150522112846375, + "learning_rate": 4.163755126736011e-06, + "loss": 0.3312, + "step": 3858 + }, + { + "epoch": 1.7946054875213147, + "grad_norm": 0.3953656554222107, + "learning_rate": 4.1610874264555265e-06, + "loss": 0.3349, + "step": 3859 + }, + { + "epoch": 1.7950705317005116, + "grad_norm": 0.4521973431110382, + "learning_rate": 4.158419971878907e-06, + "loss": 0.3255, + "step": 3860 + }, + { + "epoch": 1.7955355758797085, + "grad_norm": 0.3754480481147766, + "learning_rate": 4.155752763787409e-06, + "loss": 0.3399, + "step": 3861 + }, + { + "epoch": 1.7960006200589056, + "grad_norm": 0.36341235041618347, + "learning_rate": 4.1530858029622125e-06, + "loss": 0.3306, + "step": 3862 + }, + { + "epoch": 1.7964656642381027, + "grad_norm": 0.5212793946266174, + "learning_rate": 4.150419090184428e-06, + "loss": 0.4108, + "step": 3863 + }, + { + "epoch": 1.7969307084172996, + "grad_norm": 0.37436044216156006, + "learning_rate": 4.147752626235092e-06, + "loss": 0.3122, + "step": 3864 + }, + { + "epoch": 1.7973957525964965, + "grad_norm": 0.3866221606731415, + "learning_rate": 4.145086411895168e-06, + "loss": 0.3834, + "step": 3865 + }, + { + "epoch": 1.7978607967756937, + "grad_norm": 0.42558223009109497, + "learning_rate": 4.142420447945548e-06, + "loss": 0.4011, + "step": 3866 + }, + { + "epoch": 1.7983258409548908, + "grad_norm": 0.3522246479988098, + "learning_rate": 4.13975473516705e-06, + "loss": 0.2951, + "step": 3867 + }, + { + "epoch": 1.7987908851340877, + "grad_norm": 0.37685632705688477, + "learning_rate": 4.137089274340415e-06, + "loss": 0.3199, + "step": 3868 + }, + { + "epoch": 1.7992559293132846, + "grad_norm": 0.3827194571495056, + "learning_rate": 4.134424066246318e-06, + "loss": 0.3436, + "step": 3869 + }, + { + "epoch": 1.799720973492482, + "grad_norm": 0.4337718188762665, + "learning_rate": 4.131759111665349e-06, + "loss": 0.3462, + "step": 3870 + }, + { + "epoch": 1.8001860176716789, + "grad_norm": 0.40320268273353577, + "learning_rate": 4.129094411378034e-06, + "loss": 0.3414, + "step": 3871 + }, + { + "epoch": 1.8006510618508758, + "grad_norm": 0.37748897075653076, + "learning_rate": 4.1264299661648195e-06, + "loss": 0.3365, + "step": 3872 + }, + { + "epoch": 1.8011161060300729, + "grad_norm": 0.3657347857952118, + "learning_rate": 4.123765776806081e-06, + "loss": 0.339, + "step": 3873 + }, + { + "epoch": 1.80158115020927, + "grad_norm": 0.4097522497177124, + "learning_rate": 4.121101844082111e-06, + "loss": 0.3451, + "step": 3874 + }, + { + "epoch": 1.802046194388467, + "grad_norm": 0.3985246419906616, + "learning_rate": 4.118438168773137e-06, + "loss": 0.3597, + "step": 3875 + }, + { + "epoch": 1.8025112385676638, + "grad_norm": 0.41197559237480164, + "learning_rate": 4.115774751659302e-06, + "loss": 0.3268, + "step": 3876 + }, + { + "epoch": 1.802976282746861, + "grad_norm": 0.3859005272388458, + "learning_rate": 4.11311159352068e-06, + "loss": 0.3354, + "step": 3877 + }, + { + "epoch": 1.803441326926058, + "grad_norm": 0.3585662841796875, + "learning_rate": 4.110448695137266e-06, + "loss": 0.3303, + "step": 3878 + }, + { + "epoch": 1.803906371105255, + "grad_norm": 0.38057127594947815, + "learning_rate": 4.107786057288982e-06, + "loss": 0.3306, + "step": 3879 + }, + { + "epoch": 1.8043714152844519, + "grad_norm": 0.4793699085712433, + "learning_rate": 4.105123680755667e-06, + "loss": 0.3414, + "step": 3880 + }, + { + "epoch": 1.804836459463649, + "grad_norm": 0.4067707061767578, + "learning_rate": 4.102461566317093e-06, + "loss": 0.322, + "step": 3881 + }, + { + "epoch": 1.8053015036428461, + "grad_norm": 0.4255411922931671, + "learning_rate": 4.099799714752944e-06, + "loss": 0.296, + "step": 3882 + }, + { + "epoch": 1.805766547822043, + "grad_norm": 0.40467318892478943, + "learning_rate": 4.097138126842839e-06, + "loss": 0.3672, + "step": 3883 + }, + { + "epoch": 1.8062315920012402, + "grad_norm": 0.41983935236930847, + "learning_rate": 4.09447680336631e-06, + "loss": 0.3322, + "step": 3884 + }, + { + "epoch": 1.8066966361804373, + "grad_norm": 0.37576842308044434, + "learning_rate": 4.091815745102818e-06, + "loss": 0.3064, + "step": 3885 + }, + { + "epoch": 1.8071616803596342, + "grad_norm": 0.44206252694129944, + "learning_rate": 4.089154952831741e-06, + "loss": 0.3848, + "step": 3886 + }, + { + "epoch": 1.807626724538831, + "grad_norm": 0.3728598356246948, + "learning_rate": 4.086494427332386e-06, + "loss": 0.3392, + "step": 3887 + }, + { + "epoch": 1.8080917687180282, + "grad_norm": 0.41656002402305603, + "learning_rate": 4.083834169383972e-06, + "loss": 0.3432, + "step": 3888 + }, + { + "epoch": 1.8085568128972254, + "grad_norm": 0.40540826320648193, + "learning_rate": 4.0811741797656505e-06, + "loss": 0.3381, + "step": 3889 + }, + { + "epoch": 1.8090218570764223, + "grad_norm": 0.40355539321899414, + "learning_rate": 4.078514459256485e-06, + "loss": 0.3464, + "step": 3890 + }, + { + "epoch": 1.8094869012556192, + "grad_norm": 0.37891075015068054, + "learning_rate": 4.075855008635468e-06, + "loss": 0.3557, + "step": 3891 + }, + { + "epoch": 1.8099519454348163, + "grad_norm": 0.35543182492256165, + "learning_rate": 4.073195828681509e-06, + "loss": 0.3177, + "step": 3892 + }, + { + "epoch": 1.8104169896140134, + "grad_norm": 0.400921106338501, + "learning_rate": 4.070536920173435e-06, + "loss": 0.3601, + "step": 3893 + }, + { + "epoch": 1.8108820337932103, + "grad_norm": 0.35732823610305786, + "learning_rate": 4.067878283890002e-06, + "loss": 0.345, + "step": 3894 + }, + { + "epoch": 1.8113470779724072, + "grad_norm": 0.39798301458358765, + "learning_rate": 4.065219920609877e-06, + "loss": 0.3464, + "step": 3895 + }, + { + "epoch": 1.8118121221516044, + "grad_norm": 0.3960334062576294, + "learning_rate": 4.062561831111656e-06, + "loss": 0.3632, + "step": 3896 + }, + { + "epoch": 1.8122771663308015, + "grad_norm": 0.3809148669242859, + "learning_rate": 4.059904016173844e-06, + "loss": 0.3482, + "step": 3897 + }, + { + "epoch": 1.8127422105099984, + "grad_norm": 0.397305965423584, + "learning_rate": 4.05724647657488e-06, + "loss": 0.3559, + "step": 3898 + }, + { + "epoch": 1.8132072546891955, + "grad_norm": 0.3920312225818634, + "learning_rate": 4.0545892130931065e-06, + "loss": 0.3402, + "step": 3899 + }, + { + "epoch": 1.8136722988683927, + "grad_norm": 0.39747723937034607, + "learning_rate": 4.051932226506797e-06, + "loss": 0.3551, + "step": 3900 + }, + { + "epoch": 1.8141373430475896, + "grad_norm": 0.4466649889945984, + "learning_rate": 4.049275517594137e-06, + "loss": 0.3301, + "step": 3901 + }, + { + "epoch": 1.8146023872267865, + "grad_norm": 0.413919597864151, + "learning_rate": 4.046619087133238e-06, + "loss": 0.3609, + "step": 3902 + }, + { + "epoch": 1.8150674314059836, + "grad_norm": 0.34546610713005066, + "learning_rate": 4.04396293590212e-06, + "loss": 0.3098, + "step": 3903 + }, + { + "epoch": 1.8155324755851807, + "grad_norm": 0.40819764137268066, + "learning_rate": 4.0413070646787325e-06, + "loss": 0.3451, + "step": 3904 + }, + { + "epoch": 1.8159975197643776, + "grad_norm": 0.3749530017375946, + "learning_rate": 4.03865147424093e-06, + "loss": 0.3456, + "step": 3905 + }, + { + "epoch": 1.8164625639435745, + "grad_norm": 0.3659387528896332, + "learning_rate": 4.035996165366497e-06, + "loss": 0.3042, + "step": 3906 + }, + { + "epoch": 1.8169276081227717, + "grad_norm": 0.41476431488990784, + "learning_rate": 4.033341138833127e-06, + "loss": 0.3314, + "step": 3907 + }, + { + "epoch": 1.8173926523019688, + "grad_norm": 0.39919087290763855, + "learning_rate": 4.030686395418439e-06, + "loss": 0.3424, + "step": 3908 + }, + { + "epoch": 1.8178576964811657, + "grad_norm": 0.40032151341438293, + "learning_rate": 4.028031935899958e-06, + "loss": 0.3332, + "step": 3909 + }, + { + "epoch": 1.8183227406603626, + "grad_norm": 0.41616636514663696, + "learning_rate": 4.025377761055138e-06, + "loss": 0.3357, + "step": 3910 + }, + { + "epoch": 1.8187877848395597, + "grad_norm": 0.40295374393463135, + "learning_rate": 4.022723871661338e-06, + "loss": 0.3281, + "step": 3911 + }, + { + "epoch": 1.8192528290187568, + "grad_norm": 0.49246686697006226, + "learning_rate": 4.020070268495844e-06, + "loss": 0.3867, + "step": 3912 + }, + { + "epoch": 1.8197178731979538, + "grad_norm": 0.38608917593955994, + "learning_rate": 4.017416952335849e-06, + "loss": 0.3213, + "step": 3913 + }, + { + "epoch": 1.8201829173771509, + "grad_norm": 0.3529578745365143, + "learning_rate": 4.014763923958471e-06, + "loss": 0.3246, + "step": 3914 + }, + { + "epoch": 1.820647961556348, + "grad_norm": 0.42414984107017517, + "learning_rate": 4.0121111841407345e-06, + "loss": 0.3668, + "step": 3915 + }, + { + "epoch": 1.821113005735545, + "grad_norm": 0.406495064496994, + "learning_rate": 4.0094587336595875e-06, + "loss": 0.3176, + "step": 3916 + }, + { + "epoch": 1.8215780499147418, + "grad_norm": 0.39914000034332275, + "learning_rate": 4.006806573291886e-06, + "loss": 0.343, + "step": 3917 + }, + { + "epoch": 1.822043094093939, + "grad_norm": 0.3730376958847046, + "learning_rate": 4.004154703814407e-06, + "loss": 0.2956, + "step": 3918 + }, + { + "epoch": 1.822508138273136, + "grad_norm": 0.43556198477745056, + "learning_rate": 4.00150312600384e-06, + "loss": 0.3393, + "step": 3919 + }, + { + "epoch": 1.822973182452333, + "grad_norm": 0.3719125986099243, + "learning_rate": 3.998851840636789e-06, + "loss": 0.3421, + "step": 3920 + }, + { + "epoch": 1.8234382266315299, + "grad_norm": 0.40865078568458557, + "learning_rate": 3.996200848489771e-06, + "loss": 0.3597, + "step": 3921 + }, + { + "epoch": 1.823903270810727, + "grad_norm": 0.38333582878112793, + "learning_rate": 3.9935501503392214e-06, + "loss": 0.3604, + "step": 3922 + }, + { + "epoch": 1.8243683149899241, + "grad_norm": 0.40074682235717773, + "learning_rate": 3.990899746961483e-06, + "loss": 0.3406, + "step": 3923 + }, + { + "epoch": 1.824833359169121, + "grad_norm": 0.4388389587402344, + "learning_rate": 3.9882496391328185e-06, + "loss": 0.3327, + "step": 3924 + }, + { + "epoch": 1.825298403348318, + "grad_norm": 0.40420886874198914, + "learning_rate": 3.9855998276294e-06, + "loss": 0.3187, + "step": 3925 + }, + { + "epoch": 1.825763447527515, + "grad_norm": 0.4143049418926239, + "learning_rate": 3.982950313227317e-06, + "loss": 0.3311, + "step": 3926 + }, + { + "epoch": 1.8262284917067122, + "grad_norm": 0.4336578845977783, + "learning_rate": 3.980301096702567e-06, + "loss": 0.3923, + "step": 3927 + }, + { + "epoch": 1.826693535885909, + "grad_norm": 0.395292729139328, + "learning_rate": 3.9776521788310605e-06, + "loss": 0.3403, + "step": 3928 + }, + { + "epoch": 1.8271585800651062, + "grad_norm": 0.38078877329826355, + "learning_rate": 3.975003560388625e-06, + "loss": 0.3363, + "step": 3929 + }, + { + "epoch": 1.8276236242443034, + "grad_norm": 0.375010222196579, + "learning_rate": 3.9723552421509975e-06, + "loss": 0.3441, + "step": 3930 + }, + { + "epoch": 1.8280886684235003, + "grad_norm": 0.3816002607345581, + "learning_rate": 3.969707224893829e-06, + "loss": 0.3388, + "step": 3931 + }, + { + "epoch": 1.8285537126026972, + "grad_norm": 0.3783058524131775, + "learning_rate": 3.967059509392677e-06, + "loss": 0.3584, + "step": 3932 + }, + { + "epoch": 1.8290187567818943, + "grad_norm": 0.4086306095123291, + "learning_rate": 3.964412096423019e-06, + "loss": 0.3437, + "step": 3933 + }, + { + "epoch": 1.8294838009610914, + "grad_norm": 0.3575378954410553, + "learning_rate": 3.961764986760234e-06, + "loss": 0.3245, + "step": 3934 + }, + { + "epoch": 1.8299488451402883, + "grad_norm": 0.4207441806793213, + "learning_rate": 3.959118181179622e-06, + "loss": 0.3375, + "step": 3935 + }, + { + "epoch": 1.8304138893194852, + "grad_norm": 0.39432263374328613, + "learning_rate": 3.9564716804563855e-06, + "loss": 0.3562, + "step": 3936 + }, + { + "epoch": 1.8308789334986824, + "grad_norm": 0.3997967541217804, + "learning_rate": 3.9538254853656465e-06, + "loss": 0.3544, + "step": 3937 + }, + { + "epoch": 1.8313439776778795, + "grad_norm": 0.369190514087677, + "learning_rate": 3.951179596682427e-06, + "loss": 0.3524, + "step": 3938 + }, + { + "epoch": 1.8318090218570764, + "grad_norm": 0.37524935603141785, + "learning_rate": 3.948534015181671e-06, + "loss": 0.3549, + "step": 3939 + }, + { + "epoch": 1.8322740660362733, + "grad_norm": 0.36414414644241333, + "learning_rate": 3.945888741638219e-06, + "loss": 0.3104, + "step": 3940 + }, + { + "epoch": 1.8327391102154704, + "grad_norm": 0.3788766860961914, + "learning_rate": 3.943243776826834e-06, + "loss": 0.3434, + "step": 3941 + }, + { + "epoch": 1.8332041543946676, + "grad_norm": 0.3915799558162689, + "learning_rate": 3.94059912152218e-06, + "loss": 0.3332, + "step": 3942 + }, + { + "epoch": 1.8336691985738645, + "grad_norm": 0.4084991216659546, + "learning_rate": 3.937954776498839e-06, + "loss": 0.356, + "step": 3943 + }, + { + "epoch": 1.8341342427530616, + "grad_norm": 0.38015294075012207, + "learning_rate": 3.93531074253129e-06, + "loss": 0.3525, + "step": 3944 + }, + { + "epoch": 1.8345992869322587, + "grad_norm": 0.3744085133075714, + "learning_rate": 3.932667020393933e-06, + "loss": 0.3628, + "step": 3945 + }, + { + "epoch": 1.8350643311114556, + "grad_norm": 0.32951560616493225, + "learning_rate": 3.930023610861067e-06, + "loss": 0.3227, + "step": 3946 + }, + { + "epoch": 1.8355293752906525, + "grad_norm": 0.3684207499027252, + "learning_rate": 3.927380514706906e-06, + "loss": 0.3635, + "step": 3947 + }, + { + "epoch": 1.8359944194698496, + "grad_norm": 0.34596481919288635, + "learning_rate": 3.924737732705568e-06, + "loss": 0.3528, + "step": 3948 + }, + { + "epoch": 1.8364594636490468, + "grad_norm": 0.3639676868915558, + "learning_rate": 3.9220952656310855e-06, + "loss": 0.3836, + "step": 3949 + }, + { + "epoch": 1.8369245078282437, + "grad_norm": 0.34727898240089417, + "learning_rate": 3.919453114257389e-06, + "loss": 0.31, + "step": 3950 + }, + { + "epoch": 1.8373895520074406, + "grad_norm": 0.3681538701057434, + "learning_rate": 3.916811279358326e-06, + "loss": 0.3589, + "step": 3951 + }, + { + "epoch": 1.8378545961866377, + "grad_norm": 0.41221883893013, + "learning_rate": 3.9141697617076414e-06, + "loss": 0.3592, + "step": 3952 + }, + { + "epoch": 1.8383196403658348, + "grad_norm": 0.3793887794017792, + "learning_rate": 3.911528562078999e-06, + "loss": 0.3131, + "step": 3953 + }, + { + "epoch": 1.8387846845450317, + "grad_norm": 0.3928253650665283, + "learning_rate": 3.9088876812459585e-06, + "loss": 0.3305, + "step": 3954 + }, + { + "epoch": 1.8392497287242286, + "grad_norm": 0.4711274802684784, + "learning_rate": 3.906247119981995e-06, + "loss": 0.4163, + "step": 3955 + }, + { + "epoch": 1.8397147729034258, + "grad_norm": 0.3746742606163025, + "learning_rate": 3.903606879060483e-06, + "loss": 0.3223, + "step": 3956 + }, + { + "epoch": 1.840179817082623, + "grad_norm": 0.4172133207321167, + "learning_rate": 3.900966959254709e-06, + "loss": 0.3512, + "step": 3957 + }, + { + "epoch": 1.8406448612618198, + "grad_norm": 0.3593921959400177, + "learning_rate": 3.898327361337859e-06, + "loss": 0.3413, + "step": 3958 + }, + { + "epoch": 1.841109905441017, + "grad_norm": 0.40812554955482483, + "learning_rate": 3.89568808608303e-06, + "loss": 0.3318, + "step": 3959 + }, + { + "epoch": 1.841574949620214, + "grad_norm": 0.42885205149650574, + "learning_rate": 3.8930491342632235e-06, + "loss": 0.3515, + "step": 3960 + }, + { + "epoch": 1.842039993799411, + "grad_norm": 0.3699992597103119, + "learning_rate": 3.890410506651346e-06, + "loss": 0.3183, + "step": 3961 + }, + { + "epoch": 1.8425050379786079, + "grad_norm": 0.44218167662620544, + "learning_rate": 3.887772204020207e-06, + "loss": 0.3596, + "step": 3962 + }, + { + "epoch": 1.842970082157805, + "grad_norm": 0.38194793462753296, + "learning_rate": 3.885134227142525e-06, + "loss": 0.3587, + "step": 3963 + }, + { + "epoch": 1.8434351263370021, + "grad_norm": 0.3831635117530823, + "learning_rate": 3.882496576790918e-06, + "loss": 0.3255, + "step": 3964 + }, + { + "epoch": 1.843900170516199, + "grad_norm": 0.4331647455692291, + "learning_rate": 3.879859253737911e-06, + "loss": 0.3348, + "step": 3965 + }, + { + "epoch": 1.844365214695396, + "grad_norm": 0.41714081168174744, + "learning_rate": 3.8772222587559345e-06, + "loss": 0.317, + "step": 3966 + }, + { + "epoch": 1.844830258874593, + "grad_norm": 0.3952346444129944, + "learning_rate": 3.8745855926173205e-06, + "loss": 0.3647, + "step": 3967 + }, + { + "epoch": 1.8452953030537902, + "grad_norm": 0.38905858993530273, + "learning_rate": 3.871949256094308e-06, + "loss": 0.3791, + "step": 3968 + }, + { + "epoch": 1.845760347232987, + "grad_norm": 0.4492240250110626, + "learning_rate": 3.869313249959033e-06, + "loss": 0.3301, + "step": 3969 + }, + { + "epoch": 1.846225391412184, + "grad_norm": 0.4257234036922455, + "learning_rate": 3.866677574983542e-06, + "loss": 0.3827, + "step": 3970 + }, + { + "epoch": 1.8466904355913811, + "grad_norm": 0.3498407304286957, + "learning_rate": 3.86404223193978e-06, + "loss": 0.356, + "step": 3971 + }, + { + "epoch": 1.8471554797705783, + "grad_norm": 0.35863253474235535, + "learning_rate": 3.861407221599598e-06, + "loss": 0.3364, + "step": 3972 + }, + { + "epoch": 1.8476205239497752, + "grad_norm": 0.3656211495399475, + "learning_rate": 3.858772544734745e-06, + "loss": 0.3302, + "step": 3973 + }, + { + "epoch": 1.8480855681289723, + "grad_norm": 0.46760740876197815, + "learning_rate": 3.856138202116878e-06, + "loss": 0.3454, + "step": 3974 + }, + { + "epoch": 1.8485506123081694, + "grad_norm": 0.4070281684398651, + "learning_rate": 3.853504194517551e-06, + "loss": 0.3385, + "step": 3975 + }, + { + "epoch": 1.8490156564873663, + "grad_norm": 0.3659380376338959, + "learning_rate": 3.850870522708222e-06, + "loss": 0.3412, + "step": 3976 + }, + { + "epoch": 1.8494807006665632, + "grad_norm": 0.3773347735404968, + "learning_rate": 3.848237187460252e-06, + "loss": 0.3553, + "step": 3977 + }, + { + "epoch": 1.8499457448457604, + "grad_norm": 0.4317549765110016, + "learning_rate": 3.845604189544902e-06, + "loss": 0.3417, + "step": 3978 + }, + { + "epoch": 1.8504107890249575, + "grad_norm": 0.4554439187049866, + "learning_rate": 3.842971529733333e-06, + "loss": 0.3545, + "step": 3979 + }, + { + "epoch": 1.8508758332041544, + "grad_norm": 0.4062388241291046, + "learning_rate": 3.840339208796611e-06, + "loss": 0.3467, + "step": 3980 + }, + { + "epoch": 1.8513408773833513, + "grad_norm": 0.36868301033973694, + "learning_rate": 3.837707227505696e-06, + "loss": 0.3109, + "step": 3981 + }, + { + "epoch": 1.8518059215625484, + "grad_norm": 0.4157978594303131, + "learning_rate": 3.8350755866314555e-06, + "loss": 0.3758, + "step": 3982 + }, + { + "epoch": 1.8522709657417455, + "grad_norm": 0.4424905776977539, + "learning_rate": 3.8324442869446525e-06, + "loss": 0.3405, + "step": 3983 + }, + { + "epoch": 1.8527360099209425, + "grad_norm": 0.3770042657852173, + "learning_rate": 3.829813329215956e-06, + "loss": 0.3587, + "step": 3984 + }, + { + "epoch": 1.8532010541001394, + "grad_norm": 0.39475545287132263, + "learning_rate": 3.827182714215925e-06, + "loss": 0.3133, + "step": 3985 + }, + { + "epoch": 1.8536660982793365, + "grad_norm": 0.48087528347969055, + "learning_rate": 3.824552442715029e-06, + "loss": 0.3636, + "step": 3986 + }, + { + "epoch": 1.8541311424585336, + "grad_norm": 0.41496339440345764, + "learning_rate": 3.821922515483627e-06, + "loss": 0.3245, + "step": 3987 + }, + { + "epoch": 1.8545961866377305, + "grad_norm": 0.38482680916786194, + "learning_rate": 3.819292933291986e-06, + "loss": 0.3443, + "step": 3988 + }, + { + "epoch": 1.8550612308169276, + "grad_norm": 0.40696123242378235, + "learning_rate": 3.8166636969102655e-06, + "loss": 0.3456, + "step": 3989 + }, + { + "epoch": 1.8555262749961248, + "grad_norm": 0.37776339054107666, + "learning_rate": 3.814034807108529e-06, + "loss": 0.3619, + "step": 3990 + }, + { + "epoch": 1.8559913191753217, + "grad_norm": 0.40916046500205994, + "learning_rate": 3.8114062646567317e-06, + "loss": 0.3444, + "step": 3991 + }, + { + "epoch": 1.8564563633545186, + "grad_norm": 0.3734774589538574, + "learning_rate": 3.808778070324735e-06, + "loss": 0.341, + "step": 3992 + }, + { + "epoch": 1.8569214075337157, + "grad_norm": 0.38891860842704773, + "learning_rate": 3.80615022488229e-06, + "loss": 0.3358, + "step": 3993 + }, + { + "epoch": 1.8573864517129128, + "grad_norm": 0.40861591696739197, + "learning_rate": 3.803522729099054e-06, + "loss": 0.3432, + "step": 3994 + }, + { + "epoch": 1.8578514958921097, + "grad_norm": 0.45857205986976624, + "learning_rate": 3.8008955837445742e-06, + "loss": 0.3707, + "step": 3995 + }, + { + "epoch": 1.8583165400713066, + "grad_norm": 0.38723862171173096, + "learning_rate": 3.7982687895883036e-06, + "loss": 0.3699, + "step": 3996 + }, + { + "epoch": 1.8587815842505038, + "grad_norm": 0.37383538484573364, + "learning_rate": 3.795642347399582e-06, + "loss": 0.3368, + "step": 3997 + }, + { + "epoch": 1.859246628429701, + "grad_norm": 0.38865286111831665, + "learning_rate": 3.7930162579476566e-06, + "loss": 0.3678, + "step": 3998 + }, + { + "epoch": 1.8597116726088978, + "grad_norm": 0.42665407061576843, + "learning_rate": 3.790390522001662e-06, + "loss": 0.3673, + "step": 3999 + }, + { + "epoch": 1.8601767167880947, + "grad_norm": 0.42146944999694824, + "learning_rate": 3.787765140330636e-06, + "loss": 0.3535, + "step": 4000 + }, + { + "epoch": 1.860641760967292, + "grad_norm": 0.36168861389160156, + "learning_rate": 3.7851401137035114e-06, + "loss": 0.3267, + "step": 4001 + }, + { + "epoch": 1.861106805146489, + "grad_norm": 0.4150577485561371, + "learning_rate": 3.782515442889112e-06, + "loss": 0.3572, + "step": 4002 + }, + { + "epoch": 1.8615718493256859, + "grad_norm": 0.4027031362056732, + "learning_rate": 3.7798911286561655e-06, + "loss": 0.3416, + "step": 4003 + }, + { + "epoch": 1.862036893504883, + "grad_norm": 0.4184225797653198, + "learning_rate": 3.777267171773288e-06, + "loss": 0.38, + "step": 4004 + }, + { + "epoch": 1.8625019376840801, + "grad_norm": 0.3823773264884949, + "learning_rate": 3.774643573008995e-06, + "loss": 0.2838, + "step": 4005 + }, + { + "epoch": 1.862966981863277, + "grad_norm": 0.4209512770175934, + "learning_rate": 3.7720203331316946e-06, + "loss": 0.3812, + "step": 4006 + }, + { + "epoch": 1.863432026042474, + "grad_norm": 0.37146735191345215, + "learning_rate": 3.769397452909695e-06, + "loss": 0.3212, + "step": 4007 + }, + { + "epoch": 1.863897070221671, + "grad_norm": 0.334895521402359, + "learning_rate": 3.76677493311119e-06, + "loss": 0.3148, + "step": 4008 + }, + { + "epoch": 1.8643621144008682, + "grad_norm": 0.372598797082901, + "learning_rate": 3.7641527745042784e-06, + "loss": 0.3257, + "step": 4009 + }, + { + "epoch": 1.864827158580065, + "grad_norm": 0.4964183270931244, + "learning_rate": 3.7615309778569427e-06, + "loss": 0.3659, + "step": 4010 + }, + { + "epoch": 1.865292202759262, + "grad_norm": 0.37151283025741577, + "learning_rate": 3.7589095439370676e-06, + "loss": 0.3561, + "step": 4011 + }, + { + "epoch": 1.8657572469384591, + "grad_norm": 0.43840986490249634, + "learning_rate": 3.7562884735124273e-06, + "loss": 0.3004, + "step": 4012 + }, + { + "epoch": 1.8662222911176563, + "grad_norm": 0.4607786536216736, + "learning_rate": 3.7536677673506926e-06, + "loss": 0.4157, + "step": 4013 + }, + { + "epoch": 1.8666873352968532, + "grad_norm": 0.3740638792514801, + "learning_rate": 3.751047426219423e-06, + "loss": 0.3024, + "step": 4014 + }, + { + "epoch": 1.86715237947605, + "grad_norm": 0.41392412781715393, + "learning_rate": 3.7484274508860776e-06, + "loss": 0.3458, + "step": 4015 + }, + { + "epoch": 1.8676174236552474, + "grad_norm": 0.46794068813323975, + "learning_rate": 3.745807842118e-06, + "loss": 0.3345, + "step": 4016 + }, + { + "epoch": 1.8680824678344443, + "grad_norm": 0.400758296251297, + "learning_rate": 3.7431886006824347e-06, + "loss": 0.339, + "step": 4017 + }, + { + "epoch": 1.8685475120136412, + "grad_norm": 0.3659023642539978, + "learning_rate": 3.7405697273465125e-06, + "loss": 0.3756, + "step": 4018 + }, + { + "epoch": 1.8690125561928383, + "grad_norm": 0.4087202548980713, + "learning_rate": 3.7379512228772618e-06, + "loss": 0.3622, + "step": 4019 + }, + { + "epoch": 1.8694776003720355, + "grad_norm": 0.3660109341144562, + "learning_rate": 3.7353330880415963e-06, + "loss": 0.3189, + "step": 4020 + }, + { + "epoch": 1.8699426445512324, + "grad_norm": 0.3783299922943115, + "learning_rate": 3.7327153236063295e-06, + "loss": 0.3218, + "step": 4021 + }, + { + "epoch": 1.8704076887304293, + "grad_norm": 0.3811236023902893, + "learning_rate": 3.7300979303381576e-06, + "loss": 0.3672, + "step": 4022 + }, + { + "epoch": 1.8708727329096264, + "grad_norm": 0.3874422013759613, + "learning_rate": 3.7274809090036757e-06, + "loss": 0.377, + "step": 4023 + }, + { + "epoch": 1.8713377770888235, + "grad_norm": 0.40507352352142334, + "learning_rate": 3.724864260369364e-06, + "loss": 0.3598, + "step": 4024 + }, + { + "epoch": 1.8718028212680204, + "grad_norm": 0.3792964220046997, + "learning_rate": 3.7222479852016015e-06, + "loss": 0.3304, + "step": 4025 + }, + { + "epoch": 1.8722678654472173, + "grad_norm": 0.3633670210838318, + "learning_rate": 3.7196320842666467e-06, + "loss": 0.2997, + "step": 4026 + }, + { + "epoch": 1.8727329096264145, + "grad_norm": 0.3763364851474762, + "learning_rate": 3.7170165583306595e-06, + "loss": 0.3596, + "step": 4027 + }, + { + "epoch": 1.8731979538056116, + "grad_norm": 0.3550674617290497, + "learning_rate": 3.71440140815968e-06, + "loss": 0.3433, + "step": 4028 + }, + { + "epoch": 1.8736629979848085, + "grad_norm": 0.3978072702884674, + "learning_rate": 3.7117866345196473e-06, + "loss": 0.3898, + "step": 4029 + }, + { + "epoch": 1.8741280421640054, + "grad_norm": 0.417102575302124, + "learning_rate": 3.709172238176384e-06, + "loss": 0.337, + "step": 4030 + }, + { + "epoch": 1.8745930863432028, + "grad_norm": 0.36194002628326416, + "learning_rate": 3.706558219895607e-06, + "loss": 0.3347, + "step": 4031 + }, + { + "epoch": 1.8750581305223997, + "grad_norm": 0.39445623755455017, + "learning_rate": 3.7039445804429154e-06, + "loss": 0.3578, + "step": 4032 + }, + { + "epoch": 1.8755231747015966, + "grad_norm": 0.3856761157512665, + "learning_rate": 3.7013313205838066e-06, + "loss": 0.3493, + "step": 4033 + }, + { + "epoch": 1.8759882188807937, + "grad_norm": 0.3411782383918762, + "learning_rate": 3.698718441083657e-06, + "loss": 0.3297, + "step": 4034 + }, + { + "epoch": 1.8764532630599908, + "grad_norm": 0.421271950006485, + "learning_rate": 3.6961059427077407e-06, + "loss": 0.3634, + "step": 4035 + }, + { + "epoch": 1.8769183072391877, + "grad_norm": 0.4099300801753998, + "learning_rate": 3.693493826221215e-06, + "loss": 0.3815, + "step": 4036 + }, + { + "epoch": 1.8773833514183846, + "grad_norm": 0.37041401863098145, + "learning_rate": 3.6908820923891235e-06, + "loss": 0.3621, + "step": 4037 + }, + { + "epoch": 1.8778483955975818, + "grad_norm": 0.36186453700065613, + "learning_rate": 3.6882707419764053e-06, + "loss": 0.3328, + "step": 4038 + }, + { + "epoch": 1.878313439776779, + "grad_norm": 0.38483038544654846, + "learning_rate": 3.6856597757478784e-06, + "loss": 0.3676, + "step": 4039 + }, + { + "epoch": 1.8787784839559758, + "grad_norm": 0.3237282335758209, + "learning_rate": 3.6830491944682543e-06, + "loss": 0.3129, + "step": 4040 + }, + { + "epoch": 1.8792435281351727, + "grad_norm": 0.3765997588634491, + "learning_rate": 3.6804389989021292e-06, + "loss": 0.3639, + "step": 4041 + }, + { + "epoch": 1.8797085723143698, + "grad_norm": 0.3259957432746887, + "learning_rate": 3.6778291898139907e-06, + "loss": 0.3183, + "step": 4042 + }, + { + "epoch": 1.880173616493567, + "grad_norm": 0.37962499260902405, + "learning_rate": 3.675219767968203e-06, + "loss": 0.3432, + "step": 4043 + }, + { + "epoch": 1.8806386606727639, + "grad_norm": 0.34864094853401184, + "learning_rate": 3.6726107341290285e-06, + "loss": 0.3045, + "step": 4044 + }, + { + "epoch": 1.881103704851961, + "grad_norm": 0.3533942997455597, + "learning_rate": 3.6700020890606068e-06, + "loss": 0.3542, + "step": 4045 + }, + { + "epoch": 1.8815687490311581, + "grad_norm": 0.351633220911026, + "learning_rate": 3.667393833526972e-06, + "loss": 0.3474, + "step": 4046 + }, + { + "epoch": 1.882033793210355, + "grad_norm": 0.3609406650066376, + "learning_rate": 3.664785968292036e-06, + "loss": 0.3546, + "step": 4047 + }, + { + "epoch": 1.882498837389552, + "grad_norm": 0.3869522511959076, + "learning_rate": 3.6621784941196036e-06, + "loss": 0.3715, + "step": 4048 + }, + { + "epoch": 1.882963881568749, + "grad_norm": 0.36395883560180664, + "learning_rate": 3.6595714117733583e-06, + "loss": 0.3602, + "step": 4049 + }, + { + "epoch": 1.8834289257479462, + "grad_norm": 0.36276188492774963, + "learning_rate": 3.656964722016875e-06, + "loss": 0.3046, + "step": 4050 + }, + { + "epoch": 1.883893969927143, + "grad_norm": 0.3588871955871582, + "learning_rate": 3.6543584256136076e-06, + "loss": 0.3384, + "step": 4051 + }, + { + "epoch": 1.88435901410634, + "grad_norm": 0.39765575528144836, + "learning_rate": 3.6517525233269015e-06, + "loss": 0.3879, + "step": 4052 + }, + { + "epoch": 1.8848240582855371, + "grad_norm": 0.34334567189216614, + "learning_rate": 3.6491470159199806e-06, + "loss": 0.316, + "step": 4053 + }, + { + "epoch": 1.8852891024647342, + "grad_norm": 0.3806763291358948, + "learning_rate": 3.646541904155958e-06, + "loss": 0.3474, + "step": 4054 + }, + { + "epoch": 1.8857541466439312, + "grad_norm": 0.3665696978569031, + "learning_rate": 3.643937188797826e-06, + "loss": 0.3653, + "step": 4055 + }, + { + "epoch": 1.886219190823128, + "grad_norm": 0.3573841154575348, + "learning_rate": 3.641332870608466e-06, + "loss": 0.3426, + "step": 4056 + }, + { + "epoch": 1.8866842350023252, + "grad_norm": 0.3744082450866699, + "learning_rate": 3.6387289503506375e-06, + "loss": 0.3397, + "step": 4057 + }, + { + "epoch": 1.8871492791815223, + "grad_norm": 0.37690484523773193, + "learning_rate": 3.6361254287869886e-06, + "loss": 0.3628, + "step": 4058 + }, + { + "epoch": 1.8876143233607192, + "grad_norm": 0.34452709555625916, + "learning_rate": 3.6335223066800466e-06, + "loss": 0.3244, + "step": 4059 + }, + { + "epoch": 1.8880793675399163, + "grad_norm": 0.4054635167121887, + "learning_rate": 3.6309195847922284e-06, + "loss": 0.3616, + "step": 4060 + }, + { + "epoch": 1.8885444117191135, + "grad_norm": 0.40721654891967773, + "learning_rate": 3.628317263885823e-06, + "loss": 0.3951, + "step": 4061 + }, + { + "epoch": 1.8890094558983104, + "grad_norm": 0.3469128906726837, + "learning_rate": 3.625715344723012e-06, + "loss": 0.2801, + "step": 4062 + }, + { + "epoch": 1.8894745000775073, + "grad_norm": 0.35594481229782104, + "learning_rate": 3.623113828065853e-06, + "loss": 0.3517, + "step": 4063 + }, + { + "epoch": 1.8899395442567044, + "grad_norm": 0.3937917947769165, + "learning_rate": 3.6205127146762885e-06, + "loss": 0.3679, + "step": 4064 + }, + { + "epoch": 1.8904045884359015, + "grad_norm": 0.3829907476902008, + "learning_rate": 3.617912005316142e-06, + "loss": 0.3391, + "step": 4065 + }, + { + "epoch": 1.8908696326150984, + "grad_norm": 0.36475563049316406, + "learning_rate": 3.615311700747122e-06, + "loss": 0.335, + "step": 4066 + }, + { + "epoch": 1.8913346767942953, + "grad_norm": 0.3677665591239929, + "learning_rate": 3.6127118017308116e-06, + "loss": 0.3373, + "step": 4067 + }, + { + "epoch": 1.8917997209734925, + "grad_norm": 0.3832724094390869, + "learning_rate": 3.6101123090286814e-06, + "loss": 0.3258, + "step": 4068 + }, + { + "epoch": 1.8922647651526896, + "grad_norm": 0.3629043698310852, + "learning_rate": 3.607513223402078e-06, + "loss": 0.3766, + "step": 4069 + }, + { + "epoch": 1.8927298093318865, + "grad_norm": 0.3398493826389313, + "learning_rate": 3.6049145456122347e-06, + "loss": 0.3425, + "step": 4070 + }, + { + "epoch": 1.8931948535110834, + "grad_norm": 0.36889976263046265, + "learning_rate": 3.6023162764202613e-06, + "loss": 0.3559, + "step": 4071 + }, + { + "epoch": 1.8936598976902805, + "grad_norm": 0.393019437789917, + "learning_rate": 3.599718416587146e-06, + "loss": 0.3255, + "step": 4072 + }, + { + "epoch": 1.8941249418694777, + "grad_norm": 0.401734322309494, + "learning_rate": 3.5971209668737626e-06, + "loss": 0.3756, + "step": 4073 + }, + { + "epoch": 1.8945899860486746, + "grad_norm": 0.34484103322029114, + "learning_rate": 3.5945239280408596e-06, + "loss": 0.321, + "step": 4074 + }, + { + "epoch": 1.8950550302278717, + "grad_norm": 0.41480597853660583, + "learning_rate": 3.591927300849069e-06, + "loss": 0.3551, + "step": 4075 + }, + { + "epoch": 1.8955200744070688, + "grad_norm": 0.3733346462249756, + "learning_rate": 3.5893310860588997e-06, + "loss": 0.3334, + "step": 4076 + }, + { + "epoch": 1.8959851185862657, + "grad_norm": 0.3196105360984802, + "learning_rate": 3.5867352844307433e-06, + "loss": 0.3302, + "step": 4077 + }, + { + "epoch": 1.8964501627654626, + "grad_norm": 0.3849073052406311, + "learning_rate": 3.5841398967248654e-06, + "loss": 0.3548, + "step": 4078 + }, + { + "epoch": 1.8969152069446598, + "grad_norm": 0.38783520460128784, + "learning_rate": 3.5815449237014144e-06, + "loss": 0.3313, + "step": 4079 + }, + { + "epoch": 1.8973802511238569, + "grad_norm": 0.3971535563468933, + "learning_rate": 3.578950366120414e-06, + "loss": 0.3581, + "step": 4080 + }, + { + "epoch": 1.8978452953030538, + "grad_norm": 0.33792009949684143, + "learning_rate": 3.5763562247417694e-06, + "loss": 0.3269, + "step": 4081 + }, + { + "epoch": 1.8983103394822507, + "grad_norm": 0.36532509326934814, + "learning_rate": 3.5737625003252606e-06, + "loss": 0.3342, + "step": 4082 + }, + { + "epoch": 1.8987753836614478, + "grad_norm": 0.36312729120254517, + "learning_rate": 3.5711691936305522e-06, + "loss": 0.3322, + "step": 4083 + }, + { + "epoch": 1.899240427840645, + "grad_norm": 0.36655622720718384, + "learning_rate": 3.568576305417175e-06, + "loss": 0.3621, + "step": 4084 + }, + { + "epoch": 1.8997054720198419, + "grad_norm": 0.38222381472587585, + "learning_rate": 3.5659838364445505e-06, + "loss": 0.3598, + "step": 4085 + }, + { + "epoch": 1.9001705161990388, + "grad_norm": 0.356871098279953, + "learning_rate": 3.5633917874719642e-06, + "loss": 0.3531, + "step": 4086 + }, + { + "epoch": 1.9006355603782359, + "grad_norm": 0.34935128688812256, + "learning_rate": 3.5608001592585895e-06, + "loss": 0.3085, + "step": 4087 + }, + { + "epoch": 1.901100604557433, + "grad_norm": 0.3709997236728668, + "learning_rate": 3.55820895256347e-06, + "loss": 0.3215, + "step": 4088 + }, + { + "epoch": 1.90156564873663, + "grad_norm": 0.39799949526786804, + "learning_rate": 3.5556181681455314e-06, + "loss": 0.3538, + "step": 4089 + }, + { + "epoch": 1.902030692915827, + "grad_norm": 0.35450243949890137, + "learning_rate": 3.553027806763568e-06, + "loss": 0.3455, + "step": 4090 + }, + { + "epoch": 1.9024957370950242, + "grad_norm": 0.43028944730758667, + "learning_rate": 3.5504378691762586e-06, + "loss": 0.3607, + "step": 4091 + }, + { + "epoch": 1.902960781274221, + "grad_norm": 0.35662081837654114, + "learning_rate": 3.5478483561421497e-06, + "loss": 0.312, + "step": 4092 + }, + { + "epoch": 1.903425825453418, + "grad_norm": 0.38295355439186096, + "learning_rate": 3.5452592684196707e-06, + "loss": 0.3484, + "step": 4093 + }, + { + "epoch": 1.9038908696326151, + "grad_norm": 0.38135382533073425, + "learning_rate": 3.542670606767121e-06, + "loss": 0.3646, + "step": 4094 + }, + { + "epoch": 1.9043559138118122, + "grad_norm": 0.3967789113521576, + "learning_rate": 3.540082371942682e-06, + "loss": 0.3255, + "step": 4095 + }, + { + "epoch": 1.9048209579910091, + "grad_norm": 0.4134833514690399, + "learning_rate": 3.5374945647044e-06, + "loss": 0.3487, + "step": 4096 + }, + { + "epoch": 1.905286002170206, + "grad_norm": 0.36041533946990967, + "learning_rate": 3.5349071858102056e-06, + "loss": 0.3114, + "step": 4097 + }, + { + "epoch": 1.9057510463494032, + "grad_norm": 0.40801483392715454, + "learning_rate": 3.5323202360178976e-06, + "loss": 0.3595, + "step": 4098 + }, + { + "epoch": 1.9062160905286003, + "grad_norm": 0.39261582493782043, + "learning_rate": 3.529733716085154e-06, + "loss": 0.3617, + "step": 4099 + }, + { + "epoch": 1.9066811347077972, + "grad_norm": 0.3464079797267914, + "learning_rate": 3.5271476267695216e-06, + "loss": 0.3262, + "step": 4100 + }, + { + "epoch": 1.9071461788869941, + "grad_norm": 0.461972177028656, + "learning_rate": 3.5245619688284277e-06, + "loss": 0.3115, + "step": 4101 + }, + { + "epoch": 1.9076112230661912, + "grad_norm": 0.35122260451316833, + "learning_rate": 3.5219767430191653e-06, + "loss": 0.3377, + "step": 4102 + }, + { + "epoch": 1.9080762672453884, + "grad_norm": 0.4606049060821533, + "learning_rate": 3.5193919500989093e-06, + "loss": 0.3413, + "step": 4103 + }, + { + "epoch": 1.9085413114245853, + "grad_norm": 0.40791839361190796, + "learning_rate": 3.516807590824699e-06, + "loss": 0.3605, + "step": 4104 + }, + { + "epoch": 1.9090063556037824, + "grad_norm": 0.3770095407962799, + "learning_rate": 3.514223665953455e-06, + "loss": 0.3335, + "step": 4105 + }, + { + "epoch": 1.9094713997829795, + "grad_norm": 0.3994191884994507, + "learning_rate": 3.5116401762419643e-06, + "loss": 0.2999, + "step": 4106 + }, + { + "epoch": 1.9099364439621764, + "grad_norm": 0.4128780663013458, + "learning_rate": 3.509057122446893e-06, + "loss": 0.3623, + "step": 4107 + }, + { + "epoch": 1.9104014881413733, + "grad_norm": 0.39665135741233826, + "learning_rate": 3.506474505324772e-06, + "loss": 0.3586, + "step": 4108 + }, + { + "epoch": 1.9108665323205705, + "grad_norm": 0.3825541138648987, + "learning_rate": 3.503892325632007e-06, + "loss": 0.3502, + "step": 4109 + }, + { + "epoch": 1.9113315764997676, + "grad_norm": 0.3846040368080139, + "learning_rate": 3.5013105841248794e-06, + "loss": 0.3397, + "step": 4110 + }, + { + "epoch": 1.9117966206789645, + "grad_norm": 0.4117841422557831, + "learning_rate": 3.4987292815595376e-06, + "loss": 0.3676, + "step": 4111 + }, + { + "epoch": 1.9122616648581614, + "grad_norm": 0.38701215386390686, + "learning_rate": 3.496148418692006e-06, + "loss": 0.3433, + "step": 4112 + }, + { + "epoch": 1.9127267090373585, + "grad_norm": 0.3598323464393616, + "learning_rate": 3.4935679962781722e-06, + "loss": 0.346, + "step": 4113 + }, + { + "epoch": 1.9131917532165557, + "grad_norm": 0.40813809633255005, + "learning_rate": 3.4909880150738057e-06, + "loss": 0.3356, + "step": 4114 + }, + { + "epoch": 1.9136567973957526, + "grad_norm": 0.3636961579322815, + "learning_rate": 3.4884084758345365e-06, + "loss": 0.3225, + "step": 4115 + }, + { + "epoch": 1.9141218415749495, + "grad_norm": 0.3554372489452362, + "learning_rate": 3.4858293793158727e-06, + "loss": 0.3349, + "step": 4116 + }, + { + "epoch": 1.9145868857541466, + "grad_norm": 0.4285871684551239, + "learning_rate": 3.4832507262731876e-06, + "loss": 0.3442, + "step": 4117 + }, + { + "epoch": 1.9150519299333437, + "grad_norm": 0.38151922821998596, + "learning_rate": 3.4806725174617305e-06, + "loss": 0.3202, + "step": 4118 + }, + { + "epoch": 1.9155169741125406, + "grad_norm": 0.4228741228580475, + "learning_rate": 3.4780947536366115e-06, + "loss": 0.3516, + "step": 4119 + }, + { + "epoch": 1.9159820182917378, + "grad_norm": 0.3742198944091797, + "learning_rate": 3.4755174355528214e-06, + "loss": 0.3673, + "step": 4120 + }, + { + "epoch": 1.9164470624709349, + "grad_norm": 0.3714161217212677, + "learning_rate": 3.4729405639652102e-06, + "loss": 0.3152, + "step": 4121 + }, + { + "epoch": 1.9169121066501318, + "grad_norm": 0.3599150478839874, + "learning_rate": 3.470364139628504e-06, + "loss": 0.3102, + "step": 4122 + }, + { + "epoch": 1.9173771508293287, + "grad_norm": 0.4052508771419525, + "learning_rate": 3.467788163297294e-06, + "loss": 0.3409, + "step": 4123 + }, + { + "epoch": 1.9178421950085258, + "grad_norm": 0.39652350544929504, + "learning_rate": 3.465212635726045e-06, + "loss": 0.4028, + "step": 4124 + }, + { + "epoch": 1.918307239187723, + "grad_norm": 0.3475923240184784, + "learning_rate": 3.462637557669084e-06, + "loss": 0.3194, + "step": 4125 + }, + { + "epoch": 1.9187722833669199, + "grad_norm": 0.35219481587409973, + "learning_rate": 3.460062929880612e-06, + "loss": 0.3094, + "step": 4126 + }, + { + "epoch": 1.9192373275461168, + "grad_norm": 0.3954284191131592, + "learning_rate": 3.4574887531146926e-06, + "loss": 0.3428, + "step": 4127 + }, + { + "epoch": 1.9197023717253139, + "grad_norm": 0.4456411600112915, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.3701, + "step": 4128 + }, + { + "epoch": 1.920167415904511, + "grad_norm": 0.3940032720565796, + "learning_rate": 3.4523417556661244e-06, + "loss": 0.3499, + "step": 4129 + }, + { + "epoch": 1.920632460083708, + "grad_norm": 0.36356043815612793, + "learning_rate": 3.4497689364909483e-06, + "loss": 0.362, + "step": 4130 + }, + { + "epoch": 1.9210975042629048, + "grad_norm": 0.34411749243736267, + "learning_rate": 3.4471965713532675e-06, + "loss": 0.3274, + "step": 4131 + }, + { + "epoch": 1.921562548442102, + "grad_norm": 0.36001211404800415, + "learning_rate": 3.444624661006491e-06, + "loss": 0.3625, + "step": 4132 + }, + { + "epoch": 1.922027592621299, + "grad_norm": 0.3642420172691345, + "learning_rate": 3.4420532062038846e-06, + "loss": 0.3257, + "step": 4133 + }, + { + "epoch": 1.922492636800496, + "grad_norm": 0.3693174719810486, + "learning_rate": 3.43948220769859e-06, + "loss": 0.3414, + "step": 4134 + }, + { + "epoch": 1.922957680979693, + "grad_norm": 0.4484702944755554, + "learning_rate": 3.4369116662436074e-06, + "loss": 0.3864, + "step": 4135 + }, + { + "epoch": 1.9234227251588902, + "grad_norm": 0.3540983200073242, + "learning_rate": 3.4343415825918102e-06, + "loss": 0.3052, + "step": 4136 + }, + { + "epoch": 1.9238877693380871, + "grad_norm": 0.35332098603248596, + "learning_rate": 3.4317719574959307e-06, + "loss": 0.3524, + "step": 4137 + }, + { + "epoch": 1.924352813517284, + "grad_norm": 0.41718167066574097, + "learning_rate": 3.4292027917085733e-06, + "loss": 0.3519, + "step": 4138 + }, + { + "epoch": 1.9248178576964812, + "grad_norm": 0.42364948987960815, + "learning_rate": 3.4266340859822023e-06, + "loss": 0.3231, + "step": 4139 + }, + { + "epoch": 1.9252829018756783, + "grad_norm": 0.38962802290916443, + "learning_rate": 3.424065841069152e-06, + "loss": 0.3384, + "step": 4140 + }, + { + "epoch": 1.9257479460548752, + "grad_norm": 0.37895727157592773, + "learning_rate": 3.421498057721617e-06, + "loss": 0.3477, + "step": 4141 + }, + { + "epoch": 1.926212990234072, + "grad_norm": 0.40839117765426636, + "learning_rate": 3.4189307366916635e-06, + "loss": 0.3364, + "step": 4142 + }, + { + "epoch": 1.9266780344132692, + "grad_norm": 0.39300408959388733, + "learning_rate": 3.4163638787312152e-06, + "loss": 0.3477, + "step": 4143 + }, + { + "epoch": 1.9271430785924664, + "grad_norm": 0.39907246828079224, + "learning_rate": 3.4137974845920616e-06, + "loss": 0.3376, + "step": 4144 + }, + { + "epoch": 1.9276081227716633, + "grad_norm": 0.38821595907211304, + "learning_rate": 3.411231555025861e-06, + "loss": 0.3703, + "step": 4145 + }, + { + "epoch": 1.9280731669508602, + "grad_norm": 0.3863977789878845, + "learning_rate": 3.4086660907841307e-06, + "loss": 0.3541, + "step": 4146 + }, + { + "epoch": 1.9285382111300573, + "grad_norm": 0.36651626229286194, + "learning_rate": 3.4061010926182557e-06, + "loss": 0.3067, + "step": 4147 + }, + { + "epoch": 1.9290032553092544, + "grad_norm": 0.4503706693649292, + "learning_rate": 3.403536561279479e-06, + "loss": 0.3646, + "step": 4148 + }, + { + "epoch": 1.9294682994884513, + "grad_norm": 0.417574942111969, + "learning_rate": 3.400972497518914e-06, + "loss": 0.3605, + "step": 4149 + }, + { + "epoch": 1.9299333436676485, + "grad_norm": 0.3759341239929199, + "learning_rate": 3.398408902087529e-06, + "loss": 0.3452, + "step": 4150 + }, + { + "epoch": 1.9303983878468456, + "grad_norm": 0.36154696345329285, + "learning_rate": 3.395845775736163e-06, + "loss": 0.3387, + "step": 4151 + }, + { + "epoch": 1.9308634320260425, + "grad_norm": 0.35659950971603394, + "learning_rate": 3.3932831192155115e-06, + "loss": 0.331, + "step": 4152 + }, + { + "epoch": 1.9313284762052394, + "grad_norm": 0.36559152603149414, + "learning_rate": 3.3907209332761383e-06, + "loss": 0.3622, + "step": 4153 + }, + { + "epoch": 1.9317935203844365, + "grad_norm": 0.365607887506485, + "learning_rate": 3.3881592186684616e-06, + "loss": 0.3181, + "step": 4154 + }, + { + "epoch": 1.9322585645636337, + "grad_norm": 0.3874776065349579, + "learning_rate": 3.3855979761427705e-06, + "loss": 0.3239, + "step": 4155 + }, + { + "epoch": 1.9327236087428306, + "grad_norm": 0.3716128170490265, + "learning_rate": 3.383037206449207e-06, + "loss": 0.3572, + "step": 4156 + }, + { + "epoch": 1.9331886529220275, + "grad_norm": 0.38972359895706177, + "learning_rate": 3.3804769103377827e-06, + "loss": 0.3569, + "step": 4157 + }, + { + "epoch": 1.9336536971012246, + "grad_norm": 0.3612194061279297, + "learning_rate": 3.377917088558364e-06, + "loss": 0.3391, + "step": 4158 + }, + { + "epoch": 1.9341187412804217, + "grad_norm": 0.3686378002166748, + "learning_rate": 3.3753577418606844e-06, + "loss": 0.3359, + "step": 4159 + }, + { + "epoch": 1.9345837854596186, + "grad_norm": 0.35202595591545105, + "learning_rate": 3.3727988709943303e-06, + "loss": 0.3094, + "step": 4160 + }, + { + "epoch": 1.9350488296388155, + "grad_norm": 0.3890860676765442, + "learning_rate": 3.370240476708759e-06, + "loss": 0.346, + "step": 4161 + }, + { + "epoch": 1.9355138738180129, + "grad_norm": 0.3682956397533417, + "learning_rate": 3.367682559753277e-06, + "loss": 0.3363, + "step": 4162 + }, + { + "epoch": 1.9359789179972098, + "grad_norm": 0.37430447340011597, + "learning_rate": 3.36512512087706e-06, + "loss": 0.335, + "step": 4163 + }, + { + "epoch": 1.9364439621764067, + "grad_norm": 0.371165007352829, + "learning_rate": 3.3625681608291393e-06, + "loss": 0.3534, + "step": 4164 + }, + { + "epoch": 1.9369090063556038, + "grad_norm": 0.3415764570236206, + "learning_rate": 3.360011680358409e-06, + "loss": 0.3183, + "step": 4165 + }, + { + "epoch": 1.937374050534801, + "grad_norm": 0.41029369831085205, + "learning_rate": 3.3574556802136164e-06, + "loss": 0.3807, + "step": 4166 + }, + { + "epoch": 1.9378390947139978, + "grad_norm": 0.41946086287498474, + "learning_rate": 3.354900161143377e-06, + "loss": 0.3535, + "step": 4167 + }, + { + "epoch": 1.9383041388931948, + "grad_norm": 0.36154383420944214, + "learning_rate": 3.352345123896158e-06, + "loss": 0.3153, + "step": 4168 + }, + { + "epoch": 1.9387691830723919, + "grad_norm": 0.3600989878177643, + "learning_rate": 3.3497905692202892e-06, + "loss": 0.3591, + "step": 4169 + }, + { + "epoch": 1.939234227251589, + "grad_norm": 0.37105801701545715, + "learning_rate": 3.347236497863957e-06, + "loss": 0.3842, + "step": 4170 + }, + { + "epoch": 1.939699271430786, + "grad_norm": 0.32860368490219116, + "learning_rate": 3.3446829105752103e-06, + "loss": 0.289, + "step": 4171 + }, + { + "epoch": 1.9401643156099828, + "grad_norm": 0.3648563325405121, + "learning_rate": 3.34212980810195e-06, + "loss": 0.3297, + "step": 4172 + }, + { + "epoch": 1.94062935978918, + "grad_norm": 0.3792143166065216, + "learning_rate": 3.3395771911919416e-06, + "loss": 0.3211, + "step": 4173 + }, + { + "epoch": 1.941094403968377, + "grad_norm": 0.4156646430492401, + "learning_rate": 3.3370250605928013e-06, + "loss": 0.3774, + "step": 4174 + }, + { + "epoch": 1.941559448147574, + "grad_norm": 0.3592356741428375, + "learning_rate": 3.33447341705201e-06, + "loss": 0.3604, + "step": 4175 + }, + { + "epoch": 1.9420244923267709, + "grad_norm": 0.4310251474380493, + "learning_rate": 3.3319222613169e-06, + "loss": 0.3351, + "step": 4176 + }, + { + "epoch": 1.9424895365059682, + "grad_norm": 0.40586546063423157, + "learning_rate": 3.3293715941346676e-06, + "loss": 0.312, + "step": 4177 + }, + { + "epoch": 1.9429545806851651, + "grad_norm": 0.374381422996521, + "learning_rate": 3.3268214162523563e-06, + "loss": 0.3451, + "step": 4178 + }, + { + "epoch": 1.943419624864362, + "grad_norm": 0.3669213056564331, + "learning_rate": 3.324271728416877e-06, + "loss": 0.3387, + "step": 4179 + }, + { + "epoch": 1.9438846690435592, + "grad_norm": 0.39169323444366455, + "learning_rate": 3.321722531374988e-06, + "loss": 0.3568, + "step": 4180 + }, + { + "epoch": 1.9443497132227563, + "grad_norm": 0.3700978457927704, + "learning_rate": 3.3191738258733085e-06, + "loss": 0.3265, + "step": 4181 + }, + { + "epoch": 1.9448147574019532, + "grad_norm": 0.46921029686927795, + "learning_rate": 3.316625612658315e-06, + "loss": 0.3883, + "step": 4182 + }, + { + "epoch": 1.94527980158115, + "grad_norm": 0.38272857666015625, + "learning_rate": 3.314077892476334e-06, + "loss": 0.3328, + "step": 4183 + }, + { + "epoch": 1.9457448457603472, + "grad_norm": 0.40746691823005676, + "learning_rate": 3.3115306660735564e-06, + "loss": 0.3558, + "step": 4184 + }, + { + "epoch": 1.9462098899395444, + "grad_norm": 0.33452197909355164, + "learning_rate": 3.308983934196018e-06, + "loss": 0.3091, + "step": 4185 + }, + { + "epoch": 1.9466749341187413, + "grad_norm": 0.4103114902973175, + "learning_rate": 3.3064376975896197e-06, + "loss": 0.3777, + "step": 4186 + }, + { + "epoch": 1.9471399782979382, + "grad_norm": 0.3644181489944458, + "learning_rate": 3.3038919570001086e-06, + "loss": 0.3238, + "step": 4187 + }, + { + "epoch": 1.9476050224771353, + "grad_norm": 0.42216435074806213, + "learning_rate": 3.301346713173096e-06, + "loss": 0.3551, + "step": 4188 + }, + { + "epoch": 1.9480700666563324, + "grad_norm": 0.39056453108787537, + "learning_rate": 3.2988019668540373e-06, + "loss": 0.3612, + "step": 4189 + }, + { + "epoch": 1.9485351108355293, + "grad_norm": 0.37854042649269104, + "learning_rate": 3.2962577187882517e-06, + "loss": 0.3339, + "step": 4190 + }, + { + "epoch": 1.9490001550147265, + "grad_norm": 0.34841084480285645, + "learning_rate": 3.2937139697209043e-06, + "loss": 0.305, + "step": 4191 + }, + { + "epoch": 1.9494651991939236, + "grad_norm": 0.4083377718925476, + "learning_rate": 3.2911707203970213e-06, + "loss": 0.3892, + "step": 4192 + }, + { + "epoch": 1.9499302433731205, + "grad_norm": 0.3734639585018158, + "learning_rate": 3.2886279715614754e-06, + "loss": 0.327, + "step": 4193 + }, + { + "epoch": 1.9503952875523174, + "grad_norm": 0.34090444445610046, + "learning_rate": 3.286085723959001e-06, + "loss": 0.3013, + "step": 4194 + }, + { + "epoch": 1.9508603317315145, + "grad_norm": 0.42237943410873413, + "learning_rate": 3.283543978334177e-06, + "loss": 0.3566, + "step": 4195 + }, + { + "epoch": 1.9513253759107116, + "grad_norm": 0.36327943205833435, + "learning_rate": 3.281002735431442e-06, + "loss": 0.3294, + "step": 4196 + }, + { + "epoch": 1.9517904200899086, + "grad_norm": 0.3818821310997009, + "learning_rate": 3.2784619959950832e-06, + "loss": 0.3715, + "step": 4197 + }, + { + "epoch": 1.9522554642691055, + "grad_norm": 0.3964533805847168, + "learning_rate": 3.2759217607692427e-06, + "loss": 0.3286, + "step": 4198 + }, + { + "epoch": 1.9527205084483026, + "grad_norm": 0.404161661863327, + "learning_rate": 3.2733820304979136e-06, + "loss": 0.3412, + "step": 4199 + }, + { + "epoch": 1.9531855526274997, + "grad_norm": 0.397344708442688, + "learning_rate": 3.2708428059249437e-06, + "loss": 0.3277, + "step": 4200 + }, + { + "epoch": 1.9536505968066966, + "grad_norm": 0.3819953501224518, + "learning_rate": 3.268304087794027e-06, + "loss": 0.325, + "step": 4201 + }, + { + "epoch": 1.9541156409858935, + "grad_norm": 0.40046122670173645, + "learning_rate": 3.2657658768487164e-06, + "loss": 0.3437, + "step": 4202 + }, + { + "epoch": 1.9545806851650906, + "grad_norm": 0.3873307704925537, + "learning_rate": 3.26322817383241e-06, + "loss": 0.3454, + "step": 4203 + }, + { + "epoch": 1.9550457293442878, + "grad_norm": 0.3931639790534973, + "learning_rate": 3.260690979488361e-06, + "loss": 0.3232, + "step": 4204 + }, + { + "epoch": 1.9555107735234847, + "grad_norm": 0.48349693417549133, + "learning_rate": 3.258154294559671e-06, + "loss": 0.4106, + "step": 4205 + }, + { + "epoch": 1.9559758177026818, + "grad_norm": 0.3327430486679077, + "learning_rate": 3.255618119789298e-06, + "loss": 0.305, + "step": 4206 + }, + { + "epoch": 1.956440861881879, + "grad_norm": 0.4101545512676239, + "learning_rate": 3.2530824559200415e-06, + "loss": 0.3245, + "step": 4207 + }, + { + "epoch": 1.9569059060610758, + "grad_norm": 0.4183330833911896, + "learning_rate": 3.2505473036945588e-06, + "loss": 0.3395, + "step": 4208 + }, + { + "epoch": 1.9573709502402727, + "grad_norm": 0.463460236787796, + "learning_rate": 3.2480126638553533e-06, + "loss": 0.3707, + "step": 4209 + }, + { + "epoch": 1.9578359944194699, + "grad_norm": 0.3990997076034546, + "learning_rate": 3.2454785371447817e-06, + "loss": 0.3211, + "step": 4210 + }, + { + "epoch": 1.958301038598667, + "grad_norm": 0.38769689202308655, + "learning_rate": 3.2429449243050464e-06, + "loss": 0.3219, + "step": 4211 + }, + { + "epoch": 1.958766082777864, + "grad_norm": 0.3971726596355438, + "learning_rate": 3.2404118260782047e-06, + "loss": 0.3451, + "step": 4212 + }, + { + "epoch": 1.9592311269570608, + "grad_norm": 0.3939190208911896, + "learning_rate": 3.2378792432061557e-06, + "loss": 0.3454, + "step": 4213 + }, + { + "epoch": 1.959696171136258, + "grad_norm": 0.3693125247955322, + "learning_rate": 3.2353471764306567e-06, + "loss": 0.3299, + "step": 4214 + }, + { + "epoch": 1.960161215315455, + "grad_norm": 0.42611974477767944, + "learning_rate": 3.2328156264933043e-06, + "loss": 0.3396, + "step": 4215 + }, + { + "epoch": 1.960626259494652, + "grad_norm": 0.4172080159187317, + "learning_rate": 3.23028459413555e-06, + "loss": 0.3579, + "step": 4216 + }, + { + "epoch": 1.9610913036738489, + "grad_norm": 0.4222067594528198, + "learning_rate": 3.227754080098694e-06, + "loss": 0.3634, + "step": 4217 + }, + { + "epoch": 1.961556347853046, + "grad_norm": 0.3900541365146637, + "learning_rate": 3.2252240851238786e-06, + "loss": 0.3096, + "step": 4218 + }, + { + "epoch": 1.9620213920322431, + "grad_norm": 0.3957304060459137, + "learning_rate": 3.2226946099521026e-06, + "loss": 0.3681, + "step": 4219 + }, + { + "epoch": 1.96248643621144, + "grad_norm": 0.44212228059768677, + "learning_rate": 3.2201656553242054e-06, + "loss": 0.3601, + "step": 4220 + }, + { + "epoch": 1.9629514803906372, + "grad_norm": 0.38398414850234985, + "learning_rate": 3.217637221980878e-06, + "loss": 0.308, + "step": 4221 + }, + { + "epoch": 1.9634165245698343, + "grad_norm": 0.4060637056827545, + "learning_rate": 3.215109310662656e-06, + "loss": 0.342, + "step": 4222 + }, + { + "epoch": 1.9638815687490312, + "grad_norm": 0.4322217106819153, + "learning_rate": 3.2125819221099265e-06, + "loss": 0.3304, + "step": 4223 + }, + { + "epoch": 1.964346612928228, + "grad_norm": 0.37583181262016296, + "learning_rate": 3.210055057062917e-06, + "loss": 0.2955, + "step": 4224 + }, + { + "epoch": 1.9648116571074252, + "grad_norm": 0.37987688183784485, + "learning_rate": 3.2075287162617084e-06, + "loss": 0.3506, + "step": 4225 + }, + { + "epoch": 1.9652767012866224, + "grad_norm": 0.42965471744537354, + "learning_rate": 3.2050029004462226e-06, + "loss": 0.3463, + "step": 4226 + }, + { + "epoch": 1.9657417454658193, + "grad_norm": 0.3723207712173462, + "learning_rate": 3.2024776103562304e-06, + "loss": 0.3088, + "step": 4227 + }, + { + "epoch": 1.9662067896450162, + "grad_norm": 0.3601841628551483, + "learning_rate": 3.199952846731349e-06, + "loss": 0.3396, + "step": 4228 + }, + { + "epoch": 1.9666718338242133, + "grad_norm": 0.45041221380233765, + "learning_rate": 3.197428610311042e-06, + "loss": 0.3399, + "step": 4229 + }, + { + "epoch": 1.9671368780034104, + "grad_norm": 0.3874083459377289, + "learning_rate": 3.194904901834613e-06, + "loss": 0.3283, + "step": 4230 + }, + { + "epoch": 1.9676019221826073, + "grad_norm": 0.3825784921646118, + "learning_rate": 3.19238172204122e-06, + "loss": 0.3291, + "step": 4231 + }, + { + "epoch": 1.9680669663618042, + "grad_norm": 0.3952145278453827, + "learning_rate": 3.1898590716698574e-06, + "loss": 0.3482, + "step": 4232 + }, + { + "epoch": 1.9685320105410014, + "grad_norm": 0.4280111789703369, + "learning_rate": 3.1873369514593712e-06, + "loss": 0.3488, + "step": 4233 + }, + { + "epoch": 1.9689970547201985, + "grad_norm": 0.44719386100769043, + "learning_rate": 3.184815362148448e-06, + "loss": 0.3692, + "step": 4234 + }, + { + "epoch": 1.9694620988993954, + "grad_norm": 0.4005783796310425, + "learning_rate": 3.1822943044756222e-06, + "loss": 0.3451, + "step": 4235 + }, + { + "epoch": 1.9699271430785925, + "grad_norm": 0.3623179495334625, + "learning_rate": 3.1797737791792672e-06, + "loss": 0.3514, + "step": 4236 + }, + { + "epoch": 1.9703921872577896, + "grad_norm": 0.4718107581138611, + "learning_rate": 3.177253786997609e-06, + "loss": 0.3607, + "step": 4237 + }, + { + "epoch": 1.9708572314369865, + "grad_norm": 0.44240835309028625, + "learning_rate": 3.1747343286687065e-06, + "loss": 0.3568, + "step": 4238 + }, + { + "epoch": 1.9713222756161835, + "grad_norm": 0.3883517384529114, + "learning_rate": 3.1722154049304728e-06, + "loss": 0.3407, + "step": 4239 + }, + { + "epoch": 1.9717873197953806, + "grad_norm": 0.4095773994922638, + "learning_rate": 3.1696970165206564e-06, + "loss": 0.3302, + "step": 4240 + }, + { + "epoch": 1.9722523639745777, + "grad_norm": 0.4610760509967804, + "learning_rate": 3.167179164176857e-06, + "loss": 0.3149, + "step": 4241 + }, + { + "epoch": 1.9727174081537746, + "grad_norm": 0.4434893727302551, + "learning_rate": 3.1646618486365068e-06, + "loss": 0.3795, + "step": 4242 + }, + { + "epoch": 1.9731824523329715, + "grad_norm": 0.3962785601615906, + "learning_rate": 3.1621450706368904e-06, + "loss": 0.3624, + "step": 4243 + }, + { + "epoch": 1.9736474965121686, + "grad_norm": 0.3942055404186249, + "learning_rate": 3.15962883091513e-06, + "loss": 0.3408, + "step": 4244 + }, + { + "epoch": 1.9741125406913658, + "grad_norm": 0.4068949520587921, + "learning_rate": 3.1571131302081916e-06, + "loss": 0.3692, + "step": 4245 + }, + { + "epoch": 1.9745775848705627, + "grad_norm": 0.46068617701530457, + "learning_rate": 3.154597969252883e-06, + "loss": 0.3489, + "step": 4246 + }, + { + "epoch": 1.9750426290497596, + "grad_norm": 0.47146105766296387, + "learning_rate": 3.1520833487858547e-06, + "loss": 0.3245, + "step": 4247 + }, + { + "epoch": 1.9755076732289567, + "grad_norm": 0.3786758482456207, + "learning_rate": 3.1495692695435966e-06, + "loss": 0.3297, + "step": 4248 + }, + { + "epoch": 1.9759727174081538, + "grad_norm": 0.4309166967868805, + "learning_rate": 3.147055732262444e-06, + "loss": 0.3595, + "step": 4249 + }, + { + "epoch": 1.9764377615873507, + "grad_norm": 0.4204605519771576, + "learning_rate": 3.1445427376785687e-06, + "loss": 0.3285, + "step": 4250 + }, + { + "epoch": 1.9769028057665479, + "grad_norm": 0.4231455624103546, + "learning_rate": 3.142030286527987e-06, + "loss": 0.3434, + "step": 4251 + }, + { + "epoch": 1.977367849945745, + "grad_norm": 0.39683791995048523, + "learning_rate": 3.1395183795465565e-06, + "loss": 0.3404, + "step": 4252 + }, + { + "epoch": 1.977832894124942, + "grad_norm": 0.43231719732284546, + "learning_rate": 3.137007017469971e-06, + "loss": 0.3534, + "step": 4253 + }, + { + "epoch": 1.9782979383041388, + "grad_norm": 0.45599350333213806, + "learning_rate": 3.1344962010337703e-06, + "loss": 0.3811, + "step": 4254 + }, + { + "epoch": 1.978762982483336, + "grad_norm": 0.4510321021080017, + "learning_rate": 3.131985930973329e-06, + "loss": 0.374, + "step": 4255 + }, + { + "epoch": 1.979228026662533, + "grad_norm": 0.44332581758499146, + "learning_rate": 3.1294762080238672e-06, + "loss": 0.3416, + "step": 4256 + }, + { + "epoch": 1.97969307084173, + "grad_norm": 0.34618374705314636, + "learning_rate": 3.12696703292044e-06, + "loss": 0.2877, + "step": 4257 + }, + { + "epoch": 1.9801581150209269, + "grad_norm": 0.455598384141922, + "learning_rate": 3.1244584063979467e-06, + "loss": 0.3718, + "step": 4258 + }, + { + "epoch": 1.980623159200124, + "grad_norm": 0.4085701107978821, + "learning_rate": 3.121950329191119e-06, + "loss": 0.3747, + "step": 4259 + }, + { + "epoch": 1.9810882033793211, + "grad_norm": 0.3253307044506073, + "learning_rate": 3.1194428020345375e-06, + "loss": 0.3125, + "step": 4260 + }, + { + "epoch": 1.981553247558518, + "grad_norm": 0.36619746685028076, + "learning_rate": 3.11693582566261e-06, + "loss": 0.343, + "step": 4261 + }, + { + "epoch": 1.982018291737715, + "grad_norm": 0.47953325510025024, + "learning_rate": 3.1144294008095942e-06, + "loss": 0.3779, + "step": 4262 + }, + { + "epoch": 1.982483335916912, + "grad_norm": 0.4037295877933502, + "learning_rate": 3.111923528209577e-06, + "loss": 0.3665, + "step": 4263 + }, + { + "epoch": 1.9829483800961092, + "grad_norm": 0.376261442899704, + "learning_rate": 3.1094182085964935e-06, + "loss": 0.3332, + "step": 4264 + }, + { + "epoch": 1.983413424275306, + "grad_norm": 0.4503709673881531, + "learning_rate": 3.1069134427041047e-06, + "loss": 0.3675, + "step": 4265 + }, + { + "epoch": 1.9838784684545032, + "grad_norm": 0.4101594388484955, + "learning_rate": 3.1044092312660213e-06, + "loss": 0.3787, + "step": 4266 + }, + { + "epoch": 1.9843435126337003, + "grad_norm": 0.45172303915023804, + "learning_rate": 3.101905575015682e-06, + "loss": 0.358, + "step": 4267 + }, + { + "epoch": 1.9848085568128973, + "grad_norm": 0.3164076805114746, + "learning_rate": 3.0994024746863692e-06, + "loss": 0.3028, + "step": 4268 + }, + { + "epoch": 1.9852736009920942, + "grad_norm": 0.3954414427280426, + "learning_rate": 3.0968999310111993e-06, + "loss": 0.3998, + "step": 4269 + }, + { + "epoch": 1.9857386451712913, + "grad_norm": 0.40084972977638245, + "learning_rate": 3.0943979447231287e-06, + "loss": 0.3176, + "step": 4270 + }, + { + "epoch": 1.9862036893504884, + "grad_norm": 0.4292057752609253, + "learning_rate": 3.091896516554945e-06, + "loss": 0.3524, + "step": 4271 + }, + { + "epoch": 1.9866687335296853, + "grad_norm": 0.4974363148212433, + "learning_rate": 3.0893956472392805e-06, + "loss": 0.3649, + "step": 4272 + }, + { + "epoch": 1.9871337777088822, + "grad_norm": 0.37316155433654785, + "learning_rate": 3.086895337508594e-06, + "loss": 0.3477, + "step": 4273 + }, + { + "epoch": 1.9875988218880793, + "grad_norm": 0.3752647638320923, + "learning_rate": 3.0843955880951906e-06, + "loss": 0.3585, + "step": 4274 + }, + { + "epoch": 1.9880638660672765, + "grad_norm": 0.3850919008255005, + "learning_rate": 3.081896399731202e-06, + "loss": 0.3594, + "step": 4275 + }, + { + "epoch": 1.9885289102464734, + "grad_norm": 0.40935975313186646, + "learning_rate": 3.0793977731486034e-06, + "loss": 0.3427, + "step": 4276 + }, + { + "epoch": 1.9889939544256703, + "grad_norm": 0.4816490411758423, + "learning_rate": 3.0768997090791995e-06, + "loss": 0.4007, + "step": 4277 + }, + { + "epoch": 1.9894589986048674, + "grad_norm": 0.3938618302345276, + "learning_rate": 3.0744022082546356e-06, + "loss": 0.3432, + "step": 4278 + }, + { + "epoch": 1.9899240427840645, + "grad_norm": 0.34527865052223206, + "learning_rate": 3.071905271406384e-06, + "loss": 0.3668, + "step": 4279 + }, + { + "epoch": 1.9903890869632614, + "grad_norm": 0.3559629023075104, + "learning_rate": 3.0694088992657617e-06, + "loss": 0.3488, + "step": 4280 + }, + { + "epoch": 1.9908541311424586, + "grad_norm": 0.4233744144439697, + "learning_rate": 3.066913092563913e-06, + "loss": 0.3593, + "step": 4281 + }, + { + "epoch": 1.9913191753216557, + "grad_norm": 0.36039772629737854, + "learning_rate": 3.064417852031822e-06, + "loss": 0.3275, + "step": 4282 + }, + { + "epoch": 1.9917842195008526, + "grad_norm": 0.41235482692718506, + "learning_rate": 3.0619231784003e-06, + "loss": 0.3885, + "step": 4283 + }, + { + "epoch": 1.9922492636800495, + "grad_norm": 0.3795316219329834, + "learning_rate": 3.059429072400001e-06, + "loss": 0.3176, + "step": 4284 + }, + { + "epoch": 1.9927143078592466, + "grad_norm": 0.4195452332496643, + "learning_rate": 3.0569355347614033e-06, + "loss": 0.3609, + "step": 4285 + }, + { + "epoch": 1.9931793520384438, + "grad_norm": 0.32910481095314026, + "learning_rate": 3.054442566214827e-06, + "loss": 0.3151, + "step": 4286 + }, + { + "epoch": 1.9936443962176407, + "grad_norm": 0.3915290832519531, + "learning_rate": 3.051950167490422e-06, + "loss": 0.3448, + "step": 4287 + }, + { + "epoch": 1.9941094403968376, + "grad_norm": 0.39752107858657837, + "learning_rate": 3.049458339318169e-06, + "loss": 0.3286, + "step": 4288 + }, + { + "epoch": 1.9945744845760347, + "grad_norm": 0.45507895946502686, + "learning_rate": 3.0469670824278863e-06, + "loss": 0.3607, + "step": 4289 + }, + { + "epoch": 1.9950395287552318, + "grad_norm": 0.3670455515384674, + "learning_rate": 3.044476397549221e-06, + "loss": 0.332, + "step": 4290 + }, + { + "epoch": 1.9955045729344287, + "grad_norm": 0.4452548623085022, + "learning_rate": 3.0419862854116554e-06, + "loss": 0.3882, + "step": 4291 + }, + { + "epoch": 1.9959696171136256, + "grad_norm": 0.3838987946510315, + "learning_rate": 3.0394967467445014e-06, + "loss": 0.3492, + "step": 4292 + }, + { + "epoch": 1.9964346612928228, + "grad_norm": 0.43697041273117065, + "learning_rate": 3.0370077822769073e-06, + "loss": 0.3688, + "step": 4293 + }, + { + "epoch": 1.99689970547202, + "grad_norm": 0.38790035247802734, + "learning_rate": 3.034519392737847e-06, + "loss": 0.2995, + "step": 4294 + }, + { + "epoch": 1.9973647496512168, + "grad_norm": 0.41673675179481506, + "learning_rate": 3.0320315788561334e-06, + "loss": 0.3068, + "step": 4295 + }, + { + "epoch": 1.997829793830414, + "grad_norm": 0.5042394995689392, + "learning_rate": 3.029544341360402e-06, + "loss": 0.3846, + "step": 4296 + }, + { + "epoch": 1.998294838009611, + "grad_norm": 0.3903370201587677, + "learning_rate": 3.0270576809791273e-06, + "loss": 0.3873, + "step": 4297 + }, + { + "epoch": 1.998759882188808, + "grad_norm": 0.4303033947944641, + "learning_rate": 3.02457159844061e-06, + "loss": 0.3656, + "step": 4298 + }, + { + "epoch": 1.9992249263680049, + "grad_norm": 0.44149455428123474, + "learning_rate": 3.022086094472986e-06, + "loss": 0.3233, + "step": 4299 + }, + { + "epoch": 1.999689970547202, + "grad_norm": 0.38690122961997986, + "learning_rate": 3.019601169804216e-06, + "loss": 0.3404, + "step": 4300 + }, + { + "epoch": 2.000155014726399, + "grad_norm": 0.7339352369308472, + "learning_rate": 3.0171168251620974e-06, + "loss": 0.5187, + "step": 4301 + }, + { + "epoch": 2.000620058905596, + "grad_norm": 0.42907798290252686, + "learning_rate": 3.01463306127425e-06, + "loss": 0.3366, + "step": 4302 + }, + { + "epoch": 2.001085103084793, + "grad_norm": 0.4182564616203308, + "learning_rate": 3.012149878868132e-06, + "loss": 0.2779, + "step": 4303 + }, + { + "epoch": 2.0015501472639903, + "grad_norm": 0.37000566720962524, + "learning_rate": 3.009667278671024e-06, + "loss": 0.3469, + "step": 4304 + }, + { + "epoch": 2.002015191443187, + "grad_norm": 0.36970627307891846, + "learning_rate": 3.0071852614100427e-06, + "loss": 0.2855, + "step": 4305 + }, + { + "epoch": 2.002480235622384, + "grad_norm": 0.39976659417152405, + "learning_rate": 3.004703827812128e-06, + "loss": 0.3605, + "step": 4306 + }, + { + "epoch": 2.002945279801581, + "grad_norm": 0.3608565032482147, + "learning_rate": 3.0022229786040526e-06, + "loss": 0.2864, + "step": 4307 + }, + { + "epoch": 2.0034103239807783, + "grad_norm": 0.37464091181755066, + "learning_rate": 2.999742714512415e-06, + "loss": 0.3287, + "step": 4308 + }, + { + "epoch": 2.0038753681599752, + "grad_norm": 0.37484025955200195, + "learning_rate": 2.997263036263647e-06, + "loss": 0.3256, + "step": 4309 + }, + { + "epoch": 2.004340412339172, + "grad_norm": 0.34249240159988403, + "learning_rate": 2.9947839445840045e-06, + "loss": 0.2974, + "step": 4310 + }, + { + "epoch": 2.004805456518369, + "grad_norm": 0.38859909772872925, + "learning_rate": 2.9923054401995745e-06, + "loss": 0.3534, + "step": 4311 + }, + { + "epoch": 2.0052705006975664, + "grad_norm": 0.3693307936191559, + "learning_rate": 2.9898275238362686e-06, + "loss": 0.3355, + "step": 4312 + }, + { + "epoch": 2.0057355448767633, + "grad_norm": 0.3429759442806244, + "learning_rate": 2.98735019621983e-06, + "loss": 0.2918, + "step": 4313 + }, + { + "epoch": 2.00620058905596, + "grad_norm": 0.3832457959651947, + "learning_rate": 2.984873458075827e-06, + "loss": 0.3288, + "step": 4314 + }, + { + "epoch": 2.006665633235157, + "grad_norm": 0.34353703260421753, + "learning_rate": 2.9823973101296564e-06, + "loss": 0.3088, + "step": 4315 + }, + { + "epoch": 2.0071306774143545, + "grad_norm": 0.37068885564804077, + "learning_rate": 2.9799217531065407e-06, + "loss": 0.3203, + "step": 4316 + }, + { + "epoch": 2.0075957215935514, + "grad_norm": 0.3725161850452423, + "learning_rate": 2.977446787731532e-06, + "loss": 0.3242, + "step": 4317 + }, + { + "epoch": 2.0080607657727483, + "grad_norm": 0.36433523893356323, + "learning_rate": 2.9749724147295054e-06, + "loss": 0.3368, + "step": 4318 + }, + { + "epoch": 2.0085258099519456, + "grad_norm": 0.4169025421142578, + "learning_rate": 2.972498634825168e-06, + "loss": 0.315, + "step": 4319 + }, + { + "epoch": 2.0089908541311425, + "grad_norm": 0.41936197876930237, + "learning_rate": 2.9700254487430448e-06, + "loss": 0.3359, + "step": 4320 + }, + { + "epoch": 2.0094558983103394, + "grad_norm": 0.3963291347026825, + "learning_rate": 2.9675528572074953e-06, + "loss": 0.3093, + "step": 4321 + }, + { + "epoch": 2.0099209424895363, + "grad_norm": 0.38462764024734497, + "learning_rate": 2.9650808609427e-06, + "loss": 0.3439, + "step": 4322 + }, + { + "epoch": 2.0103859866687337, + "grad_norm": 0.36186519265174866, + "learning_rate": 2.962609460672669e-06, + "loss": 0.3149, + "step": 4323 + }, + { + "epoch": 2.0108510308479306, + "grad_norm": 0.3426899015903473, + "learning_rate": 2.960138657121233e-06, + "loss": 0.2876, + "step": 4324 + }, + { + "epoch": 2.0113160750271275, + "grad_norm": 0.3653092682361603, + "learning_rate": 2.957668451012049e-06, + "loss": 0.347, + "step": 4325 + }, + { + "epoch": 2.0117811192063244, + "grad_norm": 0.3865642547607422, + "learning_rate": 2.955198843068603e-06, + "loss": 0.3204, + "step": 4326 + }, + { + "epoch": 2.0122461633855218, + "grad_norm": 0.4121185839176178, + "learning_rate": 2.9527298340142e-06, + "loss": 0.3238, + "step": 4327 + }, + { + "epoch": 2.0127112075647187, + "grad_norm": 0.32982850074768066, + "learning_rate": 2.950261424571977e-06, + "loss": 0.289, + "step": 4328 + }, + { + "epoch": 2.0131762517439156, + "grad_norm": 0.36676689982414246, + "learning_rate": 2.9477936154648866e-06, + "loss": 0.3467, + "step": 4329 + }, + { + "epoch": 2.0136412959231125, + "grad_norm": 0.368436336517334, + "learning_rate": 2.9453264074157134e-06, + "loss": 0.352, + "step": 4330 + }, + { + "epoch": 2.01410634010231, + "grad_norm": 0.4215182363986969, + "learning_rate": 2.9428598011470597e-06, + "loss": 0.3065, + "step": 4331 + }, + { + "epoch": 2.0145713842815067, + "grad_norm": 0.3354952037334442, + "learning_rate": 2.9403937973813564e-06, + "loss": 0.3313, + "step": 4332 + }, + { + "epoch": 2.0150364284607036, + "grad_norm": 0.3583640456199646, + "learning_rate": 2.9379283968408546e-06, + "loss": 0.2962, + "step": 4333 + }, + { + "epoch": 2.015501472639901, + "grad_norm": 0.33064424991607666, + "learning_rate": 2.9354636002476324e-06, + "loss": 0.3023, + "step": 4334 + }, + { + "epoch": 2.015966516819098, + "grad_norm": 0.33953142166137695, + "learning_rate": 2.9329994083235857e-06, + "loss": 0.3078, + "step": 4335 + }, + { + "epoch": 2.016431560998295, + "grad_norm": 0.3762052059173584, + "learning_rate": 2.930535821790439e-06, + "loss": 0.3592, + "step": 4336 + }, + { + "epoch": 2.0168966051774917, + "grad_norm": 0.35094988346099854, + "learning_rate": 2.928072841369734e-06, + "loss": 0.3177, + "step": 4337 + }, + { + "epoch": 2.017361649356689, + "grad_norm": 0.3473728597164154, + "learning_rate": 2.92561046778284e-06, + "loss": 0.35, + "step": 4338 + }, + { + "epoch": 2.017826693535886, + "grad_norm": 0.3321429491043091, + "learning_rate": 2.9231487017509442e-06, + "loss": 0.3325, + "step": 4339 + }, + { + "epoch": 2.018291737715083, + "grad_norm": 0.3240537941455841, + "learning_rate": 2.920687543995061e-06, + "loss": 0.2927, + "step": 4340 + }, + { + "epoch": 2.0187567818942798, + "grad_norm": 0.36483100056648254, + "learning_rate": 2.91822699523602e-06, + "loss": 0.3395, + "step": 4341 + }, + { + "epoch": 2.019221826073477, + "grad_norm": 0.3937772512435913, + "learning_rate": 2.915767056194479e-06, + "loss": 0.3633, + "step": 4342 + }, + { + "epoch": 2.019686870252674, + "grad_norm": 0.33440476655960083, + "learning_rate": 2.9133077275909112e-06, + "loss": 0.3051, + "step": 4343 + }, + { + "epoch": 2.020151914431871, + "grad_norm": 0.3541061282157898, + "learning_rate": 2.910849010145617e-06, + "loss": 0.3395, + "step": 4344 + }, + { + "epoch": 2.0206169586110683, + "grad_norm": 0.34767183661460876, + "learning_rate": 2.9083909045787116e-06, + "loss": 0.3267, + "step": 4345 + }, + { + "epoch": 2.021082002790265, + "grad_norm": 0.3523595333099365, + "learning_rate": 2.905933411610136e-06, + "loss": 0.3337, + "step": 4346 + }, + { + "epoch": 2.021547046969462, + "grad_norm": 0.35873571038246155, + "learning_rate": 2.9034765319596497e-06, + "loss": 0.3286, + "step": 4347 + }, + { + "epoch": 2.022012091148659, + "grad_norm": 0.38059335947036743, + "learning_rate": 2.9010202663468353e-06, + "loss": 0.3044, + "step": 4348 + }, + { + "epoch": 2.0224771353278563, + "grad_norm": 0.3723052144050598, + "learning_rate": 2.8985646154910887e-06, + "loss": 0.3459, + "step": 4349 + }, + { + "epoch": 2.0229421795070532, + "grad_norm": 0.3536323308944702, + "learning_rate": 2.896109580111634e-06, + "loss": 0.3042, + "step": 4350 + }, + { + "epoch": 2.02340722368625, + "grad_norm": 0.3543092608451843, + "learning_rate": 2.8936551609275078e-06, + "loss": 0.3361, + "step": 4351 + }, + { + "epoch": 2.023872267865447, + "grad_norm": 0.3618990182876587, + "learning_rate": 2.8912013586575733e-06, + "loss": 0.3393, + "step": 4352 + }, + { + "epoch": 2.0243373120446444, + "grad_norm": 0.35705238580703735, + "learning_rate": 2.8887481740205046e-06, + "loss": 0.3271, + "step": 4353 + }, + { + "epoch": 2.0248023562238413, + "grad_norm": 0.36953264474868774, + "learning_rate": 2.8862956077348054e-06, + "loss": 0.2905, + "step": 4354 + }, + { + "epoch": 2.025267400403038, + "grad_norm": 0.4037397801876068, + "learning_rate": 2.883843660518787e-06, + "loss": 0.3424, + "step": 4355 + }, + { + "epoch": 2.025732444582235, + "grad_norm": 0.3392297327518463, + "learning_rate": 2.881392333090589e-06, + "loss": 0.3267, + "step": 4356 + }, + { + "epoch": 2.0261974887614325, + "grad_norm": 0.36336854100227356, + "learning_rate": 2.8789416261681624e-06, + "loss": 0.3284, + "step": 4357 + }, + { + "epoch": 2.0266625329406294, + "grad_norm": 0.35528960824012756, + "learning_rate": 2.8764915404692805e-06, + "loss": 0.3275, + "step": 4358 + }, + { + "epoch": 2.0271275771198263, + "grad_norm": 0.36412903666496277, + "learning_rate": 2.874042076711536e-06, + "loss": 0.3135, + "step": 4359 + }, + { + "epoch": 2.0275926212990236, + "grad_norm": 0.3423042595386505, + "learning_rate": 2.871593235612333e-06, + "loss": 0.2972, + "step": 4360 + }, + { + "epoch": 2.0280576654782205, + "grad_norm": 0.36381906270980835, + "learning_rate": 2.8691450178889013e-06, + "loss": 0.3177, + "step": 4361 + }, + { + "epoch": 2.0285227096574174, + "grad_norm": 0.36212357878685, + "learning_rate": 2.8666974242582794e-06, + "loss": 0.337, + "step": 4362 + }, + { + "epoch": 2.0289877538366143, + "grad_norm": 0.38655251264572144, + "learning_rate": 2.864250455437333e-06, + "loss": 0.3042, + "step": 4363 + }, + { + "epoch": 2.0294527980158117, + "grad_norm": 0.32187148928642273, + "learning_rate": 2.8618041121427347e-06, + "loss": 0.3092, + "step": 4364 + }, + { + "epoch": 2.0299178421950086, + "grad_norm": 0.3990195691585541, + "learning_rate": 2.8593583950909833e-06, + "loss": 0.3392, + "step": 4365 + }, + { + "epoch": 2.0303828863742055, + "grad_norm": 0.35381731390953064, + "learning_rate": 2.8569133049983843e-06, + "loss": 0.325, + "step": 4366 + }, + { + "epoch": 2.0308479305534024, + "grad_norm": 0.3144361674785614, + "learning_rate": 2.8544688425810707e-06, + "loss": 0.2756, + "step": 4367 + }, + { + "epoch": 2.0313129747325998, + "grad_norm": 0.4486398994922638, + "learning_rate": 2.8520250085549807e-06, + "loss": 0.337, + "step": 4368 + }, + { + "epoch": 2.0317780189117967, + "grad_norm": 0.38106465339660645, + "learning_rate": 2.8495818036358756e-06, + "loss": 0.3487, + "step": 4369 + }, + { + "epoch": 2.0322430630909936, + "grad_norm": 0.34248411655426025, + "learning_rate": 2.8471392285393307e-06, + "loss": 0.3025, + "step": 4370 + }, + { + "epoch": 2.0327081072701905, + "grad_norm": 0.3708108067512512, + "learning_rate": 2.8446972839807384e-06, + "loss": 0.3159, + "step": 4371 + }, + { + "epoch": 2.033173151449388, + "grad_norm": 0.3808947503566742, + "learning_rate": 2.8422559706753004e-06, + "loss": 0.2909, + "step": 4372 + }, + { + "epoch": 2.0336381956285847, + "grad_norm": 0.36368271708488464, + "learning_rate": 2.8398152893380426e-06, + "loss": 0.3181, + "step": 4373 + }, + { + "epoch": 2.0341032398077816, + "grad_norm": 0.38620734214782715, + "learning_rate": 2.8373752406837963e-06, + "loss": 0.3181, + "step": 4374 + }, + { + "epoch": 2.034568283986979, + "grad_norm": 0.3720152974128723, + "learning_rate": 2.834935825427216e-06, + "loss": 0.335, + "step": 4375 + }, + { + "epoch": 2.035033328166176, + "grad_norm": 0.36220601201057434, + "learning_rate": 2.8324970442827627e-06, + "loss": 0.3028, + "step": 4376 + }, + { + "epoch": 2.035498372345373, + "grad_norm": 0.36199939250946045, + "learning_rate": 2.8300588979647202e-06, + "loss": 0.3198, + "step": 4377 + }, + { + "epoch": 2.0359634165245697, + "grad_norm": 0.3544287085533142, + "learning_rate": 2.827621387187178e-06, + "loss": 0.3024, + "step": 4378 + }, + { + "epoch": 2.036428460703767, + "grad_norm": 0.37387457489967346, + "learning_rate": 2.825184512664048e-06, + "loss": 0.3213, + "step": 4379 + }, + { + "epoch": 2.036893504882964, + "grad_norm": 0.37123391032218933, + "learning_rate": 2.8227482751090445e-06, + "loss": 0.2892, + "step": 4380 + }, + { + "epoch": 2.037358549062161, + "grad_norm": 0.34186607599258423, + "learning_rate": 2.8203126752357067e-06, + "loss": 0.2994, + "step": 4381 + }, + { + "epoch": 2.0378235932413578, + "grad_norm": 0.34526383876800537, + "learning_rate": 2.8178777137573814e-06, + "loss": 0.3343, + "step": 4382 + }, + { + "epoch": 2.038288637420555, + "grad_norm": 0.3409002721309662, + "learning_rate": 2.8154433913872314e-06, + "loss": 0.3204, + "step": 4383 + }, + { + "epoch": 2.038753681599752, + "grad_norm": 0.3608821630477905, + "learning_rate": 2.8130097088382256e-06, + "loss": 0.3083, + "step": 4384 + }, + { + "epoch": 2.039218725778949, + "grad_norm": 0.39243847131729126, + "learning_rate": 2.8105766668231548e-06, + "loss": 0.2993, + "step": 4385 + }, + { + "epoch": 2.039683769958146, + "grad_norm": 0.34924769401550293, + "learning_rate": 2.8081442660546126e-06, + "loss": 0.3602, + "step": 4386 + }, + { + "epoch": 2.040148814137343, + "grad_norm": 0.3145473003387451, + "learning_rate": 2.8057125072450143e-06, + "loss": 0.3078, + "step": 4387 + }, + { + "epoch": 2.04061385831654, + "grad_norm": 0.35963907837867737, + "learning_rate": 2.8032813911065795e-06, + "loss": 0.3269, + "step": 4388 + }, + { + "epoch": 2.041078902495737, + "grad_norm": 0.34630414843559265, + "learning_rate": 2.8008509183513444e-06, + "loss": 0.2996, + "step": 4389 + }, + { + "epoch": 2.0415439466749343, + "grad_norm": 0.32224032282829285, + "learning_rate": 2.7984210896911525e-06, + "loss": 0.3007, + "step": 4390 + }, + { + "epoch": 2.0420089908541312, + "grad_norm": 0.36333921551704407, + "learning_rate": 2.795991905837665e-06, + "loss": 0.3216, + "step": 4391 + }, + { + "epoch": 2.042474035033328, + "grad_norm": 0.3574282228946686, + "learning_rate": 2.793563367502346e-06, + "loss": 0.282, + "step": 4392 + }, + { + "epoch": 2.042939079212525, + "grad_norm": 0.3617522418498993, + "learning_rate": 2.791135475396477e-06, + "loss": 0.3471, + "step": 4393 + }, + { + "epoch": 2.0434041233917224, + "grad_norm": 0.3639947474002838, + "learning_rate": 2.7887082302311486e-06, + "loss": 0.3791, + "step": 4394 + }, + { + "epoch": 2.0438691675709193, + "grad_norm": 0.3631550967693329, + "learning_rate": 2.786281632717264e-06, + "loss": 0.292, + "step": 4395 + }, + { + "epoch": 2.044334211750116, + "grad_norm": 0.3522656261920929, + "learning_rate": 2.7838556835655304e-06, + "loss": 0.3327, + "step": 4396 + }, + { + "epoch": 2.044799255929313, + "grad_norm": 0.3434469699859619, + "learning_rate": 2.781430383486468e-06, + "loss": 0.3274, + "step": 4397 + }, + { + "epoch": 2.0452643001085105, + "grad_norm": 0.36457687616348267, + "learning_rate": 2.779005733190412e-06, + "loss": 0.3083, + "step": 4398 + }, + { + "epoch": 2.0457293442877074, + "grad_norm": 0.3624141812324524, + "learning_rate": 2.7765817333874984e-06, + "loss": 0.2975, + "step": 4399 + }, + { + "epoch": 2.0461943884669043, + "grad_norm": 0.3684295415878296, + "learning_rate": 2.7741583847876816e-06, + "loss": 0.3205, + "step": 4400 + }, + { + "epoch": 2.046659432646101, + "grad_norm": 0.3599317967891693, + "learning_rate": 2.7717356881007185e-06, + "loss": 0.3371, + "step": 4401 + }, + { + "epoch": 2.0471244768252985, + "grad_norm": 0.3688468039035797, + "learning_rate": 2.769313644036179e-06, + "loss": 0.3187, + "step": 4402 + }, + { + "epoch": 2.0475895210044954, + "grad_norm": 0.43112462759017944, + "learning_rate": 2.766892253303438e-06, + "loss": 0.3259, + "step": 4403 + }, + { + "epoch": 2.0480545651836923, + "grad_norm": 0.38703638315200806, + "learning_rate": 2.7644715166116835e-06, + "loss": 0.2869, + "step": 4404 + }, + { + "epoch": 2.0485196093628897, + "grad_norm": 0.39289000630378723, + "learning_rate": 2.7620514346699103e-06, + "loss": 0.3227, + "step": 4405 + }, + { + "epoch": 2.0489846535420866, + "grad_norm": 0.403656542301178, + "learning_rate": 2.7596320081869214e-06, + "loss": 0.3242, + "step": 4406 + }, + { + "epoch": 2.0494496977212835, + "grad_norm": 0.42674005031585693, + "learning_rate": 2.7572132378713255e-06, + "loss": 0.3151, + "step": 4407 + }, + { + "epoch": 2.0499147419004804, + "grad_norm": 0.35708075761795044, + "learning_rate": 2.754795124431544e-06, + "loss": 0.3025, + "step": 4408 + }, + { + "epoch": 2.0503797860796777, + "grad_norm": 0.41090068221092224, + "learning_rate": 2.752377668575799e-06, + "loss": 0.3488, + "step": 4409 + }, + { + "epoch": 2.0508448302588747, + "grad_norm": 0.3659050166606903, + "learning_rate": 2.749960871012129e-06, + "loss": 0.3236, + "step": 4410 + }, + { + "epoch": 2.0513098744380716, + "grad_norm": 0.4140544533729553, + "learning_rate": 2.7475447324483697e-06, + "loss": 0.317, + "step": 4411 + }, + { + "epoch": 2.0517749186172685, + "grad_norm": 0.3611926734447479, + "learning_rate": 2.7451292535921738e-06, + "loss": 0.2912, + "step": 4412 + }, + { + "epoch": 2.052239962796466, + "grad_norm": 0.3759238123893738, + "learning_rate": 2.7427144351509904e-06, + "loss": 0.3198, + "step": 4413 + }, + { + "epoch": 2.0527050069756627, + "grad_norm": 0.37930646538734436, + "learning_rate": 2.7403002778320865e-06, + "loss": 0.3247, + "step": 4414 + }, + { + "epoch": 2.0531700511548596, + "grad_norm": 0.3774726390838623, + "learning_rate": 2.737886782342524e-06, + "loss": 0.3085, + "step": 4415 + }, + { + "epoch": 2.0536350953340565, + "grad_norm": 0.4020675718784332, + "learning_rate": 2.735473949389179e-06, + "loss": 0.2979, + "step": 4416 + }, + { + "epoch": 2.054100139513254, + "grad_norm": 0.4103391766548157, + "learning_rate": 2.733061779678732e-06, + "loss": 0.3147, + "step": 4417 + }, + { + "epoch": 2.054565183692451, + "grad_norm": 0.41063860058784485, + "learning_rate": 2.7306502739176686e-06, + "loss": 0.3333, + "step": 4418 + }, + { + "epoch": 2.0550302278716477, + "grad_norm": 0.36508336663246155, + "learning_rate": 2.728239432812277e-06, + "loss": 0.3225, + "step": 4419 + }, + { + "epoch": 2.055495272050845, + "grad_norm": 0.34332776069641113, + "learning_rate": 2.7258292570686566e-06, + "loss": 0.2756, + "step": 4420 + }, + { + "epoch": 2.055960316230042, + "grad_norm": 0.400511771440506, + "learning_rate": 2.7234197473927054e-06, + "loss": 0.3704, + "step": 4421 + }, + { + "epoch": 2.056425360409239, + "grad_norm": 0.32971513271331787, + "learning_rate": 2.7210109044901335e-06, + "loss": 0.2891, + "step": 4422 + }, + { + "epoch": 2.0568904045884358, + "grad_norm": 0.357746422290802, + "learning_rate": 2.7186027290664474e-06, + "loss": 0.3439, + "step": 4423 + }, + { + "epoch": 2.057355448767633, + "grad_norm": 0.355873703956604, + "learning_rate": 2.716195221826967e-06, + "loss": 0.329, + "step": 4424 + }, + { + "epoch": 2.05782049294683, + "grad_norm": 0.3346231281757355, + "learning_rate": 2.7137883834768076e-06, + "loss": 0.3268, + "step": 4425 + }, + { + "epoch": 2.058285537126027, + "grad_norm": 0.35958102345466614, + "learning_rate": 2.711382214720898e-06, + "loss": 0.3305, + "step": 4426 + }, + { + "epoch": 2.058750581305224, + "grad_norm": 0.3547190725803375, + "learning_rate": 2.708976716263961e-06, + "loss": 0.3142, + "step": 4427 + }, + { + "epoch": 2.059215625484421, + "grad_norm": 0.3430922329425812, + "learning_rate": 2.7065718888105298e-06, + "loss": 0.3278, + "step": 4428 + }, + { + "epoch": 2.059680669663618, + "grad_norm": 0.36960989236831665, + "learning_rate": 2.7041677330649408e-06, + "loss": 0.3641, + "step": 4429 + }, + { + "epoch": 2.060145713842815, + "grad_norm": 0.3565279543399811, + "learning_rate": 2.7017642497313324e-06, + "loss": 0.3027, + "step": 4430 + }, + { + "epoch": 2.060610758022012, + "grad_norm": 0.3367528021335602, + "learning_rate": 2.6993614395136454e-06, + "loss": 0.3133, + "step": 4431 + }, + { + "epoch": 2.0610758022012092, + "grad_norm": 0.35852697491645813, + "learning_rate": 2.6969593031156205e-06, + "loss": 0.342, + "step": 4432 + }, + { + "epoch": 2.061540846380406, + "grad_norm": 0.34888017177581787, + "learning_rate": 2.694557841240809e-06, + "loss": 0.3282, + "step": 4433 + }, + { + "epoch": 2.062005890559603, + "grad_norm": 0.35499727725982666, + "learning_rate": 2.692157054592557e-06, + "loss": 0.2948, + "step": 4434 + }, + { + "epoch": 2.0624709347388004, + "grad_norm": 0.33716249465942383, + "learning_rate": 2.689756943874019e-06, + "loss": 0.2994, + "step": 4435 + }, + { + "epoch": 2.0629359789179973, + "grad_norm": 0.33978933095932007, + "learning_rate": 2.687357509788143e-06, + "loss": 0.3069, + "step": 4436 + }, + { + "epoch": 2.063401023097194, + "grad_norm": 0.3515429198741913, + "learning_rate": 2.684958753037691e-06, + "loss": 0.3488, + "step": 4437 + }, + { + "epoch": 2.063866067276391, + "grad_norm": 0.3632570207118988, + "learning_rate": 2.682560674325215e-06, + "loss": 0.326, + "step": 4438 + }, + { + "epoch": 2.0643311114555885, + "grad_norm": 0.36300498247146606, + "learning_rate": 2.680163274353075e-06, + "loss": 0.2862, + "step": 4439 + }, + { + "epoch": 2.0647961556347854, + "grad_norm": 0.3675801753997803, + "learning_rate": 2.6777665538234292e-06, + "loss": 0.3299, + "step": 4440 + }, + { + "epoch": 2.0652611998139823, + "grad_norm": 0.3985215127468109, + "learning_rate": 2.6753705134382425e-06, + "loss": 0.3205, + "step": 4441 + }, + { + "epoch": 2.065726243993179, + "grad_norm": 0.35572221875190735, + "learning_rate": 2.6729751538992704e-06, + "loss": 0.3302, + "step": 4442 + }, + { + "epoch": 2.0661912881723765, + "grad_norm": 0.35556313395500183, + "learning_rate": 2.67058047590808e-06, + "loss": 0.2882, + "step": 4443 + }, + { + "epoch": 2.0666563323515734, + "grad_norm": 0.4020705819129944, + "learning_rate": 2.6681864801660284e-06, + "loss": 0.3273, + "step": 4444 + }, + { + "epoch": 2.0671213765307703, + "grad_norm": 0.40115731954574585, + "learning_rate": 2.6657931673742834e-06, + "loss": 0.3591, + "step": 4445 + }, + { + "epoch": 2.0675864207099672, + "grad_norm": 0.320512592792511, + "learning_rate": 2.6634005382338025e-06, + "loss": 0.2962, + "step": 4446 + }, + { + "epoch": 2.0680514648891646, + "grad_norm": 0.35881075263023376, + "learning_rate": 2.6610085934453523e-06, + "loss": 0.3679, + "step": 4447 + }, + { + "epoch": 2.0685165090683615, + "grad_norm": 0.35448718070983887, + "learning_rate": 2.6586173337094904e-06, + "loss": 0.2736, + "step": 4448 + }, + { + "epoch": 2.0689815532475584, + "grad_norm": 0.378061980009079, + "learning_rate": 2.656226759726582e-06, + "loss": 0.3526, + "step": 4449 + }, + { + "epoch": 2.0694465974267557, + "grad_norm": 0.3106938600540161, + "learning_rate": 2.6538368721967838e-06, + "loss": 0.2833, + "step": 4450 + }, + { + "epoch": 2.0699116416059526, + "grad_norm": 0.3516807556152344, + "learning_rate": 2.6514476718200566e-06, + "loss": 0.3392, + "step": 4451 + }, + { + "epoch": 2.0703766857851496, + "grad_norm": 0.3462466299533844, + "learning_rate": 2.649059159296158e-06, + "loss": 0.3042, + "step": 4452 + }, + { + "epoch": 2.0708417299643465, + "grad_norm": 0.36509615182876587, + "learning_rate": 2.646671335324647e-06, + "loss": 0.3551, + "step": 4453 + }, + { + "epoch": 2.071306774143544, + "grad_norm": 0.347945898771286, + "learning_rate": 2.644284200604874e-06, + "loss": 0.3431, + "step": 4454 + }, + { + "epoch": 2.0717718183227407, + "grad_norm": 0.3239026963710785, + "learning_rate": 2.641897755835997e-06, + "loss": 0.2844, + "step": 4455 + }, + { + "epoch": 2.0722368625019376, + "grad_norm": 0.35373586416244507, + "learning_rate": 2.6395120017169627e-06, + "loss": 0.3261, + "step": 4456 + }, + { + "epoch": 2.0727019066811345, + "grad_norm": 0.3564927875995636, + "learning_rate": 2.6371269389465227e-06, + "loss": 0.3213, + "step": 4457 + }, + { + "epoch": 2.073166950860332, + "grad_norm": 0.3221034109592438, + "learning_rate": 2.6347425682232196e-06, + "loss": 0.2954, + "step": 4458 + }, + { + "epoch": 2.0736319950395288, + "grad_norm": 0.36184707283973694, + "learning_rate": 2.6323588902454013e-06, + "loss": 0.3331, + "step": 4459 + }, + { + "epoch": 2.0740970392187257, + "grad_norm": 0.3728765845298767, + "learning_rate": 2.629975905711204e-06, + "loss": 0.3231, + "step": 4460 + }, + { + "epoch": 2.0745620833979226, + "grad_norm": 0.35533544421195984, + "learning_rate": 2.6275936153185694e-06, + "loss": 0.3132, + "step": 4461 + }, + { + "epoch": 2.07502712757712, + "grad_norm": 0.337354451417923, + "learning_rate": 2.6252120197652277e-06, + "loss": 0.2933, + "step": 4462 + }, + { + "epoch": 2.075492171756317, + "grad_norm": 0.33192774653434753, + "learning_rate": 2.622831119748711e-06, + "loss": 0.3354, + "step": 4463 + }, + { + "epoch": 2.0759572159355137, + "grad_norm": 0.36651748418807983, + "learning_rate": 2.620450915966346e-06, + "loss": 0.3294, + "step": 4464 + }, + { + "epoch": 2.076422260114711, + "grad_norm": 0.3576674461364746, + "learning_rate": 2.618071409115259e-06, + "loss": 0.3174, + "step": 4465 + }, + { + "epoch": 2.076887304293908, + "grad_norm": 0.33194971084594727, + "learning_rate": 2.615692599892364e-06, + "loss": 0.3175, + "step": 4466 + }, + { + "epoch": 2.077352348473105, + "grad_norm": 0.40041840076446533, + "learning_rate": 2.6133144889943808e-06, + "loss": 0.317, + "step": 4467 + }, + { + "epoch": 2.077817392652302, + "grad_norm": 0.39717933535575867, + "learning_rate": 2.6109370771178155e-06, + "loss": 0.3476, + "step": 4468 + }, + { + "epoch": 2.078282436831499, + "grad_norm": 0.35431498289108276, + "learning_rate": 2.6085603649589723e-06, + "loss": 0.3019, + "step": 4469 + }, + { + "epoch": 2.078747481010696, + "grad_norm": 0.3609691560268402, + "learning_rate": 2.6061843532139563e-06, + "loss": 0.3173, + "step": 4470 + }, + { + "epoch": 2.079212525189893, + "grad_norm": 0.37636587023735046, + "learning_rate": 2.6038090425786577e-06, + "loss": 0.3253, + "step": 4471 + }, + { + "epoch": 2.07967756936909, + "grad_norm": 0.34753209352493286, + "learning_rate": 2.601434433748771e-06, + "loss": 0.2982, + "step": 4472 + }, + { + "epoch": 2.0801426135482872, + "grad_norm": 0.3801688849925995, + "learning_rate": 2.5990605274197763e-06, + "loss": 0.336, + "step": 4473 + }, + { + "epoch": 2.080607657727484, + "grad_norm": 0.37925639748573303, + "learning_rate": 2.596687324286954e-06, + "loss": 0.3277, + "step": 4474 + }, + { + "epoch": 2.081072701906681, + "grad_norm": 0.3775731027126312, + "learning_rate": 2.5943148250453774e-06, + "loss": 0.3043, + "step": 4475 + }, + { + "epoch": 2.0815377460858784, + "grad_norm": 0.37141674757003784, + "learning_rate": 2.5919430303899144e-06, + "loss": 0.3141, + "step": 4476 + }, + { + "epoch": 2.0820027902650753, + "grad_norm": 0.38826245069503784, + "learning_rate": 2.589571941015222e-06, + "loss": 0.312, + "step": 4477 + }, + { + "epoch": 2.082467834444272, + "grad_norm": 0.3552106022834778, + "learning_rate": 2.587201557615756e-06, + "loss": 0.2905, + "step": 4478 + }, + { + "epoch": 2.082932878623469, + "grad_norm": 0.40252459049224854, + "learning_rate": 2.584831880885761e-06, + "loss": 0.3681, + "step": 4479 + }, + { + "epoch": 2.0833979228026664, + "grad_norm": 0.35997259616851807, + "learning_rate": 2.58246291151928e-06, + "loss": 0.3051, + "step": 4480 + }, + { + "epoch": 2.0838629669818634, + "grad_norm": 0.3523218333721161, + "learning_rate": 2.580094650210142e-06, + "loss": 0.2918, + "step": 4481 + }, + { + "epoch": 2.0843280111610603, + "grad_norm": 0.4533151090145111, + "learning_rate": 2.577727097651976e-06, + "loss": 0.3553, + "step": 4482 + }, + { + "epoch": 2.084793055340257, + "grad_norm": 0.35789787769317627, + "learning_rate": 2.575360254538195e-06, + "loss": 0.2959, + "step": 4483 + }, + { + "epoch": 2.0852580995194545, + "grad_norm": 0.39699408411979675, + "learning_rate": 2.5729941215620148e-06, + "loss": 0.3356, + "step": 4484 + }, + { + "epoch": 2.0857231436986514, + "grad_norm": 0.3448418974876404, + "learning_rate": 2.5706286994164315e-06, + "loss": 0.2892, + "step": 4485 + }, + { + "epoch": 2.0861881878778483, + "grad_norm": 0.3884351849555969, + "learning_rate": 2.568263988794242e-06, + "loss": 0.3183, + "step": 4486 + }, + { + "epoch": 2.0866532320570452, + "grad_norm": 0.4005531072616577, + "learning_rate": 2.56589999038803e-06, + "loss": 0.3394, + "step": 4487 + }, + { + "epoch": 2.0871182762362426, + "grad_norm": 0.3274531960487366, + "learning_rate": 2.563536704890176e-06, + "loss": 0.2915, + "step": 4488 + }, + { + "epoch": 2.0875833204154395, + "grad_norm": 0.418525367975235, + "learning_rate": 2.5611741329928436e-06, + "loss": 0.347, + "step": 4489 + }, + { + "epoch": 2.0880483645946364, + "grad_norm": 0.3738149106502533, + "learning_rate": 2.558812275387995e-06, + "loss": 0.3231, + "step": 4490 + }, + { + "epoch": 2.0885134087738333, + "grad_norm": 0.3588385581970215, + "learning_rate": 2.556451132767377e-06, + "loss": 0.3281, + "step": 4491 + }, + { + "epoch": 2.0889784529530306, + "grad_norm": 0.3635399639606476, + "learning_rate": 2.554090705822533e-06, + "loss": 0.3367, + "step": 4492 + }, + { + "epoch": 2.0894434971322275, + "grad_norm": 0.342986136674881, + "learning_rate": 2.5517309952447887e-06, + "loss": 0.3051, + "step": 4493 + }, + { + "epoch": 2.0899085413114245, + "grad_norm": 0.3733537495136261, + "learning_rate": 2.549372001725272e-06, + "loss": 0.3154, + "step": 4494 + }, + { + "epoch": 2.090373585490622, + "grad_norm": 0.404184490442276, + "learning_rate": 2.547013725954887e-06, + "loss": 0.3149, + "step": 4495 + }, + { + "epoch": 2.0908386296698187, + "grad_norm": 0.39761337637901306, + "learning_rate": 2.5446561686243397e-06, + "loss": 0.3353, + "step": 4496 + }, + { + "epoch": 2.0913036738490156, + "grad_norm": 0.36512845754623413, + "learning_rate": 2.5422993304241163e-06, + "loss": 0.3217, + "step": 4497 + }, + { + "epoch": 2.0917687180282125, + "grad_norm": 0.35249972343444824, + "learning_rate": 2.5399432120444985e-06, + "loss": 0.2946, + "step": 4498 + }, + { + "epoch": 2.09223376220741, + "grad_norm": 0.3695544898509979, + "learning_rate": 2.537587814175554e-06, + "loss": 0.3332, + "step": 4499 + }, + { + "epoch": 2.0926988063866068, + "grad_norm": 0.3467404246330261, + "learning_rate": 2.5352331375071437e-06, + "loss": 0.3147, + "step": 4500 + }, + { + "epoch": 2.0931638505658037, + "grad_norm": 0.3637906610965729, + "learning_rate": 2.53287918272891e-06, + "loss": 0.3054, + "step": 4501 + }, + { + "epoch": 2.0936288947450006, + "grad_norm": 0.3547385036945343, + "learning_rate": 2.5305259505302914e-06, + "loss": 0.3231, + "step": 4502 + }, + { + "epoch": 2.094093938924198, + "grad_norm": 0.36574193835258484, + "learning_rate": 2.5281734416005107e-06, + "loss": 0.3125, + "step": 4503 + }, + { + "epoch": 2.094558983103395, + "grad_norm": 0.3319716155529022, + "learning_rate": 2.5258216566285758e-06, + "loss": 0.3083, + "step": 4504 + }, + { + "epoch": 2.0950240272825917, + "grad_norm": 0.3524357080459595, + "learning_rate": 2.5234705963032917e-06, + "loss": 0.3457, + "step": 4505 + }, + { + "epoch": 2.095489071461789, + "grad_norm": 0.34479501843452454, + "learning_rate": 2.5211202613132413e-06, + "loss": 0.3415, + "step": 4506 + }, + { + "epoch": 2.095954115640986, + "grad_norm": 0.3542589247226715, + "learning_rate": 2.5187706523468034e-06, + "loss": 0.3051, + "step": 4507 + }, + { + "epoch": 2.096419159820183, + "grad_norm": 0.3570210337638855, + "learning_rate": 2.516421770092136e-06, + "loss": 0.3496, + "step": 4508 + }, + { + "epoch": 2.09688420399938, + "grad_norm": 0.3366619050502777, + "learning_rate": 2.5140736152371916e-06, + "loss": 0.3166, + "step": 4509 + }, + { + "epoch": 2.097349248178577, + "grad_norm": 0.3258604407310486, + "learning_rate": 2.5117261884697066e-06, + "loss": 0.2912, + "step": 4510 + }, + { + "epoch": 2.097814292357774, + "grad_norm": 0.3651737868785858, + "learning_rate": 2.509379490477204e-06, + "loss": 0.3397, + "step": 4511 + }, + { + "epoch": 2.098279336536971, + "grad_norm": 0.335183322429657, + "learning_rate": 2.507033521946992e-06, + "loss": 0.2965, + "step": 4512 + }, + { + "epoch": 2.098744380716168, + "grad_norm": 0.37854668498039246, + "learning_rate": 2.5046882835661694e-06, + "loss": 0.3288, + "step": 4513 + }, + { + "epoch": 2.099209424895365, + "grad_norm": 0.39023512601852417, + "learning_rate": 2.502343776021615e-06, + "loss": 0.3475, + "step": 4514 + }, + { + "epoch": 2.099674469074562, + "grad_norm": 0.37429752945899963, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.325, + "step": 4515 + }, + { + "epoch": 2.100139513253759, + "grad_norm": 0.3756406605243683, + "learning_rate": 2.4976569561877774e-06, + "loss": 0.327, + "step": 4516 + }, + { + "epoch": 2.100604557432956, + "grad_norm": 0.36603933572769165, + "learning_rate": 2.4953146452711866e-06, + "loss": 0.341, + "step": 4517 + }, + { + "epoch": 2.1010696016121533, + "grad_norm": 0.34370267391204834, + "learning_rate": 2.492973067936251e-06, + "loss": 0.317, + "step": 4518 + }, + { + "epoch": 2.10153464579135, + "grad_norm": 0.40021899342536926, + "learning_rate": 2.490632224868783e-06, + "loss": 0.3222, + "step": 4519 + }, + { + "epoch": 2.101999689970547, + "grad_norm": 0.39490172266960144, + "learning_rate": 2.4882921167543745e-06, + "loss": 0.3088, + "step": 4520 + }, + { + "epoch": 2.102464734149744, + "grad_norm": 0.37795066833496094, + "learning_rate": 2.485952744278407e-06, + "loss": 0.3156, + "step": 4521 + }, + { + "epoch": 2.1029297783289413, + "grad_norm": 0.3622766435146332, + "learning_rate": 2.483614108126045e-06, + "loss": 0.3002, + "step": 4522 + }, + { + "epoch": 2.1033948225081383, + "grad_norm": 0.33893483877182007, + "learning_rate": 2.4812762089822384e-06, + "loss": 0.3183, + "step": 4523 + }, + { + "epoch": 2.103859866687335, + "grad_norm": 0.3871879279613495, + "learning_rate": 2.478939047531716e-06, + "loss": 0.3391, + "step": 4524 + }, + { + "epoch": 2.1043249108665325, + "grad_norm": 0.34787362813949585, + "learning_rate": 2.4766026244589986e-06, + "loss": 0.2944, + "step": 4525 + }, + { + "epoch": 2.1047899550457294, + "grad_norm": 0.36640048027038574, + "learning_rate": 2.4742669404483825e-06, + "loss": 0.3368, + "step": 4526 + }, + { + "epoch": 2.1052549992249263, + "grad_norm": 0.36439046263694763, + "learning_rate": 2.471931996183956e-06, + "loss": 0.3047, + "step": 4527 + }, + { + "epoch": 2.105720043404123, + "grad_norm": 0.33959466218948364, + "learning_rate": 2.4695977923495816e-06, + "loss": 0.2989, + "step": 4528 + }, + { + "epoch": 2.1061850875833206, + "grad_norm": 0.3902963697910309, + "learning_rate": 2.4672643296289145e-06, + "loss": 0.3183, + "step": 4529 + }, + { + "epoch": 2.1066501317625175, + "grad_norm": 0.4295981228351593, + "learning_rate": 2.464931608705384e-06, + "loss": 0.2918, + "step": 4530 + }, + { + "epoch": 2.1071151759417144, + "grad_norm": 0.36755356192588806, + "learning_rate": 2.462599630262209e-06, + "loss": 0.324, + "step": 4531 + }, + { + "epoch": 2.1075802201209113, + "grad_norm": 0.3295392394065857, + "learning_rate": 2.4602683949823853e-06, + "loss": 0.3026, + "step": 4532 + }, + { + "epoch": 2.1080452643001086, + "grad_norm": 0.34905338287353516, + "learning_rate": 2.457937903548695e-06, + "loss": 0.3265, + "step": 4533 + }, + { + "epoch": 2.1085103084793055, + "grad_norm": 0.37717199325561523, + "learning_rate": 2.4556081566437025e-06, + "loss": 0.3326, + "step": 4534 + }, + { + "epoch": 2.1089753526585024, + "grad_norm": 0.34889674186706543, + "learning_rate": 2.453279154949753e-06, + "loss": 0.3113, + "step": 4535 + }, + { + "epoch": 2.1094403968377, + "grad_norm": 0.35179656744003296, + "learning_rate": 2.4509508991489704e-06, + "loss": 0.3188, + "step": 4536 + }, + { + "epoch": 2.1099054410168967, + "grad_norm": 0.35203817486763, + "learning_rate": 2.4486233899232674e-06, + "loss": 0.3106, + "step": 4537 + }, + { + "epoch": 2.1103704851960936, + "grad_norm": 0.3592386841773987, + "learning_rate": 2.4462966279543287e-06, + "loss": 0.3241, + "step": 4538 + }, + { + "epoch": 2.1108355293752905, + "grad_norm": 0.37422919273376465, + "learning_rate": 2.4439706139236295e-06, + "loss": 0.2891, + "step": 4539 + }, + { + "epoch": 2.111300573554488, + "grad_norm": 0.37109753489494324, + "learning_rate": 2.4416453485124196e-06, + "loss": 0.3353, + "step": 4540 + }, + { + "epoch": 2.1117656177336848, + "grad_norm": 0.35821226239204407, + "learning_rate": 2.4393208324017294e-06, + "loss": 0.3409, + "step": 4541 + }, + { + "epoch": 2.1122306619128817, + "grad_norm": 0.3365572690963745, + "learning_rate": 2.4369970662723756e-06, + "loss": 0.2886, + "step": 4542 + }, + { + "epoch": 2.1126957060920786, + "grad_norm": 0.3579605221748352, + "learning_rate": 2.4346740508049484e-06, + "loss": 0.3297, + "step": 4543 + }, + { + "epoch": 2.113160750271276, + "grad_norm": 0.32147571444511414, + "learning_rate": 2.432351786679822e-06, + "loss": 0.2968, + "step": 4544 + }, + { + "epoch": 2.113625794450473, + "grad_norm": 0.34730449318885803, + "learning_rate": 2.430030274577151e-06, + "loss": 0.3432, + "step": 4545 + }, + { + "epoch": 2.1140908386296697, + "grad_norm": 0.3238038718700409, + "learning_rate": 2.4277095151768698e-06, + "loss": 0.2922, + "step": 4546 + }, + { + "epoch": 2.1145558828088666, + "grad_norm": 0.3770063817501068, + "learning_rate": 2.4253895091586883e-06, + "loss": 0.2981, + "step": 4547 + }, + { + "epoch": 2.115020926988064, + "grad_norm": 0.39007771015167236, + "learning_rate": 2.423070257202101e-06, + "loss": 0.3372, + "step": 4548 + }, + { + "epoch": 2.115485971167261, + "grad_norm": 0.3754151463508606, + "learning_rate": 2.420751759986376e-06, + "loss": 0.318, + "step": 4549 + }, + { + "epoch": 2.115951015346458, + "grad_norm": 0.3623104989528656, + "learning_rate": 2.4184340181905675e-06, + "loss": 0.3214, + "step": 4550 + }, + { + "epoch": 2.116416059525655, + "grad_norm": 0.34606826305389404, + "learning_rate": 2.4161170324935e-06, + "loss": 0.3351, + "step": 4551 + }, + { + "epoch": 2.116881103704852, + "grad_norm": 0.36132997274398804, + "learning_rate": 2.4138008035737858e-06, + "loss": 0.3475, + "step": 4552 + }, + { + "epoch": 2.117346147884049, + "grad_norm": 0.36312153935432434, + "learning_rate": 2.411485332109806e-06, + "loss": 0.3111, + "step": 4553 + }, + { + "epoch": 2.117811192063246, + "grad_norm": 0.38179275393486023, + "learning_rate": 2.4091706187797286e-06, + "loss": 0.321, + "step": 4554 + }, + { + "epoch": 2.118276236242443, + "grad_norm": 0.37554097175598145, + "learning_rate": 2.4068566642614923e-06, + "loss": 0.2791, + "step": 4555 + }, + { + "epoch": 2.11874128042164, + "grad_norm": 0.3833089768886566, + "learning_rate": 2.4045434692328172e-06, + "loss": 0.3357, + "step": 4556 + }, + { + "epoch": 2.119206324600837, + "grad_norm": 0.35031166672706604, + "learning_rate": 2.4022310343712022e-06, + "loss": 0.2867, + "step": 4557 + }, + { + "epoch": 2.119671368780034, + "grad_norm": 0.37646663188934326, + "learning_rate": 2.3999193603539234e-06, + "loss": 0.3037, + "step": 4558 + }, + { + "epoch": 2.1201364129592313, + "grad_norm": 0.34302130341529846, + "learning_rate": 2.3976084478580282e-06, + "loss": 0.3407, + "step": 4559 + }, + { + "epoch": 2.120601457138428, + "grad_norm": 0.336687296628952, + "learning_rate": 2.3952982975603494e-06, + "loss": 0.2947, + "step": 4560 + }, + { + "epoch": 2.121066501317625, + "grad_norm": 0.3723877966403961, + "learning_rate": 2.3929889101374887e-06, + "loss": 0.3331, + "step": 4561 + }, + { + "epoch": 2.121531545496822, + "grad_norm": 0.369495153427124, + "learning_rate": 2.3906802862658325e-06, + "loss": 0.3428, + "step": 4562 + }, + { + "epoch": 2.1219965896760193, + "grad_norm": 0.3618394136428833, + "learning_rate": 2.388372426621534e-06, + "loss": 0.3158, + "step": 4563 + }, + { + "epoch": 2.1224616338552162, + "grad_norm": 0.34468409419059753, + "learning_rate": 2.386065331880534e-06, + "loss": 0.3068, + "step": 4564 + }, + { + "epoch": 2.122926678034413, + "grad_norm": 0.3602186441421509, + "learning_rate": 2.3837590027185364e-06, + "loss": 0.326, + "step": 4565 + }, + { + "epoch": 2.1233917222136105, + "grad_norm": 0.339193195104599, + "learning_rate": 2.381453439811034e-06, + "loss": 0.3237, + "step": 4566 + }, + { + "epoch": 2.1238567663928074, + "grad_norm": 0.3470028340816498, + "learning_rate": 2.379148643833283e-06, + "loss": 0.3042, + "step": 4567 + }, + { + "epoch": 2.1243218105720043, + "grad_norm": 0.35298952460289, + "learning_rate": 2.3768446154603234e-06, + "loss": 0.3114, + "step": 4568 + }, + { + "epoch": 2.124786854751201, + "grad_norm": 0.39458978176116943, + "learning_rate": 2.3745413553669678e-06, + "loss": 0.331, + "step": 4569 + }, + { + "epoch": 2.1252518989303986, + "grad_norm": 0.36957892775535583, + "learning_rate": 2.3722388642278047e-06, + "loss": 0.2894, + "step": 4570 + }, + { + "epoch": 2.1257169431095955, + "grad_norm": 0.38413190841674805, + "learning_rate": 2.369937142717194e-06, + "loss": 0.3403, + "step": 4571 + }, + { + "epoch": 2.1261819872887924, + "grad_norm": 0.32337597012519836, + "learning_rate": 2.3676361915092757e-06, + "loss": 0.2965, + "step": 4572 + }, + { + "epoch": 2.1266470314679893, + "grad_norm": 0.3725772500038147, + "learning_rate": 2.3653360112779567e-06, + "loss": 0.3541, + "step": 4573 + }, + { + "epoch": 2.1271120756471866, + "grad_norm": 0.3611546456813812, + "learning_rate": 2.363036602696927e-06, + "loss": 0.2821, + "step": 4574 + }, + { + "epoch": 2.1275771198263835, + "grad_norm": 0.362352192401886, + "learning_rate": 2.3607379664396414e-06, + "loss": 0.3477, + "step": 4575 + }, + { + "epoch": 2.1280421640055804, + "grad_norm": 0.3450590968132019, + "learning_rate": 2.3584401031793377e-06, + "loss": 0.2978, + "step": 4576 + }, + { + "epoch": 2.128507208184778, + "grad_norm": 0.4075535535812378, + "learning_rate": 2.35614301358902e-06, + "loss": 0.358, + "step": 4577 + }, + { + "epoch": 2.1289722523639747, + "grad_norm": 0.3503515422344208, + "learning_rate": 2.353846698341468e-06, + "loss": 0.3289, + "step": 4578 + }, + { + "epoch": 2.1294372965431716, + "grad_norm": 0.38733020424842834, + "learning_rate": 2.351551158109235e-06, + "loss": 0.2964, + "step": 4579 + }, + { + "epoch": 2.1299023407223685, + "grad_norm": 0.35857686400413513, + "learning_rate": 2.3492563935646493e-06, + "loss": 0.3276, + "step": 4580 + }, + { + "epoch": 2.130367384901566, + "grad_norm": 0.34984397888183594, + "learning_rate": 2.3469624053798117e-06, + "loss": 0.3162, + "step": 4581 + }, + { + "epoch": 2.1308324290807628, + "grad_norm": 0.34331047534942627, + "learning_rate": 2.34466919422659e-06, + "loss": 0.309, + "step": 4582 + }, + { + "epoch": 2.1312974732599597, + "grad_norm": 0.32851484417915344, + "learning_rate": 2.3423767607766316e-06, + "loss": 0.3186, + "step": 4583 + }, + { + "epoch": 2.1317625174391566, + "grad_norm": 0.360724538564682, + "learning_rate": 2.34008510570135e-06, + "loss": 0.2974, + "step": 4584 + }, + { + "epoch": 2.132227561618354, + "grad_norm": 0.3493599593639374, + "learning_rate": 2.337794229671938e-06, + "loss": 0.3563, + "step": 4585 + }, + { + "epoch": 2.132692605797551, + "grad_norm": 0.3586118519306183, + "learning_rate": 2.3355041333593507e-06, + "loss": 0.3515, + "step": 4586 + }, + { + "epoch": 2.1331576499767477, + "grad_norm": 0.3299708664417267, + "learning_rate": 2.3332148174343257e-06, + "loss": 0.3117, + "step": 4587 + }, + { + "epoch": 2.1336226941559446, + "grad_norm": 0.31566065549850464, + "learning_rate": 2.3309262825673616e-06, + "loss": 0.3165, + "step": 4588 + }, + { + "epoch": 2.134087738335142, + "grad_norm": 0.35169747471809387, + "learning_rate": 2.3286385294287367e-06, + "loss": 0.3247, + "step": 4589 + }, + { + "epoch": 2.134552782514339, + "grad_norm": 0.3681233525276184, + "learning_rate": 2.3263515586884935e-06, + "loss": 0.3373, + "step": 4590 + }, + { + "epoch": 2.135017826693536, + "grad_norm": 0.3795618414878845, + "learning_rate": 2.32406537101645e-06, + "loss": 0.3029, + "step": 4591 + }, + { + "epoch": 2.1354828708727327, + "grad_norm": 0.3609673082828522, + "learning_rate": 2.3217799670821938e-06, + "loss": 0.3362, + "step": 4592 + }, + { + "epoch": 2.13594791505193, + "grad_norm": 0.3580998480319977, + "learning_rate": 2.3194953475550846e-06, + "loss": 0.3337, + "step": 4593 + }, + { + "epoch": 2.136412959231127, + "grad_norm": 0.33573347330093384, + "learning_rate": 2.3172115131042466e-06, + "loss": 0.2821, + "step": 4594 + }, + { + "epoch": 2.136878003410324, + "grad_norm": 0.3740091919898987, + "learning_rate": 2.314928464398581e-06, + "loss": 0.3662, + "step": 4595 + }, + { + "epoch": 2.137343047589521, + "grad_norm": 0.385172963142395, + "learning_rate": 2.3126462021067518e-06, + "loss": 0.3709, + "step": 4596 + }, + { + "epoch": 2.137808091768718, + "grad_norm": 0.3355659246444702, + "learning_rate": 2.310364726897202e-06, + "loss": 0.3105, + "step": 4597 + }, + { + "epoch": 2.138273135947915, + "grad_norm": 0.36844390630722046, + "learning_rate": 2.3080840394381327e-06, + "loss": 0.3155, + "step": 4598 + }, + { + "epoch": 2.138738180127112, + "grad_norm": 0.3672247529029846, + "learning_rate": 2.305804140397525e-06, + "loss": 0.3089, + "step": 4599 + }, + { + "epoch": 2.1392032243063093, + "grad_norm": 0.364468514919281, + "learning_rate": 2.3035250304431206e-06, + "loss": 0.3194, + "step": 4600 + }, + { + "epoch": 2.139668268485506, + "grad_norm": 0.37167108058929443, + "learning_rate": 2.3012467102424373e-06, + "loss": 0.3588, + "step": 4601 + }, + { + "epoch": 2.140133312664703, + "grad_norm": 0.3726482689380646, + "learning_rate": 2.2989691804627544e-06, + "loss": 0.3193, + "step": 4602 + }, + { + "epoch": 2.1405983568439, + "grad_norm": 0.3460210859775543, + "learning_rate": 2.296692441771125e-06, + "loss": 0.3113, + "step": 4603 + }, + { + "epoch": 2.1410634010230973, + "grad_norm": 0.3563862442970276, + "learning_rate": 2.29441649483437e-06, + "loss": 0.3405, + "step": 4604 + }, + { + "epoch": 2.1415284452022942, + "grad_norm": 0.3387638032436371, + "learning_rate": 2.2921413403190774e-06, + "loss": 0.3125, + "step": 4605 + }, + { + "epoch": 2.141993489381491, + "grad_norm": 0.38365113735198975, + "learning_rate": 2.2898669788916006e-06, + "loss": 0.3201, + "step": 4606 + }, + { + "epoch": 2.1424585335606885, + "grad_norm": 0.369579553604126, + "learning_rate": 2.2875934112180664e-06, + "loss": 0.3501, + "step": 4607 + }, + { + "epoch": 2.1429235777398854, + "grad_norm": 0.36825814843177795, + "learning_rate": 2.285320637964362e-06, + "loss": 0.3147, + "step": 4608 + }, + { + "epoch": 2.1433886219190823, + "grad_norm": 0.3360195457935333, + "learning_rate": 2.2830486597961504e-06, + "loss": 0.2841, + "step": 4609 + }, + { + "epoch": 2.143853666098279, + "grad_norm": 0.3493092358112335, + "learning_rate": 2.2807774773788518e-06, + "loss": 0.3183, + "step": 4610 + }, + { + "epoch": 2.1443187102774766, + "grad_norm": 0.3550946116447449, + "learning_rate": 2.2785070913776635e-06, + "loss": 0.3226, + "step": 4611 + }, + { + "epoch": 2.1447837544566735, + "grad_norm": 0.3619578778743744, + "learning_rate": 2.2762375024575424e-06, + "loss": 0.3189, + "step": 4612 + }, + { + "epoch": 2.1452487986358704, + "grad_norm": 0.388798326253891, + "learning_rate": 2.2739687112832125e-06, + "loss": 0.3142, + "step": 4613 + }, + { + "epoch": 2.1457138428150673, + "grad_norm": 0.3689102530479431, + "learning_rate": 2.2717007185191673e-06, + "loss": 0.3336, + "step": 4614 + }, + { + "epoch": 2.1461788869942646, + "grad_norm": 0.3550708591938019, + "learning_rate": 2.269433524829666e-06, + "loss": 0.3183, + "step": 4615 + }, + { + "epoch": 2.1466439311734615, + "grad_norm": 0.3319764733314514, + "learning_rate": 2.267167130878734e-06, + "loss": 0.3024, + "step": 4616 + }, + { + "epoch": 2.1471089753526584, + "grad_norm": 0.35746893286705017, + "learning_rate": 2.2649015373301574e-06, + "loss": 0.2984, + "step": 4617 + }, + { + "epoch": 2.1475740195318553, + "grad_norm": 0.3471658527851105, + "learning_rate": 2.2626367448474963e-06, + "loss": 0.3148, + "step": 4618 + }, + { + "epoch": 2.1480390637110527, + "grad_norm": 0.3576483130455017, + "learning_rate": 2.2603727540940673e-06, + "loss": 0.3567, + "step": 4619 + }, + { + "epoch": 2.1485041078902496, + "grad_norm": 0.31263265013694763, + "learning_rate": 2.25810956573296e-06, + "loss": 0.294, + "step": 4620 + }, + { + "epoch": 2.1489691520694465, + "grad_norm": 0.3338179588317871, + "learning_rate": 2.255847180427022e-06, + "loss": 0.319, + "step": 4621 + }, + { + "epoch": 2.1494341962486434, + "grad_norm": 0.33670803904533386, + "learning_rate": 2.2535855988388734e-06, + "loss": 0.3338, + "step": 4622 + }, + { + "epoch": 2.1498992404278408, + "grad_norm": 0.3188488185405731, + "learning_rate": 2.2513248216308897e-06, + "loss": 0.2784, + "step": 4623 + }, + { + "epoch": 2.1503642846070377, + "grad_norm": 0.40295830368995667, + "learning_rate": 2.249064849465221e-06, + "loss": 0.3436, + "step": 4624 + }, + { + "epoch": 2.1508293287862346, + "grad_norm": 0.35394805669784546, + "learning_rate": 2.2468056830037725e-06, + "loss": 0.3132, + "step": 4625 + }, + { + "epoch": 2.151294372965432, + "grad_norm": 0.3387957215309143, + "learning_rate": 2.2445473229082186e-06, + "loss": 0.3198, + "step": 4626 + }, + { + "epoch": 2.151759417144629, + "grad_norm": 0.34971579909324646, + "learning_rate": 2.2422897698399964e-06, + "loss": 0.2797, + "step": 4627 + }, + { + "epoch": 2.1522244613238257, + "grad_norm": 0.4021191895008087, + "learning_rate": 2.240033024460309e-06, + "loss": 0.3572, + "step": 4628 + }, + { + "epoch": 2.1526895055030226, + "grad_norm": 0.34752318263053894, + "learning_rate": 2.2377770874301157e-06, + "loss": 0.3026, + "step": 4629 + }, + { + "epoch": 2.15315454968222, + "grad_norm": 0.338008850812912, + "learning_rate": 2.2355219594101483e-06, + "loss": 0.3024, + "step": 4630 + }, + { + "epoch": 2.153619593861417, + "grad_norm": 0.331898033618927, + "learning_rate": 2.2332676410608937e-06, + "loss": 0.3002, + "step": 4631 + }, + { + "epoch": 2.154084638040614, + "grad_norm": 0.36906805634498596, + "learning_rate": 2.231014133042608e-06, + "loss": 0.3276, + "step": 4632 + }, + { + "epoch": 2.1545496822198107, + "grad_norm": 0.35603275895118713, + "learning_rate": 2.2287614360153042e-06, + "loss": 0.3409, + "step": 4633 + }, + { + "epoch": 2.155014726399008, + "grad_norm": 0.3487357199192047, + "learning_rate": 2.226509550638764e-06, + "loss": 0.326, + "step": 4634 + }, + { + "epoch": 2.155479770578205, + "grad_norm": 0.3237025737762451, + "learning_rate": 2.224258477572524e-06, + "loss": 0.2981, + "step": 4635 + }, + { + "epoch": 2.155944814757402, + "grad_norm": 0.35809311270713806, + "learning_rate": 2.222008217475891e-06, + "loss": 0.307, + "step": 4636 + }, + { + "epoch": 2.156409858936599, + "grad_norm": 0.3373444080352783, + "learning_rate": 2.219758771007926e-06, + "loss": 0.3359, + "step": 4637 + }, + { + "epoch": 2.156874903115796, + "grad_norm": 0.36277028918266296, + "learning_rate": 2.217510138827457e-06, + "loss": 0.3766, + "step": 4638 + }, + { + "epoch": 2.157339947294993, + "grad_norm": 0.32372814416885376, + "learning_rate": 2.215262321593072e-06, + "loss": 0.2788, + "step": 4639 + }, + { + "epoch": 2.15780499147419, + "grad_norm": 0.37135806679725647, + "learning_rate": 2.2130153199631214e-06, + "loss": 0.3146, + "step": 4640 + }, + { + "epoch": 2.1582700356533873, + "grad_norm": 0.3312724828720093, + "learning_rate": 2.2107691345957133e-06, + "loss": 0.3261, + "step": 4641 + }, + { + "epoch": 2.158735079832584, + "grad_norm": 0.3527110517024994, + "learning_rate": 2.208523766148721e-06, + "loss": 0.3189, + "step": 4642 + }, + { + "epoch": 2.159200124011781, + "grad_norm": 0.3383033275604248, + "learning_rate": 2.2062792152797733e-06, + "loss": 0.3293, + "step": 4643 + }, + { + "epoch": 2.159665168190978, + "grad_norm": 0.4051493704319, + "learning_rate": 2.204035482646267e-06, + "loss": 0.3627, + "step": 4644 + }, + { + "epoch": 2.1601302123701753, + "grad_norm": 0.3591381311416626, + "learning_rate": 2.20179256890535e-06, + "loss": 0.2986, + "step": 4645 + }, + { + "epoch": 2.1605952565493722, + "grad_norm": 0.38014209270477295, + "learning_rate": 2.1995504747139397e-06, + "loss": 0.306, + "step": 4646 + }, + { + "epoch": 2.161060300728569, + "grad_norm": 0.33583784103393555, + "learning_rate": 2.1973092007287054e-06, + "loss": 0.3432, + "step": 4647 + }, + { + "epoch": 2.161525344907766, + "grad_norm": 0.3436342775821686, + "learning_rate": 2.195068747606084e-06, + "loss": 0.3368, + "step": 4648 + }, + { + "epoch": 2.1619903890869634, + "grad_norm": 0.33585768938064575, + "learning_rate": 2.1928291160022634e-06, + "loss": 0.3136, + "step": 4649 + }, + { + "epoch": 2.1624554332661603, + "grad_norm": 0.35160210728645325, + "learning_rate": 2.190590306573198e-06, + "loss": 0.3302, + "step": 4650 + }, + { + "epoch": 2.162920477445357, + "grad_norm": 0.35300135612487793, + "learning_rate": 2.1883523199745987e-06, + "loss": 0.2987, + "step": 4651 + }, + { + "epoch": 2.163385521624554, + "grad_norm": 0.4004720151424408, + "learning_rate": 2.1861151568619336e-06, + "loss": 0.3809, + "step": 4652 + }, + { + "epoch": 2.1638505658037515, + "grad_norm": 0.3017584979534149, + "learning_rate": 2.1838788178904346e-06, + "loss": 0.2655, + "step": 4653 + }, + { + "epoch": 2.1643156099829484, + "grad_norm": 0.3657880127429962, + "learning_rate": 2.1816433037150856e-06, + "loss": 0.3563, + "step": 4654 + }, + { + "epoch": 2.1647806541621453, + "grad_norm": 0.362338125705719, + "learning_rate": 2.179408614990635e-06, + "loss": 0.3145, + "step": 4655 + }, + { + "epoch": 2.1652456983413426, + "grad_norm": 0.3580748736858368, + "learning_rate": 2.177174752371585e-06, + "loss": 0.3113, + "step": 4656 + }, + { + "epoch": 2.1657107425205395, + "grad_norm": 0.37395989894866943, + "learning_rate": 2.1749417165121994e-06, + "loss": 0.3297, + "step": 4657 + }, + { + "epoch": 2.1661757866997364, + "grad_norm": 0.35120391845703125, + "learning_rate": 2.1727095080664956e-06, + "loss": 0.3417, + "step": 4658 + }, + { + "epoch": 2.1666408308789333, + "grad_norm": 0.3511483371257782, + "learning_rate": 2.1704781276882547e-06, + "loss": 0.3232, + "step": 4659 + }, + { + "epoch": 2.1671058750581307, + "grad_norm": 0.3221030533313751, + "learning_rate": 2.168247576031008e-06, + "loss": 0.3062, + "step": 4660 + }, + { + "epoch": 2.1675709192373276, + "grad_norm": 0.3379102349281311, + "learning_rate": 2.16601785374805e-06, + "loss": 0.3103, + "step": 4661 + }, + { + "epoch": 2.1680359634165245, + "grad_norm": 0.35525548458099365, + "learning_rate": 2.163788961492429e-06, + "loss": 0.3502, + "step": 4662 + }, + { + "epoch": 2.1685010075957214, + "grad_norm": 0.33914363384246826, + "learning_rate": 2.161560899916954e-06, + "loss": 0.3336, + "step": 4663 + }, + { + "epoch": 2.1689660517749187, + "grad_norm": 0.36163097620010376, + "learning_rate": 2.159333669674185e-06, + "loss": 0.3073, + "step": 4664 + }, + { + "epoch": 2.1694310959541157, + "grad_norm": 0.34654346108436584, + "learning_rate": 2.1571072714164445e-06, + "loss": 0.3466, + "step": 4665 + }, + { + "epoch": 2.1698961401333126, + "grad_norm": 0.33517155051231384, + "learning_rate": 2.1548817057958043e-06, + "loss": 0.3351, + "step": 4666 + }, + { + "epoch": 2.17036118431251, + "grad_norm": 0.3178362250328064, + "learning_rate": 2.152656973464101e-06, + "loss": 0.2925, + "step": 4667 + }, + { + "epoch": 2.170826228491707, + "grad_norm": 0.4054083824157715, + "learning_rate": 2.1504330750729185e-06, + "loss": 0.3176, + "step": 4668 + }, + { + "epoch": 2.1712912726709037, + "grad_norm": 0.31070810556411743, + "learning_rate": 2.1482100112736044e-06, + "loss": 0.2867, + "step": 4669 + }, + { + "epoch": 2.1717563168501006, + "grad_norm": 0.3718927800655365, + "learning_rate": 2.1459877827172538e-06, + "loss": 0.3633, + "step": 4670 + }, + { + "epoch": 2.172221361029298, + "grad_norm": 0.36688584089279175, + "learning_rate": 2.1437663900547255e-06, + "loss": 0.3169, + "step": 4671 + }, + { + "epoch": 2.172686405208495, + "grad_norm": 0.35190173983573914, + "learning_rate": 2.141545833936625e-06, + "loss": 0.3077, + "step": 4672 + }, + { + "epoch": 2.173151449387692, + "grad_norm": 0.3758264482021332, + "learning_rate": 2.13932611501332e-06, + "loss": 0.36, + "step": 4673 + }, + { + "epoch": 2.1736164935668887, + "grad_norm": 0.33575019240379333, + "learning_rate": 2.1371072339349293e-06, + "loss": 0.3308, + "step": 4674 + }, + { + "epoch": 2.174081537746086, + "grad_norm": 0.33818864822387695, + "learning_rate": 2.1348891913513293e-06, + "loss": 0.3019, + "step": 4675 + }, + { + "epoch": 2.174546581925283, + "grad_norm": 0.3424922227859497, + "learning_rate": 2.132671987912145e-06, + "loss": 0.2985, + "step": 4676 + }, + { + "epoch": 2.17501162610448, + "grad_norm": 0.3274764120578766, + "learning_rate": 2.130455624266762e-06, + "loss": 0.3055, + "step": 4677 + }, + { + "epoch": 2.1754766702836768, + "grad_norm": 0.3366675078868866, + "learning_rate": 2.128240101064315e-06, + "loss": 0.3189, + "step": 4678 + }, + { + "epoch": 2.175941714462874, + "grad_norm": 0.38803109526634216, + "learning_rate": 2.126025418953698e-06, + "loss": 0.3544, + "step": 4679 + }, + { + "epoch": 2.176406758642071, + "grad_norm": 0.32740333676338196, + "learning_rate": 2.1238115785835512e-06, + "loss": 0.3153, + "step": 4680 + }, + { + "epoch": 2.176871802821268, + "grad_norm": 0.30742892622947693, + "learning_rate": 2.1215985806022765e-06, + "loss": 0.2642, + "step": 4681 + }, + { + "epoch": 2.177336847000465, + "grad_norm": 0.3487866520881653, + "learning_rate": 2.1193864256580215e-06, + "loss": 0.3637, + "step": 4682 + }, + { + "epoch": 2.177801891179662, + "grad_norm": 0.35300448536872864, + "learning_rate": 2.117175114398694e-06, + "loss": 0.3174, + "step": 4683 + }, + { + "epoch": 2.178266935358859, + "grad_norm": 0.3436736762523651, + "learning_rate": 2.1149646474719475e-06, + "loss": 0.3213, + "step": 4684 + }, + { + "epoch": 2.178731979538056, + "grad_norm": 0.32160818576812744, + "learning_rate": 2.112755025525193e-06, + "loss": 0.3061, + "step": 4685 + }, + { + "epoch": 2.1791970237172533, + "grad_norm": 0.3675907552242279, + "learning_rate": 2.110546249205597e-06, + "loss": 0.3207, + "step": 4686 + }, + { + "epoch": 2.1796620678964502, + "grad_norm": 0.3436359465122223, + "learning_rate": 2.1083383191600676e-06, + "loss": 0.3222, + "step": 4687 + }, + { + "epoch": 2.180127112075647, + "grad_norm": 0.32878395915031433, + "learning_rate": 2.106131236035277e-06, + "loss": 0.3153, + "step": 4688 + }, + { + "epoch": 2.180592156254844, + "grad_norm": 0.3562372624874115, + "learning_rate": 2.1039250004776397e-06, + "loss": 0.349, + "step": 4689 + }, + { + "epoch": 2.1810572004340414, + "grad_norm": 0.33452579379081726, + "learning_rate": 2.1017196131333306e-06, + "loss": 0.3016, + "step": 4690 + }, + { + "epoch": 2.1815222446132383, + "grad_norm": 0.37932857871055603, + "learning_rate": 2.099515074648267e-06, + "loss": 0.3182, + "step": 4691 + }, + { + "epoch": 2.181987288792435, + "grad_norm": 0.35700809955596924, + "learning_rate": 2.0973113856681277e-06, + "loss": 0.3217, + "step": 4692 + }, + { + "epoch": 2.182452332971632, + "grad_norm": 0.37079155445098877, + "learning_rate": 2.0951085468383326e-06, + "loss": 0.3399, + "step": 4693 + }, + { + "epoch": 2.1829173771508295, + "grad_norm": 0.3523573577404022, + "learning_rate": 2.0929065588040615e-06, + "loss": 0.3152, + "step": 4694 + }, + { + "epoch": 2.1833824213300264, + "grad_norm": 0.33730053901672363, + "learning_rate": 2.0907054222102367e-06, + "loss": 0.3066, + "step": 4695 + }, + { + "epoch": 2.1838474655092233, + "grad_norm": 0.34836792945861816, + "learning_rate": 2.088505137701538e-06, + "loss": 0.3236, + "step": 4696 + }, + { + "epoch": 2.1843125096884206, + "grad_norm": 0.3443335294723511, + "learning_rate": 2.0863057059223923e-06, + "loss": 0.2979, + "step": 4697 + }, + { + "epoch": 2.1847775538676175, + "grad_norm": 0.32043859362602234, + "learning_rate": 2.08410712751698e-06, + "loss": 0.3132, + "step": 4698 + }, + { + "epoch": 2.1852425980468144, + "grad_norm": 0.3443070948123932, + "learning_rate": 2.081909403129225e-06, + "loss": 0.297, + "step": 4699 + }, + { + "epoch": 2.1857076422260113, + "grad_norm": 0.3612792193889618, + "learning_rate": 2.079712533402808e-06, + "loss": 0.3369, + "step": 4700 + }, + { + "epoch": 2.1861726864052087, + "grad_norm": 0.3530242443084717, + "learning_rate": 2.0775165189811534e-06, + "loss": 0.2989, + "step": 4701 + }, + { + "epoch": 2.1866377305844056, + "grad_norm": 0.33956217765808105, + "learning_rate": 2.0753213605074424e-06, + "loss": 0.3195, + "step": 4702 + }, + { + "epoch": 2.1871027747636025, + "grad_norm": 0.3230421245098114, + "learning_rate": 2.0731270586245972e-06, + "loss": 0.3039, + "step": 4703 + }, + { + "epoch": 2.1875678189427994, + "grad_norm": 0.41725921630859375, + "learning_rate": 2.070933613975296e-06, + "loss": 0.3244, + "step": 4704 + }, + { + "epoch": 2.1880328631219967, + "grad_norm": 0.33200743794441223, + "learning_rate": 2.068741027201961e-06, + "loss": 0.3153, + "step": 4705 + }, + { + "epoch": 2.1884979073011936, + "grad_norm": 0.3534869849681854, + "learning_rate": 2.066549298946767e-06, + "loss": 0.344, + "step": 4706 + }, + { + "epoch": 2.1889629514803906, + "grad_norm": 0.308760404586792, + "learning_rate": 2.064358429851634e-06, + "loss": 0.2962, + "step": 4707 + }, + { + "epoch": 2.1894279956595875, + "grad_norm": 0.36021149158477783, + "learning_rate": 2.062168420558232e-06, + "loss": 0.3497, + "step": 4708 + }, + { + "epoch": 2.189893039838785, + "grad_norm": 0.3680166006088257, + "learning_rate": 2.0599792717079807e-06, + "loss": 0.3084, + "step": 4709 + }, + { + "epoch": 2.1903580840179817, + "grad_norm": 0.3480318486690521, + "learning_rate": 2.0577909839420468e-06, + "loss": 0.3213, + "step": 4710 + }, + { + "epoch": 2.1908231281971786, + "grad_norm": 0.33287766575813293, + "learning_rate": 2.0556035579013417e-06, + "loss": 0.2899, + "step": 4711 + }, + { + "epoch": 2.1912881723763755, + "grad_norm": 0.36140042543411255, + "learning_rate": 2.0534169942265298e-06, + "loss": 0.3306, + "step": 4712 + }, + { + "epoch": 2.191753216555573, + "grad_norm": 0.3293544352054596, + "learning_rate": 2.0512312935580163e-06, + "loss": 0.2999, + "step": 4713 + }, + { + "epoch": 2.1922182607347698, + "grad_norm": 0.3596755266189575, + "learning_rate": 2.0490464565359615e-06, + "loss": 0.3341, + "step": 4714 + }, + { + "epoch": 2.1926833049139667, + "grad_norm": 0.3495473563671112, + "learning_rate": 2.0468624838002647e-06, + "loss": 0.3237, + "step": 4715 + }, + { + "epoch": 2.193148349093164, + "grad_norm": 0.3244265615940094, + "learning_rate": 2.044679375990581e-06, + "loss": 0.3232, + "step": 4716 + }, + { + "epoch": 2.193613393272361, + "grad_norm": 0.34927719831466675, + "learning_rate": 2.0424971337463017e-06, + "loss": 0.3305, + "step": 4717 + }, + { + "epoch": 2.194078437451558, + "grad_norm": 0.3691467046737671, + "learning_rate": 2.0403157577065746e-06, + "loss": 0.3485, + "step": 4718 + }, + { + "epoch": 2.1945434816307547, + "grad_norm": 0.3577490448951721, + "learning_rate": 2.0381352485102857e-06, + "loss": 0.3291, + "step": 4719 + }, + { + "epoch": 2.195008525809952, + "grad_norm": 0.344149649143219, + "learning_rate": 2.0359556067960727e-06, + "loss": 0.3146, + "step": 4720 + }, + { + "epoch": 2.195473569989149, + "grad_norm": 0.33729931712150574, + "learning_rate": 2.0337768332023185e-06, + "loss": 0.2903, + "step": 4721 + }, + { + "epoch": 2.195938614168346, + "grad_norm": 0.36155104637145996, + "learning_rate": 2.0315989283671474e-06, + "loss": 0.3305, + "step": 4722 + }, + { + "epoch": 2.196403658347543, + "grad_norm": 0.3909613788127899, + "learning_rate": 2.029421892928436e-06, + "loss": 0.3343, + "step": 4723 + }, + { + "epoch": 2.19686870252674, + "grad_norm": 0.3571639358997345, + "learning_rate": 2.027245727523798e-06, + "loss": 0.2831, + "step": 4724 + }, + { + "epoch": 2.197333746705937, + "grad_norm": 0.3750656247138977, + "learning_rate": 2.0250704327906025e-06, + "loss": 0.3303, + "step": 4725 + }, + { + "epoch": 2.197798790885134, + "grad_norm": 0.35987409949302673, + "learning_rate": 2.022896009365952e-06, + "loss": 0.3574, + "step": 4726 + }, + { + "epoch": 2.1982638350643313, + "grad_norm": 0.3430827558040619, + "learning_rate": 2.020722457886705e-06, + "loss": 0.3216, + "step": 4727 + }, + { + "epoch": 2.1987288792435282, + "grad_norm": 0.3294491171836853, + "learning_rate": 2.018549778989456e-06, + "loss": 0.3249, + "step": 4728 + }, + { + "epoch": 2.199193923422725, + "grad_norm": 0.39135822653770447, + "learning_rate": 2.0163779733105497e-06, + "loss": 0.3184, + "step": 4729 + }, + { + "epoch": 2.199658967601922, + "grad_norm": 0.3750271201133728, + "learning_rate": 2.0142070414860704e-06, + "loss": 0.3288, + "step": 4730 + }, + { + "epoch": 2.2001240117811194, + "grad_norm": 0.35246542096138, + "learning_rate": 2.0120369841518496e-06, + "loss": 0.3001, + "step": 4731 + }, + { + "epoch": 2.2005890559603163, + "grad_norm": 0.3286935091018677, + "learning_rate": 2.009867801943462e-06, + "loss": 0.331, + "step": 4732 + }, + { + "epoch": 2.201054100139513, + "grad_norm": 0.37689733505249023, + "learning_rate": 2.007699495496228e-06, + "loss": 0.3225, + "step": 4733 + }, + { + "epoch": 2.20151914431871, + "grad_norm": 0.358782559633255, + "learning_rate": 2.0055320654452055e-06, + "loss": 0.3297, + "step": 4734 + }, + { + "epoch": 2.2019841884979074, + "grad_norm": 0.33447474241256714, + "learning_rate": 2.0033655124252033e-06, + "loss": 0.3196, + "step": 4735 + }, + { + "epoch": 2.2024492326771044, + "grad_norm": 0.36790183186531067, + "learning_rate": 2.001199837070766e-06, + "loss": 0.2924, + "step": 4736 + }, + { + "epoch": 2.2029142768563013, + "grad_norm": 0.384657084941864, + "learning_rate": 1.999035040016188e-06, + "loss": 0.355, + "step": 4737 + }, + { + "epoch": 2.2033793210354986, + "grad_norm": 0.32145190238952637, + "learning_rate": 1.9968711218954994e-06, + "loss": 0.3055, + "step": 4738 + }, + { + "epoch": 2.2038443652146955, + "grad_norm": 0.34638088941574097, + "learning_rate": 1.9947080833424816e-06, + "loss": 0.3214, + "step": 4739 + }, + { + "epoch": 2.2043094093938924, + "grad_norm": 0.33720049262046814, + "learning_rate": 1.9925459249906488e-06, + "loss": 0.3181, + "step": 4740 + }, + { + "epoch": 2.2047744535730893, + "grad_norm": 0.3274615705013275, + "learning_rate": 1.990384647473265e-06, + "loss": 0.3015, + "step": 4741 + }, + { + "epoch": 2.2052394977522867, + "grad_norm": 0.3554820716381073, + "learning_rate": 1.9882242514233313e-06, + "loss": 0.3141, + "step": 4742 + }, + { + "epoch": 2.2057045419314836, + "grad_norm": 0.35325977206230164, + "learning_rate": 1.9860647374735937e-06, + "loss": 0.309, + "step": 4743 + }, + { + "epoch": 2.2061695861106805, + "grad_norm": 0.3346039652824402, + "learning_rate": 1.9839061062565384e-06, + "loss": 0.2945, + "step": 4744 + }, + { + "epoch": 2.2066346302898774, + "grad_norm": 0.3343028426170349, + "learning_rate": 1.9817483584043954e-06, + "loss": 0.3328, + "step": 4745 + }, + { + "epoch": 2.2070996744690747, + "grad_norm": 0.33999499678611755, + "learning_rate": 1.9795914945491305e-06, + "loss": 0.323, + "step": 4746 + }, + { + "epoch": 2.2075647186482716, + "grad_norm": 0.32321181893348694, + "learning_rate": 1.977435515322458e-06, + "loss": 0.2882, + "step": 4747 + }, + { + "epoch": 2.2080297628274685, + "grad_norm": 0.38171252608299255, + "learning_rate": 1.9752804213558254e-06, + "loss": 0.3073, + "step": 4748 + }, + { + "epoch": 2.2084948070066655, + "grad_norm": 0.37734076380729675, + "learning_rate": 1.9731262132804275e-06, + "loss": 0.3522, + "step": 4749 + }, + { + "epoch": 2.208959851185863, + "grad_norm": 0.34129461646080017, + "learning_rate": 1.970972891727194e-06, + "loss": 0.3072, + "step": 4750 + }, + { + "epoch": 2.2094248953650597, + "grad_norm": 0.36008572578430176, + "learning_rate": 1.9688204573268015e-06, + "loss": 0.3233, + "step": 4751 + }, + { + "epoch": 2.2098899395442566, + "grad_norm": 0.3995450735092163, + "learning_rate": 1.9666689107096597e-06, + "loss": 0.3445, + "step": 4752 + }, + { + "epoch": 2.2103549837234535, + "grad_norm": 0.3434513211250305, + "learning_rate": 1.964518252505925e-06, + "loss": 0.3023, + "step": 4753 + }, + { + "epoch": 2.210820027902651, + "grad_norm": 0.31967639923095703, + "learning_rate": 1.962368483345486e-06, + "loss": 0.3327, + "step": 4754 + }, + { + "epoch": 2.2112850720818478, + "grad_norm": 0.30786556005477905, + "learning_rate": 1.9602196038579774e-06, + "loss": 0.3022, + "step": 4755 + }, + { + "epoch": 2.2117501162610447, + "grad_norm": 0.3452177345752716, + "learning_rate": 1.9580716146727734e-06, + "loss": 0.3256, + "step": 4756 + }, + { + "epoch": 2.212215160440242, + "grad_norm": 0.33406537771224976, + "learning_rate": 1.9559245164189812e-06, + "loss": 0.3443, + "step": 4757 + }, + { + "epoch": 2.212680204619439, + "grad_norm": 0.31842756271362305, + "learning_rate": 1.9537783097254543e-06, + "loss": 0.3247, + "step": 4758 + }, + { + "epoch": 2.213145248798636, + "grad_norm": 0.3189336955547333, + "learning_rate": 1.9516329952207787e-06, + "loss": 0.3472, + "step": 4759 + }, + { + "epoch": 2.2136102929778327, + "grad_norm": 0.3681834638118744, + "learning_rate": 1.949488573533285e-06, + "loss": 0.3135, + "step": 4760 + }, + { + "epoch": 2.21407533715703, + "grad_norm": 0.3542393445968628, + "learning_rate": 1.9473450452910365e-06, + "loss": 0.3136, + "step": 4761 + }, + { + "epoch": 2.214540381336227, + "grad_norm": 0.3548336327075958, + "learning_rate": 1.9452024111218414e-06, + "loss": 0.3259, + "step": 4762 + }, + { + "epoch": 2.215005425515424, + "grad_norm": 0.34949004650115967, + "learning_rate": 1.9430606716532393e-06, + "loss": 0.331, + "step": 4763 + }, + { + "epoch": 2.215470469694621, + "grad_norm": 0.3351438641548157, + "learning_rate": 1.940919827512513e-06, + "loss": 0.2809, + "step": 4764 + }, + { + "epoch": 2.215935513873818, + "grad_norm": 0.3751305639743805, + "learning_rate": 1.938779879326679e-06, + "loss": 0.3407, + "step": 4765 + }, + { + "epoch": 2.216400558053015, + "grad_norm": 0.3458346724510193, + "learning_rate": 1.936640827722494e-06, + "loss": 0.3044, + "step": 4766 + }, + { + "epoch": 2.216865602232212, + "grad_norm": 0.31323379278182983, + "learning_rate": 1.934502673326452e-06, + "loss": 0.2983, + "step": 4767 + }, + { + "epoch": 2.2173306464114093, + "grad_norm": 0.35735267400741577, + "learning_rate": 1.9323654167647854e-06, + "loss": 0.3316, + "step": 4768 + }, + { + "epoch": 2.217795690590606, + "grad_norm": 0.3447880744934082, + "learning_rate": 1.930229058663459e-06, + "loss": 0.2798, + "step": 4769 + }, + { + "epoch": 2.218260734769803, + "grad_norm": 0.3700970709323883, + "learning_rate": 1.9280935996481792e-06, + "loss": 0.319, + "step": 4770 + }, + { + "epoch": 2.218725778949, + "grad_norm": 0.34711953997612, + "learning_rate": 1.9259590403443857e-06, + "loss": 0.3449, + "step": 4771 + }, + { + "epoch": 2.2191908231281974, + "grad_norm": 0.3344910144805908, + "learning_rate": 1.923825381377259e-06, + "loss": 0.2688, + "step": 4772 + }, + { + "epoch": 2.2196558673073943, + "grad_norm": 0.38596683740615845, + "learning_rate": 1.9216926233717087e-06, + "loss": 0.3415, + "step": 4773 + }, + { + "epoch": 2.220120911486591, + "grad_norm": 0.36054715514183044, + "learning_rate": 1.9195607669523903e-06, + "loss": 0.2957, + "step": 4774 + }, + { + "epoch": 2.220585955665788, + "grad_norm": 0.33113333582878113, + "learning_rate": 1.9174298127436845e-06, + "loss": 0.3162, + "step": 4775 + }, + { + "epoch": 2.2210509998449854, + "grad_norm": 0.3493165373802185, + "learning_rate": 1.9152997613697184e-06, + "loss": 0.3168, + "step": 4776 + }, + { + "epoch": 2.2215160440241823, + "grad_norm": 0.37465736269950867, + "learning_rate": 1.913170613454345e-06, + "loss": 0.351, + "step": 4777 + }, + { + "epoch": 2.2219810882033793, + "grad_norm": 0.3411155045032501, + "learning_rate": 1.9110423696211588e-06, + "loss": 0.3113, + "step": 4778 + }, + { + "epoch": 2.222446132382576, + "grad_norm": 0.35005953907966614, + "learning_rate": 1.9089150304934883e-06, + "loss": 0.3244, + "step": 4779 + }, + { + "epoch": 2.2229111765617735, + "grad_norm": 0.3763487935066223, + "learning_rate": 1.9067885966943983e-06, + "loss": 0.3509, + "step": 4780 + }, + { + "epoch": 2.2233762207409704, + "grad_norm": 0.3517736792564392, + "learning_rate": 1.9046630688466827e-06, + "loss": 0.3222, + "step": 4781 + }, + { + "epoch": 2.2238412649201673, + "grad_norm": 0.3561932146549225, + "learning_rate": 1.9025384475728787e-06, + "loss": 0.3047, + "step": 4782 + }, + { + "epoch": 2.224306309099364, + "grad_norm": 0.3545885980129242, + "learning_rate": 1.9004147334952483e-06, + "loss": 0.3192, + "step": 4783 + }, + { + "epoch": 2.2247713532785616, + "grad_norm": 0.3312489092350006, + "learning_rate": 1.8982919272357974e-06, + "loss": 0.3082, + "step": 4784 + }, + { + "epoch": 2.2252363974577585, + "grad_norm": 0.36054548621177673, + "learning_rate": 1.8961700294162578e-06, + "loss": 0.3275, + "step": 4785 + }, + { + "epoch": 2.2257014416369554, + "grad_norm": 0.350093811750412, + "learning_rate": 1.8940490406581018e-06, + "loss": 0.2902, + "step": 4786 + }, + { + "epoch": 2.2261664858161527, + "grad_norm": 0.34745752811431885, + "learning_rate": 1.8919289615825304e-06, + "loss": 0.3468, + "step": 4787 + }, + { + "epoch": 2.2266315299953496, + "grad_norm": 0.3414476811885834, + "learning_rate": 1.8898097928104825e-06, + "loss": 0.2941, + "step": 4788 + }, + { + "epoch": 2.2270965741745465, + "grad_norm": 0.3686240315437317, + "learning_rate": 1.8876915349626258e-06, + "loss": 0.3379, + "step": 4789 + }, + { + "epoch": 2.2275616183537434, + "grad_norm": 0.3241952657699585, + "learning_rate": 1.8855741886593643e-06, + "loss": 0.3107, + "step": 4790 + }, + { + "epoch": 2.228026662532941, + "grad_norm": 0.3654305934906006, + "learning_rate": 1.883457754520835e-06, + "loss": 0.3199, + "step": 4791 + }, + { + "epoch": 2.2284917067121377, + "grad_norm": 0.3426763415336609, + "learning_rate": 1.8813422331669084e-06, + "loss": 0.3018, + "step": 4792 + }, + { + "epoch": 2.2289567508913346, + "grad_norm": 0.39639508724212646, + "learning_rate": 1.8792276252171855e-06, + "loss": 0.3529, + "step": 4793 + }, + { + "epoch": 2.2294217950705315, + "grad_norm": 0.3361477553844452, + "learning_rate": 1.8771139312909976e-06, + "loss": 0.2977, + "step": 4794 + }, + { + "epoch": 2.229886839249729, + "grad_norm": 0.3340165913105011, + "learning_rate": 1.8750011520074158e-06, + "loss": 0.2923, + "step": 4795 + }, + { + "epoch": 2.2303518834289258, + "grad_norm": 0.3512633442878723, + "learning_rate": 1.8728892879852345e-06, + "loss": 0.3543, + "step": 4796 + }, + { + "epoch": 2.2308169276081227, + "grad_norm": 0.35980337858200073, + "learning_rate": 1.870778339842989e-06, + "loss": 0.3258, + "step": 4797 + }, + { + "epoch": 2.23128197178732, + "grad_norm": 0.3445799946784973, + "learning_rate": 1.8686683081989371e-06, + "loss": 0.3035, + "step": 4798 + }, + { + "epoch": 2.231747015966517, + "grad_norm": 0.3666563630104065, + "learning_rate": 1.866559193671077e-06, + "loss": 0.2764, + "step": 4799 + }, + { + "epoch": 2.232212060145714, + "grad_norm": 0.36018139123916626, + "learning_rate": 1.8644509968771302e-06, + "loss": 0.3334, + "step": 4800 + }, + { + "epoch": 2.2326771043249107, + "grad_norm": 0.3475804626941681, + "learning_rate": 1.8623437184345556e-06, + "loss": 0.34, + "step": 4801 + }, + { + "epoch": 2.233142148504108, + "grad_norm": 0.33812299370765686, + "learning_rate": 1.86023735896054e-06, + "loss": 0.3183, + "step": 4802 + }, + { + "epoch": 2.233607192683305, + "grad_norm": 0.3337913453578949, + "learning_rate": 1.8581319190720038e-06, + "loss": 0.3004, + "step": 4803 + }, + { + "epoch": 2.234072236862502, + "grad_norm": 0.3546447157859802, + "learning_rate": 1.8560273993855938e-06, + "loss": 0.3009, + "step": 4804 + }, + { + "epoch": 2.234537281041699, + "grad_norm": 0.37179654836654663, + "learning_rate": 1.8539238005176912e-06, + "loss": 0.3277, + "step": 4805 + }, + { + "epoch": 2.235002325220896, + "grad_norm": 0.36469766497612, + "learning_rate": 1.8518211230844042e-06, + "loss": 0.3292, + "step": 4806 + }, + { + "epoch": 2.235467369400093, + "grad_norm": 0.3361158072948456, + "learning_rate": 1.849719367701575e-06, + "loss": 0.3354, + "step": 4807 + }, + { + "epoch": 2.23593241357929, + "grad_norm": 0.3247022330760956, + "learning_rate": 1.8476185349847713e-06, + "loss": 0.3196, + "step": 4808 + }, + { + "epoch": 2.236397457758487, + "grad_norm": 0.36382511258125305, + "learning_rate": 1.8455186255492956e-06, + "loss": 0.342, + "step": 4809 + }, + { + "epoch": 2.236862501937684, + "grad_norm": 0.33687886595726013, + "learning_rate": 1.8434196400101744e-06, + "loss": 0.2892, + "step": 4810 + }, + { + "epoch": 2.237327546116881, + "grad_norm": 0.3983268737792969, + "learning_rate": 1.8413215789821692e-06, + "loss": 0.3366, + "step": 4811 + }, + { + "epoch": 2.237792590296078, + "grad_norm": 0.37651053071022034, + "learning_rate": 1.839224443079765e-06, + "loss": 0.35, + "step": 4812 + }, + { + "epoch": 2.238257634475275, + "grad_norm": 0.33488836884498596, + "learning_rate": 1.8371282329171803e-06, + "loss": 0.2801, + "step": 4813 + }, + { + "epoch": 2.2387226786544723, + "grad_norm": 0.341766357421875, + "learning_rate": 1.8350329491083613e-06, + "loss": 0.3194, + "step": 4814 + }, + { + "epoch": 2.239187722833669, + "grad_norm": 0.3747437596321106, + "learning_rate": 1.832938592266984e-06, + "loss": 0.3123, + "step": 4815 + }, + { + "epoch": 2.239652767012866, + "grad_norm": 0.351366251707077, + "learning_rate": 1.8308451630064484e-06, + "loss": 0.309, + "step": 4816 + }, + { + "epoch": 2.2401178111920634, + "grad_norm": 0.40906503796577454, + "learning_rate": 1.8287526619398888e-06, + "loss": 0.3532, + "step": 4817 + }, + { + "epoch": 2.2405828553712603, + "grad_norm": 0.3506213426589966, + "learning_rate": 1.8266610896801624e-06, + "loss": 0.3236, + "step": 4818 + }, + { + "epoch": 2.2410478995504572, + "grad_norm": 0.3513505756855011, + "learning_rate": 1.824570446839859e-06, + "loss": 0.3113, + "step": 4819 + }, + { + "epoch": 2.241512943729654, + "grad_norm": 0.33928313851356506, + "learning_rate": 1.8224807340312912e-06, + "loss": 0.3216, + "step": 4820 + }, + { + "epoch": 2.2419779879088515, + "grad_norm": 0.3162320852279663, + "learning_rate": 1.8203919518665049e-06, + "loss": 0.3097, + "step": 4821 + }, + { + "epoch": 2.2424430320880484, + "grad_norm": 0.36044222116470337, + "learning_rate": 1.8183041009572678e-06, + "loss": 0.33, + "step": 4822 + }, + { + "epoch": 2.2429080762672453, + "grad_norm": 0.4068707525730133, + "learning_rate": 1.8162171819150798e-06, + "loss": 0.3338, + "step": 4823 + }, + { + "epoch": 2.243373120446442, + "grad_norm": 0.3441452383995056, + "learning_rate": 1.8141311953511637e-06, + "loss": 0.3387, + "step": 4824 + }, + { + "epoch": 2.2438381646256396, + "grad_norm": 0.33748528361320496, + "learning_rate": 1.8120461418764711e-06, + "loss": 0.3243, + "step": 4825 + }, + { + "epoch": 2.2443032088048365, + "grad_norm": 0.3908786475658417, + "learning_rate": 1.8099620221016818e-06, + "loss": 0.3282, + "step": 4826 + }, + { + "epoch": 2.2447682529840334, + "grad_norm": 0.3668133616447449, + "learning_rate": 1.8078788366372008e-06, + "loss": 0.2975, + "step": 4827 + }, + { + "epoch": 2.2452332971632307, + "grad_norm": 0.3833257257938385, + "learning_rate": 1.8057965860931593e-06, + "loss": 0.312, + "step": 4828 + }, + { + "epoch": 2.2456983413424276, + "grad_norm": 0.3340941369533539, + "learning_rate": 1.8037152710794115e-06, + "loss": 0.3094, + "step": 4829 + }, + { + "epoch": 2.2461633855216245, + "grad_norm": 0.3409848213195801, + "learning_rate": 1.8016348922055448e-06, + "loss": 0.2966, + "step": 4830 + }, + { + "epoch": 2.2466284297008214, + "grad_norm": 0.3662347197532654, + "learning_rate": 1.7995554500808655e-06, + "loss": 0.3617, + "step": 4831 + }, + { + "epoch": 2.247093473880019, + "grad_norm": 0.4032919406890869, + "learning_rate": 1.7974769453144102e-06, + "loss": 0.3042, + "step": 4832 + }, + { + "epoch": 2.2475585180592157, + "grad_norm": 0.3917955160140991, + "learning_rate": 1.7953993785149377e-06, + "loss": 0.3188, + "step": 4833 + }, + { + "epoch": 2.2480235622384126, + "grad_norm": 0.36748558282852173, + "learning_rate": 1.7933227502909361e-06, + "loss": 0.3355, + "step": 4834 + }, + { + "epoch": 2.2484886064176095, + "grad_norm": 0.36093080043792725, + "learning_rate": 1.7912470612506123e-06, + "loss": 0.3389, + "step": 4835 + }, + { + "epoch": 2.248953650596807, + "grad_norm": 0.34391847252845764, + "learning_rate": 1.7891723120019038e-06, + "loss": 0.3463, + "step": 4836 + }, + { + "epoch": 2.2494186947760038, + "grad_norm": 0.3272242844104767, + "learning_rate": 1.7870985031524718e-06, + "loss": 0.3093, + "step": 4837 + }, + { + "epoch": 2.2498837389552007, + "grad_norm": 0.3449787199497223, + "learning_rate": 1.7850256353097017e-06, + "loss": 0.3272, + "step": 4838 + }, + { + "epoch": 2.250348783134398, + "grad_norm": 0.36595356464385986, + "learning_rate": 1.7829537090807002e-06, + "loss": 0.325, + "step": 4839 + }, + { + "epoch": 2.250813827313595, + "grad_norm": 0.35330867767333984, + "learning_rate": 1.7808827250723043e-06, + "loss": 0.3626, + "step": 4840 + }, + { + "epoch": 2.251278871492792, + "grad_norm": 0.3324193060398102, + "learning_rate": 1.7788126838910674e-06, + "loss": 0.3043, + "step": 4841 + }, + { + "epoch": 2.2517439156719887, + "grad_norm": 0.36338382959365845, + "learning_rate": 1.7767435861432752e-06, + "loss": 0.3228, + "step": 4842 + }, + { + "epoch": 2.2522089598511856, + "grad_norm": 0.3679875135421753, + "learning_rate": 1.7746754324349291e-06, + "loss": 0.366, + "step": 4843 + }, + { + "epoch": 2.252674004030383, + "grad_norm": 0.3294106423854828, + "learning_rate": 1.7726082233717607e-06, + "loss": 0.3195, + "step": 4844 + }, + { + "epoch": 2.25313904820958, + "grad_norm": 0.3595585227012634, + "learning_rate": 1.7705419595592193e-06, + "loss": 0.3553, + "step": 4845 + }, + { + "epoch": 2.253604092388777, + "grad_norm": 0.34528473019599915, + "learning_rate": 1.7684766416024828e-06, + "loss": 0.302, + "step": 4846 + }, + { + "epoch": 2.254069136567974, + "grad_norm": 0.38927531242370605, + "learning_rate": 1.7664122701064462e-06, + "loss": 0.3594, + "step": 4847 + }, + { + "epoch": 2.254534180747171, + "grad_norm": 0.30416175723075867, + "learning_rate": 1.7643488456757324e-06, + "loss": 0.3064, + "step": 4848 + }, + { + "epoch": 2.254999224926368, + "grad_norm": 0.37094974517822266, + "learning_rate": 1.7622863689146841e-06, + "loss": 0.3446, + "step": 4849 + }, + { + "epoch": 2.255464269105565, + "grad_norm": 0.31221383810043335, + "learning_rate": 1.7602248404273692e-06, + "loss": 0.2903, + "step": 4850 + }, + { + "epoch": 2.255929313284762, + "grad_norm": 0.33237534761428833, + "learning_rate": 1.758164260817573e-06, + "loss": 0.3272, + "step": 4851 + }, + { + "epoch": 2.256394357463959, + "grad_norm": 0.3221777677536011, + "learning_rate": 1.7561046306888092e-06, + "loss": 0.3238, + "step": 4852 + }, + { + "epoch": 2.256859401643156, + "grad_norm": 0.3504292964935303, + "learning_rate": 1.7540459506443052e-06, + "loss": 0.3285, + "step": 4853 + }, + { + "epoch": 2.257324445822353, + "grad_norm": 0.3311966359615326, + "learning_rate": 1.7519882212870204e-06, + "loss": 0.3255, + "step": 4854 + }, + { + "epoch": 2.2577894900015503, + "grad_norm": 0.36982831358909607, + "learning_rate": 1.7499314432196257e-06, + "loss": 0.3495, + "step": 4855 + }, + { + "epoch": 2.258254534180747, + "grad_norm": 0.35521289706230164, + "learning_rate": 1.747875617044521e-06, + "loss": 0.3106, + "step": 4856 + }, + { + "epoch": 2.258719578359944, + "grad_norm": 0.3456581234931946, + "learning_rate": 1.7458207433638225e-06, + "loss": 0.3315, + "step": 4857 + }, + { + "epoch": 2.2591846225391414, + "grad_norm": 0.3502543270587921, + "learning_rate": 1.7437668227793714e-06, + "loss": 0.308, + "step": 4858 + }, + { + "epoch": 2.2596496667183383, + "grad_norm": 0.3269558846950531, + "learning_rate": 1.7417138558927244e-06, + "loss": 0.2988, + "step": 4859 + }, + { + "epoch": 2.2601147108975352, + "grad_norm": 0.3678964376449585, + "learning_rate": 1.7396618433051648e-06, + "loss": 0.336, + "step": 4860 + }, + { + "epoch": 2.260579755076732, + "grad_norm": 0.3400316834449768, + "learning_rate": 1.7376107856176928e-06, + "loss": 0.3004, + "step": 4861 + }, + { + "epoch": 2.2610447992559295, + "grad_norm": 0.35273489356040955, + "learning_rate": 1.7355606834310317e-06, + "loss": 0.3352, + "step": 4862 + }, + { + "epoch": 2.2615098434351264, + "grad_norm": 0.35708242654800415, + "learning_rate": 1.7335115373456202e-06, + "loss": 0.3263, + "step": 4863 + }, + { + "epoch": 2.2619748876143233, + "grad_norm": 0.31572067737579346, + "learning_rate": 1.7314633479616227e-06, + "loss": 0.2921, + "step": 4864 + }, + { + "epoch": 2.26243993179352, + "grad_norm": 0.3223276436328888, + "learning_rate": 1.7294161158789197e-06, + "loss": 0.3239, + "step": 4865 + }, + { + "epoch": 2.2629049759727176, + "grad_norm": 0.32633039355278015, + "learning_rate": 1.7273698416971095e-06, + "loss": 0.3272, + "step": 4866 + }, + { + "epoch": 2.2633700201519145, + "grad_norm": 0.3426065444946289, + "learning_rate": 1.725324526015517e-06, + "loss": 0.3702, + "step": 4867 + }, + { + "epoch": 2.2638350643311114, + "grad_norm": 0.33616894483566284, + "learning_rate": 1.723280169433178e-06, + "loss": 0.3157, + "step": 4868 + }, + { + "epoch": 2.2643001085103087, + "grad_norm": 0.3157622516155243, + "learning_rate": 1.7212367725488544e-06, + "loss": 0.282, + "step": 4869 + }, + { + "epoch": 2.2647651526895056, + "grad_norm": 0.3689837157726288, + "learning_rate": 1.7191943359610214e-06, + "loss": 0.3646, + "step": 4870 + }, + { + "epoch": 2.2652301968687025, + "grad_norm": 0.337637722492218, + "learning_rate": 1.7171528602678767e-06, + "loss": 0.3203, + "step": 4871 + }, + { + "epoch": 2.2656952410478994, + "grad_norm": 0.34560054540634155, + "learning_rate": 1.7151123460673353e-06, + "loss": 0.3025, + "step": 4872 + }, + { + "epoch": 2.2661602852270963, + "grad_norm": 0.3452926576137543, + "learning_rate": 1.7130727939570325e-06, + "loss": 0.3539, + "step": 4873 + }, + { + "epoch": 2.2666253294062937, + "grad_norm": 0.3485986888408661, + "learning_rate": 1.7110342045343164e-06, + "loss": 0.3004, + "step": 4874 + }, + { + "epoch": 2.2670903735854906, + "grad_norm": 0.3492816984653473, + "learning_rate": 1.7089965783962608e-06, + "loss": 0.2975, + "step": 4875 + }, + { + "epoch": 2.2675554177646875, + "grad_norm": 0.3906685709953308, + "learning_rate": 1.7069599161396488e-06, + "loss": 0.3477, + "step": 4876 + }, + { + "epoch": 2.268020461943885, + "grad_norm": 0.35646331310272217, + "learning_rate": 1.70492421836099e-06, + "loss": 0.3069, + "step": 4877 + }, + { + "epoch": 2.2684855061230818, + "grad_norm": 0.35431551933288574, + "learning_rate": 1.7028894856565036e-06, + "loss": 0.305, + "step": 4878 + }, + { + "epoch": 2.2689505503022787, + "grad_norm": 0.3560280501842499, + "learning_rate": 1.700855718622133e-06, + "loss": 0.3439, + "step": 4879 + }, + { + "epoch": 2.2694155944814756, + "grad_norm": 0.3366261124610901, + "learning_rate": 1.698822917853532e-06, + "loss": 0.3093, + "step": 4880 + }, + { + "epoch": 2.269880638660673, + "grad_norm": 0.3454442322254181, + "learning_rate": 1.6967910839460788e-06, + "loss": 0.3045, + "step": 4881 + }, + { + "epoch": 2.27034568283987, + "grad_norm": 0.360445499420166, + "learning_rate": 1.6947602174948609e-06, + "loss": 0.3238, + "step": 4882 + }, + { + "epoch": 2.2708107270190667, + "grad_norm": 0.342626690864563, + "learning_rate": 1.6927303190946876e-06, + "loss": 0.3257, + "step": 4883 + }, + { + "epoch": 2.2712757711982636, + "grad_norm": 0.3575427532196045, + "learning_rate": 1.6907013893400838e-06, + "loss": 0.334, + "step": 4884 + }, + { + "epoch": 2.271740815377461, + "grad_norm": 0.3537481427192688, + "learning_rate": 1.6886734288252904e-06, + "loss": 0.3557, + "step": 4885 + }, + { + "epoch": 2.272205859556658, + "grad_norm": 0.32836195826530457, + "learning_rate": 1.6866464381442622e-06, + "loss": 0.3094, + "step": 4886 + }, + { + "epoch": 2.272670903735855, + "grad_norm": 0.3523426651954651, + "learning_rate": 1.6846204178906744e-06, + "loss": 0.3144, + "step": 4887 + }, + { + "epoch": 2.273135947915052, + "grad_norm": 0.35294294357299805, + "learning_rate": 1.6825953686579126e-06, + "loss": 0.3225, + "step": 4888 + }, + { + "epoch": 2.273600992094249, + "grad_norm": 0.33541440963745117, + "learning_rate": 1.6805712910390836e-06, + "loss": 0.3106, + "step": 4889 + }, + { + "epoch": 2.274066036273446, + "grad_norm": 0.3566812574863434, + "learning_rate": 1.6785481856270042e-06, + "loss": 0.3216, + "step": 4890 + }, + { + "epoch": 2.274531080452643, + "grad_norm": 0.3586536645889282, + "learning_rate": 1.6765260530142114e-06, + "loss": 0.3015, + "step": 4891 + }, + { + "epoch": 2.27499612463184, + "grad_norm": 0.31585457921028137, + "learning_rate": 1.6745048937929525e-06, + "loss": 0.295, + "step": 4892 + }, + { + "epoch": 2.275461168811037, + "grad_norm": 0.36500096321105957, + "learning_rate": 1.6724847085551955e-06, + "loss": 0.3317, + "step": 4893 + }, + { + "epoch": 2.275926212990234, + "grad_norm": 0.3432101309299469, + "learning_rate": 1.6704654978926167e-06, + "loss": 0.3005, + "step": 4894 + }, + { + "epoch": 2.276391257169431, + "grad_norm": 0.34062010049819946, + "learning_rate": 1.6684472623966108e-06, + "loss": 0.2974, + "step": 4895 + }, + { + "epoch": 2.2768563013486283, + "grad_norm": 0.33995094895362854, + "learning_rate": 1.666430002658287e-06, + "loss": 0.3219, + "step": 4896 + }, + { + "epoch": 2.277321345527825, + "grad_norm": 0.32602110505104065, + "learning_rate": 1.6644137192684694e-06, + "loss": 0.3365, + "step": 4897 + }, + { + "epoch": 2.277786389707022, + "grad_norm": 0.3418422341346741, + "learning_rate": 1.6623984128176912e-06, + "loss": 0.3219, + "step": 4898 + }, + { + "epoch": 2.2782514338862194, + "grad_norm": 0.3250005841255188, + "learning_rate": 1.6603840838962066e-06, + "loss": 0.3283, + "step": 4899 + }, + { + "epoch": 2.2787164780654163, + "grad_norm": 0.366692453622818, + "learning_rate": 1.6583707330939774e-06, + "loss": 0.3365, + "step": 4900 + }, + { + "epoch": 2.2791815222446132, + "grad_norm": 0.29823222756385803, + "learning_rate": 1.6563583610006806e-06, + "loss": 0.3037, + "step": 4901 + }, + { + "epoch": 2.27964656642381, + "grad_norm": 0.36057764291763306, + "learning_rate": 1.6543469682057105e-06, + "loss": 0.3333, + "step": 4902 + }, + { + "epoch": 2.280111610603007, + "grad_norm": 0.34210824966430664, + "learning_rate": 1.6523365552981674e-06, + "loss": 0.3378, + "step": 4903 + }, + { + "epoch": 2.2805766547822044, + "grad_norm": 0.3876771926879883, + "learning_rate": 1.6503271228668726e-06, + "loss": 0.3132, + "step": 4904 + }, + { + "epoch": 2.2810416989614013, + "grad_norm": 0.36838680505752563, + "learning_rate": 1.6483186715003523e-06, + "loss": 0.3283, + "step": 4905 + }, + { + "epoch": 2.281506743140598, + "grad_norm": 0.32702258229255676, + "learning_rate": 1.6463112017868516e-06, + "loss": 0.3193, + "step": 4906 + }, + { + "epoch": 2.2819717873197956, + "grad_norm": 0.33795708417892456, + "learning_rate": 1.6443047143143248e-06, + "loss": 0.3234, + "step": 4907 + }, + { + "epoch": 2.2824368314989925, + "grad_norm": 0.3263553977012634, + "learning_rate": 1.6422992096704422e-06, + "loss": 0.2925, + "step": 4908 + }, + { + "epoch": 2.2829018756781894, + "grad_norm": 0.38260510563850403, + "learning_rate": 1.6402946884425796e-06, + "loss": 0.3497, + "step": 4909 + }, + { + "epoch": 2.2833669198573863, + "grad_norm": 0.38565513491630554, + "learning_rate": 1.6382911512178323e-06, + "loss": 0.32, + "step": 4910 + }, + { + "epoch": 2.2838319640365836, + "grad_norm": 0.3692467212677002, + "learning_rate": 1.6362885985830001e-06, + "loss": 0.3332, + "step": 4911 + }, + { + "epoch": 2.2842970082157805, + "grad_norm": 0.3257414996623993, + "learning_rate": 1.6342870311246024e-06, + "loss": 0.3331, + "step": 4912 + }, + { + "epoch": 2.2847620523949774, + "grad_norm": 0.3339729607105255, + "learning_rate": 1.6322864494288616e-06, + "loss": 0.3091, + "step": 4913 + }, + { + "epoch": 2.2852270965741743, + "grad_norm": 0.32545122504234314, + "learning_rate": 1.6302868540817184e-06, + "loss": 0.3183, + "step": 4914 + }, + { + "epoch": 2.2856921407533717, + "grad_norm": 0.33002758026123047, + "learning_rate": 1.6282882456688197e-06, + "loss": 0.3126, + "step": 4915 + }, + { + "epoch": 2.2861571849325686, + "grad_norm": 0.3634626865386963, + "learning_rate": 1.6262906247755284e-06, + "loss": 0.3097, + "step": 4916 + }, + { + "epoch": 2.2866222291117655, + "grad_norm": 0.33301109075546265, + "learning_rate": 1.6242939919869117e-06, + "loss": 0.3269, + "step": 4917 + }, + { + "epoch": 2.287087273290963, + "grad_norm": 0.3408260941505432, + "learning_rate": 1.6222983478877525e-06, + "loss": 0.3362, + "step": 4918 + }, + { + "epoch": 2.2875523174701597, + "grad_norm": 0.35351526737213135, + "learning_rate": 1.6203036930625427e-06, + "loss": 0.3216, + "step": 4919 + }, + { + "epoch": 2.2880173616493567, + "grad_norm": 0.3414493799209595, + "learning_rate": 1.618310028095486e-06, + "loss": 0.356, + "step": 4920 + }, + { + "epoch": 2.2884824058285536, + "grad_norm": 0.3114349842071533, + "learning_rate": 1.6163173535704913e-06, + "loss": 0.3032, + "step": 4921 + }, + { + "epoch": 2.288947450007751, + "grad_norm": 0.3438819944858551, + "learning_rate": 1.6143256700711835e-06, + "loss": 0.3043, + "step": 4922 + }, + { + "epoch": 2.289412494186948, + "grad_norm": 0.35198479890823364, + "learning_rate": 1.6123349781808911e-06, + "loss": 0.3026, + "step": 4923 + }, + { + "epoch": 2.2898775383661447, + "grad_norm": 0.339100182056427, + "learning_rate": 1.610345278482659e-06, + "loss": 0.3188, + "step": 4924 + }, + { + "epoch": 2.2903425825453416, + "grad_norm": 0.3438659608364105, + "learning_rate": 1.6083565715592343e-06, + "loss": 0.311, + "step": 4925 + }, + { + "epoch": 2.290807626724539, + "grad_norm": 0.3378322720527649, + "learning_rate": 1.606368857993081e-06, + "loss": 0.3261, + "step": 4926 + }, + { + "epoch": 2.291272670903736, + "grad_norm": 0.3330194652080536, + "learning_rate": 1.6043821383663638e-06, + "loss": 0.3251, + "step": 4927 + }, + { + "epoch": 2.291737715082933, + "grad_norm": 0.3478861451148987, + "learning_rate": 1.6023964132609642e-06, + "loss": 0.3453, + "step": 4928 + }, + { + "epoch": 2.29220275926213, + "grad_norm": 0.33348485827445984, + "learning_rate": 1.600411683258466e-06, + "loss": 0.3251, + "step": 4929 + }, + { + "epoch": 2.292667803441327, + "grad_norm": 0.33961689472198486, + "learning_rate": 1.5984279489401655e-06, + "loss": 0.3144, + "step": 4930 + }, + { + "epoch": 2.293132847620524, + "grad_norm": 0.33800020813941956, + "learning_rate": 1.596445210887067e-06, + "loss": 0.3234, + "step": 4931 + }, + { + "epoch": 2.293597891799721, + "grad_norm": 0.3581719398498535, + "learning_rate": 1.5944634696798827e-06, + "loss": 0.3088, + "step": 4932 + }, + { + "epoch": 2.2940629359789177, + "grad_norm": 0.32927682995796204, + "learning_rate": 1.5924827258990305e-06, + "loss": 0.2917, + "step": 4933 + }, + { + "epoch": 2.294527980158115, + "grad_norm": 0.3471710681915283, + "learning_rate": 1.5905029801246401e-06, + "loss": 0.3399, + "step": 4934 + }, + { + "epoch": 2.294993024337312, + "grad_norm": 0.36960649490356445, + "learning_rate": 1.5885242329365448e-06, + "loss": 0.319, + "step": 4935 + }, + { + "epoch": 2.295458068516509, + "grad_norm": 0.3120764493942261, + "learning_rate": 1.5865464849142897e-06, + "loss": 0.309, + "step": 4936 + }, + { + "epoch": 2.2959231126957063, + "grad_norm": 0.32464638352394104, + "learning_rate": 1.5845697366371237e-06, + "loss": 0.3333, + "step": 4937 + }, + { + "epoch": 2.296388156874903, + "grad_norm": 0.35078492760658264, + "learning_rate": 1.5825939886840036e-06, + "loss": 0.3327, + "step": 4938 + }, + { + "epoch": 2.2968532010541, + "grad_norm": 0.3693057596683502, + "learning_rate": 1.5806192416335959e-06, + "loss": 0.2898, + "step": 4939 + }, + { + "epoch": 2.297318245233297, + "grad_norm": 0.39447295665740967, + "learning_rate": 1.5786454960642694e-06, + "loss": 0.3496, + "step": 4940 + }, + { + "epoch": 2.2977832894124943, + "grad_norm": 0.30697473883628845, + "learning_rate": 1.576672752554103e-06, + "loss": 0.2768, + "step": 4941 + }, + { + "epoch": 2.2982483335916912, + "grad_norm": 0.33453670144081116, + "learning_rate": 1.574701011680882e-06, + "loss": 0.3336, + "step": 4942 + }, + { + "epoch": 2.298713377770888, + "grad_norm": 0.3246103525161743, + "learning_rate": 1.572730274022099e-06, + "loss": 0.318, + "step": 4943 + }, + { + "epoch": 2.299178421950085, + "grad_norm": 0.38411572575569153, + "learning_rate": 1.570760540154947e-06, + "loss": 0.3616, + "step": 4944 + }, + { + "epoch": 2.2996434661292824, + "grad_norm": 0.35833507776260376, + "learning_rate": 1.5687918106563326e-06, + "loss": 0.323, + "step": 4945 + }, + { + "epoch": 2.3001085103084793, + "grad_norm": 0.3165144920349121, + "learning_rate": 1.566824086102862e-06, + "loss": 0.2817, + "step": 4946 + }, + { + "epoch": 2.300573554487676, + "grad_norm": 0.3616514801979065, + "learning_rate": 1.5648573670708527e-06, + "loss": 0.3341, + "step": 4947 + }, + { + "epoch": 2.3010385986668735, + "grad_norm": 0.35235756635665894, + "learning_rate": 1.562891654136321e-06, + "loss": 0.3283, + "step": 4948 + }, + { + "epoch": 2.3015036428460705, + "grad_norm": 0.3527206480503082, + "learning_rate": 1.560926947874996e-06, + "loss": 0.3329, + "step": 4949 + }, + { + "epoch": 2.3019686870252674, + "grad_norm": 0.35835447907447815, + "learning_rate": 1.5589632488623053e-06, + "loss": 0.3169, + "step": 4950 + }, + { + "epoch": 2.3024337312044643, + "grad_norm": 0.33192187547683716, + "learning_rate": 1.557000557673387e-06, + "loss": 0.2644, + "step": 4951 + }, + { + "epoch": 2.3028987753836616, + "grad_norm": 0.3572143614292145, + "learning_rate": 1.5550388748830786e-06, + "loss": 0.3176, + "step": 4952 + }, + { + "epoch": 2.3033638195628585, + "grad_norm": 0.34204208850860596, + "learning_rate": 1.5530782010659267e-06, + "loss": 0.3265, + "step": 4953 + }, + { + "epoch": 2.3038288637420554, + "grad_norm": 0.3122238516807556, + "learning_rate": 1.5511185367961813e-06, + "loss": 0.2969, + "step": 4954 + }, + { + "epoch": 2.3042939079212523, + "grad_norm": 0.34026968479156494, + "learning_rate": 1.5491598826477967e-06, + "loss": 0.3238, + "step": 4955 + }, + { + "epoch": 2.3047589521004497, + "grad_norm": 0.31696975231170654, + "learning_rate": 1.5472022391944285e-06, + "loss": 0.2969, + "step": 4956 + }, + { + "epoch": 2.3052239962796466, + "grad_norm": 0.38175731897354126, + "learning_rate": 1.5452456070094419e-06, + "loss": 0.3834, + "step": 4957 + }, + { + "epoch": 2.3056890404588435, + "grad_norm": 0.31309717893600464, + "learning_rate": 1.543289986665899e-06, + "loss": 0.2931, + "step": 4958 + }, + { + "epoch": 2.306154084638041, + "grad_norm": 0.3553571403026581, + "learning_rate": 1.5413353787365726e-06, + "loss": 0.3586, + "step": 4959 + }, + { + "epoch": 2.3066191288172377, + "grad_norm": 0.3609411418437958, + "learning_rate": 1.5393817837939328e-06, + "loss": 0.2954, + "step": 4960 + }, + { + "epoch": 2.3070841729964346, + "grad_norm": 0.3583955466747284, + "learning_rate": 1.537429202410159e-06, + "loss": 0.3251, + "step": 4961 + }, + { + "epoch": 2.3075492171756316, + "grad_norm": 0.3274742662906647, + "learning_rate": 1.5354776351571266e-06, + "loss": 0.2977, + "step": 4962 + }, + { + "epoch": 2.308014261354829, + "grad_norm": 0.3816562294960022, + "learning_rate": 1.533527082606422e-06, + "loss": 0.3322, + "step": 4963 + }, + { + "epoch": 2.308479305534026, + "grad_norm": 0.33534786105155945, + "learning_rate": 1.5315775453293269e-06, + "loss": 0.3119, + "step": 4964 + }, + { + "epoch": 2.3089443497132227, + "grad_norm": 0.33446696400642395, + "learning_rate": 1.5296290238968303e-06, + "loss": 0.3481, + "step": 4965 + }, + { + "epoch": 2.3094093938924196, + "grad_norm": 0.32276517152786255, + "learning_rate": 1.5276815188796235e-06, + "loss": 0.3235, + "step": 4966 + }, + { + "epoch": 2.309874438071617, + "grad_norm": 0.3487582504749298, + "learning_rate": 1.5257350308480994e-06, + "loss": 0.3368, + "step": 4967 + }, + { + "epoch": 2.310339482250814, + "grad_norm": 0.32539722323417664, + "learning_rate": 1.5237895603723501e-06, + "loss": 0.3009, + "step": 4968 + }, + { + "epoch": 2.3108045264300108, + "grad_norm": 0.3673970699310303, + "learning_rate": 1.5218451080221763e-06, + "loss": 0.3305, + "step": 4969 + }, + { + "epoch": 2.311269570609208, + "grad_norm": 0.31702741980552673, + "learning_rate": 1.5199016743670719e-06, + "loss": 0.2929, + "step": 4970 + }, + { + "epoch": 2.311734614788405, + "grad_norm": 0.3536950647830963, + "learning_rate": 1.517959259976241e-06, + "loss": 0.3543, + "step": 4971 + }, + { + "epoch": 2.312199658967602, + "grad_norm": 0.3390069603919983, + "learning_rate": 1.5160178654185836e-06, + "loss": 0.3179, + "step": 4972 + }, + { + "epoch": 2.312664703146799, + "grad_norm": 0.3432963192462921, + "learning_rate": 1.5140774912627005e-06, + "loss": 0.3357, + "step": 4973 + }, + { + "epoch": 2.3131297473259957, + "grad_norm": 0.35164210200309753, + "learning_rate": 1.5121381380769002e-06, + "loss": 0.3311, + "step": 4974 + }, + { + "epoch": 2.313594791505193, + "grad_norm": 0.33762413263320923, + "learning_rate": 1.5101998064291828e-06, + "loss": 0.3111, + "step": 4975 + }, + { + "epoch": 2.31405983568439, + "grad_norm": 0.3147108852863312, + "learning_rate": 1.5082624968872578e-06, + "loss": 0.2938, + "step": 4976 + }, + { + "epoch": 2.314524879863587, + "grad_norm": 0.3715774714946747, + "learning_rate": 1.50632621001853e-06, + "loss": 0.3709, + "step": 4977 + }, + { + "epoch": 2.3149899240427843, + "grad_norm": 0.32812413573265076, + "learning_rate": 1.5043909463901086e-06, + "loss": 0.3058, + "step": 4978 + }, + { + "epoch": 2.315454968221981, + "grad_norm": 0.3472438156604767, + "learning_rate": 1.5024567065687977e-06, + "loss": 0.3163, + "step": 4979 + }, + { + "epoch": 2.315920012401178, + "grad_norm": 0.3634364604949951, + "learning_rate": 1.500523491121108e-06, + "loss": 0.3306, + "step": 4980 + }, + { + "epoch": 2.316385056580375, + "grad_norm": 0.3792670667171478, + "learning_rate": 1.4985913006132435e-06, + "loss": 0.336, + "step": 4981 + }, + { + "epoch": 2.3168501007595723, + "grad_norm": 0.32765889167785645, + "learning_rate": 1.496660135611115e-06, + "loss": 0.3053, + "step": 4982 + }, + { + "epoch": 2.3173151449387692, + "grad_norm": 0.35471829771995544, + "learning_rate": 1.4947299966803259e-06, + "loss": 0.3403, + "step": 4983 + }, + { + "epoch": 2.317780189117966, + "grad_norm": 0.3708203136920929, + "learning_rate": 1.4928008843861851e-06, + "loss": 0.2946, + "step": 4984 + }, + { + "epoch": 2.318245233297163, + "grad_norm": 0.3640722930431366, + "learning_rate": 1.490872799293696e-06, + "loss": 0.3321, + "step": 4985 + }, + { + "epoch": 2.3187102774763604, + "grad_norm": 0.33545586466789246, + "learning_rate": 1.4889457419675669e-06, + "loss": 0.2907, + "step": 4986 + }, + { + "epoch": 2.3191753216555573, + "grad_norm": 0.3201104402542114, + "learning_rate": 1.487019712972197e-06, + "loss": 0.3066, + "step": 4987 + }, + { + "epoch": 2.319640365834754, + "grad_norm": 0.32187438011169434, + "learning_rate": 1.4850947128716914e-06, + "loss": 0.3197, + "step": 4988 + }, + { + "epoch": 2.3201054100139515, + "grad_norm": 0.3507417142391205, + "learning_rate": 1.4831707422298513e-06, + "loss": 0.323, + "step": 4989 + }, + { + "epoch": 2.3205704541931484, + "grad_norm": 0.3247746229171753, + "learning_rate": 1.4812478016101784e-06, + "loss": 0.3163, + "step": 4990 + }, + { + "epoch": 2.3210354983723454, + "grad_norm": 0.34104475378990173, + "learning_rate": 1.4793258915758668e-06, + "loss": 0.3392, + "step": 4991 + }, + { + "epoch": 2.3215005425515423, + "grad_norm": 0.32340723276138306, + "learning_rate": 1.4774050126898164e-06, + "loss": 0.3241, + "step": 4992 + }, + { + "epoch": 2.3219655867307396, + "grad_norm": 0.348816454410553, + "learning_rate": 1.475485165514618e-06, + "loss": 0.3026, + "step": 4993 + }, + { + "epoch": 2.3224306309099365, + "grad_norm": 0.372328519821167, + "learning_rate": 1.473566350612567e-06, + "loss": 0.3019, + "step": 4994 + }, + { + "epoch": 2.3228956750891334, + "grad_norm": 0.3437679409980774, + "learning_rate": 1.47164856854565e-06, + "loss": 0.3165, + "step": 4995 + }, + { + "epoch": 2.3233607192683303, + "grad_norm": 0.36129605770111084, + "learning_rate": 1.4697318198755572e-06, + "loss": 0.3472, + "step": 4996 + }, + { + "epoch": 2.3238257634475277, + "grad_norm": 0.32315579056739807, + "learning_rate": 1.4678161051636703e-06, + "loss": 0.3008, + "step": 4997 + }, + { + "epoch": 2.3242908076267246, + "grad_norm": 0.35113513469696045, + "learning_rate": 1.4659014249710734e-06, + "loss": 0.3222, + "step": 4998 + }, + { + "epoch": 2.3247558518059215, + "grad_norm": 0.3178529143333435, + "learning_rate": 1.4639877798585434e-06, + "loss": 0.3028, + "step": 4999 + }, + { + "epoch": 2.325220895985119, + "grad_norm": 0.31862160563468933, + "learning_rate": 1.462075170386556e-06, + "loss": 0.306, + "step": 5000 + }, + { + "epoch": 2.3256859401643157, + "grad_norm": 0.3178994953632355, + "learning_rate": 1.4601635971152844e-06, + "loss": 0.3442, + "step": 5001 + }, + { + "epoch": 2.3261509843435126, + "grad_norm": 0.33765512704849243, + "learning_rate": 1.4582530606045986e-06, + "loss": 0.3313, + "step": 5002 + }, + { + "epoch": 2.3266160285227095, + "grad_norm": 0.3193725347518921, + "learning_rate": 1.456343561414061e-06, + "loss": 0.2883, + "step": 5003 + }, + { + "epoch": 2.3270810727019065, + "grad_norm": 0.3627553880214691, + "learning_rate": 1.4544351001029349e-06, + "loss": 0.3271, + "step": 5004 + }, + { + "epoch": 2.327546116881104, + "grad_norm": 0.3708834648132324, + "learning_rate": 1.4525276772301761e-06, + "loss": 0.3387, + "step": 5005 + }, + { + "epoch": 2.3280111610603007, + "grad_norm": 0.33413180708885193, + "learning_rate": 1.4506212933544394e-06, + "loss": 0.3144, + "step": 5006 + }, + { + "epoch": 2.3284762052394976, + "grad_norm": 0.3867631256580353, + "learning_rate": 1.4487159490340714e-06, + "loss": 0.3447, + "step": 5007 + }, + { + "epoch": 2.328941249418695, + "grad_norm": 0.3161846697330475, + "learning_rate": 1.4468116448271196e-06, + "loss": 0.3167, + "step": 5008 + }, + { + "epoch": 2.329406293597892, + "grad_norm": 0.3061203956604004, + "learning_rate": 1.4449083812913217e-06, + "loss": 0.3078, + "step": 5009 + }, + { + "epoch": 2.3298713377770888, + "grad_norm": 0.33758896589279175, + "learning_rate": 1.4430061589841122e-06, + "loss": 0.3081, + "step": 5010 + }, + { + "epoch": 2.3303363819562857, + "grad_norm": 0.37586334347724915, + "learning_rate": 1.4411049784626213e-06, + "loss": 0.324, + "step": 5011 + }, + { + "epoch": 2.330801426135483, + "grad_norm": 0.36526045203208923, + "learning_rate": 1.4392048402836744e-06, + "loss": 0.3264, + "step": 5012 + }, + { + "epoch": 2.33126647031468, + "grad_norm": 0.3223170042037964, + "learning_rate": 1.437305745003793e-06, + "loss": 0.2935, + "step": 5013 + }, + { + "epoch": 2.331731514493877, + "grad_norm": 0.3323318660259247, + "learning_rate": 1.4354076931791876e-06, + "loss": 0.3261, + "step": 5014 + }, + { + "epoch": 2.3321965586730737, + "grad_norm": 0.3371385931968689, + "learning_rate": 1.433510685365771e-06, + "loss": 0.3157, + "step": 5015 + }, + { + "epoch": 2.332661602852271, + "grad_norm": 0.31770241260528564, + "learning_rate": 1.4316147221191411e-06, + "loss": 0.3195, + "step": 5016 + }, + { + "epoch": 2.333126647031468, + "grad_norm": 0.35454097390174866, + "learning_rate": 1.4297198039945998e-06, + "loss": 0.3435, + "step": 5017 + }, + { + "epoch": 2.333591691210665, + "grad_norm": 0.34481361508369446, + "learning_rate": 1.4278259315471332e-06, + "loss": 0.312, + "step": 5018 + }, + { + "epoch": 2.3340567353898622, + "grad_norm": 0.37690550088882446, + "learning_rate": 1.425933105331429e-06, + "loss": 0.3404, + "step": 5019 + }, + { + "epoch": 2.334521779569059, + "grad_norm": 0.32657355070114136, + "learning_rate": 1.4240413259018632e-06, + "loss": 0.3181, + "step": 5020 + }, + { + "epoch": 2.334986823748256, + "grad_norm": 0.33882707357406616, + "learning_rate": 1.4221505938125097e-06, + "loss": 0.3137, + "step": 5021 + }, + { + "epoch": 2.335451867927453, + "grad_norm": 0.35311123728752136, + "learning_rate": 1.42026090961713e-06, + "loss": 0.3127, + "step": 5022 + }, + { + "epoch": 2.3359169121066503, + "grad_norm": 0.3354308307170868, + "learning_rate": 1.4183722738691834e-06, + "loss": 0.3445, + "step": 5023 + }, + { + "epoch": 2.336381956285847, + "grad_norm": 0.3541325330734253, + "learning_rate": 1.4164846871218213e-06, + "loss": 0.3136, + "step": 5024 + }, + { + "epoch": 2.336847000465044, + "grad_norm": 0.32369163632392883, + "learning_rate": 1.4145981499278877e-06, + "loss": 0.3183, + "step": 5025 + }, + { + "epoch": 2.337312044644241, + "grad_norm": 0.33266210556030273, + "learning_rate": 1.4127126628399168e-06, + "loss": 0.3208, + "step": 5026 + }, + { + "epoch": 2.3377770888234384, + "grad_norm": 0.3531791567802429, + "learning_rate": 1.410828226410139e-06, + "loss": 0.3399, + "step": 5027 + }, + { + "epoch": 2.3382421330026353, + "grad_norm": 0.3102116882801056, + "learning_rate": 1.4089448411904733e-06, + "loss": 0.3203, + "step": 5028 + }, + { + "epoch": 2.338707177181832, + "grad_norm": 0.39151903986930847, + "learning_rate": 1.4070625077325345e-06, + "loss": 0.3452, + "step": 5029 + }, + { + "epoch": 2.3391722213610295, + "grad_norm": 0.3667720854282379, + "learning_rate": 1.4051812265876257e-06, + "loss": 0.2859, + "step": 5030 + }, + { + "epoch": 2.3396372655402264, + "grad_norm": 0.30760088562965393, + "learning_rate": 1.4033009983067454e-06, + "loss": 0.2963, + "step": 5031 + }, + { + "epoch": 2.3401023097194233, + "grad_norm": 0.3295292854309082, + "learning_rate": 1.4014218234405796e-06, + "loss": 0.3199, + "step": 5032 + }, + { + "epoch": 2.3405673538986203, + "grad_norm": 0.321175754070282, + "learning_rate": 1.3995437025395109e-06, + "loss": 0.3068, + "step": 5033 + }, + { + "epoch": 2.341032398077817, + "grad_norm": 0.35323259234428406, + "learning_rate": 1.3976666361536074e-06, + "loss": 0.2936, + "step": 5034 + }, + { + "epoch": 2.3414974422570145, + "grad_norm": 0.3676642179489136, + "learning_rate": 1.395790624832633e-06, + "loss": 0.3327, + "step": 5035 + }, + { + "epoch": 2.3419624864362114, + "grad_norm": 0.31863468885421753, + "learning_rate": 1.3939156691260407e-06, + "loss": 0.2977, + "step": 5036 + }, + { + "epoch": 2.3424275306154083, + "grad_norm": 0.3256819248199463, + "learning_rate": 1.392041769582977e-06, + "loss": 0.3326, + "step": 5037 + }, + { + "epoch": 2.3428925747946057, + "grad_norm": 0.3244876265525818, + "learning_rate": 1.3901689267522718e-06, + "loss": 0.3405, + "step": 5038 + }, + { + "epoch": 2.3433576189738026, + "grad_norm": 0.31455737352371216, + "learning_rate": 1.3882971411824547e-06, + "loss": 0.3042, + "step": 5039 + }, + { + "epoch": 2.3438226631529995, + "grad_norm": 0.32401779294013977, + "learning_rate": 1.386426413421738e-06, + "loss": 0.3078, + "step": 5040 + }, + { + "epoch": 2.3442877073321964, + "grad_norm": 0.33598318696022034, + "learning_rate": 1.3845567440180308e-06, + "loss": 0.297, + "step": 5041 + }, + { + "epoch": 2.3447527515113937, + "grad_norm": 0.35334211587905884, + "learning_rate": 1.3826881335189258e-06, + "loss": 0.3692, + "step": 5042 + }, + { + "epoch": 2.3452177956905906, + "grad_norm": 0.35467445850372314, + "learning_rate": 1.3808205824717108e-06, + "loss": 0.3316, + "step": 5043 + }, + { + "epoch": 2.3456828398697875, + "grad_norm": 0.34783780574798584, + "learning_rate": 1.3789540914233607e-06, + "loss": 0.316, + "step": 5044 + }, + { + "epoch": 2.3461478840489844, + "grad_norm": 0.36762094497680664, + "learning_rate": 1.3770886609205381e-06, + "loss": 0.3563, + "step": 5045 + }, + { + "epoch": 2.346612928228182, + "grad_norm": 0.35472989082336426, + "learning_rate": 1.3752242915095993e-06, + "loss": 0.337, + "step": 5046 + }, + { + "epoch": 2.3470779724073787, + "grad_norm": 0.33739933371543884, + "learning_rate": 1.373360983736588e-06, + "loss": 0.2881, + "step": 5047 + }, + { + "epoch": 2.3475430165865756, + "grad_norm": 0.374533474445343, + "learning_rate": 1.3714987381472378e-06, + "loss": 0.3296, + "step": 5048 + }, + { + "epoch": 2.348008060765773, + "grad_norm": 0.32988038659095764, + "learning_rate": 1.3696375552869673e-06, + "loss": 0.316, + "step": 5049 + }, + { + "epoch": 2.34847310494497, + "grad_norm": 0.33644863963127136, + "learning_rate": 1.36777743570089e-06, + "loss": 0.3117, + "step": 5050 + }, + { + "epoch": 2.3489381491241668, + "grad_norm": 0.3263695538043976, + "learning_rate": 1.365918379933801e-06, + "loss": 0.299, + "step": 5051 + }, + { + "epoch": 2.3494031933033637, + "grad_norm": 0.3548851013183594, + "learning_rate": 1.3640603885301917e-06, + "loss": 0.3732, + "step": 5052 + }, + { + "epoch": 2.349868237482561, + "grad_norm": 0.32212895154953003, + "learning_rate": 1.362203462034234e-06, + "loss": 0.3201, + "step": 5053 + }, + { + "epoch": 2.350333281661758, + "grad_norm": 0.3988026976585388, + "learning_rate": 1.3603476009897942e-06, + "loss": 0.3429, + "step": 5054 + }, + { + "epoch": 2.350798325840955, + "grad_norm": 0.3584456145763397, + "learning_rate": 1.3584928059404207e-06, + "loss": 0.3324, + "step": 5055 + }, + { + "epoch": 2.3512633700201517, + "grad_norm": 0.315178781747818, + "learning_rate": 1.356639077429357e-06, + "loss": 0.3269, + "step": 5056 + }, + { + "epoch": 2.351728414199349, + "grad_norm": 0.3512877821922302, + "learning_rate": 1.354786415999526e-06, + "loss": 0.3097, + "step": 5057 + }, + { + "epoch": 2.352193458378546, + "grad_norm": 0.35129326581954956, + "learning_rate": 1.352934822193544e-06, + "loss": 0.3193, + "step": 5058 + }, + { + "epoch": 2.352658502557743, + "grad_norm": 0.3396090269088745, + "learning_rate": 1.351084296553713e-06, + "loss": 0.2999, + "step": 5059 + }, + { + "epoch": 2.3531235467369402, + "grad_norm": 0.3327389359474182, + "learning_rate": 1.3492348396220229e-06, + "loss": 0.3217, + "step": 5060 + }, + { + "epoch": 2.353588590916137, + "grad_norm": 0.31320449709892273, + "learning_rate": 1.3473864519401463e-06, + "loss": 0.2914, + "step": 5061 + }, + { + "epoch": 2.354053635095334, + "grad_norm": 0.343872994184494, + "learning_rate": 1.34553913404945e-06, + "loss": 0.3318, + "step": 5062 + }, + { + "epoch": 2.354518679274531, + "grad_norm": 0.31648504734039307, + "learning_rate": 1.3436928864909799e-06, + "loss": 0.2752, + "step": 5063 + }, + { + "epoch": 2.354983723453728, + "grad_norm": 0.38725370168685913, + "learning_rate": 1.341847709805475e-06, + "loss": 0.3429, + "step": 5064 + }, + { + "epoch": 2.355448767632925, + "grad_norm": 0.3474979102611542, + "learning_rate": 1.3400036045333542e-06, + "loss": 0.3161, + "step": 5065 + }, + { + "epoch": 2.355913811812122, + "grad_norm": 0.3323841392993927, + "learning_rate": 1.3381605712147294e-06, + "loss": 0.2989, + "step": 5066 + }, + { + "epoch": 2.356378855991319, + "grad_norm": 0.3497825562953949, + "learning_rate": 1.3363186103893916e-06, + "loss": 0.356, + "step": 5067 + }, + { + "epoch": 2.3568439001705164, + "grad_norm": 0.35843122005462646, + "learning_rate": 1.3344777225968247e-06, + "loss": 0.3005, + "step": 5068 + }, + { + "epoch": 2.3573089443497133, + "grad_norm": 0.3550424873828888, + "learning_rate": 1.332637908376192e-06, + "loss": 0.3315, + "step": 5069 + }, + { + "epoch": 2.35777398852891, + "grad_norm": 0.32174983620643616, + "learning_rate": 1.3307991682663463e-06, + "loss": 0.2728, + "step": 5070 + }, + { + "epoch": 2.358239032708107, + "grad_norm": 0.37295761704444885, + "learning_rate": 1.328961502805825e-06, + "loss": 0.3422, + "step": 5071 + }, + { + "epoch": 2.3587040768873044, + "grad_norm": 0.34429284930229187, + "learning_rate": 1.3271249125328512e-06, + "loss": 0.3188, + "step": 5072 + }, + { + "epoch": 2.3591691210665013, + "grad_norm": 0.32858747243881226, + "learning_rate": 1.3252893979853304e-06, + "loss": 0.2941, + "step": 5073 + }, + { + "epoch": 2.3596341652456982, + "grad_norm": 0.3379555344581604, + "learning_rate": 1.3234549597008572e-06, + "loss": 0.3626, + "step": 5074 + }, + { + "epoch": 2.360099209424895, + "grad_norm": 0.33183690905570984, + "learning_rate": 1.3216215982167064e-06, + "loss": 0.3313, + "step": 5075 + }, + { + "epoch": 2.3605642536040925, + "grad_norm": 0.386362761259079, + "learning_rate": 1.3197893140698426e-06, + "loss": 0.3743, + "step": 5076 + }, + { + "epoch": 2.3610292977832894, + "grad_norm": 0.3396303653717041, + "learning_rate": 1.3179581077969084e-06, + "loss": 0.2914, + "step": 5077 + }, + { + "epoch": 2.3614943419624863, + "grad_norm": 0.3387748599052429, + "learning_rate": 1.3161279799342385e-06, + "loss": 0.3185, + "step": 5078 + }, + { + "epoch": 2.3619593861416837, + "grad_norm": 0.32866546511650085, + "learning_rate": 1.314298931017844e-06, + "loss": 0.2987, + "step": 5079 + }, + { + "epoch": 2.3624244303208806, + "grad_norm": 0.3564012348651886, + "learning_rate": 1.3124709615834263e-06, + "loss": 0.3356, + "step": 5080 + }, + { + "epoch": 2.3628894745000775, + "grad_norm": 0.34756961464881897, + "learning_rate": 1.3106440721663655e-06, + "loss": 0.358, + "step": 5081 + }, + { + "epoch": 2.3633545186792744, + "grad_norm": 0.3577297627925873, + "learning_rate": 1.3088182633017294e-06, + "loss": 0.3194, + "step": 5082 + }, + { + "epoch": 2.3638195628584717, + "grad_norm": 0.3133348822593689, + "learning_rate": 1.306993535524269e-06, + "loss": 0.3099, + "step": 5083 + }, + { + "epoch": 2.3642846070376686, + "grad_norm": 0.3245254456996918, + "learning_rate": 1.3051698893684144e-06, + "loss": 0.3172, + "step": 5084 + }, + { + "epoch": 2.3647496512168655, + "grad_norm": 0.3281231224536896, + "learning_rate": 1.303347325368285e-06, + "loss": 0.3242, + "step": 5085 + }, + { + "epoch": 2.3652146953960624, + "grad_norm": 0.3328626751899719, + "learning_rate": 1.3015258440576767e-06, + "loss": 0.3048, + "step": 5086 + }, + { + "epoch": 2.36567973957526, + "grad_norm": 0.34870782494544983, + "learning_rate": 1.299705445970076e-06, + "loss": 0.2954, + "step": 5087 + }, + { + "epoch": 2.3661447837544567, + "grad_norm": 0.31751129031181335, + "learning_rate": 1.2978861316386437e-06, + "loss": 0.3195, + "step": 5088 + }, + { + "epoch": 2.3666098279336536, + "grad_norm": 0.3291054368019104, + "learning_rate": 1.2960679015962313e-06, + "loss": 0.3173, + "step": 5089 + }, + { + "epoch": 2.367074872112851, + "grad_norm": 0.34506654739379883, + "learning_rate": 1.2942507563753653e-06, + "loss": 0.3536, + "step": 5090 + }, + { + "epoch": 2.367539916292048, + "grad_norm": 0.3213541507720947, + "learning_rate": 1.2924346965082612e-06, + "loss": 0.3214, + "step": 5091 + }, + { + "epoch": 2.3680049604712448, + "grad_norm": 0.3615787625312805, + "learning_rate": 1.2906197225268108e-06, + "loss": 0.3476, + "step": 5092 + }, + { + "epoch": 2.3684700046504417, + "grad_norm": 0.32223793864250183, + "learning_rate": 1.288805834962591e-06, + "loss": 0.2795, + "step": 5093 + }, + { + "epoch": 2.3689350488296386, + "grad_norm": 0.357710599899292, + "learning_rate": 1.2869930343468611e-06, + "loss": 0.3424, + "step": 5094 + }, + { + "epoch": 2.369400093008836, + "grad_norm": 0.3196834921836853, + "learning_rate": 1.285181321210562e-06, + "loss": 0.2891, + "step": 5095 + }, + { + "epoch": 2.369865137188033, + "grad_norm": 0.33670881390571594, + "learning_rate": 1.2833706960843118e-06, + "loss": 0.3391, + "step": 5096 + }, + { + "epoch": 2.3703301813672297, + "grad_norm": 0.30696120858192444, + "learning_rate": 1.2815611594984162e-06, + "loss": 0.275, + "step": 5097 + }, + { + "epoch": 2.370795225546427, + "grad_norm": 0.35666152834892273, + "learning_rate": 1.2797527119828567e-06, + "loss": 0.342, + "step": 5098 + }, + { + "epoch": 2.371260269725624, + "grad_norm": 0.31616926193237305, + "learning_rate": 1.2779453540673009e-06, + "loss": 0.3172, + "step": 5099 + }, + { + "epoch": 2.371725313904821, + "grad_norm": 0.3312261700630188, + "learning_rate": 1.2761390862810907e-06, + "loss": 0.3278, + "step": 5100 + }, + { + "epoch": 2.372190358084018, + "grad_norm": 0.33016443252563477, + "learning_rate": 1.274333909153257e-06, + "loss": 0.3146, + "step": 5101 + }, + { + "epoch": 2.372655402263215, + "grad_norm": 0.34783631563186646, + "learning_rate": 1.2725298232125034e-06, + "loss": 0.3116, + "step": 5102 + }, + { + "epoch": 2.373120446442412, + "grad_norm": 0.34138286113739014, + "learning_rate": 1.27072682898722e-06, + "loss": 0.3093, + "step": 5103 + }, + { + "epoch": 2.373585490621609, + "grad_norm": 0.34735816717147827, + "learning_rate": 1.2689249270054716e-06, + "loss": 0.3073, + "step": 5104 + }, + { + "epoch": 2.374050534800806, + "grad_norm": 0.3784792721271515, + "learning_rate": 1.2671241177950078e-06, + "loss": 0.3441, + "step": 5105 + }, + { + "epoch": 2.374515578980003, + "grad_norm": 0.3438231945037842, + "learning_rate": 1.2653244018832562e-06, + "loss": 0.2984, + "step": 5106 + }, + { + "epoch": 2.3749806231592, + "grad_norm": 0.3264318108558655, + "learning_rate": 1.2635257797973255e-06, + "loss": 0.3278, + "step": 5107 + }, + { + "epoch": 2.375445667338397, + "grad_norm": 0.3333817422389984, + "learning_rate": 1.2617282520640007e-06, + "loss": 0.2979, + "step": 5108 + }, + { + "epoch": 2.3759107115175944, + "grad_norm": 0.37426552176475525, + "learning_rate": 1.2599318192097509e-06, + "loss": 0.3836, + "step": 5109 + }, + { + "epoch": 2.3763757556967913, + "grad_norm": 0.3461456596851349, + "learning_rate": 1.2581364817607194e-06, + "loss": 0.3214, + "step": 5110 + }, + { + "epoch": 2.376840799875988, + "grad_norm": 0.35083499550819397, + "learning_rate": 1.2563422402427339e-06, + "loss": 0.3083, + "step": 5111 + }, + { + "epoch": 2.377305844055185, + "grad_norm": 0.3530008792877197, + "learning_rate": 1.254549095181296e-06, + "loss": 0.2944, + "step": 5112 + }, + { + "epoch": 2.3777708882343824, + "grad_norm": 0.3381586968898773, + "learning_rate": 1.2527570471015915e-06, + "loss": 0.3435, + "step": 5113 + }, + { + "epoch": 2.3782359324135793, + "grad_norm": 0.3386650085449219, + "learning_rate": 1.2509660965284797e-06, + "loss": 0.3303, + "step": 5114 + }, + { + "epoch": 2.3787009765927762, + "grad_norm": 0.333477646112442, + "learning_rate": 1.2491762439865034e-06, + "loss": 0.3108, + "step": 5115 + }, + { + "epoch": 2.379166020771973, + "grad_norm": 0.32221055030822754, + "learning_rate": 1.247387489999879e-06, + "loss": 0.307, + "step": 5116 + }, + { + "epoch": 2.3796310649511705, + "grad_norm": 0.35751134157180786, + "learning_rate": 1.2455998350925042e-06, + "loss": 0.3388, + "step": 5117 + }, + { + "epoch": 2.3800961091303674, + "grad_norm": 0.3639261722564697, + "learning_rate": 1.2438132797879554e-06, + "loss": 0.348, + "step": 5118 + }, + { + "epoch": 2.3805611533095643, + "grad_norm": 0.31775328516960144, + "learning_rate": 1.2420278246094835e-06, + "loss": 0.2986, + "step": 5119 + }, + { + "epoch": 2.3810261974887617, + "grad_norm": 0.36067667603492737, + "learning_rate": 1.240243470080022e-06, + "loss": 0.3553, + "step": 5120 + }, + { + "epoch": 2.3814912416679586, + "grad_norm": 0.31480494141578674, + "learning_rate": 1.2384602167221765e-06, + "loss": 0.2935, + "step": 5121 + }, + { + "epoch": 2.3819562858471555, + "grad_norm": 0.30289679765701294, + "learning_rate": 1.2366780650582355e-06, + "loss": 0.2969, + "step": 5122 + }, + { + "epoch": 2.3824213300263524, + "grad_norm": 0.3624131977558136, + "learning_rate": 1.2348970156101592e-06, + "loss": 0.3583, + "step": 5123 + }, + { + "epoch": 2.3828863742055497, + "grad_norm": 0.3010319769382477, + "learning_rate": 1.233117068899592e-06, + "loss": 0.2917, + "step": 5124 + }, + { + "epoch": 2.3833514183847466, + "grad_norm": 0.3690085709095001, + "learning_rate": 1.2313382254478473e-06, + "loss": 0.3563, + "step": 5125 + }, + { + "epoch": 2.3838164625639435, + "grad_norm": 0.33377185463905334, + "learning_rate": 1.229560485775923e-06, + "loss": 0.3258, + "step": 5126 + }, + { + "epoch": 2.3842815067431404, + "grad_norm": 0.31576281785964966, + "learning_rate": 1.227783850404487e-06, + "loss": 0.3138, + "step": 5127 + }, + { + "epoch": 2.384746550922338, + "grad_norm": 0.3261266350746155, + "learning_rate": 1.2260083198538886e-06, + "loss": 0.3253, + "step": 5128 + }, + { + "epoch": 2.3852115951015347, + "grad_norm": 0.34286534786224365, + "learning_rate": 1.2242338946441518e-06, + "loss": 0.357, + "step": 5129 + }, + { + "epoch": 2.3856766392807316, + "grad_norm": 0.3160213530063629, + "learning_rate": 1.2224605752949786e-06, + "loss": 0.3051, + "step": 5130 + }, + { + "epoch": 2.386141683459929, + "grad_norm": 0.34241053462028503, + "learning_rate": 1.2206883623257421e-06, + "loss": 0.3066, + "step": 5131 + }, + { + "epoch": 2.386606727639126, + "grad_norm": 0.34416183829307556, + "learning_rate": 1.2189172562554973e-06, + "loss": 0.3326, + "step": 5132 + }, + { + "epoch": 2.3870717718183228, + "grad_norm": 0.3707297742366791, + "learning_rate": 1.2171472576029707e-06, + "loss": 0.3504, + "step": 5133 + }, + { + "epoch": 2.3875368159975197, + "grad_norm": 0.31945255398750305, + "learning_rate": 1.2153783668865681e-06, + "loss": 0.3035, + "step": 5134 + }, + { + "epoch": 2.3880018601767166, + "grad_norm": 0.33111390471458435, + "learning_rate": 1.2136105846243662e-06, + "loss": 0.3256, + "step": 5135 + }, + { + "epoch": 2.388466904355914, + "grad_norm": 0.3572760224342346, + "learning_rate": 1.2118439113341224e-06, + "loss": 0.3382, + "step": 5136 + }, + { + "epoch": 2.388931948535111, + "grad_norm": 0.32829439640045166, + "learning_rate": 1.210078347533264e-06, + "loss": 0.3172, + "step": 5137 + }, + { + "epoch": 2.3893969927143077, + "grad_norm": 0.3324892818927765, + "learning_rate": 1.2083138937388989e-06, + "loss": 0.3209, + "step": 5138 + }, + { + "epoch": 2.389862036893505, + "grad_norm": 0.31397631764411926, + "learning_rate": 1.2065505504678038e-06, + "loss": 0.3149, + "step": 5139 + }, + { + "epoch": 2.390327081072702, + "grad_norm": 0.3297927677631378, + "learning_rate": 1.2047883182364351e-06, + "loss": 0.2948, + "step": 5140 + }, + { + "epoch": 2.390792125251899, + "grad_norm": 0.33224743604660034, + "learning_rate": 1.2030271975609214e-06, + "loss": 0.3089, + "step": 5141 + }, + { + "epoch": 2.391257169431096, + "grad_norm": 0.37292471528053284, + "learning_rate": 1.2012671889570683e-06, + "loss": 0.3241, + "step": 5142 + }, + { + "epoch": 2.391722213610293, + "grad_norm": 0.3111366927623749, + "learning_rate": 1.1995082929403507e-06, + "loss": 0.2858, + "step": 5143 + }, + { + "epoch": 2.39218725778949, + "grad_norm": 0.37733349204063416, + "learning_rate": 1.1977505100259235e-06, + "loss": 0.3593, + "step": 5144 + }, + { + "epoch": 2.392652301968687, + "grad_norm": 0.36650681495666504, + "learning_rate": 1.1959938407286099e-06, + "loss": 0.3479, + "step": 5145 + }, + { + "epoch": 2.393117346147884, + "grad_norm": 0.31650200486183167, + "learning_rate": 1.1942382855629131e-06, + "loss": 0.2705, + "step": 5146 + }, + { + "epoch": 2.393582390327081, + "grad_norm": 0.33961057662963867, + "learning_rate": 1.1924838450430032e-06, + "loss": 0.3244, + "step": 5147 + }, + { + "epoch": 2.394047434506278, + "grad_norm": 0.31274569034576416, + "learning_rate": 1.19073051968273e-06, + "loss": 0.2908, + "step": 5148 + }, + { + "epoch": 2.394512478685475, + "grad_norm": 0.32725125551223755, + "learning_rate": 1.188978309995612e-06, + "loss": 0.311, + "step": 5149 + }, + { + "epoch": 2.3949775228646724, + "grad_norm": 0.3675840198993683, + "learning_rate": 1.1872272164948456e-06, + "loss": 0.356, + "step": 5150 + }, + { + "epoch": 2.3954425670438693, + "grad_norm": 0.3493657410144806, + "learning_rate": 1.1854772396932946e-06, + "loss": 0.3292, + "step": 5151 + }, + { + "epoch": 2.395907611223066, + "grad_norm": 0.344338595867157, + "learning_rate": 1.1837283801034998e-06, + "loss": 0.3117, + "step": 5152 + }, + { + "epoch": 2.396372655402263, + "grad_norm": 0.344719260931015, + "learning_rate": 1.181980638237676e-06, + "loss": 0.3277, + "step": 5153 + }, + { + "epoch": 2.3968376995814604, + "grad_norm": 0.32095858454704285, + "learning_rate": 1.1802340146077045e-06, + "loss": 0.2881, + "step": 5154 + }, + { + "epoch": 2.3973027437606573, + "grad_norm": 0.3079981803894043, + "learning_rate": 1.1784885097251474e-06, + "loss": 0.3379, + "step": 5155 + }, + { + "epoch": 2.3977677879398542, + "grad_norm": 0.33286038041114807, + "learning_rate": 1.1767441241012307e-06, + "loss": 0.3374, + "step": 5156 + }, + { + "epoch": 2.398232832119051, + "grad_norm": 0.36355435848236084, + "learning_rate": 1.1750008582468592e-06, + "loss": 0.3364, + "step": 5157 + }, + { + "epoch": 2.3986978762982485, + "grad_norm": 0.3209869861602783, + "learning_rate": 1.1732587126726054e-06, + "loss": 0.3159, + "step": 5158 + }, + { + "epoch": 2.3991629204774454, + "grad_norm": 0.32692816853523254, + "learning_rate": 1.1715176878887174e-06, + "loss": 0.3143, + "step": 5159 + }, + { + "epoch": 2.3996279646566423, + "grad_norm": 0.345546692609787, + "learning_rate": 1.1697777844051105e-06, + "loss": 0.3354, + "step": 5160 + }, + { + "epoch": 2.4000930088358396, + "grad_norm": 0.3152925670146942, + "learning_rate": 1.168039002731377e-06, + "loss": 0.3035, + "step": 5161 + }, + { + "epoch": 2.4005580530150366, + "grad_norm": 0.379638671875, + "learning_rate": 1.1663013433767756e-06, + "loss": 0.3057, + "step": 5162 + }, + { + "epoch": 2.4010230971942335, + "grad_norm": 0.31644484400749207, + "learning_rate": 1.164564806850239e-06, + "loss": 0.3047, + "step": 5163 + }, + { + "epoch": 2.4014881413734304, + "grad_norm": 0.34637710452079773, + "learning_rate": 1.1628293936603707e-06, + "loss": 0.3396, + "step": 5164 + }, + { + "epoch": 2.4019531855526273, + "grad_norm": 0.3282802700996399, + "learning_rate": 1.1610951043154472e-06, + "loss": 0.337, + "step": 5165 + }, + { + "epoch": 2.4024182297318246, + "grad_norm": 0.30934378504753113, + "learning_rate": 1.1593619393234096e-06, + "loss": 0.3007, + "step": 5166 + }, + { + "epoch": 2.4028832739110215, + "grad_norm": 0.33991992473602295, + "learning_rate": 1.1576298991918778e-06, + "loss": 0.3462, + "step": 5167 + }, + { + "epoch": 2.4033483180902184, + "grad_norm": 0.30924656987190247, + "learning_rate": 1.1558989844281349e-06, + "loss": 0.3036, + "step": 5168 + }, + { + "epoch": 2.4038133622694158, + "grad_norm": 0.341217577457428, + "learning_rate": 1.1541691955391403e-06, + "loss": 0.3183, + "step": 5169 + }, + { + "epoch": 2.4042784064486127, + "grad_norm": 0.30020660161972046, + "learning_rate": 1.1524405330315187e-06, + "loss": 0.3064, + "step": 5170 + }, + { + "epoch": 2.4047434506278096, + "grad_norm": 0.3430638611316681, + "learning_rate": 1.15071299741157e-06, + "loss": 0.3619, + "step": 5171 + }, + { + "epoch": 2.4052084948070065, + "grad_norm": 0.33357352018356323, + "learning_rate": 1.148986589185258e-06, + "loss": 0.3348, + "step": 5172 + }, + { + "epoch": 2.405673538986204, + "grad_norm": 0.34539151191711426, + "learning_rate": 1.147261308858223e-06, + "loss": 0.3226, + "step": 5173 + }, + { + "epoch": 2.4061385831654007, + "grad_norm": 0.34261536598205566, + "learning_rate": 1.145537156935768e-06, + "loss": 0.3085, + "step": 5174 + }, + { + "epoch": 2.4066036273445977, + "grad_norm": 0.3475549817085266, + "learning_rate": 1.143814133922872e-06, + "loss": 0.326, + "step": 5175 + }, + { + "epoch": 2.4070686715237946, + "grad_norm": 0.3175617754459381, + "learning_rate": 1.142092240324179e-06, + "loss": 0.3038, + "step": 5176 + }, + { + "epoch": 2.407533715702992, + "grad_norm": 0.33959630131721497, + "learning_rate": 1.1403714766440061e-06, + "loss": 0.3148, + "step": 5177 + }, + { + "epoch": 2.407998759882189, + "grad_norm": 0.34172576665878296, + "learning_rate": 1.1386518433863331e-06, + "loss": 0.3174, + "step": 5178 + }, + { + "epoch": 2.4084638040613857, + "grad_norm": 0.30820226669311523, + "learning_rate": 1.1369333410548166e-06, + "loss": 0.2975, + "step": 5179 + }, + { + "epoch": 2.408928848240583, + "grad_norm": 0.3713323473930359, + "learning_rate": 1.1352159701527743e-06, + "loss": 0.3735, + "step": 5180 + }, + { + "epoch": 2.40939389241978, + "grad_norm": 0.3337565064430237, + "learning_rate": 1.1334997311832003e-06, + "loss": 0.2932, + "step": 5181 + }, + { + "epoch": 2.409858936598977, + "grad_norm": 0.35603073239326477, + "learning_rate": 1.1317846246487485e-06, + "loss": 0.2922, + "step": 5182 + }, + { + "epoch": 2.410323980778174, + "grad_norm": 0.34648483991622925, + "learning_rate": 1.13007065105175e-06, + "loss": 0.3224, + "step": 5183 + }, + { + "epoch": 2.410789024957371, + "grad_norm": 0.3309410810470581, + "learning_rate": 1.128357810894196e-06, + "loss": 0.3081, + "step": 5184 + }, + { + "epoch": 2.411254069136568, + "grad_norm": 0.3483981788158417, + "learning_rate": 1.1266461046777537e-06, + "loss": 0.3221, + "step": 5185 + }, + { + "epoch": 2.411719113315765, + "grad_norm": 0.3361232280731201, + "learning_rate": 1.1249355329037498e-06, + "loss": 0.3369, + "step": 5186 + }, + { + "epoch": 2.412184157494962, + "grad_norm": 0.30131569504737854, + "learning_rate": 1.1232260960731855e-06, + "loss": 0.2907, + "step": 5187 + }, + { + "epoch": 2.412649201674159, + "grad_norm": 0.3845626711845398, + "learning_rate": 1.1215177946867262e-06, + "loss": 0.3715, + "step": 5188 + }, + { + "epoch": 2.413114245853356, + "grad_norm": 0.3543691337108612, + "learning_rate": 1.1198106292447076e-06, + "loss": 0.3494, + "step": 5189 + }, + { + "epoch": 2.413579290032553, + "grad_norm": 0.4014170467853546, + "learning_rate": 1.1181046002471292e-06, + "loss": 0.3547, + "step": 5190 + }, + { + "epoch": 2.4140443342117504, + "grad_norm": 0.3230542540550232, + "learning_rate": 1.1163997081936578e-06, + "loss": 0.2861, + "step": 5191 + }, + { + "epoch": 2.4145093783909473, + "grad_norm": 0.31293249130249023, + "learning_rate": 1.1146959535836317e-06, + "loss": 0.2862, + "step": 5192 + }, + { + "epoch": 2.414974422570144, + "grad_norm": 0.3535180389881134, + "learning_rate": 1.112993336916049e-06, + "loss": 0.3462, + "step": 5193 + }, + { + "epoch": 2.415439466749341, + "grad_norm": 0.32987791299819946, + "learning_rate": 1.1112918586895826e-06, + "loss": 0.3061, + "step": 5194 + }, + { + "epoch": 2.415904510928538, + "grad_norm": 0.3237574100494385, + "learning_rate": 1.1095915194025642e-06, + "loss": 0.2999, + "step": 5195 + }, + { + "epoch": 2.4163695551077353, + "grad_norm": 0.35278692841529846, + "learning_rate": 1.1078923195529973e-06, + "loss": 0.3329, + "step": 5196 + }, + { + "epoch": 2.4168345992869322, + "grad_norm": 0.34075209498405457, + "learning_rate": 1.1061942596385516e-06, + "loss": 0.3127, + "step": 5197 + }, + { + "epoch": 2.417299643466129, + "grad_norm": 0.3517972230911255, + "learning_rate": 1.1044973401565578e-06, + "loss": 0.3275, + "step": 5198 + }, + { + "epoch": 2.4177646876453265, + "grad_norm": 0.4040515720844269, + "learning_rate": 1.1028015616040182e-06, + "loss": 0.3465, + "step": 5199 + }, + { + "epoch": 2.4182297318245234, + "grad_norm": 0.3578769266605377, + "learning_rate": 1.1011069244775996e-06, + "loss": 0.3302, + "step": 5200 + }, + { + "epoch": 2.4186947760037203, + "grad_norm": 0.3215552866458893, + "learning_rate": 1.0994134292736307e-06, + "loss": 0.2755, + "step": 5201 + }, + { + "epoch": 2.419159820182917, + "grad_norm": 0.3483463227748871, + "learning_rate": 1.0977210764881124e-06, + "loss": 0.3649, + "step": 5202 + }, + { + "epoch": 2.4196248643621145, + "grad_norm": 0.32858699560165405, + "learning_rate": 1.096029866616704e-06, + "loss": 0.3099, + "step": 5203 + }, + { + "epoch": 2.4200899085413115, + "grad_norm": 0.34919896721839905, + "learning_rate": 1.0943398001547362e-06, + "loss": 0.3519, + "step": 5204 + }, + { + "epoch": 2.4205549527205084, + "grad_norm": 0.3417411148548126, + "learning_rate": 1.0926508775971995e-06, + "loss": 0.301, + "step": 5205 + }, + { + "epoch": 2.4210199968997053, + "grad_norm": 0.3448435068130493, + "learning_rate": 1.0909630994387538e-06, + "loss": 0.319, + "step": 5206 + }, + { + "epoch": 2.4214850410789026, + "grad_norm": 0.31199708580970764, + "learning_rate": 1.0892764661737204e-06, + "loss": 0.2935, + "step": 5207 + }, + { + "epoch": 2.4219500852580995, + "grad_norm": 0.33110520243644714, + "learning_rate": 1.0875909782960887e-06, + "loss": 0.3122, + "step": 5208 + }, + { + "epoch": 2.4224151294372964, + "grad_norm": 0.3433871865272522, + "learning_rate": 1.0859066362995085e-06, + "loss": 0.2962, + "step": 5209 + }, + { + "epoch": 2.4228801736164938, + "grad_norm": 0.34019917249679565, + "learning_rate": 1.0842234406772973e-06, + "loss": 0.3437, + "step": 5210 + }, + { + "epoch": 2.4233452177956907, + "grad_norm": 0.3699561655521393, + "learning_rate": 1.0825413919224353e-06, + "loss": 0.3118, + "step": 5211 + }, + { + "epoch": 2.4238102619748876, + "grad_norm": 0.364387184381485, + "learning_rate": 1.0808604905275693e-06, + "loss": 0.3258, + "step": 5212 + }, + { + "epoch": 2.4242753061540845, + "grad_norm": 0.35317301750183105, + "learning_rate": 1.0791807369850048e-06, + "loss": 0.3585, + "step": 5213 + }, + { + "epoch": 2.424740350333282, + "grad_norm": 0.341281533241272, + "learning_rate": 1.077502131786718e-06, + "loss": 0.3319, + "step": 5214 + }, + { + "epoch": 2.4252053945124787, + "grad_norm": 0.3004712164402008, + "learning_rate": 1.0758246754243412e-06, + "loss": 0.2823, + "step": 5215 + }, + { + "epoch": 2.4256704386916756, + "grad_norm": 0.34995582699775696, + "learning_rate": 1.0741483683891773e-06, + "loss": 0.2991, + "step": 5216 + }, + { + "epoch": 2.4261354828708726, + "grad_norm": 0.3658967912197113, + "learning_rate": 1.072473211172187e-06, + "loss": 0.3243, + "step": 5217 + }, + { + "epoch": 2.42660052705007, + "grad_norm": 0.3104728162288666, + "learning_rate": 1.0707992042639986e-06, + "loss": 0.3104, + "step": 5218 + }, + { + "epoch": 2.427065571229267, + "grad_norm": 0.3573016822338104, + "learning_rate": 1.0691263481548996e-06, + "loss": 0.3329, + "step": 5219 + }, + { + "epoch": 2.4275306154084637, + "grad_norm": 0.31468331813812256, + "learning_rate": 1.0674546433348453e-06, + "loss": 0.2965, + "step": 5220 + }, + { + "epoch": 2.427995659587661, + "grad_norm": 0.3958417773246765, + "learning_rate": 1.0657840902934469e-06, + "loss": 0.3601, + "step": 5221 + }, + { + "epoch": 2.428460703766858, + "grad_norm": 0.3233220875263214, + "learning_rate": 1.064114689519985e-06, + "loss": 0.3139, + "step": 5222 + }, + { + "epoch": 2.428925747946055, + "grad_norm": 0.3252237141132355, + "learning_rate": 1.0624464415033987e-06, + "loss": 0.3017, + "step": 5223 + }, + { + "epoch": 2.4293907921252518, + "grad_norm": 0.32925140857696533, + "learning_rate": 1.060779346732293e-06, + "loss": 0.3318, + "step": 5224 + }, + { + "epoch": 2.4298558363044487, + "grad_norm": 0.3464730978012085, + "learning_rate": 1.0591134056949314e-06, + "loss": 0.3449, + "step": 5225 + }, + { + "epoch": 2.430320880483646, + "grad_norm": 0.3610069155693054, + "learning_rate": 1.0574486188792393e-06, + "loss": 0.3089, + "step": 5226 + }, + { + "epoch": 2.430785924662843, + "grad_norm": 0.33958742022514343, + "learning_rate": 1.0557849867728088e-06, + "loss": 0.3386, + "step": 5227 + }, + { + "epoch": 2.43125096884204, + "grad_norm": 0.3290470540523529, + "learning_rate": 1.0541225098628877e-06, + "loss": 0.3015, + "step": 5228 + }, + { + "epoch": 2.431716013021237, + "grad_norm": 0.32628098130226135, + "learning_rate": 1.0524611886363912e-06, + "loss": 0.3187, + "step": 5229 + }, + { + "epoch": 2.432181057200434, + "grad_norm": 0.33828026056289673, + "learning_rate": 1.0508010235798904e-06, + "loss": 0.3105, + "step": 5230 + }, + { + "epoch": 2.432646101379631, + "grad_norm": 0.33995717763900757, + "learning_rate": 1.0491420151796227e-06, + "loss": 0.3409, + "step": 5231 + }, + { + "epoch": 2.433111145558828, + "grad_norm": 0.30739811062812805, + "learning_rate": 1.047484163921486e-06, + "loss": 0.3109, + "step": 5232 + }, + { + "epoch": 2.4335761897380253, + "grad_norm": 0.32465070486068726, + "learning_rate": 1.0458274702910347e-06, + "loss": 0.299, + "step": 5233 + }, + { + "epoch": 2.434041233917222, + "grad_norm": 0.36253416538238525, + "learning_rate": 1.044171934773489e-06, + "loss": 0.3325, + "step": 5234 + }, + { + "epoch": 2.434506278096419, + "grad_norm": 0.31193816661834717, + "learning_rate": 1.04251755785373e-06, + "loss": 0.3138, + "step": 5235 + }, + { + "epoch": 2.434971322275616, + "grad_norm": 0.3374314308166504, + "learning_rate": 1.0408643400162949e-06, + "loss": 0.3161, + "step": 5236 + }, + { + "epoch": 2.4354363664548133, + "grad_norm": 0.34814929962158203, + "learning_rate": 1.039212281745387e-06, + "loss": 0.3192, + "step": 5237 + }, + { + "epoch": 2.4359014106340102, + "grad_norm": 0.3480595648288727, + "learning_rate": 1.0375613835248648e-06, + "loss": 0.3186, + "step": 5238 + }, + { + "epoch": 2.436366454813207, + "grad_norm": 0.35934314131736755, + "learning_rate": 1.0359116458382523e-06, + "loss": 0.3206, + "step": 5239 + }, + { + "epoch": 2.4368314989924045, + "grad_norm": 0.34923362731933594, + "learning_rate": 1.0342630691687283e-06, + "loss": 0.3336, + "step": 5240 + }, + { + "epoch": 2.4372965431716014, + "grad_norm": 0.3344292640686035, + "learning_rate": 1.0326156539991361e-06, + "loss": 0.329, + "step": 5241 + }, + { + "epoch": 2.4377615873507983, + "grad_norm": 0.34684503078460693, + "learning_rate": 1.0309694008119748e-06, + "loss": 0.3105, + "step": 5242 + }, + { + "epoch": 2.438226631529995, + "grad_norm": 0.32323524355888367, + "learning_rate": 1.0293243100894068e-06, + "loss": 0.3101, + "step": 5243 + }, + { + "epoch": 2.4386916757091925, + "grad_norm": 0.3490697741508484, + "learning_rate": 1.027680382313253e-06, + "loss": 0.3156, + "step": 5244 + }, + { + "epoch": 2.4391567198883894, + "grad_norm": 0.33271536231040955, + "learning_rate": 1.0260376179649905e-06, + "loss": 0.3069, + "step": 5245 + }, + { + "epoch": 2.4396217640675864, + "grad_norm": 0.31948867440223694, + "learning_rate": 1.0243960175257605e-06, + "loss": 0.3263, + "step": 5246 + }, + { + "epoch": 2.4400868082467833, + "grad_norm": 0.3256017863750458, + "learning_rate": 1.0227555814763623e-06, + "loss": 0.3457, + "step": 5247 + }, + { + "epoch": 2.4405518524259806, + "grad_norm": 0.31808292865753174, + "learning_rate": 1.0211163102972494e-06, + "loss": 0.3109, + "step": 5248 + }, + { + "epoch": 2.4410168966051775, + "grad_norm": 0.33860480785369873, + "learning_rate": 1.0194782044685414e-06, + "loss": 0.3262, + "step": 5249 + }, + { + "epoch": 2.4414819407843744, + "grad_norm": 0.3586934506893158, + "learning_rate": 1.0178412644700093e-06, + "loss": 0.3337, + "step": 5250 + }, + { + "epoch": 2.4419469849635718, + "grad_norm": 0.31557610630989075, + "learning_rate": 1.0162054907810892e-06, + "loss": 0.2812, + "step": 5251 + }, + { + "epoch": 2.4424120291427687, + "grad_norm": 0.3515093922615051, + "learning_rate": 1.0145708838808704e-06, + "loss": 0.3759, + "step": 5252 + }, + { + "epoch": 2.4428770733219656, + "grad_norm": 0.3308696448802948, + "learning_rate": 1.012937444248105e-06, + "loss": 0.2839, + "step": 5253 + }, + { + "epoch": 2.4433421175011625, + "grad_norm": 0.35928985476493835, + "learning_rate": 1.0113051723611989e-06, + "loss": 0.3185, + "step": 5254 + }, + { + "epoch": 2.4438071616803594, + "grad_norm": 0.3398415744304657, + "learning_rate": 1.0096740686982192e-06, + "loss": 0.295, + "step": 5255 + }, + { + "epoch": 2.4442722058595567, + "grad_norm": 0.3303294777870178, + "learning_rate": 1.0080441337368884e-06, + "loss": 0.3409, + "step": 5256 + }, + { + "epoch": 2.4447372500387536, + "grad_norm": 0.33345457911491394, + "learning_rate": 1.0064153679545891e-06, + "loss": 0.3565, + "step": 5257 + }, + { + "epoch": 2.4452022942179505, + "grad_norm": 0.30849790573120117, + "learning_rate": 1.00478777182836e-06, + "loss": 0.3075, + "step": 5258 + }, + { + "epoch": 2.445667338397148, + "grad_norm": 0.3234153091907501, + "learning_rate": 1.0031613458348988e-06, + "loss": 0.2985, + "step": 5259 + }, + { + "epoch": 2.446132382576345, + "grad_norm": 0.3142699897289276, + "learning_rate": 1.0015360904505573e-06, + "loss": 0.2921, + "step": 5260 + }, + { + "epoch": 2.4465974267555417, + "grad_norm": 0.36145663261413574, + "learning_rate": 9.99912006151348e-07, + "loss": 0.3293, + "step": 5261 + }, + { + "epoch": 2.4470624709347386, + "grad_norm": 0.354502409696579, + "learning_rate": 9.98289093412938e-07, + "loss": 0.3246, + "step": 5262 + }, + { + "epoch": 2.447527515113936, + "grad_norm": 0.36151012778282166, + "learning_rate": 9.966673527106514e-07, + "loss": 0.3153, + "step": 5263 + }, + { + "epoch": 2.447992559293133, + "grad_norm": 0.3437816798686981, + "learning_rate": 9.950467845194712e-07, + "loss": 0.3397, + "step": 5264 + }, + { + "epoch": 2.4484576034723298, + "grad_norm": 0.335560142993927, + "learning_rate": 9.934273893140335e-07, + "loss": 0.3223, + "step": 5265 + }, + { + "epoch": 2.4489226476515267, + "grad_norm": 0.33611413836479187, + "learning_rate": 9.918091675686343e-07, + "loss": 0.3288, + "step": 5266 + }, + { + "epoch": 2.449387691830724, + "grad_norm": 0.3385131359100342, + "learning_rate": 9.90192119757225e-07, + "loss": 0.3153, + "step": 5267 + }, + { + "epoch": 2.449852736009921, + "grad_norm": 0.35654857754707336, + "learning_rate": 9.88576246353411e-07, + "loss": 0.2852, + "step": 5268 + }, + { + "epoch": 2.450317780189118, + "grad_norm": 0.3671344220638275, + "learning_rate": 9.869615478304567e-07, + "loss": 0.3212, + "step": 5269 + }, + { + "epoch": 2.450782824368315, + "grad_norm": 0.33951297402381897, + "learning_rate": 9.853480246612812e-07, + "loss": 0.3201, + "step": 5270 + }, + { + "epoch": 2.451247868547512, + "grad_norm": 0.34729960560798645, + "learning_rate": 9.837356773184576e-07, + "loss": 0.3244, + "step": 5271 + }, + { + "epoch": 2.451712912726709, + "grad_norm": 0.3578476011753082, + "learning_rate": 9.821245062742191e-07, + "loss": 0.2941, + "step": 5272 + }, + { + "epoch": 2.452177956905906, + "grad_norm": 0.3414701521396637, + "learning_rate": 9.805145120004478e-07, + "loss": 0.3043, + "step": 5273 + }, + { + "epoch": 2.4526430010851032, + "grad_norm": 0.30916067957878113, + "learning_rate": 9.789056949686882e-07, + "loss": 0.299, + "step": 5274 + }, + { + "epoch": 2.4531080452643, + "grad_norm": 0.3154885470867157, + "learning_rate": 9.772980556501338e-07, + "loss": 0.313, + "step": 5275 + }, + { + "epoch": 2.453573089443497, + "grad_norm": 0.3455050587654114, + "learning_rate": 9.756915945156392e-07, + "loss": 0.3221, + "step": 5276 + }, + { + "epoch": 2.454038133622694, + "grad_norm": 0.3348415195941925, + "learning_rate": 9.74086312035708e-07, + "loss": 0.312, + "step": 5277 + }, + { + "epoch": 2.4545031778018913, + "grad_norm": 0.34347227215766907, + "learning_rate": 9.724822086805019e-07, + "loss": 0.3398, + "step": 5278 + }, + { + "epoch": 2.454968221981088, + "grad_norm": 0.340069055557251, + "learning_rate": 9.70879284919839e-07, + "loss": 0.3009, + "step": 5279 + }, + { + "epoch": 2.455433266160285, + "grad_norm": 0.3648471236228943, + "learning_rate": 9.692775412231863e-07, + "loss": 0.3182, + "step": 5280 + }, + { + "epoch": 2.4558983103394825, + "grad_norm": 0.33610066771507263, + "learning_rate": 9.6767697805967e-07, + "loss": 0.303, + "step": 5281 + }, + { + "epoch": 2.4563633545186794, + "grad_norm": 0.3398694694042206, + "learning_rate": 9.660775958980712e-07, + "loss": 0.3386, + "step": 5282 + }, + { + "epoch": 2.4568283986978763, + "grad_norm": 0.32278192043304443, + "learning_rate": 9.644793952068187e-07, + "loss": 0.2896, + "step": 5283 + }, + { + "epoch": 2.457293442877073, + "grad_norm": 0.3328745365142822, + "learning_rate": 9.628823764540035e-07, + "loss": 0.303, + "step": 5284 + }, + { + "epoch": 2.4577584870562705, + "grad_norm": 0.3310834765434265, + "learning_rate": 9.612865401073634e-07, + "loss": 0.3166, + "step": 5285 + }, + { + "epoch": 2.4582235312354674, + "grad_norm": 0.43221089243888855, + "learning_rate": 9.596918866342959e-07, + "loss": 0.3503, + "step": 5286 + }, + { + "epoch": 2.4586885754146643, + "grad_norm": 0.36144617199897766, + "learning_rate": 9.580984165018458e-07, + "loss": 0.3356, + "step": 5287 + }, + { + "epoch": 2.4591536195938613, + "grad_norm": 0.3269920349121094, + "learning_rate": 9.565061301767176e-07, + "loss": 0.3237, + "step": 5288 + }, + { + "epoch": 2.4596186637730586, + "grad_norm": 0.32409968972206116, + "learning_rate": 9.549150281252633e-07, + "loss": 0.3154, + "step": 5289 + }, + { + "epoch": 2.4600837079522555, + "grad_norm": 0.3190405070781708, + "learning_rate": 9.533251108134922e-07, + "loss": 0.3, + "step": 5290 + }, + { + "epoch": 2.4605487521314524, + "grad_norm": 0.3245762884616852, + "learning_rate": 9.517363787070672e-07, + "loss": 0.3034, + "step": 5291 + }, + { + "epoch": 2.4610137963106498, + "grad_norm": 0.32275402545928955, + "learning_rate": 9.501488322712987e-07, + "loss": 0.3172, + "step": 5292 + }, + { + "epoch": 2.4614788404898467, + "grad_norm": 0.33684971928596497, + "learning_rate": 9.485624719711551e-07, + "loss": 0.3347, + "step": 5293 + }, + { + "epoch": 2.4619438846690436, + "grad_norm": 0.35810065269470215, + "learning_rate": 9.469772982712561e-07, + "loss": 0.3018, + "step": 5294 + }, + { + "epoch": 2.4624089288482405, + "grad_norm": 0.34780067205429077, + "learning_rate": 9.453933116358715e-07, + "loss": 0.3157, + "step": 5295 + }, + { + "epoch": 2.4628739730274374, + "grad_norm": 0.36027947068214417, + "learning_rate": 9.438105125289276e-07, + "loss": 0.3516, + "step": 5296 + }, + { + "epoch": 2.4633390172066347, + "grad_norm": 0.317049503326416, + "learning_rate": 9.422289014139996e-07, + "loss": 0.2994, + "step": 5297 + }, + { + "epoch": 2.4638040613858316, + "grad_norm": 0.3178118169307709, + "learning_rate": 9.406484787543136e-07, + "loss": 0.3013, + "step": 5298 + }, + { + "epoch": 2.4642691055650285, + "grad_norm": 0.3206396698951721, + "learning_rate": 9.390692450127531e-07, + "loss": 0.3182, + "step": 5299 + }, + { + "epoch": 2.464734149744226, + "grad_norm": 0.35063356161117554, + "learning_rate": 9.374912006518467e-07, + "loss": 0.3607, + "step": 5300 + }, + { + "epoch": 2.465199193923423, + "grad_norm": 0.35182520747184753, + "learning_rate": 9.359143461337799e-07, + "loss": 0.316, + "step": 5301 + }, + { + "epoch": 2.4656642381026197, + "grad_norm": 0.3166978359222412, + "learning_rate": 9.343386819203892e-07, + "loss": 0.2737, + "step": 5302 + }, + { + "epoch": 2.4661292822818166, + "grad_norm": 0.3476794362068176, + "learning_rate": 9.327642084731575e-07, + "loss": 0.3288, + "step": 5303 + }, + { + "epoch": 2.466594326461014, + "grad_norm": 0.33571144938468933, + "learning_rate": 9.311909262532248e-07, + "loss": 0.319, + "step": 5304 + }, + { + "epoch": 2.467059370640211, + "grad_norm": 0.3019503355026245, + "learning_rate": 9.296188357213804e-07, + "loss": 0.2941, + "step": 5305 + }, + { + "epoch": 2.4675244148194078, + "grad_norm": 0.37272417545318604, + "learning_rate": 9.280479373380624e-07, + "loss": 0.3377, + "step": 5306 + }, + { + "epoch": 2.4679894589986047, + "grad_norm": 0.30881267786026, + "learning_rate": 9.26478231563363e-07, + "loss": 0.3139, + "step": 5307 + }, + { + "epoch": 2.468454503177802, + "grad_norm": 0.32374900579452515, + "learning_rate": 9.249097188570216e-07, + "loss": 0.3063, + "step": 5308 + }, + { + "epoch": 2.468919547356999, + "grad_norm": 0.3490180969238281, + "learning_rate": 9.23342399678433e-07, + "loss": 0.3599, + "step": 5309 + }, + { + "epoch": 2.469384591536196, + "grad_norm": 0.34069767594337463, + "learning_rate": 9.21776274486636e-07, + "loss": 0.3225, + "step": 5310 + }, + { + "epoch": 2.469849635715393, + "grad_norm": 0.32864606380462646, + "learning_rate": 9.202113437403259e-07, + "loss": 0.3247, + "step": 5311 + }, + { + "epoch": 2.47031467989459, + "grad_norm": 0.3595259487628937, + "learning_rate": 9.18647607897844e-07, + "loss": 0.3265, + "step": 5312 + }, + { + "epoch": 2.470779724073787, + "grad_norm": 0.3525889217853546, + "learning_rate": 9.170850674171833e-07, + "loss": 0.3167, + "step": 5313 + }, + { + "epoch": 2.471244768252984, + "grad_norm": 0.3449327051639557, + "learning_rate": 9.155237227559883e-07, + "loss": 0.3431, + "step": 5314 + }, + { + "epoch": 2.4717098124321812, + "grad_norm": 0.3054923117160797, + "learning_rate": 9.139635743715486e-07, + "loss": 0.2732, + "step": 5315 + }, + { + "epoch": 2.472174856611378, + "grad_norm": 0.3427397310733795, + "learning_rate": 9.124046227208083e-07, + "loss": 0.3412, + "step": 5316 + }, + { + "epoch": 2.472639900790575, + "grad_norm": 0.34003254771232605, + "learning_rate": 9.108468682603594e-07, + "loss": 0.3276, + "step": 5317 + }, + { + "epoch": 2.473104944969772, + "grad_norm": 0.31388726830482483, + "learning_rate": 9.092903114464407e-07, + "loss": 0.3003, + "step": 5318 + }, + { + "epoch": 2.4735699891489693, + "grad_norm": 0.31117507815361023, + "learning_rate": 9.077349527349455e-07, + "loss": 0.3139, + "step": 5319 + }, + { + "epoch": 2.474035033328166, + "grad_norm": 0.32882410287857056, + "learning_rate": 9.061807925814098e-07, + "loss": 0.3111, + "step": 5320 + }, + { + "epoch": 2.474500077507363, + "grad_norm": 0.33620962500572205, + "learning_rate": 9.046278314410245e-07, + "loss": 0.3183, + "step": 5321 + }, + { + "epoch": 2.4749651216865605, + "grad_norm": 0.3354683518409729, + "learning_rate": 9.030760697686247e-07, + "loss": 0.3094, + "step": 5322 + }, + { + "epoch": 2.4754301658657574, + "grad_norm": 0.3122057616710663, + "learning_rate": 9.01525508018698e-07, + "loss": 0.3064, + "step": 5323 + }, + { + "epoch": 2.4758952100449543, + "grad_norm": 0.35190722346305847, + "learning_rate": 8.999761466453771e-07, + "loss": 0.351, + "step": 5324 + }, + { + "epoch": 2.476360254224151, + "grad_norm": 0.3170509338378906, + "learning_rate": 8.984279861024453e-07, + "loss": 0.3036, + "step": 5325 + }, + { + "epoch": 2.476825298403348, + "grad_norm": 0.33579573035240173, + "learning_rate": 8.968810268433347e-07, + "loss": 0.3135, + "step": 5326 + }, + { + "epoch": 2.4772903425825454, + "grad_norm": 0.35341697931289673, + "learning_rate": 8.953352693211232e-07, + "loss": 0.312, + "step": 5327 + }, + { + "epoch": 2.4777553867617423, + "grad_norm": 0.32664796710014343, + "learning_rate": 8.937907139885376e-07, + "loss": 0.2932, + "step": 5328 + }, + { + "epoch": 2.4782204309409392, + "grad_norm": 0.3640941083431244, + "learning_rate": 8.922473612979565e-07, + "loss": 0.3478, + "step": 5329 + }, + { + "epoch": 2.4786854751201366, + "grad_norm": 0.30336251854896545, + "learning_rate": 8.907052117013981e-07, + "loss": 0.2738, + "step": 5330 + }, + { + "epoch": 2.4791505192993335, + "grad_norm": 0.36704903841018677, + "learning_rate": 8.891642656505373e-07, + "loss": 0.3298, + "step": 5331 + }, + { + "epoch": 2.4796155634785304, + "grad_norm": 0.3368130624294281, + "learning_rate": 8.876245235966884e-07, + "loss": 0.295, + "step": 5332 + }, + { + "epoch": 2.4800806076577273, + "grad_norm": 0.3467327058315277, + "learning_rate": 8.860859859908199e-07, + "loss": 0.3234, + "step": 5333 + }, + { + "epoch": 2.4805456518369247, + "grad_norm": 0.3419296145439148, + "learning_rate": 8.845486532835435e-07, + "loss": 0.3423, + "step": 5334 + }, + { + "epoch": 2.4810106960161216, + "grad_norm": 0.3196740448474884, + "learning_rate": 8.830125259251171e-07, + "loss": 0.3017, + "step": 5335 + }, + { + "epoch": 2.4814757401953185, + "grad_norm": 0.3675239086151123, + "learning_rate": 8.814776043654494e-07, + "loss": 0.3654, + "step": 5336 + }, + { + "epoch": 2.4819407843745154, + "grad_norm": 0.3341238796710968, + "learning_rate": 8.799438890540929e-07, + "loss": 0.2776, + "step": 5337 + }, + { + "epoch": 2.4824058285537127, + "grad_norm": 0.3249601125717163, + "learning_rate": 8.784113804402506e-07, + "loss": 0.3288, + "step": 5338 + }, + { + "epoch": 2.4828708727329096, + "grad_norm": 0.3380821645259857, + "learning_rate": 8.768800789727655e-07, + "loss": 0.3253, + "step": 5339 + }, + { + "epoch": 2.4833359169121065, + "grad_norm": 0.3296220302581787, + "learning_rate": 8.753499851001341e-07, + "loss": 0.2792, + "step": 5340 + }, + { + "epoch": 2.483800961091304, + "grad_norm": 0.3442380428314209, + "learning_rate": 8.738210992704937e-07, + "loss": 0.336, + "step": 5341 + }, + { + "epoch": 2.484266005270501, + "grad_norm": 0.3112342059612274, + "learning_rate": 8.72293421931632e-07, + "loss": 0.3206, + "step": 5342 + }, + { + "epoch": 2.4847310494496977, + "grad_norm": 0.3362886905670166, + "learning_rate": 8.707669535309793e-07, + "loss": 0.3584, + "step": 5343 + }, + { + "epoch": 2.4851960936288946, + "grad_norm": 0.3538823127746582, + "learning_rate": 8.692416945156151e-07, + "loss": 0.3174, + "step": 5344 + }, + { + "epoch": 2.485661137808092, + "grad_norm": 0.33575835824012756, + "learning_rate": 8.677176453322611e-07, + "loss": 0.295, + "step": 5345 + }, + { + "epoch": 2.486126181987289, + "grad_norm": 0.3484673500061035, + "learning_rate": 8.66194806427288e-07, + "loss": 0.3196, + "step": 5346 + }, + { + "epoch": 2.4865912261664858, + "grad_norm": 0.36958253383636475, + "learning_rate": 8.646731782467094e-07, + "loss": 0.3701, + "step": 5347 + }, + { + "epoch": 2.4870562703456827, + "grad_norm": 0.31938305497169495, + "learning_rate": 8.631527612361861e-07, + "loss": 0.2928, + "step": 5348 + }, + { + "epoch": 2.48752131452488, + "grad_norm": 0.33608078956604004, + "learning_rate": 8.616335558410244e-07, + "loss": 0.3191, + "step": 5349 + }, + { + "epoch": 2.487986358704077, + "grad_norm": 0.31729990243911743, + "learning_rate": 8.601155625061736e-07, + "loss": 0.2873, + "step": 5350 + }, + { + "epoch": 2.488451402883274, + "grad_norm": 0.35453861951828003, + "learning_rate": 8.585987816762292e-07, + "loss": 0.3096, + "step": 5351 + }, + { + "epoch": 2.488916447062471, + "grad_norm": 0.3638750910758972, + "learning_rate": 8.570832137954333e-07, + "loss": 0.3396, + "step": 5352 + }, + { + "epoch": 2.489381491241668, + "grad_norm": 0.35378140211105347, + "learning_rate": 8.555688593076689e-07, + "loss": 0.3485, + "step": 5353 + }, + { + "epoch": 2.489846535420865, + "grad_norm": 0.34362515807151794, + "learning_rate": 8.540557186564685e-07, + "loss": 0.2998, + "step": 5354 + }, + { + "epoch": 2.490311579600062, + "grad_norm": 0.33936387300491333, + "learning_rate": 8.525437922850033e-07, + "loss": 0.3127, + "step": 5355 + }, + { + "epoch": 2.490776623779259, + "grad_norm": 0.3667736053466797, + "learning_rate": 8.51033080636095e-07, + "loss": 0.3394, + "step": 5356 + }, + { + "epoch": 2.491241667958456, + "grad_norm": 0.3014102578163147, + "learning_rate": 8.495235841522038e-07, + "loss": 0.2753, + "step": 5357 + }, + { + "epoch": 2.491706712137653, + "grad_norm": 0.31866636872291565, + "learning_rate": 8.480153032754396e-07, + "loss": 0.3203, + "step": 5358 + }, + { + "epoch": 2.49217175631685, + "grad_norm": 0.34170639514923096, + "learning_rate": 8.465082384475499e-07, + "loss": 0.3131, + "step": 5359 + }, + { + "epoch": 2.4926368004960473, + "grad_norm": 0.31605064868927, + "learning_rate": 8.450023901099314e-07, + "loss": 0.3298, + "step": 5360 + }, + { + "epoch": 2.493101844675244, + "grad_norm": 0.336970716714859, + "learning_rate": 8.434977587036242e-07, + "loss": 0.3574, + "step": 5361 + }, + { + "epoch": 2.493566888854441, + "grad_norm": 0.3367874324321747, + "learning_rate": 8.419943446693069e-07, + "loss": 0.3065, + "step": 5362 + }, + { + "epoch": 2.494031933033638, + "grad_norm": 0.34468874335289, + "learning_rate": 8.404921484473072e-07, + "loss": 0.3462, + "step": 5363 + }, + { + "epoch": 2.4944969772128354, + "grad_norm": 0.30258965492248535, + "learning_rate": 8.38991170477595e-07, + "loss": 0.2844, + "step": 5364 + }, + { + "epoch": 2.4949620213920323, + "grad_norm": 0.32411083579063416, + "learning_rate": 8.3749141119978e-07, + "loss": 0.3055, + "step": 5365 + }, + { + "epoch": 2.495427065571229, + "grad_norm": 0.33687278628349304, + "learning_rate": 8.359928710531195e-07, + "loss": 0.3058, + "step": 5366 + }, + { + "epoch": 2.495892109750426, + "grad_norm": 0.33974799513816833, + "learning_rate": 8.344955504765089e-07, + "loss": 0.3166, + "step": 5367 + }, + { + "epoch": 2.4963571539296234, + "grad_norm": 0.3623729646205902, + "learning_rate": 8.32999449908492e-07, + "loss": 0.3154, + "step": 5368 + }, + { + "epoch": 2.4968221981088203, + "grad_norm": 0.36364513635635376, + "learning_rate": 8.315045697872514e-07, + "loss": 0.3118, + "step": 5369 + }, + { + "epoch": 2.4972872422880172, + "grad_norm": 0.32118678092956543, + "learning_rate": 8.30010910550611e-07, + "loss": 0.3173, + "step": 5370 + }, + { + "epoch": 2.4977522864672146, + "grad_norm": 0.329588383436203, + "learning_rate": 8.285184726360412e-07, + "loss": 0.3409, + "step": 5371 + }, + { + "epoch": 2.4982173306464115, + "grad_norm": 0.29638898372650146, + "learning_rate": 8.27027256480653e-07, + "loss": 0.2994, + "step": 5372 + }, + { + "epoch": 2.4986823748256084, + "grad_norm": 0.3159889578819275, + "learning_rate": 8.255372625212005e-07, + "loss": 0.3076, + "step": 5373 + }, + { + "epoch": 2.4991474190048053, + "grad_norm": 0.3349873423576355, + "learning_rate": 8.240484911940755e-07, + "loss": 0.3355, + "step": 5374 + }, + { + "epoch": 2.4996124631840027, + "grad_norm": 0.34741565585136414, + "learning_rate": 8.225609429353187e-07, + "loss": 0.3413, + "step": 5375 + }, + { + "epoch": 2.5000775073631996, + "grad_norm": 0.3568498492240906, + "learning_rate": 8.210746181806051e-07, + "loss": 0.3315, + "step": 5376 + }, + { + "epoch": 2.5005425515423965, + "grad_norm": 0.3185107409954071, + "learning_rate": 8.195895173652585e-07, + "loss": 0.2859, + "step": 5377 + }, + { + "epoch": 2.5010075957215934, + "grad_norm": 0.32899004220962524, + "learning_rate": 8.181056409242377e-07, + "loss": 0.3093, + "step": 5378 + }, + { + "epoch": 2.5014726399007907, + "grad_norm": 0.30876079201698303, + "learning_rate": 8.16622989292149e-07, + "loss": 0.3068, + "step": 5379 + }, + { + "epoch": 2.5019376840799876, + "grad_norm": 0.31439492106437683, + "learning_rate": 8.151415629032338e-07, + "loss": 0.3405, + "step": 5380 + }, + { + "epoch": 2.5024027282591845, + "grad_norm": 0.3182373344898224, + "learning_rate": 8.136613621913813e-07, + "loss": 0.3143, + "step": 5381 + }, + { + "epoch": 2.502867772438382, + "grad_norm": 0.3052506744861603, + "learning_rate": 8.121823875901152e-07, + "loss": 0.32, + "step": 5382 + }, + { + "epoch": 2.503332816617579, + "grad_norm": 0.33077898621559143, + "learning_rate": 8.107046395326041e-07, + "loss": 0.3325, + "step": 5383 + }, + { + "epoch": 2.5037978607967757, + "grad_norm": 0.31627658009529114, + "learning_rate": 8.092281184516571e-07, + "loss": 0.2717, + "step": 5384 + }, + { + "epoch": 2.5042629049759726, + "grad_norm": 0.34410303831100464, + "learning_rate": 8.077528247797234e-07, + "loss": 0.3265, + "step": 5385 + }, + { + "epoch": 2.5047279491551695, + "grad_norm": 0.3441540002822876, + "learning_rate": 8.062787589488913e-07, + "loss": 0.3192, + "step": 5386 + }, + { + "epoch": 2.505192993334367, + "grad_norm": 0.3462598919868469, + "learning_rate": 8.048059213908927e-07, + "loss": 0.3397, + "step": 5387 + }, + { + "epoch": 2.5056580375135638, + "grad_norm": 0.302598774433136, + "learning_rate": 8.033343125370952e-07, + "loss": 0.2927, + "step": 5388 + }, + { + "epoch": 2.5061230816927607, + "grad_norm": 0.33706727623939514, + "learning_rate": 8.018639328185113e-07, + "loss": 0.3264, + "step": 5389 + }, + { + "epoch": 2.506588125871958, + "grad_norm": 0.3481765687465668, + "learning_rate": 8.003947826657898e-07, + "loss": 0.3415, + "step": 5390 + }, + { + "epoch": 2.507053170051155, + "grad_norm": 0.31380075216293335, + "learning_rate": 7.989268625092223e-07, + "loss": 0.2639, + "step": 5391 + }, + { + "epoch": 2.507518214230352, + "grad_norm": 0.34795790910720825, + "learning_rate": 7.974601727787374e-07, + "loss": 0.3427, + "step": 5392 + }, + { + "epoch": 2.507983258409549, + "grad_norm": 0.32209163904190063, + "learning_rate": 7.959947139039065e-07, + "loss": 0.3024, + "step": 5393 + }, + { + "epoch": 2.508448302588746, + "grad_norm": 0.3540125787258148, + "learning_rate": 7.945304863139358e-07, + "loss": 0.3565, + "step": 5394 + }, + { + "epoch": 2.508913346767943, + "grad_norm": 0.37601977586746216, + "learning_rate": 7.930674904376762e-07, + "loss": 0.3115, + "step": 5395 + }, + { + "epoch": 2.50937839094714, + "grad_norm": 0.33575406670570374, + "learning_rate": 7.916057267036159e-07, + "loss": 0.3297, + "step": 5396 + }, + { + "epoch": 2.509843435126337, + "grad_norm": 0.32133379578590393, + "learning_rate": 7.901451955398792e-07, + "loss": 0.3136, + "step": 5397 + }, + { + "epoch": 2.510308479305534, + "grad_norm": 0.3117270767688751, + "learning_rate": 7.886858973742334e-07, + "loss": 0.2933, + "step": 5398 + }, + { + "epoch": 2.510773523484731, + "grad_norm": 0.3320932984352112, + "learning_rate": 7.872278326340849e-07, + "loss": 0.3404, + "step": 5399 + }, + { + "epoch": 2.511238567663928, + "grad_norm": 0.3289964497089386, + "learning_rate": 7.857710017464737e-07, + "loss": 0.3172, + "step": 5400 + }, + { + "epoch": 2.5117036118431253, + "grad_norm": 0.32881319522857666, + "learning_rate": 7.843154051380852e-07, + "loss": 0.3287, + "step": 5401 + }, + { + "epoch": 2.512168656022322, + "grad_norm": 0.3330029547214508, + "learning_rate": 7.828610432352373e-07, + "loss": 0.2865, + "step": 5402 + }, + { + "epoch": 2.512633700201519, + "grad_norm": 0.3702026307582855, + "learning_rate": 7.814079164638915e-07, + "loss": 0.3376, + "step": 5403 + }, + { + "epoch": 2.513098744380716, + "grad_norm": 0.3574296832084656, + "learning_rate": 7.799560252496424e-07, + "loss": 0.3269, + "step": 5404 + }, + { + "epoch": 2.513563788559913, + "grad_norm": 0.3529759645462036, + "learning_rate": 7.785053700177275e-07, + "loss": 0.3205, + "step": 5405 + }, + { + "epoch": 2.5140288327391103, + "grad_norm": 0.33158957958221436, + "learning_rate": 7.770559511930187e-07, + "loss": 0.3341, + "step": 5406 + }, + { + "epoch": 2.514493876918307, + "grad_norm": 0.3311402499675751, + "learning_rate": 7.756077692000274e-07, + "loss": 0.3263, + "step": 5407 + }, + { + "epoch": 2.514958921097504, + "grad_norm": 0.377869188785553, + "learning_rate": 7.741608244629045e-07, + "loss": 0.3237, + "step": 5408 + }, + { + "epoch": 2.5154239652767014, + "grad_norm": 0.3278749883174896, + "learning_rate": 7.727151174054342e-07, + "loss": 0.3334, + "step": 5409 + }, + { + "epoch": 2.5158890094558983, + "grad_norm": 0.34262946248054504, + "learning_rate": 7.712706484510424e-07, + "loss": 0.332, + "step": 5410 + }, + { + "epoch": 2.5163540536350952, + "grad_norm": 0.32814016938209534, + "learning_rate": 7.698274180227888e-07, + "loss": 0.2826, + "step": 5411 + }, + { + "epoch": 2.5168190978142926, + "grad_norm": 0.3465859293937683, + "learning_rate": 7.683854265433737e-07, + "loss": 0.3271, + "step": 5412 + }, + { + "epoch": 2.5172841419934895, + "grad_norm": 0.33419957756996155, + "learning_rate": 7.669446744351317e-07, + "loss": 0.2999, + "step": 5413 + }, + { + "epoch": 2.5177491861726864, + "grad_norm": 0.3603866994380951, + "learning_rate": 7.655051621200377e-07, + "loss": 0.3173, + "step": 5414 + }, + { + "epoch": 2.5182142303518833, + "grad_norm": 0.33212265372276306, + "learning_rate": 7.640668900196985e-07, + "loss": 0.292, + "step": 5415 + }, + { + "epoch": 2.51867927453108, + "grad_norm": 0.35766664147377014, + "learning_rate": 7.626298585553637e-07, + "loss": 0.3385, + "step": 5416 + }, + { + "epoch": 2.5191443187102776, + "grad_norm": 0.35297131538391113, + "learning_rate": 7.611940681479141e-07, + "loss": 0.3162, + "step": 5417 + }, + { + "epoch": 2.5196093628894745, + "grad_norm": 0.3046947419643402, + "learning_rate": 7.597595192178702e-07, + "loss": 0.2764, + "step": 5418 + }, + { + "epoch": 2.5200744070686714, + "grad_norm": 0.38086947798728943, + "learning_rate": 7.583262121853879e-07, + "loss": 0.3521, + "step": 5419 + }, + { + "epoch": 2.5205394512478687, + "grad_norm": 0.357149600982666, + "learning_rate": 7.568941474702618e-07, + "loss": 0.3225, + "step": 5420 + }, + { + "epoch": 2.5210044954270656, + "grad_norm": 0.32697343826293945, + "learning_rate": 7.554633254919169e-07, + "loss": 0.3138, + "step": 5421 + }, + { + "epoch": 2.5214695396062625, + "grad_norm": 0.3249689042568207, + "learning_rate": 7.540337466694203e-07, + "loss": 0.3149, + "step": 5422 + }, + { + "epoch": 2.52193458378546, + "grad_norm": 0.3241865634918213, + "learning_rate": 7.526054114214704e-07, + "loss": 0.3102, + "step": 5423 + }, + { + "epoch": 2.5223996279646568, + "grad_norm": 0.33546963334083557, + "learning_rate": 7.511783201664053e-07, + "loss": 0.34, + "step": 5424 + }, + { + "epoch": 2.5228646721438537, + "grad_norm": 0.3344916105270386, + "learning_rate": 7.49752473322195e-07, + "loss": 0.3078, + "step": 5425 + }, + { + "epoch": 2.5233297163230506, + "grad_norm": 0.3365899622440338, + "learning_rate": 7.48327871306449e-07, + "loss": 0.3172, + "step": 5426 + }, + { + "epoch": 2.5237947605022475, + "grad_norm": 0.3597327768802643, + "learning_rate": 7.469045145364079e-07, + "loss": 0.3203, + "step": 5427 + }, + { + "epoch": 2.524259804681445, + "grad_norm": 0.3610391914844513, + "learning_rate": 7.454824034289515e-07, + "loss": 0.2926, + "step": 5428 + }, + { + "epoch": 2.5247248488606417, + "grad_norm": 0.33750978112220764, + "learning_rate": 7.440615384005917e-07, + "loss": 0.288, + "step": 5429 + }, + { + "epoch": 2.5251898930398387, + "grad_norm": 0.35543230175971985, + "learning_rate": 7.426419198674773e-07, + "loss": 0.3081, + "step": 5430 + }, + { + "epoch": 2.525654937219036, + "grad_norm": 0.3200836777687073, + "learning_rate": 7.412235482453911e-07, + "loss": 0.3226, + "step": 5431 + }, + { + "epoch": 2.526119981398233, + "grad_norm": 0.46929338574409485, + "learning_rate": 7.398064239497538e-07, + "loss": 0.3042, + "step": 5432 + }, + { + "epoch": 2.52658502557743, + "grad_norm": 0.31712985038757324, + "learning_rate": 7.383905473956137e-07, + "loss": 0.313, + "step": 5433 + }, + { + "epoch": 2.5270500697566267, + "grad_norm": 0.31012576818466187, + "learning_rate": 7.369759189976622e-07, + "loss": 0.3, + "step": 5434 + }, + { + "epoch": 2.527515113935824, + "grad_norm": 0.3287335932254791, + "learning_rate": 7.355625391702176e-07, + "loss": 0.3219, + "step": 5435 + }, + { + "epoch": 2.527980158115021, + "grad_norm": 0.3296997547149658, + "learning_rate": 7.341504083272388e-07, + "loss": 0.3344, + "step": 5436 + }, + { + "epoch": 2.528445202294218, + "grad_norm": 0.3109728693962097, + "learning_rate": 7.327395268823128e-07, + "loss": 0.323, + "step": 5437 + }, + { + "epoch": 2.528910246473415, + "grad_norm": 0.2954341471195221, + "learning_rate": 7.313298952486675e-07, + "loss": 0.2939, + "step": 5438 + }, + { + "epoch": 2.529375290652612, + "grad_norm": 0.3587803542613983, + "learning_rate": 7.299215138391574e-07, + "loss": 0.3491, + "step": 5439 + }, + { + "epoch": 2.529840334831809, + "grad_norm": 0.29528895020484924, + "learning_rate": 7.285143830662778e-07, + "loss": 0.2828, + "step": 5440 + }, + { + "epoch": 2.530305379011006, + "grad_norm": 0.33470654487609863, + "learning_rate": 7.271085033421516e-07, + "loss": 0.3367, + "step": 5441 + }, + { + "epoch": 2.5307704231902033, + "grad_norm": 0.3154504895210266, + "learning_rate": 7.2570387507854e-07, + "loss": 0.2794, + "step": 5442 + }, + { + "epoch": 2.5312354673694, + "grad_norm": 0.34760943055152893, + "learning_rate": 7.243004986868357e-07, + "loss": 0.3367, + "step": 5443 + }, + { + "epoch": 2.531700511548597, + "grad_norm": 0.33287641406059265, + "learning_rate": 7.228983745780643e-07, + "loss": 0.3303, + "step": 5444 + }, + { + "epoch": 2.532165555727794, + "grad_norm": 0.3180674612522125, + "learning_rate": 7.214975031628856e-07, + "loss": 0.2925, + "step": 5445 + }, + { + "epoch": 2.532630599906991, + "grad_norm": 0.3350416421890259, + "learning_rate": 7.200978848515911e-07, + "loss": 0.34, + "step": 5446 + }, + { + "epoch": 2.5330956440861883, + "grad_norm": 0.347086101770401, + "learning_rate": 7.186995200541086e-07, + "loss": 0.3296, + "step": 5447 + }, + { + "epoch": 2.533560688265385, + "grad_norm": 0.3760274350643158, + "learning_rate": 7.17302409179993e-07, + "loss": 0.349, + "step": 5448 + }, + { + "epoch": 2.534025732444582, + "grad_norm": 0.3314683437347412, + "learning_rate": 7.159065526384384e-07, + "loss": 0.2927, + "step": 5449 + }, + { + "epoch": 2.5344907766237794, + "grad_norm": 0.309643417596817, + "learning_rate": 7.145119508382664e-07, + "loss": 0.2969, + "step": 5450 + }, + { + "epoch": 2.5349558208029763, + "grad_norm": 0.4211340546607971, + "learning_rate": 7.131186041879357e-07, + "loss": 0.3603, + "step": 5451 + }, + { + "epoch": 2.5354208649821732, + "grad_norm": 0.3383927643299103, + "learning_rate": 7.117265130955314e-07, + "loss": 0.3306, + "step": 5452 + }, + { + "epoch": 2.5358859091613706, + "grad_norm": 0.30522429943084717, + "learning_rate": 7.10335677968777e-07, + "loss": 0.2957, + "step": 5453 + }, + { + "epoch": 2.5363509533405675, + "grad_norm": 0.31220120191574097, + "learning_rate": 7.089460992150243e-07, + "loss": 0.3003, + "step": 5454 + }, + { + "epoch": 2.5368159975197644, + "grad_norm": 0.36357051134109497, + "learning_rate": 7.075577772412607e-07, + "loss": 0.3585, + "step": 5455 + }, + { + "epoch": 2.5372810416989613, + "grad_norm": 0.3370119333267212, + "learning_rate": 7.061707124540995e-07, + "loss": 0.3196, + "step": 5456 + }, + { + "epoch": 2.537746085878158, + "grad_norm": 0.33187320828437805, + "learning_rate": 7.047849052597927e-07, + "loss": 0.3124, + "step": 5457 + }, + { + "epoch": 2.5382111300573555, + "grad_norm": 0.3284560739994049, + "learning_rate": 7.034003560642183e-07, + "loss": 0.321, + "step": 5458 + }, + { + "epoch": 2.5386761742365525, + "grad_norm": 0.31819069385528564, + "learning_rate": 7.020170652728903e-07, + "loss": 0.3049, + "step": 5459 + }, + { + "epoch": 2.5391412184157494, + "grad_norm": 0.2943916618824005, + "learning_rate": 7.006350332909495e-07, + "loss": 0.2641, + "step": 5460 + }, + { + "epoch": 2.5396062625949467, + "grad_norm": 0.3456489145755768, + "learning_rate": 6.992542605231739e-07, + "loss": 0.367, + "step": 5461 + }, + { + "epoch": 2.5400713067741436, + "grad_norm": 0.30786919593811035, + "learning_rate": 6.978747473739666e-07, + "loss": 0.3087, + "step": 5462 + }, + { + "epoch": 2.5405363509533405, + "grad_norm": 0.3164925277233124, + "learning_rate": 6.964964942473662e-07, + "loss": 0.3066, + "step": 5463 + }, + { + "epoch": 2.541001395132538, + "grad_norm": 0.32444530725479126, + "learning_rate": 6.951195015470396e-07, + "loss": 0.3156, + "step": 5464 + }, + { + "epoch": 2.5414664393117348, + "grad_norm": 0.3363860845565796, + "learning_rate": 6.937437696762861e-07, + "loss": 0.3014, + "step": 5465 + }, + { + "epoch": 2.5419314834909317, + "grad_norm": 0.34026390314102173, + "learning_rate": 6.923692990380349e-07, + "loss": 0.3131, + "step": 5466 + }, + { + "epoch": 2.5423965276701286, + "grad_norm": 0.31564757227897644, + "learning_rate": 6.909960900348483e-07, + "loss": 0.3066, + "step": 5467 + }, + { + "epoch": 2.5428615718493255, + "grad_norm": 0.3490791618824005, + "learning_rate": 6.896241430689133e-07, + "loss": 0.3221, + "step": 5468 + }, + { + "epoch": 2.543326616028523, + "grad_norm": 0.346028208732605, + "learning_rate": 6.882534585420542e-07, + "loss": 0.3227, + "step": 5469 + }, + { + "epoch": 2.5437916602077197, + "grad_norm": 0.3382766842842102, + "learning_rate": 6.868840368557194e-07, + "loss": 0.3471, + "step": 5470 + }, + { + "epoch": 2.5442567043869166, + "grad_norm": 0.30995672941207886, + "learning_rate": 6.855158784109927e-07, + "loss": 0.3273, + "step": 5471 + }, + { + "epoch": 2.544721748566114, + "grad_norm": 0.3215526342391968, + "learning_rate": 6.841489836085835e-07, + "loss": 0.3241, + "step": 5472 + }, + { + "epoch": 2.545186792745311, + "grad_norm": 0.3126165270805359, + "learning_rate": 6.827833528488348e-07, + "loss": 0.2946, + "step": 5473 + }, + { + "epoch": 2.545651836924508, + "grad_norm": 0.3339928388595581, + "learning_rate": 6.814189865317156e-07, + "loss": 0.3383, + "step": 5474 + }, + { + "epoch": 2.5461168811037047, + "grad_norm": 0.30406853556632996, + "learning_rate": 6.800558850568295e-07, + "loss": 0.3031, + "step": 5475 + }, + { + "epoch": 2.5465819252829016, + "grad_norm": 0.3174978494644165, + "learning_rate": 6.786940488234034e-07, + "loss": 0.3134, + "step": 5476 + }, + { + "epoch": 2.547046969462099, + "grad_norm": 0.32636481523513794, + "learning_rate": 6.773334782302993e-07, + "loss": 0.3233, + "step": 5477 + }, + { + "epoch": 2.547512013641296, + "grad_norm": 0.31358960270881653, + "learning_rate": 6.759741736760062e-07, + "loss": 0.2805, + "step": 5478 + }, + { + "epoch": 2.5479770578204928, + "grad_norm": 0.36361631751060486, + "learning_rate": 6.746161355586411e-07, + "loss": 0.3479, + "step": 5479 + }, + { + "epoch": 2.54844210199969, + "grad_norm": 0.3181024193763733, + "learning_rate": 6.732593642759533e-07, + "loss": 0.2933, + "step": 5480 + }, + { + "epoch": 2.548907146178887, + "grad_norm": 0.3153908848762512, + "learning_rate": 6.719038602253164e-07, + "loss": 0.3188, + "step": 5481 + }, + { + "epoch": 2.549372190358084, + "grad_norm": 0.3370981216430664, + "learning_rate": 6.705496238037379e-07, + "loss": 0.3397, + "step": 5482 + }, + { + "epoch": 2.5498372345372813, + "grad_norm": 0.32690051198005676, + "learning_rate": 6.691966554078494e-07, + "loss": 0.2883, + "step": 5483 + }, + { + "epoch": 2.550302278716478, + "grad_norm": 0.35293567180633545, + "learning_rate": 6.678449554339161e-07, + "loss": 0.3367, + "step": 5484 + }, + { + "epoch": 2.550767322895675, + "grad_norm": 0.2994639575481415, + "learning_rate": 6.664945242778264e-07, + "loss": 0.3071, + "step": 5485 + }, + { + "epoch": 2.551232367074872, + "grad_norm": 0.30928120017051697, + "learning_rate": 6.651453623351017e-07, + "loss": 0.3188, + "step": 5486 + }, + { + "epoch": 2.551697411254069, + "grad_norm": 0.3208847939968109, + "learning_rate": 6.637974700008876e-07, + "loss": 0.3407, + "step": 5487 + }, + { + "epoch": 2.5521624554332663, + "grad_norm": 0.3318404257297516, + "learning_rate": 6.624508476699609e-07, + "loss": 0.3058, + "step": 5488 + }, + { + "epoch": 2.552627499612463, + "grad_norm": 0.34476324915885925, + "learning_rate": 6.611054957367253e-07, + "loss": 0.2937, + "step": 5489 + }, + { + "epoch": 2.55309254379166, + "grad_norm": 0.3424232304096222, + "learning_rate": 6.597614145952136e-07, + "loss": 0.3149, + "step": 5490 + }, + { + "epoch": 2.5535575879708574, + "grad_norm": 0.336582750082016, + "learning_rate": 6.584186046390839e-07, + "loss": 0.3275, + "step": 5491 + }, + { + "epoch": 2.5540226321500543, + "grad_norm": 0.30492153763771057, + "learning_rate": 6.570770662616244e-07, + "loss": 0.3226, + "step": 5492 + }, + { + "epoch": 2.5544876763292512, + "grad_norm": 0.344941645860672, + "learning_rate": 6.557367998557485e-07, + "loss": 0.3153, + "step": 5493 + }, + { + "epoch": 2.5549527205084486, + "grad_norm": 0.3531348407268524, + "learning_rate": 6.543978058140005e-07, + "loss": 0.3301, + "step": 5494 + }, + { + "epoch": 2.5554177646876455, + "grad_norm": 0.3221784234046936, + "learning_rate": 6.530600845285478e-07, + "loss": 0.3318, + "step": 5495 + }, + { + "epoch": 2.5558828088668424, + "grad_norm": 0.30838826298713684, + "learning_rate": 6.517236363911894e-07, + "loss": 0.2999, + "step": 5496 + }, + { + "epoch": 2.5563478530460393, + "grad_norm": 0.3686372935771942, + "learning_rate": 6.503884617933471e-07, + "loss": 0.3305, + "step": 5497 + }, + { + "epoch": 2.556812897225236, + "grad_norm": 0.31591683626174927, + "learning_rate": 6.490545611260741e-07, + "loss": 0.3122, + "step": 5498 + }, + { + "epoch": 2.5572779414044335, + "grad_norm": 0.34594687819480896, + "learning_rate": 6.477219347800462e-07, + "loss": 0.3461, + "step": 5499 + }, + { + "epoch": 2.5577429855836304, + "grad_norm": 0.31076663732528687, + "learning_rate": 6.463905831455685e-07, + "loss": 0.2906, + "step": 5500 + }, + { + "epoch": 2.5582080297628274, + "grad_norm": 0.3259563446044922, + "learning_rate": 6.450605066125726e-07, + "loss": 0.3008, + "step": 5501 + }, + { + "epoch": 2.5586730739420247, + "grad_norm": 0.31875672936439514, + "learning_rate": 6.437317055706172e-07, + "loss": 0.3028, + "step": 5502 + }, + { + "epoch": 2.5591381181212216, + "grad_norm": 0.3503369390964508, + "learning_rate": 6.424041804088848e-07, + "loss": 0.3006, + "step": 5503 + }, + { + "epoch": 2.5596031623004185, + "grad_norm": 0.3309253454208374, + "learning_rate": 6.410779315161885e-07, + "loss": 0.3309, + "step": 5504 + }, + { + "epoch": 2.5600682064796154, + "grad_norm": 0.30641940236091614, + "learning_rate": 6.397529592809615e-07, + "loss": 0.3159, + "step": 5505 + }, + { + "epoch": 2.5605332506588123, + "grad_norm": 0.4112693965435028, + "learning_rate": 6.384292640912704e-07, + "loss": 0.328, + "step": 5506 + }, + { + "epoch": 2.5609982948380097, + "grad_norm": 0.3142835199832916, + "learning_rate": 6.371068463348006e-07, + "loss": 0.3059, + "step": 5507 + }, + { + "epoch": 2.5614633390172066, + "grad_norm": 0.32879573106765747, + "learning_rate": 6.357857063988692e-07, + "loss": 0.3145, + "step": 5508 + }, + { + "epoch": 2.5619283831964035, + "grad_norm": 0.3391149640083313, + "learning_rate": 6.344658446704155e-07, + "loss": 0.3231, + "step": 5509 + }, + { + "epoch": 2.562393427375601, + "grad_norm": 0.30871260166168213, + "learning_rate": 6.331472615360062e-07, + "loss": 0.3248, + "step": 5510 + }, + { + "epoch": 2.5628584715547977, + "grad_norm": 0.32863956689834595, + "learning_rate": 6.318299573818315e-07, + "loss": 0.2954, + "step": 5511 + }, + { + "epoch": 2.5633235157339946, + "grad_norm": 0.3561699688434601, + "learning_rate": 6.305139325937098e-07, + "loss": 0.3346, + "step": 5512 + }, + { + "epoch": 2.563788559913192, + "grad_norm": 0.32806986570358276, + "learning_rate": 6.291991875570841e-07, + "loss": 0.2992, + "step": 5513 + }, + { + "epoch": 2.564253604092389, + "grad_norm": 0.4038988947868347, + "learning_rate": 6.278857226570196e-07, + "loss": 0.3525, + "step": 5514 + }, + { + "epoch": 2.564718648271586, + "grad_norm": 0.35461750626564026, + "learning_rate": 6.265735382782106e-07, + "loss": 0.3138, + "step": 5515 + }, + { + "epoch": 2.5651836924507827, + "grad_norm": 0.3094637393951416, + "learning_rate": 6.252626348049734e-07, + "loss": 0.305, + "step": 5516 + }, + { + "epoch": 2.5656487366299796, + "grad_norm": 0.3154189884662628, + "learning_rate": 6.239530126212518e-07, + "loss": 0.3293, + "step": 5517 + }, + { + "epoch": 2.566113780809177, + "grad_norm": 0.3151610493659973, + "learning_rate": 6.226446721106111e-07, + "loss": 0.3055, + "step": 5518 + }, + { + "epoch": 2.566578824988374, + "grad_norm": 0.3299187421798706, + "learning_rate": 6.213376136562449e-07, + "loss": 0.3182, + "step": 5519 + }, + { + "epoch": 2.5670438691675708, + "grad_norm": 0.34355783462524414, + "learning_rate": 6.20031837640967e-07, + "loss": 0.3599, + "step": 5520 + }, + { + "epoch": 2.567508913346768, + "grad_norm": 0.3010472059249878, + "learning_rate": 6.187273444472202e-07, + "loss": 0.2715, + "step": 5521 + }, + { + "epoch": 2.567973957525965, + "grad_norm": 0.3234943151473999, + "learning_rate": 6.174241344570681e-07, + "loss": 0.3174, + "step": 5522 + }, + { + "epoch": 2.568439001705162, + "grad_norm": 0.3738628327846527, + "learning_rate": 6.161222080522e-07, + "loss": 0.3164, + "step": 5523 + }, + { + "epoch": 2.5689040458843593, + "grad_norm": 0.31000205874443054, + "learning_rate": 6.14821565613929e-07, + "loss": 0.3146, + "step": 5524 + }, + { + "epoch": 2.569369090063556, + "grad_norm": 0.34590592980384827, + "learning_rate": 6.135222075231933e-07, + "loss": 0.3303, + "step": 5525 + }, + { + "epoch": 2.569834134242753, + "grad_norm": 0.3145350217819214, + "learning_rate": 6.122241341605523e-07, + "loss": 0.3044, + "step": 5526 + }, + { + "epoch": 2.57029917842195, + "grad_norm": 0.3343406021595001, + "learning_rate": 6.109273459061916e-07, + "loss": 0.3601, + "step": 5527 + }, + { + "epoch": 2.570764222601147, + "grad_norm": 0.30140528082847595, + "learning_rate": 6.096318431399178e-07, + "loss": 0.3012, + "step": 5528 + }, + { + "epoch": 2.5712292667803442, + "grad_norm": 0.34568503499031067, + "learning_rate": 6.083376262411644e-07, + "loss": 0.3658, + "step": 5529 + }, + { + "epoch": 2.571694310959541, + "grad_norm": 0.33937206864356995, + "learning_rate": 6.070446955889853e-07, + "loss": 0.3469, + "step": 5530 + }, + { + "epoch": 2.572159355138738, + "grad_norm": 0.3581177592277527, + "learning_rate": 6.057530515620608e-07, + "loss": 0.3144, + "step": 5531 + }, + { + "epoch": 2.5726243993179354, + "grad_norm": 0.29124531149864197, + "learning_rate": 6.044626945386894e-07, + "loss": 0.3194, + "step": 5532 + }, + { + "epoch": 2.5730894434971323, + "grad_norm": 0.30630677938461304, + "learning_rate": 6.031736248967984e-07, + "loss": 0.3128, + "step": 5533 + }, + { + "epoch": 2.573554487676329, + "grad_norm": 0.3839922547340393, + "learning_rate": 6.018858430139335e-07, + "loss": 0.3398, + "step": 5534 + }, + { + "epoch": 2.574019531855526, + "grad_norm": 0.336647093296051, + "learning_rate": 6.005993492672657e-07, + "loss": 0.2957, + "step": 5535 + }, + { + "epoch": 2.574484576034723, + "grad_norm": 0.3169485330581665, + "learning_rate": 5.993141440335887e-07, + "loss": 0.3208, + "step": 5536 + }, + { + "epoch": 2.5749496202139204, + "grad_norm": 0.3146721124649048, + "learning_rate": 5.980302276893191e-07, + "loss": 0.3066, + "step": 5537 + }, + { + "epoch": 2.5754146643931173, + "grad_norm": 0.32376930117607117, + "learning_rate": 5.967476006104922e-07, + "loss": 0.3335, + "step": 5538 + }, + { + "epoch": 2.575879708572314, + "grad_norm": 0.33218416571617126, + "learning_rate": 5.95466263172772e-07, + "loss": 0.3186, + "step": 5539 + }, + { + "epoch": 2.5763447527515115, + "grad_norm": 0.31140658259391785, + "learning_rate": 5.941862157514383e-07, + "loss": 0.3257, + "step": 5540 + }, + { + "epoch": 2.5768097969307084, + "grad_norm": 0.324327677488327, + "learning_rate": 5.92907458721399e-07, + "loss": 0.3425, + "step": 5541 + }, + { + "epoch": 2.5772748411099053, + "grad_norm": 0.3449290692806244, + "learning_rate": 5.916299924571789e-07, + "loss": 0.3705, + "step": 5542 + }, + { + "epoch": 2.5777398852891027, + "grad_norm": 0.32734236121177673, + "learning_rate": 5.903538173329287e-07, + "loss": 0.2884, + "step": 5543 + }, + { + "epoch": 2.5782049294682996, + "grad_norm": 0.33377382159233093, + "learning_rate": 5.890789337224184e-07, + "loss": 0.3193, + "step": 5544 + }, + { + "epoch": 2.5786699736474965, + "grad_norm": 0.31567618250846863, + "learning_rate": 5.87805341999042e-07, + "loss": 0.3011, + "step": 5545 + }, + { + "epoch": 2.5791350178266934, + "grad_norm": 0.3374476134777069, + "learning_rate": 5.865330425358118e-07, + "loss": 0.3549, + "step": 5546 + }, + { + "epoch": 2.5796000620058903, + "grad_norm": 0.3086042106151581, + "learning_rate": 5.852620357053651e-07, + "loss": 0.3016, + "step": 5547 + }, + { + "epoch": 2.5800651061850877, + "grad_norm": 0.31613045930862427, + "learning_rate": 5.839923218799587e-07, + "loss": 0.3169, + "step": 5548 + }, + { + "epoch": 2.5805301503642846, + "grad_norm": 0.2952466309070587, + "learning_rate": 5.827239014314723e-07, + "loss": 0.3013, + "step": 5549 + }, + { + "epoch": 2.5809951945434815, + "grad_norm": 0.32223108410835266, + "learning_rate": 5.814567747314049e-07, + "loss": 0.3347, + "step": 5550 + }, + { + "epoch": 2.581460238722679, + "grad_norm": 0.3356245458126068, + "learning_rate": 5.801909421508756e-07, + "loss": 0.3239, + "step": 5551 + }, + { + "epoch": 2.5819252829018757, + "grad_norm": 0.35723865032196045, + "learning_rate": 5.789264040606291e-07, + "loss": 0.3591, + "step": 5552 + }, + { + "epoch": 2.5823903270810726, + "grad_norm": 0.3004617989063263, + "learning_rate": 5.776631608310257e-07, + "loss": 0.3046, + "step": 5553 + }, + { + "epoch": 2.58285537126027, + "grad_norm": 0.35467642545700073, + "learning_rate": 5.764012128320507e-07, + "loss": 0.3802, + "step": 5554 + }, + { + "epoch": 2.583320415439467, + "grad_norm": 0.3245300054550171, + "learning_rate": 5.75140560433306e-07, + "loss": 0.2898, + "step": 5555 + }, + { + "epoch": 2.583785459618664, + "grad_norm": 0.342220276594162, + "learning_rate": 5.738812040040187e-07, + "loss": 0.3478, + "step": 5556 + }, + { + "epoch": 2.5842505037978607, + "grad_norm": 0.3202744126319885, + "learning_rate": 5.726231439130314e-07, + "loss": 0.3162, + "step": 5557 + }, + { + "epoch": 2.5847155479770576, + "grad_norm": 0.33512887358665466, + "learning_rate": 5.713663805288106e-07, + "loss": 0.308, + "step": 5558 + }, + { + "epoch": 2.585180592156255, + "grad_norm": 0.33824869990348816, + "learning_rate": 5.701109142194422e-07, + "loss": 0.308, + "step": 5559 + }, + { + "epoch": 2.585645636335452, + "grad_norm": 0.3539738953113556, + "learning_rate": 5.688567453526328e-07, + "loss": 0.3317, + "step": 5560 + }, + { + "epoch": 2.5861106805146488, + "grad_norm": 0.32088860869407654, + "learning_rate": 5.676038742957057e-07, + "loss": 0.3272, + "step": 5561 + }, + { + "epoch": 2.586575724693846, + "grad_norm": 0.33904799818992615, + "learning_rate": 5.663523014156086e-07, + "loss": 0.2933, + "step": 5562 + }, + { + "epoch": 2.587040768873043, + "grad_norm": 0.3523750901222229, + "learning_rate": 5.651020270789049e-07, + "loss": 0.2996, + "step": 5563 + }, + { + "epoch": 2.58750581305224, + "grad_norm": 0.32999008893966675, + "learning_rate": 5.638530516517821e-07, + "loss": 0.3068, + "step": 5564 + }, + { + "epoch": 2.587970857231437, + "grad_norm": 0.3218596279621124, + "learning_rate": 5.626053755000421e-07, + "loss": 0.32, + "step": 5565 + }, + { + "epoch": 2.588435901410634, + "grad_norm": 0.32146209478378296, + "learning_rate": 5.613589989891116e-07, + "loss": 0.3225, + "step": 5566 + }, + { + "epoch": 2.588900945589831, + "grad_norm": 0.2964208424091339, + "learning_rate": 5.601139224840318e-07, + "loss": 0.2975, + "step": 5567 + }, + { + "epoch": 2.589365989769028, + "grad_norm": 0.332450270652771, + "learning_rate": 5.588701463494672e-07, + "loss": 0.3431, + "step": 5568 + }, + { + "epoch": 2.589831033948225, + "grad_norm": 0.28528085350990295, + "learning_rate": 5.576276709496975e-07, + "loss": 0.2784, + "step": 5569 + }, + { + "epoch": 2.5902960781274222, + "grad_norm": 0.3144235908985138, + "learning_rate": 5.563864966486254e-07, + "loss": 0.3402, + "step": 5570 + }, + { + "epoch": 2.590761122306619, + "grad_norm": 0.35045549273490906, + "learning_rate": 5.551466238097697e-07, + "loss": 0.3278, + "step": 5571 + }, + { + "epoch": 2.591226166485816, + "grad_norm": 0.3082601726055145, + "learning_rate": 5.539080527962704e-07, + "loss": 0.3108, + "step": 5572 + }, + { + "epoch": 2.5916912106650134, + "grad_norm": 0.30737003684043884, + "learning_rate": 5.526707839708834e-07, + "loss": 0.3004, + "step": 5573 + }, + { + "epoch": 2.5921562548442103, + "grad_norm": 0.34509941935539246, + "learning_rate": 5.514348176959855e-07, + "loss": 0.3588, + "step": 5574 + }, + { + "epoch": 2.592621299023407, + "grad_norm": 0.33185267448425293, + "learning_rate": 5.5020015433357e-07, + "loss": 0.3188, + "step": 5575 + }, + { + "epoch": 2.593086343202604, + "grad_norm": 0.32632753252983093, + "learning_rate": 5.489667942452515e-07, + "loss": 0.2991, + "step": 5576 + }, + { + "epoch": 2.593551387381801, + "grad_norm": 0.31657204031944275, + "learning_rate": 5.477347377922593e-07, + "loss": 0.3415, + "step": 5577 + }, + { + "epoch": 2.5940164315609984, + "grad_norm": 0.3294719457626343, + "learning_rate": 5.465039853354442e-07, + "loss": 0.3144, + "step": 5578 + }, + { + "epoch": 2.5944814757401953, + "grad_norm": 0.3257904052734375, + "learning_rate": 5.452745372352725e-07, + "loss": 0.2761, + "step": 5579 + }, + { + "epoch": 2.594946519919392, + "grad_norm": 0.3162056505680084, + "learning_rate": 5.440463938518304e-07, + "loss": 0.333, + "step": 5580 + }, + { + "epoch": 2.5954115640985895, + "grad_norm": 0.3013499081134796, + "learning_rate": 5.428195555448202e-07, + "loss": 0.3004, + "step": 5581 + }, + { + "epoch": 2.5958766082777864, + "grad_norm": 0.3342210352420807, + "learning_rate": 5.415940226735633e-07, + "loss": 0.3388, + "step": 5582 + }, + { + "epoch": 2.5963416524569833, + "grad_norm": 0.31008031964302063, + "learning_rate": 5.403697955969988e-07, + "loss": 0.2927, + "step": 5583 + }, + { + "epoch": 2.5968066966361807, + "grad_norm": 0.32369595766067505, + "learning_rate": 5.391468746736834e-07, + "loss": 0.335, + "step": 5584 + }, + { + "epoch": 2.5972717408153776, + "grad_norm": 0.3301704525947571, + "learning_rate": 5.379252602617902e-07, + "loss": 0.3322, + "step": 5585 + }, + { + "epoch": 2.5977367849945745, + "grad_norm": 0.2886703908443451, + "learning_rate": 5.367049527191093e-07, + "loss": 0.2747, + "step": 5586 + }, + { + "epoch": 2.5982018291737714, + "grad_norm": 0.3267196714878082, + "learning_rate": 5.354859524030503e-07, + "loss": 0.3076, + "step": 5587 + }, + { + "epoch": 2.5986668733529683, + "grad_norm": 0.34606629610061646, + "learning_rate": 5.342682596706372e-07, + "loss": 0.3378, + "step": 5588 + }, + { + "epoch": 2.5991319175321657, + "grad_norm": 0.3182050287723541, + "learning_rate": 5.330518748785147e-07, + "loss": 0.3144, + "step": 5589 + }, + { + "epoch": 2.5995969617113626, + "grad_norm": 0.3103780746459961, + "learning_rate": 5.318367983829393e-07, + "loss": 0.2717, + "step": 5590 + }, + { + "epoch": 2.6000620058905595, + "grad_norm": 0.36062633991241455, + "learning_rate": 5.306230305397897e-07, + "loss": 0.3159, + "step": 5591 + }, + { + "epoch": 2.600527050069757, + "grad_norm": 0.3258778750896454, + "learning_rate": 5.294105717045567e-07, + "loss": 0.3356, + "step": 5592 + }, + { + "epoch": 2.6009920942489537, + "grad_norm": 0.33117127418518066, + "learning_rate": 5.281994222323506e-07, + "loss": 0.329, + "step": 5593 + }, + { + "epoch": 2.6014571384281506, + "grad_norm": 0.32020604610443115, + "learning_rate": 5.269895824778976e-07, + "loss": 0.3093, + "step": 5594 + }, + { + "epoch": 2.6019221826073475, + "grad_norm": 0.33276131749153137, + "learning_rate": 5.25781052795541e-07, + "loss": 0.3143, + "step": 5595 + }, + { + "epoch": 2.602387226786545, + "grad_norm": 0.3160843551158905, + "learning_rate": 5.245738335392376e-07, + "loss": 0.2935, + "step": 5596 + }, + { + "epoch": 2.602852270965742, + "grad_norm": 0.3074491620063782, + "learning_rate": 5.233679250625646e-07, + "loss": 0.3083, + "step": 5597 + }, + { + "epoch": 2.6033173151449387, + "grad_norm": 0.3221345841884613, + "learning_rate": 5.221633277187104e-07, + "loss": 0.3324, + "step": 5598 + }, + { + "epoch": 2.6037823593241356, + "grad_norm": 0.36884576082229614, + "learning_rate": 5.209600418604843e-07, + "loss": 0.3293, + "step": 5599 + }, + { + "epoch": 2.604247403503333, + "grad_norm": 0.3398898243904114, + "learning_rate": 5.197580678403074e-07, + "loss": 0.3435, + "step": 5600 + }, + { + "epoch": 2.60471244768253, + "grad_norm": 0.2826615273952484, + "learning_rate": 5.185574060102206e-07, + "loss": 0.2942, + "step": 5601 + }, + { + "epoch": 2.6051774918617268, + "grad_norm": 0.35747769474983215, + "learning_rate": 5.17358056721875e-07, + "loss": 0.3186, + "step": 5602 + }, + { + "epoch": 2.605642536040924, + "grad_norm": 0.32651564478874207, + "learning_rate": 5.161600203265438e-07, + "loss": 0.3283, + "step": 5603 + }, + { + "epoch": 2.606107580220121, + "grad_norm": 0.305650532245636, + "learning_rate": 5.149632971751096e-07, + "loss": 0.3121, + "step": 5604 + }, + { + "epoch": 2.606572624399318, + "grad_norm": 0.30761486291885376, + "learning_rate": 5.137678876180746e-07, + "loss": 0.3106, + "step": 5605 + }, + { + "epoch": 2.607037668578515, + "grad_norm": 0.3115074038505554, + "learning_rate": 5.125737920055551e-07, + "loss": 0.3197, + "step": 5606 + }, + { + "epoch": 2.6075027127577117, + "grad_norm": 0.3347545564174652, + "learning_rate": 5.113810106872825e-07, + "loss": 0.3098, + "step": 5607 + }, + { + "epoch": 2.607967756936909, + "grad_norm": 0.3298303782939911, + "learning_rate": 5.10189544012602e-07, + "loss": 0.316, + "step": 5608 + }, + { + "epoch": 2.608432801116106, + "grad_norm": 0.33597150444984436, + "learning_rate": 5.089993923304759e-07, + "loss": 0.3258, + "step": 5609 + }, + { + "epoch": 2.608897845295303, + "grad_norm": 0.312759667634964, + "learning_rate": 5.078105559894791e-07, + "loss": 0.3181, + "step": 5610 + }, + { + "epoch": 2.6093628894745002, + "grad_norm": 0.32988691329956055, + "learning_rate": 5.066230353378038e-07, + "loss": 0.3211, + "step": 5611 + }, + { + "epoch": 2.609827933653697, + "grad_norm": 0.333040714263916, + "learning_rate": 5.054368307232537e-07, + "loss": 0.3087, + "step": 5612 + }, + { + "epoch": 2.610292977832894, + "grad_norm": 0.3228285014629364, + "learning_rate": 5.042519424932512e-07, + "loss": 0.3095, + "step": 5613 + }, + { + "epoch": 2.6107580220120914, + "grad_norm": 0.35871362686157227, + "learning_rate": 5.030683709948292e-07, + "loss": 0.3171, + "step": 5614 + }, + { + "epoch": 2.6112230661912883, + "grad_norm": 0.3096243739128113, + "learning_rate": 5.018861165746369e-07, + "loss": 0.3054, + "step": 5615 + }, + { + "epoch": 2.611688110370485, + "grad_norm": 0.2971365451812744, + "learning_rate": 5.007051795789375e-07, + "loss": 0.2977, + "step": 5616 + }, + { + "epoch": 2.612153154549682, + "grad_norm": 0.31067079305648804, + "learning_rate": 4.995255603536076e-07, + "loss": 0.3265, + "step": 5617 + }, + { + "epoch": 2.612618198728879, + "grad_norm": 0.3296055793762207, + "learning_rate": 4.983472592441391e-07, + "loss": 0.3023, + "step": 5618 + }, + { + "epoch": 2.6130832429080764, + "grad_norm": 0.33535173535346985, + "learning_rate": 4.971702765956388e-07, + "loss": 0.3282, + "step": 5619 + }, + { + "epoch": 2.6135482870872733, + "grad_norm": 0.30592218041419983, + "learning_rate": 4.959946127528231e-07, + "loss": 0.3186, + "step": 5620 + }, + { + "epoch": 2.61401333126647, + "grad_norm": 0.32471343874931335, + "learning_rate": 4.948202680600267e-07, + "loss": 0.2948, + "step": 5621 + }, + { + "epoch": 2.6144783754456675, + "grad_norm": 0.331249862909317, + "learning_rate": 4.936472428611961e-07, + "loss": 0.3168, + "step": 5622 + }, + { + "epoch": 2.6149434196248644, + "grad_norm": 0.33011171221733093, + "learning_rate": 4.924755374998891e-07, + "loss": 0.3389, + "step": 5623 + }, + { + "epoch": 2.6154084638040613, + "grad_norm": 0.3165659010410309, + "learning_rate": 4.913051523192819e-07, + "loss": 0.317, + "step": 5624 + }, + { + "epoch": 2.6158735079832587, + "grad_norm": 0.3213482201099396, + "learning_rate": 4.901360876621597e-07, + "loss": 0.3103, + "step": 5625 + }, + { + "epoch": 2.6163385521624556, + "grad_norm": 0.3293680250644684, + "learning_rate": 4.88968343870923e-07, + "loss": 0.3189, + "step": 5626 + }, + { + "epoch": 2.6168035963416525, + "grad_norm": 0.3114880323410034, + "learning_rate": 4.87801921287585e-07, + "loss": 0.317, + "step": 5627 + }, + { + "epoch": 2.6172686405208494, + "grad_norm": 0.32358518242836, + "learning_rate": 4.866368202537714e-07, + "loss": 0.3315, + "step": 5628 + }, + { + "epoch": 2.6177336847000463, + "grad_norm": 0.3022099733352661, + "learning_rate": 4.854730411107217e-07, + "loss": 0.3197, + "step": 5629 + }, + { + "epoch": 2.6181987288792437, + "grad_norm": 0.34834814071655273, + "learning_rate": 4.843105841992895e-07, + "loss": 0.3536, + "step": 5630 + }, + { + "epoch": 2.6186637730584406, + "grad_norm": 0.3669452369213104, + "learning_rate": 4.831494498599371e-07, + "loss": 0.3462, + "step": 5631 + }, + { + "epoch": 2.6191288172376375, + "grad_norm": 0.3363710343837738, + "learning_rate": 4.819896384327433e-07, + "loss": 0.3383, + "step": 5632 + }, + { + "epoch": 2.619593861416835, + "grad_norm": 0.30958956480026245, + "learning_rate": 4.808311502573976e-07, + "loss": 0.2891, + "step": 5633 + }, + { + "epoch": 2.6200589055960317, + "grad_norm": 0.3210178315639496, + "learning_rate": 4.796739856732024e-07, + "loss": 0.3327, + "step": 5634 + }, + { + "epoch": 2.6205239497752286, + "grad_norm": 0.31170886754989624, + "learning_rate": 4.785181450190723e-07, + "loss": 0.3178, + "step": 5635 + }, + { + "epoch": 2.6209889939544255, + "grad_norm": 0.3083958029747009, + "learning_rate": 4.773636286335348e-07, + "loss": 0.3016, + "step": 5636 + }, + { + "epoch": 2.6214540381336224, + "grad_norm": 0.35575956106185913, + "learning_rate": 4.7621043685472824e-07, + "loss": 0.3492, + "step": 5637 + }, + { + "epoch": 2.62191908231282, + "grad_norm": 0.3041512370109558, + "learning_rate": 4.750585700204047e-07, + "loss": 0.2905, + "step": 5638 + }, + { + "epoch": 2.6223841264920167, + "grad_norm": 0.3430851399898529, + "learning_rate": 4.739080284679254e-07, + "loss": 0.3088, + "step": 5639 + }, + { + "epoch": 2.6228491706712136, + "grad_norm": 0.3387916088104248, + "learning_rate": 4.727588125342669e-07, + "loss": 0.319, + "step": 5640 + }, + { + "epoch": 2.623314214850411, + "grad_norm": 0.3547907769680023, + "learning_rate": 4.716109225560156e-07, + "loss": 0.2953, + "step": 5641 + }, + { + "epoch": 2.623779259029608, + "grad_norm": 0.32964009046554565, + "learning_rate": 4.7046435886937024e-07, + "loss": 0.3286, + "step": 5642 + }, + { + "epoch": 2.6242443032088048, + "grad_norm": 0.38167962431907654, + "learning_rate": 4.6931912181014007e-07, + "loss": 0.3104, + "step": 5643 + }, + { + "epoch": 2.624709347388002, + "grad_norm": 0.33540862798690796, + "learning_rate": 4.681752117137467e-07, + "loss": 0.3494, + "step": 5644 + }, + { + "epoch": 2.625174391567199, + "grad_norm": 0.3289260268211365, + "learning_rate": 4.6703262891522214e-07, + "loss": 0.3212, + "step": 5645 + }, + { + "epoch": 2.625639435746396, + "grad_norm": 0.3132651150226593, + "learning_rate": 4.6589137374921155e-07, + "loss": 0.2817, + "step": 5646 + }, + { + "epoch": 2.626104479925593, + "grad_norm": 0.33676326274871826, + "learning_rate": 4.647514465499686e-07, + "loss": 0.3141, + "step": 5647 + }, + { + "epoch": 2.6265695241047897, + "grad_norm": 0.3420335054397583, + "learning_rate": 4.6361284765136125e-07, + "loss": 0.3312, + "step": 5648 + }, + { + "epoch": 2.627034568283987, + "grad_norm": 0.33695536851882935, + "learning_rate": 4.6247557738686445e-07, + "loss": 0.3208, + "step": 5649 + }, + { + "epoch": 2.627499612463184, + "grad_norm": 0.351370245218277, + "learning_rate": 4.613396360895683e-07, + "loss": 0.3156, + "step": 5650 + }, + { + "epoch": 2.627964656642381, + "grad_norm": 0.3574759364128113, + "learning_rate": 4.602050240921696e-07, + "loss": 0.305, + "step": 5651 + }, + { + "epoch": 2.6284297008215782, + "grad_norm": 0.3272240161895752, + "learning_rate": 4.590717417269791e-07, + "loss": 0.338, + "step": 5652 + }, + { + "epoch": 2.628894745000775, + "grad_norm": 0.3331619203090668, + "learning_rate": 4.5793978932591574e-07, + "loss": 0.3186, + "step": 5653 + }, + { + "epoch": 2.629359789179972, + "grad_norm": 0.3595447242259979, + "learning_rate": 4.568091672205122e-07, + "loss": 0.3238, + "step": 5654 + }, + { + "epoch": 2.6298248333591694, + "grad_norm": 0.327280730009079, + "learning_rate": 4.5567987574190677e-07, + "loss": 0.3365, + "step": 5655 + }, + { + "epoch": 2.6302898775383663, + "grad_norm": 0.28672948479652405, + "learning_rate": 4.5455191522085274e-07, + "loss": 0.2672, + "step": 5656 + }, + { + "epoch": 2.630754921717563, + "grad_norm": 0.3301229774951935, + "learning_rate": 4.534252859877097e-07, + "loss": 0.344, + "step": 5657 + }, + { + "epoch": 2.63121996589676, + "grad_norm": 0.3244812488555908, + "learning_rate": 4.522999883724494e-07, + "loss": 0.3144, + "step": 5658 + }, + { + "epoch": 2.631685010075957, + "grad_norm": 0.3279408812522888, + "learning_rate": 4.511760227046541e-07, + "loss": 0.3403, + "step": 5659 + }, + { + "epoch": 2.6321500542551544, + "grad_norm": 0.32007279992103577, + "learning_rate": 4.500533893135134e-07, + "loss": 0.3198, + "step": 5660 + }, + { + "epoch": 2.6326150984343513, + "grad_norm": 0.3092459440231323, + "learning_rate": 4.489320885278309e-07, + "loss": 0.3085, + "step": 5661 + }, + { + "epoch": 2.633080142613548, + "grad_norm": 0.32061436772346497, + "learning_rate": 4.4781212067601445e-07, + "loss": 0.3115, + "step": 5662 + }, + { + "epoch": 2.6335451867927455, + "grad_norm": 0.3219727873802185, + "learning_rate": 4.4669348608608664e-07, + "loss": 0.3278, + "step": 5663 + }, + { + "epoch": 2.6340102309719424, + "grad_norm": 0.33633241057395935, + "learning_rate": 4.4557618508567603e-07, + "loss": 0.3318, + "step": 5664 + }, + { + "epoch": 2.6344752751511393, + "grad_norm": 0.31990426778793335, + "learning_rate": 4.4446021800202356e-07, + "loss": 0.3395, + "step": 5665 + }, + { + "epoch": 2.6349403193303362, + "grad_norm": 0.33744657039642334, + "learning_rate": 4.4334558516197666e-07, + "loss": 0.3202, + "step": 5666 + }, + { + "epoch": 2.635405363509533, + "grad_norm": 0.301940381526947, + "learning_rate": 4.422322868919937e-07, + "loss": 0.2888, + "step": 5667 + }, + { + "epoch": 2.6358704076887305, + "grad_norm": 0.34101638197898865, + "learning_rate": 4.411203235181405e-07, + "loss": 0.329, + "step": 5668 + }, + { + "epoch": 2.6363354518679274, + "grad_norm": 0.3637217581272125, + "learning_rate": 4.400096953660948e-07, + "loss": 0.3665, + "step": 5669 + }, + { + "epoch": 2.6368004960471243, + "grad_norm": 0.30867865681648254, + "learning_rate": 4.3890040276114044e-07, + "loss": 0.2851, + "step": 5670 + }, + { + "epoch": 2.6372655402263216, + "grad_norm": 0.3138599693775177, + "learning_rate": 4.377924460281718e-07, + "loss": 0.3383, + "step": 5671 + }, + { + "epoch": 2.6377305844055186, + "grad_norm": 0.2975723147392273, + "learning_rate": 4.3668582549169005e-07, + "loss": 0.2762, + "step": 5672 + }, + { + "epoch": 2.6381956285847155, + "grad_norm": 0.3239637315273285, + "learning_rate": 4.355805414758085e-07, + "loss": 0.3482, + "step": 5673 + }, + { + "epoch": 2.638660672763913, + "grad_norm": 0.3464727997779846, + "learning_rate": 4.3447659430424507e-07, + "loss": 0.3351, + "step": 5674 + }, + { + "epoch": 2.6391257169431097, + "grad_norm": 0.309800386428833, + "learning_rate": 4.3337398430032815e-07, + "loss": 0.301, + "step": 5675 + }, + { + "epoch": 2.6395907611223066, + "grad_norm": 0.30942457914352417, + "learning_rate": 4.322727117869951e-07, + "loss": 0.3041, + "step": 5676 + }, + { + "epoch": 2.6400558053015035, + "grad_norm": 0.3338727355003357, + "learning_rate": 4.3117277708679126e-07, + "loss": 0.3128, + "step": 5677 + }, + { + "epoch": 2.6405208494807004, + "grad_norm": 0.33797094225883484, + "learning_rate": 4.3007418052186834e-07, + "loss": 0.3582, + "step": 5678 + }, + { + "epoch": 2.6409858936598978, + "grad_norm": 0.336549311876297, + "learning_rate": 4.289769224139884e-07, + "loss": 0.3085, + "step": 5679 + }, + { + "epoch": 2.6414509378390947, + "grad_norm": 0.3069862425327301, + "learning_rate": 4.278810030845193e-07, + "loss": 0.3163, + "step": 5680 + }, + { + "epoch": 2.6419159820182916, + "grad_norm": 0.3145103454589844, + "learning_rate": 4.2678642285443937e-07, + "loss": 0.3241, + "step": 5681 + }, + { + "epoch": 2.642381026197489, + "grad_norm": 0.30022549629211426, + "learning_rate": 4.2569318204433217e-07, + "loss": 0.2823, + "step": 5682 + }, + { + "epoch": 2.642846070376686, + "grad_norm": 0.37692004442214966, + "learning_rate": 4.2460128097439157e-07, + "loss": 0.3223, + "step": 5683 + }, + { + "epoch": 2.6433111145558827, + "grad_norm": 0.33564627170562744, + "learning_rate": 4.235107199644162e-07, + "loss": 0.318, + "step": 5684 + }, + { + "epoch": 2.64377615873508, + "grad_norm": 0.3265214264392853, + "learning_rate": 4.224214993338149e-07, + "loss": 0.3559, + "step": 5685 + }, + { + "epoch": 2.644241202914277, + "grad_norm": 0.31681397557258606, + "learning_rate": 4.2133361940160153e-07, + "loss": 0.3053, + "step": 5686 + }, + { + "epoch": 2.644706247093474, + "grad_norm": 0.317182332277298, + "learning_rate": 4.202470804863984e-07, + "loss": 0.3074, + "step": 5687 + }, + { + "epoch": 2.645171291272671, + "grad_norm": 0.3192555606365204, + "learning_rate": 4.1916188290643643e-07, + "loss": 0.3069, + "step": 5688 + }, + { + "epoch": 2.6456363354518677, + "grad_norm": 0.3164333403110504, + "learning_rate": 4.1807802697955256e-07, + "loss": 0.2939, + "step": 5689 + }, + { + "epoch": 2.646101379631065, + "grad_norm": 0.32663610577583313, + "learning_rate": 4.169955130231884e-07, + "loss": 0.3759, + "step": 5690 + }, + { + "epoch": 2.646566423810262, + "grad_norm": 0.3227081000804901, + "learning_rate": 4.15914341354397e-07, + "loss": 0.3027, + "step": 5691 + }, + { + "epoch": 2.647031467989459, + "grad_norm": 0.3126518726348877, + "learning_rate": 4.1483451228983453e-07, + "loss": 0.3067, + "step": 5692 + }, + { + "epoch": 2.6474965121686562, + "grad_norm": 0.3389665484428406, + "learning_rate": 4.137560261457663e-07, + "loss": 0.3341, + "step": 5693 + }, + { + "epoch": 2.647961556347853, + "grad_norm": 0.3213329613208771, + "learning_rate": 4.1267888323806294e-07, + "loss": 0.3122, + "step": 5694 + }, + { + "epoch": 2.64842660052705, + "grad_norm": 0.341896116733551, + "learning_rate": 4.1160308388220103e-07, + "loss": 0.3159, + "step": 5695 + }, + { + "epoch": 2.648891644706247, + "grad_norm": 0.33184269070625305, + "learning_rate": 4.1052862839326745e-07, + "loss": 0.3124, + "step": 5696 + }, + { + "epoch": 2.649356688885444, + "grad_norm": 0.32483184337615967, + "learning_rate": 4.0945551708594934e-07, + "loss": 0.3103, + "step": 5697 + }, + { + "epoch": 2.649821733064641, + "grad_norm": 0.33331936597824097, + "learning_rate": 4.083837502745458e-07, + "loss": 0.3271, + "step": 5698 + }, + { + "epoch": 2.650286777243838, + "grad_norm": 0.3043390214443207, + "learning_rate": 4.0731332827295966e-07, + "loss": 0.2737, + "step": 5699 + }, + { + "epoch": 2.650751821423035, + "grad_norm": 0.3190237581729889, + "learning_rate": 4.062442513947007e-07, + "loss": 0.3411, + "step": 5700 + }, + { + "epoch": 2.6512168656022324, + "grad_norm": 0.3191225826740265, + "learning_rate": 4.051765199528823e-07, + "loss": 0.3043, + "step": 5701 + }, + { + "epoch": 2.6516819097814293, + "grad_norm": 0.33000683784484863, + "learning_rate": 4.0411013426022773e-07, + "loss": 0.3551, + "step": 5702 + }, + { + "epoch": 2.652146953960626, + "grad_norm": 0.32607778906822205, + "learning_rate": 4.0304509462906203e-07, + "loss": 0.3328, + "step": 5703 + }, + { + "epoch": 2.6526119981398235, + "grad_norm": 0.3193208575248718, + "learning_rate": 4.0198140137132024e-07, + "loss": 0.3034, + "step": 5704 + }, + { + "epoch": 2.6530770423190204, + "grad_norm": 0.3008403778076172, + "learning_rate": 4.0091905479853865e-07, + "loss": 0.3229, + "step": 5705 + }, + { + "epoch": 2.6535420864982173, + "grad_norm": 0.3564378023147583, + "learning_rate": 3.9985805522186336e-07, + "loss": 0.3812, + "step": 5706 + }, + { + "epoch": 2.6540071306774142, + "grad_norm": 0.37547022104263306, + "learning_rate": 3.98798402952042e-07, + "loss": 0.332, + "step": 5707 + }, + { + "epoch": 2.654472174856611, + "grad_norm": 0.2705591917037964, + "learning_rate": 3.977400982994306e-07, + "loss": 0.2647, + "step": 5708 + }, + { + "epoch": 2.6549372190358085, + "grad_norm": 0.31391727924346924, + "learning_rate": 3.966831415739891e-07, + "loss": 0.326, + "step": 5709 + }, + { + "epoch": 2.6554022632150054, + "grad_norm": 0.3211490213871002, + "learning_rate": 3.9562753308528267e-07, + "loss": 0.3346, + "step": 5710 + }, + { + "epoch": 2.6558673073942023, + "grad_norm": 0.3080466687679291, + "learning_rate": 3.945732731424823e-07, + "loss": 0.3227, + "step": 5711 + }, + { + "epoch": 2.6563323515733996, + "grad_norm": 0.3370213508605957, + "learning_rate": 3.935203620543643e-07, + "loss": 0.3306, + "step": 5712 + }, + { + "epoch": 2.6567973957525965, + "grad_norm": 0.29656779766082764, + "learning_rate": 3.924688001293081e-07, + "loss": 0.2599, + "step": 5713 + }, + { + "epoch": 2.6572624399317935, + "grad_norm": 0.3250420093536377, + "learning_rate": 3.9141858767530014e-07, + "loss": 0.3182, + "step": 5714 + }, + { + "epoch": 2.657727484110991, + "grad_norm": 0.33750423789024353, + "learning_rate": 3.903697249999289e-07, + "loss": 0.3342, + "step": 5715 + }, + { + "epoch": 2.6581925282901877, + "grad_norm": 0.29559361934661865, + "learning_rate": 3.8932221241039125e-07, + "loss": 0.295, + "step": 5716 + }, + { + "epoch": 2.6586575724693846, + "grad_norm": 0.35010501742362976, + "learning_rate": 3.882760502134847e-07, + "loss": 0.328, + "step": 5717 + }, + { + "epoch": 2.6591226166485815, + "grad_norm": 0.3221285343170166, + "learning_rate": 3.872312387156146e-07, + "loss": 0.3553, + "step": 5718 + }, + { + "epoch": 2.6595876608277784, + "grad_norm": 0.3222235441207886, + "learning_rate": 3.8618777822278854e-07, + "loss": 0.3039, + "step": 5719 + }, + { + "epoch": 2.6600527050069758, + "grad_norm": 0.32205820083618164, + "learning_rate": 3.8514566904061967e-07, + "loss": 0.3102, + "step": 5720 + }, + { + "epoch": 2.6605177491861727, + "grad_norm": 0.33951982855796814, + "learning_rate": 3.841049114743239e-07, + "loss": 0.3002, + "step": 5721 + }, + { + "epoch": 2.6609827933653696, + "grad_norm": 0.33085671067237854, + "learning_rate": 3.8306550582872306e-07, + "loss": 0.3237, + "step": 5722 + }, + { + "epoch": 2.661447837544567, + "grad_norm": 0.3133414685726166, + "learning_rate": 3.820274524082418e-07, + "loss": 0.2978, + "step": 5723 + }, + { + "epoch": 2.661912881723764, + "grad_norm": 0.3263706862926483, + "learning_rate": 3.809907515169103e-07, + "loss": 0.3129, + "step": 5724 + }, + { + "epoch": 2.6623779259029607, + "grad_norm": 0.3518964350223541, + "learning_rate": 3.7995540345835914e-07, + "loss": 0.2998, + "step": 5725 + }, + { + "epoch": 2.6628429700821576, + "grad_norm": 0.32105782628059387, + "learning_rate": 3.7892140853582725e-07, + "loss": 0.3411, + "step": 5726 + }, + { + "epoch": 2.663308014261355, + "grad_norm": 0.30892109870910645, + "learning_rate": 3.7788876705215307e-07, + "loss": 0.3243, + "step": 5727 + }, + { + "epoch": 2.663773058440552, + "grad_norm": 0.31716519594192505, + "learning_rate": 3.7685747930978236e-07, + "loss": 0.341, + "step": 5728 + }, + { + "epoch": 2.664238102619749, + "grad_norm": 0.31774482131004333, + "learning_rate": 3.758275456107613e-07, + "loss": 0.2949, + "step": 5729 + }, + { + "epoch": 2.6647031467989457, + "grad_norm": 0.31866157054901123, + "learning_rate": 3.747989662567403e-07, + "loss": 0.3215, + "step": 5730 + }, + { + "epoch": 2.665168190978143, + "grad_norm": 0.33754652738571167, + "learning_rate": 3.73771741548975e-07, + "loss": 0.3054, + "step": 5731 + }, + { + "epoch": 2.66563323515734, + "grad_norm": 0.3196556270122528, + "learning_rate": 3.727458717883209e-07, + "loss": 0.3075, + "step": 5732 + }, + { + "epoch": 2.666098279336537, + "grad_norm": 0.356734037399292, + "learning_rate": 3.717213572752404e-07, + "loss": 0.3356, + "step": 5733 + }, + { + "epoch": 2.666563323515734, + "grad_norm": 0.3005916178226471, + "learning_rate": 3.706981983097957e-07, + "loss": 0.3205, + "step": 5734 + }, + { + "epoch": 2.667028367694931, + "grad_norm": 0.35259753465652466, + "learning_rate": 3.6967639519165546e-07, + "loss": 0.3188, + "step": 5735 + }, + { + "epoch": 2.667493411874128, + "grad_norm": 0.3179108500480652, + "learning_rate": 3.686559482200874e-07, + "loss": 0.3535, + "step": 5736 + }, + { + "epoch": 2.667958456053325, + "grad_norm": 0.3107699751853943, + "learning_rate": 3.6763685769396484e-07, + "loss": 0.2986, + "step": 5737 + }, + { + "epoch": 2.668423500232522, + "grad_norm": 0.29539617896080017, + "learning_rate": 3.6661912391176223e-07, + "loss": 0.3016, + "step": 5738 + }, + { + "epoch": 2.668888544411719, + "grad_norm": 0.3341010808944702, + "learning_rate": 3.6560274717155784e-07, + "loss": 0.3494, + "step": 5739 + }, + { + "epoch": 2.669353588590916, + "grad_norm": 0.3309285342693329, + "learning_rate": 3.645877277710308e-07, + "loss": 0.295, + "step": 5740 + }, + { + "epoch": 2.669818632770113, + "grad_norm": 0.3219027817249298, + "learning_rate": 3.635740660074655e-07, + "loss": 0.3116, + "step": 5741 + }, + { + "epoch": 2.6702836769493103, + "grad_norm": 0.33891165256500244, + "learning_rate": 3.6256176217774496e-07, + "loss": 0.3293, + "step": 5742 + }, + { + "epoch": 2.6707487211285073, + "grad_norm": 0.3448745608329773, + "learning_rate": 3.6155081657835876e-07, + "loss": 0.3291, + "step": 5743 + }, + { + "epoch": 2.671213765307704, + "grad_norm": 0.32368507981300354, + "learning_rate": 3.6054122950539447e-07, + "loss": 0.3126, + "step": 5744 + }, + { + "epoch": 2.6716788094869015, + "grad_norm": 0.3642202913761139, + "learning_rate": 3.595330012545445e-07, + "loss": 0.3264, + "step": 5745 + }, + { + "epoch": 2.6721438536660984, + "grad_norm": 0.3226480782032013, + "learning_rate": 3.5852613212110307e-07, + "loss": 0.3025, + "step": 5746 + }, + { + "epoch": 2.6726088978452953, + "grad_norm": 0.3831692039966583, + "learning_rate": 3.5752062239996554e-07, + "loss": 0.3591, + "step": 5747 + }, + { + "epoch": 2.6730739420244922, + "grad_norm": 0.3260175585746765, + "learning_rate": 3.5651647238562904e-07, + "loss": 0.3103, + "step": 5748 + }, + { + "epoch": 2.673538986203689, + "grad_norm": 0.3270534574985504, + "learning_rate": 3.555136823721933e-07, + "loss": 0.3214, + "step": 5749 + }, + { + "epoch": 2.6740040303828865, + "grad_norm": 0.32291585206985474, + "learning_rate": 3.545122526533579e-07, + "loss": 0.3335, + "step": 5750 + }, + { + "epoch": 2.6744690745620834, + "grad_norm": 0.2985171675682068, + "learning_rate": 3.5351218352242755e-07, + "loss": 0.3173, + "step": 5751 + }, + { + "epoch": 2.6749341187412803, + "grad_norm": 0.32506510615348816, + "learning_rate": 3.525134752723042e-07, + "loss": 0.3106, + "step": 5752 + }, + { + "epoch": 2.6753991629204776, + "grad_norm": 0.33014851808547974, + "learning_rate": 3.515161281954943e-07, + "loss": 0.3127, + "step": 5753 + }, + { + "epoch": 2.6758642070996745, + "grad_norm": 0.33311840891838074, + "learning_rate": 3.5052014258410426e-07, + "loss": 0.3143, + "step": 5754 + }, + { + "epoch": 2.6763292512788714, + "grad_norm": 0.3252905309200287, + "learning_rate": 3.4952551872984295e-07, + "loss": 0.3435, + "step": 5755 + }, + { + "epoch": 2.6767942954580684, + "grad_norm": 0.31564247608184814, + "learning_rate": 3.485322569240174e-07, + "loss": 0.3095, + "step": 5756 + }, + { + "epoch": 2.6772593396372657, + "grad_norm": 0.3435322642326355, + "learning_rate": 3.475403574575398e-07, + "loss": 0.3198, + "step": 5757 + }, + { + "epoch": 2.6777243838164626, + "grad_norm": 0.3655998706817627, + "learning_rate": 3.4654982062092113e-07, + "loss": 0.3149, + "step": 5758 + }, + { + "epoch": 2.6781894279956595, + "grad_norm": 0.32407233119010925, + "learning_rate": 3.455606467042738e-07, + "loss": 0.3186, + "step": 5759 + }, + { + "epoch": 2.6786544721748564, + "grad_norm": 0.3063153624534607, + "learning_rate": 3.445728359973094e-07, + "loss": 0.2878, + "step": 5760 + }, + { + "epoch": 2.6791195163540538, + "grad_norm": 0.35395288467407227, + "learning_rate": 3.435863887893431e-07, + "loss": 0.3123, + "step": 5761 + }, + { + "epoch": 2.6795845605332507, + "grad_norm": 0.3261089026927948, + "learning_rate": 3.426013053692878e-07, + "loss": 0.3338, + "step": 5762 + }, + { + "epoch": 2.6800496047124476, + "grad_norm": 0.3305519223213196, + "learning_rate": 3.4161758602566043e-07, + "loss": 0.3206, + "step": 5763 + }, + { + "epoch": 2.680514648891645, + "grad_norm": 0.32225289940834045, + "learning_rate": 3.406352310465749e-07, + "loss": 0.3315, + "step": 5764 + }, + { + "epoch": 2.680979693070842, + "grad_norm": 0.3169860243797302, + "learning_rate": 3.3965424071974727e-07, + "loss": 0.3072, + "step": 5765 + }, + { + "epoch": 2.6814447372500387, + "grad_norm": 0.3510488271713257, + "learning_rate": 3.386746153324943e-07, + "loss": 0.3437, + "step": 5766 + }, + { + "epoch": 2.6819097814292356, + "grad_norm": 0.3206140995025635, + "learning_rate": 3.3769635517173103e-07, + "loss": 0.2735, + "step": 5767 + }, + { + "epoch": 2.6823748256084325, + "grad_norm": 0.33280014991760254, + "learning_rate": 3.3671946052397486e-07, + "loss": 0.3409, + "step": 5768 + }, + { + "epoch": 2.68283986978763, + "grad_norm": 0.3153855502605438, + "learning_rate": 3.3574393167534247e-07, + "loss": 0.2849, + "step": 5769 + }, + { + "epoch": 2.683304913966827, + "grad_norm": 0.3374989926815033, + "learning_rate": 3.347697689115509e-07, + "loss": 0.3465, + "step": 5770 + }, + { + "epoch": 2.6837699581460237, + "grad_norm": 0.32558801770210266, + "learning_rate": 3.337969725179152e-07, + "loss": 0.2962, + "step": 5771 + }, + { + "epoch": 2.684235002325221, + "grad_norm": 0.31771788001060486, + "learning_rate": 3.328255427793531e-07, + "loss": 0.3208, + "step": 5772 + }, + { + "epoch": 2.684700046504418, + "grad_norm": 0.3207005560398102, + "learning_rate": 3.318554799803786e-07, + "loss": 0.3074, + "step": 5773 + }, + { + "epoch": 2.685165090683615, + "grad_norm": 0.3172682225704193, + "learning_rate": 3.3088678440511e-07, + "loss": 0.3266, + "step": 5774 + }, + { + "epoch": 2.685630134862812, + "grad_norm": 0.3359968960285187, + "learning_rate": 3.299194563372604e-07, + "loss": 0.2934, + "step": 5775 + }, + { + "epoch": 2.686095179042009, + "grad_norm": 0.345907598733902, + "learning_rate": 3.289534960601454e-07, + "loss": 0.3541, + "step": 5776 + }, + { + "epoch": 2.686560223221206, + "grad_norm": 0.328243225812912, + "learning_rate": 3.279889038566786e-07, + "loss": 0.3324, + "step": 5777 + }, + { + "epoch": 2.687025267400403, + "grad_norm": 0.2853672206401825, + "learning_rate": 3.2702568000937404e-07, + "loss": 0.2756, + "step": 5778 + }, + { + "epoch": 2.6874903115796, + "grad_norm": 0.32393786311149597, + "learning_rate": 3.260638248003434e-07, + "loss": 0.3193, + "step": 5779 + }, + { + "epoch": 2.687955355758797, + "grad_norm": 0.3525981903076172, + "learning_rate": 3.2510333851129895e-07, + "loss": 0.3326, + "step": 5780 + }, + { + "epoch": 2.688420399937994, + "grad_norm": 0.347851037979126, + "learning_rate": 3.2414422142355184e-07, + "loss": 0.3176, + "step": 5781 + }, + { + "epoch": 2.688885444117191, + "grad_norm": 0.3218478858470917, + "learning_rate": 3.2318647381801237e-07, + "loss": 0.311, + "step": 5782 + }, + { + "epoch": 2.6893504882963883, + "grad_norm": 0.32304203510284424, + "learning_rate": 3.222300959751873e-07, + "loss": 0.3275, + "step": 5783 + }, + { + "epoch": 2.6898155324755852, + "grad_norm": 0.33753058314323425, + "learning_rate": 3.2127508817518637e-07, + "loss": 0.3146, + "step": 5784 + }, + { + "epoch": 2.690280576654782, + "grad_norm": 0.336089164018631, + "learning_rate": 3.2032145069771424e-07, + "loss": 0.3564, + "step": 5785 + }, + { + "epoch": 2.6907456208339795, + "grad_norm": 0.3043957054615021, + "learning_rate": 3.1936918382207696e-07, + "loss": 0.3022, + "step": 5786 + }, + { + "epoch": 2.6912106650131764, + "grad_norm": 0.31595054268836975, + "learning_rate": 3.1841828782717685e-07, + "loss": 0.3224, + "step": 5787 + }, + { + "epoch": 2.6916757091923733, + "grad_norm": 0.35378995537757874, + "learning_rate": 3.174687629915174e-07, + "loss": 0.3291, + "step": 5788 + }, + { + "epoch": 2.69214075337157, + "grad_norm": 0.31758201122283936, + "learning_rate": 3.165206095931972e-07, + "loss": 0.2988, + "step": 5789 + }, + { + "epoch": 2.692605797550767, + "grad_norm": 0.3146103024482727, + "learning_rate": 3.1557382790991686e-07, + "loss": 0.332, + "step": 5790 + }, + { + "epoch": 2.6930708417299645, + "grad_norm": 0.32033929228782654, + "learning_rate": 3.146284182189718e-07, + "loss": 0.304, + "step": 5791 + }, + { + "epoch": 2.6935358859091614, + "grad_norm": 0.33226868510246277, + "learning_rate": 3.1368438079725784e-07, + "loss": 0.3309, + "step": 5792 + }, + { + "epoch": 2.6940009300883583, + "grad_norm": 0.32148411870002747, + "learning_rate": 3.1274171592126814e-07, + "loss": 0.3002, + "step": 5793 + }, + { + "epoch": 2.6944659742675556, + "grad_norm": 0.30781638622283936, + "learning_rate": 3.1180042386709463e-07, + "loss": 0.3173, + "step": 5794 + }, + { + "epoch": 2.6949310184467525, + "grad_norm": 0.31994858384132385, + "learning_rate": 3.108605049104246e-07, + "loss": 0.3421, + "step": 5795 + }, + { + "epoch": 2.6953960626259494, + "grad_norm": 0.34452393651008606, + "learning_rate": 3.099219593265479e-07, + "loss": 0.3137, + "step": 5796 + }, + { + "epoch": 2.6958611068051463, + "grad_norm": 0.30855563282966614, + "learning_rate": 3.089847873903462e-07, + "loss": 0.3054, + "step": 5797 + }, + { + "epoch": 2.6963261509843433, + "grad_norm": 0.31589534878730774, + "learning_rate": 3.0804898937630444e-07, + "loss": 0.3194, + "step": 5798 + }, + { + "epoch": 2.6967911951635406, + "grad_norm": 0.33643150329589844, + "learning_rate": 3.0711456555850117e-07, + "loss": 0.3233, + "step": 5799 + }, + { + "epoch": 2.6972562393427375, + "grad_norm": 0.34752482175827026, + "learning_rate": 3.0618151621061464e-07, + "loss": 0.3242, + "step": 5800 + }, + { + "epoch": 2.6977212835219344, + "grad_norm": 0.31614622473716736, + "learning_rate": 3.0524984160591963e-07, + "loss": 0.275, + "step": 5801 + }, + { + "epoch": 2.6981863277011318, + "grad_norm": 0.33584392070770264, + "learning_rate": 3.043195420172879e-07, + "loss": 0.3686, + "step": 5802 + }, + { + "epoch": 2.6986513718803287, + "grad_norm": 0.3025575280189514, + "learning_rate": 3.033906177171897e-07, + "loss": 0.2816, + "step": 5803 + }, + { + "epoch": 2.6991164160595256, + "grad_norm": 0.30763640999794006, + "learning_rate": 3.024630689776914e-07, + "loss": 0.294, + "step": 5804 + }, + { + "epoch": 2.699581460238723, + "grad_norm": 0.3446342647075653, + "learning_rate": 3.015368960704584e-07, + "loss": 0.3349, + "step": 5805 + }, + { + "epoch": 2.70004650441792, + "grad_norm": 0.3446672558784485, + "learning_rate": 3.006120992667499e-07, + "loss": 0.369, + "step": 5806 + }, + { + "epoch": 2.7005115485971167, + "grad_norm": 0.29590803384780884, + "learning_rate": 2.9968867883742534e-07, + "loss": 0.2653, + "step": 5807 + }, + { + "epoch": 2.7009765927763136, + "grad_norm": 0.33212965726852417, + "learning_rate": 2.9876663505293833e-07, + "loss": 0.3127, + "step": 5808 + }, + { + "epoch": 2.7014416369555105, + "grad_norm": 0.32688334584236145, + "learning_rate": 2.978459681833412e-07, + "loss": 0.3235, + "step": 5809 + }, + { + "epoch": 2.701906681134708, + "grad_norm": 0.3344423174858093, + "learning_rate": 2.969266784982822e-07, + "loss": 0.3131, + "step": 5810 + }, + { + "epoch": 2.702371725313905, + "grad_norm": 0.3305979371070862, + "learning_rate": 2.9600876626700637e-07, + "loss": 0.3075, + "step": 5811 + }, + { + "epoch": 2.7028367694931017, + "grad_norm": 0.36284005641937256, + "learning_rate": 2.9509223175835487e-07, + "loss": 0.3072, + "step": 5812 + }, + { + "epoch": 2.703301813672299, + "grad_norm": 0.3447756767272949, + "learning_rate": 2.941770752407669e-07, + "loss": 0.319, + "step": 5813 + }, + { + "epoch": 2.703766857851496, + "grad_norm": 0.3431076109409332, + "learning_rate": 2.9326329698227516e-07, + "loss": 0.32, + "step": 5814 + }, + { + "epoch": 2.704231902030693, + "grad_norm": 0.32309937477111816, + "learning_rate": 2.923508972505118e-07, + "loss": 0.3199, + "step": 5815 + }, + { + "epoch": 2.70469694620989, + "grad_norm": 0.3181701600551605, + "learning_rate": 2.9143987631270296e-07, + "loss": 0.311, + "step": 5816 + }, + { + "epoch": 2.705161990389087, + "grad_norm": 0.3445183038711548, + "learning_rate": 2.905302344356742e-07, + "loss": 0.3241, + "step": 5817 + }, + { + "epoch": 2.705627034568284, + "grad_norm": 0.3397235870361328, + "learning_rate": 2.8962197188584175e-07, + "loss": 0.3536, + "step": 5818 + }, + { + "epoch": 2.706092078747481, + "grad_norm": 0.3191192150115967, + "learning_rate": 2.8871508892922286e-07, + "loss": 0.3158, + "step": 5819 + }, + { + "epoch": 2.706557122926678, + "grad_norm": 0.3246058225631714, + "learning_rate": 2.878095858314278e-07, + "loss": 0.3048, + "step": 5820 + }, + { + "epoch": 2.707022167105875, + "grad_norm": 0.3111349940299988, + "learning_rate": 2.869054628576651e-07, + "loss": 0.3247, + "step": 5821 + }, + { + "epoch": 2.707487211285072, + "grad_norm": 0.3155844211578369, + "learning_rate": 2.860027202727361e-07, + "loss": 0.329, + "step": 5822 + }, + { + "epoch": 2.707952255464269, + "grad_norm": 0.2947717010974884, + "learning_rate": 2.851013583410406e-07, + "loss": 0.2873, + "step": 5823 + }, + { + "epoch": 2.7084172996434663, + "grad_norm": 0.34042876958847046, + "learning_rate": 2.8420137732657174e-07, + "loss": 0.3402, + "step": 5824 + }, + { + "epoch": 2.7088823438226632, + "grad_norm": 0.31821197271347046, + "learning_rate": 2.833027774929209e-07, + "loss": 0.3091, + "step": 5825 + }, + { + "epoch": 2.70934738800186, + "grad_norm": 0.33676040172576904, + "learning_rate": 2.824055591032715e-07, + "loss": 0.3005, + "step": 5826 + }, + { + "epoch": 2.709812432181057, + "grad_norm": 0.3211042582988739, + "learning_rate": 2.81509722420405e-07, + "loss": 0.3051, + "step": 5827 + }, + { + "epoch": 2.710277476360254, + "grad_norm": 0.3290630280971527, + "learning_rate": 2.8061526770669813e-07, + "loss": 0.3338, + "step": 5828 + }, + { + "epoch": 2.7107425205394513, + "grad_norm": 0.3298051953315735, + "learning_rate": 2.7972219522412194e-07, + "loss": 0.3061, + "step": 5829 + }, + { + "epoch": 2.711207564718648, + "grad_norm": 0.3290122449398041, + "learning_rate": 2.7883050523424214e-07, + "loss": 0.3103, + "step": 5830 + }, + { + "epoch": 2.711672608897845, + "grad_norm": 0.3085375130176544, + "learning_rate": 2.779401979982216e-07, + "loss": 0.3207, + "step": 5831 + }, + { + "epoch": 2.7121376530770425, + "grad_norm": 0.3622879385948181, + "learning_rate": 2.7705127377681494e-07, + "loss": 0.3434, + "step": 5832 + }, + { + "epoch": 2.7126026972562394, + "grad_norm": 0.30665135383605957, + "learning_rate": 2.7616373283037514e-07, + "loss": 0.2987, + "step": 5833 + }, + { + "epoch": 2.7130677414354363, + "grad_norm": 0.31220006942749023, + "learning_rate": 2.752775754188475e-07, + "loss": 0.3643, + "step": 5834 + }, + { + "epoch": 2.7135327856146336, + "grad_norm": 0.31486833095550537, + "learning_rate": 2.743928018017744e-07, + "loss": 0.3106, + "step": 5835 + }, + { + "epoch": 2.7139978297938305, + "grad_norm": 0.33274805545806885, + "learning_rate": 2.7350941223828975e-07, + "loss": 0.3206, + "step": 5836 + }, + { + "epoch": 2.7144628739730274, + "grad_norm": 0.3076656758785248, + "learning_rate": 2.72627406987126e-07, + "loss": 0.3123, + "step": 5837 + }, + { + "epoch": 2.7149279181522243, + "grad_norm": 0.3071771264076233, + "learning_rate": 2.71746786306607e-07, + "loss": 0.3229, + "step": 5838 + }, + { + "epoch": 2.7153929623314212, + "grad_norm": 0.32797494530677795, + "learning_rate": 2.708675504546521e-07, + "loss": 0.3368, + "step": 5839 + }, + { + "epoch": 2.7158580065106186, + "grad_norm": 0.30586639046669006, + "learning_rate": 2.699896996887763e-07, + "loss": 0.2811, + "step": 5840 + }, + { + "epoch": 2.7163230506898155, + "grad_norm": 0.31758421659469604, + "learning_rate": 2.691132342660868e-07, + "loss": 0.3155, + "step": 5841 + }, + { + "epoch": 2.7167880948690124, + "grad_norm": 0.34208863973617554, + "learning_rate": 2.682381544432866e-07, + "loss": 0.3223, + "step": 5842 + }, + { + "epoch": 2.7172531390482098, + "grad_norm": 0.29078277945518494, + "learning_rate": 2.673644604766718e-07, + "loss": 0.2754, + "step": 5843 + }, + { + "epoch": 2.7177181832274067, + "grad_norm": 0.33728206157684326, + "learning_rate": 2.664921526221348e-07, + "loss": 0.3452, + "step": 5844 + }, + { + "epoch": 2.7181832274066036, + "grad_norm": 0.29417553544044495, + "learning_rate": 2.65621231135158e-07, + "loss": 0.2996, + "step": 5845 + }, + { + "epoch": 2.718648271585801, + "grad_norm": 0.36140957474708557, + "learning_rate": 2.647516962708219e-07, + "loss": 0.3438, + "step": 5846 + }, + { + "epoch": 2.719113315764998, + "grad_norm": 0.30681052803993225, + "learning_rate": 2.6388354828379813e-07, + "loss": 0.3148, + "step": 5847 + }, + { + "epoch": 2.7195783599441947, + "grad_norm": 0.31248223781585693, + "learning_rate": 2.63016787428354e-07, + "loss": 0.3289, + "step": 5848 + }, + { + "epoch": 2.7200434041233916, + "grad_norm": 0.32591187953948975, + "learning_rate": 2.621514139583492e-07, + "loss": 0.3017, + "step": 5849 + }, + { + "epoch": 2.7205084483025885, + "grad_norm": 0.32240256667137146, + "learning_rate": 2.612874281272371e-07, + "loss": 0.3067, + "step": 5850 + }, + { + "epoch": 2.720973492481786, + "grad_norm": 0.3212602138519287, + "learning_rate": 2.6042483018806577e-07, + "loss": 0.2961, + "step": 5851 + }, + { + "epoch": 2.721438536660983, + "grad_norm": 0.35369014739990234, + "learning_rate": 2.595636203934765e-07, + "loss": 0.3466, + "step": 5852 + }, + { + "epoch": 2.7219035808401797, + "grad_norm": 0.3174188435077667, + "learning_rate": 2.587037989957031e-07, + "loss": 0.3181, + "step": 5853 + }, + { + "epoch": 2.722368625019377, + "grad_norm": 0.356917142868042, + "learning_rate": 2.5784536624657354e-07, + "loss": 0.3176, + "step": 5854 + }, + { + "epoch": 2.722833669198574, + "grad_norm": 0.36071768403053284, + "learning_rate": 2.569883223975078e-07, + "loss": 0.3259, + "step": 5855 + }, + { + "epoch": 2.723298713377771, + "grad_norm": 0.3190617561340332, + "learning_rate": 2.5613266769952183e-07, + "loss": 0.3215, + "step": 5856 + }, + { + "epoch": 2.7237637575569678, + "grad_norm": 0.31165647506713867, + "learning_rate": 2.552784024032218e-07, + "loss": 0.2974, + "step": 5857 + }, + { + "epoch": 2.7242288017361647, + "grad_norm": 0.32557412981987, + "learning_rate": 2.544255267588086e-07, + "loss": 0.3313, + "step": 5858 + }, + { + "epoch": 2.724693845915362, + "grad_norm": 0.329074889421463, + "learning_rate": 2.535740410160753e-07, + "loss": 0.3183, + "step": 5859 + }, + { + "epoch": 2.725158890094559, + "grad_norm": 0.3058283030986786, + "learning_rate": 2.5272394542440847e-07, + "loss": 0.3057, + "step": 5860 + }, + { + "epoch": 2.725623934273756, + "grad_norm": 0.3319613039493561, + "learning_rate": 2.518752402327873e-07, + "loss": 0.2884, + "step": 5861 + }, + { + "epoch": 2.726088978452953, + "grad_norm": 0.3359425663948059, + "learning_rate": 2.5102792568978354e-07, + "loss": 0.3352, + "step": 5862 + }, + { + "epoch": 2.72655402263215, + "grad_norm": 0.32430392503738403, + "learning_rate": 2.501820020435619e-07, + "loss": 0.341, + "step": 5863 + }, + { + "epoch": 2.727019066811347, + "grad_norm": 0.35566022992134094, + "learning_rate": 2.4933746954188045e-07, + "loss": 0.2976, + "step": 5864 + }, + { + "epoch": 2.7274841109905443, + "grad_norm": 0.3226439356803894, + "learning_rate": 2.4849432843208786e-07, + "loss": 0.3071, + "step": 5865 + }, + { + "epoch": 2.7279491551697412, + "grad_norm": 0.3440340757369995, + "learning_rate": 2.476525789611278e-07, + "loss": 0.3447, + "step": 5866 + }, + { + "epoch": 2.728414199348938, + "grad_norm": 0.32926130294799805, + "learning_rate": 2.4681222137553304e-07, + "loss": 0.3287, + "step": 5867 + }, + { + "epoch": 2.728879243528135, + "grad_norm": 0.30582261085510254, + "learning_rate": 2.4597325592143285e-07, + "loss": 0.3176, + "step": 5868 + }, + { + "epoch": 2.729344287707332, + "grad_norm": 0.3010387718677521, + "learning_rate": 2.4513568284454504e-07, + "loss": 0.3091, + "step": 5869 + }, + { + "epoch": 2.7298093318865293, + "grad_norm": 0.3204473555088043, + "learning_rate": 2.4429950239018285e-07, + "loss": 0.3201, + "step": 5870 + }, + { + "epoch": 2.730274376065726, + "grad_norm": 0.31000176072120667, + "learning_rate": 2.4346471480324763e-07, + "loss": 0.3046, + "step": 5871 + }, + { + "epoch": 2.730739420244923, + "grad_norm": 0.323900431394577, + "learning_rate": 2.4263132032823656e-07, + "loss": 0.3215, + "step": 5872 + }, + { + "epoch": 2.7312044644241205, + "grad_norm": 0.34527266025543213, + "learning_rate": 2.417993192092372e-07, + "loss": 0.3417, + "step": 5873 + }, + { + "epoch": 2.7316695086033174, + "grad_norm": 0.3277050852775574, + "learning_rate": 2.409687116899284e-07, + "loss": 0.3255, + "step": 5874 + }, + { + "epoch": 2.7321345527825143, + "grad_norm": 0.32256096601486206, + "learning_rate": 2.401394980135835e-07, + "loss": 0.301, + "step": 5875 + }, + { + "epoch": 2.7325995969617116, + "grad_norm": 0.3484511971473694, + "learning_rate": 2.3931167842306314e-07, + "loss": 0.3085, + "step": 5876 + }, + { + "epoch": 2.7330646411409085, + "grad_norm": 0.317190557718277, + "learning_rate": 2.3848525316082503e-07, + "loss": 0.3135, + "step": 5877 + }, + { + "epoch": 2.7335296853201054, + "grad_norm": 0.3190753757953644, + "learning_rate": 2.3766022246891284e-07, + "loss": 0.3283, + "step": 5878 + }, + { + "epoch": 2.7339947294993023, + "grad_norm": 0.3363284468650818, + "learning_rate": 2.3683658658896713e-07, + "loss": 0.3127, + "step": 5879 + }, + { + "epoch": 2.7344597736784992, + "grad_norm": 0.32483959197998047, + "learning_rate": 2.3601434576221548e-07, + "loss": 0.3224, + "step": 5880 + }, + { + "epoch": 2.7349248178576966, + "grad_norm": 0.29627302289009094, + "learning_rate": 2.3519350022948083e-07, + "loss": 0.3114, + "step": 5881 + }, + { + "epoch": 2.7353898620368935, + "grad_norm": 0.32418790459632874, + "learning_rate": 2.3437405023117366e-07, + "loss": 0.3299, + "step": 5882 + }, + { + "epoch": 2.7358549062160904, + "grad_norm": 0.33903124928474426, + "learning_rate": 2.3355599600729916e-07, + "loss": 0.325, + "step": 5883 + }, + { + "epoch": 2.7363199503952877, + "grad_norm": 0.2996440827846527, + "learning_rate": 2.3273933779745016e-07, + "loss": 0.2877, + "step": 5884 + }, + { + "epoch": 2.7367849945744847, + "grad_norm": 0.3519968092441559, + "learning_rate": 2.3192407584081423e-07, + "loss": 0.3545, + "step": 5885 + }, + { + "epoch": 2.7372500387536816, + "grad_norm": 0.3154807984828949, + "learning_rate": 2.3111021037616755e-07, + "loss": 0.2675, + "step": 5886 + }, + { + "epoch": 2.7377150829328785, + "grad_norm": 0.33764511346817017, + "learning_rate": 2.3029774164187945e-07, + "loss": 0.3311, + "step": 5887 + }, + { + "epoch": 2.738180127112076, + "grad_norm": 0.31895461678504944, + "learning_rate": 2.2948666987590683e-07, + "loss": 0.2989, + "step": 5888 + }, + { + "epoch": 2.7386451712912727, + "grad_norm": 0.30818265676498413, + "learning_rate": 2.2867699531580134e-07, + "loss": 0.2961, + "step": 5889 + }, + { + "epoch": 2.7391102154704696, + "grad_norm": 0.32495778799057007, + "learning_rate": 2.278687181987016e-07, + "loss": 0.3442, + "step": 5890 + }, + { + "epoch": 2.7395752596496665, + "grad_norm": 0.3362978994846344, + "learning_rate": 2.2706183876134047e-07, + "loss": 0.3452, + "step": 5891 + }, + { + "epoch": 2.740040303828864, + "grad_norm": 0.32582196593284607, + "learning_rate": 2.262563572400389e-07, + "loss": 0.3089, + "step": 5892 + }, + { + "epoch": 2.740505348008061, + "grad_norm": 0.30926793813705444, + "learning_rate": 2.2545227387070988e-07, + "loss": 0.293, + "step": 5893 + }, + { + "epoch": 2.7409703921872577, + "grad_norm": 0.3550074100494385, + "learning_rate": 2.2464958888885613e-07, + "loss": 0.3285, + "step": 5894 + }, + { + "epoch": 2.741435436366455, + "grad_norm": 0.30779311060905457, + "learning_rate": 2.2384830252957068e-07, + "loss": 0.3237, + "step": 5895 + }, + { + "epoch": 2.741900480545652, + "grad_norm": 0.32127946615219116, + "learning_rate": 2.2304841502753804e-07, + "loss": 0.3402, + "step": 5896 + }, + { + "epoch": 2.742365524724849, + "grad_norm": 0.3655702471733093, + "learning_rate": 2.2224992661703139e-07, + "loss": 0.3244, + "step": 5897 + }, + { + "epoch": 2.7428305689040458, + "grad_norm": 0.3156966269016266, + "learning_rate": 2.2145283753191526e-07, + "loss": 0.2846, + "step": 5898 + }, + { + "epoch": 2.7432956130832427, + "grad_norm": 0.32252970337867737, + "learning_rate": 2.206571480056452e-07, + "loss": 0.3368, + "step": 5899 + }, + { + "epoch": 2.74376065726244, + "grad_norm": 0.33229538798332214, + "learning_rate": 2.1986285827126418e-07, + "loss": 0.3075, + "step": 5900 + }, + { + "epoch": 2.744225701441637, + "grad_norm": 0.33600881695747375, + "learning_rate": 2.1906996856140783e-07, + "loss": 0.334, + "step": 5901 + }, + { + "epoch": 2.744690745620834, + "grad_norm": 0.4041096270084381, + "learning_rate": 2.1827847910830034e-07, + "loss": 0.3149, + "step": 5902 + }, + { + "epoch": 2.745155789800031, + "grad_norm": 0.3284103572368622, + "learning_rate": 2.1748839014375632e-07, + "loss": 0.3283, + "step": 5903 + }, + { + "epoch": 2.745620833979228, + "grad_norm": 0.2934693396091461, + "learning_rate": 2.16699701899179e-07, + "loss": 0.2902, + "step": 5904 + }, + { + "epoch": 2.746085878158425, + "grad_norm": 0.3179258108139038, + "learning_rate": 2.1591241460556355e-07, + "loss": 0.3039, + "step": 5905 + }, + { + "epoch": 2.7465509223376223, + "grad_norm": 0.32590535283088684, + "learning_rate": 2.151265284934928e-07, + "loss": 0.3358, + "step": 5906 + }, + { + "epoch": 2.7470159665168192, + "grad_norm": 0.3357870280742645, + "learning_rate": 2.143420437931415e-07, + "loss": 0.3274, + "step": 5907 + }, + { + "epoch": 2.747481010696016, + "grad_norm": 0.3348116874694824, + "learning_rate": 2.1355896073427028e-07, + "loss": 0.2949, + "step": 5908 + }, + { + "epoch": 2.747946054875213, + "grad_norm": 0.31171727180480957, + "learning_rate": 2.127772795462324e-07, + "loss": 0.3113, + "step": 5909 + }, + { + "epoch": 2.74841109905441, + "grad_norm": 0.30832639336586, + "learning_rate": 2.1199700045797077e-07, + "loss": 0.3246, + "step": 5910 + }, + { + "epoch": 2.7488761432336073, + "grad_norm": 0.3134484887123108, + "learning_rate": 2.112181236980143e-07, + "loss": 0.3223, + "step": 5911 + }, + { + "epoch": 2.749341187412804, + "grad_norm": 0.32488688826560974, + "learning_rate": 2.104406494944855e-07, + "loss": 0.3119, + "step": 5912 + }, + { + "epoch": 2.749806231592001, + "grad_norm": 0.3216099739074707, + "learning_rate": 2.0966457807509222e-07, + "loss": 0.3364, + "step": 5913 + }, + { + "epoch": 2.7502712757711985, + "grad_norm": 0.32613542675971985, + "learning_rate": 2.088899096671343e-07, + "loss": 0.3025, + "step": 5914 + }, + { + "epoch": 2.7507363199503954, + "grad_norm": 0.32675376534461975, + "learning_rate": 2.0811664449749857e-07, + "loss": 0.3239, + "step": 5915 + }, + { + "epoch": 2.7512013641295923, + "grad_norm": 0.28670793771743774, + "learning_rate": 2.073447827926628e-07, + "loss": 0.2655, + "step": 5916 + }, + { + "epoch": 2.751666408308789, + "grad_norm": 0.31409409642219543, + "learning_rate": 2.0657432477869165e-07, + "loss": 0.3476, + "step": 5917 + }, + { + "epoch": 2.7521314524879865, + "grad_norm": 0.3178742229938507, + "learning_rate": 2.0580527068124134e-07, + "loss": 0.3484, + "step": 5918 + }, + { + "epoch": 2.7525964966671834, + "grad_norm": 0.3407832980155945, + "learning_rate": 2.0503762072555387e-07, + "loss": 0.329, + "step": 5919 + }, + { + "epoch": 2.7530615408463803, + "grad_norm": 0.3183245360851288, + "learning_rate": 2.0427137513646167e-07, + "loss": 0.3063, + "step": 5920 + }, + { + "epoch": 2.7535265850255772, + "grad_norm": 0.3081268072128296, + "learning_rate": 2.0350653413838573e-07, + "loss": 0.3044, + "step": 5921 + }, + { + "epoch": 2.7539916292047746, + "grad_norm": 0.31228625774383545, + "learning_rate": 2.0274309795533687e-07, + "loss": 0.3156, + "step": 5922 + }, + { + "epoch": 2.7544566733839715, + "grad_norm": 0.3150801956653595, + "learning_rate": 2.0198106681091124e-07, + "loss": 0.3221, + "step": 5923 + }, + { + "epoch": 2.7549217175631684, + "grad_norm": 0.2927737534046173, + "learning_rate": 2.012204409282964e-07, + "loss": 0.2869, + "step": 5924 + }, + { + "epoch": 2.7553867617423657, + "grad_norm": 0.32298728823661804, + "learning_rate": 2.0046122053026697e-07, + "loss": 0.3349, + "step": 5925 + }, + { + "epoch": 2.7558518059215626, + "grad_norm": 0.34636053442955017, + "learning_rate": 1.9970340583918668e-07, + "loss": 0.33, + "step": 5926 + }, + { + "epoch": 2.7563168501007596, + "grad_norm": 0.3538902699947357, + "learning_rate": 1.989469970770064e-07, + "loss": 0.3174, + "step": 5927 + }, + { + "epoch": 2.7567818942799565, + "grad_norm": 0.32646122574806213, + "learning_rate": 1.9819199446526716e-07, + "loss": 0.2947, + "step": 5928 + }, + { + "epoch": 2.7572469384591534, + "grad_norm": 0.35514765977859497, + "learning_rate": 1.9743839822509547e-07, + "loss": 0.3108, + "step": 5929 + }, + { + "epoch": 2.7577119826383507, + "grad_norm": 0.31268438696861267, + "learning_rate": 1.9668620857720865e-07, + "loss": 0.3144, + "step": 5930 + }, + { + "epoch": 2.7581770268175476, + "grad_norm": 0.2988133728504181, + "learning_rate": 1.9593542574190993e-07, + "loss": 0.2633, + "step": 5931 + }, + { + "epoch": 2.7586420709967445, + "grad_norm": 0.3349534869194031, + "learning_rate": 1.9518604993909175e-07, + "loss": 0.3689, + "step": 5932 + }, + { + "epoch": 2.759107115175942, + "grad_norm": 0.3589261472225189, + "learning_rate": 1.9443808138823404e-07, + "loss": 0.3216, + "step": 5933 + }, + { + "epoch": 2.7595721593551388, + "grad_norm": 0.3319076895713806, + "learning_rate": 1.9369152030840553e-07, + "loss": 0.3064, + "step": 5934 + }, + { + "epoch": 2.7600372035343357, + "grad_norm": 0.30595728754997253, + "learning_rate": 1.9294636691826073e-07, + "loss": 0.3035, + "step": 5935 + }, + { + "epoch": 2.760502247713533, + "grad_norm": 0.32479432225227356, + "learning_rate": 1.9220262143604395e-07, + "loss": 0.317, + "step": 5936 + }, + { + "epoch": 2.76096729189273, + "grad_norm": 0.33985117077827454, + "learning_rate": 1.9146028407958483e-07, + "loss": 0.3118, + "step": 5937 + }, + { + "epoch": 2.761432336071927, + "grad_norm": 0.32750189304351807, + "learning_rate": 1.907193550663028e-07, + "loss": 0.3164, + "step": 5938 + }, + { + "epoch": 2.7618973802511237, + "grad_norm": 0.3345829248428345, + "learning_rate": 1.899798346132037e-07, + "loss": 0.3506, + "step": 5939 + }, + { + "epoch": 2.7623624244303207, + "grad_norm": 0.31959566473960876, + "learning_rate": 1.8924172293688148e-07, + "loss": 0.2971, + "step": 5940 + }, + { + "epoch": 2.762827468609518, + "grad_norm": 0.3132396638393402, + "learning_rate": 1.885050202535166e-07, + "loss": 0.2998, + "step": 5941 + }, + { + "epoch": 2.763292512788715, + "grad_norm": 0.3381306231021881, + "learning_rate": 1.877697267788775e-07, + "loss": 0.3455, + "step": 5942 + }, + { + "epoch": 2.763757556967912, + "grad_norm": 0.3132924735546112, + "learning_rate": 1.870358427283192e-07, + "loss": 0.3172, + "step": 5943 + }, + { + "epoch": 2.764222601147109, + "grad_norm": 0.303196519613266, + "learning_rate": 1.8630336831678475e-07, + "loss": 0.3258, + "step": 5944 + }, + { + "epoch": 2.764687645326306, + "grad_norm": 0.3348020017147064, + "learning_rate": 1.8557230375880364e-07, + "loss": 0.3243, + "step": 5945 + }, + { + "epoch": 2.765152689505503, + "grad_norm": 0.33610332012176514, + "learning_rate": 1.848426492684946e-07, + "loss": 0.362, + "step": 5946 + }, + { + "epoch": 2.7656177336847003, + "grad_norm": 0.31693512201309204, + "learning_rate": 1.8411440505956e-07, + "loss": 0.2884, + "step": 5947 + }, + { + "epoch": 2.7660827778638972, + "grad_norm": 0.3176809847354889, + "learning_rate": 1.833875713452904e-07, + "loss": 0.3094, + "step": 5948 + }, + { + "epoch": 2.766547822043094, + "grad_norm": 0.3155810534954071, + "learning_rate": 1.8266214833856432e-07, + "loss": 0.3281, + "step": 5949 + }, + { + "epoch": 2.767012866222291, + "grad_norm": 0.31794315576553345, + "learning_rate": 1.819381362518463e-07, + "loss": 0.3407, + "step": 5950 + }, + { + "epoch": 2.767477910401488, + "grad_norm": 0.2952190935611725, + "learning_rate": 1.8121553529718782e-07, + "loss": 0.296, + "step": 5951 + }, + { + "epoch": 2.7679429545806853, + "grad_norm": 0.32220354676246643, + "learning_rate": 1.8049434568622627e-07, + "loss": 0.3271, + "step": 5952 + }, + { + "epoch": 2.768407998759882, + "grad_norm": 0.3344385027885437, + "learning_rate": 1.7977456763018764e-07, + "loss": 0.321, + "step": 5953 + }, + { + "epoch": 2.768873042939079, + "grad_norm": 0.330684095621109, + "learning_rate": 1.7905620133988166e-07, + "loss": 0.3028, + "step": 5954 + }, + { + "epoch": 2.7693380871182764, + "grad_norm": 0.31999465823173523, + "learning_rate": 1.7833924702570725e-07, + "loss": 0.3465, + "step": 5955 + }, + { + "epoch": 2.7698031312974734, + "grad_norm": 0.28892767429351807, + "learning_rate": 1.7762370489764813e-07, + "loss": 0.3031, + "step": 5956 + }, + { + "epoch": 2.7702681754766703, + "grad_norm": 0.3207509219646454, + "learning_rate": 1.7690957516527607e-07, + "loss": 0.322, + "step": 5957 + }, + { + "epoch": 2.770733219655867, + "grad_norm": 0.32161515951156616, + "learning_rate": 1.76196858037746e-07, + "loss": 0.3114, + "step": 5958 + }, + { + "epoch": 2.771198263835064, + "grad_norm": 0.3222302496433258, + "learning_rate": 1.7548555372380372e-07, + "loss": 0.3151, + "step": 5959 + }, + { + "epoch": 2.7716633080142614, + "grad_norm": 0.3225840926170349, + "learning_rate": 1.7477566243177647e-07, + "loss": 0.301, + "step": 5960 + }, + { + "epoch": 2.7721283521934583, + "grad_norm": 0.33795180916786194, + "learning_rate": 1.740671843695818e-07, + "loss": 0.3138, + "step": 5961 + }, + { + "epoch": 2.7725933963726552, + "grad_norm": 0.3079066574573517, + "learning_rate": 1.7336011974471933e-07, + "loss": 0.3115, + "step": 5962 + }, + { + "epoch": 2.7730584405518526, + "grad_norm": 0.317829430103302, + "learning_rate": 1.7265446876427895e-07, + "loss": 0.3104, + "step": 5963 + }, + { + "epoch": 2.7735234847310495, + "grad_norm": 0.32310232520103455, + "learning_rate": 1.7195023163493253e-07, + "loss": 0.318, + "step": 5964 + }, + { + "epoch": 2.7739885289102464, + "grad_norm": 0.3459528386592865, + "learning_rate": 1.712474085629412e-07, + "loss": 0.3189, + "step": 5965 + }, + { + "epoch": 2.7744535730894437, + "grad_norm": 0.3017194867134094, + "learning_rate": 1.7054599975414866e-07, + "loss": 0.3033, + "step": 5966 + }, + { + "epoch": 2.7749186172686406, + "grad_norm": 0.32311463356018066, + "learning_rate": 1.6984600541398777e-07, + "loss": 0.3371, + "step": 5967 + }, + { + "epoch": 2.7753836614478375, + "grad_norm": 0.29081717133522034, + "learning_rate": 1.6914742574747455e-07, + "loss": 0.3338, + "step": 5968 + }, + { + "epoch": 2.7758487056270345, + "grad_norm": 0.3164746165275574, + "learning_rate": 1.6845026095921314e-07, + "loss": 0.3281, + "step": 5969 + }, + { + "epoch": 2.7763137498062314, + "grad_norm": 0.3146812915802002, + "learning_rate": 1.677545112533896e-07, + "loss": 0.3042, + "step": 5970 + }, + { + "epoch": 2.7767787939854287, + "grad_norm": 0.3127114474773407, + "learning_rate": 1.6706017683377928e-07, + "loss": 0.2914, + "step": 5971 + }, + { + "epoch": 2.7772438381646256, + "grad_norm": 0.3269754946231842, + "learning_rate": 1.663672579037412e-07, + "loss": 0.3242, + "step": 5972 + }, + { + "epoch": 2.7777088823438225, + "grad_norm": 0.323904424905777, + "learning_rate": 1.6567575466621964e-07, + "loss": 0.2977, + "step": 5973 + }, + { + "epoch": 2.77817392652302, + "grad_norm": 0.3124120235443115, + "learning_rate": 1.6498566732374433e-07, + "loss": 0.328, + "step": 5974 + }, + { + "epoch": 2.7786389707022168, + "grad_norm": 0.31669923663139343, + "learning_rate": 1.6429699607843185e-07, + "loss": 0.3143, + "step": 5975 + }, + { + "epoch": 2.7791040148814137, + "grad_norm": 0.3284464478492737, + "learning_rate": 1.6360974113198203e-07, + "loss": 0.3065, + "step": 5976 + }, + { + "epoch": 2.779569059060611, + "grad_norm": 0.31875741481781006, + "learning_rate": 1.6292390268568103e-07, + "loss": 0.316, + "step": 5977 + }, + { + "epoch": 2.780034103239808, + "grad_norm": 0.34978044033050537, + "learning_rate": 1.6223948094039876e-07, + "loss": 0.3321, + "step": 5978 + }, + { + "epoch": 2.780499147419005, + "grad_norm": 0.32049715518951416, + "learning_rate": 1.615564760965921e-07, + "loss": 0.3292, + "step": 5979 + }, + { + "epoch": 2.7809641915982017, + "grad_norm": 0.30878329277038574, + "learning_rate": 1.6087488835430208e-07, + "loss": 0.3102, + "step": 5980 + }, + { + "epoch": 2.7814292357773986, + "grad_norm": 0.3212791979312897, + "learning_rate": 1.6019471791315522e-07, + "loss": 0.2984, + "step": 5981 + }, + { + "epoch": 2.781894279956596, + "grad_norm": 0.3322344124317169, + "learning_rate": 1.5951596497236154e-07, + "loss": 0.3274, + "step": 5982 + }, + { + "epoch": 2.782359324135793, + "grad_norm": 0.3315823972225189, + "learning_rate": 1.5883862973071652e-07, + "loss": 0.3367, + "step": 5983 + }, + { + "epoch": 2.78282436831499, + "grad_norm": 0.33415260910987854, + "learning_rate": 1.5816271238660196e-07, + "loss": 0.3106, + "step": 5984 + }, + { + "epoch": 2.783289412494187, + "grad_norm": 0.31730806827545166, + "learning_rate": 1.5748821313798124e-07, + "loss": 0.3076, + "step": 5985 + }, + { + "epoch": 2.783754456673384, + "grad_norm": 0.3219851553440094, + "learning_rate": 1.5681513218240573e-07, + "loss": 0.3218, + "step": 5986 + }, + { + "epoch": 2.784219500852581, + "grad_norm": 0.30945420265197754, + "learning_rate": 1.5614346971700945e-07, + "loss": 0.3242, + "step": 5987 + }, + { + "epoch": 2.784684545031778, + "grad_norm": 0.3280915915966034, + "learning_rate": 1.554732259385111e-07, + "loss": 0.3007, + "step": 5988 + }, + { + "epoch": 2.7851495892109748, + "grad_norm": 0.320148229598999, + "learning_rate": 1.5480440104321481e-07, + "loss": 0.3254, + "step": 5989 + }, + { + "epoch": 2.785614633390172, + "grad_norm": 0.31728118658065796, + "learning_rate": 1.5413699522700775e-07, + "loss": 0.3202, + "step": 5990 + }, + { + "epoch": 2.786079677569369, + "grad_norm": 0.30805543065071106, + "learning_rate": 1.5347100868536246e-07, + "loss": 0.3265, + "step": 5991 + }, + { + "epoch": 2.786544721748566, + "grad_norm": 0.3176276385784149, + "learning_rate": 1.5280644161333625e-07, + "loss": 0.3275, + "step": 5992 + }, + { + "epoch": 2.7870097659277633, + "grad_norm": 0.3044149577617645, + "learning_rate": 1.521432942055695e-07, + "loss": 0.297, + "step": 5993 + }, + { + "epoch": 2.78747481010696, + "grad_norm": 0.3212355971336365, + "learning_rate": 1.51481566656288e-07, + "loss": 0.3568, + "step": 5994 + }, + { + "epoch": 2.787939854286157, + "grad_norm": 0.3032307028770447, + "learning_rate": 1.5082125915929946e-07, + "loss": 0.3037, + "step": 5995 + }, + { + "epoch": 2.7884048984653544, + "grad_norm": 0.3063505291938782, + "learning_rate": 1.5016237190799866e-07, + "loss": 0.2971, + "step": 5996 + }, + { + "epoch": 2.7888699426445513, + "grad_norm": 0.2914872169494629, + "learning_rate": 1.4950490509536176e-07, + "loss": 0.313, + "step": 5997 + }, + { + "epoch": 2.7893349868237483, + "grad_norm": 0.3118445575237274, + "learning_rate": 1.4884885891395196e-07, + "loss": 0.3192, + "step": 5998 + }, + { + "epoch": 2.789800031002945, + "grad_norm": 0.33914464712142944, + "learning_rate": 1.4819423355591223e-07, + "loss": 0.348, + "step": 5999 + }, + { + "epoch": 2.790265075182142, + "grad_norm": 0.32917240262031555, + "learning_rate": 1.4754102921297363e-07, + "loss": 0.3033, + "step": 6000 + }, + { + "epoch": 2.7907301193613394, + "grad_norm": 0.33126407861709595, + "learning_rate": 1.4688924607644817e-07, + "loss": 0.312, + "step": 6001 + }, + { + "epoch": 2.7911951635405363, + "grad_norm": 0.3238784372806549, + "learning_rate": 1.46238884337232e-07, + "loss": 0.3091, + "step": 6002 + }, + { + "epoch": 2.7916602077197332, + "grad_norm": 0.32593873143196106, + "learning_rate": 1.4558994418580663e-07, + "loss": 0.3353, + "step": 6003 + }, + { + "epoch": 2.7921252518989306, + "grad_norm": 0.2928268611431122, + "learning_rate": 1.4494242581223615e-07, + "loss": 0.2997, + "step": 6004 + }, + { + "epoch": 2.7925902960781275, + "grad_norm": 0.3363851308822632, + "learning_rate": 1.4429632940616721e-07, + "loss": 0.3243, + "step": 6005 + }, + { + "epoch": 2.7930553402573244, + "grad_norm": 0.3205671012401581, + "learning_rate": 1.4365165515683176e-07, + "loss": 0.3337, + "step": 6006 + }, + { + "epoch": 2.7935203844365217, + "grad_norm": 0.3109828233718872, + "learning_rate": 1.4300840325304377e-07, + "loss": 0.2985, + "step": 6007 + }, + { + "epoch": 2.7939854286157186, + "grad_norm": 0.29912734031677246, + "learning_rate": 1.4236657388320198e-07, + "loss": 0.3093, + "step": 6008 + }, + { + "epoch": 2.7944504727949155, + "grad_norm": 0.3295075297355652, + "learning_rate": 1.417261672352871e-07, + "loss": 0.3717, + "step": 6009 + }, + { + "epoch": 2.7949155169741124, + "grad_norm": 0.29881277680397034, + "learning_rate": 1.4108718349686468e-07, + "loss": 0.3031, + "step": 6010 + }, + { + "epoch": 2.7953805611533094, + "grad_norm": 0.32853347063064575, + "learning_rate": 1.4044962285508113e-07, + "loss": 0.3193, + "step": 6011 + }, + { + "epoch": 2.7958456053325067, + "grad_norm": 0.32658717036247253, + "learning_rate": 1.3981348549666928e-07, + "loss": 0.3161, + "step": 6012 + }, + { + "epoch": 2.7963106495117036, + "grad_norm": 0.3037392795085907, + "learning_rate": 1.3917877160794236e-07, + "loss": 0.3352, + "step": 6013 + }, + { + "epoch": 2.7967756936909005, + "grad_norm": 0.3436054289340973, + "learning_rate": 1.385454813747983e-07, + "loss": 0.328, + "step": 6014 + }, + { + "epoch": 2.797240737870098, + "grad_norm": 0.2963773012161255, + "learning_rate": 1.3791361498271704e-07, + "loss": 0.2987, + "step": 6015 + }, + { + "epoch": 2.7977057820492948, + "grad_norm": 0.3178195655345917, + "learning_rate": 1.3728317261676338e-07, + "loss": 0.3486, + "step": 6016 + }, + { + "epoch": 2.7981708262284917, + "grad_norm": 0.39669740200042725, + "learning_rate": 1.3665415446158182e-07, + "loss": 0.3288, + "step": 6017 + }, + { + "epoch": 2.7986358704076886, + "grad_norm": 0.3178330659866333, + "learning_rate": 1.3602656070140275e-07, + "loss": 0.3005, + "step": 6018 + }, + { + "epoch": 2.7991009145868855, + "grad_norm": 0.31485146284103394, + "learning_rate": 1.354003915200375e-07, + "loss": 0.3319, + "step": 6019 + }, + { + "epoch": 2.799565958766083, + "grad_norm": 0.31439441442489624, + "learning_rate": 1.3477564710088097e-07, + "loss": 0.2969, + "step": 6020 + }, + { + "epoch": 2.8000310029452797, + "grad_norm": 0.3106626570224762, + "learning_rate": 1.3415232762691134e-07, + "loss": 0.3352, + "step": 6021 + }, + { + "epoch": 2.8004960471244766, + "grad_norm": 0.3026930093765259, + "learning_rate": 1.335304332806875e-07, + "loss": 0.312, + "step": 6022 + }, + { + "epoch": 2.800961091303674, + "grad_norm": 0.31321385502815247, + "learning_rate": 1.3290996424435375e-07, + "loss": 0.2761, + "step": 6023 + }, + { + "epoch": 2.801426135482871, + "grad_norm": 0.32276248931884766, + "learning_rate": 1.3229092069963368e-07, + "loss": 0.3453, + "step": 6024 + }, + { + "epoch": 2.801891179662068, + "grad_norm": 0.3255274295806885, + "learning_rate": 1.3167330282783608e-07, + "loss": 0.3046, + "step": 6025 + }, + { + "epoch": 2.802356223841265, + "grad_norm": 0.32487818598747253, + "learning_rate": 1.3105711080985128e-07, + "loss": 0.3285, + "step": 6026 + }, + { + "epoch": 2.802821268020462, + "grad_norm": 0.43458884954452515, + "learning_rate": 1.3044234482615216e-07, + "loss": 0.3202, + "step": 6027 + }, + { + "epoch": 2.803286312199659, + "grad_norm": 0.32257169485092163, + "learning_rate": 1.298290050567924e-07, + "loss": 0.3324, + "step": 6028 + }, + { + "epoch": 2.803751356378856, + "grad_norm": 0.3104286193847656, + "learning_rate": 1.2921709168141116e-07, + "loss": 0.3087, + "step": 6029 + }, + { + "epoch": 2.8042164005580528, + "grad_norm": 0.3442256450653076, + "learning_rate": 1.2860660487922616e-07, + "loss": 0.3041, + "step": 6030 + }, + { + "epoch": 2.80468144473725, + "grad_norm": 0.33402782678604126, + "learning_rate": 1.2799754482903992e-07, + "loss": 0.3278, + "step": 6031 + }, + { + "epoch": 2.805146488916447, + "grad_norm": 0.3222576379776001, + "learning_rate": 1.2738991170923588e-07, + "loss": 0.2998, + "step": 6032 + }, + { + "epoch": 2.805611533095644, + "grad_norm": 0.3217984139919281, + "learning_rate": 1.2678370569778052e-07, + "loss": 0.3382, + "step": 6033 + }, + { + "epoch": 2.8060765772748413, + "grad_norm": 0.3216967284679413, + "learning_rate": 1.2617892697222135e-07, + "loss": 0.3328, + "step": 6034 + }, + { + "epoch": 2.806541621454038, + "grad_norm": 0.3223290741443634, + "learning_rate": 1.2557557570968825e-07, + "loss": 0.3037, + "step": 6035 + }, + { + "epoch": 2.807006665633235, + "grad_norm": 0.3248218894004822, + "learning_rate": 1.2497365208689272e-07, + "loss": 0.3154, + "step": 6036 + }, + { + "epoch": 2.8074717098124324, + "grad_norm": 0.2976897358894348, + "learning_rate": 1.2437315628012868e-07, + "loss": 0.2948, + "step": 6037 + }, + { + "epoch": 2.8079367539916293, + "grad_norm": 0.34811344742774963, + "learning_rate": 1.2377408846527105e-07, + "loss": 0.3377, + "step": 6038 + }, + { + "epoch": 2.8084017981708262, + "grad_norm": 0.3315483033657074, + "learning_rate": 1.231764488177789e-07, + "loss": 0.3402, + "step": 6039 + }, + { + "epoch": 2.808866842350023, + "grad_norm": 0.3172501027584076, + "learning_rate": 1.225802375126889e-07, + "loss": 0.3076, + "step": 6040 + }, + { + "epoch": 2.80933188652922, + "grad_norm": 0.3447789251804352, + "learning_rate": 1.2198545472462297e-07, + "loss": 0.3221, + "step": 6041 + }, + { + "epoch": 2.8097969307084174, + "grad_norm": 0.3011327385902405, + "learning_rate": 1.2139210062778294e-07, + "loss": 0.3261, + "step": 6042 + }, + { + "epoch": 2.8102619748876143, + "grad_norm": 0.2930125594139099, + "learning_rate": 1.2080017539595312e-07, + "loss": 0.294, + "step": 6043 + }, + { + "epoch": 2.810727019066811, + "grad_norm": 0.32197386026382446, + "learning_rate": 1.202096792024976e-07, + "loss": 0.3191, + "step": 6044 + }, + { + "epoch": 2.8111920632460086, + "grad_norm": 0.3363608717918396, + "learning_rate": 1.196206122203647e-07, + "loss": 0.3183, + "step": 6045 + }, + { + "epoch": 2.8116571074252055, + "grad_norm": 0.32255131006240845, + "learning_rate": 1.1903297462208085e-07, + "loss": 0.3075, + "step": 6046 + }, + { + "epoch": 2.8121221516044024, + "grad_norm": 0.32174059748649597, + "learning_rate": 1.1844676657975673e-07, + "loss": 0.3559, + "step": 6047 + }, + { + "epoch": 2.8125871957835993, + "grad_norm": 0.31685617566108704, + "learning_rate": 1.1786198826508277e-07, + "loss": 0.2865, + "step": 6048 + }, + { + "epoch": 2.8130522399627966, + "grad_norm": 0.3263781666755676, + "learning_rate": 1.1727863984933086e-07, + "loss": 0.313, + "step": 6049 + }, + { + "epoch": 2.8135172841419935, + "grad_norm": 0.3233252763748169, + "learning_rate": 1.1669672150335487e-07, + "loss": 0.3147, + "step": 6050 + }, + { + "epoch": 2.8139823283211904, + "grad_norm": 0.32005932927131653, + "learning_rate": 1.1611623339758904e-07, + "loss": 0.3125, + "step": 6051 + }, + { + "epoch": 2.8144473725003873, + "grad_norm": 0.33904585242271423, + "learning_rate": 1.1553717570204847e-07, + "loss": 0.301, + "step": 6052 + }, + { + "epoch": 2.8149124166795847, + "grad_norm": 0.33245572447776794, + "learning_rate": 1.149595485863303e-07, + "loss": 0.3283, + "step": 6053 + }, + { + "epoch": 2.8153774608587816, + "grad_norm": 0.3273317515850067, + "learning_rate": 1.1438335221961195e-07, + "loss": 0.3298, + "step": 6054 + }, + { + "epoch": 2.8158425050379785, + "grad_norm": 0.31557828187942505, + "learning_rate": 1.1380858677065177e-07, + "loss": 0.3146, + "step": 6055 + }, + { + "epoch": 2.816307549217176, + "grad_norm": 0.29916810989379883, + "learning_rate": 1.1323525240778954e-07, + "loss": 0.3233, + "step": 6056 + }, + { + "epoch": 2.8167725933963728, + "grad_norm": 0.3167668282985687, + "learning_rate": 1.1266334929894485e-07, + "loss": 0.3322, + "step": 6057 + }, + { + "epoch": 2.8172376375755697, + "grad_norm": 0.2994489371776581, + "learning_rate": 1.120928776116198e-07, + "loss": 0.3082, + "step": 6058 + }, + { + "epoch": 2.8177026817547666, + "grad_norm": 0.31627991795539856, + "learning_rate": 1.1152383751289575e-07, + "loss": 0.3136, + "step": 6059 + }, + { + "epoch": 2.8181677259339635, + "grad_norm": 0.3162107467651367, + "learning_rate": 1.1095622916943494e-07, + "loss": 0.3221, + "step": 6060 + }, + { + "epoch": 2.818632770113161, + "grad_norm": 0.31993138790130615, + "learning_rate": 1.103900527474816e-07, + "loss": 0.3206, + "step": 6061 + }, + { + "epoch": 2.8190978142923577, + "grad_norm": 0.3255464732646942, + "learning_rate": 1.0982530841285921e-07, + "loss": 0.3131, + "step": 6062 + }, + { + "epoch": 2.8195628584715546, + "grad_norm": 0.317570298910141, + "learning_rate": 1.0926199633097156e-07, + "loss": 0.2974, + "step": 6063 + }, + { + "epoch": 2.820027902650752, + "grad_norm": 0.3149189352989197, + "learning_rate": 1.0870011666680502e-07, + "loss": 0.2871, + "step": 6064 + }, + { + "epoch": 2.820492946829949, + "grad_norm": 0.3116709589958191, + "learning_rate": 1.081396695849235e-07, + "loss": 0.3331, + "step": 6065 + }, + { + "epoch": 2.820957991009146, + "grad_norm": 0.3222356140613556, + "learning_rate": 1.0758065524947403e-07, + "loss": 0.328, + "step": 6066 + }, + { + "epoch": 2.821423035188343, + "grad_norm": 0.2908288538455963, + "learning_rate": 1.0702307382418175e-07, + "loss": 0.2997, + "step": 6067 + }, + { + "epoch": 2.82188807936754, + "grad_norm": 0.3209805190563202, + "learning_rate": 1.0646692547235437e-07, + "loss": 0.3317, + "step": 6068 + }, + { + "epoch": 2.822353123546737, + "grad_norm": 0.3027243912220001, + "learning_rate": 1.0591221035687716e-07, + "loss": 0.3113, + "step": 6069 + }, + { + "epoch": 2.822818167725934, + "grad_norm": 0.30290907621383667, + "learning_rate": 1.0535892864021901e-07, + "loss": 0.3267, + "step": 6070 + }, + { + "epoch": 2.8232832119051308, + "grad_norm": 0.3006175756454468, + "learning_rate": 1.0480708048442589e-07, + "loss": 0.3119, + "step": 6071 + }, + { + "epoch": 2.823748256084328, + "grad_norm": 0.34534621238708496, + "learning_rate": 1.0425666605112516e-07, + "loss": 0.3862, + "step": 6072 + }, + { + "epoch": 2.824213300263525, + "grad_norm": 0.317735493183136, + "learning_rate": 1.0370768550152454e-07, + "loss": 0.3082, + "step": 6073 + }, + { + "epoch": 2.824678344442722, + "grad_norm": 0.36195850372314453, + "learning_rate": 1.0316013899641264e-07, + "loss": 0.3223, + "step": 6074 + }, + { + "epoch": 2.8251433886219193, + "grad_norm": 0.3154940903186798, + "learning_rate": 1.0261402669615505e-07, + "loss": 0.3041, + "step": 6075 + }, + { + "epoch": 2.825608432801116, + "grad_norm": 0.33876118063926697, + "learning_rate": 1.0206934876070052e-07, + "loss": 0.3145, + "step": 6076 + }, + { + "epoch": 2.826073476980313, + "grad_norm": 0.3257219195365906, + "learning_rate": 1.015261053495753e-07, + "loss": 0.3304, + "step": 6077 + }, + { + "epoch": 2.8265385211595104, + "grad_norm": 0.30201414227485657, + "learning_rate": 1.0098429662188769e-07, + "loss": 0.3044, + "step": 6078 + }, + { + "epoch": 2.8270035653387073, + "grad_norm": 0.31688815355300903, + "learning_rate": 1.0044392273632354e-07, + "loss": 0.3012, + "step": 6079 + }, + { + "epoch": 2.8274686095179042, + "grad_norm": 0.32895052433013916, + "learning_rate": 9.990498385115066e-08, + "loss": 0.3256, + "step": 6080 + }, + { + "epoch": 2.827933653697101, + "grad_norm": 0.32152754068374634, + "learning_rate": 9.936748012421504e-08, + "loss": 0.3223, + "step": 6081 + }, + { + "epoch": 2.828398697876298, + "grad_norm": 0.31743600964546204, + "learning_rate": 9.883141171294242e-08, + "loss": 0.2809, + "step": 6082 + }, + { + "epoch": 2.8288637420554954, + "grad_norm": 0.3381618857383728, + "learning_rate": 9.829677877433886e-08, + "loss": 0.3185, + "step": 6083 + }, + { + "epoch": 2.8293287862346923, + "grad_norm": 0.32062768936157227, + "learning_rate": 9.776358146498966e-08, + "loss": 0.3108, + "step": 6084 + }, + { + "epoch": 2.829793830413889, + "grad_norm": 0.30004435777664185, + "learning_rate": 9.72318199410599e-08, + "loss": 0.3067, + "step": 6085 + }, + { + "epoch": 2.8302588745930866, + "grad_norm": 0.3237650990486145, + "learning_rate": 9.670149435829334e-08, + "loss": 0.3609, + "step": 6086 + }, + { + "epoch": 2.8307239187722835, + "grad_norm": 0.3168402314186096, + "learning_rate": 9.617260487201407e-08, + "loss": 0.3113, + "step": 6087 + }, + { + "epoch": 2.8311889629514804, + "grad_norm": 0.3079497218132019, + "learning_rate": 9.564515163712595e-08, + "loss": 0.2955, + "step": 6088 + }, + { + "epoch": 2.8316540071306773, + "grad_norm": 0.3398304581642151, + "learning_rate": 9.511913480810985e-08, + "loss": 0.3315, + "step": 6089 + }, + { + "epoch": 2.832119051309874, + "grad_norm": 0.35483306646347046, + "learning_rate": 9.459455453902866e-08, + "loss": 0.3551, + "step": 6090 + }, + { + "epoch": 2.8325840954890715, + "grad_norm": 0.34219107031822205, + "learning_rate": 9.407141098352335e-08, + "loss": 0.3129, + "step": 6091 + }, + { + "epoch": 2.8330491396682684, + "grad_norm": 0.29722991585731506, + "learning_rate": 9.354970429481413e-08, + "loss": 0.2773, + "step": 6092 + }, + { + "epoch": 2.8335141838474653, + "grad_norm": 0.31382736563682556, + "learning_rate": 9.302943462569991e-08, + "loss": 0.3483, + "step": 6093 + }, + { + "epoch": 2.8339792280266627, + "grad_norm": 0.3087048828601837, + "learning_rate": 9.25106021285599e-08, + "loss": 0.2727, + "step": 6094 + }, + { + "epoch": 2.8344442722058596, + "grad_norm": 0.3467859923839569, + "learning_rate": 9.199320695535086e-08, + "loss": 0.3607, + "step": 6095 + }, + { + "epoch": 2.8349093163850565, + "grad_norm": 0.33929920196533203, + "learning_rate": 9.147724925760993e-08, + "loss": 0.2912, + "step": 6096 + }, + { + "epoch": 2.835374360564254, + "grad_norm": 0.31342393159866333, + "learning_rate": 9.096272918645343e-08, + "loss": 0.2958, + "step": 6097 + }, + { + "epoch": 2.8358394047434508, + "grad_norm": 0.31853240728378296, + "learning_rate": 9.044964689257474e-08, + "loss": 0.3308, + "step": 6098 + }, + { + "epoch": 2.8363044489226477, + "grad_norm": 0.32699882984161377, + "learning_rate": 8.993800252624863e-08, + "loss": 0.2928, + "step": 6099 + }, + { + "epoch": 2.8367694931018446, + "grad_norm": 0.2974814474582672, + "learning_rate": 8.942779623732578e-08, + "loss": 0.2999, + "step": 6100 + }, + { + "epoch": 2.8372345372810415, + "grad_norm": 0.32415345311164856, + "learning_rate": 8.89190281752389e-08, + "loss": 0.326, + "step": 6101 + }, + { + "epoch": 2.837699581460239, + "grad_norm": 0.31111282110214233, + "learning_rate": 8.841169848899711e-08, + "loss": 0.3196, + "step": 6102 + }, + { + "epoch": 2.8381646256394357, + "grad_norm": 0.3355112671852112, + "learning_rate": 8.790580732718934e-08, + "loss": 0.3027, + "step": 6103 + }, + { + "epoch": 2.8386296698186326, + "grad_norm": 0.33080142736434937, + "learning_rate": 8.740135483798207e-08, + "loss": 0.3272, + "step": 6104 + }, + { + "epoch": 2.83909471399783, + "grad_norm": 0.3315957486629486, + "learning_rate": 8.68983411691221e-08, + "loss": 0.3239, + "step": 6105 + }, + { + "epoch": 2.839559758177027, + "grad_norm": 0.3323264420032501, + "learning_rate": 8.639676646793382e-08, + "loss": 0.3448, + "step": 6106 + }, + { + "epoch": 2.840024802356224, + "grad_norm": 0.29447272419929504, + "learning_rate": 8.589663088131972e-08, + "loss": 0.2987, + "step": 6107 + }, + { + "epoch": 2.840489846535421, + "grad_norm": 0.3482167422771454, + "learning_rate": 8.539793455576207e-08, + "loss": 0.3381, + "step": 6108 + }, + { + "epoch": 2.840954890714618, + "grad_norm": 0.29525330662727356, + "learning_rate": 8.490067763732124e-08, + "loss": 0.3098, + "step": 6109 + }, + { + "epoch": 2.841419934893815, + "grad_norm": 0.3327022194862366, + "learning_rate": 8.44048602716352e-08, + "loss": 0.3471, + "step": 6110 + }, + { + "epoch": 2.841884979073012, + "grad_norm": 0.3110980987548828, + "learning_rate": 8.391048260392054e-08, + "loss": 0.3353, + "step": 6111 + }, + { + "epoch": 2.8423500232522088, + "grad_norm": 0.31810250878334045, + "learning_rate": 8.341754477897257e-08, + "loss": 0.3223, + "step": 6112 + }, + { + "epoch": 2.842815067431406, + "grad_norm": 0.3068527281284332, + "learning_rate": 8.292604694116523e-08, + "loss": 0.3382, + "step": 6113 + }, + { + "epoch": 2.843280111610603, + "grad_norm": 0.32268133759498596, + "learning_rate": 8.24359892344495e-08, + "loss": 0.3259, + "step": 6114 + }, + { + "epoch": 2.8437451557898, + "grad_norm": 0.31271931529045105, + "learning_rate": 8.194737180235668e-08, + "loss": 0.3327, + "step": 6115 + }, + { + "epoch": 2.8442101999689973, + "grad_norm": 0.31981274485588074, + "learning_rate": 8.146019478799282e-08, + "loss": 0.2942, + "step": 6116 + }, + { + "epoch": 2.844675244148194, + "grad_norm": 0.3350343406200409, + "learning_rate": 8.097445833404605e-08, + "loss": 0.3287, + "step": 6117 + }, + { + "epoch": 2.845140288327391, + "grad_norm": 0.316834419965744, + "learning_rate": 8.049016258277976e-08, + "loss": 0.2982, + "step": 6118 + }, + { + "epoch": 2.845605332506588, + "grad_norm": 0.3442407548427582, + "learning_rate": 8.000730767603604e-08, + "loss": 0.342, + "step": 6119 + }, + { + "epoch": 2.846070376685785, + "grad_norm": 0.3273371160030365, + "learning_rate": 7.952589375523567e-08, + "loss": 0.346, + "step": 6120 + }, + { + "epoch": 2.8465354208649822, + "grad_norm": 0.29861021041870117, + "learning_rate": 7.904592096137753e-08, + "loss": 0.3062, + "step": 6121 + }, + { + "epoch": 2.847000465044179, + "grad_norm": 0.3003145158290863, + "learning_rate": 7.856738943503694e-08, + "loss": 0.3057, + "step": 6122 + }, + { + "epoch": 2.847465509223376, + "grad_norm": 0.2993358373641968, + "learning_rate": 7.809029931636902e-08, + "loss": 0.3087, + "step": 6123 + }, + { + "epoch": 2.8479305534025734, + "grad_norm": 0.31024885177612305, + "learning_rate": 7.761465074510422e-08, + "loss": 0.3314, + "step": 6124 + }, + { + "epoch": 2.8483955975817703, + "grad_norm": 0.31834203004837036, + "learning_rate": 7.714044386055386e-08, + "loss": 0.3016, + "step": 6125 + }, + { + "epoch": 2.848860641760967, + "grad_norm": 0.3217991888523102, + "learning_rate": 7.666767880160464e-08, + "loss": 0.3448, + "step": 6126 + }, + { + "epoch": 2.8493256859401646, + "grad_norm": 0.29849451780319214, + "learning_rate": 7.619635570672135e-08, + "loss": 0.3216, + "step": 6127 + }, + { + "epoch": 2.8497907301193615, + "grad_norm": 0.31368592381477356, + "learning_rate": 7.5726474713948e-08, + "loss": 0.303, + "step": 6128 + }, + { + "epoch": 2.8502557742985584, + "grad_norm": 0.295265257358551, + "learning_rate": 7.525803596090397e-08, + "loss": 0.3073, + "step": 6129 + }, + { + "epoch": 2.8507208184777553, + "grad_norm": 0.31519418954849243, + "learning_rate": 7.479103958478783e-08, + "loss": 0.3233, + "step": 6130 + }, + { + "epoch": 2.851185862656952, + "grad_norm": 0.3676014840602875, + "learning_rate": 7.432548572237519e-08, + "loss": 0.3799, + "step": 6131 + }, + { + "epoch": 2.8516509068361495, + "grad_norm": 0.3276804983615875, + "learning_rate": 7.386137451001974e-08, + "loss": 0.3389, + "step": 6132 + }, + { + "epoch": 2.8521159510153464, + "grad_norm": 0.3015483319759369, + "learning_rate": 7.339870608365107e-08, + "loss": 0.291, + "step": 6133 + }, + { + "epoch": 2.8525809951945433, + "grad_norm": 0.3150058090686798, + "learning_rate": 7.293748057877859e-08, + "loss": 0.3113, + "step": 6134 + }, + { + "epoch": 2.8530460393737407, + "grad_norm": 0.3133685886859894, + "learning_rate": 7.247769813048644e-08, + "loss": 0.3141, + "step": 6135 + }, + { + "epoch": 2.8535110835529376, + "grad_norm": 0.3154557943344116, + "learning_rate": 7.201935887343858e-08, + "loss": 0.3067, + "step": 6136 + }, + { + "epoch": 2.8539761277321345, + "grad_norm": 0.32975584268569946, + "learning_rate": 7.156246294187374e-08, + "loss": 0.3272, + "step": 6137 + }, + { + "epoch": 2.854441171911332, + "grad_norm": 0.31270360946655273, + "learning_rate": 7.110701046961044e-08, + "loss": 0.3105, + "step": 6138 + }, + { + "epoch": 2.8549062160905287, + "grad_norm": 0.3216429650783539, + "learning_rate": 7.065300159004307e-08, + "loss": 0.299, + "step": 6139 + }, + { + "epoch": 2.8553712602697257, + "grad_norm": 0.33457085490226746, + "learning_rate": 7.02004364361436e-08, + "loss": 0.3355, + "step": 6140 + }, + { + "epoch": 2.8558363044489226, + "grad_norm": 0.31512632966041565, + "learning_rate": 6.974931514046046e-08, + "loss": 0.3328, + "step": 6141 + }, + { + "epoch": 2.8563013486281195, + "grad_norm": 0.3386141359806061, + "learning_rate": 6.929963783511961e-08, + "loss": 0.3283, + "step": 6142 + }, + { + "epoch": 2.856766392807317, + "grad_norm": 0.30703625082969666, + "learning_rate": 6.885140465182516e-08, + "loss": 0.3204, + "step": 6143 + }, + { + "epoch": 2.8572314369865137, + "grad_norm": 0.31355178356170654, + "learning_rate": 6.840461572185708e-08, + "loss": 0.2829, + "step": 6144 + }, + { + "epoch": 2.8576964811657106, + "grad_norm": 0.36136549711227417, + "learning_rate": 6.795927117607238e-08, + "loss": 0.3508, + "step": 6145 + }, + { + "epoch": 2.858161525344908, + "grad_norm": 0.30663174390792847, + "learning_rate": 6.751537114490503e-08, + "loss": 0.3015, + "step": 6146 + }, + { + "epoch": 2.858626569524105, + "grad_norm": 0.35951730608940125, + "learning_rate": 6.707291575836661e-08, + "loss": 0.3571, + "step": 6147 + }, + { + "epoch": 2.859091613703302, + "grad_norm": 0.3195739984512329, + "learning_rate": 6.663190514604456e-08, + "loss": 0.304, + "step": 6148 + }, + { + "epoch": 2.8595566578824987, + "grad_norm": 0.32330840826034546, + "learning_rate": 6.61923394371039e-08, + "loss": 0.314, + "step": 6149 + }, + { + "epoch": 2.8600217020616956, + "grad_norm": 0.30728137493133545, + "learning_rate": 6.575421876028721e-08, + "loss": 0.3342, + "step": 6150 + }, + { + "epoch": 2.860486746240893, + "grad_norm": 0.2898307740688324, + "learning_rate": 6.531754324391126e-08, + "loss": 0.3215, + "step": 6151 + }, + { + "epoch": 2.86095179042009, + "grad_norm": 0.308747798204422, + "learning_rate": 6.488231301587266e-08, + "loss": 0.3281, + "step": 6152 + }, + { + "epoch": 2.8614168345992868, + "grad_norm": 0.32874009013175964, + "learning_rate": 6.444852820364222e-08, + "loss": 0.3273, + "step": 6153 + }, + { + "epoch": 2.861881878778484, + "grad_norm": 0.3160831332206726, + "learning_rate": 6.401618893426886e-08, + "loss": 0.3156, + "step": 6154 + }, + { + "epoch": 2.862346922957681, + "grad_norm": 0.3409363031387329, + "learning_rate": 6.358529533437796e-08, + "loss": 0.3194, + "step": 6155 + }, + { + "epoch": 2.862811967136878, + "grad_norm": 0.29154202342033386, + "learning_rate": 6.315584753017134e-08, + "loss": 0.3113, + "step": 6156 + }, + { + "epoch": 2.8632770113160753, + "grad_norm": 0.3012997508049011, + "learning_rate": 6.272784564742673e-08, + "loss": 0.3508, + "step": 6157 + }, + { + "epoch": 2.863742055495272, + "grad_norm": 0.3225801885128021, + "learning_rate": 6.230128981149941e-08, + "loss": 0.3195, + "step": 6158 + }, + { + "epoch": 2.864207099674469, + "grad_norm": 0.30362215638160706, + "learning_rate": 6.187618014732056e-08, + "loss": 0.3027, + "step": 6159 + }, + { + "epoch": 2.864672143853666, + "grad_norm": 0.32935047149658203, + "learning_rate": 6.145251677939778e-08, + "loss": 0.3351, + "step": 6160 + }, + { + "epoch": 2.865137188032863, + "grad_norm": 0.3463619649410248, + "learning_rate": 6.103029983181519e-08, + "loss": 0.3409, + "step": 6161 + }, + { + "epoch": 2.8656022322120602, + "grad_norm": 0.29759082198143005, + "learning_rate": 6.060952942823328e-08, + "loss": 0.3043, + "step": 6162 + }, + { + "epoch": 2.866067276391257, + "grad_norm": 0.3281930685043335, + "learning_rate": 6.01902056918896e-08, + "loss": 0.3291, + "step": 6163 + }, + { + "epoch": 2.866532320570454, + "grad_norm": 0.3233920931816101, + "learning_rate": 5.977232874559535e-08, + "loss": 0.2848, + "step": 6164 + }, + { + "epoch": 2.8669973647496514, + "grad_norm": 0.31291288137435913, + "learning_rate": 5.935589871174208e-08, + "loss": 0.341, + "step": 6165 + }, + { + "epoch": 2.8674624089288483, + "grad_norm": 0.3225501775741577, + "learning_rate": 5.8940915712293875e-08, + "loss": 0.3351, + "step": 6166 + }, + { + "epoch": 2.867927453108045, + "grad_norm": 0.33493563532829285, + "learning_rate": 5.8527379868792976e-08, + "loss": 0.3184, + "step": 6167 + }, + { + "epoch": 2.8683924972872425, + "grad_norm": 0.32679054141044617, + "learning_rate": 5.811529130235749e-08, + "loss": 0.2963, + "step": 6168 + }, + { + "epoch": 2.8688575414664395, + "grad_norm": 0.319558709859848, + "learning_rate": 5.770465013368198e-08, + "loss": 0.33, + "step": 6169 + }, + { + "epoch": 2.8693225856456364, + "grad_norm": 0.28156691789627075, + "learning_rate": 5.729545648303525e-08, + "loss": 0.2726, + "step": 6170 + }, + { + "epoch": 2.8697876298248333, + "grad_norm": 0.3368379771709442, + "learning_rate": 5.688771047026476e-08, + "loss": 0.3513, + "step": 6171 + }, + { + "epoch": 2.87025267400403, + "grad_norm": 0.32251864671707153, + "learning_rate": 5.648141221479164e-08, + "loss": 0.3177, + "step": 6172 + }, + { + "epoch": 2.8707177181832275, + "grad_norm": 0.3322902321815491, + "learning_rate": 5.6076561835615164e-08, + "loss": 0.3423, + "step": 6173 + }, + { + "epoch": 2.8711827623624244, + "grad_norm": 0.3071969449520111, + "learning_rate": 5.5673159451308246e-08, + "loss": 0.3236, + "step": 6174 + }, + { + "epoch": 2.8716478065416213, + "grad_norm": 0.29615259170532227, + "learning_rate": 5.527120518002138e-08, + "loss": 0.2841, + "step": 6175 + }, + { + "epoch": 2.8721128507208187, + "grad_norm": 0.32115840911865234, + "learning_rate": 5.487069913948096e-08, + "loss": 0.366, + "step": 6176 + }, + { + "epoch": 2.8725778949000156, + "grad_norm": 0.31044548749923706, + "learning_rate": 5.447164144698758e-08, + "loss": 0.3026, + "step": 6177 + }, + { + "epoch": 2.8730429390792125, + "grad_norm": 0.3024675250053406, + "learning_rate": 5.407403221941998e-08, + "loss": 0.3232, + "step": 6178 + }, + { + "epoch": 2.8735079832584094, + "grad_norm": 0.3414202034473419, + "learning_rate": 5.367787157323057e-08, + "loss": 0.3284, + "step": 6179 + }, + { + "epoch": 2.8739730274376063, + "grad_norm": 0.3175242841243744, + "learning_rate": 5.3283159624448745e-08, + "loss": 0.2636, + "step": 6180 + }, + { + "epoch": 2.8744380716168036, + "grad_norm": 0.490583211183548, + "learning_rate": 5.2889896488679816e-08, + "loss": 0.3264, + "step": 6181 + }, + { + "epoch": 2.8749031157960006, + "grad_norm": 0.32635655999183655, + "learning_rate": 5.249808228110276e-08, + "loss": 0.3085, + "step": 6182 + }, + { + "epoch": 2.8753681599751975, + "grad_norm": 0.32171571254730225, + "learning_rate": 5.2107717116474665e-08, + "loss": 0.3289, + "step": 6183 + }, + { + "epoch": 2.875833204154395, + "grad_norm": 0.30658993124961853, + "learning_rate": 5.171880110912686e-08, + "loss": 0.2749, + "step": 6184 + }, + { + "epoch": 2.8762982483335917, + "grad_norm": 0.3477241098880768, + "learning_rate": 5.133133437296656e-08, + "loss": 0.3022, + "step": 6185 + }, + { + "epoch": 2.8767632925127886, + "grad_norm": 0.31438449025154114, + "learning_rate": 5.094531702147632e-08, + "loss": 0.2979, + "step": 6186 + }, + { + "epoch": 2.877228336691986, + "grad_norm": 0.32434505224227905, + "learning_rate": 5.056074916771458e-08, + "loss": 0.2972, + "step": 6187 + }, + { + "epoch": 2.877693380871183, + "grad_norm": 0.33703556656837463, + "learning_rate": 5.01776309243146e-08, + "loss": 0.3343, + "step": 6188 + }, + { + "epoch": 2.8781584250503798, + "grad_norm": 0.2929964065551758, + "learning_rate": 4.97959624034855e-08, + "loss": 0.3, + "step": 6189 + }, + { + "epoch": 2.8786234692295767, + "grad_norm": 0.3615955114364624, + "learning_rate": 4.9415743717012296e-08, + "loss": 0.3695, + "step": 6190 + }, + { + "epoch": 2.8790885134087736, + "grad_norm": 0.33232712745666504, + "learning_rate": 4.903697497625537e-08, + "loss": 0.2803, + "step": 6191 + }, + { + "epoch": 2.879553557587971, + "grad_norm": 0.32238370180130005, + "learning_rate": 4.865965629214819e-08, + "loss": 0.342, + "step": 6192 + }, + { + "epoch": 2.880018601767168, + "grad_norm": 0.29459741711616516, + "learning_rate": 4.828378777520293e-08, + "loss": 0.3223, + "step": 6193 + }, + { + "epoch": 2.8804836459463647, + "grad_norm": 0.3244211971759796, + "learning_rate": 4.790936953550485e-08, + "loss": 0.3027, + "step": 6194 + }, + { + "epoch": 2.880948690125562, + "grad_norm": 0.33812856674194336, + "learning_rate": 4.753640168271456e-08, + "loss": 0.3327, + "step": 6195 + }, + { + "epoch": 2.881413734304759, + "grad_norm": 0.3344283998012543, + "learning_rate": 4.7164884326068584e-08, + "loss": 0.3385, + "step": 6196 + }, + { + "epoch": 2.881878778483956, + "grad_norm": 0.332630455493927, + "learning_rate": 4.6794817574378204e-08, + "loss": 0.3118, + "step": 6197 + }, + { + "epoch": 2.8823438226631533, + "grad_norm": 0.33553823828697205, + "learning_rate": 4.6426201536030616e-08, + "loss": 0.3352, + "step": 6198 + }, + { + "epoch": 2.88280886684235, + "grad_norm": 0.32330450415611267, + "learning_rate": 4.605903631898612e-08, + "loss": 0.3162, + "step": 6199 + }, + { + "epoch": 2.883273911021547, + "grad_norm": 0.32343700528144836, + "learning_rate": 4.569332203078258e-08, + "loss": 0.3222, + "step": 6200 + }, + { + "epoch": 2.883738955200744, + "grad_norm": 0.310979425907135, + "learning_rate": 4.5329058778531e-08, + "loss": 0.2865, + "step": 6201 + }, + { + "epoch": 2.884203999379941, + "grad_norm": 0.3251758813858032, + "learning_rate": 4.4966246668919355e-08, + "loss": 0.3159, + "step": 6202 + }, + { + "epoch": 2.8846690435591382, + "grad_norm": 0.3213922381401062, + "learning_rate": 4.460488580820821e-08, + "loss": 0.3045, + "step": 6203 + }, + { + "epoch": 2.885134087738335, + "grad_norm": 0.3345593512058258, + "learning_rate": 4.424497630223512e-08, + "loss": 0.3395, + "step": 6204 + }, + { + "epoch": 2.885599131917532, + "grad_norm": 0.3209022879600525, + "learning_rate": 4.3886518256411325e-08, + "loss": 0.3152, + "step": 6205 + }, + { + "epoch": 2.8860641760967294, + "grad_norm": 0.31403934955596924, + "learning_rate": 4.35295117757234e-08, + "loss": 0.3292, + "step": 6206 + }, + { + "epoch": 2.8865292202759263, + "grad_norm": 0.289949893951416, + "learning_rate": 4.3173956964732145e-08, + "loss": 0.304, + "step": 6207 + }, + { + "epoch": 2.886994264455123, + "grad_norm": 0.33771443367004395, + "learning_rate": 4.281985392757537e-08, + "loss": 0.3385, + "step": 6208 + }, + { + "epoch": 2.88745930863432, + "grad_norm": 0.2989499866962433, + "learning_rate": 4.2467202767962346e-08, + "loss": 0.2799, + "step": 6209 + }, + { + "epoch": 2.8879243528135174, + "grad_norm": 0.318925142288208, + "learning_rate": 4.211600358917989e-08, + "loss": 0.3248, + "step": 6210 + }, + { + "epoch": 2.8883893969927144, + "grad_norm": 0.31016743183135986, + "learning_rate": 4.17662564940885e-08, + "loss": 0.3164, + "step": 6211 + }, + { + "epoch": 2.8888544411719113, + "grad_norm": 0.3462943732738495, + "learning_rate": 4.14179615851229e-08, + "loss": 0.3617, + "step": 6212 + }, + { + "epoch": 2.889319485351108, + "grad_norm": 0.3116661310195923, + "learning_rate": 4.1071118964293166e-08, + "loss": 0.3037, + "step": 6213 + }, + { + "epoch": 2.8897845295303055, + "grad_norm": 0.29562506079673767, + "learning_rate": 4.07257287331847e-08, + "loss": 0.3133, + "step": 6214 + }, + { + "epoch": 2.8902495737095024, + "grad_norm": 0.3385922610759735, + "learning_rate": 4.038179099295547e-08, + "loss": 0.3416, + "step": 6215 + }, + { + "epoch": 2.8907146178886993, + "grad_norm": 0.28111937642097473, + "learning_rate": 4.0039305844339905e-08, + "loss": 0.3165, + "step": 6216 + }, + { + "epoch": 2.8911796620678967, + "grad_norm": 0.3083990216255188, + "learning_rate": 3.969827338764665e-08, + "loss": 0.3474, + "step": 6217 + }, + { + "epoch": 2.8916447062470936, + "grad_norm": 0.3193405270576477, + "learning_rate": 3.935869372275747e-08, + "loss": 0.3144, + "step": 6218 + }, + { + "epoch": 2.8921097504262905, + "grad_norm": 0.3196331858634949, + "learning_rate": 3.9020566949131145e-08, + "loss": 0.3228, + "step": 6219 + }, + { + "epoch": 2.8925747946054874, + "grad_norm": 0.31240054965019226, + "learning_rate": 3.868389316579846e-08, + "loss": 0.3234, + "step": 6220 + }, + { + "epoch": 2.8930398387846843, + "grad_norm": 0.2907924950122833, + "learning_rate": 3.834867247136553e-08, + "loss": 0.3052, + "step": 6221 + }, + { + "epoch": 2.8935048829638816, + "grad_norm": 0.3368573784828186, + "learning_rate": 3.801490496401439e-08, + "loss": 0.3779, + "step": 6222 + }, + { + "epoch": 2.8939699271430785, + "grad_norm": 0.3105674088001251, + "learning_rate": 3.768259074149905e-08, + "loss": 0.3045, + "step": 6223 + }, + { + "epoch": 2.8944349713222755, + "grad_norm": 0.31339845061302185, + "learning_rate": 3.735172990114888e-08, + "loss": 0.3456, + "step": 6224 + }, + { + "epoch": 2.894900015501473, + "grad_norm": 0.32460853457450867, + "learning_rate": 3.702232253986804e-08, + "loss": 0.3115, + "step": 6225 + }, + { + "epoch": 2.8953650596806697, + "grad_norm": 0.33056336641311646, + "learning_rate": 3.6694368754134346e-08, + "loss": 0.3283, + "step": 6226 + }, + { + "epoch": 2.8958301038598666, + "grad_norm": 0.31713178753852844, + "learning_rate": 3.6367868640000416e-08, + "loss": 0.3124, + "step": 6227 + }, + { + "epoch": 2.896295148039064, + "grad_norm": 0.3436635732650757, + "learning_rate": 3.6042822293093083e-08, + "loss": 0.3683, + "step": 6228 + }, + { + "epoch": 2.896760192218261, + "grad_norm": 0.30953043699264526, + "learning_rate": 3.571922980861231e-08, + "loss": 0.3073, + "step": 6229 + }, + { + "epoch": 2.8972252363974578, + "grad_norm": 0.32656288146972656, + "learning_rate": 3.539709128133395e-08, + "loss": 0.2919, + "step": 6230 + }, + { + "epoch": 2.8976902805766547, + "grad_norm": 0.31821176409721375, + "learning_rate": 3.5076406805606425e-08, + "loss": 0.3195, + "step": 6231 + }, + { + "epoch": 2.8981553247558516, + "grad_norm": 0.31194084882736206, + "learning_rate": 3.475717647535348e-08, + "loss": 0.3455, + "step": 6232 + }, + { + "epoch": 2.898620368935049, + "grad_norm": 0.31423068046569824, + "learning_rate": 3.443940038407256e-08, + "loss": 0.3265, + "step": 6233 + }, + { + "epoch": 2.899085413114246, + "grad_norm": 0.2951379418373108, + "learning_rate": 3.4123078624834214e-08, + "loss": 0.2827, + "step": 6234 + }, + { + "epoch": 2.8995504572934427, + "grad_norm": 0.3533174395561218, + "learning_rate": 3.3808211290284886e-08, + "loss": 0.3611, + "step": 6235 + }, + { + "epoch": 2.90001550147264, + "grad_norm": 0.3347010612487793, + "learning_rate": 3.349479847264414e-08, + "loss": 0.3355, + "step": 6236 + }, + { + "epoch": 2.900480545651837, + "grad_norm": 0.3282628059387207, + "learning_rate": 3.318284026370522e-08, + "loss": 0.3093, + "step": 6237 + }, + { + "epoch": 2.900945589831034, + "grad_norm": 0.31977033615112305, + "learning_rate": 3.287233675483503e-08, + "loss": 0.328, + "step": 6238 + }, + { + "epoch": 2.9014106340102312, + "grad_norm": 0.3105235993862152, + "learning_rate": 3.2563288036976394e-08, + "loss": 0.3117, + "step": 6239 + }, + { + "epoch": 2.901875678189428, + "grad_norm": 0.3219479024410248, + "learning_rate": 3.2255694200643003e-08, + "loss": 0.3167, + "step": 6240 + }, + { + "epoch": 2.902340722368625, + "grad_norm": 0.30277541279792786, + "learning_rate": 3.194955533592559e-08, + "loss": 0.2801, + "step": 6241 + }, + { + "epoch": 2.902805766547822, + "grad_norm": 0.3230368196964264, + "learning_rate": 3.16448715324863e-08, + "loss": 0.3138, + "step": 6242 + }, + { + "epoch": 2.903270810727019, + "grad_norm": 0.3114720582962036, + "learning_rate": 3.13416428795621e-08, + "loss": 0.3266, + "step": 6243 + }, + { + "epoch": 2.903735854906216, + "grad_norm": 0.324063777923584, + "learning_rate": 3.103986946596415e-08, + "loss": 0.3465, + "step": 6244 + }, + { + "epoch": 2.904200899085413, + "grad_norm": 0.318876177072525, + "learning_rate": 3.073955138007734e-08, + "loss": 0.3217, + "step": 6245 + }, + { + "epoch": 2.90466594326461, + "grad_norm": 0.3103938698768616, + "learning_rate": 3.044068870985906e-08, + "loss": 0.3207, + "step": 6246 + }, + { + "epoch": 2.9051309874438074, + "grad_norm": 0.31359586119651794, + "learning_rate": 3.014328154284152e-08, + "loss": 0.3173, + "step": 6247 + }, + { + "epoch": 2.9055960316230043, + "grad_norm": 0.311901330947876, + "learning_rate": 2.98473299661306e-08, + "loss": 0.3194, + "step": 6248 + }, + { + "epoch": 2.906061075802201, + "grad_norm": 0.31329211592674255, + "learning_rate": 2.955283406640641e-08, + "loss": 0.2925, + "step": 6249 + }, + { + "epoch": 2.906526119981398, + "grad_norm": 0.31857290863990784, + "learning_rate": 2.9259793929921066e-08, + "loss": 0.3267, + "step": 6250 + }, + { + "epoch": 2.906991164160595, + "grad_norm": 0.3248409628868103, + "learning_rate": 2.8968209642501465e-08, + "loss": 0.323, + "step": 6251 + }, + { + "epoch": 2.9074562083397923, + "grad_norm": 0.3088364005088806, + "learning_rate": 2.8678081289548187e-08, + "loss": 0.284, + "step": 6252 + }, + { + "epoch": 2.9079212525189893, + "grad_norm": 0.3107573091983795, + "learning_rate": 2.8389408956034925e-08, + "loss": 0.3242, + "step": 6253 + }, + { + "epoch": 2.908386296698186, + "grad_norm": 0.3310908079147339, + "learning_rate": 2.810219272650849e-08, + "loss": 0.3107, + "step": 6254 + }, + { + "epoch": 2.9088513408773835, + "grad_norm": 0.30595463514328003, + "learning_rate": 2.7816432685091598e-08, + "loss": 0.3522, + "step": 6255 + }, + { + "epoch": 2.9093163850565804, + "grad_norm": 0.3248220980167389, + "learning_rate": 2.7532128915476742e-08, + "loss": 0.3116, + "step": 6256 + }, + { + "epoch": 2.9097814292357773, + "grad_norm": 0.3430537283420563, + "learning_rate": 2.7249281500932868e-08, + "loss": 0.3414, + "step": 6257 + }, + { + "epoch": 2.9102464734149747, + "grad_norm": 0.32663294672966003, + "learning_rate": 2.6967890524301488e-08, + "loss": 0.2961, + "step": 6258 + }, + { + "epoch": 2.9107115175941716, + "grad_norm": 0.32773134112358093, + "learning_rate": 2.6687956067997234e-08, + "loss": 0.2928, + "step": 6259 + }, + { + "epoch": 2.9111765617733685, + "grad_norm": 0.3033883571624756, + "learning_rate": 2.640947821400841e-08, + "loss": 0.3062, + "step": 6260 + }, + { + "epoch": 2.9116416059525654, + "grad_norm": 0.33305469155311584, + "learning_rate": 2.6132457043896442e-08, + "loss": 0.2965, + "step": 6261 + }, + { + "epoch": 2.9121066501317623, + "grad_norm": 0.33847376704216003, + "learning_rate": 2.585689263879643e-08, + "loss": 0.3387, + "step": 6262 + }, + { + "epoch": 2.9125716943109596, + "grad_norm": 0.3103272616863251, + "learning_rate": 2.558278507941714e-08, + "loss": 0.2881, + "step": 6263 + }, + { + "epoch": 2.9130367384901565, + "grad_norm": 0.33586204051971436, + "learning_rate": 2.5310134446039357e-08, + "loss": 0.3428, + "step": 6264 + }, + { + "epoch": 2.9135017826693534, + "grad_norm": 0.34026390314102173, + "learning_rate": 2.503894081851921e-08, + "loss": 0.3164, + "step": 6265 + }, + { + "epoch": 2.913966826848551, + "grad_norm": 0.3134699761867523, + "learning_rate": 2.476920427628371e-08, + "loss": 0.3022, + "step": 6266 + }, + { + "epoch": 2.9144318710277477, + "grad_norm": 0.31349417567253113, + "learning_rate": 2.4500924898335223e-08, + "loss": 0.3335, + "step": 6267 + }, + { + "epoch": 2.9148969152069446, + "grad_norm": 0.30703291296958923, + "learning_rate": 2.4234102763247558e-08, + "loss": 0.3165, + "step": 6268 + }, + { + "epoch": 2.915361959386142, + "grad_norm": 0.30995601415634155, + "learning_rate": 2.3968737949169318e-08, + "loss": 0.3276, + "step": 6269 + }, + { + "epoch": 2.915827003565339, + "grad_norm": 0.3151072561740875, + "learning_rate": 2.370483053382111e-08, + "loss": 0.3182, + "step": 6270 + }, + { + "epoch": 2.9162920477445358, + "grad_norm": 0.3360000252723694, + "learning_rate": 2.3442380594497215e-08, + "loss": 0.3073, + "step": 6271 + }, + { + "epoch": 2.9167570919237327, + "grad_norm": 0.3323414921760559, + "learning_rate": 2.3181388208065036e-08, + "loss": 0.3643, + "step": 6272 + }, + { + "epoch": 2.9172221361029296, + "grad_norm": 0.3142452538013458, + "learning_rate": 2.2921853450965094e-08, + "loss": 0.2902, + "step": 6273 + }, + { + "epoch": 2.917687180282127, + "grad_norm": 0.332619309425354, + "learning_rate": 2.2663776399211024e-08, + "loss": 0.3332, + "step": 6274 + }, + { + "epoch": 2.918152224461324, + "grad_norm": 0.30116286873817444, + "learning_rate": 2.2407157128389033e-08, + "loss": 0.2971, + "step": 6275 + }, + { + "epoch": 2.9186172686405207, + "grad_norm": 0.3235343396663666, + "learning_rate": 2.2151995713659e-08, + "loss": 0.3239, + "step": 6276 + }, + { + "epoch": 2.919082312819718, + "grad_norm": 0.3417525887489319, + "learning_rate": 2.189829222975337e-08, + "loss": 0.3347, + "step": 6277 + }, + { + "epoch": 2.919547356998915, + "grad_norm": 0.34328693151474, + "learning_rate": 2.1646046750978255e-08, + "loss": 0.3241, + "step": 6278 + }, + { + "epoch": 2.920012401178112, + "grad_norm": 0.3048785924911499, + "learning_rate": 2.1395259351211227e-08, + "loss": 0.2951, + "step": 6279 + }, + { + "epoch": 2.920477445357309, + "grad_norm": 0.31795376539230347, + "learning_rate": 2.1145930103904645e-08, + "loss": 0.3376, + "step": 6280 + }, + { + "epoch": 2.9209424895365057, + "grad_norm": 0.34835532307624817, + "learning_rate": 2.0898059082082868e-08, + "loss": 0.3394, + "step": 6281 + }, + { + "epoch": 2.921407533715703, + "grad_norm": 0.30004480481147766, + "learning_rate": 2.065164635834338e-08, + "loss": 0.2802, + "step": 6282 + }, + { + "epoch": 2.9218725778949, + "grad_norm": 0.31442791223526, + "learning_rate": 2.040669200485623e-08, + "loss": 0.3024, + "step": 6283 + }, + { + "epoch": 2.922337622074097, + "grad_norm": 0.33434808254241943, + "learning_rate": 2.016319609336459e-08, + "loss": 0.3347, + "step": 6284 + }, + { + "epoch": 2.922802666253294, + "grad_norm": 0.3095998167991638, + "learning_rate": 1.992115869518474e-08, + "loss": 0.3192, + "step": 6285 + }, + { + "epoch": 2.923267710432491, + "grad_norm": 0.2919743061065674, + "learning_rate": 1.968057988120553e-08, + "loss": 0.3011, + "step": 6286 + }, + { + "epoch": 2.923732754611688, + "grad_norm": 0.3112308084964752, + "learning_rate": 1.9441459721887822e-08, + "loss": 0.3284, + "step": 6287 + }, + { + "epoch": 2.9241977987908854, + "grad_norm": 0.32711583375930786, + "learning_rate": 1.920379828726726e-08, + "loss": 0.3329, + "step": 6288 + }, + { + "epoch": 2.9246628429700823, + "grad_norm": 0.30145832896232605, + "learning_rate": 1.8967595646949834e-08, + "loss": 0.292, + "step": 6289 + }, + { + "epoch": 2.925127887149279, + "grad_norm": 0.3420089781284332, + "learning_rate": 1.8732851870115755e-08, + "loss": 0.32, + "step": 6290 + }, + { + "epoch": 2.925592931328476, + "grad_norm": 0.34311848878860474, + "learning_rate": 1.849956702551836e-08, + "loss": 0.3236, + "step": 6291 + }, + { + "epoch": 2.926057975507673, + "grad_norm": 0.31155329942703247, + "learning_rate": 1.826774118148189e-08, + "loss": 0.3191, + "step": 6292 + }, + { + "epoch": 2.9265230196868703, + "grad_norm": 0.3236565589904785, + "learning_rate": 1.803737440590536e-08, + "loss": 0.3369, + "step": 6293 + }, + { + "epoch": 2.9269880638660672, + "grad_norm": 0.28220975399017334, + "learning_rate": 1.7808466766259246e-08, + "loss": 0.2988, + "step": 6294 + }, + { + "epoch": 2.927453108045264, + "grad_norm": 0.3211870789527893, + "learning_rate": 1.758101832958603e-08, + "loss": 0.3305, + "step": 6295 + }, + { + "epoch": 2.9279181522244615, + "grad_norm": 0.3059036135673523, + "learning_rate": 1.7355029162502978e-08, + "loss": 0.2947, + "step": 6296 + }, + { + "epoch": 2.9283831964036584, + "grad_norm": 0.31441813707351685, + "learning_rate": 1.7130499331197703e-08, + "loss": 0.2993, + "step": 6297 + }, + { + "epoch": 2.9288482405828553, + "grad_norm": 0.32152053713798523, + "learning_rate": 1.6907428901432045e-08, + "loss": 0.3135, + "step": 6298 + }, + { + "epoch": 2.9293132847620527, + "grad_norm": 0.328326940536499, + "learning_rate": 1.668581793853874e-08, + "loss": 0.3092, + "step": 6299 + }, + { + "epoch": 2.9297783289412496, + "grad_norm": 0.3213350772857666, + "learning_rate": 1.6465666507425314e-08, + "loss": 0.3505, + "step": 6300 + }, + { + "epoch": 2.9302433731204465, + "grad_norm": 0.3014281988143921, + "learning_rate": 1.6246974672569083e-08, + "loss": 0.2903, + "step": 6301 + }, + { + "epoch": 2.9307084172996434, + "grad_norm": 0.31483855843544006, + "learning_rate": 1.6029742498022692e-08, + "loss": 0.3267, + "step": 6302 + }, + { + "epoch": 2.9311734614788403, + "grad_norm": 0.32072848081588745, + "learning_rate": 1.5813970047409144e-08, + "loss": 0.2982, + "step": 6303 + }, + { + "epoch": 2.9316385056580376, + "grad_norm": 0.3045773208141327, + "learning_rate": 1.559965738392455e-08, + "loss": 0.3062, + "step": 6304 + }, + { + "epoch": 2.9321035498372345, + "grad_norm": 0.3604282736778259, + "learning_rate": 1.538680457033814e-08, + "loss": 0.3295, + "step": 6305 + }, + { + "epoch": 2.9325685940164314, + "grad_norm": 0.3138650357723236, + "learning_rate": 1.5175411668990613e-08, + "loss": 0.2937, + "step": 6306 + }, + { + "epoch": 2.933033638195629, + "grad_norm": 0.330479234457016, + "learning_rate": 1.496547874179577e-08, + "loss": 0.3091, + "step": 6307 + }, + { + "epoch": 2.9334986823748257, + "grad_norm": 0.3400230407714844, + "learning_rate": 1.475700585023998e-08, + "loss": 0.3206, + "step": 6308 + }, + { + "epoch": 2.9339637265540226, + "grad_norm": 0.33280929923057556, + "learning_rate": 1.4549993055380519e-08, + "loss": 0.3632, + "step": 6309 + }, + { + "epoch": 2.9344287707332195, + "grad_norm": 0.29419514536857605, + "learning_rate": 1.4344440417848882e-08, + "loss": 0.2756, + "step": 6310 + }, + { + "epoch": 2.9348938149124164, + "grad_norm": 0.3062974214553833, + "learning_rate": 1.4140347997848025e-08, + "loss": 0.3265, + "step": 6311 + }, + { + "epoch": 2.9353588590916138, + "grad_norm": 0.3415301740169525, + "learning_rate": 1.3937715855152912e-08, + "loss": 0.3005, + "step": 6312 + }, + { + "epoch": 2.9358239032708107, + "grad_norm": 0.3116869032382965, + "learning_rate": 1.3736544049111622e-08, + "loss": 0.2983, + "step": 6313 + }, + { + "epoch": 2.9362889474500076, + "grad_norm": 0.32688966393470764, + "learning_rate": 1.3536832638643693e-08, + "loss": 0.3142, + "step": 6314 + }, + { + "epoch": 2.936753991629205, + "grad_norm": 0.3033595681190491, + "learning_rate": 1.333858168224178e-08, + "loss": 0.3131, + "step": 6315 + }, + { + "epoch": 2.937219035808402, + "grad_norm": 0.3291476368904114, + "learning_rate": 1.3141791237969991e-08, + "loss": 0.3139, + "step": 6316 + }, + { + "epoch": 2.9376840799875987, + "grad_norm": 0.3021133542060852, + "learning_rate": 1.2946461363465557e-08, + "loss": 0.2932, + "step": 6317 + }, + { + "epoch": 2.938149124166796, + "grad_norm": 0.32439282536506653, + "learning_rate": 1.2752592115936601e-08, + "loss": 0.3559, + "step": 6318 + }, + { + "epoch": 2.938614168345993, + "grad_norm": 0.3350219428539276, + "learning_rate": 1.2560183552164928e-08, + "loss": 0.3485, + "step": 6319 + }, + { + "epoch": 2.93907921252519, + "grad_norm": 0.3032825291156769, + "learning_rate": 1.2369235728503792e-08, + "loss": 0.315, + "step": 6320 + }, + { + "epoch": 2.939544256704387, + "grad_norm": 0.311538428068161, + "learning_rate": 1.2179748700879013e-08, + "loss": 0.325, + "step": 6321 + }, + { + "epoch": 2.9400093008835837, + "grad_norm": 0.3261773884296417, + "learning_rate": 1.1991722524787307e-08, + "loss": 0.3301, + "step": 6322 + }, + { + "epoch": 2.940474345062781, + "grad_norm": 0.33156725764274597, + "learning_rate": 1.1805157255299626e-08, + "loss": 0.3229, + "step": 6323 + }, + { + "epoch": 2.940939389241978, + "grad_norm": 0.3547118008136749, + "learning_rate": 1.1620052947056703e-08, + "loss": 0.2986, + "step": 6324 + }, + { + "epoch": 2.941404433421175, + "grad_norm": 0.32039710879325867, + "learning_rate": 1.1436409654273506e-08, + "loss": 0.3285, + "step": 6325 + }, + { + "epoch": 2.941869477600372, + "grad_norm": 0.31532758474349976, + "learning_rate": 1.1254227430735898e-08, + "loss": 0.3099, + "step": 6326 + }, + { + "epoch": 2.942334521779569, + "grad_norm": 0.33241426944732666, + "learning_rate": 1.1073506329802309e-08, + "loss": 0.3177, + "step": 6327 + }, + { + "epoch": 2.942799565958766, + "grad_norm": 0.3229113221168518, + "learning_rate": 1.089424640440262e-08, + "loss": 0.3215, + "step": 6328 + }, + { + "epoch": 2.9432646101379634, + "grad_norm": 0.3332347273826599, + "learning_rate": 1.0716447707039279e-08, + "loss": 0.351, + "step": 6329 + }, + { + "epoch": 2.9437296543171603, + "grad_norm": 0.3146977126598358, + "learning_rate": 1.0540110289786742e-08, + "loss": 0.3142, + "step": 6330 + }, + { + "epoch": 2.944194698496357, + "grad_norm": 0.3125641942024231, + "learning_rate": 1.0365234204291475e-08, + "loss": 0.2814, + "step": 6331 + }, + { + "epoch": 2.944659742675554, + "grad_norm": 0.32364359498023987, + "learning_rate": 1.019181950177195e-08, + "loss": 0.3173, + "step": 6332 + }, + { + "epoch": 2.945124786854751, + "grad_norm": 0.3259218633174896, + "learning_rate": 1.001986623301754e-08, + "loss": 0.3478, + "step": 6333 + }, + { + "epoch": 2.9455898310339483, + "grad_norm": 0.3398853540420532, + "learning_rate": 9.849374448391846e-09, + "loss": 0.3149, + "step": 6334 + }, + { + "epoch": 2.9460548752131452, + "grad_norm": 0.31736600399017334, + "learning_rate": 9.680344197828262e-09, + "loss": 0.2984, + "step": 6335 + }, + { + "epoch": 2.946519919392342, + "grad_norm": 0.3133545219898224, + "learning_rate": 9.512775530833296e-09, + "loss": 0.3267, + "step": 6336 + }, + { + "epoch": 2.9469849635715395, + "grad_norm": 0.32863521575927734, + "learning_rate": 9.346668496485468e-09, + "loss": 0.3461, + "step": 6337 + }, + { + "epoch": 2.9474500077507364, + "grad_norm": 0.3177638053894043, + "learning_rate": 9.182023143434193e-09, + "loss": 0.2977, + "step": 6338 + }, + { + "epoch": 2.9479150519299333, + "grad_norm": 0.31308892369270325, + "learning_rate": 9.018839519902012e-09, + "loss": 0.3232, + "step": 6339 + }, + { + "epoch": 2.94838009610913, + "grad_norm": 0.3451671600341797, + "learning_rate": 8.857117673681804e-09, + "loss": 0.3357, + "step": 6340 + }, + { + "epoch": 2.948845140288327, + "grad_norm": 0.32019293308258057, + "learning_rate": 8.696857652140677e-09, + "loss": 0.3324, + "step": 6341 + }, + { + "epoch": 2.9493101844675245, + "grad_norm": 0.33214157819747925, + "learning_rate": 8.538059502214979e-09, + "loss": 0.2954, + "step": 6342 + }, + { + "epoch": 2.9497752286467214, + "grad_norm": 0.33576342463493347, + "learning_rate": 8.380723270414726e-09, + "loss": 0.3511, + "step": 6343 + }, + { + "epoch": 2.9502402728259183, + "grad_norm": 0.32539987564086914, + "learning_rate": 8.224849002820834e-09, + "loss": 0.3342, + "step": 6344 + }, + { + "epoch": 2.9507053170051156, + "grad_norm": 0.31811603903770447, + "learning_rate": 8.07043674508623e-09, + "loss": 0.3285, + "step": 6345 + }, + { + "epoch": 2.9511703611843125, + "grad_norm": 0.30037549138069153, + "learning_rate": 7.917486542436404e-09, + "loss": 0.3136, + "step": 6346 + }, + { + "epoch": 2.9516354053635094, + "grad_norm": 0.3149525225162506, + "learning_rate": 7.765998439667743e-09, + "loss": 0.3187, + "step": 6347 + }, + { + "epoch": 2.952100449542707, + "grad_norm": 0.3301972448825836, + "learning_rate": 7.615972481148094e-09, + "loss": 0.329, + "step": 6348 + }, + { + "epoch": 2.9525654937219037, + "grad_norm": 0.3264675736427307, + "learning_rate": 7.467408710818414e-09, + "loss": 0.335, + "step": 6349 + }, + { + "epoch": 2.9530305379011006, + "grad_norm": 0.32105743885040283, + "learning_rate": 7.320307172190011e-09, + "loss": 0.3006, + "step": 6350 + }, + { + "epoch": 2.9534955820802975, + "grad_norm": 0.3135838806629181, + "learning_rate": 7.174667908346755e-09, + "loss": 0.3276, + "step": 6351 + }, + { + "epoch": 2.9539606262594944, + "grad_norm": 0.36441266536712646, + "learning_rate": 7.0304909619439695e-09, + "loss": 0.347, + "step": 6352 + }, + { + "epoch": 2.9544256704386918, + "grad_norm": 0.32082051038742065, + "learning_rate": 6.887776375208433e-09, + "loss": 0.3151, + "step": 6353 + }, + { + "epoch": 2.9548907146178887, + "grad_norm": 0.3240443170070648, + "learning_rate": 6.74652418994004e-09, + "loss": 0.3199, + "step": 6354 + }, + { + "epoch": 2.9553557587970856, + "grad_norm": 0.31917044520378113, + "learning_rate": 6.606734447507923e-09, + "loss": 0.306, + "step": 6355 + }, + { + "epoch": 2.955820802976283, + "grad_norm": 0.3240707218647003, + "learning_rate": 6.4684071888554415e-09, + "loss": 0.3439, + "step": 6356 + }, + { + "epoch": 2.95628584715548, + "grad_norm": 0.3343864679336548, + "learning_rate": 6.331542454495188e-09, + "loss": 0.299, + "step": 6357 + }, + { + "epoch": 2.9567508913346767, + "grad_norm": 0.3145192563533783, + "learning_rate": 6.19614028451343e-09, + "loss": 0.2969, + "step": 6358 + }, + { + "epoch": 2.957215935513874, + "grad_norm": 0.3098326027393341, + "learning_rate": 6.062200718567335e-09, + "loss": 0.3037, + "step": 6359 + }, + { + "epoch": 2.957680979693071, + "grad_norm": 0.3226139545440674, + "learning_rate": 5.929723795884967e-09, + "loss": 0.303, + "step": 6360 + }, + { + "epoch": 2.958146023872268, + "grad_norm": 0.31032806634902954, + "learning_rate": 5.798709555266957e-09, + "loss": 0.3072, + "step": 6361 + }, + { + "epoch": 2.958611068051465, + "grad_norm": 0.3300245702266693, + "learning_rate": 5.669158035085387e-09, + "loss": 0.3254, + "step": 6362 + }, + { + "epoch": 2.9590761122306617, + "grad_norm": 0.30949413776397705, + "learning_rate": 5.5410692732837946e-09, + "loss": 0.3046, + "step": 6363 + }, + { + "epoch": 2.959541156409859, + "grad_norm": 0.32829898595809937, + "learning_rate": 5.414443307377171e-09, + "loss": 0.3468, + "step": 6364 + }, + { + "epoch": 2.960006200589056, + "grad_norm": 0.3059823215007782, + "learning_rate": 5.2892801744525154e-09, + "loss": 0.3097, + "step": 6365 + }, + { + "epoch": 2.960471244768253, + "grad_norm": 0.3185559809207916, + "learning_rate": 5.165579911167729e-09, + "loss": 0.304, + "step": 6366 + }, + { + "epoch": 2.96093628894745, + "grad_norm": 0.3140532076358795, + "learning_rate": 5.043342553752717e-09, + "loss": 0.2994, + "step": 6367 + }, + { + "epoch": 2.961401333126647, + "grad_norm": 0.3156139552593231, + "learning_rate": 4.922568138008843e-09, + "loss": 0.3252, + "step": 6368 + }, + { + "epoch": 2.961866377305844, + "grad_norm": 0.3156965374946594, + "learning_rate": 4.803256699308923e-09, + "loss": 0.3367, + "step": 6369 + }, + { + "epoch": 2.962331421485041, + "grad_norm": 0.3089465796947479, + "learning_rate": 4.685408272597225e-09, + "loss": 0.3097, + "step": 6370 + }, + { + "epoch": 2.9627964656642383, + "grad_norm": 0.3329167664051056, + "learning_rate": 4.5690228923894744e-09, + "loss": 0.3361, + "step": 6371 + }, + { + "epoch": 2.963261509843435, + "grad_norm": 0.3192375600337982, + "learning_rate": 4.454100592773958e-09, + "loss": 0.3245, + "step": 6372 + }, + { + "epoch": 2.963726554022632, + "grad_norm": 0.31667426228523254, + "learning_rate": 4.340641407408752e-09, + "loss": 0.3076, + "step": 6373 + }, + { + "epoch": 2.964191598201829, + "grad_norm": 0.3276313841342926, + "learning_rate": 4.228645369523943e-09, + "loss": 0.3068, + "step": 6374 + }, + { + "epoch": 2.9646566423810263, + "grad_norm": 0.3252321481704712, + "learning_rate": 4.1181125119221785e-09, + "loss": 0.3306, + "step": 6375 + }, + { + "epoch": 2.9651216865602232, + "grad_norm": 0.3393605351448059, + "learning_rate": 4.009042866976454e-09, + "loss": 0.3405, + "step": 6376 + }, + { + "epoch": 2.96558673073942, + "grad_norm": 0.3192119002342224, + "learning_rate": 3.901436466631215e-09, + "loss": 0.3125, + "step": 6377 + }, + { + "epoch": 2.9660517749186175, + "grad_norm": 0.32561612129211426, + "learning_rate": 3.795293342402917e-09, + "loss": 0.331, + "step": 6378 + }, + { + "epoch": 2.9665168190978144, + "grad_norm": 0.29712536931037903, + "learning_rate": 3.690613525379472e-09, + "loss": 0.3094, + "step": 6379 + }, + { + "epoch": 2.9669818632770113, + "grad_norm": 0.3177908658981323, + "learning_rate": 3.587397046219132e-09, + "loss": 0.32, + "step": 6380 + }, + { + "epoch": 2.967446907456208, + "grad_norm": 0.334224134683609, + "learning_rate": 3.485643935152716e-09, + "loss": 0.3327, + "step": 6381 + }, + { + "epoch": 2.967911951635405, + "grad_norm": 0.3135557174682617, + "learning_rate": 3.3853542219819403e-09, + "loss": 0.3077, + "step": 6382 + }, + { + "epoch": 2.9683769958146025, + "grad_norm": 0.3370562195777893, + "learning_rate": 3.286527936079975e-09, + "loss": 0.3069, + "step": 6383 + }, + { + "epoch": 2.9688420399937994, + "grad_norm": 0.2906818091869354, + "learning_rate": 3.1891651063920003e-09, + "loss": 0.2884, + "step": 6384 + }, + { + "epoch": 2.9693070841729963, + "grad_norm": 0.3011315166950226, + "learning_rate": 3.0932657614329843e-09, + "loss": 0.3154, + "step": 6385 + }, + { + "epoch": 2.9697721283521936, + "grad_norm": 0.3322399854660034, + "learning_rate": 2.998829929291569e-09, + "loss": 0.3232, + "step": 6386 + }, + { + "epoch": 2.9702371725313905, + "grad_norm": 0.31616201996803284, + "learning_rate": 2.905857637625076e-09, + "loss": 0.338, + "step": 6387 + }, + { + "epoch": 2.9707022167105874, + "grad_norm": 0.2996443510055542, + "learning_rate": 2.8143489136650547e-09, + "loss": 0.2643, + "step": 6388 + }, + { + "epoch": 2.971167260889785, + "grad_norm": 0.330763578414917, + "learning_rate": 2.724303784211735e-09, + "loss": 0.3372, + "step": 6389 + }, + { + "epoch": 2.9716323050689817, + "grad_norm": 0.3177904486656189, + "learning_rate": 2.635722275638464e-09, + "loss": 0.3079, + "step": 6390 + }, + { + "epoch": 2.9720973492481786, + "grad_norm": 0.32477807998657227, + "learning_rate": 2.548604413888933e-09, + "loss": 0.3184, + "step": 6391 + }, + { + "epoch": 2.9725623934273755, + "grad_norm": 0.29389679431915283, + "learning_rate": 2.462950224478844e-09, + "loss": 0.2963, + "step": 6392 + }, + { + "epoch": 2.9730274376065724, + "grad_norm": 0.3086054027080536, + "learning_rate": 2.3787597324947953e-09, + "loss": 0.3402, + "step": 6393 + }, + { + "epoch": 2.9734924817857697, + "grad_norm": 0.2977517247200012, + "learning_rate": 2.2960329625953957e-09, + "loss": 0.3352, + "step": 6394 + }, + { + "epoch": 2.9739575259649667, + "grad_norm": 0.31417447328567505, + "learning_rate": 2.2147699390090425e-09, + "loss": 0.3128, + "step": 6395 + }, + { + "epoch": 2.9744225701441636, + "grad_norm": 0.3330288827419281, + "learning_rate": 2.134970685536697e-09, + "loss": 0.3592, + "step": 6396 + }, + { + "epoch": 2.974887614323361, + "grad_norm": 0.30559176206588745, + "learning_rate": 2.056635225550219e-09, + "loss": 0.2894, + "step": 6397 + }, + { + "epoch": 2.975352658502558, + "grad_norm": 0.31366166472435, + "learning_rate": 1.9797635819934768e-09, + "loss": 0.3098, + "step": 6398 + }, + { + "epoch": 2.9758177026817547, + "grad_norm": 0.30758044123649597, + "learning_rate": 1.904355777379574e-09, + "loss": 0.2947, + "step": 6399 + }, + { + "epoch": 2.976282746860952, + "grad_norm": 0.2907547652721405, + "learning_rate": 1.830411833795287e-09, + "loss": 0.298, + "step": 6400 + }, + { + "epoch": 2.976747791040149, + "grad_norm": 0.3252081274986267, + "learning_rate": 1.7579317728977363e-09, + "loss": 0.3325, + "step": 6401 + }, + { + "epoch": 2.977212835219346, + "grad_norm": 0.3026246428489685, + "learning_rate": 1.6869156159143861e-09, + "loss": 0.3322, + "step": 6402 + }, + { + "epoch": 2.977677879398543, + "grad_norm": 0.3425191640853882, + "learning_rate": 1.617363383645265e-09, + "loss": 0.3364, + "step": 6403 + }, + { + "epoch": 2.9781429235777397, + "grad_norm": 0.32005518674850464, + "learning_rate": 1.549275096460745e-09, + "loss": 0.332, + "step": 6404 + }, + { + "epoch": 2.978607967756937, + "grad_norm": 0.2961218059062958, + "learning_rate": 1.4826507743032071e-09, + "loss": 0.2848, + "step": 6405 + }, + { + "epoch": 2.979073011936134, + "grad_norm": 0.3002302944660187, + "learning_rate": 1.417490436685376e-09, + "loss": 0.3021, + "step": 6406 + }, + { + "epoch": 2.979538056115331, + "grad_norm": 0.3076646625995636, + "learning_rate": 1.3537941026914302e-09, + "loss": 0.3196, + "step": 6407 + }, + { + "epoch": 2.980003100294528, + "grad_norm": 0.29915478825569153, + "learning_rate": 1.291561790978113e-09, + "loss": 0.283, + "step": 6408 + }, + { + "epoch": 2.980468144473725, + "grad_norm": 0.313909113407135, + "learning_rate": 1.2307935197708453e-09, + "loss": 0.3223, + "step": 6409 + }, + { + "epoch": 2.980933188652922, + "grad_norm": 0.33077502250671387, + "learning_rate": 1.1714893068687228e-09, + "loss": 0.3171, + "step": 6410 + }, + { + "epoch": 2.981398232832119, + "grad_norm": 0.3245851397514343, + "learning_rate": 1.1136491696406293e-09, + "loss": 0.3136, + "step": 6411 + }, + { + "epoch": 2.981863277011316, + "grad_norm": 0.3183169364929199, + "learning_rate": 1.057273125026903e-09, + "loss": 0.3079, + "step": 6412 + }, + { + "epoch": 2.982328321190513, + "grad_norm": 0.3176499605178833, + "learning_rate": 1.0023611895393358e-09, + "loss": 0.3185, + "step": 6413 + }, + { + "epoch": 2.98279336536971, + "grad_norm": 0.29598644375801086, + "learning_rate": 9.489133792611738e-10, + "loss": 0.2992, + "step": 6414 + }, + { + "epoch": 2.983258409548907, + "grad_norm": 0.32176917791366577, + "learning_rate": 8.96929709845451e-10, + "loss": 0.3118, + "step": 6415 + }, + { + "epoch": 2.9837234537281043, + "grad_norm": 0.32616156339645386, + "learning_rate": 8.464101965177662e-10, + "loss": 0.342, + "step": 6416 + }, + { + "epoch": 2.9841884979073012, + "grad_norm": 0.3436664640903473, + "learning_rate": 7.973548540740616e-10, + "loss": 0.323, + "step": 6417 + }, + { + "epoch": 2.984653542086498, + "grad_norm": 0.3062160313129425, + "learning_rate": 7.497636968828436e-10, + "loss": 0.2861, + "step": 6418 + }, + { + "epoch": 2.9851185862656955, + "grad_norm": 0.3244827389717102, + "learning_rate": 7.036367388824073e-10, + "loss": 0.3261, + "step": 6419 + }, + { + "epoch": 2.9855836304448924, + "grad_norm": 0.3119675815105438, + "learning_rate": 6.589739935819461e-10, + "loss": 0.3165, + "step": 6420 + }, + { + "epoch": 2.9860486746240893, + "grad_norm": 0.3089711666107178, + "learning_rate": 6.157754740632183e-10, + "loss": 0.3039, + "step": 6421 + }, + { + "epoch": 2.986513718803286, + "grad_norm": 0.3472222089767456, + "learning_rate": 5.7404119297777e-10, + "loss": 0.2805, + "step": 6422 + }, + { + "epoch": 2.986978762982483, + "grad_norm": 0.343313604593277, + "learning_rate": 5.337711625497122e-10, + "loss": 0.3385, + "step": 6423 + }, + { + "epoch": 2.9874438071616805, + "grad_norm": 0.34118393063545227, + "learning_rate": 4.949653945723886e-10, + "loss": 0.3172, + "step": 6424 + }, + { + "epoch": 2.9879088513408774, + "grad_norm": 0.331533282995224, + "learning_rate": 4.576239004122629e-10, + "loss": 0.3628, + "step": 6425 + }, + { + "epoch": 2.9883738955200743, + "grad_norm": 0.30840861797332764, + "learning_rate": 4.2174669100558673e-10, + "loss": 0.2938, + "step": 6426 + }, + { + "epoch": 2.9888389396992716, + "grad_norm": 0.3454267382621765, + "learning_rate": 3.8733377686062115e-10, + "loss": 0.3555, + "step": 6427 + }, + { + "epoch": 2.9893039838784685, + "grad_norm": 0.3079105019569397, + "learning_rate": 3.5438516805597067e-10, + "loss": 0.3275, + "step": 6428 + }, + { + "epoch": 2.9897690280576654, + "grad_norm": 0.32878822088241577, + "learning_rate": 3.229008742416939e-10, + "loss": 0.3502, + "step": 6429 + }, + { + "epoch": 2.9902340722368628, + "grad_norm": 0.3611544072628021, + "learning_rate": 2.928809046398584e-10, + "loss": 0.3244, + "step": 6430 + }, + { + "epoch": 2.9906991164160597, + "grad_norm": 0.3192159831523895, + "learning_rate": 2.6432526804176517e-10, + "loss": 0.3126, + "step": 6431 + }, + { + "epoch": 2.9911641605952566, + "grad_norm": 0.353327214717865, + "learning_rate": 2.3723397281127937e-10, + "loss": 0.3284, + "step": 6432 + }, + { + "epoch": 2.9916292047744535, + "grad_norm": 0.31189650297164917, + "learning_rate": 2.1160702688260981e-10, + "loss": 0.3192, + "step": 6433 + }, + { + "epoch": 2.9920942489536504, + "grad_norm": 0.31306004524230957, + "learning_rate": 1.8744443776252952e-10, + "loss": 0.3168, + "step": 6434 + }, + { + "epoch": 2.9925592931328477, + "grad_norm": 0.3070635497570038, + "learning_rate": 1.6474621252704494e-10, + "loss": 0.3385, + "step": 6435 + }, + { + "epoch": 2.9930243373120446, + "grad_norm": 0.32101500034332275, + "learning_rate": 1.435123578241715e-10, + "loss": 0.3346, + "step": 6436 + }, + { + "epoch": 2.9934893814912416, + "grad_norm": 0.3093291223049164, + "learning_rate": 1.2374287987337864e-10, + "loss": 0.3378, + "step": 6437 + }, + { + "epoch": 2.993954425670439, + "grad_norm": 0.3259122371673584, + "learning_rate": 1.0543778446392427e-10, + "loss": 0.3228, + "step": 6438 + }, + { + "epoch": 2.994419469849636, + "grad_norm": 0.31066158413887024, + "learning_rate": 8.859707695818564e-11, + "loss": 0.29, + "step": 6439 + }, + { + "epoch": 2.9948845140288327, + "grad_norm": 0.31667444109916687, + "learning_rate": 7.322076228777342e-11, + "loss": 0.3338, + "step": 6440 + }, + { + "epoch": 2.9953495582080296, + "grad_norm": 0.3047296404838562, + "learning_rate": 5.93088449568624e-11, + "loss": 0.2968, + "step": 6441 + }, + { + "epoch": 2.9958146023872265, + "grad_norm": 0.3278631567955017, + "learning_rate": 4.686132903886087e-11, + "loss": 0.3333, + "step": 6442 + }, + { + "epoch": 2.996279646566424, + "grad_norm": 0.3081776201725006, + "learning_rate": 3.587821818085146e-11, + "loss": 0.2953, + "step": 6443 + }, + { + "epoch": 2.9967446907456208, + "grad_norm": 0.3233565092086792, + "learning_rate": 2.6359515598595174e-11, + "loss": 0.3309, + "step": 6444 + }, + { + "epoch": 2.9972097349248177, + "grad_norm": 0.29509052634239197, + "learning_rate": 1.8305224079862015e-11, + "loss": 0.3168, + "step": 6445 + }, + { + "epoch": 2.997674779104015, + "grad_norm": 0.3146141469478607, + "learning_rate": 1.1715345984431026e-11, + "loss": 0.3229, + "step": 6446 + }, + { + "epoch": 2.998139823283212, + "grad_norm": 0.3134753406047821, + "learning_rate": 6.5898832424249324e-12, + "loss": 0.3339, + "step": 6447 + }, + { + "epoch": 2.998604867462409, + "grad_norm": 0.30789443850517273, + "learning_rate": 2.928837353755043e-12, + "loss": 0.3139, + "step": 6448 + }, + { + "epoch": 2.999069911641606, + "grad_norm": 0.31371957063674927, + "learning_rate": 7.322093920070217e-13, + "loss": 0.3199, + "step": 6449 + }, + { + "epoch": 2.999534955820803, + "grad_norm": 0.3083919286727905, + "learning_rate": 0.0, + "loss": 0.3054, + "step": 6450 + }, + { + "epoch": 2.999534955820803, + "step": 6450, + "total_flos": 6039468541263872.0, + "train_loss": 0.3618491206889929, + "train_runtime": 197008.5242, + "train_samples_per_second": 3.143, + "train_steps_per_second": 0.033 + } + ], + "logging_steps": 1.0, + "max_steps": 6450, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6039468541263872.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}