{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.06842285323297982, "eval_steps": 500, "global_step": 650, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00010526592805073818, "grad_norm": 2.6939258575439453, "learning_rate": 0.0, "loss": 0.8515, "step": 1 }, { "epoch": 0.00021053185610147635, "grad_norm": 2.7966604232788086, "learning_rate": 1.6666666666666667e-05, "loss": 0.8166, "step": 2 }, { "epoch": 0.00031579778415221455, "grad_norm": 2.257108211517334, "learning_rate": 3.3333333333333335e-05, "loss": 0.8018, "step": 3 }, { "epoch": 0.0004210637122029527, "grad_norm": 1.1128956079483032, "learning_rate": 5e-05, "loss": 0.5497, "step": 4 }, { "epoch": 0.0005263296402536909, "grad_norm": 1.2874521017074585, "learning_rate": 4.999473462510531e-05, "loss": 0.6127, "step": 5 }, { "epoch": 0.0006315955683044291, "grad_norm": 0.9699161648750305, "learning_rate": 4.998946925021062e-05, "loss": 0.6407, "step": 6 }, { "epoch": 0.0007368614963551673, "grad_norm": 1.0630613565444946, "learning_rate": 4.998420387531593e-05, "loss": 0.4727, "step": 7 }, { "epoch": 0.0008421274244059054, "grad_norm": 0.882173478603363, "learning_rate": 4.997893850042124e-05, "loss": 0.5235, "step": 8 }, { "epoch": 0.0009473933524566436, "grad_norm": 0.676689088344574, "learning_rate": 4.997367312552654e-05, "loss": 0.4341, "step": 9 }, { "epoch": 0.0010526592805073817, "grad_norm": 0.7519457936286926, "learning_rate": 4.996840775063184e-05, "loss": 0.4338, "step": 10 }, { "epoch": 0.00115792520855812, "grad_norm": 0.7073312401771545, "learning_rate": 4.996314237573715e-05, "loss": 0.3924, "step": 11 }, { "epoch": 0.0012631911366088582, "grad_norm": 0.7425239086151123, "learning_rate": 4.995787700084246e-05, "loss": 0.4859, "step": 12 }, { "epoch": 0.0013684570646595963, "grad_norm": 0.6777500510215759, "learning_rate": 4.995261162594777e-05, "loss": 0.4235, "step": 13 }, { "epoch": 0.0014737229927103345, "grad_norm": 0.6901292204856873, "learning_rate": 4.994734625105308e-05, "loss": 0.4709, "step": 14 }, { "epoch": 0.0015789889207610726, "grad_norm": 0.8694287538528442, "learning_rate": 4.994208087615839e-05, "loss": 0.5086, "step": 15 }, { "epoch": 0.0016842548488118108, "grad_norm": 0.6798275113105774, "learning_rate": 4.99368155012637e-05, "loss": 0.4937, "step": 16 }, { "epoch": 0.001789520776862549, "grad_norm": 0.7667484879493713, "learning_rate": 4.9931550126369e-05, "loss": 0.4974, "step": 17 }, { "epoch": 0.0018947867049132871, "grad_norm": 0.6613733172416687, "learning_rate": 4.992628475147431e-05, "loss": 0.4181, "step": 18 }, { "epoch": 0.0020000526329640254, "grad_norm": 0.7069230079650879, "learning_rate": 4.992101937657962e-05, "loss": 0.4834, "step": 19 }, { "epoch": 0.0021053185610147634, "grad_norm": 0.5691242814064026, "learning_rate": 4.991575400168492e-05, "loss": 0.4405, "step": 20 }, { "epoch": 0.002210584489065502, "grad_norm": 0.701371431350708, "learning_rate": 4.991048862679023e-05, "loss": 0.3933, "step": 21 }, { "epoch": 0.00231585041711624, "grad_norm": 0.5670080780982971, "learning_rate": 4.990522325189554e-05, "loss": 0.5061, "step": 22 }, { "epoch": 0.002421116345166978, "grad_norm": 0.6001436114311218, "learning_rate": 4.9899957877000847e-05, "loss": 0.4879, "step": 23 }, { "epoch": 0.0025263822732177164, "grad_norm": 0.6185859441757202, "learning_rate": 4.9894692502106156e-05, "loss": 0.4006, "step": 24 }, { "epoch": 0.0026316482012684545, "grad_norm": 0.677813708782196, "learning_rate": 4.988942712721146e-05, "loss": 0.4347, "step": 25 }, { "epoch": 0.0027369141293191925, "grad_norm": 0.5357967019081116, "learning_rate": 4.988416175231677e-05, "loss": 0.4594, "step": 26 }, { "epoch": 0.002842180057369931, "grad_norm": 0.5995861291885376, "learning_rate": 4.9878896377422076e-05, "loss": 0.4947, "step": 27 }, { "epoch": 0.002947445985420669, "grad_norm": 0.5909422636032104, "learning_rate": 4.9873631002527385e-05, "loss": 0.5316, "step": 28 }, { "epoch": 0.003052711913471407, "grad_norm": 0.6816675662994385, "learning_rate": 4.9868365627632694e-05, "loss": 0.4705, "step": 29 }, { "epoch": 0.003157977841522145, "grad_norm": 0.5410743355751038, "learning_rate": 4.9863100252737996e-05, "loss": 0.4229, "step": 30 }, { "epoch": 0.0032632437695728836, "grad_norm": 0.7362250089645386, "learning_rate": 4.9857834877843305e-05, "loss": 0.4922, "step": 31 }, { "epoch": 0.0033685096976236216, "grad_norm": 0.7518715262413025, "learning_rate": 4.9852569502948614e-05, "loss": 0.3942, "step": 32 }, { "epoch": 0.0034737756256743597, "grad_norm": 0.6200836300849915, "learning_rate": 4.9847304128053916e-05, "loss": 0.3937, "step": 33 }, { "epoch": 0.003579041553725098, "grad_norm": 0.6816834807395935, "learning_rate": 4.9842038753159225e-05, "loss": 0.492, "step": 34 }, { "epoch": 0.003684307481775836, "grad_norm": 0.6341183185577393, "learning_rate": 4.9836773378264534e-05, "loss": 0.5873, "step": 35 }, { "epoch": 0.0037895734098265742, "grad_norm": 0.5888874530792236, "learning_rate": 4.9831508003369843e-05, "loss": 0.3784, "step": 36 }, { "epoch": 0.0038948393378773127, "grad_norm": 0.503926157951355, "learning_rate": 4.982624262847515e-05, "loss": 0.5116, "step": 37 }, { "epoch": 0.004000105265928051, "grad_norm": 0.6205700039863586, "learning_rate": 4.982097725358046e-05, "loss": 0.4237, "step": 38 }, { "epoch": 0.004105371193978789, "grad_norm": 0.47863858938217163, "learning_rate": 4.9815711878685764e-05, "loss": 0.5397, "step": 39 }, { "epoch": 0.004210637122029527, "grad_norm": 0.5036730766296387, "learning_rate": 4.981044650379107e-05, "loss": 0.46, "step": 40 }, { "epoch": 0.004315903050080265, "grad_norm": 0.4822523593902588, "learning_rate": 4.9805181128896375e-05, "loss": 0.4988, "step": 41 }, { "epoch": 0.004421168978131004, "grad_norm": 0.5173696875572205, "learning_rate": 4.9799915754001684e-05, "loss": 0.4003, "step": 42 }, { "epoch": 0.004526434906181742, "grad_norm": 0.6021311283111572, "learning_rate": 4.979465037910699e-05, "loss": 0.4306, "step": 43 }, { "epoch": 0.00463170083423248, "grad_norm": 0.5137932300567627, "learning_rate": 4.97893850042123e-05, "loss": 0.4453, "step": 44 }, { "epoch": 0.004736966762283218, "grad_norm": 0.5420482158660889, "learning_rate": 4.978411962931761e-05, "loss": 0.5377, "step": 45 }, { "epoch": 0.004842232690333956, "grad_norm": 0.5643067359924316, "learning_rate": 4.977885425442292e-05, "loss": 0.4519, "step": 46 }, { "epoch": 0.004947498618384694, "grad_norm": 0.5466287136077881, "learning_rate": 4.977358887952823e-05, "loss": 0.4221, "step": 47 }, { "epoch": 0.005052764546435433, "grad_norm": 0.5712279677391052, "learning_rate": 4.976832350463354e-05, "loss": 0.4987, "step": 48 }, { "epoch": 0.005158030474486171, "grad_norm": 0.4822379946708679, "learning_rate": 4.976305812973884e-05, "loss": 0.4848, "step": 49 }, { "epoch": 0.005263296402536909, "grad_norm": 0.5017122626304626, "learning_rate": 4.975779275484414e-05, "loss": 0.4196, "step": 50 }, { "epoch": 0.005368562330587647, "grad_norm": 0.4559021592140198, "learning_rate": 4.975252737994945e-05, "loss": 0.4412, "step": 51 }, { "epoch": 0.005473828258638385, "grad_norm": 0.5421490669250488, "learning_rate": 4.974726200505476e-05, "loss": 0.3746, "step": 52 }, { "epoch": 0.005579094186689123, "grad_norm": 0.46819037199020386, "learning_rate": 4.974199663016007e-05, "loss": 0.4521, "step": 53 }, { "epoch": 0.005684360114739862, "grad_norm": 0.45857539772987366, "learning_rate": 4.973673125526538e-05, "loss": 0.3941, "step": 54 }, { "epoch": 0.0057896260427906, "grad_norm": 0.5490565896034241, "learning_rate": 4.973146588037069e-05, "loss": 0.4551, "step": 55 }, { "epoch": 0.005894891970841338, "grad_norm": 0.5232876539230347, "learning_rate": 4.9726200505475997e-05, "loss": 0.4356, "step": 56 }, { "epoch": 0.006000157898892076, "grad_norm": 0.5434950590133667, "learning_rate": 4.97209351305813e-05, "loss": 0.436, "step": 57 }, { "epoch": 0.006105423826942814, "grad_norm": 0.44252631068229675, "learning_rate": 4.971566975568661e-05, "loss": 0.4263, "step": 58 }, { "epoch": 0.006210689754993552, "grad_norm": 0.49957412481307983, "learning_rate": 4.971040438079192e-05, "loss": 0.4422, "step": 59 }, { "epoch": 0.00631595568304429, "grad_norm": 0.46676474809646606, "learning_rate": 4.970513900589722e-05, "loss": 0.434, "step": 60 }, { "epoch": 0.006421221611095029, "grad_norm": 0.5107528567314148, "learning_rate": 4.969987363100253e-05, "loss": 0.5225, "step": 61 }, { "epoch": 0.006526487539145767, "grad_norm": 0.4967051148414612, "learning_rate": 4.969460825610784e-05, "loss": 0.4199, "step": 62 }, { "epoch": 0.006631753467196505, "grad_norm": 0.4968240559101105, "learning_rate": 4.9689342881213146e-05, "loss": 0.4157, "step": 63 }, { "epoch": 0.006737019395247243, "grad_norm": 0.5468823909759521, "learning_rate": 4.9684077506318455e-05, "loss": 0.4204, "step": 64 }, { "epoch": 0.006842285323297981, "grad_norm": 0.49830362200737, "learning_rate": 4.967881213142376e-05, "loss": 0.4381, "step": 65 }, { "epoch": 0.006947551251348719, "grad_norm": 0.6488986611366272, "learning_rate": 4.9673546756529066e-05, "loss": 0.5919, "step": 66 }, { "epoch": 0.007052817179399458, "grad_norm": 0.5668662190437317, "learning_rate": 4.9668281381634375e-05, "loss": 0.4089, "step": 67 }, { "epoch": 0.007158083107450196, "grad_norm": 0.5446314811706543, "learning_rate": 4.9663016006739684e-05, "loss": 0.4116, "step": 68 }, { "epoch": 0.007263349035500934, "grad_norm": 0.5011276006698608, "learning_rate": 4.9657750631844993e-05, "loss": 0.4808, "step": 69 }, { "epoch": 0.007368614963551672, "grad_norm": 0.7226698398590088, "learning_rate": 4.9652485256950296e-05, "loss": 0.4645, "step": 70 }, { "epoch": 0.00747388089160241, "grad_norm": 0.47501352429389954, "learning_rate": 4.9647219882055605e-05, "loss": 0.5207, "step": 71 }, { "epoch": 0.0075791468196531485, "grad_norm": 0.4856880307197571, "learning_rate": 4.9641954507160914e-05, "loss": 0.4878, "step": 72 }, { "epoch": 0.007684412747703887, "grad_norm": 0.5225908756256104, "learning_rate": 4.9636689132266216e-05, "loss": 0.514, "step": 73 }, { "epoch": 0.007789678675754625, "grad_norm": 0.526539146900177, "learning_rate": 4.9631423757371525e-05, "loss": 0.4572, "step": 74 }, { "epoch": 0.007894944603805363, "grad_norm": 0.49719616770744324, "learning_rate": 4.9626158382476834e-05, "loss": 0.4352, "step": 75 }, { "epoch": 0.008000210531856101, "grad_norm": 0.5542761087417603, "learning_rate": 4.962089300758214e-05, "loss": 0.5408, "step": 76 }, { "epoch": 0.00810547645990684, "grad_norm": 0.4830870032310486, "learning_rate": 4.961562763268745e-05, "loss": 0.3941, "step": 77 }, { "epoch": 0.008210742387957578, "grad_norm": 0.504296600818634, "learning_rate": 4.961036225779276e-05, "loss": 0.4609, "step": 78 }, { "epoch": 0.008316008316008316, "grad_norm": 0.5107358694076538, "learning_rate": 4.960509688289807e-05, "loss": 0.4313, "step": 79 }, { "epoch": 0.008421274244059054, "grad_norm": 0.5339490175247192, "learning_rate": 4.959983150800337e-05, "loss": 0.5046, "step": 80 }, { "epoch": 0.008526540172109793, "grad_norm": 0.4737516939640045, "learning_rate": 4.9594566133108675e-05, "loss": 0.5728, "step": 81 }, { "epoch": 0.00863180610016053, "grad_norm": 0.4952607750892639, "learning_rate": 4.9589300758213984e-05, "loss": 0.4464, "step": 82 }, { "epoch": 0.008737072028211269, "grad_norm": 0.5083893537521362, "learning_rate": 4.958403538331929e-05, "loss": 0.4429, "step": 83 }, { "epoch": 0.008842337956262008, "grad_norm": 0.5977057218551636, "learning_rate": 4.95787700084246e-05, "loss": 0.4367, "step": 84 }, { "epoch": 0.008947603884312745, "grad_norm": 0.6330780982971191, "learning_rate": 4.957350463352991e-05, "loss": 0.4736, "step": 85 }, { "epoch": 0.009052869812363484, "grad_norm": 0.4460638761520386, "learning_rate": 4.956823925863522e-05, "loss": 0.4933, "step": 86 }, { "epoch": 0.00915813574041422, "grad_norm": 0.508703351020813, "learning_rate": 4.956297388374053e-05, "loss": 0.5794, "step": 87 }, { "epoch": 0.00926340166846496, "grad_norm": 0.45487043261528015, "learning_rate": 4.955770850884584e-05, "loss": 0.5156, "step": 88 }, { "epoch": 0.009368667596515699, "grad_norm": 0.46359360218048096, "learning_rate": 4.955244313395114e-05, "loss": 0.4634, "step": 89 }, { "epoch": 0.009473933524566436, "grad_norm": 0.5234309434890747, "learning_rate": 4.954717775905645e-05, "loss": 0.383, "step": 90 }, { "epoch": 0.009579199452617175, "grad_norm": 0.5344865322113037, "learning_rate": 4.954191238416175e-05, "loss": 0.4619, "step": 91 }, { "epoch": 0.009684465380667912, "grad_norm": 0.6055357456207275, "learning_rate": 4.953664700926706e-05, "loss": 0.495, "step": 92 }, { "epoch": 0.00978973130871865, "grad_norm": 0.4749431014060974, "learning_rate": 4.953138163437237e-05, "loss": 0.5209, "step": 93 }, { "epoch": 0.009894997236769388, "grad_norm": 0.4775514602661133, "learning_rate": 4.952611625947768e-05, "loss": 0.4064, "step": 94 }, { "epoch": 0.010000263164820127, "grad_norm": 0.4580100178718567, "learning_rate": 4.952085088458299e-05, "loss": 0.4479, "step": 95 }, { "epoch": 0.010105529092870866, "grad_norm": 0.5589710474014282, "learning_rate": 4.9515585509688296e-05, "loss": 0.4072, "step": 96 }, { "epoch": 0.010210795020921603, "grad_norm": 0.45461875200271606, "learning_rate": 4.95103201347936e-05, "loss": 0.4933, "step": 97 }, { "epoch": 0.010316060948972342, "grad_norm": 0.4552902579307556, "learning_rate": 4.950505475989891e-05, "loss": 0.4038, "step": 98 }, { "epoch": 0.010421326877023079, "grad_norm": 0.5590063333511353, "learning_rate": 4.9499789385004216e-05, "loss": 0.4928, "step": 99 }, { "epoch": 0.010526592805073818, "grad_norm": 0.5689685344696045, "learning_rate": 4.949452401010952e-05, "loss": 0.3962, "step": 100 }, { "epoch": 0.010631858733124555, "grad_norm": 0.5378232598304749, "learning_rate": 4.948925863521483e-05, "loss": 0.4843, "step": 101 }, { "epoch": 0.010737124661175294, "grad_norm": 0.6677789688110352, "learning_rate": 4.948399326032014e-05, "loss": 0.5839, "step": 102 }, { "epoch": 0.010842390589226033, "grad_norm": 0.4315250515937805, "learning_rate": 4.9478727885425446e-05, "loss": 0.5467, "step": 103 }, { "epoch": 0.01094765651727677, "grad_norm": 0.6344457268714905, "learning_rate": 4.9473462510530755e-05, "loss": 0.5048, "step": 104 }, { "epoch": 0.011052922445327509, "grad_norm": 0.41527998447418213, "learning_rate": 4.946819713563606e-05, "loss": 0.5559, "step": 105 }, { "epoch": 0.011158188373378246, "grad_norm": 0.46887871623039246, "learning_rate": 4.9462931760741366e-05, "loss": 0.4165, "step": 106 }, { "epoch": 0.011263454301428985, "grad_norm": 0.5572345852851868, "learning_rate": 4.9457666385846675e-05, "loss": 0.4496, "step": 107 }, { "epoch": 0.011368720229479724, "grad_norm": 0.46033406257629395, "learning_rate": 4.9452401010951984e-05, "loss": 0.4699, "step": 108 }, { "epoch": 0.011473986157530461, "grad_norm": 0.5205333232879639, "learning_rate": 4.944713563605729e-05, "loss": 0.4291, "step": 109 }, { "epoch": 0.0115792520855812, "grad_norm": 0.5044732689857483, "learning_rate": 4.9441870261162595e-05, "loss": 0.4424, "step": 110 }, { "epoch": 0.011684518013631937, "grad_norm": 0.5410451889038086, "learning_rate": 4.9436604886267904e-05, "loss": 0.3803, "step": 111 }, { "epoch": 0.011789783941682676, "grad_norm": 0.5163026452064514, "learning_rate": 4.943133951137321e-05, "loss": 0.4276, "step": 112 }, { "epoch": 0.011895049869733413, "grad_norm": 0.4330487847328186, "learning_rate": 4.9426074136478516e-05, "loss": 0.4644, "step": 113 }, { "epoch": 0.012000315797784152, "grad_norm": 0.41046929359436035, "learning_rate": 4.9420808761583825e-05, "loss": 0.4161, "step": 114 }, { "epoch": 0.012105581725834891, "grad_norm": 0.4908786714076996, "learning_rate": 4.9415543386689134e-05, "loss": 0.4502, "step": 115 }, { "epoch": 0.012210847653885628, "grad_norm": 0.4866664707660675, "learning_rate": 4.941027801179444e-05, "loss": 0.3923, "step": 116 }, { "epoch": 0.012316113581936367, "grad_norm": 0.4272409677505493, "learning_rate": 4.940501263689975e-05, "loss": 0.4828, "step": 117 }, { "epoch": 0.012421379509987104, "grad_norm": 0.517900288105011, "learning_rate": 4.939974726200506e-05, "loss": 0.4661, "step": 118 }, { "epoch": 0.012526645438037843, "grad_norm": 0.5139513611793518, "learning_rate": 4.939448188711037e-05, "loss": 0.536, "step": 119 }, { "epoch": 0.01263191136608858, "grad_norm": 0.5204519629478455, "learning_rate": 4.938921651221567e-05, "loss": 0.4156, "step": 120 }, { "epoch": 0.01273717729413932, "grad_norm": 0.566659152507782, "learning_rate": 4.9383951137320974e-05, "loss": 0.4675, "step": 121 }, { "epoch": 0.012842443222190058, "grad_norm": 0.5262351632118225, "learning_rate": 4.937868576242628e-05, "loss": 0.5002, "step": 122 }, { "epoch": 0.012947709150240795, "grad_norm": 0.5888293981552124, "learning_rate": 4.937342038753159e-05, "loss": 0.4058, "step": 123 }, { "epoch": 0.013052975078291534, "grad_norm": 0.5911523103713989, "learning_rate": 4.93681550126369e-05, "loss": 0.392, "step": 124 }, { "epoch": 0.013158241006342272, "grad_norm": 0.48798367381095886, "learning_rate": 4.936288963774221e-05, "loss": 0.4442, "step": 125 }, { "epoch": 0.01326350693439301, "grad_norm": 0.5228798985481262, "learning_rate": 4.935762426284752e-05, "loss": 0.4673, "step": 126 }, { "epoch": 0.01336877286244375, "grad_norm": 0.4832141399383545, "learning_rate": 4.935235888795283e-05, "loss": 0.4259, "step": 127 }, { "epoch": 0.013474038790494487, "grad_norm": 0.6188245415687561, "learning_rate": 4.934709351305814e-05, "loss": 0.4982, "step": 128 }, { "epoch": 0.013579304718545225, "grad_norm": 0.4905821979045868, "learning_rate": 4.934182813816344e-05, "loss": 0.4539, "step": 129 }, { "epoch": 0.013684570646595963, "grad_norm": 0.6506298184394836, "learning_rate": 4.933656276326875e-05, "loss": 0.3982, "step": 130 }, { "epoch": 0.013789836574646702, "grad_norm": 0.570380687713623, "learning_rate": 4.933129738837405e-05, "loss": 0.3901, "step": 131 }, { "epoch": 0.013895102502697439, "grad_norm": 0.44687098264694214, "learning_rate": 4.932603201347936e-05, "loss": 0.4176, "step": 132 }, { "epoch": 0.014000368430748178, "grad_norm": 0.6272158622741699, "learning_rate": 4.932076663858467e-05, "loss": 0.4455, "step": 133 }, { "epoch": 0.014105634358798917, "grad_norm": 0.6358391046524048, "learning_rate": 4.931550126368998e-05, "loss": 0.384, "step": 134 }, { "epoch": 0.014210900286849654, "grad_norm": 0.6558123826980591, "learning_rate": 4.931023588879529e-05, "loss": 0.5024, "step": 135 }, { "epoch": 0.014316166214900393, "grad_norm": 0.4577985107898712, "learning_rate": 4.9304970513900596e-05, "loss": 0.3906, "step": 136 }, { "epoch": 0.01442143214295113, "grad_norm": 0.5580503344535828, "learning_rate": 4.92997051390059e-05, "loss": 0.4589, "step": 137 }, { "epoch": 0.014526698071001869, "grad_norm": 0.5660861134529114, "learning_rate": 4.929443976411121e-05, "loss": 0.3913, "step": 138 }, { "epoch": 0.014631963999052606, "grad_norm": 0.49188342690467834, "learning_rate": 4.9289174389216516e-05, "loss": 0.3951, "step": 139 }, { "epoch": 0.014737229927103345, "grad_norm": 0.6210848689079285, "learning_rate": 4.9283909014321825e-05, "loss": 0.4282, "step": 140 }, { "epoch": 0.014842495855154084, "grad_norm": 0.48430967330932617, "learning_rate": 4.927864363942713e-05, "loss": 0.4667, "step": 141 }, { "epoch": 0.01494776178320482, "grad_norm": 0.5269038677215576, "learning_rate": 4.9273378264532436e-05, "loss": 0.3845, "step": 142 }, { "epoch": 0.01505302771125556, "grad_norm": 0.5490912199020386, "learning_rate": 4.9268112889637745e-05, "loss": 0.4477, "step": 143 }, { "epoch": 0.015158293639306297, "grad_norm": 0.4111802279949188, "learning_rate": 4.9262847514743054e-05, "loss": 0.4351, "step": 144 }, { "epoch": 0.015263559567357036, "grad_norm": 0.48929688334465027, "learning_rate": 4.9257582139848357e-05, "loss": 0.4512, "step": 145 }, { "epoch": 0.015368825495407775, "grad_norm": 0.9201393723487854, "learning_rate": 4.9252316764953666e-05, "loss": 0.5254, "step": 146 }, { "epoch": 0.015474091423458512, "grad_norm": 0.5191910862922668, "learning_rate": 4.9247051390058975e-05, "loss": 0.5455, "step": 147 }, { "epoch": 0.01557935735150925, "grad_norm": 0.3562093675136566, "learning_rate": 4.9241786015164284e-05, "loss": 0.5303, "step": 148 }, { "epoch": 0.015684623279559988, "grad_norm": 0.7195460796356201, "learning_rate": 4.923652064026959e-05, "loss": 0.4389, "step": 149 }, { "epoch": 0.015789889207610725, "grad_norm": 0.448176771402359, "learning_rate": 4.9231255265374895e-05, "loss": 0.3987, "step": 150 }, { "epoch": 0.015895155135661466, "grad_norm": 0.48504385352134705, "learning_rate": 4.9225989890480204e-05, "loss": 0.4725, "step": 151 }, { "epoch": 0.016000421063712203, "grad_norm": 0.5456967353820801, "learning_rate": 4.922072451558551e-05, "loss": 0.5143, "step": 152 }, { "epoch": 0.01610568699176294, "grad_norm": 0.61397784948349, "learning_rate": 4.9215459140690815e-05, "loss": 0.4295, "step": 153 }, { "epoch": 0.01621095291981368, "grad_norm": 0.6359485387802124, "learning_rate": 4.9210193765796124e-05, "loss": 0.4498, "step": 154 }, { "epoch": 0.016316218847864418, "grad_norm": 0.5002400279045105, "learning_rate": 4.920492839090143e-05, "loss": 0.467, "step": 155 }, { "epoch": 0.016421484775915155, "grad_norm": 0.5669925212860107, "learning_rate": 4.919966301600674e-05, "loss": 0.5151, "step": 156 }, { "epoch": 0.016526750703965892, "grad_norm": 0.4109033942222595, "learning_rate": 4.919439764111205e-05, "loss": 0.4672, "step": 157 }, { "epoch": 0.016632016632016633, "grad_norm": 0.5119397044181824, "learning_rate": 4.918913226621736e-05, "loss": 0.4846, "step": 158 }, { "epoch": 0.01673728256006737, "grad_norm": 0.5187058448791504, "learning_rate": 4.918386689132267e-05, "loss": 0.4698, "step": 159 }, { "epoch": 0.016842548488118107, "grad_norm": 0.55632483959198, "learning_rate": 4.917860151642797e-05, "loss": 0.3904, "step": 160 }, { "epoch": 0.016947814416168848, "grad_norm": 0.5332942008972168, "learning_rate": 4.917333614153328e-05, "loss": 0.4253, "step": 161 }, { "epoch": 0.017053080344219585, "grad_norm": 0.5523495078086853, "learning_rate": 4.916807076663858e-05, "loss": 0.415, "step": 162 }, { "epoch": 0.017158346272270322, "grad_norm": 0.5162644386291504, "learning_rate": 4.916280539174389e-05, "loss": 0.514, "step": 163 }, { "epoch": 0.01726361220032106, "grad_norm": 0.414809912443161, "learning_rate": 4.91575400168492e-05, "loss": 0.4757, "step": 164 }, { "epoch": 0.0173688781283718, "grad_norm": 0.5634474754333496, "learning_rate": 4.915227464195451e-05, "loss": 0.3643, "step": 165 }, { "epoch": 0.017474144056422537, "grad_norm": 0.5438713431358337, "learning_rate": 4.914700926705982e-05, "loss": 0.4315, "step": 166 }, { "epoch": 0.017579409984473274, "grad_norm": 0.49885427951812744, "learning_rate": 4.914174389216513e-05, "loss": 0.4697, "step": 167 }, { "epoch": 0.017684675912524015, "grad_norm": 0.46923205256462097, "learning_rate": 4.913647851727044e-05, "loss": 0.4189, "step": 168 }, { "epoch": 0.017789941840574752, "grad_norm": 0.4423271119594574, "learning_rate": 4.913121314237574e-05, "loss": 0.4602, "step": 169 }, { "epoch": 0.01789520776862549, "grad_norm": 0.6115851402282715, "learning_rate": 4.912594776748105e-05, "loss": 0.4399, "step": 170 }, { "epoch": 0.018000473696676227, "grad_norm": 0.5554397106170654, "learning_rate": 4.912068239258635e-05, "loss": 0.4262, "step": 171 }, { "epoch": 0.018105739624726967, "grad_norm": 0.565323531627655, "learning_rate": 4.911541701769166e-05, "loss": 0.4424, "step": 172 }, { "epoch": 0.018211005552777704, "grad_norm": 0.44236519932746887, "learning_rate": 4.911015164279697e-05, "loss": 0.424, "step": 173 }, { "epoch": 0.01831627148082844, "grad_norm": 0.6567726731300354, "learning_rate": 4.910488626790228e-05, "loss": 0.4231, "step": 174 }, { "epoch": 0.018421537408879182, "grad_norm": 0.42518746852874756, "learning_rate": 4.9099620893007586e-05, "loss": 0.4878, "step": 175 }, { "epoch": 0.01852680333692992, "grad_norm": 0.5739135146141052, "learning_rate": 4.9094355518112895e-05, "loss": 0.4514, "step": 176 }, { "epoch": 0.018632069264980657, "grad_norm": 0.628442645072937, "learning_rate": 4.90890901432182e-05, "loss": 0.3625, "step": 177 }, { "epoch": 0.018737335193031397, "grad_norm": 0.445872962474823, "learning_rate": 4.9083824768323507e-05, "loss": 0.5256, "step": 178 }, { "epoch": 0.018842601121082134, "grad_norm": 0.5037261247634888, "learning_rate": 4.9078559393428816e-05, "loss": 0.4322, "step": 179 }, { "epoch": 0.01894786704913287, "grad_norm": 0.5586241483688354, "learning_rate": 4.9073294018534125e-05, "loss": 0.5682, "step": 180 }, { "epoch": 0.01905313297718361, "grad_norm": 0.5735304355621338, "learning_rate": 4.906802864363943e-05, "loss": 0.4486, "step": 181 }, { "epoch": 0.01915839890523435, "grad_norm": 0.6629624962806702, "learning_rate": 4.9062763268744736e-05, "loss": 0.4748, "step": 182 }, { "epoch": 0.019263664833285087, "grad_norm": 0.5536085963249207, "learning_rate": 4.9057497893850045e-05, "loss": 0.3779, "step": 183 }, { "epoch": 0.019368930761335824, "grad_norm": 0.37973251938819885, "learning_rate": 4.9052232518955354e-05, "loss": 0.4913, "step": 184 }, { "epoch": 0.019474196689386564, "grad_norm": 0.6046680212020874, "learning_rate": 4.9046967144060656e-05, "loss": 0.4644, "step": 185 }, { "epoch": 0.0195794626174373, "grad_norm": 0.5051435828208923, "learning_rate": 4.9041701769165965e-05, "loss": 0.5042, "step": 186 }, { "epoch": 0.01968472854548804, "grad_norm": 0.5261257290840149, "learning_rate": 4.9036436394271274e-05, "loss": 0.4679, "step": 187 }, { "epoch": 0.019789994473538776, "grad_norm": 0.5349376797676086, "learning_rate": 4.903117101937658e-05, "loss": 0.4206, "step": 188 }, { "epoch": 0.019895260401589517, "grad_norm": 0.5617197751998901, "learning_rate": 4.902590564448189e-05, "loss": 0.3974, "step": 189 }, { "epoch": 0.020000526329640254, "grad_norm": 0.549514889717102, "learning_rate": 4.90206402695872e-05, "loss": 0.5034, "step": 190 }, { "epoch": 0.02010579225769099, "grad_norm": 0.6475022435188293, "learning_rate": 4.9015374894692503e-05, "loss": 0.4651, "step": 191 }, { "epoch": 0.02021105818574173, "grad_norm": 0.6060453057289124, "learning_rate": 4.901010951979781e-05, "loss": 0.3981, "step": 192 }, { "epoch": 0.02031632411379247, "grad_norm": 0.6936651468276978, "learning_rate": 4.9004844144903115e-05, "loss": 0.3804, "step": 193 }, { "epoch": 0.020421590041843206, "grad_norm": 0.44638895988464355, "learning_rate": 4.8999578770008424e-05, "loss": 0.4596, "step": 194 }, { "epoch": 0.020526855969893943, "grad_norm": 0.5297572612762451, "learning_rate": 4.899431339511373e-05, "loss": 0.4385, "step": 195 }, { "epoch": 0.020632121897944684, "grad_norm": 0.5046480894088745, "learning_rate": 4.898904802021904e-05, "loss": 0.4557, "step": 196 }, { "epoch": 0.02073738782599542, "grad_norm": 0.5276935696601868, "learning_rate": 4.898378264532435e-05, "loss": 0.39, "step": 197 }, { "epoch": 0.020842653754046158, "grad_norm": 0.4923096001148224, "learning_rate": 4.897851727042966e-05, "loss": 0.4585, "step": 198 }, { "epoch": 0.0209479196820969, "grad_norm": 0.4554820954799652, "learning_rate": 4.897325189553497e-05, "loss": 0.5175, "step": 199 }, { "epoch": 0.021053185610147636, "grad_norm": 0.47559452056884766, "learning_rate": 4.896798652064027e-05, "loss": 0.5275, "step": 200 }, { "epoch": 0.021158451538198373, "grad_norm": 0.5070779323577881, "learning_rate": 4.896272114574558e-05, "loss": 0.4958, "step": 201 }, { "epoch": 0.02126371746624911, "grad_norm": 0.5040444135665894, "learning_rate": 4.895745577085088e-05, "loss": 0.4616, "step": 202 }, { "epoch": 0.02136898339429985, "grad_norm": 0.5290699601173401, "learning_rate": 4.895219039595619e-05, "loss": 0.5178, "step": 203 }, { "epoch": 0.021474249322350588, "grad_norm": 0.5007508993148804, "learning_rate": 4.89469250210615e-05, "loss": 0.4489, "step": 204 }, { "epoch": 0.021579515250401325, "grad_norm": 0.6373962759971619, "learning_rate": 4.894165964616681e-05, "loss": 0.4124, "step": 205 }, { "epoch": 0.021684781178452066, "grad_norm": 0.5132836699485779, "learning_rate": 4.893639427127212e-05, "loss": 0.4534, "step": 206 }, { "epoch": 0.021790047106502803, "grad_norm": 0.6253231167793274, "learning_rate": 4.893112889637743e-05, "loss": 0.3986, "step": 207 }, { "epoch": 0.02189531303455354, "grad_norm": 0.5937986373901367, "learning_rate": 4.8925863521482736e-05, "loss": 0.3956, "step": 208 }, { "epoch": 0.022000578962604277, "grad_norm": 0.4578053951263428, "learning_rate": 4.892059814658804e-05, "loss": 0.4068, "step": 209 }, { "epoch": 0.022105844890655018, "grad_norm": 0.5060281157493591, "learning_rate": 4.891533277169335e-05, "loss": 0.5179, "step": 210 }, { "epoch": 0.022211110818705755, "grad_norm": 0.561792254447937, "learning_rate": 4.8910067396798657e-05, "loss": 0.4547, "step": 211 }, { "epoch": 0.022316376746756492, "grad_norm": 0.38052886724472046, "learning_rate": 4.890480202190396e-05, "loss": 0.4493, "step": 212 }, { "epoch": 0.022421642674807233, "grad_norm": 0.5639155507087708, "learning_rate": 4.889953664700927e-05, "loss": 0.4239, "step": 213 }, { "epoch": 0.02252690860285797, "grad_norm": 0.5452573299407959, "learning_rate": 4.889427127211458e-05, "loss": 0.4393, "step": 214 }, { "epoch": 0.022632174530908707, "grad_norm": 0.4861447811126709, "learning_rate": 4.8889005897219886e-05, "loss": 0.4971, "step": 215 }, { "epoch": 0.022737440458959448, "grad_norm": 0.5619585514068604, "learning_rate": 4.8883740522325195e-05, "loss": 0.3992, "step": 216 }, { "epoch": 0.022842706387010185, "grad_norm": 0.5488256812095642, "learning_rate": 4.88784751474305e-05, "loss": 0.4155, "step": 217 }, { "epoch": 0.022947972315060922, "grad_norm": 0.517796516418457, "learning_rate": 4.8873209772535806e-05, "loss": 0.5018, "step": 218 }, { "epoch": 0.02305323824311166, "grad_norm": 0.6027892827987671, "learning_rate": 4.8867944397641115e-05, "loss": 0.4684, "step": 219 }, { "epoch": 0.0231585041711624, "grad_norm": 0.47196510434150696, "learning_rate": 4.8862679022746424e-05, "loss": 0.4423, "step": 220 }, { "epoch": 0.023263770099213137, "grad_norm": 0.41390231251716614, "learning_rate": 4.8857413647851726e-05, "loss": 0.4031, "step": 221 }, { "epoch": 0.023369036027263874, "grad_norm": 0.5514193773269653, "learning_rate": 4.8852148272957035e-05, "loss": 0.6308, "step": 222 }, { "epoch": 0.023474301955314615, "grad_norm": 0.4564357101917267, "learning_rate": 4.8846882898062344e-05, "loss": 0.5284, "step": 223 }, { "epoch": 0.023579567883365352, "grad_norm": 0.45888492465019226, "learning_rate": 4.8841617523167653e-05, "loss": 0.4536, "step": 224 }, { "epoch": 0.02368483381141609, "grad_norm": 0.4363495409488678, "learning_rate": 4.8836352148272956e-05, "loss": 0.4838, "step": 225 }, { "epoch": 0.023790099739466827, "grad_norm": 0.40970975160598755, "learning_rate": 4.8831086773378265e-05, "loss": 0.5299, "step": 226 }, { "epoch": 0.023895365667517567, "grad_norm": 0.5274611711502075, "learning_rate": 4.8825821398483574e-05, "loss": 0.3967, "step": 227 }, { "epoch": 0.024000631595568304, "grad_norm": 0.5038068890571594, "learning_rate": 4.882055602358888e-05, "loss": 0.5067, "step": 228 }, { "epoch": 0.02410589752361904, "grad_norm": 0.5031372904777527, "learning_rate": 4.881529064869419e-05, "loss": 0.3756, "step": 229 }, { "epoch": 0.024211163451669782, "grad_norm": 0.49740293622016907, "learning_rate": 4.88100252737995e-05, "loss": 0.4809, "step": 230 }, { "epoch": 0.02431642937972052, "grad_norm": 0.4950021207332611, "learning_rate": 4.88047598989048e-05, "loss": 0.4149, "step": 231 }, { "epoch": 0.024421695307771257, "grad_norm": 0.46618038415908813, "learning_rate": 4.879949452401011e-05, "loss": 0.4737, "step": 232 }, { "epoch": 0.024526961235821994, "grad_norm": 0.4663354158401489, "learning_rate": 4.8794229149115414e-05, "loss": 0.3884, "step": 233 }, { "epoch": 0.024632227163872734, "grad_norm": 0.6165478229522705, "learning_rate": 4.878896377422072e-05, "loss": 0.3875, "step": 234 }, { "epoch": 0.02473749309192347, "grad_norm": 0.4838646948337555, "learning_rate": 4.878369839932603e-05, "loss": 0.4679, "step": 235 }, { "epoch": 0.02484275901997421, "grad_norm": 0.49089592695236206, "learning_rate": 4.877843302443134e-05, "loss": 0.5484, "step": 236 }, { "epoch": 0.02494802494802495, "grad_norm": 0.4166033864021301, "learning_rate": 4.877316764953665e-05, "loss": 0.4594, "step": 237 }, { "epoch": 0.025053290876075687, "grad_norm": 0.6557610630989075, "learning_rate": 4.876790227464196e-05, "loss": 0.422, "step": 238 }, { "epoch": 0.025158556804126424, "grad_norm": 0.4997393786907196, "learning_rate": 4.876263689974727e-05, "loss": 0.4165, "step": 239 }, { "epoch": 0.02526382273217716, "grad_norm": 0.3650420606136322, "learning_rate": 4.875737152485258e-05, "loss": 0.4758, "step": 240 }, { "epoch": 0.0253690886602279, "grad_norm": 0.5316746830940247, "learning_rate": 4.875210614995788e-05, "loss": 0.4703, "step": 241 }, { "epoch": 0.02547435458827864, "grad_norm": 0.3838014602661133, "learning_rate": 4.874684077506318e-05, "loss": 0.6512, "step": 242 }, { "epoch": 0.025579620516329376, "grad_norm": 0.5243346095085144, "learning_rate": 4.874157540016849e-05, "loss": 0.4515, "step": 243 }, { "epoch": 0.025684886444380117, "grad_norm": 0.46801677346229553, "learning_rate": 4.87363100252738e-05, "loss": 0.4605, "step": 244 }, { "epoch": 0.025790152372430854, "grad_norm": 0.4614790081977844, "learning_rate": 4.873104465037911e-05, "loss": 0.4101, "step": 245 }, { "epoch": 0.02589541830048159, "grad_norm": 0.4433145821094513, "learning_rate": 4.872577927548442e-05, "loss": 0.4578, "step": 246 }, { "epoch": 0.026000684228532328, "grad_norm": 0.43368014693260193, "learning_rate": 4.872051390058973e-05, "loss": 0.4077, "step": 247 }, { "epoch": 0.02610595015658307, "grad_norm": 0.4347352385520935, "learning_rate": 4.8715248525695036e-05, "loss": 0.4451, "step": 248 }, { "epoch": 0.026211216084633806, "grad_norm": 0.5047518610954285, "learning_rate": 4.870998315080034e-05, "loss": 0.4308, "step": 249 }, { "epoch": 0.026316482012684543, "grad_norm": 0.6036553978919983, "learning_rate": 4.870471777590565e-05, "loss": 0.5001, "step": 250 }, { "epoch": 0.026421747940735284, "grad_norm": 0.5581931471824646, "learning_rate": 4.8699452401010956e-05, "loss": 0.3939, "step": 251 }, { "epoch": 0.02652701386878602, "grad_norm": 0.4085439145565033, "learning_rate": 4.869418702611626e-05, "loss": 0.5321, "step": 252 }, { "epoch": 0.026632279796836758, "grad_norm": 0.6976563334465027, "learning_rate": 4.868892165122157e-05, "loss": 0.4767, "step": 253 }, { "epoch": 0.0267375457248875, "grad_norm": 0.48653343319892883, "learning_rate": 4.8683656276326876e-05, "loss": 0.5387, "step": 254 }, { "epoch": 0.026842811652938236, "grad_norm": 0.5379003286361694, "learning_rate": 4.8678390901432185e-05, "loss": 0.4418, "step": 255 }, { "epoch": 0.026948077580988973, "grad_norm": 0.42478466033935547, "learning_rate": 4.8673125526537494e-05, "loss": 0.4751, "step": 256 }, { "epoch": 0.02705334350903971, "grad_norm": 0.4857715666294098, "learning_rate": 4.86678601516428e-05, "loss": 0.4608, "step": 257 }, { "epoch": 0.02715860943709045, "grad_norm": 0.46174147725105286, "learning_rate": 4.8662594776748106e-05, "loss": 0.4611, "step": 258 }, { "epoch": 0.027263875365141188, "grad_norm": 0.5316092371940613, "learning_rate": 4.8657329401853415e-05, "loss": 0.4463, "step": 259 }, { "epoch": 0.027369141293191925, "grad_norm": 0.5541107058525085, "learning_rate": 4.8652064026958724e-05, "loss": 0.4619, "step": 260 }, { "epoch": 0.027474407221242666, "grad_norm": 0.4637160003185272, "learning_rate": 4.864679865206403e-05, "loss": 0.425, "step": 261 }, { "epoch": 0.027579673149293403, "grad_norm": 0.4406774938106537, "learning_rate": 4.8641533277169335e-05, "loss": 0.5234, "step": 262 }, { "epoch": 0.02768493907734414, "grad_norm": 0.5540871620178223, "learning_rate": 4.8636267902274644e-05, "loss": 0.4565, "step": 263 }, { "epoch": 0.027790205005394877, "grad_norm": 0.5119719505310059, "learning_rate": 4.863100252737995e-05, "loss": 0.4224, "step": 264 }, { "epoch": 0.027895470933445618, "grad_norm": 0.6064046025276184, "learning_rate": 4.8625737152485255e-05, "loss": 0.453, "step": 265 }, { "epoch": 0.028000736861496355, "grad_norm": 0.5928232669830322, "learning_rate": 4.8620471777590564e-05, "loss": 0.4444, "step": 266 }, { "epoch": 0.028106002789547092, "grad_norm": 0.5610330700874329, "learning_rate": 4.861520640269587e-05, "loss": 0.4051, "step": 267 }, { "epoch": 0.028211268717597833, "grad_norm": 0.4866770803928375, "learning_rate": 4.860994102780118e-05, "loss": 0.4629, "step": 268 }, { "epoch": 0.02831653464564857, "grad_norm": 0.5181504487991333, "learning_rate": 4.860467565290649e-05, "loss": 0.4225, "step": 269 }, { "epoch": 0.028421800573699307, "grad_norm": 0.36064937710762024, "learning_rate": 4.85994102780118e-05, "loss": 0.4136, "step": 270 }, { "epoch": 0.028527066501750045, "grad_norm": 0.4846802353858948, "learning_rate": 4.85941449031171e-05, "loss": 0.4321, "step": 271 }, { "epoch": 0.028632332429800785, "grad_norm": 0.4463631510734558, "learning_rate": 4.858887952822241e-05, "loss": 0.5485, "step": 272 }, { "epoch": 0.028737598357851522, "grad_norm": 0.4516132175922394, "learning_rate": 4.8583614153327714e-05, "loss": 0.4853, "step": 273 }, { "epoch": 0.02884286428590226, "grad_norm": 0.40815305709838867, "learning_rate": 4.857834877843302e-05, "loss": 0.3355, "step": 274 }, { "epoch": 0.028948130213953, "grad_norm": 0.54203200340271, "learning_rate": 4.857308340353833e-05, "loss": 0.3969, "step": 275 }, { "epoch": 0.029053396142003737, "grad_norm": 0.5161415338516235, "learning_rate": 4.856781802864364e-05, "loss": 0.3776, "step": 276 }, { "epoch": 0.029158662070054474, "grad_norm": 0.4058281183242798, "learning_rate": 4.856255265374895e-05, "loss": 0.4268, "step": 277 }, { "epoch": 0.02926392799810521, "grad_norm": 0.43867388367652893, "learning_rate": 4.855728727885426e-05, "loss": 0.4458, "step": 278 }, { "epoch": 0.029369193926155952, "grad_norm": 0.441211998462677, "learning_rate": 4.855202190395957e-05, "loss": 0.4532, "step": 279 }, { "epoch": 0.02947445985420669, "grad_norm": 0.5454714894294739, "learning_rate": 4.854675652906488e-05, "loss": 0.4907, "step": 280 }, { "epoch": 0.029579725782257427, "grad_norm": 0.47156885266304016, "learning_rate": 4.854149115417018e-05, "loss": 0.4905, "step": 281 }, { "epoch": 0.029684991710308167, "grad_norm": 0.40513938665390015, "learning_rate": 4.853622577927549e-05, "loss": 0.4808, "step": 282 }, { "epoch": 0.029790257638358904, "grad_norm": 0.47520211338996887, "learning_rate": 4.853096040438079e-05, "loss": 0.4501, "step": 283 }, { "epoch": 0.02989552356640964, "grad_norm": 0.5248693823814392, "learning_rate": 4.85256950294861e-05, "loss": 0.4287, "step": 284 }, { "epoch": 0.03000078949446038, "grad_norm": 0.4880824089050293, "learning_rate": 4.852042965459141e-05, "loss": 0.3947, "step": 285 }, { "epoch": 0.03010605542251112, "grad_norm": 0.4884517788887024, "learning_rate": 4.851516427969672e-05, "loss": 0.4521, "step": 286 }, { "epoch": 0.030211321350561857, "grad_norm": 0.5394681096076965, "learning_rate": 4.8509898904802026e-05, "loss": 0.4033, "step": 287 }, { "epoch": 0.030316587278612594, "grad_norm": 0.46996134519577026, "learning_rate": 4.8504633529907335e-05, "loss": 0.4217, "step": 288 }, { "epoch": 0.030421853206663334, "grad_norm": 0.4631175398826599, "learning_rate": 4.849936815501264e-05, "loss": 0.4114, "step": 289 }, { "epoch": 0.03052711913471407, "grad_norm": 0.5271033644676208, "learning_rate": 4.849410278011795e-05, "loss": 0.4044, "step": 290 }, { "epoch": 0.03063238506276481, "grad_norm": 0.46999993920326233, "learning_rate": 4.8488837405223256e-05, "loss": 0.4408, "step": 291 }, { "epoch": 0.03073765099081555, "grad_norm": 0.3656292259693146, "learning_rate": 4.848357203032856e-05, "loss": 0.4169, "step": 292 }, { "epoch": 0.030842916918866287, "grad_norm": 0.5758498907089233, "learning_rate": 4.847830665543387e-05, "loss": 0.4718, "step": 293 }, { "epoch": 0.030948182846917024, "grad_norm": 0.43184739351272583, "learning_rate": 4.8473041280539176e-05, "loss": 0.4081, "step": 294 }, { "epoch": 0.03105344877496776, "grad_norm": 0.44835662841796875, "learning_rate": 4.8467775905644485e-05, "loss": 0.4249, "step": 295 }, { "epoch": 0.0311587147030185, "grad_norm": 0.4488978087902069, "learning_rate": 4.8462510530749794e-05, "loss": 0.5449, "step": 296 }, { "epoch": 0.031263980631069235, "grad_norm": 0.5275838971138, "learning_rate": 4.8457245155855096e-05, "loss": 0.4624, "step": 297 }, { "epoch": 0.031369246559119976, "grad_norm": 0.6487151980400085, "learning_rate": 4.8451979780960405e-05, "loss": 0.4815, "step": 298 }, { "epoch": 0.03147451248717072, "grad_norm": 0.5481114983558655, "learning_rate": 4.8446714406065714e-05, "loss": 0.3889, "step": 299 }, { "epoch": 0.03157977841522145, "grad_norm": 0.516204833984375, "learning_rate": 4.844144903117102e-05, "loss": 0.3923, "step": 300 }, { "epoch": 0.03168504434327219, "grad_norm": 0.5541898012161255, "learning_rate": 4.843618365627633e-05, "loss": 0.4513, "step": 301 }, { "epoch": 0.03179031027132293, "grad_norm": 0.5141636729240417, "learning_rate": 4.8430918281381635e-05, "loss": 0.4993, "step": 302 }, { "epoch": 0.031895576199373665, "grad_norm": 0.46877187490463257, "learning_rate": 4.8425652906486944e-05, "loss": 0.4815, "step": 303 }, { "epoch": 0.032000842127424406, "grad_norm": 0.5002549886703491, "learning_rate": 4.842038753159225e-05, "loss": 0.5064, "step": 304 }, { "epoch": 0.03210610805547515, "grad_norm": 0.45424237847328186, "learning_rate": 4.8415122156697555e-05, "loss": 0.4549, "step": 305 }, { "epoch": 0.03221137398352588, "grad_norm": 0.4908994138240814, "learning_rate": 4.8409856781802864e-05, "loss": 0.5029, "step": 306 }, { "epoch": 0.03231663991157662, "grad_norm": 0.6221848726272583, "learning_rate": 4.840459140690817e-05, "loss": 0.4033, "step": 307 }, { "epoch": 0.03242190583962736, "grad_norm": 0.5026724934577942, "learning_rate": 4.839932603201348e-05, "loss": 0.3765, "step": 308 }, { "epoch": 0.032527171767678095, "grad_norm": 0.4318561255931854, "learning_rate": 4.839406065711879e-05, "loss": 0.4174, "step": 309 }, { "epoch": 0.032632437695728836, "grad_norm": 0.5485970377922058, "learning_rate": 4.83887952822241e-05, "loss": 0.4528, "step": 310 }, { "epoch": 0.03273770362377958, "grad_norm": 0.49032801389694214, "learning_rate": 4.838352990732941e-05, "loss": 0.4687, "step": 311 }, { "epoch": 0.03284296955183031, "grad_norm": 0.4289769232273102, "learning_rate": 4.837826453243471e-05, "loss": 0.5144, "step": 312 }, { "epoch": 0.03294823547988105, "grad_norm": 0.500663697719574, "learning_rate": 4.8372999157540013e-05, "loss": 0.3923, "step": 313 }, { "epoch": 0.033053501407931785, "grad_norm": 0.5670647025108337, "learning_rate": 4.836773378264532e-05, "loss": 0.4049, "step": 314 }, { "epoch": 0.033158767335982525, "grad_norm": 0.4813581109046936, "learning_rate": 4.836246840775063e-05, "loss": 0.443, "step": 315 }, { "epoch": 0.033264033264033266, "grad_norm": 0.5485454797744751, "learning_rate": 4.835720303285594e-05, "loss": 0.4008, "step": 316 }, { "epoch": 0.033369299192084, "grad_norm": 0.5390880703926086, "learning_rate": 4.835193765796125e-05, "loss": 0.3993, "step": 317 }, { "epoch": 0.03347456512013474, "grad_norm": 0.498060017824173, "learning_rate": 4.834667228306656e-05, "loss": 0.3953, "step": 318 }, { "epoch": 0.03357983104818548, "grad_norm": 0.49461764097213745, "learning_rate": 4.834140690817187e-05, "loss": 0.3972, "step": 319 }, { "epoch": 0.033685096976236215, "grad_norm": 0.723934531211853, "learning_rate": 4.8336141533277176e-05, "loss": 0.4582, "step": 320 }, { "epoch": 0.033790362904286955, "grad_norm": 0.4396905303001404, "learning_rate": 4.833087615838248e-05, "loss": 0.404, "step": 321 }, { "epoch": 0.033895628832337696, "grad_norm": 0.4418332576751709, "learning_rate": 4.832561078348779e-05, "loss": 0.5145, "step": 322 }, { "epoch": 0.03400089476038843, "grad_norm": 0.5111250281333923, "learning_rate": 4.832034540859309e-05, "loss": 0.5276, "step": 323 }, { "epoch": 0.03410616068843917, "grad_norm": 0.5635156035423279, "learning_rate": 4.83150800336984e-05, "loss": 0.5484, "step": 324 }, { "epoch": 0.03421142661648991, "grad_norm": 0.5792466402053833, "learning_rate": 4.830981465880371e-05, "loss": 0.5747, "step": 325 }, { "epoch": 0.034316692544540645, "grad_norm": 0.4661281406879425, "learning_rate": 4.830454928390902e-05, "loss": 0.4601, "step": 326 }, { "epoch": 0.034421958472591385, "grad_norm": 0.6661891937255859, "learning_rate": 4.8299283909014326e-05, "loss": 0.4993, "step": 327 }, { "epoch": 0.03452722440064212, "grad_norm": 0.5207692384719849, "learning_rate": 4.8294018534119635e-05, "loss": 0.421, "step": 328 }, { "epoch": 0.03463249032869286, "grad_norm": 0.6618428826332092, "learning_rate": 4.828875315922494e-05, "loss": 0.4163, "step": 329 }, { "epoch": 0.0347377562567436, "grad_norm": 0.513272225856781, "learning_rate": 4.8283487784330246e-05, "loss": 0.3797, "step": 330 }, { "epoch": 0.034843022184794334, "grad_norm": 0.4838692545890808, "learning_rate": 4.8278222409435555e-05, "loss": 0.3843, "step": 331 }, { "epoch": 0.034948288112845075, "grad_norm": 0.5403527021408081, "learning_rate": 4.8272957034540864e-05, "loss": 0.4821, "step": 332 }, { "epoch": 0.035053554040895815, "grad_norm": 0.48934701085090637, "learning_rate": 4.8267691659646167e-05, "loss": 0.4205, "step": 333 }, { "epoch": 0.03515881996894655, "grad_norm": 0.5227293968200684, "learning_rate": 4.8262426284751476e-05, "loss": 0.483, "step": 334 }, { "epoch": 0.03526408589699729, "grad_norm": 0.5904392004013062, "learning_rate": 4.8257160909856785e-05, "loss": 0.3868, "step": 335 }, { "epoch": 0.03536935182504803, "grad_norm": 0.4555564522743225, "learning_rate": 4.8251895534962094e-05, "loss": 0.4235, "step": 336 }, { "epoch": 0.035474617753098764, "grad_norm": 0.8526967763900757, "learning_rate": 4.8246630160067396e-05, "loss": 0.4588, "step": 337 }, { "epoch": 0.035579883681149505, "grad_norm": 0.45085299015045166, "learning_rate": 4.8241364785172705e-05, "loss": 0.4228, "step": 338 }, { "epoch": 0.035685149609200245, "grad_norm": 0.5043511390686035, "learning_rate": 4.8236099410278014e-05, "loss": 0.4632, "step": 339 }, { "epoch": 0.03579041553725098, "grad_norm": 0.5064621567726135, "learning_rate": 4.823083403538332e-05, "loss": 0.4844, "step": 340 }, { "epoch": 0.03589568146530172, "grad_norm": 0.48965758085250854, "learning_rate": 4.822556866048863e-05, "loss": 0.4481, "step": 341 }, { "epoch": 0.03600094739335245, "grad_norm": 0.4565337300300598, "learning_rate": 4.8220303285593934e-05, "loss": 0.4011, "step": 342 }, { "epoch": 0.036106213321403194, "grad_norm": 0.5424944758415222, "learning_rate": 4.821503791069924e-05, "loss": 0.5101, "step": 343 }, { "epoch": 0.036211479249453934, "grad_norm": 0.4527457058429718, "learning_rate": 4.820977253580455e-05, "loss": 0.4097, "step": 344 }, { "epoch": 0.03631674517750467, "grad_norm": 0.3896700441837311, "learning_rate": 4.8204507160909854e-05, "loss": 0.4177, "step": 345 }, { "epoch": 0.03642201110555541, "grad_norm": 0.5583755373954773, "learning_rate": 4.8199241786015163e-05, "loss": 0.4437, "step": 346 }, { "epoch": 0.03652727703360615, "grad_norm": 0.41155165433883667, "learning_rate": 4.819397641112047e-05, "loss": 0.4382, "step": 347 }, { "epoch": 0.03663254296165688, "grad_norm": 0.36993688344955444, "learning_rate": 4.818871103622578e-05, "loss": 0.4839, "step": 348 }, { "epoch": 0.036737808889707624, "grad_norm": 0.449740469455719, "learning_rate": 4.818344566133109e-05, "loss": 0.4251, "step": 349 }, { "epoch": 0.036843074817758364, "grad_norm": 0.3957495391368866, "learning_rate": 4.81781802864364e-05, "loss": 0.4743, "step": 350 }, { "epoch": 0.0369483407458091, "grad_norm": 0.5629512667655945, "learning_rate": 4.817291491154171e-05, "loss": 0.4002, "step": 351 }, { "epoch": 0.03705360667385984, "grad_norm": 0.4598921239376068, "learning_rate": 4.816764953664701e-05, "loss": 0.4692, "step": 352 }, { "epoch": 0.03715887260191058, "grad_norm": 0.516234278678894, "learning_rate": 4.816238416175232e-05, "loss": 0.4175, "step": 353 }, { "epoch": 0.03726413852996131, "grad_norm": 0.5708214044570923, "learning_rate": 4.815711878685762e-05, "loss": 0.4306, "step": 354 }, { "epoch": 0.037369404458012054, "grad_norm": 0.6185720562934875, "learning_rate": 4.815185341196293e-05, "loss": 0.4598, "step": 355 }, { "epoch": 0.037474670386062794, "grad_norm": 0.5227758884429932, "learning_rate": 4.814658803706824e-05, "loss": 0.3782, "step": 356 }, { "epoch": 0.03757993631411353, "grad_norm": 0.5345552563667297, "learning_rate": 4.814132266217355e-05, "loss": 0.418, "step": 357 }, { "epoch": 0.03768520224216427, "grad_norm": 0.5797765254974365, "learning_rate": 4.813605728727886e-05, "loss": 0.5089, "step": 358 }, { "epoch": 0.037790468170215, "grad_norm": 0.5567287802696228, "learning_rate": 4.813079191238417e-05, "loss": 0.4304, "step": 359 }, { "epoch": 0.03789573409826574, "grad_norm": 0.4520246982574463, "learning_rate": 4.8125526537489476e-05, "loss": 0.4626, "step": 360 }, { "epoch": 0.038001000026316484, "grad_norm": 0.44900500774383545, "learning_rate": 4.812026116259478e-05, "loss": 0.3843, "step": 361 }, { "epoch": 0.03810626595436722, "grad_norm": 0.48296135663986206, "learning_rate": 4.811499578770009e-05, "loss": 0.4855, "step": 362 }, { "epoch": 0.03821153188241796, "grad_norm": 0.4269002377986908, "learning_rate": 4.810973041280539e-05, "loss": 0.3795, "step": 363 }, { "epoch": 0.0383167978104687, "grad_norm": 0.9296995401382446, "learning_rate": 4.81044650379107e-05, "loss": 0.4861, "step": 364 }, { "epoch": 0.03842206373851943, "grad_norm": 0.5746780633926392, "learning_rate": 4.809919966301601e-05, "loss": 0.3991, "step": 365 }, { "epoch": 0.03852732966657017, "grad_norm": 0.47170913219451904, "learning_rate": 4.8093934288121317e-05, "loss": 0.4348, "step": 366 }, { "epoch": 0.038632595594620914, "grad_norm": 0.4327333867549896, "learning_rate": 4.8088668913226626e-05, "loss": 0.405, "step": 367 }, { "epoch": 0.03873786152267165, "grad_norm": 0.4907747507095337, "learning_rate": 4.8083403538331935e-05, "loss": 0.4467, "step": 368 }, { "epoch": 0.03884312745072239, "grad_norm": 0.48626840114593506, "learning_rate": 4.807813816343724e-05, "loss": 0.485, "step": 369 }, { "epoch": 0.03894839337877313, "grad_norm": 0.5155723094940186, "learning_rate": 4.8072872788542546e-05, "loss": 0.3931, "step": 370 }, { "epoch": 0.03905365930682386, "grad_norm": 0.5703728795051575, "learning_rate": 4.8067607413647855e-05, "loss": 0.3728, "step": 371 }, { "epoch": 0.0391589252348746, "grad_norm": 0.5467020273208618, "learning_rate": 4.8062342038753164e-05, "loss": 0.477, "step": 372 }, { "epoch": 0.03926419116292534, "grad_norm": 0.4459872543811798, "learning_rate": 4.8057076663858466e-05, "loss": 0.4712, "step": 373 }, { "epoch": 0.03936945709097608, "grad_norm": 0.511060357093811, "learning_rate": 4.8051811288963775e-05, "loss": 0.5146, "step": 374 }, { "epoch": 0.03947472301902682, "grad_norm": 0.3677018880844116, "learning_rate": 4.8046545914069084e-05, "loss": 0.4605, "step": 375 }, { "epoch": 0.03957998894707755, "grad_norm": 0.47560691833496094, "learning_rate": 4.804128053917439e-05, "loss": 0.4479, "step": 376 }, { "epoch": 0.03968525487512829, "grad_norm": 0.5171210169792175, "learning_rate": 4.8036015164279695e-05, "loss": 0.4413, "step": 377 }, { "epoch": 0.03979052080317903, "grad_norm": 0.448194295167923, "learning_rate": 4.8030749789385004e-05, "loss": 0.4637, "step": 378 }, { "epoch": 0.03989578673122977, "grad_norm": 0.5280170440673828, "learning_rate": 4.8025484414490313e-05, "loss": 0.4365, "step": 379 }, { "epoch": 0.04000105265928051, "grad_norm": 0.490249902009964, "learning_rate": 4.802021903959562e-05, "loss": 0.4618, "step": 380 }, { "epoch": 0.04010631858733125, "grad_norm": 0.5452317595481873, "learning_rate": 4.801495366470093e-05, "loss": 0.3972, "step": 381 }, { "epoch": 0.04021158451538198, "grad_norm": 0.5572560429573059, "learning_rate": 4.800968828980624e-05, "loss": 0.4756, "step": 382 }, { "epoch": 0.04031685044343272, "grad_norm": 0.45014721155166626, "learning_rate": 4.800442291491154e-05, "loss": 0.3915, "step": 383 }, { "epoch": 0.04042211637148346, "grad_norm": 0.6049466729164124, "learning_rate": 4.799915754001685e-05, "loss": 0.3675, "step": 384 }, { "epoch": 0.0405273822995342, "grad_norm": 0.6129103302955627, "learning_rate": 4.7993892165122154e-05, "loss": 0.378, "step": 385 }, { "epoch": 0.04063264822758494, "grad_norm": 0.5461925864219666, "learning_rate": 4.798862679022746e-05, "loss": 0.4091, "step": 386 }, { "epoch": 0.04073791415563568, "grad_norm": 0.41969093680381775, "learning_rate": 4.798336141533277e-05, "loss": 0.4843, "step": 387 }, { "epoch": 0.04084318008368641, "grad_norm": 0.510870635509491, "learning_rate": 4.797809604043808e-05, "loss": 0.581, "step": 388 }, { "epoch": 0.04094844601173715, "grad_norm": 0.5956604480743408, "learning_rate": 4.797283066554339e-05, "loss": 0.3163, "step": 389 }, { "epoch": 0.041053711939787886, "grad_norm": 0.4685046076774597, "learning_rate": 4.79675652906487e-05, "loss": 0.4587, "step": 390 }, { "epoch": 0.04115897786783863, "grad_norm": 0.4563463628292084, "learning_rate": 4.796229991575401e-05, "loss": 0.468, "step": 391 }, { "epoch": 0.04126424379588937, "grad_norm": 0.5047011971473694, "learning_rate": 4.795703454085931e-05, "loss": 0.4117, "step": 392 }, { "epoch": 0.0413695097239401, "grad_norm": 0.6256960034370422, "learning_rate": 4.795176916596462e-05, "loss": 0.4522, "step": 393 }, { "epoch": 0.04147477565199084, "grad_norm": 0.479109525680542, "learning_rate": 4.794650379106992e-05, "loss": 0.5458, "step": 394 }, { "epoch": 0.04158004158004158, "grad_norm": 0.5637032985687256, "learning_rate": 4.794123841617523e-05, "loss": 0.4724, "step": 395 }, { "epoch": 0.041685307508092316, "grad_norm": 0.5758900046348572, "learning_rate": 4.793597304128054e-05, "loss": 0.3943, "step": 396 }, { "epoch": 0.04179057343614306, "grad_norm": 0.41813746094703674, "learning_rate": 4.793070766638585e-05, "loss": 0.4937, "step": 397 }, { "epoch": 0.0418958393641938, "grad_norm": 0.4549589455127716, "learning_rate": 4.792544229149116e-05, "loss": 0.4055, "step": 398 }, { "epoch": 0.04200110529224453, "grad_norm": 0.42384806275367737, "learning_rate": 4.792017691659647e-05, "loss": 0.4189, "step": 399 }, { "epoch": 0.04210637122029527, "grad_norm": 0.4235416352748871, "learning_rate": 4.7914911541701776e-05, "loss": 0.4304, "step": 400 }, { "epoch": 0.04221163714834601, "grad_norm": 0.44901612401008606, "learning_rate": 4.7909646166807085e-05, "loss": 0.4575, "step": 401 }, { "epoch": 0.042316903076396746, "grad_norm": 0.4786452353000641, "learning_rate": 4.790438079191239e-05, "loss": 0.4031, "step": 402 }, { "epoch": 0.04242216900444749, "grad_norm": 0.64895099401474, "learning_rate": 4.7899115417017696e-05, "loss": 0.4437, "step": 403 }, { "epoch": 0.04252743493249822, "grad_norm": 0.7129364609718323, "learning_rate": 4.7893850042123e-05, "loss": 0.426, "step": 404 }, { "epoch": 0.04263270086054896, "grad_norm": 0.5261722207069397, "learning_rate": 4.788858466722831e-05, "loss": 0.4704, "step": 405 }, { "epoch": 0.0427379667885997, "grad_norm": 0.5278510451316833, "learning_rate": 4.7883319292333616e-05, "loss": 0.43, "step": 406 }, { "epoch": 0.042843232716650435, "grad_norm": 0.47645267844200134, "learning_rate": 4.7878053917438925e-05, "loss": 0.4399, "step": 407 }, { "epoch": 0.042948498644701176, "grad_norm": 0.5606099367141724, "learning_rate": 4.7872788542544234e-05, "loss": 0.5023, "step": 408 }, { "epoch": 0.04305376457275192, "grad_norm": 0.5183596611022949, "learning_rate": 4.786752316764954e-05, "loss": 0.4431, "step": 409 }, { "epoch": 0.04315903050080265, "grad_norm": 0.4570636451244354, "learning_rate": 4.7862257792754845e-05, "loss": 0.4435, "step": 410 }, { "epoch": 0.04326429642885339, "grad_norm": 0.5054503679275513, "learning_rate": 4.7856992417860154e-05, "loss": 0.4884, "step": 411 }, { "epoch": 0.04336956235690413, "grad_norm": 0.4896951913833618, "learning_rate": 4.7851727042965463e-05, "loss": 0.472, "step": 412 }, { "epoch": 0.043474828284954865, "grad_norm": 0.6141940951347351, "learning_rate": 4.7846461668070766e-05, "loss": 0.426, "step": 413 }, { "epoch": 0.043580094213005606, "grad_norm": 0.48963436484336853, "learning_rate": 4.7841196293176075e-05, "loss": 0.4668, "step": 414 }, { "epoch": 0.04368536014105635, "grad_norm": 0.5451966524124146, "learning_rate": 4.7835930918281384e-05, "loss": 0.4728, "step": 415 }, { "epoch": 0.04379062606910708, "grad_norm": 0.434573769569397, "learning_rate": 4.783066554338669e-05, "loss": 0.4055, "step": 416 }, { "epoch": 0.04389589199715782, "grad_norm": 0.5499134659767151, "learning_rate": 4.7825400168492e-05, "loss": 0.3879, "step": 417 }, { "epoch": 0.044001157925208555, "grad_norm": 0.5180830955505371, "learning_rate": 4.7820134793597304e-05, "loss": 0.4445, "step": 418 }, { "epoch": 0.044106423853259295, "grad_norm": 0.4541892409324646, "learning_rate": 4.781486941870261e-05, "loss": 0.4059, "step": 419 }, { "epoch": 0.044211689781310036, "grad_norm": 0.3752939999103546, "learning_rate": 4.780960404380792e-05, "loss": 0.5885, "step": 420 }, { "epoch": 0.04431695570936077, "grad_norm": 0.4906155467033386, "learning_rate": 4.780433866891323e-05, "loss": 0.4839, "step": 421 }, { "epoch": 0.04442222163741151, "grad_norm": 0.4721757769584656, "learning_rate": 4.779907329401854e-05, "loss": 0.4177, "step": 422 }, { "epoch": 0.04452748756546225, "grad_norm": 0.42130014300346375, "learning_rate": 4.779380791912384e-05, "loss": 0.4295, "step": 423 }, { "epoch": 0.044632753493512985, "grad_norm": 0.5732069611549377, "learning_rate": 4.778854254422915e-05, "loss": 0.3721, "step": 424 }, { "epoch": 0.044738019421563725, "grad_norm": 0.48826277256011963, "learning_rate": 4.778327716933446e-05, "loss": 0.4228, "step": 425 }, { "epoch": 0.044843285349614466, "grad_norm": 0.5234729051589966, "learning_rate": 4.777801179443976e-05, "loss": 0.4014, "step": 426 }, { "epoch": 0.0449485512776652, "grad_norm": 0.46457454562187195, "learning_rate": 4.777274641954507e-05, "loss": 0.5259, "step": 427 }, { "epoch": 0.04505381720571594, "grad_norm": 0.5036742091178894, "learning_rate": 4.776748104465038e-05, "loss": 0.4361, "step": 428 }, { "epoch": 0.04515908313376668, "grad_norm": 0.5410817265510559, "learning_rate": 4.776221566975569e-05, "loss": 0.463, "step": 429 }, { "epoch": 0.045264349061817415, "grad_norm": 0.4173840284347534, "learning_rate": 4.7756950294861e-05, "loss": 0.4048, "step": 430 }, { "epoch": 0.045369614989868155, "grad_norm": 0.726842999458313, "learning_rate": 4.775168491996631e-05, "loss": 0.5549, "step": 431 }, { "epoch": 0.045474880917918896, "grad_norm": 0.40877723693847656, "learning_rate": 4.774641954507162e-05, "loss": 0.4433, "step": 432 }, { "epoch": 0.04558014684596963, "grad_norm": 0.6194121241569519, "learning_rate": 4.774115417017692e-05, "loss": 0.4257, "step": 433 }, { "epoch": 0.04568541277402037, "grad_norm": 0.5976036787033081, "learning_rate": 4.773588879528222e-05, "loss": 0.4709, "step": 434 }, { "epoch": 0.045790678702071104, "grad_norm": 0.6144199371337891, "learning_rate": 4.773062342038753e-05, "loss": 0.3868, "step": 435 }, { "epoch": 0.045895944630121845, "grad_norm": 0.5125494599342346, "learning_rate": 4.772535804549284e-05, "loss": 0.4116, "step": 436 }, { "epoch": 0.046001210558172585, "grad_norm": 0.5164209604263306, "learning_rate": 4.772009267059815e-05, "loss": 0.3564, "step": 437 }, { "epoch": 0.04610647648622332, "grad_norm": 0.4817107319831848, "learning_rate": 4.771482729570346e-05, "loss": 0.4801, "step": 438 }, { "epoch": 0.04621174241427406, "grad_norm": 0.44076791405677795, "learning_rate": 4.7709561920808766e-05, "loss": 0.551, "step": 439 }, { "epoch": 0.0463170083423248, "grad_norm": 0.634650707244873, "learning_rate": 4.7704296545914075e-05, "loss": 0.533, "step": 440 }, { "epoch": 0.046422274270375534, "grad_norm": 0.4300638437271118, "learning_rate": 4.7699031171019384e-05, "loss": 0.4219, "step": 441 }, { "epoch": 0.046527540198426275, "grad_norm": 0.5052940249443054, "learning_rate": 4.7693765796124686e-05, "loss": 0.4419, "step": 442 }, { "epoch": 0.046632806126477015, "grad_norm": 0.4833763539791107, "learning_rate": 4.7688500421229995e-05, "loss": 0.4074, "step": 443 }, { "epoch": 0.04673807205452775, "grad_norm": 0.4841054677963257, "learning_rate": 4.76832350463353e-05, "loss": 0.4357, "step": 444 }, { "epoch": 0.04684333798257849, "grad_norm": 0.5227946639060974, "learning_rate": 4.767796967144061e-05, "loss": 0.4471, "step": 445 }, { "epoch": 0.04694860391062923, "grad_norm": 0.5761273503303528, "learning_rate": 4.7672704296545916e-05, "loss": 0.4422, "step": 446 }, { "epoch": 0.047053869838679964, "grad_norm": 0.47115081548690796, "learning_rate": 4.7667438921651225e-05, "loss": 0.4172, "step": 447 }, { "epoch": 0.047159135766730705, "grad_norm": 0.5475848913192749, "learning_rate": 4.7662173546756534e-05, "loss": 0.4435, "step": 448 }, { "epoch": 0.04726440169478144, "grad_norm": 0.4437314569950104, "learning_rate": 4.765690817186184e-05, "loss": 0.389, "step": 449 }, { "epoch": 0.04736966762283218, "grad_norm": 0.4307888448238373, "learning_rate": 4.7651642796967145e-05, "loss": 0.4354, "step": 450 }, { "epoch": 0.04747493355088292, "grad_norm": 0.3933163285255432, "learning_rate": 4.7646377422072454e-05, "loss": 0.561, "step": 451 }, { "epoch": 0.04758019947893365, "grad_norm": 0.37329408526420593, "learning_rate": 4.764111204717776e-05, "loss": 0.4767, "step": 452 }, { "epoch": 0.047685465406984394, "grad_norm": 0.554229199886322, "learning_rate": 4.763584667228307e-05, "loss": 0.3594, "step": 453 }, { "epoch": 0.047790731335035135, "grad_norm": 0.4243522882461548, "learning_rate": 4.7630581297388374e-05, "loss": 0.44, "step": 454 }, { "epoch": 0.04789599726308587, "grad_norm": 0.5723696351051331, "learning_rate": 4.762531592249368e-05, "loss": 0.4377, "step": 455 }, { "epoch": 0.04800126319113661, "grad_norm": 0.5366947054862976, "learning_rate": 4.762005054759899e-05, "loss": 0.4021, "step": 456 }, { "epoch": 0.04810652911918735, "grad_norm": 0.5559504628181458, "learning_rate": 4.76147851727043e-05, "loss": 0.3775, "step": 457 }, { "epoch": 0.04821179504723808, "grad_norm": 0.48702389001846313, "learning_rate": 4.7609519797809604e-05, "loss": 0.4751, "step": 458 }, { "epoch": 0.048317060975288824, "grad_norm": 0.36137351393699646, "learning_rate": 4.760425442291491e-05, "loss": 0.482, "step": 459 }, { "epoch": 0.048422326903339565, "grad_norm": 0.4528438150882721, "learning_rate": 4.759898904802022e-05, "loss": 0.4059, "step": 460 }, { "epoch": 0.0485275928313903, "grad_norm": 0.5218043923377991, "learning_rate": 4.759372367312553e-05, "loss": 0.4095, "step": 461 }, { "epoch": 0.04863285875944104, "grad_norm": 0.5252096652984619, "learning_rate": 4.758845829823084e-05, "loss": 0.4989, "step": 462 }, { "epoch": 0.04873812468749178, "grad_norm": 0.3626563549041748, "learning_rate": 4.758319292333614e-05, "loss": 0.5983, "step": 463 }, { "epoch": 0.04884339061554251, "grad_norm": 0.473537415266037, "learning_rate": 4.757792754844145e-05, "loss": 0.5459, "step": 464 }, { "epoch": 0.048948656543593254, "grad_norm": 0.7054407596588135, "learning_rate": 4.757266217354676e-05, "loss": 0.3718, "step": 465 }, { "epoch": 0.04905392247164399, "grad_norm": 0.4829826056957245, "learning_rate": 4.756739679865206e-05, "loss": 0.4165, "step": 466 }, { "epoch": 0.04915918839969473, "grad_norm": 0.5529534816741943, "learning_rate": 4.756213142375737e-05, "loss": 0.5058, "step": 467 }, { "epoch": 0.04926445432774547, "grad_norm": 0.4331270456314087, "learning_rate": 4.755686604886268e-05, "loss": 0.4267, "step": 468 }, { "epoch": 0.0493697202557962, "grad_norm": 0.48735421895980835, "learning_rate": 4.755160067396799e-05, "loss": 0.443, "step": 469 }, { "epoch": 0.04947498618384694, "grad_norm": 0.6138409972190857, "learning_rate": 4.75463352990733e-05, "loss": 0.4449, "step": 470 }, { "epoch": 0.049580252111897684, "grad_norm": 0.4512140154838562, "learning_rate": 4.754106992417861e-05, "loss": 0.486, "step": 471 }, { "epoch": 0.04968551803994842, "grad_norm": 0.5221918225288391, "learning_rate": 4.7535804549283916e-05, "loss": 0.4122, "step": 472 }, { "epoch": 0.04979078396799916, "grad_norm": 0.5450029969215393, "learning_rate": 4.753053917438922e-05, "loss": 0.3362, "step": 473 }, { "epoch": 0.0498960498960499, "grad_norm": 0.5064875483512878, "learning_rate": 4.752527379949452e-05, "loss": 0.4868, "step": 474 }, { "epoch": 0.05000131582410063, "grad_norm": 0.5182908177375793, "learning_rate": 4.752000842459983e-05, "loss": 0.4034, "step": 475 }, { "epoch": 0.05010658175215137, "grad_norm": 0.5384114384651184, "learning_rate": 4.751474304970514e-05, "loss": 0.5353, "step": 476 }, { "epoch": 0.050211847680202114, "grad_norm": 0.5357162952423096, "learning_rate": 4.750947767481045e-05, "loss": 0.3976, "step": 477 }, { "epoch": 0.05031711360825285, "grad_norm": 0.45556405186653137, "learning_rate": 4.750421229991576e-05, "loss": 0.403, "step": 478 }, { "epoch": 0.05042237953630359, "grad_norm": 0.5855860710144043, "learning_rate": 4.7498946925021066e-05, "loss": 0.3754, "step": 479 }, { "epoch": 0.05052764546435432, "grad_norm": 0.5920200943946838, "learning_rate": 4.7493681550126375e-05, "loss": 0.3944, "step": 480 }, { "epoch": 0.05063291139240506, "grad_norm": 0.5460993051528931, "learning_rate": 4.7488416175231684e-05, "loss": 0.5356, "step": 481 }, { "epoch": 0.0507381773204558, "grad_norm": 0.5433392524719238, "learning_rate": 4.7483150800336986e-05, "loss": 0.4043, "step": 482 }, { "epoch": 0.05084344324850654, "grad_norm": 0.6986379027366638, "learning_rate": 4.7477885425442295e-05, "loss": 0.4374, "step": 483 }, { "epoch": 0.05094870917655728, "grad_norm": 0.6336686611175537, "learning_rate": 4.74726200505476e-05, "loss": 0.4308, "step": 484 }, { "epoch": 0.05105397510460802, "grad_norm": 0.5509925484657288, "learning_rate": 4.7467354675652906e-05, "loss": 0.4101, "step": 485 }, { "epoch": 0.05115924103265875, "grad_norm": 0.5978362560272217, "learning_rate": 4.7462089300758215e-05, "loss": 0.379, "step": 486 }, { "epoch": 0.05126450696070949, "grad_norm": 0.5480085015296936, "learning_rate": 4.7456823925863524e-05, "loss": 0.3327, "step": 487 }, { "epoch": 0.05136977288876023, "grad_norm": 0.5396241545677185, "learning_rate": 4.745155855096883e-05, "loss": 0.4283, "step": 488 }, { "epoch": 0.05147503881681097, "grad_norm": 0.43143001198768616, "learning_rate": 4.744629317607414e-05, "loss": 0.584, "step": 489 }, { "epoch": 0.05158030474486171, "grad_norm": 0.4590414762496948, "learning_rate": 4.7441027801179445e-05, "loss": 0.3794, "step": 490 }, { "epoch": 0.05168557067291245, "grad_norm": 0.4620942175388336, "learning_rate": 4.7435762426284754e-05, "loss": 0.4421, "step": 491 }, { "epoch": 0.05179083660096318, "grad_norm": 0.5003826022148132, "learning_rate": 4.743049705139006e-05, "loss": 0.4408, "step": 492 }, { "epoch": 0.05189610252901392, "grad_norm": 0.5184903740882874, "learning_rate": 4.742523167649537e-05, "loss": 0.4523, "step": 493 }, { "epoch": 0.052001368457064656, "grad_norm": 0.5750355124473572, "learning_rate": 4.7419966301600674e-05, "loss": 0.3512, "step": 494 }, { "epoch": 0.0521066343851154, "grad_norm": 0.516768217086792, "learning_rate": 4.741470092670598e-05, "loss": 0.3583, "step": 495 }, { "epoch": 0.05221190031316614, "grad_norm": 0.5511295199394226, "learning_rate": 4.740943555181129e-05, "loss": 0.4536, "step": 496 }, { "epoch": 0.05231716624121687, "grad_norm": 0.4026057720184326, "learning_rate": 4.74041701769166e-05, "loss": 0.4834, "step": 497 }, { "epoch": 0.05242243216926761, "grad_norm": 0.6032986044883728, "learning_rate": 4.73989048020219e-05, "loss": 0.3901, "step": 498 }, { "epoch": 0.05252769809731835, "grad_norm": 0.45538461208343506, "learning_rate": 4.739363942712721e-05, "loss": 0.4174, "step": 499 }, { "epoch": 0.052632964025369086, "grad_norm": 0.564687967300415, "learning_rate": 4.738837405223252e-05, "loss": 0.4543, "step": 500 }, { "epoch": 0.05273822995341983, "grad_norm": 0.5365861058235168, "learning_rate": 4.738310867733783e-05, "loss": 0.3998, "step": 501 }, { "epoch": 0.05284349588147057, "grad_norm": 0.5887376666069031, "learning_rate": 4.737784330244314e-05, "loss": 0.4881, "step": 502 }, { "epoch": 0.0529487618095213, "grad_norm": 0.5137104392051697, "learning_rate": 4.737257792754845e-05, "loss": 0.4158, "step": 503 }, { "epoch": 0.05305402773757204, "grad_norm": 0.7075323462486267, "learning_rate": 4.736731255265375e-05, "loss": 0.4249, "step": 504 }, { "epoch": 0.05315929366562278, "grad_norm": 0.5085923075675964, "learning_rate": 4.736204717775906e-05, "loss": 0.3974, "step": 505 }, { "epoch": 0.053264559593673516, "grad_norm": 0.4885638654232025, "learning_rate": 4.735678180286436e-05, "loss": 0.3569, "step": 506 }, { "epoch": 0.05336982552172426, "grad_norm": 0.5807955265045166, "learning_rate": 4.735151642796967e-05, "loss": 0.3868, "step": 507 }, { "epoch": 0.053475091449775, "grad_norm": 0.4715438485145569, "learning_rate": 4.734625105307498e-05, "loss": 0.4592, "step": 508 }, { "epoch": 0.05358035737782573, "grad_norm": 0.4971379041671753, "learning_rate": 4.734098567818029e-05, "loss": 0.4449, "step": 509 }, { "epoch": 0.05368562330587647, "grad_norm": 0.5600916743278503, "learning_rate": 4.73357203032856e-05, "loss": 0.3905, "step": 510 }, { "epoch": 0.053790889233927205, "grad_norm": 0.5462086200714111, "learning_rate": 4.733045492839091e-05, "loss": 0.3757, "step": 511 }, { "epoch": 0.053896155161977946, "grad_norm": 0.4880779981613159, "learning_rate": 4.7325189553496216e-05, "loss": 0.4084, "step": 512 }, { "epoch": 0.05400142109002869, "grad_norm": 0.5553451180458069, "learning_rate": 4.731992417860152e-05, "loss": 0.4088, "step": 513 }, { "epoch": 0.05410668701807942, "grad_norm": 0.4913026690483093, "learning_rate": 4.731465880370683e-05, "loss": 0.4286, "step": 514 }, { "epoch": 0.05421195294613016, "grad_norm": 0.43161246180534363, "learning_rate": 4.730939342881213e-05, "loss": 0.413, "step": 515 }, { "epoch": 0.0543172188741809, "grad_norm": 0.5062459707260132, "learning_rate": 4.730412805391744e-05, "loss": 0.4713, "step": 516 }, { "epoch": 0.054422484802231635, "grad_norm": 0.4592074751853943, "learning_rate": 4.729886267902275e-05, "loss": 0.4902, "step": 517 }, { "epoch": 0.054527750730282376, "grad_norm": 0.49476075172424316, "learning_rate": 4.7293597304128056e-05, "loss": 0.4016, "step": 518 }, { "epoch": 0.05463301665833312, "grad_norm": 0.4191977381706238, "learning_rate": 4.7288331929233365e-05, "loss": 0.4672, "step": 519 }, { "epoch": 0.05473828258638385, "grad_norm": 0.5030830502510071, "learning_rate": 4.7283066554338674e-05, "loss": 0.4905, "step": 520 }, { "epoch": 0.05484354851443459, "grad_norm": 0.4686654210090637, "learning_rate": 4.727780117944398e-05, "loss": 0.4441, "step": 521 }, { "epoch": 0.05494881444248533, "grad_norm": 0.46608471870422363, "learning_rate": 4.7272535804549286e-05, "loss": 0.3742, "step": 522 }, { "epoch": 0.055054080370536065, "grad_norm": 0.5822672247886658, "learning_rate": 4.7267270429654595e-05, "loss": 0.4266, "step": 523 }, { "epoch": 0.055159346298586806, "grad_norm": 0.4522544741630554, "learning_rate": 4.7262005054759904e-05, "loss": 0.4532, "step": 524 }, { "epoch": 0.05526461222663754, "grad_norm": 0.47990643978118896, "learning_rate": 4.7256739679865206e-05, "loss": 0.3853, "step": 525 }, { "epoch": 0.05536987815468828, "grad_norm": 0.5252960920333862, "learning_rate": 4.7251474304970515e-05, "loss": 0.4716, "step": 526 }, { "epoch": 0.05547514408273902, "grad_norm": 0.45028603076934814, "learning_rate": 4.7246208930075824e-05, "loss": 0.4579, "step": 527 }, { "epoch": 0.055580410010789755, "grad_norm": 0.5253304243087769, "learning_rate": 4.724094355518113e-05, "loss": 0.4433, "step": 528 }, { "epoch": 0.055685675938840495, "grad_norm": 0.48800671100616455, "learning_rate": 4.723567818028644e-05, "loss": 0.4228, "step": 529 }, { "epoch": 0.055790941866891236, "grad_norm": 0.5435435771942139, "learning_rate": 4.7230412805391744e-05, "loss": 0.4181, "step": 530 }, { "epoch": 0.05589620779494197, "grad_norm": 0.5906736254692078, "learning_rate": 4.722514743049705e-05, "loss": 0.4003, "step": 531 }, { "epoch": 0.05600147372299271, "grad_norm": 0.49869149923324585, "learning_rate": 4.721988205560236e-05, "loss": 0.4781, "step": 532 }, { "epoch": 0.05610673965104345, "grad_norm": 0.4748145341873169, "learning_rate": 4.721461668070767e-05, "loss": 0.4291, "step": 533 }, { "epoch": 0.056212005579094185, "grad_norm": 0.471021831035614, "learning_rate": 4.7209351305812973e-05, "loss": 0.4683, "step": 534 }, { "epoch": 0.056317271507144925, "grad_norm": 0.6247691512107849, "learning_rate": 4.720408593091828e-05, "loss": 0.3932, "step": 535 }, { "epoch": 0.056422537435195666, "grad_norm": 0.6917199492454529, "learning_rate": 4.719882055602359e-05, "loss": 0.597, "step": 536 }, { "epoch": 0.0565278033632464, "grad_norm": 0.607105553150177, "learning_rate": 4.71935551811289e-05, "loss": 0.5024, "step": 537 }, { "epoch": 0.05663306929129714, "grad_norm": 0.6015260815620422, "learning_rate": 4.71882898062342e-05, "loss": 0.4569, "step": 538 }, { "epoch": 0.05673833521934788, "grad_norm": 0.6226845979690552, "learning_rate": 4.718302443133951e-05, "loss": 0.4134, "step": 539 }, { "epoch": 0.056843601147398615, "grad_norm": 0.46711722016334534, "learning_rate": 4.717775905644482e-05, "loss": 0.4957, "step": 540 }, { "epoch": 0.056948867075449355, "grad_norm": 0.4069374203681946, "learning_rate": 4.717249368155013e-05, "loss": 0.4173, "step": 541 }, { "epoch": 0.05705413300350009, "grad_norm": 0.47599026560783386, "learning_rate": 4.716722830665544e-05, "loss": 0.4865, "step": 542 }, { "epoch": 0.05715939893155083, "grad_norm": 0.46828117966651917, "learning_rate": 4.716196293176075e-05, "loss": 0.4763, "step": 543 }, { "epoch": 0.05726466485960157, "grad_norm": 0.3772525191307068, "learning_rate": 4.715669755686605e-05, "loss": 0.4225, "step": 544 }, { "epoch": 0.057369930787652304, "grad_norm": 0.44674021005630493, "learning_rate": 4.715143218197136e-05, "loss": 0.5063, "step": 545 }, { "epoch": 0.057475196715703045, "grad_norm": 0.5613642334938049, "learning_rate": 4.714616680707666e-05, "loss": 0.5388, "step": 546 }, { "epoch": 0.057580462643753785, "grad_norm": 0.5140121579170227, "learning_rate": 4.714090143218197e-05, "loss": 0.4481, "step": 547 }, { "epoch": 0.05768572857180452, "grad_norm": 0.4728577435016632, "learning_rate": 4.713563605728728e-05, "loss": 0.3896, "step": 548 }, { "epoch": 0.05779099449985526, "grad_norm": 0.4167439639568329, "learning_rate": 4.713037068239259e-05, "loss": 0.3863, "step": 549 }, { "epoch": 0.057896260427906, "grad_norm": 0.5620428919792175, "learning_rate": 4.71251053074979e-05, "loss": 0.4342, "step": 550 }, { "epoch": 0.058001526355956734, "grad_norm": 0.424396812915802, "learning_rate": 4.7119839932603206e-05, "loss": 0.5043, "step": 551 }, { "epoch": 0.058106792284007475, "grad_norm": 0.4943045675754547, "learning_rate": 4.7114574557708515e-05, "loss": 0.3649, "step": 552 }, { "epoch": 0.058212058212058215, "grad_norm": 0.5179657340049744, "learning_rate": 4.7109309182813824e-05, "loss": 0.3986, "step": 553 }, { "epoch": 0.05831732414010895, "grad_norm": 0.46122902631759644, "learning_rate": 4.710404380791913e-05, "loss": 0.4501, "step": 554 }, { "epoch": 0.05842259006815969, "grad_norm": 0.5129498243331909, "learning_rate": 4.709877843302443e-05, "loss": 0.4105, "step": 555 }, { "epoch": 0.05852785599621042, "grad_norm": 0.5061764121055603, "learning_rate": 4.709351305812974e-05, "loss": 0.3993, "step": 556 }, { "epoch": 0.058633121924261164, "grad_norm": 0.5676811933517456, "learning_rate": 4.708824768323505e-05, "loss": 0.3786, "step": 557 }, { "epoch": 0.058738387852311905, "grad_norm": 0.5383573174476624, "learning_rate": 4.7082982308340356e-05, "loss": 0.4541, "step": 558 }, { "epoch": 0.05884365378036264, "grad_norm": 0.6130087375640869, "learning_rate": 4.7077716933445665e-05, "loss": 0.4215, "step": 559 }, { "epoch": 0.05894891970841338, "grad_norm": 0.6504372954368591, "learning_rate": 4.7072451558550974e-05, "loss": 0.3891, "step": 560 }, { "epoch": 0.05905418563646412, "grad_norm": 0.5079691410064697, "learning_rate": 4.706718618365628e-05, "loss": 0.4668, "step": 561 }, { "epoch": 0.05915945156451485, "grad_norm": 0.528856635093689, "learning_rate": 4.7061920808761585e-05, "loss": 0.3965, "step": 562 }, { "epoch": 0.059264717492565594, "grad_norm": 0.44504040479660034, "learning_rate": 4.7056655433866894e-05, "loss": 0.5032, "step": 563 }, { "epoch": 0.059369983420616335, "grad_norm": 0.5209716558456421, "learning_rate": 4.70513900589722e-05, "loss": 0.4837, "step": 564 }, { "epoch": 0.05947524934866707, "grad_norm": 0.48046526312828064, "learning_rate": 4.7046124684077505e-05, "loss": 0.3989, "step": 565 }, { "epoch": 0.05958051527671781, "grad_norm": 0.5712192058563232, "learning_rate": 4.7040859309182814e-05, "loss": 0.4788, "step": 566 }, { "epoch": 0.05968578120476855, "grad_norm": 0.6029406785964966, "learning_rate": 4.7035593934288123e-05, "loss": 0.3974, "step": 567 }, { "epoch": 0.05979104713281928, "grad_norm": 0.5272865295410156, "learning_rate": 4.703032855939343e-05, "loss": 0.4562, "step": 568 }, { "epoch": 0.059896313060870024, "grad_norm": 0.5821331143379211, "learning_rate": 4.702506318449874e-05, "loss": 0.3848, "step": 569 }, { "epoch": 0.06000157898892076, "grad_norm": 0.45264291763305664, "learning_rate": 4.7019797809604044e-05, "loss": 0.491, "step": 570 }, { "epoch": 0.0601068449169715, "grad_norm": 0.5712417364120483, "learning_rate": 4.701453243470935e-05, "loss": 0.4128, "step": 571 }, { "epoch": 0.06021211084502224, "grad_norm": 0.5191047787666321, "learning_rate": 4.700926705981466e-05, "loss": 0.4552, "step": 572 }, { "epoch": 0.06031737677307297, "grad_norm": 0.4191204607486725, "learning_rate": 4.700400168491997e-05, "loss": 0.4669, "step": 573 }, { "epoch": 0.06042264270112371, "grad_norm": 0.508425235748291, "learning_rate": 4.699873631002528e-05, "loss": 0.4031, "step": 574 }, { "epoch": 0.060527908629174454, "grad_norm": 0.47075721621513367, "learning_rate": 4.699347093513058e-05, "loss": 0.4773, "step": 575 }, { "epoch": 0.06063317455722519, "grad_norm": 0.5133448839187622, "learning_rate": 4.698820556023589e-05, "loss": 0.3865, "step": 576 }, { "epoch": 0.06073844048527593, "grad_norm": 0.5425415635108948, "learning_rate": 4.69829401853412e-05, "loss": 0.4117, "step": 577 }, { "epoch": 0.06084370641332667, "grad_norm": 0.61476731300354, "learning_rate": 4.69776748104465e-05, "loss": 0.4307, "step": 578 }, { "epoch": 0.0609489723413774, "grad_norm": 0.553023099899292, "learning_rate": 4.697240943555181e-05, "loss": 0.3579, "step": 579 }, { "epoch": 0.06105423826942814, "grad_norm": 0.4436430037021637, "learning_rate": 4.696714406065712e-05, "loss": 0.4099, "step": 580 }, { "epoch": 0.061159504197478884, "grad_norm": 0.5598846673965454, "learning_rate": 4.696187868576243e-05, "loss": 0.3615, "step": 581 }, { "epoch": 0.06126477012552962, "grad_norm": 0.6036468744277954, "learning_rate": 4.695661331086774e-05, "loss": 0.4438, "step": 582 }, { "epoch": 0.06137003605358036, "grad_norm": 0.6011479496955872, "learning_rate": 4.695134793597305e-05, "loss": 0.4288, "step": 583 }, { "epoch": 0.0614753019816311, "grad_norm": 0.5292397141456604, "learning_rate": 4.694608256107835e-05, "loss": 0.4086, "step": 584 }, { "epoch": 0.06158056790968183, "grad_norm": 0.5526982545852661, "learning_rate": 4.694081718618366e-05, "loss": 0.3941, "step": 585 }, { "epoch": 0.06168583383773257, "grad_norm": 0.5088376402854919, "learning_rate": 4.693555181128896e-05, "loss": 0.4356, "step": 586 }, { "epoch": 0.06179109976578331, "grad_norm": 0.5751054286956787, "learning_rate": 4.693028643639427e-05, "loss": 0.4629, "step": 587 }, { "epoch": 0.06189636569383405, "grad_norm": 0.47562679648399353, "learning_rate": 4.692502106149958e-05, "loss": 0.4875, "step": 588 }, { "epoch": 0.06200163162188479, "grad_norm": 0.406876802444458, "learning_rate": 4.691975568660489e-05, "loss": 0.5099, "step": 589 }, { "epoch": 0.06210689754993552, "grad_norm": 0.43212106823921204, "learning_rate": 4.69144903117102e-05, "loss": 0.5271, "step": 590 }, { "epoch": 0.06221216347798626, "grad_norm": 0.5265733003616333, "learning_rate": 4.6909224936815506e-05, "loss": 0.4456, "step": 591 }, { "epoch": 0.062317429406037, "grad_norm": 0.37871816754341125, "learning_rate": 4.6903959561920815e-05, "loss": 0.3964, "step": 592 }, { "epoch": 0.06242269533408774, "grad_norm": 0.443781316280365, "learning_rate": 4.6898694187026124e-05, "loss": 0.4575, "step": 593 }, { "epoch": 0.06252796126213847, "grad_norm": 0.5184212923049927, "learning_rate": 4.6893428812131426e-05, "loss": 0.4791, "step": 594 }, { "epoch": 0.06263322719018921, "grad_norm": 0.4982917308807373, "learning_rate": 4.688816343723673e-05, "loss": 0.4104, "step": 595 }, { "epoch": 0.06273849311823995, "grad_norm": 0.43113309144973755, "learning_rate": 4.688289806234204e-05, "loss": 0.4384, "step": 596 }, { "epoch": 0.06284375904629069, "grad_norm": 0.5594951510429382, "learning_rate": 4.6877632687447346e-05, "loss": 0.4428, "step": 597 }, { "epoch": 0.06294902497434143, "grad_norm": 0.408655047416687, "learning_rate": 4.6872367312552655e-05, "loss": 0.4328, "step": 598 }, { "epoch": 0.06305429090239217, "grad_norm": 0.41858869791030884, "learning_rate": 4.6867101937657964e-05, "loss": 0.4822, "step": 599 }, { "epoch": 0.0631595568304429, "grad_norm": 0.5304632186889648, "learning_rate": 4.6861836562763274e-05, "loss": 0.4376, "step": 600 }, { "epoch": 0.06326482275849364, "grad_norm": 0.4693495035171509, "learning_rate": 4.685657118786858e-05, "loss": 0.3905, "step": 601 }, { "epoch": 0.06337008868654438, "grad_norm": 0.5536295771598816, "learning_rate": 4.6851305812973885e-05, "loss": 0.4378, "step": 602 }, { "epoch": 0.06347535461459512, "grad_norm": 0.4618769884109497, "learning_rate": 4.6846040438079194e-05, "loss": 0.4642, "step": 603 }, { "epoch": 0.06358062054264586, "grad_norm": 0.463776171207428, "learning_rate": 4.68407750631845e-05, "loss": 0.4518, "step": 604 }, { "epoch": 0.0636858864706966, "grad_norm": 0.5297257900238037, "learning_rate": 4.6835509688289805e-05, "loss": 0.3222, "step": 605 }, { "epoch": 0.06379115239874733, "grad_norm": 0.47493240237236023, "learning_rate": 4.6830244313395114e-05, "loss": 0.4, "step": 606 }, { "epoch": 0.06389641832679807, "grad_norm": 0.6347471475601196, "learning_rate": 4.682497893850042e-05, "loss": 0.4315, "step": 607 }, { "epoch": 0.06400168425484881, "grad_norm": 0.5118055939674377, "learning_rate": 4.681971356360573e-05, "loss": 0.4136, "step": 608 }, { "epoch": 0.06410695018289955, "grad_norm": 0.5062241554260254, "learning_rate": 4.681444818871104e-05, "loss": 0.51, "step": 609 }, { "epoch": 0.0642122161109503, "grad_norm": 0.45359355211257935, "learning_rate": 4.680918281381634e-05, "loss": 0.3897, "step": 610 }, { "epoch": 0.06431748203900102, "grad_norm": 0.4978649914264679, "learning_rate": 4.680391743892165e-05, "loss": 0.4234, "step": 611 }, { "epoch": 0.06442274796705176, "grad_norm": 0.5025052428245544, "learning_rate": 4.679865206402696e-05, "loss": 0.4344, "step": 612 }, { "epoch": 0.0645280138951025, "grad_norm": 0.4677049517631531, "learning_rate": 4.679338668913227e-05, "loss": 0.3997, "step": 613 }, { "epoch": 0.06463327982315324, "grad_norm": 0.38490286469459534, "learning_rate": 4.678812131423758e-05, "loss": 0.4778, "step": 614 }, { "epoch": 0.06473854575120398, "grad_norm": 0.4486238956451416, "learning_rate": 4.678285593934288e-05, "loss": 0.4002, "step": 615 }, { "epoch": 0.06484381167925472, "grad_norm": 0.48641228675842285, "learning_rate": 4.677759056444819e-05, "loss": 0.4302, "step": 616 }, { "epoch": 0.06494907760730545, "grad_norm": 0.5490376353263855, "learning_rate": 4.67723251895535e-05, "loss": 0.4203, "step": 617 }, { "epoch": 0.06505434353535619, "grad_norm": 0.4899100363254547, "learning_rate": 4.67670598146588e-05, "loss": 0.399, "step": 618 }, { "epoch": 0.06515960946340693, "grad_norm": 0.7570556998252869, "learning_rate": 4.676179443976411e-05, "loss": 0.4409, "step": 619 }, { "epoch": 0.06526487539145767, "grad_norm": 0.5624217391014099, "learning_rate": 4.675652906486942e-05, "loss": 0.3867, "step": 620 }, { "epoch": 0.06537014131950841, "grad_norm": 0.47434237599372864, "learning_rate": 4.675126368997473e-05, "loss": 0.3962, "step": 621 }, { "epoch": 0.06547540724755915, "grad_norm": 0.5388314723968506, "learning_rate": 4.674599831508004e-05, "loss": 0.3872, "step": 622 }, { "epoch": 0.06558067317560988, "grad_norm": 0.49027901887893677, "learning_rate": 4.674073294018535e-05, "loss": 0.4786, "step": 623 }, { "epoch": 0.06568593910366062, "grad_norm": 0.4333001673221588, "learning_rate": 4.6735467565290656e-05, "loss": 0.4245, "step": 624 }, { "epoch": 0.06579120503171136, "grad_norm": 0.4188300669193268, "learning_rate": 4.673020219039596e-05, "loss": 0.4713, "step": 625 }, { "epoch": 0.0658964709597621, "grad_norm": 0.48492878675460815, "learning_rate": 4.672493681550126e-05, "loss": 0.4896, "step": 626 }, { "epoch": 0.06600173688781284, "grad_norm": 0.5120576024055481, "learning_rate": 4.671967144060657e-05, "loss": 0.4209, "step": 627 }, { "epoch": 0.06610700281586357, "grad_norm": 0.5438317060470581, "learning_rate": 4.671440606571188e-05, "loss": 0.4494, "step": 628 }, { "epoch": 0.06621226874391431, "grad_norm": 0.5266952514648438, "learning_rate": 4.670914069081719e-05, "loss": 0.5609, "step": 629 }, { "epoch": 0.06631753467196505, "grad_norm": 0.6691259741783142, "learning_rate": 4.6703875315922496e-05, "loss": 0.405, "step": 630 }, { "epoch": 0.06642280060001579, "grad_norm": 0.6721771955490112, "learning_rate": 4.6698609941027806e-05, "loss": 0.537, "step": 631 }, { "epoch": 0.06652806652806653, "grad_norm": 0.6021822690963745, "learning_rate": 4.6693344566133115e-05, "loss": 0.4862, "step": 632 }, { "epoch": 0.06663333245611727, "grad_norm": 0.42799803614616394, "learning_rate": 4.6688079191238424e-05, "loss": 0.4316, "step": 633 }, { "epoch": 0.066738598384168, "grad_norm": 0.3875657320022583, "learning_rate": 4.6682813816343726e-05, "loss": 0.4557, "step": 634 }, { "epoch": 0.06684386431221874, "grad_norm": 0.4300662577152252, "learning_rate": 4.6677548441449035e-05, "loss": 0.5253, "step": 635 }, { "epoch": 0.06694913024026948, "grad_norm": 0.4926076829433441, "learning_rate": 4.667228306655434e-05, "loss": 0.5151, "step": 636 }, { "epoch": 0.06705439616832022, "grad_norm": 0.457466185092926, "learning_rate": 4.6667017691659646e-05, "loss": 0.4296, "step": 637 }, { "epoch": 0.06715966209637096, "grad_norm": 0.5367447137832642, "learning_rate": 4.6661752316764955e-05, "loss": 0.43, "step": 638 }, { "epoch": 0.06726492802442169, "grad_norm": 0.5215645432472229, "learning_rate": 4.6656486941870264e-05, "loss": 0.4355, "step": 639 }, { "epoch": 0.06737019395247243, "grad_norm": 0.5821287035942078, "learning_rate": 4.665122156697557e-05, "loss": 0.3576, "step": 640 }, { "epoch": 0.06747545988052317, "grad_norm": 0.5504344701766968, "learning_rate": 4.664595619208088e-05, "loss": 0.4843, "step": 641 }, { "epoch": 0.06758072580857391, "grad_norm": 0.4482622742652893, "learning_rate": 4.6640690817186184e-05, "loss": 0.4474, "step": 642 }, { "epoch": 0.06768599173662465, "grad_norm": 0.5162287950515747, "learning_rate": 4.663542544229149e-05, "loss": 0.5323, "step": 643 }, { "epoch": 0.06779125766467539, "grad_norm": 0.5771566033363342, "learning_rate": 4.66301600673968e-05, "loss": 0.3508, "step": 644 }, { "epoch": 0.06789652359272612, "grad_norm": 0.473014235496521, "learning_rate": 4.6624894692502105e-05, "loss": 0.3959, "step": 645 }, { "epoch": 0.06800178952077686, "grad_norm": 0.4953562915325165, "learning_rate": 4.6619629317607414e-05, "loss": 0.4301, "step": 646 }, { "epoch": 0.0681070554488276, "grad_norm": 0.519964337348938, "learning_rate": 4.661436394271272e-05, "loss": 0.4395, "step": 647 }, { "epoch": 0.06821232137687834, "grad_norm": 0.5988878607749939, "learning_rate": 4.660909856781803e-05, "loss": 0.4151, "step": 648 }, { "epoch": 0.06831758730492908, "grad_norm": 0.5311563014984131, "learning_rate": 4.660383319292334e-05, "loss": 0.431, "step": 649 }, { "epoch": 0.06842285323297982, "grad_norm": 0.48196783661842346, "learning_rate": 4.659856781802864e-05, "loss": 0.4645, "step": 650 } ], "logging_steps": 1, "max_steps": 9499, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.570341741428736e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }