diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4157 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 58914, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005092297899427116, + "grad_norm": 3.7509827613830566, + "learning_rate": 4.9500000000000004e-05, + "loss": 3.2642, + "step": 100 + }, + { + "epoch": 0.010184595798854232, + "grad_norm": 3.509660005569458, + "learning_rate": 4.991583636549121e-05, + "loss": 3.014, + "step": 200 + }, + { + "epoch": 0.015276893698281349, + "grad_norm": 3.0620908737182617, + "learning_rate": 4.983082259326011e-05, + "loss": 2.901, + "step": 300 + }, + { + "epoch": 0.020369191597708464, + "grad_norm": 2.637498617172241, + "learning_rate": 4.974580882102901e-05, + "loss": 2.8888, + "step": 400 + }, + { + "epoch": 0.025461489497135583, + "grad_norm": 2.582336902618408, + "learning_rate": 4.966079504879791e-05, + "loss": 2.8137, + "step": 500 + }, + { + "epoch": 0.030553787396562698, + "grad_norm": 2.575382709503174, + "learning_rate": 4.957578127656681e-05, + "loss": 2.8131, + "step": 600 + }, + { + "epoch": 0.03564608529598982, + "grad_norm": 2.2707173824310303, + "learning_rate": 4.94907675043357e-05, + "loss": 2.7753, + "step": 700 + }, + { + "epoch": 0.04073838319541693, + "grad_norm": 2.1095917224884033, + "learning_rate": 4.94057537321046e-05, + "loss": 2.7512, + "step": 800 + }, + { + "epoch": 0.04583068109484405, + "grad_norm": 1.7593672275543213, + "learning_rate": 4.93207399598735e-05, + "loss": 2.7556, + "step": 900 + }, + { + "epoch": 0.050922978994271166, + "grad_norm": 2.2801873683929443, + "learning_rate": 4.92357261876424e-05, + "loss": 2.7417, + "step": 1000 + }, + { + "epoch": 0.056015276893698285, + "grad_norm": 1.9631321430206299, + "learning_rate": 4.91507124154113e-05, + "loss": 2.7122, + "step": 1100 + }, + { + "epoch": 0.061107574793125397, + "grad_norm": 1.6080312728881836, + "learning_rate": 4.90656986431802e-05, + "loss": 2.687, + "step": 1200 + }, + { + "epoch": 0.06619987269255251, + "grad_norm": 2.1147282123565674, + "learning_rate": 4.89806848709491e-05, + "loss": 2.6617, + "step": 1300 + }, + { + "epoch": 0.07129217059197963, + "grad_norm": 1.905120849609375, + "learning_rate": 4.889567109871799e-05, + "loss": 2.6655, + "step": 1400 + }, + { + "epoch": 0.07638446849140675, + "grad_norm": 1.6756385564804077, + "learning_rate": 4.881065732648689e-05, + "loss": 2.6286, + "step": 1500 + }, + { + "epoch": 0.08147676639083386, + "grad_norm": 1.8816139698028564, + "learning_rate": 4.872564355425579e-05, + "loss": 2.6414, + "step": 1600 + }, + { + "epoch": 0.08656906429026098, + "grad_norm": 1.611456036567688, + "learning_rate": 4.864062978202469e-05, + "loss": 2.635, + "step": 1700 + }, + { + "epoch": 0.0916613621896881, + "grad_norm": 1.8698660135269165, + "learning_rate": 4.855561600979359e-05, + "loss": 2.6683, + "step": 1800 + }, + { + "epoch": 0.0967536600891152, + "grad_norm": 1.6007249355316162, + "learning_rate": 4.847060223756249e-05, + "loss": 2.633, + "step": 1900 + }, + { + "epoch": 0.10184595798854233, + "grad_norm": 1.5520641803741455, + "learning_rate": 4.838558846533139e-05, + "loss": 2.5912, + "step": 2000 + }, + { + "epoch": 0.10693825588796944, + "grad_norm": 1.522303819656372, + "learning_rate": 4.8300574693100286e-05, + "loss": 2.614, + "step": 2100 + }, + { + "epoch": 0.11203055378739657, + "grad_norm": 1.752119541168213, + "learning_rate": 4.8215560920869186e-05, + "loss": 2.5984, + "step": 2200 + }, + { + "epoch": 0.11712285168682368, + "grad_norm": 1.5005803108215332, + "learning_rate": 4.8130547148638085e-05, + "loss": 2.6231, + "step": 2300 + }, + { + "epoch": 0.12221514958625079, + "grad_norm": 1.3557181358337402, + "learning_rate": 4.8045533376406984e-05, + "loss": 2.6189, + "step": 2400 + }, + { + "epoch": 0.1273074474856779, + "grad_norm": 1.5560193061828613, + "learning_rate": 4.796051960417588e-05, + "loss": 2.5609, + "step": 2500 + }, + { + "epoch": 0.13239974538510502, + "grad_norm": 1.4254344701766968, + "learning_rate": 4.7875505831944776e-05, + "loss": 2.5632, + "step": 2600 + }, + { + "epoch": 0.13749204328453216, + "grad_norm": 1.366593599319458, + "learning_rate": 4.7790492059713675e-05, + "loss": 2.54, + "step": 2700 + }, + { + "epoch": 0.14258434118395927, + "grad_norm": 1.2629475593566895, + "learning_rate": 4.7705478287482575e-05, + "loss": 2.5679, + "step": 2800 + }, + { + "epoch": 0.14767663908338638, + "grad_norm": 1.234580159187317, + "learning_rate": 4.7620464515251474e-05, + "loss": 2.5671, + "step": 2900 + }, + { + "epoch": 0.1527689369828135, + "grad_norm": 1.4017528295516968, + "learning_rate": 4.7535450743020373e-05, + "loss": 2.5859, + "step": 3000 + }, + { + "epoch": 0.1578612348822406, + "grad_norm": 1.3464558124542236, + "learning_rate": 4.7450436970789266e-05, + "loss": 2.5289, + "step": 3100 + }, + { + "epoch": 0.16295353278166771, + "grad_norm": 1.3121877908706665, + "learning_rate": 4.7365423198558165e-05, + "loss": 2.548, + "step": 3200 + }, + { + "epoch": 0.16804583068109485, + "grad_norm": 1.2319351434707642, + "learning_rate": 4.7280409426327065e-05, + "loss": 2.5425, + "step": 3300 + }, + { + "epoch": 0.17313812858052197, + "grad_norm": 1.243325114250183, + "learning_rate": 4.7195395654095964e-05, + "loss": 2.5798, + "step": 3400 + }, + { + "epoch": 0.17823042647994908, + "grad_norm": 1.2152389287948608, + "learning_rate": 4.711038188186486e-05, + "loss": 2.5235, + "step": 3500 + }, + { + "epoch": 0.1833227243793762, + "grad_norm": 1.2546372413635254, + "learning_rate": 4.702536810963376e-05, + "loss": 2.5451, + "step": 3600 + }, + { + "epoch": 0.1884150222788033, + "grad_norm": 1.2566453218460083, + "learning_rate": 4.694035433740266e-05, + "loss": 2.5031, + "step": 3700 + }, + { + "epoch": 0.1935073201782304, + "grad_norm": 1.4164502620697021, + "learning_rate": 4.685534056517156e-05, + "loss": 2.5002, + "step": 3800 + }, + { + "epoch": 0.19859961807765755, + "grad_norm": 1.2575647830963135, + "learning_rate": 4.677032679294046e-05, + "loss": 2.5175, + "step": 3900 + }, + { + "epoch": 0.20369191597708466, + "grad_norm": 1.2546263933181763, + "learning_rate": 4.668531302070936e-05, + "loss": 2.5374, + "step": 4000 + }, + { + "epoch": 0.20878421387651178, + "grad_norm": 1.4746454954147339, + "learning_rate": 4.660029924847826e-05, + "loss": 2.5077, + "step": 4100 + }, + { + "epoch": 0.2138765117759389, + "grad_norm": 1.3161815404891968, + "learning_rate": 4.651528547624716e-05, + "loss": 2.4939, + "step": 4200 + }, + { + "epoch": 0.218968809675366, + "grad_norm": 1.2247682809829712, + "learning_rate": 4.643027170401605e-05, + "loss": 2.5047, + "step": 4300 + }, + { + "epoch": 0.22406110757479314, + "grad_norm": 1.024702787399292, + "learning_rate": 4.634525793178495e-05, + "loss": 2.4986, + "step": 4400 + }, + { + "epoch": 0.22915340547422025, + "grad_norm": 1.2271933555603027, + "learning_rate": 4.626024415955385e-05, + "loss": 2.4815, + "step": 4500 + }, + { + "epoch": 0.23424570337364736, + "grad_norm": 1.1049838066101074, + "learning_rate": 4.617523038732275e-05, + "loss": 2.5055, + "step": 4600 + }, + { + "epoch": 0.23933800127307447, + "grad_norm": 1.1865185499191284, + "learning_rate": 4.609021661509165e-05, + "loss": 2.4932, + "step": 4700 + }, + { + "epoch": 0.24443029917250159, + "grad_norm": 1.2031099796295166, + "learning_rate": 4.600520284286055e-05, + "loss": 2.4857, + "step": 4800 + }, + { + "epoch": 0.2495225970719287, + "grad_norm": 1.2100847959518433, + "learning_rate": 4.592018907062944e-05, + "loss": 2.4704, + "step": 4900 + }, + { + "epoch": 0.2546148949713558, + "grad_norm": 1.306518793106079, + "learning_rate": 4.583517529839834e-05, + "loss": 2.4679, + "step": 5000 + }, + { + "epoch": 0.2597071928707829, + "grad_norm": 1.3596395254135132, + "learning_rate": 4.575016152616724e-05, + "loss": 2.5029, + "step": 5100 + }, + { + "epoch": 0.26479949077021003, + "grad_norm": 1.1463990211486816, + "learning_rate": 4.566514775393614e-05, + "loss": 2.4678, + "step": 5200 + }, + { + "epoch": 0.2698917886696372, + "grad_norm": 1.4843939542770386, + "learning_rate": 4.558013398170504e-05, + "loss": 2.4549, + "step": 5300 + }, + { + "epoch": 0.2749840865690643, + "grad_norm": 1.4119912385940552, + "learning_rate": 4.549512020947394e-05, + "loss": 2.4449, + "step": 5400 + }, + { + "epoch": 0.2800763844684914, + "grad_norm": 1.1640745401382446, + "learning_rate": 4.5410106437242836e-05, + "loss": 2.4133, + "step": 5500 + }, + { + "epoch": 0.28516868236791854, + "grad_norm": 1.2901395559310913, + "learning_rate": 4.532509266501173e-05, + "loss": 2.4493, + "step": 5600 + }, + { + "epoch": 0.29026098026734565, + "grad_norm": 1.3150924444198608, + "learning_rate": 4.5240078892780635e-05, + "loss": 2.4616, + "step": 5700 + }, + { + "epoch": 0.29535327816677276, + "grad_norm": 1.1391271352767944, + "learning_rate": 4.5155065120549534e-05, + "loss": 2.4491, + "step": 5800 + }, + { + "epoch": 0.30044557606619987, + "grad_norm": 1.047142505645752, + "learning_rate": 4.5070051348318434e-05, + "loss": 2.4664, + "step": 5900 + }, + { + "epoch": 0.305537873965627, + "grad_norm": 1.2513772249221802, + "learning_rate": 4.498503757608733e-05, + "loss": 2.4356, + "step": 6000 + }, + { + "epoch": 0.3106301718650541, + "grad_norm": 1.2248339653015137, + "learning_rate": 4.4900023803856225e-05, + "loss": 2.458, + "step": 6100 + }, + { + "epoch": 0.3157224697644812, + "grad_norm": 0.9861664772033691, + "learning_rate": 4.4815010031625125e-05, + "loss": 2.4494, + "step": 6200 + }, + { + "epoch": 0.3208147676639083, + "grad_norm": 1.087272047996521, + "learning_rate": 4.4729996259394024e-05, + "loss": 2.4459, + "step": 6300 + }, + { + "epoch": 0.32590706556333543, + "grad_norm": 1.0361382961273193, + "learning_rate": 4.464498248716292e-05, + "loss": 2.451, + "step": 6400 + }, + { + "epoch": 0.3309993634627626, + "grad_norm": 1.0861406326293945, + "learning_rate": 4.455996871493182e-05, + "loss": 2.4426, + "step": 6500 + }, + { + "epoch": 0.3360916613621897, + "grad_norm": 0.9402614235877991, + "learning_rate": 4.447495494270072e-05, + "loss": 2.4189, + "step": 6600 + }, + { + "epoch": 0.3411839592616168, + "grad_norm": 0.9866734743118286, + "learning_rate": 4.4389941170469615e-05, + "loss": 2.4521, + "step": 6700 + }, + { + "epoch": 0.34627625716104393, + "grad_norm": 1.0977962017059326, + "learning_rate": 4.4304927398238514e-05, + "loss": 2.4505, + "step": 6800 + }, + { + "epoch": 0.35136855506047104, + "grad_norm": 1.1266326904296875, + "learning_rate": 4.421991362600741e-05, + "loss": 2.3999, + "step": 6900 + }, + { + "epoch": 0.35646085295989816, + "grad_norm": 1.1100637912750244, + "learning_rate": 4.413489985377631e-05, + "loss": 2.4226, + "step": 7000 + }, + { + "epoch": 0.36155315085932527, + "grad_norm": 1.1532678604125977, + "learning_rate": 4.404988608154521e-05, + "loss": 2.4048, + "step": 7100 + }, + { + "epoch": 0.3666454487587524, + "grad_norm": 1.02146315574646, + "learning_rate": 4.396487230931411e-05, + "loss": 2.4177, + "step": 7200 + }, + { + "epoch": 0.3717377466581795, + "grad_norm": 1.1943087577819824, + "learning_rate": 4.387985853708301e-05, + "loss": 2.4276, + "step": 7300 + }, + { + "epoch": 0.3768300445576066, + "grad_norm": 1.118034839630127, + "learning_rate": 4.37948447648519e-05, + "loss": 2.3933, + "step": 7400 + }, + { + "epoch": 0.3819223424570337, + "grad_norm": 1.0506726503372192, + "learning_rate": 4.370983099262081e-05, + "loss": 2.4162, + "step": 7500 + }, + { + "epoch": 0.3870146403564608, + "grad_norm": 1.1072652339935303, + "learning_rate": 4.362481722038971e-05, + "loss": 2.4166, + "step": 7600 + }, + { + "epoch": 0.392106938255888, + "grad_norm": 0.9805678129196167, + "learning_rate": 4.353980344815861e-05, + "loss": 2.3771, + "step": 7700 + }, + { + "epoch": 0.3971992361553151, + "grad_norm": 1.0781447887420654, + "learning_rate": 4.345478967592751e-05, + "loss": 2.3971, + "step": 7800 + }, + { + "epoch": 0.4022915340547422, + "grad_norm": 1.1752007007598877, + "learning_rate": 4.33697759036964e-05, + "loss": 2.3837, + "step": 7900 + }, + { + "epoch": 0.40738383195416933, + "grad_norm": 1.0886644124984741, + "learning_rate": 4.32847621314653e-05, + "loss": 2.4372, + "step": 8000 + }, + { + "epoch": 0.41247612985359644, + "grad_norm": 1.01775062084198, + "learning_rate": 4.31997483592342e-05, + "loss": 2.4051, + "step": 8100 + }, + { + "epoch": 0.41756842775302355, + "grad_norm": 1.0455646514892578, + "learning_rate": 4.31147345870031e-05, + "loss": 2.366, + "step": 8200 + }, + { + "epoch": 0.42266072565245066, + "grad_norm": 0.9850195646286011, + "learning_rate": 4.3029720814772e-05, + "loss": 2.3816, + "step": 8300 + }, + { + "epoch": 0.4277530235518778, + "grad_norm": 1.092155933380127, + "learning_rate": 4.2944707042540896e-05, + "loss": 2.396, + "step": 8400 + }, + { + "epoch": 0.4328453214513049, + "grad_norm": 1.008317232131958, + "learning_rate": 4.285969327030979e-05, + "loss": 2.3976, + "step": 8500 + }, + { + "epoch": 0.437937619350732, + "grad_norm": 1.1001275777816772, + "learning_rate": 4.277467949807869e-05, + "loss": 2.4009, + "step": 8600 + }, + { + "epoch": 0.4430299172501591, + "grad_norm": 0.9589524865150452, + "learning_rate": 4.268966572584759e-05, + "loss": 2.3755, + "step": 8700 + }, + { + "epoch": 0.4481222151495863, + "grad_norm": 0.9529566168785095, + "learning_rate": 4.260465195361649e-05, + "loss": 2.3961, + "step": 8800 + }, + { + "epoch": 0.4532145130490134, + "grad_norm": 1.0157649517059326, + "learning_rate": 4.2519638181385386e-05, + "loss": 2.3743, + "step": 8900 + }, + { + "epoch": 0.4583068109484405, + "grad_norm": 1.0096311569213867, + "learning_rate": 4.2434624409154286e-05, + "loss": 2.3702, + "step": 9000 + }, + { + "epoch": 0.4633991088478676, + "grad_norm": 1.0700254440307617, + "learning_rate": 4.2349610636923185e-05, + "loss": 2.3486, + "step": 9100 + }, + { + "epoch": 0.4684914067472947, + "grad_norm": 0.9580355286598206, + "learning_rate": 4.226459686469208e-05, + "loss": 2.3686, + "step": 9200 + }, + { + "epoch": 0.47358370464672184, + "grad_norm": 1.0027587413787842, + "learning_rate": 4.217958309246098e-05, + "loss": 2.4074, + "step": 9300 + }, + { + "epoch": 0.47867600254614895, + "grad_norm": 0.9647036194801331, + "learning_rate": 4.209456932022988e-05, + "loss": 2.3631, + "step": 9400 + }, + { + "epoch": 0.48376830044557606, + "grad_norm": 1.0718977451324463, + "learning_rate": 4.200955554799878e-05, + "loss": 2.3613, + "step": 9500 + }, + { + "epoch": 0.48886059834500317, + "grad_norm": 1.1674007177352905, + "learning_rate": 4.192454177576768e-05, + "loss": 2.3604, + "step": 9600 + }, + { + "epoch": 0.4939528962444303, + "grad_norm": 0.8964582681655884, + "learning_rate": 4.1839528003536574e-05, + "loss": 2.3517, + "step": 9700 + }, + { + "epoch": 0.4990451941438574, + "grad_norm": 0.9950689673423767, + "learning_rate": 4.175451423130547e-05, + "loss": 2.3609, + "step": 9800 + }, + { + "epoch": 0.5041374920432845, + "grad_norm": 1.0391299724578857, + "learning_rate": 4.166950045907437e-05, + "loss": 2.3764, + "step": 9900 + }, + { + "epoch": 0.5092297899427116, + "grad_norm": 0.9937861561775208, + "learning_rate": 4.158448668684327e-05, + "loss": 2.3439, + "step": 10000 + }, + { + "epoch": 0.5143220878421387, + "grad_norm": 0.9637438654899597, + "learning_rate": 4.149947291461217e-05, + "loss": 2.3599, + "step": 10100 + }, + { + "epoch": 0.5194143857415658, + "grad_norm": 0.991791844367981, + "learning_rate": 4.141445914238107e-05, + "loss": 2.3688, + "step": 10200 + }, + { + "epoch": 0.524506683640993, + "grad_norm": 1.1475801467895508, + "learning_rate": 4.132944537014996e-05, + "loss": 2.351, + "step": 10300 + }, + { + "epoch": 0.5295989815404201, + "grad_norm": 1.018678069114685, + "learning_rate": 4.124443159791886e-05, + "loss": 2.3381, + "step": 10400 + }, + { + "epoch": 0.5346912794398472, + "grad_norm": 1.0166884660720825, + "learning_rate": 4.115941782568776e-05, + "loss": 2.3393, + "step": 10500 + }, + { + "epoch": 0.5397835773392744, + "grad_norm": 0.9590491652488708, + "learning_rate": 4.107440405345666e-05, + "loss": 2.3428, + "step": 10600 + }, + { + "epoch": 0.5448758752387015, + "grad_norm": 1.0007227659225464, + "learning_rate": 4.098939028122556e-05, + "loss": 2.3388, + "step": 10700 + }, + { + "epoch": 0.5499681731381286, + "grad_norm": 0.8273807764053345, + "learning_rate": 4.090437650899446e-05, + "loss": 2.3238, + "step": 10800 + }, + { + "epoch": 0.5550604710375557, + "grad_norm": 0.9188222885131836, + "learning_rate": 4.081936273676335e-05, + "loss": 2.3171, + "step": 10900 + }, + { + "epoch": 0.5601527689369828, + "grad_norm": 1.2066142559051514, + "learning_rate": 4.073434896453225e-05, + "loss": 2.385, + "step": 11000 + }, + { + "epoch": 0.56524506683641, + "grad_norm": 1.0904101133346558, + "learning_rate": 4.064933519230115e-05, + "loss": 2.341, + "step": 11100 + }, + { + "epoch": 0.5703373647358371, + "grad_norm": 1.0374412536621094, + "learning_rate": 4.056432142007005e-05, + "loss": 2.3398, + "step": 11200 + }, + { + "epoch": 0.5754296626352642, + "grad_norm": 0.9854114055633545, + "learning_rate": 4.0479307647838956e-05, + "loss": 2.3512, + "step": 11300 + }, + { + "epoch": 0.5805219605346913, + "grad_norm": 1.071382999420166, + "learning_rate": 4.0394293875607856e-05, + "loss": 2.3145, + "step": 11400 + }, + { + "epoch": 0.5856142584341184, + "grad_norm": 0.9923407435417175, + "learning_rate": 4.030928010337675e-05, + "loss": 2.3475, + "step": 11500 + }, + { + "epoch": 0.5907065563335455, + "grad_norm": 1.034600019454956, + "learning_rate": 4.022426633114565e-05, + "loss": 2.3196, + "step": 11600 + }, + { + "epoch": 0.5957988542329726, + "grad_norm": 1.4072537422180176, + "learning_rate": 4.013925255891455e-05, + "loss": 2.3435, + "step": 11700 + }, + { + "epoch": 0.6008911521323997, + "grad_norm": 1.0498465299606323, + "learning_rate": 4.0054238786683446e-05, + "loss": 2.3488, + "step": 11800 + }, + { + "epoch": 0.6059834500318269, + "grad_norm": 0.9911717176437378, + "learning_rate": 3.9969225014452346e-05, + "loss": 2.3286, + "step": 11900 + }, + { + "epoch": 0.611075747931254, + "grad_norm": 0.9431672692298889, + "learning_rate": 3.9884211242221245e-05, + "loss": 2.3502, + "step": 12000 + }, + { + "epoch": 0.6161680458306811, + "grad_norm": 1.0439810752868652, + "learning_rate": 3.979919746999014e-05, + "loss": 2.3516, + "step": 12100 + }, + { + "epoch": 0.6212603437301082, + "grad_norm": 0.8762308955192566, + "learning_rate": 3.971418369775904e-05, + "loss": 2.2836, + "step": 12200 + }, + { + "epoch": 0.6263526416295353, + "grad_norm": 0.8706735372543335, + "learning_rate": 3.9629169925527936e-05, + "loss": 2.349, + "step": 12300 + }, + { + "epoch": 0.6314449395289624, + "grad_norm": 0.9823511838912964, + "learning_rate": 3.9544156153296836e-05, + "loss": 2.3356, + "step": 12400 + }, + { + "epoch": 0.6365372374283895, + "grad_norm": 0.939285933971405, + "learning_rate": 3.9459142381065735e-05, + "loss": 2.3435, + "step": 12500 + }, + { + "epoch": 0.6416295353278166, + "grad_norm": 1.033011555671692, + "learning_rate": 3.9374128608834634e-05, + "loss": 2.3208, + "step": 12600 + }, + { + "epoch": 0.6467218332272437, + "grad_norm": 0.9835578799247742, + "learning_rate": 3.928911483660353e-05, + "loss": 2.3332, + "step": 12700 + }, + { + "epoch": 0.6518141311266709, + "grad_norm": 0.9082310795783997, + "learning_rate": 3.9204101064372426e-05, + "loss": 2.3216, + "step": 12800 + }, + { + "epoch": 0.6569064290260981, + "grad_norm": 0.8588578701019287, + "learning_rate": 3.9119087292141325e-05, + "loss": 2.3114, + "step": 12900 + }, + { + "epoch": 0.6619987269255252, + "grad_norm": 1.040531873703003, + "learning_rate": 3.9034073519910225e-05, + "loss": 2.3328, + "step": 13000 + }, + { + "epoch": 0.6670910248249523, + "grad_norm": 1.0225043296813965, + "learning_rate": 3.894905974767913e-05, + "loss": 2.3245, + "step": 13100 + }, + { + "epoch": 0.6721833227243794, + "grad_norm": 1.0172550678253174, + "learning_rate": 3.886404597544803e-05, + "loss": 2.3056, + "step": 13200 + }, + { + "epoch": 0.6772756206238065, + "grad_norm": 0.9119499921798706, + "learning_rate": 3.877903220321692e-05, + "loss": 2.317, + "step": 13300 + }, + { + "epoch": 0.6823679185232336, + "grad_norm": 0.8971495032310486, + "learning_rate": 3.869401843098582e-05, + "loss": 2.3292, + "step": 13400 + }, + { + "epoch": 0.6874602164226608, + "grad_norm": 0.9643430709838867, + "learning_rate": 3.860900465875472e-05, + "loss": 2.3779, + "step": 13500 + }, + { + "epoch": 0.6925525143220879, + "grad_norm": 0.919440507888794, + "learning_rate": 3.852399088652362e-05, + "loss": 2.2993, + "step": 13600 + }, + { + "epoch": 0.697644812221515, + "grad_norm": 0.9949972033500671, + "learning_rate": 3.843897711429252e-05, + "loss": 2.3255, + "step": 13700 + }, + { + "epoch": 0.7027371101209421, + "grad_norm": 0.9251271486282349, + "learning_rate": 3.835396334206142e-05, + "loss": 2.2997, + "step": 13800 + }, + { + "epoch": 0.7078294080203692, + "grad_norm": 0.9567040205001831, + "learning_rate": 3.826894956983031e-05, + "loss": 2.3198, + "step": 13900 + }, + { + "epoch": 0.7129217059197963, + "grad_norm": 1.1165566444396973, + "learning_rate": 3.818393579759921e-05, + "loss": 2.3074, + "step": 14000 + }, + { + "epoch": 0.7180140038192234, + "grad_norm": 0.9649367928504944, + "learning_rate": 3.809892202536811e-05, + "loss": 2.2916, + "step": 14100 + }, + { + "epoch": 0.7231063017186505, + "grad_norm": 0.8595756888389587, + "learning_rate": 3.801390825313701e-05, + "loss": 2.3386, + "step": 14200 + }, + { + "epoch": 0.7281985996180776, + "grad_norm": 0.7877846360206604, + "learning_rate": 3.792889448090591e-05, + "loss": 2.2741, + "step": 14300 + }, + { + "epoch": 0.7332908975175048, + "grad_norm": 0.9086227416992188, + "learning_rate": 3.784388070867481e-05, + "loss": 2.3186, + "step": 14400 + }, + { + "epoch": 0.7383831954169319, + "grad_norm": 0.9466003179550171, + "learning_rate": 3.77588669364437e-05, + "loss": 2.2916, + "step": 14500 + }, + { + "epoch": 0.743475493316359, + "grad_norm": 0.8069922924041748, + "learning_rate": 3.76738531642126e-05, + "loss": 2.3108, + "step": 14600 + }, + { + "epoch": 0.7485677912157861, + "grad_norm": 1.0324113368988037, + "learning_rate": 3.75888393919815e-05, + "loss": 2.3066, + "step": 14700 + }, + { + "epoch": 0.7536600891152132, + "grad_norm": 0.892573893070221, + "learning_rate": 3.75038256197504e-05, + "loss": 2.2738, + "step": 14800 + }, + { + "epoch": 0.7587523870146403, + "grad_norm": 0.7999922037124634, + "learning_rate": 3.74188118475193e-05, + "loss": 2.3195, + "step": 14900 + }, + { + "epoch": 0.7638446849140674, + "grad_norm": 1.004957914352417, + "learning_rate": 3.73337980752882e-05, + "loss": 2.2935, + "step": 15000 + }, + { + "epoch": 0.7689369828134945, + "grad_norm": 1.046640157699585, + "learning_rate": 3.72487843030571e-05, + "loss": 2.3109, + "step": 15100 + }, + { + "epoch": 0.7740292807129217, + "grad_norm": 0.9236047863960266, + "learning_rate": 3.7163770530825996e-05, + "loss": 2.3128, + "step": 15200 + }, + { + "epoch": 0.7791215786123489, + "grad_norm": 1.0190492868423462, + "learning_rate": 3.7078756758594896e-05, + "loss": 2.3018, + "step": 15300 + }, + { + "epoch": 0.784213876511776, + "grad_norm": 0.8099306225776672, + "learning_rate": 3.6993742986363795e-05, + "loss": 2.313, + "step": 15400 + }, + { + "epoch": 0.7893061744112031, + "grad_norm": 0.9618342518806458, + "learning_rate": 3.6908729214132694e-05, + "loss": 2.2864, + "step": 15500 + }, + { + "epoch": 0.7943984723106302, + "grad_norm": 1.046680212020874, + "learning_rate": 3.6823715441901594e-05, + "loss": 2.2853, + "step": 15600 + }, + { + "epoch": 0.7994907702100573, + "grad_norm": 0.8486195206642151, + "learning_rate": 3.6738701669670486e-05, + "loss": 2.2854, + "step": 15700 + }, + { + "epoch": 0.8045830681094844, + "grad_norm": 0.9708773493766785, + "learning_rate": 3.6653687897439386e-05, + "loss": 2.2928, + "step": 15800 + }, + { + "epoch": 0.8096753660089115, + "grad_norm": 0.8969681262969971, + "learning_rate": 3.6568674125208285e-05, + "loss": 2.2976, + "step": 15900 + }, + { + "epoch": 0.8147676639083387, + "grad_norm": 0.9385348558425903, + "learning_rate": 3.6483660352977184e-05, + "loss": 2.2847, + "step": 16000 + }, + { + "epoch": 0.8198599618077658, + "grad_norm": 0.8899937272071838, + "learning_rate": 3.6398646580746083e-05, + "loss": 2.2972, + "step": 16100 + }, + { + "epoch": 0.8249522597071929, + "grad_norm": 0.8900747299194336, + "learning_rate": 3.631363280851498e-05, + "loss": 2.2952, + "step": 16200 + }, + { + "epoch": 0.83004455760662, + "grad_norm": 1.026571273803711, + "learning_rate": 3.6228619036283875e-05, + "loss": 2.2842, + "step": 16300 + }, + { + "epoch": 0.8351368555060471, + "grad_norm": 0.9016963839530945, + "learning_rate": 3.6143605264052775e-05, + "loss": 2.288, + "step": 16400 + }, + { + "epoch": 0.8402291534054742, + "grad_norm": 0.8101049065589905, + "learning_rate": 3.6058591491821674e-05, + "loss": 2.2486, + "step": 16500 + }, + { + "epoch": 0.8453214513049013, + "grad_norm": 0.860748827457428, + "learning_rate": 3.597357771959057e-05, + "loss": 2.2911, + "step": 16600 + }, + { + "epoch": 0.8504137492043284, + "grad_norm": 0.9295821189880371, + "learning_rate": 3.588856394735947e-05, + "loss": 2.2477, + "step": 16700 + }, + { + "epoch": 0.8555060471037556, + "grad_norm": 0.9582170844078064, + "learning_rate": 3.580355017512837e-05, + "loss": 2.307, + "step": 16800 + }, + { + "epoch": 0.8605983450031827, + "grad_norm": 0.9199303984642029, + "learning_rate": 3.571853640289727e-05, + "loss": 2.2692, + "step": 16900 + }, + { + "epoch": 0.8656906429026098, + "grad_norm": 0.8835098743438721, + "learning_rate": 3.563352263066617e-05, + "loss": 2.2681, + "step": 17000 + }, + { + "epoch": 0.8707829408020369, + "grad_norm": 0.9898850917816162, + "learning_rate": 3.554850885843507e-05, + "loss": 2.2718, + "step": 17100 + }, + { + "epoch": 0.875875238701464, + "grad_norm": 1.0997586250305176, + "learning_rate": 3.546349508620397e-05, + "loss": 2.2577, + "step": 17200 + }, + { + "epoch": 0.8809675366008911, + "grad_norm": 0.8374606370925903, + "learning_rate": 3.537848131397287e-05, + "loss": 2.2731, + "step": 17300 + }, + { + "epoch": 0.8860598345003182, + "grad_norm": 0.9752559065818787, + "learning_rate": 3.529346754174177e-05, + "loss": 2.2776, + "step": 17400 + }, + { + "epoch": 0.8911521323997453, + "grad_norm": 0.8918510675430298, + "learning_rate": 3.520845376951066e-05, + "loss": 2.2838, + "step": 17500 + }, + { + "epoch": 0.8962444302991726, + "grad_norm": 0.9751953482627869, + "learning_rate": 3.512343999727956e-05, + "loss": 2.268, + "step": 17600 + }, + { + "epoch": 0.9013367281985997, + "grad_norm": 0.9787586331367493, + "learning_rate": 3.503842622504846e-05, + "loss": 2.2927, + "step": 17700 + }, + { + "epoch": 0.9064290260980268, + "grad_norm": 0.9199690222740173, + "learning_rate": 3.495341245281736e-05, + "loss": 2.2785, + "step": 17800 + }, + { + "epoch": 0.9115213239974539, + "grad_norm": 0.8526634573936462, + "learning_rate": 3.486839868058626e-05, + "loss": 2.2818, + "step": 17900 + }, + { + "epoch": 0.916613621896881, + "grad_norm": 0.9445266127586365, + "learning_rate": 3.478338490835516e-05, + "loss": 2.3147, + "step": 18000 + }, + { + "epoch": 0.9217059197963081, + "grad_norm": 0.9607738256454468, + "learning_rate": 3.469837113612405e-05, + "loss": 2.2663, + "step": 18100 + }, + { + "epoch": 0.9267982176957352, + "grad_norm": 0.8561920523643494, + "learning_rate": 3.461335736389295e-05, + "loss": 2.2355, + "step": 18200 + }, + { + "epoch": 0.9318905155951623, + "grad_norm": 0.8668131828308105, + "learning_rate": 3.452834359166185e-05, + "loss": 2.2801, + "step": 18300 + }, + { + "epoch": 0.9369828134945895, + "grad_norm": 0.9161975979804993, + "learning_rate": 3.444332981943075e-05, + "loss": 2.2668, + "step": 18400 + }, + { + "epoch": 0.9420751113940166, + "grad_norm": 0.9021576046943665, + "learning_rate": 3.435831604719965e-05, + "loss": 2.2887, + "step": 18500 + }, + { + "epoch": 0.9471674092934437, + "grad_norm": 0.8754701018333435, + "learning_rate": 3.4273302274968546e-05, + "loss": 2.2567, + "step": 18600 + }, + { + "epoch": 0.9522597071928708, + "grad_norm": 0.9762224555015564, + "learning_rate": 3.4188288502737446e-05, + "loss": 2.2574, + "step": 18700 + }, + { + "epoch": 0.9573520050922979, + "grad_norm": 0.8961549401283264, + "learning_rate": 3.4103274730506345e-05, + "loss": 2.252, + "step": 18800 + }, + { + "epoch": 0.962444302991725, + "grad_norm": 0.8942741751670837, + "learning_rate": 3.4018260958275244e-05, + "loss": 2.3098, + "step": 18900 + }, + { + "epoch": 0.9675366008911521, + "grad_norm": 0.8678953051567078, + "learning_rate": 3.3933247186044144e-05, + "loss": 2.2751, + "step": 19000 + }, + { + "epoch": 0.9726288987905792, + "grad_norm": 0.9803009629249573, + "learning_rate": 3.384823341381304e-05, + "loss": 2.2329, + "step": 19100 + }, + { + "epoch": 0.9777211966900063, + "grad_norm": 0.8548142313957214, + "learning_rate": 3.376321964158194e-05, + "loss": 2.2577, + "step": 19200 + }, + { + "epoch": 0.9828134945894335, + "grad_norm": 0.8247301578521729, + "learning_rate": 3.3678205869350835e-05, + "loss": 2.2776, + "step": 19300 + }, + { + "epoch": 0.9879057924888606, + "grad_norm": 0.8970145583152771, + "learning_rate": 3.3593192097119734e-05, + "loss": 2.2436, + "step": 19400 + }, + { + "epoch": 0.9929980903882877, + "grad_norm": 0.9450452923774719, + "learning_rate": 3.3508178324888633e-05, + "loss": 2.274, + "step": 19500 + }, + { + "epoch": 0.9980903882877148, + "grad_norm": 0.9455347061157227, + "learning_rate": 3.342316455265753e-05, + "loss": 2.2618, + "step": 19600 + }, + { + "epoch": 1.0031572246976448, + "grad_norm": 0.9727960228919983, + "learning_rate": 3.333815078042643e-05, + "loss": 2.2148, + "step": 19700 + }, + { + "epoch": 1.008249522597072, + "grad_norm": 1.0244638919830322, + "learning_rate": 3.325313700819533e-05, + "loss": 2.2209, + "step": 19800 + }, + { + "epoch": 1.013341820496499, + "grad_norm": 1.002837061882019, + "learning_rate": 3.3168123235964224e-05, + "loss": 2.2011, + "step": 19900 + }, + { + "epoch": 1.0184341183959262, + "grad_norm": 0.8974801898002625, + "learning_rate": 3.308310946373312e-05, + "loss": 2.2186, + "step": 20000 + }, + { + "epoch": 1.0235264162953532, + "grad_norm": 1.0660030841827393, + "learning_rate": 3.299809569150202e-05, + "loss": 2.2368, + "step": 20100 + }, + { + "epoch": 1.0286187141947805, + "grad_norm": 0.8874944448471069, + "learning_rate": 3.291308191927092e-05, + "loss": 2.2552, + "step": 20200 + }, + { + "epoch": 1.0337110120942075, + "grad_norm": 0.9332163333892822, + "learning_rate": 3.282806814703982e-05, + "loss": 2.231, + "step": 20300 + }, + { + "epoch": 1.0388033099936347, + "grad_norm": 0.8272064328193665, + "learning_rate": 3.274305437480872e-05, + "loss": 2.2287, + "step": 20400 + }, + { + "epoch": 1.0438956078930617, + "grad_norm": 0.8333924412727356, + "learning_rate": 3.265804060257761e-05, + "loss": 2.2217, + "step": 20500 + }, + { + "epoch": 1.048987905792489, + "grad_norm": 0.9589939117431641, + "learning_rate": 3.257302683034652e-05, + "loss": 2.2328, + "step": 20600 + }, + { + "epoch": 1.054080203691916, + "grad_norm": 0.8918903470039368, + "learning_rate": 3.248801305811542e-05, + "loss": 2.2169, + "step": 20700 + }, + { + "epoch": 1.0591725015913431, + "grad_norm": 0.9166114926338196, + "learning_rate": 3.240299928588432e-05, + "loss": 2.2605, + "step": 20800 + }, + { + "epoch": 1.0642647994907701, + "grad_norm": 0.8604680895805359, + "learning_rate": 3.231798551365322e-05, + "loss": 2.2591, + "step": 20900 + }, + { + "epoch": 1.0693570973901974, + "grad_norm": 0.82822185754776, + "learning_rate": 3.2232971741422117e-05, + "loss": 2.2075, + "step": 21000 + }, + { + "epoch": 1.0744493952896244, + "grad_norm": 0.8195912837982178, + "learning_rate": 3.214795796919101e-05, + "loss": 2.2054, + "step": 21100 + }, + { + "epoch": 1.0795416931890516, + "grad_norm": 0.9587050080299377, + "learning_rate": 3.206294419695991e-05, + "loss": 2.2558, + "step": 21200 + }, + { + "epoch": 1.0846339910884786, + "grad_norm": 0.9604052901268005, + "learning_rate": 3.197793042472881e-05, + "loss": 2.2023, + "step": 21300 + }, + { + "epoch": 1.0897262889879058, + "grad_norm": 0.9480250477790833, + "learning_rate": 3.189291665249771e-05, + "loss": 2.2168, + "step": 21400 + }, + { + "epoch": 1.094818586887333, + "grad_norm": 0.8999929428100586, + "learning_rate": 3.1807902880266606e-05, + "loss": 2.2089, + "step": 21500 + }, + { + "epoch": 1.09991088478676, + "grad_norm": 0.9180619716644287, + "learning_rate": 3.1722889108035506e-05, + "loss": 2.2092, + "step": 21600 + }, + { + "epoch": 1.105003182686187, + "grad_norm": 0.8434627056121826, + "learning_rate": 3.16378753358044e-05, + "loss": 2.2179, + "step": 21700 + }, + { + "epoch": 1.1100954805856142, + "grad_norm": 0.8810749053955078, + "learning_rate": 3.15528615635733e-05, + "loss": 2.1857, + "step": 21800 + }, + { + "epoch": 1.1151877784850415, + "grad_norm": 0.9257334470748901, + "learning_rate": 3.14678477913422e-05, + "loss": 2.2205, + "step": 21900 + }, + { + "epoch": 1.1202800763844685, + "grad_norm": 0.8661274313926697, + "learning_rate": 3.1382834019111096e-05, + "loss": 2.1995, + "step": 22000 + }, + { + "epoch": 1.1253723742838957, + "grad_norm": 0.8728938698768616, + "learning_rate": 3.1297820246879996e-05, + "loss": 2.2125, + "step": 22100 + }, + { + "epoch": 1.1304646721833227, + "grad_norm": 0.9176629185676575, + "learning_rate": 3.1212806474648895e-05, + "loss": 2.1908, + "step": 22200 + }, + { + "epoch": 1.13555697008275, + "grad_norm": 0.9520237445831299, + "learning_rate": 3.112779270241779e-05, + "loss": 2.2345, + "step": 22300 + }, + { + "epoch": 1.140649267982177, + "grad_norm": 0.8356249928474426, + "learning_rate": 3.1042778930186694e-05, + "loss": 2.2452, + "step": 22400 + }, + { + "epoch": 1.1457415658816041, + "grad_norm": 1.0978131294250488, + "learning_rate": 3.095776515795559e-05, + "loss": 2.1776, + "step": 22500 + }, + { + "epoch": 1.1508338637810311, + "grad_norm": 1.1184298992156982, + "learning_rate": 3.087275138572449e-05, + "loss": 2.2174, + "step": 22600 + }, + { + "epoch": 1.1559261616804584, + "grad_norm": 0.9109058380126953, + "learning_rate": 3.078773761349339e-05, + "loss": 2.2168, + "step": 22700 + }, + { + "epoch": 1.1610184595798854, + "grad_norm": 0.8274030089378357, + "learning_rate": 3.0702723841262284e-05, + "loss": 2.224, + "step": 22800 + }, + { + "epoch": 1.1661107574793126, + "grad_norm": 0.8593317270278931, + "learning_rate": 3.0617710069031183e-05, + "loss": 2.2653, + "step": 22900 + }, + { + "epoch": 1.1712030553787396, + "grad_norm": 1.1305369138717651, + "learning_rate": 3.053269629680008e-05, + "loss": 2.241, + "step": 23000 + }, + { + "epoch": 1.1762953532781668, + "grad_norm": 1.0249735116958618, + "learning_rate": 3.0447682524568982e-05, + "loss": 2.2044, + "step": 23100 + }, + { + "epoch": 1.1813876511775938, + "grad_norm": 0.762690007686615, + "learning_rate": 3.036266875233788e-05, + "loss": 2.2057, + "step": 23200 + }, + { + "epoch": 1.186479949077021, + "grad_norm": 0.7995686531066895, + "learning_rate": 3.0277654980106777e-05, + "loss": 2.2435, + "step": 23300 + }, + { + "epoch": 1.191572246976448, + "grad_norm": 1.0537996292114258, + "learning_rate": 3.0192641207875677e-05, + "loss": 2.2155, + "step": 23400 + }, + { + "epoch": 1.1966645448758753, + "grad_norm": 0.8992569446563721, + "learning_rate": 3.0107627435644576e-05, + "loss": 2.217, + "step": 23500 + }, + { + "epoch": 1.2017568427753023, + "grad_norm": 0.9041591286659241, + "learning_rate": 3.0022613663413472e-05, + "loss": 2.2277, + "step": 23600 + }, + { + "epoch": 1.2068491406747295, + "grad_norm": 0.9437869787216187, + "learning_rate": 2.993759989118237e-05, + "loss": 2.2151, + "step": 23700 + }, + { + "epoch": 1.2119414385741565, + "grad_norm": 0.7999377846717834, + "learning_rate": 2.985258611895127e-05, + "loss": 2.2103, + "step": 23800 + }, + { + "epoch": 1.2170337364735837, + "grad_norm": 0.932995080947876, + "learning_rate": 2.976757234672017e-05, + "loss": 2.1964, + "step": 23900 + }, + { + "epoch": 1.222126034373011, + "grad_norm": 0.846868097782135, + "learning_rate": 2.9682558574489066e-05, + "loss": 2.1821, + "step": 24000 + }, + { + "epoch": 1.227218332272438, + "grad_norm": 0.889284610748291, + "learning_rate": 2.9597544802257965e-05, + "loss": 2.2227, + "step": 24100 + }, + { + "epoch": 1.2323106301718652, + "grad_norm": 0.9376260042190552, + "learning_rate": 2.9512531030026865e-05, + "loss": 2.226, + "step": 24200 + }, + { + "epoch": 1.2374029280712922, + "grad_norm": 0.8779696226119995, + "learning_rate": 2.9427517257795767e-05, + "loss": 2.2086, + "step": 24300 + }, + { + "epoch": 1.2424952259707194, + "grad_norm": 0.9524549841880798, + "learning_rate": 2.9342503485564667e-05, + "loss": 2.2026, + "step": 24400 + }, + { + "epoch": 1.2475875238701464, + "grad_norm": 0.919808030128479, + "learning_rate": 2.9257489713333563e-05, + "loss": 2.192, + "step": 24500 + }, + { + "epoch": 1.2526798217695736, + "grad_norm": 1.0228092670440674, + "learning_rate": 2.9172475941102462e-05, + "loss": 2.2241, + "step": 24600 + }, + { + "epoch": 1.2577721196690006, + "grad_norm": 0.8363624811172485, + "learning_rate": 2.908746216887136e-05, + "loss": 2.1808, + "step": 24700 + }, + { + "epoch": 1.2628644175684278, + "grad_norm": 0.8711551427841187, + "learning_rate": 2.9002448396640257e-05, + "loss": 2.2093, + "step": 24800 + }, + { + "epoch": 1.2679567154678548, + "grad_norm": 0.9497014284133911, + "learning_rate": 2.8917434624409156e-05, + "loss": 2.1856, + "step": 24900 + }, + { + "epoch": 1.273049013367282, + "grad_norm": 0.9282352924346924, + "learning_rate": 2.8832420852178056e-05, + "loss": 2.1787, + "step": 25000 + }, + { + "epoch": 1.278141311266709, + "grad_norm": 0.9017792344093323, + "learning_rate": 2.8747407079946952e-05, + "loss": 2.2054, + "step": 25100 + }, + { + "epoch": 1.2832336091661363, + "grad_norm": 0.9470519423484802, + "learning_rate": 2.866239330771585e-05, + "loss": 2.1885, + "step": 25200 + }, + { + "epoch": 1.2883259070655633, + "grad_norm": 0.991397500038147, + "learning_rate": 2.857737953548475e-05, + "loss": 2.1875, + "step": 25300 + }, + { + "epoch": 1.2934182049649905, + "grad_norm": 0.920644223690033, + "learning_rate": 2.8492365763253646e-05, + "loss": 2.2418, + "step": 25400 + }, + { + "epoch": 1.2985105028644175, + "grad_norm": 0.8312422037124634, + "learning_rate": 2.8407351991022546e-05, + "loss": 2.1635, + "step": 25500 + }, + { + "epoch": 1.3036028007638447, + "grad_norm": 0.9457144737243652, + "learning_rate": 2.8322338218791445e-05, + "loss": 2.1945, + "step": 25600 + }, + { + "epoch": 1.308695098663272, + "grad_norm": 0.8914629220962524, + "learning_rate": 2.8237324446560344e-05, + "loss": 2.2092, + "step": 25700 + }, + { + "epoch": 1.313787396562699, + "grad_norm": 0.9140703082084656, + "learning_rate": 2.815231067432924e-05, + "loss": 2.2162, + "step": 25800 + }, + { + "epoch": 1.318879694462126, + "grad_norm": 0.926543116569519, + "learning_rate": 2.806729690209814e-05, + "loss": 2.1906, + "step": 25900 + }, + { + "epoch": 1.3239719923615532, + "grad_norm": 0.888692319393158, + "learning_rate": 2.798228312986704e-05, + "loss": 2.1866, + "step": 26000 + }, + { + "epoch": 1.3290642902609804, + "grad_norm": 0.7925876379013062, + "learning_rate": 2.7897269357635935e-05, + "loss": 2.1988, + "step": 26100 + }, + { + "epoch": 1.3341565881604074, + "grad_norm": 0.8814985752105713, + "learning_rate": 2.781225558540484e-05, + "loss": 2.2072, + "step": 26200 + }, + { + "epoch": 1.3392488860598344, + "grad_norm": 0.8415858745574951, + "learning_rate": 2.7727241813173737e-05, + "loss": 2.2227, + "step": 26300 + }, + { + "epoch": 1.3443411839592616, + "grad_norm": 0.9423860907554626, + "learning_rate": 2.7642228040942636e-05, + "loss": 2.2426, + "step": 26400 + }, + { + "epoch": 1.3494334818586888, + "grad_norm": 0.8816553950309753, + "learning_rate": 2.7557214268711535e-05, + "loss": 2.206, + "step": 26500 + }, + { + "epoch": 1.3545257797581158, + "grad_norm": 0.8283177018165588, + "learning_rate": 2.747220049648043e-05, + "loss": 2.1859, + "step": 26600 + }, + { + "epoch": 1.3596180776575428, + "grad_norm": 0.8860555291175842, + "learning_rate": 2.738718672424933e-05, + "loss": 2.178, + "step": 26700 + }, + { + "epoch": 1.36471037555697, + "grad_norm": 0.8853309154510498, + "learning_rate": 2.730217295201823e-05, + "loss": 2.1844, + "step": 26800 + }, + { + "epoch": 1.3698026734563973, + "grad_norm": 0.9043028950691223, + "learning_rate": 2.7217159179787126e-05, + "loss": 2.2105, + "step": 26900 + }, + { + "epoch": 1.3748949713558243, + "grad_norm": 0.8943936824798584, + "learning_rate": 2.7132145407556025e-05, + "loss": 2.1814, + "step": 27000 + }, + { + "epoch": 1.3799872692552515, + "grad_norm": 0.7901210188865662, + "learning_rate": 2.7047131635324925e-05, + "loss": 2.1819, + "step": 27100 + }, + { + "epoch": 1.3850795671546785, + "grad_norm": 0.9602735638618469, + "learning_rate": 2.696211786309382e-05, + "loss": 2.2121, + "step": 27200 + }, + { + "epoch": 1.3901718650541057, + "grad_norm": 0.8327048420906067, + "learning_rate": 2.687710409086272e-05, + "loss": 2.2128, + "step": 27300 + }, + { + "epoch": 1.3952641629535327, + "grad_norm": 0.8546739220619202, + "learning_rate": 2.679209031863162e-05, + "loss": 2.2035, + "step": 27400 + }, + { + "epoch": 1.40035646085296, + "grad_norm": 1.585236668586731, + "learning_rate": 2.6707076546400515e-05, + "loss": 2.1845, + "step": 27500 + }, + { + "epoch": 1.405448758752387, + "grad_norm": 0.9497547745704651, + "learning_rate": 2.6622062774169415e-05, + "loss": 2.1886, + "step": 27600 + }, + { + "epoch": 1.4105410566518142, + "grad_norm": 0.8747720718383789, + "learning_rate": 2.6537049001938314e-05, + "loss": 2.1735, + "step": 27700 + }, + { + "epoch": 1.4156333545512412, + "grad_norm": 0.9204273223876953, + "learning_rate": 2.6452035229707213e-05, + "loss": 2.2153, + "step": 27800 + }, + { + "epoch": 1.4207256524506684, + "grad_norm": 0.868325412273407, + "learning_rate": 2.636702145747611e-05, + "loss": 2.209, + "step": 27900 + }, + { + "epoch": 1.4258179503500954, + "grad_norm": 0.9367715716362, + "learning_rate": 2.6282007685245015e-05, + "loss": 2.1868, + "step": 28000 + }, + { + "epoch": 1.4309102482495226, + "grad_norm": 0.9658358693122864, + "learning_rate": 2.619699391301391e-05, + "loss": 2.1757, + "step": 28100 + }, + { + "epoch": 1.4360025461489498, + "grad_norm": 0.8091734051704407, + "learning_rate": 2.611198014078281e-05, + "loss": 2.1878, + "step": 28200 + }, + { + "epoch": 1.4410948440483768, + "grad_norm": 0.8200072050094604, + "learning_rate": 2.602696636855171e-05, + "loss": 2.192, + "step": 28300 + }, + { + "epoch": 1.4461871419478038, + "grad_norm": 0.9280868768692017, + "learning_rate": 2.5941952596320606e-05, + "loss": 2.1829, + "step": 28400 + }, + { + "epoch": 1.451279439847231, + "grad_norm": 0.9731032252311707, + "learning_rate": 2.5856938824089505e-05, + "loss": 2.156, + "step": 28500 + }, + { + "epoch": 1.4563717377466583, + "grad_norm": 0.8023040294647217, + "learning_rate": 2.5771925051858404e-05, + "loss": 2.1913, + "step": 28600 + }, + { + "epoch": 1.4614640356460853, + "grad_norm": 1.003476619720459, + "learning_rate": 2.56869112796273e-05, + "loss": 2.1537, + "step": 28700 + }, + { + "epoch": 1.4665563335455123, + "grad_norm": 1.0280425548553467, + "learning_rate": 2.56018975073962e-05, + "loss": 2.2106, + "step": 28800 + }, + { + "epoch": 1.4716486314449395, + "grad_norm": 0.9685016870498657, + "learning_rate": 2.55168837351651e-05, + "loss": 2.1758, + "step": 28900 + }, + { + "epoch": 1.4767409293443667, + "grad_norm": 0.8572561144828796, + "learning_rate": 2.5431869962933995e-05, + "loss": 2.1647, + "step": 29000 + }, + { + "epoch": 1.4818332272437937, + "grad_norm": 0.8688543438911438, + "learning_rate": 2.5346856190702894e-05, + "loss": 2.1973, + "step": 29100 + }, + { + "epoch": 1.486925525143221, + "grad_norm": 1.0197324752807617, + "learning_rate": 2.5261842418471794e-05, + "loss": 2.1649, + "step": 29200 + }, + { + "epoch": 1.492017823042648, + "grad_norm": 0.8760496377944946, + "learning_rate": 2.517682864624069e-05, + "loss": 2.2024, + "step": 29300 + }, + { + "epoch": 1.4971101209420752, + "grad_norm": 0.9327671527862549, + "learning_rate": 2.509181487400959e-05, + "loss": 2.2006, + "step": 29400 + }, + { + "epoch": 1.5022024188415022, + "grad_norm": 0.9184695482254028, + "learning_rate": 2.5006801101778488e-05, + "loss": 2.1616, + "step": 29500 + }, + { + "epoch": 1.5072947167409292, + "grad_norm": 0.8531858325004578, + "learning_rate": 2.4921787329547387e-05, + "loss": 2.1688, + "step": 29600 + }, + { + "epoch": 1.5123870146403564, + "grad_norm": 0.8902334570884705, + "learning_rate": 2.4836773557316287e-05, + "loss": 2.1692, + "step": 29700 + }, + { + "epoch": 1.5174793125397836, + "grad_norm": 0.8231461644172668, + "learning_rate": 2.4751759785085186e-05, + "loss": 2.1855, + "step": 29800 + }, + { + "epoch": 1.5225716104392109, + "grad_norm": 0.9362125396728516, + "learning_rate": 2.4666746012854082e-05, + "loss": 2.1798, + "step": 29900 + }, + { + "epoch": 1.5276639083386379, + "grad_norm": 0.8145864009857178, + "learning_rate": 2.458173224062298e-05, + "loss": 2.1655, + "step": 30000 + }, + { + "epoch": 1.5327562062380649, + "grad_norm": 0.9912553429603577, + "learning_rate": 2.449671846839188e-05, + "loss": 2.2025, + "step": 30100 + }, + { + "epoch": 1.537848504137492, + "grad_norm": 0.818953275680542, + "learning_rate": 2.4411704696160777e-05, + "loss": 2.1845, + "step": 30200 + }, + { + "epoch": 1.5429408020369193, + "grad_norm": 0.845649778842926, + "learning_rate": 2.4326690923929676e-05, + "loss": 2.199, + "step": 30300 + }, + { + "epoch": 1.5480330999363463, + "grad_norm": 1.0135074853897095, + "learning_rate": 2.424167715169858e-05, + "loss": 2.1912, + "step": 30400 + }, + { + "epoch": 1.5531253978357733, + "grad_norm": 0.9612752199172974, + "learning_rate": 2.4156663379467475e-05, + "loss": 2.159, + "step": 30500 + }, + { + "epoch": 1.5582176957352005, + "grad_norm": 0.8450791239738464, + "learning_rate": 2.4071649607236374e-05, + "loss": 2.1615, + "step": 30600 + }, + { + "epoch": 1.5633099936346277, + "grad_norm": 0.9979317784309387, + "learning_rate": 2.3986635835005273e-05, + "loss": 2.1713, + "step": 30700 + }, + { + "epoch": 1.5684022915340547, + "grad_norm": 0.904403567314148, + "learning_rate": 2.390162206277417e-05, + "loss": 2.2114, + "step": 30800 + }, + { + "epoch": 1.5734945894334817, + "grad_norm": 0.8977887630462646, + "learning_rate": 2.381660829054307e-05, + "loss": 2.1867, + "step": 30900 + }, + { + "epoch": 1.578586887332909, + "grad_norm": 0.9076321125030518, + "learning_rate": 2.3731594518311968e-05, + "loss": 2.167, + "step": 31000 + }, + { + "epoch": 1.5836791852323362, + "grad_norm": 0.9048725962638855, + "learning_rate": 2.3646580746080864e-05, + "loss": 2.1645, + "step": 31100 + }, + { + "epoch": 1.5887714831317632, + "grad_norm": 0.9547775387763977, + "learning_rate": 2.3561566973849763e-05, + "loss": 2.1849, + "step": 31200 + }, + { + "epoch": 1.5938637810311902, + "grad_norm": 0.7886509299278259, + "learning_rate": 2.3476553201618666e-05, + "loss": 2.187, + "step": 31300 + }, + { + "epoch": 1.5989560789306174, + "grad_norm": 0.8473970293998718, + "learning_rate": 2.3391539429387562e-05, + "loss": 2.1722, + "step": 31400 + }, + { + "epoch": 1.6040483768300446, + "grad_norm": 0.8617937564849854, + "learning_rate": 2.330652565715646e-05, + "loss": 2.2002, + "step": 31500 + }, + { + "epoch": 1.6091406747294716, + "grad_norm": 0.9672524333000183, + "learning_rate": 2.322151188492536e-05, + "loss": 2.1623, + "step": 31600 + }, + { + "epoch": 1.6142329726288986, + "grad_norm": 0.8769922852516174, + "learning_rate": 2.3136498112694256e-05, + "loss": 2.1695, + "step": 31700 + }, + { + "epoch": 1.6193252705283259, + "grad_norm": 0.8249488472938538, + "learning_rate": 2.3051484340463156e-05, + "loss": 2.1647, + "step": 31800 + }, + { + "epoch": 1.624417568427753, + "grad_norm": 0.9503587484359741, + "learning_rate": 2.2966470568232055e-05, + "loss": 2.2024, + "step": 31900 + }, + { + "epoch": 1.62950986632718, + "grad_norm": 0.9500870108604431, + "learning_rate": 2.288145679600095e-05, + "loss": 2.1467, + "step": 32000 + }, + { + "epoch": 1.634602164226607, + "grad_norm": 0.888297975063324, + "learning_rate": 2.279644302376985e-05, + "loss": 2.1586, + "step": 32100 + }, + { + "epoch": 1.6396944621260343, + "grad_norm": 0.8958535194396973, + "learning_rate": 2.2711429251538753e-05, + "loss": 2.1923, + "step": 32200 + }, + { + "epoch": 1.6447867600254615, + "grad_norm": 0.7949930429458618, + "learning_rate": 2.262641547930765e-05, + "loss": 2.1925, + "step": 32300 + }, + { + "epoch": 1.6498790579248888, + "grad_norm": 0.8516358733177185, + "learning_rate": 2.2541401707076548e-05, + "loss": 2.1818, + "step": 32400 + }, + { + "epoch": 1.6549713558243158, + "grad_norm": 0.9597014784812927, + "learning_rate": 2.2456387934845448e-05, + "loss": 2.1412, + "step": 32500 + }, + { + "epoch": 1.6600636537237428, + "grad_norm": 0.8643897771835327, + "learning_rate": 2.2371374162614344e-05, + "loss": 2.1645, + "step": 32600 + }, + { + "epoch": 1.66515595162317, + "grad_norm": 1.069393515586853, + "learning_rate": 2.2286360390383243e-05, + "loss": 2.1468, + "step": 32700 + }, + { + "epoch": 1.6702482495225972, + "grad_norm": 0.8896872401237488, + "learning_rate": 2.2201346618152142e-05, + "loss": 2.1732, + "step": 32800 + }, + { + "epoch": 1.6753405474220242, + "grad_norm": 0.8662711381912231, + "learning_rate": 2.2116332845921038e-05, + "loss": 2.1901, + "step": 32900 + }, + { + "epoch": 1.6804328453214512, + "grad_norm": 0.7606475353240967, + "learning_rate": 2.2031319073689937e-05, + "loss": 2.2045, + "step": 33000 + }, + { + "epoch": 1.6855251432208784, + "grad_norm": 0.9675360918045044, + "learning_rate": 2.1946305301458837e-05, + "loss": 2.1782, + "step": 33100 + }, + { + "epoch": 1.6906174411203057, + "grad_norm": 0.8184406757354736, + "learning_rate": 2.1861291529227736e-05, + "loss": 2.1827, + "step": 33200 + }, + { + "epoch": 1.6957097390197327, + "grad_norm": 0.8774561882019043, + "learning_rate": 2.1776277756996635e-05, + "loss": 2.1592, + "step": 33300 + }, + { + "epoch": 1.7008020369191597, + "grad_norm": 0.8667624592781067, + "learning_rate": 2.1691263984765535e-05, + "loss": 2.1779, + "step": 33400 + }, + { + "epoch": 1.7058943348185869, + "grad_norm": 0.9804625511169434, + "learning_rate": 2.160625021253443e-05, + "loss": 2.1985, + "step": 33500 + }, + { + "epoch": 1.710986632718014, + "grad_norm": 0.9614706039428711, + "learning_rate": 2.152123644030333e-05, + "loss": 2.1687, + "step": 33600 + }, + { + "epoch": 1.716078930617441, + "grad_norm": 0.8270972967147827, + "learning_rate": 2.143622266807223e-05, + "loss": 2.1539, + "step": 33700 + }, + { + "epoch": 1.721171228516868, + "grad_norm": 0.9252774119377136, + "learning_rate": 2.1351208895841125e-05, + "loss": 2.16, + "step": 33800 + }, + { + "epoch": 1.7262635264162953, + "grad_norm": 0.855818510055542, + "learning_rate": 2.1266195123610025e-05, + "loss": 2.1928, + "step": 33900 + }, + { + "epoch": 1.7313558243157225, + "grad_norm": 0.8505380153656006, + "learning_rate": 2.1181181351378924e-05, + "loss": 2.1748, + "step": 34000 + }, + { + "epoch": 1.7364481222151495, + "grad_norm": 0.8876926898956299, + "learning_rate": 2.1096167579147823e-05, + "loss": 2.2102, + "step": 34100 + }, + { + "epoch": 1.7415404201145765, + "grad_norm": 0.8772891163825989, + "learning_rate": 2.1011153806916723e-05, + "loss": 2.1691, + "step": 34200 + }, + { + "epoch": 1.7466327180140038, + "grad_norm": 0.9799501299858093, + "learning_rate": 2.0926140034685622e-05, + "loss": 2.1858, + "step": 34300 + }, + { + "epoch": 1.751725015913431, + "grad_norm": 0.8863718509674072, + "learning_rate": 2.0841126262454518e-05, + "loss": 2.1745, + "step": 34400 + }, + { + "epoch": 1.7568173138128582, + "grad_norm": 0.8394114375114441, + "learning_rate": 2.0756112490223417e-05, + "loss": 2.1629, + "step": 34500 + }, + { + "epoch": 1.7619096117122852, + "grad_norm": 0.8472095727920532, + "learning_rate": 2.0671098717992317e-05, + "loss": 2.1665, + "step": 34600 + }, + { + "epoch": 1.7670019096117122, + "grad_norm": 0.9460027813911438, + "learning_rate": 2.0586084945761212e-05, + "loss": 2.1458, + "step": 34700 + }, + { + "epoch": 1.7720942075111394, + "grad_norm": 0.9211781620979309, + "learning_rate": 2.0501071173530112e-05, + "loss": 2.1922, + "step": 34800 + }, + { + "epoch": 1.7771865054105667, + "grad_norm": 0.9996361136436462, + "learning_rate": 2.041605740129901e-05, + "loss": 2.1447, + "step": 34900 + }, + { + "epoch": 1.7822788033099937, + "grad_norm": 0.8266726136207581, + "learning_rate": 2.033104362906791e-05, + "loss": 2.1881, + "step": 35000 + }, + { + "epoch": 1.7873711012094207, + "grad_norm": 0.8855674862861633, + "learning_rate": 2.024602985683681e-05, + "loss": 2.198, + "step": 35100 + }, + { + "epoch": 1.7924633991088479, + "grad_norm": 0.9789201021194458, + "learning_rate": 2.016101608460571e-05, + "loss": 2.1685, + "step": 35200 + }, + { + "epoch": 1.797555697008275, + "grad_norm": 0.8354413509368896, + "learning_rate": 2.0076002312374605e-05, + "loss": 2.1535, + "step": 35300 + }, + { + "epoch": 1.8026479949077021, + "grad_norm": 0.9418453574180603, + "learning_rate": 1.9990988540143504e-05, + "loss": 2.1671, + "step": 35400 + }, + { + "epoch": 1.8077402928071291, + "grad_norm": 0.9462503790855408, + "learning_rate": 1.9905974767912404e-05, + "loss": 2.1339, + "step": 35500 + }, + { + "epoch": 1.8128325907065563, + "grad_norm": 0.8490837216377258, + "learning_rate": 1.98209609956813e-05, + "loss": 2.1528, + "step": 35600 + }, + { + "epoch": 1.8179248886059836, + "grad_norm": 0.9105218052864075, + "learning_rate": 1.97359472234502e-05, + "loss": 2.1717, + "step": 35700 + }, + { + "epoch": 1.8230171865054106, + "grad_norm": 0.9058020710945129, + "learning_rate": 1.9650933451219098e-05, + "loss": 2.1535, + "step": 35800 + }, + { + "epoch": 1.8281094844048376, + "grad_norm": 0.9724037647247314, + "learning_rate": 1.9565919678987994e-05, + "loss": 2.166, + "step": 35900 + }, + { + "epoch": 1.8332017823042648, + "grad_norm": 0.9018999338150024, + "learning_rate": 1.9480905906756897e-05, + "loss": 2.1528, + "step": 36000 + }, + { + "epoch": 1.838294080203692, + "grad_norm": 0.9223784804344177, + "learning_rate": 1.9395892134525796e-05, + "loss": 2.1982, + "step": 36100 + }, + { + "epoch": 1.843386378103119, + "grad_norm": 0.8883550763130188, + "learning_rate": 1.9310878362294692e-05, + "loss": 2.1701, + "step": 36200 + }, + { + "epoch": 1.848478676002546, + "grad_norm": 0.8294488787651062, + "learning_rate": 1.922586459006359e-05, + "loss": 2.2064, + "step": 36300 + }, + { + "epoch": 1.8535709739019732, + "grad_norm": 0.8737560510635376, + "learning_rate": 1.914085081783249e-05, + "loss": 2.1529, + "step": 36400 + }, + { + "epoch": 1.8586632718014005, + "grad_norm": 0.8156319260597229, + "learning_rate": 1.9055837045601387e-05, + "loss": 2.1628, + "step": 36500 + }, + { + "epoch": 1.8637555697008275, + "grad_norm": 0.8669657111167908, + "learning_rate": 1.8970823273370286e-05, + "loss": 2.2155, + "step": 36600 + }, + { + "epoch": 1.8688478676002545, + "grad_norm": 0.8657876253128052, + "learning_rate": 1.8885809501139185e-05, + "loss": 2.1506, + "step": 36700 + }, + { + "epoch": 1.8739401654996817, + "grad_norm": 0.8771129250526428, + "learning_rate": 1.880079572890808e-05, + "loss": 2.1797, + "step": 36800 + }, + { + "epoch": 1.879032463399109, + "grad_norm": 0.8845404982566833, + "learning_rate": 1.8715781956676984e-05, + "loss": 2.1434, + "step": 36900 + }, + { + "epoch": 1.8841247612985361, + "grad_norm": 0.9354609251022339, + "learning_rate": 1.8630768184445883e-05, + "loss": 2.1701, + "step": 37000 + }, + { + "epoch": 1.8892170591979631, + "grad_norm": 0.7781304717063904, + "learning_rate": 1.854575441221478e-05, + "loss": 2.2095, + "step": 37100 + }, + { + "epoch": 1.8943093570973901, + "grad_norm": 0.9069561958312988, + "learning_rate": 1.846074063998368e-05, + "loss": 2.169, + "step": 37200 + }, + { + "epoch": 1.8994016549968173, + "grad_norm": 0.9173194766044617, + "learning_rate": 1.8375726867752578e-05, + "loss": 2.121, + "step": 37300 + }, + { + "epoch": 1.9044939528962446, + "grad_norm": 0.864583432674408, + "learning_rate": 1.8290713095521474e-05, + "loss": 2.1711, + "step": 37400 + }, + { + "epoch": 1.9095862507956716, + "grad_norm": 0.7620731592178345, + "learning_rate": 1.8205699323290373e-05, + "loss": 2.1967, + "step": 37500 + }, + { + "epoch": 1.9146785486950986, + "grad_norm": 0.7830232977867126, + "learning_rate": 1.8120685551059273e-05, + "loss": 2.1574, + "step": 37600 + }, + { + "epoch": 1.9197708465945258, + "grad_norm": 0.8825329542160034, + "learning_rate": 1.803567177882817e-05, + "loss": 2.1432, + "step": 37700 + }, + { + "epoch": 1.924863144493953, + "grad_norm": 0.9680500030517578, + "learning_rate": 1.795065800659707e-05, + "loss": 2.1808, + "step": 37800 + }, + { + "epoch": 1.92995544239338, + "grad_norm": 0.9914782047271729, + "learning_rate": 1.786564423436597e-05, + "loss": 2.1945, + "step": 37900 + }, + { + "epoch": 1.935047740292807, + "grad_norm": 0.882604718208313, + "learning_rate": 1.7780630462134867e-05, + "loss": 2.15, + "step": 38000 + }, + { + "epoch": 1.9401400381922342, + "grad_norm": 0.8211714625358582, + "learning_rate": 1.7695616689903766e-05, + "loss": 2.178, + "step": 38100 + }, + { + "epoch": 1.9452323360916615, + "grad_norm": 0.9662156701087952, + "learning_rate": 1.7610602917672665e-05, + "loss": 2.164, + "step": 38200 + }, + { + "epoch": 1.9503246339910885, + "grad_norm": 0.8627343773841858, + "learning_rate": 1.752558914544156e-05, + "loss": 2.1977, + "step": 38300 + }, + { + "epoch": 1.9554169318905155, + "grad_norm": 0.8883799910545349, + "learning_rate": 1.744057537321046e-05, + "loss": 2.1813, + "step": 38400 + }, + { + "epoch": 1.9605092297899427, + "grad_norm": 0.9309747219085693, + "learning_rate": 1.735556160097936e-05, + "loss": 2.1686, + "step": 38500 + }, + { + "epoch": 1.96560152768937, + "grad_norm": 0.9126595854759216, + "learning_rate": 1.7270547828748256e-05, + "loss": 2.1571, + "step": 38600 + }, + { + "epoch": 1.970693825588797, + "grad_norm": 0.9490466117858887, + "learning_rate": 1.718553405651716e-05, + "loss": 2.1589, + "step": 38700 + }, + { + "epoch": 1.975786123488224, + "grad_norm": 0.8641236424446106, + "learning_rate": 1.7100520284286058e-05, + "loss": 2.1728, + "step": 38800 + }, + { + "epoch": 1.9808784213876511, + "grad_norm": 1.040710210800171, + "learning_rate": 1.7015506512054954e-05, + "loss": 2.1888, + "step": 38900 + }, + { + "epoch": 1.9859707192870784, + "grad_norm": 0.8207067251205444, + "learning_rate": 1.6930492739823853e-05, + "loss": 2.1504, + "step": 39000 + }, + { + "epoch": 1.9910630171865054, + "grad_norm": 0.8451477289199829, + "learning_rate": 1.6845478967592752e-05, + "loss": 2.1791, + "step": 39100 + }, + { + "epoch": 1.9961553150859326, + "grad_norm": 0.9080301523208618, + "learning_rate": 1.6760465195361648e-05, + "loss": 2.181, + "step": 39200 + }, + { + "epoch": 2.0012221514958624, + "grad_norm": 0.9207277297973633, + "learning_rate": 1.6675451423130548e-05, + "loss": 2.1537, + "step": 39300 + }, + { + "epoch": 2.0063144493952896, + "grad_norm": 0.8472919464111328, + "learning_rate": 1.6590437650899447e-05, + "loss": 2.1691, + "step": 39400 + }, + { + "epoch": 2.011406747294717, + "grad_norm": 0.9604014754295349, + "learning_rate": 1.6505423878668343e-05, + "loss": 2.1409, + "step": 39500 + }, + { + "epoch": 2.016499045194144, + "grad_norm": 0.957785964012146, + "learning_rate": 1.6420410106437242e-05, + "loss": 2.126, + "step": 39600 + }, + { + "epoch": 2.021591343093571, + "grad_norm": 0.8542806506156921, + "learning_rate": 1.6335396334206145e-05, + "loss": 2.1532, + "step": 39700 + }, + { + "epoch": 2.026683640992998, + "grad_norm": 0.9949219822883606, + "learning_rate": 1.625038256197504e-05, + "loss": 2.1437, + "step": 39800 + }, + { + "epoch": 2.0317759388924252, + "grad_norm": 0.8735845685005188, + "learning_rate": 1.616536878974394e-05, + "loss": 2.133, + "step": 39900 + }, + { + "epoch": 2.0368682367918525, + "grad_norm": 0.9472355842590332, + "learning_rate": 1.608035501751284e-05, + "loss": 2.146, + "step": 40000 + }, + { + "epoch": 2.0419605346912792, + "grad_norm": 0.9042348861694336, + "learning_rate": 1.5995341245281735e-05, + "loss": 2.1288, + "step": 40100 + }, + { + "epoch": 2.0470528325907065, + "grad_norm": 0.8667154908180237, + "learning_rate": 1.5910327473050635e-05, + "loss": 2.1182, + "step": 40200 + }, + { + "epoch": 2.0521451304901337, + "grad_norm": 0.9168582558631897, + "learning_rate": 1.5825313700819534e-05, + "loss": 2.1227, + "step": 40300 + }, + { + "epoch": 2.057237428389561, + "grad_norm": 0.8843423128128052, + "learning_rate": 1.574029992858843e-05, + "loss": 2.1564, + "step": 40400 + }, + { + "epoch": 2.0623297262889877, + "grad_norm": 0.8709278106689453, + "learning_rate": 1.565528615635733e-05, + "loss": 2.129, + "step": 40500 + }, + { + "epoch": 2.067422024188415, + "grad_norm": 1.0448068380355835, + "learning_rate": 1.5570272384126232e-05, + "loss": 2.1259, + "step": 40600 + }, + { + "epoch": 2.072514322087842, + "grad_norm": 1.014841914176941, + "learning_rate": 1.5485258611895128e-05, + "loss": 2.1526, + "step": 40700 + }, + { + "epoch": 2.0776066199872694, + "grad_norm": 0.9346544146537781, + "learning_rate": 1.5400244839664027e-05, + "loss": 2.1349, + "step": 40800 + }, + { + "epoch": 2.082698917886696, + "grad_norm": 1.029351830482483, + "learning_rate": 1.5315231067432927e-05, + "loss": 2.1224, + "step": 40900 + }, + { + "epoch": 2.0877912157861234, + "grad_norm": 0.8560373783111572, + "learning_rate": 1.5230217295201824e-05, + "loss": 2.0945, + "step": 41000 + }, + { + "epoch": 2.0928835136855506, + "grad_norm": 0.8771845698356628, + "learning_rate": 1.5145203522970722e-05, + "loss": 2.1215, + "step": 41100 + }, + { + "epoch": 2.097975811584978, + "grad_norm": 0.7786750197410583, + "learning_rate": 1.506018975073962e-05, + "loss": 2.1119, + "step": 41200 + }, + { + "epoch": 2.103068109484405, + "grad_norm": 0.8961013555526733, + "learning_rate": 1.4975175978508519e-05, + "loss": 2.1284, + "step": 41300 + }, + { + "epoch": 2.108160407383832, + "grad_norm": 0.7917054295539856, + "learning_rate": 1.4890162206277417e-05, + "loss": 2.1663, + "step": 41400 + }, + { + "epoch": 2.113252705283259, + "grad_norm": 0.9229695200920105, + "learning_rate": 1.4805148434046318e-05, + "loss": 2.1255, + "step": 41500 + }, + { + "epoch": 2.1183450031826863, + "grad_norm": 0.8761498332023621, + "learning_rate": 1.4720134661815215e-05, + "loss": 2.1271, + "step": 41600 + }, + { + "epoch": 2.1234373010821135, + "grad_norm": 0.8369442820549011, + "learning_rate": 1.4635120889584114e-05, + "loss": 2.1381, + "step": 41700 + }, + { + "epoch": 2.1285295989815403, + "grad_norm": 1.058815836906433, + "learning_rate": 1.4550107117353012e-05, + "loss": 2.1253, + "step": 41800 + }, + { + "epoch": 2.1336218968809675, + "grad_norm": 0.8793694972991943, + "learning_rate": 1.4465093345121911e-05, + "loss": 2.1327, + "step": 41900 + }, + { + "epoch": 2.1387141947803947, + "grad_norm": 0.9903535842895508, + "learning_rate": 1.4380079572890809e-05, + "loss": 2.1585, + "step": 42000 + }, + { + "epoch": 2.143806492679822, + "grad_norm": 0.8910212516784668, + "learning_rate": 1.4295065800659707e-05, + "loss": 2.1482, + "step": 42100 + }, + { + "epoch": 2.1488987905792487, + "grad_norm": 0.9088174700737, + "learning_rate": 1.4210052028428606e-05, + "loss": 2.1391, + "step": 42200 + }, + { + "epoch": 2.153991088478676, + "grad_norm": 0.9213513731956482, + "learning_rate": 1.4125038256197504e-05, + "loss": 2.1447, + "step": 42300 + }, + { + "epoch": 2.159083386378103, + "grad_norm": 0.9317104816436768, + "learning_rate": 1.4040024483966401e-05, + "loss": 2.1115, + "step": 42400 + }, + { + "epoch": 2.1641756842775304, + "grad_norm": 0.7989690899848938, + "learning_rate": 1.3955010711735302e-05, + "loss": 2.1385, + "step": 42500 + }, + { + "epoch": 2.169267982176957, + "grad_norm": 0.8436581492424011, + "learning_rate": 1.3869996939504202e-05, + "loss": 2.1487, + "step": 42600 + }, + { + "epoch": 2.1743602800763844, + "grad_norm": 0.9113427400588989, + "learning_rate": 1.37849831672731e-05, + "loss": 2.0851, + "step": 42700 + }, + { + "epoch": 2.1794525779758116, + "grad_norm": 0.8313522338867188, + "learning_rate": 1.3699969395041997e-05, + "loss": 2.1502, + "step": 42800 + }, + { + "epoch": 2.184544875875239, + "grad_norm": 0.9525701999664307, + "learning_rate": 1.3614955622810896e-05, + "loss": 2.1206, + "step": 42900 + }, + { + "epoch": 2.189637173774666, + "grad_norm": 0.9474479556083679, + "learning_rate": 1.3529941850579794e-05, + "loss": 2.1117, + "step": 43000 + }, + { + "epoch": 2.194729471674093, + "grad_norm": 0.8311910629272461, + "learning_rate": 1.3444928078348693e-05, + "loss": 2.1268, + "step": 43100 + }, + { + "epoch": 2.19982176957352, + "grad_norm": 0.879364013671875, + "learning_rate": 1.335991430611759e-05, + "loss": 2.1426, + "step": 43200 + }, + { + "epoch": 2.2049140674729473, + "grad_norm": 0.8633144497871399, + "learning_rate": 1.3274900533886488e-05, + "loss": 2.1324, + "step": 43300 + }, + { + "epoch": 2.210006365372374, + "grad_norm": 0.8333730697631836, + "learning_rate": 1.318988676165539e-05, + "loss": 2.1246, + "step": 43400 + }, + { + "epoch": 2.2150986632718013, + "grad_norm": 0.8649702072143555, + "learning_rate": 1.3104872989424289e-05, + "loss": 2.122, + "step": 43500 + }, + { + "epoch": 2.2201909611712285, + "grad_norm": 0.8680943846702576, + "learning_rate": 1.3019859217193186e-05, + "loss": 2.1295, + "step": 43600 + }, + { + "epoch": 2.2252832590706557, + "grad_norm": 0.9396230578422546, + "learning_rate": 1.2934845444962084e-05, + "loss": 2.1458, + "step": 43700 + }, + { + "epoch": 2.230375556970083, + "grad_norm": 0.9014144539833069, + "learning_rate": 1.2849831672730983e-05, + "loss": 2.1573, + "step": 43800 + }, + { + "epoch": 2.2354678548695097, + "grad_norm": 0.9344182014465332, + "learning_rate": 1.2764817900499881e-05, + "loss": 2.1516, + "step": 43900 + }, + { + "epoch": 2.240560152768937, + "grad_norm": 0.979686439037323, + "learning_rate": 1.267980412826878e-05, + "loss": 2.1307, + "step": 44000 + }, + { + "epoch": 2.245652450668364, + "grad_norm": 0.8325761556625366, + "learning_rate": 1.2594790356037678e-05, + "loss": 2.1498, + "step": 44100 + }, + { + "epoch": 2.2507447485677914, + "grad_norm": 0.8997836709022522, + "learning_rate": 1.2509776583806576e-05, + "loss": 2.1494, + "step": 44200 + }, + { + "epoch": 2.255837046467218, + "grad_norm": 0.8690670132637024, + "learning_rate": 1.2424762811575475e-05, + "loss": 2.1393, + "step": 44300 + }, + { + "epoch": 2.2609293443666454, + "grad_norm": 0.7817577719688416, + "learning_rate": 1.2339749039344374e-05, + "loss": 2.1341, + "step": 44400 + }, + { + "epoch": 2.2660216422660726, + "grad_norm": 0.8697742223739624, + "learning_rate": 1.2254735267113272e-05, + "loss": 2.1469, + "step": 44500 + }, + { + "epoch": 2.2711139401655, + "grad_norm": 0.8965489268302917, + "learning_rate": 1.2169721494882171e-05, + "loss": 2.1257, + "step": 44600 + }, + { + "epoch": 2.2762062380649266, + "grad_norm": 1.0732325315475464, + "learning_rate": 1.208470772265107e-05, + "loss": 2.1131, + "step": 44700 + }, + { + "epoch": 2.281298535964354, + "grad_norm": 0.7745924592018127, + "learning_rate": 1.1999693950419968e-05, + "loss": 2.1153, + "step": 44800 + }, + { + "epoch": 2.286390833863781, + "grad_norm": 0.8988758325576782, + "learning_rate": 1.1914680178188868e-05, + "loss": 2.1545, + "step": 44900 + }, + { + "epoch": 2.2914831317632083, + "grad_norm": 0.9772248268127441, + "learning_rate": 1.1829666405957767e-05, + "loss": 2.1333, + "step": 45000 + }, + { + "epoch": 2.296575429662635, + "grad_norm": 0.8579228520393372, + "learning_rate": 1.1744652633726664e-05, + "loss": 2.1122, + "step": 45100 + }, + { + "epoch": 2.3016677275620623, + "grad_norm": 0.8738901019096375, + "learning_rate": 1.1659638861495562e-05, + "loss": 2.0938, + "step": 45200 + }, + { + "epoch": 2.3067600254614895, + "grad_norm": 0.8962051868438721, + "learning_rate": 1.1574625089264461e-05, + "loss": 2.1216, + "step": 45300 + }, + { + "epoch": 2.3118523233609167, + "grad_norm": 0.8730968236923218, + "learning_rate": 1.1489611317033359e-05, + "loss": 2.1067, + "step": 45400 + }, + { + "epoch": 2.316944621260344, + "grad_norm": 0.9516613483428955, + "learning_rate": 1.1404597544802258e-05, + "loss": 2.1092, + "step": 45500 + }, + { + "epoch": 2.3220369191597707, + "grad_norm": 1.0411871671676636, + "learning_rate": 1.1319583772571158e-05, + "loss": 2.1199, + "step": 45600 + }, + { + "epoch": 2.327129217059198, + "grad_norm": 0.9724430441856384, + "learning_rate": 1.1234570000340055e-05, + "loss": 2.1335, + "step": 45700 + }, + { + "epoch": 2.332221514958625, + "grad_norm": 0.8349046111106873, + "learning_rate": 1.1149556228108955e-05, + "loss": 2.1249, + "step": 45800 + }, + { + "epoch": 2.337313812858052, + "grad_norm": 0.8713769316673279, + "learning_rate": 1.1064542455877852e-05, + "loss": 2.1054, + "step": 45900 + }, + { + "epoch": 2.342406110757479, + "grad_norm": 0.8659300208091736, + "learning_rate": 1.0979528683646752e-05, + "loss": 2.1095, + "step": 46000 + }, + { + "epoch": 2.3474984086569064, + "grad_norm": 1.0436406135559082, + "learning_rate": 1.089451491141565e-05, + "loss": 2.1337, + "step": 46100 + }, + { + "epoch": 2.3525907065563336, + "grad_norm": 0.8275535106658936, + "learning_rate": 1.0809501139184549e-05, + "loss": 2.1209, + "step": 46200 + }, + { + "epoch": 2.357683004455761, + "grad_norm": 0.9503908157348633, + "learning_rate": 1.0724487366953446e-05, + "loss": 2.1262, + "step": 46300 + }, + { + "epoch": 2.3627753023551876, + "grad_norm": 0.8849694728851318, + "learning_rate": 1.0639473594722346e-05, + "loss": 2.121, + "step": 46400 + }, + { + "epoch": 2.367867600254615, + "grad_norm": 0.8742644786834717, + "learning_rate": 1.0554459822491245e-05, + "loss": 2.1421, + "step": 46500 + }, + { + "epoch": 2.372959898154042, + "grad_norm": 0.8519076704978943, + "learning_rate": 1.0469446050260143e-05, + "loss": 2.1046, + "step": 46600 + }, + { + "epoch": 2.3780521960534693, + "grad_norm": 0.8561546206474304, + "learning_rate": 1.038443227802904e-05, + "loss": 2.1262, + "step": 46700 + }, + { + "epoch": 2.383144493952896, + "grad_norm": 0.8309553265571594, + "learning_rate": 1.029941850579794e-05, + "loss": 2.138, + "step": 46800 + }, + { + "epoch": 2.3882367918523233, + "grad_norm": 1.0880669355392456, + "learning_rate": 1.0214404733566839e-05, + "loss": 2.0881, + "step": 46900 + }, + { + "epoch": 2.3933290897517505, + "grad_norm": 0.9982330799102783, + "learning_rate": 1.0129390961335736e-05, + "loss": 2.1086, + "step": 47000 + }, + { + "epoch": 2.3984213876511777, + "grad_norm": 0.9612807035446167, + "learning_rate": 1.0044377189104636e-05, + "loss": 2.1207, + "step": 47100 + }, + { + "epoch": 2.4035136855506045, + "grad_norm": 0.848710298538208, + "learning_rate": 9.959363416873533e-06, + "loss": 2.1301, + "step": 47200 + }, + { + "epoch": 2.4086059834500317, + "grad_norm": 0.8840051889419556, + "learning_rate": 9.874349644642433e-06, + "loss": 2.1118, + "step": 47300 + }, + { + "epoch": 2.413698281349459, + "grad_norm": 0.916346549987793, + "learning_rate": 9.789335872411332e-06, + "loss": 2.128, + "step": 47400 + }, + { + "epoch": 2.418790579248886, + "grad_norm": 0.8974706530570984, + "learning_rate": 9.70432210018023e-06, + "loss": 2.1452, + "step": 47500 + }, + { + "epoch": 2.423882877148313, + "grad_norm": 1.0237131118774414, + "learning_rate": 9.619308327949127e-06, + "loss": 2.121, + "step": 47600 + }, + { + "epoch": 2.42897517504774, + "grad_norm": 0.9156752228736877, + "learning_rate": 9.534294555718027e-06, + "loss": 2.0985, + "step": 47700 + }, + { + "epoch": 2.4340674729471674, + "grad_norm": 0.9210427403450012, + "learning_rate": 9.449280783486926e-06, + "loss": 2.0653, + "step": 47800 + }, + { + "epoch": 2.4391597708465946, + "grad_norm": 0.8185928463935852, + "learning_rate": 9.364267011255824e-06, + "loss": 2.0994, + "step": 47900 + }, + { + "epoch": 2.444252068746022, + "grad_norm": 0.923605740070343, + "learning_rate": 9.279253239024723e-06, + "loss": 2.1402, + "step": 48000 + }, + { + "epoch": 2.4493443666454486, + "grad_norm": 0.8515633344650269, + "learning_rate": 9.19423946679362e-06, + "loss": 2.1273, + "step": 48100 + }, + { + "epoch": 2.454436664544876, + "grad_norm": 0.8325629830360413, + "learning_rate": 9.109225694562518e-06, + "loss": 2.0974, + "step": 48200 + }, + { + "epoch": 2.459528962444303, + "grad_norm": 0.8125095963478088, + "learning_rate": 9.02421192233142e-06, + "loss": 2.1157, + "step": 48300 + }, + { + "epoch": 2.4646212603437303, + "grad_norm": 0.8951058387756348, + "learning_rate": 8.939198150100317e-06, + "loss": 2.1111, + "step": 48400 + }, + { + "epoch": 2.469713558243157, + "grad_norm": 0.8785336017608643, + "learning_rate": 8.854184377869214e-06, + "loss": 2.1412, + "step": 48500 + }, + { + "epoch": 2.4748058561425843, + "grad_norm": 0.9884998202323914, + "learning_rate": 8.769170605638114e-06, + "loss": 2.1403, + "step": 48600 + }, + { + "epoch": 2.4798981540420115, + "grad_norm": 0.9092361330986023, + "learning_rate": 8.684156833407011e-06, + "loss": 2.1341, + "step": 48700 + }, + { + "epoch": 2.4849904519414387, + "grad_norm": 0.9467695951461792, + "learning_rate": 8.59914306117591e-06, + "loss": 2.1098, + "step": 48800 + }, + { + "epoch": 2.4900827498408655, + "grad_norm": 0.8339031338691711, + "learning_rate": 8.51412928894481e-06, + "loss": 2.1146, + "step": 48900 + }, + { + "epoch": 2.4951750477402928, + "grad_norm": 0.8132495284080505, + "learning_rate": 8.429115516713708e-06, + "loss": 2.1721, + "step": 49000 + }, + { + "epoch": 2.50026734563972, + "grad_norm": 0.9209297895431519, + "learning_rate": 8.344101744482605e-06, + "loss": 2.0942, + "step": 49100 + }, + { + "epoch": 2.505359643539147, + "grad_norm": 0.9470928311347961, + "learning_rate": 8.259087972251506e-06, + "loss": 2.0926, + "step": 49200 + }, + { + "epoch": 2.510451941438574, + "grad_norm": 0.9337894320487976, + "learning_rate": 8.174074200020404e-06, + "loss": 2.1189, + "step": 49300 + }, + { + "epoch": 2.515544239338001, + "grad_norm": 0.9764918088912964, + "learning_rate": 8.089060427789302e-06, + "loss": 2.1185, + "step": 49400 + }, + { + "epoch": 2.5206365372374284, + "grad_norm": 0.894453763961792, + "learning_rate": 8.004046655558201e-06, + "loss": 2.1289, + "step": 49500 + }, + { + "epoch": 2.5257288351368556, + "grad_norm": 0.8645434379577637, + "learning_rate": 7.919032883327099e-06, + "loss": 2.1025, + "step": 49600 + }, + { + "epoch": 2.530821133036283, + "grad_norm": 0.8322845101356506, + "learning_rate": 7.834019111095998e-06, + "loss": 2.1128, + "step": 49700 + }, + { + "epoch": 2.5359134309357096, + "grad_norm": 1.0294426679611206, + "learning_rate": 7.749005338864897e-06, + "loss": 2.1348, + "step": 49800 + }, + { + "epoch": 2.541005728835137, + "grad_norm": 0.9489388465881348, + "learning_rate": 7.663991566633795e-06, + "loss": 2.1089, + "step": 49900 + }, + { + "epoch": 2.546098026734564, + "grad_norm": 0.9332979917526245, + "learning_rate": 7.578977794402693e-06, + "loss": 2.1677, + "step": 50000 + }, + { + "epoch": 2.551190324633991, + "grad_norm": 0.8114882111549377, + "learning_rate": 7.493964022171592e-06, + "loss": 2.1265, + "step": 50100 + }, + { + "epoch": 2.556282622533418, + "grad_norm": 0.8496439456939697, + "learning_rate": 7.408950249940491e-06, + "loss": 2.1713, + "step": 50200 + }, + { + "epoch": 2.5613749204328453, + "grad_norm": 1.149905800819397, + "learning_rate": 7.32393647770939e-06, + "loss": 2.1234, + "step": 50300 + }, + { + "epoch": 2.5664672183322725, + "grad_norm": 1.0552695989608765, + "learning_rate": 7.238922705478287e-06, + "loss": 2.1398, + "step": 50400 + }, + { + "epoch": 2.5715595162316998, + "grad_norm": 0.9433385133743286, + "learning_rate": 7.153908933247186e-06, + "loss": 2.0986, + "step": 50500 + }, + { + "epoch": 2.5766518141311265, + "grad_norm": 0.889086127281189, + "learning_rate": 7.068895161016086e-06, + "loss": 2.1338, + "step": 50600 + }, + { + "epoch": 2.5817441120305538, + "grad_norm": 0.8793154358863831, + "learning_rate": 6.9838813887849835e-06, + "loss": 2.1095, + "step": 50700 + }, + { + "epoch": 2.586836409929981, + "grad_norm": 0.7565730214118958, + "learning_rate": 6.898867616553882e-06, + "loss": 2.1219, + "step": 50800 + }, + { + "epoch": 2.5919287078294078, + "grad_norm": 0.8305276036262512, + "learning_rate": 6.8138538443227805e-06, + "loss": 2.099, + "step": 50900 + }, + { + "epoch": 2.597021005728835, + "grad_norm": 0.9467841386795044, + "learning_rate": 6.728840072091679e-06, + "loss": 2.123, + "step": 51000 + }, + { + "epoch": 2.602113303628262, + "grad_norm": 0.9913722276687622, + "learning_rate": 6.643826299860578e-06, + "loss": 2.1189, + "step": 51100 + }, + { + "epoch": 2.6072056015276894, + "grad_norm": 0.9008012413978577, + "learning_rate": 6.558812527629477e-06, + "loss": 2.155, + "step": 51200 + }, + { + "epoch": 2.6122978994271167, + "grad_norm": 0.9230712056159973, + "learning_rate": 6.473798755398374e-06, + "loss": 2.1333, + "step": 51300 + }, + { + "epoch": 2.617390197326544, + "grad_norm": 1.0198971033096313, + "learning_rate": 6.388784983167273e-06, + "loss": 2.1374, + "step": 51400 + }, + { + "epoch": 2.6224824952259707, + "grad_norm": 0.9199273586273193, + "learning_rate": 6.303771210936171e-06, + "loss": 2.1332, + "step": 51500 + }, + { + "epoch": 2.627574793125398, + "grad_norm": 0.8723760843276978, + "learning_rate": 6.21875743870507e-06, + "loss": 2.1547, + "step": 51600 + }, + { + "epoch": 2.632667091024825, + "grad_norm": 0.9192347526550293, + "learning_rate": 6.133743666473969e-06, + "loss": 2.1192, + "step": 51700 + }, + { + "epoch": 2.637759388924252, + "grad_norm": 0.9517456889152527, + "learning_rate": 6.048729894242868e-06, + "loss": 2.1143, + "step": 51800 + }, + { + "epoch": 2.642851686823679, + "grad_norm": 0.9906876683235168, + "learning_rate": 5.963716122011766e-06, + "loss": 2.1171, + "step": 51900 + }, + { + "epoch": 2.6479439847231063, + "grad_norm": 0.9755644202232361, + "learning_rate": 5.878702349780665e-06, + "loss": 2.163, + "step": 52000 + }, + { + "epoch": 2.6530362826225335, + "grad_norm": 0.9300287961959839, + "learning_rate": 5.793688577549564e-06, + "loss": 2.1218, + "step": 52100 + }, + { + "epoch": 2.6581285805219608, + "grad_norm": 0.8865501284599304, + "learning_rate": 5.7086748053184616e-06, + "loss": 2.1356, + "step": 52200 + }, + { + "epoch": 2.6632208784213875, + "grad_norm": 0.8156447410583496, + "learning_rate": 5.62366103308736e-06, + "loss": 2.1171, + "step": 52300 + }, + { + "epoch": 2.6683131763208148, + "grad_norm": 0.8186530470848083, + "learning_rate": 5.538647260856259e-06, + "loss": 2.1052, + "step": 52400 + }, + { + "epoch": 2.673405474220242, + "grad_norm": 0.790550708770752, + "learning_rate": 5.453633488625157e-06, + "loss": 2.1071, + "step": 52500 + }, + { + "epoch": 2.6784977721196688, + "grad_norm": 0.8866438865661621, + "learning_rate": 5.368619716394056e-06, + "loss": 2.1354, + "step": 52600 + }, + { + "epoch": 2.683590070019096, + "grad_norm": 0.9953215718269348, + "learning_rate": 5.283605944162955e-06, + "loss": 2.1383, + "step": 52700 + }, + { + "epoch": 2.688682367918523, + "grad_norm": 0.9829987287521362, + "learning_rate": 5.198592171931853e-06, + "loss": 2.0919, + "step": 52800 + }, + { + "epoch": 2.6937746658179504, + "grad_norm": 0.9085790514945984, + "learning_rate": 5.113578399700752e-06, + "loss": 2.1178, + "step": 52900 + }, + { + "epoch": 2.6988669637173777, + "grad_norm": 0.8004271388053894, + "learning_rate": 5.02856462746965e-06, + "loss": 2.1239, + "step": 53000 + }, + { + "epoch": 2.7039592616168044, + "grad_norm": 0.9412344098091125, + "learning_rate": 4.943550855238549e-06, + "loss": 2.108, + "step": 53100 + }, + { + "epoch": 2.7090515595162317, + "grad_norm": 0.9245398640632629, + "learning_rate": 4.858537083007447e-06, + "loss": 2.1241, + "step": 53200 + }, + { + "epoch": 2.714143857415659, + "grad_norm": 0.9695274233818054, + "learning_rate": 4.7735233107763465e-06, + "loss": 2.1106, + "step": 53300 + }, + { + "epoch": 2.7192361553150857, + "grad_norm": 0.9269813895225525, + "learning_rate": 4.688509538545244e-06, + "loss": 2.1075, + "step": 53400 + }, + { + "epoch": 2.724328453214513, + "grad_norm": 0.9783353805541992, + "learning_rate": 4.6034957663141435e-06, + "loss": 2.1127, + "step": 53500 + }, + { + "epoch": 2.72942075111394, + "grad_norm": 0.9476038813591003, + "learning_rate": 4.518481994083042e-06, + "loss": 2.1284, + "step": 53600 + }, + { + "epoch": 2.7345130490133673, + "grad_norm": 0.93116295337677, + "learning_rate": 4.43346822185194e-06, + "loss": 2.1004, + "step": 53700 + }, + { + "epoch": 2.7396053469127946, + "grad_norm": 0.9898892641067505, + "learning_rate": 4.348454449620839e-06, + "loss": 2.1129, + "step": 53800 + }, + { + "epoch": 2.744697644812222, + "grad_norm": 0.9059526920318604, + "learning_rate": 4.263440677389737e-06, + "loss": 2.1189, + "step": 53900 + }, + { + "epoch": 2.7497899427116486, + "grad_norm": 0.8806390762329102, + "learning_rate": 4.178426905158636e-06, + "loss": 2.1416, + "step": 54000 + }, + { + "epoch": 2.754882240611076, + "grad_norm": 0.9231753945350647, + "learning_rate": 4.093413132927534e-06, + "loss": 2.1373, + "step": 54100 + }, + { + "epoch": 2.759974538510503, + "grad_norm": 0.7574446201324463, + "learning_rate": 4.008399360696434e-06, + "loss": 2.1355, + "step": 54200 + }, + { + "epoch": 2.76506683640993, + "grad_norm": 0.8553287982940674, + "learning_rate": 3.923385588465331e-06, + "loss": 2.0786, + "step": 54300 + }, + { + "epoch": 2.770159134309357, + "grad_norm": 0.7898595333099365, + "learning_rate": 3.83837181623423e-06, + "loss": 2.0941, + "step": 54400 + }, + { + "epoch": 2.7752514322087842, + "grad_norm": 0.8895372748374939, + "learning_rate": 3.7533580440031287e-06, + "loss": 2.1311, + "step": 54500 + }, + { + "epoch": 2.7803437301082115, + "grad_norm": 0.9352322816848755, + "learning_rate": 3.668344271772027e-06, + "loss": 2.102, + "step": 54600 + }, + { + "epoch": 2.7854360280076387, + "grad_norm": 1.003927230834961, + "learning_rate": 3.583330499540926e-06, + "loss": 2.1119, + "step": 54700 + }, + { + "epoch": 2.7905283259070655, + "grad_norm": 0.9228959083557129, + "learning_rate": 3.498316727309824e-06, + "loss": 2.142, + "step": 54800 + }, + { + "epoch": 2.7956206238064927, + "grad_norm": 0.9431111812591553, + "learning_rate": 3.413302955078723e-06, + "loss": 2.12, + "step": 54900 + }, + { + "epoch": 2.80071292170592, + "grad_norm": 0.9116231799125671, + "learning_rate": 3.3282891828476215e-06, + "loss": 2.1261, + "step": 55000 + }, + { + "epoch": 2.8058052196053467, + "grad_norm": 0.9542424082756042, + "learning_rate": 3.2432754106165196e-06, + "loss": 2.1151, + "step": 55100 + }, + { + "epoch": 2.810897517504774, + "grad_norm": 0.8199505805969238, + "learning_rate": 3.1582616383854185e-06, + "loss": 2.0883, + "step": 55200 + }, + { + "epoch": 2.815989815404201, + "grad_norm": 0.8526725769042969, + "learning_rate": 3.0732478661543174e-06, + "loss": 2.1094, + "step": 55300 + }, + { + "epoch": 2.8210821133036283, + "grad_norm": 0.9284189343452454, + "learning_rate": 2.9882340939232155e-06, + "loss": 2.1072, + "step": 55400 + }, + { + "epoch": 2.8261744112030556, + "grad_norm": 0.9289183616638184, + "learning_rate": 2.9032203216921144e-06, + "loss": 2.1227, + "step": 55500 + }, + { + "epoch": 2.8312667091024823, + "grad_norm": 1.0548968315124512, + "learning_rate": 2.818206549461013e-06, + "loss": 2.138, + "step": 55600 + }, + { + "epoch": 2.8363590070019096, + "grad_norm": 0.8402355313301086, + "learning_rate": 2.7331927772299113e-06, + "loss": 2.1394, + "step": 55700 + }, + { + "epoch": 2.841451304901337, + "grad_norm": 0.9172413349151611, + "learning_rate": 2.64817900499881e-06, + "loss": 2.114, + "step": 55800 + }, + { + "epoch": 2.8465436028007636, + "grad_norm": 0.8457333445549011, + "learning_rate": 2.5631652327677087e-06, + "loss": 2.1268, + "step": 55900 + }, + { + "epoch": 2.851635900700191, + "grad_norm": 0.8858858942985535, + "learning_rate": 2.478151460536607e-06, + "loss": 2.0901, + "step": 56000 + }, + { + "epoch": 2.856728198599618, + "grad_norm": 0.8789589405059814, + "learning_rate": 2.3931376883055057e-06, + "loss": 2.1154, + "step": 56100 + }, + { + "epoch": 2.8618204964990452, + "grad_norm": 0.9234612584114075, + "learning_rate": 2.308123916074404e-06, + "loss": 2.1106, + "step": 56200 + }, + { + "epoch": 2.8669127943984725, + "grad_norm": 0.8070857524871826, + "learning_rate": 2.2231101438433026e-06, + "loss": 2.1181, + "step": 56300 + }, + { + "epoch": 2.8720050922978997, + "grad_norm": 0.9172016978263855, + "learning_rate": 2.138096371612201e-06, + "loss": 2.0832, + "step": 56400 + }, + { + "epoch": 2.8770973901973265, + "grad_norm": 0.9449873566627502, + "learning_rate": 2.0530825993811e-06, + "loss": 2.126, + "step": 56500 + }, + { + "epoch": 2.8821896880967537, + "grad_norm": 1.0262093544006348, + "learning_rate": 1.9680688271499985e-06, + "loss": 2.1117, + "step": 56600 + }, + { + "epoch": 2.887281985996181, + "grad_norm": 0.7934767007827759, + "learning_rate": 1.8830550549188972e-06, + "loss": 2.1256, + "step": 56700 + }, + { + "epoch": 2.8923742838956077, + "grad_norm": 0.9590465426445007, + "learning_rate": 1.7980412826877954e-06, + "loss": 2.1335, + "step": 56800 + }, + { + "epoch": 2.897466581795035, + "grad_norm": 1.006219744682312, + "learning_rate": 1.713027510456694e-06, + "loss": 2.0888, + "step": 56900 + }, + { + "epoch": 2.902558879694462, + "grad_norm": 0.9063106179237366, + "learning_rate": 1.6280137382255926e-06, + "loss": 2.1506, + "step": 57000 + }, + { + "epoch": 2.9076511775938894, + "grad_norm": 0.8653075695037842, + "learning_rate": 1.542999965994491e-06, + "loss": 2.0845, + "step": 57100 + }, + { + "epoch": 2.9127434754933166, + "grad_norm": 0.9707706570625305, + "learning_rate": 1.4579861937633898e-06, + "loss": 2.0865, + "step": 57200 + }, + { + "epoch": 2.9178357733927434, + "grad_norm": 0.9578688740730286, + "learning_rate": 1.3729724215322882e-06, + "loss": 2.1098, + "step": 57300 + }, + { + "epoch": 2.9229280712921706, + "grad_norm": 0.8037517070770264, + "learning_rate": 1.2879586493011867e-06, + "loss": 2.085, + "step": 57400 + }, + { + "epoch": 2.928020369191598, + "grad_norm": 0.9694920182228088, + "learning_rate": 1.2029448770700854e-06, + "loss": 2.0926, + "step": 57500 + }, + { + "epoch": 2.9331126670910246, + "grad_norm": 0.8718476891517639, + "learning_rate": 1.1179311048389841e-06, + "loss": 2.1112, + "step": 57600 + }, + { + "epoch": 2.938204964990452, + "grad_norm": 0.8940988779067993, + "learning_rate": 1.0329173326078824e-06, + "loss": 2.12, + "step": 57700 + }, + { + "epoch": 2.943297262889879, + "grad_norm": 0.9514064192771912, + "learning_rate": 9.479035603767811e-07, + "loss": 2.1416, + "step": 57800 + }, + { + "epoch": 2.9483895607893063, + "grad_norm": 0.9789698719978333, + "learning_rate": 8.628897881456797e-07, + "loss": 2.0913, + "step": 57900 + }, + { + "epoch": 2.9534818586887335, + "grad_norm": 1.028600811958313, + "learning_rate": 7.778760159145782e-07, + "loss": 2.1142, + "step": 58000 + }, + { + "epoch": 2.9585741565881603, + "grad_norm": 0.850046694278717, + "learning_rate": 6.928622436834767e-07, + "loss": 2.0929, + "step": 58100 + }, + { + "epoch": 2.9636664544875875, + "grad_norm": 0.8758450150489807, + "learning_rate": 6.078484714523753e-07, + "loss": 2.0991, + "step": 58200 + }, + { + "epoch": 2.9687587523870147, + "grad_norm": 0.9652713537216187, + "learning_rate": 5.228346992212739e-07, + "loss": 2.1095, + "step": 58300 + }, + { + "epoch": 2.973851050286442, + "grad_norm": 1.0260512828826904, + "learning_rate": 4.3782092699017247e-07, + "loss": 2.1069, + "step": 58400 + }, + { + "epoch": 2.9789433481858687, + "grad_norm": 0.7857241034507751, + "learning_rate": 3.5280715475907095e-07, + "loss": 2.1014, + "step": 58500 + }, + { + "epoch": 2.984035646085296, + "grad_norm": 0.964096188545227, + "learning_rate": 2.6779338252796954e-07, + "loss": 2.0981, + "step": 58600 + }, + { + "epoch": 2.989127943984723, + "grad_norm": 0.8568851351737976, + "learning_rate": 1.827796102968681e-07, + "loss": 2.1283, + "step": 58700 + }, + { + "epoch": 2.9942202418841504, + "grad_norm": 0.9048463702201843, + "learning_rate": 9.776583806576667e-08, + "loss": 2.1011, + "step": 58800 + }, + { + "epoch": 2.9993125397835776, + "grad_norm": 0.8119781613349915, + "learning_rate": 1.2752065834665216e-08, + "loss": 2.0672, + "step": 58900 + } + ], + "logging_steps": 100, + "max_steps": 58914, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.462938693632e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}