diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24640 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9995732696082618, + "eval_steps": 500, + "global_step": 17574, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008534607834769992, + "grad_norm": 99.42818782210857, + "learning_rate": 1.4220705346985212e-07, + "loss": 6.2791, + "step": 5 + }, + { + "epoch": 0.0017069215669539984, + "grad_norm": 104.39808292384225, + "learning_rate": 2.8441410693970424e-07, + "loss": 6.1853, + "step": 10 + }, + { + "epoch": 0.0025603823504309975, + "grad_norm": 103.48376567294774, + "learning_rate": 4.2662116040955633e-07, + "loss": 6.3846, + "step": 15 + }, + { + "epoch": 0.003413843133907997, + "grad_norm": 104.04994240213826, + "learning_rate": 5.688282138794085e-07, + "loss": 5.7934, + "step": 20 + }, + { + "epoch": 0.004267303917384996, + "grad_norm": 87.84439258281431, + "learning_rate": 7.110352673492606e-07, + "loss": 5.2152, + "step": 25 + }, + { + "epoch": 0.005120764700861995, + "grad_norm": 77.01441231565465, + "learning_rate": 8.532423208191127e-07, + "loss": 3.8525, + "step": 30 + }, + { + "epoch": 0.005974225484338995, + "grad_norm": 21.83631807774304, + "learning_rate": 9.954493742889649e-07, + "loss": 2.9729, + "step": 35 + }, + { + "epoch": 0.006827686267815994, + "grad_norm": 14.113464115950803, + "learning_rate": 1.137656427758817e-06, + "loss": 1.7155, + "step": 40 + }, + { + "epoch": 0.0076811470512929934, + "grad_norm": 4.7553020883795405, + "learning_rate": 1.279863481228669e-06, + "loss": 1.4743, + "step": 45 + }, + { + "epoch": 0.008534607834769992, + "grad_norm": 7.79414775358199, + "learning_rate": 1.4220705346985211e-06, + "loss": 1.3836, + "step": 50 + }, + { + "epoch": 0.009388068618246991, + "grad_norm": 2.4354082720834977, + "learning_rate": 1.5642775881683732e-06, + "loss": 1.2929, + "step": 55 + }, + { + "epoch": 0.01024152940172399, + "grad_norm": 2.2849690291702522, + "learning_rate": 1.7064846416382253e-06, + "loss": 1.1902, + "step": 60 + }, + { + "epoch": 0.01109499018520099, + "grad_norm": 2.895821184135636, + "learning_rate": 1.8486916951080774e-06, + "loss": 1.0828, + "step": 65 + }, + { + "epoch": 0.01194845096867799, + "grad_norm": 1.1877547807762439, + "learning_rate": 1.9908987485779297e-06, + "loss": 1.0027, + "step": 70 + }, + { + "epoch": 0.012801911752154988, + "grad_norm": 1.0268455953088027, + "learning_rate": 2.133105802047782e-06, + "loss": 0.9333, + "step": 75 + }, + { + "epoch": 0.013655372535631987, + "grad_norm": 5.481550209143814, + "learning_rate": 2.275312855517634e-06, + "loss": 0.8887, + "step": 80 + }, + { + "epoch": 0.014508833319108986, + "grad_norm": 0.8793984613448603, + "learning_rate": 2.417519908987486e-06, + "loss": 0.8555, + "step": 85 + }, + { + "epoch": 0.015362294102585987, + "grad_norm": 0.7225726544778598, + "learning_rate": 2.559726962457338e-06, + "loss": 0.8049, + "step": 90 + }, + { + "epoch": 0.016215754886062986, + "grad_norm": 0.8823937518440574, + "learning_rate": 2.7019340159271904e-06, + "loss": 0.8209, + "step": 95 + }, + { + "epoch": 0.017069215669539985, + "grad_norm": 0.829188508851399, + "learning_rate": 2.8441410693970423e-06, + "loss": 0.7801, + "step": 100 + }, + { + "epoch": 0.017922676453016984, + "grad_norm": 0.7270054979813305, + "learning_rate": 2.9863481228668946e-06, + "loss": 0.7423, + "step": 105 + }, + { + "epoch": 0.018776137236493982, + "grad_norm": 0.6801373549704796, + "learning_rate": 3.1285551763367464e-06, + "loss": 0.7323, + "step": 110 + }, + { + "epoch": 0.01962959801997098, + "grad_norm": 1.0385640589855576, + "learning_rate": 3.2707622298065988e-06, + "loss": 0.7392, + "step": 115 + }, + { + "epoch": 0.02048305880344798, + "grad_norm": 0.8671806836602034, + "learning_rate": 3.4129692832764506e-06, + "loss": 0.6864, + "step": 120 + }, + { + "epoch": 0.02133651958692498, + "grad_norm": 0.6716992405032944, + "learning_rate": 3.5551763367463025e-06, + "loss": 0.7209, + "step": 125 + }, + { + "epoch": 0.02218998037040198, + "grad_norm": 2.3299276907106763, + "learning_rate": 3.697383390216155e-06, + "loss": 0.6971, + "step": 130 + }, + { + "epoch": 0.02304344115387898, + "grad_norm": 0.6490988370494056, + "learning_rate": 3.8395904436860075e-06, + "loss": 0.7131, + "step": 135 + }, + { + "epoch": 0.02389690193735598, + "grad_norm": 0.6320268165032422, + "learning_rate": 3.981797497155859e-06, + "loss": 0.6888, + "step": 140 + }, + { + "epoch": 0.024750362720832978, + "grad_norm": 0.7161127122525036, + "learning_rate": 4.124004550625711e-06, + "loss": 0.6837, + "step": 145 + }, + { + "epoch": 0.025603823504309977, + "grad_norm": 0.6972994060141804, + "learning_rate": 4.266211604095564e-06, + "loss": 0.6523, + "step": 150 + }, + { + "epoch": 0.026457284287786976, + "grad_norm": 0.7062186818568618, + "learning_rate": 4.408418657565416e-06, + "loss": 0.6649, + "step": 155 + }, + { + "epoch": 0.027310745071263975, + "grad_norm": 0.6568377650981212, + "learning_rate": 4.550625711035268e-06, + "loss": 0.6876, + "step": 160 + }, + { + "epoch": 0.028164205854740974, + "grad_norm": 0.8430733551488065, + "learning_rate": 4.69283276450512e-06, + "loss": 0.6741, + "step": 165 + }, + { + "epoch": 0.029017666638217973, + "grad_norm": 0.6106681738824012, + "learning_rate": 4.835039817974972e-06, + "loss": 0.6777, + "step": 170 + }, + { + "epoch": 0.02987112742169497, + "grad_norm": 0.7901990923848753, + "learning_rate": 4.977246871444824e-06, + "loss": 0.6641, + "step": 175 + }, + { + "epoch": 0.030724588205171974, + "grad_norm": 0.7623032565109146, + "learning_rate": 5.119453924914676e-06, + "loss": 0.6035, + "step": 180 + }, + { + "epoch": 0.03157804898864897, + "grad_norm": 0.9433218529441518, + "learning_rate": 5.261660978384528e-06, + "loss": 0.6372, + "step": 185 + }, + { + "epoch": 0.03243150977212597, + "grad_norm": 0.6318034752486974, + "learning_rate": 5.403868031854381e-06, + "loss": 0.6517, + "step": 190 + }, + { + "epoch": 0.03328497055560297, + "grad_norm": 0.7984227582107997, + "learning_rate": 5.546075085324233e-06, + "loss": 0.6507, + "step": 195 + }, + { + "epoch": 0.03413843133907997, + "grad_norm": 0.6970637034010194, + "learning_rate": 5.6882821387940845e-06, + "loss": 0.6132, + "step": 200 + }, + { + "epoch": 0.03499189212255697, + "grad_norm": 0.693661339636327, + "learning_rate": 5.830489192263936e-06, + "loss": 0.6447, + "step": 205 + }, + { + "epoch": 0.03584535290603397, + "grad_norm": 0.5750112660425444, + "learning_rate": 5.972696245733789e-06, + "loss": 0.6354, + "step": 210 + }, + { + "epoch": 0.036698813689510966, + "grad_norm": 0.7707644501420617, + "learning_rate": 6.114903299203641e-06, + "loss": 0.613, + "step": 215 + }, + { + "epoch": 0.037552274472987965, + "grad_norm": 0.7610886999828581, + "learning_rate": 6.257110352673493e-06, + "loss": 0.5985, + "step": 220 + }, + { + "epoch": 0.038405735256464964, + "grad_norm": 0.7219024880249069, + "learning_rate": 6.399317406143345e-06, + "loss": 0.6147, + "step": 225 + }, + { + "epoch": 0.03925919603994196, + "grad_norm": 0.694773090873567, + "learning_rate": 6.5415244596131975e-06, + "loss": 0.6138, + "step": 230 + }, + { + "epoch": 0.04011265682341896, + "grad_norm": 0.6507600820281264, + "learning_rate": 6.683731513083049e-06, + "loss": 0.6358, + "step": 235 + }, + { + "epoch": 0.04096611760689596, + "grad_norm": 0.6821754118235962, + "learning_rate": 6.825938566552901e-06, + "loss": 0.6064, + "step": 240 + }, + { + "epoch": 0.04181957839037296, + "grad_norm": 0.6685037872512465, + "learning_rate": 6.968145620022753e-06, + "loss": 0.6141, + "step": 245 + }, + { + "epoch": 0.04267303917384996, + "grad_norm": 0.650855824979963, + "learning_rate": 7.110352673492605e-06, + "loss": 0.6099, + "step": 250 + }, + { + "epoch": 0.043526499957326964, + "grad_norm": 0.6283039075662383, + "learning_rate": 7.252559726962458e-06, + "loss": 0.6487, + "step": 255 + }, + { + "epoch": 0.04437996074080396, + "grad_norm": 0.7017487578081022, + "learning_rate": 7.39476678043231e-06, + "loss": 0.6117, + "step": 260 + }, + { + "epoch": 0.04523342152428096, + "grad_norm": 0.6021311846646746, + "learning_rate": 7.536973833902162e-06, + "loss": 0.6074, + "step": 265 + }, + { + "epoch": 0.04608688230775796, + "grad_norm": 0.705303126743963, + "learning_rate": 7.679180887372015e-06, + "loss": 0.6228, + "step": 270 + }, + { + "epoch": 0.04694034309123496, + "grad_norm": 0.6331398845315224, + "learning_rate": 7.821387940841867e-06, + "loss": 0.6186, + "step": 275 + }, + { + "epoch": 0.04779380387471196, + "grad_norm": 0.7364757969311059, + "learning_rate": 7.963594994311719e-06, + "loss": 0.6028, + "step": 280 + }, + { + "epoch": 0.04864726465818896, + "grad_norm": 0.5864444756610997, + "learning_rate": 8.10580204778157e-06, + "loss": 0.5572, + "step": 285 + }, + { + "epoch": 0.049500725441665956, + "grad_norm": 0.7188508735007383, + "learning_rate": 8.248009101251423e-06, + "loss": 0.5983, + "step": 290 + }, + { + "epoch": 0.050354186225142955, + "grad_norm": 0.5871960228317447, + "learning_rate": 8.390216154721274e-06, + "loss": 0.5697, + "step": 295 + }, + { + "epoch": 0.051207647008619954, + "grad_norm": 0.6415281272301155, + "learning_rate": 8.532423208191128e-06, + "loss": 0.5669, + "step": 300 + }, + { + "epoch": 0.05206110779209695, + "grad_norm": 0.7422606435155473, + "learning_rate": 8.67463026166098e-06, + "loss": 0.6099, + "step": 305 + }, + { + "epoch": 0.05291456857557395, + "grad_norm": 0.8031320770409729, + "learning_rate": 8.816837315130832e-06, + "loss": 0.5805, + "step": 310 + }, + { + "epoch": 0.05376802935905095, + "grad_norm": 0.6235161179887372, + "learning_rate": 8.959044368600684e-06, + "loss": 0.5829, + "step": 315 + }, + { + "epoch": 0.05462149014252795, + "grad_norm": 0.6368377190363229, + "learning_rate": 9.101251422070536e-06, + "loss": 0.586, + "step": 320 + }, + { + "epoch": 0.05547495092600495, + "grad_norm": 0.7355127518957415, + "learning_rate": 9.243458475540387e-06, + "loss": 0.6052, + "step": 325 + }, + { + "epoch": 0.05632841170948195, + "grad_norm": 0.6491383433315371, + "learning_rate": 9.38566552901024e-06, + "loss": 0.5641, + "step": 330 + }, + { + "epoch": 0.057181872492958946, + "grad_norm": 0.6074404231611151, + "learning_rate": 9.527872582480093e-06, + "loss": 0.5805, + "step": 335 + }, + { + "epoch": 0.058035333276435945, + "grad_norm": 0.621909618300053, + "learning_rate": 9.670079635949945e-06, + "loss": 0.5921, + "step": 340 + }, + { + "epoch": 0.058888794059912944, + "grad_norm": 0.7596871354310805, + "learning_rate": 9.812286689419797e-06, + "loss": 0.5905, + "step": 345 + }, + { + "epoch": 0.05974225484338994, + "grad_norm": 0.6852465582192806, + "learning_rate": 9.954493742889649e-06, + "loss": 0.5718, + "step": 350 + }, + { + "epoch": 0.06059571562686695, + "grad_norm": 0.6323485065460291, + "learning_rate": 1.00967007963595e-05, + "loss": 0.6071, + "step": 355 + }, + { + "epoch": 0.06144917641034395, + "grad_norm": 0.702431722756456, + "learning_rate": 1.0238907849829352e-05, + "loss": 0.5612, + "step": 360 + }, + { + "epoch": 0.062302637193820946, + "grad_norm": 0.7446033311458136, + "learning_rate": 1.0381114903299204e-05, + "loss": 0.595, + "step": 365 + }, + { + "epoch": 0.06315609797729795, + "grad_norm": 0.6942761058832195, + "learning_rate": 1.0523321956769056e-05, + "loss": 0.5742, + "step": 370 + }, + { + "epoch": 0.06400955876077494, + "grad_norm": 0.7488476031298855, + "learning_rate": 1.066552901023891e-05, + "loss": 0.5651, + "step": 375 + }, + { + "epoch": 0.06486301954425194, + "grad_norm": 0.7465993777161892, + "learning_rate": 1.0807736063708762e-05, + "loss": 0.5731, + "step": 380 + }, + { + "epoch": 0.06571648032772893, + "grad_norm": 0.6785228215376499, + "learning_rate": 1.0949943117178613e-05, + "loss": 0.58, + "step": 385 + }, + { + "epoch": 0.06656994111120594, + "grad_norm": 0.6009750474296774, + "learning_rate": 1.1092150170648465e-05, + "loss": 0.6299, + "step": 390 + }, + { + "epoch": 0.06742340189468293, + "grad_norm": 0.6112998501665518, + "learning_rate": 1.1234357224118317e-05, + "loss": 0.5448, + "step": 395 + }, + { + "epoch": 0.06827686267815994, + "grad_norm": 0.6141285561163018, + "learning_rate": 1.1376564277588169e-05, + "loss": 0.5536, + "step": 400 + }, + { + "epoch": 0.06913032346163694, + "grad_norm": 0.7039674370569571, + "learning_rate": 1.1518771331058021e-05, + "loss": 0.5846, + "step": 405 + }, + { + "epoch": 0.06998378424511394, + "grad_norm": 0.6628719873901499, + "learning_rate": 1.1660978384527873e-05, + "loss": 0.5996, + "step": 410 + }, + { + "epoch": 0.07083724502859094, + "grad_norm": 0.733225603137678, + "learning_rate": 1.1803185437997725e-05, + "loss": 0.5604, + "step": 415 + }, + { + "epoch": 0.07169070581206793, + "grad_norm": 0.7999324830307888, + "learning_rate": 1.1945392491467578e-05, + "loss": 0.5845, + "step": 420 + }, + { + "epoch": 0.07254416659554494, + "grad_norm": 0.6696390387855465, + "learning_rate": 1.208759954493743e-05, + "loss": 0.5663, + "step": 425 + }, + { + "epoch": 0.07339762737902193, + "grad_norm": 0.6495085711609115, + "learning_rate": 1.2229806598407282e-05, + "loss": 0.5823, + "step": 430 + }, + { + "epoch": 0.07425108816249894, + "grad_norm": 0.6508779382883285, + "learning_rate": 1.2372013651877134e-05, + "loss": 0.5692, + "step": 435 + }, + { + "epoch": 0.07510454894597593, + "grad_norm": 0.677992969506062, + "learning_rate": 1.2514220705346986e-05, + "loss": 0.5867, + "step": 440 + }, + { + "epoch": 0.07595800972945294, + "grad_norm": 0.6481542235806955, + "learning_rate": 1.2656427758816838e-05, + "loss": 0.5458, + "step": 445 + }, + { + "epoch": 0.07681147051292993, + "grad_norm": 0.6507351226676409, + "learning_rate": 1.279863481228669e-05, + "loss": 0.542, + "step": 450 + }, + { + "epoch": 0.07766493129640693, + "grad_norm": 0.6520426456185812, + "learning_rate": 1.2940841865756541e-05, + "loss": 0.561, + "step": 455 + }, + { + "epoch": 0.07851839207988393, + "grad_norm": 0.7153844095429435, + "learning_rate": 1.3083048919226395e-05, + "loss": 0.5792, + "step": 460 + }, + { + "epoch": 0.07937185286336093, + "grad_norm": 0.9169128648612137, + "learning_rate": 1.3225255972696247e-05, + "loss": 0.5663, + "step": 465 + }, + { + "epoch": 0.08022531364683792, + "grad_norm": 0.6346143572448639, + "learning_rate": 1.3367463026166099e-05, + "loss": 0.5454, + "step": 470 + }, + { + "epoch": 0.08107877443031493, + "grad_norm": 0.661156267249212, + "learning_rate": 1.350967007963595e-05, + "loss": 0.5284, + "step": 475 + }, + { + "epoch": 0.08193223521379192, + "grad_norm": 0.6570366689901866, + "learning_rate": 1.3651877133105803e-05, + "loss": 0.583, + "step": 480 + }, + { + "epoch": 0.08278569599726893, + "grad_norm": 0.6925487929822411, + "learning_rate": 1.3794084186575654e-05, + "loss": 0.5734, + "step": 485 + }, + { + "epoch": 0.08363915678074592, + "grad_norm": 0.6825610761421598, + "learning_rate": 1.3936291240045506e-05, + "loss": 0.5604, + "step": 490 + }, + { + "epoch": 0.08449261756422292, + "grad_norm": 0.6455121109752382, + "learning_rate": 1.407849829351536e-05, + "loss": 0.5577, + "step": 495 + }, + { + "epoch": 0.08534607834769992, + "grad_norm": 0.7057436426948772, + "learning_rate": 1.422070534698521e-05, + "loss": 0.5635, + "step": 500 + }, + { + "epoch": 0.08619953913117692, + "grad_norm": 0.6035482154147205, + "learning_rate": 1.4362912400455064e-05, + "loss": 0.5608, + "step": 505 + }, + { + "epoch": 0.08705299991465393, + "grad_norm": 0.6945499780138561, + "learning_rate": 1.4505119453924915e-05, + "loss": 0.5734, + "step": 510 + }, + { + "epoch": 0.08790646069813092, + "grad_norm": 0.6342408596975896, + "learning_rate": 1.4647326507394767e-05, + "loss": 0.5478, + "step": 515 + }, + { + "epoch": 0.08875992148160793, + "grad_norm": 0.669235415531277, + "learning_rate": 1.478953356086462e-05, + "loss": 0.563, + "step": 520 + }, + { + "epoch": 0.08961338226508492, + "grad_norm": 0.8846650505038849, + "learning_rate": 1.4931740614334471e-05, + "loss": 0.5926, + "step": 525 + }, + { + "epoch": 0.09046684304856192, + "grad_norm": 0.7901893564028634, + "learning_rate": 1.5073947667804325e-05, + "loss": 0.5368, + "step": 530 + }, + { + "epoch": 0.09132030383203892, + "grad_norm": 0.7257391495825279, + "learning_rate": 1.5216154721274175e-05, + "loss": 0.5116, + "step": 535 + }, + { + "epoch": 0.09217376461551592, + "grad_norm": 0.8055262915442394, + "learning_rate": 1.535836177474403e-05, + "loss": 0.5737, + "step": 540 + }, + { + "epoch": 0.09302722539899291, + "grad_norm": 0.7724777138200022, + "learning_rate": 1.550056882821388e-05, + "loss": 0.5531, + "step": 545 + }, + { + "epoch": 0.09388068618246992, + "grad_norm": 0.7277006149678841, + "learning_rate": 1.5642775881683734e-05, + "loss": 0.5399, + "step": 550 + }, + { + "epoch": 0.09473414696594691, + "grad_norm": 1.0778652880491968, + "learning_rate": 1.5784982935153582e-05, + "loss": 0.5721, + "step": 555 + }, + { + "epoch": 0.09558760774942392, + "grad_norm": 0.9104304283180978, + "learning_rate": 1.5927189988623438e-05, + "loss": 0.5572, + "step": 560 + }, + { + "epoch": 0.09644106853290091, + "grad_norm": 0.8149446361982572, + "learning_rate": 1.606939704209329e-05, + "loss": 0.534, + "step": 565 + }, + { + "epoch": 0.09729452931637791, + "grad_norm": 0.6508255643810232, + "learning_rate": 1.621160409556314e-05, + "loss": 0.5901, + "step": 570 + }, + { + "epoch": 0.0981479900998549, + "grad_norm": 0.7908571035291143, + "learning_rate": 1.6353811149032993e-05, + "loss": 0.5345, + "step": 575 + }, + { + "epoch": 0.09900145088333191, + "grad_norm": 0.8711338798337782, + "learning_rate": 1.6496018202502845e-05, + "loss": 0.5439, + "step": 580 + }, + { + "epoch": 0.0998549116668089, + "grad_norm": 0.8510864530724201, + "learning_rate": 1.6638225255972697e-05, + "loss": 0.5346, + "step": 585 + }, + { + "epoch": 0.10070837245028591, + "grad_norm": 0.6004623000077266, + "learning_rate": 1.678043230944255e-05, + "loss": 0.5671, + "step": 590 + }, + { + "epoch": 0.1015618332337629, + "grad_norm": 0.7836797955774423, + "learning_rate": 1.69226393629124e-05, + "loss": 0.5728, + "step": 595 + }, + { + "epoch": 0.10241529401723991, + "grad_norm": 0.7164324569628334, + "learning_rate": 1.7064846416382256e-05, + "loss": 0.5808, + "step": 600 + }, + { + "epoch": 0.10326875480071691, + "grad_norm": 0.658437507959372, + "learning_rate": 1.7207053469852105e-05, + "loss": 0.527, + "step": 605 + }, + { + "epoch": 0.1041222155841939, + "grad_norm": 0.648235707862094, + "learning_rate": 1.734926052332196e-05, + "loss": 0.5042, + "step": 610 + }, + { + "epoch": 0.10497567636767091, + "grad_norm": 0.7066278650620545, + "learning_rate": 1.749146757679181e-05, + "loss": 0.5484, + "step": 615 + }, + { + "epoch": 0.1058291371511479, + "grad_norm": 0.7306436553829413, + "learning_rate": 1.7633674630261664e-05, + "loss": 0.5233, + "step": 620 + }, + { + "epoch": 0.10668259793462491, + "grad_norm": 0.5940294914233756, + "learning_rate": 1.7775881683731512e-05, + "loss": 0.5497, + "step": 625 + }, + { + "epoch": 0.1075360587181019, + "grad_norm": 0.6766881884349177, + "learning_rate": 1.7918088737201367e-05, + "loss": 0.524, + "step": 630 + }, + { + "epoch": 0.10838951950157891, + "grad_norm": 0.6374323634721849, + "learning_rate": 1.806029579067122e-05, + "loss": 0.5576, + "step": 635 + }, + { + "epoch": 0.1092429802850559, + "grad_norm": 0.7284674831221161, + "learning_rate": 1.820250284414107e-05, + "loss": 0.5529, + "step": 640 + }, + { + "epoch": 0.1100964410685329, + "grad_norm": 0.6615725114492135, + "learning_rate": 1.8344709897610923e-05, + "loss": 0.5044, + "step": 645 + }, + { + "epoch": 0.1109499018520099, + "grad_norm": 0.7676539593033215, + "learning_rate": 1.8486916951080775e-05, + "loss": 0.5512, + "step": 650 + }, + { + "epoch": 0.1118033626354869, + "grad_norm": 0.5885868256001245, + "learning_rate": 1.8629124004550627e-05, + "loss": 0.5001, + "step": 655 + }, + { + "epoch": 0.1126568234189639, + "grad_norm": 0.5686249348694413, + "learning_rate": 1.877133105802048e-05, + "loss": 0.5332, + "step": 660 + }, + { + "epoch": 0.1135102842024409, + "grad_norm": 0.6097676324328544, + "learning_rate": 1.891353811149033e-05, + "loss": 0.5374, + "step": 665 + }, + { + "epoch": 0.11436374498591789, + "grad_norm": 0.7494192367995726, + "learning_rate": 1.9055745164960186e-05, + "loss": 0.5428, + "step": 670 + }, + { + "epoch": 0.1152172057693949, + "grad_norm": 0.6181360753587957, + "learning_rate": 1.9197952218430034e-05, + "loss": 0.4983, + "step": 675 + }, + { + "epoch": 0.11607066655287189, + "grad_norm": 0.7148707525002317, + "learning_rate": 1.934015927189989e-05, + "loss": 0.5474, + "step": 680 + }, + { + "epoch": 0.1169241273363489, + "grad_norm": 0.6890678940899136, + "learning_rate": 1.9482366325369738e-05, + "loss": 0.539, + "step": 685 + }, + { + "epoch": 0.11777758811982589, + "grad_norm": 1.5520380828368654, + "learning_rate": 1.9624573378839593e-05, + "loss": 0.5304, + "step": 690 + }, + { + "epoch": 0.1186310489033029, + "grad_norm": 0.6879630723852271, + "learning_rate": 1.9766780432309442e-05, + "loss": 0.5523, + "step": 695 + }, + { + "epoch": 0.11948450968677989, + "grad_norm": 0.5952074420310725, + "learning_rate": 1.9908987485779297e-05, + "loss": 0.546, + "step": 700 + }, + { + "epoch": 0.12033797047025689, + "grad_norm": 0.6568607864977468, + "learning_rate": 2.005119453924915e-05, + "loss": 0.535, + "step": 705 + }, + { + "epoch": 0.1211914312537339, + "grad_norm": 0.7573004872488551, + "learning_rate": 2.0193401592719e-05, + "loss": 0.5698, + "step": 710 + }, + { + "epoch": 0.12204489203721089, + "grad_norm": 0.6038232065159463, + "learning_rate": 2.0335608646188853e-05, + "loss": 0.5355, + "step": 715 + }, + { + "epoch": 0.1228983528206879, + "grad_norm": 0.8905543409020308, + "learning_rate": 2.0477815699658705e-05, + "loss": 0.5501, + "step": 720 + }, + { + "epoch": 0.12375181360416489, + "grad_norm": 0.6494164955676679, + "learning_rate": 2.0620022753128557e-05, + "loss": 0.597, + "step": 725 + }, + { + "epoch": 0.12460527438764189, + "grad_norm": 0.6068419377405997, + "learning_rate": 2.076222980659841e-05, + "loss": 0.5191, + "step": 730 + }, + { + "epoch": 0.12545873517111888, + "grad_norm": 0.6748891771108966, + "learning_rate": 2.090443686006826e-05, + "loss": 0.5624, + "step": 735 + }, + { + "epoch": 0.1263121959545959, + "grad_norm": 0.6003241284298655, + "learning_rate": 2.1046643913538112e-05, + "loss": 0.5199, + "step": 740 + }, + { + "epoch": 0.1271656567380729, + "grad_norm": 0.993073185386471, + "learning_rate": 2.1188850967007964e-05, + "loss": 0.5379, + "step": 745 + }, + { + "epoch": 0.12801911752154987, + "grad_norm": 0.6830807878540766, + "learning_rate": 2.133105802047782e-05, + "loss": 0.5718, + "step": 750 + }, + { + "epoch": 0.12887257830502688, + "grad_norm": 0.5637706434424037, + "learning_rate": 2.1473265073947668e-05, + "loss": 0.5329, + "step": 755 + }, + { + "epoch": 0.12972603908850389, + "grad_norm": 0.5901909162808682, + "learning_rate": 2.1615472127417523e-05, + "loss": 0.5005, + "step": 760 + }, + { + "epoch": 0.1305794998719809, + "grad_norm": 0.6384938080969448, + "learning_rate": 2.175767918088737e-05, + "loss": 0.5081, + "step": 765 + }, + { + "epoch": 0.13143296065545787, + "grad_norm": 0.6515001609818502, + "learning_rate": 2.1899886234357227e-05, + "loss": 0.5139, + "step": 770 + }, + { + "epoch": 0.13228642143893488, + "grad_norm": 0.6594065584484019, + "learning_rate": 2.204209328782708e-05, + "loss": 0.5584, + "step": 775 + }, + { + "epoch": 0.13313988222241188, + "grad_norm": 0.5788918184456333, + "learning_rate": 2.218430034129693e-05, + "loss": 0.4872, + "step": 780 + }, + { + "epoch": 0.1339933430058889, + "grad_norm": 0.7132592163471578, + "learning_rate": 2.2326507394766782e-05, + "loss": 0.526, + "step": 785 + }, + { + "epoch": 0.13484680378936587, + "grad_norm": 0.6701954948574094, + "learning_rate": 2.2468714448236634e-05, + "loss": 0.5369, + "step": 790 + }, + { + "epoch": 0.13570026457284287, + "grad_norm": 0.664429218741904, + "learning_rate": 2.2610921501706486e-05, + "loss": 0.5622, + "step": 795 + }, + { + "epoch": 0.13655372535631988, + "grad_norm": 0.7401195985099817, + "learning_rate": 2.2753128555176338e-05, + "loss": 0.5276, + "step": 800 + }, + { + "epoch": 0.13740718613979688, + "grad_norm": 0.5761092021531733, + "learning_rate": 2.289533560864619e-05, + "loss": 0.5003, + "step": 805 + }, + { + "epoch": 0.1382606469232739, + "grad_norm": 0.7211376155782887, + "learning_rate": 2.3037542662116042e-05, + "loss": 0.5346, + "step": 810 + }, + { + "epoch": 0.13911410770675087, + "grad_norm": 0.6870353313728967, + "learning_rate": 2.3179749715585894e-05, + "loss": 0.5412, + "step": 815 + }, + { + "epoch": 0.13996756849022787, + "grad_norm": 0.7484951113803281, + "learning_rate": 2.3321956769055746e-05, + "loss": 0.5352, + "step": 820 + }, + { + "epoch": 0.14082102927370488, + "grad_norm": 0.5633549500567068, + "learning_rate": 2.3464163822525598e-05, + "loss": 0.5373, + "step": 825 + }, + { + "epoch": 0.14167449005718188, + "grad_norm": 0.6175161903456321, + "learning_rate": 2.360637087599545e-05, + "loss": 0.5293, + "step": 830 + }, + { + "epoch": 0.14252795084065886, + "grad_norm": 0.5286458735194771, + "learning_rate": 2.37485779294653e-05, + "loss": 0.4924, + "step": 835 + }, + { + "epoch": 0.14338141162413587, + "grad_norm": 0.809888612811748, + "learning_rate": 2.3890784982935157e-05, + "loss": 0.5476, + "step": 840 + }, + { + "epoch": 0.14423487240761287, + "grad_norm": 0.6320119433876821, + "learning_rate": 2.403299203640501e-05, + "loss": 0.5175, + "step": 845 + }, + { + "epoch": 0.14508833319108988, + "grad_norm": 0.7042156271509452, + "learning_rate": 2.417519908987486e-05, + "loss": 0.5225, + "step": 850 + }, + { + "epoch": 0.14594179397456686, + "grad_norm": 0.6630329659099481, + "learning_rate": 2.4317406143344712e-05, + "loss": 0.5219, + "step": 855 + }, + { + "epoch": 0.14679525475804386, + "grad_norm": 0.8047566311916006, + "learning_rate": 2.4459613196814564e-05, + "loss": 0.5282, + "step": 860 + }, + { + "epoch": 0.14764871554152087, + "grad_norm": 0.6322398706113659, + "learning_rate": 2.4601820250284416e-05, + "loss": 0.5496, + "step": 865 + }, + { + "epoch": 0.14850217632499788, + "grad_norm": 0.5394073805212835, + "learning_rate": 2.4744027303754268e-05, + "loss": 0.5391, + "step": 870 + }, + { + "epoch": 0.14935563710847485, + "grad_norm": 0.8539124043452883, + "learning_rate": 2.488623435722412e-05, + "loss": 0.5615, + "step": 875 + }, + { + "epoch": 0.15020909789195186, + "grad_norm": 0.5973992591932995, + "learning_rate": 2.502844141069397e-05, + "loss": 0.5109, + "step": 880 + }, + { + "epoch": 0.15106255867542887, + "grad_norm": 0.7868095705273332, + "learning_rate": 2.5170648464163827e-05, + "loss": 0.4925, + "step": 885 + }, + { + "epoch": 0.15191601945890587, + "grad_norm": 0.5303131516550371, + "learning_rate": 2.5312855517633675e-05, + "loss": 0.5223, + "step": 890 + }, + { + "epoch": 0.15276948024238285, + "grad_norm": 0.6915392576850998, + "learning_rate": 2.5455062571103527e-05, + "loss": 0.5199, + "step": 895 + }, + { + "epoch": 0.15362294102585985, + "grad_norm": 0.6501693825506315, + "learning_rate": 2.559726962457338e-05, + "loss": 0.5196, + "step": 900 + }, + { + "epoch": 0.15447640180933686, + "grad_norm": 0.8308211131956251, + "learning_rate": 2.5739476678043234e-05, + "loss": 0.5797, + "step": 905 + }, + { + "epoch": 0.15532986259281387, + "grad_norm": 0.6205667096580658, + "learning_rate": 2.5881683731513083e-05, + "loss": 0.5626, + "step": 910 + }, + { + "epoch": 0.15618332337629087, + "grad_norm": 0.78307958464136, + "learning_rate": 2.6023890784982935e-05, + "loss": 0.5572, + "step": 915 + }, + { + "epoch": 0.15703678415976785, + "grad_norm": 0.6971854241217064, + "learning_rate": 2.616609783845279e-05, + "loss": 0.5283, + "step": 920 + }, + { + "epoch": 0.15789024494324486, + "grad_norm": 0.790435113721265, + "learning_rate": 2.6308304891922642e-05, + "loss": 0.5096, + "step": 925 + }, + { + "epoch": 0.15874370572672186, + "grad_norm": 0.6263290150135153, + "learning_rate": 2.6450511945392494e-05, + "loss": 0.511, + "step": 930 + }, + { + "epoch": 0.15959716651019887, + "grad_norm": 1.0182762203606528, + "learning_rate": 2.6592718998862342e-05, + "loss": 0.5176, + "step": 935 + }, + { + "epoch": 0.16045062729367585, + "grad_norm": 0.6764980698940198, + "learning_rate": 2.6734926052332198e-05, + "loss": 0.532, + "step": 940 + }, + { + "epoch": 0.16130408807715285, + "grad_norm": 0.617066330968239, + "learning_rate": 2.687713310580205e-05, + "loss": 0.5215, + "step": 945 + }, + { + "epoch": 0.16215754886062986, + "grad_norm": 0.6650618699860503, + "learning_rate": 2.70193401592719e-05, + "loss": 0.5173, + "step": 950 + }, + { + "epoch": 0.16301100964410686, + "grad_norm": 0.6880915197162699, + "learning_rate": 2.7161547212741757e-05, + "loss": 0.5183, + "step": 955 + }, + { + "epoch": 0.16386447042758384, + "grad_norm": 0.6334400197912421, + "learning_rate": 2.7303754266211605e-05, + "loss": 0.5068, + "step": 960 + }, + { + "epoch": 0.16471793121106085, + "grad_norm": 0.6614924655158634, + "learning_rate": 2.7445961319681457e-05, + "loss": 0.5034, + "step": 965 + }, + { + "epoch": 0.16557139199453785, + "grad_norm": 0.6410739649524626, + "learning_rate": 2.758816837315131e-05, + "loss": 0.543, + "step": 970 + }, + { + "epoch": 0.16642485277801486, + "grad_norm": 0.6153025614066233, + "learning_rate": 2.7730375426621164e-05, + "loss": 0.5204, + "step": 975 + }, + { + "epoch": 0.16727831356149184, + "grad_norm": 0.5942921081010195, + "learning_rate": 2.7872582480091013e-05, + "loss": 0.5047, + "step": 980 + }, + { + "epoch": 0.16813177434496884, + "grad_norm": 0.6022254575923327, + "learning_rate": 2.8014789533560864e-05, + "loss": 0.5077, + "step": 985 + }, + { + "epoch": 0.16898523512844585, + "grad_norm": 0.7132287202025721, + "learning_rate": 2.815699658703072e-05, + "loss": 0.5722, + "step": 990 + }, + { + "epoch": 0.16983869591192285, + "grad_norm": 1.1177874785166635, + "learning_rate": 2.829920364050057e-05, + "loss": 0.5189, + "step": 995 + }, + { + "epoch": 0.17069215669539983, + "grad_norm": 1.193082385576556, + "learning_rate": 2.844141069397042e-05, + "loss": 0.5347, + "step": 1000 + }, + { + "epoch": 0.17154561747887684, + "grad_norm": 0.9273224155143907, + "learning_rate": 2.8583617747440272e-05, + "loss": 0.5199, + "step": 1005 + }, + { + "epoch": 0.17239907826235384, + "grad_norm": 0.8009696728529175, + "learning_rate": 2.8725824800910127e-05, + "loss": 0.5413, + "step": 1010 + }, + { + "epoch": 0.17325253904583085, + "grad_norm": 0.6389356197858123, + "learning_rate": 2.886803185437998e-05, + "loss": 0.5197, + "step": 1015 + }, + { + "epoch": 0.17410599982930786, + "grad_norm": 0.5643611328367691, + "learning_rate": 2.901023890784983e-05, + "loss": 0.5416, + "step": 1020 + }, + { + "epoch": 0.17495946061278483, + "grad_norm": 0.8203413707371002, + "learning_rate": 2.9152445961319686e-05, + "loss": 0.5517, + "step": 1025 + }, + { + "epoch": 0.17581292139626184, + "grad_norm": 0.5767388331401231, + "learning_rate": 2.9294653014789535e-05, + "loss": 0.5281, + "step": 1030 + }, + { + "epoch": 0.17666638217973885, + "grad_norm": 0.7639414252803709, + "learning_rate": 2.9436860068259387e-05, + "loss": 0.5117, + "step": 1035 + }, + { + "epoch": 0.17751984296321585, + "grad_norm": 0.7435837191495894, + "learning_rate": 2.957906712172924e-05, + "loss": 0.5174, + "step": 1040 + }, + { + "epoch": 0.17837330374669283, + "grad_norm": 0.7406989837738298, + "learning_rate": 2.9721274175199094e-05, + "loss": 0.5564, + "step": 1045 + }, + { + "epoch": 0.17922676453016984, + "grad_norm": 0.7609524854068137, + "learning_rate": 2.9863481228668942e-05, + "loss": 0.5393, + "step": 1050 + }, + { + "epoch": 0.18008022531364684, + "grad_norm": 0.7602059635412194, + "learning_rate": 3.0005688282138794e-05, + "loss": 0.5085, + "step": 1055 + }, + { + "epoch": 0.18093368609712385, + "grad_norm": 0.6041864111887089, + "learning_rate": 3.014789533560865e-05, + "loss": 0.5022, + "step": 1060 + }, + { + "epoch": 0.18178714688060083, + "grad_norm": 0.6725193484426399, + "learning_rate": 3.02901023890785e-05, + "loss": 0.5255, + "step": 1065 + }, + { + "epoch": 0.18264060766407783, + "grad_norm": 0.5809953323844759, + "learning_rate": 3.043230944254835e-05, + "loss": 0.5222, + "step": 1070 + }, + { + "epoch": 0.18349406844755484, + "grad_norm": 0.5561450815754222, + "learning_rate": 3.0574516496018205e-05, + "loss": 0.497, + "step": 1075 + }, + { + "epoch": 0.18434752923103184, + "grad_norm": 0.7384676137709544, + "learning_rate": 3.071672354948806e-05, + "loss": 0.5228, + "step": 1080 + }, + { + "epoch": 0.18520099001450882, + "grad_norm": 0.5611330862106643, + "learning_rate": 3.085893060295791e-05, + "loss": 0.5278, + "step": 1085 + }, + { + "epoch": 0.18605445079798583, + "grad_norm": 0.9520323231067833, + "learning_rate": 3.100113765642776e-05, + "loss": 0.5224, + "step": 1090 + }, + { + "epoch": 0.18690791158146283, + "grad_norm": 0.7633270460301356, + "learning_rate": 3.114334470989761e-05, + "loss": 0.5234, + "step": 1095 + }, + { + "epoch": 0.18776137236493984, + "grad_norm": 1.8224752677288898, + "learning_rate": 3.128555176336747e-05, + "loss": 0.5076, + "step": 1100 + }, + { + "epoch": 0.18861483314841684, + "grad_norm": 0.6362222349727383, + "learning_rate": 3.1427758816837316e-05, + "loss": 0.5458, + "step": 1105 + }, + { + "epoch": 0.18946829393189382, + "grad_norm": 0.8293350005314576, + "learning_rate": 3.1569965870307165e-05, + "loss": 0.5106, + "step": 1110 + }, + { + "epoch": 0.19032175471537083, + "grad_norm": 0.6021094733250301, + "learning_rate": 3.171217292377702e-05, + "loss": 0.5316, + "step": 1115 + }, + { + "epoch": 0.19117521549884783, + "grad_norm": 1.0643292393092736, + "learning_rate": 3.1854379977246875e-05, + "loss": 0.5596, + "step": 1120 + }, + { + "epoch": 0.19202867628232484, + "grad_norm": 0.6299150040629254, + "learning_rate": 3.1996587030716724e-05, + "loss": 0.4812, + "step": 1125 + }, + { + "epoch": 0.19288213706580182, + "grad_norm": 0.5043450499219864, + "learning_rate": 3.213879408418658e-05, + "loss": 0.4895, + "step": 1130 + }, + { + "epoch": 0.19373559784927882, + "grad_norm": 0.5990024804144823, + "learning_rate": 3.228100113765643e-05, + "loss": 0.5318, + "step": 1135 + }, + { + "epoch": 0.19458905863275583, + "grad_norm": 0.640259423124337, + "learning_rate": 3.242320819112628e-05, + "loss": 0.4925, + "step": 1140 + }, + { + "epoch": 0.19544251941623283, + "grad_norm": 0.5721401072678143, + "learning_rate": 3.256541524459613e-05, + "loss": 0.5364, + "step": 1145 + }, + { + "epoch": 0.1962959801997098, + "grad_norm": 0.6483175670557406, + "learning_rate": 3.270762229806599e-05, + "loss": 0.5097, + "step": 1150 + }, + { + "epoch": 0.19714944098318682, + "grad_norm": 0.6304883041191003, + "learning_rate": 3.2849829351535835e-05, + "loss": 0.529, + "step": 1155 + }, + { + "epoch": 0.19800290176666382, + "grad_norm": 0.5626780535279388, + "learning_rate": 3.299203640500569e-05, + "loss": 0.5207, + "step": 1160 + }, + { + "epoch": 0.19885636255014083, + "grad_norm": 0.6141866957416666, + "learning_rate": 3.3134243458475546e-05, + "loss": 0.4979, + "step": 1165 + }, + { + "epoch": 0.1997098233336178, + "grad_norm": 0.5923434635716356, + "learning_rate": 3.3276450511945394e-05, + "loss": 0.5477, + "step": 1170 + }, + { + "epoch": 0.20056328411709481, + "grad_norm": 0.6022601757807821, + "learning_rate": 3.341865756541524e-05, + "loss": 0.5082, + "step": 1175 + }, + { + "epoch": 0.20141674490057182, + "grad_norm": 0.5816942821573442, + "learning_rate": 3.35608646188851e-05, + "loss": 0.5256, + "step": 1180 + }, + { + "epoch": 0.20227020568404883, + "grad_norm": 0.6388083158763254, + "learning_rate": 3.370307167235495e-05, + "loss": 0.5539, + "step": 1185 + }, + { + "epoch": 0.2031236664675258, + "grad_norm": 0.5887921842154867, + "learning_rate": 3.38452787258248e-05, + "loss": 0.4929, + "step": 1190 + }, + { + "epoch": 0.2039771272510028, + "grad_norm": 0.7673632049052999, + "learning_rate": 3.398748577929465e-05, + "loss": 0.5263, + "step": 1195 + }, + { + "epoch": 0.20483058803447982, + "grad_norm": 0.606375881456679, + "learning_rate": 3.412969283276451e-05, + "loss": 0.5263, + "step": 1200 + }, + { + "epoch": 0.20568404881795682, + "grad_norm": 0.5865526644035232, + "learning_rate": 3.427189988623436e-05, + "loss": 0.5384, + "step": 1205 + }, + { + "epoch": 0.20653750960143383, + "grad_norm": 0.6854151362919446, + "learning_rate": 3.441410693970421e-05, + "loss": 0.5061, + "step": 1210 + }, + { + "epoch": 0.2073909703849108, + "grad_norm": 0.5159891247284191, + "learning_rate": 3.455631399317406e-05, + "loss": 0.484, + "step": 1215 + }, + { + "epoch": 0.2082444311683878, + "grad_norm": 0.5624066702071717, + "learning_rate": 3.469852104664392e-05, + "loss": 0.5225, + "step": 1220 + }, + { + "epoch": 0.20909789195186482, + "grad_norm": 0.598719341617163, + "learning_rate": 3.484072810011377e-05, + "loss": 0.497, + "step": 1225 + }, + { + "epoch": 0.20995135273534182, + "grad_norm": 0.528097154070161, + "learning_rate": 3.498293515358362e-05, + "loss": 0.4962, + "step": 1230 + }, + { + "epoch": 0.2108048135188188, + "grad_norm": 0.9598704728440921, + "learning_rate": 3.512514220705347e-05, + "loss": 0.5462, + "step": 1235 + }, + { + "epoch": 0.2116582743022958, + "grad_norm": 0.6266977130887611, + "learning_rate": 3.526734926052333e-05, + "loss": 0.4967, + "step": 1240 + }, + { + "epoch": 0.2125117350857728, + "grad_norm": 0.6043602329334674, + "learning_rate": 3.5409556313993176e-05, + "loss": 0.5505, + "step": 1245 + }, + { + "epoch": 0.21336519586924982, + "grad_norm": 0.611914515574889, + "learning_rate": 3.5551763367463024e-05, + "loss": 0.5037, + "step": 1250 + }, + { + "epoch": 0.2142186566527268, + "grad_norm": 0.4802733556079159, + "learning_rate": 3.569397042093288e-05, + "loss": 0.4863, + "step": 1255 + }, + { + "epoch": 0.2150721174362038, + "grad_norm": 0.7703363378874821, + "learning_rate": 3.5836177474402735e-05, + "loss": 0.559, + "step": 1260 + }, + { + "epoch": 0.2159255782196808, + "grad_norm": 0.6511129669959277, + "learning_rate": 3.597838452787258e-05, + "loss": 0.5513, + "step": 1265 + }, + { + "epoch": 0.21677903900315781, + "grad_norm": 0.5230313645523024, + "learning_rate": 3.612059158134244e-05, + "loss": 0.5241, + "step": 1270 + }, + { + "epoch": 0.2176324997866348, + "grad_norm": 0.7233367326250001, + "learning_rate": 3.626279863481229e-05, + "loss": 0.5109, + "step": 1275 + }, + { + "epoch": 0.2184859605701118, + "grad_norm": 0.7801443277079605, + "learning_rate": 3.640500568828214e-05, + "loss": 0.5012, + "step": 1280 + }, + { + "epoch": 0.2193394213535888, + "grad_norm": 0.5504196600204198, + "learning_rate": 3.654721274175199e-05, + "loss": 0.529, + "step": 1285 + }, + { + "epoch": 0.2201928821370658, + "grad_norm": 0.6541346146815773, + "learning_rate": 3.6689419795221846e-05, + "loss": 0.5132, + "step": 1290 + }, + { + "epoch": 0.2210463429205428, + "grad_norm": 0.6054161786868589, + "learning_rate": 3.6831626848691695e-05, + "loss": 0.5424, + "step": 1295 + }, + { + "epoch": 0.2218998037040198, + "grad_norm": 0.5020466281770849, + "learning_rate": 3.697383390216155e-05, + "loss": 0.5178, + "step": 1300 + }, + { + "epoch": 0.2227532644874968, + "grad_norm": 0.720100028158539, + "learning_rate": 3.7116040955631405e-05, + "loss": 0.5332, + "step": 1305 + }, + { + "epoch": 0.2236067252709738, + "grad_norm": 0.6013562540953598, + "learning_rate": 3.7258248009101254e-05, + "loss": 0.492, + "step": 1310 + }, + { + "epoch": 0.2244601860544508, + "grad_norm": 0.6014554688745792, + "learning_rate": 3.74004550625711e-05, + "loss": 0.5031, + "step": 1315 + }, + { + "epoch": 0.2253136468379278, + "grad_norm": 0.5996694354616003, + "learning_rate": 3.754266211604096e-05, + "loss": 0.5083, + "step": 1320 + }, + { + "epoch": 0.2261671076214048, + "grad_norm": 0.5402077348620818, + "learning_rate": 3.768486916951081e-05, + "loss": 0.5015, + "step": 1325 + }, + { + "epoch": 0.2270205684048818, + "grad_norm": 0.5622664960418867, + "learning_rate": 3.782707622298066e-05, + "loss": 0.5029, + "step": 1330 + }, + { + "epoch": 0.2278740291883588, + "grad_norm": 0.6200593249070808, + "learning_rate": 3.796928327645051e-05, + "loss": 0.5067, + "step": 1335 + }, + { + "epoch": 0.22872748997183578, + "grad_norm": 1.0366247721032074, + "learning_rate": 3.811149032992037e-05, + "loss": 0.5268, + "step": 1340 + }, + { + "epoch": 0.2295809507553128, + "grad_norm": 0.6004382440555587, + "learning_rate": 3.825369738339022e-05, + "loss": 0.5191, + "step": 1345 + }, + { + "epoch": 0.2304344115387898, + "grad_norm": 0.6329180454286693, + "learning_rate": 3.839590443686007e-05, + "loss": 0.5188, + "step": 1350 + }, + { + "epoch": 0.2312878723222668, + "grad_norm": 0.5840877646471871, + "learning_rate": 3.853811149032992e-05, + "loss": 0.5373, + "step": 1355 + }, + { + "epoch": 0.23214133310574378, + "grad_norm": 0.7714178352259251, + "learning_rate": 3.868031854379978e-05, + "loss": 0.5194, + "step": 1360 + }, + { + "epoch": 0.23299479388922079, + "grad_norm": 0.7433746732202924, + "learning_rate": 3.882252559726963e-05, + "loss": 0.5263, + "step": 1365 + }, + { + "epoch": 0.2338482546726978, + "grad_norm": 0.6800388915463906, + "learning_rate": 3.8964732650739476e-05, + "loss": 0.5085, + "step": 1370 + }, + { + "epoch": 0.2347017154561748, + "grad_norm": 0.627799071795484, + "learning_rate": 3.910693970420933e-05, + "loss": 0.5252, + "step": 1375 + }, + { + "epoch": 0.23555517623965178, + "grad_norm": 0.6550245229952462, + "learning_rate": 3.924914675767919e-05, + "loss": 0.5407, + "step": 1380 + }, + { + "epoch": 0.23640863702312878, + "grad_norm": 0.4875389791747265, + "learning_rate": 3.9391353811149035e-05, + "loss": 0.5582, + "step": 1385 + }, + { + "epoch": 0.2372620978066058, + "grad_norm": 0.7704861487727674, + "learning_rate": 3.9533560864618884e-05, + "loss": 0.5073, + "step": 1390 + }, + { + "epoch": 0.2381155585900828, + "grad_norm": 0.5482257906332871, + "learning_rate": 3.967576791808874e-05, + "loss": 0.5321, + "step": 1395 + }, + { + "epoch": 0.23896901937355977, + "grad_norm": 0.6132459928728721, + "learning_rate": 3.9817974971558594e-05, + "loss": 0.5372, + "step": 1400 + }, + { + "epoch": 0.23982248015703678, + "grad_norm": 0.5444425945346957, + "learning_rate": 3.996018202502844e-05, + "loss": 0.4966, + "step": 1405 + }, + { + "epoch": 0.24067594094051378, + "grad_norm": 0.5467973803559824, + "learning_rate": 4.01023890784983e-05, + "loss": 0.5108, + "step": 1410 + }, + { + "epoch": 0.2415294017239908, + "grad_norm": 0.577440466198162, + "learning_rate": 4.0244596131968146e-05, + "loss": 0.4918, + "step": 1415 + }, + { + "epoch": 0.2423828625074678, + "grad_norm": 0.5852142132880568, + "learning_rate": 4.0386803185438e-05, + "loss": 0.5174, + "step": 1420 + }, + { + "epoch": 0.24323632329094477, + "grad_norm": 0.6956282360042076, + "learning_rate": 4.052901023890785e-05, + "loss": 0.4887, + "step": 1425 + }, + { + "epoch": 0.24408978407442178, + "grad_norm": 0.6291740557738249, + "learning_rate": 4.0671217292377706e-05, + "loss": 0.5317, + "step": 1430 + }, + { + "epoch": 0.24494324485789878, + "grad_norm": 0.6356720737025028, + "learning_rate": 4.0813424345847554e-05, + "loss": 0.5151, + "step": 1435 + }, + { + "epoch": 0.2457967056413758, + "grad_norm": 0.5466555007670854, + "learning_rate": 4.095563139931741e-05, + "loss": 0.5315, + "step": 1440 + }, + { + "epoch": 0.24665016642485277, + "grad_norm": 0.49685090209168625, + "learning_rate": 4.1097838452787265e-05, + "loss": 0.5087, + "step": 1445 + }, + { + "epoch": 0.24750362720832977, + "grad_norm": 0.6805393500926865, + "learning_rate": 4.124004550625711e-05, + "loss": 0.5096, + "step": 1450 + }, + { + "epoch": 0.24835708799180678, + "grad_norm": 0.5725498300350252, + "learning_rate": 4.138225255972696e-05, + "loss": 0.4944, + "step": 1455 + }, + { + "epoch": 0.24921054877528379, + "grad_norm": 0.6133397456710716, + "learning_rate": 4.152445961319682e-05, + "loss": 0.504, + "step": 1460 + }, + { + "epoch": 0.2500640095587608, + "grad_norm": 0.718665494692223, + "learning_rate": 4.166666666666667e-05, + "loss": 0.5238, + "step": 1465 + }, + { + "epoch": 0.25091747034223777, + "grad_norm": 0.6247896227403243, + "learning_rate": 4.180887372013652e-05, + "loss": 0.5065, + "step": 1470 + }, + { + "epoch": 0.25177093112571475, + "grad_norm": 0.538557800481529, + "learning_rate": 4.195108077360637e-05, + "loss": 0.493, + "step": 1475 + }, + { + "epoch": 0.2526243919091918, + "grad_norm": 0.6496369108151522, + "learning_rate": 4.2093287827076224e-05, + "loss": 0.5221, + "step": 1480 + }, + { + "epoch": 0.25347785269266876, + "grad_norm": 0.48828596472972324, + "learning_rate": 4.223549488054608e-05, + "loss": 0.5136, + "step": 1485 + }, + { + "epoch": 0.2543313134761458, + "grad_norm": 0.5625364791015088, + "learning_rate": 4.237770193401593e-05, + "loss": 0.5145, + "step": 1490 + }, + { + "epoch": 0.25518477425962277, + "grad_norm": 0.5017063317201218, + "learning_rate": 4.2519908987485777e-05, + "loss": 0.5582, + "step": 1495 + }, + { + "epoch": 0.25603823504309975, + "grad_norm": 0.6288989261344257, + "learning_rate": 4.266211604095564e-05, + "loss": 0.53, + "step": 1500 + }, + { + "epoch": 0.2568916958265768, + "grad_norm": 0.6328069624699362, + "learning_rate": 4.280432309442549e-05, + "loss": 0.5444, + "step": 1505 + }, + { + "epoch": 0.25774515661005376, + "grad_norm": 0.5393975697229143, + "learning_rate": 4.2946530147895336e-05, + "loss": 0.4809, + "step": 1510 + }, + { + "epoch": 0.2585986173935308, + "grad_norm": 0.5408443482735174, + "learning_rate": 4.308873720136519e-05, + "loss": 0.509, + "step": 1515 + }, + { + "epoch": 0.25945207817700777, + "grad_norm": 0.7121021821053504, + "learning_rate": 4.3230944254835046e-05, + "loss": 0.5138, + "step": 1520 + }, + { + "epoch": 0.26030553896048475, + "grad_norm": 0.5929259032947689, + "learning_rate": 4.3373151308304895e-05, + "loss": 0.4904, + "step": 1525 + }, + { + "epoch": 0.2611589997439618, + "grad_norm": 0.6827715964490704, + "learning_rate": 4.351535836177474e-05, + "loss": 0.5197, + "step": 1530 + }, + { + "epoch": 0.26201246052743876, + "grad_norm": 0.5893962009428101, + "learning_rate": 4.36575654152446e-05, + "loss": 0.5113, + "step": 1535 + }, + { + "epoch": 0.26286592131091574, + "grad_norm": 0.7665006113847211, + "learning_rate": 4.3799772468714454e-05, + "loss": 0.4894, + "step": 1540 + }, + { + "epoch": 0.2637193820943928, + "grad_norm": 0.5784712623580104, + "learning_rate": 4.39419795221843e-05, + "loss": 0.5031, + "step": 1545 + }, + { + "epoch": 0.26457284287786975, + "grad_norm": 0.4779882713725444, + "learning_rate": 4.408418657565416e-05, + "loss": 0.4841, + "step": 1550 + }, + { + "epoch": 0.2654263036613468, + "grad_norm": 0.5265214659857844, + "learning_rate": 4.4226393629124006e-05, + "loss": 0.5366, + "step": 1555 + }, + { + "epoch": 0.26627976444482376, + "grad_norm": 0.5062628084381423, + "learning_rate": 4.436860068259386e-05, + "loss": 0.516, + "step": 1560 + }, + { + "epoch": 0.26713322522830074, + "grad_norm": 0.5384657066787054, + "learning_rate": 4.451080773606371e-05, + "loss": 0.5437, + "step": 1565 + }, + { + "epoch": 0.2679866860117778, + "grad_norm": 0.4975884755019286, + "learning_rate": 4.4653014789533565e-05, + "loss": 0.4922, + "step": 1570 + }, + { + "epoch": 0.26884014679525475, + "grad_norm": 0.5763335477313506, + "learning_rate": 4.4795221843003413e-05, + "loss": 0.5283, + "step": 1575 + }, + { + "epoch": 0.26969360757873173, + "grad_norm": 0.5978071784312756, + "learning_rate": 4.493742889647327e-05, + "loss": 0.5144, + "step": 1580 + }, + { + "epoch": 0.27054706836220876, + "grad_norm": 0.8359278465607787, + "learning_rate": 4.5079635949943124e-05, + "loss": 0.4983, + "step": 1585 + }, + { + "epoch": 0.27140052914568574, + "grad_norm": 0.6214765417774613, + "learning_rate": 4.522184300341297e-05, + "loss": 0.5367, + "step": 1590 + }, + { + "epoch": 0.2722539899291628, + "grad_norm": 0.508626898726594, + "learning_rate": 4.536405005688282e-05, + "loss": 0.5132, + "step": 1595 + }, + { + "epoch": 0.27310745071263975, + "grad_norm": 0.6022081309698558, + "learning_rate": 4.5506257110352676e-05, + "loss": 0.5004, + "step": 1600 + }, + { + "epoch": 0.27396091149611673, + "grad_norm": 0.5155913335233575, + "learning_rate": 4.564846416382253e-05, + "loss": 0.4656, + "step": 1605 + }, + { + "epoch": 0.27481437227959377, + "grad_norm": 0.5791109901377541, + "learning_rate": 4.579067121729238e-05, + "loss": 0.4984, + "step": 1610 + }, + { + "epoch": 0.27566783306307074, + "grad_norm": 0.5767520283303881, + "learning_rate": 4.593287827076223e-05, + "loss": 0.5158, + "step": 1615 + }, + { + "epoch": 0.2765212938465478, + "grad_norm": 0.48349328202135766, + "learning_rate": 4.6075085324232084e-05, + "loss": 0.516, + "step": 1620 + }, + { + "epoch": 0.27737475463002476, + "grad_norm": 0.5355748029090559, + "learning_rate": 4.621729237770194e-05, + "loss": 0.5548, + "step": 1625 + }, + { + "epoch": 0.27822821541350173, + "grad_norm": 0.5274726573128763, + "learning_rate": 4.635949943117179e-05, + "loss": 0.5171, + "step": 1630 + }, + { + "epoch": 0.27908167619697877, + "grad_norm": 0.6055861657371587, + "learning_rate": 4.6501706484641636e-05, + "loss": 0.5496, + "step": 1635 + }, + { + "epoch": 0.27993513698045575, + "grad_norm": 0.5525370956087899, + "learning_rate": 4.664391353811149e-05, + "loss": 0.4954, + "step": 1640 + }, + { + "epoch": 0.2807885977639327, + "grad_norm": 0.7432424117951163, + "learning_rate": 4.6786120591581347e-05, + "loss": 0.5069, + "step": 1645 + }, + { + "epoch": 0.28164205854740976, + "grad_norm": 0.5428701235562144, + "learning_rate": 4.6928327645051195e-05, + "loss": 0.516, + "step": 1650 + }, + { + "epoch": 0.28249551933088674, + "grad_norm": 0.5595977569856022, + "learning_rate": 4.707053469852105e-05, + "loss": 0.5287, + "step": 1655 + }, + { + "epoch": 0.28334898011436377, + "grad_norm": 0.5031490669266451, + "learning_rate": 4.72127417519909e-05, + "loss": 0.4772, + "step": 1660 + }, + { + "epoch": 0.28420244089784075, + "grad_norm": 0.5529373305843895, + "learning_rate": 4.7354948805460754e-05, + "loss": 0.5293, + "step": 1665 + }, + { + "epoch": 0.2850559016813177, + "grad_norm": 0.5050081484901403, + "learning_rate": 4.74971558589306e-05, + "loss": 0.5124, + "step": 1670 + }, + { + "epoch": 0.28590936246479476, + "grad_norm": 0.5466603151465805, + "learning_rate": 4.763936291240046e-05, + "loss": 0.5161, + "step": 1675 + }, + { + "epoch": 0.28676282324827174, + "grad_norm": 0.5114919686427399, + "learning_rate": 4.778156996587031e-05, + "loss": 0.5122, + "step": 1680 + }, + { + "epoch": 0.2876162840317487, + "grad_norm": 0.505864506408219, + "learning_rate": 4.792377701934016e-05, + "loss": 0.5015, + "step": 1685 + }, + { + "epoch": 0.28846974481522575, + "grad_norm": 0.5253048001147426, + "learning_rate": 4.806598407281002e-05, + "loss": 0.5189, + "step": 1690 + }, + { + "epoch": 0.2893232055987027, + "grad_norm": 0.577856690154839, + "learning_rate": 4.8208191126279865e-05, + "loss": 0.5189, + "step": 1695 + }, + { + "epoch": 0.29017666638217976, + "grad_norm": 0.6130819699326354, + "learning_rate": 4.835039817974972e-05, + "loss": 0.5302, + "step": 1700 + }, + { + "epoch": 0.29103012716565674, + "grad_norm": 0.5188890895398451, + "learning_rate": 4.849260523321957e-05, + "loss": 0.5273, + "step": 1705 + }, + { + "epoch": 0.2918835879491337, + "grad_norm": 0.5787667071281957, + "learning_rate": 4.8634812286689424e-05, + "loss": 0.5221, + "step": 1710 + }, + { + "epoch": 0.29273704873261075, + "grad_norm": 0.5207600512605698, + "learning_rate": 4.877701934015927e-05, + "loss": 0.5224, + "step": 1715 + }, + { + "epoch": 0.2935905095160877, + "grad_norm": 0.47177284402958103, + "learning_rate": 4.891922639362913e-05, + "loss": 0.5052, + "step": 1720 + }, + { + "epoch": 0.29444397029956476, + "grad_norm": 0.5622381226349713, + "learning_rate": 4.906143344709898e-05, + "loss": 0.4944, + "step": 1725 + }, + { + "epoch": 0.29529743108304174, + "grad_norm": 0.5135889415167705, + "learning_rate": 4.920364050056883e-05, + "loss": 0.4853, + "step": 1730 + }, + { + "epoch": 0.2961508918665187, + "grad_norm": 0.5568571026359952, + "learning_rate": 4.934584755403868e-05, + "loss": 0.5118, + "step": 1735 + }, + { + "epoch": 0.29700435264999575, + "grad_norm": 0.5358216069493535, + "learning_rate": 4.9488054607508536e-05, + "loss": 0.5026, + "step": 1740 + }, + { + "epoch": 0.29785781343347273, + "grad_norm": 0.530619616402973, + "learning_rate": 4.963026166097839e-05, + "loss": 0.5146, + "step": 1745 + }, + { + "epoch": 0.2987112742169497, + "grad_norm": 0.499787376121546, + "learning_rate": 4.977246871444824e-05, + "loss": 0.4968, + "step": 1750 + }, + { + "epoch": 0.29956473500042674, + "grad_norm": 0.5101078686357036, + "learning_rate": 4.991467576791809e-05, + "loss": 0.4946, + "step": 1755 + }, + { + "epoch": 0.3004181957839037, + "grad_norm": 0.6548359705660204, + "learning_rate": 4.999367728882145e-05, + "loss": 0.5244, + "step": 1760 + }, + { + "epoch": 0.30127165656738075, + "grad_norm": 0.48137998048070096, + "learning_rate": 4.997787051087506e-05, + "loss": 0.5162, + "step": 1765 + }, + { + "epoch": 0.30212511735085773, + "grad_norm": 0.48614869274456174, + "learning_rate": 4.9962063732928684e-05, + "loss": 0.5062, + "step": 1770 + }, + { + "epoch": 0.3029785781343347, + "grad_norm": 0.497782874867819, + "learning_rate": 4.99462569549823e-05, + "loss": 0.5455, + "step": 1775 + }, + { + "epoch": 0.30383203891781174, + "grad_norm": 0.5230168871985573, + "learning_rate": 4.993045017703591e-05, + "loss": 0.5312, + "step": 1780 + }, + { + "epoch": 0.3046854997012887, + "grad_norm": 0.582983660254052, + "learning_rate": 4.991464339908954e-05, + "loss": 0.5127, + "step": 1785 + }, + { + "epoch": 0.3055389604847657, + "grad_norm": 0.5396571524519772, + "learning_rate": 4.989883662114315e-05, + "loss": 0.4799, + "step": 1790 + }, + { + "epoch": 0.30639242126824273, + "grad_norm": 0.4678056267208111, + "learning_rate": 4.9883029843196765e-05, + "loss": 0.4892, + "step": 1795 + }, + { + "epoch": 0.3072458820517197, + "grad_norm": 0.5877677893826099, + "learning_rate": 4.986722306525038e-05, + "loss": 0.5267, + "step": 1800 + }, + { + "epoch": 0.30809934283519674, + "grad_norm": 0.47625277215848183, + "learning_rate": 4.9851416287304e-05, + "loss": 0.51, + "step": 1805 + }, + { + "epoch": 0.3089528036186737, + "grad_norm": 0.4607978832628274, + "learning_rate": 4.983560950935762e-05, + "loss": 0.5005, + "step": 1810 + }, + { + "epoch": 0.3098062644021507, + "grad_norm": 0.508720567496007, + "learning_rate": 4.981980273141123e-05, + "loss": 0.4917, + "step": 1815 + }, + { + "epoch": 0.31065972518562773, + "grad_norm": 0.5498811575656698, + "learning_rate": 4.980399595346485e-05, + "loss": 0.539, + "step": 1820 + }, + { + "epoch": 0.3115131859691047, + "grad_norm": 0.5059386725583341, + "learning_rate": 4.978818917551846e-05, + "loss": 0.5072, + "step": 1825 + }, + { + "epoch": 0.31236664675258174, + "grad_norm": 0.4591900174762019, + "learning_rate": 4.977238239757208e-05, + "loss": 0.5296, + "step": 1830 + }, + { + "epoch": 0.3132201075360587, + "grad_norm": 0.6389348492531627, + "learning_rate": 4.97565756196257e-05, + "loss": 0.4879, + "step": 1835 + }, + { + "epoch": 0.3140735683195357, + "grad_norm": 0.5536804544450629, + "learning_rate": 4.9740768841679315e-05, + "loss": 0.4865, + "step": 1840 + }, + { + "epoch": 0.31492702910301273, + "grad_norm": 0.47776468085928686, + "learning_rate": 4.972496206373293e-05, + "loss": 0.5003, + "step": 1845 + }, + { + "epoch": 0.3157804898864897, + "grad_norm": 0.5040049671237692, + "learning_rate": 4.970915528578654e-05, + "loss": 0.487, + "step": 1850 + }, + { + "epoch": 0.3166339506699667, + "grad_norm": 0.48062488023690886, + "learning_rate": 4.969334850784017e-05, + "loss": 0.4891, + "step": 1855 + }, + { + "epoch": 0.3174874114534437, + "grad_norm": 0.5093613218378991, + "learning_rate": 4.967754172989378e-05, + "loss": 0.4753, + "step": 1860 + }, + { + "epoch": 0.3183408722369207, + "grad_norm": 0.5624601450965416, + "learning_rate": 4.9661734951947396e-05, + "loss": 0.5231, + "step": 1865 + }, + { + "epoch": 0.31919433302039774, + "grad_norm": 0.5871555056740465, + "learning_rate": 4.964592817400101e-05, + "loss": 0.502, + "step": 1870 + }, + { + "epoch": 0.3200477938038747, + "grad_norm": 0.6212161463505985, + "learning_rate": 4.963012139605463e-05, + "loss": 0.5248, + "step": 1875 + }, + { + "epoch": 0.3209012545873517, + "grad_norm": 0.5483362010525371, + "learning_rate": 4.961431461810825e-05, + "loss": 0.497, + "step": 1880 + }, + { + "epoch": 0.3217547153708287, + "grad_norm": 0.5386127950130882, + "learning_rate": 4.959850784016186e-05, + "loss": 0.5255, + "step": 1885 + }, + { + "epoch": 0.3226081761543057, + "grad_norm": 0.5878591256347312, + "learning_rate": 4.958270106221548e-05, + "loss": 0.4989, + "step": 1890 + }, + { + "epoch": 0.3234616369377827, + "grad_norm": 0.5174651720515663, + "learning_rate": 4.9566894284269094e-05, + "loss": 0.5361, + "step": 1895 + }, + { + "epoch": 0.3243150977212597, + "grad_norm": 0.5698729020989101, + "learning_rate": 4.955108750632271e-05, + "loss": 0.4916, + "step": 1900 + }, + { + "epoch": 0.3251685585047367, + "grad_norm": 0.4601199986994053, + "learning_rate": 4.9535280728376335e-05, + "loss": 0.4879, + "step": 1905 + }, + { + "epoch": 0.3260220192882137, + "grad_norm": 0.6866248215777093, + "learning_rate": 4.9519473950429946e-05, + "loss": 0.5312, + "step": 1910 + }, + { + "epoch": 0.3268754800716907, + "grad_norm": 0.5412387753636492, + "learning_rate": 4.9503667172483563e-05, + "loss": 0.5012, + "step": 1915 + }, + { + "epoch": 0.3277289408551677, + "grad_norm": 0.5873267793920224, + "learning_rate": 4.948786039453718e-05, + "loss": 0.509, + "step": 1920 + }, + { + "epoch": 0.3285824016386447, + "grad_norm": 0.5189115320755102, + "learning_rate": 4.94720536165908e-05, + "loss": 0.484, + "step": 1925 + }, + { + "epoch": 0.3294358624221217, + "grad_norm": 0.5022541830055647, + "learning_rate": 4.9456246838644416e-05, + "loss": 0.502, + "step": 1930 + }, + { + "epoch": 0.33028932320559873, + "grad_norm": 0.6636199777411177, + "learning_rate": 4.9440440060698027e-05, + "loss": 0.5468, + "step": 1935 + }, + { + "epoch": 0.3311427839890757, + "grad_norm": 0.47976829113815256, + "learning_rate": 4.942463328275165e-05, + "loss": 0.5609, + "step": 1940 + }, + { + "epoch": 0.3319962447725527, + "grad_norm": 0.5364386551157393, + "learning_rate": 4.940882650480526e-05, + "loss": 0.5277, + "step": 1945 + }, + { + "epoch": 0.3328497055560297, + "grad_norm": 0.5284348268891784, + "learning_rate": 4.939301972685888e-05, + "loss": 0.5115, + "step": 1950 + }, + { + "epoch": 0.3337031663395067, + "grad_norm": 0.5985767793816986, + "learning_rate": 4.9377212948912496e-05, + "loss": 0.4869, + "step": 1955 + }, + { + "epoch": 0.3345566271229837, + "grad_norm": 0.49985285222802744, + "learning_rate": 4.9361406170966114e-05, + "loss": 0.5194, + "step": 1960 + }, + { + "epoch": 0.3354100879064607, + "grad_norm": 0.47473725204758305, + "learning_rate": 4.934559939301973e-05, + "loss": 0.5156, + "step": 1965 + }, + { + "epoch": 0.3362635486899377, + "grad_norm": 0.4813926004817435, + "learning_rate": 4.932979261507334e-05, + "loss": 0.511, + "step": 1970 + }, + { + "epoch": 0.3371170094734147, + "grad_norm": 0.44287958712492753, + "learning_rate": 4.9313985837126966e-05, + "loss": 0.5052, + "step": 1975 + }, + { + "epoch": 0.3379704702568917, + "grad_norm": 0.8696903571161176, + "learning_rate": 4.929817905918058e-05, + "loss": 0.4993, + "step": 1980 + }, + { + "epoch": 0.3388239310403687, + "grad_norm": 0.4961065331077907, + "learning_rate": 4.9282372281234194e-05, + "loss": 0.4847, + "step": 1985 + }, + { + "epoch": 0.3396773918238457, + "grad_norm": 0.4303833412454493, + "learning_rate": 4.926656550328781e-05, + "loss": 0.5273, + "step": 1990 + }, + { + "epoch": 0.3405308526073227, + "grad_norm": 0.5307093252344055, + "learning_rate": 4.925075872534143e-05, + "loss": 0.4798, + "step": 1995 + }, + { + "epoch": 0.34138431339079967, + "grad_norm": 0.588253233323147, + "learning_rate": 4.923495194739505e-05, + "loss": 0.4944, + "step": 2000 + }, + { + "epoch": 0.3422377741742767, + "grad_norm": 0.4260031992882977, + "learning_rate": 4.921914516944866e-05, + "loss": 0.498, + "step": 2005 + }, + { + "epoch": 0.3430912349577537, + "grad_norm": 0.6840740508852956, + "learning_rate": 4.920333839150228e-05, + "loss": 0.5146, + "step": 2010 + }, + { + "epoch": 0.3439446957412307, + "grad_norm": 0.515886091219417, + "learning_rate": 4.918753161355589e-05, + "loss": 0.5015, + "step": 2015 + }, + { + "epoch": 0.3447981565247077, + "grad_norm": 0.5556290220178304, + "learning_rate": 4.917172483560951e-05, + "loss": 0.5045, + "step": 2020 + }, + { + "epoch": 0.34565161730818467, + "grad_norm": 0.49030475027530085, + "learning_rate": 4.9155918057663134e-05, + "loss": 0.5244, + "step": 2025 + }, + { + "epoch": 0.3465050780916617, + "grad_norm": 0.5762252501319363, + "learning_rate": 4.9140111279716745e-05, + "loss": 0.5091, + "step": 2030 + }, + { + "epoch": 0.3473585388751387, + "grad_norm": 0.5442309646550838, + "learning_rate": 4.912430450177036e-05, + "loss": 0.5201, + "step": 2035 + }, + { + "epoch": 0.3482119996586157, + "grad_norm": 0.49245899017001116, + "learning_rate": 4.910849772382398e-05, + "loss": 0.4858, + "step": 2040 + }, + { + "epoch": 0.3490654604420927, + "grad_norm": 0.4810738247547212, + "learning_rate": 4.90926909458776e-05, + "loss": 0.4851, + "step": 2045 + }, + { + "epoch": 0.34991892122556967, + "grad_norm": 0.5208290744758776, + "learning_rate": 4.907688416793121e-05, + "loss": 0.5017, + "step": 2050 + }, + { + "epoch": 0.3507723820090467, + "grad_norm": 0.5332087434013345, + "learning_rate": 4.9061077389984825e-05, + "loss": 0.4798, + "step": 2055 + }, + { + "epoch": 0.3516258427925237, + "grad_norm": 0.4661707910424319, + "learning_rate": 4.904527061203845e-05, + "loss": 0.4949, + "step": 2060 + }, + { + "epoch": 0.35247930357600066, + "grad_norm": 0.6661288913768391, + "learning_rate": 4.902946383409206e-05, + "loss": 0.523, + "step": 2065 + }, + { + "epoch": 0.3533327643594777, + "grad_norm": 0.495108641158868, + "learning_rate": 4.901365705614568e-05, + "loss": 0.4852, + "step": 2070 + }, + { + "epoch": 0.35418622514295467, + "grad_norm": 0.4680124924286701, + "learning_rate": 4.8997850278199295e-05, + "loss": 0.4814, + "step": 2075 + }, + { + "epoch": 0.3550396859264317, + "grad_norm": 0.39154448085109655, + "learning_rate": 4.898204350025291e-05, + "loss": 0.4762, + "step": 2080 + }, + { + "epoch": 0.3558931467099087, + "grad_norm": 0.4315351278111706, + "learning_rate": 4.896623672230653e-05, + "loss": 0.5076, + "step": 2085 + }, + { + "epoch": 0.35674660749338566, + "grad_norm": 0.6209355293840191, + "learning_rate": 4.895042994436014e-05, + "loss": 0.4814, + "step": 2090 + }, + { + "epoch": 0.3576000682768627, + "grad_norm": 0.58744008191501, + "learning_rate": 4.8934623166413765e-05, + "loss": 0.5386, + "step": 2095 + }, + { + "epoch": 0.35845352906033967, + "grad_norm": 0.523456459006555, + "learning_rate": 4.8918816388467376e-05, + "loss": 0.5407, + "step": 2100 + }, + { + "epoch": 0.3593069898438167, + "grad_norm": 0.5858458920130489, + "learning_rate": 4.890300961052099e-05, + "loss": 0.5043, + "step": 2105 + }, + { + "epoch": 0.3601604506272937, + "grad_norm": 0.622924245935873, + "learning_rate": 4.888720283257461e-05, + "loss": 0.4852, + "step": 2110 + }, + { + "epoch": 0.36101391141077066, + "grad_norm": 0.5331367222638274, + "learning_rate": 4.887139605462823e-05, + "loss": 0.5179, + "step": 2115 + }, + { + "epoch": 0.3618673721942477, + "grad_norm": 0.6035015648064055, + "learning_rate": 4.8855589276681845e-05, + "loss": 0.4673, + "step": 2120 + }, + { + "epoch": 0.36272083297772467, + "grad_norm": 0.5614942447133979, + "learning_rate": 4.8839782498735456e-05, + "loss": 0.4953, + "step": 2125 + }, + { + "epoch": 0.36357429376120165, + "grad_norm": 0.5417698788101754, + "learning_rate": 4.882397572078908e-05, + "loss": 0.51, + "step": 2130 + }, + { + "epoch": 0.3644277545446787, + "grad_norm": 0.43028318471622506, + "learning_rate": 4.880816894284269e-05, + "loss": 0.5143, + "step": 2135 + }, + { + "epoch": 0.36528121532815566, + "grad_norm": 0.46453203731114756, + "learning_rate": 4.879236216489631e-05, + "loss": 0.5029, + "step": 2140 + }, + { + "epoch": 0.3661346761116327, + "grad_norm": 0.4414681810312383, + "learning_rate": 4.8776555386949926e-05, + "loss": 0.517, + "step": 2145 + }, + { + "epoch": 0.3669881368951097, + "grad_norm": 0.5440667658494504, + "learning_rate": 4.876074860900354e-05, + "loss": 0.4952, + "step": 2150 + }, + { + "epoch": 0.36784159767858665, + "grad_norm": 0.5719949542451221, + "learning_rate": 4.874494183105716e-05, + "loss": 0.5067, + "step": 2155 + }, + { + "epoch": 0.3686950584620637, + "grad_norm": 0.4987125482108236, + "learning_rate": 4.872913505311078e-05, + "loss": 0.5011, + "step": 2160 + }, + { + "epoch": 0.36954851924554066, + "grad_norm": 0.48371745245377856, + "learning_rate": 4.8713328275164396e-05, + "loss": 0.5104, + "step": 2165 + }, + { + "epoch": 0.37040198002901764, + "grad_norm": 0.5571059180363649, + "learning_rate": 4.8697521497218006e-05, + "loss": 0.4963, + "step": 2170 + }, + { + "epoch": 0.3712554408124947, + "grad_norm": 0.43563340403560935, + "learning_rate": 4.8681714719271624e-05, + "loss": 0.5039, + "step": 2175 + }, + { + "epoch": 0.37210890159597165, + "grad_norm": 0.4691492163127887, + "learning_rate": 4.866590794132525e-05, + "loss": 0.4919, + "step": 2180 + }, + { + "epoch": 0.3729623623794487, + "grad_norm": 0.6600911089239349, + "learning_rate": 4.865010116337886e-05, + "loss": 0.5053, + "step": 2185 + }, + { + "epoch": 0.37381582316292566, + "grad_norm": 0.5111477799911323, + "learning_rate": 4.8634294385432476e-05, + "loss": 0.4909, + "step": 2190 + }, + { + "epoch": 0.37466928394640264, + "grad_norm": 0.6057332546181553, + "learning_rate": 4.8618487607486094e-05, + "loss": 0.5082, + "step": 2195 + }, + { + "epoch": 0.3755227447298797, + "grad_norm": 0.44911636114761194, + "learning_rate": 4.860268082953971e-05, + "loss": 0.4956, + "step": 2200 + }, + { + "epoch": 0.37637620551335665, + "grad_norm": 0.5266248333158463, + "learning_rate": 4.858687405159332e-05, + "loss": 0.4905, + "step": 2205 + }, + { + "epoch": 0.3772296662968337, + "grad_norm": 0.5209847386036796, + "learning_rate": 4.857106727364694e-05, + "loss": 0.5158, + "step": 2210 + }, + { + "epoch": 0.37808312708031067, + "grad_norm": 0.42162475531591376, + "learning_rate": 4.8555260495700563e-05, + "loss": 0.4957, + "step": 2215 + }, + { + "epoch": 0.37893658786378764, + "grad_norm": 0.5268786836115406, + "learning_rate": 4.8539453717754174e-05, + "loss": 0.5108, + "step": 2220 + }, + { + "epoch": 0.3797900486472647, + "grad_norm": 0.5502003172812554, + "learning_rate": 4.852364693980779e-05, + "loss": 0.503, + "step": 2225 + }, + { + "epoch": 0.38064350943074166, + "grad_norm": 0.47184815550175846, + "learning_rate": 4.850784016186141e-05, + "loss": 0.4816, + "step": 2230 + }, + { + "epoch": 0.38149697021421863, + "grad_norm": 0.4914467821125124, + "learning_rate": 4.8492033383915027e-05, + "loss": 0.5124, + "step": 2235 + }, + { + "epoch": 0.38235043099769567, + "grad_norm": 0.5284192832763485, + "learning_rate": 4.8476226605968644e-05, + "loss": 0.527, + "step": 2240 + }, + { + "epoch": 0.38320389178117265, + "grad_norm": 0.43916650960746295, + "learning_rate": 4.8460419828022255e-05, + "loss": 0.4853, + "step": 2245 + }, + { + "epoch": 0.3840573525646497, + "grad_norm": 0.4822030333495498, + "learning_rate": 4.844461305007588e-05, + "loss": 0.5259, + "step": 2250 + }, + { + "epoch": 0.38491081334812666, + "grad_norm": 0.4976857587767006, + "learning_rate": 4.842880627212949e-05, + "loss": 0.5126, + "step": 2255 + }, + { + "epoch": 0.38576427413160363, + "grad_norm": 0.5028406437234613, + "learning_rate": 4.841299949418311e-05, + "loss": 0.5037, + "step": 2260 + }, + { + "epoch": 0.38661773491508067, + "grad_norm": 0.5703060961212163, + "learning_rate": 4.8397192716236725e-05, + "loss": 0.5136, + "step": 2265 + }, + { + "epoch": 0.38747119569855765, + "grad_norm": 0.5540371585065555, + "learning_rate": 4.838138593829034e-05, + "loss": 0.5219, + "step": 2270 + }, + { + "epoch": 0.3883246564820346, + "grad_norm": 0.460303219646615, + "learning_rate": 4.836557916034396e-05, + "loss": 0.465, + "step": 2275 + }, + { + "epoch": 0.38917811726551166, + "grad_norm": 0.6376586309384963, + "learning_rate": 4.834977238239757e-05, + "loss": 0.5159, + "step": 2280 + }, + { + "epoch": 0.39003157804898864, + "grad_norm": 0.5385467971311747, + "learning_rate": 4.8333965604451194e-05, + "loss": 0.5133, + "step": 2285 + }, + { + "epoch": 0.39088503883246567, + "grad_norm": 0.45603340338427445, + "learning_rate": 4.8318158826504805e-05, + "loss": 0.4883, + "step": 2290 + }, + { + "epoch": 0.39173849961594265, + "grad_norm": 0.495022738683918, + "learning_rate": 4.830235204855842e-05, + "loss": 0.4919, + "step": 2295 + }, + { + "epoch": 0.3925919603994196, + "grad_norm": 0.4198291402943894, + "learning_rate": 4.828654527061204e-05, + "loss": 0.4913, + "step": 2300 + }, + { + "epoch": 0.39344542118289666, + "grad_norm": 0.4897660003658656, + "learning_rate": 4.827073849266566e-05, + "loss": 0.4885, + "step": 2305 + }, + { + "epoch": 0.39429888196637364, + "grad_norm": 0.5880897395232438, + "learning_rate": 4.8254931714719275e-05, + "loss": 0.5182, + "step": 2310 + }, + { + "epoch": 0.39515234274985067, + "grad_norm": 0.5237830796008816, + "learning_rate": 4.823912493677289e-05, + "loss": 0.5066, + "step": 2315 + }, + { + "epoch": 0.39600580353332765, + "grad_norm": 0.46915821115914, + "learning_rate": 4.822331815882651e-05, + "loss": 0.5038, + "step": 2320 + }, + { + "epoch": 0.3968592643168046, + "grad_norm": 0.489977862747463, + "learning_rate": 4.820751138088012e-05, + "loss": 0.5023, + "step": 2325 + }, + { + "epoch": 0.39771272510028166, + "grad_norm": 0.5519206595985753, + "learning_rate": 4.819170460293374e-05, + "loss": 0.5092, + "step": 2330 + }, + { + "epoch": 0.39856618588375864, + "grad_norm": 0.7853155996315747, + "learning_rate": 4.817589782498736e-05, + "loss": 0.4821, + "step": 2335 + }, + { + "epoch": 0.3994196466672356, + "grad_norm": 0.533071490199691, + "learning_rate": 4.816009104704097e-05, + "loss": 0.4623, + "step": 2340 + }, + { + "epoch": 0.40027310745071265, + "grad_norm": 0.4311014867792575, + "learning_rate": 4.814428426909459e-05, + "loss": 0.5367, + "step": 2345 + }, + { + "epoch": 0.40112656823418963, + "grad_norm": 0.5065115129965704, + "learning_rate": 4.812847749114821e-05, + "loss": 0.4755, + "step": 2350 + }, + { + "epoch": 0.40198002901766666, + "grad_norm": 0.6459279188772843, + "learning_rate": 4.8112670713201825e-05, + "loss": 0.4819, + "step": 2355 + }, + { + "epoch": 0.40283348980114364, + "grad_norm": 0.5234927153215261, + "learning_rate": 4.8096863935255436e-05, + "loss": 0.5055, + "step": 2360 + }, + { + "epoch": 0.4036869505846206, + "grad_norm": 0.5002221843150646, + "learning_rate": 4.808105715730905e-05, + "loss": 0.4641, + "step": 2365 + }, + { + "epoch": 0.40454041136809765, + "grad_norm": 0.7926000042008581, + "learning_rate": 4.806525037936268e-05, + "loss": 0.4794, + "step": 2370 + }, + { + "epoch": 0.40539387215157463, + "grad_norm": 0.4819003808100341, + "learning_rate": 4.804944360141629e-05, + "loss": 0.5226, + "step": 2375 + }, + { + "epoch": 0.4062473329350516, + "grad_norm": 0.7494131850862085, + "learning_rate": 4.8033636823469906e-05, + "loss": 0.4793, + "step": 2380 + }, + { + "epoch": 0.40710079371852864, + "grad_norm": 0.43027822412816397, + "learning_rate": 4.801783004552352e-05, + "loss": 0.4661, + "step": 2385 + }, + { + "epoch": 0.4079542545020056, + "grad_norm": 0.5258254862312308, + "learning_rate": 4.800202326757714e-05, + "loss": 0.494, + "step": 2390 + }, + { + "epoch": 0.40880771528548265, + "grad_norm": 0.5089133616963131, + "learning_rate": 4.798621648963076e-05, + "loss": 0.4804, + "step": 2395 + }, + { + "epoch": 0.40966117606895963, + "grad_norm": 0.4832185604406258, + "learning_rate": 4.797040971168437e-05, + "loss": 0.4921, + "step": 2400 + }, + { + "epoch": 0.4105146368524366, + "grad_norm": 0.4978598215249046, + "learning_rate": 4.795460293373799e-05, + "loss": 0.4759, + "step": 2405 + }, + { + "epoch": 0.41136809763591364, + "grad_norm": 0.48133388100148394, + "learning_rate": 4.7938796155791604e-05, + "loss": 0.5048, + "step": 2410 + }, + { + "epoch": 0.4122215584193906, + "grad_norm": 0.4696469239540106, + "learning_rate": 4.792298937784522e-05, + "loss": 0.5042, + "step": 2415 + }, + { + "epoch": 0.41307501920286765, + "grad_norm": 0.5581633064738356, + "learning_rate": 4.790718259989884e-05, + "loss": 0.4927, + "step": 2420 + }, + { + "epoch": 0.41392847998634463, + "grad_norm": 0.6278902607749504, + "learning_rate": 4.7891375821952456e-05, + "loss": 0.503, + "step": 2425 + }, + { + "epoch": 0.4147819407698216, + "grad_norm": 0.4517239717322925, + "learning_rate": 4.7875569044006074e-05, + "loss": 0.5011, + "step": 2430 + }, + { + "epoch": 0.41563540155329864, + "grad_norm": 0.6903762794458227, + "learning_rate": 4.785976226605969e-05, + "loss": 0.5233, + "step": 2435 + }, + { + "epoch": 0.4164888623367756, + "grad_norm": 3.2978680656603636, + "learning_rate": 4.784395548811331e-05, + "loss": 0.5109, + "step": 2440 + }, + { + "epoch": 0.4173423231202526, + "grad_norm": 1.1420518689716137, + "learning_rate": 4.782814871016692e-05, + "loss": 0.4938, + "step": 2445 + }, + { + "epoch": 0.41819578390372963, + "grad_norm": 1.8300738320275258, + "learning_rate": 4.7812341932220537e-05, + "loss": 0.5105, + "step": 2450 + }, + { + "epoch": 0.4190492446872066, + "grad_norm": 0.6481455070850186, + "learning_rate": 4.7796535154274154e-05, + "loss": 0.51, + "step": 2455 + }, + { + "epoch": 0.41990270547068365, + "grad_norm": 0.7913096084633933, + "learning_rate": 4.778072837632777e-05, + "loss": 0.4987, + "step": 2460 + }, + { + "epoch": 0.4207561662541606, + "grad_norm": 1.2255098650997986, + "learning_rate": 4.776492159838139e-05, + "loss": 0.5009, + "step": 2465 + }, + { + "epoch": 0.4216096270376376, + "grad_norm": 0.6795160628453747, + "learning_rate": 4.7749114820435006e-05, + "loss": 0.5186, + "step": 2470 + }, + { + "epoch": 0.42246308782111464, + "grad_norm": 0.4898195410241313, + "learning_rate": 4.7733308042488624e-05, + "loss": 0.495, + "step": 2475 + }, + { + "epoch": 0.4233165486045916, + "grad_norm": 0.6029736724871774, + "learning_rate": 4.7717501264542235e-05, + "loss": 0.5097, + "step": 2480 + }, + { + "epoch": 0.4241700093880686, + "grad_norm": 0.6029111635582989, + "learning_rate": 4.770169448659585e-05, + "loss": 0.4871, + "step": 2485 + }, + { + "epoch": 0.4250234701715456, + "grad_norm": 0.49507690107864344, + "learning_rate": 4.7685887708649476e-05, + "loss": 0.4755, + "step": 2490 + }, + { + "epoch": 0.4258769309550226, + "grad_norm": 0.6613373223792829, + "learning_rate": 4.767008093070309e-05, + "loss": 0.5154, + "step": 2495 + }, + { + "epoch": 0.42673039173849964, + "grad_norm": 0.49685624332129247, + "learning_rate": 4.7654274152756704e-05, + "loss": 0.4927, + "step": 2500 + }, + { + "epoch": 0.4275838525219766, + "grad_norm": 0.4437325089367657, + "learning_rate": 4.763846737481032e-05, + "loss": 0.4792, + "step": 2505 + }, + { + "epoch": 0.4284373133054536, + "grad_norm": 0.5576342725164943, + "learning_rate": 4.762266059686394e-05, + "loss": 0.5079, + "step": 2510 + }, + { + "epoch": 0.4292907740889306, + "grad_norm": 0.46585229501787667, + "learning_rate": 4.760685381891755e-05, + "loss": 0.5006, + "step": 2515 + }, + { + "epoch": 0.4301442348724076, + "grad_norm": 0.6845463777228801, + "learning_rate": 4.759104704097117e-05, + "loss": 0.5115, + "step": 2520 + }, + { + "epoch": 0.43099769565588464, + "grad_norm": 0.5057601413614787, + "learning_rate": 4.757524026302479e-05, + "loss": 0.4927, + "step": 2525 + }, + { + "epoch": 0.4318511564393616, + "grad_norm": 0.5684767431715674, + "learning_rate": 4.75594334850784e-05, + "loss": 0.5094, + "step": 2530 + }, + { + "epoch": 0.4327046172228386, + "grad_norm": 0.7288429534702419, + "learning_rate": 4.754362670713202e-05, + "loss": 0.5102, + "step": 2535 + }, + { + "epoch": 0.43355807800631563, + "grad_norm": 0.6797933337175003, + "learning_rate": 4.752781992918564e-05, + "loss": 0.5011, + "step": 2540 + }, + { + "epoch": 0.4344115387897926, + "grad_norm": 0.7310663062589878, + "learning_rate": 4.7512013151239255e-05, + "loss": 0.5418, + "step": 2545 + }, + { + "epoch": 0.4352649995732696, + "grad_norm": 1.048960668662016, + "learning_rate": 4.7496206373292865e-05, + "loss": 0.4918, + "step": 2550 + }, + { + "epoch": 0.4361184603567466, + "grad_norm": 0.48307167584529137, + "learning_rate": 4.748039959534649e-05, + "loss": 0.4774, + "step": 2555 + }, + { + "epoch": 0.4369719211402236, + "grad_norm": 0.5814531480371788, + "learning_rate": 4.746459281740011e-05, + "loss": 0.4581, + "step": 2560 + }, + { + "epoch": 0.43782538192370063, + "grad_norm": 0.42862234878740185, + "learning_rate": 4.744878603945372e-05, + "loss": 0.5208, + "step": 2565 + }, + { + "epoch": 0.4386788427071776, + "grad_norm": 0.4657196512442816, + "learning_rate": 4.7432979261507335e-05, + "loss": 0.5068, + "step": 2570 + }, + { + "epoch": 0.4395323034906546, + "grad_norm": 0.5131288521906384, + "learning_rate": 4.741717248356095e-05, + "loss": 0.5498, + "step": 2575 + }, + { + "epoch": 0.4403857642741316, + "grad_norm": 0.5803920539575499, + "learning_rate": 4.740136570561457e-05, + "loss": 0.5143, + "step": 2580 + }, + { + "epoch": 0.4412392250576086, + "grad_norm": 0.5181878370826999, + "learning_rate": 4.738555892766819e-05, + "loss": 0.4873, + "step": 2585 + }, + { + "epoch": 0.4420926858410856, + "grad_norm": 0.6480775342678349, + "learning_rate": 4.7369752149721805e-05, + "loss": 0.4641, + "step": 2590 + }, + { + "epoch": 0.4429461466245626, + "grad_norm": 0.4511955581917607, + "learning_rate": 4.735394537177542e-05, + "loss": 0.483, + "step": 2595 + }, + { + "epoch": 0.4437996074080396, + "grad_norm": 0.44086238762815927, + "learning_rate": 4.733813859382903e-05, + "loss": 0.4931, + "step": 2600 + }, + { + "epoch": 0.4446530681915166, + "grad_norm": 0.5191620095061635, + "learning_rate": 4.732233181588265e-05, + "loss": 0.4814, + "step": 2605 + }, + { + "epoch": 0.4455065289749936, + "grad_norm": 0.5151747201853522, + "learning_rate": 4.730652503793627e-05, + "loss": 0.506, + "step": 2610 + }, + { + "epoch": 0.4463599897584706, + "grad_norm": 0.597319461144412, + "learning_rate": 4.7290718259989886e-05, + "loss": 0.5201, + "step": 2615 + }, + { + "epoch": 0.4472134505419476, + "grad_norm": 0.4871202179954294, + "learning_rate": 4.72749114820435e-05, + "loss": 0.4899, + "step": 2620 + }, + { + "epoch": 0.4480669113254246, + "grad_norm": 0.6523872164581523, + "learning_rate": 4.725910470409712e-05, + "loss": 0.4962, + "step": 2625 + }, + { + "epoch": 0.4489203721089016, + "grad_norm": 1.5505489626714442, + "learning_rate": 4.724329792615074e-05, + "loss": 0.5084, + "step": 2630 + }, + { + "epoch": 0.4497738328923786, + "grad_norm": 0.5726929915232936, + "learning_rate": 4.722749114820435e-05, + "loss": 0.4919, + "step": 2635 + }, + { + "epoch": 0.4506272936758556, + "grad_norm": 0.4825017560375864, + "learning_rate": 4.7211684370257966e-05, + "loss": 0.4652, + "step": 2640 + }, + { + "epoch": 0.4514807544593326, + "grad_norm": 0.530849848752361, + "learning_rate": 4.719587759231159e-05, + "loss": 0.5028, + "step": 2645 + }, + { + "epoch": 0.4523342152428096, + "grad_norm": 0.49903865492028343, + "learning_rate": 4.71800708143652e-05, + "loss": 0.545, + "step": 2650 + }, + { + "epoch": 0.45318767602628657, + "grad_norm": 0.43232891166576487, + "learning_rate": 4.716426403641882e-05, + "loss": 0.4847, + "step": 2655 + }, + { + "epoch": 0.4540411368097636, + "grad_norm": 0.4962269363045644, + "learning_rate": 4.7148457258472436e-05, + "loss": 0.495, + "step": 2660 + }, + { + "epoch": 0.4548945975932406, + "grad_norm": 0.3904494391212347, + "learning_rate": 4.713265048052605e-05, + "loss": 0.5095, + "step": 2665 + }, + { + "epoch": 0.4557480583767176, + "grad_norm": 0.4859232333362679, + "learning_rate": 4.7116843702579664e-05, + "loss": 0.499, + "step": 2670 + }, + { + "epoch": 0.4566015191601946, + "grad_norm": 0.5253121206824946, + "learning_rate": 4.710103692463329e-05, + "loss": 0.512, + "step": 2675 + }, + { + "epoch": 0.45745497994367157, + "grad_norm": 0.44486107791609475, + "learning_rate": 4.7085230146686906e-05, + "loss": 0.4962, + "step": 2680 + }, + { + "epoch": 0.4583084407271486, + "grad_norm": 2.0847784182788134, + "learning_rate": 4.7069423368740516e-05, + "loss": 0.4794, + "step": 2685 + }, + { + "epoch": 0.4591619015106256, + "grad_norm": 0.49662006211171916, + "learning_rate": 4.7053616590794134e-05, + "loss": 0.4929, + "step": 2690 + }, + { + "epoch": 0.46001536229410256, + "grad_norm": 0.4588170270812257, + "learning_rate": 4.703780981284775e-05, + "loss": 0.5252, + "step": 2695 + }, + { + "epoch": 0.4608688230775796, + "grad_norm": 0.4238447364177512, + "learning_rate": 4.702200303490137e-05, + "loss": 0.4757, + "step": 2700 + }, + { + "epoch": 0.46172228386105657, + "grad_norm": 0.7349134241268634, + "learning_rate": 4.700619625695498e-05, + "loss": 0.4923, + "step": 2705 + }, + { + "epoch": 0.4625757446445336, + "grad_norm": 0.5620110128820872, + "learning_rate": 4.6990389479008604e-05, + "loss": 0.5046, + "step": 2710 + }, + { + "epoch": 0.4634292054280106, + "grad_norm": 0.43117658804511005, + "learning_rate": 4.697458270106222e-05, + "loss": 0.4937, + "step": 2715 + }, + { + "epoch": 0.46428266621148756, + "grad_norm": 0.5258019562099506, + "learning_rate": 4.695877592311583e-05, + "loss": 0.5429, + "step": 2720 + }, + { + "epoch": 0.4651361269949646, + "grad_norm": 0.7152247622932054, + "learning_rate": 4.694296914516945e-05, + "loss": 0.4617, + "step": 2725 + }, + { + "epoch": 0.46598958777844157, + "grad_norm": 0.5525572451632204, + "learning_rate": 4.692716236722307e-05, + "loss": 0.4953, + "step": 2730 + }, + { + "epoch": 0.4668430485619186, + "grad_norm": 0.4592347168689562, + "learning_rate": 4.6911355589276684e-05, + "loss": 0.4777, + "step": 2735 + }, + { + "epoch": 0.4676965093453956, + "grad_norm": 0.4995452622352298, + "learning_rate": 4.68955488113303e-05, + "loss": 0.4842, + "step": 2740 + }, + { + "epoch": 0.46854997012887256, + "grad_norm": 0.4572243792841723, + "learning_rate": 4.687974203338392e-05, + "loss": 0.4788, + "step": 2745 + }, + { + "epoch": 0.4694034309123496, + "grad_norm": 0.4839684835554378, + "learning_rate": 4.6863935255437537e-05, + "loss": 0.4687, + "step": 2750 + }, + { + "epoch": 0.4702568916958266, + "grad_norm": 0.47612394912228895, + "learning_rate": 4.684812847749115e-05, + "loss": 0.4854, + "step": 2755 + }, + { + "epoch": 0.47111035247930355, + "grad_norm": 0.7490520620929824, + "learning_rate": 4.6832321699544765e-05, + "loss": 0.5242, + "step": 2760 + }, + { + "epoch": 0.4719638132627806, + "grad_norm": 0.47986695057589407, + "learning_rate": 4.681651492159838e-05, + "loss": 0.4879, + "step": 2765 + }, + { + "epoch": 0.47281727404625756, + "grad_norm": 0.4286048199133557, + "learning_rate": 4.6800708143652e-05, + "loss": 0.5156, + "step": 2770 + }, + { + "epoch": 0.4736707348297346, + "grad_norm": 0.45170713322736494, + "learning_rate": 4.678490136570562e-05, + "loss": 0.4836, + "step": 2775 + }, + { + "epoch": 0.4745241956132116, + "grad_norm": 0.6234029954594431, + "learning_rate": 4.6769094587759235e-05, + "loss": 0.5, + "step": 2780 + }, + { + "epoch": 0.47537765639668855, + "grad_norm": 0.759643106803168, + "learning_rate": 4.675328780981285e-05, + "loss": 0.5208, + "step": 2785 + }, + { + "epoch": 0.4762311171801656, + "grad_norm": 1.8711982791961406, + "learning_rate": 4.673748103186646e-05, + "loss": 0.4743, + "step": 2790 + }, + { + "epoch": 0.47708457796364256, + "grad_norm": 0.5627847698519466, + "learning_rate": 4.672167425392009e-05, + "loss": 0.5029, + "step": 2795 + }, + { + "epoch": 0.47793803874711954, + "grad_norm": 0.6159815105546127, + "learning_rate": 4.6705867475973704e-05, + "loss": 0.512, + "step": 2800 + }, + { + "epoch": 0.4787914995305966, + "grad_norm": 0.6492068132157409, + "learning_rate": 4.6690060698027315e-05, + "loss": 0.4964, + "step": 2805 + }, + { + "epoch": 0.47964496031407355, + "grad_norm": 0.4136980806781585, + "learning_rate": 4.667425392008093e-05, + "loss": 0.491, + "step": 2810 + }, + { + "epoch": 0.4804984210975506, + "grad_norm": 0.6222877223645581, + "learning_rate": 4.665844714213455e-05, + "loss": 0.5039, + "step": 2815 + }, + { + "epoch": 0.48135188188102757, + "grad_norm": 0.42847295588776324, + "learning_rate": 4.664264036418817e-05, + "loss": 0.4892, + "step": 2820 + }, + { + "epoch": 0.48220534266450454, + "grad_norm": 0.47614961391215077, + "learning_rate": 4.662683358624178e-05, + "loss": 0.482, + "step": 2825 + }, + { + "epoch": 0.4830588034479816, + "grad_norm": 0.4725249173125883, + "learning_rate": 4.66110268082954e-05, + "loss": 0.5098, + "step": 2830 + }, + { + "epoch": 0.48391226423145856, + "grad_norm": 0.4697674688999553, + "learning_rate": 4.659522003034902e-05, + "loss": 0.5265, + "step": 2835 + }, + { + "epoch": 0.4847657250149356, + "grad_norm": 0.5982380189162461, + "learning_rate": 4.657941325240263e-05, + "loss": 0.4896, + "step": 2840 + }, + { + "epoch": 0.48561918579841257, + "grad_norm": 0.40133960414105363, + "learning_rate": 4.656360647445625e-05, + "loss": 0.5034, + "step": 2845 + }, + { + "epoch": 0.48647264658188955, + "grad_norm": 0.48525613870731715, + "learning_rate": 4.6547799696509865e-05, + "loss": 0.4745, + "step": 2850 + }, + { + "epoch": 0.4873261073653666, + "grad_norm": 0.42933710960223315, + "learning_rate": 4.653199291856348e-05, + "loss": 0.5306, + "step": 2855 + }, + { + "epoch": 0.48817956814884356, + "grad_norm": 1.1063656861366253, + "learning_rate": 4.6516186140617094e-05, + "loss": 0.5048, + "step": 2860 + }, + { + "epoch": 0.48903302893232053, + "grad_norm": 0.5224278678720886, + "learning_rate": 4.650037936267072e-05, + "loss": 0.5039, + "step": 2865 + }, + { + "epoch": 0.48988648971579757, + "grad_norm": 0.5882672184596583, + "learning_rate": 4.6484572584724335e-05, + "loss": 0.4749, + "step": 2870 + }, + { + "epoch": 0.49073995049927455, + "grad_norm": 0.479519347249896, + "learning_rate": 4.6468765806777946e-05, + "loss": 0.5241, + "step": 2875 + }, + { + "epoch": 0.4915934112827516, + "grad_norm": 0.6728167983699064, + "learning_rate": 4.645295902883156e-05, + "loss": 0.5106, + "step": 2880 + }, + { + "epoch": 0.49244687206622856, + "grad_norm": 0.43211549845106334, + "learning_rate": 4.643715225088518e-05, + "loss": 0.4844, + "step": 2885 + }, + { + "epoch": 0.49330033284970554, + "grad_norm": 0.5784010762486059, + "learning_rate": 4.64213454729388e-05, + "loss": 0.4564, + "step": 2890 + }, + { + "epoch": 0.49415379363318257, + "grad_norm": 0.5342212683412771, + "learning_rate": 4.6405538694992416e-05, + "loss": 0.4614, + "step": 2895 + }, + { + "epoch": 0.49500725441665955, + "grad_norm": 0.6805185680905738, + "learning_rate": 4.638973191704603e-05, + "loss": 0.47, + "step": 2900 + }, + { + "epoch": 0.4958607152001366, + "grad_norm": 0.5574973662984423, + "learning_rate": 4.637392513909965e-05, + "loss": 0.515, + "step": 2905 + }, + { + "epoch": 0.49671417598361356, + "grad_norm": 0.7435686355881591, + "learning_rate": 4.635811836115326e-05, + "loss": 0.4926, + "step": 2910 + }, + { + "epoch": 0.49756763676709054, + "grad_norm": 0.6874015511986851, + "learning_rate": 4.634231158320688e-05, + "loss": 0.4809, + "step": 2915 + }, + { + "epoch": 0.49842109755056757, + "grad_norm": 0.5383764757447236, + "learning_rate": 4.6326504805260496e-05, + "loss": 0.4861, + "step": 2920 + }, + { + "epoch": 0.49927455833404455, + "grad_norm": 0.4665152035108908, + "learning_rate": 4.6310698027314114e-05, + "loss": 0.4847, + "step": 2925 + }, + { + "epoch": 0.5001280191175216, + "grad_norm": 0.5836806834915259, + "learning_rate": 4.629489124936773e-05, + "loss": 0.4998, + "step": 2930 + }, + { + "epoch": 0.5009814799009985, + "grad_norm": 0.4361648214216843, + "learning_rate": 4.627908447142135e-05, + "loss": 0.4808, + "step": 2935 + }, + { + "epoch": 0.5018349406844755, + "grad_norm": 0.49583680476975295, + "learning_rate": 4.6263277693474966e-05, + "loss": 0.4868, + "step": 2940 + }, + { + "epoch": 0.5026884014679526, + "grad_norm": 0.4483956246468155, + "learning_rate": 4.624747091552858e-05, + "loss": 0.5048, + "step": 2945 + }, + { + "epoch": 0.5035418622514295, + "grad_norm": 0.5294571252245484, + "learning_rate": 4.62316641375822e-05, + "loss": 0.5058, + "step": 2950 + }, + { + "epoch": 0.5043953230349065, + "grad_norm": 0.5252090443214954, + "learning_rate": 4.621585735963581e-05, + "loss": 0.5169, + "step": 2955 + }, + { + "epoch": 0.5052487838183836, + "grad_norm": 0.46760767827848043, + "learning_rate": 4.620005058168943e-05, + "loss": 0.4562, + "step": 2960 + }, + { + "epoch": 0.5061022446018606, + "grad_norm": 0.8415597340930349, + "learning_rate": 4.6184243803743047e-05, + "loss": 0.4544, + "step": 2965 + }, + { + "epoch": 0.5069557053853375, + "grad_norm": 0.6247587825872346, + "learning_rate": 4.6168437025796664e-05, + "loss": 0.4783, + "step": 2970 + }, + { + "epoch": 0.5078091661688146, + "grad_norm": 0.43871676248593794, + "learning_rate": 4.615263024785028e-05, + "loss": 0.5082, + "step": 2975 + }, + { + "epoch": 0.5086626269522916, + "grad_norm": 0.5148243512845597, + "learning_rate": 4.613682346990389e-05, + "loss": 0.5239, + "step": 2980 + }, + { + "epoch": 0.5095160877357685, + "grad_norm": 0.5385515871310813, + "learning_rate": 4.6121016691957516e-05, + "loss": 0.516, + "step": 2985 + }, + { + "epoch": 0.5103695485192455, + "grad_norm": 0.5138201657544148, + "learning_rate": 4.6105209914011134e-05, + "loss": 0.472, + "step": 2990 + }, + { + "epoch": 0.5112230093027226, + "grad_norm": 0.4716007255261149, + "learning_rate": 4.6089403136064745e-05, + "loss": 0.5122, + "step": 2995 + }, + { + "epoch": 0.5120764700861995, + "grad_norm": 0.5368887676130082, + "learning_rate": 4.607359635811836e-05, + "loss": 0.5264, + "step": 3000 + }, + { + "epoch": 0.5129299308696765, + "grad_norm": 0.43583049564141374, + "learning_rate": 4.605778958017198e-05, + "loss": 0.4887, + "step": 3005 + }, + { + "epoch": 0.5137833916531536, + "grad_norm": 0.5455177937923521, + "learning_rate": 4.60419828022256e-05, + "loss": 0.4742, + "step": 3010 + }, + { + "epoch": 0.5146368524366305, + "grad_norm": 0.44161273126802225, + "learning_rate": 4.602617602427921e-05, + "loss": 0.4569, + "step": 3015 + }, + { + "epoch": 0.5154903132201075, + "grad_norm": 0.5588901866324794, + "learning_rate": 4.601036924633283e-05, + "loss": 0.4948, + "step": 3020 + }, + { + "epoch": 0.5163437740035846, + "grad_norm": 0.43294336422490237, + "learning_rate": 4.599456246838645e-05, + "loss": 0.4824, + "step": 3025 + }, + { + "epoch": 0.5171972347870616, + "grad_norm": 0.4262908125239168, + "learning_rate": 4.597875569044006e-05, + "loss": 0.4674, + "step": 3030 + }, + { + "epoch": 0.5180506955705385, + "grad_norm": 0.4134934926125551, + "learning_rate": 4.596294891249368e-05, + "loss": 0.478, + "step": 3035 + }, + { + "epoch": 0.5189041563540155, + "grad_norm": 0.5146435082463794, + "learning_rate": 4.5947142134547295e-05, + "loss": 0.48, + "step": 3040 + }, + { + "epoch": 0.5197576171374926, + "grad_norm": 0.5275388619042292, + "learning_rate": 4.593133535660091e-05, + "loss": 0.5283, + "step": 3045 + }, + { + "epoch": 0.5206110779209695, + "grad_norm": 0.41285683367409964, + "learning_rate": 4.591552857865453e-05, + "loss": 0.4905, + "step": 3050 + }, + { + "epoch": 0.5214645387044465, + "grad_norm": 0.4414766971805224, + "learning_rate": 4.589972180070815e-05, + "loss": 0.482, + "step": 3055 + }, + { + "epoch": 0.5223179994879236, + "grad_norm": 0.5560472533327718, + "learning_rate": 4.5883915022761765e-05, + "loss": 0.4754, + "step": 3060 + }, + { + "epoch": 0.5231714602714005, + "grad_norm": 0.47111606947459106, + "learning_rate": 4.5868108244815375e-05, + "loss": 0.4926, + "step": 3065 + }, + { + "epoch": 0.5240249210548775, + "grad_norm": 0.433105229367798, + "learning_rate": 4.5852301466869e-05, + "loss": 0.4725, + "step": 3070 + }, + { + "epoch": 0.5248783818383546, + "grad_norm": 0.4628901557642212, + "learning_rate": 4.583649468892261e-05, + "loss": 0.4785, + "step": 3075 + }, + { + "epoch": 0.5257318426218315, + "grad_norm": 0.6049145619127523, + "learning_rate": 4.582068791097623e-05, + "loss": 0.4915, + "step": 3080 + }, + { + "epoch": 0.5265853034053085, + "grad_norm": 0.6860256332793251, + "learning_rate": 4.5804881133029845e-05, + "loss": 0.5093, + "step": 3085 + }, + { + "epoch": 0.5274387641887855, + "grad_norm": 0.4977844134723854, + "learning_rate": 4.578907435508346e-05, + "loss": 0.4954, + "step": 3090 + }, + { + "epoch": 0.5282922249722625, + "grad_norm": 0.5739524858960392, + "learning_rate": 4.577326757713708e-05, + "loss": 0.4651, + "step": 3095 + }, + { + "epoch": 0.5291456857557395, + "grad_norm": 0.5123685569383267, + "learning_rate": 4.575746079919069e-05, + "loss": 0.5025, + "step": 3100 + }, + { + "epoch": 0.5299991465392165, + "grad_norm": 0.4796521979465388, + "learning_rate": 4.5741654021244315e-05, + "loss": 0.5068, + "step": 3105 + }, + { + "epoch": 0.5308526073226936, + "grad_norm": 0.47308033426789425, + "learning_rate": 4.5725847243297926e-05, + "loss": 0.4909, + "step": 3110 + }, + { + "epoch": 0.5317060681061705, + "grad_norm": 0.38443216822110887, + "learning_rate": 4.571004046535154e-05, + "loss": 0.4683, + "step": 3115 + }, + { + "epoch": 0.5325595288896475, + "grad_norm": 0.5224091434335496, + "learning_rate": 4.569423368740516e-05, + "loss": 0.4884, + "step": 3120 + }, + { + "epoch": 0.5334129896731246, + "grad_norm": 0.5429417668695532, + "learning_rate": 4.567842690945878e-05, + "loss": 0.4857, + "step": 3125 + }, + { + "epoch": 0.5342664504566015, + "grad_norm": 0.41612083241826625, + "learning_rate": 4.5662620131512396e-05, + "loss": 0.4998, + "step": 3130 + }, + { + "epoch": 0.5351199112400785, + "grad_norm": 0.5384787879906221, + "learning_rate": 4.5646813353566006e-05, + "loss": 0.5083, + "step": 3135 + }, + { + "epoch": 0.5359733720235555, + "grad_norm": 0.5734922772320576, + "learning_rate": 4.563100657561963e-05, + "loss": 0.451, + "step": 3140 + }, + { + "epoch": 0.5368268328070325, + "grad_norm": 0.4541825803148031, + "learning_rate": 4.561519979767325e-05, + "loss": 0.493, + "step": 3145 + }, + { + "epoch": 0.5376802935905095, + "grad_norm": 0.4994486724205086, + "learning_rate": 4.559939301972686e-05, + "loss": 0.4685, + "step": 3150 + }, + { + "epoch": 0.5385337543739865, + "grad_norm": 0.47708325681611224, + "learning_rate": 4.5583586241780476e-05, + "loss": 0.4609, + "step": 3155 + }, + { + "epoch": 0.5393872151574635, + "grad_norm": 0.46817706499113726, + "learning_rate": 4.5567779463834094e-05, + "loss": 0.4758, + "step": 3160 + }, + { + "epoch": 0.5402406759409405, + "grad_norm": 0.4902818781364108, + "learning_rate": 4.555197268588771e-05, + "loss": 0.5077, + "step": 3165 + }, + { + "epoch": 0.5410941367244175, + "grad_norm": 0.45922084807176566, + "learning_rate": 4.553616590794132e-05, + "loss": 0.5119, + "step": 3170 + }, + { + "epoch": 0.5419475975078946, + "grad_norm": 0.6712454332090487, + "learning_rate": 4.5520359129994946e-05, + "loss": 0.4795, + "step": 3175 + }, + { + "epoch": 0.5428010582913715, + "grad_norm": 0.43605119685450705, + "learning_rate": 4.5504552352048563e-05, + "loss": 0.4677, + "step": 3180 + }, + { + "epoch": 0.5436545190748485, + "grad_norm": 0.4925537431519938, + "learning_rate": 4.5488745574102174e-05, + "loss": 0.4914, + "step": 3185 + }, + { + "epoch": 0.5445079798583256, + "grad_norm": 0.531654901487706, + "learning_rate": 4.54729387961558e-05, + "loss": 0.4965, + "step": 3190 + }, + { + "epoch": 0.5453614406418025, + "grad_norm": 0.542627550461915, + "learning_rate": 4.545713201820941e-05, + "loss": 0.505, + "step": 3195 + }, + { + "epoch": 0.5462149014252795, + "grad_norm": 0.5005989461431033, + "learning_rate": 4.5441325240263026e-05, + "loss": 0.4676, + "step": 3200 + }, + { + "epoch": 0.5470683622087565, + "grad_norm": 0.8356311379255125, + "learning_rate": 4.5425518462316644e-05, + "loss": 0.4878, + "step": 3205 + }, + { + "epoch": 0.5479218229922335, + "grad_norm": 0.6159083414874345, + "learning_rate": 4.540971168437026e-05, + "loss": 0.5381, + "step": 3210 + }, + { + "epoch": 0.5487752837757105, + "grad_norm": 0.6463799250831206, + "learning_rate": 4.539390490642388e-05, + "loss": 0.5048, + "step": 3215 + }, + { + "epoch": 0.5496287445591875, + "grad_norm": 0.514052351649874, + "learning_rate": 4.537809812847749e-05, + "loss": 0.4608, + "step": 3220 + }, + { + "epoch": 0.5504822053426645, + "grad_norm": 0.43715311139472457, + "learning_rate": 4.5362291350531114e-05, + "loss": 0.5101, + "step": 3225 + }, + { + "epoch": 0.5513356661261415, + "grad_norm": 0.6500744851584536, + "learning_rate": 4.5346484572584724e-05, + "loss": 0.488, + "step": 3230 + }, + { + "epoch": 0.5521891269096185, + "grad_norm": 0.495178494171075, + "learning_rate": 4.533067779463834e-05, + "loss": 0.4887, + "step": 3235 + }, + { + "epoch": 0.5530425876930956, + "grad_norm": 0.798100029506453, + "learning_rate": 4.531487101669196e-05, + "loss": 0.491, + "step": 3240 + }, + { + "epoch": 0.5538960484765725, + "grad_norm": 0.9902280186281872, + "learning_rate": 4.529906423874558e-05, + "loss": 0.4627, + "step": 3245 + }, + { + "epoch": 0.5547495092600495, + "grad_norm": 1.7680027677139394, + "learning_rate": 4.5283257460799194e-05, + "loss": 0.4694, + "step": 3250 + }, + { + "epoch": 0.5556029700435265, + "grad_norm": 0.6487867409422882, + "learning_rate": 4.5267450682852805e-05, + "loss": 0.5086, + "step": 3255 + }, + { + "epoch": 0.5564564308270035, + "grad_norm": 0.4778060701450197, + "learning_rate": 4.525164390490643e-05, + "loss": 0.4672, + "step": 3260 + }, + { + "epoch": 0.5573098916104805, + "grad_norm": 0.6933757779273402, + "learning_rate": 4.523583712696004e-05, + "loss": 0.4947, + "step": 3265 + }, + { + "epoch": 0.5581633523939575, + "grad_norm": 0.8292966610796421, + "learning_rate": 4.522003034901366e-05, + "loss": 0.4817, + "step": 3270 + }, + { + "epoch": 0.5590168131774345, + "grad_norm": 0.5753336708545032, + "learning_rate": 4.5204223571067275e-05, + "loss": 0.4603, + "step": 3275 + }, + { + "epoch": 0.5598702739609115, + "grad_norm": 0.7087010804750883, + "learning_rate": 4.518841679312089e-05, + "loss": 0.4935, + "step": 3280 + }, + { + "epoch": 0.5607237347443885, + "grad_norm": 0.4609635531108082, + "learning_rate": 4.517261001517451e-05, + "loss": 0.4767, + "step": 3285 + }, + { + "epoch": 0.5615771955278654, + "grad_norm": 1.0131378296218625, + "learning_rate": 4.515680323722812e-05, + "loss": 0.4919, + "step": 3290 + }, + { + "epoch": 0.5624306563113425, + "grad_norm": 0.5286009061837952, + "learning_rate": 4.5140996459281745e-05, + "loss": 0.4629, + "step": 3295 + }, + { + "epoch": 0.5632841170948195, + "grad_norm": 0.5233857128596109, + "learning_rate": 4.512518968133536e-05, + "loss": 0.4787, + "step": 3300 + }, + { + "epoch": 0.5641375778782965, + "grad_norm": 0.8557917738316622, + "learning_rate": 4.510938290338897e-05, + "loss": 0.4996, + "step": 3305 + }, + { + "epoch": 0.5649910386617735, + "grad_norm": 0.4526387213257986, + "learning_rate": 4.50935761254426e-05, + "loss": 0.504, + "step": 3310 + }, + { + "epoch": 0.5658444994452505, + "grad_norm": 2.9307163982033657, + "learning_rate": 4.507776934749621e-05, + "loss": 0.4833, + "step": 3315 + }, + { + "epoch": 0.5666979602287275, + "grad_norm": 0.5201434642577162, + "learning_rate": 4.5061962569549825e-05, + "loss": 0.4934, + "step": 3320 + }, + { + "epoch": 0.5675514210122045, + "grad_norm": 0.49175135020457955, + "learning_rate": 4.504615579160344e-05, + "loss": 0.5206, + "step": 3325 + }, + { + "epoch": 0.5684048817956815, + "grad_norm": 0.3878627218856043, + "learning_rate": 4.503034901365706e-05, + "loss": 0.463, + "step": 3330 + }, + { + "epoch": 0.5692583425791585, + "grad_norm": 0.38549872056761764, + "learning_rate": 4.501454223571068e-05, + "loss": 0.4836, + "step": 3335 + }, + { + "epoch": 0.5701118033626354, + "grad_norm": 0.5321198076144228, + "learning_rate": 4.499873545776429e-05, + "loss": 0.4712, + "step": 3340 + }, + { + "epoch": 0.5709652641461125, + "grad_norm": 0.4262011189522238, + "learning_rate": 4.498292867981791e-05, + "loss": 0.4926, + "step": 3345 + }, + { + "epoch": 0.5718187249295895, + "grad_norm": 0.46600041458591407, + "learning_rate": 4.496712190187152e-05, + "loss": 0.4785, + "step": 3350 + }, + { + "epoch": 0.5726721857130664, + "grad_norm": 0.6200806088331268, + "learning_rate": 4.495131512392514e-05, + "loss": 0.4992, + "step": 3355 + }, + { + "epoch": 0.5735256464965435, + "grad_norm": 0.9996035433021478, + "learning_rate": 4.493550834597876e-05, + "loss": 0.4715, + "step": 3360 + }, + { + "epoch": 0.5743791072800205, + "grad_norm": 0.49082524222147683, + "learning_rate": 4.4919701568032375e-05, + "loss": 0.4718, + "step": 3365 + }, + { + "epoch": 0.5752325680634974, + "grad_norm": 0.8837435528802063, + "learning_rate": 4.490389479008599e-05, + "loss": 0.4734, + "step": 3370 + }, + { + "epoch": 0.5760860288469745, + "grad_norm": 1.7445854952652542, + "learning_rate": 4.4888088012139604e-05, + "loss": 0.4883, + "step": 3375 + }, + { + "epoch": 0.5769394896304515, + "grad_norm": 0.6301167609521606, + "learning_rate": 4.487228123419323e-05, + "loss": 0.4933, + "step": 3380 + }, + { + "epoch": 0.5777929504139285, + "grad_norm": 0.3939525490223246, + "learning_rate": 4.485647445624684e-05, + "loss": 0.4684, + "step": 3385 + }, + { + "epoch": 0.5786464111974055, + "grad_norm": 0.46250171089281755, + "learning_rate": 4.4840667678300456e-05, + "loss": 0.4897, + "step": 3390 + }, + { + "epoch": 0.5794998719808825, + "grad_norm": 0.45920802219487256, + "learning_rate": 4.4824860900354073e-05, + "loss": 0.5029, + "step": 3395 + }, + { + "epoch": 0.5803533327643595, + "grad_norm": 0.4819306175315199, + "learning_rate": 4.480905412240769e-05, + "loss": 0.4412, + "step": 3400 + }, + { + "epoch": 0.5812067935478364, + "grad_norm": 0.4256152645527321, + "learning_rate": 4.479324734446131e-05, + "loss": 0.444, + "step": 3405 + }, + { + "epoch": 0.5820602543313135, + "grad_norm": 0.42620197572305174, + "learning_rate": 4.477744056651492e-05, + "loss": 0.4685, + "step": 3410 + }, + { + "epoch": 0.5829137151147905, + "grad_norm": 0.5550364644460304, + "learning_rate": 4.476163378856854e-05, + "loss": 0.4907, + "step": 3415 + }, + { + "epoch": 0.5837671758982674, + "grad_norm": 0.45984600307346984, + "learning_rate": 4.4745827010622154e-05, + "loss": 0.4982, + "step": 3420 + }, + { + "epoch": 0.5846206366817445, + "grad_norm": 0.689863334490102, + "learning_rate": 4.473002023267577e-05, + "loss": 0.4893, + "step": 3425 + }, + { + "epoch": 0.5854740974652215, + "grad_norm": 0.5275670181218156, + "learning_rate": 4.4714213454729396e-05, + "loss": 0.5077, + "step": 3430 + }, + { + "epoch": 0.5863275582486984, + "grad_norm": 0.5796184441646595, + "learning_rate": 4.4698406676783006e-05, + "loss": 0.4639, + "step": 3435 + }, + { + "epoch": 0.5871810190321755, + "grad_norm": 0.45709870220036597, + "learning_rate": 4.4682599898836624e-05, + "loss": 0.4673, + "step": 3440 + }, + { + "epoch": 0.5880344798156525, + "grad_norm": 0.4538463488322591, + "learning_rate": 4.466679312089024e-05, + "loss": 0.4873, + "step": 3445 + }, + { + "epoch": 0.5888879405991295, + "grad_norm": 0.44323971254558264, + "learning_rate": 4.465098634294386e-05, + "loss": 0.4845, + "step": 3450 + }, + { + "epoch": 0.5897414013826064, + "grad_norm": 0.4801465832311005, + "learning_rate": 4.4635179564997476e-05, + "loss": 0.4614, + "step": 3455 + }, + { + "epoch": 0.5905948621660835, + "grad_norm": 0.40555326089246163, + "learning_rate": 4.461937278705109e-05, + "loss": 0.4798, + "step": 3460 + }, + { + "epoch": 0.5914483229495605, + "grad_norm": 0.5648457333532083, + "learning_rate": 4.460356600910471e-05, + "loss": 0.4583, + "step": 3465 + }, + { + "epoch": 0.5923017837330374, + "grad_norm": 0.4626371797805737, + "learning_rate": 4.458775923115832e-05, + "loss": 0.4556, + "step": 3470 + }, + { + "epoch": 0.5931552445165145, + "grad_norm": 0.4645009958255317, + "learning_rate": 4.457195245321194e-05, + "loss": 0.4429, + "step": 3475 + }, + { + "epoch": 0.5940087052999915, + "grad_norm": 0.5553987660550683, + "learning_rate": 4.455614567526556e-05, + "loss": 0.5151, + "step": 3480 + }, + { + "epoch": 0.5948621660834684, + "grad_norm": 0.526142044453019, + "learning_rate": 4.4540338897319174e-05, + "loss": 0.4707, + "step": 3485 + }, + { + "epoch": 0.5957156268669455, + "grad_norm": 0.5645287259798881, + "learning_rate": 4.452453211937279e-05, + "loss": 0.5235, + "step": 3490 + }, + { + "epoch": 0.5965690876504225, + "grad_norm": 0.46749936432388794, + "learning_rate": 4.45087253414264e-05, + "loss": 0.4797, + "step": 3495 + }, + { + "epoch": 0.5974225484338994, + "grad_norm": 0.7045403748184238, + "learning_rate": 4.4492918563480026e-05, + "loss": 0.4866, + "step": 3500 + }, + { + "epoch": 0.5982760092173764, + "grad_norm": 0.4342980770279321, + "learning_rate": 4.447711178553364e-05, + "loss": 0.4775, + "step": 3505 + }, + { + "epoch": 0.5991294700008535, + "grad_norm": 0.4806934305956506, + "learning_rate": 4.4461305007587255e-05, + "loss": 0.4686, + "step": 3510 + }, + { + "epoch": 0.5999829307843305, + "grad_norm": 63.95029639895003, + "learning_rate": 4.444549822964087e-05, + "loss": 0.4532, + "step": 3515 + }, + { + "epoch": 0.6008363915678074, + "grad_norm": 0.500224009602108, + "learning_rate": 4.442969145169449e-05, + "loss": 0.4426, + "step": 3520 + }, + { + "epoch": 0.6016898523512845, + "grad_norm": 0.50512012301229, + "learning_rate": 4.441388467374811e-05, + "loss": 0.4478, + "step": 3525 + }, + { + "epoch": 0.6025433131347615, + "grad_norm": 0.484096459835353, + "learning_rate": 4.439807789580172e-05, + "loss": 0.452, + "step": 3530 + }, + { + "epoch": 0.6033967739182384, + "grad_norm": 0.4688040856385147, + "learning_rate": 4.438227111785534e-05, + "loss": 0.5168, + "step": 3535 + }, + { + "epoch": 0.6042502347017155, + "grad_norm": 0.6679582478264732, + "learning_rate": 4.436646433990895e-05, + "loss": 0.4672, + "step": 3540 + }, + { + "epoch": 0.6051036954851925, + "grad_norm": 0.47770775490769585, + "learning_rate": 4.435065756196257e-05, + "loss": 0.5019, + "step": 3545 + }, + { + "epoch": 0.6059571562686694, + "grad_norm": 0.3880046321693871, + "learning_rate": 4.433485078401619e-05, + "loss": 0.4797, + "step": 3550 + }, + { + "epoch": 0.6068106170521465, + "grad_norm": 0.5414538314833576, + "learning_rate": 4.4319044006069805e-05, + "loss": 0.4706, + "step": 3555 + }, + { + "epoch": 0.6076640778356235, + "grad_norm": 0.4809579676224301, + "learning_rate": 4.430323722812342e-05, + "loss": 0.4838, + "step": 3560 + }, + { + "epoch": 0.6085175386191004, + "grad_norm": 0.48144765077507257, + "learning_rate": 4.428743045017704e-05, + "loss": 0.5095, + "step": 3565 + }, + { + "epoch": 0.6093709994025774, + "grad_norm": 0.5138696069532919, + "learning_rate": 4.427162367223066e-05, + "loss": 0.4944, + "step": 3570 + }, + { + "epoch": 0.6102244601860545, + "grad_norm": 0.45587797333607455, + "learning_rate": 4.425581689428427e-05, + "loss": 0.4901, + "step": 3575 + }, + { + "epoch": 0.6110779209695314, + "grad_norm": 0.48425921551782086, + "learning_rate": 4.4240010116337885e-05, + "loss": 0.4403, + "step": 3580 + }, + { + "epoch": 0.6119313817530084, + "grad_norm": 0.43899108915854307, + "learning_rate": 4.422420333839151e-05, + "loss": 0.4636, + "step": 3585 + }, + { + "epoch": 0.6127848425364855, + "grad_norm": 0.5141934864853146, + "learning_rate": 4.420839656044512e-05, + "loss": 0.4854, + "step": 3590 + }, + { + "epoch": 0.6136383033199625, + "grad_norm": 0.5051172565590736, + "learning_rate": 4.419258978249874e-05, + "loss": 0.4811, + "step": 3595 + }, + { + "epoch": 0.6144917641034394, + "grad_norm": 0.43738734095879495, + "learning_rate": 4.4176783004552355e-05, + "loss": 0.5052, + "step": 3600 + }, + { + "epoch": 0.6153452248869165, + "grad_norm": 0.39331118925627184, + "learning_rate": 4.416097622660597e-05, + "loss": 0.4798, + "step": 3605 + }, + { + "epoch": 0.6161986856703935, + "grad_norm": 0.4133684384057315, + "learning_rate": 4.4145169448659583e-05, + "loss": 0.4894, + "step": 3610 + }, + { + "epoch": 0.6170521464538704, + "grad_norm": 0.45545934534082405, + "learning_rate": 4.41293626707132e-05, + "loss": 0.5195, + "step": 3615 + }, + { + "epoch": 0.6179056072373474, + "grad_norm": 0.7681187073171971, + "learning_rate": 4.4113555892766825e-05, + "loss": 0.4689, + "step": 3620 + }, + { + "epoch": 0.6187590680208245, + "grad_norm": 0.4325591179904035, + "learning_rate": 4.4097749114820436e-05, + "loss": 0.523, + "step": 3625 + }, + { + "epoch": 0.6196125288043014, + "grad_norm": 0.5070197418845998, + "learning_rate": 4.408194233687405e-05, + "loss": 0.4583, + "step": 3630 + }, + { + "epoch": 0.6204659895877784, + "grad_norm": 0.45243291995651214, + "learning_rate": 4.406613555892767e-05, + "loss": 0.4922, + "step": 3635 + }, + { + "epoch": 0.6213194503712555, + "grad_norm": 0.449732217059902, + "learning_rate": 4.405032878098129e-05, + "loss": 0.4699, + "step": 3640 + }, + { + "epoch": 0.6221729111547324, + "grad_norm": 0.7246861327311569, + "learning_rate": 4.4034522003034906e-05, + "loss": 0.4998, + "step": 3645 + }, + { + "epoch": 0.6230263719382094, + "grad_norm": 0.4280068084173374, + "learning_rate": 4.4018715225088516e-05, + "loss": 0.4978, + "step": 3650 + }, + { + "epoch": 0.6238798327216865, + "grad_norm": 0.4014939390871873, + "learning_rate": 4.400290844714214e-05, + "loss": 0.474, + "step": 3655 + }, + { + "epoch": 0.6247332935051635, + "grad_norm": 0.4497317854323919, + "learning_rate": 4.398710166919575e-05, + "loss": 0.4929, + "step": 3660 + }, + { + "epoch": 0.6255867542886404, + "grad_norm": 0.36552726366718447, + "learning_rate": 4.397129489124937e-05, + "loss": 0.4726, + "step": 3665 + }, + { + "epoch": 0.6264402150721174, + "grad_norm": 0.4328997047135238, + "learning_rate": 4.3955488113302986e-05, + "loss": 0.5043, + "step": 3670 + }, + { + "epoch": 0.6272936758555945, + "grad_norm": 0.7066126750337065, + "learning_rate": 4.3939681335356604e-05, + "loss": 0.4773, + "step": 3675 + }, + { + "epoch": 0.6281471366390714, + "grad_norm": 0.47512560430075906, + "learning_rate": 4.392387455741022e-05, + "loss": 0.4426, + "step": 3680 + }, + { + "epoch": 0.6290005974225484, + "grad_norm": 0.42252564493650935, + "learning_rate": 4.390806777946383e-05, + "loss": 0.452, + "step": 3685 + }, + { + "epoch": 0.6298540582060255, + "grad_norm": 0.43013539904554504, + "learning_rate": 4.3892261001517456e-05, + "loss": 0.4888, + "step": 3690 + }, + { + "epoch": 0.6307075189895024, + "grad_norm": 0.5071874143036144, + "learning_rate": 4.387645422357107e-05, + "loss": 0.479, + "step": 3695 + }, + { + "epoch": 0.6315609797729794, + "grad_norm": 0.5470278547942871, + "learning_rate": 4.3860647445624684e-05, + "loss": 0.478, + "step": 3700 + }, + { + "epoch": 0.6324144405564565, + "grad_norm": 0.4167734022580235, + "learning_rate": 4.384484066767831e-05, + "loss": 0.4917, + "step": 3705 + }, + { + "epoch": 0.6332679013399334, + "grad_norm": 0.4890306405293852, + "learning_rate": 4.382903388973192e-05, + "loss": 0.4493, + "step": 3710 + }, + { + "epoch": 0.6341213621234104, + "grad_norm": 0.5470391911698399, + "learning_rate": 4.3813227111785537e-05, + "loss": 0.476, + "step": 3715 + }, + { + "epoch": 0.6349748229068874, + "grad_norm": 0.47745469088807063, + "learning_rate": 4.3797420333839154e-05, + "loss": 0.4793, + "step": 3720 + }, + { + "epoch": 0.6358282836903645, + "grad_norm": 0.5113844420452976, + "learning_rate": 4.378161355589277e-05, + "loss": 0.4656, + "step": 3725 + }, + { + "epoch": 0.6366817444738414, + "grad_norm": 0.4458910221298414, + "learning_rate": 4.376580677794638e-05, + "loss": 0.4776, + "step": 3730 + }, + { + "epoch": 0.6375352052573184, + "grad_norm": 0.4818584623158423, + "learning_rate": 4.375e-05, + "loss": 0.4918, + "step": 3735 + }, + { + "epoch": 0.6383886660407955, + "grad_norm": 0.4492848878575235, + "learning_rate": 4.3734193222053624e-05, + "loss": 0.508, + "step": 3740 + }, + { + "epoch": 0.6392421268242724, + "grad_norm": 0.45574504864146553, + "learning_rate": 4.3718386444107234e-05, + "loss": 0.4764, + "step": 3745 + }, + { + "epoch": 0.6400955876077494, + "grad_norm": 0.42562056908018, + "learning_rate": 4.370257966616085e-05, + "loss": 0.5007, + "step": 3750 + }, + { + "epoch": 0.6409490483912265, + "grad_norm": 0.708859438990614, + "learning_rate": 4.368677288821447e-05, + "loss": 0.4744, + "step": 3755 + }, + { + "epoch": 0.6418025091747034, + "grad_norm": 0.5326165948781, + "learning_rate": 4.367096611026809e-05, + "loss": 0.5026, + "step": 3760 + }, + { + "epoch": 0.6426559699581804, + "grad_norm": 0.39341698760922, + "learning_rate": 4.36551593323217e-05, + "loss": 0.4491, + "step": 3765 + }, + { + "epoch": 0.6435094307416575, + "grad_norm": 0.40912745072852724, + "learning_rate": 4.3639352554375315e-05, + "loss": 0.4642, + "step": 3770 + }, + { + "epoch": 0.6443628915251344, + "grad_norm": 0.4144693996623863, + "learning_rate": 4.362354577642894e-05, + "loss": 0.4859, + "step": 3775 + }, + { + "epoch": 0.6452163523086114, + "grad_norm": 0.4881292009372189, + "learning_rate": 4.360773899848255e-05, + "loss": 0.4984, + "step": 3780 + }, + { + "epoch": 0.6460698130920884, + "grad_norm": 0.45053933392519724, + "learning_rate": 4.359193222053617e-05, + "loss": 0.4768, + "step": 3785 + }, + { + "epoch": 0.6469232738755654, + "grad_norm": 0.47325450782615, + "learning_rate": 4.3576125442589785e-05, + "loss": 0.4493, + "step": 3790 + }, + { + "epoch": 0.6477767346590424, + "grad_norm": 0.48811233595627407, + "learning_rate": 4.35603186646434e-05, + "loss": 0.4935, + "step": 3795 + }, + { + "epoch": 0.6486301954425194, + "grad_norm": 0.421010723444544, + "learning_rate": 4.354451188669702e-05, + "loss": 0.4689, + "step": 3800 + }, + { + "epoch": 0.6494836562259965, + "grad_norm": 0.5627086061065368, + "learning_rate": 4.352870510875063e-05, + "loss": 0.4753, + "step": 3805 + }, + { + "epoch": 0.6503371170094734, + "grad_norm": 0.4463014770154561, + "learning_rate": 4.3512898330804255e-05, + "loss": 0.4884, + "step": 3810 + }, + { + "epoch": 0.6511905777929504, + "grad_norm": 0.4638012220494922, + "learning_rate": 4.3497091552857865e-05, + "loss": 0.4851, + "step": 3815 + }, + { + "epoch": 0.6520440385764275, + "grad_norm": 0.7544906461658837, + "learning_rate": 4.348128477491148e-05, + "loss": 0.4444, + "step": 3820 + }, + { + "epoch": 0.6528974993599044, + "grad_norm": 0.44167391706768905, + "learning_rate": 4.34654779969651e-05, + "loss": 0.4799, + "step": 3825 + }, + { + "epoch": 0.6537509601433814, + "grad_norm": 0.43567757946878566, + "learning_rate": 4.344967121901872e-05, + "loss": 0.4941, + "step": 3830 + }, + { + "epoch": 0.6546044209268584, + "grad_norm": 0.504526904318, + "learning_rate": 4.3433864441072335e-05, + "loss": 0.4414, + "step": 3835 + }, + { + "epoch": 0.6554578817103354, + "grad_norm": 0.42110493335879345, + "learning_rate": 4.341805766312595e-05, + "loss": 0.4514, + "step": 3840 + }, + { + "epoch": 0.6563113424938124, + "grad_norm": 0.3854733011231964, + "learning_rate": 4.340225088517957e-05, + "loss": 0.4715, + "step": 3845 + }, + { + "epoch": 0.6571648032772894, + "grad_norm": 0.6822309966180405, + "learning_rate": 4.338644410723318e-05, + "loss": 0.4958, + "step": 3850 + }, + { + "epoch": 0.6580182640607664, + "grad_norm": 0.5186683164282136, + "learning_rate": 4.33706373292868e-05, + "loss": 0.4722, + "step": 3855 + }, + { + "epoch": 0.6588717248442434, + "grad_norm": 0.47780293082012615, + "learning_rate": 4.335483055134042e-05, + "loss": 0.5217, + "step": 3860 + }, + { + "epoch": 0.6597251856277204, + "grad_norm": 0.4764829465540424, + "learning_rate": 4.333902377339403e-05, + "loss": 0.4605, + "step": 3865 + }, + { + "epoch": 0.6605786464111975, + "grad_norm": 0.4541715815782311, + "learning_rate": 4.332321699544765e-05, + "loss": 0.4812, + "step": 3870 + }, + { + "epoch": 0.6614321071946744, + "grad_norm": 0.5603266519555172, + "learning_rate": 4.330741021750127e-05, + "loss": 0.4651, + "step": 3875 + }, + { + "epoch": 0.6622855679781514, + "grad_norm": 0.48041672062511726, + "learning_rate": 4.3291603439554885e-05, + "loss": 0.4824, + "step": 3880 + }, + { + "epoch": 0.6631390287616284, + "grad_norm": 0.47907954922802726, + "learning_rate": 4.3275796661608496e-05, + "loss": 0.4608, + "step": 3885 + }, + { + "epoch": 0.6639924895451054, + "grad_norm": 0.44609812672729476, + "learning_rate": 4.3259989883662114e-05, + "loss": 0.462, + "step": 3890 + }, + { + "epoch": 0.6648459503285824, + "grad_norm": 0.7993792487509449, + "learning_rate": 4.324418310571574e-05, + "loss": 0.4719, + "step": 3895 + }, + { + "epoch": 0.6656994111120594, + "grad_norm": 0.4311820933387167, + "learning_rate": 4.322837632776935e-05, + "loss": 0.465, + "step": 3900 + }, + { + "epoch": 0.6665528718955364, + "grad_norm": 0.4388514679700474, + "learning_rate": 4.3212569549822966e-05, + "loss": 0.4778, + "step": 3905 + }, + { + "epoch": 0.6674063326790134, + "grad_norm": 0.4575188747935784, + "learning_rate": 4.3196762771876583e-05, + "loss": 0.4956, + "step": 3910 + }, + { + "epoch": 0.6682597934624904, + "grad_norm": 0.8660842719376415, + "learning_rate": 4.31809559939302e-05, + "loss": 0.4726, + "step": 3915 + }, + { + "epoch": 0.6691132542459673, + "grad_norm": 0.5516487261948204, + "learning_rate": 4.316514921598381e-05, + "loss": 0.4762, + "step": 3920 + }, + { + "epoch": 0.6699667150294444, + "grad_norm": 0.47455394040106036, + "learning_rate": 4.314934243803743e-05, + "loss": 0.4604, + "step": 3925 + }, + { + "epoch": 0.6708201758129214, + "grad_norm": 0.5207320878622663, + "learning_rate": 4.313353566009105e-05, + "loss": 0.4804, + "step": 3930 + }, + { + "epoch": 0.6716736365963984, + "grad_norm": 0.47742075584221105, + "learning_rate": 4.3117728882144664e-05, + "loss": 0.4556, + "step": 3935 + }, + { + "epoch": 0.6725270973798754, + "grad_norm": 0.45314483542298994, + "learning_rate": 4.310192210419828e-05, + "loss": 0.4429, + "step": 3940 + }, + { + "epoch": 0.6733805581633524, + "grad_norm": 0.4381070559951893, + "learning_rate": 4.30861153262519e-05, + "loss": 0.4627, + "step": 3945 + }, + { + "epoch": 0.6742340189468294, + "grad_norm": 0.4071218465306306, + "learning_rate": 4.3070308548305516e-05, + "loss": 0.4602, + "step": 3950 + }, + { + "epoch": 0.6750874797303064, + "grad_norm": 0.6499999822102005, + "learning_rate": 4.3054501770359134e-05, + "loss": 0.4684, + "step": 3955 + }, + { + "epoch": 0.6759409405137834, + "grad_norm": 0.4846648689655542, + "learning_rate": 4.303869499241275e-05, + "loss": 0.4667, + "step": 3960 + }, + { + "epoch": 0.6767944012972604, + "grad_norm": 0.4062643508656444, + "learning_rate": 4.302288821446637e-05, + "loss": 0.471, + "step": 3965 + }, + { + "epoch": 0.6776478620807374, + "grad_norm": 0.4172505487094595, + "learning_rate": 4.300708143651998e-05, + "loss": 0.4447, + "step": 3970 + }, + { + "epoch": 0.6785013228642144, + "grad_norm": 0.517751315224531, + "learning_rate": 4.29912746585736e-05, + "loss": 0.4495, + "step": 3975 + }, + { + "epoch": 0.6793547836476914, + "grad_norm": 0.521540079831743, + "learning_rate": 4.2975467880627214e-05, + "loss": 0.4749, + "step": 3980 + }, + { + "epoch": 0.6802082444311683, + "grad_norm": 0.5457293452763281, + "learning_rate": 4.295966110268083e-05, + "loss": 0.485, + "step": 3985 + }, + { + "epoch": 0.6810617052146454, + "grad_norm": 0.4501647552450407, + "learning_rate": 4.294385432473445e-05, + "loss": 0.4869, + "step": 3990 + }, + { + "epoch": 0.6819151659981224, + "grad_norm": 0.518406725291088, + "learning_rate": 4.292804754678807e-05, + "loss": 0.4719, + "step": 3995 + }, + { + "epoch": 0.6827686267815993, + "grad_norm": 0.4457480223254278, + "learning_rate": 4.2912240768841684e-05, + "loss": 0.466, + "step": 4000 + }, + { + "epoch": 0.6836220875650764, + "grad_norm": 0.6292094801052758, + "learning_rate": 4.2896433990895295e-05, + "loss": 0.5088, + "step": 4005 + }, + { + "epoch": 0.6844755483485534, + "grad_norm": 0.5116550207377446, + "learning_rate": 4.288062721294891e-05, + "loss": 0.4723, + "step": 4010 + }, + { + "epoch": 0.6853290091320304, + "grad_norm": 0.46845065186174567, + "learning_rate": 4.286482043500253e-05, + "loss": 0.4818, + "step": 4015 + }, + { + "epoch": 0.6861824699155074, + "grad_norm": 0.7301404751357614, + "learning_rate": 4.284901365705615e-05, + "loss": 0.4917, + "step": 4020 + }, + { + "epoch": 0.6870359306989844, + "grad_norm": 0.44690134946083243, + "learning_rate": 4.2833206879109765e-05, + "loss": 0.4556, + "step": 4025 + }, + { + "epoch": 0.6878893914824614, + "grad_norm": 0.413374957143859, + "learning_rate": 4.281740010116338e-05, + "loss": 0.4586, + "step": 4030 + }, + { + "epoch": 0.6887428522659383, + "grad_norm": 0.4497745580402003, + "learning_rate": 4.2801593323217e-05, + "loss": 0.4523, + "step": 4035 + }, + { + "epoch": 0.6895963130494154, + "grad_norm": 0.5756425206553233, + "learning_rate": 4.278578654527061e-05, + "loss": 0.4698, + "step": 4040 + }, + { + "epoch": 0.6904497738328924, + "grad_norm": 0.39328406249104275, + "learning_rate": 4.276997976732423e-05, + "loss": 0.4576, + "step": 4045 + }, + { + "epoch": 0.6913032346163693, + "grad_norm": 0.5660539542345869, + "learning_rate": 4.275417298937785e-05, + "loss": 0.4913, + "step": 4050 + }, + { + "epoch": 0.6921566953998464, + "grad_norm": 0.7728511153118378, + "learning_rate": 4.273836621143146e-05, + "loss": 0.4834, + "step": 4055 + }, + { + "epoch": 0.6930101561833234, + "grad_norm": 0.5208945931014687, + "learning_rate": 4.272255943348508e-05, + "loss": 0.4686, + "step": 4060 + }, + { + "epoch": 0.6938636169668003, + "grad_norm": 0.4279123272455591, + "learning_rate": 4.27067526555387e-05, + "loss": 0.4632, + "step": 4065 + }, + { + "epoch": 0.6947170777502774, + "grad_norm": 0.43309100601295614, + "learning_rate": 4.2690945877592315e-05, + "loss": 0.4789, + "step": 4070 + }, + { + "epoch": 0.6955705385337544, + "grad_norm": 0.4800452561798458, + "learning_rate": 4.2675139099645926e-05, + "loss": 0.4556, + "step": 4075 + }, + { + "epoch": 0.6964239993172314, + "grad_norm": 0.43338273426642704, + "learning_rate": 4.265933232169955e-05, + "loss": 0.4703, + "step": 4080 + }, + { + "epoch": 0.6972774601007083, + "grad_norm": 0.5380585209184109, + "learning_rate": 4.264352554375317e-05, + "loss": 0.4503, + "step": 4085 + }, + { + "epoch": 0.6981309208841854, + "grad_norm": 0.5593367082780836, + "learning_rate": 4.262771876580678e-05, + "loss": 0.4739, + "step": 4090 + }, + { + "epoch": 0.6989843816676624, + "grad_norm": 0.4153747243000154, + "learning_rate": 4.2611911987860396e-05, + "loss": 0.5017, + "step": 4095 + }, + { + "epoch": 0.6998378424511393, + "grad_norm": 0.6526841312322189, + "learning_rate": 4.259610520991401e-05, + "loss": 0.4988, + "step": 4100 + }, + { + "epoch": 0.7006913032346164, + "grad_norm": 0.5092368692530734, + "learning_rate": 4.258029843196763e-05, + "loss": 0.4733, + "step": 4105 + }, + { + "epoch": 0.7015447640180934, + "grad_norm": 0.6272371729345999, + "learning_rate": 4.256449165402125e-05, + "loss": 0.4739, + "step": 4110 + }, + { + "epoch": 0.7023982248015703, + "grad_norm": 0.7150201654065392, + "learning_rate": 4.2548684876074865e-05, + "loss": 0.4818, + "step": 4115 + }, + { + "epoch": 0.7032516855850474, + "grad_norm": 0.6606867428972878, + "learning_rate": 4.253287809812848e-05, + "loss": 0.4625, + "step": 4120 + }, + { + "epoch": 0.7041051463685244, + "grad_norm": 2.4043603785982803, + "learning_rate": 4.2517071320182093e-05, + "loss": 0.4724, + "step": 4125 + }, + { + "epoch": 0.7049586071520013, + "grad_norm": 0.7085148116148299, + "learning_rate": 4.250126454223571e-05, + "loss": 0.498, + "step": 4130 + }, + { + "epoch": 0.7058120679354783, + "grad_norm": 0.5309349094860171, + "learning_rate": 4.248545776428933e-05, + "loss": 0.4711, + "step": 4135 + }, + { + "epoch": 0.7066655287189554, + "grad_norm": 1.4124045053824865, + "learning_rate": 4.2469650986342946e-05, + "loss": 0.4619, + "step": 4140 + }, + { + "epoch": 0.7075189895024324, + "grad_norm": 0.5323652304800572, + "learning_rate": 4.245384420839656e-05, + "loss": 0.4813, + "step": 4145 + }, + { + "epoch": 0.7083724502859093, + "grad_norm": 0.584382089693229, + "learning_rate": 4.243803743045018e-05, + "loss": 0.4609, + "step": 4150 + }, + { + "epoch": 0.7092259110693864, + "grad_norm": 0.7538382751874487, + "learning_rate": 4.24222306525038e-05, + "loss": 0.4981, + "step": 4155 + }, + { + "epoch": 0.7100793718528634, + "grad_norm": 0.6751026260244125, + "learning_rate": 4.240642387455741e-05, + "loss": 0.5001, + "step": 4160 + }, + { + "epoch": 0.7109328326363403, + "grad_norm": 0.5439814041009411, + "learning_rate": 4.2390617096611026e-05, + "loss": 0.4721, + "step": 4165 + }, + { + "epoch": 0.7117862934198174, + "grad_norm": 0.5032188165856872, + "learning_rate": 4.2374810318664644e-05, + "loss": 0.4666, + "step": 4170 + }, + { + "epoch": 0.7126397542032944, + "grad_norm": 0.6113467103393669, + "learning_rate": 4.235900354071826e-05, + "loss": 0.4541, + "step": 4175 + }, + { + "epoch": 0.7134932149867713, + "grad_norm": 0.5201854878427893, + "learning_rate": 4.234319676277188e-05, + "loss": 0.4756, + "step": 4180 + }, + { + "epoch": 0.7143466757702484, + "grad_norm": 0.4910717521923143, + "learning_rate": 4.2327389984825496e-05, + "loss": 0.4675, + "step": 4185 + }, + { + "epoch": 0.7152001365537254, + "grad_norm": 12.982698067516175, + "learning_rate": 4.2311583206879114e-05, + "loss": 0.4653, + "step": 4190 + }, + { + "epoch": 0.7160535973372023, + "grad_norm": 0.5000586078973243, + "learning_rate": 4.2295776428932724e-05, + "loss": 0.517, + "step": 4195 + }, + { + "epoch": 0.7169070581206793, + "grad_norm": 0.6077145038020854, + "learning_rate": 4.227996965098635e-05, + "loss": 0.4655, + "step": 4200 + }, + { + "epoch": 0.7177605189041564, + "grad_norm": 0.4708526675368424, + "learning_rate": 4.2264162873039966e-05, + "loss": 0.4888, + "step": 4205 + }, + { + "epoch": 0.7186139796876334, + "grad_norm": 0.43314402639331606, + "learning_rate": 4.224835609509358e-05, + "loss": 0.4418, + "step": 4210 + }, + { + "epoch": 0.7194674404711103, + "grad_norm": 0.4044626975468846, + "learning_rate": 4.2232549317147194e-05, + "loss": 0.455, + "step": 4215 + }, + { + "epoch": 0.7203209012545874, + "grad_norm": 0.533450940376835, + "learning_rate": 4.221674253920081e-05, + "loss": 0.4699, + "step": 4220 + }, + { + "epoch": 0.7211743620380644, + "grad_norm": 1.367143406475424, + "learning_rate": 4.220093576125443e-05, + "loss": 0.4934, + "step": 4225 + }, + { + "epoch": 0.7220278228215413, + "grad_norm": 0.48597158199842877, + "learning_rate": 4.218512898330804e-05, + "loss": 0.4682, + "step": 4230 + }, + { + "epoch": 0.7228812836050184, + "grad_norm": 0.7095642481694201, + "learning_rate": 4.2169322205361664e-05, + "loss": 0.4755, + "step": 4235 + }, + { + "epoch": 0.7237347443884954, + "grad_norm": 0.6921834984338361, + "learning_rate": 4.215351542741528e-05, + "loss": 0.4764, + "step": 4240 + }, + { + "epoch": 0.7245882051719723, + "grad_norm": 0.4669850856765737, + "learning_rate": 4.213770864946889e-05, + "loss": 0.4571, + "step": 4245 + }, + { + "epoch": 0.7254416659554493, + "grad_norm": 0.47156030803819626, + "learning_rate": 4.212190187152251e-05, + "loss": 0.4713, + "step": 4250 + }, + { + "epoch": 0.7262951267389264, + "grad_norm": 0.4591076554168388, + "learning_rate": 4.210609509357613e-05, + "loss": 0.4559, + "step": 4255 + }, + { + "epoch": 0.7271485875224033, + "grad_norm": 0.619852825322946, + "learning_rate": 4.2090288315629745e-05, + "loss": 0.5021, + "step": 4260 + }, + { + "epoch": 0.7280020483058803, + "grad_norm": 0.6062909723217509, + "learning_rate": 4.2074481537683355e-05, + "loss": 0.4624, + "step": 4265 + }, + { + "epoch": 0.7288555090893574, + "grad_norm": 0.4146634798044198, + "learning_rate": 4.205867475973698e-05, + "loss": 0.4807, + "step": 4270 + }, + { + "epoch": 0.7297089698728343, + "grad_norm": 0.648514590985978, + "learning_rate": 4.20428679817906e-05, + "loss": 0.4488, + "step": 4275 + }, + { + "epoch": 0.7305624306563113, + "grad_norm": 0.5969831455917731, + "learning_rate": 4.202706120384421e-05, + "loss": 0.4757, + "step": 4280 + }, + { + "epoch": 0.7314158914397884, + "grad_norm": 0.5313730082368189, + "learning_rate": 4.2011254425897825e-05, + "loss": 0.4571, + "step": 4285 + }, + { + "epoch": 0.7322693522232654, + "grad_norm": 0.4251657799991048, + "learning_rate": 4.199544764795144e-05, + "loss": 0.4786, + "step": 4290 + }, + { + "epoch": 0.7331228130067423, + "grad_norm": 0.4738411097488647, + "learning_rate": 4.197964087000506e-05, + "loss": 0.4721, + "step": 4295 + }, + { + "epoch": 0.7339762737902193, + "grad_norm": 0.6266077393685796, + "learning_rate": 4.196383409205868e-05, + "loss": 0.5001, + "step": 4300 + }, + { + "epoch": 0.7348297345736964, + "grad_norm": 0.6021787676695373, + "learning_rate": 4.1948027314112295e-05, + "loss": 0.4645, + "step": 4305 + }, + { + "epoch": 0.7356831953571733, + "grad_norm": 0.7975390508151426, + "learning_rate": 4.193222053616591e-05, + "loss": 0.4871, + "step": 4310 + }, + { + "epoch": 0.7365366561406503, + "grad_norm": 0.5021933795796126, + "learning_rate": 4.191641375821952e-05, + "loss": 0.4755, + "step": 4315 + }, + { + "epoch": 0.7373901169241274, + "grad_norm": 0.47386820243446054, + "learning_rate": 4.190060698027314e-05, + "loss": 0.4782, + "step": 4320 + }, + { + "epoch": 0.7382435777076043, + "grad_norm": 0.47399583150250074, + "learning_rate": 4.188480020232676e-05, + "loss": 0.5062, + "step": 4325 + }, + { + "epoch": 0.7390970384910813, + "grad_norm": 0.9263847986919433, + "learning_rate": 4.1868993424380375e-05, + "loss": 0.4713, + "step": 4330 + }, + { + "epoch": 0.7399504992745584, + "grad_norm": 0.38060764152791543, + "learning_rate": 4.185318664643399e-05, + "loss": 0.4931, + "step": 4335 + }, + { + "epoch": 0.7408039600580353, + "grad_norm": 0.4408892386193912, + "learning_rate": 4.183737986848761e-05, + "loss": 0.4812, + "step": 4340 + }, + { + "epoch": 0.7416574208415123, + "grad_norm": 0.5146022417429106, + "learning_rate": 4.182157309054123e-05, + "loss": 0.4908, + "step": 4345 + }, + { + "epoch": 0.7425108816249893, + "grad_norm": 0.5124824948297765, + "learning_rate": 4.180576631259484e-05, + "loss": 0.4333, + "step": 4350 + }, + { + "epoch": 0.7433643424084664, + "grad_norm": 0.43519678035197396, + "learning_rate": 4.178995953464846e-05, + "loss": 0.5134, + "step": 4355 + }, + { + "epoch": 0.7442178031919433, + "grad_norm": 0.40836843733058104, + "learning_rate": 4.177415275670208e-05, + "loss": 0.4759, + "step": 4360 + }, + { + "epoch": 0.7450712639754203, + "grad_norm": 0.6539216082100642, + "learning_rate": 4.175834597875569e-05, + "loss": 0.491, + "step": 4365 + }, + { + "epoch": 0.7459247247588974, + "grad_norm": 0.741333557246768, + "learning_rate": 4.174253920080931e-05, + "loss": 0.4521, + "step": 4370 + }, + { + "epoch": 0.7467781855423743, + "grad_norm": 0.4584230945611621, + "learning_rate": 4.1726732422862926e-05, + "loss": 0.4787, + "step": 4375 + }, + { + "epoch": 0.7476316463258513, + "grad_norm": 0.5047238639694727, + "learning_rate": 4.171092564491654e-05, + "loss": 0.4596, + "step": 4380 + }, + { + "epoch": 0.7484851071093284, + "grad_norm": 0.4388777622815176, + "learning_rate": 4.1695118866970154e-05, + "loss": 0.4342, + "step": 4385 + }, + { + "epoch": 0.7493385678928053, + "grad_norm": 0.6473129680064909, + "learning_rate": 4.167931208902378e-05, + "loss": 0.4496, + "step": 4390 + }, + { + "epoch": 0.7501920286762823, + "grad_norm": 0.4667194781780377, + "learning_rate": 4.1663505311077396e-05, + "loss": 0.4729, + "step": 4395 + }, + { + "epoch": 0.7510454894597594, + "grad_norm": 0.4798661445158895, + "learning_rate": 4.1647698533131006e-05, + "loss": 0.4781, + "step": 4400 + }, + { + "epoch": 0.7518989502432363, + "grad_norm": 0.5258313097710264, + "learning_rate": 4.1631891755184624e-05, + "loss": 0.4574, + "step": 4405 + }, + { + "epoch": 0.7527524110267133, + "grad_norm": 0.4367681199785881, + "learning_rate": 4.161608497723824e-05, + "loss": 0.4629, + "step": 4410 + }, + { + "epoch": 0.7536058718101903, + "grad_norm": 0.4058766887370437, + "learning_rate": 4.160027819929186e-05, + "loss": 0.4349, + "step": 4415 + }, + { + "epoch": 0.7544593325936674, + "grad_norm": 0.45389133719116836, + "learning_rate": 4.158447142134547e-05, + "loss": 0.4683, + "step": 4420 + }, + { + "epoch": 0.7553127933771443, + "grad_norm": 0.9384019269281727, + "learning_rate": 4.1568664643399094e-05, + "loss": 0.4725, + "step": 4425 + }, + { + "epoch": 0.7561662541606213, + "grad_norm": 0.4171621248299094, + "learning_rate": 4.155285786545271e-05, + "loss": 0.4577, + "step": 4430 + }, + { + "epoch": 0.7570197149440984, + "grad_norm": 0.4293985411718834, + "learning_rate": 4.153705108750632e-05, + "loss": 0.4583, + "step": 4435 + }, + { + "epoch": 0.7578731757275753, + "grad_norm": 0.5110467466987901, + "learning_rate": 4.152124430955994e-05, + "loss": 0.4726, + "step": 4440 + }, + { + "epoch": 0.7587266365110523, + "grad_norm": 0.36866945596984196, + "learning_rate": 4.1505437531613557e-05, + "loss": 0.4706, + "step": 4445 + }, + { + "epoch": 0.7595800972945294, + "grad_norm": 0.3968297640314202, + "learning_rate": 4.1489630753667174e-05, + "loss": 0.4678, + "step": 4450 + }, + { + "epoch": 0.7604335580780063, + "grad_norm": 0.38358786025829267, + "learning_rate": 4.147382397572079e-05, + "loss": 0.424, + "step": 4455 + }, + { + "epoch": 0.7612870188614833, + "grad_norm": 0.44851307779408023, + "learning_rate": 4.145801719777441e-05, + "loss": 0.4786, + "step": 4460 + }, + { + "epoch": 0.7621404796449603, + "grad_norm": 0.4372288670484839, + "learning_rate": 4.1442210419828026e-05, + "loss": 0.461, + "step": 4465 + }, + { + "epoch": 0.7629939404284373, + "grad_norm": 0.3884950987593795, + "learning_rate": 4.142640364188164e-05, + "loss": 0.4601, + "step": 4470 + }, + { + "epoch": 0.7638474012119143, + "grad_norm": 1.003434034563983, + "learning_rate": 4.141059686393526e-05, + "loss": 0.4751, + "step": 4475 + }, + { + "epoch": 0.7647008619953913, + "grad_norm": 0.41989132522212425, + "learning_rate": 4.139479008598887e-05, + "loss": 0.4405, + "step": 4480 + }, + { + "epoch": 0.7655543227788683, + "grad_norm": 0.4421070338541073, + "learning_rate": 4.137898330804249e-05, + "loss": 0.4627, + "step": 4485 + }, + { + "epoch": 0.7664077835623453, + "grad_norm": 0.4343925351326079, + "learning_rate": 4.136317653009611e-05, + "loss": 0.4718, + "step": 4490 + }, + { + "epoch": 0.7672612443458223, + "grad_norm": 0.36233285655270203, + "learning_rate": 4.1347369752149724e-05, + "loss": 0.4532, + "step": 4495 + }, + { + "epoch": 0.7681147051292994, + "grad_norm": 0.42515097175129407, + "learning_rate": 4.133156297420334e-05, + "loss": 0.4685, + "step": 4500 + }, + { + "epoch": 0.7689681659127763, + "grad_norm": 0.5464773560885858, + "learning_rate": 4.131575619625695e-05, + "loss": 0.4744, + "step": 4505 + }, + { + "epoch": 0.7698216266962533, + "grad_norm": 0.7900641435836122, + "learning_rate": 4.129994941831058e-05, + "loss": 0.4636, + "step": 4510 + }, + { + "epoch": 0.7706750874797303, + "grad_norm": 0.38514288708300826, + "learning_rate": 4.1284142640364194e-05, + "loss": 0.4565, + "step": 4515 + }, + { + "epoch": 0.7715285482632073, + "grad_norm": 0.513740255362734, + "learning_rate": 4.1268335862417805e-05, + "loss": 0.4498, + "step": 4520 + }, + { + "epoch": 0.7723820090466843, + "grad_norm": 0.4685507535202432, + "learning_rate": 4.125252908447142e-05, + "loss": 0.449, + "step": 4525 + }, + { + "epoch": 0.7732354698301613, + "grad_norm": 0.748890376912051, + "learning_rate": 4.123672230652504e-05, + "loss": 0.4908, + "step": 4530 + }, + { + "epoch": 0.7740889306136383, + "grad_norm": 0.5297351959814006, + "learning_rate": 4.122091552857866e-05, + "loss": 0.4748, + "step": 4535 + }, + { + "epoch": 0.7749423913971153, + "grad_norm": 0.4004090062602965, + "learning_rate": 4.120510875063227e-05, + "loss": 0.456, + "step": 4540 + }, + { + "epoch": 0.7757958521805923, + "grad_norm": 0.4603580473495154, + "learning_rate": 4.118930197268589e-05, + "loss": 0.4835, + "step": 4545 + }, + { + "epoch": 0.7766493129640692, + "grad_norm": 0.4091151289367216, + "learning_rate": 4.117349519473951e-05, + "loss": 0.4632, + "step": 4550 + }, + { + "epoch": 0.7775027737475463, + "grad_norm": 0.4336955748413149, + "learning_rate": 4.115768841679312e-05, + "loss": 0.4466, + "step": 4555 + }, + { + "epoch": 0.7783562345310233, + "grad_norm": 0.4596120126536533, + "learning_rate": 4.114188163884674e-05, + "loss": 0.4658, + "step": 4560 + }, + { + "epoch": 0.7792096953145004, + "grad_norm": 0.4537868261444298, + "learning_rate": 4.1126074860900355e-05, + "loss": 0.4484, + "step": 4565 + }, + { + "epoch": 0.7800631560979773, + "grad_norm": 0.49903144617561895, + "learning_rate": 4.111026808295397e-05, + "loss": 0.4902, + "step": 4570 + }, + { + "epoch": 0.7809166168814543, + "grad_norm": 0.4300468454390534, + "learning_rate": 4.109446130500758e-05, + "loss": 0.4664, + "step": 4575 + }, + { + "epoch": 0.7817700776649313, + "grad_norm": 0.5268366623180529, + "learning_rate": 4.107865452706121e-05, + "loss": 0.4493, + "step": 4580 + }, + { + "epoch": 0.7826235384484083, + "grad_norm": 0.46124267219055926, + "learning_rate": 4.1062847749114825e-05, + "loss": 0.4627, + "step": 4585 + }, + { + "epoch": 0.7834769992318853, + "grad_norm": 0.4226627432522082, + "learning_rate": 4.1047040971168436e-05, + "loss": 0.4368, + "step": 4590 + }, + { + "epoch": 0.7843304600153623, + "grad_norm": 0.44261675551299395, + "learning_rate": 4.103123419322206e-05, + "loss": 0.463, + "step": 4595 + }, + { + "epoch": 0.7851839207988393, + "grad_norm": 0.8991940019280324, + "learning_rate": 4.101542741527567e-05, + "loss": 0.4683, + "step": 4600 + }, + { + "epoch": 0.7860373815823163, + "grad_norm": 0.384751334196363, + "learning_rate": 4.099962063732929e-05, + "loss": 0.4768, + "step": 4605 + }, + { + "epoch": 0.7868908423657933, + "grad_norm": 0.5951659423169229, + "learning_rate": 4.0983813859382906e-05, + "loss": 0.4961, + "step": 4610 + }, + { + "epoch": 0.7877443031492702, + "grad_norm": 0.8089623040423884, + "learning_rate": 4.096800708143652e-05, + "loss": 0.4645, + "step": 4615 + }, + { + "epoch": 0.7885977639327473, + "grad_norm": 0.5242955484230468, + "learning_rate": 4.095220030349014e-05, + "loss": 0.4764, + "step": 4620 + }, + { + "epoch": 0.7894512247162243, + "grad_norm": 0.45832328760285135, + "learning_rate": 4.093639352554375e-05, + "loss": 0.4693, + "step": 4625 + }, + { + "epoch": 0.7903046854997013, + "grad_norm": 0.4500365088370673, + "learning_rate": 4.0920586747597375e-05, + "loss": 0.4801, + "step": 4630 + }, + { + "epoch": 0.7911581462831783, + "grad_norm": 0.5533871333970412, + "learning_rate": 4.0904779969650986e-05, + "loss": 0.4714, + "step": 4635 + }, + { + "epoch": 0.7920116070666553, + "grad_norm": 0.6154994613950776, + "learning_rate": 4.0888973191704604e-05, + "loss": 0.4845, + "step": 4640 + }, + { + "epoch": 0.7928650678501323, + "grad_norm": 0.46089491268483523, + "learning_rate": 4.087316641375822e-05, + "loss": 0.4745, + "step": 4645 + }, + { + "epoch": 0.7937185286336093, + "grad_norm": 0.5361431726193799, + "learning_rate": 4.085735963581184e-05, + "loss": 0.4538, + "step": 4650 + }, + { + "epoch": 0.7945719894170863, + "grad_norm": 0.5220493537026821, + "learning_rate": 4.0841552857865456e-05, + "loss": 0.4396, + "step": 4655 + }, + { + "epoch": 0.7954254502005633, + "grad_norm": 0.6361482571581994, + "learning_rate": 4.0825746079919067e-05, + "loss": 0.4727, + "step": 4660 + }, + { + "epoch": 0.7962789109840402, + "grad_norm": 0.40651694156272317, + "learning_rate": 4.080993930197269e-05, + "loss": 0.4816, + "step": 4665 + }, + { + "epoch": 0.7971323717675173, + "grad_norm": 0.6135578534897143, + "learning_rate": 4.07941325240263e-05, + "loss": 0.4748, + "step": 4670 + }, + { + "epoch": 0.7979858325509943, + "grad_norm": 0.41497484588020045, + "learning_rate": 4.077832574607992e-05, + "loss": 0.4564, + "step": 4675 + }, + { + "epoch": 0.7988392933344712, + "grad_norm": 0.3853446441725627, + "learning_rate": 4.0762518968133536e-05, + "loss": 0.4515, + "step": 4680 + }, + { + "epoch": 0.7996927541179483, + "grad_norm": 0.5824960605024038, + "learning_rate": 4.0746712190187154e-05, + "loss": 0.4524, + "step": 4685 + }, + { + "epoch": 0.8005462149014253, + "grad_norm": 0.49185525639089317, + "learning_rate": 4.073090541224077e-05, + "loss": 0.4626, + "step": 4690 + }, + { + "epoch": 0.8013996756849022, + "grad_norm": 0.612221451639026, + "learning_rate": 4.071509863429438e-05, + "loss": 0.4769, + "step": 4695 + }, + { + "epoch": 0.8022531364683793, + "grad_norm": 0.3994074480084441, + "learning_rate": 4.0699291856348006e-05, + "loss": 0.4637, + "step": 4700 + }, + { + "epoch": 0.8031065972518563, + "grad_norm": 0.7539597800025871, + "learning_rate": 4.0683485078401624e-05, + "loss": 0.4722, + "step": 4705 + }, + { + "epoch": 0.8039600580353333, + "grad_norm": 0.5070926595384238, + "learning_rate": 4.0667678300455234e-05, + "loss": 0.4914, + "step": 4710 + }, + { + "epoch": 0.8048135188188102, + "grad_norm": 0.5513153710741796, + "learning_rate": 4.065187152250886e-05, + "loss": 0.492, + "step": 4715 + }, + { + "epoch": 0.8056669796022873, + "grad_norm": 0.370937807426415, + "learning_rate": 4.063606474456247e-05, + "loss": 0.4894, + "step": 4720 + }, + { + "epoch": 0.8065204403857643, + "grad_norm": 0.6682684562843318, + "learning_rate": 4.062025796661609e-05, + "loss": 0.4733, + "step": 4725 + }, + { + "epoch": 0.8073739011692412, + "grad_norm": 0.44302192953537, + "learning_rate": 4.0604451188669704e-05, + "loss": 0.4873, + "step": 4730 + }, + { + "epoch": 0.8082273619527183, + "grad_norm": 0.5333647000913362, + "learning_rate": 4.058864441072332e-05, + "loss": 0.4736, + "step": 4735 + }, + { + "epoch": 0.8090808227361953, + "grad_norm": 0.5505767154605103, + "learning_rate": 4.057283763277694e-05, + "loss": 0.4815, + "step": 4740 + }, + { + "epoch": 0.8099342835196722, + "grad_norm": 0.4854973753670553, + "learning_rate": 4.055703085483055e-05, + "loss": 0.4616, + "step": 4745 + }, + { + "epoch": 0.8107877443031493, + "grad_norm": 0.4216977758902865, + "learning_rate": 4.0541224076884174e-05, + "loss": 0.4692, + "step": 4750 + }, + { + "epoch": 0.8116412050866263, + "grad_norm": 0.6480224440676884, + "learning_rate": 4.0525417298937785e-05, + "loss": 0.4656, + "step": 4755 + }, + { + "epoch": 0.8124946658701032, + "grad_norm": 0.5196720864934075, + "learning_rate": 4.05096105209914e-05, + "loss": 0.4724, + "step": 4760 + }, + { + "epoch": 0.8133481266535803, + "grad_norm": 0.4142240721140529, + "learning_rate": 4.049380374304502e-05, + "loss": 0.4386, + "step": 4765 + }, + { + "epoch": 0.8142015874370573, + "grad_norm": 0.5044032502799161, + "learning_rate": 4.047799696509864e-05, + "loss": 0.4647, + "step": 4770 + }, + { + "epoch": 0.8150550482205343, + "grad_norm": 1.0326614960275178, + "learning_rate": 4.0462190187152255e-05, + "loss": 0.4406, + "step": 4775 + }, + { + "epoch": 0.8159085090040112, + "grad_norm": 0.502366249679762, + "learning_rate": 4.0446383409205865e-05, + "loss": 0.4639, + "step": 4780 + }, + { + "epoch": 0.8167619697874883, + "grad_norm": 1.083322203978478, + "learning_rate": 4.043057663125949e-05, + "loss": 0.4567, + "step": 4785 + }, + { + "epoch": 0.8176154305709653, + "grad_norm": 0.4807750034947064, + "learning_rate": 4.04147698533131e-05, + "loss": 0.4534, + "step": 4790 + }, + { + "epoch": 0.8184688913544422, + "grad_norm": 0.42630273833796006, + "learning_rate": 4.039896307536672e-05, + "loss": 0.4424, + "step": 4795 + }, + { + "epoch": 0.8193223521379193, + "grad_norm": 0.5471666962224171, + "learning_rate": 4.0383156297420335e-05, + "loss": 0.5013, + "step": 4800 + }, + { + "epoch": 0.8201758129213963, + "grad_norm": 0.40088630057325625, + "learning_rate": 4.036734951947395e-05, + "loss": 0.4684, + "step": 4805 + }, + { + "epoch": 0.8210292737048732, + "grad_norm": 0.44204961810028137, + "learning_rate": 4.035154274152757e-05, + "loss": 0.4949, + "step": 4810 + }, + { + "epoch": 0.8218827344883503, + "grad_norm": 0.43055591123212844, + "learning_rate": 4.033573596358118e-05, + "loss": 0.4473, + "step": 4815 + }, + { + "epoch": 0.8227361952718273, + "grad_norm": 0.4270110673581626, + "learning_rate": 4.0319929185634805e-05, + "loss": 0.4974, + "step": 4820 + }, + { + "epoch": 0.8235896560553042, + "grad_norm": 0.508380546480638, + "learning_rate": 4.0304122407688416e-05, + "loss": 0.4386, + "step": 4825 + }, + { + "epoch": 0.8244431168387812, + "grad_norm": 0.8178606336757268, + "learning_rate": 4.028831562974203e-05, + "loss": 0.4791, + "step": 4830 + }, + { + "epoch": 0.8252965776222583, + "grad_norm": 10.544437701616161, + "learning_rate": 4.027250885179566e-05, + "loss": 0.4405, + "step": 4835 + }, + { + "epoch": 0.8261500384057353, + "grad_norm": 0.6872417396438704, + "learning_rate": 4.025670207384927e-05, + "loss": 0.4496, + "step": 4840 + }, + { + "epoch": 0.8270034991892122, + "grad_norm": 0.37016603686175553, + "learning_rate": 4.0240895295902885e-05, + "loss": 0.4566, + "step": 4845 + }, + { + "epoch": 0.8278569599726893, + "grad_norm": 0.43379088296174073, + "learning_rate": 4.02250885179565e-05, + "loss": 0.4756, + "step": 4850 + }, + { + "epoch": 0.8287104207561663, + "grad_norm": 0.5484048511104972, + "learning_rate": 4.020928174001012e-05, + "loss": 0.4365, + "step": 4855 + }, + { + "epoch": 0.8295638815396432, + "grad_norm": 0.41841300895893785, + "learning_rate": 4.019347496206374e-05, + "loss": 0.4388, + "step": 4860 + }, + { + "epoch": 0.8304173423231203, + "grad_norm": 0.4472193830259819, + "learning_rate": 4.017766818411735e-05, + "loss": 0.4765, + "step": 4865 + }, + { + "epoch": 0.8312708031065973, + "grad_norm": 0.35853153795034776, + "learning_rate": 4.016186140617097e-05, + "loss": 0.449, + "step": 4870 + }, + { + "epoch": 0.8321242638900742, + "grad_norm": 0.4057918958397313, + "learning_rate": 4.014605462822458e-05, + "loss": 0.5011, + "step": 4875 + }, + { + "epoch": 0.8329777246735512, + "grad_norm": 0.3958130117885879, + "learning_rate": 4.01302478502782e-05, + "loss": 0.4667, + "step": 4880 + }, + { + "epoch": 0.8338311854570283, + "grad_norm": 0.4738033473804357, + "learning_rate": 4.011444107233182e-05, + "loss": 0.4565, + "step": 4885 + }, + { + "epoch": 0.8346846462405052, + "grad_norm": 0.4025662142201199, + "learning_rate": 4.0098634294385436e-05, + "loss": 0.4978, + "step": 4890 + }, + { + "epoch": 0.8355381070239822, + "grad_norm": 0.4434428406725396, + "learning_rate": 4.008282751643905e-05, + "loss": 0.4512, + "step": 4895 + }, + { + "epoch": 0.8363915678074593, + "grad_norm": 0.5150848531149659, + "learning_rate": 4.0067020738492664e-05, + "loss": 0.4427, + "step": 4900 + }, + { + "epoch": 0.8372450285909362, + "grad_norm": 0.45758051602960453, + "learning_rate": 4.005121396054629e-05, + "loss": 0.4313, + "step": 4905 + }, + { + "epoch": 0.8380984893744132, + "grad_norm": 0.43431208239996316, + "learning_rate": 4.00354071825999e-05, + "loss": 0.4606, + "step": 4910 + }, + { + "epoch": 0.8389519501578903, + "grad_norm": 0.36914787928203885, + "learning_rate": 4.0019600404653516e-05, + "loss": 0.4554, + "step": 4915 + }, + { + "epoch": 0.8398054109413673, + "grad_norm": 0.34338051155507815, + "learning_rate": 4.0003793626707134e-05, + "loss": 0.4671, + "step": 4920 + }, + { + "epoch": 0.8406588717248442, + "grad_norm": 0.3423170869242225, + "learning_rate": 3.998798684876075e-05, + "loss": 0.4487, + "step": 4925 + }, + { + "epoch": 0.8415123325083212, + "grad_norm": 0.42559210239977796, + "learning_rate": 3.997218007081437e-05, + "loss": 0.4592, + "step": 4930 + }, + { + "epoch": 0.8423657932917983, + "grad_norm": 0.4026293804737442, + "learning_rate": 3.995637329286798e-05, + "loss": 0.4699, + "step": 4935 + }, + { + "epoch": 0.8432192540752752, + "grad_norm": 0.5283079560242373, + "learning_rate": 3.9940566514921604e-05, + "loss": 0.4578, + "step": 4940 + }, + { + "epoch": 0.8440727148587522, + "grad_norm": 0.576609671663474, + "learning_rate": 3.9924759736975214e-05, + "loss": 0.4704, + "step": 4945 + }, + { + "epoch": 0.8449261756422293, + "grad_norm": 0.3581273587093246, + "learning_rate": 3.990895295902883e-05, + "loss": 0.4376, + "step": 4950 + }, + { + "epoch": 0.8457796364257062, + "grad_norm": 0.4994142001135852, + "learning_rate": 3.989314618108245e-05, + "loss": 0.4848, + "step": 4955 + }, + { + "epoch": 0.8466330972091832, + "grad_norm": 0.40318144070697504, + "learning_rate": 3.9877339403136067e-05, + "loss": 0.4286, + "step": 4960 + }, + { + "epoch": 0.8474865579926603, + "grad_norm": 0.45045415873930383, + "learning_rate": 3.9861532625189684e-05, + "loss": 0.4813, + "step": 4965 + }, + { + "epoch": 0.8483400187761372, + "grad_norm": 0.5210299348454007, + "learning_rate": 3.98457258472433e-05, + "loss": 0.4714, + "step": 4970 + }, + { + "epoch": 0.8491934795596142, + "grad_norm": 0.39032612913864473, + "learning_rate": 3.982991906929692e-05, + "loss": 0.464, + "step": 4975 + }, + { + "epoch": 0.8500469403430913, + "grad_norm": 0.4572815340178369, + "learning_rate": 3.981411229135053e-05, + "loss": 0.4681, + "step": 4980 + }, + { + "epoch": 0.8509004011265683, + "grad_norm": 0.4248878833345479, + "learning_rate": 3.979830551340415e-05, + "loss": 0.4883, + "step": 4985 + }, + { + "epoch": 0.8517538619100452, + "grad_norm": 0.4227418728003324, + "learning_rate": 3.978249873545777e-05, + "loss": 0.4746, + "step": 4990 + }, + { + "epoch": 0.8526073226935222, + "grad_norm": 0.44564270844937715, + "learning_rate": 3.976669195751138e-05, + "loss": 0.4639, + "step": 4995 + }, + { + "epoch": 0.8534607834769993, + "grad_norm": 0.43257274810461194, + "learning_rate": 3.9750885179565e-05, + "loss": 0.4761, + "step": 5000 + }, + { + "epoch": 0.8543142442604762, + "grad_norm": 0.6510055910669625, + "learning_rate": 3.973507840161862e-05, + "loss": 0.483, + "step": 5005 + }, + { + "epoch": 0.8551677050439532, + "grad_norm": 0.4174446601702513, + "learning_rate": 3.9719271623672234e-05, + "loss": 0.4585, + "step": 5010 + }, + { + "epoch": 0.8560211658274303, + "grad_norm": 0.46208025471964614, + "learning_rate": 3.970346484572585e-05, + "loss": 0.4531, + "step": 5015 + }, + { + "epoch": 0.8568746266109072, + "grad_norm": 0.38656337115836403, + "learning_rate": 3.968765806777946e-05, + "loss": 0.4773, + "step": 5020 + }, + { + "epoch": 0.8577280873943842, + "grad_norm": 0.3995048395641776, + "learning_rate": 3.967185128983309e-05, + "loss": 0.492, + "step": 5025 + }, + { + "epoch": 0.8585815481778613, + "grad_norm": 0.646673653905275, + "learning_rate": 3.96560445118867e-05, + "loss": 0.4532, + "step": 5030 + }, + { + "epoch": 0.8594350089613382, + "grad_norm": 0.4480550691124822, + "learning_rate": 3.9640237733940315e-05, + "loss": 0.4809, + "step": 5035 + }, + { + "epoch": 0.8602884697448152, + "grad_norm": 0.5130279783696443, + "learning_rate": 3.962443095599393e-05, + "loss": 0.4597, + "step": 5040 + }, + { + "epoch": 0.8611419305282922, + "grad_norm": 0.5187207242807629, + "learning_rate": 3.960862417804755e-05, + "loss": 0.4516, + "step": 5045 + }, + { + "epoch": 0.8619953913117693, + "grad_norm": 0.3890802932011367, + "learning_rate": 3.959281740010117e-05, + "loss": 0.4479, + "step": 5050 + }, + { + "epoch": 0.8628488520952462, + "grad_norm": 0.444180638493775, + "learning_rate": 3.957701062215478e-05, + "loss": 0.4724, + "step": 5055 + }, + { + "epoch": 0.8637023128787232, + "grad_norm": 0.44490943443342873, + "learning_rate": 3.95612038442084e-05, + "loss": 0.4796, + "step": 5060 + }, + { + "epoch": 0.8645557736622003, + "grad_norm": 0.39454985098988465, + "learning_rate": 3.954539706626201e-05, + "loss": 0.4767, + "step": 5065 + }, + { + "epoch": 0.8654092344456772, + "grad_norm": 0.43075230815685706, + "learning_rate": 3.952959028831563e-05, + "loss": 0.4943, + "step": 5070 + }, + { + "epoch": 0.8662626952291542, + "grad_norm": 0.579774474859966, + "learning_rate": 3.951378351036925e-05, + "loss": 0.4781, + "step": 5075 + }, + { + "epoch": 0.8671161560126313, + "grad_norm": 0.6050594246243188, + "learning_rate": 3.9497976732422865e-05, + "loss": 0.462, + "step": 5080 + }, + { + "epoch": 0.8679696167961082, + "grad_norm": 0.4906701824581924, + "learning_rate": 3.948216995447648e-05, + "loss": 0.4717, + "step": 5085 + }, + { + "epoch": 0.8688230775795852, + "grad_norm": 0.3628131269499048, + "learning_rate": 3.9466363176530093e-05, + "loss": 0.4469, + "step": 5090 + }, + { + "epoch": 0.8696765383630622, + "grad_norm": 0.3875750676153778, + "learning_rate": 3.945055639858372e-05, + "loss": 0.447, + "step": 5095 + }, + { + "epoch": 0.8705299991465392, + "grad_norm": 0.3785699572390181, + "learning_rate": 3.943474962063733e-05, + "loss": 0.4666, + "step": 5100 + }, + { + "epoch": 0.8713834599300162, + "grad_norm": 0.41413344640524147, + "learning_rate": 3.9418942842690946e-05, + "loss": 0.4764, + "step": 5105 + }, + { + "epoch": 0.8722369207134932, + "grad_norm": 0.37990935238738166, + "learning_rate": 3.940313606474457e-05, + "loss": 0.4341, + "step": 5110 + }, + { + "epoch": 0.8730903814969703, + "grad_norm": 0.325922151892048, + "learning_rate": 3.938732928679818e-05, + "loss": 0.4464, + "step": 5115 + }, + { + "epoch": 0.8739438422804472, + "grad_norm": 0.3625407227950122, + "learning_rate": 3.93715225088518e-05, + "loss": 0.4572, + "step": 5120 + }, + { + "epoch": 0.8747973030639242, + "grad_norm": 0.4305070799506871, + "learning_rate": 3.9355715730905416e-05, + "loss": 0.4209, + "step": 5125 + }, + { + "epoch": 0.8756507638474013, + "grad_norm": 0.34770288354175327, + "learning_rate": 3.933990895295903e-05, + "loss": 0.4957, + "step": 5130 + }, + { + "epoch": 0.8765042246308782, + "grad_norm": 0.370649858584003, + "learning_rate": 3.9324102175012644e-05, + "loss": 0.4369, + "step": 5135 + }, + { + "epoch": 0.8773576854143552, + "grad_norm": 0.6631930203969955, + "learning_rate": 3.930829539706626e-05, + "loss": 0.4637, + "step": 5140 + }, + { + "epoch": 0.8782111461978322, + "grad_norm": 0.3821885520825206, + "learning_rate": 3.9292488619119885e-05, + "loss": 0.4468, + "step": 5145 + }, + { + "epoch": 0.8790646069813092, + "grad_norm": 0.42015351568091, + "learning_rate": 3.9276681841173496e-05, + "loss": 0.4551, + "step": 5150 + }, + { + "epoch": 0.8799180677647862, + "grad_norm": 0.39848695247719024, + "learning_rate": 3.9260875063227114e-05, + "loss": 0.4633, + "step": 5155 + }, + { + "epoch": 0.8807715285482632, + "grad_norm": 0.39805656884228074, + "learning_rate": 3.924506828528073e-05, + "loss": 0.453, + "step": 5160 + }, + { + "epoch": 0.8816249893317402, + "grad_norm": 0.72730511596657, + "learning_rate": 3.922926150733435e-05, + "loss": 0.482, + "step": 5165 + }, + { + "epoch": 0.8824784501152172, + "grad_norm": 0.3835150212424347, + "learning_rate": 3.9213454729387966e-05, + "loss": 0.4268, + "step": 5170 + }, + { + "epoch": 0.8833319108986942, + "grad_norm": 0.5549806218770832, + "learning_rate": 3.919764795144158e-05, + "loss": 0.4976, + "step": 5175 + }, + { + "epoch": 0.8841853716821712, + "grad_norm": 0.45886131028562016, + "learning_rate": 3.91818411734952e-05, + "loss": 0.4771, + "step": 5180 + }, + { + "epoch": 0.8850388324656482, + "grad_norm": 0.38461247343867677, + "learning_rate": 3.916603439554881e-05, + "loss": 0.4479, + "step": 5185 + }, + { + "epoch": 0.8858922932491252, + "grad_norm": 0.39432030577402233, + "learning_rate": 3.915022761760243e-05, + "loss": 0.4389, + "step": 5190 + }, + { + "epoch": 0.8867457540326023, + "grad_norm": 0.4445775875626058, + "learning_rate": 3.9134420839656046e-05, + "loss": 0.4424, + "step": 5195 + }, + { + "epoch": 0.8875992148160792, + "grad_norm": 0.47542478261605625, + "learning_rate": 3.9118614061709664e-05, + "loss": 0.4603, + "step": 5200 + }, + { + "epoch": 0.8884526755995562, + "grad_norm": 0.3887252750059788, + "learning_rate": 3.910280728376328e-05, + "loss": 0.446, + "step": 5205 + }, + { + "epoch": 0.8893061363830332, + "grad_norm": 0.5388718629032798, + "learning_rate": 3.908700050581689e-05, + "loss": 0.4911, + "step": 5210 + }, + { + "epoch": 0.8901595971665102, + "grad_norm": 0.415472898172573, + "learning_rate": 3.9071193727870516e-05, + "loss": 0.4503, + "step": 5215 + }, + { + "epoch": 0.8910130579499872, + "grad_norm": 0.41063762995113084, + "learning_rate": 3.905538694992413e-05, + "loss": 0.4546, + "step": 5220 + }, + { + "epoch": 0.8918665187334642, + "grad_norm": 0.44084348091256453, + "learning_rate": 3.9039580171977744e-05, + "loss": 0.4809, + "step": 5225 + }, + { + "epoch": 0.8927199795169412, + "grad_norm": 0.45463091201990335, + "learning_rate": 3.902377339403136e-05, + "loss": 0.4559, + "step": 5230 + }, + { + "epoch": 0.8935734403004182, + "grad_norm": 0.38493395938624353, + "learning_rate": 3.900796661608498e-05, + "loss": 0.4572, + "step": 5235 + }, + { + "epoch": 0.8944269010838952, + "grad_norm": 0.3792702027768502, + "learning_rate": 3.89921598381386e-05, + "loss": 0.4438, + "step": 5240 + }, + { + "epoch": 0.8952803618673721, + "grad_norm": 0.35552476732779537, + "learning_rate": 3.8976353060192214e-05, + "loss": 0.4857, + "step": 5245 + }, + { + "epoch": 0.8961338226508492, + "grad_norm": 0.4411242106417821, + "learning_rate": 3.896054628224583e-05, + "loss": 0.4431, + "step": 5250 + }, + { + "epoch": 0.8969872834343262, + "grad_norm": 0.3501135290159158, + "learning_rate": 3.894473950429944e-05, + "loss": 0.475, + "step": 5255 + }, + { + "epoch": 0.8978407442178032, + "grad_norm": 0.46967490673458273, + "learning_rate": 3.892893272635306e-05, + "loss": 0.4732, + "step": 5260 + }, + { + "epoch": 0.8986942050012802, + "grad_norm": 0.41543836571528997, + "learning_rate": 3.8913125948406684e-05, + "loss": 0.4693, + "step": 5265 + }, + { + "epoch": 0.8995476657847572, + "grad_norm": 0.3803928447120958, + "learning_rate": 3.8897319170460295e-05, + "loss": 0.4574, + "step": 5270 + }, + { + "epoch": 0.9004011265682342, + "grad_norm": 0.4043183675600425, + "learning_rate": 3.888151239251391e-05, + "loss": 0.4446, + "step": 5275 + }, + { + "epoch": 0.9012545873517112, + "grad_norm": 0.35773284894988394, + "learning_rate": 3.886570561456753e-05, + "loss": 0.4506, + "step": 5280 + }, + { + "epoch": 0.9021080481351882, + "grad_norm": 0.39756526693714445, + "learning_rate": 3.884989883662115e-05, + "loss": 0.4637, + "step": 5285 + }, + { + "epoch": 0.9029615089186652, + "grad_norm": 0.3968235423417824, + "learning_rate": 3.883409205867476e-05, + "loss": 0.4524, + "step": 5290 + }, + { + "epoch": 0.9038149697021421, + "grad_norm": 0.3976823096856294, + "learning_rate": 3.8818285280728375e-05, + "loss": 0.4617, + "step": 5295 + }, + { + "epoch": 0.9046684304856192, + "grad_norm": 0.37734719862477745, + "learning_rate": 3.8802478502782e-05, + "loss": 0.4396, + "step": 5300 + }, + { + "epoch": 0.9055218912690962, + "grad_norm": 0.3713872249714145, + "learning_rate": 3.878667172483561e-05, + "loss": 0.4701, + "step": 5305 + }, + { + "epoch": 0.9063753520525731, + "grad_norm": 0.356091804946768, + "learning_rate": 3.877086494688923e-05, + "loss": 0.4368, + "step": 5310 + }, + { + "epoch": 0.9072288128360502, + "grad_norm": 0.394160250063924, + "learning_rate": 3.8755058168942845e-05, + "loss": 0.4555, + "step": 5315 + }, + { + "epoch": 0.9080822736195272, + "grad_norm": 0.43170759667100317, + "learning_rate": 3.873925139099646e-05, + "loss": 0.4284, + "step": 5320 + }, + { + "epoch": 0.9089357344030042, + "grad_norm": 0.4179349029183989, + "learning_rate": 3.872344461305007e-05, + "loss": 0.4523, + "step": 5325 + }, + { + "epoch": 0.9097891951864812, + "grad_norm": 0.3664464693848915, + "learning_rate": 3.870763783510369e-05, + "loss": 0.4502, + "step": 5330 + }, + { + "epoch": 0.9106426559699582, + "grad_norm": 0.38687957544969326, + "learning_rate": 3.8691831057157315e-05, + "loss": 0.4597, + "step": 5335 + }, + { + "epoch": 0.9114961167534352, + "grad_norm": 0.44979205797145444, + "learning_rate": 3.8676024279210926e-05, + "loss": 0.4613, + "step": 5340 + }, + { + "epoch": 0.9123495775369121, + "grad_norm": 0.38598553360485094, + "learning_rate": 3.866021750126454e-05, + "loss": 0.4427, + "step": 5345 + }, + { + "epoch": 0.9132030383203892, + "grad_norm": 1.6236580699720844, + "learning_rate": 3.864441072331816e-05, + "loss": 0.4739, + "step": 5350 + }, + { + "epoch": 0.9140564991038662, + "grad_norm": 0.4246158964041592, + "learning_rate": 3.862860394537178e-05, + "loss": 0.4876, + "step": 5355 + }, + { + "epoch": 0.9149099598873431, + "grad_norm": 0.41328536026606627, + "learning_rate": 3.8612797167425395e-05, + "loss": 0.4887, + "step": 5360 + }, + { + "epoch": 0.9157634206708202, + "grad_norm": 0.7643525341386238, + "learning_rate": 3.859699038947901e-05, + "loss": 0.4387, + "step": 5365 + }, + { + "epoch": 0.9166168814542972, + "grad_norm": 0.39333252384942685, + "learning_rate": 3.858118361153263e-05, + "loss": 0.4603, + "step": 5370 + }, + { + "epoch": 0.9174703422377741, + "grad_norm": 0.36549208355741303, + "learning_rate": 3.856537683358624e-05, + "loss": 0.4313, + "step": 5375 + }, + { + "epoch": 0.9183238030212512, + "grad_norm": 0.38888946522034146, + "learning_rate": 3.854957005563986e-05, + "loss": 0.4526, + "step": 5380 + }, + { + "epoch": 0.9191772638047282, + "grad_norm": 0.39411286500370646, + "learning_rate": 3.8533763277693476e-05, + "loss": 0.4325, + "step": 5385 + }, + { + "epoch": 0.9200307245882051, + "grad_norm": 0.3927495233983897, + "learning_rate": 3.8517956499747093e-05, + "loss": 0.4575, + "step": 5390 + }, + { + "epoch": 0.9208841853716822, + "grad_norm": 0.42377440168134617, + "learning_rate": 3.850214972180071e-05, + "loss": 0.4673, + "step": 5395 + }, + { + "epoch": 0.9217376461551592, + "grad_norm": 0.4432199119619774, + "learning_rate": 3.848634294385433e-05, + "loss": 0.4875, + "step": 5400 + }, + { + "epoch": 0.9225911069386362, + "grad_norm": 0.40585755422175995, + "learning_rate": 3.8470536165907946e-05, + "loss": 0.4389, + "step": 5405 + }, + { + "epoch": 0.9234445677221131, + "grad_norm": 0.4538103442430755, + "learning_rate": 3.8454729387961556e-05, + "loss": 0.4859, + "step": 5410 + }, + { + "epoch": 0.9242980285055902, + "grad_norm": 0.3621378142955139, + "learning_rate": 3.8438922610015174e-05, + "loss": 0.4815, + "step": 5415 + }, + { + "epoch": 0.9251514892890672, + "grad_norm": 0.3697916050267976, + "learning_rate": 3.84231158320688e-05, + "loss": 0.4686, + "step": 5420 + }, + { + "epoch": 0.9260049500725441, + "grad_norm": 0.3792912721600657, + "learning_rate": 3.840730905412241e-05, + "loss": 0.4481, + "step": 5425 + }, + { + "epoch": 0.9268584108560212, + "grad_norm": 0.34922627192313405, + "learning_rate": 3.8391502276176026e-05, + "loss": 0.446, + "step": 5430 + }, + { + "epoch": 0.9277118716394982, + "grad_norm": 0.3940130604158538, + "learning_rate": 3.8375695498229644e-05, + "loss": 0.4578, + "step": 5435 + }, + { + "epoch": 0.9285653324229751, + "grad_norm": 0.4821375933416552, + "learning_rate": 3.835988872028326e-05, + "loss": 0.4579, + "step": 5440 + }, + { + "epoch": 0.9294187932064522, + "grad_norm": 0.3960119821025436, + "learning_rate": 3.834408194233687e-05, + "loss": 0.4309, + "step": 5445 + }, + { + "epoch": 0.9302722539899292, + "grad_norm": 0.6492593695986181, + "learning_rate": 3.832827516439049e-05, + "loss": 0.443, + "step": 5450 + }, + { + "epoch": 0.9311257147734061, + "grad_norm": 0.3866711989379687, + "learning_rate": 3.8312468386444114e-05, + "loss": 0.4596, + "step": 5455 + }, + { + "epoch": 0.9319791755568831, + "grad_norm": 0.4548216235756429, + "learning_rate": 3.8296661608497724e-05, + "loss": 0.4493, + "step": 5460 + }, + { + "epoch": 0.9328326363403602, + "grad_norm": 0.3559021780563998, + "learning_rate": 3.828085483055134e-05, + "loss": 0.4338, + "step": 5465 + }, + { + "epoch": 0.9336860971238372, + "grad_norm": 0.34946564902279204, + "learning_rate": 3.826504805260496e-05, + "loss": 0.4472, + "step": 5470 + }, + { + "epoch": 0.9345395579073141, + "grad_norm": 0.37578990267881635, + "learning_rate": 3.824924127465858e-05, + "loss": 0.4496, + "step": 5475 + }, + { + "epoch": 0.9353930186907912, + "grad_norm": 0.4716502415591886, + "learning_rate": 3.823343449671219e-05, + "loss": 0.5159, + "step": 5480 + }, + { + "epoch": 0.9362464794742682, + "grad_norm": 1.0266519725140473, + "learning_rate": 3.821762771876581e-05, + "loss": 0.4503, + "step": 5485 + }, + { + "epoch": 0.9370999402577451, + "grad_norm": 0.44412547391687285, + "learning_rate": 3.820182094081943e-05, + "loss": 0.4524, + "step": 5490 + }, + { + "epoch": 0.9379534010412222, + "grad_norm": 0.503593542190173, + "learning_rate": 3.818601416287304e-05, + "loss": 0.495, + "step": 5495 + }, + { + "epoch": 0.9388068618246992, + "grad_norm": 0.390089010381301, + "learning_rate": 3.817020738492666e-05, + "loss": 0.4438, + "step": 5500 + }, + { + "epoch": 0.9396603226081761, + "grad_norm": 0.5237476550840837, + "learning_rate": 3.8154400606980275e-05, + "loss": 0.4737, + "step": 5505 + }, + { + "epoch": 0.9405137833916531, + "grad_norm": 0.5745085161307547, + "learning_rate": 3.813859382903389e-05, + "loss": 0.4677, + "step": 5510 + }, + { + "epoch": 0.9413672441751302, + "grad_norm": 0.42441116822101493, + "learning_rate": 3.812278705108751e-05, + "loss": 0.433, + "step": 5515 + }, + { + "epoch": 0.9422207049586071, + "grad_norm": 0.4141184188803559, + "learning_rate": 3.810698027314113e-05, + "loss": 0.4731, + "step": 5520 + }, + { + "epoch": 0.9430741657420841, + "grad_norm": 0.3863082733523815, + "learning_rate": 3.8091173495194744e-05, + "loss": 0.4562, + "step": 5525 + }, + { + "epoch": 0.9439276265255612, + "grad_norm": 0.4710681089385406, + "learning_rate": 3.8075366717248355e-05, + "loss": 0.4643, + "step": 5530 + }, + { + "epoch": 0.9447810873090382, + "grad_norm": 0.4433783546619392, + "learning_rate": 3.805955993930197e-05, + "loss": 0.4793, + "step": 5535 + }, + { + "epoch": 0.9456345480925151, + "grad_norm": 0.40443344260421626, + "learning_rate": 3.804375316135559e-05, + "loss": 0.4623, + "step": 5540 + }, + { + "epoch": 0.9464880088759922, + "grad_norm": 0.47758717691243485, + "learning_rate": 3.802794638340921e-05, + "loss": 0.4584, + "step": 5545 + }, + { + "epoch": 0.9473414696594692, + "grad_norm": 0.5070813150960025, + "learning_rate": 3.8012139605462825e-05, + "loss": 0.4451, + "step": 5550 + }, + { + "epoch": 0.9481949304429461, + "grad_norm": 0.4787056345105152, + "learning_rate": 3.799633282751644e-05, + "loss": 0.4798, + "step": 5555 + }, + { + "epoch": 0.9490483912264231, + "grad_norm": 0.41180507381024867, + "learning_rate": 3.798052604957006e-05, + "loss": 0.4603, + "step": 5560 + }, + { + "epoch": 0.9499018520099002, + "grad_norm": 0.4275967348346644, + "learning_rate": 3.796471927162367e-05, + "loss": 0.4589, + "step": 5565 + }, + { + "epoch": 0.9507553127933771, + "grad_norm": 0.409501042986901, + "learning_rate": 3.794891249367729e-05, + "loss": 0.4745, + "step": 5570 + }, + { + "epoch": 0.9516087735768541, + "grad_norm": 0.4671380078413468, + "learning_rate": 3.793310571573091e-05, + "loss": 0.4755, + "step": 5575 + }, + { + "epoch": 0.9524622343603312, + "grad_norm": 0.4160105726538355, + "learning_rate": 3.791729893778452e-05, + "loss": 0.4587, + "step": 5580 + }, + { + "epoch": 0.9533156951438081, + "grad_norm": 0.4816419888813679, + "learning_rate": 3.790149215983814e-05, + "loss": 0.4593, + "step": 5585 + }, + { + "epoch": 0.9541691559272851, + "grad_norm": 0.41394130403273693, + "learning_rate": 3.788568538189176e-05, + "loss": 0.4666, + "step": 5590 + }, + { + "epoch": 0.9550226167107622, + "grad_norm": 0.42424423245469944, + "learning_rate": 3.7869878603945375e-05, + "loss": 0.4545, + "step": 5595 + }, + { + "epoch": 0.9558760774942391, + "grad_norm": 0.4639841340731286, + "learning_rate": 3.7854071825998986e-05, + "loss": 0.4488, + "step": 5600 + }, + { + "epoch": 0.9567295382777161, + "grad_norm": 0.5060482156153165, + "learning_rate": 3.783826504805261e-05, + "loss": 0.4315, + "step": 5605 + }, + { + "epoch": 0.9575829990611932, + "grad_norm": 0.4145197030432268, + "learning_rate": 3.782245827010623e-05, + "loss": 0.4472, + "step": 5610 + }, + { + "epoch": 0.9584364598446702, + "grad_norm": 0.42736718516659117, + "learning_rate": 3.780665149215984e-05, + "loss": 0.463, + "step": 5615 + }, + { + "epoch": 0.9592899206281471, + "grad_norm": 0.5099800714999574, + "learning_rate": 3.7790844714213456e-05, + "loss": 0.4494, + "step": 5620 + }, + { + "epoch": 0.9601433814116241, + "grad_norm": 0.3932085090574082, + "learning_rate": 3.777503793626707e-05, + "loss": 0.4472, + "step": 5625 + }, + { + "epoch": 0.9609968421951012, + "grad_norm": 0.4041638575629075, + "learning_rate": 3.775923115832069e-05, + "loss": 0.4716, + "step": 5630 + }, + { + "epoch": 0.9618503029785781, + "grad_norm": 0.35654399586608365, + "learning_rate": 3.77434243803743e-05, + "loss": 0.4256, + "step": 5635 + }, + { + "epoch": 0.9627037637620551, + "grad_norm": 0.40490143000313356, + "learning_rate": 3.7727617602427926e-05, + "loss": 0.4427, + "step": 5640 + }, + { + "epoch": 0.9635572245455322, + "grad_norm": 0.4306969286472383, + "learning_rate": 3.771181082448154e-05, + "loss": 0.4599, + "step": 5645 + }, + { + "epoch": 0.9644106853290091, + "grad_norm": 0.44608416080202795, + "learning_rate": 3.7696004046535154e-05, + "loss": 0.4504, + "step": 5650 + }, + { + "epoch": 0.9652641461124861, + "grad_norm": 0.4909756812121427, + "learning_rate": 3.768019726858877e-05, + "loss": 0.4454, + "step": 5655 + }, + { + "epoch": 0.9661176068959632, + "grad_norm": 0.42378492668995615, + "learning_rate": 3.766439049064239e-05, + "loss": 0.4792, + "step": 5660 + }, + { + "epoch": 0.9669710676794401, + "grad_norm": 0.39735362969367277, + "learning_rate": 3.7648583712696006e-05, + "loss": 0.4881, + "step": 5665 + }, + { + "epoch": 0.9678245284629171, + "grad_norm": 0.3573636936700055, + "learning_rate": 3.7632776934749624e-05, + "loss": 0.431, + "step": 5670 + }, + { + "epoch": 0.9686779892463941, + "grad_norm": 0.856707855907997, + "learning_rate": 3.761697015680324e-05, + "loss": 0.4403, + "step": 5675 + }, + { + "epoch": 0.9695314500298712, + "grad_norm": 0.3907319920812853, + "learning_rate": 3.760116337885686e-05, + "loss": 0.4591, + "step": 5680 + }, + { + "epoch": 0.9703849108133481, + "grad_norm": 0.3617016366595509, + "learning_rate": 3.758535660091047e-05, + "loss": 0.4381, + "step": 5685 + }, + { + "epoch": 0.9712383715968251, + "grad_norm": 0.4184546987813738, + "learning_rate": 3.756954982296409e-05, + "loss": 0.5012, + "step": 5690 + }, + { + "epoch": 0.9720918323803022, + "grad_norm": 4.862897933769316, + "learning_rate": 3.7553743045017704e-05, + "loss": 0.4867, + "step": 5695 + }, + { + "epoch": 0.9729452931637791, + "grad_norm": 0.5282636819263453, + "learning_rate": 3.753793626707132e-05, + "loss": 0.4659, + "step": 5700 + }, + { + "epoch": 0.9737987539472561, + "grad_norm": 0.43193117699851213, + "learning_rate": 3.752212948912494e-05, + "loss": 0.4693, + "step": 5705 + }, + { + "epoch": 0.9746522147307332, + "grad_norm": 0.42975743283527323, + "learning_rate": 3.7506322711178557e-05, + "loss": 0.4738, + "step": 5710 + }, + { + "epoch": 0.9755056755142101, + "grad_norm": 0.4343654508833946, + "learning_rate": 3.7490515933232174e-05, + "loss": 0.4388, + "step": 5715 + }, + { + "epoch": 0.9763591362976871, + "grad_norm": 0.365610873557326, + "learning_rate": 3.7474709155285785e-05, + "loss": 0.4799, + "step": 5720 + }, + { + "epoch": 0.9772125970811641, + "grad_norm": 0.4683124404513186, + "learning_rate": 3.74589023773394e-05, + "loss": 0.4498, + "step": 5725 + }, + { + "epoch": 0.9780660578646411, + "grad_norm": 0.7878480520304444, + "learning_rate": 3.744309559939302e-05, + "loss": 0.4575, + "step": 5730 + }, + { + "epoch": 0.9789195186481181, + "grad_norm": 0.4801039090352379, + "learning_rate": 3.742728882144664e-05, + "loss": 0.484, + "step": 5735 + }, + { + "epoch": 0.9797729794315951, + "grad_norm": 0.5152189238178845, + "learning_rate": 3.7411482043500254e-05, + "loss": 0.4705, + "step": 5740 + }, + { + "epoch": 0.9806264402150722, + "grad_norm": 0.4867310274108711, + "learning_rate": 3.739567526555387e-05, + "loss": 0.4771, + "step": 5745 + }, + { + "epoch": 0.9814799009985491, + "grad_norm": 0.5645780923606522, + "learning_rate": 3.737986848760749e-05, + "loss": 0.4746, + "step": 5750 + }, + { + "epoch": 0.9823333617820261, + "grad_norm": 0.38450727974879706, + "learning_rate": 3.73640617096611e-05, + "loss": 0.4484, + "step": 5755 + }, + { + "epoch": 0.9831868225655032, + "grad_norm": 0.41981751648706767, + "learning_rate": 3.7348254931714724e-05, + "loss": 0.4366, + "step": 5760 + }, + { + "epoch": 0.9840402833489801, + "grad_norm": 0.5124579749342585, + "learning_rate": 3.733244815376834e-05, + "loss": 0.4655, + "step": 5765 + }, + { + "epoch": 0.9848937441324571, + "grad_norm": 0.43040573748634436, + "learning_rate": 3.731664137582195e-05, + "loss": 0.4458, + "step": 5770 + }, + { + "epoch": 0.9857472049159341, + "grad_norm": 0.371097389052583, + "learning_rate": 3.730083459787557e-05, + "loss": 0.4646, + "step": 5775 + }, + { + "epoch": 0.9866006656994111, + "grad_norm": 0.8835102192758597, + "learning_rate": 3.728502781992919e-05, + "loss": 0.4987, + "step": 5780 + }, + { + "epoch": 0.9874541264828881, + "grad_norm": 0.34985676739187793, + "learning_rate": 3.7269221041982805e-05, + "loss": 0.4307, + "step": 5785 + }, + { + "epoch": 0.9883075872663651, + "grad_norm": 0.4482204094674099, + "learning_rate": 3.7253414264036416e-05, + "loss": 0.462, + "step": 5790 + }, + { + "epoch": 0.9891610480498421, + "grad_norm": 0.4641563365689893, + "learning_rate": 3.723760748609004e-05, + "loss": 0.4458, + "step": 5795 + }, + { + "epoch": 0.9900145088333191, + "grad_norm": 0.6375724302920852, + "learning_rate": 3.722180070814366e-05, + "loss": 0.4595, + "step": 5800 + }, + { + "epoch": 0.9908679696167961, + "grad_norm": 0.5559232925637143, + "learning_rate": 3.720599393019727e-05, + "loss": 0.4439, + "step": 5805 + }, + { + "epoch": 0.9917214304002732, + "grad_norm": 0.37657819341882687, + "learning_rate": 3.7190187152250885e-05, + "loss": 0.4632, + "step": 5810 + }, + { + "epoch": 0.9925748911837501, + "grad_norm": 0.3901766835132743, + "learning_rate": 3.71743803743045e-05, + "loss": 0.4596, + "step": 5815 + }, + { + "epoch": 0.9934283519672271, + "grad_norm": 0.4313043353543776, + "learning_rate": 3.715857359635812e-05, + "loss": 0.4735, + "step": 5820 + }, + { + "epoch": 0.9942818127507042, + "grad_norm": 0.37318724262666314, + "learning_rate": 3.714276681841174e-05, + "loss": 0.4684, + "step": 5825 + }, + { + "epoch": 0.9951352735341811, + "grad_norm": 0.4813693313647458, + "learning_rate": 3.7126960040465355e-05, + "loss": 0.4623, + "step": 5830 + }, + { + "epoch": 0.9959887343176581, + "grad_norm": 0.4486878697427319, + "learning_rate": 3.711115326251897e-05, + "loss": 0.4533, + "step": 5835 + }, + { + "epoch": 0.9968421951011351, + "grad_norm": 0.42413980193320633, + "learning_rate": 3.709534648457258e-05, + "loss": 0.4569, + "step": 5840 + }, + { + "epoch": 0.9976956558846121, + "grad_norm": 0.4333397282299119, + "learning_rate": 3.70795397066262e-05, + "loss": 0.4722, + "step": 5845 + }, + { + "epoch": 0.9985491166680891, + "grad_norm": 0.3499741056409646, + "learning_rate": 3.706373292867982e-05, + "loss": 0.422, + "step": 5850 + }, + { + "epoch": 0.9994025774515661, + "grad_norm": 0.4027209707705357, + "learning_rate": 3.7047926150733436e-05, + "loss": 0.4636, + "step": 5855 + }, + { + "epoch": 1.0001706921566953, + "grad_norm": 0.6288062417613335, + "learning_rate": 3.703211937278705e-05, + "loss": 0.4473, + "step": 5860 + }, + { + "epoch": 1.0010241529401724, + "grad_norm": 0.38754061072346385, + "learning_rate": 3.701631259484067e-05, + "loss": 0.3766, + "step": 5865 + }, + { + "epoch": 1.0018776137236494, + "grad_norm": 0.33960600570302646, + "learning_rate": 3.700050581689429e-05, + "loss": 0.3663, + "step": 5870 + }, + { + "epoch": 1.0027310745071265, + "grad_norm": 0.4136489943272252, + "learning_rate": 3.69846990389479e-05, + "loss": 0.4111, + "step": 5875 + }, + { + "epoch": 1.0035845352906034, + "grad_norm": 0.61166635664291, + "learning_rate": 3.696889226100152e-05, + "loss": 0.3794, + "step": 5880 + }, + { + "epoch": 1.0044379960740804, + "grad_norm": 0.42563074032686277, + "learning_rate": 3.6953085483055134e-05, + "loss": 0.3654, + "step": 5885 + }, + { + "epoch": 1.0052914568575575, + "grad_norm": 0.4164695520452888, + "learning_rate": 3.693727870510875e-05, + "loss": 0.4062, + "step": 5890 + }, + { + "epoch": 1.0061449176410344, + "grad_norm": 0.35877236247587296, + "learning_rate": 3.692147192716237e-05, + "loss": 0.3721, + "step": 5895 + }, + { + "epoch": 1.0069983784245113, + "grad_norm": 0.4070961912817456, + "learning_rate": 3.6905665149215986e-05, + "loss": 0.3549, + "step": 5900 + }, + { + "epoch": 1.0078518392079885, + "grad_norm": 0.4514198162271714, + "learning_rate": 3.6889858371269603e-05, + "loss": 0.3823, + "step": 5905 + }, + { + "epoch": 1.0087052999914654, + "grad_norm": 0.35551701094292903, + "learning_rate": 3.6874051593323214e-05, + "loss": 0.3563, + "step": 5910 + }, + { + "epoch": 1.0095587607749423, + "grad_norm": 0.4206702553978487, + "learning_rate": 3.685824481537684e-05, + "loss": 0.39, + "step": 5915 + }, + { + "epoch": 1.0104122215584195, + "grad_norm": 0.4050493735514043, + "learning_rate": 3.6842438037430456e-05, + "loss": 0.3833, + "step": 5920 + }, + { + "epoch": 1.0112656823418964, + "grad_norm": 0.41089629306713504, + "learning_rate": 3.6826631259484067e-05, + "loss": 0.391, + "step": 5925 + }, + { + "epoch": 1.0121191431253733, + "grad_norm": 0.4847686846807888, + "learning_rate": 3.6810824481537684e-05, + "loss": 0.3576, + "step": 5930 + }, + { + "epoch": 1.0129726039088505, + "grad_norm": 0.4310668413326392, + "learning_rate": 3.67950177035913e-05, + "loss": 0.4042, + "step": 5935 + }, + { + "epoch": 1.0138260646923274, + "grad_norm": 0.3616617447543886, + "learning_rate": 3.677921092564492e-05, + "loss": 0.4214, + "step": 5940 + }, + { + "epoch": 1.0146795254758043, + "grad_norm": 0.3256661980486338, + "learning_rate": 3.676340414769853e-05, + "loss": 0.3656, + "step": 5945 + }, + { + "epoch": 1.0155329862592815, + "grad_norm": 0.37290726450601397, + "learning_rate": 3.6747597369752154e-05, + "loss": 0.3952, + "step": 5950 + }, + { + "epoch": 1.0163864470427584, + "grad_norm": 0.33989355267900506, + "learning_rate": 3.673179059180577e-05, + "loss": 0.3617, + "step": 5955 + }, + { + "epoch": 1.0172399078262353, + "grad_norm": 0.39699402875988227, + "learning_rate": 3.671598381385938e-05, + "loss": 0.3721, + "step": 5960 + }, + { + "epoch": 1.0180933686097124, + "grad_norm": 0.3264087946543946, + "learning_rate": 3.6700177035913e-05, + "loss": 0.3734, + "step": 5965 + }, + { + "epoch": 1.0189468293931894, + "grad_norm": 0.4111875246383104, + "learning_rate": 3.668437025796662e-05, + "loss": 0.3915, + "step": 5970 + }, + { + "epoch": 1.0198002901766663, + "grad_norm": 0.3995828309649097, + "learning_rate": 3.6668563480020234e-05, + "loss": 0.3882, + "step": 5975 + }, + { + "epoch": 1.0206537509601434, + "grad_norm": 0.3747886147565575, + "learning_rate": 3.6652756702073845e-05, + "loss": 0.3916, + "step": 5980 + }, + { + "epoch": 1.0215072117436204, + "grad_norm": 0.3372085408259476, + "learning_rate": 3.663694992412747e-05, + "loss": 0.3849, + "step": 5985 + }, + { + "epoch": 1.0223606725270973, + "grad_norm": 0.4160692716074461, + "learning_rate": 3.662114314618109e-05, + "loss": 0.3641, + "step": 5990 + }, + { + "epoch": 1.0232141333105744, + "grad_norm": 0.3729675376120794, + "learning_rate": 3.66053363682347e-05, + "loss": 0.4029, + "step": 5995 + }, + { + "epoch": 1.0240675940940513, + "grad_norm": 0.33227559391363237, + "learning_rate": 3.658952959028832e-05, + "loss": 0.3856, + "step": 6000 + }, + { + "epoch": 1.0249210548775283, + "grad_norm": 0.35293680411023437, + "learning_rate": 3.657372281234193e-05, + "loss": 0.3484, + "step": 6005 + }, + { + "epoch": 1.0257745156610054, + "grad_norm": 0.4034296244720003, + "learning_rate": 3.655791603439555e-05, + "loss": 0.3546, + "step": 6010 + }, + { + "epoch": 1.0266279764444823, + "grad_norm": 0.38737538918982256, + "learning_rate": 3.654210925644917e-05, + "loss": 0.3696, + "step": 6015 + }, + { + "epoch": 1.0274814372279595, + "grad_norm": 0.35939055045698165, + "learning_rate": 3.6526302478502785e-05, + "loss": 0.386, + "step": 6020 + }, + { + "epoch": 1.0283348980114364, + "grad_norm": 0.3428720112959876, + "learning_rate": 3.65104957005564e-05, + "loss": 0.3796, + "step": 6025 + }, + { + "epoch": 1.0291883587949133, + "grad_norm": 0.33521610732253654, + "learning_rate": 3.649468892261001e-05, + "loss": 0.3781, + "step": 6030 + }, + { + "epoch": 1.0300418195783905, + "grad_norm": 0.37984871713555923, + "learning_rate": 3.647888214466364e-05, + "loss": 0.3796, + "step": 6035 + }, + { + "epoch": 1.0308952803618674, + "grad_norm": 0.5985744943668265, + "learning_rate": 3.646307536671725e-05, + "loss": 0.3668, + "step": 6040 + }, + { + "epoch": 1.0317487411453443, + "grad_norm": 0.3352657163150749, + "learning_rate": 3.6447268588770865e-05, + "loss": 0.409, + "step": 6045 + }, + { + "epoch": 1.0326022019288215, + "grad_norm": 0.3491380338159307, + "learning_rate": 3.643146181082448e-05, + "loss": 0.3683, + "step": 6050 + }, + { + "epoch": 1.0334556627122984, + "grad_norm": 0.46292870144229437, + "learning_rate": 3.64156550328781e-05, + "loss": 0.3837, + "step": 6055 + }, + { + "epoch": 1.0343091234957753, + "grad_norm": 0.30738126723971215, + "learning_rate": 3.639984825493172e-05, + "loss": 0.3796, + "step": 6060 + }, + { + "epoch": 1.0351625842792525, + "grad_norm": 0.34472849345856665, + "learning_rate": 3.638404147698533e-05, + "loss": 0.3911, + "step": 6065 + }, + { + "epoch": 1.0360160450627294, + "grad_norm": 0.3346180101688685, + "learning_rate": 3.636823469903895e-05, + "loss": 0.3668, + "step": 6070 + }, + { + "epoch": 1.0368695058462063, + "grad_norm": 0.35690751165072565, + "learning_rate": 3.635242792109257e-05, + "loss": 0.3778, + "step": 6075 + }, + { + "epoch": 1.0377229666296834, + "grad_norm": 0.3677037386510019, + "learning_rate": 3.633662114314618e-05, + "loss": 0.37, + "step": 6080 + }, + { + "epoch": 1.0385764274131604, + "grad_norm": 0.4255067817756184, + "learning_rate": 3.63208143651998e-05, + "loss": 0.3993, + "step": 6085 + }, + { + "epoch": 1.0394298881966373, + "grad_norm": 0.47237275766808884, + "learning_rate": 3.6305007587253416e-05, + "loss": 0.4001, + "step": 6090 + }, + { + "epoch": 1.0402833489801144, + "grad_norm": 0.3160522482641632, + "learning_rate": 3.628920080930703e-05, + "loss": 0.3811, + "step": 6095 + }, + { + "epoch": 1.0411368097635914, + "grad_norm": 0.420033345369002, + "learning_rate": 3.6273394031360644e-05, + "loss": 0.3561, + "step": 6100 + }, + { + "epoch": 1.0419902705470683, + "grad_norm": 0.45986808309255994, + "learning_rate": 3.625758725341427e-05, + "loss": 0.3515, + "step": 6105 + }, + { + "epoch": 1.0428437313305454, + "grad_norm": 0.4289388137838117, + "learning_rate": 3.6241780475467885e-05, + "loss": 0.3903, + "step": 6110 + }, + { + "epoch": 1.0436971921140223, + "grad_norm": 0.44362015679472544, + "learning_rate": 3.6225973697521496e-05, + "loss": 0.3781, + "step": 6115 + }, + { + "epoch": 1.0445506528974993, + "grad_norm": 0.415183964660911, + "learning_rate": 3.621016691957512e-05, + "loss": 0.39, + "step": 6120 + }, + { + "epoch": 1.0454041136809764, + "grad_norm": 0.31920896239376617, + "learning_rate": 3.619436014162873e-05, + "loss": 0.4002, + "step": 6125 + }, + { + "epoch": 1.0462575744644533, + "grad_norm": 0.34023466791164747, + "learning_rate": 3.617855336368235e-05, + "loss": 0.3885, + "step": 6130 + }, + { + "epoch": 1.0471110352479303, + "grad_norm": 0.398273235152284, + "learning_rate": 3.6162746585735966e-05, + "loss": 0.3781, + "step": 6135 + }, + { + "epoch": 1.0479644960314074, + "grad_norm": 0.3513130838940666, + "learning_rate": 3.614693980778958e-05, + "loss": 0.3908, + "step": 6140 + }, + { + "epoch": 1.0488179568148843, + "grad_norm": 0.28614314157111614, + "learning_rate": 3.61311330298432e-05, + "loss": 0.3478, + "step": 6145 + }, + { + "epoch": 1.0496714175983612, + "grad_norm": 0.34306163268100404, + "learning_rate": 3.611532625189681e-05, + "loss": 0.3856, + "step": 6150 + }, + { + "epoch": 1.0505248783818384, + "grad_norm": 0.36278534942222906, + "learning_rate": 3.6099519473950436e-05, + "loss": 0.4104, + "step": 6155 + }, + { + "epoch": 1.0513783391653153, + "grad_norm": 0.38037742648197687, + "learning_rate": 3.6083712696004046e-05, + "loss": 0.3489, + "step": 6160 + }, + { + "epoch": 1.0522317999487925, + "grad_norm": 0.37240475311150545, + "learning_rate": 3.6067905918057664e-05, + "loss": 0.3747, + "step": 6165 + }, + { + "epoch": 1.0530852607322694, + "grad_norm": 0.34139587197340454, + "learning_rate": 3.605209914011128e-05, + "loss": 0.3767, + "step": 6170 + }, + { + "epoch": 1.0539387215157463, + "grad_norm": 0.40863545347694546, + "learning_rate": 3.60362923621649e-05, + "loss": 0.3654, + "step": 6175 + }, + { + "epoch": 1.0547921822992234, + "grad_norm": 0.3753918782608841, + "learning_rate": 3.6020485584218516e-05, + "loss": 0.3605, + "step": 6180 + }, + { + "epoch": 1.0556456430827004, + "grad_norm": 0.32009725853280235, + "learning_rate": 3.600467880627213e-05, + "loss": 0.3655, + "step": 6185 + }, + { + "epoch": 1.0564991038661773, + "grad_norm": 0.5092431911324296, + "learning_rate": 3.598887202832575e-05, + "loss": 0.3881, + "step": 6190 + }, + { + "epoch": 1.0573525646496544, + "grad_norm": 0.43456819194948054, + "learning_rate": 3.597306525037936e-05, + "loss": 0.3669, + "step": 6195 + }, + { + "epoch": 1.0582060254331314, + "grad_norm": 0.3243068352215124, + "learning_rate": 3.595725847243298e-05, + "loss": 0.3541, + "step": 6200 + }, + { + "epoch": 1.0590594862166083, + "grad_norm": 0.5383990238343098, + "learning_rate": 3.59414516944866e-05, + "loss": 0.3623, + "step": 6205 + }, + { + "epoch": 1.0599129470000854, + "grad_norm": 0.3541289721102046, + "learning_rate": 3.5925644916540214e-05, + "loss": 0.4034, + "step": 6210 + }, + { + "epoch": 1.0607664077835623, + "grad_norm": 0.38223980100958427, + "learning_rate": 3.590983813859383e-05, + "loss": 0.3959, + "step": 6215 + }, + { + "epoch": 1.0616198685670393, + "grad_norm": 0.3215840779602491, + "learning_rate": 3.589403136064744e-05, + "loss": 0.3737, + "step": 6220 + }, + { + "epoch": 1.0624733293505164, + "grad_norm": 0.36266124166010544, + "learning_rate": 3.5878224582701067e-05, + "loss": 0.3575, + "step": 6225 + }, + { + "epoch": 1.0633267901339933, + "grad_norm": 0.3668600032359126, + "learning_rate": 3.5862417804754684e-05, + "loss": 0.3597, + "step": 6230 + }, + { + "epoch": 1.0641802509174703, + "grad_norm": 0.2960195006335881, + "learning_rate": 3.5846611026808295e-05, + "loss": 0.3921, + "step": 6235 + }, + { + "epoch": 1.0650337117009474, + "grad_norm": 0.31845474075202623, + "learning_rate": 3.583080424886192e-05, + "loss": 0.3911, + "step": 6240 + }, + { + "epoch": 1.0658871724844243, + "grad_norm": 0.3725948362521795, + "learning_rate": 3.581499747091553e-05, + "loss": 0.3888, + "step": 6245 + }, + { + "epoch": 1.0667406332679013, + "grad_norm": 0.34149111645486385, + "learning_rate": 3.579919069296915e-05, + "loss": 0.3674, + "step": 6250 + }, + { + "epoch": 1.0675940940513784, + "grad_norm": 0.4858579069975639, + "learning_rate": 3.5783383915022765e-05, + "loss": 0.4212, + "step": 6255 + }, + { + "epoch": 1.0684475548348553, + "grad_norm": 0.37017559385021276, + "learning_rate": 3.576757713707638e-05, + "loss": 0.3764, + "step": 6260 + }, + { + "epoch": 1.0693010156183322, + "grad_norm": 0.3501088365141956, + "learning_rate": 3.575177035913e-05, + "loss": 0.381, + "step": 6265 + }, + { + "epoch": 1.0701544764018094, + "grad_norm": 0.4536112262409545, + "learning_rate": 3.573596358118361e-05, + "loss": 0.3927, + "step": 6270 + }, + { + "epoch": 1.0710079371852863, + "grad_norm": 0.3581084690276881, + "learning_rate": 3.5720156803237234e-05, + "loss": 0.3531, + "step": 6275 + }, + { + "epoch": 1.0718613979687635, + "grad_norm": 0.35442213729757155, + "learning_rate": 3.5704350025290845e-05, + "loss": 0.3679, + "step": 6280 + }, + { + "epoch": 1.0727148587522404, + "grad_norm": 0.3300824599413051, + "learning_rate": 3.568854324734446e-05, + "loss": 0.3536, + "step": 6285 + }, + { + "epoch": 1.0735683195357173, + "grad_norm": 0.37005100248415596, + "learning_rate": 3.567273646939808e-05, + "loss": 0.3753, + "step": 6290 + }, + { + "epoch": 1.0744217803191942, + "grad_norm": 0.37767347033399445, + "learning_rate": 3.56569296914517e-05, + "loss": 0.3748, + "step": 6295 + }, + { + "epoch": 1.0752752411026714, + "grad_norm": 0.3549726169959677, + "learning_rate": 3.5641122913505315e-05, + "loss": 0.3704, + "step": 6300 + }, + { + "epoch": 1.0761287018861483, + "grad_norm": 0.42642399641544837, + "learning_rate": 3.5625316135558926e-05, + "loss": 0.3947, + "step": 6305 + }, + { + "epoch": 1.0769821626696254, + "grad_norm": 0.4184430610863713, + "learning_rate": 3.560950935761255e-05, + "loss": 0.4113, + "step": 6310 + }, + { + "epoch": 1.0778356234531024, + "grad_norm": 0.34839888808095437, + "learning_rate": 3.559370257966616e-05, + "loss": 0.3741, + "step": 6315 + }, + { + "epoch": 1.0786890842365793, + "grad_norm": 0.3541232583117419, + "learning_rate": 3.557789580171978e-05, + "loss": 0.3609, + "step": 6320 + }, + { + "epoch": 1.0795425450200564, + "grad_norm": 0.4127048987151015, + "learning_rate": 3.5562089023773395e-05, + "loss": 0.3681, + "step": 6325 + }, + { + "epoch": 1.0803960058035333, + "grad_norm": 0.3967872211761154, + "learning_rate": 3.554628224582701e-05, + "loss": 0.3852, + "step": 6330 + }, + { + "epoch": 1.0812494665870103, + "grad_norm": 0.40621764304381197, + "learning_rate": 3.553047546788063e-05, + "loss": 0.3631, + "step": 6335 + }, + { + "epoch": 1.0821029273704874, + "grad_norm": 0.3507900861501906, + "learning_rate": 3.551466868993424e-05, + "loss": 0.4227, + "step": 6340 + }, + { + "epoch": 1.0829563881539643, + "grad_norm": 0.34308174986128376, + "learning_rate": 3.5498861911987865e-05, + "loss": 0.3793, + "step": 6345 + }, + { + "epoch": 1.0838098489374413, + "grad_norm": 0.7806539078148241, + "learning_rate": 3.5483055134041476e-05, + "loss": 0.3627, + "step": 6350 + }, + { + "epoch": 1.0846633097209184, + "grad_norm": 0.40689762094410925, + "learning_rate": 3.546724835609509e-05, + "loss": 0.3799, + "step": 6355 + }, + { + "epoch": 1.0855167705043953, + "grad_norm": 0.3372893848996216, + "learning_rate": 3.545144157814871e-05, + "loss": 0.3706, + "step": 6360 + }, + { + "epoch": 1.0863702312878722, + "grad_norm": 0.5105115199964079, + "learning_rate": 3.543563480020233e-05, + "loss": 0.3517, + "step": 6365 + }, + { + "epoch": 1.0872236920713494, + "grad_norm": 0.49674907906461885, + "learning_rate": 3.5419828022255946e-05, + "loss": 0.3976, + "step": 6370 + }, + { + "epoch": 1.0880771528548263, + "grad_norm": 0.3616314250841704, + "learning_rate": 3.540402124430956e-05, + "loss": 0.3624, + "step": 6375 + }, + { + "epoch": 1.0889306136383032, + "grad_norm": 0.3261095293861316, + "learning_rate": 3.538821446636318e-05, + "loss": 0.4025, + "step": 6380 + }, + { + "epoch": 1.0897840744217804, + "grad_norm": 0.3995130514496934, + "learning_rate": 3.537240768841679e-05, + "loss": 0.3816, + "step": 6385 + }, + { + "epoch": 1.0906375352052573, + "grad_norm": 0.6368223818268357, + "learning_rate": 3.535660091047041e-05, + "loss": 0.3668, + "step": 6390 + }, + { + "epoch": 1.0914909959887342, + "grad_norm": 0.40057929508373474, + "learning_rate": 3.534079413252403e-05, + "loss": 0.3464, + "step": 6395 + }, + { + "epoch": 1.0923444567722114, + "grad_norm": 0.4494989616048801, + "learning_rate": 3.5324987354577644e-05, + "loss": 0.3896, + "step": 6400 + }, + { + "epoch": 1.0931979175556883, + "grad_norm": 0.3670399719103569, + "learning_rate": 3.530918057663126e-05, + "loss": 0.371, + "step": 6405 + }, + { + "epoch": 1.0940513783391652, + "grad_norm": 0.38767765917433505, + "learning_rate": 3.529337379868488e-05, + "loss": 0.355, + "step": 6410 + }, + { + "epoch": 1.0949048391226424, + "grad_norm": 0.3578300732267279, + "learning_rate": 3.5277567020738496e-05, + "loss": 0.3777, + "step": 6415 + }, + { + "epoch": 1.0957582999061193, + "grad_norm": 0.36797054114616534, + "learning_rate": 3.5261760242792114e-05, + "loss": 0.3748, + "step": 6420 + }, + { + "epoch": 1.0966117606895964, + "grad_norm": 0.3752746066914836, + "learning_rate": 3.5245953464845724e-05, + "loss": 0.3951, + "step": 6425 + }, + { + "epoch": 1.0974652214730733, + "grad_norm": 0.4992245034330738, + "learning_rate": 3.523014668689935e-05, + "loss": 0.3996, + "step": 6430 + }, + { + "epoch": 1.0983186822565503, + "grad_norm": 0.3802733826060877, + "learning_rate": 3.521433990895296e-05, + "loss": 0.3683, + "step": 6435 + }, + { + "epoch": 1.0991721430400272, + "grad_norm": 0.4479737303674627, + "learning_rate": 3.5198533131006577e-05, + "loss": 0.3849, + "step": 6440 + }, + { + "epoch": 1.1000256038235043, + "grad_norm": 0.42426668896227515, + "learning_rate": 3.5182726353060194e-05, + "loss": 0.3919, + "step": 6445 + }, + { + "epoch": 1.1008790646069813, + "grad_norm": 0.48513194649690067, + "learning_rate": 3.516691957511381e-05, + "loss": 0.3588, + "step": 6450 + }, + { + "epoch": 1.1017325253904584, + "grad_norm": 0.414365401738906, + "learning_rate": 3.515111279716743e-05, + "loss": 0.3514, + "step": 6455 + }, + { + "epoch": 1.1025859861739353, + "grad_norm": 0.3772732173172724, + "learning_rate": 3.513530601922104e-05, + "loss": 0.3982, + "step": 6460 + }, + { + "epoch": 1.1034394469574123, + "grad_norm": 0.5082277911704493, + "learning_rate": 3.5119499241274664e-05, + "loss": 0.3986, + "step": 6465 + }, + { + "epoch": 1.1042929077408894, + "grad_norm": 0.5876443202759482, + "learning_rate": 3.5103692463328275e-05, + "loss": 0.3659, + "step": 6470 + }, + { + "epoch": 1.1051463685243663, + "grad_norm": 0.3388469265049481, + "learning_rate": 3.508788568538189e-05, + "loss": 0.3699, + "step": 6475 + }, + { + "epoch": 1.1059998293078432, + "grad_norm": 0.4262580363531045, + "learning_rate": 3.507207890743551e-05, + "loss": 0.3553, + "step": 6480 + }, + { + "epoch": 1.1068532900913204, + "grad_norm": 0.4133261577814819, + "learning_rate": 3.505627212948913e-05, + "loss": 0.3917, + "step": 6485 + }, + { + "epoch": 1.1077067508747973, + "grad_norm": 0.29908511017730205, + "learning_rate": 3.5040465351542744e-05, + "loss": 0.3848, + "step": 6490 + }, + { + "epoch": 1.1085602116582742, + "grad_norm": 0.4519310135103528, + "learning_rate": 3.5024658573596355e-05, + "loss": 0.3669, + "step": 6495 + }, + { + "epoch": 1.1094136724417514, + "grad_norm": 0.36753690476682793, + "learning_rate": 3.500885179564998e-05, + "loss": 0.362, + "step": 6500 + }, + { + "epoch": 1.1102671332252283, + "grad_norm": 0.43419707034688587, + "learning_rate": 3.499304501770359e-05, + "loss": 0.3717, + "step": 6505 + }, + { + "epoch": 1.1111205940087052, + "grad_norm": 0.3362000973234563, + "learning_rate": 3.497723823975721e-05, + "loss": 0.3707, + "step": 6510 + }, + { + "epoch": 1.1119740547921824, + "grad_norm": 0.41338929812634956, + "learning_rate": 3.496143146181083e-05, + "loss": 0.3636, + "step": 6515 + }, + { + "epoch": 1.1128275155756593, + "grad_norm": 0.38546860525336446, + "learning_rate": 3.494562468386444e-05, + "loss": 0.3432, + "step": 6520 + }, + { + "epoch": 1.1136809763591362, + "grad_norm": 0.4837974451704121, + "learning_rate": 3.492981790591806e-05, + "loss": 0.3856, + "step": 6525 + }, + { + "epoch": 1.1145344371426134, + "grad_norm": 0.4099945195244458, + "learning_rate": 3.491401112797168e-05, + "loss": 0.3991, + "step": 6530 + }, + { + "epoch": 1.1153878979260903, + "grad_norm": 0.4090729970162646, + "learning_rate": 3.4898204350025295e-05, + "loss": 0.399, + "step": 6535 + }, + { + "epoch": 1.1162413587095672, + "grad_norm": 0.3293074613655451, + "learning_rate": 3.4882397572078905e-05, + "loss": 0.3773, + "step": 6540 + }, + { + "epoch": 1.1170948194930443, + "grad_norm": 0.31970254478771015, + "learning_rate": 3.486659079413252e-05, + "loss": 0.4063, + "step": 6545 + }, + { + "epoch": 1.1179482802765213, + "grad_norm": 0.3404686139395737, + "learning_rate": 3.485078401618615e-05, + "loss": 0.3783, + "step": 6550 + }, + { + "epoch": 1.1188017410599982, + "grad_norm": 0.37921888621672023, + "learning_rate": 3.483497723823976e-05, + "loss": 0.3732, + "step": 6555 + }, + { + "epoch": 1.1196552018434753, + "grad_norm": 0.3937102414061394, + "learning_rate": 3.4819170460293375e-05, + "loss": 0.3667, + "step": 6560 + }, + { + "epoch": 1.1205086626269523, + "grad_norm": 0.34761818920877413, + "learning_rate": 3.480336368234699e-05, + "loss": 0.3588, + "step": 6565 + }, + { + "epoch": 1.1213621234104294, + "grad_norm": 0.34424410517904946, + "learning_rate": 3.478755690440061e-05, + "loss": 0.3841, + "step": 6570 + }, + { + "epoch": 1.1222155841939063, + "grad_norm": 0.4214224963242259, + "learning_rate": 3.477175012645423e-05, + "loss": 0.3739, + "step": 6575 + }, + { + "epoch": 1.1230690449773832, + "grad_norm": 0.3865334872789258, + "learning_rate": 3.475594334850784e-05, + "loss": 0.3947, + "step": 6580 + }, + { + "epoch": 1.1239225057608604, + "grad_norm": 0.40708734545542935, + "learning_rate": 3.474013657056146e-05, + "loss": 0.3706, + "step": 6585 + }, + { + "epoch": 1.1247759665443373, + "grad_norm": 0.9045066835294332, + "learning_rate": 3.472432979261507e-05, + "loss": 0.3699, + "step": 6590 + }, + { + "epoch": 1.1256294273278142, + "grad_norm": 0.345613816124089, + "learning_rate": 3.470852301466869e-05, + "loss": 0.3627, + "step": 6595 + }, + { + "epoch": 1.1264828881112914, + "grad_norm": 0.4417786367821901, + "learning_rate": 3.469271623672231e-05, + "loss": 0.3952, + "step": 6600 + }, + { + "epoch": 1.1273363488947683, + "grad_norm": 0.3488869958861024, + "learning_rate": 3.4676909458775926e-05, + "loss": 0.383, + "step": 6605 + }, + { + "epoch": 1.1281898096782452, + "grad_norm": 0.32968002715251665, + "learning_rate": 3.466110268082954e-05, + "loss": 0.3885, + "step": 6610 + }, + { + "epoch": 1.1290432704617224, + "grad_norm": 0.32465319986861174, + "learning_rate": 3.4645295902883154e-05, + "loss": 0.3843, + "step": 6615 + }, + { + "epoch": 1.1298967312451993, + "grad_norm": 0.42478512459153034, + "learning_rate": 3.462948912493678e-05, + "loss": 0.3774, + "step": 6620 + }, + { + "epoch": 1.1307501920286762, + "grad_norm": 0.3691686696446473, + "learning_rate": 3.461368234699039e-05, + "loss": 0.3477, + "step": 6625 + }, + { + "epoch": 1.1316036528121534, + "grad_norm": 0.3294388018000737, + "learning_rate": 3.4597875569044006e-05, + "loss": 0.3521, + "step": 6630 + }, + { + "epoch": 1.1324571135956303, + "grad_norm": 0.5078573889585375, + "learning_rate": 3.4582068791097624e-05, + "loss": 0.331, + "step": 6635 + }, + { + "epoch": 1.1333105743791072, + "grad_norm": 0.3540787108069881, + "learning_rate": 3.456626201315124e-05, + "loss": 0.3548, + "step": 6640 + }, + { + "epoch": 1.1341640351625843, + "grad_norm": 0.425664971743762, + "learning_rate": 3.455045523520486e-05, + "loss": 0.3614, + "step": 6645 + }, + { + "epoch": 1.1350174959460613, + "grad_norm": 0.4519194755911954, + "learning_rate": 3.4534648457258476e-05, + "loss": 0.376, + "step": 6650 + }, + { + "epoch": 1.1358709567295382, + "grad_norm": 0.3792641841284432, + "learning_rate": 3.451884167931209e-05, + "loss": 0.3663, + "step": 6655 + }, + { + "epoch": 1.1367244175130153, + "grad_norm": 0.38336214452272527, + "learning_rate": 3.4503034901365704e-05, + "loss": 0.3729, + "step": 6660 + }, + { + "epoch": 1.1375778782964923, + "grad_norm": 0.44490882098941625, + "learning_rate": 3.448722812341932e-05, + "loss": 0.3898, + "step": 6665 + }, + { + "epoch": 1.1384313390799692, + "grad_norm": 0.4923210533563228, + "learning_rate": 3.4471421345472946e-05, + "loss": 0.3671, + "step": 6670 + }, + { + "epoch": 1.1392847998634463, + "grad_norm": 0.32576638199942004, + "learning_rate": 3.4455614567526556e-05, + "loss": 0.408, + "step": 6675 + }, + { + "epoch": 1.1401382606469233, + "grad_norm": 0.3698146388980988, + "learning_rate": 3.4439807789580174e-05, + "loss": 0.3656, + "step": 6680 + }, + { + "epoch": 1.1409917214304004, + "grad_norm": 0.4038930256287759, + "learning_rate": 3.442400101163379e-05, + "loss": 0.3809, + "step": 6685 + }, + { + "epoch": 1.1418451822138773, + "grad_norm": 0.3678224472834901, + "learning_rate": 3.440819423368741e-05, + "loss": 0.4072, + "step": 6690 + }, + { + "epoch": 1.1426986429973542, + "grad_norm": 0.37674876188896234, + "learning_rate": 3.439238745574102e-05, + "loss": 0.3597, + "step": 6695 + }, + { + "epoch": 1.1435521037808312, + "grad_norm": 0.36763677912434917, + "learning_rate": 3.437658067779464e-05, + "loss": 0.3937, + "step": 6700 + }, + { + "epoch": 1.1444055645643083, + "grad_norm": 0.34982750551205294, + "learning_rate": 3.436077389984826e-05, + "loss": 0.3937, + "step": 6705 + }, + { + "epoch": 1.1452590253477852, + "grad_norm": 0.769092786227432, + "learning_rate": 3.434496712190187e-05, + "loss": 0.3773, + "step": 6710 + }, + { + "epoch": 1.1461124861312624, + "grad_norm": 0.3615013250403249, + "learning_rate": 3.432916034395549e-05, + "loss": 0.3776, + "step": 6715 + }, + { + "epoch": 1.1469659469147393, + "grad_norm": 0.4250703191148808, + "learning_rate": 3.431335356600911e-05, + "loss": 0.3793, + "step": 6720 + }, + { + "epoch": 1.1478194076982162, + "grad_norm": 0.375871467769371, + "learning_rate": 3.4297546788062724e-05, + "loss": 0.3661, + "step": 6725 + }, + { + "epoch": 1.1486728684816931, + "grad_norm": 0.3965560488665287, + "learning_rate": 3.428174001011634e-05, + "loss": 0.3926, + "step": 6730 + }, + { + "epoch": 1.1495263292651703, + "grad_norm": 0.33386379563780616, + "learning_rate": 3.426593323216995e-05, + "loss": 0.3415, + "step": 6735 + }, + { + "epoch": 1.1503797900486472, + "grad_norm": 0.49744433445623903, + "learning_rate": 3.4250126454223577e-05, + "loss": 0.3984, + "step": 6740 + }, + { + "epoch": 1.1512332508321244, + "grad_norm": 0.35276686175575156, + "learning_rate": 3.423431967627719e-05, + "loss": 0.3796, + "step": 6745 + }, + { + "epoch": 1.1520867116156013, + "grad_norm": 0.3782105257502626, + "learning_rate": 3.4218512898330805e-05, + "loss": 0.3939, + "step": 6750 + }, + { + "epoch": 1.1529401723990782, + "grad_norm": 0.374200039079983, + "learning_rate": 3.420270612038442e-05, + "loss": 0.396, + "step": 6755 + }, + { + "epoch": 1.1537936331825553, + "grad_norm": 0.3836710306563444, + "learning_rate": 3.418689934243804e-05, + "loss": 0.3562, + "step": 6760 + }, + { + "epoch": 1.1546470939660323, + "grad_norm": 0.42364950051694567, + "learning_rate": 3.417109256449166e-05, + "loss": 0.3864, + "step": 6765 + }, + { + "epoch": 1.1555005547495092, + "grad_norm": 0.38861266994602445, + "learning_rate": 3.4155285786545275e-05, + "loss": 0.3844, + "step": 6770 + }, + { + "epoch": 1.1563540155329863, + "grad_norm": 0.6018078136039005, + "learning_rate": 3.413947900859889e-05, + "loss": 0.3489, + "step": 6775 + }, + { + "epoch": 1.1572074763164633, + "grad_norm": 0.5972874263639776, + "learning_rate": 3.41236722306525e-05, + "loss": 0.3945, + "step": 6780 + }, + { + "epoch": 1.1580609370999402, + "grad_norm": 0.4093857505193346, + "learning_rate": 3.410786545270612e-05, + "loss": 0.3791, + "step": 6785 + }, + { + "epoch": 1.1589143978834173, + "grad_norm": 0.42600590826537443, + "learning_rate": 3.409205867475974e-05, + "loss": 0.376, + "step": 6790 + }, + { + "epoch": 1.1597678586668942, + "grad_norm": 0.4065411805048917, + "learning_rate": 3.4076251896813355e-05, + "loss": 0.384, + "step": 6795 + }, + { + "epoch": 1.1606213194503712, + "grad_norm": 0.4244702012793157, + "learning_rate": 3.406044511886697e-05, + "loss": 0.3883, + "step": 6800 + }, + { + "epoch": 1.1614747802338483, + "grad_norm": 0.374917508843201, + "learning_rate": 3.404463834092059e-05, + "loss": 0.3931, + "step": 6805 + }, + { + "epoch": 1.1623282410173252, + "grad_norm": 0.5211371296519884, + "learning_rate": 3.402883156297421e-05, + "loss": 0.3828, + "step": 6810 + }, + { + "epoch": 1.1631817018008022, + "grad_norm": 0.7561755882811956, + "learning_rate": 3.401302478502782e-05, + "loss": 0.356, + "step": 6815 + }, + { + "epoch": 1.1640351625842793, + "grad_norm": 0.6436005006819279, + "learning_rate": 3.3997218007081436e-05, + "loss": 0.3848, + "step": 6820 + }, + { + "epoch": 1.1648886233677562, + "grad_norm": 0.42468254817266504, + "learning_rate": 3.398141122913506e-05, + "loss": 0.3802, + "step": 6825 + }, + { + "epoch": 1.1657420841512334, + "grad_norm": 0.3982367416624562, + "learning_rate": 3.396560445118867e-05, + "loss": 0.399, + "step": 6830 + }, + { + "epoch": 1.1665955449347103, + "grad_norm": 0.5806335612905161, + "learning_rate": 3.394979767324229e-05, + "loss": 0.379, + "step": 6835 + }, + { + "epoch": 1.1674490057181872, + "grad_norm": 0.6539048969095816, + "learning_rate": 3.3933990895295905e-05, + "loss": 0.3814, + "step": 6840 + }, + { + "epoch": 1.1683024665016641, + "grad_norm": 0.45131138747036637, + "learning_rate": 3.391818411734952e-05, + "loss": 0.3901, + "step": 6845 + }, + { + "epoch": 1.1691559272851413, + "grad_norm": 0.4857094349513415, + "learning_rate": 3.3902377339403134e-05, + "loss": 0.3933, + "step": 6850 + }, + { + "epoch": 1.1700093880686182, + "grad_norm": 0.3439760439525917, + "learning_rate": 3.388657056145675e-05, + "loss": 0.3781, + "step": 6855 + }, + { + "epoch": 1.1708628488520954, + "grad_norm": 5.327471762596987, + "learning_rate": 3.3870763783510375e-05, + "loss": 0.3671, + "step": 6860 + }, + { + "epoch": 1.1717163096355723, + "grad_norm": 0.37046626612176936, + "learning_rate": 3.3854957005563986e-05, + "loss": 0.3977, + "step": 6865 + }, + { + "epoch": 1.1725697704190492, + "grad_norm": 0.4327085457178763, + "learning_rate": 3.3839150227617603e-05, + "loss": 0.3748, + "step": 6870 + }, + { + "epoch": 1.1734232312025263, + "grad_norm": 0.333034919237716, + "learning_rate": 3.382334344967122e-05, + "loss": 0.3933, + "step": 6875 + }, + { + "epoch": 1.1742766919860033, + "grad_norm": 0.35345434918838137, + "learning_rate": 3.380753667172484e-05, + "loss": 0.4103, + "step": 6880 + }, + { + "epoch": 1.1751301527694802, + "grad_norm": 0.3421462474321589, + "learning_rate": 3.3791729893778456e-05, + "loss": 0.3832, + "step": 6885 + }, + { + "epoch": 1.1759836135529573, + "grad_norm": 0.3561427234439529, + "learning_rate": 3.377592311583207e-05, + "loss": 0.3839, + "step": 6890 + }, + { + "epoch": 1.1768370743364343, + "grad_norm": 0.3531319457485075, + "learning_rate": 3.376011633788569e-05, + "loss": 0.3836, + "step": 6895 + }, + { + "epoch": 1.1776905351199112, + "grad_norm": 0.8047701928164278, + "learning_rate": 3.37443095599393e-05, + "loss": 0.3878, + "step": 6900 + }, + { + "epoch": 1.1785439959033883, + "grad_norm": 0.33725477031687834, + "learning_rate": 3.372850278199292e-05, + "loss": 0.3529, + "step": 6905 + }, + { + "epoch": 1.1793974566868652, + "grad_norm": 0.4488080219229851, + "learning_rate": 3.3712696004046536e-05, + "loss": 0.3735, + "step": 6910 + }, + { + "epoch": 1.1802509174703422, + "grad_norm": 0.4004718951161043, + "learning_rate": 3.3696889226100154e-05, + "loss": 0.3768, + "step": 6915 + }, + { + "epoch": 1.1811043782538193, + "grad_norm": 0.3308681548917742, + "learning_rate": 3.368108244815377e-05, + "loss": 0.3966, + "step": 6920 + }, + { + "epoch": 1.1819578390372962, + "grad_norm": 0.41582127810301883, + "learning_rate": 3.366527567020739e-05, + "loss": 0.3705, + "step": 6925 + }, + { + "epoch": 1.1828112998207732, + "grad_norm": 0.4892347947895599, + "learning_rate": 3.3649468892261006e-05, + "loss": 0.3847, + "step": 6930 + }, + { + "epoch": 1.1836647606042503, + "grad_norm": 0.49948370788401486, + "learning_rate": 3.363366211431462e-05, + "loss": 0.3879, + "step": 6935 + }, + { + "epoch": 1.1845182213877272, + "grad_norm": 0.5170019696339748, + "learning_rate": 3.3617855336368234e-05, + "loss": 0.378, + "step": 6940 + }, + { + "epoch": 1.1853716821712041, + "grad_norm": 0.7060410855161342, + "learning_rate": 3.360204855842185e-05, + "loss": 0.3666, + "step": 6945 + }, + { + "epoch": 1.1862251429546813, + "grad_norm": 0.42389425542138154, + "learning_rate": 3.358624178047547e-05, + "loss": 0.3649, + "step": 6950 + }, + { + "epoch": 1.1870786037381582, + "grad_norm": 0.36140348377650977, + "learning_rate": 3.357043500252909e-05, + "loss": 0.3897, + "step": 6955 + }, + { + "epoch": 1.1879320645216351, + "grad_norm": 0.565928565250766, + "learning_rate": 3.3554628224582704e-05, + "loss": 0.3707, + "step": 6960 + }, + { + "epoch": 1.1887855253051123, + "grad_norm": 0.458982871858116, + "learning_rate": 3.353882144663632e-05, + "loss": 0.4007, + "step": 6965 + }, + { + "epoch": 1.1896389860885892, + "grad_norm": 0.43602823762012966, + "learning_rate": 3.352301466868993e-05, + "loss": 0.3722, + "step": 6970 + }, + { + "epoch": 1.1904924468720663, + "grad_norm": 0.5078272538835501, + "learning_rate": 3.350720789074355e-05, + "loss": 0.4005, + "step": 6975 + }, + { + "epoch": 1.1913459076555433, + "grad_norm": 0.47470167814658665, + "learning_rate": 3.3491401112797174e-05, + "loss": 0.3815, + "step": 6980 + }, + { + "epoch": 1.1921993684390202, + "grad_norm": 0.496588669310637, + "learning_rate": 3.3475594334850785e-05, + "loss": 0.3485, + "step": 6985 + }, + { + "epoch": 1.1930528292224971, + "grad_norm": 0.4339679366968848, + "learning_rate": 3.34597875569044e-05, + "loss": 0.3881, + "step": 6990 + }, + { + "epoch": 1.1939062900059743, + "grad_norm": 0.8520920328281085, + "learning_rate": 3.344398077895802e-05, + "loss": 0.3791, + "step": 6995 + }, + { + "epoch": 1.1947597507894512, + "grad_norm": 0.39706879572408627, + "learning_rate": 3.342817400101164e-05, + "loss": 0.3711, + "step": 7000 + }, + { + "epoch": 1.1956132115729283, + "grad_norm": 0.4629972024434518, + "learning_rate": 3.341236722306525e-05, + "loss": 0.3521, + "step": 7005 + }, + { + "epoch": 1.1964666723564052, + "grad_norm": 0.4278575426029068, + "learning_rate": 3.339656044511887e-05, + "loss": 0.4015, + "step": 7010 + }, + { + "epoch": 1.1973201331398822, + "grad_norm": 1.5156192916688398, + "learning_rate": 3.338075366717249e-05, + "loss": 0.4231, + "step": 7015 + }, + { + "epoch": 1.1981735939233593, + "grad_norm": 0.5013914924256465, + "learning_rate": 3.33649468892261e-05, + "loss": 0.3706, + "step": 7020 + }, + { + "epoch": 1.1990270547068362, + "grad_norm": 0.7785649440633896, + "learning_rate": 3.334914011127972e-05, + "loss": 0.3604, + "step": 7025 + }, + { + "epoch": 1.1998805154903132, + "grad_norm": 0.408375287115701, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.3798, + "step": 7030 + }, + { + "epoch": 1.2007339762737903, + "grad_norm": 0.6229062930638491, + "learning_rate": 3.331752655538695e-05, + "loss": 0.397, + "step": 7035 + }, + { + "epoch": 1.2015874370572672, + "grad_norm": 1.852308593695395, + "learning_rate": 3.330171977744056e-05, + "loss": 0.3776, + "step": 7040 + }, + { + "epoch": 1.2024408978407441, + "grad_norm": 0.41876048398721233, + "learning_rate": 3.328591299949419e-05, + "loss": 0.3771, + "step": 7045 + }, + { + "epoch": 1.2032943586242213, + "grad_norm": 0.43459573619031744, + "learning_rate": 3.3270106221547805e-05, + "loss": 0.3791, + "step": 7050 + }, + { + "epoch": 1.2041478194076982, + "grad_norm": 0.3696811553353602, + "learning_rate": 3.3254299443601415e-05, + "loss": 0.379, + "step": 7055 + }, + { + "epoch": 1.2050012801911751, + "grad_norm": 0.38109035981244466, + "learning_rate": 3.323849266565503e-05, + "loss": 0.3669, + "step": 7060 + }, + { + "epoch": 1.2058547409746523, + "grad_norm": 0.4723170910022811, + "learning_rate": 3.322268588770865e-05, + "loss": 0.3558, + "step": 7065 + }, + { + "epoch": 1.2067082017581292, + "grad_norm": 0.4897771860746794, + "learning_rate": 3.320687910976227e-05, + "loss": 0.3833, + "step": 7070 + }, + { + "epoch": 1.2075616625416061, + "grad_norm": 0.3298927449050978, + "learning_rate": 3.3191072331815885e-05, + "loss": 0.3643, + "step": 7075 + }, + { + "epoch": 1.2084151233250833, + "grad_norm": 0.4285871454607902, + "learning_rate": 3.31752655538695e-05, + "loss": 0.3762, + "step": 7080 + }, + { + "epoch": 1.2092685841085602, + "grad_norm": 0.6603932639317657, + "learning_rate": 3.315945877592312e-05, + "loss": 0.4047, + "step": 7085 + }, + { + "epoch": 1.2101220448920371, + "grad_norm": 0.4502007277418514, + "learning_rate": 3.314365199797673e-05, + "loss": 0.3846, + "step": 7090 + }, + { + "epoch": 1.2109755056755143, + "grad_norm": 0.4390536913234136, + "learning_rate": 3.312784522003035e-05, + "loss": 0.3868, + "step": 7095 + }, + { + "epoch": 1.2118289664589912, + "grad_norm": 1.4489440576914383, + "learning_rate": 3.3112038442083966e-05, + "loss": 0.3641, + "step": 7100 + }, + { + "epoch": 1.212682427242468, + "grad_norm": 0.809919753108125, + "learning_rate": 3.309623166413758e-05, + "loss": 0.4055, + "step": 7105 + }, + { + "epoch": 1.2135358880259453, + "grad_norm": 0.8926732603170776, + "learning_rate": 3.30804248861912e-05, + "loss": 0.386, + "step": 7110 + }, + { + "epoch": 1.2143893488094222, + "grad_norm": 0.665918402333069, + "learning_rate": 3.306461810824482e-05, + "loss": 0.3566, + "step": 7115 + }, + { + "epoch": 1.2152428095928993, + "grad_norm": 0.9914253719657289, + "learning_rate": 3.3048811330298436e-05, + "loss": 0.3832, + "step": 7120 + }, + { + "epoch": 1.2160962703763762, + "grad_norm": 0.44527666815736894, + "learning_rate": 3.3033004552352046e-05, + "loss": 0.3981, + "step": 7125 + }, + { + "epoch": 1.2169497311598532, + "grad_norm": 0.62674383923586, + "learning_rate": 3.3017197774405664e-05, + "loss": 0.3838, + "step": 7130 + }, + { + "epoch": 1.21780319194333, + "grad_norm": 0.5277348895958146, + "learning_rate": 3.300139099645929e-05, + "loss": 0.3705, + "step": 7135 + }, + { + "epoch": 1.2186566527268072, + "grad_norm": 0.35825319195406957, + "learning_rate": 3.29855842185129e-05, + "loss": 0.402, + "step": 7140 + }, + { + "epoch": 1.2195101135102842, + "grad_norm": 0.48097154642704415, + "learning_rate": 3.2969777440566516e-05, + "loss": 0.3589, + "step": 7145 + }, + { + "epoch": 1.2203635742937613, + "grad_norm": 0.4636097364299617, + "learning_rate": 3.2953970662620134e-05, + "loss": 0.3606, + "step": 7150 + }, + { + "epoch": 1.2212170350772382, + "grad_norm": 0.5011562959849015, + "learning_rate": 3.293816388467375e-05, + "loss": 0.3955, + "step": 7155 + }, + { + "epoch": 1.2220704958607151, + "grad_norm": 0.46403897845965236, + "learning_rate": 3.292235710672736e-05, + "loss": 0.3897, + "step": 7160 + }, + { + "epoch": 1.2229239566441923, + "grad_norm": 0.4948350216460918, + "learning_rate": 3.2906550328780986e-05, + "loss": 0.4061, + "step": 7165 + }, + { + "epoch": 1.2237774174276692, + "grad_norm": 0.3015142861226733, + "learning_rate": 3.2890743550834603e-05, + "loss": 0.3889, + "step": 7170 + }, + { + "epoch": 1.2246308782111461, + "grad_norm": 0.31718569415925846, + "learning_rate": 3.2874936772888214e-05, + "loss": 0.3646, + "step": 7175 + }, + { + "epoch": 1.2254843389946233, + "grad_norm": 0.505590576371758, + "learning_rate": 3.285912999494183e-05, + "loss": 0.3714, + "step": 7180 + }, + { + "epoch": 1.2263377997781002, + "grad_norm": 0.3377072659297116, + "learning_rate": 3.284332321699545e-05, + "loss": 0.366, + "step": 7185 + }, + { + "epoch": 1.2271912605615771, + "grad_norm": 0.34519374819683335, + "learning_rate": 3.2827516439049066e-05, + "loss": 0.3797, + "step": 7190 + }, + { + "epoch": 1.2280447213450543, + "grad_norm": 0.5307861769223624, + "learning_rate": 3.281170966110268e-05, + "loss": 0.3635, + "step": 7195 + }, + { + "epoch": 1.2288981821285312, + "grad_norm": 0.3803821448197012, + "learning_rate": 3.27959028831563e-05, + "loss": 0.3877, + "step": 7200 + }, + { + "epoch": 1.2297516429120081, + "grad_norm": 0.38378582601697386, + "learning_rate": 3.278009610520992e-05, + "loss": 0.3716, + "step": 7205 + }, + { + "epoch": 1.2306051036954853, + "grad_norm": 0.3722126409807913, + "learning_rate": 3.276428932726353e-05, + "loss": 0.3494, + "step": 7210 + }, + { + "epoch": 1.2314585644789622, + "grad_norm": 0.4701650733043367, + "learning_rate": 3.274848254931715e-05, + "loss": 0.3735, + "step": 7215 + }, + { + "epoch": 1.232312025262439, + "grad_norm": 0.360950752183332, + "learning_rate": 3.2732675771370764e-05, + "loss": 0.3798, + "step": 7220 + }, + { + "epoch": 1.2331654860459162, + "grad_norm": 0.38238358550781987, + "learning_rate": 3.271686899342438e-05, + "loss": 0.4007, + "step": 7225 + }, + { + "epoch": 1.2340189468293932, + "grad_norm": 0.3832560631518119, + "learning_rate": 3.2701062215478e-05, + "loss": 0.3862, + "step": 7230 + }, + { + "epoch": 1.23487240761287, + "grad_norm": 0.5471877311072296, + "learning_rate": 3.268525543753162e-05, + "loss": 0.3706, + "step": 7235 + }, + { + "epoch": 1.2357258683963472, + "grad_norm": 0.3265459517779948, + "learning_rate": 3.2669448659585234e-05, + "loss": 0.3964, + "step": 7240 + }, + { + "epoch": 1.2365793291798242, + "grad_norm": 0.43050611655984045, + "learning_rate": 3.2653641881638845e-05, + "loss": 0.3713, + "step": 7245 + }, + { + "epoch": 1.237432789963301, + "grad_norm": 0.3151548630949105, + "learning_rate": 3.263783510369246e-05, + "loss": 0.3584, + "step": 7250 + }, + { + "epoch": 1.2382862507467782, + "grad_norm": 0.4371362314804908, + "learning_rate": 3.262202832574608e-05, + "loss": 0.3903, + "step": 7255 + }, + { + "epoch": 1.2391397115302552, + "grad_norm": 0.358335905265931, + "learning_rate": 3.26062215477997e-05, + "loss": 0.3847, + "step": 7260 + }, + { + "epoch": 1.2399931723137323, + "grad_norm": 0.3764231398756448, + "learning_rate": 3.2590414769853315e-05, + "loss": 0.3907, + "step": 7265 + }, + { + "epoch": 1.2408466330972092, + "grad_norm": 0.3794729091152979, + "learning_rate": 3.257460799190693e-05, + "loss": 0.3689, + "step": 7270 + }, + { + "epoch": 1.2417000938806861, + "grad_norm": 0.3329388410145465, + "learning_rate": 3.255880121396055e-05, + "loss": 0.3619, + "step": 7275 + }, + { + "epoch": 1.242553554664163, + "grad_norm": 0.391398848493733, + "learning_rate": 3.254299443601416e-05, + "loss": 0.3711, + "step": 7280 + }, + { + "epoch": 1.2434070154476402, + "grad_norm": 0.4583419085661663, + "learning_rate": 3.2527187658067785e-05, + "loss": 0.3812, + "step": 7285 + }, + { + "epoch": 1.2442604762311171, + "grad_norm": 0.6730001261007689, + "learning_rate": 3.25113808801214e-05, + "loss": 0.384, + "step": 7290 + }, + { + "epoch": 1.2451139370145943, + "grad_norm": 0.40309795667742904, + "learning_rate": 3.249557410217501e-05, + "loss": 0.3574, + "step": 7295 + }, + { + "epoch": 1.2459673977980712, + "grad_norm": 0.3682695719098797, + "learning_rate": 3.247976732422863e-05, + "loss": 0.3632, + "step": 7300 + }, + { + "epoch": 1.2468208585815481, + "grad_norm": 0.42014217847259433, + "learning_rate": 3.246396054628225e-05, + "loss": 0.3517, + "step": 7305 + }, + { + "epoch": 1.2476743193650253, + "grad_norm": 0.40438319288989455, + "learning_rate": 3.2448153768335865e-05, + "loss": 0.3863, + "step": 7310 + }, + { + "epoch": 1.2485277801485022, + "grad_norm": 0.4019821499230842, + "learning_rate": 3.2432346990389476e-05, + "loss": 0.3457, + "step": 7315 + }, + { + "epoch": 1.249381240931979, + "grad_norm": 0.3613372896117948, + "learning_rate": 3.24165402124431e-05, + "loss": 0.3433, + "step": 7320 + }, + { + "epoch": 1.2502347017154563, + "grad_norm": 0.3026609174834825, + "learning_rate": 3.240073343449672e-05, + "loss": 0.3884, + "step": 7325 + }, + { + "epoch": 1.2510881624989332, + "grad_norm": 0.3138234605619824, + "learning_rate": 3.238492665655033e-05, + "loss": 0.3431, + "step": 7330 + }, + { + "epoch": 1.25194162328241, + "grad_norm": 0.30825661611686755, + "learning_rate": 3.2369119878603946e-05, + "loss": 0.3995, + "step": 7335 + }, + { + "epoch": 1.2527950840658872, + "grad_norm": 0.35993177196568976, + "learning_rate": 3.235331310065756e-05, + "loss": 0.3853, + "step": 7340 + }, + { + "epoch": 1.2536485448493642, + "grad_norm": 0.36522745740979284, + "learning_rate": 3.233750632271118e-05, + "loss": 0.3562, + "step": 7345 + }, + { + "epoch": 1.254502005632841, + "grad_norm": 0.45001752961029345, + "learning_rate": 3.232169954476479e-05, + "loss": 0.3431, + "step": 7350 + }, + { + "epoch": 1.2553554664163182, + "grad_norm": 0.3788022310918792, + "learning_rate": 3.2305892766818415e-05, + "loss": 0.354, + "step": 7355 + }, + { + "epoch": 1.2562089271997952, + "grad_norm": 0.36572687727000264, + "learning_rate": 3.229008598887203e-05, + "loss": 0.3784, + "step": 7360 + }, + { + "epoch": 1.257062387983272, + "grad_norm": 0.3401462065540811, + "learning_rate": 3.2274279210925644e-05, + "loss": 0.3618, + "step": 7365 + }, + { + "epoch": 1.2579158487667492, + "grad_norm": 0.3118211840094879, + "learning_rate": 3.225847243297926e-05, + "loss": 0.3703, + "step": 7370 + }, + { + "epoch": 1.2587693095502261, + "grad_norm": 0.44163497557223913, + "learning_rate": 3.224266565503288e-05, + "loss": 0.3798, + "step": 7375 + }, + { + "epoch": 1.2596227703337033, + "grad_norm": 0.48378884941491146, + "learning_rate": 3.2226858877086496e-05, + "loss": 0.3822, + "step": 7380 + }, + { + "epoch": 1.2604762311171802, + "grad_norm": 0.33225096808340443, + "learning_rate": 3.2211052099140113e-05, + "loss": 0.3956, + "step": 7385 + }, + { + "epoch": 1.2613296919006571, + "grad_norm": 0.39311029190076513, + "learning_rate": 3.219524532119373e-05, + "loss": 0.3842, + "step": 7390 + }, + { + "epoch": 1.262183152684134, + "grad_norm": 0.36608142885222406, + "learning_rate": 3.217943854324735e-05, + "loss": 0.4025, + "step": 7395 + }, + { + "epoch": 1.2630366134676112, + "grad_norm": 0.45751117064550084, + "learning_rate": 3.216363176530096e-05, + "loss": 0.3525, + "step": 7400 + }, + { + "epoch": 1.2638900742510881, + "grad_norm": 2.3313256094843053, + "learning_rate": 3.214782498735458e-05, + "loss": 0.3887, + "step": 7405 + }, + { + "epoch": 1.2647435350345653, + "grad_norm": 0.47541991430934355, + "learning_rate": 3.2132018209408194e-05, + "loss": 0.3965, + "step": 7410 + }, + { + "epoch": 1.2655969958180422, + "grad_norm": 0.5849483567906478, + "learning_rate": 3.211621143146181e-05, + "loss": 0.3697, + "step": 7415 + }, + { + "epoch": 1.2664504566015191, + "grad_norm": 0.5057974177473267, + "learning_rate": 3.210040465351543e-05, + "loss": 0.3925, + "step": 7420 + }, + { + "epoch": 1.267303917384996, + "grad_norm": 0.4481431566018363, + "learning_rate": 3.2084597875569046e-05, + "loss": 0.3666, + "step": 7425 + }, + { + "epoch": 1.2681573781684732, + "grad_norm": 0.376348907461285, + "learning_rate": 3.2068791097622664e-05, + "loss": 0.4056, + "step": 7430 + }, + { + "epoch": 1.26901083895195, + "grad_norm": 0.6364161503465311, + "learning_rate": 3.2052984319676274e-05, + "loss": 0.372, + "step": 7435 + }, + { + "epoch": 1.2698642997354272, + "grad_norm": 0.3804417117420967, + "learning_rate": 3.20371775417299e-05, + "loss": 0.3755, + "step": 7440 + }, + { + "epoch": 1.2707177605189042, + "grad_norm": 0.36191706587531747, + "learning_rate": 3.202137076378351e-05, + "loss": 0.3541, + "step": 7445 + }, + { + "epoch": 1.271571221302381, + "grad_norm": 0.5558036802892538, + "learning_rate": 3.200556398583713e-05, + "loss": 0.3661, + "step": 7450 + }, + { + "epoch": 1.272424682085858, + "grad_norm": 0.382324393894525, + "learning_rate": 3.1989757207890744e-05, + "loss": 0.3626, + "step": 7455 + }, + { + "epoch": 1.2732781428693352, + "grad_norm": 0.3563296482156405, + "learning_rate": 3.197395042994436e-05, + "loss": 0.3669, + "step": 7460 + }, + { + "epoch": 1.274131603652812, + "grad_norm": 0.4647270589907194, + "learning_rate": 3.195814365199798e-05, + "loss": 0.3901, + "step": 7465 + }, + { + "epoch": 1.2749850644362892, + "grad_norm": 0.44178128241828774, + "learning_rate": 3.194233687405159e-05, + "loss": 0.374, + "step": 7470 + }, + { + "epoch": 1.2758385252197662, + "grad_norm": 0.4185597275471627, + "learning_rate": 3.1926530096105214e-05, + "loss": 0.3818, + "step": 7475 + }, + { + "epoch": 1.276691986003243, + "grad_norm": 0.3937336884055784, + "learning_rate": 3.191072331815883e-05, + "loss": 0.383, + "step": 7480 + }, + { + "epoch": 1.2775454467867202, + "grad_norm": 0.39788730239138864, + "learning_rate": 3.189491654021244e-05, + "loss": 0.4014, + "step": 7485 + }, + { + "epoch": 1.2783989075701971, + "grad_norm": 0.34102604629935046, + "learning_rate": 3.187910976226606e-05, + "loss": 0.3661, + "step": 7490 + }, + { + "epoch": 1.2792523683536743, + "grad_norm": 0.32253182333469643, + "learning_rate": 3.186330298431968e-05, + "loss": 0.3918, + "step": 7495 + }, + { + "epoch": 1.2801058291371512, + "grad_norm": 0.5846100005497011, + "learning_rate": 3.1847496206373295e-05, + "loss": 0.3791, + "step": 7500 + }, + { + "epoch": 1.2809592899206281, + "grad_norm": 0.434655248897244, + "learning_rate": 3.1831689428426905e-05, + "loss": 0.367, + "step": 7505 + }, + { + "epoch": 1.281812750704105, + "grad_norm": 0.3518144063630111, + "learning_rate": 3.181588265048053e-05, + "loss": 0.3685, + "step": 7510 + }, + { + "epoch": 1.2826662114875822, + "grad_norm": 0.44386299533464313, + "learning_rate": 3.180007587253415e-05, + "loss": 0.3712, + "step": 7515 + }, + { + "epoch": 1.2835196722710591, + "grad_norm": 0.5085192195133006, + "learning_rate": 3.178426909458776e-05, + "loss": 0.3672, + "step": 7520 + }, + { + "epoch": 1.2843731330545363, + "grad_norm": 0.4561094288931108, + "learning_rate": 3.176846231664138e-05, + "loss": 0.3886, + "step": 7525 + }, + { + "epoch": 1.2852265938380132, + "grad_norm": 0.3477676921430173, + "learning_rate": 3.175265553869499e-05, + "loss": 0.3848, + "step": 7530 + }, + { + "epoch": 1.28608005462149, + "grad_norm": 0.5728733842271062, + "learning_rate": 3.173684876074861e-05, + "loss": 0.3666, + "step": 7535 + }, + { + "epoch": 1.286933515404967, + "grad_norm": 0.40073643354033833, + "learning_rate": 3.172104198280223e-05, + "loss": 0.3537, + "step": 7540 + }, + { + "epoch": 1.2877869761884442, + "grad_norm": 0.6490800328728087, + "learning_rate": 3.1705235204855845e-05, + "loss": 0.3622, + "step": 7545 + }, + { + "epoch": 1.288640436971921, + "grad_norm": 0.3555309507557241, + "learning_rate": 3.168942842690946e-05, + "loss": 0.4001, + "step": 7550 + }, + { + "epoch": 1.2894938977553982, + "grad_norm": 0.41740412935467214, + "learning_rate": 3.167362164896307e-05, + "loss": 0.3628, + "step": 7555 + }, + { + "epoch": 1.2903473585388752, + "grad_norm": 0.39299843024898157, + "learning_rate": 3.16578148710167e-05, + "loss": 0.3696, + "step": 7560 + }, + { + "epoch": 1.291200819322352, + "grad_norm": 0.34495576011293283, + "learning_rate": 3.164200809307031e-05, + "loss": 0.3658, + "step": 7565 + }, + { + "epoch": 1.292054280105829, + "grad_norm": 0.3684441928525252, + "learning_rate": 3.1626201315123925e-05, + "loss": 0.3639, + "step": 7570 + }, + { + "epoch": 1.2929077408893062, + "grad_norm": 0.45395816645691744, + "learning_rate": 3.161039453717754e-05, + "loss": 0.3933, + "step": 7575 + }, + { + "epoch": 1.293761201672783, + "grad_norm": 0.38866887520272303, + "learning_rate": 3.159458775923116e-05, + "loss": 0.4058, + "step": 7580 + }, + { + "epoch": 1.2946146624562602, + "grad_norm": 0.43712059420894067, + "learning_rate": 3.157878098128478e-05, + "loss": 0.3581, + "step": 7585 + }, + { + "epoch": 1.2954681232397371, + "grad_norm": 0.3760028727190519, + "learning_rate": 3.156297420333839e-05, + "loss": 0.3825, + "step": 7590 + }, + { + "epoch": 1.296321584023214, + "grad_norm": 0.3670961920825581, + "learning_rate": 3.154716742539201e-05, + "loss": 0.3859, + "step": 7595 + }, + { + "epoch": 1.2971750448066912, + "grad_norm": 0.2919355002339907, + "learning_rate": 3.1531360647445623e-05, + "loss": 0.3808, + "step": 7600 + }, + { + "epoch": 1.2980285055901681, + "grad_norm": 0.4068715415938785, + "learning_rate": 3.151555386949924e-05, + "loss": 0.3913, + "step": 7605 + }, + { + "epoch": 1.298881966373645, + "grad_norm": 0.35915673766672923, + "learning_rate": 3.149974709155286e-05, + "loss": 0.3963, + "step": 7610 + }, + { + "epoch": 1.2997354271571222, + "grad_norm": 0.3702895029464927, + "learning_rate": 3.1483940313606476e-05, + "loss": 0.3971, + "step": 7615 + }, + { + "epoch": 1.3005888879405991, + "grad_norm": 0.3265715421342069, + "learning_rate": 3.146813353566009e-05, + "loss": 0.3879, + "step": 7620 + }, + { + "epoch": 1.301442348724076, + "grad_norm": 0.43103727650242346, + "learning_rate": 3.1452326757713704e-05, + "loss": 0.3736, + "step": 7625 + }, + { + "epoch": 1.3022958095075532, + "grad_norm": 0.48845220319095556, + "learning_rate": 3.143651997976733e-05, + "loss": 0.4164, + "step": 7630 + }, + { + "epoch": 1.3031492702910301, + "grad_norm": 0.3409680039457544, + "learning_rate": 3.1420713201820946e-05, + "loss": 0.3614, + "step": 7635 + }, + { + "epoch": 1.3040027310745073, + "grad_norm": 0.5580659730454305, + "learning_rate": 3.1404906423874556e-05, + "loss": 0.3589, + "step": 7640 + }, + { + "epoch": 1.3048561918579842, + "grad_norm": 0.47995922826383103, + "learning_rate": 3.138909964592818e-05, + "loss": 0.3756, + "step": 7645 + }, + { + "epoch": 1.305709652641461, + "grad_norm": 0.41730841016773473, + "learning_rate": 3.137329286798179e-05, + "loss": 0.3621, + "step": 7650 + }, + { + "epoch": 1.306563113424938, + "grad_norm": 0.3934331300158891, + "learning_rate": 3.135748609003541e-05, + "loss": 0.3921, + "step": 7655 + }, + { + "epoch": 1.3074165742084152, + "grad_norm": 0.5423469201599359, + "learning_rate": 3.1341679312089026e-05, + "loss": 0.36, + "step": 7660 + }, + { + "epoch": 1.308270034991892, + "grad_norm": 0.3373234551343298, + "learning_rate": 3.1325872534142644e-05, + "loss": 0.3905, + "step": 7665 + }, + { + "epoch": 1.3091234957753692, + "grad_norm": 1.3857392438443161, + "learning_rate": 3.131006575619626e-05, + "loss": 0.3626, + "step": 7670 + }, + { + "epoch": 1.3099769565588462, + "grad_norm": 0.33705392183486976, + "learning_rate": 3.129425897824987e-05, + "loss": 0.3598, + "step": 7675 + }, + { + "epoch": 1.310830417342323, + "grad_norm": 0.48904999688941714, + "learning_rate": 3.1278452200303496e-05, + "loss": 0.3938, + "step": 7680 + }, + { + "epoch": 1.3116838781258, + "grad_norm": 0.3708751012077216, + "learning_rate": 3.126264542235711e-05, + "loss": 0.3829, + "step": 7685 + }, + { + "epoch": 1.3125373389092772, + "grad_norm": 0.3811349233254703, + "learning_rate": 3.1246838644410724e-05, + "loss": 0.4035, + "step": 7690 + }, + { + "epoch": 1.313390799692754, + "grad_norm": 0.45778200280902426, + "learning_rate": 3.123103186646434e-05, + "loss": 0.3635, + "step": 7695 + }, + { + "epoch": 1.3142442604762312, + "grad_norm": 0.44592689042058387, + "learning_rate": 3.121522508851796e-05, + "loss": 0.4066, + "step": 7700 + }, + { + "epoch": 1.3150977212597081, + "grad_norm": 0.3944483419896346, + "learning_rate": 3.1199418310571577e-05, + "loss": 0.3518, + "step": 7705 + }, + { + "epoch": 1.315951182043185, + "grad_norm": 0.38297744340106776, + "learning_rate": 3.118361153262519e-05, + "loss": 0.376, + "step": 7710 + }, + { + "epoch": 1.316804642826662, + "grad_norm": 0.6446635723826704, + "learning_rate": 3.116780475467881e-05, + "loss": 0.3775, + "step": 7715 + }, + { + "epoch": 1.3176581036101391, + "grad_norm": 0.428645446586942, + "learning_rate": 3.115199797673242e-05, + "loss": 0.3939, + "step": 7720 + }, + { + "epoch": 1.318511564393616, + "grad_norm": 0.32052118851991424, + "learning_rate": 3.113619119878604e-05, + "loss": 0.3718, + "step": 7725 + }, + { + "epoch": 1.3193650251770932, + "grad_norm": 0.4274457410022372, + "learning_rate": 3.112038442083966e-05, + "loss": 0.3769, + "step": 7730 + }, + { + "epoch": 1.3202184859605701, + "grad_norm": 0.4300172034421353, + "learning_rate": 3.1104577642893274e-05, + "loss": 0.4138, + "step": 7735 + }, + { + "epoch": 1.321071946744047, + "grad_norm": 0.39964267758451666, + "learning_rate": 3.108877086494689e-05, + "loss": 0.3845, + "step": 7740 + }, + { + "epoch": 1.3219254075275242, + "grad_norm": 0.38500495269783735, + "learning_rate": 3.10729640870005e-05, + "loss": 0.3873, + "step": 7745 + }, + { + "epoch": 1.322778868311001, + "grad_norm": 0.482078988228766, + "learning_rate": 3.105715730905413e-05, + "loss": 0.3831, + "step": 7750 + }, + { + "epoch": 1.323632329094478, + "grad_norm": 0.5148872754346172, + "learning_rate": 3.104135053110774e-05, + "loss": 0.3491, + "step": 7755 + }, + { + "epoch": 1.3244857898779552, + "grad_norm": 0.6946469363221514, + "learning_rate": 3.1025543753161355e-05, + "loss": 0.4165, + "step": 7760 + }, + { + "epoch": 1.325339250661432, + "grad_norm": 0.3418292401578764, + "learning_rate": 3.100973697521497e-05, + "loss": 0.3764, + "step": 7765 + }, + { + "epoch": 1.326192711444909, + "grad_norm": 0.32461219033431804, + "learning_rate": 3.099393019726859e-05, + "loss": 0.3401, + "step": 7770 + }, + { + "epoch": 1.3270461722283862, + "grad_norm": 0.3522364921006484, + "learning_rate": 3.097812341932221e-05, + "loss": 0.3625, + "step": 7775 + }, + { + "epoch": 1.327899633011863, + "grad_norm": 0.359786914606667, + "learning_rate": 3.0962316641375825e-05, + "loss": 0.3669, + "step": 7780 + }, + { + "epoch": 1.3287530937953402, + "grad_norm": 0.4588973269434624, + "learning_rate": 3.094650986342944e-05, + "loss": 0.3745, + "step": 7785 + }, + { + "epoch": 1.3296065545788172, + "grad_norm": 0.3743763419328188, + "learning_rate": 3.093070308548306e-05, + "loss": 0.382, + "step": 7790 + }, + { + "epoch": 1.330460015362294, + "grad_norm": 0.39118720849370636, + "learning_rate": 3.091489630753667e-05, + "loss": 0.3999, + "step": 7795 + }, + { + "epoch": 1.331313476145771, + "grad_norm": 0.35444759440072005, + "learning_rate": 3.0899089529590295e-05, + "loss": 0.3687, + "step": 7800 + }, + { + "epoch": 1.3321669369292481, + "grad_norm": 0.9234039532835139, + "learning_rate": 3.0883282751643905e-05, + "loss": 0.3862, + "step": 7805 + }, + { + "epoch": 1.333020397712725, + "grad_norm": 0.667172260558289, + "learning_rate": 3.086747597369752e-05, + "loss": 0.3987, + "step": 7810 + }, + { + "epoch": 1.3338738584962022, + "grad_norm": 0.3452488818103813, + "learning_rate": 3.085166919575114e-05, + "loss": 0.3747, + "step": 7815 + }, + { + "epoch": 1.3347273192796791, + "grad_norm": 0.3223161181571908, + "learning_rate": 3.083586241780476e-05, + "loss": 0.356, + "step": 7820 + }, + { + "epoch": 1.335580780063156, + "grad_norm": 0.3458890990408114, + "learning_rate": 3.0820055639858375e-05, + "loss": 0.3811, + "step": 7825 + }, + { + "epoch": 1.336434240846633, + "grad_norm": 0.3597519874961351, + "learning_rate": 3.0804248861911986e-05, + "loss": 0.3749, + "step": 7830 + }, + { + "epoch": 1.3372877016301101, + "grad_norm": 0.3524589306255129, + "learning_rate": 3.078844208396561e-05, + "loss": 0.3783, + "step": 7835 + }, + { + "epoch": 1.338141162413587, + "grad_norm": 0.32905689659120674, + "learning_rate": 3.077263530601922e-05, + "loss": 0.3988, + "step": 7840 + }, + { + "epoch": 1.3389946231970642, + "grad_norm": 0.4535575415728494, + "learning_rate": 3.075682852807284e-05, + "loss": 0.3842, + "step": 7845 + }, + { + "epoch": 1.3398480839805411, + "grad_norm": 0.387173113373346, + "learning_rate": 3.0741021750126456e-05, + "loss": 0.3858, + "step": 7850 + }, + { + "epoch": 1.340701544764018, + "grad_norm": 0.5401931885388208, + "learning_rate": 3.072521497218007e-05, + "loss": 0.3626, + "step": 7855 + }, + { + "epoch": 1.341555005547495, + "grad_norm": 0.3974982171224348, + "learning_rate": 3.070940819423369e-05, + "loss": 0.3898, + "step": 7860 + }, + { + "epoch": 1.342408466330972, + "grad_norm": 0.4010112756869794, + "learning_rate": 3.06936014162873e-05, + "loss": 0.3498, + "step": 7865 + }, + { + "epoch": 1.343261927114449, + "grad_norm": 0.398779730142885, + "learning_rate": 3.0677794638340926e-05, + "loss": 0.3668, + "step": 7870 + }, + { + "epoch": 1.3441153878979262, + "grad_norm": 0.3388769073868379, + "learning_rate": 3.0661987860394536e-05, + "loss": 0.3694, + "step": 7875 + }, + { + "epoch": 1.344968848681403, + "grad_norm": 0.44410138691153744, + "learning_rate": 3.0646181082448154e-05, + "loss": 0.3487, + "step": 7880 + }, + { + "epoch": 1.34582230946488, + "grad_norm": 0.44520525990871945, + "learning_rate": 3.063037430450177e-05, + "loss": 0.4044, + "step": 7885 + }, + { + "epoch": 1.3466757702483572, + "grad_norm": 0.40870853561648024, + "learning_rate": 3.061456752655539e-05, + "loss": 0.3996, + "step": 7890 + }, + { + "epoch": 1.347529231031834, + "grad_norm": 0.45739908444749894, + "learning_rate": 3.0598760748609006e-05, + "loss": 0.4049, + "step": 7895 + }, + { + "epoch": 1.348382691815311, + "grad_norm": 0.4344239547490103, + "learning_rate": 3.058295397066262e-05, + "loss": 0.3908, + "step": 7900 + }, + { + "epoch": 1.3492361525987882, + "grad_norm": 0.5573287877268787, + "learning_rate": 3.056714719271624e-05, + "loss": 0.3423, + "step": 7905 + }, + { + "epoch": 1.350089613382265, + "grad_norm": 0.3558345062805742, + "learning_rate": 3.055134041476985e-05, + "loss": 0.3697, + "step": 7910 + }, + { + "epoch": 1.350943074165742, + "grad_norm": 0.388868854432985, + "learning_rate": 3.053553363682347e-05, + "loss": 0.381, + "step": 7915 + }, + { + "epoch": 1.3517965349492191, + "grad_norm": 0.3249176781431105, + "learning_rate": 3.051972685887709e-05, + "loss": 0.3692, + "step": 7920 + }, + { + "epoch": 1.352649995732696, + "grad_norm": 0.4996246353360835, + "learning_rate": 3.0503920080930704e-05, + "loss": 0.3598, + "step": 7925 + }, + { + "epoch": 1.3535034565161732, + "grad_norm": 0.39933935553381533, + "learning_rate": 3.048811330298432e-05, + "loss": 0.3527, + "step": 7930 + }, + { + "epoch": 1.3543569172996501, + "grad_norm": 0.3681641698853502, + "learning_rate": 3.0472306525037936e-05, + "loss": 0.3881, + "step": 7935 + }, + { + "epoch": 1.355210378083127, + "grad_norm": 0.3653004963141978, + "learning_rate": 3.0456499747091556e-05, + "loss": 0.3753, + "step": 7940 + }, + { + "epoch": 1.356063838866604, + "grad_norm": 0.38532162730812364, + "learning_rate": 3.0440692969145174e-05, + "loss": 0.3747, + "step": 7945 + }, + { + "epoch": 1.3569172996500811, + "grad_norm": 0.41075651148330905, + "learning_rate": 3.0424886191198788e-05, + "loss": 0.3954, + "step": 7950 + }, + { + "epoch": 1.357770760433558, + "grad_norm": 0.36456529148274497, + "learning_rate": 3.0409079413252405e-05, + "loss": 0.3533, + "step": 7955 + }, + { + "epoch": 1.3586242212170352, + "grad_norm": 0.33276477226522555, + "learning_rate": 3.039327263530602e-05, + "loss": 0.3599, + "step": 7960 + }, + { + "epoch": 1.359477682000512, + "grad_norm": 0.32834117277130925, + "learning_rate": 3.0377465857359637e-05, + "loss": 0.3572, + "step": 7965 + }, + { + "epoch": 1.360331142783989, + "grad_norm": 0.40041837294621246, + "learning_rate": 3.036165907941325e-05, + "loss": 0.3721, + "step": 7970 + }, + { + "epoch": 1.361184603567466, + "grad_norm": 0.7082390310178429, + "learning_rate": 3.0345852301466872e-05, + "loss": 0.3756, + "step": 7975 + }, + { + "epoch": 1.362038064350943, + "grad_norm": 0.3581008506419958, + "learning_rate": 3.033004552352049e-05, + "loss": 0.3867, + "step": 7980 + }, + { + "epoch": 1.36289152513442, + "grad_norm": 0.4005690622739677, + "learning_rate": 3.0314238745574103e-05, + "loss": 0.3464, + "step": 7985 + }, + { + "epoch": 1.3637449859178972, + "grad_norm": 0.42662455749251676, + "learning_rate": 3.029843196762772e-05, + "loss": 0.3822, + "step": 7990 + }, + { + "epoch": 1.364598446701374, + "grad_norm": 0.36182851158669216, + "learning_rate": 3.0282625189681335e-05, + "loss": 0.3903, + "step": 7995 + }, + { + "epoch": 1.365451907484851, + "grad_norm": 0.4182698715308899, + "learning_rate": 3.0266818411734956e-05, + "loss": 0.3781, + "step": 8000 + }, + { + "epoch": 1.366305368268328, + "grad_norm": 0.38527955794425545, + "learning_rate": 3.0251011633788566e-05, + "loss": 0.3793, + "step": 8005 + }, + { + "epoch": 1.367158829051805, + "grad_norm": 0.41184995139370656, + "learning_rate": 3.0235204855842187e-05, + "loss": 0.3643, + "step": 8010 + }, + { + "epoch": 1.368012289835282, + "grad_norm": 0.37017177931689527, + "learning_rate": 3.0219398077895805e-05, + "loss": 0.3746, + "step": 8015 + }, + { + "epoch": 1.3688657506187591, + "grad_norm": 0.3517497261409417, + "learning_rate": 3.020359129994942e-05, + "loss": 0.4003, + "step": 8020 + }, + { + "epoch": 1.369719211402236, + "grad_norm": 0.36285052188375777, + "learning_rate": 3.0187784522003036e-05, + "loss": 0.3829, + "step": 8025 + }, + { + "epoch": 1.370572672185713, + "grad_norm": 0.407723005803761, + "learning_rate": 3.017197774405665e-05, + "loss": 0.3904, + "step": 8030 + }, + { + "epoch": 1.3714261329691901, + "grad_norm": 0.4660175917055238, + "learning_rate": 3.015617096611027e-05, + "loss": 0.3852, + "step": 8035 + }, + { + "epoch": 1.372279593752667, + "grad_norm": 0.35277520394249384, + "learning_rate": 3.014036418816389e-05, + "loss": 0.3805, + "step": 8040 + }, + { + "epoch": 1.373133054536144, + "grad_norm": 0.4244683993930311, + "learning_rate": 3.0124557410217503e-05, + "loss": 0.3724, + "step": 8045 + }, + { + "epoch": 1.3739865153196211, + "grad_norm": 0.534391863994486, + "learning_rate": 3.010875063227112e-05, + "loss": 0.3877, + "step": 8050 + }, + { + "epoch": 1.374839976103098, + "grad_norm": 0.3877045020895413, + "learning_rate": 3.0092943854324734e-05, + "loss": 0.3642, + "step": 8055 + }, + { + "epoch": 1.375693436886575, + "grad_norm": 0.3923341331422191, + "learning_rate": 3.007713707637835e-05, + "loss": 0.3816, + "step": 8060 + }, + { + "epoch": 1.3765468976700521, + "grad_norm": 0.40198899955815753, + "learning_rate": 3.0061330298431966e-05, + "loss": 0.3757, + "step": 8065 + }, + { + "epoch": 1.377400358453529, + "grad_norm": 0.3077601297800156, + "learning_rate": 3.0045523520485587e-05, + "loss": 0.3779, + "step": 8070 + }, + { + "epoch": 1.3782538192370062, + "grad_norm": 0.5506486589825703, + "learning_rate": 3.0029716742539204e-05, + "loss": 0.3771, + "step": 8075 + }, + { + "epoch": 1.379107280020483, + "grad_norm": 0.416658660112756, + "learning_rate": 3.0013909964592818e-05, + "loss": 0.3881, + "step": 8080 + }, + { + "epoch": 1.37996074080396, + "grad_norm": 0.3910076290601425, + "learning_rate": 2.9998103186646436e-05, + "loss": 0.4073, + "step": 8085 + }, + { + "epoch": 1.380814201587437, + "grad_norm": 0.3926307963488214, + "learning_rate": 2.998229640870005e-05, + "loss": 0.3822, + "step": 8090 + }, + { + "epoch": 1.381667662370914, + "grad_norm": 0.5148249677887993, + "learning_rate": 2.996648963075367e-05, + "loss": 0.3846, + "step": 8095 + }, + { + "epoch": 1.382521123154391, + "grad_norm": 0.4227190091549857, + "learning_rate": 2.995068285280728e-05, + "loss": 0.3882, + "step": 8100 + }, + { + "epoch": 1.3833745839378682, + "grad_norm": 0.39891875888144984, + "learning_rate": 2.9934876074860902e-05, + "loss": 0.3739, + "step": 8105 + }, + { + "epoch": 1.384228044721345, + "grad_norm": 0.3650419019224378, + "learning_rate": 2.991906929691452e-05, + "loss": 0.3711, + "step": 8110 + }, + { + "epoch": 1.385081505504822, + "grad_norm": 0.42465523808031613, + "learning_rate": 2.9903262518968134e-05, + "loss": 0.3791, + "step": 8115 + }, + { + "epoch": 1.385934966288299, + "grad_norm": 0.2805002865152103, + "learning_rate": 2.988745574102175e-05, + "loss": 0.3663, + "step": 8120 + }, + { + "epoch": 1.386788427071776, + "grad_norm": 0.44417964430675694, + "learning_rate": 2.9871648963075365e-05, + "loss": 0.367, + "step": 8125 + }, + { + "epoch": 1.387641887855253, + "grad_norm": 0.38110904409638974, + "learning_rate": 2.9855842185128986e-05, + "loss": 0.3814, + "step": 8130 + }, + { + "epoch": 1.3884953486387301, + "grad_norm": 0.4473363162850736, + "learning_rate": 2.9840035407182603e-05, + "loss": 0.3703, + "step": 8135 + }, + { + "epoch": 1.389348809422207, + "grad_norm": 0.370077716856123, + "learning_rate": 2.9824228629236217e-05, + "loss": 0.3822, + "step": 8140 + }, + { + "epoch": 1.390202270205684, + "grad_norm": 0.4409116111956241, + "learning_rate": 2.9808421851289835e-05, + "loss": 0.3613, + "step": 8145 + }, + { + "epoch": 1.391055730989161, + "grad_norm": 0.45142254858558284, + "learning_rate": 2.979261507334345e-05, + "loss": 0.3893, + "step": 8150 + }, + { + "epoch": 1.391909191772638, + "grad_norm": 0.3649482987051066, + "learning_rate": 2.977680829539707e-05, + "loss": 0.3629, + "step": 8155 + }, + { + "epoch": 1.392762652556115, + "grad_norm": 0.38984275503151034, + "learning_rate": 2.976100151745068e-05, + "loss": 0.3921, + "step": 8160 + }, + { + "epoch": 1.3936161133395921, + "grad_norm": 0.35334520856945323, + "learning_rate": 2.97451947395043e-05, + "loss": 0.3642, + "step": 8165 + }, + { + "epoch": 1.394469574123069, + "grad_norm": 0.39985450271284095, + "learning_rate": 2.972938796155792e-05, + "loss": 0.3732, + "step": 8170 + }, + { + "epoch": 1.395323034906546, + "grad_norm": 0.35479709027549755, + "learning_rate": 2.9713581183611533e-05, + "loss": 0.3711, + "step": 8175 + }, + { + "epoch": 1.3961764956900231, + "grad_norm": 0.29189648002820756, + "learning_rate": 2.969777440566515e-05, + "loss": 0.3884, + "step": 8180 + }, + { + "epoch": 1.3970299564735, + "grad_norm": 0.46385100833457055, + "learning_rate": 2.9681967627718764e-05, + "loss": 0.376, + "step": 8185 + }, + { + "epoch": 1.3978834172569772, + "grad_norm": 0.4131075562028953, + "learning_rate": 2.9666160849772385e-05, + "loss": 0.3801, + "step": 8190 + }, + { + "epoch": 1.398736878040454, + "grad_norm": 0.33214764010056536, + "learning_rate": 2.9650354071826003e-05, + "loss": 0.3681, + "step": 8195 + }, + { + "epoch": 1.399590338823931, + "grad_norm": 0.53525220637501, + "learning_rate": 2.9634547293879617e-05, + "loss": 0.3984, + "step": 8200 + }, + { + "epoch": 1.400443799607408, + "grad_norm": 0.3525049517732988, + "learning_rate": 2.9618740515933234e-05, + "loss": 0.3993, + "step": 8205 + }, + { + "epoch": 1.401297260390885, + "grad_norm": 0.2862840539021341, + "learning_rate": 2.9602933737986848e-05, + "loss": 0.3694, + "step": 8210 + }, + { + "epoch": 1.402150721174362, + "grad_norm": 0.4106370473387779, + "learning_rate": 2.958712696004047e-05, + "loss": 0.3542, + "step": 8215 + }, + { + "epoch": 1.4030041819578392, + "grad_norm": 0.3709496464873719, + "learning_rate": 2.957132018209408e-05, + "loss": 0.3848, + "step": 8220 + }, + { + "epoch": 1.403857642741316, + "grad_norm": 0.4866566318354896, + "learning_rate": 2.95555134041477e-05, + "loss": 0.386, + "step": 8225 + }, + { + "epoch": 1.404711103524793, + "grad_norm": 0.3574489462236936, + "learning_rate": 2.9539706626201318e-05, + "loss": 0.3569, + "step": 8230 + }, + { + "epoch": 1.40556456430827, + "grad_norm": 0.5124624087288366, + "learning_rate": 2.9523899848254932e-05, + "loss": 0.387, + "step": 8235 + }, + { + "epoch": 1.406418025091747, + "grad_norm": 0.45569556196234207, + "learning_rate": 2.950809307030855e-05, + "loss": 0.4009, + "step": 8240 + }, + { + "epoch": 1.407271485875224, + "grad_norm": 0.4171523942953767, + "learning_rate": 2.9492286292362164e-05, + "loss": 0.3642, + "step": 8245 + }, + { + "epoch": 1.4081249466587011, + "grad_norm": 0.29889924634374965, + "learning_rate": 2.9476479514415785e-05, + "loss": 0.3834, + "step": 8250 + }, + { + "epoch": 1.408978407442178, + "grad_norm": 0.3360512543164952, + "learning_rate": 2.9460672736469395e-05, + "loss": 0.3748, + "step": 8255 + }, + { + "epoch": 1.409831868225655, + "grad_norm": 0.38582001935622595, + "learning_rate": 2.9444865958523016e-05, + "loss": 0.3472, + "step": 8260 + }, + { + "epoch": 1.410685329009132, + "grad_norm": 0.3130260111344319, + "learning_rate": 2.9429059180576634e-05, + "loss": 0.3665, + "step": 8265 + }, + { + "epoch": 1.411538789792609, + "grad_norm": 0.3624333359753043, + "learning_rate": 2.9413252402630248e-05, + "loss": 0.3813, + "step": 8270 + }, + { + "epoch": 1.412392250576086, + "grad_norm": 0.3637937324332378, + "learning_rate": 2.939744562468387e-05, + "loss": 0.3851, + "step": 8275 + }, + { + "epoch": 1.4132457113595631, + "grad_norm": 0.3912603378405003, + "learning_rate": 2.938163884673748e-05, + "loss": 0.3752, + "step": 8280 + }, + { + "epoch": 1.41409917214304, + "grad_norm": 0.4295669156261878, + "learning_rate": 2.93658320687911e-05, + "loss": 0.3699, + "step": 8285 + }, + { + "epoch": 1.414952632926517, + "grad_norm": 0.3834328296660925, + "learning_rate": 2.9350025290844717e-05, + "loss": 0.3644, + "step": 8290 + }, + { + "epoch": 1.415806093709994, + "grad_norm": 0.3746515593442747, + "learning_rate": 2.933421851289833e-05, + "loss": 0.365, + "step": 8295 + }, + { + "epoch": 1.416659554493471, + "grad_norm": 0.7066203168458007, + "learning_rate": 2.931841173495195e-05, + "loss": 0.3813, + "step": 8300 + }, + { + "epoch": 1.417513015276948, + "grad_norm": 0.34315167367633687, + "learning_rate": 2.9302604957005563e-05, + "loss": 0.3794, + "step": 8305 + }, + { + "epoch": 1.418366476060425, + "grad_norm": 0.38712370941731256, + "learning_rate": 2.9286798179059184e-05, + "loss": 0.3894, + "step": 8310 + }, + { + "epoch": 1.419219936843902, + "grad_norm": 0.3673660256432574, + "learning_rate": 2.9270991401112795e-05, + "loss": 0.3686, + "step": 8315 + }, + { + "epoch": 1.420073397627379, + "grad_norm": 0.3677150794575681, + "learning_rate": 2.9255184623166415e-05, + "loss": 0.413, + "step": 8320 + }, + { + "epoch": 1.420926858410856, + "grad_norm": 0.42763016123464537, + "learning_rate": 2.9239377845220033e-05, + "loss": 0.35, + "step": 8325 + }, + { + "epoch": 1.421780319194333, + "grad_norm": 0.3446902671135802, + "learning_rate": 2.9223571067273647e-05, + "loss": 0.37, + "step": 8330 + }, + { + "epoch": 1.4226337799778102, + "grad_norm": 0.34060123260542946, + "learning_rate": 2.9207764289327268e-05, + "loss": 0.386, + "step": 8335 + }, + { + "epoch": 1.423487240761287, + "grad_norm": 0.45806486515155437, + "learning_rate": 2.919195751138088e-05, + "loss": 0.384, + "step": 8340 + }, + { + "epoch": 1.424340701544764, + "grad_norm": 0.3739214262633945, + "learning_rate": 2.91761507334345e-05, + "loss": 0.3522, + "step": 8345 + }, + { + "epoch": 1.425194162328241, + "grad_norm": 0.40813115432879005, + "learning_rate": 2.9160343955488113e-05, + "loss": 0.3904, + "step": 8350 + }, + { + "epoch": 1.426047623111718, + "grad_norm": 0.3647379706539541, + "learning_rate": 2.914453717754173e-05, + "loss": 0.359, + "step": 8355 + }, + { + "epoch": 1.426901083895195, + "grad_norm": 0.3112262907787899, + "learning_rate": 2.9128730399595348e-05, + "loss": 0.3527, + "step": 8360 + }, + { + "epoch": 1.4277545446786721, + "grad_norm": 0.380474743802569, + "learning_rate": 2.9112923621648962e-05, + "loss": 0.375, + "step": 8365 + }, + { + "epoch": 1.428608005462149, + "grad_norm": 0.40453440065650126, + "learning_rate": 2.9097116843702583e-05, + "loss": 0.3797, + "step": 8370 + }, + { + "epoch": 1.429461466245626, + "grad_norm": 0.29419053059544575, + "learning_rate": 2.9081310065756194e-05, + "loss": 0.3748, + "step": 8375 + }, + { + "epoch": 1.430314927029103, + "grad_norm": 0.29630117830668, + "learning_rate": 2.9065503287809815e-05, + "loss": 0.3729, + "step": 8380 + }, + { + "epoch": 1.43116838781258, + "grad_norm": 0.47399249555344675, + "learning_rate": 2.9049696509863432e-05, + "loss": 0.3898, + "step": 8385 + }, + { + "epoch": 1.432021848596057, + "grad_norm": 0.4194551512756209, + "learning_rate": 2.9033889731917046e-05, + "loss": 0.346, + "step": 8390 + }, + { + "epoch": 1.4328753093795341, + "grad_norm": 0.3505631403613706, + "learning_rate": 2.9018082953970667e-05, + "loss": 0.3649, + "step": 8395 + }, + { + "epoch": 1.433728770163011, + "grad_norm": 0.3338652798914307, + "learning_rate": 2.9002276176024278e-05, + "loss": 0.378, + "step": 8400 + }, + { + "epoch": 1.434582230946488, + "grad_norm": 0.4133051714483541, + "learning_rate": 2.89864693980779e-05, + "loss": 0.3764, + "step": 8405 + }, + { + "epoch": 1.4354356917299649, + "grad_norm": 0.3221303004684508, + "learning_rate": 2.8970662620131513e-05, + "loss": 0.3594, + "step": 8410 + }, + { + "epoch": 1.436289152513442, + "grad_norm": 0.5510652482425935, + "learning_rate": 2.895485584218513e-05, + "loss": 0.3817, + "step": 8415 + }, + { + "epoch": 1.437142613296919, + "grad_norm": 0.3615619462568365, + "learning_rate": 2.8939049064238748e-05, + "loss": 0.384, + "step": 8420 + }, + { + "epoch": 1.437996074080396, + "grad_norm": 0.4800111548347184, + "learning_rate": 2.892324228629236e-05, + "loss": 0.3847, + "step": 8425 + }, + { + "epoch": 1.438849534863873, + "grad_norm": 0.34094030765100325, + "learning_rate": 2.8907435508345983e-05, + "loss": 0.3643, + "step": 8430 + }, + { + "epoch": 1.43970299564735, + "grad_norm": 0.33032663807125756, + "learning_rate": 2.8891628730399593e-05, + "loss": 0.3856, + "step": 8435 + }, + { + "epoch": 1.440556456430827, + "grad_norm": 0.3093031003852718, + "learning_rate": 2.8875821952453214e-05, + "loss": 0.3618, + "step": 8440 + }, + { + "epoch": 1.441409917214304, + "grad_norm": 0.39872434410355373, + "learning_rate": 2.886001517450683e-05, + "loss": 0.378, + "step": 8445 + }, + { + "epoch": 1.442263377997781, + "grad_norm": 0.6054554551753893, + "learning_rate": 2.8844208396560446e-05, + "loss": 0.3892, + "step": 8450 + }, + { + "epoch": 1.443116838781258, + "grad_norm": 0.39556706642501605, + "learning_rate": 2.8828401618614066e-05, + "loss": 0.3792, + "step": 8455 + }, + { + "epoch": 1.443970299564735, + "grad_norm": 0.3272752075394602, + "learning_rate": 2.8812594840667677e-05, + "loss": 0.3737, + "step": 8460 + }, + { + "epoch": 1.444823760348212, + "grad_norm": 0.37048934763363117, + "learning_rate": 2.8796788062721298e-05, + "loss": 0.3649, + "step": 8465 + }, + { + "epoch": 1.445677221131689, + "grad_norm": 0.4596849364173726, + "learning_rate": 2.8780981284774912e-05, + "loss": 0.3632, + "step": 8470 + }, + { + "epoch": 1.446530681915166, + "grad_norm": 2.293224386936095, + "learning_rate": 2.876517450682853e-05, + "loss": 0.3635, + "step": 8475 + }, + { + "epoch": 1.4473841426986431, + "grad_norm": 0.33651129082014747, + "learning_rate": 2.8749367728882147e-05, + "loss": 0.381, + "step": 8480 + }, + { + "epoch": 1.44823760348212, + "grad_norm": 0.3456310747207851, + "learning_rate": 2.873356095093576e-05, + "loss": 0.3634, + "step": 8485 + }, + { + "epoch": 1.449091064265597, + "grad_norm": 0.3995683842923382, + "learning_rate": 2.8717754172989382e-05, + "loss": 0.3782, + "step": 8490 + }, + { + "epoch": 1.449944525049074, + "grad_norm": 0.3458136741834777, + "learning_rate": 2.8701947395042993e-05, + "loss": 0.3617, + "step": 8495 + }, + { + "epoch": 1.450797985832551, + "grad_norm": 0.43859808049241406, + "learning_rate": 2.8686140617096613e-05, + "loss": 0.3676, + "step": 8500 + }, + { + "epoch": 1.451651446616028, + "grad_norm": 0.34478162797919504, + "learning_rate": 2.8670333839150227e-05, + "loss": 0.3695, + "step": 8505 + }, + { + "epoch": 1.452504907399505, + "grad_norm": 0.3697671344377753, + "learning_rate": 2.8654527061203845e-05, + "loss": 0.3793, + "step": 8510 + }, + { + "epoch": 1.453358368182982, + "grad_norm": 0.3823214798575854, + "learning_rate": 2.8638720283257466e-05, + "loss": 0.3784, + "step": 8515 + }, + { + "epoch": 1.454211828966459, + "grad_norm": 0.38784507192907286, + "learning_rate": 2.8622913505311076e-05, + "loss": 0.3837, + "step": 8520 + }, + { + "epoch": 1.4550652897499359, + "grad_norm": 0.41557813313362896, + "learning_rate": 2.8607106727364697e-05, + "loss": 0.3378, + "step": 8525 + }, + { + "epoch": 1.455918750533413, + "grad_norm": 0.3275493993112729, + "learning_rate": 2.859129994941831e-05, + "loss": 0.3715, + "step": 8530 + }, + { + "epoch": 1.45677221131689, + "grad_norm": 0.6038453812193175, + "learning_rate": 2.857549317147193e-05, + "loss": 0.3962, + "step": 8535 + }, + { + "epoch": 1.457625672100367, + "grad_norm": 0.531888329219478, + "learning_rate": 2.8559686393525546e-05, + "loss": 0.3635, + "step": 8540 + }, + { + "epoch": 1.458479132883844, + "grad_norm": 0.4508160296461119, + "learning_rate": 2.854387961557916e-05, + "loss": 0.3694, + "step": 8545 + }, + { + "epoch": 1.459332593667321, + "grad_norm": 0.3785246712895831, + "learning_rate": 2.852807283763278e-05, + "loss": 0.3818, + "step": 8550 + }, + { + "epoch": 1.4601860544507979, + "grad_norm": 0.36557857481825257, + "learning_rate": 2.8512266059686392e-05, + "loss": 0.3697, + "step": 8555 + }, + { + "epoch": 1.461039515234275, + "grad_norm": 0.3965680377021494, + "learning_rate": 2.8496459281740013e-05, + "loss": 0.4041, + "step": 8560 + }, + { + "epoch": 1.461892976017752, + "grad_norm": 0.33612947628386897, + "learning_rate": 2.8480652503793627e-05, + "loss": 0.3515, + "step": 8565 + }, + { + "epoch": 1.462746436801229, + "grad_norm": 0.3365847961971949, + "learning_rate": 2.8464845725847244e-05, + "loss": 0.346, + "step": 8570 + }, + { + "epoch": 1.463599897584706, + "grad_norm": 0.3670891194765111, + "learning_rate": 2.8449038947900865e-05, + "loss": 0.3412, + "step": 8575 + }, + { + "epoch": 1.464453358368183, + "grad_norm": 0.357725309927502, + "learning_rate": 2.8433232169954476e-05, + "loss": 0.3561, + "step": 8580 + }, + { + "epoch": 1.46530681915166, + "grad_norm": 0.6195894783397705, + "learning_rate": 2.8417425392008097e-05, + "loss": 0.3661, + "step": 8585 + }, + { + "epoch": 1.466160279935137, + "grad_norm": 0.3755371747074425, + "learning_rate": 2.840161861406171e-05, + "loss": 0.4006, + "step": 8590 + }, + { + "epoch": 1.467013740718614, + "grad_norm": 0.44702736073352417, + "learning_rate": 2.8385811836115328e-05, + "loss": 0.4065, + "step": 8595 + }, + { + "epoch": 1.467867201502091, + "grad_norm": 0.34275483033089893, + "learning_rate": 2.8370005058168946e-05, + "loss": 0.3735, + "step": 8600 + }, + { + "epoch": 1.468720662285568, + "grad_norm": 0.4273574089186848, + "learning_rate": 2.835419828022256e-05, + "loss": 0.3963, + "step": 8605 + }, + { + "epoch": 1.469574123069045, + "grad_norm": 0.40888070045662017, + "learning_rate": 2.833839150227618e-05, + "loss": 0.3936, + "step": 8610 + }, + { + "epoch": 1.470427583852522, + "grad_norm": 0.34082565712865803, + "learning_rate": 2.832258472432979e-05, + "loss": 0.3645, + "step": 8615 + }, + { + "epoch": 1.471281044635999, + "grad_norm": 0.48554931370305, + "learning_rate": 2.8306777946383412e-05, + "loss": 0.3707, + "step": 8620 + }, + { + "epoch": 1.472134505419476, + "grad_norm": 0.3271541302725915, + "learning_rate": 2.8290971168437026e-05, + "loss": 0.3519, + "step": 8625 + }, + { + "epoch": 1.472987966202953, + "grad_norm": 0.3903889081674045, + "learning_rate": 2.8275164390490644e-05, + "loss": 0.3632, + "step": 8630 + }, + { + "epoch": 1.47384142698643, + "grad_norm": 0.4284030291366842, + "learning_rate": 2.8259357612544264e-05, + "loss": 0.3633, + "step": 8635 + }, + { + "epoch": 1.4746948877699069, + "grad_norm": 0.3216128934656074, + "learning_rate": 2.8243550834597875e-05, + "loss": 0.3686, + "step": 8640 + }, + { + "epoch": 1.475548348553384, + "grad_norm": 0.3895542158384611, + "learning_rate": 2.8227744056651496e-05, + "loss": 0.4004, + "step": 8645 + }, + { + "epoch": 1.476401809336861, + "grad_norm": 0.38894558511697247, + "learning_rate": 2.821193727870511e-05, + "loss": 0.3757, + "step": 8650 + }, + { + "epoch": 1.477255270120338, + "grad_norm": 0.454030090547461, + "learning_rate": 2.8196130500758727e-05, + "loss": 0.3672, + "step": 8655 + }, + { + "epoch": 1.478108730903815, + "grad_norm": 0.3678890776042101, + "learning_rate": 2.818032372281234e-05, + "loss": 0.3656, + "step": 8660 + }, + { + "epoch": 1.478962191687292, + "grad_norm": 0.3679379479229768, + "learning_rate": 2.816451694486596e-05, + "loss": 0.3611, + "step": 8665 + }, + { + "epoch": 1.4798156524707688, + "grad_norm": 0.36577399619786805, + "learning_rate": 2.814871016691958e-05, + "loss": 0.372, + "step": 8670 + }, + { + "epoch": 1.480669113254246, + "grad_norm": 0.41365093673266295, + "learning_rate": 2.813290338897319e-05, + "loss": 0.3916, + "step": 8675 + }, + { + "epoch": 1.481522574037723, + "grad_norm": 0.5075863875682871, + "learning_rate": 2.811709661102681e-05, + "loss": 0.3795, + "step": 8680 + }, + { + "epoch": 1.4823760348212, + "grad_norm": 0.4153661164502053, + "learning_rate": 2.8101289833080425e-05, + "loss": 0.4021, + "step": 8685 + }, + { + "epoch": 1.483229495604677, + "grad_norm": 0.3878147450114199, + "learning_rate": 2.8085483055134043e-05, + "loss": 0.3908, + "step": 8690 + }, + { + "epoch": 1.484082956388154, + "grad_norm": 0.41708422810946044, + "learning_rate": 2.806967627718766e-05, + "loss": 0.3482, + "step": 8695 + }, + { + "epoch": 1.4849364171716308, + "grad_norm": 0.39439490995397697, + "learning_rate": 2.8053869499241274e-05, + "loss": 0.3647, + "step": 8700 + }, + { + "epoch": 1.485789877955108, + "grad_norm": 0.5110644577060773, + "learning_rate": 2.8038062721294895e-05, + "loss": 0.3442, + "step": 8705 + }, + { + "epoch": 1.486643338738585, + "grad_norm": 0.307577556175372, + "learning_rate": 2.802225594334851e-05, + "loss": 0.3564, + "step": 8710 + }, + { + "epoch": 1.487496799522062, + "grad_norm": 0.4158390196218934, + "learning_rate": 2.8006449165402127e-05, + "loss": 0.3971, + "step": 8715 + }, + { + "epoch": 1.488350260305539, + "grad_norm": 0.39969357057334653, + "learning_rate": 2.799064238745574e-05, + "loss": 0.3935, + "step": 8720 + }, + { + "epoch": 1.4892037210890159, + "grad_norm": 0.3722754066149683, + "learning_rate": 2.7974835609509358e-05, + "loss": 0.3716, + "step": 8725 + }, + { + "epoch": 1.490057181872493, + "grad_norm": 0.33067377058218317, + "learning_rate": 2.795902883156298e-05, + "loss": 0.3746, + "step": 8730 + }, + { + "epoch": 1.49091064265597, + "grad_norm": 0.405252723193417, + "learning_rate": 2.794322205361659e-05, + "loss": 0.356, + "step": 8735 + }, + { + "epoch": 1.4917641034394469, + "grad_norm": 0.3694165764691589, + "learning_rate": 2.792741527567021e-05, + "loss": 0.366, + "step": 8740 + }, + { + "epoch": 1.492617564222924, + "grad_norm": 0.426916135457793, + "learning_rate": 2.7911608497723825e-05, + "loss": 0.3824, + "step": 8745 + }, + { + "epoch": 1.493471025006401, + "grad_norm": 0.754366531660231, + "learning_rate": 2.7895801719777442e-05, + "loss": 0.4195, + "step": 8750 + }, + { + "epoch": 1.4943244857898779, + "grad_norm": 0.5434925483779819, + "learning_rate": 2.7879994941831056e-05, + "loss": 0.3823, + "step": 8755 + }, + { + "epoch": 1.495177946573355, + "grad_norm": 0.29984507565571433, + "learning_rate": 2.7864188163884674e-05, + "loss": 0.3804, + "step": 8760 + }, + { + "epoch": 1.496031407356832, + "grad_norm": 0.32650876732692063, + "learning_rate": 2.7848381385938295e-05, + "loss": 0.3849, + "step": 8765 + }, + { + "epoch": 1.496884868140309, + "grad_norm": 0.3331507148161785, + "learning_rate": 2.783257460799191e-05, + "loss": 0.3668, + "step": 8770 + }, + { + "epoch": 1.497738328923786, + "grad_norm": 0.37201454830785047, + "learning_rate": 2.7816767830045526e-05, + "loss": 0.3724, + "step": 8775 + }, + { + "epoch": 1.498591789707263, + "grad_norm": 0.3883608809915117, + "learning_rate": 2.780096105209914e-05, + "loss": 0.3742, + "step": 8780 + }, + { + "epoch": 1.4994452504907398, + "grad_norm": 0.4341517061568229, + "learning_rate": 2.7785154274152758e-05, + "loss": 0.3796, + "step": 8785 + }, + { + "epoch": 1.500298711274217, + "grad_norm": 0.43802481543794175, + "learning_rate": 2.776934749620638e-05, + "loss": 0.3863, + "step": 8790 + }, + { + "epoch": 1.501152172057694, + "grad_norm": 0.422125554109543, + "learning_rate": 2.775354071825999e-05, + "loss": 0.3795, + "step": 8795 + }, + { + "epoch": 1.502005632841171, + "grad_norm": 0.5053590727369254, + "learning_rate": 2.773773394031361e-05, + "loss": 0.3596, + "step": 8800 + }, + { + "epoch": 1.502859093624648, + "grad_norm": 2.003664078861723, + "learning_rate": 2.7721927162367224e-05, + "loss": 0.3823, + "step": 8805 + }, + { + "epoch": 1.503712554408125, + "grad_norm": 0.503061980341984, + "learning_rate": 2.770612038442084e-05, + "loss": 0.3435, + "step": 8810 + }, + { + "epoch": 1.5045660151916018, + "grad_norm": 0.3839031362370193, + "learning_rate": 2.7690313606474456e-05, + "loss": 0.3787, + "step": 8815 + }, + { + "epoch": 1.505419475975079, + "grad_norm": 0.6262488963362381, + "learning_rate": 2.7674506828528073e-05, + "loss": 0.3775, + "step": 8820 + }, + { + "epoch": 1.506272936758556, + "grad_norm": 0.3837957990180745, + "learning_rate": 2.7658700050581694e-05, + "loss": 0.3558, + "step": 8825 + }, + { + "epoch": 1.507126397542033, + "grad_norm": 0.5708127621802429, + "learning_rate": 2.7642893272635305e-05, + "loss": 0.3633, + "step": 8830 + }, + { + "epoch": 1.50797985832551, + "grad_norm": 0.37810240026882774, + "learning_rate": 2.7627086494688925e-05, + "loss": 0.3696, + "step": 8835 + }, + { + "epoch": 1.5088333191089869, + "grad_norm": 0.4379948264672438, + "learning_rate": 2.761127971674254e-05, + "loss": 0.3966, + "step": 8840 + }, + { + "epoch": 1.5096867798924638, + "grad_norm": 0.4864435508523675, + "learning_rate": 2.7595472938796157e-05, + "loss": 0.3626, + "step": 8845 + }, + { + "epoch": 1.510540240675941, + "grad_norm": 0.37753863697846646, + "learning_rate": 2.7579666160849778e-05, + "loss": 0.3773, + "step": 8850 + }, + { + "epoch": 1.511393701459418, + "grad_norm": 0.42430613680275475, + "learning_rate": 2.756385938290339e-05, + "loss": 0.3775, + "step": 8855 + }, + { + "epoch": 1.512247162242895, + "grad_norm": 0.544003009322885, + "learning_rate": 2.754805260495701e-05, + "loss": 0.3859, + "step": 8860 + }, + { + "epoch": 1.513100623026372, + "grad_norm": 0.4074885598009095, + "learning_rate": 2.7532245827010623e-05, + "loss": 0.3484, + "step": 8865 + }, + { + "epoch": 1.5139540838098489, + "grad_norm": 0.3988185137624454, + "learning_rate": 2.751643904906424e-05, + "loss": 0.377, + "step": 8870 + }, + { + "epoch": 1.5148075445933258, + "grad_norm": 0.5744208283221645, + "learning_rate": 2.7500632271117855e-05, + "loss": 0.3988, + "step": 8875 + }, + { + "epoch": 1.515661005376803, + "grad_norm": 0.4850703376232935, + "learning_rate": 2.7484825493171472e-05, + "loss": 0.3533, + "step": 8880 + }, + { + "epoch": 1.51651446616028, + "grad_norm": 0.48494000229877243, + "learning_rate": 2.7469018715225093e-05, + "loss": 0.3627, + "step": 8885 + }, + { + "epoch": 1.517367926943757, + "grad_norm": 0.3332980410205891, + "learning_rate": 2.7453211937278704e-05, + "loss": 0.3879, + "step": 8890 + }, + { + "epoch": 1.518221387727234, + "grad_norm": 0.3294632163479773, + "learning_rate": 2.7437405159332325e-05, + "loss": 0.3924, + "step": 8895 + }, + { + "epoch": 1.5190748485107108, + "grad_norm": 0.338661944064282, + "learning_rate": 2.742159838138594e-05, + "loss": 0.3641, + "step": 8900 + }, + { + "epoch": 1.519928309294188, + "grad_norm": 0.3636548212436217, + "learning_rate": 2.7405791603439556e-05, + "loss": 0.3822, + "step": 8905 + }, + { + "epoch": 1.520781770077665, + "grad_norm": 0.463133124848011, + "learning_rate": 2.738998482549317e-05, + "loss": 0.3697, + "step": 8910 + }, + { + "epoch": 1.521635230861142, + "grad_norm": 0.42984954316114005, + "learning_rate": 2.7374178047546788e-05, + "loss": 0.3764, + "step": 8915 + }, + { + "epoch": 1.522488691644619, + "grad_norm": 0.3745966593564767, + "learning_rate": 2.735837126960041e-05, + "loss": 0.3734, + "step": 8920 + }, + { + "epoch": 1.523342152428096, + "grad_norm": 0.38169440992259, + "learning_rate": 2.7342564491654023e-05, + "loss": 0.3773, + "step": 8925 + }, + { + "epoch": 1.5241956132115728, + "grad_norm": 0.6318648268520636, + "learning_rate": 2.732675771370764e-05, + "loss": 0.408, + "step": 8930 + }, + { + "epoch": 1.52504907399505, + "grad_norm": 0.4723539663658168, + "learning_rate": 2.7310950935761254e-05, + "loss": 0.3749, + "step": 8935 + }, + { + "epoch": 1.5259025347785269, + "grad_norm": 0.34942791277198937, + "learning_rate": 2.7295144157814872e-05, + "loss": 0.3689, + "step": 8940 + }, + { + "epoch": 1.526755995562004, + "grad_norm": 0.41284349215649063, + "learning_rate": 2.7279337379868493e-05, + "loss": 0.3786, + "step": 8945 + }, + { + "epoch": 1.527609456345481, + "grad_norm": 0.3161233364871583, + "learning_rate": 2.7263530601922103e-05, + "loss": 0.3678, + "step": 8950 + }, + { + "epoch": 1.5284629171289579, + "grad_norm": 0.39487784144550014, + "learning_rate": 2.7247723823975724e-05, + "loss": 0.3562, + "step": 8955 + }, + { + "epoch": 1.5293163779124348, + "grad_norm": 0.3068547919022294, + "learning_rate": 2.7231917046029338e-05, + "loss": 0.3689, + "step": 8960 + }, + { + "epoch": 1.530169838695912, + "grad_norm": 10.742316504226467, + "learning_rate": 2.7216110268082956e-05, + "loss": 0.3945, + "step": 8965 + }, + { + "epoch": 1.531023299479389, + "grad_norm": 0.655424979342714, + "learning_rate": 2.720030349013657e-05, + "loss": 0.3879, + "step": 8970 + }, + { + "epoch": 1.531876760262866, + "grad_norm": 0.40648843586294353, + "learning_rate": 2.7184496712190187e-05, + "loss": 0.3979, + "step": 8975 + }, + { + "epoch": 1.532730221046343, + "grad_norm": 0.4803385811688802, + "learning_rate": 2.7168689934243808e-05, + "loss": 0.3637, + "step": 8980 + }, + { + "epoch": 1.5335836818298199, + "grad_norm": 0.4245263455311067, + "learning_rate": 2.7152883156297422e-05, + "loss": 0.3618, + "step": 8985 + }, + { + "epoch": 1.5344371426132968, + "grad_norm": 0.39932903050533153, + "learning_rate": 2.713707637835104e-05, + "loss": 0.3761, + "step": 8990 + }, + { + "epoch": 1.535290603396774, + "grad_norm": 0.44406625007850303, + "learning_rate": 2.7121269600404654e-05, + "loss": 0.3915, + "step": 8995 + }, + { + "epoch": 1.536144064180251, + "grad_norm": 0.32849238846478673, + "learning_rate": 2.710546282245827e-05, + "loss": 0.3668, + "step": 9000 + }, + { + "epoch": 1.536997524963728, + "grad_norm": 0.3925200210031384, + "learning_rate": 2.7089656044511892e-05, + "loss": 0.3905, + "step": 9005 + }, + { + "epoch": 1.537850985747205, + "grad_norm": 0.7415926818661323, + "learning_rate": 2.7073849266565503e-05, + "loss": 0.3888, + "step": 9010 + }, + { + "epoch": 1.5387044465306818, + "grad_norm": 0.3707764983758511, + "learning_rate": 2.7058042488619123e-05, + "loss": 0.3605, + "step": 9015 + }, + { + "epoch": 1.5395579073141588, + "grad_norm": 0.3933176309338654, + "learning_rate": 2.7042235710672737e-05, + "loss": 0.3583, + "step": 9020 + }, + { + "epoch": 1.540411368097636, + "grad_norm": 0.3297557693413401, + "learning_rate": 2.7026428932726355e-05, + "loss": 0.3479, + "step": 9025 + }, + { + "epoch": 1.541264828881113, + "grad_norm": 0.4398309704400533, + "learning_rate": 2.701062215477997e-05, + "loss": 0.4116, + "step": 9030 + }, + { + "epoch": 1.54211828966459, + "grad_norm": 0.42724442868004253, + "learning_rate": 2.6994815376833586e-05, + "loss": 0.3671, + "step": 9035 + }, + { + "epoch": 1.542971750448067, + "grad_norm": 0.3989452060612025, + "learning_rate": 2.6979008598887207e-05, + "loss": 0.3718, + "step": 9040 + }, + { + "epoch": 1.5438252112315438, + "grad_norm": 0.37428979509096105, + "learning_rate": 2.696320182094082e-05, + "loss": 0.3786, + "step": 9045 + }, + { + "epoch": 1.544678672015021, + "grad_norm": 0.5076240642768487, + "learning_rate": 2.694739504299444e-05, + "loss": 0.377, + "step": 9050 + }, + { + "epoch": 1.5455321327984979, + "grad_norm": 0.7285376829530452, + "learning_rate": 2.6931588265048053e-05, + "loss": 0.3886, + "step": 9055 + }, + { + "epoch": 1.546385593581975, + "grad_norm": 0.5204765982447884, + "learning_rate": 2.691578148710167e-05, + "loss": 0.4023, + "step": 9060 + }, + { + "epoch": 1.547239054365452, + "grad_norm": 0.3569364007758203, + "learning_rate": 2.6899974709155284e-05, + "loss": 0.3562, + "step": 9065 + }, + { + "epoch": 1.5480925151489289, + "grad_norm": 0.3811351720837915, + "learning_rate": 2.6884167931208902e-05, + "loss": 0.3824, + "step": 9070 + }, + { + "epoch": 1.5489459759324058, + "grad_norm": 0.32962360850539596, + "learning_rate": 2.6868361153262523e-05, + "loss": 0.3924, + "step": 9075 + }, + { + "epoch": 1.549799436715883, + "grad_norm": 0.432679451843628, + "learning_rate": 2.6852554375316137e-05, + "loss": 0.3895, + "step": 9080 + }, + { + "epoch": 1.5506528974993599, + "grad_norm": 0.4945031259578414, + "learning_rate": 2.6836747597369754e-05, + "loss": 0.3868, + "step": 9085 + }, + { + "epoch": 1.551506358282837, + "grad_norm": 0.3701671834144567, + "learning_rate": 2.682094081942337e-05, + "loss": 0.3808, + "step": 9090 + }, + { + "epoch": 1.552359819066314, + "grad_norm": 0.3636808660556998, + "learning_rate": 2.6805134041476986e-05, + "loss": 0.3966, + "step": 9095 + }, + { + "epoch": 1.5532132798497909, + "grad_norm": 0.36849988756897306, + "learning_rate": 2.6789327263530607e-05, + "loss": 0.3855, + "step": 9100 + }, + { + "epoch": 1.5540667406332678, + "grad_norm": 0.5038956078553172, + "learning_rate": 2.677352048558422e-05, + "loss": 0.3869, + "step": 9105 + }, + { + "epoch": 1.554920201416745, + "grad_norm": 0.34468219300425484, + "learning_rate": 2.6757713707637838e-05, + "loss": 0.3686, + "step": 9110 + }, + { + "epoch": 1.555773662200222, + "grad_norm": 0.401718519770233, + "learning_rate": 2.6741906929691452e-05, + "loss": 0.3738, + "step": 9115 + }, + { + "epoch": 1.556627122983699, + "grad_norm": 0.4171816020041063, + "learning_rate": 2.672610015174507e-05, + "loss": 0.366, + "step": 9120 + }, + { + "epoch": 1.557480583767176, + "grad_norm": 0.39786329975089246, + "learning_rate": 2.6710293373798684e-05, + "loss": 0.3761, + "step": 9125 + }, + { + "epoch": 1.5583340445506528, + "grad_norm": 0.7040786337578375, + "learning_rate": 2.66944865958523e-05, + "loss": 0.3979, + "step": 9130 + }, + { + "epoch": 1.5591875053341298, + "grad_norm": 0.557664904636061, + "learning_rate": 2.6678679817905922e-05, + "loss": 0.3655, + "step": 9135 + }, + { + "epoch": 1.560040966117607, + "grad_norm": 0.3497624468193592, + "learning_rate": 2.6662873039959536e-05, + "loss": 0.3738, + "step": 9140 + }, + { + "epoch": 1.560894426901084, + "grad_norm": 0.40404210994387985, + "learning_rate": 2.6647066262013154e-05, + "loss": 0.3607, + "step": 9145 + }, + { + "epoch": 1.561747887684561, + "grad_norm": 0.36077575739029377, + "learning_rate": 2.6631259484066768e-05, + "loss": 0.371, + "step": 9150 + }, + { + "epoch": 1.5626013484680379, + "grad_norm": 0.4729302920624471, + "learning_rate": 2.6615452706120385e-05, + "loss": 0.3864, + "step": 9155 + }, + { + "epoch": 1.5634548092515148, + "grad_norm": 0.2874203120762668, + "learning_rate": 2.6599645928174e-05, + "loss": 0.3856, + "step": 9160 + }, + { + "epoch": 1.5643082700349917, + "grad_norm": 0.36858966565331214, + "learning_rate": 2.658383915022762e-05, + "loss": 0.3872, + "step": 9165 + }, + { + "epoch": 1.5651617308184689, + "grad_norm": 0.38098337930660564, + "learning_rate": 2.6568032372281237e-05, + "loss": 0.3847, + "step": 9170 + }, + { + "epoch": 1.566015191601946, + "grad_norm": 0.800233524254945, + "learning_rate": 2.655222559433485e-05, + "loss": 0.3685, + "step": 9175 + }, + { + "epoch": 1.566868652385423, + "grad_norm": 0.48202523987743545, + "learning_rate": 2.653641881638847e-05, + "loss": 0.3699, + "step": 9180 + }, + { + "epoch": 1.5677221131688999, + "grad_norm": 0.5688858346090507, + "learning_rate": 2.6520612038442083e-05, + "loss": 0.379, + "step": 9185 + }, + { + "epoch": 1.5685755739523768, + "grad_norm": 0.3767776776816685, + "learning_rate": 2.65048052604957e-05, + "loss": 0.3842, + "step": 9190 + }, + { + "epoch": 1.569429034735854, + "grad_norm": 0.3629471220319574, + "learning_rate": 2.648899848254932e-05, + "loss": 0.3647, + "step": 9195 + }, + { + "epoch": 1.5702824955193309, + "grad_norm": 0.3949093492225141, + "learning_rate": 2.6473191704602935e-05, + "loss": 0.3515, + "step": 9200 + }, + { + "epoch": 1.571135956302808, + "grad_norm": 0.43793661800851835, + "learning_rate": 2.6457384926656553e-05, + "loss": 0.3881, + "step": 9205 + }, + { + "epoch": 1.571989417086285, + "grad_norm": 0.3727516948465313, + "learning_rate": 2.6441578148710167e-05, + "loss": 0.3831, + "step": 9210 + }, + { + "epoch": 1.5728428778697618, + "grad_norm": 0.37882602636586166, + "learning_rate": 2.6425771370763784e-05, + "loss": 0.3851, + "step": 9215 + }, + { + "epoch": 1.5736963386532388, + "grad_norm": 0.33033538606628826, + "learning_rate": 2.64099645928174e-05, + "loss": 0.3765, + "step": 9220 + }, + { + "epoch": 1.574549799436716, + "grad_norm": 0.3784344567080915, + "learning_rate": 2.639415781487102e-05, + "loss": 0.39, + "step": 9225 + }, + { + "epoch": 1.5754032602201928, + "grad_norm": 0.35185440416602554, + "learning_rate": 2.6378351036924637e-05, + "loss": 0.3646, + "step": 9230 + }, + { + "epoch": 1.57625672100367, + "grad_norm": 0.3219427366807726, + "learning_rate": 2.636254425897825e-05, + "loss": 0.376, + "step": 9235 + }, + { + "epoch": 1.577110181787147, + "grad_norm": 0.35824132884480425, + "learning_rate": 2.634673748103187e-05, + "loss": 0.3897, + "step": 9240 + }, + { + "epoch": 1.5779636425706238, + "grad_norm": 0.34487482225992905, + "learning_rate": 2.6330930703085482e-05, + "loss": 0.3953, + "step": 9245 + }, + { + "epoch": 1.5788171033541007, + "grad_norm": 0.362644522036318, + "learning_rate": 2.63151239251391e-05, + "loss": 0.3762, + "step": 9250 + }, + { + "epoch": 1.579670564137578, + "grad_norm": 0.3780291497619649, + "learning_rate": 2.629931714719272e-05, + "loss": 0.376, + "step": 9255 + }, + { + "epoch": 1.580524024921055, + "grad_norm": 0.40088734596731784, + "learning_rate": 2.6283510369246335e-05, + "loss": 0.3823, + "step": 9260 + }, + { + "epoch": 1.581377485704532, + "grad_norm": 0.36782407616549584, + "learning_rate": 2.6267703591299952e-05, + "loss": 0.3907, + "step": 9265 + }, + { + "epoch": 1.5822309464880089, + "grad_norm": 0.5086138052713359, + "learning_rate": 2.6251896813353566e-05, + "loss": 0.3942, + "step": 9270 + }, + { + "epoch": 1.5830844072714858, + "grad_norm": 0.36939975099414485, + "learning_rate": 2.6236090035407184e-05, + "loss": 0.3888, + "step": 9275 + }, + { + "epoch": 1.5839378680549627, + "grad_norm": 0.3539713587920976, + "learning_rate": 2.6220283257460798e-05, + "loss": 0.3494, + "step": 9280 + }, + { + "epoch": 1.5847913288384399, + "grad_norm": 0.35250448460289285, + "learning_rate": 2.620447647951442e-05, + "loss": 0.3757, + "step": 9285 + }, + { + "epoch": 1.585644789621917, + "grad_norm": 0.5006557278450434, + "learning_rate": 2.6188669701568036e-05, + "loss": 0.37, + "step": 9290 + }, + { + "epoch": 1.586498250405394, + "grad_norm": 0.29962215222651417, + "learning_rate": 2.617286292362165e-05, + "loss": 0.3692, + "step": 9295 + }, + { + "epoch": 1.5873517111888709, + "grad_norm": 0.31878668347807404, + "learning_rate": 2.6157056145675268e-05, + "loss": 0.3486, + "step": 9300 + }, + { + "epoch": 1.5882051719723478, + "grad_norm": 0.290472656585368, + "learning_rate": 2.6141249367728882e-05, + "loss": 0.3823, + "step": 9305 + }, + { + "epoch": 1.5890586327558247, + "grad_norm": 0.34575943425850814, + "learning_rate": 2.61254425897825e-05, + "loss": 0.3585, + "step": 9310 + }, + { + "epoch": 1.5899120935393019, + "grad_norm": 0.6272976523179152, + "learning_rate": 2.6109635811836113e-05, + "loss": 0.3926, + "step": 9315 + }, + { + "epoch": 1.590765554322779, + "grad_norm": 0.43107217052336677, + "learning_rate": 2.6093829033889734e-05, + "loss": 0.3863, + "step": 9320 + }, + { + "epoch": 1.591619015106256, + "grad_norm": 0.43085347816235414, + "learning_rate": 2.607802225594335e-05, + "loss": 0.3793, + "step": 9325 + }, + { + "epoch": 1.5924724758897328, + "grad_norm": 0.34121218921916524, + "learning_rate": 2.6062215477996966e-05, + "loss": 0.3767, + "step": 9330 + }, + { + "epoch": 1.5933259366732098, + "grad_norm": 0.434036624709166, + "learning_rate": 2.6046408700050583e-05, + "loss": 0.3608, + "step": 9335 + }, + { + "epoch": 1.594179397456687, + "grad_norm": 0.4432817785731385, + "learning_rate": 2.6030601922104197e-05, + "loss": 0.3775, + "step": 9340 + }, + { + "epoch": 1.5950328582401638, + "grad_norm": 0.45732496141433143, + "learning_rate": 2.6014795144157818e-05, + "loss": 0.3737, + "step": 9345 + }, + { + "epoch": 1.595886319023641, + "grad_norm": 0.3275847167701077, + "learning_rate": 2.5998988366211435e-05, + "loss": 0.3585, + "step": 9350 + }, + { + "epoch": 1.596739779807118, + "grad_norm": 0.4462161939109244, + "learning_rate": 2.598318158826505e-05, + "loss": 0.3812, + "step": 9355 + }, + { + "epoch": 1.5975932405905948, + "grad_norm": 0.36722959104251734, + "learning_rate": 2.5967374810318667e-05, + "loss": 0.361, + "step": 9360 + }, + { + "epoch": 1.5984467013740717, + "grad_norm": 0.3708369371991551, + "learning_rate": 2.595156803237228e-05, + "loss": 0.3524, + "step": 9365 + }, + { + "epoch": 1.5993001621575489, + "grad_norm": 0.38301506596702944, + "learning_rate": 2.59357612544259e-05, + "loss": 0.3771, + "step": 9370 + }, + { + "epoch": 1.6001536229410258, + "grad_norm": 0.3112861622370914, + "learning_rate": 2.5919954476479513e-05, + "loss": 0.3828, + "step": 9375 + }, + { + "epoch": 1.601007083724503, + "grad_norm": 0.5593603068779465, + "learning_rate": 2.5904147698533133e-05, + "loss": 0.3909, + "step": 9380 + }, + { + "epoch": 1.6018605445079799, + "grad_norm": 0.324086537459784, + "learning_rate": 2.588834092058675e-05, + "loss": 0.3806, + "step": 9385 + }, + { + "epoch": 1.6027140052914568, + "grad_norm": 0.3909912111430699, + "learning_rate": 2.5872534142640365e-05, + "loss": 0.3779, + "step": 9390 + }, + { + "epoch": 1.6035674660749337, + "grad_norm": 0.3699949482868292, + "learning_rate": 2.5856727364693982e-05, + "loss": 0.3646, + "step": 9395 + }, + { + "epoch": 1.6044209268584109, + "grad_norm": 0.28406496348136745, + "learning_rate": 2.5840920586747596e-05, + "loss": 0.3579, + "step": 9400 + }, + { + "epoch": 1.605274387641888, + "grad_norm": 0.3616251180170039, + "learning_rate": 2.5825113808801217e-05, + "loss": 0.3526, + "step": 9405 + }, + { + "epoch": 1.606127848425365, + "grad_norm": 0.809971232093663, + "learning_rate": 2.5809307030854828e-05, + "loss": 0.3855, + "step": 9410 + }, + { + "epoch": 1.6069813092088419, + "grad_norm": 0.3519168231586514, + "learning_rate": 2.579350025290845e-05, + "loss": 0.3811, + "step": 9415 + }, + { + "epoch": 1.6078347699923188, + "grad_norm": 0.3721702280093411, + "learning_rate": 2.5777693474962066e-05, + "loss": 0.3781, + "step": 9420 + }, + { + "epoch": 1.6086882307757957, + "grad_norm": 0.3459379061704033, + "learning_rate": 2.576188669701568e-05, + "loss": 0.3824, + "step": 9425 + }, + { + "epoch": 1.6095416915592728, + "grad_norm": 0.38834628814085237, + "learning_rate": 2.5746079919069298e-05, + "loss": 0.3668, + "step": 9430 + }, + { + "epoch": 1.61039515234275, + "grad_norm": 0.3766126582753333, + "learning_rate": 2.5730273141122912e-05, + "loss": 0.3779, + "step": 9435 + }, + { + "epoch": 1.611248613126227, + "grad_norm": 0.5430538563590361, + "learning_rate": 2.5714466363176533e-05, + "loss": 0.3662, + "step": 9440 + }, + { + "epoch": 1.6121020739097038, + "grad_norm": 0.5057585205733404, + "learning_rate": 2.569865958523015e-05, + "loss": 0.3974, + "step": 9445 + }, + { + "epoch": 1.6129555346931808, + "grad_norm": 0.3993719986455356, + "learning_rate": 2.5682852807283764e-05, + "loss": 0.3776, + "step": 9450 + }, + { + "epoch": 1.6138089954766577, + "grad_norm": 0.43665620531008, + "learning_rate": 2.5667046029337382e-05, + "loss": 0.3669, + "step": 9455 + }, + { + "epoch": 1.6146624562601348, + "grad_norm": 0.37226881957182756, + "learning_rate": 2.5651239251390996e-05, + "loss": 0.3794, + "step": 9460 + }, + { + "epoch": 1.615515917043612, + "grad_norm": 0.630966720469816, + "learning_rate": 2.5635432473444613e-05, + "loss": 0.3592, + "step": 9465 + }, + { + "epoch": 1.616369377827089, + "grad_norm": 0.3939621369972374, + "learning_rate": 2.5619625695498227e-05, + "loss": 0.3896, + "step": 9470 + }, + { + "epoch": 1.6172228386105658, + "grad_norm": 0.43487915001420685, + "learning_rate": 2.5603818917551848e-05, + "loss": 0.3566, + "step": 9475 + }, + { + "epoch": 1.6180762993940427, + "grad_norm": 0.35916149088239563, + "learning_rate": 2.5588012139605466e-05, + "loss": 0.3883, + "step": 9480 + }, + { + "epoch": 1.6189297601775199, + "grad_norm": 0.36317699165579764, + "learning_rate": 2.557220536165908e-05, + "loss": 0.3516, + "step": 9485 + }, + { + "epoch": 1.6197832209609968, + "grad_norm": 0.351646812340972, + "learning_rate": 2.5556398583712697e-05, + "loss": 0.3682, + "step": 9490 + }, + { + "epoch": 1.620636681744474, + "grad_norm": 0.5051537961986949, + "learning_rate": 2.554059180576631e-05, + "loss": 0.3839, + "step": 9495 + }, + { + "epoch": 1.6214901425279509, + "grad_norm": 0.37903178035144436, + "learning_rate": 2.5524785027819932e-05, + "loss": 0.3817, + "step": 9500 + }, + { + "epoch": 1.6223436033114278, + "grad_norm": 0.41297845376305514, + "learning_rate": 2.550897824987355e-05, + "loss": 0.3823, + "step": 9505 + }, + { + "epoch": 1.6231970640949047, + "grad_norm": 0.36913854812775393, + "learning_rate": 2.5493171471927164e-05, + "loss": 0.3683, + "step": 9510 + }, + { + "epoch": 1.6240505248783819, + "grad_norm": 0.3610013396592195, + "learning_rate": 2.547736469398078e-05, + "loss": 0.3749, + "step": 9515 + }, + { + "epoch": 1.6249039856618588, + "grad_norm": 0.3870563224746467, + "learning_rate": 2.5461557916034395e-05, + "loss": 0.3783, + "step": 9520 + }, + { + "epoch": 1.625757446445336, + "grad_norm": 0.41329546305715414, + "learning_rate": 2.5445751138088013e-05, + "loss": 0.3624, + "step": 9525 + }, + { + "epoch": 1.6266109072288129, + "grad_norm": 0.3316826147883295, + "learning_rate": 2.5429944360141627e-05, + "loss": 0.3789, + "step": 9530 + }, + { + "epoch": 1.6274643680122898, + "grad_norm": 0.34889825755925274, + "learning_rate": 2.5414137582195248e-05, + "loss": 0.3531, + "step": 9535 + }, + { + "epoch": 1.6283178287957667, + "grad_norm": 0.35270280103669066, + "learning_rate": 2.5398330804248865e-05, + "loss": 0.3634, + "step": 9540 + }, + { + "epoch": 1.6291712895792438, + "grad_norm": 0.3818055527627773, + "learning_rate": 2.538252402630248e-05, + "loss": 0.367, + "step": 9545 + }, + { + "epoch": 1.630024750362721, + "grad_norm": 0.37334001874481865, + "learning_rate": 2.5366717248356097e-05, + "loss": 0.3504, + "step": 9550 + }, + { + "epoch": 1.630878211146198, + "grad_norm": 0.36463766396357167, + "learning_rate": 2.535091047040971e-05, + "loss": 0.3832, + "step": 9555 + }, + { + "epoch": 1.6317316719296748, + "grad_norm": 0.5269109992538331, + "learning_rate": 2.533510369246333e-05, + "loss": 0.3837, + "step": 9560 + }, + { + "epoch": 1.6325851327131518, + "grad_norm": 0.3894795153131237, + "learning_rate": 2.5319296914516942e-05, + "loss": 0.3826, + "step": 9565 + }, + { + "epoch": 1.6334385934966287, + "grad_norm": 0.4911555396539967, + "learning_rate": 2.5303490136570563e-05, + "loss": 0.3696, + "step": 9570 + }, + { + "epoch": 1.6342920542801058, + "grad_norm": 0.35874576935761504, + "learning_rate": 2.528768335862418e-05, + "loss": 0.3484, + "step": 9575 + }, + { + "epoch": 1.635145515063583, + "grad_norm": 0.3238490838476584, + "learning_rate": 2.5271876580677794e-05, + "loss": 0.3627, + "step": 9580 + }, + { + "epoch": 1.6359989758470599, + "grad_norm": 0.42527871728802813, + "learning_rate": 2.5256069802731412e-05, + "loss": 0.3923, + "step": 9585 + }, + { + "epoch": 1.6368524366305368, + "grad_norm": 0.32991180591923897, + "learning_rate": 2.5240263024785026e-05, + "loss": 0.3655, + "step": 9590 + }, + { + "epoch": 1.6377058974140137, + "grad_norm": 0.37893644741260296, + "learning_rate": 2.5224456246838647e-05, + "loss": 0.3674, + "step": 9595 + }, + { + "epoch": 1.6385593581974909, + "grad_norm": 0.36987743038688486, + "learning_rate": 2.5208649468892264e-05, + "loss": 0.3351, + "step": 9600 + }, + { + "epoch": 1.6394128189809678, + "grad_norm": 0.3811319455423694, + "learning_rate": 2.519284269094588e-05, + "loss": 0.3729, + "step": 9605 + }, + { + "epoch": 1.640266279764445, + "grad_norm": 0.35332370354771675, + "learning_rate": 2.5177035912999496e-05, + "loss": 0.3748, + "step": 9610 + }, + { + "epoch": 1.6411197405479219, + "grad_norm": 0.37714158965201894, + "learning_rate": 2.516122913505311e-05, + "loss": 0.3599, + "step": 9615 + }, + { + "epoch": 1.6419732013313988, + "grad_norm": 0.3532253534495296, + "learning_rate": 2.514542235710673e-05, + "loss": 0.4038, + "step": 9620 + }, + { + "epoch": 1.6428266621148757, + "grad_norm": 0.43917181291586693, + "learning_rate": 2.512961557916034e-05, + "loss": 0.3575, + "step": 9625 + }, + { + "epoch": 1.6436801228983529, + "grad_norm": 0.38639765123345415, + "learning_rate": 2.5113808801213962e-05, + "loss": 0.3534, + "step": 9630 + }, + { + "epoch": 1.6445335836818298, + "grad_norm": 0.3277553454765587, + "learning_rate": 2.509800202326758e-05, + "loss": 0.3479, + "step": 9635 + }, + { + "epoch": 1.645387044465307, + "grad_norm": 0.40405233020628756, + "learning_rate": 2.5082195245321194e-05, + "loss": 0.3726, + "step": 9640 + }, + { + "epoch": 1.6462405052487838, + "grad_norm": 0.40336960689920276, + "learning_rate": 2.506638846737481e-05, + "loss": 0.3843, + "step": 9645 + }, + { + "epoch": 1.6470939660322608, + "grad_norm": 0.3713185152846159, + "learning_rate": 2.5050581689428425e-05, + "loss": 0.3696, + "step": 9650 + }, + { + "epoch": 1.6479474268157377, + "grad_norm": 0.30365171938062613, + "learning_rate": 2.5034774911482046e-05, + "loss": 0.3871, + "step": 9655 + }, + { + "epoch": 1.6488008875992148, + "grad_norm": 0.39868229296031327, + "learning_rate": 2.5018968133535664e-05, + "loss": 0.3832, + "step": 9660 + }, + { + "epoch": 1.649654348382692, + "grad_norm": 0.31783690689314664, + "learning_rate": 2.5003161355589278e-05, + "loss": 0.3587, + "step": 9665 + }, + { + "epoch": 1.650507809166169, + "grad_norm": 0.44308631034536705, + "learning_rate": 2.4987354577642895e-05, + "loss": 0.3931, + "step": 9670 + }, + { + "epoch": 1.6513612699496458, + "grad_norm": 0.358684922290691, + "learning_rate": 2.497154779969651e-05, + "loss": 0.3561, + "step": 9675 + }, + { + "epoch": 1.6522147307331227, + "grad_norm": 0.3480662742675701, + "learning_rate": 2.495574102175013e-05, + "loss": 0.3917, + "step": 9680 + }, + { + "epoch": 1.6530681915165997, + "grad_norm": 0.344709757767127, + "learning_rate": 2.4939934243803744e-05, + "loss": 0.3533, + "step": 9685 + }, + { + "epoch": 1.6539216523000768, + "grad_norm": 0.5403638550208598, + "learning_rate": 2.492412746585736e-05, + "loss": 0.3756, + "step": 9690 + }, + { + "epoch": 1.654775113083554, + "grad_norm": 0.49560256448326895, + "learning_rate": 2.4908320687910976e-05, + "loss": 0.3619, + "step": 9695 + }, + { + "epoch": 1.6556285738670309, + "grad_norm": 0.4479970829130771, + "learning_rate": 2.4892513909964593e-05, + "loss": 0.3785, + "step": 9700 + }, + { + "epoch": 1.6564820346505078, + "grad_norm": 0.3643797818915992, + "learning_rate": 2.487670713201821e-05, + "loss": 0.3768, + "step": 9705 + }, + { + "epoch": 1.6573354954339847, + "grad_norm": 0.30986452031482137, + "learning_rate": 2.4860900354071828e-05, + "loss": 0.3684, + "step": 9710 + }, + { + "epoch": 1.6581889562174617, + "grad_norm": 0.34834214775622496, + "learning_rate": 2.4845093576125446e-05, + "loss": 0.3846, + "step": 9715 + }, + { + "epoch": 1.6590424170009388, + "grad_norm": 0.3240949698223232, + "learning_rate": 2.482928679817906e-05, + "loss": 0.3486, + "step": 9720 + }, + { + "epoch": 1.659895877784416, + "grad_norm": 0.4986974751176869, + "learning_rate": 2.4813480020232677e-05, + "loss": 0.3601, + "step": 9725 + }, + { + "epoch": 1.6607493385678929, + "grad_norm": 0.30454103767734464, + "learning_rate": 2.4797673242286294e-05, + "loss": 0.3635, + "step": 9730 + }, + { + "epoch": 1.6616027993513698, + "grad_norm": 0.4645322349398634, + "learning_rate": 2.478186646433991e-05, + "loss": 0.3974, + "step": 9735 + }, + { + "epoch": 1.6624562601348467, + "grad_norm": 0.33063087630227694, + "learning_rate": 2.476605968639353e-05, + "loss": 0.3595, + "step": 9740 + }, + { + "epoch": 1.6633097209183239, + "grad_norm": 0.3202227448808457, + "learning_rate": 2.4750252908447143e-05, + "loss": 0.3595, + "step": 9745 + }, + { + "epoch": 1.6641631817018008, + "grad_norm": 1.0984498791246415, + "learning_rate": 2.473444613050076e-05, + "loss": 0.3525, + "step": 9750 + }, + { + "epoch": 1.665016642485278, + "grad_norm": 0.3007709975602925, + "learning_rate": 2.4718639352554375e-05, + "loss": 0.3554, + "step": 9755 + }, + { + "epoch": 1.6658701032687548, + "grad_norm": 0.472316702108902, + "learning_rate": 2.4702832574607992e-05, + "loss": 0.3762, + "step": 9760 + }, + { + "epoch": 1.6667235640522318, + "grad_norm": 0.31508308960368503, + "learning_rate": 2.468702579666161e-05, + "loss": 0.3643, + "step": 9765 + }, + { + "epoch": 1.6675770248357087, + "grad_norm": 0.37722753928340996, + "learning_rate": 2.4671219018715227e-05, + "loss": 0.3902, + "step": 9770 + }, + { + "epoch": 1.6684304856191858, + "grad_norm": 0.3586764574812393, + "learning_rate": 2.4655412240768845e-05, + "loss": 0.3601, + "step": 9775 + }, + { + "epoch": 1.6692839464026628, + "grad_norm": 0.6380925765420232, + "learning_rate": 2.463960546282246e-05, + "loss": 0.3617, + "step": 9780 + }, + { + "epoch": 1.67013740718614, + "grad_norm": 0.3383616224692915, + "learning_rate": 2.4623798684876076e-05, + "loss": 0.3576, + "step": 9785 + }, + { + "epoch": 1.6709908679696168, + "grad_norm": 0.31425957298125445, + "learning_rate": 2.4607991906929694e-05, + "loss": 0.3393, + "step": 9790 + }, + { + "epoch": 1.6718443287530937, + "grad_norm": 0.4147864909222179, + "learning_rate": 2.4592185128983308e-05, + "loss": 0.3428, + "step": 9795 + }, + { + "epoch": 1.6726977895365707, + "grad_norm": 0.45264259568950715, + "learning_rate": 2.4576378351036925e-05, + "loss": 0.3886, + "step": 9800 + }, + { + "epoch": 1.6735512503200478, + "grad_norm": 0.36370608206159005, + "learning_rate": 2.4560571573090543e-05, + "loss": 0.3701, + "step": 9805 + }, + { + "epoch": 1.674404711103525, + "grad_norm": 0.3361008102105946, + "learning_rate": 2.454476479514416e-05, + "loss": 0.3765, + "step": 9810 + }, + { + "epoch": 1.6752581718870019, + "grad_norm": 0.31285208882339277, + "learning_rate": 2.4528958017197774e-05, + "loss": 0.3756, + "step": 9815 + }, + { + "epoch": 1.6761116326704788, + "grad_norm": 0.329371451846838, + "learning_rate": 2.4513151239251392e-05, + "loss": 0.3731, + "step": 9820 + }, + { + "epoch": 1.6769650934539557, + "grad_norm": 0.34101086796868185, + "learning_rate": 2.449734446130501e-05, + "loss": 0.3692, + "step": 9825 + }, + { + "epoch": 1.6778185542374326, + "grad_norm": 0.327415797309251, + "learning_rate": 2.4481537683358623e-05, + "loss": 0.3579, + "step": 9830 + }, + { + "epoch": 1.6786720150209098, + "grad_norm": 0.36212891124970775, + "learning_rate": 2.4465730905412244e-05, + "loss": 0.3544, + "step": 9835 + }, + { + "epoch": 1.679525475804387, + "grad_norm": 0.4808949754230029, + "learning_rate": 2.4449924127465858e-05, + "loss": 0.3717, + "step": 9840 + }, + { + "epoch": 1.6803789365878639, + "grad_norm": 0.35727492749761064, + "learning_rate": 2.4434117349519476e-05, + "loss": 0.37, + "step": 9845 + }, + { + "epoch": 1.6812323973713408, + "grad_norm": 0.5483344985996038, + "learning_rate": 2.441831057157309e-05, + "loss": 0.3586, + "step": 9850 + }, + { + "epoch": 1.6820858581548177, + "grad_norm": 0.38437005134873453, + "learning_rate": 2.4402503793626707e-05, + "loss": 0.3656, + "step": 9855 + }, + { + "epoch": 1.6829393189382946, + "grad_norm": 0.38040213577450993, + "learning_rate": 2.4386697015680325e-05, + "loss": 0.3733, + "step": 9860 + }, + { + "epoch": 1.6837927797217718, + "grad_norm": 0.35000993626904336, + "learning_rate": 2.4370890237733942e-05, + "loss": 0.355, + "step": 9865 + }, + { + "epoch": 1.684646240505249, + "grad_norm": 0.4048599449467024, + "learning_rate": 2.435508345978756e-05, + "loss": 0.3836, + "step": 9870 + }, + { + "epoch": 1.6854997012887258, + "grad_norm": 0.3751379685656599, + "learning_rate": 2.4339276681841174e-05, + "loss": 0.3967, + "step": 9875 + }, + { + "epoch": 1.6863531620722028, + "grad_norm": 0.3165261830400712, + "learning_rate": 2.432346990389479e-05, + "loss": 0.3419, + "step": 9880 + }, + { + "epoch": 1.6872066228556797, + "grad_norm": 0.38120809943595424, + "learning_rate": 2.430766312594841e-05, + "loss": 0.3962, + "step": 9885 + }, + { + "epoch": 1.6880600836391568, + "grad_norm": 0.3165567647867523, + "learning_rate": 2.4291856348002023e-05, + "loss": 0.3799, + "step": 9890 + }, + { + "epoch": 1.6889135444226338, + "grad_norm": 0.35431128900669323, + "learning_rate": 2.4276049570055643e-05, + "loss": 0.3442, + "step": 9895 + }, + { + "epoch": 1.689767005206111, + "grad_norm": 0.3404306623210263, + "learning_rate": 2.4260242792109258e-05, + "loss": 0.3576, + "step": 9900 + }, + { + "epoch": 1.6906204659895878, + "grad_norm": 0.38923418157520556, + "learning_rate": 2.4244436014162875e-05, + "loss": 0.3702, + "step": 9905 + }, + { + "epoch": 1.6914739267730647, + "grad_norm": 0.3701957123731767, + "learning_rate": 2.422862923621649e-05, + "loss": 0.3564, + "step": 9910 + }, + { + "epoch": 1.6923273875565417, + "grad_norm": 0.3662805891531023, + "learning_rate": 2.4212822458270107e-05, + "loss": 0.394, + "step": 9915 + }, + { + "epoch": 1.6931808483400188, + "grad_norm": 0.3420394235920587, + "learning_rate": 2.4197015680323724e-05, + "loss": 0.3418, + "step": 9920 + }, + { + "epoch": 1.6940343091234957, + "grad_norm": 0.30379527638119164, + "learning_rate": 2.418120890237734e-05, + "loss": 0.347, + "step": 9925 + }, + { + "epoch": 1.6948877699069729, + "grad_norm": 0.3383450236304016, + "learning_rate": 2.416540212443096e-05, + "loss": 0.3895, + "step": 9930 + }, + { + "epoch": 1.6957412306904498, + "grad_norm": 0.36141397006934706, + "learning_rate": 2.4149595346484573e-05, + "loss": 0.3588, + "step": 9935 + }, + { + "epoch": 1.6965946914739267, + "grad_norm": 0.34458445452666825, + "learning_rate": 2.413378856853819e-05, + "loss": 0.3729, + "step": 9940 + }, + { + "epoch": 1.6974481522574036, + "grad_norm": 0.3007875269036116, + "learning_rate": 2.4117981790591808e-05, + "loss": 0.3794, + "step": 9945 + }, + { + "epoch": 1.6983016130408808, + "grad_norm": 0.3532242665277833, + "learning_rate": 2.4102175012645422e-05, + "loss": 0.3651, + "step": 9950 + }, + { + "epoch": 1.699155073824358, + "grad_norm": 0.3374537867998225, + "learning_rate": 2.408636823469904e-05, + "loss": 0.3726, + "step": 9955 + }, + { + "epoch": 1.7000085346078349, + "grad_norm": 0.3328228201959643, + "learning_rate": 2.4070561456752657e-05, + "loss": 0.36, + "step": 9960 + }, + { + "epoch": 1.7008619953913118, + "grad_norm": 0.29198609814295523, + "learning_rate": 2.4054754678806274e-05, + "loss": 0.3924, + "step": 9965 + }, + { + "epoch": 1.7017154561747887, + "grad_norm": 0.3613528956300614, + "learning_rate": 2.403894790085989e-05, + "loss": 0.3762, + "step": 9970 + }, + { + "epoch": 1.7025689169582656, + "grad_norm": 0.42149928313668533, + "learning_rate": 2.4023141122913506e-05, + "loss": 0.387, + "step": 9975 + }, + { + "epoch": 1.7034223777417428, + "grad_norm": 0.4093811404104952, + "learning_rate": 2.4007334344967123e-05, + "loss": 0.3757, + "step": 9980 + }, + { + "epoch": 1.70427583852522, + "grad_norm": 0.31232798392425404, + "learning_rate": 2.3991527567020737e-05, + "loss": 0.3588, + "step": 9985 + }, + { + "epoch": 1.7051292993086968, + "grad_norm": 0.39676720303151275, + "learning_rate": 2.3975720789074358e-05, + "loss": 0.3714, + "step": 9990 + }, + { + "epoch": 1.7059827600921738, + "grad_norm": 1.2822911601946423, + "learning_rate": 2.3959914011127972e-05, + "loss": 0.3758, + "step": 9995 + }, + { + "epoch": 1.7068362208756507, + "grad_norm": 0.38618418843218183, + "learning_rate": 2.394410723318159e-05, + "loss": 0.356, + "step": 10000 + }, + { + "epoch": 1.7076896816591276, + "grad_norm": 0.28221632964104154, + "learning_rate": 2.3928300455235207e-05, + "loss": 0.3503, + "step": 10005 + }, + { + "epoch": 1.7085431424426047, + "grad_norm": 1.8906529696817969, + "learning_rate": 2.391249367728882e-05, + "loss": 0.3981, + "step": 10010 + }, + { + "epoch": 1.709396603226082, + "grad_norm": 0.37767485910480847, + "learning_rate": 2.389668689934244e-05, + "loss": 0.3476, + "step": 10015 + }, + { + "epoch": 1.7102500640095588, + "grad_norm": 0.5819221743981302, + "learning_rate": 2.3880880121396056e-05, + "loss": 0.3376, + "step": 10020 + }, + { + "epoch": 1.7111035247930357, + "grad_norm": 0.36288759811979004, + "learning_rate": 2.3865073343449674e-05, + "loss": 0.3754, + "step": 10025 + }, + { + "epoch": 1.7119569855765127, + "grad_norm": 0.3691645217803475, + "learning_rate": 2.3849266565503288e-05, + "loss": 0.3642, + "step": 10030 + }, + { + "epoch": 1.7128104463599898, + "grad_norm": 0.3892005134188453, + "learning_rate": 2.3833459787556905e-05, + "loss": 0.3643, + "step": 10035 + }, + { + "epoch": 1.7136639071434667, + "grad_norm": 0.3789287552931439, + "learning_rate": 2.3817653009610523e-05, + "loss": 0.3512, + "step": 10040 + }, + { + "epoch": 1.7145173679269439, + "grad_norm": 0.497272194873337, + "learning_rate": 2.3801846231664137e-05, + "loss": 0.3353, + "step": 10045 + }, + { + "epoch": 1.7153708287104208, + "grad_norm": 0.7177210423529197, + "learning_rate": 2.3786039453717758e-05, + "loss": 0.3646, + "step": 10050 + }, + { + "epoch": 1.7162242894938977, + "grad_norm": 0.35545288550050264, + "learning_rate": 2.377023267577137e-05, + "loss": 0.3839, + "step": 10055 + }, + { + "epoch": 1.7170777502773746, + "grad_norm": 0.3475567969581151, + "learning_rate": 2.375442589782499e-05, + "loss": 0.3621, + "step": 10060 + }, + { + "epoch": 1.7179312110608518, + "grad_norm": 0.32022819698741734, + "learning_rate": 2.3738619119878607e-05, + "loss": 0.367, + "step": 10065 + }, + { + "epoch": 1.7187846718443287, + "grad_norm": 0.38584311787546577, + "learning_rate": 2.372281234193222e-05, + "loss": 0.3487, + "step": 10070 + }, + { + "epoch": 1.7196381326278058, + "grad_norm": 0.385576863467266, + "learning_rate": 2.3707005563985838e-05, + "loss": 0.3812, + "step": 10075 + }, + { + "epoch": 1.7204915934112828, + "grad_norm": 0.29632475338105196, + "learning_rate": 2.3691198786039452e-05, + "loss": 0.369, + "step": 10080 + }, + { + "epoch": 1.7213450541947597, + "grad_norm": 0.3455973736066388, + "learning_rate": 2.3675392008093073e-05, + "loss": 0.358, + "step": 10085 + }, + { + "epoch": 1.7221985149782366, + "grad_norm": 0.3218490085562857, + "learning_rate": 2.3659585230146687e-05, + "loss": 0.3736, + "step": 10090 + }, + { + "epoch": 1.7230519757617138, + "grad_norm": 0.40162109561154596, + "learning_rate": 2.3643778452200305e-05, + "loss": 0.3573, + "step": 10095 + }, + { + "epoch": 1.723905436545191, + "grad_norm": 0.4538109362924726, + "learning_rate": 2.3627971674253922e-05, + "loss": 0.3653, + "step": 10100 + }, + { + "epoch": 1.7247588973286678, + "grad_norm": 0.33572753039570163, + "learning_rate": 2.3612164896307536e-05, + "loss": 0.354, + "step": 10105 + }, + { + "epoch": 1.7256123581121448, + "grad_norm": 0.32031756723298443, + "learning_rate": 2.3596358118361154e-05, + "loss": 0.3789, + "step": 10110 + }, + { + "epoch": 1.7264658188956217, + "grad_norm": 0.29450177423269897, + "learning_rate": 2.358055134041477e-05, + "loss": 0.3616, + "step": 10115 + }, + { + "epoch": 1.7273192796790986, + "grad_norm": 0.3565780967071999, + "learning_rate": 2.356474456246839e-05, + "loss": 0.3696, + "step": 10120 + }, + { + "epoch": 1.7281727404625757, + "grad_norm": 0.4310022631641016, + "learning_rate": 2.3548937784522006e-05, + "loss": 0.3505, + "step": 10125 + }, + { + "epoch": 1.7290262012460529, + "grad_norm": 0.3691592372099381, + "learning_rate": 2.353313100657562e-05, + "loss": 0.3567, + "step": 10130 + }, + { + "epoch": 1.7298796620295298, + "grad_norm": 0.316140033477628, + "learning_rate": 2.3517324228629237e-05, + "loss": 0.3406, + "step": 10135 + }, + { + "epoch": 1.7307331228130067, + "grad_norm": 0.3632478706450496, + "learning_rate": 2.350151745068285e-05, + "loss": 0.3545, + "step": 10140 + }, + { + "epoch": 1.7315865835964837, + "grad_norm": 0.4417561062140959, + "learning_rate": 2.3485710672736472e-05, + "loss": 0.3608, + "step": 10145 + }, + { + "epoch": 1.7324400443799606, + "grad_norm": 0.3284369454386695, + "learning_rate": 2.3469903894790086e-05, + "loss": 0.3714, + "step": 10150 + }, + { + "epoch": 1.7332935051634377, + "grad_norm": 0.3769972935790573, + "learning_rate": 2.3454097116843704e-05, + "loss": 0.3675, + "step": 10155 + }, + { + "epoch": 1.7341469659469149, + "grad_norm": 0.39666950231256687, + "learning_rate": 2.343829033889732e-05, + "loss": 0.3866, + "step": 10160 + }, + { + "epoch": 1.7350004267303918, + "grad_norm": 0.4108714616727913, + "learning_rate": 2.3422483560950935e-05, + "loss": 0.3868, + "step": 10165 + }, + { + "epoch": 1.7358538875138687, + "grad_norm": 0.36737787273895234, + "learning_rate": 2.3406676783004553e-05, + "loss": 0.3769, + "step": 10170 + }, + { + "epoch": 1.7367073482973456, + "grad_norm": 0.4113399543149594, + "learning_rate": 2.339087000505817e-05, + "loss": 0.388, + "step": 10175 + }, + { + "epoch": 1.7375608090808228, + "grad_norm": 0.39169442625744216, + "learning_rate": 2.3375063227111788e-05, + "loss": 0.3444, + "step": 10180 + }, + { + "epoch": 1.7384142698642997, + "grad_norm": 0.29177123848011854, + "learning_rate": 2.3359256449165405e-05, + "loss": 0.3753, + "step": 10185 + }, + { + "epoch": 1.7392677306477768, + "grad_norm": 0.3383770494978673, + "learning_rate": 2.334344967121902e-05, + "loss": 0.3561, + "step": 10190 + }, + { + "epoch": 1.7401211914312538, + "grad_norm": 0.3724319975484265, + "learning_rate": 2.3327642893272637e-05, + "loss": 0.3926, + "step": 10195 + }, + { + "epoch": 1.7409746522147307, + "grad_norm": 0.4986854755701425, + "learning_rate": 2.331183611532625e-05, + "loss": 0.3941, + "step": 10200 + }, + { + "epoch": 1.7418281129982076, + "grad_norm": 0.4390558287126891, + "learning_rate": 2.3296029337379868e-05, + "loss": 0.3591, + "step": 10205 + }, + { + "epoch": 1.7426815737816848, + "grad_norm": 0.33959687704720026, + "learning_rate": 2.3280222559433486e-05, + "loss": 0.3671, + "step": 10210 + }, + { + "epoch": 1.7435350345651617, + "grad_norm": 0.29763719970702696, + "learning_rate": 2.3264415781487103e-05, + "loss": 0.3561, + "step": 10215 + }, + { + "epoch": 1.7443884953486388, + "grad_norm": 0.475731508454673, + "learning_rate": 2.324860900354072e-05, + "loss": 0.3844, + "step": 10220 + }, + { + "epoch": 1.7452419561321157, + "grad_norm": 0.3356547866961049, + "learning_rate": 2.3232802225594335e-05, + "loss": 0.387, + "step": 10225 + }, + { + "epoch": 1.7460954169155927, + "grad_norm": 0.3505708880260287, + "learning_rate": 2.3216995447647952e-05, + "loss": 0.3617, + "step": 10230 + }, + { + "epoch": 1.7469488776990696, + "grad_norm": 0.39851400765382256, + "learning_rate": 2.3201188669701566e-05, + "loss": 0.3761, + "step": 10235 + }, + { + "epoch": 1.7478023384825467, + "grad_norm": 0.3762568602632757, + "learning_rate": 2.3185381891755187e-05, + "loss": 0.3758, + "step": 10240 + }, + { + "epoch": 1.7486557992660239, + "grad_norm": 0.35144964132478174, + "learning_rate": 2.3169575113808805e-05, + "loss": 0.3809, + "step": 10245 + }, + { + "epoch": 1.7495092600495008, + "grad_norm": 0.41360431860790936, + "learning_rate": 2.315376833586242e-05, + "loss": 0.3545, + "step": 10250 + }, + { + "epoch": 1.7503627208329777, + "grad_norm": 0.3154093255362032, + "learning_rate": 2.3137961557916036e-05, + "loss": 0.3491, + "step": 10255 + }, + { + "epoch": 1.7512161816164546, + "grad_norm": 1.4137719194929725, + "learning_rate": 2.312215477996965e-05, + "loss": 0.3721, + "step": 10260 + }, + { + "epoch": 1.7520696423999316, + "grad_norm": 0.34016453789967843, + "learning_rate": 2.3106348002023268e-05, + "loss": 0.3647, + "step": 10265 + }, + { + "epoch": 1.7529231031834087, + "grad_norm": 0.38216600334837486, + "learning_rate": 2.3090541224076885e-05, + "loss": 0.3609, + "step": 10270 + }, + { + "epoch": 1.7537765639668859, + "grad_norm": 0.33344039798406455, + "learning_rate": 2.3074734446130503e-05, + "loss": 0.3817, + "step": 10275 + }, + { + "epoch": 1.7546300247503628, + "grad_norm": 0.3505756778067142, + "learning_rate": 2.305892766818412e-05, + "loss": 0.3999, + "step": 10280 + }, + { + "epoch": 1.7554834855338397, + "grad_norm": 0.2908220891462724, + "learning_rate": 2.3043120890237734e-05, + "loss": 0.376, + "step": 10285 + }, + { + "epoch": 1.7563369463173166, + "grad_norm": 0.4531864302133631, + "learning_rate": 2.302731411229135e-05, + "loss": 0.353, + "step": 10290 + }, + { + "epoch": 1.7571904071007938, + "grad_norm": 0.3960793007598124, + "learning_rate": 2.3011507334344966e-05, + "loss": 0.3773, + "step": 10295 + }, + { + "epoch": 1.7580438678842707, + "grad_norm": 0.2987489593726992, + "learning_rate": 2.2995700556398586e-05, + "loss": 0.3791, + "step": 10300 + }, + { + "epoch": 1.7588973286677478, + "grad_norm": 0.39906326924904945, + "learning_rate": 2.2979893778452204e-05, + "loss": 0.3499, + "step": 10305 + }, + { + "epoch": 1.7597507894512248, + "grad_norm": 0.4200226635689522, + "learning_rate": 2.2964087000505818e-05, + "loss": 0.3922, + "step": 10310 + }, + { + "epoch": 1.7606042502347017, + "grad_norm": 0.35742914237215445, + "learning_rate": 2.2948280222559435e-05, + "loss": 0.3775, + "step": 10315 + }, + { + "epoch": 1.7614577110181786, + "grad_norm": 0.3327129777054162, + "learning_rate": 2.293247344461305e-05, + "loss": 0.3762, + "step": 10320 + }, + { + "epoch": 1.7623111718016558, + "grad_norm": 0.36120457351243423, + "learning_rate": 2.2916666666666667e-05, + "loss": 0.3462, + "step": 10325 + }, + { + "epoch": 1.7631646325851327, + "grad_norm": 0.3472423813510895, + "learning_rate": 2.2900859888720284e-05, + "loss": 0.3787, + "step": 10330 + }, + { + "epoch": 1.7640180933686098, + "grad_norm": 0.36961950942276844, + "learning_rate": 2.2885053110773902e-05, + "loss": 0.3682, + "step": 10335 + }, + { + "epoch": 1.7648715541520867, + "grad_norm": 0.32673319213139956, + "learning_rate": 2.286924633282752e-05, + "loss": 0.3489, + "step": 10340 + }, + { + "epoch": 1.7657250149355637, + "grad_norm": 0.3268261260976071, + "learning_rate": 2.2853439554881133e-05, + "loss": 0.3739, + "step": 10345 + }, + { + "epoch": 1.7665784757190406, + "grad_norm": 0.3530261635729621, + "learning_rate": 2.283763277693475e-05, + "loss": 0.3879, + "step": 10350 + }, + { + "epoch": 1.7674319365025177, + "grad_norm": 0.3268436411952027, + "learning_rate": 2.2821825998988365e-05, + "loss": 0.3636, + "step": 10355 + }, + { + "epoch": 1.7682853972859947, + "grad_norm": 0.3229942323059562, + "learning_rate": 2.2806019221041982e-05, + "loss": 0.3821, + "step": 10360 + }, + { + "epoch": 1.7691388580694718, + "grad_norm": 0.5568367398126395, + "learning_rate": 2.2790212443095603e-05, + "loss": 0.3725, + "step": 10365 + }, + { + "epoch": 1.7699923188529487, + "grad_norm": 0.3977205017910189, + "learning_rate": 2.2774405665149217e-05, + "loss": 0.3722, + "step": 10370 + }, + { + "epoch": 1.7708457796364256, + "grad_norm": 0.3460952652909466, + "learning_rate": 2.2758598887202835e-05, + "loss": 0.3771, + "step": 10375 + }, + { + "epoch": 1.7716992404199026, + "grad_norm": 0.43227517832685763, + "learning_rate": 2.274279210925645e-05, + "loss": 0.361, + "step": 10380 + }, + { + "epoch": 1.7725527012033797, + "grad_norm": 0.3524632847384698, + "learning_rate": 2.2726985331310066e-05, + "loss": 0.3521, + "step": 10385 + }, + { + "epoch": 1.7734061619868569, + "grad_norm": 0.7478051106054819, + "learning_rate": 2.2711178553363684e-05, + "loss": 0.3458, + "step": 10390 + }, + { + "epoch": 1.7742596227703338, + "grad_norm": 0.4109190829078148, + "learning_rate": 2.26953717754173e-05, + "loss": 0.351, + "step": 10395 + }, + { + "epoch": 1.7751130835538107, + "grad_norm": 1.5678713067526842, + "learning_rate": 2.267956499747092e-05, + "loss": 0.4199, + "step": 10400 + }, + { + "epoch": 1.7759665443372876, + "grad_norm": 0.42333237544214836, + "learning_rate": 2.2663758219524533e-05, + "loss": 0.3572, + "step": 10405 + }, + { + "epoch": 1.7768200051207645, + "grad_norm": 0.39219567327864413, + "learning_rate": 2.264795144157815e-05, + "loss": 0.3654, + "step": 10410 + }, + { + "epoch": 1.7776734659042417, + "grad_norm": 0.49609649621588464, + "learning_rate": 2.2632144663631764e-05, + "loss": 0.3465, + "step": 10415 + }, + { + "epoch": 1.7785269266877188, + "grad_norm": 0.37267128866203747, + "learning_rate": 2.261633788568538e-05, + "loss": 0.3754, + "step": 10420 + }, + { + "epoch": 1.7793803874711958, + "grad_norm": 0.3748502000740906, + "learning_rate": 2.2600531107739003e-05, + "loss": 0.3533, + "step": 10425 + }, + { + "epoch": 1.7802338482546727, + "grad_norm": 0.4463158896622891, + "learning_rate": 2.2584724329792617e-05, + "loss": 0.3623, + "step": 10430 + }, + { + "epoch": 1.7810873090381496, + "grad_norm": 0.3555591871261391, + "learning_rate": 2.2568917551846234e-05, + "loss": 0.3414, + "step": 10435 + }, + { + "epoch": 1.7819407698216267, + "grad_norm": 0.4765275492824118, + "learning_rate": 2.2553110773899848e-05, + "loss": 0.3873, + "step": 10440 + }, + { + "epoch": 1.7827942306051037, + "grad_norm": 0.46616758415086174, + "learning_rate": 2.2537303995953466e-05, + "loss": 0.3629, + "step": 10445 + }, + { + "epoch": 1.7836476913885808, + "grad_norm": 0.34573027591378713, + "learning_rate": 2.2521497218007083e-05, + "loss": 0.3782, + "step": 10450 + }, + { + "epoch": 1.7845011521720577, + "grad_norm": 0.5251500050749887, + "learning_rate": 2.25056904400607e-05, + "loss": 0.3665, + "step": 10455 + }, + { + "epoch": 1.7853546129555347, + "grad_norm": 0.3764774923808963, + "learning_rate": 2.2489883662114318e-05, + "loss": 0.3656, + "step": 10460 + }, + { + "epoch": 1.7862080737390116, + "grad_norm": 0.3382205983972903, + "learning_rate": 2.2474076884167932e-05, + "loss": 0.3604, + "step": 10465 + }, + { + "epoch": 1.7870615345224887, + "grad_norm": 0.5317388106722926, + "learning_rate": 2.245827010622155e-05, + "loss": 0.356, + "step": 10470 + }, + { + "epoch": 1.7879149953059656, + "grad_norm": 1.4903783114176634, + "learning_rate": 2.2442463328275164e-05, + "loss": 0.3596, + "step": 10475 + }, + { + "epoch": 1.7887684560894428, + "grad_norm": 0.36496507612857976, + "learning_rate": 2.242665655032878e-05, + "loss": 0.3502, + "step": 10480 + }, + { + "epoch": 1.7896219168729197, + "grad_norm": 0.3937612529189051, + "learning_rate": 2.24108497723824e-05, + "loss": 0.3995, + "step": 10485 + }, + { + "epoch": 1.7904753776563966, + "grad_norm": 0.31508253953639564, + "learning_rate": 2.2395042994436016e-05, + "loss": 0.36, + "step": 10490 + }, + { + "epoch": 1.7913288384398736, + "grad_norm": 0.4404646582186003, + "learning_rate": 2.2379236216489633e-05, + "loss": 0.3781, + "step": 10495 + }, + { + "epoch": 1.7921822992233507, + "grad_norm": 0.32583844450645083, + "learning_rate": 2.2363429438543247e-05, + "loss": 0.3931, + "step": 10500 + }, + { + "epoch": 1.7930357600068279, + "grad_norm": 0.902758638180003, + "learning_rate": 2.2347622660596865e-05, + "loss": 0.3594, + "step": 10505 + }, + { + "epoch": 1.7938892207903048, + "grad_norm": 0.3365733522219216, + "learning_rate": 2.2331815882650482e-05, + "loss": 0.3591, + "step": 10510 + }, + { + "epoch": 1.7947426815737817, + "grad_norm": 0.4934569504171031, + "learning_rate": 2.2316009104704096e-05, + "loss": 0.3699, + "step": 10515 + }, + { + "epoch": 1.7955961423572586, + "grad_norm": 0.34520444249585464, + "learning_rate": 2.2300202326757717e-05, + "loss": 0.358, + "step": 10520 + }, + { + "epoch": 1.7964496031407355, + "grad_norm": 0.40578450288272916, + "learning_rate": 2.228439554881133e-05, + "loss": 0.3727, + "step": 10525 + }, + { + "epoch": 1.7973030639242127, + "grad_norm": 0.4141773602465772, + "learning_rate": 2.226858877086495e-05, + "loss": 0.3776, + "step": 10530 + }, + { + "epoch": 1.7981565247076898, + "grad_norm": 0.39931050960669756, + "learning_rate": 2.2252781992918563e-05, + "loss": 0.3614, + "step": 10535 + }, + { + "epoch": 1.7990099854911668, + "grad_norm": 0.5136945097329725, + "learning_rate": 2.223697521497218e-05, + "loss": 0.364, + "step": 10540 + }, + { + "epoch": 1.7998634462746437, + "grad_norm": 0.4337614573293977, + "learning_rate": 2.2221168437025798e-05, + "loss": 0.3793, + "step": 10545 + }, + { + "epoch": 1.8007169070581206, + "grad_norm": 0.31625048284765106, + "learning_rate": 2.2205361659079415e-05, + "loss": 0.3454, + "step": 10550 + }, + { + "epoch": 1.8015703678415975, + "grad_norm": 0.7410211808156124, + "learning_rate": 2.2189554881133033e-05, + "loss": 0.3663, + "step": 10555 + }, + { + "epoch": 1.8024238286250747, + "grad_norm": 0.4919923140230198, + "learning_rate": 2.2173748103186647e-05, + "loss": 0.3439, + "step": 10560 + }, + { + "epoch": 1.8032772894085518, + "grad_norm": 0.37579765211598076, + "learning_rate": 2.2157941325240264e-05, + "loss": 0.3683, + "step": 10565 + }, + { + "epoch": 1.8041307501920287, + "grad_norm": 0.36895523353830173, + "learning_rate": 2.214213454729388e-05, + "loss": 0.3569, + "step": 10570 + }, + { + "epoch": 1.8049842109755057, + "grad_norm": 0.3033833823701229, + "learning_rate": 2.2126327769347496e-05, + "loss": 0.3732, + "step": 10575 + }, + { + "epoch": 1.8058376717589826, + "grad_norm": 0.39266161317165993, + "learning_rate": 2.2110520991401117e-05, + "loss": 0.3621, + "step": 10580 + }, + { + "epoch": 1.8066911325424597, + "grad_norm": 0.42813518689174307, + "learning_rate": 2.209471421345473e-05, + "loss": 0.3716, + "step": 10585 + }, + { + "epoch": 1.8075445933259366, + "grad_norm": 0.3638748905898181, + "learning_rate": 2.2078907435508348e-05, + "loss": 0.3718, + "step": 10590 + }, + { + "epoch": 1.8083980541094138, + "grad_norm": 0.5950632824365684, + "learning_rate": 2.2063100657561962e-05, + "loss": 0.3659, + "step": 10595 + }, + { + "epoch": 1.8092515148928907, + "grad_norm": 0.3486629271103143, + "learning_rate": 2.204729387961558e-05, + "loss": 0.353, + "step": 10600 + }, + { + "epoch": 1.8101049756763676, + "grad_norm": 0.3942577678706724, + "learning_rate": 2.2031487101669197e-05, + "loss": 0.3758, + "step": 10605 + }, + { + "epoch": 1.8109584364598446, + "grad_norm": 0.3631065961712478, + "learning_rate": 2.201568032372281e-05, + "loss": 0.3628, + "step": 10610 + }, + { + "epoch": 1.8118118972433217, + "grad_norm": 0.28850470186236804, + "learning_rate": 2.1999873545776432e-05, + "loss": 0.3583, + "step": 10615 + }, + { + "epoch": 1.8126653580267986, + "grad_norm": 0.3084338342525628, + "learning_rate": 2.1984066767830046e-05, + "loss": 0.3747, + "step": 10620 + }, + { + "epoch": 1.8135188188102758, + "grad_norm": 0.48109290660100135, + "learning_rate": 2.1968259989883664e-05, + "loss": 0.357, + "step": 10625 + }, + { + "epoch": 1.8143722795937527, + "grad_norm": 0.593473543871764, + "learning_rate": 2.195245321193728e-05, + "loss": 0.3727, + "step": 10630 + }, + { + "epoch": 1.8152257403772296, + "grad_norm": 0.4100937199773082, + "learning_rate": 2.1936646433990895e-05, + "loss": 0.3697, + "step": 10635 + }, + { + "epoch": 1.8160792011607065, + "grad_norm": 0.32877977611372144, + "learning_rate": 2.1920839656044513e-05, + "loss": 0.3617, + "step": 10640 + }, + { + "epoch": 1.8169326619441837, + "grad_norm": 0.35612619937395745, + "learning_rate": 2.190503287809813e-05, + "loss": 0.3466, + "step": 10645 + }, + { + "epoch": 1.8177861227276608, + "grad_norm": 0.35396257019449956, + "learning_rate": 2.1889226100151747e-05, + "loss": 0.3571, + "step": 10650 + }, + { + "epoch": 1.8186395835111377, + "grad_norm": 0.39690438673164563, + "learning_rate": 2.187341932220536e-05, + "loss": 0.3674, + "step": 10655 + }, + { + "epoch": 1.8194930442946147, + "grad_norm": 0.5242482990321449, + "learning_rate": 2.185761254425898e-05, + "loss": 0.3953, + "step": 10660 + }, + { + "epoch": 1.8203465050780916, + "grad_norm": 0.4214529237838405, + "learning_rate": 2.1841805766312596e-05, + "loss": 0.3727, + "step": 10665 + }, + { + "epoch": 1.8211999658615685, + "grad_norm": 1.1136726595991726, + "learning_rate": 2.182599898836621e-05, + "loss": 0.3728, + "step": 10670 + }, + { + "epoch": 1.8220534266450457, + "grad_norm": 0.327682234110669, + "learning_rate": 2.181019221041983e-05, + "loss": 0.3747, + "step": 10675 + }, + { + "epoch": 1.8229068874285228, + "grad_norm": 0.28611775290909447, + "learning_rate": 2.1794385432473445e-05, + "loss": 0.3587, + "step": 10680 + }, + { + "epoch": 1.8237603482119997, + "grad_norm": 0.4407991387242986, + "learning_rate": 2.1778578654527063e-05, + "loss": 0.364, + "step": 10685 + }, + { + "epoch": 1.8246138089954766, + "grad_norm": 0.47526465046818295, + "learning_rate": 2.176277187658068e-05, + "loss": 0.3497, + "step": 10690 + }, + { + "epoch": 1.8254672697789536, + "grad_norm": 0.3928753576304463, + "learning_rate": 2.1746965098634294e-05, + "loss": 0.359, + "step": 10695 + }, + { + "epoch": 1.8263207305624305, + "grad_norm": 0.36232606753636554, + "learning_rate": 2.1731158320687912e-05, + "loss": 0.3623, + "step": 10700 + }, + { + "epoch": 1.8271741913459076, + "grad_norm": 0.46836363032120787, + "learning_rate": 2.171535154274153e-05, + "loss": 0.3518, + "step": 10705 + }, + { + "epoch": 1.8280276521293848, + "grad_norm": 0.3895529808865236, + "learning_rate": 2.1699544764795147e-05, + "loss": 0.3751, + "step": 10710 + }, + { + "epoch": 1.8288811129128617, + "grad_norm": 0.5460199251051238, + "learning_rate": 2.168373798684876e-05, + "loss": 0.3708, + "step": 10715 + }, + { + "epoch": 1.8297345736963386, + "grad_norm": 0.40974357623987206, + "learning_rate": 2.166793120890238e-05, + "loss": 0.3667, + "step": 10720 + }, + { + "epoch": 1.8305880344798156, + "grad_norm": 0.38992645911807666, + "learning_rate": 2.1652124430955996e-05, + "loss": 0.366, + "step": 10725 + }, + { + "epoch": 1.8314414952632927, + "grad_norm": 0.41047123669309143, + "learning_rate": 2.163631765300961e-05, + "loss": 0.3931, + "step": 10730 + }, + { + "epoch": 1.8322949560467696, + "grad_norm": 0.40673065790434665, + "learning_rate": 2.1620510875063227e-05, + "loss": 0.3805, + "step": 10735 + }, + { + "epoch": 1.8331484168302468, + "grad_norm": 0.5322917028214879, + "learning_rate": 2.1604704097116845e-05, + "loss": 0.3715, + "step": 10740 + }, + { + "epoch": 1.8340018776137237, + "grad_norm": 0.3181019613142807, + "learning_rate": 2.1588897319170462e-05, + "loss": 0.3868, + "step": 10745 + }, + { + "epoch": 1.8348553383972006, + "grad_norm": 0.31096990946905845, + "learning_rate": 2.157309054122408e-05, + "loss": 0.3615, + "step": 10750 + }, + { + "epoch": 1.8357087991806775, + "grad_norm": 0.36823862849584077, + "learning_rate": 2.1557283763277694e-05, + "loss": 0.368, + "step": 10755 + }, + { + "epoch": 1.8365622599641547, + "grad_norm": 0.3250417469863049, + "learning_rate": 2.154147698533131e-05, + "loss": 0.3659, + "step": 10760 + }, + { + "epoch": 1.8374157207476316, + "grad_norm": 0.45584925598743514, + "learning_rate": 2.1525670207384925e-05, + "loss": 0.3677, + "step": 10765 + }, + { + "epoch": 1.8382691815311087, + "grad_norm": 0.36190482925923284, + "learning_rate": 2.1509863429438546e-05, + "loss": 0.3798, + "step": 10770 + }, + { + "epoch": 1.8391226423145857, + "grad_norm": 0.4258078428713466, + "learning_rate": 2.149405665149216e-05, + "loss": 0.3453, + "step": 10775 + }, + { + "epoch": 1.8399761030980626, + "grad_norm": 0.5146661868072934, + "learning_rate": 2.1478249873545778e-05, + "loss": 0.3835, + "step": 10780 + }, + { + "epoch": 1.8408295638815395, + "grad_norm": 0.35699074579188195, + "learning_rate": 2.1462443095599395e-05, + "loss": 0.3917, + "step": 10785 + }, + { + "epoch": 1.8416830246650167, + "grad_norm": 0.7331207287649647, + "learning_rate": 2.144663631765301e-05, + "loss": 0.3935, + "step": 10790 + }, + { + "epoch": 1.8425364854484938, + "grad_norm": 0.3419839330270435, + "learning_rate": 2.1430829539706627e-05, + "loss": 0.3599, + "step": 10795 + }, + { + "epoch": 1.8433899462319707, + "grad_norm": 0.3822457077968837, + "learning_rate": 2.1415022761760244e-05, + "loss": 0.3644, + "step": 10800 + }, + { + "epoch": 1.8442434070154476, + "grad_norm": 0.4084306894441126, + "learning_rate": 2.139921598381386e-05, + "loss": 0.3534, + "step": 10805 + }, + { + "epoch": 1.8450968677989246, + "grad_norm": 0.4874881885032455, + "learning_rate": 2.138340920586748e-05, + "loss": 0.3841, + "step": 10810 + }, + { + "epoch": 1.8459503285824015, + "grad_norm": 0.4127729887949137, + "learning_rate": 2.1367602427921093e-05, + "loss": 0.3733, + "step": 10815 + }, + { + "epoch": 1.8468037893658786, + "grad_norm": 0.43915191784894403, + "learning_rate": 2.135179564997471e-05, + "loss": 0.3719, + "step": 10820 + }, + { + "epoch": 1.8476572501493558, + "grad_norm": 0.4940688648002275, + "learning_rate": 2.1335988872028325e-05, + "loss": 0.361, + "step": 10825 + }, + { + "epoch": 1.8485107109328327, + "grad_norm": 0.45004907244050096, + "learning_rate": 2.1320182094081945e-05, + "loss": 0.3734, + "step": 10830 + }, + { + "epoch": 1.8493641717163096, + "grad_norm": 0.5525318440453658, + "learning_rate": 2.130437531613556e-05, + "loss": 0.3438, + "step": 10835 + }, + { + "epoch": 1.8502176324997865, + "grad_norm": 0.326606963379231, + "learning_rate": 2.1288568538189177e-05, + "loss": 0.3491, + "step": 10840 + }, + { + "epoch": 1.8510710932832635, + "grad_norm": 0.37527385318494266, + "learning_rate": 2.1272761760242794e-05, + "loss": 0.4038, + "step": 10845 + }, + { + "epoch": 1.8519245540667406, + "grad_norm": 0.3939938913670757, + "learning_rate": 2.125695498229641e-05, + "loss": 0.3715, + "step": 10850 + }, + { + "epoch": 1.8527780148502178, + "grad_norm": 0.326301072046663, + "learning_rate": 2.1241148204350026e-05, + "loss": 0.3764, + "step": 10855 + }, + { + "epoch": 1.8536314756336947, + "grad_norm": 0.4400152642190683, + "learning_rate": 2.122534142640364e-05, + "loss": 0.3747, + "step": 10860 + }, + { + "epoch": 1.8544849364171716, + "grad_norm": 0.43012907394312316, + "learning_rate": 2.120953464845726e-05, + "loss": 0.3546, + "step": 10865 + }, + { + "epoch": 1.8553383972006485, + "grad_norm": 0.3541062863654811, + "learning_rate": 2.1193727870510875e-05, + "loss": 0.385, + "step": 10870 + }, + { + "epoch": 1.8561918579841257, + "grad_norm": 0.42864551170374077, + "learning_rate": 2.1177921092564492e-05, + "loss": 0.3632, + "step": 10875 + }, + { + "epoch": 1.8570453187676026, + "grad_norm": 0.6092328779930328, + "learning_rate": 2.116211431461811e-05, + "loss": 0.3545, + "step": 10880 + }, + { + "epoch": 1.8578987795510797, + "grad_norm": 0.36373812884671775, + "learning_rate": 2.1146307536671724e-05, + "loss": 0.3811, + "step": 10885 + }, + { + "epoch": 1.8587522403345567, + "grad_norm": 0.48966929587361846, + "learning_rate": 2.113050075872534e-05, + "loss": 0.3629, + "step": 10890 + }, + { + "epoch": 1.8596057011180336, + "grad_norm": 0.3294541247900809, + "learning_rate": 2.111469398077896e-05, + "loss": 0.3589, + "step": 10895 + }, + { + "epoch": 1.8604591619015105, + "grad_norm": 0.3762098114886841, + "learning_rate": 2.1098887202832576e-05, + "loss": 0.3624, + "step": 10900 + }, + { + "epoch": 1.8613126226849876, + "grad_norm": 0.4498972865269915, + "learning_rate": 2.1083080424886194e-05, + "loss": 0.3742, + "step": 10905 + }, + { + "epoch": 1.8621660834684646, + "grad_norm": 0.3806443462128479, + "learning_rate": 2.1067273646939808e-05, + "loss": 0.3544, + "step": 10910 + }, + { + "epoch": 1.8630195442519417, + "grad_norm": 0.4187900337274261, + "learning_rate": 2.1051466868993425e-05, + "loss": 0.3744, + "step": 10915 + }, + { + "epoch": 1.8638730050354186, + "grad_norm": 0.43709248143647517, + "learning_rate": 2.103566009104704e-05, + "loss": 0.3653, + "step": 10920 + }, + { + "epoch": 1.8647264658188956, + "grad_norm": 0.5847621244254114, + "learning_rate": 2.101985331310066e-05, + "loss": 0.3741, + "step": 10925 + }, + { + "epoch": 1.8655799266023725, + "grad_norm": 0.3726576210378853, + "learning_rate": 2.1004046535154274e-05, + "loss": 0.3444, + "step": 10930 + }, + { + "epoch": 1.8664333873858496, + "grad_norm": 0.7331755128492993, + "learning_rate": 2.0988239757207892e-05, + "loss": 0.3854, + "step": 10935 + }, + { + "epoch": 1.8672868481693268, + "grad_norm": 0.3396553895603976, + "learning_rate": 2.097243297926151e-05, + "loss": 0.3815, + "step": 10940 + }, + { + "epoch": 1.8681403089528037, + "grad_norm": 0.361490335839466, + "learning_rate": 2.0956626201315123e-05, + "loss": 0.3724, + "step": 10945 + }, + { + "epoch": 1.8689937697362806, + "grad_norm": 0.3673689734763417, + "learning_rate": 2.094081942336874e-05, + "loss": 0.349, + "step": 10950 + }, + { + "epoch": 1.8698472305197575, + "grad_norm": 0.39744288396915534, + "learning_rate": 2.0925012645422358e-05, + "loss": 0.3663, + "step": 10955 + }, + { + "epoch": 1.8707006913032345, + "grad_norm": 0.42295654199751714, + "learning_rate": 2.0909205867475976e-05, + "loss": 0.3801, + "step": 10960 + }, + { + "epoch": 1.8715541520867116, + "grad_norm": 0.360461220134643, + "learning_rate": 2.0893399089529593e-05, + "loss": 0.3642, + "step": 10965 + }, + { + "epoch": 1.8724076128701888, + "grad_norm": 0.2789941075918898, + "learning_rate": 2.0877592311583207e-05, + "loss": 0.3565, + "step": 10970 + }, + { + "epoch": 1.8732610736536657, + "grad_norm": 0.46125387391646, + "learning_rate": 2.0861785533636825e-05, + "loss": 0.3521, + "step": 10975 + }, + { + "epoch": 1.8741145344371426, + "grad_norm": 0.41105185493252544, + "learning_rate": 2.084597875569044e-05, + "loss": 0.3831, + "step": 10980 + }, + { + "epoch": 1.8749679952206195, + "grad_norm": 0.354426825365594, + "learning_rate": 2.0830171977744056e-05, + "loss": 0.373, + "step": 10985 + }, + { + "epoch": 1.8758214560040967, + "grad_norm": 0.34640586063871975, + "learning_rate": 2.0814365199797674e-05, + "loss": 0.3784, + "step": 10990 + }, + { + "epoch": 1.8766749167875736, + "grad_norm": 0.409642869323863, + "learning_rate": 2.079855842185129e-05, + "loss": 0.3844, + "step": 10995 + }, + { + "epoch": 1.8775283775710507, + "grad_norm": 0.3194906214235848, + "learning_rate": 2.078275164390491e-05, + "loss": 0.3612, + "step": 11000 + }, + { + "epoch": 1.8783818383545277, + "grad_norm": 0.36984853554037195, + "learning_rate": 2.0766944865958523e-05, + "loss": 0.3589, + "step": 11005 + }, + { + "epoch": 1.8792352991380046, + "grad_norm": 0.37674468309420256, + "learning_rate": 2.075113808801214e-05, + "loss": 0.3684, + "step": 11010 + }, + { + "epoch": 1.8800887599214815, + "grad_norm": 0.3577464377599332, + "learning_rate": 2.0735331310065757e-05, + "loss": 0.3433, + "step": 11015 + }, + { + "epoch": 1.8809422207049586, + "grad_norm": 0.3657277409746499, + "learning_rate": 2.0719524532119375e-05, + "loss": 0.3678, + "step": 11020 + }, + { + "epoch": 1.8817956814884356, + "grad_norm": 0.3239080611529195, + "learning_rate": 2.0703717754172992e-05, + "loss": 0.3668, + "step": 11025 + }, + { + "epoch": 1.8826491422719127, + "grad_norm": 0.4133691952735056, + "learning_rate": 2.0687910976226606e-05, + "loss": 0.3417, + "step": 11030 + }, + { + "epoch": 1.8835026030553896, + "grad_norm": 0.31025729042046996, + "learning_rate": 2.0672104198280224e-05, + "loss": 0.3651, + "step": 11035 + }, + { + "epoch": 1.8843560638388666, + "grad_norm": 0.2653544568477746, + "learning_rate": 2.0656297420333838e-05, + "loss": 0.396, + "step": 11040 + }, + { + "epoch": 1.8852095246223435, + "grad_norm": 0.34035557157096347, + "learning_rate": 2.0640490642387455e-05, + "loss": 0.3692, + "step": 11045 + }, + { + "epoch": 1.8860629854058206, + "grad_norm": 0.25176173499633664, + "learning_rate": 2.0624683864441073e-05, + "loss": 0.3688, + "step": 11050 + }, + { + "epoch": 1.8869164461892975, + "grad_norm": 0.4855932237933519, + "learning_rate": 2.060887708649469e-05, + "loss": 0.3518, + "step": 11055 + }, + { + "epoch": 1.8877699069727747, + "grad_norm": 0.7704487813945038, + "learning_rate": 2.0593070308548308e-05, + "loss": 0.3626, + "step": 11060 + }, + { + "epoch": 1.8886233677562516, + "grad_norm": 0.397441661431441, + "learning_rate": 2.0577263530601922e-05, + "loss": 0.3453, + "step": 11065 + }, + { + "epoch": 1.8894768285397285, + "grad_norm": 0.3122045807760479, + "learning_rate": 2.056145675265554e-05, + "loss": 0.3464, + "step": 11070 + }, + { + "epoch": 1.8903302893232055, + "grad_norm": 0.3885736967225049, + "learning_rate": 2.0545649974709157e-05, + "loss": 0.3596, + "step": 11075 + }, + { + "epoch": 1.8911837501066826, + "grad_norm": 0.33420689549505933, + "learning_rate": 2.0529843196762774e-05, + "loss": 0.3389, + "step": 11080 + }, + { + "epoch": 1.8920372108901597, + "grad_norm": 0.4701672640614866, + "learning_rate": 2.0514036418816392e-05, + "loss": 0.3685, + "step": 11085 + }, + { + "epoch": 1.8928906716736367, + "grad_norm": 0.3229552993180945, + "learning_rate": 2.0498229640870006e-05, + "loss": 0.3536, + "step": 11090 + }, + { + "epoch": 1.8937441324571136, + "grad_norm": 0.9219103914160934, + "learning_rate": 2.0482422862923623e-05, + "loss": 0.3542, + "step": 11095 + }, + { + "epoch": 1.8945975932405905, + "grad_norm": 0.32298789544258444, + "learning_rate": 2.0466616084977237e-05, + "loss": 0.3662, + "step": 11100 + }, + { + "epoch": 1.8954510540240674, + "grad_norm": 0.4619059980890969, + "learning_rate": 2.0450809307030855e-05, + "loss": 0.3539, + "step": 11105 + }, + { + "epoch": 1.8963045148075446, + "grad_norm": 0.3683441404553503, + "learning_rate": 2.0435002529084472e-05, + "loss": 0.3582, + "step": 11110 + }, + { + "epoch": 1.8971579755910217, + "grad_norm": 0.6612297982391425, + "learning_rate": 2.041919575113809e-05, + "loss": 0.3232, + "step": 11115 + }, + { + "epoch": 1.8980114363744987, + "grad_norm": 0.5625119257799257, + "learning_rate": 2.0403388973191707e-05, + "loss": 0.3498, + "step": 11120 + }, + { + "epoch": 1.8988648971579756, + "grad_norm": 0.3541435603942317, + "learning_rate": 2.038758219524532e-05, + "loss": 0.3626, + "step": 11125 + }, + { + "epoch": 1.8997183579414525, + "grad_norm": 0.3155807293604779, + "learning_rate": 2.037177541729894e-05, + "loss": 0.371, + "step": 11130 + }, + { + "epoch": 1.9005718187249296, + "grad_norm": 0.3826111273138795, + "learning_rate": 2.0355968639352556e-05, + "loss": 0.3849, + "step": 11135 + }, + { + "epoch": 1.9014252795084066, + "grad_norm": 0.37475780016940374, + "learning_rate": 2.034016186140617e-05, + "loss": 0.388, + "step": 11140 + }, + { + "epoch": 1.9022787402918837, + "grad_norm": 0.32392024173798867, + "learning_rate": 2.032435508345979e-05, + "loss": 0.3687, + "step": 11145 + }, + { + "epoch": 1.9031322010753606, + "grad_norm": 0.3030706062232896, + "learning_rate": 2.0308548305513405e-05, + "loss": 0.3674, + "step": 11150 + }, + { + "epoch": 1.9039856618588376, + "grad_norm": 0.47629616214997345, + "learning_rate": 2.0292741527567023e-05, + "loss": 0.371, + "step": 11155 + }, + { + "epoch": 1.9048391226423145, + "grad_norm": 0.352930507119559, + "learning_rate": 2.0276934749620637e-05, + "loss": 0.3804, + "step": 11160 + }, + { + "epoch": 1.9056925834257916, + "grad_norm": 0.457987910274872, + "learning_rate": 2.0261127971674254e-05, + "loss": 0.3407, + "step": 11165 + }, + { + "epoch": 1.9065460442092685, + "grad_norm": 0.31367635452942055, + "learning_rate": 2.024532119372787e-05, + "loss": 0.3606, + "step": 11170 + }, + { + "epoch": 1.9073995049927457, + "grad_norm": 0.3238065727672519, + "learning_rate": 2.022951441578149e-05, + "loss": 0.3878, + "step": 11175 + }, + { + "epoch": 1.9082529657762226, + "grad_norm": 0.32758691010339364, + "learning_rate": 2.0213707637835106e-05, + "loss": 0.3306, + "step": 11180 + }, + { + "epoch": 1.9091064265596995, + "grad_norm": 0.3119731116231921, + "learning_rate": 2.019790085988872e-05, + "loss": 0.3471, + "step": 11185 + }, + { + "epoch": 1.9099598873431765, + "grad_norm": 0.36084222880900263, + "learning_rate": 2.0182094081942338e-05, + "loss": 0.3712, + "step": 11190 + }, + { + "epoch": 1.9108133481266536, + "grad_norm": 0.4090976778708613, + "learning_rate": 2.0166287303995955e-05, + "loss": 0.3518, + "step": 11195 + }, + { + "epoch": 1.9116668089101307, + "grad_norm": 0.37520141214522906, + "learning_rate": 2.015048052604957e-05, + "loss": 0.3704, + "step": 11200 + }, + { + "epoch": 1.9125202696936077, + "grad_norm": 0.37574938620862963, + "learning_rate": 2.013467374810319e-05, + "loss": 0.3622, + "step": 11205 + }, + { + "epoch": 1.9133737304770846, + "grad_norm": 0.44480299534892875, + "learning_rate": 2.0118866970156804e-05, + "loss": 0.3363, + "step": 11210 + }, + { + "epoch": 1.9142271912605615, + "grad_norm": 0.35464180850066146, + "learning_rate": 2.0103060192210422e-05, + "loss": 0.3519, + "step": 11215 + }, + { + "epoch": 1.9150806520440384, + "grad_norm": 0.3324121298201554, + "learning_rate": 2.0087253414264036e-05, + "loss": 0.3676, + "step": 11220 + }, + { + "epoch": 1.9159341128275156, + "grad_norm": 0.3533966232549052, + "learning_rate": 2.0071446636317653e-05, + "loss": 0.3652, + "step": 11225 + }, + { + "epoch": 1.9167875736109927, + "grad_norm": 0.36029559671898753, + "learning_rate": 2.005563985837127e-05, + "loss": 0.365, + "step": 11230 + }, + { + "epoch": 1.9176410343944696, + "grad_norm": 0.4350178376187806, + "learning_rate": 2.003983308042489e-05, + "loss": 0.3608, + "step": 11235 + }, + { + "epoch": 1.9184944951779466, + "grad_norm": 0.35699404918806876, + "learning_rate": 2.0024026302478506e-05, + "loss": 0.3931, + "step": 11240 + }, + { + "epoch": 1.9193479559614235, + "grad_norm": 0.3446151856091827, + "learning_rate": 2.000821952453212e-05, + "loss": 0.3812, + "step": 11245 + }, + { + "epoch": 1.9202014167449004, + "grad_norm": 0.3526287131807039, + "learning_rate": 1.9992412746585737e-05, + "loss": 0.3908, + "step": 11250 + }, + { + "epoch": 1.9210548775283776, + "grad_norm": 0.38504624802916554, + "learning_rate": 1.997660596863935e-05, + "loss": 0.3554, + "step": 11255 + }, + { + "epoch": 1.9219083383118547, + "grad_norm": 0.37777764566097194, + "learning_rate": 1.996079919069297e-05, + "loss": 0.3651, + "step": 11260 + }, + { + "epoch": 1.9227617990953316, + "grad_norm": 0.3125281065863975, + "learning_rate": 1.9944992412746586e-05, + "loss": 0.3639, + "step": 11265 + }, + { + "epoch": 1.9236152598788085, + "grad_norm": 0.6245378333272329, + "learning_rate": 1.9929185634800204e-05, + "loss": 0.3501, + "step": 11270 + }, + { + "epoch": 1.9244687206622855, + "grad_norm": 0.3280900134912351, + "learning_rate": 1.991337885685382e-05, + "loss": 0.368, + "step": 11275 + }, + { + "epoch": 1.9253221814457626, + "grad_norm": 0.398874931108072, + "learning_rate": 1.9897572078907435e-05, + "loss": 0.3731, + "step": 11280 + }, + { + "epoch": 1.9261756422292395, + "grad_norm": 0.35962176717808686, + "learning_rate": 1.9881765300961053e-05, + "loss": 0.355, + "step": 11285 + }, + { + "epoch": 1.9270291030127167, + "grad_norm": 0.3338696595537415, + "learning_rate": 1.986595852301467e-05, + "loss": 0.3613, + "step": 11290 + }, + { + "epoch": 1.9278825637961936, + "grad_norm": 0.3065677342530692, + "learning_rate": 1.9850151745068284e-05, + "loss": 0.3326, + "step": 11295 + }, + { + "epoch": 1.9287360245796705, + "grad_norm": 0.32622888385701815, + "learning_rate": 1.9834344967121905e-05, + "loss": 0.3662, + "step": 11300 + }, + { + "epoch": 1.9295894853631474, + "grad_norm": 0.3292226957746066, + "learning_rate": 1.981853818917552e-05, + "loss": 0.3492, + "step": 11305 + }, + { + "epoch": 1.9304429461466246, + "grad_norm": 0.3667746710987046, + "learning_rate": 1.9802731411229137e-05, + "loss": 0.359, + "step": 11310 + }, + { + "epoch": 1.9312964069301015, + "grad_norm": 0.38086219300338536, + "learning_rate": 1.978692463328275e-05, + "loss": 0.3716, + "step": 11315 + }, + { + "epoch": 1.9321498677135787, + "grad_norm": 0.4722974686907586, + "learning_rate": 1.9771117855336368e-05, + "loss": 0.3744, + "step": 11320 + }, + { + "epoch": 1.9330033284970556, + "grad_norm": 0.3304308918934173, + "learning_rate": 1.9755311077389986e-05, + "loss": 0.3722, + "step": 11325 + }, + { + "epoch": 1.9338567892805325, + "grad_norm": 0.39895671630370916, + "learning_rate": 1.9739504299443603e-05, + "loss": 0.3566, + "step": 11330 + }, + { + "epoch": 1.9347102500640094, + "grad_norm": 0.33445870300135433, + "learning_rate": 1.972369752149722e-05, + "loss": 0.3688, + "step": 11335 + }, + { + "epoch": 1.9355637108474866, + "grad_norm": 0.3624243432384039, + "learning_rate": 1.9707890743550835e-05, + "loss": 0.3741, + "step": 11340 + }, + { + "epoch": 1.9364171716309637, + "grad_norm": 0.38588661759205267, + "learning_rate": 1.9692083965604452e-05, + "loss": 0.3764, + "step": 11345 + }, + { + "epoch": 1.9372706324144406, + "grad_norm": 0.3861164155920311, + "learning_rate": 1.967627718765807e-05, + "loss": 0.3538, + "step": 11350 + }, + { + "epoch": 1.9381240931979176, + "grad_norm": 0.3571311832146459, + "learning_rate": 1.9660470409711684e-05, + "loss": 0.3737, + "step": 11355 + }, + { + "epoch": 1.9389775539813945, + "grad_norm": 0.3503066025720848, + "learning_rate": 1.9644663631765304e-05, + "loss": 0.37, + "step": 11360 + }, + { + "epoch": 1.9398310147648714, + "grad_norm": 0.31192881974980474, + "learning_rate": 1.962885685381892e-05, + "loss": 0.3645, + "step": 11365 + }, + { + "epoch": 1.9406844755483486, + "grad_norm": 0.4104030121912594, + "learning_rate": 1.9613050075872536e-05, + "loss": 0.3631, + "step": 11370 + }, + { + "epoch": 1.9415379363318257, + "grad_norm": 0.3066578523567166, + "learning_rate": 1.959724329792615e-05, + "loss": 0.3606, + "step": 11375 + }, + { + "epoch": 1.9423913971153026, + "grad_norm": 0.261312140891305, + "learning_rate": 1.9581436519979768e-05, + "loss": 0.3952, + "step": 11380 + }, + { + "epoch": 1.9432448578987795, + "grad_norm": 0.3413334933644581, + "learning_rate": 1.9565629742033385e-05, + "loss": 0.3779, + "step": 11385 + }, + { + "epoch": 1.9440983186822565, + "grad_norm": 0.3786158496861722, + "learning_rate": 1.9549822964087e-05, + "loss": 0.3623, + "step": 11390 + }, + { + "epoch": 1.9449517794657334, + "grad_norm": 0.3694445662662908, + "learning_rate": 1.953401618614062e-05, + "loss": 0.3544, + "step": 11395 + }, + { + "epoch": 1.9458052402492105, + "grad_norm": 0.3624189966994706, + "learning_rate": 1.9518209408194234e-05, + "loss": 0.3631, + "step": 11400 + }, + { + "epoch": 1.9466587010326877, + "grad_norm": 0.28941849668007225, + "learning_rate": 1.950240263024785e-05, + "loss": 0.339, + "step": 11405 + }, + { + "epoch": 1.9475121618161646, + "grad_norm": 0.7991996892591037, + "learning_rate": 1.948659585230147e-05, + "loss": 0.3741, + "step": 11410 + }, + { + "epoch": 1.9483656225996415, + "grad_norm": 0.3605776512297038, + "learning_rate": 1.9470789074355083e-05, + "loss": 0.3637, + "step": 11415 + }, + { + "epoch": 1.9492190833831184, + "grad_norm": 0.3319018707245592, + "learning_rate": 1.94549822964087e-05, + "loss": 0.3642, + "step": 11420 + }, + { + "epoch": 1.9500725441665956, + "grad_norm": 0.3925895291989305, + "learning_rate": 1.9439175518462318e-05, + "loss": 0.3571, + "step": 11425 + }, + { + "epoch": 1.9509260049500725, + "grad_norm": 0.4445945431545767, + "learning_rate": 1.9423368740515935e-05, + "loss": 0.3602, + "step": 11430 + }, + { + "epoch": 1.9517794657335497, + "grad_norm": 0.3263330072466279, + "learning_rate": 1.940756196256955e-05, + "loss": 0.3799, + "step": 11435 + }, + { + "epoch": 1.9526329265170266, + "grad_norm": 0.9395242224578074, + "learning_rate": 1.9391755184623167e-05, + "loss": 0.3598, + "step": 11440 + }, + { + "epoch": 1.9534863873005035, + "grad_norm": 0.33484251821398536, + "learning_rate": 1.9375948406676784e-05, + "loss": 0.3762, + "step": 11445 + }, + { + "epoch": 1.9543398480839804, + "grad_norm": 0.49024773217209083, + "learning_rate": 1.93601416287304e-05, + "loss": 0.395, + "step": 11450 + }, + { + "epoch": 1.9551933088674576, + "grad_norm": 0.3717301732218937, + "learning_rate": 1.934433485078402e-05, + "loss": 0.3876, + "step": 11455 + }, + { + "epoch": 1.9560467696509345, + "grad_norm": 0.33147738637956553, + "learning_rate": 1.9328528072837633e-05, + "loss": 0.3644, + "step": 11460 + }, + { + "epoch": 1.9569002304344116, + "grad_norm": 0.4014253422212822, + "learning_rate": 1.931272129489125e-05, + "loss": 0.3651, + "step": 11465 + }, + { + "epoch": 1.9577536912178886, + "grad_norm": 0.3400205004163349, + "learning_rate": 1.9296914516944868e-05, + "loss": 0.3463, + "step": 11470 + }, + { + "epoch": 1.9586071520013655, + "grad_norm": 0.3264483566767699, + "learning_rate": 1.9281107738998482e-05, + "loss": 0.3701, + "step": 11475 + }, + { + "epoch": 1.9594606127848424, + "grad_norm": 0.38572487164686403, + "learning_rate": 1.92653009610521e-05, + "loss": 0.3384, + "step": 11480 + }, + { + "epoch": 1.9603140735683195, + "grad_norm": 0.4467785592513792, + "learning_rate": 1.9249494183105717e-05, + "loss": 0.3617, + "step": 11485 + }, + { + "epoch": 1.9611675343517967, + "grad_norm": 0.3129488961883966, + "learning_rate": 1.9233687405159335e-05, + "loss": 0.3857, + "step": 11490 + }, + { + "epoch": 1.9620209951352736, + "grad_norm": 0.3432421500027621, + "learning_rate": 1.921788062721295e-05, + "loss": 0.3481, + "step": 11495 + }, + { + "epoch": 1.9628744559187505, + "grad_norm": 0.8997788869734178, + "learning_rate": 1.9202073849266566e-05, + "loss": 0.3686, + "step": 11500 + }, + { + "epoch": 1.9637279167022275, + "grad_norm": 0.36006822237742797, + "learning_rate": 1.9186267071320184e-05, + "loss": 0.3591, + "step": 11505 + }, + { + "epoch": 1.9645813774857044, + "grad_norm": 0.3324090120282234, + "learning_rate": 1.9170460293373798e-05, + "loss": 0.357, + "step": 11510 + }, + { + "epoch": 1.9654348382691815, + "grad_norm": 0.3485251661481423, + "learning_rate": 1.9154653515427415e-05, + "loss": 0.3762, + "step": 11515 + }, + { + "epoch": 1.9662882990526587, + "grad_norm": 0.383412188275394, + "learning_rate": 1.9138846737481033e-05, + "loss": 0.3594, + "step": 11520 + }, + { + "epoch": 1.9671417598361356, + "grad_norm": 0.3666698926714686, + "learning_rate": 1.912303995953465e-05, + "loss": 0.3641, + "step": 11525 + }, + { + "epoch": 1.9679952206196125, + "grad_norm": 0.6887105354347631, + "learning_rate": 1.9107233181588268e-05, + "loss": 0.3577, + "step": 11530 + }, + { + "epoch": 1.9688486814030894, + "grad_norm": 0.3299880013438571, + "learning_rate": 1.909142640364188e-05, + "loss": 0.3584, + "step": 11535 + }, + { + "epoch": 1.9697021421865664, + "grad_norm": 0.3361477361924093, + "learning_rate": 1.90756196256955e-05, + "loss": 0.3472, + "step": 11540 + }, + { + "epoch": 1.9705556029700435, + "grad_norm": 0.619409623572859, + "learning_rate": 1.9059812847749113e-05, + "loss": 0.3783, + "step": 11545 + }, + { + "epoch": 1.9714090637535207, + "grad_norm": 0.3462415307913492, + "learning_rate": 1.9044006069802734e-05, + "loss": 0.3455, + "step": 11550 + }, + { + "epoch": 1.9722625245369976, + "grad_norm": 0.2879089288950731, + "learning_rate": 1.9028199291856348e-05, + "loss": 0.3663, + "step": 11555 + }, + { + "epoch": 1.9731159853204745, + "grad_norm": 0.3862808024430762, + "learning_rate": 1.9012392513909966e-05, + "loss": 0.3743, + "step": 11560 + }, + { + "epoch": 1.9739694461039514, + "grad_norm": 0.3952032121696526, + "learning_rate": 1.8996585735963583e-05, + "loss": 0.3604, + "step": 11565 + }, + { + "epoch": 1.9748229068874286, + "grad_norm": 0.2765527056826614, + "learning_rate": 1.8980778958017197e-05, + "loss": 0.3346, + "step": 11570 + }, + { + "epoch": 1.9756763676709055, + "grad_norm": 0.3262199224171904, + "learning_rate": 1.8964972180070814e-05, + "loss": 0.3548, + "step": 11575 + }, + { + "epoch": 1.9765298284543826, + "grad_norm": 0.3555939118146572, + "learning_rate": 1.8949165402124432e-05, + "loss": 0.3752, + "step": 11580 + }, + { + "epoch": 1.9773832892378596, + "grad_norm": 0.38534503417504223, + "learning_rate": 1.893335862417805e-05, + "loss": 0.3717, + "step": 11585 + }, + { + "epoch": 1.9782367500213365, + "grad_norm": 0.3525087826480515, + "learning_rate": 1.8917551846231667e-05, + "loss": 0.3435, + "step": 11590 + }, + { + "epoch": 1.9790902108048134, + "grad_norm": 0.41355689384936367, + "learning_rate": 1.890174506828528e-05, + "loss": 0.3452, + "step": 11595 + }, + { + "epoch": 1.9799436715882905, + "grad_norm": 0.32554251750894886, + "learning_rate": 1.88859382903389e-05, + "loss": 0.3674, + "step": 11600 + }, + { + "epoch": 1.9807971323717675, + "grad_norm": 0.38617312349391203, + "learning_rate": 1.8870131512392512e-05, + "loss": 0.3961, + "step": 11605 + }, + { + "epoch": 1.9816505931552446, + "grad_norm": 0.35810908236896655, + "learning_rate": 1.8854324734446133e-05, + "loss": 0.36, + "step": 11610 + }, + { + "epoch": 1.9825040539387215, + "grad_norm": 0.3959067679105283, + "learning_rate": 1.8838517956499747e-05, + "loss": 0.3812, + "step": 11615 + }, + { + "epoch": 1.9833575147221985, + "grad_norm": 0.44481170611894405, + "learning_rate": 1.8822711178553365e-05, + "loss": 0.3563, + "step": 11620 + }, + { + "epoch": 1.9842109755056754, + "grad_norm": 0.29757742501818696, + "learning_rate": 1.8806904400606982e-05, + "loss": 0.3575, + "step": 11625 + }, + { + "epoch": 1.9850644362891525, + "grad_norm": 0.34151394562783016, + "learning_rate": 1.8791097622660596e-05, + "loss": 0.3673, + "step": 11630 + }, + { + "epoch": 1.9859178970726297, + "grad_norm": 0.4115597667769673, + "learning_rate": 1.8775290844714214e-05, + "loss": 0.373, + "step": 11635 + }, + { + "epoch": 1.9867713578561066, + "grad_norm": 0.5589463365602205, + "learning_rate": 1.875948406676783e-05, + "loss": 0.3663, + "step": 11640 + }, + { + "epoch": 1.9876248186395835, + "grad_norm": 0.32174338662763924, + "learning_rate": 1.874367728882145e-05, + "loss": 0.3693, + "step": 11645 + }, + { + "epoch": 1.9884782794230604, + "grad_norm": 0.3992664638486375, + "learning_rate": 1.8727870510875066e-05, + "loss": 0.3649, + "step": 11650 + }, + { + "epoch": 1.9893317402065374, + "grad_norm": 0.3004563209664813, + "learning_rate": 1.871206373292868e-05, + "loss": 0.3669, + "step": 11655 + }, + { + "epoch": 1.9901852009900145, + "grad_norm": 0.2891256520749365, + "learning_rate": 1.8696256954982298e-05, + "loss": 0.3501, + "step": 11660 + }, + { + "epoch": 1.9910386617734916, + "grad_norm": 0.3523769019050455, + "learning_rate": 1.8680450177035912e-05, + "loss": 0.3661, + "step": 11665 + }, + { + "epoch": 1.9918921225569686, + "grad_norm": 0.3736500940750435, + "learning_rate": 1.866464339908953e-05, + "loss": 0.3867, + "step": 11670 + }, + { + "epoch": 1.9927455833404455, + "grad_norm": 0.35462316288841506, + "learning_rate": 1.8648836621143147e-05, + "loss": 0.3488, + "step": 11675 + }, + { + "epoch": 1.9935990441239224, + "grad_norm": 0.3031867758368206, + "learning_rate": 1.8633029843196764e-05, + "loss": 0.3447, + "step": 11680 + }, + { + "epoch": 1.9944525049073996, + "grad_norm": 0.3263771067325506, + "learning_rate": 1.861722306525038e-05, + "loss": 0.365, + "step": 11685 + }, + { + "epoch": 1.9953059656908765, + "grad_norm": 0.3174180681613223, + "learning_rate": 1.8601416287303996e-05, + "loss": 0.356, + "step": 11690 + }, + { + "epoch": 1.9961594264743536, + "grad_norm": 0.3758309298391433, + "learning_rate": 1.8585609509357613e-05, + "loss": 0.3618, + "step": 11695 + }, + { + "epoch": 1.9970128872578305, + "grad_norm": 0.37441528860296364, + "learning_rate": 1.8569802731411227e-05, + "loss": 0.3449, + "step": 11700 + }, + { + "epoch": 1.9978663480413075, + "grad_norm": 0.4118868612211099, + "learning_rate": 1.8553995953464848e-05, + "loss": 0.3535, + "step": 11705 + }, + { + "epoch": 1.9987198088247844, + "grad_norm": 1.8941609581505108, + "learning_rate": 1.8538189175518466e-05, + "loss": 0.3636, + "step": 11710 + }, + { + "epoch": 1.9995732696082615, + "grad_norm": 0.33030098032605626, + "learning_rate": 1.852238239757208e-05, + "loss": 0.3583, + "step": 11715 + }, + { + "epoch": 2.0003413843133906, + "grad_norm": 0.36601217529631147, + "learning_rate": 1.8506575619625697e-05, + "loss": 0.3355, + "step": 11720 + }, + { + "epoch": 2.001194845096868, + "grad_norm": 0.4087276302441495, + "learning_rate": 1.849076884167931e-05, + "loss": 0.274, + "step": 11725 + }, + { + "epoch": 2.002048305880345, + "grad_norm": 0.4409873629189802, + "learning_rate": 1.847496206373293e-05, + "loss": 0.2851, + "step": 11730 + }, + { + "epoch": 2.002901766663822, + "grad_norm": 0.3332216524255506, + "learning_rate": 1.8459155285786546e-05, + "loss": 0.2731, + "step": 11735 + }, + { + "epoch": 2.0037552274472987, + "grad_norm": 0.3616256277252148, + "learning_rate": 1.8443348507840163e-05, + "loss": 0.2801, + "step": 11740 + }, + { + "epoch": 2.0046086882307756, + "grad_norm": 0.3424298707047687, + "learning_rate": 1.842754172989378e-05, + "loss": 0.2736, + "step": 11745 + }, + { + "epoch": 2.005462149014253, + "grad_norm": 0.3445224253336677, + "learning_rate": 1.8411734951947395e-05, + "loss": 0.2557, + "step": 11750 + }, + { + "epoch": 2.00631560979773, + "grad_norm": 0.3351716189441447, + "learning_rate": 1.8395928174001012e-05, + "loss": 0.2615, + "step": 11755 + }, + { + "epoch": 2.007169070581207, + "grad_norm": 0.3332607228646462, + "learning_rate": 1.8380121396054627e-05, + "loss": 0.269, + "step": 11760 + }, + { + "epoch": 2.008022531364684, + "grad_norm": 0.36985606309509017, + "learning_rate": 1.8364314618108247e-05, + "loss": 0.269, + "step": 11765 + }, + { + "epoch": 2.0088759921481607, + "grad_norm": 0.34963039815133334, + "learning_rate": 1.8348507840161865e-05, + "loss": 0.2551, + "step": 11770 + }, + { + "epoch": 2.0097294529316376, + "grad_norm": 0.36421348478429594, + "learning_rate": 1.833270106221548e-05, + "loss": 0.2785, + "step": 11775 + }, + { + "epoch": 2.010582913715115, + "grad_norm": 0.3200502728315867, + "learning_rate": 1.8316894284269096e-05, + "loss": 0.2778, + "step": 11780 + }, + { + "epoch": 2.011436374498592, + "grad_norm": 0.3567813226557995, + "learning_rate": 1.830108750632271e-05, + "loss": 0.262, + "step": 11785 + }, + { + "epoch": 2.012289835282069, + "grad_norm": 0.38018272176428664, + "learning_rate": 1.8285280728376328e-05, + "loss": 0.2643, + "step": 11790 + }, + { + "epoch": 2.0131432960655458, + "grad_norm": 0.3196365171312658, + "learning_rate": 1.8269473950429945e-05, + "loss": 0.2939, + "step": 11795 + }, + { + "epoch": 2.0139967568490227, + "grad_norm": 0.3749386596391697, + "learning_rate": 1.8253667172483563e-05, + "loss": 0.2609, + "step": 11800 + }, + { + "epoch": 2.0148502176324996, + "grad_norm": 0.3320723067712335, + "learning_rate": 1.823786039453718e-05, + "loss": 0.2845, + "step": 11805 + }, + { + "epoch": 2.015703678415977, + "grad_norm": 0.3567384530855997, + "learning_rate": 1.8222053616590794e-05, + "loss": 0.2837, + "step": 11810 + }, + { + "epoch": 2.016557139199454, + "grad_norm": 0.5478809009656719, + "learning_rate": 1.8206246838644412e-05, + "loss": 0.2701, + "step": 11815 + }, + { + "epoch": 2.017410599982931, + "grad_norm": 0.4015324014536703, + "learning_rate": 1.8190440060698026e-05, + "loss": 0.2561, + "step": 11820 + }, + { + "epoch": 2.0182640607664077, + "grad_norm": 0.3082295737208152, + "learning_rate": 1.8174633282751643e-05, + "loss": 0.2556, + "step": 11825 + }, + { + "epoch": 2.0191175215498847, + "grad_norm": 0.35450120044324074, + "learning_rate": 1.8158826504805264e-05, + "loss": 0.2607, + "step": 11830 + }, + { + "epoch": 2.0199709823333616, + "grad_norm": 0.37204359376393903, + "learning_rate": 1.8143019726858878e-05, + "loss": 0.2509, + "step": 11835 + }, + { + "epoch": 2.020824443116839, + "grad_norm": 0.327493742049166, + "learning_rate": 1.8127212948912496e-05, + "loss": 0.2727, + "step": 11840 + }, + { + "epoch": 2.021677903900316, + "grad_norm": 0.34014078032855066, + "learning_rate": 1.811140617096611e-05, + "loss": 0.2891, + "step": 11845 + }, + { + "epoch": 2.022531364683793, + "grad_norm": 0.3714956426888765, + "learning_rate": 1.8095599393019727e-05, + "loss": 0.2714, + "step": 11850 + }, + { + "epoch": 2.0233848254672697, + "grad_norm": 0.34384771814885784, + "learning_rate": 1.8079792615073345e-05, + "loss": 0.2765, + "step": 11855 + }, + { + "epoch": 2.0242382862507466, + "grad_norm": 0.37971783255947067, + "learning_rate": 1.8063985837126962e-05, + "loss": 0.2569, + "step": 11860 + }, + { + "epoch": 2.0250917470342236, + "grad_norm": 0.35988065541812736, + "learning_rate": 1.804817905918058e-05, + "loss": 0.2596, + "step": 11865 + }, + { + "epoch": 2.025945207817701, + "grad_norm": 0.3249526782969596, + "learning_rate": 1.8032372281234194e-05, + "loss": 0.2762, + "step": 11870 + }, + { + "epoch": 2.026798668601178, + "grad_norm": 0.3514693210668101, + "learning_rate": 1.801656550328781e-05, + "loss": 0.2708, + "step": 11875 + }, + { + "epoch": 2.0276521293846548, + "grad_norm": 0.48699825557546833, + "learning_rate": 1.8000758725341425e-05, + "loss": 0.2978, + "step": 11880 + }, + { + "epoch": 2.0285055901681317, + "grad_norm": 0.3302433246221382, + "learning_rate": 1.7984951947395043e-05, + "loss": 0.2723, + "step": 11885 + }, + { + "epoch": 2.0293590509516086, + "grad_norm": 0.3494563980866781, + "learning_rate": 1.796914516944866e-05, + "loss": 0.2567, + "step": 11890 + }, + { + "epoch": 2.030212511735086, + "grad_norm": 0.5078171681533615, + "learning_rate": 1.7953338391502278e-05, + "loss": 0.3068, + "step": 11895 + }, + { + "epoch": 2.031065972518563, + "grad_norm": 0.33115962232712237, + "learning_rate": 1.7937531613555895e-05, + "loss": 0.2774, + "step": 11900 + }, + { + "epoch": 2.03191943330204, + "grad_norm": 0.3601496012552449, + "learning_rate": 1.792172483560951e-05, + "loss": 0.2873, + "step": 11905 + }, + { + "epoch": 2.0327728940855168, + "grad_norm": 0.2972739692797199, + "learning_rate": 1.7905918057663127e-05, + "loss": 0.2746, + "step": 11910 + }, + { + "epoch": 2.0336263548689937, + "grad_norm": 0.34598117202222756, + "learning_rate": 1.7890111279716744e-05, + "loss": 0.2788, + "step": 11915 + }, + { + "epoch": 2.0344798156524706, + "grad_norm": 0.2932483984070083, + "learning_rate": 1.7874304501770358e-05, + "loss": 0.2541, + "step": 11920 + }, + { + "epoch": 2.035333276435948, + "grad_norm": 0.2863647603035123, + "learning_rate": 1.785849772382398e-05, + "loss": 0.2627, + "step": 11925 + }, + { + "epoch": 2.036186737219425, + "grad_norm": 0.37976530277574055, + "learning_rate": 1.7842690945877593e-05, + "loss": 0.293, + "step": 11930 + }, + { + "epoch": 2.037040198002902, + "grad_norm": 0.31537747743198213, + "learning_rate": 1.782688416793121e-05, + "loss": 0.2723, + "step": 11935 + }, + { + "epoch": 2.0378936587863787, + "grad_norm": 0.3300382729548969, + "learning_rate": 1.7811077389984825e-05, + "loss": 0.2643, + "step": 11940 + }, + { + "epoch": 2.0387471195698557, + "grad_norm": 0.447352553052991, + "learning_rate": 1.7795270612038442e-05, + "loss": 0.2677, + "step": 11945 + }, + { + "epoch": 2.0396005803533326, + "grad_norm": 0.36568175470105174, + "learning_rate": 1.777946383409206e-05, + "loss": 0.2759, + "step": 11950 + }, + { + "epoch": 2.04045404113681, + "grad_norm": 0.34674038246091743, + "learning_rate": 1.7763657056145677e-05, + "loss": 0.2775, + "step": 11955 + }, + { + "epoch": 2.041307501920287, + "grad_norm": 0.3099595102646737, + "learning_rate": 1.7747850278199294e-05, + "loss": 0.2406, + "step": 11960 + }, + { + "epoch": 2.042160962703764, + "grad_norm": 0.34062152209947577, + "learning_rate": 1.773204350025291e-05, + "loss": 0.2546, + "step": 11965 + }, + { + "epoch": 2.0430144234872407, + "grad_norm": 0.3212507029051799, + "learning_rate": 1.7716236722306526e-05, + "loss": 0.2781, + "step": 11970 + }, + { + "epoch": 2.0438678842707176, + "grad_norm": 0.3669402570578361, + "learning_rate": 1.7700429944360143e-05, + "loss": 0.2508, + "step": 11975 + }, + { + "epoch": 2.0447213450541946, + "grad_norm": 0.3256430454535454, + "learning_rate": 1.7684623166413757e-05, + "loss": 0.2707, + "step": 11980 + }, + { + "epoch": 2.045574805837672, + "grad_norm": 0.3644091779560888, + "learning_rate": 1.7668816388467378e-05, + "loss": 0.2611, + "step": 11985 + }, + { + "epoch": 2.046428266621149, + "grad_norm": 0.29339094551071926, + "learning_rate": 1.7653009610520992e-05, + "loss": 0.277, + "step": 11990 + }, + { + "epoch": 2.0472817274046258, + "grad_norm": 0.32224421179142887, + "learning_rate": 1.763720283257461e-05, + "loss": 0.2854, + "step": 11995 + }, + { + "epoch": 2.0481351881881027, + "grad_norm": 0.6105808733353931, + "learning_rate": 1.7621396054628224e-05, + "loss": 0.29, + "step": 12000 + }, + { + "epoch": 2.0489886489715796, + "grad_norm": 0.6269079828473192, + "learning_rate": 1.760558927668184e-05, + "loss": 0.2836, + "step": 12005 + }, + { + "epoch": 2.0498421097550565, + "grad_norm": 0.33382413769115177, + "learning_rate": 1.758978249873546e-05, + "loss": 0.2804, + "step": 12010 + }, + { + "epoch": 2.050695570538534, + "grad_norm": 0.3833260788413062, + "learning_rate": 1.7573975720789076e-05, + "loss": 0.2761, + "step": 12015 + }, + { + "epoch": 2.051549031322011, + "grad_norm": 0.7526300116993044, + "learning_rate": 1.7558168942842694e-05, + "loss": 0.2675, + "step": 12020 + }, + { + "epoch": 2.0524024921054878, + "grad_norm": 0.4095339100076474, + "learning_rate": 1.7542362164896308e-05, + "loss": 0.2531, + "step": 12025 + }, + { + "epoch": 2.0532559528889647, + "grad_norm": 0.3324065066305095, + "learning_rate": 1.7526555386949925e-05, + "loss": 0.2664, + "step": 12030 + }, + { + "epoch": 2.0541094136724416, + "grad_norm": 0.31380277400476436, + "learning_rate": 1.7510748609003543e-05, + "loss": 0.2596, + "step": 12035 + }, + { + "epoch": 2.054962874455919, + "grad_norm": 0.4274697022704499, + "learning_rate": 1.7494941831057157e-05, + "loss": 0.2463, + "step": 12040 + }, + { + "epoch": 2.055816335239396, + "grad_norm": 0.35626841520038793, + "learning_rate": 1.7479135053110774e-05, + "loss": 0.2719, + "step": 12045 + }, + { + "epoch": 2.056669796022873, + "grad_norm": 0.3974235988514715, + "learning_rate": 1.746332827516439e-05, + "loss": 0.2709, + "step": 12050 + }, + { + "epoch": 2.0575232568063497, + "grad_norm": 0.30990091059678676, + "learning_rate": 1.744752149721801e-05, + "loss": 0.2596, + "step": 12055 + }, + { + "epoch": 2.0583767175898267, + "grad_norm": 0.33322796671815513, + "learning_rate": 1.7431714719271623e-05, + "loss": 0.2913, + "step": 12060 + }, + { + "epoch": 2.0592301783733036, + "grad_norm": 0.3171079041098653, + "learning_rate": 1.741590794132524e-05, + "loss": 0.2807, + "step": 12065 + }, + { + "epoch": 2.060083639156781, + "grad_norm": 0.3588034806290993, + "learning_rate": 1.7400101163378858e-05, + "loss": 0.2531, + "step": 12070 + }, + { + "epoch": 2.060937099940258, + "grad_norm": 0.33898417082895527, + "learning_rate": 1.7384294385432472e-05, + "loss": 0.2732, + "step": 12075 + }, + { + "epoch": 2.061790560723735, + "grad_norm": 0.3092639727199814, + "learning_rate": 1.7368487607486093e-05, + "loss": 0.2898, + "step": 12080 + }, + { + "epoch": 2.0626440215072117, + "grad_norm": 0.31338268131353825, + "learning_rate": 1.7352680829539707e-05, + "loss": 0.3038, + "step": 12085 + }, + { + "epoch": 2.0634974822906886, + "grad_norm": 0.29827852349198464, + "learning_rate": 1.7336874051593325e-05, + "loss": 0.2584, + "step": 12090 + }, + { + "epoch": 2.0643509430741656, + "grad_norm": 0.3657097608792625, + "learning_rate": 1.7321067273646942e-05, + "loss": 0.2759, + "step": 12095 + }, + { + "epoch": 2.065204403857643, + "grad_norm": 0.3765895570517396, + "learning_rate": 1.7305260495700556e-05, + "loss": 0.2534, + "step": 12100 + }, + { + "epoch": 2.06605786464112, + "grad_norm": 0.4322246859086725, + "learning_rate": 1.7289453717754174e-05, + "loss": 0.2672, + "step": 12105 + }, + { + "epoch": 2.0669113254245968, + "grad_norm": 0.3118296114497835, + "learning_rate": 1.727364693980779e-05, + "loss": 0.2722, + "step": 12110 + }, + { + "epoch": 2.0677647862080737, + "grad_norm": 0.31531418590925914, + "learning_rate": 1.725784016186141e-05, + "loss": 0.2729, + "step": 12115 + }, + { + "epoch": 2.0686182469915506, + "grad_norm": 0.29201923605182517, + "learning_rate": 1.7242033383915023e-05, + "loss": 0.2588, + "step": 12120 + }, + { + "epoch": 2.0694717077750275, + "grad_norm": 0.3403082569544838, + "learning_rate": 1.722622660596864e-05, + "loss": 0.2729, + "step": 12125 + }, + { + "epoch": 2.070325168558505, + "grad_norm": 0.35162950149295324, + "learning_rate": 1.7210419828022257e-05, + "loss": 0.2631, + "step": 12130 + }, + { + "epoch": 2.071178629341982, + "grad_norm": 0.4200351820771278, + "learning_rate": 1.719461305007587e-05, + "loss": 0.2719, + "step": 12135 + }, + { + "epoch": 2.0720320901254587, + "grad_norm": 0.37018489015593226, + "learning_rate": 1.7178806272129492e-05, + "loss": 0.2532, + "step": 12140 + }, + { + "epoch": 2.0728855509089357, + "grad_norm": 0.2948475725320786, + "learning_rate": 1.7162999494183106e-05, + "loss": 0.2305, + "step": 12145 + }, + { + "epoch": 2.0737390116924126, + "grad_norm": 0.31880448697394564, + "learning_rate": 1.7147192716236724e-05, + "loss": 0.2916, + "step": 12150 + }, + { + "epoch": 2.0745924724758895, + "grad_norm": 0.30086387149635874, + "learning_rate": 1.713138593829034e-05, + "loss": 0.2728, + "step": 12155 + }, + { + "epoch": 2.075445933259367, + "grad_norm": 0.35225846509084374, + "learning_rate": 1.7115579160343955e-05, + "loss": 0.2755, + "step": 12160 + }, + { + "epoch": 2.076299394042844, + "grad_norm": 0.34709944866279024, + "learning_rate": 1.7099772382397573e-05, + "loss": 0.2615, + "step": 12165 + }, + { + "epoch": 2.0771528548263207, + "grad_norm": 0.333291664285541, + "learning_rate": 1.708396560445119e-05, + "loss": 0.2624, + "step": 12170 + }, + { + "epoch": 2.0780063156097976, + "grad_norm": 0.3360796911333201, + "learning_rate": 1.7068158826504808e-05, + "loss": 0.2841, + "step": 12175 + }, + { + "epoch": 2.0788597763932746, + "grad_norm": 0.40102339611112403, + "learning_rate": 1.7052352048558422e-05, + "loss": 0.2472, + "step": 12180 + }, + { + "epoch": 2.079713237176752, + "grad_norm": 0.3524313454573767, + "learning_rate": 1.703654527061204e-05, + "loss": 0.2738, + "step": 12185 + }, + { + "epoch": 2.080566697960229, + "grad_norm": 0.38390466639979504, + "learning_rate": 1.7020738492665657e-05, + "loss": 0.2764, + "step": 12190 + }, + { + "epoch": 2.081420158743706, + "grad_norm": 0.3616939115523455, + "learning_rate": 1.700493171471927e-05, + "loss": 0.2676, + "step": 12195 + }, + { + "epoch": 2.0822736195271827, + "grad_norm": 0.34575626616021204, + "learning_rate": 1.6989124936772888e-05, + "loss": 0.2717, + "step": 12200 + }, + { + "epoch": 2.0831270803106596, + "grad_norm": 0.5088527280069074, + "learning_rate": 1.6973318158826506e-05, + "loss": 0.2601, + "step": 12205 + }, + { + "epoch": 2.0839805410941366, + "grad_norm": 0.3481856832523284, + "learning_rate": 1.6957511380880123e-05, + "loss": 0.2659, + "step": 12210 + }, + { + "epoch": 2.084834001877614, + "grad_norm": 0.3584208310807397, + "learning_rate": 1.694170460293374e-05, + "loss": 0.2682, + "step": 12215 + }, + { + "epoch": 2.085687462661091, + "grad_norm": 0.33623978474962535, + "learning_rate": 1.6925897824987355e-05, + "loss": 0.274, + "step": 12220 + }, + { + "epoch": 2.0865409234445678, + "grad_norm": 0.3430855451783835, + "learning_rate": 1.6910091047040972e-05, + "loss": 0.2723, + "step": 12225 + }, + { + "epoch": 2.0873943842280447, + "grad_norm": 0.3239022405078139, + "learning_rate": 1.6894284269094586e-05, + "loss": 0.2657, + "step": 12230 + }, + { + "epoch": 2.0882478450115216, + "grad_norm": 0.43544523014277337, + "learning_rate": 1.6878477491148207e-05, + "loss": 0.2632, + "step": 12235 + }, + { + "epoch": 2.0891013057949985, + "grad_norm": 0.4528751490778456, + "learning_rate": 1.686267071320182e-05, + "loss": 0.2576, + "step": 12240 + }, + { + "epoch": 2.089954766578476, + "grad_norm": 0.3585745420304474, + "learning_rate": 1.684686393525544e-05, + "loss": 0.2739, + "step": 12245 + }, + { + "epoch": 2.090808227361953, + "grad_norm": 0.311146872238834, + "learning_rate": 1.6831057157309056e-05, + "loss": 0.2575, + "step": 12250 + }, + { + "epoch": 2.0916616881454297, + "grad_norm": 0.3552959349700159, + "learning_rate": 1.681525037936267e-05, + "loss": 0.2805, + "step": 12255 + }, + { + "epoch": 2.0925151489289067, + "grad_norm": 0.32835550560208493, + "learning_rate": 1.6799443601416288e-05, + "loss": 0.2516, + "step": 12260 + }, + { + "epoch": 2.0933686097123836, + "grad_norm": 0.3347494591407445, + "learning_rate": 1.6783636823469905e-05, + "loss": 0.2804, + "step": 12265 + }, + { + "epoch": 2.0942220704958605, + "grad_norm": 0.6363722617507217, + "learning_rate": 1.6767830045523523e-05, + "loss": 0.275, + "step": 12270 + }, + { + "epoch": 2.095075531279338, + "grad_norm": 0.34387693365627625, + "learning_rate": 1.6752023267577137e-05, + "loss": 0.2639, + "step": 12275 + }, + { + "epoch": 2.095928992062815, + "grad_norm": 0.35194711797385764, + "learning_rate": 1.6736216489630754e-05, + "loss": 0.2815, + "step": 12280 + }, + { + "epoch": 2.0967824528462917, + "grad_norm": 0.3278093489773815, + "learning_rate": 1.672040971168437e-05, + "loss": 0.3025, + "step": 12285 + }, + { + "epoch": 2.0976359136297686, + "grad_norm": 0.32201858704326325, + "learning_rate": 1.6704602933737986e-05, + "loss": 0.26, + "step": 12290 + }, + { + "epoch": 2.0984893744132456, + "grad_norm": 0.43088177963709523, + "learning_rate": 1.6688796155791606e-05, + "loss": 0.2664, + "step": 12295 + }, + { + "epoch": 2.0993428351967225, + "grad_norm": 0.35898411380628825, + "learning_rate": 1.667298937784522e-05, + "loss": 0.2829, + "step": 12300 + }, + { + "epoch": 2.1001962959802, + "grad_norm": 0.36341604943601946, + "learning_rate": 1.6657182599898838e-05, + "loss": 0.2787, + "step": 12305 + }, + { + "epoch": 2.101049756763677, + "grad_norm": 0.3801967985876427, + "learning_rate": 1.6641375821952455e-05, + "loss": 0.2525, + "step": 12310 + }, + { + "epoch": 2.1019032175471537, + "grad_norm": 0.3484349896151566, + "learning_rate": 1.662556904400607e-05, + "loss": 0.2703, + "step": 12315 + }, + { + "epoch": 2.1027566783306306, + "grad_norm": 0.6689341642203409, + "learning_rate": 1.6609762266059687e-05, + "loss": 0.2767, + "step": 12320 + }, + { + "epoch": 2.1036101391141075, + "grad_norm": 0.42341943337546845, + "learning_rate": 1.65939554881133e-05, + "loss": 0.2605, + "step": 12325 + }, + { + "epoch": 2.104463599897585, + "grad_norm": 0.32348121955528175, + "learning_rate": 1.6578148710166922e-05, + "loss": 0.2735, + "step": 12330 + }, + { + "epoch": 2.105317060681062, + "grad_norm": 0.3205700673140748, + "learning_rate": 1.6562341932220536e-05, + "loss": 0.2685, + "step": 12335 + }, + { + "epoch": 2.1061705214645388, + "grad_norm": 0.34986014628067635, + "learning_rate": 1.6546535154274153e-05, + "loss": 0.2567, + "step": 12340 + }, + { + "epoch": 2.1070239822480157, + "grad_norm": 0.43101180839913, + "learning_rate": 1.653072837632777e-05, + "loss": 0.2679, + "step": 12345 + }, + { + "epoch": 2.1078774430314926, + "grad_norm": 0.3627838318748922, + "learning_rate": 1.6514921598381385e-05, + "loss": 0.2455, + "step": 12350 + }, + { + "epoch": 2.1087309038149695, + "grad_norm": 0.323420809694066, + "learning_rate": 1.6499114820435002e-05, + "loss": 0.2886, + "step": 12355 + }, + { + "epoch": 2.109584364598447, + "grad_norm": 0.3552575219189849, + "learning_rate": 1.648330804248862e-05, + "loss": 0.2668, + "step": 12360 + }, + { + "epoch": 2.110437825381924, + "grad_norm": 0.35622871312938437, + "learning_rate": 1.6467501264542237e-05, + "loss": 0.2722, + "step": 12365 + }, + { + "epoch": 2.1112912861654007, + "grad_norm": 0.3758135642451043, + "learning_rate": 1.6451694486595855e-05, + "loss": 0.29, + "step": 12370 + }, + { + "epoch": 2.1121447469488777, + "grad_norm": 0.5326988667885914, + "learning_rate": 1.643588770864947e-05, + "loss": 0.2707, + "step": 12375 + }, + { + "epoch": 2.1129982077323546, + "grad_norm": 1.3780005011298029, + "learning_rate": 1.6420080930703086e-05, + "loss": 0.2974, + "step": 12380 + }, + { + "epoch": 2.1138516685158315, + "grad_norm": 0.3316592274921884, + "learning_rate": 1.64042741527567e-05, + "loss": 0.2607, + "step": 12385 + }, + { + "epoch": 2.114705129299309, + "grad_norm": 0.32763820515024833, + "learning_rate": 1.638846737481032e-05, + "loss": 0.2716, + "step": 12390 + }, + { + "epoch": 2.115558590082786, + "grad_norm": 0.37938700687076043, + "learning_rate": 1.6372660596863935e-05, + "loss": 0.2753, + "step": 12395 + }, + { + "epoch": 2.1164120508662627, + "grad_norm": 0.311938748878213, + "learning_rate": 1.6356853818917553e-05, + "loss": 0.2682, + "step": 12400 + }, + { + "epoch": 2.1172655116497396, + "grad_norm": 0.3436823316900887, + "learning_rate": 1.634104704097117e-05, + "loss": 0.2773, + "step": 12405 + }, + { + "epoch": 2.1181189724332166, + "grad_norm": 0.3693852853641911, + "learning_rate": 1.6325240263024784e-05, + "loss": 0.2697, + "step": 12410 + }, + { + "epoch": 2.1189724332166935, + "grad_norm": 0.30534991718254095, + "learning_rate": 1.63094334850784e-05, + "loss": 0.2944, + "step": 12415 + }, + { + "epoch": 2.119825894000171, + "grad_norm": 0.46673971087004035, + "learning_rate": 1.629362670713202e-05, + "loss": 0.2775, + "step": 12420 + }, + { + "epoch": 2.1206793547836478, + "grad_norm": 0.40866693342916965, + "learning_rate": 1.6277819929185637e-05, + "loss": 0.2769, + "step": 12425 + }, + { + "epoch": 2.1215328155671247, + "grad_norm": 0.3348396839675023, + "learning_rate": 1.6262013151239254e-05, + "loss": 0.2585, + "step": 12430 + }, + { + "epoch": 2.1223862763506016, + "grad_norm": 0.3458705634911477, + "learning_rate": 1.6246206373292868e-05, + "loss": 0.2822, + "step": 12435 + }, + { + "epoch": 2.1232397371340785, + "grad_norm": 0.34401778783058806, + "learning_rate": 1.6230399595346486e-05, + "loss": 0.277, + "step": 12440 + }, + { + "epoch": 2.1240931979175555, + "grad_norm": 0.3155670129755863, + "learning_rate": 1.62145928174001e-05, + "loss": 0.2474, + "step": 12445 + }, + { + "epoch": 2.124946658701033, + "grad_norm": 0.4164779372341016, + "learning_rate": 1.6198786039453717e-05, + "loss": 0.2674, + "step": 12450 + }, + { + "epoch": 2.1258001194845098, + "grad_norm": 0.30873321446598606, + "learning_rate": 1.6182979261507335e-05, + "loss": 0.2356, + "step": 12455 + }, + { + "epoch": 2.1266535802679867, + "grad_norm": 0.33359895490839675, + "learning_rate": 1.6167172483560952e-05, + "loss": 0.2396, + "step": 12460 + }, + { + "epoch": 2.1275070410514636, + "grad_norm": 0.34527339376243676, + "learning_rate": 1.615136570561457e-05, + "loss": 0.2721, + "step": 12465 + }, + { + "epoch": 2.1283605018349405, + "grad_norm": 0.3467447095430177, + "learning_rate": 1.6135558927668184e-05, + "loss": 0.261, + "step": 12470 + }, + { + "epoch": 2.129213962618418, + "grad_norm": 0.3336284624075867, + "learning_rate": 1.61197521497218e-05, + "loss": 0.2873, + "step": 12475 + }, + { + "epoch": 2.130067423401895, + "grad_norm": 0.3427654798703003, + "learning_rate": 1.610394537177542e-05, + "loss": 0.2747, + "step": 12480 + }, + { + "epoch": 2.1309208841853717, + "grad_norm": 0.2954348761039776, + "learning_rate": 1.6088138593829036e-05, + "loss": 0.2646, + "step": 12485 + }, + { + "epoch": 2.1317743449688487, + "grad_norm": 0.3508140722897979, + "learning_rate": 1.6072331815882653e-05, + "loss": 0.2632, + "step": 12490 + }, + { + "epoch": 2.1326278057523256, + "grad_norm": 0.3202599343544827, + "learning_rate": 1.6056525037936267e-05, + "loss": 0.2658, + "step": 12495 + }, + { + "epoch": 2.1334812665358025, + "grad_norm": 0.3465604693790608, + "learning_rate": 1.6040718259989885e-05, + "loss": 0.244, + "step": 12500 + }, + { + "epoch": 2.13433472731928, + "grad_norm": 0.36359975874338335, + "learning_rate": 1.60249114820435e-05, + "loss": 0.2608, + "step": 12505 + }, + { + "epoch": 2.135188188102757, + "grad_norm": 0.308184805668463, + "learning_rate": 1.6009104704097116e-05, + "loss": 0.2698, + "step": 12510 + }, + { + "epoch": 2.1360416488862337, + "grad_norm": 0.3247399308382604, + "learning_rate": 1.5993297926150734e-05, + "loss": 0.2738, + "step": 12515 + }, + { + "epoch": 2.1368951096697106, + "grad_norm": 0.3907883620181, + "learning_rate": 1.597749114820435e-05, + "loss": 0.2655, + "step": 12520 + }, + { + "epoch": 2.1377485704531876, + "grad_norm": 0.33844432676774516, + "learning_rate": 1.596168437025797e-05, + "loss": 0.2627, + "step": 12525 + }, + { + "epoch": 2.1386020312366645, + "grad_norm": 0.3170816006943378, + "learning_rate": 1.5945877592311583e-05, + "loss": 0.2641, + "step": 12530 + }, + { + "epoch": 2.139455492020142, + "grad_norm": 0.3397399340019784, + "learning_rate": 1.59300708143652e-05, + "loss": 0.2834, + "step": 12535 + }, + { + "epoch": 2.1403089528036188, + "grad_norm": 0.31757903497442197, + "learning_rate": 1.5914264036418818e-05, + "loss": 0.2898, + "step": 12540 + }, + { + "epoch": 2.1411624135870957, + "grad_norm": 0.4508104912347626, + "learning_rate": 1.5898457258472435e-05, + "loss": 0.2583, + "step": 12545 + }, + { + "epoch": 2.1420158743705726, + "grad_norm": 0.3312541425875485, + "learning_rate": 1.5882650480526053e-05, + "loss": 0.2638, + "step": 12550 + }, + { + "epoch": 2.1428693351540495, + "grad_norm": 0.3454086840727479, + "learning_rate": 1.5866843702579667e-05, + "loss": 0.2949, + "step": 12555 + }, + { + "epoch": 2.143722795937527, + "grad_norm": 0.35233563685593, + "learning_rate": 1.5851036924633284e-05, + "loss": 0.2789, + "step": 12560 + }, + { + "epoch": 2.144576256721004, + "grad_norm": 0.35464891316951724, + "learning_rate": 1.5835230146686898e-05, + "loss": 0.2554, + "step": 12565 + }, + { + "epoch": 2.1454297175044807, + "grad_norm": 0.3423594887215211, + "learning_rate": 1.5819423368740516e-05, + "loss": 0.2672, + "step": 12570 + }, + { + "epoch": 2.1462831782879577, + "grad_norm": 0.364168129489653, + "learning_rate": 1.5803616590794133e-05, + "loss": 0.2628, + "step": 12575 + }, + { + "epoch": 2.1471366390714346, + "grad_norm": 0.3512621654880636, + "learning_rate": 1.578780981284775e-05, + "loss": 0.2714, + "step": 12580 + }, + { + "epoch": 2.1479900998549115, + "grad_norm": 0.32118410562314237, + "learning_rate": 1.5772003034901368e-05, + "loss": 0.2814, + "step": 12585 + }, + { + "epoch": 2.1488435606383884, + "grad_norm": 0.32706659710720487, + "learning_rate": 1.5756196256954982e-05, + "loss": 0.2612, + "step": 12590 + }, + { + "epoch": 2.149697021421866, + "grad_norm": 0.3491312081639566, + "learning_rate": 1.57403894790086e-05, + "loss": 0.2349, + "step": 12595 + }, + { + "epoch": 2.1505504822053427, + "grad_norm": 0.4315399093375859, + "learning_rate": 1.5724582701062217e-05, + "loss": 0.2883, + "step": 12600 + }, + { + "epoch": 2.1514039429888197, + "grad_norm": 0.31659750259238256, + "learning_rate": 1.570877592311583e-05, + "loss": 0.2831, + "step": 12605 + }, + { + "epoch": 2.1522574037722966, + "grad_norm": 0.35447492593381486, + "learning_rate": 1.5692969145169452e-05, + "loss": 0.2772, + "step": 12610 + }, + { + "epoch": 2.1531108645557735, + "grad_norm": 0.3252605965205536, + "learning_rate": 1.5677162367223066e-05, + "loss": 0.2711, + "step": 12615 + }, + { + "epoch": 2.153964325339251, + "grad_norm": 0.33529255229945437, + "learning_rate": 1.5661355589276684e-05, + "loss": 0.2564, + "step": 12620 + }, + { + "epoch": 2.154817786122728, + "grad_norm": 0.3092992567695559, + "learning_rate": 1.5645548811330298e-05, + "loss": 0.2872, + "step": 12625 + }, + { + "epoch": 2.1556712469062047, + "grad_norm": 0.36336461468734477, + "learning_rate": 1.5629742033383915e-05, + "loss": 0.2922, + "step": 12630 + }, + { + "epoch": 2.1565247076896816, + "grad_norm": 0.3432044815496815, + "learning_rate": 1.5613935255437533e-05, + "loss": 0.2567, + "step": 12635 + }, + { + "epoch": 2.1573781684731586, + "grad_norm": 0.4069346298755619, + "learning_rate": 1.559812847749115e-05, + "loss": 0.2445, + "step": 12640 + }, + { + "epoch": 2.1582316292566355, + "grad_norm": 0.40033171137575607, + "learning_rate": 1.5582321699544767e-05, + "loss": 0.2596, + "step": 12645 + }, + { + "epoch": 2.159085090040113, + "grad_norm": 0.2854063154369835, + "learning_rate": 1.556651492159838e-05, + "loss": 0.2698, + "step": 12650 + }, + { + "epoch": 2.1599385508235898, + "grad_norm": 0.4636509583084091, + "learning_rate": 1.5550708143652e-05, + "loss": 0.2866, + "step": 12655 + }, + { + "epoch": 2.1607920116070667, + "grad_norm": 0.34682881592035003, + "learning_rate": 1.5534901365705613e-05, + "loss": 0.2823, + "step": 12660 + }, + { + "epoch": 2.1616454723905436, + "grad_norm": 0.3290308156842946, + "learning_rate": 1.551909458775923e-05, + "loss": 0.2759, + "step": 12665 + }, + { + "epoch": 2.1624989331740205, + "grad_norm": 0.36783948757997753, + "learning_rate": 1.550328780981285e-05, + "loss": 0.2405, + "step": 12670 + }, + { + "epoch": 2.1633523939574975, + "grad_norm": 0.35236119660393694, + "learning_rate": 1.5487481031866465e-05, + "loss": 0.2803, + "step": 12675 + }, + { + "epoch": 2.164205854740975, + "grad_norm": 0.3754353607264287, + "learning_rate": 1.5471674253920083e-05, + "loss": 0.2575, + "step": 12680 + }, + { + "epoch": 2.1650593155244517, + "grad_norm": 0.3861606015120469, + "learning_rate": 1.5455867475973697e-05, + "loss": 0.2592, + "step": 12685 + }, + { + "epoch": 2.1659127763079287, + "grad_norm": 0.3675140543631446, + "learning_rate": 1.5440060698027314e-05, + "loss": 0.2601, + "step": 12690 + }, + { + "epoch": 2.1667662370914056, + "grad_norm": 0.32619482564351854, + "learning_rate": 1.5424253920080932e-05, + "loss": 0.2616, + "step": 12695 + }, + { + "epoch": 2.1676196978748825, + "grad_norm": 0.3884753490924257, + "learning_rate": 1.5408447142134546e-05, + "loss": 0.2773, + "step": 12700 + }, + { + "epoch": 2.16847315865836, + "grad_norm": 0.34120808204855113, + "learning_rate": 1.5392640364188167e-05, + "loss": 0.2652, + "step": 12705 + }, + { + "epoch": 2.169326619441837, + "grad_norm": 0.37573300008074934, + "learning_rate": 1.537683358624178e-05, + "loss": 0.2796, + "step": 12710 + }, + { + "epoch": 2.1701800802253137, + "grad_norm": 0.32540937938049824, + "learning_rate": 1.53610268082954e-05, + "loss": 0.3074, + "step": 12715 + }, + { + "epoch": 2.1710335410087906, + "grad_norm": 0.3082161890237831, + "learning_rate": 1.5345220030349012e-05, + "loss": 0.2566, + "step": 12720 + }, + { + "epoch": 2.1718870017922676, + "grad_norm": 0.32691786850356097, + "learning_rate": 1.532941325240263e-05, + "loss": 0.2825, + "step": 12725 + }, + { + "epoch": 2.1727404625757445, + "grad_norm": 0.3575672326173452, + "learning_rate": 1.5313606474456247e-05, + "loss": 0.2758, + "step": 12730 + }, + { + "epoch": 2.1735939233592214, + "grad_norm": 0.3228001556590004, + "learning_rate": 1.5297799696509865e-05, + "loss": 0.2923, + "step": 12735 + }, + { + "epoch": 2.174447384142699, + "grad_norm": 0.3680400486658653, + "learning_rate": 1.5281992918563482e-05, + "loss": 0.2645, + "step": 12740 + }, + { + "epoch": 2.1753008449261757, + "grad_norm": 0.2988167970677839, + "learning_rate": 1.5266186140617096e-05, + "loss": 0.2574, + "step": 12745 + }, + { + "epoch": 2.1761543057096526, + "grad_norm": 0.3481716359669406, + "learning_rate": 1.5250379362670714e-05, + "loss": 0.2704, + "step": 12750 + }, + { + "epoch": 2.1770077664931295, + "grad_norm": 0.6012227455690394, + "learning_rate": 1.523457258472433e-05, + "loss": 0.2756, + "step": 12755 + }, + { + "epoch": 2.1778612272766065, + "grad_norm": 0.3418187429879156, + "learning_rate": 1.5218765806777945e-05, + "loss": 0.2823, + "step": 12760 + }, + { + "epoch": 2.178714688060084, + "grad_norm": 0.3116854425472003, + "learning_rate": 1.5202959028831564e-05, + "loss": 0.2636, + "step": 12765 + }, + { + "epoch": 2.1795681488435608, + "grad_norm": 0.3524114961325397, + "learning_rate": 1.518715225088518e-05, + "loss": 0.2911, + "step": 12770 + }, + { + "epoch": 2.1804216096270377, + "grad_norm": 0.352148168631824, + "learning_rate": 1.5171345472938798e-05, + "loss": 0.2962, + "step": 12775 + }, + { + "epoch": 2.1812750704105146, + "grad_norm": 0.338679396703598, + "learning_rate": 1.5155538694992413e-05, + "loss": 0.2564, + "step": 12780 + }, + { + "epoch": 2.1821285311939915, + "grad_norm": 0.32320553715208916, + "learning_rate": 1.513973191704603e-05, + "loss": 0.2732, + "step": 12785 + }, + { + "epoch": 2.1829819919774685, + "grad_norm": 0.35736734339987114, + "learning_rate": 1.5123925139099645e-05, + "loss": 0.2631, + "step": 12790 + }, + { + "epoch": 2.183835452760946, + "grad_norm": 0.34756966658806226, + "learning_rate": 1.5108118361153264e-05, + "loss": 0.2557, + "step": 12795 + }, + { + "epoch": 2.1846889135444227, + "grad_norm": 0.37993879830853133, + "learning_rate": 1.509231158320688e-05, + "loss": 0.2503, + "step": 12800 + }, + { + "epoch": 2.1855423743278997, + "grad_norm": 0.3152500178102411, + "learning_rate": 1.5076504805260497e-05, + "loss": 0.2726, + "step": 12805 + }, + { + "epoch": 2.1863958351113766, + "grad_norm": 0.3530014163806682, + "learning_rate": 1.5060698027314113e-05, + "loss": 0.2509, + "step": 12810 + }, + { + "epoch": 2.1872492958948535, + "grad_norm": 0.3375905873609432, + "learning_rate": 1.5044891249367729e-05, + "loss": 0.2601, + "step": 12815 + }, + { + "epoch": 2.1881027566783304, + "grad_norm": 0.3208022206971321, + "learning_rate": 1.5029084471421345e-05, + "loss": 0.2641, + "step": 12820 + }, + { + "epoch": 2.188956217461808, + "grad_norm": 0.32239717872885754, + "learning_rate": 1.5013277693474964e-05, + "loss": 0.2709, + "step": 12825 + }, + { + "epoch": 2.1898096782452847, + "grad_norm": 0.3667722488933535, + "learning_rate": 1.499747091552858e-05, + "loss": 0.2588, + "step": 12830 + }, + { + "epoch": 2.1906631390287616, + "grad_norm": 0.3700698531378623, + "learning_rate": 1.4981664137582197e-05, + "loss": 0.2773, + "step": 12835 + }, + { + "epoch": 2.1915165998122386, + "grad_norm": 0.36627608522584637, + "learning_rate": 1.4965857359635813e-05, + "loss": 0.2595, + "step": 12840 + }, + { + "epoch": 2.1923700605957155, + "grad_norm": 0.43087433500491623, + "learning_rate": 1.4950050581689428e-05, + "loss": 0.2764, + "step": 12845 + }, + { + "epoch": 2.193223521379193, + "grad_norm": 0.3093574838883957, + "learning_rate": 1.4934243803743044e-05, + "loss": 0.266, + "step": 12850 + }, + { + "epoch": 2.1940769821626698, + "grad_norm": 0.5303469339533032, + "learning_rate": 1.4918437025796662e-05, + "loss": 0.2541, + "step": 12855 + }, + { + "epoch": 2.1949304429461467, + "grad_norm": 0.326459257267585, + "learning_rate": 1.490263024785028e-05, + "loss": 0.2816, + "step": 12860 + }, + { + "epoch": 2.1957839037296236, + "grad_norm": 0.31829004688424295, + "learning_rate": 1.4886823469903897e-05, + "loss": 0.2688, + "step": 12865 + }, + { + "epoch": 2.1966373645131005, + "grad_norm": 0.34921997998373205, + "learning_rate": 1.4871016691957512e-05, + "loss": 0.262, + "step": 12870 + }, + { + "epoch": 2.1974908252965775, + "grad_norm": 0.30883359187418896, + "learning_rate": 1.4855209914011128e-05, + "loss": 0.2882, + "step": 12875 + }, + { + "epoch": 2.1983442860800544, + "grad_norm": 0.33608701417398557, + "learning_rate": 1.4839403136064744e-05, + "loss": 0.2862, + "step": 12880 + }, + { + "epoch": 2.1991977468635318, + "grad_norm": 0.3203124453274556, + "learning_rate": 1.4823596358118361e-05, + "loss": 0.2484, + "step": 12885 + }, + { + "epoch": 2.2000512076470087, + "grad_norm": 0.32154313066050594, + "learning_rate": 1.4807789580171979e-05, + "loss": 0.2464, + "step": 12890 + }, + { + "epoch": 2.2009046684304856, + "grad_norm": 0.40498751721611115, + "learning_rate": 1.4791982802225596e-05, + "loss": 0.2595, + "step": 12895 + }, + { + "epoch": 2.2017581292139625, + "grad_norm": 0.328804572442942, + "learning_rate": 1.4776176024279212e-05, + "loss": 0.2595, + "step": 12900 + }, + { + "epoch": 2.2026115899974394, + "grad_norm": 0.3953120231902593, + "learning_rate": 1.4760369246332828e-05, + "loss": 0.2844, + "step": 12905 + }, + { + "epoch": 2.203465050780917, + "grad_norm": 0.31917535029732214, + "learning_rate": 1.4744562468386444e-05, + "loss": 0.293, + "step": 12910 + }, + { + "epoch": 2.2043185115643937, + "grad_norm": 0.3390405116903046, + "learning_rate": 1.4728755690440061e-05, + "loss": 0.2824, + "step": 12915 + }, + { + "epoch": 2.2051719723478707, + "grad_norm": 0.3022370342046419, + "learning_rate": 1.4712948912493679e-05, + "loss": 0.2439, + "step": 12920 + }, + { + "epoch": 2.2060254331313476, + "grad_norm": 0.34719223033193247, + "learning_rate": 1.4697142134547296e-05, + "loss": 0.2752, + "step": 12925 + }, + { + "epoch": 2.2068788939148245, + "grad_norm": 0.3338113211641244, + "learning_rate": 1.4681335356600912e-05, + "loss": 0.2725, + "step": 12930 + }, + { + "epoch": 2.2077323546983014, + "grad_norm": 0.3085027669451087, + "learning_rate": 1.4665528578654527e-05, + "loss": 0.2726, + "step": 12935 + }, + { + "epoch": 2.208585815481779, + "grad_norm": 0.3491580780264821, + "learning_rate": 1.4649721800708143e-05, + "loss": 0.2746, + "step": 12940 + }, + { + "epoch": 2.2094392762652557, + "grad_norm": 0.3381555139448322, + "learning_rate": 1.463391502276176e-05, + "loss": 0.2679, + "step": 12945 + }, + { + "epoch": 2.2102927370487326, + "grad_norm": 0.3620677646590881, + "learning_rate": 1.4618108244815378e-05, + "loss": 0.2748, + "step": 12950 + }, + { + "epoch": 2.2111461978322096, + "grad_norm": 0.33091174734322143, + "learning_rate": 1.4602301466868996e-05, + "loss": 0.2558, + "step": 12955 + }, + { + "epoch": 2.2119996586156865, + "grad_norm": 0.32578075958840164, + "learning_rate": 1.4586494688922611e-05, + "loss": 0.2563, + "step": 12960 + }, + { + "epoch": 2.2128531193991634, + "grad_norm": 0.3148099453779239, + "learning_rate": 1.4570687910976227e-05, + "loss": 0.2746, + "step": 12965 + }, + { + "epoch": 2.2137065801826408, + "grad_norm": 0.3020408957231368, + "learning_rate": 1.4554881133029843e-05, + "loss": 0.3052, + "step": 12970 + }, + { + "epoch": 2.2145600409661177, + "grad_norm": 0.33901774599802326, + "learning_rate": 1.453907435508346e-05, + "loss": 0.2809, + "step": 12975 + }, + { + "epoch": 2.2154135017495946, + "grad_norm": 0.3672776534404785, + "learning_rate": 1.4523267577137076e-05, + "loss": 0.2614, + "step": 12980 + }, + { + "epoch": 2.2162669625330715, + "grad_norm": 0.35687879044065507, + "learning_rate": 1.4507460799190695e-05, + "loss": 0.2523, + "step": 12985 + }, + { + "epoch": 2.2171204233165485, + "grad_norm": 0.33417825892532527, + "learning_rate": 1.4491654021244311e-05, + "loss": 0.2649, + "step": 12990 + }, + { + "epoch": 2.217973884100026, + "grad_norm": 0.3248124831537634, + "learning_rate": 1.4475847243297927e-05, + "loss": 0.2606, + "step": 12995 + }, + { + "epoch": 2.2188273448835028, + "grad_norm": 0.328358316916069, + "learning_rate": 1.4460040465351543e-05, + "loss": 0.2684, + "step": 13000 + }, + { + "epoch": 2.2196808056669797, + "grad_norm": 0.32013619138294613, + "learning_rate": 1.444423368740516e-05, + "loss": 0.2745, + "step": 13005 + }, + { + "epoch": 2.2205342664504566, + "grad_norm": 0.4578169853135187, + "learning_rate": 1.4428426909458776e-05, + "loss": 0.2596, + "step": 13010 + }, + { + "epoch": 2.2213877272339335, + "grad_norm": 0.32431737220922735, + "learning_rate": 1.4412620131512395e-05, + "loss": 0.2789, + "step": 13015 + }, + { + "epoch": 2.2222411880174104, + "grad_norm": 0.3505141108910819, + "learning_rate": 1.439681335356601e-05, + "loss": 0.2821, + "step": 13020 + }, + { + "epoch": 2.223094648800888, + "grad_norm": 0.33086072608819334, + "learning_rate": 1.4381006575619626e-05, + "loss": 0.2905, + "step": 13025 + }, + { + "epoch": 2.2239481095843647, + "grad_norm": 0.33884909980139977, + "learning_rate": 1.4365199797673242e-05, + "loss": 0.2553, + "step": 13030 + }, + { + "epoch": 2.2248015703678417, + "grad_norm": 0.37831012435066824, + "learning_rate": 1.434939301972686e-05, + "loss": 0.2592, + "step": 13035 + }, + { + "epoch": 2.2256550311513186, + "grad_norm": 0.31582713332257867, + "learning_rate": 1.4333586241780475e-05, + "loss": 0.2833, + "step": 13040 + }, + { + "epoch": 2.2265084919347955, + "grad_norm": 0.3455450182611844, + "learning_rate": 1.4317779463834095e-05, + "loss": 0.2675, + "step": 13045 + }, + { + "epoch": 2.2273619527182724, + "grad_norm": 0.3374489300728345, + "learning_rate": 1.430197268588771e-05, + "loss": 0.2979, + "step": 13050 + }, + { + "epoch": 2.22821541350175, + "grad_norm": 0.45405171174075065, + "learning_rate": 1.4286165907941326e-05, + "loss": 0.2867, + "step": 13055 + }, + { + "epoch": 2.2290688742852267, + "grad_norm": 0.3394515701351571, + "learning_rate": 1.4270359129994942e-05, + "loss": 0.2699, + "step": 13060 + }, + { + "epoch": 2.2299223350687036, + "grad_norm": 0.36895329729472887, + "learning_rate": 1.425455235204856e-05, + "loss": 0.2709, + "step": 13065 + }, + { + "epoch": 2.2307757958521806, + "grad_norm": 0.35443931340420204, + "learning_rate": 1.4238745574102175e-05, + "loss": 0.2612, + "step": 13070 + }, + { + "epoch": 2.2316292566356575, + "grad_norm": 0.37439768952743185, + "learning_rate": 1.4222938796155794e-05, + "loss": 0.2566, + "step": 13075 + }, + { + "epoch": 2.2324827174191344, + "grad_norm": 0.3989028279151154, + "learning_rate": 1.420713201820941e-05, + "loss": 0.267, + "step": 13080 + }, + { + "epoch": 2.2333361782026118, + "grad_norm": 0.3145608456005138, + "learning_rate": 1.4191325240263026e-05, + "loss": 0.2595, + "step": 13085 + }, + { + "epoch": 2.2341896389860887, + "grad_norm": 0.404315313993326, + "learning_rate": 1.4175518462316642e-05, + "loss": 0.2476, + "step": 13090 + }, + { + "epoch": 2.2350430997695656, + "grad_norm": 0.33175989786238314, + "learning_rate": 1.4159711684370259e-05, + "loss": 0.2445, + "step": 13095 + }, + { + "epoch": 2.2358965605530425, + "grad_norm": 0.3673847446231519, + "learning_rate": 1.4143904906423875e-05, + "loss": 0.2795, + "step": 13100 + }, + { + "epoch": 2.2367500213365195, + "grad_norm": 0.32920478196154535, + "learning_rate": 1.412809812847749e-05, + "loss": 0.2612, + "step": 13105 + }, + { + "epoch": 2.2376034821199964, + "grad_norm": 0.3942709927381735, + "learning_rate": 1.411229135053111e-05, + "loss": 0.2664, + "step": 13110 + }, + { + "epoch": 2.2384569429034737, + "grad_norm": 0.34105379614174075, + "learning_rate": 1.4096484572584725e-05, + "loss": 0.2713, + "step": 13115 + }, + { + "epoch": 2.2393104036869507, + "grad_norm": 0.3319067913692869, + "learning_rate": 1.4080677794638341e-05, + "loss": 0.2797, + "step": 13120 + }, + { + "epoch": 2.2401638644704276, + "grad_norm": 0.33108165381903953, + "learning_rate": 1.4064871016691957e-05, + "loss": 0.274, + "step": 13125 + }, + { + "epoch": 2.2410173252539045, + "grad_norm": 0.3389736881258419, + "learning_rate": 1.4049064238745574e-05, + "loss": 0.28, + "step": 13130 + }, + { + "epoch": 2.2418707860373814, + "grad_norm": 0.3936727543709566, + "learning_rate": 1.403325746079919e-05, + "loss": 0.2851, + "step": 13135 + }, + { + "epoch": 2.242724246820859, + "grad_norm": 0.35429129447576024, + "learning_rate": 1.401745068285281e-05, + "loss": 0.2819, + "step": 13140 + }, + { + "epoch": 2.2435777076043357, + "grad_norm": 0.3327919813822197, + "learning_rate": 1.4001643904906425e-05, + "loss": 0.2712, + "step": 13145 + }, + { + "epoch": 2.2444311683878126, + "grad_norm": 0.36825000296835053, + "learning_rate": 1.3985837126960041e-05, + "loss": 0.2704, + "step": 13150 + }, + { + "epoch": 2.2452846291712896, + "grad_norm": 0.3037327517059426, + "learning_rate": 1.3970030349013657e-05, + "loss": 0.2845, + "step": 13155 + }, + { + "epoch": 2.2461380899547665, + "grad_norm": 0.3463895684505155, + "learning_rate": 1.3954223571067274e-05, + "loss": 0.2614, + "step": 13160 + }, + { + "epoch": 2.2469915507382434, + "grad_norm": 0.3716484778076624, + "learning_rate": 1.393841679312089e-05, + "loss": 0.265, + "step": 13165 + }, + { + "epoch": 2.247845011521721, + "grad_norm": 0.34069370568224044, + "learning_rate": 1.3922610015174509e-05, + "loss": 0.272, + "step": 13170 + }, + { + "epoch": 2.2486984723051977, + "grad_norm": 0.3560029147034675, + "learning_rate": 1.3906803237228125e-05, + "loss": 0.2796, + "step": 13175 + }, + { + "epoch": 2.2495519330886746, + "grad_norm": 0.38746486241761824, + "learning_rate": 1.389099645928174e-05, + "loss": 0.2818, + "step": 13180 + }, + { + "epoch": 2.2504053938721515, + "grad_norm": 0.3482362639054943, + "learning_rate": 1.3875189681335356e-05, + "loss": 0.2777, + "step": 13185 + }, + { + "epoch": 2.2512588546556285, + "grad_norm": 0.3255965079568052, + "learning_rate": 1.3859382903388974e-05, + "loss": 0.2851, + "step": 13190 + }, + { + "epoch": 2.2521123154391054, + "grad_norm": 0.31144665726446935, + "learning_rate": 1.384357612544259e-05, + "loss": 0.2717, + "step": 13195 + }, + { + "epoch": 2.2529657762225828, + "grad_norm": 0.3313669197754686, + "learning_rate": 1.3827769347496209e-05, + "loss": 0.2776, + "step": 13200 + }, + { + "epoch": 2.2538192370060597, + "grad_norm": 0.2952076521595467, + "learning_rate": 1.3811962569549824e-05, + "loss": 0.2752, + "step": 13205 + }, + { + "epoch": 2.2546726977895366, + "grad_norm": 0.3530678406931364, + "learning_rate": 1.379615579160344e-05, + "loss": 0.2576, + "step": 13210 + }, + { + "epoch": 2.2555261585730135, + "grad_norm": 0.36836585151120216, + "learning_rate": 1.3780349013657056e-05, + "loss": 0.2628, + "step": 13215 + }, + { + "epoch": 2.2563796193564905, + "grad_norm": 0.33551894077187827, + "learning_rate": 1.3764542235710673e-05, + "loss": 0.2684, + "step": 13220 + }, + { + "epoch": 2.257233080139968, + "grad_norm": 0.4224354356132112, + "learning_rate": 1.374873545776429e-05, + "loss": 0.2868, + "step": 13225 + }, + { + "epoch": 2.2580865409234447, + "grad_norm": 0.4123832020951836, + "learning_rate": 1.3732928679817905e-05, + "loss": 0.2767, + "step": 13230 + }, + { + "epoch": 2.2589400017069217, + "grad_norm": 0.3906636906336848, + "learning_rate": 1.3717121901871524e-05, + "loss": 0.2656, + "step": 13235 + }, + { + "epoch": 2.2597934624903986, + "grad_norm": 0.34283897290348253, + "learning_rate": 1.370131512392514e-05, + "loss": 0.2786, + "step": 13240 + }, + { + "epoch": 2.2606469232738755, + "grad_norm": 0.3163011115260496, + "learning_rate": 1.3685508345978756e-05, + "loss": 0.2606, + "step": 13245 + }, + { + "epoch": 2.2615003840573524, + "grad_norm": 0.39600564834207624, + "learning_rate": 1.3669701568032373e-05, + "loss": 0.2605, + "step": 13250 + }, + { + "epoch": 2.2623538448408294, + "grad_norm": 0.3083749239635374, + "learning_rate": 1.3653894790085989e-05, + "loss": 0.265, + "step": 13255 + }, + { + "epoch": 2.2632073056243067, + "grad_norm": 0.3760609634293888, + "learning_rate": 1.3638088012139605e-05, + "loss": 0.2462, + "step": 13260 + }, + { + "epoch": 2.2640607664077836, + "grad_norm": 0.3757475388409267, + "learning_rate": 1.3622281234193224e-05, + "loss": 0.2803, + "step": 13265 + }, + { + "epoch": 2.2649142271912606, + "grad_norm": 0.48846304920007216, + "learning_rate": 1.360647445624684e-05, + "loss": 0.2603, + "step": 13270 + }, + { + "epoch": 2.2657676879747375, + "grad_norm": 0.3514453908540707, + "learning_rate": 1.3590667678300455e-05, + "loss": 0.287, + "step": 13275 + }, + { + "epoch": 2.2666211487582144, + "grad_norm": 0.3470686782406026, + "learning_rate": 1.3574860900354073e-05, + "loss": 0.3474, + "step": 13280 + }, + { + "epoch": 2.2674746095416918, + "grad_norm": 0.3508841337950813, + "learning_rate": 1.3559054122407689e-05, + "loss": 0.2558, + "step": 13285 + }, + { + "epoch": 2.2683280703251687, + "grad_norm": 0.3139648663004901, + "learning_rate": 1.3543247344461304e-05, + "loss": 0.2645, + "step": 13290 + }, + { + "epoch": 2.2691815311086456, + "grad_norm": 0.32112362909304765, + "learning_rate": 1.3527440566514923e-05, + "loss": 0.2534, + "step": 13295 + }, + { + "epoch": 2.2700349918921225, + "grad_norm": 0.40308720353751076, + "learning_rate": 1.351163378856854e-05, + "loss": 0.249, + "step": 13300 + }, + { + "epoch": 2.2708884526755995, + "grad_norm": 0.3711046566966942, + "learning_rate": 1.3495827010622155e-05, + "loss": 0.2605, + "step": 13305 + }, + { + "epoch": 2.2717419134590764, + "grad_norm": 0.3396794923247863, + "learning_rate": 1.3480020232675772e-05, + "loss": 0.2707, + "step": 13310 + }, + { + "epoch": 2.2725953742425533, + "grad_norm": 0.35625958725160256, + "learning_rate": 1.3464213454729388e-05, + "loss": 0.2635, + "step": 13315 + }, + { + "epoch": 2.2734488350260307, + "grad_norm": 0.31309163461644807, + "learning_rate": 1.3448406676783004e-05, + "loss": 0.2773, + "step": 13320 + }, + { + "epoch": 2.2743022958095076, + "grad_norm": 0.3445215453677931, + "learning_rate": 1.3432599898836623e-05, + "loss": 0.2555, + "step": 13325 + }, + { + "epoch": 2.2751557565929845, + "grad_norm": 0.36659337926037683, + "learning_rate": 1.3416793120890239e-05, + "loss": 0.2702, + "step": 13330 + }, + { + "epoch": 2.2760092173764614, + "grad_norm": 0.35224502414420666, + "learning_rate": 1.3400986342943855e-05, + "loss": 0.2683, + "step": 13335 + }, + { + "epoch": 2.2768626781599384, + "grad_norm": 0.3050596078699797, + "learning_rate": 1.3385179564997472e-05, + "loss": 0.2742, + "step": 13340 + }, + { + "epoch": 2.2777161389434157, + "grad_norm": 0.3445279792044359, + "learning_rate": 1.3369372787051088e-05, + "loss": 0.284, + "step": 13345 + }, + { + "epoch": 2.2785695997268927, + "grad_norm": 0.4521506535764485, + "learning_rate": 1.3353566009104704e-05, + "loss": 0.2514, + "step": 13350 + }, + { + "epoch": 2.2794230605103696, + "grad_norm": 0.303913246496587, + "learning_rate": 1.3337759231158323e-05, + "loss": 0.2633, + "step": 13355 + }, + { + "epoch": 2.2802765212938465, + "grad_norm": 0.3502568952160238, + "learning_rate": 1.3321952453211939e-05, + "loss": 0.2594, + "step": 13360 + }, + { + "epoch": 2.2811299820773234, + "grad_norm": 0.3717036717618989, + "learning_rate": 1.3306145675265554e-05, + "loss": 0.253, + "step": 13365 + }, + { + "epoch": 2.281983442860801, + "grad_norm": 0.31335236449414805, + "learning_rate": 1.3290338897319172e-05, + "loss": 0.289, + "step": 13370 + }, + { + "epoch": 2.2828369036442777, + "grad_norm": 0.47642486009392315, + "learning_rate": 1.3274532119372788e-05, + "loss": 0.312, + "step": 13375 + }, + { + "epoch": 2.2836903644277546, + "grad_norm": 0.3706481152626613, + "learning_rate": 1.3258725341426403e-05, + "loss": 0.2734, + "step": 13380 + }, + { + "epoch": 2.2845438252112316, + "grad_norm": 0.335699018913271, + "learning_rate": 1.3242918563480019e-05, + "loss": 0.2588, + "step": 13385 + }, + { + "epoch": 2.2853972859947085, + "grad_norm": 0.39600171459187933, + "learning_rate": 1.3227111785533638e-05, + "loss": 0.2668, + "step": 13390 + }, + { + "epoch": 2.2862507467781854, + "grad_norm": 0.31398579074501, + "learning_rate": 1.3211305007587254e-05, + "loss": 0.2751, + "step": 13395 + }, + { + "epoch": 2.2871042075616623, + "grad_norm": 0.33944474559657767, + "learning_rate": 1.3195498229640871e-05, + "loss": 0.2719, + "step": 13400 + }, + { + "epoch": 2.2879576683451397, + "grad_norm": 0.3552249703706572, + "learning_rate": 1.3179691451694487e-05, + "loss": 0.2578, + "step": 13405 + }, + { + "epoch": 2.2888111291286166, + "grad_norm": 0.34558029914006605, + "learning_rate": 1.3163884673748103e-05, + "loss": 0.253, + "step": 13410 + }, + { + "epoch": 2.2896645899120935, + "grad_norm": 0.34898837234878644, + "learning_rate": 1.3148077895801719e-05, + "loss": 0.2707, + "step": 13415 + }, + { + "epoch": 2.2905180506955705, + "grad_norm": 0.3293566621281475, + "learning_rate": 1.3132271117855338e-05, + "loss": 0.2694, + "step": 13420 + }, + { + "epoch": 2.2913715114790474, + "grad_norm": 0.33463051881200645, + "learning_rate": 1.3116464339908954e-05, + "loss": 0.2584, + "step": 13425 + }, + { + "epoch": 2.2922249722625248, + "grad_norm": 0.32214858707844063, + "learning_rate": 1.3100657561962571e-05, + "loss": 0.265, + "step": 13430 + }, + { + "epoch": 2.2930784330460017, + "grad_norm": 0.2894622609386176, + "learning_rate": 1.3084850784016187e-05, + "loss": 0.2721, + "step": 13435 + }, + { + "epoch": 2.2939318938294786, + "grad_norm": 0.3351765964390977, + "learning_rate": 1.3069044006069803e-05, + "loss": 0.2602, + "step": 13440 + }, + { + "epoch": 2.2947853546129555, + "grad_norm": 0.4465919787461111, + "learning_rate": 1.3053237228123418e-05, + "loss": 0.2686, + "step": 13445 + }, + { + "epoch": 2.2956388153964324, + "grad_norm": 0.3568598133398877, + "learning_rate": 1.3037430450177038e-05, + "loss": 0.2734, + "step": 13450 + }, + { + "epoch": 2.2964922761799094, + "grad_norm": 0.6384608561412038, + "learning_rate": 1.3021623672230653e-05, + "loss": 0.2655, + "step": 13455 + }, + { + "epoch": 2.2973457369633863, + "grad_norm": 0.33839005812425016, + "learning_rate": 1.300581689428427e-05, + "loss": 0.2868, + "step": 13460 + }, + { + "epoch": 2.2981991977468637, + "grad_norm": 0.3483743337197857, + "learning_rate": 1.2990010116337887e-05, + "loss": 0.2533, + "step": 13465 + }, + { + "epoch": 2.2990526585303406, + "grad_norm": 0.3389495049205933, + "learning_rate": 1.2974203338391502e-05, + "loss": 0.2666, + "step": 13470 + }, + { + "epoch": 2.2999061193138175, + "grad_norm": 0.33493171178968356, + "learning_rate": 1.2958396560445118e-05, + "loss": 0.246, + "step": 13475 + }, + { + "epoch": 2.3007595800972944, + "grad_norm": 0.3279772510257887, + "learning_rate": 1.2942589782498737e-05, + "loss": 0.2824, + "step": 13480 + }, + { + "epoch": 2.3016130408807713, + "grad_norm": 0.36115760396041924, + "learning_rate": 1.2926783004552353e-05, + "loss": 0.2621, + "step": 13485 + }, + { + "epoch": 2.3024665016642487, + "grad_norm": 0.3373729497737809, + "learning_rate": 1.291097622660597e-05, + "loss": 0.2767, + "step": 13490 + }, + { + "epoch": 2.3033199624477256, + "grad_norm": 0.35983074089182177, + "learning_rate": 1.2895169448659586e-05, + "loss": 0.2633, + "step": 13495 + }, + { + "epoch": 2.3041734232312026, + "grad_norm": 0.36750791033347413, + "learning_rate": 1.2879362670713202e-05, + "loss": 0.297, + "step": 13500 + }, + { + "epoch": 2.3050268840146795, + "grad_norm": 0.37034125219454295, + "learning_rate": 1.2863555892766818e-05, + "loss": 0.2723, + "step": 13505 + }, + { + "epoch": 2.3058803447981564, + "grad_norm": 0.31177905436906944, + "learning_rate": 1.2847749114820433e-05, + "loss": 0.2711, + "step": 13510 + }, + { + "epoch": 2.3067338055816338, + "grad_norm": 0.33355398096509026, + "learning_rate": 1.2831942336874053e-05, + "loss": 0.2648, + "step": 13515 + }, + { + "epoch": 2.3075872663651107, + "grad_norm": 0.3337734453400322, + "learning_rate": 1.281613555892767e-05, + "loss": 0.2897, + "step": 13520 + }, + { + "epoch": 2.3084407271485876, + "grad_norm": 0.35520779216386267, + "learning_rate": 1.2800328780981286e-05, + "loss": 0.2554, + "step": 13525 + }, + { + "epoch": 2.3092941879320645, + "grad_norm": 0.32716894576133215, + "learning_rate": 1.2784522003034902e-05, + "loss": 0.2681, + "step": 13530 + }, + { + "epoch": 2.3101476487155415, + "grad_norm": 0.38456397670233244, + "learning_rate": 1.2768715225088517e-05, + "loss": 0.2503, + "step": 13535 + }, + { + "epoch": 2.3110011094990184, + "grad_norm": 0.354510794826201, + "learning_rate": 1.2752908447142133e-05, + "loss": 0.2566, + "step": 13540 + }, + { + "epoch": 2.3118545702824953, + "grad_norm": 0.3240400186178979, + "learning_rate": 1.2737101669195752e-05, + "loss": 0.2767, + "step": 13545 + }, + { + "epoch": 2.3127080310659727, + "grad_norm": 0.3151313415868495, + "learning_rate": 1.272129489124937e-05, + "loss": 0.2611, + "step": 13550 + }, + { + "epoch": 2.3135614918494496, + "grad_norm": 0.3645467755833886, + "learning_rate": 1.2705488113302986e-05, + "loss": 0.2738, + "step": 13555 + }, + { + "epoch": 2.3144149526329265, + "grad_norm": 0.33916988264617853, + "learning_rate": 1.2689681335356601e-05, + "loss": 0.2685, + "step": 13560 + }, + { + "epoch": 2.3152684134164034, + "grad_norm": 0.35312157453609383, + "learning_rate": 1.2673874557410217e-05, + "loss": 0.2651, + "step": 13565 + }, + { + "epoch": 2.3161218741998804, + "grad_norm": 0.3486353831660235, + "learning_rate": 1.2658067779463833e-05, + "loss": 0.2562, + "step": 13570 + }, + { + "epoch": 2.3169753349833577, + "grad_norm": 0.35167361210599335, + "learning_rate": 1.2642261001517452e-05, + "loss": 0.2624, + "step": 13575 + }, + { + "epoch": 2.3178287957668346, + "grad_norm": 0.3675526623335268, + "learning_rate": 1.262645422357107e-05, + "loss": 0.2447, + "step": 13580 + }, + { + "epoch": 2.3186822565503116, + "grad_norm": 0.3801187038171107, + "learning_rate": 1.2610647445624685e-05, + "loss": 0.2543, + "step": 13585 + }, + { + "epoch": 2.3195357173337885, + "grad_norm": 0.3260726023256078, + "learning_rate": 1.2594840667678301e-05, + "loss": 0.2593, + "step": 13590 + }, + { + "epoch": 2.3203891781172654, + "grad_norm": 0.337574532861758, + "learning_rate": 1.2579033889731917e-05, + "loss": 0.2794, + "step": 13595 + }, + { + "epoch": 2.3212426389007423, + "grad_norm": 1.7267629397768802, + "learning_rate": 1.2563227111785532e-05, + "loss": 0.2702, + "step": 13600 + }, + { + "epoch": 2.3220960996842193, + "grad_norm": 0.3955834078147852, + "learning_rate": 1.2547420333839152e-05, + "loss": 0.2465, + "step": 13605 + }, + { + "epoch": 2.3229495604676966, + "grad_norm": 0.3262499618665922, + "learning_rate": 1.2531613555892769e-05, + "loss": 0.277, + "step": 13610 + }, + { + "epoch": 2.3238030212511736, + "grad_norm": 0.35351031166159325, + "learning_rate": 1.2515806777946385e-05, + "loss": 0.2865, + "step": 13615 + }, + { + "epoch": 2.3246564820346505, + "grad_norm": 0.42411178148525197, + "learning_rate": 1.25e-05, + "loss": 0.2516, + "step": 13620 + }, + { + "epoch": 2.3255099428181274, + "grad_norm": 0.3531445964873752, + "learning_rate": 1.2484193222053616e-05, + "loss": 0.268, + "step": 13625 + }, + { + "epoch": 2.3263634036016043, + "grad_norm": 0.41456787080265933, + "learning_rate": 1.2468386444107234e-05, + "loss": 0.2745, + "step": 13630 + }, + { + "epoch": 2.3272168643850817, + "grad_norm": 0.28403448926776254, + "learning_rate": 1.245257966616085e-05, + "loss": 0.2858, + "step": 13635 + }, + { + "epoch": 2.3280703251685586, + "grad_norm": 0.32908123683743307, + "learning_rate": 1.2436772888214467e-05, + "loss": 0.2875, + "step": 13640 + }, + { + "epoch": 2.3289237859520355, + "grad_norm": 0.4612657030725096, + "learning_rate": 1.2420966110268084e-05, + "loss": 0.2496, + "step": 13645 + }, + { + "epoch": 2.3297772467355125, + "grad_norm": 0.38676606881990483, + "learning_rate": 1.24051593323217e-05, + "loss": 0.2556, + "step": 13650 + }, + { + "epoch": 2.3306307075189894, + "grad_norm": 0.3337040823652374, + "learning_rate": 1.2389352554375316e-05, + "loss": 0.2713, + "step": 13655 + }, + { + "epoch": 2.3314841683024667, + "grad_norm": 0.3898313745285282, + "learning_rate": 1.2373545776428933e-05, + "loss": 0.2587, + "step": 13660 + }, + { + "epoch": 2.3323376290859437, + "grad_norm": 0.40488411353562653, + "learning_rate": 1.235773899848255e-05, + "loss": 0.281, + "step": 13665 + }, + { + "epoch": 2.3331910898694206, + "grad_norm": 0.3341003634320264, + "learning_rate": 1.2341932220536167e-05, + "loss": 0.2711, + "step": 13670 + }, + { + "epoch": 2.3340445506528975, + "grad_norm": 0.3280264582320497, + "learning_rate": 1.2326125442589784e-05, + "loss": 0.2566, + "step": 13675 + }, + { + "epoch": 2.3348980114363744, + "grad_norm": 0.3941865741053653, + "learning_rate": 1.23103186646434e-05, + "loss": 0.2895, + "step": 13680 + }, + { + "epoch": 2.3357514722198514, + "grad_norm": 0.3545181728225511, + "learning_rate": 1.2294511886697016e-05, + "loss": 0.2442, + "step": 13685 + }, + { + "epoch": 2.3366049330033283, + "grad_norm": 0.3993830195808206, + "learning_rate": 1.2278705108750631e-05, + "loss": 0.2502, + "step": 13690 + }, + { + "epoch": 2.3374583937868056, + "grad_norm": 0.3491825617229024, + "learning_rate": 1.2262898330804249e-05, + "loss": 0.2605, + "step": 13695 + }, + { + "epoch": 2.3383118545702826, + "grad_norm": 0.3220282909490161, + "learning_rate": 1.2247091552857866e-05, + "loss": 0.2784, + "step": 13700 + }, + { + "epoch": 2.3391653153537595, + "grad_norm": 0.3062997099550015, + "learning_rate": 1.2231284774911482e-05, + "loss": 0.2562, + "step": 13705 + }, + { + "epoch": 2.3400187761372364, + "grad_norm": 0.45019869708623317, + "learning_rate": 1.22154779969651e-05, + "loss": 0.2731, + "step": 13710 + }, + { + "epoch": 2.3408722369207133, + "grad_norm": 0.3647848167556784, + "learning_rate": 1.2199671219018715e-05, + "loss": 0.2635, + "step": 13715 + }, + { + "epoch": 2.3417256977041907, + "grad_norm": 0.31568148779307076, + "learning_rate": 1.2183864441072331e-05, + "loss": 0.2543, + "step": 13720 + }, + { + "epoch": 2.3425791584876676, + "grad_norm": 0.39945924823601864, + "learning_rate": 1.2168057663125949e-05, + "loss": 0.2768, + "step": 13725 + }, + { + "epoch": 2.3434326192711445, + "grad_norm": 0.3292869322699895, + "learning_rate": 1.2152250885179566e-05, + "loss": 0.2529, + "step": 13730 + }, + { + "epoch": 2.3442860800546215, + "grad_norm": 0.35104556195991654, + "learning_rate": 1.2136444107233182e-05, + "loss": 0.2749, + "step": 13735 + }, + { + "epoch": 2.3451395408380984, + "grad_norm": 0.3458157948154376, + "learning_rate": 1.21206373292868e-05, + "loss": 0.2653, + "step": 13740 + }, + { + "epoch": 2.3459930016215753, + "grad_norm": 0.3701665159887945, + "learning_rate": 1.2104830551340415e-05, + "loss": 0.2372, + "step": 13745 + }, + { + "epoch": 2.3468464624050527, + "grad_norm": 0.3500403307761384, + "learning_rate": 1.208902377339403e-05, + "loss": 0.2522, + "step": 13750 + }, + { + "epoch": 2.3476999231885296, + "grad_norm": 0.3699869244805946, + "learning_rate": 1.2073216995447648e-05, + "loss": 0.2631, + "step": 13755 + }, + { + "epoch": 2.3485533839720065, + "grad_norm": 0.3297373602356559, + "learning_rate": 1.2057410217501266e-05, + "loss": 0.2721, + "step": 13760 + }, + { + "epoch": 2.3494068447554834, + "grad_norm": 0.36704741894542436, + "learning_rate": 1.2041603439554881e-05, + "loss": 0.2497, + "step": 13765 + }, + { + "epoch": 2.3502603055389604, + "grad_norm": 0.3752754677463977, + "learning_rate": 1.2025796661608499e-05, + "loss": 0.2519, + "step": 13770 + }, + { + "epoch": 2.3511137663224373, + "grad_norm": 0.3466693760766624, + "learning_rate": 1.2009989883662115e-05, + "loss": 0.2529, + "step": 13775 + }, + { + "epoch": 2.3519672271059147, + "grad_norm": 0.4805276702632299, + "learning_rate": 1.199418310571573e-05, + "loss": 0.2829, + "step": 13780 + }, + { + "epoch": 2.3528206878893916, + "grad_norm": 0.3959562211082513, + "learning_rate": 1.1978376327769348e-05, + "loss": 0.2693, + "step": 13785 + }, + { + "epoch": 2.3536741486728685, + "grad_norm": 0.35924528379883774, + "learning_rate": 1.1962569549822965e-05, + "loss": 0.265, + "step": 13790 + }, + { + "epoch": 2.3545276094563454, + "grad_norm": 0.2774038864591207, + "learning_rate": 1.1946762771876581e-05, + "loss": 0.2575, + "step": 13795 + }, + { + "epoch": 2.3553810702398223, + "grad_norm": 0.33781070283823095, + "learning_rate": 1.1930955993930199e-05, + "loss": 0.2841, + "step": 13800 + }, + { + "epoch": 2.3562345310232997, + "grad_norm": 0.35215720977487114, + "learning_rate": 1.1915149215983814e-05, + "loss": 0.2337, + "step": 13805 + }, + { + "epoch": 2.3570879918067766, + "grad_norm": 0.3380343209709989, + "learning_rate": 1.189934243803743e-05, + "loss": 0.2493, + "step": 13810 + }, + { + "epoch": 2.3579414525902536, + "grad_norm": 0.5667691024279196, + "learning_rate": 1.1883535660091048e-05, + "loss": 0.2467, + "step": 13815 + }, + { + "epoch": 2.3587949133737305, + "grad_norm": 0.35442986611563343, + "learning_rate": 1.1867728882144665e-05, + "loss": 0.2622, + "step": 13820 + }, + { + "epoch": 2.3596483741572074, + "grad_norm": 0.4573056954650897, + "learning_rate": 1.185192210419828e-05, + "loss": 0.2958, + "step": 13825 + }, + { + "epoch": 2.3605018349406843, + "grad_norm": 0.3464846363619561, + "learning_rate": 1.1836115326251897e-05, + "loss": 0.2557, + "step": 13830 + }, + { + "epoch": 2.3613552957241613, + "grad_norm": 0.31107449399274584, + "learning_rate": 1.1820308548305514e-05, + "loss": 0.2569, + "step": 13835 + }, + { + "epoch": 2.3622087565076386, + "grad_norm": 0.31223474529197637, + "learning_rate": 1.180450177035913e-05, + "loss": 0.2636, + "step": 13840 + }, + { + "epoch": 2.3630622172911155, + "grad_norm": 0.3423766402813196, + "learning_rate": 1.1788694992412747e-05, + "loss": 0.254, + "step": 13845 + }, + { + "epoch": 2.3639156780745925, + "grad_norm": 0.4642496449605801, + "learning_rate": 1.1772888214466365e-05, + "loss": 0.2397, + "step": 13850 + }, + { + "epoch": 2.3647691388580694, + "grad_norm": 0.34953469626709727, + "learning_rate": 1.175708143651998e-05, + "loss": 0.2986, + "step": 13855 + }, + { + "epoch": 2.3656225996415463, + "grad_norm": 0.35708543589868114, + "learning_rate": 1.1741274658573596e-05, + "loss": 0.2582, + "step": 13860 + }, + { + "epoch": 2.3664760604250237, + "grad_norm": 0.3698635896251769, + "learning_rate": 1.1725467880627214e-05, + "loss": 0.2721, + "step": 13865 + }, + { + "epoch": 2.3673295212085006, + "grad_norm": 0.3302748261797436, + "learning_rate": 1.170966110268083e-05, + "loss": 0.2894, + "step": 13870 + }, + { + "epoch": 2.3681829819919775, + "grad_norm": 0.31393263378581593, + "learning_rate": 1.1693854324734447e-05, + "loss": 0.2562, + "step": 13875 + }, + { + "epoch": 2.3690364427754544, + "grad_norm": 0.5538836388276289, + "learning_rate": 1.1678047546788064e-05, + "loss": 0.2488, + "step": 13880 + }, + { + "epoch": 2.3698899035589314, + "grad_norm": 0.34646042875522115, + "learning_rate": 1.166224076884168e-05, + "loss": 0.258, + "step": 13885 + }, + { + "epoch": 2.3707433643424083, + "grad_norm": 0.3567861355385066, + "learning_rate": 1.1646433990895296e-05, + "loss": 0.2684, + "step": 13890 + }, + { + "epoch": 2.3715968251258857, + "grad_norm": 0.3622011831997506, + "learning_rate": 1.1630627212948913e-05, + "loss": 0.279, + "step": 13895 + }, + { + "epoch": 2.3724502859093626, + "grad_norm": 0.3329058907180884, + "learning_rate": 1.1614820435002529e-05, + "loss": 0.2448, + "step": 13900 + }, + { + "epoch": 2.3733037466928395, + "grad_norm": 0.47032289909689734, + "learning_rate": 1.1599013657056147e-05, + "loss": 0.2876, + "step": 13905 + }, + { + "epoch": 2.3741572074763164, + "grad_norm": 0.3102544235708446, + "learning_rate": 1.1583206879109764e-05, + "loss": 0.2461, + "step": 13910 + }, + { + "epoch": 2.3750106682597933, + "grad_norm": 0.31929100036653174, + "learning_rate": 1.156740010116338e-05, + "loss": 0.2419, + "step": 13915 + }, + { + "epoch": 2.3758641290432703, + "grad_norm": 0.35223857287658794, + "learning_rate": 1.1551593323216996e-05, + "loss": 0.2897, + "step": 13920 + }, + { + "epoch": 2.3767175898267476, + "grad_norm": 0.31929075368848847, + "learning_rate": 1.1535786545270613e-05, + "loss": 0.2653, + "step": 13925 + }, + { + "epoch": 2.3775710506102246, + "grad_norm": 0.5705191250133481, + "learning_rate": 1.1519979767324229e-05, + "loss": 0.2505, + "step": 13930 + }, + { + "epoch": 2.3784245113937015, + "grad_norm": 0.32864593989844176, + "learning_rate": 1.1504172989377846e-05, + "loss": 0.2477, + "step": 13935 + }, + { + "epoch": 2.3792779721771784, + "grad_norm": 0.3760333483534563, + "learning_rate": 1.1488366211431464e-05, + "loss": 0.242, + "step": 13940 + }, + { + "epoch": 2.3801314329606553, + "grad_norm": 0.30221859215315, + "learning_rate": 1.147255943348508e-05, + "loss": 0.2675, + "step": 13945 + }, + { + "epoch": 2.3809848937441327, + "grad_norm": 0.3094129613854578, + "learning_rate": 1.1456752655538695e-05, + "loss": 0.2855, + "step": 13950 + }, + { + "epoch": 2.3818383545276096, + "grad_norm": 0.3118239798941746, + "learning_rate": 1.1440945877592311e-05, + "loss": 0.2509, + "step": 13955 + }, + { + "epoch": 2.3826918153110865, + "grad_norm": 0.33807031265894355, + "learning_rate": 1.1425139099645928e-05, + "loss": 0.2519, + "step": 13960 + }, + { + "epoch": 2.3835452760945635, + "grad_norm": 0.3589548365905337, + "learning_rate": 1.1409332321699546e-05, + "loss": 0.2626, + "step": 13965 + }, + { + "epoch": 2.3843987368780404, + "grad_norm": 0.3601196386452014, + "learning_rate": 1.1393525543753162e-05, + "loss": 0.2368, + "step": 13970 + }, + { + "epoch": 2.3852521976615173, + "grad_norm": 0.37435014821697704, + "learning_rate": 1.1377718765806779e-05, + "loss": 0.2737, + "step": 13975 + }, + { + "epoch": 2.3861056584449942, + "grad_norm": 0.39186184407312485, + "learning_rate": 1.1361911987860395e-05, + "loss": 0.2549, + "step": 13980 + }, + { + "epoch": 2.3869591192284716, + "grad_norm": 0.3480509116803668, + "learning_rate": 1.134610520991401e-05, + "loss": 0.2747, + "step": 13985 + }, + { + "epoch": 2.3878125800119485, + "grad_norm": 0.3108613164406919, + "learning_rate": 1.1330298431967628e-05, + "loss": 0.2721, + "step": 13990 + }, + { + "epoch": 2.3886660407954254, + "grad_norm": 0.33249119855304893, + "learning_rate": 1.1314491654021246e-05, + "loss": 0.2617, + "step": 13995 + }, + { + "epoch": 2.3895195015789024, + "grad_norm": 2.6530979762285525, + "learning_rate": 1.1298684876074861e-05, + "loss": 0.301, + "step": 14000 + }, + { + "epoch": 2.3903729623623793, + "grad_norm": 0.658749679558768, + "learning_rate": 1.1282878098128479e-05, + "loss": 0.2676, + "step": 14005 + }, + { + "epoch": 2.3912264231458566, + "grad_norm": 0.36243360222353593, + "learning_rate": 1.1267071320182095e-05, + "loss": 0.2522, + "step": 14010 + }, + { + "epoch": 2.3920798839293336, + "grad_norm": 0.3763675627463445, + "learning_rate": 1.125126454223571e-05, + "loss": 0.2512, + "step": 14015 + }, + { + "epoch": 2.3929333447128105, + "grad_norm": 0.4142890979332789, + "learning_rate": 1.1235457764289328e-05, + "loss": 0.2677, + "step": 14020 + }, + { + "epoch": 2.3937868054962874, + "grad_norm": 0.3439817539904734, + "learning_rate": 1.1219650986342945e-05, + "loss": 0.2736, + "step": 14025 + }, + { + "epoch": 2.3946402662797643, + "grad_norm": 1.8155640993639846, + "learning_rate": 1.1203844208396561e-05, + "loss": 0.2408, + "step": 14030 + }, + { + "epoch": 2.3954937270632413, + "grad_norm": 0.32729231033143624, + "learning_rate": 1.1188037430450178e-05, + "loss": 0.2655, + "step": 14035 + }, + { + "epoch": 2.3963471878467186, + "grad_norm": 0.3273285197702505, + "learning_rate": 1.1172230652503794e-05, + "loss": 0.26, + "step": 14040 + }, + { + "epoch": 2.3972006486301956, + "grad_norm": 0.30793617073489615, + "learning_rate": 1.115642387455741e-05, + "loss": 0.2903, + "step": 14045 + }, + { + "epoch": 2.3980541094136725, + "grad_norm": 0.36727247145711506, + "learning_rate": 1.1140617096611027e-05, + "loss": 0.2849, + "step": 14050 + }, + { + "epoch": 2.3989075701971494, + "grad_norm": 0.4386468565369713, + "learning_rate": 1.1124810318664645e-05, + "loss": 0.2312, + "step": 14055 + }, + { + "epoch": 2.3997610309806263, + "grad_norm": 0.3906762379869074, + "learning_rate": 1.110900354071826e-05, + "loss": 0.2691, + "step": 14060 + }, + { + "epoch": 2.4006144917641032, + "grad_norm": 0.38600382005800393, + "learning_rate": 1.1093196762771878e-05, + "loss": 0.2972, + "step": 14065 + }, + { + "epoch": 2.4014679525475806, + "grad_norm": 0.3425366869475592, + "learning_rate": 1.1077389984825494e-05, + "loss": 0.2817, + "step": 14070 + }, + { + "epoch": 2.4023214133310575, + "grad_norm": 0.370235777382759, + "learning_rate": 1.106158320687911e-05, + "loss": 0.2596, + "step": 14075 + }, + { + "epoch": 2.4031748741145345, + "grad_norm": 0.30431484417216215, + "learning_rate": 1.1045776428932727e-05, + "loss": 0.2554, + "step": 14080 + }, + { + "epoch": 2.4040283348980114, + "grad_norm": 0.35170746169540634, + "learning_rate": 1.1029969650986345e-05, + "loss": 0.273, + "step": 14085 + }, + { + "epoch": 2.4048817956814883, + "grad_norm": 0.3469725693929598, + "learning_rate": 1.101416287303996e-05, + "loss": 0.2758, + "step": 14090 + }, + { + "epoch": 2.4057352564649657, + "grad_norm": 0.37712041067128005, + "learning_rate": 1.0998356095093576e-05, + "loss": 0.2824, + "step": 14095 + }, + { + "epoch": 2.4065887172484426, + "grad_norm": 0.3538335750117501, + "learning_rate": 1.0982549317147194e-05, + "loss": 0.2575, + "step": 14100 + }, + { + "epoch": 2.4074421780319195, + "grad_norm": 0.3320536986234714, + "learning_rate": 1.096674253920081e-05, + "loss": 0.2782, + "step": 14105 + }, + { + "epoch": 2.4082956388153964, + "grad_norm": 0.47800365739823614, + "learning_rate": 1.0950935761254425e-05, + "loss": 0.2607, + "step": 14110 + }, + { + "epoch": 2.4091490995988734, + "grad_norm": 0.3550919226992691, + "learning_rate": 1.0935128983308044e-05, + "loss": 0.2737, + "step": 14115 + }, + { + "epoch": 2.4100025603823503, + "grad_norm": 0.319973289046971, + "learning_rate": 1.091932220536166e-05, + "loss": 0.2918, + "step": 14120 + }, + { + "epoch": 2.410856021165827, + "grad_norm": 0.6546241723107031, + "learning_rate": 1.0903515427415276e-05, + "loss": 0.2589, + "step": 14125 + }, + { + "epoch": 2.4117094819493046, + "grad_norm": 0.34912701414952313, + "learning_rate": 1.0887708649468893e-05, + "loss": 0.2749, + "step": 14130 + }, + { + "epoch": 2.4125629427327815, + "grad_norm": 0.31161437564403516, + "learning_rate": 1.0871901871522509e-05, + "loss": 0.2402, + "step": 14135 + }, + { + "epoch": 2.4134164035162584, + "grad_norm": 0.3596705941470503, + "learning_rate": 1.0856095093576125e-05, + "loss": 0.2608, + "step": 14140 + }, + { + "epoch": 2.4142698642997353, + "grad_norm": 0.38807872345735817, + "learning_rate": 1.0840288315629742e-05, + "loss": 0.2654, + "step": 14145 + }, + { + "epoch": 2.4151233250832123, + "grad_norm": 0.3543470511620338, + "learning_rate": 1.082448153768336e-05, + "loss": 0.2535, + "step": 14150 + }, + { + "epoch": 2.4159767858666896, + "grad_norm": 0.30110137348341887, + "learning_rate": 1.0808674759736975e-05, + "loss": 0.2792, + "step": 14155 + }, + { + "epoch": 2.4168302466501665, + "grad_norm": 0.34315261887349163, + "learning_rate": 1.0792867981790593e-05, + "loss": 0.2511, + "step": 14160 + }, + { + "epoch": 2.4176837074336435, + "grad_norm": 0.35974940683996065, + "learning_rate": 1.0777061203844209e-05, + "loss": 0.2568, + "step": 14165 + }, + { + "epoch": 2.4185371682171204, + "grad_norm": 0.3416035040366801, + "learning_rate": 1.0761254425897824e-05, + "loss": 0.2677, + "step": 14170 + }, + { + "epoch": 2.4193906290005973, + "grad_norm": 0.40959594994649545, + "learning_rate": 1.0745447647951442e-05, + "loss": 0.2619, + "step": 14175 + }, + { + "epoch": 2.4202440897840742, + "grad_norm": 0.34899014164854364, + "learning_rate": 1.072964087000506e-05, + "loss": 0.2788, + "step": 14180 + }, + { + "epoch": 2.4210975505675516, + "grad_norm": 0.34914763029264057, + "learning_rate": 1.0713834092058675e-05, + "loss": 0.263, + "step": 14185 + }, + { + "epoch": 2.4219510113510285, + "grad_norm": 0.38033095181588245, + "learning_rate": 1.0698027314112293e-05, + "loss": 0.2658, + "step": 14190 + }, + { + "epoch": 2.4228044721345054, + "grad_norm": 0.3394889343943498, + "learning_rate": 1.0682220536165908e-05, + "loss": 0.2848, + "step": 14195 + }, + { + "epoch": 2.4236579329179824, + "grad_norm": 0.3760457990875017, + "learning_rate": 1.0666413758219524e-05, + "loss": 0.2537, + "step": 14200 + }, + { + "epoch": 2.4245113937014593, + "grad_norm": 0.3380987792591969, + "learning_rate": 1.0650606980273141e-05, + "loss": 0.2644, + "step": 14205 + }, + { + "epoch": 2.425364854484936, + "grad_norm": 0.318549850666751, + "learning_rate": 1.0634800202326759e-05, + "loss": 0.2574, + "step": 14210 + }, + { + "epoch": 2.4262183152684136, + "grad_norm": 0.31342348190936237, + "learning_rate": 1.0618993424380375e-05, + "loss": 0.2642, + "step": 14215 + }, + { + "epoch": 2.4270717760518905, + "grad_norm": 0.3462677540696492, + "learning_rate": 1.060318664643399e-05, + "loss": 0.2537, + "step": 14220 + }, + { + "epoch": 2.4279252368353674, + "grad_norm": 0.3826866159391629, + "learning_rate": 1.0587379868487608e-05, + "loss": 0.2742, + "step": 14225 + }, + { + "epoch": 2.4287786976188444, + "grad_norm": 0.3263531749919483, + "learning_rate": 1.0571573090541224e-05, + "loss": 0.2882, + "step": 14230 + }, + { + "epoch": 2.4296321584023213, + "grad_norm": 0.28544342560666724, + "learning_rate": 1.0555766312594841e-05, + "loss": 0.2674, + "step": 14235 + }, + { + "epoch": 2.4304856191857986, + "grad_norm": 0.3396665329122308, + "learning_rate": 1.0539959534648459e-05, + "loss": 0.2787, + "step": 14240 + }, + { + "epoch": 2.4313390799692756, + "grad_norm": 0.3399252000182434, + "learning_rate": 1.0524152756702074e-05, + "loss": 0.2667, + "step": 14245 + }, + { + "epoch": 2.4321925407527525, + "grad_norm": 0.3247161010293173, + "learning_rate": 1.050834597875569e-05, + "loss": 0.2591, + "step": 14250 + }, + { + "epoch": 2.4330460015362294, + "grad_norm": 0.36080600895528747, + "learning_rate": 1.0492539200809308e-05, + "loss": 0.258, + "step": 14255 + }, + { + "epoch": 2.4338994623197063, + "grad_norm": 0.3464012982629838, + "learning_rate": 1.0476732422862923e-05, + "loss": 0.2562, + "step": 14260 + }, + { + "epoch": 2.4347529231031833, + "grad_norm": 0.328033884048764, + "learning_rate": 1.046092564491654e-05, + "loss": 0.2595, + "step": 14265 + }, + { + "epoch": 2.43560638388666, + "grad_norm": 0.34044030311146867, + "learning_rate": 1.0445118866970158e-05, + "loss": 0.2701, + "step": 14270 + }, + { + "epoch": 2.4364598446701375, + "grad_norm": 0.3787577646980057, + "learning_rate": 1.0429312089023774e-05, + "loss": 0.2757, + "step": 14275 + }, + { + "epoch": 2.4373133054536145, + "grad_norm": 0.4621132454628748, + "learning_rate": 1.041350531107739e-05, + "loss": 0.2706, + "step": 14280 + }, + { + "epoch": 2.4381667662370914, + "grad_norm": 0.3699920523962295, + "learning_rate": 1.0397698533131007e-05, + "loss": 0.2467, + "step": 14285 + }, + { + "epoch": 2.4390202270205683, + "grad_norm": 0.37567265417832557, + "learning_rate": 1.0381891755184623e-05, + "loss": 0.2841, + "step": 14290 + }, + { + "epoch": 2.4398736878040452, + "grad_norm": 0.31674098891327795, + "learning_rate": 1.036608497723824e-05, + "loss": 0.2765, + "step": 14295 + }, + { + "epoch": 2.4407271485875226, + "grad_norm": 0.3683870689428794, + "learning_rate": 1.0350278199291858e-05, + "loss": 0.2566, + "step": 14300 + }, + { + "epoch": 2.4415806093709995, + "grad_norm": 0.3312211455701073, + "learning_rate": 1.0334471421345474e-05, + "loss": 0.2854, + "step": 14305 + }, + { + "epoch": 2.4424340701544764, + "grad_norm": 0.3047702631079437, + "learning_rate": 1.031866464339909e-05, + "loss": 0.2713, + "step": 14310 + }, + { + "epoch": 2.4432875309379534, + "grad_norm": 0.29809655107369376, + "learning_rate": 1.0302857865452707e-05, + "loss": 0.2606, + "step": 14315 + }, + { + "epoch": 2.4441409917214303, + "grad_norm": 0.3742642086400778, + "learning_rate": 1.0287051087506323e-05, + "loss": 0.2688, + "step": 14320 + }, + { + "epoch": 2.444994452504907, + "grad_norm": 0.3722878341115334, + "learning_rate": 1.027124430955994e-05, + "loss": 0.2607, + "step": 14325 + }, + { + "epoch": 2.4458479132883846, + "grad_norm": 0.31636039883162703, + "learning_rate": 1.0255437531613558e-05, + "loss": 0.2713, + "step": 14330 + }, + { + "epoch": 2.4467013740718615, + "grad_norm": 0.3528678978404139, + "learning_rate": 1.0239630753667173e-05, + "loss": 0.269, + "step": 14335 + }, + { + "epoch": 2.4475548348553384, + "grad_norm": 0.33220518954122014, + "learning_rate": 1.0223823975720789e-05, + "loss": 0.2773, + "step": 14340 + }, + { + "epoch": 2.4484082956388153, + "grad_norm": 0.3433486949077163, + "learning_rate": 1.0208017197774405e-05, + "loss": 0.224, + "step": 14345 + }, + { + "epoch": 2.4492617564222923, + "grad_norm": 0.31614126849766067, + "learning_rate": 1.0192210419828022e-05, + "loss": 0.269, + "step": 14350 + }, + { + "epoch": 2.450115217205769, + "grad_norm": 0.3171829857680147, + "learning_rate": 1.017640364188164e-05, + "loss": 0.284, + "step": 14355 + }, + { + "epoch": 2.4509686779892466, + "grad_norm": 0.36265633846142, + "learning_rate": 1.0160596863935256e-05, + "loss": 0.2498, + "step": 14360 + }, + { + "epoch": 2.4518221387727235, + "grad_norm": 0.3282258036126791, + "learning_rate": 1.0144790085988873e-05, + "loss": 0.2704, + "step": 14365 + }, + { + "epoch": 2.4526755995562004, + "grad_norm": 0.35214370660567523, + "learning_rate": 1.0128983308042489e-05, + "loss": 0.2869, + "step": 14370 + }, + { + "epoch": 2.4535290603396773, + "grad_norm": 0.3363280362651704, + "learning_rate": 1.0113176530096105e-05, + "loss": 0.2637, + "step": 14375 + }, + { + "epoch": 2.4543825211231542, + "grad_norm": 0.39213689524643197, + "learning_rate": 1.0097369752149722e-05, + "loss": 0.2701, + "step": 14380 + }, + { + "epoch": 2.4552359819066316, + "grad_norm": 0.343905126836761, + "learning_rate": 1.008156297420334e-05, + "loss": 0.253, + "step": 14385 + }, + { + "epoch": 2.4560894426901085, + "grad_norm": 0.34344106048561895, + "learning_rate": 1.0065756196256955e-05, + "loss": 0.258, + "step": 14390 + }, + { + "epoch": 2.4569429034735855, + "grad_norm": 0.29586278047989606, + "learning_rate": 1.0049949418310573e-05, + "loss": 0.273, + "step": 14395 + }, + { + "epoch": 2.4577963642570624, + "grad_norm": 0.4027576698874579, + "learning_rate": 1.0034142640364188e-05, + "loss": 0.2648, + "step": 14400 + }, + { + "epoch": 2.4586498250405393, + "grad_norm": 0.36593665379768425, + "learning_rate": 1.0018335862417804e-05, + "loss": 0.2557, + "step": 14405 + }, + { + "epoch": 2.4595032858240162, + "grad_norm": 0.3598055453128078, + "learning_rate": 1.0002529084471422e-05, + "loss": 0.2439, + "step": 14410 + }, + { + "epoch": 2.460356746607493, + "grad_norm": 0.38253852146152434, + "learning_rate": 9.986722306525039e-06, + "loss": 0.2587, + "step": 14415 + }, + { + "epoch": 2.4612102073909705, + "grad_norm": 0.30682301559292546, + "learning_rate": 9.970915528578655e-06, + "loss": 0.255, + "step": 14420 + }, + { + "epoch": 2.4620636681744474, + "grad_norm": 0.346241220452941, + "learning_rate": 9.955108750632272e-06, + "loss": 0.2609, + "step": 14425 + }, + { + "epoch": 2.4629171289579244, + "grad_norm": 0.39098463135896994, + "learning_rate": 9.939301972685888e-06, + "loss": 0.2614, + "step": 14430 + }, + { + "epoch": 2.4637705897414013, + "grad_norm": 0.36311594138575676, + "learning_rate": 9.923495194739504e-06, + "loss": 0.2663, + "step": 14435 + }, + { + "epoch": 2.464624050524878, + "grad_norm": 0.3694187382426125, + "learning_rate": 9.907688416793121e-06, + "loss": 0.2725, + "step": 14440 + }, + { + "epoch": 2.4654775113083556, + "grad_norm": 0.3181368980124899, + "learning_rate": 9.891881638846739e-06, + "loss": 0.273, + "step": 14445 + }, + { + "epoch": 2.4663309720918325, + "grad_norm": 0.3001019744638354, + "learning_rate": 9.876074860900355e-06, + "loss": 0.2496, + "step": 14450 + }, + { + "epoch": 2.4671844328753094, + "grad_norm": 0.31428730167056335, + "learning_rate": 9.860268082953972e-06, + "loss": 0.2554, + "step": 14455 + }, + { + "epoch": 2.4680378936587863, + "grad_norm": 0.3712724482982424, + "learning_rate": 9.844461305007588e-06, + "loss": 0.2833, + "step": 14460 + }, + { + "epoch": 2.4688913544422633, + "grad_norm": 0.44433449011826265, + "learning_rate": 9.828654527061204e-06, + "loss": 0.2721, + "step": 14465 + }, + { + "epoch": 2.46974481522574, + "grad_norm": 0.32849313014573395, + "learning_rate": 9.812847749114821e-06, + "loss": 0.2854, + "step": 14470 + }, + { + "epoch": 2.4705982760092176, + "grad_norm": 0.3440790071717394, + "learning_rate": 9.797040971168438e-06, + "loss": 0.2573, + "step": 14475 + }, + { + "epoch": 2.4714517367926945, + "grad_norm": 0.3587060980032648, + "learning_rate": 9.781234193222054e-06, + "loss": 0.2358, + "step": 14480 + }, + { + "epoch": 2.4723051975761714, + "grad_norm": 0.3348288455825785, + "learning_rate": 9.76542741527567e-06, + "loss": 0.2786, + "step": 14485 + }, + { + "epoch": 2.4731586583596483, + "grad_norm": 0.34934828195955436, + "learning_rate": 9.749620637329287e-06, + "loss": 0.2677, + "step": 14490 + }, + { + "epoch": 2.4740121191431252, + "grad_norm": 0.8305033574248635, + "learning_rate": 9.733813859382903e-06, + "loss": 0.2619, + "step": 14495 + }, + { + "epoch": 2.474865579926602, + "grad_norm": 0.3580936459587481, + "learning_rate": 9.71800708143652e-06, + "loss": 0.2549, + "step": 14500 + }, + { + "epoch": 2.4757190407100795, + "grad_norm": 0.31674144817736993, + "learning_rate": 9.702200303490138e-06, + "loss": 0.2696, + "step": 14505 + }, + { + "epoch": 2.4765725014935565, + "grad_norm": 0.33296233907632666, + "learning_rate": 9.686393525543754e-06, + "loss": 0.277, + "step": 14510 + }, + { + "epoch": 2.4774259622770334, + "grad_norm": 0.3625925072432881, + "learning_rate": 9.67058674759737e-06, + "loss": 0.2552, + "step": 14515 + }, + { + "epoch": 2.4782794230605103, + "grad_norm": 0.3052866481204275, + "learning_rate": 9.654779969650987e-06, + "loss": 0.2745, + "step": 14520 + }, + { + "epoch": 2.4791328838439872, + "grad_norm": 0.3436194097126335, + "learning_rate": 9.638973191704603e-06, + "loss": 0.2821, + "step": 14525 + }, + { + "epoch": 2.4799863446274646, + "grad_norm": 0.3954215609284519, + "learning_rate": 9.623166413758219e-06, + "loss": 0.2558, + "step": 14530 + }, + { + "epoch": 2.4808398054109415, + "grad_norm": 0.4587324591471796, + "learning_rate": 9.607359635811838e-06, + "loss": 0.2469, + "step": 14535 + }, + { + "epoch": 2.4816932661944184, + "grad_norm": 0.3269705637376026, + "learning_rate": 9.591552857865454e-06, + "loss": 0.2672, + "step": 14540 + }, + { + "epoch": 2.4825467269778954, + "grad_norm": 0.3725948745994985, + "learning_rate": 9.57574607991907e-06, + "loss": 0.2642, + "step": 14545 + }, + { + "epoch": 2.4834001877613723, + "grad_norm": 0.4251797756192436, + "learning_rate": 9.559939301972687e-06, + "loss": 0.2442, + "step": 14550 + }, + { + "epoch": 2.484253648544849, + "grad_norm": 0.35811671563058445, + "learning_rate": 9.544132524026303e-06, + "loss": 0.3074, + "step": 14555 + }, + { + "epoch": 2.485107109328326, + "grad_norm": 0.3747974354254869, + "learning_rate": 9.528325746079918e-06, + "loss": 0.2693, + "step": 14560 + }, + { + "epoch": 2.4859605701118035, + "grad_norm": 0.33567451968078693, + "learning_rate": 9.512518968133537e-06, + "loss": 0.2384, + "step": 14565 + }, + { + "epoch": 2.4868140308952804, + "grad_norm": 0.3189977713488994, + "learning_rate": 9.496712190187153e-06, + "loss": 0.2626, + "step": 14570 + }, + { + "epoch": 2.4876674916787573, + "grad_norm": 0.3489836368302477, + "learning_rate": 9.480905412240769e-06, + "loss": 0.2523, + "step": 14575 + }, + { + "epoch": 2.4885209524622343, + "grad_norm": 0.3613479235819589, + "learning_rate": 9.465098634294386e-06, + "loss": 0.2729, + "step": 14580 + }, + { + "epoch": 2.489374413245711, + "grad_norm": 0.40565449622708155, + "learning_rate": 9.449291856348002e-06, + "loss": 0.2444, + "step": 14585 + }, + { + "epoch": 2.4902278740291885, + "grad_norm": 0.3680615304096154, + "learning_rate": 9.433485078401618e-06, + "loss": 0.2566, + "step": 14590 + }, + { + "epoch": 2.4910813348126655, + "grad_norm": 0.3128924900072794, + "learning_rate": 9.417678300455237e-06, + "loss": 0.2318, + "step": 14595 + }, + { + "epoch": 2.4919347955961424, + "grad_norm": 0.3186196091002627, + "learning_rate": 9.401871522508853e-06, + "loss": 0.2822, + "step": 14600 + }, + { + "epoch": 2.4927882563796193, + "grad_norm": 0.3287136069133009, + "learning_rate": 9.386064744562469e-06, + "loss": 0.2533, + "step": 14605 + }, + { + "epoch": 2.4936417171630962, + "grad_norm": 0.4396657971919846, + "learning_rate": 9.370257966616084e-06, + "loss": 0.2827, + "step": 14610 + }, + { + "epoch": 2.494495177946573, + "grad_norm": 0.3124085405558122, + "learning_rate": 9.354451188669702e-06, + "loss": 0.2409, + "step": 14615 + }, + { + "epoch": 2.4953486387300505, + "grad_norm": 0.36168954024164707, + "learning_rate": 9.338644410723318e-06, + "loss": 0.2791, + "step": 14620 + }, + { + "epoch": 2.4962020995135275, + "grad_norm": 0.3471248448213307, + "learning_rate": 9.322837632776935e-06, + "loss": 0.278, + "step": 14625 + }, + { + "epoch": 2.4970555602970044, + "grad_norm": 0.3520183299863951, + "learning_rate": 9.307030854830553e-06, + "loss": 0.2674, + "step": 14630 + }, + { + "epoch": 2.4979090210804813, + "grad_norm": 0.326191389335285, + "learning_rate": 9.291224076884168e-06, + "loss": 0.2542, + "step": 14635 + }, + { + "epoch": 2.498762481863958, + "grad_norm": 0.3220048844992483, + "learning_rate": 9.275417298937784e-06, + "loss": 0.2532, + "step": 14640 + }, + { + "epoch": 2.499615942647435, + "grad_norm": 0.33818063700540074, + "learning_rate": 9.259610520991402e-06, + "loss": 0.279, + "step": 14645 + }, + { + "epoch": 2.5004694034309125, + "grad_norm": 0.33944084356888454, + "learning_rate": 9.243803743045017e-06, + "loss": 0.2603, + "step": 14650 + }, + { + "epoch": 2.5013228642143894, + "grad_norm": 0.7220901229701975, + "learning_rate": 9.227996965098635e-06, + "loss": 0.2567, + "step": 14655 + }, + { + "epoch": 2.5021763249978664, + "grad_norm": 0.3115755105531366, + "learning_rate": 9.212190187152252e-06, + "loss": 0.2495, + "step": 14660 + }, + { + "epoch": 2.5030297857813433, + "grad_norm": 0.3498764937461434, + "learning_rate": 9.196383409205868e-06, + "loss": 0.2939, + "step": 14665 + }, + { + "epoch": 2.50388324656482, + "grad_norm": 0.4011131591215743, + "learning_rate": 9.180576631259484e-06, + "loss": 0.2477, + "step": 14670 + }, + { + "epoch": 2.5047367073482976, + "grad_norm": 0.3447402466363007, + "learning_rate": 9.164769853313101e-06, + "loss": 0.2782, + "step": 14675 + }, + { + "epoch": 2.5055901681317745, + "grad_norm": 0.3408437723228539, + "learning_rate": 9.148963075366717e-06, + "loss": 0.2883, + "step": 14680 + }, + { + "epoch": 2.5064436289152514, + "grad_norm": 0.3598563871864662, + "learning_rate": 9.133156297420334e-06, + "loss": 0.2661, + "step": 14685 + }, + { + "epoch": 2.5072970896987283, + "grad_norm": 0.3524261962338911, + "learning_rate": 9.117349519473952e-06, + "loss": 0.2299, + "step": 14690 + }, + { + "epoch": 2.5081505504822053, + "grad_norm": 0.35548469805642413, + "learning_rate": 9.101542741527568e-06, + "loss": 0.2725, + "step": 14695 + }, + { + "epoch": 2.509004011265682, + "grad_norm": 0.312906336303122, + "learning_rate": 9.085735963581183e-06, + "loss": 0.26, + "step": 14700 + }, + { + "epoch": 2.509857472049159, + "grad_norm": 0.3389670840404128, + "learning_rate": 9.069929185634801e-06, + "loss": 0.2735, + "step": 14705 + }, + { + "epoch": 2.5107109328326365, + "grad_norm": 0.3364038657527453, + "learning_rate": 9.054122407688417e-06, + "loss": 0.27, + "step": 14710 + }, + { + "epoch": 2.5115643936161134, + "grad_norm": 0.32104461280395497, + "learning_rate": 9.038315629742034e-06, + "loss": 0.2838, + "step": 14715 + }, + { + "epoch": 2.5124178543995903, + "grad_norm": 0.4144744341346533, + "learning_rate": 9.022508851795652e-06, + "loss": 0.2545, + "step": 14720 + }, + { + "epoch": 2.5132713151830672, + "grad_norm": 0.3319490971303601, + "learning_rate": 9.006702073849267e-06, + "loss": 0.2625, + "step": 14725 + }, + { + "epoch": 2.514124775966544, + "grad_norm": 0.37661552718672137, + "learning_rate": 8.990895295902883e-06, + "loss": 0.2497, + "step": 14730 + }, + { + "epoch": 2.5149782367500215, + "grad_norm": 0.32876186506762023, + "learning_rate": 8.9750885179565e-06, + "loss": 0.2433, + "step": 14735 + }, + { + "epoch": 2.5158316975334984, + "grad_norm": 0.43033153828122256, + "learning_rate": 8.959281740010116e-06, + "loss": 0.2522, + "step": 14740 + }, + { + "epoch": 2.5166851583169754, + "grad_norm": 0.3640377089384684, + "learning_rate": 8.943474962063734e-06, + "loss": 0.2555, + "step": 14745 + }, + { + "epoch": 2.5175386191004523, + "grad_norm": 0.31362572989653836, + "learning_rate": 8.92766818411735e-06, + "loss": 0.2837, + "step": 14750 + }, + { + "epoch": 2.518392079883929, + "grad_norm": 0.40458398017180186, + "learning_rate": 8.911861406170967e-06, + "loss": 0.2629, + "step": 14755 + }, + { + "epoch": 2.5192455406674066, + "grad_norm": 0.3248341879661295, + "learning_rate": 8.896054628224583e-06, + "loss": 0.2743, + "step": 14760 + }, + { + "epoch": 2.520099001450883, + "grad_norm": 0.3578432284103998, + "learning_rate": 8.880247850278198e-06, + "loss": 0.2507, + "step": 14765 + }, + { + "epoch": 2.5209524622343604, + "grad_norm": 0.35467076421052923, + "learning_rate": 8.864441072331816e-06, + "loss": 0.2837, + "step": 14770 + }, + { + "epoch": 2.5218059230178373, + "grad_norm": 0.4006464151945454, + "learning_rate": 8.848634294385433e-06, + "loss": 0.2741, + "step": 14775 + }, + { + "epoch": 2.5226593838013143, + "grad_norm": 0.29135652611912566, + "learning_rate": 8.83282751643905e-06, + "loss": 0.2738, + "step": 14780 + }, + { + "epoch": 2.523512844584791, + "grad_norm": 0.3323270796757813, + "learning_rate": 8.817020738492667e-06, + "loss": 0.2684, + "step": 14785 + }, + { + "epoch": 2.524366305368268, + "grad_norm": 0.3291607431817088, + "learning_rate": 8.801213960546282e-06, + "loss": 0.2821, + "step": 14790 + }, + { + "epoch": 2.5252197661517455, + "grad_norm": 0.32039698349844725, + "learning_rate": 8.785407182599898e-06, + "loss": 0.2633, + "step": 14795 + }, + { + "epoch": 2.5260732269352224, + "grad_norm": 0.3604688035888555, + "learning_rate": 8.769600404653516e-06, + "loss": 0.2634, + "step": 14800 + }, + { + "epoch": 2.5269266877186993, + "grad_norm": 0.34790212358787853, + "learning_rate": 8.753793626707133e-06, + "loss": 0.2556, + "step": 14805 + }, + { + "epoch": 2.5277801485021762, + "grad_norm": 0.33700470901798213, + "learning_rate": 8.737986848760749e-06, + "loss": 0.2818, + "step": 14810 + }, + { + "epoch": 2.528633609285653, + "grad_norm": 0.3044950344194103, + "learning_rate": 8.722180070814366e-06, + "loss": 0.2748, + "step": 14815 + }, + { + "epoch": 2.5294870700691305, + "grad_norm": 0.7149306608806377, + "learning_rate": 8.706373292867982e-06, + "loss": 0.2246, + "step": 14820 + }, + { + "epoch": 2.5303405308526075, + "grad_norm": 0.30250017016183345, + "learning_rate": 8.690566514921598e-06, + "loss": 0.2555, + "step": 14825 + }, + { + "epoch": 2.5311939916360844, + "grad_norm": 0.3278541640816871, + "learning_rate": 8.674759736975215e-06, + "loss": 0.2981, + "step": 14830 + }, + { + "epoch": 2.5320474524195613, + "grad_norm": 0.33474662014247003, + "learning_rate": 8.658952959028833e-06, + "loss": 0.2437, + "step": 14835 + }, + { + "epoch": 2.5329009132030382, + "grad_norm": 0.36515057289873015, + "learning_rate": 8.643146181082449e-06, + "loss": 0.2771, + "step": 14840 + }, + { + "epoch": 2.5337543739865156, + "grad_norm": 0.31446094410200376, + "learning_rate": 8.627339403136066e-06, + "loss": 0.2545, + "step": 14845 + }, + { + "epoch": 2.534607834769992, + "grad_norm": 0.32208686775598677, + "learning_rate": 8.611532625189682e-06, + "loss": 0.2594, + "step": 14850 + }, + { + "epoch": 2.5354612955534694, + "grad_norm": 0.3204019032092192, + "learning_rate": 8.595725847243297e-06, + "loss": 0.2582, + "step": 14855 + }, + { + "epoch": 2.5363147563369464, + "grad_norm": 0.31283853850881443, + "learning_rate": 8.579919069296915e-06, + "loss": 0.2997, + "step": 14860 + }, + { + "epoch": 2.5371682171204233, + "grad_norm": 0.2888961506768333, + "learning_rate": 8.564112291350532e-06, + "loss": 0.2618, + "step": 14865 + }, + { + "epoch": 2.5380216779039, + "grad_norm": 0.33297411246830283, + "learning_rate": 8.548305513404148e-06, + "loss": 0.2738, + "step": 14870 + }, + { + "epoch": 2.538875138687377, + "grad_norm": 0.4111717314073259, + "learning_rate": 8.532498735457764e-06, + "loss": 0.2666, + "step": 14875 + }, + { + "epoch": 2.5397285994708545, + "grad_norm": 0.3093128801039588, + "learning_rate": 8.516691957511381e-06, + "loss": 0.2788, + "step": 14880 + }, + { + "epoch": 2.5405820602543314, + "grad_norm": 0.3471700671659906, + "learning_rate": 8.500885179564997e-06, + "loss": 0.2499, + "step": 14885 + }, + { + "epoch": 2.5414355210378083, + "grad_norm": 0.34457282907570624, + "learning_rate": 8.485078401618615e-06, + "loss": 0.279, + "step": 14890 + }, + { + "epoch": 2.5422889818212853, + "grad_norm": 0.3703156588027841, + "learning_rate": 8.469271623672232e-06, + "loss": 0.2805, + "step": 14895 + }, + { + "epoch": 2.543142442604762, + "grad_norm": 0.3854068611469067, + "learning_rate": 8.453464845725848e-06, + "loss": 0.2684, + "step": 14900 + }, + { + "epoch": 2.5439959033882396, + "grad_norm": 0.3258732154948925, + "learning_rate": 8.437658067779464e-06, + "loss": 0.2783, + "step": 14905 + }, + { + "epoch": 2.544849364171716, + "grad_norm": 0.3257588125906597, + "learning_rate": 8.421851289833081e-06, + "loss": 0.2478, + "step": 14910 + }, + { + "epoch": 2.5457028249551934, + "grad_norm": 0.3327368767692538, + "learning_rate": 8.406044511886697e-06, + "loss": 0.2658, + "step": 14915 + }, + { + "epoch": 2.5465562857386703, + "grad_norm": 0.308838692022257, + "learning_rate": 8.390237733940314e-06, + "loss": 0.2681, + "step": 14920 + }, + { + "epoch": 2.5474097465221472, + "grad_norm": 0.31348836145437325, + "learning_rate": 8.374430955993932e-06, + "loss": 0.2677, + "step": 14925 + }, + { + "epoch": 2.548263207305624, + "grad_norm": 0.34955246448296834, + "learning_rate": 8.358624178047547e-06, + "loss": 0.2652, + "step": 14930 + }, + { + "epoch": 2.549116668089101, + "grad_norm": 0.3622847088386915, + "learning_rate": 8.342817400101163e-06, + "loss": 0.2554, + "step": 14935 + }, + { + "epoch": 2.5499701288725785, + "grad_norm": 0.3255722255573887, + "learning_rate": 8.32701062215478e-06, + "loss": 0.2649, + "step": 14940 + }, + { + "epoch": 2.5508235896560554, + "grad_norm": 0.36982499193159035, + "learning_rate": 8.311203844208396e-06, + "loss": 0.2948, + "step": 14945 + }, + { + "epoch": 2.5516770504395323, + "grad_norm": 0.36951547737636453, + "learning_rate": 8.295397066262014e-06, + "loss": 0.2585, + "step": 14950 + }, + { + "epoch": 2.5525305112230092, + "grad_norm": 0.3590117026965425, + "learning_rate": 8.279590288315631e-06, + "loss": 0.2559, + "step": 14955 + }, + { + "epoch": 2.553383972006486, + "grad_norm": 0.3636835050984558, + "learning_rate": 8.263783510369247e-06, + "loss": 0.2513, + "step": 14960 + }, + { + "epoch": 2.5542374327899635, + "grad_norm": 0.38929351543522134, + "learning_rate": 8.247976732422863e-06, + "loss": 0.2822, + "step": 14965 + }, + { + "epoch": 2.5550908935734404, + "grad_norm": 0.2865223071614123, + "learning_rate": 8.23216995447648e-06, + "loss": 0.2585, + "step": 14970 + }, + { + "epoch": 2.5559443543569174, + "grad_norm": 0.33793589845307387, + "learning_rate": 8.216363176530096e-06, + "loss": 0.2494, + "step": 14975 + }, + { + "epoch": 2.5567978151403943, + "grad_norm": 0.42855016920914785, + "learning_rate": 8.200556398583714e-06, + "loss": 0.2586, + "step": 14980 + }, + { + "epoch": 2.557651275923871, + "grad_norm": 0.3710576936912798, + "learning_rate": 8.184749620637331e-06, + "loss": 0.2653, + "step": 14985 + }, + { + "epoch": 2.5585047367073486, + "grad_norm": 0.3192702670716028, + "learning_rate": 8.168942842690947e-06, + "loss": 0.2742, + "step": 14990 + }, + { + "epoch": 2.559358197490825, + "grad_norm": 0.326286449535892, + "learning_rate": 8.153136064744563e-06, + "loss": 0.269, + "step": 14995 + }, + { + "epoch": 2.5602116582743024, + "grad_norm": 0.36306778061934497, + "learning_rate": 8.13732928679818e-06, + "loss": 0.267, + "step": 15000 + }, + { + "epoch": 2.5610651190577793, + "grad_norm": 0.3674230986760816, + "learning_rate": 8.121522508851796e-06, + "loss": 0.2556, + "step": 15005 + }, + { + "epoch": 2.5619185798412563, + "grad_norm": 0.3886694315470275, + "learning_rate": 8.105715730905413e-06, + "loss": 0.2586, + "step": 15010 + }, + { + "epoch": 2.562772040624733, + "grad_norm": 0.31734254900201264, + "learning_rate": 8.089908952959029e-06, + "loss": 0.2752, + "step": 15015 + }, + { + "epoch": 2.56362550140821, + "grad_norm": 0.3462925041349604, + "learning_rate": 8.074102175012646e-06, + "loss": 0.2601, + "step": 15020 + }, + { + "epoch": 2.5644789621916875, + "grad_norm": 0.36436655644205235, + "learning_rate": 8.058295397066262e-06, + "loss": 0.2734, + "step": 15025 + }, + { + "epoch": 2.5653324229751644, + "grad_norm": 0.3349267498877912, + "learning_rate": 8.042488619119878e-06, + "loss": 0.2777, + "step": 15030 + }, + { + "epoch": 2.5661858837586413, + "grad_norm": 0.353138665294069, + "learning_rate": 8.026681841173495e-06, + "loss": 0.287, + "step": 15035 + }, + { + "epoch": 2.5670393445421182, + "grad_norm": 0.3345752313165088, + "learning_rate": 8.010875063227111e-06, + "loss": 0.2488, + "step": 15040 + }, + { + "epoch": 2.567892805325595, + "grad_norm": 0.3606955744346641, + "learning_rate": 7.995068285280729e-06, + "loss": 0.2786, + "step": 15045 + }, + { + "epoch": 2.5687462661090725, + "grad_norm": 0.3412036584846417, + "learning_rate": 7.979261507334346e-06, + "loss": 0.258, + "step": 15050 + }, + { + "epoch": 2.5695997268925495, + "grad_norm": 0.35693653636226713, + "learning_rate": 7.963454729387962e-06, + "loss": 0.2642, + "step": 15055 + }, + { + "epoch": 2.5704531876760264, + "grad_norm": 0.3280891820451852, + "learning_rate": 7.947647951441578e-06, + "loss": 0.2845, + "step": 15060 + }, + { + "epoch": 2.5713066484595033, + "grad_norm": 0.3579927841500715, + "learning_rate": 7.931841173495195e-06, + "loss": 0.2667, + "step": 15065 + }, + { + "epoch": 2.57216010924298, + "grad_norm": 0.3384300716371121, + "learning_rate": 7.916034395548811e-06, + "loss": 0.2602, + "step": 15070 + }, + { + "epoch": 2.573013570026457, + "grad_norm": 0.35418106860695336, + "learning_rate": 7.900227617602428e-06, + "loss": 0.2673, + "step": 15075 + }, + { + "epoch": 2.573867030809934, + "grad_norm": 0.3698438742352702, + "learning_rate": 7.884420839656046e-06, + "loss": 0.2743, + "step": 15080 + }, + { + "epoch": 2.5747204915934114, + "grad_norm": 0.363613103217298, + "learning_rate": 7.868614061709662e-06, + "loss": 0.2549, + "step": 15085 + }, + { + "epoch": 2.5755739523768884, + "grad_norm": 0.3394372180741034, + "learning_rate": 7.852807283763277e-06, + "loss": 0.2718, + "step": 15090 + }, + { + "epoch": 2.5764274131603653, + "grad_norm": 0.36423762991683717, + "learning_rate": 7.837000505816895e-06, + "loss": 0.2949, + "step": 15095 + }, + { + "epoch": 2.577280873943842, + "grad_norm": 0.31743731897346655, + "learning_rate": 7.82119372787051e-06, + "loss": 0.2222, + "step": 15100 + }, + { + "epoch": 2.578134334727319, + "grad_norm": 0.32554087737361476, + "learning_rate": 7.805386949924128e-06, + "loss": 0.2709, + "step": 15105 + }, + { + "epoch": 2.5789877955107965, + "grad_norm": 0.31559740264213476, + "learning_rate": 7.789580171977745e-06, + "loss": 0.2755, + "step": 15110 + }, + { + "epoch": 2.5798412562942734, + "grad_norm": 0.3063018528733892, + "learning_rate": 7.773773394031361e-06, + "loss": 0.2489, + "step": 15115 + }, + { + "epoch": 2.5806947170777503, + "grad_norm": 0.31891238791816007, + "learning_rate": 7.757966616084977e-06, + "loss": 0.2815, + "step": 15120 + }, + { + "epoch": 2.5815481778612273, + "grad_norm": 0.3482705279801674, + "learning_rate": 7.742159838138594e-06, + "loss": 0.2372, + "step": 15125 + }, + { + "epoch": 2.582401638644704, + "grad_norm": 0.3468335021357962, + "learning_rate": 7.72635306019221e-06, + "loss": 0.2815, + "step": 15130 + }, + { + "epoch": 2.5832550994281815, + "grad_norm": 0.3465393777242725, + "learning_rate": 7.710546282245828e-06, + "loss": 0.2731, + "step": 15135 + }, + { + "epoch": 2.584108560211658, + "grad_norm": 0.3913655087141834, + "learning_rate": 7.694739504299443e-06, + "loss": 0.2575, + "step": 15140 + }, + { + "epoch": 2.5849620209951354, + "grad_norm": 0.3190687438763158, + "learning_rate": 7.678932726353061e-06, + "loss": 0.2669, + "step": 15145 + }, + { + "epoch": 2.5858154817786123, + "grad_norm": 0.358078662616606, + "learning_rate": 7.663125948406677e-06, + "loss": 0.2901, + "step": 15150 + }, + { + "epoch": 2.5866689425620892, + "grad_norm": 0.34333694637136325, + "learning_rate": 7.647319170460292e-06, + "loss": 0.2697, + "step": 15155 + }, + { + "epoch": 2.587522403345566, + "grad_norm": 0.35238615533564754, + "learning_rate": 7.63151239251391e-06, + "loss": 0.2598, + "step": 15160 + }, + { + "epoch": 2.588375864129043, + "grad_norm": 0.34817701038063625, + "learning_rate": 7.6157056145675265e-06, + "loss": 0.2519, + "step": 15165 + }, + { + "epoch": 2.5892293249125204, + "grad_norm": 0.31586534444356135, + "learning_rate": 7.599898836621143e-06, + "loss": 0.2504, + "step": 15170 + }, + { + "epoch": 2.5900827856959974, + "grad_norm": 0.37410390706038105, + "learning_rate": 7.5840920586747606e-06, + "loss": 0.2859, + "step": 15175 + }, + { + "epoch": 2.5909362464794743, + "grad_norm": 0.4076676227039792, + "learning_rate": 7.568285280728376e-06, + "loss": 0.2664, + "step": 15180 + }, + { + "epoch": 2.591789707262951, + "grad_norm": 0.328956954930043, + "learning_rate": 7.552478502781993e-06, + "loss": 0.2795, + "step": 15185 + }, + { + "epoch": 2.592643168046428, + "grad_norm": 0.3242313957454638, + "learning_rate": 7.53667172483561e-06, + "loss": 0.2605, + "step": 15190 + }, + { + "epoch": 2.5934966288299055, + "grad_norm": 0.35632982846401845, + "learning_rate": 7.520864946889226e-06, + "loss": 0.2711, + "step": 15195 + }, + { + "epoch": 2.5943500896133824, + "grad_norm": 0.3673822981048876, + "learning_rate": 7.505058168942843e-06, + "loss": 0.2431, + "step": 15200 + }, + { + "epoch": 2.5952035503968593, + "grad_norm": 0.38628552518056947, + "learning_rate": 7.48925139099646e-06, + "loss": 0.2666, + "step": 15205 + }, + { + "epoch": 2.5960570111803363, + "grad_norm": 0.3477240224200369, + "learning_rate": 7.473444613050076e-06, + "loss": 0.2615, + "step": 15210 + }, + { + "epoch": 2.596910471963813, + "grad_norm": 0.3444502479801836, + "learning_rate": 7.457637835103693e-06, + "loss": 0.2654, + "step": 15215 + }, + { + "epoch": 2.59776393274729, + "grad_norm": 0.3889329248670148, + "learning_rate": 7.44183105715731e-06, + "loss": 0.2655, + "step": 15220 + }, + { + "epoch": 2.598617393530767, + "grad_norm": 0.7088173666283505, + "learning_rate": 7.426024279210926e-06, + "loss": 0.2666, + "step": 15225 + }, + { + "epoch": 2.5994708543142444, + "grad_norm": 0.37264826890797625, + "learning_rate": 7.4102175012645424e-06, + "loss": 0.2666, + "step": 15230 + }, + { + "epoch": 2.6003243150977213, + "grad_norm": 0.3639291886301692, + "learning_rate": 7.39441072331816e-06, + "loss": 0.28, + "step": 15235 + }, + { + "epoch": 2.6011777758811983, + "grad_norm": 0.3427815985351985, + "learning_rate": 7.378603945371776e-06, + "loss": 0.2731, + "step": 15240 + }, + { + "epoch": 2.602031236664675, + "grad_norm": 0.32112484418580056, + "learning_rate": 7.362797167425392e-06, + "loss": 0.2465, + "step": 15245 + }, + { + "epoch": 2.602884697448152, + "grad_norm": 0.3613054889577417, + "learning_rate": 7.34699038947901e-06, + "loss": 0.2826, + "step": 15250 + }, + { + "epoch": 2.6037381582316295, + "grad_norm": 0.30921220095070084, + "learning_rate": 7.3311836115326255e-06, + "loss": 0.2635, + "step": 15255 + }, + { + "epoch": 2.6045916190151064, + "grad_norm": 0.35104175864231874, + "learning_rate": 7.315376833586242e-06, + "loss": 0.2503, + "step": 15260 + }, + { + "epoch": 2.6054450797985833, + "grad_norm": 0.3290929790912007, + "learning_rate": 7.2995700556398596e-06, + "loss": 0.2818, + "step": 15265 + }, + { + "epoch": 2.6062985405820602, + "grad_norm": 0.38330817541779394, + "learning_rate": 7.283763277693475e-06, + "loss": 0.2515, + "step": 15270 + }, + { + "epoch": 2.607152001365537, + "grad_norm": 0.3732479596804675, + "learning_rate": 7.267956499747092e-06, + "loss": 0.2756, + "step": 15275 + }, + { + "epoch": 2.6080054621490145, + "grad_norm": 0.34507633371042173, + "learning_rate": 7.252149721800708e-06, + "loss": 0.2564, + "step": 15280 + }, + { + "epoch": 2.608858922932491, + "grad_norm": 0.35935976480326104, + "learning_rate": 7.236342943854325e-06, + "loss": 0.2627, + "step": 15285 + }, + { + "epoch": 2.6097123837159684, + "grad_norm": 0.3126641707142665, + "learning_rate": 7.220536165907942e-06, + "loss": 0.2649, + "step": 15290 + }, + { + "epoch": 2.6105658444994453, + "grad_norm": 0.3924070249485932, + "learning_rate": 7.2047293879615575e-06, + "loss": 0.2532, + "step": 15295 + }, + { + "epoch": 2.611419305282922, + "grad_norm": 0.39147945187954714, + "learning_rate": 7.188922610015175e-06, + "loss": 0.2594, + "step": 15300 + }, + { + "epoch": 2.612272766066399, + "grad_norm": 0.33173543761510493, + "learning_rate": 7.173115832068792e-06, + "loss": 0.2505, + "step": 15305 + }, + { + "epoch": 2.613126226849876, + "grad_norm": 0.3320536524500648, + "learning_rate": 7.157309054122407e-06, + "loss": 0.2704, + "step": 15310 + }, + { + "epoch": 2.6139796876333534, + "grad_norm": 0.31324033158088654, + "learning_rate": 7.141502276176025e-06, + "loss": 0.2588, + "step": 15315 + }, + { + "epoch": 2.6148331484168303, + "grad_norm": 0.3673039365008783, + "learning_rate": 7.1256954982296414e-06, + "loss": 0.2577, + "step": 15320 + }, + { + "epoch": 2.6156866092003073, + "grad_norm": 0.35635933014280124, + "learning_rate": 7.109888720283257e-06, + "loss": 0.2847, + "step": 15325 + }, + { + "epoch": 2.616540069983784, + "grad_norm": 0.32938069554069876, + "learning_rate": 7.094081942336875e-06, + "loss": 0.2482, + "step": 15330 + }, + { + "epoch": 2.617393530767261, + "grad_norm": 0.32267895465365887, + "learning_rate": 7.078275164390491e-06, + "loss": 0.2595, + "step": 15335 + }, + { + "epoch": 2.6182469915507385, + "grad_norm": 0.338747778656447, + "learning_rate": 7.062468386444107e-06, + "loss": 0.2308, + "step": 15340 + }, + { + "epoch": 2.6191004523342154, + "grad_norm": 2.6701997243958746, + "learning_rate": 7.0466616084977245e-06, + "loss": 0.2452, + "step": 15345 + }, + { + "epoch": 2.6199539131176923, + "grad_norm": 0.3638867152831773, + "learning_rate": 7.030854830551341e-06, + "loss": 0.2659, + "step": 15350 + }, + { + "epoch": 2.6208073739011692, + "grad_norm": 0.3411134390071248, + "learning_rate": 7.015048052604957e-06, + "loss": 0.2765, + "step": 15355 + }, + { + "epoch": 2.621660834684646, + "grad_norm": 0.33325268514592343, + "learning_rate": 6.999241274658574e-06, + "loss": 0.2389, + "step": 15360 + }, + { + "epoch": 2.622514295468123, + "grad_norm": 0.3303227362918107, + "learning_rate": 6.983434496712191e-06, + "loss": 0.2505, + "step": 15365 + }, + { + "epoch": 2.6233677562516, + "grad_norm": 0.3383050930967925, + "learning_rate": 6.967627718765807e-06, + "loss": 0.2783, + "step": 15370 + }, + { + "epoch": 2.6242212170350774, + "grad_norm": 0.3280058631343651, + "learning_rate": 6.951820940819424e-06, + "loss": 0.2689, + "step": 15375 + }, + { + "epoch": 2.6250746778185543, + "grad_norm": 0.35144419432689084, + "learning_rate": 6.936014162873041e-06, + "loss": 0.2519, + "step": 15380 + }, + { + "epoch": 2.6259281386020312, + "grad_norm": 0.37545184334564646, + "learning_rate": 6.9202073849266565e-06, + "loss": 0.2723, + "step": 15385 + }, + { + "epoch": 2.626781599385508, + "grad_norm": 0.37231014008438706, + "learning_rate": 6.904400606980274e-06, + "loss": 0.2524, + "step": 15390 + }, + { + "epoch": 2.627635060168985, + "grad_norm": 0.33419225335162134, + "learning_rate": 6.888593829033891e-06, + "loss": 0.258, + "step": 15395 + }, + { + "epoch": 2.6284885209524624, + "grad_norm": 0.29567237571501975, + "learning_rate": 6.872787051087506e-06, + "loss": 0.2567, + "step": 15400 + }, + { + "epoch": 2.6293419817359394, + "grad_norm": 0.30445908419779605, + "learning_rate": 6.856980273141123e-06, + "loss": 0.2596, + "step": 15405 + }, + { + "epoch": 2.6301954425194163, + "grad_norm": 0.30655934744530733, + "learning_rate": 6.84117349519474e-06, + "loss": 0.2509, + "step": 15410 + }, + { + "epoch": 2.631048903302893, + "grad_norm": 0.5928998708875786, + "learning_rate": 6.825366717248356e-06, + "loss": 0.2709, + "step": 15415 + }, + { + "epoch": 2.63190236408637, + "grad_norm": 0.3644074401787618, + "learning_rate": 6.809559939301973e-06, + "loss": 0.2633, + "step": 15420 + }, + { + "epoch": 2.6327558248698475, + "grad_norm": 0.417807151329196, + "learning_rate": 6.79375316135559e-06, + "loss": 0.2622, + "step": 15425 + }, + { + "epoch": 2.633609285653324, + "grad_norm": 0.3564349216934015, + "learning_rate": 6.777946383409206e-06, + "loss": 0.2412, + "step": 15430 + }, + { + "epoch": 2.6344627464368013, + "grad_norm": 0.3645799274924827, + "learning_rate": 6.762139605462823e-06, + "loss": 0.2539, + "step": 15435 + }, + { + "epoch": 2.6353162072202783, + "grad_norm": 0.46329426878383684, + "learning_rate": 6.74633282751644e-06, + "loss": 0.2878, + "step": 15440 + }, + { + "epoch": 2.636169668003755, + "grad_norm": 0.41205339021908743, + "learning_rate": 6.730526049570056e-06, + "loss": 0.2789, + "step": 15445 + }, + { + "epoch": 2.637023128787232, + "grad_norm": 0.36220454124846574, + "learning_rate": 6.7147192716236725e-06, + "loss": 0.2563, + "step": 15450 + }, + { + "epoch": 2.637876589570709, + "grad_norm": 0.36394619076485407, + "learning_rate": 6.69891249367729e-06, + "loss": 0.2702, + "step": 15455 + }, + { + "epoch": 2.6387300503541864, + "grad_norm": 0.31510412525522824, + "learning_rate": 6.683105715730906e-06, + "loss": 0.2705, + "step": 15460 + }, + { + "epoch": 2.6395835111376633, + "grad_norm": 0.35959438946296624, + "learning_rate": 6.6672989377845214e-06, + "loss": 0.2638, + "step": 15465 + }, + { + "epoch": 2.6404369719211402, + "grad_norm": 0.3719073881386995, + "learning_rate": 6.65149215983814e-06, + "loss": 0.2614, + "step": 15470 + }, + { + "epoch": 2.641290432704617, + "grad_norm": 0.34851867562967587, + "learning_rate": 6.6356853818917555e-06, + "loss": 0.2704, + "step": 15475 + }, + { + "epoch": 2.642143893488094, + "grad_norm": 0.3412509124045147, + "learning_rate": 6.619878603945371e-06, + "loss": 0.2965, + "step": 15480 + }, + { + "epoch": 2.6429973542715715, + "grad_norm": 0.2969856118052839, + "learning_rate": 6.60407182599899e-06, + "loss": 0.2606, + "step": 15485 + }, + { + "epoch": 2.6438508150550484, + "grad_norm": 0.33960837625779033, + "learning_rate": 6.588265048052605e-06, + "loss": 0.2827, + "step": 15490 + }, + { + "epoch": 2.6447042758385253, + "grad_norm": 0.3435040294889023, + "learning_rate": 6.572458270106221e-06, + "loss": 0.2543, + "step": 15495 + }, + { + "epoch": 2.645557736622002, + "grad_norm": 0.4138041998758704, + "learning_rate": 6.556651492159839e-06, + "loss": 0.2535, + "step": 15500 + }, + { + "epoch": 2.646411197405479, + "grad_norm": 0.30733760212114253, + "learning_rate": 6.540844714213455e-06, + "loss": 0.2723, + "step": 15505 + }, + { + "epoch": 2.647264658188956, + "grad_norm": 0.34787237731846093, + "learning_rate": 6.525037936267071e-06, + "loss": 0.2574, + "step": 15510 + }, + { + "epoch": 2.648118118972433, + "grad_norm": 0.3549540085305931, + "learning_rate": 6.509231158320689e-06, + "loss": 0.2653, + "step": 15515 + }, + { + "epoch": 2.6489715797559104, + "grad_norm": 0.37233586082518344, + "learning_rate": 6.493424380374305e-06, + "loss": 0.2551, + "step": 15520 + }, + { + "epoch": 2.6498250405393873, + "grad_norm": 0.323166190585263, + "learning_rate": 6.477617602427921e-06, + "loss": 0.2646, + "step": 15525 + }, + { + "epoch": 2.650678501322864, + "grad_norm": 0.34153440033426474, + "learning_rate": 6.461810824481539e-06, + "loss": 0.2538, + "step": 15530 + }, + { + "epoch": 2.651531962106341, + "grad_norm": 0.34495479489164316, + "learning_rate": 6.446004046535155e-06, + "loss": 0.2454, + "step": 15535 + }, + { + "epoch": 2.652385422889818, + "grad_norm": 0.2990419238606832, + "learning_rate": 6.430197268588771e-06, + "loss": 0.2983, + "step": 15540 + }, + { + "epoch": 2.6532388836732954, + "grad_norm": 0.3373808033775284, + "learning_rate": 6.414390490642387e-06, + "loss": 0.2648, + "step": 15545 + }, + { + "epoch": 2.6540923444567723, + "grad_norm": 0.30005592281826204, + "learning_rate": 6.398583712696005e-06, + "loss": 0.2769, + "step": 15550 + }, + { + "epoch": 2.6549458052402493, + "grad_norm": 0.2950390468483142, + "learning_rate": 6.3827769347496204e-06, + "loss": 0.2502, + "step": 15555 + }, + { + "epoch": 2.655799266023726, + "grad_norm": 0.7361620948064368, + "learning_rate": 6.366970156803237e-06, + "loss": 0.226, + "step": 15560 + }, + { + "epoch": 2.656652726807203, + "grad_norm": 0.35702882770988104, + "learning_rate": 6.3511633788568545e-06, + "loss": 0.2887, + "step": 15565 + }, + { + "epoch": 2.6575061875906805, + "grad_norm": 0.40098849474500975, + "learning_rate": 6.33535660091047e-06, + "loss": 0.2595, + "step": 15570 + }, + { + "epoch": 2.658359648374157, + "grad_norm": 0.33299844553405433, + "learning_rate": 6.319549822964087e-06, + "loss": 0.2624, + "step": 15575 + }, + { + "epoch": 2.6592131091576343, + "grad_norm": 0.9552251998611956, + "learning_rate": 6.303743045017704e-06, + "loss": 0.2657, + "step": 15580 + }, + { + "epoch": 2.6600665699411112, + "grad_norm": 0.33097635247691237, + "learning_rate": 6.28793626707132e-06, + "loss": 0.2329, + "step": 15585 + }, + { + "epoch": 2.660920030724588, + "grad_norm": 0.33191527265529913, + "learning_rate": 6.272129489124937e-06, + "loss": 0.2682, + "step": 15590 + }, + { + "epoch": 2.661773491508065, + "grad_norm": 0.3159377513980589, + "learning_rate": 6.256322711178554e-06, + "loss": 0.2587, + "step": 15595 + }, + { + "epoch": 2.662626952291542, + "grad_norm": 0.3141137315073459, + "learning_rate": 6.24051593323217e-06, + "loss": 0.288, + "step": 15600 + }, + { + "epoch": 2.6634804130750194, + "grad_norm": 0.3116398274186534, + "learning_rate": 6.224709155285787e-06, + "loss": 0.2686, + "step": 15605 + }, + { + "epoch": 2.6643338738584963, + "grad_norm": 0.3848915059332201, + "learning_rate": 6.208902377339403e-06, + "loss": 0.2774, + "step": 15610 + }, + { + "epoch": 2.665187334641973, + "grad_norm": 0.3917115023600133, + "learning_rate": 6.19309559939302e-06, + "loss": 0.2635, + "step": 15615 + }, + { + "epoch": 2.66604079542545, + "grad_norm": 0.37598166690722307, + "learning_rate": 6.177288821446637e-06, + "loss": 0.2359, + "step": 15620 + }, + { + "epoch": 2.666894256208927, + "grad_norm": 0.3427634213516318, + "learning_rate": 6.161482043500253e-06, + "loss": 0.2621, + "step": 15625 + }, + { + "epoch": 2.6677477169924044, + "grad_norm": 0.32992098983283125, + "learning_rate": 6.14567526555387e-06, + "loss": 0.2628, + "step": 15630 + }, + { + "epoch": 2.6686011777758814, + "grad_norm": 0.38083250474499536, + "learning_rate": 6.129868487607486e-06, + "loss": 0.2593, + "step": 15635 + }, + { + "epoch": 2.6694546385593583, + "grad_norm": 0.3737949530874446, + "learning_rate": 6.114061709661103e-06, + "loss": 0.2485, + "step": 15640 + }, + { + "epoch": 2.670308099342835, + "grad_norm": 0.36254880895682357, + "learning_rate": 6.098254931714719e-06, + "loss": 0.2741, + "step": 15645 + }, + { + "epoch": 2.671161560126312, + "grad_norm": 0.3679555211260355, + "learning_rate": 6.082448153768336e-06, + "loss": 0.2622, + "step": 15650 + }, + { + "epoch": 2.672015020909789, + "grad_norm": 0.32060345874403146, + "learning_rate": 6.066641375821953e-06, + "loss": 0.2662, + "step": 15655 + }, + { + "epoch": 2.672868481693266, + "grad_norm": 0.3079424030638837, + "learning_rate": 6.050834597875569e-06, + "loss": 0.2518, + "step": 15660 + }, + { + "epoch": 2.6737219424767433, + "grad_norm": 0.33856647147712504, + "learning_rate": 6.035027819929186e-06, + "loss": 0.2501, + "step": 15665 + }, + { + "epoch": 2.6745754032602203, + "grad_norm": 0.3653565329375958, + "learning_rate": 6.0192210419828025e-06, + "loss": 0.2542, + "step": 15670 + }, + { + "epoch": 2.675428864043697, + "grad_norm": 0.3595713968478812, + "learning_rate": 6.003414264036419e-06, + "loss": 0.2538, + "step": 15675 + }, + { + "epoch": 2.676282324827174, + "grad_norm": 0.33627575601187915, + "learning_rate": 5.987607486090036e-06, + "loss": 0.267, + "step": 15680 + }, + { + "epoch": 2.677135785610651, + "grad_norm": 0.4062703192670018, + "learning_rate": 5.971800708143652e-06, + "loss": 0.249, + "step": 15685 + }, + { + "epoch": 2.6779892463941284, + "grad_norm": 0.3594122030085205, + "learning_rate": 5.955993930197269e-06, + "loss": 0.2798, + "step": 15690 + }, + { + "epoch": 2.6788427071776053, + "grad_norm": 0.3895681673156118, + "learning_rate": 5.9401871522508855e-06, + "loss": 0.2818, + "step": 15695 + }, + { + "epoch": 2.6796961679610822, + "grad_norm": 0.43038452358267465, + "learning_rate": 5.924380374304502e-06, + "loss": 0.2598, + "step": 15700 + }, + { + "epoch": 2.680549628744559, + "grad_norm": 0.3995685056530944, + "learning_rate": 5.908573596358119e-06, + "loss": 0.2933, + "step": 15705 + }, + { + "epoch": 2.681403089528036, + "grad_norm": 0.3734290041378778, + "learning_rate": 5.892766818411735e-06, + "loss": 0.2743, + "step": 15710 + }, + { + "epoch": 2.6822565503115134, + "grad_norm": 0.31717697701338593, + "learning_rate": 5.876960040465352e-06, + "loss": 0.2742, + "step": 15715 + }, + { + "epoch": 2.68311001109499, + "grad_norm": 0.37878051677289487, + "learning_rate": 5.861153262518968e-06, + "loss": 0.2395, + "step": 15720 + }, + { + "epoch": 2.6839634718784673, + "grad_norm": 0.3502105273684265, + "learning_rate": 5.845346484572585e-06, + "loss": 0.2554, + "step": 15725 + }, + { + "epoch": 2.684816932661944, + "grad_norm": 0.36504635411920455, + "learning_rate": 5.829539706626202e-06, + "loss": 0.2799, + "step": 15730 + }, + { + "epoch": 2.685670393445421, + "grad_norm": 0.32571179571423386, + "learning_rate": 5.8137329286798176e-06, + "loss": 0.2738, + "step": 15735 + }, + { + "epoch": 2.686523854228898, + "grad_norm": 0.3450408141278497, + "learning_rate": 5.797926150733435e-06, + "loss": 0.2568, + "step": 15740 + }, + { + "epoch": 2.687377315012375, + "grad_norm": 0.35448784854975235, + "learning_rate": 5.782119372787052e-06, + "loss": 0.27, + "step": 15745 + }, + { + "epoch": 2.6882307757958523, + "grad_norm": 0.33889855398398216, + "learning_rate": 5.766312594840667e-06, + "loss": 0.2481, + "step": 15750 + }, + { + "epoch": 2.6890842365793293, + "grad_norm": 0.35149130323176103, + "learning_rate": 5.750505816894285e-06, + "loss": 0.2301, + "step": 15755 + }, + { + "epoch": 2.689937697362806, + "grad_norm": 0.3660742482701346, + "learning_rate": 5.7346990389479015e-06, + "loss": 0.2823, + "step": 15760 + }, + { + "epoch": 2.690791158146283, + "grad_norm": 0.3411786259643136, + "learning_rate": 5.718892261001517e-06, + "loss": 0.2463, + "step": 15765 + }, + { + "epoch": 2.69164461892976, + "grad_norm": 0.3730178861789147, + "learning_rate": 5.703085483055135e-06, + "loss": 0.2242, + "step": 15770 + }, + { + "epoch": 2.6924980797132374, + "grad_norm": 0.3267702608488623, + "learning_rate": 5.6872787051087505e-06, + "loss": 0.2437, + "step": 15775 + }, + { + "epoch": 2.6933515404967143, + "grad_norm": 0.3303506142224966, + "learning_rate": 5.671471927162367e-06, + "loss": 0.2716, + "step": 15780 + }, + { + "epoch": 2.6942050012801912, + "grad_norm": 0.3483159930338373, + "learning_rate": 5.6556651492159845e-06, + "loss": 0.2964, + "step": 15785 + }, + { + "epoch": 2.695058462063668, + "grad_norm": 0.34365776948938526, + "learning_rate": 5.6398583712696e-06, + "loss": 0.2501, + "step": 15790 + }, + { + "epoch": 2.695911922847145, + "grad_norm": 0.5091088648901413, + "learning_rate": 5.624051593323217e-06, + "loss": 0.2498, + "step": 15795 + }, + { + "epoch": 2.696765383630622, + "grad_norm": 0.3219123113570026, + "learning_rate": 5.608244815376834e-06, + "loss": 0.2548, + "step": 15800 + }, + { + "epoch": 2.697618844414099, + "grad_norm": 0.36789902114116035, + "learning_rate": 5.59243803743045e-06, + "loss": 0.2388, + "step": 15805 + }, + { + "epoch": 2.6984723051975763, + "grad_norm": 0.40298010696935843, + "learning_rate": 5.576631259484067e-06, + "loss": 0.2751, + "step": 15810 + }, + { + "epoch": 2.6993257659810532, + "grad_norm": 0.4110009799120327, + "learning_rate": 5.560824481537684e-06, + "loss": 0.2667, + "step": 15815 + }, + { + "epoch": 2.70017922676453, + "grad_norm": 0.33758326531005173, + "learning_rate": 5.5450177035913e-06, + "loss": 0.2563, + "step": 15820 + }, + { + "epoch": 2.701032687548007, + "grad_norm": 0.33169441729925814, + "learning_rate": 5.5292109256449166e-06, + "loss": 0.2477, + "step": 15825 + }, + { + "epoch": 2.701886148331484, + "grad_norm": 0.3708573039512998, + "learning_rate": 5.513404147698533e-06, + "loss": 0.2853, + "step": 15830 + }, + { + "epoch": 2.7027396091149614, + "grad_norm": 0.32450941495953645, + "learning_rate": 5.49759736975215e-06, + "loss": 0.2648, + "step": 15835 + }, + { + "epoch": 2.7035930698984383, + "grad_norm": 0.5847723828809334, + "learning_rate": 5.481790591805766e-06, + "loss": 0.272, + "step": 15840 + }, + { + "epoch": 2.704446530681915, + "grad_norm": 0.3362346277513503, + "learning_rate": 5.465983813859383e-06, + "loss": 0.2465, + "step": 15845 + }, + { + "epoch": 2.705299991465392, + "grad_norm": 0.35963561018554313, + "learning_rate": 5.450177035913e-06, + "loss": 0.2742, + "step": 15850 + }, + { + "epoch": 2.706153452248869, + "grad_norm": 0.3016792294913857, + "learning_rate": 5.434370257966616e-06, + "loss": 0.28, + "step": 15855 + }, + { + "epoch": 2.7070069130323464, + "grad_norm": 0.3303518225889094, + "learning_rate": 5.418563480020233e-06, + "loss": 0.2682, + "step": 15860 + }, + { + "epoch": 2.707860373815823, + "grad_norm": 0.31989733013292765, + "learning_rate": 5.4027567020738494e-06, + "loss": 0.2792, + "step": 15865 + }, + { + "epoch": 2.7087138345993003, + "grad_norm": 0.34750150943486774, + "learning_rate": 5.386949924127466e-06, + "loss": 0.2613, + "step": 15870 + }, + { + "epoch": 2.709567295382777, + "grad_norm": 0.35145405872984736, + "learning_rate": 5.371143146181083e-06, + "loss": 0.2453, + "step": 15875 + }, + { + "epoch": 2.710420756166254, + "grad_norm": 0.3140012579713542, + "learning_rate": 5.355336368234699e-06, + "loss": 0.2592, + "step": 15880 + }, + { + "epoch": 2.711274216949731, + "grad_norm": 0.33591125148705864, + "learning_rate": 5.339529590288316e-06, + "loss": 0.2669, + "step": 15885 + }, + { + "epoch": 2.712127677733208, + "grad_norm": 0.3087501936993232, + "learning_rate": 5.3237228123419325e-06, + "loss": 0.2506, + "step": 15890 + }, + { + "epoch": 2.7129811385166853, + "grad_norm": 0.36517175964213555, + "learning_rate": 5.307916034395549e-06, + "loss": 0.2564, + "step": 15895 + }, + { + "epoch": 2.7138345993001622, + "grad_norm": 0.3475024929941956, + "learning_rate": 5.292109256449166e-06, + "loss": 0.2758, + "step": 15900 + }, + { + "epoch": 2.714688060083639, + "grad_norm": 0.334050236491831, + "learning_rate": 5.276302478502782e-06, + "loss": 0.2882, + "step": 15905 + }, + { + "epoch": 2.715541520867116, + "grad_norm": 0.33305463616687425, + "learning_rate": 5.260495700556399e-06, + "loss": 0.2417, + "step": 15910 + }, + { + "epoch": 2.716394981650593, + "grad_norm": 0.3399104079077125, + "learning_rate": 5.2446889226100155e-06, + "loss": 0.2563, + "step": 15915 + }, + { + "epoch": 2.7172484424340704, + "grad_norm": 0.35992749919916034, + "learning_rate": 5.228882144663632e-06, + "loss": 0.2698, + "step": 15920 + }, + { + "epoch": 2.7181019032175473, + "grad_norm": 0.37463460779301216, + "learning_rate": 5.213075366717249e-06, + "loss": 0.2644, + "step": 15925 + }, + { + "epoch": 2.718955364001024, + "grad_norm": 0.34481025499311296, + "learning_rate": 5.197268588770865e-06, + "loss": 0.2767, + "step": 15930 + }, + { + "epoch": 2.719808824784501, + "grad_norm": 0.31824100918089626, + "learning_rate": 5.181461810824482e-06, + "loss": 0.2768, + "step": 15935 + }, + { + "epoch": 2.720662285567978, + "grad_norm": 0.39036827158201115, + "learning_rate": 5.165655032878099e-06, + "loss": 0.2468, + "step": 15940 + }, + { + "epoch": 2.721515746351455, + "grad_norm": 0.35354453353195064, + "learning_rate": 5.149848254931715e-06, + "loss": 0.2574, + "step": 15945 + }, + { + "epoch": 2.722369207134932, + "grad_norm": 0.3651947064412998, + "learning_rate": 5.134041476985332e-06, + "loss": 0.2598, + "step": 15950 + }, + { + "epoch": 2.7232226679184093, + "grad_norm": 0.3200578358249782, + "learning_rate": 5.1182346990389484e-06, + "loss": 0.2524, + "step": 15955 + }, + { + "epoch": 2.724076128701886, + "grad_norm": 0.32106339624302854, + "learning_rate": 5.102427921092565e-06, + "loss": 0.2834, + "step": 15960 + }, + { + "epoch": 2.724929589485363, + "grad_norm": 0.35695751957917327, + "learning_rate": 5.086621143146182e-06, + "loss": 0.2335, + "step": 15965 + }, + { + "epoch": 2.72578305026884, + "grad_norm": 0.3458854817400443, + "learning_rate": 5.070814365199797e-06, + "loss": 0.266, + "step": 15970 + }, + { + "epoch": 2.726636511052317, + "grad_norm": 0.3768715108688561, + "learning_rate": 5.055007587253414e-06, + "loss": 0.2441, + "step": 15975 + }, + { + "epoch": 2.7274899718357943, + "grad_norm": 0.401465053041461, + "learning_rate": 5.0392008093070315e-06, + "loss": 0.2586, + "step": 15980 + }, + { + "epoch": 2.7283434326192713, + "grad_norm": 0.35909876371708843, + "learning_rate": 5.023394031360647e-06, + "loss": 0.2668, + "step": 15985 + }, + { + "epoch": 2.729196893402748, + "grad_norm": 0.3404648389384397, + "learning_rate": 5.007587253414264e-06, + "loss": 0.2799, + "step": 15990 + }, + { + "epoch": 2.730050354186225, + "grad_norm": 0.3559294370769041, + "learning_rate": 4.991780475467881e-06, + "loss": 0.2634, + "step": 15995 + }, + { + "epoch": 2.730903814969702, + "grad_norm": 0.3254507220933428, + "learning_rate": 4.975973697521497e-06, + "loss": 0.2711, + "step": 16000 + }, + { + "epoch": 2.7317572757531794, + "grad_norm": 0.3289249884006375, + "learning_rate": 4.960166919575114e-06, + "loss": 0.2674, + "step": 16005 + }, + { + "epoch": 2.732610736536656, + "grad_norm": 0.35530721566450874, + "learning_rate": 4.944360141628731e-06, + "loss": 0.2764, + "step": 16010 + }, + { + "epoch": 2.7334641973201332, + "grad_norm": 0.3322818786935116, + "learning_rate": 4.928553363682347e-06, + "loss": 0.2568, + "step": 16015 + }, + { + "epoch": 2.73431765810361, + "grad_norm": 0.4173584135881613, + "learning_rate": 4.9127465857359635e-06, + "loss": 0.3052, + "step": 16020 + }, + { + "epoch": 2.735171118887087, + "grad_norm": 0.34952729726552456, + "learning_rate": 4.896939807789581e-06, + "loss": 0.2538, + "step": 16025 + }, + { + "epoch": 2.736024579670564, + "grad_norm": 0.36015010665089303, + "learning_rate": 4.881133029843197e-06, + "loss": 0.2727, + "step": 16030 + }, + { + "epoch": 2.736878040454041, + "grad_norm": 0.36300589588357357, + "learning_rate": 4.865326251896813e-06, + "loss": 0.2749, + "step": 16035 + }, + { + "epoch": 2.7377315012375183, + "grad_norm": 0.3523399666869812, + "learning_rate": 4.84951947395043e-06, + "loss": 0.2735, + "step": 16040 + }, + { + "epoch": 2.738584962020995, + "grad_norm": 0.32157226769536656, + "learning_rate": 4.833712696004047e-06, + "loss": 0.2646, + "step": 16045 + }, + { + "epoch": 2.739438422804472, + "grad_norm": 0.31282308394905234, + "learning_rate": 4.817905918057663e-06, + "loss": 0.2716, + "step": 16050 + }, + { + "epoch": 2.740291883587949, + "grad_norm": 0.35253803863997824, + "learning_rate": 4.80209914011128e-06, + "loss": 0.2824, + "step": 16055 + }, + { + "epoch": 2.741145344371426, + "grad_norm": 0.3998407459813653, + "learning_rate": 4.786292362164896e-06, + "loss": 0.2555, + "step": 16060 + }, + { + "epoch": 2.7419988051549034, + "grad_norm": 0.3330651152990024, + "learning_rate": 4.770485584218513e-06, + "loss": 0.234, + "step": 16065 + }, + { + "epoch": 2.7428522659383803, + "grad_norm": 0.34791339272080873, + "learning_rate": 4.75467880627213e-06, + "loss": 0.2464, + "step": 16070 + }, + { + "epoch": 2.743705726721857, + "grad_norm": 0.39063260922066667, + "learning_rate": 4.738872028325746e-06, + "loss": 0.2463, + "step": 16075 + }, + { + "epoch": 2.744559187505334, + "grad_norm": 0.32492035824152354, + "learning_rate": 4.723065250379363e-06, + "loss": 0.2537, + "step": 16080 + }, + { + "epoch": 2.745412648288811, + "grad_norm": 0.3731137115036364, + "learning_rate": 4.7072584724329795e-06, + "loss": 0.2679, + "step": 16085 + }, + { + "epoch": 2.746266109072288, + "grad_norm": 0.3456348621232528, + "learning_rate": 4.691451694486596e-06, + "loss": 0.2378, + "step": 16090 + }, + { + "epoch": 2.747119569855765, + "grad_norm": 0.6624254071052443, + "learning_rate": 4.675644916540213e-06, + "loss": 0.2481, + "step": 16095 + }, + { + "epoch": 2.7479730306392423, + "grad_norm": 0.355392307772178, + "learning_rate": 4.659838138593829e-06, + "loss": 0.2532, + "step": 16100 + }, + { + "epoch": 2.748826491422719, + "grad_norm": 0.3152096224980243, + "learning_rate": 4.644031360647446e-06, + "loss": 0.2586, + "step": 16105 + }, + { + "epoch": 2.749679952206196, + "grad_norm": 0.4539853050997346, + "learning_rate": 4.6282245827010625e-06, + "loss": 0.255, + "step": 16110 + }, + { + "epoch": 2.750533412989673, + "grad_norm": 0.36829194095607254, + "learning_rate": 4.612417804754679e-06, + "loss": 0.2704, + "step": 16115 + }, + { + "epoch": 2.75138687377315, + "grad_norm": 0.33596245320254203, + "learning_rate": 4.596611026808296e-06, + "loss": 0.2722, + "step": 16120 + }, + { + "epoch": 2.7522403345566273, + "grad_norm": 0.3484077352550164, + "learning_rate": 4.580804248861912e-06, + "loss": 0.2832, + "step": 16125 + }, + { + "epoch": 2.7530937953401042, + "grad_norm": 0.30581596719805965, + "learning_rate": 4.564997470915529e-06, + "loss": 0.2872, + "step": 16130 + }, + { + "epoch": 2.753947256123581, + "grad_norm": 0.35869044112332943, + "learning_rate": 4.5491906929691456e-06, + "loss": 0.2911, + "step": 16135 + }, + { + "epoch": 2.754800716907058, + "grad_norm": 0.3351541729240264, + "learning_rate": 4.533383915022762e-06, + "loss": 0.2431, + "step": 16140 + }, + { + "epoch": 2.755654177690535, + "grad_norm": 0.3364680381630783, + "learning_rate": 4.517577137076379e-06, + "loss": 0.2764, + "step": 16145 + }, + { + "epoch": 2.7565076384740124, + "grad_norm": 0.38641523054155913, + "learning_rate": 4.501770359129995e-06, + "loss": 0.2741, + "step": 16150 + }, + { + "epoch": 2.757361099257489, + "grad_norm": 0.46147946741974905, + "learning_rate": 4.485963581183612e-06, + "loss": 0.2459, + "step": 16155 + }, + { + "epoch": 2.758214560040966, + "grad_norm": 0.346631136401872, + "learning_rate": 4.470156803237229e-06, + "loss": 0.2617, + "step": 16160 + }, + { + "epoch": 2.759068020824443, + "grad_norm": 0.3563142377923633, + "learning_rate": 4.454350025290844e-06, + "loss": 0.2447, + "step": 16165 + }, + { + "epoch": 2.75992148160792, + "grad_norm": 0.3205055929160683, + "learning_rate": 4.438543247344462e-06, + "loss": 0.2811, + "step": 16170 + }, + { + "epoch": 2.760774942391397, + "grad_norm": 0.3543129423537271, + "learning_rate": 4.4227364693980785e-06, + "loss": 0.2739, + "step": 16175 + }, + { + "epoch": 2.761628403174874, + "grad_norm": 0.7640148553266405, + "learning_rate": 4.406929691451694e-06, + "loss": 0.2907, + "step": 16180 + }, + { + "epoch": 2.7624818639583513, + "grad_norm": 0.30945303636667076, + "learning_rate": 4.391122913505312e-06, + "loss": 0.2674, + "step": 16185 + }, + { + "epoch": 2.763335324741828, + "grad_norm": 0.3032270814697051, + "learning_rate": 4.375316135558928e-06, + "loss": 0.2625, + "step": 16190 + }, + { + "epoch": 2.764188785525305, + "grad_norm": 0.3517085066160985, + "learning_rate": 4.359509357612544e-06, + "loss": 0.2671, + "step": 16195 + }, + { + "epoch": 2.765042246308782, + "grad_norm": 0.3879520393018638, + "learning_rate": 4.3437025796661615e-06, + "loss": 0.2581, + "step": 16200 + }, + { + "epoch": 2.765895707092259, + "grad_norm": 0.4244100788756175, + "learning_rate": 4.327895801719778e-06, + "loss": 0.2507, + "step": 16205 + }, + { + "epoch": 2.7667491678757363, + "grad_norm": 0.3199529571535488, + "learning_rate": 4.312089023773394e-06, + "loss": 0.2713, + "step": 16210 + }, + { + "epoch": 2.7676026286592132, + "grad_norm": 0.5247639264974975, + "learning_rate": 4.296282245827011e-06, + "loss": 0.2678, + "step": 16215 + }, + { + "epoch": 2.76845608944269, + "grad_norm": 0.3445060865346845, + "learning_rate": 4.280475467880628e-06, + "loss": 0.2706, + "step": 16220 + }, + { + "epoch": 2.769309550226167, + "grad_norm": 0.3944631570601775, + "learning_rate": 4.264668689934244e-06, + "loss": 0.2693, + "step": 16225 + }, + { + "epoch": 2.770163011009644, + "grad_norm": 0.36783086003405474, + "learning_rate": 4.24886191198786e-06, + "loss": 0.2809, + "step": 16230 + }, + { + "epoch": 2.7710164717931214, + "grad_norm": 0.3638614241928813, + "learning_rate": 4.233055134041477e-06, + "loss": 0.255, + "step": 16235 + }, + { + "epoch": 2.771869932576598, + "grad_norm": 0.5399309369816859, + "learning_rate": 4.2172483560950935e-06, + "loss": 0.2785, + "step": 16240 + }, + { + "epoch": 2.7727233933600752, + "grad_norm": 0.3501567327898987, + "learning_rate": 4.20144157814871e-06, + "loss": 0.2741, + "step": 16245 + }, + { + "epoch": 2.773576854143552, + "grad_norm": 0.31510586430391474, + "learning_rate": 4.185634800202327e-06, + "loss": 0.2707, + "step": 16250 + }, + { + "epoch": 2.774430314927029, + "grad_norm": 0.3432568121675401, + "learning_rate": 4.169828022255943e-06, + "loss": 0.2728, + "step": 16255 + }, + { + "epoch": 2.775283775710506, + "grad_norm": 0.3380908042725392, + "learning_rate": 4.15402124430956e-06, + "loss": 0.286, + "step": 16260 + }, + { + "epoch": 2.776137236493983, + "grad_norm": 0.3451036213791636, + "learning_rate": 4.138214466363177e-06, + "loss": 0.2497, + "step": 16265 + }, + { + "epoch": 2.7769906972774603, + "grad_norm": 0.3671258981765301, + "learning_rate": 4.122407688416793e-06, + "loss": 0.2403, + "step": 16270 + }, + { + "epoch": 2.777844158060937, + "grad_norm": 0.6107452247947716, + "learning_rate": 4.10660091047041e-06, + "loss": 0.257, + "step": 16275 + }, + { + "epoch": 2.778697618844414, + "grad_norm": 0.3152322065227391, + "learning_rate": 4.0907941325240264e-06, + "loss": 0.2649, + "step": 16280 + }, + { + "epoch": 2.779551079627891, + "grad_norm": 0.35977789232362667, + "learning_rate": 4.074987354577643e-06, + "loss": 0.2451, + "step": 16285 + }, + { + "epoch": 2.780404540411368, + "grad_norm": 0.3628960931576398, + "learning_rate": 4.05918057663126e-06, + "loss": 0.2593, + "step": 16290 + }, + { + "epoch": 2.7812580011948453, + "grad_norm": 0.3123003825387298, + "learning_rate": 4.043373798684876e-06, + "loss": 0.261, + "step": 16295 + }, + { + "epoch": 2.782111461978322, + "grad_norm": 0.334998792544953, + "learning_rate": 4.027567020738493e-06, + "loss": 0.2563, + "step": 16300 + }, + { + "epoch": 2.782964922761799, + "grad_norm": 0.311287963703417, + "learning_rate": 4.0117602427921095e-06, + "loss": 0.2662, + "step": 16305 + }, + { + "epoch": 2.783818383545276, + "grad_norm": 0.400043297606707, + "learning_rate": 3.995953464845726e-06, + "loss": 0.2537, + "step": 16310 + }, + { + "epoch": 2.784671844328753, + "grad_norm": 0.33348227720765716, + "learning_rate": 3.980146686899343e-06, + "loss": 0.2489, + "step": 16315 + }, + { + "epoch": 2.78552530511223, + "grad_norm": 0.32342434498550293, + "learning_rate": 3.964339908952959e-06, + "loss": 0.2484, + "step": 16320 + }, + { + "epoch": 2.786378765895707, + "grad_norm": 0.3383306821346228, + "learning_rate": 3.948533131006576e-06, + "loss": 0.2678, + "step": 16325 + }, + { + "epoch": 2.7872322266791842, + "grad_norm": 0.333702445040295, + "learning_rate": 3.9327263530601925e-06, + "loss": 0.2498, + "step": 16330 + }, + { + "epoch": 2.788085687462661, + "grad_norm": 0.3529633567573724, + "learning_rate": 3.916919575113809e-06, + "loss": 0.2629, + "step": 16335 + }, + { + "epoch": 2.788939148246138, + "grad_norm": 0.3188183771237849, + "learning_rate": 3.901112797167426e-06, + "loss": 0.2715, + "step": 16340 + }, + { + "epoch": 2.789792609029615, + "grad_norm": 0.3186689929202362, + "learning_rate": 3.885306019221042e-06, + "loss": 0.2703, + "step": 16345 + }, + { + "epoch": 2.790646069813092, + "grad_norm": 0.359627613155441, + "learning_rate": 3.869499241274659e-06, + "loss": 0.2653, + "step": 16350 + }, + { + "epoch": 2.7914995305965693, + "grad_norm": 0.3679817694329449, + "learning_rate": 3.853692463328276e-06, + "loss": 0.2694, + "step": 16355 + }, + { + "epoch": 2.7923529913800462, + "grad_norm": 0.3084239161690286, + "learning_rate": 3.837885685381891e-06, + "loss": 0.2804, + "step": 16360 + }, + { + "epoch": 2.793206452163523, + "grad_norm": 0.3551961472696471, + "learning_rate": 3.822078907435509e-06, + "loss": 0.2701, + "step": 16365 + }, + { + "epoch": 2.794059912947, + "grad_norm": 0.3466176637809046, + "learning_rate": 3.8062721294891254e-06, + "loss": 0.2719, + "step": 16370 + }, + { + "epoch": 2.794913373730477, + "grad_norm": 0.3539991568159895, + "learning_rate": 3.7904653515427416e-06, + "loss": 0.2529, + "step": 16375 + }, + { + "epoch": 2.7957668345139544, + "grad_norm": 0.33891118964849615, + "learning_rate": 3.7746585735963582e-06, + "loss": 0.2636, + "step": 16380 + }, + { + "epoch": 2.796620295297431, + "grad_norm": 0.38201978122494606, + "learning_rate": 3.7588517956499753e-06, + "loss": 0.2641, + "step": 16385 + }, + { + "epoch": 2.797473756080908, + "grad_norm": 0.38292117731613895, + "learning_rate": 3.7430450177035914e-06, + "loss": 0.2561, + "step": 16390 + }, + { + "epoch": 2.798327216864385, + "grad_norm": 0.375796386857593, + "learning_rate": 3.727238239757208e-06, + "loss": 0.2706, + "step": 16395 + }, + { + "epoch": 2.799180677647862, + "grad_norm": 0.3432949267689113, + "learning_rate": 3.711431461810825e-06, + "loss": 0.2422, + "step": 16400 + }, + { + "epoch": 2.800034138431339, + "grad_norm": 0.34469894777136945, + "learning_rate": 3.6956246838644413e-06, + "loss": 0.256, + "step": 16405 + }, + { + "epoch": 2.800887599214816, + "grad_norm": 0.3848539159422196, + "learning_rate": 3.679817905918058e-06, + "loss": 0.2747, + "step": 16410 + }, + { + "epoch": 2.8017410599982933, + "grad_norm": 0.3604161648728695, + "learning_rate": 3.664011127971675e-06, + "loss": 0.2712, + "step": 16415 + }, + { + "epoch": 2.80259452078177, + "grad_norm": 0.33661256175350984, + "learning_rate": 3.648204350025291e-06, + "loss": 0.2836, + "step": 16420 + }, + { + "epoch": 2.803447981565247, + "grad_norm": 0.3746634874315744, + "learning_rate": 3.6323975720789077e-06, + "loss": 0.2765, + "step": 16425 + }, + { + "epoch": 2.804301442348724, + "grad_norm": 0.3412400860143434, + "learning_rate": 3.616590794132524e-06, + "loss": 0.2618, + "step": 16430 + }, + { + "epoch": 2.805154903132201, + "grad_norm": 0.4037360350688074, + "learning_rate": 3.600784016186141e-06, + "loss": 0.2663, + "step": 16435 + }, + { + "epoch": 2.8060083639156783, + "grad_norm": 0.36083859316854466, + "learning_rate": 3.5849772382397575e-06, + "loss": 0.2419, + "step": 16440 + }, + { + "epoch": 2.8068618246991552, + "grad_norm": 0.31857929374958394, + "learning_rate": 3.5691704602933737e-06, + "loss": 0.2917, + "step": 16445 + }, + { + "epoch": 2.807715285482632, + "grad_norm": 0.345950627257534, + "learning_rate": 3.5533636823469903e-06, + "loss": 0.2633, + "step": 16450 + }, + { + "epoch": 2.808568746266109, + "grad_norm": 0.33337471948836517, + "learning_rate": 3.5375569044006074e-06, + "loss": 0.2529, + "step": 16455 + }, + { + "epoch": 2.809422207049586, + "grad_norm": 0.33150891064007126, + "learning_rate": 3.5217501264542236e-06, + "loss": 0.261, + "step": 16460 + }, + { + "epoch": 2.810275667833063, + "grad_norm": 0.34937316105036914, + "learning_rate": 3.50594334850784e-06, + "loss": 0.2729, + "step": 16465 + }, + { + "epoch": 2.81112912861654, + "grad_norm": 0.36289531075289244, + "learning_rate": 3.490136570561457e-06, + "loss": 0.2439, + "step": 16470 + }, + { + "epoch": 2.811982589400017, + "grad_norm": 0.2887422562013696, + "learning_rate": 3.4743297926150734e-06, + "loss": 0.2507, + "step": 16475 + }, + { + "epoch": 2.812836050183494, + "grad_norm": 0.34289352944168966, + "learning_rate": 3.45852301466869e-06, + "loss": 0.253, + "step": 16480 + }, + { + "epoch": 2.813689510966971, + "grad_norm": 0.30363644256498634, + "learning_rate": 3.442716236722307e-06, + "loss": 0.2637, + "step": 16485 + }, + { + "epoch": 2.814542971750448, + "grad_norm": 0.34353385014258925, + "learning_rate": 3.4269094587759232e-06, + "loss": 0.277, + "step": 16490 + }, + { + "epoch": 2.815396432533925, + "grad_norm": 0.38427960033906927, + "learning_rate": 3.41110268082954e-06, + "loss": 0.2897, + "step": 16495 + }, + { + "epoch": 2.8162498933174023, + "grad_norm": 0.3616312249215327, + "learning_rate": 3.395295902883156e-06, + "loss": 0.2602, + "step": 16500 + }, + { + "epoch": 2.817103354100879, + "grad_norm": 0.37667058584403595, + "learning_rate": 3.379489124936773e-06, + "loss": 0.2548, + "step": 16505 + }, + { + "epoch": 2.817956814884356, + "grad_norm": 0.32923855347540354, + "learning_rate": 3.3636823469903897e-06, + "loss": 0.2705, + "step": 16510 + }, + { + "epoch": 2.818810275667833, + "grad_norm": 0.3135200057575021, + "learning_rate": 3.347875569044006e-06, + "loss": 0.253, + "step": 16515 + }, + { + "epoch": 2.81966373645131, + "grad_norm": 0.3142372830706972, + "learning_rate": 3.332068791097623e-06, + "loss": 0.2647, + "step": 16520 + }, + { + "epoch": 2.8205171972347873, + "grad_norm": 0.5160670686122224, + "learning_rate": 3.3162620131512395e-06, + "loss": 0.2705, + "step": 16525 + }, + { + "epoch": 2.821370658018264, + "grad_norm": 0.3462859455541581, + "learning_rate": 3.3004552352048557e-06, + "loss": 0.2583, + "step": 16530 + }, + { + "epoch": 2.822224118801741, + "grad_norm": 0.3189931770031935, + "learning_rate": 3.2846484572584727e-06, + "loss": 0.2486, + "step": 16535 + }, + { + "epoch": 2.823077579585218, + "grad_norm": 0.3672142254284251, + "learning_rate": 3.2688416793120893e-06, + "loss": 0.2425, + "step": 16540 + }, + { + "epoch": 2.823931040368695, + "grad_norm": 0.32596923087577295, + "learning_rate": 3.2530349013657055e-06, + "loss": 0.2783, + "step": 16545 + }, + { + "epoch": 2.824784501152172, + "grad_norm": 0.3467893622117871, + "learning_rate": 3.2372281234193226e-06, + "loss": 0.2707, + "step": 16550 + }, + { + "epoch": 2.825637961935649, + "grad_norm": 0.3396071877068457, + "learning_rate": 3.2214213454729387e-06, + "loss": 0.2367, + "step": 16555 + }, + { + "epoch": 2.8264914227191262, + "grad_norm": 0.3294933065611471, + "learning_rate": 3.2056145675265554e-06, + "loss": 0.2577, + "step": 16560 + }, + { + "epoch": 2.827344883502603, + "grad_norm": 0.32399162837887974, + "learning_rate": 3.1898077895801724e-06, + "loss": 0.2442, + "step": 16565 + }, + { + "epoch": 2.82819834428608, + "grad_norm": 0.3586333716102596, + "learning_rate": 3.1740010116337886e-06, + "loss": 0.2592, + "step": 16570 + }, + { + "epoch": 2.829051805069557, + "grad_norm": 0.3454188547978276, + "learning_rate": 3.158194233687405e-06, + "loss": 0.2542, + "step": 16575 + }, + { + "epoch": 2.829905265853034, + "grad_norm": 0.3560102356813727, + "learning_rate": 3.1423874557410222e-06, + "loss": 0.2559, + "step": 16580 + }, + { + "epoch": 2.8307587266365113, + "grad_norm": 0.3033730297464928, + "learning_rate": 3.1265806777946384e-06, + "loss": 0.2606, + "step": 16585 + }, + { + "epoch": 2.831612187419988, + "grad_norm": 0.3531667605024932, + "learning_rate": 3.110773899848255e-06, + "loss": 0.258, + "step": 16590 + }, + { + "epoch": 2.832465648203465, + "grad_norm": 0.3052730678155858, + "learning_rate": 3.0949671219018716e-06, + "loss": 0.2556, + "step": 16595 + }, + { + "epoch": 2.833319108986942, + "grad_norm": 0.43194185162288795, + "learning_rate": 3.0791603439554882e-06, + "loss": 0.251, + "step": 16600 + }, + { + "epoch": 2.834172569770419, + "grad_norm": 0.39939057437506964, + "learning_rate": 3.063353566009105e-06, + "loss": 0.2577, + "step": 16605 + }, + { + "epoch": 2.835026030553896, + "grad_norm": 0.36369109290470947, + "learning_rate": 3.0475467880627215e-06, + "loss": 0.2647, + "step": 16610 + }, + { + "epoch": 2.835879491337373, + "grad_norm": 0.38320813822635386, + "learning_rate": 3.031740010116338e-06, + "loss": 0.2514, + "step": 16615 + }, + { + "epoch": 2.83673295212085, + "grad_norm": 0.323973442141843, + "learning_rate": 3.0159332321699547e-06, + "loss": 0.2558, + "step": 16620 + }, + { + "epoch": 2.837586412904327, + "grad_norm": 0.4320919774217402, + "learning_rate": 3.0001264542235713e-06, + "loss": 0.2424, + "step": 16625 + }, + { + "epoch": 2.838439873687804, + "grad_norm": 0.32919915583200104, + "learning_rate": 2.984319676277188e-06, + "loss": 0.2478, + "step": 16630 + }, + { + "epoch": 2.839293334471281, + "grad_norm": 0.3450598144167833, + "learning_rate": 2.968512898330804e-06, + "loss": 0.2364, + "step": 16635 + }, + { + "epoch": 2.840146795254758, + "grad_norm": 0.35180820216469266, + "learning_rate": 2.952706120384421e-06, + "loss": 0.2591, + "step": 16640 + }, + { + "epoch": 2.8410002560382352, + "grad_norm": 0.37189345161365905, + "learning_rate": 2.9368993424380377e-06, + "loss": 0.2577, + "step": 16645 + }, + { + "epoch": 2.841853716821712, + "grad_norm": 0.40593001371536785, + "learning_rate": 2.921092564491654e-06, + "loss": 0.2836, + "step": 16650 + }, + { + "epoch": 2.842707177605189, + "grad_norm": 0.4050608018511317, + "learning_rate": 2.905285786545271e-06, + "loss": 0.2838, + "step": 16655 + }, + { + "epoch": 2.843560638388666, + "grad_norm": 0.30154619102257724, + "learning_rate": 2.8894790085988876e-06, + "loss": 0.2705, + "step": 16660 + }, + { + "epoch": 2.844414099172143, + "grad_norm": 0.4456602608003933, + "learning_rate": 2.8736722306525038e-06, + "loss": 0.2516, + "step": 16665 + }, + { + "epoch": 2.8452675599556203, + "grad_norm": 0.3489930788552161, + "learning_rate": 2.8578654527061204e-06, + "loss": 0.2623, + "step": 16670 + }, + { + "epoch": 2.846121020739097, + "grad_norm": 0.4369218078659227, + "learning_rate": 2.8420586747597374e-06, + "loss": 0.2539, + "step": 16675 + }, + { + "epoch": 2.846974481522574, + "grad_norm": 0.3779573979887877, + "learning_rate": 2.8262518968133536e-06, + "loss": 0.271, + "step": 16680 + }, + { + "epoch": 2.847827942306051, + "grad_norm": 0.33284706908380346, + "learning_rate": 2.81044511886697e-06, + "loss": 0.246, + "step": 16685 + }, + { + "epoch": 2.848681403089528, + "grad_norm": 0.3510187464918839, + "learning_rate": 2.7946383409205872e-06, + "loss": 0.2645, + "step": 16690 + }, + { + "epoch": 2.849534863873005, + "grad_norm": 0.35695375118711864, + "learning_rate": 2.7788315629742034e-06, + "loss": 0.2436, + "step": 16695 + }, + { + "epoch": 2.850388324656482, + "grad_norm": 0.31463851753808714, + "learning_rate": 2.76302478502782e-06, + "loss": 0.263, + "step": 16700 + }, + { + "epoch": 2.851241785439959, + "grad_norm": 0.3171378429621654, + "learning_rate": 2.7472180070814366e-06, + "loss": 0.2733, + "step": 16705 + }, + { + "epoch": 2.852095246223436, + "grad_norm": 0.36093161149175196, + "learning_rate": 2.7314112291350532e-06, + "loss": 0.2594, + "step": 16710 + }, + { + "epoch": 2.852948707006913, + "grad_norm": 0.326078521195147, + "learning_rate": 2.71560445118867e-06, + "loss": 0.2743, + "step": 16715 + }, + { + "epoch": 2.85380216779039, + "grad_norm": 0.3291895030132031, + "learning_rate": 2.6997976732422865e-06, + "loss": 0.2506, + "step": 16720 + }, + { + "epoch": 2.854655628573867, + "grad_norm": 0.31928657109893294, + "learning_rate": 2.683990895295903e-06, + "loss": 0.2527, + "step": 16725 + }, + { + "epoch": 2.8555090893573443, + "grad_norm": 0.3426980151395178, + "learning_rate": 2.6681841173495197e-06, + "loss": 0.2786, + "step": 16730 + }, + { + "epoch": 2.856362550140821, + "grad_norm": 0.3754166980773152, + "learning_rate": 2.6523773394031363e-06, + "loss": 0.2655, + "step": 16735 + }, + { + "epoch": 2.857216010924298, + "grad_norm": 0.3203125267306217, + "learning_rate": 2.6365705614567525e-06, + "loss": 0.2729, + "step": 16740 + }, + { + "epoch": 2.858069471707775, + "grad_norm": 0.3178226957464267, + "learning_rate": 2.6207637835103695e-06, + "loss": 0.281, + "step": 16745 + }, + { + "epoch": 2.858922932491252, + "grad_norm": 0.659502083111794, + "learning_rate": 2.604957005563986e-06, + "loss": 0.2517, + "step": 16750 + }, + { + "epoch": 2.859776393274729, + "grad_norm": 0.331750346845881, + "learning_rate": 2.5891502276176023e-06, + "loss": 0.2807, + "step": 16755 + }, + { + "epoch": 2.860629854058206, + "grad_norm": 0.37287672831585017, + "learning_rate": 2.5733434496712194e-06, + "loss": 0.2557, + "step": 16760 + }, + { + "epoch": 2.861483314841683, + "grad_norm": 0.3048070529596822, + "learning_rate": 2.557536671724836e-06, + "loss": 0.3214, + "step": 16765 + }, + { + "epoch": 2.86233677562516, + "grad_norm": 0.36514272157012023, + "learning_rate": 2.541729893778452e-06, + "loss": 0.2559, + "step": 16770 + }, + { + "epoch": 2.863190236408637, + "grad_norm": 0.31132155666325784, + "learning_rate": 2.5259231158320688e-06, + "loss": 0.2613, + "step": 16775 + }, + { + "epoch": 2.864043697192114, + "grad_norm": 0.34363123991986055, + "learning_rate": 2.510116337885686e-06, + "loss": 0.2509, + "step": 16780 + }, + { + "epoch": 2.864897157975591, + "grad_norm": 0.3512546391917189, + "learning_rate": 2.494309559939302e-06, + "loss": 0.2804, + "step": 16785 + }, + { + "epoch": 2.8657506187590682, + "grad_norm": 0.42972027282074027, + "learning_rate": 2.4785027819929186e-06, + "loss": 0.2461, + "step": 16790 + }, + { + "epoch": 2.866604079542545, + "grad_norm": 0.4125362084616382, + "learning_rate": 2.4626960040465356e-06, + "loss": 0.2688, + "step": 16795 + }, + { + "epoch": 2.867457540326022, + "grad_norm": 0.36012044835888873, + "learning_rate": 2.446889226100152e-06, + "loss": 0.2803, + "step": 16800 + }, + { + "epoch": 2.868311001109499, + "grad_norm": 0.316236201957246, + "learning_rate": 2.4310824481537684e-06, + "loss": 0.2704, + "step": 16805 + }, + { + "epoch": 2.869164461892976, + "grad_norm": 0.33919948307445347, + "learning_rate": 2.415275670207385e-06, + "loss": 0.2616, + "step": 16810 + }, + { + "epoch": 2.8700179226764533, + "grad_norm": 0.3961019397081436, + "learning_rate": 2.3994688922610016e-06, + "loss": 0.2572, + "step": 16815 + }, + { + "epoch": 2.8708713834599298, + "grad_norm": 0.3383384992169964, + "learning_rate": 2.3836621143146183e-06, + "loss": 0.2871, + "step": 16820 + }, + { + "epoch": 2.871724844243407, + "grad_norm": 0.3522245154276703, + "learning_rate": 2.367855336368235e-06, + "loss": 0.2704, + "step": 16825 + }, + { + "epoch": 2.872578305026884, + "grad_norm": 0.36351140878303867, + "learning_rate": 2.3520485584218515e-06, + "loss": 0.2625, + "step": 16830 + }, + { + "epoch": 2.873431765810361, + "grad_norm": 0.35581151645056946, + "learning_rate": 2.336241780475468e-06, + "loss": 0.2558, + "step": 16835 + }, + { + "epoch": 2.874285226593838, + "grad_norm": 0.39000427984634917, + "learning_rate": 2.3204350025290847e-06, + "loss": 0.2587, + "step": 16840 + }, + { + "epoch": 2.875138687377315, + "grad_norm": 0.3493985030961467, + "learning_rate": 2.304628224582701e-06, + "loss": 0.2688, + "step": 16845 + }, + { + "epoch": 2.875992148160792, + "grad_norm": 0.36280522890102784, + "learning_rate": 2.288821446636318e-06, + "loss": 0.2424, + "step": 16850 + }, + { + "epoch": 2.876845608944269, + "grad_norm": 0.3612496948579779, + "learning_rate": 2.2730146686899345e-06, + "loss": 0.2569, + "step": 16855 + }, + { + "epoch": 2.877699069727746, + "grad_norm": 0.33287369626609126, + "learning_rate": 2.2572078907435507e-06, + "loss": 0.27, + "step": 16860 + }, + { + "epoch": 2.878552530511223, + "grad_norm": 0.3196880230200889, + "learning_rate": 2.2414011127971678e-06, + "loss": 0.2624, + "step": 16865 + }, + { + "epoch": 2.8794059912947, + "grad_norm": 0.34624656208866267, + "learning_rate": 2.2255943348507844e-06, + "loss": 0.2607, + "step": 16870 + }, + { + "epoch": 2.8802594520781772, + "grad_norm": 0.34565250818861276, + "learning_rate": 2.2097875569044005e-06, + "loss": 0.2632, + "step": 16875 + }, + { + "epoch": 2.881112912861654, + "grad_norm": 0.3560040622984322, + "learning_rate": 2.193980778958017e-06, + "loss": 0.2438, + "step": 16880 + }, + { + "epoch": 2.881966373645131, + "grad_norm": 0.3001588959553635, + "learning_rate": 2.178174001011634e-06, + "loss": 0.2578, + "step": 16885 + }, + { + "epoch": 2.882819834428608, + "grad_norm": 0.3456646370534301, + "learning_rate": 2.1623672230652504e-06, + "loss": 0.2495, + "step": 16890 + }, + { + "epoch": 2.883673295212085, + "grad_norm": 0.3647852415506097, + "learning_rate": 2.146560445118867e-06, + "loss": 0.2531, + "step": 16895 + }, + { + "epoch": 2.884526755995562, + "grad_norm": 0.36332599424420253, + "learning_rate": 2.1307536671724836e-06, + "loss": 0.2629, + "step": 16900 + }, + { + "epoch": 2.8853802167790388, + "grad_norm": 0.3608278828622979, + "learning_rate": 2.1149468892261002e-06, + "loss": 0.24, + "step": 16905 + }, + { + "epoch": 2.886233677562516, + "grad_norm": 0.35888182245405303, + "learning_rate": 2.099140111279717e-06, + "loss": 0.271, + "step": 16910 + }, + { + "epoch": 2.887087138345993, + "grad_norm": 0.3297762540169189, + "learning_rate": 2.0833333333333334e-06, + "loss": 0.2596, + "step": 16915 + }, + { + "epoch": 2.88794059912947, + "grad_norm": 0.2938186937645521, + "learning_rate": 2.06752655538695e-06, + "loss": 0.2693, + "step": 16920 + }, + { + "epoch": 2.888794059912947, + "grad_norm": 0.34128051267106463, + "learning_rate": 2.0517197774405667e-06, + "loss": 0.259, + "step": 16925 + }, + { + "epoch": 2.889647520696424, + "grad_norm": 0.34065879725858617, + "learning_rate": 2.0359129994941833e-06, + "loss": 0.2574, + "step": 16930 + }, + { + "epoch": 2.890500981479901, + "grad_norm": 0.2959496947932216, + "learning_rate": 2.0201062215477995e-06, + "loss": 0.2538, + "step": 16935 + }, + { + "epoch": 2.891354442263378, + "grad_norm": 0.3368931588753245, + "learning_rate": 2.0042994436014165e-06, + "loss": 0.2465, + "step": 16940 + }, + { + "epoch": 2.892207903046855, + "grad_norm": 0.3385532028442737, + "learning_rate": 1.988492665655033e-06, + "loss": 0.2625, + "step": 16945 + }, + { + "epoch": 2.893061363830332, + "grad_norm": 0.3209120610716774, + "learning_rate": 1.9726858877086493e-06, + "loss": 0.2509, + "step": 16950 + }, + { + "epoch": 2.893914824613809, + "grad_norm": 0.36205688214370463, + "learning_rate": 1.9568791097622663e-06, + "loss": 0.2636, + "step": 16955 + }, + { + "epoch": 2.8947682853972863, + "grad_norm": 0.350272241148083, + "learning_rate": 1.941072331815883e-06, + "loss": 0.2445, + "step": 16960 + }, + { + "epoch": 2.8956217461807627, + "grad_norm": 0.3831033921951975, + "learning_rate": 1.925265553869499e-06, + "loss": 0.2839, + "step": 16965 + }, + { + "epoch": 2.89647520696424, + "grad_norm": 0.366810305348369, + "learning_rate": 1.9094587759231157e-06, + "loss": 0.2661, + "step": 16970 + }, + { + "epoch": 2.897328667747717, + "grad_norm": 0.4126959808151922, + "learning_rate": 1.8936519979767325e-06, + "loss": 0.253, + "step": 16975 + }, + { + "epoch": 2.898182128531194, + "grad_norm": 0.3442295663869524, + "learning_rate": 1.8778452200303492e-06, + "loss": 0.2575, + "step": 16980 + }, + { + "epoch": 2.899035589314671, + "grad_norm": 0.3357700060647457, + "learning_rate": 1.8620384420839656e-06, + "loss": 0.2742, + "step": 16985 + }, + { + "epoch": 2.899889050098148, + "grad_norm": 0.3563811992415023, + "learning_rate": 1.8462316641375824e-06, + "loss": 0.2613, + "step": 16990 + }, + { + "epoch": 2.900742510881625, + "grad_norm": 0.33392508061801196, + "learning_rate": 1.830424886191199e-06, + "loss": 0.2704, + "step": 16995 + }, + { + "epoch": 2.901595971665102, + "grad_norm": 0.3498820219330395, + "learning_rate": 1.8146181082448154e-06, + "loss": 0.2575, + "step": 17000 + }, + { + "epoch": 2.902449432448579, + "grad_norm": 0.39194232479705393, + "learning_rate": 1.798811330298432e-06, + "loss": 0.2553, + "step": 17005 + }, + { + "epoch": 2.903302893232056, + "grad_norm": 0.322665717495448, + "learning_rate": 1.7830045523520488e-06, + "loss": 0.2599, + "step": 17010 + }, + { + "epoch": 2.904156354015533, + "grad_norm": 0.3199064344376355, + "learning_rate": 1.7671977744056652e-06, + "loss": 0.2604, + "step": 17015 + }, + { + "epoch": 2.90500981479901, + "grad_norm": 0.33304783509978897, + "learning_rate": 1.7513909964592818e-06, + "loss": 0.288, + "step": 17020 + }, + { + "epoch": 2.905863275582487, + "grad_norm": 0.2988482159195373, + "learning_rate": 1.7355842185128987e-06, + "loss": 0.2516, + "step": 17025 + }, + { + "epoch": 2.906716736365964, + "grad_norm": 0.36098320106595283, + "learning_rate": 1.719777440566515e-06, + "loss": 0.2678, + "step": 17030 + }, + { + "epoch": 2.907570197149441, + "grad_norm": 0.3147338768925288, + "learning_rate": 1.7039706626201315e-06, + "loss": 0.2499, + "step": 17035 + }, + { + "epoch": 2.908423657932918, + "grad_norm": 0.3286458368796154, + "learning_rate": 1.688163884673748e-06, + "loss": 0.2542, + "step": 17040 + }, + { + "epoch": 2.909277118716395, + "grad_norm": 0.3300835746620113, + "learning_rate": 1.6723571067273649e-06, + "loss": 0.2674, + "step": 17045 + }, + { + "epoch": 2.9101305794998718, + "grad_norm": 0.4587706915837577, + "learning_rate": 1.6565503287809813e-06, + "loss": 0.2483, + "step": 17050 + }, + { + "epoch": 2.910984040283349, + "grad_norm": 0.3182933111004528, + "learning_rate": 1.640743550834598e-06, + "loss": 0.2463, + "step": 17055 + }, + { + "epoch": 2.911837501066826, + "grad_norm": 0.32706183790213167, + "learning_rate": 1.6249367728882147e-06, + "loss": 0.2671, + "step": 17060 + }, + { + "epoch": 2.912690961850303, + "grad_norm": 0.36289049183139876, + "learning_rate": 1.6091299949418311e-06, + "loss": 0.2708, + "step": 17065 + }, + { + "epoch": 2.91354442263378, + "grad_norm": 0.5199517659672486, + "learning_rate": 1.5933232169954477e-06, + "loss": 0.2502, + "step": 17070 + }, + { + "epoch": 2.914397883417257, + "grad_norm": 0.3400098933724761, + "learning_rate": 1.5775164390490641e-06, + "loss": 0.2827, + "step": 17075 + }, + { + "epoch": 2.915251344200734, + "grad_norm": 0.42551756690623826, + "learning_rate": 1.561709661102681e-06, + "loss": 0.2649, + "step": 17080 + }, + { + "epoch": 2.916104804984211, + "grad_norm": 0.3257057211231263, + "learning_rate": 1.5459028831562976e-06, + "loss": 0.2587, + "step": 17085 + }, + { + "epoch": 2.916958265767688, + "grad_norm": 2.89360211234992, + "learning_rate": 1.530096105209914e-06, + "loss": 0.2646, + "step": 17090 + }, + { + "epoch": 2.917811726551165, + "grad_norm": 0.3178695407416205, + "learning_rate": 1.5142893272635306e-06, + "loss": 0.266, + "step": 17095 + }, + { + "epoch": 2.918665187334642, + "grad_norm": 0.35538425322722705, + "learning_rate": 1.4984825493171474e-06, + "loss": 0.2762, + "step": 17100 + }, + { + "epoch": 2.9195186481181192, + "grad_norm": 0.32448846647232016, + "learning_rate": 1.4826757713707638e-06, + "loss": 0.2418, + "step": 17105 + }, + { + "epoch": 2.9203721089015957, + "grad_norm": 0.32022695751267866, + "learning_rate": 1.4668689934243804e-06, + "loss": 0.2519, + "step": 17110 + }, + { + "epoch": 2.921225569685073, + "grad_norm": 0.3735232904592393, + "learning_rate": 1.451062215477997e-06, + "loss": 0.2594, + "step": 17115 + }, + { + "epoch": 2.92207903046855, + "grad_norm": 0.2901142010923986, + "learning_rate": 1.4352554375316136e-06, + "loss": 0.2619, + "step": 17120 + }, + { + "epoch": 2.922932491252027, + "grad_norm": 0.33541596937195634, + "learning_rate": 1.4194486595852302e-06, + "loss": 0.2653, + "step": 17125 + }, + { + "epoch": 2.923785952035504, + "grad_norm": 0.3377068527301117, + "learning_rate": 1.4036418816388468e-06, + "loss": 0.2497, + "step": 17130 + }, + { + "epoch": 2.9246394128189808, + "grad_norm": 0.4479816694601586, + "learning_rate": 1.3878351036924635e-06, + "loss": 0.2608, + "step": 17135 + }, + { + "epoch": 2.925492873602458, + "grad_norm": 0.5782450043876725, + "learning_rate": 1.37202832574608e-06, + "loss": 0.25, + "step": 17140 + }, + { + "epoch": 2.926346334385935, + "grad_norm": 0.37424540658123895, + "learning_rate": 1.3562215477996967e-06, + "loss": 0.2582, + "step": 17145 + }, + { + "epoch": 2.927199795169412, + "grad_norm": 0.30976988672860206, + "learning_rate": 1.340414769853313e-06, + "loss": 0.2431, + "step": 17150 + }, + { + "epoch": 2.928053255952889, + "grad_norm": 0.4082064861242491, + "learning_rate": 1.3246079919069297e-06, + "loss": 0.2761, + "step": 17155 + }, + { + "epoch": 2.928906716736366, + "grad_norm": 0.3785968340688524, + "learning_rate": 1.3088012139605463e-06, + "loss": 0.2608, + "step": 17160 + }, + { + "epoch": 2.929760177519843, + "grad_norm": 0.35607863049253763, + "learning_rate": 1.292994436014163e-06, + "loss": 0.2339, + "step": 17165 + }, + { + "epoch": 2.93061363830332, + "grad_norm": 0.31075015355334745, + "learning_rate": 1.2771876580677795e-06, + "loss": 0.2698, + "step": 17170 + }, + { + "epoch": 2.931467099086797, + "grad_norm": 0.5132625909881907, + "learning_rate": 1.2613808801213961e-06, + "loss": 0.2538, + "step": 17175 + }, + { + "epoch": 2.932320559870274, + "grad_norm": 0.35694491064633377, + "learning_rate": 1.2455741021750127e-06, + "loss": 0.2597, + "step": 17180 + }, + { + "epoch": 2.933174020653751, + "grad_norm": 0.3568641308814207, + "learning_rate": 1.2297673242286293e-06, + "loss": 0.2628, + "step": 17185 + }, + { + "epoch": 2.934027481437228, + "grad_norm": 0.33791318392593883, + "learning_rate": 1.213960546282246e-06, + "loss": 0.2628, + "step": 17190 + }, + { + "epoch": 2.9348809422207047, + "grad_norm": 0.3405716955626368, + "learning_rate": 1.1981537683358624e-06, + "loss": 0.269, + "step": 17195 + }, + { + "epoch": 2.935734403004182, + "grad_norm": 0.346110735300808, + "learning_rate": 1.1823469903894792e-06, + "loss": 0.2488, + "step": 17200 + }, + { + "epoch": 2.936587863787659, + "grad_norm": 0.33703040230967274, + "learning_rate": 1.1665402124430956e-06, + "loss": 0.2538, + "step": 17205 + }, + { + "epoch": 2.937441324571136, + "grad_norm": 0.3777437358981184, + "learning_rate": 1.1507334344967122e-06, + "loss": 0.2559, + "step": 17210 + }, + { + "epoch": 2.938294785354613, + "grad_norm": 0.3288510930223717, + "learning_rate": 1.134926656550329e-06, + "loss": 0.2232, + "step": 17215 + }, + { + "epoch": 2.93914824613809, + "grad_norm": 0.39222128486985425, + "learning_rate": 1.1191198786039454e-06, + "loss": 0.2493, + "step": 17220 + }, + { + "epoch": 2.940001706921567, + "grad_norm": 0.3671423223654542, + "learning_rate": 1.103313100657562e-06, + "loss": 0.2552, + "step": 17225 + }, + { + "epoch": 2.940855167705044, + "grad_norm": 0.29071214402041584, + "learning_rate": 1.0875063227111786e-06, + "loss": 0.2609, + "step": 17230 + }, + { + "epoch": 2.941708628488521, + "grad_norm": 0.37206901536039744, + "learning_rate": 1.0716995447647952e-06, + "loss": 0.261, + "step": 17235 + }, + { + "epoch": 2.942562089271998, + "grad_norm": 0.33367800371886974, + "learning_rate": 1.0558927668184116e-06, + "loss": 0.2517, + "step": 17240 + }, + { + "epoch": 2.943415550055475, + "grad_norm": 0.4043754763601572, + "learning_rate": 1.0400859888720285e-06, + "loss": 0.236, + "step": 17245 + }, + { + "epoch": 2.944269010838952, + "grad_norm": 0.3134703397311146, + "learning_rate": 1.024279210925645e-06, + "loss": 0.2445, + "step": 17250 + }, + { + "epoch": 2.9451224716224287, + "grad_norm": 0.3158404085722153, + "learning_rate": 1.0084724329792615e-06, + "loss": 0.2694, + "step": 17255 + }, + { + "epoch": 2.945975932405906, + "grad_norm": 0.35844186338513584, + "learning_rate": 9.926656550328783e-07, + "loss": 0.2828, + "step": 17260 + }, + { + "epoch": 2.946829393189383, + "grad_norm": 0.36568115466039464, + "learning_rate": 9.768588770864947e-07, + "loss": 0.2555, + "step": 17265 + }, + { + "epoch": 2.94768285397286, + "grad_norm": 0.344447356357639, + "learning_rate": 9.610520991401113e-07, + "loss": 0.2666, + "step": 17270 + }, + { + "epoch": 2.948536314756337, + "grad_norm": 0.6918163596519813, + "learning_rate": 9.452453211937278e-07, + "loss": 0.2632, + "step": 17275 + }, + { + "epoch": 2.9493897755398137, + "grad_norm": 0.342203389149714, + "learning_rate": 9.294385432473445e-07, + "loss": 0.2548, + "step": 17280 + }, + { + "epoch": 2.950243236323291, + "grad_norm": 0.31059695350905575, + "learning_rate": 9.136317653009611e-07, + "loss": 0.262, + "step": 17285 + }, + { + "epoch": 2.951096697106768, + "grad_norm": 0.3588886087799026, + "learning_rate": 8.978249873545776e-07, + "loss": 0.2443, + "step": 17290 + }, + { + "epoch": 2.951950157890245, + "grad_norm": 0.3725143712962683, + "learning_rate": 8.820182094081944e-07, + "loss": 0.2575, + "step": 17295 + }, + { + "epoch": 2.952803618673722, + "grad_norm": 0.33495674773065304, + "learning_rate": 8.662114314618109e-07, + "loss": 0.2647, + "step": 17300 + }, + { + "epoch": 2.953657079457199, + "grad_norm": 0.32461708186233, + "learning_rate": 8.504046535154275e-07, + "loss": 0.2672, + "step": 17305 + }, + { + "epoch": 2.954510540240676, + "grad_norm": 0.35713058747720877, + "learning_rate": 8.34597875569044e-07, + "loss": 0.2515, + "step": 17310 + }, + { + "epoch": 2.955364001024153, + "grad_norm": 0.3030379252571698, + "learning_rate": 8.187910976226607e-07, + "loss": 0.2534, + "step": 17315 + }, + { + "epoch": 2.95621746180763, + "grad_norm": 0.36601393888480166, + "learning_rate": 8.029843196762771e-07, + "loss": 0.2649, + "step": 17320 + }, + { + "epoch": 2.957070922591107, + "grad_norm": 0.390598927269396, + "learning_rate": 7.871775417298938e-07, + "loss": 0.2526, + "step": 17325 + }, + { + "epoch": 2.957924383374584, + "grad_norm": 0.3233437749380773, + "learning_rate": 7.713707637835104e-07, + "loss": 0.2621, + "step": 17330 + }, + { + "epoch": 2.9587778441580608, + "grad_norm": 0.3719088518172723, + "learning_rate": 7.555639858371269e-07, + "loss": 0.2597, + "step": 17335 + }, + { + "epoch": 2.9596313049415377, + "grad_norm": 0.35190840373585636, + "learning_rate": 7.397572078907435e-07, + "loss": 0.2503, + "step": 17340 + }, + { + "epoch": 2.960484765725015, + "grad_norm": 0.3542481971089052, + "learning_rate": 7.239504299443603e-07, + "loss": 0.2276, + "step": 17345 + }, + { + "epoch": 2.961338226508492, + "grad_norm": 0.3634321686781663, + "learning_rate": 7.081436519979768e-07, + "loss": 0.2655, + "step": 17350 + }, + { + "epoch": 2.962191687291969, + "grad_norm": 0.35586040496530635, + "learning_rate": 6.923368740515934e-07, + "loss": 0.2702, + "step": 17355 + }, + { + "epoch": 2.963045148075446, + "grad_norm": 0.4031032163040328, + "learning_rate": 6.7653009610521e-07, + "loss": 0.2475, + "step": 17360 + }, + { + "epoch": 2.9638986088589228, + "grad_norm": 0.2984585847340631, + "learning_rate": 6.607233181588265e-07, + "loss": 0.2579, + "step": 17365 + }, + { + "epoch": 2.9647520696424, + "grad_norm": 0.3714213545869481, + "learning_rate": 6.449165402124431e-07, + "loss": 0.2617, + "step": 17370 + }, + { + "epoch": 2.965605530425877, + "grad_norm": 0.38048976333390644, + "learning_rate": 6.291097622660597e-07, + "loss": 0.2612, + "step": 17375 + }, + { + "epoch": 2.966458991209354, + "grad_norm": 0.3764636736515448, + "learning_rate": 6.133029843196763e-07, + "loss": 0.2476, + "step": 17380 + }, + { + "epoch": 2.967312451992831, + "grad_norm": 0.3209064073897257, + "learning_rate": 5.974962063732929e-07, + "loss": 0.2663, + "step": 17385 + }, + { + "epoch": 2.968165912776308, + "grad_norm": 0.3659162352363195, + "learning_rate": 5.816894284269095e-07, + "loss": 0.2647, + "step": 17390 + }, + { + "epoch": 2.969019373559785, + "grad_norm": 0.33640959187301384, + "learning_rate": 5.65882650480526e-07, + "loss": 0.2714, + "step": 17395 + }, + { + "epoch": 2.9698728343432617, + "grad_norm": 0.32997317618874944, + "learning_rate": 5.500758725341426e-07, + "loss": 0.2637, + "step": 17400 + }, + { + "epoch": 2.970726295126739, + "grad_norm": 0.367167190417402, + "learning_rate": 5.342690945877593e-07, + "loss": 0.2638, + "step": 17405 + }, + { + "epoch": 2.971579755910216, + "grad_norm": 0.31105922748958315, + "learning_rate": 5.184623166413759e-07, + "loss": 0.2589, + "step": 17410 + }, + { + "epoch": 2.972433216693693, + "grad_norm": 0.32146559627153726, + "learning_rate": 5.026555386949924e-07, + "loss": 0.2556, + "step": 17415 + }, + { + "epoch": 2.97328667747717, + "grad_norm": 0.35793847937600215, + "learning_rate": 4.868487607486091e-07, + "loss": 0.263, + "step": 17420 + }, + { + "epoch": 2.9741401382606467, + "grad_norm": 0.35914293332697467, + "learning_rate": 4.7104198280222565e-07, + "loss": 0.2454, + "step": 17425 + }, + { + "epoch": 2.974993599044124, + "grad_norm": 0.2887747176193576, + "learning_rate": 4.552352048558422e-07, + "loss": 0.2372, + "step": 17430 + }, + { + "epoch": 2.975847059827601, + "grad_norm": 0.35855886226897943, + "learning_rate": 4.394284269094588e-07, + "loss": 0.2507, + "step": 17435 + }, + { + "epoch": 2.976700520611078, + "grad_norm": 0.34818857261223235, + "learning_rate": 4.236216489630754e-07, + "loss": 0.2767, + "step": 17440 + }, + { + "epoch": 2.977553981394555, + "grad_norm": 0.2878549501916922, + "learning_rate": 4.0781487101669193e-07, + "loss": 0.2726, + "step": 17445 + }, + { + "epoch": 2.9784074421780318, + "grad_norm": 0.3111344748617036, + "learning_rate": 3.9200809307030854e-07, + "loss": 0.2559, + "step": 17450 + }, + { + "epoch": 2.979260902961509, + "grad_norm": 0.3719374434219703, + "learning_rate": 3.7620131512392515e-07, + "loss": 0.2605, + "step": 17455 + }, + { + "epoch": 2.980114363744986, + "grad_norm": 0.3466211071811781, + "learning_rate": 3.603945371775417e-07, + "loss": 0.2701, + "step": 17460 + }, + { + "epoch": 2.980967824528463, + "grad_norm": 0.3734051181873891, + "learning_rate": 3.445877592311583e-07, + "loss": 0.2702, + "step": 17465 + }, + { + "epoch": 2.98182128531194, + "grad_norm": 0.35577982396358515, + "learning_rate": 3.2878098128477493e-07, + "loss": 0.2412, + "step": 17470 + }, + { + "epoch": 2.982674746095417, + "grad_norm": 0.3250053297942569, + "learning_rate": 3.129742033383915e-07, + "loss": 0.2798, + "step": 17475 + }, + { + "epoch": 2.9835282068788938, + "grad_norm": 0.2980448644295891, + "learning_rate": 2.971674253920081e-07, + "loss": 0.2579, + "step": 17480 + }, + { + "epoch": 2.9843816676623707, + "grad_norm": 0.308352770806981, + "learning_rate": 2.813606474456247e-07, + "loss": 0.2782, + "step": 17485 + }, + { + "epoch": 2.985235128445848, + "grad_norm": 0.3503886222854535, + "learning_rate": 2.6555386949924127e-07, + "loss": 0.259, + "step": 17490 + }, + { + "epoch": 2.986088589229325, + "grad_norm": 0.38014445587318535, + "learning_rate": 2.497470915528579e-07, + "loss": 0.2489, + "step": 17495 + }, + { + "epoch": 2.986942050012802, + "grad_norm": 0.34281879430471657, + "learning_rate": 2.3394031360647446e-07, + "loss": 0.2329, + "step": 17500 + }, + { + "epoch": 2.987795510796279, + "grad_norm": 0.3149722235884717, + "learning_rate": 2.1813353566009108e-07, + "loss": 0.2476, + "step": 17505 + }, + { + "epoch": 2.9886489715797557, + "grad_norm": 0.4067252040037474, + "learning_rate": 2.0232675771370766e-07, + "loss": 0.2771, + "step": 17510 + }, + { + "epoch": 2.989502432363233, + "grad_norm": 0.3822581744090084, + "learning_rate": 1.8651997976732424e-07, + "loss": 0.2473, + "step": 17515 + }, + { + "epoch": 2.99035589314671, + "grad_norm": 0.34724226273280534, + "learning_rate": 1.7071320182094083e-07, + "loss": 0.2642, + "step": 17520 + }, + { + "epoch": 2.991209353930187, + "grad_norm": 0.3194994143020438, + "learning_rate": 1.549064238745574e-07, + "loss": 0.2702, + "step": 17525 + }, + { + "epoch": 2.992062814713664, + "grad_norm": 0.31852349595999996, + "learning_rate": 1.3909964592817402e-07, + "loss": 0.2559, + "step": 17530 + }, + { + "epoch": 2.992916275497141, + "grad_norm": 0.32612990576667494, + "learning_rate": 1.2329286798179058e-07, + "loss": 0.2558, + "step": 17535 + }, + { + "epoch": 2.993769736280618, + "grad_norm": 0.3423943025137262, + "learning_rate": 1.0748609003540719e-07, + "loss": 0.2718, + "step": 17540 + }, + { + "epoch": 2.9946231970640946, + "grad_norm": 0.3751142492781964, + "learning_rate": 9.167931208902377e-08, + "loss": 0.244, + "step": 17545 + }, + { + "epoch": 2.995476657847572, + "grad_norm": 0.33765844355783725, + "learning_rate": 7.587253414264037e-08, + "loss": 0.2514, + "step": 17550 + }, + { + "epoch": 2.996330118631049, + "grad_norm": 0.34932096769176885, + "learning_rate": 6.006575619625696e-08, + "loss": 0.2481, + "step": 17555 + }, + { + "epoch": 2.997183579414526, + "grad_norm": 0.41118628511885635, + "learning_rate": 4.425897824987355e-08, + "loss": 0.2469, + "step": 17560 + }, + { + "epoch": 2.9980370401980028, + "grad_norm": 0.36482301233773756, + "learning_rate": 2.8452200303490138e-08, + "loss": 0.2756, + "step": 17565 + }, + { + "epoch": 2.9988905009814797, + "grad_norm": 0.323345446655115, + "learning_rate": 1.2645422357106729e-08, + "loss": 0.2363, + "step": 17570 + }, + { + "epoch": 2.9995732696082618, + "step": 17574, + "total_flos": 2.7506794053018583e+18, + "train_loss": 0.39039740704073767, + "train_runtime": 61759.1154, + "train_samples_per_second": 4.553, + "train_steps_per_second": 0.285 + } + ], + "logging_steps": 5, + "max_steps": 17574, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.7506794053018583e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}