{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9995732696082618, "eval_steps": 500, "global_step": 17574, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008534607834769992, "grad_norm": 65.82237917399702, "learning_rate": 1.4220705346985212e-07, "loss": 6.2538, "step": 5 }, { "epoch": 0.0017069215669539984, "grad_norm": 61.78237310336742, "learning_rate": 2.8441410693970424e-07, "loss": 5.9214, "step": 10 }, { "epoch": 0.0025603823504309975, "grad_norm": 65.01408922354292, "learning_rate": 4.2662116040955633e-07, "loss": 6.4985, "step": 15 }, { "epoch": 0.003413843133907997, "grad_norm": 70.96972752867183, "learning_rate": 5.688282138794085e-07, "loss": 5.7209, "step": 20 }, { "epoch": 0.004267303917384996, "grad_norm": 62.61398347247295, "learning_rate": 7.110352673492606e-07, "loss": 5.3015, "step": 25 }, { "epoch": 0.005120764700861995, "grad_norm": 61.05604755149443, "learning_rate": 8.532423208191127e-07, "loss": 3.9051, "step": 30 }, { "epoch": 0.005974225484338995, "grad_norm": 26.310596990103257, "learning_rate": 9.954493742889649e-07, "loss": 3.2509, "step": 35 }, { "epoch": 0.006827686267815994, "grad_norm": 17.245039469465983, "learning_rate": 1.137656427758817e-06, "loss": 1.7815, "step": 40 }, { "epoch": 0.0076811470512929934, "grad_norm": 10.186583975751686, "learning_rate": 1.279863481228669e-06, "loss": 1.4517, "step": 45 }, { "epoch": 0.008534607834769992, "grad_norm": 2.6886555890072783, "learning_rate": 1.4220705346985211e-06, "loss": 1.1816, "step": 50 }, { "epoch": 0.009388068618246991, "grad_norm": 1.7396933206659158, "learning_rate": 1.5642775881683732e-06, "loss": 1.1061, "step": 55 }, { "epoch": 0.01024152940172399, "grad_norm": 1.490732509826844, "learning_rate": 1.7064846416382253e-06, "loss": 1.01, "step": 60 }, { "epoch": 0.01109499018520099, "grad_norm": 1.0571874520303985, "learning_rate": 1.8486916951080774e-06, "loss": 0.903, "step": 65 }, { "epoch": 0.01194845096867799, "grad_norm": 0.936175193413941, "learning_rate": 1.9908987485779297e-06, "loss": 0.8348, "step": 70 }, { "epoch": 0.012801911752154988, "grad_norm": 0.7727511908562947, "learning_rate": 2.133105802047782e-06, "loss": 0.773, "step": 75 }, { "epoch": 0.013655372535631987, "grad_norm": 0.6500228464874108, "learning_rate": 2.275312855517634e-06, "loss": 0.7484, "step": 80 }, { "epoch": 0.014508833319108986, "grad_norm": 0.6704169217628638, "learning_rate": 2.417519908987486e-06, "loss": 0.7212, "step": 85 }, { "epoch": 0.015362294102585987, "grad_norm": 0.5557635021907732, "learning_rate": 2.559726962457338e-06, "loss": 0.6766, "step": 90 }, { "epoch": 0.016215754886062986, "grad_norm": 0.6433223276039268, "learning_rate": 2.7019340159271904e-06, "loss": 0.6904, "step": 95 }, { "epoch": 0.017069215669539985, "grad_norm": 0.6955316631183114, "learning_rate": 2.8441410693970423e-06, "loss": 0.6581, "step": 100 }, { "epoch": 0.017922676453016984, "grad_norm": 0.5642898802303687, "learning_rate": 2.9863481228668946e-06, "loss": 0.6315, "step": 105 }, { "epoch": 0.018776137236493982, "grad_norm": 0.5439896608168134, "learning_rate": 3.1285551763367464e-06, "loss": 0.625, "step": 110 }, { "epoch": 0.01962959801997098, "grad_norm": 0.7376593292566979, "learning_rate": 3.2707622298065988e-06, "loss": 0.6254, "step": 115 }, { "epoch": 0.02048305880344798, "grad_norm": 0.6523728079456945, "learning_rate": 3.4129692832764506e-06, "loss": 0.5805, "step": 120 }, { "epoch": 0.02133651958692498, "grad_norm": 0.5659985560752062, "learning_rate": 3.5551763367463025e-06, "loss": 0.6157, "step": 125 }, { "epoch": 0.02218998037040198, "grad_norm": 0.6177114762693549, "learning_rate": 3.697383390216155e-06, "loss": 0.5986, "step": 130 }, { "epoch": 0.02304344115387898, "grad_norm": 0.5470041535776172, "learning_rate": 3.8395904436860075e-06, "loss": 0.6083, "step": 135 }, { "epoch": 0.02389690193735598, "grad_norm": 0.5108351443295549, "learning_rate": 3.981797497155859e-06, "loss": 0.5847, "step": 140 }, { "epoch": 0.024750362720832978, "grad_norm": 0.6278433297890169, "learning_rate": 4.124004550625711e-06, "loss": 0.5832, "step": 145 }, { "epoch": 0.025603823504309977, "grad_norm": 0.5628852671548289, "learning_rate": 4.266211604095564e-06, "loss": 0.5596, "step": 150 }, { "epoch": 0.026457284287786976, "grad_norm": 0.5910396106383523, "learning_rate": 4.408418657565416e-06, "loss": 0.5631, "step": 155 }, { "epoch": 0.027310745071263975, "grad_norm": 0.6016195740875766, "learning_rate": 4.550625711035268e-06, "loss": 0.5797, "step": 160 }, { "epoch": 0.028164205854740974, "grad_norm": 0.6160786427351045, "learning_rate": 4.69283276450512e-06, "loss": 0.5746, "step": 165 }, { "epoch": 0.029017666638217973, "grad_norm": 0.5245421852675025, "learning_rate": 4.835039817974972e-06, "loss": 0.5753, "step": 170 }, { "epoch": 0.02987112742169497, "grad_norm": 0.5222240064983131, "learning_rate": 4.977246871444824e-06, "loss": 0.5623, "step": 175 }, { "epoch": 0.030724588205171974, "grad_norm": 0.5330659472283653, "learning_rate": 5.119453924914676e-06, "loss": 0.514, "step": 180 }, { "epoch": 0.03157804898864897, "grad_norm": 0.565273447908288, "learning_rate": 5.261660978384528e-06, "loss": 0.5436, "step": 185 }, { "epoch": 0.03243150977212597, "grad_norm": 0.5375616663952012, "learning_rate": 5.403868031854381e-06, "loss": 0.5521, "step": 190 }, { "epoch": 0.03328497055560297, "grad_norm": 0.6338543088009376, "learning_rate": 5.546075085324233e-06, "loss": 0.5526, "step": 195 }, { "epoch": 0.03413843133907997, "grad_norm": 0.6062572472605493, "learning_rate": 5.6882821387940845e-06, "loss": 0.5225, "step": 200 }, { "epoch": 0.03499189212255697, "grad_norm": 0.6060235605131771, "learning_rate": 5.830489192263936e-06, "loss": 0.5485, "step": 205 }, { "epoch": 0.03584535290603397, "grad_norm": 0.4757042223533204, "learning_rate": 5.972696245733789e-06, "loss": 0.5359, "step": 210 }, { "epoch": 0.036698813689510966, "grad_norm": 0.4943549334840461, "learning_rate": 6.114903299203641e-06, "loss": 0.5167, "step": 215 }, { "epoch": 0.037552274472987965, "grad_norm": 0.5746091986959267, "learning_rate": 6.257110352673493e-06, "loss": 0.507, "step": 220 }, { "epoch": 0.038405735256464964, "grad_norm": 0.5258001901741969, "learning_rate": 6.399317406143345e-06, "loss": 0.5192, "step": 225 }, { "epoch": 0.03925919603994196, "grad_norm": 0.5625156657980386, "learning_rate": 6.5415244596131975e-06, "loss": 0.526, "step": 230 }, { "epoch": 0.04011265682341896, "grad_norm": 0.561360178358828, "learning_rate": 6.683731513083049e-06, "loss": 0.5407, "step": 235 }, { "epoch": 0.04096611760689596, "grad_norm": 0.5933855326363557, "learning_rate": 6.825938566552901e-06, "loss": 0.5208, "step": 240 }, { "epoch": 0.04181957839037296, "grad_norm": 0.5602837864096152, "learning_rate": 6.968145620022753e-06, "loss": 0.5175, "step": 245 }, { "epoch": 0.04267303917384996, "grad_norm": 0.5315851502327856, "learning_rate": 7.110352673492605e-06, "loss": 0.5193, "step": 250 }, { "epoch": 0.043526499957326964, "grad_norm": 0.5070353123438208, "learning_rate": 7.252559726962458e-06, "loss": 0.5503, "step": 255 }, { "epoch": 0.04437996074080396, "grad_norm": 0.6551079876158198, "learning_rate": 7.39476678043231e-06, "loss": 0.5182, "step": 260 }, { "epoch": 0.04523342152428096, "grad_norm": 0.5083258873436571, "learning_rate": 7.536973833902162e-06, "loss": 0.5163, "step": 265 }, { "epoch": 0.04608688230775796, "grad_norm": 0.6002288466288257, "learning_rate": 7.679180887372015e-06, "loss": 0.5291, "step": 270 }, { "epoch": 0.04694034309123496, "grad_norm": 0.5490970715515444, "learning_rate": 7.821387940841867e-06, "loss": 0.5234, "step": 275 }, { "epoch": 0.04779380387471196, "grad_norm": 0.609404506863521, "learning_rate": 7.963594994311719e-06, "loss": 0.5113, "step": 280 }, { "epoch": 0.04864726465818896, "grad_norm": 0.5008987686586568, "learning_rate": 8.10580204778157e-06, "loss": 0.4721, "step": 285 }, { "epoch": 0.049500725441665956, "grad_norm": 0.5750927594832669, "learning_rate": 8.248009101251423e-06, "loss": 0.5145, "step": 290 }, { "epoch": 0.050354186225142955, "grad_norm": 0.5204918722045246, "learning_rate": 8.390216154721274e-06, "loss": 0.4846, "step": 295 }, { "epoch": 0.051207647008619954, "grad_norm": 0.5525466771495826, "learning_rate": 8.532423208191128e-06, "loss": 0.4852, "step": 300 }, { "epoch": 0.05206110779209695, "grad_norm": 0.5565066409255734, "learning_rate": 8.67463026166098e-06, "loss": 0.5166, "step": 305 }, { "epoch": 0.05291456857557395, "grad_norm": 0.5976250954897291, "learning_rate": 8.816837315130832e-06, "loss": 0.497, "step": 310 }, { "epoch": 0.05376802935905095, "grad_norm": 0.4962968708397676, "learning_rate": 8.959044368600684e-06, "loss": 0.4981, "step": 315 }, { "epoch": 0.05462149014252795, "grad_norm": 0.5235693924477097, "learning_rate": 9.101251422070536e-06, "loss": 0.5008, "step": 320 }, { "epoch": 0.05547495092600495, "grad_norm": 0.5560313580446026, "learning_rate": 9.243458475540387e-06, "loss": 0.5134, "step": 325 }, { "epoch": 0.05632841170948195, "grad_norm": 0.586090331155775, "learning_rate": 9.38566552901024e-06, "loss": 0.4751, "step": 330 }, { "epoch": 0.057181872492958946, "grad_norm": 0.4805790206028912, "learning_rate": 9.527872582480093e-06, "loss": 0.4931, "step": 335 }, { "epoch": 0.058035333276435945, "grad_norm": 0.5102599416036809, "learning_rate": 9.670079635949945e-06, "loss": 0.5074, "step": 340 }, { "epoch": 0.058888794059912944, "grad_norm": 0.5269373940253493, "learning_rate": 9.812286689419797e-06, "loss": 0.4994, "step": 345 }, { "epoch": 0.05974225484338994, "grad_norm": 0.5266104676032257, "learning_rate": 9.954493742889649e-06, "loss": 0.4898, "step": 350 }, { "epoch": 0.06059571562686695, "grad_norm": 0.5015087365956544, "learning_rate": 1.00967007963595e-05, "loss": 0.5163, "step": 355 }, { "epoch": 0.06144917641034395, "grad_norm": 0.5722794975832386, "learning_rate": 1.0238907849829352e-05, "loss": 0.4792, "step": 360 }, { "epoch": 0.062302637193820946, "grad_norm": 0.6125388908144989, "learning_rate": 1.0381114903299204e-05, "loss": 0.5101, "step": 365 }, { "epoch": 0.06315609797729795, "grad_norm": 0.5890464340317972, "learning_rate": 1.0523321956769056e-05, "loss": 0.4951, "step": 370 }, { "epoch": 0.06400955876077494, "grad_norm": 0.5696458988895636, "learning_rate": 1.066552901023891e-05, "loss": 0.4844, "step": 375 }, { "epoch": 0.06486301954425194, "grad_norm": 0.6355890836087996, "learning_rate": 1.0807736063708762e-05, "loss": 0.4893, "step": 380 }, { "epoch": 0.06571648032772893, "grad_norm": 0.5654944358354584, "learning_rate": 1.0949943117178613e-05, "loss": 0.4935, "step": 385 }, { "epoch": 0.06656994111120594, "grad_norm": 0.518850806672097, "learning_rate": 1.1092150170648465e-05, "loss": 0.5371, "step": 390 }, { "epoch": 0.06742340189468293, "grad_norm": 0.5019994903033126, "learning_rate": 1.1234357224118317e-05, "loss": 0.4673, "step": 395 }, { "epoch": 0.06827686267815994, "grad_norm": 0.46865292802092595, "learning_rate": 1.1376564277588169e-05, "loss": 0.4756, "step": 400 }, { "epoch": 0.06913032346163694, "grad_norm": 0.5588416943930714, "learning_rate": 1.1518771331058021e-05, "loss": 0.4975, "step": 405 }, { "epoch": 0.06998378424511394, "grad_norm": 0.5254779155778078, "learning_rate": 1.1660978384527873e-05, "loss": 0.5136, "step": 410 }, { "epoch": 0.07083724502859094, "grad_norm": 0.5521735407281068, "learning_rate": 1.1803185437997725e-05, "loss": 0.4808, "step": 415 }, { "epoch": 0.07169070581206793, "grad_norm": 0.6531675144683418, "learning_rate": 1.1945392491467578e-05, "loss": 0.5009, "step": 420 }, { "epoch": 0.07254416659554494, "grad_norm": 0.5698624780359882, "learning_rate": 1.208759954493743e-05, "loss": 0.4811, "step": 425 }, { "epoch": 0.07339762737902193, "grad_norm": 0.5255574070633653, "learning_rate": 1.2229806598407282e-05, "loss": 0.4896, "step": 430 }, { "epoch": 0.07425108816249894, "grad_norm": 0.5451180804001506, "learning_rate": 1.2372013651877134e-05, "loss": 0.489, "step": 435 }, { "epoch": 0.07510454894597593, "grad_norm": 0.5212986303023895, "learning_rate": 1.2514220705346986e-05, "loss": 0.4945, "step": 440 }, { "epoch": 0.07595800972945294, "grad_norm": 0.5198003250336133, "learning_rate": 1.2656427758816838e-05, "loss": 0.4681, "step": 445 }, { "epoch": 0.07681147051292993, "grad_norm": 0.5725985287628533, "learning_rate": 1.279863481228669e-05, "loss": 0.4632, "step": 450 }, { "epoch": 0.07766493129640693, "grad_norm": 0.5298351826543551, "learning_rate": 1.2940841865756541e-05, "loss": 0.4793, "step": 455 }, { "epoch": 0.07851839207988393, "grad_norm": 0.5735617735208751, "learning_rate": 1.3083048919226395e-05, "loss": 0.4981, "step": 460 }, { "epoch": 0.07937185286336093, "grad_norm": 0.5990878952736066, "learning_rate": 1.3225255972696247e-05, "loss": 0.4834, "step": 465 }, { "epoch": 0.08022531364683792, "grad_norm": 0.5513663837349132, "learning_rate": 1.3367463026166099e-05, "loss": 0.4671, "step": 470 }, { "epoch": 0.08107877443031493, "grad_norm": 0.49563741965263997, "learning_rate": 1.350967007963595e-05, "loss": 0.4517, "step": 475 }, { "epoch": 0.08193223521379192, "grad_norm": 0.5275745894579796, "learning_rate": 1.3651877133105803e-05, "loss": 0.4969, "step": 480 }, { "epoch": 0.08278569599726893, "grad_norm": 0.5703838195802612, "learning_rate": 1.3794084186575654e-05, "loss": 0.4949, "step": 485 }, { "epoch": 0.08363915678074592, "grad_norm": 0.5524459182519725, "learning_rate": 1.3936291240045506e-05, "loss": 0.478, "step": 490 }, { "epoch": 0.08449261756422292, "grad_norm": 0.49819651192360687, "learning_rate": 1.407849829351536e-05, "loss": 0.4724, "step": 495 }, { "epoch": 0.08534607834769992, "grad_norm": 0.53227302303531, "learning_rate": 1.422070534698521e-05, "loss": 0.4801, "step": 500 }, { "epoch": 0.08619953913117692, "grad_norm": 0.4669677774818698, "learning_rate": 1.4362912400455064e-05, "loss": 0.4787, "step": 505 }, { "epoch": 0.08705299991465393, "grad_norm": 0.5441918360005731, "learning_rate": 1.4505119453924915e-05, "loss": 0.49, "step": 510 }, { "epoch": 0.08790646069813092, "grad_norm": 0.4924479643334617, "learning_rate": 1.4647326507394767e-05, "loss": 0.4708, "step": 515 }, { "epoch": 0.08875992148160793, "grad_norm": 0.5544411690215525, "learning_rate": 1.478953356086462e-05, "loss": 0.4824, "step": 520 }, { "epoch": 0.08961338226508492, "grad_norm": 0.6575526705781556, "learning_rate": 1.4931740614334471e-05, "loss": 0.5069, "step": 525 }, { "epoch": 0.09046684304856192, "grad_norm": 0.5524917999265188, "learning_rate": 1.5073947667804325e-05, "loss": 0.4554, "step": 530 }, { "epoch": 0.09132030383203892, "grad_norm": 0.5380062736800153, "learning_rate": 1.5216154721274175e-05, "loss": 0.4397, "step": 535 }, { "epoch": 0.09217376461551592, "grad_norm": 0.6465618271608081, "learning_rate": 1.535836177474403e-05, "loss": 0.4919, "step": 540 }, { "epoch": 0.09302722539899291, "grad_norm": 0.5065592333706098, "learning_rate": 1.550056882821388e-05, "loss": 0.4691, "step": 545 }, { "epoch": 0.09388068618246992, "grad_norm": 0.5995290132081805, "learning_rate": 1.5642775881683734e-05, "loss": 0.461, "step": 550 }, { "epoch": 0.09473414696594691, "grad_norm": 0.5119477523430689, "learning_rate": 1.5784982935153582e-05, "loss": 0.4913, "step": 555 }, { "epoch": 0.09558760774942392, "grad_norm": 0.5411726788932973, "learning_rate": 1.5927189988623438e-05, "loss": 0.4727, "step": 560 }, { "epoch": 0.09644106853290091, "grad_norm": 0.4793203782330964, "learning_rate": 1.606939704209329e-05, "loss": 0.4535, "step": 565 }, { "epoch": 0.09729452931637791, "grad_norm": 0.465582862643326, "learning_rate": 1.621160409556314e-05, "loss": 0.5029, "step": 570 }, { "epoch": 0.0981479900998549, "grad_norm": 0.6250731829240942, "learning_rate": 1.6353811149032993e-05, "loss": 0.4578, "step": 575 }, { "epoch": 0.09900145088333191, "grad_norm": 0.5274569805270367, "learning_rate": 1.6496018202502845e-05, "loss": 0.4642, "step": 580 }, { "epoch": 0.0998549116668089, "grad_norm": 0.704249062342399, "learning_rate": 1.6638225255972697e-05, "loss": 0.4601, "step": 585 }, { "epoch": 0.10070837245028591, "grad_norm": 0.47906906892618834, "learning_rate": 1.678043230944255e-05, "loss": 0.4845, "step": 590 }, { "epoch": 0.1015618332337629, "grad_norm": 0.5621036531031371, "learning_rate": 1.69226393629124e-05, "loss": 0.4945, "step": 595 }, { "epoch": 0.10241529401723991, "grad_norm": 0.568166350950811, "learning_rate": 1.7064846416382256e-05, "loss": 0.5, "step": 600 }, { "epoch": 0.10326875480071691, "grad_norm": 0.526050009608863, "learning_rate": 1.7207053469852105e-05, "loss": 0.4518, "step": 605 }, { "epoch": 0.1041222155841939, "grad_norm": 0.5101676220842896, "learning_rate": 1.734926052332196e-05, "loss": 0.4312, "step": 610 }, { "epoch": 0.10497567636767091, "grad_norm": 0.5971493354193895, "learning_rate": 1.749146757679181e-05, "loss": 0.4645, "step": 615 }, { "epoch": 0.1058291371511479, "grad_norm": 0.5995569466514504, "learning_rate": 1.7633674630261664e-05, "loss": 0.4493, "step": 620 }, { "epoch": 0.10668259793462491, "grad_norm": 0.49992734759090895, "learning_rate": 1.7775881683731512e-05, "loss": 0.4713, "step": 625 }, { "epoch": 0.1075360587181019, "grad_norm": 0.6002501705064306, "learning_rate": 1.7918088737201367e-05, "loss": 0.4515, "step": 630 }, { "epoch": 0.10838951950157891, "grad_norm": 0.6008048193179941, "learning_rate": 1.806029579067122e-05, "loss": 0.4752, "step": 635 }, { "epoch": 0.1092429802850559, "grad_norm": 0.5990904504975437, "learning_rate": 1.820250284414107e-05, "loss": 0.4768, "step": 640 }, { "epoch": 0.1100964410685329, "grad_norm": 0.4903193404479473, "learning_rate": 1.8344709897610923e-05, "loss": 0.4283, "step": 645 }, { "epoch": 0.1109499018520099, "grad_norm": 0.5540831949744953, "learning_rate": 1.8486916951080775e-05, "loss": 0.4691, "step": 650 }, { "epoch": 0.1118033626354869, "grad_norm": 0.4809043257721993, "learning_rate": 1.8629124004550627e-05, "loss": 0.4229, "step": 655 }, { "epoch": 0.1126568234189639, "grad_norm": 0.42854347756644773, "learning_rate": 1.877133105802048e-05, "loss": 0.4554, "step": 660 }, { "epoch": 0.1135102842024409, "grad_norm": 0.48988643305355273, "learning_rate": 1.891353811149033e-05, "loss": 0.4548, "step": 665 }, { "epoch": 0.11436374498591789, "grad_norm": 0.6234024023610745, "learning_rate": 1.9055745164960186e-05, "loss": 0.4659, "step": 670 }, { "epoch": 0.1152172057693949, "grad_norm": 0.47008106334942906, "learning_rate": 1.9197952218430034e-05, "loss": 0.425, "step": 675 }, { "epoch": 0.11607066655287189, "grad_norm": 0.5417686049774755, "learning_rate": 1.934015927189989e-05, "loss": 0.4709, "step": 680 }, { "epoch": 0.1169241273363489, "grad_norm": 0.5326262936619862, "learning_rate": 1.9482366325369738e-05, "loss": 0.4607, "step": 685 }, { "epoch": 0.11777758811982589, "grad_norm": 0.5139387819039124, "learning_rate": 1.9624573378839593e-05, "loss": 0.4531, "step": 690 }, { "epoch": 0.1186310489033029, "grad_norm": 0.5510213955319201, "learning_rate": 1.9766780432309442e-05, "loss": 0.4743, "step": 695 }, { "epoch": 0.11948450968677989, "grad_norm": 0.4570592080474545, "learning_rate": 1.9908987485779297e-05, "loss": 0.4661, "step": 700 }, { "epoch": 0.12033797047025689, "grad_norm": 0.6287145643304716, "learning_rate": 2.005119453924915e-05, "loss": 0.4607, "step": 705 }, { "epoch": 0.1211914312537339, "grad_norm": 0.6128183811804463, "learning_rate": 2.0193401592719e-05, "loss": 0.4838, "step": 710 }, { "epoch": 0.12204489203721089, "grad_norm": 0.5241494967769496, "learning_rate": 2.0335608646188853e-05, "loss": 0.4572, "step": 715 }, { "epoch": 0.1228983528206879, "grad_norm": 0.7483832627900028, "learning_rate": 2.0477815699658705e-05, "loss": 0.4726, "step": 720 }, { "epoch": 0.12375181360416489, "grad_norm": 0.5242167687144432, "learning_rate": 2.0620022753128557e-05, "loss": 0.5075, "step": 725 }, { "epoch": 0.12460527438764189, "grad_norm": 0.5128070620828036, "learning_rate": 2.076222980659841e-05, "loss": 0.4423, "step": 730 }, { "epoch": 0.12545873517111888, "grad_norm": 0.5799512345349864, "learning_rate": 2.090443686006826e-05, "loss": 0.4813, "step": 735 }, { "epoch": 0.1263121959545959, "grad_norm": 0.5491697714560304, "learning_rate": 2.1046643913538112e-05, "loss": 0.4494, "step": 740 }, { "epoch": 0.1271656567380729, "grad_norm": 0.6083459393750741, "learning_rate": 2.1188850967007964e-05, "loss": 0.4593, "step": 745 }, { "epoch": 0.12801911752154987, "grad_norm": 0.6114057705486389, "learning_rate": 2.133105802047782e-05, "loss": 0.4926, "step": 750 }, { "epoch": 0.12887257830502688, "grad_norm": 0.4804641405831696, "learning_rate": 2.1473265073947668e-05, "loss": 0.4586, "step": 755 }, { "epoch": 0.12972603908850389, "grad_norm": 0.478323830214806, "learning_rate": 2.1615472127417523e-05, "loss": 0.4248, "step": 760 }, { "epoch": 0.1305794998719809, "grad_norm": 0.5753431097217332, "learning_rate": 2.175767918088737e-05, "loss": 0.4313, "step": 765 }, { "epoch": 0.13143296065545787, "grad_norm": 0.4638284338868714, "learning_rate": 2.1899886234357227e-05, "loss": 0.438, "step": 770 }, { "epoch": 0.13228642143893488, "grad_norm": 0.5673813201657227, "learning_rate": 2.204209328782708e-05, "loss": 0.477, "step": 775 }, { "epoch": 0.13313988222241188, "grad_norm": 0.5210130091055161, "learning_rate": 2.218430034129693e-05, "loss": 0.4203, "step": 780 }, { "epoch": 0.1339933430058889, "grad_norm": 0.4853461794372711, "learning_rate": 2.2326507394766782e-05, "loss": 0.4491, "step": 785 }, { "epoch": 0.13484680378936587, "grad_norm": 0.5773569091514296, "learning_rate": 2.2468714448236634e-05, "loss": 0.463, "step": 790 }, { "epoch": 0.13570026457284287, "grad_norm": 0.5394999262802425, "learning_rate": 2.2610921501706486e-05, "loss": 0.4809, "step": 795 }, { "epoch": 0.13655372535631988, "grad_norm": 0.5528236856123632, "learning_rate": 2.2753128555176338e-05, "loss": 0.454, "step": 800 }, { "epoch": 0.13740718613979688, "grad_norm": 0.4930335919204957, "learning_rate": 2.289533560864619e-05, "loss": 0.4236, "step": 805 }, { "epoch": 0.1382606469232739, "grad_norm": 0.5534079421133676, "learning_rate": 2.3037542662116042e-05, "loss": 0.4654, "step": 810 }, { "epoch": 0.13911410770675087, "grad_norm": 0.5782605963845524, "learning_rate": 2.3179749715585894e-05, "loss": 0.465, "step": 815 }, { "epoch": 0.13996756849022787, "grad_norm": 0.574550259727311, "learning_rate": 2.3321956769055746e-05, "loss": 0.4489, "step": 820 }, { "epoch": 0.14082102927370488, "grad_norm": 0.5180412267182674, "learning_rate": 2.3464163822525598e-05, "loss": 0.4607, "step": 825 }, { "epoch": 0.14167449005718188, "grad_norm": 0.5984878701805567, "learning_rate": 2.360637087599545e-05, "loss": 0.4546, "step": 830 }, { "epoch": 0.14252795084065886, "grad_norm": 0.49495559651163545, "learning_rate": 2.37485779294653e-05, "loss": 0.4207, "step": 835 }, { "epoch": 0.14338141162413587, "grad_norm": 0.7635498503564752, "learning_rate": 2.3890784982935157e-05, "loss": 0.4673, "step": 840 }, { "epoch": 0.14423487240761287, "grad_norm": 0.5564319136989937, "learning_rate": 2.403299203640501e-05, "loss": 0.4439, "step": 845 }, { "epoch": 0.14508833319108988, "grad_norm": 0.5430927875340247, "learning_rate": 2.417519908987486e-05, "loss": 0.4446, "step": 850 }, { "epoch": 0.14594179397456686, "grad_norm": 0.5702080390428099, "learning_rate": 2.4317406143344712e-05, "loss": 0.4472, "step": 855 }, { "epoch": 0.14679525475804386, "grad_norm": 0.6062637608996635, "learning_rate": 2.4459613196814564e-05, "loss": 0.456, "step": 860 }, { "epoch": 0.14764871554152087, "grad_norm": 0.511093278345879, "learning_rate": 2.4601820250284416e-05, "loss": 0.4747, "step": 865 }, { "epoch": 0.14850217632499788, "grad_norm": 0.48226964198949374, "learning_rate": 2.4744027303754268e-05, "loss": 0.4595, "step": 870 }, { "epoch": 0.14935563710847485, "grad_norm": 0.5232075450050768, "learning_rate": 2.488623435722412e-05, "loss": 0.475, "step": 875 }, { "epoch": 0.15020909789195186, "grad_norm": 0.4675247970862001, "learning_rate": 2.502844141069397e-05, "loss": 0.4368, "step": 880 }, { "epoch": 0.15106255867542887, "grad_norm": 0.5329833924135905, "learning_rate": 2.5170648464163827e-05, "loss": 0.427, "step": 885 }, { "epoch": 0.15191601945890587, "grad_norm": 0.42872929620397615, "learning_rate": 2.5312855517633675e-05, "loss": 0.4513, "step": 890 }, { "epoch": 0.15276948024238285, "grad_norm": 0.521430902363332, "learning_rate": 2.5455062571103527e-05, "loss": 0.4435, "step": 895 }, { "epoch": 0.15362294102585985, "grad_norm": 0.4903251653354628, "learning_rate": 2.559726962457338e-05, "loss": 0.4482, "step": 900 }, { "epoch": 0.15447640180933686, "grad_norm": 0.6438792357402869, "learning_rate": 2.5739476678043234e-05, "loss": 0.4965, "step": 905 }, { "epoch": 0.15532986259281387, "grad_norm": 0.478554141824113, "learning_rate": 2.5881683731513083e-05, "loss": 0.4753, "step": 910 }, { "epoch": 0.15618332337629087, "grad_norm": 0.5272377065910798, "learning_rate": 2.6023890784982935e-05, "loss": 0.4773, "step": 915 }, { "epoch": 0.15703678415976785, "grad_norm": 0.4909878835826609, "learning_rate": 2.616609783845279e-05, "loss": 0.4499, "step": 920 }, { "epoch": 0.15789024494324486, "grad_norm": 0.5236078373269508, "learning_rate": 2.6308304891922642e-05, "loss": 0.4353, "step": 925 }, { "epoch": 0.15874370572672186, "grad_norm": 0.5131009670948186, "learning_rate": 2.6450511945392494e-05, "loss": 0.4406, "step": 930 }, { "epoch": 0.15959716651019887, "grad_norm": 0.49512939341909834, "learning_rate": 2.6592718998862342e-05, "loss": 0.4434, "step": 935 }, { "epoch": 0.16045062729367585, "grad_norm": 0.5552118396297326, "learning_rate": 2.6734926052332198e-05, "loss": 0.4561, "step": 940 }, { "epoch": 0.16130408807715285, "grad_norm": 0.53061633693325, "learning_rate": 2.687713310580205e-05, "loss": 0.4494, "step": 945 }, { "epoch": 0.16215754886062986, "grad_norm": 0.5794042402873977, "learning_rate": 2.70193401592719e-05, "loss": 0.4338, "step": 950 }, { "epoch": 0.16301100964410686, "grad_norm": 0.4601293584062521, "learning_rate": 2.7161547212741757e-05, "loss": 0.4436, "step": 955 }, { "epoch": 0.16386447042758384, "grad_norm": 0.46565084593383227, "learning_rate": 2.7303754266211605e-05, "loss": 0.4329, "step": 960 }, { "epoch": 0.16471793121106085, "grad_norm": 0.5072064322392361, "learning_rate": 2.7445961319681457e-05, "loss": 0.4297, "step": 965 }, { "epoch": 0.16557139199453785, "grad_norm": 0.47342171879211936, "learning_rate": 2.758816837315131e-05, "loss": 0.4618, "step": 970 }, { "epoch": 0.16642485277801486, "grad_norm": 0.42413494699639553, "learning_rate": 2.7730375426621164e-05, "loss": 0.4402, "step": 975 }, { "epoch": 0.16727831356149184, "grad_norm": 0.432362355920378, "learning_rate": 2.7872582480091013e-05, "loss": 0.4313, "step": 980 }, { "epoch": 0.16813177434496884, "grad_norm": 0.4660716312581375, "learning_rate": 2.8014789533560864e-05, "loss": 0.4313, "step": 985 }, { "epoch": 0.16898523512844585, "grad_norm": 0.5623111795871139, "learning_rate": 2.815699658703072e-05, "loss": 0.4965, "step": 990 }, { "epoch": 0.16983869591192285, "grad_norm": 0.5099929778616481, "learning_rate": 2.829920364050057e-05, "loss": 0.4406, "step": 995 }, { "epoch": 0.17069215669539983, "grad_norm": 0.5191154778058368, "learning_rate": 2.844141069397042e-05, "loss": 0.453, "step": 1000 }, { "epoch": 0.17154561747887684, "grad_norm": 0.537780142067558, "learning_rate": 2.8583617747440272e-05, "loss": 0.4397, "step": 1005 }, { "epoch": 0.17239907826235384, "grad_norm": 0.6655793341453065, "learning_rate": 2.8725824800910127e-05, "loss": 0.46, "step": 1010 }, { "epoch": 0.17325253904583085, "grad_norm": 0.5033355307697461, "learning_rate": 2.886803185437998e-05, "loss": 0.4434, "step": 1015 }, { "epoch": 0.17410599982930786, "grad_norm": 0.41191651602947554, "learning_rate": 2.901023890784983e-05, "loss": 0.4518, "step": 1020 }, { "epoch": 0.17495946061278483, "grad_norm": 0.7165005379704251, "learning_rate": 2.9152445961319686e-05, "loss": 0.4688, "step": 1025 }, { "epoch": 0.17581292139626184, "grad_norm": 0.4547290490314569, "learning_rate": 2.9294653014789535e-05, "loss": 0.4484, "step": 1030 }, { "epoch": 0.17666638217973885, "grad_norm": 0.5575607057968245, "learning_rate": 2.9436860068259387e-05, "loss": 0.4359, "step": 1035 }, { "epoch": 0.17751984296321585, "grad_norm": 0.5745866420844785, "learning_rate": 2.957906712172924e-05, "loss": 0.4385, "step": 1040 }, { "epoch": 0.17837330374669283, "grad_norm": 0.5054585742053151, "learning_rate": 2.9721274175199094e-05, "loss": 0.4725, "step": 1045 }, { "epoch": 0.17922676453016984, "grad_norm": 0.5095172052665057, "learning_rate": 2.9863481228668942e-05, "loss": 0.4577, "step": 1050 }, { "epoch": 0.18008022531364684, "grad_norm": 0.47050353737302864, "learning_rate": 3.0005688282138794e-05, "loss": 0.4358, "step": 1055 }, { "epoch": 0.18093368609712385, "grad_norm": 0.5230696539335701, "learning_rate": 3.014789533560865e-05, "loss": 0.4314, "step": 1060 }, { "epoch": 0.18178714688060083, "grad_norm": 0.5116879391222897, "learning_rate": 3.02901023890785e-05, "loss": 0.4488, "step": 1065 }, { "epoch": 0.18264060766407783, "grad_norm": 0.4327647161840352, "learning_rate": 3.043230944254835e-05, "loss": 0.4485, "step": 1070 }, { "epoch": 0.18349406844755484, "grad_norm": 0.529333193446321, "learning_rate": 3.0574516496018205e-05, "loss": 0.4237, "step": 1075 }, { "epoch": 0.18434752923103184, "grad_norm": 0.5296462141874214, "learning_rate": 3.071672354948806e-05, "loss": 0.4533, "step": 1080 }, { "epoch": 0.18520099001450882, "grad_norm": 0.42749172692337334, "learning_rate": 3.085893060295791e-05, "loss": 0.4536, "step": 1085 }, { "epoch": 0.18605445079798583, "grad_norm": 0.5722034430719456, "learning_rate": 3.100113765642776e-05, "loss": 0.4492, "step": 1090 }, { "epoch": 0.18690791158146283, "grad_norm": 0.5069756335890092, "learning_rate": 3.114334470989761e-05, "loss": 0.448, "step": 1095 }, { "epoch": 0.18776137236493984, "grad_norm": 0.47298220523051143, "learning_rate": 3.128555176336747e-05, "loss": 0.4398, "step": 1100 }, { "epoch": 0.18861483314841684, "grad_norm": 0.4872276089133869, "learning_rate": 3.1427758816837316e-05, "loss": 0.4738, "step": 1105 }, { "epoch": 0.18946829393189382, "grad_norm": 0.677837785814161, "learning_rate": 3.1569965870307165e-05, "loss": 0.4436, "step": 1110 }, { "epoch": 0.19032175471537083, "grad_norm": 0.4900090529705781, "learning_rate": 3.171217292377702e-05, "loss": 0.4574, "step": 1115 }, { "epoch": 0.19117521549884783, "grad_norm": 0.4478445343574828, "learning_rate": 3.1854379977246875e-05, "loss": 0.4766, "step": 1120 }, { "epoch": 0.19202867628232484, "grad_norm": 0.48298124947952625, "learning_rate": 3.1996587030716724e-05, "loss": 0.4112, "step": 1125 }, { "epoch": 0.19288213706580182, "grad_norm": 0.4171497735218762, "learning_rate": 3.213879408418658e-05, "loss": 0.4207, "step": 1130 }, { "epoch": 0.19373559784927882, "grad_norm": 0.5352615626759646, "learning_rate": 3.228100113765643e-05, "loss": 0.456, "step": 1135 }, { "epoch": 0.19458905863275583, "grad_norm": 0.4534670104244814, "learning_rate": 3.242320819112628e-05, "loss": 0.4222, "step": 1140 }, { "epoch": 0.19544251941623283, "grad_norm": 0.46840859181144384, "learning_rate": 3.256541524459613e-05, "loss": 0.4588, "step": 1145 }, { "epoch": 0.1962959801997098, "grad_norm": 0.5504813993986616, "learning_rate": 3.270762229806599e-05, "loss": 0.432, "step": 1150 }, { "epoch": 0.19714944098318682, "grad_norm": 0.4954673624737293, "learning_rate": 3.2849829351535835e-05, "loss": 0.4497, "step": 1155 }, { "epoch": 0.19800290176666382, "grad_norm": 0.4589891336551354, "learning_rate": 3.299203640500569e-05, "loss": 0.4381, "step": 1160 }, { "epoch": 0.19885636255014083, "grad_norm": 0.4871923740355435, "learning_rate": 3.3134243458475546e-05, "loss": 0.4293, "step": 1165 }, { "epoch": 0.1997098233336178, "grad_norm": 0.4857293937827585, "learning_rate": 3.3276450511945394e-05, "loss": 0.4692, "step": 1170 }, { "epoch": 0.20056328411709481, "grad_norm": 0.4526513297399229, "learning_rate": 3.341865756541524e-05, "loss": 0.4369, "step": 1175 }, { "epoch": 0.20141674490057182, "grad_norm": 0.46769763419007665, "learning_rate": 3.35608646188851e-05, "loss": 0.4546, "step": 1180 }, { "epoch": 0.20227020568404883, "grad_norm": 0.44651984720677607, "learning_rate": 3.370307167235495e-05, "loss": 0.4773, "step": 1185 }, { "epoch": 0.2031236664675258, "grad_norm": 0.45858142784898104, "learning_rate": 3.38452787258248e-05, "loss": 0.4247, "step": 1190 }, { "epoch": 0.2039771272510028, "grad_norm": 0.5380231079184115, "learning_rate": 3.398748577929465e-05, "loss": 0.4571, "step": 1195 }, { "epoch": 0.20483058803447982, "grad_norm": 0.5162291567036547, "learning_rate": 3.412969283276451e-05, "loss": 0.4521, "step": 1200 }, { "epoch": 0.20568404881795682, "grad_norm": 0.46581238611024695, "learning_rate": 3.427189988623436e-05, "loss": 0.4609, "step": 1205 }, { "epoch": 0.20653750960143383, "grad_norm": 0.48296433023368407, "learning_rate": 3.441410693970421e-05, "loss": 0.4376, "step": 1210 }, { "epoch": 0.2073909703849108, "grad_norm": 0.4071315538678185, "learning_rate": 3.455631399317406e-05, "loss": 0.4176, "step": 1215 }, { "epoch": 0.2082444311683878, "grad_norm": 0.45271702857607743, "learning_rate": 3.469852104664392e-05, "loss": 0.4452, "step": 1220 }, { "epoch": 0.20909789195186482, "grad_norm": 0.5258371736541105, "learning_rate": 3.484072810011377e-05, "loss": 0.4276, "step": 1225 }, { "epoch": 0.20995135273534182, "grad_norm": 0.48467082325280775, "learning_rate": 3.498293515358362e-05, "loss": 0.4275, "step": 1230 }, { "epoch": 0.2108048135188188, "grad_norm": 0.4526280170950802, "learning_rate": 3.512514220705347e-05, "loss": 0.4655, "step": 1235 }, { "epoch": 0.2116582743022958, "grad_norm": 0.5114559248971698, "learning_rate": 3.526734926052333e-05, "loss": 0.4294, "step": 1240 }, { "epoch": 0.2125117350857728, "grad_norm": 0.48177910338479685, "learning_rate": 3.5409556313993176e-05, "loss": 0.4725, "step": 1245 }, { "epoch": 0.21336519586924982, "grad_norm": 0.4515640584931449, "learning_rate": 3.5551763367463024e-05, "loss": 0.4353, "step": 1250 }, { "epoch": 0.2142186566527268, "grad_norm": 0.39516434226220454, "learning_rate": 3.569397042093288e-05, "loss": 0.4148, "step": 1255 }, { "epoch": 0.2150721174362038, "grad_norm": 0.632103770252062, "learning_rate": 3.5836177474402735e-05, "loss": 0.4786, "step": 1260 }, { "epoch": 0.2159255782196808, "grad_norm": 0.5611995944466676, "learning_rate": 3.597838452787258e-05, "loss": 0.4773, "step": 1265 }, { "epoch": 0.21677903900315781, "grad_norm": 0.45389685492503434, "learning_rate": 3.612059158134244e-05, "loss": 0.4502, "step": 1270 }, { "epoch": 0.2176324997866348, "grad_norm": 0.5401663755208751, "learning_rate": 3.626279863481229e-05, "loss": 0.4349, "step": 1275 }, { "epoch": 0.2184859605701118, "grad_norm": 0.5007488788865058, "learning_rate": 3.640500568828214e-05, "loss": 0.4342, "step": 1280 }, { "epoch": 0.2193394213535888, "grad_norm": 0.3834750434148639, "learning_rate": 3.654721274175199e-05, "loss": 0.4529, "step": 1285 }, { "epoch": 0.2201928821370658, "grad_norm": 0.5178025844547683, "learning_rate": 3.6689419795221846e-05, "loss": 0.4442, "step": 1290 }, { "epoch": 0.2210463429205428, "grad_norm": 0.4887041425409445, "learning_rate": 3.6831626848691695e-05, "loss": 0.4651, "step": 1295 }, { "epoch": 0.2218998037040198, "grad_norm": 0.4112609814433523, "learning_rate": 3.697383390216155e-05, "loss": 0.4432, "step": 1300 }, { "epoch": 0.2227532644874968, "grad_norm": 0.5483294346203349, "learning_rate": 3.7116040955631405e-05, "loss": 0.4647, "step": 1305 }, { "epoch": 0.2236067252709738, "grad_norm": 0.5022252026867281, "learning_rate": 3.7258248009101254e-05, "loss": 0.4258, "step": 1310 }, { "epoch": 0.2244601860544508, "grad_norm": 0.5649550415651314, "learning_rate": 3.74004550625711e-05, "loss": 0.4339, "step": 1315 }, { "epoch": 0.2253136468379278, "grad_norm": 0.5102215752600618, "learning_rate": 3.754266211604096e-05, "loss": 0.4369, "step": 1320 }, { "epoch": 0.2261671076214048, "grad_norm": 0.4876288790207226, "learning_rate": 3.768486916951081e-05, "loss": 0.4322, "step": 1325 }, { "epoch": 0.2270205684048818, "grad_norm": 0.44254565609199464, "learning_rate": 3.782707622298066e-05, "loss": 0.4291, "step": 1330 }, { "epoch": 0.2278740291883588, "grad_norm": 0.4486164878004238, "learning_rate": 3.796928327645051e-05, "loss": 0.4339, "step": 1335 }, { "epoch": 0.22872748997183578, "grad_norm": 0.4744618596523882, "learning_rate": 3.811149032992037e-05, "loss": 0.4481, "step": 1340 }, { "epoch": 0.2295809507553128, "grad_norm": 0.4382477271797375, "learning_rate": 3.825369738339022e-05, "loss": 0.4458, "step": 1345 }, { "epoch": 0.2304344115387898, "grad_norm": 0.4182720303258187, "learning_rate": 3.839590443686007e-05, "loss": 0.4411, "step": 1350 }, { "epoch": 0.2312878723222668, "grad_norm": 0.4853111267768434, "learning_rate": 3.853811149032992e-05, "loss": 0.4605, "step": 1355 }, { "epoch": 0.23214133310574378, "grad_norm": 0.44931625606488496, "learning_rate": 3.868031854379978e-05, "loss": 0.4503, "step": 1360 }, { "epoch": 0.23299479388922079, "grad_norm": 0.43213265680233653, "learning_rate": 3.882252559726963e-05, "loss": 0.4506, "step": 1365 }, { "epoch": 0.2338482546726978, "grad_norm": 0.47372430914308755, "learning_rate": 3.8964732650739476e-05, "loss": 0.4394, "step": 1370 }, { "epoch": 0.2347017154561748, "grad_norm": 0.4527000974956502, "learning_rate": 3.910693970420933e-05, "loss": 0.4494, "step": 1375 }, { "epoch": 0.23555517623965178, "grad_norm": 0.5321774234377923, "learning_rate": 3.924914675767919e-05, "loss": 0.4627, "step": 1380 }, { "epoch": 0.23640863702312878, "grad_norm": 0.4422205466501829, "learning_rate": 3.9391353811149035e-05, "loss": 0.4774, "step": 1385 }, { "epoch": 0.2372620978066058, "grad_norm": 0.5375207064293367, "learning_rate": 3.9533560864618884e-05, "loss": 0.4387, "step": 1390 }, { "epoch": 0.2381155585900828, "grad_norm": 0.480902448283816, "learning_rate": 3.967576791808874e-05, "loss": 0.4614, "step": 1395 }, { "epoch": 0.23896901937355977, "grad_norm": 0.5624534608697677, "learning_rate": 3.9817974971558594e-05, "loss": 0.4692, "step": 1400 }, { "epoch": 0.23982248015703678, "grad_norm": 0.4429940297742835, "learning_rate": 3.996018202502844e-05, "loss": 0.4351, "step": 1405 }, { "epoch": 0.24067594094051378, "grad_norm": 0.46027490394924614, "learning_rate": 4.01023890784983e-05, "loss": 0.4419, "step": 1410 }, { "epoch": 0.2415294017239908, "grad_norm": 0.4703903995169714, "learning_rate": 4.0244596131968146e-05, "loss": 0.4238, "step": 1415 }, { "epoch": 0.2423828625074678, "grad_norm": 0.44801221523648466, "learning_rate": 4.0386803185438e-05, "loss": 0.452, "step": 1420 }, { "epoch": 0.24323632329094477, "grad_norm": 1.2998216178922568, "learning_rate": 4.052901023890785e-05, "loss": 0.4273, "step": 1425 }, { "epoch": 0.24408978407442178, "grad_norm": 0.48772403177181456, "learning_rate": 4.0671217292377706e-05, "loss": 0.4652, "step": 1430 }, { "epoch": 0.24494324485789878, "grad_norm": 0.5730742877787742, "learning_rate": 4.0813424345847554e-05, "loss": 0.4428, "step": 1435 }, { "epoch": 0.2457967056413758, "grad_norm": 0.4608964147949435, "learning_rate": 4.095563139931741e-05, "loss": 0.4536, "step": 1440 }, { "epoch": 0.24665016642485277, "grad_norm": 0.41254547149771387, "learning_rate": 4.1097838452787265e-05, "loss": 0.4394, "step": 1445 }, { "epoch": 0.24750362720832977, "grad_norm": 0.6020336833954437, "learning_rate": 4.124004550625711e-05, "loss": 0.4437, "step": 1450 }, { "epoch": 0.24835708799180678, "grad_norm": 0.4568650389809021, "learning_rate": 4.138225255972696e-05, "loss": 0.4245, "step": 1455 }, { "epoch": 0.24921054877528379, "grad_norm": 0.548217816028164, "learning_rate": 4.152445961319682e-05, "loss": 0.4357, "step": 1460 }, { "epoch": 0.2500640095587608, "grad_norm": 0.5678979064243518, "learning_rate": 4.166666666666667e-05, "loss": 0.4561, "step": 1465 }, { "epoch": 0.25091747034223777, "grad_norm": 0.5052806630695195, "learning_rate": 4.180887372013652e-05, "loss": 0.444, "step": 1470 }, { "epoch": 0.25177093112571475, "grad_norm": 0.3952290234994121, "learning_rate": 4.195108077360637e-05, "loss": 0.4287, "step": 1475 }, { "epoch": 0.2526243919091918, "grad_norm": 0.4703270791691944, "learning_rate": 4.2093287827076224e-05, "loss": 0.4532, "step": 1480 }, { "epoch": 0.25347785269266876, "grad_norm": 0.3976777596842063, "learning_rate": 4.223549488054608e-05, "loss": 0.4371, "step": 1485 }, { "epoch": 0.2543313134761458, "grad_norm": 0.5064159597233353, "learning_rate": 4.237770193401593e-05, "loss": 0.4434, "step": 1490 }, { "epoch": 0.25518477425962277, "grad_norm": 0.3967066906624392, "learning_rate": 4.2519908987485777e-05, "loss": 0.4773, "step": 1495 }, { "epoch": 0.25603823504309975, "grad_norm": 0.4652990156887633, "learning_rate": 4.266211604095564e-05, "loss": 0.4602, "step": 1500 }, { "epoch": 0.2568916958265768, "grad_norm": 0.4876919530547206, "learning_rate": 4.280432309442549e-05, "loss": 0.4696, "step": 1505 }, { "epoch": 0.25774515661005376, "grad_norm": 0.4548076987071943, "learning_rate": 4.2946530147895336e-05, "loss": 0.4163, "step": 1510 }, { "epoch": 0.2585986173935308, "grad_norm": 0.42065646346664515, "learning_rate": 4.308873720136519e-05, "loss": 0.4431, "step": 1515 }, { "epoch": 0.25945207817700777, "grad_norm": 0.5291298415272344, "learning_rate": 4.3230944254835046e-05, "loss": 0.439, "step": 1520 }, { "epoch": 0.26030553896048475, "grad_norm": 0.539031398384455, "learning_rate": 4.3373151308304895e-05, "loss": 0.4406, "step": 1525 }, { "epoch": 0.2611589997439618, "grad_norm": 0.5751278855689929, "learning_rate": 4.351535836177474e-05, "loss": 0.4518, "step": 1530 }, { "epoch": 0.26201246052743876, "grad_norm": 0.47506949327402476, "learning_rate": 4.36575654152446e-05, "loss": 0.4366, "step": 1535 }, { "epoch": 0.26286592131091574, "grad_norm": 0.5628602580779011, "learning_rate": 4.3799772468714454e-05, "loss": 0.4283, "step": 1540 }, { "epoch": 0.2637193820943928, "grad_norm": 0.447366426013562, "learning_rate": 4.39419795221843e-05, "loss": 0.4354, "step": 1545 }, { "epoch": 0.26457284287786975, "grad_norm": 0.40933399804062365, "learning_rate": 4.408418657565416e-05, "loss": 0.4188, "step": 1550 }, { "epoch": 0.2654263036613468, "grad_norm": 0.46498373181398767, "learning_rate": 4.4226393629124006e-05, "loss": 0.468, "step": 1555 }, { "epoch": 0.26627976444482376, "grad_norm": 0.428779996833117, "learning_rate": 4.436860068259386e-05, "loss": 0.4536, "step": 1560 }, { "epoch": 0.26713322522830074, "grad_norm": 0.4512290441964604, "learning_rate": 4.451080773606371e-05, "loss": 0.4713, "step": 1565 }, { "epoch": 0.2679866860117778, "grad_norm": 0.416872152454922, "learning_rate": 4.4653014789533565e-05, "loss": 0.4262, "step": 1570 }, { "epoch": 0.26884014679525475, "grad_norm": 0.4840922107089889, "learning_rate": 4.4795221843003413e-05, "loss": 0.4527, "step": 1575 }, { "epoch": 0.26969360757873173, "grad_norm": 0.4885107942892858, "learning_rate": 4.493742889647327e-05, "loss": 0.4496, "step": 1580 }, { "epoch": 0.27054706836220876, "grad_norm": 0.5928566568893636, "learning_rate": 4.5079635949943124e-05, "loss": 0.4328, "step": 1585 }, { "epoch": 0.27140052914568574, "grad_norm": 0.4641182477627298, "learning_rate": 4.522184300341297e-05, "loss": 0.4649, "step": 1590 }, { "epoch": 0.2722539899291628, "grad_norm": 0.4270440696728802, "learning_rate": 4.536405005688282e-05, "loss": 0.4419, "step": 1595 }, { "epoch": 0.27310745071263975, "grad_norm": 0.46075209755821517, "learning_rate": 4.5506257110352676e-05, "loss": 0.4355, "step": 1600 }, { "epoch": 0.27396091149611673, "grad_norm": 0.45716270341705356, "learning_rate": 4.564846416382253e-05, "loss": 0.4039, "step": 1605 }, { "epoch": 0.27481437227959377, "grad_norm": 0.43430493552723265, "learning_rate": 4.579067121729238e-05, "loss": 0.4293, "step": 1610 }, { "epoch": 0.27566783306307074, "grad_norm": 0.4764292615545536, "learning_rate": 4.593287827076223e-05, "loss": 0.443, "step": 1615 }, { "epoch": 0.2765212938465478, "grad_norm": 0.412847270912387, "learning_rate": 4.6075085324232084e-05, "loss": 0.443, "step": 1620 }, { "epoch": 0.27737475463002476, "grad_norm": 0.44234594694898255, "learning_rate": 4.621729237770194e-05, "loss": 0.4786, "step": 1625 }, { "epoch": 0.27822821541350173, "grad_norm": 0.40580020639458236, "learning_rate": 4.635949943117179e-05, "loss": 0.4505, "step": 1630 }, { "epoch": 0.27908167619697877, "grad_norm": 0.48320758855512164, "learning_rate": 4.6501706484641636e-05, "loss": 0.4757, "step": 1635 }, { "epoch": 0.27993513698045575, "grad_norm": 0.440770842040657, "learning_rate": 4.664391353811149e-05, "loss": 0.4284, "step": 1640 }, { "epoch": 0.2807885977639327, "grad_norm": 0.5767605360491868, "learning_rate": 4.6786120591581347e-05, "loss": 0.441, "step": 1645 }, { "epoch": 0.28164205854740976, "grad_norm": 0.4085076757573628, "learning_rate": 4.6928327645051195e-05, "loss": 0.4505, "step": 1650 }, { "epoch": 0.28249551933088674, "grad_norm": 0.40564229563219534, "learning_rate": 4.707053469852105e-05, "loss": 0.4585, "step": 1655 }, { "epoch": 0.28334898011436377, "grad_norm": 0.3896787908848227, "learning_rate": 4.72127417519909e-05, "loss": 0.4097, "step": 1660 }, { "epoch": 0.28420244089784075, "grad_norm": 0.47409145611131515, "learning_rate": 4.7354948805460754e-05, "loss": 0.4614, "step": 1665 }, { "epoch": 0.2850559016813177, "grad_norm": 0.40251162857002987, "learning_rate": 4.74971558589306e-05, "loss": 0.4424, "step": 1670 }, { "epoch": 0.28590936246479476, "grad_norm": 0.4998262607356055, "learning_rate": 4.763936291240046e-05, "loss": 0.4444, "step": 1675 }, { "epoch": 0.28676282324827174, "grad_norm": 0.41122779611643206, "learning_rate": 4.778156996587031e-05, "loss": 0.444, "step": 1680 }, { "epoch": 0.2876162840317487, "grad_norm": 0.42658321148897044, "learning_rate": 4.792377701934016e-05, "loss": 0.4377, "step": 1685 }, { "epoch": 0.28846974481522575, "grad_norm": 0.4426297903932404, "learning_rate": 4.806598407281002e-05, "loss": 0.4445, "step": 1690 }, { "epoch": 0.2893232055987027, "grad_norm": 0.43302242707670985, "learning_rate": 4.8208191126279865e-05, "loss": 0.4484, "step": 1695 }, { "epoch": 0.29017666638217976, "grad_norm": 0.4364579283417304, "learning_rate": 4.835039817974972e-05, "loss": 0.4584, "step": 1700 }, { "epoch": 0.29103012716565674, "grad_norm": 0.37724384147629314, "learning_rate": 4.849260523321957e-05, "loss": 0.4544, "step": 1705 }, { "epoch": 0.2918835879491337, "grad_norm": 0.4238372999746499, "learning_rate": 4.8634812286689424e-05, "loss": 0.4569, "step": 1710 }, { "epoch": 0.29273704873261075, "grad_norm": 0.3879169221067591, "learning_rate": 4.877701934015927e-05, "loss": 0.4499, "step": 1715 }, { "epoch": 0.2935905095160877, "grad_norm": 0.3860902080887719, "learning_rate": 4.891922639362913e-05, "loss": 0.433, "step": 1720 }, { "epoch": 0.29444397029956476, "grad_norm": 0.4283849828890237, "learning_rate": 4.906143344709898e-05, "loss": 0.4243, "step": 1725 }, { "epoch": 0.29529743108304174, "grad_norm": 0.44036104465932996, "learning_rate": 4.920364050056883e-05, "loss": 0.4195, "step": 1730 }, { "epoch": 0.2961508918665187, "grad_norm": 0.45042793032137496, "learning_rate": 4.934584755403868e-05, "loss": 0.4422, "step": 1735 }, { "epoch": 0.29700435264999575, "grad_norm": 0.4461675002367327, "learning_rate": 4.9488054607508536e-05, "loss": 0.4377, "step": 1740 }, { "epoch": 0.29785781343347273, "grad_norm": 0.4613569372920741, "learning_rate": 4.963026166097839e-05, "loss": 0.4387, "step": 1745 }, { "epoch": 0.2987112742169497, "grad_norm": 0.572365580424042, "learning_rate": 4.977246871444824e-05, "loss": 0.4295, "step": 1750 }, { "epoch": 0.29956473500042674, "grad_norm": 0.4013511976561577, "learning_rate": 4.991467576791809e-05, "loss": 0.4299, "step": 1755 }, { "epoch": 0.3004181957839037, "grad_norm": 0.6163374155061488, "learning_rate": 4.999367728882145e-05, "loss": 0.4514, "step": 1760 }, { "epoch": 0.30127165656738075, "grad_norm": 0.3555359095707959, "learning_rate": 4.997787051087506e-05, "loss": 0.4488, "step": 1765 }, { "epoch": 0.30212511735085773, "grad_norm": 0.4085512037786229, "learning_rate": 4.9962063732928684e-05, "loss": 0.4393, "step": 1770 }, { "epoch": 0.3029785781343347, "grad_norm": 0.3767875289930178, "learning_rate": 4.99462569549823e-05, "loss": 0.4662, "step": 1775 }, { "epoch": 0.30383203891781174, "grad_norm": 0.395469462831486, "learning_rate": 4.993045017703591e-05, "loss": 0.4592, "step": 1780 }, { "epoch": 0.3046854997012887, "grad_norm": 0.4668716042516809, "learning_rate": 4.991464339908954e-05, "loss": 0.4476, "step": 1785 }, { "epoch": 0.3055389604847657, "grad_norm": 0.4258777389644843, "learning_rate": 4.989883662114315e-05, "loss": 0.4174, "step": 1790 }, { "epoch": 0.30639242126824273, "grad_norm": 0.38561293195595003, "learning_rate": 4.9883029843196765e-05, "loss": 0.417, "step": 1795 }, { "epoch": 0.3072458820517197, "grad_norm": 0.5124272473124463, "learning_rate": 4.986722306525038e-05, "loss": 0.4559, "step": 1800 }, { "epoch": 0.30809934283519674, "grad_norm": 0.36497494008094594, "learning_rate": 4.9851416287304e-05, "loss": 0.4362, "step": 1805 }, { "epoch": 0.3089528036186737, "grad_norm": 0.38910637433469697, "learning_rate": 4.983560950935762e-05, "loss": 0.4332, "step": 1810 }, { "epoch": 0.3098062644021507, "grad_norm": 0.3897241235653279, "learning_rate": 4.981980273141123e-05, "loss": 0.4283, "step": 1815 }, { "epoch": 0.31065972518562773, "grad_norm": 0.48060107868509144, "learning_rate": 4.980399595346485e-05, "loss": 0.4708, "step": 1820 }, { "epoch": 0.3115131859691047, "grad_norm": 0.4281446836526605, "learning_rate": 4.978818917551846e-05, "loss": 0.4383, "step": 1825 }, { "epoch": 0.31236664675258174, "grad_norm": 0.38444099441108925, "learning_rate": 4.977238239757208e-05, "loss": 0.4558, "step": 1830 }, { "epoch": 0.3132201075360587, "grad_norm": 0.460827471803298, "learning_rate": 4.97565756196257e-05, "loss": 0.4177, "step": 1835 }, { "epoch": 0.3140735683195357, "grad_norm": 0.40997335367627036, "learning_rate": 4.9740768841679315e-05, "loss": 0.4172, "step": 1840 }, { "epoch": 0.31492702910301273, "grad_norm": 0.3639064335672678, "learning_rate": 4.972496206373293e-05, "loss": 0.4286, "step": 1845 }, { "epoch": 0.3157804898864897, "grad_norm": 0.4073530152643747, "learning_rate": 4.970915528578654e-05, "loss": 0.4251, "step": 1850 }, { "epoch": 0.3166339506699667, "grad_norm": 0.37060288652507867, "learning_rate": 4.969334850784017e-05, "loss": 0.4218, "step": 1855 }, { "epoch": 0.3174874114534437, "grad_norm": 0.4550392403480856, "learning_rate": 4.967754172989378e-05, "loss": 0.4114, "step": 1860 }, { "epoch": 0.3183408722369207, "grad_norm": 0.4176508909107961, "learning_rate": 4.9661734951947396e-05, "loss": 0.4559, "step": 1865 }, { "epoch": 0.31919433302039774, "grad_norm": 0.4855718134087837, "learning_rate": 4.964592817400101e-05, "loss": 0.4322, "step": 1870 }, { "epoch": 0.3200477938038747, "grad_norm": 0.3780721495228835, "learning_rate": 4.963012139605463e-05, "loss": 0.4519, "step": 1875 }, { "epoch": 0.3209012545873517, "grad_norm": 0.43130728866977464, "learning_rate": 4.961431461810825e-05, "loss": 0.4289, "step": 1880 }, { "epoch": 0.3217547153708287, "grad_norm": 0.4629749741871921, "learning_rate": 4.959850784016186e-05, "loss": 0.4553, "step": 1885 }, { "epoch": 0.3226081761543057, "grad_norm": 0.49788970597508586, "learning_rate": 4.958270106221548e-05, "loss": 0.4289, "step": 1890 }, { "epoch": 0.3234616369377827, "grad_norm": 0.41479256768854533, "learning_rate": 4.9566894284269094e-05, "loss": 0.4605, "step": 1895 }, { "epoch": 0.3243150977212597, "grad_norm": 0.4269258370244536, "learning_rate": 4.955108750632271e-05, "loss": 0.4249, "step": 1900 }, { "epoch": 0.3251685585047367, "grad_norm": 0.37963672325485187, "learning_rate": 4.9535280728376335e-05, "loss": 0.418, "step": 1905 }, { "epoch": 0.3260220192882137, "grad_norm": 0.7633037989224982, "learning_rate": 4.9519473950429946e-05, "loss": 0.4604, "step": 1910 }, { "epoch": 0.3268754800716907, "grad_norm": 0.4370170793382303, "learning_rate": 4.9503667172483563e-05, "loss": 0.4309, "step": 1915 }, { "epoch": 0.3277289408551677, "grad_norm": 0.38401396890167044, "learning_rate": 4.948786039453718e-05, "loss": 0.4405, "step": 1920 }, { "epoch": 0.3285824016386447, "grad_norm": 0.3946099035237171, "learning_rate": 4.94720536165908e-05, "loss": 0.4211, "step": 1925 }, { "epoch": 0.3294358624221217, "grad_norm": 0.3927618122618321, "learning_rate": 4.9456246838644416e-05, "loss": 0.4298, "step": 1930 }, { "epoch": 0.33028932320559873, "grad_norm": 0.4805866715720291, "learning_rate": 4.9440440060698027e-05, "loss": 0.4723, "step": 1935 }, { "epoch": 0.3311427839890757, "grad_norm": 0.42519497566523956, "learning_rate": 4.942463328275165e-05, "loss": 0.4873, "step": 1940 }, { "epoch": 0.3319962447725527, "grad_norm": 0.46615243214863217, "learning_rate": 4.940882650480526e-05, "loss": 0.4521, "step": 1945 }, { "epoch": 0.3328497055560297, "grad_norm": 0.417009051746185, "learning_rate": 4.939301972685888e-05, "loss": 0.4402, "step": 1950 }, { "epoch": 0.3337031663395067, "grad_norm": 0.4586519367913976, "learning_rate": 4.9377212948912496e-05, "loss": 0.4155, "step": 1955 }, { "epoch": 0.3345566271229837, "grad_norm": 0.4222107662197675, "learning_rate": 4.9361406170966114e-05, "loss": 0.4527, "step": 1960 }, { "epoch": 0.3354100879064607, "grad_norm": 0.39148673554948543, "learning_rate": 4.934559939301973e-05, "loss": 0.4503, "step": 1965 }, { "epoch": 0.3362635486899377, "grad_norm": 0.378821238203712, "learning_rate": 4.932979261507334e-05, "loss": 0.441, "step": 1970 }, { "epoch": 0.3371170094734147, "grad_norm": 0.3479983077135637, "learning_rate": 4.9313985837126966e-05, "loss": 0.4349, "step": 1975 }, { "epoch": 0.3379704702568917, "grad_norm": 0.44131510498190785, "learning_rate": 4.929817905918058e-05, "loss": 0.4311, "step": 1980 }, { "epoch": 0.3388239310403687, "grad_norm": 0.5667534646176338, "learning_rate": 4.9282372281234194e-05, "loss": 0.4192, "step": 1985 }, { "epoch": 0.3396773918238457, "grad_norm": 0.36242677757059, "learning_rate": 4.926656550328781e-05, "loss": 0.4506, "step": 1990 }, { "epoch": 0.3405308526073227, "grad_norm": 0.37075550108716954, "learning_rate": 4.925075872534143e-05, "loss": 0.4173, "step": 1995 }, { "epoch": 0.34138431339079967, "grad_norm": 0.3710719919675142, "learning_rate": 4.923495194739505e-05, "loss": 0.4305, "step": 2000 }, { "epoch": 0.3422377741742767, "grad_norm": 0.340402378173557, "learning_rate": 4.921914516944866e-05, "loss": 0.4225, "step": 2005 }, { "epoch": 0.3430912349577537, "grad_norm": 0.46795402983580436, "learning_rate": 4.920333839150228e-05, "loss": 0.4434, "step": 2010 }, { "epoch": 0.3439446957412307, "grad_norm": 0.3695151220261464, "learning_rate": 4.918753161355589e-05, "loss": 0.4312, "step": 2015 }, { "epoch": 0.3447981565247077, "grad_norm": 0.4114279718967292, "learning_rate": 4.917172483560951e-05, "loss": 0.438, "step": 2020 }, { "epoch": 0.34565161730818467, "grad_norm": 0.39548887139797206, "learning_rate": 4.9155918057663134e-05, "loss": 0.4534, "step": 2025 }, { "epoch": 0.3465050780916617, "grad_norm": 0.4552272699380185, "learning_rate": 4.9140111279716745e-05, "loss": 0.4474, "step": 2030 }, { "epoch": 0.3473585388751387, "grad_norm": 0.4283792771165656, "learning_rate": 4.912430450177036e-05, "loss": 0.4517, "step": 2035 }, { "epoch": 0.3482119996586157, "grad_norm": 0.3548379926368685, "learning_rate": 4.910849772382398e-05, "loss": 0.4159, "step": 2040 }, { "epoch": 0.3490654604420927, "grad_norm": 0.4245941904712946, "learning_rate": 4.90926909458776e-05, "loss": 0.4238, "step": 2045 }, { "epoch": 0.34991892122556967, "grad_norm": 0.39246914426116064, "learning_rate": 4.907688416793121e-05, "loss": 0.433, "step": 2050 }, { "epoch": 0.3507723820090467, "grad_norm": 0.39774782148634963, "learning_rate": 4.9061077389984825e-05, "loss": 0.408, "step": 2055 }, { "epoch": 0.3516258427925237, "grad_norm": 0.3714701260183567, "learning_rate": 4.904527061203845e-05, "loss": 0.4293, "step": 2060 }, { "epoch": 0.35247930357600066, "grad_norm": 0.46422783083366087, "learning_rate": 4.902946383409206e-05, "loss": 0.4482, "step": 2065 }, { "epoch": 0.3533327643594777, "grad_norm": 0.3829998257937418, "learning_rate": 4.901365705614568e-05, "loss": 0.4168, "step": 2070 }, { "epoch": 0.35418622514295467, "grad_norm": 0.3584555741298019, "learning_rate": 4.8997850278199295e-05, "loss": 0.4194, "step": 2075 }, { "epoch": 0.3550396859264317, "grad_norm": 0.3057755033653566, "learning_rate": 4.898204350025291e-05, "loss": 0.4128, "step": 2080 }, { "epoch": 0.3558931467099087, "grad_norm": 0.33782176784425105, "learning_rate": 4.896623672230653e-05, "loss": 0.4371, "step": 2085 }, { "epoch": 0.35674660749338566, "grad_norm": 0.5095165636419938, "learning_rate": 4.895042994436014e-05, "loss": 0.4215, "step": 2090 }, { "epoch": 0.3576000682768627, "grad_norm": 0.4578812884707099, "learning_rate": 4.8934623166413765e-05, "loss": 0.4671, "step": 2095 }, { "epoch": 0.35845352906033967, "grad_norm": 0.4301732309899731, "learning_rate": 4.8918816388467376e-05, "loss": 0.4668, "step": 2100 }, { "epoch": 0.3593069898438167, "grad_norm": 0.4584433021301342, "learning_rate": 4.890300961052099e-05, "loss": 0.437, "step": 2105 }, { "epoch": 0.3601604506272937, "grad_norm": 0.4518802030058222, "learning_rate": 4.888720283257461e-05, "loss": 0.4198, "step": 2110 }, { "epoch": 0.36101391141077066, "grad_norm": 0.4792417456925794, "learning_rate": 4.887139605462823e-05, "loss": 0.4544, "step": 2115 }, { "epoch": 0.3618673721942477, "grad_norm": 0.37188799163979924, "learning_rate": 4.8855589276681845e-05, "loss": 0.4039, "step": 2120 }, { "epoch": 0.36272083297772467, "grad_norm": 0.448705171947849, "learning_rate": 4.8839782498735456e-05, "loss": 0.4283, "step": 2125 }, { "epoch": 0.36357429376120165, "grad_norm": 0.42820126332892383, "learning_rate": 4.882397572078908e-05, "loss": 0.4456, "step": 2130 }, { "epoch": 0.3644277545446787, "grad_norm": 0.3859076160211945, "learning_rate": 4.880816894284269e-05, "loss": 0.4445, "step": 2135 }, { "epoch": 0.36528121532815566, "grad_norm": 0.4001725357145856, "learning_rate": 4.879236216489631e-05, "loss": 0.4298, "step": 2140 }, { "epoch": 0.3661346761116327, "grad_norm": 0.3833123103892692, "learning_rate": 4.8776555386949926e-05, "loss": 0.4434, "step": 2145 }, { "epoch": 0.3669881368951097, "grad_norm": 0.42373523599687285, "learning_rate": 4.876074860900354e-05, "loss": 0.4301, "step": 2150 }, { "epoch": 0.36784159767858665, "grad_norm": 0.3813692294008235, "learning_rate": 4.874494183105716e-05, "loss": 0.4342, "step": 2155 }, { "epoch": 0.3686950584620637, "grad_norm": 0.36713522006919747, "learning_rate": 4.872913505311078e-05, "loss": 0.4346, "step": 2160 }, { "epoch": 0.36954851924554066, "grad_norm": 0.38639783602671923, "learning_rate": 4.8713328275164396e-05, "loss": 0.4438, "step": 2165 }, { "epoch": 0.37040198002901764, "grad_norm": 0.3820721349423652, "learning_rate": 4.8697521497218006e-05, "loss": 0.4291, "step": 2170 }, { "epoch": 0.3712554408124947, "grad_norm": 0.33194940097805914, "learning_rate": 4.8681714719271624e-05, "loss": 0.4339, "step": 2175 }, { "epoch": 0.37210890159597165, "grad_norm": 0.36326408285911505, "learning_rate": 4.866590794132525e-05, "loss": 0.4178, "step": 2180 }, { "epoch": 0.3729623623794487, "grad_norm": 0.657812141557126, "learning_rate": 4.865010116337886e-05, "loss": 0.4358, "step": 2185 }, { "epoch": 0.37381582316292566, "grad_norm": 0.43208925203812065, "learning_rate": 4.8634294385432476e-05, "loss": 0.4241, "step": 2190 }, { "epoch": 0.37466928394640264, "grad_norm": 0.49219031090939686, "learning_rate": 4.8618487607486094e-05, "loss": 0.4415, "step": 2195 }, { "epoch": 0.3755227447298797, "grad_norm": 0.3932445485145676, "learning_rate": 4.860268082953971e-05, "loss": 0.4284, "step": 2200 }, { "epoch": 0.37637620551335665, "grad_norm": 0.4274722047416287, "learning_rate": 4.858687405159332e-05, "loss": 0.4244, "step": 2205 }, { "epoch": 0.3772296662968337, "grad_norm": 0.39056872433573964, "learning_rate": 4.857106727364694e-05, "loss": 0.4513, "step": 2210 }, { "epoch": 0.37808312708031067, "grad_norm": 0.332962093743331, "learning_rate": 4.8555260495700563e-05, "loss": 0.4258, "step": 2215 }, { "epoch": 0.37893658786378764, "grad_norm": 0.4261767384406321, "learning_rate": 4.8539453717754174e-05, "loss": 0.4421, "step": 2220 }, { "epoch": 0.3797900486472647, "grad_norm": 0.46441607390692574, "learning_rate": 4.852364693980779e-05, "loss": 0.4364, "step": 2225 }, { "epoch": 0.38064350943074166, "grad_norm": 0.3473009988639465, "learning_rate": 4.850784016186141e-05, "loss": 0.418, "step": 2230 }, { "epoch": 0.38149697021421863, "grad_norm": 0.474139181035017, "learning_rate": 4.8492033383915027e-05, "loss": 0.4433, "step": 2235 }, { "epoch": 0.38235043099769567, "grad_norm": 0.47147496338761324, "learning_rate": 4.8476226605968644e-05, "loss": 0.4632, "step": 2240 }, { "epoch": 0.38320389178117265, "grad_norm": 0.34928970263485487, "learning_rate": 4.8460419828022255e-05, "loss": 0.4175, "step": 2245 }, { "epoch": 0.3840573525646497, "grad_norm": 0.4242838596460406, "learning_rate": 4.844461305007588e-05, "loss": 0.4604, "step": 2250 }, { "epoch": 0.38491081334812666, "grad_norm": 0.4559865880078733, "learning_rate": 4.842880627212949e-05, "loss": 0.4463, "step": 2255 }, { "epoch": 0.38576427413160363, "grad_norm": 0.4545790736325755, "learning_rate": 4.841299949418311e-05, "loss": 0.438, "step": 2260 }, { "epoch": 0.38661773491508067, "grad_norm": 0.42114219794514496, "learning_rate": 4.8397192716236725e-05, "loss": 0.4451, "step": 2265 }, { "epoch": 0.38747119569855765, "grad_norm": 0.4238959826902125, "learning_rate": 4.838138593829034e-05, "loss": 0.4482, "step": 2270 }, { "epoch": 0.3883246564820346, "grad_norm": 0.37450934178704215, "learning_rate": 4.836557916034396e-05, "loss": 0.4012, "step": 2275 }, { "epoch": 0.38917811726551166, "grad_norm": 0.4081343281350565, "learning_rate": 4.834977238239757e-05, "loss": 0.4433, "step": 2280 }, { "epoch": 0.39003157804898864, "grad_norm": 0.4335188596764046, "learning_rate": 4.8333965604451194e-05, "loss": 0.4447, "step": 2285 }, { "epoch": 0.39088503883246567, "grad_norm": 0.35011620005272515, "learning_rate": 4.8318158826504805e-05, "loss": 0.4187, "step": 2290 }, { "epoch": 0.39173849961594265, "grad_norm": 0.3981102864903299, "learning_rate": 4.830235204855842e-05, "loss": 0.4275, "step": 2295 }, { "epoch": 0.3925919603994196, "grad_norm": 0.38771147505499537, "learning_rate": 4.828654527061204e-05, "loss": 0.4277, "step": 2300 }, { "epoch": 0.39344542118289666, "grad_norm": 0.334188543776544, "learning_rate": 4.827073849266566e-05, "loss": 0.4186, "step": 2305 }, { "epoch": 0.39429888196637364, "grad_norm": 0.4852632231144291, "learning_rate": 4.8254931714719275e-05, "loss": 0.4491, "step": 2310 }, { "epoch": 0.39515234274985067, "grad_norm": 0.42689099728408947, "learning_rate": 4.823912493677289e-05, "loss": 0.4392, "step": 2315 }, { "epoch": 0.39600580353332765, "grad_norm": 0.3894105668622364, "learning_rate": 4.822331815882651e-05, "loss": 0.4362, "step": 2320 }, { "epoch": 0.3968592643168046, "grad_norm": 0.4231790536877768, "learning_rate": 4.820751138088012e-05, "loss": 0.4397, "step": 2325 }, { "epoch": 0.39771272510028166, "grad_norm": 0.4509188009021153, "learning_rate": 4.819170460293374e-05, "loss": 0.4435, "step": 2330 }, { "epoch": 0.39856618588375864, "grad_norm": 0.5502530092768969, "learning_rate": 4.817589782498736e-05, "loss": 0.4188, "step": 2335 }, { "epoch": 0.3994196466672356, "grad_norm": 0.4219233186942371, "learning_rate": 4.816009104704097e-05, "loss": 0.4001, "step": 2340 }, { "epoch": 0.40027310745071265, "grad_norm": 0.3667212550368481, "learning_rate": 4.814428426909459e-05, "loss": 0.4608, "step": 2345 }, { "epoch": 0.40112656823418963, "grad_norm": 0.3789030810331988, "learning_rate": 4.812847749114821e-05, "loss": 0.4166, "step": 2350 }, { "epoch": 0.40198002901766666, "grad_norm": 0.4315833002265229, "learning_rate": 4.8112670713201825e-05, "loss": 0.4159, "step": 2355 }, { "epoch": 0.40283348980114364, "grad_norm": 0.7249790527020222, "learning_rate": 4.8096863935255436e-05, "loss": 0.4355, "step": 2360 }, { "epoch": 0.4036869505846206, "grad_norm": 0.42660249142525775, "learning_rate": 4.808105715730905e-05, "loss": 0.4069, "step": 2365 }, { "epoch": 0.40454041136809765, "grad_norm": 0.4288206520068741, "learning_rate": 4.806525037936268e-05, "loss": 0.4126, "step": 2370 }, { "epoch": 0.40539387215157463, "grad_norm": 0.339546930626178, "learning_rate": 4.804944360141629e-05, "loss": 0.4531, "step": 2375 }, { "epoch": 0.4062473329350516, "grad_norm": 0.3687401926493744, "learning_rate": 4.8033636823469906e-05, "loss": 0.4176, "step": 2380 }, { "epoch": 0.40710079371852864, "grad_norm": 0.3338162430291278, "learning_rate": 4.801783004552352e-05, "loss": 0.4047, "step": 2385 }, { "epoch": 0.4079542545020056, "grad_norm": 0.3672544509043547, "learning_rate": 4.800202326757714e-05, "loss": 0.4302, "step": 2390 }, { "epoch": 0.40880771528548265, "grad_norm": 0.4131384570567746, "learning_rate": 4.798621648963076e-05, "loss": 0.4176, "step": 2395 }, { "epoch": 0.40966117606895963, "grad_norm": 0.3506403094911692, "learning_rate": 4.797040971168437e-05, "loss": 0.4277, "step": 2400 }, { "epoch": 0.4105146368524366, "grad_norm": 0.39016621250867733, "learning_rate": 4.795460293373799e-05, "loss": 0.4176, "step": 2405 }, { "epoch": 0.41136809763591364, "grad_norm": 0.3881714486911804, "learning_rate": 4.7938796155791604e-05, "loss": 0.4375, "step": 2410 }, { "epoch": 0.4122215584193906, "grad_norm": 0.3692396699941196, "learning_rate": 4.792298937784522e-05, "loss": 0.4389, "step": 2415 }, { "epoch": 0.41307501920286765, "grad_norm": 0.33172468294439866, "learning_rate": 4.790718259989884e-05, "loss": 0.4242, "step": 2420 }, { "epoch": 0.41392847998634463, "grad_norm": 0.4413822621043275, "learning_rate": 4.7891375821952456e-05, "loss": 0.4378, "step": 2425 }, { "epoch": 0.4147819407698216, "grad_norm": 0.35983195285572867, "learning_rate": 4.7875569044006074e-05, "loss": 0.4349, "step": 2430 }, { "epoch": 0.41563540155329864, "grad_norm": 0.39402335762165913, "learning_rate": 4.785976226605969e-05, "loss": 0.4534, "step": 2435 }, { "epoch": 0.4164888623367756, "grad_norm": 0.36879420651924233, "learning_rate": 4.784395548811331e-05, "loss": 0.435, "step": 2440 }, { "epoch": 0.4173423231202526, "grad_norm": 0.3501115013072352, "learning_rate": 4.782814871016692e-05, "loss": 0.4165, "step": 2445 }, { "epoch": 0.41819578390372963, "grad_norm": 0.40952696291636287, "learning_rate": 4.7812341932220537e-05, "loss": 0.4311, "step": 2450 }, { "epoch": 0.4190492446872066, "grad_norm": 0.43572588608753965, "learning_rate": 4.7796535154274154e-05, "loss": 0.4403, "step": 2455 }, { "epoch": 0.41990270547068365, "grad_norm": 0.4415496314549699, "learning_rate": 4.778072837632777e-05, "loss": 0.4275, "step": 2460 }, { "epoch": 0.4207561662541606, "grad_norm": 0.39030807246494026, "learning_rate": 4.776492159838139e-05, "loss": 0.4277, "step": 2465 }, { "epoch": 0.4216096270376376, "grad_norm": 0.39810778226001964, "learning_rate": 4.7749114820435006e-05, "loss": 0.4458, "step": 2470 }, { "epoch": 0.42246308782111464, "grad_norm": 0.37363740892949676, "learning_rate": 4.7733308042488624e-05, "loss": 0.4243, "step": 2475 }, { "epoch": 0.4233165486045916, "grad_norm": 0.4010446802149582, "learning_rate": 4.7717501264542235e-05, "loss": 0.4431, "step": 2480 }, { "epoch": 0.4241700093880686, "grad_norm": 0.48901861140608593, "learning_rate": 4.770169448659585e-05, "loss": 0.4209, "step": 2485 }, { "epoch": 0.4250234701715456, "grad_norm": 0.34783583800274137, "learning_rate": 4.7685887708649476e-05, "loss": 0.4105, "step": 2490 }, { "epoch": 0.4258769309550226, "grad_norm": 0.3724169870778699, "learning_rate": 4.767008093070309e-05, "loss": 0.4429, "step": 2495 }, { "epoch": 0.42673039173849964, "grad_norm": 0.3976932703870188, "learning_rate": 4.7654274152756704e-05, "loss": 0.4266, "step": 2500 }, { "epoch": 0.4275838525219766, "grad_norm": 0.3376944547914455, "learning_rate": 4.763846737481032e-05, "loss": 0.4143, "step": 2505 }, { "epoch": 0.4284373133054536, "grad_norm": 0.40002662680767104, "learning_rate": 4.762266059686394e-05, "loss": 0.4382, "step": 2510 }, { "epoch": 0.4292907740889306, "grad_norm": 0.3841520412663537, "learning_rate": 4.760685381891755e-05, "loss": 0.4361, "step": 2515 }, { "epoch": 0.4301442348724076, "grad_norm": 0.5111921589827901, "learning_rate": 4.759104704097117e-05, "loss": 0.4426, "step": 2520 }, { "epoch": 0.43099769565588464, "grad_norm": 0.4182962275417889, "learning_rate": 4.757524026302479e-05, "loss": 0.4253, "step": 2525 }, { "epoch": 0.4318511564393616, "grad_norm": 0.3918186224726592, "learning_rate": 4.75594334850784e-05, "loss": 0.4375, "step": 2530 }, { "epoch": 0.4327046172228386, "grad_norm": 1.4787767318388065, "learning_rate": 4.754362670713202e-05, "loss": 0.4428, "step": 2535 }, { "epoch": 0.43355807800631563, "grad_norm": 0.5071604982171775, "learning_rate": 4.752781992918564e-05, "loss": 0.4316, "step": 2540 }, { "epoch": 0.4344115387897926, "grad_norm": 0.3962793721997279, "learning_rate": 4.7512013151239255e-05, "loss": 0.4707, "step": 2545 }, { "epoch": 0.4352649995732696, "grad_norm": 0.6763450463166941, "learning_rate": 4.7496206373292865e-05, "loss": 0.4233, "step": 2550 }, { "epoch": 0.4361184603567466, "grad_norm": 0.35868836680242033, "learning_rate": 4.748039959534649e-05, "loss": 0.4172, "step": 2555 }, { "epoch": 0.4369719211402236, "grad_norm": 0.35971414343861885, "learning_rate": 4.746459281740011e-05, "loss": 0.3954, "step": 2560 }, { "epoch": 0.43782538192370063, "grad_norm": 0.3133747307691234, "learning_rate": 4.744878603945372e-05, "loss": 0.4482, "step": 2565 }, { "epoch": 0.4386788427071776, "grad_norm": 0.3423105509660373, "learning_rate": 4.7432979261507335e-05, "loss": 0.439, "step": 2570 }, { "epoch": 0.4395323034906546, "grad_norm": 0.42385314488825626, "learning_rate": 4.741717248356095e-05, "loss": 0.4754, "step": 2575 }, { "epoch": 0.4403857642741316, "grad_norm": 0.4349117905595688, "learning_rate": 4.740136570561457e-05, "loss": 0.4512, "step": 2580 }, { "epoch": 0.4412392250576086, "grad_norm": 0.3844336130161431, "learning_rate": 4.738555892766819e-05, "loss": 0.4192, "step": 2585 }, { "epoch": 0.4420926858410856, "grad_norm": 0.38088259817670894, "learning_rate": 4.7369752149721805e-05, "loss": 0.4049, "step": 2590 }, { "epoch": 0.4429461466245626, "grad_norm": 0.3348711018475399, "learning_rate": 4.735394537177542e-05, "loss": 0.4212, "step": 2595 }, { "epoch": 0.4437996074080396, "grad_norm": 0.33182043658259974, "learning_rate": 4.733813859382903e-05, "loss": 0.4285, "step": 2600 }, { "epoch": 0.4446530681915166, "grad_norm": 0.3305134855728252, "learning_rate": 4.732233181588265e-05, "loss": 0.4144, "step": 2605 }, { "epoch": 0.4455065289749936, "grad_norm": 0.3650978997758936, "learning_rate": 4.730652503793627e-05, "loss": 0.44, "step": 2610 }, { "epoch": 0.4463599897584706, "grad_norm": 0.38040758314253204, "learning_rate": 4.7290718259989886e-05, "loss": 0.4516, "step": 2615 }, { "epoch": 0.4472134505419476, "grad_norm": 0.3894003690126016, "learning_rate": 4.72749114820435e-05, "loss": 0.419, "step": 2620 }, { "epoch": 0.4480669113254246, "grad_norm": 0.41862402018245387, "learning_rate": 4.725910470409712e-05, "loss": 0.427, "step": 2625 }, { "epoch": 0.4489203721089016, "grad_norm": 0.41091828284107657, "learning_rate": 4.724329792615074e-05, "loss": 0.4336, "step": 2630 }, { "epoch": 0.4497738328923786, "grad_norm": 0.41842332951515043, "learning_rate": 4.722749114820435e-05, "loss": 0.4218, "step": 2635 }, { "epoch": 0.4506272936758556, "grad_norm": 0.3591923226606208, "learning_rate": 4.7211684370257966e-05, "loss": 0.3959, "step": 2640 }, { "epoch": 0.4514807544593326, "grad_norm": 0.3774926215760285, "learning_rate": 4.719587759231159e-05, "loss": 0.4331, "step": 2645 }, { "epoch": 0.4523342152428096, "grad_norm": 0.4150755085824676, "learning_rate": 4.71800708143652e-05, "loss": 0.4733, "step": 2650 }, { "epoch": 0.45318767602628657, "grad_norm": 0.32672908612755236, "learning_rate": 4.716426403641882e-05, "loss": 0.4207, "step": 2655 }, { "epoch": 0.4540411368097636, "grad_norm": 0.3642112275196622, "learning_rate": 4.7148457258472436e-05, "loss": 0.4266, "step": 2660 }, { "epoch": 0.4548945975932406, "grad_norm": 0.3058524401278306, "learning_rate": 4.713265048052605e-05, "loss": 0.4386, "step": 2665 }, { "epoch": 0.4557480583767176, "grad_norm": 0.35309015570187063, "learning_rate": 4.7116843702579664e-05, "loss": 0.4292, "step": 2670 }, { "epoch": 0.4566015191601946, "grad_norm": 0.326072008119082, "learning_rate": 4.710103692463329e-05, "loss": 0.4409, "step": 2675 }, { "epoch": 0.45745497994367157, "grad_norm": 0.3988587727905436, "learning_rate": 4.7085230146686906e-05, "loss": 0.4241, "step": 2680 }, { "epoch": 0.4583084407271486, "grad_norm": 0.3608991624607888, "learning_rate": 4.7069423368740516e-05, "loss": 0.4145, "step": 2685 }, { "epoch": 0.4591619015106256, "grad_norm": 0.37087783576203853, "learning_rate": 4.7053616590794134e-05, "loss": 0.4278, "step": 2690 }, { "epoch": 0.46001536229410256, "grad_norm": 0.35529704807556706, "learning_rate": 4.703780981284775e-05, "loss": 0.4445, "step": 2695 }, { "epoch": 0.4608688230775796, "grad_norm": 0.33894674208581815, "learning_rate": 4.702200303490137e-05, "loss": 0.4091, "step": 2700 }, { "epoch": 0.46172228386105657, "grad_norm": 0.7391370325523735, "learning_rate": 4.700619625695498e-05, "loss": 0.4283, "step": 2705 }, { "epoch": 0.4625757446445336, "grad_norm": 0.39759009719773364, "learning_rate": 4.6990389479008604e-05, "loss": 0.4337, "step": 2710 }, { "epoch": 0.4634292054280106, "grad_norm": 0.34980873144126934, "learning_rate": 4.697458270106222e-05, "loss": 0.4243, "step": 2715 }, { "epoch": 0.46428266621148756, "grad_norm": 0.34008891776131894, "learning_rate": 4.695877592311583e-05, "loss": 0.462, "step": 2720 }, { "epoch": 0.4651361269949646, "grad_norm": 0.3658779141060006, "learning_rate": 4.694296914516945e-05, "loss": 0.4052, "step": 2725 }, { "epoch": 0.46598958777844157, "grad_norm": 0.34118917542071747, "learning_rate": 4.692716236722307e-05, "loss": 0.4234, "step": 2730 }, { "epoch": 0.4668430485619186, "grad_norm": 0.3504822858178377, "learning_rate": 4.6911355589276684e-05, "loss": 0.4199, "step": 2735 }, { "epoch": 0.4676965093453956, "grad_norm": 0.3143444828226461, "learning_rate": 4.68955488113303e-05, "loss": 0.416, "step": 2740 }, { "epoch": 0.46854997012887256, "grad_norm": 0.3525630489017658, "learning_rate": 4.687974203338392e-05, "loss": 0.4178, "step": 2745 }, { "epoch": 0.4694034309123496, "grad_norm": 0.3473136138686212, "learning_rate": 4.6863935255437537e-05, "loss": 0.4096, "step": 2750 }, { "epoch": 0.4702568916958266, "grad_norm": 0.38270928102197466, "learning_rate": 4.684812847749115e-05, "loss": 0.4212, "step": 2755 }, { "epoch": 0.47111035247930355, "grad_norm": 0.3999899748417905, "learning_rate": 4.6832321699544765e-05, "loss": 0.4483, "step": 2760 }, { "epoch": 0.4719638132627806, "grad_norm": 0.36650870582102457, "learning_rate": 4.681651492159838e-05, "loss": 0.4184, "step": 2765 }, { "epoch": 0.47281727404625756, "grad_norm": 0.3515470412285171, "learning_rate": 4.6800708143652e-05, "loss": 0.4469, "step": 2770 }, { "epoch": 0.4736707348297346, "grad_norm": 0.36069258473965443, "learning_rate": 4.678490136570562e-05, "loss": 0.4233, "step": 2775 }, { "epoch": 0.4745241956132116, "grad_norm": 0.41107157776648634, "learning_rate": 4.6769094587759235e-05, "loss": 0.4353, "step": 2780 }, { "epoch": 0.47537765639668855, "grad_norm": 0.3800043500577896, "learning_rate": 4.675328780981285e-05, "loss": 0.4482, "step": 2785 }, { "epoch": 0.4762311171801656, "grad_norm": 0.3481136423050304, "learning_rate": 4.673748103186646e-05, "loss": 0.4073, "step": 2790 }, { "epoch": 0.47708457796364256, "grad_norm": 0.4010395772318815, "learning_rate": 4.672167425392009e-05, "loss": 0.4257, "step": 2795 }, { "epoch": 0.47793803874711954, "grad_norm": 0.39255971862594957, "learning_rate": 4.6705867475973704e-05, "loss": 0.4418, "step": 2800 }, { "epoch": 0.4787914995305966, "grad_norm": 0.4241077693661008, "learning_rate": 4.6690060698027315e-05, "loss": 0.4288, "step": 2805 }, { "epoch": 0.47964496031407355, "grad_norm": 0.3287054386860946, "learning_rate": 4.667425392008093e-05, "loss": 0.4313, "step": 2810 }, { "epoch": 0.4804984210975506, "grad_norm": 0.342619182707535, "learning_rate": 4.665844714213455e-05, "loss": 0.4369, "step": 2815 }, { "epoch": 0.48135188188102757, "grad_norm": 0.3608455941879005, "learning_rate": 4.664264036418817e-05, "loss": 0.429, "step": 2820 }, { "epoch": 0.48220534266450454, "grad_norm": 0.3850258160450555, "learning_rate": 4.662683358624178e-05, "loss": 0.4163, "step": 2825 }, { "epoch": 0.4830588034479816, "grad_norm": 0.3596165166435102, "learning_rate": 4.66110268082954e-05, "loss": 0.4394, "step": 2830 }, { "epoch": 0.48391226423145856, "grad_norm": 0.4002417088722784, "learning_rate": 4.659522003034902e-05, "loss": 0.4578, "step": 2835 }, { "epoch": 0.4847657250149356, "grad_norm": 0.4304481505203044, "learning_rate": 4.657941325240263e-05, "loss": 0.4248, "step": 2840 }, { "epoch": 0.48561918579841257, "grad_norm": 0.3265334897128937, "learning_rate": 4.656360647445625e-05, "loss": 0.4429, "step": 2845 }, { "epoch": 0.48647264658188955, "grad_norm": 0.36843175163197933, "learning_rate": 4.6547799696509865e-05, "loss": 0.4129, "step": 2850 }, { "epoch": 0.4873261073653666, "grad_norm": 0.33566523553975053, "learning_rate": 4.653199291856348e-05, "loss": 0.4618, "step": 2855 }, { "epoch": 0.48817956814884356, "grad_norm": 0.5135871071281183, "learning_rate": 4.6516186140617094e-05, "loss": 0.4372, "step": 2860 }, { "epoch": 0.48903302893232053, "grad_norm": 0.417601059082247, "learning_rate": 4.650037936267072e-05, "loss": 0.4297, "step": 2865 }, { "epoch": 0.48988648971579757, "grad_norm": 0.42847313363481804, "learning_rate": 4.6484572584724335e-05, "loss": 0.4114, "step": 2870 }, { "epoch": 0.49073995049927455, "grad_norm": 0.38626175540473634, "learning_rate": 4.6468765806777946e-05, "loss": 0.4524, "step": 2875 }, { "epoch": 0.4915934112827516, "grad_norm": 0.5278745650094537, "learning_rate": 4.645295902883156e-05, "loss": 0.4465, "step": 2880 }, { "epoch": 0.49244687206622856, "grad_norm": 0.327960555576621, "learning_rate": 4.643715225088518e-05, "loss": 0.42, "step": 2885 }, { "epoch": 0.49330033284970554, "grad_norm": 0.49200145037609316, "learning_rate": 4.64213454729388e-05, "loss": 0.4015, "step": 2890 }, { "epoch": 0.49415379363318257, "grad_norm": 0.3371912570430401, "learning_rate": 4.6405538694992416e-05, "loss": 0.3996, "step": 2895 }, { "epoch": 0.49500725441665955, "grad_norm": 0.3458584461853983, "learning_rate": 4.638973191704603e-05, "loss": 0.4073, "step": 2900 }, { "epoch": 0.4958607152001366, "grad_norm": 0.41545883303437997, "learning_rate": 4.637392513909965e-05, "loss": 0.4502, "step": 2905 }, { "epoch": 0.49671417598361356, "grad_norm": 0.5320520076527305, "learning_rate": 4.635811836115326e-05, "loss": 0.4241, "step": 2910 }, { "epoch": 0.49756763676709054, "grad_norm": 0.5737463824028761, "learning_rate": 4.634231158320688e-05, "loss": 0.4161, "step": 2915 }, { "epoch": 0.49842109755056757, "grad_norm": 0.3638125747690571, "learning_rate": 4.6326504805260496e-05, "loss": 0.4206, "step": 2920 }, { "epoch": 0.49927455833404455, "grad_norm": 0.38744246769660445, "learning_rate": 4.6310698027314114e-05, "loss": 0.4167, "step": 2925 }, { "epoch": 0.5001280191175216, "grad_norm": 0.4412989765332016, "learning_rate": 4.629489124936773e-05, "loss": 0.4243, "step": 2930 }, { "epoch": 0.5009814799009985, "grad_norm": 0.3405690314164637, "learning_rate": 4.627908447142135e-05, "loss": 0.4132, "step": 2935 }, { "epoch": 0.5018349406844755, "grad_norm": 0.4263566128709542, "learning_rate": 4.6263277693474966e-05, "loss": 0.4193, "step": 2940 }, { "epoch": 0.5026884014679526, "grad_norm": 0.3562721526546993, "learning_rate": 4.624747091552858e-05, "loss": 0.4303, "step": 2945 }, { "epoch": 0.5035418622514295, "grad_norm": 0.4267224290515076, "learning_rate": 4.62316641375822e-05, "loss": 0.4333, "step": 2950 }, { "epoch": 0.5043953230349065, "grad_norm": 0.4636164808451784, "learning_rate": 4.621585735963581e-05, "loss": 0.4489, "step": 2955 }, { "epoch": 0.5052487838183836, "grad_norm": 0.34740664492254425, "learning_rate": 4.620005058168943e-05, "loss": 0.3923, "step": 2960 }, { "epoch": 0.5061022446018606, "grad_norm": 0.2898757036455138, "learning_rate": 4.6184243803743047e-05, "loss": 0.3933, "step": 2965 }, { "epoch": 0.5069557053853375, "grad_norm": 0.3679472806997201, "learning_rate": 4.6168437025796664e-05, "loss": 0.4141, "step": 2970 }, { "epoch": 0.5078091661688146, "grad_norm": 0.3413289815209779, "learning_rate": 4.615263024785028e-05, "loss": 0.436, "step": 2975 }, { "epoch": 0.5086626269522916, "grad_norm": 0.34500769794315095, "learning_rate": 4.613682346990389e-05, "loss": 0.4499, "step": 2980 }, { "epoch": 0.5095160877357685, "grad_norm": 0.3729624872350033, "learning_rate": 4.6121016691957516e-05, "loss": 0.4421, "step": 2985 }, { "epoch": 0.5103695485192455, "grad_norm": 0.3804546252912562, "learning_rate": 4.6105209914011134e-05, "loss": 0.4086, "step": 2990 }, { "epoch": 0.5112230093027226, "grad_norm": 0.36273422390513926, "learning_rate": 4.6089403136064745e-05, "loss": 0.442, "step": 2995 }, { "epoch": 0.5120764700861995, "grad_norm": 0.42930178527357343, "learning_rate": 4.607359635811836e-05, "loss": 0.455, "step": 3000 }, { "epoch": 0.5129299308696765, "grad_norm": 0.34585562572931067, "learning_rate": 4.605778958017198e-05, "loss": 0.4186, "step": 3005 }, { "epoch": 0.5137833916531536, "grad_norm": 0.4579335161155005, "learning_rate": 4.60419828022256e-05, "loss": 0.4116, "step": 3010 }, { "epoch": 0.5146368524366305, "grad_norm": 0.35282465802575447, "learning_rate": 4.602617602427921e-05, "loss": 0.3965, "step": 3015 }, { "epoch": 0.5154903132201075, "grad_norm": 0.45753255957244465, "learning_rate": 4.601036924633283e-05, "loss": 0.4264, "step": 3020 }, { "epoch": 0.5163437740035846, "grad_norm": 0.362241626552743, "learning_rate": 4.599456246838645e-05, "loss": 0.4172, "step": 3025 }, { "epoch": 0.5171972347870616, "grad_norm": 0.35350281155426677, "learning_rate": 4.597875569044006e-05, "loss": 0.4056, "step": 3030 }, { "epoch": 0.5180506955705385, "grad_norm": 0.3463825155662528, "learning_rate": 4.596294891249368e-05, "loss": 0.4186, "step": 3035 }, { "epoch": 0.5189041563540155, "grad_norm": 0.4025227745528952, "learning_rate": 4.5947142134547295e-05, "loss": 0.4178, "step": 3040 }, { "epoch": 0.5197576171374926, "grad_norm": 0.3870775358159716, "learning_rate": 4.593133535660091e-05, "loss": 0.4612, "step": 3045 }, { "epoch": 0.5206110779209695, "grad_norm": 0.3404867705570577, "learning_rate": 4.591552857865453e-05, "loss": 0.4261, "step": 3050 }, { "epoch": 0.5214645387044465, "grad_norm": 0.41094475023370386, "learning_rate": 4.589972180070815e-05, "loss": 0.4157, "step": 3055 }, { "epoch": 0.5223179994879236, "grad_norm": 0.3321817858795556, "learning_rate": 4.5883915022761765e-05, "loss": 0.4085, "step": 3060 }, { "epoch": 0.5231714602714005, "grad_norm": 0.33273981602539543, "learning_rate": 4.5868108244815375e-05, "loss": 0.425, "step": 3065 }, { "epoch": 0.5240249210548775, "grad_norm": 0.35220055553381097, "learning_rate": 4.5852301466869e-05, "loss": 0.4145, "step": 3070 }, { "epoch": 0.5248783818383546, "grad_norm": 0.3648031440330624, "learning_rate": 4.583649468892261e-05, "loss": 0.4132, "step": 3075 }, { "epoch": 0.5257318426218315, "grad_norm": 0.3480357357209367, "learning_rate": 4.582068791097623e-05, "loss": 0.427, "step": 3080 }, { "epoch": 0.5265853034053085, "grad_norm": 0.4165386739467003, "learning_rate": 4.5804881133029845e-05, "loss": 0.4394, "step": 3085 }, { "epoch": 0.5274387641887855, "grad_norm": 0.3964331674237493, "learning_rate": 4.578907435508346e-05, "loss": 0.4261, "step": 3090 }, { "epoch": 0.5282922249722625, "grad_norm": 0.31195730527142, "learning_rate": 4.577326757713708e-05, "loss": 0.4064, "step": 3095 }, { "epoch": 0.5291456857557395, "grad_norm": 0.34836814990393034, "learning_rate": 4.575746079919069e-05, "loss": 0.4359, "step": 3100 }, { "epoch": 0.5299991465392165, "grad_norm": 0.3511875565778794, "learning_rate": 4.5741654021244315e-05, "loss": 0.4347, "step": 3105 }, { "epoch": 0.5308526073226936, "grad_norm": 0.32578987806286847, "learning_rate": 4.5725847243297926e-05, "loss": 0.4216, "step": 3110 }, { "epoch": 0.5317060681061705, "grad_norm": 0.29950525524223, "learning_rate": 4.571004046535154e-05, "loss": 0.4056, "step": 3115 }, { "epoch": 0.5325595288896475, "grad_norm": 0.3605235034244537, "learning_rate": 4.569423368740516e-05, "loss": 0.4263, "step": 3120 }, { "epoch": 0.5334129896731246, "grad_norm": 0.35670311989633496, "learning_rate": 4.567842690945878e-05, "loss": 0.4209, "step": 3125 }, { "epoch": 0.5342664504566015, "grad_norm": 0.3082514441000325, "learning_rate": 4.5662620131512396e-05, "loss": 0.4277, "step": 3130 }, { "epoch": 0.5351199112400785, "grad_norm": 0.4231515944592346, "learning_rate": 4.5646813353566006e-05, "loss": 0.435, "step": 3135 }, { "epoch": 0.5359733720235555, "grad_norm": 0.37930074533599945, "learning_rate": 4.563100657561963e-05, "loss": 0.3907, "step": 3140 }, { "epoch": 0.5368268328070325, "grad_norm": 0.3496290820350017, "learning_rate": 4.561519979767325e-05, "loss": 0.4275, "step": 3145 }, { "epoch": 0.5376802935905095, "grad_norm": 0.3596744882927907, "learning_rate": 4.559939301972686e-05, "loss": 0.4058, "step": 3150 }, { "epoch": 0.5385337543739865, "grad_norm": 0.3031468820643947, "learning_rate": 4.5583586241780476e-05, "loss": 0.397, "step": 3155 }, { "epoch": 0.5393872151574635, "grad_norm": 0.3333987950832821, "learning_rate": 4.5567779463834094e-05, "loss": 0.4141, "step": 3160 }, { "epoch": 0.5402406759409405, "grad_norm": 0.37952765286137463, "learning_rate": 4.555197268588771e-05, "loss": 0.4292, "step": 3165 }, { "epoch": 0.5410941367244175, "grad_norm": 0.33433744037443414, "learning_rate": 4.553616590794132e-05, "loss": 0.4453, "step": 3170 }, { "epoch": 0.5419475975078946, "grad_norm": 0.3371342121594201, "learning_rate": 4.5520359129994946e-05, "loss": 0.4175, "step": 3175 }, { "epoch": 0.5428010582913715, "grad_norm": 0.3313917403472154, "learning_rate": 4.5504552352048563e-05, "loss": 0.4062, "step": 3180 }, { "epoch": 0.5436545190748485, "grad_norm": 0.32820588361025493, "learning_rate": 4.5488745574102174e-05, "loss": 0.4252, "step": 3185 }, { "epoch": 0.5445079798583256, "grad_norm": 0.3456217354410609, "learning_rate": 4.54729387961558e-05, "loss": 0.4333, "step": 3190 }, { "epoch": 0.5453614406418025, "grad_norm": 0.38316823658470084, "learning_rate": 4.545713201820941e-05, "loss": 0.4375, "step": 3195 }, { "epoch": 0.5462149014252795, "grad_norm": 0.3395049639306009, "learning_rate": 4.5441325240263026e-05, "loss": 0.4103, "step": 3200 }, { "epoch": 0.5470683622087565, "grad_norm": 0.3390891377318112, "learning_rate": 4.5425518462316644e-05, "loss": 0.4241, "step": 3205 }, { "epoch": 0.5479218229922335, "grad_norm": 0.4022781408391875, "learning_rate": 4.540971168437026e-05, "loss": 0.4649, "step": 3210 }, { "epoch": 0.5487752837757105, "grad_norm": 0.34508167177478977, "learning_rate": 4.539390490642388e-05, "loss": 0.4378, "step": 3215 }, { "epoch": 0.5496287445591875, "grad_norm": 0.35226298255447536, "learning_rate": 4.537809812847749e-05, "loss": 0.4005, "step": 3220 }, { "epoch": 0.5504822053426645, "grad_norm": 0.304769513536871, "learning_rate": 4.5362291350531114e-05, "loss": 0.4415, "step": 3225 }, { "epoch": 0.5513356661261415, "grad_norm": 0.46758100795024804, "learning_rate": 4.5346484572584724e-05, "loss": 0.4213, "step": 3230 }, { "epoch": 0.5521891269096185, "grad_norm": 0.37922136841319765, "learning_rate": 4.533067779463834e-05, "loss": 0.4216, "step": 3235 }, { "epoch": 0.5530425876930956, "grad_norm": 0.6015722322932944, "learning_rate": 4.531487101669196e-05, "loss": 0.4276, "step": 3240 }, { "epoch": 0.5538960484765725, "grad_norm": 0.34557591278443023, "learning_rate": 4.529906423874558e-05, "loss": 0.4023, "step": 3245 }, { "epoch": 0.5547495092600495, "grad_norm": 0.37176327513462537, "learning_rate": 4.5283257460799194e-05, "loss": 0.4103, "step": 3250 }, { "epoch": 0.5556029700435265, "grad_norm": 0.3815299588499549, "learning_rate": 4.5267450682852805e-05, "loss": 0.4345, "step": 3255 }, { "epoch": 0.5564564308270035, "grad_norm": 0.34611998356001517, "learning_rate": 4.525164390490643e-05, "loss": 0.404, "step": 3260 }, { "epoch": 0.5573098916104805, "grad_norm": 0.36537698149746106, "learning_rate": 4.523583712696004e-05, "loss": 0.4281, "step": 3265 }, { "epoch": 0.5581633523939575, "grad_norm": 0.4999308997185186, "learning_rate": 4.522003034901366e-05, "loss": 0.4243, "step": 3270 }, { "epoch": 0.5590168131774345, "grad_norm": 0.46118530836367416, "learning_rate": 4.5204223571067275e-05, "loss": 0.4001, "step": 3275 }, { "epoch": 0.5598702739609115, "grad_norm": 0.3532082387268381, "learning_rate": 4.518841679312089e-05, "loss": 0.4254, "step": 3280 }, { "epoch": 0.5607237347443885, "grad_norm": 0.34471741074761086, "learning_rate": 4.517261001517451e-05, "loss": 0.4084, "step": 3285 }, { "epoch": 0.5615771955278654, "grad_norm": 0.5519218337611455, "learning_rate": 4.515680323722812e-05, "loss": 0.4237, "step": 3290 }, { "epoch": 0.5624306563113425, "grad_norm": 0.3871921532779645, "learning_rate": 4.5140996459281745e-05, "loss": 0.3952, "step": 3295 }, { "epoch": 0.5632841170948195, "grad_norm": 0.3187398638923619, "learning_rate": 4.512518968133536e-05, "loss": 0.4117, "step": 3300 }, { "epoch": 0.5641375778782965, "grad_norm": 0.4286555357435917, "learning_rate": 4.510938290338897e-05, "loss": 0.4318, "step": 3305 }, { "epoch": 0.5649910386617735, "grad_norm": 0.3251784060436212, "learning_rate": 4.50935761254426e-05, "loss": 0.4304, "step": 3310 }, { "epoch": 0.5658444994452505, "grad_norm": 0.3656416472225787, "learning_rate": 4.507776934749621e-05, "loss": 0.4117, "step": 3315 }, { "epoch": 0.5666979602287275, "grad_norm": 0.38458996545333124, "learning_rate": 4.5061962569549825e-05, "loss": 0.4238, "step": 3320 }, { "epoch": 0.5675514210122045, "grad_norm": 0.43465361869178587, "learning_rate": 4.504615579160344e-05, "loss": 0.4512, "step": 3325 }, { "epoch": 0.5684048817956815, "grad_norm": 0.3120082200845649, "learning_rate": 4.503034901365706e-05, "loss": 0.3914, "step": 3330 }, { "epoch": 0.5692583425791585, "grad_norm": 0.30129858577421115, "learning_rate": 4.501454223571068e-05, "loss": 0.4186, "step": 3335 }, { "epoch": 0.5701118033626354, "grad_norm": 0.3629248911820029, "learning_rate": 4.499873545776429e-05, "loss": 0.4004, "step": 3340 }, { "epoch": 0.5709652641461125, "grad_norm": 0.3201188898847056, "learning_rate": 4.498292867981791e-05, "loss": 0.4245, "step": 3345 }, { "epoch": 0.5718187249295895, "grad_norm": 0.4098024386543257, "learning_rate": 4.496712190187152e-05, "loss": 0.4104, "step": 3350 }, { "epoch": 0.5726721857130664, "grad_norm": 0.4632809462034015, "learning_rate": 4.495131512392514e-05, "loss": 0.4348, "step": 3355 }, { "epoch": 0.5735256464965435, "grad_norm": 0.36701247276573584, "learning_rate": 4.493550834597876e-05, "loss": 0.4048, "step": 3360 }, { "epoch": 0.5743791072800205, "grad_norm": 0.3940909829090357, "learning_rate": 4.4919701568032375e-05, "loss": 0.4059, "step": 3365 }, { "epoch": 0.5752325680634974, "grad_norm": 0.34345808553107876, "learning_rate": 4.490389479008599e-05, "loss": 0.4117, "step": 3370 }, { "epoch": 0.5760860288469745, "grad_norm": 0.35787245876127344, "learning_rate": 4.4888088012139604e-05, "loss": 0.4237, "step": 3375 }, { "epoch": 0.5769394896304515, "grad_norm": 0.3457313845266693, "learning_rate": 4.487228123419323e-05, "loss": 0.4275, "step": 3380 }, { "epoch": 0.5777929504139285, "grad_norm": 0.285187276369896, "learning_rate": 4.485647445624684e-05, "loss": 0.4034, "step": 3385 }, { "epoch": 0.5786464111974055, "grad_norm": 0.3364699377043636, "learning_rate": 4.4840667678300456e-05, "loss": 0.426, "step": 3390 }, { "epoch": 0.5794998719808825, "grad_norm": 0.3531737067131605, "learning_rate": 4.4824860900354073e-05, "loss": 0.4315, "step": 3395 }, { "epoch": 0.5803533327643595, "grad_norm": 0.3511636661908256, "learning_rate": 4.480905412240769e-05, "loss": 0.3848, "step": 3400 }, { "epoch": 0.5812067935478364, "grad_norm": 0.33523930068578517, "learning_rate": 4.479324734446131e-05, "loss": 0.3862, "step": 3405 }, { "epoch": 0.5820602543313135, "grad_norm": 0.35322054073930176, "learning_rate": 4.477744056651492e-05, "loss": 0.4113, "step": 3410 }, { "epoch": 0.5829137151147905, "grad_norm": 0.34549631607138825, "learning_rate": 4.476163378856854e-05, "loss": 0.4243, "step": 3415 }, { "epoch": 0.5837671758982674, "grad_norm": 0.32167709994047033, "learning_rate": 4.4745827010622154e-05, "loss": 0.4331, "step": 3420 }, { "epoch": 0.5846206366817445, "grad_norm": 0.3416086987115391, "learning_rate": 4.473002023267577e-05, "loss": 0.4171, "step": 3425 }, { "epoch": 0.5854740974652215, "grad_norm": 0.36401794114734276, "learning_rate": 4.4714213454729396e-05, "loss": 0.4388, "step": 3430 }, { "epoch": 0.5863275582486984, "grad_norm": 0.3369686272565596, "learning_rate": 4.4698406676783006e-05, "loss": 0.4022, "step": 3435 }, { "epoch": 0.5871810190321755, "grad_norm": 0.41614728647528515, "learning_rate": 4.4682599898836624e-05, "loss": 0.4086, "step": 3440 }, { "epoch": 0.5880344798156525, "grad_norm": 0.35099686869905117, "learning_rate": 4.466679312089024e-05, "loss": 0.426, "step": 3445 }, { "epoch": 0.5888879405991295, "grad_norm": 0.35327709956173076, "learning_rate": 4.465098634294386e-05, "loss": 0.4185, "step": 3450 }, { "epoch": 0.5897414013826064, "grad_norm": 0.45831627901950156, "learning_rate": 4.4635179564997476e-05, "loss": 0.4034, "step": 3455 }, { "epoch": 0.5905948621660835, "grad_norm": 0.3238308525327094, "learning_rate": 4.461937278705109e-05, "loss": 0.4182, "step": 3460 }, { "epoch": 0.5914483229495605, "grad_norm": 0.3163627491879537, "learning_rate": 4.460356600910471e-05, "loss": 0.3961, "step": 3465 }, { "epoch": 0.5923017837330374, "grad_norm": 0.36267125224971497, "learning_rate": 4.458775923115832e-05, "loss": 0.3974, "step": 3470 }, { "epoch": 0.5931552445165145, "grad_norm": 0.39254003018747385, "learning_rate": 4.457195245321194e-05, "loss": 0.3859, "step": 3475 }, { "epoch": 0.5940087052999915, "grad_norm": 0.41382442783808865, "learning_rate": 4.455614567526556e-05, "loss": 0.4483, "step": 3480 }, { "epoch": 0.5948621660834684, "grad_norm": 0.3404919316416257, "learning_rate": 4.4540338897319174e-05, "loss": 0.4093, "step": 3485 }, { "epoch": 0.5957156268669455, "grad_norm": 0.43439474310608645, "learning_rate": 4.452453211937279e-05, "loss": 0.4501, "step": 3490 }, { "epoch": 0.5965690876504225, "grad_norm": 0.3770641883569541, "learning_rate": 4.45087253414264e-05, "loss": 0.4116, "step": 3495 }, { "epoch": 0.5974225484338994, "grad_norm": 0.3793883387776861, "learning_rate": 4.4492918563480026e-05, "loss": 0.4175, "step": 3500 }, { "epoch": 0.5982760092173764, "grad_norm": 0.3365955331173776, "learning_rate": 4.447711178553364e-05, "loss": 0.4107, "step": 3505 }, { "epoch": 0.5991294700008535, "grad_norm": 0.35634026468805763, "learning_rate": 4.4461305007587255e-05, "loss": 0.4044, "step": 3510 }, { "epoch": 0.5999829307843305, "grad_norm": 0.31580257253442656, "learning_rate": 4.444549822964087e-05, "loss": 0.3901, "step": 3515 }, { "epoch": 0.6008363915678074, "grad_norm": 0.30495539262318544, "learning_rate": 4.442969145169449e-05, "loss": 0.3835, "step": 3520 }, { "epoch": 0.6016898523512845, "grad_norm": 0.3852862154222247, "learning_rate": 4.441388467374811e-05, "loss": 0.393, "step": 3525 }, { "epoch": 0.6025433131347615, "grad_norm": 0.39907040148424067, "learning_rate": 4.439807789580172e-05, "loss": 0.3932, "step": 3530 }, { "epoch": 0.6033967739182384, "grad_norm": 0.3716766710119807, "learning_rate": 4.438227111785534e-05, "loss": 0.446, "step": 3535 }, { "epoch": 0.6042502347017155, "grad_norm": 0.3460911917872473, "learning_rate": 4.436646433990895e-05, "loss": 0.4045, "step": 3540 }, { "epoch": 0.6051036954851925, "grad_norm": 0.334176957733884, "learning_rate": 4.435065756196257e-05, "loss": 0.4313, "step": 3545 }, { "epoch": 0.6059571562686694, "grad_norm": 0.2797381062480785, "learning_rate": 4.433485078401619e-05, "loss": 0.4123, "step": 3550 }, { "epoch": 0.6068106170521465, "grad_norm": 0.38199316709133085, "learning_rate": 4.4319044006069805e-05, "loss": 0.4104, "step": 3555 }, { "epoch": 0.6076640778356235, "grad_norm": 0.32986283405219846, "learning_rate": 4.430323722812342e-05, "loss": 0.415, "step": 3560 }, { "epoch": 0.6085175386191004, "grad_norm": 0.360634076237603, "learning_rate": 4.428743045017704e-05, "loss": 0.4394, "step": 3565 }, { "epoch": 0.6093709994025774, "grad_norm": 0.4065089251205816, "learning_rate": 4.427162367223066e-05, "loss": 0.4287, "step": 3570 }, { "epoch": 0.6102244601860545, "grad_norm": 0.35344913168438225, "learning_rate": 4.425581689428427e-05, "loss": 0.4268, "step": 3575 }, { "epoch": 0.6110779209695314, "grad_norm": 0.31480443503205546, "learning_rate": 4.4240010116337885e-05, "loss": 0.3836, "step": 3580 }, { "epoch": 0.6119313817530084, "grad_norm": 0.2892148599495086, "learning_rate": 4.422420333839151e-05, "loss": 0.4033, "step": 3585 }, { "epoch": 0.6127848425364855, "grad_norm": 0.33230625960002347, "learning_rate": 4.420839656044512e-05, "loss": 0.4147, "step": 3590 }, { "epoch": 0.6136383033199625, "grad_norm": 0.31874905851821783, "learning_rate": 4.419258978249874e-05, "loss": 0.4117, "step": 3595 }, { "epoch": 0.6144917641034394, "grad_norm": 0.3233327656543225, "learning_rate": 4.4176783004552355e-05, "loss": 0.4319, "step": 3600 }, { "epoch": 0.6153452248869165, "grad_norm": 0.3119198531055476, "learning_rate": 4.416097622660597e-05, "loss": 0.4185, "step": 3605 }, { "epoch": 0.6161986856703935, "grad_norm": 0.4835951307210119, "learning_rate": 4.4145169448659583e-05, "loss": 0.4213, "step": 3610 }, { "epoch": 0.6170521464538704, "grad_norm": 0.34130196896831533, "learning_rate": 4.41293626707132e-05, "loss": 0.4502, "step": 3615 }, { "epoch": 0.6179056072373474, "grad_norm": 0.40716644700680066, "learning_rate": 4.4113555892766825e-05, "loss": 0.4067, "step": 3620 }, { "epoch": 0.6187590680208245, "grad_norm": 0.3249373985212781, "learning_rate": 4.4097749114820436e-05, "loss": 0.4507, "step": 3625 }, { "epoch": 0.6196125288043014, "grad_norm": 0.35755744845437554, "learning_rate": 4.408194233687405e-05, "loss": 0.3978, "step": 3630 }, { "epoch": 0.6204659895877784, "grad_norm": 0.34565920071119, "learning_rate": 4.406613555892767e-05, "loss": 0.427, "step": 3635 }, { "epoch": 0.6213194503712555, "grad_norm": 0.4579481832443601, "learning_rate": 4.405032878098129e-05, "loss": 0.4083, "step": 3640 }, { "epoch": 0.6221729111547324, "grad_norm": 0.32932172982179964, "learning_rate": 4.4034522003034906e-05, "loss": 0.4316, "step": 3645 }, { "epoch": 0.6230263719382094, "grad_norm": 0.3676201919324164, "learning_rate": 4.4018715225088516e-05, "loss": 0.4289, "step": 3650 }, { "epoch": 0.6238798327216865, "grad_norm": 0.3038817607723838, "learning_rate": 4.400290844714214e-05, "loss": 0.4134, "step": 3655 }, { "epoch": 0.6247332935051635, "grad_norm": 0.36562781789769006, "learning_rate": 4.398710166919575e-05, "loss": 0.425, "step": 3660 }, { "epoch": 0.6255867542886404, "grad_norm": 0.29927354954878904, "learning_rate": 4.397129489124937e-05, "loss": 0.4103, "step": 3665 }, { "epoch": 0.6264402150721174, "grad_norm": 0.34697854940816675, "learning_rate": 4.3955488113302986e-05, "loss": 0.4336, "step": 3670 }, { "epoch": 0.6272936758555945, "grad_norm": 0.3870308034689782, "learning_rate": 4.3939681335356604e-05, "loss": 0.4148, "step": 3675 }, { "epoch": 0.6281471366390714, "grad_norm": 0.3736743897484259, "learning_rate": 4.392387455741022e-05, "loss": 0.3824, "step": 3680 }, { "epoch": 0.6290005974225484, "grad_norm": 0.32714989680704865, "learning_rate": 4.390806777946383e-05, "loss": 0.3949, "step": 3685 }, { "epoch": 0.6298540582060255, "grad_norm": 0.36294294756749157, "learning_rate": 4.3892261001517456e-05, "loss": 0.428, "step": 3690 }, { "epoch": 0.6307075189895024, "grad_norm": 0.36435366438786687, "learning_rate": 4.387645422357107e-05, "loss": 0.4141, "step": 3695 }, { "epoch": 0.6315609797729794, "grad_norm": 0.32591824341151626, "learning_rate": 4.3860647445624684e-05, "loss": 0.4132, "step": 3700 }, { "epoch": 0.6324144405564565, "grad_norm": 0.30136806075160766, "learning_rate": 4.384484066767831e-05, "loss": 0.4224, "step": 3705 }, { "epoch": 0.6332679013399334, "grad_norm": 0.297042793479401, "learning_rate": 4.382903388973192e-05, "loss": 0.389, "step": 3710 }, { "epoch": 0.6341213621234104, "grad_norm": 0.2923364322466672, "learning_rate": 4.3813227111785537e-05, "loss": 0.4156, "step": 3715 }, { "epoch": 0.6349748229068874, "grad_norm": 0.3322237922153755, "learning_rate": 4.3797420333839154e-05, "loss": 0.4157, "step": 3720 }, { "epoch": 0.6358282836903645, "grad_norm": 0.3364568957429455, "learning_rate": 4.378161355589277e-05, "loss": 0.4025, "step": 3725 }, { "epoch": 0.6366817444738414, "grad_norm": 0.3333261511123634, "learning_rate": 4.376580677794638e-05, "loss": 0.4121, "step": 3730 }, { "epoch": 0.6375352052573184, "grad_norm": 0.384825451270811, "learning_rate": 4.375e-05, "loss": 0.4325, "step": 3735 }, { "epoch": 0.6383886660407955, "grad_norm": 0.38406007829714905, "learning_rate": 4.3734193222053624e-05, "loss": 0.4364, "step": 3740 }, { "epoch": 0.6392421268242724, "grad_norm": 0.3445710370839018, "learning_rate": 4.3718386444107234e-05, "loss": 0.4127, "step": 3745 }, { "epoch": 0.6400955876077494, "grad_norm": 0.312358588941458, "learning_rate": 4.370257966616085e-05, "loss": 0.4328, "step": 3750 }, { "epoch": 0.6409490483912265, "grad_norm": 0.37137786496806946, "learning_rate": 4.368677288821447e-05, "loss": 0.4163, "step": 3755 }, { "epoch": 0.6418025091747034, "grad_norm": 0.4233104019688607, "learning_rate": 4.367096611026809e-05, "loss": 0.4358, "step": 3760 }, { "epoch": 0.6426559699581804, "grad_norm": 0.2931999184742935, "learning_rate": 4.36551593323217e-05, "loss": 0.387, "step": 3765 }, { "epoch": 0.6435094307416575, "grad_norm": 0.3124397886650001, "learning_rate": 4.3639352554375315e-05, "loss": 0.4041, "step": 3770 }, { "epoch": 0.6443628915251344, "grad_norm": 0.33622078070269973, "learning_rate": 4.362354577642894e-05, "loss": 0.4219, "step": 3775 }, { "epoch": 0.6452163523086114, "grad_norm": 0.37927205274065084, "learning_rate": 4.360773899848255e-05, "loss": 0.4331, "step": 3780 }, { "epoch": 0.6460698130920884, "grad_norm": 0.3323232309615696, "learning_rate": 4.359193222053617e-05, "loss": 0.4118, "step": 3785 }, { "epoch": 0.6469232738755654, "grad_norm": 0.32514719172273243, "learning_rate": 4.3576125442589785e-05, "loss": 0.39, "step": 3790 }, { "epoch": 0.6477767346590424, "grad_norm": 0.40409285288550345, "learning_rate": 4.35603186646434e-05, "loss": 0.4303, "step": 3795 }, { "epoch": 0.6486301954425194, "grad_norm": 0.38382978319020006, "learning_rate": 4.354451188669702e-05, "loss": 0.4049, "step": 3800 }, { "epoch": 0.6494836562259965, "grad_norm": 0.4612987262812354, "learning_rate": 4.352870510875063e-05, "loss": 0.4151, "step": 3805 }, { "epoch": 0.6503371170094734, "grad_norm": 0.3535574719478506, "learning_rate": 4.3512898330804255e-05, "loss": 0.4196, "step": 3810 }, { "epoch": 0.6511905777929504, "grad_norm": 0.38335467911234017, "learning_rate": 4.3497091552857865e-05, "loss": 0.4218, "step": 3815 }, { "epoch": 0.6520440385764275, "grad_norm": 0.44679361979359805, "learning_rate": 4.348128477491148e-05, "loss": 0.3824, "step": 3820 }, { "epoch": 0.6528974993599044, "grad_norm": 0.34768090457622186, "learning_rate": 4.34654779969651e-05, "loss": 0.4171, "step": 3825 }, { "epoch": 0.6537509601433814, "grad_norm": 0.36247237706504093, "learning_rate": 4.344967121901872e-05, "loss": 0.4473, "step": 3830 }, { "epoch": 0.6546044209268584, "grad_norm": 0.30573563764653355, "learning_rate": 4.3433864441072335e-05, "loss": 0.3838, "step": 3835 }, { "epoch": 0.6554578817103354, "grad_norm": 0.407798017437456, "learning_rate": 4.341805766312595e-05, "loss": 0.3883, "step": 3840 }, { "epoch": 0.6563113424938124, "grad_norm": 0.28237422202256923, "learning_rate": 4.340225088517957e-05, "loss": 0.4096, "step": 3845 }, { "epoch": 0.6571648032772894, "grad_norm": 0.3213375663363969, "learning_rate": 4.338644410723318e-05, "loss": 0.4304, "step": 3850 }, { "epoch": 0.6580182640607664, "grad_norm": 0.36533135445836873, "learning_rate": 4.33706373292868e-05, "loss": 0.4134, "step": 3855 }, { "epoch": 0.6588717248442434, "grad_norm": 0.3806299844003694, "learning_rate": 4.335483055134042e-05, "loss": 0.4559, "step": 3860 }, { "epoch": 0.6597251856277204, "grad_norm": 0.4585089599143678, "learning_rate": 4.333902377339403e-05, "loss": 0.4026, "step": 3865 }, { "epoch": 0.6605786464111975, "grad_norm": 0.35141433751411755, "learning_rate": 4.332321699544765e-05, "loss": 0.4193, "step": 3870 }, { "epoch": 0.6614321071946744, "grad_norm": 0.37589499256633796, "learning_rate": 4.330741021750127e-05, "loss": 0.4026, "step": 3875 }, { "epoch": 0.6622855679781514, "grad_norm": 0.3718598774676493, "learning_rate": 4.3291603439554885e-05, "loss": 0.419, "step": 3880 }, { "epoch": 0.6631390287616284, "grad_norm": 0.4031826663620715, "learning_rate": 4.3275796661608496e-05, "loss": 0.4037, "step": 3885 }, { "epoch": 0.6639924895451054, "grad_norm": 0.3373529760339809, "learning_rate": 4.3259989883662114e-05, "loss": 0.4007, "step": 3890 }, { "epoch": 0.6648459503285824, "grad_norm": 0.28592393267853616, "learning_rate": 4.324418310571574e-05, "loss": 0.4081, "step": 3895 }, { "epoch": 0.6656994111120594, "grad_norm": 0.28926724168123924, "learning_rate": 4.322837632776935e-05, "loss": 0.4027, "step": 3900 }, { "epoch": 0.6665528718955364, "grad_norm": 0.35990664287503005, "learning_rate": 4.3212569549822966e-05, "loss": 0.4159, "step": 3905 }, { "epoch": 0.6674063326790134, "grad_norm": 0.3132902091310108, "learning_rate": 4.3196762771876583e-05, "loss": 0.4255, "step": 3910 }, { "epoch": 0.6682597934624904, "grad_norm": 0.4386988020813814, "learning_rate": 4.31809559939302e-05, "loss": 0.4107, "step": 3915 }, { "epoch": 0.6691132542459673, "grad_norm": 0.3177942162950774, "learning_rate": 4.316514921598381e-05, "loss": 0.4075, "step": 3920 }, { "epoch": 0.6699667150294444, "grad_norm": 0.4082493257411119, "learning_rate": 4.314934243803743e-05, "loss": 0.4005, "step": 3925 }, { "epoch": 0.6708201758129214, "grad_norm": 0.39670339963522655, "learning_rate": 4.313353566009105e-05, "loss": 0.4184, "step": 3930 }, { "epoch": 0.6716736365963984, "grad_norm": 0.36718863070440483, "learning_rate": 4.3117728882144664e-05, "loss": 0.3975, "step": 3935 }, { "epoch": 0.6725270973798754, "grad_norm": 0.2913689394049604, "learning_rate": 4.310192210419828e-05, "loss": 0.3815, "step": 3940 }, { "epoch": 0.6733805581633524, "grad_norm": 0.332486816786956, "learning_rate": 4.30861153262519e-05, "loss": 0.4025, "step": 3945 }, { "epoch": 0.6742340189468294, "grad_norm": 0.28942240357396437, "learning_rate": 4.3070308548305516e-05, "loss": 0.4016, "step": 3950 }, { "epoch": 0.6750874797303064, "grad_norm": 0.33098205008773623, "learning_rate": 4.3054501770359134e-05, "loss": 0.4064, "step": 3955 }, { "epoch": 0.6759409405137834, "grad_norm": 0.40122281005808, "learning_rate": 4.303869499241275e-05, "loss": 0.4103, "step": 3960 }, { "epoch": 0.6767944012972604, "grad_norm": 0.32443575720477963, "learning_rate": 4.302288821446637e-05, "loss": 0.4086, "step": 3965 }, { "epoch": 0.6776478620807374, "grad_norm": 0.29406673117990434, "learning_rate": 4.300708143651998e-05, "loss": 0.3872, "step": 3970 }, { "epoch": 0.6785013228642144, "grad_norm": 0.39076445327167264, "learning_rate": 4.29912746585736e-05, "loss": 0.3892, "step": 3975 }, { "epoch": 0.6793547836476914, "grad_norm": 0.3550596081872687, "learning_rate": 4.2975467880627214e-05, "loss": 0.4105, "step": 3980 }, { "epoch": 0.6802082444311683, "grad_norm": 0.3704966374798372, "learning_rate": 4.295966110268083e-05, "loss": 0.4239, "step": 3985 }, { "epoch": 0.6810617052146454, "grad_norm": 0.31156270971027017, "learning_rate": 4.294385432473445e-05, "loss": 0.4219, "step": 3990 }, { "epoch": 0.6819151659981224, "grad_norm": 0.29769743296811885, "learning_rate": 4.292804754678807e-05, "loss": 0.4085, "step": 3995 }, { "epoch": 0.6827686267815993, "grad_norm": 0.36853460732031024, "learning_rate": 4.2912240768841684e-05, "loss": 0.4049, "step": 4000 }, { "epoch": 0.6836220875650764, "grad_norm": 0.3448236246803131, "learning_rate": 4.2896433990895295e-05, "loss": 0.4348, "step": 4005 }, { "epoch": 0.6844755483485534, "grad_norm": 0.35116421047722796, "learning_rate": 4.288062721294891e-05, "loss": 0.4076, "step": 4010 }, { "epoch": 0.6853290091320304, "grad_norm": 0.4053182890250621, "learning_rate": 4.286482043500253e-05, "loss": 0.4202, "step": 4015 }, { "epoch": 0.6861824699155074, "grad_norm": 0.3873405477221313, "learning_rate": 4.284901365705615e-05, "loss": 0.4278, "step": 4020 }, { "epoch": 0.6870359306989844, "grad_norm": 0.3204954224888714, "learning_rate": 4.2833206879109765e-05, "loss": 0.3976, "step": 4025 }, { "epoch": 0.6878893914824614, "grad_norm": 0.3360170342721195, "learning_rate": 4.281740010116338e-05, "loss": 0.3979, "step": 4030 }, { "epoch": 0.6887428522659383, "grad_norm": 0.3211330161286565, "learning_rate": 4.2801593323217e-05, "loss": 0.3929, "step": 4035 }, { "epoch": 0.6895963130494154, "grad_norm": 0.44138811285177737, "learning_rate": 4.278578654527061e-05, "loss": 0.41, "step": 4040 }, { "epoch": 0.6904497738328924, "grad_norm": 0.30776804971160837, "learning_rate": 4.276997976732423e-05, "loss": 0.3999, "step": 4045 }, { "epoch": 0.6913032346163693, "grad_norm": 0.4139367590724909, "learning_rate": 4.275417298937785e-05, "loss": 0.4241, "step": 4050 }, { "epoch": 0.6921566953998464, "grad_norm": 0.4242161201503065, "learning_rate": 4.273836621143146e-05, "loss": 0.4176, "step": 4055 }, { "epoch": 0.6930101561833234, "grad_norm": 0.3568805668493416, "learning_rate": 4.272255943348508e-05, "loss": 0.4035, "step": 4060 }, { "epoch": 0.6938636169668003, "grad_norm": 0.33150280317692044, "learning_rate": 4.27067526555387e-05, "loss": 0.4044, "step": 4065 }, { "epoch": 0.6947170777502774, "grad_norm": 0.34148177454976486, "learning_rate": 4.2690945877592315e-05, "loss": 0.4148, "step": 4070 }, { "epoch": 0.6955705385337544, "grad_norm": 0.3298415426962926, "learning_rate": 4.2675139099645926e-05, "loss": 0.3946, "step": 4075 }, { "epoch": 0.6964239993172314, "grad_norm": 0.29907274608086243, "learning_rate": 4.265933232169955e-05, "loss": 0.4067, "step": 4080 }, { "epoch": 0.6972774601007083, "grad_norm": 0.3320845853474826, "learning_rate": 4.264352554375317e-05, "loss": 0.3905, "step": 4085 }, { "epoch": 0.6981309208841854, "grad_norm": 0.345735348213086, "learning_rate": 4.262771876580678e-05, "loss": 0.4074, "step": 4090 }, { "epoch": 0.6989843816676624, "grad_norm": 0.3109114820396989, "learning_rate": 4.2611911987860396e-05, "loss": 0.436, "step": 4095 }, { "epoch": 0.6998378424511393, "grad_norm": 0.42113504325759366, "learning_rate": 4.259610520991401e-05, "loss": 0.4338, "step": 4100 }, { "epoch": 0.7006913032346164, "grad_norm": 0.3386323988579902, "learning_rate": 4.258029843196763e-05, "loss": 0.4088, "step": 4105 }, { "epoch": 0.7015447640180934, "grad_norm": 0.411205653136817, "learning_rate": 4.256449165402125e-05, "loss": 0.413, "step": 4110 }, { "epoch": 0.7023982248015703, "grad_norm": 0.36991761685019486, "learning_rate": 4.2548684876074865e-05, "loss": 0.4174, "step": 4115 }, { "epoch": 0.7032516855850474, "grad_norm": 0.3396062427824557, "learning_rate": 4.253287809812848e-05, "loss": 0.4014, "step": 4120 }, { "epoch": 0.7041051463685244, "grad_norm": 0.31805805418026606, "learning_rate": 4.2517071320182093e-05, "loss": 0.4106, "step": 4125 }, { "epoch": 0.7049586071520013, "grad_norm": 0.30359838719856674, "learning_rate": 4.250126454223571e-05, "loss": 0.4323, "step": 4130 }, { "epoch": 0.7058120679354783, "grad_norm": 0.3198119363854244, "learning_rate": 4.248545776428933e-05, "loss": 0.4074, "step": 4135 }, { "epoch": 0.7066655287189554, "grad_norm": 0.35434461102868575, "learning_rate": 4.2469650986342946e-05, "loss": 0.4004, "step": 4140 }, { "epoch": 0.7075189895024324, "grad_norm": 0.37543373493322213, "learning_rate": 4.245384420839656e-05, "loss": 0.417, "step": 4145 }, { "epoch": 0.7083724502859093, "grad_norm": 0.39100667955962454, "learning_rate": 4.243803743045018e-05, "loss": 0.3967, "step": 4150 }, { "epoch": 0.7092259110693864, "grad_norm": 0.3635638254793225, "learning_rate": 4.24222306525038e-05, "loss": 0.4299, "step": 4155 }, { "epoch": 0.7100793718528634, "grad_norm": 0.30696876417046176, "learning_rate": 4.240642387455741e-05, "loss": 0.437, "step": 4160 }, { "epoch": 0.7109328326363403, "grad_norm": 0.3086185036494573, "learning_rate": 4.2390617096611026e-05, "loss": 0.4067, "step": 4165 }, { "epoch": 0.7117862934198174, "grad_norm": 0.30835055729081945, "learning_rate": 4.2374810318664644e-05, "loss": 0.4085, "step": 4170 }, { "epoch": 0.7126397542032944, "grad_norm": 0.34618570800833687, "learning_rate": 4.235900354071826e-05, "loss": 0.3954, "step": 4175 }, { "epoch": 0.7134932149867713, "grad_norm": 0.31961221208910906, "learning_rate": 4.234319676277188e-05, "loss": 0.4105, "step": 4180 }, { "epoch": 0.7143466757702484, "grad_norm": 0.4026962141433966, "learning_rate": 4.2327389984825496e-05, "loss": 0.4074, "step": 4185 }, { "epoch": 0.7152001365537254, "grad_norm": 0.4095248840799454, "learning_rate": 4.2311583206879114e-05, "loss": 0.4015, "step": 4190 }, { "epoch": 0.7160535973372023, "grad_norm": 0.39486986368979426, "learning_rate": 4.2295776428932724e-05, "loss": 0.4546, "step": 4195 }, { "epoch": 0.7169070581206793, "grad_norm": 0.3061206534811209, "learning_rate": 4.227996965098635e-05, "loss": 0.4025, "step": 4200 }, { "epoch": 0.7177605189041564, "grad_norm": 0.369589159958681, "learning_rate": 4.2264162873039966e-05, "loss": 0.4243, "step": 4205 }, { "epoch": 0.7186139796876334, "grad_norm": 0.29667585766811777, "learning_rate": 4.224835609509358e-05, "loss": 0.3845, "step": 4210 }, { "epoch": 0.7194674404711103, "grad_norm": 0.3176745349040905, "learning_rate": 4.2232549317147194e-05, "loss": 0.3969, "step": 4215 }, { "epoch": 0.7203209012545874, "grad_norm": 0.2981952599338546, "learning_rate": 4.221674253920081e-05, "loss": 0.4025, "step": 4220 }, { "epoch": 0.7211743620380644, "grad_norm": 0.3662409873876372, "learning_rate": 4.220093576125443e-05, "loss": 0.4316, "step": 4225 }, { "epoch": 0.7220278228215413, "grad_norm": 0.3422361792090042, "learning_rate": 4.218512898330804e-05, "loss": 0.4049, "step": 4230 }, { "epoch": 0.7228812836050184, "grad_norm": 0.30462575069304965, "learning_rate": 4.2169322205361664e-05, "loss": 0.4115, "step": 4235 }, { "epoch": 0.7237347443884954, "grad_norm": 0.3503153970776218, "learning_rate": 4.215351542741528e-05, "loss": 0.4107, "step": 4240 }, { "epoch": 0.7245882051719723, "grad_norm": 0.2527201183739076, "learning_rate": 4.213770864946889e-05, "loss": 0.3978, "step": 4245 }, { "epoch": 0.7254416659554493, "grad_norm": 0.3429186035189598, "learning_rate": 4.212190187152251e-05, "loss": 0.4115, "step": 4250 }, { "epoch": 0.7262951267389264, "grad_norm": 0.3346265796344594, "learning_rate": 4.210609509357613e-05, "loss": 0.397, "step": 4255 }, { "epoch": 0.7271485875224033, "grad_norm": 0.33615817964375727, "learning_rate": 4.2090288315629745e-05, "loss": 0.4303, "step": 4260 }, { "epoch": 0.7280020483058803, "grad_norm": 0.3090982064278139, "learning_rate": 4.2074481537683355e-05, "loss": 0.4025, "step": 4265 }, { "epoch": 0.7288555090893574, "grad_norm": 0.32289637813678346, "learning_rate": 4.205867475973698e-05, "loss": 0.4153, "step": 4270 }, { "epoch": 0.7297089698728343, "grad_norm": 0.2909560288292368, "learning_rate": 4.20428679817906e-05, "loss": 0.3914, "step": 4275 }, { "epoch": 0.7305624306563113, "grad_norm": 0.36784967016477804, "learning_rate": 4.202706120384421e-05, "loss": 0.4158, "step": 4280 }, { "epoch": 0.7314158914397884, "grad_norm": 0.33937220092757686, "learning_rate": 4.2011254425897825e-05, "loss": 0.3985, "step": 4285 }, { "epoch": 0.7322693522232654, "grad_norm": 0.3403875552541558, "learning_rate": 4.199544764795144e-05, "loss": 0.4192, "step": 4290 }, { "epoch": 0.7331228130067423, "grad_norm": 0.33701808810162925, "learning_rate": 4.197964087000506e-05, "loss": 0.4086, "step": 4295 }, { "epoch": 0.7339762737902193, "grad_norm": 0.33393504905059407, "learning_rate": 4.196383409205868e-05, "loss": 0.4371, "step": 4300 }, { "epoch": 0.7348297345736964, "grad_norm": 0.4568747583322918, "learning_rate": 4.1948027314112295e-05, "loss": 0.4053, "step": 4305 }, { "epoch": 0.7356831953571733, "grad_norm": 0.355699716126515, "learning_rate": 4.193222053616591e-05, "loss": 0.4248, "step": 4310 }, { "epoch": 0.7365366561406503, "grad_norm": 0.3161932803777326, "learning_rate": 4.191641375821952e-05, "loss": 0.4132, "step": 4315 }, { "epoch": 0.7373901169241274, "grad_norm": 0.31362170704336956, "learning_rate": 4.190060698027314e-05, "loss": 0.4146, "step": 4320 }, { "epoch": 0.7382435777076043, "grad_norm": 0.34010375078844796, "learning_rate": 4.188480020232676e-05, "loss": 0.4344, "step": 4325 }, { "epoch": 0.7390970384910813, "grad_norm": 0.35406542016497544, "learning_rate": 4.1868993424380375e-05, "loss": 0.4102, "step": 4330 }, { "epoch": 0.7399504992745584, "grad_norm": 0.28597958638354865, "learning_rate": 4.185318664643399e-05, "loss": 0.4261, "step": 4335 }, { "epoch": 0.7408039600580353, "grad_norm": 0.34597799063069856, "learning_rate": 4.183737986848761e-05, "loss": 0.4176, "step": 4340 }, { "epoch": 0.7416574208415123, "grad_norm": 0.34531103255834966, "learning_rate": 4.182157309054123e-05, "loss": 0.4312, "step": 4345 }, { "epoch": 0.7425108816249893, "grad_norm": 0.32078958200851737, "learning_rate": 4.180576631259484e-05, "loss": 0.3729, "step": 4350 }, { "epoch": 0.7433643424084664, "grad_norm": 0.3155999118948804, "learning_rate": 4.178995953464846e-05, "loss": 0.4408, "step": 4355 }, { "epoch": 0.7442178031919433, "grad_norm": 0.28868644434022755, "learning_rate": 4.177415275670208e-05, "loss": 0.4085, "step": 4360 }, { "epoch": 0.7450712639754203, "grad_norm": 0.32152201048123946, "learning_rate": 4.175834597875569e-05, "loss": 0.424, "step": 4365 }, { "epoch": 0.7459247247588974, "grad_norm": 0.2907118528191575, "learning_rate": 4.174253920080931e-05, "loss": 0.3944, "step": 4370 }, { "epoch": 0.7467781855423743, "grad_norm": 0.3324795745936359, "learning_rate": 4.1726732422862926e-05, "loss": 0.4119, "step": 4375 }, { "epoch": 0.7476316463258513, "grad_norm": 0.3538070652814999, "learning_rate": 4.171092564491654e-05, "loss": 0.3999, "step": 4380 }, { "epoch": 0.7484851071093284, "grad_norm": 0.3277848982685285, "learning_rate": 4.1695118866970154e-05, "loss": 0.3777, "step": 4385 }, { "epoch": 0.7493385678928053, "grad_norm": 0.32042257803183943, "learning_rate": 4.167931208902378e-05, "loss": 0.3909, "step": 4390 }, { "epoch": 0.7501920286762823, "grad_norm": 0.3305451679792623, "learning_rate": 4.1663505311077396e-05, "loss": 0.4077, "step": 4395 }, { "epoch": 0.7510454894597594, "grad_norm": 0.35532927645089113, "learning_rate": 4.1647698533131006e-05, "loss": 0.4113, "step": 4400 }, { "epoch": 0.7518989502432363, "grad_norm": 0.3262998290611088, "learning_rate": 4.1631891755184624e-05, "loss": 0.4011, "step": 4405 }, { "epoch": 0.7527524110267133, "grad_norm": 0.32905670412647686, "learning_rate": 4.161608497723824e-05, "loss": 0.4041, "step": 4410 }, { "epoch": 0.7536058718101903, "grad_norm": 0.3319088655427751, "learning_rate": 4.160027819929186e-05, "loss": 0.3757, "step": 4415 }, { "epoch": 0.7544593325936674, "grad_norm": 0.2986074743098335, "learning_rate": 4.158447142134547e-05, "loss": 0.3992, "step": 4420 }, { "epoch": 0.7553127933771443, "grad_norm": 0.3044595794242737, "learning_rate": 4.1568664643399094e-05, "loss": 0.4146, "step": 4425 }, { "epoch": 0.7561662541606213, "grad_norm": 0.3161675328878183, "learning_rate": 4.155285786545271e-05, "loss": 0.3993, "step": 4430 }, { "epoch": 0.7570197149440984, "grad_norm": 0.305687887179435, "learning_rate": 4.153705108750632e-05, "loss": 0.4035, "step": 4435 }, { "epoch": 0.7578731757275753, "grad_norm": 0.32434363564394125, "learning_rate": 4.152124430955994e-05, "loss": 0.4093, "step": 4440 }, { "epoch": 0.7587266365110523, "grad_norm": 0.2755025371818015, "learning_rate": 4.1505437531613557e-05, "loss": 0.4035, "step": 4445 }, { "epoch": 0.7595800972945294, "grad_norm": 0.32624825931705015, "learning_rate": 4.1489630753667174e-05, "loss": 0.4053, "step": 4450 }, { "epoch": 0.7604335580780063, "grad_norm": 0.3942237261830777, "learning_rate": 4.147382397572079e-05, "loss": 0.37, "step": 4455 }, { "epoch": 0.7612870188614833, "grad_norm": 0.32424365195451116, "learning_rate": 4.145801719777441e-05, "loss": 0.4179, "step": 4460 }, { "epoch": 0.7621404796449603, "grad_norm": 0.3136585624239122, "learning_rate": 4.1442210419828026e-05, "loss": 0.4013, "step": 4465 }, { "epoch": 0.7629939404284373, "grad_norm": 0.30924146906465927, "learning_rate": 4.142640364188164e-05, "loss": 0.3955, "step": 4470 }, { "epoch": 0.7638474012119143, "grad_norm": 0.35230384884934085, "learning_rate": 4.141059686393526e-05, "loss": 0.4075, "step": 4475 }, { "epoch": 0.7647008619953913, "grad_norm": 0.290653463606052, "learning_rate": 4.139479008598887e-05, "loss": 0.3847, "step": 4480 }, { "epoch": 0.7655543227788683, "grad_norm": 0.31478336675663804, "learning_rate": 4.137898330804249e-05, "loss": 0.4041, "step": 4485 }, { "epoch": 0.7664077835623453, "grad_norm": 0.338118949228681, "learning_rate": 4.136317653009611e-05, "loss": 0.5015, "step": 4490 }, { "epoch": 0.7672612443458223, "grad_norm": 0.3096773208757094, "learning_rate": 4.1347369752149724e-05, "loss": 0.3975, "step": 4495 }, { "epoch": 0.7681147051292994, "grad_norm": 0.2784010084816463, "learning_rate": 4.133156297420334e-05, "loss": 0.403, "step": 4500 }, { "epoch": 0.7689681659127763, "grad_norm": 0.40636515820520797, "learning_rate": 4.131575619625695e-05, "loss": 0.4093, "step": 4505 }, { "epoch": 0.7698216266962533, "grad_norm": 0.3081228081944889, "learning_rate": 4.129994941831058e-05, "loss": 0.4025, "step": 4510 }, { "epoch": 0.7706750874797303, "grad_norm": 0.3198636278153371, "learning_rate": 4.1284142640364194e-05, "loss": 0.3948, "step": 4515 }, { "epoch": 0.7715285482632073, "grad_norm": 0.2881840115590486, "learning_rate": 4.1268335862417805e-05, "loss": 0.3898, "step": 4520 }, { "epoch": 0.7723820090466843, "grad_norm": 0.3034710629639315, "learning_rate": 4.125252908447142e-05, "loss": 0.39, "step": 4525 }, { "epoch": 0.7732354698301613, "grad_norm": 0.5848372369921458, "learning_rate": 4.123672230652504e-05, "loss": 0.4253, "step": 4530 }, { "epoch": 0.7740889306136383, "grad_norm": 0.3384455046384722, "learning_rate": 4.122091552857866e-05, "loss": 0.4081, "step": 4535 }, { "epoch": 0.7749423913971153, "grad_norm": 0.2753452307349275, "learning_rate": 4.120510875063227e-05, "loss": 0.3944, "step": 4540 }, { "epoch": 0.7757958521805923, "grad_norm": 0.3744377330836796, "learning_rate": 4.118930197268589e-05, "loss": 0.4179, "step": 4545 }, { "epoch": 0.7766493129640692, "grad_norm": 0.29938259446545207, "learning_rate": 4.117349519473951e-05, "loss": 0.3978, "step": 4550 }, { "epoch": 0.7775027737475463, "grad_norm": 0.28494586009615697, "learning_rate": 4.115768841679312e-05, "loss": 0.3914, "step": 4555 }, { "epoch": 0.7783562345310233, "grad_norm": 0.35708961400292716, "learning_rate": 4.114188163884674e-05, "loss": 0.3961, "step": 4560 }, { "epoch": 0.7792096953145004, "grad_norm": 0.34779942345223, "learning_rate": 4.1126074860900355e-05, "loss": 0.3871, "step": 4565 }, { "epoch": 0.7800631560979773, "grad_norm": 0.3469383159460142, "learning_rate": 4.111026808295397e-05, "loss": 0.4242, "step": 4570 }, { "epoch": 0.7809166168814543, "grad_norm": 0.30754940417254145, "learning_rate": 4.109446130500758e-05, "loss": 0.4035, "step": 4575 }, { "epoch": 0.7817700776649313, "grad_norm": 0.3295429329842511, "learning_rate": 4.107865452706121e-05, "loss": 0.3885, "step": 4580 }, { "epoch": 0.7826235384484083, "grad_norm": 0.31856245370273367, "learning_rate": 4.1062847749114825e-05, "loss": 0.4009, "step": 4585 }, { "epoch": 0.7834769992318853, "grad_norm": 0.2704621666300267, "learning_rate": 4.1047040971168436e-05, "loss": 0.3796, "step": 4590 }, { "epoch": 0.7843304600153623, "grad_norm": 0.2939190271169085, "learning_rate": 4.103123419322206e-05, "loss": 0.4019, "step": 4595 }, { "epoch": 0.7851839207988393, "grad_norm": 0.29474624209453865, "learning_rate": 4.101542741527567e-05, "loss": 0.4009, "step": 4600 }, { "epoch": 0.7860373815823163, "grad_norm": 0.27045185102969166, "learning_rate": 4.099962063732929e-05, "loss": 0.4102, "step": 4605 }, { "epoch": 0.7868908423657933, "grad_norm": 0.33478918527627305, "learning_rate": 4.0983813859382906e-05, "loss": 0.4302, "step": 4610 }, { "epoch": 0.7877443031492702, "grad_norm": 0.2797483440001289, "learning_rate": 4.096800708143652e-05, "loss": 0.3976, "step": 4615 }, { "epoch": 0.7885977639327473, "grad_norm": 0.3417775572557836, "learning_rate": 4.095220030349014e-05, "loss": 0.4151, "step": 4620 }, { "epoch": 0.7894512247162243, "grad_norm": 0.3297042777056447, "learning_rate": 4.093639352554375e-05, "loss": 0.4049, "step": 4625 }, { "epoch": 0.7903046854997013, "grad_norm": 0.31925672581295045, "learning_rate": 4.0920586747597375e-05, "loss": 0.4177, "step": 4630 }, { "epoch": 0.7911581462831783, "grad_norm": 0.39096973742691915, "learning_rate": 4.0904779969650986e-05, "loss": 0.4121, "step": 4635 }, { "epoch": 0.7920116070666553, "grad_norm": 0.505410622151423, "learning_rate": 4.0888973191704604e-05, "loss": 0.4205, "step": 4640 }, { "epoch": 0.7928650678501323, "grad_norm": 0.36487840676727556, "learning_rate": 4.087316641375822e-05, "loss": 0.4159, "step": 4645 }, { "epoch": 0.7937185286336093, "grad_norm": 0.35525123386630897, "learning_rate": 4.085735963581184e-05, "loss": 0.3944, "step": 4650 }, { "epoch": 0.7945719894170863, "grad_norm": 0.29716828870244033, "learning_rate": 4.0841552857865456e-05, "loss": 0.3822, "step": 4655 }, { "epoch": 0.7954254502005633, "grad_norm": 0.3868678022049167, "learning_rate": 4.0825746079919067e-05, "loss": 0.4147, "step": 4660 }, { "epoch": 0.7962789109840402, "grad_norm": 0.3016916913436977, "learning_rate": 4.080993930197269e-05, "loss": 0.4158, "step": 4665 }, { "epoch": 0.7971323717675173, "grad_norm": 0.32993144701841487, "learning_rate": 4.07941325240263e-05, "loss": 0.4119, "step": 4670 }, { "epoch": 0.7979858325509943, "grad_norm": 0.32027448826875743, "learning_rate": 4.077832574607992e-05, "loss": 0.3937, "step": 4675 }, { "epoch": 0.7988392933344712, "grad_norm": 0.30554924348516194, "learning_rate": 4.0762518968133536e-05, "loss": 0.3946, "step": 4680 }, { "epoch": 0.7996927541179483, "grad_norm": 0.3104102534517124, "learning_rate": 4.0746712190187154e-05, "loss": 0.3932, "step": 4685 }, { "epoch": 0.8005462149014253, "grad_norm": 0.4032254241139037, "learning_rate": 4.073090541224077e-05, "loss": 0.4054, "step": 4690 }, { "epoch": 0.8013996756849022, "grad_norm": 0.31615866551315425, "learning_rate": 4.071509863429438e-05, "loss": 0.4124, "step": 4695 }, { "epoch": 0.8022531364683793, "grad_norm": 0.28974398935268386, "learning_rate": 4.0699291856348006e-05, "loss": 0.3995, "step": 4700 }, { "epoch": 0.8031065972518563, "grad_norm": 0.3174575854135146, "learning_rate": 4.0683485078401624e-05, "loss": 0.4052, "step": 4705 }, { "epoch": 0.8039600580353333, "grad_norm": 0.34454219048771767, "learning_rate": 4.0667678300455234e-05, "loss": 0.4251, "step": 4710 }, { "epoch": 0.8048135188188102, "grad_norm": 0.3232074461636175, "learning_rate": 4.065187152250886e-05, "loss": 0.4267, "step": 4715 }, { "epoch": 0.8056669796022873, "grad_norm": 0.2991995934596982, "learning_rate": 4.063606474456247e-05, "loss": 0.4178, "step": 4720 }, { "epoch": 0.8065204403857643, "grad_norm": 0.6473022105553348, "learning_rate": 4.062025796661609e-05, "loss": 0.4138, "step": 4725 }, { "epoch": 0.8073739011692412, "grad_norm": 0.3540906914342969, "learning_rate": 4.0604451188669704e-05, "loss": 0.4257, "step": 4730 }, { "epoch": 0.8082273619527183, "grad_norm": 0.3862695103295584, "learning_rate": 4.058864441072332e-05, "loss": 0.4101, "step": 4735 }, { "epoch": 0.8090808227361953, "grad_norm": 0.30599353695794984, "learning_rate": 4.057283763277694e-05, "loss": 0.402, "step": 4740 }, { "epoch": 0.8099342835196722, "grad_norm": 0.2886779083143616, "learning_rate": 4.055703085483055e-05, "loss": 0.4028, "step": 4745 }, { "epoch": 0.8107877443031493, "grad_norm": 0.37013807120115944, "learning_rate": 4.0541224076884174e-05, "loss": 0.4084, "step": 4750 }, { "epoch": 0.8116412050866263, "grad_norm": 0.33046909050271644, "learning_rate": 4.0525417298937785e-05, "loss": 0.5261, "step": 4755 }, { "epoch": 0.8124946658701032, "grad_norm": 0.4917784205430057, "learning_rate": 4.05096105209914e-05, "loss": 0.4128, "step": 4760 }, { "epoch": 0.8133481266535803, "grad_norm": 0.2806447910329089, "learning_rate": 4.049380374304502e-05, "loss": 0.3808, "step": 4765 }, { "epoch": 0.8142015874370573, "grad_norm": 0.3715125347762736, "learning_rate": 4.047799696509864e-05, "loss": 0.4025, "step": 4770 }, { "epoch": 0.8150550482205343, "grad_norm": 0.37638977845836274, "learning_rate": 4.0462190187152255e-05, "loss": 0.382, "step": 4775 }, { "epoch": 0.8159085090040112, "grad_norm": 0.36253944811301614, "learning_rate": 4.0446383409205865e-05, "loss": 0.4033, "step": 4780 }, { "epoch": 0.8167619697874883, "grad_norm": 0.4279959086393371, "learning_rate": 4.043057663125949e-05, "loss": 0.4028, "step": 4785 }, { "epoch": 0.8176154305709653, "grad_norm": 0.3306066024555949, "learning_rate": 4.04147698533131e-05, "loss": 0.3958, "step": 4790 }, { "epoch": 0.8184688913544422, "grad_norm": 0.28071084070992086, "learning_rate": 4.039896307536672e-05, "loss": 0.3823, "step": 4795 }, { "epoch": 0.8193223521379193, "grad_norm": 0.34140400119686315, "learning_rate": 4.0383156297420335e-05, "loss": 0.4357, "step": 4800 }, { "epoch": 0.8201758129213963, "grad_norm": 0.28905678507284915, "learning_rate": 4.036734951947395e-05, "loss": 0.4071, "step": 4805 }, { "epoch": 0.8210292737048732, "grad_norm": 0.3142097996084378, "learning_rate": 4.035154274152757e-05, "loss": 0.432, "step": 4810 }, { "epoch": 0.8218827344883503, "grad_norm": 0.31977749163983715, "learning_rate": 4.033573596358118e-05, "loss": 0.3871, "step": 4815 }, { "epoch": 0.8227361952718273, "grad_norm": 0.30500708754655803, "learning_rate": 4.0319929185634805e-05, "loss": 0.4317, "step": 4820 }, { "epoch": 0.8235896560553042, "grad_norm": 0.2684349025908979, "learning_rate": 4.0304122407688416e-05, "loss": 0.3844, "step": 4825 }, { "epoch": 0.8244431168387812, "grad_norm": 0.38310433090082086, "learning_rate": 4.028831562974203e-05, "loss": 0.4195, "step": 4830 }, { "epoch": 0.8252965776222583, "grad_norm": 0.29813693032812494, "learning_rate": 4.027250885179566e-05, "loss": 0.3821, "step": 4835 }, { "epoch": 0.8261500384057353, "grad_norm": 0.3827919343859201, "learning_rate": 4.025670207384927e-05, "loss": 0.3923, "step": 4840 }, { "epoch": 0.8270034991892122, "grad_norm": 0.30645888208111105, "learning_rate": 4.0240895295902885e-05, "loss": 0.3955, "step": 4845 }, { "epoch": 0.8278569599726893, "grad_norm": 0.3088768636631694, "learning_rate": 4.02250885179565e-05, "loss": 0.4107, "step": 4850 }, { "epoch": 0.8287104207561663, "grad_norm": 0.28511414498027743, "learning_rate": 4.020928174001012e-05, "loss": 0.3803, "step": 4855 }, { "epoch": 0.8295638815396432, "grad_norm": 0.3246746177623308, "learning_rate": 4.019347496206374e-05, "loss": 0.3859, "step": 4860 }, { "epoch": 0.8304173423231203, "grad_norm": 0.35448270212780275, "learning_rate": 4.017766818411735e-05, "loss": 0.4163, "step": 4865 }, { "epoch": 0.8312708031065973, "grad_norm": 0.29783012475100684, "learning_rate": 4.016186140617097e-05, "loss": 0.3914, "step": 4870 }, { "epoch": 0.8321242638900742, "grad_norm": 0.3255194329801512, "learning_rate": 4.014605462822458e-05, "loss": 0.4438, "step": 4875 }, { "epoch": 0.8329777246735512, "grad_norm": 0.2919100189156125, "learning_rate": 4.01302478502782e-05, "loss": 0.4046, "step": 4880 }, { "epoch": 0.8338311854570283, "grad_norm": 0.28055761497825904, "learning_rate": 4.011444107233182e-05, "loss": 0.3969, "step": 4885 }, { "epoch": 0.8346846462405052, "grad_norm": 0.31690866109449606, "learning_rate": 4.0098634294385436e-05, "loss": 0.4342, "step": 4890 }, { "epoch": 0.8355381070239822, "grad_norm": 0.30412196269883246, "learning_rate": 4.008282751643905e-05, "loss": 0.3925, "step": 4895 }, { "epoch": 0.8363915678074593, "grad_norm": 0.41560283857290314, "learning_rate": 4.0067020738492664e-05, "loss": 0.3828, "step": 4900 }, { "epoch": 0.8372450285909362, "grad_norm": 0.32256829886779914, "learning_rate": 4.005121396054629e-05, "loss": 0.3758, "step": 4905 }, { "epoch": 0.8380984893744132, "grad_norm": 0.32529008241351176, "learning_rate": 4.00354071825999e-05, "loss": 0.3965, "step": 4910 }, { "epoch": 0.8389519501578903, "grad_norm": 0.3001520274578627, "learning_rate": 4.0019600404653516e-05, "loss": 0.3972, "step": 4915 }, { "epoch": 0.8398054109413673, "grad_norm": 0.27279023658113916, "learning_rate": 4.0003793626707134e-05, "loss": 0.4075, "step": 4920 }, { "epoch": 0.8406588717248442, "grad_norm": 0.2953368256798912, "learning_rate": 3.998798684876075e-05, "loss": 0.3915, "step": 4925 }, { "epoch": 0.8415123325083212, "grad_norm": 0.3470377591947916, "learning_rate": 3.997218007081437e-05, "loss": 0.4007, "step": 4930 }, { "epoch": 0.8423657932917983, "grad_norm": 0.32979242963669586, "learning_rate": 3.995637329286798e-05, "loss": 0.4059, "step": 4935 }, { "epoch": 0.8432192540752752, "grad_norm": 0.3377985028136022, "learning_rate": 3.9940566514921604e-05, "loss": 0.3991, "step": 4940 }, { "epoch": 0.8440727148587522, "grad_norm": 0.3257128739786348, "learning_rate": 3.9924759736975214e-05, "loss": 0.4109, "step": 4945 }, { "epoch": 0.8449261756422293, "grad_norm": 0.2710886656353616, "learning_rate": 3.990895295902883e-05, "loss": 0.381, "step": 4950 }, { "epoch": 0.8457796364257062, "grad_norm": 0.29251857885155075, "learning_rate": 3.989314618108245e-05, "loss": 0.4166, "step": 4955 }, { "epoch": 0.8466330972091832, "grad_norm": 0.2713786400764459, "learning_rate": 3.9877339403136067e-05, "loss": 0.3735, "step": 4960 }, { "epoch": 0.8474865579926603, "grad_norm": 0.34107109756279425, "learning_rate": 3.9861532625189684e-05, "loss": 0.4123, "step": 4965 }, { "epoch": 0.8483400187761372, "grad_norm": 0.45870431604925593, "learning_rate": 3.98457258472433e-05, "loss": 0.398, "step": 4970 }, { "epoch": 0.8491934795596142, "grad_norm": 0.2977277437394345, "learning_rate": 3.982991906929692e-05, "loss": 0.4034, "step": 4975 }, { "epoch": 0.8500469403430913, "grad_norm": 0.298048156407772, "learning_rate": 3.981411229135053e-05, "loss": 0.4069, "step": 4980 }, { "epoch": 0.8509004011265683, "grad_norm": 0.33305943677070826, "learning_rate": 3.979830551340415e-05, "loss": 0.4146, "step": 4985 }, { "epoch": 0.8517538619100452, "grad_norm": 0.30230946113273033, "learning_rate": 3.978249873545777e-05, "loss": 0.4129, "step": 4990 }, { "epoch": 0.8526073226935222, "grad_norm": 0.3335437900533083, "learning_rate": 3.976669195751138e-05, "loss": 0.4033, "step": 4995 }, { "epoch": 0.8534607834769993, "grad_norm": 0.2966870450318052, "learning_rate": 3.9750885179565e-05, "loss": 0.4064, "step": 5000 }, { "epoch": 0.8543142442604762, "grad_norm": 0.6028627926995674, "learning_rate": 3.973507840161862e-05, "loss": 0.4199, "step": 5005 }, { "epoch": 0.8551677050439532, "grad_norm": 0.27719455366098295, "learning_rate": 3.9719271623672234e-05, "loss": 0.397, "step": 5010 }, { "epoch": 0.8560211658274303, "grad_norm": 0.3129978998328572, "learning_rate": 3.970346484572585e-05, "loss": 0.3894, "step": 5015 }, { "epoch": 0.8568746266109072, "grad_norm": 0.2656482222223697, "learning_rate": 3.968765806777946e-05, "loss": 0.409, "step": 5020 }, { "epoch": 0.8577280873943842, "grad_norm": 0.29503390893528886, "learning_rate": 3.967185128983309e-05, "loss": 0.421, "step": 5025 }, { "epoch": 0.8585815481778613, "grad_norm": 0.33614222466844057, "learning_rate": 3.96560445118867e-05, "loss": 0.3915, "step": 5030 }, { "epoch": 0.8594350089613382, "grad_norm": 0.34782003390052313, "learning_rate": 3.9640237733940315e-05, "loss": 0.42, "step": 5035 }, { "epoch": 0.8602884697448152, "grad_norm": 0.3008476742900976, "learning_rate": 3.962443095599393e-05, "loss": 0.396, "step": 5040 }, { "epoch": 0.8611419305282922, "grad_norm": 0.3813049216453907, "learning_rate": 3.960862417804755e-05, "loss": 0.3865, "step": 5045 }, { "epoch": 0.8619953913117693, "grad_norm": 0.2615762385004988, "learning_rate": 3.959281740010117e-05, "loss": 0.3925, "step": 5050 }, { "epoch": 0.8628488520952462, "grad_norm": 0.2900357362915595, "learning_rate": 3.957701062215478e-05, "loss": 0.4098, "step": 5055 }, { "epoch": 0.8637023128787232, "grad_norm": 0.3495575274976206, "learning_rate": 3.95612038442084e-05, "loss": 0.4159, "step": 5060 }, { "epoch": 0.8645557736622003, "grad_norm": 0.26671648117251756, "learning_rate": 3.954539706626201e-05, "loss": 0.4116, "step": 5065 }, { "epoch": 0.8654092344456772, "grad_norm": 0.32247692817822354, "learning_rate": 3.952959028831563e-05, "loss": 0.4244, "step": 5070 }, { "epoch": 0.8662626952291542, "grad_norm": 0.3794068592354095, "learning_rate": 3.951378351036925e-05, "loss": 0.4146, "step": 5075 }, { "epoch": 0.8671161560126313, "grad_norm": 0.3550897304513579, "learning_rate": 3.9497976732422865e-05, "loss": 0.4053, "step": 5080 }, { "epoch": 0.8679696167961082, "grad_norm": 0.4089554319852101, "learning_rate": 3.948216995447648e-05, "loss": 0.4054, "step": 5085 }, { "epoch": 0.8688230775795852, "grad_norm": 0.2926870638879009, "learning_rate": 3.9466363176530093e-05, "loss": 0.385, "step": 5090 }, { "epoch": 0.8696765383630622, "grad_norm": 0.2709805509186034, "learning_rate": 3.945055639858372e-05, "loss": 0.3873, "step": 5095 }, { "epoch": 0.8705299991465392, "grad_norm": 0.3038108281782244, "learning_rate": 3.943474962063733e-05, "loss": 0.3998, "step": 5100 }, { "epoch": 0.8713834599300162, "grad_norm": 0.32795566122512076, "learning_rate": 3.9418942842690946e-05, "loss": 0.4179, "step": 5105 }, { "epoch": 0.8722369207134932, "grad_norm": 0.2745514106811243, "learning_rate": 3.940313606474457e-05, "loss": 0.3755, "step": 5110 }, { "epoch": 0.8730903814969703, "grad_norm": 0.24791991552773082, "learning_rate": 3.938732928679818e-05, "loss": 0.3849, "step": 5115 }, { "epoch": 0.8739438422804472, "grad_norm": 0.26897667410930254, "learning_rate": 3.93715225088518e-05, "loss": 0.3949, "step": 5120 }, { "epoch": 0.8747973030639242, "grad_norm": 0.3450499459092217, "learning_rate": 3.9355715730905416e-05, "loss": 0.3707, "step": 5125 }, { "epoch": 0.8756507638474013, "grad_norm": 0.2654438339576666, "learning_rate": 3.933990895295903e-05, "loss": 0.4263, "step": 5130 }, { "epoch": 0.8765042246308782, "grad_norm": 0.3137707560545838, "learning_rate": 3.9324102175012644e-05, "loss": 0.3838, "step": 5135 }, { "epoch": 0.8773576854143552, "grad_norm": 0.28375110531930614, "learning_rate": 3.930829539706626e-05, "loss": 0.4003, "step": 5140 }, { "epoch": 0.8782111461978322, "grad_norm": 0.3006772483778044, "learning_rate": 3.9292488619119885e-05, "loss": 0.3871, "step": 5145 }, { "epoch": 0.8790646069813092, "grad_norm": 0.2813976183652599, "learning_rate": 3.9276681841173496e-05, "loss": 0.3925, "step": 5150 }, { "epoch": 0.8799180677647862, "grad_norm": 0.32207309347786145, "learning_rate": 3.9260875063227114e-05, "loss": 0.4008, "step": 5155 }, { "epoch": 0.8807715285482632, "grad_norm": 0.32945652183429003, "learning_rate": 3.924506828528073e-05, "loss": 0.3984, "step": 5160 }, { "epoch": 0.8816249893317402, "grad_norm": 0.2896606109239708, "learning_rate": 3.922926150733435e-05, "loss": 0.4156, "step": 5165 }, { "epoch": 0.8824784501152172, "grad_norm": 0.2526747324067112, "learning_rate": 3.9213454729387966e-05, "loss": 0.3699, "step": 5170 }, { "epoch": 0.8833319108986942, "grad_norm": 0.35745377869019557, "learning_rate": 3.919764795144158e-05, "loss": 0.4329, "step": 5175 }, { "epoch": 0.8841853716821712, "grad_norm": 0.31236963144898006, "learning_rate": 3.91818411734952e-05, "loss": 0.4129, "step": 5180 }, { "epoch": 0.8850388324656482, "grad_norm": 0.28461497397693913, "learning_rate": 3.916603439554881e-05, "loss": 0.3895, "step": 5185 }, { "epoch": 0.8858922932491252, "grad_norm": 0.27964933467210573, "learning_rate": 3.915022761760243e-05, "loss": 0.3799, "step": 5190 }, { "epoch": 0.8867457540326023, "grad_norm": 0.2785644840730855, "learning_rate": 3.9134420839656046e-05, "loss": 0.3817, "step": 5195 }, { "epoch": 0.8875992148160792, "grad_norm": 0.29356113226599323, "learning_rate": 3.9118614061709664e-05, "loss": 0.402, "step": 5200 }, { "epoch": 0.8884526755995562, "grad_norm": 0.30768420876246444, "learning_rate": 3.910280728376328e-05, "loss": 0.3899, "step": 5205 }, { "epoch": 0.8893061363830332, "grad_norm": 0.35745453531424337, "learning_rate": 3.908700050581689e-05, "loss": 0.4269, "step": 5210 }, { "epoch": 0.8901595971665102, "grad_norm": 0.3408472402202522, "learning_rate": 3.9071193727870516e-05, "loss": 0.3884, "step": 5215 }, { "epoch": 0.8910130579499872, "grad_norm": 0.286810012183386, "learning_rate": 3.905538694992413e-05, "loss": 0.393, "step": 5220 }, { "epoch": 0.8918665187334642, "grad_norm": 0.334987795157886, "learning_rate": 3.9039580171977744e-05, "loss": 0.4151, "step": 5225 }, { "epoch": 0.8927199795169412, "grad_norm": 0.36821328936539877, "learning_rate": 3.902377339403136e-05, "loss": 0.3948, "step": 5230 }, { "epoch": 0.8935734403004182, "grad_norm": 0.3046607736300186, "learning_rate": 3.900796661608498e-05, "loss": 0.3968, "step": 5235 }, { "epoch": 0.8944269010838952, "grad_norm": 0.2922391116777222, "learning_rate": 3.89921598381386e-05, "loss": 0.3845, "step": 5240 }, { "epoch": 0.8952803618673721, "grad_norm": 0.2903208644625814, "learning_rate": 3.8976353060192214e-05, "loss": 0.4197, "step": 5245 }, { "epoch": 0.8961338226508492, "grad_norm": 0.3327185838210238, "learning_rate": 3.896054628224583e-05, "loss": 0.3882, "step": 5250 }, { "epoch": 0.8969872834343262, "grad_norm": 0.2635948287501551, "learning_rate": 3.894473950429944e-05, "loss": 0.4096, "step": 5255 }, { "epoch": 0.8978407442178032, "grad_norm": 0.35205989780779473, "learning_rate": 3.892893272635306e-05, "loss": 0.4133, "step": 5260 }, { "epoch": 0.8986942050012802, "grad_norm": 0.3030447917995427, "learning_rate": 3.8913125948406684e-05, "loss": 0.4041, "step": 5265 }, { "epoch": 0.8995476657847572, "grad_norm": 0.31505112929862816, "learning_rate": 3.8897319170460295e-05, "loss": 0.3996, "step": 5270 }, { "epoch": 0.9004011265682342, "grad_norm": 0.3515099946362734, "learning_rate": 3.888151239251391e-05, "loss": 0.3866, "step": 5275 }, { "epoch": 0.9012545873517112, "grad_norm": 0.3058321410633702, "learning_rate": 3.886570561456753e-05, "loss": 0.3971, "step": 5280 }, { "epoch": 0.9021080481351882, "grad_norm": 0.295513576646524, "learning_rate": 3.884989883662115e-05, "loss": 0.4004, "step": 5285 }, { "epoch": 0.9029615089186652, "grad_norm": 0.3257699861774394, "learning_rate": 3.883409205867476e-05, "loss": 0.3929, "step": 5290 }, { "epoch": 0.9038149697021421, "grad_norm": 0.299722654905589, "learning_rate": 3.8818285280728375e-05, "loss": 0.4022, "step": 5295 }, { "epoch": 0.9046684304856192, "grad_norm": 0.2803278148730567, "learning_rate": 3.8802478502782e-05, "loss": 0.3829, "step": 5300 }, { "epoch": 0.9055218912690962, "grad_norm": 0.305056584379854, "learning_rate": 3.878667172483561e-05, "loss": 0.4144, "step": 5305 }, { "epoch": 0.9063753520525731, "grad_norm": 0.2722689541424799, "learning_rate": 3.877086494688923e-05, "loss": 0.3836, "step": 5310 }, { "epoch": 0.9072288128360502, "grad_norm": 0.3141693701806631, "learning_rate": 3.8755058168942845e-05, "loss": 0.3993, "step": 5315 }, { "epoch": 0.9080822736195272, "grad_norm": 0.33534260232238616, "learning_rate": 3.873925139099646e-05, "loss": 0.3763, "step": 5320 }, { "epoch": 0.9089357344030042, "grad_norm": 0.3569155054009768, "learning_rate": 3.872344461305007e-05, "loss": 0.3968, "step": 5325 }, { "epoch": 0.9097891951864812, "grad_norm": 0.29162043937112064, "learning_rate": 3.870763783510369e-05, "loss": 0.3908, "step": 5330 }, { "epoch": 0.9106426559699582, "grad_norm": 0.2849203117948649, "learning_rate": 3.8691831057157315e-05, "loss": 0.4018, "step": 5335 }, { "epoch": 0.9114961167534352, "grad_norm": 0.3635540825517537, "learning_rate": 3.8676024279210926e-05, "loss": 0.4, "step": 5340 }, { "epoch": 0.9123495775369121, "grad_norm": 0.2760460784914764, "learning_rate": 3.866021750126454e-05, "loss": 0.3881, "step": 5345 }, { "epoch": 0.9132030383203892, "grad_norm": 0.2674744017557598, "learning_rate": 3.864441072331816e-05, "loss": 0.4095, "step": 5350 }, { "epoch": 0.9140564991038662, "grad_norm": 0.34806507635248995, "learning_rate": 3.862860394537178e-05, "loss": 0.4197, "step": 5355 }, { "epoch": 0.9149099598873431, "grad_norm": 0.33011377332037456, "learning_rate": 3.8612797167425395e-05, "loss": 0.4255, "step": 5360 }, { "epoch": 0.9157634206708202, "grad_norm": 0.32539221738793855, "learning_rate": 3.859699038947901e-05, "loss": 0.383, "step": 5365 }, { "epoch": 0.9166168814542972, "grad_norm": 0.29914253713430833, "learning_rate": 3.858118361153263e-05, "loss": 0.4003, "step": 5370 }, { "epoch": 0.9174703422377741, "grad_norm": 0.28578853621997774, "learning_rate": 3.856537683358624e-05, "loss": 0.3763, "step": 5375 }, { "epoch": 0.9183238030212512, "grad_norm": 0.30078372468059994, "learning_rate": 3.854957005563986e-05, "loss": 0.3954, "step": 5380 }, { "epoch": 0.9191772638047282, "grad_norm": 0.30589646194762893, "learning_rate": 3.8533763277693476e-05, "loss": 0.3764, "step": 5385 }, { "epoch": 0.9200307245882051, "grad_norm": 0.32531092109078386, "learning_rate": 3.8517956499747093e-05, "loss": 0.4025, "step": 5390 }, { "epoch": 0.9208841853716822, "grad_norm": 0.3431785318604217, "learning_rate": 3.850214972180071e-05, "loss": 0.4063, "step": 5395 }, { "epoch": 0.9217376461551592, "grad_norm": 0.33296805142850056, "learning_rate": 3.848634294385433e-05, "loss": 0.4204, "step": 5400 }, { "epoch": 0.9225911069386362, "grad_norm": 0.35969433303724685, "learning_rate": 3.8470536165907946e-05, "loss": 0.3847, "step": 5405 }, { "epoch": 0.9234445677221131, "grad_norm": 0.35555526480984867, "learning_rate": 3.8454729387961556e-05, "loss": 0.42, "step": 5410 }, { "epoch": 0.9242980285055902, "grad_norm": 0.2995995598901696, "learning_rate": 3.8438922610015174e-05, "loss": 0.4253, "step": 5415 }, { "epoch": 0.9251514892890672, "grad_norm": 0.3070161658616088, "learning_rate": 3.84231158320688e-05, "loss": 0.4083, "step": 5420 }, { "epoch": 0.9260049500725441, "grad_norm": 0.2998824754537497, "learning_rate": 3.840730905412241e-05, "loss": 0.3929, "step": 5425 }, { "epoch": 0.9268584108560212, "grad_norm": 0.3010582898265144, "learning_rate": 3.8391502276176026e-05, "loss": 0.3939, "step": 5430 }, { "epoch": 0.9277118716394982, "grad_norm": 0.3077792695689317, "learning_rate": 3.8375695498229644e-05, "loss": 0.4005, "step": 5435 }, { "epoch": 0.9285653324229751, "grad_norm": 0.40995212365992123, "learning_rate": 3.835988872028326e-05, "loss": 0.3989, "step": 5440 }, { "epoch": 0.9294187932064522, "grad_norm": 0.33418085749009013, "learning_rate": 3.834408194233687e-05, "loss": 0.3778, "step": 5445 }, { "epoch": 0.9302722539899292, "grad_norm": 0.3518855958023947, "learning_rate": 3.832827516439049e-05, "loss": 0.3879, "step": 5450 }, { "epoch": 0.9311257147734061, "grad_norm": 0.3046894366689021, "learning_rate": 3.8312468386444114e-05, "loss": 0.3975, "step": 5455 }, { "epoch": 0.9319791755568831, "grad_norm": 0.3345785426974873, "learning_rate": 3.8296661608497724e-05, "loss": 0.3903, "step": 5460 }, { "epoch": 0.9328326363403602, "grad_norm": 0.27311761756181024, "learning_rate": 3.828085483055134e-05, "loss": 0.3782, "step": 5465 }, { "epoch": 0.9336860971238372, "grad_norm": 0.28102356568335196, "learning_rate": 3.826504805260496e-05, "loss": 0.3825, "step": 5470 }, { "epoch": 0.9345395579073141, "grad_norm": 0.2972318088350182, "learning_rate": 3.824924127465858e-05, "loss": 0.386, "step": 5475 }, { "epoch": 0.9353930186907912, "grad_norm": 0.37972428099829025, "learning_rate": 3.823343449671219e-05, "loss": 0.4471, "step": 5480 }, { "epoch": 0.9362464794742682, "grad_norm": 0.3402001739193404, "learning_rate": 3.821762771876581e-05, "loss": 0.3899, "step": 5485 }, { "epoch": 0.9370999402577451, "grad_norm": 0.32399777046464795, "learning_rate": 3.820182094081943e-05, "loss": 0.3976, "step": 5490 }, { "epoch": 0.9379534010412222, "grad_norm": 0.2661367697407966, "learning_rate": 3.818601416287304e-05, "loss": 0.4251, "step": 5495 }, { "epoch": 0.9388068618246992, "grad_norm": 0.23972631787354762, "learning_rate": 3.817020738492666e-05, "loss": 0.3813, "step": 5500 }, { "epoch": 0.9396603226081761, "grad_norm": 0.39754356573715705, "learning_rate": 3.8154400606980275e-05, "loss": 0.4067, "step": 5505 }, { "epoch": 0.9405137833916531, "grad_norm": 0.2827471269019589, "learning_rate": 3.813859382903389e-05, "loss": 0.4054, "step": 5510 }, { "epoch": 0.9413672441751302, "grad_norm": 0.2602938324157807, "learning_rate": 3.812278705108751e-05, "loss": 0.3755, "step": 5515 }, { "epoch": 0.9422207049586071, "grad_norm": 0.3414991003765851, "learning_rate": 3.810698027314113e-05, "loss": 0.4159, "step": 5520 }, { "epoch": 0.9430741657420841, "grad_norm": 0.2805149122934009, "learning_rate": 3.8091173495194744e-05, "loss": 0.3912, "step": 5525 }, { "epoch": 0.9439276265255612, "grad_norm": 0.5582561369082443, "learning_rate": 3.8075366717248355e-05, "loss": 0.4046, "step": 5530 }, { "epoch": 0.9447810873090382, "grad_norm": 0.3537931845620279, "learning_rate": 3.805955993930197e-05, "loss": 0.4169, "step": 5535 }, { "epoch": 0.9456345480925151, "grad_norm": 0.29991850603086845, "learning_rate": 3.804375316135559e-05, "loss": 0.4043, "step": 5540 }, { "epoch": 0.9464880088759922, "grad_norm": 0.34267224444774536, "learning_rate": 3.802794638340921e-05, "loss": 0.3963, "step": 5545 }, { "epoch": 0.9473414696594692, "grad_norm": 0.3596481251842892, "learning_rate": 3.8012139605462825e-05, "loss": 0.3874, "step": 5550 }, { "epoch": 0.9481949304429461, "grad_norm": 0.30753013208963503, "learning_rate": 3.799633282751644e-05, "loss": 0.415, "step": 5555 }, { "epoch": 0.9490483912264231, "grad_norm": 0.28864004607871757, "learning_rate": 3.798052604957006e-05, "loss": 0.4066, "step": 5560 }, { "epoch": 0.9499018520099002, "grad_norm": 0.3385139107633918, "learning_rate": 3.796471927162367e-05, "loss": 0.4007, "step": 5565 }, { "epoch": 0.9507553127933771, "grad_norm": 0.2887904261250965, "learning_rate": 3.794891249367729e-05, "loss": 0.413, "step": 5570 }, { "epoch": 0.9516087735768541, "grad_norm": 0.3037064966506466, "learning_rate": 3.793310571573091e-05, "loss": 0.4183, "step": 5575 }, { "epoch": 0.9524622343603312, "grad_norm": 0.27915879909879293, "learning_rate": 3.791729893778452e-05, "loss": 0.3966, "step": 5580 }, { "epoch": 0.9533156951438081, "grad_norm": 0.35221038957559775, "learning_rate": 3.790149215983814e-05, "loss": 0.4013, "step": 5585 }, { "epoch": 0.9541691559272851, "grad_norm": 0.29043917382737156, "learning_rate": 3.788568538189176e-05, "loss": 0.4026, "step": 5590 }, { "epoch": 0.9550226167107622, "grad_norm": 0.31041448375410824, "learning_rate": 3.7869878603945375e-05, "loss": 0.3943, "step": 5595 }, { "epoch": 0.9558760774942391, "grad_norm": 0.29603767014457066, "learning_rate": 3.7854071825998986e-05, "loss": 0.3879, "step": 5600 }, { "epoch": 0.9567295382777161, "grad_norm": 0.38801248687522416, "learning_rate": 3.783826504805261e-05, "loss": 0.3757, "step": 5605 }, { "epoch": 0.9575829990611932, "grad_norm": 0.30556685014549967, "learning_rate": 3.782245827010623e-05, "loss": 0.3899, "step": 5610 }, { "epoch": 0.9584364598446702, "grad_norm": 0.313607736498249, "learning_rate": 3.780665149215984e-05, "loss": 0.3995, "step": 5615 }, { "epoch": 0.9592899206281471, "grad_norm": 0.2846353776630875, "learning_rate": 3.7790844714213456e-05, "loss": 0.3884, "step": 5620 }, { "epoch": 0.9601433814116241, "grad_norm": 0.3147982079268056, "learning_rate": 3.777503793626707e-05, "loss": 0.3891, "step": 5625 }, { "epoch": 0.9609968421951012, "grad_norm": 0.2994112281895544, "learning_rate": 3.775923115832069e-05, "loss": 0.4107, "step": 5630 }, { "epoch": 0.9618503029785781, "grad_norm": 0.28255243843544564, "learning_rate": 3.77434243803743e-05, "loss": 0.3745, "step": 5635 }, { "epoch": 0.9627037637620551, "grad_norm": 0.3312484469034916, "learning_rate": 3.7727617602427926e-05, "loss": 0.3865, "step": 5640 }, { "epoch": 0.9635572245455322, "grad_norm": 0.286412718788769, "learning_rate": 3.771181082448154e-05, "loss": 0.404, "step": 5645 }, { "epoch": 0.9644106853290091, "grad_norm": 0.33555847559611607, "learning_rate": 3.7696004046535154e-05, "loss": 0.391, "step": 5650 }, { "epoch": 0.9652641461124861, "grad_norm": 0.45721966081031057, "learning_rate": 3.768019726858877e-05, "loss": 0.3886, "step": 5655 }, { "epoch": 0.9661176068959632, "grad_norm": 0.31223463649786687, "learning_rate": 3.766439049064239e-05, "loss": 0.4156, "step": 5660 }, { "epoch": 0.9669710676794401, "grad_norm": 0.31212475931873984, "learning_rate": 3.7648583712696006e-05, "loss": 0.426, "step": 5665 }, { "epoch": 0.9678245284629171, "grad_norm": 0.2791404497724465, "learning_rate": 3.7632776934749624e-05, "loss": 0.3746, "step": 5670 }, { "epoch": 0.9686779892463941, "grad_norm": 0.29769799590370605, "learning_rate": 3.761697015680324e-05, "loss": 0.3828, "step": 5675 }, { "epoch": 0.9695314500298712, "grad_norm": 0.2961530168179679, "learning_rate": 3.760116337885686e-05, "loss": 0.3947, "step": 5680 }, { "epoch": 0.9703849108133481, "grad_norm": 0.28569949369188957, "learning_rate": 3.758535660091047e-05, "loss": 0.3795, "step": 5685 }, { "epoch": 0.9712383715968251, "grad_norm": 0.33544169926577966, "learning_rate": 3.756954982296409e-05, "loss": 0.4362, "step": 5690 }, { "epoch": 0.9720918323803022, "grad_norm": 0.2943645594158022, "learning_rate": 3.7553743045017704e-05, "loss": 0.422, "step": 5695 }, { "epoch": 0.9729452931637791, "grad_norm": 0.37255888094837364, "learning_rate": 3.753793626707132e-05, "loss": 0.4027, "step": 5700 }, { "epoch": 0.9737987539472561, "grad_norm": 0.36118643584057847, "learning_rate": 3.752212948912494e-05, "loss": 0.409, "step": 5705 }, { "epoch": 0.9746522147307332, "grad_norm": 0.33472587101049334, "learning_rate": 3.7506322711178557e-05, "loss": 0.4125, "step": 5710 }, { "epoch": 0.9755056755142101, "grad_norm": 0.2732864588201706, "learning_rate": 3.7490515933232174e-05, "loss": 0.3842, "step": 5715 }, { "epoch": 0.9763591362976871, "grad_norm": 0.289398290523563, "learning_rate": 3.7474709155285785e-05, "loss": 0.4202, "step": 5720 }, { "epoch": 0.9772125970811641, "grad_norm": 0.30307889180013436, "learning_rate": 3.74589023773394e-05, "loss": 0.3949, "step": 5725 }, { "epoch": 0.9780660578646411, "grad_norm": 0.3473490032728617, "learning_rate": 3.744309559939302e-05, "loss": 0.3978, "step": 5730 }, { "epoch": 0.9789195186481181, "grad_norm": 0.3682922468408759, "learning_rate": 3.742728882144664e-05, "loss": 0.4071, "step": 5735 }, { "epoch": 0.9797729794315951, "grad_norm": 0.2986007322495295, "learning_rate": 3.7411482043500254e-05, "loss": 0.4128, "step": 5740 }, { "epoch": 0.9806264402150722, "grad_norm": 0.3039734655666956, "learning_rate": 3.739567526555387e-05, "loss": 0.4122, "step": 5745 }, { "epoch": 0.9814799009985491, "grad_norm": 0.4209027422491525, "learning_rate": 3.737986848760749e-05, "loss": 0.4158, "step": 5750 }, { "epoch": 0.9823333617820261, "grad_norm": 0.24421142719411204, "learning_rate": 3.73640617096611e-05, "loss": 0.3864, "step": 5755 }, { "epoch": 0.9831868225655032, "grad_norm": 0.35592527833644383, "learning_rate": 3.7348254931714724e-05, "loss": 0.3819, "step": 5760 }, { "epoch": 0.9840402833489801, "grad_norm": 0.4463174813558046, "learning_rate": 3.733244815376834e-05, "loss": 0.407, "step": 5765 }, { "epoch": 0.9848937441324571, "grad_norm": 0.3640258700739679, "learning_rate": 3.731664137582195e-05, "loss": 0.3848, "step": 5770 }, { "epoch": 0.9857472049159341, "grad_norm": 0.3045169960410884, "learning_rate": 3.730083459787557e-05, "loss": 0.4033, "step": 5775 }, { "epoch": 0.9866006656994111, "grad_norm": 1.0762742262148222, "learning_rate": 3.728502781992919e-05, "loss": 0.4373, "step": 5780 }, { "epoch": 0.9874541264828881, "grad_norm": 0.26723458626059915, "learning_rate": 3.7269221041982805e-05, "loss": 0.3718, "step": 5785 }, { "epoch": 0.9883075872663651, "grad_norm": 0.45763665826187494, "learning_rate": 3.7253414264036416e-05, "loss": 0.3993, "step": 5790 }, { "epoch": 0.9891610480498421, "grad_norm": 0.3536702561907353, "learning_rate": 3.723760748609004e-05, "loss": 0.3879, "step": 5795 }, { "epoch": 0.9900145088333191, "grad_norm": 0.5574991757546487, "learning_rate": 3.722180070814366e-05, "loss": 0.3954, "step": 5800 }, { "epoch": 0.9908679696167961, "grad_norm": 0.6794886149016697, "learning_rate": 3.720599393019727e-05, "loss": 0.3854, "step": 5805 }, { "epoch": 0.9917214304002732, "grad_norm": 0.3388815153553088, "learning_rate": 3.7190187152250885e-05, "loss": 0.4024, "step": 5810 }, { "epoch": 0.9925748911837501, "grad_norm": 0.2953943205454612, "learning_rate": 3.71743803743045e-05, "loss": 0.3994, "step": 5815 }, { "epoch": 0.9934283519672271, "grad_norm": 0.3097835710898156, "learning_rate": 3.715857359635812e-05, "loss": 0.4108, "step": 5820 }, { "epoch": 0.9942818127507042, "grad_norm": 0.28359077377526304, "learning_rate": 3.714276681841174e-05, "loss": 0.4074, "step": 5825 }, { "epoch": 0.9951352735341811, "grad_norm": 0.5991486919925008, "learning_rate": 3.7126960040465355e-05, "loss": 0.4026, "step": 5830 }, { "epoch": 0.9959887343176581, "grad_norm": 0.3653576512023109, "learning_rate": 3.711115326251897e-05, "loss": 0.3947, "step": 5835 }, { "epoch": 0.9968421951011351, "grad_norm": 0.32632085286705004, "learning_rate": 3.709534648457258e-05, "loss": 0.3963, "step": 5840 }, { "epoch": 0.9976956558846121, "grad_norm": 0.32058450014053097, "learning_rate": 3.70795397066262e-05, "loss": 0.4135, "step": 5845 }, { "epoch": 0.9985491166680891, "grad_norm": 0.2744274187337335, "learning_rate": 3.706373292867982e-05, "loss": 0.3703, "step": 5850 }, { "epoch": 0.9994025774515661, "grad_norm": 0.32471173435659345, "learning_rate": 3.7047926150733436e-05, "loss": 0.4042, "step": 5855 }, { "epoch": 1.0001706921566953, "grad_norm": 0.4797685504848518, "learning_rate": 3.703211937278705e-05, "loss": 0.3877, "step": 5860 }, { "epoch": 1.0010241529401724, "grad_norm": 0.2801373082160797, "learning_rate": 3.701631259484067e-05, "loss": 0.3186, "step": 5865 }, { "epoch": 1.0018776137236494, "grad_norm": 0.28177603066865214, "learning_rate": 3.700050581689429e-05, "loss": 0.3108, "step": 5870 }, { "epoch": 1.0027310745071265, "grad_norm": 0.3421225186837307, "learning_rate": 3.69846990389479e-05, "loss": 0.3461, "step": 5875 }, { "epoch": 1.0035845352906034, "grad_norm": 0.31607625652199484, "learning_rate": 3.696889226100152e-05, "loss": 0.3215, "step": 5880 }, { "epoch": 1.0044379960740804, "grad_norm": 0.3463622697977156, "learning_rate": 3.6953085483055134e-05, "loss": 0.3102, "step": 5885 }, { "epoch": 1.0052914568575575, "grad_norm": 0.3348735202905259, "learning_rate": 3.693727870510875e-05, "loss": 0.3427, "step": 5890 }, { "epoch": 1.0061449176410344, "grad_norm": 0.2911993363639739, "learning_rate": 3.692147192716237e-05, "loss": 0.314, "step": 5895 }, { "epoch": 1.0069983784245113, "grad_norm": 0.3399627447818369, "learning_rate": 3.6905665149215986e-05, "loss": 0.3038, "step": 5900 }, { "epoch": 1.0078518392079885, "grad_norm": 0.35235085327396465, "learning_rate": 3.6889858371269603e-05, "loss": 0.3224, "step": 5905 }, { "epoch": 1.0087052999914654, "grad_norm": 0.2996175302383873, "learning_rate": 3.6874051593323214e-05, "loss": 0.3037, "step": 5910 }, { "epoch": 1.0095587607749423, "grad_norm": 0.3214561229738476, "learning_rate": 3.685824481537684e-05, "loss": 0.3298, "step": 5915 }, { "epoch": 1.0104122215584195, "grad_norm": 0.3360471228843239, "learning_rate": 3.6842438037430456e-05, "loss": 0.3256, "step": 5920 }, { "epoch": 1.0112656823418964, "grad_norm": 0.31662101795991565, "learning_rate": 3.6826631259484067e-05, "loss": 0.3283, "step": 5925 }, { "epoch": 1.0121191431253733, "grad_norm": 0.27660018467897624, "learning_rate": 3.6810824481537684e-05, "loss": 0.3012, "step": 5930 }, { "epoch": 1.0129726039088505, "grad_norm": 0.3669518608818731, "learning_rate": 3.67950177035913e-05, "loss": 0.3452, "step": 5935 }, { "epoch": 1.0138260646923274, "grad_norm": 0.2941994753760339, "learning_rate": 3.677921092564492e-05, "loss": 0.3587, "step": 5940 }, { "epoch": 1.0146795254758043, "grad_norm": 0.2853494667102857, "learning_rate": 3.676340414769853e-05, "loss": 0.3103, "step": 5945 }, { "epoch": 1.0155329862592815, "grad_norm": 0.30500036689568, "learning_rate": 3.6747597369752154e-05, "loss": 0.3402, "step": 5950 }, { "epoch": 1.0163864470427584, "grad_norm": 0.29043721695776253, "learning_rate": 3.673179059180577e-05, "loss": 0.3053, "step": 5955 }, { "epoch": 1.0172399078262353, "grad_norm": 0.3133428492634947, "learning_rate": 3.671598381385938e-05, "loss": 0.3121, "step": 5960 }, { "epoch": 1.0180933686097124, "grad_norm": 0.27022254309998794, "learning_rate": 3.6700177035913e-05, "loss": 0.3185, "step": 5965 }, { "epoch": 1.0189468293931894, "grad_norm": 0.3006867402596937, "learning_rate": 3.668437025796662e-05, "loss": 0.3283, "step": 5970 }, { "epoch": 1.0198002901766663, "grad_norm": 0.3335412268021444, "learning_rate": 3.6668563480020234e-05, "loss": 0.3242, "step": 5975 }, { "epoch": 1.0206537509601434, "grad_norm": 0.3207489092460867, "learning_rate": 3.6652756702073845e-05, "loss": 0.3321, "step": 5980 }, { "epoch": 1.0215072117436204, "grad_norm": 0.272996768351977, "learning_rate": 3.663694992412747e-05, "loss": 0.3249, "step": 5985 }, { "epoch": 1.0223606725270973, "grad_norm": 0.3085839108347025, "learning_rate": 3.662114314618109e-05, "loss": 0.3096, "step": 5990 }, { "epoch": 1.0232141333105744, "grad_norm": 0.29749426578944055, "learning_rate": 3.66053363682347e-05, "loss": 0.3403, "step": 5995 }, { "epoch": 1.0240675940940513, "grad_norm": 0.2702607639753831, "learning_rate": 3.658952959028832e-05, "loss": 0.328, "step": 6000 }, { "epoch": 1.0249210548775283, "grad_norm": 0.2899861838611535, "learning_rate": 3.657372281234193e-05, "loss": 0.297, "step": 6005 }, { "epoch": 1.0257745156610054, "grad_norm": 0.31541127306446715, "learning_rate": 3.655791603439555e-05, "loss": 0.3019, "step": 6010 }, { "epoch": 1.0266279764444823, "grad_norm": 0.2763062494437961, "learning_rate": 3.654210925644917e-05, "loss": 0.3134, "step": 6015 }, { "epoch": 1.0274814372279595, "grad_norm": 0.27998753505976093, "learning_rate": 3.6526302478502785e-05, "loss": 0.3279, "step": 6020 }, { "epoch": 1.0283348980114364, "grad_norm": 0.2905958063329204, "learning_rate": 3.65104957005564e-05, "loss": 0.321, "step": 6025 }, { "epoch": 1.0291883587949133, "grad_norm": 0.295189412140392, "learning_rate": 3.649468892261001e-05, "loss": 0.3188, "step": 6030 }, { "epoch": 1.0300418195783905, "grad_norm": 0.33036033745397986, "learning_rate": 3.647888214466364e-05, "loss": 0.3189, "step": 6035 }, { "epoch": 1.0308952803618674, "grad_norm": 0.2756847558012219, "learning_rate": 3.646307536671725e-05, "loss": 0.312, "step": 6040 }, { "epoch": 1.0317487411453443, "grad_norm": 0.258600693929412, "learning_rate": 3.6447268588770865e-05, "loss": 0.3436, "step": 6045 }, { "epoch": 1.0326022019288215, "grad_norm": 0.23722766754762667, "learning_rate": 3.643146181082448e-05, "loss": 0.3142, "step": 6050 }, { "epoch": 1.0334556627122984, "grad_norm": 0.38133003370899715, "learning_rate": 3.64156550328781e-05, "loss": 0.3269, "step": 6055 }, { "epoch": 1.0343091234957753, "grad_norm": 0.25912699540529355, "learning_rate": 3.639984825493172e-05, "loss": 0.3239, "step": 6060 }, { "epoch": 1.0351625842792525, "grad_norm": 0.2793529884454957, "learning_rate": 3.638404147698533e-05, "loss": 0.3312, "step": 6065 }, { "epoch": 1.0360160450627294, "grad_norm": 0.2668576908314199, "learning_rate": 3.636823469903895e-05, "loss": 0.3089, "step": 6070 }, { "epoch": 1.0368695058462063, "grad_norm": 0.3137694639600711, "learning_rate": 3.635242792109257e-05, "loss": 0.3162, "step": 6075 }, { "epoch": 1.0377229666296834, "grad_norm": 0.2845967668176905, "learning_rate": 3.633662114314618e-05, "loss": 0.3186, "step": 6080 }, { "epoch": 1.0385764274131604, "grad_norm": 0.32146979940727605, "learning_rate": 3.63208143651998e-05, "loss": 0.3413, "step": 6085 }, { "epoch": 1.0394298881966373, "grad_norm": 0.37300228442833944, "learning_rate": 3.6305007587253416e-05, "loss": 0.3386, "step": 6090 }, { "epoch": 1.0402833489801144, "grad_norm": 0.26296254759673826, "learning_rate": 3.628920080930703e-05, "loss": 0.3243, "step": 6095 }, { "epoch": 1.0411368097635914, "grad_norm": 0.33019829231834297, "learning_rate": 3.6273394031360644e-05, "loss": 0.3037, "step": 6100 }, { "epoch": 1.0419902705470683, "grad_norm": 0.3008189439261612, "learning_rate": 3.625758725341427e-05, "loss": 0.298, "step": 6105 }, { "epoch": 1.0428437313305454, "grad_norm": 0.34285065404911863, "learning_rate": 3.6241780475467885e-05, "loss": 0.3343, "step": 6110 }, { "epoch": 1.0436971921140223, "grad_norm": 0.32886883400361533, "learning_rate": 3.6225973697521496e-05, "loss": 0.3226, "step": 6115 }, { "epoch": 1.0445506528974993, "grad_norm": 0.320777420743409, "learning_rate": 3.621016691957512e-05, "loss": 0.3315, "step": 6120 }, { "epoch": 1.0454041136809764, "grad_norm": 0.2645612833483574, "learning_rate": 3.619436014162873e-05, "loss": 0.3417, "step": 6125 }, { "epoch": 1.0462575744644533, "grad_norm": 0.3034169948981878, "learning_rate": 3.617855336368235e-05, "loss": 0.3281, "step": 6130 }, { "epoch": 1.0471110352479303, "grad_norm": 0.3382840918749714, "learning_rate": 3.6162746585735966e-05, "loss": 0.32, "step": 6135 }, { "epoch": 1.0479644960314074, "grad_norm": 0.27721229684187554, "learning_rate": 3.614693980778958e-05, "loss": 0.3311, "step": 6140 }, { "epoch": 1.0488179568148843, "grad_norm": 0.2252503412584565, "learning_rate": 3.61311330298432e-05, "loss": 0.2939, "step": 6145 }, { "epoch": 1.0496714175983612, "grad_norm": 0.2627240387533886, "learning_rate": 3.611532625189681e-05, "loss": 0.3311, "step": 6150 }, { "epoch": 1.0505248783818384, "grad_norm": 0.3083101120287208, "learning_rate": 3.6099519473950436e-05, "loss": 0.3499, "step": 6155 }, { "epoch": 1.0513783391653153, "grad_norm": 0.2831576886378951, "learning_rate": 3.6083712696004046e-05, "loss": 0.2985, "step": 6160 }, { "epoch": 1.0522317999487925, "grad_norm": 0.2811129121249787, "learning_rate": 3.6067905918057664e-05, "loss": 0.3158, "step": 6165 }, { "epoch": 1.0530852607322694, "grad_norm": 0.3069868284607618, "learning_rate": 3.605209914011128e-05, "loss": 0.3216, "step": 6170 }, { "epoch": 1.0539387215157463, "grad_norm": 0.2940541238904923, "learning_rate": 3.60362923621649e-05, "loss": 0.3078, "step": 6175 }, { "epoch": 1.0547921822992234, "grad_norm": 0.27017426181518023, "learning_rate": 3.6020485584218516e-05, "loss": 0.3063, "step": 6180 }, { "epoch": 1.0556456430827004, "grad_norm": 0.27575379826107677, "learning_rate": 3.600467880627213e-05, "loss": 0.3091, "step": 6185 }, { "epoch": 1.0564991038661773, "grad_norm": 0.31159895834407836, "learning_rate": 3.598887202832575e-05, "loss": 0.3306, "step": 6190 }, { "epoch": 1.0573525646496544, "grad_norm": 0.3715744831309604, "learning_rate": 3.597306525037936e-05, "loss": 0.3146, "step": 6195 }, { "epoch": 1.0582060254331314, "grad_norm": 0.26393197062752566, "learning_rate": 3.595725847243298e-05, "loss": 0.2991, "step": 6200 }, { "epoch": 1.0590594862166083, "grad_norm": 0.32886162007737935, "learning_rate": 3.59414516944866e-05, "loss": 0.3143, "step": 6205 }, { "epoch": 1.0599129470000854, "grad_norm": 0.29740671242330813, "learning_rate": 3.5925644916540214e-05, "loss": 0.3395, "step": 6210 }, { "epoch": 1.0607664077835623, "grad_norm": 0.31733726151873687, "learning_rate": 3.590983813859383e-05, "loss": 0.3361, "step": 6215 }, { "epoch": 1.0616198685670393, "grad_norm": 0.26228435653660825, "learning_rate": 3.589403136064744e-05, "loss": 0.3151, "step": 6220 }, { "epoch": 1.0624733293505164, "grad_norm": 0.290319432436748, "learning_rate": 3.5878224582701067e-05, "loss": 0.3025, "step": 6225 }, { "epoch": 1.0633267901339933, "grad_norm": 0.2770820269444923, "learning_rate": 3.5862417804754684e-05, "loss": 0.3069, "step": 6230 }, { "epoch": 1.0641802509174703, "grad_norm": 0.2501325512200226, "learning_rate": 3.5846611026808295e-05, "loss": 0.3363, "step": 6235 }, { "epoch": 1.0650337117009474, "grad_norm": 0.2777310168159857, "learning_rate": 3.583080424886192e-05, "loss": 0.3369, "step": 6240 }, { "epoch": 1.0658871724844243, "grad_norm": 0.31665307655396807, "learning_rate": 3.581499747091553e-05, "loss": 0.3317, "step": 6245 }, { "epoch": 1.0667406332679013, "grad_norm": 0.2783930635168734, "learning_rate": 3.579919069296915e-05, "loss": 0.3123, "step": 6250 }, { "epoch": 1.0675940940513784, "grad_norm": 0.35565703446609437, "learning_rate": 3.5783383915022765e-05, "loss": 0.3575, "step": 6255 }, { "epoch": 1.0684475548348553, "grad_norm": 0.28701999234787584, "learning_rate": 3.576757713707638e-05, "loss": 0.3193, "step": 6260 }, { "epoch": 1.0693010156183322, "grad_norm": 0.29932831484010936, "learning_rate": 3.575177035913e-05, "loss": 0.3239, "step": 6265 }, { "epoch": 1.0701544764018094, "grad_norm": 0.3532687789523987, "learning_rate": 3.573596358118361e-05, "loss": 0.3336, "step": 6270 }, { "epoch": 1.0710079371852863, "grad_norm": 0.2922445972362714, "learning_rate": 3.5720156803237234e-05, "loss": 0.2974, "step": 6275 }, { "epoch": 1.0718613979687635, "grad_norm": 0.3064497147911919, "learning_rate": 3.5704350025290845e-05, "loss": 0.3124, "step": 6280 }, { "epoch": 1.0727148587522404, "grad_norm": 0.28262237880481483, "learning_rate": 3.568854324734446e-05, "loss": 0.3026, "step": 6285 }, { "epoch": 1.0735683195357173, "grad_norm": 0.28124406183217454, "learning_rate": 3.567273646939808e-05, "loss": 0.3163, "step": 6290 }, { "epoch": 1.0744217803191942, "grad_norm": 0.2970441582962062, "learning_rate": 3.56569296914517e-05, "loss": 0.3195, "step": 6295 }, { "epoch": 1.0752752411026714, "grad_norm": 0.3077923571348644, "learning_rate": 3.5641122913505315e-05, "loss": 0.3163, "step": 6300 }, { "epoch": 1.0761287018861483, "grad_norm": 0.2633888435190342, "learning_rate": 3.5625316135558926e-05, "loss": 0.3357, "step": 6305 }, { "epoch": 1.0769821626696254, "grad_norm": 0.31861464801931644, "learning_rate": 3.560950935761255e-05, "loss": 0.3486, "step": 6310 }, { "epoch": 1.0778356234531024, "grad_norm": 0.28051581398536907, "learning_rate": 3.559370257966616e-05, "loss": 0.3194, "step": 6315 }, { "epoch": 1.0786890842365793, "grad_norm": 0.2902327079919843, "learning_rate": 3.557789580171978e-05, "loss": 0.3083, "step": 6320 }, { "epoch": 1.0795425450200564, "grad_norm": 0.33482027382933915, "learning_rate": 3.5562089023773395e-05, "loss": 0.3111, "step": 6325 }, { "epoch": 1.0803960058035333, "grad_norm": 0.3246595744902799, "learning_rate": 3.554628224582701e-05, "loss": 0.3279, "step": 6330 }, { "epoch": 1.0812494665870103, "grad_norm": 0.27032733796093267, "learning_rate": 3.553047546788063e-05, "loss": 0.3103, "step": 6335 }, { "epoch": 1.0821029273704874, "grad_norm": 0.2824536276459102, "learning_rate": 3.551466868993424e-05, "loss": 0.3608, "step": 6340 }, { "epoch": 1.0829563881539643, "grad_norm": 0.27173068617903157, "learning_rate": 3.5498861911987865e-05, "loss": 0.3222, "step": 6345 }, { "epoch": 1.0838098489374413, "grad_norm": 0.2522998378428293, "learning_rate": 3.5483055134041476e-05, "loss": 0.3067, "step": 6350 }, { "epoch": 1.0846633097209184, "grad_norm": 0.3059043780900187, "learning_rate": 3.546724835609509e-05, "loss": 0.3251, "step": 6355 }, { "epoch": 1.0855167705043953, "grad_norm": 0.27986395766720584, "learning_rate": 3.545144157814871e-05, "loss": 0.3163, "step": 6360 }, { "epoch": 1.0863702312878722, "grad_norm": 0.283966150757833, "learning_rate": 3.543563480020233e-05, "loss": 0.299, "step": 6365 }, { "epoch": 1.0872236920713494, "grad_norm": 0.37710456352683047, "learning_rate": 3.5419828022255946e-05, "loss": 0.3397, "step": 6370 }, { "epoch": 1.0880771528548263, "grad_norm": 0.2933746257388099, "learning_rate": 3.540402124430956e-05, "loss": 0.3078, "step": 6375 }, { "epoch": 1.0889306136383032, "grad_norm": 0.26287343754891423, "learning_rate": 3.538821446636318e-05, "loss": 0.3392, "step": 6380 }, { "epoch": 1.0897840744217804, "grad_norm": 0.2823375011765789, "learning_rate": 3.537240768841679e-05, "loss": 0.3212, "step": 6385 }, { "epoch": 1.0906375352052573, "grad_norm": 0.3146987645542551, "learning_rate": 3.535660091047041e-05, "loss": 0.3134, "step": 6390 }, { "epoch": 1.0914909959887342, "grad_norm": 0.2750047563322391, "learning_rate": 3.534079413252403e-05, "loss": 0.2946, "step": 6395 }, { "epoch": 1.0923444567722114, "grad_norm": 0.36474554090039885, "learning_rate": 3.5324987354577644e-05, "loss": 0.3328, "step": 6400 }, { "epoch": 1.0931979175556883, "grad_norm": 0.29675545735578923, "learning_rate": 3.530918057663126e-05, "loss": 0.3187, "step": 6405 }, { "epoch": 1.0940513783391652, "grad_norm": 0.2895424498475061, "learning_rate": 3.529337379868488e-05, "loss": 0.3013, "step": 6410 }, { "epoch": 1.0949048391226424, "grad_norm": 0.3128653394146357, "learning_rate": 3.5277567020738496e-05, "loss": 0.3216, "step": 6415 }, { "epoch": 1.0957582999061193, "grad_norm": 0.27716709222589075, "learning_rate": 3.5261760242792114e-05, "loss": 0.318, "step": 6420 }, { "epoch": 1.0966117606895964, "grad_norm": 0.2753194039140635, "learning_rate": 3.5245953464845724e-05, "loss": 0.3333, "step": 6425 }, { "epoch": 1.0974652214730733, "grad_norm": 0.3397307502562668, "learning_rate": 3.523014668689935e-05, "loss": 0.3361, "step": 6430 }, { "epoch": 1.0983186822565503, "grad_norm": 0.2868784284223315, "learning_rate": 3.521433990895296e-05, "loss": 0.3159, "step": 6435 }, { "epoch": 1.0991721430400272, "grad_norm": 0.3159503590297133, "learning_rate": 3.5198533131006577e-05, "loss": 0.326, "step": 6440 }, { "epoch": 1.1000256038235043, "grad_norm": 0.33760793936878447, "learning_rate": 3.5182726353060194e-05, "loss": 0.3287, "step": 6445 }, { "epoch": 1.1008790646069813, "grad_norm": 0.2974061789276727, "learning_rate": 3.516691957511381e-05, "loss": 0.3047, "step": 6450 }, { "epoch": 1.1017325253904584, "grad_norm": 0.2931502943269955, "learning_rate": 3.515111279716743e-05, "loss": 0.2985, "step": 6455 }, { "epoch": 1.1025859861739353, "grad_norm": 0.3049117257446533, "learning_rate": 3.513530601922104e-05, "loss": 0.336, "step": 6460 }, { "epoch": 1.1034394469574123, "grad_norm": 0.3110935040050201, "learning_rate": 3.5119499241274664e-05, "loss": 0.342, "step": 6465 }, { "epoch": 1.1042929077408894, "grad_norm": 0.3316836302493635, "learning_rate": 3.5103692463328275e-05, "loss": 0.3131, "step": 6470 }, { "epoch": 1.1051463685243663, "grad_norm": 0.2711344960467574, "learning_rate": 3.508788568538189e-05, "loss": 0.3124, "step": 6475 }, { "epoch": 1.1059998293078432, "grad_norm": 0.3218591023490937, "learning_rate": 3.507207890743551e-05, "loss": 0.3035, "step": 6480 }, { "epoch": 1.1068532900913204, "grad_norm": 0.3586101157122948, "learning_rate": 3.505627212948913e-05, "loss": 0.3325, "step": 6485 }, { "epoch": 1.1077067508747973, "grad_norm": 0.21206274145351137, "learning_rate": 3.5040465351542744e-05, "loss": 0.3243, "step": 6490 }, { "epoch": 1.1085602116582742, "grad_norm": 0.3129465651433643, "learning_rate": 3.5024658573596355e-05, "loss": 0.3112, "step": 6495 }, { "epoch": 1.1094136724417514, "grad_norm": 0.2405011572250767, "learning_rate": 3.500885179564998e-05, "loss": 0.3066, "step": 6500 }, { "epoch": 1.1102671332252283, "grad_norm": 0.3250639206122012, "learning_rate": 3.499304501770359e-05, "loss": 0.3132, "step": 6505 }, { "epoch": 1.1111205940087052, "grad_norm": 0.2932654498755712, "learning_rate": 3.497723823975721e-05, "loss": 0.3162, "step": 6510 }, { "epoch": 1.1119740547921824, "grad_norm": 0.3886924148303815, "learning_rate": 3.496143146181083e-05, "loss": 0.3043, "step": 6515 }, { "epoch": 1.1128275155756593, "grad_norm": 0.28407232017417394, "learning_rate": 3.494562468386444e-05, "loss": 0.2938, "step": 6520 }, { "epoch": 1.1136809763591362, "grad_norm": 0.3718491161601195, "learning_rate": 3.492981790591806e-05, "loss": 0.3287, "step": 6525 }, { "epoch": 1.1145344371426134, "grad_norm": 0.31621668143975573, "learning_rate": 3.491401112797168e-05, "loss": 0.3388, "step": 6530 }, { "epoch": 1.1153878979260903, "grad_norm": 0.3059366389922238, "learning_rate": 3.4898204350025295e-05, "loss": 0.336, "step": 6535 }, { "epoch": 1.1162413587095672, "grad_norm": 0.26249822345866347, "learning_rate": 3.4882397572078905e-05, "loss": 0.3202, "step": 6540 }, { "epoch": 1.1170948194930443, "grad_norm": 0.25544305725881944, "learning_rate": 3.486659079413252e-05, "loss": 0.3416, "step": 6545 }, { "epoch": 1.1179482802765213, "grad_norm": 0.2759904263290515, "learning_rate": 3.485078401618615e-05, "loss": 0.3213, "step": 6550 }, { "epoch": 1.1188017410599982, "grad_norm": 0.29218826580991536, "learning_rate": 3.483497723823976e-05, "loss": 0.3172, "step": 6555 }, { "epoch": 1.1196552018434753, "grad_norm": 0.34389139461681123, "learning_rate": 3.4819170460293375e-05, "loss": 0.3142, "step": 6560 }, { "epoch": 1.1205086626269523, "grad_norm": 0.2502407347967488, "learning_rate": 3.480336368234699e-05, "loss": 0.3039, "step": 6565 }, { "epoch": 1.1213621234104294, "grad_norm": 0.28688943308133336, "learning_rate": 3.478755690440061e-05, "loss": 0.324, "step": 6570 }, { "epoch": 1.1222155841939063, "grad_norm": 0.2718367278503554, "learning_rate": 3.477175012645423e-05, "loss": 0.3152, "step": 6575 }, { "epoch": 1.1230690449773832, "grad_norm": 0.2664458667970966, "learning_rate": 3.475594334850784e-05, "loss": 0.3359, "step": 6580 }, { "epoch": 1.1239225057608604, "grad_norm": 0.3164795394426142, "learning_rate": 3.474013657056146e-05, "loss": 0.3169, "step": 6585 }, { "epoch": 1.1247759665443373, "grad_norm": 0.28043028945331744, "learning_rate": 3.472432979261507e-05, "loss": 0.3139, "step": 6590 }, { "epoch": 1.1256294273278142, "grad_norm": 0.29671559334149883, "learning_rate": 3.470852301466869e-05, "loss": 0.3097, "step": 6595 }, { "epoch": 1.1264828881112914, "grad_norm": 0.3569874986394922, "learning_rate": 3.469271623672231e-05, "loss": 0.3384, "step": 6600 }, { "epoch": 1.1273363488947683, "grad_norm": 0.3160429973351902, "learning_rate": 3.4676909458775926e-05, "loss": 0.3236, "step": 6605 }, { "epoch": 1.1281898096782452, "grad_norm": 0.27139504249372776, "learning_rate": 3.466110268082954e-05, "loss": 0.3347, "step": 6610 }, { "epoch": 1.1290432704617224, "grad_norm": 0.2652025535027243, "learning_rate": 3.4645295902883154e-05, "loss": 0.3268, "step": 6615 }, { "epoch": 1.1298967312451993, "grad_norm": 0.307277684980844, "learning_rate": 3.462948912493678e-05, "loss": 0.3206, "step": 6620 }, { "epoch": 1.1307501920286762, "grad_norm": 0.2883152499853476, "learning_rate": 3.461368234699039e-05, "loss": 0.2976, "step": 6625 }, { "epoch": 1.1316036528121534, "grad_norm": 0.25683925549057673, "learning_rate": 3.4597875569044006e-05, "loss": 0.3009, "step": 6630 }, { "epoch": 1.1324571135956303, "grad_norm": 0.28164158792619826, "learning_rate": 3.4582068791097624e-05, "loss": 0.2825, "step": 6635 }, { "epoch": 1.1333105743791072, "grad_norm": 0.30765945957916585, "learning_rate": 3.456626201315124e-05, "loss": 0.3023, "step": 6640 }, { "epoch": 1.1341640351625843, "grad_norm": 0.34042371295519186, "learning_rate": 3.455045523520486e-05, "loss": 0.3089, "step": 6645 }, { "epoch": 1.1350174959460613, "grad_norm": 0.29237710252993526, "learning_rate": 3.4534648457258476e-05, "loss": 0.3201, "step": 6650 }, { "epoch": 1.1358709567295382, "grad_norm": 0.2824818751971144, "learning_rate": 3.451884167931209e-05, "loss": 0.315, "step": 6655 }, { "epoch": 1.1367244175130153, "grad_norm": 0.3157495620803381, "learning_rate": 3.4503034901365704e-05, "loss": 0.3154, "step": 6660 }, { "epoch": 1.1375778782964923, "grad_norm": 0.3474083445795491, "learning_rate": 3.448722812341932e-05, "loss": 0.335, "step": 6665 }, { "epoch": 1.1384313390799692, "grad_norm": 0.3096824179835637, "learning_rate": 3.4471421345472946e-05, "loss": 0.3124, "step": 6670 }, { "epoch": 1.1392847998634463, "grad_norm": 0.2722898268409296, "learning_rate": 3.4455614567526556e-05, "loss": 0.3457, "step": 6675 }, { "epoch": 1.1401382606469233, "grad_norm": 0.2683720965153674, "learning_rate": 3.4439807789580174e-05, "loss": 0.3117, "step": 6680 }, { "epoch": 1.1409917214304004, "grad_norm": 0.25170165400733124, "learning_rate": 3.442400101163379e-05, "loss": 0.3214, "step": 6685 }, { "epoch": 1.1418451822138773, "grad_norm": 0.2628073474516704, "learning_rate": 3.440819423368741e-05, "loss": 0.376, "step": 6690 }, { "epoch": 1.1426986429973542, "grad_norm": 0.31337074816044674, "learning_rate": 3.439238745574102e-05, "loss": 0.3089, "step": 6695 }, { "epoch": 1.1435521037808312, "grad_norm": 0.2684952578288054, "learning_rate": 3.437658067779464e-05, "loss": 0.334, "step": 6700 }, { "epoch": 1.1444055645643083, "grad_norm": 0.26807561756180553, "learning_rate": 3.436077389984826e-05, "loss": 0.3328, "step": 6705 }, { "epoch": 1.1452590253477852, "grad_norm": 0.28793050496923717, "learning_rate": 3.434496712190187e-05, "loss": 0.3201, "step": 6710 }, { "epoch": 1.1461124861312624, "grad_norm": 0.29075982714846826, "learning_rate": 3.432916034395549e-05, "loss": 0.3187, "step": 6715 }, { "epoch": 1.1469659469147393, "grad_norm": 0.26907395442374854, "learning_rate": 3.431335356600911e-05, "loss": 0.3215, "step": 6720 }, { "epoch": 1.1478194076982162, "grad_norm": 0.2810140460988384, "learning_rate": 3.4297546788062724e-05, "loss": 0.3121, "step": 6725 }, { "epoch": 1.1486728684816931, "grad_norm": 0.30611097316775143, "learning_rate": 3.428174001011634e-05, "loss": 0.3327, "step": 6730 }, { "epoch": 1.1495263292651703, "grad_norm": 0.27617016952026874, "learning_rate": 3.426593323216995e-05, "loss": 0.2873, "step": 6735 }, { "epoch": 1.1503797900486472, "grad_norm": 0.34549502819169436, "learning_rate": 3.4250126454223577e-05, "loss": 0.3376, "step": 6740 }, { "epoch": 1.1512332508321244, "grad_norm": 0.2986221766304674, "learning_rate": 3.423431967627719e-05, "loss": 0.3208, "step": 6745 }, { "epoch": 1.1520867116156013, "grad_norm": 0.3138957620139587, "learning_rate": 3.4218512898330805e-05, "loss": 0.3314, "step": 6750 }, { "epoch": 1.1529401723990782, "grad_norm": 0.2960133983970914, "learning_rate": 3.420270612038442e-05, "loss": 0.3338, "step": 6755 }, { "epoch": 1.1537936331825553, "grad_norm": 0.3649595092789743, "learning_rate": 3.418689934243804e-05, "loss": 0.3003, "step": 6760 }, { "epoch": 1.1546470939660323, "grad_norm": 0.2555264044441144, "learning_rate": 3.417109256449166e-05, "loss": 0.3218, "step": 6765 }, { "epoch": 1.1555005547495092, "grad_norm": 0.34700792264884817, "learning_rate": 3.4155285786545275e-05, "loss": 0.3216, "step": 6770 }, { "epoch": 1.1563540155329863, "grad_norm": 0.325294091402767, "learning_rate": 3.413947900859889e-05, "loss": 0.2966, "step": 6775 }, { "epoch": 1.1572074763164633, "grad_norm": 0.34287727572604926, "learning_rate": 3.41236722306525e-05, "loss": 0.3327, "step": 6780 }, { "epoch": 1.1580609370999402, "grad_norm": 0.3282649000355505, "learning_rate": 3.410786545270612e-05, "loss": 0.3199, "step": 6785 }, { "epoch": 1.1589143978834173, "grad_norm": 0.3214720193885718, "learning_rate": 3.409205867475974e-05, "loss": 0.3151, "step": 6790 }, { "epoch": 1.1597678586668942, "grad_norm": 0.26634569016703674, "learning_rate": 3.4076251896813355e-05, "loss": 0.3222, "step": 6795 }, { "epoch": 1.1606213194503712, "grad_norm": 0.2748073471408819, "learning_rate": 3.406044511886697e-05, "loss": 0.332, "step": 6800 }, { "epoch": 1.1614747802338483, "grad_norm": 0.2888414501818845, "learning_rate": 3.404463834092059e-05, "loss": 0.3355, "step": 6805 }, { "epoch": 1.1623282410173252, "grad_norm": 0.32310201790919124, "learning_rate": 3.402883156297421e-05, "loss": 0.3242, "step": 6810 }, { "epoch": 1.1631817018008022, "grad_norm": 0.30865671720221644, "learning_rate": 3.401302478502782e-05, "loss": 0.3047, "step": 6815 }, { "epoch": 1.1640351625842793, "grad_norm": 0.29594056024421805, "learning_rate": 3.3997218007081436e-05, "loss": 0.3233, "step": 6820 }, { "epoch": 1.1648886233677562, "grad_norm": 0.279033280394343, "learning_rate": 3.398141122913506e-05, "loss": 0.3205, "step": 6825 }, { "epoch": 1.1657420841512334, "grad_norm": 0.30130824745544177, "learning_rate": 3.396560445118867e-05, "loss": 0.3376, "step": 6830 }, { "epoch": 1.1665955449347103, "grad_norm": 0.3005767506500797, "learning_rate": 3.394979767324229e-05, "loss": 0.3236, "step": 6835 }, { "epoch": 1.1674490057181872, "grad_norm": 0.30918069828741435, "learning_rate": 3.3933990895295905e-05, "loss": 0.3245, "step": 6840 }, { "epoch": 1.1683024665016641, "grad_norm": 0.30339392907351065, "learning_rate": 3.391818411734952e-05, "loss": 0.3251, "step": 6845 }, { "epoch": 1.1691559272851413, "grad_norm": 0.27455410368720506, "learning_rate": 3.3902377339403134e-05, "loss": 0.3337, "step": 6850 }, { "epoch": 1.1700093880686182, "grad_norm": 0.24657753116687448, "learning_rate": 3.388657056145675e-05, "loss": 0.3183, "step": 6855 }, { "epoch": 1.1708628488520954, "grad_norm": 0.2704717699063409, "learning_rate": 3.3870763783510375e-05, "loss": 0.3091, "step": 6860 }, { "epoch": 1.1717163096355723, "grad_norm": 0.276387526875196, "learning_rate": 3.3854957005563986e-05, "loss": 0.3335, "step": 6865 }, { "epoch": 1.1725697704190492, "grad_norm": 0.31865721833953886, "learning_rate": 3.3839150227617603e-05, "loss": 0.3202, "step": 6870 }, { "epoch": 1.1734232312025263, "grad_norm": 0.2760777564702528, "learning_rate": 3.382334344967122e-05, "loss": 0.3279, "step": 6875 }, { "epoch": 1.1742766919860033, "grad_norm": 0.2765645958510654, "learning_rate": 3.380753667172484e-05, "loss": 0.3482, "step": 6880 }, { "epoch": 1.1751301527694802, "grad_norm": 0.2433236598092554, "learning_rate": 3.3791729893778456e-05, "loss": 0.322, "step": 6885 }, { "epoch": 1.1759836135529573, "grad_norm": 0.2520311774959746, "learning_rate": 3.377592311583207e-05, "loss": 0.3212, "step": 6890 }, { "epoch": 1.1768370743364343, "grad_norm": 0.27096565976142206, "learning_rate": 3.376011633788569e-05, "loss": 0.3259, "step": 6895 }, { "epoch": 1.1776905351199112, "grad_norm": 0.36178275279487926, "learning_rate": 3.37443095599393e-05, "loss": 0.3292, "step": 6900 }, { "epoch": 1.1785439959033883, "grad_norm": 0.2651880889912189, "learning_rate": 3.372850278199292e-05, "loss": 0.2944, "step": 6905 }, { "epoch": 1.1793974566868652, "grad_norm": 0.3208081698327905, "learning_rate": 3.3712696004046536e-05, "loss": 0.3135, "step": 6910 }, { "epoch": 1.1802509174703422, "grad_norm": 0.3103835178357861, "learning_rate": 3.3696889226100154e-05, "loss": 0.3197, "step": 6915 }, { "epoch": 1.1811043782538193, "grad_norm": 0.26953117813535027, "learning_rate": 3.368108244815377e-05, "loss": 0.3369, "step": 6920 }, { "epoch": 1.1819578390372962, "grad_norm": 0.3277061207667989, "learning_rate": 3.366527567020739e-05, "loss": 0.3116, "step": 6925 }, { "epoch": 1.1828112998207732, "grad_norm": 0.2507338362764609, "learning_rate": 3.3649468892261006e-05, "loss": 0.3261, "step": 6930 }, { "epoch": 1.1836647606042503, "grad_norm": 0.3881962233715374, "learning_rate": 3.363366211431462e-05, "loss": 0.3306, "step": 6935 }, { "epoch": 1.1845182213877272, "grad_norm": 0.2930498960642902, "learning_rate": 3.3617855336368234e-05, "loss": 0.3211, "step": 6940 }, { "epoch": 1.1853716821712041, "grad_norm": 0.2947095468378264, "learning_rate": 3.360204855842185e-05, "loss": 0.3115, "step": 6945 }, { "epoch": 1.1862251429546813, "grad_norm": 0.2850914323392816, "learning_rate": 3.358624178047547e-05, "loss": 0.3114, "step": 6950 }, { "epoch": 1.1870786037381582, "grad_norm": 0.2810612022770895, "learning_rate": 3.357043500252909e-05, "loss": 0.3308, "step": 6955 }, { "epoch": 1.1879320645216351, "grad_norm": 0.2563603759867467, "learning_rate": 3.3554628224582704e-05, "loss": 0.3128, "step": 6960 }, { "epoch": 1.1887855253051123, "grad_norm": 0.3283358592701071, "learning_rate": 3.353882144663632e-05, "loss": 0.3359, "step": 6965 }, { "epoch": 1.1896389860885892, "grad_norm": 0.31336832787615065, "learning_rate": 3.352301466868993e-05, "loss": 0.3167, "step": 6970 }, { "epoch": 1.1904924468720663, "grad_norm": 0.2923458439617226, "learning_rate": 3.350720789074355e-05, "loss": 0.3334, "step": 6975 }, { "epoch": 1.1913459076555433, "grad_norm": 0.3677069629690386, "learning_rate": 3.3491401112797174e-05, "loss": 0.3263, "step": 6980 }, { "epoch": 1.1921993684390202, "grad_norm": 0.28824766917487327, "learning_rate": 3.3475594334850785e-05, "loss": 0.295, "step": 6985 }, { "epoch": 1.1930528292224971, "grad_norm": 0.32296238372195024, "learning_rate": 3.34597875569044e-05, "loss": 0.3305, "step": 6990 }, { "epoch": 1.1939062900059743, "grad_norm": 0.29678274993089393, "learning_rate": 3.344398077895802e-05, "loss": 0.3235, "step": 6995 }, { "epoch": 1.1947597507894512, "grad_norm": 0.2802795591229062, "learning_rate": 3.342817400101164e-05, "loss": 0.3153, "step": 7000 }, { "epoch": 1.1956132115729283, "grad_norm": 0.3035574715573163, "learning_rate": 3.341236722306525e-05, "loss": 0.2973, "step": 7005 }, { "epoch": 1.1964666723564052, "grad_norm": 0.2832345297490118, "learning_rate": 3.339656044511887e-05, "loss": 0.3388, "step": 7010 }, { "epoch": 1.1973201331398822, "grad_norm": 0.29519821485924336, "learning_rate": 3.338075366717249e-05, "loss": 0.3607, "step": 7015 }, { "epoch": 1.1981735939233593, "grad_norm": 0.3692064500552574, "learning_rate": 3.33649468892261e-05, "loss": 0.3156, "step": 7020 }, { "epoch": 1.1990270547068362, "grad_norm": 0.34456361255890144, "learning_rate": 3.334914011127972e-05, "loss": 0.308, "step": 7025 }, { "epoch": 1.1998805154903132, "grad_norm": 0.31782406045262, "learning_rate": 3.3333333333333335e-05, "loss": 0.3223, "step": 7030 }, { "epoch": 1.2007339762737903, "grad_norm": 0.29991042754422387, "learning_rate": 3.331752655538695e-05, "loss": 0.3399, "step": 7035 }, { "epoch": 1.2015874370572672, "grad_norm": 0.33476149298873126, "learning_rate": 3.330171977744056e-05, "loss": 0.3191, "step": 7040 }, { "epoch": 1.2024408978407441, "grad_norm": 0.31685385200039984, "learning_rate": 3.328591299949419e-05, "loss": 0.3162, "step": 7045 }, { "epoch": 1.2032943586242213, "grad_norm": 0.32578331619571094, "learning_rate": 3.3270106221547805e-05, "loss": 0.3224, "step": 7050 }, { "epoch": 1.2041478194076982, "grad_norm": 0.2593142568823331, "learning_rate": 3.3254299443601415e-05, "loss": 0.3208, "step": 7055 }, { "epoch": 1.2050012801911751, "grad_norm": 0.28859327552597763, "learning_rate": 3.323849266565503e-05, "loss": 0.3125, "step": 7060 }, { "epoch": 1.2058547409746523, "grad_norm": 0.28784679226335225, "learning_rate": 3.322268588770865e-05, "loss": 0.3005, "step": 7065 }, { "epoch": 1.2067082017581292, "grad_norm": 0.3134419793152386, "learning_rate": 3.320687910976227e-05, "loss": 0.3217, "step": 7070 }, { "epoch": 1.2075616625416061, "grad_norm": 0.24494689974367254, "learning_rate": 3.3191072331815885e-05, "loss": 0.3071, "step": 7075 }, { "epoch": 1.2084151233250833, "grad_norm": 0.31244508305372193, "learning_rate": 3.31752655538695e-05, "loss": 0.3176, "step": 7080 }, { "epoch": 1.2092685841085602, "grad_norm": 0.3066525106514405, "learning_rate": 3.315945877592312e-05, "loss": 0.3454, "step": 7085 }, { "epoch": 1.2101220448920371, "grad_norm": 0.27647638924895823, "learning_rate": 3.314365199797673e-05, "loss": 0.3305, "step": 7090 }, { "epoch": 1.2109755056755143, "grad_norm": 0.2852443123357244, "learning_rate": 3.312784522003035e-05, "loss": 0.327, "step": 7095 }, { "epoch": 1.2118289664589912, "grad_norm": 0.29337733600719496, "learning_rate": 3.3112038442083966e-05, "loss": 0.309, "step": 7100 }, { "epoch": 1.212682427242468, "grad_norm": 0.32413313455273446, "learning_rate": 3.309623166413758e-05, "loss": 0.3427, "step": 7105 }, { "epoch": 1.2135358880259453, "grad_norm": 0.34157318543143006, "learning_rate": 3.30804248861912e-05, "loss": 0.3253, "step": 7110 }, { "epoch": 1.2143893488094222, "grad_norm": 0.32116881544747083, "learning_rate": 3.306461810824482e-05, "loss": 0.3038, "step": 7115 }, { "epoch": 1.2152428095928993, "grad_norm": 0.2886874340843595, "learning_rate": 3.3048811330298436e-05, "loss": 0.3268, "step": 7120 }, { "epoch": 1.2160962703763762, "grad_norm": 0.2826912533747012, "learning_rate": 3.3033004552352046e-05, "loss": 0.3334, "step": 7125 }, { "epoch": 1.2169497311598532, "grad_norm": 0.3167286957868306, "learning_rate": 3.3017197774405664e-05, "loss": 0.318, "step": 7130 }, { "epoch": 1.21780319194333, "grad_norm": 0.34748907062638773, "learning_rate": 3.300139099645929e-05, "loss": 0.3142, "step": 7135 }, { "epoch": 1.2186566527268072, "grad_norm": 0.27407779846816815, "learning_rate": 3.29855842185129e-05, "loss": 0.3369, "step": 7140 }, { "epoch": 1.2195101135102842, "grad_norm": 0.29268181630715245, "learning_rate": 3.2969777440566516e-05, "loss": 0.3034, "step": 7145 }, { "epoch": 1.2203635742937613, "grad_norm": 0.2970878030538742, "learning_rate": 3.2953970662620134e-05, "loss": 0.3062, "step": 7150 }, { "epoch": 1.2212170350772382, "grad_norm": 0.24978423517160062, "learning_rate": 3.293816388467375e-05, "loss": 0.346, "step": 7155 }, { "epoch": 1.2220704958607151, "grad_norm": 0.34511522194799843, "learning_rate": 3.292235710672736e-05, "loss": 0.3293, "step": 7160 }, { "epoch": 1.2229239566441923, "grad_norm": 0.3276031590274247, "learning_rate": 3.2906550328780986e-05, "loss": 0.3445, "step": 7165 }, { "epoch": 1.2237774174276692, "grad_norm": 0.22772889488284223, "learning_rate": 3.2890743550834603e-05, "loss": 0.33, "step": 7170 }, { "epoch": 1.2246308782111461, "grad_norm": 0.27032664050176836, "learning_rate": 3.2874936772888214e-05, "loss": 0.3092, "step": 7175 }, { "epoch": 1.2254843389946233, "grad_norm": 0.32471522806990943, "learning_rate": 3.285912999494183e-05, "loss": 0.318, "step": 7180 }, { "epoch": 1.2263377997781002, "grad_norm": 0.2604087706368128, "learning_rate": 3.284332321699545e-05, "loss": 0.3084, "step": 7185 }, { "epoch": 1.2271912605615771, "grad_norm": 0.2575756679773192, "learning_rate": 3.2827516439049066e-05, "loss": 0.3201, "step": 7190 }, { "epoch": 1.2280447213450543, "grad_norm": 0.26915571198475724, "learning_rate": 3.281170966110268e-05, "loss": 0.3067, "step": 7195 }, { "epoch": 1.2288981821285312, "grad_norm": 0.28408247011258936, "learning_rate": 3.27959028831563e-05, "loss": 0.3261, "step": 7200 }, { "epoch": 1.2297516429120081, "grad_norm": 0.3024104826836887, "learning_rate": 3.278009610520992e-05, "loss": 0.3129, "step": 7205 }, { "epoch": 1.2306051036954853, "grad_norm": 0.30735509917733833, "learning_rate": 3.276428932726353e-05, "loss": 0.297, "step": 7210 }, { "epoch": 1.2314585644789622, "grad_norm": 0.2583791373846269, "learning_rate": 3.274848254931715e-05, "loss": 0.3155, "step": 7215 }, { "epoch": 1.232312025262439, "grad_norm": 0.2814034592326079, "learning_rate": 3.2732675771370764e-05, "loss": 0.3193, "step": 7220 }, { "epoch": 1.2331654860459162, "grad_norm": 0.298757802460849, "learning_rate": 3.271686899342438e-05, "loss": 0.3401, "step": 7225 }, { "epoch": 1.2340189468293932, "grad_norm": 0.3182301503192778, "learning_rate": 3.2701062215478e-05, "loss": 0.3238, "step": 7230 }, { "epoch": 1.23487240761287, "grad_norm": 0.3114906046411783, "learning_rate": 3.268525543753162e-05, "loss": 0.3136, "step": 7235 }, { "epoch": 1.2357258683963472, "grad_norm": 0.24027105017135958, "learning_rate": 3.2669448659585234e-05, "loss": 0.3348, "step": 7240 }, { "epoch": 1.2365793291798242, "grad_norm": 0.27115083254864925, "learning_rate": 3.2653641881638845e-05, "loss": 0.3135, "step": 7245 }, { "epoch": 1.237432789963301, "grad_norm": 0.2693300962569263, "learning_rate": 3.263783510369246e-05, "loss": 0.3078, "step": 7250 }, { "epoch": 1.2382862507467782, "grad_norm": 0.3162541568135139, "learning_rate": 3.262202832574608e-05, "loss": 0.3287, "step": 7255 }, { "epoch": 1.2391397115302552, "grad_norm": 0.2826801336470516, "learning_rate": 3.26062215477997e-05, "loss": 0.326, "step": 7260 }, { "epoch": 1.2399931723137323, "grad_norm": 0.2764201101112852, "learning_rate": 3.2590414769853315e-05, "loss": 0.3307, "step": 7265 }, { "epoch": 1.2408466330972092, "grad_norm": 0.28193484877647185, "learning_rate": 3.257460799190693e-05, "loss": 0.3131, "step": 7270 }, { "epoch": 1.2417000938806861, "grad_norm": 0.2615011270087652, "learning_rate": 3.255880121396055e-05, "loss": 0.3039, "step": 7275 }, { "epoch": 1.242553554664163, "grad_norm": 0.23999314827653234, "learning_rate": 3.254299443601416e-05, "loss": 0.3127, "step": 7280 }, { "epoch": 1.2434070154476402, "grad_norm": 0.2985899289266311, "learning_rate": 3.2527187658067785e-05, "loss": 0.323, "step": 7285 }, { "epoch": 1.2442604762311171, "grad_norm": 0.3068277042987539, "learning_rate": 3.25113808801214e-05, "loss": 0.3234, "step": 7290 }, { "epoch": 1.2451139370145943, "grad_norm": 0.2939428069630531, "learning_rate": 3.249557410217501e-05, "loss": 0.3007, "step": 7295 }, { "epoch": 1.2459673977980712, "grad_norm": 0.2633642070642613, "learning_rate": 3.247976732422863e-05, "loss": 0.3096, "step": 7300 }, { "epoch": 1.2468208585815481, "grad_norm": 0.27117484533509956, "learning_rate": 3.246396054628225e-05, "loss": 0.2994, "step": 7305 }, { "epoch": 1.2476743193650253, "grad_norm": 0.2982240355567415, "learning_rate": 3.2448153768335865e-05, "loss": 0.3258, "step": 7310 }, { "epoch": 1.2485277801485022, "grad_norm": 0.27385051822532996, "learning_rate": 3.2432346990389476e-05, "loss": 0.2956, "step": 7315 }, { "epoch": 1.249381240931979, "grad_norm": 0.2709157042662307, "learning_rate": 3.24165402124431e-05, "loss": 0.2929, "step": 7320 }, { "epoch": 1.2502347017154563, "grad_norm": 0.23189101055532887, "learning_rate": 3.240073343449672e-05, "loss": 0.3283, "step": 7325 }, { "epoch": 1.2510881624989332, "grad_norm": 0.2796388575169934, "learning_rate": 3.238492665655033e-05, "loss": 0.291, "step": 7330 }, { "epoch": 1.25194162328241, "grad_norm": 0.2783917507577443, "learning_rate": 3.2369119878603946e-05, "loss": 0.337, "step": 7335 }, { "epoch": 1.2527950840658872, "grad_norm": 0.25279814229940445, "learning_rate": 3.235331310065756e-05, "loss": 0.3219, "step": 7340 }, { "epoch": 1.2536485448493642, "grad_norm": 0.27567482499497425, "learning_rate": 3.233750632271118e-05, "loss": 0.3013, "step": 7345 }, { "epoch": 1.254502005632841, "grad_norm": 0.36890866644918363, "learning_rate": 3.232169954476479e-05, "loss": 0.2889, "step": 7350 }, { "epoch": 1.2553554664163182, "grad_norm": 0.2961958610132813, "learning_rate": 3.2305892766818415e-05, "loss": 0.299, "step": 7355 }, { "epoch": 1.2562089271997952, "grad_norm": 0.29966488815431097, "learning_rate": 3.229008598887203e-05, "loss": 0.3197, "step": 7360 }, { "epoch": 1.257062387983272, "grad_norm": 0.28234297534057884, "learning_rate": 3.2274279210925644e-05, "loss": 0.3076, "step": 7365 }, { "epoch": 1.2579158487667492, "grad_norm": 0.2615090554293542, "learning_rate": 3.225847243297926e-05, "loss": 0.3117, "step": 7370 }, { "epoch": 1.2587693095502261, "grad_norm": 0.2871075950638998, "learning_rate": 3.224266565503288e-05, "loss": 0.316, "step": 7375 }, { "epoch": 1.2596227703337033, "grad_norm": 0.28538506002189695, "learning_rate": 3.2226858877086496e-05, "loss": 0.3235, "step": 7380 }, { "epoch": 1.2604762311171802, "grad_norm": 0.26954153587746693, "learning_rate": 3.2211052099140113e-05, "loss": 0.3351, "step": 7385 }, { "epoch": 1.2613296919006571, "grad_norm": 0.25934501369648416, "learning_rate": 3.219524532119373e-05, "loss": 0.324, "step": 7390 }, { "epoch": 1.262183152684134, "grad_norm": 0.2925011120385152, "learning_rate": 3.217943854324735e-05, "loss": 0.3419, "step": 7395 }, { "epoch": 1.2630366134676112, "grad_norm": 0.322280655946708, "learning_rate": 3.216363176530096e-05, "loss": 0.3008, "step": 7400 }, { "epoch": 1.2638900742510881, "grad_norm": 0.27040819766485347, "learning_rate": 3.214782498735458e-05, "loss": 0.3321, "step": 7405 }, { "epoch": 1.2647435350345653, "grad_norm": 0.34310175865852216, "learning_rate": 3.2132018209408194e-05, "loss": 0.3342, "step": 7410 }, { "epoch": 1.2655969958180422, "grad_norm": 0.2776179262325134, "learning_rate": 3.211621143146181e-05, "loss": 0.3159, "step": 7415 }, { "epoch": 1.2664504566015191, "grad_norm": 0.560012325699375, "learning_rate": 3.210040465351543e-05, "loss": 0.3315, "step": 7420 }, { "epoch": 1.267303917384996, "grad_norm": 0.2834084966916368, "learning_rate": 3.2084597875569046e-05, "loss": 0.3083, "step": 7425 }, { "epoch": 1.2681573781684732, "grad_norm": 0.28720181883144497, "learning_rate": 3.2068791097622664e-05, "loss": 0.3435, "step": 7430 }, { "epoch": 1.26901083895195, "grad_norm": 0.32204834478311206, "learning_rate": 3.2052984319676274e-05, "loss": 0.3145, "step": 7435 }, { "epoch": 1.2698642997354272, "grad_norm": 0.3186863838880478, "learning_rate": 3.20371775417299e-05, "loss": 0.3163, "step": 7440 }, { "epoch": 1.2707177605189042, "grad_norm": 0.30168639984043216, "learning_rate": 3.202137076378351e-05, "loss": 0.3044, "step": 7445 }, { "epoch": 1.271571221302381, "grad_norm": 0.2936837818346504, "learning_rate": 3.200556398583713e-05, "loss": 0.3077, "step": 7450 }, { "epoch": 1.272424682085858, "grad_norm": 0.2805025521026154, "learning_rate": 3.1989757207890744e-05, "loss": 0.3068, "step": 7455 }, { "epoch": 1.2732781428693352, "grad_norm": 0.2726884298888378, "learning_rate": 3.197395042994436e-05, "loss": 0.3131, "step": 7460 }, { "epoch": 1.274131603652812, "grad_norm": 0.2614923987638576, "learning_rate": 3.195814365199798e-05, "loss": 0.3327, "step": 7465 }, { "epoch": 1.2749850644362892, "grad_norm": 0.35099590975857103, "learning_rate": 3.194233687405159e-05, "loss": 0.3184, "step": 7470 }, { "epoch": 1.2758385252197662, "grad_norm": 0.32348027755437836, "learning_rate": 3.1926530096105214e-05, "loss": 0.3199, "step": 7475 }, { "epoch": 1.276691986003243, "grad_norm": 0.2817444096355855, "learning_rate": 3.191072331815883e-05, "loss": 0.3192, "step": 7480 }, { "epoch": 1.2775454467867202, "grad_norm": 0.33482221632547116, "learning_rate": 3.189491654021244e-05, "loss": 0.3342, "step": 7485 }, { "epoch": 1.2783989075701971, "grad_norm": 0.314695520642127, "learning_rate": 3.187910976226606e-05, "loss": 0.3131, "step": 7490 }, { "epoch": 1.2792523683536743, "grad_norm": 0.25202876802241025, "learning_rate": 3.186330298431968e-05, "loss": 0.3306, "step": 7495 }, { "epoch": 1.2801058291371512, "grad_norm": 0.277656216910588, "learning_rate": 3.1847496206373295e-05, "loss": 0.3192, "step": 7500 }, { "epoch": 1.2809592899206281, "grad_norm": 0.3180446744171403, "learning_rate": 3.1831689428426905e-05, "loss": 0.3081, "step": 7505 }, { "epoch": 1.281812750704105, "grad_norm": 0.29356534665188794, "learning_rate": 3.181588265048053e-05, "loss": 0.3131, "step": 7510 }, { "epoch": 1.2826662114875822, "grad_norm": 0.29446878587536124, "learning_rate": 3.180007587253415e-05, "loss": 0.3166, "step": 7515 }, { "epoch": 1.2835196722710591, "grad_norm": 0.3512735294352893, "learning_rate": 3.178426909458776e-05, "loss": 0.315, "step": 7520 }, { "epoch": 1.2843731330545363, "grad_norm": 0.3611911359530234, "learning_rate": 3.176846231664138e-05, "loss": 0.3307, "step": 7525 }, { "epoch": 1.2852265938380132, "grad_norm": 0.2741975494246371, "learning_rate": 3.175265553869499e-05, "loss": 0.3242, "step": 7530 }, { "epoch": 1.28608005462149, "grad_norm": 0.2949155020082315, "learning_rate": 3.173684876074861e-05, "loss": 0.3116, "step": 7535 }, { "epoch": 1.286933515404967, "grad_norm": 0.31255140113556923, "learning_rate": 3.172104198280223e-05, "loss": 0.299, "step": 7540 }, { "epoch": 1.2877869761884442, "grad_norm": 0.30482051103451746, "learning_rate": 3.1705235204855845e-05, "loss": 0.3098, "step": 7545 }, { "epoch": 1.288640436971921, "grad_norm": 0.26885452392669257, "learning_rate": 3.168942842690946e-05, "loss": 0.3401, "step": 7550 }, { "epoch": 1.2894938977553982, "grad_norm": 0.3008739231610797, "learning_rate": 3.167362164896307e-05, "loss": 0.3084, "step": 7555 }, { "epoch": 1.2903473585388752, "grad_norm": 0.30377907696050843, "learning_rate": 3.16578148710167e-05, "loss": 0.3125, "step": 7560 }, { "epoch": 1.291200819322352, "grad_norm": 0.27344830675835796, "learning_rate": 3.164200809307031e-05, "loss": 0.312, "step": 7565 }, { "epoch": 1.292054280105829, "grad_norm": 0.30234929767840724, "learning_rate": 3.1626201315123925e-05, "loss": 0.3101, "step": 7570 }, { "epoch": 1.2929077408893062, "grad_norm": 0.2819738557635255, "learning_rate": 3.161039453717754e-05, "loss": 0.3335, "step": 7575 }, { "epoch": 1.293761201672783, "grad_norm": 0.2938949877759031, "learning_rate": 3.159458775923116e-05, "loss": 0.3421, "step": 7580 }, { "epoch": 1.2946146624562602, "grad_norm": 0.2932574511035601, "learning_rate": 3.157878098128478e-05, "loss": 0.304, "step": 7585 }, { "epoch": 1.2954681232397371, "grad_norm": 0.4331685009895891, "learning_rate": 3.156297420333839e-05, "loss": 0.3244, "step": 7590 }, { "epoch": 1.296321584023214, "grad_norm": 0.2935314412962229, "learning_rate": 3.154716742539201e-05, "loss": 0.3295, "step": 7595 }, { "epoch": 1.2971750448066912, "grad_norm": 0.24594444752604233, "learning_rate": 3.1531360647445623e-05, "loss": 0.323, "step": 7600 }, { "epoch": 1.2980285055901681, "grad_norm": 0.304793013347465, "learning_rate": 3.151555386949924e-05, "loss": 0.3316, "step": 7605 }, { "epoch": 1.298881966373645, "grad_norm": 0.27386936291451064, "learning_rate": 3.149974709155286e-05, "loss": 0.3343, "step": 7610 }, { "epoch": 1.2997354271571222, "grad_norm": 0.29421582292049864, "learning_rate": 3.1483940313606476e-05, "loss": 0.3354, "step": 7615 }, { "epoch": 1.3005888879405991, "grad_norm": 0.2662267900814284, "learning_rate": 3.146813353566009e-05, "loss": 0.329, "step": 7620 }, { "epoch": 1.301442348724076, "grad_norm": 0.27103172964090183, "learning_rate": 3.1452326757713704e-05, "loss": 0.318, "step": 7625 }, { "epoch": 1.3022958095075532, "grad_norm": 0.3753279103424276, "learning_rate": 3.143651997976733e-05, "loss": 0.3534, "step": 7630 }, { "epoch": 1.3031492702910301, "grad_norm": 0.22903874904345786, "learning_rate": 3.1420713201820946e-05, "loss": 0.3052, "step": 7635 }, { "epoch": 1.3040027310745073, "grad_norm": 0.28980153804491293, "learning_rate": 3.1404906423874556e-05, "loss": 0.2999, "step": 7640 }, { "epoch": 1.3048561918579842, "grad_norm": 0.3301599266454669, "learning_rate": 3.138909964592818e-05, "loss": 0.3178, "step": 7645 }, { "epoch": 1.305709652641461, "grad_norm": 0.732761269314101, "learning_rate": 3.137329286798179e-05, "loss": 0.3091, "step": 7650 }, { "epoch": 1.306563113424938, "grad_norm": 0.30961426956813826, "learning_rate": 3.135748609003541e-05, "loss": 0.3295, "step": 7655 }, { "epoch": 1.3074165742084152, "grad_norm": 0.4825183691553914, "learning_rate": 3.1341679312089026e-05, "loss": 0.3103, "step": 7660 }, { "epoch": 1.308270034991892, "grad_norm": 0.2905153970945341, "learning_rate": 3.1325872534142644e-05, "loss": 0.3341, "step": 7665 }, { "epoch": 1.3091234957753692, "grad_norm": 0.2588346298539659, "learning_rate": 3.131006575619626e-05, "loss": 0.3082, "step": 7670 }, { "epoch": 1.3099769565588462, "grad_norm": 0.2620860402257922, "learning_rate": 3.129425897824987e-05, "loss": 0.3126, "step": 7675 }, { "epoch": 1.310830417342323, "grad_norm": 0.27860940056511385, "learning_rate": 3.1278452200303496e-05, "loss": 0.3337, "step": 7680 }, { "epoch": 1.3116838781258, "grad_norm": 0.2878602288874049, "learning_rate": 3.126264542235711e-05, "loss": 0.3225, "step": 7685 }, { "epoch": 1.3125373389092772, "grad_norm": 0.29936540438070075, "learning_rate": 3.1246838644410724e-05, "loss": 0.3383, "step": 7690 }, { "epoch": 1.313390799692754, "grad_norm": 0.33382095528311273, "learning_rate": 3.123103186646434e-05, "loss": 0.31, "step": 7695 }, { "epoch": 1.3142442604762312, "grad_norm": 0.26471895319338007, "learning_rate": 3.121522508851796e-05, "loss": 0.3452, "step": 7700 }, { "epoch": 1.3150977212597081, "grad_norm": 0.29560366213784844, "learning_rate": 3.1199418310571577e-05, "loss": 0.296, "step": 7705 }, { "epoch": 1.315951182043185, "grad_norm": 0.3274884435109619, "learning_rate": 3.118361153262519e-05, "loss": 0.3186, "step": 7710 }, { "epoch": 1.316804642826662, "grad_norm": 0.3238154441233838, "learning_rate": 3.116780475467881e-05, "loss": 0.3208, "step": 7715 }, { "epoch": 1.3176581036101391, "grad_norm": 0.3486467709111546, "learning_rate": 3.115199797673242e-05, "loss": 0.3346, "step": 7720 }, { "epoch": 1.318511564393616, "grad_norm": 0.27185582990757046, "learning_rate": 3.113619119878604e-05, "loss": 0.3162, "step": 7725 }, { "epoch": 1.3193650251770932, "grad_norm": 0.2804697485957818, "learning_rate": 3.112038442083966e-05, "loss": 0.3193, "step": 7730 }, { "epoch": 1.3202184859605701, "grad_norm": 0.3257729230044319, "learning_rate": 3.1104577642893274e-05, "loss": 0.3468, "step": 7735 }, { "epoch": 1.321071946744047, "grad_norm": 0.30696590848313404, "learning_rate": 3.108877086494689e-05, "loss": 0.3263, "step": 7740 }, { "epoch": 1.3219254075275242, "grad_norm": 0.3085913533030638, "learning_rate": 3.10729640870005e-05, "loss": 0.3265, "step": 7745 }, { "epoch": 1.322778868311001, "grad_norm": 0.32739749551252517, "learning_rate": 3.105715730905413e-05, "loss": 0.3275, "step": 7750 }, { "epoch": 1.323632329094478, "grad_norm": 0.2974614065932767, "learning_rate": 3.104135053110774e-05, "loss": 0.2943, "step": 7755 }, { "epoch": 1.3244857898779552, "grad_norm": 0.2647481894065992, "learning_rate": 3.1025543753161355e-05, "loss": 0.3505, "step": 7760 }, { "epoch": 1.325339250661432, "grad_norm": 0.28063710705915457, "learning_rate": 3.100973697521497e-05, "loss": 0.3209, "step": 7765 }, { "epoch": 1.326192711444909, "grad_norm": 0.2633868732441539, "learning_rate": 3.099393019726859e-05, "loss": 0.2889, "step": 7770 }, { "epoch": 1.3270461722283862, "grad_norm": 0.21123956180401343, "learning_rate": 3.097812341932221e-05, "loss": 0.3063, "step": 7775 }, { "epoch": 1.327899633011863, "grad_norm": 0.2911198047274057, "learning_rate": 3.0962316641375825e-05, "loss": 0.3109, "step": 7780 }, { "epoch": 1.3287530937953402, "grad_norm": 0.2833544330478825, "learning_rate": 3.094650986342944e-05, "loss": 0.3201, "step": 7785 }, { "epoch": 1.3296065545788172, "grad_norm": 0.25801014329268535, "learning_rate": 3.093070308548306e-05, "loss": 0.3255, "step": 7790 }, { "epoch": 1.330460015362294, "grad_norm": 0.3362618367465961, "learning_rate": 3.091489630753667e-05, "loss": 0.3401, "step": 7795 }, { "epoch": 1.331313476145771, "grad_norm": 0.30424487468457473, "learning_rate": 3.0899089529590295e-05, "loss": 0.3166, "step": 7800 }, { "epoch": 1.3321669369292481, "grad_norm": 0.3069683290637859, "learning_rate": 3.0883282751643905e-05, "loss": 0.3254, "step": 7805 }, { "epoch": 1.333020397712725, "grad_norm": 0.2785909966576685, "learning_rate": 3.086747597369752e-05, "loss": 0.3368, "step": 7810 }, { "epoch": 1.3338738584962022, "grad_norm": 0.2856254673740834, "learning_rate": 3.085166919575114e-05, "loss": 0.321, "step": 7815 }, { "epoch": 1.3347273192796791, "grad_norm": 0.2726799895789868, "learning_rate": 3.083586241780476e-05, "loss": 0.3014, "step": 7820 }, { "epoch": 1.335580780063156, "grad_norm": 0.2748966038453586, "learning_rate": 3.0820055639858375e-05, "loss": 0.3222, "step": 7825 }, { "epoch": 1.336434240846633, "grad_norm": 0.2872994997875884, "learning_rate": 3.0804248861911986e-05, "loss": 0.3161, "step": 7830 }, { "epoch": 1.3372877016301101, "grad_norm": 0.27738195473492605, "learning_rate": 3.078844208396561e-05, "loss": 0.3215, "step": 7835 }, { "epoch": 1.338141162413587, "grad_norm": 0.2602296755634983, "learning_rate": 3.077263530601922e-05, "loss": 0.3381, "step": 7840 }, { "epoch": 1.3389946231970642, "grad_norm": 0.3016210006573887, "learning_rate": 3.075682852807284e-05, "loss": 0.3233, "step": 7845 }, { "epoch": 1.3398480839805411, "grad_norm": 0.26675260074223794, "learning_rate": 3.0741021750126456e-05, "loss": 0.325, "step": 7850 }, { "epoch": 1.340701544764018, "grad_norm": 0.3341586881792732, "learning_rate": 3.072521497218007e-05, "loss": 0.3114, "step": 7855 }, { "epoch": 1.341555005547495, "grad_norm": 0.29026123876789844, "learning_rate": 3.070940819423369e-05, "loss": 0.3332, "step": 7860 }, { "epoch": 1.342408466330972, "grad_norm": 0.33202757109225306, "learning_rate": 3.06936014162873e-05, "loss": 0.2953, "step": 7865 }, { "epoch": 1.343261927114449, "grad_norm": 0.31421047260208, "learning_rate": 3.0677794638340926e-05, "loss": 0.3112, "step": 7870 }, { "epoch": 1.3441153878979262, "grad_norm": 0.2709558236275689, "learning_rate": 3.0661987860394536e-05, "loss": 0.315, "step": 7875 }, { "epoch": 1.344968848681403, "grad_norm": 0.29619896556831427, "learning_rate": 3.0646181082448154e-05, "loss": 0.2989, "step": 7880 }, { "epoch": 1.34582230946488, "grad_norm": 0.34867155517945886, "learning_rate": 3.063037430450177e-05, "loss": 0.3431, "step": 7885 }, { "epoch": 1.3466757702483572, "grad_norm": 0.31684286030326936, "learning_rate": 3.061456752655539e-05, "loss": 0.3407, "step": 7890 }, { "epoch": 1.347529231031834, "grad_norm": 0.353864900450062, "learning_rate": 3.0598760748609006e-05, "loss": 0.3436, "step": 7895 }, { "epoch": 1.348382691815311, "grad_norm": 0.3244435533358605, "learning_rate": 3.058295397066262e-05, "loss": 0.3298, "step": 7900 }, { "epoch": 1.3492361525987882, "grad_norm": 0.36746858135888594, "learning_rate": 3.056714719271624e-05, "loss": 0.2934, "step": 7905 }, { "epoch": 1.350089613382265, "grad_norm": 0.27612912930436423, "learning_rate": 3.055134041476985e-05, "loss": 0.3069, "step": 7910 }, { "epoch": 1.350943074165742, "grad_norm": 0.3266634780777724, "learning_rate": 3.053553363682347e-05, "loss": 0.3232, "step": 7915 }, { "epoch": 1.3517965349492191, "grad_norm": 0.23955240654464713, "learning_rate": 3.051972685887709e-05, "loss": 0.3129, "step": 7920 }, { "epoch": 1.352649995732696, "grad_norm": 0.33606520730146544, "learning_rate": 3.0503920080930704e-05, "loss": 0.3026, "step": 7925 }, { "epoch": 1.3535034565161732, "grad_norm": 0.3146382609024731, "learning_rate": 3.048811330298432e-05, "loss": 0.3008, "step": 7930 }, { "epoch": 1.3543569172996501, "grad_norm": 0.29650808917262467, "learning_rate": 3.0472306525037936e-05, "loss": 0.3324, "step": 7935 }, { "epoch": 1.355210378083127, "grad_norm": 0.2884395656229706, "learning_rate": 3.0456499747091556e-05, "loss": 0.3188, "step": 7940 }, { "epoch": 1.356063838866604, "grad_norm": 0.31061803516245434, "learning_rate": 3.0440692969145174e-05, "loss": 0.3175, "step": 7945 }, { "epoch": 1.3569172996500811, "grad_norm": 0.27508879838823497, "learning_rate": 3.0424886191198788e-05, "loss": 0.3336, "step": 7950 }, { "epoch": 1.357770760433558, "grad_norm": 0.29438933473835766, "learning_rate": 3.0409079413252405e-05, "loss": 0.2996, "step": 7955 }, { "epoch": 1.3586242212170352, "grad_norm": 0.26454876813735817, "learning_rate": 3.039327263530602e-05, "loss": 0.306, "step": 7960 }, { "epoch": 1.359477682000512, "grad_norm": 0.2844680026546355, "learning_rate": 3.0377465857359637e-05, "loss": 0.3036, "step": 7965 }, { "epoch": 1.360331142783989, "grad_norm": 0.2972874677228446, "learning_rate": 3.036165907941325e-05, "loss": 0.3124, "step": 7970 }, { "epoch": 1.361184603567466, "grad_norm": 0.26004444833118234, "learning_rate": 3.0345852301466872e-05, "loss": 0.3203, "step": 7975 }, { "epoch": 1.362038064350943, "grad_norm": 0.25719513756119283, "learning_rate": 3.033004552352049e-05, "loss": 0.33, "step": 7980 }, { "epoch": 1.36289152513442, "grad_norm": 0.34343898547855106, "learning_rate": 3.0314238745574103e-05, "loss": 0.2937, "step": 7985 }, { "epoch": 1.3637449859178972, "grad_norm": 0.29795657843112705, "learning_rate": 3.029843196762772e-05, "loss": 0.3237, "step": 7990 }, { "epoch": 1.364598446701374, "grad_norm": 0.2705338973643908, "learning_rate": 3.0282625189681335e-05, "loss": 0.3276, "step": 7995 }, { "epoch": 1.365451907484851, "grad_norm": 0.28972373427646797, "learning_rate": 3.0266818411734956e-05, "loss": 0.3183, "step": 8000 }, { "epoch": 1.366305368268328, "grad_norm": 0.3200869358750654, "learning_rate": 3.0251011633788566e-05, "loss": 0.3205, "step": 8005 }, { "epoch": 1.367158829051805, "grad_norm": 0.32400676512243826, "learning_rate": 3.0235204855842187e-05, "loss": 0.309, "step": 8010 }, { "epoch": 1.368012289835282, "grad_norm": 0.2832489752766406, "learning_rate": 3.0219398077895805e-05, "loss": 0.3176, "step": 8015 }, { "epoch": 1.3688657506187591, "grad_norm": 0.28164273308118004, "learning_rate": 3.020359129994942e-05, "loss": 0.339, "step": 8020 }, { "epoch": 1.369719211402236, "grad_norm": 0.27600605584176585, "learning_rate": 3.0187784522003036e-05, "loss": 0.3205, "step": 8025 }, { "epoch": 1.370572672185713, "grad_norm": 0.3381215779967088, "learning_rate": 3.017197774405665e-05, "loss": 0.3304, "step": 8030 }, { "epoch": 1.3714261329691901, "grad_norm": 0.2981385182939686, "learning_rate": 3.015617096611027e-05, "loss": 0.3255, "step": 8035 }, { "epoch": 1.372279593752667, "grad_norm": 0.29545174175698596, "learning_rate": 3.014036418816389e-05, "loss": 0.3217, "step": 8040 }, { "epoch": 1.373133054536144, "grad_norm": 0.2708356271936128, "learning_rate": 3.0124557410217503e-05, "loss": 0.3171, "step": 8045 }, { "epoch": 1.3739865153196211, "grad_norm": 0.29269807767930683, "learning_rate": 3.010875063227112e-05, "loss": 0.333, "step": 8050 }, { "epoch": 1.374839976103098, "grad_norm": 0.32017342463915754, "learning_rate": 3.0092943854324734e-05, "loss": 0.3114, "step": 8055 }, { "epoch": 1.375693436886575, "grad_norm": 0.32059532885832154, "learning_rate": 3.007713707637835e-05, "loss": 0.325, "step": 8060 }, { "epoch": 1.3765468976700521, "grad_norm": 0.3095807498063771, "learning_rate": 3.0061330298431966e-05, "loss": 0.3169, "step": 8065 }, { "epoch": 1.377400358453529, "grad_norm": 0.24362064246633358, "learning_rate": 3.0045523520485587e-05, "loss": 0.3233, "step": 8070 }, { "epoch": 1.3782538192370062, "grad_norm": 0.27804358130484413, "learning_rate": 3.0029716742539204e-05, "loss": 0.3197, "step": 8075 }, { "epoch": 1.379107280020483, "grad_norm": 0.31154969335571736, "learning_rate": 3.0013909964592818e-05, "loss": 0.3287, "step": 8080 }, { "epoch": 1.37996074080396, "grad_norm": 0.2647138789884974, "learning_rate": 2.9998103186646436e-05, "loss": 0.3418, "step": 8085 }, { "epoch": 1.380814201587437, "grad_norm": 0.2771973479264328, "learning_rate": 2.998229640870005e-05, "loss": 0.3235, "step": 8090 }, { "epoch": 1.381667662370914, "grad_norm": 0.26280642087271444, "learning_rate": 2.996648963075367e-05, "loss": 0.3232, "step": 8095 }, { "epoch": 1.382521123154391, "grad_norm": 0.29680287525223603, "learning_rate": 2.995068285280728e-05, "loss": 0.326, "step": 8100 }, { "epoch": 1.3833745839378682, "grad_norm": 0.26532758896194675, "learning_rate": 2.9934876074860902e-05, "loss": 0.315, "step": 8105 }, { "epoch": 1.384228044721345, "grad_norm": 0.266901187006551, "learning_rate": 2.991906929691452e-05, "loss": 0.3116, "step": 8110 }, { "epoch": 1.385081505504822, "grad_norm": 0.28065541379147124, "learning_rate": 2.9903262518968134e-05, "loss": 0.3243, "step": 8115 }, { "epoch": 1.385934966288299, "grad_norm": 0.23533969210100517, "learning_rate": 2.988745574102175e-05, "loss": 0.3091, "step": 8120 }, { "epoch": 1.386788427071776, "grad_norm": 0.3020006344695904, "learning_rate": 2.9871648963075365e-05, "loss": 0.3101, "step": 8125 }, { "epoch": 1.387641887855253, "grad_norm": 0.30684691552734444, "learning_rate": 2.9855842185128986e-05, "loss": 0.3203, "step": 8130 }, { "epoch": 1.3884953486387301, "grad_norm": 0.3448924180427949, "learning_rate": 2.9840035407182603e-05, "loss": 0.3149, "step": 8135 }, { "epoch": 1.389348809422207, "grad_norm": 0.29289906059946424, "learning_rate": 2.9824228629236217e-05, "loss": 0.3224, "step": 8140 }, { "epoch": 1.390202270205684, "grad_norm": 0.3437439964139529, "learning_rate": 2.9808421851289835e-05, "loss": 0.308, "step": 8145 }, { "epoch": 1.391055730989161, "grad_norm": 0.2606534056953345, "learning_rate": 2.979261507334345e-05, "loss": 0.3295, "step": 8150 }, { "epoch": 1.391909191772638, "grad_norm": 0.3031773312641512, "learning_rate": 2.977680829539707e-05, "loss": 0.307, "step": 8155 }, { "epoch": 1.392762652556115, "grad_norm": 0.2920856968175428, "learning_rate": 2.976100151745068e-05, "loss": 0.3257, "step": 8160 }, { "epoch": 1.3936161133395921, "grad_norm": 0.2649033900145662, "learning_rate": 2.97451947395043e-05, "loss": 0.3084, "step": 8165 }, { "epoch": 1.394469574123069, "grad_norm": 0.33155076850997506, "learning_rate": 2.972938796155792e-05, "loss": 0.3153, "step": 8170 }, { "epoch": 1.395323034906546, "grad_norm": 0.2781383876439027, "learning_rate": 2.9713581183611533e-05, "loss": 0.3132, "step": 8175 }, { "epoch": 1.3961764956900231, "grad_norm": 0.2482511395987714, "learning_rate": 2.969777440566515e-05, "loss": 0.3285, "step": 8180 }, { "epoch": 1.3970299564735, "grad_norm": 0.3110269779586047, "learning_rate": 2.9681967627718764e-05, "loss": 0.3192, "step": 8185 }, { "epoch": 1.3978834172569772, "grad_norm": 0.44652176760395385, "learning_rate": 2.9666160849772385e-05, "loss": 0.3251, "step": 8190 }, { "epoch": 1.398736878040454, "grad_norm": 0.23304513742432542, "learning_rate": 2.9650354071826003e-05, "loss": 0.3083, "step": 8195 }, { "epoch": 1.399590338823931, "grad_norm": 0.3002816624828034, "learning_rate": 2.9634547293879617e-05, "loss": 0.3393, "step": 8200 }, { "epoch": 1.400443799607408, "grad_norm": 0.27883374131075706, "learning_rate": 2.9618740515933234e-05, "loss": 0.339, "step": 8205 }, { "epoch": 1.401297260390885, "grad_norm": 0.25164448147806, "learning_rate": 2.9602933737986848e-05, "loss": 0.311, "step": 8210 }, { "epoch": 1.402150721174362, "grad_norm": 0.30186489086602225, "learning_rate": 2.958712696004047e-05, "loss": 0.2991, "step": 8215 }, { "epoch": 1.4030041819578392, "grad_norm": 0.2806642185620836, "learning_rate": 2.957132018209408e-05, "loss": 0.3285, "step": 8220 }, { "epoch": 1.403857642741316, "grad_norm": 0.26643151643241564, "learning_rate": 2.95555134041477e-05, "loss": 0.3269, "step": 8225 }, { "epoch": 1.404711103524793, "grad_norm": 0.272973186733074, "learning_rate": 2.9539706626201318e-05, "loss": 0.3046, "step": 8230 }, { "epoch": 1.40556456430827, "grad_norm": 0.33683752954359153, "learning_rate": 2.9523899848254932e-05, "loss": 0.327, "step": 8235 }, { "epoch": 1.406418025091747, "grad_norm": 0.32865857811515875, "learning_rate": 2.950809307030855e-05, "loss": 0.3395, "step": 8240 }, { "epoch": 1.407271485875224, "grad_norm": 0.27182427673933934, "learning_rate": 2.9492286292362164e-05, "loss": 0.3117, "step": 8245 }, { "epoch": 1.4081249466587011, "grad_norm": 0.24875919509902317, "learning_rate": 2.9476479514415785e-05, "loss": 0.3249, "step": 8250 }, { "epoch": 1.408978407442178, "grad_norm": 0.24442739604015778, "learning_rate": 2.9460672736469395e-05, "loss": 0.3147, "step": 8255 }, { "epoch": 1.409831868225655, "grad_norm": 0.2578920334635185, "learning_rate": 2.9444865958523016e-05, "loss": 0.2953, "step": 8260 }, { "epoch": 1.410685329009132, "grad_norm": 0.27108938418580997, "learning_rate": 2.9429059180576634e-05, "loss": 0.3126, "step": 8265 }, { "epoch": 1.411538789792609, "grad_norm": 0.29639326536123706, "learning_rate": 2.9413252402630248e-05, "loss": 0.3243, "step": 8270 }, { "epoch": 1.412392250576086, "grad_norm": 0.3019855674771033, "learning_rate": 2.939744562468387e-05, "loss": 0.3277, "step": 8275 }, { "epoch": 1.4132457113595631, "grad_norm": 0.3176934986440865, "learning_rate": 2.938163884673748e-05, "loss": 0.3192, "step": 8280 }, { "epoch": 1.41409917214304, "grad_norm": 0.3580059014707014, "learning_rate": 2.93658320687911e-05, "loss": 0.3197, "step": 8285 }, { "epoch": 1.414952632926517, "grad_norm": 0.26566808000519276, "learning_rate": 2.9350025290844717e-05, "loss": 0.3111, "step": 8290 }, { "epoch": 1.415806093709994, "grad_norm": 0.26455148765190806, "learning_rate": 2.933421851289833e-05, "loss": 0.3089, "step": 8295 }, { "epoch": 1.416659554493471, "grad_norm": 0.25131239184259035, "learning_rate": 2.931841173495195e-05, "loss": 0.324, "step": 8300 }, { "epoch": 1.417513015276948, "grad_norm": 0.2690306205152151, "learning_rate": 2.9302604957005563e-05, "loss": 0.3247, "step": 8305 }, { "epoch": 1.418366476060425, "grad_norm": 0.29766354539969614, "learning_rate": 2.9286798179059184e-05, "loss": 0.3295, "step": 8310 }, { "epoch": 1.419219936843902, "grad_norm": 0.2940929486921227, "learning_rate": 2.9270991401112795e-05, "loss": 0.3135, "step": 8315 }, { "epoch": 1.420073397627379, "grad_norm": 0.32609347560443175, "learning_rate": 2.9255184623166415e-05, "loss": 0.353, "step": 8320 }, { "epoch": 1.420926858410856, "grad_norm": 0.2955039748057472, "learning_rate": 2.9239377845220033e-05, "loss": 0.2993, "step": 8325 }, { "epoch": 1.421780319194333, "grad_norm": 0.2976591720955615, "learning_rate": 2.9223571067273647e-05, "loss": 0.3154, "step": 8330 }, { "epoch": 1.4226337799778102, "grad_norm": 0.3293432418496437, "learning_rate": 2.9207764289327268e-05, "loss": 0.3312, "step": 8335 }, { "epoch": 1.423487240761287, "grad_norm": 0.3684461666198814, "learning_rate": 2.919195751138088e-05, "loss": 0.3258, "step": 8340 }, { "epoch": 1.424340701544764, "grad_norm": 0.30860909366744327, "learning_rate": 2.91761507334345e-05, "loss": 0.2962, "step": 8345 }, { "epoch": 1.425194162328241, "grad_norm": 0.2837927756690489, "learning_rate": 2.9160343955488113e-05, "loss": 0.3333, "step": 8350 }, { "epoch": 1.426047623111718, "grad_norm": 0.2476810030928027, "learning_rate": 2.914453717754173e-05, "loss": 0.3022, "step": 8355 }, { "epoch": 1.426901083895195, "grad_norm": 0.26310587009772635, "learning_rate": 2.9128730399595348e-05, "loss": 0.3002, "step": 8360 }, { "epoch": 1.4277545446786721, "grad_norm": 0.2810852494456509, "learning_rate": 2.9112923621648962e-05, "loss": 0.3171, "step": 8365 }, { "epoch": 1.428608005462149, "grad_norm": 0.31499857794393943, "learning_rate": 2.9097116843702583e-05, "loss": 0.3214, "step": 8370 }, { "epoch": 1.429461466245626, "grad_norm": 0.22919792541364556, "learning_rate": 2.9081310065756194e-05, "loss": 0.3199, "step": 8375 }, { "epoch": 1.430314927029103, "grad_norm": 0.22405325226313577, "learning_rate": 2.9065503287809815e-05, "loss": 0.3127, "step": 8380 }, { "epoch": 1.43116838781258, "grad_norm": 0.2730040773590126, "learning_rate": 2.9049696509863432e-05, "loss": 0.3296, "step": 8385 }, { "epoch": 1.432021848596057, "grad_norm": 0.29496668430116624, "learning_rate": 2.9033889731917046e-05, "loss": 0.295, "step": 8390 }, { "epoch": 1.4328753093795341, "grad_norm": 0.2988014132837995, "learning_rate": 2.9018082953970667e-05, "loss": 0.3118, "step": 8395 }, { "epoch": 1.433728770163011, "grad_norm": 0.25488844341599676, "learning_rate": 2.9002276176024278e-05, "loss": 0.3207, "step": 8400 }, { "epoch": 1.434582230946488, "grad_norm": 0.3145270364994345, "learning_rate": 2.89864693980779e-05, "loss": 0.3209, "step": 8405 }, { "epoch": 1.4354356917299649, "grad_norm": 0.24927226093102742, "learning_rate": 2.8970662620131513e-05, "loss": 0.306, "step": 8410 }, { "epoch": 1.436289152513442, "grad_norm": 0.2625554175522972, "learning_rate": 2.895485584218513e-05, "loss": 0.321, "step": 8415 }, { "epoch": 1.437142613296919, "grad_norm": 0.28081083814242164, "learning_rate": 2.8939049064238748e-05, "loss": 0.3235, "step": 8420 }, { "epoch": 1.437996074080396, "grad_norm": 0.22901128090575584, "learning_rate": 2.892324228629236e-05, "loss": 0.3259, "step": 8425 }, { "epoch": 1.438849534863873, "grad_norm": 0.25775580667882386, "learning_rate": 2.8907435508345983e-05, "loss": 0.3127, "step": 8430 }, { "epoch": 1.43970299564735, "grad_norm": 0.2562722586015722, "learning_rate": 2.8891628730399593e-05, "loss": 0.3281, "step": 8435 }, { "epoch": 1.440556456430827, "grad_norm": 0.25811434085080254, "learning_rate": 2.8875821952453214e-05, "loss": 0.3055, "step": 8440 }, { "epoch": 1.441409917214304, "grad_norm": 0.2955577444972227, "learning_rate": 2.886001517450683e-05, "loss": 0.3236, "step": 8445 }, { "epoch": 1.442263377997781, "grad_norm": 0.25339157187295536, "learning_rate": 2.8844208396560446e-05, "loss": 0.328, "step": 8450 }, { "epoch": 1.443116838781258, "grad_norm": 0.2645155981424776, "learning_rate": 2.8828401618614066e-05, "loss": 0.3189, "step": 8455 }, { "epoch": 1.443970299564735, "grad_norm": 0.24381092725398612, "learning_rate": 2.8812594840667677e-05, "loss": 0.3194, "step": 8460 }, { "epoch": 1.444823760348212, "grad_norm": 0.2682339843382882, "learning_rate": 2.8796788062721298e-05, "loss": 0.3109, "step": 8465 }, { "epoch": 1.445677221131689, "grad_norm": 0.2732778250511094, "learning_rate": 2.8780981284774912e-05, "loss": 0.3086, "step": 8470 }, { "epoch": 1.446530681915166, "grad_norm": 0.2643014329164515, "learning_rate": 2.876517450682853e-05, "loss": 0.3089, "step": 8475 }, { "epoch": 1.4473841426986431, "grad_norm": 0.27127234764840535, "learning_rate": 2.8749367728882147e-05, "loss": 0.323, "step": 8480 }, { "epoch": 1.44823760348212, "grad_norm": 0.2528291778842226, "learning_rate": 2.873356095093576e-05, "loss": 0.3076, "step": 8485 }, { "epoch": 1.449091064265597, "grad_norm": 0.29660118259546064, "learning_rate": 2.8717754172989382e-05, "loss": 0.3232, "step": 8490 }, { "epoch": 1.449944525049074, "grad_norm": 0.3053866395724026, "learning_rate": 2.8701947395042993e-05, "loss": 0.3091, "step": 8495 }, { "epoch": 1.450797985832551, "grad_norm": 0.35207232677704475, "learning_rate": 2.8686140617096613e-05, "loss": 0.3113, "step": 8500 }, { "epoch": 1.451651446616028, "grad_norm": 0.2789367032146855, "learning_rate": 2.8670333839150227e-05, "loss": 0.3106, "step": 8505 }, { "epoch": 1.452504907399505, "grad_norm": 0.2669363489155264, "learning_rate": 2.8654527061203845e-05, "loss": 0.3234, "step": 8510 }, { "epoch": 1.453358368182982, "grad_norm": 0.3067158513866478, "learning_rate": 2.8638720283257466e-05, "loss": 0.3227, "step": 8515 }, { "epoch": 1.454211828966459, "grad_norm": 0.31926689727168744, "learning_rate": 2.8622913505311076e-05, "loss": 0.3282, "step": 8520 }, { "epoch": 1.4550652897499359, "grad_norm": 0.3042612404200383, "learning_rate": 2.8607106727364697e-05, "loss": 0.2869, "step": 8525 }, { "epoch": 1.455918750533413, "grad_norm": 0.24269884411135498, "learning_rate": 2.859129994941831e-05, "loss": 0.3142, "step": 8530 }, { "epoch": 1.45677221131689, "grad_norm": 0.30890852856396683, "learning_rate": 2.857549317147193e-05, "loss": 0.3366, "step": 8535 }, { "epoch": 1.457625672100367, "grad_norm": 0.28104734446744456, "learning_rate": 2.8559686393525546e-05, "loss": 0.3067, "step": 8540 }, { "epoch": 1.458479132883844, "grad_norm": 0.35365313460364234, "learning_rate": 2.854387961557916e-05, "loss": 0.3161, "step": 8545 }, { "epoch": 1.459332593667321, "grad_norm": 0.29297974141800576, "learning_rate": 2.852807283763278e-05, "loss": 0.3216, "step": 8550 }, { "epoch": 1.4601860544507979, "grad_norm": 0.3057800584243111, "learning_rate": 2.8512266059686392e-05, "loss": 0.3161, "step": 8555 }, { "epoch": 1.461039515234275, "grad_norm": 0.3033752302211874, "learning_rate": 2.8496459281740013e-05, "loss": 0.3419, "step": 8560 }, { "epoch": 1.461892976017752, "grad_norm": 0.22687255743295606, "learning_rate": 2.8480652503793627e-05, "loss": 0.2987, "step": 8565 }, { "epoch": 1.462746436801229, "grad_norm": 0.28421878722810023, "learning_rate": 2.8464845725847244e-05, "loss": 0.2968, "step": 8570 }, { "epoch": 1.463599897584706, "grad_norm": 0.24541334161121953, "learning_rate": 2.8449038947900865e-05, "loss": 0.2852, "step": 8575 }, { "epoch": 1.464453358368183, "grad_norm": 0.28128338857244695, "learning_rate": 2.8433232169954476e-05, "loss": 0.3037, "step": 8580 }, { "epoch": 1.46530681915166, "grad_norm": 0.32131612737734366, "learning_rate": 2.8417425392008097e-05, "loss": 0.3051, "step": 8585 }, { "epoch": 1.466160279935137, "grad_norm": 0.29576718411227215, "learning_rate": 2.840161861406171e-05, "loss": 0.3428, "step": 8590 }, { "epoch": 1.467013740718614, "grad_norm": 0.2817218426933392, "learning_rate": 2.8385811836115328e-05, "loss": 0.3387, "step": 8595 }, { "epoch": 1.467867201502091, "grad_norm": 0.2306767235989589, "learning_rate": 2.8370005058168946e-05, "loss": 0.3155, "step": 8600 }, { "epoch": 1.468720662285568, "grad_norm": 0.33181306809906386, "learning_rate": 2.835419828022256e-05, "loss": 0.3365, "step": 8605 }, { "epoch": 1.469574123069045, "grad_norm": 0.28160824449913424, "learning_rate": 2.833839150227618e-05, "loss": 0.3313, "step": 8610 }, { "epoch": 1.470427583852522, "grad_norm": 0.27318286273023223, "learning_rate": 2.832258472432979e-05, "loss": 0.3117, "step": 8615 }, { "epoch": 1.471281044635999, "grad_norm": 0.4622760313506491, "learning_rate": 2.8306777946383412e-05, "loss": 0.3164, "step": 8620 }, { "epoch": 1.472134505419476, "grad_norm": 0.2570080947683145, "learning_rate": 2.8290971168437026e-05, "loss": 0.2992, "step": 8625 }, { "epoch": 1.472987966202953, "grad_norm": 0.30767668139773346, "learning_rate": 2.8275164390490644e-05, "loss": 0.3096, "step": 8630 }, { "epoch": 1.47384142698643, "grad_norm": 0.28243466620110436, "learning_rate": 2.8259357612544264e-05, "loss": 0.3097, "step": 8635 }, { "epoch": 1.4746948877699069, "grad_norm": 0.26632850097348854, "learning_rate": 2.8243550834597875e-05, "loss": 0.3159, "step": 8640 }, { "epoch": 1.475548348553384, "grad_norm": 0.3500125770690911, "learning_rate": 2.8227744056651496e-05, "loss": 0.3406, "step": 8645 }, { "epoch": 1.476401809336861, "grad_norm": 0.24610346994970939, "learning_rate": 2.821193727870511e-05, "loss": 0.3178, "step": 8650 }, { "epoch": 1.477255270120338, "grad_norm": 0.34823273431585133, "learning_rate": 2.8196130500758727e-05, "loss": 0.3128, "step": 8655 }, { "epoch": 1.478108730903815, "grad_norm": 0.3056162081670413, "learning_rate": 2.818032372281234e-05, "loss": 0.3114, "step": 8660 }, { "epoch": 1.478962191687292, "grad_norm": 0.2745607406470403, "learning_rate": 2.816451694486596e-05, "loss": 0.3068, "step": 8665 }, { "epoch": 1.4798156524707688, "grad_norm": 0.28717889311426453, "learning_rate": 2.814871016691958e-05, "loss": 0.3168, "step": 8670 }, { "epoch": 1.480669113254246, "grad_norm": 0.29256262879387473, "learning_rate": 2.813290338897319e-05, "loss": 0.3308, "step": 8675 }, { "epoch": 1.481522574037723, "grad_norm": 0.292039950044191, "learning_rate": 2.811709661102681e-05, "loss": 0.3258, "step": 8680 }, { "epoch": 1.4823760348212, "grad_norm": 0.31089475968624497, "learning_rate": 2.8101289833080425e-05, "loss": 0.3416, "step": 8685 }, { "epoch": 1.483229495604677, "grad_norm": 0.268462316109838, "learning_rate": 2.8085483055134043e-05, "loss": 0.3321, "step": 8690 }, { "epoch": 1.484082956388154, "grad_norm": 0.26208259119841604, "learning_rate": 2.806967627718766e-05, "loss": 0.2959, "step": 8695 }, { "epoch": 1.4849364171716308, "grad_norm": 0.28417689481712705, "learning_rate": 2.8053869499241274e-05, "loss": 0.3102, "step": 8700 }, { "epoch": 1.485789877955108, "grad_norm": 0.25230649149158885, "learning_rate": 2.8038062721294895e-05, "loss": 0.2937, "step": 8705 }, { "epoch": 1.486643338738585, "grad_norm": 0.20555887670903453, "learning_rate": 2.802225594334851e-05, "loss": 0.2999, "step": 8710 }, { "epoch": 1.487496799522062, "grad_norm": 0.30704950377366524, "learning_rate": 2.8006449165402127e-05, "loss": 0.3349, "step": 8715 }, { "epoch": 1.488350260305539, "grad_norm": 0.25372237166942085, "learning_rate": 2.799064238745574e-05, "loss": 0.3317, "step": 8720 }, { "epoch": 1.4892037210890159, "grad_norm": 0.2906716166910217, "learning_rate": 2.7974835609509358e-05, "loss": 0.3161, "step": 8725 }, { "epoch": 1.490057181872493, "grad_norm": 0.27541586610595137, "learning_rate": 2.795902883156298e-05, "loss": 0.3188, "step": 8730 }, { "epoch": 1.49091064265597, "grad_norm": 0.2929617722782728, "learning_rate": 2.794322205361659e-05, "loss": 0.3003, "step": 8735 }, { "epoch": 1.4917641034394469, "grad_norm": 0.28430637843650314, "learning_rate": 2.792741527567021e-05, "loss": 0.31, "step": 8740 }, { "epoch": 1.492617564222924, "grad_norm": 0.3192441361682844, "learning_rate": 2.7911608497723825e-05, "loss": 0.3228, "step": 8745 }, { "epoch": 1.493471025006401, "grad_norm": 0.3252256087880591, "learning_rate": 2.7895801719777442e-05, "loss": 0.3578, "step": 8750 }, { "epoch": 1.4943244857898779, "grad_norm": 0.2875275812395272, "learning_rate": 2.7879994941831056e-05, "loss": 0.3249, "step": 8755 }, { "epoch": 1.495177946573355, "grad_norm": 0.24834227540551015, "learning_rate": 2.7864188163884674e-05, "loss": 0.3183, "step": 8760 }, { "epoch": 1.496031407356832, "grad_norm": 0.24130537284232803, "learning_rate": 2.7848381385938295e-05, "loss": 0.3247, "step": 8765 }, { "epoch": 1.496884868140309, "grad_norm": 0.26261933741193766, "learning_rate": 2.783257460799191e-05, "loss": 0.31, "step": 8770 }, { "epoch": 1.497738328923786, "grad_norm": 0.2902145642533593, "learning_rate": 2.7816767830045526e-05, "loss": 0.3129, "step": 8775 }, { "epoch": 1.498591789707263, "grad_norm": 0.28070896349189045, "learning_rate": 2.780096105209914e-05, "loss": 0.3156, "step": 8780 }, { "epoch": 1.4994452504907398, "grad_norm": 0.3077355681491917, "learning_rate": 2.7785154274152758e-05, "loss": 0.3213, "step": 8785 }, { "epoch": 1.500298711274217, "grad_norm": 0.26769191624429317, "learning_rate": 2.776934749620638e-05, "loss": 0.3252, "step": 8790 }, { "epoch": 1.501152172057694, "grad_norm": 0.31768086002559737, "learning_rate": 2.775354071825999e-05, "loss": 0.3208, "step": 8795 }, { "epoch": 1.502005632841171, "grad_norm": 0.25655502928056106, "learning_rate": 2.773773394031361e-05, "loss": 0.3043, "step": 8800 }, { "epoch": 1.502859093624648, "grad_norm": 0.2631094357270462, "learning_rate": 2.7721927162367224e-05, "loss": 0.3266, "step": 8805 }, { "epoch": 1.503712554408125, "grad_norm": 0.28290998711831317, "learning_rate": 2.770612038442084e-05, "loss": 0.2921, "step": 8810 }, { "epoch": 1.5045660151916018, "grad_norm": 0.24998713956808116, "learning_rate": 2.7690313606474456e-05, "loss": 0.3214, "step": 8815 }, { "epoch": 1.505419475975079, "grad_norm": 0.23227224846714256, "learning_rate": 2.7674506828528073e-05, "loss": 0.3197, "step": 8820 }, { "epoch": 1.506272936758556, "grad_norm": 0.2619020294104187, "learning_rate": 2.7658700050581694e-05, "loss": 0.2977, "step": 8825 }, { "epoch": 1.507126397542033, "grad_norm": 0.2724054760527883, "learning_rate": 2.7642893272635305e-05, "loss": 0.309, "step": 8830 }, { "epoch": 1.50797985832551, "grad_norm": 0.240611625532514, "learning_rate": 2.7627086494688925e-05, "loss": 0.3136, "step": 8835 }, { "epoch": 1.5088333191089869, "grad_norm": 0.30116929621346, "learning_rate": 2.761127971674254e-05, "loss": 0.3335, "step": 8840 }, { "epoch": 1.5096867798924638, "grad_norm": 0.28682401991258805, "learning_rate": 2.7595472938796157e-05, "loss": 0.3049, "step": 8845 }, { "epoch": 1.510540240675941, "grad_norm": 0.2691173592641695, "learning_rate": 2.7579666160849778e-05, "loss": 0.3176, "step": 8850 }, { "epoch": 1.511393701459418, "grad_norm": 0.31063051744380793, "learning_rate": 2.756385938290339e-05, "loss": 0.3189, "step": 8855 }, { "epoch": 1.512247162242895, "grad_norm": 0.3200991332998642, "learning_rate": 2.754805260495701e-05, "loss": 0.3263, "step": 8860 }, { "epoch": 1.513100623026372, "grad_norm": 0.3366906416573457, "learning_rate": 2.7532245827010623e-05, "loss": 0.2999, "step": 8865 }, { "epoch": 1.5139540838098489, "grad_norm": 0.3326168489388344, "learning_rate": 2.751643904906424e-05, "loss": 0.3247, "step": 8870 }, { "epoch": 1.5148075445933258, "grad_norm": 0.31942068963438197, "learning_rate": 2.7500632271117855e-05, "loss": 0.3398, "step": 8875 }, { "epoch": 1.515661005376803, "grad_norm": 0.26877056022533213, "learning_rate": 2.7484825493171472e-05, "loss": 0.302, "step": 8880 }, { "epoch": 1.51651446616028, "grad_norm": 0.2927638819337981, "learning_rate": 2.7469018715225093e-05, "loss": 0.308, "step": 8885 }, { "epoch": 1.517367926943757, "grad_norm": 0.23396515307798255, "learning_rate": 2.7453211937278704e-05, "loss": 0.3237, "step": 8890 }, { "epoch": 1.518221387727234, "grad_norm": 0.2543735100151876, "learning_rate": 2.7437405159332325e-05, "loss": 0.3349, "step": 8895 }, { "epoch": 1.5190748485107108, "grad_norm": 0.23897862071881804, "learning_rate": 2.742159838138594e-05, "loss": 0.3076, "step": 8900 }, { "epoch": 1.519928309294188, "grad_norm": 0.26312389130399955, "learning_rate": 2.7405791603439556e-05, "loss": 0.3226, "step": 8905 }, { "epoch": 1.520781770077665, "grad_norm": 0.34640665443546514, "learning_rate": 2.738998482549317e-05, "loss": 0.3143, "step": 8910 }, { "epoch": 1.521635230861142, "grad_norm": 0.5863091768682009, "learning_rate": 2.7374178047546788e-05, "loss": 0.3195, "step": 8915 }, { "epoch": 1.522488691644619, "grad_norm": 0.27183513946331544, "learning_rate": 2.735837126960041e-05, "loss": 0.316, "step": 8920 }, { "epoch": 1.523342152428096, "grad_norm": 0.30253311634168023, "learning_rate": 2.7342564491654023e-05, "loss": 0.3237, "step": 8925 }, { "epoch": 1.5241956132115728, "grad_norm": 0.33332840335792285, "learning_rate": 2.732675771370764e-05, "loss": 0.3355, "step": 8930 }, { "epoch": 1.52504907399505, "grad_norm": 0.2625254325284299, "learning_rate": 2.7310950935761254e-05, "loss": 0.318, "step": 8935 }, { "epoch": 1.5259025347785269, "grad_norm": 0.2727331433683416, "learning_rate": 2.7295144157814872e-05, "loss": 0.3109, "step": 8940 }, { "epoch": 1.526755995562004, "grad_norm": 0.27963365437835835, "learning_rate": 2.7279337379868493e-05, "loss": 0.3192, "step": 8945 }, { "epoch": 1.527609456345481, "grad_norm": 0.2385252304320426, "learning_rate": 2.7263530601922103e-05, "loss": 0.3167, "step": 8950 }, { "epoch": 1.5284629171289579, "grad_norm": 0.2544206888691298, "learning_rate": 2.7247723823975724e-05, "loss": 0.3068, "step": 8955 }, { "epoch": 1.5293163779124348, "grad_norm": 0.23322160196492417, "learning_rate": 2.7231917046029338e-05, "loss": 0.3113, "step": 8960 }, { "epoch": 1.530169838695912, "grad_norm": 0.2821561164994656, "learning_rate": 2.7216110268082956e-05, "loss": 0.3338, "step": 8965 }, { "epoch": 1.531023299479389, "grad_norm": 0.41214635667062227, "learning_rate": 2.720030349013657e-05, "loss": 0.3278, "step": 8970 }, { "epoch": 1.531876760262866, "grad_norm": 0.2837203508902763, "learning_rate": 2.7184496712190187e-05, "loss": 0.336, "step": 8975 }, { "epoch": 1.532730221046343, "grad_norm": 0.31148736710625524, "learning_rate": 2.7168689934243808e-05, "loss": 0.3057, "step": 8980 }, { "epoch": 1.5335836818298199, "grad_norm": 0.32294626127727133, "learning_rate": 2.7152883156297422e-05, "loss": 0.3083, "step": 8985 }, { "epoch": 1.5344371426132968, "grad_norm": 0.2935429635350806, "learning_rate": 2.713707637835104e-05, "loss": 0.3161, "step": 8990 }, { "epoch": 1.535290603396774, "grad_norm": 0.3144144470976903, "learning_rate": 2.7121269600404654e-05, "loss": 0.3335, "step": 8995 }, { "epoch": 1.536144064180251, "grad_norm": 0.2500464442386334, "learning_rate": 2.710546282245827e-05, "loss": 0.3108, "step": 9000 }, { "epoch": 1.536997524963728, "grad_norm": 0.2654410437250442, "learning_rate": 2.7089656044511892e-05, "loss": 0.3289, "step": 9005 }, { "epoch": 1.537850985747205, "grad_norm": 0.2753010420900831, "learning_rate": 2.7073849266565503e-05, "loss": 0.332, "step": 9010 }, { "epoch": 1.5387044465306818, "grad_norm": 0.2693628640462915, "learning_rate": 2.7058042488619123e-05, "loss": 0.3054, "step": 9015 }, { "epoch": 1.5395579073141588, "grad_norm": 0.3123598898463066, "learning_rate": 2.7042235710672737e-05, "loss": 0.3047, "step": 9020 }, { "epoch": 1.540411368097636, "grad_norm": 0.2507688340170781, "learning_rate": 2.7026428932726355e-05, "loss": 0.2932, "step": 9025 }, { "epoch": 1.541264828881113, "grad_norm": 0.33259602872527644, "learning_rate": 2.701062215477997e-05, "loss": 0.3528, "step": 9030 }, { "epoch": 1.54211828966459, "grad_norm": 0.3473648408781555, "learning_rate": 2.6994815376833586e-05, "loss": 0.3101, "step": 9035 }, { "epoch": 1.542971750448067, "grad_norm": 0.30370982553006476, "learning_rate": 2.6979008598887207e-05, "loss": 0.3179, "step": 9040 }, { "epoch": 1.5438252112315438, "grad_norm": 0.2934099301913217, "learning_rate": 2.696320182094082e-05, "loss": 0.3181, "step": 9045 }, { "epoch": 1.544678672015021, "grad_norm": 0.27999316816197456, "learning_rate": 2.694739504299444e-05, "loss": 0.3159, "step": 9050 }, { "epoch": 1.5455321327984979, "grad_norm": 0.3698165215141441, "learning_rate": 2.6931588265048053e-05, "loss": 0.3333, "step": 9055 }, { "epoch": 1.546385593581975, "grad_norm": 0.35871924026295254, "learning_rate": 2.691578148710167e-05, "loss": 0.3404, "step": 9060 }, { "epoch": 1.547239054365452, "grad_norm": 0.2670938807188544, "learning_rate": 2.6899974709155284e-05, "loss": 0.304, "step": 9065 }, { "epoch": 1.5480925151489289, "grad_norm": 0.2912129480805395, "learning_rate": 2.6884167931208902e-05, "loss": 0.328, "step": 9070 }, { "epoch": 1.5489459759324058, "grad_norm": 0.2686570600109525, "learning_rate": 2.6868361153262523e-05, "loss": 0.3326, "step": 9075 }, { "epoch": 1.549799436715883, "grad_norm": 0.3249687908021935, "learning_rate": 2.6852554375316137e-05, "loss": 0.328, "step": 9080 }, { "epoch": 1.5506528974993599, "grad_norm": 0.4042705084917681, "learning_rate": 2.6836747597369754e-05, "loss": 0.3314, "step": 9085 }, { "epoch": 1.551506358282837, "grad_norm": 0.29689297267225634, "learning_rate": 2.682094081942337e-05, "loss": 0.324, "step": 9090 }, { "epoch": 1.552359819066314, "grad_norm": 0.5132805895012987, "learning_rate": 2.6805134041476986e-05, "loss": 0.3367, "step": 9095 }, { "epoch": 1.5532132798497909, "grad_norm": 0.2650204663285175, "learning_rate": 2.6789327263530607e-05, "loss": 0.3277, "step": 9100 }, { "epoch": 1.5540667406332678, "grad_norm": 0.268352772739777, "learning_rate": 2.677352048558422e-05, "loss": 0.3266, "step": 9105 }, { "epoch": 1.554920201416745, "grad_norm": 0.27253610891872904, "learning_rate": 2.6757713707637838e-05, "loss": 0.3149, "step": 9110 }, { "epoch": 1.555773662200222, "grad_norm": 0.2660084623724433, "learning_rate": 2.6741906929691452e-05, "loss": 0.3177, "step": 9115 }, { "epoch": 1.556627122983699, "grad_norm": 0.27624927999458815, "learning_rate": 2.672610015174507e-05, "loss": 0.31, "step": 9120 }, { "epoch": 1.557480583767176, "grad_norm": 0.3133788049260409, "learning_rate": 2.6710293373798684e-05, "loss": 0.3153, "step": 9125 }, { "epoch": 1.5583340445506528, "grad_norm": 0.2770761856796869, "learning_rate": 2.66944865958523e-05, "loss": 0.3361, "step": 9130 }, { "epoch": 1.5591875053341298, "grad_norm": 0.3476145346345189, "learning_rate": 2.6678679817905922e-05, "loss": 0.3121, "step": 9135 }, { "epoch": 1.560040966117607, "grad_norm": 0.2808338240876052, "learning_rate": 2.6662873039959536e-05, "loss": 0.3175, "step": 9140 }, { "epoch": 1.560894426901084, "grad_norm": 0.2579989846821336, "learning_rate": 2.6647066262013154e-05, "loss": 0.3079, "step": 9145 }, { "epoch": 1.561747887684561, "grad_norm": 0.27898816264321946, "learning_rate": 2.6631259484066768e-05, "loss": 0.3186, "step": 9150 }, { "epoch": 1.5626013484680379, "grad_norm": 0.30337754911906395, "learning_rate": 2.6615452706120385e-05, "loss": 0.3271, "step": 9155 }, { "epoch": 1.5634548092515148, "grad_norm": 0.22418067459258412, "learning_rate": 2.6599645928174e-05, "loss": 0.3244, "step": 9160 }, { "epoch": 1.5643082700349917, "grad_norm": 0.2337198304302215, "learning_rate": 2.658383915022762e-05, "loss": 0.3306, "step": 9165 }, { "epoch": 1.5651617308184689, "grad_norm": 0.26606361361744396, "learning_rate": 2.6568032372281237e-05, "loss": 0.3266, "step": 9170 }, { "epoch": 1.566015191601946, "grad_norm": 0.2659936548614503, "learning_rate": 2.655222559433485e-05, "loss": 0.3096, "step": 9175 }, { "epoch": 1.566868652385423, "grad_norm": 0.3057912401045365, "learning_rate": 2.653641881638847e-05, "loss": 0.3148, "step": 9180 }, { "epoch": 1.5677221131688999, "grad_norm": 0.3170281291651793, "learning_rate": 2.6520612038442083e-05, "loss": 0.3195, "step": 9185 }, { "epoch": 1.5685755739523768, "grad_norm": 0.2654382082215115, "learning_rate": 2.65048052604957e-05, "loss": 0.3193, "step": 9190 }, { "epoch": 1.569429034735854, "grad_norm": 0.28433393003316476, "learning_rate": 2.648899848254932e-05, "loss": 0.3088, "step": 9195 }, { "epoch": 1.5702824955193309, "grad_norm": 0.3135530462466572, "learning_rate": 2.6473191704602935e-05, "loss": 0.2998, "step": 9200 }, { "epoch": 1.571135956302808, "grad_norm": 0.3012760874304556, "learning_rate": 2.6457384926656553e-05, "loss": 0.3298, "step": 9205 }, { "epoch": 1.571989417086285, "grad_norm": 0.3002507501476291, "learning_rate": 2.6441578148710167e-05, "loss": 0.3217, "step": 9210 }, { "epoch": 1.5728428778697618, "grad_norm": 0.28264173250928076, "learning_rate": 2.6425771370763784e-05, "loss": 0.3319, "step": 9215 }, { "epoch": 1.5736963386532388, "grad_norm": 0.2702076250200713, "learning_rate": 2.64099645928174e-05, "loss": 0.3179, "step": 9220 }, { "epoch": 1.574549799436716, "grad_norm": 0.2935726352198598, "learning_rate": 2.639415781487102e-05, "loss": 0.3288, "step": 9225 }, { "epoch": 1.5754032602201928, "grad_norm": 0.26207961549286696, "learning_rate": 2.6378351036924637e-05, "loss": 0.3079, "step": 9230 }, { "epoch": 1.57625672100367, "grad_norm": 0.26253576796957523, "learning_rate": 2.636254425897825e-05, "loss": 0.3198, "step": 9235 }, { "epoch": 1.577110181787147, "grad_norm": 0.2864311903809092, "learning_rate": 2.634673748103187e-05, "loss": 0.3283, "step": 9240 }, { "epoch": 1.5779636425706238, "grad_norm": 0.24525873212436777, "learning_rate": 2.6330930703085482e-05, "loss": 0.3349, "step": 9245 }, { "epoch": 1.5788171033541007, "grad_norm": 0.2775259905393127, "learning_rate": 2.63151239251391e-05, "loss": 0.317, "step": 9250 }, { "epoch": 1.579670564137578, "grad_norm": 0.2735495368407676, "learning_rate": 2.629931714719272e-05, "loss": 0.3202, "step": 9255 }, { "epoch": 1.580524024921055, "grad_norm": 0.29016722052736243, "learning_rate": 2.6283510369246335e-05, "loss": 0.3283, "step": 9260 }, { "epoch": 1.581377485704532, "grad_norm": 0.3051303929315788, "learning_rate": 2.6267703591299952e-05, "loss": 0.3337, "step": 9265 }, { "epoch": 1.5822309464880089, "grad_norm": 0.36263530041095676, "learning_rate": 2.6251896813353566e-05, "loss": 0.3325, "step": 9270 }, { "epoch": 1.5830844072714858, "grad_norm": 0.2677846840518092, "learning_rate": 2.6236090035407184e-05, "loss": 0.3354, "step": 9275 }, { "epoch": 1.5839378680549627, "grad_norm": 0.25845719545358004, "learning_rate": 2.6220283257460798e-05, "loss": 0.2983, "step": 9280 }, { "epoch": 1.5847913288384399, "grad_norm": 0.29633191533744097, "learning_rate": 2.620447647951442e-05, "loss": 0.3191, "step": 9285 }, { "epoch": 1.585644789621917, "grad_norm": 0.30111102366967435, "learning_rate": 2.6188669701568036e-05, "loss": 0.3163, "step": 9290 }, { "epoch": 1.586498250405394, "grad_norm": 0.2278679969184886, "learning_rate": 2.617286292362165e-05, "loss": 0.3169, "step": 9295 }, { "epoch": 1.5873517111888709, "grad_norm": 0.25386254649086926, "learning_rate": 2.6157056145675268e-05, "loss": 0.2978, "step": 9300 }, { "epoch": 1.5882051719723478, "grad_norm": 0.2358251876289842, "learning_rate": 2.6141249367728882e-05, "loss": 0.3248, "step": 9305 }, { "epoch": 1.5890586327558247, "grad_norm": 0.28163731470263453, "learning_rate": 2.61254425897825e-05, "loss": 0.3028, "step": 9310 }, { "epoch": 1.5899120935393019, "grad_norm": 0.3771556328610975, "learning_rate": 2.6109635811836113e-05, "loss": 0.3324, "step": 9315 }, { "epoch": 1.590765554322779, "grad_norm": 0.32401639013827843, "learning_rate": 2.6093829033889734e-05, "loss": 0.3256, "step": 9320 }, { "epoch": 1.591619015106256, "grad_norm": 0.3216794850719463, "learning_rate": 2.607802225594335e-05, "loss": 0.3223, "step": 9325 }, { "epoch": 1.5924724758897328, "grad_norm": 0.26800806344613637, "learning_rate": 2.6062215477996966e-05, "loss": 0.3181, "step": 9330 }, { "epoch": 1.5933259366732098, "grad_norm": 0.3836491080236076, "learning_rate": 2.6046408700050583e-05, "loss": 0.3068, "step": 9335 }, { "epoch": 1.594179397456687, "grad_norm": 0.31349567608996803, "learning_rate": 2.6030601922104197e-05, "loss": 0.3197, "step": 9340 }, { "epoch": 1.5950328582401638, "grad_norm": 0.30077696159787937, "learning_rate": 2.6014795144157818e-05, "loss": 0.3191, "step": 9345 }, { "epoch": 1.595886319023641, "grad_norm": 0.27337637149153726, "learning_rate": 2.5998988366211435e-05, "loss": 0.3018, "step": 9350 }, { "epoch": 1.596739779807118, "grad_norm": 0.3208923593838835, "learning_rate": 2.598318158826505e-05, "loss": 0.3241, "step": 9355 }, { "epoch": 1.5975932405905948, "grad_norm": 0.28236705807830886, "learning_rate": 2.5967374810318667e-05, "loss": 0.3055, "step": 9360 }, { "epoch": 1.5984467013740717, "grad_norm": 0.2558439162833257, "learning_rate": 2.595156803237228e-05, "loss": 0.3009, "step": 9365 }, { "epoch": 1.5993001621575489, "grad_norm": 0.27601510235230436, "learning_rate": 2.59357612544259e-05, "loss": 0.3192, "step": 9370 }, { "epoch": 1.6001536229410258, "grad_norm": 0.2399686947476117, "learning_rate": 2.5919954476479513e-05, "loss": 0.3209, "step": 9375 }, { "epoch": 1.601007083724503, "grad_norm": 0.3109877353842411, "learning_rate": 2.5904147698533133e-05, "loss": 0.3326, "step": 9380 }, { "epoch": 1.6018605445079799, "grad_norm": 0.23652093302022964, "learning_rate": 2.588834092058675e-05, "loss": 0.3198, "step": 9385 }, { "epoch": 1.6027140052914568, "grad_norm": 0.27189663633810773, "learning_rate": 2.5872534142640365e-05, "loss": 0.3189, "step": 9390 }, { "epoch": 1.6035674660749337, "grad_norm": 0.2792227149647411, "learning_rate": 2.5856727364693982e-05, "loss": 0.3083, "step": 9395 }, { "epoch": 1.6044209268584109, "grad_norm": 0.22955725977500135, "learning_rate": 2.5840920586747596e-05, "loss": 0.3062, "step": 9400 }, { "epoch": 1.605274387641888, "grad_norm": 0.29207687761338097, "learning_rate": 2.5825113808801217e-05, "loss": 0.3004, "step": 9405 }, { "epoch": 1.606127848425365, "grad_norm": 0.33419581644022095, "learning_rate": 2.5809307030854828e-05, "loss": 0.33, "step": 9410 }, { "epoch": 1.6069813092088419, "grad_norm": 0.28646863074892953, "learning_rate": 2.579350025290845e-05, "loss": 0.324, "step": 9415 }, { "epoch": 1.6078347699923188, "grad_norm": 0.29212855087309286, "learning_rate": 2.5777693474962066e-05, "loss": 0.3218, "step": 9420 }, { "epoch": 1.6086882307757957, "grad_norm": 0.2766450150209438, "learning_rate": 2.576188669701568e-05, "loss": 0.3243, "step": 9425 }, { "epoch": 1.6095416915592728, "grad_norm": 0.311833635079516, "learning_rate": 2.5746079919069298e-05, "loss": 0.3084, "step": 9430 }, { "epoch": 1.61039515234275, "grad_norm": 0.3116698158990861, "learning_rate": 2.5730273141122912e-05, "loss": 0.3221, "step": 9435 }, { "epoch": 1.611248613126227, "grad_norm": 0.3455527190333094, "learning_rate": 2.5714466363176533e-05, "loss": 0.3145, "step": 9440 }, { "epoch": 1.6121020739097038, "grad_norm": 0.3042259869317829, "learning_rate": 2.569865958523015e-05, "loss": 0.3289, "step": 9445 }, { "epoch": 1.6129555346931808, "grad_norm": 0.3408727338255188, "learning_rate": 2.5682852807283764e-05, "loss": 0.3216, "step": 9450 }, { "epoch": 1.6138089954766577, "grad_norm": 0.28776929173744026, "learning_rate": 2.5667046029337382e-05, "loss": 0.3129, "step": 9455 }, { "epoch": 1.6146624562601348, "grad_norm": 0.29100950021971267, "learning_rate": 2.5651239251390996e-05, "loss": 0.3205, "step": 9460 }, { "epoch": 1.615515917043612, "grad_norm": 0.2942325002443078, "learning_rate": 2.5635432473444613e-05, "loss": 0.3069, "step": 9465 }, { "epoch": 1.616369377827089, "grad_norm": 0.30065185635786923, "learning_rate": 2.5619625695498227e-05, "loss": 0.3282, "step": 9470 }, { "epoch": 1.6172228386105658, "grad_norm": 0.2845415502560194, "learning_rate": 2.5603818917551848e-05, "loss": 0.3035, "step": 9475 }, { "epoch": 1.6180762993940427, "grad_norm": 0.23475956907658024, "learning_rate": 2.5588012139605466e-05, "loss": 0.3281, "step": 9480 }, { "epoch": 1.6189297601775199, "grad_norm": 0.25262702473835735, "learning_rate": 2.557220536165908e-05, "loss": 0.2993, "step": 9485 }, { "epoch": 1.6197832209609968, "grad_norm": 0.2455406662042028, "learning_rate": 2.5556398583712697e-05, "loss": 0.3141, "step": 9490 }, { "epoch": 1.620636681744474, "grad_norm": 0.26714679807160535, "learning_rate": 2.554059180576631e-05, "loss": 0.3285, "step": 9495 }, { "epoch": 1.6214901425279509, "grad_norm": 0.2954294502420084, "learning_rate": 2.5524785027819932e-05, "loss": 0.3246, "step": 9500 }, { "epoch": 1.6223436033114278, "grad_norm": 0.31580125785280144, "learning_rate": 2.550897824987355e-05, "loss": 0.3232, "step": 9505 }, { "epoch": 1.6231970640949047, "grad_norm": 0.2758536669465857, "learning_rate": 2.5493171471927164e-05, "loss": 0.31, "step": 9510 }, { "epoch": 1.6240505248783819, "grad_norm": 0.27992734863630697, "learning_rate": 2.547736469398078e-05, "loss": 0.317, "step": 9515 }, { "epoch": 1.6249039856618588, "grad_norm": 0.2900938347180162, "learning_rate": 2.5461557916034395e-05, "loss": 0.3189, "step": 9520 }, { "epoch": 1.625757446445336, "grad_norm": 0.23172221074767377, "learning_rate": 2.5445751138088013e-05, "loss": 0.3067, "step": 9525 }, { "epoch": 1.6266109072288129, "grad_norm": 0.26367312357197026, "learning_rate": 2.5429944360141627e-05, "loss": 0.3224, "step": 9530 }, { "epoch": 1.6274643680122898, "grad_norm": 0.2636253742132394, "learning_rate": 2.5414137582195248e-05, "loss": 0.2973, "step": 9535 }, { "epoch": 1.6283178287957667, "grad_norm": 0.2663875465372204, "learning_rate": 2.5398330804248865e-05, "loss": 0.3095, "step": 9540 }, { "epoch": 1.6291712895792438, "grad_norm": 0.2804052097406286, "learning_rate": 2.538252402630248e-05, "loss": 0.3067, "step": 9545 }, { "epoch": 1.630024750362721, "grad_norm": 0.3016995308340634, "learning_rate": 2.5366717248356097e-05, "loss": 0.3006, "step": 9550 }, { "epoch": 1.630878211146198, "grad_norm": 0.29034920851038826, "learning_rate": 2.535091047040971e-05, "loss": 0.3215, "step": 9555 }, { "epoch": 1.6317316719296748, "grad_norm": 0.3491147568185525, "learning_rate": 2.533510369246333e-05, "loss": 0.3209, "step": 9560 }, { "epoch": 1.6325851327131518, "grad_norm": 0.25644024084515127, "learning_rate": 2.5319296914516942e-05, "loss": 0.3277, "step": 9565 }, { "epoch": 1.6334385934966287, "grad_norm": 0.40830348147670725, "learning_rate": 2.5303490136570563e-05, "loss": 0.3154, "step": 9570 }, { "epoch": 1.6342920542801058, "grad_norm": 0.2866250219891567, "learning_rate": 2.528768335862418e-05, "loss": 0.2963, "step": 9575 }, { "epoch": 1.635145515063583, "grad_norm": 0.2516996859826282, "learning_rate": 2.5271876580677794e-05, "loss": 0.3104, "step": 9580 }, { "epoch": 1.6359989758470599, "grad_norm": 0.29871194853622723, "learning_rate": 2.5256069802731412e-05, "loss": 0.3335, "step": 9585 }, { "epoch": 1.6368524366305368, "grad_norm": 0.2542625797918168, "learning_rate": 2.5240263024785026e-05, "loss": 0.3109, "step": 9590 }, { "epoch": 1.6377058974140137, "grad_norm": 0.29226435470504675, "learning_rate": 2.5224456246838647e-05, "loss": 0.3102, "step": 9595 }, { "epoch": 1.6385593581974909, "grad_norm": 0.28418563275222913, "learning_rate": 2.5208649468892264e-05, "loss": 0.2863, "step": 9600 }, { "epoch": 1.6394128189809678, "grad_norm": 0.2967755708608498, "learning_rate": 2.519284269094588e-05, "loss": 0.3191, "step": 9605 }, { "epoch": 1.640266279764445, "grad_norm": 0.32358784624311704, "learning_rate": 2.5177035912999496e-05, "loss": 0.3193, "step": 9610 }, { "epoch": 1.6411197405479219, "grad_norm": 0.27007931649973177, "learning_rate": 2.516122913505311e-05, "loss": 0.3071, "step": 9615 }, { "epoch": 1.6419732013313988, "grad_norm": 0.2763258289273834, "learning_rate": 2.514542235710673e-05, "loss": 0.3385, "step": 9620 }, { "epoch": 1.6428266621148757, "grad_norm": 0.3325707621914579, "learning_rate": 2.512961557916034e-05, "loss": 0.305, "step": 9625 }, { "epoch": 1.6436801228983529, "grad_norm": 0.4010470842449891, "learning_rate": 2.5113808801213962e-05, "loss": 0.2998, "step": 9630 }, { "epoch": 1.6445335836818298, "grad_norm": 0.2821731179896881, "learning_rate": 2.509800202326758e-05, "loss": 0.2975, "step": 9635 }, { "epoch": 1.645387044465307, "grad_norm": 0.30067131636833244, "learning_rate": 2.5082195245321194e-05, "loss": 0.3148, "step": 9640 }, { "epoch": 1.6462405052487838, "grad_norm": 0.3201349001540346, "learning_rate": 2.506638846737481e-05, "loss": 0.324, "step": 9645 }, { "epoch": 1.6470939660322608, "grad_norm": 0.3253670987646229, "learning_rate": 2.5050581689428425e-05, "loss": 0.3147, "step": 9650 }, { "epoch": 1.6479474268157377, "grad_norm": 0.25454511306999966, "learning_rate": 2.5034774911482046e-05, "loss": 0.3288, "step": 9655 }, { "epoch": 1.6488008875992148, "grad_norm": 0.30670721557637093, "learning_rate": 2.5018968133535664e-05, "loss": 0.3466, "step": 9660 }, { "epoch": 1.649654348382692, "grad_norm": 0.24020556739140697, "learning_rate": 2.5003161355589278e-05, "loss": 0.3043, "step": 9665 }, { "epoch": 1.650507809166169, "grad_norm": 0.3034119028705514, "learning_rate": 2.4987354577642895e-05, "loss": 0.3351, "step": 9670 }, { "epoch": 1.6513612699496458, "grad_norm": 0.27801185897257613, "learning_rate": 2.497154779969651e-05, "loss": 0.3034, "step": 9675 }, { "epoch": 1.6522147307331227, "grad_norm": 0.24986722954811633, "learning_rate": 2.495574102175013e-05, "loss": 0.3291, "step": 9680 }, { "epoch": 1.6530681915165997, "grad_norm": 0.2943687742962442, "learning_rate": 2.4939934243803744e-05, "loss": 0.2997, "step": 9685 }, { "epoch": 1.6539216523000768, "grad_norm": 0.3033697152756259, "learning_rate": 2.492412746585736e-05, "loss": 0.3163, "step": 9690 }, { "epoch": 1.654775113083554, "grad_norm": 0.3038325756841647, "learning_rate": 2.4908320687910976e-05, "loss": 0.3105, "step": 9695 }, { "epoch": 1.6556285738670309, "grad_norm": 0.3177442374943874, "learning_rate": 2.4892513909964593e-05, "loss": 0.3206, "step": 9700 }, { "epoch": 1.6564820346505078, "grad_norm": 0.3070569946332167, "learning_rate": 2.487670713201821e-05, "loss": 0.3144, "step": 9705 }, { "epoch": 1.6573354954339847, "grad_norm": 0.2648723187294161, "learning_rate": 2.4860900354071828e-05, "loss": 0.315, "step": 9710 }, { "epoch": 1.6581889562174617, "grad_norm": 0.25238664974673947, "learning_rate": 2.4845093576125446e-05, "loss": 0.3244, "step": 9715 }, { "epoch": 1.6590424170009388, "grad_norm": 0.24457995945013927, "learning_rate": 2.482928679817906e-05, "loss": 0.2952, "step": 9720 }, { "epoch": 1.659895877784416, "grad_norm": 0.31297258360403296, "learning_rate": 2.4813480020232677e-05, "loss": 0.3108, "step": 9725 }, { "epoch": 1.6607493385678929, "grad_norm": 0.23578604074443185, "learning_rate": 2.4797673242286294e-05, "loss": 0.3069, "step": 9730 }, { "epoch": 1.6616027993513698, "grad_norm": 0.3182338591711494, "learning_rate": 2.478186646433991e-05, "loss": 0.3368, "step": 9735 }, { "epoch": 1.6624562601348467, "grad_norm": 0.30438511847628047, "learning_rate": 2.476605968639353e-05, "loss": 0.309, "step": 9740 }, { "epoch": 1.6633097209183239, "grad_norm": 0.26115424754980715, "learning_rate": 2.4750252908447143e-05, "loss": 0.3086, "step": 9745 }, { "epoch": 1.6641631817018008, "grad_norm": 0.27175160716938573, "learning_rate": 2.473444613050076e-05, "loss": 0.2995, "step": 9750 }, { "epoch": 1.665016642485278, "grad_norm": 0.2445758153613062, "learning_rate": 2.4718639352554375e-05, "loss": 0.3001, "step": 9755 }, { "epoch": 1.6658701032687548, "grad_norm": 0.33927455481233326, "learning_rate": 2.4702832574607992e-05, "loss": 0.3185, "step": 9760 }, { "epoch": 1.6667235640522318, "grad_norm": 0.2647112672842108, "learning_rate": 2.468702579666161e-05, "loss": 0.3087, "step": 9765 }, { "epoch": 1.6675770248357087, "grad_norm": 0.31416331507472506, "learning_rate": 2.4671219018715227e-05, "loss": 0.3361, "step": 9770 }, { "epoch": 1.6684304856191858, "grad_norm": 0.30208826950011225, "learning_rate": 2.4655412240768845e-05, "loss": 0.3053, "step": 9775 }, { "epoch": 1.6692839464026628, "grad_norm": 0.2939843747670554, "learning_rate": 2.463960546282246e-05, "loss": 0.3084, "step": 9780 }, { "epoch": 1.67013740718614, "grad_norm": 0.2604726875403896, "learning_rate": 2.4623798684876076e-05, "loss": 0.3052, "step": 9785 }, { "epoch": 1.6709908679696168, "grad_norm": 0.26579940649906136, "learning_rate": 2.4607991906929694e-05, "loss": 0.2935, "step": 9790 }, { "epoch": 1.6718443287530937, "grad_norm": 0.28700896457956593, "learning_rate": 2.4592185128983308e-05, "loss": 0.2943, "step": 9795 }, { "epoch": 1.6726977895365707, "grad_norm": 0.28593351163351577, "learning_rate": 2.4576378351036925e-05, "loss": 0.3295, "step": 9800 }, { "epoch": 1.6735512503200478, "grad_norm": 0.27299751516346143, "learning_rate": 2.4560571573090543e-05, "loss": 0.3112, "step": 9805 }, { "epoch": 1.674404711103525, "grad_norm": 0.2663846174897732, "learning_rate": 2.454476479514416e-05, "loss": 0.3197, "step": 9810 }, { "epoch": 1.6752581718870019, "grad_norm": 0.28731484318229156, "learning_rate": 2.4528958017197774e-05, "loss": 0.3204, "step": 9815 }, { "epoch": 1.6761116326704788, "grad_norm": 0.2533085010043687, "learning_rate": 2.4513151239251392e-05, "loss": 0.3177, "step": 9820 }, { "epoch": 1.6769650934539557, "grad_norm": 0.22643171380339228, "learning_rate": 2.449734446130501e-05, "loss": 0.3101, "step": 9825 }, { "epoch": 1.6778185542374326, "grad_norm": 0.2694738633328775, "learning_rate": 2.4481537683358623e-05, "loss": 0.304, "step": 9830 }, { "epoch": 1.6786720150209098, "grad_norm": 0.29154467686312396, "learning_rate": 2.4465730905412244e-05, "loss": 0.3026, "step": 9835 }, { "epoch": 1.679525475804387, "grad_norm": 0.30830905892808697, "learning_rate": 2.4449924127465858e-05, "loss": 0.314, "step": 9840 }, { "epoch": 1.6803789365878639, "grad_norm": 0.3193185340566677, "learning_rate": 2.4434117349519476e-05, "loss": 0.3172, "step": 9845 }, { "epoch": 1.6812323973713408, "grad_norm": 0.2785652881157592, "learning_rate": 2.441831057157309e-05, "loss": 0.3052, "step": 9850 }, { "epoch": 1.6820858581548177, "grad_norm": 0.3053632537872067, "learning_rate": 2.4402503793626707e-05, "loss": 0.3158, "step": 9855 }, { "epoch": 1.6829393189382946, "grad_norm": 0.2809155882704635, "learning_rate": 2.4386697015680325e-05, "loss": 0.3115, "step": 9860 }, { "epoch": 1.6837927797217718, "grad_norm": 0.285834549597778, "learning_rate": 2.4370890237733942e-05, "loss": 0.302, "step": 9865 }, { "epoch": 1.684646240505249, "grad_norm": 0.2949486137871518, "learning_rate": 2.435508345978756e-05, "loss": 0.3256, "step": 9870 }, { "epoch": 1.6854997012887258, "grad_norm": 0.29681886182175815, "learning_rate": 2.4339276681841174e-05, "loss": 0.3366, "step": 9875 }, { "epoch": 1.6863531620722028, "grad_norm": 0.24656725383476072, "learning_rate": 2.432346990389479e-05, "loss": 0.287, "step": 9880 }, { "epoch": 1.6872066228556797, "grad_norm": 0.29017127845906693, "learning_rate": 2.430766312594841e-05, "loss": 0.3354, "step": 9885 }, { "epoch": 1.6880600836391568, "grad_norm": 0.2496620314055489, "learning_rate": 2.4291856348002023e-05, "loss": 0.3195, "step": 9890 }, { "epoch": 1.6889135444226338, "grad_norm": 0.2828010037353762, "learning_rate": 2.4276049570055643e-05, "loss": 0.2949, "step": 9895 }, { "epoch": 1.689767005206111, "grad_norm": 0.27646797924073296, "learning_rate": 2.4260242792109258e-05, "loss": 0.3057, "step": 9900 }, { "epoch": 1.6906204659895878, "grad_norm": 0.32284635985225524, "learning_rate": 2.4244436014162875e-05, "loss": 0.3117, "step": 9905 }, { "epoch": 1.6914739267730647, "grad_norm": 0.2595958713257303, "learning_rate": 2.422862923621649e-05, "loss": 0.3034, "step": 9910 }, { "epoch": 1.6923273875565417, "grad_norm": 0.3068677532227572, "learning_rate": 2.4212822458270107e-05, "loss": 0.3396, "step": 9915 }, { "epoch": 1.6931808483400188, "grad_norm": 0.285298874806758, "learning_rate": 2.4197015680323724e-05, "loss": 0.2909, "step": 9920 }, { "epoch": 1.6940343091234957, "grad_norm": 0.24086152282838583, "learning_rate": 2.418120890237734e-05, "loss": 0.2943, "step": 9925 }, { "epoch": 1.6948877699069729, "grad_norm": 0.2745126314567204, "learning_rate": 2.416540212443096e-05, "loss": 0.3339, "step": 9930 }, { "epoch": 1.6957412306904498, "grad_norm": 0.2889564351321148, "learning_rate": 2.4149595346484573e-05, "loss": 0.3027, "step": 9935 }, { "epoch": 1.6965946914739267, "grad_norm": 0.24997302424868165, "learning_rate": 2.413378856853819e-05, "loss": 0.3145, "step": 9940 }, { "epoch": 1.6974481522574036, "grad_norm": 0.24611106153162882, "learning_rate": 2.4117981790591808e-05, "loss": 0.3199, "step": 9945 }, { "epoch": 1.6983016130408808, "grad_norm": 0.311330342618305, "learning_rate": 2.4102175012645422e-05, "loss": 0.3095, "step": 9950 }, { "epoch": 1.699155073824358, "grad_norm": 0.26530459747674234, "learning_rate": 2.408636823469904e-05, "loss": 0.3223, "step": 9955 }, { "epoch": 1.7000085346078349, "grad_norm": 0.30451651190419765, "learning_rate": 2.4070561456752657e-05, "loss": 0.3081, "step": 9960 }, { "epoch": 1.7008619953913118, "grad_norm": 0.24772232780674808, "learning_rate": 2.4054754678806274e-05, "loss": 0.3332, "step": 9965 }, { "epoch": 1.7017154561747887, "grad_norm": 0.2504604115648748, "learning_rate": 2.403894790085989e-05, "loss": 0.3195, "step": 9970 }, { "epoch": 1.7025689169582656, "grad_norm": 0.31950797479319254, "learning_rate": 2.4023141122913506e-05, "loss": 0.3301, "step": 9975 }, { "epoch": 1.7034223777417428, "grad_norm": 0.2902096609017144, "learning_rate": 2.4007334344967123e-05, "loss": 0.3197, "step": 9980 }, { "epoch": 1.70427583852522, "grad_norm": 0.24739330315705166, "learning_rate": 2.3991527567020737e-05, "loss": 0.3033, "step": 9985 }, { "epoch": 1.7051292993086968, "grad_norm": 0.28820746286076543, "learning_rate": 2.3975720789074358e-05, "loss": 0.3109, "step": 9990 }, { "epoch": 1.7059827600921738, "grad_norm": 0.30889165984155026, "learning_rate": 2.3959914011127972e-05, "loss": 0.3224, "step": 9995 }, { "epoch": 1.7068362208756507, "grad_norm": 0.2668169445685062, "learning_rate": 2.394410723318159e-05, "loss": 0.3028, "step": 10000 }, { "epoch": 1.7076896816591276, "grad_norm": 0.23767136276325537, "learning_rate": 2.3928300455235207e-05, "loss": 0.3006, "step": 10005 }, { "epoch": 1.7085431424426047, "grad_norm": 0.24868890471687213, "learning_rate": 2.391249367728882e-05, "loss": 0.3396, "step": 10010 }, { "epoch": 1.709396603226082, "grad_norm": 0.3202960424840767, "learning_rate": 2.389668689934244e-05, "loss": 0.2961, "step": 10015 }, { "epoch": 1.7102500640095588, "grad_norm": 0.28762346581616066, "learning_rate": 2.3880880121396056e-05, "loss": 0.2906, "step": 10020 }, { "epoch": 1.7111035247930357, "grad_norm": 0.40705795372301, "learning_rate": 2.3865073343449674e-05, "loss": 0.3202, "step": 10025 }, { "epoch": 1.7119569855765127, "grad_norm": 0.2835132359619317, "learning_rate": 2.3849266565503288e-05, "loss": 0.3066, "step": 10030 }, { "epoch": 1.7128104463599898, "grad_norm": 0.3324385736903004, "learning_rate": 2.3833459787556905e-05, "loss": 0.3125, "step": 10035 }, { "epoch": 1.7136639071434667, "grad_norm": 0.3046147719922428, "learning_rate": 2.3817653009610523e-05, "loss": 0.2972, "step": 10040 }, { "epoch": 1.7145173679269439, "grad_norm": 0.26709824529613085, "learning_rate": 2.3801846231664137e-05, "loss": 0.2828, "step": 10045 }, { "epoch": 1.7153708287104208, "grad_norm": 0.29501445750829597, "learning_rate": 2.3786039453717758e-05, "loss": 0.3116, "step": 10050 }, { "epoch": 1.7162242894938977, "grad_norm": 0.29611947206938966, "learning_rate": 2.377023267577137e-05, "loss": 0.328, "step": 10055 }, { "epoch": 1.7170777502773746, "grad_norm": 0.2847677942585591, "learning_rate": 2.375442589782499e-05, "loss": 0.3086, "step": 10060 }, { "epoch": 1.7179312110608518, "grad_norm": 0.2644951539064842, "learning_rate": 2.3738619119878607e-05, "loss": 0.3108, "step": 10065 }, { "epoch": 1.7187846718443287, "grad_norm": 0.2725434984402983, "learning_rate": 2.372281234193222e-05, "loss": 0.2969, "step": 10070 }, { "epoch": 1.7196381326278058, "grad_norm": 0.2715390385438678, "learning_rate": 2.3707005563985838e-05, "loss": 0.3217, "step": 10075 }, { "epoch": 1.7204915934112828, "grad_norm": 0.2553834420200442, "learning_rate": 2.3691198786039452e-05, "loss": 0.3122, "step": 10080 }, { "epoch": 1.7213450541947597, "grad_norm": 0.2752229805482609, "learning_rate": 2.3675392008093073e-05, "loss": 0.3051, "step": 10085 }, { "epoch": 1.7221985149782366, "grad_norm": 0.2557939683712064, "learning_rate": 2.3659585230146687e-05, "loss": 0.3147, "step": 10090 }, { "epoch": 1.7230519757617138, "grad_norm": 0.3109251406929401, "learning_rate": 2.3643778452200305e-05, "loss": 0.3, "step": 10095 }, { "epoch": 1.723905436545191, "grad_norm": 0.2826822611160273, "learning_rate": 2.3627971674253922e-05, "loss": 0.3098, "step": 10100 }, { "epoch": 1.7247588973286678, "grad_norm": 0.2761015475303958, "learning_rate": 2.3612164896307536e-05, "loss": 0.2983, "step": 10105 }, { "epoch": 1.7256123581121448, "grad_norm": 0.23431760302267984, "learning_rate": 2.3596358118361154e-05, "loss": 0.3237, "step": 10110 }, { "epoch": 1.7264658188956217, "grad_norm": 0.25924994543335367, "learning_rate": 2.358055134041477e-05, "loss": 0.3091, "step": 10115 }, { "epoch": 1.7273192796790986, "grad_norm": 0.3046320279267855, "learning_rate": 2.356474456246839e-05, "loss": 0.3134, "step": 10120 }, { "epoch": 1.7281727404625757, "grad_norm": 0.2782129227758942, "learning_rate": 2.3548937784522006e-05, "loss": 0.2993, "step": 10125 }, { "epoch": 1.7290262012460529, "grad_norm": 0.2902411031100804, "learning_rate": 2.353313100657562e-05, "loss": 0.3027, "step": 10130 }, { "epoch": 1.7298796620295298, "grad_norm": 0.23983945255883066, "learning_rate": 2.3517324228629237e-05, "loss": 0.2877, "step": 10135 }, { "epoch": 1.7307331228130067, "grad_norm": 0.28853627889378636, "learning_rate": 2.350151745068285e-05, "loss": 0.3012, "step": 10140 }, { "epoch": 1.7315865835964837, "grad_norm": 0.3330325352251142, "learning_rate": 2.3485710672736472e-05, "loss": 0.3051, "step": 10145 }, { "epoch": 1.7324400443799606, "grad_norm": 0.27428252704000344, "learning_rate": 2.3469903894790086e-05, "loss": 0.3176, "step": 10150 }, { "epoch": 1.7332935051634377, "grad_norm": 0.2964100848515858, "learning_rate": 2.3454097116843704e-05, "loss": 0.3149, "step": 10155 }, { "epoch": 1.7341469659469149, "grad_norm": 0.39355483033879174, "learning_rate": 2.343829033889732e-05, "loss": 0.332, "step": 10160 }, { "epoch": 1.7350004267303918, "grad_norm": 0.2775348547751164, "learning_rate": 2.3422483560950935e-05, "loss": 0.3307, "step": 10165 }, { "epoch": 1.7358538875138687, "grad_norm": 0.27937304629012344, "learning_rate": 2.3406676783004553e-05, "loss": 0.3184, "step": 10170 }, { "epoch": 1.7367073482973456, "grad_norm": 0.28843238967922025, "learning_rate": 2.339087000505817e-05, "loss": 0.3362, "step": 10175 }, { "epoch": 1.7375608090808228, "grad_norm": 0.321017088518279, "learning_rate": 2.3375063227111788e-05, "loss": 0.2923, "step": 10180 }, { "epoch": 1.7384142698642997, "grad_norm": 0.2253340685289638, "learning_rate": 2.3359256449165405e-05, "loss": 0.3172, "step": 10185 }, { "epoch": 1.7392677306477768, "grad_norm": 0.2690639791163282, "learning_rate": 2.334344967121902e-05, "loss": 0.3043, "step": 10190 }, { "epoch": 1.7401211914312538, "grad_norm": 0.3021658190597809, "learning_rate": 2.3327642893272637e-05, "loss": 0.3353, "step": 10195 }, { "epoch": 1.7409746522147307, "grad_norm": 0.37670425750278036, "learning_rate": 2.331183611532625e-05, "loss": 0.3328, "step": 10200 }, { "epoch": 1.7418281129982076, "grad_norm": 0.31129076598421346, "learning_rate": 2.3296029337379868e-05, "loss": 0.3092, "step": 10205 }, { "epoch": 1.7426815737816848, "grad_norm": 0.26080830769311064, "learning_rate": 2.3280222559433486e-05, "loss": 0.3113, "step": 10210 }, { "epoch": 1.7435350345651617, "grad_norm": 0.23485192395744972, "learning_rate": 2.3264415781487103e-05, "loss": 0.3011, "step": 10215 }, { "epoch": 1.7443884953486388, "grad_norm": 0.29433014556994175, "learning_rate": 2.324860900354072e-05, "loss": 0.3249, "step": 10220 }, { "epoch": 1.7452419561321157, "grad_norm": 0.27203884632328235, "learning_rate": 2.3232802225594335e-05, "loss": 0.3282, "step": 10225 }, { "epoch": 1.7460954169155927, "grad_norm": 0.3247292384839154, "learning_rate": 2.3216995447647952e-05, "loss": 0.3078, "step": 10230 }, { "epoch": 1.7469488776990696, "grad_norm": 0.31509721259202145, "learning_rate": 2.3201188669701566e-05, "loss": 0.3177, "step": 10235 }, { "epoch": 1.7478023384825467, "grad_norm": 0.280804536529413, "learning_rate": 2.3185381891755187e-05, "loss": 0.3197, "step": 10240 }, { "epoch": 1.7486557992660239, "grad_norm": 0.2675377949942349, "learning_rate": 2.3169575113808805e-05, "loss": 0.3221, "step": 10245 }, { "epoch": 1.7495092600495008, "grad_norm": 0.3051226861424147, "learning_rate": 2.315376833586242e-05, "loss": 0.3018, "step": 10250 }, { "epoch": 1.7503627208329777, "grad_norm": 0.26191014210874464, "learning_rate": 2.3137961557916036e-05, "loss": 0.2938, "step": 10255 }, { "epoch": 1.7512161816164546, "grad_norm": 0.35698818102319124, "learning_rate": 2.312215477996965e-05, "loss": 0.3166, "step": 10260 }, { "epoch": 1.7520696423999316, "grad_norm": 0.2771505837431129, "learning_rate": 2.3106348002023268e-05, "loss": 0.3098, "step": 10265 }, { "epoch": 1.7529231031834087, "grad_norm": 0.28818865125574256, "learning_rate": 2.3090541224076885e-05, "loss": 0.306, "step": 10270 }, { "epoch": 1.7537765639668859, "grad_norm": 0.27488584174597247, "learning_rate": 2.3074734446130503e-05, "loss": 0.3237, "step": 10275 }, { "epoch": 1.7546300247503628, "grad_norm": 0.25252745419007405, "learning_rate": 2.305892766818412e-05, "loss": 0.3387, "step": 10280 }, { "epoch": 1.7554834855338397, "grad_norm": 0.2328412487462422, "learning_rate": 2.3043120890237734e-05, "loss": 0.3195, "step": 10285 }, { "epoch": 1.7563369463173166, "grad_norm": 0.30457338786651733, "learning_rate": 2.302731411229135e-05, "loss": 0.304, "step": 10290 }, { "epoch": 1.7571904071007938, "grad_norm": 0.2735155594468563, "learning_rate": 2.3011507334344966e-05, "loss": 0.3213, "step": 10295 }, { "epoch": 1.7580438678842707, "grad_norm": 0.2529960094129811, "learning_rate": 2.2995700556398586e-05, "loss": 0.3222, "step": 10300 }, { "epoch": 1.7588973286677478, "grad_norm": 0.30352610692244847, "learning_rate": 2.2979893778452204e-05, "loss": 0.3001, "step": 10305 }, { "epoch": 1.7597507894512248, "grad_norm": 0.309149018229516, "learning_rate": 2.2964087000505818e-05, "loss": 0.335, "step": 10310 }, { "epoch": 1.7606042502347017, "grad_norm": 0.2601277073185921, "learning_rate": 2.2948280222559435e-05, "loss": 0.3217, "step": 10315 }, { "epoch": 1.7614577110181786, "grad_norm": 0.26441728254555114, "learning_rate": 2.293247344461305e-05, "loss": 0.3225, "step": 10320 }, { "epoch": 1.7623111718016558, "grad_norm": 0.30216971816104155, "learning_rate": 2.2916666666666667e-05, "loss": 0.2975, "step": 10325 }, { "epoch": 1.7631646325851327, "grad_norm": 0.2656015229672402, "learning_rate": 2.2900859888720284e-05, "loss": 0.3205, "step": 10330 }, { "epoch": 1.7640180933686098, "grad_norm": 0.29441311848364776, "learning_rate": 2.2885053110773902e-05, "loss": 0.312, "step": 10335 }, { "epoch": 1.7648715541520867, "grad_norm": 0.2588432981235434, "learning_rate": 2.286924633282752e-05, "loss": 0.2965, "step": 10340 }, { "epoch": 1.7657250149355637, "grad_norm": 0.24480248594520768, "learning_rate": 2.2853439554881133e-05, "loss": 0.3193, "step": 10345 }, { "epoch": 1.7665784757190406, "grad_norm": 0.3007503662405179, "learning_rate": 2.283763277693475e-05, "loss": 0.3341, "step": 10350 }, { "epoch": 1.7674319365025177, "grad_norm": 0.27182789842068916, "learning_rate": 2.2821825998988365e-05, "loss": 0.3084, "step": 10355 }, { "epoch": 1.7682853972859947, "grad_norm": 0.2535156053795688, "learning_rate": 2.2806019221041982e-05, "loss": 0.3207, "step": 10360 }, { "epoch": 1.7691388580694718, "grad_norm": 0.290632845098278, "learning_rate": 2.2790212443095603e-05, "loss": 0.3139, "step": 10365 }, { "epoch": 1.7699923188529487, "grad_norm": 0.28667121587517413, "learning_rate": 2.2774405665149217e-05, "loss": 0.3146, "step": 10370 }, { "epoch": 1.7708457796364256, "grad_norm": 0.26709075945914973, "learning_rate": 2.2758598887202835e-05, "loss": 0.3165, "step": 10375 }, { "epoch": 1.7716992404199026, "grad_norm": 0.34152883982878435, "learning_rate": 2.274279210925645e-05, "loss": 0.3064, "step": 10380 }, { "epoch": 1.7725527012033797, "grad_norm": 0.2774863761968104, "learning_rate": 2.2726985331310066e-05, "loss": 0.3001, "step": 10385 }, { "epoch": 1.7734061619868569, "grad_norm": 0.3139477104631993, "learning_rate": 2.2711178553363684e-05, "loss": 0.2915, "step": 10390 }, { "epoch": 1.7742596227703338, "grad_norm": 0.30727295402068705, "learning_rate": 2.26953717754173e-05, "loss": 0.3009, "step": 10395 }, { "epoch": 1.7751130835538107, "grad_norm": 0.29947358061495194, "learning_rate": 2.267956499747092e-05, "loss": 0.357, "step": 10400 }, { "epoch": 1.7759665443372876, "grad_norm": 0.28141569943253275, "learning_rate": 2.2663758219524533e-05, "loss": 0.304, "step": 10405 }, { "epoch": 1.7768200051207645, "grad_norm": 0.2839741203276114, "learning_rate": 2.264795144157815e-05, "loss": 0.3082, "step": 10410 }, { "epoch": 1.7776734659042417, "grad_norm": 0.4200548614194502, "learning_rate": 2.2632144663631764e-05, "loss": 0.2973, "step": 10415 }, { "epoch": 1.7785269266877188, "grad_norm": 0.2631525516006065, "learning_rate": 2.261633788568538e-05, "loss": 0.3167, "step": 10420 }, { "epoch": 1.7793803874711958, "grad_norm": 0.27963924813211066, "learning_rate": 2.2600531107739003e-05, "loss": 0.3004, "step": 10425 }, { "epoch": 1.7802338482546727, "grad_norm": 0.312091189990957, "learning_rate": 2.2584724329792617e-05, "loss": 0.308, "step": 10430 }, { "epoch": 1.7810873090381496, "grad_norm": 0.3150394905201412, "learning_rate": 2.2568917551846234e-05, "loss": 0.2957, "step": 10435 }, { "epoch": 1.7819407698216267, "grad_norm": 0.2614001367057355, "learning_rate": 2.2553110773899848e-05, "loss": 0.3248, "step": 10440 }, { "epoch": 1.7827942306051037, "grad_norm": 0.3094128026823892, "learning_rate": 2.2537303995953466e-05, "loss": 0.3097, "step": 10445 }, { "epoch": 1.7836476913885808, "grad_norm": 0.2920108468928593, "learning_rate": 2.2521497218007083e-05, "loss": 0.326, "step": 10450 }, { "epoch": 1.7845011521720577, "grad_norm": 0.2608019512304576, "learning_rate": 2.25056904400607e-05, "loss": 0.3102, "step": 10455 }, { "epoch": 1.7853546129555347, "grad_norm": 0.26692638349065856, "learning_rate": 2.2489883662114318e-05, "loss": 0.3087, "step": 10460 }, { "epoch": 1.7862080737390116, "grad_norm": 0.2519964124937002, "learning_rate": 2.2474076884167932e-05, "loss": 0.3079, "step": 10465 }, { "epoch": 1.7870615345224887, "grad_norm": 0.2904581429524586, "learning_rate": 2.245827010622155e-05, "loss": 0.306, "step": 10470 }, { "epoch": 1.7879149953059656, "grad_norm": 0.29429362771998807, "learning_rate": 2.2442463328275164e-05, "loss": 0.3065, "step": 10475 }, { "epoch": 1.7887684560894428, "grad_norm": 0.29813440538620184, "learning_rate": 2.242665655032878e-05, "loss": 0.2969, "step": 10480 }, { "epoch": 1.7896219168729197, "grad_norm": 0.3133120369880597, "learning_rate": 2.24108497723824e-05, "loss": 0.3404, "step": 10485 }, { "epoch": 1.7904753776563966, "grad_norm": 0.280110045501517, "learning_rate": 2.2395042994436016e-05, "loss": 0.305, "step": 10490 }, { "epoch": 1.7913288384398736, "grad_norm": 0.3524650760754119, "learning_rate": 2.2379236216489633e-05, "loss": 0.3225, "step": 10495 }, { "epoch": 1.7921822992233507, "grad_norm": 0.2851725073570076, "learning_rate": 2.2363429438543247e-05, "loss": 0.3368, "step": 10500 }, { "epoch": 1.7930357600068279, "grad_norm": 0.3011301902482055, "learning_rate": 2.2347622660596865e-05, "loss": 0.3029, "step": 10505 }, { "epoch": 1.7938892207903048, "grad_norm": 0.27306169444125994, "learning_rate": 2.2331815882650482e-05, "loss": 0.3056, "step": 10510 }, { "epoch": 1.7947426815737817, "grad_norm": 0.2592177715350263, "learning_rate": 2.2316009104704096e-05, "loss": 0.3162, "step": 10515 }, { "epoch": 1.7955961423572586, "grad_norm": 0.30011993190922437, "learning_rate": 2.2300202326757717e-05, "loss": 0.307, "step": 10520 }, { "epoch": 1.7964496031407355, "grad_norm": 0.33987185837709266, "learning_rate": 2.228439554881133e-05, "loss": 0.3166, "step": 10525 }, { "epoch": 1.7973030639242127, "grad_norm": 0.2734352709360251, "learning_rate": 2.226858877086495e-05, "loss": 0.3198, "step": 10530 }, { "epoch": 1.7981565247076898, "grad_norm": 0.3141928682665081, "learning_rate": 2.2252781992918563e-05, "loss": 0.3054, "step": 10535 }, { "epoch": 1.7990099854911668, "grad_norm": 0.2997933074126133, "learning_rate": 2.223697521497218e-05, "loss": 0.3099, "step": 10540 }, { "epoch": 1.7998634462746437, "grad_norm": 0.3824610164354866, "learning_rate": 2.2221168437025798e-05, "loss": 0.3234, "step": 10545 }, { "epoch": 1.8007169070581206, "grad_norm": 0.24118962516194517, "learning_rate": 2.2205361659079415e-05, "loss": 0.2954, "step": 10550 }, { "epoch": 1.8015703678415975, "grad_norm": 0.28123012950216164, "learning_rate": 2.2189554881133033e-05, "loss": 0.316, "step": 10555 }, { "epoch": 1.8024238286250747, "grad_norm": 0.27370800912742627, "learning_rate": 2.2173748103186647e-05, "loss": 0.2956, "step": 10560 }, { "epoch": 1.8032772894085518, "grad_norm": 0.2726516486090715, "learning_rate": 2.2157941325240264e-05, "loss": 0.3154, "step": 10565 }, { "epoch": 1.8041307501920287, "grad_norm": 0.2863301187887699, "learning_rate": 2.214213454729388e-05, "loss": 0.3041, "step": 10570 }, { "epoch": 1.8049842109755057, "grad_norm": 0.22763733522855206, "learning_rate": 2.2126327769347496e-05, "loss": 0.3187, "step": 10575 }, { "epoch": 1.8058376717589826, "grad_norm": 0.3208097824546306, "learning_rate": 2.2110520991401117e-05, "loss": 0.3062, "step": 10580 }, { "epoch": 1.8066911325424597, "grad_norm": 0.2980212855862603, "learning_rate": 2.209471421345473e-05, "loss": 0.316, "step": 10585 }, { "epoch": 1.8075445933259366, "grad_norm": 0.28133942752215363, "learning_rate": 2.2078907435508348e-05, "loss": 0.3156, "step": 10590 }, { "epoch": 1.8083980541094138, "grad_norm": 0.2738317474931386, "learning_rate": 2.2063100657561962e-05, "loss": 0.3127, "step": 10595 }, { "epoch": 1.8092515148928907, "grad_norm": 0.27046462764435764, "learning_rate": 2.204729387961558e-05, "loss": 0.3033, "step": 10600 }, { "epoch": 1.8101049756763676, "grad_norm": 0.26552998868360955, "learning_rate": 2.2031487101669197e-05, "loss": 0.3212, "step": 10605 }, { "epoch": 1.8109584364598446, "grad_norm": 0.2786063339445375, "learning_rate": 2.201568032372281e-05, "loss": 0.3045, "step": 10610 }, { "epoch": 1.8118118972433217, "grad_norm": 0.22484798230173408, "learning_rate": 2.1999873545776432e-05, "loss": 0.3031, "step": 10615 }, { "epoch": 1.8126653580267986, "grad_norm": 0.25146933767171187, "learning_rate": 2.1984066767830046e-05, "loss": 0.3177, "step": 10620 }, { "epoch": 1.8135188188102758, "grad_norm": 0.2899639480648358, "learning_rate": 2.1968259989883664e-05, "loss": 0.3047, "step": 10625 }, { "epoch": 1.8143722795937527, "grad_norm": 0.332119942498462, "learning_rate": 2.195245321193728e-05, "loss": 0.316, "step": 10630 }, { "epoch": 1.8152257403772296, "grad_norm": 0.32030852008406396, "learning_rate": 2.1936646433990895e-05, "loss": 0.3127, "step": 10635 }, { "epoch": 1.8160792011607065, "grad_norm": 0.273263048742988, "learning_rate": 2.1920839656044513e-05, "loss": 0.3096, "step": 10640 }, { "epoch": 1.8169326619441837, "grad_norm": 0.26695581384206807, "learning_rate": 2.190503287809813e-05, "loss": 0.2973, "step": 10645 }, { "epoch": 1.8177861227276608, "grad_norm": 0.2724109780847453, "learning_rate": 2.1889226100151747e-05, "loss": 0.3053, "step": 10650 }, { "epoch": 1.8186395835111377, "grad_norm": 0.31099591593986037, "learning_rate": 2.187341932220536e-05, "loss": 0.3087, "step": 10655 }, { "epoch": 1.8194930442946147, "grad_norm": 0.31700693918666384, "learning_rate": 2.185761254425898e-05, "loss": 0.3355, "step": 10660 }, { "epoch": 1.8203465050780916, "grad_norm": 0.2874470267461024, "learning_rate": 2.1841805766312596e-05, "loss": 0.3163, "step": 10665 }, { "epoch": 1.8211999658615685, "grad_norm": 0.2951656182792951, "learning_rate": 2.182599898836621e-05, "loss": 0.3166, "step": 10670 }, { "epoch": 1.8220534266450457, "grad_norm": 0.2927906859048263, "learning_rate": 2.181019221041983e-05, "loss": 0.314, "step": 10675 }, { "epoch": 1.8229068874285228, "grad_norm": 0.22386341130822446, "learning_rate": 2.1794385432473445e-05, "loss": 0.3043, "step": 10680 }, { "epoch": 1.8237603482119997, "grad_norm": 0.27440467978037464, "learning_rate": 2.1778578654527063e-05, "loss": 0.3085, "step": 10685 }, { "epoch": 1.8246138089954766, "grad_norm": 0.29342550926160277, "learning_rate": 2.176277187658068e-05, "loss": 0.2979, "step": 10690 }, { "epoch": 1.8254672697789536, "grad_norm": 0.3223240846453698, "learning_rate": 2.1746965098634294e-05, "loss": 0.3026, "step": 10695 }, { "epoch": 1.8263207305624305, "grad_norm": 0.2775936156450699, "learning_rate": 2.1731158320687912e-05, "loss": 0.3082, "step": 10700 }, { "epoch": 1.8271741913459076, "grad_norm": 0.3437617009404465, "learning_rate": 2.171535154274153e-05, "loss": 0.3005, "step": 10705 }, { "epoch": 1.8280276521293848, "grad_norm": 0.272203214549919, "learning_rate": 2.1699544764795147e-05, "loss": 0.3157, "step": 10710 }, { "epoch": 1.8288811129128617, "grad_norm": 0.2658030527305792, "learning_rate": 2.168373798684876e-05, "loss": 0.3156, "step": 10715 }, { "epoch": 1.8297345736963386, "grad_norm": 0.32445681932641157, "learning_rate": 2.166793120890238e-05, "loss": 0.3123, "step": 10720 }, { "epoch": 1.8305880344798156, "grad_norm": 0.30591032989368405, "learning_rate": 2.1652124430955996e-05, "loss": 0.3093, "step": 10725 }, { "epoch": 1.8314414952632927, "grad_norm": 0.2769685259829598, "learning_rate": 2.163631765300961e-05, "loss": 0.3303, "step": 10730 }, { "epoch": 1.8322949560467696, "grad_norm": 0.24358336257807703, "learning_rate": 2.1620510875063227e-05, "loss": 0.3213, "step": 10735 }, { "epoch": 1.8331484168302468, "grad_norm": 0.3624154431322184, "learning_rate": 2.1604704097116845e-05, "loss": 0.313, "step": 10740 }, { "epoch": 1.8340018776137237, "grad_norm": 0.2702209553847913, "learning_rate": 2.1588897319170462e-05, "loss": 0.3309, "step": 10745 }, { "epoch": 1.8348553383972006, "grad_norm": 0.23389633995753287, "learning_rate": 2.157309054122408e-05, "loss": 0.3043, "step": 10750 }, { "epoch": 1.8357087991806775, "grad_norm": 0.28770383365075075, "learning_rate": 2.1557283763277694e-05, "loss": 0.3119, "step": 10755 }, { "epoch": 1.8365622599641547, "grad_norm": 0.26477738399179734, "learning_rate": 2.154147698533131e-05, "loss": 0.3118, "step": 10760 }, { "epoch": 1.8374157207476316, "grad_norm": 0.3203786112920576, "learning_rate": 2.1525670207384925e-05, "loss": 0.3123, "step": 10765 }, { "epoch": 1.8382691815311087, "grad_norm": 0.29274284226667585, "learning_rate": 2.1509863429438546e-05, "loss": 0.3235, "step": 10770 }, { "epoch": 1.8391226423145857, "grad_norm": 0.30205406263892426, "learning_rate": 2.149405665149216e-05, "loss": 0.2948, "step": 10775 }, { "epoch": 1.8399761030980626, "grad_norm": 0.30551072829077425, "learning_rate": 2.1478249873545778e-05, "loss": 0.324, "step": 10780 }, { "epoch": 1.8408295638815395, "grad_norm": 0.30805641098275954, "learning_rate": 2.1462443095599395e-05, "loss": 0.3314, "step": 10785 }, { "epoch": 1.8416830246650167, "grad_norm": 0.26316989178197897, "learning_rate": 2.144663631765301e-05, "loss": 0.333, "step": 10790 }, { "epoch": 1.8425364854484938, "grad_norm": 0.2475715724677562, "learning_rate": 2.1430829539706627e-05, "loss": 0.303, "step": 10795 }, { "epoch": 1.8433899462319707, "grad_norm": 0.3094328093993087, "learning_rate": 2.1415022761760244e-05, "loss": 0.3057, "step": 10800 }, { "epoch": 1.8442434070154476, "grad_norm": 0.30895027907234124, "learning_rate": 2.139921598381386e-05, "loss": 0.3029, "step": 10805 }, { "epoch": 1.8450968677989246, "grad_norm": 0.2593894949501174, "learning_rate": 2.138340920586748e-05, "loss": 0.3263, "step": 10810 }, { "epoch": 1.8459503285824015, "grad_norm": 0.3077169344034299, "learning_rate": 2.1367602427921093e-05, "loss": 0.3187, "step": 10815 }, { "epoch": 1.8468037893658786, "grad_norm": 0.3234728705536577, "learning_rate": 2.135179564997471e-05, "loss": 0.313, "step": 10820 }, { "epoch": 1.8476572501493558, "grad_norm": 0.3886813324167672, "learning_rate": 2.1335988872028325e-05, "loss": 0.3022, "step": 10825 }, { "epoch": 1.8485107109328327, "grad_norm": 0.3399546939744835, "learning_rate": 2.1320182094081945e-05, "loss": 0.3164, "step": 10830 }, { "epoch": 1.8493641717163096, "grad_norm": 0.3410602556381905, "learning_rate": 2.130437531613556e-05, "loss": 0.2945, "step": 10835 }, { "epoch": 1.8502176324997865, "grad_norm": 0.2720845531580212, "learning_rate": 2.1288568538189177e-05, "loss": 0.2972, "step": 10840 }, { "epoch": 1.8510710932832635, "grad_norm": 0.3254249966006687, "learning_rate": 2.1272761760242794e-05, "loss": 0.344, "step": 10845 }, { "epoch": 1.8519245540667406, "grad_norm": 0.295706347273412, "learning_rate": 2.125695498229641e-05, "loss": 0.3142, "step": 10850 }, { "epoch": 1.8527780148502178, "grad_norm": 0.23841843768023677, "learning_rate": 2.1241148204350026e-05, "loss": 0.3201, "step": 10855 }, { "epoch": 1.8536314756336947, "grad_norm": 0.29027644124799706, "learning_rate": 2.122534142640364e-05, "loss": 0.3191, "step": 10860 }, { "epoch": 1.8544849364171716, "grad_norm": 0.27735852902856434, "learning_rate": 2.120953464845726e-05, "loss": 0.3025, "step": 10865 }, { "epoch": 1.8553383972006485, "grad_norm": 0.27578032594330043, "learning_rate": 2.1193727870510875e-05, "loss": 0.3313, "step": 10870 }, { "epoch": 1.8561918579841257, "grad_norm": 0.31443261658567623, "learning_rate": 2.1177921092564492e-05, "loss": 0.3055, "step": 10875 }, { "epoch": 1.8570453187676026, "grad_norm": 0.2639854946436977, "learning_rate": 2.116211431461811e-05, "loss": 0.3013, "step": 10880 }, { "epoch": 1.8578987795510797, "grad_norm": 0.2513777247505605, "learning_rate": 2.1146307536671724e-05, "loss": 0.3232, "step": 10885 }, { "epoch": 1.8587522403345567, "grad_norm": 0.3383527247419007, "learning_rate": 2.113050075872534e-05, "loss": 0.3097, "step": 10890 }, { "epoch": 1.8596057011180336, "grad_norm": 0.27043101098352784, "learning_rate": 2.111469398077896e-05, "loss": 0.3033, "step": 10895 }, { "epoch": 1.8604591619015105, "grad_norm": 0.3158127632907112, "learning_rate": 2.1098887202832576e-05, "loss": 0.3113, "step": 10900 }, { "epoch": 1.8613126226849876, "grad_norm": 0.31936019322927794, "learning_rate": 2.1083080424886194e-05, "loss": 0.3204, "step": 10905 }, { "epoch": 1.8621660834684646, "grad_norm": 0.26636194754526654, "learning_rate": 2.1067273646939808e-05, "loss": 0.2979, "step": 10910 }, { "epoch": 1.8630195442519417, "grad_norm": 0.3018393100769779, "learning_rate": 2.1051466868993425e-05, "loss": 0.3148, "step": 10915 }, { "epoch": 1.8638730050354186, "grad_norm": 0.41302207393497137, "learning_rate": 2.103566009104704e-05, "loss": 0.3127, "step": 10920 }, { "epoch": 1.8647264658188956, "grad_norm": 0.30775787102173974, "learning_rate": 2.101985331310066e-05, "loss": 0.3135, "step": 10925 }, { "epoch": 1.8655799266023725, "grad_norm": 0.2902241650258772, "learning_rate": 2.1004046535154274e-05, "loss": 0.2935, "step": 10930 }, { "epoch": 1.8664333873858496, "grad_norm": 0.3383675304858579, "learning_rate": 2.0988239757207892e-05, "loss": 0.3295, "step": 10935 }, { "epoch": 1.8672868481693268, "grad_norm": 0.2920561766515733, "learning_rate": 2.097243297926151e-05, "loss": 0.326, "step": 10940 }, { "epoch": 1.8681403089528037, "grad_norm": 0.28520237162578227, "learning_rate": 2.0956626201315123e-05, "loss": 0.3159, "step": 10945 }, { "epoch": 1.8689937697362806, "grad_norm": 0.25927131720343516, "learning_rate": 2.094081942336874e-05, "loss": 0.2927, "step": 10950 }, { "epoch": 1.8698472305197575, "grad_norm": 0.2676133314278998, "learning_rate": 2.0925012645422358e-05, "loss": 0.3088, "step": 10955 }, { "epoch": 1.8707006913032345, "grad_norm": 0.23231444938423423, "learning_rate": 2.0909205867475976e-05, "loss": 0.3193, "step": 10960 }, { "epoch": 1.8715541520867116, "grad_norm": 0.2729583451691963, "learning_rate": 2.0893399089529593e-05, "loss": 0.3081, "step": 10965 }, { "epoch": 1.8724076128701888, "grad_norm": 0.2334307722186081, "learning_rate": 2.0877592311583207e-05, "loss": 0.3015, "step": 10970 }, { "epoch": 1.8732610736536657, "grad_norm": 0.27492082631307735, "learning_rate": 2.0861785533636825e-05, "loss": 0.2988, "step": 10975 }, { "epoch": 1.8741145344371426, "grad_norm": 0.3495008619769614, "learning_rate": 2.084597875569044e-05, "loss": 0.3255, "step": 10980 }, { "epoch": 1.8749679952206195, "grad_norm": 0.2806631255292244, "learning_rate": 2.0830171977744056e-05, "loss": 0.3187, "step": 10985 }, { "epoch": 1.8758214560040967, "grad_norm": 0.26740106156416055, "learning_rate": 2.0814365199797674e-05, "loss": 0.3207, "step": 10990 }, { "epoch": 1.8766749167875736, "grad_norm": 0.28043196734225934, "learning_rate": 2.079855842185129e-05, "loss": 0.3249, "step": 10995 }, { "epoch": 1.8775283775710507, "grad_norm": 0.236584095540382, "learning_rate": 2.078275164390491e-05, "loss": 0.3045, "step": 11000 }, { "epoch": 1.8783818383545277, "grad_norm": 0.3088064080238884, "learning_rate": 2.0766944865958523e-05, "loss": 0.3079, "step": 11005 }, { "epoch": 1.8792352991380046, "grad_norm": 0.29817074963246243, "learning_rate": 2.075113808801214e-05, "loss": 0.3152, "step": 11010 }, { "epoch": 1.8800887599214815, "grad_norm": 0.2730197340522276, "learning_rate": 2.0735331310065757e-05, "loss": 0.2911, "step": 11015 }, { "epoch": 1.8809422207049586, "grad_norm": 0.3259290826561265, "learning_rate": 2.0719524532119375e-05, "loss": 0.3129, "step": 11020 }, { "epoch": 1.8817956814884356, "grad_norm": 0.2444788007153159, "learning_rate": 2.0703717754172992e-05, "loss": 0.3085, "step": 11025 }, { "epoch": 1.8826491422719127, "grad_norm": 0.3097458539502419, "learning_rate": 2.0687910976226606e-05, "loss": 0.2916, "step": 11030 }, { "epoch": 1.8835026030553896, "grad_norm": 0.24067141687649615, "learning_rate": 2.0672104198280224e-05, "loss": 0.3086, "step": 11035 }, { "epoch": 1.8843560638388666, "grad_norm": 0.21594529416686895, "learning_rate": 2.0656297420333838e-05, "loss": 0.3371, "step": 11040 }, { "epoch": 1.8852095246223435, "grad_norm": 0.28183046504979564, "learning_rate": 2.0640490642387455e-05, "loss": 0.3123, "step": 11045 }, { "epoch": 1.8860629854058206, "grad_norm": 0.21022837799634844, "learning_rate": 2.0624683864441073e-05, "loss": 0.316, "step": 11050 }, { "epoch": 1.8869164461892975, "grad_norm": 0.2742734311824316, "learning_rate": 2.060887708649469e-05, "loss": 0.2979, "step": 11055 }, { "epoch": 1.8877699069727747, "grad_norm": 0.3031481346608741, "learning_rate": 2.0593070308548308e-05, "loss": 0.3111, "step": 11060 }, { "epoch": 1.8886233677562516, "grad_norm": 0.2738404989907127, "learning_rate": 2.0577263530601922e-05, "loss": 0.293, "step": 11065 }, { "epoch": 1.8894768285397285, "grad_norm": 0.26818220960616074, "learning_rate": 2.056145675265554e-05, "loss": 0.2953, "step": 11070 }, { "epoch": 1.8903302893232055, "grad_norm": 0.27014237282449927, "learning_rate": 2.0545649974709157e-05, "loss": 0.3068, "step": 11075 }, { "epoch": 1.8911837501066826, "grad_norm": 0.265419818018807, "learning_rate": 2.0529843196762774e-05, "loss": 0.2867, "step": 11080 }, { "epoch": 1.8920372108901597, "grad_norm": 0.2709569454157105, "learning_rate": 2.0514036418816392e-05, "loss": 0.311, "step": 11085 }, { "epoch": 1.8928906716736367, "grad_norm": 0.2860498368474054, "learning_rate": 2.0498229640870006e-05, "loss": 0.3001, "step": 11090 }, { "epoch": 1.8937441324571136, "grad_norm": 0.30598991214738636, "learning_rate": 2.0482422862923623e-05, "loss": 0.3027, "step": 11095 }, { "epoch": 1.8945975932405905, "grad_norm": 0.25400655998782146, "learning_rate": 2.0466616084977237e-05, "loss": 0.31, "step": 11100 }, { "epoch": 1.8954510540240674, "grad_norm": 0.2593642454933725, "learning_rate": 2.0450809307030855e-05, "loss": 0.3015, "step": 11105 }, { "epoch": 1.8963045148075446, "grad_norm": 0.2632213593292669, "learning_rate": 2.0435002529084472e-05, "loss": 0.3024, "step": 11110 }, { "epoch": 1.8971579755910217, "grad_norm": 0.2865764123455578, "learning_rate": 2.041919575113809e-05, "loss": 0.2763, "step": 11115 }, { "epoch": 1.8980114363744987, "grad_norm": 0.34345725242869435, "learning_rate": 2.0403388973191707e-05, "loss": 0.2999, "step": 11120 }, { "epoch": 1.8988648971579756, "grad_norm": 0.2434023059965411, "learning_rate": 2.038758219524532e-05, "loss": 0.3031, "step": 11125 }, { "epoch": 1.8997183579414525, "grad_norm": 0.2553782382137181, "learning_rate": 2.037177541729894e-05, "loss": 0.3183, "step": 11130 }, { "epoch": 1.9005718187249296, "grad_norm": 0.28971840793038206, "learning_rate": 2.0355968639352556e-05, "loss": 0.3265, "step": 11135 }, { "epoch": 1.9014252795084066, "grad_norm": 0.2778396948016992, "learning_rate": 2.034016186140617e-05, "loss": 0.3292, "step": 11140 }, { "epoch": 1.9022787402918837, "grad_norm": 0.24520495339409093, "learning_rate": 2.032435508345979e-05, "loss": 0.3147, "step": 11145 }, { "epoch": 1.9031322010753606, "grad_norm": 0.24276442939037074, "learning_rate": 2.0308548305513405e-05, "loss": 0.3089, "step": 11150 }, { "epoch": 1.9039856618588376, "grad_norm": 0.3395712702759526, "learning_rate": 2.0292741527567023e-05, "loss": 0.3148, "step": 11155 }, { "epoch": 1.9048391226423145, "grad_norm": 0.2690160180341484, "learning_rate": 2.0276934749620637e-05, "loss": 0.3235, "step": 11160 }, { "epoch": 1.9056925834257916, "grad_norm": 0.29201317458434595, "learning_rate": 2.0261127971674254e-05, "loss": 0.2921, "step": 11165 }, { "epoch": 1.9065460442092685, "grad_norm": 0.24955541287210153, "learning_rate": 2.024532119372787e-05, "loss": 0.3056, "step": 11170 }, { "epoch": 1.9073995049927457, "grad_norm": 0.24495429182213857, "learning_rate": 2.022951441578149e-05, "loss": 0.3268, "step": 11175 }, { "epoch": 1.9082529657762226, "grad_norm": 0.25837204204172554, "learning_rate": 2.0213707637835106e-05, "loss": 0.2815, "step": 11180 }, { "epoch": 1.9091064265596995, "grad_norm": 0.24025553857010584, "learning_rate": 2.019790085988872e-05, "loss": 0.2954, "step": 11185 }, { "epoch": 1.9099598873431765, "grad_norm": 0.3095462826469391, "learning_rate": 2.0182094081942338e-05, "loss": 0.3141, "step": 11190 }, { "epoch": 1.9108133481266536, "grad_norm": 0.3148202548697605, "learning_rate": 2.0166287303995955e-05, "loss": 0.3007, "step": 11195 }, { "epoch": 1.9116668089101307, "grad_norm": 0.2787359297135444, "learning_rate": 2.015048052604957e-05, "loss": 0.313, "step": 11200 }, { "epoch": 1.9125202696936077, "grad_norm": 0.2699964850245832, "learning_rate": 2.013467374810319e-05, "loss": 0.3049, "step": 11205 }, { "epoch": 1.9133737304770846, "grad_norm": 0.3010774682098107, "learning_rate": 2.0118866970156804e-05, "loss": 0.2858, "step": 11210 }, { "epoch": 1.9142271912605615, "grad_norm": 0.30045725395118056, "learning_rate": 2.0103060192210422e-05, "loss": 0.2997, "step": 11215 }, { "epoch": 1.9150806520440384, "grad_norm": 0.2802629995322089, "learning_rate": 2.0087253414264036e-05, "loss": 0.3077, "step": 11220 }, { "epoch": 1.9159341128275156, "grad_norm": 0.2697206582193647, "learning_rate": 2.0071446636317653e-05, "loss": 0.3097, "step": 11225 }, { "epoch": 1.9167875736109927, "grad_norm": 0.2900507602336662, "learning_rate": 2.005563985837127e-05, "loss": 0.3102, "step": 11230 }, { "epoch": 1.9176410343944696, "grad_norm": 0.31369005137146416, "learning_rate": 2.003983308042489e-05, "loss": 0.3062, "step": 11235 }, { "epoch": 1.9184944951779466, "grad_norm": 0.284922024207156, "learning_rate": 2.0024026302478506e-05, "loss": 0.3357, "step": 11240 }, { "epoch": 1.9193479559614235, "grad_norm": 0.26609619834191567, "learning_rate": 2.000821952453212e-05, "loss": 0.329, "step": 11245 }, { "epoch": 1.9202014167449004, "grad_norm": 0.3136619415768956, "learning_rate": 1.9992412746585737e-05, "loss": 0.335, "step": 11250 }, { "epoch": 1.9210548775283776, "grad_norm": 0.3169664067848951, "learning_rate": 1.997660596863935e-05, "loss": 0.3006, "step": 11255 }, { "epoch": 1.9219083383118547, "grad_norm": 0.26285624900188853, "learning_rate": 1.996079919069297e-05, "loss": 0.3112, "step": 11260 }, { "epoch": 1.9227617990953316, "grad_norm": 0.2747620130765795, "learning_rate": 1.9944992412746586e-05, "loss": 0.3102, "step": 11265 }, { "epoch": 1.9236152598788085, "grad_norm": 0.3169416036527013, "learning_rate": 1.9929185634800204e-05, "loss": 0.2985, "step": 11270 }, { "epoch": 1.9244687206622855, "grad_norm": 0.2693723990684541, "learning_rate": 1.991337885685382e-05, "loss": 0.3103, "step": 11275 }, { "epoch": 1.9253221814457626, "grad_norm": 0.3173224203281029, "learning_rate": 1.9897572078907435e-05, "loss": 0.3177, "step": 11280 }, { "epoch": 1.9261756422292395, "grad_norm": 0.2780723177134725, "learning_rate": 1.9881765300961053e-05, "loss": 0.304, "step": 11285 }, { "epoch": 1.9270291030127167, "grad_norm": 0.26781897160424356, "learning_rate": 1.986595852301467e-05, "loss": 0.3057, "step": 11290 }, { "epoch": 1.9278825637961936, "grad_norm": 0.25666896856623955, "learning_rate": 1.9850151745068284e-05, "loss": 0.283, "step": 11295 }, { "epoch": 1.9287360245796705, "grad_norm": 0.2547824172972609, "learning_rate": 1.9834344967121905e-05, "loss": 0.3108, "step": 11300 }, { "epoch": 1.9295894853631474, "grad_norm": 0.27092307336730465, "learning_rate": 1.981853818917552e-05, "loss": 0.2955, "step": 11305 }, { "epoch": 1.9304429461466246, "grad_norm": 0.3284231009484845, "learning_rate": 1.9802731411229137e-05, "loss": 0.3044, "step": 11310 }, { "epoch": 1.9312964069301015, "grad_norm": 0.30322012598558984, "learning_rate": 1.978692463328275e-05, "loss": 0.3201, "step": 11315 }, { "epoch": 1.9321498677135787, "grad_norm": 0.27025980838189034, "learning_rate": 1.9771117855336368e-05, "loss": 0.3195, "step": 11320 }, { "epoch": 1.9330033284970556, "grad_norm": 0.2775783052773263, "learning_rate": 1.9755311077389986e-05, "loss": 0.3179, "step": 11325 }, { "epoch": 1.9338567892805325, "grad_norm": 0.3213995026286005, "learning_rate": 1.9739504299443603e-05, "loss": 0.3042, "step": 11330 }, { "epoch": 1.9347102500640094, "grad_norm": 0.2655139986062076, "learning_rate": 1.972369752149722e-05, "loss": 0.3148, "step": 11335 }, { "epoch": 1.9355637108474866, "grad_norm": 0.27645312062658195, "learning_rate": 1.9707890743550835e-05, "loss": 0.3178, "step": 11340 }, { "epoch": 1.9364171716309637, "grad_norm": 0.27172503409024584, "learning_rate": 1.9692083965604452e-05, "loss": 0.3199, "step": 11345 }, { "epoch": 1.9372706324144406, "grad_norm": 0.27261589255314334, "learning_rate": 1.967627718765807e-05, "loss": 0.298, "step": 11350 }, { "epoch": 1.9381240931979176, "grad_norm": 0.27184685103790507, "learning_rate": 1.9660470409711684e-05, "loss": 0.3188, "step": 11355 }, { "epoch": 1.9389775539813945, "grad_norm": 0.271893568153185, "learning_rate": 1.9644663631765304e-05, "loss": 0.3152, "step": 11360 }, { "epoch": 1.9398310147648714, "grad_norm": 0.261623695653045, "learning_rate": 1.962885685381892e-05, "loss": 0.3136, "step": 11365 }, { "epoch": 1.9406844755483486, "grad_norm": 0.2941011708677073, "learning_rate": 1.9613050075872536e-05, "loss": 0.3075, "step": 11370 }, { "epoch": 1.9415379363318257, "grad_norm": 0.26094090304836, "learning_rate": 1.959724329792615e-05, "loss": 0.3049, "step": 11375 }, { "epoch": 1.9423913971153026, "grad_norm": 0.2219207402373316, "learning_rate": 1.9581436519979768e-05, "loss": 0.3333, "step": 11380 }, { "epoch": 1.9432448578987795, "grad_norm": 0.2674675226840287, "learning_rate": 1.9565629742033385e-05, "loss": 0.3207, "step": 11385 }, { "epoch": 1.9440983186822565, "grad_norm": 0.25375766038622594, "learning_rate": 1.9549822964087e-05, "loss": 0.3074, "step": 11390 }, { "epoch": 1.9449517794657334, "grad_norm": 0.2871070593229922, "learning_rate": 1.953401618614062e-05, "loss": 0.3002, "step": 11395 }, { "epoch": 1.9458052402492105, "grad_norm": 0.24504511807290577, "learning_rate": 1.9518209408194234e-05, "loss": 0.3083, "step": 11400 }, { "epoch": 1.9466587010326877, "grad_norm": 0.2352600252008979, "learning_rate": 1.950240263024785e-05, "loss": 0.289, "step": 11405 }, { "epoch": 1.9475121618161646, "grad_norm": 0.28105316934632996, "learning_rate": 1.948659585230147e-05, "loss": 0.3164, "step": 11410 }, { "epoch": 1.9483656225996415, "grad_norm": 0.28551974924092066, "learning_rate": 1.9470789074355083e-05, "loss": 0.3109, "step": 11415 }, { "epoch": 1.9492190833831184, "grad_norm": 0.26703886423846845, "learning_rate": 1.94549822964087e-05, "loss": 0.3116, "step": 11420 }, { "epoch": 1.9500725441665956, "grad_norm": 0.262715217342799, "learning_rate": 1.9439175518462318e-05, "loss": 0.3017, "step": 11425 }, { "epoch": 1.9509260049500725, "grad_norm": 0.3107115674052192, "learning_rate": 1.9423368740515935e-05, "loss": 0.3062, "step": 11430 }, { "epoch": 1.9517794657335497, "grad_norm": 0.2573311731038781, "learning_rate": 1.940756196256955e-05, "loss": 0.3228, "step": 11435 }, { "epoch": 1.9526329265170266, "grad_norm": 0.2643036655163505, "learning_rate": 1.9391755184623167e-05, "loss": 0.3046, "step": 11440 }, { "epoch": 1.9534863873005035, "grad_norm": 0.28857879039605056, "learning_rate": 1.9375948406676784e-05, "loss": 0.3248, "step": 11445 }, { "epoch": 1.9543398480839804, "grad_norm": 0.27970725789563095, "learning_rate": 1.93601416287304e-05, "loss": 0.3367, "step": 11450 }, { "epoch": 1.9551933088674576, "grad_norm": 0.2808722028294523, "learning_rate": 1.934433485078402e-05, "loss": 0.3286, "step": 11455 }, { "epoch": 1.9560467696509345, "grad_norm": 0.2525812508573616, "learning_rate": 1.9328528072837633e-05, "loss": 0.3107, "step": 11460 }, { "epoch": 1.9569002304344116, "grad_norm": 0.29918716083466057, "learning_rate": 1.931272129489125e-05, "loss": 0.3141, "step": 11465 }, { "epoch": 1.9577536912178886, "grad_norm": 0.24209583846446778, "learning_rate": 1.9296914516944868e-05, "loss": 0.3003, "step": 11470 }, { "epoch": 1.9586071520013655, "grad_norm": 0.2660869249887226, "learning_rate": 1.9281107738998482e-05, "loss": 0.3168, "step": 11475 }, { "epoch": 1.9594606127848424, "grad_norm": 0.32317410586167683, "learning_rate": 1.92653009610521e-05, "loss": 0.2887, "step": 11480 }, { "epoch": 1.9603140735683195, "grad_norm": 0.2866839569669144, "learning_rate": 1.9249494183105717e-05, "loss": 0.3074, "step": 11485 }, { "epoch": 1.9611675343517967, "grad_norm": 0.21559300034006146, "learning_rate": 1.9233687405159335e-05, "loss": 0.3201, "step": 11490 }, { "epoch": 1.9620209951352736, "grad_norm": 0.25372322122500474, "learning_rate": 1.921788062721295e-05, "loss": 0.2947, "step": 11495 }, { "epoch": 1.9628744559187505, "grad_norm": 0.2733533737208337, "learning_rate": 1.9202073849266566e-05, "loss": 0.3139, "step": 11500 }, { "epoch": 1.9637279167022275, "grad_norm": 0.26766081032729877, "learning_rate": 1.9186267071320184e-05, "loss": 0.3065, "step": 11505 }, { "epoch": 1.9645813774857044, "grad_norm": 0.2541256403323289, "learning_rate": 1.9170460293373798e-05, "loss": 0.3014, "step": 11510 }, { "epoch": 1.9654348382691815, "grad_norm": 0.27968552868017804, "learning_rate": 1.9154653515427415e-05, "loss": 0.3165, "step": 11515 }, { "epoch": 1.9662882990526587, "grad_norm": 0.3033843915530958, "learning_rate": 1.9138846737481033e-05, "loss": 0.3058, "step": 11520 }, { "epoch": 1.9671417598361356, "grad_norm": 0.3079245950444975, "learning_rate": 1.912303995953465e-05, "loss": 0.3143, "step": 11525 }, { "epoch": 1.9679952206196125, "grad_norm": 0.28427282838374884, "learning_rate": 1.9107233181588268e-05, "loss": 0.3026, "step": 11530 }, { "epoch": 1.9688486814030894, "grad_norm": 0.26497511940394314, "learning_rate": 1.909142640364188e-05, "loss": 0.305, "step": 11535 }, { "epoch": 1.9697021421865664, "grad_norm": 0.24697383340510523, "learning_rate": 1.90756196256955e-05, "loss": 0.2975, "step": 11540 }, { "epoch": 1.9705556029700435, "grad_norm": 0.3024757312466317, "learning_rate": 1.9059812847749113e-05, "loss": 0.3174, "step": 11545 }, { "epoch": 1.9714090637535207, "grad_norm": 0.2488098140424247, "learning_rate": 1.9044006069802734e-05, "loss": 0.2914, "step": 11550 }, { "epoch": 1.9722625245369976, "grad_norm": 0.21081187152074715, "learning_rate": 1.9028199291856348e-05, "loss": 0.3087, "step": 11555 }, { "epoch": 1.9731159853204745, "grad_norm": 0.29395394337945346, "learning_rate": 1.9012392513909966e-05, "loss": 0.3221, "step": 11560 }, { "epoch": 1.9739694461039514, "grad_norm": 0.29771250912273467, "learning_rate": 1.8996585735963583e-05, "loss": 0.3059, "step": 11565 }, { "epoch": 1.9748229068874286, "grad_norm": 0.23541942931532947, "learning_rate": 1.8980778958017197e-05, "loss": 0.2873, "step": 11570 }, { "epoch": 1.9756763676709055, "grad_norm": 0.2845520331468578, "learning_rate": 1.8964972180070814e-05, "loss": 0.3019, "step": 11575 }, { "epoch": 1.9765298284543826, "grad_norm": 0.26456548137691954, "learning_rate": 1.8949165402124432e-05, "loss": 0.3169, "step": 11580 }, { "epoch": 1.9773832892378596, "grad_norm": 0.29465100108750136, "learning_rate": 1.893335862417805e-05, "loss": 0.3172, "step": 11585 }, { "epoch": 1.9782367500213365, "grad_norm": 0.26833944009344524, "learning_rate": 1.8917551846231667e-05, "loss": 0.293, "step": 11590 }, { "epoch": 1.9790902108048134, "grad_norm": 0.26929776035585185, "learning_rate": 1.890174506828528e-05, "loss": 0.2947, "step": 11595 }, { "epoch": 1.9799436715882905, "grad_norm": 0.27348353234063927, "learning_rate": 1.88859382903389e-05, "loss": 0.3126, "step": 11600 }, { "epoch": 1.9807971323717675, "grad_norm": 0.2608634679483541, "learning_rate": 1.8870131512392512e-05, "loss": 0.3396, "step": 11605 }, { "epoch": 1.9816505931552446, "grad_norm": 0.310724811583607, "learning_rate": 1.8854324734446133e-05, "loss": 0.3078, "step": 11610 }, { "epoch": 1.9825040539387215, "grad_norm": 0.2874267050157969, "learning_rate": 1.8838517956499747e-05, "loss": 0.3238, "step": 11615 }, { "epoch": 1.9833575147221985, "grad_norm": 0.31560742768443295, "learning_rate": 1.8822711178553365e-05, "loss": 0.3013, "step": 11620 }, { "epoch": 1.9842109755056754, "grad_norm": 0.2527161463891475, "learning_rate": 1.8806904400606982e-05, "loss": 0.3043, "step": 11625 }, { "epoch": 1.9850644362891525, "grad_norm": 0.28949105202342934, "learning_rate": 1.8791097622660596e-05, "loss": 0.3145, "step": 11630 }, { "epoch": 1.9859178970726297, "grad_norm": 0.310237999641645, "learning_rate": 1.8775290844714214e-05, "loss": 0.3146, "step": 11635 }, { "epoch": 1.9867713578561066, "grad_norm": 0.6725630731350639, "learning_rate": 1.875948406676783e-05, "loss": 0.3148, "step": 11640 }, { "epoch": 1.9876248186395835, "grad_norm": 0.26808047154803955, "learning_rate": 1.874367728882145e-05, "loss": 0.3157, "step": 11645 }, { "epoch": 1.9884782794230604, "grad_norm": 0.30402715867802244, "learning_rate": 1.8727870510875066e-05, "loss": 0.3093, "step": 11650 }, { "epoch": 1.9893317402065374, "grad_norm": 0.225823332679662, "learning_rate": 1.871206373292868e-05, "loss": 0.3087, "step": 11655 }, { "epoch": 1.9901852009900145, "grad_norm": 0.23917526655283702, "learning_rate": 1.8696256954982298e-05, "loss": 0.2991, "step": 11660 }, { "epoch": 1.9910386617734916, "grad_norm": 0.285025942241649, "learning_rate": 1.8680450177035912e-05, "loss": 0.3139, "step": 11665 }, { "epoch": 1.9918921225569686, "grad_norm": 0.2769459516557168, "learning_rate": 1.866464339908953e-05, "loss": 0.3266, "step": 11670 }, { "epoch": 1.9927455833404455, "grad_norm": 0.27889545729983006, "learning_rate": 1.8648836621143147e-05, "loss": 0.2961, "step": 11675 }, { "epoch": 1.9935990441239224, "grad_norm": 0.2249055694305042, "learning_rate": 1.8633029843196764e-05, "loss": 0.2899, "step": 11680 }, { "epoch": 1.9944525049073996, "grad_norm": 0.2513801436428232, "learning_rate": 1.861722306525038e-05, "loss": 0.3118, "step": 11685 }, { "epoch": 1.9953059656908765, "grad_norm": 0.25244748734242745, "learning_rate": 1.8601416287303996e-05, "loss": 0.305, "step": 11690 }, { "epoch": 1.9961594264743536, "grad_norm": 0.31799694441538345, "learning_rate": 1.8585609509357613e-05, "loss": 0.307, "step": 11695 }, { "epoch": 1.9970128872578305, "grad_norm": 0.25846242081928683, "learning_rate": 1.8569802731411227e-05, "loss": 0.2927, "step": 11700 }, { "epoch": 1.9978663480413075, "grad_norm": 0.2721126685452157, "learning_rate": 1.8553995953464848e-05, "loss": 0.3009, "step": 11705 }, { "epoch": 1.9987198088247844, "grad_norm": 0.2753539581013206, "learning_rate": 1.8538189175518466e-05, "loss": 0.312, "step": 11710 }, { "epoch": 1.9995732696082615, "grad_norm": 0.2539326058500821, "learning_rate": 1.852238239757208e-05, "loss": 0.3004, "step": 11715 }, { "epoch": 2.0003413843133906, "grad_norm": 0.2717025827692001, "learning_rate": 1.8506575619625697e-05, "loss": 0.2732, "step": 11720 }, { "epoch": 2.001194845096868, "grad_norm": 0.2877966504455341, "learning_rate": 1.849076884167931e-05, "loss": 0.2153, "step": 11725 }, { "epoch": 2.002048305880345, "grad_norm": 0.2825784062820552, "learning_rate": 1.847496206373293e-05, "loss": 0.2239, "step": 11730 }, { "epoch": 2.002901766663822, "grad_norm": 0.31842855897394323, "learning_rate": 1.8459155285786546e-05, "loss": 0.2123, "step": 11735 }, { "epoch": 2.0037552274472987, "grad_norm": 0.30168488984126446, "learning_rate": 1.8443348507840163e-05, "loss": 0.2223, "step": 11740 }, { "epoch": 2.0046086882307756, "grad_norm": 0.28541261285458136, "learning_rate": 1.842754172989378e-05, "loss": 0.2155, "step": 11745 }, { "epoch": 2.005462149014253, "grad_norm": 0.30399085693297123, "learning_rate": 1.8411734951947395e-05, "loss": 0.2028, "step": 11750 }, { "epoch": 2.00631560979773, "grad_norm": 0.3014005578693101, "learning_rate": 1.8395928174001012e-05, "loss": 0.2068, "step": 11755 }, { "epoch": 2.007169070581207, "grad_norm": 0.30134249683190106, "learning_rate": 1.8380121396054627e-05, "loss": 0.2136, "step": 11760 }, { "epoch": 2.008022531364684, "grad_norm": 0.29347473848815103, "learning_rate": 1.8364314618108247e-05, "loss": 0.2127, "step": 11765 }, { "epoch": 2.0088759921481607, "grad_norm": 0.27600890403451983, "learning_rate": 1.8348507840161865e-05, "loss": 0.2003, "step": 11770 }, { "epoch": 2.0097294529316376, "grad_norm": 0.3050001587201491, "learning_rate": 1.833270106221548e-05, "loss": 0.2224, "step": 11775 }, { "epoch": 2.010582913715115, "grad_norm": 0.2818627715223362, "learning_rate": 1.8316894284269096e-05, "loss": 0.2156, "step": 11780 }, { "epoch": 2.011436374498592, "grad_norm": 0.3126707810761198, "learning_rate": 1.830108750632271e-05, "loss": 0.2075, "step": 11785 }, { "epoch": 2.012289835282069, "grad_norm": 0.3068145179170214, "learning_rate": 1.8285280728376328e-05, "loss": 0.2085, "step": 11790 }, { "epoch": 2.0131432960655458, "grad_norm": 0.2720665668873232, "learning_rate": 1.8269473950429945e-05, "loss": 0.231, "step": 11795 }, { "epoch": 2.0139967568490227, "grad_norm": 0.3367023708935221, "learning_rate": 1.8253667172483563e-05, "loss": 0.2066, "step": 11800 }, { "epoch": 2.0148502176324996, "grad_norm": 0.2657729632869617, "learning_rate": 1.823786039453718e-05, "loss": 0.2229, "step": 11805 }, { "epoch": 2.015703678415977, "grad_norm": 0.25174135575696377, "learning_rate": 1.8222053616590794e-05, "loss": 0.2206, "step": 11810 }, { "epoch": 2.016557139199454, "grad_norm": 0.28913272665026124, "learning_rate": 1.8206246838644412e-05, "loss": 0.2106, "step": 11815 }, { "epoch": 2.017410599982931, "grad_norm": 0.3499344232041033, "learning_rate": 1.8190440060698026e-05, "loss": 0.2045, "step": 11820 }, { "epoch": 2.0182640607664077, "grad_norm": 0.2767939161046466, "learning_rate": 1.8174633282751643e-05, "loss": 0.2035, "step": 11825 }, { "epoch": 2.0191175215498847, "grad_norm": 0.31427760533829296, "learning_rate": 1.8158826504805264e-05, "loss": 0.2071, "step": 11830 }, { "epoch": 2.0199709823333616, "grad_norm": 0.30118265686301887, "learning_rate": 1.8143019726858878e-05, "loss": 0.1992, "step": 11835 }, { "epoch": 2.020824443116839, "grad_norm": 0.3383136782605298, "learning_rate": 1.8127212948912496e-05, "loss": 0.216, "step": 11840 }, { "epoch": 2.021677903900316, "grad_norm": 0.30834006456493884, "learning_rate": 1.811140617096611e-05, "loss": 0.2289, "step": 11845 }, { "epoch": 2.022531364683793, "grad_norm": 0.30743090237239085, "learning_rate": 1.8095599393019727e-05, "loss": 0.2138, "step": 11850 }, { "epoch": 2.0233848254672697, "grad_norm": 0.3032119626916758, "learning_rate": 1.8079792615073345e-05, "loss": 0.2214, "step": 11855 }, { "epoch": 2.0242382862507466, "grad_norm": 0.26017439877730386, "learning_rate": 1.8063985837126962e-05, "loss": 0.2038, "step": 11860 }, { "epoch": 2.0250917470342236, "grad_norm": 0.3071425944629756, "learning_rate": 1.804817905918058e-05, "loss": 0.2034, "step": 11865 }, { "epoch": 2.025945207817701, "grad_norm": 0.2870077550104575, "learning_rate": 1.8032372281234194e-05, "loss": 0.2162, "step": 11870 }, { "epoch": 2.026798668601178, "grad_norm": 0.2992345155861402, "learning_rate": 1.801656550328781e-05, "loss": 0.2129, "step": 11875 }, { "epoch": 2.0276521293846548, "grad_norm": 0.35119786685032806, "learning_rate": 1.8000758725341425e-05, "loss": 0.2339, "step": 11880 }, { "epoch": 2.0285055901681317, "grad_norm": 0.2709582906509985, "learning_rate": 1.7984951947395043e-05, "loss": 0.2148, "step": 11885 }, { "epoch": 2.0293590509516086, "grad_norm": 0.2779533468023149, "learning_rate": 1.796914516944866e-05, "loss": 0.2022, "step": 11890 }, { "epoch": 2.030212511735086, "grad_norm": 0.2864189157722537, "learning_rate": 1.7953338391502278e-05, "loss": 0.2431, "step": 11895 }, { "epoch": 2.031065972518563, "grad_norm": 0.27031382241383317, "learning_rate": 1.7937531613555895e-05, "loss": 0.2211, "step": 11900 }, { "epoch": 2.03191943330204, "grad_norm": 0.3297766415885481, "learning_rate": 1.792172483560951e-05, "loss": 0.2266, "step": 11905 }, { "epoch": 2.0327728940855168, "grad_norm": 0.2511554582766255, "learning_rate": 1.7905918057663127e-05, "loss": 0.2144, "step": 11910 }, { "epoch": 2.0336263548689937, "grad_norm": 0.2926242242378917, "learning_rate": 1.7890111279716744e-05, "loss": 0.2229, "step": 11915 }, { "epoch": 2.0344798156524706, "grad_norm": 0.25066059119227413, "learning_rate": 1.7874304501770358e-05, "loss": 0.2026, "step": 11920 }, { "epoch": 2.035333276435948, "grad_norm": 0.26334915216360516, "learning_rate": 1.785849772382398e-05, "loss": 0.2071, "step": 11925 }, { "epoch": 2.036186737219425, "grad_norm": 0.27277813966534376, "learning_rate": 1.7842690945877593e-05, "loss": 0.2355, "step": 11930 }, { "epoch": 2.037040198002902, "grad_norm": 0.28201983183913554, "learning_rate": 1.782688416793121e-05, "loss": 0.2173, "step": 11935 }, { "epoch": 2.0378936587863787, "grad_norm": 0.30341226324782694, "learning_rate": 1.7811077389984825e-05, "loss": 0.2085, "step": 11940 }, { "epoch": 2.0387471195698557, "grad_norm": 0.285997929387636, "learning_rate": 1.7795270612038442e-05, "loss": 0.2126, "step": 11945 }, { "epoch": 2.0396005803533326, "grad_norm": 0.3320733346063396, "learning_rate": 1.777946383409206e-05, "loss": 0.2186, "step": 11950 }, { "epoch": 2.04045404113681, "grad_norm": 0.27560638653734326, "learning_rate": 1.7763657056145677e-05, "loss": 0.2201, "step": 11955 }, { "epoch": 2.041307501920287, "grad_norm": 0.27552840208409585, "learning_rate": 1.7747850278199294e-05, "loss": 0.1907, "step": 11960 }, { "epoch": 2.042160962703764, "grad_norm": 0.30090410863564987, "learning_rate": 1.773204350025291e-05, "loss": 0.2048, "step": 11965 }, { "epoch": 2.0430144234872407, "grad_norm": 0.27798655650913806, "learning_rate": 1.7716236722306526e-05, "loss": 0.2219, "step": 11970 }, { "epoch": 2.0438678842707176, "grad_norm": 0.2968941672553664, "learning_rate": 1.7700429944360143e-05, "loss": 0.2011, "step": 11975 }, { "epoch": 2.0447213450541946, "grad_norm": 0.2867474993521676, "learning_rate": 1.7684623166413757e-05, "loss": 0.2173, "step": 11980 }, { "epoch": 2.045574805837672, "grad_norm": 0.3260727597396212, "learning_rate": 1.7668816388467378e-05, "loss": 0.2032, "step": 11985 }, { "epoch": 2.046428266621149, "grad_norm": 0.2542636922161859, "learning_rate": 1.7653009610520992e-05, "loss": 0.2167, "step": 11990 }, { "epoch": 2.0472817274046258, "grad_norm": 0.2852674473719234, "learning_rate": 1.763720283257461e-05, "loss": 0.2238, "step": 11995 }, { "epoch": 2.0481351881881027, "grad_norm": 0.35051537976922437, "learning_rate": 1.7621396054628224e-05, "loss": 0.2307, "step": 12000 }, { "epoch": 2.0489886489715796, "grad_norm": 0.2922276251816413, "learning_rate": 1.760558927668184e-05, "loss": 0.2221, "step": 12005 }, { "epoch": 2.0498421097550565, "grad_norm": 0.29241994108491265, "learning_rate": 1.758978249873546e-05, "loss": 0.2223, "step": 12010 }, { "epoch": 2.050695570538534, "grad_norm": 0.3160253872752005, "learning_rate": 1.7573975720789076e-05, "loss": 0.2176, "step": 12015 }, { "epoch": 2.051549031322011, "grad_norm": 0.27259533768227, "learning_rate": 1.7558168942842694e-05, "loss": 0.2107, "step": 12020 }, { "epoch": 2.0524024921054878, "grad_norm": 0.36361144292276, "learning_rate": 1.7542362164896308e-05, "loss": 0.2041, "step": 12025 }, { "epoch": 2.0532559528889647, "grad_norm": 0.2871126983717125, "learning_rate": 1.7526555386949925e-05, "loss": 0.2152, "step": 12030 }, { "epoch": 2.0541094136724416, "grad_norm": 0.27982538994758815, "learning_rate": 1.7510748609003543e-05, "loss": 0.2059, "step": 12035 }, { "epoch": 2.054962874455919, "grad_norm": 0.4052374753238366, "learning_rate": 1.7494941831057157e-05, "loss": 0.1974, "step": 12040 }, { "epoch": 2.055816335239396, "grad_norm": 0.24058626990012216, "learning_rate": 1.7479135053110774e-05, "loss": 0.2168, "step": 12045 }, { "epoch": 2.056669796022873, "grad_norm": 0.36860052600399273, "learning_rate": 1.746332827516439e-05, "loss": 0.2149, "step": 12050 }, { "epoch": 2.0575232568063497, "grad_norm": 0.24540064025388436, "learning_rate": 1.744752149721801e-05, "loss": 0.2043, "step": 12055 }, { "epoch": 2.0583767175898267, "grad_norm": 0.30121333138572537, "learning_rate": 1.7431714719271623e-05, "loss": 0.2305, "step": 12060 }, { "epoch": 2.0592301783733036, "grad_norm": 0.2729102255626994, "learning_rate": 1.741590794132524e-05, "loss": 0.2223, "step": 12065 }, { "epoch": 2.060083639156781, "grad_norm": 0.3025099623527366, "learning_rate": 1.7400101163378858e-05, "loss": 0.1982, "step": 12070 }, { "epoch": 2.060937099940258, "grad_norm": 0.27018092350194783, "learning_rate": 1.7384294385432472e-05, "loss": 0.2171, "step": 12075 }, { "epoch": 2.061790560723735, "grad_norm": 0.2709533092747071, "learning_rate": 1.7368487607486093e-05, "loss": 0.2292, "step": 12080 }, { "epoch": 2.0626440215072117, "grad_norm": 0.2679078973291597, "learning_rate": 1.7352680829539707e-05, "loss": 0.2413, "step": 12085 }, { "epoch": 2.0634974822906886, "grad_norm": 0.27072809631665906, "learning_rate": 1.7336874051593325e-05, "loss": 0.2063, "step": 12090 }, { "epoch": 2.0643509430741656, "grad_norm": 0.337831564511127, "learning_rate": 1.7321067273646942e-05, "loss": 0.2194, "step": 12095 }, { "epoch": 2.065204403857643, "grad_norm": 0.3172426013107838, "learning_rate": 1.7305260495700556e-05, "loss": 0.1995, "step": 12100 }, { "epoch": 2.06605786464112, "grad_norm": 0.29035981803341415, "learning_rate": 1.7289453717754174e-05, "loss": 0.2119, "step": 12105 }, { "epoch": 2.0669113254245968, "grad_norm": 0.26863143500170056, "learning_rate": 1.727364693980779e-05, "loss": 0.2157, "step": 12110 }, { "epoch": 2.0677647862080737, "grad_norm": 0.29101649091046894, "learning_rate": 1.725784016186141e-05, "loss": 0.2169, "step": 12115 }, { "epoch": 2.0686182469915506, "grad_norm": 0.2579190496794579, "learning_rate": 1.7242033383915023e-05, "loss": 0.2037, "step": 12120 }, { "epoch": 2.0694717077750275, "grad_norm": 0.2860385758916714, "learning_rate": 1.722622660596864e-05, "loss": 0.2132, "step": 12125 }, { "epoch": 2.070325168558505, "grad_norm": 0.3079536528710597, "learning_rate": 1.7210419828022257e-05, "loss": 0.2105, "step": 12130 }, { "epoch": 2.071178629341982, "grad_norm": 0.30849883783592746, "learning_rate": 1.719461305007587e-05, "loss": 0.217, "step": 12135 }, { "epoch": 2.0720320901254587, "grad_norm": 0.28679020814264095, "learning_rate": 1.7178806272129492e-05, "loss": 0.203, "step": 12140 }, { "epoch": 2.0728855509089357, "grad_norm": 0.33497195997237755, "learning_rate": 1.7162999494183106e-05, "loss": 0.1807, "step": 12145 }, { "epoch": 2.0737390116924126, "grad_norm": 0.279827437365827, "learning_rate": 1.7147192716236724e-05, "loss": 0.2317, "step": 12150 }, { "epoch": 2.0745924724758895, "grad_norm": 0.26888229239643036, "learning_rate": 1.713138593829034e-05, "loss": 0.2172, "step": 12155 }, { "epoch": 2.075445933259367, "grad_norm": 0.2552291456464643, "learning_rate": 1.7115579160343955e-05, "loss": 0.2151, "step": 12160 }, { "epoch": 2.076299394042844, "grad_norm": 0.3227276405707484, "learning_rate": 1.7099772382397573e-05, "loss": 0.2083, "step": 12165 }, { "epoch": 2.0771528548263207, "grad_norm": 0.2933397100183842, "learning_rate": 1.708396560445119e-05, "loss": 0.2091, "step": 12170 }, { "epoch": 2.0780063156097976, "grad_norm": 0.28414275324082316, "learning_rate": 1.7068158826504808e-05, "loss": 0.2223, "step": 12175 }, { "epoch": 2.0788597763932746, "grad_norm": 0.3211441640303238, "learning_rate": 1.7052352048558422e-05, "loss": 0.1959, "step": 12180 }, { "epoch": 2.079713237176752, "grad_norm": 0.30031665390260387, "learning_rate": 1.703654527061204e-05, "loss": 0.2183, "step": 12185 }, { "epoch": 2.080566697960229, "grad_norm": 0.2813617896788799, "learning_rate": 1.7020738492665657e-05, "loss": 0.2241, "step": 12190 }, { "epoch": 2.081420158743706, "grad_norm": 0.2661806791326225, "learning_rate": 1.700493171471927e-05, "loss": 0.2095, "step": 12195 }, { "epoch": 2.0822736195271827, "grad_norm": 0.2968202158634795, "learning_rate": 1.6989124936772888e-05, "loss": 0.2144, "step": 12200 }, { "epoch": 2.0831270803106596, "grad_norm": 0.3072401892623041, "learning_rate": 1.6973318158826506e-05, "loss": 0.2063, "step": 12205 }, { "epoch": 2.0839805410941366, "grad_norm": 0.3159126756806355, "learning_rate": 1.6957511380880123e-05, "loss": 0.211, "step": 12210 }, { "epoch": 2.084834001877614, "grad_norm": 0.2925527378808134, "learning_rate": 1.694170460293374e-05, "loss": 0.2088, "step": 12215 }, { "epoch": 2.085687462661091, "grad_norm": 0.30701320918624014, "learning_rate": 1.6925897824987355e-05, "loss": 0.2149, "step": 12220 }, { "epoch": 2.0865409234445678, "grad_norm": 0.2928607932807345, "learning_rate": 1.6910091047040972e-05, "loss": 0.2138, "step": 12225 }, { "epoch": 2.0873943842280447, "grad_norm": 0.2879282535373842, "learning_rate": 1.6894284269094586e-05, "loss": 0.2067, "step": 12230 }, { "epoch": 2.0882478450115216, "grad_norm": 0.31694985696567496, "learning_rate": 1.6878477491148207e-05, "loss": 0.2092, "step": 12235 }, { "epoch": 2.0891013057949985, "grad_norm": 0.3183585012995344, "learning_rate": 1.686267071320182e-05, "loss": 0.2064, "step": 12240 }, { "epoch": 2.089954766578476, "grad_norm": 0.31395683765900273, "learning_rate": 1.684686393525544e-05, "loss": 0.216, "step": 12245 }, { "epoch": 2.090808227361953, "grad_norm": 0.2565250115057307, "learning_rate": 1.6831057157309056e-05, "loss": 0.202, "step": 12250 }, { "epoch": 2.0916616881454297, "grad_norm": 0.3155038672315227, "learning_rate": 1.681525037936267e-05, "loss": 0.2209, "step": 12255 }, { "epoch": 2.0925151489289067, "grad_norm": 0.2909953211054769, "learning_rate": 1.6799443601416288e-05, "loss": 0.2006, "step": 12260 }, { "epoch": 2.0933686097123836, "grad_norm": 0.2964796633389792, "learning_rate": 1.6783636823469905e-05, "loss": 0.2207, "step": 12265 }, { "epoch": 2.0942220704958605, "grad_norm": 0.3108563599801358, "learning_rate": 1.6767830045523523e-05, "loss": 0.2161, "step": 12270 }, { "epoch": 2.095075531279338, "grad_norm": 0.2972007865361862, "learning_rate": 1.6752023267577137e-05, "loss": 0.2069, "step": 12275 }, { "epoch": 2.095928992062815, "grad_norm": 0.2788990102631724, "learning_rate": 1.6736216489630754e-05, "loss": 0.2242, "step": 12280 }, { "epoch": 2.0967824528462917, "grad_norm": 0.29647250340698766, "learning_rate": 1.672040971168437e-05, "loss": 0.2426, "step": 12285 }, { "epoch": 2.0976359136297686, "grad_norm": 0.25752599432832324, "learning_rate": 1.6704602933737986e-05, "loss": 0.2049, "step": 12290 }, { "epoch": 2.0984893744132456, "grad_norm": 0.292410619448879, "learning_rate": 1.6688796155791606e-05, "loss": 0.209, "step": 12295 }, { "epoch": 2.0993428351967225, "grad_norm": 0.29950484783145276, "learning_rate": 1.667298937784522e-05, "loss": 0.225, "step": 12300 }, { "epoch": 2.1001962959802, "grad_norm": 0.31841198458058206, "learning_rate": 1.6657182599898838e-05, "loss": 0.22, "step": 12305 }, { "epoch": 2.101049756763677, "grad_norm": 0.3184621712472175, "learning_rate": 1.6641375821952455e-05, "loss": 0.2001, "step": 12310 }, { "epoch": 2.1019032175471537, "grad_norm": 0.31802625861441436, "learning_rate": 1.662556904400607e-05, "loss": 0.2131, "step": 12315 }, { "epoch": 2.1027566783306306, "grad_norm": 0.28130677127138953, "learning_rate": 1.6609762266059687e-05, "loss": 0.2235, "step": 12320 }, { "epoch": 2.1036101391141075, "grad_norm": 0.3303752877316371, "learning_rate": 1.65939554881133e-05, "loss": 0.2044, "step": 12325 }, { "epoch": 2.104463599897585, "grad_norm": 0.29554538346025616, "learning_rate": 1.6578148710166922e-05, "loss": 0.2162, "step": 12330 }, { "epoch": 2.105317060681062, "grad_norm": 0.2807408091334661, "learning_rate": 1.6562341932220536e-05, "loss": 0.2144, "step": 12335 }, { "epoch": 2.1061705214645388, "grad_norm": 0.3052411071547068, "learning_rate": 1.6546535154274153e-05, "loss": 0.2041, "step": 12340 }, { "epoch": 2.1070239822480157, "grad_norm": 0.32710298303178703, "learning_rate": 1.653072837632777e-05, "loss": 0.2115, "step": 12345 }, { "epoch": 2.1078774430314926, "grad_norm": 0.29998511506686293, "learning_rate": 1.6514921598381385e-05, "loss": 0.1934, "step": 12350 }, { "epoch": 2.1087309038149695, "grad_norm": 0.2888502437587194, "learning_rate": 1.6499114820435002e-05, "loss": 0.2257, "step": 12355 }, { "epoch": 2.109584364598447, "grad_norm": 0.3102092421073532, "learning_rate": 1.648330804248862e-05, "loss": 0.2121, "step": 12360 }, { "epoch": 2.110437825381924, "grad_norm": 0.2932706086458866, "learning_rate": 1.6467501264542237e-05, "loss": 0.2141, "step": 12365 }, { "epoch": 2.1112912861654007, "grad_norm": 0.32217094629809195, "learning_rate": 1.6451694486595855e-05, "loss": 0.2285, "step": 12370 }, { "epoch": 2.1121447469488777, "grad_norm": 0.2824842998489969, "learning_rate": 1.643588770864947e-05, "loss": 0.2128, "step": 12375 }, { "epoch": 2.1129982077323546, "grad_norm": 0.32684091916031843, "learning_rate": 1.6420080930703086e-05, "loss": 0.239, "step": 12380 }, { "epoch": 2.1138516685158315, "grad_norm": 0.2913826190688312, "learning_rate": 1.64042741527567e-05, "loss": 0.2047, "step": 12385 }, { "epoch": 2.114705129299309, "grad_norm": 0.30268603696125884, "learning_rate": 1.638846737481032e-05, "loss": 0.2139, "step": 12390 }, { "epoch": 2.115558590082786, "grad_norm": 0.309347541049331, "learning_rate": 1.6372660596863935e-05, "loss": 0.2173, "step": 12395 }, { "epoch": 2.1164120508662627, "grad_norm": 0.2926681783357437, "learning_rate": 1.6356853818917553e-05, "loss": 0.2151, "step": 12400 }, { "epoch": 2.1172655116497396, "grad_norm": 0.3200752106257981, "learning_rate": 1.634104704097117e-05, "loss": 0.2202, "step": 12405 }, { "epoch": 2.1181189724332166, "grad_norm": 0.3265229603823691, "learning_rate": 1.6325240263024784e-05, "loss": 0.214, "step": 12410 }, { "epoch": 2.1189724332166935, "grad_norm": 0.2657987274424382, "learning_rate": 1.63094334850784e-05, "loss": 0.2331, "step": 12415 }, { "epoch": 2.119825894000171, "grad_norm": 0.33715526645279364, "learning_rate": 1.629362670713202e-05, "loss": 0.2251, "step": 12420 }, { "epoch": 2.1206793547836478, "grad_norm": 0.3609425830966628, "learning_rate": 1.6277819929185637e-05, "loss": 0.2191, "step": 12425 }, { "epoch": 2.1215328155671247, "grad_norm": 0.3148298661340768, "learning_rate": 1.6262013151239254e-05, "loss": 0.2008, "step": 12430 }, { "epoch": 2.1223862763506016, "grad_norm": 0.30978982542343464, "learning_rate": 1.6246206373292868e-05, "loss": 0.223, "step": 12435 }, { "epoch": 2.1232397371340785, "grad_norm": 0.2941536970930722, "learning_rate": 1.6230399595346486e-05, "loss": 0.2203, "step": 12440 }, { "epoch": 2.1240931979175555, "grad_norm": 0.2775762001322, "learning_rate": 1.62145928174001e-05, "loss": 0.1963, "step": 12445 }, { "epoch": 2.124946658701033, "grad_norm": 0.32899440492489546, "learning_rate": 1.6198786039453717e-05, "loss": 0.2125, "step": 12450 }, { "epoch": 2.1258001194845098, "grad_norm": 0.27538474837957805, "learning_rate": 1.6182979261507335e-05, "loss": 0.1856, "step": 12455 }, { "epoch": 2.1266535802679867, "grad_norm": 0.2776344540629496, "learning_rate": 1.6167172483560952e-05, "loss": 0.1912, "step": 12460 }, { "epoch": 2.1275070410514636, "grad_norm": 0.2901033107597535, "learning_rate": 1.615136570561457e-05, "loss": 0.2122, "step": 12465 }, { "epoch": 2.1283605018349405, "grad_norm": 0.30945901815799026, "learning_rate": 1.6135558927668184e-05, "loss": 0.207, "step": 12470 }, { "epoch": 2.129213962618418, "grad_norm": 0.2879411379212899, "learning_rate": 1.61197521497218e-05, "loss": 0.227, "step": 12475 }, { "epoch": 2.130067423401895, "grad_norm": 0.28698162800773236, "learning_rate": 1.610394537177542e-05, "loss": 0.2185, "step": 12480 }, { "epoch": 2.1309208841853717, "grad_norm": 0.2644373413029429, "learning_rate": 1.6088138593829036e-05, "loss": 0.2078, "step": 12485 }, { "epoch": 2.1317743449688487, "grad_norm": 0.2830350361759574, "learning_rate": 1.6072331815882653e-05, "loss": 0.2075, "step": 12490 }, { "epoch": 2.1326278057523256, "grad_norm": 0.2709140970399191, "learning_rate": 1.6056525037936267e-05, "loss": 0.2111, "step": 12495 }, { "epoch": 2.1334812665358025, "grad_norm": 0.3074572275299605, "learning_rate": 1.6040718259989885e-05, "loss": 0.1939, "step": 12500 }, { "epoch": 2.13433472731928, "grad_norm": 0.2912856381751007, "learning_rate": 1.60249114820435e-05, "loss": 0.2041, "step": 12505 }, { "epoch": 2.135188188102757, "grad_norm": 0.27373279547250157, "learning_rate": 1.6009104704097116e-05, "loss": 0.2155, "step": 12510 }, { "epoch": 2.1360416488862337, "grad_norm": 0.28545470071370577, "learning_rate": 1.5993297926150734e-05, "loss": 0.2171, "step": 12515 }, { "epoch": 2.1368951096697106, "grad_norm": 0.3787517541994233, "learning_rate": 1.597749114820435e-05, "loss": 0.2103, "step": 12520 }, { "epoch": 2.1377485704531876, "grad_norm": 0.290085926303246, "learning_rate": 1.596168437025797e-05, "loss": 0.208, "step": 12525 }, { "epoch": 2.1386020312366645, "grad_norm": 0.2795570997861008, "learning_rate": 1.5945877592311583e-05, "loss": 0.2082, "step": 12530 }, { "epoch": 2.139455492020142, "grad_norm": 0.5675274377073386, "learning_rate": 1.59300708143652e-05, "loss": 0.2242, "step": 12535 }, { "epoch": 2.1403089528036188, "grad_norm": 0.2747757301302407, "learning_rate": 1.5914264036418818e-05, "loss": 0.2307, "step": 12540 }, { "epoch": 2.1411624135870957, "grad_norm": 0.3695430694627631, "learning_rate": 1.5898457258472435e-05, "loss": 0.2047, "step": 12545 }, { "epoch": 2.1420158743705726, "grad_norm": 0.3098279393374623, "learning_rate": 1.5882650480526053e-05, "loss": 0.2082, "step": 12550 }, { "epoch": 2.1428693351540495, "grad_norm": 0.28423731944372604, "learning_rate": 1.5866843702579667e-05, "loss": 0.2308, "step": 12555 }, { "epoch": 2.143722795937527, "grad_norm": 0.3014309218153793, "learning_rate": 1.5851036924633284e-05, "loss": 0.2194, "step": 12560 }, { "epoch": 2.144576256721004, "grad_norm": 0.337223404466219, "learning_rate": 1.5835230146686898e-05, "loss": 0.2061, "step": 12565 }, { "epoch": 2.1454297175044807, "grad_norm": 0.31936849267805445, "learning_rate": 1.5819423368740516e-05, "loss": 0.2149, "step": 12570 }, { "epoch": 2.1462831782879577, "grad_norm": 0.3213830527801218, "learning_rate": 1.5803616590794133e-05, "loss": 0.2079, "step": 12575 }, { "epoch": 2.1471366390714346, "grad_norm": 0.3206891980567062, "learning_rate": 1.578780981284775e-05, "loss": 0.2152, "step": 12580 }, { "epoch": 2.1479900998549115, "grad_norm": 0.2870206919444092, "learning_rate": 1.5772003034901368e-05, "loss": 0.2243, "step": 12585 }, { "epoch": 2.1488435606383884, "grad_norm": 0.28994457365763526, "learning_rate": 1.5756196256954982e-05, "loss": 0.2068, "step": 12590 }, { "epoch": 2.149697021421866, "grad_norm": 0.3010059939198214, "learning_rate": 1.57403894790086e-05, "loss": 0.1851, "step": 12595 }, { "epoch": 2.1505504822053427, "grad_norm": 0.3324970507180259, "learning_rate": 1.5724582701062217e-05, "loss": 0.2288, "step": 12600 }, { "epoch": 2.1514039429888197, "grad_norm": 0.27416808131908127, "learning_rate": 1.570877592311583e-05, "loss": 0.2253, "step": 12605 }, { "epoch": 2.1522574037722966, "grad_norm": 0.27012677304400495, "learning_rate": 1.5692969145169452e-05, "loss": 0.2178, "step": 12610 }, { "epoch": 2.1531108645557735, "grad_norm": 0.25665407803435125, "learning_rate": 1.5677162367223066e-05, "loss": 0.2145, "step": 12615 }, { "epoch": 2.153964325339251, "grad_norm": 0.29017877129445374, "learning_rate": 1.5661355589276684e-05, "loss": 0.2018, "step": 12620 }, { "epoch": 2.154817786122728, "grad_norm": 0.27384678351332004, "learning_rate": 1.5645548811330298e-05, "loss": 0.225, "step": 12625 }, { "epoch": 2.1556712469062047, "grad_norm": 0.28266934614132405, "learning_rate": 1.5629742033383915e-05, "loss": 0.2311, "step": 12630 }, { "epoch": 2.1565247076896816, "grad_norm": 0.3217399550202195, "learning_rate": 1.5613935255437533e-05, "loss": 0.2035, "step": 12635 }, { "epoch": 2.1573781684731586, "grad_norm": 0.30242500815788065, "learning_rate": 1.559812847749115e-05, "loss": 0.195, "step": 12640 }, { "epoch": 2.1582316292566355, "grad_norm": 0.40690602793391534, "learning_rate": 1.5582321699544767e-05, "loss": 0.2079, "step": 12645 }, { "epoch": 2.159085090040113, "grad_norm": 0.25194126849346016, "learning_rate": 1.556651492159838e-05, "loss": 0.215, "step": 12650 }, { "epoch": 2.1599385508235898, "grad_norm": 0.2986372505758614, "learning_rate": 1.5550708143652e-05, "loss": 0.2261, "step": 12655 }, { "epoch": 2.1607920116070667, "grad_norm": 0.2823751848247609, "learning_rate": 1.5534901365705613e-05, "loss": 0.2223, "step": 12660 }, { "epoch": 2.1616454723905436, "grad_norm": 0.28271364615819666, "learning_rate": 1.551909458775923e-05, "loss": 0.2183, "step": 12665 }, { "epoch": 2.1624989331740205, "grad_norm": 0.3334634759663433, "learning_rate": 1.550328780981285e-05, "loss": 0.1935, "step": 12670 }, { "epoch": 2.1633523939574975, "grad_norm": 0.28979679478902637, "learning_rate": 1.5487481031866465e-05, "loss": 0.2244, "step": 12675 }, { "epoch": 2.164205854740975, "grad_norm": 0.30705779126868926, "learning_rate": 1.5471674253920083e-05, "loss": 0.2048, "step": 12680 }, { "epoch": 2.1650593155244517, "grad_norm": 0.27981166880869685, "learning_rate": 1.5455867475973697e-05, "loss": 0.2033, "step": 12685 }, { "epoch": 2.1659127763079287, "grad_norm": 0.297844996342522, "learning_rate": 1.5440060698027314e-05, "loss": 0.2074, "step": 12690 }, { "epoch": 2.1667662370914056, "grad_norm": 0.2782890692089488, "learning_rate": 1.5424253920080932e-05, "loss": 0.207, "step": 12695 }, { "epoch": 2.1676196978748825, "grad_norm": 0.3104785941640944, "learning_rate": 1.5408447142134546e-05, "loss": 0.2185, "step": 12700 }, { "epoch": 2.16847315865836, "grad_norm": 0.29860012243886797, "learning_rate": 1.5392640364188167e-05, "loss": 0.2064, "step": 12705 }, { "epoch": 2.169326619441837, "grad_norm": 0.3300891635259585, "learning_rate": 1.537683358624178e-05, "loss": 0.2195, "step": 12710 }, { "epoch": 2.1701800802253137, "grad_norm": 0.2900767022812366, "learning_rate": 1.53610268082954e-05, "loss": 0.2436, "step": 12715 }, { "epoch": 2.1710335410087906, "grad_norm": 0.2702348412812184, "learning_rate": 1.5345220030349012e-05, "loss": 0.2039, "step": 12720 }, { "epoch": 2.1718870017922676, "grad_norm": 0.2888549045412832, "learning_rate": 1.532941325240263e-05, "loss": 0.2232, "step": 12725 }, { "epoch": 2.1727404625757445, "grad_norm": 0.32099795555979455, "learning_rate": 1.5313606474456247e-05, "loss": 0.22, "step": 12730 }, { "epoch": 2.1735939233592214, "grad_norm": 0.27758961429433837, "learning_rate": 1.5297799696509865e-05, "loss": 0.232, "step": 12735 }, { "epoch": 2.174447384142699, "grad_norm": 0.30478657736359294, "learning_rate": 1.5281992918563482e-05, "loss": 0.2089, "step": 12740 }, { "epoch": 2.1753008449261757, "grad_norm": 0.2628285040900879, "learning_rate": 1.5266186140617096e-05, "loss": 0.2027, "step": 12745 }, { "epoch": 2.1761543057096526, "grad_norm": 0.3008415707097667, "learning_rate": 1.5250379362670714e-05, "loss": 0.2124, "step": 12750 }, { "epoch": 2.1770077664931295, "grad_norm": 0.310924765865378, "learning_rate": 1.523457258472433e-05, "loss": 0.2176, "step": 12755 }, { "epoch": 2.1778612272766065, "grad_norm": 0.31507905661344676, "learning_rate": 1.5218765806777945e-05, "loss": 0.2224, "step": 12760 }, { "epoch": 2.178714688060084, "grad_norm": 0.2746106105854747, "learning_rate": 1.5202959028831564e-05, "loss": 0.2068, "step": 12765 }, { "epoch": 2.1795681488435608, "grad_norm": 0.30896615462492844, "learning_rate": 1.518715225088518e-05, "loss": 0.2293, "step": 12770 }, { "epoch": 2.1804216096270377, "grad_norm": 0.29791640431176575, "learning_rate": 1.5171345472938798e-05, "loss": 0.2377, "step": 12775 }, { "epoch": 2.1812750704105146, "grad_norm": 0.2868026253251758, "learning_rate": 1.5155538694992413e-05, "loss": 0.202, "step": 12780 }, { "epoch": 2.1821285311939915, "grad_norm": 0.28335550967127043, "learning_rate": 1.513973191704603e-05, "loss": 0.2167, "step": 12785 }, { "epoch": 2.1829819919774685, "grad_norm": 0.31032208218267837, "learning_rate": 1.5123925139099645e-05, "loss": 0.2099, "step": 12790 }, { "epoch": 2.183835452760946, "grad_norm": 0.2717914592747321, "learning_rate": 1.5108118361153264e-05, "loss": 0.2027, "step": 12795 }, { "epoch": 2.1846889135444227, "grad_norm": 0.28889934526912564, "learning_rate": 1.509231158320688e-05, "loss": 0.1992, "step": 12800 }, { "epoch": 2.1855423743278997, "grad_norm": 0.28641064366460933, "learning_rate": 1.5076504805260497e-05, "loss": 0.2176, "step": 12805 }, { "epoch": 2.1863958351113766, "grad_norm": 0.3065648897869217, "learning_rate": 1.5060698027314113e-05, "loss": 0.1971, "step": 12810 }, { "epoch": 2.1872492958948535, "grad_norm": 0.31088472862870736, "learning_rate": 1.5044891249367729e-05, "loss": 0.2046, "step": 12815 }, { "epoch": 2.1881027566783304, "grad_norm": 0.29380881443185347, "learning_rate": 1.5029084471421345e-05, "loss": 0.2092, "step": 12820 }, { "epoch": 2.188956217461808, "grad_norm": 0.2991291299827504, "learning_rate": 1.5013277693474964e-05, "loss": 0.2171, "step": 12825 }, { "epoch": 2.1898096782452847, "grad_norm": 0.3091365259453851, "learning_rate": 1.499747091552858e-05, "loss": 0.2054, "step": 12830 }, { "epoch": 2.1906631390287616, "grad_norm": 0.32908102938801304, "learning_rate": 1.4981664137582197e-05, "loss": 0.2196, "step": 12835 }, { "epoch": 2.1915165998122386, "grad_norm": 0.3254743433683044, "learning_rate": 1.4965857359635813e-05, "loss": 0.2046, "step": 12840 }, { "epoch": 2.1923700605957155, "grad_norm": 0.3456424931944419, "learning_rate": 1.4950050581689428e-05, "loss": 0.2161, "step": 12845 }, { "epoch": 2.193223521379193, "grad_norm": 0.27001744733002625, "learning_rate": 1.4934243803743044e-05, "loss": 0.2095, "step": 12850 }, { "epoch": 2.1940769821626698, "grad_norm": 0.2997756235604696, "learning_rate": 1.4918437025796662e-05, "loss": 0.2005, "step": 12855 }, { "epoch": 2.1949304429461467, "grad_norm": 0.28363143443818234, "learning_rate": 1.490263024785028e-05, "loss": 0.2243, "step": 12860 }, { "epoch": 2.1957839037296236, "grad_norm": 0.2896952560687488, "learning_rate": 1.4886823469903897e-05, "loss": 0.2137, "step": 12865 }, { "epoch": 2.1966373645131005, "grad_norm": 0.29362882103164045, "learning_rate": 1.4871016691957512e-05, "loss": 0.2074, "step": 12870 }, { "epoch": 2.1974908252965775, "grad_norm": 0.2746464252316413, "learning_rate": 1.4855209914011128e-05, "loss": 0.2285, "step": 12875 }, { "epoch": 2.1983442860800544, "grad_norm": 0.2824957956595061, "learning_rate": 1.4839403136064744e-05, "loss": 0.2257, "step": 12880 }, { "epoch": 2.1991977468635318, "grad_norm": 0.2768049528701583, "learning_rate": 1.4823596358118361e-05, "loss": 0.1941, "step": 12885 }, { "epoch": 2.2000512076470087, "grad_norm": 0.2872971985249956, "learning_rate": 1.4807789580171979e-05, "loss": 0.1951, "step": 12890 }, { "epoch": 2.2009046684304856, "grad_norm": 0.3234826621040122, "learning_rate": 1.4791982802225596e-05, "loss": 0.2068, "step": 12895 }, { "epoch": 2.2017581292139625, "grad_norm": 0.2901447960595998, "learning_rate": 1.4776176024279212e-05, "loss": 0.2069, "step": 12900 }, { "epoch": 2.2026115899974394, "grad_norm": 0.30969364933692906, "learning_rate": 1.4760369246332828e-05, "loss": 0.2253, "step": 12905 }, { "epoch": 2.203465050780917, "grad_norm": 0.28714024193012144, "learning_rate": 1.4744562468386444e-05, "loss": 0.2334, "step": 12910 }, { "epoch": 2.2043185115643937, "grad_norm": 0.2978819038730986, "learning_rate": 1.4728755690440061e-05, "loss": 0.2238, "step": 12915 }, { "epoch": 2.2051719723478707, "grad_norm": 0.2730616538251703, "learning_rate": 1.4712948912493679e-05, "loss": 0.1954, "step": 12920 }, { "epoch": 2.2060254331313476, "grad_norm": 0.33415812642584986, "learning_rate": 1.4697142134547296e-05, "loss": 0.2182, "step": 12925 }, { "epoch": 2.2068788939148245, "grad_norm": 0.2959731203635697, "learning_rate": 1.4681335356600912e-05, "loss": 0.2107, "step": 12930 }, { "epoch": 2.2077323546983014, "grad_norm": 0.2864442812548418, "learning_rate": 1.4665528578654527e-05, "loss": 0.2209, "step": 12935 }, { "epoch": 2.208585815481779, "grad_norm": 0.306759271698419, "learning_rate": 1.4649721800708143e-05, "loss": 0.2162, "step": 12940 }, { "epoch": 2.2094392762652557, "grad_norm": 0.2808864019365652, "learning_rate": 1.463391502276176e-05, "loss": 0.2135, "step": 12945 }, { "epoch": 2.2102927370487326, "grad_norm": 0.3213092272997695, "learning_rate": 1.4618108244815378e-05, "loss": 0.2175, "step": 12950 }, { "epoch": 2.2111461978322096, "grad_norm": 0.28769442648127663, "learning_rate": 1.4602301466868996e-05, "loss": 0.202, "step": 12955 }, { "epoch": 2.2119996586156865, "grad_norm": 0.27924783268622555, "learning_rate": 1.4586494688922611e-05, "loss": 0.2052, "step": 12960 }, { "epoch": 2.2128531193991634, "grad_norm": 0.2933640649963345, "learning_rate": 1.4570687910976227e-05, "loss": 0.2173, "step": 12965 }, { "epoch": 2.2137065801826408, "grad_norm": 0.24785644032772974, "learning_rate": 1.4554881133029843e-05, "loss": 0.2427, "step": 12970 }, { "epoch": 2.2145600409661177, "grad_norm": 0.2729219346965846, "learning_rate": 1.453907435508346e-05, "loss": 0.221, "step": 12975 }, { "epoch": 2.2154135017495946, "grad_norm": 0.3392235404991562, "learning_rate": 1.4523267577137076e-05, "loss": 0.2091, "step": 12980 }, { "epoch": 2.2162669625330715, "grad_norm": 0.30592519496676684, "learning_rate": 1.4507460799190695e-05, "loss": 0.198, "step": 12985 }, { "epoch": 2.2171204233165485, "grad_norm": 0.2973887366799171, "learning_rate": 1.4491654021244311e-05, "loss": 0.2088, "step": 12990 }, { "epoch": 2.217973884100026, "grad_norm": 0.26680727265975357, "learning_rate": 1.4475847243297927e-05, "loss": 0.2077, "step": 12995 }, { "epoch": 2.2188273448835028, "grad_norm": 0.25179803741628193, "learning_rate": 1.4460040465351543e-05, "loss": 0.2114, "step": 13000 }, { "epoch": 2.2196808056669797, "grad_norm": 0.2787139354221501, "learning_rate": 1.444423368740516e-05, "loss": 0.2172, "step": 13005 }, { "epoch": 2.2205342664504566, "grad_norm": 0.437334760487075, "learning_rate": 1.4428426909458776e-05, "loss": 0.2085, "step": 13010 }, { "epoch": 2.2213877272339335, "grad_norm": 0.2753531538743388, "learning_rate": 1.4412620131512395e-05, "loss": 0.2205, "step": 13015 }, { "epoch": 2.2222411880174104, "grad_norm": 0.3113266254102051, "learning_rate": 1.439681335356601e-05, "loss": 0.2237, "step": 13020 }, { "epoch": 2.223094648800888, "grad_norm": 0.2824786957708549, "learning_rate": 1.4381006575619626e-05, "loss": 0.2292, "step": 13025 }, { "epoch": 2.2239481095843647, "grad_norm": 0.3084749001277605, "learning_rate": 1.4365199797673242e-05, "loss": 0.1994, "step": 13030 }, { "epoch": 2.2248015703678417, "grad_norm": 0.3027759753279354, "learning_rate": 1.434939301972686e-05, "loss": 0.205, "step": 13035 }, { "epoch": 2.2256550311513186, "grad_norm": 0.2693135415359196, "learning_rate": 1.4333586241780475e-05, "loss": 0.2264, "step": 13040 }, { "epoch": 2.2265084919347955, "grad_norm": 0.3111534435164997, "learning_rate": 1.4317779463834095e-05, "loss": 0.2121, "step": 13045 }, { "epoch": 2.2273619527182724, "grad_norm": 0.2835848757967294, "learning_rate": 1.430197268588771e-05, "loss": 0.2369, "step": 13050 }, { "epoch": 2.22821541350175, "grad_norm": 0.27945467900075543, "learning_rate": 1.4286165907941326e-05, "loss": 0.2264, "step": 13055 }, { "epoch": 2.2290688742852267, "grad_norm": 0.28770468029552193, "learning_rate": 1.4270359129994942e-05, "loss": 0.212, "step": 13060 }, { "epoch": 2.2299223350687036, "grad_norm": 0.3209111269637494, "learning_rate": 1.425455235204856e-05, "loss": 0.2127, "step": 13065 }, { "epoch": 2.2307757958521806, "grad_norm": 0.32357394550244145, "learning_rate": 1.4238745574102175e-05, "loss": 0.208, "step": 13070 }, { "epoch": 2.2316292566356575, "grad_norm": 0.32784145223190786, "learning_rate": 1.4222938796155794e-05, "loss": 0.2025, "step": 13075 }, { "epoch": 2.2324827174191344, "grad_norm": 0.3640809597837081, "learning_rate": 1.420713201820941e-05, "loss": 0.2127, "step": 13080 }, { "epoch": 2.2333361782026118, "grad_norm": 0.2730133228305057, "learning_rate": 1.4191325240263026e-05, "loss": 0.2024, "step": 13085 }, { "epoch": 2.2341896389860887, "grad_norm": 0.35120237282678196, "learning_rate": 1.4175518462316642e-05, "loss": 0.1978, "step": 13090 }, { "epoch": 2.2350430997695656, "grad_norm": 0.28594933271374384, "learning_rate": 1.4159711684370259e-05, "loss": 0.1917, "step": 13095 }, { "epoch": 2.2358965605530425, "grad_norm": 0.2975994721802316, "learning_rate": 1.4143904906423875e-05, "loss": 0.2213, "step": 13100 }, { "epoch": 2.2367500213365195, "grad_norm": 0.28926319375102255, "learning_rate": 1.412809812847749e-05, "loss": 0.2084, "step": 13105 }, { "epoch": 2.2376034821199964, "grad_norm": 0.3561447525893215, "learning_rate": 1.411229135053111e-05, "loss": 0.2123, "step": 13110 }, { "epoch": 2.2384569429034737, "grad_norm": 0.3070860803028503, "learning_rate": 1.4096484572584725e-05, "loss": 0.217, "step": 13115 }, { "epoch": 2.2393104036869507, "grad_norm": 0.2961305878275838, "learning_rate": 1.4080677794638341e-05, "loss": 0.2235, "step": 13120 }, { "epoch": 2.2401638644704276, "grad_norm": 0.28387226136575217, "learning_rate": 1.4064871016691957e-05, "loss": 0.2155, "step": 13125 }, { "epoch": 2.2410173252539045, "grad_norm": 0.28631267466502064, "learning_rate": 1.4049064238745574e-05, "loss": 0.2237, "step": 13130 }, { "epoch": 2.2418707860373814, "grad_norm": 0.30784455369950436, "learning_rate": 1.403325746079919e-05, "loss": 0.2272, "step": 13135 }, { "epoch": 2.242724246820859, "grad_norm": 0.32001437345987205, "learning_rate": 1.401745068285281e-05, "loss": 0.2267, "step": 13140 }, { "epoch": 2.2435777076043357, "grad_norm": 0.2953462675770879, "learning_rate": 1.4001643904906425e-05, "loss": 0.2129, "step": 13145 }, { "epoch": 2.2444311683878126, "grad_norm": 0.3059786106836141, "learning_rate": 1.3985837126960041e-05, "loss": 0.2153, "step": 13150 }, { "epoch": 2.2452846291712896, "grad_norm": 0.26702024745934244, "learning_rate": 1.3970030349013657e-05, "loss": 0.2258, "step": 13155 }, { "epoch": 2.2461380899547665, "grad_norm": 0.26931036694405047, "learning_rate": 1.3954223571067274e-05, "loss": 0.2064, "step": 13160 }, { "epoch": 2.2469915507382434, "grad_norm": 0.33524259220819386, "learning_rate": 1.393841679312089e-05, "loss": 0.2101, "step": 13165 }, { "epoch": 2.247845011521721, "grad_norm": 0.2969854070688166, "learning_rate": 1.3922610015174509e-05, "loss": 0.216, "step": 13170 }, { "epoch": 2.2486984723051977, "grad_norm": 0.3208618242001608, "learning_rate": 1.3906803237228125e-05, "loss": 0.2222, "step": 13175 }, { "epoch": 2.2495519330886746, "grad_norm": 0.34229782515896395, "learning_rate": 1.389099645928174e-05, "loss": 0.2244, "step": 13180 }, { "epoch": 2.2504053938721515, "grad_norm": 0.2976269117962387, "learning_rate": 1.3875189681335356e-05, "loss": 0.22, "step": 13185 }, { "epoch": 2.2512588546556285, "grad_norm": 0.30156999320985717, "learning_rate": 1.3859382903388974e-05, "loss": 0.2262, "step": 13190 }, { "epoch": 2.2521123154391054, "grad_norm": 0.26841705083205175, "learning_rate": 1.384357612544259e-05, "loss": 0.2146, "step": 13195 }, { "epoch": 2.2529657762225828, "grad_norm": 0.27245188608272247, "learning_rate": 1.3827769347496209e-05, "loss": 0.2216, "step": 13200 }, { "epoch": 2.2538192370060597, "grad_norm": 0.2633655066190417, "learning_rate": 1.3811962569549824e-05, "loss": 0.22, "step": 13205 }, { "epoch": 2.2546726977895366, "grad_norm": 0.31103755582199577, "learning_rate": 1.379615579160344e-05, "loss": 0.2009, "step": 13210 }, { "epoch": 2.2555261585730135, "grad_norm": 0.3371528163728692, "learning_rate": 1.3780349013657056e-05, "loss": 0.2071, "step": 13215 }, { "epoch": 2.2563796193564905, "grad_norm": 0.3016914901245741, "learning_rate": 1.3764542235710673e-05, "loss": 0.213, "step": 13220 }, { "epoch": 2.257233080139968, "grad_norm": 0.33073768086895655, "learning_rate": 1.374873545776429e-05, "loss": 0.2237, "step": 13225 }, { "epoch": 2.2580865409234447, "grad_norm": 0.3269016644384927, "learning_rate": 1.3732928679817905e-05, "loss": 0.2197, "step": 13230 }, { "epoch": 2.2589400017069217, "grad_norm": 0.31906708656812577, "learning_rate": 1.3717121901871524e-05, "loss": 0.2086, "step": 13235 }, { "epoch": 2.2597934624903986, "grad_norm": 0.3042524670678712, "learning_rate": 1.370131512392514e-05, "loss": 0.2219, "step": 13240 }, { "epoch": 2.2606469232738755, "grad_norm": 0.2712695905848906, "learning_rate": 1.3685508345978756e-05, "loss": 0.2063, "step": 13245 }, { "epoch": 2.2615003840573524, "grad_norm": 0.2617481252090542, "learning_rate": 1.3669701568032373e-05, "loss": 0.2076, "step": 13250 }, { "epoch": 2.2623538448408294, "grad_norm": 0.2762436320936438, "learning_rate": 1.3653894790085989e-05, "loss": 0.2067, "step": 13255 }, { "epoch": 2.2632073056243067, "grad_norm": 0.3262021508837165, "learning_rate": 1.3638088012139605e-05, "loss": 0.194, "step": 13260 }, { "epoch": 2.2640607664077836, "grad_norm": 0.297569488938874, "learning_rate": 1.3622281234193224e-05, "loss": 0.2205, "step": 13265 }, { "epoch": 2.2649142271912606, "grad_norm": 0.3485610316888799, "learning_rate": 1.360647445624684e-05, "loss": 0.2033, "step": 13270 }, { "epoch": 2.2657676879747375, "grad_norm": 0.28868766159089704, "learning_rate": 1.3590667678300455e-05, "loss": 0.2265, "step": 13275 }, { "epoch": 2.2666211487582144, "grad_norm": 0.3072177769802167, "learning_rate": 1.3574860900354073e-05, "loss": 0.21, "step": 13280 }, { "epoch": 2.2674746095416918, "grad_norm": 0.34103161024204426, "learning_rate": 1.3559054122407689e-05, "loss": 0.2038, "step": 13285 }, { "epoch": 2.2683280703251687, "grad_norm": 0.27710155360196836, "learning_rate": 1.3543247344461304e-05, "loss": 0.2082, "step": 13290 }, { "epoch": 2.2691815311086456, "grad_norm": 0.30152928917746463, "learning_rate": 1.3527440566514923e-05, "loss": 0.2036, "step": 13295 }, { "epoch": 2.2700349918921225, "grad_norm": 0.35300537077295696, "learning_rate": 1.351163378856854e-05, "loss": 0.2035, "step": 13300 }, { "epoch": 2.2708884526755995, "grad_norm": 0.3181065781022525, "learning_rate": 1.3495827010622155e-05, "loss": 0.2057, "step": 13305 }, { "epoch": 2.2717419134590764, "grad_norm": 0.2919343415693902, "learning_rate": 1.3480020232675772e-05, "loss": 0.2122, "step": 13310 }, { "epoch": 2.2725953742425533, "grad_norm": 0.3268038548349459, "learning_rate": 1.3464213454729388e-05, "loss": 0.2079, "step": 13315 }, { "epoch": 2.2734488350260307, "grad_norm": 0.25785633197698465, "learning_rate": 1.3448406676783004e-05, "loss": 0.2171, "step": 13320 }, { "epoch": 2.2743022958095076, "grad_norm": 0.3026119746978006, "learning_rate": 1.3432599898836623e-05, "loss": 0.2041, "step": 13325 }, { "epoch": 2.2751557565929845, "grad_norm": 0.34122828543422956, "learning_rate": 1.3416793120890239e-05, "loss": 0.2135, "step": 13330 }, { "epoch": 2.2760092173764614, "grad_norm": 0.28967958550217704, "learning_rate": 1.3400986342943855e-05, "loss": 0.2138, "step": 13335 }, { "epoch": 2.2768626781599384, "grad_norm": 0.2666929420884562, "learning_rate": 1.3385179564997472e-05, "loss": 0.2179, "step": 13340 }, { "epoch": 2.2777161389434157, "grad_norm": 0.29006389218100326, "learning_rate": 1.3369372787051088e-05, "loss": 0.2253, "step": 13345 }, { "epoch": 2.2785695997268927, "grad_norm": 0.32007728941184277, "learning_rate": 1.3353566009104704e-05, "loss": 0.2, "step": 13350 }, { "epoch": 2.2794230605103696, "grad_norm": 0.2568600573480449, "learning_rate": 1.3337759231158323e-05, "loss": 0.2065, "step": 13355 }, { "epoch": 2.2802765212938465, "grad_norm": 0.31817439556322596, "learning_rate": 1.3321952453211939e-05, "loss": 0.2081, "step": 13360 }, { "epoch": 2.2811299820773234, "grad_norm": 0.3420601803930402, "learning_rate": 1.3306145675265554e-05, "loss": 0.2, "step": 13365 }, { "epoch": 2.281983442860801, "grad_norm": 0.2642186636320623, "learning_rate": 1.3290338897319172e-05, "loss": 0.2271, "step": 13370 }, { "epoch": 2.2828369036442777, "grad_norm": 0.32728428706222734, "learning_rate": 1.3274532119372788e-05, "loss": 0.2441, "step": 13375 }, { "epoch": 2.2836903644277546, "grad_norm": 0.33551608392640997, "learning_rate": 1.3258725341426403e-05, "loss": 0.2169, "step": 13380 }, { "epoch": 2.2845438252112316, "grad_norm": 0.3019205333869423, "learning_rate": 1.3242918563480019e-05, "loss": 0.2057, "step": 13385 }, { "epoch": 2.2853972859947085, "grad_norm": 0.3529005309374569, "learning_rate": 1.3227111785533638e-05, "loss": 0.21, "step": 13390 }, { "epoch": 2.2862507467781854, "grad_norm": 0.2588531453888455, "learning_rate": 1.3211305007587254e-05, "loss": 0.2174, "step": 13395 }, { "epoch": 2.2871042075616623, "grad_norm": 0.3141540770063283, "learning_rate": 1.3195498229640871e-05, "loss": 0.2131, "step": 13400 }, { "epoch": 2.2879576683451397, "grad_norm": 0.31424223507989274, "learning_rate": 1.3179691451694487e-05, "loss": 0.1993, "step": 13405 }, { "epoch": 2.2888111291286166, "grad_norm": 0.28748962925571614, "learning_rate": 1.3163884673748103e-05, "loss": 0.1995, "step": 13410 }, { "epoch": 2.2896645899120935, "grad_norm": 0.3166005979133634, "learning_rate": 1.3148077895801719e-05, "loss": 0.2147, "step": 13415 }, { "epoch": 2.2905180506955705, "grad_norm": 0.3001203941983412, "learning_rate": 1.3132271117855338e-05, "loss": 0.2147, "step": 13420 }, { "epoch": 2.2913715114790474, "grad_norm": 0.2975608944878685, "learning_rate": 1.3116464339908954e-05, "loss": 0.2057, "step": 13425 }, { "epoch": 2.2922249722625248, "grad_norm": 0.2978182346185745, "learning_rate": 1.3100657561962571e-05, "loss": 0.2114, "step": 13430 }, { "epoch": 2.2930784330460017, "grad_norm": 0.2664431574433402, "learning_rate": 1.3084850784016187e-05, "loss": 0.2157, "step": 13435 }, { "epoch": 2.2939318938294786, "grad_norm": 0.2974882435134114, "learning_rate": 1.3069044006069803e-05, "loss": 0.2039, "step": 13440 }, { "epoch": 2.2947853546129555, "grad_norm": 0.32720307460568476, "learning_rate": 1.3053237228123418e-05, "loss": 0.2095, "step": 13445 }, { "epoch": 2.2956388153964324, "grad_norm": 0.2991107213755254, "learning_rate": 1.3037430450177038e-05, "loss": 0.2144, "step": 13450 }, { "epoch": 2.2964922761799094, "grad_norm": 0.2832496638436616, "learning_rate": 1.3021623672230653e-05, "loss": 0.2098, "step": 13455 }, { "epoch": 2.2973457369633863, "grad_norm": 0.3140580372874864, "learning_rate": 1.300581689428427e-05, "loss": 0.2256, "step": 13460 }, { "epoch": 2.2981991977468637, "grad_norm": 0.30563434481669327, "learning_rate": 1.2990010116337887e-05, "loss": 0.1999, "step": 13465 }, { "epoch": 2.2990526585303406, "grad_norm": 0.25542997839418147, "learning_rate": 1.2974203338391502e-05, "loss": 0.2102, "step": 13470 }, { "epoch": 2.2999061193138175, "grad_norm": 0.2986694081906472, "learning_rate": 1.2958396560445118e-05, "loss": 0.1949, "step": 13475 }, { "epoch": 2.3007595800972944, "grad_norm": 0.32319339213157117, "learning_rate": 1.2942589782498737e-05, "loss": 0.2256, "step": 13480 }, { "epoch": 2.3016130408807713, "grad_norm": 0.3240202358455632, "learning_rate": 1.2926783004552353e-05, "loss": 0.2074, "step": 13485 }, { "epoch": 2.3024665016642487, "grad_norm": 0.29444743451811217, "learning_rate": 1.291097622660597e-05, "loss": 0.216, "step": 13490 }, { "epoch": 2.3033199624477256, "grad_norm": 0.3101155768296177, "learning_rate": 1.2895169448659586e-05, "loss": 0.205, "step": 13495 }, { "epoch": 2.3041734232312026, "grad_norm": 0.3336701262710029, "learning_rate": 1.2879362670713202e-05, "loss": 0.235, "step": 13500 }, { "epoch": 2.3050268840146795, "grad_norm": 0.2808712931272374, "learning_rate": 1.2863555892766818e-05, "loss": 0.2143, "step": 13505 }, { "epoch": 2.3058803447981564, "grad_norm": 0.283264889277899, "learning_rate": 1.2847749114820433e-05, "loss": 0.2167, "step": 13510 }, { "epoch": 2.3067338055816338, "grad_norm": 0.29944382969562133, "learning_rate": 1.2831942336874053e-05, "loss": 0.2084, "step": 13515 }, { "epoch": 2.3075872663651107, "grad_norm": 0.2930821253346565, "learning_rate": 1.281613555892767e-05, "loss": 0.2322, "step": 13520 }, { "epoch": 2.3084407271485876, "grad_norm": 0.2965354032965979, "learning_rate": 1.2800328780981286e-05, "loss": 0.2043, "step": 13525 }, { "epoch": 2.3092941879320645, "grad_norm": 0.294132453428861, "learning_rate": 1.2784522003034902e-05, "loss": 0.2106, "step": 13530 }, { "epoch": 2.3101476487155415, "grad_norm": 0.33772017869501963, "learning_rate": 1.2768715225088517e-05, "loss": 0.1978, "step": 13535 }, { "epoch": 2.3110011094990184, "grad_norm": 0.33140678113520866, "learning_rate": 1.2752908447142133e-05, "loss": 0.2045, "step": 13540 }, { "epoch": 2.3118545702824953, "grad_norm": 0.3087203730137417, "learning_rate": 1.2737101669195752e-05, "loss": 0.2203, "step": 13545 }, { "epoch": 2.3127080310659727, "grad_norm": 0.2833320271671184, "learning_rate": 1.272129489124937e-05, "loss": 0.2062, "step": 13550 }, { "epoch": 2.3135614918494496, "grad_norm": 0.28308464937295785, "learning_rate": 1.2705488113302986e-05, "loss": 0.2145, "step": 13555 }, { "epoch": 2.3144149526329265, "grad_norm": 0.3084742592115181, "learning_rate": 1.2689681335356601e-05, "loss": 0.2113, "step": 13560 }, { "epoch": 2.3152684134164034, "grad_norm": 0.3162391108166095, "learning_rate": 1.2673874557410217e-05, "loss": 0.2088, "step": 13565 }, { "epoch": 2.3161218741998804, "grad_norm": 0.3158367885337515, "learning_rate": 1.2658067779463833e-05, "loss": 0.2037, "step": 13570 }, { "epoch": 2.3169753349833577, "grad_norm": 0.30314370750761077, "learning_rate": 1.2642261001517452e-05, "loss": 0.2023, "step": 13575 }, { "epoch": 2.3178287957668346, "grad_norm": 0.3250463351974723, "learning_rate": 1.262645422357107e-05, "loss": 0.1917, "step": 13580 }, { "epoch": 2.3186822565503116, "grad_norm": 0.34822861810430045, "learning_rate": 1.2610647445624685e-05, "loss": 0.202, "step": 13585 }, { "epoch": 2.3195357173337885, "grad_norm": 0.2908473154531126, "learning_rate": 1.2594840667678301e-05, "loss": 0.2067, "step": 13590 }, { "epoch": 2.3203891781172654, "grad_norm": 0.2977826843234019, "learning_rate": 1.2579033889731917e-05, "loss": 0.2218, "step": 13595 }, { "epoch": 2.3212426389007423, "grad_norm": 0.32595757760761257, "learning_rate": 1.2563227111785532e-05, "loss": 0.2138, "step": 13600 }, { "epoch": 2.3220960996842193, "grad_norm": 0.34000733943188083, "learning_rate": 1.2547420333839152e-05, "loss": 0.1957, "step": 13605 }, { "epoch": 2.3229495604676966, "grad_norm": 0.2824487751747822, "learning_rate": 1.2531613555892769e-05, "loss": 0.2187, "step": 13610 }, { "epoch": 2.3238030212511736, "grad_norm": 0.3246624612966314, "learning_rate": 1.2515806777946385e-05, "loss": 0.2254, "step": 13615 }, { "epoch": 2.3246564820346505, "grad_norm": 0.36271716853147196, "learning_rate": 1.25e-05, "loss": 0.1984, "step": 13620 }, { "epoch": 2.3255099428181274, "grad_norm": 0.2888238583349264, "learning_rate": 1.2484193222053616e-05, "loss": 0.2114, "step": 13625 }, { "epoch": 2.3263634036016043, "grad_norm": 0.3419754471615376, "learning_rate": 1.2468386444107234e-05, "loss": 0.2177, "step": 13630 }, { "epoch": 2.3272168643850817, "grad_norm": 0.2497942363274148, "learning_rate": 1.245257966616085e-05, "loss": 0.2234, "step": 13635 }, { "epoch": 2.3280703251685586, "grad_norm": 0.2829198977191555, "learning_rate": 1.2436772888214467e-05, "loss": 0.2284, "step": 13640 }, { "epoch": 2.3289237859520355, "grad_norm": 0.350609987164919, "learning_rate": 1.2420966110268084e-05, "loss": 0.2001, "step": 13645 }, { "epoch": 2.3297772467355125, "grad_norm": 0.3354434177143731, "learning_rate": 1.24051593323217e-05, "loss": 0.2008, "step": 13650 }, { "epoch": 2.3306307075189894, "grad_norm": 0.30510041406639565, "learning_rate": 1.2389352554375316e-05, "loss": 0.2103, "step": 13655 }, { "epoch": 2.3314841683024667, "grad_norm": 0.33687574227158135, "learning_rate": 1.2373545776428933e-05, "loss": 0.2048, "step": 13660 }, { "epoch": 2.3323376290859437, "grad_norm": 0.34264003459420733, "learning_rate": 1.235773899848255e-05, "loss": 0.2243, "step": 13665 }, { "epoch": 2.3331910898694206, "grad_norm": 0.2785388020211905, "learning_rate": 1.2341932220536167e-05, "loss": 0.2112, "step": 13670 }, { "epoch": 2.3340445506528975, "grad_norm": 0.3025164299400196, "learning_rate": 1.2326125442589784e-05, "loss": 0.1992, "step": 13675 }, { "epoch": 2.3348980114363744, "grad_norm": 0.3227287066037025, "learning_rate": 1.23103186646434e-05, "loss": 0.2279, "step": 13680 }, { "epoch": 2.3357514722198514, "grad_norm": 0.3142138786407541, "learning_rate": 1.2294511886697016e-05, "loss": 0.1948, "step": 13685 }, { "epoch": 2.3366049330033283, "grad_norm": 0.3495362803307892, "learning_rate": 1.2278705108750631e-05, "loss": 0.1972, "step": 13690 }, { "epoch": 2.3374583937868056, "grad_norm": 0.3524708012448484, "learning_rate": 1.2262898330804249e-05, "loss": 0.2052, "step": 13695 }, { "epoch": 2.3383118545702826, "grad_norm": 0.2887154435549219, "learning_rate": 1.2247091552857866e-05, "loss": 0.2231, "step": 13700 }, { "epoch": 2.3391653153537595, "grad_norm": 0.2763015450763579, "learning_rate": 1.2231284774911482e-05, "loss": 0.199, "step": 13705 }, { "epoch": 2.3400187761372364, "grad_norm": 0.3362527606821105, "learning_rate": 1.22154779969651e-05, "loss": 0.2169, "step": 13710 }, { "epoch": 2.3408722369207133, "grad_norm": 0.3126543851242967, "learning_rate": 1.2199671219018715e-05, "loss": 0.2072, "step": 13715 }, { "epoch": 2.3417256977041907, "grad_norm": 0.27471602544980955, "learning_rate": 1.2183864441072331e-05, "loss": 0.2033, "step": 13720 }, { "epoch": 2.3425791584876676, "grad_norm": 0.3315561602734119, "learning_rate": 1.2168057663125949e-05, "loss": 0.2191, "step": 13725 }, { "epoch": 2.3434326192711445, "grad_norm": 0.3039582613379289, "learning_rate": 1.2152250885179566e-05, "loss": 0.2011, "step": 13730 }, { "epoch": 2.3442860800546215, "grad_norm": 0.29153577480342086, "learning_rate": 1.2136444107233182e-05, "loss": 0.216, "step": 13735 }, { "epoch": 2.3451395408380984, "grad_norm": 0.3014085213620397, "learning_rate": 1.21206373292868e-05, "loss": 0.2121, "step": 13740 }, { "epoch": 2.3459930016215753, "grad_norm": 0.34083454209144076, "learning_rate": 1.2104830551340415e-05, "loss": 0.1889, "step": 13745 }, { "epoch": 2.3468464624050527, "grad_norm": 0.321716529853453, "learning_rate": 1.208902377339403e-05, "loss": 0.2, "step": 13750 }, { "epoch": 2.3476999231885296, "grad_norm": 0.3134794402778035, "learning_rate": 1.2073216995447648e-05, "loss": 0.2074, "step": 13755 }, { "epoch": 2.3485533839720065, "grad_norm": 0.2730396892478221, "learning_rate": 1.2057410217501266e-05, "loss": 0.2157, "step": 13760 }, { "epoch": 2.3494068447554834, "grad_norm": 0.3120686446017357, "learning_rate": 1.2041603439554881e-05, "loss": 0.197, "step": 13765 }, { "epoch": 2.3502603055389604, "grad_norm": 0.32465621002511585, "learning_rate": 1.2025796661608499e-05, "loss": 0.1982, "step": 13770 }, { "epoch": 2.3511137663224373, "grad_norm": 0.2954021641437943, "learning_rate": 1.2009989883662115e-05, "loss": 0.2009, "step": 13775 }, { "epoch": 2.3519672271059147, "grad_norm": 0.36323697472707595, "learning_rate": 1.199418310571573e-05, "loss": 0.2227, "step": 13780 }, { "epoch": 2.3528206878893916, "grad_norm": 0.36310091081675355, "learning_rate": 1.1978376327769348e-05, "loss": 0.2141, "step": 13785 }, { "epoch": 2.3536741486728685, "grad_norm": 0.3147460011957549, "learning_rate": 1.1962569549822965e-05, "loss": 0.2105, "step": 13790 }, { "epoch": 2.3545276094563454, "grad_norm": 0.25007674715037687, "learning_rate": 1.1946762771876581e-05, "loss": 0.2064, "step": 13795 }, { "epoch": 2.3553810702398223, "grad_norm": 0.2976704905071141, "learning_rate": 1.1930955993930199e-05, "loss": 0.2249, "step": 13800 }, { "epoch": 2.3562345310232997, "grad_norm": 0.30967421466661693, "learning_rate": 1.1915149215983814e-05, "loss": 0.183, "step": 13805 }, { "epoch": 2.3570879918067766, "grad_norm": 0.31772863765393883, "learning_rate": 1.189934243803743e-05, "loss": 0.1987, "step": 13810 }, { "epoch": 2.3579414525902536, "grad_norm": 0.27905975405263916, "learning_rate": 1.1883535660091048e-05, "loss": 0.1915, "step": 13815 }, { "epoch": 2.3587949133737305, "grad_norm": 0.31940376684925614, "learning_rate": 1.1867728882144665e-05, "loss": 0.2068, "step": 13820 }, { "epoch": 2.3596483741572074, "grad_norm": 0.36648859657045024, "learning_rate": 1.185192210419828e-05, "loss": 0.2309, "step": 13825 }, { "epoch": 2.3605018349406843, "grad_norm": 0.32126722146474646, "learning_rate": 1.1836115326251897e-05, "loss": 0.2018, "step": 13830 }, { "epoch": 2.3613552957241613, "grad_norm": 0.29180964522029956, "learning_rate": 1.1820308548305514e-05, "loss": 0.2009, "step": 13835 }, { "epoch": 2.3622087565076386, "grad_norm": 0.2803478621955719, "learning_rate": 1.180450177035913e-05, "loss": 0.207, "step": 13840 }, { "epoch": 2.3630622172911155, "grad_norm": 0.29059057051756104, "learning_rate": 1.1788694992412747e-05, "loss": 0.2004, "step": 13845 }, { "epoch": 2.3639156780745925, "grad_norm": 0.30827211891490164, "learning_rate": 1.1772888214466365e-05, "loss": 0.1888, "step": 13850 }, { "epoch": 2.3647691388580694, "grad_norm": 0.31655208222464226, "learning_rate": 1.175708143651998e-05, "loss": 0.2378, "step": 13855 }, { "epoch": 2.3656225996415463, "grad_norm": 0.2894637448417857, "learning_rate": 1.1741274658573596e-05, "loss": 0.2028, "step": 13860 }, { "epoch": 2.3664760604250237, "grad_norm": 0.3265571403351102, "learning_rate": 1.1725467880627214e-05, "loss": 0.2149, "step": 13865 }, { "epoch": 2.3673295212085006, "grad_norm": 0.32656324661280534, "learning_rate": 1.170966110268083e-05, "loss": 0.228, "step": 13870 }, { "epoch": 2.3681829819919775, "grad_norm": 0.27250745500205625, "learning_rate": 1.1693854324734447e-05, "loss": 0.2053, "step": 13875 }, { "epoch": 2.3690364427754544, "grad_norm": 0.41078727260785375, "learning_rate": 1.1678047546788064e-05, "loss": 0.1974, "step": 13880 }, { "epoch": 2.3698899035589314, "grad_norm": 0.3073066638033105, "learning_rate": 1.166224076884168e-05, "loss": 0.2046, "step": 13885 }, { "epoch": 2.3707433643424083, "grad_norm": 0.3179197913520734, "learning_rate": 1.1646433990895296e-05, "loss": 0.213, "step": 13890 }, { "epoch": 2.3715968251258857, "grad_norm": 0.3046602975347698, "learning_rate": 1.1630627212948913e-05, "loss": 0.2199, "step": 13895 }, { "epoch": 2.3724502859093626, "grad_norm": 0.30599636666461244, "learning_rate": 1.1614820435002529e-05, "loss": 0.1931, "step": 13900 }, { "epoch": 2.3733037466928395, "grad_norm": 0.2490602435541965, "learning_rate": 1.1599013657056147e-05, "loss": 0.2284, "step": 13905 }, { "epoch": 2.3741572074763164, "grad_norm": 0.2588509482895115, "learning_rate": 1.1583206879109764e-05, "loss": 0.1929, "step": 13910 }, { "epoch": 2.3750106682597933, "grad_norm": 0.3066469598611019, "learning_rate": 1.156740010116338e-05, "loss": 0.1918, "step": 13915 }, { "epoch": 2.3758641290432703, "grad_norm": 0.321753243266527, "learning_rate": 1.1551593323216996e-05, "loss": 0.2302, "step": 13920 }, { "epoch": 2.3767175898267476, "grad_norm": 0.282165360235603, "learning_rate": 1.1535786545270613e-05, "loss": 0.2086, "step": 13925 }, { "epoch": 2.3775710506102246, "grad_norm": 0.2873663304887399, "learning_rate": 1.1519979767324229e-05, "loss": 0.1978, "step": 13930 }, { "epoch": 2.3784245113937015, "grad_norm": 0.3125983263340995, "learning_rate": 1.1504172989377846e-05, "loss": 0.1977, "step": 13935 }, { "epoch": 2.3792779721771784, "grad_norm": 0.2941227182513949, "learning_rate": 1.1488366211431464e-05, "loss": 0.1893, "step": 13940 }, { "epoch": 2.3801314329606553, "grad_norm": 0.2813075809601596, "learning_rate": 1.147255943348508e-05, "loss": 0.2075, "step": 13945 }, { "epoch": 2.3809848937441327, "grad_norm": 0.2718412425038134, "learning_rate": 1.1456752655538695e-05, "loss": 0.2277, "step": 13950 }, { "epoch": 2.3818383545276096, "grad_norm": 0.2624197272105783, "learning_rate": 1.1440945877592311e-05, "loss": 0.1977, "step": 13955 }, { "epoch": 2.3826918153110865, "grad_norm": 0.3128325989158548, "learning_rate": 1.1425139099645928e-05, "loss": 0.1957, "step": 13960 }, { "epoch": 2.3835452760945635, "grad_norm": 0.2612314636061772, "learning_rate": 1.1409332321699546e-05, "loss": 0.2063, "step": 13965 }, { "epoch": 2.3843987368780404, "grad_norm": 0.3257662457855599, "learning_rate": 1.1393525543753162e-05, "loss": 0.188, "step": 13970 }, { "epoch": 2.3852521976615173, "grad_norm": 0.3034523315038139, "learning_rate": 1.1377718765806779e-05, "loss": 0.217, "step": 13975 }, { "epoch": 2.3861056584449942, "grad_norm": 0.3422307157407671, "learning_rate": 1.1361911987860395e-05, "loss": 0.2044, "step": 13980 }, { "epoch": 2.3869591192284716, "grad_norm": 0.3008575172864302, "learning_rate": 1.134610520991401e-05, "loss": 0.2159, "step": 13985 }, { "epoch": 2.3878125800119485, "grad_norm": 0.2772734437898112, "learning_rate": 1.1330298431967628e-05, "loss": 0.2134, "step": 13990 }, { "epoch": 2.3886660407954254, "grad_norm": 0.30967133506476313, "learning_rate": 1.1314491654021246e-05, "loss": 0.2085, "step": 13995 }, { "epoch": 2.3895195015789024, "grad_norm": 1.6013781099646365, "learning_rate": 1.1298684876074861e-05, "loss": 0.2484, "step": 14000 }, { "epoch": 2.3903729623623793, "grad_norm": 0.37966306805475036, "learning_rate": 1.1282878098128479e-05, "loss": 0.2074, "step": 14005 }, { "epoch": 2.3912264231458566, "grad_norm": 0.30157459269486764, "learning_rate": 1.1267071320182095e-05, "loss": 0.1987, "step": 14010 }, { "epoch": 2.3920798839293336, "grad_norm": 0.3278285365209402, "learning_rate": 1.125126454223571e-05, "loss": 0.1972, "step": 14015 }, { "epoch": 2.3929333447128105, "grad_norm": 0.2958191968534545, "learning_rate": 1.1235457764289328e-05, "loss": 0.2102, "step": 14020 }, { "epoch": 2.3937868054962874, "grad_norm": 0.3088347771017022, "learning_rate": 1.1219650986342945e-05, "loss": 0.218, "step": 14025 }, { "epoch": 2.3946402662797643, "grad_norm": 0.32603220056290744, "learning_rate": 1.1203844208396561e-05, "loss": 0.1906, "step": 14030 }, { "epoch": 2.3954937270632413, "grad_norm": 0.2949098882137962, "learning_rate": 1.1188037430450178e-05, "loss": 0.2116, "step": 14035 }, { "epoch": 2.3963471878467186, "grad_norm": 0.3020414872119159, "learning_rate": 1.1172230652503794e-05, "loss": 0.2078, "step": 14040 }, { "epoch": 2.3972006486301956, "grad_norm": 0.2629412290521286, "learning_rate": 1.115642387455741e-05, "loss": 0.2293, "step": 14045 }, { "epoch": 2.3980541094136725, "grad_norm": 0.2984586730057452, "learning_rate": 1.1140617096611027e-05, "loss": 0.2244, "step": 14050 }, { "epoch": 2.3989075701971494, "grad_norm": 0.31531536762452245, "learning_rate": 1.1124810318664645e-05, "loss": 0.1848, "step": 14055 }, { "epoch": 2.3997610309806263, "grad_norm": 0.37543892287655517, "learning_rate": 1.110900354071826e-05, "loss": 0.2157, "step": 14060 }, { "epoch": 2.4006144917641032, "grad_norm": 0.2591494281675972, "learning_rate": 1.1093196762771878e-05, "loss": 0.2366, "step": 14065 }, { "epoch": 2.4014679525475806, "grad_norm": 0.2989585808272384, "learning_rate": 1.1077389984825494e-05, "loss": 0.2282, "step": 14070 }, { "epoch": 2.4023214133310575, "grad_norm": 0.32735710263961404, "learning_rate": 1.106158320687911e-05, "loss": 0.2014, "step": 14075 }, { "epoch": 2.4031748741145345, "grad_norm": 0.24785159906268303, "learning_rate": 1.1045776428932727e-05, "loss": 0.1984, "step": 14080 }, { "epoch": 2.4040283348980114, "grad_norm": 0.31377318324210945, "learning_rate": 1.1029969650986345e-05, "loss": 0.2186, "step": 14085 }, { "epoch": 2.4048817956814883, "grad_norm": 0.3085371170270808, "learning_rate": 1.101416287303996e-05, "loss": 0.2151, "step": 14090 }, { "epoch": 2.4057352564649657, "grad_norm": 0.3450971986283205, "learning_rate": 1.0998356095093576e-05, "loss": 0.2235, "step": 14095 }, { "epoch": 2.4065887172484426, "grad_norm": 0.32800720692597113, "learning_rate": 1.0982549317147194e-05, "loss": 0.2027, "step": 14100 }, { "epoch": 2.4074421780319195, "grad_norm": 0.2961868905336815, "learning_rate": 1.096674253920081e-05, "loss": 0.2164, "step": 14105 }, { "epoch": 2.4082956388153964, "grad_norm": 0.2648157085873721, "learning_rate": 1.0950935761254425e-05, "loss": 0.2036, "step": 14110 }, { "epoch": 2.4091490995988734, "grad_norm": 0.3155826185696189, "learning_rate": 1.0935128983308044e-05, "loss": 0.2178, "step": 14115 }, { "epoch": 2.4100025603823503, "grad_norm": 0.2940118830259592, "learning_rate": 1.091932220536166e-05, "loss": 0.2326, "step": 14120 }, { "epoch": 2.410856021165827, "grad_norm": 0.25891775626168706, "learning_rate": 1.0903515427415276e-05, "loss": 0.2042, "step": 14125 }, { "epoch": 2.4117094819493046, "grad_norm": 0.27337796000407333, "learning_rate": 1.0887708649468893e-05, "loss": 0.2181, "step": 14130 }, { "epoch": 2.4125629427327815, "grad_norm": 0.2655684038239894, "learning_rate": 1.0871901871522509e-05, "loss": 0.1879, "step": 14135 }, { "epoch": 2.4134164035162584, "grad_norm": 0.33169546519632853, "learning_rate": 1.0856095093576125e-05, "loss": 0.2038, "step": 14140 }, { "epoch": 2.4142698642997353, "grad_norm": 0.2801305855840222, "learning_rate": 1.0840288315629742e-05, "loss": 0.2111, "step": 14145 }, { "epoch": 2.4151233250832123, "grad_norm": 0.3376627503181449, "learning_rate": 1.082448153768336e-05, "loss": 0.1994, "step": 14150 }, { "epoch": 2.4159767858666896, "grad_norm": 0.27514617704519934, "learning_rate": 1.0808674759736975e-05, "loss": 0.2195, "step": 14155 }, { "epoch": 2.4168302466501665, "grad_norm": 0.3373520507827409, "learning_rate": 1.0792867981790593e-05, "loss": 0.2008, "step": 14160 }, { "epoch": 2.4176837074336435, "grad_norm": 0.3220162765545422, "learning_rate": 1.0777061203844209e-05, "loss": 0.2039, "step": 14165 }, { "epoch": 2.4185371682171204, "grad_norm": 0.3077962124071351, "learning_rate": 1.0761254425897824e-05, "loss": 0.2153, "step": 14170 }, { "epoch": 2.4193906290005973, "grad_norm": 0.3659562275132755, "learning_rate": 1.0745447647951442e-05, "loss": 0.207, "step": 14175 }, { "epoch": 2.4202440897840742, "grad_norm": 0.2674706499533644, "learning_rate": 1.072964087000506e-05, "loss": 0.2219, "step": 14180 }, { "epoch": 2.4210975505675516, "grad_norm": 0.30107451237513644, "learning_rate": 1.0713834092058675e-05, "loss": 0.2057, "step": 14185 }, { "epoch": 2.4219510113510285, "grad_norm": 0.41015771344654967, "learning_rate": 1.0698027314112293e-05, "loss": 0.2106, "step": 14190 }, { "epoch": 2.4228044721345054, "grad_norm": 0.3198217757852331, "learning_rate": 1.0682220536165908e-05, "loss": 0.2247, "step": 14195 }, { "epoch": 2.4236579329179824, "grad_norm": 0.3292014164617729, "learning_rate": 1.0666413758219524e-05, "loss": 0.1981, "step": 14200 }, { "epoch": 2.4245113937014593, "grad_norm": 0.29578780153404344, "learning_rate": 1.0650606980273141e-05, "loss": 0.2068, "step": 14205 }, { "epoch": 2.425364854484936, "grad_norm": 0.2801948160958686, "learning_rate": 1.0634800202326759e-05, "loss": 0.2031, "step": 14210 }, { "epoch": 2.4262183152684136, "grad_norm": 0.2695847738899817, "learning_rate": 1.0618993424380375e-05, "loss": 0.2073, "step": 14215 }, { "epoch": 2.4270717760518905, "grad_norm": 0.3056266347049957, "learning_rate": 1.060318664643399e-05, "loss": 0.2018, "step": 14220 }, { "epoch": 2.4279252368353674, "grad_norm": 0.3421803648446021, "learning_rate": 1.0587379868487608e-05, "loss": 0.216, "step": 14225 }, { "epoch": 2.4287786976188444, "grad_norm": 0.30701291456453805, "learning_rate": 1.0571573090541224e-05, "loss": 0.2264, "step": 14230 }, { "epoch": 2.4296321584023213, "grad_norm": 0.2618434499350331, "learning_rate": 1.0555766312594841e-05, "loss": 0.2114, "step": 14235 }, { "epoch": 2.4304856191857986, "grad_norm": 0.30299250039455994, "learning_rate": 1.0539959534648459e-05, "loss": 0.2228, "step": 14240 }, { "epoch": 2.4313390799692756, "grad_norm": 0.31214430333917925, "learning_rate": 1.0524152756702074e-05, "loss": 0.2102, "step": 14245 }, { "epoch": 2.4321925407527525, "grad_norm": 0.2969823321953325, "learning_rate": 1.050834597875569e-05, "loss": 0.2026, "step": 14250 }, { "epoch": 2.4330460015362294, "grad_norm": 0.30969028300081797, "learning_rate": 1.0492539200809308e-05, "loss": 0.2023, "step": 14255 }, { "epoch": 2.4338994623197063, "grad_norm": 0.3098922988715544, "learning_rate": 1.0476732422862923e-05, "loss": 0.2023, "step": 14260 }, { "epoch": 2.4347529231031833, "grad_norm": 0.2856818309467379, "learning_rate": 1.046092564491654e-05, "loss": 0.2016, "step": 14265 }, { "epoch": 2.43560638388666, "grad_norm": 0.3001901082593489, "learning_rate": 1.0445118866970158e-05, "loss": 0.2131, "step": 14270 }, { "epoch": 2.4364598446701375, "grad_norm": 0.29715286758233034, "learning_rate": 1.0429312089023774e-05, "loss": 0.2174, "step": 14275 }, { "epoch": 2.4373133054536145, "grad_norm": 0.28184200886410027, "learning_rate": 1.041350531107739e-05, "loss": 0.2174, "step": 14280 }, { "epoch": 2.4381667662370914, "grad_norm": 0.3551568144734817, "learning_rate": 1.0397698533131007e-05, "loss": 0.1941, "step": 14285 }, { "epoch": 2.4390202270205683, "grad_norm": 0.26735267612506824, "learning_rate": 1.0381891755184623e-05, "loss": 0.2226, "step": 14290 }, { "epoch": 2.4398736878040452, "grad_norm": 0.2714927052259652, "learning_rate": 1.036608497723824e-05, "loss": 0.2162, "step": 14295 }, { "epoch": 2.4407271485875226, "grad_norm": 0.3119792437868736, "learning_rate": 1.0350278199291858e-05, "loss": 0.2009, "step": 14300 }, { "epoch": 2.4415806093709995, "grad_norm": 0.30394651390336375, "learning_rate": 1.0334471421345474e-05, "loss": 0.2239, "step": 14305 }, { "epoch": 2.4424340701544764, "grad_norm": 0.2806984911935375, "learning_rate": 1.031866464339909e-05, "loss": 0.2136, "step": 14310 }, { "epoch": 2.4432875309379534, "grad_norm": 0.27261969966439115, "learning_rate": 1.0302857865452707e-05, "loss": 0.2078, "step": 14315 }, { "epoch": 2.4441409917214303, "grad_norm": 0.3152077102252792, "learning_rate": 1.0287051087506323e-05, "loss": 0.2162, "step": 14320 }, { "epoch": 2.444994452504907, "grad_norm": 0.29604155804582094, "learning_rate": 1.027124430955994e-05, "loss": 0.2023, "step": 14325 }, { "epoch": 2.4458479132883846, "grad_norm": 0.2847399926827651, "learning_rate": 1.0255437531613558e-05, "loss": 0.2163, "step": 14330 }, { "epoch": 2.4467013740718615, "grad_norm": 0.30716862896980707, "learning_rate": 1.0239630753667173e-05, "loss": 0.2144, "step": 14335 }, { "epoch": 2.4475548348553384, "grad_norm": 0.2986475827407688, "learning_rate": 1.0223823975720789e-05, "loss": 0.217, "step": 14340 }, { "epoch": 2.4484082956388153, "grad_norm": 0.30690485416408153, "learning_rate": 1.0208017197774405e-05, "loss": 0.1771, "step": 14345 }, { "epoch": 2.4492617564222923, "grad_norm": 0.287172760295048, "learning_rate": 1.0192210419828022e-05, "loss": 0.2105, "step": 14350 }, { "epoch": 2.450115217205769, "grad_norm": 0.29694782254042457, "learning_rate": 1.017640364188164e-05, "loss": 0.2227, "step": 14355 }, { "epoch": 2.4509686779892466, "grad_norm": 0.28206682258461996, "learning_rate": 1.0160596863935256e-05, "loss": 0.197, "step": 14360 }, { "epoch": 2.4518221387727235, "grad_norm": 0.29895039378561006, "learning_rate": 1.0144790085988873e-05, "loss": 0.2137, "step": 14365 }, { "epoch": 2.4526755995562004, "grad_norm": 0.2990435460918407, "learning_rate": 1.0128983308042489e-05, "loss": 0.2267, "step": 14370 }, { "epoch": 2.4535290603396773, "grad_norm": 0.2825013731517185, "learning_rate": 1.0113176530096105e-05, "loss": 0.2091, "step": 14375 }, { "epoch": 2.4543825211231542, "grad_norm": 0.3107681087807651, "learning_rate": 1.0097369752149722e-05, "loss": 0.2149, "step": 14380 }, { "epoch": 2.4552359819066316, "grad_norm": 0.30211082465426375, "learning_rate": 1.008156297420334e-05, "loss": 0.1999, "step": 14385 }, { "epoch": 2.4560894426901085, "grad_norm": 0.27933301879863326, "learning_rate": 1.0065756196256955e-05, "loss": 0.2014, "step": 14390 }, { "epoch": 2.4569429034735855, "grad_norm": 0.2599940002303497, "learning_rate": 1.0049949418310573e-05, "loss": 0.2153, "step": 14395 }, { "epoch": 2.4577963642570624, "grad_norm": 0.300785431552765, "learning_rate": 1.0034142640364188e-05, "loss": 0.2082, "step": 14400 }, { "epoch": 2.4586498250405393, "grad_norm": 0.3169896812539754, "learning_rate": 1.0018335862417804e-05, "loss": 0.2027, "step": 14405 }, { "epoch": 2.4595032858240162, "grad_norm": 0.3328056764551749, "learning_rate": 1.0002529084471422e-05, "loss": 0.1938, "step": 14410 }, { "epoch": 2.460356746607493, "grad_norm": 0.3520689997652059, "learning_rate": 9.986722306525039e-06, "loss": 0.2058, "step": 14415 }, { "epoch": 2.4612102073909705, "grad_norm": 0.262322511672704, "learning_rate": 9.970915528578655e-06, "loss": 0.2026, "step": 14420 }, { "epoch": 2.4620636681744474, "grad_norm": 0.29988528719221075, "learning_rate": 9.955108750632272e-06, "loss": 0.2043, "step": 14425 }, { "epoch": 2.4629171289579244, "grad_norm": 0.36416490706020704, "learning_rate": 9.939301972685888e-06, "loss": 0.2061, "step": 14430 }, { "epoch": 2.4637705897414013, "grad_norm": 0.3459381154450959, "learning_rate": 9.923495194739504e-06, "loss": 0.2095, "step": 14435 }, { "epoch": 2.464624050524878, "grad_norm": 0.33881788716420064, "learning_rate": 9.907688416793121e-06, "loss": 0.2148, "step": 14440 }, { "epoch": 2.4654775113083556, "grad_norm": 0.2601662666396057, "learning_rate": 9.891881638846739e-06, "loss": 0.2161, "step": 14445 }, { "epoch": 2.4663309720918325, "grad_norm": 0.2686451024813113, "learning_rate": 9.876074860900355e-06, "loss": 0.197, "step": 14450 }, { "epoch": 2.4671844328753094, "grad_norm": 0.2687329223756362, "learning_rate": 9.860268082953972e-06, "loss": 0.2008, "step": 14455 }, { "epoch": 2.4680378936587863, "grad_norm": 0.3134134620271264, "learning_rate": 9.844461305007588e-06, "loss": 0.2247, "step": 14460 }, { "epoch": 2.4688913544422633, "grad_norm": 0.3719744607013002, "learning_rate": 9.828654527061204e-06, "loss": 0.2172, "step": 14465 }, { "epoch": 2.46974481522574, "grad_norm": 0.2907103084270228, "learning_rate": 9.812847749114821e-06, "loss": 0.2242, "step": 14470 }, { "epoch": 2.4705982760092176, "grad_norm": 0.31935227139555455, "learning_rate": 9.797040971168438e-06, "loss": 0.2029, "step": 14475 }, { "epoch": 2.4714517367926945, "grad_norm": 0.31669395322056704, "learning_rate": 9.781234193222054e-06, "loss": 0.186, "step": 14480 }, { "epoch": 2.4723051975761714, "grad_norm": 0.30252949209933655, "learning_rate": 9.76542741527567e-06, "loss": 0.2201, "step": 14485 }, { "epoch": 2.4731586583596483, "grad_norm": 0.2974693143355698, "learning_rate": 9.749620637329287e-06, "loss": 0.2111, "step": 14490 }, { "epoch": 2.4740121191431252, "grad_norm": 0.2920915802026588, "learning_rate": 9.733813859382903e-06, "loss": 0.2035, "step": 14495 }, { "epoch": 2.474865579926602, "grad_norm": 0.28901014972207195, "learning_rate": 9.71800708143652e-06, "loss": 0.203, "step": 14500 }, { "epoch": 2.4757190407100795, "grad_norm": 0.2954645284830918, "learning_rate": 9.702200303490138e-06, "loss": 0.213, "step": 14505 }, { "epoch": 2.4765725014935565, "grad_norm": 0.28276485484545605, "learning_rate": 9.686393525543754e-06, "loss": 0.2191, "step": 14510 }, { "epoch": 2.4774259622770334, "grad_norm": 0.3316563564752413, "learning_rate": 9.67058674759737e-06, "loss": 0.2028, "step": 14515 }, { "epoch": 2.4782794230605103, "grad_norm": 0.2654963812346402, "learning_rate": 9.654779969650987e-06, "loss": 0.2165, "step": 14520 }, { "epoch": 2.4791328838439872, "grad_norm": 0.3176999403331372, "learning_rate": 9.638973191704603e-06, "loss": 0.2223, "step": 14525 }, { "epoch": 2.4799863446274646, "grad_norm": 0.27613018461928496, "learning_rate": 9.623166413758219e-06, "loss": 0.1992, "step": 14530 }, { "epoch": 2.4808398054109415, "grad_norm": 0.28417958847947017, "learning_rate": 9.607359635811838e-06, "loss": 0.1887, "step": 14535 }, { "epoch": 2.4816932661944184, "grad_norm": 0.2938849097528958, "learning_rate": 9.591552857865454e-06, "loss": 0.21, "step": 14540 }, { "epoch": 2.4825467269778954, "grad_norm": 0.2850127473636765, "learning_rate": 9.57574607991907e-06, "loss": 0.2059, "step": 14545 }, { "epoch": 2.4834001877613723, "grad_norm": 0.3254366711089696, "learning_rate": 9.559939301972687e-06, "loss": 0.1924, "step": 14550 }, { "epoch": 2.484253648544849, "grad_norm": 0.3141891444400128, "learning_rate": 9.544132524026303e-06, "loss": 0.2485, "step": 14555 }, { "epoch": 2.485107109328326, "grad_norm": 0.3147773525418255, "learning_rate": 9.528325746079918e-06, "loss": 0.2111, "step": 14560 }, { "epoch": 2.4859605701118035, "grad_norm": 0.30126769475380727, "learning_rate": 9.512518968133537e-06, "loss": 0.1892, "step": 14565 }, { "epoch": 2.4868140308952804, "grad_norm": 0.2905332001543924, "learning_rate": 9.496712190187153e-06, "loss": 0.2078, "step": 14570 }, { "epoch": 2.4876674916787573, "grad_norm": 0.32657694675081445, "learning_rate": 9.480905412240769e-06, "loss": 0.2006, "step": 14575 }, { "epoch": 2.4885209524622343, "grad_norm": 0.32201510918814674, "learning_rate": 9.465098634294386e-06, "loss": 0.2155, "step": 14580 }, { "epoch": 2.489374413245711, "grad_norm": 0.34438211510610506, "learning_rate": 9.449291856348002e-06, "loss": 0.1913, "step": 14585 }, { "epoch": 2.4902278740291885, "grad_norm": 0.3279119046875183, "learning_rate": 9.433485078401618e-06, "loss": 0.2043, "step": 14590 }, { "epoch": 2.4910813348126655, "grad_norm": 0.2787549152293142, "learning_rate": 9.417678300455237e-06, "loss": 0.1854, "step": 14595 }, { "epoch": 2.4919347955961424, "grad_norm": 0.28791004514551843, "learning_rate": 9.401871522508853e-06, "loss": 0.2221, "step": 14600 }, { "epoch": 2.4927882563796193, "grad_norm": 0.30033596069049473, "learning_rate": 9.386064744562469e-06, "loss": 0.1998, "step": 14605 }, { "epoch": 2.4936417171630962, "grad_norm": 0.28198423677707446, "learning_rate": 9.370257966616084e-06, "loss": 0.2215, "step": 14610 }, { "epoch": 2.494495177946573, "grad_norm": 0.2869482530563864, "learning_rate": 9.354451188669702e-06, "loss": 0.1853, "step": 14615 }, { "epoch": 2.4953486387300505, "grad_norm": 0.3202832658095036, "learning_rate": 9.338644410723318e-06, "loss": 0.2196, "step": 14620 }, { "epoch": 2.4962020995135275, "grad_norm": 0.28717112445208526, "learning_rate": 9.322837632776935e-06, "loss": 0.2189, "step": 14625 }, { "epoch": 2.4970555602970044, "grad_norm": 0.2883560646491726, "learning_rate": 9.307030854830553e-06, "loss": 0.2092, "step": 14630 }, { "epoch": 2.4979090210804813, "grad_norm": 0.2993169126925263, "learning_rate": 9.291224076884168e-06, "loss": 0.1988, "step": 14635 }, { "epoch": 2.498762481863958, "grad_norm": 0.25487789488761603, "learning_rate": 9.275417298937784e-06, "loss": 0.1995, "step": 14640 }, { "epoch": 2.499615942647435, "grad_norm": 0.29993334603646543, "learning_rate": 9.259610520991402e-06, "loss": 0.2208, "step": 14645 }, { "epoch": 2.5004694034309125, "grad_norm": 0.30052669656191205, "learning_rate": 9.243803743045017e-06, "loss": 0.2056, "step": 14650 }, { "epoch": 2.5013228642143894, "grad_norm": 0.29948325962478006, "learning_rate": 9.227996965098635e-06, "loss": 0.2007, "step": 14655 }, { "epoch": 2.5021763249978664, "grad_norm": 0.27331421230603753, "learning_rate": 9.212190187152252e-06, "loss": 0.1982, "step": 14660 }, { "epoch": 2.5030297857813433, "grad_norm": 0.3235548662605652, "learning_rate": 9.196383409205868e-06, "loss": 0.2337, "step": 14665 }, { "epoch": 2.50388324656482, "grad_norm": 0.3509036724521923, "learning_rate": 9.180576631259484e-06, "loss": 0.194, "step": 14670 }, { "epoch": 2.5047367073482976, "grad_norm": 0.3151895297502629, "learning_rate": 9.164769853313101e-06, "loss": 0.221, "step": 14675 }, { "epoch": 2.5055901681317745, "grad_norm": 0.31103578459202363, "learning_rate": 9.148963075366717e-06, "loss": 0.2269, "step": 14680 }, { "epoch": 2.5064436289152514, "grad_norm": 0.3038672566352142, "learning_rate": 9.133156297420334e-06, "loss": 0.2099, "step": 14685 }, { "epoch": 2.5072970896987283, "grad_norm": 0.31028839860987395, "learning_rate": 9.117349519473952e-06, "loss": 0.1798, "step": 14690 }, { "epoch": 2.5081505504822053, "grad_norm": 0.308411235309962, "learning_rate": 9.101542741527568e-06, "loss": 0.2144, "step": 14695 }, { "epoch": 2.509004011265682, "grad_norm": 0.3037807976204573, "learning_rate": 9.085735963581183e-06, "loss": 0.2057, "step": 14700 }, { "epoch": 2.509857472049159, "grad_norm": 0.3076500697358676, "learning_rate": 9.069929185634801e-06, "loss": 0.2146, "step": 14705 }, { "epoch": 2.5107109328326365, "grad_norm": 0.2933647551642839, "learning_rate": 9.054122407688417e-06, "loss": 0.2142, "step": 14710 }, { "epoch": 2.5115643936161134, "grad_norm": 0.273678617532261, "learning_rate": 9.038315629742034e-06, "loss": 0.2224, "step": 14715 }, { "epoch": 2.5124178543995903, "grad_norm": 0.26966458878938193, "learning_rate": 9.022508851795652e-06, "loss": 0.2004, "step": 14720 }, { "epoch": 2.5132713151830672, "grad_norm": 0.2952976290261045, "learning_rate": 9.006702073849267e-06, "loss": 0.207, "step": 14725 }, { "epoch": 2.514124775966544, "grad_norm": 0.3516173328880711, "learning_rate": 8.990895295902883e-06, "loss": 0.1985, "step": 14730 }, { "epoch": 2.5149782367500215, "grad_norm": 0.2904515716410185, "learning_rate": 8.9750885179565e-06, "loss": 0.1908, "step": 14735 }, { "epoch": 2.5158316975334984, "grad_norm": 0.44454035786398166, "learning_rate": 8.959281740010116e-06, "loss": 0.2024, "step": 14740 }, { "epoch": 2.5166851583169754, "grad_norm": 0.3035625057171344, "learning_rate": 8.943474962063734e-06, "loss": 0.1995, "step": 14745 }, { "epoch": 2.5175386191004523, "grad_norm": 0.2881255361787169, "learning_rate": 8.92766818411735e-06, "loss": 0.2218, "step": 14750 }, { "epoch": 2.518392079883929, "grad_norm": 0.32668862299674006, "learning_rate": 8.911861406170967e-06, "loss": 0.2076, "step": 14755 }, { "epoch": 2.5192455406674066, "grad_norm": 0.2900488920933858, "learning_rate": 8.896054628224583e-06, "loss": 0.215, "step": 14760 }, { "epoch": 2.520099001450883, "grad_norm": 0.27615700998913023, "learning_rate": 8.880247850278198e-06, "loss": 0.1981, "step": 14765 }, { "epoch": 2.5209524622343604, "grad_norm": 0.31668404711134196, "learning_rate": 8.864441072331816e-06, "loss": 0.2238, "step": 14770 }, { "epoch": 2.5218059230178373, "grad_norm": 0.2938582622447334, "learning_rate": 8.848634294385433e-06, "loss": 0.2151, "step": 14775 }, { "epoch": 2.5226593838013143, "grad_norm": 0.25216291826774456, "learning_rate": 8.83282751643905e-06, "loss": 0.2149, "step": 14780 }, { "epoch": 2.523512844584791, "grad_norm": 0.275366735783369, "learning_rate": 8.817020738492667e-06, "loss": 0.2134, "step": 14785 }, { "epoch": 2.524366305368268, "grad_norm": 0.29631327072898345, "learning_rate": 8.801213960546282e-06, "loss": 0.2217, "step": 14790 }, { "epoch": 2.5252197661517455, "grad_norm": 0.2808458873761412, "learning_rate": 8.785407182599898e-06, "loss": 0.2092, "step": 14795 }, { "epoch": 2.5260732269352224, "grad_norm": 0.32698829781571825, "learning_rate": 8.769600404653516e-06, "loss": 0.2081, "step": 14800 }, { "epoch": 2.5269266877186993, "grad_norm": 0.30895852619685593, "learning_rate": 8.753793626707133e-06, "loss": 0.1987, "step": 14805 }, { "epoch": 2.5277801485021762, "grad_norm": 0.3050261921994531, "learning_rate": 8.737986848760749e-06, "loss": 0.2227, "step": 14810 }, { "epoch": 2.528633609285653, "grad_norm": 0.2690854256318603, "learning_rate": 8.722180070814366e-06, "loss": 0.2177, "step": 14815 }, { "epoch": 2.5294870700691305, "grad_norm": 0.3127481688429854, "learning_rate": 8.706373292867982e-06, "loss": 0.1758, "step": 14820 }, { "epoch": 2.5303405308526075, "grad_norm": 0.2818441964195298, "learning_rate": 8.690566514921598e-06, "loss": 0.2003, "step": 14825 }, { "epoch": 2.5311939916360844, "grad_norm": 0.2854176504652069, "learning_rate": 8.674759736975215e-06, "loss": 0.233, "step": 14830 }, { "epoch": 2.5320474524195613, "grad_norm": 0.3541774125660776, "learning_rate": 8.658952959028833e-06, "loss": 0.1963, "step": 14835 }, { "epoch": 2.5329009132030382, "grad_norm": 0.32895920647563437, "learning_rate": 8.643146181082449e-06, "loss": 0.2184, "step": 14840 }, { "epoch": 2.5337543739865156, "grad_norm": 0.28769317546282985, "learning_rate": 8.627339403136066e-06, "loss": 0.2012, "step": 14845 }, { "epoch": 2.534607834769992, "grad_norm": 0.26929644596147184, "learning_rate": 8.611532625189682e-06, "loss": 0.2028, "step": 14850 }, { "epoch": 2.5354612955534694, "grad_norm": 0.2782660014452314, "learning_rate": 8.595725847243297e-06, "loss": 0.2059, "step": 14855 }, { "epoch": 2.5363147563369464, "grad_norm": 0.2813221586374973, "learning_rate": 8.579919069296915e-06, "loss": 0.2348, "step": 14860 }, { "epoch": 2.5371682171204233, "grad_norm": 0.25779612092882676, "learning_rate": 8.564112291350532e-06, "loss": 0.2054, "step": 14865 }, { "epoch": 2.5380216779039, "grad_norm": 0.272784753990905, "learning_rate": 8.548305513404148e-06, "loss": 0.2158, "step": 14870 }, { "epoch": 2.538875138687377, "grad_norm": 0.3109764404566526, "learning_rate": 8.532498735457764e-06, "loss": 0.2087, "step": 14875 }, { "epoch": 2.5397285994708545, "grad_norm": 0.2812062314445669, "learning_rate": 8.516691957511381e-06, "loss": 0.2218, "step": 14880 }, { "epoch": 2.5405820602543314, "grad_norm": 0.2967488892488958, "learning_rate": 8.500885179564997e-06, "loss": 0.1937, "step": 14885 }, { "epoch": 2.5414355210378083, "grad_norm": 0.3054508866762204, "learning_rate": 8.485078401618615e-06, "loss": 0.2187, "step": 14890 }, { "epoch": 2.5422889818212853, "grad_norm": 0.32225108257013046, "learning_rate": 8.469271623672232e-06, "loss": 0.2185, "step": 14895 }, { "epoch": 2.543142442604762, "grad_norm": 0.34942023930434746, "learning_rate": 8.453464845725848e-06, "loss": 0.2137, "step": 14900 }, { "epoch": 2.5439959033882396, "grad_norm": 0.29091185057608726, "learning_rate": 8.437658067779464e-06, "loss": 0.2243, "step": 14905 }, { "epoch": 2.544849364171716, "grad_norm": 0.2833234753132348, "learning_rate": 8.421851289833081e-06, "loss": 0.1964, "step": 14910 }, { "epoch": 2.5457028249551934, "grad_norm": 0.2834333281294411, "learning_rate": 8.406044511886697e-06, "loss": 0.2073, "step": 14915 }, { "epoch": 2.5465562857386703, "grad_norm": 0.25466653832049513, "learning_rate": 8.390237733940314e-06, "loss": 0.2126, "step": 14920 }, { "epoch": 2.5474097465221472, "grad_norm": 0.274218005788, "learning_rate": 8.374430955993932e-06, "loss": 0.2112, "step": 14925 }, { "epoch": 2.548263207305624, "grad_norm": 0.29776640969244095, "learning_rate": 8.358624178047547e-06, "loss": 0.2119, "step": 14930 }, { "epoch": 2.549116668089101, "grad_norm": 0.30095413281240857, "learning_rate": 8.342817400101163e-06, "loss": 0.1997, "step": 14935 }, { "epoch": 2.5499701288725785, "grad_norm": 0.2733131486765432, "learning_rate": 8.32701062215478e-06, "loss": 0.2091, "step": 14940 }, { "epoch": 2.5508235896560554, "grad_norm": 0.32469256021269605, "learning_rate": 8.311203844208396e-06, "loss": 0.2308, "step": 14945 }, { "epoch": 2.5516770504395323, "grad_norm": 0.33587040032926524, "learning_rate": 8.295397066262014e-06, "loss": 0.2038, "step": 14950 }, { "epoch": 2.5525305112230092, "grad_norm": 0.3092983184821567, "learning_rate": 8.279590288315631e-06, "loss": 0.2017, "step": 14955 }, { "epoch": 2.553383972006486, "grad_norm": 0.33259354945413744, "learning_rate": 8.263783510369247e-06, "loss": 0.1996, "step": 14960 }, { "epoch": 2.5542374327899635, "grad_norm": 0.28093748950979186, "learning_rate": 8.247976732422863e-06, "loss": 0.2231, "step": 14965 }, { "epoch": 2.5550908935734404, "grad_norm": 0.25460034994393865, "learning_rate": 8.23216995447648e-06, "loss": 0.2037, "step": 14970 }, { "epoch": 2.5559443543569174, "grad_norm": 0.2930871436298649, "learning_rate": 8.216363176530096e-06, "loss": 0.1952, "step": 14975 }, { "epoch": 2.5567978151403943, "grad_norm": 0.29530805669162724, "learning_rate": 8.200556398583714e-06, "loss": 0.2062, "step": 14980 }, { "epoch": 2.557651275923871, "grad_norm": 0.32135652388628166, "learning_rate": 8.184749620637331e-06, "loss": 0.2112, "step": 14985 }, { "epoch": 2.5585047367073486, "grad_norm": 0.2824126222829732, "learning_rate": 8.168942842690947e-06, "loss": 0.2159, "step": 14990 }, { "epoch": 2.559358197490825, "grad_norm": 0.28910194452027066, "learning_rate": 8.153136064744563e-06, "loss": 0.2128, "step": 14995 }, { "epoch": 2.5602116582743024, "grad_norm": 0.3299147267064249, "learning_rate": 8.13732928679818e-06, "loss": 0.21, "step": 15000 }, { "epoch": 2.5610651190577793, "grad_norm": 0.3396067487504206, "learning_rate": 8.121522508851796e-06, "loss": 0.2043, "step": 15005 }, { "epoch": 2.5619185798412563, "grad_norm": 0.33742330028250417, "learning_rate": 8.105715730905413e-06, "loss": 0.2046, "step": 15010 }, { "epoch": 2.562772040624733, "grad_norm": 0.27446915979953485, "learning_rate": 8.089908952959029e-06, "loss": 0.2215, "step": 15015 }, { "epoch": 2.56362550140821, "grad_norm": 0.2878296980266294, "learning_rate": 8.074102175012646e-06, "loss": 0.2035, "step": 15020 }, { "epoch": 2.5644789621916875, "grad_norm": 0.32252706687621113, "learning_rate": 8.058295397066262e-06, "loss": 0.2168, "step": 15025 }, { "epoch": 2.5653324229751644, "grad_norm": 0.27332318876173073, "learning_rate": 8.042488619119878e-06, "loss": 0.218, "step": 15030 }, { "epoch": 2.5661858837586413, "grad_norm": 0.28026382186225846, "learning_rate": 8.026681841173495e-06, "loss": 0.2265, "step": 15035 }, { "epoch": 2.5670393445421182, "grad_norm": 0.29315827309856296, "learning_rate": 8.010875063227111e-06, "loss": 0.1919, "step": 15040 }, { "epoch": 2.567892805325595, "grad_norm": 0.3102226191638849, "learning_rate": 7.995068285280729e-06, "loss": 0.2176, "step": 15045 }, { "epoch": 2.5687462661090725, "grad_norm": 0.30265744651318316, "learning_rate": 7.979261507334346e-06, "loss": 0.2014, "step": 15050 }, { "epoch": 2.5695997268925495, "grad_norm": 0.32730621337535915, "learning_rate": 7.963454729387962e-06, "loss": 0.21, "step": 15055 }, { "epoch": 2.5704531876760264, "grad_norm": 0.28785604334012194, "learning_rate": 7.947647951441578e-06, "loss": 0.2235, "step": 15060 }, { "epoch": 2.5713066484595033, "grad_norm": 0.31840019182261625, "learning_rate": 7.931841173495195e-06, "loss": 0.2113, "step": 15065 }, { "epoch": 2.57216010924298, "grad_norm": 0.3007373298195007, "learning_rate": 7.916034395548811e-06, "loss": 0.2054, "step": 15070 }, { "epoch": 2.573013570026457, "grad_norm": 0.3225639941026148, "learning_rate": 7.900227617602428e-06, "loss": 0.2112, "step": 15075 }, { "epoch": 2.573867030809934, "grad_norm": 0.32029850769405666, "learning_rate": 7.884420839656046e-06, "loss": 0.2161, "step": 15080 }, { "epoch": 2.5747204915934114, "grad_norm": 0.31702046208940254, "learning_rate": 7.868614061709662e-06, "loss": 0.203, "step": 15085 }, { "epoch": 2.5755739523768884, "grad_norm": 0.30111180468004883, "learning_rate": 7.852807283763277e-06, "loss": 0.2132, "step": 15090 }, { "epoch": 2.5764274131603653, "grad_norm": 0.3042517208485932, "learning_rate": 7.837000505816895e-06, "loss": 0.2353, "step": 15095 }, { "epoch": 2.577280873943842, "grad_norm": 0.2874184245104927, "learning_rate": 7.82119372787051e-06, "loss": 0.1755, "step": 15100 }, { "epoch": 2.578134334727319, "grad_norm": 0.2873093686184649, "learning_rate": 7.805386949924128e-06, "loss": 0.2131, "step": 15105 }, { "epoch": 2.5789877955107965, "grad_norm": 0.29399886692762606, "learning_rate": 7.789580171977745e-06, "loss": 0.2182, "step": 15110 }, { "epoch": 2.5798412562942734, "grad_norm": 0.2711464886732094, "learning_rate": 7.773773394031361e-06, "loss": 0.1962, "step": 15115 }, { "epoch": 2.5806947170777503, "grad_norm": 0.269942601438957, "learning_rate": 7.757966616084977e-06, "loss": 0.2204, "step": 15120 }, { "epoch": 2.5815481778612273, "grad_norm": 0.31217835337246713, "learning_rate": 7.742159838138594e-06, "loss": 0.1878, "step": 15125 }, { "epoch": 2.582401638644704, "grad_norm": 0.3216448911233945, "learning_rate": 7.72635306019221e-06, "loss": 0.2213, "step": 15130 }, { "epoch": 2.5832550994281815, "grad_norm": 0.30943373096293225, "learning_rate": 7.710546282245828e-06, "loss": 0.2183, "step": 15135 }, { "epoch": 2.584108560211658, "grad_norm": 0.2785300883620992, "learning_rate": 7.694739504299443e-06, "loss": 0.2027, "step": 15140 }, { "epoch": 2.5849620209951354, "grad_norm": 0.2778403921522237, "learning_rate": 7.678932726353061e-06, "loss": 0.2101, "step": 15145 }, { "epoch": 2.5858154817786123, "grad_norm": 0.3050434563012529, "learning_rate": 7.663125948406677e-06, "loss": 0.2281, "step": 15150 }, { "epoch": 2.5866689425620892, "grad_norm": 0.30855066693479183, "learning_rate": 7.647319170460292e-06, "loss": 0.2147, "step": 15155 }, { "epoch": 2.587522403345566, "grad_norm": 0.2972246372616466, "learning_rate": 7.63151239251391e-06, "loss": 0.2047, "step": 15160 }, { "epoch": 2.588375864129043, "grad_norm": 0.33093575054565133, "learning_rate": 7.6157056145675265e-06, "loss": 0.1972, "step": 15165 }, { "epoch": 2.5892293249125204, "grad_norm": 0.28602413846954916, "learning_rate": 7.599898836621143e-06, "loss": 0.1984, "step": 15170 }, { "epoch": 2.5900827856959974, "grad_norm": 0.26896577694818813, "learning_rate": 7.5840920586747606e-06, "loss": 0.2244, "step": 15175 }, { "epoch": 2.5909362464794743, "grad_norm": 0.301390580555708, "learning_rate": 7.568285280728376e-06, "loss": 0.2094, "step": 15180 }, { "epoch": 2.591789707262951, "grad_norm": 0.2797082672426372, "learning_rate": 7.552478502781993e-06, "loss": 0.2174, "step": 15185 }, { "epoch": 2.592643168046428, "grad_norm": 0.2979847429707958, "learning_rate": 7.53667172483561e-06, "loss": 0.2074, "step": 15190 }, { "epoch": 2.5934966288299055, "grad_norm": 0.32254520512951895, "learning_rate": 7.520864946889226e-06, "loss": 0.2147, "step": 15195 }, { "epoch": 2.5943500896133824, "grad_norm": 0.2644355316767364, "learning_rate": 7.505058168942843e-06, "loss": 0.191, "step": 15200 }, { "epoch": 2.5952035503968593, "grad_norm": 0.35018346786561877, "learning_rate": 7.48925139099646e-06, "loss": 0.2143, "step": 15205 }, { "epoch": 2.5960570111803363, "grad_norm": 0.3233433551060496, "learning_rate": 7.473444613050076e-06, "loss": 0.2041, "step": 15210 }, { "epoch": 2.596910471963813, "grad_norm": 0.3086886568339892, "learning_rate": 7.457637835103693e-06, "loss": 0.2102, "step": 15215 }, { "epoch": 2.59776393274729, "grad_norm": 0.31347523325171467, "learning_rate": 7.44183105715731e-06, "loss": 0.211, "step": 15220 }, { "epoch": 2.598617393530767, "grad_norm": 0.3344896760909856, "learning_rate": 7.426024279210926e-06, "loss": 0.2042, "step": 15225 }, { "epoch": 2.5994708543142444, "grad_norm": 0.3363896237548711, "learning_rate": 7.4102175012645424e-06, "loss": 0.2085, "step": 15230 }, { "epoch": 2.6003243150977213, "grad_norm": 0.3180082238853291, "learning_rate": 7.39441072331816e-06, "loss": 0.2197, "step": 15235 }, { "epoch": 2.6011777758811983, "grad_norm": 0.3153433434191373, "learning_rate": 7.378603945371776e-06, "loss": 0.2159, "step": 15240 }, { "epoch": 2.602031236664675, "grad_norm": 0.27190987498917757, "learning_rate": 7.362797167425392e-06, "loss": 0.1975, "step": 15245 }, { "epoch": 2.602884697448152, "grad_norm": 0.3310039369957887, "learning_rate": 7.34699038947901e-06, "loss": 0.2213, "step": 15250 }, { "epoch": 2.6037381582316295, "grad_norm": 0.2590244680274066, "learning_rate": 7.3311836115326255e-06, "loss": 0.2077, "step": 15255 }, { "epoch": 2.6045916190151064, "grad_norm": 0.3165724296311588, "learning_rate": 7.315376833586242e-06, "loss": 0.1975, "step": 15260 }, { "epoch": 2.6054450797985833, "grad_norm": 0.29535455135177285, "learning_rate": 7.2995700556398596e-06, "loss": 0.2219, "step": 15265 }, { "epoch": 2.6062985405820602, "grad_norm": 0.3658392112896942, "learning_rate": 7.283763277693475e-06, "loss": 0.1965, "step": 15270 }, { "epoch": 2.607152001365537, "grad_norm": 0.3018726520748986, "learning_rate": 7.267956499747092e-06, "loss": 0.2171, "step": 15275 }, { "epoch": 2.6080054621490145, "grad_norm": 0.30693165930814226, "learning_rate": 7.252149721800708e-06, "loss": 0.2051, "step": 15280 }, { "epoch": 2.608858922932491, "grad_norm": 0.315156615022672, "learning_rate": 7.236342943854325e-06, "loss": 0.2061, "step": 15285 }, { "epoch": 2.6097123837159684, "grad_norm": 0.2853729952092828, "learning_rate": 7.220536165907942e-06, "loss": 0.2085, "step": 15290 }, { "epoch": 2.6105658444994453, "grad_norm": 0.27107885790572606, "learning_rate": 7.2047293879615575e-06, "loss": 0.1987, "step": 15295 }, { "epoch": 2.611419305282922, "grad_norm": 0.3408287993502234, "learning_rate": 7.188922610015175e-06, "loss": 0.2052, "step": 15300 }, { "epoch": 2.612272766066399, "grad_norm": 0.28508529823714296, "learning_rate": 7.173115832068792e-06, "loss": 0.1964, "step": 15305 }, { "epoch": 2.613126226849876, "grad_norm": 0.2930652493637157, "learning_rate": 7.157309054122407e-06, "loss": 0.2101, "step": 15310 }, { "epoch": 2.6139796876333534, "grad_norm": 0.2728880329622997, "learning_rate": 7.141502276176025e-06, "loss": 0.2051, "step": 15315 }, { "epoch": 2.6148331484168303, "grad_norm": 0.3113764179708425, "learning_rate": 7.1256954982296414e-06, "loss": 0.1992, "step": 15320 }, { "epoch": 2.6156866092003073, "grad_norm": 0.3283398218180235, "learning_rate": 7.109888720283257e-06, "loss": 0.2241, "step": 15325 }, { "epoch": 2.616540069983784, "grad_norm": 0.29532716359975925, "learning_rate": 7.094081942336875e-06, "loss": 0.1986, "step": 15330 }, { "epoch": 2.617393530767261, "grad_norm": 0.28764541686073714, "learning_rate": 7.078275164390491e-06, "loss": 0.2062, "step": 15335 }, { "epoch": 2.6182469915507385, "grad_norm": 0.3055719497318647, "learning_rate": 7.062468386444107e-06, "loss": 0.1811, "step": 15340 }, { "epoch": 2.6191004523342154, "grad_norm": 0.30311807665800294, "learning_rate": 7.0466616084977245e-06, "loss": 0.1928, "step": 15345 }, { "epoch": 2.6199539131176923, "grad_norm": 0.3261602762919266, "learning_rate": 7.030854830551341e-06, "loss": 0.2106, "step": 15350 }, { "epoch": 2.6208073739011692, "grad_norm": 0.26735675980600054, "learning_rate": 7.015048052604957e-06, "loss": 0.2172, "step": 15355 }, { "epoch": 2.621660834684646, "grad_norm": 0.29000968451835973, "learning_rate": 6.999241274658574e-06, "loss": 0.1867, "step": 15360 }, { "epoch": 2.622514295468123, "grad_norm": 0.30899734026472775, "learning_rate": 6.983434496712191e-06, "loss": 0.1963, "step": 15365 }, { "epoch": 2.6233677562516, "grad_norm": 0.32407414758867287, "learning_rate": 6.967627718765807e-06, "loss": 0.2204, "step": 15370 }, { "epoch": 2.6242212170350774, "grad_norm": 0.31595733340417603, "learning_rate": 6.951820940819424e-06, "loss": 0.2103, "step": 15375 }, { "epoch": 2.6250746778185543, "grad_norm": 0.3034042659404631, "learning_rate": 6.936014162873041e-06, "loss": 0.2001, "step": 15380 }, { "epoch": 2.6259281386020312, "grad_norm": 0.27394162515330894, "learning_rate": 6.9202073849266565e-06, "loss": 0.2125, "step": 15385 }, { "epoch": 2.626781599385508, "grad_norm": 0.3141763322065136, "learning_rate": 6.904400606980274e-06, "loss": 0.1979, "step": 15390 }, { "epoch": 2.627635060168985, "grad_norm": 0.30236550320567884, "learning_rate": 6.888593829033891e-06, "loss": 0.2021, "step": 15395 }, { "epoch": 2.6284885209524624, "grad_norm": 0.26995960363152594, "learning_rate": 6.872787051087506e-06, "loss": 0.2027, "step": 15400 }, { "epoch": 2.6293419817359394, "grad_norm": 0.274436493271229, "learning_rate": 6.856980273141123e-06, "loss": 0.2072, "step": 15405 }, { "epoch": 2.6301954425194163, "grad_norm": 0.29157952970806134, "learning_rate": 6.84117349519474e-06, "loss": 0.1972, "step": 15410 }, { "epoch": 2.631048903302893, "grad_norm": 0.30615435879995534, "learning_rate": 6.825366717248356e-06, "loss": 0.2117, "step": 15415 }, { "epoch": 2.63190236408637, "grad_norm": 0.3185211790273233, "learning_rate": 6.809559939301973e-06, "loss": 0.2073, "step": 15420 }, { "epoch": 2.6327558248698475, "grad_norm": 0.31439981528916994, "learning_rate": 6.79375316135559e-06, "loss": 0.2076, "step": 15425 }, { "epoch": 2.633609285653324, "grad_norm": 0.3182605676201803, "learning_rate": 6.777946383409206e-06, "loss": 0.1905, "step": 15430 }, { "epoch": 2.6344627464368013, "grad_norm": 0.32534945169875334, "learning_rate": 6.762139605462823e-06, "loss": 0.2004, "step": 15435 }, { "epoch": 2.6353162072202783, "grad_norm": 0.34354520022360613, "learning_rate": 6.74633282751644e-06, "loss": 0.2219, "step": 15440 }, { "epoch": 2.636169668003755, "grad_norm": 0.29662726735514144, "learning_rate": 6.730526049570056e-06, "loss": 0.2165, "step": 15445 }, { "epoch": 2.637023128787232, "grad_norm": 0.3433343043387418, "learning_rate": 6.7147192716236725e-06, "loss": 0.2028, "step": 15450 }, { "epoch": 2.637876589570709, "grad_norm": 0.30010261633004603, "learning_rate": 6.69891249367729e-06, "loss": 0.2111, "step": 15455 }, { "epoch": 2.6387300503541864, "grad_norm": 0.26183740987727955, "learning_rate": 6.683105715730906e-06, "loss": 0.2145, "step": 15460 }, { "epoch": 2.6395835111376633, "grad_norm": 0.3037087296013404, "learning_rate": 6.6672989377845214e-06, "loss": 0.21, "step": 15465 }, { "epoch": 2.6404369719211402, "grad_norm": 0.332997259238743, "learning_rate": 6.65149215983814e-06, "loss": 0.2066, "step": 15470 }, { "epoch": 2.641290432704617, "grad_norm": 0.3044686483800474, "learning_rate": 6.6356853818917555e-06, "loss": 0.2122, "step": 15475 }, { "epoch": 2.642143893488094, "grad_norm": 0.29717131286129334, "learning_rate": 6.619878603945371e-06, "loss": 0.235, "step": 15480 }, { "epoch": 2.6429973542715715, "grad_norm": 0.2656517293747368, "learning_rate": 6.60407182599899e-06, "loss": 0.2036, "step": 15485 }, { "epoch": 2.6438508150550484, "grad_norm": 0.31253544063875444, "learning_rate": 6.588265048052605e-06, "loss": 0.2223, "step": 15490 }, { "epoch": 2.6447042758385253, "grad_norm": 0.30073550953081923, "learning_rate": 6.572458270106221e-06, "loss": 0.1986, "step": 15495 }, { "epoch": 2.645557736622002, "grad_norm": 0.3988868599490686, "learning_rate": 6.556651492159839e-06, "loss": 0.2028, "step": 15500 }, { "epoch": 2.646411197405479, "grad_norm": 0.28021425635877006, "learning_rate": 6.540844714213455e-06, "loss": 0.2143, "step": 15505 }, { "epoch": 2.647264658188956, "grad_norm": 0.33366423286196717, "learning_rate": 6.525037936267071e-06, "loss": 0.2034, "step": 15510 }, { "epoch": 2.648118118972433, "grad_norm": 0.3228996679954544, "learning_rate": 6.509231158320689e-06, "loss": 0.2102, "step": 15515 }, { "epoch": 2.6489715797559104, "grad_norm": 0.3365633019183645, "learning_rate": 6.493424380374305e-06, "loss": 0.2031, "step": 15520 }, { "epoch": 2.6498250405393873, "grad_norm": 0.29149351721900674, "learning_rate": 6.477617602427921e-06, "loss": 0.2075, "step": 15525 }, { "epoch": 2.650678501322864, "grad_norm": 0.3097685877406625, "learning_rate": 6.461810824481539e-06, "loss": 0.2031, "step": 15530 }, { "epoch": 2.651531962106341, "grad_norm": 0.3125270251011444, "learning_rate": 6.446004046535155e-06, "loss": 0.1867, "step": 15535 }, { "epoch": 2.652385422889818, "grad_norm": 0.2582248079429999, "learning_rate": 6.430197268588771e-06, "loss": 0.2337, "step": 15540 }, { "epoch": 2.6532388836732954, "grad_norm": 0.30986448716963017, "learning_rate": 6.414390490642387e-06, "loss": 0.209, "step": 15545 }, { "epoch": 2.6540923444567723, "grad_norm": 0.2637078268565643, "learning_rate": 6.398583712696005e-06, "loss": 0.2173, "step": 15550 }, { "epoch": 2.6549458052402493, "grad_norm": 0.23639308889577487, "learning_rate": 6.3827769347496204e-06, "loss": 0.1985, "step": 15555 }, { "epoch": 2.655799266023726, "grad_norm": 0.39920571998962406, "learning_rate": 6.366970156803237e-06, "loss": 0.177, "step": 15560 }, { "epoch": 2.656652726807203, "grad_norm": 0.3000048210090101, "learning_rate": 6.3511633788568545e-06, "loss": 0.2271, "step": 15565 }, { "epoch": 2.6575061875906805, "grad_norm": 0.3817330741743107, "learning_rate": 6.33535660091047e-06, "loss": 0.206, "step": 15570 }, { "epoch": 2.658359648374157, "grad_norm": 0.28235048865080575, "learning_rate": 6.319549822964087e-06, "loss": 0.204, "step": 15575 }, { "epoch": 2.6592131091576343, "grad_norm": 0.47010823336711, "learning_rate": 6.303743045017704e-06, "loss": 0.2077, "step": 15580 }, { "epoch": 2.6600665699411112, "grad_norm": 0.3056642129427295, "learning_rate": 6.28793626707132e-06, "loss": 0.1846, "step": 15585 }, { "epoch": 2.660920030724588, "grad_norm": 0.29793968677182336, "learning_rate": 6.272129489124937e-06, "loss": 0.2123, "step": 15590 }, { "epoch": 2.661773491508065, "grad_norm": 0.2817216823901349, "learning_rate": 6.256322711178554e-06, "loss": 0.2038, "step": 15595 }, { "epoch": 2.662626952291542, "grad_norm": 0.2636894234107654, "learning_rate": 6.24051593323217e-06, "loss": 0.2277, "step": 15600 }, { "epoch": 2.6634804130750194, "grad_norm": 0.27403896058342087, "learning_rate": 6.224709155285787e-06, "loss": 0.2091, "step": 15605 }, { "epoch": 2.6643338738584963, "grad_norm": 0.34231692210945686, "learning_rate": 6.208902377339403e-06, "loss": 0.22, "step": 15610 }, { "epoch": 2.665187334641973, "grad_norm": 0.332076929230426, "learning_rate": 6.19309559939302e-06, "loss": 0.2078, "step": 15615 }, { "epoch": 2.66604079542545, "grad_norm": 0.3425282381296489, "learning_rate": 6.177288821446637e-06, "loss": 0.1838, "step": 15620 }, { "epoch": 2.666894256208927, "grad_norm": 0.3045284717166107, "learning_rate": 6.161482043500253e-06, "loss": 0.2052, "step": 15625 }, { "epoch": 2.6677477169924044, "grad_norm": 0.3094308949803082, "learning_rate": 6.14567526555387e-06, "loss": 0.206, "step": 15630 }, { "epoch": 2.6686011777758814, "grad_norm": 0.31696772138407087, "learning_rate": 6.129868487607486e-06, "loss": 0.2018, "step": 15635 }, { "epoch": 2.6694546385593583, "grad_norm": 0.34170415417053357, "learning_rate": 6.114061709661103e-06, "loss": 0.1979, "step": 15640 }, { "epoch": 2.670308099342835, "grad_norm": 0.32370875694307244, "learning_rate": 6.098254931714719e-06, "loss": 0.2166, "step": 15645 }, { "epoch": 2.671161560126312, "grad_norm": 0.2827372595431929, "learning_rate": 6.082448153768336e-06, "loss": 0.2103, "step": 15650 }, { "epoch": 2.672015020909789, "grad_norm": 0.28179192302912826, "learning_rate": 6.066641375821953e-06, "loss": 0.2104, "step": 15655 }, { "epoch": 2.672868481693266, "grad_norm": 0.2776720915643278, "learning_rate": 6.050834597875569e-06, "loss": 0.1998, "step": 15660 }, { "epoch": 2.6737219424767433, "grad_norm": 0.3058347995453126, "learning_rate": 6.035027819929186e-06, "loss": 0.2, "step": 15665 }, { "epoch": 2.6745754032602203, "grad_norm": 0.2904147182363896, "learning_rate": 6.0192210419828025e-06, "loss": 0.2002, "step": 15670 }, { "epoch": 2.675428864043697, "grad_norm": 0.3388082472509339, "learning_rate": 6.003414264036419e-06, "loss": 0.2021, "step": 15675 }, { "epoch": 2.676282324827174, "grad_norm": 0.27099812587565064, "learning_rate": 5.987607486090036e-06, "loss": 0.2096, "step": 15680 }, { "epoch": 2.677135785610651, "grad_norm": 0.34015300634990575, "learning_rate": 5.971800708143652e-06, "loss": 0.1957, "step": 15685 }, { "epoch": 2.6779892463941284, "grad_norm": 0.30592673359589784, "learning_rate": 5.955993930197269e-06, "loss": 0.2182, "step": 15690 }, { "epoch": 2.6788427071776053, "grad_norm": 0.27723520974175125, "learning_rate": 5.9401871522508855e-06, "loss": 0.2204, "step": 15695 }, { "epoch": 2.6796961679610822, "grad_norm": 0.3598478872387859, "learning_rate": 5.924380374304502e-06, "loss": 0.2061, "step": 15700 }, { "epoch": 2.680549628744559, "grad_norm": 0.3597010040974615, "learning_rate": 5.908573596358119e-06, "loss": 0.2327, "step": 15705 }, { "epoch": 2.681403089528036, "grad_norm": 0.27401196505670483, "learning_rate": 5.892766818411735e-06, "loss": 0.217, "step": 15710 }, { "epoch": 2.6822565503115134, "grad_norm": 0.287236035642295, "learning_rate": 5.876960040465352e-06, "loss": 0.213, "step": 15715 }, { "epoch": 2.68311001109499, "grad_norm": 0.3386853607935798, "learning_rate": 5.861153262518968e-06, "loss": 0.187, "step": 15720 }, { "epoch": 2.6839634718784673, "grad_norm": 0.32286350171516687, "learning_rate": 5.845346484572585e-06, "loss": 0.2029, "step": 15725 }, { "epoch": 2.684816932661944, "grad_norm": 0.29815403816246877, "learning_rate": 5.829539706626202e-06, "loss": 0.2212, "step": 15730 }, { "epoch": 2.685670393445421, "grad_norm": 0.2962099719461072, "learning_rate": 5.8137329286798176e-06, "loss": 0.2161, "step": 15735 }, { "epoch": 2.686523854228898, "grad_norm": 0.3243726622579163, "learning_rate": 5.797926150733435e-06, "loss": 0.1998, "step": 15740 }, { "epoch": 2.687377315012375, "grad_norm": 0.3157598297003378, "learning_rate": 5.782119372787052e-06, "loss": 0.2114, "step": 15745 }, { "epoch": 2.6882307757958523, "grad_norm": 0.29393720014586416, "learning_rate": 5.766312594840667e-06, "loss": 0.1977, "step": 15750 }, { "epoch": 2.6890842365793293, "grad_norm": 0.314285622042596, "learning_rate": 5.750505816894285e-06, "loss": 0.1808, "step": 15755 }, { "epoch": 2.689937697362806, "grad_norm": 0.2899466708389144, "learning_rate": 5.7346990389479015e-06, "loss": 0.2217, "step": 15760 }, { "epoch": 2.690791158146283, "grad_norm": 0.28977579469741954, "learning_rate": 5.718892261001517e-06, "loss": 0.191, "step": 15765 }, { "epoch": 2.69164461892976, "grad_norm": 0.3295402325944783, "learning_rate": 5.703085483055135e-06, "loss": 0.1779, "step": 15770 }, { "epoch": 2.6924980797132374, "grad_norm": 0.2951235349284477, "learning_rate": 5.6872787051087505e-06, "loss": 0.1937, "step": 15775 }, { "epoch": 2.6933515404967143, "grad_norm": 0.3003875114807517, "learning_rate": 5.671471927162367e-06, "loss": 0.2158, "step": 15780 }, { "epoch": 2.6942050012801912, "grad_norm": 0.26242706747741734, "learning_rate": 5.6556651492159845e-06, "loss": 0.2307, "step": 15785 }, { "epoch": 2.695058462063668, "grad_norm": 0.28334423791680124, "learning_rate": 5.6398583712696e-06, "loss": 0.1976, "step": 15790 }, { "epoch": 2.695911922847145, "grad_norm": 0.327060207889704, "learning_rate": 5.624051593323217e-06, "loss": 0.1954, "step": 15795 }, { "epoch": 2.696765383630622, "grad_norm": 0.28167258847097004, "learning_rate": 5.608244815376834e-06, "loss": 0.1991, "step": 15800 }, { "epoch": 2.697618844414099, "grad_norm": 0.29953960888869297, "learning_rate": 5.59243803743045e-06, "loss": 0.1884, "step": 15805 }, { "epoch": 2.6984723051975763, "grad_norm": 0.283199211575958, "learning_rate": 5.576631259484067e-06, "loss": 0.2177, "step": 15810 }, { "epoch": 2.6993257659810532, "grad_norm": 0.34582837627543384, "learning_rate": 5.560824481537684e-06, "loss": 0.2096, "step": 15815 }, { "epoch": 2.70017922676453, "grad_norm": 0.28602625621986694, "learning_rate": 5.5450177035913e-06, "loss": 0.2016, "step": 15820 }, { "epoch": 2.701032687548007, "grad_norm": 0.2923202721559455, "learning_rate": 5.5292109256449166e-06, "loss": 0.1944, "step": 15825 }, { "epoch": 2.701886148331484, "grad_norm": 0.3260018790161068, "learning_rate": 5.513404147698533e-06, "loss": 0.2277, "step": 15830 }, { "epoch": 2.7027396091149614, "grad_norm": 0.301555063884695, "learning_rate": 5.49759736975215e-06, "loss": 0.2088, "step": 15835 }, { "epoch": 2.7035930698984383, "grad_norm": 0.30799832411838113, "learning_rate": 5.481790591805766e-06, "loss": 0.215, "step": 15840 }, { "epoch": 2.704446530681915, "grad_norm": 0.2892919344858003, "learning_rate": 5.465983813859383e-06, "loss": 0.196, "step": 15845 }, { "epoch": 2.705299991465392, "grad_norm": 0.3003370065797013, "learning_rate": 5.450177035913e-06, "loss": 0.2152, "step": 15850 }, { "epoch": 2.706153452248869, "grad_norm": 0.265825736658078, "learning_rate": 5.434370257966616e-06, "loss": 0.219, "step": 15855 }, { "epoch": 2.7070069130323464, "grad_norm": 0.30490533672697084, "learning_rate": 5.418563480020233e-06, "loss": 0.2137, "step": 15860 }, { "epoch": 2.707860373815823, "grad_norm": 0.29502865152948154, "learning_rate": 5.4027567020738494e-06, "loss": 0.2209, "step": 15865 }, { "epoch": 2.7087138345993003, "grad_norm": 0.2988078065829238, "learning_rate": 5.386949924127466e-06, "loss": 0.2054, "step": 15870 }, { "epoch": 2.709567295382777, "grad_norm": 0.3170586777087799, "learning_rate": 5.371143146181083e-06, "loss": 0.1924, "step": 15875 }, { "epoch": 2.710420756166254, "grad_norm": 0.28253282587212214, "learning_rate": 5.355336368234699e-06, "loss": 0.2058, "step": 15880 }, { "epoch": 2.711274216949731, "grad_norm": 0.3031766184548689, "learning_rate": 5.339529590288316e-06, "loss": 0.2095, "step": 15885 }, { "epoch": 2.712127677733208, "grad_norm": 0.272267480092774, "learning_rate": 5.3237228123419325e-06, "loss": 0.1953, "step": 15890 }, { "epoch": 2.7129811385166853, "grad_norm": 0.339557448842918, "learning_rate": 5.307916034395549e-06, "loss": 0.2034, "step": 15895 }, { "epoch": 2.7138345993001622, "grad_norm": 0.29187630276993987, "learning_rate": 5.292109256449166e-06, "loss": 0.2176, "step": 15900 }, { "epoch": 2.714688060083639, "grad_norm": 0.2917392973105694, "learning_rate": 5.276302478502782e-06, "loss": 0.2248, "step": 15905 }, { "epoch": 2.715541520867116, "grad_norm": 0.24696996504170068, "learning_rate": 5.260495700556399e-06, "loss": 0.1883, "step": 15910 }, { "epoch": 2.716394981650593, "grad_norm": 0.2996842880979571, "learning_rate": 5.2446889226100155e-06, "loss": 0.2039, "step": 15915 }, { "epoch": 2.7172484424340704, "grad_norm": 0.29812811745542483, "learning_rate": 5.228882144663632e-06, "loss": 0.2119, "step": 15920 }, { "epoch": 2.7181019032175473, "grad_norm": 0.3476924281400017, "learning_rate": 5.213075366717249e-06, "loss": 0.2076, "step": 15925 }, { "epoch": 2.718955364001024, "grad_norm": 0.28758750350519297, "learning_rate": 5.197268588770865e-06, "loss": 0.2201, "step": 15930 }, { "epoch": 2.719808824784501, "grad_norm": 0.29864757313306556, "learning_rate": 5.181461810824482e-06, "loss": 0.2142, "step": 15935 }, { "epoch": 2.720662285567978, "grad_norm": 0.3584422686370773, "learning_rate": 5.165655032878099e-06, "loss": 0.1923, "step": 15940 }, { "epoch": 2.721515746351455, "grad_norm": 0.2991396969647478, "learning_rate": 5.149848254931715e-06, "loss": 0.2028, "step": 15945 }, { "epoch": 2.722369207134932, "grad_norm": 0.318756758203459, "learning_rate": 5.134041476985332e-06, "loss": 0.2023, "step": 15950 }, { "epoch": 2.7232226679184093, "grad_norm": 0.2957156239143027, "learning_rate": 5.1182346990389484e-06, "loss": 0.1995, "step": 15955 }, { "epoch": 2.724076128701886, "grad_norm": 0.2936814113512672, "learning_rate": 5.102427921092565e-06, "loss": 0.2195, "step": 15960 }, { "epoch": 2.724929589485363, "grad_norm": 0.3224755952331169, "learning_rate": 5.086621143146182e-06, "loss": 0.1853, "step": 15965 }, { "epoch": 2.72578305026884, "grad_norm": 0.31256703297126887, "learning_rate": 5.070814365199797e-06, "loss": 0.2118, "step": 15970 }, { "epoch": 2.726636511052317, "grad_norm": 0.2862751029245076, "learning_rate": 5.055007587253414e-06, "loss": 0.1949, "step": 15975 }, { "epoch": 2.7274899718357943, "grad_norm": 0.36107062849039523, "learning_rate": 5.0392008093070315e-06, "loss": 0.2045, "step": 15980 }, { "epoch": 2.7283434326192713, "grad_norm": 0.32539273576377964, "learning_rate": 5.023394031360647e-06, "loss": 0.2084, "step": 15985 }, { "epoch": 2.729196893402748, "grad_norm": 0.30789061183738736, "learning_rate": 5.007587253414264e-06, "loss": 0.2202, "step": 15990 }, { "epoch": 2.730050354186225, "grad_norm": 0.3192768101749474, "learning_rate": 4.991780475467881e-06, "loss": 0.2069, "step": 15995 }, { "epoch": 2.730903814969702, "grad_norm": 0.3031584702692578, "learning_rate": 4.975973697521497e-06, "loss": 0.2091, "step": 16000 }, { "epoch": 2.7317572757531794, "grad_norm": 0.2898511226779278, "learning_rate": 4.960166919575114e-06, "loss": 0.2134, "step": 16005 }, { "epoch": 2.732610736536656, "grad_norm": 0.3207841588029926, "learning_rate": 4.944360141628731e-06, "loss": 0.2145, "step": 16010 }, { "epoch": 2.7334641973201332, "grad_norm": 0.28045072333959165, "learning_rate": 4.928553363682347e-06, "loss": 0.2005, "step": 16015 }, { "epoch": 2.73431765810361, "grad_norm": 0.2696326870346685, "learning_rate": 4.9127465857359635e-06, "loss": 0.2432, "step": 16020 }, { "epoch": 2.735171118887087, "grad_norm": 0.3155349303191132, "learning_rate": 4.896939807789581e-06, "loss": 0.1988, "step": 16025 }, { "epoch": 2.736024579670564, "grad_norm": 0.30933108777712554, "learning_rate": 4.881133029843197e-06, "loss": 0.213, "step": 16030 }, { "epoch": 2.736878040454041, "grad_norm": 0.2793306296723204, "learning_rate": 4.865326251896813e-06, "loss": 0.218, "step": 16035 }, { "epoch": 2.7377315012375183, "grad_norm": 0.2775018842178003, "learning_rate": 4.84951947395043e-06, "loss": 0.2141, "step": 16040 }, { "epoch": 2.738584962020995, "grad_norm": 0.2798357843007031, "learning_rate": 4.833712696004047e-06, "loss": 0.2091, "step": 16045 }, { "epoch": 2.739438422804472, "grad_norm": 0.2817797852834651, "learning_rate": 4.817905918057663e-06, "loss": 0.2161, "step": 16050 }, { "epoch": 2.740291883587949, "grad_norm": 0.3186746867997902, "learning_rate": 4.80209914011128e-06, "loss": 0.2236, "step": 16055 }, { "epoch": 2.741145344371426, "grad_norm": 0.30384732254580665, "learning_rate": 4.786292362164896e-06, "loss": 0.2011, "step": 16060 }, { "epoch": 2.7419988051549034, "grad_norm": 0.27104323924708495, "learning_rate": 4.770485584218513e-06, "loss": 0.1822, "step": 16065 }, { "epoch": 2.7428522659383803, "grad_norm": 0.3180874651620502, "learning_rate": 4.75467880627213e-06, "loss": 0.1952, "step": 16070 }, { "epoch": 2.743705726721857, "grad_norm": 0.33675082412827373, "learning_rate": 4.738872028325746e-06, "loss": 0.1947, "step": 16075 }, { "epoch": 2.744559187505334, "grad_norm": 0.3090309139326914, "learning_rate": 4.723065250379363e-06, "loss": 0.2019, "step": 16080 }, { "epoch": 2.745412648288811, "grad_norm": 0.3354048592078423, "learning_rate": 4.7072584724329795e-06, "loss": 0.213, "step": 16085 }, { "epoch": 2.746266109072288, "grad_norm": 0.3064407948256961, "learning_rate": 4.691451694486596e-06, "loss": 0.1874, "step": 16090 }, { "epoch": 2.747119569855765, "grad_norm": 0.30432003101309846, "learning_rate": 4.675644916540213e-06, "loss": 0.1943, "step": 16095 }, { "epoch": 2.7479730306392423, "grad_norm": 0.2789199192428393, "learning_rate": 4.659838138593829e-06, "loss": 0.1976, "step": 16100 }, { "epoch": 2.748826491422719, "grad_norm": 0.27543949433539877, "learning_rate": 4.644031360647446e-06, "loss": 0.2016, "step": 16105 }, { "epoch": 2.749679952206196, "grad_norm": 0.28387823821849567, "learning_rate": 4.6282245827010625e-06, "loss": 0.1992, "step": 16110 }, { "epoch": 2.750533412989673, "grad_norm": 0.33527331903964475, "learning_rate": 4.612417804754679e-06, "loss": 0.2112, "step": 16115 }, { "epoch": 2.75138687377315, "grad_norm": 0.2877846385546269, "learning_rate": 4.596611026808296e-06, "loss": 0.2134, "step": 16120 }, { "epoch": 2.7522403345566273, "grad_norm": 0.30175215165238184, "learning_rate": 4.580804248861912e-06, "loss": 0.2258, "step": 16125 }, { "epoch": 2.7530937953401042, "grad_norm": 0.25558115438781764, "learning_rate": 4.564997470915529e-06, "loss": 0.2259, "step": 16130 }, { "epoch": 2.753947256123581, "grad_norm": 0.31739823758913244, "learning_rate": 4.5491906929691456e-06, "loss": 0.231, "step": 16135 }, { "epoch": 2.754800716907058, "grad_norm": 0.29867683255243016, "learning_rate": 4.533383915022762e-06, "loss": 0.1898, "step": 16140 }, { "epoch": 2.755654177690535, "grad_norm": 0.2952948311311863, "learning_rate": 4.517577137076379e-06, "loss": 0.2146, "step": 16145 }, { "epoch": 2.7565076384740124, "grad_norm": 0.3429240600723172, "learning_rate": 4.501770359129995e-06, "loss": 0.2157, "step": 16150 }, { "epoch": 2.757361099257489, "grad_norm": 0.2725625385605698, "learning_rate": 4.485963581183612e-06, "loss": 0.1941, "step": 16155 }, { "epoch": 2.758214560040966, "grad_norm": 0.28910953270378786, "learning_rate": 4.470156803237229e-06, "loss": 0.2056, "step": 16160 }, { "epoch": 2.759068020824443, "grad_norm": 0.31548578476347033, "learning_rate": 4.454350025290844e-06, "loss": 0.1995, "step": 16165 }, { "epoch": 2.75992148160792, "grad_norm": 0.25362801092678644, "learning_rate": 4.438543247344462e-06, "loss": 0.2216, "step": 16170 }, { "epoch": 2.760774942391397, "grad_norm": 0.30441294327892104, "learning_rate": 4.4227364693980785e-06, "loss": 0.2162, "step": 16175 }, { "epoch": 2.761628403174874, "grad_norm": 0.30758597720365594, "learning_rate": 4.406929691451694e-06, "loss": 0.2286, "step": 16180 }, { "epoch": 2.7624818639583513, "grad_norm": 0.28267565428715535, "learning_rate": 4.391122913505312e-06, "loss": 0.2108, "step": 16185 }, { "epoch": 2.763335324741828, "grad_norm": 0.27124190146781657, "learning_rate": 4.375316135558928e-06, "loss": 0.2058, "step": 16190 }, { "epoch": 2.764188785525305, "grad_norm": 0.28288210348007214, "learning_rate": 4.359509357612544e-06, "loss": 0.2114, "step": 16195 }, { "epoch": 2.765042246308782, "grad_norm": 0.31984138475703, "learning_rate": 4.3437025796661615e-06, "loss": 0.2031, "step": 16200 }, { "epoch": 2.765895707092259, "grad_norm": 0.28151790527437137, "learning_rate": 4.327895801719778e-06, "loss": 0.1947, "step": 16205 }, { "epoch": 2.7667491678757363, "grad_norm": 0.2974605836120453, "learning_rate": 4.312089023773394e-06, "loss": 0.2123, "step": 16210 }, { "epoch": 2.7676026286592132, "grad_norm": 0.28951020542488587, "learning_rate": 4.296282245827011e-06, "loss": 0.2093, "step": 16215 }, { "epoch": 2.76845608944269, "grad_norm": 0.2791895064496862, "learning_rate": 4.280475467880628e-06, "loss": 0.2081, "step": 16220 }, { "epoch": 2.769309550226167, "grad_norm": 0.3039387553563788, "learning_rate": 4.264668689934244e-06, "loss": 0.2095, "step": 16225 }, { "epoch": 2.770163011009644, "grad_norm": 0.30905905091318814, "learning_rate": 4.24886191198786e-06, "loss": 0.2215, "step": 16230 }, { "epoch": 2.7710164717931214, "grad_norm": 0.3073661636652629, "learning_rate": 4.233055134041477e-06, "loss": 0.2009, "step": 16235 }, { "epoch": 2.771869932576598, "grad_norm": 0.3146859099961923, "learning_rate": 4.2172483560950935e-06, "loss": 0.2179, "step": 16240 }, { "epoch": 2.7727233933600752, "grad_norm": 0.2911080468873359, "learning_rate": 4.20144157814871e-06, "loss": 0.2139, "step": 16245 }, { "epoch": 2.773576854143552, "grad_norm": 0.2646876986364466, "learning_rate": 4.185634800202327e-06, "loss": 0.2104, "step": 16250 }, { "epoch": 2.774430314927029, "grad_norm": 0.3122318685773588, "learning_rate": 4.169828022255943e-06, "loss": 0.2147, "step": 16255 }, { "epoch": 2.775283775710506, "grad_norm": 0.27347718924284165, "learning_rate": 4.15402124430956e-06, "loss": 0.2257, "step": 16260 }, { "epoch": 2.776137236493983, "grad_norm": 0.3145821470334454, "learning_rate": 4.138214466363177e-06, "loss": 0.196, "step": 16265 }, { "epoch": 2.7769906972774603, "grad_norm": 0.3249047955184392, "learning_rate": 4.122407688416793e-06, "loss": 0.1862, "step": 16270 }, { "epoch": 2.777844158060937, "grad_norm": 0.3132789270646679, "learning_rate": 4.10660091047041e-06, "loss": 0.2023, "step": 16275 }, { "epoch": 2.778697618844414, "grad_norm": 0.2759064527081392, "learning_rate": 4.0907941325240264e-06, "loss": 0.2072, "step": 16280 }, { "epoch": 2.779551079627891, "grad_norm": 0.3306138346721802, "learning_rate": 4.074987354577643e-06, "loss": 0.1917, "step": 16285 }, { "epoch": 2.780404540411368, "grad_norm": 0.3069234401158518, "learning_rate": 4.05918057663126e-06, "loss": 0.2043, "step": 16290 }, { "epoch": 2.7812580011948453, "grad_norm": 0.2830139081835964, "learning_rate": 4.043373798684876e-06, "loss": 0.2042, "step": 16295 }, { "epoch": 2.782111461978322, "grad_norm": 0.3100455277406796, "learning_rate": 4.027567020738493e-06, "loss": 0.203, "step": 16300 }, { "epoch": 2.782964922761799, "grad_norm": 0.2781892234707754, "learning_rate": 4.0117602427921095e-06, "loss": 0.2125, "step": 16305 }, { "epoch": 2.783818383545276, "grad_norm": 0.2951322503588762, "learning_rate": 3.995953464845726e-06, "loss": 0.2022, "step": 16310 }, { "epoch": 2.784671844328753, "grad_norm": 0.3040288136011962, "learning_rate": 3.980146686899343e-06, "loss": 0.1969, "step": 16315 }, { "epoch": 2.78552530511223, "grad_norm": 0.2834119380276239, "learning_rate": 3.964339908952959e-06, "loss": 0.1962, "step": 16320 }, { "epoch": 2.786378765895707, "grad_norm": 0.3139452952970753, "learning_rate": 3.948533131006576e-06, "loss": 0.2109, "step": 16325 }, { "epoch": 2.7872322266791842, "grad_norm": 0.2696105563249733, "learning_rate": 3.9327263530601925e-06, "loss": 0.1984, "step": 16330 }, { "epoch": 2.788085687462661, "grad_norm": 0.3188958700085112, "learning_rate": 3.916919575113809e-06, "loss": 0.2068, "step": 16335 }, { "epoch": 2.788939148246138, "grad_norm": 0.29385092332451374, "learning_rate": 3.901112797167426e-06, "loss": 0.2136, "step": 16340 }, { "epoch": 2.789792609029615, "grad_norm": 0.2772574312730247, "learning_rate": 3.885306019221042e-06, "loss": 0.2143, "step": 16345 }, { "epoch": 2.790646069813092, "grad_norm": 0.3113105973895474, "learning_rate": 3.869499241274659e-06, "loss": 0.2088, "step": 16350 }, { "epoch": 2.7914995305965693, "grad_norm": 0.3233719449569847, "learning_rate": 3.853692463328276e-06, "loss": 0.2097, "step": 16355 }, { "epoch": 2.7923529913800462, "grad_norm": 0.2906146496242921, "learning_rate": 3.837885685381891e-06, "loss": 0.2202, "step": 16360 }, { "epoch": 2.793206452163523, "grad_norm": 0.2878742496109735, "learning_rate": 3.822078907435509e-06, "loss": 0.2129, "step": 16365 }, { "epoch": 2.794059912947, "grad_norm": 0.3024076148704142, "learning_rate": 3.8062721294891254e-06, "loss": 0.2138, "step": 16370 }, { "epoch": 2.794913373730477, "grad_norm": 0.3168736440886399, "learning_rate": 3.7904653515427416e-06, "loss": 0.1994, "step": 16375 }, { "epoch": 2.7957668345139544, "grad_norm": 0.29654294407581194, "learning_rate": 3.7746585735963582e-06, "loss": 0.2082, "step": 16380 }, { "epoch": 2.796620295297431, "grad_norm": 0.30573421322760413, "learning_rate": 3.7588517956499753e-06, "loss": 0.2054, "step": 16385 }, { "epoch": 2.797473756080908, "grad_norm": 0.3175298040187923, "learning_rate": 3.7430450177035914e-06, "loss": 0.203, "step": 16390 }, { "epoch": 2.798327216864385, "grad_norm": 0.30834082539723845, "learning_rate": 3.727238239757208e-06, "loss": 0.2121, "step": 16395 }, { "epoch": 2.799180677647862, "grad_norm": 0.30234632480758583, "learning_rate": 3.711431461810825e-06, "loss": 0.1917, "step": 16400 }, { "epoch": 2.800034138431339, "grad_norm": 0.32824340868810853, "learning_rate": 3.6956246838644413e-06, "loss": 0.2015, "step": 16405 }, { "epoch": 2.800887599214816, "grad_norm": 0.3278409491445729, "learning_rate": 3.679817905918058e-06, "loss": 0.2151, "step": 16410 }, { "epoch": 2.8017410599982933, "grad_norm": 0.3424866674069857, "learning_rate": 3.664011127971675e-06, "loss": 0.2166, "step": 16415 }, { "epoch": 2.80259452078177, "grad_norm": 0.2899712654810411, "learning_rate": 3.648204350025291e-06, "loss": 0.2196, "step": 16420 }, { "epoch": 2.803447981565247, "grad_norm": 0.3161328426670568, "learning_rate": 3.6323975720789077e-06, "loss": 0.2184, "step": 16425 }, { "epoch": 2.804301442348724, "grad_norm": 0.313798095829363, "learning_rate": 3.616590794132524e-06, "loss": 0.2039, "step": 16430 }, { "epoch": 2.805154903132201, "grad_norm": 0.3381255154041987, "learning_rate": 3.600784016186141e-06, "loss": 0.2089, "step": 16435 }, { "epoch": 2.8060083639156783, "grad_norm": 0.3026649242407844, "learning_rate": 3.5849772382397575e-06, "loss": 0.193, "step": 16440 }, { "epoch": 2.8068618246991552, "grad_norm": 0.24699501555534614, "learning_rate": 3.5691704602933737e-06, "loss": 0.2289, "step": 16445 }, { "epoch": 2.807715285482632, "grad_norm": 0.2711608640996708, "learning_rate": 3.5533636823469903e-06, "loss": 0.2066, "step": 16450 }, { "epoch": 2.808568746266109, "grad_norm": 0.3070561227061133, "learning_rate": 3.5375569044006074e-06, "loss": 0.1971, "step": 16455 }, { "epoch": 2.809422207049586, "grad_norm": 0.2538535206535511, "learning_rate": 3.5217501264542236e-06, "loss": 0.205, "step": 16460 }, { "epoch": 2.810275667833063, "grad_norm": 0.29937347404990705, "learning_rate": 3.50594334850784e-06, "loss": 0.217, "step": 16465 }, { "epoch": 2.81112912861654, "grad_norm": 0.28414922695586153, "learning_rate": 3.490136570561457e-06, "loss": 0.1915, "step": 16470 }, { "epoch": 2.811982589400017, "grad_norm": 0.2518892126604444, "learning_rate": 3.4743297926150734e-06, "loss": 0.1983, "step": 16475 }, { "epoch": 2.812836050183494, "grad_norm": 0.3122994989359656, "learning_rate": 3.45852301466869e-06, "loss": 0.1985, "step": 16480 }, { "epoch": 2.813689510966971, "grad_norm": 0.2613982352019185, "learning_rate": 3.442716236722307e-06, "loss": 0.2059, "step": 16485 }, { "epoch": 2.814542971750448, "grad_norm": 0.3163571315146126, "learning_rate": 3.4269094587759232e-06, "loss": 0.2164, "step": 16490 }, { "epoch": 2.815396432533925, "grad_norm": 0.3251756560396609, "learning_rate": 3.41110268082954e-06, "loss": 0.2297, "step": 16495 }, { "epoch": 2.8162498933174023, "grad_norm": 0.33151323781232633, "learning_rate": 3.395295902883156e-06, "loss": 0.205, "step": 16500 }, { "epoch": 2.817103354100879, "grad_norm": 0.3204999377329183, "learning_rate": 3.379489124936773e-06, "loss": 0.2025, "step": 16505 }, { "epoch": 2.817956814884356, "grad_norm": 0.3075233314297435, "learning_rate": 3.3636823469903897e-06, "loss": 0.2122, "step": 16510 }, { "epoch": 2.818810275667833, "grad_norm": 0.2789797433784303, "learning_rate": 3.347875569044006e-06, "loss": 0.2013, "step": 16515 }, { "epoch": 2.81966373645131, "grad_norm": 0.2747944094649088, "learning_rate": 3.332068791097623e-06, "loss": 0.2057, "step": 16520 }, { "epoch": 2.8205171972347873, "grad_norm": 0.31191000078565173, "learning_rate": 3.3162620131512395e-06, "loss": 0.2143, "step": 16525 }, { "epoch": 2.821370658018264, "grad_norm": 0.2924804828000546, "learning_rate": 3.3004552352048557e-06, "loss": 0.2003, "step": 16530 }, { "epoch": 2.822224118801741, "grad_norm": 0.28085022154844513, "learning_rate": 3.2846484572584727e-06, "loss": 0.1958, "step": 16535 }, { "epoch": 2.823077579585218, "grad_norm": 0.32084643314807854, "learning_rate": 3.2688416793120893e-06, "loss": 0.1905, "step": 16540 }, { "epoch": 2.823931040368695, "grad_norm": 0.2876213271325214, "learning_rate": 3.2530349013657055e-06, "loss": 0.2207, "step": 16545 }, { "epoch": 2.824784501152172, "grad_norm": 0.28002627123874135, "learning_rate": 3.2372281234193226e-06, "loss": 0.2158, "step": 16550 }, { "epoch": 2.825637961935649, "grad_norm": 0.3049040980544474, "learning_rate": 3.2214213454729387e-06, "loss": 0.1874, "step": 16555 }, { "epoch": 2.8264914227191262, "grad_norm": 0.2948931584904353, "learning_rate": 3.2056145675265554e-06, "loss": 0.2041, "step": 16560 }, { "epoch": 2.827344883502603, "grad_norm": 0.2940886671419153, "learning_rate": 3.1898077895801724e-06, "loss": 0.1927, "step": 16565 }, { "epoch": 2.82819834428608, "grad_norm": 0.32098595940043767, "learning_rate": 3.1740010116337886e-06, "loss": 0.2042, "step": 16570 }, { "epoch": 2.829051805069557, "grad_norm": 0.33101361553387193, "learning_rate": 3.158194233687405e-06, "loss": 0.1986, "step": 16575 }, { "epoch": 2.829905265853034, "grad_norm": 0.3293585824166995, "learning_rate": 3.1423874557410222e-06, "loss": 0.201, "step": 16580 }, { "epoch": 2.8307587266365113, "grad_norm": 0.262905462224898, "learning_rate": 3.1265806777946384e-06, "loss": 0.2025, "step": 16585 }, { "epoch": 2.831612187419988, "grad_norm": 0.3181198358280363, "learning_rate": 3.110773899848255e-06, "loss": 0.2044, "step": 16590 }, { "epoch": 2.832465648203465, "grad_norm": 0.28022888001638946, "learning_rate": 3.0949671219018716e-06, "loss": 0.2033, "step": 16595 }, { "epoch": 2.833319108986942, "grad_norm": 0.36649693489377, "learning_rate": 3.0791603439554882e-06, "loss": 0.1996, "step": 16600 }, { "epoch": 2.834172569770419, "grad_norm": 0.3985106495581557, "learning_rate": 3.063353566009105e-06, "loss": 0.2034, "step": 16605 }, { "epoch": 2.835026030553896, "grad_norm": 0.32488712987501084, "learning_rate": 3.0475467880627215e-06, "loss": 0.2085, "step": 16610 }, { "epoch": 2.835879491337373, "grad_norm": 0.3197898352408073, "learning_rate": 3.031740010116338e-06, "loss": 0.1966, "step": 16615 }, { "epoch": 2.83673295212085, "grad_norm": 0.2820220822896516, "learning_rate": 3.0159332321699547e-06, "loss": 0.2024, "step": 16620 }, { "epoch": 2.837586412904327, "grad_norm": 0.3790777029919425, "learning_rate": 3.0001264542235713e-06, "loss": 0.1923, "step": 16625 }, { "epoch": 2.838439873687804, "grad_norm": 0.2985841356799152, "learning_rate": 2.984319676277188e-06, "loss": 0.1936, "step": 16630 }, { "epoch": 2.839293334471281, "grad_norm": 0.31709748092917084, "learning_rate": 2.968512898330804e-06, "loss": 0.1854, "step": 16635 }, { "epoch": 2.840146795254758, "grad_norm": 0.2695083610137018, "learning_rate": 2.952706120384421e-06, "loss": 0.2041, "step": 16640 }, { "epoch": 2.8410002560382352, "grad_norm": 0.32087543494030907, "learning_rate": 2.9368993424380377e-06, "loss": 0.204, "step": 16645 }, { "epoch": 2.841853716821712, "grad_norm": 0.32154679142205544, "learning_rate": 2.921092564491654e-06, "loss": 0.2184, "step": 16650 }, { "epoch": 2.842707177605189, "grad_norm": 0.3375018125317281, "learning_rate": 2.905285786545271e-06, "loss": 0.2203, "step": 16655 }, { "epoch": 2.843560638388666, "grad_norm": 0.2609891731345887, "learning_rate": 2.8894790085988876e-06, "loss": 0.2086, "step": 16660 }, { "epoch": 2.844414099172143, "grad_norm": 0.3478935118572067, "learning_rate": 2.8736722306525038e-06, "loss": 0.198, "step": 16665 }, { "epoch": 2.8452675599556203, "grad_norm": 0.3212046872450367, "learning_rate": 2.8578654527061204e-06, "loss": 0.208, "step": 16670 }, { "epoch": 2.846121020739097, "grad_norm": 0.3245588263301543, "learning_rate": 2.8420586747597374e-06, "loss": 0.1959, "step": 16675 }, { "epoch": 2.846974481522574, "grad_norm": 0.3284558303797445, "learning_rate": 2.8262518968133536e-06, "loss": 0.2145, "step": 16680 }, { "epoch": 2.847827942306051, "grad_norm": 0.2787383009345952, "learning_rate": 2.81044511886697e-06, "loss": 0.1927, "step": 16685 }, { "epoch": 2.848681403089528, "grad_norm": 0.30813651049157853, "learning_rate": 2.7946383409205872e-06, "loss": 0.2104, "step": 16690 }, { "epoch": 2.849534863873005, "grad_norm": 0.29075105100198256, "learning_rate": 2.7788315629742034e-06, "loss": 0.1918, "step": 16695 }, { "epoch": 2.850388324656482, "grad_norm": 0.28558590640129755, "learning_rate": 2.76302478502782e-06, "loss": 0.2058, "step": 16700 }, { "epoch": 2.851241785439959, "grad_norm": 0.2839293597138735, "learning_rate": 2.7472180070814366e-06, "loss": 0.2163, "step": 16705 }, { "epoch": 2.852095246223436, "grad_norm": 0.2928571515651883, "learning_rate": 2.7314112291350532e-06, "loss": 0.2046, "step": 16710 }, { "epoch": 2.852948707006913, "grad_norm": 0.2763379262521816, "learning_rate": 2.71560445118867e-06, "loss": 0.2164, "step": 16715 }, { "epoch": 2.85380216779039, "grad_norm": 0.29214303116700613, "learning_rate": 2.6997976732422865e-06, "loss": 0.194, "step": 16720 }, { "epoch": 2.854655628573867, "grad_norm": 0.2912761257243044, "learning_rate": 2.683990895295903e-06, "loss": 0.1996, "step": 16725 }, { "epoch": 2.8555090893573443, "grad_norm": 0.2920152340257753, "learning_rate": 2.6681841173495197e-06, "loss": 0.2192, "step": 16730 }, { "epoch": 2.856362550140821, "grad_norm": 0.3288384141660532, "learning_rate": 2.6523773394031363e-06, "loss": 0.2094, "step": 16735 }, { "epoch": 2.857216010924298, "grad_norm": 0.2872890721992404, "learning_rate": 2.6365705614567525e-06, "loss": 0.2123, "step": 16740 }, { "epoch": 2.858069471707775, "grad_norm": 0.281143504404024, "learning_rate": 2.6207637835103695e-06, "loss": 0.2225, "step": 16745 }, { "epoch": 2.858922932491252, "grad_norm": 0.3589295231385812, "learning_rate": 2.604957005563986e-06, "loss": 0.1959, "step": 16750 }, { "epoch": 2.859776393274729, "grad_norm": 0.2837347073427969, "learning_rate": 2.5891502276176023e-06, "loss": 0.2192, "step": 16755 }, { "epoch": 2.860629854058206, "grad_norm": 0.3253918329408533, "learning_rate": 2.5733434496712194e-06, "loss": 0.2024, "step": 16760 }, { "epoch": 2.861483314841683, "grad_norm": 0.26589668672336, "learning_rate": 2.557536671724836e-06, "loss": 0.251, "step": 16765 }, { "epoch": 2.86233677562516, "grad_norm": 0.2968035942755862, "learning_rate": 2.541729893778452e-06, "loss": 0.2017, "step": 16770 }, { "epoch": 2.863190236408637, "grad_norm": 0.2873874645900919, "learning_rate": 2.5259231158320688e-06, "loss": 0.2075, "step": 16775 }, { "epoch": 2.864043697192114, "grad_norm": 0.2934935786485589, "learning_rate": 2.510116337885686e-06, "loss": 0.1967, "step": 16780 }, { "epoch": 2.864897157975591, "grad_norm": 0.3381823960513163, "learning_rate": 2.494309559939302e-06, "loss": 0.2198, "step": 16785 }, { "epoch": 2.8657506187590682, "grad_norm": 0.3564639472615141, "learning_rate": 2.4785027819929186e-06, "loss": 0.1938, "step": 16790 }, { "epoch": 2.866604079542545, "grad_norm": 0.33501656151838405, "learning_rate": 2.4626960040465356e-06, "loss": 0.2131, "step": 16795 }, { "epoch": 2.867457540326022, "grad_norm": 0.2878157781050671, "learning_rate": 2.446889226100152e-06, "loss": 0.222, "step": 16800 }, { "epoch": 2.868311001109499, "grad_norm": 0.278460364262515, "learning_rate": 2.4310824481537684e-06, "loss": 0.2097, "step": 16805 }, { "epoch": 2.869164461892976, "grad_norm": 0.2977487695105044, "learning_rate": 2.415275670207385e-06, "loss": 0.2055, "step": 16810 }, { "epoch": 2.8700179226764533, "grad_norm": 0.31384595369214435, "learning_rate": 2.3994688922610016e-06, "loss": 0.2024, "step": 16815 }, { "epoch": 2.8708713834599298, "grad_norm": 0.27936596285436166, "learning_rate": 2.3836621143146183e-06, "loss": 0.2274, "step": 16820 }, { "epoch": 2.871724844243407, "grad_norm": 0.3212698928548633, "learning_rate": 2.367855336368235e-06, "loss": 0.2084, "step": 16825 }, { "epoch": 2.872578305026884, "grad_norm": 0.33132159151433205, "learning_rate": 2.3520485584218515e-06, "loss": 0.2059, "step": 16830 }, { "epoch": 2.873431765810361, "grad_norm": 0.3343771636134836, "learning_rate": 2.336241780475468e-06, "loss": 0.2015, "step": 16835 }, { "epoch": 2.874285226593838, "grad_norm": 0.33765174123967145, "learning_rate": 2.3204350025290847e-06, "loss": 0.2017, "step": 16840 }, { "epoch": 2.875138687377315, "grad_norm": 0.2748177504991637, "learning_rate": 2.304628224582701e-06, "loss": 0.2098, "step": 16845 }, { "epoch": 2.875992148160792, "grad_norm": 0.32223042752456243, "learning_rate": 2.288821446636318e-06, "loss": 0.1883, "step": 16850 }, { "epoch": 2.876845608944269, "grad_norm": 0.32217690779386665, "learning_rate": 2.2730146686899345e-06, "loss": 0.2001, "step": 16855 }, { "epoch": 2.877699069727746, "grad_norm": 0.2935809821583733, "learning_rate": 2.2572078907435507e-06, "loss": 0.2121, "step": 16860 }, { "epoch": 2.878552530511223, "grad_norm": 0.29289199177829633, "learning_rate": 2.2414011127971678e-06, "loss": 0.2073, "step": 16865 }, { "epoch": 2.8794059912947, "grad_norm": 0.3072558722282407, "learning_rate": 2.2255943348507844e-06, "loss": 0.2032, "step": 16870 }, { "epoch": 2.8802594520781772, "grad_norm": 0.29643237904007175, "learning_rate": 2.2097875569044005e-06, "loss": 0.2061, "step": 16875 }, { "epoch": 2.881112912861654, "grad_norm": 0.32831167080590196, "learning_rate": 2.193980778958017e-06, "loss": 0.1891, "step": 16880 }, { "epoch": 2.881966373645131, "grad_norm": 0.2553059652876756, "learning_rate": 2.178174001011634e-06, "loss": 0.203, "step": 16885 }, { "epoch": 2.882819834428608, "grad_norm": 0.30246159624744584, "learning_rate": 2.1623672230652504e-06, "loss": 0.1979, "step": 16890 }, { "epoch": 2.883673295212085, "grad_norm": 0.28393668735954675, "learning_rate": 2.146560445118867e-06, "loss": 0.2021, "step": 16895 }, { "epoch": 2.884526755995562, "grad_norm": 0.27894497432475157, "learning_rate": 2.1307536671724836e-06, "loss": 0.2064, "step": 16900 }, { "epoch": 2.8853802167790388, "grad_norm": 0.3480943203739307, "learning_rate": 2.1149468892261002e-06, "loss": 0.1854, "step": 16905 }, { "epoch": 2.886233677562516, "grad_norm": 0.3215110831053852, "learning_rate": 2.099140111279717e-06, "loss": 0.2151, "step": 16910 }, { "epoch": 2.887087138345993, "grad_norm": 0.3044981358621058, "learning_rate": 2.0833333333333334e-06, "loss": 0.2038, "step": 16915 }, { "epoch": 2.88794059912947, "grad_norm": 0.23422177850192888, "learning_rate": 2.06752655538695e-06, "loss": 0.2141, "step": 16920 }, { "epoch": 2.888794059912947, "grad_norm": 0.28489446213081826, "learning_rate": 2.0517197774405667e-06, "loss": 0.205, "step": 16925 }, { "epoch": 2.889647520696424, "grad_norm": 0.2897558833846524, "learning_rate": 2.0359129994941833e-06, "loss": 0.2031, "step": 16930 }, { "epoch": 2.890500981479901, "grad_norm": 0.2676216756228542, "learning_rate": 2.0201062215477995e-06, "loss": 0.1983, "step": 16935 }, { "epoch": 2.891354442263378, "grad_norm": 0.29198986902152524, "learning_rate": 2.0042994436014165e-06, "loss": 0.2028, "step": 16940 }, { "epoch": 2.892207903046855, "grad_norm": 0.2919205232983492, "learning_rate": 1.988492665655033e-06, "loss": 0.2069, "step": 16945 }, { "epoch": 2.893061363830332, "grad_norm": 0.28066776969897533, "learning_rate": 1.9726858877086493e-06, "loss": 0.1976, "step": 16950 }, { "epoch": 2.893914824613809, "grad_norm": 0.3132705466186365, "learning_rate": 1.9568791097622663e-06, "loss": 0.2087, "step": 16955 }, { "epoch": 2.8947682853972863, "grad_norm": 0.320031199352025, "learning_rate": 1.941072331815883e-06, "loss": 0.1941, "step": 16960 }, { "epoch": 2.8956217461807627, "grad_norm": 0.3271822354169633, "learning_rate": 1.925265553869499e-06, "loss": 0.2264, "step": 16965 }, { "epoch": 2.89647520696424, "grad_norm": 0.3483602706460579, "learning_rate": 1.9094587759231157e-06, "loss": 0.2128, "step": 16970 }, { "epoch": 2.897328667747717, "grad_norm": 0.340651675260474, "learning_rate": 1.8936519979767325e-06, "loss": 0.1997, "step": 16975 }, { "epoch": 2.898182128531194, "grad_norm": 0.31440287921718146, "learning_rate": 1.8778452200303492e-06, "loss": 0.2036, "step": 16980 }, { "epoch": 2.899035589314671, "grad_norm": 0.2977430086332578, "learning_rate": 1.8620384420839656e-06, "loss": 0.2137, "step": 16985 }, { "epoch": 2.899889050098148, "grad_norm": 0.32920942754378446, "learning_rate": 1.8462316641375824e-06, "loss": 0.2056, "step": 16990 }, { "epoch": 2.900742510881625, "grad_norm": 0.3281145569999493, "learning_rate": 1.830424886191199e-06, "loss": 0.2155, "step": 16995 }, { "epoch": 2.901595971665102, "grad_norm": 0.2902453504220539, "learning_rate": 1.8146181082448154e-06, "loss": 0.2017, "step": 17000 }, { "epoch": 2.902449432448579, "grad_norm": 0.3132353078365459, "learning_rate": 1.798811330298432e-06, "loss": 0.2, "step": 17005 }, { "epoch": 2.903302893232056, "grad_norm": 0.2787315413024899, "learning_rate": 1.7830045523520488e-06, "loss": 0.2019, "step": 17010 }, { "epoch": 2.904156354015533, "grad_norm": 0.2867491364148422, "learning_rate": 1.7671977744056652e-06, "loss": 0.2, "step": 17015 }, { "epoch": 2.90500981479901, "grad_norm": 0.28322720256761535, "learning_rate": 1.7513909964592818e-06, "loss": 0.2277, "step": 17020 }, { "epoch": 2.905863275582487, "grad_norm": 0.2714379865527004, "learning_rate": 1.7355842185128987e-06, "loss": 0.1965, "step": 17025 }, { "epoch": 2.906716736365964, "grad_norm": 0.3360622334575726, "learning_rate": 1.719777440566515e-06, "loss": 0.2121, "step": 17030 }, { "epoch": 2.907570197149441, "grad_norm": 0.2888296557717694, "learning_rate": 1.7039706626201315e-06, "loss": 0.198, "step": 17035 }, { "epoch": 2.908423657932918, "grad_norm": 0.29556279628536397, "learning_rate": 1.688163884673748e-06, "loss": 0.1986, "step": 17040 }, { "epoch": 2.909277118716395, "grad_norm": 0.28453360611989975, "learning_rate": 1.6723571067273649e-06, "loss": 0.2094, "step": 17045 }, { "epoch": 2.9101305794998718, "grad_norm": 0.30492643162859917, "learning_rate": 1.6565503287809813e-06, "loss": 0.1956, "step": 17050 }, { "epoch": 2.910984040283349, "grad_norm": 0.29307536092572845, "learning_rate": 1.640743550834598e-06, "loss": 0.1933, "step": 17055 }, { "epoch": 2.911837501066826, "grad_norm": 0.2616526955603826, "learning_rate": 1.6249367728882147e-06, "loss": 0.2101, "step": 17060 }, { "epoch": 2.912690961850303, "grad_norm": 0.30095564204562114, "learning_rate": 1.6091299949418311e-06, "loss": 0.2148, "step": 17065 }, { "epoch": 2.91354442263378, "grad_norm": 0.3146237342038121, "learning_rate": 1.5933232169954477e-06, "loss": 0.1956, "step": 17070 }, { "epoch": 2.914397883417257, "grad_norm": 0.315679064475717, "learning_rate": 1.5775164390490641e-06, "loss": 0.2195, "step": 17075 }, { "epoch": 2.915251344200734, "grad_norm": 0.3098051286329028, "learning_rate": 1.561709661102681e-06, "loss": 0.2064, "step": 17080 }, { "epoch": 2.916104804984211, "grad_norm": 0.27976942515043507, "learning_rate": 1.5459028831562976e-06, "loss": 0.2029, "step": 17085 }, { "epoch": 2.916958265767688, "grad_norm": 0.2881835030727533, "learning_rate": 1.530096105209914e-06, "loss": 0.2074, "step": 17090 }, { "epoch": 2.917811726551165, "grad_norm": 0.2797319344340243, "learning_rate": 1.5142893272635306e-06, "loss": 0.2086, "step": 17095 }, { "epoch": 2.918665187334642, "grad_norm": 0.28613615646152124, "learning_rate": 1.4984825493171474e-06, "loss": 0.2167, "step": 17100 }, { "epoch": 2.9195186481181192, "grad_norm": 0.28788709024858605, "learning_rate": 1.4826757713707638e-06, "loss": 0.1899, "step": 17105 }, { "epoch": 2.9203721089015957, "grad_norm": 0.28310120051004733, "learning_rate": 1.4668689934243804e-06, "loss": 0.2012, "step": 17110 }, { "epoch": 2.921225569685073, "grad_norm": 0.3002178708819116, "learning_rate": 1.451062215477997e-06, "loss": 0.2064, "step": 17115 }, { "epoch": 2.92207903046855, "grad_norm": 0.26191558987266106, "learning_rate": 1.4352554375316136e-06, "loss": 0.2095, "step": 17120 }, { "epoch": 2.922932491252027, "grad_norm": 0.2947360012259959, "learning_rate": 1.4194486595852302e-06, "loss": 0.2071, "step": 17125 }, { "epoch": 2.923785952035504, "grad_norm": 0.31390380861185485, "learning_rate": 1.4036418816388468e-06, "loss": 0.1974, "step": 17130 }, { "epoch": 2.9246394128189808, "grad_norm": 0.4478824284000525, "learning_rate": 1.3878351036924635e-06, "loss": 0.2025, "step": 17135 }, { "epoch": 2.925492873602458, "grad_norm": 0.31928195457256175, "learning_rate": 1.37202832574608e-06, "loss": 0.1959, "step": 17140 }, { "epoch": 2.926346334385935, "grad_norm": 0.35258055414744954, "learning_rate": 1.3562215477996967e-06, "loss": 0.2049, "step": 17145 }, { "epoch": 2.927199795169412, "grad_norm": 0.26472342555140455, "learning_rate": 1.340414769853313e-06, "loss": 0.1895, "step": 17150 }, { "epoch": 2.928053255952889, "grad_norm": 0.37848142930329776, "learning_rate": 1.3246079919069297e-06, "loss": 0.2186, "step": 17155 }, { "epoch": 2.928906716736366, "grad_norm": 0.35314620512775885, "learning_rate": 1.3088012139605463e-06, "loss": 0.2039, "step": 17160 }, { "epoch": 2.929760177519843, "grad_norm": 0.3194828509155015, "learning_rate": 1.292994436014163e-06, "loss": 0.1849, "step": 17165 }, { "epoch": 2.93061363830332, "grad_norm": 0.2838051872405007, "learning_rate": 1.2771876580677795e-06, "loss": 0.2106, "step": 17170 }, { "epoch": 2.931467099086797, "grad_norm": 0.32225277009503955, "learning_rate": 1.2613808801213961e-06, "loss": 0.1976, "step": 17175 }, { "epoch": 2.932320559870274, "grad_norm": 0.3188912950687936, "learning_rate": 1.2455741021750127e-06, "loss": 0.2007, "step": 17180 }, { "epoch": 2.933174020653751, "grad_norm": 0.3246131322371708, "learning_rate": 1.2297673242286293e-06, "loss": 0.2091, "step": 17185 }, { "epoch": 2.934027481437228, "grad_norm": 0.3033462012528161, "learning_rate": 1.213960546282246e-06, "loss": 0.2053, "step": 17190 }, { "epoch": 2.9348809422207047, "grad_norm": 0.31270140153332937, "learning_rate": 1.1981537683358624e-06, "loss": 0.2121, "step": 17195 }, { "epoch": 2.935734403004182, "grad_norm": 0.31845478349953826, "learning_rate": 1.1823469903894792e-06, "loss": 0.1945, "step": 17200 }, { "epoch": 2.936587863787659, "grad_norm": 0.3004402372660244, "learning_rate": 1.1665402124430956e-06, "loss": 0.1996, "step": 17205 }, { "epoch": 2.937441324571136, "grad_norm": 0.30634276017139833, "learning_rate": 1.1507334344967122e-06, "loss": 0.2029, "step": 17210 }, { "epoch": 2.938294785354613, "grad_norm": 0.2986248866541451, "learning_rate": 1.134926656550329e-06, "loss": 0.1744, "step": 17215 }, { "epoch": 2.93914824613809, "grad_norm": 0.2940473966941769, "learning_rate": 1.1191198786039454e-06, "loss": 0.197, "step": 17220 }, { "epoch": 2.940001706921567, "grad_norm": 0.34131383255199294, "learning_rate": 1.103313100657562e-06, "loss": 0.1954, "step": 17225 }, { "epoch": 2.940855167705044, "grad_norm": 0.24211371159221357, "learning_rate": 1.0875063227111786e-06, "loss": 0.2053, "step": 17230 }, { "epoch": 2.941708628488521, "grad_norm": 0.2830903399893913, "learning_rate": 1.0716995447647952e-06, "loss": 0.2039, "step": 17235 }, { "epoch": 2.942562089271998, "grad_norm": 0.3065162871886276, "learning_rate": 1.0558927668184116e-06, "loss": 0.1968, "step": 17240 }, { "epoch": 2.943415550055475, "grad_norm": 0.3526383670297659, "learning_rate": 1.0400859888720285e-06, "loss": 0.1856, "step": 17245 }, { "epoch": 2.944269010838952, "grad_norm": 0.2802896796476681, "learning_rate": 1.024279210925645e-06, "loss": 0.1907, "step": 17250 }, { "epoch": 2.9451224716224287, "grad_norm": 0.27457159444873286, "learning_rate": 1.0084724329792615e-06, "loss": 0.2117, "step": 17255 }, { "epoch": 2.945975932405906, "grad_norm": 0.2702273580943847, "learning_rate": 9.926656550328783e-07, "loss": 0.2247, "step": 17260 }, { "epoch": 2.946829393189383, "grad_norm": 0.33946808662209843, "learning_rate": 9.768588770864947e-07, "loss": 0.2004, "step": 17265 }, { "epoch": 2.94768285397286, "grad_norm": 0.31035685447064765, "learning_rate": 9.610520991401113e-07, "loss": 0.2117, "step": 17270 }, { "epoch": 2.948536314756337, "grad_norm": 0.33622355529023296, "learning_rate": 9.452453211937278e-07, "loss": 0.2066, "step": 17275 }, { "epoch": 2.9493897755398137, "grad_norm": 0.31123009544274716, "learning_rate": 9.294385432473445e-07, "loss": 0.1979, "step": 17280 }, { "epoch": 2.950243236323291, "grad_norm": 0.2855730419560878, "learning_rate": 9.136317653009611e-07, "loss": 0.2067, "step": 17285 }, { "epoch": 2.951096697106768, "grad_norm": 0.3286835875667557, "learning_rate": 8.978249873545776e-07, "loss": 0.1936, "step": 17290 }, { "epoch": 2.951950157890245, "grad_norm": 0.33480948002159827, "learning_rate": 8.820182094081944e-07, "loss": 0.2039, "step": 17295 }, { "epoch": 2.952803618673722, "grad_norm": 0.2775036208081476, "learning_rate": 8.662114314618109e-07, "loss": 0.2106, "step": 17300 }, { "epoch": 2.953657079457199, "grad_norm": 0.2950668116499416, "learning_rate": 8.504046535154275e-07, "loss": 0.2113, "step": 17305 }, { "epoch": 2.954510540240676, "grad_norm": 0.3294077651970724, "learning_rate": 8.34597875569044e-07, "loss": 0.1966, "step": 17310 }, { "epoch": 2.955364001024153, "grad_norm": 0.27239839934327037, "learning_rate": 8.187910976226607e-07, "loss": 0.1994, "step": 17315 }, { "epoch": 2.95621746180763, "grad_norm": 0.3162738467786228, "learning_rate": 8.029843196762771e-07, "loss": 0.2082, "step": 17320 }, { "epoch": 2.957070922591107, "grad_norm": 0.33819682200844636, "learning_rate": 7.871775417298938e-07, "loss": 0.1965, "step": 17325 }, { "epoch": 2.957924383374584, "grad_norm": 0.2745774876826912, "learning_rate": 7.713707637835104e-07, "loss": 0.2053, "step": 17330 }, { "epoch": 2.9587778441580608, "grad_norm": 0.25557616741234984, "learning_rate": 7.555639858371269e-07, "loss": 0.2028, "step": 17335 }, { "epoch": 2.9596313049415377, "grad_norm": 0.3115187409212004, "learning_rate": 7.397572078907435e-07, "loss": 0.1959, "step": 17340 }, { "epoch": 2.960484765725015, "grad_norm": 0.28487876643426213, "learning_rate": 7.239504299443603e-07, "loss": 0.1774, "step": 17345 }, { "epoch": 2.961338226508492, "grad_norm": 0.32857046376753857, "learning_rate": 7.081436519979768e-07, "loss": 0.2095, "step": 17350 }, { "epoch": 2.962191687291969, "grad_norm": 0.3237084789788513, "learning_rate": 6.923368740515934e-07, "loss": 0.2129, "step": 17355 }, { "epoch": 2.963045148075446, "grad_norm": 0.374606186918676, "learning_rate": 6.7653009610521e-07, "loss": 0.1932, "step": 17360 }, { "epoch": 2.9638986088589228, "grad_norm": 0.26948978269296464, "learning_rate": 6.607233181588265e-07, "loss": 0.202, "step": 17365 }, { "epoch": 2.9647520696424, "grad_norm": 0.31925476650371226, "learning_rate": 6.449165402124431e-07, "loss": 0.2082, "step": 17370 }, { "epoch": 2.965605530425877, "grad_norm": 0.30750036652938445, "learning_rate": 6.291097622660597e-07, "loss": 0.2066, "step": 17375 }, { "epoch": 2.966458991209354, "grad_norm": 0.2795830488580364, "learning_rate": 6.133029843196763e-07, "loss": 0.1942, "step": 17380 }, { "epoch": 2.967312451992831, "grad_norm": 0.283857585637038, "learning_rate": 5.974962063732929e-07, "loss": 0.2056, "step": 17385 }, { "epoch": 2.968165912776308, "grad_norm": 0.32806663776010714, "learning_rate": 5.816894284269095e-07, "loss": 0.2088, "step": 17390 }, { "epoch": 2.969019373559785, "grad_norm": 0.29903731557096414, "learning_rate": 5.65882650480526e-07, "loss": 0.2128, "step": 17395 }, { "epoch": 2.9698728343432617, "grad_norm": 0.27798130323329445, "learning_rate": 5.500758725341426e-07, "loss": 0.2069, "step": 17400 }, { "epoch": 2.970726295126739, "grad_norm": 0.2773037932746904, "learning_rate": 5.342690945877593e-07, "loss": 0.2083, "step": 17405 }, { "epoch": 2.971579755910216, "grad_norm": 0.2913683496859182, "learning_rate": 5.184623166413759e-07, "loss": 0.2064, "step": 17410 }, { "epoch": 2.972433216693693, "grad_norm": 0.2750443985033496, "learning_rate": 5.026555386949924e-07, "loss": 0.2022, "step": 17415 }, { "epoch": 2.97328667747717, "grad_norm": 0.2907162388737508, "learning_rate": 4.868487607486091e-07, "loss": 0.2083, "step": 17420 }, { "epoch": 2.9741401382606467, "grad_norm": 0.3042086507388893, "learning_rate": 4.7104198280222565e-07, "loss": 0.1923, "step": 17425 }, { "epoch": 2.974993599044124, "grad_norm": 0.261060796298194, "learning_rate": 4.552352048558422e-07, "loss": 0.1856, "step": 17430 }, { "epoch": 2.975847059827601, "grad_norm": 0.3210130444385053, "learning_rate": 4.394284269094588e-07, "loss": 0.1951, "step": 17435 }, { "epoch": 2.976700520611078, "grad_norm": 0.3098077520507755, "learning_rate": 4.236216489630754e-07, "loss": 0.2171, "step": 17440 }, { "epoch": 2.977553981394555, "grad_norm": 0.25974566882508726, "learning_rate": 4.0781487101669193e-07, "loss": 0.2143, "step": 17445 }, { "epoch": 2.9784074421780318, "grad_norm": 0.28008202908834956, "learning_rate": 3.9200809307030854e-07, "loss": 0.2037, "step": 17450 }, { "epoch": 2.979260902961509, "grad_norm": 0.317724688494707, "learning_rate": 3.7620131512392515e-07, "loss": 0.2055, "step": 17455 }, { "epoch": 2.980114363744986, "grad_norm": 0.31406472433161403, "learning_rate": 3.603945371775417e-07, "loss": 0.2107, "step": 17460 }, { "epoch": 2.980967824528463, "grad_norm": 0.2909021629986124, "learning_rate": 3.445877592311583e-07, "loss": 0.2145, "step": 17465 }, { "epoch": 2.98182128531194, "grad_norm": 0.2995678857480539, "learning_rate": 3.2878098128477493e-07, "loss": 0.1879, "step": 17470 }, { "epoch": 2.982674746095417, "grad_norm": 0.287091490223274, "learning_rate": 3.129742033383915e-07, "loss": 0.2216, "step": 17475 }, { "epoch": 2.9835282068788938, "grad_norm": 0.271455647806394, "learning_rate": 2.971674253920081e-07, "loss": 0.2013, "step": 17480 }, { "epoch": 2.9843816676623707, "grad_norm": 0.2498720778404504, "learning_rate": 2.813606474456247e-07, "loss": 0.2204, "step": 17485 }, { "epoch": 2.985235128445848, "grad_norm": 0.2789763067933592, "learning_rate": 2.6555386949924127e-07, "loss": 0.2002, "step": 17490 }, { "epoch": 2.986088589229325, "grad_norm": 0.279851589141097, "learning_rate": 2.497470915528579e-07, "loss": 0.1968, "step": 17495 }, { "epoch": 2.986942050012802, "grad_norm": 0.30485262400705815, "learning_rate": 2.3394031360647446e-07, "loss": 0.1827, "step": 17500 }, { "epoch": 2.987795510796279, "grad_norm": 0.2731892652303336, "learning_rate": 2.1813353566009108e-07, "loss": 0.1908, "step": 17505 }, { "epoch": 2.9886489715797557, "grad_norm": 0.367266683632533, "learning_rate": 2.0232675771370766e-07, "loss": 0.22, "step": 17510 }, { "epoch": 2.989502432363233, "grad_norm": 0.32557608251438674, "learning_rate": 1.8651997976732424e-07, "loss": 0.1991, "step": 17515 }, { "epoch": 2.99035589314671, "grad_norm": 0.30537494164014334, "learning_rate": 1.7071320182094083e-07, "loss": 0.2096, "step": 17520 }, { "epoch": 2.991209353930187, "grad_norm": 0.29579184520359153, "learning_rate": 1.549064238745574e-07, "loss": 0.2121, "step": 17525 }, { "epoch": 2.992062814713664, "grad_norm": 0.2998913971470366, "learning_rate": 1.3909964592817402e-07, "loss": 0.2002, "step": 17530 }, { "epoch": 2.992916275497141, "grad_norm": 0.27881525099940596, "learning_rate": 1.2329286798179058e-07, "loss": 0.2002, "step": 17535 }, { "epoch": 2.993769736280618, "grad_norm": 0.2950807076187271, "learning_rate": 1.0748609003540719e-07, "loss": 0.2143, "step": 17540 }, { "epoch": 2.9946231970640946, "grad_norm": 0.32222949626269104, "learning_rate": 9.167931208902377e-08, "loss": 0.1906, "step": 17545 }, { "epoch": 2.995476657847572, "grad_norm": 0.2976457046919782, "learning_rate": 7.587253414264037e-08, "loss": 0.1981, "step": 17550 }, { "epoch": 2.996330118631049, "grad_norm": 0.285588874142171, "learning_rate": 6.006575619625696e-08, "loss": 0.1921, "step": 17555 }, { "epoch": 2.997183579414526, "grad_norm": 0.3305888547822448, "learning_rate": 4.425897824987355e-08, "loss": 0.1969, "step": 17560 }, { "epoch": 2.9980370401980028, "grad_norm": 0.31806868362224194, "learning_rate": 2.8452200303490138e-08, "loss": 0.2178, "step": 17565 }, { "epoch": 2.9988905009814797, "grad_norm": 0.30123511403621756, "learning_rate": 1.2645422357106729e-08, "loss": 0.1856, "step": 17570 }, { "epoch": 2.9995732696082618, "step": 17574, "total_flos": 2.7506794053018583e+18, "train_loss": 0.33027039743859005, "train_runtime": 72978.1657, "train_samples_per_second": 3.853, "train_steps_per_second": 0.241 } ], "logging_steps": 5, "max_steps": 17574, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.7506794053018583e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }