{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999731255038968, "eval_steps": 500, "global_step": 930, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010749798441279225, "grad_norm": 22.762095682751983, "learning_rate": 1.0752688172043012e-07, "loss": 1.3377, "step": 1 }, { "epoch": 0.005374899220639613, "grad_norm": 20.69927583673527, "learning_rate": 5.376344086021506e-07, "loss": 1.3086, "step": 5 }, { "epoch": 0.010749798441279226, "grad_norm": 8.308333536609279, "learning_rate": 1.0752688172043011e-06, "loss": 1.1878, "step": 10 }, { "epoch": 0.01612469766191884, "grad_norm": 6.959676956693679, "learning_rate": 1.6129032258064516e-06, "loss": 1.0321, "step": 15 }, { "epoch": 0.021499596882558453, "grad_norm": 2.9820242928638794, "learning_rate": 2.1505376344086023e-06, "loss": 0.9101, "step": 20 }, { "epoch": 0.026874496103198066, "grad_norm": 2.5554546533203144, "learning_rate": 2.688172043010753e-06, "loss": 0.8617, "step": 25 }, { "epoch": 0.03224939532383768, "grad_norm": 2.286909995566289, "learning_rate": 3.225806451612903e-06, "loss": 0.8383, "step": 30 }, { "epoch": 0.03762429454447729, "grad_norm": 2.3398312761165907, "learning_rate": 3.763440860215054e-06, "loss": 0.8185, "step": 35 }, { "epoch": 0.042999193765116905, "grad_norm": 2.204837686008753, "learning_rate": 4.3010752688172045e-06, "loss": 0.8053, "step": 40 }, { "epoch": 0.04837409298575652, "grad_norm": 2.315534843885615, "learning_rate": 4.838709677419355e-06, "loss": 0.7857, "step": 45 }, { "epoch": 0.05374899220639613, "grad_norm": 2.5131629870127705, "learning_rate": 5.376344086021506e-06, "loss": 0.7691, "step": 50 }, { "epoch": 0.059123891427035745, "grad_norm": 2.4607920938058396, "learning_rate": 5.9139784946236566e-06, "loss": 0.758, "step": 55 }, { "epoch": 0.06449879064767536, "grad_norm": 2.367828432773105, "learning_rate": 6.451612903225806e-06, "loss": 0.7375, "step": 60 }, { "epoch": 0.06987368986831496, "grad_norm": 2.3306378375646406, "learning_rate": 6.989247311827958e-06, "loss": 0.7304, "step": 65 }, { "epoch": 0.07524858908895458, "grad_norm": 2.344704242935191, "learning_rate": 7.526881720430108e-06, "loss": 0.7131, "step": 70 }, { "epoch": 0.08062348830959419, "grad_norm": 2.3556743904747366, "learning_rate": 8.064516129032258e-06, "loss": 0.7109, "step": 75 }, { "epoch": 0.08599838753023381, "grad_norm": 2.2479332716993605, "learning_rate": 8.602150537634409e-06, "loss": 0.7022, "step": 80 }, { "epoch": 0.09137328675087342, "grad_norm": 2.3017400204623666, "learning_rate": 9.13978494623656e-06, "loss": 0.7046, "step": 85 }, { "epoch": 0.09674818597151304, "grad_norm": 2.3666760651980536, "learning_rate": 9.67741935483871e-06, "loss": 0.698, "step": 90 }, { "epoch": 0.10212308519215264, "grad_norm": 2.9278768472096433, "learning_rate": 9.999859120828162e-06, "loss": 0.6998, "step": 95 }, { "epoch": 0.10749798441279226, "grad_norm": 2.4524372124571165, "learning_rate": 9.998274321315453e-06, "loss": 0.6905, "step": 100 }, { "epoch": 0.11287288363343187, "grad_norm": 2.4347797178103394, "learning_rate": 9.994929183335237e-06, "loss": 0.679, "step": 105 }, { "epoch": 0.11824778285407149, "grad_norm": 2.3508265818164515, "learning_rate": 9.989824885009142e-06, "loss": 0.6843, "step": 110 }, { "epoch": 0.1236226820747111, "grad_norm": 2.164423459020718, "learning_rate": 9.982963224016152e-06, "loss": 0.6784, "step": 115 }, { "epoch": 0.12899758129535072, "grad_norm": 2.4050223924123113, "learning_rate": 9.974346616959476e-06, "loss": 0.6693, "step": 120 }, { "epoch": 0.13437248051599032, "grad_norm": 2.1741064666149903, "learning_rate": 9.963978098515468e-06, "loss": 0.6667, "step": 125 }, { "epoch": 0.13974737973662993, "grad_norm": 2.151624445725951, "learning_rate": 9.951861320364822e-06, "loss": 0.6686, "step": 130 }, { "epoch": 0.14512227895726956, "grad_norm": 2.140470340411352, "learning_rate": 9.938000549906509e-06, "loss": 0.6626, "step": 135 }, { "epoch": 0.15049717817790917, "grad_norm": 2.2274245089556706, "learning_rate": 9.922400668754833e-06, "loss": 0.6517, "step": 140 }, { "epoch": 0.15587207739854878, "grad_norm": 1.95083198656833, "learning_rate": 9.905067171020185e-06, "loss": 0.6512, "step": 145 }, { "epoch": 0.16124697661918838, "grad_norm": 2.4043795722101136, "learning_rate": 9.88600616137407e-06, "loss": 0.6524, "step": 150 }, { "epoch": 0.16662187583982802, "grad_norm": 2.004729246454245, "learning_rate": 9.86522435289912e-06, "loss": 0.6495, "step": 155 }, { "epoch": 0.17199677506046762, "grad_norm": 2.0488055473575724, "learning_rate": 9.8427290647248e-06, "loss": 0.6417, "step": 160 }, { "epoch": 0.17737167428110723, "grad_norm": 2.026026305529713, "learning_rate": 9.818528219449705e-06, "loss": 0.6308, "step": 165 }, { "epoch": 0.18274657350174683, "grad_norm": 1.923497956617847, "learning_rate": 9.792630340351301e-06, "loss": 0.6464, "step": 170 }, { "epoch": 0.18812147272238647, "grad_norm": 1.8714980274335011, "learning_rate": 9.765044548384113e-06, "loss": 0.6406, "step": 175 }, { "epoch": 0.19349637194302607, "grad_norm": 2.130465692033948, "learning_rate": 9.735780558967434e-06, "loss": 0.6292, "step": 180 }, { "epoch": 0.19887127116366568, "grad_norm": 2.0472476860009086, "learning_rate": 9.70484867856365e-06, "loss": 0.6209, "step": 185 }, { "epoch": 0.2042461703843053, "grad_norm": 2.45138914999084, "learning_rate": 9.67225980104841e-06, "loss": 0.633, "step": 190 }, { "epoch": 0.20962106960494492, "grad_norm": 1.9725264804153964, "learning_rate": 9.638025403873939e-06, "loss": 0.625, "step": 195 }, { "epoch": 0.21499596882558453, "grad_norm": 1.956589833642945, "learning_rate": 9.602157544026785e-06, "loss": 0.6274, "step": 200 }, { "epoch": 0.22037086804622413, "grad_norm": 2.032060013563488, "learning_rate": 9.564668853781483e-06, "loss": 0.6143, "step": 205 }, { "epoch": 0.22574576726686374, "grad_norm": 1.9927039178935393, "learning_rate": 9.525572536251608e-06, "loss": 0.6131, "step": 210 }, { "epoch": 0.23112066648750335, "grad_norm": 1.946959208571873, "learning_rate": 9.484882360739772e-06, "loss": 0.6029, "step": 215 }, { "epoch": 0.23649556570814298, "grad_norm": 1.8806701194051492, "learning_rate": 9.442612657888237e-06, "loss": 0.6118, "step": 220 }, { "epoch": 0.24187046492878259, "grad_norm": 2.225666197462802, "learning_rate": 9.398778314631801e-06, "loss": 0.6028, "step": 225 }, { "epoch": 0.2472453641494222, "grad_norm": 2.001619119705015, "learning_rate": 9.353394768954791e-06, "loss": 0.609, "step": 230 }, { "epoch": 0.2526202633700618, "grad_norm": 2.2733431261860777, "learning_rate": 9.30647800445397e-06, "loss": 0.601, "step": 235 }, { "epoch": 0.25799516259070143, "grad_norm": 2.230210940508007, "learning_rate": 9.258044544709276e-06, "loss": 0.5833, "step": 240 }, { "epoch": 0.26337006181134104, "grad_norm": 1.904244193327823, "learning_rate": 9.208111447464407e-06, "loss": 0.6101, "step": 245 }, { "epoch": 0.26874496103198064, "grad_norm": 1.88832266700029, "learning_rate": 9.156696298619266e-06, "loss": 0.5953, "step": 250 }, { "epoch": 0.27411986025262025, "grad_norm": 1.838793567250841, "learning_rate": 9.103817206036383e-06, "loss": 0.594, "step": 255 }, { "epoch": 0.27949475947325986, "grad_norm": 1.951838096647268, "learning_rate": 9.049492793163539e-06, "loss": 0.5783, "step": 260 }, { "epoch": 0.28486965869389946, "grad_norm": 2.1569848912019296, "learning_rate": 8.993742192474773e-06, "loss": 0.5897, "step": 265 }, { "epoch": 0.2902445579145391, "grad_norm": 2.0054996500986833, "learning_rate": 8.936585038732143e-06, "loss": 0.5866, "step": 270 }, { "epoch": 0.29561945713517873, "grad_norm": 2.112212926041267, "learning_rate": 8.878041462070556e-06, "loss": 0.5795, "step": 275 }, { "epoch": 0.30099435635581834, "grad_norm": 2.1036396849174492, "learning_rate": 8.818132080908178e-06, "loss": 0.5818, "step": 280 }, { "epoch": 0.30636925557645794, "grad_norm": 2.167096693852596, "learning_rate": 8.756877994684818e-06, "loss": 0.564, "step": 285 }, { "epoch": 0.31174415479709755, "grad_norm": 1.956775665235312, "learning_rate": 8.694300776430958e-06, "loss": 0.5683, "step": 290 }, { "epoch": 0.31711905401773716, "grad_norm": 1.97168436084639, "learning_rate": 8.630422465169947e-06, "loss": 0.5697, "step": 295 }, { "epoch": 0.32249395323837676, "grad_norm": 2.043558514889714, "learning_rate": 8.565265558156101e-06, "loss": 0.5635, "step": 300 }, { "epoch": 0.32786885245901637, "grad_norm": 1.9273574277695336, "learning_rate": 8.498853002951414e-06, "loss": 0.5728, "step": 305 }, { "epoch": 0.33324375167965603, "grad_norm": 1.9657306797632228, "learning_rate": 8.43120818934367e-06, "loss": 0.5619, "step": 310 }, { "epoch": 0.33861865090029564, "grad_norm": 1.9012625288122775, "learning_rate": 8.362354941108803e-06, "loss": 0.5702, "step": 315 }, { "epoch": 0.34399355012093524, "grad_norm": 2.123858299795287, "learning_rate": 8.292317507620438e-06, "loss": 0.554, "step": 320 }, { "epoch": 0.34936844934157485, "grad_norm": 1.9513877089408058, "learning_rate": 8.221120555309511e-06, "loss": 0.5498, "step": 325 }, { "epoch": 0.35474334856221446, "grad_norm": 1.914136234798412, "learning_rate": 8.148789158977012e-06, "loss": 0.5455, "step": 330 }, { "epoch": 0.36011824778285406, "grad_norm": 1.8672863149021612, "learning_rate": 8.075348792962924e-06, "loss": 0.5404, "step": 335 }, { "epoch": 0.36549314700349367, "grad_norm": 2.1793777540733177, "learning_rate": 8.000825322174424e-06, "loss": 0.5423, "step": 340 }, { "epoch": 0.3708680462241333, "grad_norm": 2.0291841027180455, "learning_rate": 7.925244992976538e-06, "loss": 0.5386, "step": 345 }, { "epoch": 0.37624294544477294, "grad_norm": 2.1016851887581662, "learning_rate": 7.848634423948468e-06, "loss": 0.5326, "step": 350 }, { "epoch": 0.38161784466541254, "grad_norm": 2.029810803364677, "learning_rate": 7.7710205965088e-06, "loss": 0.5403, "step": 355 }, { "epoch": 0.38699274388605215, "grad_norm": 1.8590989815971988, "learning_rate": 7.692430845412946e-06, "loss": 0.5333, "step": 360 }, { "epoch": 0.39236764310669175, "grad_norm": 1.9140526286879267, "learning_rate": 7.612892849126132e-06, "loss": 0.5252, "step": 365 }, { "epoch": 0.39774254232733136, "grad_norm": 1.9988915945152586, "learning_rate": 7.532434620075349e-06, "loss": 0.5242, "step": 370 }, { "epoch": 0.40311744154797097, "grad_norm": 1.9746262065023925, "learning_rate": 7.451084494783668e-06, "loss": 0.5085, "step": 375 }, { "epoch": 0.4084923407686106, "grad_norm": 1.9222623607017046, "learning_rate": 7.368871123890425e-06, "loss": 0.5247, "step": 380 }, { "epoch": 0.4138672399892502, "grad_norm": 2.184926573195526, "learning_rate": 7.285823462060776e-06, "loss": 0.5153, "step": 385 }, { "epoch": 0.41924213920988984, "grad_norm": 1.9008034480063025, "learning_rate": 7.201970757788172e-06, "loss": 0.5146, "step": 390 }, { "epoch": 0.42461703843052945, "grad_norm": 2.0387228861562776, "learning_rate": 7.117342543093358e-06, "loss": 0.506, "step": 395 }, { "epoch": 0.42999193765116905, "grad_norm": 2.165202028954103, "learning_rate": 7.031968623123503e-06, "loss": 0.5086, "step": 400 }, { "epoch": 0.43536683687180866, "grad_norm": 2.0059909450641435, "learning_rate": 6.945879065655164e-06, "loss": 0.5052, "step": 405 }, { "epoch": 0.44074173609244827, "grad_norm": 2.3690249701648725, "learning_rate": 6.859104190504725e-06, "loss": 0.5042, "step": 410 }, { "epoch": 0.4461166353130879, "grad_norm": 2.010697016188747, "learning_rate": 6.771674558850088e-06, "loss": 0.4958, "step": 415 }, { "epoch": 0.4514915345337275, "grad_norm": 1.9600695859304653, "learning_rate": 6.6836209624673575e-06, "loss": 0.5028, "step": 420 }, { "epoch": 0.4568664337543671, "grad_norm": 1.897791733525091, "learning_rate": 6.5949744128863026e-06, "loss": 0.4918, "step": 425 }, { "epoch": 0.4622413329750067, "grad_norm": 2.0272846103958755, "learning_rate": 6.5057661304684314e-06, "loss": 0.4863, "step": 430 }, { "epoch": 0.46761623219564635, "grad_norm": 1.888723644878263, "learning_rate": 6.41602753341152e-06, "loss": 0.4816, "step": 435 }, { "epoch": 0.47299113141628596, "grad_norm": 1.9192743092814748, "learning_rate": 6.32579022668446e-06, "loss": 0.4651, "step": 440 }, { "epoch": 0.47836603063692557, "grad_norm": 2.015499929746225, "learning_rate": 6.235085990896317e-06, "loss": 0.4843, "step": 445 }, { "epoch": 0.48374092985756517, "grad_norm": 1.8591635323006084, "learning_rate": 6.143946771103561e-06, "loss": 0.4792, "step": 450 }, { "epoch": 0.4891158290782048, "grad_norm": 1.9418548689130606, "learning_rate": 6.052404665559342e-06, "loss": 0.4808, "step": 455 }, { "epoch": 0.4944907282988444, "grad_norm": 1.944779410362686, "learning_rate": 5.960491914408846e-06, "loss": 0.478, "step": 460 }, { "epoch": 0.499865627519484, "grad_norm": 1.9179287908203055, "learning_rate": 5.8682408883346535e-06, "loss": 0.4741, "step": 465 }, { "epoch": 0.5052405267401237, "grad_norm": 1.939456137738699, "learning_rate": 5.775684077156133e-06, "loss": 0.4711, "step": 470 }, { "epoch": 0.5106154259607633, "grad_norm": 2.030961825347864, "learning_rate": 5.682854078386882e-06, "loss": 0.4684, "step": 475 }, { "epoch": 0.5159903251814029, "grad_norm": 2.017192647267279, "learning_rate": 5.5897835857542315e-06, "loss": 0.4716, "step": 480 }, { "epoch": 0.5213652244020425, "grad_norm": 1.8797704382098077, "learning_rate": 5.496505377684858e-06, "loss": 0.46, "step": 485 }, { "epoch": 0.5267401236226821, "grad_norm": 1.937682332181844, "learning_rate": 5.4030523057605865e-06, "loss": 0.4561, "step": 490 }, { "epoch": 0.5321150228433217, "grad_norm": 2.030922067258147, "learning_rate": 5.30945728314841e-06, "loss": 0.4558, "step": 495 }, { "epoch": 0.5374899220639613, "grad_norm": 2.0947701090151347, "learning_rate": 5.215753273008828e-06, "loss": 0.4483, "step": 500 }, { "epoch": 0.5428648212846009, "grad_norm": 1.9527895814298004, "learning_rate": 5.1219732768865744e-06, "loss": 0.4546, "step": 505 }, { "epoch": 0.5482397205052405, "grad_norm": 1.925264597102223, "learning_rate": 5.0281503230878304e-06, "loss": 0.454, "step": 510 }, { "epoch": 0.5536146197258801, "grad_norm": 1.870811300114845, "learning_rate": 4.934317455048005e-06, "loss": 0.4519, "step": 515 }, { "epoch": 0.5589895189465197, "grad_norm": 1.8517150460463325, "learning_rate": 4.840507719694202e-06, "loss": 0.4465, "step": 520 }, { "epoch": 0.5643644181671593, "grad_norm": 1.9901783088282807, "learning_rate": 4.746754155806437e-06, "loss": 0.4426, "step": 525 }, { "epoch": 0.5697393173877989, "grad_norm": 1.92526563478314, "learning_rate": 4.6530897823817425e-06, "loss": 0.447, "step": 530 }, { "epoch": 0.5751142166084386, "grad_norm": 2.0060885985999297, "learning_rate": 4.559547587005227e-06, "loss": 0.4324, "step": 535 }, { "epoch": 0.5804891158290783, "grad_norm": 1.9893587686847451, "learning_rate": 4.466160514232206e-06, "loss": 0.4333, "step": 540 }, { "epoch": 0.5858640150497179, "grad_norm": 1.9557768830171276, "learning_rate": 4.3729614539854815e-06, "loss": 0.4317, "step": 545 }, { "epoch": 0.5912389142703575, "grad_norm": 1.934922488824026, "learning_rate": 4.279983229971863e-06, "loss": 0.4385, "step": 550 }, { "epoch": 0.5966138134909971, "grad_norm": 1.876883342350168, "learning_rate": 4.187258588122019e-06, "loss": 0.4308, "step": 555 }, { "epoch": 0.6019887127116367, "grad_norm": 1.9145406252528392, "learning_rate": 4.094820185057701e-06, "loss": 0.429, "step": 560 }, { "epoch": 0.6073636119322763, "grad_norm": 1.9668078483751361, "learning_rate": 4.002700576590441e-06, "loss": 0.4355, "step": 565 }, { "epoch": 0.6127385111529159, "grad_norm": 1.961317833983311, "learning_rate": 3.910932206255742e-06, "loss": 0.4307, "step": 570 }, { "epoch": 0.6181134103735555, "grad_norm": 1.9266836648594068, "learning_rate": 3.819547393886816e-06, "loss": 0.4228, "step": 575 }, { "epoch": 0.6234883095941951, "grad_norm": 1.9178449532415156, "learning_rate": 3.7285783242318773e-06, "loss": 0.4208, "step": 580 }, { "epoch": 0.6288632088148347, "grad_norm": 1.9258945419319073, "learning_rate": 3.6380570356190346e-06, "loss": 0.4198, "step": 585 }, { "epoch": 0.6342381080354743, "grad_norm": 1.900131272139124, "learning_rate": 3.548015408672723e-06, "loss": 0.4166, "step": 590 }, { "epoch": 0.6396130072561139, "grad_norm": 1.9436783208199349, "learning_rate": 3.4584851550857007e-06, "loss": 0.4097, "step": 595 }, { "epoch": 0.6449879064767535, "grad_norm": 1.9775742266969982, "learning_rate": 3.3694978064505258e-06, "loss": 0.4129, "step": 600 }, { "epoch": 0.6503628056973931, "grad_norm": 1.8269664591538408, "learning_rate": 3.2810847031544703e-06, "loss": 0.4088, "step": 605 }, { "epoch": 0.6557377049180327, "grad_norm": 1.886926257627791, "learning_rate": 3.193276983341773e-06, "loss": 0.4047, "step": 610 }, { "epoch": 0.6611126041386725, "grad_norm": 1.9544062693179776, "learning_rate": 3.10610557194712e-06, "loss": 0.4062, "step": 615 }, { "epoch": 0.6664875033593121, "grad_norm": 1.8781772152552532, "learning_rate": 3.019601169804216e-06, "loss": 0.4104, "step": 620 }, { "epoch": 0.6718624025799517, "grad_norm": 1.8152357686298919, "learning_rate": 2.9337942428332787e-06, "loss": 0.4045, "step": 625 }, { "epoch": 0.6772373018005913, "grad_norm": 1.8231399150815806, "learning_rate": 2.848715011311271e-06, "loss": 0.392, "step": 630 }, { "epoch": 0.6826122010212309, "grad_norm": 1.8660672229253348, "learning_rate": 2.764393439228643e-06, "loss": 0.3943, "step": 635 }, { "epoch": 0.6879871002418705, "grad_norm": 1.8610053663644113, "learning_rate": 2.6808592237363364e-06, "loss": 0.4008, "step": 640 }, { "epoch": 0.6933619994625101, "grad_norm": 1.8382570488101446, "learning_rate": 2.5981417846867753e-06, "loss": 0.4016, "step": 645 }, { "epoch": 0.6987368986831497, "grad_norm": 2.1963817133336487, "learning_rate": 2.5162702542724924e-06, "loss": 0.3897, "step": 650 }, { "epoch": 0.7041117979037893, "grad_norm": 1.9176668039213747, "learning_rate": 2.4352734667661073e-06, "loss": 0.3828, "step": 655 }, { "epoch": 0.7094866971244289, "grad_norm": 1.7727520223769333, "learning_rate": 2.3551799483651894e-06, "loss": 0.3918, "step": 660 }, { "epoch": 0.7148615963450685, "grad_norm": 1.8508452992048683, "learning_rate": 2.2760179071456356e-06, "loss": 0.3923, "step": 665 }, { "epoch": 0.7202364955657081, "grad_norm": 1.8898208151955234, "learning_rate": 2.1978152231271077e-06, "loss": 0.3889, "step": 670 }, { "epoch": 0.7256113947863477, "grad_norm": 1.8442479576678996, "learning_rate": 2.120599438453968e-06, "loss": 0.3803, "step": 675 }, { "epoch": 0.7309862940069873, "grad_norm": 1.8287369121112131, "learning_rate": 2.044397747695247e-06, "loss": 0.3803, "step": 680 }, { "epoch": 0.7363611932276269, "grad_norm": 1.8799793582063802, "learning_rate": 1.969236988267005e-06, "loss": 0.3761, "step": 685 }, { "epoch": 0.7417360924482665, "grad_norm": 1.7804441999040213, "learning_rate": 1.8951436309804766e-06, "loss": 0.3803, "step": 690 }, { "epoch": 0.7471109916689062, "grad_norm": 1.8057045901843964, "learning_rate": 1.8221437707193424e-06, "loss": 0.3791, "step": 695 }, { "epoch": 0.7524858908895459, "grad_norm": 1.8066778942795885, "learning_rate": 1.7502631172493878e-06, "loss": 0.3787, "step": 700 }, { "epoch": 0.7578607901101855, "grad_norm": 1.8302691938217297, "learning_rate": 1.6795269861638041e-06, "loss": 0.3881, "step": 705 }, { "epoch": 0.7632356893308251, "grad_norm": 1.8234451924525408, "learning_rate": 1.6099602899673083e-06, "loss": 0.3755, "step": 710 }, { "epoch": 0.7686105885514647, "grad_norm": 1.767257969497289, "learning_rate": 1.5415875293022181e-06, "loss": 0.3767, "step": 715 }, { "epoch": 0.7739854877721043, "grad_norm": 1.8275002961230755, "learning_rate": 1.4744327843196043e-06, "loss": 0.3676, "step": 720 }, { "epoch": 0.7793603869927439, "grad_norm": 1.8120986388263132, "learning_rate": 1.4085197061985022e-06, "loss": 0.378, "step": 725 }, { "epoch": 0.7847352862133835, "grad_norm": 1.7559930973845725, "learning_rate": 1.3438715088162403e-06, "loss": 0.3676, "step": 730 }, { "epoch": 0.7901101854340231, "grad_norm": 1.8425030002430842, "learning_rate": 1.280510960572745e-06, "loss": 0.3721, "step": 735 }, { "epoch": 0.7954850846546627, "grad_norm": 1.8012025523517194, "learning_rate": 1.2184603763717684e-06, "loss": 0.3624, "step": 740 }, { "epoch": 0.8008599838753023, "grad_norm": 1.861454253307067, "learning_rate": 1.1577416097618138e-06, "loss": 0.3628, "step": 745 }, { "epoch": 0.8062348830959419, "grad_norm": 1.7687205113359847, "learning_rate": 1.0983760452395415e-06, "loss": 0.3624, "step": 750 }, { "epoch": 0.8116097823165815, "grad_norm": 1.9487313717243606, "learning_rate": 1.040384590718399e-06, "loss": 0.3554, "step": 755 }, { "epoch": 0.8169846815372211, "grad_norm": 1.6883835685983015, "learning_rate": 9.837876701650606e-07, "loss": 0.3552, "step": 760 }, { "epoch": 0.8223595807578608, "grad_norm": 1.8609027176051518, "learning_rate": 9.286052164063369e-07, "loss": 0.3658, "step": 765 }, { "epoch": 0.8277344799785004, "grad_norm": 1.8177990709248102, "learning_rate": 8.748566641090433e-07, "loss": 0.3649, "step": 770 }, { "epoch": 0.83310937919914, "grad_norm": 1.8719226675515828, "learning_rate": 8.225609429353187e-07, "loss": 0.3679, "step": 775 }, { "epoch": 0.8384842784197797, "grad_norm": 1.740827963426437, "learning_rate": 7.717364708758024e-07, "loss": 0.3587, "step": 780 }, { "epoch": 0.8438591776404193, "grad_norm": 1.7094014840076446, "learning_rate": 7.224011477630166e-07, "loss": 0.3615, "step": 785 }, { "epoch": 0.8492340768610589, "grad_norm": 1.790943519825671, "learning_rate": 6.745723489672412e-07, "loss": 0.3401, "step": 790 }, { "epoch": 0.8546089760816985, "grad_norm": 1.852602320274836, "learning_rate": 6.282669192770896e-07, "loss": 0.3615, "step": 795 }, { "epoch": 0.8599838753023381, "grad_norm": 1.7448118489231759, "learning_rate": 5.83501166966956e-07, "loss": 0.356, "step": 800 }, { "epoch": 0.8653587745229777, "grad_norm": 1.7138058444443134, "learning_rate": 5.402908580534233e-07, "loss": 0.3507, "step": 805 }, { "epoch": 0.8707336737436173, "grad_norm": 1.786529360611276, "learning_rate": 4.986512107426283e-07, "loss": 0.3545, "step": 810 }, { "epoch": 0.8761085729642569, "grad_norm": 1.756925743913887, "learning_rate": 4.5859689007058896e-07, "loss": 0.3561, "step": 815 }, { "epoch": 0.8814834721848965, "grad_norm": 1.7845712546592873, "learning_rate": 4.2014200273832406e-07, "loss": 0.3484, "step": 820 }, { "epoch": 0.8868583714055361, "grad_norm": 1.8932772047523514, "learning_rate": 3.8330009214363197e-07, "loss": 0.3568, "step": 825 }, { "epoch": 0.8922332706261757, "grad_norm": 1.6852471718751032, "learning_rate": 3.4808413361125004e-07, "loss": 0.3483, "step": 830 }, { "epoch": 0.8976081698468154, "grad_norm": 1.774916611408541, "learning_rate": 3.1450652982307815e-07, "loss": 0.3549, "step": 835 }, { "epoch": 0.902983069067455, "grad_norm": 1.9063855186007104, "learning_rate": 2.8257910645009935e-07, "loss": 0.3551, "step": 840 }, { "epoch": 0.9083579682880946, "grad_norm": 1.7048794107309078, "learning_rate": 2.523131079874963e-07, "loss": 0.3542, "step": 845 }, { "epoch": 0.9137328675087342, "grad_norm": 1.8134483816091473, "learning_rate": 2.2371919379446495e-07, "loss": 0.3546, "step": 850 }, { "epoch": 0.9191077667293738, "grad_norm": 1.847919769203762, "learning_rate": 1.9680743434010385e-07, "loss": 0.3468, "step": 855 }, { "epoch": 0.9244826659500134, "grad_norm": 1.7597705426288421, "learning_rate": 1.7158730765669817e-07, "loss": 0.3598, "step": 860 }, { "epoch": 0.9298575651706531, "grad_norm": 1.8107276346763232, "learning_rate": 1.480676960016636e-07, "loss": 0.3596, "step": 865 }, { "epoch": 0.9352324643912927, "grad_norm": 1.726387338435163, "learning_rate": 1.2625688272930925e-07, "loss": 0.3555, "step": 870 }, { "epoch": 0.9406073636119323, "grad_norm": 1.657583954677237, "learning_rate": 1.0616254937352966e-07, "loss": 0.3478, "step": 875 }, { "epoch": 0.9459822628325719, "grad_norm": 1.7186030940673933, "learning_rate": 8.779177294245044e-08, "loss": 0.3476, "step": 880 }, { "epoch": 0.9513571620532115, "grad_norm": 1.803294093442226, "learning_rate": 7.115102342598101e-08, "loss": 0.3473, "step": 885 }, { "epoch": 0.9567320612738511, "grad_norm": 1.9161389425178095, "learning_rate": 5.6246161517158336e-08, "loss": 0.346, "step": 890 }, { "epoch": 0.9621069604944907, "grad_norm": 1.7244304109116553, "learning_rate": 4.308243654806643e-08, "loss": 0.3522, "step": 895 }, { "epoch": 0.9674818597151303, "grad_norm": 1.798195655862142, "learning_rate": 3.166448464108629e-08, "loss": 0.348, "step": 900 }, { "epoch": 0.97285675893577, "grad_norm": 1.7093455607124064, "learning_rate": 2.1996327076096446e-08, "loss": 0.3497, "step": 905 }, { "epoch": 0.9782316581564096, "grad_norm": 1.79086386462793, "learning_rate": 1.4081368874226398e-08, "loss": 0.3483, "step": 910 }, { "epoch": 0.9836065573770492, "grad_norm": 1.7408308676379578, "learning_rate": 7.922397598642551e-09, "loss": 0.3451, "step": 915 }, { "epoch": 0.9889814565976888, "grad_norm": 1.7334185566757478, "learning_rate": 3.5215823727974274e-09, "loss": 0.3496, "step": 920 }, { "epoch": 0.9943563558183284, "grad_norm": 1.7114218675449298, "learning_rate": 8.804731164901991e-10, "loss": 0.345, "step": 925 }, { "epoch": 0.999731255038968, "grad_norm": 1.7912428537040286, "learning_rate": 0.0, "loss": 0.3503, "step": 930 }, { "epoch": 0.999731255038968, "eval_loss": 0.3442555069923401, "eval_runtime": 95.8162, "eval_samples_per_second": 3.152, "eval_steps_per_second": 0.793, "step": 930 }, { "epoch": 0.999731255038968, "step": 930, "total_flos": 194670734868480.0, "train_loss": 0.508077065149943, "train_runtime": 21031.9081, "train_samples_per_second": 1.415, "train_steps_per_second": 0.044 } ], "logging_steps": 5, "max_steps": 930, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 194670734868480.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }