{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.999731255038968,
  "eval_steps": 500,
  "global_step": 930,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0010749798441279225,
      "grad_norm": 22.762095682751983,
      "learning_rate": 1.0752688172043012e-07,
      "loss": 1.3377,
      "step": 1
    },
    {
      "epoch": 0.005374899220639613,
      "grad_norm": 20.69927583673527,
      "learning_rate": 5.376344086021506e-07,
      "loss": 1.3086,
      "step": 5
    },
    {
      "epoch": 0.010749798441279226,
      "grad_norm": 8.308333536609279,
      "learning_rate": 1.0752688172043011e-06,
      "loss": 1.1878,
      "step": 10
    },
    {
      "epoch": 0.01612469766191884,
      "grad_norm": 6.959676956693679,
      "learning_rate": 1.6129032258064516e-06,
      "loss": 1.0321,
      "step": 15
    },
    {
      "epoch": 0.021499596882558453,
      "grad_norm": 2.9820242928638794,
      "learning_rate": 2.1505376344086023e-06,
      "loss": 0.9101,
      "step": 20
    },
    {
      "epoch": 0.026874496103198066,
      "grad_norm": 2.5554546533203144,
      "learning_rate": 2.688172043010753e-06,
      "loss": 0.8617,
      "step": 25
    },
    {
      "epoch": 0.03224939532383768,
      "grad_norm": 2.286909995566289,
      "learning_rate": 3.225806451612903e-06,
      "loss": 0.8383,
      "step": 30
    },
    {
      "epoch": 0.03762429454447729,
      "grad_norm": 2.3398312761165907,
      "learning_rate": 3.763440860215054e-06,
      "loss": 0.8185,
      "step": 35
    },
    {
      "epoch": 0.042999193765116905,
      "grad_norm": 2.204837686008753,
      "learning_rate": 4.3010752688172045e-06,
      "loss": 0.8053,
      "step": 40
    },
    {
      "epoch": 0.04837409298575652,
      "grad_norm": 2.315534843885615,
      "learning_rate": 4.838709677419355e-06,
      "loss": 0.7857,
      "step": 45
    },
    {
      "epoch": 0.05374899220639613,
      "grad_norm": 2.5131629870127705,
      "learning_rate": 5.376344086021506e-06,
      "loss": 0.7691,
      "step": 50
    },
    {
      "epoch": 0.059123891427035745,
      "grad_norm": 2.4607920938058396,
      "learning_rate": 5.9139784946236566e-06,
      "loss": 0.758,
      "step": 55
    },
    {
      "epoch": 0.06449879064767536,
      "grad_norm": 2.367828432773105,
      "learning_rate": 6.451612903225806e-06,
      "loss": 0.7375,
      "step": 60
    },
    {
      "epoch": 0.06987368986831496,
      "grad_norm": 2.3306378375646406,
      "learning_rate": 6.989247311827958e-06,
      "loss": 0.7304,
      "step": 65
    },
    {
      "epoch": 0.07524858908895458,
      "grad_norm": 2.344704242935191,
      "learning_rate": 7.526881720430108e-06,
      "loss": 0.7131,
      "step": 70
    },
    {
      "epoch": 0.08062348830959419,
      "grad_norm": 2.3556743904747366,
      "learning_rate": 8.064516129032258e-06,
      "loss": 0.7109,
      "step": 75
    },
    {
      "epoch": 0.08599838753023381,
      "grad_norm": 2.2479332716993605,
      "learning_rate": 8.602150537634409e-06,
      "loss": 0.7022,
      "step": 80
    },
    {
      "epoch": 0.09137328675087342,
      "grad_norm": 2.3017400204623666,
      "learning_rate": 9.13978494623656e-06,
      "loss": 0.7046,
      "step": 85
    },
    {
      "epoch": 0.09674818597151304,
      "grad_norm": 2.3666760651980536,
      "learning_rate": 9.67741935483871e-06,
      "loss": 0.698,
      "step": 90
    },
    {
      "epoch": 0.10212308519215264,
      "grad_norm": 2.9278768472096433,
      "learning_rate": 9.999859120828162e-06,
      "loss": 0.6998,
      "step": 95
    },
    {
      "epoch": 0.10749798441279226,
      "grad_norm": 2.4524372124571165,
      "learning_rate": 9.998274321315453e-06,
      "loss": 0.6905,
      "step": 100
    },
    {
      "epoch": 0.11287288363343187,
      "grad_norm": 2.4347797178103394,
      "learning_rate": 9.994929183335237e-06,
      "loss": 0.679,
      "step": 105
    },
    {
      "epoch": 0.11824778285407149,
      "grad_norm": 2.3508265818164515,
      "learning_rate": 9.989824885009142e-06,
      "loss": 0.6843,
      "step": 110
    },
    {
      "epoch": 0.1236226820747111,
      "grad_norm": 2.164423459020718,
      "learning_rate": 9.982963224016152e-06,
      "loss": 0.6784,
      "step": 115
    },
    {
      "epoch": 0.12899758129535072,
      "grad_norm": 2.4050223924123113,
      "learning_rate": 9.974346616959476e-06,
      "loss": 0.6693,
      "step": 120
    },
    {
      "epoch": 0.13437248051599032,
      "grad_norm": 2.1741064666149903,
      "learning_rate": 9.963978098515468e-06,
      "loss": 0.6667,
      "step": 125
    },
    {
      "epoch": 0.13974737973662993,
      "grad_norm": 2.151624445725951,
      "learning_rate": 9.951861320364822e-06,
      "loss": 0.6686,
      "step": 130
    },
    {
      "epoch": 0.14512227895726956,
      "grad_norm": 2.140470340411352,
      "learning_rate": 9.938000549906509e-06,
      "loss": 0.6626,
      "step": 135
    },
    {
      "epoch": 0.15049717817790917,
      "grad_norm": 2.2274245089556706,
      "learning_rate": 9.922400668754833e-06,
      "loss": 0.6517,
      "step": 140
    },
    {
      "epoch": 0.15587207739854878,
      "grad_norm": 1.95083198656833,
      "learning_rate": 9.905067171020185e-06,
      "loss": 0.6512,
      "step": 145
    },
    {
      "epoch": 0.16124697661918838,
      "grad_norm": 2.4043795722101136,
      "learning_rate": 9.88600616137407e-06,
      "loss": 0.6524,
      "step": 150
    },
    {
      "epoch": 0.16662187583982802,
      "grad_norm": 2.004729246454245,
      "learning_rate": 9.86522435289912e-06,
      "loss": 0.6495,
      "step": 155
    },
    {
      "epoch": 0.17199677506046762,
      "grad_norm": 2.0488055473575724,
      "learning_rate": 9.8427290647248e-06,
      "loss": 0.6417,
      "step": 160
    },
    {
      "epoch": 0.17737167428110723,
      "grad_norm": 2.026026305529713,
      "learning_rate": 9.818528219449705e-06,
      "loss": 0.6308,
      "step": 165
    },
    {
      "epoch": 0.18274657350174683,
      "grad_norm": 1.923497956617847,
      "learning_rate": 9.792630340351301e-06,
      "loss": 0.6464,
      "step": 170
    },
    {
      "epoch": 0.18812147272238647,
      "grad_norm": 1.8714980274335011,
      "learning_rate": 9.765044548384113e-06,
      "loss": 0.6406,
      "step": 175
    },
    {
      "epoch": 0.19349637194302607,
      "grad_norm": 2.130465692033948,
      "learning_rate": 9.735780558967434e-06,
      "loss": 0.6292,
      "step": 180
    },
    {
      "epoch": 0.19887127116366568,
      "grad_norm": 2.0472476860009086,
      "learning_rate": 9.70484867856365e-06,
      "loss": 0.6209,
      "step": 185
    },
    {
      "epoch": 0.2042461703843053,
      "grad_norm": 2.45138914999084,
      "learning_rate": 9.67225980104841e-06,
      "loss": 0.633,
      "step": 190
    },
    {
      "epoch": 0.20962106960494492,
      "grad_norm": 1.9725264804153964,
      "learning_rate": 9.638025403873939e-06,
      "loss": 0.625,
      "step": 195
    },
    {
      "epoch": 0.21499596882558453,
      "grad_norm": 1.956589833642945,
      "learning_rate": 9.602157544026785e-06,
      "loss": 0.6274,
      "step": 200
    },
    {
      "epoch": 0.22037086804622413,
      "grad_norm": 2.032060013563488,
      "learning_rate": 9.564668853781483e-06,
      "loss": 0.6143,
      "step": 205
    },
    {
      "epoch": 0.22574576726686374,
      "grad_norm": 1.9927039178935393,
      "learning_rate": 9.525572536251608e-06,
      "loss": 0.6131,
      "step": 210
    },
    {
      "epoch": 0.23112066648750335,
      "grad_norm": 1.946959208571873,
      "learning_rate": 9.484882360739772e-06,
      "loss": 0.6029,
      "step": 215
    },
    {
      "epoch": 0.23649556570814298,
      "grad_norm": 1.8806701194051492,
      "learning_rate": 9.442612657888237e-06,
      "loss": 0.6118,
      "step": 220
    },
    {
      "epoch": 0.24187046492878259,
      "grad_norm": 2.225666197462802,
      "learning_rate": 9.398778314631801e-06,
      "loss": 0.6028,
      "step": 225
    },
    {
      "epoch": 0.2472453641494222,
      "grad_norm": 2.001619119705015,
      "learning_rate": 9.353394768954791e-06,
      "loss": 0.609,
      "step": 230
    },
    {
      "epoch": 0.2526202633700618,
      "grad_norm": 2.2733431261860777,
      "learning_rate": 9.30647800445397e-06,
      "loss": 0.601,
      "step": 235
    },
    {
      "epoch": 0.25799516259070143,
      "grad_norm": 2.230210940508007,
      "learning_rate": 9.258044544709276e-06,
      "loss": 0.5833,
      "step": 240
    },
    {
      "epoch": 0.26337006181134104,
      "grad_norm": 1.904244193327823,
      "learning_rate": 9.208111447464407e-06,
      "loss": 0.6101,
      "step": 245
    },
    {
      "epoch": 0.26874496103198064,
      "grad_norm": 1.88832266700029,
      "learning_rate": 9.156696298619266e-06,
      "loss": 0.5953,
      "step": 250
    },
    {
      "epoch": 0.27411986025262025,
      "grad_norm": 1.838793567250841,
      "learning_rate": 9.103817206036383e-06,
      "loss": 0.594,
      "step": 255
    },
    {
      "epoch": 0.27949475947325986,
      "grad_norm": 1.951838096647268,
      "learning_rate": 9.049492793163539e-06,
      "loss": 0.5783,
      "step": 260
    },
    {
      "epoch": 0.28486965869389946,
      "grad_norm": 2.1569848912019296,
      "learning_rate": 8.993742192474773e-06,
      "loss": 0.5897,
      "step": 265
    },
    {
      "epoch": 0.2902445579145391,
      "grad_norm": 2.0054996500986833,
      "learning_rate": 8.936585038732143e-06,
      "loss": 0.5866,
      "step": 270
    },
    {
      "epoch": 0.29561945713517873,
      "grad_norm": 2.112212926041267,
      "learning_rate": 8.878041462070556e-06,
      "loss": 0.5795,
      "step": 275
    },
    {
      "epoch": 0.30099435635581834,
      "grad_norm": 2.1036396849174492,
      "learning_rate": 8.818132080908178e-06,
      "loss": 0.5818,
      "step": 280
    },
    {
      "epoch": 0.30636925557645794,
      "grad_norm": 2.167096693852596,
      "learning_rate": 8.756877994684818e-06,
      "loss": 0.564,
      "step": 285
    },
    {
      "epoch": 0.31174415479709755,
      "grad_norm": 1.956775665235312,
      "learning_rate": 8.694300776430958e-06,
      "loss": 0.5683,
      "step": 290
    },
    {
      "epoch": 0.31711905401773716,
      "grad_norm": 1.97168436084639,
      "learning_rate": 8.630422465169947e-06,
      "loss": 0.5697,
      "step": 295
    },
    {
      "epoch": 0.32249395323837676,
      "grad_norm": 2.043558514889714,
      "learning_rate": 8.565265558156101e-06,
      "loss": 0.5635,
      "step": 300
    },
    {
      "epoch": 0.32786885245901637,
      "grad_norm": 1.9273574277695336,
      "learning_rate": 8.498853002951414e-06,
      "loss": 0.5728,
      "step": 305
    },
    {
      "epoch": 0.33324375167965603,
      "grad_norm": 1.9657306797632228,
      "learning_rate": 8.43120818934367e-06,
      "loss": 0.5619,
      "step": 310
    },
    {
      "epoch": 0.33861865090029564,
      "grad_norm": 1.9012625288122775,
      "learning_rate": 8.362354941108803e-06,
      "loss": 0.5702,
      "step": 315
    },
    {
      "epoch": 0.34399355012093524,
      "grad_norm": 2.123858299795287,
      "learning_rate": 8.292317507620438e-06,
      "loss": 0.554,
      "step": 320
    },
    {
      "epoch": 0.34936844934157485,
      "grad_norm": 1.9513877089408058,
      "learning_rate": 8.221120555309511e-06,
      "loss": 0.5498,
      "step": 325
    },
    {
      "epoch": 0.35474334856221446,
      "grad_norm": 1.914136234798412,
      "learning_rate": 8.148789158977012e-06,
      "loss": 0.5455,
      "step": 330
    },
    {
      "epoch": 0.36011824778285406,
      "grad_norm": 1.8672863149021612,
      "learning_rate": 8.075348792962924e-06,
      "loss": 0.5404,
      "step": 335
    },
    {
      "epoch": 0.36549314700349367,
      "grad_norm": 2.1793777540733177,
      "learning_rate": 8.000825322174424e-06,
      "loss": 0.5423,
      "step": 340
    },
    {
      "epoch": 0.3708680462241333,
      "grad_norm": 2.0291841027180455,
      "learning_rate": 7.925244992976538e-06,
      "loss": 0.5386,
      "step": 345
    },
    {
      "epoch": 0.37624294544477294,
      "grad_norm": 2.1016851887581662,
      "learning_rate": 7.848634423948468e-06,
      "loss": 0.5326,
      "step": 350
    },
    {
      "epoch": 0.38161784466541254,
      "grad_norm": 2.029810803364677,
      "learning_rate": 7.7710205965088e-06,
      "loss": 0.5403,
      "step": 355
    },
    {
      "epoch": 0.38699274388605215,
      "grad_norm": 1.8590989815971988,
      "learning_rate": 7.692430845412946e-06,
      "loss": 0.5333,
      "step": 360
    },
    {
      "epoch": 0.39236764310669175,
      "grad_norm": 1.9140526286879267,
      "learning_rate": 7.612892849126132e-06,
      "loss": 0.5252,
      "step": 365
    },
    {
      "epoch": 0.39774254232733136,
      "grad_norm": 1.9988915945152586,
      "learning_rate": 7.532434620075349e-06,
      "loss": 0.5242,
      "step": 370
    },
    {
      "epoch": 0.40311744154797097,
      "grad_norm": 1.9746262065023925,
      "learning_rate": 7.451084494783668e-06,
      "loss": 0.5085,
      "step": 375
    },
    {
      "epoch": 0.4084923407686106,
      "grad_norm": 1.9222623607017046,
      "learning_rate": 7.368871123890425e-06,
      "loss": 0.5247,
      "step": 380
    },
    {
      "epoch": 0.4138672399892502,
      "grad_norm": 2.184926573195526,
      "learning_rate": 7.285823462060776e-06,
      "loss": 0.5153,
      "step": 385
    },
    {
      "epoch": 0.41924213920988984,
      "grad_norm": 1.9008034480063025,
      "learning_rate": 7.201970757788172e-06,
      "loss": 0.5146,
      "step": 390
    },
    {
      "epoch": 0.42461703843052945,
      "grad_norm": 2.0387228861562776,
      "learning_rate": 7.117342543093358e-06,
      "loss": 0.506,
      "step": 395
    },
    {
      "epoch": 0.42999193765116905,
      "grad_norm": 2.165202028954103,
      "learning_rate": 7.031968623123503e-06,
      "loss": 0.5086,
      "step": 400
    },
    {
      "epoch": 0.43536683687180866,
      "grad_norm": 2.0059909450641435,
      "learning_rate": 6.945879065655164e-06,
      "loss": 0.5052,
      "step": 405
    },
    {
      "epoch": 0.44074173609244827,
      "grad_norm": 2.3690249701648725,
      "learning_rate": 6.859104190504725e-06,
      "loss": 0.5042,
      "step": 410
    },
    {
      "epoch": 0.4461166353130879,
      "grad_norm": 2.010697016188747,
      "learning_rate": 6.771674558850088e-06,
      "loss": 0.4958,
      "step": 415
    },
    {
      "epoch": 0.4514915345337275,
      "grad_norm": 1.9600695859304653,
      "learning_rate": 6.6836209624673575e-06,
      "loss": 0.5028,
      "step": 420
    },
    {
      "epoch": 0.4568664337543671,
      "grad_norm": 1.897791733525091,
      "learning_rate": 6.5949744128863026e-06,
      "loss": 0.4918,
      "step": 425
    },
    {
      "epoch": 0.4622413329750067,
      "grad_norm": 2.0272846103958755,
      "learning_rate": 6.5057661304684314e-06,
      "loss": 0.4863,
      "step": 430
    },
    {
      "epoch": 0.46761623219564635,
      "grad_norm": 1.888723644878263,
      "learning_rate": 6.41602753341152e-06,
      "loss": 0.4816,
      "step": 435
    },
    {
      "epoch": 0.47299113141628596,
      "grad_norm": 1.9192743092814748,
      "learning_rate": 6.32579022668446e-06,
      "loss": 0.4651,
      "step": 440
    },
    {
      "epoch": 0.47836603063692557,
      "grad_norm": 2.015499929746225,
      "learning_rate": 6.235085990896317e-06,
      "loss": 0.4843,
      "step": 445
    },
    {
      "epoch": 0.48374092985756517,
      "grad_norm": 1.8591635323006084,
      "learning_rate": 6.143946771103561e-06,
      "loss": 0.4792,
      "step": 450
    },
    {
      "epoch": 0.4891158290782048,
      "grad_norm": 1.9418548689130606,
      "learning_rate": 6.052404665559342e-06,
      "loss": 0.4808,
      "step": 455
    },
    {
      "epoch": 0.4944907282988444,
      "grad_norm": 1.944779410362686,
      "learning_rate": 5.960491914408846e-06,
      "loss": 0.478,
      "step": 460
    },
    {
      "epoch": 0.499865627519484,
      "grad_norm": 1.9179287908203055,
      "learning_rate": 5.8682408883346535e-06,
      "loss": 0.4741,
      "step": 465
    },
    {
      "epoch": 0.5052405267401237,
      "grad_norm": 1.939456137738699,
      "learning_rate": 5.775684077156133e-06,
      "loss": 0.4711,
      "step": 470
    },
    {
      "epoch": 0.5106154259607633,
      "grad_norm": 2.030961825347864,
      "learning_rate": 5.682854078386882e-06,
      "loss": 0.4684,
      "step": 475
    },
    {
      "epoch": 0.5159903251814029,
      "grad_norm": 2.017192647267279,
      "learning_rate": 5.5897835857542315e-06,
      "loss": 0.4716,
      "step": 480
    },
    {
      "epoch": 0.5213652244020425,
      "grad_norm": 1.8797704382098077,
      "learning_rate": 5.496505377684858e-06,
      "loss": 0.46,
      "step": 485
    },
    {
      "epoch": 0.5267401236226821,
      "grad_norm": 1.937682332181844,
      "learning_rate": 5.4030523057605865e-06,
      "loss": 0.4561,
      "step": 490
    },
    {
      "epoch": 0.5321150228433217,
      "grad_norm": 2.030922067258147,
      "learning_rate": 5.30945728314841e-06,
      "loss": 0.4558,
      "step": 495
    },
    {
      "epoch": 0.5374899220639613,
      "grad_norm": 2.0947701090151347,
      "learning_rate": 5.215753273008828e-06,
      "loss": 0.4483,
      "step": 500
    },
    {
      "epoch": 0.5428648212846009,
      "grad_norm": 1.9527895814298004,
      "learning_rate": 5.1219732768865744e-06,
      "loss": 0.4546,
      "step": 505
    },
    {
      "epoch": 0.5482397205052405,
      "grad_norm": 1.925264597102223,
      "learning_rate": 5.0281503230878304e-06,
      "loss": 0.454,
      "step": 510
    },
    {
      "epoch": 0.5536146197258801,
      "grad_norm": 1.870811300114845,
      "learning_rate": 4.934317455048005e-06,
      "loss": 0.4519,
      "step": 515
    },
    {
      "epoch": 0.5589895189465197,
      "grad_norm": 1.8517150460463325,
      "learning_rate": 4.840507719694202e-06,
      "loss": 0.4465,
      "step": 520
    },
    {
      "epoch": 0.5643644181671593,
      "grad_norm": 1.9901783088282807,
      "learning_rate": 4.746754155806437e-06,
      "loss": 0.4426,
      "step": 525
    },
    {
      "epoch": 0.5697393173877989,
      "grad_norm": 1.92526563478314,
      "learning_rate": 4.6530897823817425e-06,
      "loss": 0.447,
      "step": 530
    },
    {
      "epoch": 0.5751142166084386,
      "grad_norm": 2.0060885985999297,
      "learning_rate": 4.559547587005227e-06,
      "loss": 0.4324,
      "step": 535
    },
    {
      "epoch": 0.5804891158290783,
      "grad_norm": 1.9893587686847451,
      "learning_rate": 4.466160514232206e-06,
      "loss": 0.4333,
      "step": 540
    },
    {
      "epoch": 0.5858640150497179,
      "grad_norm": 1.9557768830171276,
      "learning_rate": 4.3729614539854815e-06,
      "loss": 0.4317,
      "step": 545
    },
    {
      "epoch": 0.5912389142703575,
      "grad_norm": 1.934922488824026,
      "learning_rate": 4.279983229971863e-06,
      "loss": 0.4385,
      "step": 550
    },
    {
      "epoch": 0.5966138134909971,
      "grad_norm": 1.876883342350168,
      "learning_rate": 4.187258588122019e-06,
      "loss": 0.4308,
      "step": 555
    },
    {
      "epoch": 0.6019887127116367,
      "grad_norm": 1.9145406252528392,
      "learning_rate": 4.094820185057701e-06,
      "loss": 0.429,
      "step": 560
    },
    {
      "epoch": 0.6073636119322763,
      "grad_norm": 1.9668078483751361,
      "learning_rate": 4.002700576590441e-06,
      "loss": 0.4355,
      "step": 565
    },
    {
      "epoch": 0.6127385111529159,
      "grad_norm": 1.961317833983311,
      "learning_rate": 3.910932206255742e-06,
      "loss": 0.4307,
      "step": 570
    },
    {
      "epoch": 0.6181134103735555,
      "grad_norm": 1.9266836648594068,
      "learning_rate": 3.819547393886816e-06,
      "loss": 0.4228,
      "step": 575
    },
    {
      "epoch": 0.6234883095941951,
      "grad_norm": 1.9178449532415156,
      "learning_rate": 3.7285783242318773e-06,
      "loss": 0.4208,
      "step": 580
    },
    {
      "epoch": 0.6288632088148347,
      "grad_norm": 1.9258945419319073,
      "learning_rate": 3.6380570356190346e-06,
      "loss": 0.4198,
      "step": 585
    },
    {
      "epoch": 0.6342381080354743,
      "grad_norm": 1.900131272139124,
      "learning_rate": 3.548015408672723e-06,
      "loss": 0.4166,
      "step": 590
    },
    {
      "epoch": 0.6396130072561139,
      "grad_norm": 1.9436783208199349,
      "learning_rate": 3.4584851550857007e-06,
      "loss": 0.4097,
      "step": 595
    },
    {
      "epoch": 0.6449879064767535,
      "grad_norm": 1.9775742266969982,
      "learning_rate": 3.3694978064505258e-06,
      "loss": 0.4129,
      "step": 600
    },
    {
      "epoch": 0.6503628056973931,
      "grad_norm": 1.8269664591538408,
      "learning_rate": 3.2810847031544703e-06,
      "loss": 0.4088,
      "step": 605
    },
    {
      "epoch": 0.6557377049180327,
      "grad_norm": 1.886926257627791,
      "learning_rate": 3.193276983341773e-06,
      "loss": 0.4047,
      "step": 610
    },
    {
      "epoch": 0.6611126041386725,
      "grad_norm": 1.9544062693179776,
      "learning_rate": 3.10610557194712e-06,
      "loss": 0.4062,
      "step": 615
    },
    {
      "epoch": 0.6664875033593121,
      "grad_norm": 1.8781772152552532,
      "learning_rate": 3.019601169804216e-06,
      "loss": 0.4104,
      "step": 620
    },
    {
      "epoch": 0.6718624025799517,
      "grad_norm": 1.8152357686298919,
      "learning_rate": 2.9337942428332787e-06,
      "loss": 0.4045,
      "step": 625
    },
    {
      "epoch": 0.6772373018005913,
      "grad_norm": 1.8231399150815806,
      "learning_rate": 2.848715011311271e-06,
      "loss": 0.392,
      "step": 630
    },
    {
      "epoch": 0.6826122010212309,
      "grad_norm": 1.8660672229253348,
      "learning_rate": 2.764393439228643e-06,
      "loss": 0.3943,
      "step": 635
    },
    {
      "epoch": 0.6879871002418705,
      "grad_norm": 1.8610053663644113,
      "learning_rate": 2.6808592237363364e-06,
      "loss": 0.4008,
      "step": 640
    },
    {
      "epoch": 0.6933619994625101,
      "grad_norm": 1.8382570488101446,
      "learning_rate": 2.5981417846867753e-06,
      "loss": 0.4016,
      "step": 645
    },
    {
      "epoch": 0.6987368986831497,
      "grad_norm": 2.1963817133336487,
      "learning_rate": 2.5162702542724924e-06,
      "loss": 0.3897,
      "step": 650
    },
    {
      "epoch": 0.7041117979037893,
      "grad_norm": 1.9176668039213747,
      "learning_rate": 2.4352734667661073e-06,
      "loss": 0.3828,
      "step": 655
    },
    {
      "epoch": 0.7094866971244289,
      "grad_norm": 1.7727520223769333,
      "learning_rate": 2.3551799483651894e-06,
      "loss": 0.3918,
      "step": 660
    },
    {
      "epoch": 0.7148615963450685,
      "grad_norm": 1.8508452992048683,
      "learning_rate": 2.2760179071456356e-06,
      "loss": 0.3923,
      "step": 665
    },
    {
      "epoch": 0.7202364955657081,
      "grad_norm": 1.8898208151955234,
      "learning_rate": 2.1978152231271077e-06,
      "loss": 0.3889,
      "step": 670
    },
    {
      "epoch": 0.7256113947863477,
      "grad_norm": 1.8442479576678996,
      "learning_rate": 2.120599438453968e-06,
      "loss": 0.3803,
      "step": 675
    },
    {
      "epoch": 0.7309862940069873,
      "grad_norm": 1.8287369121112131,
      "learning_rate": 2.044397747695247e-06,
      "loss": 0.3803,
      "step": 680
    },
    {
      "epoch": 0.7363611932276269,
      "grad_norm": 1.8799793582063802,
      "learning_rate": 1.969236988267005e-06,
      "loss": 0.3761,
      "step": 685
    },
    {
      "epoch": 0.7417360924482665,
      "grad_norm": 1.7804441999040213,
      "learning_rate": 1.8951436309804766e-06,
      "loss": 0.3803,
      "step": 690
    },
    {
      "epoch": 0.7471109916689062,
      "grad_norm": 1.8057045901843964,
      "learning_rate": 1.8221437707193424e-06,
      "loss": 0.3791,
      "step": 695
    },
    {
      "epoch": 0.7524858908895459,
      "grad_norm": 1.8066778942795885,
      "learning_rate": 1.7502631172493878e-06,
      "loss": 0.3787,
      "step": 700
    },
    {
      "epoch": 0.7578607901101855,
      "grad_norm": 1.8302691938217297,
      "learning_rate": 1.6795269861638041e-06,
      "loss": 0.3881,
      "step": 705
    },
    {
      "epoch": 0.7632356893308251,
      "grad_norm": 1.8234451924525408,
      "learning_rate": 1.6099602899673083e-06,
      "loss": 0.3755,
      "step": 710
    },
    {
      "epoch": 0.7686105885514647,
      "grad_norm": 1.767257969497289,
      "learning_rate": 1.5415875293022181e-06,
      "loss": 0.3767,
      "step": 715
    },
    {
      "epoch": 0.7739854877721043,
      "grad_norm": 1.8275002961230755,
      "learning_rate": 1.4744327843196043e-06,
      "loss": 0.3676,
      "step": 720
    },
    {
      "epoch": 0.7793603869927439,
      "grad_norm": 1.8120986388263132,
      "learning_rate": 1.4085197061985022e-06,
      "loss": 0.378,
      "step": 725
    },
    {
      "epoch": 0.7847352862133835,
      "grad_norm": 1.7559930973845725,
      "learning_rate": 1.3438715088162403e-06,
      "loss": 0.3676,
      "step": 730
    },
    {
      "epoch": 0.7901101854340231,
      "grad_norm": 1.8425030002430842,
      "learning_rate": 1.280510960572745e-06,
      "loss": 0.3721,
      "step": 735
    },
    {
      "epoch": 0.7954850846546627,
      "grad_norm": 1.8012025523517194,
      "learning_rate": 1.2184603763717684e-06,
      "loss": 0.3624,
      "step": 740
    },
    {
      "epoch": 0.8008599838753023,
      "grad_norm": 1.861454253307067,
      "learning_rate": 1.1577416097618138e-06,
      "loss": 0.3628,
      "step": 745
    },
    {
      "epoch": 0.8062348830959419,
      "grad_norm": 1.7687205113359847,
      "learning_rate": 1.0983760452395415e-06,
      "loss": 0.3624,
      "step": 750
    },
    {
      "epoch": 0.8116097823165815,
      "grad_norm": 1.9487313717243606,
      "learning_rate": 1.040384590718399e-06,
      "loss": 0.3554,
      "step": 755
    },
    {
      "epoch": 0.8169846815372211,
      "grad_norm": 1.6883835685983015,
      "learning_rate": 9.837876701650606e-07,
      "loss": 0.3552,
      "step": 760
    },
    {
      "epoch": 0.8223595807578608,
      "grad_norm": 1.8609027176051518,
      "learning_rate": 9.286052164063369e-07,
      "loss": 0.3658,
      "step": 765
    },
    {
      "epoch": 0.8277344799785004,
      "grad_norm": 1.8177990709248102,
      "learning_rate": 8.748566641090433e-07,
      "loss": 0.3649,
      "step": 770
    },
    {
      "epoch": 0.83310937919914,
      "grad_norm": 1.8719226675515828,
      "learning_rate": 8.225609429353187e-07,
      "loss": 0.3679,
      "step": 775
    },
    {
      "epoch": 0.8384842784197797,
      "grad_norm": 1.740827963426437,
      "learning_rate": 7.717364708758024e-07,
      "loss": 0.3587,
      "step": 780
    },
    {
      "epoch": 0.8438591776404193,
      "grad_norm": 1.7094014840076446,
      "learning_rate": 7.224011477630166e-07,
      "loss": 0.3615,
      "step": 785
    },
    {
      "epoch": 0.8492340768610589,
      "grad_norm": 1.790943519825671,
      "learning_rate": 6.745723489672412e-07,
      "loss": 0.3401,
      "step": 790
    },
    {
      "epoch": 0.8546089760816985,
      "grad_norm": 1.852602320274836,
      "learning_rate": 6.282669192770896e-07,
      "loss": 0.3615,
      "step": 795
    },
    {
      "epoch": 0.8599838753023381,
      "grad_norm": 1.7448118489231759,
      "learning_rate": 5.83501166966956e-07,
      "loss": 0.356,
      "step": 800
    },
    {
      "epoch": 0.8653587745229777,
      "grad_norm": 1.7138058444443134,
      "learning_rate": 5.402908580534233e-07,
      "loss": 0.3507,
      "step": 805
    },
    {
      "epoch": 0.8707336737436173,
      "grad_norm": 1.786529360611276,
      "learning_rate": 4.986512107426283e-07,
      "loss": 0.3545,
      "step": 810
    },
    {
      "epoch": 0.8761085729642569,
      "grad_norm": 1.756925743913887,
      "learning_rate": 4.5859689007058896e-07,
      "loss": 0.3561,
      "step": 815
    },
    {
      "epoch": 0.8814834721848965,
      "grad_norm": 1.7845712546592873,
      "learning_rate": 4.2014200273832406e-07,
      "loss": 0.3484,
      "step": 820
    },
    {
      "epoch": 0.8868583714055361,
      "grad_norm": 1.8932772047523514,
      "learning_rate": 3.8330009214363197e-07,
      "loss": 0.3568,
      "step": 825
    },
    {
      "epoch": 0.8922332706261757,
      "grad_norm": 1.6852471718751032,
      "learning_rate": 3.4808413361125004e-07,
      "loss": 0.3483,
      "step": 830
    },
    {
      "epoch": 0.8976081698468154,
      "grad_norm": 1.774916611408541,
      "learning_rate": 3.1450652982307815e-07,
      "loss": 0.3549,
      "step": 835
    },
    {
      "epoch": 0.902983069067455,
      "grad_norm": 1.9063855186007104,
      "learning_rate": 2.8257910645009935e-07,
      "loss": 0.3551,
      "step": 840
    },
    {
      "epoch": 0.9083579682880946,
      "grad_norm": 1.7048794107309078,
      "learning_rate": 2.523131079874963e-07,
      "loss": 0.3542,
      "step": 845
    },
    {
      "epoch": 0.9137328675087342,
      "grad_norm": 1.8134483816091473,
      "learning_rate": 2.2371919379446495e-07,
      "loss": 0.3546,
      "step": 850
    },
    {
      "epoch": 0.9191077667293738,
      "grad_norm": 1.847919769203762,
      "learning_rate": 1.9680743434010385e-07,
      "loss": 0.3468,
      "step": 855
    },
    {
      "epoch": 0.9244826659500134,
      "grad_norm": 1.7597705426288421,
      "learning_rate": 1.7158730765669817e-07,
      "loss": 0.3598,
      "step": 860
    },
    {
      "epoch": 0.9298575651706531,
      "grad_norm": 1.8107276346763232,
      "learning_rate": 1.480676960016636e-07,
      "loss": 0.3596,
      "step": 865
    },
    {
      "epoch": 0.9352324643912927,
      "grad_norm": 1.726387338435163,
      "learning_rate": 1.2625688272930925e-07,
      "loss": 0.3555,
      "step": 870
    },
    {
      "epoch": 0.9406073636119323,
      "grad_norm": 1.657583954677237,
      "learning_rate": 1.0616254937352966e-07,
      "loss": 0.3478,
      "step": 875
    },
    {
      "epoch": 0.9459822628325719,
      "grad_norm": 1.7186030940673933,
      "learning_rate": 8.779177294245044e-08,
      "loss": 0.3476,
      "step": 880
    },
    {
      "epoch": 0.9513571620532115,
      "grad_norm": 1.803294093442226,
      "learning_rate": 7.115102342598101e-08,
      "loss": 0.3473,
      "step": 885
    },
    {
      "epoch": 0.9567320612738511,
      "grad_norm": 1.9161389425178095,
      "learning_rate": 5.6246161517158336e-08,
      "loss": 0.346,
      "step": 890
    },
    {
      "epoch": 0.9621069604944907,
      "grad_norm": 1.7244304109116553,
      "learning_rate": 4.308243654806643e-08,
      "loss": 0.3522,
      "step": 895
    },
    {
      "epoch": 0.9674818597151303,
      "grad_norm": 1.798195655862142,
      "learning_rate": 3.166448464108629e-08,
      "loss": 0.348,
      "step": 900
    },
    {
      "epoch": 0.97285675893577,
      "grad_norm": 1.7093455607124064,
      "learning_rate": 2.1996327076096446e-08,
      "loss": 0.3497,
      "step": 905
    },
    {
      "epoch": 0.9782316581564096,
      "grad_norm": 1.79086386462793,
      "learning_rate": 1.4081368874226398e-08,
      "loss": 0.3483,
      "step": 910
    },
    {
      "epoch": 0.9836065573770492,
      "grad_norm": 1.7408308676379578,
      "learning_rate": 7.922397598642551e-09,
      "loss": 0.3451,
      "step": 915
    },
    {
      "epoch": 0.9889814565976888,
      "grad_norm": 1.7334185566757478,
      "learning_rate": 3.5215823727974274e-09,
      "loss": 0.3496,
      "step": 920
    },
    {
      "epoch": 0.9943563558183284,
      "grad_norm": 1.7114218675449298,
      "learning_rate": 8.804731164901991e-10,
      "loss": 0.345,
      "step": 925
    },
    {
      "epoch": 0.999731255038968,
      "grad_norm": 1.7912428537040286,
      "learning_rate": 0.0,
      "loss": 0.3503,
      "step": 930
    },
    {
      "epoch": 0.999731255038968,
      "eval_loss": 0.3442555069923401,
      "eval_runtime": 95.8162,
      "eval_samples_per_second": 3.152,
      "eval_steps_per_second": 0.793,
      "step": 930
    },
    {
      "epoch": 0.999731255038968,
      "step": 930,
      "total_flos": 194670734868480.0,
      "train_loss": 0.508077065149943,
      "train_runtime": 21031.9081,
      "train_samples_per_second": 1.415,
      "train_steps_per_second": 0.044
    }
  ],
  "logging_steps": 5,
  "max_steps": 930,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 194670734868480.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}