{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.999731255038968,
  "eval_steps": 500,
  "global_step": 930,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0010749798441279225,
      "grad_norm": 22.998251618632413,
      "learning_rate": 1.0752688172043012e-07,
      "loss": 1.3109,
      "step": 1
    },
    {
      "epoch": 0.005374899220639613,
      "grad_norm": 21.62666674963918,
      "learning_rate": 5.376344086021506e-07,
      "loss": 1.319,
      "step": 5
    },
    {
      "epoch": 0.010749798441279226,
      "grad_norm": 8.47868515286049,
      "learning_rate": 1.0752688172043011e-06,
      "loss": 1.201,
      "step": 10
    },
    {
      "epoch": 0.01612469766191884,
      "grad_norm": 7.0045452722639245,
      "learning_rate": 1.6129032258064516e-06,
      "loss": 1.0409,
      "step": 15
    },
    {
      "epoch": 0.021499596882558453,
      "grad_norm": 2.9866055931463724,
      "learning_rate": 2.1505376344086023e-06,
      "loss": 0.9079,
      "step": 20
    },
    {
      "epoch": 0.026874496103198066,
      "grad_norm": 2.466878497235169,
      "learning_rate": 2.688172043010753e-06,
      "loss": 0.8681,
      "step": 25
    },
    {
      "epoch": 0.03224939532383768,
      "grad_norm": 2.1381469942804414,
      "learning_rate": 3.225806451612903e-06,
      "loss": 0.8354,
      "step": 30
    },
    {
      "epoch": 0.03762429454447729,
      "grad_norm": 2.3822492851866888,
      "learning_rate": 3.763440860215054e-06,
      "loss": 0.8242,
      "step": 35
    },
    {
      "epoch": 0.042999193765116905,
      "grad_norm": 2.2460422471438446,
      "learning_rate": 4.3010752688172045e-06,
      "loss": 0.8055,
      "step": 40
    },
    {
      "epoch": 0.04837409298575652,
      "grad_norm": 2.278945525704108,
      "learning_rate": 4.838709677419355e-06,
      "loss": 0.7808,
      "step": 45
    },
    {
      "epoch": 0.05374899220639613,
      "grad_norm": 2.3939238018515785,
      "learning_rate": 5.376344086021506e-06,
      "loss": 0.7718,
      "step": 50
    },
    {
      "epoch": 0.059123891427035745,
      "grad_norm": 2.4529746003373805,
      "learning_rate": 5.9139784946236566e-06,
      "loss": 0.7577,
      "step": 55
    },
    {
      "epoch": 0.06449879064767536,
      "grad_norm": 2.370219230419958,
      "learning_rate": 6.451612903225806e-06,
      "loss": 0.7406,
      "step": 60
    },
    {
      "epoch": 0.06987368986831496,
      "grad_norm": 2.5160763656892278,
      "learning_rate": 6.989247311827958e-06,
      "loss": 0.7328,
      "step": 65
    },
    {
      "epoch": 0.07524858908895458,
      "grad_norm": 2.2323762062076615,
      "learning_rate": 7.526881720430108e-06,
      "loss": 0.7049,
      "step": 70
    },
    {
      "epoch": 0.08062348830959419,
      "grad_norm": 2.1957037704755717,
      "learning_rate": 8.064516129032258e-06,
      "loss": 0.7038,
      "step": 75
    },
    {
      "epoch": 0.08599838753023381,
      "grad_norm": 2.263429588353797,
      "learning_rate": 8.602150537634409e-06,
      "loss": 0.7177,
      "step": 80
    },
    {
      "epoch": 0.09137328675087342,
      "grad_norm": 2.2491062827888757,
      "learning_rate": 9.13978494623656e-06,
      "loss": 0.7082,
      "step": 85
    },
    {
      "epoch": 0.09674818597151304,
      "grad_norm": 2.3972214541569326,
      "learning_rate": 9.67741935483871e-06,
      "loss": 0.7052,
      "step": 90
    },
    {
      "epoch": 0.10212308519215264,
      "grad_norm": 2.442294992070258,
      "learning_rate": 9.999859120828162e-06,
      "loss": 0.6871,
      "step": 95
    },
    {
      "epoch": 0.10749798441279226,
      "grad_norm": 2.1737197659282677,
      "learning_rate": 9.998274321315453e-06,
      "loss": 0.691,
      "step": 100
    },
    {
      "epoch": 0.11287288363343187,
      "grad_norm": 2.086931041290223,
      "learning_rate": 9.994929183335237e-06,
      "loss": 0.6887,
      "step": 105
    },
    {
      "epoch": 0.11824778285407149,
      "grad_norm": 2.181755095183289,
      "learning_rate": 9.989824885009142e-06,
      "loss": 0.6761,
      "step": 110
    },
    {
      "epoch": 0.1236226820747111,
      "grad_norm": 2.1430422893188195,
      "learning_rate": 9.982963224016152e-06,
      "loss": 0.6748,
      "step": 115
    },
    {
      "epoch": 0.12899758129535072,
      "grad_norm": 2.0944055561946047,
      "learning_rate": 9.974346616959476e-06,
      "loss": 0.6782,
      "step": 120
    },
    {
      "epoch": 0.13437248051599032,
      "grad_norm": 2.7234289381522694,
      "learning_rate": 9.963978098515468e-06,
      "loss": 0.6717,
      "step": 125
    },
    {
      "epoch": 0.13974737973662993,
      "grad_norm": 2.2612331983977265,
      "learning_rate": 9.951861320364822e-06,
      "loss": 0.671,
      "step": 130
    },
    {
      "epoch": 0.14512227895726956,
      "grad_norm": 2.349078940825244,
      "learning_rate": 9.938000549906509e-06,
      "loss": 0.6628,
      "step": 135
    },
    {
      "epoch": 0.15049717817790917,
      "grad_norm": 2.213915224858813,
      "learning_rate": 9.922400668754833e-06,
      "loss": 0.6578,
      "step": 140
    },
    {
      "epoch": 0.15587207739854878,
      "grad_norm": 2.0247679479977947,
      "learning_rate": 9.905067171020185e-06,
      "loss": 0.6621,
      "step": 145
    },
    {
      "epoch": 0.16124697661918838,
      "grad_norm": 2.175459644155833,
      "learning_rate": 9.88600616137407e-06,
      "loss": 0.6539,
      "step": 150
    },
    {
      "epoch": 0.16662187583982802,
      "grad_norm": 1.9231116317571189,
      "learning_rate": 9.86522435289912e-06,
      "loss": 0.6486,
      "step": 155
    },
    {
      "epoch": 0.17199677506046762,
      "grad_norm": 1.9429182926189486,
      "learning_rate": 9.8427290647248e-06,
      "loss": 0.6446,
      "step": 160
    },
    {
      "epoch": 0.17737167428110723,
      "grad_norm": 2.029616741174372,
      "learning_rate": 9.818528219449705e-06,
      "loss": 0.6393,
      "step": 165
    },
    {
      "epoch": 0.18274657350174683,
      "grad_norm": 1.954236053756045,
      "learning_rate": 9.792630340351301e-06,
      "loss": 0.6505,
      "step": 170
    },
    {
      "epoch": 0.18812147272238647,
      "grad_norm": 2.0238728155049994,
      "learning_rate": 9.765044548384113e-06,
      "loss": 0.6345,
      "step": 175
    },
    {
      "epoch": 0.19349637194302607,
      "grad_norm": 1.9933506258733436,
      "learning_rate": 9.735780558967434e-06,
      "loss": 0.6321,
      "step": 180
    },
    {
      "epoch": 0.19887127116366568,
      "grad_norm": 2.159514610870395,
      "learning_rate": 9.70484867856365e-06,
      "loss": 0.6307,
      "step": 185
    },
    {
      "epoch": 0.2042461703843053,
      "grad_norm": 1.8959726839851836,
      "learning_rate": 9.67225980104841e-06,
      "loss": 0.6344,
      "step": 190
    },
    {
      "epoch": 0.20962106960494492,
      "grad_norm": 1.9231014581432677,
      "learning_rate": 9.638025403873939e-06,
      "loss": 0.6354,
      "step": 195
    },
    {
      "epoch": 0.21499596882558453,
      "grad_norm": 1.9470579040757032,
      "learning_rate": 9.602157544026785e-06,
      "loss": 0.6213,
      "step": 200
    },
    {
      "epoch": 0.22037086804622413,
      "grad_norm": 2.1141633711901293,
      "learning_rate": 9.564668853781483e-06,
      "loss": 0.6287,
      "step": 205
    },
    {
      "epoch": 0.22574576726686374,
      "grad_norm": 2.094733969067925,
      "learning_rate": 9.525572536251608e-06,
      "loss": 0.6184,
      "step": 210
    },
    {
      "epoch": 0.23112066648750335,
      "grad_norm": 1.8594092068711054,
      "learning_rate": 9.484882360739772e-06,
      "loss": 0.6049,
      "step": 215
    },
    {
      "epoch": 0.23649556570814298,
      "grad_norm": 1.8042100641048069,
      "learning_rate": 9.442612657888237e-06,
      "loss": 0.604,
      "step": 220
    },
    {
      "epoch": 0.24187046492878259,
      "grad_norm": 1.9491363567278153,
      "learning_rate": 9.398778314631801e-06,
      "loss": 0.5935,
      "step": 225
    },
    {
      "epoch": 0.2472453641494222,
      "grad_norm": 2.0060700754274112,
      "learning_rate": 9.353394768954791e-06,
      "loss": 0.5973,
      "step": 230
    },
    {
      "epoch": 0.2526202633700618,
      "grad_norm": 2.323587224840248,
      "learning_rate": 9.30647800445397e-06,
      "loss": 0.602,
      "step": 235
    },
    {
      "epoch": 0.25799516259070143,
      "grad_norm": 2.2542109995103985,
      "learning_rate": 9.258044544709276e-06,
      "loss": 0.6005,
      "step": 240
    },
    {
      "epoch": 0.26337006181134104,
      "grad_norm": 2.205069286015657,
      "learning_rate": 9.208111447464407e-06,
      "loss": 0.5937,
      "step": 245
    },
    {
      "epoch": 0.26874496103198064,
      "grad_norm": 1.979112533465105,
      "learning_rate": 9.156696298619266e-06,
      "loss": 0.5905,
      "step": 250
    },
    {
      "epoch": 0.27411986025262025,
      "grad_norm": 2.171300635630568,
      "learning_rate": 9.103817206036383e-06,
      "loss": 0.5885,
      "step": 255
    },
    {
      "epoch": 0.27949475947325986,
      "grad_norm": 1.9064161905725763,
      "learning_rate": 9.049492793163539e-06,
      "loss": 0.5898,
      "step": 260
    },
    {
      "epoch": 0.28486965869389946,
      "grad_norm": 1.9617828705521112,
      "learning_rate": 8.993742192474773e-06,
      "loss": 0.5831,
      "step": 265
    },
    {
      "epoch": 0.2902445579145391,
      "grad_norm": 1.8575944912932347,
      "learning_rate": 8.936585038732143e-06,
      "loss": 0.5751,
      "step": 270
    },
    {
      "epoch": 0.29561945713517873,
      "grad_norm": 1.9607189835672174,
      "learning_rate": 8.878041462070556e-06,
      "loss": 0.5795,
      "step": 275
    },
    {
      "epoch": 0.30099435635581834,
      "grad_norm": 1.9827526904209154,
      "learning_rate": 8.818132080908178e-06,
      "loss": 0.5756,
      "step": 280
    },
    {
      "epoch": 0.30636925557645794,
      "grad_norm": 1.9289388619958865,
      "learning_rate": 8.756877994684818e-06,
      "loss": 0.5797,
      "step": 285
    },
    {
      "epoch": 0.31174415479709755,
      "grad_norm": 1.9423515794251163,
      "learning_rate": 8.694300776430958e-06,
      "loss": 0.5654,
      "step": 290
    },
    {
      "epoch": 0.31711905401773716,
      "grad_norm": 2.133199145110774,
      "learning_rate": 8.630422465169947e-06,
      "loss": 0.5586,
      "step": 295
    },
    {
      "epoch": 0.32249395323837676,
      "grad_norm": 2.2051473612841845,
      "learning_rate": 8.565265558156101e-06,
      "loss": 0.5674,
      "step": 300
    },
    {
      "epoch": 0.32786885245901637,
      "grad_norm": 1.94675668742253,
      "learning_rate": 8.498853002951414e-06,
      "loss": 0.5572,
      "step": 305
    },
    {
      "epoch": 0.33324375167965603,
      "grad_norm": 2.046407155545796,
      "learning_rate": 8.43120818934367e-06,
      "loss": 0.5535,
      "step": 310
    },
    {
      "epoch": 0.33861865090029564,
      "grad_norm": 2.0046384820796486,
      "learning_rate": 8.362354941108803e-06,
      "loss": 0.5577,
      "step": 315
    },
    {
      "epoch": 0.34399355012093524,
      "grad_norm": 2.0243084631372223,
      "learning_rate": 8.292317507620438e-06,
      "loss": 0.566,
      "step": 320
    },
    {
      "epoch": 0.34936844934157485,
      "grad_norm": 1.8780763393652544,
      "learning_rate": 8.221120555309511e-06,
      "loss": 0.5471,
      "step": 325
    },
    {
      "epoch": 0.35474334856221446,
      "grad_norm": 2.0125466037004967,
      "learning_rate": 8.148789158977012e-06,
      "loss": 0.55,
      "step": 330
    },
    {
      "epoch": 0.36011824778285406,
      "grad_norm": 2.111876068514048,
      "learning_rate": 8.075348792962924e-06,
      "loss": 0.542,
      "step": 335
    },
    {
      "epoch": 0.36549314700349367,
      "grad_norm": 1.831727662600693,
      "learning_rate": 8.000825322174424e-06,
      "loss": 0.541,
      "step": 340
    },
    {
      "epoch": 0.3708680462241333,
      "grad_norm": 1.957058515389012,
      "learning_rate": 7.925244992976538e-06,
      "loss": 0.525,
      "step": 345
    },
    {
      "epoch": 0.37624294544477294,
      "grad_norm": 2.016623611705091,
      "learning_rate": 7.848634423948468e-06,
      "loss": 0.5294,
      "step": 350
    },
    {
      "epoch": 0.38161784466541254,
      "grad_norm": 1.9380904809426653,
      "learning_rate": 7.7710205965088e-06,
      "loss": 0.5324,
      "step": 355
    },
    {
      "epoch": 0.38699274388605215,
      "grad_norm": 2.0054637192864746,
      "learning_rate": 7.692430845412946e-06,
      "loss": 0.5276,
      "step": 360
    },
    {
      "epoch": 0.39236764310669175,
      "grad_norm": 2.201718543185445,
      "learning_rate": 7.612892849126132e-06,
      "loss": 0.5286,
      "step": 365
    },
    {
      "epoch": 0.39774254232733136,
      "grad_norm": 1.9346693875946135,
      "learning_rate": 7.532434620075349e-06,
      "loss": 0.5217,
      "step": 370
    },
    {
      "epoch": 0.40311744154797097,
      "grad_norm": 2.046737331443054,
      "learning_rate": 7.451084494783668e-06,
      "loss": 0.5196,
      "step": 375
    },
    {
      "epoch": 0.4084923407686106,
      "grad_norm": 1.9926284827905993,
      "learning_rate": 7.368871123890425e-06,
      "loss": 0.5186,
      "step": 380
    },
    {
      "epoch": 0.4138672399892502,
      "grad_norm": 2.0164701987233133,
      "learning_rate": 7.285823462060776e-06,
      "loss": 0.5155,
      "step": 385
    },
    {
      "epoch": 0.41924213920988984,
      "grad_norm": 1.821494525655949,
      "learning_rate": 7.201970757788172e-06,
      "loss": 0.5094,
      "step": 390
    },
    {
      "epoch": 0.42461703843052945,
      "grad_norm": 2.0350416485078533,
      "learning_rate": 7.117342543093358e-06,
      "loss": 0.5051,
      "step": 395
    },
    {
      "epoch": 0.42999193765116905,
      "grad_norm": 1.9177286245297331,
      "learning_rate": 7.031968623123503e-06,
      "loss": 0.5037,
      "step": 400
    },
    {
      "epoch": 0.43536683687180866,
      "grad_norm": 1.8060468183436844,
      "learning_rate": 6.945879065655164e-06,
      "loss": 0.5091,
      "step": 405
    },
    {
      "epoch": 0.44074173609244827,
      "grad_norm": 2.0324990194845425,
      "learning_rate": 6.859104190504725e-06,
      "loss": 0.5081,
      "step": 410
    },
    {
      "epoch": 0.4461166353130879,
      "grad_norm": 1.9781738403258928,
      "learning_rate": 6.771674558850088e-06,
      "loss": 0.4978,
      "step": 415
    },
    {
      "epoch": 0.4514915345337275,
      "grad_norm": 1.967394802071695,
      "learning_rate": 6.6836209624673575e-06,
      "loss": 0.4993,
      "step": 420
    },
    {
      "epoch": 0.4568664337543671,
      "grad_norm": 1.8301107915867578,
      "learning_rate": 6.5949744128863026e-06,
      "loss": 0.4951,
      "step": 425
    },
    {
      "epoch": 0.4622413329750067,
      "grad_norm": 1.8666451752143274,
      "learning_rate": 6.5057661304684314e-06,
      "loss": 0.4929,
      "step": 430
    },
    {
      "epoch": 0.46761623219564635,
      "grad_norm": 1.820566898851988,
      "learning_rate": 6.41602753341152e-06,
      "loss": 0.4925,
      "step": 435
    },
    {
      "epoch": 0.47299113141628596,
      "grad_norm": 1.972020976387126,
      "learning_rate": 6.32579022668446e-06,
      "loss": 0.4869,
      "step": 440
    },
    {
      "epoch": 0.47836603063692557,
      "grad_norm": 1.9421491239061908,
      "learning_rate": 6.235085990896317e-06,
      "loss": 0.4757,
      "step": 445
    },
    {
      "epoch": 0.48374092985756517,
      "grad_norm": 1.9181960459695213,
      "learning_rate": 6.143946771103561e-06,
      "loss": 0.4941,
      "step": 450
    },
    {
      "epoch": 0.4891158290782048,
      "grad_norm": 1.9219139761794142,
      "learning_rate": 6.052404665559342e-06,
      "loss": 0.468,
      "step": 455
    },
    {
      "epoch": 0.4944907282988444,
      "grad_norm": 1.9586485250210308,
      "learning_rate": 5.960491914408846e-06,
      "loss": 0.4704,
      "step": 460
    },
    {
      "epoch": 0.499865627519484,
      "grad_norm": 1.978155481583312,
      "learning_rate": 5.8682408883346535e-06,
      "loss": 0.4891,
      "step": 465
    },
    {
      "epoch": 0.5052405267401237,
      "grad_norm": 1.9491669694735005,
      "learning_rate": 5.775684077156133e-06,
      "loss": 0.4689,
      "step": 470
    },
    {
      "epoch": 0.5106154259607633,
      "grad_norm": 1.9616828525043124,
      "learning_rate": 5.682854078386882e-06,
      "loss": 0.4722,
      "step": 475
    },
    {
      "epoch": 0.5159903251814029,
      "grad_norm": 1.9497396606270447,
      "learning_rate": 5.5897835857542315e-06,
      "loss": 0.4624,
      "step": 480
    },
    {
      "epoch": 0.5213652244020425,
      "grad_norm": 1.895936510041223,
      "learning_rate": 5.496505377684858e-06,
      "loss": 0.4519,
      "step": 485
    },
    {
      "epoch": 0.5267401236226821,
      "grad_norm": 1.97427958718019,
      "learning_rate": 5.4030523057605865e-06,
      "loss": 0.4671,
      "step": 490
    },
    {
      "epoch": 0.5321150228433217,
      "grad_norm": 1.99020514417267,
      "learning_rate": 5.30945728314841e-06,
      "loss": 0.4555,
      "step": 495
    },
    {
      "epoch": 0.5374899220639613,
      "grad_norm": 1.834260157432595,
      "learning_rate": 5.215753273008828e-06,
      "loss": 0.4592,
      "step": 500
    },
    {
      "epoch": 0.5428648212846009,
      "grad_norm": 2.07183045007662,
      "learning_rate": 5.1219732768865744e-06,
      "loss": 0.4544,
      "step": 505
    },
    {
      "epoch": 0.5482397205052405,
      "grad_norm": 1.8925742071728788,
      "learning_rate": 5.0281503230878304e-06,
      "loss": 0.4503,
      "step": 510
    },
    {
      "epoch": 0.5536146197258801,
      "grad_norm": 2.120435800008408,
      "learning_rate": 4.934317455048005e-06,
      "loss": 0.4432,
      "step": 515
    },
    {
      "epoch": 0.5589895189465197,
      "grad_norm": 2.078137912583167,
      "learning_rate": 4.840507719694202e-06,
      "loss": 0.4454,
      "step": 520
    },
    {
      "epoch": 0.5643644181671593,
      "grad_norm": 1.8218423489133921,
      "learning_rate": 4.746754155806437e-06,
      "loss": 0.4353,
      "step": 525
    },
    {
      "epoch": 0.5697393173877989,
      "grad_norm": 1.886033793318668,
      "learning_rate": 4.6530897823817425e-06,
      "loss": 0.4518,
      "step": 530
    },
    {
      "epoch": 0.5751142166084386,
      "grad_norm": 1.837947070178225,
      "learning_rate": 4.559547587005227e-06,
      "loss": 0.4491,
      "step": 535
    },
    {
      "epoch": 0.5804891158290783,
      "grad_norm": 1.8242868271131116,
      "learning_rate": 4.466160514232206e-06,
      "loss": 0.4405,
      "step": 540
    },
    {
      "epoch": 0.5858640150497179,
      "grad_norm": 1.9410093391418528,
      "learning_rate": 4.3729614539854815e-06,
      "loss": 0.4323,
      "step": 545
    },
    {
      "epoch": 0.5912389142703575,
      "grad_norm": 2.079990387721551,
      "learning_rate": 4.279983229971863e-06,
      "loss": 0.4347,
      "step": 550
    },
    {
      "epoch": 0.5966138134909971,
      "grad_norm": 1.8661086060382346,
      "learning_rate": 4.187258588122019e-06,
      "loss": 0.4309,
      "step": 555
    },
    {
      "epoch": 0.6019887127116367,
      "grad_norm": 1.9549179181995389,
      "learning_rate": 4.094820185057701e-06,
      "loss": 0.434,
      "step": 560
    },
    {
      "epoch": 0.6073636119322763,
      "grad_norm": 1.9438750319747926,
      "learning_rate": 4.002700576590441e-06,
      "loss": 0.417,
      "step": 565
    },
    {
      "epoch": 0.6127385111529159,
      "grad_norm": 1.86309458236793,
      "learning_rate": 3.910932206255742e-06,
      "loss": 0.4196,
      "step": 570
    },
    {
      "epoch": 0.6181134103735555,
      "grad_norm": 1.931200769910855,
      "learning_rate": 3.819547393886816e-06,
      "loss": 0.4089,
      "step": 575
    },
    {
      "epoch": 0.6234883095941951,
      "grad_norm": 1.9236481963943084,
      "learning_rate": 3.7285783242318773e-06,
      "loss": 0.4176,
      "step": 580
    },
    {
      "epoch": 0.6288632088148347,
      "grad_norm": 1.987655423768294,
      "learning_rate": 3.6380570356190346e-06,
      "loss": 0.4111,
      "step": 585
    },
    {
      "epoch": 0.6342381080354743,
      "grad_norm": 1.9140625569934329,
      "learning_rate": 3.548015408672723e-06,
      "loss": 0.4179,
      "step": 590
    },
    {
      "epoch": 0.6396130072561139,
      "grad_norm": 1.914275720117943,
      "learning_rate": 3.4584851550857007e-06,
      "loss": 0.4101,
      "step": 595
    },
    {
      "epoch": 0.6449879064767535,
      "grad_norm": 1.9920083731014693,
      "learning_rate": 3.3694978064505258e-06,
      "loss": 0.4096,
      "step": 600
    },
    {
      "epoch": 0.6503628056973931,
      "grad_norm": 1.8555446650582728,
      "learning_rate": 3.2810847031544703e-06,
      "loss": 0.4134,
      "step": 605
    },
    {
      "epoch": 0.6557377049180327,
      "grad_norm": 1.847095213057005,
      "learning_rate": 3.193276983341773e-06,
      "loss": 0.4044,
      "step": 610
    },
    {
      "epoch": 0.6611126041386725,
      "grad_norm": 1.8348823558923015,
      "learning_rate": 3.10610557194712e-06,
      "loss": 0.4081,
      "step": 615
    },
    {
      "epoch": 0.6664875033593121,
      "grad_norm": 1.7814726492285784,
      "learning_rate": 3.019601169804216e-06,
      "loss": 0.4054,
      "step": 620
    },
    {
      "epoch": 0.6718624025799517,
      "grad_norm": 1.9239354085532425,
      "learning_rate": 2.9337942428332787e-06,
      "loss": 0.4072,
      "step": 625
    },
    {
      "epoch": 0.6772373018005913,
      "grad_norm": 1.8174216015392752,
      "learning_rate": 2.848715011311271e-06,
      "loss": 0.3882,
      "step": 630
    },
    {
      "epoch": 0.6826122010212309,
      "grad_norm": 1.8355921561483057,
      "learning_rate": 2.764393439228643e-06,
      "loss": 0.4032,
      "step": 635
    },
    {
      "epoch": 0.6879871002418705,
      "grad_norm": 1.8428986241754186,
      "learning_rate": 2.6808592237363364e-06,
      "loss": 0.3938,
      "step": 640
    },
    {
      "epoch": 0.6933619994625101,
      "grad_norm": 1.930494807022806,
      "learning_rate": 2.5981417846867753e-06,
      "loss": 0.3827,
      "step": 645
    },
    {
      "epoch": 0.6987368986831497,
      "grad_norm": 1.9965194348229882,
      "learning_rate": 2.5162702542724924e-06,
      "loss": 0.4025,
      "step": 650
    },
    {
      "epoch": 0.7041117979037893,
      "grad_norm": 1.9343155927695501,
      "learning_rate": 2.4352734667661073e-06,
      "loss": 0.3931,
      "step": 655
    },
    {
      "epoch": 0.7094866971244289,
      "grad_norm": 1.8913804765293543,
      "learning_rate": 2.3551799483651894e-06,
      "loss": 0.4008,
      "step": 660
    },
    {
      "epoch": 0.7148615963450685,
      "grad_norm": 1.877731033002426,
      "learning_rate": 2.2760179071456356e-06,
      "loss": 0.3836,
      "step": 665
    },
    {
      "epoch": 0.7202364955657081,
      "grad_norm": 2.271932216399522,
      "learning_rate": 2.1978152231271077e-06,
      "loss": 0.3899,
      "step": 670
    },
    {
      "epoch": 0.7256113947863477,
      "grad_norm": 1.8362601272256027,
      "learning_rate": 2.120599438453968e-06,
      "loss": 0.3935,
      "step": 675
    },
    {
      "epoch": 0.7309862940069873,
      "grad_norm": 1.8391327367572157,
      "learning_rate": 2.044397747695247e-06,
      "loss": 0.39,
      "step": 680
    },
    {
      "epoch": 0.7363611932276269,
      "grad_norm": 1.8672651878485813,
      "learning_rate": 1.969236988267005e-06,
      "loss": 0.3829,
      "step": 685
    },
    {
      "epoch": 0.7417360924482665,
      "grad_norm": 1.8664855719301263,
      "learning_rate": 1.8951436309804766e-06,
      "loss": 0.3826,
      "step": 690
    },
    {
      "epoch": 0.7471109916689062,
      "grad_norm": 1.8648482987681003,
      "learning_rate": 1.8221437707193424e-06,
      "loss": 0.3796,
      "step": 695
    },
    {
      "epoch": 0.7524858908895459,
      "grad_norm": 1.8823945570583507,
      "learning_rate": 1.7502631172493878e-06,
      "loss": 0.3784,
      "step": 700
    },
    {
      "epoch": 0.7578607901101855,
      "grad_norm": 1.8429277019145163,
      "learning_rate": 1.6795269861638041e-06,
      "loss": 0.3808,
      "step": 705
    },
    {
      "epoch": 0.7632356893308251,
      "grad_norm": 1.8821109309531021,
      "learning_rate": 1.6099602899673083e-06,
      "loss": 0.3752,
      "step": 710
    },
    {
      "epoch": 0.7686105885514647,
      "grad_norm": 1.8044029771514785,
      "learning_rate": 1.5415875293022181e-06,
      "loss": 0.3631,
      "step": 715
    },
    {
      "epoch": 0.7739854877721043,
      "grad_norm": 1.8714319214476052,
      "learning_rate": 1.4744327843196043e-06,
      "loss": 0.3722,
      "step": 720
    },
    {
      "epoch": 0.7793603869927439,
      "grad_norm": 1.8005842812695019,
      "learning_rate": 1.4085197061985022e-06,
      "loss": 0.3764,
      "step": 725
    },
    {
      "epoch": 0.7847352862133835,
      "grad_norm": 1.9498199256183426,
      "learning_rate": 1.3438715088162403e-06,
      "loss": 0.3717,
      "step": 730
    },
    {
      "epoch": 0.7901101854340231,
      "grad_norm": 1.7622641111168549,
      "learning_rate": 1.280510960572745e-06,
      "loss": 0.3726,
      "step": 735
    },
    {
      "epoch": 0.7954850846546627,
      "grad_norm": 1.859176007788758,
      "learning_rate": 1.2184603763717684e-06,
      "loss": 0.3581,
      "step": 740
    },
    {
      "epoch": 0.8008599838753023,
      "grad_norm": 1.8187680407734916,
      "learning_rate": 1.1577416097618138e-06,
      "loss": 0.361,
      "step": 745
    },
    {
      "epoch": 0.8062348830959419,
      "grad_norm": 1.8234490317185335,
      "learning_rate": 1.0983760452395415e-06,
      "loss": 0.3629,
      "step": 750
    },
    {
      "epoch": 0.8116097823165815,
      "grad_norm": 1.7846174067763332,
      "learning_rate": 1.040384590718399e-06,
      "loss": 0.3603,
      "step": 755
    },
    {
      "epoch": 0.8169846815372211,
      "grad_norm": 1.9539954235141312,
      "learning_rate": 9.837876701650606e-07,
      "loss": 0.3604,
      "step": 760
    },
    {
      "epoch": 0.8223595807578608,
      "grad_norm": 1.829814154109719,
      "learning_rate": 9.286052164063369e-07,
      "loss": 0.361,
      "step": 765
    },
    {
      "epoch": 0.8277344799785004,
      "grad_norm": 1.7589374129255484,
      "learning_rate": 8.748566641090433e-07,
      "loss": 0.3598,
      "step": 770
    },
    {
      "epoch": 0.83310937919914,
      "grad_norm": 1.8087859749367046,
      "learning_rate": 8.225609429353187e-07,
      "loss": 0.3575,
      "step": 775
    },
    {
      "epoch": 0.8384842784197797,
      "grad_norm": 1.659093947160247,
      "learning_rate": 7.717364708758024e-07,
      "loss": 0.357,
      "step": 780
    },
    {
      "epoch": 0.8438591776404193,
      "grad_norm": 1.7512154153875217,
      "learning_rate": 7.224011477630166e-07,
      "loss": 0.3556,
      "step": 785
    },
    {
      "epoch": 0.8492340768610589,
      "grad_norm": 1.721279609127408,
      "learning_rate": 6.745723489672412e-07,
      "loss": 0.3597,
      "step": 790
    },
    {
      "epoch": 0.8546089760816985,
      "grad_norm": 1.933232116242709,
      "learning_rate": 6.282669192770896e-07,
      "loss": 0.3583,
      "step": 795
    },
    {
      "epoch": 0.8599838753023381,
      "grad_norm": 1.807730857907691,
      "learning_rate": 5.83501166966956e-07,
      "loss": 0.3524,
      "step": 800
    },
    {
      "epoch": 0.8653587745229777,
      "grad_norm": 1.7853591098408064,
      "learning_rate": 5.402908580534233e-07,
      "loss": 0.3623,
      "step": 805
    },
    {
      "epoch": 0.8707336737436173,
      "grad_norm": 1.706116308384166,
      "learning_rate": 4.986512107426283e-07,
      "loss": 0.3485,
      "step": 810
    },
    {
      "epoch": 0.8761085729642569,
      "grad_norm": 1.8110038058348839,
      "learning_rate": 4.5859689007058896e-07,
      "loss": 0.3517,
      "step": 815
    },
    {
      "epoch": 0.8814834721848965,
      "grad_norm": 1.74966735612597,
      "learning_rate": 4.2014200273832406e-07,
      "loss": 0.3461,
      "step": 820
    },
    {
      "epoch": 0.8868583714055361,
      "grad_norm": 1.887649690807963,
      "learning_rate": 3.8330009214363197e-07,
      "loss": 0.3509,
      "step": 825
    },
    {
      "epoch": 0.8922332706261757,
      "grad_norm": 1.7866587101491602,
      "learning_rate": 3.4808413361125004e-07,
      "loss": 0.3479,
      "step": 830
    },
    {
      "epoch": 0.8976081698468154,
      "grad_norm": 1.851631043751454,
      "learning_rate": 3.1450652982307815e-07,
      "loss": 0.3502,
      "step": 835
    },
    {
      "epoch": 0.902983069067455,
      "grad_norm": 1.8437354756226652,
      "learning_rate": 2.8257910645009935e-07,
      "loss": 0.3475,
      "step": 840
    },
    {
      "epoch": 0.9083579682880946,
      "grad_norm": 1.690251790769186,
      "learning_rate": 2.523131079874963e-07,
      "loss": 0.3581,
      "step": 845
    },
    {
      "epoch": 0.9137328675087342,
      "grad_norm": 1.8644738435275017,
      "learning_rate": 2.2371919379446495e-07,
      "loss": 0.356,
      "step": 850
    },
    {
      "epoch": 0.9191077667293738,
      "grad_norm": 1.703645616104732,
      "learning_rate": 1.9680743434010385e-07,
      "loss": 0.3431,
      "step": 855
    },
    {
      "epoch": 0.9244826659500134,
      "grad_norm": 1.7632832319721445,
      "learning_rate": 1.7158730765669817e-07,
      "loss": 0.3527,
      "step": 860
    },
    {
      "epoch": 0.9298575651706531,
      "grad_norm": 1.778411516430386,
      "learning_rate": 1.480676960016636e-07,
      "loss": 0.3527,
      "step": 865
    },
    {
      "epoch": 0.9352324643912927,
      "grad_norm": 1.7461766662441864,
      "learning_rate": 1.2625688272930925e-07,
      "loss": 0.3459,
      "step": 870
    },
    {
      "epoch": 0.9406073636119323,
      "grad_norm": 1.7323066734927444,
      "learning_rate": 1.0616254937352966e-07,
      "loss": 0.3484,
      "step": 875
    },
    {
      "epoch": 0.9459822628325719,
      "grad_norm": 1.7917085698996813,
      "learning_rate": 8.779177294245044e-08,
      "loss": 0.3545,
      "step": 880
    },
    {
      "epoch": 0.9513571620532115,
      "grad_norm": 1.8063043505678373,
      "learning_rate": 7.115102342598101e-08,
      "loss": 0.3463,
      "step": 885
    },
    {
      "epoch": 0.9567320612738511,
      "grad_norm": 1.7402924935517343,
      "learning_rate": 5.6246161517158336e-08,
      "loss": 0.3443,
      "step": 890
    },
    {
      "epoch": 0.9621069604944907,
      "grad_norm": 1.7772873395565845,
      "learning_rate": 4.308243654806643e-08,
      "loss": 0.3421,
      "step": 895
    },
    {
      "epoch": 0.9674818597151303,
      "grad_norm": 1.7288228417548714,
      "learning_rate": 3.166448464108629e-08,
      "loss": 0.3493,
      "step": 900
    },
    {
      "epoch": 0.97285675893577,
      "grad_norm": 1.7547077783156886,
      "learning_rate": 2.1996327076096446e-08,
      "loss": 0.3504,
      "step": 905
    },
    {
      "epoch": 0.9782316581564096,
      "grad_norm": 1.8190476476140809,
      "learning_rate": 1.4081368874226398e-08,
      "loss": 0.3463,
      "step": 910
    },
    {
      "epoch": 0.9836065573770492,
      "grad_norm": 1.66337852034001,
      "learning_rate": 7.922397598642551e-09,
      "loss": 0.3459,
      "step": 915
    },
    {
      "epoch": 0.9889814565976888,
      "grad_norm": 1.7288826649004094,
      "learning_rate": 3.5215823727974274e-09,
      "loss": 0.3501,
      "step": 920
    },
    {
      "epoch": 0.9943563558183284,
      "grad_norm": 1.7197053184297024,
      "learning_rate": 8.804731164901991e-10,
      "loss": 0.3516,
      "step": 925
    },
    {
      "epoch": 0.999731255038968,
      "grad_norm": 1.7184366358056455,
      "learning_rate": 0.0,
      "loss": 0.3536,
      "step": 930
    },
    {
      "epoch": 0.999731255038968,
      "eval_loss": 0.27914658188819885,
      "eval_runtime": 1.1884,
      "eval_samples_per_second": 1.683,
      "eval_steps_per_second": 0.841,
      "step": 930
    },
    {
      "epoch": 0.999731255038968,
      "step": 930,
      "total_flos": 194670734868480.0,
      "train_loss": 0.5080837889384198,
      "train_runtime": 21164.3008,
      "train_samples_per_second": 1.406,
      "train_steps_per_second": 0.044
    }
  ],
  "logging_steps": 5,
  "max_steps": 930,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 194670734868480.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}