| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.26763013515321826, | |
| "eval_steps": 999999, | |
| "global_step": 2000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0013381506757660913, | |
| "grad_norm": 43.25, | |
| "learning_rate": 6.684491978609625e-09, | |
| "loss": -1.2338, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0026763013515321826, | |
| "grad_norm": 66.0, | |
| "learning_rate": 1.336898395721925e-08, | |
| "loss": -0.96, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.004014452027298274, | |
| "grad_norm": 77.5, | |
| "learning_rate": 2.0053475935828877e-08, | |
| "loss": -0.7695, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.005352602703064365, | |
| "grad_norm": 156.0, | |
| "learning_rate": 2.67379679144385e-08, | |
| "loss": -0.5852, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.006690753378830456, | |
| "grad_norm": 49.25, | |
| "learning_rate": 3.342245989304813e-08, | |
| "loss": -1.3508, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.008028904054596548, | |
| "grad_norm": 145.0, | |
| "learning_rate": 4.0106951871657754e-08, | |
| "loss": -0.7302, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.009367054730362638, | |
| "grad_norm": 4.96875, | |
| "learning_rate": 4.679144385026738e-08, | |
| "loss": -0.5402, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.01070520540612873, | |
| "grad_norm": 56.5, | |
| "learning_rate": 5.3475935828877e-08, | |
| "loss": -1.5851, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.01204335608189482, | |
| "grad_norm": 97.0, | |
| "learning_rate": 6.016042780748662e-08, | |
| "loss": -1.9164, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.013381506757660913, | |
| "grad_norm": 157.0, | |
| "learning_rate": 6.684491978609626e-08, | |
| "loss": -1.0399, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.014719657433427003, | |
| "grad_norm": 346.0, | |
| "learning_rate": 7.352941176470588e-08, | |
| "loss": -0.788, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.016057808109193095, | |
| "grad_norm": 416.0, | |
| "learning_rate": 8.021390374331551e-08, | |
| "loss": -1.6681, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.017395958784959187, | |
| "grad_norm": 58.0, | |
| "learning_rate": 8.689839572192513e-08, | |
| "loss": -0.9137, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.018734109460725276, | |
| "grad_norm": 7.03125, | |
| "learning_rate": 9.358288770053476e-08, | |
| "loss": -0.8646, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.02007226013649137, | |
| "grad_norm": 63.0, | |
| "learning_rate": 1.0026737967914438e-07, | |
| "loss": -0.5643, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.02141041081225746, | |
| "grad_norm": 46.0, | |
| "learning_rate": 1.06951871657754e-07, | |
| "loss": -0.8626, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.022748561488023553, | |
| "grad_norm": 91.0, | |
| "learning_rate": 1.1363636363636363e-07, | |
| "loss": -0.9937, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.02408671216378964, | |
| "grad_norm": 45.75, | |
| "learning_rate": 1.2032085561497325e-07, | |
| "loss": -1.1182, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.025424862839555733, | |
| "grad_norm": 69.5, | |
| "learning_rate": 1.2700534759358288e-07, | |
| "loss": -1.2665, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.026763013515321826, | |
| "grad_norm": 15.1875, | |
| "learning_rate": 1.3368983957219251e-07, | |
| "loss": -0.4304, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.028101164191087918, | |
| "grad_norm": 360.0, | |
| "learning_rate": 1.4037433155080215e-07, | |
| "loss": -2.4656, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.029439314866854006, | |
| "grad_norm": 11.5, | |
| "learning_rate": 1.4705882352941175e-07, | |
| "loss": -0.7774, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.0307774655426201, | |
| "grad_norm": 3.703125, | |
| "learning_rate": 1.5374331550802138e-07, | |
| "loss": -0.8901, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.03211561621838619, | |
| "grad_norm": 255.0, | |
| "learning_rate": 1.6042780748663102e-07, | |
| "loss": -1.2436, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.03345376689415228, | |
| "grad_norm": 72.0, | |
| "learning_rate": 1.6711229946524065e-07, | |
| "loss": -1.1494, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.034791917569918375, | |
| "grad_norm": 81.5, | |
| "learning_rate": 1.7379679144385025e-07, | |
| "loss": -1.0291, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.03613006824568447, | |
| "grad_norm": 9.0, | |
| "learning_rate": 1.804812834224599e-07, | |
| "loss": -0.7383, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.03746821892145055, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 1.8716577540106952e-07, | |
| "loss": -0.7876, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.038806369597216644, | |
| "grad_norm": 26.125, | |
| "learning_rate": 1.9385026737967912e-07, | |
| "loss": -0.7877, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.04014452027298274, | |
| "grad_norm": 175.0, | |
| "learning_rate": 2.0053475935828876e-07, | |
| "loss": -1.557, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.04148267094874883, | |
| "grad_norm": 59.75, | |
| "learning_rate": 2.072192513368984e-07, | |
| "loss": -1.0401, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.04282082162451492, | |
| "grad_norm": 11.0625, | |
| "learning_rate": 2.13903743315508e-07, | |
| "loss": -0.8737, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.04415897230028101, | |
| "grad_norm": 13.5625, | |
| "learning_rate": 2.2058823529411763e-07, | |
| "loss": -0.8872, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.045497122976047105, | |
| "grad_norm": 7.71875, | |
| "learning_rate": 2.2727272727272726e-07, | |
| "loss": -0.6217, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.0468352736518132, | |
| "grad_norm": 21.75, | |
| "learning_rate": 2.339572192513369e-07, | |
| "loss": -1.1064, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.04817342432757928, | |
| "grad_norm": 44.0, | |
| "learning_rate": 2.406417112299465e-07, | |
| "loss": -1.0586, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.049511575003345375, | |
| "grad_norm": 32.5, | |
| "learning_rate": 2.473262032085561e-07, | |
| "loss": -0.7374, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.05084972567911147, | |
| "grad_norm": 83.0, | |
| "learning_rate": 2.5401069518716576e-07, | |
| "loss": -1.1667, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.05218787635487756, | |
| "grad_norm": 33.5, | |
| "learning_rate": 2.6069518716577537e-07, | |
| "loss": -1.0022, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.05352602703064365, | |
| "grad_norm": 181.0, | |
| "learning_rate": 2.6737967914438503e-07, | |
| "loss": -0.911, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.05486417770640974, | |
| "grad_norm": 169.0, | |
| "learning_rate": 2.7406417112299463e-07, | |
| "loss": -1.4837, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.056202328382175835, | |
| "grad_norm": 164.0, | |
| "learning_rate": 2.807486631016043e-07, | |
| "loss": -0.9708, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.05754047905794193, | |
| "grad_norm": 31.0, | |
| "learning_rate": 2.874331550802139e-07, | |
| "loss": -0.8173, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.05887862973370801, | |
| "grad_norm": 33.75, | |
| "learning_rate": 2.941176470588235e-07, | |
| "loss": -0.7977, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.060216780409474105, | |
| "grad_norm": 54.25, | |
| "learning_rate": 3.008021390374331e-07, | |
| "loss": -1.421, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.0615549310852402, | |
| "grad_norm": 34.75, | |
| "learning_rate": 3.0748663101604277e-07, | |
| "loss": -0.5687, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.06289308176100629, | |
| "grad_norm": 45.25, | |
| "learning_rate": 3.141711229946524e-07, | |
| "loss": -0.9605, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.06423123243677238, | |
| "grad_norm": 14.9375, | |
| "learning_rate": 3.2085561497326203e-07, | |
| "loss": -1.0447, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.06556938311253847, | |
| "grad_norm": 161.0, | |
| "learning_rate": 3.2754010695187164e-07, | |
| "loss": -1.2917, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.06690753378830457, | |
| "grad_norm": 38.25, | |
| "learning_rate": 3.342245989304813e-07, | |
| "loss": -0.7867, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.06824568446407066, | |
| "grad_norm": 268.0, | |
| "learning_rate": 3.4090909090909085e-07, | |
| "loss": -1.5734, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.06958383513983675, | |
| "grad_norm": 134.0, | |
| "learning_rate": 3.475935828877005e-07, | |
| "loss": -0.8703, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.07092198581560284, | |
| "grad_norm": 87.5, | |
| "learning_rate": 3.542780748663101e-07, | |
| "loss": -0.8778, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.07226013649136893, | |
| "grad_norm": 44.75, | |
| "learning_rate": 3.609625668449198e-07, | |
| "loss": -0.8009, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.07359828716713501, | |
| "grad_norm": 14.75, | |
| "learning_rate": 3.6764705882352943e-07, | |
| "loss": -0.642, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.0749364378429011, | |
| "grad_norm": 213.0, | |
| "learning_rate": 3.7433155080213904e-07, | |
| "loss": -1.1518, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.0762745885186672, | |
| "grad_norm": 77.0, | |
| "learning_rate": 3.8101604278074864e-07, | |
| "loss": -0.7262, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.07761273919443329, | |
| "grad_norm": 20.5, | |
| "learning_rate": 3.8770053475935825e-07, | |
| "loss": -0.6358, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.07895088987019938, | |
| "grad_norm": 6.625, | |
| "learning_rate": 3.943850267379679e-07, | |
| "loss": -0.4141, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.08028904054596547, | |
| "grad_norm": 42.75, | |
| "learning_rate": 4.010695187165775e-07, | |
| "loss": -0.8822, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.08162719122173157, | |
| "grad_norm": 42.75, | |
| "learning_rate": 4.0775401069518717e-07, | |
| "loss": -0.4594, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.08296534189749766, | |
| "grad_norm": 202.0, | |
| "learning_rate": 4.144385026737968e-07, | |
| "loss": -2.2936, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.08430349257326375, | |
| "grad_norm": 124.5, | |
| "learning_rate": 4.2112299465240644e-07, | |
| "loss": -0.8653, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.08564164324902984, | |
| "grad_norm": 149.0, | |
| "learning_rate": 4.27807486631016e-07, | |
| "loss": -1.6982, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.08697979392479593, | |
| "grad_norm": 262.0, | |
| "learning_rate": 4.3449197860962565e-07, | |
| "loss": -1.3561, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.08831794460056203, | |
| "grad_norm": 42.75, | |
| "learning_rate": 4.4117647058823526e-07, | |
| "loss": -1.4205, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.08965609527632812, | |
| "grad_norm": 430.0, | |
| "learning_rate": 4.478609625668449e-07, | |
| "loss": -1.6911, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.09099424595209421, | |
| "grad_norm": 54.75, | |
| "learning_rate": 4.545454545454545e-07, | |
| "loss": -1.7924, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.0923323966278603, | |
| "grad_norm": 41.0, | |
| "learning_rate": 4.612299465240642e-07, | |
| "loss": -1.2441, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.0936705473036264, | |
| "grad_norm": 121.0, | |
| "learning_rate": 4.679144385026738e-07, | |
| "loss": -1.1202, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.09500869797939247, | |
| "grad_norm": 26.125, | |
| "learning_rate": 4.745989304812834e-07, | |
| "loss": -0.8682, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.09634684865515857, | |
| "grad_norm": 65.0, | |
| "learning_rate": 4.81283422459893e-07, | |
| "loss": -1.1251, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.09768499933092466, | |
| "grad_norm": 164.0, | |
| "learning_rate": 4.879679144385027e-07, | |
| "loss": -1.714, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.09902315000669075, | |
| "grad_norm": 278.0, | |
| "learning_rate": 4.946524064171122e-07, | |
| "loss": -1.7572, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.10036130068245684, | |
| "grad_norm": 8.5, | |
| "learning_rate": 4.999998908848282e-07, | |
| "loss": -1.4453, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.10169945135822293, | |
| "grad_norm": 5.8125, | |
| "learning_rate": 4.999960718638164e-07, | |
| "loss": -1.0203, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.10303760203398903, | |
| "grad_norm": 462.0, | |
| "learning_rate": 4.999867971794632e-07, | |
| "loss": -1.7162, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.10437575270975512, | |
| "grad_norm": 42.25, | |
| "learning_rate": 4.999720670341701e-07, | |
| "loss": -2.1454, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.10571390338552121, | |
| "grad_norm": 956.0, | |
| "learning_rate": 4.99951881749393e-07, | |
| "loss": -1.8669, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.1070520540612873, | |
| "grad_norm": 11.5625, | |
| "learning_rate": 4.999262417656353e-07, | |
| "loss": -2.2964, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.1083902047370534, | |
| "grad_norm": 7.40625, | |
| "learning_rate": 4.998951476424382e-07, | |
| "loss": -1.1538, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.10972835541281949, | |
| "grad_norm": 1368.0, | |
| "learning_rate": 4.998586000583687e-07, | |
| "loss": -3.0573, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.11106650608858558, | |
| "grad_norm": 1768.0, | |
| "learning_rate": 4.998165998110045e-07, | |
| "loss": -2.3538, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.11240465676435167, | |
| "grad_norm": 125.0, | |
| "learning_rate": 4.99769147816917e-07, | |
| "loss": -1.1908, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.11374280744011776, | |
| "grad_norm": 716.0, | |
| "learning_rate": 4.997162451116507e-07, | |
| "loss": -1.2429, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.11508095811588386, | |
| "grad_norm": 72.0, | |
| "learning_rate": 4.996578928497012e-07, | |
| "loss": -2.3002, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.11641910879164993, | |
| "grad_norm": 16.0, | |
| "learning_rate": 4.995940923044898e-07, | |
| "loss": -1.9943, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.11775725946741603, | |
| "grad_norm": 150.0, | |
| "learning_rate": 4.995248448683355e-07, | |
| "loss": -2.9556, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.11909541014318212, | |
| "grad_norm": 54.75, | |
| "learning_rate": 4.994501520524248e-07, | |
| "loss": -2.435, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.12043356081894821, | |
| "grad_norm": 56.75, | |
| "learning_rate": 4.993700154867787e-07, | |
| "loss": -2.0542, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.1217717114947143, | |
| "grad_norm": 163.0, | |
| "learning_rate": 4.992844369202173e-07, | |
| "loss": -3.1973, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.1231098621704804, | |
| "grad_norm": 14.75, | |
| "learning_rate": 4.991934182203214e-07, | |
| "loss": -4.3573, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.12444801284624649, | |
| "grad_norm": 20.375, | |
| "learning_rate": 4.990969613733915e-07, | |
| "loss": -4.3314, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.12578616352201258, | |
| "grad_norm": 35.0, | |
| "learning_rate": 4.989950684844051e-07, | |
| "loss": -3.9402, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.12712431419777867, | |
| "grad_norm": 16.375, | |
| "learning_rate": 4.988877417769705e-07, | |
| "loss": -3.3578, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.12846246487354476, | |
| "grad_norm": 163.0, | |
| "learning_rate": 4.987749835932777e-07, | |
| "loss": -6.255, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.12980061554931085, | |
| "grad_norm": 61.75, | |
| "learning_rate": 4.986567963940486e-07, | |
| "loss": -5.0546, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.13113876622507695, | |
| "grad_norm": 49.0, | |
| "learning_rate": 4.985331827584815e-07, | |
| "loss": -3.643, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.13247691690084304, | |
| "grad_norm": 155.0, | |
| "learning_rate": 4.984041453841966e-07, | |
| "loss": -3.2031, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.13381506757660913, | |
| "grad_norm": 77.0, | |
| "learning_rate": 4.982696870871761e-07, | |
| "loss": -3.013, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.13515321825237522, | |
| "grad_norm": 24.875, | |
| "learning_rate": 4.981298108017027e-07, | |
| "loss": -4.3903, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.13649136892814132, | |
| "grad_norm": 44.5, | |
| "learning_rate": 4.979845195802961e-07, | |
| "loss": -3.194, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.1378295196039074, | |
| "grad_norm": 32.25, | |
| "learning_rate": 4.978338165936462e-07, | |
| "loss": -3.9074, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.1391676702796735, | |
| "grad_norm": 148.0, | |
| "learning_rate": 4.976777051305436e-07, | |
| "loss": -5.6556, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.1405058209554396, | |
| "grad_norm": 366.0, | |
| "learning_rate": 4.975161885978083e-07, | |
| "loss": -4.4532, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.14184397163120568, | |
| "grad_norm": 37.0, | |
| "learning_rate": 4.973492705202148e-07, | |
| "loss": -5.3149, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.14318212230697178, | |
| "grad_norm": 2944.0, | |
| "learning_rate": 4.971769545404158e-07, | |
| "loss": -6.5053, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.14452027298273787, | |
| "grad_norm": 388.0, | |
| "learning_rate": 4.969992444188623e-07, | |
| "loss": -5.4356, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.14585842365850396, | |
| "grad_norm": 125.5, | |
| "learning_rate": 4.968161440337216e-07, | |
| "loss": -4.2288, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.14719657433427003, | |
| "grad_norm": 56.0, | |
| "learning_rate": 4.966276573807928e-07, | |
| "loss": -5.4162, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.14853472501003612, | |
| "grad_norm": 2112.0, | |
| "learning_rate": 4.964337885734192e-07, | |
| "loss": -10.277, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.1498728756858022, | |
| "grad_norm": 67.5, | |
| "learning_rate": 4.962345418423992e-07, | |
| "loss": -7.698, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.1512110263615683, | |
| "grad_norm": 44.25, | |
| "learning_rate": 4.960299215358934e-07, | |
| "loss": -3.43, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.1525491770373344, | |
| "grad_norm": 2992.0, | |
| "learning_rate": 4.958199321193302e-07, | |
| "loss": -11.3613, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.15388732771310049, | |
| "grad_norm": 3296.0, | |
| "learning_rate": 4.956045781753075e-07, | |
| "loss": -12.9118, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.15522547838886658, | |
| "grad_norm": 3120.0, | |
| "learning_rate": 4.95383864403494e-07, | |
| "loss": -7.3724, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.15656362906463267, | |
| "grad_norm": 146.0, | |
| "learning_rate": 4.951577956205252e-07, | |
| "loss": -8.3391, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.15790177974039876, | |
| "grad_norm": 3408.0, | |
| "learning_rate": 4.949263767598995e-07, | |
| "loss": -10.8889, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.15923993041616485, | |
| "grad_norm": 3632.0, | |
| "learning_rate": 4.946896128718698e-07, | |
| "loss": -9.2901, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.16057808109193095, | |
| "grad_norm": 4448.0, | |
| "learning_rate": 4.944475091233333e-07, | |
| "loss": -12.0885, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.16191623176769704, | |
| "grad_norm": 2816.0, | |
| "learning_rate": 4.942000707977195e-07, | |
| "loss": -12.379, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.16325438244346313, | |
| "grad_norm": 286.0, | |
| "learning_rate": 4.939473032948741e-07, | |
| "loss": -2.5466, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.16459253311922922, | |
| "grad_norm": 83.5, | |
| "learning_rate": 4.936892121309411e-07, | |
| "loss": -5.7836, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.16593068379499532, | |
| "grad_norm": 5152.0, | |
| "learning_rate": 4.934258029382431e-07, | |
| "loss": -19.1814, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.1672688344707614, | |
| "grad_norm": 65.0, | |
| "learning_rate": 4.93157081465158e-07, | |
| "loss": -5.7378, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.1686069851465275, | |
| "grad_norm": 83.5, | |
| "learning_rate": 4.928830535759934e-07, | |
| "loss": -13.2967, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.1699451358222936, | |
| "grad_norm": 8032.0, | |
| "learning_rate": 4.926037252508591e-07, | |
| "loss": -9.7339, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.17128328649805968, | |
| "grad_norm": 55.5, | |
| "learning_rate": 4.923191025855359e-07, | |
| "loss": -4.0844, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.17262143717382578, | |
| "grad_norm": 38.0, | |
| "learning_rate": 4.920291917913432e-07, | |
| "loss": -12.4584, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.17395958784959187, | |
| "grad_norm": 115.0, | |
| "learning_rate": 4.917339991950032e-07, | |
| "loss": -8.7411, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.17529773852535796, | |
| "grad_norm": 141.0, | |
| "learning_rate": 4.914335312385027e-07, | |
| "loss": -9.0839, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.17663588920112405, | |
| "grad_norm": 7488.0, | |
| "learning_rate": 4.911277944789531e-07, | |
| "loss": -16.2416, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.17797403987689014, | |
| "grad_norm": 27.625, | |
| "learning_rate": 4.908167955884461e-07, | |
| "loss": -1.2109, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.17931219055265624, | |
| "grad_norm": 1944.0, | |
| "learning_rate": 4.905005413539098e-07, | |
| "loss": -12.3225, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.18065034122842233, | |
| "grad_norm": 75.0, | |
| "learning_rate": 4.90179038676959e-07, | |
| "loss": -2.7202, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.18198849190418842, | |
| "grad_norm": 63.5, | |
| "learning_rate": 4.898522945737453e-07, | |
| "loss": -4.8082, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.1833266425799545, | |
| "grad_norm": 82.5, | |
| "learning_rate": 4.895203161748042e-07, | |
| "loss": -9.3348, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.1846647932557206, | |
| "grad_norm": 133.0, | |
| "learning_rate": 4.89183110724899e-07, | |
| "loss": -2.2605, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.1860029439314867, | |
| "grad_norm": 3808.0, | |
| "learning_rate": 4.888406855828629e-07, | |
| "loss": -4.6184, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.1873410946072528, | |
| "grad_norm": 344.0, | |
| "learning_rate": 4.884930482214386e-07, | |
| "loss": -2.5858, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.18867924528301888, | |
| "grad_norm": 384.0, | |
| "learning_rate": 4.881402062271148e-07, | |
| "loss": -6.3372, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.19001739595878495, | |
| "grad_norm": 82.5, | |
| "learning_rate": 4.877821672999613e-07, | |
| "loss": -2.8738, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.19135554663455104, | |
| "grad_norm": 186.0, | |
| "learning_rate": 4.874189392534599e-07, | |
| "loss": -1.0014, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.19269369731031713, | |
| "grad_norm": 17.625, | |
| "learning_rate": 4.870505300143352e-07, | |
| "loss": -8.9454, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.19403184798608322, | |
| "grad_norm": 1344.0, | |
| "learning_rate": 4.866769476223804e-07, | |
| "loss": -2.1685, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.19536999866184931, | |
| "grad_norm": 153.0, | |
| "learning_rate": 4.862982002302829e-07, | |
| "loss": -0.1105, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.1967081493376154, | |
| "grad_norm": 2704.0, | |
| "learning_rate": 4.859142961034454e-07, | |
| "loss": -4.0817, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.1980463000133815, | |
| "grad_norm": 121.0, | |
| "learning_rate": 4.855252436198064e-07, | |
| "loss": 2.3027, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.1993844506891476, | |
| "grad_norm": 464.0, | |
| "learning_rate": 4.851310512696566e-07, | |
| "loss": -3.0267, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.20072260136491368, | |
| "grad_norm": 688.0, | |
| "learning_rate": 4.847317276554545e-07, | |
| "loss": -8.8319, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.20206075204067978, | |
| "grad_norm": 1912.0, | |
| "learning_rate": 4.843272814916375e-07, | |
| "loss": -8.3295, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.20339890271644587, | |
| "grad_norm": 81.0, | |
| "learning_rate": 4.839177216044329e-07, | |
| "loss": 1.4731, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.20473705339221196, | |
| "grad_norm": 2352.0, | |
| "learning_rate": 4.835030569316646e-07, | |
| "loss": -2.4913, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.20607520406797805, | |
| "grad_norm": 26.875, | |
| "learning_rate": 4.830832965225581e-07, | |
| "loss": -6.6273, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.20741335474374414, | |
| "grad_norm": 156.0, | |
| "learning_rate": 4.826584495375433e-07, | |
| "loss": -4.7107, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.20875150541951024, | |
| "grad_norm": 640.0, | |
| "learning_rate": 4.822285252480543e-07, | |
| "loss": -5.5531, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.21008965609527633, | |
| "grad_norm": 93.0, | |
| "learning_rate": 4.817935330363274e-07, | |
| "loss": -0.0346, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.21142780677104242, | |
| "grad_norm": 398.0, | |
| "learning_rate": 4.813534823951958e-07, | |
| "loss": -6.8717, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.2127659574468085, | |
| "grad_norm": 121.5, | |
| "learning_rate": 4.809083829278831e-07, | |
| "loss": -0.3183, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.2141041081225746, | |
| "grad_norm": 916.0, | |
| "learning_rate": 4.804582443477936e-07, | |
| "loss": -1.5798, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.2154422587983407, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 4.800030764782993e-07, | |
| "loss": -4.0797, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.2167804094741068, | |
| "grad_norm": 3728.0, | |
| "learning_rate": 4.795428892525273e-07, | |
| "loss": -4.8349, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.21811856014987288, | |
| "grad_norm": 162.0, | |
| "learning_rate": 4.790776927131416e-07, | |
| "loss": -2.2261, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.21945671082563897, | |
| "grad_norm": 44.75, | |
| "learning_rate": 4.786074970121246e-07, | |
| "loss": -4.7069, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.22079486150140507, | |
| "grad_norm": 52.25, | |
| "learning_rate": 4.781323124105551e-07, | |
| "loss": -7.4818, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.22213301217717116, | |
| "grad_norm": 532.0, | |
| "learning_rate": 4.776521492783852e-07, | |
| "loss": -2.0291, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.22347116285293725, | |
| "grad_norm": 3968.0, | |
| "learning_rate": 4.771670180942129e-07, | |
| "loss": -6.5649, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.22480931352870334, | |
| "grad_norm": 34.75, | |
| "learning_rate": 4.7667692944505433e-07, | |
| "loss": -4.8102, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.22614746420446943, | |
| "grad_norm": 1384.0, | |
| "learning_rate": 4.761818940261122e-07, | |
| "loss": -7.5108, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.22748561488023553, | |
| "grad_norm": 217.0, | |
| "learning_rate": 4.7568192264054264e-07, | |
| "loss": -2.7564, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.22882376555600162, | |
| "grad_norm": 356.0, | |
| "learning_rate": 4.7517702619921935e-07, | |
| "loss": -0.92, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.2301619162317677, | |
| "grad_norm": 540.0, | |
| "learning_rate": 4.746672157204954e-07, | |
| "loss": 2.2638, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.23150006690753377, | |
| "grad_norm": 44.25, | |
| "learning_rate": 4.741525023299631e-07, | |
| "loss": -6.1031, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.23283821758329987, | |
| "grad_norm": 93.0, | |
| "learning_rate": 4.736328972602106e-07, | |
| "loss": -2.0605, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.23417636825906596, | |
| "grad_norm": 1896.0, | |
| "learning_rate": 4.731084118505776e-07, | |
| "loss": -7.2052, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.23551451893483205, | |
| "grad_norm": 35.75, | |
| "learning_rate": 4.7257905754690724e-07, | |
| "loss": 2.7438, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.23685266961059814, | |
| "grad_norm": 195.0, | |
| "learning_rate": 4.720448459012964e-07, | |
| "loss": -2.656, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.23819082028636424, | |
| "grad_norm": 1984.0, | |
| "learning_rate": 4.7150578857184384e-07, | |
| "loss": -7.6791, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.23952897096213033, | |
| "grad_norm": 548.0, | |
| "learning_rate": 4.7096189732239575e-07, | |
| "loss": -2.1757, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.24086712163789642, | |
| "grad_norm": 121.0, | |
| "learning_rate": 4.7041318402228877e-07, | |
| "loss": -4.9406, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.2422052723136625, | |
| "grad_norm": 2480.0, | |
| "learning_rate": 4.698596606460911e-07, | |
| "loss": 6.3874, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.2435434229894286, | |
| "grad_norm": 824.0, | |
| "learning_rate": 4.693013392733415e-07, | |
| "loss": -1.0411, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.2448815736651947, | |
| "grad_norm": 330.0, | |
| "learning_rate": 4.68738232088285e-07, | |
| "loss": -1.7624, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.2462197243409608, | |
| "grad_norm": 2160.0, | |
| "learning_rate": 4.681703513796077e-07, | |
| "loss": 0.637, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.24755787501672688, | |
| "grad_norm": 660.0, | |
| "learning_rate": 4.675977095401682e-07, | |
| "loss": -5.6651, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.24889602569249297, | |
| "grad_norm": 114.0, | |
| "learning_rate": 4.6702031906672725e-07, | |
| "loss": -8.9473, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.25023417636825906, | |
| "grad_norm": 41.0, | |
| "learning_rate": 4.664381925596748e-07, | |
| "loss": -4.5124, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.25157232704402516, | |
| "grad_norm": 70.0, | |
| "learning_rate": 4.658513427227556e-07, | |
| "loss": -4.7603, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.25291047771979125, | |
| "grad_norm": 84.0, | |
| "learning_rate": 4.652597823627915e-07, | |
| "loss": -3.1663, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.25424862839555734, | |
| "grad_norm": 12.0, | |
| "learning_rate": 4.6466352438940186e-07, | |
| "loss": -6.3194, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.25558677907132343, | |
| "grad_norm": 404.0, | |
| "learning_rate": 4.640625818147224e-07, | |
| "loss": -3.6509, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.2569249297470895, | |
| "grad_norm": 90.5, | |
| "learning_rate": 4.634569677531204e-07, | |
| "loss": -2.3439, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.2582630804228556, | |
| "grad_norm": 16.75, | |
| "learning_rate": 4.628466954209095e-07, | |
| "loss": -0.8997, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.2596012310986217, | |
| "grad_norm": 19.125, | |
| "learning_rate": 4.622317781360604e-07, | |
| "loss": -3.01, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.2609393817743878, | |
| "grad_norm": 182.0, | |
| "learning_rate": 4.6161222931791084e-07, | |
| "loss": -4.8218, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.2622775324501539, | |
| "grad_norm": 2768.0, | |
| "learning_rate": 4.609880624868722e-07, | |
| "loss": -9.8225, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.26361568312592, | |
| "grad_norm": 756.0, | |
| "learning_rate": 4.60359291264135e-07, | |
| "loss": -6.7088, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.2649538338016861, | |
| "grad_norm": 304.0, | |
| "learning_rate": 4.597259293713712e-07, | |
| "loss": -5.1363, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.26629198447745217, | |
| "grad_norm": 2048.0, | |
| "learning_rate": 4.590879906304352e-07, | |
| "loss": -4.1671, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.26763013515321826, | |
| "grad_norm": 19.625, | |
| "learning_rate": 4.5844548896306156e-07, | |
| "loss": -6.2636, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 7473, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |