{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.26763013515321826, "eval_steps": 999999, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013381506757660913, "grad_norm": 7.53125, "learning_rate": 6.684491978609625e-09, "loss": -0.3128, "step": 10 }, { "epoch": 0.0026763013515321826, "grad_norm": 25.125, "learning_rate": 1.336898395721925e-08, "loss": -0.3665, "step": 20 }, { "epoch": 0.004014452027298274, "grad_norm": 24.25, "learning_rate": 2.0053475935828877e-08, "loss": -0.3755, "step": 30 }, { "epoch": 0.005352602703064365, "grad_norm": 36.75, "learning_rate": 2.67379679144385e-08, "loss": -0.4086, "step": 40 }, { "epoch": 0.006690753378830456, "grad_norm": 7.09375, "learning_rate": 3.342245989304813e-08, "loss": -0.3711, "step": 50 }, { "epoch": 0.008028904054596548, "grad_norm": 30.0, "learning_rate": 4.0106951871657754e-08, "loss": -0.3331, "step": 60 }, { "epoch": 0.009367054730362638, "grad_norm": 13.6875, "learning_rate": 4.679144385026738e-08, "loss": -0.4149, "step": 70 }, { "epoch": 0.01070520540612873, "grad_norm": 46.0, "learning_rate": 5.3475935828877e-08, "loss": -0.5213, "step": 80 }, { "epoch": 0.01204335608189482, "grad_norm": 12.6875, "learning_rate": 6.016042780748662e-08, "loss": -0.4262, "step": 90 }, { "epoch": 0.013381506757660913, "grad_norm": 40.25, "learning_rate": 6.684491978609626e-08, "loss": -0.3183, "step": 100 }, { "epoch": 0.014719657433427003, "grad_norm": 1.7265625, "learning_rate": 7.352941176470588e-08, "loss": -0.4182, "step": 110 }, { "epoch": 0.016057808109193095, "grad_norm": 26.875, "learning_rate": 8.021390374331551e-08, "loss": -0.3054, "step": 120 }, { "epoch": 0.017395958784959187, "grad_norm": 23.625, "learning_rate": 8.689839572192513e-08, "loss": -0.4104, "step": 130 }, { "epoch": 0.018734109460725276, "grad_norm": 48.5, "learning_rate": 9.358288770053476e-08, "loss": -0.4081, "step": 140 }, { "epoch": 0.02007226013649137, "grad_norm": 30.0, "learning_rate": 1.0026737967914438e-07, "loss": -0.3352, "step": 150 }, { "epoch": 0.02141041081225746, "grad_norm": 30.125, "learning_rate": 1.06951871657754e-07, "loss": -0.3064, "step": 160 }, { "epoch": 0.022748561488023553, "grad_norm": 25.75, "learning_rate": 1.1363636363636363e-07, "loss": -0.28, "step": 170 }, { "epoch": 0.02408671216378964, "grad_norm": 79.0, "learning_rate": 1.2032085561497325e-07, "loss": -0.3164, "step": 180 }, { "epoch": 0.025424862839555733, "grad_norm": 20.0, "learning_rate": 1.2700534759358288e-07, "loss": -0.2578, "step": 190 }, { "epoch": 0.026763013515321826, "grad_norm": 71.5, "learning_rate": 1.3368983957219251e-07, "loss": -0.3932, "step": 200 }, { "epoch": 0.028101164191087918, "grad_norm": 21.125, "learning_rate": 1.4037433155080215e-07, "loss": -0.3245, "step": 210 }, { "epoch": 0.029439314866854006, "grad_norm": 33.0, "learning_rate": 1.4705882352941175e-07, "loss": -0.3518, "step": 220 }, { "epoch": 0.0307774655426201, "grad_norm": 23.5, "learning_rate": 1.5374331550802138e-07, "loss": -0.3089, "step": 230 }, { "epoch": 0.03211561621838619, "grad_norm": 28.75, "learning_rate": 1.6042780748663102e-07, "loss": -0.343, "step": 240 }, { "epoch": 0.03345376689415228, "grad_norm": 9.125, "learning_rate": 1.6711229946524065e-07, "loss": -0.2475, "step": 250 }, { "epoch": 0.034791917569918375, "grad_norm": 31.875, "learning_rate": 1.7379679144385025e-07, "loss": -0.4017, "step": 260 }, { "epoch": 0.03613006824568447, "grad_norm": 30.25, "learning_rate": 1.804812834224599e-07, "loss": -0.3132, "step": 270 }, { "epoch": 0.03746821892145055, "grad_norm": 28.75, "learning_rate": 1.8716577540106952e-07, "loss": -0.4582, "step": 280 }, { "epoch": 0.038806369597216644, "grad_norm": 17.875, "learning_rate": 1.9385026737967912e-07, "loss": -0.3066, "step": 290 }, { "epoch": 0.04014452027298274, "grad_norm": 38.0, "learning_rate": 2.0053475935828876e-07, "loss": -0.3265, "step": 300 }, { "epoch": 0.04148267094874883, "grad_norm": 29.125, "learning_rate": 2.072192513368984e-07, "loss": -0.3939, "step": 310 }, { "epoch": 0.04282082162451492, "grad_norm": 6.9375, "learning_rate": 2.13903743315508e-07, "loss": -0.3074, "step": 320 }, { "epoch": 0.04415897230028101, "grad_norm": 19.25, "learning_rate": 2.2058823529411763e-07, "loss": -0.2873, "step": 330 }, { "epoch": 0.045497122976047105, "grad_norm": 38.75, "learning_rate": 2.2727272727272726e-07, "loss": -0.5062, "step": 340 }, { "epoch": 0.0468352736518132, "grad_norm": 36.25, "learning_rate": 2.339572192513369e-07, "loss": -0.2646, "step": 350 }, { "epoch": 0.04817342432757928, "grad_norm": 6.6875, "learning_rate": 2.406417112299465e-07, "loss": -0.2393, "step": 360 }, { "epoch": 0.049511575003345375, "grad_norm": 31.5, "learning_rate": 2.473262032085561e-07, "loss": -0.3988, "step": 370 }, { "epoch": 0.05084972567911147, "grad_norm": 35.75, "learning_rate": 2.5401069518716576e-07, "loss": -0.3095, "step": 380 }, { "epoch": 0.05218787635487756, "grad_norm": 53.75, "learning_rate": 2.6069518716577537e-07, "loss": -0.3792, "step": 390 }, { "epoch": 0.05352602703064365, "grad_norm": 28.125, "learning_rate": 2.6737967914438503e-07, "loss": -0.3322, "step": 400 }, { "epoch": 0.05486417770640974, "grad_norm": 9.375, "learning_rate": 2.7406417112299463e-07, "loss": -0.3146, "step": 410 }, { "epoch": 0.056202328382175835, "grad_norm": 27.125, "learning_rate": 2.807486631016043e-07, "loss": -0.3716, "step": 420 }, { "epoch": 0.05754047905794193, "grad_norm": 33.5, "learning_rate": 2.874331550802139e-07, "loss": -0.2956, "step": 430 }, { "epoch": 0.05887862973370801, "grad_norm": 27.25, "learning_rate": 2.941176470588235e-07, "loss": -0.3851, "step": 440 }, { "epoch": 0.060216780409474105, "grad_norm": 5.96875, "learning_rate": 3.008021390374331e-07, "loss": -0.2302, "step": 450 }, { "epoch": 0.0615549310852402, "grad_norm": 39.75, "learning_rate": 3.0748663101604277e-07, "loss": -0.3303, "step": 460 }, { "epoch": 0.06289308176100629, "grad_norm": 44.5, "learning_rate": 3.141711229946524e-07, "loss": -0.2405, "step": 470 }, { "epoch": 0.06423123243677238, "grad_norm": 27.0, "learning_rate": 3.2085561497326203e-07, "loss": -0.272, "step": 480 }, { "epoch": 0.06556938311253847, "grad_norm": 11.0, "learning_rate": 3.2754010695187164e-07, "loss": -0.3964, "step": 490 }, { "epoch": 0.06690753378830457, "grad_norm": 28.125, "learning_rate": 3.342245989304813e-07, "loss": -0.209, "step": 500 }, { "epoch": 0.06824568446407066, "grad_norm": 11.3125, "learning_rate": 3.4090909090909085e-07, "loss": -0.3374, "step": 510 }, { "epoch": 0.06958383513983675, "grad_norm": 46.5, "learning_rate": 3.475935828877005e-07, "loss": -0.345, "step": 520 }, { "epoch": 0.07092198581560284, "grad_norm": 71.5, "learning_rate": 3.542780748663101e-07, "loss": -0.4501, "step": 530 }, { "epoch": 0.07226013649136893, "grad_norm": 49.75, "learning_rate": 3.609625668449198e-07, "loss": -0.6017, "step": 540 }, { "epoch": 0.07359828716713501, "grad_norm": 58.5, "learning_rate": 3.6764705882352943e-07, "loss": -0.4148, "step": 550 }, { "epoch": 0.0749364378429011, "grad_norm": 47.5, "learning_rate": 3.7433155080213904e-07, "loss": -0.1953, "step": 560 }, { "epoch": 0.0762745885186672, "grad_norm": 16.375, "learning_rate": 3.8101604278074864e-07, "loss": -0.4378, "step": 570 }, { "epoch": 0.07761273919443329, "grad_norm": 34.5, "learning_rate": 3.8770053475935825e-07, "loss": -0.3843, "step": 580 }, { "epoch": 0.07895088987019938, "grad_norm": 33.25, "learning_rate": 3.943850267379679e-07, "loss": -0.3646, "step": 590 }, { "epoch": 0.08028904054596547, "grad_norm": 43.0, "learning_rate": 4.010695187165775e-07, "loss": -0.2891, "step": 600 }, { "epoch": 0.08162719122173157, "grad_norm": 45.5, "learning_rate": 4.0775401069518717e-07, "loss": -0.2855, "step": 610 }, { "epoch": 0.08296534189749766, "grad_norm": 34.75, "learning_rate": 4.144385026737968e-07, "loss": -0.3301, "step": 620 }, { "epoch": 0.08430349257326375, "grad_norm": 26.0, "learning_rate": 4.2112299465240644e-07, "loss": -0.3233, "step": 630 }, { "epoch": 0.08564164324902984, "grad_norm": 46.0, "learning_rate": 4.27807486631016e-07, "loss": -0.4337, "step": 640 }, { "epoch": 0.08697979392479593, "grad_norm": 57.0, "learning_rate": 4.3449197860962565e-07, "loss": -0.4675, "step": 650 }, { "epoch": 0.08831794460056203, "grad_norm": 50.5, "learning_rate": 4.4117647058823526e-07, "loss": -0.3085, "step": 660 }, { "epoch": 0.08965609527632812, "grad_norm": 37.75, "learning_rate": 4.478609625668449e-07, "loss": -0.3673, "step": 670 }, { "epoch": 0.09099424595209421, "grad_norm": 35.75, "learning_rate": 4.545454545454545e-07, "loss": -0.284, "step": 680 }, { "epoch": 0.0923323966278603, "grad_norm": 17.0, "learning_rate": 4.612299465240642e-07, "loss": -0.4212, "step": 690 }, { "epoch": 0.0936705473036264, "grad_norm": 43.75, "learning_rate": 4.679144385026738e-07, "loss": -0.5346, "step": 700 }, { "epoch": 0.09500869797939247, "grad_norm": 35.5, "learning_rate": 4.745989304812834e-07, "loss": -0.4373, "step": 710 }, { "epoch": 0.09634684865515857, "grad_norm": 31.5, "learning_rate": 4.81283422459893e-07, "loss": -0.3401, "step": 720 }, { "epoch": 0.09768499933092466, "grad_norm": 32.5, "learning_rate": 4.879679144385027e-07, "loss": -0.4568, "step": 730 }, { "epoch": 0.09902315000669075, "grad_norm": 15.0625, "learning_rate": 4.946524064171122e-07, "loss": -0.2982, "step": 740 }, { "epoch": 0.10036130068245684, "grad_norm": 12.6875, "learning_rate": 4.999998908848282e-07, "loss": -0.333, "step": 750 }, { "epoch": 0.10169945135822293, "grad_norm": 34.25, "learning_rate": 4.999960718638164e-07, "loss": -0.3132, "step": 760 }, { "epoch": 0.10303760203398903, "grad_norm": 16.875, "learning_rate": 4.999867971794632e-07, "loss": -0.339, "step": 770 }, { "epoch": 0.10437575270975512, "grad_norm": 46.75, "learning_rate": 4.999720670341701e-07, "loss": -0.4171, "step": 780 }, { "epoch": 0.10571390338552121, "grad_norm": 22.75, "learning_rate": 4.99951881749393e-07, "loss": -0.4626, "step": 790 }, { "epoch": 0.1070520540612873, "grad_norm": 28.375, "learning_rate": 4.999262417656353e-07, "loss": -0.4776, "step": 800 }, { "epoch": 0.1083902047370534, "grad_norm": 42.5, "learning_rate": 4.998951476424382e-07, "loss": -0.3065, "step": 810 }, { "epoch": 0.10972835541281949, "grad_norm": 19.25, "learning_rate": 4.998586000583687e-07, "loss": -0.3858, "step": 820 }, { "epoch": 0.11106650608858558, "grad_norm": 39.75, "learning_rate": 4.998165998110045e-07, "loss": -0.4911, "step": 830 }, { "epoch": 0.11240465676435167, "grad_norm": 22.0, "learning_rate": 4.99769147816917e-07, "loss": -0.4398, "step": 840 }, { "epoch": 0.11374280744011776, "grad_norm": 2.984375, "learning_rate": 4.997162451116507e-07, "loss": -0.2828, "step": 850 }, { "epoch": 0.11508095811588386, "grad_norm": 34.0, "learning_rate": 4.996578928497012e-07, "loss": -0.3508, "step": 860 }, { "epoch": 0.11641910879164993, "grad_norm": 38.0, "learning_rate": 4.995940923044898e-07, "loss": -0.3724, "step": 870 }, { "epoch": 0.11775725946741603, "grad_norm": 27.0, "learning_rate": 4.995248448683355e-07, "loss": -0.3978, "step": 880 }, { "epoch": 0.11909541014318212, "grad_norm": 19.25, "learning_rate": 4.994501520524248e-07, "loss": -0.5863, "step": 890 }, { "epoch": 0.12043356081894821, "grad_norm": 38.0, "learning_rate": 4.993700154867787e-07, "loss": -0.4643, "step": 900 }, { "epoch": 0.1217717114947143, "grad_norm": 34.25, "learning_rate": 4.992844369202173e-07, "loss": -0.2558, "step": 910 }, { "epoch": 0.1231098621704804, "grad_norm": 12.0625, "learning_rate": 4.991934182203214e-07, "loss": -0.3719, "step": 920 }, { "epoch": 0.12444801284624649, "grad_norm": 22.875, "learning_rate": 4.990969613733915e-07, "loss": -0.428, "step": 930 }, { "epoch": 0.12578616352201258, "grad_norm": 37.25, "learning_rate": 4.989950684844051e-07, "loss": -0.3126, "step": 940 }, { "epoch": 0.12712431419777867, "grad_norm": 26.875, "learning_rate": 4.988877417769705e-07, "loss": -0.3818, "step": 950 }, { "epoch": 0.12846246487354476, "grad_norm": 26.5, "learning_rate": 4.987749835932777e-07, "loss": -0.4921, "step": 960 }, { "epoch": 0.12980061554931085, "grad_norm": 41.0, "learning_rate": 4.986567963940486e-07, "loss": -0.4253, "step": 970 }, { "epoch": 0.13113876622507695, "grad_norm": 24.5, "learning_rate": 4.985331827584815e-07, "loss": -0.3701, "step": 980 }, { "epoch": 0.13247691690084304, "grad_norm": 22.5, "learning_rate": 4.984041453841966e-07, "loss": -0.5117, "step": 990 }, { "epoch": 0.13381506757660913, "grad_norm": 71.5, "learning_rate": 4.982696870871761e-07, "loss": -0.3924, "step": 1000 }, { "epoch": 0.13515321825237522, "grad_norm": 22.75, "learning_rate": 4.981298108017027e-07, "loss": -0.228, "step": 1010 }, { "epoch": 0.13649136892814132, "grad_norm": 26.875, "learning_rate": 4.979845195802961e-07, "loss": -0.3865, "step": 1020 }, { "epoch": 0.1378295196039074, "grad_norm": 27.5, "learning_rate": 4.978338165936462e-07, "loss": -0.4261, "step": 1030 }, { "epoch": 0.1391676702796735, "grad_norm": 25.375, "learning_rate": 4.976777051305436e-07, "loss": -0.2695, "step": 1040 }, { "epoch": 0.1405058209554396, "grad_norm": 12.0625, "learning_rate": 4.975161885978083e-07, "loss": -0.4197, "step": 1050 }, { "epoch": 0.14184397163120568, "grad_norm": 43.25, "learning_rate": 4.973492705202148e-07, "loss": -0.3235, "step": 1060 }, { "epoch": 0.14318212230697178, "grad_norm": 34.0, "learning_rate": 4.971769545404158e-07, "loss": -0.3259, "step": 1070 }, { "epoch": 0.14452027298273787, "grad_norm": 41.0, "learning_rate": 4.969992444188623e-07, "loss": -0.5216, "step": 1080 }, { "epoch": 0.14585842365850396, "grad_norm": 51.5, "learning_rate": 4.968161440337216e-07, "loss": -0.4587, "step": 1090 }, { "epoch": 0.14719657433427003, "grad_norm": 34.75, "learning_rate": 4.966276573807928e-07, "loss": -0.4925, "step": 1100 }, { "epoch": 0.14853472501003612, "grad_norm": 45.0, "learning_rate": 4.964337885734192e-07, "loss": -0.3688, "step": 1110 }, { "epoch": 0.1498728756858022, "grad_norm": 55.0, "learning_rate": 4.962345418423992e-07, "loss": -0.6191, "step": 1120 }, { "epoch": 0.1512110263615683, "grad_norm": 59.0, "learning_rate": 4.960299215358934e-07, "loss": -0.3621, "step": 1130 }, { "epoch": 0.1525491770373344, "grad_norm": 29.375, "learning_rate": 4.958199321193302e-07, "loss": -0.3627, "step": 1140 }, { "epoch": 0.15388732771310049, "grad_norm": 10.4375, "learning_rate": 4.956045781753075e-07, "loss": -0.3725, "step": 1150 }, { "epoch": 0.15522547838886658, "grad_norm": 48.75, "learning_rate": 4.95383864403494e-07, "loss": -0.3313, "step": 1160 }, { "epoch": 0.15656362906463267, "grad_norm": 49.25, "learning_rate": 4.951577956205252e-07, "loss": -0.4595, "step": 1170 }, { "epoch": 0.15790177974039876, "grad_norm": 17.0, "learning_rate": 4.949263767598995e-07, "loss": -0.3784, "step": 1180 }, { "epoch": 0.15923993041616485, "grad_norm": 65.0, "learning_rate": 4.946896128718698e-07, "loss": -0.3591, "step": 1190 }, { "epoch": 0.16057808109193095, "grad_norm": 37.5, "learning_rate": 4.944475091233333e-07, "loss": -0.5091, "step": 1200 }, { "epoch": 0.16191623176769704, "grad_norm": 41.0, "learning_rate": 4.942000707977195e-07, "loss": -0.3271, "step": 1210 }, { "epoch": 0.16325438244346313, "grad_norm": 57.75, "learning_rate": 4.939473032948741e-07, "loss": -0.5499, "step": 1220 }, { "epoch": 0.16459253311922922, "grad_norm": 10.6875, "learning_rate": 4.936892121309411e-07, "loss": -0.5182, "step": 1230 }, { "epoch": 0.16593068379499532, "grad_norm": 33.25, "learning_rate": 4.934258029382431e-07, "loss": -0.4823, "step": 1240 }, { "epoch": 0.1672688344707614, "grad_norm": 50.25, "learning_rate": 4.93157081465158e-07, "loss": -0.511, "step": 1250 }, { "epoch": 0.1686069851465275, "grad_norm": 28.625, "learning_rate": 4.928830535759934e-07, "loss": -0.2908, "step": 1260 }, { "epoch": 0.1699451358222936, "grad_norm": 58.0, "learning_rate": 4.926037252508591e-07, "loss": -0.4714, "step": 1270 }, { "epoch": 0.17128328649805968, "grad_norm": 31.375, "learning_rate": 4.923191025855359e-07, "loss": -0.4243, "step": 1280 }, { "epoch": 0.17262143717382578, "grad_norm": 61.0, "learning_rate": 4.920291917913432e-07, "loss": -0.7076, "step": 1290 }, { "epoch": 0.17395958784959187, "grad_norm": 22.75, "learning_rate": 4.917339991950032e-07, "loss": -0.5047, "step": 1300 }, { "epoch": 0.17529773852535796, "grad_norm": 8.75, "learning_rate": 4.914335312385027e-07, "loss": -0.2989, "step": 1310 }, { "epoch": 0.17663588920112405, "grad_norm": 8.625, "learning_rate": 4.911277944789531e-07, "loss": -0.464, "step": 1320 }, { "epoch": 0.17797403987689014, "grad_norm": 54.25, "learning_rate": 4.908167955884461e-07, "loss": -0.4918, "step": 1330 }, { "epoch": 0.17931219055265624, "grad_norm": 21.375, "learning_rate": 4.905005413539098e-07, "loss": -0.3496, "step": 1340 }, { "epoch": 0.18065034122842233, "grad_norm": 51.5, "learning_rate": 4.90179038676959e-07, "loss": -0.3733, "step": 1350 }, { "epoch": 0.18198849190418842, "grad_norm": 84.0, "learning_rate": 4.898522945737453e-07, "loss": -0.525, "step": 1360 }, { "epoch": 0.1833266425799545, "grad_norm": 31.75, "learning_rate": 4.895203161748042e-07, "loss": -0.4827, "step": 1370 }, { "epoch": 0.1846647932557206, "grad_norm": 58.75, "learning_rate": 4.89183110724899e-07, "loss": -0.4205, "step": 1380 }, { "epoch": 0.1860029439314867, "grad_norm": 90.0, "learning_rate": 4.888406855828629e-07, "loss": -0.6536, "step": 1390 }, { "epoch": 0.1873410946072528, "grad_norm": 22.25, "learning_rate": 4.884930482214386e-07, "loss": -0.3357, "step": 1400 }, { "epoch": 0.18867924528301888, "grad_norm": 28.25, "learning_rate": 4.881402062271148e-07, "loss": -0.3557, "step": 1410 }, { "epoch": 0.19001739595878495, "grad_norm": 22.5, "learning_rate": 4.877821672999613e-07, "loss": -0.542, "step": 1420 }, { "epoch": 0.19135554663455104, "grad_norm": 5.46875, "learning_rate": 4.874189392534599e-07, "loss": -0.5235, "step": 1430 }, { "epoch": 0.19269369731031713, "grad_norm": 28.875, "learning_rate": 4.870505300143352e-07, "loss": -0.266, "step": 1440 }, { "epoch": 0.19403184798608322, "grad_norm": 19.125, "learning_rate": 4.866769476223804e-07, "loss": -0.5845, "step": 1450 }, { "epoch": 0.19536999866184931, "grad_norm": 37.0, "learning_rate": 4.862982002302829e-07, "loss": -0.4133, "step": 1460 }, { "epoch": 0.1967081493376154, "grad_norm": 51.5, "learning_rate": 4.859142961034454e-07, "loss": -0.5559, "step": 1470 }, { "epoch": 0.1980463000133815, "grad_norm": 57.25, "learning_rate": 4.855252436198064e-07, "loss": -0.3938, "step": 1480 }, { "epoch": 0.1993844506891476, "grad_norm": 52.5, "learning_rate": 4.851310512696566e-07, "loss": -0.4694, "step": 1490 }, { "epoch": 0.20072260136491368, "grad_norm": 48.75, "learning_rate": 4.847317276554545e-07, "loss": -0.512, "step": 1500 }, { "epoch": 0.20206075204067978, "grad_norm": 69.0, "learning_rate": 4.843272814916375e-07, "loss": -0.4054, "step": 1510 }, { "epoch": 0.20339890271644587, "grad_norm": 43.0, "learning_rate": 4.839177216044329e-07, "loss": -0.5674, "step": 1520 }, { "epoch": 0.20473705339221196, "grad_norm": 51.25, "learning_rate": 4.835030569316646e-07, "loss": -0.5255, "step": 1530 }, { "epoch": 0.20607520406797805, "grad_norm": 55.5, "learning_rate": 4.830832965225581e-07, "loss": -0.4292, "step": 1540 }, { "epoch": 0.20741335474374414, "grad_norm": 30.375, "learning_rate": 4.826584495375433e-07, "loss": -0.4477, "step": 1550 }, { "epoch": 0.20875150541951024, "grad_norm": 20.75, "learning_rate": 4.822285252480543e-07, "loss": -0.4126, "step": 1560 }, { "epoch": 0.21008965609527633, "grad_norm": 33.0, "learning_rate": 4.817935330363274e-07, "loss": -0.5722, "step": 1570 }, { "epoch": 0.21142780677104242, "grad_norm": 105.0, "learning_rate": 4.813534823951958e-07, "loss": -0.3666, "step": 1580 }, { "epoch": 0.2127659574468085, "grad_norm": 79.5, "learning_rate": 4.809083829278831e-07, "loss": -0.3935, "step": 1590 }, { "epoch": 0.2141041081225746, "grad_norm": 48.0, "learning_rate": 4.804582443477936e-07, "loss": -0.4185, "step": 1600 }, { "epoch": 0.2154422587983407, "grad_norm": 61.5, "learning_rate": 4.800030764782993e-07, "loss": -0.7174, "step": 1610 }, { "epoch": 0.2167804094741068, "grad_norm": 25.125, "learning_rate": 4.795428892525273e-07, "loss": -0.8589, "step": 1620 }, { "epoch": 0.21811856014987288, "grad_norm": 36.25, "learning_rate": 4.790776927131416e-07, "loss": -0.5246, "step": 1630 }, { "epoch": 0.21945671082563897, "grad_norm": 35.0, "learning_rate": 4.786074970121246e-07, "loss": -0.4672, "step": 1640 }, { "epoch": 0.22079486150140507, "grad_norm": 32.0, "learning_rate": 4.781323124105551e-07, "loss": -0.4255, "step": 1650 }, { "epoch": 0.22213301217717116, "grad_norm": 90.0, "learning_rate": 4.776521492783852e-07, "loss": -0.5239, "step": 1660 }, { "epoch": 0.22347116285293725, "grad_norm": 7.3125, "learning_rate": 4.771670180942129e-07, "loss": -0.6092, "step": 1670 }, { "epoch": 0.22480931352870334, "grad_norm": 48.75, "learning_rate": 4.7667692944505433e-07, "loss": -0.5588, "step": 1680 }, { "epoch": 0.22614746420446943, "grad_norm": 107.0, "learning_rate": 4.761818940261122e-07, "loss": -0.6679, "step": 1690 }, { "epoch": 0.22748561488023553, "grad_norm": 50.5, "learning_rate": 4.7568192264054264e-07, "loss": -0.6672, "step": 1700 }, { "epoch": 0.22882376555600162, "grad_norm": 52.25, "learning_rate": 4.7517702619921935e-07, "loss": -0.639, "step": 1710 }, { "epoch": 0.2301619162317677, "grad_norm": 48.5, "learning_rate": 4.746672157204954e-07, "loss": -0.6892, "step": 1720 }, { "epoch": 0.23150006690753377, "grad_norm": 144.0, "learning_rate": 4.741525023299631e-07, "loss": -0.6231, "step": 1730 }, { "epoch": 0.23283821758329987, "grad_norm": 70.0, "learning_rate": 4.736328972602106e-07, "loss": -0.5604, "step": 1740 }, { "epoch": 0.23417636825906596, "grad_norm": 4.5, "learning_rate": 4.731084118505776e-07, "loss": -0.5889, "step": 1750 }, { "epoch": 0.23551451893483205, "grad_norm": 55.25, "learning_rate": 4.7257905754690724e-07, "loss": -0.5754, "step": 1760 }, { "epoch": 0.23685266961059814, "grad_norm": 135.0, "learning_rate": 4.720448459012964e-07, "loss": -0.6005, "step": 1770 }, { "epoch": 0.23819082028636424, "grad_norm": 26.625, "learning_rate": 4.7150578857184384e-07, "loss": -0.5397, "step": 1780 }, { "epoch": 0.23952897096213033, "grad_norm": 44.75, "learning_rate": 4.7096189732239575e-07, "loss": -0.2146, "step": 1790 }, { "epoch": 0.24086712163789642, "grad_norm": 62.0, "learning_rate": 4.7041318402228877e-07, "loss": -0.5158, "step": 1800 }, { "epoch": 0.2422052723136625, "grad_norm": 43.5, "learning_rate": 4.698596606460911e-07, "loss": -0.6369, "step": 1810 }, { "epoch": 0.2435434229894286, "grad_norm": 37.0, "learning_rate": 4.693013392733415e-07, "loss": -0.3974, "step": 1820 }, { "epoch": 0.2448815736651947, "grad_norm": 21.875, "learning_rate": 4.68738232088285e-07, "loss": -0.5526, "step": 1830 }, { "epoch": 0.2462197243409608, "grad_norm": 43.5, "learning_rate": 4.681703513796077e-07, "loss": -0.4938, "step": 1840 }, { "epoch": 0.24755787501672688, "grad_norm": 80.0, "learning_rate": 4.675977095401682e-07, "loss": -0.5384, "step": 1850 }, { "epoch": 0.24889602569249297, "grad_norm": 46.25, "learning_rate": 4.6702031906672725e-07, "loss": -0.6101, "step": 1860 }, { "epoch": 0.25023417636825906, "grad_norm": 94.0, "learning_rate": 4.664381925596748e-07, "loss": -0.6852, "step": 1870 }, { "epoch": 0.25157232704402516, "grad_norm": 12.875, "learning_rate": 4.658513427227556e-07, "loss": -0.296, "step": 1880 }, { "epoch": 0.25291047771979125, "grad_norm": 47.25, "learning_rate": 4.652597823627915e-07, "loss": -0.9339, "step": 1890 }, { "epoch": 0.25424862839555734, "grad_norm": 71.0, "learning_rate": 4.6466352438940186e-07, "loss": -0.7014, "step": 1900 }, { "epoch": 0.25558677907132343, "grad_norm": 65.5, "learning_rate": 4.640625818147224e-07, "loss": -0.4842, "step": 1910 }, { "epoch": 0.2569249297470895, "grad_norm": 55.75, "learning_rate": 4.634569677531204e-07, "loss": -0.6714, "step": 1920 }, { "epoch": 0.2582630804228556, "grad_norm": 34.5, "learning_rate": 4.628466954209095e-07, "loss": -0.5312, "step": 1930 }, { "epoch": 0.2596012310986217, "grad_norm": 330.0, "learning_rate": 4.622317781360604e-07, "loss": -0.7533, "step": 1940 }, { "epoch": 0.2609393817743878, "grad_norm": 98.5, "learning_rate": 4.6161222931791084e-07, "loss": -0.5828, "step": 1950 }, { "epoch": 0.2622775324501539, "grad_norm": 58.5, "learning_rate": 4.609880624868722e-07, "loss": -0.6233, "step": 1960 }, { "epoch": 0.26361568312592, "grad_norm": 45.0, "learning_rate": 4.60359291264135e-07, "loss": -0.731, "step": 1970 }, { "epoch": 0.2649538338016861, "grad_norm": 31.5, "learning_rate": 4.597259293713712e-07, "loss": -0.4871, "step": 1980 }, { "epoch": 0.26629198447745217, "grad_norm": 65.5, "learning_rate": 4.590879906304352e-07, "loss": -0.5111, "step": 1990 }, { "epoch": 0.26763013515321826, "grad_norm": 34.25, "learning_rate": 4.5844548896306156e-07, "loss": -0.4392, "step": 2000 } ], "logging_steps": 10, "max_steps": 7473, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }