{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0223, "eval_steps": 500, "global_step": 2800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001, "grad_norm": 0.3872889280319214, "learning_rate": 3.8817000000000005e-06, "loss": 1.5489984512329102, "step": 10 }, { "epoch": 0.002, "grad_norm": 0.3785003423690796, "learning_rate": 8.1947e-06, "loss": 1.554966926574707, "step": 20 }, { "epoch": 0.003, "grad_norm": 0.38793623447418213, "learning_rate": 1.25077e-05, "loss": 1.546360397338867, "step": 30 }, { "epoch": 0.004, "grad_norm": 0.4206889569759369, "learning_rate": 1.68207e-05, "loss": 1.606406021118164, "step": 40 }, { "epoch": 0.005, "grad_norm": 0.38157036900520325, "learning_rate": 2.11337e-05, "loss": 1.5404697418212892, "step": 50 }, { "epoch": 0.006, "grad_norm": 0.39413636922836304, "learning_rate": 2.54467e-05, "loss": 1.5285799026489257, "step": 60 }, { "epoch": 0.007, "grad_norm": 0.40694475173950195, "learning_rate": 2.97597e-05, "loss": 1.504967498779297, "step": 70 }, { "epoch": 0.008, "grad_norm": 0.41990774869918823, "learning_rate": 3.40727e-05, "loss": 1.5729640960693358, "step": 80 }, { "epoch": 0.009, "grad_norm": 0.403705358505249, "learning_rate": 3.83857e-05, "loss": 1.5551136970520019, "step": 90 }, { "epoch": 0.01, "grad_norm": 0.3957897424697876, "learning_rate": 4.2698700000000005e-05, "loss": 1.5575121879577636, "step": 100 }, { "epoch": 0.011, "grad_norm": 0.40792497992515564, "learning_rate": 4.70117e-05, "loss": 1.518178939819336, "step": 110 }, { "epoch": 0.012, "grad_norm": 0.42794761061668396, "learning_rate": 5.13247e-05, "loss": 1.5593727111816407, "step": 120 }, { "epoch": 0.013, "grad_norm": 0.4005564749240875, "learning_rate": 5.5637700000000004e-05, "loss": 1.562605857849121, "step": 130 }, { "epoch": 0.014, "grad_norm": 0.3942921757698059, "learning_rate": 5.99507e-05, "loss": 1.5700313568115234, "step": 140 }, { "epoch": 0.015, "grad_norm": 0.399198055267334, "learning_rate": 6.42637e-05, "loss": 1.5733401298522949, "step": 150 }, { "epoch": 0.016, "grad_norm": 0.39514100551605225, "learning_rate": 6.857670000000001e-05, "loss": 1.5353406906127929, "step": 160 }, { "epoch": 0.017, "grad_norm": 0.3957034945487976, "learning_rate": 7.288970000000001e-05, "loss": 1.5085016250610352, "step": 170 }, { "epoch": 0.018, "grad_norm": 0.4351767897605896, "learning_rate": 7.72027e-05, "loss": 1.5530328750610352, "step": 180 }, { "epoch": 0.019, "grad_norm": 0.418708473443985, "learning_rate": 8.15157e-05, "loss": 1.5638229370117187, "step": 190 }, { "epoch": 0.02, "grad_norm": 0.40256965160369873, "learning_rate": 8.58287e-05, "loss": 1.5796945571899415, "step": 200 }, { "epoch": 0.021, "grad_norm": 0.397319495677948, "learning_rate": 8.625982049284077e-05, "loss": 1.5816753387451172, "step": 210 }, { "epoch": 0.022, "grad_norm": 0.41874557733535767, "learning_rate": 8.625919997618398e-05, "loss": 1.5102975845336915, "step": 220 }, { "epoch": 0.023, "grad_norm": 0.40078213810920715, "learning_rate": 8.625813624026713e-05, "loss": 1.571668529510498, "step": 230 }, { "epoch": 0.024, "grad_norm": 0.43663620948791504, "learning_rate": 8.625662929602175e-05, "loss": 1.5277430534362793, "step": 240 }, { "epoch": 0.025, "grad_norm": 0.4428839385509491, "learning_rate": 8.625467915893403e-05, "loss": 1.5955425262451173, "step": 250 }, { "epoch": 0.026, "grad_norm": 0.4413251578807831, "learning_rate": 8.625228584904462e-05, "loss": 1.5842302322387696, "step": 260 }, { "epoch": 0.027, "grad_norm": 0.42099958658218384, "learning_rate": 8.62494493909485e-05, "loss": 1.5954334259033203, "step": 270 }, { "epoch": 0.028, "grad_norm": 0.43221530318260193, "learning_rate": 8.624616981379467e-05, "loss": 1.5589290618896485, "step": 280 }, { "epoch": 0.029, "grad_norm": 0.42665326595306396, "learning_rate": 8.624244715128585e-05, "loss": 1.5465147972106934, "step": 290 }, { "epoch": 0.03, "grad_norm": 0.42965951561927795, "learning_rate": 8.623828144167817e-05, "loss": 1.5350761413574219, "step": 300 }, { "epoch": 0.031, "grad_norm": 0.42243221402168274, "learning_rate": 8.623367272778072e-05, "loss": 1.5516321182250976, "step": 310 }, { "epoch": 0.032, "grad_norm": 0.4054717421531677, "learning_rate": 8.622862105695517e-05, "loss": 1.5238536834716796, "step": 320 }, { "epoch": 0.033, "grad_norm": 0.43124857544898987, "learning_rate": 8.622312648111529e-05, "loss": 1.5706765174865722, "step": 330 }, { "epoch": 0.034, "grad_norm": 0.4318324625492096, "learning_rate": 8.621718905672629e-05, "loss": 1.566463565826416, "step": 340 }, { "epoch": 0.035, "grad_norm": 0.43307042121887207, "learning_rate": 8.621080884480445e-05, "loss": 1.530984878540039, "step": 350 }, { "epoch": 0.036, "grad_norm": 0.43541616201400757, "learning_rate": 8.620398591091628e-05, "loss": 1.5489194869995118, "step": 360 }, { "epoch": 0.037, "grad_norm": 0.40485483407974243, "learning_rate": 8.619672032517798e-05, "loss": 1.576507568359375, "step": 370 }, { "epoch": 0.038, "grad_norm": 0.38804423809051514, "learning_rate": 8.618901216225471e-05, "loss": 1.5632071495056152, "step": 380 }, { "epoch": 0.039, "grad_norm": 0.44407907128334045, "learning_rate": 8.618086150135976e-05, "loss": 1.571460247039795, "step": 390 }, { "epoch": 0.04, "grad_norm": 0.4320124387741089, "learning_rate": 8.617226842625377e-05, "loss": 1.5724874496459962, "step": 400 }, { "epoch": 0.041, "grad_norm": 0.41030609607696533, "learning_rate": 8.61632330252439e-05, "loss": 1.552263069152832, "step": 410 }, { "epoch": 0.042, "grad_norm": 0.4280606210231781, "learning_rate": 8.615375539118286e-05, "loss": 1.5935471534729004, "step": 420 }, { "epoch": 0.043, "grad_norm": 0.425942063331604, "learning_rate": 8.614383562146802e-05, "loss": 1.618431854248047, "step": 430 }, { "epoch": 0.044, "grad_norm": 0.4112652540206909, "learning_rate": 8.613347381804036e-05, "loss": 1.6068254470825196, "step": 440 }, { "epoch": 0.045, "grad_norm": 0.43573281168937683, "learning_rate": 8.612267008738342e-05, "loss": 1.501287078857422, "step": 450 }, { "epoch": 0.046, "grad_norm": 0.43779465556144714, "learning_rate": 8.611142454052226e-05, "loss": 1.5798423767089844, "step": 460 }, { "epoch": 0.047, "grad_norm": 0.45562246441841125, "learning_rate": 8.609973729302228e-05, "loss": 1.5571372985839844, "step": 470 }, { "epoch": 0.048, "grad_norm": 0.4224706292152405, "learning_rate": 8.608760846498805e-05, "loss": 1.5518342971801757, "step": 480 }, { "epoch": 0.049, "grad_norm": 0.4402417242527008, "learning_rate": 8.6075038181062e-05, "loss": 1.5404040336608886, "step": 490 }, { "epoch": 0.05, "grad_norm": 0.42017725110054016, "learning_rate": 8.606202657042327e-05, "loss": 1.5324398040771485, "step": 500 }, { "epoch": 0.051, "grad_norm": 0.41633936762809753, "learning_rate": 8.604857376678628e-05, "loss": 1.6201072692871095, "step": 510 }, { "epoch": 0.052, "grad_norm": 0.4232465922832489, "learning_rate": 8.603467990839943e-05, "loss": 1.579312515258789, "step": 520 }, { "epoch": 0.053, "grad_norm": 0.4202162027359009, "learning_rate": 8.602034513804359e-05, "loss": 1.5241361618041993, "step": 530 }, { "epoch": 0.054, "grad_norm": 0.4149995446205139, "learning_rate": 8.60055696030307e-05, "loss": 1.5884086608886718, "step": 540 }, { "epoch": 0.055, "grad_norm": 0.4352942109107971, "learning_rate": 8.599035345520225e-05, "loss": 1.5795814514160156, "step": 550 }, { "epoch": 0.056, "grad_norm": 0.406890332698822, "learning_rate": 8.59746968509277e-05, "loss": 1.5849136352539062, "step": 560 }, { "epoch": 0.057, "grad_norm": 0.4194611608982086, "learning_rate": 8.595859995110288e-05, "loss": 1.5220673561096192, "step": 570 }, { "epoch": 0.058, "grad_norm": 0.42952999472618103, "learning_rate": 8.594206292114837e-05, "loss": 1.5716930389404298, "step": 580 }, { "epoch": 0.059, "grad_norm": 0.4449344575405121, "learning_rate": 8.59250859310077e-05, "loss": 1.5606157302856445, "step": 590 }, { "epoch": 0.06, "grad_norm": 0.4540117383003235, "learning_rate": 8.590766915514571e-05, "loss": 1.5590102195739746, "step": 600 }, { "epoch": 0.061, "grad_norm": 0.4081774652004242, "learning_rate": 8.588981277254677e-05, "loss": 1.513133430480957, "step": 610 }, { "epoch": 0.062, "grad_norm": 0.43133676052093506, "learning_rate": 8.587151696671278e-05, "loss": 1.5470260620117187, "step": 620 }, { "epoch": 0.063, "grad_norm": 0.42611169815063477, "learning_rate": 8.585278192566148e-05, "loss": 1.5632414817810059, "step": 630 }, { "epoch": 0.064, "grad_norm": 0.4463273286819458, "learning_rate": 8.583360784192441e-05, "loss": 1.6180675506591797, "step": 640 }, { "epoch": 0.065, "grad_norm": 0.4165816009044647, "learning_rate": 8.581399491254493e-05, "loss": 1.5370763778686523, "step": 650 }, { "epoch": 0.066, "grad_norm": 0.44205260276794434, "learning_rate": 8.579394333907624e-05, "loss": 1.558724594116211, "step": 660 }, { "epoch": 0.067, "grad_norm": 0.4382662773132324, "learning_rate": 8.57734533275793e-05, "loss": 1.5468637466430664, "step": 670 }, { "epoch": 0.068, "grad_norm": 0.40459638833999634, "learning_rate": 8.575252508862066e-05, "loss": 1.543410301208496, "step": 680 }, { "epoch": 0.069, "grad_norm": 0.42760446667671204, "learning_rate": 8.573115883727036e-05, "loss": 1.590751838684082, "step": 690 }, { "epoch": 0.07, "grad_norm": 0.42787182331085205, "learning_rate": 8.570935479309969e-05, "loss": 1.591897201538086, "step": 700 }, { "epoch": 0.071, "grad_norm": 0.4374030828475952, "learning_rate": 8.568711318017893e-05, "loss": 1.5722368240356446, "step": 710 }, { "epoch": 0.072, "grad_norm": 0.4523533582687378, "learning_rate": 8.566443422707509e-05, "loss": 1.5926239013671875, "step": 720 }, { "epoch": 0.073, "grad_norm": 0.44648537039756775, "learning_rate": 8.564131816684949e-05, "loss": 1.5328532218933106, "step": 730 }, { "epoch": 0.074, "grad_norm": 0.44178444147109985, "learning_rate": 8.561776523705541e-05, "loss": 1.5162950515747071, "step": 740 }, { "epoch": 0.075, "grad_norm": 0.459213525056839, "learning_rate": 8.559377567973564e-05, "loss": 1.627875518798828, "step": 750 }, { "epoch": 0.076, "grad_norm": 0.4523981511592865, "learning_rate": 8.556934974142002e-05, "loss": 1.5238592147827148, "step": 760 }, { "epoch": 0.077, "grad_norm": 0.4350197911262512, "learning_rate": 8.554448767312285e-05, "loss": 1.5533188819885253, "step": 770 }, { "epoch": 0.078, "grad_norm": 0.461459755897522, "learning_rate": 8.551918973034032e-05, "loss": 1.5818538665771484, "step": 780 }, { "epoch": 0.079, "grad_norm": 0.42179811000823975, "learning_rate": 8.549345617304799e-05, "loss": 1.5900559425354004, "step": 790 }, { "epoch": 0.08, "grad_norm": 0.4687558710575104, "learning_rate": 8.546728726569792e-05, "loss": 1.5136714935302735, "step": 800 }, { "epoch": 0.081, "grad_norm": 0.42299333214759827, "learning_rate": 8.544068327721616e-05, "loss": 1.6079557418823243, "step": 810 }, { "epoch": 0.082, "grad_norm": 0.45741724967956543, "learning_rate": 8.541364448099983e-05, "loss": 1.6250675201416016, "step": 820 }, { "epoch": 0.083, "grad_norm": 0.47489041090011597, "learning_rate": 8.538617115491442e-05, "loss": 1.5306575775146485, "step": 830 }, { "epoch": 0.084, "grad_norm": 0.43424174189567566, "learning_rate": 8.535826358129085e-05, "loss": 1.5837011337280273, "step": 840 }, { "epoch": 0.085, "grad_norm": 0.43130776286125183, "learning_rate": 8.53299220469226e-05, "loss": 1.560431957244873, "step": 850 }, { "epoch": 0.086, "grad_norm": 0.4119592607021332, "learning_rate": 8.530114684306282e-05, "loss": 1.572993278503418, "step": 860 }, { "epoch": 0.087, "grad_norm": 0.4472162425518036, "learning_rate": 8.527193826542123e-05, "loss": 1.5893125534057617, "step": 870 }, { "epoch": 0.088, "grad_norm": 0.43497446179389954, "learning_rate": 8.524229661416117e-05, "loss": 1.560661506652832, "step": 880 }, { "epoch": 0.089, "grad_norm": 0.43729737401008606, "learning_rate": 8.521222219389646e-05, "loss": 1.5510387420654297, "step": 890 }, { "epoch": 0.09, "grad_norm": 0.4537718594074249, "learning_rate": 8.518171531368828e-05, "loss": 1.5789440155029297, "step": 900 }, { "epoch": 0.091, "grad_norm": 0.4269010126590729, "learning_rate": 8.515077628704208e-05, "loss": 1.5246685028076172, "step": 910 }, { "epoch": 0.092, "grad_norm": 0.4279134273529053, "learning_rate": 8.511940543190416e-05, "loss": 1.5408878326416016, "step": 920 }, { "epoch": 0.093, "grad_norm": 0.4487226605415344, "learning_rate": 8.508760307065864e-05, "loss": 1.5889862060546875, "step": 930 }, { "epoch": 0.094, "grad_norm": 0.4346173107624054, "learning_rate": 8.505536953012398e-05, "loss": 1.4902814865112304, "step": 940 }, { "epoch": 0.095, "grad_norm": 0.431169718503952, "learning_rate": 8.502270514154968e-05, "loss": 1.5507546424865724, "step": 950 }, { "epoch": 0.096, "grad_norm": 0.46881142258644104, "learning_rate": 8.498961024061287e-05, "loss": 1.5701534271240234, "step": 960 }, { "epoch": 0.097, "grad_norm": 0.42858099937438965, "learning_rate": 8.495608516741486e-05, "loss": 1.4937341690063477, "step": 970 }, { "epoch": 0.098, "grad_norm": 0.4465695023536682, "learning_rate": 8.492213026647764e-05, "loss": 1.5935869216918945, "step": 980 }, { "epoch": 0.099, "grad_norm": 0.4498782157897949, "learning_rate": 8.488774588674037e-05, "loss": 1.5635653495788575, "step": 990 }, { "epoch": 0.1, "grad_norm": 0.4431057274341583, "learning_rate": 8.485293238155573e-05, "loss": 1.5423452377319335, "step": 1000 }, { "epoch": 0.101, "grad_norm": 0.4575574994087219, "learning_rate": 8.481769010868637e-05, "loss": 1.5350657463073731, "step": 1010 }, { "epoch": 0.102, "grad_norm": 0.4342168867588043, "learning_rate": 8.478201943030117e-05, "loss": 1.5452421188354493, "step": 1020 }, { "epoch": 0.103, "grad_norm": 0.445075124502182, "learning_rate": 8.474592071297152e-05, "loss": 1.554254150390625, "step": 1030 }, { "epoch": 0.104, "grad_norm": 0.4207341969013214, "learning_rate": 8.470939432766762e-05, "loss": 1.5895505905151368, "step": 1040 }, { "epoch": 0.105, "grad_norm": 0.4459976851940155, "learning_rate": 8.467244064975459e-05, "loss": 1.546200942993164, "step": 1050 }, { "epoch": 0.106, "grad_norm": 0.4187023341655731, "learning_rate": 8.463506005898863e-05, "loss": 1.5332408905029298, "step": 1060 }, { "epoch": 0.107, "grad_norm": 0.4594968557357788, "learning_rate": 8.459725293951314e-05, "loss": 1.5771045684814453, "step": 1070 }, { "epoch": 0.108, "grad_norm": 0.4230520725250244, "learning_rate": 8.455901967985479e-05, "loss": 1.5950983047485352, "step": 1080 }, { "epoch": 0.109, "grad_norm": 0.43380364775657654, "learning_rate": 8.452036067291948e-05, "loss": 1.6060211181640625, "step": 1090 }, { "epoch": 0.11, "grad_norm": 0.4305785596370697, "learning_rate": 8.448127631598831e-05, "loss": 1.5464977264404296, "step": 1100 }, { "epoch": 0.111, "grad_norm": 0.46058452129364014, "learning_rate": 8.44417670107135e-05, "loss": 1.5613780975341798, "step": 1110 }, { "epoch": 0.112, "grad_norm": 0.42492201924324036, "learning_rate": 8.440183316311433e-05, "loss": 1.5371337890625, "step": 1120 }, { "epoch": 0.113, "grad_norm": 0.45703360438346863, "learning_rate": 8.436147518357284e-05, "loss": 1.5895401000976563, "step": 1130 }, { "epoch": 0.114, "grad_norm": 0.4336182773113251, "learning_rate": 8.432069348682973e-05, "loss": 1.5472881317138671, "step": 1140 }, { "epoch": 0.115, "grad_norm": 0.44153451919555664, "learning_rate": 8.427948849198002e-05, "loss": 1.5661738395690918, "step": 1150 }, { "epoch": 0.116, "grad_norm": 0.43905603885650635, "learning_rate": 8.42378606224688e-05, "loss": 1.5574228286743164, "step": 1160 }, { "epoch": 0.117, "grad_norm": 0.43091803789138794, "learning_rate": 8.419581030608683e-05, "loss": 1.5603097915649413, "step": 1170 }, { "epoch": 0.118, "grad_norm": 0.4270052909851074, "learning_rate": 8.415333797496616e-05, "loss": 1.552748680114746, "step": 1180 }, { "epoch": 0.119, "grad_norm": 0.4377681016921997, "learning_rate": 8.411044406557571e-05, "loss": 1.5209136962890626, "step": 1190 }, { "epoch": 0.12, "grad_norm": 0.4468444585800171, "learning_rate": 8.406712901871679e-05, "loss": 1.5935579299926759, "step": 1200 }, { "epoch": 0.121, "grad_norm": 0.40383124351501465, "learning_rate": 8.402339327951851e-05, "loss": 1.5492631912231445, "step": 1210 }, { "epoch": 0.122, "grad_norm": 0.453901469707489, "learning_rate": 8.397923729743324e-05, "loss": 1.5286017417907716, "step": 1220 }, { "epoch": 0.123, "grad_norm": 0.45355984568595886, "learning_rate": 8.393466152623203e-05, "loss": 1.534010887145996, "step": 1230 }, { "epoch": 0.124, "grad_norm": 0.41849273443222046, "learning_rate": 8.388966642399991e-05, "loss": 1.5546277999877929, "step": 1240 }, { "epoch": 0.125, "grad_norm": 0.42275470495224, "learning_rate": 8.384425245313115e-05, "loss": 1.5799736976623535, "step": 1250 }, { "epoch": 0.126, "grad_norm": 0.44112563133239746, "learning_rate": 8.379842008032457e-05, "loss": 1.5725579261779785, "step": 1260 }, { "epoch": 0.127, "grad_norm": 0.42606085538864136, "learning_rate": 8.375216977657873e-05, "loss": 1.534956932067871, "step": 1270 }, { "epoch": 0.128, "grad_norm": 0.44213414192199707, "learning_rate": 8.370550201718704e-05, "loss": 1.496871566772461, "step": 1280 }, { "epoch": 0.129, "grad_norm": 0.47176358103752136, "learning_rate": 8.365841728173296e-05, "loss": 1.5814940452575683, "step": 1290 }, { "epoch": 0.13, "grad_norm": 0.4457390606403351, "learning_rate": 8.361091605408497e-05, "loss": 1.599702739715576, "step": 1300 }, { "epoch": 0.131, "grad_norm": 0.43326595425605774, "learning_rate": 8.356299882239168e-05, "loss": 1.5610076904296875, "step": 1310 }, { "epoch": 0.132, "grad_norm": 0.43634170293807983, "learning_rate": 8.35146660790768e-05, "loss": 1.5391773223876952, "step": 1320 }, { "epoch": 0.133, "grad_norm": 0.4215056002140045, "learning_rate": 8.346591832083405e-05, "loss": 1.538853359222412, "step": 1330 }, { "epoch": 0.134, "grad_norm": 0.4359005093574524, "learning_rate": 8.341675604862204e-05, "loss": 1.5893823623657226, "step": 1340 }, { "epoch": 0.135, "grad_norm": 0.4620271623134613, "learning_rate": 8.336717976765922e-05, "loss": 1.544410514831543, "step": 1350 }, { "epoch": 0.136, "grad_norm": 0.45624276995658875, "learning_rate": 8.331718998741857e-05, "loss": 1.5727739334106445, "step": 1360 }, { "epoch": 0.137, "grad_norm": 0.47488826513290405, "learning_rate": 8.32667872216224e-05, "loss": 1.5734460830688477, "step": 1370 }, { "epoch": 0.138, "grad_norm": 0.4481564462184906, "learning_rate": 8.321597198823715e-05, "loss": 1.5132453918457032, "step": 1380 }, { "epoch": 0.139, "grad_norm": 0.44024789333343506, "learning_rate": 8.316474480946796e-05, "loss": 1.5181768417358399, "step": 1390 }, { "epoch": 0.14, "grad_norm": 0.43633797764778137, "learning_rate": 8.311310621175332e-05, "loss": 1.5812137603759766, "step": 1400 }, { "epoch": 0.141, "grad_norm": 0.4629945158958435, "learning_rate": 8.306105672575975e-05, "loss": 1.5248623847961427, "step": 1410 }, { "epoch": 0.142, "grad_norm": 0.44547826051712036, "learning_rate": 8.300859688637624e-05, "loss": 1.603738021850586, "step": 1420 }, { "epoch": 0.143, "grad_norm": 0.4610430896282196, "learning_rate": 8.295572723270881e-05, "loss": 1.5599722862243652, "step": 1430 }, { "epoch": 0.144, "grad_norm": 0.4239096939563751, "learning_rate": 8.290244830807493e-05, "loss": 1.5240245819091798, "step": 1440 }, { "epoch": 0.145, "grad_norm": 0.4612870216369629, "learning_rate": 8.284876065999803e-05, "loss": 1.5548505783081055, "step": 1450 }, { "epoch": 0.146, "grad_norm": 0.437900185585022, "learning_rate": 8.279466484020172e-05, "loss": 1.5739557266235351, "step": 1460 }, { "epoch": 0.147, "grad_norm": 0.459146648645401, "learning_rate": 8.274016140460427e-05, "loss": 1.5370497703552246, "step": 1470 }, { "epoch": 0.148, "grad_norm": 0.4295809268951416, "learning_rate": 8.268525091331279e-05, "loss": 1.5780242919921874, "step": 1480 }, { "epoch": 0.149, "grad_norm": 0.43193328380584717, "learning_rate": 8.262993393061756e-05, "loss": 1.5061514854431153, "step": 1490 }, { "epoch": 0.15, "grad_norm": 0.44761526584625244, "learning_rate": 8.257421102498613e-05, "loss": 1.5490300178527832, "step": 1500 }, { "epoch": 0.151, "grad_norm": 0.4909959137439728, "learning_rate": 8.251808276905761e-05, "loss": 1.5348563194274902, "step": 1510 }, { "epoch": 0.152, "grad_norm": 0.45388928055763245, "learning_rate": 8.246154973963663e-05, "loss": 1.5422224044799804, "step": 1520 }, { "epoch": 0.153, "grad_norm": 0.44226154685020447, "learning_rate": 8.240461251768759e-05, "loss": 1.5589332580566406, "step": 1530 }, { "epoch": 0.154, "grad_norm": 0.45566076040267944, "learning_rate": 8.23472716883285e-05, "loss": 1.55558500289917, "step": 1540 }, { "epoch": 0.155, "grad_norm": 0.45849281549453735, "learning_rate": 8.228952784082511e-05, "loss": 1.60096435546875, "step": 1550 }, { "epoch": 0.156, "grad_norm": 0.4728466272354126, "learning_rate": 8.22313815685848e-05, "loss": 1.594934844970703, "step": 1560 }, { "epoch": 0.157, "grad_norm": 0.4469038248062134, "learning_rate": 8.217283346915048e-05, "loss": 1.553289031982422, "step": 1570 }, { "epoch": 0.158, "grad_norm": 0.48276886343955994, "learning_rate": 8.211388414419445e-05, "loss": 1.503675937652588, "step": 1580 }, { "epoch": 0.159, "grad_norm": 0.4695449769496918, "learning_rate": 8.205453419951223e-05, "loss": 1.5649538040161133, "step": 1590 }, { "epoch": 0.16, "grad_norm": 0.43924853205680847, "learning_rate": 8.199478424501635e-05, "loss": 1.527259635925293, "step": 1600 }, { "epoch": 0.161, "grad_norm": 0.4189310073852539, "learning_rate": 8.193463489473004e-05, "loss": 1.55230655670166, "step": 1610 }, { "epoch": 0.162, "grad_norm": 0.44848498702049255, "learning_rate": 8.187408676678092e-05, "loss": 1.5449299812316895, "step": 1620 }, { "epoch": 0.163, "grad_norm": 0.4382420480251312, "learning_rate": 8.181314048339468e-05, "loss": 1.541883659362793, "step": 1630 }, { "epoch": 0.164, "grad_norm": 0.42705485224723816, "learning_rate": 8.175179667088869e-05, "loss": 1.5482391357421874, "step": 1640 }, { "epoch": 0.165, "grad_norm": 0.4264799654483795, "learning_rate": 8.169005595966554e-05, "loss": 1.562957191467285, "step": 1650 }, { "epoch": 0.166, "grad_norm": 0.46075376868247986, "learning_rate": 8.162791898420655e-05, "loss": 1.533614730834961, "step": 1660 }, { "epoch": 0.167, "grad_norm": 0.46955326199531555, "learning_rate": 8.156538638306531e-05, "loss": 1.5150230407714844, "step": 1670 }, { "epoch": 0.168, "grad_norm": 0.4578472077846527, "learning_rate": 8.150245879886102e-05, "loss": 1.579137420654297, "step": 1680 }, { "epoch": 0.169, "grad_norm": 0.42824026942253113, "learning_rate": 8.143913687827199e-05, "loss": 1.5843921661376954, "step": 1690 }, { "epoch": 0.17, "grad_norm": 0.43932676315307617, "learning_rate": 8.137542127202888e-05, "loss": 1.605410385131836, "step": 1700 }, { "epoch": 0.171, "grad_norm": 0.4417635500431061, "learning_rate": 8.131131263490822e-05, "loss": 1.5659252166748048, "step": 1710 }, { "epoch": 0.172, "grad_norm": 0.448393315076828, "learning_rate": 8.124681162572536e-05, "loss": 1.5661985397338867, "step": 1720 }, { "epoch": 0.173, "grad_norm": 0.4424435496330261, "learning_rate": 8.118191890732799e-05, "loss": 1.5341227531433106, "step": 1730 }, { "epoch": 0.174, "grad_norm": 0.4247022867202759, "learning_rate": 8.111663514658923e-05, "loss": 1.5798539161682128, "step": 1740 }, { "epoch": 0.175, "grad_norm": 0.4436150789260864, "learning_rate": 8.105096101440067e-05, "loss": 1.5780141830444336, "step": 1750 }, { "epoch": 0.176, "grad_norm": 0.4493919312953949, "learning_rate": 8.098489718566567e-05, "loss": 1.5496795654296875, "step": 1760 }, { "epoch": 0.177, "grad_norm": 0.44591131806373596, "learning_rate": 8.091844433929229e-05, "loss": 1.5309679985046387, "step": 1770 }, { "epoch": 0.178, "grad_norm": 0.44779840111732483, "learning_rate": 8.085160315818632e-05, "loss": 1.5156972885131836, "step": 1780 }, { "epoch": 0.179, "grad_norm": 0.44151243567466736, "learning_rate": 8.078437432924434e-05, "loss": 1.5683908462524414, "step": 1790 }, { "epoch": 0.18, "grad_norm": 0.43576961755752563, "learning_rate": 8.071675854334655e-05, "loss": 1.479705810546875, "step": 1800 }, { "epoch": 0.181, "grad_norm": 0.4513467848300934, "learning_rate": 8.06487564953498e-05, "loss": 1.553599739074707, "step": 1810 }, { "epoch": 0.182, "grad_norm": 0.4427294433116913, "learning_rate": 8.058036888408034e-05, "loss": 1.5128995895385742, "step": 1820 }, { "epoch": 0.183, "grad_norm": 0.458186537027359, "learning_rate": 8.051159641232666e-05, "loss": 1.5448317527770996, "step": 1830 }, { "epoch": 0.184, "grad_norm": 0.4836799204349518, "learning_rate": 8.044243978683234e-05, "loss": 1.5993682861328125, "step": 1840 }, { "epoch": 0.185, "grad_norm": 0.44361555576324463, "learning_rate": 8.03728997182887e-05, "loss": 1.5255595207214356, "step": 1850 }, { "epoch": 0.186, "grad_norm": 0.41420966386795044, "learning_rate": 8.030297692132756e-05, "loss": 1.582503890991211, "step": 1860 }, { "epoch": 0.187, "grad_norm": 0.44590333104133606, "learning_rate": 8.023267211451381e-05, "loss": 1.570404624938965, "step": 1870 }, { "epoch": 0.188, "grad_norm": 0.43962764739990234, "learning_rate": 8.016198602033813e-05, "loss": 1.5464605331420898, "step": 1880 }, { "epoch": 0.189, "grad_norm": 0.4597555994987488, "learning_rate": 8.009091936520949e-05, "loss": 1.4956825256347657, "step": 1890 }, { "epoch": 0.19, "grad_norm": 0.4748958349227905, "learning_rate": 8.001947287944775e-05, "loss": 1.5619684219360352, "step": 1900 }, { "epoch": 0.191, "grad_norm": 0.44536739587783813, "learning_rate": 7.99476472972761e-05, "loss": 1.5296592712402344, "step": 1910 }, { "epoch": 0.192, "grad_norm": 0.43877002596855164, "learning_rate": 7.98754433568135e-05, "loss": 1.531735324859619, "step": 1920 }, { "epoch": 0.193, "grad_norm": 0.4474166929721832, "learning_rate": 7.980286180006717e-05, "loss": 1.5319225311279296, "step": 1930 }, { "epoch": 0.194, "grad_norm": 0.4478398859500885, "learning_rate": 7.972990337292492e-05, "loss": 1.598667812347412, "step": 1940 }, { "epoch": 0.195, "grad_norm": 0.4543236196041107, "learning_rate": 7.965656882514742e-05, "loss": 1.5022565841674804, "step": 1950 }, { "epoch": 0.196, "grad_norm": 0.4551485776901245, "learning_rate": 7.958285891036068e-05, "loss": 1.4970108032226563, "step": 1960 }, { "epoch": 0.197, "grad_norm": 0.4381597638130188, "learning_rate": 7.950877438604807e-05, "loss": 1.5610149383544922, "step": 1970 }, { "epoch": 0.198, "grad_norm": 0.47578102350234985, "learning_rate": 7.943431601354271e-05, "loss": 1.4768410682678224, "step": 1980 }, { "epoch": 0.199, "grad_norm": 0.43383491039276123, "learning_rate": 7.93594845580196e-05, "loss": 1.615484619140625, "step": 1990 }, { "epoch": 0.2, "grad_norm": 0.4749927520751953, "learning_rate": 7.928428078848771e-05, "loss": 1.5539310455322266, "step": 2000 }, { "epoch": 0.201, "grad_norm": 0.4608052372932434, "learning_rate": 7.920870547778215e-05, "loss": 1.5318676948547363, "step": 2010 }, { "epoch": 0.202, "grad_norm": 0.45553329586982727, "learning_rate": 7.913275940255613e-05, "loss": 1.5542593002319336, "step": 2020 }, { "epoch": 0.203, "grad_norm": 0.4951193630695343, "learning_rate": 7.905644334327311e-05, "loss": 1.5068115234375, "step": 2030 }, { "epoch": 0.204, "grad_norm": 0.4648756980895996, "learning_rate": 7.897975808419868e-05, "loss": 1.5019064903259278, "step": 2040 }, { "epoch": 0.205, "grad_norm": 0.43299543857574463, "learning_rate": 7.890270441339251e-05, "loss": 1.536239242553711, "step": 2050 }, { "epoch": 0.206, "grad_norm": 0.4560064971446991, "learning_rate": 7.882528312270028e-05, "loss": 1.5249292373657226, "step": 2060 }, { "epoch": 0.207, "grad_norm": 0.4477216601371765, "learning_rate": 7.874749500774555e-05, "loss": 1.515114974975586, "step": 2070 }, { "epoch": 0.208, "grad_norm": 0.4781554341316223, "learning_rate": 7.866934086792156e-05, "loss": 1.5463275909423828, "step": 2080 }, { "epoch": 0.209, "grad_norm": 0.45784494280815125, "learning_rate": 7.859082150638303e-05, "loss": 1.5834733963012695, "step": 2090 }, { "epoch": 0.21, "grad_norm": 0.41224566102027893, "learning_rate": 7.851193773003785e-05, "loss": 1.5228580474853515, "step": 2100 }, { "epoch": 0.211, "grad_norm": 0.4573986530303955, "learning_rate": 7.84326903495389e-05, "loss": 1.4756481170654296, "step": 2110 }, { "epoch": 0.212, "grad_norm": 0.477107971906662, "learning_rate": 7.835308017927563e-05, "loss": 1.563486671447754, "step": 2120 }, { "epoch": 0.213, "grad_norm": 0.4521276652812958, "learning_rate": 7.82731080373657e-05, "loss": 1.5458997726440429, "step": 2130 }, { "epoch": 0.214, "grad_norm": 0.4387427568435669, "learning_rate": 7.819277474564661e-05, "loss": 1.5429878234863281, "step": 2140 }, { "epoch": 0.215, "grad_norm": 0.45496708154678345, "learning_rate": 7.811208112966722e-05, "loss": 1.5660066604614258, "step": 2150 }, { "epoch": 0.216, "grad_norm": 0.5154712796211243, "learning_rate": 7.803102801867928e-05, "loss": 1.5109113693237304, "step": 2160 }, { "epoch": 0.217, "grad_norm": 0.4356984496116638, "learning_rate": 7.794961624562892e-05, "loss": 1.5474672317504883, "step": 2170 }, { "epoch": 0.218, "grad_norm": 0.4773666560649872, "learning_rate": 7.786784664714808e-05, "loss": 1.52636775970459, "step": 2180 }, { "epoch": 0.219, "grad_norm": 0.4520673453807831, "learning_rate": 7.77857200635459e-05, "loss": 1.4999986648559571, "step": 2190 }, { "epoch": 0.22, "grad_norm": 0.45495954155921936, "learning_rate": 7.770323733880007e-05, "loss": 1.5395520210266114, "step": 2200 }, { "epoch": 0.221, "grad_norm": 0.46883487701416016, "learning_rate": 7.762039932054823e-05, "loss": 1.553999900817871, "step": 2210 }, { "epoch": 0.222, "grad_norm": 0.4731461703777313, "learning_rate": 7.753720686007916e-05, "loss": 1.5087655067443848, "step": 2220 }, { "epoch": 0.223, "grad_norm": 0.43786755204200745, "learning_rate": 7.745366081232414e-05, "loss": 1.5776697158813477, "step": 2230 }, { "epoch": 0.224, "grad_norm": 0.44203466176986694, "learning_rate": 7.736976203584805e-05, "loss": 1.5327131271362304, "step": 2240 }, { "epoch": 0.225, "grad_norm": 0.4511767625808716, "learning_rate": 7.728551139284063e-05, "loss": 1.5300714492797851, "step": 2250 }, { "epoch": 0.226, "grad_norm": 0.4910801649093628, "learning_rate": 7.720090974910761e-05, "loss": 1.4961822509765625, "step": 2260 }, { "epoch": 0.227, "grad_norm": 0.46052250266075134, "learning_rate": 7.711595797406177e-05, "loss": 1.5509103775024413, "step": 2270 }, { "epoch": 0.228, "grad_norm": 0.45214343070983887, "learning_rate": 7.703065694071401e-05, "loss": 1.528483772277832, "step": 2280 }, { "epoch": 0.229, "grad_norm": 0.46709343791007996, "learning_rate": 7.694500752566447e-05, "loss": 1.5288267135620117, "step": 2290 }, { "epoch": 0.23, "grad_norm": 0.45310071110725403, "learning_rate": 7.685901060909339e-05, "loss": 1.5159454345703125, "step": 2300 }, { "epoch": 0.231, "grad_norm": 0.4652255177497864, "learning_rate": 7.677266707475213e-05, "loss": 1.5301589012145995, "step": 2310 }, { "epoch": 0.232, "grad_norm": 0.45159515738487244, "learning_rate": 7.66859778099541e-05, "loss": 1.5472338676452637, "step": 2320 }, { "epoch": 0.233, "grad_norm": 0.4177556335926056, "learning_rate": 7.659894370556565e-05, "loss": 1.519093132019043, "step": 2330 }, { "epoch": 0.234, "grad_norm": 0.429817795753479, "learning_rate": 7.651156565599683e-05, "loss": 1.497427272796631, "step": 2340 }, { "epoch": 0.235, "grad_norm": 0.43000954389572144, "learning_rate": 7.642384455919233e-05, "loss": 1.4777793884277344, "step": 2350 }, { "epoch": 0.236, "grad_norm": 0.43708905577659607, "learning_rate": 7.63357813166221e-05, "loss": 1.5647817611694337, "step": 2360 }, { "epoch": 0.237, "grad_norm": 0.45687180757522583, "learning_rate": 7.624737683327225e-05, "loss": 1.4947479248046875, "step": 2370 }, { "epoch": 0.238, "grad_norm": 0.4629661440849304, "learning_rate": 7.615863201763559e-05, "loss": 1.5427149772644042, "step": 2380 }, { "epoch": 0.239, "grad_norm": 0.46899276971817017, "learning_rate": 7.606954778170244e-05, "loss": 1.5839698791503907, "step": 2390 }, { "epoch": 0.24, "grad_norm": 0.4606948494911194, "learning_rate": 7.598012504095115e-05, "loss": 1.541771697998047, "step": 2400 }, { "epoch": 0.241, "grad_norm": 0.45212459564208984, "learning_rate": 7.58903647143387e-05, "loss": 1.5472356796264648, "step": 2410 }, { "epoch": 0.242, "grad_norm": 0.45408064126968384, "learning_rate": 7.58002677242914e-05, "loss": 1.5932246208190919, "step": 2420 }, { "epoch": 0.243, "grad_norm": 0.48358407616615295, "learning_rate": 7.570983499669515e-05, "loss": 1.5044713973999024, "step": 2430 }, { "epoch": 0.244, "grad_norm": 0.44570690393447876, "learning_rate": 7.561906746088618e-05, "loss": 1.5048210144042968, "step": 2440 }, { "epoch": 0.245, "grad_norm": 0.4519187808036804, "learning_rate": 7.552796604964136e-05, "loss": 1.523080539703369, "step": 2450 }, { "epoch": 0.246, "grad_norm": 0.44542405009269714, "learning_rate": 7.543653169916864e-05, "loss": 1.5862494468688966, "step": 2460 }, { "epoch": 0.247, "grad_norm": 0.43162643909454346, "learning_rate": 7.534476534909743e-05, "loss": 1.553419303894043, "step": 2470 }, { "epoch": 0.248, "grad_norm": 0.4649028778076172, "learning_rate": 7.525266794246896e-05, "loss": 1.549135398864746, "step": 2480 }, { "epoch": 0.249, "grad_norm": 0.454800009727478, "learning_rate": 7.51602404257266e-05, "loss": 1.5687236785888672, "step": 2490 }, { "epoch": 0.25, "grad_norm": 0.4387585520744324, "learning_rate": 7.506748374870607e-05, "loss": 1.4585626602172852, "step": 2500 }, { "epoch": 0.251, "grad_norm": 0.44236883521080017, "learning_rate": 7.497439886462573e-05, "loss": 1.5203914642333984, "step": 2510 }, { "epoch": 0.252, "grad_norm": 0.4657570719718933, "learning_rate": 7.488098673007681e-05, "loss": 1.5481887817382813, "step": 2520 }, { "epoch": 0.253, "grad_norm": 0.4656415283679962, "learning_rate": 7.478724830501353e-05, "loss": 1.5086076736450196, "step": 2530 }, { "epoch": 0.254, "grad_norm": 0.4566405713558197, "learning_rate": 7.469318455274322e-05, "loss": 1.497421932220459, "step": 2540 }, { "epoch": 0.255, "grad_norm": 0.4292711019515991, "learning_rate": 7.459879643991647e-05, "loss": 1.5872754096984862, "step": 2550 }, { "epoch": 0.256, "grad_norm": 0.4766315519809723, "learning_rate": 7.450408493651717e-05, "loss": 1.4764188766479491, "step": 2560 }, { "epoch": 0.257, "grad_norm": 0.4654238820075989, "learning_rate": 7.440905101585258e-05, "loss": 1.5425111770629882, "step": 2570 }, { "epoch": 1.0003, "grad_norm": 0.42733854055404663, "learning_rate": 7.431369565454328e-05, "loss": 1.7178598403930665, "step": 2580 }, { "epoch": 1.0013, "grad_norm": 0.4512520730495453, "learning_rate": 7.421801983251313e-05, "loss": 1.434187126159668, "step": 2590 }, { "epoch": 1.0023, "grad_norm": 0.45792531967163086, "learning_rate": 7.412202453297922e-05, "loss": 1.5031817436218262, "step": 2600 }, { "epoch": 1.0033, "grad_norm": 0.4429328739643097, "learning_rate": 7.402571074244179e-05, "loss": 1.4843154907226563, "step": 2610 }, { "epoch": 1.0043, "grad_norm": 0.43615370988845825, "learning_rate": 7.392907945067407e-05, "loss": 1.4864605903625487, "step": 2620 }, { "epoch": 1.0053, "grad_norm": 0.469474196434021, "learning_rate": 7.383213165071205e-05, "loss": 1.4709732055664062, "step": 2630 }, { "epoch": 1.0063, "grad_norm": 0.45088157057762146, "learning_rate": 7.373486833884443e-05, "loss": 1.4104438781738282, "step": 2640 }, { "epoch": 1.0073, "grad_norm": 0.458600252866745, "learning_rate": 7.363729051460219e-05, "loss": 1.4738828659057617, "step": 2650 }, { "epoch": 1.0083, "grad_norm": 0.46691879630088806, "learning_rate": 7.353939918074846e-05, "loss": 1.4707263946533202, "step": 2660 }, { "epoch": 1.0093, "grad_norm": 0.46489620208740234, "learning_rate": 7.344119534326815e-05, "loss": 1.3846729278564454, "step": 2670 }, { "epoch": 1.0103, "grad_norm": 0.44752174615859985, "learning_rate": 7.334268001135764e-05, "loss": 1.4604171752929687, "step": 2680 }, { "epoch": 1.0113, "grad_norm": 0.4594590365886688, "learning_rate": 7.324385419741438e-05, "loss": 1.4276948928833009, "step": 2690 }, { "epoch": 1.0123, "grad_norm": 0.4594690501689911, "learning_rate": 7.314471891702651e-05, "loss": 1.3739115715026855, "step": 2700 }, { "epoch": 1.0133, "grad_norm": 0.4549918472766876, "learning_rate": 7.304527518896245e-05, "loss": 1.4289373397827148, "step": 2710 }, { "epoch": 1.0143, "grad_norm": 0.46599602699279785, "learning_rate": 7.294552403516034e-05, "loss": 1.447694969177246, "step": 2720 }, { "epoch": 1.0153, "grad_norm": 0.4508320987224579, "learning_rate": 7.284546648071763e-05, "loss": 1.4010438919067383, "step": 2730 }, { "epoch": 1.0163, "grad_norm": 0.4645399749279022, "learning_rate": 7.274510355388049e-05, "loss": 1.3724050521850586, "step": 2740 }, { "epoch": 1.0173, "grad_norm": 0.5009335875511169, "learning_rate": 7.26444362860333e-05, "loss": 1.390232467651367, "step": 2750 }, { "epoch": 1.0183, "grad_norm": 0.4698573648929596, "learning_rate": 7.254346571168798e-05, "loss": 1.3995256423950195, "step": 2760 }, { "epoch": 1.0193, "grad_norm": 0.46238020062446594, "learning_rate": 7.24421928684734e-05, "loss": 1.3738612174987792, "step": 2770 }, { "epoch": 1.0203, "grad_norm": 0.45451462268829346, "learning_rate": 7.234061879712472e-05, "loss": 1.3291207313537599, "step": 2780 }, { "epoch": 1.0213, "grad_norm": 0.47129520773887634, "learning_rate": 7.223874454147269e-05, "loss": 1.356761646270752, "step": 2790 }, { "epoch": 1.0223, "grad_norm": 0.4773348569869995, "learning_rate": 7.213657114843287e-05, "loss": 1.3456995010375976, "step": 2800 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.7514887942478234e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }