{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 1450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006896551724137931, "grad_norm": 0.23170334100723267, "learning_rate": 4.0909090909090915e-05, "loss": 3.881548309326172, "step": 10 }, { "epoch": 0.013793103448275862, "grad_norm": 0.2462630271911621, "learning_rate": 8.636363636363637e-05, "loss": 3.794137954711914, "step": 20 }, { "epoch": 0.020689655172413793, "grad_norm": 0.2365545630455017, "learning_rate": 0.0001318181818181818, "loss": 3.7382652282714846, "step": 30 }, { "epoch": 0.027586206896551724, "grad_norm": 0.2707105576992035, "learning_rate": 0.00017727272727272728, "loss": 3.7427467346191405, "step": 40 }, { "epoch": 0.034482758620689655, "grad_norm": 0.24755217134952545, "learning_rate": 0.0001992887624466572, "loss": 3.61156005859375, "step": 50 }, { "epoch": 0.041379310344827586, "grad_norm": 0.3027788996696472, "learning_rate": 0.00019786628733997158, "loss": 3.653477096557617, "step": 60 }, { "epoch": 0.04827586206896552, "grad_norm": 0.2786545753479004, "learning_rate": 0.00019644381223328592, "loss": 3.6743812561035156, "step": 70 }, { "epoch": 0.05517241379310345, "grad_norm": 0.2662774622440338, "learning_rate": 0.0001950213371266003, "loss": 3.5414581298828125, "step": 80 }, { "epoch": 0.06206896551724138, "grad_norm": 0.30465996265411377, "learning_rate": 0.00019359886201991466, "loss": 3.5396636962890624, "step": 90 }, { "epoch": 0.06896551724137931, "grad_norm": 0.26726341247558594, "learning_rate": 0.00019217638691322903, "loss": 3.56505126953125, "step": 100 }, { "epoch": 0.06896551724137931, "eval_loss": 3.540494680404663, "eval_runtime": 20.7606, "eval_samples_per_second": 59.68, "eval_steps_per_second": 7.466, "step": 100 }, { "epoch": 0.07586206896551724, "grad_norm": 0.28638505935668945, "learning_rate": 0.00019075391180654338, "loss": 3.5479259490966797, "step": 110 }, { "epoch": 0.08275862068965517, "grad_norm": 0.26623088121414185, "learning_rate": 0.00018933143669985775, "loss": 3.538827133178711, "step": 120 }, { "epoch": 0.0896551724137931, "grad_norm": 0.3132971525192261, "learning_rate": 0.00018790896159317212, "loss": 3.500360107421875, "step": 130 }, { "epoch": 0.09655172413793103, "grad_norm": 0.2965874969959259, "learning_rate": 0.0001864864864864865, "loss": 3.5192401885986326, "step": 140 }, { "epoch": 0.10344827586206896, "grad_norm": 0.2784412205219269, "learning_rate": 0.00018506401137980089, "loss": 3.6045772552490236, "step": 150 }, { "epoch": 0.1103448275862069, "grad_norm": 0.3488187789916992, "learning_rate": 0.00018364153627311523, "loss": 3.497270202636719, "step": 160 }, { "epoch": 0.11724137931034483, "grad_norm": 0.2777283191680908, "learning_rate": 0.0001822190611664296, "loss": 3.417000961303711, "step": 170 }, { "epoch": 0.12413793103448276, "grad_norm": 0.3144644796848297, "learning_rate": 0.00018079658605974397, "loss": 3.5388118743896486, "step": 180 }, { "epoch": 0.1310344827586207, "grad_norm": 0.32053834199905396, "learning_rate": 0.00017937411095305834, "loss": 3.4628257751464844, "step": 190 }, { "epoch": 0.13793103448275862, "grad_norm": 0.3077249228954315, "learning_rate": 0.00017795163584637268, "loss": 3.456898498535156, "step": 200 }, { "epoch": 0.13793103448275862, "eval_loss": 3.459949016571045, "eval_runtime": 23.2077, "eval_samples_per_second": 53.388, "eval_steps_per_second": 6.679, "step": 200 }, { "epoch": 0.14482758620689656, "grad_norm": 0.3117501139640808, "learning_rate": 0.00017652916073968705, "loss": 3.4756256103515626, "step": 210 }, { "epoch": 0.15172413793103448, "grad_norm": 0.31221652030944824, "learning_rate": 0.00017510668563300142, "loss": 3.4557666778564453, "step": 220 }, { "epoch": 0.15862068965517243, "grad_norm": 0.30920568108558655, "learning_rate": 0.0001736842105263158, "loss": 3.5030059814453125, "step": 230 }, { "epoch": 0.16551724137931034, "grad_norm": 0.3118240535259247, "learning_rate": 0.00017226173541963016, "loss": 3.463837814331055, "step": 240 }, { "epoch": 0.1724137931034483, "grad_norm": 0.29713189601898193, "learning_rate": 0.00017083926031294454, "loss": 3.482789993286133, "step": 250 }, { "epoch": 0.1793103448275862, "grad_norm": 0.31772103905677795, "learning_rate": 0.0001694167852062589, "loss": 3.4497989654541015, "step": 260 }, { "epoch": 0.18620689655172415, "grad_norm": 0.31749677658081055, "learning_rate": 0.00016799431009957328, "loss": 3.426702880859375, "step": 270 }, { "epoch": 0.19310344827586207, "grad_norm": 0.3107665479183197, "learning_rate": 0.00016657183499288765, "loss": 3.396825408935547, "step": 280 }, { "epoch": 0.2, "grad_norm": 0.32543718814849854, "learning_rate": 0.000165149359886202, "loss": 3.4777637481689454, "step": 290 }, { "epoch": 0.20689655172413793, "grad_norm": 0.3045833110809326, "learning_rate": 0.00016372688477951636, "loss": 3.417892074584961, "step": 300 }, { "epoch": 0.20689655172413793, "eval_loss": 3.4208664894104004, "eval_runtime": 22.6365, "eval_samples_per_second": 54.735, "eval_steps_per_second": 6.847, "step": 300 }, { "epoch": 0.21379310344827587, "grad_norm": 0.3320230543613434, "learning_rate": 0.00016230440967283073, "loss": 3.4840187072753905, "step": 310 }, { "epoch": 0.2206896551724138, "grad_norm": 0.30821651220321655, "learning_rate": 0.0001608819345661451, "loss": 3.362594985961914, "step": 320 }, { "epoch": 0.22758620689655173, "grad_norm": 0.330126017332077, "learning_rate": 0.00015945945945945947, "loss": 3.4734111785888673, "step": 330 }, { "epoch": 0.23448275862068965, "grad_norm": 0.30710867047309875, "learning_rate": 0.00015803698435277384, "loss": 3.4003364562988283, "step": 340 }, { "epoch": 0.2413793103448276, "grad_norm": 0.30796217918395996, "learning_rate": 0.0001566145092460882, "loss": 3.4497623443603516, "step": 350 }, { "epoch": 0.2482758620689655, "grad_norm": 0.31471186876296997, "learning_rate": 0.00015519203413940258, "loss": 3.410964584350586, "step": 360 }, { "epoch": 0.25517241379310346, "grad_norm": 0.31033286452293396, "learning_rate": 0.00015376955903271693, "loss": 3.4347129821777345, "step": 370 }, { "epoch": 0.2620689655172414, "grad_norm": 0.32137277722358704, "learning_rate": 0.0001523470839260313, "loss": 3.4425697326660156, "step": 380 }, { "epoch": 0.2689655172413793, "grad_norm": 0.3627667725086212, "learning_rate": 0.00015092460881934567, "loss": 3.3625614166259767, "step": 390 }, { "epoch": 0.27586206896551724, "grad_norm": 0.3407364785671234, "learning_rate": 0.00014950213371266004, "loss": 3.3849735260009766, "step": 400 }, { "epoch": 0.27586206896551724, "eval_loss": 3.3975093364715576, "eval_runtime": 24.3998, "eval_samples_per_second": 50.779, "eval_steps_per_second": 6.353, "step": 400 }, { "epoch": 0.2827586206896552, "grad_norm": 0.32097798585891724, "learning_rate": 0.00014807965860597438, "loss": 3.417188262939453, "step": 410 }, { "epoch": 0.2896551724137931, "grad_norm": 0.33030977845191956, "learning_rate": 0.00014665718349928875, "loss": 3.429982376098633, "step": 420 }, { "epoch": 0.296551724137931, "grad_norm": 0.3301655054092407, "learning_rate": 0.00014523470839260315, "loss": 3.338633728027344, "step": 430 }, { "epoch": 0.30344827586206896, "grad_norm": 0.32900184392929077, "learning_rate": 0.00014381223328591752, "loss": 3.376237487792969, "step": 440 }, { "epoch": 0.3103448275862069, "grad_norm": 0.3433472812175751, "learning_rate": 0.0001423897581792319, "loss": 3.3466358184814453, "step": 450 }, { "epoch": 0.31724137931034485, "grad_norm": 0.31025466322898865, "learning_rate": 0.00014096728307254623, "loss": 3.371001052856445, "step": 460 }, { "epoch": 0.32413793103448274, "grad_norm": 0.3327469527721405, "learning_rate": 0.0001395448079658606, "loss": 3.355500411987305, "step": 470 }, { "epoch": 0.3310344827586207, "grad_norm": 0.34813839197158813, "learning_rate": 0.00013812233285917497, "loss": 3.3663055419921877, "step": 480 }, { "epoch": 0.33793103448275863, "grad_norm": 0.35365816950798035, "learning_rate": 0.00013669985775248934, "loss": 3.4172080993652343, "step": 490 }, { "epoch": 0.3448275862068966, "grad_norm": 0.31251364946365356, "learning_rate": 0.0001352773826458037, "loss": 3.334903335571289, "step": 500 }, { "epoch": 0.3448275862068966, "eval_loss": 3.377959728240967, "eval_runtime": 22.4366, "eval_samples_per_second": 55.222, "eval_steps_per_second": 6.908, "step": 500 }, { "epoch": 0.35172413793103446, "grad_norm": 0.3221539855003357, "learning_rate": 0.00013385490753911806, "loss": 3.3751365661621096, "step": 510 }, { "epoch": 0.3586206896551724, "grad_norm": 0.31918609142303467, "learning_rate": 0.00013243243243243243, "loss": 3.360042953491211, "step": 520 }, { "epoch": 0.36551724137931035, "grad_norm": 0.3304445445537567, "learning_rate": 0.00013100995732574682, "loss": 3.355394744873047, "step": 530 }, { "epoch": 0.3724137931034483, "grad_norm": 0.31707221269607544, "learning_rate": 0.00012958748221906117, "loss": 3.3813400268554688, "step": 540 }, { "epoch": 0.3793103448275862, "grad_norm": 0.3358207643032074, "learning_rate": 0.00012816500711237554, "loss": 3.4241260528564452, "step": 550 }, { "epoch": 0.38620689655172413, "grad_norm": 0.3196071982383728, "learning_rate": 0.0001267425320056899, "loss": 3.3610572814941406, "step": 560 }, { "epoch": 0.3931034482758621, "grad_norm": 0.31611961126327515, "learning_rate": 0.00012532005689900428, "loss": 3.328931427001953, "step": 570 }, { "epoch": 0.4, "grad_norm": 0.33409208059310913, "learning_rate": 0.00012389758179231865, "loss": 3.32372932434082, "step": 580 }, { "epoch": 0.4068965517241379, "grad_norm": 0.322489470243454, "learning_rate": 0.000122475106685633, "loss": 3.389539337158203, "step": 590 }, { "epoch": 0.41379310344827586, "grad_norm": 0.3401939272880554, "learning_rate": 0.00012105263157894738, "loss": 3.292881393432617, "step": 600 }, { "epoch": 0.41379310344827586, "eval_loss": 3.3636481761932373, "eval_runtime": 24.5094, "eval_samples_per_second": 50.552, "eval_steps_per_second": 6.324, "step": 600 }, { "epoch": 0.4206896551724138, "grad_norm": 0.36831986904144287, "learning_rate": 0.00011963015647226175, "loss": 3.3248523712158202, "step": 610 }, { "epoch": 0.42758620689655175, "grad_norm": 0.31736257672309875, "learning_rate": 0.00011820768136557612, "loss": 3.328786849975586, "step": 620 }, { "epoch": 0.43448275862068964, "grad_norm": 0.3393501341342926, "learning_rate": 0.00011678520625889046, "loss": 3.3191741943359374, "step": 630 }, { "epoch": 0.4413793103448276, "grad_norm": 0.3327409327030182, "learning_rate": 0.00011536273115220485, "loss": 3.4381561279296875, "step": 640 }, { "epoch": 0.4482758620689655, "grad_norm": 0.32990631461143494, "learning_rate": 0.00011394025604551922, "loss": 3.4140262603759766, "step": 650 }, { "epoch": 0.45517241379310347, "grad_norm": 0.3171171247959137, "learning_rate": 0.00011251778093883359, "loss": 3.358592987060547, "step": 660 }, { "epoch": 0.46206896551724136, "grad_norm": 0.319813072681427, "learning_rate": 0.00011109530583214793, "loss": 3.3165565490722657, "step": 670 }, { "epoch": 0.4689655172413793, "grad_norm": 0.3260372579097748, "learning_rate": 0.0001096728307254623, "loss": 3.353110122680664, "step": 680 }, { "epoch": 0.47586206896551725, "grad_norm": 0.3186911642551422, "learning_rate": 0.00010825035561877668, "loss": 3.4071842193603517, "step": 690 }, { "epoch": 0.4827586206896552, "grad_norm": 0.3407030701637268, "learning_rate": 0.00010682788051209105, "loss": 3.3047447204589844, "step": 700 }, { "epoch": 0.4827586206896552, "eval_loss": 3.353717803955078, "eval_runtime": 23.3132, "eval_samples_per_second": 53.146, "eval_steps_per_second": 6.649, "step": 700 }, { "epoch": 0.4896551724137931, "grad_norm": 0.34808802604675293, "learning_rate": 0.0001054054054054054, "loss": 3.3398147583007813, "step": 710 }, { "epoch": 0.496551724137931, "grad_norm": 0.31498315930366516, "learning_rate": 0.00010398293029871977, "loss": 3.3216724395751953, "step": 720 }, { "epoch": 0.503448275862069, "grad_norm": 0.32081830501556396, "learning_rate": 0.00010256045519203414, "loss": 3.3753803253173826, "step": 730 }, { "epoch": 0.5103448275862069, "grad_norm": 0.38737478852272034, "learning_rate": 0.00010113798008534852, "loss": 3.347806930541992, "step": 740 }, { "epoch": 0.5172413793103449, "grad_norm": 0.3532518744468689, "learning_rate": 9.971550497866288e-05, "loss": 3.3405616760253904, "step": 750 }, { "epoch": 0.5241379310344828, "grad_norm": 0.3295048773288727, "learning_rate": 9.829302987197725e-05, "loss": 3.3597396850585937, "step": 760 }, { "epoch": 0.5310344827586206, "grad_norm": 0.3602186441421509, "learning_rate": 9.68705547652916e-05, "loss": 3.3083240509033205, "step": 770 }, { "epoch": 0.5379310344827586, "grad_norm": 0.3464964032173157, "learning_rate": 9.544807965860598e-05, "loss": 3.3121707916259764, "step": 780 }, { "epoch": 0.5448275862068965, "grad_norm": 0.314314067363739, "learning_rate": 9.402560455192035e-05, "loss": 3.3149440765380858, "step": 790 }, { "epoch": 0.5517241379310345, "grad_norm": 0.3291971683502197, "learning_rate": 9.260312944523472e-05, "loss": 3.3775871276855467, "step": 800 }, { "epoch": 0.5517241379310345, "eval_loss": 3.3451411724090576, "eval_runtime": 22.871, "eval_samples_per_second": 54.173, "eval_steps_per_second": 6.777, "step": 800 }, { "epoch": 0.5586206896551724, "grad_norm": 0.33278515934944153, "learning_rate": 9.118065433854907e-05, "loss": 3.348500061035156, "step": 810 }, { "epoch": 0.5655172413793104, "grad_norm": 0.32254090905189514, "learning_rate": 8.975817923186344e-05, "loss": 3.289051818847656, "step": 820 }, { "epoch": 0.5724137931034483, "grad_norm": 0.37034258246421814, "learning_rate": 8.833570412517781e-05, "loss": 3.3568161010742186, "step": 830 }, { "epoch": 0.5793103448275863, "grad_norm": 0.3335118889808655, "learning_rate": 8.691322901849219e-05, "loss": 3.388734817504883, "step": 840 }, { "epoch": 0.5862068965517241, "grad_norm": 0.321696013212204, "learning_rate": 8.549075391180654e-05, "loss": 3.225112533569336, "step": 850 }, { "epoch": 0.593103448275862, "grad_norm": 0.32803264260292053, "learning_rate": 8.406827880512091e-05, "loss": 3.3828250885009767, "step": 860 }, { "epoch": 0.6, "grad_norm": 0.32728055119514465, "learning_rate": 8.264580369843528e-05, "loss": 3.382917022705078, "step": 870 }, { "epoch": 0.6068965517241379, "grad_norm": 0.3484093248844147, "learning_rate": 8.122332859174965e-05, "loss": 3.3160984039306642, "step": 880 }, { "epoch": 0.6137931034482759, "grad_norm": 0.3902784585952759, "learning_rate": 7.980085348506402e-05, "loss": 3.366690444946289, "step": 890 }, { "epoch": 0.6206896551724138, "grad_norm": 0.32276031374931335, "learning_rate": 7.837837837837838e-05, "loss": 3.2556941986083983, "step": 900 }, { "epoch": 0.6206896551724138, "eval_loss": 3.337270736694336, "eval_runtime": 23.3962, "eval_samples_per_second": 52.957, "eval_steps_per_second": 6.625, "step": 900 }, { "epoch": 0.6275862068965518, "grad_norm": 0.36281818151474, "learning_rate": 7.695590327169275e-05, "loss": 3.4061851501464844, "step": 910 }, { "epoch": 0.6344827586206897, "grad_norm": 0.3139365017414093, "learning_rate": 7.553342816500711e-05, "loss": 3.2938968658447267, "step": 920 }, { "epoch": 0.6413793103448275, "grad_norm": 0.33926886320114136, "learning_rate": 7.411095305832149e-05, "loss": 3.3076290130615233, "step": 930 }, { "epoch": 0.6482758620689655, "grad_norm": 0.3455406427383423, "learning_rate": 7.268847795163585e-05, "loss": 3.338056182861328, "step": 940 }, { "epoch": 0.6551724137931034, "grad_norm": 0.3547625243663788, "learning_rate": 7.126600284495022e-05, "loss": 3.3874538421630858, "step": 950 }, { "epoch": 0.6620689655172414, "grad_norm": 0.34468552470207214, "learning_rate": 6.984352773826458e-05, "loss": 3.35147705078125, "step": 960 }, { "epoch": 0.6689655172413793, "grad_norm": 0.3656456470489502, "learning_rate": 6.842105263157895e-05, "loss": 3.415922164916992, "step": 970 }, { "epoch": 0.6758620689655173, "grad_norm": 0.34468477964401245, "learning_rate": 6.699857752489332e-05, "loss": 3.3692134857177733, "step": 980 }, { "epoch": 0.6827586206896552, "grad_norm": 0.3500272333621979, "learning_rate": 6.557610241820769e-05, "loss": 3.371417999267578, "step": 990 }, { "epoch": 0.6896551724137931, "grad_norm": 0.3438541889190674, "learning_rate": 6.415362731152204e-05, "loss": 3.4223506927490233, "step": 1000 }, { "epoch": 0.6896551724137931, "eval_loss": 3.330833911895752, "eval_runtime": 22.3929, "eval_samples_per_second": 55.33, "eval_steps_per_second": 6.922, "step": 1000 }, { "epoch": 0.696551724137931, "grad_norm": 0.33815649151802063, "learning_rate": 6.273115220483641e-05, "loss": 3.3118003845214843, "step": 1010 }, { "epoch": 0.7034482758620689, "grad_norm": 0.3285435438156128, "learning_rate": 6.130867709815078e-05, "loss": 3.3082767486572267, "step": 1020 }, { "epoch": 0.7103448275862069, "grad_norm": 0.3286275863647461, "learning_rate": 5.988620199146515e-05, "loss": 3.373445510864258, "step": 1030 }, { "epoch": 0.7172413793103448, "grad_norm": 0.3484683334827423, "learning_rate": 5.8463726884779526e-05, "loss": 3.3201057434082033, "step": 1040 }, { "epoch": 0.7241379310344828, "grad_norm": 0.37690791487693787, "learning_rate": 5.704125177809388e-05, "loss": 3.322885513305664, "step": 1050 }, { "epoch": 0.7310344827586207, "grad_norm": 0.3458273112773895, "learning_rate": 5.561877667140826e-05, "loss": 3.3502052307128904, "step": 1060 }, { "epoch": 0.7379310344827587, "grad_norm": 0.3618911802768707, "learning_rate": 5.4196301564722616e-05, "loss": 3.3665504455566406, "step": 1070 }, { "epoch": 0.7448275862068966, "grad_norm": 0.34324246644973755, "learning_rate": 5.277382645803699e-05, "loss": 3.318630599975586, "step": 1080 }, { "epoch": 0.7517241379310344, "grad_norm": 0.3743279278278351, "learning_rate": 5.135135135135135e-05, "loss": 3.2997642517089845, "step": 1090 }, { "epoch": 0.7586206896551724, "grad_norm": 0.3348490595817566, "learning_rate": 4.992887624466572e-05, "loss": 3.194792556762695, "step": 1100 }, { "epoch": 0.7586206896551724, "eval_loss": 3.3259100914001465, "eval_runtime": 22.2447, "eval_samples_per_second": 55.699, "eval_steps_per_second": 6.968, "step": 1100 }, { "epoch": 0.7655172413793103, "grad_norm": 0.33868151903152466, "learning_rate": 4.850640113798009e-05, "loss": 3.346274566650391, "step": 1110 }, { "epoch": 0.7724137931034483, "grad_norm": 0.3498711585998535, "learning_rate": 4.7083926031294455e-05, "loss": 3.323177719116211, "step": 1120 }, { "epoch": 0.7793103448275862, "grad_norm": 0.3602657914161682, "learning_rate": 4.5661450924608825e-05, "loss": 3.273370361328125, "step": 1130 }, { "epoch": 0.7862068965517242, "grad_norm": 0.34091508388519287, "learning_rate": 4.423897581792319e-05, "loss": 3.288585662841797, "step": 1140 }, { "epoch": 0.7931034482758621, "grad_norm": 0.35901182889938354, "learning_rate": 4.281650071123756e-05, "loss": 3.3729190826416016, "step": 1150 }, { "epoch": 0.8, "grad_norm": 0.33599621057510376, "learning_rate": 4.139402560455192e-05, "loss": 3.355024719238281, "step": 1160 }, { "epoch": 0.8068965517241379, "grad_norm": 0.38110241293907166, "learning_rate": 3.997155049786629e-05, "loss": 3.343807601928711, "step": 1170 }, { "epoch": 0.8137931034482758, "grad_norm": 0.34958431124687195, "learning_rate": 3.854907539118066e-05, "loss": 3.272492218017578, "step": 1180 }, { "epoch": 0.8206896551724138, "grad_norm": 0.3552829623222351, "learning_rate": 3.712660028449502e-05, "loss": 3.3572948455810545, "step": 1190 }, { "epoch": 0.8275862068965517, "grad_norm": 0.322081595659256, "learning_rate": 3.570412517780939e-05, "loss": 3.3037082672119142, "step": 1200 }, { "epoch": 0.8275862068965517, "eval_loss": 3.3224334716796875, "eval_runtime": 22.2507, "eval_samples_per_second": 55.684, "eval_steps_per_second": 6.966, "step": 1200 }, { "epoch": 0.8344827586206897, "grad_norm": 0.35375288128852844, "learning_rate": 3.4281650071123755e-05, "loss": 3.2933795928955076, "step": 1210 }, { "epoch": 0.8413793103448276, "grad_norm": 0.35284116864204407, "learning_rate": 3.2859174964438125e-05, "loss": 3.3349658966064455, "step": 1220 }, { "epoch": 0.8482758620689655, "grad_norm": 0.36195898056030273, "learning_rate": 3.143669985775249e-05, "loss": 3.3784534454345705, "step": 1230 }, { "epoch": 0.8551724137931035, "grad_norm": 0.3708537518978119, "learning_rate": 3.0014224751066856e-05, "loss": 3.3437496185302735, "step": 1240 }, { "epoch": 0.8620689655172413, "grad_norm": 0.32489216327667236, "learning_rate": 2.8591749644381226e-05, "loss": 3.276387023925781, "step": 1250 }, { "epoch": 0.8689655172413793, "grad_norm": 0.3359311819076538, "learning_rate": 2.7169274537695593e-05, "loss": 3.2540233612060545, "step": 1260 }, { "epoch": 0.8758620689655172, "grad_norm": 0.40804561972618103, "learning_rate": 2.574679943100996e-05, "loss": 3.2611133575439455, "step": 1270 }, { "epoch": 0.8827586206896552, "grad_norm": 0.3684781491756439, "learning_rate": 2.4324324324324327e-05, "loss": 3.362625503540039, "step": 1280 }, { "epoch": 0.8896551724137931, "grad_norm": 0.38623297214508057, "learning_rate": 2.2901849217638694e-05, "loss": 3.302141571044922, "step": 1290 }, { "epoch": 0.896551724137931, "grad_norm": 0.3602025508880615, "learning_rate": 2.147937411095306e-05, "loss": 3.3853092193603516, "step": 1300 }, { "epoch": 0.896551724137931, "eval_loss": 3.319241762161255, "eval_runtime": 22.5951, "eval_samples_per_second": 54.835, "eval_steps_per_second": 6.86, "step": 1300 }, { "epoch": 0.903448275862069, "grad_norm": 0.3617671728134155, "learning_rate": 2.0056899004267428e-05, "loss": 3.3196762084960936, "step": 1310 }, { "epoch": 0.9103448275862069, "grad_norm": 0.3671157956123352, "learning_rate": 1.8634423897581792e-05, "loss": 3.358323669433594, "step": 1320 }, { "epoch": 0.9172413793103448, "grad_norm": 0.3617306053638458, "learning_rate": 1.721194879089616e-05, "loss": 3.2942028045654297, "step": 1330 }, { "epoch": 0.9241379310344827, "grad_norm": 0.3539746403694153, "learning_rate": 1.5789473684210526e-05, "loss": 3.332453155517578, "step": 1340 }, { "epoch": 0.9310344827586207, "grad_norm": 0.34931978583335876, "learning_rate": 1.4366998577524893e-05, "loss": 3.304658889770508, "step": 1350 }, { "epoch": 0.9379310344827586, "grad_norm": 0.33509236574172974, "learning_rate": 1.2944523470839262e-05, "loss": 3.365464782714844, "step": 1360 }, { "epoch": 0.9448275862068966, "grad_norm": 0.36600831151008606, "learning_rate": 1.1522048364153627e-05, "loss": 3.357099914550781, "step": 1370 }, { "epoch": 0.9517241379310345, "grad_norm": 0.32806214690208435, "learning_rate": 1.0099573257467996e-05, "loss": 3.3524654388427733, "step": 1380 }, { "epoch": 0.9586206896551724, "grad_norm": 0.34161072969436646, "learning_rate": 8.677098150782363e-06, "loss": 3.266713333129883, "step": 1390 }, { "epoch": 0.9655172413793104, "grad_norm": 0.3262627124786377, "learning_rate": 7.254623044096729e-06, "loss": 3.2161521911621094, "step": 1400 }, { "epoch": 0.9655172413793104, "eval_loss": 3.3177387714385986, "eval_runtime": 22.491, "eval_samples_per_second": 55.089, "eval_steps_per_second": 6.892, "step": 1400 }, { "epoch": 0.9724137931034482, "grad_norm": 0.35331958532333374, "learning_rate": 5.832147937411096e-06, "loss": 3.3992801666259767, "step": 1410 }, { "epoch": 0.9793103448275862, "grad_norm": 0.35313528776168823, "learning_rate": 4.409672830725463e-06, "loss": 3.3636543273925783, "step": 1420 }, { "epoch": 0.9862068965517241, "grad_norm": 0.3474940359592438, "learning_rate": 2.9871977240398294e-06, "loss": 3.320618438720703, "step": 1430 }, { "epoch": 0.993103448275862, "grad_norm": 0.3796871304512024, "learning_rate": 1.5647226173541964e-06, "loss": 3.3694156646728515, "step": 1440 }, { "epoch": 1.0, "grad_norm": 0.37185928225517273, "learning_rate": 1.422475106685633e-07, "loss": 3.3201263427734373, "step": 1450 } ], "logging_steps": 10, "max_steps": 1450, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 494830102118400.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }