{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 25.0, "eval_steps": 500, "global_step": 39775, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06285355122564425, "grad_norm": 608.9674682617188, "learning_rate": 4.844437460716531e-05, "loss": 14.2524, "step": 100 }, { "epoch": 0.1257071024512885, "grad_norm": 34.65327453613281, "learning_rate": 4.6873035826524205e-05, "loss": 10.3562, "step": 200 }, { "epoch": 0.18856065367693275, "grad_norm": 21.24808120727539, "learning_rate": 4.5301697045883096e-05, "loss": 7.8551, "step": 300 }, { "epoch": 0.251414204902577, "grad_norm": 17.404918670654297, "learning_rate": 4.373035826524199e-05, "loss": 6.6346, "step": 400 }, { "epoch": 0.3142677561282212, "grad_norm": 12.713433265686035, "learning_rate": 4.2159019484600884e-05, "loss": 5.9755, "step": 500 }, { "epoch": 0.3771213073538655, "grad_norm": 10.050477981567383, "learning_rate": 4.0587680703959775e-05, "loss": 5.5595, "step": 600 }, { "epoch": 0.43997485857950974, "grad_norm": 13.709216117858887, "learning_rate": 3.9016341923318666e-05, "loss": 5.2853, "step": 700 }, { "epoch": 0.502828409805154, "grad_norm": 9.112940788269043, "learning_rate": 3.744500314267756e-05, "loss": 5.1417, "step": 800 }, { "epoch": 0.5656819610307983, "grad_norm": 8.267425537109375, "learning_rate": 3.587366436203646e-05, "loss": 4.9615, "step": 900 }, { "epoch": 0.6285355122564424, "grad_norm": 9.709076881408691, "learning_rate": 3.430232558139535e-05, "loss": 4.6907, "step": 1000 }, { "epoch": 0.6913890634820867, "grad_norm": 845.80859375, "learning_rate": 3.273098680075424e-05, "loss": 4.5456, "step": 1100 }, { "epoch": 0.754242614707731, "grad_norm": 5.943735599517822, "learning_rate": 3.115964802011313e-05, "loss": 4.4291, "step": 1200 }, { "epoch": 0.8170961659333752, "grad_norm": 5.8759989738464355, "learning_rate": 2.9588309239472034e-05, "loss": 4.3252, "step": 1300 }, { "epoch": 0.8799497171590195, "grad_norm": 14.995753288269043, "learning_rate": 2.8016970458830928e-05, "loss": 4.2586, "step": 1400 }, { "epoch": 0.9428032683846638, "grad_norm": 23.3351993560791, "learning_rate": 2.644563167818982e-05, "loss": 4.1372, "step": 1500 }, { "epoch": 1.0, "eval_loss": 3.215750217437744, "eval_runtime": 19.7611, "eval_samples_per_second": 48.479, "eval_steps_per_second": 6.073, "step": 1591 }, { "epoch": 1.005656819610308, "grad_norm": 8.584565162658691, "learning_rate": 2.4874292897548713e-05, "loss": 4.0272, "step": 1600 }, { "epoch": 1.0685103708359522, "grad_norm": 6.45043420791626, "learning_rate": 2.3302954116907607e-05, "loss": 3.9602, "step": 1700 }, { "epoch": 1.1313639220615965, "grad_norm": 6.03476095199585, "learning_rate": 2.17316153362665e-05, "loss": 3.9052, "step": 1800 }, { "epoch": 1.1942174732872408, "grad_norm": 5.746309280395508, "learning_rate": 2.0160276555625392e-05, "loss": 3.9282, "step": 1900 }, { "epoch": 1.2570710245128849, "grad_norm": 8.062549591064453, "learning_rate": 1.858893777498429e-05, "loss": 3.8096, "step": 2000 }, { "epoch": 1.3199245757385292, "grad_norm": 8.58310317993164, "learning_rate": 1.701759899434318e-05, "loss": 3.803, "step": 2100 }, { "epoch": 1.3827781269641735, "grad_norm": 7.599905490875244, "learning_rate": 1.5446260213702074e-05, "loss": 3.8381, "step": 2200 }, { "epoch": 1.4456316781898177, "grad_norm": 22.772512435913086, "learning_rate": 1.3874921433060969e-05, "loss": 3.6456, "step": 2300 }, { "epoch": 1.508485229415462, "grad_norm": 6.949570178985596, "learning_rate": 1.2303582652419863e-05, "loss": 3.7442, "step": 2400 }, { "epoch": 1.5713387806411063, "grad_norm": 5.7536821365356445, "learning_rate": 1.0732243871778757e-05, "loss": 3.691, "step": 2500 }, { "epoch": 1.6341923318667506, "grad_norm": 55.64060974121094, "learning_rate": 9.160905091137651e-06, "loss": 3.7461, "step": 2600 }, { "epoch": 1.6970458830923947, "grad_norm": 6.573077201843262, "learning_rate": 7.589566310496543e-06, "loss": 3.6186, "step": 2700 }, { "epoch": 1.759899434318039, "grad_norm": 8.615326881408691, "learning_rate": 6.018227529855437e-06, "loss": 3.6546, "step": 2800 }, { "epoch": 1.8227529855436833, "grad_norm": 6.359428405761719, "learning_rate": 4.446888749214331e-06, "loss": 3.5724, "step": 2900 }, { "epoch": 1.8856065367693273, "grad_norm": 5.5190582275390625, "learning_rate": 2.8755499685732243e-06, "loss": 3.6164, "step": 3000 }, { "epoch": 1.9484600879949716, "grad_norm": 5.9382004737854, "learning_rate": 1.3042111879321182e-06, "loss": 3.52, "step": 3100 }, { "epoch": 2.0, "eval_loss": 2.803544521331787, "eval_runtime": 19.8643, "eval_samples_per_second": 48.227, "eval_steps_per_second": 6.041, "step": 3182 }, { "epoch": 2.011313639220616, "grad_norm": 10.074417114257812, "learning_rate": 3.9946574481458206e-05, "loss": 3.5087, "step": 3200 }, { "epoch": 2.07416719044626, "grad_norm": 6.9990434646606445, "learning_rate": 3.963230672532998e-05, "loss": 3.5746, "step": 3300 }, { "epoch": 2.1370207416719045, "grad_norm": 6.968172073364258, "learning_rate": 3.931803896920176e-05, "loss": 3.6324, "step": 3400 }, { "epoch": 2.1998742928975488, "grad_norm": 179.99803161621094, "learning_rate": 3.9003771213073545e-05, "loss": 3.4072, "step": 3500 }, { "epoch": 2.262727844123193, "grad_norm": 59.86805725097656, "learning_rate": 3.868950345694532e-05, "loss": 3.391, "step": 3600 }, { "epoch": 2.3255813953488373, "grad_norm": 7.445355415344238, "learning_rate": 3.83752357008171e-05, "loss": 3.2032, "step": 3700 }, { "epoch": 2.3884349465744816, "grad_norm": 5.553746700286865, "learning_rate": 3.806096794468888e-05, "loss": 3.3644, "step": 3800 }, { "epoch": 2.4512884978001255, "grad_norm": 6.544325351715088, "learning_rate": 3.7746700188560656e-05, "loss": 3.1666, "step": 3900 }, { "epoch": 2.5141420490257698, "grad_norm": 7.863962650299072, "learning_rate": 3.7432432432432436e-05, "loss": 3.1982, "step": 4000 }, { "epoch": 2.576995600251414, "grad_norm": 10.573624610900879, "learning_rate": 3.7118164676304215e-05, "loss": 3.1336, "step": 4100 }, { "epoch": 2.6398491514770583, "grad_norm": 8.506134986877441, "learning_rate": 3.680389692017599e-05, "loss": 3.0191, "step": 4200 }, { "epoch": 2.7027027027027026, "grad_norm": 7.1274518966674805, "learning_rate": 3.6489629164047774e-05, "loss": 3.003, "step": 4300 }, { "epoch": 2.765556253928347, "grad_norm": 5.121671199798584, "learning_rate": 3.617536140791955e-05, "loss": 3.085, "step": 4400 }, { "epoch": 2.828409805153991, "grad_norm": 6.66685152053833, "learning_rate": 3.5861093651791327e-05, "loss": 3.0205, "step": 4500 }, { "epoch": 2.8912633563796355, "grad_norm": 8.410430908203125, "learning_rate": 3.5546825895663106e-05, "loss": 2.9611, "step": 4600 }, { "epoch": 2.95411690760528, "grad_norm": 6.266846179962158, "learning_rate": 3.5232558139534886e-05, "loss": 2.9299, "step": 4700 }, { "epoch": 3.0, "eval_loss": 2.3084471225738525, "eval_runtime": 20.0337, "eval_samples_per_second": 47.819, "eval_steps_per_second": 5.99, "step": 4773 }, { "epoch": 3.016970458830924, "grad_norm": 6.011202335357666, "learning_rate": 3.4918290383406665e-05, "loss": 2.886, "step": 4800 }, { "epoch": 3.0798240100565684, "grad_norm": 7.204225063323975, "learning_rate": 3.4604022627278445e-05, "loss": 2.8579, "step": 4900 }, { "epoch": 3.1426775612822127, "grad_norm": 10.316048622131348, "learning_rate": 3.428975487115022e-05, "loss": 2.8155, "step": 5000 }, { "epoch": 3.2055311125078565, "grad_norm": 6.55385684967041, "learning_rate": 3.3975487115022e-05, "loss": 2.8938, "step": 5100 }, { "epoch": 3.268384663733501, "grad_norm": 6.081694602966309, "learning_rate": 3.366121935889378e-05, "loss": 2.7344, "step": 5200 }, { "epoch": 3.331238214959145, "grad_norm": 8.186753273010254, "learning_rate": 3.3346951602765556e-05, "loss": 2.7899, "step": 5300 }, { "epoch": 3.3940917661847894, "grad_norm": 7.425989627838135, "learning_rate": 3.3032683846637335e-05, "loss": 2.7317, "step": 5400 }, { "epoch": 3.4569453174104336, "grad_norm": 5.459439277648926, "learning_rate": 3.2718416090509115e-05, "loss": 2.6456, "step": 5500 }, { "epoch": 3.519798868636078, "grad_norm": 5.077919006347656, "learning_rate": 3.2404148334380894e-05, "loss": 2.6816, "step": 5600 }, { "epoch": 3.5826524198617222, "grad_norm": 5.81939172744751, "learning_rate": 3.2089880578252674e-05, "loss": 2.64, "step": 5700 }, { "epoch": 3.6455059710873665, "grad_norm": 39.74727249145508, "learning_rate": 3.177561282212445e-05, "loss": 2.6725, "step": 5800 }, { "epoch": 3.708359522313011, "grad_norm": 5.927642345428467, "learning_rate": 3.1461345065996226e-05, "loss": 2.5395, "step": 5900 }, { "epoch": 3.771213073538655, "grad_norm": 5.984442710876465, "learning_rate": 3.114707730986801e-05, "loss": 2.6297, "step": 6000 }, { "epoch": 3.834066624764299, "grad_norm": 5.258358478546143, "learning_rate": 3.083280955373979e-05, "loss": 2.6291, "step": 6100 }, { "epoch": 3.8969201759899432, "grad_norm": 5.7379937171936035, "learning_rate": 3.0518541797611565e-05, "loss": 2.6116, "step": 6200 }, { "epoch": 3.9597737272155875, "grad_norm": 5.038835048675537, "learning_rate": 3.0204274041483344e-05, "loss": 2.6695, "step": 6300 }, { "epoch": 4.0, "eval_loss": 2.0932769775390625, "eval_runtime": 20.0417, "eval_samples_per_second": 47.8, "eval_steps_per_second": 5.988, "step": 6364 }, { "epoch": 4.022627278441232, "grad_norm": 7.459395885467529, "learning_rate": 2.9890006285355127e-05, "loss": 2.6404, "step": 6400 }, { "epoch": 4.085480829666876, "grad_norm": 6.721461296081543, "learning_rate": 2.9575738529226903e-05, "loss": 2.4614, "step": 6500 }, { "epoch": 4.14833438089252, "grad_norm": 6.69769287109375, "learning_rate": 2.9261470773098683e-05, "loss": 2.457, "step": 6600 }, { "epoch": 4.211187932118165, "grad_norm": 5.306356906890869, "learning_rate": 2.894720301697046e-05, "loss": 2.513, "step": 6700 }, { "epoch": 4.274041483343809, "grad_norm": 5.425265312194824, "learning_rate": 2.8632935260842235e-05, "loss": 2.5467, "step": 6800 }, { "epoch": 4.336895034569453, "grad_norm": 4.722207546234131, "learning_rate": 2.8318667504714018e-05, "loss": 2.3467, "step": 6900 }, { "epoch": 4.3997485857950975, "grad_norm": 4.346086502075195, "learning_rate": 2.8004399748585797e-05, "loss": 2.5098, "step": 7000 }, { "epoch": 4.462602137020742, "grad_norm": 7.4684319496154785, "learning_rate": 2.7690131992457573e-05, "loss": 2.4396, "step": 7100 }, { "epoch": 4.525455688246386, "grad_norm": 5.709039688110352, "learning_rate": 2.7375864236329353e-05, "loss": 2.4688, "step": 7200 }, { "epoch": 4.58830923947203, "grad_norm": 4.952858924865723, "learning_rate": 2.7061596480201136e-05, "loss": 2.3643, "step": 7300 }, { "epoch": 4.651162790697675, "grad_norm": 6.68017578125, "learning_rate": 2.6747328724072912e-05, "loss": 2.4242, "step": 7400 }, { "epoch": 4.714016341923319, "grad_norm": 3.584669828414917, "learning_rate": 2.6433060967944688e-05, "loss": 2.4552, "step": 7500 }, { "epoch": 4.776869893148963, "grad_norm": 5.264488220214844, "learning_rate": 2.6118793211816468e-05, "loss": 2.4232, "step": 7600 }, { "epoch": 4.8397234443746076, "grad_norm": 4.609414100646973, "learning_rate": 2.580452545568825e-05, "loss": 2.4418, "step": 7700 }, { "epoch": 4.902576995600251, "grad_norm": 4.986881256103516, "learning_rate": 2.5490257699560027e-05, "loss": 2.4065, "step": 7800 }, { "epoch": 4.965430546825896, "grad_norm": 4.9718098640441895, "learning_rate": 2.5175989943431806e-05, "loss": 2.4589, "step": 7900 }, { "epoch": 5.0, "eval_loss": 1.984979271888733, "eval_runtime": 20.0353, "eval_samples_per_second": 47.816, "eval_steps_per_second": 5.989, "step": 7955 }, { "epoch": 5.0282840980515395, "grad_norm": 5.2526750564575195, "learning_rate": 2.4861722187303586e-05, "loss": 2.2708, "step": 8000 }, { "epoch": 5.091137649277184, "grad_norm": 5.312747001647949, "learning_rate": 2.454745443117536e-05, "loss": 2.3068, "step": 8100 }, { "epoch": 5.153991200502828, "grad_norm": 7.204046726226807, "learning_rate": 2.423318667504714e-05, "loss": 2.3729, "step": 8200 }, { "epoch": 5.216844751728472, "grad_norm": 4.8044753074646, "learning_rate": 2.391891891891892e-05, "loss": 2.3501, "step": 8300 }, { "epoch": 5.279698302954117, "grad_norm": 6.9473185539245605, "learning_rate": 2.3604651162790697e-05, "loss": 2.3398, "step": 8400 }, { "epoch": 5.342551854179761, "grad_norm": 4.014726161956787, "learning_rate": 2.3290383406662476e-05, "loss": 2.2938, "step": 8500 }, { "epoch": 5.405405405405405, "grad_norm": 6.722488880157471, "learning_rate": 2.2976115650534256e-05, "loss": 2.2354, "step": 8600 }, { "epoch": 5.4682589566310495, "grad_norm": 5.856524467468262, "learning_rate": 2.2661847894406035e-05, "loss": 2.2757, "step": 8700 }, { "epoch": 5.531112507856694, "grad_norm": 4.9930644035339355, "learning_rate": 2.234758013827781e-05, "loss": 2.2586, "step": 8800 }, { "epoch": 5.593966059082338, "grad_norm": 5.49005126953125, "learning_rate": 2.2033312382149594e-05, "loss": 2.3155, "step": 8900 }, { "epoch": 5.656819610307982, "grad_norm": 8.850517272949219, "learning_rate": 2.171904462602137e-05, "loss": 2.2841, "step": 9000 }, { "epoch": 5.719673161533627, "grad_norm": 5.094405651092529, "learning_rate": 2.140477686989315e-05, "loss": 2.3147, "step": 9100 }, { "epoch": 5.782526712759271, "grad_norm": 4.709909439086914, "learning_rate": 2.109050911376493e-05, "loss": 2.1584, "step": 9200 }, { "epoch": 5.845380263984915, "grad_norm": 4.1693525314331055, "learning_rate": 2.077624135763671e-05, "loss": 2.2396, "step": 9300 }, { "epoch": 5.90823381521056, "grad_norm": 6.800940036773682, "learning_rate": 2.0461973601508485e-05, "loss": 2.301, "step": 9400 }, { "epoch": 5.971087366436204, "grad_norm": 7.419278144836426, "learning_rate": 2.0147705845380265e-05, "loss": 2.3142, "step": 9500 }, { "epoch": 6.0, "eval_loss": 1.905881643295288, "eval_runtime": 20.0332, "eval_samples_per_second": 47.821, "eval_steps_per_second": 5.99, "step": 9546 }, { "epoch": 6.033940917661848, "grad_norm": 4.217894077301025, "learning_rate": 1.9833438089252044e-05, "loss": 2.1013, "step": 9600 }, { "epoch": 6.096794468887492, "grad_norm": 5.345584869384766, "learning_rate": 1.9519170333123824e-05, "loss": 2.2714, "step": 9700 }, { "epoch": 6.159648020113137, "grad_norm": 5.364700794219971, "learning_rate": 1.92049025769956e-05, "loss": 2.2381, "step": 9800 }, { "epoch": 6.222501571338781, "grad_norm": 4.380568504333496, "learning_rate": 1.8890634820867383e-05, "loss": 2.1527, "step": 9900 }, { "epoch": 6.285355122564425, "grad_norm": 6.300790309906006, "learning_rate": 1.857636706473916e-05, "loss": 2.1771, "step": 10000 }, { "epoch": 6.348208673790069, "grad_norm": 5.757110118865967, "learning_rate": 1.8262099308610938e-05, "loss": 2.1695, "step": 10100 }, { "epoch": 6.411062225015713, "grad_norm": 4.908361434936523, "learning_rate": 1.7947831552482718e-05, "loss": 2.1056, "step": 10200 }, { "epoch": 6.473915776241357, "grad_norm": 5.048102378845215, "learning_rate": 1.7633563796354494e-05, "loss": 2.2112, "step": 10300 }, { "epoch": 6.536769327467002, "grad_norm": 8.040143013000488, "learning_rate": 1.7319296040226273e-05, "loss": 2.0298, "step": 10400 }, { "epoch": 6.599622878692646, "grad_norm": 5.15581750869751, "learning_rate": 1.7005028284098053e-05, "loss": 2.1224, "step": 10500 }, { "epoch": 6.66247642991829, "grad_norm": 4.935842514038086, "learning_rate": 1.6690760527969832e-05, "loss": 2.0772, "step": 10600 }, { "epoch": 6.725329981143934, "grad_norm": 5.487718105316162, "learning_rate": 1.637649277184161e-05, "loss": 2.2552, "step": 10700 }, { "epoch": 6.788183532369579, "grad_norm": 5.713748455047607, "learning_rate": 1.6062225015713388e-05, "loss": 2.1358, "step": 10800 }, { "epoch": 6.851037083595223, "grad_norm": 4.882757186889648, "learning_rate": 1.5747957259585168e-05, "loss": 2.1613, "step": 10900 }, { "epoch": 6.913890634820867, "grad_norm": 5.634950637817383, "learning_rate": 1.5433689503456947e-05, "loss": 2.2567, "step": 11000 }, { "epoch": 6.976744186046512, "grad_norm": 5.634829044342041, "learning_rate": 1.5119421747328725e-05, "loss": 2.1283, "step": 11100 }, { "epoch": 7.0, "eval_loss": 1.84635591506958, "eval_runtime": 20.0367, "eval_samples_per_second": 47.812, "eval_steps_per_second": 5.989, "step": 11137 }, { "epoch": 7.039597737272156, "grad_norm": 5.635861873626709, "learning_rate": 1.4805153991200504e-05, "loss": 2.0938, "step": 11200 }, { "epoch": 7.1024512884978, "grad_norm": 5.214977741241455, "learning_rate": 1.4490886235072282e-05, "loss": 2.062, "step": 11300 }, { "epoch": 7.1653048397234445, "grad_norm": 7.498839855194092, "learning_rate": 1.4176618478944062e-05, "loss": 2.1292, "step": 11400 }, { "epoch": 7.228158390949089, "grad_norm": 5.83459997177124, "learning_rate": 1.386235072281584e-05, "loss": 2.0796, "step": 11500 }, { "epoch": 7.291011942174733, "grad_norm": 3.8935282230377197, "learning_rate": 1.3548082966687619e-05, "loss": 2.1414, "step": 11600 }, { "epoch": 7.353865493400377, "grad_norm": 5.774020671844482, "learning_rate": 1.3233815210559397e-05, "loss": 2.145, "step": 11700 }, { "epoch": 7.416719044626022, "grad_norm": 128.24192810058594, "learning_rate": 1.2919547454431178e-05, "loss": 2.0242, "step": 11800 }, { "epoch": 7.479572595851666, "grad_norm": 4.4846367835998535, "learning_rate": 1.2605279698302954e-05, "loss": 2.0936, "step": 11900 }, { "epoch": 7.54242614707731, "grad_norm": 5.091222763061523, "learning_rate": 1.2291011942174734e-05, "loss": 2.1988, "step": 12000 }, { "epoch": 7.6052796983029545, "grad_norm": 3.3482093811035156, "learning_rate": 1.1976744186046513e-05, "loss": 2.1323, "step": 12100 }, { "epoch": 7.668133249528598, "grad_norm": 5.329409599304199, "learning_rate": 1.1662476429918291e-05, "loss": 2.0587, "step": 12200 }, { "epoch": 7.730986800754243, "grad_norm": 7.584386348724365, "learning_rate": 1.134820867379007e-05, "loss": 2.1341, "step": 12300 }, { "epoch": 7.7938403519798864, "grad_norm": 5.996345520019531, "learning_rate": 1.1033940917661848e-05, "loss": 2.1108, "step": 12400 }, { "epoch": 7.856693903205531, "grad_norm": 6.1731648445129395, "learning_rate": 1.0719673161533628e-05, "loss": 2.1218, "step": 12500 }, { "epoch": 7.919547454431175, "grad_norm": 5.414481163024902, "learning_rate": 1.0405405405405407e-05, "loss": 2.028, "step": 12600 }, { "epoch": 7.982401005656819, "grad_norm": 7.198294639587402, "learning_rate": 1.0091137649277185e-05, "loss": 2.0489, "step": 12700 }, { "epoch": 8.0, "eval_loss": 1.8111430406570435, "eval_runtime": 20.0666, "eval_samples_per_second": 47.741, "eval_steps_per_second": 5.98, "step": 12728 }, { "epoch": 8.045254556882464, "grad_norm": 6.677022933959961, "learning_rate": 9.776869893148963e-06, "loss": 2.0814, "step": 12800 }, { "epoch": 8.108108108108109, "grad_norm": 5.1916728019714355, "learning_rate": 9.46260213702074e-06, "loss": 2.119, "step": 12900 }, { "epoch": 8.170961659333752, "grad_norm": 6.04162073135376, "learning_rate": 9.14833438089252e-06, "loss": 2.0058, "step": 13000 }, { "epoch": 8.233815210559397, "grad_norm": 4.764267444610596, "learning_rate": 8.8340666247643e-06, "loss": 2.0113, "step": 13100 }, { "epoch": 8.29666876178504, "grad_norm": 5.77971887588501, "learning_rate": 8.519798868636078e-06, "loss": 2.0392, "step": 13200 }, { "epoch": 8.359522313010686, "grad_norm": 5.698218822479248, "learning_rate": 8.205531112507857e-06, "loss": 2.107, "step": 13300 }, { "epoch": 8.42237586423633, "grad_norm": 5.236012935638428, "learning_rate": 7.891263356379635e-06, "loss": 2.0829, "step": 13400 }, { "epoch": 8.485229415461973, "grad_norm": 4.379955291748047, "learning_rate": 7.576995600251414e-06, "loss": 1.9321, "step": 13500 }, { "epoch": 8.548082966687618, "grad_norm": 6.034859657287598, "learning_rate": 7.262727844123193e-06, "loss": 2.1013, "step": 13600 }, { "epoch": 8.610936517913261, "grad_norm": 5.320705413818359, "learning_rate": 6.948460087994972e-06, "loss": 2.0543, "step": 13700 }, { "epoch": 8.673790069138906, "grad_norm": 5.735895156860352, "learning_rate": 6.634192331866751e-06, "loss": 2.0594, "step": 13800 }, { "epoch": 8.73664362036455, "grad_norm": 4.845800876617432, "learning_rate": 6.31992457573853e-06, "loss": 1.9402, "step": 13900 }, { "epoch": 8.799497171590195, "grad_norm": 4.628382682800293, "learning_rate": 6.0056568196103085e-06, "loss": 1.9937, "step": 14000 }, { "epoch": 8.862350722815838, "grad_norm": 4.747410774230957, "learning_rate": 5.691389063482086e-06, "loss": 2.0654, "step": 14100 }, { "epoch": 8.925204274041484, "grad_norm": 4.694166660308838, "learning_rate": 5.377121307353866e-06, "loss": 2.0523, "step": 14200 }, { "epoch": 8.988057825267127, "grad_norm": 6.711084365844727, "learning_rate": 5.0628535512256445e-06, "loss": 1.9856, "step": 14300 }, { "epoch": 9.0, "eval_loss": 1.7920939922332764, "eval_runtime": 20.0378, "eval_samples_per_second": 47.81, "eval_steps_per_second": 5.989, "step": 14319 }, { "epoch": 9.050911376492772, "grad_norm": 6.053162097930908, "learning_rate": 4.748585795097423e-06, "loss": 2.0392, "step": 14400 }, { "epoch": 9.113764927718416, "grad_norm": 4.806529521942139, "learning_rate": 4.434318038969202e-06, "loss": 2.0308, "step": 14500 }, { "epoch": 9.17661847894406, "grad_norm": 4.725819110870361, "learning_rate": 4.1200502828409805e-06, "loss": 2.0441, "step": 14600 }, { "epoch": 9.239472030169704, "grad_norm": 4.637420177459717, "learning_rate": 3.8057825267127596e-06, "loss": 2.0061, "step": 14700 }, { "epoch": 9.30232558139535, "grad_norm": 6.441665172576904, "learning_rate": 3.4915147705845382e-06, "loss": 2.1299, "step": 14800 }, { "epoch": 9.365179132620993, "grad_norm": 3.506943941116333, "learning_rate": 3.1772470144563173e-06, "loss": 1.9443, "step": 14900 }, { "epoch": 9.428032683846638, "grad_norm": 8.454822540283203, "learning_rate": 2.8629792583280956e-06, "loss": 2.0327, "step": 15000 }, { "epoch": 9.490886235072281, "grad_norm": 5.021187782287598, "learning_rate": 2.5487115021998746e-06, "loss": 1.9839, "step": 15100 }, { "epoch": 9.553739786297927, "grad_norm": 6.3962016105651855, "learning_rate": 2.234443746071653e-06, "loss": 2.0604, "step": 15200 }, { "epoch": 9.61659333752357, "grad_norm": 5.531436443328857, "learning_rate": 1.920175989943432e-06, "loss": 2.0168, "step": 15300 }, { "epoch": 9.679446888749215, "grad_norm": 4.300695896148682, "learning_rate": 1.6059082338152106e-06, "loss": 1.9994, "step": 15400 }, { "epoch": 9.742300439974859, "grad_norm": 3.102018356323242, "learning_rate": 1.2916404776869893e-06, "loss": 2.0441, "step": 15500 }, { "epoch": 9.805153991200502, "grad_norm": 4.91919469833374, "learning_rate": 9.773727215587681e-07, "loss": 1.9584, "step": 15600 }, { "epoch": 9.868007542426147, "grad_norm": 4.21737813949585, "learning_rate": 6.631049654305469e-07, "loss": 2.0019, "step": 15700 }, { "epoch": 9.930861093651792, "grad_norm": 4.098769187927246, "learning_rate": 3.4883720930232557e-07, "loss": 2.0121, "step": 15800 }, { "epoch": 9.993714644877436, "grad_norm": 4.722096920013428, "learning_rate": 3.456945317410434e-08, "loss": 2.0196, "step": 15900 }, { "epoch": 10.0, "eval_loss": 1.787421464920044, "eval_runtime": 20.0243, "eval_samples_per_second": 47.842, "eval_steps_per_second": 5.993, "step": 15910 }, { "epoch": 10.056568196103079, "grad_norm": 3.8331987857818604, "learning_rate": 2.4860150848522942e-05, "loss": 2.0388, "step": 16000 }, { "epoch": 10.119421747328724, "grad_norm": 3.9292027950286865, "learning_rate": 2.4703016970458832e-05, "loss": 2.0913, "step": 16100 }, { "epoch": 10.182275298554368, "grad_norm": 5.124855995178223, "learning_rate": 2.454588309239472e-05, "loss": 2.0452, "step": 16200 }, { "epoch": 10.245128849780013, "grad_norm": 5.743933200836182, "learning_rate": 2.438874921433061e-05, "loss": 2.016, "step": 16300 }, { "epoch": 10.307982401005656, "grad_norm": 6.4510931968688965, "learning_rate": 2.42316153362665e-05, "loss": 1.9785, "step": 16400 }, { "epoch": 10.370835952231301, "grad_norm": 6.550465106964111, "learning_rate": 2.4074481458202387e-05, "loss": 1.9912, "step": 16500 }, { "epoch": 10.433689503456945, "grad_norm": 5.37285852432251, "learning_rate": 2.391734758013828e-05, "loss": 2.0549, "step": 16600 }, { "epoch": 10.49654305468259, "grad_norm": 5.4893412590026855, "learning_rate": 2.376021370207417e-05, "loss": 1.9434, "step": 16700 }, { "epoch": 10.559396605908233, "grad_norm": 4.316259384155273, "learning_rate": 2.3603079824010057e-05, "loss": 1.8413, "step": 16800 }, { "epoch": 10.622250157133879, "grad_norm": 3.4342756271362305, "learning_rate": 2.3445945945945946e-05, "loss": 1.9312, "step": 16900 }, { "epoch": 10.685103708359522, "grad_norm": 5.680815696716309, "learning_rate": 2.3288812067881836e-05, "loss": 1.9678, "step": 17000 }, { "epoch": 10.747957259585167, "grad_norm": 6.04569149017334, "learning_rate": 2.3131678189817726e-05, "loss": 2.0329, "step": 17100 }, { "epoch": 10.81081081081081, "grad_norm": 9.336991310119629, "learning_rate": 2.2974544311753616e-05, "loss": 1.9575, "step": 17200 }, { "epoch": 10.873664362036456, "grad_norm": 3.826447010040283, "learning_rate": 2.2817410433689505e-05, "loss": 1.9692, "step": 17300 }, { "epoch": 10.936517913262099, "grad_norm": 4.134801387786865, "learning_rate": 2.2660276555625392e-05, "loss": 2.0406, "step": 17400 }, { "epoch": 10.999371464487744, "grad_norm": 5.291431903839111, "learning_rate": 2.2503142677561285e-05, "loss": 1.9631, "step": 17500 }, { "epoch": 11.0, "eval_loss": 1.7517410516738892, "eval_runtime": 21.6572, "eval_samples_per_second": 44.235, "eval_steps_per_second": 5.541, "step": 17501 }, { "epoch": 11.062225015713388, "grad_norm": 4.9575066566467285, "learning_rate": 2.234600879949717e-05, "loss": 1.9381, "step": 17600 }, { "epoch": 11.125078566939033, "grad_norm": 12.871175765991211, "learning_rate": 2.218887492143306e-05, "loss": 1.8867, "step": 17700 }, { "epoch": 11.187932118164676, "grad_norm": 4.3662519454956055, "learning_rate": 2.203174104336895e-05, "loss": 1.9713, "step": 17800 }, { "epoch": 11.250785669390321, "grad_norm": 5.662289619445801, "learning_rate": 2.187460716530484e-05, "loss": 1.9188, "step": 17900 }, { "epoch": 11.313639220615965, "grad_norm": 7.633818626403809, "learning_rate": 2.171747328724073e-05, "loss": 1.9142, "step": 18000 }, { "epoch": 11.376492771841608, "grad_norm": 4.940028667449951, "learning_rate": 2.156033940917662e-05, "loss": 1.8697, "step": 18100 }, { "epoch": 11.439346323067253, "grad_norm": 5.070211410522461, "learning_rate": 2.1403205531112506e-05, "loss": 1.9624, "step": 18200 }, { "epoch": 11.502199874292897, "grad_norm": 7.409548282623291, "learning_rate": 2.12460716530484e-05, "loss": 1.9283, "step": 18300 }, { "epoch": 11.565053425518542, "grad_norm": 6.541192531585693, "learning_rate": 2.108893777498429e-05, "loss": 1.9357, "step": 18400 }, { "epoch": 11.627906976744185, "grad_norm": 5.941864967346191, "learning_rate": 2.0931803896920176e-05, "loss": 1.869, "step": 18500 }, { "epoch": 11.69076052796983, "grad_norm": 9.418646812438965, "learning_rate": 2.0774670018856065e-05, "loss": 1.8518, "step": 18600 }, { "epoch": 11.753614079195474, "grad_norm": 5.367152690887451, "learning_rate": 2.061753614079196e-05, "loss": 1.8945, "step": 18700 }, { "epoch": 11.81646763042112, "grad_norm": 5.896432399749756, "learning_rate": 2.0460402262727845e-05, "loss": 1.8569, "step": 18800 }, { "epoch": 11.879321181646763, "grad_norm": 6.137564182281494, "learning_rate": 2.0303268384663735e-05, "loss": 1.9179, "step": 18900 }, { "epoch": 11.942174732872408, "grad_norm": 4.5933918952941895, "learning_rate": 2.0146134506599625e-05, "loss": 1.8941, "step": 19000 }, { "epoch": 12.0, "eval_loss": 1.7062737941741943, "eval_runtime": 21.7167, "eval_samples_per_second": 44.114, "eval_steps_per_second": 5.526, "step": 19092 }, { "epoch": 12.005028284098051, "grad_norm": 5.298050880432129, "learning_rate": 1.998900062853551e-05, "loss": 1.8681, "step": 19100 }, { "epoch": 12.067881835323696, "grad_norm": 7.001854419708252, "learning_rate": 1.9831866750471404e-05, "loss": 1.8377, "step": 19200 }, { "epoch": 12.13073538654934, "grad_norm": 4.692386150360107, "learning_rate": 1.9674732872407294e-05, "loss": 1.8279, "step": 19300 }, { "epoch": 12.193588937774985, "grad_norm": 6.864208221435547, "learning_rate": 1.951759899434318e-05, "loss": 1.8855, "step": 19400 }, { "epoch": 12.256442489000628, "grad_norm": 3.883880853652954, "learning_rate": 1.936046511627907e-05, "loss": 1.84, "step": 19500 }, { "epoch": 12.319296040226273, "grad_norm": 5.302524566650391, "learning_rate": 1.920333123821496e-05, "loss": 1.8791, "step": 19600 }, { "epoch": 12.382149591451917, "grad_norm": 6.854051113128662, "learning_rate": 1.904619736015085e-05, "loss": 1.9189, "step": 19700 }, { "epoch": 12.445003142677562, "grad_norm": 4.728283405303955, "learning_rate": 1.888906348208674e-05, "loss": 1.8903, "step": 19800 }, { "epoch": 12.507856693903205, "grad_norm": 4.314347267150879, "learning_rate": 1.8731929604022626e-05, "loss": 1.8615, "step": 19900 }, { "epoch": 12.57071024512885, "grad_norm": 3.873619318008423, "learning_rate": 1.857479572595852e-05, "loss": 1.8232, "step": 20000 }, { "epoch": 12.633563796354494, "grad_norm": 6.445096969604492, "learning_rate": 1.841766184789441e-05, "loss": 1.7764, "step": 20100 }, { "epoch": 12.696417347580137, "grad_norm": 4.258322715759277, "learning_rate": 1.8260527969830295e-05, "loss": 1.869, "step": 20200 }, { "epoch": 12.759270898805783, "grad_norm": 7.782538414001465, "learning_rate": 1.8103394091766185e-05, "loss": 1.7986, "step": 20300 }, { "epoch": 12.822124450031426, "grad_norm": 7.189488887786865, "learning_rate": 1.7946260213702078e-05, "loss": 1.8448, "step": 20400 }, { "epoch": 12.884978001257071, "grad_norm": 5.59601354598999, "learning_rate": 1.7789126335637964e-05, "loss": 1.7924, "step": 20500 }, { "epoch": 12.947831552482715, "grad_norm": 4.675200939178467, "learning_rate": 1.7631992457573854e-05, "loss": 1.8212, "step": 20600 }, { "epoch": 13.0, "eval_loss": 1.6696668863296509, "eval_runtime": 21.645, "eval_samples_per_second": 44.26, "eval_steps_per_second": 5.544, "step": 20683 }, { "epoch": 13.01068510370836, "grad_norm": 3.3650217056274414, "learning_rate": 1.7474858579509744e-05, "loss": 1.6872, "step": 20700 }, { "epoch": 13.073538654934003, "grad_norm": 6.4758219718933105, "learning_rate": 1.731772470144563e-05, "loss": 1.8029, "step": 20800 }, { "epoch": 13.136392206159648, "grad_norm": 4.500367641448975, "learning_rate": 1.7160590823381523e-05, "loss": 1.8655, "step": 20900 }, { "epoch": 13.199245757385292, "grad_norm": 5.369949817657471, "learning_rate": 1.7003456945317413e-05, "loss": 1.821, "step": 21000 }, { "epoch": 13.262099308610937, "grad_norm": 4.84245491027832, "learning_rate": 1.68463230672533e-05, "loss": 1.7454, "step": 21100 }, { "epoch": 13.32495285983658, "grad_norm": 4.510051727294922, "learning_rate": 1.668918918918919e-05, "loss": 1.8378, "step": 21200 }, { "epoch": 13.387806411062225, "grad_norm": 5.163560390472412, "learning_rate": 1.653205531112508e-05, "loss": 1.7985, "step": 21300 }, { "epoch": 13.450659962287869, "grad_norm": 4.454617023468018, "learning_rate": 1.637492143306097e-05, "loss": 1.8177, "step": 21400 }, { "epoch": 13.513513513513514, "grad_norm": 3.672908067703247, "learning_rate": 1.6217787554996858e-05, "loss": 1.6908, "step": 21500 }, { "epoch": 13.576367064739157, "grad_norm": 4.549923419952393, "learning_rate": 1.6060653676932748e-05, "loss": 1.7603, "step": 21600 }, { "epoch": 13.639220615964803, "grad_norm": 5.733989715576172, "learning_rate": 1.5903519798868638e-05, "loss": 1.7689, "step": 21700 }, { "epoch": 13.702074167190446, "grad_norm": 4.507519245147705, "learning_rate": 1.5746385920804527e-05, "loss": 1.7984, "step": 21800 }, { "epoch": 13.764927718416091, "grad_norm": 4.713226795196533, "learning_rate": 1.5589252042740414e-05, "loss": 1.8011, "step": 21900 }, { "epoch": 13.827781269641735, "grad_norm": 4.300686359405518, "learning_rate": 1.5432118164676304e-05, "loss": 1.7743, "step": 22000 }, { "epoch": 13.89063482086738, "grad_norm": 4.702789306640625, "learning_rate": 1.5274984286612197e-05, "loss": 1.6903, "step": 22100 }, { "epoch": 13.953488372093023, "grad_norm": 6.481640815734863, "learning_rate": 1.5117850408548085e-05, "loss": 1.822, "step": 22200 }, { "epoch": 14.0, "eval_loss": 1.648952603340149, "eval_runtime": 21.6512, "eval_samples_per_second": 44.247, "eval_steps_per_second": 5.542, "step": 22274 }, { "epoch": 14.016341923318668, "grad_norm": 4.320845127105713, "learning_rate": 2.1968573224387177e-05, "loss": 1.7866, "step": 22300 }, { "epoch": 14.079195474544312, "grad_norm": 5.575278282165527, "learning_rate": 2.184286612193589e-05, "loss": 1.7572, "step": 22400 }, { "epoch": 14.142049025769955, "grad_norm": 5.764155387878418, "learning_rate": 2.17171590194846e-05, "loss": 1.7566, "step": 22500 }, { "epoch": 14.2049025769956, "grad_norm": 4.854477882385254, "learning_rate": 2.1591451917033316e-05, "loss": 1.7517, "step": 22600 }, { "epoch": 14.267756128221244, "grad_norm": 4.7141618728637695, "learning_rate": 2.1465744814582025e-05, "loss": 1.713, "step": 22700 }, { "epoch": 14.330609679446889, "grad_norm": 4.3324785232543945, "learning_rate": 2.1340037712130736e-05, "loss": 1.7511, "step": 22800 }, { "epoch": 14.393463230672532, "grad_norm": 3.4204530715942383, "learning_rate": 2.1214330609679448e-05, "loss": 1.7451, "step": 22900 }, { "epoch": 14.456316781898177, "grad_norm": 4.925296783447266, "learning_rate": 2.108862350722816e-05, "loss": 1.6868, "step": 23000 }, { "epoch": 14.51917033312382, "grad_norm": 4.997200965881348, "learning_rate": 2.0962916404776872e-05, "loss": 1.7259, "step": 23100 }, { "epoch": 14.582023884349466, "grad_norm": 4.816483497619629, "learning_rate": 2.0837209302325584e-05, "loss": 1.7716, "step": 23200 }, { "epoch": 14.64487743557511, "grad_norm": 5.224360466003418, "learning_rate": 2.0711502199874295e-05, "loss": 1.7039, "step": 23300 }, { "epoch": 14.707730986800755, "grad_norm": 7.450541019439697, "learning_rate": 2.0585795097423004e-05, "loss": 1.6634, "step": 23400 }, { "epoch": 14.770584538026398, "grad_norm": 5.811767101287842, "learning_rate": 2.0460087994971716e-05, "loss": 1.7526, "step": 23500 }, { "epoch": 14.833438089252043, "grad_norm": 4.1061272621154785, "learning_rate": 2.0334380892520427e-05, "loss": 1.7612, "step": 23600 }, { "epoch": 14.896291640477687, "grad_norm": 4.599556922912598, "learning_rate": 2.020867379006914e-05, "loss": 1.776, "step": 23700 }, { "epoch": 14.959145191703332, "grad_norm": 4.085700988769531, "learning_rate": 2.008296668761785e-05, "loss": 1.7143, "step": 23800 }, { "epoch": 15.0, "eval_loss": 1.6270309686660767, "eval_runtime": 20.346, "eval_samples_per_second": 47.085, "eval_steps_per_second": 5.898, "step": 23865 }, { "epoch": 15.021998742928975, "grad_norm": 8.476902961730957, "learning_rate": 1.9957259585166563e-05, "loss": 1.6504, "step": 23900 }, { "epoch": 15.08485229415462, "grad_norm": 4.84979772567749, "learning_rate": 1.9831552482715275e-05, "loss": 1.7259, "step": 24000 }, { "epoch": 15.147705845380264, "grad_norm": 4.314637184143066, "learning_rate": 1.9705845380263983e-05, "loss": 1.6254, "step": 24100 }, { "epoch": 15.210559396605909, "grad_norm": 4.656597137451172, "learning_rate": 1.9580138277812698e-05, "loss": 1.7493, "step": 24200 }, { "epoch": 15.273412947831552, "grad_norm": 4.276788711547852, "learning_rate": 1.945443117536141e-05, "loss": 1.6797, "step": 24300 }, { "epoch": 15.336266499057198, "grad_norm": 3.9574031829833984, "learning_rate": 1.9328724072910122e-05, "loss": 1.716, "step": 24400 }, { "epoch": 15.399120050282841, "grad_norm": 8.148831367492676, "learning_rate": 1.920301697045883e-05, "loss": 1.6737, "step": 24500 }, { "epoch": 15.461973601508486, "grad_norm": 3.8734018802642822, "learning_rate": 1.9077309868007542e-05, "loss": 1.6452, "step": 24600 }, { "epoch": 15.52482715273413, "grad_norm": 4.928835391998291, "learning_rate": 1.8951602765556257e-05, "loss": 1.7134, "step": 24700 }, { "epoch": 15.587680703959773, "grad_norm": 4.991033554077148, "learning_rate": 1.8825895663104966e-05, "loss": 1.7327, "step": 24800 }, { "epoch": 15.650534255185418, "grad_norm": 4.160732269287109, "learning_rate": 1.8700188560653677e-05, "loss": 1.6678, "step": 24900 }, { "epoch": 15.713387806411061, "grad_norm": 6.523078441619873, "learning_rate": 1.857448145820239e-05, "loss": 1.6856, "step": 25000 }, { "epoch": 15.776241357636707, "grad_norm": 6.306403636932373, "learning_rate": 1.84487743557511e-05, "loss": 1.6699, "step": 25100 }, { "epoch": 15.83909490886235, "grad_norm": 4.479640483856201, "learning_rate": 1.832306725329981e-05, "loss": 1.676, "step": 25200 }, { "epoch": 15.901948460087995, "grad_norm": 4.6891279220581055, "learning_rate": 1.8197360150848525e-05, "loss": 1.667, "step": 25300 }, { "epoch": 15.964802011313639, "grad_norm": 5.908668518066406, "learning_rate": 1.8071653048397236e-05, "loss": 1.6267, "step": 25400 }, { "epoch": 16.0, "eval_loss": 1.608726143836975, "eval_runtime": 20.3571, "eval_samples_per_second": 47.06, "eval_steps_per_second": 5.895, "step": 25456 }, { "epoch": 16.027655562539284, "grad_norm": 4.081086158752441, "learning_rate": 1.7945945945945948e-05, "loss": 1.5625, "step": 25500 }, { "epoch": 16.090509113764927, "grad_norm": 3.7648415565490723, "learning_rate": 1.7820238843494657e-05, "loss": 1.6818, "step": 25600 }, { "epoch": 16.15336266499057, "grad_norm": 5.430357456207275, "learning_rate": 1.769453174104337e-05, "loss": 1.6125, "step": 25700 }, { "epoch": 16.216216216216218, "grad_norm": 5.235119819641113, "learning_rate": 1.7568824638592084e-05, "loss": 1.6985, "step": 25800 }, { "epoch": 16.27906976744186, "grad_norm": 5.521476745605469, "learning_rate": 1.7443117536140792e-05, "loss": 1.6291, "step": 25900 }, { "epoch": 16.341923318667504, "grad_norm": 5.7086873054504395, "learning_rate": 1.7317410433689504e-05, "loss": 1.6523, "step": 26000 }, { "epoch": 16.404776869893148, "grad_norm": 5.697257041931152, "learning_rate": 1.7191703331238216e-05, "loss": 1.6518, "step": 26100 }, { "epoch": 16.467630421118795, "grad_norm": 8.258442878723145, "learning_rate": 1.7065996228786928e-05, "loss": 1.6314, "step": 26200 }, { "epoch": 16.530483972344438, "grad_norm": 4.087442874908447, "learning_rate": 1.694028912633564e-05, "loss": 1.7048, "step": 26300 }, { "epoch": 16.59333752357008, "grad_norm": 4.184548377990723, "learning_rate": 1.681458202388435e-05, "loss": 1.6062, "step": 26400 }, { "epoch": 16.656191074795725, "grad_norm": 5.8042707443237305, "learning_rate": 1.6688874921433063e-05, "loss": 1.6239, "step": 26500 }, { "epoch": 16.719044626021372, "grad_norm": 4.104475498199463, "learning_rate": 1.656316781898177e-05, "loss": 1.5742, "step": 26600 }, { "epoch": 16.781898177247015, "grad_norm": 4.2934722900390625, "learning_rate": 1.6437460716530483e-05, "loss": 1.6069, "step": 26700 }, { "epoch": 16.84475172847266, "grad_norm": 4.601330757141113, "learning_rate": 1.6311753614079195e-05, "loss": 1.5827, "step": 26800 }, { "epoch": 16.907605279698302, "grad_norm": 4.304816246032715, "learning_rate": 1.618604651162791e-05, "loss": 1.6461, "step": 26900 }, { "epoch": 16.970458830923945, "grad_norm": 6.80120325088501, "learning_rate": 1.606033940917662e-05, "loss": 1.6143, "step": 27000 }, { "epoch": 17.0, "eval_loss": 1.5869935750961304, "eval_runtime": 20.3162, "eval_samples_per_second": 47.154, "eval_steps_per_second": 5.907, "step": 27047 }, { "epoch": 17.033312382149592, "grad_norm": 4.368440628051758, "learning_rate": 1.593463230672533e-05, "loss": 1.6352, "step": 27100 }, { "epoch": 17.096165933375236, "grad_norm": 4.066120624542236, "learning_rate": 1.5808925204274042e-05, "loss": 1.5052, "step": 27200 }, { "epoch": 17.15901948460088, "grad_norm": 6.150811672210693, "learning_rate": 1.5683218101822754e-05, "loss": 1.5449, "step": 27300 }, { "epoch": 17.221873035826523, "grad_norm": 7.994663715362549, "learning_rate": 1.5557510999371466e-05, "loss": 1.7157, "step": 27400 }, { "epoch": 17.28472658705217, "grad_norm": 3.554856061935425, "learning_rate": 1.5431803896920178e-05, "loss": 1.5878, "step": 27500 }, { "epoch": 17.347580138277813, "grad_norm": 4.025883674621582, "learning_rate": 1.530609679446889e-05, "loss": 1.6454, "step": 27600 }, { "epoch": 17.410433689503456, "grad_norm": 2.9825448989868164, "learning_rate": 1.51803896920176e-05, "loss": 1.5605, "step": 27700 }, { "epoch": 17.4732872407291, "grad_norm": 4.528345584869385, "learning_rate": 1.505468258956631e-05, "loss": 1.626, "step": 27800 }, { "epoch": 17.536140791954747, "grad_norm": 4.549004554748535, "learning_rate": 1.4928975487115023e-05, "loss": 1.5508, "step": 27900 }, { "epoch": 17.59899434318039, "grad_norm": 4.830588340759277, "learning_rate": 1.4803268384663735e-05, "loss": 1.5394, "step": 28000 }, { "epoch": 17.661847894406034, "grad_norm": 4.127079486846924, "learning_rate": 1.4677561282212447e-05, "loss": 1.5548, "step": 28100 }, { "epoch": 17.724701445631677, "grad_norm": 3.208592414855957, "learning_rate": 1.4551854179761157e-05, "loss": 1.5595, "step": 28200 }, { "epoch": 17.787554996857324, "grad_norm": 4.784154891967773, "learning_rate": 1.4426147077309869e-05, "loss": 1.6029, "step": 28300 }, { "epoch": 17.850408548082967, "grad_norm": 5.0941481590271, "learning_rate": 1.4300439974858582e-05, "loss": 1.634, "step": 28400 }, { "epoch": 17.91326209930861, "grad_norm": 6.4498982429504395, "learning_rate": 1.4174732872407292e-05, "loss": 1.6685, "step": 28500 }, { "epoch": 17.976115650534254, "grad_norm": 5.136322021484375, "learning_rate": 1.4049025769956004e-05, "loss": 1.5587, "step": 28600 }, { "epoch": 18.0, "eval_loss": 1.565408706665039, "eval_runtime": 20.3165, "eval_samples_per_second": 47.154, "eval_steps_per_second": 5.907, "step": 28638 }, { "epoch": 18.0389692017599, "grad_norm": 7.265219211578369, "learning_rate": 1.3923318667504714e-05, "loss": 1.534, "step": 28700 }, { "epoch": 18.101822752985544, "grad_norm": 5.552704334259033, "learning_rate": 1.3797611565053426e-05, "loss": 1.5396, "step": 28800 }, { "epoch": 18.164676304211188, "grad_norm": 7.356419086456299, "learning_rate": 1.3671904462602136e-05, "loss": 1.5851, "step": 28900 }, { "epoch": 18.22752985543683, "grad_norm": 5.519120693206787, "learning_rate": 1.354619736015085e-05, "loss": 1.6331, "step": 29000 }, { "epoch": 18.290383406662478, "grad_norm": 4.4178242683410645, "learning_rate": 1.3420490257699561e-05, "loss": 1.508, "step": 29100 }, { "epoch": 18.35323695788812, "grad_norm": 4.479162216186523, "learning_rate": 1.3294783155248271e-05, "loss": 1.5201, "step": 29200 }, { "epoch": 18.416090509113765, "grad_norm": 4.4193806648254395, "learning_rate": 1.3169076052796983e-05, "loss": 1.5393, "step": 29300 }, { "epoch": 18.47894406033941, "grad_norm": 6.695824146270752, "learning_rate": 1.3043368950345693e-05, "loss": 1.6264, "step": 29400 }, { "epoch": 18.541797611565052, "grad_norm": 4.760421276092529, "learning_rate": 1.2917661847894409e-05, "loss": 1.5465, "step": 29500 }, { "epoch": 18.6046511627907, "grad_norm": 4.158078193664551, "learning_rate": 1.2791954745443119e-05, "loss": 1.5533, "step": 29600 }, { "epoch": 18.667504714016342, "grad_norm": 6.8502092361450195, "learning_rate": 1.266624764299183e-05, "loss": 1.6525, "step": 29700 }, { "epoch": 18.730358265241986, "grad_norm": 4.013594150543213, "learning_rate": 1.254054054054054e-05, "loss": 1.5357, "step": 29800 }, { "epoch": 18.79321181646763, "grad_norm": 6.064908981323242, "learning_rate": 1.2414833438089252e-05, "loss": 1.5659, "step": 29900 }, { "epoch": 18.856065367693276, "grad_norm": 5.281710624694824, "learning_rate": 1.2289126335637964e-05, "loss": 1.4692, "step": 30000 }, { "epoch": 18.91891891891892, "grad_norm": 4.661835193634033, "learning_rate": 1.2163419233186674e-05, "loss": 1.5126, "step": 30100 }, { "epoch": 18.981772470144563, "grad_norm": 3.9490227699279785, "learning_rate": 1.2037712130735388e-05, "loss": 1.5389, "step": 30200 }, { "epoch": 19.0, "eval_loss": 1.5563335418701172, "eval_runtime": 20.3631, "eval_samples_per_second": 47.046, "eval_steps_per_second": 5.893, "step": 30229 }, { "epoch": 19.044626021370206, "grad_norm": 4.6667866706848145, "learning_rate": 1.1912005028284098e-05, "loss": 1.5508, "step": 30300 }, { "epoch": 19.107479572595853, "grad_norm": 4.471792697906494, "learning_rate": 1.1786297925832811e-05, "loss": 1.5253, "step": 30400 }, { "epoch": 19.170333123821496, "grad_norm": 4.01970100402832, "learning_rate": 1.1660590823381521e-05, "loss": 1.5047, "step": 30500 }, { "epoch": 19.23318667504714, "grad_norm": 5.021801471710205, "learning_rate": 1.1534883720930233e-05, "loss": 1.5459, "step": 30600 }, { "epoch": 19.296040226272783, "grad_norm": 4.681889533996582, "learning_rate": 1.1409176618478945e-05, "loss": 1.561, "step": 30700 }, { "epoch": 19.35889377749843, "grad_norm": 4.114772319793701, "learning_rate": 1.1283469516027655e-05, "loss": 1.532, "step": 30800 }, { "epoch": 19.421747328724074, "grad_norm": 3.9337844848632812, "learning_rate": 1.1157762413576367e-05, "loss": 1.5512, "step": 30900 }, { "epoch": 19.484600879949717, "grad_norm": 4.935436725616455, "learning_rate": 1.1032055311125079e-05, "loss": 1.5328, "step": 31000 }, { "epoch": 19.54745443117536, "grad_norm": 5.703494071960449, "learning_rate": 1.090634820867379e-05, "loss": 1.5889, "step": 31100 }, { "epoch": 19.610307982401007, "grad_norm": 6.010659217834473, "learning_rate": 1.0780641106222502e-05, "loss": 1.5166, "step": 31200 }, { "epoch": 19.67316153362665, "grad_norm": 5.14444637298584, "learning_rate": 1.0654934003771214e-05, "loss": 1.5096, "step": 31300 }, { "epoch": 19.736015084852294, "grad_norm": 7.321188449859619, "learning_rate": 1.0529226901319924e-05, "loss": 1.4865, "step": 31400 }, { "epoch": 19.798868636077938, "grad_norm": 3.7702994346618652, "learning_rate": 1.0403519798868636e-05, "loss": 1.5122, "step": 31500 }, { "epoch": 19.86172218730358, "grad_norm": 5.493444442749023, "learning_rate": 1.0277812696417348e-05, "loss": 1.4974, "step": 31600 }, { "epoch": 19.924575738529228, "grad_norm": 5.273486137390137, "learning_rate": 1.015210559396606e-05, "loss": 1.5619, "step": 31700 }, { "epoch": 19.98742928975487, "grad_norm": 4.340183734893799, "learning_rate": 1.0026398491514772e-05, "loss": 1.4476, "step": 31800 }, { "epoch": 20.0, "eval_loss": 1.5459223985671997, "eval_runtime": 20.3264, "eval_samples_per_second": 47.131, "eval_steps_per_second": 5.904, "step": 31820 }, { "epoch": 20.050282840980515, "grad_norm": 3.8120639324188232, "learning_rate": 9.900691389063482e-06, "loss": 1.4837, "step": 31900 }, { "epoch": 20.113136392206158, "grad_norm": 4.154244899749756, "learning_rate": 9.774984286612195e-06, "loss": 1.4684, "step": 32000 }, { "epoch": 20.175989943431805, "grad_norm": 3.925746202468872, "learning_rate": 9.649277184160905e-06, "loss": 1.4685, "step": 32100 }, { "epoch": 20.23884349465745, "grad_norm": 5.944131374359131, "learning_rate": 9.523570081709617e-06, "loss": 1.5097, "step": 32200 }, { "epoch": 20.301697045883092, "grad_norm": 4.755185127258301, "learning_rate": 9.397862979258329e-06, "loss": 1.4334, "step": 32300 }, { "epoch": 20.364550597108735, "grad_norm": 4.627038478851318, "learning_rate": 9.27215587680704e-06, "loss": 1.503, "step": 32400 }, { "epoch": 20.427404148334382, "grad_norm": 9.863165855407715, "learning_rate": 9.14644877435575e-06, "loss": 1.4607, "step": 32500 }, { "epoch": 20.490257699560026, "grad_norm": 4.401854991912842, "learning_rate": 9.020741671904463e-06, "loss": 1.4653, "step": 32600 }, { "epoch": 20.55311125078567, "grad_norm": 6.041737079620361, "learning_rate": 8.895034569453174e-06, "loss": 1.504, "step": 32700 }, { "epoch": 20.615964802011312, "grad_norm": 6.523427963256836, "learning_rate": 8.769327467001886e-06, "loss": 1.6205, "step": 32800 }, { "epoch": 20.67881835323696, "grad_norm": 5.47548246383667, "learning_rate": 8.643620364550598e-06, "loss": 1.4491, "step": 32900 }, { "epoch": 20.741671904462603, "grad_norm": 5.3726959228515625, "learning_rate": 8.517913262099308e-06, "loss": 1.5817, "step": 33000 }, { "epoch": 20.804525455688246, "grad_norm": 3.872283935546875, "learning_rate": 8.392206159648022e-06, "loss": 1.5482, "step": 33100 }, { "epoch": 20.86737900691389, "grad_norm": 4.935946464538574, "learning_rate": 8.266499057196732e-06, "loss": 1.5006, "step": 33200 }, { "epoch": 20.930232558139537, "grad_norm": 6.805904388427734, "learning_rate": 8.140791954745444e-06, "loss": 1.5314, "step": 33300 }, { "epoch": 20.99308610936518, "grad_norm": 4.420083522796631, "learning_rate": 8.015084852294155e-06, "loss": 1.5417, "step": 33400 }, { "epoch": 21.0, "eval_loss": 1.5356966257095337, "eval_runtime": 20.4137, "eval_samples_per_second": 46.929, "eval_steps_per_second": 5.878, "step": 33411 }, { "epoch": 21.055939660590823, "grad_norm": 3.697171688079834, "learning_rate": 7.889377749842865e-06, "loss": 1.4994, "step": 33500 }, { "epoch": 21.118793211816467, "grad_norm": 5.232399940490723, "learning_rate": 7.763670647391579e-06, "loss": 1.5351, "step": 33600 }, { "epoch": 21.18164676304211, "grad_norm": 4.508577823638916, "learning_rate": 7.637963544940289e-06, "loss": 1.4301, "step": 33700 }, { "epoch": 21.244500314267757, "grad_norm": 5.425107479095459, "learning_rate": 7.512256442489001e-06, "loss": 1.4739, "step": 33800 }, { "epoch": 21.3073538654934, "grad_norm": 6.195432186126709, "learning_rate": 7.386549340037713e-06, "loss": 1.5458, "step": 33900 }, { "epoch": 21.370207416719044, "grad_norm": 5.850045204162598, "learning_rate": 7.260842237586424e-06, "loss": 1.5189, "step": 34000 }, { "epoch": 21.433060967944687, "grad_norm": 7.121579170227051, "learning_rate": 7.135135135135136e-06, "loss": 1.5273, "step": 34100 }, { "epoch": 21.495914519170334, "grad_norm": 4.316208362579346, "learning_rate": 7.009428032683847e-06, "loss": 1.4437, "step": 34200 }, { "epoch": 21.558768070395978, "grad_norm": 4.3052873611450195, "learning_rate": 6.883720930232558e-06, "loss": 1.4266, "step": 34300 }, { "epoch": 21.62162162162162, "grad_norm": 4.691330432891846, "learning_rate": 6.758013827781271e-06, "loss": 1.422, "step": 34400 }, { "epoch": 21.684475172847264, "grad_norm": 4.346444129943848, "learning_rate": 6.632306725329982e-06, "loss": 1.5511, "step": 34500 }, { "epoch": 21.74732872407291, "grad_norm": 5.304843902587891, "learning_rate": 6.506599622878693e-06, "loss": 1.4961, "step": 34600 }, { "epoch": 21.810182275298555, "grad_norm": 4.877419948577881, "learning_rate": 6.3808925204274045e-06, "loss": 1.4837, "step": 34700 }, { "epoch": 21.873035826524198, "grad_norm": 4.086881637573242, "learning_rate": 6.2551854179761155e-06, "loss": 1.5164, "step": 34800 }, { "epoch": 21.93588937774984, "grad_norm": 4.570976734161377, "learning_rate": 6.129478315524827e-06, "loss": 1.4681, "step": 34900 }, { "epoch": 21.99874292897549, "grad_norm": 25.407676696777344, "learning_rate": 6.003771213073539e-06, "loss": 1.4062, "step": 35000 }, { "epoch": 22.0, "eval_loss": 1.5373815298080444, "eval_runtime": 20.3495, "eval_samples_per_second": 47.077, "eval_steps_per_second": 5.897, "step": 35002 }, { "epoch": 22.061596480201132, "grad_norm": 4.965208053588867, "learning_rate": 5.878064110622251e-06, "loss": 1.446, "step": 35100 }, { "epoch": 22.124450031426775, "grad_norm": 5.620969772338867, "learning_rate": 5.752357008170962e-06, "loss": 1.475, "step": 35200 }, { "epoch": 22.18730358265242, "grad_norm": 4.315845489501953, "learning_rate": 5.626649905719674e-06, "loss": 1.4866, "step": 35300 }, { "epoch": 22.250157133878066, "grad_norm": 4.076879501342773, "learning_rate": 5.5009428032683854e-06, "loss": 1.5079, "step": 35400 }, { "epoch": 22.31301068510371, "grad_norm": 9.52351188659668, "learning_rate": 5.375235700817096e-06, "loss": 1.5637, "step": 35500 }, { "epoch": 22.375864236329353, "grad_norm": 5.529058933258057, "learning_rate": 5.249528598365807e-06, "loss": 1.4702, "step": 35600 }, { "epoch": 22.438717787554996, "grad_norm": 4.761877536773682, "learning_rate": 5.123821495914519e-06, "loss": 1.4367, "step": 35700 }, { "epoch": 22.501571338780643, "grad_norm": 6.587429046630859, "learning_rate": 4.998114393463231e-06, "loss": 1.4052, "step": 35800 }, { "epoch": 22.564424890006286, "grad_norm": 5.834304332733154, "learning_rate": 4.872407291011943e-06, "loss": 1.4186, "step": 35900 }, { "epoch": 22.62727844123193, "grad_norm": 3.871225595474243, "learning_rate": 4.746700188560654e-06, "loss": 1.51, "step": 36000 }, { "epoch": 22.690131992457573, "grad_norm": 3.876692771911621, "learning_rate": 4.6209930861093655e-06, "loss": 1.5022, "step": 36100 }, { "epoch": 22.752985543683216, "grad_norm": 4.569952964782715, "learning_rate": 4.495285983658077e-06, "loss": 1.454, "step": 36200 }, { "epoch": 22.815839094908863, "grad_norm": 5.837776184082031, "learning_rate": 4.369578881206788e-06, "loss": 1.4472, "step": 36300 }, { "epoch": 22.878692646134507, "grad_norm": 5.9942426681518555, "learning_rate": 4.243871778755499e-06, "loss": 1.4198, "step": 36400 }, { "epoch": 22.94154619736015, "grad_norm": 4.1033220291137695, "learning_rate": 4.118164676304211e-06, "loss": 1.4658, "step": 36500 }, { "epoch": 23.0, "eval_loss": 1.5307875871658325, "eval_runtime": 20.3299, "eval_samples_per_second": 47.123, "eval_steps_per_second": 5.903, "step": 36593 }, { "epoch": 23.004399748585794, "grad_norm": 4.649007320404053, "learning_rate": 3.992457573852923e-06, "loss": 1.4064, "step": 36600 }, { "epoch": 23.06725329981144, "grad_norm": 4.318711757659912, "learning_rate": 3.866750471401635e-06, "loss": 1.4249, "step": 36700 }, { "epoch": 23.130106851037084, "grad_norm": 6.213062286376953, "learning_rate": 3.7410433689503456e-06, "loss": 1.4317, "step": 36800 }, { "epoch": 23.192960402262727, "grad_norm": 4.529442310333252, "learning_rate": 3.6153362664990574e-06, "loss": 1.5102, "step": 36900 }, { "epoch": 23.25581395348837, "grad_norm": 4.912539005279541, "learning_rate": 3.4896291640477688e-06, "loss": 1.4684, "step": 37000 }, { "epoch": 23.318667504714018, "grad_norm": 4.593921661376953, "learning_rate": 3.3639220615964806e-06, "loss": 1.4181, "step": 37100 }, { "epoch": 23.38152105593966, "grad_norm": 5.35049295425415, "learning_rate": 3.2382149591451915e-06, "loss": 1.4813, "step": 37200 }, { "epoch": 23.444374607165305, "grad_norm": 4.00051212310791, "learning_rate": 3.1125078566939033e-06, "loss": 1.4392, "step": 37300 }, { "epoch": 23.507228158390948, "grad_norm": 5.91484260559082, "learning_rate": 2.9868007542426147e-06, "loss": 1.4386, "step": 37400 }, { "epoch": 23.570081709616595, "grad_norm": 7.114585876464844, "learning_rate": 2.861093651791326e-06, "loss": 1.4115, "step": 37500 }, { "epoch": 23.63293526084224, "grad_norm": 2.977877378463745, "learning_rate": 2.735386549340038e-06, "loss": 1.4211, "step": 37600 }, { "epoch": 23.69578881206788, "grad_norm": 3.83953857421875, "learning_rate": 2.6096794468887493e-06, "loss": 1.4601, "step": 37700 }, { "epoch": 23.758642363293525, "grad_norm": 4.377187728881836, "learning_rate": 2.483972344437461e-06, "loss": 1.4281, "step": 37800 }, { "epoch": 23.821495914519172, "grad_norm": 3.9868085384368896, "learning_rate": 2.358265241986172e-06, "loss": 1.4585, "step": 37900 }, { "epoch": 23.884349465744815, "grad_norm": 3.989767551422119, "learning_rate": 2.232558139534884e-06, "loss": 1.5302, "step": 38000 }, { "epoch": 23.94720301697046, "grad_norm": 4.481296062469482, "learning_rate": 2.1068510370835952e-06, "loss": 1.4366, "step": 38100 }, { "epoch": 24.0, "eval_loss": 1.5289642810821533, "eval_runtime": 20.3269, "eval_samples_per_second": 47.13, "eval_steps_per_second": 5.904, "step": 38184 }, { "epoch": 24.010056568196102, "grad_norm": 4.909224033355713, "learning_rate": 1.981143934632307e-06, "loss": 1.4956, "step": 38200 }, { "epoch": 24.072910119421746, "grad_norm": 4.9214372634887695, "learning_rate": 1.8554368321810182e-06, "loss": 1.4725, "step": 38300 }, { "epoch": 24.135763670647393, "grad_norm": 4.345515251159668, "learning_rate": 1.7297297297297298e-06, "loss": 1.4407, "step": 38400 }, { "epoch": 24.198617221873036, "grad_norm": 4.926340579986572, "learning_rate": 1.6040226272784412e-06, "loss": 1.5008, "step": 38500 }, { "epoch": 24.26147077309868, "grad_norm": 4.5064263343811035, "learning_rate": 1.4783155248271527e-06, "loss": 1.4868, "step": 38600 }, { "epoch": 24.324324324324323, "grad_norm": 5.347716808319092, "learning_rate": 1.3526084223758643e-06, "loss": 1.45, "step": 38700 }, { "epoch": 24.38717787554997, "grad_norm": 5.024169921875, "learning_rate": 1.2269013199245757e-06, "loss": 1.3905, "step": 38800 }, { "epoch": 24.450031426775613, "grad_norm": 4.319692611694336, "learning_rate": 1.1011942174732873e-06, "loss": 1.4671, "step": 38900 }, { "epoch": 24.512884978001257, "grad_norm": 2.880321979522705, "learning_rate": 9.75487115021999e-07, "loss": 1.4211, "step": 39000 }, { "epoch": 24.5757385292269, "grad_norm": 4.416039943695068, "learning_rate": 8.497800125707103e-07, "loss": 1.4176, "step": 39100 }, { "epoch": 24.638592080452547, "grad_norm": 4.598896503448486, "learning_rate": 7.240729101194218e-07, "loss": 1.4194, "step": 39200 }, { "epoch": 24.70144563167819, "grad_norm": 4.256235599517822, "learning_rate": 5.983658076681333e-07, "loss": 1.4331, "step": 39300 }, { "epoch": 24.764299182903834, "grad_norm": 4.7764811515808105, "learning_rate": 4.726587052168448e-07, "loss": 1.4491, "step": 39400 }, { "epoch": 24.827152734129477, "grad_norm": 4.296844005584717, "learning_rate": 3.4695160276555627e-07, "loss": 1.4443, "step": 39500 }, { "epoch": 24.890006285355124, "grad_norm": 3.9589693546295166, "learning_rate": 2.2124450031426776e-07, "loss": 1.4612, "step": 39600 }, { "epoch": 24.952859836580767, "grad_norm": 4.165828227996826, "learning_rate": 9.553739786297926e-08, "loss": 1.48, "step": 39700 }, { "epoch": 25.0, "eval_loss": 1.528791069984436, "eval_runtime": 20.2887, "eval_samples_per_second": 47.218, "eval_steps_per_second": 5.915, "step": 39775 } ], "logging_steps": 100, "max_steps": 39775, "num_input_tokens_seen": 0, "num_train_epochs": 25, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.44418915549184e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }