| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 25.0, | |
| "eval_steps": 500, | |
| "global_step": 39775, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.06285355122564425, | |
| "grad_norm": 608.9674682617188, | |
| "learning_rate": 4.844437460716531e-05, | |
| "loss": 14.2524, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.1257071024512885, | |
| "grad_norm": 34.65327453613281, | |
| "learning_rate": 4.6873035826524205e-05, | |
| "loss": 10.3562, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.18856065367693275, | |
| "grad_norm": 21.24808120727539, | |
| "learning_rate": 4.5301697045883096e-05, | |
| "loss": 7.8551, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.251414204902577, | |
| "grad_norm": 17.404918670654297, | |
| "learning_rate": 4.373035826524199e-05, | |
| "loss": 6.6346, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.3142677561282212, | |
| "grad_norm": 12.713433265686035, | |
| "learning_rate": 4.2159019484600884e-05, | |
| "loss": 5.9755, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.3771213073538655, | |
| "grad_norm": 10.050477981567383, | |
| "learning_rate": 4.0587680703959775e-05, | |
| "loss": 5.5595, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.43997485857950974, | |
| "grad_norm": 13.709216117858887, | |
| "learning_rate": 3.9016341923318666e-05, | |
| "loss": 5.2853, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.502828409805154, | |
| "grad_norm": 9.112940788269043, | |
| "learning_rate": 3.744500314267756e-05, | |
| "loss": 5.1417, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.5656819610307983, | |
| "grad_norm": 8.267425537109375, | |
| "learning_rate": 3.587366436203646e-05, | |
| "loss": 4.9615, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.6285355122564424, | |
| "grad_norm": 9.709076881408691, | |
| "learning_rate": 3.430232558139535e-05, | |
| "loss": 4.6907, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.6913890634820867, | |
| "grad_norm": 845.80859375, | |
| "learning_rate": 3.273098680075424e-05, | |
| "loss": 4.5456, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.754242614707731, | |
| "grad_norm": 5.943735599517822, | |
| "learning_rate": 3.115964802011313e-05, | |
| "loss": 4.4291, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.8170961659333752, | |
| "grad_norm": 5.8759989738464355, | |
| "learning_rate": 2.9588309239472034e-05, | |
| "loss": 4.3252, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.8799497171590195, | |
| "grad_norm": 14.995753288269043, | |
| "learning_rate": 2.8016970458830928e-05, | |
| "loss": 4.2586, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.9428032683846638, | |
| "grad_norm": 23.3351993560791, | |
| "learning_rate": 2.644563167818982e-05, | |
| "loss": 4.1372, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 3.215750217437744, | |
| "eval_runtime": 19.7611, | |
| "eval_samples_per_second": 48.479, | |
| "eval_steps_per_second": 6.073, | |
| "step": 1591 | |
| }, | |
| { | |
| "epoch": 1.005656819610308, | |
| "grad_norm": 8.584565162658691, | |
| "learning_rate": 2.4874292897548713e-05, | |
| "loss": 4.0272, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.0685103708359522, | |
| "grad_norm": 6.45043420791626, | |
| "learning_rate": 2.3302954116907607e-05, | |
| "loss": 3.9602, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.1313639220615965, | |
| "grad_norm": 6.03476095199585, | |
| "learning_rate": 2.17316153362665e-05, | |
| "loss": 3.9052, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.1942174732872408, | |
| "grad_norm": 5.746309280395508, | |
| "learning_rate": 2.0160276555625392e-05, | |
| "loss": 3.9282, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.2570710245128849, | |
| "grad_norm": 8.062549591064453, | |
| "learning_rate": 1.858893777498429e-05, | |
| "loss": 3.8096, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.3199245757385292, | |
| "grad_norm": 8.58310317993164, | |
| "learning_rate": 1.701759899434318e-05, | |
| "loss": 3.803, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.3827781269641735, | |
| "grad_norm": 7.599905490875244, | |
| "learning_rate": 1.5446260213702074e-05, | |
| "loss": 3.8381, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.4456316781898177, | |
| "grad_norm": 22.772512435913086, | |
| "learning_rate": 1.3874921433060969e-05, | |
| "loss": 3.6456, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.508485229415462, | |
| "grad_norm": 6.949570178985596, | |
| "learning_rate": 1.2303582652419863e-05, | |
| "loss": 3.7442, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.5713387806411063, | |
| "grad_norm": 5.7536821365356445, | |
| "learning_rate": 1.0732243871778757e-05, | |
| "loss": 3.691, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.6341923318667506, | |
| "grad_norm": 55.64060974121094, | |
| "learning_rate": 9.160905091137651e-06, | |
| "loss": 3.7461, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.6970458830923947, | |
| "grad_norm": 6.573077201843262, | |
| "learning_rate": 7.589566310496543e-06, | |
| "loss": 3.6186, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.759899434318039, | |
| "grad_norm": 8.615326881408691, | |
| "learning_rate": 6.018227529855437e-06, | |
| "loss": 3.6546, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.8227529855436833, | |
| "grad_norm": 6.359428405761719, | |
| "learning_rate": 4.446888749214331e-06, | |
| "loss": 3.5724, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.8856065367693273, | |
| "grad_norm": 5.5190582275390625, | |
| "learning_rate": 2.8755499685732243e-06, | |
| "loss": 3.6164, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.9484600879949716, | |
| "grad_norm": 5.9382004737854, | |
| "learning_rate": 1.3042111879321182e-06, | |
| "loss": 3.52, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 2.803544521331787, | |
| "eval_runtime": 19.8643, | |
| "eval_samples_per_second": 48.227, | |
| "eval_steps_per_second": 6.041, | |
| "step": 3182 | |
| }, | |
| { | |
| "epoch": 2.011313639220616, | |
| "grad_norm": 10.074417114257812, | |
| "learning_rate": 3.9946574481458206e-05, | |
| "loss": 3.5087, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.07416719044626, | |
| "grad_norm": 6.9990434646606445, | |
| "learning_rate": 3.963230672532998e-05, | |
| "loss": 3.5746, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.1370207416719045, | |
| "grad_norm": 6.968172073364258, | |
| "learning_rate": 3.931803896920176e-05, | |
| "loss": 3.6324, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 2.1998742928975488, | |
| "grad_norm": 179.99803161621094, | |
| "learning_rate": 3.9003771213073545e-05, | |
| "loss": 3.4072, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.262727844123193, | |
| "grad_norm": 59.86805725097656, | |
| "learning_rate": 3.868950345694532e-05, | |
| "loss": 3.391, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 2.3255813953488373, | |
| "grad_norm": 7.445355415344238, | |
| "learning_rate": 3.83752357008171e-05, | |
| "loss": 3.2032, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 2.3884349465744816, | |
| "grad_norm": 5.553746700286865, | |
| "learning_rate": 3.806096794468888e-05, | |
| "loss": 3.3644, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 2.4512884978001255, | |
| "grad_norm": 6.544325351715088, | |
| "learning_rate": 3.7746700188560656e-05, | |
| "loss": 3.1666, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 2.5141420490257698, | |
| "grad_norm": 7.863962650299072, | |
| "learning_rate": 3.7432432432432436e-05, | |
| "loss": 3.1982, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.576995600251414, | |
| "grad_norm": 10.573624610900879, | |
| "learning_rate": 3.7118164676304215e-05, | |
| "loss": 3.1336, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 2.6398491514770583, | |
| "grad_norm": 8.506134986877441, | |
| "learning_rate": 3.680389692017599e-05, | |
| "loss": 3.0191, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 2.7027027027027026, | |
| "grad_norm": 7.1274518966674805, | |
| "learning_rate": 3.6489629164047774e-05, | |
| "loss": 3.003, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 2.765556253928347, | |
| "grad_norm": 5.121671199798584, | |
| "learning_rate": 3.617536140791955e-05, | |
| "loss": 3.085, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 2.828409805153991, | |
| "grad_norm": 6.66685152053833, | |
| "learning_rate": 3.5861093651791327e-05, | |
| "loss": 3.0205, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 2.8912633563796355, | |
| "grad_norm": 8.410430908203125, | |
| "learning_rate": 3.5546825895663106e-05, | |
| "loss": 2.9611, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 2.95411690760528, | |
| "grad_norm": 6.266846179962158, | |
| "learning_rate": 3.5232558139534886e-05, | |
| "loss": 2.9299, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 2.3084471225738525, | |
| "eval_runtime": 20.0337, | |
| "eval_samples_per_second": 47.819, | |
| "eval_steps_per_second": 5.99, | |
| "step": 4773 | |
| }, | |
| { | |
| "epoch": 3.016970458830924, | |
| "grad_norm": 6.011202335357666, | |
| "learning_rate": 3.4918290383406665e-05, | |
| "loss": 2.886, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 3.0798240100565684, | |
| "grad_norm": 7.204225063323975, | |
| "learning_rate": 3.4604022627278445e-05, | |
| "loss": 2.8579, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 3.1426775612822127, | |
| "grad_norm": 10.316048622131348, | |
| "learning_rate": 3.428975487115022e-05, | |
| "loss": 2.8155, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 3.2055311125078565, | |
| "grad_norm": 6.55385684967041, | |
| "learning_rate": 3.3975487115022e-05, | |
| "loss": 2.8938, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 3.268384663733501, | |
| "grad_norm": 6.081694602966309, | |
| "learning_rate": 3.366121935889378e-05, | |
| "loss": 2.7344, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 3.331238214959145, | |
| "grad_norm": 8.186753273010254, | |
| "learning_rate": 3.3346951602765556e-05, | |
| "loss": 2.7899, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 3.3940917661847894, | |
| "grad_norm": 7.425989627838135, | |
| "learning_rate": 3.3032683846637335e-05, | |
| "loss": 2.7317, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 3.4569453174104336, | |
| "grad_norm": 5.459439277648926, | |
| "learning_rate": 3.2718416090509115e-05, | |
| "loss": 2.6456, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 3.519798868636078, | |
| "grad_norm": 5.077919006347656, | |
| "learning_rate": 3.2404148334380894e-05, | |
| "loss": 2.6816, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 3.5826524198617222, | |
| "grad_norm": 5.81939172744751, | |
| "learning_rate": 3.2089880578252674e-05, | |
| "loss": 2.64, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 3.6455059710873665, | |
| "grad_norm": 39.74727249145508, | |
| "learning_rate": 3.177561282212445e-05, | |
| "loss": 2.6725, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 3.708359522313011, | |
| "grad_norm": 5.927642345428467, | |
| "learning_rate": 3.1461345065996226e-05, | |
| "loss": 2.5395, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 3.771213073538655, | |
| "grad_norm": 5.984442710876465, | |
| "learning_rate": 3.114707730986801e-05, | |
| "loss": 2.6297, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 3.834066624764299, | |
| "grad_norm": 5.258358478546143, | |
| "learning_rate": 3.083280955373979e-05, | |
| "loss": 2.6291, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 3.8969201759899432, | |
| "grad_norm": 5.7379937171936035, | |
| "learning_rate": 3.0518541797611565e-05, | |
| "loss": 2.6116, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 3.9597737272155875, | |
| "grad_norm": 5.038835048675537, | |
| "learning_rate": 3.0204274041483344e-05, | |
| "loss": 2.6695, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 2.0932769775390625, | |
| "eval_runtime": 20.0417, | |
| "eval_samples_per_second": 47.8, | |
| "eval_steps_per_second": 5.988, | |
| "step": 6364 | |
| }, | |
| { | |
| "epoch": 4.022627278441232, | |
| "grad_norm": 7.459395885467529, | |
| "learning_rate": 2.9890006285355127e-05, | |
| "loss": 2.6404, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 4.085480829666876, | |
| "grad_norm": 6.721461296081543, | |
| "learning_rate": 2.9575738529226903e-05, | |
| "loss": 2.4614, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 4.14833438089252, | |
| "grad_norm": 6.69769287109375, | |
| "learning_rate": 2.9261470773098683e-05, | |
| "loss": 2.457, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 4.211187932118165, | |
| "grad_norm": 5.306356906890869, | |
| "learning_rate": 2.894720301697046e-05, | |
| "loss": 2.513, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 4.274041483343809, | |
| "grad_norm": 5.425265312194824, | |
| "learning_rate": 2.8632935260842235e-05, | |
| "loss": 2.5467, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 4.336895034569453, | |
| "grad_norm": 4.722207546234131, | |
| "learning_rate": 2.8318667504714018e-05, | |
| "loss": 2.3467, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 4.3997485857950975, | |
| "grad_norm": 4.346086502075195, | |
| "learning_rate": 2.8004399748585797e-05, | |
| "loss": 2.5098, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 4.462602137020742, | |
| "grad_norm": 7.4684319496154785, | |
| "learning_rate": 2.7690131992457573e-05, | |
| "loss": 2.4396, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 4.525455688246386, | |
| "grad_norm": 5.709039688110352, | |
| "learning_rate": 2.7375864236329353e-05, | |
| "loss": 2.4688, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 4.58830923947203, | |
| "grad_norm": 4.952858924865723, | |
| "learning_rate": 2.7061596480201136e-05, | |
| "loss": 2.3643, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 4.651162790697675, | |
| "grad_norm": 6.68017578125, | |
| "learning_rate": 2.6747328724072912e-05, | |
| "loss": 2.4242, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 4.714016341923319, | |
| "grad_norm": 3.584669828414917, | |
| "learning_rate": 2.6433060967944688e-05, | |
| "loss": 2.4552, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 4.776869893148963, | |
| "grad_norm": 5.264488220214844, | |
| "learning_rate": 2.6118793211816468e-05, | |
| "loss": 2.4232, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 4.8397234443746076, | |
| "grad_norm": 4.609414100646973, | |
| "learning_rate": 2.580452545568825e-05, | |
| "loss": 2.4418, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 4.902576995600251, | |
| "grad_norm": 4.986881256103516, | |
| "learning_rate": 2.5490257699560027e-05, | |
| "loss": 2.4065, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 4.965430546825896, | |
| "grad_norm": 4.9718098640441895, | |
| "learning_rate": 2.5175989943431806e-05, | |
| "loss": 2.4589, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 1.984979271888733, | |
| "eval_runtime": 20.0353, | |
| "eval_samples_per_second": 47.816, | |
| "eval_steps_per_second": 5.989, | |
| "step": 7955 | |
| }, | |
| { | |
| "epoch": 5.0282840980515395, | |
| "grad_norm": 5.2526750564575195, | |
| "learning_rate": 2.4861722187303586e-05, | |
| "loss": 2.2708, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 5.091137649277184, | |
| "grad_norm": 5.312747001647949, | |
| "learning_rate": 2.454745443117536e-05, | |
| "loss": 2.3068, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 5.153991200502828, | |
| "grad_norm": 7.204046726226807, | |
| "learning_rate": 2.423318667504714e-05, | |
| "loss": 2.3729, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 5.216844751728472, | |
| "grad_norm": 4.8044753074646, | |
| "learning_rate": 2.391891891891892e-05, | |
| "loss": 2.3501, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 5.279698302954117, | |
| "grad_norm": 6.9473185539245605, | |
| "learning_rate": 2.3604651162790697e-05, | |
| "loss": 2.3398, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 5.342551854179761, | |
| "grad_norm": 4.014726161956787, | |
| "learning_rate": 2.3290383406662476e-05, | |
| "loss": 2.2938, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 5.405405405405405, | |
| "grad_norm": 6.722488880157471, | |
| "learning_rate": 2.2976115650534256e-05, | |
| "loss": 2.2354, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 5.4682589566310495, | |
| "grad_norm": 5.856524467468262, | |
| "learning_rate": 2.2661847894406035e-05, | |
| "loss": 2.2757, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 5.531112507856694, | |
| "grad_norm": 4.9930644035339355, | |
| "learning_rate": 2.234758013827781e-05, | |
| "loss": 2.2586, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 5.593966059082338, | |
| "grad_norm": 5.49005126953125, | |
| "learning_rate": 2.2033312382149594e-05, | |
| "loss": 2.3155, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 5.656819610307982, | |
| "grad_norm": 8.850517272949219, | |
| "learning_rate": 2.171904462602137e-05, | |
| "loss": 2.2841, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 5.719673161533627, | |
| "grad_norm": 5.094405651092529, | |
| "learning_rate": 2.140477686989315e-05, | |
| "loss": 2.3147, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 5.782526712759271, | |
| "grad_norm": 4.709909439086914, | |
| "learning_rate": 2.109050911376493e-05, | |
| "loss": 2.1584, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 5.845380263984915, | |
| "grad_norm": 4.1693525314331055, | |
| "learning_rate": 2.077624135763671e-05, | |
| "loss": 2.2396, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 5.90823381521056, | |
| "grad_norm": 6.800940036773682, | |
| "learning_rate": 2.0461973601508485e-05, | |
| "loss": 2.301, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 5.971087366436204, | |
| "grad_norm": 7.419278144836426, | |
| "learning_rate": 2.0147705845380265e-05, | |
| "loss": 2.3142, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 1.905881643295288, | |
| "eval_runtime": 20.0332, | |
| "eval_samples_per_second": 47.821, | |
| "eval_steps_per_second": 5.99, | |
| "step": 9546 | |
| }, | |
| { | |
| "epoch": 6.033940917661848, | |
| "grad_norm": 4.217894077301025, | |
| "learning_rate": 1.9833438089252044e-05, | |
| "loss": 2.1013, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 6.096794468887492, | |
| "grad_norm": 5.345584869384766, | |
| "learning_rate": 1.9519170333123824e-05, | |
| "loss": 2.2714, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 6.159648020113137, | |
| "grad_norm": 5.364700794219971, | |
| "learning_rate": 1.92049025769956e-05, | |
| "loss": 2.2381, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 6.222501571338781, | |
| "grad_norm": 4.380568504333496, | |
| "learning_rate": 1.8890634820867383e-05, | |
| "loss": 2.1527, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 6.285355122564425, | |
| "grad_norm": 6.300790309906006, | |
| "learning_rate": 1.857636706473916e-05, | |
| "loss": 2.1771, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 6.348208673790069, | |
| "grad_norm": 5.757110118865967, | |
| "learning_rate": 1.8262099308610938e-05, | |
| "loss": 2.1695, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 6.411062225015713, | |
| "grad_norm": 4.908361434936523, | |
| "learning_rate": 1.7947831552482718e-05, | |
| "loss": 2.1056, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 6.473915776241357, | |
| "grad_norm": 5.048102378845215, | |
| "learning_rate": 1.7633563796354494e-05, | |
| "loss": 2.2112, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 6.536769327467002, | |
| "grad_norm": 8.040143013000488, | |
| "learning_rate": 1.7319296040226273e-05, | |
| "loss": 2.0298, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 6.599622878692646, | |
| "grad_norm": 5.15581750869751, | |
| "learning_rate": 1.7005028284098053e-05, | |
| "loss": 2.1224, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 6.66247642991829, | |
| "grad_norm": 4.935842514038086, | |
| "learning_rate": 1.6690760527969832e-05, | |
| "loss": 2.0772, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 6.725329981143934, | |
| "grad_norm": 5.487718105316162, | |
| "learning_rate": 1.637649277184161e-05, | |
| "loss": 2.2552, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 6.788183532369579, | |
| "grad_norm": 5.713748455047607, | |
| "learning_rate": 1.6062225015713388e-05, | |
| "loss": 2.1358, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 6.851037083595223, | |
| "grad_norm": 4.882757186889648, | |
| "learning_rate": 1.5747957259585168e-05, | |
| "loss": 2.1613, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 6.913890634820867, | |
| "grad_norm": 5.634950637817383, | |
| "learning_rate": 1.5433689503456947e-05, | |
| "loss": 2.2567, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 6.976744186046512, | |
| "grad_norm": 5.634829044342041, | |
| "learning_rate": 1.5119421747328725e-05, | |
| "loss": 2.1283, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 1.84635591506958, | |
| "eval_runtime": 20.0367, | |
| "eval_samples_per_second": 47.812, | |
| "eval_steps_per_second": 5.989, | |
| "step": 11137 | |
| }, | |
| { | |
| "epoch": 7.039597737272156, | |
| "grad_norm": 5.635861873626709, | |
| "learning_rate": 1.4805153991200504e-05, | |
| "loss": 2.0938, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 7.1024512884978, | |
| "grad_norm": 5.214977741241455, | |
| "learning_rate": 1.4490886235072282e-05, | |
| "loss": 2.062, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 7.1653048397234445, | |
| "grad_norm": 7.498839855194092, | |
| "learning_rate": 1.4176618478944062e-05, | |
| "loss": 2.1292, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 7.228158390949089, | |
| "grad_norm": 5.83459997177124, | |
| "learning_rate": 1.386235072281584e-05, | |
| "loss": 2.0796, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 7.291011942174733, | |
| "grad_norm": 3.8935282230377197, | |
| "learning_rate": 1.3548082966687619e-05, | |
| "loss": 2.1414, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 7.353865493400377, | |
| "grad_norm": 5.774020671844482, | |
| "learning_rate": 1.3233815210559397e-05, | |
| "loss": 2.145, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 7.416719044626022, | |
| "grad_norm": 128.24192810058594, | |
| "learning_rate": 1.2919547454431178e-05, | |
| "loss": 2.0242, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 7.479572595851666, | |
| "grad_norm": 4.4846367835998535, | |
| "learning_rate": 1.2605279698302954e-05, | |
| "loss": 2.0936, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 7.54242614707731, | |
| "grad_norm": 5.091222763061523, | |
| "learning_rate": 1.2291011942174734e-05, | |
| "loss": 2.1988, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 7.6052796983029545, | |
| "grad_norm": 3.3482093811035156, | |
| "learning_rate": 1.1976744186046513e-05, | |
| "loss": 2.1323, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 7.668133249528598, | |
| "grad_norm": 5.329409599304199, | |
| "learning_rate": 1.1662476429918291e-05, | |
| "loss": 2.0587, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 7.730986800754243, | |
| "grad_norm": 7.584386348724365, | |
| "learning_rate": 1.134820867379007e-05, | |
| "loss": 2.1341, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 7.7938403519798864, | |
| "grad_norm": 5.996345520019531, | |
| "learning_rate": 1.1033940917661848e-05, | |
| "loss": 2.1108, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 7.856693903205531, | |
| "grad_norm": 6.1731648445129395, | |
| "learning_rate": 1.0719673161533628e-05, | |
| "loss": 2.1218, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 7.919547454431175, | |
| "grad_norm": 5.414481163024902, | |
| "learning_rate": 1.0405405405405407e-05, | |
| "loss": 2.028, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 7.982401005656819, | |
| "grad_norm": 7.198294639587402, | |
| "learning_rate": 1.0091137649277185e-05, | |
| "loss": 2.0489, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 1.8111430406570435, | |
| "eval_runtime": 20.0666, | |
| "eval_samples_per_second": 47.741, | |
| "eval_steps_per_second": 5.98, | |
| "step": 12728 | |
| }, | |
| { | |
| "epoch": 8.045254556882464, | |
| "grad_norm": 6.677022933959961, | |
| "learning_rate": 9.776869893148963e-06, | |
| "loss": 2.0814, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 8.108108108108109, | |
| "grad_norm": 5.1916728019714355, | |
| "learning_rate": 9.46260213702074e-06, | |
| "loss": 2.119, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 8.170961659333752, | |
| "grad_norm": 6.04162073135376, | |
| "learning_rate": 9.14833438089252e-06, | |
| "loss": 2.0058, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 8.233815210559397, | |
| "grad_norm": 4.764267444610596, | |
| "learning_rate": 8.8340666247643e-06, | |
| "loss": 2.0113, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 8.29666876178504, | |
| "grad_norm": 5.77971887588501, | |
| "learning_rate": 8.519798868636078e-06, | |
| "loss": 2.0392, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 8.359522313010686, | |
| "grad_norm": 5.698218822479248, | |
| "learning_rate": 8.205531112507857e-06, | |
| "loss": 2.107, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 8.42237586423633, | |
| "grad_norm": 5.236012935638428, | |
| "learning_rate": 7.891263356379635e-06, | |
| "loss": 2.0829, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 8.485229415461973, | |
| "grad_norm": 4.379955291748047, | |
| "learning_rate": 7.576995600251414e-06, | |
| "loss": 1.9321, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 8.548082966687618, | |
| "grad_norm": 6.034859657287598, | |
| "learning_rate": 7.262727844123193e-06, | |
| "loss": 2.1013, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 8.610936517913261, | |
| "grad_norm": 5.320705413818359, | |
| "learning_rate": 6.948460087994972e-06, | |
| "loss": 2.0543, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 8.673790069138906, | |
| "grad_norm": 5.735895156860352, | |
| "learning_rate": 6.634192331866751e-06, | |
| "loss": 2.0594, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 8.73664362036455, | |
| "grad_norm": 4.845800876617432, | |
| "learning_rate": 6.31992457573853e-06, | |
| "loss": 1.9402, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 8.799497171590195, | |
| "grad_norm": 4.628382682800293, | |
| "learning_rate": 6.0056568196103085e-06, | |
| "loss": 1.9937, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 8.862350722815838, | |
| "grad_norm": 4.747410774230957, | |
| "learning_rate": 5.691389063482086e-06, | |
| "loss": 2.0654, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 8.925204274041484, | |
| "grad_norm": 4.694166660308838, | |
| "learning_rate": 5.377121307353866e-06, | |
| "loss": 2.0523, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 8.988057825267127, | |
| "grad_norm": 6.711084365844727, | |
| "learning_rate": 5.0628535512256445e-06, | |
| "loss": 1.9856, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 1.7920939922332764, | |
| "eval_runtime": 20.0378, | |
| "eval_samples_per_second": 47.81, | |
| "eval_steps_per_second": 5.989, | |
| "step": 14319 | |
| }, | |
| { | |
| "epoch": 9.050911376492772, | |
| "grad_norm": 6.053162097930908, | |
| "learning_rate": 4.748585795097423e-06, | |
| "loss": 2.0392, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 9.113764927718416, | |
| "grad_norm": 4.806529521942139, | |
| "learning_rate": 4.434318038969202e-06, | |
| "loss": 2.0308, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 9.17661847894406, | |
| "grad_norm": 4.725819110870361, | |
| "learning_rate": 4.1200502828409805e-06, | |
| "loss": 2.0441, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 9.239472030169704, | |
| "grad_norm": 4.637420177459717, | |
| "learning_rate": 3.8057825267127596e-06, | |
| "loss": 2.0061, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 9.30232558139535, | |
| "grad_norm": 6.441665172576904, | |
| "learning_rate": 3.4915147705845382e-06, | |
| "loss": 2.1299, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 9.365179132620993, | |
| "grad_norm": 3.506943941116333, | |
| "learning_rate": 3.1772470144563173e-06, | |
| "loss": 1.9443, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 9.428032683846638, | |
| "grad_norm": 8.454822540283203, | |
| "learning_rate": 2.8629792583280956e-06, | |
| "loss": 2.0327, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 9.490886235072281, | |
| "grad_norm": 5.021187782287598, | |
| "learning_rate": 2.5487115021998746e-06, | |
| "loss": 1.9839, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 9.553739786297927, | |
| "grad_norm": 6.3962016105651855, | |
| "learning_rate": 2.234443746071653e-06, | |
| "loss": 2.0604, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 9.61659333752357, | |
| "grad_norm": 5.531436443328857, | |
| "learning_rate": 1.920175989943432e-06, | |
| "loss": 2.0168, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 9.679446888749215, | |
| "grad_norm": 4.300695896148682, | |
| "learning_rate": 1.6059082338152106e-06, | |
| "loss": 1.9994, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 9.742300439974859, | |
| "grad_norm": 3.102018356323242, | |
| "learning_rate": 1.2916404776869893e-06, | |
| "loss": 2.0441, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 9.805153991200502, | |
| "grad_norm": 4.91919469833374, | |
| "learning_rate": 9.773727215587681e-07, | |
| "loss": 1.9584, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 9.868007542426147, | |
| "grad_norm": 4.21737813949585, | |
| "learning_rate": 6.631049654305469e-07, | |
| "loss": 2.0019, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 9.930861093651792, | |
| "grad_norm": 4.098769187927246, | |
| "learning_rate": 3.4883720930232557e-07, | |
| "loss": 2.0121, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 9.993714644877436, | |
| "grad_norm": 4.722096920013428, | |
| "learning_rate": 3.456945317410434e-08, | |
| "loss": 2.0196, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 1.787421464920044, | |
| "eval_runtime": 20.0243, | |
| "eval_samples_per_second": 47.842, | |
| "eval_steps_per_second": 5.993, | |
| "step": 15910 | |
| }, | |
| { | |
| "epoch": 10.056568196103079, | |
| "grad_norm": 3.8331987857818604, | |
| "learning_rate": 2.4860150848522942e-05, | |
| "loss": 2.0388, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 10.119421747328724, | |
| "grad_norm": 3.9292027950286865, | |
| "learning_rate": 2.4703016970458832e-05, | |
| "loss": 2.0913, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 10.182275298554368, | |
| "grad_norm": 5.124855995178223, | |
| "learning_rate": 2.454588309239472e-05, | |
| "loss": 2.0452, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 10.245128849780013, | |
| "grad_norm": 5.743933200836182, | |
| "learning_rate": 2.438874921433061e-05, | |
| "loss": 2.016, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 10.307982401005656, | |
| "grad_norm": 6.4510931968688965, | |
| "learning_rate": 2.42316153362665e-05, | |
| "loss": 1.9785, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 10.370835952231301, | |
| "grad_norm": 6.550465106964111, | |
| "learning_rate": 2.4074481458202387e-05, | |
| "loss": 1.9912, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 10.433689503456945, | |
| "grad_norm": 5.37285852432251, | |
| "learning_rate": 2.391734758013828e-05, | |
| "loss": 2.0549, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 10.49654305468259, | |
| "grad_norm": 5.4893412590026855, | |
| "learning_rate": 2.376021370207417e-05, | |
| "loss": 1.9434, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 10.559396605908233, | |
| "grad_norm": 4.316259384155273, | |
| "learning_rate": 2.3603079824010057e-05, | |
| "loss": 1.8413, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 10.622250157133879, | |
| "grad_norm": 3.4342756271362305, | |
| "learning_rate": 2.3445945945945946e-05, | |
| "loss": 1.9312, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 10.685103708359522, | |
| "grad_norm": 5.680815696716309, | |
| "learning_rate": 2.3288812067881836e-05, | |
| "loss": 1.9678, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 10.747957259585167, | |
| "grad_norm": 6.04569149017334, | |
| "learning_rate": 2.3131678189817726e-05, | |
| "loss": 2.0329, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 10.81081081081081, | |
| "grad_norm": 9.336991310119629, | |
| "learning_rate": 2.2974544311753616e-05, | |
| "loss": 1.9575, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 10.873664362036456, | |
| "grad_norm": 3.826447010040283, | |
| "learning_rate": 2.2817410433689505e-05, | |
| "loss": 1.9692, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 10.936517913262099, | |
| "grad_norm": 4.134801387786865, | |
| "learning_rate": 2.2660276555625392e-05, | |
| "loss": 2.0406, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 10.999371464487744, | |
| "grad_norm": 5.291431903839111, | |
| "learning_rate": 2.2503142677561285e-05, | |
| "loss": 1.9631, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_loss": 1.7517410516738892, | |
| "eval_runtime": 21.6572, | |
| "eval_samples_per_second": 44.235, | |
| "eval_steps_per_second": 5.541, | |
| "step": 17501 | |
| }, | |
| { | |
| "epoch": 11.062225015713388, | |
| "grad_norm": 4.9575066566467285, | |
| "learning_rate": 2.234600879949717e-05, | |
| "loss": 1.9381, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 11.125078566939033, | |
| "grad_norm": 12.871175765991211, | |
| "learning_rate": 2.218887492143306e-05, | |
| "loss": 1.8867, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 11.187932118164676, | |
| "grad_norm": 4.3662519454956055, | |
| "learning_rate": 2.203174104336895e-05, | |
| "loss": 1.9713, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 11.250785669390321, | |
| "grad_norm": 5.662289619445801, | |
| "learning_rate": 2.187460716530484e-05, | |
| "loss": 1.9188, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 11.313639220615965, | |
| "grad_norm": 7.633818626403809, | |
| "learning_rate": 2.171747328724073e-05, | |
| "loss": 1.9142, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 11.376492771841608, | |
| "grad_norm": 4.940028667449951, | |
| "learning_rate": 2.156033940917662e-05, | |
| "loss": 1.8697, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 11.439346323067253, | |
| "grad_norm": 5.070211410522461, | |
| "learning_rate": 2.1403205531112506e-05, | |
| "loss": 1.9624, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 11.502199874292897, | |
| "grad_norm": 7.409548282623291, | |
| "learning_rate": 2.12460716530484e-05, | |
| "loss": 1.9283, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 11.565053425518542, | |
| "grad_norm": 6.541192531585693, | |
| "learning_rate": 2.108893777498429e-05, | |
| "loss": 1.9357, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 11.627906976744185, | |
| "grad_norm": 5.941864967346191, | |
| "learning_rate": 2.0931803896920176e-05, | |
| "loss": 1.869, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 11.69076052796983, | |
| "grad_norm": 9.418646812438965, | |
| "learning_rate": 2.0774670018856065e-05, | |
| "loss": 1.8518, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 11.753614079195474, | |
| "grad_norm": 5.367152690887451, | |
| "learning_rate": 2.061753614079196e-05, | |
| "loss": 1.8945, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 11.81646763042112, | |
| "grad_norm": 5.896432399749756, | |
| "learning_rate": 2.0460402262727845e-05, | |
| "loss": 1.8569, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 11.879321181646763, | |
| "grad_norm": 6.137564182281494, | |
| "learning_rate": 2.0303268384663735e-05, | |
| "loss": 1.9179, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 11.942174732872408, | |
| "grad_norm": 4.5933918952941895, | |
| "learning_rate": 2.0146134506599625e-05, | |
| "loss": 1.8941, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 1.7062737941741943, | |
| "eval_runtime": 21.7167, | |
| "eval_samples_per_second": 44.114, | |
| "eval_steps_per_second": 5.526, | |
| "step": 19092 | |
| }, | |
| { | |
| "epoch": 12.005028284098051, | |
| "grad_norm": 5.298050880432129, | |
| "learning_rate": 1.998900062853551e-05, | |
| "loss": 1.8681, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 12.067881835323696, | |
| "grad_norm": 7.001854419708252, | |
| "learning_rate": 1.9831866750471404e-05, | |
| "loss": 1.8377, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 12.13073538654934, | |
| "grad_norm": 4.692386150360107, | |
| "learning_rate": 1.9674732872407294e-05, | |
| "loss": 1.8279, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 12.193588937774985, | |
| "grad_norm": 6.864208221435547, | |
| "learning_rate": 1.951759899434318e-05, | |
| "loss": 1.8855, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 12.256442489000628, | |
| "grad_norm": 3.883880853652954, | |
| "learning_rate": 1.936046511627907e-05, | |
| "loss": 1.84, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 12.319296040226273, | |
| "grad_norm": 5.302524566650391, | |
| "learning_rate": 1.920333123821496e-05, | |
| "loss": 1.8791, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 12.382149591451917, | |
| "grad_norm": 6.854051113128662, | |
| "learning_rate": 1.904619736015085e-05, | |
| "loss": 1.9189, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 12.445003142677562, | |
| "grad_norm": 4.728283405303955, | |
| "learning_rate": 1.888906348208674e-05, | |
| "loss": 1.8903, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 12.507856693903205, | |
| "grad_norm": 4.314347267150879, | |
| "learning_rate": 1.8731929604022626e-05, | |
| "loss": 1.8615, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 12.57071024512885, | |
| "grad_norm": 3.873619318008423, | |
| "learning_rate": 1.857479572595852e-05, | |
| "loss": 1.8232, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 12.633563796354494, | |
| "grad_norm": 6.445096969604492, | |
| "learning_rate": 1.841766184789441e-05, | |
| "loss": 1.7764, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 12.696417347580137, | |
| "grad_norm": 4.258322715759277, | |
| "learning_rate": 1.8260527969830295e-05, | |
| "loss": 1.869, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 12.759270898805783, | |
| "grad_norm": 7.782538414001465, | |
| "learning_rate": 1.8103394091766185e-05, | |
| "loss": 1.7986, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 12.822124450031426, | |
| "grad_norm": 7.189488887786865, | |
| "learning_rate": 1.7946260213702078e-05, | |
| "loss": 1.8448, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 12.884978001257071, | |
| "grad_norm": 5.59601354598999, | |
| "learning_rate": 1.7789126335637964e-05, | |
| "loss": 1.7924, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 12.947831552482715, | |
| "grad_norm": 4.675200939178467, | |
| "learning_rate": 1.7631992457573854e-05, | |
| "loss": 1.8212, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_loss": 1.6696668863296509, | |
| "eval_runtime": 21.645, | |
| "eval_samples_per_second": 44.26, | |
| "eval_steps_per_second": 5.544, | |
| "step": 20683 | |
| }, | |
| { | |
| "epoch": 13.01068510370836, | |
| "grad_norm": 3.3650217056274414, | |
| "learning_rate": 1.7474858579509744e-05, | |
| "loss": 1.6872, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 13.073538654934003, | |
| "grad_norm": 6.4758219718933105, | |
| "learning_rate": 1.731772470144563e-05, | |
| "loss": 1.8029, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 13.136392206159648, | |
| "grad_norm": 4.500367641448975, | |
| "learning_rate": 1.7160590823381523e-05, | |
| "loss": 1.8655, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 13.199245757385292, | |
| "grad_norm": 5.369949817657471, | |
| "learning_rate": 1.7003456945317413e-05, | |
| "loss": 1.821, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 13.262099308610937, | |
| "grad_norm": 4.84245491027832, | |
| "learning_rate": 1.68463230672533e-05, | |
| "loss": 1.7454, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 13.32495285983658, | |
| "grad_norm": 4.510051727294922, | |
| "learning_rate": 1.668918918918919e-05, | |
| "loss": 1.8378, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 13.387806411062225, | |
| "grad_norm": 5.163560390472412, | |
| "learning_rate": 1.653205531112508e-05, | |
| "loss": 1.7985, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 13.450659962287869, | |
| "grad_norm": 4.454617023468018, | |
| "learning_rate": 1.637492143306097e-05, | |
| "loss": 1.8177, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 13.513513513513514, | |
| "grad_norm": 3.672908067703247, | |
| "learning_rate": 1.6217787554996858e-05, | |
| "loss": 1.6908, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 13.576367064739157, | |
| "grad_norm": 4.549923419952393, | |
| "learning_rate": 1.6060653676932748e-05, | |
| "loss": 1.7603, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 13.639220615964803, | |
| "grad_norm": 5.733989715576172, | |
| "learning_rate": 1.5903519798868638e-05, | |
| "loss": 1.7689, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 13.702074167190446, | |
| "grad_norm": 4.507519245147705, | |
| "learning_rate": 1.5746385920804527e-05, | |
| "loss": 1.7984, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 13.764927718416091, | |
| "grad_norm": 4.713226795196533, | |
| "learning_rate": 1.5589252042740414e-05, | |
| "loss": 1.8011, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 13.827781269641735, | |
| "grad_norm": 4.300686359405518, | |
| "learning_rate": 1.5432118164676304e-05, | |
| "loss": 1.7743, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 13.89063482086738, | |
| "grad_norm": 4.702789306640625, | |
| "learning_rate": 1.5274984286612197e-05, | |
| "loss": 1.6903, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 13.953488372093023, | |
| "grad_norm": 6.481640815734863, | |
| "learning_rate": 1.5117850408548085e-05, | |
| "loss": 1.822, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_loss": 1.648952603340149, | |
| "eval_runtime": 21.6512, | |
| "eval_samples_per_second": 44.247, | |
| "eval_steps_per_second": 5.542, | |
| "step": 22274 | |
| }, | |
| { | |
| "epoch": 14.016341923318668, | |
| "grad_norm": 4.320845127105713, | |
| "learning_rate": 2.1968573224387177e-05, | |
| "loss": 1.7866, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 14.079195474544312, | |
| "grad_norm": 5.575278282165527, | |
| "learning_rate": 2.184286612193589e-05, | |
| "loss": 1.7572, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 14.142049025769955, | |
| "grad_norm": 5.764155387878418, | |
| "learning_rate": 2.17171590194846e-05, | |
| "loss": 1.7566, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 14.2049025769956, | |
| "grad_norm": 4.854477882385254, | |
| "learning_rate": 2.1591451917033316e-05, | |
| "loss": 1.7517, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 14.267756128221244, | |
| "grad_norm": 4.7141618728637695, | |
| "learning_rate": 2.1465744814582025e-05, | |
| "loss": 1.713, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 14.330609679446889, | |
| "grad_norm": 4.3324785232543945, | |
| "learning_rate": 2.1340037712130736e-05, | |
| "loss": 1.7511, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 14.393463230672532, | |
| "grad_norm": 3.4204530715942383, | |
| "learning_rate": 2.1214330609679448e-05, | |
| "loss": 1.7451, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 14.456316781898177, | |
| "grad_norm": 4.925296783447266, | |
| "learning_rate": 2.108862350722816e-05, | |
| "loss": 1.6868, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 14.51917033312382, | |
| "grad_norm": 4.997200965881348, | |
| "learning_rate": 2.0962916404776872e-05, | |
| "loss": 1.7259, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 14.582023884349466, | |
| "grad_norm": 4.816483497619629, | |
| "learning_rate": 2.0837209302325584e-05, | |
| "loss": 1.7716, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 14.64487743557511, | |
| "grad_norm": 5.224360466003418, | |
| "learning_rate": 2.0711502199874295e-05, | |
| "loss": 1.7039, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 14.707730986800755, | |
| "grad_norm": 7.450541019439697, | |
| "learning_rate": 2.0585795097423004e-05, | |
| "loss": 1.6634, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 14.770584538026398, | |
| "grad_norm": 5.811767101287842, | |
| "learning_rate": 2.0460087994971716e-05, | |
| "loss": 1.7526, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 14.833438089252043, | |
| "grad_norm": 4.1061272621154785, | |
| "learning_rate": 2.0334380892520427e-05, | |
| "loss": 1.7612, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 14.896291640477687, | |
| "grad_norm": 4.599556922912598, | |
| "learning_rate": 2.020867379006914e-05, | |
| "loss": 1.776, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 14.959145191703332, | |
| "grad_norm": 4.085700988769531, | |
| "learning_rate": 2.008296668761785e-05, | |
| "loss": 1.7143, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 1.6270309686660767, | |
| "eval_runtime": 20.346, | |
| "eval_samples_per_second": 47.085, | |
| "eval_steps_per_second": 5.898, | |
| "step": 23865 | |
| }, | |
| { | |
| "epoch": 15.021998742928975, | |
| "grad_norm": 8.476902961730957, | |
| "learning_rate": 1.9957259585166563e-05, | |
| "loss": 1.6504, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 15.08485229415462, | |
| "grad_norm": 4.84979772567749, | |
| "learning_rate": 1.9831552482715275e-05, | |
| "loss": 1.7259, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 15.147705845380264, | |
| "grad_norm": 4.314637184143066, | |
| "learning_rate": 1.9705845380263983e-05, | |
| "loss": 1.6254, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 15.210559396605909, | |
| "grad_norm": 4.656597137451172, | |
| "learning_rate": 1.9580138277812698e-05, | |
| "loss": 1.7493, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 15.273412947831552, | |
| "grad_norm": 4.276788711547852, | |
| "learning_rate": 1.945443117536141e-05, | |
| "loss": 1.6797, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 15.336266499057198, | |
| "grad_norm": 3.9574031829833984, | |
| "learning_rate": 1.9328724072910122e-05, | |
| "loss": 1.716, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 15.399120050282841, | |
| "grad_norm": 8.148831367492676, | |
| "learning_rate": 1.920301697045883e-05, | |
| "loss": 1.6737, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 15.461973601508486, | |
| "grad_norm": 3.8734018802642822, | |
| "learning_rate": 1.9077309868007542e-05, | |
| "loss": 1.6452, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 15.52482715273413, | |
| "grad_norm": 4.928835391998291, | |
| "learning_rate": 1.8951602765556257e-05, | |
| "loss": 1.7134, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 15.587680703959773, | |
| "grad_norm": 4.991033554077148, | |
| "learning_rate": 1.8825895663104966e-05, | |
| "loss": 1.7327, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 15.650534255185418, | |
| "grad_norm": 4.160732269287109, | |
| "learning_rate": 1.8700188560653677e-05, | |
| "loss": 1.6678, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 15.713387806411061, | |
| "grad_norm": 6.523078441619873, | |
| "learning_rate": 1.857448145820239e-05, | |
| "loss": 1.6856, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 15.776241357636707, | |
| "grad_norm": 6.306403636932373, | |
| "learning_rate": 1.84487743557511e-05, | |
| "loss": 1.6699, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 15.83909490886235, | |
| "grad_norm": 4.479640483856201, | |
| "learning_rate": 1.832306725329981e-05, | |
| "loss": 1.676, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 15.901948460087995, | |
| "grad_norm": 4.6891279220581055, | |
| "learning_rate": 1.8197360150848525e-05, | |
| "loss": 1.667, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 15.964802011313639, | |
| "grad_norm": 5.908668518066406, | |
| "learning_rate": 1.8071653048397236e-05, | |
| "loss": 1.6267, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 1.608726143836975, | |
| "eval_runtime": 20.3571, | |
| "eval_samples_per_second": 47.06, | |
| "eval_steps_per_second": 5.895, | |
| "step": 25456 | |
| }, | |
| { | |
| "epoch": 16.027655562539284, | |
| "grad_norm": 4.081086158752441, | |
| "learning_rate": 1.7945945945945948e-05, | |
| "loss": 1.5625, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 16.090509113764927, | |
| "grad_norm": 3.7648415565490723, | |
| "learning_rate": 1.7820238843494657e-05, | |
| "loss": 1.6818, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 16.15336266499057, | |
| "grad_norm": 5.430357456207275, | |
| "learning_rate": 1.769453174104337e-05, | |
| "loss": 1.6125, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 16.216216216216218, | |
| "grad_norm": 5.235119819641113, | |
| "learning_rate": 1.7568824638592084e-05, | |
| "loss": 1.6985, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 16.27906976744186, | |
| "grad_norm": 5.521476745605469, | |
| "learning_rate": 1.7443117536140792e-05, | |
| "loss": 1.6291, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 16.341923318667504, | |
| "grad_norm": 5.7086873054504395, | |
| "learning_rate": 1.7317410433689504e-05, | |
| "loss": 1.6523, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 16.404776869893148, | |
| "grad_norm": 5.697257041931152, | |
| "learning_rate": 1.7191703331238216e-05, | |
| "loss": 1.6518, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 16.467630421118795, | |
| "grad_norm": 8.258442878723145, | |
| "learning_rate": 1.7065996228786928e-05, | |
| "loss": 1.6314, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 16.530483972344438, | |
| "grad_norm": 4.087442874908447, | |
| "learning_rate": 1.694028912633564e-05, | |
| "loss": 1.7048, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 16.59333752357008, | |
| "grad_norm": 4.184548377990723, | |
| "learning_rate": 1.681458202388435e-05, | |
| "loss": 1.6062, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 16.656191074795725, | |
| "grad_norm": 5.8042707443237305, | |
| "learning_rate": 1.6688874921433063e-05, | |
| "loss": 1.6239, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 16.719044626021372, | |
| "grad_norm": 4.104475498199463, | |
| "learning_rate": 1.656316781898177e-05, | |
| "loss": 1.5742, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 16.781898177247015, | |
| "grad_norm": 4.2934722900390625, | |
| "learning_rate": 1.6437460716530483e-05, | |
| "loss": 1.6069, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 16.84475172847266, | |
| "grad_norm": 4.601330757141113, | |
| "learning_rate": 1.6311753614079195e-05, | |
| "loss": 1.5827, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 16.907605279698302, | |
| "grad_norm": 4.304816246032715, | |
| "learning_rate": 1.618604651162791e-05, | |
| "loss": 1.6461, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 16.970458830923945, | |
| "grad_norm": 6.80120325088501, | |
| "learning_rate": 1.606033940917662e-05, | |
| "loss": 1.6143, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_loss": 1.5869935750961304, | |
| "eval_runtime": 20.3162, | |
| "eval_samples_per_second": 47.154, | |
| "eval_steps_per_second": 5.907, | |
| "step": 27047 | |
| }, | |
| { | |
| "epoch": 17.033312382149592, | |
| "grad_norm": 4.368440628051758, | |
| "learning_rate": 1.593463230672533e-05, | |
| "loss": 1.6352, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 17.096165933375236, | |
| "grad_norm": 4.066120624542236, | |
| "learning_rate": 1.5808925204274042e-05, | |
| "loss": 1.5052, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 17.15901948460088, | |
| "grad_norm": 6.150811672210693, | |
| "learning_rate": 1.5683218101822754e-05, | |
| "loss": 1.5449, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 17.221873035826523, | |
| "grad_norm": 7.994663715362549, | |
| "learning_rate": 1.5557510999371466e-05, | |
| "loss": 1.7157, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 17.28472658705217, | |
| "grad_norm": 3.554856061935425, | |
| "learning_rate": 1.5431803896920178e-05, | |
| "loss": 1.5878, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 17.347580138277813, | |
| "grad_norm": 4.025883674621582, | |
| "learning_rate": 1.530609679446889e-05, | |
| "loss": 1.6454, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 17.410433689503456, | |
| "grad_norm": 2.9825448989868164, | |
| "learning_rate": 1.51803896920176e-05, | |
| "loss": 1.5605, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 17.4732872407291, | |
| "grad_norm": 4.528345584869385, | |
| "learning_rate": 1.505468258956631e-05, | |
| "loss": 1.626, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 17.536140791954747, | |
| "grad_norm": 4.549004554748535, | |
| "learning_rate": 1.4928975487115023e-05, | |
| "loss": 1.5508, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 17.59899434318039, | |
| "grad_norm": 4.830588340759277, | |
| "learning_rate": 1.4803268384663735e-05, | |
| "loss": 1.5394, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 17.661847894406034, | |
| "grad_norm": 4.127079486846924, | |
| "learning_rate": 1.4677561282212447e-05, | |
| "loss": 1.5548, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 17.724701445631677, | |
| "grad_norm": 3.208592414855957, | |
| "learning_rate": 1.4551854179761157e-05, | |
| "loss": 1.5595, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 17.787554996857324, | |
| "grad_norm": 4.784154891967773, | |
| "learning_rate": 1.4426147077309869e-05, | |
| "loss": 1.6029, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 17.850408548082967, | |
| "grad_norm": 5.0941481590271, | |
| "learning_rate": 1.4300439974858582e-05, | |
| "loss": 1.634, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 17.91326209930861, | |
| "grad_norm": 6.4498982429504395, | |
| "learning_rate": 1.4174732872407292e-05, | |
| "loss": 1.6685, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 17.976115650534254, | |
| "grad_norm": 5.136322021484375, | |
| "learning_rate": 1.4049025769956004e-05, | |
| "loss": 1.5587, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_loss": 1.565408706665039, | |
| "eval_runtime": 20.3165, | |
| "eval_samples_per_second": 47.154, | |
| "eval_steps_per_second": 5.907, | |
| "step": 28638 | |
| }, | |
| { | |
| "epoch": 18.0389692017599, | |
| "grad_norm": 7.265219211578369, | |
| "learning_rate": 1.3923318667504714e-05, | |
| "loss": 1.534, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 18.101822752985544, | |
| "grad_norm": 5.552704334259033, | |
| "learning_rate": 1.3797611565053426e-05, | |
| "loss": 1.5396, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 18.164676304211188, | |
| "grad_norm": 7.356419086456299, | |
| "learning_rate": 1.3671904462602136e-05, | |
| "loss": 1.5851, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 18.22752985543683, | |
| "grad_norm": 5.519120693206787, | |
| "learning_rate": 1.354619736015085e-05, | |
| "loss": 1.6331, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 18.290383406662478, | |
| "grad_norm": 4.4178242683410645, | |
| "learning_rate": 1.3420490257699561e-05, | |
| "loss": 1.508, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 18.35323695788812, | |
| "grad_norm": 4.479162216186523, | |
| "learning_rate": 1.3294783155248271e-05, | |
| "loss": 1.5201, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 18.416090509113765, | |
| "grad_norm": 4.4193806648254395, | |
| "learning_rate": 1.3169076052796983e-05, | |
| "loss": 1.5393, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 18.47894406033941, | |
| "grad_norm": 6.695824146270752, | |
| "learning_rate": 1.3043368950345693e-05, | |
| "loss": 1.6264, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 18.541797611565052, | |
| "grad_norm": 4.760421276092529, | |
| "learning_rate": 1.2917661847894409e-05, | |
| "loss": 1.5465, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 18.6046511627907, | |
| "grad_norm": 4.158078193664551, | |
| "learning_rate": 1.2791954745443119e-05, | |
| "loss": 1.5533, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 18.667504714016342, | |
| "grad_norm": 6.8502092361450195, | |
| "learning_rate": 1.266624764299183e-05, | |
| "loss": 1.6525, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 18.730358265241986, | |
| "grad_norm": 4.013594150543213, | |
| "learning_rate": 1.254054054054054e-05, | |
| "loss": 1.5357, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 18.79321181646763, | |
| "grad_norm": 6.064908981323242, | |
| "learning_rate": 1.2414833438089252e-05, | |
| "loss": 1.5659, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 18.856065367693276, | |
| "grad_norm": 5.281710624694824, | |
| "learning_rate": 1.2289126335637964e-05, | |
| "loss": 1.4692, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 18.91891891891892, | |
| "grad_norm": 4.661835193634033, | |
| "learning_rate": 1.2163419233186674e-05, | |
| "loss": 1.5126, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 18.981772470144563, | |
| "grad_norm": 3.9490227699279785, | |
| "learning_rate": 1.2037712130735388e-05, | |
| "loss": 1.5389, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_loss": 1.5563335418701172, | |
| "eval_runtime": 20.3631, | |
| "eval_samples_per_second": 47.046, | |
| "eval_steps_per_second": 5.893, | |
| "step": 30229 | |
| }, | |
| { | |
| "epoch": 19.044626021370206, | |
| "grad_norm": 4.6667866706848145, | |
| "learning_rate": 1.1912005028284098e-05, | |
| "loss": 1.5508, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 19.107479572595853, | |
| "grad_norm": 4.471792697906494, | |
| "learning_rate": 1.1786297925832811e-05, | |
| "loss": 1.5253, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 19.170333123821496, | |
| "grad_norm": 4.01970100402832, | |
| "learning_rate": 1.1660590823381521e-05, | |
| "loss": 1.5047, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 19.23318667504714, | |
| "grad_norm": 5.021801471710205, | |
| "learning_rate": 1.1534883720930233e-05, | |
| "loss": 1.5459, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 19.296040226272783, | |
| "grad_norm": 4.681889533996582, | |
| "learning_rate": 1.1409176618478945e-05, | |
| "loss": 1.561, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 19.35889377749843, | |
| "grad_norm": 4.114772319793701, | |
| "learning_rate": 1.1283469516027655e-05, | |
| "loss": 1.532, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 19.421747328724074, | |
| "grad_norm": 3.9337844848632812, | |
| "learning_rate": 1.1157762413576367e-05, | |
| "loss": 1.5512, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 19.484600879949717, | |
| "grad_norm": 4.935436725616455, | |
| "learning_rate": 1.1032055311125079e-05, | |
| "loss": 1.5328, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 19.54745443117536, | |
| "grad_norm": 5.703494071960449, | |
| "learning_rate": 1.090634820867379e-05, | |
| "loss": 1.5889, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 19.610307982401007, | |
| "grad_norm": 6.010659217834473, | |
| "learning_rate": 1.0780641106222502e-05, | |
| "loss": 1.5166, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 19.67316153362665, | |
| "grad_norm": 5.14444637298584, | |
| "learning_rate": 1.0654934003771214e-05, | |
| "loss": 1.5096, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 19.736015084852294, | |
| "grad_norm": 7.321188449859619, | |
| "learning_rate": 1.0529226901319924e-05, | |
| "loss": 1.4865, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 19.798868636077938, | |
| "grad_norm": 3.7702994346618652, | |
| "learning_rate": 1.0403519798868636e-05, | |
| "loss": 1.5122, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 19.86172218730358, | |
| "grad_norm": 5.493444442749023, | |
| "learning_rate": 1.0277812696417348e-05, | |
| "loss": 1.4974, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 19.924575738529228, | |
| "grad_norm": 5.273486137390137, | |
| "learning_rate": 1.015210559396606e-05, | |
| "loss": 1.5619, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 19.98742928975487, | |
| "grad_norm": 4.340183734893799, | |
| "learning_rate": 1.0026398491514772e-05, | |
| "loss": 1.4476, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 1.5459223985671997, | |
| "eval_runtime": 20.3264, | |
| "eval_samples_per_second": 47.131, | |
| "eval_steps_per_second": 5.904, | |
| "step": 31820 | |
| }, | |
| { | |
| "epoch": 20.050282840980515, | |
| "grad_norm": 3.8120639324188232, | |
| "learning_rate": 9.900691389063482e-06, | |
| "loss": 1.4837, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 20.113136392206158, | |
| "grad_norm": 4.154244899749756, | |
| "learning_rate": 9.774984286612195e-06, | |
| "loss": 1.4684, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 20.175989943431805, | |
| "grad_norm": 3.925746202468872, | |
| "learning_rate": 9.649277184160905e-06, | |
| "loss": 1.4685, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 20.23884349465745, | |
| "grad_norm": 5.944131374359131, | |
| "learning_rate": 9.523570081709617e-06, | |
| "loss": 1.5097, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 20.301697045883092, | |
| "grad_norm": 4.755185127258301, | |
| "learning_rate": 9.397862979258329e-06, | |
| "loss": 1.4334, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 20.364550597108735, | |
| "grad_norm": 4.627038478851318, | |
| "learning_rate": 9.27215587680704e-06, | |
| "loss": 1.503, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 20.427404148334382, | |
| "grad_norm": 9.863165855407715, | |
| "learning_rate": 9.14644877435575e-06, | |
| "loss": 1.4607, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 20.490257699560026, | |
| "grad_norm": 4.401854991912842, | |
| "learning_rate": 9.020741671904463e-06, | |
| "loss": 1.4653, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 20.55311125078567, | |
| "grad_norm": 6.041737079620361, | |
| "learning_rate": 8.895034569453174e-06, | |
| "loss": 1.504, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 20.615964802011312, | |
| "grad_norm": 6.523427963256836, | |
| "learning_rate": 8.769327467001886e-06, | |
| "loss": 1.6205, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 20.67881835323696, | |
| "grad_norm": 5.47548246383667, | |
| "learning_rate": 8.643620364550598e-06, | |
| "loss": 1.4491, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 20.741671904462603, | |
| "grad_norm": 5.3726959228515625, | |
| "learning_rate": 8.517913262099308e-06, | |
| "loss": 1.5817, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 20.804525455688246, | |
| "grad_norm": 3.872283935546875, | |
| "learning_rate": 8.392206159648022e-06, | |
| "loss": 1.5482, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 20.86737900691389, | |
| "grad_norm": 4.935946464538574, | |
| "learning_rate": 8.266499057196732e-06, | |
| "loss": 1.5006, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 20.930232558139537, | |
| "grad_norm": 6.805904388427734, | |
| "learning_rate": 8.140791954745444e-06, | |
| "loss": 1.5314, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 20.99308610936518, | |
| "grad_norm": 4.420083522796631, | |
| "learning_rate": 8.015084852294155e-06, | |
| "loss": 1.5417, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "eval_loss": 1.5356966257095337, | |
| "eval_runtime": 20.4137, | |
| "eval_samples_per_second": 46.929, | |
| "eval_steps_per_second": 5.878, | |
| "step": 33411 | |
| }, | |
| { | |
| "epoch": 21.055939660590823, | |
| "grad_norm": 3.697171688079834, | |
| "learning_rate": 7.889377749842865e-06, | |
| "loss": 1.4994, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 21.118793211816467, | |
| "grad_norm": 5.232399940490723, | |
| "learning_rate": 7.763670647391579e-06, | |
| "loss": 1.5351, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 21.18164676304211, | |
| "grad_norm": 4.508577823638916, | |
| "learning_rate": 7.637963544940289e-06, | |
| "loss": 1.4301, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 21.244500314267757, | |
| "grad_norm": 5.425107479095459, | |
| "learning_rate": 7.512256442489001e-06, | |
| "loss": 1.4739, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 21.3073538654934, | |
| "grad_norm": 6.195432186126709, | |
| "learning_rate": 7.386549340037713e-06, | |
| "loss": 1.5458, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 21.370207416719044, | |
| "grad_norm": 5.850045204162598, | |
| "learning_rate": 7.260842237586424e-06, | |
| "loss": 1.5189, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 21.433060967944687, | |
| "grad_norm": 7.121579170227051, | |
| "learning_rate": 7.135135135135136e-06, | |
| "loss": 1.5273, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 21.495914519170334, | |
| "grad_norm": 4.316208362579346, | |
| "learning_rate": 7.009428032683847e-06, | |
| "loss": 1.4437, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 21.558768070395978, | |
| "grad_norm": 4.3052873611450195, | |
| "learning_rate": 6.883720930232558e-06, | |
| "loss": 1.4266, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 21.62162162162162, | |
| "grad_norm": 4.691330432891846, | |
| "learning_rate": 6.758013827781271e-06, | |
| "loss": 1.422, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 21.684475172847264, | |
| "grad_norm": 4.346444129943848, | |
| "learning_rate": 6.632306725329982e-06, | |
| "loss": 1.5511, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 21.74732872407291, | |
| "grad_norm": 5.304843902587891, | |
| "learning_rate": 6.506599622878693e-06, | |
| "loss": 1.4961, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 21.810182275298555, | |
| "grad_norm": 4.877419948577881, | |
| "learning_rate": 6.3808925204274045e-06, | |
| "loss": 1.4837, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 21.873035826524198, | |
| "grad_norm": 4.086881637573242, | |
| "learning_rate": 6.2551854179761155e-06, | |
| "loss": 1.5164, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 21.93588937774984, | |
| "grad_norm": 4.570976734161377, | |
| "learning_rate": 6.129478315524827e-06, | |
| "loss": 1.4681, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 21.99874292897549, | |
| "grad_norm": 25.407676696777344, | |
| "learning_rate": 6.003771213073539e-06, | |
| "loss": 1.4062, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "eval_loss": 1.5373815298080444, | |
| "eval_runtime": 20.3495, | |
| "eval_samples_per_second": 47.077, | |
| "eval_steps_per_second": 5.897, | |
| "step": 35002 | |
| }, | |
| { | |
| "epoch": 22.061596480201132, | |
| "grad_norm": 4.965208053588867, | |
| "learning_rate": 5.878064110622251e-06, | |
| "loss": 1.446, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 22.124450031426775, | |
| "grad_norm": 5.620969772338867, | |
| "learning_rate": 5.752357008170962e-06, | |
| "loss": 1.475, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 22.18730358265242, | |
| "grad_norm": 4.315845489501953, | |
| "learning_rate": 5.626649905719674e-06, | |
| "loss": 1.4866, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 22.250157133878066, | |
| "grad_norm": 4.076879501342773, | |
| "learning_rate": 5.5009428032683854e-06, | |
| "loss": 1.5079, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 22.31301068510371, | |
| "grad_norm": 9.52351188659668, | |
| "learning_rate": 5.375235700817096e-06, | |
| "loss": 1.5637, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 22.375864236329353, | |
| "grad_norm": 5.529058933258057, | |
| "learning_rate": 5.249528598365807e-06, | |
| "loss": 1.4702, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 22.438717787554996, | |
| "grad_norm": 4.761877536773682, | |
| "learning_rate": 5.123821495914519e-06, | |
| "loss": 1.4367, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 22.501571338780643, | |
| "grad_norm": 6.587429046630859, | |
| "learning_rate": 4.998114393463231e-06, | |
| "loss": 1.4052, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 22.564424890006286, | |
| "grad_norm": 5.834304332733154, | |
| "learning_rate": 4.872407291011943e-06, | |
| "loss": 1.4186, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 22.62727844123193, | |
| "grad_norm": 3.871225595474243, | |
| "learning_rate": 4.746700188560654e-06, | |
| "loss": 1.51, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 22.690131992457573, | |
| "grad_norm": 3.876692771911621, | |
| "learning_rate": 4.6209930861093655e-06, | |
| "loss": 1.5022, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 22.752985543683216, | |
| "grad_norm": 4.569952964782715, | |
| "learning_rate": 4.495285983658077e-06, | |
| "loss": 1.454, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 22.815839094908863, | |
| "grad_norm": 5.837776184082031, | |
| "learning_rate": 4.369578881206788e-06, | |
| "loss": 1.4472, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 22.878692646134507, | |
| "grad_norm": 5.9942426681518555, | |
| "learning_rate": 4.243871778755499e-06, | |
| "loss": 1.4198, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 22.94154619736015, | |
| "grad_norm": 4.1033220291137695, | |
| "learning_rate": 4.118164676304211e-06, | |
| "loss": 1.4658, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "eval_loss": 1.5307875871658325, | |
| "eval_runtime": 20.3299, | |
| "eval_samples_per_second": 47.123, | |
| "eval_steps_per_second": 5.903, | |
| "step": 36593 | |
| }, | |
| { | |
| "epoch": 23.004399748585794, | |
| "grad_norm": 4.649007320404053, | |
| "learning_rate": 3.992457573852923e-06, | |
| "loss": 1.4064, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 23.06725329981144, | |
| "grad_norm": 4.318711757659912, | |
| "learning_rate": 3.866750471401635e-06, | |
| "loss": 1.4249, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 23.130106851037084, | |
| "grad_norm": 6.213062286376953, | |
| "learning_rate": 3.7410433689503456e-06, | |
| "loss": 1.4317, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 23.192960402262727, | |
| "grad_norm": 4.529442310333252, | |
| "learning_rate": 3.6153362664990574e-06, | |
| "loss": 1.5102, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 23.25581395348837, | |
| "grad_norm": 4.912539005279541, | |
| "learning_rate": 3.4896291640477688e-06, | |
| "loss": 1.4684, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 23.318667504714018, | |
| "grad_norm": 4.593921661376953, | |
| "learning_rate": 3.3639220615964806e-06, | |
| "loss": 1.4181, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 23.38152105593966, | |
| "grad_norm": 5.35049295425415, | |
| "learning_rate": 3.2382149591451915e-06, | |
| "loss": 1.4813, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 23.444374607165305, | |
| "grad_norm": 4.00051212310791, | |
| "learning_rate": 3.1125078566939033e-06, | |
| "loss": 1.4392, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 23.507228158390948, | |
| "grad_norm": 5.91484260559082, | |
| "learning_rate": 2.9868007542426147e-06, | |
| "loss": 1.4386, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 23.570081709616595, | |
| "grad_norm": 7.114585876464844, | |
| "learning_rate": 2.861093651791326e-06, | |
| "loss": 1.4115, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 23.63293526084224, | |
| "grad_norm": 2.977877378463745, | |
| "learning_rate": 2.735386549340038e-06, | |
| "loss": 1.4211, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 23.69578881206788, | |
| "grad_norm": 3.83953857421875, | |
| "learning_rate": 2.6096794468887493e-06, | |
| "loss": 1.4601, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 23.758642363293525, | |
| "grad_norm": 4.377187728881836, | |
| "learning_rate": 2.483972344437461e-06, | |
| "loss": 1.4281, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 23.821495914519172, | |
| "grad_norm": 3.9868085384368896, | |
| "learning_rate": 2.358265241986172e-06, | |
| "loss": 1.4585, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 23.884349465744815, | |
| "grad_norm": 3.989767551422119, | |
| "learning_rate": 2.232558139534884e-06, | |
| "loss": 1.5302, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 23.94720301697046, | |
| "grad_norm": 4.481296062469482, | |
| "learning_rate": 2.1068510370835952e-06, | |
| "loss": 1.4366, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_loss": 1.5289642810821533, | |
| "eval_runtime": 20.3269, | |
| "eval_samples_per_second": 47.13, | |
| "eval_steps_per_second": 5.904, | |
| "step": 38184 | |
| }, | |
| { | |
| "epoch": 24.010056568196102, | |
| "grad_norm": 4.909224033355713, | |
| "learning_rate": 1.981143934632307e-06, | |
| "loss": 1.4956, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 24.072910119421746, | |
| "grad_norm": 4.9214372634887695, | |
| "learning_rate": 1.8554368321810182e-06, | |
| "loss": 1.4725, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 24.135763670647393, | |
| "grad_norm": 4.345515251159668, | |
| "learning_rate": 1.7297297297297298e-06, | |
| "loss": 1.4407, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 24.198617221873036, | |
| "grad_norm": 4.926340579986572, | |
| "learning_rate": 1.6040226272784412e-06, | |
| "loss": 1.5008, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 24.26147077309868, | |
| "grad_norm": 4.5064263343811035, | |
| "learning_rate": 1.4783155248271527e-06, | |
| "loss": 1.4868, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 24.324324324324323, | |
| "grad_norm": 5.347716808319092, | |
| "learning_rate": 1.3526084223758643e-06, | |
| "loss": 1.45, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 24.38717787554997, | |
| "grad_norm": 5.024169921875, | |
| "learning_rate": 1.2269013199245757e-06, | |
| "loss": 1.3905, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 24.450031426775613, | |
| "grad_norm": 4.319692611694336, | |
| "learning_rate": 1.1011942174732873e-06, | |
| "loss": 1.4671, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 24.512884978001257, | |
| "grad_norm": 2.880321979522705, | |
| "learning_rate": 9.75487115021999e-07, | |
| "loss": 1.4211, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 24.5757385292269, | |
| "grad_norm": 4.416039943695068, | |
| "learning_rate": 8.497800125707103e-07, | |
| "loss": 1.4176, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 24.638592080452547, | |
| "grad_norm": 4.598896503448486, | |
| "learning_rate": 7.240729101194218e-07, | |
| "loss": 1.4194, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 24.70144563167819, | |
| "grad_norm": 4.256235599517822, | |
| "learning_rate": 5.983658076681333e-07, | |
| "loss": 1.4331, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 24.764299182903834, | |
| "grad_norm": 4.7764811515808105, | |
| "learning_rate": 4.726587052168448e-07, | |
| "loss": 1.4491, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 24.827152734129477, | |
| "grad_norm": 4.296844005584717, | |
| "learning_rate": 3.4695160276555627e-07, | |
| "loss": 1.4443, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 24.890006285355124, | |
| "grad_norm": 3.9589693546295166, | |
| "learning_rate": 2.2124450031426776e-07, | |
| "loss": 1.4612, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 24.952859836580767, | |
| "grad_norm": 4.165828227996826, | |
| "learning_rate": 9.553739786297926e-08, | |
| "loss": 1.48, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "eval_loss": 1.528791069984436, | |
| "eval_runtime": 20.2887, | |
| "eval_samples_per_second": 47.218, | |
| "eval_steps_per_second": 5.915, | |
| "step": 39775 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 39775, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 25, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.44418915549184e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |