Monika-9B / Adapter /trainer_state.json
Green-eyedDevil's picture
Upload 12 files
946f577 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9989281886387995,
"eval_steps": 59,
"global_step": 467,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0,
"eval_loss": 2.934493064880371,
"eval_ppl": 18.81196,
"eval_runtime": 181.0816,
"eval_samples_per_second": 1.149,
"eval_steps_per_second": 1.149,
"memory/device_reserved (GiB)": 18.64,
"memory/max_active (GiB)": 18.28,
"memory/max_allocated (GiB)": 18.28,
"step": 0
},
{
"epoch": 0.004287245444801715,
"grad_norm": 0.5241990685462952,
"learning_rate": 0.0025,
"loss": 3.2761871814727783,
"memory/device_reserved (GiB)": 18.85,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 26.47464,
"step": 1,
"tokens/total": 2816,
"tokens/train_per_sec_per_gpu": 1.6,
"tokens/trainable": 1916
},
{
"epoch": 0.00857449088960343,
"grad_norm": 0.2679648697376251,
"learning_rate": 0.0025,
"loss": 2.743480920791626,
"memory/device_reserved (GiB)": 18.94,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 15.54099,
"step": 2,
"tokens/total": 4416,
"tokens/train_per_sec_per_gpu": 26.79,
"tokens/trainable": 2568
},
{
"epoch": 0.012861736334405145,
"grad_norm": 2.105276346206665,
"learning_rate": 0.0025,
"loss": 4.918459415435791,
"memory/device_reserved (GiB)": 18.94,
"memory/max_active (GiB)": 18.39,
"memory/max_allocated (GiB)": 18.39,
"ppl": 136.79171,
"step": 3,
"tokens/total": 6272,
"tokens/train_per_sec_per_gpu": 3.76,
"tokens/trainable": 3571
},
{
"epoch": 0.01714898177920686,
"grad_norm": 1.7258063554763794,
"learning_rate": 0.0025,
"loss": 4.470706462860107,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 19.42,
"memory/max_allocated (GiB)": 19.42,
"ppl": 87.41846,
"step": 4,
"tokens/total": 9664,
"tokens/train_per_sec_per_gpu": 4.21,
"tokens/trainable": 6033
},
{
"epoch": 0.021436227224008574,
"grad_norm": 0.9528670907020569,
"learning_rate": 0.0025,
"loss": 3.6473782062530518,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 38.37393,
"step": 5,
"tokens/total": 11392,
"tokens/train_per_sec_per_gpu": 22.21,
"tokens/trainable": 6851
},
{
"epoch": 0.02572347266881029,
"grad_norm": 1.7168548107147217,
"learning_rate": 0.0025,
"loss": 3.641300678253174,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.63,
"memory/max_allocated (GiB)": 18.63,
"ppl": 38.14141,
"step": 6,
"tokens/total": 14464,
"tokens/train_per_sec_per_gpu": 25.23,
"tokens/trainable": 8951
},
{
"epoch": 0.030010718113612004,
"grad_norm": 9.308958053588867,
"learning_rate": 0.0025,
"loss": 11.468059539794922,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 95612.56656,
"step": 7,
"tokens/total": 16512,
"tokens/train_per_sec_per_gpu": 43.04,
"tokens/trainable": 9981
},
{
"epoch": 0.03429796355841372,
"grad_norm": 4.779436111450195,
"learning_rate": 0.0025,
"loss": 12.928940773010254,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.73,
"memory/max_allocated (GiB)": 18.73,
"ppl": 412066.80902,
"step": 8,
"tokens/total": 20160,
"tokens/train_per_sec_per_gpu": 206.05,
"tokens/trainable": 12656
},
{
"epoch": 0.03858520900321544,
"grad_norm": 11.870071411132812,
"learning_rate": 0.0025,
"loss": 26.585786819458008,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 351609428878.3472,
"step": 9,
"tokens/total": 22272,
"tokens/train_per_sec_per_gpu": 21.53,
"tokens/trainable": 13844
},
{
"epoch": 0.04287245444801715,
"grad_norm": 3.3879966735839844,
"learning_rate": 0.0025,
"loss": 21.875102996826172,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 3164000347.95887,
"step": 10,
"tokens/total": 24256,
"tokens/train_per_sec_per_gpu": 53.92,
"tokens/trainable": 15016
},
{
"epoch": 0.04715969989281887,
"grad_norm": 0.25865933299064636,
"learning_rate": 0.0025,
"loss": 14.399214744567871,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.53,
"memory/max_allocated (GiB)": 18.53,
"ppl": 1792666.51864,
"step": 11,
"tokens/total": 27008,
"tokens/train_per_sec_per_gpu": 67.68,
"tokens/trainable": 16945
},
{
"epoch": 0.05144694533762058,
"grad_norm": 3.67203950881958,
"learning_rate": 0.0025,
"loss": 12.23837947845459,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.59,
"memory/max_allocated (GiB)": 18.59,
"ppl": 206566.87286,
"step": 12,
"tokens/total": 29568,
"tokens/train_per_sec_per_gpu": 20.56,
"tokens/trainable": 18583
},
{
"epoch": 0.055734190782422297,
"grad_norm": 2.215568780899048,
"learning_rate": 0.0025,
"loss": 15.4072847366333,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 4912456.77743,
"step": 13,
"tokens/total": 31936,
"tokens/train_per_sec_per_gpu": 30.37,
"tokens/trainable": 19999
},
{
"epoch": 0.06002143622722401,
"grad_norm": 1.5136427879333496,
"learning_rate": 0.0025,
"loss": 11.96999454498291,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.37,
"memory/max_allocated (GiB)": 18.37,
"ppl": 157943.79881,
"step": 14,
"tokens/total": 33856,
"tokens/train_per_sec_per_gpu": 4.74,
"tokens/trainable": 21034
},
{
"epoch": 0.06430868167202572,
"grad_norm": 12.923323631286621,
"learning_rate": 0.0025,
"loss": 11.865056991577148,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.87,
"memory/max_allocated (GiB)": 18.87,
"ppl": 142209.55548,
"step": 15,
"tokens/total": 37632,
"tokens/train_per_sec_per_gpu": 4.06,
"tokens/trainable": 23846
},
{
"epoch": 0.06859592711682744,
"grad_norm": 1.347749948501587,
"learning_rate": 0.0025,
"loss": 9.3814115524292,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 11865.75207,
"step": 16,
"tokens/total": 39744,
"tokens/train_per_sec_per_gpu": 78.77,
"tokens/trainable": 24979
},
{
"epoch": 0.07288317256162916,
"grad_norm": 0.6687317490577698,
"learning_rate": 0.0025,
"loss": 8.9339599609375,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 7585.24325,
"step": 17,
"tokens/total": 42368,
"tokens/train_per_sec_per_gpu": 30.07,
"tokens/trainable": 26674
},
{
"epoch": 0.07717041800643087,
"grad_norm": 1.8986364603042603,
"learning_rate": 0.0025,
"loss": 10.20353889465332,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 26998.56228,
"step": 18,
"tokens/total": 44480,
"tokens/train_per_sec_per_gpu": 10.04,
"tokens/trainable": 27850
},
{
"epoch": 0.08145766345123258,
"grad_norm": 0.3103311359882355,
"learning_rate": 0.0025,
"loss": 8.43118953704834,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 4587.95439,
"step": 19,
"tokens/total": 45952,
"tokens/train_per_sec_per_gpu": 4.36,
"tokens/trainable": 28429
},
{
"epoch": 0.0857449088960343,
"grad_norm": 0.33680954575538635,
"learning_rate": 0.0025,
"loss": 8.632229804992676,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 5609.57257,
"step": 20,
"tokens/total": 48512,
"tokens/train_per_sec_per_gpu": 157.2,
"tokens/trainable": 29973
},
{
"epoch": 0.09003215434083602,
"grad_norm": 0.3231711983680725,
"learning_rate": 0.0025,
"loss": 8.600848197937012,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.86,
"memory/max_allocated (GiB)": 18.86,
"ppl": 5436.26867,
"step": 21,
"tokens/total": 51264,
"tokens/train_per_sec_per_gpu": 2.82,
"tokens/trainable": 31831
},
{
"epoch": 0.09431939978563773,
"grad_norm": 0.28503984212875366,
"learning_rate": 0.0025,
"loss": 8.879640579223633,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.38,
"memory/max_allocated (GiB)": 18.38,
"ppl": 7184.20812,
"step": 22,
"tokens/total": 52736,
"tokens/train_per_sec_per_gpu": 52.3,
"tokens/trainable": 32257
},
{
"epoch": 0.09860664523043944,
"grad_norm": 0.17652413249015808,
"learning_rate": 0.0025,
"loss": 8.288322448730469,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.66,
"memory/max_allocated (GiB)": 18.66,
"ppl": 3977.15671,
"step": 23,
"tokens/total": 55936,
"tokens/train_per_sec_per_gpu": 2.04,
"tokens/trainable": 34507
},
{
"epoch": 0.10289389067524116,
"grad_norm": 0.17270459234714508,
"learning_rate": 0.0025,
"loss": 7.758934020996094,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 2342.40632,
"step": 24,
"tokens/total": 57856,
"tokens/train_per_sec_per_gpu": 52.59,
"tokens/trainable": 35539
},
{
"epoch": 0.10718113612004287,
"grad_norm": 0.14134642481803894,
"learning_rate": 0.0025,
"loss": 7.277863502502441,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 1447.8913,
"step": 25,
"tokens/total": 60288,
"tokens/train_per_sec_per_gpu": 175.7,
"tokens/trainable": 37092
},
{
"epoch": 0.11146838156484459,
"grad_norm": 0.3017260730266571,
"learning_rate": 0.0025,
"loss": 7.528397083282471,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 1860.12149,
"step": 26,
"tokens/total": 62784,
"tokens/train_per_sec_per_gpu": 67.24,
"tokens/trainable": 38599
},
{
"epoch": 0.1157556270096463,
"grad_norm": 0.1806621253490448,
"learning_rate": 0.0025,
"loss": 7.274528503417969,
"memory/device_reserved (GiB)": 19.79,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 1443.07063,
"step": 27,
"tokens/total": 65024,
"tokens/train_per_sec_per_gpu": 22.31,
"tokens/trainable": 39852
},
{
"epoch": 0.12004287245444802,
"grad_norm": 0.4423042833805084,
"learning_rate": 0.0025,
"loss": 6.991304874420166,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 19.54,
"memory/max_allocated (GiB)": 19.54,
"ppl": 1087.13913,
"step": 28,
"tokens/total": 70784,
"tokens/train_per_sec_per_gpu": 70.63,
"tokens/trainable": 44731
},
{
"epoch": 0.12433011789924973,
"grad_norm": 0.22539205849170685,
"learning_rate": 0.0025,
"loss": 6.762757778167725,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 865.02445,
"step": 29,
"tokens/total": 73024,
"tokens/train_per_sec_per_gpu": 73.89,
"tokens/trainable": 46095
},
{
"epoch": 0.12861736334405144,
"grad_norm": 0.585552990436554,
"learning_rate": 0.0025,
"loss": 6.865115165710449,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.64,
"memory/max_allocated (GiB)": 18.64,
"ppl": 958.25619,
"step": 30,
"tokens/total": 75456,
"tokens/train_per_sec_per_gpu": 67.76,
"tokens/trainable": 47570
},
{
"epoch": 0.13290460878885316,
"grad_norm": 0.9950224161148071,
"learning_rate": 0.0025,
"loss": 7.070884704589844,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 1177.18904,
"step": 31,
"tokens/total": 77824,
"tokens/train_per_sec_per_gpu": 29.22,
"tokens/trainable": 48877
},
{
"epoch": 0.13719185423365488,
"grad_norm": 1.1349307298660278,
"learning_rate": 0.0025,
"loss": 6.7641096115112305,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.4,
"memory/max_allocated (GiB)": 18.4,
"ppl": 866.19461,
"step": 32,
"tokens/total": 79680,
"tokens/train_per_sec_per_gpu": 12.6,
"tokens/trainable": 49878
},
{
"epoch": 0.1414790996784566,
"grad_norm": 0.19686993956565857,
"learning_rate": 0.0025,
"loss": 6.472617149353027,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 647.17527,
"step": 33,
"tokens/total": 81792,
"tokens/train_per_sec_per_gpu": 110.67,
"tokens/trainable": 51049
},
{
"epoch": 0.1457663451232583,
"grad_norm": 0.4460529685020447,
"learning_rate": 0.0025,
"loss": 6.628453254699707,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 756.31144,
"step": 34,
"tokens/total": 84032,
"tokens/train_per_sec_per_gpu": 15.12,
"tokens/trainable": 52250
},
{
"epoch": 0.15005359056806003,
"grad_norm": 0.09157463908195496,
"learning_rate": 0.0025,
"loss": 6.950263977050781,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 1043.42513,
"step": 35,
"tokens/total": 86656,
"tokens/train_per_sec_per_gpu": 25.29,
"tokens/trainable": 53911
},
{
"epoch": 0.15434083601286175,
"grad_norm": 1.2685779333114624,
"learning_rate": 0.0025,
"loss": 6.143320083618164,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 465.59683,
"step": 36,
"tokens/total": 88640,
"tokens/train_per_sec_per_gpu": 15.77,
"tokens/trainable": 54904
},
{
"epoch": 0.15862808145766344,
"grad_norm": 0.31893390417099,
"learning_rate": 0.0025,
"loss": 6.968283653259277,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.38,
"memory/max_allocated (GiB)": 18.38,
"ppl": 1062.39774,
"step": 37,
"tokens/total": 90496,
"tokens/train_per_sec_per_gpu": 58.59,
"tokens/trainable": 55743
},
{
"epoch": 0.16291532690246516,
"grad_norm": 1.0469295978546143,
"learning_rate": 0.0025,
"loss": 7.234709739685059,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 1386.73832,
"step": 38,
"tokens/total": 92352,
"tokens/train_per_sec_per_gpu": 0.83,
"tokens/trainable": 56555
},
{
"epoch": 0.16720257234726688,
"grad_norm": 0.2749118506908417,
"learning_rate": 0.0025,
"loss": 6.587377071380615,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 725.87445,
"step": 39,
"tokens/total": 94400,
"tokens/train_per_sec_per_gpu": 33.93,
"tokens/trainable": 57680
},
{
"epoch": 0.1714898177920686,
"grad_norm": 0.18221265077590942,
"learning_rate": 0.0025,
"loss": 6.830031871795654,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 925.2203,
"step": 40,
"tokens/total": 96512,
"tokens/train_per_sec_per_gpu": 45.38,
"tokens/trainable": 58955
},
{
"epoch": 0.1757770632368703,
"grad_norm": 0.24708712100982666,
"learning_rate": 0.0025,
"loss": 6.151037693023682,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 469.20402,
"step": 41,
"tokens/total": 98752,
"tokens/train_per_sec_per_gpu": 74.94,
"tokens/trainable": 60277
},
{
"epoch": 0.18006430868167203,
"grad_norm": 0.17541086673736572,
"learning_rate": 0.0025,
"loss": 6.18589973449707,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.63,
"memory/max_allocated (GiB)": 18.63,
"ppl": 485.8499,
"step": 42,
"tokens/total": 101440,
"tokens/train_per_sec_per_gpu": 19.12,
"tokens/trainable": 61928
},
{
"epoch": 0.18435155412647375,
"grad_norm": 0.5008364319801331,
"learning_rate": 0.0025,
"loss": 6.555770397186279,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 703.29075,
"step": 43,
"tokens/total": 103104,
"tokens/train_per_sec_per_gpu": 8.28,
"tokens/trainable": 62757
},
{
"epoch": 0.18863879957127547,
"grad_norm": 0.8753749132156372,
"learning_rate": 0.0025,
"loss": 6.545720100402832,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.54,
"memory/max_allocated (GiB)": 18.54,
"ppl": 696.25787,
"step": 44,
"tokens/total": 105536,
"tokens/train_per_sec_per_gpu": 4.81,
"tokens/trainable": 64177
},
{
"epoch": 0.19292604501607716,
"grad_norm": 0.22259370982646942,
"learning_rate": 0.0025,
"loss": 6.528131008148193,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 684.1184,
"step": 45,
"tokens/total": 107584,
"tokens/train_per_sec_per_gpu": 14.82,
"tokens/trainable": 65253
},
{
"epoch": 0.19721329046087888,
"grad_norm": 0.10767526179552078,
"learning_rate": 0.0025,
"loss": 6.292204856872559,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 540.3434,
"step": 46,
"tokens/total": 109632,
"tokens/train_per_sec_per_gpu": 19.33,
"tokens/trainable": 66464
},
{
"epoch": 0.2015005359056806,
"grad_norm": 0.12123644351959229,
"learning_rate": 0.0025,
"loss": 6.8791117668151855,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 971.76282,
"step": 47,
"tokens/total": 112192,
"tokens/train_per_sec_per_gpu": 223.37,
"tokens/trainable": 68117
},
{
"epoch": 0.2057877813504823,
"grad_norm": 0.1776631772518158,
"learning_rate": 0.0025,
"loss": 6.613104343414307,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.38,
"memory/max_allocated (GiB)": 18.38,
"ppl": 744.79152,
"step": 48,
"tokens/total": 113856,
"tokens/train_per_sec_per_gpu": 54.59,
"tokens/trainable": 68805
},
{
"epoch": 0.21007502679528403,
"grad_norm": 0.16078130900859833,
"learning_rate": 0.0025,
"loss": 6.84950065612793,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 943.4097,
"step": 49,
"tokens/total": 116032,
"tokens/train_per_sec_per_gpu": 75.86,
"tokens/trainable": 70058
},
{
"epoch": 0.21436227224008575,
"grad_norm": 0.10362584888935089,
"learning_rate": 0.0025,
"loss": 7.038058280944824,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.54,
"memory/max_allocated (GiB)": 18.54,
"ppl": 1139.1735,
"step": 50,
"tokens/total": 118336,
"tokens/train_per_sec_per_gpu": 60.13,
"tokens/trainable": 71390
},
{
"epoch": 0.21864951768488747,
"grad_norm": 0.14731702208518982,
"learning_rate": 0.0025,
"loss": 5.994701385498047,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 401.29683,
"step": 51,
"tokens/total": 120384,
"tokens/train_per_sec_per_gpu": 3.02,
"tokens/trainable": 72578
},
{
"epoch": 0.22293676312968919,
"grad_norm": 0.15322738885879517,
"learning_rate": 0.0025,
"loss": 7.148180961608887,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 1271.79041,
"step": 52,
"tokens/total": 122432,
"tokens/train_per_sec_per_gpu": 49.03,
"tokens/trainable": 73654
},
{
"epoch": 0.22722400857449088,
"grad_norm": 0.2865282893180847,
"learning_rate": 0.0025,
"loss": 6.537259578704834,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.51,
"memory/max_allocated (GiB)": 18.51,
"ppl": 690.39202,
"step": 53,
"tokens/total": 125056,
"tokens/train_per_sec_per_gpu": 37.17,
"tokens/trainable": 75448
},
{
"epoch": 0.2315112540192926,
"grad_norm": 0.19199238717556,
"learning_rate": 0.0025,
"loss": 6.6607537269592285,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 781.13948,
"step": 54,
"tokens/total": 127360,
"tokens/train_per_sec_per_gpu": 153.39,
"tokens/trainable": 76842
},
{
"epoch": 0.2357984994640943,
"grad_norm": 0.07870891690254211,
"learning_rate": 0.0025,
"loss": 6.618167400360107,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.88,
"memory/max_allocated (GiB)": 18.88,
"ppl": 748.57201,
"step": 55,
"tokens/total": 131008,
"tokens/train_per_sec_per_gpu": 12.42,
"tokens/trainable": 79539
},
{
"epoch": 0.24008574490889603,
"grad_norm": 0.10853379964828491,
"learning_rate": 0.0025,
"loss": 6.207864761352539,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.46,
"memory/max_allocated (GiB)": 18.46,
"ppl": 496.63967,
"step": 56,
"tokens/total": 132800,
"tokens/train_per_sec_per_gpu": 9.35,
"tokens/trainable": 80444
},
{
"epoch": 0.24437299035369775,
"grad_norm": 0.06515457481145859,
"learning_rate": 0.0025,
"loss": 6.508807182312012,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 671.02553,
"step": 57,
"tokens/total": 135040,
"tokens/train_per_sec_per_gpu": 112.03,
"tokens/trainable": 81812
},
{
"epoch": 0.24866023579849947,
"grad_norm": 0.12576992809772491,
"learning_rate": 0.0025,
"loss": 6.428314208984375,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 619.12935,
"step": 58,
"tokens/total": 137472,
"tokens/train_per_sec_per_gpu": 117.23,
"tokens/trainable": 83345
},
{
"epoch": 0.2529474812433012,
"grad_norm": 0.11791636049747467,
"learning_rate": 0.0025,
"loss": 6.194557189941406,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.38,
"memory/max_allocated (GiB)": 18.38,
"ppl": 490.07439,
"step": 59,
"tokens/total": 139136,
"tokens/train_per_sec_per_gpu": 54.16,
"tokens/trainable": 84003
},
{
"epoch": 0.2529474812433012,
"eval_loss": 6.30873441696167,
"eval_ppl": 549.34926,
"eval_runtime": 17.198,
"eval_samples_per_second": 12.094,
"eval_steps_per_second": 12.094,
"memory/device_reserved (GiB)": 19.96,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"step": 59
},
{
"epoch": 0.2572347266881029,
"grad_norm": 0.20866750180721283,
"learning_rate": 0.0025,
"loss": 6.447090148925781,
"memory/device_reserved (GiB)": 18.69,
"memory/max_active (GiB)": 18.54,
"memory/max_allocated (GiB)": 18.54,
"ppl": 630.8639,
"step": 60,
"tokens/total": 142208,
"tokens/train_per_sec_per_gpu": 37.47,
"tokens/trainable": 86018
},
{
"epoch": 0.2615219721329046,
"grad_norm": 0.08183833956718445,
"learning_rate": 0.0025,
"loss": 6.008518218994141,
"memory/device_reserved (GiB)": 18.73,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 406.87997,
"step": 61,
"tokens/total": 144000,
"tokens/train_per_sec_per_gpu": 22.8,
"tokens/trainable": 86897
},
{
"epoch": 0.2658092175777063,
"grad_norm": 0.21841929852962494,
"learning_rate": 0.0025,
"loss": 6.573157787322998,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.82,
"memory/max_allocated (GiB)": 18.82,
"ppl": 715.62607,
"step": 62,
"tokens/total": 147520,
"tokens/train_per_sec_per_gpu": 19.02,
"tokens/trainable": 89504
},
{
"epoch": 0.27009646302250806,
"grad_norm": 0.09511567652225494,
"learning_rate": 0.0025,
"loss": 6.555922985076904,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 703.39808,
"step": 63,
"tokens/total": 150016,
"tokens/train_per_sec_per_gpu": 183.78,
"tokens/trainable": 91081
},
{
"epoch": 0.27438370846730975,
"grad_norm": 0.07103318721055984,
"learning_rate": 0.0025,
"loss": 6.265772342681885,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.54,
"memory/max_allocated (GiB)": 18.54,
"ppl": 526.24787,
"step": 64,
"tokens/total": 152384,
"tokens/train_per_sec_per_gpu": 50.64,
"tokens/trainable": 92465
},
{
"epoch": 0.27867095391211144,
"grad_norm": 0.14229358732700348,
"learning_rate": 0.0025,
"loss": 6.940990924835205,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.38,
"memory/max_allocated (GiB)": 18.38,
"ppl": 1033.79412,
"step": 65,
"tokens/total": 153792,
"tokens/train_per_sec_per_gpu": 2.2,
"tokens/trainable": 93043
},
{
"epoch": 0.2829581993569132,
"grad_norm": 0.148279070854187,
"learning_rate": 0.0025,
"loss": 6.338968276977539,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.56,
"memory/max_allocated (GiB)": 18.56,
"ppl": 566.21184,
"step": 66,
"tokens/total": 156352,
"tokens/train_per_sec_per_gpu": 56.15,
"tokens/trainable": 94644
},
{
"epoch": 0.2872454448017149,
"grad_norm": 0.18289832770824432,
"learning_rate": 0.0025,
"loss": 6.198648929595947,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.4,
"memory/max_allocated (GiB)": 18.4,
"ppl": 492.08375,
"step": 67,
"tokens/total": 158144,
"tokens/train_per_sec_per_gpu": 72.42,
"tokens/trainable": 95506
},
{
"epoch": 0.2915326902465166,
"grad_norm": 0.11502089351415634,
"learning_rate": 0.0025,
"loss": 6.177389621734619,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 481.73281,
"step": 68,
"tokens/total": 159808,
"tokens/train_per_sec_per_gpu": 13.1,
"tokens/trainable": 96311
},
{
"epoch": 0.2958199356913183,
"grad_norm": 0.766052782535553,
"learning_rate": 0.0025,
"loss": 6.210302829742432,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 497.85199,
"step": 69,
"tokens/total": 162240,
"tokens/train_per_sec_per_gpu": 15.54,
"tokens/trainable": 97821
},
{
"epoch": 0.30010718113612006,
"grad_norm": 0.16312995553016663,
"learning_rate": 0.0025,
"loss": 6.322083950042725,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 556.73199,
"step": 70,
"tokens/total": 163968,
"tokens/train_per_sec_per_gpu": 35.21,
"tokens/trainable": 98557
},
{
"epoch": 0.30439442658092175,
"grad_norm": 0.09915313869714737,
"learning_rate": 0.0025,
"loss": 5.732677459716797,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.46,
"memory/max_allocated (GiB)": 18.46,
"ppl": 308.79495,
"step": 71,
"tokens/total": 165952,
"tokens/train_per_sec_per_gpu": 126.46,
"tokens/trainable": 99716
},
{
"epoch": 0.3086816720257235,
"grad_norm": 0.3138703405857086,
"learning_rate": 0.0025,
"loss": 5.753807067871094,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.39,
"memory/max_allocated (GiB)": 18.39,
"ppl": 315.38909,
"step": 72,
"tokens/total": 167872,
"tokens/train_per_sec_per_gpu": 27.76,
"tokens/trainable": 100641
},
{
"epoch": 0.3129689174705252,
"grad_norm": 0.2080921083688736,
"learning_rate": 0.0025,
"loss": 6.6848883628845215,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.6,
"memory/max_allocated (GiB)": 18.6,
"ppl": 800.22134,
"step": 73,
"tokens/total": 170304,
"tokens/train_per_sec_per_gpu": 275.0,
"tokens/trainable": 102108
},
{
"epoch": 0.3172561629153269,
"grad_norm": 0.09075198322534561,
"learning_rate": 0.0025,
"loss": 5.682041168212891,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 293.548,
"step": 74,
"tokens/total": 171840,
"tokens/train_per_sec_per_gpu": 24.72,
"tokens/trainable": 102828
},
{
"epoch": 0.3215434083601286,
"grad_norm": 0.19326482713222504,
"learning_rate": 0.0025,
"loss": 6.265585899353027,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 526.14977,
"step": 75,
"tokens/total": 173632,
"tokens/train_per_sec_per_gpu": 9.51,
"tokens/trainable": 103667
},
{
"epoch": 0.3258306538049303,
"grad_norm": 0.09544038772583008,
"learning_rate": 0.0025,
"loss": 5.699280261993408,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.42,
"memory/max_allocated (GiB)": 18.42,
"ppl": 298.65237,
"step": 76,
"tokens/total": 175808,
"tokens/train_per_sec_per_gpu": 2.17,
"tokens/trainable": 104870
},
{
"epoch": 0.33011789924973206,
"grad_norm": 0.7390451431274414,
"learning_rate": 0.0025,
"loss": 6.410941123962402,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.46,
"memory/max_allocated (GiB)": 18.46,
"ppl": 608.46605,
"step": 77,
"tokens/total": 177664,
"tokens/train_per_sec_per_gpu": 10.88,
"tokens/trainable": 105944
},
{
"epoch": 0.33440514469453375,
"grad_norm": 0.14168062806129456,
"learning_rate": 0.0025,
"loss": 7.039776802062988,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.4,
"memory/max_allocated (GiB)": 18.4,
"ppl": 1141.13288,
"step": 78,
"tokens/total": 179584,
"tokens/train_per_sec_per_gpu": 103.93,
"tokens/trainable": 106970
},
{
"epoch": 0.3386923901393355,
"grad_norm": 0.8850395679473877,
"learning_rate": 0.0025,
"loss": 6.027164459228516,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 414.53792,
"step": 79,
"tokens/total": 181312,
"tokens/train_per_sec_per_gpu": 97.76,
"tokens/trainable": 107821
},
{
"epoch": 0.3429796355841372,
"grad_norm": 0.17418481409549713,
"learning_rate": 0.0025,
"loss": 7.046483993530273,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.57,
"memory/max_allocated (GiB)": 18.57,
"ppl": 1148.8124,
"step": 80,
"tokens/total": 183936,
"tokens/train_per_sec_per_gpu": 123.29,
"tokens/trainable": 109492
},
{
"epoch": 0.34726688102893893,
"grad_norm": 0.09937312453985214,
"learning_rate": 0.0025,
"loss": 5.586414337158203,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 266.77733,
"step": 81,
"tokens/total": 185600,
"tokens/train_per_sec_per_gpu": 23.19,
"tokens/trainable": 110255
},
{
"epoch": 0.3515541264737406,
"grad_norm": 0.3117140233516693,
"learning_rate": 0.0025,
"loss": 6.475000381469727,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.54,
"memory/max_allocated (GiB)": 18.54,
"ppl": 648.71948,
"step": 82,
"tokens/total": 187776,
"tokens/train_per_sec_per_gpu": 44.82,
"tokens/trainable": 111472
},
{
"epoch": 0.3558413719185423,
"grad_norm": 0.3783544600009918,
"learning_rate": 0.0025,
"loss": 6.57240104675293,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 715.08474,
"step": 83,
"tokens/total": 189952,
"tokens/train_per_sec_per_gpu": 106.76,
"tokens/trainable": 112729
},
{
"epoch": 0.36012861736334406,
"grad_norm": 0.0920150876045227,
"learning_rate": 0.0025,
"loss": 6.143086910247803,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.54,
"memory/max_allocated (GiB)": 18.54,
"ppl": 465.48828,
"step": 84,
"tokens/total": 192320,
"tokens/train_per_sec_per_gpu": 49.38,
"tokens/trainable": 114215
},
{
"epoch": 0.36441586280814575,
"grad_norm": 0.09526661038398743,
"learning_rate": 0.0025,
"loss": 5.70522403717041,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 300.43278,
"step": 85,
"tokens/total": 194048,
"tokens/train_per_sec_per_gpu": 31.47,
"tokens/trainable": 114924
},
{
"epoch": 0.3687031082529475,
"grad_norm": 0.15185925364494324,
"learning_rate": 0.0025,
"loss": 6.073705673217773,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.39,
"memory/max_allocated (GiB)": 18.39,
"ppl": 434.28703,
"step": 86,
"tokens/total": 195392,
"tokens/train_per_sec_per_gpu": 10.82,
"tokens/trainable": 115406
},
{
"epoch": 0.3729903536977492,
"grad_norm": 0.11615428328514099,
"learning_rate": 0.0025,
"loss": 6.724306106567383,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 832.39418,
"step": 87,
"tokens/total": 197760,
"tokens/train_per_sec_per_gpu": 32.79,
"tokens/trainable": 116773
},
{
"epoch": 0.37727759914255093,
"grad_norm": 0.602443277835846,
"learning_rate": 0.0025,
"loss": 6.486382007598877,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 656.14514,
"step": 88,
"tokens/total": 200192,
"tokens/train_per_sec_per_gpu": 45.96,
"tokens/trainable": 118226
},
{
"epoch": 0.3815648445873526,
"grad_norm": 0.09665144979953766,
"learning_rate": 0.0025,
"loss": 6.018401622772217,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.56,
"memory/max_allocated (GiB)": 18.56,
"ppl": 410.92126,
"step": 89,
"tokens/total": 202816,
"tokens/train_per_sec_per_gpu": 39.89,
"tokens/trainable": 119912
},
{
"epoch": 0.3858520900321543,
"grad_norm": 0.11786024272441864,
"learning_rate": 0.0025,
"loss": 6.514355659484863,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 674.75905,
"step": 90,
"tokens/total": 204544,
"tokens/train_per_sec_per_gpu": 16.48,
"tokens/trainable": 120771
},
{
"epoch": 0.39013933547695606,
"grad_norm": 0.07515699416399002,
"learning_rate": 0.0025,
"loss": 5.9910969734191895,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 399.853,
"step": 91,
"tokens/total": 206464,
"tokens/train_per_sec_per_gpu": 37.79,
"tokens/trainable": 121829
},
{
"epoch": 0.39442658092175775,
"grad_norm": 0.23832163214683533,
"learning_rate": 0.0025,
"loss": 6.451230525970459,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 633.48133,
"step": 92,
"tokens/total": 208384,
"tokens/train_per_sec_per_gpu": 6.91,
"tokens/trainable": 122828
},
{
"epoch": 0.3987138263665595,
"grad_norm": 0.06281202286481857,
"learning_rate": 0.0025,
"loss": 5.861863613128662,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 351.37837,
"step": 93,
"tokens/total": 210688,
"tokens/train_per_sec_per_gpu": 96.78,
"tokens/trainable": 124211
},
{
"epoch": 0.4030010718113612,
"grad_norm": 0.1325235366821289,
"learning_rate": 0.0025,
"loss": 6.324355125427246,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.36,
"memory/max_allocated (GiB)": 18.36,
"ppl": 557.99786,
"step": 94,
"tokens/total": 212096,
"tokens/train_per_sec_per_gpu": 32.79,
"tokens/trainable": 124710
},
{
"epoch": 0.40728831725616294,
"grad_norm": 0.159224733710289,
"learning_rate": 0.0025,
"loss": 5.801607131958008,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.4,
"memory/max_allocated (GiB)": 18.4,
"ppl": 330.83082,
"step": 95,
"tokens/total": 213696,
"tokens/train_per_sec_per_gpu": 21.94,
"tokens/trainable": 125363
},
{
"epoch": 0.4115755627009646,
"grad_norm": 0.1464780569076538,
"learning_rate": 0.0025,
"loss": 6.5112481117248535,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.46,
"memory/max_allocated (GiB)": 18.46,
"ppl": 672.66546,
"step": 96,
"tokens/total": 216320,
"tokens/train_per_sec_per_gpu": 106.96,
"tokens/trainable": 127129
},
{
"epoch": 0.41586280814576637,
"grad_norm": 0.10386360436677933,
"learning_rate": 0.0025,
"loss": 6.267631530761719,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.46,
"memory/max_allocated (GiB)": 18.46,
"ppl": 527.22718,
"step": 97,
"tokens/total": 218368,
"tokens/train_per_sec_per_gpu": 92.64,
"tokens/trainable": 128258
},
{
"epoch": 0.42015005359056806,
"grad_norm": 0.08260782063007355,
"learning_rate": 0.0025,
"loss": 6.162295341491699,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.53,
"memory/max_allocated (GiB)": 18.53,
"ppl": 474.516,
"step": 98,
"tokens/total": 220288,
"tokens/train_per_sec_per_gpu": 228.13,
"tokens/trainable": 129259
},
{
"epoch": 0.42443729903536975,
"grad_norm": 0.15949538350105286,
"learning_rate": 0.0025,
"loss": 6.426211833953857,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.53,
"memory/max_allocated (GiB)": 18.53,
"ppl": 617.82907,
"step": 99,
"tokens/total": 222528,
"tokens/train_per_sec_per_gpu": 27.38,
"tokens/trainable": 130563
},
{
"epoch": 0.4287245444801715,
"grad_norm": 0.13608896732330322,
"learning_rate": 0.0025,
"loss": 5.752037048339844,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 314.83133,
"step": 100,
"tokens/total": 224320,
"tokens/train_per_sec_per_gpu": 4.44,
"tokens/trainable": 131392
},
{
"epoch": 0.4330117899249732,
"grad_norm": 0.0612945631146431,
"learning_rate": 0.0025,
"loss": 6.353605270385742,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.59,
"memory/max_allocated (GiB)": 18.59,
"ppl": 574.56043,
"step": 101,
"tokens/total": 226496,
"tokens/train_per_sec_per_gpu": 64.63,
"tokens/trainable": 132634
},
{
"epoch": 0.43729903536977494,
"grad_norm": 0.3771326243877411,
"learning_rate": 0.0025,
"loss": 5.794309616088867,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.4,
"memory/max_allocated (GiB)": 18.4,
"ppl": 328.42537,
"step": 102,
"tokens/total": 228736,
"tokens/train_per_sec_per_gpu": 23.41,
"tokens/trainable": 134004
},
{
"epoch": 0.4415862808145766,
"grad_norm": 0.05130897834897041,
"learning_rate": 0.0025,
"loss": 5.724153518676758,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.57,
"memory/max_allocated (GiB)": 18.57,
"ppl": 306.17398,
"step": 103,
"tokens/total": 231360,
"tokens/train_per_sec_per_gpu": 36.23,
"tokens/trainable": 135730
},
{
"epoch": 0.44587352625937837,
"grad_norm": 0.05757886916399002,
"learning_rate": 0.0025,
"loss": 5.929745674133301,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 376.05886,
"step": 104,
"tokens/total": 233664,
"tokens/train_per_sec_per_gpu": 35.62,
"tokens/trainable": 137092
},
{
"epoch": 0.45016077170418006,
"grad_norm": 0.17747093737125397,
"learning_rate": 0.0025,
"loss": 5.519756317138672,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 249.57421,
"step": 105,
"tokens/total": 235456,
"tokens/train_per_sec_per_gpu": 6.16,
"tokens/trainable": 138050
},
{
"epoch": 0.45444801714898175,
"grad_norm": 0.9483416080474854,
"learning_rate": 0.0025,
"loss": 6.132290363311768,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.53,
"memory/max_allocated (GiB)": 18.53,
"ppl": 460.48964,
"step": 106,
"tokens/total": 238016,
"tokens/train_per_sec_per_gpu": 8.58,
"tokens/trainable": 139739
},
{
"epoch": 0.4587352625937835,
"grad_norm": 0.0577179454267025,
"learning_rate": 0.0025,
"loss": 6.135179042816162,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 461.82177,
"step": 107,
"tokens/total": 240768,
"tokens/train_per_sec_per_gpu": 197.15,
"tokens/trainable": 141578
},
{
"epoch": 0.4630225080385852,
"grad_norm": 0.13632942736148834,
"learning_rate": 0.0025,
"loss": 5.787478923797607,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 326.18964,
"step": 108,
"tokens/total": 242624,
"tokens/train_per_sec_per_gpu": 6.1,
"tokens/trainable": 142512
},
{
"epoch": 0.46730975348338694,
"grad_norm": 0.06379847973585129,
"learning_rate": 0.0025,
"loss": 5.883008003234863,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.81,
"memory/max_allocated (GiB)": 18.81,
"ppl": 358.88715,
"step": 109,
"tokens/total": 246208,
"tokens/train_per_sec_per_gpu": 80.44,
"tokens/trainable": 145158
},
{
"epoch": 0.4715969989281886,
"grad_norm": 0.06732220202684402,
"learning_rate": 0.0025,
"loss": 6.3092498779296875,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.46,
"memory/max_allocated (GiB)": 18.46,
"ppl": 549.6325,
"step": 110,
"tokens/total": 248448,
"tokens/train_per_sec_per_gpu": 111.87,
"tokens/trainable": 146499
},
{
"epoch": 0.4758842443729904,
"grad_norm": 0.06807619333267212,
"learning_rate": 0.0025,
"loss": 5.3626203536987305,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.61,
"memory/max_allocated (GiB)": 18.61,
"ppl": 213.28309,
"step": 111,
"tokens/total": 251264,
"tokens/train_per_sec_per_gpu": 30.5,
"tokens/trainable": 148381
},
{
"epoch": 0.48017148981779206,
"grad_norm": 0.07463269680738449,
"learning_rate": 0.0025,
"loss": 6.311191558837891,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 550.70075,
"step": 112,
"tokens/total": 252928,
"tokens/train_per_sec_per_gpu": 25.34,
"tokens/trainable": 149181
},
{
"epoch": 0.4844587352625938,
"grad_norm": 0.08292581140995026,
"learning_rate": 0.0025,
"loss": 5.2816314697265625,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 196.69051,
"step": 113,
"tokens/total": 254464,
"tokens/train_per_sec_per_gpu": 136.32,
"tokens/trainable": 149924
},
{
"epoch": 0.4887459807073955,
"grad_norm": 0.061865709722042084,
"learning_rate": 0.0025,
"loss": 6.011983394622803,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 408.29232,
"step": 114,
"tokens/total": 256768,
"tokens/train_per_sec_per_gpu": 10.89,
"tokens/trainable": 151286
},
{
"epoch": 0.4930332261521972,
"grad_norm": 0.10802624374628067,
"learning_rate": 0.0025,
"loss": 5.940651893615723,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 380.18269,
"step": 115,
"tokens/total": 258688,
"tokens/train_per_sec_per_gpu": 90.71,
"tokens/trainable": 152264
},
{
"epoch": 0.49732047159699894,
"grad_norm": 0.06800372898578644,
"learning_rate": 0.0025,
"loss": 5.748905181884766,
"memory/device_reserved (GiB)": 19.35,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 313.84687,
"step": 116,
"tokens/total": 260800,
"tokens/train_per_sec_per_gpu": 10.94,
"tokens/trainable": 153396
},
{
"epoch": 0.5016077170418006,
"grad_norm": 0.05444978550076485,
"learning_rate": 0.0025,
"loss": 5.818991661071777,
"memory/device_reserved (GiB)": 19.39,
"memory/max_active (GiB)": 19.1,
"memory/max_allocated (GiB)": 19.1,
"ppl": 336.63244,
"step": 117,
"tokens/total": 264320,
"tokens/train_per_sec_per_gpu": 603.9,
"tokens/trainable": 156011
},
{
"epoch": 0.5058949624866024,
"grad_norm": 0.05440564081072807,
"learning_rate": 0.0025,
"loss": 5.707052230834961,
"memory/device_reserved (GiB)": 19.39,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 300.98253,
"step": 118,
"tokens/total": 266176,
"tokens/train_per_sec_per_gpu": 31.98,
"tokens/trainable": 156975
},
{
"epoch": 0.5058949624866024,
"eval_loss": 5.828575611114502,
"eval_ppl": 339.87422,
"eval_runtime": 17.0818,
"eval_samples_per_second": 12.177,
"eval_steps_per_second": 12.177,
"memory/device_reserved (GiB)": 19.39,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"step": 118
},
{
"epoch": 0.5101822079314041,
"grad_norm": 0.10290460288524628,
"learning_rate": 0.0025,
"loss": 6.018277168273926,
"memory/device_reserved (GiB)": 18.51,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 410.87013,
"step": 119,
"tokens/total": 267776,
"tokens/train_per_sec_per_gpu": 11.96,
"tokens/trainable": 157706
},
{
"epoch": 0.5144694533762058,
"grad_norm": 0.06775107234716415,
"learning_rate": 0.0025,
"loss": 6.1761064529418945,
"memory/device_reserved (GiB)": 18.72,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 481.11506,
"step": 120,
"tokens/total": 270464,
"tokens/train_per_sec_per_gpu": 63.79,
"tokens/trainable": 159440
},
{
"epoch": 0.5187566988210075,
"grad_norm": 0.09368869662284851,
"learning_rate": 0.0025,
"loss": 5.502309799194336,
"memory/device_reserved (GiB)": 19.02,
"memory/max_active (GiB)": 18.69,
"memory/max_allocated (GiB)": 18.69,
"ppl": 245.25777,
"step": 121,
"tokens/total": 273024,
"tokens/train_per_sec_per_gpu": 0.87,
"tokens/trainable": 161081
},
{
"epoch": 0.5230439442658092,
"grad_norm": 0.05680066719651222,
"learning_rate": 0.0025,
"loss": 5.426424503326416,
"memory/device_reserved (GiB)": 19.02,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 227.33496,
"step": 122,
"tokens/total": 274944,
"tokens/train_per_sec_per_gpu": 7.41,
"tokens/trainable": 162094
},
{
"epoch": 0.5273311897106109,
"grad_norm": 0.05151861906051636,
"learning_rate": 0.0025,
"loss": 5.80197811126709,
"memory/device_reserved (GiB)": 19.02,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 330.95358,
"step": 123,
"tokens/total": 276864,
"tokens/train_per_sec_per_gpu": 49.9,
"tokens/trainable": 163090
},
{
"epoch": 0.5316184351554126,
"grad_norm": 0.08232463896274567,
"learning_rate": 0.0025,
"loss": 5.85071325302124,
"memory/device_reserved (GiB)": 19.02,
"memory/max_active (GiB)": 18.63,
"memory/max_allocated (GiB)": 18.63,
"ppl": 347.48213,
"step": 124,
"tokens/total": 279360,
"tokens/train_per_sec_per_gpu": 1.75,
"tokens/trainable": 164564
},
{
"epoch": 0.5359056806002144,
"grad_norm": 0.08689901232719421,
"learning_rate": 0.0025,
"loss": 5.9564080238342285,
"memory/device_reserved (GiB)": 19.02,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 386.22034,
"step": 125,
"tokens/total": 281984,
"tokens/train_per_sec_per_gpu": 192.74,
"tokens/trainable": 166194
},
{
"epoch": 0.5401929260450161,
"grad_norm": 0.06799639761447906,
"learning_rate": 0.0025,
"loss": 5.424775123596191,
"memory/device_reserved (GiB)": 19.02,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 226.9603,
"step": 126,
"tokens/total": 283584,
"tokens/train_per_sec_per_gpu": 20.32,
"tokens/trainable": 166912
},
{
"epoch": 0.5444801714898178,
"grad_norm": 0.06623026728630066,
"learning_rate": 0.0025,
"loss": 5.536446571350098,
"memory/device_reserved (GiB)": 19.02,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 253.77463,
"step": 127,
"tokens/total": 286400,
"tokens/train_per_sec_per_gpu": 112.92,
"tokens/trainable": 168763
},
{
"epoch": 0.5487674169346195,
"grad_norm": 0.07652036845684052,
"learning_rate": 0.0025,
"loss": 4.905512809753418,
"memory/device_reserved (GiB)": 19.02,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 135.03214,
"step": 128,
"tokens/total": 288448,
"tokens/train_per_sec_per_gpu": 213.64,
"tokens/trainable": 169869
},
{
"epoch": 0.5530546623794212,
"grad_norm": 0.07232151925563812,
"learning_rate": 0.0025,
"loss": 5.730243682861328,
"memory/device_reserved (GiB)": 19.02,
"memory/max_active (GiB)": 18.46,
"memory/max_allocated (GiB)": 18.46,
"ppl": 308.04432,
"step": 129,
"tokens/total": 290560,
"tokens/train_per_sec_per_gpu": 8.69,
"tokens/trainable": 171082
},
{
"epoch": 0.5573419078242229,
"grad_norm": 0.1090153232216835,
"learning_rate": 0.0025,
"loss": 4.770911693572998,
"memory/device_reserved (GiB)": 19.02,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 118.0268,
"step": 130,
"tokens/total": 292096,
"tokens/train_per_sec_per_gpu": 2.14,
"tokens/trainable": 171701
},
{
"epoch": 0.5616291532690246,
"grad_norm": 0.07109065353870392,
"learning_rate": 0.0025,
"loss": 6.074782371520996,
"memory/device_reserved (GiB)": 19.02,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 434.75488,
"step": 131,
"tokens/total": 293952,
"tokens/train_per_sec_per_gpu": 11.1,
"tokens/trainable": 172572
},
{
"epoch": 0.5659163987138264,
"grad_norm": 0.06394513696432114,
"learning_rate": 0.0025,
"loss": 5.415735244750977,
"memory/device_reserved (GiB)": 19.02,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 224.91785,
"step": 132,
"tokens/total": 295424,
"tokens/train_per_sec_per_gpu": 25.67,
"tokens/trainable": 173255
},
{
"epoch": 0.5702036441586281,
"grad_norm": 0.07912840694189072,
"learning_rate": 0.0025,
"loss": 5.249301910400391,
"memory/device_reserved (GiB)": 19.02,
"memory/max_active (GiB)": 18.67,
"memory/max_allocated (GiB)": 18.67,
"ppl": 190.43328,
"step": 133,
"tokens/total": 298112,
"tokens/train_per_sec_per_gpu": 6.48,
"tokens/trainable": 175036
},
{
"epoch": 0.5744908896034298,
"grad_norm": 0.05524107813835144,
"learning_rate": 0.0025,
"loss": 6.232509613037109,
"memory/device_reserved (GiB)": 19.02,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 509.03135,
"step": 134,
"tokens/total": 300288,
"tokens/train_per_sec_per_gpu": 114.55,
"tokens/trainable": 176200
},
{
"epoch": 0.5787781350482315,
"grad_norm": 0.08867005258798599,
"learning_rate": 0.0025,
"loss": 5.501221656799316,
"memory/device_reserved (GiB)": 19.02,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 244.99104,
"step": 135,
"tokens/total": 302016,
"tokens/train_per_sec_per_gpu": 8.77,
"tokens/trainable": 176997
},
{
"epoch": 0.5830653804930332,
"grad_norm": 0.0581279955804348,
"learning_rate": 0.0025,
"loss": 6.093716144561768,
"memory/device_reserved (GiB)": 19.02,
"memory/max_active (GiB)": 18.4,
"memory/max_allocated (GiB)": 18.4,
"ppl": 443.06485,
"step": 136,
"tokens/total": 303936,
"tokens/train_per_sec_per_gpu": 49.42,
"tokens/trainable": 177895
},
{
"epoch": 0.587352625937835,
"grad_norm": 0.24256502091884613,
"learning_rate": 0.0025,
"loss": 5.912446975708008,
"memory/device_reserved (GiB)": 19.02,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 369.60948,
"step": 137,
"tokens/total": 306240,
"tokens/train_per_sec_per_gpu": 117.87,
"tokens/trainable": 179285
},
{
"epoch": 0.5916398713826366,
"grad_norm": 0.08909733593463898,
"learning_rate": 0.0025,
"loss": 7.02211332321167,
"memory/device_reserved (GiB)": 19.02,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 1121.15348,
"step": 138,
"tokens/total": 308672,
"tokens/train_per_sec_per_gpu": 145.17,
"tokens/trainable": 180732
},
{
"epoch": 0.5959271168274384,
"grad_norm": 0.11382216215133667,
"learning_rate": 0.0025,
"loss": 6.5203537940979,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.92,
"memory/max_allocated (GiB)": 18.92,
"ppl": 678.8185,
"step": 139,
"tokens/total": 311552,
"tokens/train_per_sec_per_gpu": 17.16,
"tokens/trainable": 182719
},
{
"epoch": 0.6002143622722401,
"grad_norm": 0.09745891392230988,
"learning_rate": 0.0025,
"loss": 5.971193790435791,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 391.97333,
"step": 140,
"tokens/total": 313856,
"tokens/train_per_sec_per_gpu": 150.24,
"tokens/trainable": 184142
},
{
"epoch": 0.6045016077170418,
"grad_norm": 0.0861353650689125,
"learning_rate": 0.0025,
"loss": 5.941656112670898,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 380.56467,
"step": 141,
"tokens/total": 315392,
"tokens/train_per_sec_per_gpu": 68.11,
"tokens/trainable": 184842
},
{
"epoch": 0.6087888531618435,
"grad_norm": 0.07818640768527985,
"learning_rate": 0.0025,
"loss": 5.195644378662109,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.56,
"memory/max_allocated (GiB)": 18.56,
"ppl": 180.48441,
"step": 142,
"tokens/total": 317248,
"tokens/train_per_sec_per_gpu": 24.45,
"tokens/trainable": 185810
},
{
"epoch": 0.6130760986066452,
"grad_norm": 0.06124793365597725,
"learning_rate": 0.0025,
"loss": 5.502886772155762,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 245.39932,
"step": 143,
"tokens/total": 319232,
"tokens/train_per_sec_per_gpu": 22.21,
"tokens/trainable": 186800
},
{
"epoch": 0.617363344051447,
"grad_norm": 0.0705777034163475,
"learning_rate": 0.0025,
"loss": 6.341245651245117,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.72,
"memory/max_allocated (GiB)": 18.72,
"ppl": 567.50278,
"step": 144,
"tokens/total": 322304,
"tokens/train_per_sec_per_gpu": 19.8,
"tokens/trainable": 189022
},
{
"epoch": 0.6216505894962486,
"grad_norm": 0.07551176846027374,
"learning_rate": 0.0025,
"loss": 5.537265777587891,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 253.9826,
"step": 145,
"tokens/total": 324608,
"tokens/train_per_sec_per_gpu": 14.15,
"tokens/trainable": 190415
},
{
"epoch": 0.6259378349410504,
"grad_norm": 0.04925059527158737,
"learning_rate": 0.0025,
"loss": 5.8168511390686035,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 335.91264,
"step": 146,
"tokens/total": 327168,
"tokens/train_per_sec_per_gpu": 28.95,
"tokens/trainable": 192079
},
{
"epoch": 0.6302250803858521,
"grad_norm": 0.06655045598745346,
"learning_rate": 0.0025,
"loss": 5.837461471557617,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 342.90775,
"step": 147,
"tokens/total": 328640,
"tokens/train_per_sec_per_gpu": 33.92,
"tokens/trainable": 192665
},
{
"epoch": 0.6345123258306538,
"grad_norm": 0.07563883811235428,
"learning_rate": 0.0025,
"loss": 5.163289546966553,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 174.73832,
"step": 148,
"tokens/total": 330432,
"tokens/train_per_sec_per_gpu": 112.65,
"tokens/trainable": 193531
},
{
"epoch": 0.6387995712754555,
"grad_norm": 0.05340481176972389,
"learning_rate": 0.0025,
"loss": 5.348882675170898,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 210.37311,
"step": 149,
"tokens/total": 332032,
"tokens/train_per_sec_per_gpu": 35.17,
"tokens/trainable": 194286
},
{
"epoch": 0.6430868167202572,
"grad_norm": 0.07053768634796143,
"learning_rate": 0.0025,
"loss": 6.0246500968933105,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 413.49693,
"step": 150,
"tokens/total": 334592,
"tokens/train_per_sec_per_gpu": 53.06,
"tokens/trainable": 195894
},
{
"epoch": 0.647374062165059,
"grad_norm": 0.10346148163080215,
"learning_rate": 0.0025,
"loss": 6.111691474914551,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 451.1011,
"step": 151,
"tokens/total": 336896,
"tokens/train_per_sec_per_gpu": 84.27,
"tokens/trainable": 197140
},
{
"epoch": 0.6516613076098606,
"grad_norm": 0.05668232962489128,
"learning_rate": 0.0025,
"loss": 5.328207015991211,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.56,
"memory/max_allocated (GiB)": 18.56,
"ppl": 206.06817,
"step": 152,
"tokens/total": 339392,
"tokens/train_per_sec_per_gpu": 68.85,
"tokens/trainable": 198641
},
{
"epoch": 0.6559485530546624,
"grad_norm": 0.07267311960458755,
"learning_rate": 0.0025,
"loss": 5.421064853668213,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 226.11978,
"step": 153,
"tokens/total": 341312,
"tokens/train_per_sec_per_gpu": 108.38,
"tokens/trainable": 199572
},
{
"epoch": 0.6602357984994641,
"grad_norm": 0.05049528181552887,
"learning_rate": 0.0025,
"loss": 5.701568603515625,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.63,
"memory/max_allocated (GiB)": 18.63,
"ppl": 299.33657,
"step": 154,
"tokens/total": 344064,
"tokens/train_per_sec_per_gpu": 300.52,
"tokens/trainable": 201242
},
{
"epoch": 0.6645230439442658,
"grad_norm": 0.056077949702739716,
"learning_rate": 0.0025,
"loss": 5.401850700378418,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.56,
"memory/max_allocated (GiB)": 18.56,
"ppl": 221.81655,
"step": 155,
"tokens/total": 346432,
"tokens/train_per_sec_per_gpu": 258.94,
"tokens/trainable": 202788
},
{
"epoch": 0.6688102893890675,
"grad_norm": 0.11511314660310745,
"learning_rate": 0.0025,
"loss": 5.723288536071777,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 305.90926,
"step": 156,
"tokens/total": 348160,
"tokens/train_per_sec_per_gpu": 129.3,
"tokens/trainable": 203548
},
{
"epoch": 0.6730975348338692,
"grad_norm": 0.04815061762928963,
"learning_rate": 0.0025,
"loss": 5.29791259765625,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 199.91906,
"step": 157,
"tokens/total": 349888,
"tokens/train_per_sec_per_gpu": 61.9,
"tokens/trainable": 204408
},
{
"epoch": 0.677384780278671,
"grad_norm": 0.0801524966955185,
"learning_rate": 0.0025,
"loss": 6.227181911468506,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 506.3266,
"step": 158,
"tokens/total": 351744,
"tokens/train_per_sec_per_gpu": 102.56,
"tokens/trainable": 205262
},
{
"epoch": 0.6816720257234726,
"grad_norm": 0.059293024241924286,
"learning_rate": 0.0025,
"loss": 5.615620136260986,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.46,
"memory/max_allocated (GiB)": 18.46,
"ppl": 274.68367,
"step": 159,
"tokens/total": 354176,
"tokens/train_per_sec_per_gpu": 31.25,
"tokens/trainable": 206828
},
{
"epoch": 0.6859592711682744,
"grad_norm": 0.07457486540079117,
"learning_rate": 0.0025,
"loss": 5.283931255340576,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.56,
"memory/max_allocated (GiB)": 18.56,
"ppl": 197.14337,
"step": 160,
"tokens/total": 356416,
"tokens/train_per_sec_per_gpu": 51.94,
"tokens/trainable": 208192
},
{
"epoch": 0.6902465166130761,
"grad_norm": 0.06552717089653015,
"learning_rate": 0.0025,
"loss": 5.763904571533203,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 318.58986,
"step": 161,
"tokens/total": 358336,
"tokens/train_per_sec_per_gpu": 169.49,
"tokens/trainable": 209238
},
{
"epoch": 0.6945337620578779,
"grad_norm": 0.07754746079444885,
"learning_rate": 0.0025,
"loss": 5.163079261779785,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 174.70158,
"step": 162,
"tokens/total": 360576,
"tokens/train_per_sec_per_gpu": 17.68,
"tokens/trainable": 210560
},
{
"epoch": 0.6988210075026795,
"grad_norm": 0.11694307625293732,
"learning_rate": 0.0025,
"loss": 6.16943359375,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 477.91534,
"step": 163,
"tokens/total": 362560,
"tokens/train_per_sec_per_gpu": 6.89,
"tokens/trainable": 211646
},
{
"epoch": 0.7031082529474812,
"grad_norm": 0.3541814684867859,
"learning_rate": 0.0025,
"loss": 6.192336082458496,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 488.98709,
"step": 164,
"tokens/total": 364032,
"tokens/train_per_sec_per_gpu": 68.77,
"tokens/trainable": 212233
},
{
"epoch": 0.707395498392283,
"grad_norm": 0.0691906213760376,
"learning_rate": 0.0025,
"loss": 5.444247722625732,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.51,
"memory/max_allocated (GiB)": 18.51,
"ppl": 231.42312,
"step": 165,
"tokens/total": 366208,
"tokens/train_per_sec_per_gpu": 74.88,
"tokens/trainable": 213482
},
{
"epoch": 0.7116827438370846,
"grad_norm": 0.06896678358316422,
"learning_rate": 0.0025,
"loss": 5.45513391494751,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 233.9562,
"step": 166,
"tokens/total": 368192,
"tokens/train_per_sec_per_gpu": 150.94,
"tokens/trainable": 214570
},
{
"epoch": 0.7159699892818864,
"grad_norm": 0.07068932056427002,
"learning_rate": 0.0025,
"loss": 5.088237762451172,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 162.10394,
"step": 167,
"tokens/total": 370432,
"tokens/train_per_sec_per_gpu": 37.23,
"tokens/trainable": 215958
},
{
"epoch": 0.7202572347266881,
"grad_norm": 0.07037021219730377,
"learning_rate": 0.0025,
"loss": 5.780329704284668,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 323.86595,
"step": 168,
"tokens/total": 373248,
"tokens/train_per_sec_per_gpu": 48.79,
"tokens/trainable": 217782
},
{
"epoch": 0.7245444801714899,
"grad_norm": 0.09113272279500961,
"learning_rate": 0.0025,
"loss": 5.445977210998535,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 231.82371,
"step": 169,
"tokens/total": 375296,
"tokens/train_per_sec_per_gpu": 135.42,
"tokens/trainable": 218943
},
{
"epoch": 0.7288317256162915,
"grad_norm": 0.04421088844537735,
"learning_rate": 0.0025,
"loss": 5.593143463134766,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 268.57856,
"step": 170,
"tokens/total": 377280,
"tokens/train_per_sec_per_gpu": 32.25,
"tokens/trainable": 220097
},
{
"epoch": 0.7331189710610932,
"grad_norm": 0.07146560400724411,
"learning_rate": 0.0025,
"loss": 5.347398281097412,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.51,
"memory/max_allocated (GiB)": 18.51,
"ppl": 210.06107,
"step": 171,
"tokens/total": 379392,
"tokens/train_per_sec_per_gpu": 15.36,
"tokens/trainable": 221250
},
{
"epoch": 0.737406216505895,
"grad_norm": 0.058737654238939285,
"learning_rate": 0.0025,
"loss": 5.430908203125,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.58,
"memory/max_allocated (GiB)": 18.58,
"ppl": 228.35655,
"step": 172,
"tokens/total": 382144,
"tokens/train_per_sec_per_gpu": 17.26,
"tokens/trainable": 223081
},
{
"epoch": 0.7416934619506966,
"grad_norm": 0.08348660171031952,
"learning_rate": 0.0025,
"loss": 5.785150527954102,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 325.43102,
"step": 173,
"tokens/total": 384768,
"tokens/train_per_sec_per_gpu": 56.26,
"tokens/trainable": 224769
},
{
"epoch": 0.7459807073954984,
"grad_norm": 0.04298631101846695,
"learning_rate": 0.0025,
"loss": 5.393362998962402,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 219.94181,
"step": 174,
"tokens/total": 387136,
"tokens/train_per_sec_per_gpu": 37.67,
"tokens/trainable": 226216
},
{
"epoch": 0.7502679528403001,
"grad_norm": 0.08122890442609787,
"learning_rate": 0.0025,
"loss": 5.101113796234131,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.4,
"memory/max_allocated (GiB)": 18.4,
"ppl": 164.2047,
"step": 175,
"tokens/total": 388608,
"tokens/train_per_sec_per_gpu": 44.01,
"tokens/trainable": 226770
},
{
"epoch": 0.7545551982851019,
"grad_norm": 0.11935891956090927,
"learning_rate": 0.0025,
"loss": 5.519252300262451,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 249.44845,
"step": 176,
"tokens/total": 390784,
"tokens/train_per_sec_per_gpu": 39.54,
"tokens/trainable": 228098
},
{
"epoch": 0.7588424437299035,
"grad_norm": 0.15656660497188568,
"learning_rate": 0.0025,
"loss": 4.941908836364746,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 140.0373,
"step": 177,
"tokens/total": 392512,
"tokens/train_per_sec_per_gpu": 36.08,
"tokens/trainable": 228809
},
{
"epoch": 0.7588424437299035,
"eval_loss": 5.460564136505127,
"eval_ppl": 235.23009,
"eval_runtime": 17.2516,
"eval_samples_per_second": 12.057,
"eval_steps_per_second": 12.057,
"memory/device_reserved (GiB)": 19.56,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"step": 177
},
{
"epoch": 0.7631296891747053,
"grad_norm": 0.05968443304300308,
"learning_rate": 0.0025,
"loss": 5.653349876403809,
"memory/device_reserved (GiB)": 19.59,
"memory/max_active (GiB)": 19.12,
"memory/max_allocated (GiB)": 19.12,
"ppl": 285.2454,
"step": 178,
"tokens/total": 396416,
"tokens/train_per_sec_per_gpu": 110.71,
"tokens/trainable": 231745
},
{
"epoch": 0.767416934619507,
"grad_norm": 0.11675012111663818,
"learning_rate": 0.0025,
"loss": 5.658262252807617,
"memory/device_reserved (GiB)": 19.59,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 286.65008,
"step": 179,
"tokens/total": 399040,
"tokens/train_per_sec_per_gpu": 101.59,
"tokens/trainable": 233370
},
{
"epoch": 0.7717041800643086,
"grad_norm": 0.07265754044055939,
"learning_rate": 0.0025,
"loss": 5.7116546630859375,
"memory/device_reserved (GiB)": 19.59,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 302.37098,
"step": 180,
"tokens/total": 400896,
"tokens/train_per_sec_per_gpu": 5.2,
"tokens/trainable": 234357
},
{
"epoch": 0.7759914255091104,
"grad_norm": 0.06884697079658508,
"learning_rate": 0.0025,
"loss": 5.63464879989624,
"memory/device_reserved (GiB)": 19.59,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 279.96058,
"step": 181,
"tokens/total": 403072,
"tokens/train_per_sec_per_gpu": 105.0,
"tokens/trainable": 235702
},
{
"epoch": 0.7802786709539121,
"grad_norm": 0.12419719249010086,
"learning_rate": 0.0025,
"loss": 5.239850997924805,
"memory/device_reserved (GiB)": 19.59,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 188.64199,
"step": 182,
"tokens/total": 404928,
"tokens/train_per_sec_per_gpu": 19.58,
"tokens/trainable": 236615
},
{
"epoch": 0.7845659163987139,
"grad_norm": 0.10391955822706223,
"learning_rate": 0.0025,
"loss": 4.728728294372559,
"memory/device_reserved (GiB)": 19.59,
"memory/max_active (GiB)": 18.39,
"memory/max_allocated (GiB)": 18.39,
"ppl": 113.15158,
"step": 183,
"tokens/total": 406272,
"tokens/train_per_sec_per_gpu": 24.51,
"tokens/trainable": 237082
},
{
"epoch": 0.7888531618435155,
"grad_norm": 0.07022784650325775,
"learning_rate": 0.0025,
"loss": 5.611477375030518,
"memory/device_reserved (GiB)": 19.59,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 273.54807,
"step": 184,
"tokens/total": 408384,
"tokens/train_per_sec_per_gpu": 110.02,
"tokens/trainable": 238396
},
{
"epoch": 0.7931404072883173,
"grad_norm": 0.11678767204284668,
"learning_rate": 0.0025,
"loss": 6.098667144775391,
"memory/device_reserved (GiB)": 19.59,
"memory/max_active (GiB)": 18.51,
"memory/max_allocated (GiB)": 18.51,
"ppl": 445.2639,
"step": 185,
"tokens/total": 410560,
"tokens/train_per_sec_per_gpu": 2.18,
"tokens/trainable": 239649
},
{
"epoch": 0.797427652733119,
"grad_norm": 0.06710375845432281,
"learning_rate": 0.0025,
"loss": 5.643878936767578,
"memory/device_reserved (GiB)": 19.59,
"memory/max_active (GiB)": 18.65,
"memory/max_allocated (GiB)": 18.65,
"ppl": 282.55661,
"step": 186,
"tokens/total": 412800,
"tokens/train_per_sec_per_gpu": 12.9,
"tokens/trainable": 240972
},
{
"epoch": 0.8017148981779206,
"grad_norm": 0.09496990591287613,
"learning_rate": 0.0025,
"loss": 5.7106122970581055,
"memory/device_reserved (GiB)": 19.59,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 302.05596,
"step": 187,
"tokens/total": 415296,
"tokens/train_per_sec_per_gpu": 67.78,
"tokens/trainable": 242450
},
{
"epoch": 0.8060021436227224,
"grad_norm": 0.06658744066953659,
"learning_rate": 0.0025,
"loss": 5.754177093505859,
"memory/device_reserved (GiB)": 19.59,
"memory/max_active (GiB)": 18.46,
"memory/max_allocated (GiB)": 18.46,
"ppl": 315.50581,
"step": 188,
"tokens/total": 417856,
"tokens/train_per_sec_per_gpu": 14.18,
"tokens/trainable": 244076
},
{
"epoch": 0.8102893890675241,
"grad_norm": 0.054122067987918854,
"learning_rate": 0.0025,
"loss": 5.310729026794434,
"memory/device_reserved (GiB)": 19.59,
"memory/max_active (GiB)": 18.4,
"memory/max_allocated (GiB)": 18.4,
"ppl": 202.4978,
"step": 189,
"tokens/total": 419584,
"tokens/train_per_sec_per_gpu": 11.57,
"tokens/trainable": 244832
},
{
"epoch": 0.8145766345123259,
"grad_norm": 0.11224393546581268,
"learning_rate": 0.0025,
"loss": 5.699178695678711,
"memory/device_reserved (GiB)": 19.59,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 298.62204,
"step": 190,
"tokens/total": 420928,
"tokens/train_per_sec_per_gpu": 8.17,
"tokens/trainable": 245280
},
{
"epoch": 0.8188638799571275,
"grad_norm": 0.05273159593343735,
"learning_rate": 0.0025,
"loss": 5.184141159057617,
"memory/device_reserved (GiB)": 19.59,
"memory/max_active (GiB)": 18.73,
"memory/max_allocated (GiB)": 18.73,
"ppl": 178.42015,
"step": 191,
"tokens/total": 424000,
"tokens/train_per_sec_per_gpu": 53.69,
"tokens/trainable": 247590
},
{
"epoch": 0.8231511254019293,
"grad_norm": 0.03733557090163231,
"learning_rate": 0.0025,
"loss": 5.854153633117676,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 19.12,
"memory/max_allocated (GiB)": 19.12,
"ppl": 348.67966,
"step": 192,
"tokens/total": 428480,
"tokens/train_per_sec_per_gpu": 52.36,
"tokens/trainable": 251024
},
{
"epoch": 0.827438370846731,
"grad_norm": 0.06321356445550919,
"learning_rate": 0.0025,
"loss": 5.20097017288208,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 181.44819,
"step": 193,
"tokens/total": 430656,
"tokens/train_per_sec_per_gpu": 1.74,
"tokens/trainable": 252277
},
{
"epoch": 0.8317256162915327,
"grad_norm": 0.1052091047167778,
"learning_rate": 0.0025,
"loss": 4.629343509674072,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 102.44679,
"step": 194,
"tokens/total": 432064,
"tokens/train_per_sec_per_gpu": 21.14,
"tokens/trainable": 252834
},
{
"epoch": 0.8360128617363344,
"grad_norm": 0.06142156571149826,
"learning_rate": 0.0025,
"loss": 5.491484642028809,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 242.61714,
"step": 195,
"tokens/total": 434624,
"tokens/train_per_sec_per_gpu": 23.29,
"tokens/trainable": 254420
},
{
"epoch": 0.8403001071811361,
"grad_norm": 0.05292431265115738,
"learning_rate": 0.0025,
"loss": 5.481139183044434,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.62,
"memory/max_allocated (GiB)": 18.62,
"ppl": 240.12009,
"step": 196,
"tokens/total": 437440,
"tokens/train_per_sec_per_gpu": 52.86,
"tokens/trainable": 256271
},
{
"epoch": 0.8445873526259379,
"grad_norm": 0.0795208215713501,
"learning_rate": 0.0025,
"loss": 5.259885787963867,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.56,
"memory/max_allocated (GiB)": 18.56,
"ppl": 192.45951,
"step": 197,
"tokens/total": 439744,
"tokens/train_per_sec_per_gpu": 64.52,
"tokens/trainable": 257576
},
{
"epoch": 0.8488745980707395,
"grad_norm": 0.08473947644233704,
"learning_rate": 0.0025,
"loss": 5.696974754333496,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.59,
"memory/max_allocated (GiB)": 18.59,
"ppl": 297.96462,
"step": 198,
"tokens/total": 442816,
"tokens/train_per_sec_per_gpu": 36.19,
"tokens/trainable": 259567
},
{
"epoch": 0.8531618435155413,
"grad_norm": 0.057201892137527466,
"learning_rate": 0.0025,
"loss": 5.780117034912109,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.62,
"memory/max_allocated (GiB)": 18.62,
"ppl": 323.79708,
"step": 199,
"tokens/total": 445888,
"tokens/train_per_sec_per_gpu": 81.99,
"tokens/trainable": 261637
},
{
"epoch": 0.857449088960343,
"grad_norm": 0.09185982495546341,
"learning_rate": 0.0025,
"loss": 6.029691219329834,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 415.58668,
"step": 200,
"tokens/total": 449408,
"tokens/train_per_sec_per_gpu": 163.44,
"tokens/trainable": 264161
},
{
"epoch": 0.8617363344051447,
"grad_norm": 0.055811040103435516,
"learning_rate": 0.0025,
"loss": 5.37666654586792,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.51,
"memory/max_allocated (GiB)": 18.51,
"ppl": 216.30005,
"step": 201,
"tokens/total": 452032,
"tokens/train_per_sec_per_gpu": 14.76,
"tokens/trainable": 265933
},
{
"epoch": 0.8660235798499464,
"grad_norm": 0.06049516424536705,
"learning_rate": 0.0025,
"loss": 5.930117130279541,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 376.19858,
"step": 202,
"tokens/total": 455232,
"tokens/train_per_sec_per_gpu": 34.59,
"tokens/trainable": 268176
},
{
"epoch": 0.8703108252947481,
"grad_norm": 0.04760754853487015,
"learning_rate": 0.0025,
"loss": 5.988245487213135,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.54,
"memory/max_allocated (GiB)": 18.54,
"ppl": 398.71445,
"step": 203,
"tokens/total": 457792,
"tokens/train_per_sec_per_gpu": 23.31,
"tokens/trainable": 269826
},
{
"epoch": 0.8745980707395499,
"grad_norm": 0.3524979054927826,
"learning_rate": 0.0025,
"loss": 5.5907745361328125,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 267.94307,
"step": 204,
"tokens/total": 460224,
"tokens/train_per_sec_per_gpu": 112.76,
"tokens/trainable": 271226
},
{
"epoch": 0.8788853161843515,
"grad_norm": 0.07816269993782043,
"learning_rate": 0.0025,
"loss": 5.5978217124938965,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.53,
"memory/max_allocated (GiB)": 18.53,
"ppl": 269.83798,
"step": 205,
"tokens/total": 463232,
"tokens/train_per_sec_per_gpu": 83.11,
"tokens/trainable": 273301
},
{
"epoch": 0.8831725616291533,
"grad_norm": 0.06009744852781296,
"learning_rate": 0.0025,
"loss": 5.7178802490234375,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 304.25928,
"step": 206,
"tokens/total": 464960,
"tokens/train_per_sec_per_gpu": 16.0,
"tokens/trainable": 274259
},
{
"epoch": 0.887459807073955,
"grad_norm": 0.09694831818342209,
"learning_rate": 0.0025,
"loss": 5.388084888458252,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 218.78399,
"step": 207,
"tokens/total": 467008,
"tokens/train_per_sec_per_gpu": 34.61,
"tokens/trainable": 275392
},
{
"epoch": 0.8917470525187567,
"grad_norm": 0.06148134917020798,
"learning_rate": 0.0025,
"loss": 5.084897041320801,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 161.5633,
"step": 208,
"tokens/total": 468608,
"tokens/train_per_sec_per_gpu": 9.12,
"tokens/trainable": 276045
},
{
"epoch": 0.8960342979635584,
"grad_norm": 0.09476284682750702,
"learning_rate": 0.0025,
"loss": 5.941885471343994,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.6,
"memory/max_allocated (GiB)": 18.6,
"ppl": 380.65196,
"step": 209,
"tokens/total": 470912,
"tokens/train_per_sec_per_gpu": 31.97,
"tokens/trainable": 277484
},
{
"epoch": 0.9003215434083601,
"grad_norm": 0.07026304304599762,
"learning_rate": 0.0025,
"loss": 5.341467380523682,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.7,
"memory/max_allocated (GiB)": 18.7,
"ppl": 208.8189,
"step": 210,
"tokens/total": 473920,
"tokens/train_per_sec_per_gpu": 114.73,
"tokens/trainable": 279470
},
{
"epoch": 0.9046087888531619,
"grad_norm": 0.046190474182367325,
"learning_rate": 0.0025,
"loss": 5.205893516540527,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.54,
"memory/max_allocated (GiB)": 18.54,
"ppl": 182.34373,
"step": 211,
"tokens/total": 476480,
"tokens/train_per_sec_per_gpu": 112.77,
"tokens/trainable": 280947
},
{
"epoch": 0.9088960342979635,
"grad_norm": 0.06688795238733292,
"learning_rate": 0.0025,
"loss": 4.937982559204102,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 139.48856,
"step": 212,
"tokens/total": 478016,
"tokens/train_per_sec_per_gpu": 101.68,
"tokens/trainable": 281612
},
{
"epoch": 0.9131832797427653,
"grad_norm": 0.07849342375993729,
"learning_rate": 0.0025,
"loss": 5.182241439819336,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.46,
"memory/max_allocated (GiB)": 18.46,
"ppl": 178.08152,
"step": 213,
"tokens/total": 479680,
"tokens/train_per_sec_per_gpu": 150.05,
"tokens/trainable": 282478
},
{
"epoch": 0.917470525187567,
"grad_norm": 0.05929256230592728,
"learning_rate": 0.0025,
"loss": 4.967282295227051,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.42,
"memory/max_allocated (GiB)": 18.42,
"ppl": 143.636,
"step": 214,
"tokens/total": 481728,
"tokens/train_per_sec_per_gpu": 128.72,
"tokens/trainable": 283641
},
{
"epoch": 0.9217577706323687,
"grad_norm": 0.07487839460372925,
"learning_rate": 0.0025,
"loss": 5.071913719177246,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.39,
"memory/max_allocated (GiB)": 18.39,
"ppl": 159.47923,
"step": 215,
"tokens/total": 483200,
"tokens/train_per_sec_per_gpu": 13.64,
"tokens/trainable": 284253
},
{
"epoch": 0.9260450160771704,
"grad_norm": 0.08760891854763031,
"learning_rate": 0.0025,
"loss": 5.086419582366943,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 161.80948,
"step": 216,
"tokens/total": 485184,
"tokens/train_per_sec_per_gpu": 33.77,
"tokens/trainable": 285350
},
{
"epoch": 0.9303322615219721,
"grad_norm": 0.058040693402290344,
"learning_rate": 0.0025,
"loss": 5.140557289123535,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 170.81093,
"step": 217,
"tokens/total": 487488,
"tokens/train_per_sec_per_gpu": 148.26,
"tokens/trainable": 286725
},
{
"epoch": 0.9346195069667739,
"grad_norm": 0.051069747656583786,
"learning_rate": 0.0025,
"loss": 5.112553119659424,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.54,
"memory/max_allocated (GiB)": 18.54,
"ppl": 166.09387,
"step": 218,
"tokens/total": 490176,
"tokens/train_per_sec_per_gpu": 7.43,
"tokens/trainable": 288433
},
{
"epoch": 0.9389067524115756,
"grad_norm": 0.04658494517207146,
"learning_rate": 0.0025,
"loss": 5.061060428619385,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.46,
"memory/max_allocated (GiB)": 18.46,
"ppl": 157.75772,
"step": 219,
"tokens/total": 492288,
"tokens/train_per_sec_per_gpu": 18.83,
"tokens/trainable": 289637
},
{
"epoch": 0.9431939978563773,
"grad_norm": 0.058834441006183624,
"learning_rate": 0.0025,
"loss": 5.130789756774902,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 169.15065,
"step": 220,
"tokens/total": 494848,
"tokens/train_per_sec_per_gpu": 137.69,
"tokens/trainable": 291132
},
{
"epoch": 0.947481243301179,
"grad_norm": 0.10351614654064178,
"learning_rate": 0.0025,
"loss": 5.497127532958984,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 243.99007,
"step": 221,
"tokens/total": 497088,
"tokens/train_per_sec_per_gpu": 195.46,
"tokens/trainable": 292518
},
{
"epoch": 0.9517684887459807,
"grad_norm": 0.14364013075828552,
"learning_rate": 0.0025,
"loss": 5.386436939239502,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 218.42374,
"step": 222,
"tokens/total": 499520,
"tokens/train_per_sec_per_gpu": 4.31,
"tokens/trainable": 293866
},
{
"epoch": 0.9560557341907824,
"grad_norm": 0.06514472514390945,
"learning_rate": 0.0025,
"loss": 5.5871148109436035,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.53,
"memory/max_allocated (GiB)": 18.53,
"ppl": 266.96427,
"step": 223,
"tokens/total": 502208,
"tokens/train_per_sec_per_gpu": 19.56,
"tokens/trainable": 295684
},
{
"epoch": 0.9603429796355841,
"grad_norm": 0.05746331810951233,
"learning_rate": 0.0025,
"loss": 5.019771099090576,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.57,
"memory/max_allocated (GiB)": 18.57,
"ppl": 151.37665,
"step": 224,
"tokens/total": 504576,
"tokens/train_per_sec_per_gpu": 133.41,
"tokens/trainable": 297180
},
{
"epoch": 0.9646302250803859,
"grad_norm": 0.06428291648626328,
"learning_rate": 0.0025,
"loss": 5.374805450439453,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.77,
"memory/max_allocated (GiB)": 18.77,
"ppl": 215.89787,
"step": 225,
"tokens/total": 508224,
"tokens/train_per_sec_per_gpu": 174.74,
"tokens/trainable": 299827
},
{
"epoch": 0.9689174705251876,
"grad_norm": 0.06595566868782043,
"learning_rate": 0.0025,
"loss": 5.579074859619141,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 264.82649,
"step": 226,
"tokens/total": 509824,
"tokens/train_per_sec_per_gpu": 6.15,
"tokens/trainable": 300582
},
{
"epoch": 0.9732047159699893,
"grad_norm": 0.06129618361592293,
"learning_rate": 0.0025,
"loss": 4.87337064743042,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.67,
"memory/max_allocated (GiB)": 18.67,
"ppl": 130.76092,
"step": 227,
"tokens/total": 512768,
"tokens/train_per_sec_per_gpu": 14.39,
"tokens/trainable": 302643
},
{
"epoch": 0.977491961414791,
"grad_norm": 0.07292018085718155,
"learning_rate": 0.0025,
"loss": 5.493099212646484,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.55,
"memory/max_allocated (GiB)": 18.55,
"ppl": 243.00918,
"step": 228,
"tokens/total": 515328,
"tokens/train_per_sec_per_gpu": 8.6,
"tokens/trainable": 304358
},
{
"epoch": 0.9817792068595927,
"grad_norm": 0.04551401734352112,
"learning_rate": 0.0025,
"loss": 5.522790908813477,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.54,
"memory/max_allocated (GiB)": 18.54,
"ppl": 250.33272,
"step": 229,
"tokens/total": 518144,
"tokens/train_per_sec_per_gpu": 36.31,
"tokens/trainable": 306242
},
{
"epoch": 0.9860664523043944,
"grad_norm": 0.08372899889945984,
"learning_rate": 0.0025,
"loss": 5.140377998352051,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 170.78031,
"step": 230,
"tokens/total": 520000,
"tokens/train_per_sec_per_gpu": 20.62,
"tokens/trainable": 307298
},
{
"epoch": 0.9903536977491961,
"grad_norm": 0.0876646488904953,
"learning_rate": 0.0025,
"loss": 5.655140399932861,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.96,
"memory/max_allocated (GiB)": 18.96,
"ppl": 285.7566,
"step": 231,
"tokens/total": 523456,
"tokens/train_per_sec_per_gpu": 176.78,
"tokens/trainable": 309825
},
{
"epoch": 0.9946409431939979,
"grad_norm": 0.11478639394044876,
"learning_rate": 0.0025,
"loss": 5.731156826019287,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 308.32574,
"step": 232,
"tokens/total": 524928,
"tokens/train_per_sec_per_gpu": 73.31,
"tokens/trainable": 310402
},
{
"epoch": 0.9989281886387996,
"grad_norm": 0.05332854762673378,
"learning_rate": 0.0025,
"loss": 4.958339214324951,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 142.35717,
"step": 233,
"tokens/total": 527040,
"tokens/train_per_sec_per_gpu": 168.24,
"tokens/trainable": 311695
},
{
"epoch": 1.0,
"grad_norm": 0.09713005274534225,
"learning_rate": 0.0025,
"loss": 4.436938285827637,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 84.51578,
"step": 234,
"tokens/total": 527680,
"tokens/train_per_sec_per_gpu": 682.75,
"tokens/trainable": 312130
},
{
"epoch": 1.0042872454448017,
"grad_norm": 0.05925685912370682,
"learning_rate": 0.0025,
"loss": 4.939643859863281,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 139.72048,
"step": 235,
"tokens/total": 529920,
"tokens/train_per_sec_per_gpu": 77.24,
"tokens/trainable": 313388
},
{
"epoch": 1.0085744908896035,
"grad_norm": 0.09591115266084671,
"learning_rate": 0.0025,
"loss": 5.589860916137695,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 267.69838,
"step": 236,
"tokens/total": 531648,
"tokens/train_per_sec_per_gpu": 11.46,
"tokens/trainable": 314241
},
{
"epoch": 1.0085744908896035,
"eval_loss": 5.21028470993042,
"eval_ppl": 183.14619,
"eval_runtime": 17.0244,
"eval_samples_per_second": 12.218,
"eval_steps_per_second": 12.218,
"memory/device_reserved (GiB)": 19.84,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"step": 236
},
{
"epoch": 1.0128617363344052,
"grad_norm": 0.07797813415527344,
"learning_rate": 0.0025,
"loss": 5.490115165710449,
"memory/device_reserved (GiB)": 18.62,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 242.28511,
"step": 237,
"tokens/total": 533760,
"tokens/train_per_sec_per_gpu": 66.73,
"tokens/trainable": 315528
},
{
"epoch": 1.0171489817792068,
"grad_norm": 0.058185361325740814,
"learning_rate": 0.0025,
"loss": 5.693612098693848,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.75,
"memory/max_allocated (GiB)": 18.75,
"ppl": 296.96435,
"step": 238,
"tokens/total": 536832,
"tokens/train_per_sec_per_gpu": 6.22,
"tokens/trainable": 317658
},
{
"epoch": 1.0214362272240085,
"grad_norm": 0.06836100667715073,
"learning_rate": 0.0025,
"loss": 5.170307159423828,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 175.96888,
"step": 239,
"tokens/total": 539648,
"tokens/train_per_sec_per_gpu": 124.58,
"tokens/trainable": 319553
},
{
"epoch": 1.0257234726688103,
"grad_norm": 0.063715860247612,
"learning_rate": 0.0025,
"loss": 5.572962760925293,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 263.21278,
"step": 240,
"tokens/total": 542144,
"tokens/train_per_sec_per_gpu": 16.68,
"tokens/trainable": 321099
},
{
"epoch": 1.030010718113612,
"grad_norm": 0.04595501348376274,
"learning_rate": 0.0025,
"loss": 5.575761795043945,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 263.95056,
"step": 241,
"tokens/total": 544768,
"tokens/train_per_sec_per_gpu": 42.37,
"tokens/trainable": 322797
},
{
"epoch": 1.0342979635584137,
"grad_norm": 0.043023187667131424,
"learning_rate": 0.0025,
"loss": 5.4010138511657715,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 221.631,
"step": 242,
"tokens/total": 547392,
"tokens/train_per_sec_per_gpu": 64.52,
"tokens/trainable": 324497
},
{
"epoch": 1.0385852090032155,
"grad_norm": 0.10789299011230469,
"learning_rate": 0.0025,
"loss": 5.370054244995117,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.39,
"memory/max_allocated (GiB)": 18.39,
"ppl": 214.87452,
"step": 243,
"tokens/total": 548736,
"tokens/train_per_sec_per_gpu": 3.01,
"tokens/trainable": 324950
},
{
"epoch": 1.0428724544480172,
"grad_norm": 0.043591853231191635,
"learning_rate": 0.0025,
"loss": 5.297659397125244,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.59,
"memory/max_allocated (GiB)": 18.59,
"ppl": 199.86845,
"step": 244,
"tokens/total": 551872,
"tokens/train_per_sec_per_gpu": 99.52,
"tokens/trainable": 327123
},
{
"epoch": 1.0471596998928188,
"grad_norm": 0.07090502977371216,
"learning_rate": 0.0025,
"loss": 5.015192985534668,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 150.68521,
"step": 245,
"tokens/total": 554048,
"tokens/train_per_sec_per_gpu": 4.32,
"tokens/trainable": 328432
},
{
"epoch": 1.0514469453376205,
"grad_norm": 0.06152981519699097,
"learning_rate": 0.0025,
"loss": 4.667083740234375,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 106.38704,
"step": 246,
"tokens/total": 555712,
"tokens/train_per_sec_per_gpu": 45.68,
"tokens/trainable": 329309
},
{
"epoch": 1.0557341907824223,
"grad_norm": 0.06200568005442619,
"learning_rate": 0.0025,
"loss": 5.22236967086792,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 185.37294,
"step": 247,
"tokens/total": 557632,
"tokens/train_per_sec_per_gpu": 6.58,
"tokens/trainable": 330316
},
{
"epoch": 1.060021436227224,
"grad_norm": 0.0687415823340416,
"learning_rate": 0.0025,
"loss": 4.802757740020752,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.59,
"memory/max_allocated (GiB)": 18.59,
"ppl": 121.84597,
"step": 248,
"tokens/total": 560320,
"tokens/train_per_sec_per_gpu": 18.48,
"tokens/trainable": 331985
},
{
"epoch": 1.0643086816720257,
"grad_norm": 0.07006296515464783,
"learning_rate": 0.0025,
"loss": 5.4001383781433105,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.4,
"memory/max_allocated (GiB)": 18.4,
"ppl": 221.43706,
"step": 249,
"tokens/total": 562304,
"tokens/train_per_sec_per_gpu": 15.87,
"tokens/trainable": 333042
},
{
"epoch": 1.0685959271168275,
"grad_norm": 0.057536471635103226,
"learning_rate": 0.0025,
"loss": 4.920461654663086,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 137.06588,
"step": 250,
"tokens/total": 564096,
"tokens/train_per_sec_per_gpu": 97.41,
"tokens/trainable": 334009
},
{
"epoch": 1.0728831725616292,
"grad_norm": 0.2622619867324829,
"learning_rate": 0.0025,
"loss": 6.004157543182373,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.38,
"memory/max_allocated (GiB)": 18.38,
"ppl": 405.10956,
"step": 251,
"tokens/total": 565376,
"tokens/train_per_sec_per_gpu": 11.03,
"tokens/trainable": 334450
},
{
"epoch": 1.077170418006431,
"grad_norm": 0.08943215012550354,
"learning_rate": 0.0025,
"loss": 5.477893829345703,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.62,
"memory/max_allocated (GiB)": 18.62,
"ppl": 239.34208,
"step": 252,
"tokens/total": 567744,
"tokens/train_per_sec_per_gpu": 28.99,
"tokens/trainable": 335890
},
{
"epoch": 1.0814576634512325,
"grad_norm": 0.05865178629755974,
"learning_rate": 0.0025,
"loss": 4.780120849609375,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 119.11874,
"step": 253,
"tokens/total": 570240,
"tokens/train_per_sec_per_gpu": 32.08,
"tokens/trainable": 337404
},
{
"epoch": 1.0857449088960343,
"grad_norm": 0.08746153116226196,
"learning_rate": 0.0025,
"loss": 5.729345798492432,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 307.76786,
"step": 254,
"tokens/total": 572416,
"tokens/train_per_sec_per_gpu": 37.32,
"tokens/trainable": 338644
},
{
"epoch": 1.090032154340836,
"grad_norm": 0.0820365622639656,
"learning_rate": 0.0025,
"loss": 4.818498611450195,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 123.77911,
"step": 255,
"tokens/total": 574144,
"tokens/train_per_sec_per_gpu": 204.86,
"tokens/trainable": 339546
},
{
"epoch": 1.0943193997856377,
"grad_norm": 0.05401737242937088,
"learning_rate": 0.0025,
"loss": 4.729460716247559,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.56,
"memory/max_allocated (GiB)": 18.56,
"ppl": 113.23448,
"step": 256,
"tokens/total": 576448,
"tokens/train_per_sec_per_gpu": 62.54,
"tokens/trainable": 340927
},
{
"epoch": 1.0986066452304395,
"grad_norm": 0.08213179558515549,
"learning_rate": 0.0025,
"loss": 4.536296844482422,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 93.34449,
"step": 257,
"tokens/total": 577856,
"tokens/train_per_sec_per_gpu": 25.55,
"tokens/trainable": 341479
},
{
"epoch": 1.1028938906752412,
"grad_norm": 0.09325698018074036,
"learning_rate": 0.0025,
"loss": 6.08807897567749,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.58,
"memory/max_allocated (GiB)": 18.58,
"ppl": 440.57424,
"step": 258,
"tokens/total": 580928,
"tokens/train_per_sec_per_gpu": 30.01,
"tokens/trainable": 343497
},
{
"epoch": 1.107181136120043,
"grad_norm": 0.06686203926801682,
"learning_rate": 0.0025,
"loss": 5.119194984436035,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 167.20072,
"step": 259,
"tokens/total": 583040,
"tokens/train_per_sec_per_gpu": 112.68,
"tokens/trainable": 344661
},
{
"epoch": 1.1114683815648445,
"grad_norm": 0.08321559429168701,
"learning_rate": 0.0025,
"loss": 5.304489612579346,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 201.23827,
"step": 260,
"tokens/total": 585472,
"tokens/train_per_sec_per_gpu": 2.12,
"tokens/trainable": 346087
},
{
"epoch": 1.1157556270096463,
"grad_norm": 0.05973471328616142,
"learning_rate": 0.0025,
"loss": 5.243081092834473,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.42,
"memory/max_allocated (GiB)": 18.42,
"ppl": 189.25231,
"step": 261,
"tokens/total": 587520,
"tokens/train_per_sec_per_gpu": 118.72,
"tokens/trainable": 347244
},
{
"epoch": 1.120042872454448,
"grad_norm": 0.06643401831388474,
"learning_rate": 0.0025,
"loss": 4.716557025909424,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.6,
"memory/max_allocated (GiB)": 18.6,
"ppl": 111.78272,
"step": 262,
"tokens/total": 589504,
"tokens/train_per_sec_per_gpu": 7.82,
"tokens/trainable": 348354
},
{
"epoch": 1.1243301178992497,
"grad_norm": 0.05456831306219101,
"learning_rate": 0.0025,
"loss": 5.202316761016846,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.56,
"memory/max_allocated (GiB)": 18.56,
"ppl": 181.69269,
"step": 263,
"tokens/total": 592448,
"tokens/train_per_sec_per_gpu": 247.8,
"tokens/trainable": 350338
},
{
"epoch": 1.1286173633440515,
"grad_norm": 0.05614905431866646,
"learning_rate": 0.0025,
"loss": 4.99215030670166,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 147.25272,
"step": 264,
"tokens/total": 594432,
"tokens/train_per_sec_per_gpu": 2.18,
"tokens/trainable": 351396
},
{
"epoch": 1.1329046087888532,
"grad_norm": 0.06128396466374397,
"learning_rate": 0.0025,
"loss": 4.919600009918213,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 136.94782,
"step": 265,
"tokens/total": 596864,
"tokens/train_per_sec_per_gpu": 143.33,
"tokens/trainable": 352905
},
{
"epoch": 1.137191854233655,
"grad_norm": 0.09635547548532486,
"learning_rate": 0.0025,
"loss": 4.9075703620910645,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 135.31026,
"step": 266,
"tokens/total": 598784,
"tokens/train_per_sec_per_gpu": 95.18,
"tokens/trainable": 353980
},
{
"epoch": 1.1414790996784565,
"grad_norm": 0.07255875319242477,
"learning_rate": 0.0025,
"loss": 5.526827812194824,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 251.34533,
"step": 267,
"tokens/total": 600704,
"tokens/train_per_sec_per_gpu": 208.93,
"tokens/trainable": 355034
},
{
"epoch": 1.1457663451232583,
"grad_norm": 0.07017937302589417,
"learning_rate": 0.0025,
"loss": 5.108213424682617,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.59,
"memory/max_allocated (GiB)": 18.59,
"ppl": 165.37464,
"step": 268,
"tokens/total": 603136,
"tokens/train_per_sec_per_gpu": 169.95,
"tokens/trainable": 356502
},
{
"epoch": 1.15005359056806,
"grad_norm": 0.05392616242170334,
"learning_rate": 0.0025,
"loss": 4.911001205444336,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 135.77529,
"step": 269,
"tokens/total": 605696,
"tokens/train_per_sec_per_gpu": 98.87,
"tokens/trainable": 358164
},
{
"epoch": 1.1543408360128617,
"grad_norm": 0.06459183990955353,
"learning_rate": 0.0025,
"loss": 5.657564640045166,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.67,
"memory/max_allocated (GiB)": 18.67,
"ppl": 286.45018,
"step": 270,
"tokens/total": 608256,
"tokens/train_per_sec_per_gpu": 34.31,
"tokens/trainable": 359715
},
{
"epoch": 1.1586280814576635,
"grad_norm": 0.05815054103732109,
"learning_rate": 0.0025,
"loss": 5.023680210113525,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.56,
"memory/max_allocated (GiB)": 18.56,
"ppl": 151.96956,
"step": 271,
"tokens/total": 610752,
"tokens/train_per_sec_per_gpu": 256.08,
"tokens/trainable": 361290
},
{
"epoch": 1.1629153269024652,
"grad_norm": 0.08935742825269699,
"learning_rate": 0.0025,
"loss": 5.135861396789551,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 170.0107,
"step": 272,
"tokens/total": 612480,
"tokens/train_per_sec_per_gpu": 3.04,
"tokens/trainable": 362007
},
{
"epoch": 1.167202572347267,
"grad_norm": 0.07162267714738846,
"learning_rate": 0.0025,
"loss": 4.85862922668457,
"memory/device_reserved (GiB)": 19.05,
"memory/max_active (GiB)": 18.4,
"memory/max_allocated (GiB)": 18.4,
"ppl": 128.84746,
"step": 273,
"tokens/total": 614144,
"tokens/train_per_sec_per_gpu": 10.47,
"tokens/trainable": 362762
},
{
"epoch": 1.1714898177920685,
"grad_norm": 0.048264019191265106,
"learning_rate": 0.0025,
"loss": 5.584687232971191,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.85,
"memory/max_allocated (GiB)": 18.85,
"ppl": 266.31697,
"step": 274,
"tokens/total": 617792,
"tokens/train_per_sec_per_gpu": 37.41,
"tokens/trainable": 365433
},
{
"epoch": 1.1757770632368703,
"grad_norm": 0.05624736472964287,
"learning_rate": 0.0025,
"loss": 5.056115627288818,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 156.97956,
"step": 275,
"tokens/total": 619712,
"tokens/train_per_sec_per_gpu": 177.31,
"tokens/trainable": 366496
},
{
"epoch": 1.180064308681672,
"grad_norm": 0.0889284759759903,
"learning_rate": 0.0025,
"loss": 5.753776550292969,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 315.37946,
"step": 276,
"tokens/total": 621248,
"tokens/train_per_sec_per_gpu": 30.13,
"tokens/trainable": 367083
},
{
"epoch": 1.1843515541264737,
"grad_norm": 0.07002771645784378,
"learning_rate": 0.0025,
"loss": 5.0761637687683105,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 160.15847,
"step": 277,
"tokens/total": 623552,
"tokens/train_per_sec_per_gpu": 124.59,
"tokens/trainable": 368541
},
{
"epoch": 1.1886387995712755,
"grad_norm": 0.04624473676085472,
"learning_rate": 0.0025,
"loss": 5.2077860832214355,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.51,
"memory/max_allocated (GiB)": 18.51,
"ppl": 182.68915,
"step": 278,
"tokens/total": 625600,
"tokens/train_per_sec_per_gpu": 20.49,
"tokens/trainable": 369666
},
{
"epoch": 1.1929260450160772,
"grad_norm": 0.08523814380168915,
"learning_rate": 0.0025,
"loss": 4.794674873352051,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 120.86508,
"step": 279,
"tokens/total": 627008,
"tokens/train_per_sec_per_gpu": 6.56,
"tokens/trainable": 370220
},
{
"epoch": 1.197213290460879,
"grad_norm": 0.04043371230363846,
"learning_rate": 0.0025,
"loss": 4.8553667068481445,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.6,
"memory/max_allocated (GiB)": 18.6,
"ppl": 128.42778,
"step": 280,
"tokens/total": 629824,
"tokens/train_per_sec_per_gpu": 277.58,
"tokens/trainable": 372088
},
{
"epoch": 1.2015005359056805,
"grad_norm": 0.05826675891876221,
"learning_rate": 0.0025,
"loss": 5.2880730628967285,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 197.9616,
"step": 281,
"tokens/total": 632128,
"tokens/train_per_sec_per_gpu": 58.45,
"tokens/trainable": 373418
},
{
"epoch": 1.2057877813504823,
"grad_norm": 0.055210962891578674,
"learning_rate": 0.0025,
"loss": 5.282061576843262,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 196.77512,
"step": 282,
"tokens/total": 634880,
"tokens/train_per_sec_per_gpu": 43.45,
"tokens/trainable": 375160
},
{
"epoch": 1.210075026795284,
"grad_norm": 0.05953294038772583,
"learning_rate": 0.0025,
"loss": 4.843015193939209,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 126.85126,
"step": 283,
"tokens/total": 637120,
"tokens/train_per_sec_per_gpu": 11.03,
"tokens/trainable": 376400
},
{
"epoch": 1.2143622722400857,
"grad_norm": 0.09921937435865402,
"learning_rate": 0.0025,
"loss": 5.3062543869018555,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.68,
"memory/max_allocated (GiB)": 18.68,
"ppl": 201.59372,
"step": 284,
"tokens/total": 640192,
"tokens/train_per_sec_per_gpu": 26.81,
"tokens/trainable": 378440
},
{
"epoch": 1.2186495176848875,
"grad_norm": 0.07062297314405441,
"learning_rate": 0.0025,
"loss": 5.3862786293029785,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 218.38916,
"step": 285,
"tokens/total": 642432,
"tokens/train_per_sec_per_gpu": 106.98,
"tokens/trainable": 379692
},
{
"epoch": 1.2229367631296892,
"grad_norm": 0.061749882996082306,
"learning_rate": 0.0025,
"loss": 4.912004470825195,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 135.91157,
"step": 286,
"tokens/total": 644224,
"tokens/train_per_sec_per_gpu": 97.43,
"tokens/trainable": 380455
},
{
"epoch": 1.227224008574491,
"grad_norm": 0.08968321979045868,
"learning_rate": 0.0025,
"loss": 5.269050598144531,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.77,
"memory/max_allocated (GiB)": 18.77,
"ppl": 194.23147,
"step": 287,
"tokens/total": 647488,
"tokens/train_per_sec_per_gpu": 11.93,
"tokens/trainable": 382717
},
{
"epoch": 1.2315112540192925,
"grad_norm": 0.06253078579902649,
"learning_rate": 0.0025,
"loss": 4.804765701293945,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 122.09088,
"step": 288,
"tokens/total": 649216,
"tokens/train_per_sec_per_gpu": 112.0,
"tokens/trainable": 383607
},
{
"epoch": 1.2357984994640943,
"grad_norm": 0.07760690897703171,
"learning_rate": 0.0025,
"loss": 4.299499034881592,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.66,
"memory/max_allocated (GiB)": 18.66,
"ppl": 73.66288,
"step": 289,
"tokens/total": 652160,
"tokens/train_per_sec_per_gpu": 115.01,
"tokens/trainable": 385609
},
{
"epoch": 1.240085744908896,
"grad_norm": 0.05732857063412666,
"learning_rate": 0.0025,
"loss": 4.712733268737793,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 111.35611,
"step": 290,
"tokens/total": 655104,
"tokens/train_per_sec_per_gpu": 26.12,
"tokens/trainable": 387600
},
{
"epoch": 1.2443729903536977,
"grad_norm": 0.08104580640792847,
"learning_rate": 0.0025,
"loss": 5.645717620849609,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 283.07663,
"step": 291,
"tokens/total": 657600,
"tokens/train_per_sec_per_gpu": 134.65,
"tokens/trainable": 389160
},
{
"epoch": 1.2486602357984995,
"grad_norm": 0.07328224182128906,
"learning_rate": 0.0025,
"loss": 4.8165693283081055,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 123.54054,
"step": 292,
"tokens/total": 659520,
"tokens/train_per_sec_per_gpu": 29.84,
"tokens/trainable": 389993
},
{
"epoch": 1.2529474812433012,
"grad_norm": 0.05570969730615616,
"learning_rate": 0.0025,
"loss": 4.818185806274414,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 123.7404,
"step": 293,
"tokens/total": 661824,
"tokens/train_per_sec_per_gpu": 58.03,
"tokens/trainable": 391387
},
{
"epoch": 1.257234726688103,
"grad_norm": 0.06483778357505798,
"learning_rate": 0.0025,
"loss": 4.867541313171387,
"memory/device_reserved (GiB)": 19.21,
"memory/max_active (GiB)": 18.86,
"memory/max_allocated (GiB)": 18.86,
"ppl": 130.00089,
"step": 294,
"tokens/total": 664384,
"tokens/train_per_sec_per_gpu": 58.74,
"tokens/trainable": 393119
},
{
"epoch": 1.2615219721329045,
"grad_norm": 0.06346052885055542,
"learning_rate": 0.0025,
"loss": 4.831150054931641,
"memory/device_reserved (GiB)": 19.34,
"memory/max_active (GiB)": 18.91,
"memory/max_allocated (GiB)": 18.91,
"ppl": 125.35504,
"step": 295,
"tokens/total": 667456,
"tokens/train_per_sec_per_gpu": 105.27,
"tokens/trainable": 395350
},
{
"epoch": 1.2615219721329045,
"eval_loss": 4.9676361083984375,
"eval_ppl": 143.68683,
"eval_runtime": 17.3088,
"eval_samples_per_second": 12.017,
"eval_steps_per_second": 12.017,
"memory/device_reserved (GiB)": 19.34,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"step": 295
},
{
"epoch": 1.2658092175777063,
"grad_norm": 0.06215736269950867,
"learning_rate": 0.0025,
"loss": 5.038169860839844,
"memory/device_reserved (GiB)": 18.81,
"memory/max_active (GiB)": 18.56,
"memory/max_allocated (GiB)": 18.56,
"ppl": 154.18757,
"step": 296,
"tokens/total": 669568,
"tokens/train_per_sec_per_gpu": 85.6,
"tokens/trainable": 396670
},
{
"epoch": 1.270096463022508,
"grad_norm": 0.07183429598808289,
"learning_rate": 0.0025,
"loss": 5.635323524475098,
"memory/device_reserved (GiB)": 18.81,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 280.14954,
"step": 297,
"tokens/total": 671360,
"tokens/train_per_sec_per_gpu": 3.91,
"tokens/trainable": 397597
},
{
"epoch": 1.2743837084673098,
"grad_norm": 0.08423589169979095,
"learning_rate": 0.0025,
"loss": 4.927524089813232,
"memory/device_reserved (GiB)": 18.81,
"memory/max_active (GiB)": 18.39,
"memory/max_allocated (GiB)": 18.39,
"ppl": 138.03732,
"step": 298,
"tokens/total": 672896,
"tokens/train_per_sec_per_gpu": 21.59,
"tokens/trainable": 398137
},
{
"epoch": 1.2786709539121115,
"grad_norm": 0.050887443125247955,
"learning_rate": 0.0025,
"loss": 4.658658981323242,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.67,
"memory/max_allocated (GiB)": 18.67,
"ppl": 105.49452,
"step": 299,
"tokens/total": 675968,
"tokens/train_per_sec_per_gpu": 343.04,
"tokens/trainable": 400317
},
{
"epoch": 1.2829581993569132,
"grad_norm": 0.0761413648724556,
"learning_rate": 0.0025,
"loss": 5.080306529998779,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 160.82335,
"step": 300,
"tokens/total": 678400,
"tokens/train_per_sec_per_gpu": 112.46,
"tokens/trainable": 401891
},
{
"epoch": 1.287245444801715,
"grad_norm": 0.08556882292032242,
"learning_rate": 0.0025,
"loss": 5.198375225067139,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 180.97795,
"step": 301,
"tokens/total": 680576,
"tokens/train_per_sec_per_gpu": 57.45,
"tokens/trainable": 403197
},
{
"epoch": 1.2915326902465165,
"grad_norm": 0.05796833708882332,
"learning_rate": 0.0025,
"loss": 5.415544509887695,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 224.87496,
"step": 302,
"tokens/total": 682688,
"tokens/train_per_sec_per_gpu": 111.88,
"tokens/trainable": 404402
},
{
"epoch": 1.2958199356913183,
"grad_norm": 0.054447028785943985,
"learning_rate": 0.0025,
"loss": 5.261867523193359,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 192.84129,
"step": 303,
"tokens/total": 684864,
"tokens/train_per_sec_per_gpu": 103.18,
"tokens/trainable": 405715
},
{
"epoch": 1.30010718113612,
"grad_norm": 0.060531970113515854,
"learning_rate": 0.0025,
"loss": 5.523534297943115,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 250.51888,
"step": 304,
"tokens/total": 687616,
"tokens/train_per_sec_per_gpu": 106.29,
"tokens/trainable": 407456
},
{
"epoch": 1.3043944265809218,
"grad_norm": 0.050943512469530106,
"learning_rate": 0.0025,
"loss": 5.201202392578125,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 181.49033,
"step": 305,
"tokens/total": 689984,
"tokens/train_per_sec_per_gpu": 146.93,
"tokens/trainable": 408956
},
{
"epoch": 1.3086816720257235,
"grad_norm": 0.0500502735376358,
"learning_rate": 0.0025,
"loss": 4.794355392456055,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.63,
"memory/max_allocated (GiB)": 18.63,
"ppl": 120.82647,
"step": 306,
"tokens/total": 693440,
"tokens/train_per_sec_per_gpu": 298.58,
"tokens/trainable": 411585
},
{
"epoch": 1.3129689174705252,
"grad_norm": 0.055616557598114014,
"learning_rate": 0.0025,
"loss": 5.319107532501221,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 204.20156,
"step": 307,
"tokens/total": 695872,
"tokens/train_per_sec_per_gpu": 88.09,
"tokens/trainable": 413098
},
{
"epoch": 1.317256162915327,
"grad_norm": 0.1226491928100586,
"learning_rate": 0.0025,
"loss": 4.878961563110352,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 131.49404,
"step": 308,
"tokens/total": 697664,
"tokens/train_per_sec_per_gpu": 14.26,
"tokens/trainable": 414049
},
{
"epoch": 1.3215434083601285,
"grad_norm": 0.044849809259176254,
"learning_rate": 0.0025,
"loss": 4.39945650100708,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.4,
"memory/max_allocated (GiB)": 18.4,
"ppl": 81.40661,
"step": 309,
"tokens/total": 699456,
"tokens/train_per_sec_per_gpu": 35.77,
"tokens/trainable": 414939
},
{
"epoch": 1.3258306538049303,
"grad_norm": 0.10974457114934921,
"learning_rate": 0.0025,
"loss": 4.777448654174805,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 118.80086,
"step": 310,
"tokens/total": 701312,
"tokens/train_per_sec_per_gpu": 35.79,
"tokens/trainable": 415891
},
{
"epoch": 1.330117899249732,
"grad_norm": 0.06229991093277931,
"learning_rate": 0.0025,
"loss": 4.938532829284668,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 139.56533,
"step": 311,
"tokens/total": 703744,
"tokens/train_per_sec_per_gpu": 57.07,
"tokens/trainable": 417446
},
{
"epoch": 1.3344051446945338,
"grad_norm": 0.06946682929992676,
"learning_rate": 0.0025,
"loss": 5.452759265899658,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 233.40129,
"step": 312,
"tokens/total": 706112,
"tokens/train_per_sec_per_gpu": 213.08,
"tokens/trainable": 418846
},
{
"epoch": 1.3386923901393355,
"grad_norm": 0.0820111483335495,
"learning_rate": 0.0025,
"loss": 4.500866413116455,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.42,
"memory/max_allocated (GiB)": 18.42,
"ppl": 90.09516,
"step": 313,
"tokens/total": 707520,
"tokens/train_per_sec_per_gpu": 99.78,
"tokens/trainable": 419352
},
{
"epoch": 1.3429796355841372,
"grad_norm": 0.16722634434700012,
"learning_rate": 0.0025,
"loss": 4.281378269195557,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.59,
"memory/max_allocated (GiB)": 18.59,
"ppl": 72.34008,
"step": 314,
"tokens/total": 709440,
"tokens/train_per_sec_per_gpu": 15.86,
"tokens/trainable": 420321
},
{
"epoch": 1.347266881028939,
"grad_norm": 0.06021692231297493,
"learning_rate": 0.0025,
"loss": 5.971747875213623,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.56,
"memory/max_allocated (GiB)": 18.56,
"ppl": 392.19057,
"step": 315,
"tokens/total": 711936,
"tokens/train_per_sec_per_gpu": 77.5,
"tokens/trainable": 421997
},
{
"epoch": 1.3515541264737405,
"grad_norm": 0.048991329967975616,
"learning_rate": 0.0025,
"loss": 5.36166524887085,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 213.07948,
"step": 316,
"tokens/total": 714432,
"tokens/train_per_sec_per_gpu": 144.42,
"tokens/trainable": 423421
},
{
"epoch": 1.3558413719185423,
"grad_norm": 0.11430433392524719,
"learning_rate": 0.0025,
"loss": 4.600022792816162,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.36,
"memory/max_allocated (GiB)": 18.36,
"ppl": 99.48658,
"step": 317,
"tokens/total": 715648,
"tokens/train_per_sec_per_gpu": 0.87,
"tokens/trainable": 423767
},
{
"epoch": 1.360128617363344,
"grad_norm": 0.08438309282064438,
"learning_rate": 0.0025,
"loss": 5.151257038116455,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.56,
"memory/max_allocated (GiB)": 18.56,
"ppl": 172.64838,
"step": 318,
"tokens/total": 718272,
"tokens/train_per_sec_per_gpu": 259.31,
"tokens/trainable": 425467
},
{
"epoch": 1.3644158628081458,
"grad_norm": 0.056045304983854294,
"learning_rate": 0.0025,
"loss": 4.737667560577393,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 114.1676,
"step": 319,
"tokens/total": 720384,
"tokens/train_per_sec_per_gpu": 30.87,
"tokens/trainable": 426694
},
{
"epoch": 1.3687031082529475,
"grad_norm": 0.06773889809846878,
"learning_rate": 0.0025,
"loss": 5.229560852050781,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.57,
"memory/max_allocated (GiB)": 18.57,
"ppl": 186.71079,
"step": 320,
"tokens/total": 722688,
"tokens/train_per_sec_per_gpu": 10.25,
"tokens/trainable": 428099
},
{
"epoch": 1.3729903536977492,
"grad_norm": 0.06447850167751312,
"learning_rate": 0.0025,
"loss": 5.048516750335693,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.42,
"memory/max_allocated (GiB)": 18.42,
"ppl": 155.79122,
"step": 321,
"tokens/total": 724736,
"tokens/train_per_sec_per_gpu": 14.38,
"tokens/trainable": 429136
},
{
"epoch": 1.377277599142551,
"grad_norm": 0.07881579548120499,
"learning_rate": 0.0025,
"loss": 5.187434673309326,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.57,
"memory/max_allocated (GiB)": 18.57,
"ppl": 179.00875,
"step": 322,
"tokens/total": 726784,
"tokens/train_per_sec_per_gpu": 25.14,
"tokens/trainable": 430417
},
{
"epoch": 1.3815648445873525,
"grad_norm": 0.06841576844453812,
"learning_rate": 0.0025,
"loss": 5.470486640930176,
"memory/device_reserved (GiB)": 19.0,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 237.57578,
"step": 323,
"tokens/total": 728512,
"tokens/train_per_sec_per_gpu": 156.65,
"tokens/trainable": 431369
},
{
"epoch": 1.3858520900321543,
"grad_norm": 0.11785981059074402,
"learning_rate": 0.0025,
"loss": 4.845800399780273,
"memory/device_reserved (GiB)": 19.14,
"memory/max_active (GiB)": 18.69,
"memory/max_allocated (GiB)": 18.69,
"ppl": 127.20506,
"step": 324,
"tokens/total": 731200,
"tokens/train_per_sec_per_gpu": 6.44,
"tokens/trainable": 433078
},
{
"epoch": 1.390139335476956,
"grad_norm": 0.07563474774360657,
"learning_rate": 0.0025,
"loss": 4.149139404296875,
"memory/device_reserved (GiB)": 19.14,
"memory/max_active (GiB)": 18.56,
"memory/max_allocated (GiB)": 18.56,
"ppl": 63.37943,
"step": 325,
"tokens/total": 733504,
"tokens/train_per_sec_per_gpu": 77.55,
"tokens/trainable": 434444
},
{
"epoch": 1.3944265809217578,
"grad_norm": 0.07862015813589096,
"learning_rate": 0.0025,
"loss": 4.405404090881348,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 19.09,
"memory/max_allocated (GiB)": 19.09,
"ppl": 81.89223,
"step": 326,
"tokens/total": 737088,
"tokens/train_per_sec_per_gpu": 155.85,
"tokens/trainable": 437043
},
{
"epoch": 1.3987138263665595,
"grad_norm": 0.07842207700014114,
"learning_rate": 0.0025,
"loss": 5.559260368347168,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 259.63073,
"step": 327,
"tokens/total": 738880,
"tokens/train_per_sec_per_gpu": 24.68,
"tokens/trainable": 437857
},
{
"epoch": 1.4030010718113612,
"grad_norm": 0.0685216560959816,
"learning_rate": 0.0025,
"loss": 5.114203453063965,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 166.36821,
"step": 328,
"tokens/total": 740928,
"tokens/train_per_sec_per_gpu": 8.77,
"tokens/trainable": 439031
},
{
"epoch": 1.407288317256163,
"grad_norm": 0.07267401367425919,
"learning_rate": 0.0025,
"loss": 4.98257303237915,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 145.84917,
"step": 329,
"tokens/total": 742848,
"tokens/train_per_sec_per_gpu": 6.98,
"tokens/trainable": 440007
},
{
"epoch": 1.4115755627009645,
"grad_norm": 0.0542726069688797,
"learning_rate": 0.0025,
"loss": 4.922747611999512,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.51,
"memory/max_allocated (GiB)": 18.51,
"ppl": 137.37956,
"step": 330,
"tokens/total": 744960,
"tokens/train_per_sec_per_gpu": 109.51,
"tokens/trainable": 441203
},
{
"epoch": 1.4158628081457665,
"grad_norm": 0.06696043908596039,
"learning_rate": 0.0025,
"loss": 4.691984176635742,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 109.06938,
"step": 331,
"tokens/total": 747136,
"tokens/train_per_sec_per_gpu": 129.1,
"tokens/trainable": 442561
},
{
"epoch": 1.420150053590568,
"grad_norm": 0.06947220861911774,
"learning_rate": 0.0025,
"loss": 4.651721000671387,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 104.76513,
"step": 332,
"tokens/total": 748800,
"tokens/train_per_sec_per_gpu": 85.45,
"tokens/trainable": 443290
},
{
"epoch": 1.4244372990353698,
"grad_norm": 0.06656062602996826,
"learning_rate": 0.0025,
"loss": 5.165863990783691,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 175.18875,
"step": 333,
"tokens/total": 751296,
"tokens/train_per_sec_per_gpu": 130.0,
"tokens/trainable": 444808
},
{
"epoch": 1.4287245444801715,
"grad_norm": 0.0503920316696167,
"learning_rate": 0.0025,
"loss": 5.2461981773376465,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.9,
"memory/max_allocated (GiB)": 18.9,
"ppl": 189.84314,
"step": 334,
"tokens/total": 755072,
"tokens/train_per_sec_per_gpu": 87.86,
"tokens/trainable": 447741
},
{
"epoch": 1.4330117899249732,
"grad_norm": 0.06683284789323807,
"learning_rate": 0.0025,
"loss": 4.544827461242676,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 94.14418,
"step": 335,
"tokens/total": 756544,
"tokens/train_per_sec_per_gpu": 5.23,
"tokens/trainable": 448356
},
{
"epoch": 1.437299035369775,
"grad_norm": 0.06916282325983047,
"learning_rate": 0.0025,
"loss": 5.005459308624268,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.51,
"memory/max_allocated (GiB)": 18.51,
"ppl": 149.22561,
"step": 336,
"tokens/total": 759424,
"tokens/train_per_sec_per_gpu": 82.24,
"tokens/trainable": 450248
},
{
"epoch": 1.4415862808145765,
"grad_norm": 0.0864240899682045,
"learning_rate": 0.0025,
"loss": 4.898914813995361,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.42,
"memory/max_allocated (GiB)": 18.42,
"ppl": 134.14413,
"step": 337,
"tokens/total": 761408,
"tokens/train_per_sec_per_gpu": 25.71,
"tokens/trainable": 451300
},
{
"epoch": 1.4458735262593785,
"grad_norm": 0.05566547438502312,
"learning_rate": 0.0025,
"loss": 5.002068996429443,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 148.72054,
"step": 338,
"tokens/total": 763904,
"tokens/train_per_sec_per_gpu": 2.62,
"tokens/trainable": 452916
},
{
"epoch": 1.45016077170418,
"grad_norm": 0.05582151934504509,
"learning_rate": 0.0025,
"loss": 5.314091682434082,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 203.17988,
"step": 339,
"tokens/total": 766208,
"tokens/train_per_sec_per_gpu": 107.08,
"tokens/trainable": 454272
},
{
"epoch": 1.4544480171489818,
"grad_norm": 0.05226564779877663,
"learning_rate": 0.0025,
"loss": 4.944754600524902,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.42,
"memory/max_allocated (GiB)": 18.42,
"ppl": 140.43638,
"step": 340,
"tokens/total": 768064,
"tokens/train_per_sec_per_gpu": 112.77,
"tokens/trainable": 455346
},
{
"epoch": 1.4587352625937835,
"grad_norm": 0.06044873222708702,
"learning_rate": 0.0025,
"loss": 4.841938018798828,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.4,
"memory/max_allocated (GiB)": 18.4,
"ppl": 126.71469,
"step": 341,
"tokens/total": 769664,
"tokens/train_per_sec_per_gpu": 62.41,
"tokens/trainable": 456009
},
{
"epoch": 1.4630225080385852,
"grad_norm": 0.04582054540514946,
"learning_rate": 0.0025,
"loss": 5.606667518615723,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 272.2355,
"step": 342,
"tokens/total": 772224,
"tokens/train_per_sec_per_gpu": 167.3,
"tokens/trainable": 457672
},
{
"epoch": 1.467309753483387,
"grad_norm": 0.052784983068704605,
"learning_rate": 0.0025,
"loss": 4.380001068115234,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 79.83812,
"step": 343,
"tokens/total": 774592,
"tokens/train_per_sec_per_gpu": 64.87,
"tokens/trainable": 459004
},
{
"epoch": 1.4715969989281885,
"grad_norm": 0.04827815666794777,
"learning_rate": 0.0025,
"loss": 4.817045211791992,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 123.59934,
"step": 344,
"tokens/total": 777280,
"tokens/train_per_sec_per_gpu": 39.6,
"tokens/trainable": 460657
},
{
"epoch": 1.4758842443729905,
"grad_norm": 0.07826294749975204,
"learning_rate": 0.0025,
"loss": 3.8754422664642334,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.39,
"memory/max_allocated (GiB)": 18.39,
"ppl": 48.20401,
"step": 345,
"tokens/total": 778496,
"tokens/train_per_sec_per_gpu": 23.47,
"tokens/trainable": 461041
},
{
"epoch": 1.480171489817792,
"grad_norm": 0.04768767207860947,
"learning_rate": 0.0025,
"loss": 5.038158893585205,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 154.18588,
"step": 346,
"tokens/total": 780992,
"tokens/train_per_sec_per_gpu": 78.54,
"tokens/trainable": 462567
},
{
"epoch": 1.4844587352625938,
"grad_norm": 0.2351859211921692,
"learning_rate": 0.0025,
"loss": 5.444571018218994,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 231.49795,
"step": 347,
"tokens/total": 783616,
"tokens/train_per_sec_per_gpu": 168.35,
"tokens/trainable": 464210
},
{
"epoch": 1.4887459807073955,
"grad_norm": 0.04593876376748085,
"learning_rate": 0.0025,
"loss": 5.326495170593262,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 205.71571,
"step": 348,
"tokens/total": 786304,
"tokens/train_per_sec_per_gpu": 140.75,
"tokens/trainable": 466015
},
{
"epoch": 1.4930332261521972,
"grad_norm": 0.06556063890457153,
"learning_rate": 0.0025,
"loss": 5.152454376220703,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 172.85522,
"step": 349,
"tokens/total": 788480,
"tokens/train_per_sec_per_gpu": 224.08,
"tokens/trainable": 467198
},
{
"epoch": 1.497320471596999,
"grad_norm": 0.06161191314458847,
"learning_rate": 0.0025,
"loss": 5.141845226287842,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 171.03107,
"step": 350,
"tokens/total": 790336,
"tokens/train_per_sec_per_gpu": 74.32,
"tokens/trainable": 468199
},
{
"epoch": 1.5016077170418005,
"grad_norm": 0.28069961071014404,
"learning_rate": 0.0025,
"loss": 5.9613165855407715,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 388.12078,
"step": 351,
"tokens/total": 792320,
"tokens/train_per_sec_per_gpu": 10.1,
"tokens/trainable": 469192
},
{
"epoch": 1.5058949624866025,
"grad_norm": 0.09382814168930054,
"learning_rate": 0.0025,
"loss": 5.591994762420654,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 268.27022,
"step": 352,
"tokens/total": 794816,
"tokens/train_per_sec_per_gpu": 35.14,
"tokens/trainable": 470744
},
{
"epoch": 1.510182207931404,
"grad_norm": 0.06651383638381958,
"learning_rate": 0.0025,
"loss": 4.758882522583008,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.4,
"memory/max_allocated (GiB)": 18.4,
"ppl": 116.61554,
"step": 353,
"tokens/total": 796800,
"tokens/train_per_sec_per_gpu": 46.01,
"tokens/trainable": 471734
},
{
"epoch": 1.5144694533762058,
"grad_norm": 0.07342278957366943,
"learning_rate": 0.0025,
"loss": 5.06705379486084,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.4,
"memory/max_allocated (GiB)": 18.4,
"ppl": 158.70606,
"step": 354,
"tokens/total": 798400,
"tokens/train_per_sec_per_gpu": 46.96,
"tokens/trainable": 472368
},
{
"epoch": 1.5144694533762058,
"eval_loss": 4.888035297393799,
"eval_ppl": 132.69262,
"eval_runtime": 17.094,
"eval_samples_per_second": 12.168,
"eval_steps_per_second": 12.168,
"memory/device_reserved (GiB)": 19.64,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"step": 354
},
{
"epoch": 1.5187566988210075,
"grad_norm": 0.05180167779326439,
"learning_rate": 0.0025,
"loss": 4.341151714324951,
"memory/device_reserved (GiB)": 18.71,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 76.79594,
"step": 355,
"tokens/total": 801024,
"tokens/train_per_sec_per_gpu": 18.76,
"tokens/trainable": 474170
},
{
"epoch": 1.5230439442658092,
"grad_norm": 0.05290725454688072,
"learning_rate": 0.0025,
"loss": 4.733070373535156,
"memory/device_reserved (GiB)": 18.74,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 113.64396,
"step": 356,
"tokens/total": 803072,
"tokens/train_per_sec_per_gpu": 10.8,
"tokens/trainable": 475255
},
{
"epoch": 1.527331189710611,
"grad_norm": 0.08694328367710114,
"learning_rate": 0.0025,
"loss": 4.449097156524658,
"memory/device_reserved (GiB)": 18.74,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 85.54967,
"step": 357,
"tokens/total": 804864,
"tokens/train_per_sec_per_gpu": 36.39,
"tokens/trainable": 476172
},
{
"epoch": 1.5316184351554125,
"grad_norm": 0.059812407940626144,
"learning_rate": 0.0025,
"loss": 5.0324225425720215,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 19.55,
"memory/max_allocated (GiB)": 19.55,
"ppl": 153.30395,
"step": 358,
"tokens/total": 809344,
"tokens/train_per_sec_per_gpu": 62.57,
"tokens/trainable": 479719
},
{
"epoch": 1.5359056806002145,
"grad_norm": 0.05654964968562126,
"learning_rate": 0.0025,
"loss": 5.156862735748291,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 19.12,
"memory/max_allocated (GiB)": 19.12,
"ppl": 173.61891,
"step": 359,
"tokens/total": 813440,
"tokens/train_per_sec_per_gpu": 30.77,
"tokens/trainable": 482873
},
{
"epoch": 1.540192926045016,
"grad_norm": 0.04540662467479706,
"learning_rate": 0.0025,
"loss": 4.702476501464844,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 110.21979,
"step": 360,
"tokens/total": 815424,
"tokens/train_per_sec_per_gpu": 10.68,
"tokens/trainable": 483879
},
{
"epoch": 1.5444801714898178,
"grad_norm": 0.08061626553535461,
"learning_rate": 0.0025,
"loss": 4.724915504455566,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 19.5,
"memory/max_allocated (GiB)": 19.5,
"ppl": 112.72097,
"step": 361,
"tokens/total": 819520,
"tokens/train_per_sec_per_gpu": 7.14,
"tokens/trainable": 486988
},
{
"epoch": 1.5487674169346195,
"grad_norm": 0.047675181180238724,
"learning_rate": 0.0025,
"loss": 4.952810764312744,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 141.57233,
"step": 362,
"tokens/total": 822528,
"tokens/train_per_sec_per_gpu": 188.99,
"tokens/trainable": 489093
},
{
"epoch": 1.5530546623794212,
"grad_norm": 0.0705411285161972,
"learning_rate": 0.0025,
"loss": 4.235616683959961,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.38,
"memory/max_allocated (GiB)": 18.38,
"ppl": 69.10428,
"step": 363,
"tokens/total": 824000,
"tokens/train_per_sec_per_gpu": 40.32,
"tokens/trainable": 489788
},
{
"epoch": 1.557341907824223,
"grad_norm": 0.0532984621822834,
"learning_rate": 0.0025,
"loss": 4.592817306518555,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.46,
"memory/max_allocated (GiB)": 18.46,
"ppl": 98.77231,
"step": 364,
"tokens/total": 825984,
"tokens/train_per_sec_per_gpu": 126.04,
"tokens/trainable": 490775
},
{
"epoch": 1.5616291532690245,
"grad_norm": 0.05280710384249687,
"learning_rate": 0.0025,
"loss": 5.201740264892578,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 181.58798,
"step": 365,
"tokens/total": 828096,
"tokens/train_per_sec_per_gpu": 54.56,
"tokens/trainable": 492005
},
{
"epoch": 1.5659163987138265,
"grad_norm": 0.051445771008729935,
"learning_rate": 0.0025,
"loss": 4.719823360443115,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 112.14844,
"step": 366,
"tokens/total": 830720,
"tokens/train_per_sec_per_gpu": 197.49,
"tokens/trainable": 493565
},
{
"epoch": 1.570203644158628,
"grad_norm": 0.05346055328845978,
"learning_rate": 0.0025,
"loss": 4.971454620361328,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 144.23654,
"step": 367,
"tokens/total": 833216,
"tokens/train_per_sec_per_gpu": 12.08,
"tokens/trainable": 495164
},
{
"epoch": 1.5744908896034298,
"grad_norm": 0.0699196457862854,
"learning_rate": 0.0025,
"loss": 4.878042221069336,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 131.37321,
"step": 368,
"tokens/total": 835520,
"tokens/train_per_sec_per_gpu": 10.86,
"tokens/trainable": 496515
},
{
"epoch": 1.5787781350482315,
"grad_norm": 0.06464764475822449,
"learning_rate": 0.0025,
"loss": 4.946186542510986,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.51,
"memory/max_allocated (GiB)": 18.51,
"ppl": 140.63762,
"step": 369,
"tokens/total": 837696,
"tokens/train_per_sec_per_gpu": 199.16,
"tokens/trainable": 497713
},
{
"epoch": 1.5830653804930332,
"grad_norm": 0.08636437356472015,
"learning_rate": 0.0025,
"loss": 4.280492305755615,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.39,
"memory/max_allocated (GiB)": 18.39,
"ppl": 72.27601,
"step": 370,
"tokens/total": 839168,
"tokens/train_per_sec_per_gpu": 71.64,
"tokens/trainable": 498229
},
{
"epoch": 1.587352625937835,
"grad_norm": 0.047297775745391846,
"learning_rate": 0.0025,
"loss": 5.235410213470459,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.51,
"memory/max_allocated (GiB)": 18.51,
"ppl": 187.80613,
"step": 371,
"tokens/total": 842112,
"tokens/train_per_sec_per_gpu": 170.73,
"tokens/trainable": 500237
},
{
"epoch": 1.5916398713826365,
"grad_norm": 0.07992982119321823,
"learning_rate": 0.0025,
"loss": 4.507941246032715,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.39,
"memory/max_allocated (GiB)": 18.39,
"ppl": 90.73483,
"step": 372,
"tokens/total": 843584,
"tokens/train_per_sec_per_gpu": 11.01,
"tokens/trainable": 500839
},
{
"epoch": 1.5959271168274385,
"grad_norm": 0.06259352713823318,
"learning_rate": 0.0025,
"loss": 4.919366836547852,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 136.9159,
"step": 373,
"tokens/total": 845504,
"tokens/train_per_sec_per_gpu": 41.51,
"tokens/trainable": 501881
},
{
"epoch": 1.60021436227224,
"grad_norm": 0.08243437856435776,
"learning_rate": 0.0025,
"loss": 5.151687145233154,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.58,
"memory/max_allocated (GiB)": 18.58,
"ppl": 172.72265,
"step": 374,
"tokens/total": 848768,
"tokens/train_per_sec_per_gpu": 261.87,
"tokens/trainable": 504139
},
{
"epoch": 1.6045016077170418,
"grad_norm": 0.08473316580057144,
"learning_rate": 0.0025,
"loss": 5.018991470336914,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.51,
"memory/max_allocated (GiB)": 18.51,
"ppl": 151.25868,
"step": 375,
"tokens/total": 851136,
"tokens/train_per_sec_per_gpu": 4.3,
"tokens/trainable": 505536
},
{
"epoch": 1.6087888531618435,
"grad_norm": 0.04689257964491844,
"learning_rate": 0.0025,
"loss": 4.897285461425781,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 133.92574,
"step": 376,
"tokens/total": 853504,
"tokens/train_per_sec_per_gpu": 29.13,
"tokens/trainable": 507033
},
{
"epoch": 1.6130760986066452,
"grad_norm": 0.058138296008110046,
"learning_rate": 0.0025,
"loss": 4.437278747558594,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.39,
"memory/max_allocated (GiB)": 18.39,
"ppl": 84.54456,
"step": 377,
"tokens/total": 855040,
"tokens/train_per_sec_per_gpu": 60.77,
"tokens/trainable": 507780
},
{
"epoch": 1.617363344051447,
"grad_norm": 0.07955910265445709,
"learning_rate": 0.0025,
"loss": 4.341729640960693,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.53,
"memory/max_allocated (GiB)": 18.53,
"ppl": 76.84033,
"step": 378,
"tokens/total": 857024,
"tokens/train_per_sec_per_gpu": 1.3,
"tokens/trainable": 508911
},
{
"epoch": 1.6216505894962485,
"grad_norm": 0.057746682316064835,
"learning_rate": 0.0025,
"loss": 3.7874722480773926,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 44.14467,
"step": 379,
"tokens/total": 858752,
"tokens/train_per_sec_per_gpu": 27.9,
"tokens/trainable": 509683
},
{
"epoch": 1.6259378349410505,
"grad_norm": 0.07005178928375244,
"learning_rate": 0.0025,
"loss": 5.148179531097412,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 172.11787,
"step": 380,
"tokens/total": 860928,
"tokens/train_per_sec_per_gpu": 56.8,
"tokens/trainable": 510959
},
{
"epoch": 1.630225080385852,
"grad_norm": 0.04911843314766884,
"learning_rate": 0.0025,
"loss": 4.848702430725098,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 127.57475,
"step": 381,
"tokens/total": 862976,
"tokens/train_per_sec_per_gpu": 93.02,
"tokens/trainable": 512078
},
{
"epoch": 1.6345123258306538,
"grad_norm": 0.07803714275360107,
"learning_rate": 0.0025,
"loss": 4.658711910247803,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 105.5001,
"step": 382,
"tokens/total": 865088,
"tokens/train_per_sec_per_gpu": 67.71,
"tokens/trainable": 513209
},
{
"epoch": 1.6387995712754555,
"grad_norm": 0.039557769894599915,
"learning_rate": 0.0025,
"loss": 4.664157867431641,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 106.07622,
"step": 383,
"tokens/total": 867520,
"tokens/train_per_sec_per_gpu": 144.15,
"tokens/trainable": 514700
},
{
"epoch": 1.6430868167202572,
"grad_norm": 0.047754038125276566,
"learning_rate": 0.0025,
"loss": 4.329286575317383,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.64,
"memory/max_allocated (GiB)": 18.64,
"ppl": 75.89013,
"step": 384,
"tokens/total": 869632,
"tokens/train_per_sec_per_gpu": 32.67,
"tokens/trainable": 516045
},
{
"epoch": 1.647374062165059,
"grad_norm": 0.0593026764690876,
"learning_rate": 0.0025,
"loss": 5.5122809410095215,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.55,
"memory/max_allocated (GiB)": 18.55,
"ppl": 247.71551,
"step": 385,
"tokens/total": 872128,
"tokens/train_per_sec_per_gpu": 6.48,
"tokens/trainable": 517536
},
{
"epoch": 1.6516613076098605,
"grad_norm": 0.0731717124581337,
"learning_rate": 0.0025,
"loss": 4.586673736572266,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.51,
"memory/max_allocated (GiB)": 18.51,
"ppl": 98.16736,
"step": 386,
"tokens/total": 874240,
"tokens/train_per_sec_per_gpu": 15.25,
"tokens/trainable": 518738
},
{
"epoch": 1.6559485530546625,
"grad_norm": 0.049605630338191986,
"learning_rate": 0.0025,
"loss": 4.679656982421875,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 107.73311,
"step": 387,
"tokens/total": 876288,
"tokens/train_per_sec_per_gpu": 135.43,
"tokens/trainable": 519876
},
{
"epoch": 1.660235798499464,
"grad_norm": 0.05317911505699158,
"learning_rate": 0.0025,
"loss": 4.681480407714844,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.73,
"memory/max_allocated (GiB)": 18.73,
"ppl": 107.92973,
"step": 388,
"tokens/total": 879360,
"tokens/train_per_sec_per_gpu": 368.87,
"tokens/trainable": 521966
},
{
"epoch": 1.6645230439442658,
"grad_norm": 0.060831792652606964,
"learning_rate": 0.0025,
"loss": 5.22551155090332,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 185.95627,
"step": 389,
"tokens/total": 881664,
"tokens/train_per_sec_per_gpu": 7.84,
"tokens/trainable": 523262
},
{
"epoch": 1.6688102893890675,
"grad_norm": 0.10634256899356842,
"learning_rate": 0.0025,
"loss": 5.569226264953613,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 262.23112,
"step": 390,
"tokens/total": 883200,
"tokens/train_per_sec_per_gpu": 7.0,
"tokens/trainable": 523927
},
{
"epoch": 1.6730975348338692,
"grad_norm": 0.04912353307008743,
"learning_rate": 0.0025,
"loss": 4.896402359008789,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 133.80752,
"step": 391,
"tokens/total": 885376,
"tokens/train_per_sec_per_gpu": 131.57,
"tokens/trainable": 525245
},
{
"epoch": 1.677384780278671,
"grad_norm": 0.051567140966653824,
"learning_rate": 0.0025,
"loss": 4.537640571594238,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 93.47,
"step": 392,
"tokens/total": 887680,
"tokens/train_per_sec_per_gpu": 23.66,
"tokens/trainable": 526583
},
{
"epoch": 1.6816720257234725,
"grad_norm": 0.05488206073641777,
"learning_rate": 0.0025,
"loss": 4.715126037597656,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 111.62288,
"step": 393,
"tokens/total": 890304,
"tokens/train_per_sec_per_gpu": 140.1,
"tokens/trainable": 528304
},
{
"epoch": 1.6859592711682745,
"grad_norm": 0.04531604424118996,
"learning_rate": 0.0025,
"loss": 4.594015121459961,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 98.89069,
"step": 394,
"tokens/total": 892416,
"tokens/train_per_sec_per_gpu": 63.67,
"tokens/trainable": 529449
},
{
"epoch": 1.690246516613076,
"grad_norm": 0.054829467087984085,
"learning_rate": 0.0025,
"loss": 4.549041271209717,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.4,
"memory/max_allocated (GiB)": 18.4,
"ppl": 94.54172,
"step": 395,
"tokens/total": 894144,
"tokens/train_per_sec_per_gpu": 30.68,
"tokens/trainable": 530299
},
{
"epoch": 1.694533762057878,
"grad_norm": 0.053975410759449005,
"learning_rate": 0.0025,
"loss": 4.356207370758057,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.59,
"memory/max_allocated (GiB)": 18.59,
"ppl": 77.9609,
"step": 396,
"tokens/total": 896192,
"tokens/train_per_sec_per_gpu": 23.37,
"tokens/trainable": 531383
},
{
"epoch": 1.6988210075026795,
"grad_norm": 0.06466397643089294,
"learning_rate": 0.0025,
"loss": 4.832691192626953,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 125.54838,
"step": 397,
"tokens/total": 898368,
"tokens/train_per_sec_per_gpu": 28.95,
"tokens/trainable": 532533
},
{
"epoch": 1.7031082529474812,
"grad_norm": 0.0506359338760376,
"learning_rate": 0.0025,
"loss": 4.359593868255615,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 78.22536,
"step": 398,
"tokens/total": 900736,
"tokens/train_per_sec_per_gpu": 7.39,
"tokens/trainable": 533957
},
{
"epoch": 1.707395498392283,
"grad_norm": 0.06138148903846741,
"learning_rate": 0.0025,
"loss": 4.625148296356201,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 102.0179,
"step": 399,
"tokens/total": 902592,
"tokens/train_per_sec_per_gpu": 20.91,
"tokens/trainable": 534928
},
{
"epoch": 1.7116827438370845,
"grad_norm": 0.047848962247371674,
"learning_rate": 0.0025,
"loss": 4.433683395385742,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.51,
"memory/max_allocated (GiB)": 18.51,
"ppl": 84.24114,
"step": 400,
"tokens/total": 904448,
"tokens/train_per_sec_per_gpu": 18.74,
"tokens/trainable": 535890
},
{
"epoch": 1.7159699892818865,
"grad_norm": 0.06819909065961838,
"learning_rate": 0.0025,
"loss": 4.891788482666016,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 133.19157,
"step": 401,
"tokens/total": 906048,
"tokens/train_per_sec_per_gpu": 52.8,
"tokens/trainable": 536499
},
{
"epoch": 1.720257234726688,
"grad_norm": 0.04256964474916458,
"learning_rate": 0.0025,
"loss": 4.25316858291626,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 70.3279,
"step": 402,
"tokens/total": 907968,
"tokens/train_per_sec_per_gpu": 121.28,
"tokens/trainable": 537519
},
{
"epoch": 1.72454448017149,
"grad_norm": 0.04542100802063942,
"learning_rate": 0.0025,
"loss": 5.214837551116943,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.52,
"memory/max_allocated (GiB)": 18.52,
"ppl": 183.98193,
"step": 403,
"tokens/total": 910848,
"tokens/train_per_sec_per_gpu": 233.51,
"tokens/trainable": 539428
},
{
"epoch": 1.7288317256162915,
"grad_norm": 0.04585760459303856,
"learning_rate": 0.0025,
"loss": 4.585484027862549,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.53,
"memory/max_allocated (GiB)": 18.53,
"ppl": 98.05063,
"step": 404,
"tokens/total": 913536,
"tokens/train_per_sec_per_gpu": 183.23,
"tokens/trainable": 541209
},
{
"epoch": 1.7331189710610932,
"grad_norm": 0.0482996366918087,
"learning_rate": 0.0025,
"loss": 4.8759870529174805,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.63,
"memory/max_allocated (GiB)": 18.63,
"ppl": 131.1035,
"step": 405,
"tokens/total": 916032,
"tokens/train_per_sec_per_gpu": 65.34,
"tokens/trainable": 542856
},
{
"epoch": 1.737406216505895,
"grad_norm": 0.04029145836830139,
"learning_rate": 0.0025,
"loss": 4.391783714294434,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.4,
"memory/max_allocated (GiB)": 18.4,
"ppl": 80.78439,
"step": 406,
"tokens/total": 917824,
"tokens/train_per_sec_per_gpu": 6.99,
"tokens/trainable": 543743
},
{
"epoch": 1.7416934619506965,
"grad_norm": 0.03549795225262642,
"learning_rate": 0.0025,
"loss": 4.487819671630859,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 88.92734,
"step": 407,
"tokens/total": 920192,
"tokens/train_per_sec_per_gpu": 142.0,
"tokens/trainable": 545102
},
{
"epoch": 1.7459807073954985,
"grad_norm": 0.05987889692187309,
"learning_rate": 0.0025,
"loss": 4.304838180541992,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.39,
"memory/max_allocated (GiB)": 18.39,
"ppl": 74.05723,
"step": 408,
"tokens/total": 921920,
"tokens/train_per_sec_per_gpu": 80.61,
"tokens/trainable": 545864
},
{
"epoch": 1.7502679528403,
"grad_norm": 0.03903573006391525,
"learning_rate": 0.0025,
"loss": 4.431785583496094,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 84.08142,
"step": 409,
"tokens/total": 924160,
"tokens/train_per_sec_per_gpu": 56.37,
"tokens/trainable": 547197
},
{
"epoch": 1.754555198285102,
"grad_norm": 0.08176471292972565,
"learning_rate": 0.0025,
"loss": 5.05267333984375,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 156.44012,
"step": 410,
"tokens/total": 926400,
"tokens/train_per_sec_per_gpu": 78.56,
"tokens/trainable": 548428
},
{
"epoch": 1.7588424437299035,
"grad_norm": 0.06569211184978485,
"learning_rate": 0.0025,
"loss": 4.557641506195068,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 95.35831,
"step": 411,
"tokens/total": 928256,
"tokens/train_per_sec_per_gpu": 69.11,
"tokens/trainable": 549334
},
{
"epoch": 1.7631296891747053,
"grad_norm": 0.0812261626124382,
"learning_rate": 0.0025,
"loss": 4.661388397216797,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.72,
"memory/max_allocated (GiB)": 18.72,
"ppl": 105.78285,
"step": 412,
"tokens/total": 930432,
"tokens/train_per_sec_per_gpu": 30.49,
"tokens/trainable": 550597
},
{
"epoch": 1.767416934619507,
"grad_norm": 0.06816331297159195,
"learning_rate": 0.0025,
"loss": 4.940434455871582,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 139.83099,
"step": 413,
"tokens/total": 932672,
"tokens/train_per_sec_per_gpu": 46.78,
"tokens/trainable": 551971
},
{
"epoch": 1.767416934619507,
"eval_loss": 4.7477898597717285,
"eval_ppl": 115.32911,
"eval_runtime": 16.9681,
"eval_samples_per_second": 12.258,
"eval_steps_per_second": 12.258,
"memory/device_reserved (GiB)": 19.92,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"step": 413
},
{
"epoch": 1.7717041800643085,
"grad_norm": 0.04632922261953354,
"learning_rate": 0.0025,
"loss": 4.674541473388672,
"memory/device_reserved (GiB)": 18.75,
"memory/max_active (GiB)": 18.53,
"memory/max_allocated (GiB)": 18.53,
"ppl": 107.18341,
"step": 414,
"tokens/total": 934720,
"tokens/train_per_sec_per_gpu": 21.87,
"tokens/trainable": 553178
},
{
"epoch": 1.7759914255091105,
"grad_norm": 0.0418037474155426,
"learning_rate": 0.0025,
"loss": 4.476470947265625,
"memory/device_reserved (GiB)": 18.75,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 87.92384,
"step": 415,
"tokens/total": 936768,
"tokens/train_per_sec_per_gpu": 44.4,
"tokens/trainable": 554330
},
{
"epoch": 1.780278670953912,
"grad_norm": 0.0720926821231842,
"learning_rate": 0.0025,
"loss": 4.901095390319824,
"memory/device_reserved (GiB)": 18.75,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 134.43696,
"step": 416,
"tokens/total": 938752,
"tokens/train_per_sec_per_gpu": 58.68,
"tokens/trainable": 555375
},
{
"epoch": 1.784565916398714,
"grad_norm": 0.0682898610830307,
"learning_rate": 0.0025,
"loss": 4.806495189666748,
"memory/device_reserved (GiB)": 18.75,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 122.30222,
"step": 417,
"tokens/total": 940736,
"tokens/train_per_sec_per_gpu": 60.27,
"tokens/trainable": 556410
},
{
"epoch": 1.7888531618435155,
"grad_norm": 0.05859844386577606,
"learning_rate": 0.0025,
"loss": 4.619528293609619,
"memory/device_reserved (GiB)": 18.75,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 101.44617,
"step": 418,
"tokens/total": 942720,
"tokens/train_per_sec_per_gpu": 119.79,
"tokens/trainable": 557481
},
{
"epoch": 1.7931404072883173,
"grad_norm": 0.0587584562599659,
"learning_rate": 0.0025,
"loss": 4.6873064041137695,
"memory/device_reserved (GiB)": 18.75,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 108.56037,
"step": 419,
"tokens/total": 944576,
"tokens/train_per_sec_per_gpu": 103.57,
"tokens/trainable": 558411
},
{
"epoch": 1.797427652733119,
"grad_norm": 0.048332419246435165,
"learning_rate": 0.0025,
"loss": 4.776644706726074,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 19.42,
"memory/max_allocated (GiB)": 19.42,
"ppl": 118.70539,
"step": 420,
"tokens/total": 948160,
"tokens/train_per_sec_per_gpu": 18.63,
"tokens/trainable": 561055
},
{
"epoch": 1.8017148981779205,
"grad_norm": 0.045766137540340424,
"learning_rate": 0.0025,
"loss": 4.8820977210998535,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 131.90708,
"step": 421,
"tokens/total": 950784,
"tokens/train_per_sec_per_gpu": 58.05,
"tokens/trainable": 562707
},
{
"epoch": 1.8060021436227225,
"grad_norm": 0.07353589683771133,
"learning_rate": 0.0025,
"loss": 4.372645854949951,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 79.25305,
"step": 422,
"tokens/total": 952384,
"tokens/train_per_sec_per_gpu": 94.96,
"tokens/trainable": 563370
},
{
"epoch": 1.810289389067524,
"grad_norm": 0.057513438165187836,
"learning_rate": 0.0025,
"loss": 4.444611549377441,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 85.16679,
"step": 423,
"tokens/total": 954560,
"tokens/train_per_sec_per_gpu": 14.88,
"tokens/trainable": 564680
},
{
"epoch": 1.814576634512326,
"grad_norm": 0.054809462279081345,
"learning_rate": 0.0025,
"loss": 4.237483501434326,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.56,
"memory/max_allocated (GiB)": 18.56,
"ppl": 69.23341,
"step": 424,
"tokens/total": 957696,
"tokens/train_per_sec_per_gpu": 52.44,
"tokens/trainable": 566858
},
{
"epoch": 1.8188638799571275,
"grad_norm": 0.042437877506017685,
"learning_rate": 0.0025,
"loss": 4.65795373916626,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.56,
"memory/max_allocated (GiB)": 18.56,
"ppl": 105.42014,
"step": 425,
"tokens/total": 960064,
"tokens/train_per_sec_per_gpu": 58.49,
"tokens/trainable": 568277
},
{
"epoch": 1.8231511254019293,
"grad_norm": 0.0690232664346695,
"learning_rate": 0.0025,
"loss": 4.57647180557251,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 19.1,
"memory/max_allocated (GiB)": 19.1,
"ppl": 97.17095,
"step": 426,
"tokens/total": 963264,
"tokens/train_per_sec_per_gpu": 604.98,
"tokens/trainable": 570607
},
{
"epoch": 1.827438370846731,
"grad_norm": 0.05784786492586136,
"learning_rate": 0.0025,
"loss": 5.262682914733887,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 192.9986,
"step": 427,
"tokens/total": 965824,
"tokens/train_per_sec_per_gpu": 120.41,
"tokens/trainable": 572251
},
{
"epoch": 1.8317256162915327,
"grad_norm": 0.07479379326105118,
"learning_rate": 0.0025,
"loss": 5.224900722503662,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 185.84272,
"step": 428,
"tokens/total": 968064,
"tokens/train_per_sec_per_gpu": 7.22,
"tokens/trainable": 573467
},
{
"epoch": 1.8360128617363345,
"grad_norm": 0.048712894320487976,
"learning_rate": 0.0025,
"loss": 5.22140645980835,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.56,
"memory/max_allocated (GiB)": 18.56,
"ppl": 185.19447,
"step": 429,
"tokens/total": 970432,
"tokens/train_per_sec_per_gpu": 17.21,
"tokens/trainable": 574765
},
{
"epoch": 1.840300107181136,
"grad_norm": 0.0432349257171154,
"learning_rate": 0.0025,
"loss": 4.933577060699463,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.51,
"memory/max_allocated (GiB)": 18.51,
"ppl": 138.87539,
"step": 430,
"tokens/total": 972928,
"tokens/train_per_sec_per_gpu": 126.94,
"tokens/trainable": 576351
},
{
"epoch": 1.844587352625938,
"grad_norm": 0.035638924688100815,
"learning_rate": 0.0025,
"loss": 5.010141372680664,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.5,
"memory/max_allocated (GiB)": 18.5,
"ppl": 149.92593,
"step": 431,
"tokens/total": 975616,
"tokens/train_per_sec_per_gpu": 55.33,
"tokens/trainable": 578016
},
{
"epoch": 1.8488745980707395,
"grad_norm": 0.05865227058529854,
"learning_rate": 0.0025,
"loss": 4.759288787841797,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.47,
"memory/max_allocated (GiB)": 18.47,
"ppl": 116.66292,
"step": 432,
"tokens/total": 977344,
"tokens/train_per_sec_per_gpu": 17.53,
"tokens/trainable": 578907
},
{
"epoch": 1.8531618435155413,
"grad_norm": 0.037786636501550674,
"learning_rate": 0.0025,
"loss": 4.746092796325684,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.56,
"memory/max_allocated (GiB)": 18.56,
"ppl": 115.13355,
"step": 433,
"tokens/total": 980800,
"tokens/train_per_sec_per_gpu": 37.5,
"tokens/trainable": 581317
},
{
"epoch": 1.857449088960343,
"grad_norm": 0.036064039915800095,
"learning_rate": 0.0025,
"loss": 4.497668743133545,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 89.80752,
"step": 434,
"tokens/total": 982976,
"tokens/train_per_sec_per_gpu": 41.65,
"tokens/trainable": 582503
},
{
"epoch": 1.8617363344051447,
"grad_norm": 0.059433262795209885,
"learning_rate": 0.0025,
"loss": 5.0172624588012695,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.55,
"memory/max_allocated (GiB)": 18.55,
"ppl": 150.99738,
"step": 435,
"tokens/total": 985600,
"tokens/train_per_sec_per_gpu": 18.07,
"tokens/trainable": 584121
},
{
"epoch": 1.8660235798499465,
"grad_norm": 0.05385487526655197,
"learning_rate": 0.0025,
"loss": 5.047094821929932,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.71,
"memory/max_allocated (GiB)": 18.71,
"ppl": 155.56985,
"step": 436,
"tokens/total": 988352,
"tokens/train_per_sec_per_gpu": 21.47,
"tokens/trainable": 585931
},
{
"epoch": 1.870310825294748,
"grad_norm": 0.07723033428192139,
"learning_rate": 0.0025,
"loss": 5.156320571899414,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.53,
"memory/max_allocated (GiB)": 18.53,
"ppl": 173.52481,
"step": 437,
"tokens/total": 991104,
"tokens/train_per_sec_per_gpu": 210.83,
"tokens/trainable": 587715
},
{
"epoch": 1.87459807073955,
"grad_norm": 0.054785728454589844,
"learning_rate": 0.0025,
"loss": 4.405440330505371,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.36,
"memory/max_allocated (GiB)": 18.36,
"ppl": 81.8952,
"step": 438,
"tokens/total": 992448,
"tokens/train_per_sec_per_gpu": 34.58,
"tokens/trainable": 588195
},
{
"epoch": 1.8788853161843515,
"grad_norm": 0.044970739632844925,
"learning_rate": 0.0025,
"loss": 4.495140075683594,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 89.58072,
"step": 439,
"tokens/total": 995008,
"tokens/train_per_sec_per_gpu": 10.07,
"tokens/trainable": 589904
},
{
"epoch": 1.8831725616291533,
"grad_norm": 0.05420251190662384,
"learning_rate": 0.0025,
"loss": 4.917167663574219,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.63,
"memory/max_allocated (GiB)": 18.63,
"ppl": 136.61512,
"step": 440,
"tokens/total": 997568,
"tokens/train_per_sec_per_gpu": 58.61,
"tokens/trainable": 591429
},
{
"epoch": 1.887459807073955,
"grad_norm": 0.06885336339473724,
"learning_rate": 0.0025,
"loss": 4.854959011077881,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.64,
"memory/max_allocated (GiB)": 18.64,
"ppl": 128.37543,
"step": 441,
"tokens/total": 1000000,
"tokens/train_per_sec_per_gpu": 2.19,
"tokens/trainable": 592888
},
{
"epoch": 1.8917470525187567,
"grad_norm": 0.04857528582215309,
"learning_rate": 0.0025,
"loss": 4.567473888397217,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.67,
"memory/max_allocated (GiB)": 18.67,
"ppl": 96.30054,
"step": 442,
"tokens/total": 1002496,
"tokens/train_per_sec_per_gpu": 28.11,
"tokens/trainable": 594412
},
{
"epoch": 1.8960342979635585,
"grad_norm": 0.05679011344909668,
"learning_rate": 0.0025,
"loss": 4.383760452270508,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 80.13883,
"step": 443,
"tokens/total": 1004032,
"tokens/train_per_sec_per_gpu": 7.89,
"tokens/trainable": 595042
},
{
"epoch": 1.90032154340836,
"grad_norm": 0.08521363139152527,
"learning_rate": 0.0025,
"loss": 5.441523551940918,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 230.79354,
"step": 444,
"tokens/total": 1005824,
"tokens/train_per_sec_per_gpu": 182.26,
"tokens/trainable": 595958
},
{
"epoch": 1.904608788853162,
"grad_norm": 0.04015873000025749,
"learning_rate": 0.0025,
"loss": 4.99971866607666,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.57,
"memory/max_allocated (GiB)": 18.57,
"ppl": 148.37141,
"step": 445,
"tokens/total": 1008640,
"tokens/train_per_sec_per_gpu": 16.68,
"tokens/trainable": 597780
},
{
"epoch": 1.9088960342979635,
"grad_norm": 0.05560390651226044,
"learning_rate": 0.0025,
"loss": 4.3576273918151855,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 78.07168,
"step": 446,
"tokens/total": 1010752,
"tokens/train_per_sec_per_gpu": 100.98,
"tokens/trainable": 598947
},
{
"epoch": 1.9131832797427653,
"grad_norm": 0.07791434973478317,
"learning_rate": 0.0025,
"loss": 4.918298721313477,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.46,
"memory/max_allocated (GiB)": 18.46,
"ppl": 136.76973,
"step": 447,
"tokens/total": 1012352,
"tokens/train_per_sec_per_gpu": 78.47,
"tokens/trainable": 599685
},
{
"epoch": 1.917470525187567,
"grad_norm": 0.05341208353638649,
"learning_rate": 0.0025,
"loss": 5.289507865905762,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.49,
"memory/max_allocated (GiB)": 18.49,
"ppl": 198.24584,
"step": 448,
"tokens/total": 1014592,
"tokens/train_per_sec_per_gpu": 6.23,
"tokens/trainable": 601074
},
{
"epoch": 1.9217577706323687,
"grad_norm": 0.06588708609342575,
"learning_rate": 0.0025,
"loss": 4.922712802886963,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 137.37478,
"step": 449,
"tokens/total": 1016832,
"tokens/train_per_sec_per_gpu": 6.2,
"tokens/trainable": 602419
},
{
"epoch": 1.9260450160771705,
"grad_norm": 0.08113836497068405,
"learning_rate": 0.0025,
"loss": 4.302712440490723,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 73.89997,
"step": 450,
"tokens/total": 1018880,
"tokens/train_per_sec_per_gpu": 54.48,
"tokens/trainable": 603525
},
{
"epoch": 1.930332261521972,
"grad_norm": 0.038718972355127335,
"learning_rate": 0.0025,
"loss": 5.016265869140625,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 150.84697,
"step": 451,
"tokens/total": 1021440,
"tokens/train_per_sec_per_gpu": 27.21,
"tokens/trainable": 605190
},
{
"epoch": 1.934619506966774,
"grad_norm": 0.06918424367904663,
"learning_rate": 0.0025,
"loss": 4.20094633102417,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.39,
"memory/max_allocated (GiB)": 18.39,
"ppl": 66.74947,
"step": 452,
"tokens/total": 1022976,
"tokens/train_per_sec_per_gpu": 59.21,
"tokens/trainable": 605785
},
{
"epoch": 1.9389067524115755,
"grad_norm": 0.05727904662489891,
"learning_rate": 0.0025,
"loss": 4.495724201202393,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 89.63306,
"step": 453,
"tokens/total": 1024832,
"tokens/train_per_sec_per_gpu": 15.8,
"tokens/trainable": 606673
},
{
"epoch": 1.9431939978563773,
"grad_norm": 0.050397999584674835,
"learning_rate": 0.0025,
"loss": 4.682392120361328,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.45,
"memory/max_allocated (GiB)": 18.45,
"ppl": 108.02818,
"step": 454,
"tokens/total": 1026880,
"tokens/train_per_sec_per_gpu": 14.36,
"tokens/trainable": 607794
},
{
"epoch": 1.947481243301179,
"grad_norm": 0.05501880869269371,
"learning_rate": 0.0025,
"loss": 4.579135894775391,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.94,
"memory/max_allocated (GiB)": 18.94,
"ppl": 97.43017,
"step": 455,
"tokens/total": 1030144,
"tokens/train_per_sec_per_gpu": 522.12,
"tokens/trainable": 610019
},
{
"epoch": 1.9517684887459807,
"grad_norm": 0.04997771605849266,
"learning_rate": 0.0025,
"loss": 4.384857177734375,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 80.22676,
"step": 456,
"tokens/total": 1032128,
"tokens/train_per_sec_per_gpu": 14.95,
"tokens/trainable": 611215
},
{
"epoch": 1.9560557341907825,
"grad_norm": 0.036863308399915695,
"learning_rate": 0.0025,
"loss": 4.938703536987305,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 139.58916,
"step": 457,
"tokens/total": 1034240,
"tokens/train_per_sec_per_gpu": 39.48,
"tokens/trainable": 612358
},
{
"epoch": 1.960342979635584,
"grad_norm": 0.0519835501909256,
"learning_rate": 0.0025,
"loss": 4.352743148803711,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 77.69129,
"step": 458,
"tokens/total": 1035712,
"tokens/train_per_sec_per_gpu": 86.87,
"tokens/trainable": 612911
},
{
"epoch": 1.964630225080386,
"grad_norm": 0.06522325426340103,
"learning_rate": 0.0025,
"loss": 4.363661766052246,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.44,
"memory/max_allocated (GiB)": 18.44,
"ppl": 78.54422,
"step": 459,
"tokens/total": 1037376,
"tokens/train_per_sec_per_gpu": 48.59,
"tokens/trainable": 613674
},
{
"epoch": 1.9689174705251875,
"grad_norm": 0.04710303246974945,
"learning_rate": 0.0025,
"loss": 4.830078601837158,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.51,
"memory/max_allocated (GiB)": 18.51,
"ppl": 125.2208,
"step": 460,
"tokens/total": 1039296,
"tokens/train_per_sec_per_gpu": 21.74,
"tokens/trainable": 614688
},
{
"epoch": 1.9732047159699893,
"grad_norm": 0.05254080519080162,
"learning_rate": 0.0025,
"loss": 4.752572536468506,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 115.88201,
"step": 461,
"tokens/total": 1041280,
"tokens/train_per_sec_per_gpu": 28.76,
"tokens/trainable": 615727
},
{
"epoch": 1.977491961414791,
"grad_norm": 0.04451625421643257,
"learning_rate": 0.0025,
"loss": 4.789048671722412,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.77,
"memory/max_allocated (GiB)": 18.77,
"ppl": 120.18698,
"step": 462,
"tokens/total": 1044224,
"tokens/train_per_sec_per_gpu": 95.08,
"tokens/trainable": 617841
},
{
"epoch": 1.9817792068595927,
"grad_norm": 0.07913687825202942,
"learning_rate": 0.0025,
"loss": 4.311519622802734,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.43,
"memory/max_allocated (GiB)": 18.43,
"ppl": 74.5537,
"step": 463,
"tokens/total": 1046208,
"tokens/train_per_sec_per_gpu": 8.84,
"tokens/trainable": 618874
},
{
"epoch": 1.9860664523043945,
"grad_norm": 0.047682974487543106,
"learning_rate": 0.0025,
"loss": 5.018848419189453,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.48,
"memory/max_allocated (GiB)": 18.48,
"ppl": 151.23704,
"step": 464,
"tokens/total": 1048704,
"tokens/train_per_sec_per_gpu": 10.86,
"tokens/trainable": 620535
},
{
"epoch": 1.990353697749196,
"grad_norm": 0.06579267233610153,
"learning_rate": 0.0025,
"loss": 4.598994255065918,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.41,
"memory/max_allocated (GiB)": 18.41,
"ppl": 99.38431,
"step": 465,
"tokens/total": 1050496,
"tokens/train_per_sec_per_gpu": 18.04,
"tokens/trainable": 621414
},
{
"epoch": 1.994640943193998,
"grad_norm": 0.049845289438962936,
"learning_rate": 0.0025,
"loss": 4.27248477935791,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.46,
"memory/max_allocated (GiB)": 18.46,
"ppl": 71.69957,
"step": 466,
"tokens/total": 1052480,
"tokens/train_per_sec_per_gpu": 112.51,
"tokens/trainable": 622451
},
{
"epoch": 1.9989281886387995,
"grad_norm": 0.04582377150654793,
"learning_rate": 0.0025,
"loss": 4.850298881530762,
"memory/device_reserved (GiB)": 19.85,
"memory/max_active (GiB)": 18.53,
"memory/max_allocated (GiB)": 18.53,
"ppl": 127.77857,
"step": 467,
"tokens/total": 1054528,
"tokens/train_per_sec_per_gpu": 210.97,
"tokens/trainable": 623603
}
],
"logging_steps": 1,
"max_steps": 467,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 234,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.347829287307059e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}