Guilherme34's picture
Upload folder using huggingface_hub
47021fc verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.100806451612903,
"eval_steps": 500,
"global_step": 405,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0020161290322580645,
"grad_norm": 14.229155540466309,
"learning_rate": 0.0002,
"loss": 2.857408046722412,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 17.41633,
"step": 1,
"tokens/total": 2048,
"tokens/train_per_sec_per_gpu": 20.25,
"tokens/trainable": 2039
},
{
"epoch": 0.004032258064516129,
"grad_norm": 0.4178522527217865,
"learning_rate": 0.00019999799412001546,
"loss": 2.866978645324707,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 17.58381,
"step": 2,
"tokens/total": 4096,
"tokens/train_per_sec_per_gpu": 262.4,
"tokens/trainable": 4078
},
{
"epoch": 0.006048387096774193,
"grad_norm": 0.42466941475868225,
"learning_rate": 0.00019999197656053288,
"loss": 2.8624894618988037,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 17.50505,
"step": 3,
"tokens/total": 6144,
"tokens/train_per_sec_per_gpu": 261.67,
"tokens/trainable": 6123
},
{
"epoch": 0.008064516129032258,
"grad_norm": 10.047090530395508,
"learning_rate": 0.0001999819475629623,
"loss": 2.7082412242889404,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 15.00287,
"step": 4,
"tokens/total": 8192,
"tokens/train_per_sec_per_gpu": 264.34,
"tokens/trainable": 8161
},
{
"epoch": 0.010080645161290322,
"grad_norm": 0.42684099078178406,
"learning_rate": 0.00019996790752964305,
"loss": 2.5221738815307617,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 12.45564,
"step": 5,
"tokens/total": 10240,
"tokens/train_per_sec_per_gpu": 257.89,
"tokens/trainable": 10195
},
{
"epoch": 0.012096774193548387,
"grad_norm": 0.552547812461853,
"learning_rate": 0.00019994985702382758,
"loss": 3.238849639892578,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 25.50437,
"step": 6,
"tokens/total": 12288,
"tokens/train_per_sec_per_gpu": 252.93,
"tokens/trainable": 12232
},
{
"epoch": 0.014112903225806451,
"grad_norm": 0.5256035327911377,
"learning_rate": 0.00019992779676965885,
"loss": 2.7185559272766113,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 15.15842,
"step": 7,
"tokens/total": 14336,
"tokens/train_per_sec_per_gpu": 263.26,
"tokens/trainable": 14266
},
{
"epoch": 0.016129032258064516,
"grad_norm": 0.5142767429351807,
"learning_rate": 0.00019990172765214128,
"loss": 2.322587728500366,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 10.20204,
"step": 8,
"tokens/total": 16384,
"tokens/train_per_sec_per_gpu": 256.27,
"tokens/trainable": 16312
},
{
"epoch": 0.018145161290322582,
"grad_norm": 0.6270139813423157,
"learning_rate": 0.00019987165071710527,
"loss": 2.8085548877716064,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 16.58593,
"step": 9,
"tokens/total": 18432,
"tokens/train_per_sec_per_gpu": 244.86,
"tokens/trainable": 18348
},
{
"epoch": 0.020161290322580645,
"grad_norm": 0.5775133371353149,
"learning_rate": 0.00019983756717116536,
"loss": 2.1894941329956055,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.93069,
"step": 10,
"tokens/total": 20480,
"tokens/train_per_sec_per_gpu": 255.48,
"tokens/trainable": 20385
},
{
"epoch": 0.02217741935483871,
"grad_norm": 0.7627199292182922,
"learning_rate": 0.0001997994783816715,
"loss": 2.6888070106506348,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 14.71411,
"step": 11,
"tokens/total": 22528,
"tokens/train_per_sec_per_gpu": 262.76,
"tokens/trainable": 22428
},
{
"epoch": 0.024193548387096774,
"grad_norm": 0.7487443685531616,
"learning_rate": 0.00019975738587665456,
"loss": 3.052485942840576,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 21.1679,
"step": 12,
"tokens/total": 24576,
"tokens/train_per_sec_per_gpu": 258.22,
"tokens/trainable": 24458
},
{
"epoch": 0.02620967741935484,
"grad_norm": 0.783439040184021,
"learning_rate": 0.00019971129134476473,
"loss": 2.723402500152588,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 15.23206,
"step": 13,
"tokens/total": 26624,
"tokens/train_per_sec_per_gpu": 260.81,
"tokens/trainable": 26498
},
{
"epoch": 0.028225806451612902,
"grad_norm": 0.8359043002128601,
"learning_rate": 0.00019966119663520412,
"loss": 2.5634570121765137,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 12.98061,
"step": 14,
"tokens/total": 28672,
"tokens/train_per_sec_per_gpu": 257.31,
"tokens/trainable": 28538
},
{
"epoch": 0.03024193548387097,
"grad_norm": 0.6595765352249146,
"learning_rate": 0.0001996071037576521,
"loss": 2.629350423812866,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 13.86476,
"step": 15,
"tokens/total": 30720,
"tokens/train_per_sec_per_gpu": 247.77,
"tokens/trainable": 30584
},
{
"epoch": 0.03225806451612903,
"grad_norm": 0.611452043056488,
"learning_rate": 0.00019954901488218515,
"loss": 2.4734597206115723,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 11.86342,
"step": 16,
"tokens/total": 32768,
"tokens/train_per_sec_per_gpu": 259.46,
"tokens/trainable": 32623
},
{
"epoch": 0.034274193548387094,
"grad_norm": 0.5893421769142151,
"learning_rate": 0.00019948693233918952,
"loss": 2.2999792098999023,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 9.97398,
"step": 17,
"tokens/total": 34816,
"tokens/train_per_sec_per_gpu": 256.75,
"tokens/trainable": 34652
},
{
"epoch": 0.036290322580645164,
"grad_norm": 0.6046351790428162,
"learning_rate": 0.0001994208586192678,
"loss": 2.5167291164398193,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 12.38801,
"step": 18,
"tokens/total": 36864,
"tokens/train_per_sec_per_gpu": 250.14,
"tokens/trainable": 36691
},
{
"epoch": 0.038306451612903226,
"grad_norm": 0.6316596865653992,
"learning_rate": 0.00019935079637313906,
"loss": 2.5756912231445312,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 13.1404,
"step": 19,
"tokens/total": 38912,
"tokens/train_per_sec_per_gpu": 249.87,
"tokens/trainable": 38729
},
{
"epoch": 0.04032258064516129,
"grad_norm": 0.6076182126998901,
"learning_rate": 0.00019927674841153237,
"loss": 2.090085983276367,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.08561,
"step": 20,
"tokens/total": 40960,
"tokens/train_per_sec_per_gpu": 249.2,
"tokens/trainable": 40773
},
{
"epoch": 0.04233870967741935,
"grad_norm": 0.618816614151001,
"learning_rate": 0.0001991987177050743,
"loss": 2.6480720043182373,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 14.12678,
"step": 21,
"tokens/total": 43008,
"tokens/train_per_sec_per_gpu": 255.22,
"tokens/trainable": 42805
},
{
"epoch": 0.04435483870967742,
"grad_norm": 0.6039671301841736,
"learning_rate": 0.00019911670738416947,
"loss": 1.99757719039917,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.37118,
"step": 22,
"tokens/total": 45056,
"tokens/train_per_sec_per_gpu": 259.8,
"tokens/trainable": 44849
},
{
"epoch": 0.046370967741935484,
"grad_norm": 0.5967584252357483,
"learning_rate": 0.00019903072073887507,
"loss": 2.346921443939209,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 10.45334,
"step": 23,
"tokens/total": 47104,
"tokens/train_per_sec_per_gpu": 261.54,
"tokens/trainable": 46878
},
{
"epoch": 0.04838709677419355,
"grad_norm": 0.5980664491653442,
"learning_rate": 0.000198940761218769,
"loss": 2.123394012451172,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.35946,
"step": 24,
"tokens/total": 49152,
"tokens/train_per_sec_per_gpu": 261.87,
"tokens/trainable": 48920
},
{
"epoch": 0.05040322580645161,
"grad_norm": 0.612209677696228,
"learning_rate": 0.00019884683243281116,
"loss": 2.1959524154663086,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.98856,
"step": 25,
"tokens/total": 51200,
"tokens/train_per_sec_per_gpu": 254.83,
"tokens/trainable": 50960
},
{
"epoch": 0.05241935483870968,
"grad_norm": 0.6679723858833313,
"learning_rate": 0.00019874893814919906,
"loss": 2.667539119720459,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 14.40448,
"step": 26,
"tokens/total": 53248,
"tokens/train_per_sec_per_gpu": 245.17,
"tokens/trainable": 52995
},
{
"epoch": 0.05443548387096774,
"grad_norm": 0.6861585378646851,
"learning_rate": 0.00019864708229521636,
"loss": 1.9803462028503418,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.24525,
"step": 27,
"tokens/total": 55296,
"tokens/train_per_sec_per_gpu": 258.01,
"tokens/trainable": 55034
},
{
"epoch": 0.056451612903225805,
"grad_norm": 0.5645394921302795,
"learning_rate": 0.0001985412689570754,
"loss": 1.961925745010376,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.11301,
"step": 28,
"tokens/total": 57344,
"tokens/train_per_sec_per_gpu": 258.84,
"tokens/trainable": 57050
},
{
"epoch": 0.05846774193548387,
"grad_norm": 0.6146724224090576,
"learning_rate": 0.00019843150237975344,
"loss": 2.7785215377807617,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 16.09521,
"step": 29,
"tokens/total": 59392,
"tokens/train_per_sec_per_gpu": 262.26,
"tokens/trainable": 59084
},
{
"epoch": 0.06048387096774194,
"grad_norm": 0.5862687230110168,
"learning_rate": 0.00019831778696682194,
"loss": 1.864811897277832,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.45472,
"step": 30,
"tokens/total": 61440,
"tokens/train_per_sec_per_gpu": 262.16,
"tokens/trainable": 61121
},
{
"epoch": 0.0625,
"grad_norm": 0.6342179775238037,
"learning_rate": 0.00019820012728027044,
"loss": 2.477421283721924,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 11.91051,
"step": 31,
"tokens/total": 63488,
"tokens/train_per_sec_per_gpu": 264.65,
"tokens/trainable": 63166
},
{
"epoch": 0.06451612903225806,
"grad_norm": 0.5973448157310486,
"learning_rate": 0.00019807852804032305,
"loss": 2.2537577152252197,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 9.52346,
"step": 32,
"tokens/total": 65536,
"tokens/train_per_sec_per_gpu": 249.19,
"tokens/trainable": 65203
},
{
"epoch": 0.06653225806451613,
"grad_norm": 0.5755914449691772,
"learning_rate": 0.00019795299412524945,
"loss": 2.242089033126831,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 9.41297,
"step": 33,
"tokens/total": 67584,
"tokens/train_per_sec_per_gpu": 257.52,
"tokens/trainable": 67227
},
{
"epoch": 0.06854838709677419,
"grad_norm": 0.6142029166221619,
"learning_rate": 0.000197823530571169,
"loss": 2.0939087867736816,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.11658,
"step": 34,
"tokens/total": 69632,
"tokens/train_per_sec_per_gpu": 258.27,
"tokens/trainable": 69269
},
{
"epoch": 0.07056451612903226,
"grad_norm": 0.6227375268936157,
"learning_rate": 0.0001976901425718487,
"loss": 2.35734224319458,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 10.56284,
"step": 35,
"tokens/total": 71680,
"tokens/train_per_sec_per_gpu": 260.78,
"tokens/trainable": 71312
},
{
"epoch": 0.07258064516129033,
"grad_norm": 0.5991148948669434,
"learning_rate": 0.00019755283547849494,
"loss": 2.319620370864868,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 10.17181,
"step": 36,
"tokens/total": 73728,
"tokens/train_per_sec_per_gpu": 259.96,
"tokens/trainable": 73349
},
{
"epoch": 0.07459677419354839,
"grad_norm": 0.5688547492027283,
"learning_rate": 0.0001974116147995387,
"loss": 1.8059585094451904,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.0858,
"step": 37,
"tokens/total": 75776,
"tokens/train_per_sec_per_gpu": 262.3,
"tokens/trainable": 75395
},
{
"epoch": 0.07661290322580645,
"grad_norm": 0.7161643505096436,
"learning_rate": 0.00019726648620041468,
"loss": 2.224153518676758,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 9.24565,
"step": 38,
"tokens/total": 77824,
"tokens/train_per_sec_per_gpu": 255.46,
"tokens/trainable": 77427
},
{
"epoch": 0.07862903225806452,
"grad_norm": 0.6673325300216675,
"learning_rate": 0.0001971174555033339,
"loss": 2.3063974380493164,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 10.0382,
"step": 39,
"tokens/total": 79872,
"tokens/train_per_sec_per_gpu": 254.47,
"tokens/trainable": 79459
},
{
"epoch": 0.08064516129032258,
"grad_norm": 0.6383804678916931,
"learning_rate": 0.00019696452868705024,
"loss": 2.597661018371582,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 13.43228,
"step": 40,
"tokens/total": 81920,
"tokens/train_per_sec_per_gpu": 256.24,
"tokens/trainable": 81506
},
{
"epoch": 0.08266129032258064,
"grad_norm": 0.7061684727668762,
"learning_rate": 0.00019680771188662044,
"loss": 1.7441378831863403,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.72097,
"step": 41,
"tokens/total": 83968,
"tokens/train_per_sec_per_gpu": 242.74,
"tokens/trainable": 83536
},
{
"epoch": 0.0846774193548387,
"grad_norm": 0.7068732380867004,
"learning_rate": 0.0001966470113931582,
"loss": 2.949695587158203,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 19.10014,
"step": 42,
"tokens/total": 86016,
"tokens/train_per_sec_per_gpu": 252.81,
"tokens/trainable": 85581
},
{
"epoch": 0.08669354838709678,
"grad_norm": 0.5967429876327515,
"learning_rate": 0.00019648243365358146,
"loss": 2.2845401763916016,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 9.82117,
"step": 43,
"tokens/total": 88064,
"tokens/train_per_sec_per_gpu": 256.36,
"tokens/trainable": 87621
},
{
"epoch": 0.08870967741935484,
"grad_norm": 0.6582663059234619,
"learning_rate": 0.00019631398527035422,
"loss": 2.4249186515808105,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 11.30131,
"step": 44,
"tokens/total": 90112,
"tokens/train_per_sec_per_gpu": 262.03,
"tokens/trainable": 89666
},
{
"epoch": 0.0907258064516129,
"grad_norm": 0.6329452395439148,
"learning_rate": 0.00019614167300122126,
"loss": 2.4295296669006348,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 11.35354,
"step": 45,
"tokens/total": 92160,
"tokens/train_per_sec_per_gpu": 255.81,
"tokens/trainable": 91706
},
{
"epoch": 0.09274193548387097,
"grad_norm": 0.6765903830528259,
"learning_rate": 0.0001959655037589372,
"loss": 2.2950925827026367,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 9.92535,
"step": 46,
"tokens/total": 94208,
"tokens/train_per_sec_per_gpu": 32.48,
"tokens/trainable": 93749
},
{
"epoch": 0.09475806451612903,
"grad_norm": 0.795857310295105,
"learning_rate": 0.00019578548461098914,
"loss": 1.9531352519989014,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.05076,
"step": 47,
"tokens/total": 96256,
"tokens/train_per_sec_per_gpu": 134.2,
"tokens/trainable": 95772
},
{
"epoch": 0.0967741935483871,
"grad_norm": 0.5967952013015747,
"learning_rate": 0.00019560162277931325,
"loss": 1.7950658798217773,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.01987,
"step": 48,
"tokens/total": 98304,
"tokens/train_per_sec_per_gpu": 133.62,
"tokens/trainable": 97793
},
{
"epoch": 0.09879032258064516,
"grad_norm": 0.6623239517211914,
"learning_rate": 0.00019541392564000488,
"loss": 2.242034912109375,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 9.41247,
"step": 49,
"tokens/total": 100352,
"tokens/train_per_sec_per_gpu": 133.56,
"tokens/trainable": 99828
},
{
"epoch": 0.10080645161290322,
"grad_norm": 0.6411992907524109,
"learning_rate": 0.00019522240072302274,
"loss": 1.8567615747451782,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.40297,
"step": 50,
"tokens/total": 102400,
"tokens/train_per_sec_per_gpu": 136.64,
"tokens/trainable": 101859
},
{
"epoch": 0.1028225806451613,
"grad_norm": 0.6743437647819519,
"learning_rate": 0.00019502705571188672,
"loss": 1.7463788986206055,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.7338,
"step": 51,
"tokens/total": 104448,
"tokens/train_per_sec_per_gpu": 134.46,
"tokens/trainable": 103902
},
{
"epoch": 0.10483870967741936,
"grad_norm": 0.5422692894935608,
"learning_rate": 0.0001948278984433699,
"loss": 1.7116990089416504,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.53836,
"step": 52,
"tokens/total": 106496,
"tokens/train_per_sec_per_gpu": 135.55,
"tokens/trainable": 105941
},
{
"epoch": 0.10685483870967742,
"grad_norm": 0.5959545969963074,
"learning_rate": 0.0001946249369071837,
"loss": 2.0541088581085205,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.79988,
"step": 53,
"tokens/total": 108544,
"tokens/train_per_sec_per_gpu": 136.22,
"tokens/trainable": 107963
},
{
"epoch": 0.10887096774193548,
"grad_norm": 0.7033765316009521,
"learning_rate": 0.00019441817924565786,
"loss": 2.3317766189575195,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 10.29622,
"step": 54,
"tokens/total": 110592,
"tokens/train_per_sec_per_gpu": 138.09,
"tokens/trainable": 109990
},
{
"epoch": 0.11088709677419355,
"grad_norm": 0.8628013134002686,
"learning_rate": 0.0001942076337534135,
"loss": 2.250319004058838,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 9.49076,
"step": 55,
"tokens/total": 112640,
"tokens/train_per_sec_per_gpu": 133.73,
"tokens/trainable": 112020
},
{
"epoch": 0.11290322580645161,
"grad_norm": 0.6559049487113953,
"learning_rate": 0.00019399330887703037,
"loss": 2.2744903564453125,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 9.72296,
"step": 56,
"tokens/total": 114688,
"tokens/train_per_sec_per_gpu": 134.26,
"tokens/trainable": 114066
},
{
"epoch": 0.11491935483870967,
"grad_norm": 0.6887659430503845,
"learning_rate": 0.00019377521321470805,
"loss": 2.313232898712158,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 10.10705,
"step": 57,
"tokens/total": 116736,
"tokens/train_per_sec_per_gpu": 133.71,
"tokens/trainable": 116088
},
{
"epoch": 0.11693548387096774,
"grad_norm": 0.6893835663795471,
"learning_rate": 0.00019355335551592105,
"loss": 2.245192289352417,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 9.44223,
"step": 58,
"tokens/total": 118784,
"tokens/train_per_sec_per_gpu": 133.19,
"tokens/trainable": 118109
},
{
"epoch": 0.11895161290322581,
"grad_norm": 0.7315247654914856,
"learning_rate": 0.00019332774468106768,
"loss": 2.1427183151245117,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.52257,
"step": 59,
"tokens/total": 120832,
"tokens/train_per_sec_per_gpu": 136.05,
"tokens/trainable": 120155
},
{
"epoch": 0.12096774193548387,
"grad_norm": 0.6021593809127808,
"learning_rate": 0.00019309838976111311,
"loss": 2.017518997192383,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.51965,
"step": 60,
"tokens/total": 122880,
"tokens/train_per_sec_per_gpu": 134.7,
"tokens/trainable": 122178
},
{
"epoch": 0.12298387096774194,
"grad_norm": 0.6357892751693726,
"learning_rate": 0.00019286529995722623,
"loss": 1.8628515005111694,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.44208,
"step": 61,
"tokens/total": 124928,
"tokens/train_per_sec_per_gpu": 133.45,
"tokens/trainable": 124202
},
{
"epoch": 0.125,
"grad_norm": 0.6388751864433289,
"learning_rate": 0.00019262848462041045,
"loss": 1.9663957357406616,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.14488,
"step": 62,
"tokens/total": 126976,
"tokens/train_per_sec_per_gpu": 131.0,
"tokens/trainable": 126219
},
{
"epoch": 0.12701612903225806,
"grad_norm": 0.6599971652030945,
"learning_rate": 0.0001923879532511287,
"loss": 2.0729598999023438,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.94831,
"step": 63,
"tokens/total": 129024,
"tokens/train_per_sec_per_gpu": 135.7,
"tokens/trainable": 128262
},
{
"epoch": 0.12903225806451613,
"grad_norm": 0.7960649728775024,
"learning_rate": 0.0001921437154989221,
"loss": 2.5976061820983887,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 13.43155,
"step": 64,
"tokens/total": 131072,
"tokens/train_per_sec_per_gpu": 136.99,
"tokens/trainable": 130300
},
{
"epoch": 0.1310483870967742,
"grad_norm": 0.6315323710441589,
"learning_rate": 0.00019189578116202307,
"loss": 2.0962467193603516,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.13558,
"step": 65,
"tokens/total": 133120,
"tokens/train_per_sec_per_gpu": 137.36,
"tokens/trainable": 132335
},
{
"epoch": 0.13306451612903225,
"grad_norm": 0.6805335879325867,
"learning_rate": 0.00019164416018696207,
"loss": 2.0736522674560547,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.95382,
"step": 66,
"tokens/total": 135168,
"tokens/train_per_sec_per_gpu": 137.33,
"tokens/trainable": 134340
},
{
"epoch": 0.1350806451612903,
"grad_norm": 0.635079562664032,
"learning_rate": 0.00019138886266816866,
"loss": 2.0142641067504883,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.49521,
"step": 67,
"tokens/total": 137216,
"tokens/train_per_sec_per_gpu": 133.68,
"tokens/trainable": 136365
},
{
"epoch": 0.13709677419354838,
"grad_norm": 0.6177524924278259,
"learning_rate": 0.00019112989884756653,
"loss": 1.8478095531463623,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.3459,
"step": 68,
"tokens/total": 139264,
"tokens/train_per_sec_per_gpu": 136.24,
"tokens/trainable": 138391
},
{
"epoch": 0.13911290322580644,
"grad_norm": 0.6538437604904175,
"learning_rate": 0.0001908672791141625,
"loss": 2.0607097148895264,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.85154,
"step": 69,
"tokens/total": 141312,
"tokens/train_per_sec_per_gpu": 138.96,
"tokens/trainable": 140435
},
{
"epoch": 0.14112903225806453,
"grad_norm": 0.6102776527404785,
"learning_rate": 0.00019060101400362998,
"loss": 1.8468718528747559,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.33996,
"step": 70,
"tokens/total": 143360,
"tokens/train_per_sec_per_gpu": 137.72,
"tokens/trainable": 142463
},
{
"epoch": 0.1431451612903226,
"grad_norm": 0.6988463997840881,
"learning_rate": 0.00019033111419788597,
"loss": 2.087909460067749,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.06803,
"step": 71,
"tokens/total": 145408,
"tokens/train_per_sec_per_gpu": 120.7,
"tokens/trainable": 144290
},
{
"epoch": 1.002016129032258,
"grad_norm": 0.5984178781509399,
"learning_rate": 0.000190057590524663,
"loss": 1.9351210594177246,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.92488,
"step": 72,
"tokens/total": 147456,
"tokens/train_per_sec_per_gpu": 134.47,
"tokens/trainable": 146323
},
{
"epoch": 1.0040322580645162,
"grad_norm": 0.6225078701972961,
"learning_rate": 0.00018978045395707418,
"loss": 1.8930723667144775,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.63974,
"step": 73,
"tokens/total": 149504,
"tokens/train_per_sec_per_gpu": 138.22,
"tokens/trainable": 148360
},
{
"epoch": 1.0060483870967742,
"grad_norm": 0.6837508082389832,
"learning_rate": 0.0001894997156131734,
"loss": 1.9495553970336914,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.02556,
"step": 74,
"tokens/total": 151552,
"tokens/train_per_sec_per_gpu": 137.22,
"tokens/trainable": 150403
},
{
"epoch": 1.0080645161290323,
"grad_norm": 0.6232768893241882,
"learning_rate": 0.0001892153867555092,
"loss": 1.8917932510375977,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.63125,
"step": 75,
"tokens/total": 153600,
"tokens/train_per_sec_per_gpu": 137.44,
"tokens/trainable": 152441
},
{
"epoch": 1.0100806451612903,
"grad_norm": 0.664825975894928,
"learning_rate": 0.00018892747879067286,
"loss": 1.785915732383728,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.96504,
"step": 76,
"tokens/total": 155648,
"tokens/train_per_sec_per_gpu": 136.24,
"tokens/trainable": 154488
},
{
"epoch": 1.0120967741935485,
"grad_norm": 0.6600914597511292,
"learning_rate": 0.00018863600326884082,
"loss": 1.8432128429412842,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.3168,
"step": 77,
"tokens/total": 157696,
"tokens/train_per_sec_per_gpu": 133.58,
"tokens/trainable": 156510
},
{
"epoch": 1.0141129032258065,
"grad_norm": 0.7409534454345703,
"learning_rate": 0.00018834097188331143,
"loss": 2.134878158569336,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.45602,
"step": 78,
"tokens/total": 159744,
"tokens/train_per_sec_per_gpu": 136.39,
"tokens/trainable": 158551
},
{
"epoch": 1.0161290322580645,
"grad_norm": 0.6784939765930176,
"learning_rate": 0.00018804239647003573,
"loss": 1.950951099395752,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.03538,
"step": 79,
"tokens/total": 161792,
"tokens/train_per_sec_per_gpu": 133.83,
"tokens/trainable": 160585
},
{
"epoch": 1.0181451612903225,
"grad_norm": 0.6995214819908142,
"learning_rate": 0.00018774028900714256,
"loss": 2.291863441467285,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 9.89336,
"step": 80,
"tokens/total": 163840,
"tokens/train_per_sec_per_gpu": 135.69,
"tokens/trainable": 162622
},
{
"epoch": 1.0201612903225807,
"grad_norm": 0.6955534219741821,
"learning_rate": 0.00018743466161445823,
"loss": 1.7035590410232544,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.49346,
"step": 81,
"tokens/total": 165888,
"tokens/train_per_sec_per_gpu": 133.41,
"tokens/trainable": 164663
},
{
"epoch": 1.0221774193548387,
"grad_norm": 0.6910862326622009,
"learning_rate": 0.0001871255265530201,
"loss": 2.1356892585754395,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.46288,
"step": 82,
"tokens/total": 167936,
"tokens/train_per_sec_per_gpu": 133.66,
"tokens/trainable": 166700
},
{
"epoch": 1.0241935483870968,
"grad_norm": 0.7269375324249268,
"learning_rate": 0.00018681289622458485,
"loss": 2.05245304107666,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.78698,
"step": 83,
"tokens/total": 169984,
"tokens/train_per_sec_per_gpu": 133.27,
"tokens/trainable": 168743
},
{
"epoch": 1.0262096774193548,
"grad_norm": 0.7181906700134277,
"learning_rate": 0.00018649678317113084,
"loss": 1.815029263496399,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.14126,
"step": 84,
"tokens/total": 172032,
"tokens/train_per_sec_per_gpu": 133.71,
"tokens/trainable": 170779
},
{
"epoch": 1.028225806451613,
"grad_norm": 0.7987785935401917,
"learning_rate": 0.00018617720007435497,
"loss": 2.4782207012176514,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 11.92004,
"step": 85,
"tokens/total": 174080,
"tokens/train_per_sec_per_gpu": 134.46,
"tokens/trainable": 172825
},
{
"epoch": 1.030241935483871,
"grad_norm": 0.7619947195053101,
"learning_rate": 0.000185854159755164,
"loss": 2.177952289581299,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.82821,
"step": 86,
"tokens/total": 176128,
"tokens/train_per_sec_per_gpu": 131.89,
"tokens/trainable": 174861
},
{
"epoch": 1.032258064516129,
"grad_norm": 0.6998146772384644,
"learning_rate": 0.00018552767517316022,
"loss": 1.8050026893615723,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.07999,
"step": 87,
"tokens/total": 178176,
"tokens/train_per_sec_per_gpu": 133.17,
"tokens/trainable": 176894
},
{
"epoch": 1.034274193548387,
"grad_norm": 0.7544717788696289,
"learning_rate": 0.00018519775942612128,
"loss": 2.0845391750335693,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.04089,
"step": 88,
"tokens/total": 180224,
"tokens/train_per_sec_per_gpu": 132.51,
"tokens/trainable": 178924
},
{
"epoch": 1.0362903225806452,
"grad_norm": 0.7606767416000366,
"learning_rate": 0.00018486442574947511,
"loss": 2.1034138202667236,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.1941,
"step": 89,
"tokens/total": 182272,
"tokens/train_per_sec_per_gpu": 134.29,
"tokens/trainable": 180969
},
{
"epoch": 1.0383064516129032,
"grad_norm": 0.7183067798614502,
"learning_rate": 0.0001845276875157687,
"loss": 2.0029361248016357,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.41078,
"step": 90,
"tokens/total": 184320,
"tokens/train_per_sec_per_gpu": 132.23,
"tokens/trainable": 183002
},
{
"epoch": 1.0403225806451613,
"grad_norm": 0.8227747082710266,
"learning_rate": 0.0001841875582341317,
"loss": 2.2535457611083984,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 9.52144,
"step": 91,
"tokens/total": 186368,
"tokens/train_per_sec_per_gpu": 130.72,
"tokens/trainable": 185040
},
{
"epoch": 1.0423387096774193,
"grad_norm": 0.8300987482070923,
"learning_rate": 0.0001838440515497345,
"loss": 2.2482643127441406,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 9.47128,
"step": 92,
"tokens/total": 188416,
"tokens/train_per_sec_per_gpu": 133.43,
"tokens/trainable": 187068
},
{
"epoch": 1.0443548387096775,
"grad_norm": 0.7577045559883118,
"learning_rate": 0.00018349718124324076,
"loss": 2.021200656890869,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.54738,
"step": 93,
"tokens/total": 190464,
"tokens/train_per_sec_per_gpu": 133.41,
"tokens/trainable": 189096
},
{
"epoch": 1.0463709677419355,
"grad_norm": 0.7605635523796082,
"learning_rate": 0.00018314696123025454,
"loss": 1.8970659971237183,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.66631,
"step": 94,
"tokens/total": 192512,
"tokens/train_per_sec_per_gpu": 133.88,
"tokens/trainable": 191131
},
{
"epoch": 1.0483870967741935,
"grad_norm": 0.7875813245773315,
"learning_rate": 0.00018279340556076216,
"loss": 1.7688566446304321,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.86414,
"step": 95,
"tokens/total": 194560,
"tokens/train_per_sec_per_gpu": 131.53,
"tokens/trainable": 193158
},
{
"epoch": 1.0504032258064515,
"grad_norm": 0.7767308354377747,
"learning_rate": 0.0001824365284185684,
"loss": 1.7849911451339722,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.95953,
"step": 96,
"tokens/total": 196608,
"tokens/train_per_sec_per_gpu": 132.93,
"tokens/trainable": 195195
},
{
"epoch": 1.0524193548387097,
"grad_norm": 0.7135453224182129,
"learning_rate": 0.00018207634412072764,
"loss": 1.6029993295669556,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.96791,
"step": 97,
"tokens/total": 198656,
"tokens/train_per_sec_per_gpu": 134.14,
"tokens/trainable": 197231
},
{
"epoch": 1.0544354838709677,
"grad_norm": 0.7890044450759888,
"learning_rate": 0.00018171286711696934,
"loss": 2.068784236907959,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.91519,
"step": 98,
"tokens/total": 200704,
"tokens/train_per_sec_per_gpu": 132.51,
"tokens/trainable": 199274
},
{
"epoch": 1.0564516129032258,
"grad_norm": 0.7956987023353577,
"learning_rate": 0.0001813461119891184,
"loss": 1.9784281253814697,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.23137,
"step": 99,
"tokens/total": 202752,
"tokens/train_per_sec_per_gpu": 135.46,
"tokens/trainable": 201310
},
{
"epoch": 1.0584677419354838,
"grad_norm": 0.7803149223327637,
"learning_rate": 0.00018097609345051025,
"loss": 1.885071039199829,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.58682,
"step": 100,
"tokens/total": 204800,
"tokens/train_per_sec_per_gpu": 133.55,
"tokens/trainable": 203342
},
{
"epoch": 1.060483870967742,
"grad_norm": 0.8120332956314087,
"learning_rate": 0.00018060282634540053,
"loss": 2.0599560737609863,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.84563,
"step": 101,
"tokens/total": 206848,
"tokens/train_per_sec_per_gpu": 133.67,
"tokens/trainable": 205365
},
{
"epoch": 1.0625,
"grad_norm": 0.7515849471092224,
"learning_rate": 0.00018022632564836948,
"loss": 1.4255881309509277,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.1603,
"step": 102,
"tokens/total": 208896,
"tokens/train_per_sec_per_gpu": 133.07,
"tokens/trainable": 207388
},
{
"epoch": 1.064516129032258,
"grad_norm": 0.8653085231781006,
"learning_rate": 0.0001798466064637214,
"loss": 2.1312050819396973,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.42501,
"step": 103,
"tokens/total": 210944,
"tokens/train_per_sec_per_gpu": 135.16,
"tokens/trainable": 209420
},
{
"epoch": 1.066532258064516,
"grad_norm": 0.8592569231987,
"learning_rate": 0.00017946368402487845,
"loss": 1.9539296627044678,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.05636,
"step": 104,
"tokens/total": 212992,
"tokens/train_per_sec_per_gpu": 131.63,
"tokens/trainable": 211462
},
{
"epoch": 1.0685483870967742,
"grad_norm": 0.7780154347419739,
"learning_rate": 0.00017907757369376985,
"loss": 1.9227118492126465,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.83948,
"step": 105,
"tokens/total": 215040,
"tokens/train_per_sec_per_gpu": 134.27,
"tokens/trainable": 213491
},
{
"epoch": 1.0705645161290323,
"grad_norm": 0.7553818225860596,
"learning_rate": 0.00017868829096021527,
"loss": 1.7215089797973633,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.59296,
"step": 106,
"tokens/total": 217088,
"tokens/train_per_sec_per_gpu": 134.03,
"tokens/trainable": 215522
},
{
"epoch": 1.0725806451612903,
"grad_norm": 0.8612887263298035,
"learning_rate": 0.00017829585144130356,
"loss": 2.0418860912323,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.70513,
"step": 107,
"tokens/total": 219136,
"tokens/train_per_sec_per_gpu": 134.46,
"tokens/trainable": 217548
},
{
"epoch": 1.0745967741935485,
"grad_norm": 0.7575510740280151,
"learning_rate": 0.0001779002708807662,
"loss": 1.6365363597869873,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.13734,
"step": 108,
"tokens/total": 221184,
"tokens/train_per_sec_per_gpu": 137.71,
"tokens/trainable": 219593
},
{
"epoch": 1.0766129032258065,
"grad_norm": 0.8436228632926941,
"learning_rate": 0.0001775015651483459,
"loss": 2.169118881225586,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.75057,
"step": 109,
"tokens/total": 223232,
"tokens/train_per_sec_per_gpu": 134.87,
"tokens/trainable": 221619
},
{
"epoch": 1.0786290322580645,
"grad_norm": 0.7953697443008423,
"learning_rate": 0.00017709975023915949,
"loss": 1.9425008296966553,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.97618,
"step": 110,
"tokens/total": 225280,
"tokens/train_per_sec_per_gpu": 137.3,
"tokens/trainable": 223657
},
{
"epoch": 1.0806451612903225,
"grad_norm": 0.7815860509872437,
"learning_rate": 0.0001766948422730567,
"loss": 1.682219386100769,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.37748,
"step": 111,
"tokens/total": 227328,
"tokens/train_per_sec_per_gpu": 132.35,
"tokens/trainable": 225684
},
{
"epoch": 1.0826612903225807,
"grad_norm": 0.9288930892944336,
"learning_rate": 0.0001762868574939732,
"loss": 2.3112406730651855,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 10.08693,
"step": 112,
"tokens/total": 229376,
"tokens/train_per_sec_per_gpu": 135.23,
"tokens/trainable": 227725
},
{
"epoch": 1.0846774193548387,
"grad_norm": 0.8349924087524414,
"learning_rate": 0.0001758758122692791,
"loss": 1.882235050201416,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.56817,
"step": 113,
"tokens/total": 231424,
"tokens/train_per_sec_per_gpu": 134.29,
"tokens/trainable": 229762
},
{
"epoch": 1.0866935483870968,
"grad_norm": 0.7674592137336731,
"learning_rate": 0.00017546172308912213,
"loss": 1.8356249332427979,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.26905,
"step": 114,
"tokens/total": 233472,
"tokens/train_per_sec_per_gpu": 135.38,
"tokens/trainable": 231797
},
{
"epoch": 1.0887096774193548,
"grad_norm": 0.7179352641105652,
"learning_rate": 0.00017504460656576627,
"loss": 1.7059075832366943,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.50638,
"step": 115,
"tokens/total": 235520,
"tokens/train_per_sec_per_gpu": 134.51,
"tokens/trainable": 233832
},
{
"epoch": 1.090725806451613,
"grad_norm": 0.8075538873672485,
"learning_rate": 0.0001746244794329252,
"loss": 2.1125102043151855,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.26897,
"step": 116,
"tokens/total": 237568,
"tokens/train_per_sec_per_gpu": 132.98,
"tokens/trainable": 235856
},
{
"epoch": 1.092741935483871,
"grad_norm": 0.735394299030304,
"learning_rate": 0.0001742013585450911,
"loss": 1.6673572063446045,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.29815,
"step": 117,
"tokens/total": 239616,
"tokens/train_per_sec_per_gpu": 133.83,
"tokens/trainable": 237885
},
{
"epoch": 1.094758064516129,
"grad_norm": 0.7311357259750366,
"learning_rate": 0.00017377526087685832,
"loss": 1.642756462097168,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.1694,
"step": 118,
"tokens/total": 241664,
"tokens/train_per_sec_per_gpu": 135.95,
"tokens/trainable": 239928
},
{
"epoch": 1.096774193548387,
"grad_norm": 0.7718885540962219,
"learning_rate": 0.0001733462035222426,
"loss": 1.8600523471832275,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.42407,
"step": 119,
"tokens/total": 243712,
"tokens/train_per_sec_per_gpu": 133.17,
"tokens/trainable": 241964
},
{
"epoch": 1.0987903225806452,
"grad_norm": 0.7755111455917358,
"learning_rate": 0.0001729142036939951,
"loss": 1.783015489578247,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.94776,
"step": 120,
"tokens/total": 245760,
"tokens/train_per_sec_per_gpu": 134.83,
"tokens/trainable": 243989
},
{
"epoch": 1.1008064516129032,
"grad_norm": 0.7631067633628845,
"learning_rate": 0.000172479278722912,
"loss": 1.6326531171798706,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.11743,
"step": 121,
"tokens/total": 247808,
"tokens/train_per_sec_per_gpu": 133.96,
"tokens/trainable": 246025
},
{
"epoch": 1.1028225806451613,
"grad_norm": 0.6940619349479675,
"learning_rate": 0.0001720414460571392,
"loss": 1.6045302152633667,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.97552,
"step": 122,
"tokens/total": 249856,
"tokens/train_per_sec_per_gpu": 131.93,
"tokens/trainable": 248049
},
{
"epoch": 1.1048387096774193,
"grad_norm": 0.7802858948707581,
"learning_rate": 0.0001716007232614723,
"loss": 1.4858357906341553,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.41866,
"step": 123,
"tokens/total": 251904,
"tokens/train_per_sec_per_gpu": 132.63,
"tokens/trainable": 250089
},
{
"epoch": 1.1068548387096775,
"grad_norm": 0.872261643409729,
"learning_rate": 0.000171157128016652,
"loss": 1.9680781364440918,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.15691,
"step": 124,
"tokens/total": 253952,
"tokens/train_per_sec_per_gpu": 134.69,
"tokens/trainable": 252120
},
{
"epoch": 1.1088709677419355,
"grad_norm": 0.814172625541687,
"learning_rate": 0.00017071067811865476,
"loss": 1.9667139053344727,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.14715,
"step": 125,
"tokens/total": 256000,
"tokens/train_per_sec_per_gpu": 131.2,
"tokens/trainable": 254151
},
{
"epoch": 1.1108870967741935,
"grad_norm": 0.7579799890518188,
"learning_rate": 0.0001702613914779789,
"loss": 1.8390204906463623,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.29037,
"step": 126,
"tokens/total": 258048,
"tokens/train_per_sec_per_gpu": 130.65,
"tokens/trainable": 256197
},
{
"epoch": 1.1129032258064515,
"grad_norm": 0.89185631275177,
"learning_rate": 0.0001698092861189259,
"loss": 2.3891618251800537,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 10.90435,
"step": 127,
"tokens/total": 260096,
"tokens/train_per_sec_per_gpu": 131.31,
"tokens/trainable": 258228
},
{
"epoch": 1.1149193548387097,
"grad_norm": 0.8794082999229431,
"learning_rate": 0.00016935438017887772,
"loss": 2.1794254779815674,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.84123,
"step": 128,
"tokens/total": 262144,
"tokens/train_per_sec_per_gpu": 132.45,
"tokens/trainable": 260247
},
{
"epoch": 1.1169354838709677,
"grad_norm": 0.7506632804870605,
"learning_rate": 0.00016889669190756868,
"loss": 1.693035364151001,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.43596,
"step": 129,
"tokens/total": 264192,
"tokens/train_per_sec_per_gpu": 133.97,
"tokens/trainable": 262269
},
{
"epoch": 1.1189516129032258,
"grad_norm": 0.718761146068573,
"learning_rate": 0.00016843623966635366,
"loss": 1.2158582210540771,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.37319,
"step": 130,
"tokens/total": 266240,
"tokens/train_per_sec_per_gpu": 129.03,
"tokens/trainable": 264279
},
{
"epoch": 1.120967741935484,
"grad_norm": 0.9651544690132141,
"learning_rate": 0.0001679730419274713,
"loss": 2.0829291343688965,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.02795,
"step": 131,
"tokens/total": 268288,
"tokens/train_per_sec_per_gpu": 126.71,
"tokens/trainable": 266304
},
{
"epoch": 1.122983870967742,
"grad_norm": 0.833846390247345,
"learning_rate": 0.0001675071172733031,
"loss": 1.8694937229156494,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.48501,
"step": 132,
"tokens/total": 270336,
"tokens/train_per_sec_per_gpu": 128.42,
"tokens/trainable": 268327
},
{
"epoch": 1.125,
"grad_norm": 0.8215437531471252,
"learning_rate": 0.00016703848439562785,
"loss": 1.878624677658081,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.5445,
"step": 133,
"tokens/total": 272384,
"tokens/train_per_sec_per_gpu": 130.51,
"tokens/trainable": 270345
},
{
"epoch": 1.127016129032258,
"grad_norm": 0.8171601891517639,
"learning_rate": 0.00016656716209487174,
"loss": 1.9828517436981201,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.26343,
"step": 134,
"tokens/total": 274432,
"tokens/train_per_sec_per_gpu": 134.93,
"tokens/trainable": 272374
},
{
"epoch": 1.129032258064516,
"grad_norm": 0.966566801071167,
"learning_rate": 0.0001660931692793541,
"loss": 2.0265626907348633,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.58796,
"step": 135,
"tokens/total": 276480,
"tokens/train_per_sec_per_gpu": 133.85,
"tokens/trainable": 274398
},
{
"epoch": 1.1310483870967742,
"grad_norm": 0.9244574904441833,
"learning_rate": 0.000165616524964529,
"loss": 2.0933644771575928,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.11216,
"step": 136,
"tokens/total": 278528,
"tokens/train_per_sec_per_gpu": 10.87,
"tokens/trainable": 276439
},
{
"epoch": 1.1330645161290323,
"grad_norm": 0.8794076442718506,
"learning_rate": 0.00016513724827222227,
"loss": 2.0877299308776855,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.06658,
"step": 137,
"tokens/total": 280576,
"tokens/train_per_sec_per_gpu": 65.05,
"tokens/trainable": 278477
},
{
"epoch": 1.1350806451612903,
"grad_norm": 0.8859477639198303,
"learning_rate": 0.00016465535842986434,
"loss": 2.188518524169922,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.92199,
"step": 138,
"tokens/total": 282624,
"tokens/train_per_sec_per_gpu": 84.63,
"tokens/trainable": 280494
},
{
"epoch": 1.1370967741935485,
"grad_norm": 0.8838194012641907,
"learning_rate": 0.000164170874769719,
"loss": 1.8421143293380737,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.30987,
"step": 139,
"tokens/total": 284672,
"tokens/train_per_sec_per_gpu": 87.05,
"tokens/trainable": 282504
},
{
"epoch": 1.1391129032258065,
"grad_norm": 0.7671190500259399,
"learning_rate": 0.00016368381672810786,
"loss": 1.5053142309188843,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.50557,
"step": 140,
"tokens/total": 286720,
"tokens/train_per_sec_per_gpu": 87.13,
"tokens/trainable": 284532
},
{
"epoch": 1.1411290322580645,
"grad_norm": 1.073824167251587,
"learning_rate": 0.0001631942038446304,
"loss": 2.0518157482147217,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.78202,
"step": 141,
"tokens/total": 288768,
"tokens/train_per_sec_per_gpu": 86.56,
"tokens/trainable": 286491
},
{
"epoch": 1.1431451612903225,
"grad_norm": 0.9559262990951538,
"learning_rate": 0.00016270205576138032,
"loss": 1.7970073223114014,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.03157,
"step": 142,
"tokens/total": 290816,
"tokens/train_per_sec_per_gpu": 63.19,
"tokens/trainable": 287984
},
{
"epoch": 2.002016129032258,
"grad_norm": 0.9580713510513306,
"learning_rate": 0.00016220739222215738,
"loss": 2.409911632537842,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 11.13298,
"step": 143,
"tokens/total": 292864,
"tokens/train_per_sec_per_gpu": 84.37,
"tokens/trainable": 290021
},
{
"epoch": 2.004032258064516,
"grad_norm": 0.7835400700569153,
"learning_rate": 0.00016171023307167545,
"loss": 1.5976440906524658,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.94138,
"step": 144,
"tokens/total": 294912,
"tokens/train_per_sec_per_gpu": 88.19,
"tokens/trainable": 292069
},
{
"epoch": 2.006048387096774,
"grad_norm": 0.8442096710205078,
"learning_rate": 0.0001612105982547663,
"loss": 1.6255199909210205,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.08106,
"step": 145,
"tokens/total": 296960,
"tokens/train_per_sec_per_gpu": 92.67,
"tokens/trainable": 294108
},
{
"epoch": 2.0080645161290325,
"grad_norm": 0.7924631834030151,
"learning_rate": 0.00016070850781557948,
"loss": 1.660654067993164,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.26275,
"step": 146,
"tokens/total": 299008,
"tokens/train_per_sec_per_gpu": 86.3,
"tokens/trainable": 296150
},
{
"epoch": 2.0100806451612905,
"grad_norm": 0.8508074283599854,
"learning_rate": 0.0001602039818967783,
"loss": 1.522460699081421,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.58349,
"step": 147,
"tokens/total": 301056,
"tokens/train_per_sec_per_gpu": 90.44,
"tokens/trainable": 298194
},
{
"epoch": 2.0120967741935485,
"grad_norm": 0.8621785044670105,
"learning_rate": 0.00015969704073873157,
"loss": 1.6797025203704834,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.36396,
"step": 148,
"tokens/total": 303104,
"tokens/train_per_sec_per_gpu": 88.14,
"tokens/trainable": 300237
},
{
"epoch": 2.0141129032258065,
"grad_norm": 0.9323299527168274,
"learning_rate": 0.0001591877046787017,
"loss": 1.7540191411972046,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.77778,
"step": 149,
"tokens/total": 305152,
"tokens/train_per_sec_per_gpu": 89.25,
"tokens/trainable": 302278
},
{
"epoch": 2.0161290322580645,
"grad_norm": 0.9931126832962036,
"learning_rate": 0.00015867599415002895,
"loss": 1.766423225402832,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.84989,
"step": 150,
"tokens/total": 307200,
"tokens/train_per_sec_per_gpu": 89.97,
"tokens/trainable": 304305
},
{
"epoch": 2.0181451612903225,
"grad_norm": 1.1118701696395874,
"learning_rate": 0.00015816192968131138,
"loss": 2.2324070930480957,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 9.32228,
"step": 151,
"tokens/total": 309248,
"tokens/train_per_sec_per_gpu": 84.81,
"tokens/trainable": 306352
},
{
"epoch": 2.0201612903225805,
"grad_norm": 1.1531903743743896,
"learning_rate": 0.0001576455318955816,
"loss": 2.0641446113586426,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.87856,
"step": 152,
"tokens/total": 311296,
"tokens/train_per_sec_per_gpu": 91.08,
"tokens/trainable": 308400
},
{
"epoch": 2.0221774193548385,
"grad_norm": 0.9435677528381348,
"learning_rate": 0.00015712682150947923,
"loss": 1.8673259019851685,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.47097,
"step": 153,
"tokens/total": 313344,
"tokens/train_per_sec_per_gpu": 90.76,
"tokens/trainable": 310436
},
{
"epoch": 2.024193548387097,
"grad_norm": 0.8857016563415527,
"learning_rate": 0.00015660581933241993,
"loss": 1.4891250133514404,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.43321,
"step": 154,
"tokens/total": 315392,
"tokens/train_per_sec_per_gpu": 91.7,
"tokens/trainable": 312479
},
{
"epoch": 2.026209677419355,
"grad_norm": 1.0294502973556519,
"learning_rate": 0.00015608254626576048,
"loss": 2.35011625289917,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 10.48679,
"step": 155,
"tokens/total": 317440,
"tokens/train_per_sec_per_gpu": 91.75,
"tokens/trainable": 314520
},
{
"epoch": 2.028225806451613,
"grad_norm": 0.9770046472549438,
"learning_rate": 0.00015555702330196023,
"loss": 1.99894118309021,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.38124,
"step": 156,
"tokens/total": 319488,
"tokens/train_per_sec_per_gpu": 89.42,
"tokens/trainable": 316550
},
{
"epoch": 2.030241935483871,
"grad_norm": 0.8702968955039978,
"learning_rate": 0.00015502927152373914,
"loss": 1.7607452869415283,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.81677,
"step": 157,
"tokens/total": 321536,
"tokens/train_per_sec_per_gpu": 86.57,
"tokens/trainable": 318590
},
{
"epoch": 2.032258064516129,
"grad_norm": 0.8458526134490967,
"learning_rate": 0.0001544993121032318,
"loss": 1.7635177373886108,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.83292,
"step": 158,
"tokens/total": 323584,
"tokens/train_per_sec_per_gpu": 95.81,
"tokens/trainable": 320630
},
{
"epoch": 2.034274193548387,
"grad_norm": 0.9898233413696289,
"learning_rate": 0.000153967166301138,
"loss": 1.5542548894882202,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.73156,
"step": 159,
"tokens/total": 325632,
"tokens/train_per_sec_per_gpu": 90.09,
"tokens/trainable": 322660
},
{
"epoch": 2.036290322580645,
"grad_norm": 1.072844386100769,
"learning_rate": 0.00015343285546587013,
"loss": 1.4093410968780518,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.09326,
"step": 160,
"tokens/total": 327680,
"tokens/train_per_sec_per_gpu": 86.27,
"tokens/trainable": 324691
},
{
"epoch": 2.038306451612903,
"grad_norm": 0.9787235260009766,
"learning_rate": 0.00015289640103269625,
"loss": 1.877701997756958,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.53846,
"step": 161,
"tokens/total": 329728,
"tokens/train_per_sec_per_gpu": 87.58,
"tokens/trainable": 326732
},
{
"epoch": 2.0403225806451615,
"grad_norm": 0.8260992765426636,
"learning_rate": 0.00015235782452288068,
"loss": 1.4073071479797363,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.08494,
"step": 162,
"tokens/total": 331776,
"tokens/train_per_sec_per_gpu": 91.15,
"tokens/trainable": 328766
},
{
"epoch": 2.0423387096774195,
"grad_norm": 0.9031779766082764,
"learning_rate": 0.0001518171475428202,
"loss": 1.6873669624328613,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.40523,
"step": 163,
"tokens/total": 333824,
"tokens/train_per_sec_per_gpu": 87.34,
"tokens/trainable": 330799
},
{
"epoch": 2.0443548387096775,
"grad_norm": 0.9085680842399597,
"learning_rate": 0.00015127439178317745,
"loss": 1.860163688659668,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.42479,
"step": 164,
"tokens/total": 335872,
"tokens/train_per_sec_per_gpu": 88.02,
"tokens/trainable": 332832
},
{
"epoch": 2.0463709677419355,
"grad_norm": 1.0631558895111084,
"learning_rate": 0.00015072957901801076,
"loss": 1.7626943588256836,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.82812,
"step": 165,
"tokens/total": 337920,
"tokens/train_per_sec_per_gpu": 96.08,
"tokens/trainable": 334880
},
{
"epoch": 2.0483870967741935,
"grad_norm": 0.9707496166229248,
"learning_rate": 0.0001501827311039005,
"loss": 1.3545994758605957,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.87521,
"step": 166,
"tokens/total": 339968,
"tokens/train_per_sec_per_gpu": 95.22,
"tokens/trainable": 336918
},
{
"epoch": 2.0504032258064515,
"grad_norm": 0.9129533767700195,
"learning_rate": 0.0001496338699790724,
"loss": 1.9540798664093018,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.05742,
"step": 167,
"tokens/total": 342016,
"tokens/train_per_sec_per_gpu": 94.08,
"tokens/trainable": 338950
},
{
"epoch": 2.0524193548387095,
"grad_norm": 1.0040849447250366,
"learning_rate": 0.00014908301766251739,
"loss": 1.9150488376617432,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.78727,
"step": 168,
"tokens/total": 344064,
"tokens/train_per_sec_per_gpu": 93.34,
"tokens/trainable": 340985
},
{
"epoch": 2.0544354838709675,
"grad_norm": 0.892105221748352,
"learning_rate": 0.00014853019625310813,
"loss": 1.5278139114379883,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.60809,
"step": 169,
"tokens/total": 346112,
"tokens/train_per_sec_per_gpu": 91.28,
"tokens/trainable": 343029
},
{
"epoch": 2.056451612903226,
"grad_norm": 0.9372109174728394,
"learning_rate": 0.00014797542792871265,
"loss": 1.9480211734771729,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.01479,
"step": 170,
"tokens/total": 348160,
"tokens/train_per_sec_per_gpu": 89.14,
"tokens/trainable": 345066
},
{
"epoch": 2.058467741935484,
"grad_norm": 0.9851438403129578,
"learning_rate": 0.0001474187349453045,
"loss": 2.036619186401367,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.66465,
"step": 171,
"tokens/total": 350208,
"tokens/train_per_sec_per_gpu": 89.33,
"tokens/trainable": 347097
},
{
"epoch": 2.060483870967742,
"grad_norm": 1.0404151678085327,
"learning_rate": 0.00014686013963607,
"loss": 1.9593498706817627,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.09471,
"step": 172,
"tokens/total": 352256,
"tokens/train_per_sec_per_gpu": 90.31,
"tokens/trainable": 349138
},
{
"epoch": 2.0625,
"grad_norm": 0.9579296708106995,
"learning_rate": 0.00014629966441051208,
"loss": 1.647827386856079,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.19568,
"step": 173,
"tokens/total": 354304,
"tokens/train_per_sec_per_gpu": 88.57,
"tokens/trainable": 351184
},
{
"epoch": 2.064516129032258,
"grad_norm": 0.9718630909919739,
"learning_rate": 0.0001457373317535515,
"loss": 1.6372270584106445,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.14089,
"step": 174,
"tokens/total": 356352,
"tokens/train_per_sec_per_gpu": 84.71,
"tokens/trainable": 353227
},
{
"epoch": 2.066532258064516,
"grad_norm": 0.9516034126281738,
"learning_rate": 0.0001451731642246247,
"loss": 1.6231439113616943,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.069,
"step": 175,
"tokens/total": 358400,
"tokens/train_per_sec_per_gpu": 89.05,
"tokens/trainable": 355254
},
{
"epoch": 2.068548387096774,
"grad_norm": 0.9384471774101257,
"learning_rate": 0.00014460718445677876,
"loss": 1.7225630283355713,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.59886,
"step": 176,
"tokens/total": 360448,
"tokens/train_per_sec_per_gpu": 88.75,
"tokens/trainable": 357296
},
{
"epoch": 2.0705645161290325,
"grad_norm": 0.9477818012237549,
"learning_rate": 0.00014403941515576344,
"loss": 1.3852624893188477,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.99587,
"step": 177,
"tokens/total": 362496,
"tokens/train_per_sec_per_gpu": 92.17,
"tokens/trainable": 359324
},
{
"epoch": 2.0725806451612905,
"grad_norm": 1.0369067192077637,
"learning_rate": 0.00014346987909912023,
"loss": 1.883034348487854,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.57342,
"step": 178,
"tokens/total": 364544,
"tokens/train_per_sec_per_gpu": 92.07,
"tokens/trainable": 361370
},
{
"epoch": 2.0745967741935485,
"grad_norm": 1.0867875814437866,
"learning_rate": 0.00014289859913526874,
"loss": 2.1149420738220215,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.28911,
"step": 179,
"tokens/total": 366592,
"tokens/train_per_sec_per_gpu": 135.2,
"tokens/trainable": 363411
},
{
"epoch": 2.0766129032258065,
"grad_norm": 1.0646148920059204,
"learning_rate": 0.00014232559818258984,
"loss": 1.6198821067810059,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.05249,
"step": 180,
"tokens/total": 368640,
"tokens/train_per_sec_per_gpu": 135.66,
"tokens/trainable": 365443
},
{
"epoch": 2.0786290322580645,
"grad_norm": 0.9128944873809814,
"learning_rate": 0.00014175089922850633,
"loss": 1.5648690462112427,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.78205,
"step": 181,
"tokens/total": 370688,
"tokens/train_per_sec_per_gpu": 133.95,
"tokens/trainable": 367472
},
{
"epoch": 2.0806451612903225,
"grad_norm": 1.1343512535095215,
"learning_rate": 0.00014117452532856083,
"loss": 1.924842119216919,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.85407,
"step": 182,
"tokens/total": 372736,
"tokens/train_per_sec_per_gpu": 136.2,
"tokens/trainable": 369508
},
{
"epoch": 2.0826612903225805,
"grad_norm": 0.9804732799530029,
"learning_rate": 0.0001405964996054907,
"loss": 2.0507593154907227,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.7738,
"step": 183,
"tokens/total": 374784,
"tokens/train_per_sec_per_gpu": 133.13,
"tokens/trainable": 371540
},
{
"epoch": 2.0846774193548385,
"grad_norm": 1.1312826871871948,
"learning_rate": 0.00014001684524830057,
"loss": 1.557239055633545,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.7457,
"step": 184,
"tokens/total": 376832,
"tokens/train_per_sec_per_gpu": 136.66,
"tokens/trainable": 373580
},
{
"epoch": 2.086693548387097,
"grad_norm": 0.9265478253364563,
"learning_rate": 0.00013943558551133186,
"loss": 1.7553461790084839,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.78545,
"step": 185,
"tokens/total": 378880,
"tokens/train_per_sec_per_gpu": 135.57,
"tokens/trainable": 375627
},
{
"epoch": 2.088709677419355,
"grad_norm": 1.0380595922470093,
"learning_rate": 0.00013885274371333,
"loss": 1.7150152921676636,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.55676,
"step": 186,
"tokens/total": 380928,
"tokens/train_per_sec_per_gpu": 132.35,
"tokens/trainable": 377649
},
{
"epoch": 2.090725806451613,
"grad_norm": 1.0621381998062134,
"learning_rate": 0.000138268343236509,
"loss": 1.9849774837493896,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.27888,
"step": 187,
"tokens/total": 382976,
"tokens/train_per_sec_per_gpu": 131.15,
"tokens/trainable": 379649
},
{
"epoch": 2.092741935483871,
"grad_norm": 0.8717451691627502,
"learning_rate": 0.00013768240752561314,
"loss": 1.2543466091156006,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.50555,
"step": 188,
"tokens/total": 385024,
"tokens/train_per_sec_per_gpu": 134.1,
"tokens/trainable": 381680
},
{
"epoch": 2.094758064516129,
"grad_norm": 0.9604067206382751,
"learning_rate": 0.0001370949600869768,
"loss": 1.8013954162597656,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.0581,
"step": 189,
"tokens/total": 387072,
"tokens/train_per_sec_per_gpu": 132.56,
"tokens/trainable": 383719
},
{
"epoch": 2.096774193548387,
"grad_norm": 0.8730901479721069,
"learning_rate": 0.00013650602448758112,
"loss": 1.5253915786743164,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.59694,
"step": 190,
"tokens/total": 389120,
"tokens/train_per_sec_per_gpu": 131.79,
"tokens/trainable": 385752
},
{
"epoch": 2.098790322580645,
"grad_norm": 0.884779691696167,
"learning_rate": 0.0001359156243541087,
"loss": 1.194589376449585,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.3022,
"step": 191,
"tokens/total": 391168,
"tokens/train_per_sec_per_gpu": 134.36,
"tokens/trainable": 387786
},
{
"epoch": 2.100806451612903,
"grad_norm": 0.9849578142166138,
"learning_rate": 0.00013532378337199582,
"loss": 1.355954885482788,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.88046,
"step": 192,
"tokens/total": 393216,
"tokens/train_per_sec_per_gpu": 131.67,
"tokens/trainable": 389810
},
{
"epoch": 2.1028225806451615,
"grad_norm": 1.015483021736145,
"learning_rate": 0.00013473052528448201,
"loss": 1.7660787105560303,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.84788,
"step": 193,
"tokens/total": 395264,
"tokens/train_per_sec_per_gpu": 131.12,
"tokens/trainable": 391846
},
{
"epoch": 2.1048387096774195,
"grad_norm": 1.1974010467529297,
"learning_rate": 0.00013413587389165784,
"loss": 2.0104124546051025,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.4664,
"step": 194,
"tokens/total": 397312,
"tokens/train_per_sec_per_gpu": 132.58,
"tokens/trainable": 393887
},
{
"epoch": 2.1068548387096775,
"grad_norm": 0.8975300788879395,
"learning_rate": 0.00013353985304950973,
"loss": 1.717996597290039,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.57335,
"step": 195,
"tokens/total": 399360,
"tokens/train_per_sec_per_gpu": 131.46,
"tokens/trainable": 395920
},
{
"epoch": 2.1088709677419355,
"grad_norm": 0.9136057496070862,
"learning_rate": 0.00013294248666896328,
"loss": 1.597192406654358,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.93915,
"step": 196,
"tokens/total": 401408,
"tokens/train_per_sec_per_gpu": 132.82,
"tokens/trainable": 397953
},
{
"epoch": 2.1108870967741935,
"grad_norm": 0.8866783380508423,
"learning_rate": 0.0001323437987149238,
"loss": 1.5093090534210205,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.5236,
"step": 197,
"tokens/total": 403456,
"tokens/train_per_sec_per_gpu": 131.45,
"tokens/trainable": 399989
},
{
"epoch": 2.1129032258064515,
"grad_norm": 0.9615466594696045,
"learning_rate": 0.00013174381320531505,
"loss": 1.4923886060714722,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.44771,
"step": 198,
"tokens/total": 405504,
"tokens/train_per_sec_per_gpu": 137.67,
"tokens/trainable": 402007
},
{
"epoch": 2.1149193548387095,
"grad_norm": 1.0945857763290405,
"learning_rate": 0.0001311425542101154,
"loss": 1.8227579593658447,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.1889,
"step": 199,
"tokens/total": 407552,
"tokens/train_per_sec_per_gpu": 137.19,
"tokens/trainable": 404045
},
{
"epoch": 2.1169354838709675,
"grad_norm": 1.0081984996795654,
"learning_rate": 0.00013054004585039258,
"loss": 2.030510663986206,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.61798,
"step": 200,
"tokens/total": 409600,
"tokens/train_per_sec_per_gpu": 135.71,
"tokens/trainable": 406039
},
{
"epoch": 2.118951612903226,
"grad_norm": 0.9970818161964417,
"learning_rate": 0.00012993631229733582,
"loss": 1.589468002319336,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.90114,
"step": 201,
"tokens/total": 411648,
"tokens/train_per_sec_per_gpu": 134.62,
"tokens/trainable": 408035
},
{
"epoch": 2.120967741935484,
"grad_norm": 1.0629839897155762,
"learning_rate": 0.00012933137777128607,
"loss": 1.8885078430175781,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.6095,
"step": 202,
"tokens/total": 413696,
"tokens/train_per_sec_per_gpu": 137.54,
"tokens/trainable": 410049
},
{
"epoch": 2.122983870967742,
"grad_norm": 1.0215117931365967,
"learning_rate": 0.0001287252665407645,
"loss": 1.3511649370193481,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.86192,
"step": 203,
"tokens/total": 415744,
"tokens/train_per_sec_per_gpu": 136.73,
"tokens/trainable": 412060
},
{
"epoch": 2.125,
"grad_norm": 0.9783656597137451,
"learning_rate": 0.0001281180029214988,
"loss": 1.569549798965454,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.80448,
"step": 204,
"tokens/total": 417792,
"tokens/train_per_sec_per_gpu": 134.42,
"tokens/trainable": 414059
},
{
"epoch": 2.127016129032258,
"grad_norm": 0.9745059609413147,
"learning_rate": 0.0001275096112754478,
"loss": 1.5478930473327637,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.70155,
"step": 205,
"tokens/total": 419840,
"tokens/train_per_sec_per_gpu": 136.01,
"tokens/trainable": 416092
},
{
"epoch": 2.129032258064516,
"grad_norm": 1.0262490510940552,
"learning_rate": 0.000126900116009824,
"loss": 1.683368444442749,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.38366,
"step": 206,
"tokens/total": 421888,
"tokens/train_per_sec_per_gpu": 136.5,
"tokens/trainable": 418128
},
{
"epoch": 2.131048387096774,
"grad_norm": 0.885127604007721,
"learning_rate": 0.0001262895415761145,
"loss": 1.145145058631897,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.1429,
"step": 207,
"tokens/total": 423936,
"tokens/train_per_sec_per_gpu": 135.34,
"tokens/trainable": 420138
},
{
"epoch": 2.133064516129032,
"grad_norm": 1.0641528367996216,
"learning_rate": 0.00012567791246909994,
"loss": 1.5435967445373535,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.6814,
"step": 208,
"tokens/total": 425984,
"tokens/train_per_sec_per_gpu": 135.8,
"tokens/trainable": 422149
},
{
"epoch": 2.1350806451612905,
"grad_norm": 1.146147608757019,
"learning_rate": 0.00012506525322587207,
"loss": 1.7838163375854492,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.95253,
"step": 209,
"tokens/total": 428032,
"tokens/train_per_sec_per_gpu": 136.69,
"tokens/trainable": 424159
},
{
"epoch": 2.1370967741935485,
"grad_norm": 1.203956961631775,
"learning_rate": 0.0001244515884248491,
"loss": 1.9419140815734863,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.97208,
"step": 210,
"tokens/total": 430080,
"tokens/train_per_sec_per_gpu": 140.93,
"tokens/trainable": 426199
},
{
"epoch": 2.1391129032258065,
"grad_norm": 1.015429139137268,
"learning_rate": 0.00012383694268478993,
"loss": 1.591796875,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.91257,
"step": 211,
"tokens/total": 432128,
"tokens/train_per_sec_per_gpu": 139.89,
"tokens/trainable": 428188
},
{
"epoch": 2.1411290322580645,
"grad_norm": 1.0179929733276367,
"learning_rate": 0.0001232213406638062,
"loss": 1.5322093963623047,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.62839,
"step": 212,
"tokens/total": 434176,
"tokens/train_per_sec_per_gpu": 134.26,
"tokens/trainable": 430120
},
{
"epoch": 2.1431451612903225,
"grad_norm": 3.186521530151367,
"learning_rate": 0.0001226048070583735,
"loss": 1.9480620622634888,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.01508,
"step": 213,
"tokens/total": 436224,
"tokens/train_per_sec_per_gpu": 38.63,
"tokens/trainable": 430680
},
{
"epoch": 3.002016129032258,
"grad_norm": 0.9344216585159302,
"learning_rate": 0.00012198736660234009,
"loss": 1.3694523572921753,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.9332,
"step": 214,
"tokens/total": 438272,
"tokens/train_per_sec_per_gpu": 139.4,
"tokens/trainable": 432721
},
{
"epoch": 3.004032258064516,
"grad_norm": 1.053391456604004,
"learning_rate": 0.00012136904406593507,
"loss": 1.7003636360168457,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.47594,
"step": 215,
"tokens/total": 440320,
"tokens/train_per_sec_per_gpu": 141.09,
"tokens/trainable": 434759
},
{
"epoch": 3.006048387096774,
"grad_norm": 1.0795952081680298,
"learning_rate": 0.00012074986425477445,
"loss": 1.7129063606262207,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.54505,
"step": 216,
"tokens/total": 442368,
"tokens/train_per_sec_per_gpu": 140.09,
"tokens/trainable": 436801
},
{
"epoch": 3.0080645161290325,
"grad_norm": 0.9860684275627136,
"learning_rate": 0.00012012985200886602,
"loss": 1.1881208419799805,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.28091,
"step": 217,
"tokens/total": 444416,
"tokens/train_per_sec_per_gpu": 142.8,
"tokens/trainable": 438848
},
{
"epoch": 3.0100806451612905,
"grad_norm": 1.1295051574707031,
"learning_rate": 0.00011950903220161285,
"loss": 1.8362010717391968,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.27266,
"step": 218,
"tokens/total": 446464,
"tokens/train_per_sec_per_gpu": 142.72,
"tokens/trainable": 440887
},
{
"epoch": 3.0120967741935485,
"grad_norm": 1.1178804636001587,
"learning_rate": 0.00011888742973881543,
"loss": 1.6956043243408203,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.44994,
"step": 219,
"tokens/total": 448512,
"tokens/train_per_sec_per_gpu": 142.15,
"tokens/trainable": 442922
},
{
"epoch": 3.0141129032258065,
"grad_norm": 1.0459589958190918,
"learning_rate": 0.00011826506955767258,
"loss": 1.6698713302612305,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.31148,
"step": 220,
"tokens/total": 450560,
"tokens/train_per_sec_per_gpu": 140.62,
"tokens/trainable": 444956
},
{
"epoch": 3.0161290322580645,
"grad_norm": 1.1543166637420654,
"learning_rate": 0.00011764197662578086,
"loss": 1.579270839691162,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.85142,
"step": 221,
"tokens/total": 452608,
"tokens/train_per_sec_per_gpu": 140.09,
"tokens/trainable": 446986
},
{
"epoch": 3.0181451612903225,
"grad_norm": 1.282638430595398,
"learning_rate": 0.00011701817594013312,
"loss": 1.6817176342010498,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.37478,
"step": 222,
"tokens/total": 454656,
"tokens/train_per_sec_per_gpu": 143.82,
"tokens/trainable": 449023
},
{
"epoch": 3.0201612903225805,
"grad_norm": 1.0392447710037231,
"learning_rate": 0.00011639369252611552,
"loss": 1.2293877601623535,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.41914,
"step": 223,
"tokens/total": 456704,
"tokens/train_per_sec_per_gpu": 141.07,
"tokens/trainable": 451053
},
{
"epoch": 3.0221774193548385,
"grad_norm": 1.2307904958724976,
"learning_rate": 0.00011576855143650371,
"loss": 1.8099391460418701,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.11008,
"step": 224,
"tokens/total": 458752,
"tokens/train_per_sec_per_gpu": 141.68,
"tokens/trainable": 453087
},
{
"epoch": 3.024193548387097,
"grad_norm": 1.2422001361846924,
"learning_rate": 0.00011514277775045768,
"loss": 1.6516501903533936,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.21558,
"step": 225,
"tokens/total": 460800,
"tokens/train_per_sec_per_gpu": 138.34,
"tokens/trainable": 455113
},
{
"epoch": 3.026209677419355,
"grad_norm": 1.1288433074951172,
"learning_rate": 0.00011451639657251563,
"loss": 1.3742070198059082,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.95194,
"step": 226,
"tokens/total": 462848,
"tokens/train_per_sec_per_gpu": 140.84,
"tokens/trainable": 457161
},
{
"epoch": 3.028225806451613,
"grad_norm": 1.1234395503997803,
"learning_rate": 0.00011388943303158693,
"loss": 1.5823338031768799,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.8663,
"step": 227,
"tokens/total": 464896,
"tokens/train_per_sec_per_gpu": 139.68,
"tokens/trainable": 459193
},
{
"epoch": 3.030241935483871,
"grad_norm": 1.0792587995529175,
"learning_rate": 0.00011326191227994391,
"loss": 1.3610866069793701,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.90043,
"step": 228,
"tokens/total": 466944,
"tokens/train_per_sec_per_gpu": 141.05,
"tokens/trainable": 461236
},
{
"epoch": 3.032258064516129,
"grad_norm": 1.1624300479888916,
"learning_rate": 0.00011263385949221295,
"loss": 1.6541019678115845,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.22838,
"step": 229,
"tokens/total": 468992,
"tokens/train_per_sec_per_gpu": 138.91,
"tokens/trainable": 463270
},
{
"epoch": 3.034274193548387,
"grad_norm": 1.2286807298660278,
"learning_rate": 0.0001120052998643643,
"loss": 1.3511790037155151,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.86198,
"step": 230,
"tokens/total": 471040,
"tokens/train_per_sec_per_gpu": 139.55,
"tokens/trainable": 465306
},
{
"epoch": 3.036290322580645,
"grad_norm": 1.059576153755188,
"learning_rate": 0.00011137625861270151,
"loss": 1.3441061973571777,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.83476,
"step": 231,
"tokens/total": 473088,
"tokens/train_per_sec_per_gpu": 140.71,
"tokens/trainable": 467339
},
{
"epoch": 3.038306451612903,
"grad_norm": 1.110609769821167,
"learning_rate": 0.00011074676097284973,
"loss": 1.3131623268127441,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.71791,
"step": 232,
"tokens/total": 475136,
"tokens/train_per_sec_per_gpu": 138.97,
"tokens/trainable": 469379
},
{
"epoch": 3.0403225806451615,
"grad_norm": 1.062983512878418,
"learning_rate": 0.00011011683219874323,
"loss": 1.384516954421997,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.9929,
"step": 233,
"tokens/total": 477184,
"tokens/train_per_sec_per_gpu": 139.52,
"tokens/trainable": 471411
},
{
"epoch": 3.0423387096774195,
"grad_norm": 1.2548089027404785,
"learning_rate": 0.00010948649756161246,
"loss": 1.588539719581604,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.89659,
"step": 234,
"tokens/total": 479232,
"tokens/train_per_sec_per_gpu": 139.35,
"tokens/trainable": 473446
},
{
"epoch": 3.0443548387096775,
"grad_norm": 1.2708343267440796,
"learning_rate": 0.00010885578234897003,
"loss": 1.839888572692871,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.29584,
"step": 235,
"tokens/total": 481280,
"tokens/train_per_sec_per_gpu": 144.7,
"tokens/trainable": 475488
},
{
"epoch": 3.0463709677419355,
"grad_norm": 1.2227643728256226,
"learning_rate": 0.00010822471186359639,
"loss": 1.7494804859161377,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.75161,
"step": 236,
"tokens/total": 483328,
"tokens/train_per_sec_per_gpu": 143.61,
"tokens/trainable": 477534
},
{
"epoch": 3.0483870967741935,
"grad_norm": 1.117796778678894,
"learning_rate": 0.00010759331142252462,
"loss": 1.2630927562713623,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.53634,
"step": 237,
"tokens/total": 485376,
"tokens/train_per_sec_per_gpu": 139.48,
"tokens/trainable": 479561
},
{
"epoch": 3.0504032258064515,
"grad_norm": 1.3211694955825806,
"learning_rate": 0.00010696160635602487,
"loss": 1.8685176372528076,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.47869,
"step": 238,
"tokens/total": 487424,
"tokens/train_per_sec_per_gpu": 140.92,
"tokens/trainable": 481592
},
{
"epoch": 3.0524193548387095,
"grad_norm": 1.2283082008361816,
"learning_rate": 0.00010632962200658815,
"loss": 1.4223217964172363,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.14674,
"step": 239,
"tokens/total": 489472,
"tokens/train_per_sec_per_gpu": 139.3,
"tokens/trainable": 483631
},
{
"epoch": 3.0544354838709675,
"grad_norm": 1.1250922679901123,
"learning_rate": 0.00010569738372790956,
"loss": 1.3267250061035156,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.76868,
"step": 240,
"tokens/total": 491520,
"tokens/train_per_sec_per_gpu": 135.89,
"tokens/trainable": 485675
},
{
"epoch": 3.056451612903226,
"grad_norm": 1.1296156644821167,
"learning_rate": 0.00010506491688387127,
"loss": 1.3787105083465576,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.96978,
"step": 241,
"tokens/total": 493568,
"tokens/train_per_sec_per_gpu": 144.49,
"tokens/trainable": 487712
},
{
"epoch": 3.058467741935484,
"grad_norm": 1.263743281364441,
"learning_rate": 0.000104432246847525,
"loss": 1.6385829448699951,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.14787,
"step": 242,
"tokens/total": 495616,
"tokens/train_per_sec_per_gpu": 141.59,
"tokens/trainable": 489754
},
{
"epoch": 3.060483870967742,
"grad_norm": 1.190499186515808,
"learning_rate": 0.00010379939900007393,
"loss": 1.273460030555725,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.57319,
"step": 243,
"tokens/total": 497664,
"tokens/train_per_sec_per_gpu": 141.11,
"tokens/trainable": 491796
},
{
"epoch": 3.0625,
"grad_norm": 1.3139567375183105,
"learning_rate": 0.00010316639872985472,
"loss": 1.6341391801834106,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.12504,
"step": 244,
"tokens/total": 499712,
"tokens/train_per_sec_per_gpu": 140.26,
"tokens/trainable": 493840
},
{
"epoch": 3.064516129032258,
"grad_norm": 1.2812219858169556,
"learning_rate": 0.00010253327143131879,
"loss": 1.3600250482559204,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.89629,
"step": 245,
"tokens/total": 501760,
"tokens/train_per_sec_per_gpu": 138.47,
"tokens/trainable": 495860
},
{
"epoch": 3.066532258064516,
"grad_norm": 1.2698646783828735,
"learning_rate": 0.00010190004250401368,
"loss": 1.666378140449524,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.29296,
"step": 246,
"tokens/total": 503808,
"tokens/train_per_sec_per_gpu": 141.99,
"tokens/trainable": 497886
},
{
"epoch": 3.068548387096774,
"grad_norm": 1.275866985321045,
"learning_rate": 0.00010126673735156402,
"loss": 1.4717791080474854,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.35698,
"step": 247,
"tokens/total": 505856,
"tokens/train_per_sec_per_gpu": 142.33,
"tokens/trainable": 499930
},
{
"epoch": 3.0705645161290325,
"grad_norm": 1.4462482929229736,
"learning_rate": 0.00010063338138065234,
"loss": 1.6472115516662598,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.19248,
"step": 248,
"tokens/total": 507904,
"tokens/train_per_sec_per_gpu": 143.7,
"tokens/trainable": 501964
},
{
"epoch": 3.0725806451612905,
"grad_norm": 1.291642189025879,
"learning_rate": 0.0001,
"loss": 2.117743968963623,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 8.31236,
"step": 249,
"tokens/total": 509952,
"tokens/train_per_sec_per_gpu": 139.27,
"tokens/trainable": 503995
},
{
"epoch": 3.0745967741935485,
"grad_norm": 1.065657138824463,
"learning_rate": 9.936661861934765e-05,
"loss": 1.7818100452423096,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.9406,
"step": 250,
"tokens/total": 512000,
"tokens/train_per_sec_per_gpu": 139.06,
"tokens/trainable": 506033
},
{
"epoch": 3.0766129032258065,
"grad_norm": 1.2712163925170898,
"learning_rate": 9.8733262648436e-05,
"loss": 1.7442567348480225,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.72165,
"step": 251,
"tokens/total": 514048,
"tokens/train_per_sec_per_gpu": 139.84,
"tokens/trainable": 508057
},
{
"epoch": 3.0786290322580645,
"grad_norm": 1.2973222732543945,
"learning_rate": 9.809995749598632e-05,
"loss": 1.9768915176391602,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.22026,
"step": 252,
"tokens/total": 516096,
"tokens/train_per_sec_per_gpu": 139.92,
"tokens/trainable": 510079
},
{
"epoch": 3.0806451612903225,
"grad_norm": 1.0970314741134644,
"learning_rate": 9.746672856868123e-05,
"loss": 1.2245832681655884,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.40275,
"step": 253,
"tokens/total": 518144,
"tokens/train_per_sec_per_gpu": 142.79,
"tokens/trainable": 512112
},
{
"epoch": 3.0826612903225805,
"grad_norm": 1.3336235284805298,
"learning_rate": 9.683360127014529e-05,
"loss": 1.5929476022720337,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.91822,
"step": 254,
"tokens/total": 520192,
"tokens/train_per_sec_per_gpu": 138.39,
"tokens/trainable": 514140
},
{
"epoch": 3.0846774193548385,
"grad_norm": 1.2752783298492432,
"learning_rate": 9.620060099992609e-05,
"loss": 1.6711721420288086,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.3184,
"step": 255,
"tokens/total": 522240,
"tokens/train_per_sec_per_gpu": 138.11,
"tokens/trainable": 516164
},
{
"epoch": 3.086693548387097,
"grad_norm": 1.0667186975479126,
"learning_rate": 9.556775315247501e-05,
"loss": 1.4244043827056885,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.15538,
"step": 256,
"tokens/total": 524288,
"tokens/train_per_sec_per_gpu": 142.64,
"tokens/trainable": 518204
},
{
"epoch": 3.088709677419355,
"grad_norm": 1.2738869190216064,
"learning_rate": 9.493508311612874e-05,
"loss": 1.912049412727356,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.76694,
"step": 257,
"tokens/total": 526336,
"tokens/train_per_sec_per_gpu": 140.83,
"tokens/trainable": 520239
},
{
"epoch": 3.090725806451613,
"grad_norm": 1.1487785577774048,
"learning_rate": 9.430261627209044e-05,
"loss": 1.6021625995635986,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.96376,
"step": 258,
"tokens/total": 528384,
"tokens/train_per_sec_per_gpu": 142.29,
"tokens/trainable": 522256
},
{
"epoch": 3.092741935483871,
"grad_norm": 1.1487421989440918,
"learning_rate": 9.367037799341187e-05,
"loss": 1.5816335678100586,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.86289,
"step": 259,
"tokens/total": 530432,
"tokens/train_per_sec_per_gpu": 144.34,
"tokens/trainable": 524301
},
{
"epoch": 3.094758064516129,
"grad_norm": 1.1304370164871216,
"learning_rate": 9.303839364397511e-05,
"loss": 1.5841972827911377,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.87538,
"step": 260,
"tokens/total": 532480,
"tokens/train_per_sec_per_gpu": 140.53,
"tokens/trainable": 526337
},
{
"epoch": 3.096774193548387,
"grad_norm": 1.2210973501205444,
"learning_rate": 9.24066885774754e-05,
"loss": 1.3856146335601807,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.99728,
"step": 261,
"tokens/total": 534528,
"tokens/train_per_sec_per_gpu": 142.27,
"tokens/trainable": 528372
},
{
"epoch": 3.098790322580645,
"grad_norm": 1.106105089187622,
"learning_rate": 9.177528813640362e-05,
"loss": 1.5088179111480713,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.52138,
"step": 262,
"tokens/total": 536576,
"tokens/train_per_sec_per_gpu": 137.11,
"tokens/trainable": 530386
},
{
"epoch": 3.100806451612903,
"grad_norm": 1.2509136199951172,
"learning_rate": 9.114421765102999e-05,
"loss": 1.6389458179473877,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.14974,
"step": 263,
"tokens/total": 538624,
"tokens/train_per_sec_per_gpu": 142.74,
"tokens/trainable": 532411
},
{
"epoch": 3.1028225806451615,
"grad_norm": 1.1185474395751953,
"learning_rate": 9.051350243838756e-05,
"loss": 1.4692389965057373,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.34593,
"step": 264,
"tokens/total": 540672,
"tokens/train_per_sec_per_gpu": 141.72,
"tokens/trainable": 534431
},
{
"epoch": 3.1048387096774195,
"grad_norm": 1.253171443939209,
"learning_rate": 8.98831678012568e-05,
"loss": 2.0175833702087402,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.52013,
"step": 265,
"tokens/total": 542720,
"tokens/train_per_sec_per_gpu": 143.19,
"tokens/trainable": 536470
},
{
"epoch": 3.1068548387096775,
"grad_norm": 1.2736040353775024,
"learning_rate": 8.925323902715031e-05,
"loss": 1.5984770059585571,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.94549,
"step": 266,
"tokens/total": 544768,
"tokens/train_per_sec_per_gpu": 140.92,
"tokens/trainable": 538506
},
{
"epoch": 3.1088709677419355,
"grad_norm": 1.1199898719787598,
"learning_rate": 8.862374138729853e-05,
"loss": 1.399961233139038,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.05504,
"step": 267,
"tokens/total": 546816,
"tokens/train_per_sec_per_gpu": 139.15,
"tokens/trainable": 540530
},
{
"epoch": 3.1108870967741935,
"grad_norm": 1.2822198867797852,
"learning_rate": 8.799470013563573e-05,
"loss": 1.5098600387573242,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.5261,
"step": 268,
"tokens/total": 548864,
"tokens/train_per_sec_per_gpu": 141.42,
"tokens/trainable": 542552
},
{
"epoch": 3.1129032258064515,
"grad_norm": 1.323458194732666,
"learning_rate": 8.73661405077871e-05,
"loss": 1.529092788696289,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.61399,
"step": 269,
"tokens/total": 550912,
"tokens/train_per_sec_per_gpu": 142.05,
"tokens/trainable": 544574
},
{
"epoch": 3.1149193548387095,
"grad_norm": 1.1802997589111328,
"learning_rate": 8.67380877200561e-05,
"loss": 1.5494701862335205,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.70897,
"step": 270,
"tokens/total": 552960,
"tokens/train_per_sec_per_gpu": 136.79,
"tokens/trainable": 546576
},
{
"epoch": 3.1169354838709675,
"grad_norm": 1.1339328289031982,
"learning_rate": 8.611056696841312e-05,
"loss": 1.4412262439727783,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.22587,
"step": 271,
"tokens/total": 555008,
"tokens/train_per_sec_per_gpu": 139.97,
"tokens/trainable": 548595
},
{
"epoch": 3.118951612903226,
"grad_norm": 1.184397578239441,
"learning_rate": 8.54836034274844e-05,
"loss": 1.4191689491271973,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.13368,
"step": 272,
"tokens/total": 557056,
"tokens/train_per_sec_per_gpu": 138.8,
"tokens/trainable": 550640
},
{
"epoch": 3.120967741935484,
"grad_norm": 1.1473497152328491,
"learning_rate": 8.485722224954237e-05,
"loss": 1.5693676471710205,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.80361,
"step": 273,
"tokens/total": 559104,
"tokens/train_per_sec_per_gpu": 139.3,
"tokens/trainable": 552674
},
{
"epoch": 3.122983870967742,
"grad_norm": 1.356634497642517,
"learning_rate": 8.423144856349631e-05,
"loss": 1.9328263998031616,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.90901,
"step": 274,
"tokens/total": 561152,
"tokens/train_per_sec_per_gpu": 137.86,
"tokens/trainable": 554682
},
{
"epoch": 3.125,
"grad_norm": 1.1992149353027344,
"learning_rate": 8.36063074738845e-05,
"loss": 1.3870232105255127,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.00292,
"step": 275,
"tokens/total": 563200,
"tokens/train_per_sec_per_gpu": 140.61,
"tokens/trainable": 556700
},
{
"epoch": 3.127016129032258,
"grad_norm": 1.2201882600784302,
"learning_rate": 8.298182405986689e-05,
"loss": 1.576523780822754,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.83811,
"step": 276,
"tokens/total": 565248,
"tokens/train_per_sec_per_gpu": 140.13,
"tokens/trainable": 558705
},
{
"epoch": 3.129032258064516,
"grad_norm": 1.4621694087982178,
"learning_rate": 8.235802337421919e-05,
"loss": 1.915595531463623,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.79098,
"step": 277,
"tokens/total": 567296,
"tokens/train_per_sec_per_gpu": 139.56,
"tokens/trainable": 560720
},
{
"epoch": 3.131048387096774,
"grad_norm": 1.246692180633545,
"learning_rate": 8.173493044232745e-05,
"loss": 1.5288515090942383,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.61288,
"step": 278,
"tokens/total": 569344,
"tokens/train_per_sec_per_gpu": 143.15,
"tokens/trainable": 562768
},
{
"epoch": 3.133064516129032,
"grad_norm": 1.1976420879364014,
"learning_rate": 8.11125702611846e-05,
"loss": 1.3603302240371704,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.89748,
"step": 279,
"tokens/total": 571392,
"tokens/train_per_sec_per_gpu": 137.69,
"tokens/trainable": 564802
},
{
"epoch": 3.1350806451612905,
"grad_norm": 1.2767409086227417,
"learning_rate": 8.049096779838719e-05,
"loss": 1.5446770191192627,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.68646,
"step": 280,
"tokens/total": 573440,
"tokens/train_per_sec_per_gpu": 136.27,
"tokens/trainable": 566813
},
{
"epoch": 3.1370967741935485,
"grad_norm": 1.2133939266204834,
"learning_rate": 7.987014799113397e-05,
"loss": 1.2143042087554932,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.36795,
"step": 281,
"tokens/total": 575488,
"tokens/train_per_sec_per_gpu": 134.72,
"tokens/trainable": 568792
},
{
"epoch": 3.1391129032258065,
"grad_norm": 1.0450326204299927,
"learning_rate": 7.925013574522557e-05,
"loss": 1.2417564392089844,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.46169,
"step": 282,
"tokens/total": 577536,
"tokens/train_per_sec_per_gpu": 137.68,
"tokens/trainable": 570791
},
{
"epoch": 3.1411290322580645,
"grad_norm": 1.2996821403503418,
"learning_rate": 7.863095593406491e-05,
"loss": 1.5991716384887695,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.94893,
"step": 283,
"tokens/total": 579584,
"tokens/train_per_sec_per_gpu": 134.29,
"tokens/trainable": 572699
},
{
"epoch": 3.1431451612903225,
"grad_norm": 1.655535101890564,
"learning_rate": 7.801263339765994e-05,
"loss": 1.2429203987121582,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.46572,
"step": 284,
"tokens/total": 581632,
"tokens/train_per_sec_per_gpu": 104.39,
"tokens/trainable": 574221
},
{
"epoch": 4.002016129032258,
"grad_norm": 1.274104356765747,
"learning_rate": 7.739519294162652e-05,
"loss": 1.5383883714675903,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.65708,
"step": 285,
"tokens/total": 583680,
"tokens/train_per_sec_per_gpu": 139.07,
"tokens/trainable": 576269
},
{
"epoch": 4.004032258064516,
"grad_norm": 1.2912395000457764,
"learning_rate": 7.677865933619379e-05,
"loss": 1.3963959217071533,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.04061,
"step": 286,
"tokens/total": 585728,
"tokens/train_per_sec_per_gpu": 139.66,
"tokens/trainable": 578298
},
{
"epoch": 4.006048387096774,
"grad_norm": 1.2305668592453003,
"learning_rate": 7.616305731521008e-05,
"loss": 1.466391921043396,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.33357,
"step": 287,
"tokens/total": 587776,
"tokens/train_per_sec_per_gpu": 232.49,
"tokens/trainable": 580337
},
{
"epoch": 4.008064516129032,
"grad_norm": 1.1913374662399292,
"learning_rate": 7.554841157515092e-05,
"loss": 1.312002420425415,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.7136,
"step": 288,
"tokens/total": 589824,
"tokens/train_per_sec_per_gpu": 263.64,
"tokens/trainable": 582382
},
{
"epoch": 4.01008064516129,
"grad_norm": 1.3715969324111938,
"learning_rate": 7.493474677412794e-05,
"loss": 1.5232388973236084,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.58706,
"step": 289,
"tokens/total": 591872,
"tokens/train_per_sec_per_gpu": 265.19,
"tokens/trainable": 584411
},
{
"epoch": 4.012096774193548,
"grad_norm": 1.2743836641311646,
"learning_rate": 7.432208753090009e-05,
"loss": 1.570101022720337,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.80713,
"step": 290,
"tokens/total": 593920,
"tokens/train_per_sec_per_gpu": 267.17,
"tokens/trainable": 586451
},
{
"epoch": 4.014112903225806,
"grad_norm": 1.1839017868041992,
"learning_rate": 7.371045842388552e-05,
"loss": 1.2354711294174194,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.44,
"step": 291,
"tokens/total": 595968,
"tokens/train_per_sec_per_gpu": 257.88,
"tokens/trainable": 588494
},
{
"epoch": 4.016129032258065,
"grad_norm": 1.5452686548233032,
"learning_rate": 7.309988399017602e-05,
"loss": 1.75980806350708,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.81132,
"step": 292,
"tokens/total": 598016,
"tokens/train_per_sec_per_gpu": 266.87,
"tokens/trainable": 590539
},
{
"epoch": 4.018145161290323,
"grad_norm": 1.3476964235305786,
"learning_rate": 7.24903887245522e-05,
"loss": 1.204418659210205,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.33482,
"step": 293,
"tokens/total": 600064,
"tokens/train_per_sec_per_gpu": 266.31,
"tokens/trainable": 592573
},
{
"epoch": 4.020161290322581,
"grad_norm": 1.3166759014129639,
"learning_rate": 7.188199707850122e-05,
"loss": 1.5485095977783203,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.70445,
"step": 294,
"tokens/total": 602112,
"tokens/train_per_sec_per_gpu": 262.47,
"tokens/trainable": 594600
},
{
"epoch": 4.022177419354839,
"grad_norm": 1.2978073358535767,
"learning_rate": 7.127473345923554e-05,
"loss": 1.2120771408081055,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.36046,
"step": 295,
"tokens/total": 604160,
"tokens/train_per_sec_per_gpu": 264.34,
"tokens/trainable": 596636
},
{
"epoch": 4.024193548387097,
"grad_norm": 1.4947535991668701,
"learning_rate": 7.066862222871397e-05,
"loss": 1.7942547798156738,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.01499,
"step": 296,
"tokens/total": 606208,
"tokens/train_per_sec_per_gpu": 262.53,
"tokens/trainable": 598664
},
{
"epoch": 4.026209677419355,
"grad_norm": 1.3672757148742676,
"learning_rate": 7.006368770266421e-05,
"loss": 1.1543655395507812,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.17201,
"step": 297,
"tokens/total": 608256,
"tokens/train_per_sec_per_gpu": 263.64,
"tokens/trainable": 600701
},
{
"epoch": 4.028225806451613,
"grad_norm": 1.3657294511795044,
"learning_rate": 6.945995414960744e-05,
"loss": 1.5515549182891846,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.7188,
"step": 298,
"tokens/total": 610304,
"tokens/train_per_sec_per_gpu": 266.24,
"tokens/trainable": 602742
},
{
"epoch": 4.030241935483871,
"grad_norm": 1.343180775642395,
"learning_rate": 6.885744578988463e-05,
"loss": 1.6137218475341797,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.02147,
"step": 299,
"tokens/total": 612352,
"tokens/train_per_sec_per_gpu": 262.83,
"tokens/trainable": 604776
},
{
"epoch": 4.032258064516129,
"grad_norm": 1.1807539463043213,
"learning_rate": 6.825618679468502e-05,
"loss": 0.9370715618133545,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 2.5525,
"step": 300,
"tokens/total": 614400,
"tokens/train_per_sec_per_gpu": 263.56,
"tokens/trainable": 606812
},
{
"epoch": 4.034274193548387,
"grad_norm": 1.3539948463439941,
"learning_rate": 6.765620128507619e-05,
"loss": 1.345663070678711,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.84073,
"step": 301,
"tokens/total": 616448,
"tokens/train_per_sec_per_gpu": 261.91,
"tokens/trainable": 608850
},
{
"epoch": 4.036290322580645,
"grad_norm": 1.2722002267837524,
"learning_rate": 6.705751333103675e-05,
"loss": 1.2931978702545166,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.64442,
"step": 302,
"tokens/total": 618496,
"tokens/train_per_sec_per_gpu": 268.26,
"tokens/trainable": 610889
},
{
"epoch": 4.038306451612903,
"grad_norm": 1.3646944761276245,
"learning_rate": 6.64601469504903e-05,
"loss": 1.4423154592514038,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.23048,
"step": 303,
"tokens/total": 620544,
"tokens/train_per_sec_per_gpu": 268.66,
"tokens/trainable": 612927
},
{
"epoch": 4.040322580645161,
"grad_norm": 1.234769344329834,
"learning_rate": 6.586412610834221e-05,
"loss": 1.2587189674377441,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.52091,
"step": 304,
"tokens/total": 622592,
"tokens/train_per_sec_per_gpu": 268.44,
"tokens/trainable": 614966
},
{
"epoch": 4.042338709677419,
"grad_norm": 1.3554316759109497,
"learning_rate": 6.526947471551798e-05,
"loss": 1.6908648014068604,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.42417,
"step": 305,
"tokens/total": 624640,
"tokens/train_per_sec_per_gpu": 268.41,
"tokens/trainable": 617010
},
{
"epoch": 4.044354838709677,
"grad_norm": 1.4698665142059326,
"learning_rate": 6.46762166280042e-05,
"loss": 1.8330029249191284,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 6.25263,
"step": 306,
"tokens/total": 626688,
"tokens/train_per_sec_per_gpu": 265.37,
"tokens/trainable": 619042
},
{
"epoch": 4.046370967741935,
"grad_norm": 1.3987410068511963,
"learning_rate": 6.40843756458913e-05,
"loss": 1.682692527770996,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.38002,
"step": 307,
"tokens/total": 628736,
"tokens/train_per_sec_per_gpu": 263.08,
"tokens/trainable": 621066
},
{
"epoch": 4.048387096774194,
"grad_norm": 1.5588736534118652,
"learning_rate": 6.349397551241894e-05,
"loss": 1.6790101528167725,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.36025,
"step": 308,
"tokens/total": 630784,
"tokens/train_per_sec_per_gpu": 265.49,
"tokens/trainable": 623102
},
{
"epoch": 4.050403225806452,
"grad_norm": 1.292420744895935,
"learning_rate": 6.290503991302324e-05,
"loss": 1.1997315883636475,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.31923,
"step": 309,
"tokens/total": 632832,
"tokens/train_per_sec_per_gpu": 263.41,
"tokens/trainable": 625144
},
{
"epoch": 4.05241935483871,
"grad_norm": 1.4492568969726562,
"learning_rate": 6.231759247438689e-05,
"loss": 1.3761956691741943,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.95981,
"step": 310,
"tokens/total": 634880,
"tokens/train_per_sec_per_gpu": 262.05,
"tokens/trainable": 627175
},
{
"epoch": 4.054435483870968,
"grad_norm": 1.437646508216858,
"learning_rate": 6.173165676349103e-05,
"loss": 1.2556071281433105,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.50997,
"step": 311,
"tokens/total": 636928,
"tokens/train_per_sec_per_gpu": 262.88,
"tokens/trainable": 629214
},
{
"epoch": 4.056451612903226,
"grad_norm": 1.372708797454834,
"learning_rate": 6.114725628666998e-05,
"loss": 1.3328940868377686,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.792,
"step": 312,
"tokens/total": 638976,
"tokens/train_per_sec_per_gpu": 264.5,
"tokens/trainable": 631258
},
{
"epoch": 4.058467741935484,
"grad_norm": 1.6981028318405151,
"learning_rate": 6.0564414488668165e-05,
"loss": 1.6544911861419678,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.23042,
"step": 313,
"tokens/total": 641024,
"tokens/train_per_sec_per_gpu": 262.31,
"tokens/trainable": 633287
},
{
"epoch": 4.060483870967742,
"grad_norm": 1.4426374435424805,
"learning_rate": 5.998315475169942e-05,
"loss": 1.2637913227081299,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.53881,
"step": 314,
"tokens/total": 643072,
"tokens/train_per_sec_per_gpu": 266.31,
"tokens/trainable": 635323
},
{
"epoch": 4.0625,
"grad_norm": 1.3725757598876953,
"learning_rate": 5.94035003945093e-05,
"loss": 1.5147466659545898,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.54827,
"step": 315,
"tokens/total": 645120,
"tokens/train_per_sec_per_gpu": 265.15,
"tokens/trainable": 637355
},
{
"epoch": 4.064516129032258,
"grad_norm": 1.3077338933944702,
"learning_rate": 5.88254746714392e-05,
"loss": 1.2489416599273682,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.48665,
"step": 316,
"tokens/total": 647168,
"tokens/train_per_sec_per_gpu": 254.63,
"tokens/trainable": 639396
},
{
"epoch": 4.066532258064516,
"grad_norm": 1.3260565996170044,
"learning_rate": 5.824910077149371e-05,
"loss": 1.153064489364624,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.16789,
"step": 317,
"tokens/total": 649216,
"tokens/train_per_sec_per_gpu": 265.77,
"tokens/trainable": 641426
},
{
"epoch": 4.068548387096774,
"grad_norm": 1.4490100145339966,
"learning_rate": 5.767440181741019e-05,
"loss": 1.2668461799621582,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.54964,
"step": 318,
"tokens/total": 651264,
"tokens/train_per_sec_per_gpu": 262.28,
"tokens/trainable": 643458
},
{
"epoch": 4.070564516129032,
"grad_norm": 1.4084233045578003,
"learning_rate": 5.710140086473129e-05,
"loss": 1.0618423223495483,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 2.89169,
"step": 319,
"tokens/total": 653312,
"tokens/train_per_sec_per_gpu": 262.96,
"tokens/trainable": 645481
},
{
"epoch": 4.07258064516129,
"grad_norm": 1.4210598468780518,
"learning_rate": 5.653012090087977e-05,
"loss": 1.278883457183838,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.59263,
"step": 320,
"tokens/total": 655360,
"tokens/train_per_sec_per_gpu": 264.32,
"tokens/trainable": 647514
},
{
"epoch": 4.074596774193548,
"grad_norm": 1.4964855909347534,
"learning_rate": 5.596058484423656e-05,
"loss": 1.4187018871307373,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.13175,
"step": 321,
"tokens/total": 657408,
"tokens/train_per_sec_per_gpu": 264.89,
"tokens/trainable": 649562
},
{
"epoch": 4.076612903225806,
"grad_norm": 1.5338218212127686,
"learning_rate": 5.5392815543221254e-05,
"loss": 1.6856354475021362,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.39588,
"step": 322,
"tokens/total": 659456,
"tokens/train_per_sec_per_gpu": 264.2,
"tokens/trainable": 651599
},
{
"epoch": 4.078629032258065,
"grad_norm": 1.441730260848999,
"learning_rate": 5.4826835775375285e-05,
"loss": 1.5783681869506836,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.84704,
"step": 323,
"tokens/total": 661504,
"tokens/train_per_sec_per_gpu": 265.01,
"tokens/trainable": 653646
},
{
"epoch": 4.080645161290323,
"grad_norm": 1.5422190427780151,
"learning_rate": 5.4262668246448475e-05,
"loss": 1.6311604976654053,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.1098,
"step": 324,
"tokens/total": 663552,
"tokens/train_per_sec_per_gpu": 262.57,
"tokens/trainable": 655681
},
{
"epoch": 4.082661290322581,
"grad_norm": 1.6251616477966309,
"learning_rate": 5.3700335589487925e-05,
"loss": 1.576082468032837,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.83597,
"step": 325,
"tokens/total": 665600,
"tokens/train_per_sec_per_gpu": 262.3,
"tokens/trainable": 657716
},
{
"epoch": 4.084677419354839,
"grad_norm": 1.5639586448669434,
"learning_rate": 5.3139860363929996e-05,
"loss": 1.213226556777954,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.36432,
"step": 326,
"tokens/total": 667648,
"tokens/train_per_sec_per_gpu": 260.53,
"tokens/trainable": 659728
},
{
"epoch": 4.086693548387097,
"grad_norm": 1.2871079444885254,
"learning_rate": 5.2581265054695494e-05,
"loss": 1.0847278833389282,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 2.95863,
"step": 327,
"tokens/total": 669696,
"tokens/train_per_sec_per_gpu": 257.33,
"tokens/trainable": 661749
},
{
"epoch": 4.088709677419355,
"grad_norm": 1.398889183998108,
"learning_rate": 5.202457207128736e-05,
"loss": 1.311136245727539,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.71039,
"step": 328,
"tokens/total": 671744,
"tokens/train_per_sec_per_gpu": 267.91,
"tokens/trainable": 663790
},
{
"epoch": 4.090725806451613,
"grad_norm": 1.2428795099258423,
"learning_rate": 5.146980374689192e-05,
"loss": 1.1416620016098022,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.13197,
"step": 329,
"tokens/total": 673792,
"tokens/train_per_sec_per_gpu": 266.28,
"tokens/trainable": 665811
},
{
"epoch": 4.092741935483871,
"grad_norm": 1.2925283908843994,
"learning_rate": 5.0916982337482644e-05,
"loss": 1.1519179344177246,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.16426,
"step": 330,
"tokens/total": 675840,
"tokens/train_per_sec_per_gpu": 263.97,
"tokens/trainable": 667848
},
{
"epoch": 4.094758064516129,
"grad_norm": 1.506675362586975,
"learning_rate": 5.0366130020927624e-05,
"loss": 1.4302377700805664,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.17969,
"step": 331,
"tokens/total": 677888,
"tokens/train_per_sec_per_gpu": 266.77,
"tokens/trainable": 669886
},
{
"epoch": 4.096774193548387,
"grad_norm": 1.311647891998291,
"learning_rate": 4.981726889609952e-05,
"loss": 1.3400213718414307,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.81913,
"step": 332,
"tokens/total": 679936,
"tokens/train_per_sec_per_gpu": 266.18,
"tokens/trainable": 671928
},
{
"epoch": 4.098790322580645,
"grad_norm": 1.3376080989837646,
"learning_rate": 4.9270420981989294e-05,
"loss": 1.423518180847168,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.1517,
"step": 333,
"tokens/total": 681984,
"tokens/train_per_sec_per_gpu": 264.49,
"tokens/trainable": 673948
},
{
"epoch": 4.100806451612903,
"grad_norm": 1.3112465143203735,
"learning_rate": 4.872560821682256e-05,
"loss": 1.1089239120483398,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.03109,
"step": 334,
"tokens/total": 684032,
"tokens/train_per_sec_per_gpu": 260.74,
"tokens/trainable": 675973
},
{
"epoch": 4.102822580645161,
"grad_norm": 1.4535356760025024,
"learning_rate": 4.818285245717984e-05,
"loss": 1.2431288957595825,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.46644,
"step": 335,
"tokens/total": 686080,
"tokens/train_per_sec_per_gpu": 253.64,
"tokens/trainable": 678009
},
{
"epoch": 4.104838709677419,
"grad_norm": 1.4877009391784668,
"learning_rate": 4.764217547711934e-05,
"loss": 1.4921081066131592,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.44646,
"step": 336,
"tokens/total": 688128,
"tokens/train_per_sec_per_gpu": 263.79,
"tokens/trainable": 680047
},
{
"epoch": 4.106854838709677,
"grad_norm": 1.486241340637207,
"learning_rate": 4.710359896730379e-05,
"loss": 1.3672105073928833,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.92439,
"step": 337,
"tokens/total": 690176,
"tokens/train_per_sec_per_gpu": 264.3,
"tokens/trainable": 682082
},
{
"epoch": 4.108870967741935,
"grad_norm": 1.5436185598373413,
"learning_rate": 4.656714453412993e-05,
"loss": 1.280173659324646,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.59726,
"step": 338,
"tokens/total": 692224,
"tokens/train_per_sec_per_gpu": 263.44,
"tokens/trainable": 684104
},
{
"epoch": 4.110887096774194,
"grad_norm": 1.5693432092666626,
"learning_rate": 4.6032833698862044e-05,
"loss": 1.5435786247253418,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.68131,
"step": 339,
"tokens/total": 694272,
"tokens/train_per_sec_per_gpu": 266.3,
"tokens/trainable": 686136
},
{
"epoch": 4.112903225806452,
"grad_norm": 1.3754907846450806,
"learning_rate": 4.5500687896768256e-05,
"loss": 1.0676181316375732,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 2.90844,
"step": 340,
"tokens/total": 696320,
"tokens/train_per_sec_per_gpu": 269.49,
"tokens/trainable": 688177
},
{
"epoch": 4.11491935483871,
"grad_norm": 1.6923837661743164,
"learning_rate": 4.497072847626087e-05,
"loss": 1.1597163677215576,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.18903,
"step": 341,
"tokens/total": 698368,
"tokens/train_per_sec_per_gpu": 264.76,
"tokens/trainable": 690191
},
{
"epoch": 4.116935483870968,
"grad_norm": 1.5109210014343262,
"learning_rate": 4.444297669803981e-05,
"loss": 1.552527904510498,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.7234,
"step": 342,
"tokens/total": 700416,
"tokens/train_per_sec_per_gpu": 259.52,
"tokens/trainable": 692221
},
{
"epoch": 4.118951612903226,
"grad_norm": 1.6301664113998413,
"learning_rate": 4.3917453734239566e-05,
"loss": 1.6055819988250732,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.98076,
"step": 343,
"tokens/total": 702464,
"tokens/train_per_sec_per_gpu": 253.58,
"tokens/trainable": 694233
},
{
"epoch": 4.120967741935484,
"grad_norm": 1.4239203929901123,
"learning_rate": 4.339418066758008e-05,
"loss": 1.364980936050415,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.91565,
"step": 344,
"tokens/total": 704512,
"tokens/train_per_sec_per_gpu": 258.35,
"tokens/trainable": 696278
},
{
"epoch": 4.122983870967742,
"grad_norm": 1.6190986633300781,
"learning_rate": 4.287317849052075e-05,
"loss": 1.7706489562988281,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.87466,
"step": 345,
"tokens/total": 706560,
"tokens/train_per_sec_per_gpu": 261.65,
"tokens/trainable": 698318
},
{
"epoch": 4.125,
"grad_norm": 1.4046826362609863,
"learning_rate": 4.235446810441841e-05,
"loss": 1.204803705215454,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.3361,
"step": 346,
"tokens/total": 708608,
"tokens/train_per_sec_per_gpu": 258.84,
"tokens/trainable": 700346
},
{
"epoch": 4.127016129032258,
"grad_norm": 1.7797582149505615,
"learning_rate": 4.1838070318688604e-05,
"loss": 2.0054688453674316,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 7.42958,
"step": 347,
"tokens/total": 710656,
"tokens/train_per_sec_per_gpu": 258.83,
"tokens/trainable": 702366
},
{
"epoch": 4.129032258064516,
"grad_norm": 1.3385303020477295,
"learning_rate": 4.132400584997106e-05,
"loss": 1.164678931236267,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.20489,
"step": 348,
"tokens/total": 712704,
"tokens/train_per_sec_per_gpu": 255.68,
"tokens/trainable": 704367
},
{
"epoch": 4.131048387096774,
"grad_norm": 1.2760944366455078,
"learning_rate": 4.081229532129827e-05,
"loss": 1.1424689292907715,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.1345,
"step": 349,
"tokens/total": 714752,
"tokens/train_per_sec_per_gpu": 257.9,
"tokens/trainable": 706393
},
{
"epoch": 4.133064516129032,
"grad_norm": 1.4096364974975586,
"learning_rate": 4.030295926126845e-05,
"loss": 1.409515380859375,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.09397,
"step": 350,
"tokens/total": 716800,
"tokens/train_per_sec_per_gpu": 257.11,
"tokens/trainable": 708399
},
{
"epoch": 4.13508064516129,
"grad_norm": 1.3151328563690186,
"learning_rate": 3.979601810322169e-05,
"loss": 1.3166403770446777,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.73087,
"step": 351,
"tokens/total": 718848,
"tokens/train_per_sec_per_gpu": 262.63,
"tokens/trainable": 710426
},
{
"epoch": 4.137096774193548,
"grad_norm": 1.3529253005981445,
"learning_rate": 3.929149218442052e-05,
"loss": 1.3559751510620117,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.88054,
"step": 352,
"tokens/total": 720896,
"tokens/train_per_sec_per_gpu": 258.26,
"tokens/trainable": 712423
},
{
"epoch": 4.139112903225806,
"grad_norm": 1.3632549047470093,
"learning_rate": 3.878940174523371e-05,
"loss": 1.2670247554779053,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.55027,
"step": 353,
"tokens/total": 722944,
"tokens/train_per_sec_per_gpu": 261.46,
"tokens/trainable": 714448
},
{
"epoch": 4.141129032258065,
"grad_norm": 1.3360320329666138,
"learning_rate": 3.828976692832458e-05,
"loss": 1.553146481513977,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.72632,
"step": 354,
"tokens/total": 724992,
"tokens/train_per_sec_per_gpu": 254.34,
"tokens/trainable": 716436
},
{
"epoch": 4.143145161290323,
"grad_norm": 1.5650333166122437,
"learning_rate": 3.779260777784263e-05,
"loss": 1.3416231870651245,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.82525,
"step": 355,
"tokens/total": 727040,
"tokens/train_per_sec_per_gpu": 205.74,
"tokens/trainable": 718064
},
{
"epoch": 5.002016129032258,
"grad_norm": 1.5316541194915771,
"learning_rate": 3.7297944238619706e-05,
"loss": 1.6510117053985596,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.21225,
"step": 356,
"tokens/total": 729088,
"tokens/train_per_sec_per_gpu": 257.01,
"tokens/trainable": 720105
},
{
"epoch": 5.004032258064516,
"grad_norm": 1.2902381420135498,
"learning_rate": 3.680579615536961e-05,
"loss": 1.4203373193740845,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.13852,
"step": 357,
"tokens/total": 731136,
"tokens/train_per_sec_per_gpu": 247.52,
"tokens/trainable": 722129
},
{
"epoch": 5.006048387096774,
"grad_norm": 1.3662035465240479,
"learning_rate": 3.631618327189218e-05,
"loss": 1.4179635047912598,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.1287,
"step": 358,
"tokens/total": 733184,
"tokens/train_per_sec_per_gpu": 258.81,
"tokens/trainable": 724173
},
{
"epoch": 5.008064516129032,
"grad_norm": 1.3037316799163818,
"learning_rate": 3.582912523028101e-05,
"loss": 1.352861762046814,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.86848,
"step": 359,
"tokens/total": 735232,
"tokens/train_per_sec_per_gpu": 262.86,
"tokens/trainable": 726202
},
{
"epoch": 5.01008064516129,
"grad_norm": 1.3802729845046997,
"learning_rate": 3.534464157013574e-05,
"loss": 1.6564579010009766,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.24071,
"step": 360,
"tokens/total": 737280,
"tokens/train_per_sec_per_gpu": 266.73,
"tokens/trainable": 728239
},
{
"epoch": 5.012096774193548,
"grad_norm": 1.6267991065979004,
"learning_rate": 3.4862751727777797e-05,
"loss": 1.5538551807403564,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.72967,
"step": 361,
"tokens/total": 739328,
"tokens/train_per_sec_per_gpu": 240.84,
"tokens/trainable": 730281
},
{
"epoch": 5.014112903225806,
"grad_norm": 1.455069661140442,
"learning_rate": 3.438347503547102e-05,
"loss": 1.299963355064392,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.66916,
"step": 362,
"tokens/total": 741376,
"tokens/train_per_sec_per_gpu": 264.41,
"tokens/trainable": 732325
},
{
"epoch": 5.016129032258065,
"grad_norm": 1.1740471124649048,
"learning_rate": 3.390683072064594e-05,
"loss": 0.7859928607940674,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 2.19458,
"step": 363,
"tokens/total": 743424,
"tokens/train_per_sec_per_gpu": 261.92,
"tokens/trainable": 734363
},
{
"epoch": 5.018145161290323,
"grad_norm": 1.5127320289611816,
"learning_rate": 3.343283790512829e-05,
"loss": 1.3409039974212646,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.8225,
"step": 364,
"tokens/total": 745472,
"tokens/train_per_sec_per_gpu": 264.56,
"tokens/trainable": 736408
},
{
"epoch": 5.020161290322581,
"grad_norm": 1.4884353876113892,
"learning_rate": 3.296151560437214e-05,
"loss": 1.1510839462280273,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.16162,
"step": 365,
"tokens/total": 747520,
"tokens/train_per_sec_per_gpu": 261.75,
"tokens/trainable": 738441
},
{
"epoch": 5.022177419354839,
"grad_norm": 1.6059309244155884,
"learning_rate": 3.249288272669691e-05,
"loss": 1.377195119857788,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.96377,
"step": 366,
"tokens/total": 749568,
"tokens/train_per_sec_per_gpu": 263.14,
"tokens/trainable": 740472
},
{
"epoch": 5.024193548387097,
"grad_norm": 1.5393133163452148,
"learning_rate": 3.202695807252871e-05,
"loss": 1.3185768127441406,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.7381,
"step": 367,
"tokens/total": 751616,
"tokens/train_per_sec_per_gpu": 263.8,
"tokens/trainable": 742500
},
{
"epoch": 5.026209677419355,
"grad_norm": 1.5796706676483154,
"learning_rate": 3.1563760333646395e-05,
"loss": 1.3427810668945312,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.82968,
"step": 368,
"tokens/total": 753664,
"tokens/train_per_sec_per_gpu": 261.52,
"tokens/trainable": 744527
},
{
"epoch": 5.028225806451613,
"grad_norm": 1.6402651071548462,
"learning_rate": 3.110330809243134e-05,
"loss": 1.2144150733947754,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.36832,
"step": 369,
"tokens/total": 755712,
"tokens/train_per_sec_per_gpu": 261.29,
"tokens/trainable": 746548
},
{
"epoch": 5.030241935483871,
"grad_norm": 1.6114401817321777,
"learning_rate": 3.064561982112232e-05,
"loss": 1.4515684843063354,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.26981,
"step": 370,
"tokens/total": 757760,
"tokens/train_per_sec_per_gpu": 266.07,
"tokens/trainable": 748593
},
{
"epoch": 5.032258064516129,
"grad_norm": 1.5494929552078247,
"learning_rate": 3.0190713881074105e-05,
"loss": 1.3978958129882812,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.04668,
"step": 371,
"tokens/total": 759808,
"tokens/train_per_sec_per_gpu": 262.06,
"tokens/trainable": 750622
},
{
"epoch": 5.034274193548387,
"grad_norm": 1.4975247383117676,
"learning_rate": 2.9738608522021173e-05,
"loss": 1.3401731252670288,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.8197,
"step": 372,
"tokens/total": 761856,
"tokens/train_per_sec_per_gpu": 266.56,
"tokens/trainable": 752664
},
{
"epoch": 5.036290322580645,
"grad_norm": 1.4258899688720703,
"learning_rate": 2.9289321881345254e-05,
"loss": 1.0087716579437256,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 2.74223,
"step": 373,
"tokens/total": 763904,
"tokens/train_per_sec_per_gpu": 262.62,
"tokens/trainable": 754695
},
{
"epoch": 5.038306451612903,
"grad_norm": 1.3442202806472778,
"learning_rate": 2.8842871983347998e-05,
"loss": 1.0394554138183594,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 2.82768,
"step": 374,
"tokens/total": 765952,
"tokens/train_per_sec_per_gpu": 265.19,
"tokens/trainable": 756738
},
{
"epoch": 5.040322580645161,
"grad_norm": 1.64373779296875,
"learning_rate": 2.8399276738527714e-05,
"loss": 1.4822442531585693,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.40282,
"step": 375,
"tokens/total": 768000,
"tokens/train_per_sec_per_gpu": 263.71,
"tokens/trainable": 758772
},
{
"epoch": 5.042338709677419,
"grad_norm": 1.4457206726074219,
"learning_rate": 2.795855394286081e-05,
"loss": 1.1947224140167236,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.30264,
"step": 376,
"tokens/total": 770048,
"tokens/train_per_sec_per_gpu": 265.63,
"tokens/trainable": 760817
},
{
"epoch": 5.044354838709677,
"grad_norm": 1.353393316268921,
"learning_rate": 2.7520721277088024e-05,
"loss": 1.000101923942566,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 2.71856,
"step": 377,
"tokens/total": 772096,
"tokens/train_per_sec_per_gpu": 263.99,
"tokens/trainable": 762861
},
{
"epoch": 5.046370967741935,
"grad_norm": 1.3696867227554321,
"learning_rate": 2.7085796306004906e-05,
"loss": 1.1429383754730225,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.13597,
"step": 378,
"tokens/total": 774144,
"tokens/train_per_sec_per_gpu": 264.49,
"tokens/trainable": 764893
},
{
"epoch": 5.048387096774194,
"grad_norm": 1.4743932485580444,
"learning_rate": 2.6653796477757432e-05,
"loss": 1.1978074312210083,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.31285,
"step": 379,
"tokens/total": 776192,
"tokens/train_per_sec_per_gpu": 256.19,
"tokens/trainable": 766940
},
{
"epoch": 5.050403225806452,
"grad_norm": 1.4643374681472778,
"learning_rate": 2.6224739123141684e-05,
"loss": 1.512551188468933,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.53829,
"step": 380,
"tokens/total": 778240,
"tokens/train_per_sec_per_gpu": 268.35,
"tokens/trainable": 768987
},
{
"epoch": 5.05241935483871,
"grad_norm": 1.456166386604309,
"learning_rate": 2.5798641454908944e-05,
"loss": 1.212234616279602,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.36099,
"step": 381,
"tokens/total": 780288,
"tokens/train_per_sec_per_gpu": 268.08,
"tokens/trainable": 771017
},
{
"epoch": 5.054435483870968,
"grad_norm": 1.4265928268432617,
"learning_rate": 2.537552056707483e-05,
"loss": 1.1896257400512695,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.28585,
"step": 382,
"tokens/total": 782336,
"tokens/train_per_sec_per_gpu": 268.81,
"tokens/trainable": 773063
},
{
"epoch": 5.056451612903226,
"grad_norm": 1.5509512424468994,
"learning_rate": 2.4955393434233754e-05,
"loss": 0.8469423055648804,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 2.3325,
"step": 383,
"tokens/total": 784384,
"tokens/train_per_sec_per_gpu": 260.74,
"tokens/trainable": 775084
},
{
"epoch": 5.058467741935484,
"grad_norm": 1.5296618938446045,
"learning_rate": 2.45382769108779e-05,
"loss": 1.3008588552474976,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.67245,
"step": 384,
"tokens/total": 786432,
"tokens/train_per_sec_per_gpu": 260.72,
"tokens/trainable": 777109
},
{
"epoch": 5.060483870967742,
"grad_norm": 1.67258882522583,
"learning_rate": 2.4124187730720917e-05,
"loss": 1.3961818218231201,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.03975,
"step": 385,
"tokens/total": 788480,
"tokens/train_per_sec_per_gpu": 264.5,
"tokens/trainable": 779147
},
{
"epoch": 5.0625,
"grad_norm": 1.4869979619979858,
"learning_rate": 2.3713142506026786e-05,
"loss": 1.1131031513214111,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.04379,
"step": 386,
"tokens/total": 790528,
"tokens/train_per_sec_per_gpu": 262.53,
"tokens/trainable": 781185
},
{
"epoch": 5.064516129032258,
"grad_norm": 1.2601240873336792,
"learning_rate": 2.3305157726943327e-05,
"loss": 0.9813694357872009,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 2.66811,
"step": 387,
"tokens/total": 792576,
"tokens/train_per_sec_per_gpu": 263.69,
"tokens/trainable": 783227
},
{
"epoch": 5.066532258064516,
"grad_norm": 1.640095829963684,
"learning_rate": 2.290024976084052e-05,
"loss": 1.22378671169281,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.40004,
"step": 388,
"tokens/total": 794624,
"tokens/train_per_sec_per_gpu": 262.79,
"tokens/trainable": 785258
},
{
"epoch": 5.068548387096774,
"grad_norm": 1.4825721979141235,
"learning_rate": 2.2498434851654126e-05,
"loss": 1.0400973558425903,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 2.82949,
"step": 389,
"tokens/total": 796672,
"tokens/train_per_sec_per_gpu": 261.9,
"tokens/trainable": 787284
},
{
"epoch": 5.070564516129032,
"grad_norm": 1.561867117881775,
"learning_rate": 2.209972911923377e-05,
"loss": 0.9926523566246033,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 2.69838,
"step": 390,
"tokens/total": 798720,
"tokens/train_per_sec_per_gpu": 262.8,
"tokens/trainable": 789319
},
{
"epoch": 5.07258064516129,
"grad_norm": 1.9464138746261597,
"learning_rate": 2.170414855869647e-05,
"loss": 1.26716148853302,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.55076,
"step": 391,
"tokens/total": 800768,
"tokens/train_per_sec_per_gpu": 265.31,
"tokens/trainable": 791355
},
{
"epoch": 5.074596774193548,
"grad_norm": 1.5921978950500488,
"learning_rate": 2.1311709039784734e-05,
"loss": 1.1754467487335205,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.23959,
"step": 392,
"tokens/total": 802816,
"tokens/train_per_sec_per_gpu": 258.36,
"tokens/trainable": 793389
},
{
"epoch": 5.076612903225806,
"grad_norm": 1.8206204175949097,
"learning_rate": 2.092242630623016e-05,
"loss": 1.431692123413086,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.18578,
"step": 393,
"tokens/total": 804864,
"tokens/train_per_sec_per_gpu": 266.26,
"tokens/trainable": 795424
},
{
"epoch": 5.078629032258065,
"grad_norm": 1.725522518157959,
"learning_rate": 2.0536315975121544e-05,
"loss": 1.7849311828613281,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.95917,
"step": 394,
"tokens/total": 806912,
"tokens/train_per_sec_per_gpu": 270.11,
"tokens/trainable": 797452
},
{
"epoch": 5.080645161290323,
"grad_norm": 1.5005055665969849,
"learning_rate": 2.0153393536278653e-05,
"loss": 1.2330188751220703,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.43157,
"step": 395,
"tokens/total": 808960,
"tokens/train_per_sec_per_gpu": 267.03,
"tokens/trainable": 799477
},
{
"epoch": 5.082661290322581,
"grad_norm": 1.4876439571380615,
"learning_rate": 1.9773674351630545e-05,
"loss": 1.4646284580230713,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.32594,
"step": 396,
"tokens/total": 811008,
"tokens/train_per_sec_per_gpu": 265.64,
"tokens/trainable": 801522
},
{
"epoch": 5.084677419354839,
"grad_norm": 1.5197019577026367,
"learning_rate": 1.939717365459952e-05,
"loss": 1.2707014083862305,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.56335,
"step": 397,
"tokens/total": 813056,
"tokens/train_per_sec_per_gpu": 265.76,
"tokens/trainable": 803563
},
{
"epoch": 5.086693548387097,
"grad_norm": 1.59368097782135,
"learning_rate": 1.9023906549489767e-05,
"loss": 1.4219883680343628,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.14535,
"step": 398,
"tokens/total": 815104,
"tokens/train_per_sec_per_gpu": 262.05,
"tokens/trainable": 805600
},
{
"epoch": 5.088709677419355,
"grad_norm": 1.5910316705703735,
"learning_rate": 1.8653888010881637e-05,
"loss": 1.5422284603118896,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.675,
"step": 399,
"tokens/total": 817152,
"tokens/train_per_sec_per_gpu": 262.99,
"tokens/trainable": 807629
},
{
"epoch": 5.090725806451613,
"grad_norm": 1.5295209884643555,
"learning_rate": 1.82871328830307e-05,
"loss": 1.1745970249176025,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.23684,
"step": 400,
"tokens/total": 819200,
"tokens/train_per_sec_per_gpu": 256.08,
"tokens/trainable": 809674
},
{
"epoch": 5.092741935483871,
"grad_norm": 1.8055143356323242,
"learning_rate": 1.7923655879272393e-05,
"loss": 1.7367737293243408,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 5.67899,
"step": 401,
"tokens/total": 821248,
"tokens/train_per_sec_per_gpu": 262.84,
"tokens/trainable": 811712
},
{
"epoch": 5.094758064516129,
"grad_norm": 1.5461682081222534,
"learning_rate": 1.7563471581431624e-05,
"loss": 1.4324666261672974,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.18902,
"step": 402,
"tokens/total": 823296,
"tokens/train_per_sec_per_gpu": 265.47,
"tokens/trainable": 813752
},
{
"epoch": 5.096774193548387,
"grad_norm": 1.5001784563064575,
"learning_rate": 1.7206594439237865e-05,
"loss": 1.1119897365570068,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.0404,
"step": 403,
"tokens/total": 825344,
"tokens/train_per_sec_per_gpu": 262.3,
"tokens/trainable": 815783
},
{
"epoch": 5.098790322580645,
"grad_norm": 1.5779640674591064,
"learning_rate": 1.6853038769745467e-05,
"loss": 1.464457392692566,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 4.3252,
"step": 404,
"tokens/total": 827392,
"tokens/train_per_sec_per_gpu": 261.96,
"tokens/trainable": 817804
},
{
"epoch": 5.100806451612903,
"grad_norm": 1.684232473373413,
"learning_rate": 1.6502818756759276e-05,
"loss": 1.3644180297851562,
"memory/device_reserved (GiB)": 8.46,
"memory/max_active (GiB)": 4.41,
"memory/max_allocated (GiB)": 4.34,
"ppl": 3.91344,
"step": 405,
"tokens/total": 829440,
"tokens/train_per_sec_per_gpu": 265.87,
"tokens/trainable": 819840
}
],
"logging_steps": 1,
"max_steps": 496,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 45,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.298203288338432e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}