Simp_1_2_2026 / trainer_state.json
anhnct's picture
Upload folder using huggingface_hub
1e84e85 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 13724,
"global_step": 68625,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0,
"eval_loss": 8.86070728302002,
"eval_runtime": 579.5301,
"eval_samples_per_second": 523.403,
"eval_steps_per_second": 43.618,
"memory/device_mem_reserved(gib)": 28.34,
"memory/max_mem_active(gib)": 28.24,
"memory/max_mem_allocated(gib)": 28.24,
"step": 0
},
{
"epoch": 0.0036430536075338347,
"grad_norm": 4.34375,
"learning_rate": 0.0001999999601433486,
"loss": 6.5752,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50
},
{
"epoch": 0.007286107215067669,
"grad_norm": 3.015625,
"learning_rate": 0.00019999979243626065,
"loss": 5.425,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 100
},
{
"epoch": 0.010929160822601505,
"grad_norm": 3.609375,
"learning_rate": 0.0001999994937082425,
"loss": 5.221,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 150
},
{
"epoch": 0.014572214430135339,
"grad_norm": 2.765625,
"learning_rate": 0.0001999990639596855,
"loss": 5.1478,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 200
},
{
"epoch": 0.018215268037669173,
"grad_norm": 2.375,
"learning_rate": 0.00019999850319115273,
"loss": 5.0803,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 250
},
{
"epoch": 0.02185832164520301,
"grad_norm": 3.921875,
"learning_rate": 0.0001999978114033789,
"loss": 5.0557,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 300
},
{
"epoch": 0.025501375252736845,
"grad_norm": 3.28125,
"learning_rate": 0.00019999698859727048,
"loss": 5.0235,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 350
},
{
"epoch": 0.029144428860270678,
"grad_norm": 3.453125,
"learning_rate": 0.0001999960347739054,
"loss": 4.9791,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 400
},
{
"epoch": 0.03278748246780452,
"grad_norm": 892.0,
"learning_rate": 0.00019999494993453346,
"loss": 4.9892,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 450
},
{
"epoch": 0.036430536075338346,
"grad_norm": 3.21875,
"learning_rate": 0.00019999373408057598,
"loss": 4.9557,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 500
},
{
"epoch": 0.04007358968287218,
"grad_norm": 2.84375,
"learning_rate": 0.000199992387213626,
"loss": 4.9263,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 550
},
{
"epoch": 0.04371664329040602,
"grad_norm": 1.8515625,
"learning_rate": 0.0001999909093354482,
"loss": 4.9129,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 600
},
{
"epoch": 0.047359696897939854,
"grad_norm": 3.046875,
"learning_rate": 0.00019998930044797897,
"loss": 4.9079,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 650
},
{
"epoch": 0.05100275050547369,
"grad_norm": 3.0,
"learning_rate": 0.0001999875605533262,
"loss": 4.9052,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 700
},
{
"epoch": 0.054645804113007526,
"grad_norm": 2.796875,
"learning_rate": 0.00019998568965376955,
"loss": 4.8854,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 750
},
{
"epoch": 0.058288857720541355,
"grad_norm": 2.0625,
"learning_rate": 0.00019998368775176034,
"loss": 4.8794,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 800
},
{
"epoch": 0.06193191132807519,
"grad_norm": 3.171875,
"learning_rate": 0.00019998155484992144,
"loss": 4.8675,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 850
},
{
"epoch": 0.06557496493560903,
"grad_norm": 2.234375,
"learning_rate": 0.00019997929095104744,
"loss": 4.8586,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 900
},
{
"epoch": 0.06921801854314286,
"grad_norm": 4.65625,
"learning_rate": 0.0001999768960581045,
"loss": 4.8677,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 950
},
{
"epoch": 0.07286107215067669,
"grad_norm": 2.671875,
"learning_rate": 0.00019997437017423044,
"loss": 4.8616,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1000
},
{
"epoch": 0.07650412575821053,
"grad_norm": 2.34375,
"learning_rate": 0.0001999717133027347,
"loss": 4.859,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1050
},
{
"epoch": 0.08014717936574436,
"grad_norm": 2.171875,
"learning_rate": 0.00019996892544709834,
"loss": 4.8502,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1100
},
{
"epoch": 0.0837902329732782,
"grad_norm": 2.65625,
"learning_rate": 0.0001999660066109741,
"loss": 4.8431,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1150
},
{
"epoch": 0.08743328658081204,
"grad_norm": 2.03125,
"learning_rate": 0.00019996295679818618,
"loss": 4.8409,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1200
},
{
"epoch": 0.09107634018834587,
"grad_norm": 2.171875,
"learning_rate": 0.00019995977601273052,
"loss": 4.8311,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1250
},
{
"epoch": 0.09471939379587971,
"grad_norm": 1.921875,
"learning_rate": 0.00019995646425877467,
"loss": 4.8255,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1300
},
{
"epoch": 0.09836244740341354,
"grad_norm": 1.65625,
"learning_rate": 0.0001999530215406577,
"loss": 4.8379,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1350
},
{
"epoch": 0.10200550101094738,
"grad_norm": 1.5234375,
"learning_rate": 0.00019994944786289027,
"loss": 4.8341,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1400
},
{
"epoch": 0.10564855461848122,
"grad_norm": 3.328125,
"learning_rate": 0.0001999457432301547,
"loss": 4.8198,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1450
},
{
"epoch": 0.10929160822601505,
"grad_norm": 2.0,
"learning_rate": 0.0001999419076473048,
"loss": 4.8208,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1500
},
{
"epoch": 0.11293466183354887,
"grad_norm": 1.6796875,
"learning_rate": 0.00019993794111936603,
"loss": 4.811,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1550
},
{
"epoch": 0.11657771544108271,
"grad_norm": 1.7109375,
"learning_rate": 0.0001999338436515354,
"loss": 4.8103,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1600
},
{
"epoch": 0.12022076904861655,
"grad_norm": 2.25,
"learning_rate": 0.00019992961524918138,
"loss": 4.8211,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1650
},
{
"epoch": 0.12386382265615038,
"grad_norm": 1.640625,
"learning_rate": 0.00019992525591784418,
"loss": 4.7974,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1700
},
{
"epoch": 0.12750687626368423,
"grad_norm": 4.90625,
"learning_rate": 0.00019992076566323537,
"loss": 4.8109,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1750
},
{
"epoch": 0.13114992987121807,
"grad_norm": 2.5,
"learning_rate": 0.00019991614449123816,
"loss": 4.8088,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1800
},
{
"epoch": 0.13479298347875188,
"grad_norm": 2.765625,
"learning_rate": 0.00019991139240790727,
"loss": 4.8129,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1850
},
{
"epoch": 0.1384360370862857,
"grad_norm": 2.5,
"learning_rate": 0.00019990650941946892,
"loss": 4.7992,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1900
},
{
"epoch": 0.14207909069381955,
"grad_norm": 1.5703125,
"learning_rate": 0.0001999014955323209,
"loss": 4.7891,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 1950
},
{
"epoch": 0.14572214430135338,
"grad_norm": 3.015625,
"learning_rate": 0.00019989635075303244,
"loss": 4.807,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2000
},
{
"epoch": 0.14936519790888722,
"grad_norm": 1.9609375,
"learning_rate": 0.00019989107508834426,
"loss": 4.7982,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2050
},
{
"epoch": 0.15300825151642106,
"grad_norm": 4.1875,
"learning_rate": 0.00019988566854516865,
"loss": 4.7876,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2100
},
{
"epoch": 0.1566513051239549,
"grad_norm": 2.296875,
"learning_rate": 0.00019988013113058931,
"loss": 4.781,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2150
},
{
"epoch": 0.16029435873148873,
"grad_norm": 1.9296875,
"learning_rate": 0.00019987446285186144,
"loss": 4.7979,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2200
},
{
"epoch": 0.16393741233902256,
"grad_norm": 1.78125,
"learning_rate": 0.00019986866371641163,
"loss": 4.7943,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2250
},
{
"epoch": 0.1675804659465564,
"grad_norm": 1.9375,
"learning_rate": 0.00019986273373183807,
"loss": 4.7746,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2300
},
{
"epoch": 0.17122351955409024,
"grad_norm": 1.65625,
"learning_rate": 0.00019985667290591024,
"loss": 4.7891,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2350
},
{
"epoch": 0.17486657316162407,
"grad_norm": 1.7890625,
"learning_rate": 0.00019985048124656908,
"loss": 4.7855,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2400
},
{
"epoch": 0.1785096267691579,
"grad_norm": 1.7265625,
"learning_rate": 0.00019984415876192705,
"loss": 4.7557,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2450
},
{
"epoch": 0.18215268037669174,
"grad_norm": 1.71875,
"learning_rate": 0.00019983770546026786,
"loss": 4.792,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2500
},
{
"epoch": 0.18579573398422558,
"grad_norm": 2.09375,
"learning_rate": 0.00019983112135004677,
"loss": 4.7808,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2550
},
{
"epoch": 0.18943878759175942,
"grad_norm": 1.6171875,
"learning_rate": 0.00019982440643989035,
"loss": 4.7753,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2600
},
{
"epoch": 0.19308184119929325,
"grad_norm": 1.9921875,
"learning_rate": 0.0001998175607385965,
"loss": 4.7718,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2650
},
{
"epoch": 0.1967248948068271,
"grad_norm": 1.640625,
"learning_rate": 0.00019981058425513464,
"loss": 4.7856,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2700
},
{
"epoch": 0.20036794841436092,
"grad_norm": 1.625,
"learning_rate": 0.00019980347699864533,
"loss": 4.7822,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2750
},
{
"epoch": 0.20401100202189476,
"grad_norm": 1.8359375,
"learning_rate": 0.0001997962389784407,
"loss": 4.7734,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2800
},
{
"epoch": 0.2076540556294286,
"grad_norm": 1.828125,
"learning_rate": 0.00019978887020400402,
"loss": 4.7677,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2850
},
{
"epoch": 0.21129710923696243,
"grad_norm": 3.078125,
"learning_rate": 0.00019978137068498995,
"loss": 4.7827,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2900
},
{
"epoch": 0.21494016284449627,
"grad_norm": 1.7421875,
"learning_rate": 0.00019977374043122446,
"loss": 4.76,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 2950
},
{
"epoch": 0.2185832164520301,
"grad_norm": 2.59375,
"learning_rate": 0.00019976597945270478,
"loss": 4.7473,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3000
},
{
"epoch": 0.2222262700595639,
"grad_norm": 1.8828125,
"learning_rate": 0.00019975808775959947,
"loss": 4.7717,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3050
},
{
"epoch": 0.22586932366709775,
"grad_norm": 2.046875,
"learning_rate": 0.0001997500653622483,
"loss": 4.7739,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3100
},
{
"epoch": 0.22951237727463158,
"grad_norm": 1.734375,
"learning_rate": 0.00019974191227116234,
"loss": 4.7492,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3150
},
{
"epoch": 0.23315543088216542,
"grad_norm": 2.28125,
"learning_rate": 0.00019973362849702383,
"loss": 4.7625,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3200
},
{
"epoch": 0.23679848448969926,
"grad_norm": 2.265625,
"learning_rate": 0.00019972521405068626,
"loss": 4.7515,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3250
},
{
"epoch": 0.2404415380972331,
"grad_norm": 1.7890625,
"learning_rate": 0.0001997166689431744,
"loss": 4.7716,
"memory/device_mem_reserved(gib)": 63.02,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3300
},
{
"epoch": 0.24408459170476693,
"grad_norm": 2.328125,
"learning_rate": 0.00019970799318568412,
"loss": 4.7701,
"memory/device_mem_reserved(gib)": 63.21,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3350
},
{
"epoch": 0.24772764531230076,
"grad_norm": 1.890625,
"learning_rate": 0.0001996991867895825,
"loss": 4.7435,
"memory/device_mem_reserved(gib)": 63.21,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3400
},
{
"epoch": 0.2513706989198346,
"grad_norm": 1.640625,
"learning_rate": 0.00019969024976640776,
"loss": 4.7664,
"memory/device_mem_reserved(gib)": 63.21,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3450
},
{
"epoch": 0.25501375252736846,
"grad_norm": 1.5703125,
"learning_rate": 0.00019968118212786935,
"loss": 4.7539,
"memory/device_mem_reserved(gib)": 63.21,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3500
},
{
"epoch": 0.2586568061349023,
"grad_norm": 1.984375,
"learning_rate": 0.00019967198388584775,
"loss": 4.7536,
"memory/device_mem_reserved(gib)": 63.21,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3550
},
{
"epoch": 0.26229985974243614,
"grad_norm": 3.171875,
"learning_rate": 0.00019966265505239465,
"loss": 4.7535,
"memory/device_mem_reserved(gib)": 63.21,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3600
},
{
"epoch": 0.26594291334996994,
"grad_norm": 1.5703125,
"learning_rate": 0.00019965319563973276,
"loss": 4.7544,
"memory/device_mem_reserved(gib)": 63.21,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3650
},
{
"epoch": 0.26958596695750375,
"grad_norm": 1.7734375,
"learning_rate": 0.00019964360566025592,
"loss": 4.7627,
"memory/device_mem_reserved(gib)": 63.21,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3700
},
{
"epoch": 0.2732290205650376,
"grad_norm": 2.265625,
"learning_rate": 0.0001996338851265291,
"loss": 4.7506,
"memory/device_mem_reserved(gib)": 63.21,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3750
},
{
"epoch": 0.2768720741725714,
"grad_norm": 2.375,
"learning_rate": 0.00019962403405128818,
"loss": 4.7488,
"memory/device_mem_reserved(gib)": 63.21,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3800
},
{
"epoch": 0.2805151277801053,
"grad_norm": 2.171875,
"learning_rate": 0.0001996140524474402,
"loss": 4.7349,
"memory/device_mem_reserved(gib)": 63.21,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3850
},
{
"epoch": 0.2841581813876391,
"grad_norm": 2.484375,
"learning_rate": 0.00019960394032806313,
"loss": 4.74,
"memory/device_mem_reserved(gib)": 63.21,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3900
},
{
"epoch": 0.28780123499517296,
"grad_norm": 1.3125,
"learning_rate": 0.00019959369770640605,
"loss": 4.7398,
"memory/device_mem_reserved(gib)": 63.21,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 3950
},
{
"epoch": 0.29144428860270677,
"grad_norm": 1.6328125,
"learning_rate": 0.0001995833245958889,
"loss": 4.7369,
"memory/device_mem_reserved(gib)": 63.21,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4000
},
{
"epoch": 0.29508734221024063,
"grad_norm": 1.8359375,
"learning_rate": 0.00019957282101010275,
"loss": 4.7469,
"memory/device_mem_reserved(gib)": 63.21,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4050
},
{
"epoch": 0.29873039581777444,
"grad_norm": 2.1875,
"learning_rate": 0.00019956218696280946,
"loss": 4.7504,
"memory/device_mem_reserved(gib)": 63.21,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4100
},
{
"epoch": 0.3023734494253083,
"grad_norm": 1.4453125,
"learning_rate": 0.00019955142246794188,
"loss": 4.7473,
"memory/device_mem_reserved(gib)": 63.21,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4150
},
{
"epoch": 0.3060165030328421,
"grad_norm": 1.84375,
"learning_rate": 0.0001995405275396038,
"loss": 4.7453,
"memory/device_mem_reserved(gib)": 63.21,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4200
},
{
"epoch": 0.309659556640376,
"grad_norm": 1.453125,
"learning_rate": 0.0001995295021920699,
"loss": 4.7269,
"memory/device_mem_reserved(gib)": 63.21,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4250
},
{
"epoch": 0.3133026102479098,
"grad_norm": 1.4921875,
"learning_rate": 0.0001995183464397857,
"loss": 4.7336,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4300
},
{
"epoch": 0.31694566385544365,
"grad_norm": 1.546875,
"learning_rate": 0.00019950706029736758,
"loss": 4.7422,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4350
},
{
"epoch": 0.32058871746297746,
"grad_norm": 1.59375,
"learning_rate": 0.00019949564377960281,
"loss": 4.7503,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4400
},
{
"epoch": 0.3242317710705113,
"grad_norm": 1.40625,
"learning_rate": 0.0001994840969014495,
"loss": 4.7461,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4450
},
{
"epoch": 0.32787482467804513,
"grad_norm": 1.3671875,
"learning_rate": 0.00019947241967803637,
"loss": 4.7384,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4500
},
{
"epoch": 0.331517878285579,
"grad_norm": 1.1875,
"learning_rate": 0.0001994606121246632,
"loss": 4.7454,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4550
},
{
"epoch": 0.3351609318931128,
"grad_norm": 1.6484375,
"learning_rate": 0.00019944867425680034,
"loss": 4.7483,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4600
},
{
"epoch": 0.33880398550064666,
"grad_norm": 1.4921875,
"learning_rate": 0.00019943660609008886,
"loss": 4.7328,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4650
},
{
"epoch": 0.3424470391081805,
"grad_norm": 1.6796875,
"learning_rate": 0.00019942440764034075,
"loss": 4.7382,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4700
},
{
"epoch": 0.34609009271571434,
"grad_norm": 2.78125,
"learning_rate": 0.00019941207892353843,
"loss": 4.729,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4750
},
{
"epoch": 0.34973314632324815,
"grad_norm": 2.15625,
"learning_rate": 0.0001993996199558352,
"loss": 4.7405,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4800
},
{
"epoch": 0.353376199930782,
"grad_norm": 1.3984375,
"learning_rate": 0.00019938703075355496,
"loss": 4.7504,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4850
},
{
"epoch": 0.3570192535383158,
"grad_norm": 1.3203125,
"learning_rate": 0.0001993743113331922,
"loss": 4.7415,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4900
},
{
"epoch": 0.3606623071458496,
"grad_norm": 1.84375,
"learning_rate": 0.0001993614617114121,
"loss": 4.7293,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 4950
},
{
"epoch": 0.3643053607533835,
"grad_norm": 1.234375,
"learning_rate": 0.00019934848190505036,
"loss": 4.7308,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5000
},
{
"epoch": 0.3679484143609173,
"grad_norm": 1.890625,
"learning_rate": 0.00019933537193111327,
"loss": 4.7361,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5050
},
{
"epoch": 0.37159146796845116,
"grad_norm": 1.46875,
"learning_rate": 0.0001993221318067777,
"loss": 4.7288,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5100
},
{
"epoch": 0.37523452157598497,
"grad_norm": 1.765625,
"learning_rate": 0.00019930876154939097,
"loss": 4.7409,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5150
},
{
"epoch": 0.37887757518351883,
"grad_norm": 1.75,
"learning_rate": 0.00019929526117647104,
"loss": 4.7261,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5200
},
{
"epoch": 0.38252062879105264,
"grad_norm": 1.453125,
"learning_rate": 0.00019928163070570619,
"loss": 4.7163,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5250
},
{
"epoch": 0.3861636823985865,
"grad_norm": 1.140625,
"learning_rate": 0.00019926787015495524,
"loss": 4.7419,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5300
},
{
"epoch": 0.3898067360061203,
"grad_norm": 2.578125,
"learning_rate": 0.00019925397954224746,
"loss": 4.723,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5350
},
{
"epoch": 0.3934497896136542,
"grad_norm": 1.7734375,
"learning_rate": 0.00019923995888578242,
"loss": 4.7147,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5400
},
{
"epoch": 0.397092843221188,
"grad_norm": 1.53125,
"learning_rate": 0.00019922580820393027,
"loss": 4.7327,
"memory/device_mem_reserved(gib)": 63.25,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5450
},
{
"epoch": 0.40073589682872185,
"grad_norm": 1.6875,
"learning_rate": 0.0001992115275152313,
"loss": 4.7336,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5500
},
{
"epoch": 0.40437895043625566,
"grad_norm": 1.9609375,
"learning_rate": 0.00019919711683839627,
"loss": 4.7355,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5550
},
{
"epoch": 0.4080220040437895,
"grad_norm": 3.140625,
"learning_rate": 0.00019918257619230627,
"loss": 4.7275,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5600
},
{
"epoch": 0.41166505765132333,
"grad_norm": 1.3671875,
"learning_rate": 0.00019916790559601258,
"loss": 4.7265,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5650
},
{
"epoch": 0.4153081112588572,
"grad_norm": 3.390625,
"learning_rate": 0.0001991531050687368,
"loss": 4.7165,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5700
},
{
"epoch": 0.418951164866391,
"grad_norm": 1.84375,
"learning_rate": 0.00019913817462987075,
"loss": 4.7273,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5750
},
{
"epoch": 0.42259421847392487,
"grad_norm": 1.7578125,
"learning_rate": 0.0001991231142989765,
"loss": 4.7246,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5800
},
{
"epoch": 0.4262372720814587,
"grad_norm": 1.3125,
"learning_rate": 0.00019910792409578624,
"loss": 4.7113,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5850
},
{
"epoch": 0.42988032568899254,
"grad_norm": 2.453125,
"learning_rate": 0.0001990926040402024,
"loss": 4.7308,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5900
},
{
"epoch": 0.43352337929652635,
"grad_norm": 2.375,
"learning_rate": 0.00019907715415229746,
"loss": 4.7258,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 5950
},
{
"epoch": 0.4371664329040602,
"grad_norm": 1.8359375,
"learning_rate": 0.00019906157445231406,
"loss": 4.7347,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6000
},
{
"epoch": 0.440809486511594,
"grad_norm": 1.359375,
"learning_rate": 0.00019904586496066493,
"loss": 4.7186,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6050
},
{
"epoch": 0.4444525401191278,
"grad_norm": 1.9765625,
"learning_rate": 0.00019903002569793282,
"loss": 4.7277,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6100
},
{
"epoch": 0.4480955937266617,
"grad_norm": 1.3125,
"learning_rate": 0.0001990140566848705,
"loss": 4.7184,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6150
},
{
"epoch": 0.4517386473341955,
"grad_norm": 1.375,
"learning_rate": 0.0001989979579424008,
"loss": 4.7278,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6200
},
{
"epoch": 0.45538170094172936,
"grad_norm": 1.3359375,
"learning_rate": 0.00019898172949161648,
"loss": 4.7344,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6250
},
{
"epoch": 0.45902475454926317,
"grad_norm": 2.734375,
"learning_rate": 0.00019896537135378025,
"loss": 4.7325,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6300
},
{
"epoch": 0.46266780815679703,
"grad_norm": 1.7578125,
"learning_rate": 0.00019894888355032468,
"loss": 4.7283,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6350
},
{
"epoch": 0.46631086176433084,
"grad_norm": 1.6953125,
"learning_rate": 0.0001989322661028524,
"loss": 4.7196,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6400
},
{
"epoch": 0.4699539153718647,
"grad_norm": 1.40625,
"learning_rate": 0.0001989155190331357,
"loss": 4.7179,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6450
},
{
"epoch": 0.4735969689793985,
"grad_norm": 1.1171875,
"learning_rate": 0.00019889864236311683,
"loss": 4.7332,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6500
},
{
"epoch": 0.4772400225869324,
"grad_norm": 3.234375,
"learning_rate": 0.0001988816361149078,
"loss": 4.7166,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6550
},
{
"epoch": 0.4808830761944662,
"grad_norm": 1.875,
"learning_rate": 0.0001988645003107904,
"loss": 4.7201,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6600
},
{
"epoch": 0.48452612980200005,
"grad_norm": 1.84375,
"learning_rate": 0.00019884723497321617,
"loss": 4.7344,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6650
},
{
"epoch": 0.48816918340953386,
"grad_norm": 1.390625,
"learning_rate": 0.00019882984012480634,
"loss": 4.7158,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6700
},
{
"epoch": 0.4918122370170677,
"grad_norm": 1.5078125,
"learning_rate": 0.00019881231578835185,
"loss": 4.7163,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6750
},
{
"epoch": 0.49545529062460153,
"grad_norm": 1.578125,
"learning_rate": 0.00019879466198681337,
"loss": 4.7198,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6800
},
{
"epoch": 0.4990983442321354,
"grad_norm": 1.921875,
"learning_rate": 0.00019877687874332104,
"loss": 4.7198,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6850
},
{
"epoch": 0.5027413978396692,
"grad_norm": 1.6875,
"learning_rate": 0.0001987589660811747,
"loss": 4.7289,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6900
},
{
"epoch": 0.506384451447203,
"grad_norm": 1.2265625,
"learning_rate": 0.00019874092402384375,
"loss": 4.7214,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 6950
},
{
"epoch": 0.5100275050547369,
"grad_norm": 1.4765625,
"learning_rate": 0.00019872275259496709,
"loss": 4.7201,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7000
},
{
"epoch": 0.5136705586622707,
"grad_norm": 1.6484375,
"learning_rate": 0.00019870445181835317,
"loss": 4.7091,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7050
},
{
"epoch": 0.5173136122698045,
"grad_norm": 1.1796875,
"learning_rate": 0.00019868602171797985,
"loss": 4.7131,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7100
},
{
"epoch": 0.5209566658773384,
"grad_norm": 1.171875,
"learning_rate": 0.00019866746231799452,
"loss": 4.7132,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7150
},
{
"epoch": 0.5245997194848723,
"grad_norm": 1.40625,
"learning_rate": 0.00019864877364271388,
"loss": 4.7088,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7200
},
{
"epoch": 0.5282427730924061,
"grad_norm": 1.140625,
"learning_rate": 0.0001986299557166241,
"loss": 4.7075,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7250
},
{
"epoch": 0.5318858266999399,
"grad_norm": 1.453125,
"learning_rate": 0.0001986110085643806,
"loss": 4.7243,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7300
},
{
"epoch": 0.5355288803074737,
"grad_norm": 1.3203125,
"learning_rate": 0.00019859193221080824,
"loss": 4.7224,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7350
},
{
"epoch": 0.5391719339150075,
"grad_norm": 1.5078125,
"learning_rate": 0.00019857272668090102,
"loss": 4.703,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7400
},
{
"epoch": 0.5428149875225414,
"grad_norm": 1.59375,
"learning_rate": 0.0001985533919998223,
"loss": 4.7123,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7450
},
{
"epoch": 0.5464580411300752,
"grad_norm": 2.546875,
"learning_rate": 0.00019853392819290457,
"loss": 4.7179,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7500
},
{
"epoch": 0.550101094737609,
"grad_norm": 2.625,
"learning_rate": 0.0001985143352856496,
"loss": 4.7139,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7550
},
{
"epoch": 0.5537441483451429,
"grad_norm": 1.2890625,
"learning_rate": 0.0001984946133037282,
"loss": 4.7175,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7600
},
{
"epoch": 0.5573872019526768,
"grad_norm": 2.0,
"learning_rate": 0.00019847476227298038,
"loss": 4.7202,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7650
},
{
"epoch": 0.5610302555602106,
"grad_norm": 1.2734375,
"learning_rate": 0.00019845478221941517,
"loss": 4.7185,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7700
},
{
"epoch": 0.5646733091677444,
"grad_norm": 1.6484375,
"learning_rate": 0.0001984346731692107,
"loss": 4.7153,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7750
},
{
"epoch": 0.5683163627752782,
"grad_norm": 1.6640625,
"learning_rate": 0.0001984144351487141,
"loss": 4.723,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7800
},
{
"epoch": 0.5719594163828121,
"grad_norm": 2.1875,
"learning_rate": 0.00019839406818444145,
"loss": 4.7072,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7850
},
{
"epoch": 0.5756024699903459,
"grad_norm": 1.75,
"learning_rate": 0.00019837357230307776,
"loss": 4.706,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7900
},
{
"epoch": 0.5792455235978797,
"grad_norm": 2.078125,
"learning_rate": 0.00019835294753147703,
"loss": 4.6992,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 7950
},
{
"epoch": 0.5828885772054135,
"grad_norm": 1.6484375,
"learning_rate": 0.00019833219389666206,
"loss": 4.7252,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8000
},
{
"epoch": 0.5865316308129475,
"grad_norm": 1.4375,
"learning_rate": 0.00019831131142582453,
"loss": 4.6982,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8050
},
{
"epoch": 0.5901746844204813,
"grad_norm": 1.375,
"learning_rate": 0.0001982903001463249,
"loss": 4.7183,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8100
},
{
"epoch": 0.5938177380280151,
"grad_norm": 1.578125,
"learning_rate": 0.0001982691600856924,
"loss": 4.7046,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8150
},
{
"epoch": 0.5974607916355489,
"grad_norm": 2.078125,
"learning_rate": 0.00019824789127162492,
"loss": 4.7002,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8200
},
{
"epoch": 0.6011038452430828,
"grad_norm": 1.2265625,
"learning_rate": 0.00019822649373198926,
"loss": 4.7226,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8250
},
{
"epoch": 0.6047468988506166,
"grad_norm": 1.1328125,
"learning_rate": 0.00019820496749482062,
"loss": 4.7044,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8300
},
{
"epoch": 0.6083899524581504,
"grad_norm": 1.4921875,
"learning_rate": 0.00019818331258832298,
"loss": 4.7108,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8350
},
{
"epoch": 0.6120330060656842,
"grad_norm": 1.65625,
"learning_rate": 0.00019816152904086884,
"loss": 4.7146,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8400
},
{
"epoch": 0.6156760596732181,
"grad_norm": 1.3515625,
"learning_rate": 0.00019813961688099925,
"loss": 4.7052,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8450
},
{
"epoch": 0.619319113280752,
"grad_norm": 1.375,
"learning_rate": 0.00019811757613742383,
"loss": 4.7054,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8500
},
{
"epoch": 0.6229621668882858,
"grad_norm": 1.484375,
"learning_rate": 0.0001980954068390206,
"loss": 4.7186,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8550
},
{
"epoch": 0.6266052204958196,
"grad_norm": 3.515625,
"learning_rate": 0.00019807310901483608,
"loss": 4.7105,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8600
},
{
"epoch": 0.6302482741033534,
"grad_norm": 1.4140625,
"learning_rate": 0.00019805068269408512,
"loss": 4.7019,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8650
},
{
"epoch": 0.6338913277108873,
"grad_norm": 2.359375,
"learning_rate": 0.0001980281279061509,
"loss": 4.7042,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8700
},
{
"epoch": 0.6375343813184211,
"grad_norm": 2.796875,
"learning_rate": 0.00019800544468058504,
"loss": 4.7089,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8750
},
{
"epoch": 0.6411774349259549,
"grad_norm": 1.2109375,
"learning_rate": 0.00019798263304710739,
"loss": 4.6996,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8800
},
{
"epoch": 0.6448204885334887,
"grad_norm": 1.390625,
"learning_rate": 0.00019795969303560595,
"loss": 4.705,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8850
},
{
"epoch": 0.6484635421410226,
"grad_norm": 1.46875,
"learning_rate": 0.00019793662467613708,
"loss": 4.704,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8900
},
{
"epoch": 0.6521065957485565,
"grad_norm": 1.15625,
"learning_rate": 0.00019791342799892515,
"loss": 4.7013,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 8950
},
{
"epoch": 0.6557496493560903,
"grad_norm": 2.09375,
"learning_rate": 0.0001978901030343628,
"loss": 4.7052,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9000
},
{
"epoch": 0.6593927029636241,
"grad_norm": 1.40625,
"learning_rate": 0.00019786664981301063,
"loss": 4.6984,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9050
},
{
"epoch": 0.663035756571158,
"grad_norm": 2.15625,
"learning_rate": 0.00019784306836559732,
"loss": 4.7043,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9100
},
{
"epoch": 0.6666788101786918,
"grad_norm": 1.3203125,
"learning_rate": 0.00019781935872301962,
"loss": 4.7079,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9150
},
{
"epoch": 0.6703218637862256,
"grad_norm": 1.09375,
"learning_rate": 0.00019779552091634214,
"loss": 4.6956,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9200
},
{
"epoch": 0.6739649173937594,
"grad_norm": 1.5,
"learning_rate": 0.00019777155497679747,
"loss": 4.714,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9250
},
{
"epoch": 0.6776079710012933,
"grad_norm": 2.59375,
"learning_rate": 0.0001977474609357861,
"loss": 4.7094,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9300
},
{
"epoch": 0.6812510246088271,
"grad_norm": 1.3515625,
"learning_rate": 0.00019772323882487632,
"loss": 4.7011,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9350
},
{
"epoch": 0.684894078216361,
"grad_norm": 1.6953125,
"learning_rate": 0.0001976988886758042,
"loss": 4.6987,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9400
},
{
"epoch": 0.6885371318238948,
"grad_norm": 2.59375,
"learning_rate": 0.00019767441052047363,
"loss": 4.7024,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9450
},
{
"epoch": 0.6921801854314287,
"grad_norm": 1.140625,
"learning_rate": 0.00019764980439095618,
"loss": 4.7205,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9500
},
{
"epoch": 0.6958232390389625,
"grad_norm": 1.6484375,
"learning_rate": 0.00019762507031949108,
"loss": 4.7038,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9550
},
{
"epoch": 0.6994662926464963,
"grad_norm": 1.609375,
"learning_rate": 0.00019760020833848522,
"loss": 4.7058,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9600
},
{
"epoch": 0.7031093462540301,
"grad_norm": 1.6953125,
"learning_rate": 0.00019757521848051308,
"loss": 4.7002,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9650
},
{
"epoch": 0.706752399861564,
"grad_norm": 1.7421875,
"learning_rate": 0.00019755010077831666,
"loss": 4.7137,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9700
},
{
"epoch": 0.7103954534690978,
"grad_norm": 1.5625,
"learning_rate": 0.00019752485526480546,
"loss": 4.717,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9750
},
{
"epoch": 0.7140385070766316,
"grad_norm": 1.4140625,
"learning_rate": 0.0001974994819730565,
"loss": 4.6963,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9800
},
{
"epoch": 0.7176815606841654,
"grad_norm": 2.296875,
"learning_rate": 0.0001974739809363141,
"loss": 4.7155,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9850
},
{
"epoch": 0.7213246142916993,
"grad_norm": 1.171875,
"learning_rate": 0.00019744835218799009,
"loss": 4.7063,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9900
},
{
"epoch": 0.7249676678992332,
"grad_norm": 1.828125,
"learning_rate": 0.00019742259576166355,
"loss": 4.7001,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 9950
},
{
"epoch": 0.728610721506767,
"grad_norm": 2.328125,
"learning_rate": 0.00019739671169108082,
"loss": 4.7064,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10000
},
{
"epoch": 0.7322537751143008,
"grad_norm": 1.5390625,
"learning_rate": 0.0001973707000101556,
"loss": 4.7168,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10050
},
{
"epoch": 0.7358968287218346,
"grad_norm": 2.15625,
"learning_rate": 0.00019734456075296862,
"loss": 4.7028,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10100
},
{
"epoch": 0.7395398823293685,
"grad_norm": 1.4375,
"learning_rate": 0.00019731829395376786,
"loss": 4.7101,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10150
},
{
"epoch": 0.7431829359369023,
"grad_norm": 1.265625,
"learning_rate": 0.00019729189964696846,
"loss": 4.7066,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10200
},
{
"epoch": 0.7468259895444361,
"grad_norm": 1.234375,
"learning_rate": 0.0001972653778671525,
"loss": 4.7244,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10250
},
{
"epoch": 0.7504690431519699,
"grad_norm": 1.7734375,
"learning_rate": 0.00019723872864906917,
"loss": 4.7167,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10300
},
{
"epoch": 0.7541120967595039,
"grad_norm": 1.515625,
"learning_rate": 0.00019721195202763458,
"loss": 4.6993,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10350
},
{
"epoch": 0.7577551503670377,
"grad_norm": 2.0,
"learning_rate": 0.00019718504803793176,
"loss": 4.7053,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10400
},
{
"epoch": 0.7613982039745715,
"grad_norm": 1.390625,
"learning_rate": 0.0001971580167152107,
"loss": 4.7009,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10450
},
{
"epoch": 0.7650412575821053,
"grad_norm": 1.5859375,
"learning_rate": 0.00019713085809488812,
"loss": 4.6948,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10500
},
{
"epoch": 0.7686843111896392,
"grad_norm": 1.6953125,
"learning_rate": 0.00019710357221254757,
"loss": 4.6921,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10550
},
{
"epoch": 0.772327364797173,
"grad_norm": 3.84375,
"learning_rate": 0.00019707615910393933,
"loss": 4.7048,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10600
},
{
"epoch": 0.7759704184047068,
"grad_norm": 1.5390625,
"learning_rate": 0.00019704861880498042,
"loss": 4.7059,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10650
},
{
"epoch": 0.7796134720122406,
"grad_norm": 1.4375,
"learning_rate": 0.00019702095135175444,
"loss": 4.7035,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10700
},
{
"epoch": 0.7832565256197745,
"grad_norm": 1.40625,
"learning_rate": 0.00019699315678051166,
"loss": 4.7176,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10750
},
{
"epoch": 0.7868995792273084,
"grad_norm": 2.296875,
"learning_rate": 0.00019696523512766884,
"loss": 4.686,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10800
},
{
"epoch": 0.7905426328348422,
"grad_norm": 3.0,
"learning_rate": 0.0001969371864298092,
"loss": 4.7,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10850
},
{
"epoch": 0.794185686442376,
"grad_norm": 2.234375,
"learning_rate": 0.00019690901072368262,
"loss": 4.686,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10900
},
{
"epoch": 0.7978287400499098,
"grad_norm": 1.875,
"learning_rate": 0.00019688070804620513,
"loss": 4.6894,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 10950
},
{
"epoch": 0.8014717936574437,
"grad_norm": 1.796875,
"learning_rate": 0.00019685227843445926,
"loss": 4.7038,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11000
},
{
"epoch": 0.8051148472649775,
"grad_norm": 1.578125,
"learning_rate": 0.00019682372192569386,
"loss": 4.6949,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11050
},
{
"epoch": 0.8087579008725113,
"grad_norm": 1.2421875,
"learning_rate": 0.00019679503855732404,
"loss": 4.7035,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11100
},
{
"epoch": 0.8124009544800451,
"grad_norm": 1.34375,
"learning_rate": 0.00019676622836693102,
"loss": 4.6992,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11150
},
{
"epoch": 0.816044008087579,
"grad_norm": 4.0625,
"learning_rate": 0.00019673729139226229,
"loss": 4.6937,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11200
},
{
"epoch": 0.8196870616951129,
"grad_norm": 1.0859375,
"learning_rate": 0.00019670822767123142,
"loss": 4.6948,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11250
},
{
"epoch": 0.8233301153026467,
"grad_norm": 1.59375,
"learning_rate": 0.00019667903724191805,
"loss": 4.6816,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11300
},
{
"epoch": 0.8269731689101805,
"grad_norm": 1.40625,
"learning_rate": 0.00019664972014256783,
"loss": 4.6848,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11350
},
{
"epoch": 0.8306162225177144,
"grad_norm": 1.15625,
"learning_rate": 0.0001966202764115924,
"loss": 4.6876,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11400
},
{
"epoch": 0.8342592761252482,
"grad_norm": 2.21875,
"learning_rate": 0.00019659070608756926,
"loss": 4.7076,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11450
},
{
"epoch": 0.837902329732782,
"grad_norm": 1.359375,
"learning_rate": 0.0001965610092092418,
"loss": 4.6935,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11500
},
{
"epoch": 0.8415453833403158,
"grad_norm": 1.3203125,
"learning_rate": 0.00019653118581551925,
"loss": 4.693,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11550
},
{
"epoch": 0.8451884369478497,
"grad_norm": 4.375,
"learning_rate": 0.00019650123594547656,
"loss": 4.6968,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11600
},
{
"epoch": 0.8488314905553835,
"grad_norm": 1.3515625,
"learning_rate": 0.00019647115963835444,
"loss": 4.6935,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11650
},
{
"epoch": 0.8524745441629173,
"grad_norm": 1.6484375,
"learning_rate": 0.00019644095693355915,
"loss": 4.6961,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11700
},
{
"epoch": 0.8561175977704512,
"grad_norm": 3.046875,
"learning_rate": 0.00019641062787066274,
"loss": 4.7102,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11750
},
{
"epoch": 0.8597606513779851,
"grad_norm": 1.40625,
"learning_rate": 0.00019638017248940263,
"loss": 4.722,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11800
},
{
"epoch": 0.8634037049855189,
"grad_norm": 1.9453125,
"learning_rate": 0.0001963495908296819,
"loss": 4.6987,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11850
},
{
"epoch": 0.8670467585930527,
"grad_norm": 2.53125,
"learning_rate": 0.00019631888293156896,
"loss": 4.6952,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11900
},
{
"epoch": 0.8706898122005865,
"grad_norm": 2.75,
"learning_rate": 0.00019628804883529765,
"loss": 4.6898,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 11950
},
{
"epoch": 0.8743328658081204,
"grad_norm": 2.71875,
"learning_rate": 0.00019625708858126727,
"loss": 4.7085,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12000
},
{
"epoch": 0.8779759194156542,
"grad_norm": 1.734375,
"learning_rate": 0.00019622600221004218,
"loss": 4.6967,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12050
},
{
"epoch": 0.881618973023188,
"grad_norm": 1.65625,
"learning_rate": 0.0001961947897623522,
"loss": 4.7088,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12100
},
{
"epoch": 0.8852620266307218,
"grad_norm": 1.5390625,
"learning_rate": 0.00019616345127909226,
"loss": 4.6893,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12150
},
{
"epoch": 0.8889050802382557,
"grad_norm": 3.375,
"learning_rate": 0.0001961319868013224,
"loss": 4.6895,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12200
},
{
"epoch": 0.8925481338457896,
"grad_norm": 1.2578125,
"learning_rate": 0.00019610039637026774,
"loss": 4.6961,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12250
},
{
"epoch": 0.8961911874533234,
"grad_norm": 2.03125,
"learning_rate": 0.00019606868002731845,
"loss": 4.7013,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12300
},
{
"epoch": 0.8998342410608572,
"grad_norm": 1.328125,
"learning_rate": 0.00019603683781402968,
"loss": 4.7018,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12350
},
{
"epoch": 0.903477294668391,
"grad_norm": 1.9296875,
"learning_rate": 0.00019600486977212146,
"loss": 4.703,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12400
},
{
"epoch": 0.9071203482759249,
"grad_norm": 1.3125,
"learning_rate": 0.00019597277594347875,
"loss": 4.704,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12450
},
{
"epoch": 0.9107634018834587,
"grad_norm": 3.625,
"learning_rate": 0.0001959405563701512,
"loss": 4.7039,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12500
},
{
"epoch": 0.9144064554909925,
"grad_norm": 1.6484375,
"learning_rate": 0.00019590821109435335,
"loss": 4.6941,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12550
},
{
"epoch": 0.9180495090985263,
"grad_norm": 1.75,
"learning_rate": 0.00019587574015846433,
"loss": 4.7073,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12600
},
{
"epoch": 0.9216925627060603,
"grad_norm": 1.4921875,
"learning_rate": 0.00019584314360502792,
"loss": 4.6859,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12650
},
{
"epoch": 0.9253356163135941,
"grad_norm": 2.640625,
"learning_rate": 0.00019581042147675257,
"loss": 4.7085,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12700
},
{
"epoch": 0.9289786699211279,
"grad_norm": 2.8125,
"learning_rate": 0.0001957775738165112,
"loss": 4.6923,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12750
},
{
"epoch": 0.9326217235286617,
"grad_norm": 1.59375,
"learning_rate": 0.00019574460066734123,
"loss": 4.7077,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12800
},
{
"epoch": 0.9362647771361956,
"grad_norm": 2.4375,
"learning_rate": 0.0001957115020724444,
"loss": 4.6849,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12850
},
{
"epoch": 0.9399078307437294,
"grad_norm": 1.1796875,
"learning_rate": 0.00019567827807518692,
"loss": 4.6974,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12900
},
{
"epoch": 0.9435508843512632,
"grad_norm": 4.4375,
"learning_rate": 0.0001956449287190993,
"loss": 4.695,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 12950
},
{
"epoch": 0.947193937958797,
"grad_norm": 1.9296875,
"learning_rate": 0.00019561145404787625,
"loss": 4.6926,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13000
},
{
"epoch": 0.950836991566331,
"grad_norm": 1.25,
"learning_rate": 0.0001955778541053767,
"loss": 4.6966,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13050
},
{
"epoch": 0.9544800451738648,
"grad_norm": 3.03125,
"learning_rate": 0.00019554412893562368,
"loss": 4.6981,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13100
},
{
"epoch": 0.9581230987813986,
"grad_norm": 1.4765625,
"learning_rate": 0.0001955102785828043,
"loss": 4.6908,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13150
},
{
"epoch": 0.9617661523889324,
"grad_norm": 1.796875,
"learning_rate": 0.00019547630309126973,
"loss": 4.6856,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13200
},
{
"epoch": 0.9654092059964663,
"grad_norm": 2.015625,
"learning_rate": 0.00019544220250553504,
"loss": 4.6967,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13250
},
{
"epoch": 0.9690522596040001,
"grad_norm": 1.4375,
"learning_rate": 0.00019540797687027928,
"loss": 4.7017,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13300
},
{
"epoch": 0.9726953132115339,
"grad_norm": 1.59375,
"learning_rate": 0.0001953736262303452,
"loss": 4.6947,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13350
},
{
"epoch": 0.9763383668190677,
"grad_norm": 1.328125,
"learning_rate": 0.0001953391506307395,
"loss": 4.6823,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13400
},
{
"epoch": 0.9799814204266015,
"grad_norm": 1.8125,
"learning_rate": 0.0001953045501166325,
"loss": 4.6847,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13450
},
{
"epoch": 0.9836244740341354,
"grad_norm": 2.671875,
"learning_rate": 0.0001952698247333582,
"loss": 4.7012,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13500
},
{
"epoch": 0.9872675276416693,
"grad_norm": 2.34375,
"learning_rate": 0.0001952349745264142,
"loss": 4.6827,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13550
},
{
"epoch": 0.9909105812492031,
"grad_norm": 1.3125,
"learning_rate": 0.00019519999954146174,
"loss": 4.7041,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13600
},
{
"epoch": 0.9945536348567369,
"grad_norm": 1.7890625,
"learning_rate": 0.00019516489982432535,
"loss": 4.6852,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13650
},
{
"epoch": 0.9981966884642708,
"grad_norm": 1.6484375,
"learning_rate": 0.00019512967542099316,
"loss": 4.6928,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13700
},
{
"epoch": 0.999945354195887,
"eval_loss": 4.698671340942383,
"eval_runtime": 581.7351,
"eval_samples_per_second": 521.419,
"eval_steps_per_second": 43.453,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13724
},
{
"epoch": 1.001821526803767,
"grad_norm": 1.4453125,
"learning_rate": 0.00019509432637761665,
"loss": 4.6839,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13750
},
{
"epoch": 1.0054645804113007,
"grad_norm": 1.875,
"learning_rate": 0.0001950588527405105,
"loss": 4.6792,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13800
},
{
"epoch": 1.0091076340188345,
"grad_norm": 1.3125,
"learning_rate": 0.00019502325455615267,
"loss": 4.683,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13850
},
{
"epoch": 1.0127506876263683,
"grad_norm": 1.921875,
"learning_rate": 0.0001949875318711844,
"loss": 4.6828,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13900
},
{
"epoch": 1.0163937412339024,
"grad_norm": 1.4609375,
"learning_rate": 0.00019495168473240994,
"loss": 4.6829,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 13950
},
{
"epoch": 1.0200367948414362,
"grad_norm": 1.6796875,
"learning_rate": 0.0001949157131867967,
"loss": 4.6819,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14000
},
{
"epoch": 1.02367984844897,
"grad_norm": 1.6875,
"learning_rate": 0.00019487961728147495,
"loss": 4.6945,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14050
},
{
"epoch": 1.0273229020565038,
"grad_norm": 1.4375,
"learning_rate": 0.000194843397063738,
"loss": 4.6835,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14100
},
{
"epoch": 1.0309659556640376,
"grad_norm": 1.5390625,
"learning_rate": 0.00019480705258104205,
"loss": 4.6919,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14150
},
{
"epoch": 1.0346090092715714,
"grad_norm": 1.4765625,
"learning_rate": 0.00019477058388100605,
"loss": 4.6791,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14200
},
{
"epoch": 1.0382520628791052,
"grad_norm": 1.0,
"learning_rate": 0.00019473399101141176,
"loss": 4.6751,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14250
},
{
"epoch": 1.041895116486639,
"grad_norm": 2.03125,
"learning_rate": 0.00019469727402020358,
"loss": 4.6755,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14300
},
{
"epoch": 1.045538170094173,
"grad_norm": 1.5625,
"learning_rate": 0.0001946604329554885,
"loss": 4.6791,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14350
},
{
"epoch": 1.0491812237017069,
"grad_norm": 1.453125,
"learning_rate": 0.0001946234678655362,
"loss": 4.6889,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14400
},
{
"epoch": 1.0528242773092407,
"grad_norm": 1.2421875,
"learning_rate": 0.00019458637879877876,
"loss": 4.687,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14450
},
{
"epoch": 1.0564673309167745,
"grad_norm": 1.453125,
"learning_rate": 0.00019454916580381075,
"loss": 4.6753,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14500
},
{
"epoch": 1.0601103845243083,
"grad_norm": 2.296875,
"learning_rate": 0.00019451182892938902,
"loss": 4.6795,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14550
},
{
"epoch": 1.063753438131842,
"grad_norm": 2.390625,
"learning_rate": 0.00019447436822443286,
"loss": 4.671,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14600
},
{
"epoch": 1.067396491739376,
"grad_norm": 2.875,
"learning_rate": 0.00019443678373802365,
"loss": 4.683,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14650
},
{
"epoch": 1.0710395453469097,
"grad_norm": 1.40625,
"learning_rate": 0.00019439907551940512,
"loss": 4.6855,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14700
},
{
"epoch": 1.0746825989544435,
"grad_norm": 1.4375,
"learning_rate": 0.000194361243617983,
"loss": 4.6963,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14750
},
{
"epoch": 1.0783256525619775,
"grad_norm": 1.7265625,
"learning_rate": 0.0001943232880833251,
"loss": 4.6866,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14800
},
{
"epoch": 1.0819687061695114,
"grad_norm": 1.203125,
"learning_rate": 0.00019428520896516122,
"loss": 4.6841,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14850
},
{
"epoch": 1.0856117597770452,
"grad_norm": 1.9609375,
"learning_rate": 0.00019424700631338304,
"loss": 4.6809,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14900
},
{
"epoch": 1.089254813384579,
"grad_norm": 2.640625,
"learning_rate": 0.00019420868017804423,
"loss": 4.6902,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 14950
},
{
"epoch": 1.0928978669921128,
"grad_norm": 1.4375,
"learning_rate": 0.00019417023060936005,
"loss": 4.6618,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15000
},
{
"epoch": 1.0965409205996466,
"grad_norm": 2.796875,
"learning_rate": 0.00019413165765770765,
"loss": 4.6778,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15050
},
{
"epoch": 1.1001839742071804,
"grad_norm": 2.1875,
"learning_rate": 0.00019409296137362577,
"loss": 4.6947,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15100
},
{
"epoch": 1.1038270278147142,
"grad_norm": 1.3203125,
"learning_rate": 0.00019405414180781469,
"loss": 4.6835,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15150
},
{
"epoch": 1.1074700814222482,
"grad_norm": 1.171875,
"learning_rate": 0.00019401519901113634,
"loss": 4.6801,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15200
},
{
"epoch": 1.111113135029782,
"grad_norm": 1.7734375,
"learning_rate": 0.00019397613303461403,
"loss": 4.6784,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15250
},
{
"epoch": 1.1147561886373158,
"grad_norm": 1.734375,
"learning_rate": 0.00019393694392943244,
"loss": 4.6779,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15300
},
{
"epoch": 1.1183992422448497,
"grad_norm": 1.2578125,
"learning_rate": 0.00019389763174693764,
"loss": 4.6905,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15350
},
{
"epoch": 1.1220422958523835,
"grad_norm": 1.515625,
"learning_rate": 0.0001938581965386369,
"loss": 4.6881,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15400
},
{
"epoch": 1.1256853494599173,
"grad_norm": 1.53125,
"learning_rate": 0.00019381863835619872,
"loss": 4.6866,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15450
},
{
"epoch": 1.129328403067451,
"grad_norm": 1.5625,
"learning_rate": 0.00019377895725145267,
"loss": 4.6806,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15500
},
{
"epoch": 1.132971456674985,
"grad_norm": 1.578125,
"learning_rate": 0.00019373915327638945,
"loss": 4.6933,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15550
},
{
"epoch": 1.1366145102825187,
"grad_norm": 2.28125,
"learning_rate": 0.0001936992264831607,
"loss": 4.6747,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15600
},
{
"epoch": 1.1402575638900527,
"grad_norm": 1.453125,
"learning_rate": 0.00019365917692407898,
"loss": 4.6811,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15650
},
{
"epoch": 1.1439006174975865,
"grad_norm": 1.53125,
"learning_rate": 0.0001936190046516177,
"loss": 4.6794,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15700
},
{
"epoch": 1.1475436711051203,
"grad_norm": 1.6640625,
"learning_rate": 0.00019357870971841104,
"loss": 4.6911,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15750
},
{
"epoch": 1.1511867247126542,
"grad_norm": 1.2109375,
"learning_rate": 0.00019353829217725398,
"loss": 4.6904,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15800
},
{
"epoch": 1.154829778320188,
"grad_norm": 2.171875,
"learning_rate": 0.00019349775208110198,
"loss": 4.6759,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15850
},
{
"epoch": 1.1584728319277218,
"grad_norm": 1.859375,
"learning_rate": 0.00019345708948307117,
"loss": 4.6844,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15900
},
{
"epoch": 1.1621158855352556,
"grad_norm": 1.46875,
"learning_rate": 0.00019341630443643824,
"loss": 4.6956,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 15950
},
{
"epoch": 1.1657589391427896,
"grad_norm": 1.21875,
"learning_rate": 0.00019337539699464018,
"loss": 4.6799,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16000
},
{
"epoch": 1.1694019927503234,
"grad_norm": 1.515625,
"learning_rate": 0.00019333436721127444,
"loss": 4.6869,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16050
},
{
"epoch": 1.1730450463578572,
"grad_norm": 2.875,
"learning_rate": 0.00019329321514009875,
"loss": 4.6809,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16100
},
{
"epoch": 1.176688099965391,
"grad_norm": 1.9296875,
"learning_rate": 0.00019325194083503103,
"loss": 4.6703,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16150
},
{
"epoch": 1.1803311535729248,
"grad_norm": 1.6875,
"learning_rate": 0.00019321054435014935,
"loss": 4.6802,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16200
},
{
"epoch": 1.1839742071804586,
"grad_norm": 3.03125,
"learning_rate": 0.0001931690257396919,
"loss": 4.689,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16250
},
{
"epoch": 1.1876172607879925,
"grad_norm": 1.515625,
"learning_rate": 0.00019312738505805691,
"loss": 4.6841,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16300
},
{
"epoch": 1.1912603143955263,
"grad_norm": 1.5546875,
"learning_rate": 0.00019308562235980243,
"loss": 4.684,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16350
},
{
"epoch": 1.19490336800306,
"grad_norm": 2.859375,
"learning_rate": 0.0001930437376996465,
"loss": 4.6814,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16400
},
{
"epoch": 1.1985464216105939,
"grad_norm": 2.046875,
"learning_rate": 0.00019300173113246687,
"loss": 4.6885,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16450
},
{
"epoch": 1.202189475218128,
"grad_norm": 1.4765625,
"learning_rate": 0.00019295960271330104,
"loss": 4.6973,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16500
},
{
"epoch": 1.2058325288256617,
"grad_norm": 2.609375,
"learning_rate": 0.00019291735249734622,
"loss": 4.6803,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16550
},
{
"epoch": 1.2094755824331955,
"grad_norm": 1.71875,
"learning_rate": 0.00019287498053995915,
"loss": 4.6959,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16600
},
{
"epoch": 1.2131186360407293,
"grad_norm": 1.6875,
"learning_rate": 0.000192832486896656,
"loss": 4.6853,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16650
},
{
"epoch": 1.2167616896482631,
"grad_norm": 1.28125,
"learning_rate": 0.00019278987162311254,
"loss": 4.6695,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16700
},
{
"epoch": 1.220404743255797,
"grad_norm": 1.578125,
"learning_rate": 0.00019274713477516377,
"loss": 4.6742,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16750
},
{
"epoch": 1.2240477968633308,
"grad_norm": 1.375,
"learning_rate": 0.00019270427640880405,
"loss": 4.6926,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16800
},
{
"epoch": 1.2276908504708648,
"grad_norm": 1.546875,
"learning_rate": 0.00019266129658018687,
"loss": 4.6798,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16850
},
{
"epoch": 1.2313339040783986,
"grad_norm": 1.4453125,
"learning_rate": 0.000192618195345625,
"loss": 4.6903,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16900
},
{
"epoch": 1.2349769576859324,
"grad_norm": 1.96875,
"learning_rate": 0.0001925749727615901,
"loss": 4.6728,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 16950
},
{
"epoch": 1.2386200112934662,
"grad_norm": 1.8515625,
"learning_rate": 0.00019253162888471304,
"loss": 4.6905,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17000
},
{
"epoch": 1.242263064901,
"grad_norm": 2.59375,
"learning_rate": 0.00019248816377178337,
"loss": 4.6912,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17050
},
{
"epoch": 1.2459061185085338,
"grad_norm": 1.2734375,
"learning_rate": 0.00019244457747974968,
"loss": 4.6748,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17100
},
{
"epoch": 1.2495491721160676,
"grad_norm": 1.609375,
"learning_rate": 0.00019240087006571922,
"loss": 4.6893,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17150
},
{
"epoch": 1.2531922257236014,
"grad_norm": 1.53125,
"learning_rate": 0.00019235704158695796,
"loss": 4.6905,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17200
},
{
"epoch": 1.2568352793311353,
"grad_norm": 3.3125,
"learning_rate": 0.00019231309210089053,
"loss": 4.6715,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17250
},
{
"epoch": 1.260478332938669,
"grad_norm": 1.4765625,
"learning_rate": 0.00019226902166510006,
"loss": 4.6798,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17300
},
{
"epoch": 1.264121386546203,
"grad_norm": 2.5625,
"learning_rate": 0.00019222483033732815,
"loss": 4.6959,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17350
},
{
"epoch": 1.267764440153737,
"grad_norm": 1.125,
"learning_rate": 0.00019218051817547483,
"loss": 4.6907,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17400
},
{
"epoch": 1.2714074937612707,
"grad_norm": 1.3984375,
"learning_rate": 0.00019213608523759842,
"loss": 4.6718,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17450
},
{
"epoch": 1.2750505473688045,
"grad_norm": 1.703125,
"learning_rate": 0.00019209153158191553,
"loss": 4.691,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17500
},
{
"epoch": 1.2786936009763383,
"grad_norm": 3.109375,
"learning_rate": 0.00019204685726680084,
"loss": 4.6792,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17550
},
{
"epoch": 1.2823366545838721,
"grad_norm": 2.75,
"learning_rate": 0.00019200206235078717,
"loss": 4.6572,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17600
},
{
"epoch": 1.2859797081914062,
"grad_norm": 1.5546875,
"learning_rate": 0.0001919571468925654,
"loss": 4.6803,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17650
},
{
"epoch": 1.28962276179894,
"grad_norm": 2.78125,
"learning_rate": 0.00019191211095098424,
"loss": 4.6733,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17700
},
{
"epoch": 1.2932658154064738,
"grad_norm": 1.546875,
"learning_rate": 0.0001918669545850504,
"loss": 4.6788,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17750
},
{
"epoch": 1.2969088690140076,
"grad_norm": 1.6015625,
"learning_rate": 0.00019182167785392827,
"loss": 4.6925,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17800
},
{
"epoch": 1.3005519226215414,
"grad_norm": 1.4921875,
"learning_rate": 0.00019177628081693993,
"loss": 4.683,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17850
},
{
"epoch": 1.3041949762290752,
"grad_norm": 1.8125,
"learning_rate": 0.00019173076353356513,
"loss": 4.6842,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17900
},
{
"epoch": 1.307838029836609,
"grad_norm": 2.34375,
"learning_rate": 0.0001916851260634412,
"loss": 4.6797,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 17950
},
{
"epoch": 1.3114810834441428,
"grad_norm": 1.6796875,
"learning_rate": 0.00019163936846636293,
"loss": 4.6547,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18000
},
{
"epoch": 1.3151241370516766,
"grad_norm": 1.1796875,
"learning_rate": 0.00019159349080228238,
"loss": 4.6845,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18050
},
{
"epoch": 1.3187671906592104,
"grad_norm": 1.1171875,
"learning_rate": 0.0001915474931313091,
"loss": 4.6925,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18100
},
{
"epoch": 1.3224102442667443,
"grad_norm": 3.71875,
"learning_rate": 0.0001915013755137097,
"loss": 4.6897,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18150
},
{
"epoch": 1.3260532978742783,
"grad_norm": 1.5625,
"learning_rate": 0.0001914551380099081,
"loss": 4.6832,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18200
},
{
"epoch": 1.329696351481812,
"grad_norm": 1.0859375,
"learning_rate": 0.00019140878068048527,
"loss": 4.6879,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18250
},
{
"epoch": 1.333339405089346,
"grad_norm": 1.90625,
"learning_rate": 0.00019136230358617908,
"loss": 4.6782,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18300
},
{
"epoch": 1.3369824586968797,
"grad_norm": 1.9375,
"learning_rate": 0.00019131570678788438,
"loss": 4.6916,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18350
},
{
"epoch": 1.3406255123044135,
"grad_norm": 1.671875,
"learning_rate": 0.00019126899034665286,
"loss": 4.6731,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18400
},
{
"epoch": 1.3442685659119473,
"grad_norm": 2.671875,
"learning_rate": 0.000191222154323693,
"loss": 4.6781,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18450
},
{
"epoch": 1.3479116195194814,
"grad_norm": 1.7109375,
"learning_rate": 0.00019117519878036988,
"loss": 4.678,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18500
},
{
"epoch": 1.3515546731270152,
"grad_norm": 2.109375,
"learning_rate": 0.0001911281237782052,
"loss": 4.696,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18550
},
{
"epoch": 1.355197726734549,
"grad_norm": 1.9453125,
"learning_rate": 0.00019108092937887726,
"loss": 4.6942,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18600
},
{
"epoch": 1.3588407803420828,
"grad_norm": 1.6484375,
"learning_rate": 0.00019103361564422067,
"loss": 4.6789,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18650
},
{
"epoch": 1.3624838339496166,
"grad_norm": 2.15625,
"learning_rate": 0.00019098618263622649,
"loss": 4.6765,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18700
},
{
"epoch": 1.3661268875571504,
"grad_norm": 2.015625,
"learning_rate": 0.000190938630417042,
"loss": 4.6836,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18750
},
{
"epoch": 1.3697699411646842,
"grad_norm": 1.453125,
"learning_rate": 0.0001908909590489707,
"loss": 4.6751,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18800
},
{
"epoch": 1.373412994772218,
"grad_norm": 1.2109375,
"learning_rate": 0.00019084316859447218,
"loss": 4.6856,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18850
},
{
"epoch": 1.3770560483797518,
"grad_norm": 1.4453125,
"learning_rate": 0.00019079525911616207,
"loss": 4.6787,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18900
},
{
"epoch": 1.3806991019872856,
"grad_norm": 1.609375,
"learning_rate": 0.00019074723067681192,
"loss": 4.6701,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 18950
},
{
"epoch": 1.3843421555948197,
"grad_norm": 1.3828125,
"learning_rate": 0.00019069908333934924,
"loss": 4.6798,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19000
},
{
"epoch": 1.3879852092023535,
"grad_norm": 1.53125,
"learning_rate": 0.00019065081716685718,
"loss": 4.6789,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19050
},
{
"epoch": 1.3916282628098873,
"grad_norm": 1.4765625,
"learning_rate": 0.00019060243222257468,
"loss": 4.6722,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19100
},
{
"epoch": 1.395271316417421,
"grad_norm": 2.5,
"learning_rate": 0.00019055392856989628,
"loss": 4.6803,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19150
},
{
"epoch": 1.398914370024955,
"grad_norm": 1.1796875,
"learning_rate": 0.00019050530627237207,
"loss": 4.6862,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19200
},
{
"epoch": 1.4025574236324887,
"grad_norm": 1.3984375,
"learning_rate": 0.0001904565653937075,
"loss": 4.6842,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19250
},
{
"epoch": 1.4062004772400225,
"grad_norm": 1.7734375,
"learning_rate": 0.00019040770599776355,
"loss": 4.6885,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19300
},
{
"epoch": 1.4098435308475565,
"grad_norm": 1.5078125,
"learning_rate": 0.0001903587281485563,
"loss": 4.6772,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19350
},
{
"epoch": 1.4134865844550903,
"grad_norm": 1.453125,
"learning_rate": 0.00019030963191025716,
"loss": 4.6779,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19400
},
{
"epoch": 1.4171296380626242,
"grad_norm": 1.2421875,
"learning_rate": 0.00019026041734719265,
"loss": 4.6701,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19450
},
{
"epoch": 1.420772691670158,
"grad_norm": 1.8125,
"learning_rate": 0.0001902110845238442,
"loss": 4.6867,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19500
},
{
"epoch": 1.4244157452776918,
"grad_norm": 1.2578125,
"learning_rate": 0.00019016163350484832,
"loss": 4.6724,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19550
},
{
"epoch": 1.4280587988852256,
"grad_norm": 1.578125,
"learning_rate": 0.00019011206435499633,
"loss": 4.6723,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19600
},
{
"epoch": 1.4317018524927594,
"grad_norm": 2.703125,
"learning_rate": 0.0001900623771392343,
"loss": 4.681,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19650
},
{
"epoch": 1.4353449061002932,
"grad_norm": 1.46875,
"learning_rate": 0.00019001257192266305,
"loss": 4.6846,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19700
},
{
"epoch": 1.438987959707827,
"grad_norm": 1.859375,
"learning_rate": 0.00018996264877053792,
"loss": 4.6883,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19750
},
{
"epoch": 1.4426310133153608,
"grad_norm": 1.4453125,
"learning_rate": 0.0001899126077482689,
"loss": 4.6778,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19800
},
{
"epoch": 1.4462740669228948,
"grad_norm": 3.78125,
"learning_rate": 0.00018986244892142025,
"loss": 4.6831,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19850
},
{
"epoch": 1.4499171205304286,
"grad_norm": 2.546875,
"learning_rate": 0.00018981217235571076,
"loss": 4.6752,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19900
},
{
"epoch": 1.4535601741379625,
"grad_norm": 1.234375,
"learning_rate": 0.00018976177811701336,
"loss": 4.6834,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 19950
},
{
"epoch": 1.4572032277454963,
"grad_norm": 1.1796875,
"learning_rate": 0.0001897112662713551,
"loss": 4.6962,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20000
},
{
"epoch": 1.46084628135303,
"grad_norm": 1.6171875,
"learning_rate": 0.00018966063688491736,
"loss": 4.6861,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20050
},
{
"epoch": 1.4644893349605639,
"grad_norm": 1.3984375,
"learning_rate": 0.00018960989002403525,
"loss": 4.684,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20100
},
{
"epoch": 1.468132388568098,
"grad_norm": 1.4296875,
"learning_rate": 0.00018955902575519797,
"loss": 4.671,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20150
},
{
"epoch": 1.4717754421756317,
"grad_norm": 1.6328125,
"learning_rate": 0.00018950804414504847,
"loss": 4.6864,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20200
},
{
"epoch": 1.4754184957831655,
"grad_norm": 1.8984375,
"learning_rate": 0.0001894569452603835,
"loss": 4.6884,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20250
},
{
"epoch": 1.4790615493906993,
"grad_norm": 1.1484375,
"learning_rate": 0.00018940572916815342,
"loss": 4.6838,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20300
},
{
"epoch": 1.4827046029982331,
"grad_norm": 1.65625,
"learning_rate": 0.00018935439593546218,
"loss": 4.6925,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20350
},
{
"epoch": 1.486347656605767,
"grad_norm": 1.4375,
"learning_rate": 0.0001893029456295672,
"loss": 4.6731,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20400
},
{
"epoch": 1.4899907102133008,
"grad_norm": 1.21875,
"learning_rate": 0.00018925137831787923,
"loss": 4.6744,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20450
},
{
"epoch": 1.4936337638208346,
"grad_norm": 1.7265625,
"learning_rate": 0.00018919969406796242,
"loss": 4.6821,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20500
},
{
"epoch": 1.4972768174283684,
"grad_norm": 1.71875,
"learning_rate": 0.00018914789294753414,
"loss": 4.6783,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20550
},
{
"epoch": 1.5009198710359022,
"grad_norm": 1.4375,
"learning_rate": 0.00018909597502446478,
"loss": 4.6858,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20600
},
{
"epoch": 1.504562924643436,
"grad_norm": 1.5703125,
"learning_rate": 0.00018904394036677787,
"loss": 4.6858,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20650
},
{
"epoch": 1.5082059782509698,
"grad_norm": 1.8125,
"learning_rate": 0.0001889917890426498,
"loss": 4.683,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20700
},
{
"epoch": 1.5118490318585038,
"grad_norm": 1.703125,
"learning_rate": 0.0001889395211204099,
"loss": 4.6717,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20750
},
{
"epoch": 1.5154920854660376,
"grad_norm": 1.5390625,
"learning_rate": 0.00018888713666854022,
"loss": 4.6756,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20800
},
{
"epoch": 1.5191351390735714,
"grad_norm": 1.3671875,
"learning_rate": 0.0001888346357556755,
"loss": 4.6734,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20850
},
{
"epoch": 1.5227781926811053,
"grad_norm": 1.4296875,
"learning_rate": 0.00018878201845060305,
"loss": 4.6727,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20900
},
{
"epoch": 1.5264212462886393,
"grad_norm": 1.4375,
"learning_rate": 0.00018872928482226273,
"loss": 4.672,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 20950
},
{
"epoch": 1.530064299896173,
"grad_norm": 2.234375,
"learning_rate": 0.00018867643493974674,
"loss": 4.6821,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21000
},
{
"epoch": 1.533707353503707,
"grad_norm": 1.984375,
"learning_rate": 0.00018862346887229961,
"loss": 4.6826,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21050
},
{
"epoch": 1.5373504071112407,
"grad_norm": 1.265625,
"learning_rate": 0.0001885703866893182,
"loss": 4.6786,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21100
},
{
"epoch": 1.5409934607187745,
"grad_norm": 2.390625,
"learning_rate": 0.00018851718846035133,
"loss": 4.6706,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21150
},
{
"epoch": 1.5446365143263083,
"grad_norm": 1.3203125,
"learning_rate": 0.00018846387425510005,
"loss": 4.6719,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21200
},
{
"epoch": 1.5482795679338421,
"grad_norm": 1.40625,
"learning_rate": 0.0001884104441434172,
"loss": 4.689,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21250
},
{
"epoch": 1.551922621541376,
"grad_norm": 2.125,
"learning_rate": 0.00018835689819530758,
"loss": 4.6893,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21300
},
{
"epoch": 1.5555656751489098,
"grad_norm": 1.75,
"learning_rate": 0.00018830323648092777,
"loss": 4.6814,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21350
},
{
"epoch": 1.5592087287564436,
"grad_norm": 1.25,
"learning_rate": 0.00018824945907058591,
"loss": 4.6849,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21400
},
{
"epoch": 1.5628517823639774,
"grad_norm": 1.8203125,
"learning_rate": 0.0001881955660347419,
"loss": 4.6842,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21450
},
{
"epoch": 1.5664948359715112,
"grad_norm": 1.3125,
"learning_rate": 0.00018814155744400698,
"loss": 4.6786,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21500
},
{
"epoch": 1.5701378895790452,
"grad_norm": 2.109375,
"learning_rate": 0.0001880874333691439,
"loss": 4.6883,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21550
},
{
"epoch": 1.573780943186579,
"grad_norm": 1.6953125,
"learning_rate": 0.0001880331938810666,
"loss": 4.6582,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21600
},
{
"epoch": 1.5774239967941128,
"grad_norm": 1.7890625,
"learning_rate": 0.0001879788390508404,
"loss": 4.6846,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21650
},
{
"epoch": 1.5810670504016466,
"grad_norm": 1.4765625,
"learning_rate": 0.00018792436894968164,
"loss": 4.6714,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21700
},
{
"epoch": 1.5847101040091804,
"grad_norm": 1.9140625,
"learning_rate": 0.00018786978364895761,
"loss": 4.6769,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21750
},
{
"epoch": 1.5883531576167145,
"grad_norm": 1.65625,
"learning_rate": 0.00018781508322018674,
"loss": 4.6844,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21800
},
{
"epoch": 1.5919962112242483,
"grad_norm": 1.28125,
"learning_rate": 0.0001877602677350381,
"loss": 4.6767,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21850
},
{
"epoch": 1.595639264831782,
"grad_norm": 1.3359375,
"learning_rate": 0.00018770533726533172,
"loss": 4.6668,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21900
},
{
"epoch": 1.599282318439316,
"grad_norm": 2.234375,
"learning_rate": 0.000187650291883038,
"loss": 4.6855,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 21950
},
{
"epoch": 1.6029253720468497,
"grad_norm": 1.4296875,
"learning_rate": 0.00018759513166027817,
"loss": 4.6807,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22000
},
{
"epoch": 1.6065684256543835,
"grad_norm": 1.609375,
"learning_rate": 0.0001875398566693238,
"loss": 4.672,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22050
},
{
"epoch": 1.6102114792619173,
"grad_norm": 1.296875,
"learning_rate": 0.0001874844669825968,
"loss": 4.689,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22100
},
{
"epoch": 1.6138545328694511,
"grad_norm": 1.1953125,
"learning_rate": 0.00018742896267266948,
"loss": 4.6654,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22150
},
{
"epoch": 1.617497586476985,
"grad_norm": 1.5703125,
"learning_rate": 0.00018737334381226418,
"loss": 4.6747,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22200
},
{
"epoch": 1.6211406400845187,
"grad_norm": 3.5625,
"learning_rate": 0.00018731761047425347,
"loss": 4.6828,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22250
},
{
"epoch": 1.6247836936920526,
"grad_norm": 1.7265625,
"learning_rate": 0.0001872617627316598,
"loss": 4.6878,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22300
},
{
"epoch": 1.6284267472995864,
"grad_norm": 1.25,
"learning_rate": 0.00018720580065765554,
"loss": 4.6709,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22350
},
{
"epoch": 1.6320698009071204,
"grad_norm": 1.3203125,
"learning_rate": 0.0001871497243255629,
"loss": 4.6742,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22400
},
{
"epoch": 1.6357128545146542,
"grad_norm": 1.40625,
"learning_rate": 0.0001870935338088537,
"loss": 4.6825,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22450
},
{
"epoch": 1.639355908122188,
"grad_norm": 1.5234375,
"learning_rate": 0.00018703722918114954,
"loss": 4.6736,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22500
},
{
"epoch": 1.6429989617297218,
"grad_norm": 3.515625,
"learning_rate": 0.00018698081051622136,
"loss": 4.6625,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22550
},
{
"epoch": 1.6466420153372558,
"grad_norm": 2.78125,
"learning_rate": 0.00018692427788798957,
"loss": 4.6777,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22600
},
{
"epoch": 1.6502850689447897,
"grad_norm": 1.234375,
"learning_rate": 0.0001868676313705239,
"loss": 4.6809,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22650
},
{
"epoch": 1.6539281225523235,
"grad_norm": 1.28125,
"learning_rate": 0.00018681087103804332,
"loss": 4.6657,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22700
},
{
"epoch": 1.6575711761598573,
"grad_norm": 1.5703125,
"learning_rate": 0.00018675399696491587,
"loss": 4.6701,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22750
},
{
"epoch": 1.661214229767391,
"grad_norm": 1.5390625,
"learning_rate": 0.00018669700922565864,
"loss": 4.6763,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22800
},
{
"epoch": 1.664857283374925,
"grad_norm": 1.71875,
"learning_rate": 0.0001866399078949377,
"loss": 4.6824,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22850
},
{
"epoch": 1.6685003369824587,
"grad_norm": 3.4375,
"learning_rate": 0.00018658269304756784,
"loss": 4.6686,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22900
},
{
"epoch": 1.6721433905899925,
"grad_norm": 1.7109375,
"learning_rate": 0.00018652536475851272,
"loss": 4.6808,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 22950
},
{
"epoch": 1.6757864441975263,
"grad_norm": 1.59375,
"learning_rate": 0.00018646792310288447,
"loss": 4.687,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23000
},
{
"epoch": 1.6794294978050601,
"grad_norm": 1.8984375,
"learning_rate": 0.00018641036815594388,
"loss": 4.6667,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23050
},
{
"epoch": 1.683072551412594,
"grad_norm": 1.546875,
"learning_rate": 0.0001863526999931001,
"loss": 4.678,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23100
},
{
"epoch": 1.6867156050201277,
"grad_norm": 2.609375,
"learning_rate": 0.00018629491868991073,
"loss": 4.6742,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23150
},
{
"epoch": 1.6903586586276615,
"grad_norm": 3.15625,
"learning_rate": 0.00018623702432208144,
"loss": 4.6866,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23200
},
{
"epoch": 1.6940017122351956,
"grad_norm": 2.390625,
"learning_rate": 0.00018617901696546616,
"loss": 4.6763,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23250
},
{
"epoch": 1.6976447658427294,
"grad_norm": 1.5234375,
"learning_rate": 0.00018612089669606683,
"loss": 4.6808,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23300
},
{
"epoch": 1.7012878194502632,
"grad_norm": 1.5859375,
"learning_rate": 0.00018606266359003331,
"loss": 4.6682,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23350
},
{
"epoch": 1.704930873057797,
"grad_norm": 1.9296875,
"learning_rate": 0.00018600431772366335,
"loss": 4.6846,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23400
},
{
"epoch": 1.708573926665331,
"grad_norm": 3.3125,
"learning_rate": 0.00018594585917340237,
"loss": 4.6857,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23450
},
{
"epoch": 1.7122169802728648,
"grad_norm": 1.796875,
"learning_rate": 0.00018588728801584347,
"loss": 4.6678,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23500
},
{
"epoch": 1.7158600338803986,
"grad_norm": 2.015625,
"learning_rate": 0.00018582860432772732,
"loss": 4.6897,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23550
},
{
"epoch": 1.7195030874879325,
"grad_norm": 1.4453125,
"learning_rate": 0.00018576980818594193,
"loss": 4.6843,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23600
},
{
"epoch": 1.7231461410954663,
"grad_norm": 1.390625,
"learning_rate": 0.00018571089966752278,
"loss": 4.6731,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23650
},
{
"epoch": 1.726789194703,
"grad_norm": 1.46875,
"learning_rate": 0.00018565187884965248,
"loss": 4.6805,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23700
},
{
"epoch": 1.7304322483105339,
"grad_norm": 1.5,
"learning_rate": 0.00018559274580966082,
"loss": 4.6872,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23750
},
{
"epoch": 1.7340753019180677,
"grad_norm": 1.3359375,
"learning_rate": 0.0001855335006250246,
"loss": 4.6871,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23800
},
{
"epoch": 1.7377183555256015,
"grad_norm": 2.453125,
"learning_rate": 0.00018547414337336763,
"loss": 4.6775,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23850
},
{
"epoch": 1.7413614091331353,
"grad_norm": 1.484375,
"learning_rate": 0.00018541467413246045,
"loss": 4.6795,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23900
},
{
"epoch": 1.7450044627406691,
"grad_norm": 1.6484375,
"learning_rate": 0.00018535509298022037,
"loss": 4.6936,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 23950
},
{
"epoch": 1.748647516348203,
"grad_norm": 2.484375,
"learning_rate": 0.00018529539999471138,
"loss": 4.6869,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24000
},
{
"epoch": 1.752290569955737,
"grad_norm": 1.703125,
"learning_rate": 0.0001852355952541439,
"loss": 4.6812,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24050
},
{
"epoch": 1.7559336235632708,
"grad_norm": 1.9453125,
"learning_rate": 0.0001851756788368748,
"loss": 4.6919,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24100
},
{
"epoch": 1.7595766771708046,
"grad_norm": 1.90625,
"learning_rate": 0.00018511565082140736,
"loss": 4.6749,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24150
},
{
"epoch": 1.7632197307783384,
"grad_norm": 2.859375,
"learning_rate": 0.00018505551128639096,
"loss": 4.663,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24200
},
{
"epoch": 1.7668627843858722,
"grad_norm": 1.515625,
"learning_rate": 0.00018499526031062115,
"loss": 4.6771,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24250
},
{
"epoch": 1.7705058379934062,
"grad_norm": 2.515625,
"learning_rate": 0.0001849348979730395,
"loss": 4.672,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24300
},
{
"epoch": 1.77414889160094,
"grad_norm": 1.390625,
"learning_rate": 0.00018487442435273345,
"loss": 4.6793,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24350
},
{
"epoch": 1.7777919452084738,
"grad_norm": 2.109375,
"learning_rate": 0.00018481383952893629,
"loss": 4.6899,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24400
},
{
"epoch": 1.7814349988160076,
"grad_norm": 2.4375,
"learning_rate": 0.00018475314358102693,
"loss": 4.6881,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24450
},
{
"epoch": 1.7850780524235415,
"grad_norm": 1.890625,
"learning_rate": 0.00018469233658852997,
"loss": 4.6776,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24500
},
{
"epoch": 1.7887211060310753,
"grad_norm": 1.34375,
"learning_rate": 0.0001846314186311155,
"loss": 4.6622,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24550
},
{
"epoch": 1.792364159638609,
"grad_norm": 1.359375,
"learning_rate": 0.00018457038978859886,
"loss": 4.6624,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24600
},
{
"epoch": 1.7960072132461429,
"grad_norm": 1.5703125,
"learning_rate": 0.00018450925014094086,
"loss": 4.6705,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24650
},
{
"epoch": 1.7996502668536767,
"grad_norm": 2.578125,
"learning_rate": 0.00018444799976824737,
"loss": 4.6764,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24700
},
{
"epoch": 1.8032933204612105,
"grad_norm": 1.046875,
"learning_rate": 0.00018438663875076938,
"loss": 4.6814,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24750
},
{
"epoch": 1.8069363740687443,
"grad_norm": 1.859375,
"learning_rate": 0.0001843251671689028,
"loss": 4.6732,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24800
},
{
"epoch": 1.810579427676278,
"grad_norm": 1.5625,
"learning_rate": 0.0001842635851031885,
"loss": 4.6754,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24850
},
{
"epoch": 1.8142224812838121,
"grad_norm": 1.34375,
"learning_rate": 0.00018420189263431195,
"loss": 4.6738,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24900
},
{
"epoch": 1.817865534891346,
"grad_norm": 1.71875,
"learning_rate": 0.00018414008984310345,
"loss": 4.6806,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 24950
},
{
"epoch": 1.8215085884988798,
"grad_norm": 2.234375,
"learning_rate": 0.00018407817681053768,
"loss": 4.6575,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25000
},
{
"epoch": 1.8251516421064136,
"grad_norm": 1.59375,
"learning_rate": 0.0001840161536177339,
"loss": 4.6662,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25050
},
{
"epoch": 1.8287946957139476,
"grad_norm": 1.796875,
"learning_rate": 0.00018395402034595567,
"loss": 4.6556,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25100
},
{
"epoch": 1.8324377493214814,
"grad_norm": 2.078125,
"learning_rate": 0.00018389177707661065,
"loss": 4.6795,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25150
},
{
"epoch": 1.8360808029290152,
"grad_norm": 1.6953125,
"learning_rate": 0.00018382942389125079,
"loss": 4.6796,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25200
},
{
"epoch": 1.839723856536549,
"grad_norm": 1.8046875,
"learning_rate": 0.00018376696087157198,
"loss": 4.6607,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25250
},
{
"epoch": 1.8433669101440828,
"grad_norm": 1.5703125,
"learning_rate": 0.00018370438809941402,
"loss": 4.681,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25300
},
{
"epoch": 1.8470099637516166,
"grad_norm": 1.7734375,
"learning_rate": 0.0001836417056567605,
"loss": 4.6719,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25350
},
{
"epoch": 1.8506530173591504,
"grad_norm": 1.234375,
"learning_rate": 0.00018357891362573864,
"loss": 4.6733,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25400
},
{
"epoch": 1.8542960709666843,
"grad_norm": 1.7265625,
"learning_rate": 0.00018351601208861944,
"loss": 4.682,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25450
},
{
"epoch": 1.857939124574218,
"grad_norm": 1.6953125,
"learning_rate": 0.0001834530011278172,
"loss": 4.7023,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25500
},
{
"epoch": 1.8615821781817519,
"grad_norm": 1.390625,
"learning_rate": 0.00018338988082588958,
"loss": 4.6791,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25550
},
{
"epoch": 1.8652252317892857,
"grad_norm": 2.359375,
"learning_rate": 0.0001833266512655376,
"loss": 4.6788,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25600
},
{
"epoch": 1.8688682853968195,
"grad_norm": 1.8828125,
"learning_rate": 0.00018326331252960544,
"loss": 4.6757,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25650
},
{
"epoch": 1.8725113390043533,
"grad_norm": 2.75,
"learning_rate": 0.00018319986470108023,
"loss": 4.6823,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25700
},
{
"epoch": 1.8761543926118873,
"grad_norm": 1.3359375,
"learning_rate": 0.00018313630786309206,
"loss": 4.6772,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25750
},
{
"epoch": 1.8797974462194211,
"grad_norm": 1.390625,
"learning_rate": 0.00018307264209891389,
"loss": 4.6804,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25800
},
{
"epoch": 1.883440499826955,
"grad_norm": 1.328125,
"learning_rate": 0.00018300886749196134,
"loss": 4.685,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25850
},
{
"epoch": 1.8870835534344887,
"grad_norm": 1.5703125,
"learning_rate": 0.0001829449841257927,
"loss": 4.6618,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25900
},
{
"epoch": 1.8907266070420228,
"grad_norm": 1.671875,
"learning_rate": 0.00018288099208410872,
"loss": 4.6782,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 25950
},
{
"epoch": 1.8943696606495566,
"grad_norm": 1.6015625,
"learning_rate": 0.00018281689145075252,
"loss": 4.672,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26000
},
{
"epoch": 1.8980127142570904,
"grad_norm": 2.15625,
"learning_rate": 0.00018275268230970955,
"loss": 4.6855,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26050
},
{
"epoch": 1.9016557678646242,
"grad_norm": 1.8046875,
"learning_rate": 0.00018268836474510738,
"loss": 4.6776,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26100
},
{
"epoch": 1.905298821472158,
"grad_norm": 1.7265625,
"learning_rate": 0.00018262393884121566,
"loss": 4.6828,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26150
},
{
"epoch": 1.9089418750796918,
"grad_norm": 3.078125,
"learning_rate": 0.00018255940468244597,
"loss": 4.6805,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26200
},
{
"epoch": 1.9125849286872256,
"grad_norm": 3.125,
"learning_rate": 0.00018249476235335176,
"loss": 4.6794,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26250
},
{
"epoch": 1.9162279822947594,
"grad_norm": 1.875,
"learning_rate": 0.0001824300119386282,
"loss": 4.695,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26300
},
{
"epoch": 1.9198710359022932,
"grad_norm": 1.96875,
"learning_rate": 0.000182365153523112,
"loss": 4.6675,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26350
},
{
"epoch": 1.923514089509827,
"grad_norm": 2.390625,
"learning_rate": 0.0001823001871917815,
"loss": 4.6864,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26400
},
{
"epoch": 1.9271571431173609,
"grad_norm": 1.5390625,
"learning_rate": 0.00018223511302975636,
"loss": 4.6744,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26450
},
{
"epoch": 1.9308001967248947,
"grad_norm": 1.8828125,
"learning_rate": 0.00018216993112229747,
"loss": 4.6792,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26500
},
{
"epoch": 1.9344432503324285,
"grad_norm": 2.890625,
"learning_rate": 0.00018210464155480707,
"loss": 4.681,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26550
},
{
"epoch": 1.9380863039399625,
"grad_norm": 1.3359375,
"learning_rate": 0.00018203924441282822,
"loss": 4.6741,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26600
},
{
"epoch": 1.9417293575474963,
"grad_norm": 1.4921875,
"learning_rate": 0.00018197373978204507,
"loss": 4.6715,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26650
},
{
"epoch": 1.9453724111550301,
"grad_norm": 2.25,
"learning_rate": 0.0001819081277482826,
"loss": 4.6725,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26700
},
{
"epoch": 1.949015464762564,
"grad_norm": 2.109375,
"learning_rate": 0.00018184240839750647,
"loss": 4.669,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26750
},
{
"epoch": 1.952658518370098,
"grad_norm": 2.5625,
"learning_rate": 0.00018177658181582295,
"loss": 4.6894,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26800
},
{
"epoch": 1.9563015719776318,
"grad_norm": 1.8828125,
"learning_rate": 0.00018171064808947883,
"loss": 4.679,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26850
},
{
"epoch": 1.9599446255851656,
"grad_norm": 1.546875,
"learning_rate": 0.0001816446073048613,
"loss": 4.666,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26900
},
{
"epoch": 1.9635876791926994,
"grad_norm": 1.2578125,
"learning_rate": 0.00018157845954849778,
"loss": 4.6769,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 26950
},
{
"epoch": 1.9672307328002332,
"grad_norm": 1.1875,
"learning_rate": 0.0001815122049070558,
"loss": 4.673,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27000
},
{
"epoch": 1.970873786407767,
"grad_norm": 1.7421875,
"learning_rate": 0.00018144584346734308,
"loss": 4.6832,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27050
},
{
"epoch": 1.9745168400153008,
"grad_norm": 1.828125,
"learning_rate": 0.00018137937531630707,
"loss": 4.6632,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27100
},
{
"epoch": 1.9781598936228346,
"grad_norm": 1.1640625,
"learning_rate": 0.00018131280054103522,
"loss": 4.6781,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27150
},
{
"epoch": 1.9818029472303684,
"grad_norm": 1.34375,
"learning_rate": 0.0001812461192287546,
"loss": 4.6722,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27200
},
{
"epoch": 1.9854460008379022,
"grad_norm": 2.34375,
"learning_rate": 0.0001811793314668318,
"loss": 4.6739,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27250
},
{
"epoch": 1.989089054445436,
"grad_norm": 2.703125,
"learning_rate": 0.000181112437342773,
"loss": 4.6668,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27300
},
{
"epoch": 1.9927321080529699,
"grad_norm": 2.234375,
"learning_rate": 0.00018104543694422368,
"loss": 4.6842,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27350
},
{
"epoch": 1.9963751616605039,
"grad_norm": 3.28125,
"learning_rate": 0.00018097833035896858,
"loss": 4.667,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27400
},
{
"epoch": 1.9998724931237364,
"eval_loss": 4.683139801025391,
"eval_runtime": 582.0603,
"eval_samples_per_second": 521.128,
"eval_steps_per_second": 43.428,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27448
},
{
"epoch": 2.0,
"grad_norm": 1.53125,
"learning_rate": 0.00018091111767493153,
"loss": 4.6719,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27450
},
{
"epoch": 2.003643053607534,
"grad_norm": 1.5625,
"learning_rate": 0.00018084379898017543,
"loss": 4.6549,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27500
},
{
"epoch": 2.0072861072150676,
"grad_norm": 2.015625,
"learning_rate": 0.00018077637436290198,
"loss": 4.67,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27550
},
{
"epoch": 2.0109291608226014,
"grad_norm": 1.359375,
"learning_rate": 0.0001807088439114518,
"loss": 4.6681,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27600
},
{
"epoch": 2.0145722144301352,
"grad_norm": 2.140625,
"learning_rate": 0.00018064120771430403,
"loss": 4.6667,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27650
},
{
"epoch": 2.018215268037669,
"grad_norm": 2.15625,
"learning_rate": 0.0001805734658600765,
"loss": 4.6603,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27700
},
{
"epoch": 2.021858321645203,
"grad_norm": 2.203125,
"learning_rate": 0.00018050561843752533,
"loss": 4.6783,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27750
},
{
"epoch": 2.0255013752527367,
"grad_norm": 1.796875,
"learning_rate": 0.00018043766553554506,
"loss": 4.6753,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27800
},
{
"epoch": 2.0291444288602705,
"grad_norm": 1.609375,
"learning_rate": 0.00018036960724316842,
"loss": 4.6606,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27850
},
{
"epoch": 2.0327874824678047,
"grad_norm": 1.9609375,
"learning_rate": 0.00018030144364956615,
"loss": 4.6776,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27900
},
{
"epoch": 2.0364305360753385,
"grad_norm": 1.5703125,
"learning_rate": 0.00018023317484404708,
"loss": 4.6566,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 27950
},
{
"epoch": 2.0400735896828723,
"grad_norm": 2.1875,
"learning_rate": 0.00018016480091605778,
"loss": 4.66,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28000
},
{
"epoch": 2.043716643290406,
"grad_norm": 1.921875,
"learning_rate": 0.0001800963219551826,
"loss": 4.6631,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28050
},
{
"epoch": 2.04735969689794,
"grad_norm": 2.109375,
"learning_rate": 0.00018002773805114348,
"loss": 4.6639,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28100
},
{
"epoch": 2.0510027505054738,
"grad_norm": 1.3984375,
"learning_rate": 0.00017995904929379988,
"loss": 4.6769,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28150
},
{
"epoch": 2.0546458041130076,
"grad_norm": 1.46875,
"learning_rate": 0.00017989025577314866,
"loss": 4.6637,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28200
},
{
"epoch": 2.0582888577205414,
"grad_norm": 1.203125,
"learning_rate": 0.0001798213575793239,
"loss": 4.665,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28250
},
{
"epoch": 2.061931911328075,
"grad_norm": 1.4765625,
"learning_rate": 0.00017975235480259684,
"loss": 4.656,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28300
},
{
"epoch": 2.065574964935609,
"grad_norm": 1.796875,
"learning_rate": 0.00017968324753337575,
"loss": 4.6532,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28350
},
{
"epoch": 2.069218018543143,
"grad_norm": 1.796875,
"learning_rate": 0.00017961403586220582,
"loss": 4.6784,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28400
},
{
"epoch": 2.0728610721506766,
"grad_norm": 3.328125,
"learning_rate": 0.000179544719879769,
"loss": 4.6751,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28450
},
{
"epoch": 2.0765041257582104,
"grad_norm": 1.3828125,
"learning_rate": 0.0001794752996768839,
"loss": 4.6772,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28500
},
{
"epoch": 2.0801471793657442,
"grad_norm": 2.0,
"learning_rate": 0.00017940577534450574,
"loss": 4.6652,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28550
},
{
"epoch": 2.083790232973278,
"grad_norm": 1.3671875,
"learning_rate": 0.00017933614697372615,
"loss": 4.6699,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28600
},
{
"epoch": 2.087433286580812,
"grad_norm": 1.3671875,
"learning_rate": 0.00017926641465577302,
"loss": 4.6686,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28650
},
{
"epoch": 2.091076340188346,
"grad_norm": 2.28125,
"learning_rate": 0.00017919657848201046,
"loss": 4.6531,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28700
},
{
"epoch": 2.09471939379588,
"grad_norm": 1.828125,
"learning_rate": 0.0001791266385439387,
"loss": 4.6573,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28750
},
{
"epoch": 2.0983624474034137,
"grad_norm": 1.46875,
"learning_rate": 0.00017905659493319388,
"loss": 4.6785,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28800
},
{
"epoch": 2.1020055010109475,
"grad_norm": 1.6171875,
"learning_rate": 0.00017898644774154798,
"loss": 4.6767,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28850
},
{
"epoch": 2.1056485546184813,
"grad_norm": 1.46875,
"learning_rate": 0.0001789161970609087,
"loss": 4.6656,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28900
},
{
"epoch": 2.109291608226015,
"grad_norm": 1.421875,
"learning_rate": 0.0001788458429833193,
"loss": 4.6714,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 28950
},
{
"epoch": 2.112934661833549,
"grad_norm": 1.640625,
"learning_rate": 0.0001787753856009586,
"loss": 4.6631,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29000
},
{
"epoch": 2.1165777154410828,
"grad_norm": 1.8828125,
"learning_rate": 0.0001787048250061406,
"loss": 4.6646,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29050
},
{
"epoch": 2.1202207690486166,
"grad_norm": 1.5234375,
"learning_rate": 0.00017863416129131475,
"loss": 4.6786,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29100
},
{
"epoch": 2.1238638226561504,
"grad_norm": 1.9765625,
"learning_rate": 0.00017856339454906542,
"loss": 4.6569,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29150
},
{
"epoch": 2.127506876263684,
"grad_norm": 1.96875,
"learning_rate": 0.0001784925248721121,
"loss": 4.6741,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29200
},
{
"epoch": 2.131149929871218,
"grad_norm": 1.9453125,
"learning_rate": 0.00017842155235330904,
"loss": 4.6723,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29250
},
{
"epoch": 2.134792983478752,
"grad_norm": 2.015625,
"learning_rate": 0.00017835047708564537,
"loss": 4.6765,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29300
},
{
"epoch": 2.1384360370862856,
"grad_norm": 2.328125,
"learning_rate": 0.0001782792991622447,
"loss": 4.6637,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29350
},
{
"epoch": 2.1420790906938194,
"grad_norm": 1.5,
"learning_rate": 0.00017820801867636518,
"loss": 4.6592,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29400
},
{
"epoch": 2.145722144301353,
"grad_norm": 2.46875,
"learning_rate": 0.0001781366357213994,
"loss": 4.6823,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29450
},
{
"epoch": 2.149365197908887,
"grad_norm": 2.96875,
"learning_rate": 0.00017806515039087416,
"loss": 4.6764,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29500
},
{
"epoch": 2.153008251516421,
"grad_norm": 1.4765625,
"learning_rate": 0.0001779935627784504,
"loss": 4.663,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29550
},
{
"epoch": 2.156651305123955,
"grad_norm": 1.40625,
"learning_rate": 0.00017792187297792307,
"loss": 4.6608,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29600
},
{
"epoch": 2.160294358731489,
"grad_norm": 1.3984375,
"learning_rate": 0.00017785008108322102,
"loss": 4.6793,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29650
},
{
"epoch": 2.1639374123390227,
"grad_norm": 1.3046875,
"learning_rate": 0.0001777781871884068,
"loss": 4.6803,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29700
},
{
"epoch": 2.1675804659465565,
"grad_norm": 2.71875,
"learning_rate": 0.0001777061913876767,
"loss": 4.6578,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29750
},
{
"epoch": 2.1712235195540903,
"grad_norm": 1.8046875,
"learning_rate": 0.0001776340937753605,
"loss": 4.6784,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29800
},
{
"epoch": 2.174866573161624,
"grad_norm": 3.078125,
"learning_rate": 0.00017756189444592132,
"loss": 4.6732,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29850
},
{
"epoch": 2.178509626769158,
"grad_norm": 1.78125,
"learning_rate": 0.0001774895934939556,
"loss": 4.6417,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29900
},
{
"epoch": 2.1821526803766917,
"grad_norm": 2.125,
"learning_rate": 0.00017741719101419293,
"loss": 4.6829,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 29950
},
{
"epoch": 2.1857957339842256,
"grad_norm": 1.8671875,
"learning_rate": 0.00017734468710149589,
"loss": 4.6706,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30000
},
{
"epoch": 2.1894387875917594,
"grad_norm": 1.2109375,
"learning_rate": 0.00017727208185086,
"loss": 4.671,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30050
},
{
"epoch": 2.193081841199293,
"grad_norm": 1.984375,
"learning_rate": 0.00017719937535741354,
"loss": 4.6667,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30100
},
{
"epoch": 2.196724894806827,
"grad_norm": 1.515625,
"learning_rate": 0.00017712656771641739,
"loss": 4.6823,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30150
},
{
"epoch": 2.200367948414361,
"grad_norm": 3.171875,
"learning_rate": 0.00017705365902326498,
"loss": 4.6786,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30200
},
{
"epoch": 2.2040110020218946,
"grad_norm": 1.3046875,
"learning_rate": 0.00017698064937348224,
"loss": 4.6732,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30250
},
{
"epoch": 2.2076540556294284,
"grad_norm": 1.6640625,
"learning_rate": 0.00017690753886272727,
"loss": 4.6691,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30300
},
{
"epoch": 2.2112971092369627,
"grad_norm": 2.15625,
"learning_rate": 0.00017683432758679025,
"loss": 4.688,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30350
},
{
"epoch": 2.2149401628444965,
"grad_norm": 1.4296875,
"learning_rate": 0.00017676101564159357,
"loss": 4.6638,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30400
},
{
"epoch": 2.2185832164520303,
"grad_norm": 2.703125,
"learning_rate": 0.00017668760312319142,
"loss": 4.6481,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30450
},
{
"epoch": 2.222226270059564,
"grad_norm": 1.96875,
"learning_rate": 0.0001766140901277697,
"loss": 4.6795,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30500
},
{
"epoch": 2.225869323667098,
"grad_norm": 3.125,
"learning_rate": 0.0001765404767516461,
"loss": 4.6823,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30550
},
{
"epoch": 2.2295123772746317,
"grad_norm": 1.40625,
"learning_rate": 0.00017646676309126966,
"loss": 4.6588,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30600
},
{
"epoch": 2.2331554308821655,
"grad_norm": 1.4609375,
"learning_rate": 0.000176392949243221,
"loss": 4.6726,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30650
},
{
"epoch": 2.2367984844896993,
"grad_norm": 1.6328125,
"learning_rate": 0.00017631903530421181,
"loss": 4.66,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30700
},
{
"epoch": 2.240441538097233,
"grad_norm": 1.3203125,
"learning_rate": 0.00017624502137108508,
"loss": 4.6832,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30750
},
{
"epoch": 2.244084591704767,
"grad_norm": 1.6328125,
"learning_rate": 0.00017617090754081476,
"loss": 4.6839,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30800
},
{
"epoch": 2.2477276453123007,
"grad_norm": 1.5078125,
"learning_rate": 0.00017609669391050568,
"loss": 4.658,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30850
},
{
"epoch": 2.2513706989198345,
"grad_norm": 1.5703125,
"learning_rate": 0.00017602238057739343,
"loss": 4.6806,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30900
},
{
"epoch": 2.2550137525273684,
"grad_norm": 1.2890625,
"learning_rate": 0.0001759479676388442,
"loss": 4.6706,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 30950
},
{
"epoch": 2.258656806134902,
"grad_norm": 1.1640625,
"learning_rate": 0.00017587345519235473,
"loss": 4.6698,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31000
},
{
"epoch": 2.262299859742436,
"grad_norm": 1.484375,
"learning_rate": 0.00017579884333555215,
"loss": 4.6699,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31050
},
{
"epoch": 2.26594291334997,
"grad_norm": 2.578125,
"learning_rate": 0.0001757241321661938,
"loss": 4.672,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31100
},
{
"epoch": 2.2695859669575036,
"grad_norm": 1.6796875,
"learning_rate": 0.00017564932178216713,
"loss": 4.6814,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31150
},
{
"epoch": 2.2732290205650374,
"grad_norm": 1.5234375,
"learning_rate": 0.0001755744122814896,
"loss": 4.6668,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31200
},
{
"epoch": 2.276872074172571,
"grad_norm": 2.5625,
"learning_rate": 0.0001754994037623086,
"loss": 4.6684,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31250
},
{
"epoch": 2.2805151277801055,
"grad_norm": 1.4921875,
"learning_rate": 0.00017542429632290115,
"loss": 4.6533,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31300
},
{
"epoch": 2.2841581813876393,
"grad_norm": 1.46875,
"learning_rate": 0.00017534909006167393,
"loss": 4.6619,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31350
},
{
"epoch": 2.287801234995173,
"grad_norm": 1.5390625,
"learning_rate": 0.00017527378507716304,
"loss": 4.6627,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31400
},
{
"epoch": 2.291444288602707,
"grad_norm": 1.359375,
"learning_rate": 0.00017519838146803405,
"loss": 4.6611,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31450
},
{
"epoch": 2.2950873422102407,
"grad_norm": 1.6484375,
"learning_rate": 0.00017512287933308166,
"loss": 4.6721,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31500
},
{
"epoch": 2.2987303958177745,
"grad_norm": 2.234375,
"learning_rate": 0.0001750472787712296,
"loss": 4.6755,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31550
},
{
"epoch": 2.3023734494253083,
"grad_norm": 1.3671875,
"learning_rate": 0.00017497157988153074,
"loss": 4.673,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31600
},
{
"epoch": 2.306016503032842,
"grad_norm": 2.546875,
"learning_rate": 0.0001748957827631666,
"loss": 4.6731,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31650
},
{
"epoch": 2.309659556640376,
"grad_norm": 1.3359375,
"learning_rate": 0.00017481988751544752,
"loss": 4.6532,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31700
},
{
"epoch": 2.3133026102479097,
"grad_norm": 1.53125,
"learning_rate": 0.00017474389423781234,
"loss": 4.6598,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31750
},
{
"epoch": 2.3169456638554435,
"grad_norm": 1.546875,
"learning_rate": 0.00017466780302982836,
"loss": 4.6708,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31800
},
{
"epoch": 2.3205887174629773,
"grad_norm": 1.28125,
"learning_rate": 0.00017459161399119123,
"loss": 4.6808,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31850
},
{
"epoch": 2.324231771070511,
"grad_norm": 1.6328125,
"learning_rate": 0.0001745153272217247,
"loss": 4.677,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31900
},
{
"epoch": 2.327874824678045,
"grad_norm": 1.46875,
"learning_rate": 0.00017443894282138066,
"loss": 4.6683,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 31950
},
{
"epoch": 2.331517878285579,
"grad_norm": 1.9375,
"learning_rate": 0.00017436246089023888,
"loss": 4.6764,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32000
},
{
"epoch": 2.335160931893113,
"grad_norm": 1.6328125,
"learning_rate": 0.0001742858815285068,
"loss": 4.6805,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32050
},
{
"epoch": 2.338803985500647,
"grad_norm": 2.328125,
"learning_rate": 0.00017420920483651972,
"loss": 4.6645,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32100
},
{
"epoch": 2.3424470391081806,
"grad_norm": 1.6640625,
"learning_rate": 0.00017413243091474035,
"loss": 4.6703,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32150
},
{
"epoch": 2.3460900927157144,
"grad_norm": 2.1875,
"learning_rate": 0.00017405555986375881,
"loss": 4.6626,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32200
},
{
"epoch": 2.3497331463232483,
"grad_norm": 1.5625,
"learning_rate": 0.00017397859178429242,
"loss": 4.675,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32250
},
{
"epoch": 2.353376199930782,
"grad_norm": 1.8203125,
"learning_rate": 0.00017390152677718574,
"loss": 4.6866,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32300
},
{
"epoch": 2.357019253538316,
"grad_norm": 1.7421875,
"learning_rate": 0.00017382436494341023,
"loss": 4.6791,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32350
},
{
"epoch": 2.3606623071458497,
"grad_norm": 1.6796875,
"learning_rate": 0.0001737471063840643,
"loss": 4.6648,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32400
},
{
"epoch": 2.3643053607533835,
"grad_norm": 1.53125,
"learning_rate": 0.00017366975120037298,
"loss": 4.6664,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32450
},
{
"epoch": 2.3679484143609173,
"grad_norm": 1.8984375,
"learning_rate": 0.00017359229949368802,
"loss": 4.6722,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32500
},
{
"epoch": 2.371591467968451,
"grad_norm": 1.21875,
"learning_rate": 0.0001735147513654875,
"loss": 4.6649,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32550
},
{
"epoch": 2.375234521575985,
"grad_norm": 1.5390625,
"learning_rate": 0.00017343710691737602,
"loss": 4.6788,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32600
},
{
"epoch": 2.3788775751835187,
"grad_norm": 1.9609375,
"learning_rate": 0.00017335936625108422,
"loss": 4.664,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32650
},
{
"epoch": 2.3825206287910525,
"grad_norm": 1.5078125,
"learning_rate": 0.0001732815294684688,
"loss": 4.6543,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32700
},
{
"epoch": 2.3861636823985863,
"grad_norm": 1.1875,
"learning_rate": 0.00017320359667151252,
"loss": 4.6802,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32750
},
{
"epoch": 2.38980673600612,
"grad_norm": 1.78125,
"learning_rate": 0.00017312556796232387,
"loss": 4.664,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32800
},
{
"epoch": 2.393449789613654,
"grad_norm": 2.3125,
"learning_rate": 0.00017304744344313693,
"loss": 4.6545,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32850
},
{
"epoch": 2.3970928432211878,
"grad_norm": 1.21875,
"learning_rate": 0.00017296922321631146,
"loss": 4.6742,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32900
},
{
"epoch": 2.400735896828722,
"grad_norm": 1.5546875,
"learning_rate": 0.0001728909073843325,
"loss": 4.674,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 32950
},
{
"epoch": 2.404378950436256,
"grad_norm": 1.515625,
"learning_rate": 0.0001728124960498104,
"loss": 4.6781,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33000
},
{
"epoch": 2.4080220040437896,
"grad_norm": 2.171875,
"learning_rate": 0.00017273398931548064,
"loss": 4.6719,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33050
},
{
"epoch": 2.4116650576513234,
"grad_norm": 2.359375,
"learning_rate": 0.0001726553872842037,
"loss": 4.6721,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33100
},
{
"epoch": 2.4153081112588572,
"grad_norm": 1.6484375,
"learning_rate": 0.00017257669005896485,
"loss": 4.6609,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33150
},
{
"epoch": 2.418951164866391,
"grad_norm": 1.2265625,
"learning_rate": 0.00017249789774287423,
"loss": 4.67,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33200
},
{
"epoch": 2.422594218473925,
"grad_norm": 2.3125,
"learning_rate": 0.00017241901043916645,
"loss": 4.6684,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33250
},
{
"epoch": 2.4262372720814587,
"grad_norm": 2.390625,
"learning_rate": 0.00017234002825120053,
"loss": 4.6529,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33300
},
{
"epoch": 2.4298803256889925,
"grad_norm": 1.1796875,
"learning_rate": 0.00017226095128245998,
"loss": 4.6753,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33350
},
{
"epoch": 2.4335233792965263,
"grad_norm": 1.9453125,
"learning_rate": 0.00017218177963655232,
"loss": 4.6712,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33400
},
{
"epoch": 2.43716643290406,
"grad_norm": 2.109375,
"learning_rate": 0.00017210251341720926,
"loss": 4.6821,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33450
},
{
"epoch": 2.440809486511594,
"grad_norm": 1.4609375,
"learning_rate": 0.0001720231527282863,
"loss": 4.6645,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33500
},
{
"epoch": 2.4444525401191277,
"grad_norm": 1.390625,
"learning_rate": 0.00017194369767376281,
"loss": 4.6742,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33550
},
{
"epoch": 2.4480955937266615,
"grad_norm": 1.7734375,
"learning_rate": 0.0001718641483577417,
"loss": 4.6638,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33600
},
{
"epoch": 2.4517386473341953,
"grad_norm": 1.7578125,
"learning_rate": 0.00017178450488444954,
"loss": 4.6735,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33650
},
{
"epoch": 2.4553817009417296,
"grad_norm": 1.1015625,
"learning_rate": 0.00017170476735823604,
"loss": 4.6823,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33700
},
{
"epoch": 2.4590247545492634,
"grad_norm": 1.65625,
"learning_rate": 0.00017162493588357433,
"loss": 4.6804,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33750
},
{
"epoch": 2.462667808156797,
"grad_norm": 1.4921875,
"learning_rate": 0.0001715450105650606,
"loss": 4.6773,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33800
},
{
"epoch": 2.466310861764331,
"grad_norm": 1.3671875,
"learning_rate": 0.0001714649915074139,
"loss": 4.6683,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33850
},
{
"epoch": 2.469953915371865,
"grad_norm": 1.859375,
"learning_rate": 0.0001713848788154762,
"loss": 4.6661,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33900
},
{
"epoch": 2.4735969689793986,
"grad_norm": 1.3671875,
"learning_rate": 0.00017130467259421212,
"loss": 4.6834,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 33950
},
{
"epoch": 2.4772400225869324,
"grad_norm": 4.03125,
"learning_rate": 0.0001712243729487088,
"loss": 4.6664,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34000
},
{
"epoch": 2.4808830761944662,
"grad_norm": 1.640625,
"learning_rate": 0.00017114397998417577,
"loss": 4.6697,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34050
},
{
"epoch": 2.484526129802,
"grad_norm": 1.3984375,
"learning_rate": 0.00017106349380594492,
"loss": 4.686,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34100
},
{
"epoch": 2.488169183409534,
"grad_norm": 2.8125,
"learning_rate": 0.00017098291451947013,
"loss": 4.6663,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34150
},
{
"epoch": 2.4918122370170677,
"grad_norm": 1.5390625,
"learning_rate": 0.00017090224223032741,
"loss": 4.6654,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34200
},
{
"epoch": 2.4954552906246015,
"grad_norm": 1.359375,
"learning_rate": 0.00017082147704421455,
"loss": 4.6702,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34250
},
{
"epoch": 2.4990983442321353,
"grad_norm": 1.4765625,
"learning_rate": 0.00017074061906695109,
"loss": 4.6702,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34300
},
{
"epoch": 2.502741397839669,
"grad_norm": 1.96875,
"learning_rate": 0.00017065966840447807,
"loss": 4.6797,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34350
},
{
"epoch": 2.506384451447203,
"grad_norm": 1.53125,
"learning_rate": 0.0001705786251628581,
"loss": 4.672,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34400
},
{
"epoch": 2.5100275050547367,
"grad_norm": 1.2421875,
"learning_rate": 0.00017049748944827494,
"loss": 4.6731,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34450
},
{
"epoch": 2.5136705586622705,
"grad_norm": 2.09375,
"learning_rate": 0.00017041626136703357,
"loss": 4.6617,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34500
},
{
"epoch": 2.5173136122698043,
"grad_norm": 2.046875,
"learning_rate": 0.00017033494102556006,
"loss": 4.6664,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34550
},
{
"epoch": 2.520956665877338,
"grad_norm": 1.65625,
"learning_rate": 0.0001702535285304012,
"loss": 4.6672,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34600
},
{
"epoch": 2.5245997194848724,
"grad_norm": 1.3359375,
"learning_rate": 0.00017017202398822477,
"loss": 4.6616,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34650
},
{
"epoch": 2.528242773092406,
"grad_norm": 1.34375,
"learning_rate": 0.00017009042750581888,
"loss": 4.6608,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34700
},
{
"epoch": 2.53188582669994,
"grad_norm": 1.765625,
"learning_rate": 0.00017000873919009225,
"loss": 4.6797,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34750
},
{
"epoch": 2.535528880307474,
"grad_norm": 1.46875,
"learning_rate": 0.00016992695914807394,
"loss": 4.6768,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34800
},
{
"epoch": 2.5391719339150076,
"grad_norm": 1.1484375,
"learning_rate": 0.00016984508748691307,
"loss": 4.6575,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34850
},
{
"epoch": 2.5428149875225414,
"grad_norm": 2.0,
"learning_rate": 0.00016976312431387893,
"loss": 4.6685,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34900
},
{
"epoch": 2.5464580411300752,
"grad_norm": 2.109375,
"learning_rate": 0.00016968106973636062,
"loss": 4.6746,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 34950
},
{
"epoch": 2.550101094737609,
"grad_norm": 1.6796875,
"learning_rate": 0.00016959892386186705,
"loss": 4.6698,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35000
},
{
"epoch": 2.553744148345143,
"grad_norm": 1.46875,
"learning_rate": 0.00016951668679802677,
"loss": 4.6728,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35050
},
{
"epoch": 2.5573872019526767,
"grad_norm": 1.3671875,
"learning_rate": 0.0001694343586525877,
"loss": 4.6767,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35100
},
{
"epoch": 2.5610302555602105,
"grad_norm": 1.265625,
"learning_rate": 0.00016935193953341722,
"loss": 4.6755,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35150
},
{
"epoch": 2.5646733091677443,
"grad_norm": 1.7109375,
"learning_rate": 0.00016926942954850182,
"loss": 4.6739,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35200
},
{
"epoch": 2.568316362775278,
"grad_norm": 1.5625,
"learning_rate": 0.00016918682880594707,
"loss": 4.6814,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35250
},
{
"epoch": 2.5719594163828123,
"grad_norm": 1.7578125,
"learning_rate": 0.0001691041374139775,
"loss": 4.6628,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35300
},
{
"epoch": 2.575602469990346,
"grad_norm": 1.296875,
"learning_rate": 0.0001690213554809363,
"loss": 4.6624,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35350
},
{
"epoch": 2.57924552359788,
"grad_norm": 2.421875,
"learning_rate": 0.00016893848311528542,
"loss": 4.6547,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35400
},
{
"epoch": 2.5828885772054138,
"grad_norm": 2.109375,
"learning_rate": 0.00016885552042560518,
"loss": 4.6846,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35450
},
{
"epoch": 2.5865316308129476,
"grad_norm": 1.328125,
"learning_rate": 0.0001687724675205943,
"loss": 4.6568,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35500
},
{
"epoch": 2.5901746844204814,
"grad_norm": 1.234375,
"learning_rate": 0.0001686893245090697,
"loss": 4.6769,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35550
},
{
"epoch": 2.593817738028015,
"grad_norm": 2.40625,
"learning_rate": 0.0001686060914999664,
"loss": 4.6646,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35600
},
{
"epoch": 2.597460791635549,
"grad_norm": 1.78125,
"learning_rate": 0.00016852276860233724,
"loss": 4.6611,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35650
},
{
"epoch": 2.601103845243083,
"grad_norm": 1.4609375,
"learning_rate": 0.0001684393559253529,
"loss": 4.6841,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35700
},
{
"epoch": 2.6047468988506166,
"grad_norm": 1.75,
"learning_rate": 0.00016835585357830162,
"loss": 4.6643,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35750
},
{
"epoch": 2.6083899524581504,
"grad_norm": 1.5078125,
"learning_rate": 0.0001682722616705892,
"loss": 4.6696,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35800
},
{
"epoch": 2.6120330060656842,
"grad_norm": 3.953125,
"learning_rate": 0.00016818858031173887,
"loss": 4.6742,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35850
},
{
"epoch": 2.615676059673218,
"grad_norm": 3.578125,
"learning_rate": 0.00016810480961139087,
"loss": 4.6653,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35900
},
{
"epoch": 2.619319113280752,
"grad_norm": 1.578125,
"learning_rate": 0.00016802094967930252,
"loss": 4.665,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 35950
},
{
"epoch": 2.6229621668882857,
"grad_norm": 1.921875,
"learning_rate": 0.00016793700062534816,
"loss": 4.6802,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36000
},
{
"epoch": 2.6266052204958195,
"grad_norm": 2.296875,
"learning_rate": 0.00016785296255951887,
"loss": 4.6725,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36050
},
{
"epoch": 2.6302482741033533,
"grad_norm": 1.2734375,
"learning_rate": 0.00016776883559192234,
"loss": 4.6635,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36100
},
{
"epoch": 2.633891327710887,
"grad_norm": 1.5078125,
"learning_rate": 0.00016768461983278273,
"loss": 4.6662,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36150
},
{
"epoch": 2.637534381318421,
"grad_norm": 1.8125,
"learning_rate": 0.00016760031539244047,
"loss": 4.6712,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36200
},
{
"epoch": 2.6411774349259547,
"grad_norm": 2.640625,
"learning_rate": 0.00016751592238135234,
"loss": 4.6612,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36250
},
{
"epoch": 2.6448204885334885,
"grad_norm": 2.46875,
"learning_rate": 0.00016743144091009105,
"loss": 4.6673,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36300
},
{
"epoch": 2.6484635421410228,
"grad_norm": 1.3828125,
"learning_rate": 0.0001673468710893452,
"loss": 4.6663,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36350
},
{
"epoch": 2.6521065957485566,
"grad_norm": 1.2890625,
"learning_rate": 0.00016726221302991925,
"loss": 4.664,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36400
},
{
"epoch": 2.6557496493560904,
"grad_norm": 2.71875,
"learning_rate": 0.0001671774668427332,
"loss": 4.6669,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36450
},
{
"epoch": 2.659392702963624,
"grad_norm": 1.7578125,
"learning_rate": 0.00016709263263882244,
"loss": 4.6605,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36500
},
{
"epoch": 2.663035756571158,
"grad_norm": 1.2109375,
"learning_rate": 0.00016700771052933786,
"loss": 4.668,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36550
},
{
"epoch": 2.666678810178692,
"grad_norm": 1.375,
"learning_rate": 0.00016692270062554539,
"loss": 4.6727,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36600
},
{
"epoch": 2.6703218637862256,
"grad_norm": 2.265625,
"learning_rate": 0.00016683760303882602,
"loss": 4.6592,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36650
},
{
"epoch": 2.6739649173937594,
"grad_norm": 2.21875,
"learning_rate": 0.00016675241788067564,
"loss": 4.677,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36700
},
{
"epoch": 2.677607971001293,
"grad_norm": 2.015625,
"learning_rate": 0.0001666671452627049,
"loss": 4.6738,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36750
},
{
"epoch": 2.681251024608827,
"grad_norm": 1.53125,
"learning_rate": 0.00016658178529663892,
"loss": 4.6657,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36800
},
{
"epoch": 2.684894078216361,
"grad_norm": 1.5703125,
"learning_rate": 0.00016649633809431746,
"loss": 4.6625,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36850
},
{
"epoch": 2.6885371318238946,
"grad_norm": 3.28125,
"learning_rate": 0.0001664108037676944,
"loss": 4.6672,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36900
},
{
"epoch": 2.692180185431429,
"grad_norm": 1.7890625,
"learning_rate": 0.00016632518242883787,
"loss": 4.6855,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 36950
},
{
"epoch": 2.6958232390389627,
"grad_norm": 1.578125,
"learning_rate": 0.00016623947418993,
"loss": 4.6676,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37000
},
{
"epoch": 2.6994662926464965,
"grad_norm": 1.7578125,
"learning_rate": 0.00016615367916326672,
"loss": 4.6699,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37050
},
{
"epoch": 2.7031093462540303,
"grad_norm": 1.765625,
"learning_rate": 0.0001660677974612577,
"loss": 4.6637,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37100
},
{
"epoch": 2.706752399861564,
"grad_norm": 2.515625,
"learning_rate": 0.0001659818291964262,
"loss": 4.6789,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37150
},
{
"epoch": 2.710395453469098,
"grad_norm": 2.0,
"learning_rate": 0.00016589577448140888,
"loss": 4.6829,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37200
},
{
"epoch": 2.7140385070766317,
"grad_norm": 2.8125,
"learning_rate": 0.00016580963342895563,
"loss": 4.6611,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37250
},
{
"epoch": 2.7176815606841656,
"grad_norm": 2.046875,
"learning_rate": 0.00016572340615192952,
"loss": 4.6801,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37300
},
{
"epoch": 2.7213246142916994,
"grad_norm": 1.5078125,
"learning_rate": 0.00016563709276330656,
"loss": 4.6716,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37350
},
{
"epoch": 2.724967667899233,
"grad_norm": 1.4453125,
"learning_rate": 0.00016555069337617558,
"loss": 4.6662,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37400
},
{
"epoch": 2.728610721506767,
"grad_norm": 1.5390625,
"learning_rate": 0.00016546420810373809,
"loss": 4.6735,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37450
},
{
"epoch": 2.732253775114301,
"grad_norm": 1.9375,
"learning_rate": 0.00016537763705930816,
"loss": 4.6842,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37500
},
{
"epoch": 2.7358968287218346,
"grad_norm": 1.59375,
"learning_rate": 0.0001652909803563122,
"loss": 4.6689,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37550
},
{
"epoch": 2.7395398823293684,
"grad_norm": 1.3359375,
"learning_rate": 0.00016520423810828888,
"loss": 4.6773,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37600
},
{
"epoch": 2.743182935936902,
"grad_norm": 1.3671875,
"learning_rate": 0.0001651174104288889,
"loss": 4.6732,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37650
},
{
"epoch": 2.746825989544436,
"grad_norm": 1.484375,
"learning_rate": 0.00016503049743187494,
"loss": 4.6915,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37700
},
{
"epoch": 2.75046904315197,
"grad_norm": 1.78125,
"learning_rate": 0.0001649434992311215,
"loss": 4.684,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37750
},
{
"epoch": 2.7541120967595036,
"grad_norm": 1.4296875,
"learning_rate": 0.0001648564159406146,
"loss": 4.6677,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37800
},
{
"epoch": 2.7577551503670374,
"grad_norm": 1.5234375,
"learning_rate": 0.00016476924767445182,
"loss": 4.6729,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37850
},
{
"epoch": 2.7613982039745713,
"grad_norm": 1.453125,
"learning_rate": 0.00016468199454684214,
"loss": 4.6682,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37900
},
{
"epoch": 2.765041257582105,
"grad_norm": 1.6171875,
"learning_rate": 0.00016459465667210558,
"loss": 4.6634,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 37950
},
{
"epoch": 2.7686843111896393,
"grad_norm": 2.5,
"learning_rate": 0.00016450723416467332,
"loss": 4.6599,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38000
},
{
"epoch": 2.772327364797173,
"grad_norm": 1.65625,
"learning_rate": 0.00016441972713908737,
"loss": 4.6732,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38050
},
{
"epoch": 2.775970418404707,
"grad_norm": 1.2578125,
"learning_rate": 0.00016433213571000047,
"loss": 4.6749,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38100
},
{
"epoch": 2.7796134720122407,
"grad_norm": 1.59375,
"learning_rate": 0.00016424445999217602,
"loss": 4.673,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38150
},
{
"epoch": 2.7832565256197745,
"grad_norm": 2.625,
"learning_rate": 0.0001641567001004877,
"loss": 4.6883,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38200
},
{
"epoch": 2.7868995792273084,
"grad_norm": 1.2890625,
"learning_rate": 0.00016406885614991964,
"loss": 4.6538,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38250
},
{
"epoch": 2.790542632834842,
"grad_norm": 2.078125,
"learning_rate": 0.00016398092825556604,
"loss": 4.6694,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38300
},
{
"epoch": 2.794185686442376,
"grad_norm": 1.4140625,
"learning_rate": 0.00016389291653263108,
"loss": 4.6546,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38350
},
{
"epoch": 2.79782874004991,
"grad_norm": 1.7109375,
"learning_rate": 0.00016380482109642878,
"loss": 4.6594,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38400
},
{
"epoch": 2.8014717936574436,
"grad_norm": 1.2421875,
"learning_rate": 0.0001637166420623828,
"loss": 4.6741,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38450
},
{
"epoch": 2.8051148472649774,
"grad_norm": 3.25,
"learning_rate": 0.00016362837954602642,
"loss": 4.6649,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38500
},
{
"epoch": 2.808757900872511,
"grad_norm": 2.078125,
"learning_rate": 0.00016354003366300223,
"loss": 4.6734,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38550
},
{
"epoch": 2.812400954480045,
"grad_norm": 2.40625,
"learning_rate": 0.00016345160452906207,
"loss": 4.6686,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38600
},
{
"epoch": 2.8160440080875793,
"grad_norm": 1.6328125,
"learning_rate": 0.00016336309226006688,
"loss": 4.663,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38650
},
{
"epoch": 2.819687061695113,
"grad_norm": 2.15625,
"learning_rate": 0.0001632744969719864,
"loss": 4.6637,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38700
},
{
"epoch": 2.823330115302647,
"grad_norm": 2.140625,
"learning_rate": 0.0001631858187808994,
"loss": 4.6513,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38750
},
{
"epoch": 2.8269731689101807,
"grad_norm": 1.515625,
"learning_rate": 0.00016309705780299298,
"loss": 4.6545,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38800
},
{
"epoch": 2.8306162225177145,
"grad_norm": 1.8359375,
"learning_rate": 0.00016300821415456292,
"loss": 4.6579,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38850
},
{
"epoch": 2.8342592761252483,
"grad_norm": 2.203125,
"learning_rate": 0.00016291928795201318,
"loss": 4.6785,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38900
},
{
"epoch": 2.837902329732782,
"grad_norm": 1.3125,
"learning_rate": 0.00016283027931185594,
"loss": 4.6634,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 38950
},
{
"epoch": 2.841545383340316,
"grad_norm": 2.84375,
"learning_rate": 0.00016274118835071146,
"loss": 4.6633,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39000
},
{
"epoch": 2.8451884369478497,
"grad_norm": 1.484375,
"learning_rate": 0.0001626520151853077,
"loss": 4.6687,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39050
},
{
"epoch": 2.8488314905553835,
"grad_norm": 1.140625,
"learning_rate": 0.00016256275993248052,
"loss": 4.6654,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39100
},
{
"epoch": 2.8524745441629173,
"grad_norm": 1.6640625,
"learning_rate": 0.00016247342270917309,
"loss": 4.6683,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39150
},
{
"epoch": 2.856117597770451,
"grad_norm": 3.953125,
"learning_rate": 0.0001623840036324362,
"loss": 4.6834,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39200
},
{
"epoch": 2.859760651377985,
"grad_norm": 1.6328125,
"learning_rate": 0.0001622945028194278,
"loss": 4.6953,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39250
},
{
"epoch": 2.8634037049855188,
"grad_norm": 1.4921875,
"learning_rate": 0.00016220492038741292,
"loss": 4.6717,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39300
},
{
"epoch": 2.8670467585930526,
"grad_norm": 1.7578125,
"learning_rate": 0.00016211525645376353,
"loss": 4.6674,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39350
},
{
"epoch": 2.8706898122005864,
"grad_norm": 1.265625,
"learning_rate": 0.0001620255111359584,
"loss": 4.6619,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39400
},
{
"epoch": 2.87433286580812,
"grad_norm": 1.3203125,
"learning_rate": 0.0001619356845515829,
"loss": 4.6823,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39450
},
{
"epoch": 2.877975919415654,
"grad_norm": 1.734375,
"learning_rate": 0.00016184577681832893,
"loss": 4.6693,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39500
},
{
"epoch": 2.881618973023188,
"grad_norm": 1.8671875,
"learning_rate": 0.0001617557880539947,
"loss": 4.6815,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39550
},
{
"epoch": 2.8852620266307216,
"grad_norm": 2.09375,
"learning_rate": 0.0001616657183764845,
"loss": 4.6618,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39600
},
{
"epoch": 2.8889050802382554,
"grad_norm": 1.6171875,
"learning_rate": 0.00016157556790380882,
"loss": 4.6625,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39650
},
{
"epoch": 2.8925481338457897,
"grad_norm": 1.7578125,
"learning_rate": 0.00016148533675408377,
"loss": 4.6692,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39700
},
{
"epoch": 2.8961911874533235,
"grad_norm": 1.765625,
"learning_rate": 0.00016139502504553135,
"loss": 4.6749,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39750
},
{
"epoch": 2.8998342410608573,
"grad_norm": 1.921875,
"learning_rate": 0.00016130463289647907,
"loss": 4.6764,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39800
},
{
"epoch": 2.903477294668391,
"grad_norm": 1.390625,
"learning_rate": 0.00016121416042535973,
"loss": 4.6761,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39850
},
{
"epoch": 2.907120348275925,
"grad_norm": 1.5546875,
"learning_rate": 0.00016112360775071154,
"loss": 4.6777,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39900
},
{
"epoch": 2.9107634018834587,
"grad_norm": 1.375,
"learning_rate": 0.0001610329749911776,
"loss": 4.6786,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 39950
},
{
"epoch": 2.9144064554909925,
"grad_norm": 2.125,
"learning_rate": 0.00016094226226550618,
"loss": 4.6697,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40000
},
{
"epoch": 2.9180495090985263,
"grad_norm": 2.25,
"learning_rate": 0.00016085146969255004,
"loss": 4.6815,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40050
},
{
"epoch": 2.92169256270606,
"grad_norm": 1.2578125,
"learning_rate": 0.0001607605973912668,
"loss": 4.6589,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40100
},
{
"epoch": 2.925335616313594,
"grad_norm": 5.1875,
"learning_rate": 0.00016066964548071838,
"loss": 4.6829,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40150
},
{
"epoch": 2.9289786699211278,
"grad_norm": 1.875,
"learning_rate": 0.00016057861408007114,
"loss": 4.6657,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40200
},
{
"epoch": 2.9326217235286616,
"grad_norm": 2.90625,
"learning_rate": 0.00016048750330859544,
"loss": 4.6819,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40250
},
{
"epoch": 2.936264777136196,
"grad_norm": 1.171875,
"learning_rate": 0.0001603963132856657,
"loss": 4.6581,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40300
},
{
"epoch": 2.9399078307437296,
"grad_norm": 2.28125,
"learning_rate": 0.00016030504413076032,
"loss": 4.6718,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40350
},
{
"epoch": 2.9435508843512634,
"grad_norm": 2.578125,
"learning_rate": 0.00016021369596346108,
"loss": 4.6695,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40400
},
{
"epoch": 2.9471939379587972,
"grad_norm": 2.0625,
"learning_rate": 0.00016012226890345352,
"loss": 4.6671,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40450
},
{
"epoch": 2.950836991566331,
"grad_norm": 2.171875,
"learning_rate": 0.00016003076307052644,
"loss": 4.6717,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40500
},
{
"epoch": 2.954480045173865,
"grad_norm": 2.734375,
"learning_rate": 0.00015993917858457194,
"loss": 4.6727,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40550
},
{
"epoch": 2.9581230987813987,
"grad_norm": 1.75,
"learning_rate": 0.00015984751556558506,
"loss": 4.6665,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40600
},
{
"epoch": 2.9617661523889325,
"grad_norm": 1.8046875,
"learning_rate": 0.00015975577413366386,
"loss": 4.661,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40650
},
{
"epoch": 2.9654092059964663,
"grad_norm": 1.4765625,
"learning_rate": 0.00015966395440900896,
"loss": 4.673,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40700
},
{
"epoch": 2.969052259604,
"grad_norm": 1.25,
"learning_rate": 0.00015957205651192377,
"loss": 4.6778,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40750
},
{
"epoch": 2.972695313211534,
"grad_norm": 1.625,
"learning_rate": 0.00015948008056281395,
"loss": 4.67,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40800
},
{
"epoch": 2.9763383668190677,
"grad_norm": 1.1015625,
"learning_rate": 0.00015938802668218752,
"loss": 4.6577,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40850
},
{
"epoch": 2.9799814204266015,
"grad_norm": 1.296875,
"learning_rate": 0.00015929589499065458,
"loss": 4.6602,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40900
},
{
"epoch": 2.9836244740341353,
"grad_norm": 2.015625,
"learning_rate": 0.00015920368560892723,
"loss": 4.676,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 40950
},
{
"epoch": 2.987267527641669,
"grad_norm": 1.828125,
"learning_rate": 0.0001591113986578192,
"loss": 4.658,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41000
},
{
"epoch": 2.990910581249203,
"grad_norm": 3.234375,
"learning_rate": 0.00015901903425824605,
"loss": 4.6804,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41050
},
{
"epoch": 2.9945536348567368,
"grad_norm": 1.7734375,
"learning_rate": 0.0001589265925312247,
"loss": 4.6605,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41100
},
{
"epoch": 2.9981966884642706,
"grad_norm": 1.75,
"learning_rate": 0.0001588340735978734,
"loss": 4.6688,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41150
},
{
"epoch": 2.9997996320515856,
"eval_loss": 4.678781032562256,
"eval_runtime": 582.4467,
"eval_samples_per_second": 520.782,
"eval_steps_per_second": 43.4,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41172
},
{
"epoch": 3.001821526803767,
"grad_norm": 1.96875,
"learning_rate": 0.0001587414775794116,
"loss": 4.6603,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41200
},
{
"epoch": 3.0054645804113007,
"grad_norm": 1.546875,
"learning_rate": 0.00015864880459715968,
"loss": 4.6561,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41250
},
{
"epoch": 3.0091076340188345,
"grad_norm": 1.625,
"learning_rate": 0.00015855605477253893,
"loss": 4.6607,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41300
},
{
"epoch": 3.0127506876263683,
"grad_norm": 1.734375,
"learning_rate": 0.00015846322822707124,
"loss": 4.6587,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41350
},
{
"epoch": 3.016393741233902,
"grad_norm": 1.71875,
"learning_rate": 0.0001583703250823791,
"loss": 4.66,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41400
},
{
"epoch": 3.020036794841436,
"grad_norm": 2.71875,
"learning_rate": 0.00015827734546018535,
"loss": 4.6593,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41450
},
{
"epoch": 3.0236798484489698,
"grad_norm": 2.015625,
"learning_rate": 0.00015818428948231297,
"loss": 4.6726,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41500
},
{
"epoch": 3.0273229020565036,
"grad_norm": 1.359375,
"learning_rate": 0.00015809115727068504,
"loss": 4.6615,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41550
},
{
"epoch": 3.030965955664038,
"grad_norm": 1.4609375,
"learning_rate": 0.0001579979489473245,
"loss": 4.6702,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41600
},
{
"epoch": 3.0346090092715716,
"grad_norm": 1.7734375,
"learning_rate": 0.00015790466463435395,
"loss": 4.6569,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41650
},
{
"epoch": 3.0382520628791054,
"grad_norm": 1.2109375,
"learning_rate": 0.00015781130445399573,
"loss": 4.6531,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41700
},
{
"epoch": 3.0418951164866392,
"grad_norm": 1.9296875,
"learning_rate": 0.00015771786852857142,
"loss": 4.654,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41750
},
{
"epoch": 3.045538170094173,
"grad_norm": 1.7734375,
"learning_rate": 0.00015762435698050186,
"loss": 4.6573,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41800
},
{
"epoch": 3.049181223701707,
"grad_norm": 1.4296875,
"learning_rate": 0.00015753076993230706,
"loss": 4.6682,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41850
},
{
"epoch": 3.0528242773092407,
"grad_norm": 1.3203125,
"learning_rate": 0.00015743710750660588,
"loss": 4.666,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41900
},
{
"epoch": 3.0564673309167745,
"grad_norm": 1.7265625,
"learning_rate": 0.00015734336982611594,
"loss": 4.6541,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 41950
},
{
"epoch": 3.0601103845243083,
"grad_norm": 1.765625,
"learning_rate": 0.0001572495570136535,
"loss": 4.6601,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42000
},
{
"epoch": 3.063753438131842,
"grad_norm": 1.59375,
"learning_rate": 0.0001571556691921332,
"loss": 4.6492,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42050
},
{
"epoch": 3.067396491739376,
"grad_norm": 1.8359375,
"learning_rate": 0.000157061706484568,
"loss": 4.6617,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42100
},
{
"epoch": 3.0710395453469097,
"grad_norm": 1.34375,
"learning_rate": 0.000156967669014069,
"loss": 4.665,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42150
},
{
"epoch": 3.0746825989544435,
"grad_norm": 1.1640625,
"learning_rate": 0.0001568735569038452,
"loss": 4.6767,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42200
},
{
"epoch": 3.0783256525619773,
"grad_norm": 2.21875,
"learning_rate": 0.00015677937027720344,
"loss": 4.6662,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42250
},
{
"epoch": 3.081968706169511,
"grad_norm": 1.8984375,
"learning_rate": 0.00015668510925754813,
"loss": 4.664,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42300
},
{
"epoch": 3.085611759777045,
"grad_norm": 2.4375,
"learning_rate": 0.0001565907739683812,
"loss": 4.6605,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42350
},
{
"epoch": 3.0892548133845787,
"grad_norm": 1.8046875,
"learning_rate": 0.00015649636453330193,
"loss": 4.6696,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42400
},
{
"epoch": 3.092897866992113,
"grad_norm": 1.65625,
"learning_rate": 0.00015640188107600664,
"loss": 4.6404,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42450
},
{
"epoch": 3.096540920599647,
"grad_norm": 1.46875,
"learning_rate": 0.00015630732372028873,
"loss": 4.6577,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42500
},
{
"epoch": 3.1001839742071806,
"grad_norm": 2.015625,
"learning_rate": 0.00015621269259003835,
"loss": 4.6764,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42550
},
{
"epoch": 3.1038270278147144,
"grad_norm": 1.421875,
"learning_rate": 0.00015611798780924236,
"loss": 4.6652,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42600
},
{
"epoch": 3.1074700814222482,
"grad_norm": 1.5625,
"learning_rate": 0.0001560232095019841,
"loss": 4.6604,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42650
},
{
"epoch": 3.111113135029782,
"grad_norm": 1.3984375,
"learning_rate": 0.00015592835779244327,
"loss": 4.6584,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42700
},
{
"epoch": 3.114756188637316,
"grad_norm": 1.78125,
"learning_rate": 0.0001558334328048957,
"loss": 4.6588,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42750
},
{
"epoch": 3.1183992422448497,
"grad_norm": 1.34375,
"learning_rate": 0.00015573843466371324,
"loss": 4.6702,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42800
},
{
"epoch": 3.1220422958523835,
"grad_norm": 2.53125,
"learning_rate": 0.00015564336349336362,
"loss": 4.668,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42850
},
{
"epoch": 3.1256853494599173,
"grad_norm": 1.7265625,
"learning_rate": 0.00015554821941841023,
"loss": 4.6675,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42900
},
{
"epoch": 3.129328403067451,
"grad_norm": 1.71875,
"learning_rate": 0.00015545300256351193,
"loss": 4.6616,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 42950
},
{
"epoch": 3.132971456674985,
"grad_norm": 1.6015625,
"learning_rate": 0.00015535771305342307,
"loss": 4.6741,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43000
},
{
"epoch": 3.1366145102825187,
"grad_norm": 1.9453125,
"learning_rate": 0.00015526235101299304,
"loss": 4.6554,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43050
},
{
"epoch": 3.1402575638900525,
"grad_norm": 1.90625,
"learning_rate": 0.00015516691656716636,
"loss": 4.6621,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43100
},
{
"epoch": 3.1439006174975863,
"grad_norm": 1.3515625,
"learning_rate": 0.00015507140984098242,
"loss": 4.66,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43150
},
{
"epoch": 3.14754367110512,
"grad_norm": 1.90625,
"learning_rate": 0.00015497583095957522,
"loss": 4.6722,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43200
},
{
"epoch": 3.1511867247126544,
"grad_norm": 1.921875,
"learning_rate": 0.0001548801800481734,
"loss": 4.6719,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43250
},
{
"epoch": 3.154829778320188,
"grad_norm": 1.5703125,
"learning_rate": 0.0001547844572320999,
"loss": 4.6563,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43300
},
{
"epoch": 3.158472831927722,
"grad_norm": 1.2890625,
"learning_rate": 0.00015468866263677194,
"loss": 4.6653,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43350
},
{
"epoch": 3.162115885535256,
"grad_norm": 1.921875,
"learning_rate": 0.00015459279638770077,
"loss": 4.6787,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43400
},
{
"epoch": 3.1657589391427896,
"grad_norm": 1.5546875,
"learning_rate": 0.00015449685861049144,
"loss": 4.6616,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43450
},
{
"epoch": 3.1694019927503234,
"grad_norm": 2.0625,
"learning_rate": 0.00015440084943084282,
"loss": 4.6695,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43500
},
{
"epoch": 3.1730450463578572,
"grad_norm": 1.2109375,
"learning_rate": 0.00015430476897454725,
"loss": 4.6631,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43550
},
{
"epoch": 3.176688099965391,
"grad_norm": 1.6171875,
"learning_rate": 0.00015420861736749057,
"loss": 4.6522,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43600
},
{
"epoch": 3.180331153572925,
"grad_norm": 2.015625,
"learning_rate": 0.00015411239473565172,
"loss": 4.662,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43650
},
{
"epoch": 3.1839742071804586,
"grad_norm": 1.8515625,
"learning_rate": 0.00015401610120510274,
"loss": 4.6728,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43700
},
{
"epoch": 3.1876172607879925,
"grad_norm": 1.9921875,
"learning_rate": 0.0001539197369020086,
"loss": 4.666,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43750
},
{
"epoch": 3.1912603143955263,
"grad_norm": 1.2734375,
"learning_rate": 0.00015382330195262697,
"loss": 4.6667,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43800
},
{
"epoch": 3.19490336800306,
"grad_norm": 2.265625,
"learning_rate": 0.00015372679648330807,
"loss": 4.6638,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43850
},
{
"epoch": 3.198546421610594,
"grad_norm": 2.453125,
"learning_rate": 0.00015363022062049453,
"loss": 4.6707,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43900
},
{
"epoch": 3.2021894752181277,
"grad_norm": 2.625,
"learning_rate": 0.00015353357449072118,
"loss": 4.6808,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 43950
},
{
"epoch": 3.2058325288256615,
"grad_norm": 2.0,
"learning_rate": 0.00015343685822061498,
"loss": 4.6628,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44000
},
{
"epoch": 3.2094755824331953,
"grad_norm": 1.59375,
"learning_rate": 0.00015334007193689475,
"loss": 4.6789,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44050
},
{
"epoch": 3.213118636040729,
"grad_norm": 1.5234375,
"learning_rate": 0.00015324321576637098,
"loss": 4.6682,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44100
},
{
"epoch": 3.2167616896482634,
"grad_norm": 1.28125,
"learning_rate": 0.00015314628983594588,
"loss": 4.6518,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44150
},
{
"epoch": 3.220404743255797,
"grad_norm": 1.890625,
"learning_rate": 0.0001530492942726129,
"loss": 4.6576,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44200
},
{
"epoch": 3.224047796863331,
"grad_norm": 1.328125,
"learning_rate": 0.00015295222920345681,
"loss": 4.6761,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44250
},
{
"epoch": 3.227690850470865,
"grad_norm": 1.3671875,
"learning_rate": 0.00015285509475565344,
"loss": 4.6632,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44300
},
{
"epoch": 3.2313339040783986,
"grad_norm": 1.9453125,
"learning_rate": 0.00015275789105646952,
"loss": 4.6736,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44350
},
{
"epoch": 3.2349769576859324,
"grad_norm": 1.3515625,
"learning_rate": 0.0001526606182332625,
"loss": 4.6553,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44400
},
{
"epoch": 3.238620011293466,
"grad_norm": 2.109375,
"learning_rate": 0.00015256327641348036,
"loss": 4.6735,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44450
},
{
"epoch": 3.242263064901,
"grad_norm": 2.3125,
"learning_rate": 0.0001524658657246616,
"loss": 4.6757,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44500
},
{
"epoch": 3.245906118508534,
"grad_norm": 1.3828125,
"learning_rate": 0.0001523683862944348,
"loss": 4.6585,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44550
},
{
"epoch": 3.2495491721160676,
"grad_norm": 1.625,
"learning_rate": 0.00015227083825051875,
"loss": 4.672,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44600
},
{
"epoch": 3.2531922257236014,
"grad_norm": 1.9453125,
"learning_rate": 0.000152173221720722,
"loss": 4.6732,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44650
},
{
"epoch": 3.2568352793311353,
"grad_norm": 2.671875,
"learning_rate": 0.00015207553683294298,
"loss": 4.6556,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44700
},
{
"epoch": 3.260478332938669,
"grad_norm": 1.8984375,
"learning_rate": 0.0001519777837151695,
"loss": 4.6633,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44750
},
{
"epoch": 3.264121386546203,
"grad_norm": 1.7734375,
"learning_rate": 0.000151879962495479,
"loss": 4.6799,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44800
},
{
"epoch": 3.2677644401537367,
"grad_norm": 1.4296875,
"learning_rate": 0.0001517820733020379,
"loss": 4.6737,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44850
},
{
"epoch": 3.271407493761271,
"grad_norm": 3.1875,
"learning_rate": 0.00015168411626310184,
"loss": 4.6544,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44900
},
{
"epoch": 3.2750505473688047,
"grad_norm": 2.484375,
"learning_rate": 0.00015158609150701537,
"loss": 4.6729,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 44950
},
{
"epoch": 3.2786936009763386,
"grad_norm": 1.8984375,
"learning_rate": 0.0001514879991622116,
"loss": 4.6628,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45000
},
{
"epoch": 3.2823366545838724,
"grad_norm": 2.234375,
"learning_rate": 0.0001513898393572124,
"loss": 4.6395,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45050
},
{
"epoch": 3.285979708191406,
"grad_norm": 2.203125,
"learning_rate": 0.00015129161222062783,
"loss": 4.664,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45100
},
{
"epoch": 3.28962276179894,
"grad_norm": 1.4140625,
"learning_rate": 0.00015119331788115633,
"loss": 4.6565,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45150
},
{
"epoch": 3.293265815406474,
"grad_norm": 1.78125,
"learning_rate": 0.0001510949564675843,
"loss": 4.6621,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45200
},
{
"epoch": 3.2969088690140076,
"grad_norm": 1.8359375,
"learning_rate": 0.0001509965281087861,
"loss": 4.677,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45250
},
{
"epoch": 3.3005519226215414,
"grad_norm": 1.375,
"learning_rate": 0.00015089803293372365,
"loss": 4.6671,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45300
},
{
"epoch": 3.304194976229075,
"grad_norm": 1.34375,
"learning_rate": 0.00015079947107144657,
"loss": 4.6683,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45350
},
{
"epoch": 3.307838029836609,
"grad_norm": 1.75,
"learning_rate": 0.00015070084265109176,
"loss": 4.6641,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45400
},
{
"epoch": 3.311481083444143,
"grad_norm": 1.8828125,
"learning_rate": 0.00015060214780188345,
"loss": 4.6374,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45450
},
{
"epoch": 3.3151241370516766,
"grad_norm": 1.7578125,
"learning_rate": 0.00015050338665313276,
"loss": 4.6689,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45500
},
{
"epoch": 3.3187671906592104,
"grad_norm": 1.9375,
"learning_rate": 0.0001504045593342377,
"loss": 4.6764,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45550
},
{
"epoch": 3.3224102442667443,
"grad_norm": 2.359375,
"learning_rate": 0.00015030566597468305,
"loss": 4.6733,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45600
},
{
"epoch": 3.326053297874278,
"grad_norm": 2.046875,
"learning_rate": 0.0001502067067040401,
"loss": 4.6676,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45650
},
{
"epoch": 3.329696351481812,
"grad_norm": 1.0625,
"learning_rate": 0.00015010768165196647,
"loss": 4.6737,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45700
},
{
"epoch": 3.3333394050893457,
"grad_norm": 1.515625,
"learning_rate": 0.00015000859094820593,
"loss": 4.6632,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45750
},
{
"epoch": 3.3369824586968795,
"grad_norm": 1.6640625,
"learning_rate": 0.00014990943472258832,
"loss": 4.6773,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45800
},
{
"epoch": 3.3406255123044137,
"grad_norm": 1.328125,
"learning_rate": 0.00014981021310502937,
"loss": 4.6582,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45850
},
{
"epoch": 3.3442685659119475,
"grad_norm": 1.65625,
"learning_rate": 0.00014971092622553038,
"loss": 4.6628,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45900
},
{
"epoch": 3.3479116195194814,
"grad_norm": 2.46875,
"learning_rate": 0.00014961157421417823,
"loss": 4.6629,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 45950
},
{
"epoch": 3.351554673127015,
"grad_norm": 1.4921875,
"learning_rate": 0.00014951215720114514,
"loss": 4.6821,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46000
},
{
"epoch": 3.355197726734549,
"grad_norm": 2.765625,
"learning_rate": 0.00014941267531668845,
"loss": 4.6796,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46050
},
{
"epoch": 3.3588407803420828,
"grad_norm": 1.96875,
"learning_rate": 0.00014931312869115052,
"loss": 4.6634,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46100
},
{
"epoch": 3.3624838339496166,
"grad_norm": 1.9765625,
"learning_rate": 0.00014921351745495858,
"loss": 4.6605,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46150
},
{
"epoch": 3.3661268875571504,
"grad_norm": 2.125,
"learning_rate": 0.00014911384173862445,
"loss": 4.6685,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46200
},
{
"epoch": 3.369769941164684,
"grad_norm": 1.484375,
"learning_rate": 0.00014901410167274438,
"loss": 4.6604,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46250
},
{
"epoch": 3.373412994772218,
"grad_norm": 1.578125,
"learning_rate": 0.0001489142973879991,
"loss": 4.6723,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46300
},
{
"epoch": 3.377056048379752,
"grad_norm": 1.90625,
"learning_rate": 0.00014881442901515333,
"loss": 4.6638,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46350
},
{
"epoch": 3.3806991019872856,
"grad_norm": 2.09375,
"learning_rate": 0.00014871449668505586,
"loss": 4.6548,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46400
},
{
"epoch": 3.3843421555948194,
"grad_norm": 1.8359375,
"learning_rate": 0.00014861450052863914,
"loss": 4.6652,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46450
},
{
"epoch": 3.3879852092023532,
"grad_norm": 1.296875,
"learning_rate": 0.00014851444067691944,
"loss": 4.6641,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46500
},
{
"epoch": 3.3916282628098875,
"grad_norm": 2.59375,
"learning_rate": 0.00014841431726099632,
"loss": 4.6579,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46550
},
{
"epoch": 3.3952713164174213,
"grad_norm": 2.09375,
"learning_rate": 0.00014831413041205272,
"loss": 4.666,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46600
},
{
"epoch": 3.398914370024955,
"grad_norm": 1.1484375,
"learning_rate": 0.0001482138802613546,
"loss": 4.6717,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46650
},
{
"epoch": 3.402557423632489,
"grad_norm": 2.234375,
"learning_rate": 0.00014811356694025097,
"loss": 4.6704,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46700
},
{
"epoch": 3.4062004772400227,
"grad_norm": 1.203125,
"learning_rate": 0.00014801319058017348,
"loss": 4.6745,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46750
},
{
"epoch": 3.4098435308475565,
"grad_norm": 2.8125,
"learning_rate": 0.00014791275131263652,
"loss": 4.6627,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46800
},
{
"epoch": 3.4134865844550903,
"grad_norm": 1.625,
"learning_rate": 0.0001478122492692368,
"loss": 4.6639,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46850
},
{
"epoch": 3.417129638062624,
"grad_norm": 1.359375,
"learning_rate": 0.0001477116845816533,
"loss": 4.6558,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46900
},
{
"epoch": 3.420772691670158,
"grad_norm": 1.3515625,
"learning_rate": 0.00014761105738164706,
"loss": 4.6723,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 46950
},
{
"epoch": 3.4244157452776918,
"grad_norm": 1.40625,
"learning_rate": 0.0001475103678010611,
"loss": 4.659,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47000
},
{
"epoch": 3.4280587988852256,
"grad_norm": 1.421875,
"learning_rate": 0.00014740961597182006,
"loss": 4.6581,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47050
},
{
"epoch": 3.4317018524927594,
"grad_norm": 1.734375,
"learning_rate": 0.00014730880202593023,
"loss": 4.6662,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47100
},
{
"epoch": 3.435344906100293,
"grad_norm": 3.75,
"learning_rate": 0.00014720792609547928,
"loss": 4.6708,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47150
},
{
"epoch": 3.438987959707827,
"grad_norm": 2.078125,
"learning_rate": 0.00014710698831263595,
"loss": 4.6745,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47200
},
{
"epoch": 3.442631013315361,
"grad_norm": 2.03125,
"learning_rate": 0.0001470059888096503,
"loss": 4.6637,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47250
},
{
"epoch": 3.4462740669228946,
"grad_norm": 1.40625,
"learning_rate": 0.000146904927718853,
"loss": 4.6691,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47300
},
{
"epoch": 3.4499171205304284,
"grad_norm": 2.34375,
"learning_rate": 0.00014680380517265556,
"loss": 4.6609,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47350
},
{
"epoch": 3.4535601741379622,
"grad_norm": 1.7890625,
"learning_rate": 0.00014670262130354983,
"loss": 4.6701,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47400
},
{
"epoch": 3.457203227745496,
"grad_norm": 2.28125,
"learning_rate": 0.00014660137624410827,
"loss": 4.6833,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47450
},
{
"epoch": 3.4608462813530303,
"grad_norm": 1.5390625,
"learning_rate": 0.00014650007012698333,
"loss": 4.6722,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47500
},
{
"epoch": 3.464489334960564,
"grad_norm": 1.46875,
"learning_rate": 0.0001463987030849075,
"loss": 4.6706,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47550
},
{
"epoch": 3.468132388568098,
"grad_norm": 1.3203125,
"learning_rate": 0.00014629727525069307,
"loss": 4.6575,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47600
},
{
"epoch": 3.4717754421756317,
"grad_norm": 2.90625,
"learning_rate": 0.00014619578675723201,
"loss": 4.6738,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47650
},
{
"epoch": 3.4754184957831655,
"grad_norm": 1.1796875,
"learning_rate": 0.00014609423773749583,
"loss": 4.675,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47700
},
{
"epoch": 3.4790615493906993,
"grad_norm": 1.40625,
"learning_rate": 0.00014599262832453523,
"loss": 4.6703,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47750
},
{
"epoch": 3.482704602998233,
"grad_norm": 1.484375,
"learning_rate": 0.00014589095865148006,
"loss": 4.6795,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47800
},
{
"epoch": 3.486347656605767,
"grad_norm": 1.6328125,
"learning_rate": 0.00014578922885153916,
"loss": 4.6597,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47850
},
{
"epoch": 3.4899907102133008,
"grad_norm": 1.1953125,
"learning_rate": 0.0001456874390580002,
"loss": 4.6612,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47900
},
{
"epoch": 3.4936337638208346,
"grad_norm": 2.171875,
"learning_rate": 0.0001455855894042293,
"loss": 4.6689,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 47950
},
{
"epoch": 3.4972768174283684,
"grad_norm": 1.5625,
"learning_rate": 0.00014548368002367118,
"loss": 4.6654,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48000
},
{
"epoch": 3.500919871035902,
"grad_norm": 1.7265625,
"learning_rate": 0.00014538171104984868,
"loss": 4.6726,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48050
},
{
"epoch": 3.504562924643436,
"grad_norm": 1.9453125,
"learning_rate": 0.00014527968261636277,
"loss": 4.6724,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48100
},
{
"epoch": 3.50820597825097,
"grad_norm": 2.125,
"learning_rate": 0.00014517759485689236,
"loss": 4.6692,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48150
},
{
"epoch": 3.511849031858504,
"grad_norm": 1.609375,
"learning_rate": 0.00014507544790519407,
"loss": 4.6592,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48200
},
{
"epoch": 3.515492085466038,
"grad_norm": 1.515625,
"learning_rate": 0.00014497324189510208,
"loss": 4.6634,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48250
},
{
"epoch": 3.5191351390735717,
"grad_norm": 1.828125,
"learning_rate": 0.00014487097696052784,
"loss": 4.6607,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48300
},
{
"epoch": 3.5227781926811055,
"grad_norm": 1.3984375,
"learning_rate": 0.00014476865323546017,
"loss": 4.6601,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48350
},
{
"epoch": 3.5264212462886393,
"grad_norm": 1.3671875,
"learning_rate": 0.00014466627085396485,
"loss": 4.6596,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48400
},
{
"epoch": 3.530064299896173,
"grad_norm": 2.015625,
"learning_rate": 0.00014456382995018448,
"loss": 4.6696,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48450
},
{
"epoch": 3.533707353503707,
"grad_norm": 1.4765625,
"learning_rate": 0.0001444613306583384,
"loss": 4.6711,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48500
},
{
"epoch": 3.5373504071112407,
"grad_norm": 1.4453125,
"learning_rate": 0.00014435877311272234,
"loss": 4.6667,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48550
},
{
"epoch": 3.5409934607187745,
"grad_norm": 2.390625,
"learning_rate": 0.0001442561574477085,
"loss": 4.6584,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48600
},
{
"epoch": 3.5446365143263083,
"grad_norm": 1.21875,
"learning_rate": 0.00014415348379774514,
"loss": 4.6599,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48650
},
{
"epoch": 3.548279567933842,
"grad_norm": 1.6640625,
"learning_rate": 0.0001440507522973565,
"loss": 4.6766,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48700
},
{
"epoch": 3.551922621541376,
"grad_norm": 2.078125,
"learning_rate": 0.00014394796308114262,
"loss": 4.6769,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48750
},
{
"epoch": 3.5555656751489098,
"grad_norm": 1.3515625,
"learning_rate": 0.00014384511628377918,
"loss": 4.668,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48800
},
{
"epoch": 3.5592087287564436,
"grad_norm": 1.421875,
"learning_rate": 0.00014374221204001728,
"loss": 4.6725,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48850
},
{
"epoch": 3.5628517823639774,
"grad_norm": 1.3046875,
"learning_rate": 0.00014363925048468335,
"loss": 4.6722,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48900
},
{
"epoch": 3.566494835971511,
"grad_norm": 1.078125,
"learning_rate": 0.00014353623175267875,
"loss": 4.6667,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 48950
},
{
"epoch": 3.570137889579045,
"grad_norm": 1.78125,
"learning_rate": 0.00014343315597897997,
"loss": 4.6764,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49000
},
{
"epoch": 3.573780943186579,
"grad_norm": 1.6953125,
"learning_rate": 0.00014333002329863808,
"loss": 4.6455,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49050
},
{
"epoch": 3.5774239967941126,
"grad_norm": 2.53125,
"learning_rate": 0.00014322683384677875,
"loss": 4.6727,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49100
},
{
"epoch": 3.5810670504016464,
"grad_norm": 2.03125,
"learning_rate": 0.00014312358775860203,
"loss": 4.6595,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49150
},
{
"epoch": 3.58471010400918,
"grad_norm": 1.4375,
"learning_rate": 0.00014302028516938224,
"loss": 4.6654,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49200
},
{
"epoch": 3.5883531576167145,
"grad_norm": 1.5546875,
"learning_rate": 0.00014291692621446763,
"loss": 4.6733,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49250
},
{
"epoch": 3.5919962112242483,
"grad_norm": 1.75,
"learning_rate": 0.00014281351102928032,
"loss": 4.6648,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49300
},
{
"epoch": 3.595639264831782,
"grad_norm": 1.75,
"learning_rate": 0.00014271003974931622,
"loss": 4.6548,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49350
},
{
"epoch": 3.599282318439316,
"grad_norm": 1.953125,
"learning_rate": 0.00014260651251014458,
"loss": 4.6738,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49400
},
{
"epoch": 3.6029253720468497,
"grad_norm": 2.125,
"learning_rate": 0.000142502929447408,
"loss": 4.6686,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49450
},
{
"epoch": 3.6065684256543835,
"grad_norm": 1.5,
"learning_rate": 0.00014239929069682233,
"loss": 4.66,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49500
},
{
"epoch": 3.6102114792619173,
"grad_norm": 1.8984375,
"learning_rate": 0.00014229559639417628,
"loss": 4.6783,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49550
},
{
"epoch": 3.613854532869451,
"grad_norm": 1.15625,
"learning_rate": 0.00014219184667533136,
"loss": 4.6547,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49600
},
{
"epoch": 3.617497586476985,
"grad_norm": 1.328125,
"learning_rate": 0.0001420880416762217,
"loss": 4.6647,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49650
},
{
"epoch": 3.6211406400845187,
"grad_norm": 1.8203125,
"learning_rate": 0.00014198418153285385,
"loss": 4.6716,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49700
},
{
"epoch": 3.6247836936920526,
"grad_norm": 1.5390625,
"learning_rate": 0.00014188026638130668,
"loss": 4.6764,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49750
},
{
"epoch": 3.6284267472995864,
"grad_norm": 1.9765625,
"learning_rate": 0.00014177629635773098,
"loss": 4.6583,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49800
},
{
"epoch": 3.6320698009071206,
"grad_norm": 1.828125,
"learning_rate": 0.00014167227159834955,
"loss": 4.6629,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49850
},
{
"epoch": 3.6357128545146544,
"grad_norm": 1.5,
"learning_rate": 0.00014156819223945695,
"loss": 4.6711,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49900
},
{
"epoch": 3.6393559081221882,
"grad_norm": 1.71875,
"learning_rate": 0.00014146405841741907,
"loss": 4.6629,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 49950
},
{
"epoch": 3.642998961729722,
"grad_norm": 2.0625,
"learning_rate": 0.00014135987026867345,
"loss": 4.6513,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50000
},
{
"epoch": 3.646642015337256,
"grad_norm": 2.3125,
"learning_rate": 0.00014125562792972857,
"loss": 4.6667,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50050
},
{
"epoch": 3.6502850689447897,
"grad_norm": 1.5078125,
"learning_rate": 0.00014115133153716402,
"loss": 4.67,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50100
},
{
"epoch": 3.6539281225523235,
"grad_norm": 1.171875,
"learning_rate": 0.00014104698122763013,
"loss": 4.6534,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50150
},
{
"epoch": 3.6575711761598573,
"grad_norm": 1.34375,
"learning_rate": 0.00014094257713784803,
"loss": 4.6594,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50200
},
{
"epoch": 3.661214229767391,
"grad_norm": 2.421875,
"learning_rate": 0.00014083811940460915,
"loss": 4.665,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50250
},
{
"epoch": 3.664857283374925,
"grad_norm": 1.5078125,
"learning_rate": 0.00014073360816477528,
"loss": 4.6714,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50300
},
{
"epoch": 3.6685003369824587,
"grad_norm": 1.90625,
"learning_rate": 0.0001406290435552783,
"loss": 4.6579,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50350
},
{
"epoch": 3.6721433905899925,
"grad_norm": 2.265625,
"learning_rate": 0.00014052442571312,
"loss": 4.6711,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50400
},
{
"epoch": 3.6757864441975263,
"grad_norm": 1.6953125,
"learning_rate": 0.00014041975477537198,
"loss": 4.6767,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50450
},
{
"epoch": 3.67942949780506,
"grad_norm": 2.421875,
"learning_rate": 0.00014031503087917532,
"loss": 4.6554,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50500
},
{
"epoch": 3.683072551412594,
"grad_norm": 1.890625,
"learning_rate": 0.00014021025416174055,
"loss": 4.668,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50550
},
{
"epoch": 3.6867156050201277,
"grad_norm": 1.8984375,
"learning_rate": 0.00014010542476034735,
"loss": 4.6641,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50600
},
{
"epoch": 3.6903586586276615,
"grad_norm": 2.015625,
"learning_rate": 0.00014000054281234454,
"loss": 4.6771,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50650
},
{
"epoch": 3.6940017122351954,
"grad_norm": 2.125,
"learning_rate": 0.00013989560845514964,
"loss": 4.6664,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50700
},
{
"epoch": 3.697644765842729,
"grad_norm": 1.515625,
"learning_rate": 0.00013979062182624893,
"loss": 4.6692,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50750
},
{
"epoch": 3.701287819450263,
"grad_norm": 1.7578125,
"learning_rate": 0.00013968558306319716,
"loss": 4.6558,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50800
},
{
"epoch": 3.704930873057797,
"grad_norm": 1.328125,
"learning_rate": 0.00013958049230361732,
"loss": 4.6734,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50850
},
{
"epoch": 3.708573926665331,
"grad_norm": 2.40625,
"learning_rate": 0.00013947534968520065,
"loss": 4.676,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50900
},
{
"epoch": 3.712216980272865,
"grad_norm": 1.7265625,
"learning_rate": 0.00013937015534570629,
"loss": 4.6575,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 50950
},
{
"epoch": 3.7158600338803986,
"grad_norm": 1.4453125,
"learning_rate": 0.0001392649094229611,
"loss": 4.6795,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51000
},
{
"epoch": 3.7195030874879325,
"grad_norm": 1.6015625,
"learning_rate": 0.0001391596120548595,
"loss": 4.6737,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51050
},
{
"epoch": 3.7231461410954663,
"grad_norm": 1.5546875,
"learning_rate": 0.00013905426337936346,
"loss": 4.6628,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51100
},
{
"epoch": 3.726789194703,
"grad_norm": 1.453125,
"learning_rate": 0.00013894886353450203,
"loss": 4.6703,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51150
},
{
"epoch": 3.730432248310534,
"grad_norm": 1.3046875,
"learning_rate": 0.00013884341265837135,
"loss": 4.678,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51200
},
{
"epoch": 3.7340753019180677,
"grad_norm": 1.515625,
"learning_rate": 0.00013873791088913446,
"loss": 4.6771,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51250
},
{
"epoch": 3.7377183555256015,
"grad_norm": 2.078125,
"learning_rate": 0.000138632358365021,
"loss": 4.6677,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51300
},
{
"epoch": 3.7413614091331353,
"grad_norm": 1.5546875,
"learning_rate": 0.00013852675522432718,
"loss": 4.6697,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51350
},
{
"epoch": 3.745004462740669,
"grad_norm": 1.7890625,
"learning_rate": 0.0001384211016054155,
"loss": 4.6837,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51400
},
{
"epoch": 3.748647516348203,
"grad_norm": 2.765625,
"learning_rate": 0.00013831539764671465,
"loss": 4.6769,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51450
},
{
"epoch": 3.752290569955737,
"grad_norm": 1.90625,
"learning_rate": 0.00013820964348671915,
"loss": 4.6714,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51500
},
{
"epoch": 3.755933623563271,
"grad_norm": 1.625,
"learning_rate": 0.0001381038392639894,
"loss": 4.6822,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51550
},
{
"epoch": 3.759576677170805,
"grad_norm": 2.609375,
"learning_rate": 0.00013799798511715137,
"loss": 4.665,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51600
},
{
"epoch": 3.7632197307783386,
"grad_norm": 2.125,
"learning_rate": 0.00013789208118489648,
"loss": 4.6527,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51650
},
{
"epoch": 3.7668627843858724,
"grad_norm": 1.6796875,
"learning_rate": 0.00013778612760598124,
"loss": 4.667,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51700
},
{
"epoch": 3.770505837993406,
"grad_norm": 1.515625,
"learning_rate": 0.00013768012451922736,
"loss": 4.6622,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51750
},
{
"epoch": 3.77414889160094,
"grad_norm": 2.53125,
"learning_rate": 0.00013757407206352136,
"loss": 4.6693,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51800
},
{
"epoch": 3.777791945208474,
"grad_norm": 1.921875,
"learning_rate": 0.00013746797037781448,
"loss": 4.6799,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51850
},
{
"epoch": 3.7814349988160076,
"grad_norm": 3.21875,
"learning_rate": 0.00013736181960112234,
"loss": 4.6782,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51900
},
{
"epoch": 3.7850780524235415,
"grad_norm": 1.7265625,
"learning_rate": 0.00013725561987252497,
"loss": 4.6676,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 51950
},
{
"epoch": 3.7887211060310753,
"grad_norm": 1.3125,
"learning_rate": 0.0001371493713311666,
"loss": 4.6522,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52000
},
{
"epoch": 3.792364159638609,
"grad_norm": 2.265625,
"learning_rate": 0.0001370430741162553,
"loss": 4.6524,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52050
},
{
"epoch": 3.796007213246143,
"grad_norm": 1.828125,
"learning_rate": 0.000136936728367063,
"loss": 4.6601,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52100
},
{
"epoch": 3.7996502668536767,
"grad_norm": 2.921875,
"learning_rate": 0.0001368303342229251,
"loss": 4.6668,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52150
},
{
"epoch": 3.8032933204612105,
"grad_norm": 1.9140625,
"learning_rate": 0.00013672389182324058,
"loss": 4.672,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52200
},
{
"epoch": 3.8069363740687443,
"grad_norm": 1.859375,
"learning_rate": 0.00013661740130747145,
"loss": 4.6639,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52250
},
{
"epoch": 3.810579427676278,
"grad_norm": 1.5078125,
"learning_rate": 0.000136510862815143,
"loss": 4.6658,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52300
},
{
"epoch": 3.814222481283812,
"grad_norm": 2.4375,
"learning_rate": 0.00013640427648584308,
"loss": 4.6641,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52350
},
{
"epoch": 3.8178655348913457,
"grad_norm": 1.53125,
"learning_rate": 0.00013629764245922253,
"loss": 4.6706,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52400
},
{
"epoch": 3.8215085884988795,
"grad_norm": 1.21875,
"learning_rate": 0.0001361909608749944,
"loss": 4.6476,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52450
},
{
"epoch": 3.8251516421064133,
"grad_norm": 1.390625,
"learning_rate": 0.0001360842318729343,
"loss": 4.6566,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52500
},
{
"epoch": 3.8287946957139476,
"grad_norm": 1.3984375,
"learning_rate": 0.0001359774555928798,
"loss": 4.6454,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52550
},
{
"epoch": 3.8324377493214814,
"grad_norm": 2.25,
"learning_rate": 0.00013587063217473046,
"loss": 4.6705,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52600
},
{
"epoch": 3.836080802929015,
"grad_norm": 1.3515625,
"learning_rate": 0.00013576376175844764,
"loss": 4.6695,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52650
},
{
"epoch": 3.839723856536549,
"grad_norm": 1.7265625,
"learning_rate": 0.00013565684448405417,
"loss": 4.6508,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52700
},
{
"epoch": 3.843366910144083,
"grad_norm": 1.4609375,
"learning_rate": 0.00013554988049163444,
"loss": 4.6717,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52750
},
{
"epoch": 3.8470099637516166,
"grad_norm": 2.0625,
"learning_rate": 0.0001354428699213339,
"loss": 4.663,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52800
},
{
"epoch": 3.8506530173591504,
"grad_norm": 1.4609375,
"learning_rate": 0.00013533581291335912,
"loss": 4.6642,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52850
},
{
"epoch": 3.8542960709666843,
"grad_norm": 3.03125,
"learning_rate": 0.00013522870960797743,
"loss": 4.6733,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52900
},
{
"epoch": 3.857939124574218,
"grad_norm": 2.328125,
"learning_rate": 0.00013512156014551692,
"loss": 4.6938,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 52950
},
{
"epoch": 3.861582178181752,
"grad_norm": 2.375,
"learning_rate": 0.00013501436466636607,
"loss": 4.6695,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53000
},
{
"epoch": 3.8652252317892857,
"grad_norm": 2.515625,
"learning_rate": 0.00013490712331097374,
"loss": 4.6698,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53050
},
{
"epoch": 3.8688682853968195,
"grad_norm": 1.9375,
"learning_rate": 0.00013479983621984877,
"loss": 4.6667,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53100
},
{
"epoch": 3.8725113390043533,
"grad_norm": 1.71875,
"learning_rate": 0.00013469250353356,
"loss": 4.6726,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53150
},
{
"epoch": 3.8761543926118875,
"grad_norm": 1.2265625,
"learning_rate": 0.0001345851253927361,
"loss": 4.6674,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53200
},
{
"epoch": 3.8797974462194214,
"grad_norm": 1.5390625,
"learning_rate": 0.00013447770193806508,
"loss": 4.6709,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53250
},
{
"epoch": 3.883440499826955,
"grad_norm": 2.21875,
"learning_rate": 0.00013437023331029454,
"loss": 4.676,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53300
},
{
"epoch": 3.887083553434489,
"grad_norm": 2.21875,
"learning_rate": 0.00013426271965023107,
"loss": 4.6524,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53350
},
{
"epoch": 3.8907266070420228,
"grad_norm": 2.0625,
"learning_rate": 0.00013415516109874047,
"loss": 4.6691,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53400
},
{
"epoch": 3.8943696606495566,
"grad_norm": 1.3359375,
"learning_rate": 0.00013404755779674715,
"loss": 4.6632,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53450
},
{
"epoch": 3.8980127142570904,
"grad_norm": 1.8515625,
"learning_rate": 0.00013393990988523434,
"loss": 4.6769,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53500
},
{
"epoch": 3.901655767864624,
"grad_norm": 1.6328125,
"learning_rate": 0.00013383221750524354,
"loss": 4.6693,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53550
},
{
"epoch": 3.905298821472158,
"grad_norm": 1.359375,
"learning_rate": 0.00013372448079787465,
"loss": 4.674,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53600
},
{
"epoch": 3.908941875079692,
"grad_norm": 2.515625,
"learning_rate": 0.0001336166999042856,
"loss": 4.6718,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53650
},
{
"epoch": 3.9125849286872256,
"grad_norm": 1.515625,
"learning_rate": 0.00013350887496569217,
"loss": 4.6714,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53700
},
{
"epoch": 3.9162279822947594,
"grad_norm": 2.359375,
"learning_rate": 0.00013340100612336799,
"loss": 4.6874,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53750
},
{
"epoch": 3.9198710359022932,
"grad_norm": 1.7265625,
"learning_rate": 0.00013329309351864396,
"loss": 4.6584,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53800
},
{
"epoch": 3.923514089509827,
"grad_norm": 2.171875,
"learning_rate": 0.00013318513729290862,
"loss": 4.6773,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53850
},
{
"epoch": 3.927157143117361,
"grad_norm": 1.5390625,
"learning_rate": 0.00013307713758760746,
"loss": 4.6655,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53900
},
{
"epoch": 3.9308001967248947,
"grad_norm": 2.5625,
"learning_rate": 0.00013296909454424297,
"loss": 4.6706,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 53950
},
{
"epoch": 3.9344432503324285,
"grad_norm": 1.8125,
"learning_rate": 0.00013286100830437445,
"loss": 4.6729,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54000
},
{
"epoch": 3.9380863039399623,
"grad_norm": 1.609375,
"learning_rate": 0.0001327528790096178,
"loss": 4.6654,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54050
},
{
"epoch": 3.941729357547496,
"grad_norm": 1.5859375,
"learning_rate": 0.00013264470680164533,
"loss": 4.6632,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54100
},
{
"epoch": 3.94537241115503,
"grad_norm": 1.234375,
"learning_rate": 0.00013253649182218556,
"loss": 4.6643,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54150
},
{
"epoch": 3.9490154647625637,
"grad_norm": 1.9140625,
"learning_rate": 0.00013242823421302308,
"loss": 4.6606,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54200
},
{
"epoch": 3.952658518370098,
"grad_norm": 1.78125,
"learning_rate": 0.00013231993411599828,
"loss": 4.6812,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54250
},
{
"epoch": 3.9563015719776318,
"grad_norm": 2.5,
"learning_rate": 0.00013221159167300725,
"loss": 4.6707,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54300
},
{
"epoch": 3.9599446255851656,
"grad_norm": 1.625,
"learning_rate": 0.00013210320702600157,
"loss": 4.6572,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54350
},
{
"epoch": 3.9635876791926994,
"grad_norm": 1.3359375,
"learning_rate": 0.0001319947803169881,
"loss": 4.6683,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54400
},
{
"epoch": 3.967230732800233,
"grad_norm": 1.1953125,
"learning_rate": 0.00013188631168802883,
"loss": 4.6647,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54450
},
{
"epoch": 3.970873786407767,
"grad_norm": 1.546875,
"learning_rate": 0.00013177780128124065,
"loss": 4.6754,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54500
},
{
"epoch": 3.974516840015301,
"grad_norm": 1.5390625,
"learning_rate": 0.00013166924923879521,
"loss": 4.6546,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54550
},
{
"epoch": 3.9781598936228346,
"grad_norm": 1.3515625,
"learning_rate": 0.0001315606557029187,
"loss": 4.6696,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54600
},
{
"epoch": 3.9818029472303684,
"grad_norm": 2.09375,
"learning_rate": 0.00013145202081589168,
"loss": 4.6634,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54650
},
{
"epoch": 3.9854460008379022,
"grad_norm": 2.171875,
"learning_rate": 0.00013134334472004886,
"loss": 4.6656,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54700
},
{
"epoch": 3.989089054445436,
"grad_norm": 1.5703125,
"learning_rate": 0.00013123462755777897,
"loss": 4.6588,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54750
},
{
"epoch": 3.99273210805297,
"grad_norm": 2.265625,
"learning_rate": 0.0001311258694715246,
"loss": 4.6771,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54800
},
{
"epoch": 3.996375161660504,
"grad_norm": 2.5625,
"learning_rate": 0.00013101707060378186,
"loss": 4.6592,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54850
},
{
"epoch": 3.999726770979435,
"eval_loss": 4.676952838897705,
"eval_runtime": 579.4088,
"eval_samples_per_second": 523.513,
"eval_steps_per_second": 43.627,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54896
},
{
"epoch": 4.0,
"grad_norm": 1.4921875,
"learning_rate": 0.00013090823109710028,
"loss": 4.6635,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54900
},
{
"epoch": 4.003643053607534,
"grad_norm": 1.53125,
"learning_rate": 0.00013079935109408278,
"loss": 4.6468,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 54950
},
{
"epoch": 4.007286107215068,
"grad_norm": 1.5546875,
"learning_rate": 0.0001306904307373852,
"loss": 4.6623,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55000
},
{
"epoch": 4.010929160822601,
"grad_norm": 1.484375,
"learning_rate": 0.00013058147016971637,
"loss": 4.6599,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55050
},
{
"epoch": 4.014572214430135,
"grad_norm": 1.578125,
"learning_rate": 0.00013047246953383764,
"loss": 4.6579,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55100
},
{
"epoch": 4.018215268037669,
"grad_norm": 1.8046875,
"learning_rate": 0.00013036342897256297,
"loss": 4.6526,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55150
},
{
"epoch": 4.021858321645203,
"grad_norm": 1.546875,
"learning_rate": 0.00013025434862875865,
"loss": 4.6709,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55200
},
{
"epoch": 4.025501375252737,
"grad_norm": 1.6171875,
"learning_rate": 0.00013014522864534308,
"loss": 4.6681,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55250
},
{
"epoch": 4.0291444288602705,
"grad_norm": 1.484375,
"learning_rate": 0.0001300360691652865,
"loss": 4.6531,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55300
},
{
"epoch": 4.032787482467804,
"grad_norm": 1.734375,
"learning_rate": 0.00012992687033161102,
"loss": 4.67,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55350
},
{
"epoch": 4.036430536075338,
"grad_norm": 1.6015625,
"learning_rate": 0.00012981763228739029,
"loss": 4.6491,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55400
},
{
"epoch": 4.040073589682872,
"grad_norm": 1.2265625,
"learning_rate": 0.0001297083551757492,
"loss": 4.6525,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55450
},
{
"epoch": 4.043716643290406,
"grad_norm": 1.8125,
"learning_rate": 0.00012959903913986408,
"loss": 4.6554,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55500
},
{
"epoch": 4.0473596968979395,
"grad_norm": 1.3359375,
"learning_rate": 0.000129489684322962,
"loss": 4.6561,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55550
},
{
"epoch": 4.051002750505473,
"grad_norm": 2.140625,
"learning_rate": 0.00012938029086832106,
"loss": 4.6693,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55600
},
{
"epoch": 4.054645804113007,
"grad_norm": 1.75,
"learning_rate": 0.00012927085891926976,
"loss": 4.6559,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55650
},
{
"epoch": 4.058288857720541,
"grad_norm": 1.2578125,
"learning_rate": 0.00012916138861918726,
"loss": 4.6572,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55700
},
{
"epoch": 4.061931911328076,
"grad_norm": 1.3359375,
"learning_rate": 0.0001290518801115028,
"loss": 4.6476,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55750
},
{
"epoch": 4.065574964935609,
"grad_norm": 1.6875,
"learning_rate": 0.00012894233353969575,
"loss": 4.6452,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55800
},
{
"epoch": 4.069218018543143,
"grad_norm": 1.9765625,
"learning_rate": 0.00012883274904729538,
"loss": 4.6708,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55850
},
{
"epoch": 4.072861072150677,
"grad_norm": 2.28125,
"learning_rate": 0.00012872312677788056,
"loss": 4.6679,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55900
},
{
"epoch": 4.076504125758211,
"grad_norm": 1.671875,
"learning_rate": 0.00012861346687507974,
"loss": 4.6709,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 55950
},
{
"epoch": 4.080147179365745,
"grad_norm": 2.34375,
"learning_rate": 0.00012850376948257068,
"loss": 4.658,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56000
},
{
"epoch": 4.0837902329732785,
"grad_norm": 2.078125,
"learning_rate": 0.00012839403474408016,
"loss": 4.6636,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56050
},
{
"epoch": 4.087433286580812,
"grad_norm": 2.234375,
"learning_rate": 0.00012828426280338392,
"loss": 4.6613,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56100
},
{
"epoch": 4.091076340188346,
"grad_norm": 2.03125,
"learning_rate": 0.0001281744538043066,
"loss": 4.646,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56150
},
{
"epoch": 4.09471939379588,
"grad_norm": 1.734375,
"learning_rate": 0.00012806460789072116,
"loss": 4.6497,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56200
},
{
"epoch": 4.098362447403414,
"grad_norm": 1.125,
"learning_rate": 0.00012795472520654908,
"loss": 4.6718,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56250
},
{
"epoch": 4.1020055010109475,
"grad_norm": 3.328125,
"learning_rate": 0.00012784480589575996,
"loss": 4.6702,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56300
},
{
"epoch": 4.105648554618481,
"grad_norm": 1.734375,
"learning_rate": 0.0001277348501023714,
"loss": 4.6594,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56350
},
{
"epoch": 4.109291608226015,
"grad_norm": 1.4140625,
"learning_rate": 0.00012762485797044882,
"loss": 4.6645,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56400
},
{
"epoch": 4.112934661833549,
"grad_norm": 1.6953125,
"learning_rate": 0.00012751482964410525,
"loss": 4.6562,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56450
},
{
"epoch": 4.116577715441083,
"grad_norm": 1.546875,
"learning_rate": 0.0001274047652675011,
"loss": 4.6582,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56500
},
{
"epoch": 4.120220769048617,
"grad_norm": 1.2109375,
"learning_rate": 0.000127294664984844,
"loss": 4.6714,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56550
},
{
"epoch": 4.12386382265615,
"grad_norm": 1.6328125,
"learning_rate": 0.00012718452894038874,
"loss": 4.6503,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56600
},
{
"epoch": 4.127506876263684,
"grad_norm": 1.46875,
"learning_rate": 0.00012707435727843687,
"loss": 4.6677,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56650
},
{
"epoch": 4.131149929871218,
"grad_norm": 1.703125,
"learning_rate": 0.0001269641501433366,
"loss": 4.666,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56700
},
{
"epoch": 4.134792983478752,
"grad_norm": 1.390625,
"learning_rate": 0.00012685390767948267,
"loss": 4.6699,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56750
},
{
"epoch": 4.138436037086286,
"grad_norm": 1.7421875,
"learning_rate": 0.000126743630031316,
"loss": 4.6572,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56800
},
{
"epoch": 4.142079090693819,
"grad_norm": 1.5234375,
"learning_rate": 0.0001266333173433238,
"loss": 4.6519,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56850
},
{
"epoch": 4.145722144301353,
"grad_norm": 2.59375,
"learning_rate": 0.00012652296976003907,
"loss": 4.6755,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56900
},
{
"epoch": 4.149365197908887,
"grad_norm": 3.4375,
"learning_rate": 0.0001264125874260405,
"loss": 4.6704,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 56950
},
{
"epoch": 4.153008251516421,
"grad_norm": 1.8359375,
"learning_rate": 0.00012630217048595233,
"loss": 4.6562,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57000
},
{
"epoch": 4.156651305123955,
"grad_norm": 1.4765625,
"learning_rate": 0.00012619171908444417,
"loss": 4.6538,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57050
},
{
"epoch": 4.1602943587314885,
"grad_norm": 1.4921875,
"learning_rate": 0.00012608123336623083,
"loss": 4.6728,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57100
},
{
"epoch": 4.163937412339022,
"grad_norm": 1.375,
"learning_rate": 0.000125970713476072,
"loss": 4.6743,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57150
},
{
"epoch": 4.167580465946556,
"grad_norm": 1.8984375,
"learning_rate": 0.00012586015955877214,
"loss": 4.6516,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57200
},
{
"epoch": 4.17122351955409,
"grad_norm": 1.1953125,
"learning_rate": 0.00012574957175918032,
"loss": 4.6724,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57250
},
{
"epoch": 4.174866573161624,
"grad_norm": 1.828125,
"learning_rate": 0.00012563895022219004,
"loss": 4.6667,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57300
},
{
"epoch": 4.1785096267691575,
"grad_norm": 1.359375,
"learning_rate": 0.00012552829509273898,
"loss": 4.6346,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57350
},
{
"epoch": 4.182152680376692,
"grad_norm": 1.65625,
"learning_rate": 0.00012541760651580875,
"loss": 4.6762,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57400
},
{
"epoch": 4.185795733984226,
"grad_norm": 1.984375,
"learning_rate": 0.00012530688463642493,
"loss": 4.664,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57450
},
{
"epoch": 4.18943878759176,
"grad_norm": 2.640625,
"learning_rate": 0.0001251961295996566,
"loss": 4.6646,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57500
},
{
"epoch": 4.193081841199294,
"grad_norm": 1.4453125,
"learning_rate": 0.0001250853415506164,
"loss": 4.6602,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57550
},
{
"epoch": 4.196724894806827,
"grad_norm": 1.4609375,
"learning_rate": 0.00012497452063446013,
"loss": 4.6755,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57600
},
{
"epoch": 4.200367948414361,
"grad_norm": 2.453125,
"learning_rate": 0.00012486366699638666,
"loss": 4.6727,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57650
},
{
"epoch": 4.204011002021895,
"grad_norm": 1.6328125,
"learning_rate": 0.0001247527807816378,
"loss": 4.6672,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57700
},
{
"epoch": 4.207654055629429,
"grad_norm": 1.453125,
"learning_rate": 0.000124641862135498,
"loss": 4.6627,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57750
},
{
"epoch": 4.211297109236963,
"grad_norm": 1.640625,
"learning_rate": 0.00012453091120329416,
"loss": 4.6823,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57800
},
{
"epoch": 4.2149401628444965,
"grad_norm": 1.578125,
"learning_rate": 0.00012441992813039555,
"loss": 4.6582,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57850
},
{
"epoch": 4.21858321645203,
"grad_norm": 1.890625,
"learning_rate": 0.00012430891306221354,
"loss": 4.6422,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57900
},
{
"epoch": 4.222226270059564,
"grad_norm": 1.5703125,
"learning_rate": 0.0001241978661442014,
"loss": 4.6743,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 57950
},
{
"epoch": 4.225869323667098,
"grad_norm": 1.796875,
"learning_rate": 0.00012408678752185418,
"loss": 4.6765,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58000
},
{
"epoch": 4.229512377274632,
"grad_norm": 1.484375,
"learning_rate": 0.00012397567734070836,
"loss": 4.6528,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58050
},
{
"epoch": 4.2331554308821655,
"grad_norm": 1.7265625,
"learning_rate": 0.00012386453574634183,
"loss": 4.6667,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58100
},
{
"epoch": 4.236798484489699,
"grad_norm": 1.3046875,
"learning_rate": 0.00012375336288437372,
"loss": 4.6534,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58150
},
{
"epoch": 4.240441538097233,
"grad_norm": 1.328125,
"learning_rate": 0.00012364215890046395,
"loss": 4.6772,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58200
},
{
"epoch": 4.244084591704767,
"grad_norm": 1.53125,
"learning_rate": 0.00012353092394031337,
"loss": 4.6779,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58250
},
{
"epoch": 4.247727645312301,
"grad_norm": 1.6171875,
"learning_rate": 0.0001234196581496634,
"loss": 4.6523,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58300
},
{
"epoch": 4.2513706989198345,
"grad_norm": 1.65625,
"learning_rate": 0.00012330836167429577,
"loss": 4.6743,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58350
},
{
"epoch": 4.255013752527368,
"grad_norm": 2.0625,
"learning_rate": 0.0001231970346600324,
"loss": 4.664,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58400
},
{
"epoch": 4.258656806134902,
"grad_norm": 1.2109375,
"learning_rate": 0.00012308567725273544,
"loss": 4.6635,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58450
},
{
"epoch": 4.262299859742436,
"grad_norm": 2.921875,
"learning_rate": 0.00012297428959830655,
"loss": 4.6636,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58500
},
{
"epoch": 4.26594291334997,
"grad_norm": 1.4921875,
"learning_rate": 0.00012286287184268727,
"loss": 4.6659,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58550
},
{
"epoch": 4.269585966957504,
"grad_norm": 1.8203125,
"learning_rate": 0.00012275142413185842,
"loss": 4.6752,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58600
},
{
"epoch": 4.273229020565037,
"grad_norm": 1.28125,
"learning_rate": 0.00012263994661184017,
"loss": 4.6608,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58650
},
{
"epoch": 4.276872074172571,
"grad_norm": 1.7890625,
"learning_rate": 0.00012252843942869173,
"loss": 4.6629,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58700
},
{
"epoch": 4.280515127780105,
"grad_norm": 2.125,
"learning_rate": 0.00012241690272851109,
"loss": 4.6469,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58750
},
{
"epoch": 4.284158181387639,
"grad_norm": 1.21875,
"learning_rate": 0.000122305336657435,
"loss": 4.6554,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58800
},
{
"epoch": 4.287801234995173,
"grad_norm": 1.4375,
"learning_rate": 0.00012219374136163865,
"loss": 4.6569,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58850
},
{
"epoch": 4.291444288602706,
"grad_norm": 2.53125,
"learning_rate": 0.0001220821169873356,
"loss": 4.6552,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58900
},
{
"epoch": 4.29508734221024,
"grad_norm": 2.921875,
"learning_rate": 0.00012197046368077738,
"loss": 4.6664,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 58950
},
{
"epoch": 4.298730395817774,
"grad_norm": 1.8828125,
"learning_rate": 0.00012185878158825356,
"loss": 4.6696,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59000
},
{
"epoch": 4.302373449425309,
"grad_norm": 1.546875,
"learning_rate": 0.0001217470708560913,
"loss": 4.6672,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59050
},
{
"epoch": 4.306016503032842,
"grad_norm": 2.015625,
"learning_rate": 0.00012163533163065535,
"loss": 4.6674,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59100
},
{
"epoch": 4.309659556640376,
"grad_norm": 1.890625,
"learning_rate": 0.00012152356405834782,
"loss": 4.6471,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59150
},
{
"epoch": 4.31330261024791,
"grad_norm": 1.7890625,
"learning_rate": 0.00012141176828560787,
"loss": 4.6535,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59200
},
{
"epoch": 4.316945663855444,
"grad_norm": 1.5625,
"learning_rate": 0.00012129994445891177,
"loss": 4.6649,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59250
},
{
"epoch": 4.320588717462978,
"grad_norm": 1.3515625,
"learning_rate": 0.00012118809272477231,
"loss": 4.6748,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59300
},
{
"epoch": 4.324231771070512,
"grad_norm": 1.3359375,
"learning_rate": 0.0001210762132297391,
"loss": 4.671,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59350
},
{
"epoch": 4.327874824678045,
"grad_norm": 1.765625,
"learning_rate": 0.00012096430612039791,
"loss": 4.6624,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59400
},
{
"epoch": 4.331517878285579,
"grad_norm": 2.59375,
"learning_rate": 0.00012085237154337083,
"loss": 4.6709,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59450
},
{
"epoch": 4.335160931893113,
"grad_norm": 1.984375,
"learning_rate": 0.00012074040964531588,
"loss": 4.6755,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59500
},
{
"epoch": 4.338803985500647,
"grad_norm": 1.609375,
"learning_rate": 0.00012062842057292685,
"loss": 4.6591,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59550
},
{
"epoch": 4.342447039108181,
"grad_norm": 1.4765625,
"learning_rate": 0.0001205164044729332,
"loss": 4.6643,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59600
},
{
"epoch": 4.3460900927157144,
"grad_norm": 1.6484375,
"learning_rate": 0.00012040436149209974,
"loss": 4.6569,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59650
},
{
"epoch": 4.349733146323248,
"grad_norm": 1.7734375,
"learning_rate": 0.00012029229177722663,
"loss": 4.6693,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59700
},
{
"epoch": 4.353376199930782,
"grad_norm": 2.0625,
"learning_rate": 0.00012018019547514882,
"loss": 4.6816,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59750
},
{
"epoch": 4.357019253538316,
"grad_norm": 1.65625,
"learning_rate": 0.0001200680727327363,
"loss": 4.6742,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59800
},
{
"epoch": 4.36066230714585,
"grad_norm": 2.140625,
"learning_rate": 0.00011995592369689366,
"loss": 4.6591,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59850
},
{
"epoch": 4.3643053607533835,
"grad_norm": 1.6953125,
"learning_rate": 0.00011984374851455989,
"loss": 4.6604,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59900
},
{
"epoch": 4.367948414360917,
"grad_norm": 1.3671875,
"learning_rate": 0.00011973154733270823,
"loss": 4.6667,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 59950
},
{
"epoch": 4.371591467968451,
"grad_norm": 2.03125,
"learning_rate": 0.00011961932029834605,
"loss": 4.6594,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60000
},
{
"epoch": 4.375234521575985,
"grad_norm": 2.03125,
"learning_rate": 0.00011950706755851458,
"loss": 4.6734,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60050
},
{
"epoch": 4.378877575183519,
"grad_norm": 3.015625,
"learning_rate": 0.00011939478926028869,
"loss": 4.6582,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60100
},
{
"epoch": 4.3825206287910525,
"grad_norm": 3.078125,
"learning_rate": 0.00011928248555077672,
"loss": 4.6486,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60150
},
{
"epoch": 4.386163682398586,
"grad_norm": 1.1953125,
"learning_rate": 0.00011917015657712038,
"loss": 4.6749,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60200
},
{
"epoch": 4.38980673600612,
"grad_norm": 1.546875,
"learning_rate": 0.00011905780248649443,
"loss": 4.6586,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60250
},
{
"epoch": 4.393449789613654,
"grad_norm": 1.9296875,
"learning_rate": 0.00011894542342610655,
"loss": 4.649,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60300
},
{
"epoch": 4.397092843221188,
"grad_norm": 1.234375,
"learning_rate": 0.00011883301954319717,
"loss": 4.6684,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60350
},
{
"epoch": 4.400735896828722,
"grad_norm": 1.234375,
"learning_rate": 0.00011872059098503916,
"loss": 4.6684,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60400
},
{
"epoch": 4.404378950436255,
"grad_norm": 1.7265625,
"learning_rate": 0.00011860813789893777,
"loss": 4.6733,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60450
},
{
"epoch": 4.408022004043789,
"grad_norm": 1.3359375,
"learning_rate": 0.00011849566043223041,
"loss": 4.6663,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60500
},
{
"epoch": 4.411665057651323,
"grad_norm": 1.40625,
"learning_rate": 0.00011838315873228642,
"loss": 4.6666,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60550
},
{
"epoch": 4.415308111258857,
"grad_norm": 1.75,
"learning_rate": 0.00011827063294650684,
"loss": 4.6562,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60600
},
{
"epoch": 4.418951164866391,
"grad_norm": 1.296875,
"learning_rate": 0.00011815808322232434,
"loss": 4.6649,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60650
},
{
"epoch": 4.422594218473925,
"grad_norm": 2.671875,
"learning_rate": 0.00011804550970720289,
"loss": 4.6632,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60700
},
{
"epoch": 4.426237272081458,
"grad_norm": 1.21875,
"learning_rate": 0.00011793291254863772,
"loss": 4.6476,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60750
},
{
"epoch": 4.429880325688993,
"grad_norm": 1.2890625,
"learning_rate": 0.0001178202918941549,
"loss": 4.6693,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60800
},
{
"epoch": 4.433523379296527,
"grad_norm": 1.4140625,
"learning_rate": 0.00011770764789131143,
"loss": 4.6654,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60850
},
{
"epoch": 4.4371664329040605,
"grad_norm": 1.40625,
"learning_rate": 0.00011759498068769481,
"loss": 4.6775,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60900
},
{
"epoch": 4.440809486511594,
"grad_norm": 1.5625,
"learning_rate": 0.00011748229043092296,
"loss": 4.6591,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 60950
},
{
"epoch": 4.444452540119128,
"grad_norm": 1.5078125,
"learning_rate": 0.00011736957726864405,
"loss": 4.669,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61000
},
{
"epoch": 4.448095593726662,
"grad_norm": 2.015625,
"learning_rate": 0.00011725684134853619,
"loss": 4.6586,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61050
},
{
"epoch": 4.451738647334196,
"grad_norm": 1.828125,
"learning_rate": 0.00011714408281830734,
"loss": 4.6681,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61100
},
{
"epoch": 4.45538170094173,
"grad_norm": 1.0390625,
"learning_rate": 0.00011703130182569502,
"loss": 4.6776,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61150
},
{
"epoch": 4.459024754549263,
"grad_norm": 2.3125,
"learning_rate": 0.00011691849851846634,
"loss": 4.6756,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61200
},
{
"epoch": 4.462667808156797,
"grad_norm": 1.4765625,
"learning_rate": 0.00011680567304441753,
"loss": 4.6723,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61250
},
{
"epoch": 4.466310861764331,
"grad_norm": 1.4375,
"learning_rate": 0.00011669282555137382,
"loss": 4.6634,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61300
},
{
"epoch": 4.469953915371865,
"grad_norm": 1.2265625,
"learning_rate": 0.00011657995618718944,
"loss": 4.661,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61350
},
{
"epoch": 4.473596968979399,
"grad_norm": 1.390625,
"learning_rate": 0.0001164670650997471,
"loss": 4.6786,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61400
},
{
"epoch": 4.477240022586932,
"grad_norm": 2.890625,
"learning_rate": 0.00011635415243695813,
"loss": 4.6606,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61450
},
{
"epoch": 4.480883076194466,
"grad_norm": 1.3125,
"learning_rate": 0.00011624121834676203,
"loss": 4.665,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61500
},
{
"epoch": 4.484526129802,
"grad_norm": 1.234375,
"learning_rate": 0.0001161282629771264,
"loss": 4.6808,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61550
},
{
"epoch": 4.488169183409534,
"grad_norm": 1.8125,
"learning_rate": 0.00011601528647604671,
"loss": 4.6619,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61600
},
{
"epoch": 4.491812237017068,
"grad_norm": 2.0,
"learning_rate": 0.00011590228899154618,
"loss": 4.6604,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61650
},
{
"epoch": 4.4954552906246015,
"grad_norm": 1.9140625,
"learning_rate": 0.00011578927067167539,
"loss": 4.6656,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61700
},
{
"epoch": 4.499098344232135,
"grad_norm": 1.4921875,
"learning_rate": 0.00011567623166451242,
"loss": 4.6652,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61750
},
{
"epoch": 4.502741397839669,
"grad_norm": 1.5703125,
"learning_rate": 0.00011556317211816223,
"loss": 4.6743,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61800
},
{
"epoch": 4.506384451447203,
"grad_norm": 1.40625,
"learning_rate": 0.00011545009218075682,
"loss": 4.6667,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61850
},
{
"epoch": 4.510027505054737,
"grad_norm": 1.390625,
"learning_rate": 0.00011533699200045492,
"loss": 4.6681,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61900
},
{
"epoch": 4.5136705586622705,
"grad_norm": 1.4765625,
"learning_rate": 0.00011522387172544169,
"loss": 4.6568,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 61950
},
{
"epoch": 4.517313612269804,
"grad_norm": 1.5234375,
"learning_rate": 0.00011511073150392875,
"loss": 4.6617,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62000
},
{
"epoch": 4.520956665877338,
"grad_norm": 1.34375,
"learning_rate": 0.00011499757148415368,
"loss": 4.6625,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62050
},
{
"epoch": 4.524599719484872,
"grad_norm": 1.09375,
"learning_rate": 0.00011488439181438022,
"loss": 4.6566,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62100
},
{
"epoch": 4.528242773092406,
"grad_norm": 1.21875,
"learning_rate": 0.00011477119264289762,
"loss": 4.6556,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62150
},
{
"epoch": 4.53188582669994,
"grad_norm": 1.6640625,
"learning_rate": 0.00011465797411802089,
"loss": 4.675,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62200
},
{
"epoch": 4.535528880307473,
"grad_norm": 1.65625,
"learning_rate": 0.00011454473638809023,
"loss": 4.6728,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62250
},
{
"epoch": 4.539171933915007,
"grad_norm": 1.671875,
"learning_rate": 0.00011443147960147108,
"loss": 4.6531,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62300
},
{
"epoch": 4.542814987522542,
"grad_norm": 1.9453125,
"learning_rate": 0.00011431820390655387,
"loss": 4.6637,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62350
},
{
"epoch": 4.546458041130075,
"grad_norm": 1.5546875,
"learning_rate": 0.00011420490945175377,
"loss": 4.6699,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62400
},
{
"epoch": 4.5501010947376095,
"grad_norm": 1.5703125,
"learning_rate": 0.00011409159638551057,
"loss": 4.6651,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62450
},
{
"epoch": 4.553744148345142,
"grad_norm": 1.296875,
"learning_rate": 0.00011397826485628835,
"loss": 4.6682,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62500
},
{
"epoch": 4.557387201952677,
"grad_norm": 1.75,
"learning_rate": 0.00011386491501257548,
"loss": 4.6717,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62550
},
{
"epoch": 4.561030255560211,
"grad_norm": 1.515625,
"learning_rate": 0.00011375154700288429,
"loss": 4.6708,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62600
},
{
"epoch": 4.564673309167745,
"grad_norm": 1.796875,
"learning_rate": 0.0001136381609757509,
"loss": 4.6694,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62650
},
{
"epoch": 4.5683163627752785,
"grad_norm": 2.125,
"learning_rate": 0.00011352475707973502,
"loss": 4.6766,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62700
},
{
"epoch": 4.571959416382812,
"grad_norm": 1.390625,
"learning_rate": 0.00011341133546341985,
"loss": 4.6582,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62750
},
{
"epoch": 4.575602469990346,
"grad_norm": 2.1875,
"learning_rate": 0.0001132978962754117,
"loss": 4.6576,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62800
},
{
"epoch": 4.57924552359788,
"grad_norm": 1.9140625,
"learning_rate": 0.00011318443966434,
"loss": 4.6498,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62850
},
{
"epoch": 4.582888577205414,
"grad_norm": 1.25,
"learning_rate": 0.00011307096577885697,
"loss": 4.6803,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62900
},
{
"epoch": 4.586531630812948,
"grad_norm": 1.328125,
"learning_rate": 0.0001129574747676374,
"loss": 4.6524,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 62950
},
{
"epoch": 4.590174684420481,
"grad_norm": 1.078125,
"learning_rate": 0.00011284396677937863,
"loss": 4.6725,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63000
},
{
"epoch": 4.593817738028015,
"grad_norm": 1.1640625,
"learning_rate": 0.00011273044196280018,
"loss": 4.6597,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63050
},
{
"epoch": 4.597460791635549,
"grad_norm": 1.328125,
"learning_rate": 0.00011261690046664365,
"loss": 4.6564,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63100
},
{
"epoch": 4.601103845243083,
"grad_norm": 1.3125,
"learning_rate": 0.00011250334243967245,
"loss": 4.6796,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63150
},
{
"epoch": 4.604746898850617,
"grad_norm": 1.078125,
"learning_rate": 0.00011238976803067165,
"loss": 4.659,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63200
},
{
"epoch": 4.60838995245815,
"grad_norm": 1.6484375,
"learning_rate": 0.00011227617738844782,
"loss": 4.6644,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63250
},
{
"epoch": 4.612033006065684,
"grad_norm": 1.796875,
"learning_rate": 0.00011216257066182883,
"loss": 4.6703,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63300
},
{
"epoch": 4.615676059673218,
"grad_norm": 1.40625,
"learning_rate": 0.00011204894799966352,
"loss": 4.6614,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63350
},
{
"epoch": 4.619319113280752,
"grad_norm": 1.8671875,
"learning_rate": 0.0001119353095508217,
"loss": 4.6609,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63400
},
{
"epoch": 4.622962166888286,
"grad_norm": 1.6015625,
"learning_rate": 0.00011182165546419381,
"loss": 4.676,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63450
},
{
"epoch": 4.6266052204958195,
"grad_norm": 1.5,
"learning_rate": 0.00011170798588869083,
"loss": 4.6682,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63500
},
{
"epoch": 4.630248274103353,
"grad_norm": 1.53125,
"learning_rate": 0.00011159430097324397,
"loss": 4.6587,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63550
},
{
"epoch": 4.633891327710887,
"grad_norm": 1.609375,
"learning_rate": 0.0001114806008668046,
"loss": 4.6622,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63600
},
{
"epoch": 4.637534381318421,
"grad_norm": 2.25,
"learning_rate": 0.00011136688571834395,
"loss": 4.6668,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63650
},
{
"epoch": 4.641177434925955,
"grad_norm": 1.71875,
"learning_rate": 0.00011125315567685299,
"loss": 4.6569,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63700
},
{
"epoch": 4.6448204885334885,
"grad_norm": 1.3984375,
"learning_rate": 0.00011113941089134219,
"loss": 4.6627,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63750
},
{
"epoch": 4.648463542141022,
"grad_norm": 1.859375,
"learning_rate": 0.0001110256515108413,
"loss": 4.6619,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63800
},
{
"epoch": 4.652106595748556,
"grad_norm": 1.546875,
"learning_rate": 0.0001109118776843993,
"loss": 4.6596,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63850
},
{
"epoch": 4.65574964935609,
"grad_norm": 1.171875,
"learning_rate": 0.00011079808956108391,
"loss": 4.6623,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63900
},
{
"epoch": 4.659392702963624,
"grad_norm": 2.0625,
"learning_rate": 0.00011068428728998182,
"loss": 4.6569,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 63950
},
{
"epoch": 4.663035756571158,
"grad_norm": 1.625,
"learning_rate": 0.00011057047102019804,
"loss": 4.6638,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64000
},
{
"epoch": 4.666678810178691,
"grad_norm": 1.3515625,
"learning_rate": 0.00011045664090085605,
"loss": 4.6685,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64050
},
{
"epoch": 4.670321863786226,
"grad_norm": 1.40625,
"learning_rate": 0.00011034279708109749,
"loss": 4.6551,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64100
},
{
"epoch": 4.673964917393759,
"grad_norm": 1.5625,
"learning_rate": 0.00011022893971008182,
"loss": 4.6731,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64150
},
{
"epoch": 4.677607971001294,
"grad_norm": 1.3359375,
"learning_rate": 0.00011011506893698638,
"loss": 4.6693,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64200
},
{
"epoch": 4.6812510246088275,
"grad_norm": 1.78125,
"learning_rate": 0.00011000118491100601,
"loss": 4.6615,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64250
},
{
"epoch": 4.684894078216361,
"grad_norm": 2.140625,
"learning_rate": 0.00010988728778135297,
"loss": 4.6585,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64300
},
{
"epoch": 4.688537131823895,
"grad_norm": 1.3046875,
"learning_rate": 0.00010977337769725658,
"loss": 4.6636,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64350
},
{
"epoch": 4.692180185431429,
"grad_norm": 1.578125,
"learning_rate": 0.00010965945480796325,
"loss": 4.6818,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64400
},
{
"epoch": 4.695823239038963,
"grad_norm": 1.8984375,
"learning_rate": 0.0001095455192627361,
"loss": 4.6638,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64450
},
{
"epoch": 4.6994662926464965,
"grad_norm": 1.5703125,
"learning_rate": 0.00010943157121085488,
"loss": 4.6655,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64500
},
{
"epoch": 4.70310934625403,
"grad_norm": 1.71875,
"learning_rate": 0.00010931761080161566,
"loss": 4.6591,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64550
},
{
"epoch": 4.706752399861564,
"grad_norm": 2.21875,
"learning_rate": 0.00010920363818433074,
"loss": 4.6745,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64600
},
{
"epoch": 4.710395453469098,
"grad_norm": 1.8125,
"learning_rate": 0.00010908965350832845,
"loss": 4.6792,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64650
},
{
"epoch": 4.714038507076632,
"grad_norm": 2.0,
"learning_rate": 0.00010897565692295284,
"loss": 4.6577,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64700
},
{
"epoch": 4.7176815606841656,
"grad_norm": 1.671875,
"learning_rate": 0.00010886164857756364,
"loss": 4.6769,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64750
},
{
"epoch": 4.721324614291699,
"grad_norm": 1.328125,
"learning_rate": 0.00010874762862153591,
"loss": 4.6679,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64800
},
{
"epoch": 4.724967667899233,
"grad_norm": 1.28125,
"learning_rate": 0.00010863359720426005,
"loss": 4.6622,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64850
},
{
"epoch": 4.728610721506767,
"grad_norm": 1.4453125,
"learning_rate": 0.0001085195544751413,
"loss": 4.6696,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64900
},
{
"epoch": 4.732253775114301,
"grad_norm": 1.71875,
"learning_rate": 0.00010840550058359988,
"loss": 4.6804,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 64950
},
{
"epoch": 4.735896828721835,
"grad_norm": 1.3671875,
"learning_rate": 0.00010829143567907054,
"loss": 4.665,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65000
},
{
"epoch": 4.739539882329368,
"grad_norm": 1.296875,
"learning_rate": 0.00010817735991100248,
"loss": 4.6734,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65050
},
{
"epoch": 4.743182935936902,
"grad_norm": 1.3515625,
"learning_rate": 0.00010806327342885918,
"loss": 4.6693,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65100
},
{
"epoch": 4.746825989544436,
"grad_norm": 1.2890625,
"learning_rate": 0.00010794917638211808,
"loss": 4.6875,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65150
},
{
"epoch": 4.75046904315197,
"grad_norm": 1.7578125,
"learning_rate": 0.00010783506892027056,
"loss": 4.6801,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65200
},
{
"epoch": 4.754112096759504,
"grad_norm": 1.4375,
"learning_rate": 0.00010772095119282151,
"loss": 4.6639,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65250
},
{
"epoch": 4.7577551503670374,
"grad_norm": 2.0,
"learning_rate": 0.0001076068233492894,
"loss": 4.6687,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65300
},
{
"epoch": 4.761398203974571,
"grad_norm": 1.328125,
"learning_rate": 0.00010749268553920587,
"loss": 4.6645,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65350
},
{
"epoch": 4.765041257582105,
"grad_norm": 1.484375,
"learning_rate": 0.00010737853791211572,
"loss": 4.6596,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65400
},
{
"epoch": 4.768684311189639,
"grad_norm": 2.0,
"learning_rate": 0.00010726438061757645,
"loss": 4.6566,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65450
},
{
"epoch": 4.772327364797173,
"grad_norm": 1.6796875,
"learning_rate": 0.00010715021380515837,
"loss": 4.6698,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65500
},
{
"epoch": 4.7759704184047065,
"grad_norm": 1.203125,
"learning_rate": 0.0001070360376244442,
"loss": 4.6715,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65550
},
{
"epoch": 4.77961347201224,
"grad_norm": 1.515625,
"learning_rate": 0.00010692185222502898,
"loss": 4.6691,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65600
},
{
"epoch": 4.783256525619775,
"grad_norm": 1.4140625,
"learning_rate": 0.00010680765775651972,
"loss": 4.6848,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65650
},
{
"epoch": 4.786899579227308,
"grad_norm": 1.265625,
"learning_rate": 0.00010669345436853546,
"loss": 4.6499,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65700
},
{
"epoch": 4.790542632834843,
"grad_norm": 1.515625,
"learning_rate": 0.00010657924221070681,
"loss": 4.6656,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65750
},
{
"epoch": 4.7941856864423755,
"grad_norm": 1.515625,
"learning_rate": 0.00010646502143267591,
"loss": 4.6505,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65800
},
{
"epoch": 4.79782874004991,
"grad_norm": 1.75,
"learning_rate": 0.00010635079218409623,
"loss": 4.6556,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65850
},
{
"epoch": 4.801471793657444,
"grad_norm": 1.265625,
"learning_rate": 0.00010623655461463227,
"loss": 4.6704,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65900
},
{
"epoch": 4.805114847264978,
"grad_norm": 2.328125,
"learning_rate": 0.00010612230887395946,
"loss": 4.6611,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 65950
},
{
"epoch": 4.808757900872512,
"grad_norm": 1.625,
"learning_rate": 0.000106008055111764,
"loss": 4.6705,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66000
},
{
"epoch": 4.8124009544800455,
"grad_norm": 1.7109375,
"learning_rate": 0.00010589379347774249,
"loss": 4.6656,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66050
},
{
"epoch": 4.816044008087579,
"grad_norm": 3.140625,
"learning_rate": 0.0001057795241216019,
"loss": 4.6599,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66100
},
{
"epoch": 4.819687061695113,
"grad_norm": 1.296875,
"learning_rate": 0.00010566524719305934,
"loss": 4.6602,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66150
},
{
"epoch": 4.823330115302647,
"grad_norm": 1.5625,
"learning_rate": 0.00010555096284184178,
"loss": 4.6479,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66200
},
{
"epoch": 4.826973168910181,
"grad_norm": 1.578125,
"learning_rate": 0.00010543667121768602,
"loss": 4.6504,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66250
},
{
"epoch": 4.8306162225177145,
"grad_norm": 2.0625,
"learning_rate": 0.00010532237247033823,
"loss": 4.654,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66300
},
{
"epoch": 4.834259276125248,
"grad_norm": 1.515625,
"learning_rate": 0.00010520806674955403,
"loss": 4.6747,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66350
},
{
"epoch": 4.837902329732782,
"grad_norm": 2.234375,
"learning_rate": 0.0001050937542050982,
"loss": 4.6595,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66400
},
{
"epoch": 4.841545383340316,
"grad_norm": 2.015625,
"learning_rate": 0.00010497943498674436,
"loss": 4.6597,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66450
},
{
"epoch": 4.84518843694785,
"grad_norm": 1.3515625,
"learning_rate": 0.00010486510924427496,
"loss": 4.665,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66500
},
{
"epoch": 4.8488314905553835,
"grad_norm": 1.7421875,
"learning_rate": 0.00010475077712748091,
"loss": 4.6618,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66550
},
{
"epoch": 4.852474544162917,
"grad_norm": 1.546875,
"learning_rate": 0.00010463643878616159,
"loss": 4.6647,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66600
},
{
"epoch": 4.856117597770451,
"grad_norm": 1.2265625,
"learning_rate": 0.00010452209437012439,
"loss": 4.6807,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66650
},
{
"epoch": 4.859760651377985,
"grad_norm": 1.5078125,
"learning_rate": 0.00010440774402918481,
"loss": 4.6923,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66700
},
{
"epoch": 4.863403704985519,
"grad_norm": 2.5625,
"learning_rate": 0.00010429338791316601,
"loss": 4.6683,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66750
},
{
"epoch": 4.867046758593053,
"grad_norm": 1.421875,
"learning_rate": 0.00010417902617189875,
"loss": 4.6638,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66800
},
{
"epoch": 4.870689812200586,
"grad_norm": 2.078125,
"learning_rate": 0.00010406465895522117,
"loss": 4.6586,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66850
},
{
"epoch": 4.87433286580812,
"grad_norm": 2.625,
"learning_rate": 0.00010395028641297853,
"loss": 4.6786,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66900
},
{
"epoch": 4.877975919415654,
"grad_norm": 1.40625,
"learning_rate": 0.00010383590869502318,
"loss": 4.6656,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 66950
},
{
"epoch": 4.881618973023188,
"grad_norm": 1.6171875,
"learning_rate": 0.00010372152595121412,
"loss": 4.6776,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67000
},
{
"epoch": 4.885262026630722,
"grad_norm": 1.4765625,
"learning_rate": 0.00010360713833141704,
"loss": 4.6579,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67050
},
{
"epoch": 4.888905080238255,
"grad_norm": 1.53125,
"learning_rate": 0.00010349274598550391,
"loss": 4.6589,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67100
},
{
"epoch": 4.892548133845789,
"grad_norm": 1.6875,
"learning_rate": 0.00010337834906335306,
"loss": 4.6657,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67150
},
{
"epoch": 4.896191187453323,
"grad_norm": 1.7265625,
"learning_rate": 0.00010326394771484862,
"loss": 4.6714,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67200
},
{
"epoch": 4.899834241060857,
"grad_norm": 1.296875,
"learning_rate": 0.00010314954208988066,
"loss": 4.6727,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67250
},
{
"epoch": 4.903477294668391,
"grad_norm": 1.609375,
"learning_rate": 0.00010303513233834478,
"loss": 4.6727,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67300
},
{
"epoch": 4.9071203482759245,
"grad_norm": 1.375,
"learning_rate": 0.00010292071861014202,
"loss": 4.6744,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67350
},
{
"epoch": 4.910763401883459,
"grad_norm": 1.3515625,
"learning_rate": 0.00010280630105517863,
"loss": 4.6755,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67400
},
{
"epoch": 4.914406455490992,
"grad_norm": 1.3125,
"learning_rate": 0.00010269187982336585,
"loss": 4.6669,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67450
},
{
"epoch": 4.918049509098527,
"grad_norm": 1.578125,
"learning_rate": 0.00010257745506461979,
"loss": 4.6787,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67500
},
{
"epoch": 4.921692562706061,
"grad_norm": 1.6484375,
"learning_rate": 0.00010246302692886108,
"loss": 4.6556,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67550
},
{
"epoch": 4.925335616313594,
"grad_norm": 1.9609375,
"learning_rate": 0.00010234859556601491,
"loss": 4.6794,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67600
},
{
"epoch": 4.928978669921128,
"grad_norm": 1.59375,
"learning_rate": 0.00010223416112601059,
"loss": 4.6625,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67650
},
{
"epoch": 4.932621723528662,
"grad_norm": 1.40625,
"learning_rate": 0.00010211972375878152,
"loss": 4.679,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67700
},
{
"epoch": 4.936264777136196,
"grad_norm": 1.1171875,
"learning_rate": 0.00010200528361426487,
"loss": 4.6547,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67750
},
{
"epoch": 4.93990783074373,
"grad_norm": 1.390625,
"learning_rate": 0.00010189084084240153,
"loss": 4.668,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67800
},
{
"epoch": 4.943550884351263,
"grad_norm": 1.9921875,
"learning_rate": 0.00010177639559313576,
"loss": 4.6662,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67850
},
{
"epoch": 4.947193937958797,
"grad_norm": 1.453125,
"learning_rate": 0.00010166194801641515,
"loss": 4.6642,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67900
},
{
"epoch": 4.950836991566331,
"grad_norm": 2.390625,
"learning_rate": 0.0001015474982621903,
"loss": 4.6687,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 67950
},
{
"epoch": 4.954480045173865,
"grad_norm": 1.4609375,
"learning_rate": 0.00010143304648041459,
"loss": 4.6698,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 68000
},
{
"epoch": 4.958123098781399,
"grad_norm": 2.21875,
"learning_rate": 0.00010131859282104416,
"loss": 4.6637,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 68050
},
{
"epoch": 4.9617661523889325,
"grad_norm": 1.5078125,
"learning_rate": 0.00010120413743403755,
"loss": 4.6576,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 68100
},
{
"epoch": 4.965409205996466,
"grad_norm": 1.3515625,
"learning_rate": 0.00010108968046935565,
"loss": 4.6698,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 68150
},
{
"epoch": 4.969052259604,
"grad_norm": 1.3671875,
"learning_rate": 0.00010097522207696126,
"loss": 4.6746,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 68200
},
{
"epoch": 4.972695313211534,
"grad_norm": 1.578125,
"learning_rate": 0.00010086076240681916,
"loss": 4.6666,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 68250
},
{
"epoch": 4.976338366819068,
"grad_norm": 1.625,
"learning_rate": 0.00010074630160889585,
"loss": 4.6544,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 68300
},
{
"epoch": 4.9799814204266015,
"grad_norm": 1.6484375,
"learning_rate": 0.00010063183983315919,
"loss": 4.6565,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 68350
},
{
"epoch": 4.983624474034135,
"grad_norm": 2.171875,
"learning_rate": 0.00010051737722957837,
"loss": 4.6728,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 68400
},
{
"epoch": 4.987267527641669,
"grad_norm": 3.171875,
"learning_rate": 0.00010040291394812366,
"loss": 4.6552,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 68450
},
{
"epoch": 4.990910581249203,
"grad_norm": 1.1796875,
"learning_rate": 0.0001002884501387663,
"loss": 4.6773,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 68500
},
{
"epoch": 4.994553634856737,
"grad_norm": 3.09375,
"learning_rate": 0.00010017398595147807,
"loss": 4.6575,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 68550
},
{
"epoch": 4.998196688464271,
"grad_norm": 2.125,
"learning_rate": 0.00010005952153623137,
"loss": 4.6656,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 68600
},
{
"epoch": 4.999653909907284,
"eval_loss": 4.676141738891602,
"eval_runtime": 585.0208,
"eval_samples_per_second": 518.491,
"eval_steps_per_second": 43.209,
"memory/device_mem_reserved(gib)": 63.58,
"memory/max_mem_active(gib)": 57.51,
"memory/max_mem_allocated(gib)": 57.51,
"step": 68620
}
],
"logging_steps": 50,
"max_steps": 137240,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.896851859210974e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}