{
"best_global_step": 19990,
"best_metric": 0.566254198551178,
"best_model_checkpoint": "/media/user/Expansion1/granite-embedding-107m-multilingual-chat-difficulty/checkpoint-19990",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 49975,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05002501250625312,
"grad_norm": 14.432242393493652,
"learning_rate": 4.9500750375187595e-05,
"loss": 0.7612,
"num_input_tokens_seen": 512000,
"step": 500,
"train_runtime": 10.716,
"train_tokens_per_second": 47778.963
},
{
"epoch": 0.10005002501250625,
"grad_norm": 4.861369609832764,
"learning_rate": 4.900050025012506e-05,
"loss": 0.682,
"num_input_tokens_seen": 1024000,
"step": 1000,
"train_runtime": 21.117,
"train_tokens_per_second": 48491.697
},
{
"epoch": 0.1500750375187594,
"grad_norm": 8.015384674072266,
"learning_rate": 4.8500250125062535e-05,
"loss": 0.6637,
"num_input_tokens_seen": 1536000,
"step": 1500,
"train_runtime": 31.546,
"train_tokens_per_second": 48690.793
},
{
"epoch": 0.2001000500250125,
"grad_norm": 8.866910934448242,
"learning_rate": 4.8e-05,
"loss": 0.6346,
"num_input_tokens_seen": 2048000,
"step": 2000,
"train_runtime": 41.8744,
"train_tokens_per_second": 48908.201
},
{
"epoch": 0.25012506253126565,
"grad_norm": 8.685997009277344,
"learning_rate": 4.7499749874937475e-05,
"loss": 0.6475,
"num_input_tokens_seen": 2560000,
"step": 2500,
"train_runtime": 52.2059,
"train_tokens_per_second": 49036.559
},
{
"epoch": 0.3001500750375188,
"grad_norm": 11.758138656616211,
"learning_rate": 4.699949974987494e-05,
"loss": 0.6349,
"num_input_tokens_seen": 3072000,
"step": 3000,
"train_runtime": 62.5742,
"train_tokens_per_second": 49093.701
},
{
"epoch": 0.3501750875437719,
"grad_norm": 8.118245124816895,
"learning_rate": 4.649924962481241e-05,
"loss": 0.6317,
"num_input_tokens_seen": 3584000,
"step": 3500,
"train_runtime": 72.9422,
"train_tokens_per_second": 49134.817
},
{
"epoch": 0.400200100050025,
"grad_norm": 6.139720916748047,
"learning_rate": 4.599899949974988e-05,
"loss": 0.6167,
"num_input_tokens_seen": 4096000,
"step": 4000,
"train_runtime": 83.3179,
"train_tokens_per_second": 49161.134
},
{
"epoch": 0.4502251125562781,
"grad_norm": 8.408066749572754,
"learning_rate": 4.549874937468734e-05,
"loss": 0.5989,
"num_input_tokens_seen": 4608000,
"step": 4500,
"train_runtime": 93.7746,
"train_tokens_per_second": 49139.121
},
{
"epoch": 0.5002501250625313,
"grad_norm": 10.197962760925293,
"learning_rate": 4.4998499249624814e-05,
"loss": 0.6064,
"num_input_tokens_seen": 5120000,
"step": 5000,
"train_runtime": 104.1644,
"train_tokens_per_second": 49153.06
},
{
"epoch": 0.5502751375687844,
"grad_norm": 4.074629306793213,
"learning_rate": 4.449824912456229e-05,
"loss": 0.6092,
"num_input_tokens_seen": 5632000,
"step": 5500,
"train_runtime": 114.5316,
"train_tokens_per_second": 49174.215
},
{
"epoch": 0.6003001500750376,
"grad_norm": 3.91839861869812,
"learning_rate": 4.3997998999499754e-05,
"loss": 0.589,
"num_input_tokens_seen": 6144000,
"step": 6000,
"train_runtime": 124.9251,
"train_tokens_per_second": 49181.485
},
{
"epoch": 0.6503251625812907,
"grad_norm": 8.543745040893555,
"learning_rate": 4.349774887443722e-05,
"loss": 0.6039,
"num_input_tokens_seen": 6656000,
"step": 6500,
"train_runtime": 135.6035,
"train_tokens_per_second": 49084.295
},
{
"epoch": 0.7003501750875438,
"grad_norm": 7.670341491699219,
"learning_rate": 4.299749874937469e-05,
"loss": 0.6106,
"num_input_tokens_seen": 7168000,
"step": 7000,
"train_runtime": 146.0175,
"train_tokens_per_second": 49090.01
},
{
"epoch": 0.7503751875937968,
"grad_norm": 7.7239580154418945,
"learning_rate": 4.249724862431216e-05,
"loss": 0.6287,
"num_input_tokens_seen": 7680000,
"step": 7500,
"train_runtime": 156.4134,
"train_tokens_per_second": 49100.637
},
{
"epoch": 0.80040020010005,
"grad_norm": 11.006497383117676,
"learning_rate": 4.199699849924963e-05,
"loss": 0.5984,
"num_input_tokens_seen": 8192000,
"step": 8000,
"train_runtime": 166.8094,
"train_tokens_per_second": 49109.928
},
{
"epoch": 0.8504252126063031,
"grad_norm": 6.032561779022217,
"learning_rate": 4.1496748374187094e-05,
"loss": 0.5775,
"num_input_tokens_seen": 8704000,
"step": 8500,
"train_runtime": 177.6017,
"train_tokens_per_second": 49008.538
},
{
"epoch": 0.9004502251125562,
"grad_norm": 19.184879302978516,
"learning_rate": 4.099649824912457e-05,
"loss": 0.6008,
"num_input_tokens_seen": 9216000,
"step": 9000,
"train_runtime": 188.21,
"train_tokens_per_second": 48966.59
},
{
"epoch": 0.9504752376188094,
"grad_norm": 21.708553314208984,
"learning_rate": 4.049624812406203e-05,
"loss": 0.5781,
"num_input_tokens_seen": 9728000,
"step": 9500,
"train_runtime": 198.7447,
"train_tokens_per_second": 48947.209
},
{
"epoch": 1.0,
"eval_loss": 0.5996330380439758,
"eval_mse": 0.599633070872774,
"eval_runtime": 7.71,
"eval_samples_per_second": 2592.618,
"eval_steps_per_second": 324.126,
"num_input_tokens_seen": 10234624,
"step": 9995
},
{
"epoch": 1.0005002501250626,
"grad_norm": 14.216784477233887,
"learning_rate": 3.99959979989995e-05,
"loss": 0.5931,
"num_input_tokens_seen": 10239744,
"step": 10000,
"train_runtime": 218.5356,
"train_tokens_per_second": 46856.177
},
{
"epoch": 1.0505252626313157,
"grad_norm": 12.273909568786621,
"learning_rate": 3.9495747873936974e-05,
"loss": 0.5232,
"num_input_tokens_seen": 10751744,
"step": 10500,
"train_runtime": 228.9682,
"train_tokens_per_second": 46957.358
},
{
"epoch": 1.1005502751375689,
"grad_norm": 5.408884525299072,
"learning_rate": 3.899549774887444e-05,
"loss": 0.5491,
"num_input_tokens_seen": 11263744,
"step": 11000,
"train_runtime": 239.4335,
"train_tokens_per_second": 47043.302
},
{
"epoch": 1.150575287643822,
"grad_norm": 13.357819557189941,
"learning_rate": 3.849524762381191e-05,
"loss": 0.5141,
"num_input_tokens_seen": 11775744,
"step": 11500,
"train_runtime": 249.8249,
"train_tokens_per_second": 47135.996
},
{
"epoch": 1.2006003001500751,
"grad_norm": 6.6150641441345215,
"learning_rate": 3.7994997498749374e-05,
"loss": 0.5161,
"num_input_tokens_seen": 12287744,
"step": 12000,
"train_runtime": 260.1103,
"train_tokens_per_second": 47240.518
},
{
"epoch": 1.2506253126563283,
"grad_norm": 6.556243896484375,
"learning_rate": 3.749474737368685e-05,
"loss": 0.5046,
"num_input_tokens_seen": 12799744,
"step": 12500,
"train_runtime": 270.3312,
"train_tokens_per_second": 47348.381
},
{
"epoch": 1.3006503251625814,
"grad_norm": 15.295032501220703,
"learning_rate": 3.6994497248624314e-05,
"loss": 0.5032,
"num_input_tokens_seen": 13311744,
"step": 13000,
"train_runtime": 280.9075,
"train_tokens_per_second": 47388.352
},
{
"epoch": 1.3506753376688345,
"grad_norm": 5.232257843017578,
"learning_rate": 3.649424712356178e-05,
"loss": 0.5209,
"num_input_tokens_seen": 13823744,
"step": 13500,
"train_runtime": 291.1832,
"train_tokens_per_second": 47474.391
},
{
"epoch": 1.4007003501750876,
"grad_norm": 4.907810688018799,
"learning_rate": 3.5993996998499254e-05,
"loss": 0.5166,
"num_input_tokens_seen": 14335744,
"step": 14000,
"train_runtime": 301.5311,
"train_tokens_per_second": 47543.176
},
{
"epoch": 1.4507253626813408,
"grad_norm": 2.214416742324829,
"learning_rate": 3.549374687343672e-05,
"loss": 0.508,
"num_input_tokens_seen": 14847744,
"step": 14500,
"train_runtime": 311.993,
"train_tokens_per_second": 47589.991
},
{
"epoch": 1.500750375187594,
"grad_norm": 7.953266620635986,
"learning_rate": 3.499349674837419e-05,
"loss": 0.5163,
"num_input_tokens_seen": 15359744,
"step": 15000,
"train_runtime": 322.4354,
"train_tokens_per_second": 47636.659
},
{
"epoch": 1.550775387693847,
"grad_norm": 27.234952926635742,
"learning_rate": 3.449324662331166e-05,
"loss": 0.5105,
"num_input_tokens_seen": 15871744,
"step": 15500,
"train_runtime": 332.8782,
"train_tokens_per_second": 47680.34
},
{
"epoch": 1.6008004002001002,
"grad_norm": 8.335816383361816,
"learning_rate": 3.399299649824913e-05,
"loss": 0.5177,
"num_input_tokens_seen": 16383744,
"step": 16000,
"train_runtime": 343.188,
"train_tokens_per_second": 47739.856
},
{
"epoch": 1.6508254127063533,
"grad_norm": 12.731109619140625,
"learning_rate": 3.3492746373186594e-05,
"loss": 0.5282,
"num_input_tokens_seen": 16895744,
"step": 16500,
"train_runtime": 353.7522,
"train_tokens_per_second": 47761.528
},
{
"epoch": 1.7008504252126064,
"grad_norm": 2.0694406032562256,
"learning_rate": 3.299249624812407e-05,
"loss": 0.5217,
"num_input_tokens_seen": 17407744,
"step": 17000,
"train_runtime": 364.3803,
"train_tokens_per_second": 47773.564
},
{
"epoch": 1.7508754377188596,
"grad_norm": 6.190609455108643,
"learning_rate": 3.2492246123061534e-05,
"loss": 0.5185,
"num_input_tokens_seen": 17919744,
"step": 17500,
"train_runtime": 374.8689,
"train_tokens_per_second": 47802.702
},
{
"epoch": 1.8009004502251127,
"grad_norm": 6.968355655670166,
"learning_rate": 3.1991995997999e-05,
"loss": 0.5025,
"num_input_tokens_seen": 18431744,
"step": 18000,
"train_runtime": 385.1924,
"train_tokens_per_second": 47850.75
},
{
"epoch": 1.8509254627313658,
"grad_norm": 2.6967575550079346,
"learning_rate": 3.149174587293647e-05,
"loss": 0.5118,
"num_input_tokens_seen": 18943744,
"step": 18500,
"train_runtime": 395.8463,
"train_tokens_per_second": 47856.316
},
{
"epoch": 1.900950475237619,
"grad_norm": 13.992359161376953,
"learning_rate": 3.099149574787394e-05,
"loss": 0.5209,
"num_input_tokens_seen": 19455744,
"step": 19000,
"train_runtime": 406.5011,
"train_tokens_per_second": 47861.481
},
{
"epoch": 1.950975487743872,
"grad_norm": 4.206520080566406,
"learning_rate": 3.049124562281141e-05,
"loss": 0.5063,
"num_input_tokens_seen": 19967744,
"step": 19500,
"train_runtime": 416.9681,
"train_tokens_per_second": 47887.945
},
{
"epoch": 2.0,
"eval_loss": 0.566254198551178,
"eval_mse": 0.5662542309499291,
"eval_runtime": 7.8118,
"eval_samples_per_second": 2558.806,
"eval_steps_per_second": 319.899,
"num_input_tokens_seen": 20469248,
"step": 19990
},
{
"epoch": 2.001000500250125,
"grad_norm": 5.060875415802002,
"learning_rate": 2.9990995497748873e-05,
"loss": 0.5111,
"num_input_tokens_seen": 20479488,
"step": 20000,
"train_runtime": 437.0238,
"train_tokens_per_second": 46861.264
},
{
"epoch": 2.0510255127563783,
"grad_norm": 4.569818019866943,
"learning_rate": 2.9490745372686347e-05,
"loss": 0.4432,
"num_input_tokens_seen": 20991488,
"step": 20500,
"train_runtime": 447.3167,
"train_tokens_per_second": 46927.57
},
{
"epoch": 2.1010505252626315,
"grad_norm": 6.6488142013549805,
"learning_rate": 2.899049524762381e-05,
"loss": 0.418,
"num_input_tokens_seen": 21503488,
"step": 21000,
"train_runtime": 457.6456,
"train_tokens_per_second": 46987.204
},
{
"epoch": 2.1510755377688846,
"grad_norm": 6.992687702178955,
"learning_rate": 2.8490245122561283e-05,
"loss": 0.4222,
"num_input_tokens_seen": 22015488,
"step": 21500,
"train_runtime": 467.9444,
"train_tokens_per_second": 47047.231
},
{
"epoch": 2.2011005502751377,
"grad_norm": 9.515182495117188,
"learning_rate": 2.7989994997498753e-05,
"loss": 0.4325,
"num_input_tokens_seen": 22527488,
"step": 22000,
"train_runtime": 478.2535,
"train_tokens_per_second": 47103.653
},
{
"epoch": 2.251125562781391,
"grad_norm": 9.444127082824707,
"learning_rate": 2.748974487243622e-05,
"loss": 0.42,
"num_input_tokens_seen": 23039488,
"step": 22500,
"train_runtime": 488.5556,
"train_tokens_per_second": 47158.378
},
{
"epoch": 2.301150575287644,
"grad_norm": 9.03792953491211,
"learning_rate": 2.698949474737369e-05,
"loss": 0.4123,
"num_input_tokens_seen": 23551488,
"step": 23000,
"train_runtime": 498.876,
"train_tokens_per_second": 47209.103
},
{
"epoch": 2.351175587793897,
"grad_norm": 8.053046226501465,
"learning_rate": 2.6489244622311153e-05,
"loss": 0.4179,
"num_input_tokens_seen": 24063488,
"step": 23500,
"train_runtime": 509.2037,
"train_tokens_per_second": 47257.098
},
{
"epoch": 2.4012006003001503,
"grad_norm": 13.128437995910645,
"learning_rate": 2.5988994497248627e-05,
"loss": 0.4239,
"num_input_tokens_seen": 24575488,
"step": 24000,
"train_runtime": 519.5215,
"train_tokens_per_second": 47304.079
},
{
"epoch": 2.4512256128064034,
"grad_norm": 10.007084846496582,
"learning_rate": 2.5488744372186097e-05,
"loss": 0.4316,
"num_input_tokens_seen": 25087488,
"step": 24500,
"train_runtime": 529.8242,
"train_tokens_per_second": 47350.591
},
{
"epoch": 2.5012506253126565,
"grad_norm": 11.051094055175781,
"learning_rate": 2.4988494247123563e-05,
"loss": 0.4233,
"num_input_tokens_seen": 25599488,
"step": 25000,
"train_runtime": 540.1399,
"train_tokens_per_second": 47394.182
},
{
"epoch": 2.551275637818909,
"grad_norm": 6.066195487976074,
"learning_rate": 2.4488244122061033e-05,
"loss": 0.4251,
"num_input_tokens_seen": 26111488,
"step": 25500,
"train_runtime": 550.4474,
"train_tokens_per_second": 47436.848
},
{
"epoch": 2.6013006503251628,
"grad_norm": 2.6425974369049072,
"learning_rate": 2.39879939969985e-05,
"loss": 0.4479,
"num_input_tokens_seen": 26623488,
"step": 26000,
"train_runtime": 560.7806,
"train_tokens_per_second": 47475.766
},
{
"epoch": 2.6513256628314155,
"grad_norm": 11.04046630859375,
"learning_rate": 2.348774387193597e-05,
"loss": 0.4457,
"num_input_tokens_seen": 27135488,
"step": 26500,
"train_runtime": 571.1223,
"train_tokens_per_second": 47512.567
},
{
"epoch": 2.701350675337669,
"grad_norm": 14.067301750183105,
"learning_rate": 2.2987493746873436e-05,
"loss": 0.4323,
"num_input_tokens_seen": 27647488,
"step": 27000,
"train_runtime": 581.4573,
"train_tokens_per_second": 47548.615
},
{
"epoch": 2.7513756878439217,
"grad_norm": 6.148231029510498,
"learning_rate": 2.2487243621810906e-05,
"loss": 0.4314,
"num_input_tokens_seen": 28159488,
"step": 27500,
"train_runtime": 591.7788,
"train_tokens_per_second": 47584.484
},
{
"epoch": 2.8014007003501753,
"grad_norm": 4.049166202545166,
"learning_rate": 2.1986993496748376e-05,
"loss": 0.4245,
"num_input_tokens_seen": 28671488,
"step": 28000,
"train_runtime": 602.1254,
"train_tokens_per_second": 47617.141
},
{
"epoch": 2.851425712856428,
"grad_norm": 10.769721031188965,
"learning_rate": 2.1486743371685843e-05,
"loss": 0.432,
"num_input_tokens_seen": 29183488,
"step": 28500,
"train_runtime": 612.4701,
"train_tokens_per_second": 47648.836
},
{
"epoch": 2.9014507253626816,
"grad_norm": 7.408927917480469,
"learning_rate": 2.0986493246623313e-05,
"loss": 0.4296,
"num_input_tokens_seen": 29695488,
"step": 29000,
"train_runtime": 622.8209,
"train_tokens_per_second": 47679.02
},
{
"epoch": 2.9514757378689342,
"grad_norm": 15.281734466552734,
"learning_rate": 2.048624312156078e-05,
"loss": 0.4124,
"num_input_tokens_seen": 30207488,
"step": 29500,
"train_runtime": 633.1949,
"train_tokens_per_second": 47706.46
},
{
"epoch": 3.0,
"eval_loss": 0.6329430937767029,
"eval_mse": 0.6329430625566823,
"eval_runtime": 7.4256,
"eval_samples_per_second": 2691.912,
"eval_steps_per_second": 336.539,
"num_input_tokens_seen": 30703872,
"step": 29985
},
{
"epoch": 3.0015007503751874,
"grad_norm": 5.859447479248047,
"learning_rate": 1.998599299649825e-05,
"loss": 0.3997,
"num_input_tokens_seen": 30719232,
"step": 30000,
"train_runtime": 652.5502,
"train_tokens_per_second": 47075.658
},
{
"epoch": 3.0515257628814405,
"grad_norm": 34.13383102416992,
"learning_rate": 1.948574287143572e-05,
"loss": 0.3636,
"num_input_tokens_seen": 31231232,
"step": 30500,
"train_runtime": 662.8658,
"train_tokens_per_second": 47115.47
},
{
"epoch": 3.1015507753876936,
"grad_norm": 7.527888298034668,
"learning_rate": 1.8985492746373186e-05,
"loss": 0.3514,
"num_input_tokens_seen": 31743232,
"step": 31000,
"train_runtime": 673.2071,
"train_tokens_per_second": 47152.251
},
{
"epoch": 3.1515757878939468,
"grad_norm": 13.83651351928711,
"learning_rate": 1.8485242621310656e-05,
"loss": 0.3339,
"num_input_tokens_seen": 32255232,
"step": 31500,
"train_runtime": 683.5282,
"train_tokens_per_second": 47189.32
},
{
"epoch": 3.2016008004002,
"grad_norm": 16.254074096679688,
"learning_rate": 1.7984992496248123e-05,
"loss": 0.3544,
"num_input_tokens_seen": 32767232,
"step": 32000,
"train_runtime": 693.8526,
"train_tokens_per_second": 47225.062
},
{
"epoch": 3.251625812906453,
"grad_norm": 11.702266693115234,
"learning_rate": 1.7484742371185596e-05,
"loss": 0.3475,
"num_input_tokens_seen": 33279232,
"step": 32500,
"train_runtime": 704.191,
"train_tokens_per_second": 47258.812
},
{
"epoch": 3.3016508254127066,
"grad_norm": 6.544358253479004,
"learning_rate": 1.6984492246123063e-05,
"loss": 0.3251,
"num_input_tokens_seen": 33791232,
"step": 33000,
"train_runtime": 714.5267,
"train_tokens_per_second": 47291.768
},
{
"epoch": 3.3516758379189593,
"grad_norm": 6.692401885986328,
"learning_rate": 1.648424212106053e-05,
"loss": 0.3596,
"num_input_tokens_seen": 34303232,
"step": 33500,
"train_runtime": 724.8425,
"train_tokens_per_second": 47325.083
},
{
"epoch": 3.4017008504252124,
"grad_norm": 1.8304575681686401,
"learning_rate": 1.5983991995998e-05,
"loss": 0.3368,
"num_input_tokens_seen": 34815232,
"step": 34000,
"train_runtime": 735.1924,
"train_tokens_per_second": 47355.269
},
{
"epoch": 3.4517258629314655,
"grad_norm": 16.374961853027344,
"learning_rate": 1.548374187093547e-05,
"loss": 0.355,
"num_input_tokens_seen": 35327232,
"step": 34500,
"train_runtime": 745.5365,
"train_tokens_per_second": 47384.977
},
{
"epoch": 3.501750875437719,
"grad_norm": 10.639739036560059,
"learning_rate": 1.4983491745872938e-05,
"loss": 0.3686,
"num_input_tokens_seen": 35839232,
"step": 35000,
"train_runtime": 755.8868,
"train_tokens_per_second": 47413.491
},
{
"epoch": 3.551775887943972,
"grad_norm": 7.099308967590332,
"learning_rate": 1.4483241620810406e-05,
"loss": 0.3526,
"num_input_tokens_seen": 36351232,
"step": 35500,
"train_runtime": 766.2123,
"train_tokens_per_second": 47442.767
},
{
"epoch": 3.6018009004502254,
"grad_norm": 5.288228988647461,
"learning_rate": 1.3982991495747874e-05,
"loss": 0.3478,
"num_input_tokens_seen": 36863232,
"step": 36000,
"train_runtime": 776.5473,
"train_tokens_per_second": 47470.685
},
{
"epoch": 3.651825912956478,
"grad_norm": 5.8353471755981445,
"learning_rate": 1.3482741370685342e-05,
"loss": 0.3551,
"num_input_tokens_seen": 37375232,
"step": 36500,
"train_runtime": 786.8701,
"train_tokens_per_second": 47498.606
},
{
"epoch": 3.701850925462731,
"grad_norm": 5.141830921173096,
"learning_rate": 1.2982491245622812e-05,
"loss": 0.3564,
"num_input_tokens_seen": 37887232,
"step": 37000,
"train_runtime": 797.4137,
"train_tokens_per_second": 47512.64
},
{
"epoch": 3.7518759379689843,
"grad_norm": 9.891301155090332,
"learning_rate": 1.248224112056028e-05,
"loss": 0.3287,
"num_input_tokens_seen": 38399232,
"step": 37500,
"train_runtime": 807.7554,
"train_tokens_per_second": 47538.195
},
{
"epoch": 3.8019009504752375,
"grad_norm": 12.06251049041748,
"learning_rate": 1.1981990995497749e-05,
"loss": 0.3685,
"num_input_tokens_seen": 38911232,
"step": 38000,
"train_runtime": 818.0758,
"train_tokens_per_second": 47564.337
},
{
"epoch": 3.8519259629814906,
"grad_norm": 3.9367198944091797,
"learning_rate": 1.1481740870435217e-05,
"loss": 0.3376,
"num_input_tokens_seen": 39423232,
"step": 38500,
"train_runtime": 828.4313,
"train_tokens_per_second": 47587.81
},
{
"epoch": 3.9019509754877437,
"grad_norm": 12.005465507507324,
"learning_rate": 1.0981490745372687e-05,
"loss": 0.3435,
"num_input_tokens_seen": 39935232,
"step": 39000,
"train_runtime": 838.7871,
"train_tokens_per_second": 47610.69
},
{
"epoch": 3.951975987993997,
"grad_norm": 8.57763671875,
"learning_rate": 1.0481240620310156e-05,
"loss": 0.3463,
"num_input_tokens_seen": 40447232,
"step": 39500,
"train_runtime": 849.1478,
"train_tokens_per_second": 47632.736
},
{
"epoch": 4.0,
"eval_loss": 0.6491973996162415,
"eval_mse": 0.6491973448647838,
"eval_runtime": 7.4218,
"eval_samples_per_second": 2693.27,
"eval_steps_per_second": 336.709,
"num_input_tokens_seen": 40938496,
"step": 39980
},
{
"epoch": 4.00200100050025,
"grad_norm": 8.412797927856445,
"learning_rate": 9.980990495247624e-06,
"loss": 0.3346,
"num_input_tokens_seen": 40958976,
"step": 40000,
"train_runtime": 868.7008,
"train_tokens_per_second": 47149.696
},
{
"epoch": 4.052026013006503,
"grad_norm": 17.825761795043945,
"learning_rate": 9.480740370185092e-06,
"loss": 0.3034,
"num_input_tokens_seen": 41470976,
"step": 40500,
"train_runtime": 878.9875,
"train_tokens_per_second": 47180.395
},
{
"epoch": 4.102051025512757,
"grad_norm": 10.180594444274902,
"learning_rate": 8.980490245122562e-06,
"loss": 0.2971,
"num_input_tokens_seen": 41982976,
"step": 41000,
"train_runtime": 889.3152,
"train_tokens_per_second": 47208.207
},
{
"epoch": 4.152076038019009,
"grad_norm": 6.496691703796387,
"learning_rate": 8.48024012006003e-06,
"loss": 0.29,
"num_input_tokens_seen": 42494976,
"step": 41500,
"train_runtime": 899.6157,
"train_tokens_per_second": 47236.812
},
{
"epoch": 4.202101050525263,
"grad_norm": 7.926106929779053,
"learning_rate": 7.979989994997499e-06,
"loss": 0.2877,
"num_input_tokens_seen": 43006976,
"step": 42000,
"train_runtime": 909.9204,
"train_tokens_per_second": 47264.546
},
{
"epoch": 4.252126063031516,
"grad_norm": 9.483957290649414,
"learning_rate": 7.479739869934968e-06,
"loss": 0.3057,
"num_input_tokens_seen": 43518976,
"step": 42500,
"train_runtime": 920.2145,
"train_tokens_per_second": 47292.21
},
{
"epoch": 4.302151075537769,
"grad_norm": 13.678934097290039,
"learning_rate": 6.979489744872436e-06,
"loss": 0.2989,
"num_input_tokens_seen": 44030976,
"step": 43000,
"train_runtime": 930.5087,
"train_tokens_per_second": 47319.254
},
{
"epoch": 4.352176088044022,
"grad_norm": 4.871670246124268,
"learning_rate": 6.479239619809905e-06,
"loss": 0.2948,
"num_input_tokens_seen": 44542976,
"step": 43500,
"train_runtime": 940.808,
"train_tokens_per_second": 47345.449
},
{
"epoch": 4.4022011005502755,
"grad_norm": 11.43335247039795,
"learning_rate": 5.978989494747374e-06,
"loss": 0.301,
"num_input_tokens_seen": 45054976,
"step": 44000,
"train_runtime": 951.1249,
"train_tokens_per_second": 47370.197
},
{
"epoch": 4.452226113056528,
"grad_norm": 4.706591606140137,
"learning_rate": 5.478739369684843e-06,
"loss": 0.2929,
"num_input_tokens_seen": 45566976,
"step": 44500,
"train_runtime": 961.4381,
"train_tokens_per_second": 47394.601
},
{
"epoch": 4.502251125562782,
"grad_norm": 7.695474624633789,
"learning_rate": 4.978489244622311e-06,
"loss": 0.3027,
"num_input_tokens_seen": 46078976,
"step": 45000,
"train_runtime": 971.7628,
"train_tokens_per_second": 47417.925
},
{
"epoch": 4.552276138069034,
"grad_norm": 3.7478439807891846,
"learning_rate": 4.47823911955978e-06,
"loss": 0.317,
"num_input_tokens_seen": 46590976,
"step": 45500,
"train_runtime": 982.0466,
"train_tokens_per_second": 47442.735
},
{
"epoch": 4.602301150575288,
"grad_norm": 5.191857814788818,
"learning_rate": 3.977988994497249e-06,
"loss": 0.2869,
"num_input_tokens_seen": 47102976,
"step": 46000,
"train_runtime": 992.3427,
"train_tokens_per_second": 47466.441
},
{
"epoch": 4.652326163081541,
"grad_norm": 9.192943572998047,
"learning_rate": 3.4777388694347177e-06,
"loss": 0.2741,
"num_input_tokens_seen": 47614976,
"step": 46500,
"train_runtime": 1002.6605,
"train_tokens_per_second": 47488.632
},
{
"epoch": 4.702351175587794,
"grad_norm": 11.236271858215332,
"learning_rate": 2.9774887443721864e-06,
"loss": 0.2967,
"num_input_tokens_seen": 48126976,
"step": 47000,
"train_runtime": 1012.9854,
"train_tokens_per_second": 47510.041
},
{
"epoch": 4.752376188094047,
"grad_norm": 4.665492534637451,
"learning_rate": 2.477238619309655e-06,
"loss": 0.29,
"num_input_tokens_seen": 48638976,
"step": 47500,
"train_runtime": 1023.2749,
"train_tokens_per_second": 47532.66
},
{
"epoch": 4.8024012006003005,
"grad_norm": 14.87586784362793,
"learning_rate": 1.9769884942471234e-06,
"loss": 0.2883,
"num_input_tokens_seen": 49150976,
"step": 48000,
"train_runtime": 1033.6207,
"train_tokens_per_second": 47552.238
},
{
"epoch": 4.852426213106553,
"grad_norm": 12.128450393676758,
"learning_rate": 1.4767383691845923e-06,
"loss": 0.2925,
"num_input_tokens_seen": 49662976,
"step": 48500,
"train_runtime": 1043.9662,
"train_tokens_per_second": 47571.44
},
{
"epoch": 4.902451225612807,
"grad_norm": 7.318789958953857,
"learning_rate": 9.76488244122061e-07,
"loss": 0.2972,
"num_input_tokens_seen": 50174976,
"step": 49000,
"train_runtime": 1054.2994,
"train_tokens_per_second": 47590.821
},
{
"epoch": 4.9524762381190595,
"grad_norm": 6.287776947021484,
"learning_rate": 4.762381190595298e-07,
"loss": 0.3124,
"num_input_tokens_seen": 50686976,
"step": 49500,
"train_runtime": 1064.6538,
"train_tokens_per_second": 47608.88
},
{
"epoch": 5.0,
"eval_loss": 0.6694201231002808,
"eval_mse": 0.66942013065375,
"eval_runtime": 7.3591,
"eval_samples_per_second": 2716.211,
"eval_steps_per_second": 339.577,
"num_input_tokens_seen": 51173120,
"step": 49975
},
{
"epoch": 5.0,
"num_input_tokens_seen": 51173120,
"step": 49975,
"total_flos": 3314721551485440.0,
"train_loss": 0.4421676393006073,
"train_runtime": 1083.6381,
"train_samples_per_second": 368.933,
"train_steps_per_second": 46.118
}
],
"logging_steps": 500,
"max_steps": 49975,
"num_input_tokens_seen": 51173120,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3314721551485440.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}