{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 400,
"global_step": 26155,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.019116803670426306,
"grad_norm": 6.148062705993652,
"learning_rate": 4.981265532402982e-05,
"loss": 1.0593,
"step": 100
},
{
"epoch": 0.03823360734085261,
"grad_norm": 4.97545051574707,
"learning_rate": 4.9625310648059644e-05,
"loss": 0.9839,
"step": 200
},
{
"epoch": 0.05735041101127891,
"grad_norm": 5.777927875518799,
"learning_rate": 4.9434142611355384e-05,
"loss": 0.9631,
"step": 300
},
{
"epoch": 0.07646721468170523,
"grad_norm": 6.401044845581055,
"learning_rate": 4.924297457465112e-05,
"loss": 0.9816,
"step": 400
},
{
"epoch": 0.07646721468170523,
"eval_loss": 0.8617107272148132,
"eval_runtime": 92.0408,
"eval_samples_per_second": 89.341,
"eval_steps_per_second": 11.169,
"step": 400
},
{
"epoch": 0.09558401835213153,
"grad_norm": 5.193621635437012,
"learning_rate": 4.905180653794686e-05,
"loss": 0.9312,
"step": 500
},
{
"epoch": 0.11470082202255782,
"grad_norm": 5.454291343688965,
"learning_rate": 4.88606385012426e-05,
"loss": 0.9509,
"step": 600
},
{
"epoch": 0.13381762569298414,
"grad_norm": 6.0297112464904785,
"learning_rate": 4.866947046453833e-05,
"loss": 0.9374,
"step": 700
},
{
"epoch": 0.15293442936341045,
"grad_norm": 4.852521896362305,
"learning_rate": 4.847830242783406e-05,
"loss": 0.9135,
"step": 800
},
{
"epoch": 0.15293442936341045,
"eval_loss": 0.8287038207054138,
"eval_runtime": 92.0241,
"eval_samples_per_second": 89.357,
"eval_steps_per_second": 11.171,
"step": 800
},
{
"epoch": 0.17205123303383674,
"grad_norm": 5.611992835998535,
"learning_rate": 4.828713439112981e-05,
"loss": 0.9056,
"step": 900
},
{
"epoch": 0.19116803670426305,
"grad_norm": 4.3008832931518555,
"learning_rate": 4.809596635442554e-05,
"loss": 0.8996,
"step": 1000
},
{
"epoch": 0.21028484037468936,
"grad_norm": 5.790643215179443,
"learning_rate": 4.790670999808832e-05,
"loss": 0.8965,
"step": 1100
},
{
"epoch": 0.22940164404511565,
"grad_norm": 5.138456344604492,
"learning_rate": 4.7715541961384055e-05,
"loss": 0.9044,
"step": 1200
},
{
"epoch": 0.22940164404511565,
"eval_loss": 0.8015913367271423,
"eval_runtime": 92.1486,
"eval_samples_per_second": 89.236,
"eval_steps_per_second": 11.156,
"step": 1200
},
{
"epoch": 0.24851844771554196,
"grad_norm": 5.13100004196167,
"learning_rate": 4.7524373924679795e-05,
"loss": 0.9073,
"step": 1300
},
{
"epoch": 0.2676352513859683,
"grad_norm": 4.568897724151611,
"learning_rate": 4.7333205887975535e-05,
"loss": 0.8891,
"step": 1400
},
{
"epoch": 0.28675205505639456,
"grad_norm": 4.8391337394714355,
"learning_rate": 4.714203785127127e-05,
"loss": 0.871,
"step": 1500
},
{
"epoch": 0.3058688587268209,
"grad_norm": 4.416493892669678,
"learning_rate": 4.695086981456701e-05,
"loss": 0.8974,
"step": 1600
},
{
"epoch": 0.3058688587268209,
"eval_loss": 0.8033304810523987,
"eval_runtime": 91.9998,
"eval_samples_per_second": 89.381,
"eval_steps_per_second": 11.174,
"step": 1600
},
{
"epoch": 0.3249856623972472,
"grad_norm": 4.519742012023926,
"learning_rate": 4.675970177786274e-05,
"loss": 0.8682,
"step": 1700
},
{
"epoch": 0.3441024660676735,
"grad_norm": 4.968471527099609,
"learning_rate": 4.657044542152552e-05,
"loss": 0.9351,
"step": 1800
},
{
"epoch": 0.3632192697380998,
"grad_norm": 5.676445960998535,
"learning_rate": 4.637927738482126e-05,
"loss": 0.8868,
"step": 1900
},
{
"epoch": 0.3823360734085261,
"grad_norm": 4.578829288482666,
"learning_rate": 4.6188109348117e-05,
"loss": 0.8726,
"step": 2000
},
{
"epoch": 0.3823360734085261,
"eval_loss": 0.7817397117614746,
"eval_runtime": 92.2617,
"eval_samples_per_second": 89.127,
"eval_steps_per_second": 11.142,
"step": 2000
},
{
"epoch": 0.4014528770789524,
"grad_norm": 5.0320892333984375,
"learning_rate": 4.599694131141273e-05,
"loss": 0.8714,
"step": 2100
},
{
"epoch": 0.4205696807493787,
"grad_norm": 5.464868545532227,
"learning_rate": 4.5805773274708466e-05,
"loss": 0.8674,
"step": 2200
},
{
"epoch": 0.439686484419805,
"grad_norm": 4.6338934898376465,
"learning_rate": 4.561460523800421e-05,
"loss": 0.851,
"step": 2300
},
{
"epoch": 0.4588032880902313,
"grad_norm": 4.83494234085083,
"learning_rate": 4.5423437201299945e-05,
"loss": 0.8855,
"step": 2400
},
{
"epoch": 0.4588032880902313,
"eval_loss": 0.7753578424453735,
"eval_runtime": 92.0087,
"eval_samples_per_second": 89.372,
"eval_steps_per_second": 11.173,
"step": 2400
},
{
"epoch": 0.47792009176065764,
"grad_norm": 4.239180564880371,
"learning_rate": 4.523226916459568e-05,
"loss": 0.8647,
"step": 2500
},
{
"epoch": 0.4970368954310839,
"grad_norm": 5.430279731750488,
"learning_rate": 4.504110112789142e-05,
"loss": 0.8601,
"step": 2600
},
{
"epoch": 0.5161536991015102,
"grad_norm": 4.269476890563965,
"learning_rate": 4.484993309118716e-05,
"loss": 0.8713,
"step": 2700
},
{
"epoch": 0.5352705027719366,
"grad_norm": 5.328524589538574,
"learning_rate": 4.465876505448289e-05,
"loss": 0.8462,
"step": 2800
},
{
"epoch": 0.5352705027719366,
"eval_loss": 0.763035237789154,
"eval_runtime": 92.1674,
"eval_samples_per_second": 89.218,
"eval_steps_per_second": 11.154,
"step": 2800
},
{
"epoch": 0.5543873064423628,
"grad_norm": 5.192692756652832,
"learning_rate": 4.446759701777863e-05,
"loss": 0.8335,
"step": 2900
},
{
"epoch": 0.5735041101127891,
"grad_norm": 4.487423419952393,
"learning_rate": 4.4276428981074365e-05,
"loss": 0.8294,
"step": 3000
},
{
"epoch": 0.5926209137832155,
"grad_norm": 4.573492527008057,
"learning_rate": 4.4085260944370104e-05,
"loss": 0.8579,
"step": 3100
},
{
"epoch": 0.6117377174536418,
"grad_norm": 4.045724868774414,
"learning_rate": 4.389409290766584e-05,
"loss": 0.8206,
"step": 3200
},
{
"epoch": 0.6117377174536418,
"eval_loss": 0.7534114122390747,
"eval_runtime": 92.1289,
"eval_samples_per_second": 89.255,
"eval_steps_per_second": 11.158,
"step": 3200
},
{
"epoch": 0.630854521124068,
"grad_norm": 4.9863386154174805,
"learning_rate": 4.370292487096158e-05,
"loss": 0.8268,
"step": 3300
},
{
"epoch": 0.6499713247944944,
"grad_norm": 4.475931644439697,
"learning_rate": 4.351175683425732e-05,
"loss": 0.8326,
"step": 3400
},
{
"epoch": 0.6690881284649207,
"grad_norm": 5.008608341217041,
"learning_rate": 4.332058879755305e-05,
"loss": 0.7823,
"step": 3500
},
{
"epoch": 0.688204932135347,
"grad_norm": 6.179344177246094,
"learning_rate": 4.312942076084879e-05,
"loss": 0.8181,
"step": 3600
},
{
"epoch": 0.688204932135347,
"eval_loss": 0.74337238073349,
"eval_runtime": 92.0867,
"eval_samples_per_second": 89.296,
"eval_steps_per_second": 11.163,
"step": 3600
},
{
"epoch": 0.7073217358057733,
"grad_norm": 6.242499351501465,
"learning_rate": 4.2938252724144524e-05,
"loss": 0.8082,
"step": 3700
},
{
"epoch": 0.7264385394761996,
"grad_norm": 4.346529006958008,
"learning_rate": 4.2747084687440263e-05,
"loss": 0.8274,
"step": 3800
},
{
"epoch": 0.7455553431466259,
"grad_norm": 3.4538333415985107,
"learning_rate": 4.2555916650735997e-05,
"loss": 0.812,
"step": 3900
},
{
"epoch": 0.7646721468170522,
"grad_norm": 5.477049350738525,
"learning_rate": 4.2364748614031736e-05,
"loss": 0.8291,
"step": 4000
},
{
"epoch": 0.7646721468170522,
"eval_loss": 0.8589261770248413,
"eval_runtime": 91.9418,
"eval_samples_per_second": 89.437,
"eval_steps_per_second": 11.181,
"step": 4000
},
{
"epoch": 0.7837889504874785,
"grad_norm": 6.062958717346191,
"learning_rate": 4.217358057732747e-05,
"loss": 0.8236,
"step": 4100
},
{
"epoch": 0.8029057541579048,
"grad_norm": 4.950747489929199,
"learning_rate": 4.198241254062321e-05,
"loss": 0.8356,
"step": 4200
},
{
"epoch": 0.8220225578283311,
"grad_norm": 4.48617696762085,
"learning_rate": 4.179124450391895e-05,
"loss": 0.778,
"step": 4300
},
{
"epoch": 0.8411393614987575,
"grad_norm": 4.147668361663818,
"learning_rate": 4.160007646721468e-05,
"loss": 0.8314,
"step": 4400
},
{
"epoch": 0.8411393614987575,
"eval_loss": 0.7325341105461121,
"eval_runtime": 91.9404,
"eval_samples_per_second": 89.438,
"eval_steps_per_second": 11.181,
"step": 4400
},
{
"epoch": 0.8602561651691837,
"grad_norm": 4.805129051208496,
"learning_rate": 4.1408908430510416e-05,
"loss": 0.841,
"step": 4500
},
{
"epoch": 0.87937296883961,
"grad_norm": 4.559420108795166,
"learning_rate": 4.121774039380616e-05,
"loss": 0.805,
"step": 4600
},
{
"epoch": 0.8984897725100364,
"grad_norm": 4.7745771408081055,
"learning_rate": 4.1026572357101895e-05,
"loss": 0.8243,
"step": 4700
},
{
"epoch": 0.9176065761804626,
"grad_norm": 4.034806251525879,
"learning_rate": 4.083540432039763e-05,
"loss": 0.8016,
"step": 4800
},
{
"epoch": 0.9176065761804626,
"eval_loss": 0.7278503775596619,
"eval_runtime": 91.908,
"eval_samples_per_second": 89.47,
"eval_steps_per_second": 11.185,
"step": 4800
},
{
"epoch": 0.9367233798508889,
"grad_norm": 4.074121952056885,
"learning_rate": 4.064423628369337e-05,
"loss": 0.8006,
"step": 4900
},
{
"epoch": 0.9558401835213153,
"grad_norm": 4.704626560211182,
"learning_rate": 4.045306824698911e-05,
"loss": 0.8106,
"step": 5000
},
{
"epoch": 0.9749569871917415,
"grad_norm": 4.858222961425781,
"learning_rate": 4.026190021028484e-05,
"loss": 0.8129,
"step": 5100
},
{
"epoch": 0.9940737908621678,
"grad_norm": 3.913759469985962,
"learning_rate": 4.0070732173580575e-05,
"loss": 0.7769,
"step": 5200
},
{
"epoch": 0.9940737908621678,
"eval_loss": 0.7124339938163757,
"eval_runtime": 92.1458,
"eval_samples_per_second": 89.239,
"eval_steps_per_second": 11.156,
"step": 5200
},
{
"epoch": 1.013190594532594,
"grad_norm": 4.4074015617370605,
"learning_rate": 3.987956413687632e-05,
"loss": 0.7881,
"step": 5300
},
{
"epoch": 1.0323073982030204,
"grad_norm": 4.406864643096924,
"learning_rate": 3.9688396100172054e-05,
"loss": 0.783,
"step": 5400
},
{
"epoch": 1.0514242018734468,
"grad_norm": 4.834721565246582,
"learning_rate": 3.949722806346779e-05,
"loss": 0.7832,
"step": 5500
},
{
"epoch": 1.070541005543873,
"grad_norm": 4.5118889808654785,
"learning_rate": 3.930606002676353e-05,
"loss": 0.7652,
"step": 5600
},
{
"epoch": 1.070541005543873,
"eval_loss": 0.7111669182777405,
"eval_runtime": 91.8955,
"eval_samples_per_second": 89.482,
"eval_steps_per_second": 11.187,
"step": 5600
},
{
"epoch": 1.0896578092142994,
"grad_norm": 4.746542930603027,
"learning_rate": 3.911489199005927e-05,
"loss": 0.773,
"step": 5700
},
{
"epoch": 1.1087746128847256,
"grad_norm": 4.428741455078125,
"learning_rate": 3.8923723953355e-05,
"loss": 0.78,
"step": 5800
},
{
"epoch": 1.127891416555152,
"grad_norm": 5.356833457946777,
"learning_rate": 3.8732555916650734e-05,
"loss": 0.7614,
"step": 5900
},
{
"epoch": 1.1470082202255782,
"grad_norm": 5.255131244659424,
"learning_rate": 3.8541387879946474e-05,
"loss": 0.7753,
"step": 6000
},
{
"epoch": 1.1470082202255782,
"eval_loss": 0.7003123164176941,
"eval_runtime": 91.7533,
"eval_samples_per_second": 89.621,
"eval_steps_per_second": 11.204,
"step": 6000
},
{
"epoch": 1.1661250238960046,
"grad_norm": 4.378373622894287,
"learning_rate": 3.8350219843242213e-05,
"loss": 0.7813,
"step": 6100
},
{
"epoch": 1.185241827566431,
"grad_norm": 5.40322208404541,
"learning_rate": 3.8159051806537947e-05,
"loss": 0.7519,
"step": 6200
},
{
"epoch": 1.2043586312368573,
"grad_norm": 4.921120643615723,
"learning_rate": 3.7967883769833686e-05,
"loss": 0.7649,
"step": 6300
},
{
"epoch": 1.2234754349072836,
"grad_norm": 4.296281814575195,
"learning_rate": 3.7778627413496465e-05,
"loss": 0.7948,
"step": 6400
},
{
"epoch": 1.2234754349072836,
"eval_loss": 0.707783579826355,
"eval_runtime": 91.9021,
"eval_samples_per_second": 89.476,
"eval_steps_per_second": 11.186,
"step": 6400
},
{
"epoch": 1.2425922385777097,
"grad_norm": 3.6409478187561035,
"learning_rate": 3.75874593767922e-05,
"loss": 0.7547,
"step": 6500
},
{
"epoch": 1.261709042248136,
"grad_norm": 4.5969672203063965,
"learning_rate": 3.739629134008794e-05,
"loss": 0.7419,
"step": 6600
},
{
"epoch": 1.2808258459185624,
"grad_norm": 6.831844806671143,
"learning_rate": 3.720703498375072e-05,
"loss": 0.7909,
"step": 6700
},
{
"epoch": 1.2999426495889888,
"grad_norm": 4.02253532409668,
"learning_rate": 3.701586694704646e-05,
"loss": 0.7693,
"step": 6800
},
{
"epoch": 1.2999426495889888,
"eval_loss": 0.6886795163154602,
"eval_runtime": 91.6808,
"eval_samples_per_second": 89.692,
"eval_steps_per_second": 11.213,
"step": 6800
},
{
"epoch": 1.319059453259415,
"grad_norm": 5.338057994842529,
"learning_rate": 3.682469891034219e-05,
"loss": 0.7468,
"step": 6900
},
{
"epoch": 1.3381762569298412,
"grad_norm": 4.327272415161133,
"learning_rate": 3.663353087363793e-05,
"loss": 0.7493,
"step": 7000
},
{
"epoch": 1.3572930606002676,
"grad_norm": 5.279138088226318,
"learning_rate": 3.644236283693367e-05,
"loss": 0.7527,
"step": 7100
},
{
"epoch": 1.376409864270694,
"grad_norm": 5.587660789489746,
"learning_rate": 3.62511948002294e-05,
"loss": 0.7456,
"step": 7200
},
{
"epoch": 1.376409864270694,
"eval_loss": 0.6874927282333374,
"eval_runtime": 91.6806,
"eval_samples_per_second": 89.692,
"eval_steps_per_second": 11.213,
"step": 7200
},
{
"epoch": 1.3955266679411202,
"grad_norm": 3.81733775138855,
"learning_rate": 3.606002676352514e-05,
"loss": 0.7551,
"step": 7300
},
{
"epoch": 1.4146434716115466,
"grad_norm": 4.612213611602783,
"learning_rate": 3.5868858726820876e-05,
"loss": 0.7616,
"step": 7400
},
{
"epoch": 1.433760275281973,
"grad_norm": 5.318126678466797,
"learning_rate": 3.5677690690116616e-05,
"loss": 0.7507,
"step": 7500
},
{
"epoch": 1.4528770789523993,
"grad_norm": 5.163857936859131,
"learning_rate": 3.548652265341235e-05,
"loss": 0.7435,
"step": 7600
},
{
"epoch": 1.4528770789523993,
"eval_loss": 0.6907040476799011,
"eval_runtime": 91.8652,
"eval_samples_per_second": 89.512,
"eval_steps_per_second": 11.19,
"step": 7600
},
{
"epoch": 1.4719938826228254,
"grad_norm": 4.82822322845459,
"learning_rate": 3.529535461670809e-05,
"loss": 0.7539,
"step": 7700
},
{
"epoch": 1.4911106862932517,
"grad_norm": 3.967336416244507,
"learning_rate": 3.510609826037087e-05,
"loss": 0.7339,
"step": 7800
},
{
"epoch": 1.510227489963678,
"grad_norm": 4.51738166809082,
"learning_rate": 3.49149302236666e-05,
"loss": 0.7473,
"step": 7900
},
{
"epoch": 1.5293442936341044,
"grad_norm": 5.371578693389893,
"learning_rate": 3.472376218696235e-05,
"loss": 0.759,
"step": 8000
},
{
"epoch": 1.5293442936341044,
"eval_loss": 0.6801463961601257,
"eval_runtime": 91.9431,
"eval_samples_per_second": 89.436,
"eval_steps_per_second": 11.181,
"step": 8000
},
{
"epoch": 1.5484610973045307,
"grad_norm": 5.212741851806641,
"learning_rate": 3.453259415025808e-05,
"loss": 0.746,
"step": 8100
},
{
"epoch": 1.5675779009749569,
"grad_norm": 3.754594564437866,
"learning_rate": 3.4341426113553814e-05,
"loss": 0.779,
"step": 8200
},
{
"epoch": 1.5866947046453834,
"grad_norm": 4.9265289306640625,
"learning_rate": 3.4150258076849553e-05,
"loss": 0.7602,
"step": 8300
},
{
"epoch": 1.6058115083158095,
"grad_norm": 4.226690769195557,
"learning_rate": 3.395909004014529e-05,
"loss": 0.7413,
"step": 8400
},
{
"epoch": 1.6058115083158095,
"eval_loss": 0.6831762194633484,
"eval_runtime": 92.496,
"eval_samples_per_second": 88.901,
"eval_steps_per_second": 11.114,
"step": 8400
},
{
"epoch": 1.6249283119862359,
"grad_norm": 4.135231018066406,
"learning_rate": 3.3767922003441027e-05,
"loss": 0.748,
"step": 8500
},
{
"epoch": 1.6440451156566622,
"grad_norm": 4.373616695404053,
"learning_rate": 3.357675396673676e-05,
"loss": 0.7212,
"step": 8600
},
{
"epoch": 1.6631619193270883,
"grad_norm": 3.8442435264587402,
"learning_rate": 3.33855859300325e-05,
"loss": 0.731,
"step": 8700
},
{
"epoch": 1.682278722997515,
"grad_norm": 5.096011638641357,
"learning_rate": 3.319441789332824e-05,
"loss": 0.742,
"step": 8800
},
{
"epoch": 1.682278722997515,
"eval_loss": 0.6838387250900269,
"eval_runtime": 92.5093,
"eval_samples_per_second": 88.888,
"eval_steps_per_second": 11.112,
"step": 8800
},
{
"epoch": 1.701395526667941,
"grad_norm": 3.780578851699829,
"learning_rate": 3.300324985662397e-05,
"loss": 0.742,
"step": 8900
},
{
"epoch": 1.7205123303383676,
"grad_norm": 4.875925064086914,
"learning_rate": 3.281208181991971e-05,
"loss": 0.7479,
"step": 9000
},
{
"epoch": 1.7396291340087937,
"grad_norm": 3.7036213874816895,
"learning_rate": 3.2620913783215446e-05,
"loss": 0.7228,
"step": 9100
},
{
"epoch": 1.75874593767922,
"grad_norm": 5.547360897064209,
"learning_rate": 3.2429745746511185e-05,
"loss": 0.7363,
"step": 9200
},
{
"epoch": 1.75874593767922,
"eval_loss": 0.6730713844299316,
"eval_runtime": 92.4366,
"eval_samples_per_second": 88.958,
"eval_steps_per_second": 11.121,
"step": 9200
},
{
"epoch": 1.7778627413496464,
"grad_norm": 3.721874475479126,
"learning_rate": 3.223857770980692e-05,
"loss": 0.73,
"step": 9300
},
{
"epoch": 1.7969795450200725,
"grad_norm": 4.086114406585693,
"learning_rate": 3.204740967310266e-05,
"loss": 0.7188,
"step": 9400
},
{
"epoch": 1.816096348690499,
"grad_norm": 5.272156238555908,
"learning_rate": 3.18562416363984e-05,
"loss": 0.7556,
"step": 9500
},
{
"epoch": 1.8352131523609252,
"grad_norm": 4.227740287780762,
"learning_rate": 3.166507359969413e-05,
"loss": 0.7355,
"step": 9600
},
{
"epoch": 1.8352131523609252,
"eval_loss": 0.6690217852592468,
"eval_runtime": 92.4428,
"eval_samples_per_second": 88.952,
"eval_steps_per_second": 11.12,
"step": 9600
},
{
"epoch": 1.8543299560313515,
"grad_norm": 4.812748432159424,
"learning_rate": 3.147390556298987e-05,
"loss": 0.7534,
"step": 9700
},
{
"epoch": 1.8734467597017779,
"grad_norm": 4.234578609466553,
"learning_rate": 3.1282737526285605e-05,
"loss": 0.7143,
"step": 9800
},
{
"epoch": 1.8925635633722042,
"grad_norm": 4.305205345153809,
"learning_rate": 3.1091569489581344e-05,
"loss": 0.7151,
"step": 9900
},
{
"epoch": 1.9116803670426306,
"grad_norm": 4.651333332061768,
"learning_rate": 3.090231313324412e-05,
"loss": 0.7097,
"step": 10000
},
{
"epoch": 1.9116803670426306,
"eval_loss": 0.664512038230896,
"eval_runtime": 92.6703,
"eval_samples_per_second": 88.734,
"eval_steps_per_second": 11.093,
"step": 10000
},
{
"epoch": 1.9307971707130567,
"grad_norm": 4.365225791931152,
"learning_rate": 3.071114509653986e-05,
"loss": 0.7095,
"step": 10100
},
{
"epoch": 1.9499139743834832,
"grad_norm": 4.273274898529053,
"learning_rate": 3.0519977059835596e-05,
"loss": 0.7283,
"step": 10200
},
{
"epoch": 1.9690307780539094,
"grad_norm": 4.034031391143799,
"learning_rate": 3.033072070349838e-05,
"loss": 0.7083,
"step": 10300
},
{
"epoch": 1.9881475817243357,
"grad_norm": 4.648432731628418,
"learning_rate": 3.013955266679411e-05,
"loss": 0.7019,
"step": 10400
},
{
"epoch": 1.9881475817243357,
"eval_loss": 0.658172607421875,
"eval_runtime": 92.5836,
"eval_samples_per_second": 88.817,
"eval_steps_per_second": 11.103,
"step": 10400
},
{
"epoch": 2.007264385394762,
"grad_norm": 3.510467290878296,
"learning_rate": 2.9948384630089848e-05,
"loss": 0.7183,
"step": 10500
},
{
"epoch": 2.026381189065188,
"grad_norm": 4.297295570373535,
"learning_rate": 2.9757216593385588e-05,
"loss": 0.701,
"step": 10600
},
{
"epoch": 2.0454979927356147,
"grad_norm": 5.070156097412109,
"learning_rate": 2.9566048556681324e-05,
"loss": 0.7029,
"step": 10700
},
{
"epoch": 2.064614796406041,
"grad_norm": 5.115049362182617,
"learning_rate": 2.937488051997706e-05,
"loss": 0.7023,
"step": 10800
},
{
"epoch": 2.064614796406041,
"eval_loss": 0.6589385867118835,
"eval_runtime": 92.7551,
"eval_samples_per_second": 88.653,
"eval_steps_per_second": 11.083,
"step": 10800
},
{
"epoch": 2.0837316000764674,
"grad_norm": 4.264118194580078,
"learning_rate": 2.9183712483272797e-05,
"loss": 0.701,
"step": 10900
},
{
"epoch": 2.1028484037468935,
"grad_norm": 4.804683208465576,
"learning_rate": 2.8992544446568537e-05,
"loss": 0.6865,
"step": 11000
},
{
"epoch": 2.1219652074173196,
"grad_norm": 3.3149337768554688,
"learning_rate": 2.8801376409864274e-05,
"loss": 0.7121,
"step": 11100
},
{
"epoch": 2.141082011087746,
"grad_norm": 4.628523349761963,
"learning_rate": 2.8610208373160007e-05,
"loss": 0.7095,
"step": 11200
},
{
"epoch": 2.141082011087746,
"eval_loss": 0.6505147218704224,
"eval_runtime": 92.4458,
"eval_samples_per_second": 88.949,
"eval_steps_per_second": 11.12,
"step": 11200
},
{
"epoch": 2.1601988147581723,
"grad_norm": 4.2497453689575195,
"learning_rate": 2.8419040336455743e-05,
"loss": 0.7078,
"step": 11300
},
{
"epoch": 2.179315618428599,
"grad_norm": 4.486359119415283,
"learning_rate": 2.8227872299751483e-05,
"loss": 0.7164,
"step": 11400
},
{
"epoch": 2.198432422099025,
"grad_norm": 4.553341388702393,
"learning_rate": 2.803670426304722e-05,
"loss": 0.6857,
"step": 11500
},
{
"epoch": 2.217549225769451,
"grad_norm": 4.612332344055176,
"learning_rate": 2.7845536226342956e-05,
"loss": 0.7088,
"step": 11600
},
{
"epoch": 2.217549225769451,
"eval_loss": 0.6430885195732117,
"eval_runtime": 92.4895,
"eval_samples_per_second": 88.907,
"eval_steps_per_second": 11.115,
"step": 11600
},
{
"epoch": 2.2366660294398777,
"grad_norm": 4.992730617523193,
"learning_rate": 2.7654368189638696e-05,
"loss": 0.6807,
"step": 11700
},
{
"epoch": 2.255782833110304,
"grad_norm": 4.852089881896973,
"learning_rate": 2.7463200152934433e-05,
"loss": 0.6944,
"step": 11800
},
{
"epoch": 2.2748996367807304,
"grad_norm": 4.18324089050293,
"learning_rate": 2.7272032116230166e-05,
"loss": 0.704,
"step": 11900
},
{
"epoch": 2.2940164404511565,
"grad_norm": 4.048402786254883,
"learning_rate": 2.7080864079525902e-05,
"loss": 0.7127,
"step": 12000
},
{
"epoch": 2.2940164404511565,
"eval_loss": 0.6457203030586243,
"eval_runtime": 92.3316,
"eval_samples_per_second": 89.059,
"eval_steps_per_second": 11.134,
"step": 12000
},
{
"epoch": 2.313133244121583,
"grad_norm": 4.407283306121826,
"learning_rate": 2.6889696042821642e-05,
"loss": 0.7022,
"step": 12100
},
{
"epoch": 2.332250047792009,
"grad_norm": 3.9950592517852783,
"learning_rate": 2.669852800611738e-05,
"loss": 0.7034,
"step": 12200
},
{
"epoch": 2.3513668514624353,
"grad_norm": 4.345687389373779,
"learning_rate": 2.6507359969413115e-05,
"loss": 0.6908,
"step": 12300
},
{
"epoch": 2.370483655132862,
"grad_norm": 4.338857173919678,
"learning_rate": 2.631619193270885e-05,
"loss": 0.6754,
"step": 12400
},
{
"epoch": 2.370483655132862,
"eval_loss": 0.6425340175628662,
"eval_runtime": 92.4505,
"eval_samples_per_second": 88.945,
"eval_steps_per_second": 11.119,
"step": 12400
},
{
"epoch": 2.389600458803288,
"grad_norm": 4.529644012451172,
"learning_rate": 2.6125023896004592e-05,
"loss": 0.6968,
"step": 12500
},
{
"epoch": 2.4087172624737145,
"grad_norm": 4.309901714324951,
"learning_rate": 2.593385585930033e-05,
"loss": 0.7247,
"step": 12600
},
{
"epoch": 2.4278340661441407,
"grad_norm": 4.750647068023682,
"learning_rate": 2.574268782259606e-05,
"loss": 0.6855,
"step": 12700
},
{
"epoch": 2.446950869814567,
"grad_norm": 3.7934632301330566,
"learning_rate": 2.5551519785891798e-05,
"loss": 0.6865,
"step": 12800
},
{
"epoch": 2.446950869814567,
"eval_loss": 0.6368651390075684,
"eval_runtime": 92.2827,
"eval_samples_per_second": 89.107,
"eval_steps_per_second": 11.14,
"step": 12800
},
{
"epoch": 2.4660676734849933,
"grad_norm": 4.074941158294678,
"learning_rate": 2.5360351749187538e-05,
"loss": 0.694,
"step": 12900
},
{
"epoch": 2.4851844771554195,
"grad_norm": 4.529365539550781,
"learning_rate": 2.5169183712483274e-05,
"loss": 0.69,
"step": 13000
},
{
"epoch": 2.504301280825846,
"grad_norm": 4.395044803619385,
"learning_rate": 2.4979927356146053e-05,
"loss": 0.6887,
"step": 13100
},
{
"epoch": 2.523418084496272,
"grad_norm": 8.110248565673828,
"learning_rate": 2.478875931944179e-05,
"loss": 0.7046,
"step": 13200
},
{
"epoch": 2.523418084496272,
"eval_loss": 0.6256079077720642,
"eval_runtime": 92.0493,
"eval_samples_per_second": 89.333,
"eval_steps_per_second": 11.168,
"step": 13200
},
{
"epoch": 2.5425348881666983,
"grad_norm": 3.641064167022705,
"learning_rate": 2.459759128273753e-05,
"loss": 0.6763,
"step": 13300
},
{
"epoch": 2.561651691837125,
"grad_norm": 3.8722896575927734,
"learning_rate": 2.4406423246033263e-05,
"loss": 0.6963,
"step": 13400
},
{
"epoch": 2.5807684955075514,
"grad_norm": 4.976208686828613,
"learning_rate": 2.4215255209329003e-05,
"loss": 0.675,
"step": 13500
},
{
"epoch": 2.5998852991779775,
"grad_norm": 4.116947650909424,
"learning_rate": 2.402408717262474e-05,
"loss": 0.6729,
"step": 13600
},
{
"epoch": 2.5998852991779775,
"eval_loss": 0.6305546164512634,
"eval_runtime": 92.2226,
"eval_samples_per_second": 89.165,
"eval_steps_per_second": 11.147,
"step": 13600
},
{
"epoch": 2.6190021028484036,
"grad_norm": 4.226246356964111,
"learning_rate": 2.3832919135920476e-05,
"loss": 0.7178,
"step": 13700
},
{
"epoch": 2.63811890651883,
"grad_norm": 3.8298568725585938,
"learning_rate": 2.3641751099216212e-05,
"loss": 0.6867,
"step": 13800
},
{
"epoch": 2.6572357101892563,
"grad_norm": 4.829805374145508,
"learning_rate": 2.345058306251195e-05,
"loss": 0.6816,
"step": 13900
},
{
"epoch": 2.6763525138596824,
"grad_norm": 4.9147210121154785,
"learning_rate": 2.3259415025807685e-05,
"loss": 0.6628,
"step": 14000
},
{
"epoch": 2.6763525138596824,
"eval_loss": 0.6306756138801575,
"eval_runtime": 92.2614,
"eval_samples_per_second": 89.127,
"eval_steps_per_second": 11.142,
"step": 14000
},
{
"epoch": 2.695469317530109,
"grad_norm": 4.762243270874023,
"learning_rate": 2.306824698910342e-05,
"loss": 0.7046,
"step": 14100
},
{
"epoch": 2.714586121200535,
"grad_norm": 3.3113512992858887,
"learning_rate": 2.287707895239916e-05,
"loss": 0.6755,
"step": 14200
},
{
"epoch": 2.7337029248709617,
"grad_norm": 4.240131855010986,
"learning_rate": 2.2685910915694898e-05,
"loss": 0.6827,
"step": 14300
},
{
"epoch": 2.752819728541388,
"grad_norm": 3.4899845123291016,
"learning_rate": 2.2494742878990635e-05,
"loss": 0.6789,
"step": 14400
},
{
"epoch": 2.752819728541388,
"eval_loss": 0.6250412464141846,
"eval_runtime": 92.2599,
"eval_samples_per_second": 89.129,
"eval_steps_per_second": 11.142,
"step": 14400
},
{
"epoch": 2.7719365322118144,
"grad_norm": 3.6331779956817627,
"learning_rate": 2.230357484228637e-05,
"loss": 0.6732,
"step": 14500
},
{
"epoch": 2.7910533358822405,
"grad_norm": 5.3023247718811035,
"learning_rate": 2.2112406805582108e-05,
"loss": 0.6705,
"step": 14600
},
{
"epoch": 2.8101701395526666,
"grad_norm": 4.467443943023682,
"learning_rate": 2.1921238768877844e-05,
"loss": 0.6857,
"step": 14700
},
{
"epoch": 2.829286943223093,
"grad_norm": 3.4010238647460938,
"learning_rate": 2.1730070732173584e-05,
"loss": 0.6876,
"step": 14800
},
{
"epoch": 2.829286943223093,
"eval_loss": 0.6198094487190247,
"eval_runtime": 92.347,
"eval_samples_per_second": 89.045,
"eval_steps_per_second": 11.132,
"step": 14800
},
{
"epoch": 2.8484037468935193,
"grad_norm": 4.248734951019287,
"learning_rate": 2.1538902695469317e-05,
"loss": 0.674,
"step": 14900
},
{
"epoch": 2.867520550563946,
"grad_norm": 4.063199520111084,
"learning_rate": 2.1347734658765057e-05,
"loss": 0.666,
"step": 15000
},
{
"epoch": 2.886637354234372,
"grad_norm": 4.015697002410889,
"learning_rate": 2.115656662206079e-05,
"loss": 0.6953,
"step": 15100
},
{
"epoch": 2.9057541579047985,
"grad_norm": 3.798788070678711,
"learning_rate": 2.096539858535653e-05,
"loss": 0.65,
"step": 15200
},
{
"epoch": 2.9057541579047985,
"eval_loss": 0.6209089756011963,
"eval_runtime": 92.3611,
"eval_samples_per_second": 89.031,
"eval_steps_per_second": 11.13,
"step": 15200
},
{
"epoch": 2.9248709615752246,
"grad_norm": 5.368408679962158,
"learning_rate": 2.0774230548652267e-05,
"loss": 0.6766,
"step": 15300
},
{
"epoch": 2.9439877652456508,
"grad_norm": 3.803342580795288,
"learning_rate": 2.0583062511948003e-05,
"loss": 0.6659,
"step": 15400
},
{
"epoch": 2.9631045689160773,
"grad_norm": 4.15940523147583,
"learning_rate": 2.039189447524374e-05,
"loss": 0.6636,
"step": 15500
},
{
"epoch": 2.9822213725865034,
"grad_norm": 4.552635192871094,
"learning_rate": 2.0200726438539476e-05,
"loss": 0.6731,
"step": 15600
},
{
"epoch": 2.9822213725865034,
"eval_loss": 0.6129796504974365,
"eval_runtime": 92.2051,
"eval_samples_per_second": 89.182,
"eval_steps_per_second": 11.149,
"step": 15600
},
{
"epoch": 3.00133817625693,
"grad_norm": 4.447234153747559,
"learning_rate": 2.0009558401835213e-05,
"loss": 0.6696,
"step": 15700
},
{
"epoch": 3.020454979927356,
"grad_norm": 4.5681376457214355,
"learning_rate": 1.9820302045497995e-05,
"loss": 0.6701,
"step": 15800
},
{
"epoch": 3.0395717835977822,
"grad_norm": 4.63778018951416,
"learning_rate": 1.962913400879373e-05,
"loss": 0.6498,
"step": 15900
},
{
"epoch": 3.058688587268209,
"grad_norm": 3.8129169940948486,
"learning_rate": 1.9437965972089468e-05,
"loss": 0.6514,
"step": 16000
},
{
"epoch": 3.058688587268209,
"eval_loss": 0.6256683468818665,
"eval_runtime": 92.6353,
"eval_samples_per_second": 88.767,
"eval_steps_per_second": 11.097,
"step": 16000
},
{
"epoch": 3.077805390938635,
"grad_norm": 4.221614837646484,
"learning_rate": 1.9246797935385204e-05,
"loss": 0.6745,
"step": 16100
},
{
"epoch": 3.0969221946090615,
"grad_norm": 4.324675559997559,
"learning_rate": 1.905562989868094e-05,
"loss": 0.6516,
"step": 16200
},
{
"epoch": 3.1160389982794876,
"grad_norm": 4.2093939781188965,
"learning_rate": 1.8864461861976677e-05,
"loss": 0.6574,
"step": 16300
},
{
"epoch": 3.135155801949914,
"grad_norm": 4.944237232208252,
"learning_rate": 1.8673293825272414e-05,
"loss": 0.6602,
"step": 16400
},
{
"epoch": 3.135155801949914,
"eval_loss": 0.6231346726417542,
"eval_runtime": 92.6789,
"eval_samples_per_second": 88.726,
"eval_steps_per_second": 11.092,
"step": 16400
},
{
"epoch": 3.1542726056203403,
"grad_norm": 4.590976238250732,
"learning_rate": 1.8482125788568154e-05,
"loss": 0.6662,
"step": 16500
},
{
"epoch": 3.1733894092907664,
"grad_norm": 4.595473289489746,
"learning_rate": 1.8290957751863887e-05,
"loss": 0.6294,
"step": 16600
},
{
"epoch": 3.192506212961193,
"grad_norm": 3.4412662982940674,
"learning_rate": 1.8099789715159627e-05,
"loss": 0.6558,
"step": 16700
},
{
"epoch": 3.211623016631619,
"grad_norm": 4.820471286773682,
"learning_rate": 1.7908621678455363e-05,
"loss": 0.6749,
"step": 16800
},
{
"epoch": 3.211623016631619,
"eval_loss": 0.6116614937782288,
"eval_runtime": 92.7709,
"eval_samples_per_second": 88.638,
"eval_steps_per_second": 11.081,
"step": 16800
},
{
"epoch": 3.2307398203020457,
"grad_norm": 5.879933834075928,
"learning_rate": 1.77174536417511e-05,
"loss": 0.6435,
"step": 16900
},
{
"epoch": 3.2498566239724718,
"grad_norm": 4.462230205535889,
"learning_rate": 1.752628560504684e-05,
"loss": 0.6691,
"step": 17000
},
{
"epoch": 3.268973427642898,
"grad_norm": 3.9079251289367676,
"learning_rate": 1.7335117568342573e-05,
"loss": 0.6684,
"step": 17100
},
{
"epoch": 3.2880902313133245,
"grad_norm": 3.6853411197662354,
"learning_rate": 1.7143949531638313e-05,
"loss": 0.6429,
"step": 17200
},
{
"epoch": 3.2880902313133245,
"eval_loss": 0.6121929883956909,
"eval_runtime": 92.7326,
"eval_samples_per_second": 88.674,
"eval_steps_per_second": 11.086,
"step": 17200
},
{
"epoch": 3.3072070349837506,
"grad_norm": 4.5890631675720215,
"learning_rate": 1.6952781494934046e-05,
"loss": 0.6608,
"step": 17300
},
{
"epoch": 3.326323838654177,
"grad_norm": 3.9099321365356445,
"learning_rate": 1.6761613458229786e-05,
"loss": 0.6515,
"step": 17400
},
{
"epoch": 3.3454406423246033,
"grad_norm": 8.615681648254395,
"learning_rate": 1.6570445421525522e-05,
"loss": 0.6516,
"step": 17500
},
{
"epoch": 3.36455744599503,
"grad_norm": 3.810173988342285,
"learning_rate": 1.637927738482126e-05,
"loss": 0.6488,
"step": 17600
},
{
"epoch": 3.36455744599503,
"eval_loss": 0.6148595809936523,
"eval_runtime": 92.7216,
"eval_samples_per_second": 88.685,
"eval_steps_per_second": 11.087,
"step": 17600
},
{
"epoch": 3.383674249665456,
"grad_norm": 4.097940444946289,
"learning_rate": 1.6188109348116995e-05,
"loss": 0.6441,
"step": 17700
},
{
"epoch": 3.402791053335882,
"grad_norm": 4.74275016784668,
"learning_rate": 1.5996941311412732e-05,
"loss": 0.6444,
"step": 17800
},
{
"epoch": 3.4219078570063086,
"grad_norm": 4.2954888343811035,
"learning_rate": 1.5805773274708468e-05,
"loss": 0.6474,
"step": 17900
},
{
"epoch": 3.4410246606767347,
"grad_norm": 4.689930438995361,
"learning_rate": 1.5614605238004208e-05,
"loss": 0.6341,
"step": 18000
},
{
"epoch": 3.4410246606767347,
"eval_loss": 0.6162819862365723,
"eval_runtime": 92.7406,
"eval_samples_per_second": 88.667,
"eval_steps_per_second": 11.085,
"step": 18000
},
{
"epoch": 3.4601414643471613,
"grad_norm": 4.592708110809326,
"learning_rate": 1.542343720129994e-05,
"loss": 0.6344,
"step": 18100
},
{
"epoch": 3.4792582680175874,
"grad_norm": 3.388826608657837,
"learning_rate": 1.5232269164595681e-05,
"loss": 0.6476,
"step": 18200
},
{
"epoch": 3.498375071688014,
"grad_norm": 3.1417880058288574,
"learning_rate": 1.5041101127891416e-05,
"loss": 0.63,
"step": 18300
},
{
"epoch": 3.51749187535844,
"grad_norm": 3.839583158493042,
"learning_rate": 1.4849933091187154e-05,
"loss": 0.6502,
"step": 18400
},
{
"epoch": 3.51749187535844,
"eval_loss": 0.6018521189689636,
"eval_runtime": 92.7633,
"eval_samples_per_second": 88.645,
"eval_steps_per_second": 11.082,
"step": 18400
},
{
"epoch": 3.536608679028866,
"grad_norm": 4.443102836608887,
"learning_rate": 1.4658765054482892e-05,
"loss": 0.615,
"step": 18500
},
{
"epoch": 3.555725482699293,
"grad_norm": 3.7104012966156006,
"learning_rate": 1.4467597017778627e-05,
"loss": 0.6362,
"step": 18600
},
{
"epoch": 3.574842286369719,
"grad_norm": 3.7938549518585205,
"learning_rate": 1.4276428981074365e-05,
"loss": 0.6351,
"step": 18700
},
{
"epoch": 3.593959090040145,
"grad_norm": 3.9377052783966064,
"learning_rate": 1.4085260944370102e-05,
"loss": 0.6388,
"step": 18800
},
{
"epoch": 3.593959090040145,
"eval_loss": 0.604111909866333,
"eval_runtime": 92.8027,
"eval_samples_per_second": 88.607,
"eval_steps_per_second": 11.077,
"step": 18800
},
{
"epoch": 3.6130758937105716,
"grad_norm": 4.241858959197998,
"learning_rate": 1.389409290766584e-05,
"loss": 0.6598,
"step": 18900
},
{
"epoch": 3.632192697380998,
"grad_norm": 4.486043453216553,
"learning_rate": 1.3702924870961575e-05,
"loss": 0.6225,
"step": 19000
},
{
"epoch": 3.6513095010514243,
"grad_norm": 4.468062877655029,
"learning_rate": 1.3511756834257313e-05,
"loss": 0.6401,
"step": 19100
},
{
"epoch": 3.6704263047218504,
"grad_norm": 4.349284648895264,
"learning_rate": 1.332058879755305e-05,
"loss": 0.6489,
"step": 19200
},
{
"epoch": 3.6704263047218504,
"eval_loss": 0.5988742113113403,
"eval_runtime": 92.7585,
"eval_samples_per_second": 88.65,
"eval_steps_per_second": 11.083,
"step": 19200
},
{
"epoch": 3.689543108392277,
"grad_norm": 4.928227424621582,
"learning_rate": 1.3129420760848788e-05,
"loss": 0.62,
"step": 19300
},
{
"epoch": 3.708659912062703,
"grad_norm": 3.8102471828460693,
"learning_rate": 1.2938252724144523e-05,
"loss": 0.6427,
"step": 19400
},
{
"epoch": 3.727776715733129,
"grad_norm": 4.022319316864014,
"learning_rate": 1.2747084687440261e-05,
"loss": 0.6461,
"step": 19500
},
{
"epoch": 3.7468935194035558,
"grad_norm": 4.785296440124512,
"learning_rate": 1.2555916650735996e-05,
"loss": 0.6502,
"step": 19600
},
{
"epoch": 3.7468935194035558,
"eval_loss": 0.5975850224494934,
"eval_runtime": 92.6855,
"eval_samples_per_second": 88.719,
"eval_steps_per_second": 11.091,
"step": 19600
},
{
"epoch": 3.766010323073982,
"grad_norm": 4.089471817016602,
"learning_rate": 1.2364748614031734e-05,
"loss": 0.6329,
"step": 19700
},
{
"epoch": 3.7851271267444084,
"grad_norm": 2.898491859436035,
"learning_rate": 1.2173580577327472e-05,
"loss": 0.6317,
"step": 19800
},
{
"epoch": 3.8042439304148346,
"grad_norm": 3.9998719692230225,
"learning_rate": 1.1982412540623209e-05,
"loss": 0.6184,
"step": 19900
},
{
"epoch": 3.823360734085261,
"grad_norm": 3.649463176727295,
"learning_rate": 1.1793156184285987e-05,
"loss": 0.6301,
"step": 20000
},
{
"epoch": 3.823360734085261,
"eval_loss": 0.6061282157897949,
"eval_runtime": 92.6598,
"eval_samples_per_second": 88.744,
"eval_steps_per_second": 11.094,
"step": 20000
},
{
"epoch": 3.8424775377556872,
"grad_norm": 4.067989349365234,
"learning_rate": 1.1601988147581724e-05,
"loss": 0.6425,
"step": 20100
},
{
"epoch": 3.8615943414261134,
"grad_norm": 3.7173011302948,
"learning_rate": 1.141082011087746e-05,
"loss": 0.6134,
"step": 20200
},
{
"epoch": 3.88071114509654,
"grad_norm": 4.036506175994873,
"learning_rate": 1.1219652074173199e-05,
"loss": 0.6612,
"step": 20300
},
{
"epoch": 3.899827948766966,
"grad_norm": 3.4378676414489746,
"learning_rate": 1.1028484037468937e-05,
"loss": 0.6194,
"step": 20400
},
{
"epoch": 3.899827948766966,
"eval_loss": 0.5860570669174194,
"eval_runtime": 92.6457,
"eval_samples_per_second": 88.757,
"eval_steps_per_second": 11.096,
"step": 20400
},
{
"epoch": 3.9189447524373926,
"grad_norm": 7.162832260131836,
"learning_rate": 1.0837316000764673e-05,
"loss": 0.6385,
"step": 20500
},
{
"epoch": 3.9380615561078187,
"grad_norm": 3.439091920852661,
"learning_rate": 1.064614796406041e-05,
"loss": 0.6243,
"step": 20600
},
{
"epoch": 3.9571783597782453,
"grad_norm": 3.7195284366607666,
"learning_rate": 1.0454979927356146e-05,
"loss": 0.6337,
"step": 20700
},
{
"epoch": 3.9762951634486714,
"grad_norm": 3.3584518432617188,
"learning_rate": 1.0263811890651883e-05,
"loss": 0.6352,
"step": 20800
},
{
"epoch": 3.9762951634486714,
"eval_loss": 0.5977619290351868,
"eval_runtime": 92.6632,
"eval_samples_per_second": 88.741,
"eval_steps_per_second": 11.094,
"step": 20800
},
{
"epoch": 3.9954119671190975,
"grad_norm": 5.517305850982666,
"learning_rate": 1.0072643853947621e-05,
"loss": 0.6241,
"step": 20900
},
{
"epoch": 4.014528770789524,
"grad_norm": 3.5819714069366455,
"learning_rate": 9.881475817243358e-06,
"loss": 0.6352,
"step": 21000
},
{
"epoch": 4.033645574459951,
"grad_norm": 3.967008352279663,
"learning_rate": 9.690307780539094e-06,
"loss": 0.617,
"step": 21100
},
{
"epoch": 4.052762378130376,
"grad_norm": 3.5766172409057617,
"learning_rate": 9.49913974383483e-06,
"loss": 0.628,
"step": 21200
},
{
"epoch": 4.052762378130376,
"eval_loss": 0.5928879380226135,
"eval_runtime": 92.586,
"eval_samples_per_second": 88.815,
"eval_steps_per_second": 11.103,
"step": 21200
},
{
"epoch": 4.071879181800803,
"grad_norm": 4.025076866149902,
"learning_rate": 9.307971707130567e-06,
"loss": 0.6238,
"step": 21300
},
{
"epoch": 4.0909959854712294,
"grad_norm": 5.096427917480469,
"learning_rate": 9.116803670426305e-06,
"loss": 0.6337,
"step": 21400
},
{
"epoch": 4.110112789141655,
"grad_norm": 5.223696708679199,
"learning_rate": 8.925635633722042e-06,
"loss": 0.6212,
"step": 21500
},
{
"epoch": 4.129229592812082,
"grad_norm": 4.151371479034424,
"learning_rate": 8.734467597017778e-06,
"loss": 0.6176,
"step": 21600
},
{
"epoch": 4.129229592812082,
"eval_loss": 0.5918228626251221,
"eval_runtime": 92.7104,
"eval_samples_per_second": 88.696,
"eval_steps_per_second": 11.088,
"step": 21600
},
{
"epoch": 4.148346396482508,
"grad_norm": 4.445927619934082,
"learning_rate": 8.543299560313515e-06,
"loss": 0.6322,
"step": 21700
},
{
"epoch": 4.167463200152935,
"grad_norm": 6.669031143188477,
"learning_rate": 8.352131523609253e-06,
"loss": 0.6307,
"step": 21800
},
{
"epoch": 4.1865800038233605,
"grad_norm": 4.0559186935424805,
"learning_rate": 8.16096348690499e-06,
"loss": 0.6323,
"step": 21900
},
{
"epoch": 4.205696807493787,
"grad_norm": 4.512356281280518,
"learning_rate": 7.969795450200728e-06,
"loss": 0.6385,
"step": 22000
},
{
"epoch": 4.205696807493787,
"eval_loss": 0.5929626226425171,
"eval_runtime": 92.4043,
"eval_samples_per_second": 88.989,
"eval_steps_per_second": 11.125,
"step": 22000
},
{
"epoch": 4.224813611164214,
"grad_norm": 4.109405517578125,
"learning_rate": 7.778627413496464e-06,
"loss": 0.6255,
"step": 22100
},
{
"epoch": 4.243930414834639,
"grad_norm": 4.0541486740112305,
"learning_rate": 7.587459376792201e-06,
"loss": 0.6198,
"step": 22200
},
{
"epoch": 4.263047218505066,
"grad_norm": 5.3996663093566895,
"learning_rate": 7.396291340087938e-06,
"loss": 0.6172,
"step": 22300
},
{
"epoch": 4.282164022175492,
"grad_norm": 4.728433609008789,
"learning_rate": 7.205123303383675e-06,
"loss": 0.6061,
"step": 22400
},
{
"epoch": 4.282164022175492,
"eval_loss": 0.5865157246589661,
"eval_runtime": 92.3573,
"eval_samples_per_second": 89.035,
"eval_steps_per_second": 11.131,
"step": 22400
},
{
"epoch": 4.301280825845919,
"grad_norm": 4.604154586791992,
"learning_rate": 7.013955266679411e-06,
"loss": 0.6441,
"step": 22500
},
{
"epoch": 4.320397629516345,
"grad_norm": 4.1287760734558105,
"learning_rate": 6.822787229975149e-06,
"loss": 0.625,
"step": 22600
},
{
"epoch": 4.339514433186771,
"grad_norm": 3.1182920932769775,
"learning_rate": 6.631619193270885e-06,
"loss": 0.5973,
"step": 22700
},
{
"epoch": 4.358631236857198,
"grad_norm": 4.751844882965088,
"learning_rate": 6.4404511565666225e-06,
"loss": 0.597,
"step": 22800
},
{
"epoch": 4.358631236857198,
"eval_loss": 0.5919764637947083,
"eval_runtime": 92.4705,
"eval_samples_per_second": 88.926,
"eval_steps_per_second": 11.117,
"step": 22800
},
{
"epoch": 4.3777480405276235,
"grad_norm": 5.184845924377441,
"learning_rate": 6.249283119862359e-06,
"loss": 0.6219,
"step": 22900
},
{
"epoch": 4.39686484419805,
"grad_norm": 4.108447551727295,
"learning_rate": 6.058115083158096e-06,
"loss": 0.6275,
"step": 23000
},
{
"epoch": 4.415981647868477,
"grad_norm": 3.9303929805755615,
"learning_rate": 5.866947046453833e-06,
"loss": 0.6213,
"step": 23100
},
{
"epoch": 4.435098451538902,
"grad_norm": 4.054929733276367,
"learning_rate": 5.67577900974957e-06,
"loss": 0.6313,
"step": 23200
},
{
"epoch": 4.435098451538902,
"eval_loss": 0.5812836289405823,
"eval_runtime": 92.6171,
"eval_samples_per_second": 88.785,
"eval_steps_per_second": 11.099,
"step": 23200
},
{
"epoch": 4.454215255209329,
"grad_norm": 4.04779052734375,
"learning_rate": 5.484610973045308e-06,
"loss": 0.6055,
"step": 23300
},
{
"epoch": 4.473332058879755,
"grad_norm": 4.373106956481934,
"learning_rate": 5.293442936341044e-06,
"loss": 0.6181,
"step": 23400
},
{
"epoch": 4.492448862550182,
"grad_norm": 3.912672758102417,
"learning_rate": 5.102274899636781e-06,
"loss": 0.6049,
"step": 23500
},
{
"epoch": 4.511565666220608,
"grad_norm": 4.924178123474121,
"learning_rate": 4.911106862932518e-06,
"loss": 0.6388,
"step": 23600
},
{
"epoch": 4.511565666220608,
"eval_loss": 0.585365891456604,
"eval_runtime": 92.598,
"eval_samples_per_second": 88.803,
"eval_steps_per_second": 11.102,
"step": 23600
},
{
"epoch": 4.530682469891034,
"grad_norm": 4.225689888000488,
"learning_rate": 4.7199388262282546e-06,
"loss": 0.5984,
"step": 23700
},
{
"epoch": 4.549799273561461,
"grad_norm": 3.848640203475952,
"learning_rate": 4.528770789523992e-06,
"loss": 0.6009,
"step": 23800
},
{
"epoch": 4.568916077231886,
"grad_norm": 3.6290130615234375,
"learning_rate": 4.3376027528197284e-06,
"loss": 0.6205,
"step": 23900
},
{
"epoch": 4.588032880902313,
"grad_norm": 5.409413814544678,
"learning_rate": 4.146434716115466e-06,
"loss": 0.6052,
"step": 24000
},
{
"epoch": 4.588032880902313,
"eval_loss": 0.5768113732337952,
"eval_runtime": 92.5622,
"eval_samples_per_second": 88.838,
"eval_steps_per_second": 11.106,
"step": 24000
},
{
"epoch": 4.6071496845727395,
"grad_norm": 4.062690258026123,
"learning_rate": 3.955266679411203e-06,
"loss": 0.6227,
"step": 24100
},
{
"epoch": 4.626266488243166,
"grad_norm": 6.228837490081787,
"learning_rate": 3.7640986427069397e-06,
"loss": 0.6104,
"step": 24200
},
{
"epoch": 4.645383291913592,
"grad_norm": 3.8039870262145996,
"learning_rate": 3.5729306060026766e-06,
"loss": 0.6087,
"step": 24300
},
{
"epoch": 4.664500095584018,
"grad_norm": 4.199521541595459,
"learning_rate": 3.3817625692984135e-06,
"loss": 0.6135,
"step": 24400
},
{
"epoch": 4.664500095584018,
"eval_loss": 0.5789579153060913,
"eval_runtime": 92.5954,
"eval_samples_per_second": 88.806,
"eval_steps_per_second": 11.102,
"step": 24400
},
{
"epoch": 4.683616899254445,
"grad_norm": 3.312234878540039,
"learning_rate": 3.19059453259415e-06,
"loss": 0.6108,
"step": 24500
},
{
"epoch": 4.702733702924871,
"grad_norm": 4.610132694244385,
"learning_rate": 2.9994264958898874e-06,
"loss": 0.5984,
"step": 24600
},
{
"epoch": 4.721850506595297,
"grad_norm": 4.196247100830078,
"learning_rate": 2.8082584591856244e-06,
"loss": 0.6249,
"step": 24700
},
{
"epoch": 4.740967310265724,
"grad_norm": 4.444230079650879,
"learning_rate": 2.6170904224813613e-06,
"loss": 0.6135,
"step": 24800
},
{
"epoch": 4.740967310265724,
"eval_loss": 0.5723977088928223,
"eval_runtime": 92.5878,
"eval_samples_per_second": 88.813,
"eval_steps_per_second": 11.103,
"step": 24800
},
{
"epoch": 4.76008411393615,
"grad_norm": 4.337975025177002,
"learning_rate": 2.4259223857770982e-06,
"loss": 0.6222,
"step": 24900
},
{
"epoch": 4.779200917606576,
"grad_norm": 4.261539459228516,
"learning_rate": 2.234754349072835e-06,
"loss": 0.5808,
"step": 25000
},
{
"epoch": 4.7983177212770025,
"grad_norm": 4.659415245056152,
"learning_rate": 2.043586312368572e-06,
"loss": 0.592,
"step": 25100
},
{
"epoch": 4.817434524947429,
"grad_norm": 4.005898952484131,
"learning_rate": 1.852418275664309e-06,
"loss": 0.6087,
"step": 25200
},
{
"epoch": 4.817434524947429,
"eval_loss": 0.5687017440795898,
"eval_runtime": 92.6047,
"eval_samples_per_second": 88.797,
"eval_steps_per_second": 11.101,
"step": 25200
},
{
"epoch": 4.836551328617855,
"grad_norm": 4.495694160461426,
"learning_rate": 1.6612502389600458e-06,
"loss": 0.6195,
"step": 25300
},
{
"epoch": 4.855668132288281,
"grad_norm": 5.030457019805908,
"learning_rate": 1.470082202255783e-06,
"loss": 0.6202,
"step": 25400
},
{
"epoch": 4.874784935958708,
"grad_norm": 4.42711877822876,
"learning_rate": 1.2789141655515199e-06,
"loss": 0.6007,
"step": 25500
},
{
"epoch": 4.893901739629134,
"grad_norm": 4.1595563888549805,
"learning_rate": 1.0877461288472568e-06,
"loss": 0.621,
"step": 25600
},
{
"epoch": 4.893901739629134,
"eval_loss": 0.5748383402824402,
"eval_runtime": 92.5734,
"eval_samples_per_second": 88.827,
"eval_steps_per_second": 11.105,
"step": 25600
},
{
"epoch": 4.91301854329956,
"grad_norm": 3.987473249435425,
"learning_rate": 8.965780921429937e-07,
"loss": 0.586,
"step": 25700
},
{
"epoch": 4.932135346969987,
"grad_norm": 3.4999001026153564,
"learning_rate": 7.054100554387307e-07,
"loss": 0.6105,
"step": 25800
},
{
"epoch": 4.951252150640413,
"grad_norm": 3.7822272777557373,
"learning_rate": 5.142420187344676e-07,
"loss": 0.6047,
"step": 25900
},
{
"epoch": 4.970368954310839,
"grad_norm": 3.8180148601531982,
"learning_rate": 3.2307398203020455e-07,
"loss": 0.6256,
"step": 26000
},
{
"epoch": 4.970368954310839,
"eval_loss": 0.5706872344017029,
"eval_runtime": 92.6466,
"eval_samples_per_second": 88.757,
"eval_steps_per_second": 11.096,
"step": 26000
},
{
"epoch": 4.9894857579812655,
"grad_norm": 3.5531647205352783,
"learning_rate": 1.319059453259415e-07,
"loss": 0.599,
"step": 26100
}
],
"logging_steps": 100,
"max_steps": 26155,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.446806122974282e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}