| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.0, | |
| "eval_steps": 400, | |
| "global_step": 26155, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.019116803670426306, | |
| "grad_norm": 6.148062705993652, | |
| "learning_rate": 4.981265532402982e-05, | |
| "loss": 1.0593, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.03823360734085261, | |
| "grad_norm": 4.97545051574707, | |
| "learning_rate": 4.9625310648059644e-05, | |
| "loss": 0.9839, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.05735041101127891, | |
| "grad_norm": 5.777927875518799, | |
| "learning_rate": 4.9434142611355384e-05, | |
| "loss": 0.9631, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.07646721468170523, | |
| "grad_norm": 6.401044845581055, | |
| "learning_rate": 4.924297457465112e-05, | |
| "loss": 0.9816, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.07646721468170523, | |
| "eval_loss": 0.8617107272148132, | |
| "eval_runtime": 92.0408, | |
| "eval_samples_per_second": 89.341, | |
| "eval_steps_per_second": 11.169, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.09558401835213153, | |
| "grad_norm": 5.193621635437012, | |
| "learning_rate": 4.905180653794686e-05, | |
| "loss": 0.9312, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.11470082202255782, | |
| "grad_norm": 5.454291343688965, | |
| "learning_rate": 4.88606385012426e-05, | |
| "loss": 0.9509, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.13381762569298414, | |
| "grad_norm": 6.0297112464904785, | |
| "learning_rate": 4.866947046453833e-05, | |
| "loss": 0.9374, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.15293442936341045, | |
| "grad_norm": 4.852521896362305, | |
| "learning_rate": 4.847830242783406e-05, | |
| "loss": 0.9135, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.15293442936341045, | |
| "eval_loss": 0.8287038207054138, | |
| "eval_runtime": 92.0241, | |
| "eval_samples_per_second": 89.357, | |
| "eval_steps_per_second": 11.171, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.17205123303383674, | |
| "grad_norm": 5.611992835998535, | |
| "learning_rate": 4.828713439112981e-05, | |
| "loss": 0.9056, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.19116803670426305, | |
| "grad_norm": 4.3008832931518555, | |
| "learning_rate": 4.809596635442554e-05, | |
| "loss": 0.8996, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.21028484037468936, | |
| "grad_norm": 5.790643215179443, | |
| "learning_rate": 4.790670999808832e-05, | |
| "loss": 0.8965, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.22940164404511565, | |
| "grad_norm": 5.138456344604492, | |
| "learning_rate": 4.7715541961384055e-05, | |
| "loss": 0.9044, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.22940164404511565, | |
| "eval_loss": 0.8015913367271423, | |
| "eval_runtime": 92.1486, | |
| "eval_samples_per_second": 89.236, | |
| "eval_steps_per_second": 11.156, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.24851844771554196, | |
| "grad_norm": 5.13100004196167, | |
| "learning_rate": 4.7524373924679795e-05, | |
| "loss": 0.9073, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.2676352513859683, | |
| "grad_norm": 4.568897724151611, | |
| "learning_rate": 4.7333205887975535e-05, | |
| "loss": 0.8891, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.28675205505639456, | |
| "grad_norm": 4.8391337394714355, | |
| "learning_rate": 4.714203785127127e-05, | |
| "loss": 0.871, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.3058688587268209, | |
| "grad_norm": 4.416493892669678, | |
| "learning_rate": 4.695086981456701e-05, | |
| "loss": 0.8974, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.3058688587268209, | |
| "eval_loss": 0.8033304810523987, | |
| "eval_runtime": 91.9998, | |
| "eval_samples_per_second": 89.381, | |
| "eval_steps_per_second": 11.174, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.3249856623972472, | |
| "grad_norm": 4.519742012023926, | |
| "learning_rate": 4.675970177786274e-05, | |
| "loss": 0.8682, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.3441024660676735, | |
| "grad_norm": 4.968471527099609, | |
| "learning_rate": 4.657044542152552e-05, | |
| "loss": 0.9351, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.3632192697380998, | |
| "grad_norm": 5.676445960998535, | |
| "learning_rate": 4.637927738482126e-05, | |
| "loss": 0.8868, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.3823360734085261, | |
| "grad_norm": 4.578829288482666, | |
| "learning_rate": 4.6188109348117e-05, | |
| "loss": 0.8726, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.3823360734085261, | |
| "eval_loss": 0.7817397117614746, | |
| "eval_runtime": 92.2617, | |
| "eval_samples_per_second": 89.127, | |
| "eval_steps_per_second": 11.142, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.4014528770789524, | |
| "grad_norm": 5.0320892333984375, | |
| "learning_rate": 4.599694131141273e-05, | |
| "loss": 0.8714, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.4205696807493787, | |
| "grad_norm": 5.464868545532227, | |
| "learning_rate": 4.5805773274708466e-05, | |
| "loss": 0.8674, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.439686484419805, | |
| "grad_norm": 4.6338934898376465, | |
| "learning_rate": 4.561460523800421e-05, | |
| "loss": 0.851, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.4588032880902313, | |
| "grad_norm": 4.83494234085083, | |
| "learning_rate": 4.5423437201299945e-05, | |
| "loss": 0.8855, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.4588032880902313, | |
| "eval_loss": 0.7753578424453735, | |
| "eval_runtime": 92.0087, | |
| "eval_samples_per_second": 89.372, | |
| "eval_steps_per_second": 11.173, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.47792009176065764, | |
| "grad_norm": 4.239180564880371, | |
| "learning_rate": 4.523226916459568e-05, | |
| "loss": 0.8647, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.4970368954310839, | |
| "grad_norm": 5.430279731750488, | |
| "learning_rate": 4.504110112789142e-05, | |
| "loss": 0.8601, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.5161536991015102, | |
| "grad_norm": 4.269476890563965, | |
| "learning_rate": 4.484993309118716e-05, | |
| "loss": 0.8713, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.5352705027719366, | |
| "grad_norm": 5.328524589538574, | |
| "learning_rate": 4.465876505448289e-05, | |
| "loss": 0.8462, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.5352705027719366, | |
| "eval_loss": 0.763035237789154, | |
| "eval_runtime": 92.1674, | |
| "eval_samples_per_second": 89.218, | |
| "eval_steps_per_second": 11.154, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.5543873064423628, | |
| "grad_norm": 5.192692756652832, | |
| "learning_rate": 4.446759701777863e-05, | |
| "loss": 0.8335, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.5735041101127891, | |
| "grad_norm": 4.487423419952393, | |
| "learning_rate": 4.4276428981074365e-05, | |
| "loss": 0.8294, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.5926209137832155, | |
| "grad_norm": 4.573492527008057, | |
| "learning_rate": 4.4085260944370104e-05, | |
| "loss": 0.8579, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.6117377174536418, | |
| "grad_norm": 4.045724868774414, | |
| "learning_rate": 4.389409290766584e-05, | |
| "loss": 0.8206, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.6117377174536418, | |
| "eval_loss": 0.7534114122390747, | |
| "eval_runtime": 92.1289, | |
| "eval_samples_per_second": 89.255, | |
| "eval_steps_per_second": 11.158, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.630854521124068, | |
| "grad_norm": 4.9863386154174805, | |
| "learning_rate": 4.370292487096158e-05, | |
| "loss": 0.8268, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.6499713247944944, | |
| "grad_norm": 4.475931644439697, | |
| "learning_rate": 4.351175683425732e-05, | |
| "loss": 0.8326, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.6690881284649207, | |
| "grad_norm": 5.008608341217041, | |
| "learning_rate": 4.332058879755305e-05, | |
| "loss": 0.7823, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.688204932135347, | |
| "grad_norm": 6.179344177246094, | |
| "learning_rate": 4.312942076084879e-05, | |
| "loss": 0.8181, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.688204932135347, | |
| "eval_loss": 0.74337238073349, | |
| "eval_runtime": 92.0867, | |
| "eval_samples_per_second": 89.296, | |
| "eval_steps_per_second": 11.163, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.7073217358057733, | |
| "grad_norm": 6.242499351501465, | |
| "learning_rate": 4.2938252724144524e-05, | |
| "loss": 0.8082, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.7264385394761996, | |
| "grad_norm": 4.346529006958008, | |
| "learning_rate": 4.2747084687440263e-05, | |
| "loss": 0.8274, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.7455553431466259, | |
| "grad_norm": 3.4538333415985107, | |
| "learning_rate": 4.2555916650735997e-05, | |
| "loss": 0.812, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.7646721468170522, | |
| "grad_norm": 5.477049350738525, | |
| "learning_rate": 4.2364748614031736e-05, | |
| "loss": 0.8291, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.7646721468170522, | |
| "eval_loss": 0.8589261770248413, | |
| "eval_runtime": 91.9418, | |
| "eval_samples_per_second": 89.437, | |
| "eval_steps_per_second": 11.181, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.7837889504874785, | |
| "grad_norm": 6.062958717346191, | |
| "learning_rate": 4.217358057732747e-05, | |
| "loss": 0.8236, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.8029057541579048, | |
| "grad_norm": 4.950747489929199, | |
| "learning_rate": 4.198241254062321e-05, | |
| "loss": 0.8356, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.8220225578283311, | |
| "grad_norm": 4.48617696762085, | |
| "learning_rate": 4.179124450391895e-05, | |
| "loss": 0.778, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.8411393614987575, | |
| "grad_norm": 4.147668361663818, | |
| "learning_rate": 4.160007646721468e-05, | |
| "loss": 0.8314, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.8411393614987575, | |
| "eval_loss": 0.7325341105461121, | |
| "eval_runtime": 91.9404, | |
| "eval_samples_per_second": 89.438, | |
| "eval_steps_per_second": 11.181, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.8602561651691837, | |
| "grad_norm": 4.805129051208496, | |
| "learning_rate": 4.1408908430510416e-05, | |
| "loss": 0.841, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.87937296883961, | |
| "grad_norm": 4.559420108795166, | |
| "learning_rate": 4.121774039380616e-05, | |
| "loss": 0.805, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.8984897725100364, | |
| "grad_norm": 4.7745771408081055, | |
| "learning_rate": 4.1026572357101895e-05, | |
| "loss": 0.8243, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.9176065761804626, | |
| "grad_norm": 4.034806251525879, | |
| "learning_rate": 4.083540432039763e-05, | |
| "loss": 0.8016, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.9176065761804626, | |
| "eval_loss": 0.7278503775596619, | |
| "eval_runtime": 91.908, | |
| "eval_samples_per_second": 89.47, | |
| "eval_steps_per_second": 11.185, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.9367233798508889, | |
| "grad_norm": 4.074121952056885, | |
| "learning_rate": 4.064423628369337e-05, | |
| "loss": 0.8006, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.9558401835213153, | |
| "grad_norm": 4.704626560211182, | |
| "learning_rate": 4.045306824698911e-05, | |
| "loss": 0.8106, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.9749569871917415, | |
| "grad_norm": 4.858222961425781, | |
| "learning_rate": 4.026190021028484e-05, | |
| "loss": 0.8129, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.9940737908621678, | |
| "grad_norm": 3.913759469985962, | |
| "learning_rate": 4.0070732173580575e-05, | |
| "loss": 0.7769, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.9940737908621678, | |
| "eval_loss": 0.7124339938163757, | |
| "eval_runtime": 92.1458, | |
| "eval_samples_per_second": 89.239, | |
| "eval_steps_per_second": 11.156, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.013190594532594, | |
| "grad_norm": 4.4074015617370605, | |
| "learning_rate": 3.987956413687632e-05, | |
| "loss": 0.7881, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.0323073982030204, | |
| "grad_norm": 4.406864643096924, | |
| "learning_rate": 3.9688396100172054e-05, | |
| "loss": 0.783, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.0514242018734468, | |
| "grad_norm": 4.834721565246582, | |
| "learning_rate": 3.949722806346779e-05, | |
| "loss": 0.7832, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.070541005543873, | |
| "grad_norm": 4.5118889808654785, | |
| "learning_rate": 3.930606002676353e-05, | |
| "loss": 0.7652, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.070541005543873, | |
| "eval_loss": 0.7111669182777405, | |
| "eval_runtime": 91.8955, | |
| "eval_samples_per_second": 89.482, | |
| "eval_steps_per_second": 11.187, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.0896578092142994, | |
| "grad_norm": 4.746542930603027, | |
| "learning_rate": 3.911489199005927e-05, | |
| "loss": 0.773, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.1087746128847256, | |
| "grad_norm": 4.428741455078125, | |
| "learning_rate": 3.8923723953355e-05, | |
| "loss": 0.78, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.127891416555152, | |
| "grad_norm": 5.356833457946777, | |
| "learning_rate": 3.8732555916650734e-05, | |
| "loss": 0.7614, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.1470082202255782, | |
| "grad_norm": 5.255131244659424, | |
| "learning_rate": 3.8541387879946474e-05, | |
| "loss": 0.7753, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.1470082202255782, | |
| "eval_loss": 0.7003123164176941, | |
| "eval_runtime": 91.7533, | |
| "eval_samples_per_second": 89.621, | |
| "eval_steps_per_second": 11.204, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.1661250238960046, | |
| "grad_norm": 4.378373622894287, | |
| "learning_rate": 3.8350219843242213e-05, | |
| "loss": 0.7813, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.185241827566431, | |
| "grad_norm": 5.40322208404541, | |
| "learning_rate": 3.8159051806537947e-05, | |
| "loss": 0.7519, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.2043586312368573, | |
| "grad_norm": 4.921120643615723, | |
| "learning_rate": 3.7967883769833686e-05, | |
| "loss": 0.7649, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.2234754349072836, | |
| "grad_norm": 4.296281814575195, | |
| "learning_rate": 3.7778627413496465e-05, | |
| "loss": 0.7948, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.2234754349072836, | |
| "eval_loss": 0.707783579826355, | |
| "eval_runtime": 91.9021, | |
| "eval_samples_per_second": 89.476, | |
| "eval_steps_per_second": 11.186, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.2425922385777097, | |
| "grad_norm": 3.6409478187561035, | |
| "learning_rate": 3.75874593767922e-05, | |
| "loss": 0.7547, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.261709042248136, | |
| "grad_norm": 4.5969672203063965, | |
| "learning_rate": 3.739629134008794e-05, | |
| "loss": 0.7419, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.2808258459185624, | |
| "grad_norm": 6.831844806671143, | |
| "learning_rate": 3.720703498375072e-05, | |
| "loss": 0.7909, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.2999426495889888, | |
| "grad_norm": 4.02253532409668, | |
| "learning_rate": 3.701586694704646e-05, | |
| "loss": 0.7693, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.2999426495889888, | |
| "eval_loss": 0.6886795163154602, | |
| "eval_runtime": 91.6808, | |
| "eval_samples_per_second": 89.692, | |
| "eval_steps_per_second": 11.213, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.319059453259415, | |
| "grad_norm": 5.338057994842529, | |
| "learning_rate": 3.682469891034219e-05, | |
| "loss": 0.7468, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.3381762569298412, | |
| "grad_norm": 4.327272415161133, | |
| "learning_rate": 3.663353087363793e-05, | |
| "loss": 0.7493, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.3572930606002676, | |
| "grad_norm": 5.279138088226318, | |
| "learning_rate": 3.644236283693367e-05, | |
| "loss": 0.7527, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.376409864270694, | |
| "grad_norm": 5.587660789489746, | |
| "learning_rate": 3.62511948002294e-05, | |
| "loss": 0.7456, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.376409864270694, | |
| "eval_loss": 0.6874927282333374, | |
| "eval_runtime": 91.6806, | |
| "eval_samples_per_second": 89.692, | |
| "eval_steps_per_second": 11.213, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.3955266679411202, | |
| "grad_norm": 3.81733775138855, | |
| "learning_rate": 3.606002676352514e-05, | |
| "loss": 0.7551, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 1.4146434716115466, | |
| "grad_norm": 4.612213611602783, | |
| "learning_rate": 3.5868858726820876e-05, | |
| "loss": 0.7616, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 1.433760275281973, | |
| "grad_norm": 5.318126678466797, | |
| "learning_rate": 3.5677690690116616e-05, | |
| "loss": 0.7507, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.4528770789523993, | |
| "grad_norm": 5.163857936859131, | |
| "learning_rate": 3.548652265341235e-05, | |
| "loss": 0.7435, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.4528770789523993, | |
| "eval_loss": 0.6907040476799011, | |
| "eval_runtime": 91.8652, | |
| "eval_samples_per_second": 89.512, | |
| "eval_steps_per_second": 11.19, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.4719938826228254, | |
| "grad_norm": 4.82822322845459, | |
| "learning_rate": 3.529535461670809e-05, | |
| "loss": 0.7539, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.4911106862932517, | |
| "grad_norm": 3.967336416244507, | |
| "learning_rate": 3.510609826037087e-05, | |
| "loss": 0.7339, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.510227489963678, | |
| "grad_norm": 4.51738166809082, | |
| "learning_rate": 3.49149302236666e-05, | |
| "loss": 0.7473, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.5293442936341044, | |
| "grad_norm": 5.371578693389893, | |
| "learning_rate": 3.472376218696235e-05, | |
| "loss": 0.759, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.5293442936341044, | |
| "eval_loss": 0.6801463961601257, | |
| "eval_runtime": 91.9431, | |
| "eval_samples_per_second": 89.436, | |
| "eval_steps_per_second": 11.181, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.5484610973045307, | |
| "grad_norm": 5.212741851806641, | |
| "learning_rate": 3.453259415025808e-05, | |
| "loss": 0.746, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.5675779009749569, | |
| "grad_norm": 3.754594564437866, | |
| "learning_rate": 3.4341426113553814e-05, | |
| "loss": 0.779, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.5866947046453834, | |
| "grad_norm": 4.9265289306640625, | |
| "learning_rate": 3.4150258076849553e-05, | |
| "loss": 0.7602, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.6058115083158095, | |
| "grad_norm": 4.226690769195557, | |
| "learning_rate": 3.395909004014529e-05, | |
| "loss": 0.7413, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.6058115083158095, | |
| "eval_loss": 0.6831762194633484, | |
| "eval_runtime": 92.496, | |
| "eval_samples_per_second": 88.901, | |
| "eval_steps_per_second": 11.114, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.6249283119862359, | |
| "grad_norm": 4.135231018066406, | |
| "learning_rate": 3.3767922003441027e-05, | |
| "loss": 0.748, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.6440451156566622, | |
| "grad_norm": 4.373616695404053, | |
| "learning_rate": 3.357675396673676e-05, | |
| "loss": 0.7212, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.6631619193270883, | |
| "grad_norm": 3.8442435264587402, | |
| "learning_rate": 3.33855859300325e-05, | |
| "loss": 0.731, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.682278722997515, | |
| "grad_norm": 5.096011638641357, | |
| "learning_rate": 3.319441789332824e-05, | |
| "loss": 0.742, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.682278722997515, | |
| "eval_loss": 0.6838387250900269, | |
| "eval_runtime": 92.5093, | |
| "eval_samples_per_second": 88.888, | |
| "eval_steps_per_second": 11.112, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.701395526667941, | |
| "grad_norm": 3.780578851699829, | |
| "learning_rate": 3.300324985662397e-05, | |
| "loss": 0.742, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.7205123303383676, | |
| "grad_norm": 4.875925064086914, | |
| "learning_rate": 3.281208181991971e-05, | |
| "loss": 0.7479, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.7396291340087937, | |
| "grad_norm": 3.7036213874816895, | |
| "learning_rate": 3.2620913783215446e-05, | |
| "loss": 0.7228, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.75874593767922, | |
| "grad_norm": 5.547360897064209, | |
| "learning_rate": 3.2429745746511185e-05, | |
| "loss": 0.7363, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.75874593767922, | |
| "eval_loss": 0.6730713844299316, | |
| "eval_runtime": 92.4366, | |
| "eval_samples_per_second": 88.958, | |
| "eval_steps_per_second": 11.121, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.7778627413496464, | |
| "grad_norm": 3.721874475479126, | |
| "learning_rate": 3.223857770980692e-05, | |
| "loss": 0.73, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.7969795450200725, | |
| "grad_norm": 4.086114406585693, | |
| "learning_rate": 3.204740967310266e-05, | |
| "loss": 0.7188, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.816096348690499, | |
| "grad_norm": 5.272156238555908, | |
| "learning_rate": 3.18562416363984e-05, | |
| "loss": 0.7556, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.8352131523609252, | |
| "grad_norm": 4.227740287780762, | |
| "learning_rate": 3.166507359969413e-05, | |
| "loss": 0.7355, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.8352131523609252, | |
| "eval_loss": 0.6690217852592468, | |
| "eval_runtime": 92.4428, | |
| "eval_samples_per_second": 88.952, | |
| "eval_steps_per_second": 11.12, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.8543299560313515, | |
| "grad_norm": 4.812748432159424, | |
| "learning_rate": 3.147390556298987e-05, | |
| "loss": 0.7534, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.8734467597017779, | |
| "grad_norm": 4.234578609466553, | |
| "learning_rate": 3.1282737526285605e-05, | |
| "loss": 0.7143, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.8925635633722042, | |
| "grad_norm": 4.305205345153809, | |
| "learning_rate": 3.1091569489581344e-05, | |
| "loss": 0.7151, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.9116803670426306, | |
| "grad_norm": 4.651333332061768, | |
| "learning_rate": 3.090231313324412e-05, | |
| "loss": 0.7097, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.9116803670426306, | |
| "eval_loss": 0.664512038230896, | |
| "eval_runtime": 92.6703, | |
| "eval_samples_per_second": 88.734, | |
| "eval_steps_per_second": 11.093, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.9307971707130567, | |
| "grad_norm": 4.365225791931152, | |
| "learning_rate": 3.071114509653986e-05, | |
| "loss": 0.7095, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 1.9499139743834832, | |
| "grad_norm": 4.273274898529053, | |
| "learning_rate": 3.0519977059835596e-05, | |
| "loss": 0.7283, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 1.9690307780539094, | |
| "grad_norm": 4.034031391143799, | |
| "learning_rate": 3.033072070349838e-05, | |
| "loss": 0.7083, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 1.9881475817243357, | |
| "grad_norm": 4.648432731628418, | |
| "learning_rate": 3.013955266679411e-05, | |
| "loss": 0.7019, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 1.9881475817243357, | |
| "eval_loss": 0.658172607421875, | |
| "eval_runtime": 92.5836, | |
| "eval_samples_per_second": 88.817, | |
| "eval_steps_per_second": 11.103, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 2.007264385394762, | |
| "grad_norm": 3.510467290878296, | |
| "learning_rate": 2.9948384630089848e-05, | |
| "loss": 0.7183, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 2.026381189065188, | |
| "grad_norm": 4.297295570373535, | |
| "learning_rate": 2.9757216593385588e-05, | |
| "loss": 0.701, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 2.0454979927356147, | |
| "grad_norm": 5.070156097412109, | |
| "learning_rate": 2.9566048556681324e-05, | |
| "loss": 0.7029, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 2.064614796406041, | |
| "grad_norm": 5.115049362182617, | |
| "learning_rate": 2.937488051997706e-05, | |
| "loss": 0.7023, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 2.064614796406041, | |
| "eval_loss": 0.6589385867118835, | |
| "eval_runtime": 92.7551, | |
| "eval_samples_per_second": 88.653, | |
| "eval_steps_per_second": 11.083, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 2.0837316000764674, | |
| "grad_norm": 4.264118194580078, | |
| "learning_rate": 2.9183712483272797e-05, | |
| "loss": 0.701, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 2.1028484037468935, | |
| "grad_norm": 4.804683208465576, | |
| "learning_rate": 2.8992544446568537e-05, | |
| "loss": 0.6865, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 2.1219652074173196, | |
| "grad_norm": 3.3149337768554688, | |
| "learning_rate": 2.8801376409864274e-05, | |
| "loss": 0.7121, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 2.141082011087746, | |
| "grad_norm": 4.628523349761963, | |
| "learning_rate": 2.8610208373160007e-05, | |
| "loss": 0.7095, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 2.141082011087746, | |
| "eval_loss": 0.6505147218704224, | |
| "eval_runtime": 92.4458, | |
| "eval_samples_per_second": 88.949, | |
| "eval_steps_per_second": 11.12, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 2.1601988147581723, | |
| "grad_norm": 4.2497453689575195, | |
| "learning_rate": 2.8419040336455743e-05, | |
| "loss": 0.7078, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 2.179315618428599, | |
| "grad_norm": 4.486359119415283, | |
| "learning_rate": 2.8227872299751483e-05, | |
| "loss": 0.7164, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 2.198432422099025, | |
| "grad_norm": 4.553341388702393, | |
| "learning_rate": 2.803670426304722e-05, | |
| "loss": 0.6857, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 2.217549225769451, | |
| "grad_norm": 4.612332344055176, | |
| "learning_rate": 2.7845536226342956e-05, | |
| "loss": 0.7088, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 2.217549225769451, | |
| "eval_loss": 0.6430885195732117, | |
| "eval_runtime": 92.4895, | |
| "eval_samples_per_second": 88.907, | |
| "eval_steps_per_second": 11.115, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 2.2366660294398777, | |
| "grad_norm": 4.992730617523193, | |
| "learning_rate": 2.7654368189638696e-05, | |
| "loss": 0.6807, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 2.255782833110304, | |
| "grad_norm": 4.852089881896973, | |
| "learning_rate": 2.7463200152934433e-05, | |
| "loss": 0.6944, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 2.2748996367807304, | |
| "grad_norm": 4.18324089050293, | |
| "learning_rate": 2.7272032116230166e-05, | |
| "loss": 0.704, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 2.2940164404511565, | |
| "grad_norm": 4.048402786254883, | |
| "learning_rate": 2.7080864079525902e-05, | |
| "loss": 0.7127, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 2.2940164404511565, | |
| "eval_loss": 0.6457203030586243, | |
| "eval_runtime": 92.3316, | |
| "eval_samples_per_second": 89.059, | |
| "eval_steps_per_second": 11.134, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 2.313133244121583, | |
| "grad_norm": 4.407283306121826, | |
| "learning_rate": 2.6889696042821642e-05, | |
| "loss": 0.7022, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 2.332250047792009, | |
| "grad_norm": 3.9950592517852783, | |
| "learning_rate": 2.669852800611738e-05, | |
| "loss": 0.7034, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 2.3513668514624353, | |
| "grad_norm": 4.345687389373779, | |
| "learning_rate": 2.6507359969413115e-05, | |
| "loss": 0.6908, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 2.370483655132862, | |
| "grad_norm": 4.338857173919678, | |
| "learning_rate": 2.631619193270885e-05, | |
| "loss": 0.6754, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 2.370483655132862, | |
| "eval_loss": 0.6425340175628662, | |
| "eval_runtime": 92.4505, | |
| "eval_samples_per_second": 88.945, | |
| "eval_steps_per_second": 11.119, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 2.389600458803288, | |
| "grad_norm": 4.529644012451172, | |
| "learning_rate": 2.6125023896004592e-05, | |
| "loss": 0.6968, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 2.4087172624737145, | |
| "grad_norm": 4.309901714324951, | |
| "learning_rate": 2.593385585930033e-05, | |
| "loss": 0.7247, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 2.4278340661441407, | |
| "grad_norm": 4.750647068023682, | |
| "learning_rate": 2.574268782259606e-05, | |
| "loss": 0.6855, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 2.446950869814567, | |
| "grad_norm": 3.7934632301330566, | |
| "learning_rate": 2.5551519785891798e-05, | |
| "loss": 0.6865, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 2.446950869814567, | |
| "eval_loss": 0.6368651390075684, | |
| "eval_runtime": 92.2827, | |
| "eval_samples_per_second": 89.107, | |
| "eval_steps_per_second": 11.14, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 2.4660676734849933, | |
| "grad_norm": 4.074941158294678, | |
| "learning_rate": 2.5360351749187538e-05, | |
| "loss": 0.694, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 2.4851844771554195, | |
| "grad_norm": 4.529365539550781, | |
| "learning_rate": 2.5169183712483274e-05, | |
| "loss": 0.69, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 2.504301280825846, | |
| "grad_norm": 4.395044803619385, | |
| "learning_rate": 2.4979927356146053e-05, | |
| "loss": 0.6887, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 2.523418084496272, | |
| "grad_norm": 8.110248565673828, | |
| "learning_rate": 2.478875931944179e-05, | |
| "loss": 0.7046, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 2.523418084496272, | |
| "eval_loss": 0.6256079077720642, | |
| "eval_runtime": 92.0493, | |
| "eval_samples_per_second": 89.333, | |
| "eval_steps_per_second": 11.168, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 2.5425348881666983, | |
| "grad_norm": 3.641064167022705, | |
| "learning_rate": 2.459759128273753e-05, | |
| "loss": 0.6763, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 2.561651691837125, | |
| "grad_norm": 3.8722896575927734, | |
| "learning_rate": 2.4406423246033263e-05, | |
| "loss": 0.6963, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 2.5807684955075514, | |
| "grad_norm": 4.976208686828613, | |
| "learning_rate": 2.4215255209329003e-05, | |
| "loss": 0.675, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 2.5998852991779775, | |
| "grad_norm": 4.116947650909424, | |
| "learning_rate": 2.402408717262474e-05, | |
| "loss": 0.6729, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 2.5998852991779775, | |
| "eval_loss": 0.6305546164512634, | |
| "eval_runtime": 92.2226, | |
| "eval_samples_per_second": 89.165, | |
| "eval_steps_per_second": 11.147, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 2.6190021028484036, | |
| "grad_norm": 4.226246356964111, | |
| "learning_rate": 2.3832919135920476e-05, | |
| "loss": 0.7178, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 2.63811890651883, | |
| "grad_norm": 3.8298568725585938, | |
| "learning_rate": 2.3641751099216212e-05, | |
| "loss": 0.6867, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 2.6572357101892563, | |
| "grad_norm": 4.829805374145508, | |
| "learning_rate": 2.345058306251195e-05, | |
| "loss": 0.6816, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 2.6763525138596824, | |
| "grad_norm": 4.9147210121154785, | |
| "learning_rate": 2.3259415025807685e-05, | |
| "loss": 0.6628, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.6763525138596824, | |
| "eval_loss": 0.6306756138801575, | |
| "eval_runtime": 92.2614, | |
| "eval_samples_per_second": 89.127, | |
| "eval_steps_per_second": 11.142, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.695469317530109, | |
| "grad_norm": 4.762243270874023, | |
| "learning_rate": 2.306824698910342e-05, | |
| "loss": 0.7046, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 2.714586121200535, | |
| "grad_norm": 3.3113512992858887, | |
| "learning_rate": 2.287707895239916e-05, | |
| "loss": 0.6755, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 2.7337029248709617, | |
| "grad_norm": 4.240131855010986, | |
| "learning_rate": 2.2685910915694898e-05, | |
| "loss": 0.6827, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 2.752819728541388, | |
| "grad_norm": 3.4899845123291016, | |
| "learning_rate": 2.2494742878990635e-05, | |
| "loss": 0.6789, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 2.752819728541388, | |
| "eval_loss": 0.6250412464141846, | |
| "eval_runtime": 92.2599, | |
| "eval_samples_per_second": 89.129, | |
| "eval_steps_per_second": 11.142, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 2.7719365322118144, | |
| "grad_norm": 3.6331779956817627, | |
| "learning_rate": 2.230357484228637e-05, | |
| "loss": 0.6732, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 2.7910533358822405, | |
| "grad_norm": 5.3023247718811035, | |
| "learning_rate": 2.2112406805582108e-05, | |
| "loss": 0.6705, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 2.8101701395526666, | |
| "grad_norm": 4.467443943023682, | |
| "learning_rate": 2.1921238768877844e-05, | |
| "loss": 0.6857, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 2.829286943223093, | |
| "grad_norm": 3.4010238647460938, | |
| "learning_rate": 2.1730070732173584e-05, | |
| "loss": 0.6876, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 2.829286943223093, | |
| "eval_loss": 0.6198094487190247, | |
| "eval_runtime": 92.347, | |
| "eval_samples_per_second": 89.045, | |
| "eval_steps_per_second": 11.132, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 2.8484037468935193, | |
| "grad_norm": 4.248734951019287, | |
| "learning_rate": 2.1538902695469317e-05, | |
| "loss": 0.674, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 2.867520550563946, | |
| "grad_norm": 4.063199520111084, | |
| "learning_rate": 2.1347734658765057e-05, | |
| "loss": 0.666, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 2.886637354234372, | |
| "grad_norm": 4.015697002410889, | |
| "learning_rate": 2.115656662206079e-05, | |
| "loss": 0.6953, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 2.9057541579047985, | |
| "grad_norm": 3.798788070678711, | |
| "learning_rate": 2.096539858535653e-05, | |
| "loss": 0.65, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 2.9057541579047985, | |
| "eval_loss": 0.6209089756011963, | |
| "eval_runtime": 92.3611, | |
| "eval_samples_per_second": 89.031, | |
| "eval_steps_per_second": 11.13, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 2.9248709615752246, | |
| "grad_norm": 5.368408679962158, | |
| "learning_rate": 2.0774230548652267e-05, | |
| "loss": 0.6766, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 2.9439877652456508, | |
| "grad_norm": 3.803342580795288, | |
| "learning_rate": 2.0583062511948003e-05, | |
| "loss": 0.6659, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 2.9631045689160773, | |
| "grad_norm": 4.15940523147583, | |
| "learning_rate": 2.039189447524374e-05, | |
| "loss": 0.6636, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 2.9822213725865034, | |
| "grad_norm": 4.552635192871094, | |
| "learning_rate": 2.0200726438539476e-05, | |
| "loss": 0.6731, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 2.9822213725865034, | |
| "eval_loss": 0.6129796504974365, | |
| "eval_runtime": 92.2051, | |
| "eval_samples_per_second": 89.182, | |
| "eval_steps_per_second": 11.149, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 3.00133817625693, | |
| "grad_norm": 4.447234153747559, | |
| "learning_rate": 2.0009558401835213e-05, | |
| "loss": 0.6696, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 3.020454979927356, | |
| "grad_norm": 4.5681376457214355, | |
| "learning_rate": 1.9820302045497995e-05, | |
| "loss": 0.6701, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 3.0395717835977822, | |
| "grad_norm": 4.63778018951416, | |
| "learning_rate": 1.962913400879373e-05, | |
| "loss": 0.6498, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 3.058688587268209, | |
| "grad_norm": 3.8129169940948486, | |
| "learning_rate": 1.9437965972089468e-05, | |
| "loss": 0.6514, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 3.058688587268209, | |
| "eval_loss": 0.6256683468818665, | |
| "eval_runtime": 92.6353, | |
| "eval_samples_per_second": 88.767, | |
| "eval_steps_per_second": 11.097, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 3.077805390938635, | |
| "grad_norm": 4.221614837646484, | |
| "learning_rate": 1.9246797935385204e-05, | |
| "loss": 0.6745, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 3.0969221946090615, | |
| "grad_norm": 4.324675559997559, | |
| "learning_rate": 1.905562989868094e-05, | |
| "loss": 0.6516, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 3.1160389982794876, | |
| "grad_norm": 4.2093939781188965, | |
| "learning_rate": 1.8864461861976677e-05, | |
| "loss": 0.6574, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 3.135155801949914, | |
| "grad_norm": 4.944237232208252, | |
| "learning_rate": 1.8673293825272414e-05, | |
| "loss": 0.6602, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 3.135155801949914, | |
| "eval_loss": 0.6231346726417542, | |
| "eval_runtime": 92.6789, | |
| "eval_samples_per_second": 88.726, | |
| "eval_steps_per_second": 11.092, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 3.1542726056203403, | |
| "grad_norm": 4.590976238250732, | |
| "learning_rate": 1.8482125788568154e-05, | |
| "loss": 0.6662, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 3.1733894092907664, | |
| "grad_norm": 4.595473289489746, | |
| "learning_rate": 1.8290957751863887e-05, | |
| "loss": 0.6294, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 3.192506212961193, | |
| "grad_norm": 3.4412662982940674, | |
| "learning_rate": 1.8099789715159627e-05, | |
| "loss": 0.6558, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 3.211623016631619, | |
| "grad_norm": 4.820471286773682, | |
| "learning_rate": 1.7908621678455363e-05, | |
| "loss": 0.6749, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 3.211623016631619, | |
| "eval_loss": 0.6116614937782288, | |
| "eval_runtime": 92.7709, | |
| "eval_samples_per_second": 88.638, | |
| "eval_steps_per_second": 11.081, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 3.2307398203020457, | |
| "grad_norm": 5.879933834075928, | |
| "learning_rate": 1.77174536417511e-05, | |
| "loss": 0.6435, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 3.2498566239724718, | |
| "grad_norm": 4.462230205535889, | |
| "learning_rate": 1.752628560504684e-05, | |
| "loss": 0.6691, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 3.268973427642898, | |
| "grad_norm": 3.9079251289367676, | |
| "learning_rate": 1.7335117568342573e-05, | |
| "loss": 0.6684, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 3.2880902313133245, | |
| "grad_norm": 3.6853411197662354, | |
| "learning_rate": 1.7143949531638313e-05, | |
| "loss": 0.6429, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 3.2880902313133245, | |
| "eval_loss": 0.6121929883956909, | |
| "eval_runtime": 92.7326, | |
| "eval_samples_per_second": 88.674, | |
| "eval_steps_per_second": 11.086, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 3.3072070349837506, | |
| "grad_norm": 4.5890631675720215, | |
| "learning_rate": 1.6952781494934046e-05, | |
| "loss": 0.6608, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 3.326323838654177, | |
| "grad_norm": 3.9099321365356445, | |
| "learning_rate": 1.6761613458229786e-05, | |
| "loss": 0.6515, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 3.3454406423246033, | |
| "grad_norm": 8.615681648254395, | |
| "learning_rate": 1.6570445421525522e-05, | |
| "loss": 0.6516, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 3.36455744599503, | |
| "grad_norm": 3.810173988342285, | |
| "learning_rate": 1.637927738482126e-05, | |
| "loss": 0.6488, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 3.36455744599503, | |
| "eval_loss": 0.6148595809936523, | |
| "eval_runtime": 92.7216, | |
| "eval_samples_per_second": 88.685, | |
| "eval_steps_per_second": 11.087, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 3.383674249665456, | |
| "grad_norm": 4.097940444946289, | |
| "learning_rate": 1.6188109348116995e-05, | |
| "loss": 0.6441, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 3.402791053335882, | |
| "grad_norm": 4.74275016784668, | |
| "learning_rate": 1.5996941311412732e-05, | |
| "loss": 0.6444, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 3.4219078570063086, | |
| "grad_norm": 4.2954888343811035, | |
| "learning_rate": 1.5805773274708468e-05, | |
| "loss": 0.6474, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 3.4410246606767347, | |
| "grad_norm": 4.689930438995361, | |
| "learning_rate": 1.5614605238004208e-05, | |
| "loss": 0.6341, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 3.4410246606767347, | |
| "eval_loss": 0.6162819862365723, | |
| "eval_runtime": 92.7406, | |
| "eval_samples_per_second": 88.667, | |
| "eval_steps_per_second": 11.085, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 3.4601414643471613, | |
| "grad_norm": 4.592708110809326, | |
| "learning_rate": 1.542343720129994e-05, | |
| "loss": 0.6344, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 3.4792582680175874, | |
| "grad_norm": 3.388826608657837, | |
| "learning_rate": 1.5232269164595681e-05, | |
| "loss": 0.6476, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 3.498375071688014, | |
| "grad_norm": 3.1417880058288574, | |
| "learning_rate": 1.5041101127891416e-05, | |
| "loss": 0.63, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 3.51749187535844, | |
| "grad_norm": 3.839583158493042, | |
| "learning_rate": 1.4849933091187154e-05, | |
| "loss": 0.6502, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 3.51749187535844, | |
| "eval_loss": 0.6018521189689636, | |
| "eval_runtime": 92.7633, | |
| "eval_samples_per_second": 88.645, | |
| "eval_steps_per_second": 11.082, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 3.536608679028866, | |
| "grad_norm": 4.443102836608887, | |
| "learning_rate": 1.4658765054482892e-05, | |
| "loss": 0.615, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 3.555725482699293, | |
| "grad_norm": 3.7104012966156006, | |
| "learning_rate": 1.4467597017778627e-05, | |
| "loss": 0.6362, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 3.574842286369719, | |
| "grad_norm": 3.7938549518585205, | |
| "learning_rate": 1.4276428981074365e-05, | |
| "loss": 0.6351, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 3.593959090040145, | |
| "grad_norm": 3.9377052783966064, | |
| "learning_rate": 1.4085260944370102e-05, | |
| "loss": 0.6388, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 3.593959090040145, | |
| "eval_loss": 0.604111909866333, | |
| "eval_runtime": 92.8027, | |
| "eval_samples_per_second": 88.607, | |
| "eval_steps_per_second": 11.077, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 3.6130758937105716, | |
| "grad_norm": 4.241858959197998, | |
| "learning_rate": 1.389409290766584e-05, | |
| "loss": 0.6598, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 3.632192697380998, | |
| "grad_norm": 4.486043453216553, | |
| "learning_rate": 1.3702924870961575e-05, | |
| "loss": 0.6225, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 3.6513095010514243, | |
| "grad_norm": 4.468062877655029, | |
| "learning_rate": 1.3511756834257313e-05, | |
| "loss": 0.6401, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 3.6704263047218504, | |
| "grad_norm": 4.349284648895264, | |
| "learning_rate": 1.332058879755305e-05, | |
| "loss": 0.6489, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 3.6704263047218504, | |
| "eval_loss": 0.5988742113113403, | |
| "eval_runtime": 92.7585, | |
| "eval_samples_per_second": 88.65, | |
| "eval_steps_per_second": 11.083, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 3.689543108392277, | |
| "grad_norm": 4.928227424621582, | |
| "learning_rate": 1.3129420760848788e-05, | |
| "loss": 0.62, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 3.708659912062703, | |
| "grad_norm": 3.8102471828460693, | |
| "learning_rate": 1.2938252724144523e-05, | |
| "loss": 0.6427, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 3.727776715733129, | |
| "grad_norm": 4.022319316864014, | |
| "learning_rate": 1.2747084687440261e-05, | |
| "loss": 0.6461, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 3.7468935194035558, | |
| "grad_norm": 4.785296440124512, | |
| "learning_rate": 1.2555916650735996e-05, | |
| "loss": 0.6502, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 3.7468935194035558, | |
| "eval_loss": 0.5975850224494934, | |
| "eval_runtime": 92.6855, | |
| "eval_samples_per_second": 88.719, | |
| "eval_steps_per_second": 11.091, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 3.766010323073982, | |
| "grad_norm": 4.089471817016602, | |
| "learning_rate": 1.2364748614031734e-05, | |
| "loss": 0.6329, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 3.7851271267444084, | |
| "grad_norm": 2.898491859436035, | |
| "learning_rate": 1.2173580577327472e-05, | |
| "loss": 0.6317, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 3.8042439304148346, | |
| "grad_norm": 3.9998719692230225, | |
| "learning_rate": 1.1982412540623209e-05, | |
| "loss": 0.6184, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 3.823360734085261, | |
| "grad_norm": 3.649463176727295, | |
| "learning_rate": 1.1793156184285987e-05, | |
| "loss": 0.6301, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 3.823360734085261, | |
| "eval_loss": 0.6061282157897949, | |
| "eval_runtime": 92.6598, | |
| "eval_samples_per_second": 88.744, | |
| "eval_steps_per_second": 11.094, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 3.8424775377556872, | |
| "grad_norm": 4.067989349365234, | |
| "learning_rate": 1.1601988147581724e-05, | |
| "loss": 0.6425, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 3.8615943414261134, | |
| "grad_norm": 3.7173011302948, | |
| "learning_rate": 1.141082011087746e-05, | |
| "loss": 0.6134, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 3.88071114509654, | |
| "grad_norm": 4.036506175994873, | |
| "learning_rate": 1.1219652074173199e-05, | |
| "loss": 0.6612, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 3.899827948766966, | |
| "grad_norm": 3.4378676414489746, | |
| "learning_rate": 1.1028484037468937e-05, | |
| "loss": 0.6194, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 3.899827948766966, | |
| "eval_loss": 0.5860570669174194, | |
| "eval_runtime": 92.6457, | |
| "eval_samples_per_second": 88.757, | |
| "eval_steps_per_second": 11.096, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 3.9189447524373926, | |
| "grad_norm": 7.162832260131836, | |
| "learning_rate": 1.0837316000764673e-05, | |
| "loss": 0.6385, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 3.9380615561078187, | |
| "grad_norm": 3.439091920852661, | |
| "learning_rate": 1.064614796406041e-05, | |
| "loss": 0.6243, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 3.9571783597782453, | |
| "grad_norm": 3.7195284366607666, | |
| "learning_rate": 1.0454979927356146e-05, | |
| "loss": 0.6337, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 3.9762951634486714, | |
| "grad_norm": 3.3584518432617188, | |
| "learning_rate": 1.0263811890651883e-05, | |
| "loss": 0.6352, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 3.9762951634486714, | |
| "eval_loss": 0.5977619290351868, | |
| "eval_runtime": 92.6632, | |
| "eval_samples_per_second": 88.741, | |
| "eval_steps_per_second": 11.094, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 3.9954119671190975, | |
| "grad_norm": 5.517305850982666, | |
| "learning_rate": 1.0072643853947621e-05, | |
| "loss": 0.6241, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 4.014528770789524, | |
| "grad_norm": 3.5819714069366455, | |
| "learning_rate": 9.881475817243358e-06, | |
| "loss": 0.6352, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 4.033645574459951, | |
| "grad_norm": 3.967008352279663, | |
| "learning_rate": 9.690307780539094e-06, | |
| "loss": 0.617, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 4.052762378130376, | |
| "grad_norm": 3.5766172409057617, | |
| "learning_rate": 9.49913974383483e-06, | |
| "loss": 0.628, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 4.052762378130376, | |
| "eval_loss": 0.5928879380226135, | |
| "eval_runtime": 92.586, | |
| "eval_samples_per_second": 88.815, | |
| "eval_steps_per_second": 11.103, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 4.071879181800803, | |
| "grad_norm": 4.025076866149902, | |
| "learning_rate": 9.307971707130567e-06, | |
| "loss": 0.6238, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 4.0909959854712294, | |
| "grad_norm": 5.096427917480469, | |
| "learning_rate": 9.116803670426305e-06, | |
| "loss": 0.6337, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 4.110112789141655, | |
| "grad_norm": 5.223696708679199, | |
| "learning_rate": 8.925635633722042e-06, | |
| "loss": 0.6212, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 4.129229592812082, | |
| "grad_norm": 4.151371479034424, | |
| "learning_rate": 8.734467597017778e-06, | |
| "loss": 0.6176, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 4.129229592812082, | |
| "eval_loss": 0.5918228626251221, | |
| "eval_runtime": 92.7104, | |
| "eval_samples_per_second": 88.696, | |
| "eval_steps_per_second": 11.088, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 4.148346396482508, | |
| "grad_norm": 4.445927619934082, | |
| "learning_rate": 8.543299560313515e-06, | |
| "loss": 0.6322, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 4.167463200152935, | |
| "grad_norm": 6.669031143188477, | |
| "learning_rate": 8.352131523609253e-06, | |
| "loss": 0.6307, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 4.1865800038233605, | |
| "grad_norm": 4.0559186935424805, | |
| "learning_rate": 8.16096348690499e-06, | |
| "loss": 0.6323, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 4.205696807493787, | |
| "grad_norm": 4.512356281280518, | |
| "learning_rate": 7.969795450200728e-06, | |
| "loss": 0.6385, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 4.205696807493787, | |
| "eval_loss": 0.5929626226425171, | |
| "eval_runtime": 92.4043, | |
| "eval_samples_per_second": 88.989, | |
| "eval_steps_per_second": 11.125, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 4.224813611164214, | |
| "grad_norm": 4.109405517578125, | |
| "learning_rate": 7.778627413496464e-06, | |
| "loss": 0.6255, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 4.243930414834639, | |
| "grad_norm": 4.0541486740112305, | |
| "learning_rate": 7.587459376792201e-06, | |
| "loss": 0.6198, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 4.263047218505066, | |
| "grad_norm": 5.3996663093566895, | |
| "learning_rate": 7.396291340087938e-06, | |
| "loss": 0.6172, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 4.282164022175492, | |
| "grad_norm": 4.728433609008789, | |
| "learning_rate": 7.205123303383675e-06, | |
| "loss": 0.6061, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 4.282164022175492, | |
| "eval_loss": 0.5865157246589661, | |
| "eval_runtime": 92.3573, | |
| "eval_samples_per_second": 89.035, | |
| "eval_steps_per_second": 11.131, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 4.301280825845919, | |
| "grad_norm": 4.604154586791992, | |
| "learning_rate": 7.013955266679411e-06, | |
| "loss": 0.6441, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 4.320397629516345, | |
| "grad_norm": 4.1287760734558105, | |
| "learning_rate": 6.822787229975149e-06, | |
| "loss": 0.625, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 4.339514433186771, | |
| "grad_norm": 3.1182920932769775, | |
| "learning_rate": 6.631619193270885e-06, | |
| "loss": 0.5973, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 4.358631236857198, | |
| "grad_norm": 4.751844882965088, | |
| "learning_rate": 6.4404511565666225e-06, | |
| "loss": 0.597, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 4.358631236857198, | |
| "eval_loss": 0.5919764637947083, | |
| "eval_runtime": 92.4705, | |
| "eval_samples_per_second": 88.926, | |
| "eval_steps_per_second": 11.117, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 4.3777480405276235, | |
| "grad_norm": 5.184845924377441, | |
| "learning_rate": 6.249283119862359e-06, | |
| "loss": 0.6219, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 4.39686484419805, | |
| "grad_norm": 4.108447551727295, | |
| "learning_rate": 6.058115083158096e-06, | |
| "loss": 0.6275, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 4.415981647868477, | |
| "grad_norm": 3.9303929805755615, | |
| "learning_rate": 5.866947046453833e-06, | |
| "loss": 0.6213, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 4.435098451538902, | |
| "grad_norm": 4.054929733276367, | |
| "learning_rate": 5.67577900974957e-06, | |
| "loss": 0.6313, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 4.435098451538902, | |
| "eval_loss": 0.5812836289405823, | |
| "eval_runtime": 92.6171, | |
| "eval_samples_per_second": 88.785, | |
| "eval_steps_per_second": 11.099, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 4.454215255209329, | |
| "grad_norm": 4.04779052734375, | |
| "learning_rate": 5.484610973045308e-06, | |
| "loss": 0.6055, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 4.473332058879755, | |
| "grad_norm": 4.373106956481934, | |
| "learning_rate": 5.293442936341044e-06, | |
| "loss": 0.6181, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 4.492448862550182, | |
| "grad_norm": 3.912672758102417, | |
| "learning_rate": 5.102274899636781e-06, | |
| "loss": 0.6049, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 4.511565666220608, | |
| "grad_norm": 4.924178123474121, | |
| "learning_rate": 4.911106862932518e-06, | |
| "loss": 0.6388, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 4.511565666220608, | |
| "eval_loss": 0.585365891456604, | |
| "eval_runtime": 92.598, | |
| "eval_samples_per_second": 88.803, | |
| "eval_steps_per_second": 11.102, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 4.530682469891034, | |
| "grad_norm": 4.225689888000488, | |
| "learning_rate": 4.7199388262282546e-06, | |
| "loss": 0.5984, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 4.549799273561461, | |
| "grad_norm": 3.848640203475952, | |
| "learning_rate": 4.528770789523992e-06, | |
| "loss": 0.6009, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 4.568916077231886, | |
| "grad_norm": 3.6290130615234375, | |
| "learning_rate": 4.3376027528197284e-06, | |
| "loss": 0.6205, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 4.588032880902313, | |
| "grad_norm": 5.409413814544678, | |
| "learning_rate": 4.146434716115466e-06, | |
| "loss": 0.6052, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 4.588032880902313, | |
| "eval_loss": 0.5768113732337952, | |
| "eval_runtime": 92.5622, | |
| "eval_samples_per_second": 88.838, | |
| "eval_steps_per_second": 11.106, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 4.6071496845727395, | |
| "grad_norm": 4.062690258026123, | |
| "learning_rate": 3.955266679411203e-06, | |
| "loss": 0.6227, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 4.626266488243166, | |
| "grad_norm": 6.228837490081787, | |
| "learning_rate": 3.7640986427069397e-06, | |
| "loss": 0.6104, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 4.645383291913592, | |
| "grad_norm": 3.8039870262145996, | |
| "learning_rate": 3.5729306060026766e-06, | |
| "loss": 0.6087, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 4.664500095584018, | |
| "grad_norm": 4.199521541595459, | |
| "learning_rate": 3.3817625692984135e-06, | |
| "loss": 0.6135, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 4.664500095584018, | |
| "eval_loss": 0.5789579153060913, | |
| "eval_runtime": 92.5954, | |
| "eval_samples_per_second": 88.806, | |
| "eval_steps_per_second": 11.102, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 4.683616899254445, | |
| "grad_norm": 3.312234878540039, | |
| "learning_rate": 3.19059453259415e-06, | |
| "loss": 0.6108, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 4.702733702924871, | |
| "grad_norm": 4.610132694244385, | |
| "learning_rate": 2.9994264958898874e-06, | |
| "loss": 0.5984, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 4.721850506595297, | |
| "grad_norm": 4.196247100830078, | |
| "learning_rate": 2.8082584591856244e-06, | |
| "loss": 0.6249, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 4.740967310265724, | |
| "grad_norm": 4.444230079650879, | |
| "learning_rate": 2.6170904224813613e-06, | |
| "loss": 0.6135, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 4.740967310265724, | |
| "eval_loss": 0.5723977088928223, | |
| "eval_runtime": 92.5878, | |
| "eval_samples_per_second": 88.813, | |
| "eval_steps_per_second": 11.103, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 4.76008411393615, | |
| "grad_norm": 4.337975025177002, | |
| "learning_rate": 2.4259223857770982e-06, | |
| "loss": 0.6222, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 4.779200917606576, | |
| "grad_norm": 4.261539459228516, | |
| "learning_rate": 2.234754349072835e-06, | |
| "loss": 0.5808, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 4.7983177212770025, | |
| "grad_norm": 4.659415245056152, | |
| "learning_rate": 2.043586312368572e-06, | |
| "loss": 0.592, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 4.817434524947429, | |
| "grad_norm": 4.005898952484131, | |
| "learning_rate": 1.852418275664309e-06, | |
| "loss": 0.6087, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 4.817434524947429, | |
| "eval_loss": 0.5687017440795898, | |
| "eval_runtime": 92.6047, | |
| "eval_samples_per_second": 88.797, | |
| "eval_steps_per_second": 11.101, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 4.836551328617855, | |
| "grad_norm": 4.495694160461426, | |
| "learning_rate": 1.6612502389600458e-06, | |
| "loss": 0.6195, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 4.855668132288281, | |
| "grad_norm": 5.030457019805908, | |
| "learning_rate": 1.470082202255783e-06, | |
| "loss": 0.6202, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 4.874784935958708, | |
| "grad_norm": 4.42711877822876, | |
| "learning_rate": 1.2789141655515199e-06, | |
| "loss": 0.6007, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 4.893901739629134, | |
| "grad_norm": 4.1595563888549805, | |
| "learning_rate": 1.0877461288472568e-06, | |
| "loss": 0.621, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 4.893901739629134, | |
| "eval_loss": 0.5748383402824402, | |
| "eval_runtime": 92.5734, | |
| "eval_samples_per_second": 88.827, | |
| "eval_steps_per_second": 11.105, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 4.91301854329956, | |
| "grad_norm": 3.987473249435425, | |
| "learning_rate": 8.965780921429937e-07, | |
| "loss": 0.586, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 4.932135346969987, | |
| "grad_norm": 3.4999001026153564, | |
| "learning_rate": 7.054100554387307e-07, | |
| "loss": 0.6105, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 4.951252150640413, | |
| "grad_norm": 3.7822272777557373, | |
| "learning_rate": 5.142420187344676e-07, | |
| "loss": 0.6047, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 4.970368954310839, | |
| "grad_norm": 3.8180148601531982, | |
| "learning_rate": 3.2307398203020455e-07, | |
| "loss": 0.6256, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 4.970368954310839, | |
| "eval_loss": 0.5706872344017029, | |
| "eval_runtime": 92.6466, | |
| "eval_samples_per_second": 88.757, | |
| "eval_steps_per_second": 11.096, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 4.9894857579812655, | |
| "grad_norm": 3.5531647205352783, | |
| "learning_rate": 1.319059453259415e-07, | |
| "loss": 0.599, | |
| "step": 26100 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 26155, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.446806122974282e+16, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |