{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 400, "global_step": 26155, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.019116803670426306, "grad_norm": 6.148062705993652, "learning_rate": 4.981265532402982e-05, "loss": 1.0593, "step": 100 }, { "epoch": 0.03823360734085261, "grad_norm": 4.97545051574707, "learning_rate": 4.9625310648059644e-05, "loss": 0.9839, "step": 200 }, { "epoch": 0.05735041101127891, "grad_norm": 5.777927875518799, "learning_rate": 4.9434142611355384e-05, "loss": 0.9631, "step": 300 }, { "epoch": 0.07646721468170523, "grad_norm": 6.401044845581055, "learning_rate": 4.924297457465112e-05, "loss": 0.9816, "step": 400 }, { "epoch": 0.07646721468170523, "eval_loss": 0.8617107272148132, "eval_runtime": 92.0408, "eval_samples_per_second": 89.341, "eval_steps_per_second": 11.169, "step": 400 }, { "epoch": 0.09558401835213153, "grad_norm": 5.193621635437012, "learning_rate": 4.905180653794686e-05, "loss": 0.9312, "step": 500 }, { "epoch": 0.11470082202255782, "grad_norm": 5.454291343688965, "learning_rate": 4.88606385012426e-05, "loss": 0.9509, "step": 600 }, { "epoch": 0.13381762569298414, "grad_norm": 6.0297112464904785, "learning_rate": 4.866947046453833e-05, "loss": 0.9374, "step": 700 }, { "epoch": 0.15293442936341045, "grad_norm": 4.852521896362305, "learning_rate": 4.847830242783406e-05, "loss": 0.9135, "step": 800 }, { "epoch": 0.15293442936341045, "eval_loss": 0.8287038207054138, "eval_runtime": 92.0241, "eval_samples_per_second": 89.357, "eval_steps_per_second": 11.171, "step": 800 }, { "epoch": 0.17205123303383674, "grad_norm": 5.611992835998535, "learning_rate": 4.828713439112981e-05, "loss": 0.9056, "step": 900 }, { "epoch": 0.19116803670426305, "grad_norm": 4.3008832931518555, "learning_rate": 4.809596635442554e-05, "loss": 0.8996, "step": 1000 }, { "epoch": 0.21028484037468936, "grad_norm": 5.790643215179443, "learning_rate": 4.790670999808832e-05, "loss": 0.8965, "step": 1100 }, { "epoch": 0.22940164404511565, "grad_norm": 5.138456344604492, "learning_rate": 4.7715541961384055e-05, "loss": 0.9044, "step": 1200 }, { "epoch": 0.22940164404511565, "eval_loss": 0.8015913367271423, "eval_runtime": 92.1486, "eval_samples_per_second": 89.236, "eval_steps_per_second": 11.156, "step": 1200 }, { "epoch": 0.24851844771554196, "grad_norm": 5.13100004196167, "learning_rate": 4.7524373924679795e-05, "loss": 0.9073, "step": 1300 }, { "epoch": 0.2676352513859683, "grad_norm": 4.568897724151611, "learning_rate": 4.7333205887975535e-05, "loss": 0.8891, "step": 1400 }, { "epoch": 0.28675205505639456, "grad_norm": 4.8391337394714355, "learning_rate": 4.714203785127127e-05, "loss": 0.871, "step": 1500 }, { "epoch": 0.3058688587268209, "grad_norm": 4.416493892669678, "learning_rate": 4.695086981456701e-05, "loss": 0.8974, "step": 1600 }, { "epoch": 0.3058688587268209, "eval_loss": 0.8033304810523987, "eval_runtime": 91.9998, "eval_samples_per_second": 89.381, "eval_steps_per_second": 11.174, "step": 1600 }, { "epoch": 0.3249856623972472, "grad_norm": 4.519742012023926, "learning_rate": 4.675970177786274e-05, "loss": 0.8682, "step": 1700 }, { "epoch": 0.3441024660676735, "grad_norm": 4.968471527099609, "learning_rate": 4.657044542152552e-05, "loss": 0.9351, "step": 1800 }, { "epoch": 0.3632192697380998, "grad_norm": 5.676445960998535, "learning_rate": 4.637927738482126e-05, "loss": 0.8868, "step": 1900 }, { "epoch": 0.3823360734085261, "grad_norm": 4.578829288482666, "learning_rate": 4.6188109348117e-05, "loss": 0.8726, "step": 2000 }, { "epoch": 0.3823360734085261, "eval_loss": 0.7817397117614746, "eval_runtime": 92.2617, "eval_samples_per_second": 89.127, "eval_steps_per_second": 11.142, "step": 2000 }, { "epoch": 0.4014528770789524, "grad_norm": 5.0320892333984375, "learning_rate": 4.599694131141273e-05, "loss": 0.8714, "step": 2100 }, { "epoch": 0.4205696807493787, "grad_norm": 5.464868545532227, "learning_rate": 4.5805773274708466e-05, "loss": 0.8674, "step": 2200 }, { "epoch": 0.439686484419805, "grad_norm": 4.6338934898376465, "learning_rate": 4.561460523800421e-05, "loss": 0.851, "step": 2300 }, { "epoch": 0.4588032880902313, "grad_norm": 4.83494234085083, "learning_rate": 4.5423437201299945e-05, "loss": 0.8855, "step": 2400 }, { "epoch": 0.4588032880902313, "eval_loss": 0.7753578424453735, "eval_runtime": 92.0087, "eval_samples_per_second": 89.372, "eval_steps_per_second": 11.173, "step": 2400 }, { "epoch": 0.47792009176065764, "grad_norm": 4.239180564880371, "learning_rate": 4.523226916459568e-05, "loss": 0.8647, "step": 2500 }, { "epoch": 0.4970368954310839, "grad_norm": 5.430279731750488, "learning_rate": 4.504110112789142e-05, "loss": 0.8601, "step": 2600 }, { "epoch": 0.5161536991015102, "grad_norm": 4.269476890563965, "learning_rate": 4.484993309118716e-05, "loss": 0.8713, "step": 2700 }, { "epoch": 0.5352705027719366, "grad_norm": 5.328524589538574, "learning_rate": 4.465876505448289e-05, "loss": 0.8462, "step": 2800 }, { "epoch": 0.5352705027719366, "eval_loss": 0.763035237789154, "eval_runtime": 92.1674, "eval_samples_per_second": 89.218, "eval_steps_per_second": 11.154, "step": 2800 }, { "epoch": 0.5543873064423628, "grad_norm": 5.192692756652832, "learning_rate": 4.446759701777863e-05, "loss": 0.8335, "step": 2900 }, { "epoch": 0.5735041101127891, "grad_norm": 4.487423419952393, "learning_rate": 4.4276428981074365e-05, "loss": 0.8294, "step": 3000 }, { "epoch": 0.5926209137832155, "grad_norm": 4.573492527008057, "learning_rate": 4.4085260944370104e-05, "loss": 0.8579, "step": 3100 }, { "epoch": 0.6117377174536418, "grad_norm": 4.045724868774414, "learning_rate": 4.389409290766584e-05, "loss": 0.8206, "step": 3200 }, { "epoch": 0.6117377174536418, "eval_loss": 0.7534114122390747, "eval_runtime": 92.1289, "eval_samples_per_second": 89.255, "eval_steps_per_second": 11.158, "step": 3200 }, { "epoch": 0.630854521124068, "grad_norm": 4.9863386154174805, "learning_rate": 4.370292487096158e-05, "loss": 0.8268, "step": 3300 }, { "epoch": 0.6499713247944944, "grad_norm": 4.475931644439697, "learning_rate": 4.351175683425732e-05, "loss": 0.8326, "step": 3400 }, { "epoch": 0.6690881284649207, "grad_norm": 5.008608341217041, "learning_rate": 4.332058879755305e-05, "loss": 0.7823, "step": 3500 }, { "epoch": 0.688204932135347, "grad_norm": 6.179344177246094, "learning_rate": 4.312942076084879e-05, "loss": 0.8181, "step": 3600 }, { "epoch": 0.688204932135347, "eval_loss": 0.74337238073349, "eval_runtime": 92.0867, "eval_samples_per_second": 89.296, "eval_steps_per_second": 11.163, "step": 3600 }, { "epoch": 0.7073217358057733, "grad_norm": 6.242499351501465, "learning_rate": 4.2938252724144524e-05, "loss": 0.8082, "step": 3700 }, { "epoch": 0.7264385394761996, "grad_norm": 4.346529006958008, "learning_rate": 4.2747084687440263e-05, "loss": 0.8274, "step": 3800 }, { "epoch": 0.7455553431466259, "grad_norm": 3.4538333415985107, "learning_rate": 4.2555916650735997e-05, "loss": 0.812, "step": 3900 }, { "epoch": 0.7646721468170522, "grad_norm": 5.477049350738525, "learning_rate": 4.2364748614031736e-05, "loss": 0.8291, "step": 4000 }, { "epoch": 0.7646721468170522, "eval_loss": 0.8589261770248413, "eval_runtime": 91.9418, "eval_samples_per_second": 89.437, "eval_steps_per_second": 11.181, "step": 4000 }, { "epoch": 0.7837889504874785, "grad_norm": 6.062958717346191, "learning_rate": 4.217358057732747e-05, "loss": 0.8236, "step": 4100 }, { "epoch": 0.8029057541579048, "grad_norm": 4.950747489929199, "learning_rate": 4.198241254062321e-05, "loss": 0.8356, "step": 4200 }, { "epoch": 0.8220225578283311, "grad_norm": 4.48617696762085, "learning_rate": 4.179124450391895e-05, "loss": 0.778, "step": 4300 }, { "epoch": 0.8411393614987575, "grad_norm": 4.147668361663818, "learning_rate": 4.160007646721468e-05, "loss": 0.8314, "step": 4400 }, { "epoch": 0.8411393614987575, "eval_loss": 0.7325341105461121, "eval_runtime": 91.9404, "eval_samples_per_second": 89.438, "eval_steps_per_second": 11.181, "step": 4400 }, { "epoch": 0.8602561651691837, "grad_norm": 4.805129051208496, "learning_rate": 4.1408908430510416e-05, "loss": 0.841, "step": 4500 }, { "epoch": 0.87937296883961, "grad_norm": 4.559420108795166, "learning_rate": 4.121774039380616e-05, "loss": 0.805, "step": 4600 }, { "epoch": 0.8984897725100364, "grad_norm": 4.7745771408081055, "learning_rate": 4.1026572357101895e-05, "loss": 0.8243, "step": 4700 }, { "epoch": 0.9176065761804626, "grad_norm": 4.034806251525879, "learning_rate": 4.083540432039763e-05, "loss": 0.8016, "step": 4800 }, { "epoch": 0.9176065761804626, "eval_loss": 0.7278503775596619, "eval_runtime": 91.908, "eval_samples_per_second": 89.47, "eval_steps_per_second": 11.185, "step": 4800 }, { "epoch": 0.9367233798508889, "grad_norm": 4.074121952056885, "learning_rate": 4.064423628369337e-05, "loss": 0.8006, "step": 4900 }, { "epoch": 0.9558401835213153, "grad_norm": 4.704626560211182, "learning_rate": 4.045306824698911e-05, "loss": 0.8106, "step": 5000 }, { "epoch": 0.9749569871917415, "grad_norm": 4.858222961425781, "learning_rate": 4.026190021028484e-05, "loss": 0.8129, "step": 5100 }, { "epoch": 0.9940737908621678, "grad_norm": 3.913759469985962, "learning_rate": 4.0070732173580575e-05, "loss": 0.7769, "step": 5200 }, { "epoch": 0.9940737908621678, "eval_loss": 0.7124339938163757, "eval_runtime": 92.1458, "eval_samples_per_second": 89.239, "eval_steps_per_second": 11.156, "step": 5200 }, { "epoch": 1.013190594532594, "grad_norm": 4.4074015617370605, "learning_rate": 3.987956413687632e-05, "loss": 0.7881, "step": 5300 }, { "epoch": 1.0323073982030204, "grad_norm": 4.406864643096924, "learning_rate": 3.9688396100172054e-05, "loss": 0.783, "step": 5400 }, { "epoch": 1.0514242018734468, "grad_norm": 4.834721565246582, "learning_rate": 3.949722806346779e-05, "loss": 0.7832, "step": 5500 }, { "epoch": 1.070541005543873, "grad_norm": 4.5118889808654785, "learning_rate": 3.930606002676353e-05, "loss": 0.7652, "step": 5600 }, { "epoch": 1.070541005543873, "eval_loss": 0.7111669182777405, "eval_runtime": 91.8955, "eval_samples_per_second": 89.482, "eval_steps_per_second": 11.187, "step": 5600 }, { "epoch": 1.0896578092142994, "grad_norm": 4.746542930603027, "learning_rate": 3.911489199005927e-05, "loss": 0.773, "step": 5700 }, { "epoch": 1.1087746128847256, "grad_norm": 4.428741455078125, "learning_rate": 3.8923723953355e-05, "loss": 0.78, "step": 5800 }, { "epoch": 1.127891416555152, "grad_norm": 5.356833457946777, "learning_rate": 3.8732555916650734e-05, "loss": 0.7614, "step": 5900 }, { "epoch": 1.1470082202255782, "grad_norm": 5.255131244659424, "learning_rate": 3.8541387879946474e-05, "loss": 0.7753, "step": 6000 }, { "epoch": 1.1470082202255782, "eval_loss": 0.7003123164176941, "eval_runtime": 91.7533, "eval_samples_per_second": 89.621, "eval_steps_per_second": 11.204, "step": 6000 }, { "epoch": 1.1661250238960046, "grad_norm": 4.378373622894287, "learning_rate": 3.8350219843242213e-05, "loss": 0.7813, "step": 6100 }, { "epoch": 1.185241827566431, "grad_norm": 5.40322208404541, "learning_rate": 3.8159051806537947e-05, "loss": 0.7519, "step": 6200 }, { "epoch": 1.2043586312368573, "grad_norm": 4.921120643615723, "learning_rate": 3.7967883769833686e-05, "loss": 0.7649, "step": 6300 }, { "epoch": 1.2234754349072836, "grad_norm": 4.296281814575195, "learning_rate": 3.7778627413496465e-05, "loss": 0.7948, "step": 6400 }, { "epoch": 1.2234754349072836, "eval_loss": 0.707783579826355, "eval_runtime": 91.9021, "eval_samples_per_second": 89.476, "eval_steps_per_second": 11.186, "step": 6400 }, { "epoch": 1.2425922385777097, "grad_norm": 3.6409478187561035, "learning_rate": 3.75874593767922e-05, "loss": 0.7547, "step": 6500 }, { "epoch": 1.261709042248136, "grad_norm": 4.5969672203063965, "learning_rate": 3.739629134008794e-05, "loss": 0.7419, "step": 6600 }, { "epoch": 1.2808258459185624, "grad_norm": 6.831844806671143, "learning_rate": 3.720703498375072e-05, "loss": 0.7909, "step": 6700 }, { "epoch": 1.2999426495889888, "grad_norm": 4.02253532409668, "learning_rate": 3.701586694704646e-05, "loss": 0.7693, "step": 6800 }, { "epoch": 1.2999426495889888, "eval_loss": 0.6886795163154602, "eval_runtime": 91.6808, "eval_samples_per_second": 89.692, "eval_steps_per_second": 11.213, "step": 6800 }, { "epoch": 1.319059453259415, "grad_norm": 5.338057994842529, "learning_rate": 3.682469891034219e-05, "loss": 0.7468, "step": 6900 }, { "epoch": 1.3381762569298412, "grad_norm": 4.327272415161133, "learning_rate": 3.663353087363793e-05, "loss": 0.7493, "step": 7000 }, { "epoch": 1.3572930606002676, "grad_norm": 5.279138088226318, "learning_rate": 3.644236283693367e-05, "loss": 0.7527, "step": 7100 }, { "epoch": 1.376409864270694, "grad_norm": 5.587660789489746, "learning_rate": 3.62511948002294e-05, "loss": 0.7456, "step": 7200 }, { "epoch": 1.376409864270694, "eval_loss": 0.6874927282333374, "eval_runtime": 91.6806, "eval_samples_per_second": 89.692, "eval_steps_per_second": 11.213, "step": 7200 }, { "epoch": 1.3955266679411202, "grad_norm": 3.81733775138855, "learning_rate": 3.606002676352514e-05, "loss": 0.7551, "step": 7300 }, { "epoch": 1.4146434716115466, "grad_norm": 4.612213611602783, "learning_rate": 3.5868858726820876e-05, "loss": 0.7616, "step": 7400 }, { "epoch": 1.433760275281973, "grad_norm": 5.318126678466797, "learning_rate": 3.5677690690116616e-05, "loss": 0.7507, "step": 7500 }, { "epoch": 1.4528770789523993, "grad_norm": 5.163857936859131, "learning_rate": 3.548652265341235e-05, "loss": 0.7435, "step": 7600 }, { "epoch": 1.4528770789523993, "eval_loss": 0.6907040476799011, "eval_runtime": 91.8652, "eval_samples_per_second": 89.512, "eval_steps_per_second": 11.19, "step": 7600 }, { "epoch": 1.4719938826228254, "grad_norm": 4.82822322845459, "learning_rate": 3.529535461670809e-05, "loss": 0.7539, "step": 7700 }, { "epoch": 1.4911106862932517, "grad_norm": 3.967336416244507, "learning_rate": 3.510609826037087e-05, "loss": 0.7339, "step": 7800 }, { "epoch": 1.510227489963678, "grad_norm": 4.51738166809082, "learning_rate": 3.49149302236666e-05, "loss": 0.7473, "step": 7900 }, { "epoch": 1.5293442936341044, "grad_norm": 5.371578693389893, "learning_rate": 3.472376218696235e-05, "loss": 0.759, "step": 8000 }, { "epoch": 1.5293442936341044, "eval_loss": 0.6801463961601257, "eval_runtime": 91.9431, "eval_samples_per_second": 89.436, "eval_steps_per_second": 11.181, "step": 8000 }, { "epoch": 1.5484610973045307, "grad_norm": 5.212741851806641, "learning_rate": 3.453259415025808e-05, "loss": 0.746, "step": 8100 }, { "epoch": 1.5675779009749569, "grad_norm": 3.754594564437866, "learning_rate": 3.4341426113553814e-05, "loss": 0.779, "step": 8200 }, { "epoch": 1.5866947046453834, "grad_norm": 4.9265289306640625, "learning_rate": 3.4150258076849553e-05, "loss": 0.7602, "step": 8300 }, { "epoch": 1.6058115083158095, "grad_norm": 4.226690769195557, "learning_rate": 3.395909004014529e-05, "loss": 0.7413, "step": 8400 }, { "epoch": 1.6058115083158095, "eval_loss": 0.6831762194633484, "eval_runtime": 92.496, "eval_samples_per_second": 88.901, "eval_steps_per_second": 11.114, "step": 8400 }, { "epoch": 1.6249283119862359, "grad_norm": 4.135231018066406, "learning_rate": 3.3767922003441027e-05, "loss": 0.748, "step": 8500 }, { "epoch": 1.6440451156566622, "grad_norm": 4.373616695404053, "learning_rate": 3.357675396673676e-05, "loss": 0.7212, "step": 8600 }, { "epoch": 1.6631619193270883, "grad_norm": 3.8442435264587402, "learning_rate": 3.33855859300325e-05, "loss": 0.731, "step": 8700 }, { "epoch": 1.682278722997515, "grad_norm": 5.096011638641357, "learning_rate": 3.319441789332824e-05, "loss": 0.742, "step": 8800 }, { "epoch": 1.682278722997515, "eval_loss": 0.6838387250900269, "eval_runtime": 92.5093, "eval_samples_per_second": 88.888, "eval_steps_per_second": 11.112, "step": 8800 }, { "epoch": 1.701395526667941, "grad_norm": 3.780578851699829, "learning_rate": 3.300324985662397e-05, "loss": 0.742, "step": 8900 }, { "epoch": 1.7205123303383676, "grad_norm": 4.875925064086914, "learning_rate": 3.281208181991971e-05, "loss": 0.7479, "step": 9000 }, { "epoch": 1.7396291340087937, "grad_norm": 3.7036213874816895, "learning_rate": 3.2620913783215446e-05, "loss": 0.7228, "step": 9100 }, { "epoch": 1.75874593767922, "grad_norm": 5.547360897064209, "learning_rate": 3.2429745746511185e-05, "loss": 0.7363, "step": 9200 }, { "epoch": 1.75874593767922, "eval_loss": 0.6730713844299316, "eval_runtime": 92.4366, "eval_samples_per_second": 88.958, "eval_steps_per_second": 11.121, "step": 9200 }, { "epoch": 1.7778627413496464, "grad_norm": 3.721874475479126, "learning_rate": 3.223857770980692e-05, "loss": 0.73, "step": 9300 }, { "epoch": 1.7969795450200725, "grad_norm": 4.086114406585693, "learning_rate": 3.204740967310266e-05, "loss": 0.7188, "step": 9400 }, { "epoch": 1.816096348690499, "grad_norm": 5.272156238555908, "learning_rate": 3.18562416363984e-05, "loss": 0.7556, "step": 9500 }, { "epoch": 1.8352131523609252, "grad_norm": 4.227740287780762, "learning_rate": 3.166507359969413e-05, "loss": 0.7355, "step": 9600 }, { "epoch": 1.8352131523609252, "eval_loss": 0.6690217852592468, "eval_runtime": 92.4428, "eval_samples_per_second": 88.952, "eval_steps_per_second": 11.12, "step": 9600 }, { "epoch": 1.8543299560313515, "grad_norm": 4.812748432159424, "learning_rate": 3.147390556298987e-05, "loss": 0.7534, "step": 9700 }, { "epoch": 1.8734467597017779, "grad_norm": 4.234578609466553, "learning_rate": 3.1282737526285605e-05, "loss": 0.7143, "step": 9800 }, { "epoch": 1.8925635633722042, "grad_norm": 4.305205345153809, "learning_rate": 3.1091569489581344e-05, "loss": 0.7151, "step": 9900 }, { "epoch": 1.9116803670426306, "grad_norm": 4.651333332061768, "learning_rate": 3.090231313324412e-05, "loss": 0.7097, "step": 10000 }, { "epoch": 1.9116803670426306, "eval_loss": 0.664512038230896, "eval_runtime": 92.6703, "eval_samples_per_second": 88.734, "eval_steps_per_second": 11.093, "step": 10000 }, { "epoch": 1.9307971707130567, "grad_norm": 4.365225791931152, "learning_rate": 3.071114509653986e-05, "loss": 0.7095, "step": 10100 }, { "epoch": 1.9499139743834832, "grad_norm": 4.273274898529053, "learning_rate": 3.0519977059835596e-05, "loss": 0.7283, "step": 10200 }, { "epoch": 1.9690307780539094, "grad_norm": 4.034031391143799, "learning_rate": 3.033072070349838e-05, "loss": 0.7083, "step": 10300 }, { "epoch": 1.9881475817243357, "grad_norm": 4.648432731628418, "learning_rate": 3.013955266679411e-05, "loss": 0.7019, "step": 10400 }, { "epoch": 1.9881475817243357, "eval_loss": 0.658172607421875, "eval_runtime": 92.5836, "eval_samples_per_second": 88.817, "eval_steps_per_second": 11.103, "step": 10400 }, { "epoch": 2.007264385394762, "grad_norm": 3.510467290878296, "learning_rate": 2.9948384630089848e-05, "loss": 0.7183, "step": 10500 }, { "epoch": 2.026381189065188, "grad_norm": 4.297295570373535, "learning_rate": 2.9757216593385588e-05, "loss": 0.701, "step": 10600 }, { "epoch": 2.0454979927356147, "grad_norm": 5.070156097412109, "learning_rate": 2.9566048556681324e-05, "loss": 0.7029, "step": 10700 }, { "epoch": 2.064614796406041, "grad_norm": 5.115049362182617, "learning_rate": 2.937488051997706e-05, "loss": 0.7023, "step": 10800 }, { "epoch": 2.064614796406041, "eval_loss": 0.6589385867118835, "eval_runtime": 92.7551, "eval_samples_per_second": 88.653, "eval_steps_per_second": 11.083, "step": 10800 }, { "epoch": 2.0837316000764674, "grad_norm": 4.264118194580078, "learning_rate": 2.9183712483272797e-05, "loss": 0.701, "step": 10900 }, { "epoch": 2.1028484037468935, "grad_norm": 4.804683208465576, "learning_rate": 2.8992544446568537e-05, "loss": 0.6865, "step": 11000 }, { "epoch": 2.1219652074173196, "grad_norm": 3.3149337768554688, "learning_rate": 2.8801376409864274e-05, "loss": 0.7121, "step": 11100 }, { "epoch": 2.141082011087746, "grad_norm": 4.628523349761963, "learning_rate": 2.8610208373160007e-05, "loss": 0.7095, "step": 11200 }, { "epoch": 2.141082011087746, "eval_loss": 0.6505147218704224, "eval_runtime": 92.4458, "eval_samples_per_second": 88.949, "eval_steps_per_second": 11.12, "step": 11200 }, { "epoch": 2.1601988147581723, "grad_norm": 4.2497453689575195, "learning_rate": 2.8419040336455743e-05, "loss": 0.7078, "step": 11300 }, { "epoch": 2.179315618428599, "grad_norm": 4.486359119415283, "learning_rate": 2.8227872299751483e-05, "loss": 0.7164, "step": 11400 }, { "epoch": 2.198432422099025, "grad_norm": 4.553341388702393, "learning_rate": 2.803670426304722e-05, "loss": 0.6857, "step": 11500 }, { "epoch": 2.217549225769451, "grad_norm": 4.612332344055176, "learning_rate": 2.7845536226342956e-05, "loss": 0.7088, "step": 11600 }, { "epoch": 2.217549225769451, "eval_loss": 0.6430885195732117, "eval_runtime": 92.4895, "eval_samples_per_second": 88.907, "eval_steps_per_second": 11.115, "step": 11600 }, { "epoch": 2.2366660294398777, "grad_norm": 4.992730617523193, "learning_rate": 2.7654368189638696e-05, "loss": 0.6807, "step": 11700 }, { "epoch": 2.255782833110304, "grad_norm": 4.852089881896973, "learning_rate": 2.7463200152934433e-05, "loss": 0.6944, "step": 11800 }, { "epoch": 2.2748996367807304, "grad_norm": 4.18324089050293, "learning_rate": 2.7272032116230166e-05, "loss": 0.704, "step": 11900 }, { "epoch": 2.2940164404511565, "grad_norm": 4.048402786254883, "learning_rate": 2.7080864079525902e-05, "loss": 0.7127, "step": 12000 }, { "epoch": 2.2940164404511565, "eval_loss": 0.6457203030586243, "eval_runtime": 92.3316, "eval_samples_per_second": 89.059, "eval_steps_per_second": 11.134, "step": 12000 }, { "epoch": 2.313133244121583, "grad_norm": 4.407283306121826, "learning_rate": 2.6889696042821642e-05, "loss": 0.7022, "step": 12100 }, { "epoch": 2.332250047792009, "grad_norm": 3.9950592517852783, "learning_rate": 2.669852800611738e-05, "loss": 0.7034, "step": 12200 }, { "epoch": 2.3513668514624353, "grad_norm": 4.345687389373779, "learning_rate": 2.6507359969413115e-05, "loss": 0.6908, "step": 12300 }, { "epoch": 2.370483655132862, "grad_norm": 4.338857173919678, "learning_rate": 2.631619193270885e-05, "loss": 0.6754, "step": 12400 }, { "epoch": 2.370483655132862, "eval_loss": 0.6425340175628662, "eval_runtime": 92.4505, "eval_samples_per_second": 88.945, "eval_steps_per_second": 11.119, "step": 12400 }, { "epoch": 2.389600458803288, "grad_norm": 4.529644012451172, "learning_rate": 2.6125023896004592e-05, "loss": 0.6968, "step": 12500 }, { "epoch": 2.4087172624737145, "grad_norm": 4.309901714324951, "learning_rate": 2.593385585930033e-05, "loss": 0.7247, "step": 12600 }, { "epoch": 2.4278340661441407, "grad_norm": 4.750647068023682, "learning_rate": 2.574268782259606e-05, "loss": 0.6855, "step": 12700 }, { "epoch": 2.446950869814567, "grad_norm": 3.7934632301330566, "learning_rate": 2.5551519785891798e-05, "loss": 0.6865, "step": 12800 }, { "epoch": 2.446950869814567, "eval_loss": 0.6368651390075684, "eval_runtime": 92.2827, "eval_samples_per_second": 89.107, "eval_steps_per_second": 11.14, "step": 12800 }, { "epoch": 2.4660676734849933, "grad_norm": 4.074941158294678, "learning_rate": 2.5360351749187538e-05, "loss": 0.694, "step": 12900 }, { "epoch": 2.4851844771554195, "grad_norm": 4.529365539550781, "learning_rate": 2.5169183712483274e-05, "loss": 0.69, "step": 13000 }, { "epoch": 2.504301280825846, "grad_norm": 4.395044803619385, "learning_rate": 2.4979927356146053e-05, "loss": 0.6887, "step": 13100 }, { "epoch": 2.523418084496272, "grad_norm": 8.110248565673828, "learning_rate": 2.478875931944179e-05, "loss": 0.7046, "step": 13200 }, { "epoch": 2.523418084496272, "eval_loss": 0.6256079077720642, "eval_runtime": 92.0493, "eval_samples_per_second": 89.333, "eval_steps_per_second": 11.168, "step": 13200 }, { "epoch": 2.5425348881666983, "grad_norm": 3.641064167022705, "learning_rate": 2.459759128273753e-05, "loss": 0.6763, "step": 13300 }, { "epoch": 2.561651691837125, "grad_norm": 3.8722896575927734, "learning_rate": 2.4406423246033263e-05, "loss": 0.6963, "step": 13400 }, { "epoch": 2.5807684955075514, "grad_norm": 4.976208686828613, "learning_rate": 2.4215255209329003e-05, "loss": 0.675, "step": 13500 }, { "epoch": 2.5998852991779775, "grad_norm": 4.116947650909424, "learning_rate": 2.402408717262474e-05, "loss": 0.6729, "step": 13600 }, { "epoch": 2.5998852991779775, "eval_loss": 0.6305546164512634, "eval_runtime": 92.2226, "eval_samples_per_second": 89.165, "eval_steps_per_second": 11.147, "step": 13600 }, { "epoch": 2.6190021028484036, "grad_norm": 4.226246356964111, "learning_rate": 2.3832919135920476e-05, "loss": 0.7178, "step": 13700 }, { "epoch": 2.63811890651883, "grad_norm": 3.8298568725585938, "learning_rate": 2.3641751099216212e-05, "loss": 0.6867, "step": 13800 }, { "epoch": 2.6572357101892563, "grad_norm": 4.829805374145508, "learning_rate": 2.345058306251195e-05, "loss": 0.6816, "step": 13900 }, { "epoch": 2.6763525138596824, "grad_norm": 4.9147210121154785, "learning_rate": 2.3259415025807685e-05, "loss": 0.6628, "step": 14000 }, { "epoch": 2.6763525138596824, "eval_loss": 0.6306756138801575, "eval_runtime": 92.2614, "eval_samples_per_second": 89.127, "eval_steps_per_second": 11.142, "step": 14000 }, { "epoch": 2.695469317530109, "grad_norm": 4.762243270874023, "learning_rate": 2.306824698910342e-05, "loss": 0.7046, "step": 14100 }, { "epoch": 2.714586121200535, "grad_norm": 3.3113512992858887, "learning_rate": 2.287707895239916e-05, "loss": 0.6755, "step": 14200 }, { "epoch": 2.7337029248709617, "grad_norm": 4.240131855010986, "learning_rate": 2.2685910915694898e-05, "loss": 0.6827, "step": 14300 }, { "epoch": 2.752819728541388, "grad_norm": 3.4899845123291016, "learning_rate": 2.2494742878990635e-05, "loss": 0.6789, "step": 14400 }, { "epoch": 2.752819728541388, "eval_loss": 0.6250412464141846, "eval_runtime": 92.2599, "eval_samples_per_second": 89.129, "eval_steps_per_second": 11.142, "step": 14400 }, { "epoch": 2.7719365322118144, "grad_norm": 3.6331779956817627, "learning_rate": 2.230357484228637e-05, "loss": 0.6732, "step": 14500 }, { "epoch": 2.7910533358822405, "grad_norm": 5.3023247718811035, "learning_rate": 2.2112406805582108e-05, "loss": 0.6705, "step": 14600 }, { "epoch": 2.8101701395526666, "grad_norm": 4.467443943023682, "learning_rate": 2.1921238768877844e-05, "loss": 0.6857, "step": 14700 }, { "epoch": 2.829286943223093, "grad_norm": 3.4010238647460938, "learning_rate": 2.1730070732173584e-05, "loss": 0.6876, "step": 14800 }, { "epoch": 2.829286943223093, "eval_loss": 0.6198094487190247, "eval_runtime": 92.347, "eval_samples_per_second": 89.045, "eval_steps_per_second": 11.132, "step": 14800 }, { "epoch": 2.8484037468935193, "grad_norm": 4.248734951019287, "learning_rate": 2.1538902695469317e-05, "loss": 0.674, "step": 14900 }, { "epoch": 2.867520550563946, "grad_norm": 4.063199520111084, "learning_rate": 2.1347734658765057e-05, "loss": 0.666, "step": 15000 }, { "epoch": 2.886637354234372, "grad_norm": 4.015697002410889, "learning_rate": 2.115656662206079e-05, "loss": 0.6953, "step": 15100 }, { "epoch": 2.9057541579047985, "grad_norm": 3.798788070678711, "learning_rate": 2.096539858535653e-05, "loss": 0.65, "step": 15200 }, { "epoch": 2.9057541579047985, "eval_loss": 0.6209089756011963, "eval_runtime": 92.3611, "eval_samples_per_second": 89.031, "eval_steps_per_second": 11.13, "step": 15200 }, { "epoch": 2.9248709615752246, "grad_norm": 5.368408679962158, "learning_rate": 2.0774230548652267e-05, "loss": 0.6766, "step": 15300 }, { "epoch": 2.9439877652456508, "grad_norm": 3.803342580795288, "learning_rate": 2.0583062511948003e-05, "loss": 0.6659, "step": 15400 }, { "epoch": 2.9631045689160773, "grad_norm": 4.15940523147583, "learning_rate": 2.039189447524374e-05, "loss": 0.6636, "step": 15500 }, { "epoch": 2.9822213725865034, "grad_norm": 4.552635192871094, "learning_rate": 2.0200726438539476e-05, "loss": 0.6731, "step": 15600 }, { "epoch": 2.9822213725865034, "eval_loss": 0.6129796504974365, "eval_runtime": 92.2051, "eval_samples_per_second": 89.182, "eval_steps_per_second": 11.149, "step": 15600 }, { "epoch": 3.00133817625693, "grad_norm": 4.447234153747559, "learning_rate": 2.0009558401835213e-05, "loss": 0.6696, "step": 15700 }, { "epoch": 3.020454979927356, "grad_norm": 4.5681376457214355, "learning_rate": 1.9820302045497995e-05, "loss": 0.6701, "step": 15800 }, { "epoch": 3.0395717835977822, "grad_norm": 4.63778018951416, "learning_rate": 1.962913400879373e-05, "loss": 0.6498, "step": 15900 }, { "epoch": 3.058688587268209, "grad_norm": 3.8129169940948486, "learning_rate": 1.9437965972089468e-05, "loss": 0.6514, "step": 16000 }, { "epoch": 3.058688587268209, "eval_loss": 0.6256683468818665, "eval_runtime": 92.6353, "eval_samples_per_second": 88.767, "eval_steps_per_second": 11.097, "step": 16000 }, { "epoch": 3.077805390938635, "grad_norm": 4.221614837646484, "learning_rate": 1.9246797935385204e-05, "loss": 0.6745, "step": 16100 }, { "epoch": 3.0969221946090615, "grad_norm": 4.324675559997559, "learning_rate": 1.905562989868094e-05, "loss": 0.6516, "step": 16200 }, { "epoch": 3.1160389982794876, "grad_norm": 4.2093939781188965, "learning_rate": 1.8864461861976677e-05, "loss": 0.6574, "step": 16300 }, { "epoch": 3.135155801949914, "grad_norm": 4.944237232208252, "learning_rate": 1.8673293825272414e-05, "loss": 0.6602, "step": 16400 }, { "epoch": 3.135155801949914, "eval_loss": 0.6231346726417542, "eval_runtime": 92.6789, "eval_samples_per_second": 88.726, "eval_steps_per_second": 11.092, "step": 16400 }, { "epoch": 3.1542726056203403, "grad_norm": 4.590976238250732, "learning_rate": 1.8482125788568154e-05, "loss": 0.6662, "step": 16500 }, { "epoch": 3.1733894092907664, "grad_norm": 4.595473289489746, "learning_rate": 1.8290957751863887e-05, "loss": 0.6294, "step": 16600 }, { "epoch": 3.192506212961193, "grad_norm": 3.4412662982940674, "learning_rate": 1.8099789715159627e-05, "loss": 0.6558, "step": 16700 }, { "epoch": 3.211623016631619, "grad_norm": 4.820471286773682, "learning_rate": 1.7908621678455363e-05, "loss": 0.6749, "step": 16800 }, { "epoch": 3.211623016631619, "eval_loss": 0.6116614937782288, "eval_runtime": 92.7709, "eval_samples_per_second": 88.638, "eval_steps_per_second": 11.081, "step": 16800 }, { "epoch": 3.2307398203020457, "grad_norm": 5.879933834075928, "learning_rate": 1.77174536417511e-05, "loss": 0.6435, "step": 16900 }, { "epoch": 3.2498566239724718, "grad_norm": 4.462230205535889, "learning_rate": 1.752628560504684e-05, "loss": 0.6691, "step": 17000 }, { "epoch": 3.268973427642898, "grad_norm": 3.9079251289367676, "learning_rate": 1.7335117568342573e-05, "loss": 0.6684, "step": 17100 }, { "epoch": 3.2880902313133245, "grad_norm": 3.6853411197662354, "learning_rate": 1.7143949531638313e-05, "loss": 0.6429, "step": 17200 }, { "epoch": 3.2880902313133245, "eval_loss": 0.6121929883956909, "eval_runtime": 92.7326, "eval_samples_per_second": 88.674, "eval_steps_per_second": 11.086, "step": 17200 }, { "epoch": 3.3072070349837506, "grad_norm": 4.5890631675720215, "learning_rate": 1.6952781494934046e-05, "loss": 0.6608, "step": 17300 }, { "epoch": 3.326323838654177, "grad_norm": 3.9099321365356445, "learning_rate": 1.6761613458229786e-05, "loss": 0.6515, "step": 17400 }, { "epoch": 3.3454406423246033, "grad_norm": 8.615681648254395, "learning_rate": 1.6570445421525522e-05, "loss": 0.6516, "step": 17500 }, { "epoch": 3.36455744599503, "grad_norm": 3.810173988342285, "learning_rate": 1.637927738482126e-05, "loss": 0.6488, "step": 17600 }, { "epoch": 3.36455744599503, "eval_loss": 0.6148595809936523, "eval_runtime": 92.7216, "eval_samples_per_second": 88.685, "eval_steps_per_second": 11.087, "step": 17600 }, { "epoch": 3.383674249665456, "grad_norm": 4.097940444946289, "learning_rate": 1.6188109348116995e-05, "loss": 0.6441, "step": 17700 }, { "epoch": 3.402791053335882, "grad_norm": 4.74275016784668, "learning_rate": 1.5996941311412732e-05, "loss": 0.6444, "step": 17800 }, { "epoch": 3.4219078570063086, "grad_norm": 4.2954888343811035, "learning_rate": 1.5805773274708468e-05, "loss": 0.6474, "step": 17900 }, { "epoch": 3.4410246606767347, "grad_norm": 4.689930438995361, "learning_rate": 1.5614605238004208e-05, "loss": 0.6341, "step": 18000 }, { "epoch": 3.4410246606767347, "eval_loss": 0.6162819862365723, "eval_runtime": 92.7406, "eval_samples_per_second": 88.667, "eval_steps_per_second": 11.085, "step": 18000 }, { "epoch": 3.4601414643471613, "grad_norm": 4.592708110809326, "learning_rate": 1.542343720129994e-05, "loss": 0.6344, "step": 18100 }, { "epoch": 3.4792582680175874, "grad_norm": 3.388826608657837, "learning_rate": 1.5232269164595681e-05, "loss": 0.6476, "step": 18200 }, { "epoch": 3.498375071688014, "grad_norm": 3.1417880058288574, "learning_rate": 1.5041101127891416e-05, "loss": 0.63, "step": 18300 }, { "epoch": 3.51749187535844, "grad_norm": 3.839583158493042, "learning_rate": 1.4849933091187154e-05, "loss": 0.6502, "step": 18400 }, { "epoch": 3.51749187535844, "eval_loss": 0.6018521189689636, "eval_runtime": 92.7633, "eval_samples_per_second": 88.645, "eval_steps_per_second": 11.082, "step": 18400 }, { "epoch": 3.536608679028866, "grad_norm": 4.443102836608887, "learning_rate": 1.4658765054482892e-05, "loss": 0.615, "step": 18500 }, { "epoch": 3.555725482699293, "grad_norm": 3.7104012966156006, "learning_rate": 1.4467597017778627e-05, "loss": 0.6362, "step": 18600 }, { "epoch": 3.574842286369719, "grad_norm": 3.7938549518585205, "learning_rate": 1.4276428981074365e-05, "loss": 0.6351, "step": 18700 }, { "epoch": 3.593959090040145, "grad_norm": 3.9377052783966064, "learning_rate": 1.4085260944370102e-05, "loss": 0.6388, "step": 18800 }, { "epoch": 3.593959090040145, "eval_loss": 0.604111909866333, "eval_runtime": 92.8027, "eval_samples_per_second": 88.607, "eval_steps_per_second": 11.077, "step": 18800 }, { "epoch": 3.6130758937105716, "grad_norm": 4.241858959197998, "learning_rate": 1.389409290766584e-05, "loss": 0.6598, "step": 18900 }, { "epoch": 3.632192697380998, "grad_norm": 4.486043453216553, "learning_rate": 1.3702924870961575e-05, "loss": 0.6225, "step": 19000 }, { "epoch": 3.6513095010514243, "grad_norm": 4.468062877655029, "learning_rate": 1.3511756834257313e-05, "loss": 0.6401, "step": 19100 }, { "epoch": 3.6704263047218504, "grad_norm": 4.349284648895264, "learning_rate": 1.332058879755305e-05, "loss": 0.6489, "step": 19200 }, { "epoch": 3.6704263047218504, "eval_loss": 0.5988742113113403, "eval_runtime": 92.7585, "eval_samples_per_second": 88.65, "eval_steps_per_second": 11.083, "step": 19200 }, { "epoch": 3.689543108392277, "grad_norm": 4.928227424621582, "learning_rate": 1.3129420760848788e-05, "loss": 0.62, "step": 19300 }, { "epoch": 3.708659912062703, "grad_norm": 3.8102471828460693, "learning_rate": 1.2938252724144523e-05, "loss": 0.6427, "step": 19400 }, { "epoch": 3.727776715733129, "grad_norm": 4.022319316864014, "learning_rate": 1.2747084687440261e-05, "loss": 0.6461, "step": 19500 }, { "epoch": 3.7468935194035558, "grad_norm": 4.785296440124512, "learning_rate": 1.2555916650735996e-05, "loss": 0.6502, "step": 19600 }, { "epoch": 3.7468935194035558, "eval_loss": 0.5975850224494934, "eval_runtime": 92.6855, "eval_samples_per_second": 88.719, "eval_steps_per_second": 11.091, "step": 19600 }, { "epoch": 3.766010323073982, "grad_norm": 4.089471817016602, "learning_rate": 1.2364748614031734e-05, "loss": 0.6329, "step": 19700 }, { "epoch": 3.7851271267444084, "grad_norm": 2.898491859436035, "learning_rate": 1.2173580577327472e-05, "loss": 0.6317, "step": 19800 }, { "epoch": 3.8042439304148346, "grad_norm": 3.9998719692230225, "learning_rate": 1.1982412540623209e-05, "loss": 0.6184, "step": 19900 }, { "epoch": 3.823360734085261, "grad_norm": 3.649463176727295, "learning_rate": 1.1793156184285987e-05, "loss": 0.6301, "step": 20000 }, { "epoch": 3.823360734085261, "eval_loss": 0.6061282157897949, "eval_runtime": 92.6598, "eval_samples_per_second": 88.744, "eval_steps_per_second": 11.094, "step": 20000 }, { "epoch": 3.8424775377556872, "grad_norm": 4.067989349365234, "learning_rate": 1.1601988147581724e-05, "loss": 0.6425, "step": 20100 }, { "epoch": 3.8615943414261134, "grad_norm": 3.7173011302948, "learning_rate": 1.141082011087746e-05, "loss": 0.6134, "step": 20200 }, { "epoch": 3.88071114509654, "grad_norm": 4.036506175994873, "learning_rate": 1.1219652074173199e-05, "loss": 0.6612, "step": 20300 }, { "epoch": 3.899827948766966, "grad_norm": 3.4378676414489746, "learning_rate": 1.1028484037468937e-05, "loss": 0.6194, "step": 20400 }, { "epoch": 3.899827948766966, "eval_loss": 0.5860570669174194, "eval_runtime": 92.6457, "eval_samples_per_second": 88.757, "eval_steps_per_second": 11.096, "step": 20400 }, { "epoch": 3.9189447524373926, "grad_norm": 7.162832260131836, "learning_rate": 1.0837316000764673e-05, "loss": 0.6385, "step": 20500 }, { "epoch": 3.9380615561078187, "grad_norm": 3.439091920852661, "learning_rate": 1.064614796406041e-05, "loss": 0.6243, "step": 20600 }, { "epoch": 3.9571783597782453, "grad_norm": 3.7195284366607666, "learning_rate": 1.0454979927356146e-05, "loss": 0.6337, "step": 20700 }, { "epoch": 3.9762951634486714, "grad_norm": 3.3584518432617188, "learning_rate": 1.0263811890651883e-05, "loss": 0.6352, "step": 20800 }, { "epoch": 3.9762951634486714, "eval_loss": 0.5977619290351868, "eval_runtime": 92.6632, "eval_samples_per_second": 88.741, "eval_steps_per_second": 11.094, "step": 20800 }, { "epoch": 3.9954119671190975, "grad_norm": 5.517305850982666, "learning_rate": 1.0072643853947621e-05, "loss": 0.6241, "step": 20900 }, { "epoch": 4.014528770789524, "grad_norm": 3.5819714069366455, "learning_rate": 9.881475817243358e-06, "loss": 0.6352, "step": 21000 }, { "epoch": 4.033645574459951, "grad_norm": 3.967008352279663, "learning_rate": 9.690307780539094e-06, "loss": 0.617, "step": 21100 }, { "epoch": 4.052762378130376, "grad_norm": 3.5766172409057617, "learning_rate": 9.49913974383483e-06, "loss": 0.628, "step": 21200 }, { "epoch": 4.052762378130376, "eval_loss": 0.5928879380226135, "eval_runtime": 92.586, "eval_samples_per_second": 88.815, "eval_steps_per_second": 11.103, "step": 21200 }, { "epoch": 4.071879181800803, "grad_norm": 4.025076866149902, "learning_rate": 9.307971707130567e-06, "loss": 0.6238, "step": 21300 }, { "epoch": 4.0909959854712294, "grad_norm": 5.096427917480469, "learning_rate": 9.116803670426305e-06, "loss": 0.6337, "step": 21400 }, { "epoch": 4.110112789141655, "grad_norm": 5.223696708679199, "learning_rate": 8.925635633722042e-06, "loss": 0.6212, "step": 21500 }, { "epoch": 4.129229592812082, "grad_norm": 4.151371479034424, "learning_rate": 8.734467597017778e-06, "loss": 0.6176, "step": 21600 }, { "epoch": 4.129229592812082, "eval_loss": 0.5918228626251221, "eval_runtime": 92.7104, "eval_samples_per_second": 88.696, "eval_steps_per_second": 11.088, "step": 21600 }, { "epoch": 4.148346396482508, "grad_norm": 4.445927619934082, "learning_rate": 8.543299560313515e-06, "loss": 0.6322, "step": 21700 }, { "epoch": 4.167463200152935, "grad_norm": 6.669031143188477, "learning_rate": 8.352131523609253e-06, "loss": 0.6307, "step": 21800 }, { "epoch": 4.1865800038233605, "grad_norm": 4.0559186935424805, "learning_rate": 8.16096348690499e-06, "loss": 0.6323, "step": 21900 }, { "epoch": 4.205696807493787, "grad_norm": 4.512356281280518, "learning_rate": 7.969795450200728e-06, "loss": 0.6385, "step": 22000 }, { "epoch": 4.205696807493787, "eval_loss": 0.5929626226425171, "eval_runtime": 92.4043, "eval_samples_per_second": 88.989, "eval_steps_per_second": 11.125, "step": 22000 }, { "epoch": 4.224813611164214, "grad_norm": 4.109405517578125, "learning_rate": 7.778627413496464e-06, "loss": 0.6255, "step": 22100 }, { "epoch": 4.243930414834639, "grad_norm": 4.0541486740112305, "learning_rate": 7.587459376792201e-06, "loss": 0.6198, "step": 22200 }, { "epoch": 4.263047218505066, "grad_norm": 5.3996663093566895, "learning_rate": 7.396291340087938e-06, "loss": 0.6172, "step": 22300 }, { "epoch": 4.282164022175492, "grad_norm": 4.728433609008789, "learning_rate": 7.205123303383675e-06, "loss": 0.6061, "step": 22400 }, { "epoch": 4.282164022175492, "eval_loss": 0.5865157246589661, "eval_runtime": 92.3573, "eval_samples_per_second": 89.035, "eval_steps_per_second": 11.131, "step": 22400 }, { "epoch": 4.301280825845919, "grad_norm": 4.604154586791992, "learning_rate": 7.013955266679411e-06, "loss": 0.6441, "step": 22500 }, { "epoch": 4.320397629516345, "grad_norm": 4.1287760734558105, "learning_rate": 6.822787229975149e-06, "loss": 0.625, "step": 22600 }, { "epoch": 4.339514433186771, "grad_norm": 3.1182920932769775, "learning_rate": 6.631619193270885e-06, "loss": 0.5973, "step": 22700 }, { "epoch": 4.358631236857198, "grad_norm": 4.751844882965088, "learning_rate": 6.4404511565666225e-06, "loss": 0.597, "step": 22800 }, { "epoch": 4.358631236857198, "eval_loss": 0.5919764637947083, "eval_runtime": 92.4705, "eval_samples_per_second": 88.926, "eval_steps_per_second": 11.117, "step": 22800 }, { "epoch": 4.3777480405276235, "grad_norm": 5.184845924377441, "learning_rate": 6.249283119862359e-06, "loss": 0.6219, "step": 22900 }, { "epoch": 4.39686484419805, "grad_norm": 4.108447551727295, "learning_rate": 6.058115083158096e-06, "loss": 0.6275, "step": 23000 }, { "epoch": 4.415981647868477, "grad_norm": 3.9303929805755615, "learning_rate": 5.866947046453833e-06, "loss": 0.6213, "step": 23100 }, { "epoch": 4.435098451538902, "grad_norm": 4.054929733276367, "learning_rate": 5.67577900974957e-06, "loss": 0.6313, "step": 23200 }, { "epoch": 4.435098451538902, "eval_loss": 0.5812836289405823, "eval_runtime": 92.6171, "eval_samples_per_second": 88.785, "eval_steps_per_second": 11.099, "step": 23200 }, { "epoch": 4.454215255209329, "grad_norm": 4.04779052734375, "learning_rate": 5.484610973045308e-06, "loss": 0.6055, "step": 23300 }, { "epoch": 4.473332058879755, "grad_norm": 4.373106956481934, "learning_rate": 5.293442936341044e-06, "loss": 0.6181, "step": 23400 }, { "epoch": 4.492448862550182, "grad_norm": 3.912672758102417, "learning_rate": 5.102274899636781e-06, "loss": 0.6049, "step": 23500 }, { "epoch": 4.511565666220608, "grad_norm": 4.924178123474121, "learning_rate": 4.911106862932518e-06, "loss": 0.6388, "step": 23600 }, { "epoch": 4.511565666220608, "eval_loss": 0.585365891456604, "eval_runtime": 92.598, "eval_samples_per_second": 88.803, "eval_steps_per_second": 11.102, "step": 23600 }, { "epoch": 4.530682469891034, "grad_norm": 4.225689888000488, "learning_rate": 4.7199388262282546e-06, "loss": 0.5984, "step": 23700 }, { "epoch": 4.549799273561461, "grad_norm": 3.848640203475952, "learning_rate": 4.528770789523992e-06, "loss": 0.6009, "step": 23800 }, { "epoch": 4.568916077231886, "grad_norm": 3.6290130615234375, "learning_rate": 4.3376027528197284e-06, "loss": 0.6205, "step": 23900 }, { "epoch": 4.588032880902313, "grad_norm": 5.409413814544678, "learning_rate": 4.146434716115466e-06, "loss": 0.6052, "step": 24000 }, { "epoch": 4.588032880902313, "eval_loss": 0.5768113732337952, "eval_runtime": 92.5622, "eval_samples_per_second": 88.838, "eval_steps_per_second": 11.106, "step": 24000 }, { "epoch": 4.6071496845727395, "grad_norm": 4.062690258026123, "learning_rate": 3.955266679411203e-06, "loss": 0.6227, "step": 24100 }, { "epoch": 4.626266488243166, "grad_norm": 6.228837490081787, "learning_rate": 3.7640986427069397e-06, "loss": 0.6104, "step": 24200 }, { "epoch": 4.645383291913592, "grad_norm": 3.8039870262145996, "learning_rate": 3.5729306060026766e-06, "loss": 0.6087, "step": 24300 }, { "epoch": 4.664500095584018, "grad_norm": 4.199521541595459, "learning_rate": 3.3817625692984135e-06, "loss": 0.6135, "step": 24400 }, { "epoch": 4.664500095584018, "eval_loss": 0.5789579153060913, "eval_runtime": 92.5954, "eval_samples_per_second": 88.806, "eval_steps_per_second": 11.102, "step": 24400 }, { "epoch": 4.683616899254445, "grad_norm": 3.312234878540039, "learning_rate": 3.19059453259415e-06, "loss": 0.6108, "step": 24500 }, { "epoch": 4.702733702924871, "grad_norm": 4.610132694244385, "learning_rate": 2.9994264958898874e-06, "loss": 0.5984, "step": 24600 }, { "epoch": 4.721850506595297, "grad_norm": 4.196247100830078, "learning_rate": 2.8082584591856244e-06, "loss": 0.6249, "step": 24700 }, { "epoch": 4.740967310265724, "grad_norm": 4.444230079650879, "learning_rate": 2.6170904224813613e-06, "loss": 0.6135, "step": 24800 }, { "epoch": 4.740967310265724, "eval_loss": 0.5723977088928223, "eval_runtime": 92.5878, "eval_samples_per_second": 88.813, "eval_steps_per_second": 11.103, "step": 24800 }, { "epoch": 4.76008411393615, "grad_norm": 4.337975025177002, "learning_rate": 2.4259223857770982e-06, "loss": 0.6222, "step": 24900 }, { "epoch": 4.779200917606576, "grad_norm": 4.261539459228516, "learning_rate": 2.234754349072835e-06, "loss": 0.5808, "step": 25000 }, { "epoch": 4.7983177212770025, "grad_norm": 4.659415245056152, "learning_rate": 2.043586312368572e-06, "loss": 0.592, "step": 25100 }, { "epoch": 4.817434524947429, "grad_norm": 4.005898952484131, "learning_rate": 1.852418275664309e-06, "loss": 0.6087, "step": 25200 }, { "epoch": 4.817434524947429, "eval_loss": 0.5687017440795898, "eval_runtime": 92.6047, "eval_samples_per_second": 88.797, "eval_steps_per_second": 11.101, "step": 25200 }, { "epoch": 4.836551328617855, "grad_norm": 4.495694160461426, "learning_rate": 1.6612502389600458e-06, "loss": 0.6195, "step": 25300 }, { "epoch": 4.855668132288281, "grad_norm": 5.030457019805908, "learning_rate": 1.470082202255783e-06, "loss": 0.6202, "step": 25400 }, { "epoch": 4.874784935958708, "grad_norm": 4.42711877822876, "learning_rate": 1.2789141655515199e-06, "loss": 0.6007, "step": 25500 }, { "epoch": 4.893901739629134, "grad_norm": 4.1595563888549805, "learning_rate": 1.0877461288472568e-06, "loss": 0.621, "step": 25600 }, { "epoch": 4.893901739629134, "eval_loss": 0.5748383402824402, "eval_runtime": 92.5734, "eval_samples_per_second": 88.827, "eval_steps_per_second": 11.105, "step": 25600 }, { "epoch": 4.91301854329956, "grad_norm": 3.987473249435425, "learning_rate": 8.965780921429937e-07, "loss": 0.586, "step": 25700 }, { "epoch": 4.932135346969987, "grad_norm": 3.4999001026153564, "learning_rate": 7.054100554387307e-07, "loss": 0.6105, "step": 25800 }, { "epoch": 4.951252150640413, "grad_norm": 3.7822272777557373, "learning_rate": 5.142420187344676e-07, "loss": 0.6047, "step": 25900 }, { "epoch": 4.970368954310839, "grad_norm": 3.8180148601531982, "learning_rate": 3.2307398203020455e-07, "loss": 0.6256, "step": 26000 }, { "epoch": 4.970368954310839, "eval_loss": 0.5706872344017029, "eval_runtime": 92.6466, "eval_samples_per_second": 88.757, "eval_steps_per_second": 11.096, "step": 26000 }, { "epoch": 4.9894857579812655, "grad_norm": 3.5531647205352783, "learning_rate": 1.319059453259415e-07, "loss": 0.599, "step": 26100 } ], "logging_steps": 100, "max_steps": 26155, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.446806122974282e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }