{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 100,
  "global_step": 880,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05714285714285714,
      "grad_norm": 0.17341195046901703,
      "learning_rate": 2.272727272727273e-05,
      "loss": 0.9238,
      "step": 5
    },
    {
      "epoch": 0.11428571428571428,
      "grad_norm": 0.151872456073761,
      "learning_rate": 5.113636363636364e-05,
      "loss": 0.8658,
      "step": 10
    },
    {
      "epoch": 0.17142857142857143,
      "grad_norm": 0.1270148754119873,
      "learning_rate": 7.954545454545455e-05,
      "loss": 0.8384,
      "step": 15
    },
    {
      "epoch": 0.22857142857142856,
      "grad_norm": 0.13214267790317535,
      "learning_rate": 0.00010795454545454545,
      "loss": 0.7931,
      "step": 20
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 0.125133216381073,
      "learning_rate": 0.00013636363636363637,
      "loss": 0.7621,
      "step": 25
    },
    {
      "epoch": 0.34285714285714286,
      "grad_norm": 0.1484372466802597,
      "learning_rate": 0.00016477272727272727,
      "loss": 0.7396,
      "step": 30
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.14502882957458496,
      "learning_rate": 0.00019318181818181817,
      "loss": 0.7783,
      "step": 35
    },
    {
      "epoch": 0.45714285714285713,
      "grad_norm": 0.1474684476852417,
      "learning_rate": 0.0002215909090909091,
      "loss": 0.7208,
      "step": 40
    },
    {
      "epoch": 0.5142857142857142,
      "grad_norm": 0.14219504594802856,
      "learning_rate": 0.00025,
      "loss": 0.7467,
      "step": 45
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.14778761565685272,
      "learning_rate": 0.0002784090909090909,
      "loss": 0.6922,
      "step": 50
    },
    {
      "epoch": 0.6285714285714286,
      "grad_norm": 0.1440540999174118,
      "learning_rate": 0.0003068181818181818,
      "loss": 0.6895,
      "step": 55
    },
    {
      "epoch": 0.6857142857142857,
      "grad_norm": 0.14411456882953644,
      "learning_rate": 0.00033522727272727274,
      "loss": 0.7601,
      "step": 60
    },
    {
      "epoch": 0.7428571428571429,
      "grad_norm": 0.1502826064825058,
      "learning_rate": 0.00036363636363636367,
      "loss": 0.6845,
      "step": 65
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.15052765607833862,
      "learning_rate": 0.00039204545454545454,
      "loss": 0.6665,
      "step": 70
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 0.1469675898551941,
      "learning_rate": 0.0004204545454545455,
      "loss": 0.7189,
      "step": 75
    },
    {
      "epoch": 0.9142857142857143,
      "grad_norm": 0.136297807097435,
      "learning_rate": 0.00044886363636363635,
      "loss": 0.6771,
      "step": 80
    },
    {
      "epoch": 0.9714285714285714,
      "grad_norm": 0.15446212887763977,
      "learning_rate": 0.0004772727272727273,
      "loss": 0.6908,
      "step": 85
    },
    {
      "epoch": 1.022857142857143,
      "grad_norm": 0.15397129952907562,
      "learning_rate": 0.0004999980332062218,
      "loss": 0.7028,
      "step": 90
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.15528780221939087,
      "learning_rate": 0.0004999291986732823,
      "loss": 0.6689,
      "step": 95
    },
    {
      "epoch": 1.1371428571428572,
      "grad_norm": 0.17444616556167603,
      "learning_rate": 0.0004997620553954645,
      "loss": 0.6345,
      "step": 100
    },
    {
      "epoch": 1.1371428571428572,
      "eval_loss": 0.6875521540641785,
      "eval_runtime": 29.6981,
      "eval_samples_per_second": 2.727,
      "eval_steps_per_second": 2.727,
      "step": 100
    },
    {
      "epoch": 1.1942857142857144,
      "grad_norm": 0.16984045505523682,
      "learning_rate": 0.0004994966691179711,
      "loss": 0.6399,
      "step": 105
    },
    {
      "epoch": 1.2514285714285713,
      "grad_norm": 0.18500125408172607,
      "learning_rate": 0.0004991331442295331,
      "loss": 0.6482,
      "step": 110
    },
    {
      "epoch": 1.3085714285714285,
      "grad_norm": 0.19161580502986908,
      "learning_rate": 0.0004986716237213483,
      "loss": 0.6554,
      "step": 115
    },
    {
      "epoch": 1.3657142857142857,
      "grad_norm": 0.17785383760929108,
      "learning_rate": 0.0004981122891308368,
      "loss": 0.5931,
      "step": 120
    },
    {
      "epoch": 1.4228571428571428,
      "grad_norm": 0.1681031882762909,
      "learning_rate": 0.0004974553604702333,
      "loss": 0.5979,
      "step": 125
    },
    {
      "epoch": 1.48,
      "grad_norm": 0.1733732372522354,
      "learning_rate": 0.0004967010961400466,
      "loss": 0.6648,
      "step": 130
    },
    {
      "epoch": 1.5371428571428571,
      "grad_norm": 0.1732897013425827,
      "learning_rate": 0.0004958497928274184,
      "loss": 0.6383,
      "step": 135
    },
    {
      "epoch": 1.5942857142857143,
      "grad_norm": 0.18160653114318848,
      "learning_rate": 0.000494901785389423,
      "loss": 0.6114,
      "step": 140
    },
    {
      "epoch": 1.6514285714285715,
      "grad_norm": 0.16878579556941986,
      "learning_rate": 0.0004938574467213517,
      "loss": 0.636,
      "step": 145
    },
    {
      "epoch": 1.7085714285714286,
      "grad_norm": 0.16055361926555634,
      "learning_rate": 0.0004927171876100363,
      "loss": 0.6337,
      "step": 150
    },
    {
      "epoch": 1.7657142857142856,
      "grad_norm": 0.17928151786327362,
      "learning_rate": 0.0004914814565722671,
      "loss": 0.61,
      "step": 155
    },
    {
      "epoch": 1.822857142857143,
      "grad_norm": 0.17349475622177124,
      "learning_rate": 0.0004901507396783714,
      "loss": 0.656,
      "step": 160
    },
    {
      "epoch": 1.88,
      "grad_norm": 0.17489437758922577,
      "learning_rate": 0.0004887255603610184,
      "loss": 0.6506,
      "step": 165
    },
    {
      "epoch": 1.9371428571428573,
      "grad_norm": 0.18316401541233063,
      "learning_rate": 0.00048720647920932994,
      "loss": 0.5985,
      "step": 170
    },
    {
      "epoch": 1.9942857142857142,
      "grad_norm": 0.16122691333293915,
      "learning_rate": 0.0004855940937483735,
      "loss": 0.6132,
      "step": 175
    },
    {
      "epoch": 2.045714285714286,
      "grad_norm": 0.1657329797744751,
      "learning_rate": 0.0004838890382041291,
      "loss": 0.5925,
      "step": 180
    },
    {
      "epoch": 2.1028571428571428,
      "grad_norm": 0.2137223184108734,
      "learning_rate": 0.00048209198325401817,
      "loss": 0.4924,
      "step": 185
    },
    {
      "epoch": 2.16,
      "grad_norm": 0.18751586973667145,
      "learning_rate": 0.0004802036357630951,
      "loss": 0.5103,
      "step": 190
    },
    {
      "epoch": 2.217142857142857,
      "grad_norm": 0.22524061799049377,
      "learning_rate": 0.00047822473850600447,
      "loss": 0.4742,
      "step": 195
    },
    {
      "epoch": 2.2742857142857145,
      "grad_norm": 0.23922263085842133,
      "learning_rate": 0.0004761560698748135,
      "loss": 0.472,
      "step": 200
    },
    {
      "epoch": 2.2742857142857145,
      "eval_loss": 0.7193492650985718,
      "eval_runtime": 29.6354,
      "eval_samples_per_second": 2.733,
      "eval_steps_per_second": 2.733,
      "step": 200
    },
    {
      "epoch": 2.3314285714285714,
      "grad_norm": 0.25317704677581787,
      "learning_rate": 0.00047399844357283395,
      "loss": 0.4624,
      "step": 205
    },
    {
      "epoch": 2.388571428571429,
      "grad_norm": 0.22575099766254425,
      "learning_rate": 0.0004717527082945554,
      "loss": 0.4803,
      "step": 210
    },
    {
      "epoch": 2.4457142857142857,
      "grad_norm": 0.22181181609630585,
      "learning_rate": 0.0004694197473918139,
      "loss": 0.4805,
      "step": 215
    },
    {
      "epoch": 2.5028571428571427,
      "grad_norm": 0.2090139091014862,
      "learning_rate": 0.0004670004785263289,
      "loss": 0.5365,
      "step": 220
    },
    {
      "epoch": 2.56,
      "grad_norm": 0.24724529683589935,
      "learning_rate": 0.0004644958533087443,
      "loss": 0.5005,
      "step": 225
    },
    {
      "epoch": 2.617142857142857,
      "grad_norm": 0.22190535068511963,
      "learning_rate": 0.0004619068569243159,
      "loss": 0.4974,
      "step": 230
    },
    {
      "epoch": 2.6742857142857144,
      "grad_norm": 0.21910065412521362,
      "learning_rate": 0.00045923450774539243,
      "loss": 0.5322,
      "step": 235
    },
    {
      "epoch": 2.7314285714285713,
      "grad_norm": 0.24561282992362976,
      "learning_rate": 0.0004564798569308423,
      "loss": 0.5154,
      "step": 240
    },
    {
      "epoch": 2.7885714285714287,
      "grad_norm": 0.22541281580924988,
      "learning_rate": 0.00045364398801258396,
      "loss": 0.4968,
      "step": 245
    },
    {
      "epoch": 2.8457142857142856,
      "grad_norm": 0.21167173981666565,
      "learning_rate": 0.000450728016469383,
      "loss": 0.5049,
      "step": 250
    },
    {
      "epoch": 2.902857142857143,
      "grad_norm": 0.20972833037376404,
      "learning_rate": 0.0004477330892880823,
      "loss": 0.5303,
      "step": 255
    },
    {
      "epoch": 2.96,
      "grad_norm": 0.21511210501194,
      "learning_rate": 0.0004446603845124388,
      "loss": 0.5203,
      "step": 260
    },
    {
      "epoch": 3.0114285714285716,
      "grad_norm": 0.20611043274402618,
      "learning_rate": 0.0004415111107797445,
      "loss": 0.5064,
      "step": 265
    },
    {
      "epoch": 3.0685714285714285,
      "grad_norm": 0.2586754262447357,
      "learning_rate": 0.0004382865068454133,
      "loss": 0.3602,
      "step": 270
    },
    {
      "epoch": 3.125714285714286,
      "grad_norm": 0.25579819083213806,
      "learning_rate": 0.00043498784109572097,
      "loss": 0.3523,
      "step": 275
    },
    {
      "epoch": 3.182857142857143,
      "grad_norm": 0.2642401456832886,
      "learning_rate": 0.00043161641104889003,
      "loss": 0.3604,
      "step": 280
    },
    {
      "epoch": 3.24,
      "grad_norm": 0.2600599527359009,
      "learning_rate": 0.00042817354284471575,
      "loss": 0.3497,
      "step": 285
    },
    {
      "epoch": 3.297142857142857,
      "grad_norm": 0.2751370966434479,
      "learning_rate": 0.00042466059072293367,
      "loss": 0.3525,
      "step": 290
    },
    {
      "epoch": 3.354285714285714,
      "grad_norm": 0.2760413885116577,
      "learning_rate": 0.00042107893649053456,
      "loss": 0.3369,
      "step": 295
    },
    {
      "epoch": 3.4114285714285715,
      "grad_norm": 0.28514841198921204,
      "learning_rate": 0.0004174299889782355,
      "loss": 0.3499,
      "step": 300
    },
    {
      "epoch": 3.4114285714285715,
      "eval_loss": 0.7963736057281494,
      "eval_runtime": 29.7356,
      "eval_samples_per_second": 2.724,
      "eval_steps_per_second": 2.724,
      "step": 300
    },
    {
      "epoch": 3.4685714285714284,
      "grad_norm": 0.28093528747558594,
      "learning_rate": 0.0004137151834863213,
      "loss": 0.3601,
      "step": 305
    },
    {
      "epoch": 3.525714285714286,
      "grad_norm": 0.2521819472312927,
      "learning_rate": 0.0004099359812200746,
      "loss": 0.3768,
      "step": 310
    },
    {
      "epoch": 3.5828571428571427,
      "grad_norm": 0.2599101960659027,
      "learning_rate": 0.00040609386871501583,
      "loss": 0.3248,
      "step": 315
    },
    {
      "epoch": 3.64,
      "grad_norm": 0.277692049741745,
      "learning_rate": 0.0004021903572521802,
      "loss": 0.363,
      "step": 320
    },
    {
      "epoch": 3.697142857142857,
      "grad_norm": 0.2820269763469696,
      "learning_rate": 0.00039822698226366017,
      "loss": 0.3676,
      "step": 325
    },
    {
      "epoch": 3.7542857142857144,
      "grad_norm": 0.28230783343315125,
      "learning_rate": 0.00039420530272864934,
      "loss": 0.3556,
      "step": 330
    },
    {
      "epoch": 3.8114285714285714,
      "grad_norm": 0.3273780941963196,
      "learning_rate": 0.0003901269005602235,
      "loss": 0.3656,
      "step": 335
    },
    {
      "epoch": 3.8685714285714283,
      "grad_norm": 0.28806638717651367,
      "learning_rate": 0.0003859933799831008,
      "loss": 0.3499,
      "step": 340
    },
    {
      "epoch": 3.9257142857142857,
      "grad_norm": 0.30954381823539734,
      "learning_rate": 0.00038180636690262563,
      "loss": 0.392,
      "step": 345
    },
    {
      "epoch": 3.982857142857143,
      "grad_norm": 0.29088252782821655,
      "learning_rate": 0.000377567508265225,
      "loss": 0.3736,
      "step": 350
    },
    {
      "epoch": 4.034285714285715,
      "grad_norm": 0.22520305216312408,
      "learning_rate": 0.0003732784714105876,
      "loss": 0.2785,
      "step": 355
    },
    {
      "epoch": 4.091428571428572,
      "grad_norm": 0.3518202602863312,
      "learning_rate": 0.0003689409434158224,
      "loss": 0.2139,
      "step": 360
    },
    {
      "epoch": 4.148571428571429,
      "grad_norm": 0.2898755967617035,
      "learning_rate": 0.0003645566304318526,
      "loss": 0.2234,
      "step": 365
    },
    {
      "epoch": 4.2057142857142855,
      "grad_norm": 0.3240414261817932,
      "learning_rate": 0.00036012725701230734,
      "loss": 0.2342,
      "step": 370
    },
    {
      "epoch": 4.2628571428571425,
      "grad_norm": 0.28856348991394043,
      "learning_rate": 0.00035565456543517487,
      "loss": 0.2117,
      "step": 375
    },
    {
      "epoch": 4.32,
      "grad_norm": 0.3389490246772766,
      "learning_rate": 0.0003511403150174838,
      "loss": 0.2435,
      "step": 380
    },
    {
      "epoch": 4.377142857142857,
      "grad_norm": 0.31090596318244934,
      "learning_rate": 0.00034658628142328216,
      "loss": 0.2281,
      "step": 385
    },
    {
      "epoch": 4.434285714285714,
      "grad_norm": 0.3269132077693939,
      "learning_rate": 0.0003419942559651863,
      "loss": 0.2637,
      "step": 390
    },
    {
      "epoch": 4.491428571428571,
      "grad_norm": 0.30986836552619934,
      "learning_rate": 0.0003373660448997746,
      "loss": 0.228,
      "step": 395
    },
    {
      "epoch": 4.548571428571429,
      "grad_norm": 0.28136685490608215,
      "learning_rate": 0.000332703468717103,
      "loss": 0.2457,
      "step": 400
    },
    {
      "epoch": 4.548571428571429,
      "eval_loss": 0.9142104983329773,
      "eval_runtime": 29.7632,
      "eval_samples_per_second": 2.721,
      "eval_steps_per_second": 2.721,
      "step": 400
    },
    {
      "epoch": 4.605714285714286,
      "grad_norm": 0.3222131133079529,
      "learning_rate": 0.00032800836142462175,
      "loss": 0.2238,
      "step": 405
    },
    {
      "epoch": 4.662857142857143,
      "grad_norm": 0.3103245496749878,
      "learning_rate": 0.0003232825698257755,
      "loss": 0.2444,
      "step": 410
    },
    {
      "epoch": 4.72,
      "grad_norm": 0.319525808095932,
      "learning_rate": 0.00031852795279356945,
      "loss": 0.2498,
      "step": 415
    },
    {
      "epoch": 4.777142857142858,
      "grad_norm": 0.2778545320034027,
      "learning_rate": 0.0003137463805393885,
      "loss": 0.231,
      "step": 420
    },
    {
      "epoch": 4.8342857142857145,
      "grad_norm": 0.3173794448375702,
      "learning_rate": 0.0003089397338773569,
      "loss": 0.2471,
      "step": 425
    },
    {
      "epoch": 4.8914285714285715,
      "grad_norm": 0.3207133710384369,
      "learning_rate": 0.00030410990348452574,
      "loss": 0.2302,
      "step": 430
    },
    {
      "epoch": 4.948571428571428,
      "grad_norm": 0.26637086272239685,
      "learning_rate": 0.0002992587891571833,
      "loss": 0.2244,
      "step": 435
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.48891177773475647,
      "learning_rate": 0.0002943882990635759,
      "loss": 0.2451,
      "step": 440
    },
    {
      "epoch": 5.057142857142857,
      "grad_norm": 0.23097443580627441,
      "learning_rate": 0.0002895003489933375,
      "loss": 0.135,
      "step": 445
    },
    {
      "epoch": 5.114285714285714,
      "grad_norm": 0.30315983295440674,
      "learning_rate": 0.0002845968616039207,
      "loss": 0.1345,
      "step": 450
    },
    {
      "epoch": 5.171428571428572,
      "grad_norm": 0.29658541083335876,
      "learning_rate": 0.0002796797656643263,
      "loss": 0.147,
      "step": 455
    },
    {
      "epoch": 5.228571428571429,
      "grad_norm": 0.28584039211273193,
      "learning_rate": 0.00027475099529642886,
      "loss": 0.1333,
      "step": 460
    },
    {
      "epoch": 5.285714285714286,
      "grad_norm": 0.3031592071056366,
      "learning_rate": 0.0002698124892141971,
      "loss": 0.147,
      "step": 465
    },
    {
      "epoch": 5.3428571428571425,
      "grad_norm": 0.3030093312263489,
      "learning_rate": 0.00026486618996110777,
      "loss": 0.1298,
      "step": 470
    },
    {
      "epoch": 5.4,
      "grad_norm": 0.27724677324295044,
      "learning_rate": 0.0002599140431460531,
      "loss": 0.1406,
      "step": 475
    },
    {
      "epoch": 5.457142857142857,
      "grad_norm": 0.27622610330581665,
      "learning_rate": 0.00025495799667804255,
      "loss": 0.1225,
      "step": 480
    },
    {
      "epoch": 5.514285714285714,
      "grad_norm": 0.2955563962459564,
      "learning_rate": 0.00025,
      "loss": 0.1263,
      "step": 485
    },
    {
      "epoch": 5.571428571428571,
      "grad_norm": 0.3078777492046356,
      "learning_rate": 0.00024504200332195757,
      "loss": 0.1265,
      "step": 490
    },
    {
      "epoch": 5.628571428571428,
      "grad_norm": 0.33732086420059204,
      "learning_rate": 0.00024008595685394692,
      "loss": 0.1611,
      "step": 495
    },
    {
      "epoch": 5.685714285714286,
      "grad_norm": 0.29280802607536316,
      "learning_rate": 0.00023513381003889227,
      "loss": 0.1229,
      "step": 500
    },
    {
      "epoch": 5.685714285714286,
      "eval_loss": 1.0489529371261597,
      "eval_runtime": 29.6727,
      "eval_samples_per_second": 2.73,
      "eval_steps_per_second": 2.73,
      "step": 500
    },
    {
      "epoch": 5.742857142857143,
      "grad_norm": 0.2972449064254761,
      "learning_rate": 0.00023018751078580287,
      "loss": 0.138,
      "step": 505
    },
    {
      "epoch": 5.8,
      "grad_norm": 0.29159605503082275,
      "learning_rate": 0.00022524900470357118,
      "loss": 0.1351,
      "step": 510
    },
    {
      "epoch": 5.857142857142857,
      "grad_norm": 0.3165677785873413,
      "learning_rate": 0.00022032023433567378,
      "loss": 0.1371,
      "step": 515
    },
    {
      "epoch": 5.914285714285715,
      "grad_norm": 0.3170928657054901,
      "learning_rate": 0.0002154031383960793,
      "loss": 0.1494,
      "step": 520
    },
    {
      "epoch": 5.9714285714285715,
      "grad_norm": 0.2793048024177551,
      "learning_rate": 0.0002104996510066625,
      "loss": 0.139,
      "step": 525
    },
    {
      "epoch": 6.022857142857143,
      "grad_norm": 0.16289132833480835,
      "learning_rate": 0.00020561170093642424,
      "loss": 0.1127,
      "step": 530
    },
    {
      "epoch": 6.08,
      "grad_norm": 0.22523629665374756,
      "learning_rate": 0.00020074121084281678,
      "loss": 0.0794,
      "step": 535
    },
    {
      "epoch": 6.137142857142857,
      "grad_norm": 0.301932692527771,
      "learning_rate": 0.0001958900965154743,
      "loss": 0.0724,
      "step": 540
    },
    {
      "epoch": 6.194285714285714,
      "grad_norm": 0.21225464344024658,
      "learning_rate": 0.00019106026612264316,
      "loss": 0.0747,
      "step": 545
    },
    {
      "epoch": 6.251428571428572,
      "grad_norm": 0.17405834794044495,
      "learning_rate": 0.0001862536194606115,
      "loss": 0.0638,
      "step": 550
    },
    {
      "epoch": 6.308571428571429,
      "grad_norm": 0.22534868121147156,
      "learning_rate": 0.00018147204720643065,
      "loss": 0.0722,
      "step": 555
    },
    {
      "epoch": 6.365714285714286,
      "grad_norm": 0.2658616900444031,
      "learning_rate": 0.00017671743017422448,
      "loss": 0.0696,
      "step": 560
    },
    {
      "epoch": 6.422857142857143,
      "grad_norm": 0.27225354313850403,
      "learning_rate": 0.00017199163857537826,
      "loss": 0.0764,
      "step": 565
    },
    {
      "epoch": 6.48,
      "grad_norm": 0.2424718737602234,
      "learning_rate": 0.000167296531282897,
      "loss": 0.0725,
      "step": 570
    },
    {
      "epoch": 6.537142857142857,
      "grad_norm": 0.21518002450466156,
      "learning_rate": 0.00016263395510022544,
      "loss": 0.0688,
      "step": 575
    },
    {
      "epoch": 6.594285714285714,
      "grad_norm": 0.27070116996765137,
      "learning_rate": 0.00015800574403481376,
      "loss": 0.0814,
      "step": 580
    },
    {
      "epoch": 6.651428571428571,
      "grad_norm": 0.28266236186027527,
      "learning_rate": 0.00015341371857671783,
      "loss": 0.0789,
      "step": 585
    },
    {
      "epoch": 6.708571428571428,
      "grad_norm": 0.2508731782436371,
      "learning_rate": 0.00014885968498251623,
      "loss": 0.0677,
      "step": 590
    },
    {
      "epoch": 6.765714285714286,
      "grad_norm": 0.2912222146987915,
      "learning_rate": 0.0001443454345648252,
      "loss": 0.0755,
      "step": 595
    },
    {
      "epoch": 6.822857142857143,
      "grad_norm": 0.21923169493675232,
      "learning_rate": 0.00013987274298769264,
      "loss": 0.0728,
      "step": 600
    },
    {
      "epoch": 6.822857142857143,
      "eval_loss": 1.1974577903747559,
      "eval_runtime": 29.5963,
      "eval_samples_per_second": 2.737,
      "eval_steps_per_second": 2.737,
      "step": 600
    },
    {
      "epoch": 6.88,
      "grad_norm": 0.2651500999927521,
      "learning_rate": 0.0001354433695681474,
      "loss": 0.0799,
      "step": 605
    },
    {
      "epoch": 6.937142857142857,
      "grad_norm": 0.24103859066963196,
      "learning_rate": 0.00013105905658417755,
      "loss": 0.0779,
      "step": 610
    },
    {
      "epoch": 6.994285714285715,
      "grad_norm": 0.26907840371131897,
      "learning_rate": 0.00012672152858941244,
      "loss": 0.077,
      "step": 615
    },
    {
      "epoch": 7.045714285714285,
      "grad_norm": 0.12422758340835571,
      "learning_rate": 0.00012243249173477514,
      "loss": 0.0527,
      "step": 620
    },
    {
      "epoch": 7.102857142857143,
      "grad_norm": 0.1850104182958603,
      "learning_rate": 0.00011819363309737438,
      "loss": 0.0449,
      "step": 625
    },
    {
      "epoch": 7.16,
      "grad_norm": 0.2082287222146988,
      "learning_rate": 0.00011400662001689926,
      "loss": 0.04,
      "step": 630
    },
    {
      "epoch": 7.217142857142857,
      "grad_norm": 0.18963204324245453,
      "learning_rate": 0.00010987309943977646,
      "loss": 0.0434,
      "step": 635
    },
    {
      "epoch": 7.274285714285714,
      "grad_norm": 0.20188449323177338,
      "learning_rate": 0.00010579469727135068,
      "loss": 0.041,
      "step": 640
    },
    {
      "epoch": 7.331428571428571,
      "grad_norm": 0.19002296030521393,
      "learning_rate": 0.00010177301773633993,
      "loss": 0.0415,
      "step": 645
    },
    {
      "epoch": 7.388571428571429,
      "grad_norm": 0.16656257212162018,
      "learning_rate": 9.780964274781984e-05,
      "loss": 0.0384,
      "step": 650
    },
    {
      "epoch": 7.445714285714286,
      "grad_norm": 0.2054123878479004,
      "learning_rate": 9.390613128498418e-05,
      "loss": 0.0414,
      "step": 655
    },
    {
      "epoch": 7.502857142857143,
      "grad_norm": 0.19603969156742096,
      "learning_rate": 9.006401877992549e-05,
      "loss": 0.04,
      "step": 660
    },
    {
      "epoch": 7.5600000000000005,
      "grad_norm": 0.22474470734596252,
      "learning_rate": 8.628481651367875e-05,
      "loss": 0.0448,
      "step": 665
    },
    {
      "epoch": 7.617142857142857,
      "grad_norm": 0.1726624220609665,
      "learning_rate": 8.257001102176459e-05,
      "loss": 0.0368,
      "step": 670
    },
    {
      "epoch": 7.674285714285714,
      "grad_norm": 0.21024195849895477,
      "learning_rate": 7.892106350946543e-05,
      "loss": 0.0456,
      "step": 675
    },
    {
      "epoch": 7.731428571428571,
      "grad_norm": 0.1670810580253601,
      "learning_rate": 7.533940927706637e-05,
      "loss": 0.0436,
      "step": 680
    },
    {
      "epoch": 7.788571428571428,
      "grad_norm": 0.18880096077919006,
      "learning_rate": 7.182645715528436e-05,
      "loss": 0.039,
      "step": 685
    },
    {
      "epoch": 7.845714285714286,
      "grad_norm": 0.16823258996009827,
      "learning_rate": 6.838358895111e-05,
      "loss": 0.0418,
      "step": 690
    },
    {
      "epoch": 7.902857142857143,
      "grad_norm": 0.1878485232591629,
      "learning_rate": 6.501215890427908e-05,
      "loss": 0.0462,
      "step": 695
    },
    {
      "epoch": 7.96,
      "grad_norm": 0.17427195608615875,
      "learning_rate": 6.171349315458669e-05,
      "loss": 0.0398,
      "step": 700
    },
    {
      "epoch": 7.96,
      "eval_loss": 1.317841649055481,
      "eval_runtime": 29.5847,
      "eval_samples_per_second": 2.738,
      "eval_steps_per_second": 2.738,
      "step": 700
    },
    {
      "epoch": 8.01142857142857,
      "grad_norm": 0.11310245841741562,
      "learning_rate": 5.848888922025553e-05,
      "loss": 0.0364,
      "step": 705
    },
    {
      "epoch": 8.06857142857143,
      "grad_norm": 0.13256970047950745,
      "learning_rate": 5.533961548756128e-05,
      "loss": 0.0295,
      "step": 710
    },
    {
      "epoch": 8.125714285714286,
      "grad_norm": 0.12489226460456848,
      "learning_rate": 5.226691071191772e-05,
      "loss": 0.0281,
      "step": 715
    },
    {
      "epoch": 8.182857142857143,
      "grad_norm": 0.10919743031263351,
      "learning_rate": 4.9271983530617046e-05,
      "loss": 0.0258,
      "step": 720
    },
    {
      "epoch": 8.24,
      "grad_norm": 0.13559609651565552,
      "learning_rate": 4.6356011987416066e-05,
      "loss": 0.0272,
      "step": 725
    },
    {
      "epoch": 8.297142857142857,
      "grad_norm": 0.117083340883255,
      "learning_rate": 4.35201430691578e-05,
      "loss": 0.0285,
      "step": 730
    },
    {
      "epoch": 8.354285714285714,
      "grad_norm": 0.16256172955036163,
      "learning_rate": 4.076549225460757e-05,
      "loss": 0.0257,
      "step": 735
    },
    {
      "epoch": 8.411428571428571,
      "grad_norm": 0.1305255889892578,
      "learning_rate": 3.809314307568412e-05,
      "loss": 0.0255,
      "step": 740
    },
    {
      "epoch": 8.468571428571428,
      "grad_norm": 0.11966060847043991,
      "learning_rate": 3.550414669125573e-05,
      "loss": 0.0263,
      "step": 745
    },
    {
      "epoch": 8.525714285714285,
      "grad_norm": 0.17564554512500763,
      "learning_rate": 3.2999521473671136e-05,
      "loss": 0.0266,
      "step": 750
    },
    {
      "epoch": 8.582857142857144,
      "grad_norm": 0.1525678038597107,
      "learning_rate": 3.0580252608186086e-05,
      "loss": 0.0292,
      "step": 755
    },
    {
      "epoch": 8.64,
      "grad_norm": 0.20154449343681335,
      "learning_rate": 2.824729170544457e-05,
      "loss": 0.0304,
      "step": 760
    },
    {
      "epoch": 8.697142857142858,
      "grad_norm": 0.14955027401447296,
      "learning_rate": 2.6001556427166062e-05,
      "loss": 0.0277,
      "step": 765
    },
    {
      "epoch": 8.754285714285714,
      "grad_norm": 0.13878516852855682,
      "learning_rate": 2.3843930125186542e-05,
      "loss": 0.0255,
      "step": 770
    },
    {
      "epoch": 8.811428571428571,
      "grad_norm": 0.12101715803146362,
      "learning_rate": 2.177526149399556e-05,
      "loss": 0.0263,
      "step": 775
    },
    {
      "epoch": 8.868571428571428,
      "grad_norm": 0.11588730663061142,
      "learning_rate": 1.9796364236904924e-05,
      "loss": 0.0319,
      "step": 780
    },
    {
      "epoch": 8.925714285714285,
      "grad_norm": 0.15549886226654053,
      "learning_rate": 1.7908016745981858e-05,
      "loss": 0.0275,
      "step": 785
    },
    {
      "epoch": 8.982857142857142,
      "grad_norm": 0.11943016201257706,
      "learning_rate": 1.6110961795870906e-05,
      "loss": 0.0283,
      "step": 790
    },
    {
      "epoch": 9.034285714285714,
      "grad_norm": 0.10950807482004166,
      "learning_rate": 1.4405906251626494e-05,
      "loss": 0.0273,
      "step": 795
    },
    {
      "epoch": 9.09142857142857,
      "grad_norm": 0.12409886717796326,
      "learning_rate": 1.2793520790670116e-05,
      "loss": 0.0213,
      "step": 800
    },
    {
      "epoch": 9.09142857142857,
      "eval_loss": 1.399065613746643,
      "eval_runtime": 29.6981,
      "eval_samples_per_second": 2.727,
      "eval_steps_per_second": 2.727,
      "step": 800
    },
    {
      "epoch": 9.14857142857143,
      "grad_norm": 0.10451866686344147,
      "learning_rate": 1.1274439638981532e-05,
      "loss": 0.0219,
      "step": 805
    },
    {
      "epoch": 9.205714285714286,
      "grad_norm": 0.11612384021282196,
      "learning_rate": 9.849260321628667e-06,
      "loss": 0.0239,
      "step": 810
    },
    {
      "epoch": 9.262857142857143,
      "grad_norm": 0.10945022851228714,
      "learning_rate": 8.51854342773295e-06,
      "loss": 0.0233,
      "step": 815
    },
    {
      "epoch": 9.32,
      "grad_norm": 0.11814412474632263,
      "learning_rate": 7.282812389963784e-06,
      "loss": 0.0221,
      "step": 820
    },
    {
      "epoch": 9.377142857142857,
      "grad_norm": 0.12562333047389984,
      "learning_rate": 6.142553278648239e-06,
      "loss": 0.0234,
      "step": 825
    },
    {
      "epoch": 9.434285714285714,
      "grad_norm": 0.11704660952091217,
      "learning_rate": 5.0982146105769125e-06,
      "loss": 0.0277,
      "step": 830
    },
    {
      "epoch": 9.491428571428571,
      "grad_norm": 0.1091628223657608,
      "learning_rate": 4.150207172581522e-06,
      "loss": 0.0237,
      "step": 835
    },
    {
      "epoch": 9.548571428571428,
      "grad_norm": 0.11104241013526917,
      "learning_rate": 3.298903859953517e-06,
      "loss": 0.0239,
      "step": 840
    },
    {
      "epoch": 9.605714285714285,
      "grad_norm": 0.11044926196336746,
      "learning_rate": 2.544639529766829e-06,
      "loss": 0.0234,
      "step": 845
    },
    {
      "epoch": 9.662857142857142,
      "grad_norm": 0.12619182467460632,
      "learning_rate": 1.887710869163284e-06,
      "loss": 0.0233,
      "step": 850
    },
    {
      "epoch": 9.72,
      "grad_norm": 0.12324284762144089,
      "learning_rate": 1.328376278651705e-06,
      "loss": 0.023,
      "step": 855
    },
    {
      "epoch": 9.777142857142858,
      "grad_norm": 0.11614653468132019,
      "learning_rate": 8.668557704669122e-07,
      "loss": 0.0238,
      "step": 860
    },
    {
      "epoch": 9.834285714285715,
      "grad_norm": 0.13400106132030487,
      "learning_rate": 5.033308820289185e-07,
      "loss": 0.0237,
      "step": 865
    },
    {
      "epoch": 9.891428571428571,
      "grad_norm": 0.12201932817697525,
      "learning_rate": 2.3794460453555045e-07,
      "loss": 0.0216,
      "step": 870
    },
    {
      "epoch": 9.948571428571428,
      "grad_norm": 0.12058448791503906,
      "learning_rate": 7.080132671774542e-08,
      "loss": 0.0238,
      "step": 875
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.15471021831035614,
      "learning_rate": 1.966793778229725e-09,
      "loss": 0.0221,
      "step": 880
    },
    {
      "epoch": 10.0,
      "step": 880,
      "total_flos": 5.7149261611008e+16,
      "train_loss": 0.2784872981296344,
      "train_runtime": 8588.2113,
      "train_samples_per_second": 0.815,
      "train_steps_per_second": 0.102
    }
  ],
  "logging_steps": 5,
  "max_steps": 880,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 40,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.7149261611008e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}