{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 100, "global_step": 880, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05714285714285714, "grad_norm": 0.17341195046901703, "learning_rate": 2.272727272727273e-05, "loss": 0.9238, "step": 5 }, { "epoch": 0.11428571428571428, "grad_norm": 0.151872456073761, "learning_rate": 5.113636363636364e-05, "loss": 0.8658, "step": 10 }, { "epoch": 0.17142857142857143, "grad_norm": 0.1270148754119873, "learning_rate": 7.954545454545455e-05, "loss": 0.8384, "step": 15 }, { "epoch": 0.22857142857142856, "grad_norm": 0.13214267790317535, "learning_rate": 0.00010795454545454545, "loss": 0.7931, "step": 20 }, { "epoch": 0.2857142857142857, "grad_norm": 0.125133216381073, "learning_rate": 0.00013636363636363637, "loss": 0.7621, "step": 25 }, { "epoch": 0.34285714285714286, "grad_norm": 0.1484372466802597, "learning_rate": 0.00016477272727272727, "loss": 0.7396, "step": 30 }, { "epoch": 0.4, "grad_norm": 0.14502882957458496, "learning_rate": 0.00019318181818181817, "loss": 0.7783, "step": 35 }, { "epoch": 0.45714285714285713, "grad_norm": 0.1474684476852417, "learning_rate": 0.0002215909090909091, "loss": 0.7208, "step": 40 }, { "epoch": 0.5142857142857142, "grad_norm": 0.14219504594802856, "learning_rate": 0.00025, "loss": 0.7467, "step": 45 }, { "epoch": 0.5714285714285714, "grad_norm": 0.14778761565685272, "learning_rate": 0.0002784090909090909, "loss": 0.6922, "step": 50 }, { "epoch": 0.6285714285714286, "grad_norm": 0.1440540999174118, "learning_rate": 0.0003068181818181818, "loss": 0.6895, "step": 55 }, { "epoch": 0.6857142857142857, "grad_norm": 0.14411456882953644, "learning_rate": 0.00033522727272727274, "loss": 0.7601, "step": 60 }, { "epoch": 0.7428571428571429, "grad_norm": 0.1502826064825058, "learning_rate": 0.00036363636363636367, "loss": 0.6845, "step": 65 }, { "epoch": 0.8, "grad_norm": 0.15052765607833862, "learning_rate": 0.00039204545454545454, "loss": 0.6665, "step": 70 }, { "epoch": 0.8571428571428571, "grad_norm": 0.1469675898551941, "learning_rate": 0.0004204545454545455, "loss": 0.7189, "step": 75 }, { "epoch": 0.9142857142857143, "grad_norm": 0.136297807097435, "learning_rate": 0.00044886363636363635, "loss": 0.6771, "step": 80 }, { "epoch": 0.9714285714285714, "grad_norm": 0.15446212887763977, "learning_rate": 0.0004772727272727273, "loss": 0.6908, "step": 85 }, { "epoch": 1.022857142857143, "grad_norm": 0.15397129952907562, "learning_rate": 0.0004999980332062218, "loss": 0.7028, "step": 90 }, { "epoch": 1.08, "grad_norm": 0.15528780221939087, "learning_rate": 0.0004999291986732823, "loss": 0.6689, "step": 95 }, { "epoch": 1.1371428571428572, "grad_norm": 0.17444616556167603, "learning_rate": 0.0004997620553954645, "loss": 0.6345, "step": 100 }, { "epoch": 1.1371428571428572, "eval_loss": 0.6875521540641785, "eval_runtime": 29.6981, "eval_samples_per_second": 2.727, "eval_steps_per_second": 2.727, "step": 100 }, { "epoch": 1.1942857142857144, "grad_norm": 0.16984045505523682, "learning_rate": 0.0004994966691179711, "loss": 0.6399, "step": 105 }, { "epoch": 1.2514285714285713, "grad_norm": 0.18500125408172607, "learning_rate": 0.0004991331442295331, "loss": 0.6482, "step": 110 }, { "epoch": 1.3085714285714285, "grad_norm": 0.19161580502986908, "learning_rate": 0.0004986716237213483, "loss": 0.6554, "step": 115 }, { "epoch": 1.3657142857142857, "grad_norm": 0.17785383760929108, "learning_rate": 0.0004981122891308368, "loss": 0.5931, "step": 120 }, { "epoch": 1.4228571428571428, "grad_norm": 0.1681031882762909, "learning_rate": 0.0004974553604702333, "loss": 0.5979, "step": 125 }, { "epoch": 1.48, "grad_norm": 0.1733732372522354, "learning_rate": 0.0004967010961400466, "loss": 0.6648, "step": 130 }, { "epoch": 1.5371428571428571, "grad_norm": 0.1732897013425827, "learning_rate": 0.0004958497928274184, "loss": 0.6383, "step": 135 }, { "epoch": 1.5942857142857143, "grad_norm": 0.18160653114318848, "learning_rate": 0.000494901785389423, "loss": 0.6114, "step": 140 }, { "epoch": 1.6514285714285715, "grad_norm": 0.16878579556941986, "learning_rate": 0.0004938574467213517, "loss": 0.636, "step": 145 }, { "epoch": 1.7085714285714286, "grad_norm": 0.16055361926555634, "learning_rate": 0.0004927171876100363, "loss": 0.6337, "step": 150 }, { "epoch": 1.7657142857142856, "grad_norm": 0.17928151786327362, "learning_rate": 0.0004914814565722671, "loss": 0.61, "step": 155 }, { "epoch": 1.822857142857143, "grad_norm": 0.17349475622177124, "learning_rate": 0.0004901507396783714, "loss": 0.656, "step": 160 }, { "epoch": 1.88, "grad_norm": 0.17489437758922577, "learning_rate": 0.0004887255603610184, "loss": 0.6506, "step": 165 }, { "epoch": 1.9371428571428573, "grad_norm": 0.18316401541233063, "learning_rate": 0.00048720647920932994, "loss": 0.5985, "step": 170 }, { "epoch": 1.9942857142857142, "grad_norm": 0.16122691333293915, "learning_rate": 0.0004855940937483735, "loss": 0.6132, "step": 175 }, { "epoch": 2.045714285714286, "grad_norm": 0.1657329797744751, "learning_rate": 0.0004838890382041291, "loss": 0.5925, "step": 180 }, { "epoch": 2.1028571428571428, "grad_norm": 0.2137223184108734, "learning_rate": 0.00048209198325401817, "loss": 0.4924, "step": 185 }, { "epoch": 2.16, "grad_norm": 0.18751586973667145, "learning_rate": 0.0004802036357630951, "loss": 0.5103, "step": 190 }, { "epoch": 2.217142857142857, "grad_norm": 0.22524061799049377, "learning_rate": 0.00047822473850600447, "loss": 0.4742, "step": 195 }, { "epoch": 2.2742857142857145, "grad_norm": 0.23922263085842133, "learning_rate": 0.0004761560698748135, "loss": 0.472, "step": 200 }, { "epoch": 2.2742857142857145, "eval_loss": 0.7193492650985718, "eval_runtime": 29.6354, "eval_samples_per_second": 2.733, "eval_steps_per_second": 2.733, "step": 200 }, { "epoch": 2.3314285714285714, "grad_norm": 0.25317704677581787, "learning_rate": 0.00047399844357283395, "loss": 0.4624, "step": 205 }, { "epoch": 2.388571428571429, "grad_norm": 0.22575099766254425, "learning_rate": 0.0004717527082945554, "loss": 0.4803, "step": 210 }, { "epoch": 2.4457142857142857, "grad_norm": 0.22181181609630585, "learning_rate": 0.0004694197473918139, "loss": 0.4805, "step": 215 }, { "epoch": 2.5028571428571427, "grad_norm": 0.2090139091014862, "learning_rate": 0.0004670004785263289, "loss": 0.5365, "step": 220 }, { "epoch": 2.56, "grad_norm": 0.24724529683589935, "learning_rate": 0.0004644958533087443, "loss": 0.5005, "step": 225 }, { "epoch": 2.617142857142857, "grad_norm": 0.22190535068511963, "learning_rate": 0.0004619068569243159, "loss": 0.4974, "step": 230 }, { "epoch": 2.6742857142857144, "grad_norm": 0.21910065412521362, "learning_rate": 0.00045923450774539243, "loss": 0.5322, "step": 235 }, { "epoch": 2.7314285714285713, "grad_norm": 0.24561282992362976, "learning_rate": 0.0004564798569308423, "loss": 0.5154, "step": 240 }, { "epoch": 2.7885714285714287, "grad_norm": 0.22541281580924988, "learning_rate": 0.00045364398801258396, "loss": 0.4968, "step": 245 }, { "epoch": 2.8457142857142856, "grad_norm": 0.21167173981666565, "learning_rate": 0.000450728016469383, "loss": 0.5049, "step": 250 }, { "epoch": 2.902857142857143, "grad_norm": 0.20972833037376404, "learning_rate": 0.0004477330892880823, "loss": 0.5303, "step": 255 }, { "epoch": 2.96, "grad_norm": 0.21511210501194, "learning_rate": 0.0004446603845124388, "loss": 0.5203, "step": 260 }, { "epoch": 3.0114285714285716, "grad_norm": 0.20611043274402618, "learning_rate": 0.0004415111107797445, "loss": 0.5064, "step": 265 }, { "epoch": 3.0685714285714285, "grad_norm": 0.2586754262447357, "learning_rate": 0.0004382865068454133, "loss": 0.3602, "step": 270 }, { "epoch": 3.125714285714286, "grad_norm": 0.25579819083213806, "learning_rate": 0.00043498784109572097, "loss": 0.3523, "step": 275 }, { "epoch": 3.182857142857143, "grad_norm": 0.2642401456832886, "learning_rate": 0.00043161641104889003, "loss": 0.3604, "step": 280 }, { "epoch": 3.24, "grad_norm": 0.2600599527359009, "learning_rate": 0.00042817354284471575, "loss": 0.3497, "step": 285 }, { "epoch": 3.297142857142857, "grad_norm": 0.2751370966434479, "learning_rate": 0.00042466059072293367, "loss": 0.3525, "step": 290 }, { "epoch": 3.354285714285714, "grad_norm": 0.2760413885116577, "learning_rate": 0.00042107893649053456, "loss": 0.3369, "step": 295 }, { "epoch": 3.4114285714285715, "grad_norm": 0.28514841198921204, "learning_rate": 0.0004174299889782355, "loss": 0.3499, "step": 300 }, { "epoch": 3.4114285714285715, "eval_loss": 0.7963736057281494, "eval_runtime": 29.7356, "eval_samples_per_second": 2.724, "eval_steps_per_second": 2.724, "step": 300 }, { "epoch": 3.4685714285714284, "grad_norm": 0.28093528747558594, "learning_rate": 0.0004137151834863213, "loss": 0.3601, "step": 305 }, { "epoch": 3.525714285714286, "grad_norm": 0.2521819472312927, "learning_rate": 0.0004099359812200746, "loss": 0.3768, "step": 310 }, { "epoch": 3.5828571428571427, "grad_norm": 0.2599101960659027, "learning_rate": 0.00040609386871501583, "loss": 0.3248, "step": 315 }, { "epoch": 3.64, "grad_norm": 0.277692049741745, "learning_rate": 0.0004021903572521802, "loss": 0.363, "step": 320 }, { "epoch": 3.697142857142857, "grad_norm": 0.2820269763469696, "learning_rate": 0.00039822698226366017, "loss": 0.3676, "step": 325 }, { "epoch": 3.7542857142857144, "grad_norm": 0.28230783343315125, "learning_rate": 0.00039420530272864934, "loss": 0.3556, "step": 330 }, { "epoch": 3.8114285714285714, "grad_norm": 0.3273780941963196, "learning_rate": 0.0003901269005602235, "loss": 0.3656, "step": 335 }, { "epoch": 3.8685714285714283, "grad_norm": 0.28806638717651367, "learning_rate": 0.0003859933799831008, "loss": 0.3499, "step": 340 }, { "epoch": 3.9257142857142857, "grad_norm": 0.30954381823539734, "learning_rate": 0.00038180636690262563, "loss": 0.392, "step": 345 }, { "epoch": 3.982857142857143, "grad_norm": 0.29088252782821655, "learning_rate": 0.000377567508265225, "loss": 0.3736, "step": 350 }, { "epoch": 4.034285714285715, "grad_norm": 0.22520305216312408, "learning_rate": 0.0003732784714105876, "loss": 0.2785, "step": 355 }, { "epoch": 4.091428571428572, "grad_norm": 0.3518202602863312, "learning_rate": 0.0003689409434158224, "loss": 0.2139, "step": 360 }, { "epoch": 4.148571428571429, "grad_norm": 0.2898755967617035, "learning_rate": 0.0003645566304318526, "loss": 0.2234, "step": 365 }, { "epoch": 4.2057142857142855, "grad_norm": 0.3240414261817932, "learning_rate": 0.00036012725701230734, "loss": 0.2342, "step": 370 }, { "epoch": 4.2628571428571425, "grad_norm": 0.28856348991394043, "learning_rate": 0.00035565456543517487, "loss": 0.2117, "step": 375 }, { "epoch": 4.32, "grad_norm": 0.3389490246772766, "learning_rate": 0.0003511403150174838, "loss": 0.2435, "step": 380 }, { "epoch": 4.377142857142857, "grad_norm": 0.31090596318244934, "learning_rate": 0.00034658628142328216, "loss": 0.2281, "step": 385 }, { "epoch": 4.434285714285714, "grad_norm": 0.3269132077693939, "learning_rate": 0.0003419942559651863, "loss": 0.2637, "step": 390 }, { "epoch": 4.491428571428571, "grad_norm": 0.30986836552619934, "learning_rate": 0.0003373660448997746, "loss": 0.228, "step": 395 }, { "epoch": 4.548571428571429, "grad_norm": 0.28136685490608215, "learning_rate": 0.000332703468717103, "loss": 0.2457, "step": 400 }, { "epoch": 4.548571428571429, "eval_loss": 0.9142104983329773, "eval_runtime": 29.7632, "eval_samples_per_second": 2.721, "eval_steps_per_second": 2.721, "step": 400 }, { "epoch": 4.605714285714286, "grad_norm": 0.3222131133079529, "learning_rate": 0.00032800836142462175, "loss": 0.2238, "step": 405 }, { "epoch": 4.662857142857143, "grad_norm": 0.3103245496749878, "learning_rate": 0.0003232825698257755, "loss": 0.2444, "step": 410 }, { "epoch": 4.72, "grad_norm": 0.319525808095932, "learning_rate": 0.00031852795279356945, "loss": 0.2498, "step": 415 }, { "epoch": 4.777142857142858, "grad_norm": 0.2778545320034027, "learning_rate": 0.0003137463805393885, "loss": 0.231, "step": 420 }, { "epoch": 4.8342857142857145, "grad_norm": 0.3173794448375702, "learning_rate": 0.0003089397338773569, "loss": 0.2471, "step": 425 }, { "epoch": 4.8914285714285715, "grad_norm": 0.3207133710384369, "learning_rate": 0.00030410990348452574, "loss": 0.2302, "step": 430 }, { "epoch": 4.948571428571428, "grad_norm": 0.26637086272239685, "learning_rate": 0.0002992587891571833, "loss": 0.2244, "step": 435 }, { "epoch": 5.0, "grad_norm": 0.48891177773475647, "learning_rate": 0.0002943882990635759, "loss": 0.2451, "step": 440 }, { "epoch": 5.057142857142857, "grad_norm": 0.23097443580627441, "learning_rate": 0.0002895003489933375, "loss": 0.135, "step": 445 }, { "epoch": 5.114285714285714, "grad_norm": 0.30315983295440674, "learning_rate": 0.0002845968616039207, "loss": 0.1345, "step": 450 }, { "epoch": 5.171428571428572, "grad_norm": 0.29658541083335876, "learning_rate": 0.0002796797656643263, "loss": 0.147, "step": 455 }, { "epoch": 5.228571428571429, "grad_norm": 0.28584039211273193, "learning_rate": 0.00027475099529642886, "loss": 0.1333, "step": 460 }, { "epoch": 5.285714285714286, "grad_norm": 0.3031592071056366, "learning_rate": 0.0002698124892141971, "loss": 0.147, "step": 465 }, { "epoch": 5.3428571428571425, "grad_norm": 0.3030093312263489, "learning_rate": 0.00026486618996110777, "loss": 0.1298, "step": 470 }, { "epoch": 5.4, "grad_norm": 0.27724677324295044, "learning_rate": 0.0002599140431460531, "loss": 0.1406, "step": 475 }, { "epoch": 5.457142857142857, "grad_norm": 0.27622610330581665, "learning_rate": 0.00025495799667804255, "loss": 0.1225, "step": 480 }, { "epoch": 5.514285714285714, "grad_norm": 0.2955563962459564, "learning_rate": 0.00025, "loss": 0.1263, "step": 485 }, { "epoch": 5.571428571428571, "grad_norm": 0.3078777492046356, "learning_rate": 0.00024504200332195757, "loss": 0.1265, "step": 490 }, { "epoch": 5.628571428571428, "grad_norm": 0.33732086420059204, "learning_rate": 0.00024008595685394692, "loss": 0.1611, "step": 495 }, { "epoch": 5.685714285714286, "grad_norm": 0.29280802607536316, "learning_rate": 0.00023513381003889227, "loss": 0.1229, "step": 500 }, { "epoch": 5.685714285714286, "eval_loss": 1.0489529371261597, "eval_runtime": 29.6727, "eval_samples_per_second": 2.73, "eval_steps_per_second": 2.73, "step": 500 }, { "epoch": 5.742857142857143, "grad_norm": 0.2972449064254761, "learning_rate": 0.00023018751078580287, "loss": 0.138, "step": 505 }, { "epoch": 5.8, "grad_norm": 0.29159605503082275, "learning_rate": 0.00022524900470357118, "loss": 0.1351, "step": 510 }, { "epoch": 5.857142857142857, "grad_norm": 0.3165677785873413, "learning_rate": 0.00022032023433567378, "loss": 0.1371, "step": 515 }, { "epoch": 5.914285714285715, "grad_norm": 0.3170928657054901, "learning_rate": 0.0002154031383960793, "loss": 0.1494, "step": 520 }, { "epoch": 5.9714285714285715, "grad_norm": 0.2793048024177551, "learning_rate": 0.0002104996510066625, "loss": 0.139, "step": 525 }, { "epoch": 6.022857142857143, "grad_norm": 0.16289132833480835, "learning_rate": 0.00020561170093642424, "loss": 0.1127, "step": 530 }, { "epoch": 6.08, "grad_norm": 0.22523629665374756, "learning_rate": 0.00020074121084281678, "loss": 0.0794, "step": 535 }, { "epoch": 6.137142857142857, "grad_norm": 0.301932692527771, "learning_rate": 0.0001958900965154743, "loss": 0.0724, "step": 540 }, { "epoch": 6.194285714285714, "grad_norm": 0.21225464344024658, "learning_rate": 0.00019106026612264316, "loss": 0.0747, "step": 545 }, { "epoch": 6.251428571428572, "grad_norm": 0.17405834794044495, "learning_rate": 0.0001862536194606115, "loss": 0.0638, "step": 550 }, { "epoch": 6.308571428571429, "grad_norm": 0.22534868121147156, "learning_rate": 0.00018147204720643065, "loss": 0.0722, "step": 555 }, { "epoch": 6.365714285714286, "grad_norm": 0.2658616900444031, "learning_rate": 0.00017671743017422448, "loss": 0.0696, "step": 560 }, { "epoch": 6.422857142857143, "grad_norm": 0.27225354313850403, "learning_rate": 0.00017199163857537826, "loss": 0.0764, "step": 565 }, { "epoch": 6.48, "grad_norm": 0.2424718737602234, "learning_rate": 0.000167296531282897, "loss": 0.0725, "step": 570 }, { "epoch": 6.537142857142857, "grad_norm": 0.21518002450466156, "learning_rate": 0.00016263395510022544, "loss": 0.0688, "step": 575 }, { "epoch": 6.594285714285714, "grad_norm": 0.27070116996765137, "learning_rate": 0.00015800574403481376, "loss": 0.0814, "step": 580 }, { "epoch": 6.651428571428571, "grad_norm": 0.28266236186027527, "learning_rate": 0.00015341371857671783, "loss": 0.0789, "step": 585 }, { "epoch": 6.708571428571428, "grad_norm": 0.2508731782436371, "learning_rate": 0.00014885968498251623, "loss": 0.0677, "step": 590 }, { "epoch": 6.765714285714286, "grad_norm": 0.2912222146987915, "learning_rate": 0.0001443454345648252, "loss": 0.0755, "step": 595 }, { "epoch": 6.822857142857143, "grad_norm": 0.21923169493675232, "learning_rate": 0.00013987274298769264, "loss": 0.0728, "step": 600 }, { "epoch": 6.822857142857143, "eval_loss": 1.1974577903747559, "eval_runtime": 29.5963, "eval_samples_per_second": 2.737, "eval_steps_per_second": 2.737, "step": 600 }, { "epoch": 6.88, "grad_norm": 0.2651500999927521, "learning_rate": 0.0001354433695681474, "loss": 0.0799, "step": 605 }, { "epoch": 6.937142857142857, "grad_norm": 0.24103859066963196, "learning_rate": 0.00013105905658417755, "loss": 0.0779, "step": 610 }, { "epoch": 6.994285714285715, "grad_norm": 0.26907840371131897, "learning_rate": 0.00012672152858941244, "loss": 0.077, "step": 615 }, { "epoch": 7.045714285714285, "grad_norm": 0.12422758340835571, "learning_rate": 0.00012243249173477514, "loss": 0.0527, "step": 620 }, { "epoch": 7.102857142857143, "grad_norm": 0.1850104182958603, "learning_rate": 0.00011819363309737438, "loss": 0.0449, "step": 625 }, { "epoch": 7.16, "grad_norm": 0.2082287222146988, "learning_rate": 0.00011400662001689926, "loss": 0.04, "step": 630 }, { "epoch": 7.217142857142857, "grad_norm": 0.18963204324245453, "learning_rate": 0.00010987309943977646, "loss": 0.0434, "step": 635 }, { "epoch": 7.274285714285714, "grad_norm": 0.20188449323177338, "learning_rate": 0.00010579469727135068, "loss": 0.041, "step": 640 }, { "epoch": 7.331428571428571, "grad_norm": 0.19002296030521393, "learning_rate": 0.00010177301773633993, "loss": 0.0415, "step": 645 }, { "epoch": 7.388571428571429, "grad_norm": 0.16656257212162018, "learning_rate": 9.780964274781984e-05, "loss": 0.0384, "step": 650 }, { "epoch": 7.445714285714286, "grad_norm": 0.2054123878479004, "learning_rate": 9.390613128498418e-05, "loss": 0.0414, "step": 655 }, { "epoch": 7.502857142857143, "grad_norm": 0.19603969156742096, "learning_rate": 9.006401877992549e-05, "loss": 0.04, "step": 660 }, { "epoch": 7.5600000000000005, "grad_norm": 0.22474470734596252, "learning_rate": 8.628481651367875e-05, "loss": 0.0448, "step": 665 }, { "epoch": 7.617142857142857, "grad_norm": 0.1726624220609665, "learning_rate": 8.257001102176459e-05, "loss": 0.0368, "step": 670 }, { "epoch": 7.674285714285714, "grad_norm": 0.21024195849895477, "learning_rate": 7.892106350946543e-05, "loss": 0.0456, "step": 675 }, { "epoch": 7.731428571428571, "grad_norm": 0.1670810580253601, "learning_rate": 7.533940927706637e-05, "loss": 0.0436, "step": 680 }, { "epoch": 7.788571428571428, "grad_norm": 0.18880096077919006, "learning_rate": 7.182645715528436e-05, "loss": 0.039, "step": 685 }, { "epoch": 7.845714285714286, "grad_norm": 0.16823258996009827, "learning_rate": 6.838358895111e-05, "loss": 0.0418, "step": 690 }, { "epoch": 7.902857142857143, "grad_norm": 0.1878485232591629, "learning_rate": 6.501215890427908e-05, "loss": 0.0462, "step": 695 }, { "epoch": 7.96, "grad_norm": 0.17427195608615875, "learning_rate": 6.171349315458669e-05, "loss": 0.0398, "step": 700 }, { "epoch": 7.96, "eval_loss": 1.317841649055481, "eval_runtime": 29.5847, "eval_samples_per_second": 2.738, "eval_steps_per_second": 2.738, "step": 700 }, { "epoch": 8.01142857142857, "grad_norm": 0.11310245841741562, "learning_rate": 5.848888922025553e-05, "loss": 0.0364, "step": 705 }, { "epoch": 8.06857142857143, "grad_norm": 0.13256970047950745, "learning_rate": 5.533961548756128e-05, "loss": 0.0295, "step": 710 }, { "epoch": 8.125714285714286, "grad_norm": 0.12489226460456848, "learning_rate": 5.226691071191772e-05, "loss": 0.0281, "step": 715 }, { "epoch": 8.182857142857143, "grad_norm": 0.10919743031263351, "learning_rate": 4.9271983530617046e-05, "loss": 0.0258, "step": 720 }, { "epoch": 8.24, "grad_norm": 0.13559609651565552, "learning_rate": 4.6356011987416066e-05, "loss": 0.0272, "step": 725 }, { "epoch": 8.297142857142857, "grad_norm": 0.117083340883255, "learning_rate": 4.35201430691578e-05, "loss": 0.0285, "step": 730 }, { "epoch": 8.354285714285714, "grad_norm": 0.16256172955036163, "learning_rate": 4.076549225460757e-05, "loss": 0.0257, "step": 735 }, { "epoch": 8.411428571428571, "grad_norm": 0.1305255889892578, "learning_rate": 3.809314307568412e-05, "loss": 0.0255, "step": 740 }, { "epoch": 8.468571428571428, "grad_norm": 0.11966060847043991, "learning_rate": 3.550414669125573e-05, "loss": 0.0263, "step": 745 }, { "epoch": 8.525714285714285, "grad_norm": 0.17564554512500763, "learning_rate": 3.2999521473671136e-05, "loss": 0.0266, "step": 750 }, { "epoch": 8.582857142857144, "grad_norm": 0.1525678038597107, "learning_rate": 3.0580252608186086e-05, "loss": 0.0292, "step": 755 }, { "epoch": 8.64, "grad_norm": 0.20154449343681335, "learning_rate": 2.824729170544457e-05, "loss": 0.0304, "step": 760 }, { "epoch": 8.697142857142858, "grad_norm": 0.14955027401447296, "learning_rate": 2.6001556427166062e-05, "loss": 0.0277, "step": 765 }, { "epoch": 8.754285714285714, "grad_norm": 0.13878516852855682, "learning_rate": 2.3843930125186542e-05, "loss": 0.0255, "step": 770 }, { "epoch": 8.811428571428571, "grad_norm": 0.12101715803146362, "learning_rate": 2.177526149399556e-05, "loss": 0.0263, "step": 775 }, { "epoch": 8.868571428571428, "grad_norm": 0.11588730663061142, "learning_rate": 1.9796364236904924e-05, "loss": 0.0319, "step": 780 }, { "epoch": 8.925714285714285, "grad_norm": 0.15549886226654053, "learning_rate": 1.7908016745981858e-05, "loss": 0.0275, "step": 785 }, { "epoch": 8.982857142857142, "grad_norm": 0.11943016201257706, "learning_rate": 1.6110961795870906e-05, "loss": 0.0283, "step": 790 }, { "epoch": 9.034285714285714, "grad_norm": 0.10950807482004166, "learning_rate": 1.4405906251626494e-05, "loss": 0.0273, "step": 795 }, { "epoch": 9.09142857142857, "grad_norm": 0.12409886717796326, "learning_rate": 1.2793520790670116e-05, "loss": 0.0213, "step": 800 }, { "epoch": 9.09142857142857, "eval_loss": 1.399065613746643, "eval_runtime": 29.6981, "eval_samples_per_second": 2.727, "eval_steps_per_second": 2.727, "step": 800 }, { "epoch": 9.14857142857143, "grad_norm": 0.10451866686344147, "learning_rate": 1.1274439638981532e-05, "loss": 0.0219, "step": 805 }, { "epoch": 9.205714285714286, "grad_norm": 0.11612384021282196, "learning_rate": 9.849260321628667e-06, "loss": 0.0239, "step": 810 }, { "epoch": 9.262857142857143, "grad_norm": 0.10945022851228714, "learning_rate": 8.51854342773295e-06, "loss": 0.0233, "step": 815 }, { "epoch": 9.32, "grad_norm": 0.11814412474632263, "learning_rate": 7.282812389963784e-06, "loss": 0.0221, "step": 820 }, { "epoch": 9.377142857142857, "grad_norm": 0.12562333047389984, "learning_rate": 6.142553278648239e-06, "loss": 0.0234, "step": 825 }, { "epoch": 9.434285714285714, "grad_norm": 0.11704660952091217, "learning_rate": 5.0982146105769125e-06, "loss": 0.0277, "step": 830 }, { "epoch": 9.491428571428571, "grad_norm": 0.1091628223657608, "learning_rate": 4.150207172581522e-06, "loss": 0.0237, "step": 835 }, { "epoch": 9.548571428571428, "grad_norm": 0.11104241013526917, "learning_rate": 3.298903859953517e-06, "loss": 0.0239, "step": 840 }, { "epoch": 9.605714285714285, "grad_norm": 0.11044926196336746, "learning_rate": 2.544639529766829e-06, "loss": 0.0234, "step": 845 }, { "epoch": 9.662857142857142, "grad_norm": 0.12619182467460632, "learning_rate": 1.887710869163284e-06, "loss": 0.0233, "step": 850 }, { "epoch": 9.72, "grad_norm": 0.12324284762144089, "learning_rate": 1.328376278651705e-06, "loss": 0.023, "step": 855 }, { "epoch": 9.777142857142858, "grad_norm": 0.11614653468132019, "learning_rate": 8.668557704669122e-07, "loss": 0.0238, "step": 860 }, { "epoch": 9.834285714285715, "grad_norm": 0.13400106132030487, "learning_rate": 5.033308820289185e-07, "loss": 0.0237, "step": 865 }, { "epoch": 9.891428571428571, "grad_norm": 0.12201932817697525, "learning_rate": 2.3794460453555045e-07, "loss": 0.0216, "step": 870 }, { "epoch": 9.948571428571428, "grad_norm": 0.12058448791503906, "learning_rate": 7.080132671774542e-08, "loss": 0.0238, "step": 875 }, { "epoch": 10.0, "grad_norm": 0.15471021831035614, "learning_rate": 1.966793778229725e-09, "loss": 0.0221, "step": 880 }, { "epoch": 10.0, "step": 880, "total_flos": 5.7149261611008e+16, "train_loss": 0.2784872981296344, "train_runtime": 8588.2113, "train_samples_per_second": 0.815, "train_steps_per_second": 0.102 } ], "logging_steps": 5, "max_steps": 880, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 40, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.7149261611008e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }