{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 790,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03164556962025317,
      "grad_norm": 5.988588333129883,
      "learning_rate": 0.0002,
      "loss": 2.7545,
      "step": 5
    },
    {
      "epoch": 0.06329113924050633,
      "grad_norm": 1.671600580215454,
      "learning_rate": 0.00019872773536895675,
      "loss": 0.7945,
      "step": 10
    },
    {
      "epoch": 0.0949367088607595,
      "grad_norm": 1.5613313913345337,
      "learning_rate": 0.00019745547073791352,
      "loss": 0.5902,
      "step": 15
    },
    {
      "epoch": 0.12658227848101267,
      "grad_norm": 0.9209323525428772,
      "learning_rate": 0.00019618320610687023,
      "loss": 0.4814,
      "step": 20
    },
    {
      "epoch": 0.15822784810126583,
      "grad_norm": 0.7866256237030029,
      "learning_rate": 0.00019491094147582698,
      "loss": 0.4366,
      "step": 25
    },
    {
      "epoch": 0.189873417721519,
      "grad_norm": 0.7535956501960754,
      "learning_rate": 0.00019363867684478372,
      "loss": 0.4238,
      "step": 30
    },
    {
      "epoch": 0.22151898734177214,
      "grad_norm": 0.6957400441169739,
      "learning_rate": 0.00019236641221374049,
      "loss": 0.5012,
      "step": 35
    },
    {
      "epoch": 0.25316455696202533,
      "grad_norm": 0.7030977010726929,
      "learning_rate": 0.00019109414758269723,
      "loss": 0.4676,
      "step": 40
    },
    {
      "epoch": 0.2848101265822785,
      "grad_norm": 0.741550624370575,
      "learning_rate": 0.00018982188295165394,
      "loss": 0.4893,
      "step": 45
    },
    {
      "epoch": 0.31645569620253167,
      "grad_norm": 0.5580260753631592,
      "learning_rate": 0.00018854961832061068,
      "loss": 0.4825,
      "step": 50
    },
    {
      "epoch": 0.34810126582278483,
      "grad_norm": 0.5945926308631897,
      "learning_rate": 0.00018727735368956745,
      "loss": 0.495,
      "step": 55
    },
    {
      "epoch": 0.379746835443038,
      "grad_norm": 0.570940375328064,
      "learning_rate": 0.0001860050890585242,
      "loss": 0.4557,
      "step": 60
    },
    {
      "epoch": 0.41139240506329117,
      "grad_norm": 0.6694577932357788,
      "learning_rate": 0.00018473282442748093,
      "loss": 0.4674,
      "step": 65
    },
    {
      "epoch": 0.4430379746835443,
      "grad_norm": 0.6412336826324463,
      "learning_rate": 0.00018346055979643765,
      "loss": 0.4305,
      "step": 70
    },
    {
      "epoch": 0.47468354430379744,
      "grad_norm": 0.6980250477790833,
      "learning_rate": 0.00018218829516539442,
      "loss": 0.4357,
      "step": 75
    },
    {
      "epoch": 0.5063291139240507,
      "grad_norm": 0.48515501618385315,
      "learning_rate": 0.00018091603053435116,
      "loss": 0.4285,
      "step": 80
    },
    {
      "epoch": 0.5379746835443038,
      "grad_norm": 0.6025907397270203,
      "learning_rate": 0.0001796437659033079,
      "loss": 0.45,
      "step": 85
    },
    {
      "epoch": 0.569620253164557,
      "grad_norm": 0.5797450542449951,
      "learning_rate": 0.00017837150127226464,
      "loss": 0.5033,
      "step": 90
    },
    {
      "epoch": 0.6012658227848101,
      "grad_norm": 0.6783467531204224,
      "learning_rate": 0.00017709923664122138,
      "loss": 0.4174,
      "step": 95
    },
    {
      "epoch": 0.6329113924050633,
      "grad_norm": 0.5603845119476318,
      "learning_rate": 0.00017582697201017812,
      "loss": 0.4505,
      "step": 100
    },
    {
      "epoch": 0.6645569620253164,
      "grad_norm": 0.7022290229797363,
      "learning_rate": 0.00017455470737913486,
      "loss": 0.5161,
      "step": 105
    },
    {
      "epoch": 0.6962025316455697,
      "grad_norm": 0.6286556124687195,
      "learning_rate": 0.00017328244274809163,
      "loss": 0.4525,
      "step": 110
    },
    {
      "epoch": 0.7278481012658228,
      "grad_norm": 0.7144973874092102,
      "learning_rate": 0.00017201017811704835,
      "loss": 0.5068,
      "step": 115
    },
    {
      "epoch": 0.759493670886076,
      "grad_norm": 0.55781090259552,
      "learning_rate": 0.0001707379134860051,
      "loss": 0.4385,
      "step": 120
    },
    {
      "epoch": 0.7911392405063291,
      "grad_norm": 0.5584812760353088,
      "learning_rate": 0.00016946564885496183,
      "loss": 0.4206,
      "step": 125
    },
    {
      "epoch": 0.8227848101265823,
      "grad_norm": 0.7030683755874634,
      "learning_rate": 0.0001681933842239186,
      "loss": 0.4833,
      "step": 130
    },
    {
      "epoch": 0.8544303797468354,
      "grad_norm": 0.6400471329689026,
      "learning_rate": 0.00016692111959287534,
      "loss": 0.4646,
      "step": 135
    },
    {
      "epoch": 0.8860759493670886,
      "grad_norm": 0.5747826099395752,
      "learning_rate": 0.00016564885496183205,
      "loss": 0.4334,
      "step": 140
    },
    {
      "epoch": 0.9177215189873418,
      "grad_norm": 0.519247829914093,
      "learning_rate": 0.0001643765903307888,
      "loss": 0.4365,
      "step": 145
    },
    {
      "epoch": 0.9493670886075949,
      "grad_norm": 0.6712743639945984,
      "learning_rate": 0.00016310432569974556,
      "loss": 0.4196,
      "step": 150
    },
    {
      "epoch": 0.9810126582278481,
      "grad_norm": 0.8766248226165771,
      "learning_rate": 0.0001618320610687023,
      "loss": 0.44,
      "step": 155
    },
    {
      "epoch": 1.0126582278481013,
      "grad_norm": 0.432377427816391,
      "learning_rate": 0.00016055979643765905,
      "loss": 0.3616,
      "step": 160
    },
    {
      "epoch": 1.0443037974683544,
      "grad_norm": 0.5202181339263916,
      "learning_rate": 0.0001592875318066158,
      "loss": 0.3763,
      "step": 165
    },
    {
      "epoch": 1.0759493670886076,
      "grad_norm": 0.5511195659637451,
      "learning_rate": 0.00015801526717557253,
      "loss": 0.3291,
      "step": 170
    },
    {
      "epoch": 1.1075949367088607,
      "grad_norm": 0.6027284264564514,
      "learning_rate": 0.00015674300254452927,
      "loss": 0.3141,
      "step": 175
    },
    {
      "epoch": 1.139240506329114,
      "grad_norm": 0.6925147175788879,
      "learning_rate": 0.000155470737913486,
      "loss": 0.3651,
      "step": 180
    },
    {
      "epoch": 1.1708860759493671,
      "grad_norm": 0.6030688285827637,
      "learning_rate": 0.00015419847328244275,
      "loss": 0.3411,
      "step": 185
    },
    {
      "epoch": 1.2025316455696202,
      "grad_norm": 0.5992720127105713,
      "learning_rate": 0.0001529262086513995,
      "loss": 0.3508,
      "step": 190
    },
    {
      "epoch": 1.2341772151898733,
      "grad_norm": 0.7508724331855774,
      "learning_rate": 0.00015165394402035624,
      "loss": 0.3284,
      "step": 195
    },
    {
      "epoch": 1.2658227848101267,
      "grad_norm": 0.7126018404960632,
      "learning_rate": 0.00015038167938931298,
      "loss": 0.3466,
      "step": 200
    },
    {
      "epoch": 1.2974683544303798,
      "grad_norm": 0.8017547130584717,
      "learning_rate": 0.00014910941475826972,
      "loss": 0.3485,
      "step": 205
    },
    {
      "epoch": 1.3291139240506329,
      "grad_norm": 0.7311880588531494,
      "learning_rate": 0.0001478371501272265,
      "loss": 0.3566,
      "step": 210
    },
    {
      "epoch": 1.360759493670886,
      "grad_norm": 0.7723591327667236,
      "learning_rate": 0.0001465648854961832,
      "loss": 0.3329,
      "step": 215
    },
    {
      "epoch": 1.3924050632911391,
      "grad_norm": 0.8075547814369202,
      "learning_rate": 0.00014529262086513994,
      "loss": 0.3584,
      "step": 220
    },
    {
      "epoch": 1.4240506329113924,
      "grad_norm": 0.5989384055137634,
      "learning_rate": 0.0001440203562340967,
      "loss": 0.371,
      "step": 225
    },
    {
      "epoch": 1.4556962025316456,
      "grad_norm": 0.678035318851471,
      "learning_rate": 0.00014274809160305345,
      "loss": 0.3448,
      "step": 230
    },
    {
      "epoch": 1.4873417721518987,
      "grad_norm": 0.8693724274635315,
      "learning_rate": 0.0001414758269720102,
      "loss": 0.3644,
      "step": 235
    },
    {
      "epoch": 1.518987341772152,
      "grad_norm": 0.6025015115737915,
      "learning_rate": 0.0001402035623409669,
      "loss": 0.3233,
      "step": 240
    },
    {
      "epoch": 1.5506329113924051,
      "grad_norm": 0.679233729839325,
      "learning_rate": 0.00013893129770992368,
      "loss": 0.3247,
      "step": 245
    },
    {
      "epoch": 1.5822784810126582,
      "grad_norm": 0.7034026980400085,
      "learning_rate": 0.00013765903307888042,
      "loss": 0.3527,
      "step": 250
    },
    {
      "epoch": 1.6139240506329116,
      "grad_norm": 0.7514588236808777,
      "learning_rate": 0.00013638676844783716,
      "loss": 0.3487,
      "step": 255
    },
    {
      "epoch": 1.6455696202531644,
      "grad_norm": 0.7183879017829895,
      "learning_rate": 0.0001351145038167939,
      "loss": 0.3407,
      "step": 260
    },
    {
      "epoch": 1.6772151898734178,
      "grad_norm": 0.6752856969833374,
      "learning_rate": 0.00013384223918575064,
      "loss": 0.3088,
      "step": 265
    },
    {
      "epoch": 1.7088607594936709,
      "grad_norm": 0.8107082843780518,
      "learning_rate": 0.00013256997455470738,
      "loss": 0.3841,
      "step": 270
    },
    {
      "epoch": 1.740506329113924,
      "grad_norm": 0.5849813222885132,
      "learning_rate": 0.00013129770992366413,
      "loss": 0.3325,
      "step": 275
    },
    {
      "epoch": 1.7721518987341773,
      "grad_norm": 0.8018965125083923,
      "learning_rate": 0.00013002544529262087,
      "loss": 0.3649,
      "step": 280
    },
    {
      "epoch": 1.8037974683544302,
      "grad_norm": 0.8379972577095032,
      "learning_rate": 0.0001287531806615776,
      "loss": 0.3668,
      "step": 285
    },
    {
      "epoch": 1.8354430379746836,
      "grad_norm": 0.6462769508361816,
      "learning_rate": 0.00012748091603053435,
      "loss": 0.3363,
      "step": 290
    },
    {
      "epoch": 1.8670886075949367,
      "grad_norm": 0.8890714645385742,
      "learning_rate": 0.0001262086513994911,
      "loss": 0.3265,
      "step": 295
    },
    {
      "epoch": 1.8987341772151898,
      "grad_norm": 0.797147274017334,
      "learning_rate": 0.00012493638676844783,
      "loss": 0.3636,
      "step": 300
    },
    {
      "epoch": 1.9303797468354431,
      "grad_norm": 0.6804778575897217,
      "learning_rate": 0.0001236641221374046,
      "loss": 0.3442,
      "step": 305
    },
    {
      "epoch": 1.9620253164556962,
      "grad_norm": 0.6891390681266785,
      "learning_rate": 0.00012239185750636134,
      "loss": 0.3145,
      "step": 310
    },
    {
      "epoch": 1.9936708860759493,
      "grad_norm": 0.9055079817771912,
      "learning_rate": 0.00012111959287531807,
      "loss": 0.342,
      "step": 315
    },
    {
      "epoch": 2.0253164556962027,
      "grad_norm": 0.609603762626648,
      "learning_rate": 0.00011984732824427483,
      "loss": 0.2504,
      "step": 320
    },
    {
      "epoch": 2.0569620253164556,
      "grad_norm": 1.3054362535476685,
      "learning_rate": 0.00011857506361323157,
      "loss": 0.2211,
      "step": 325
    },
    {
      "epoch": 2.088607594936709,
      "grad_norm": 0.8065559267997742,
      "learning_rate": 0.0001173027989821883,
      "loss": 0.2173,
      "step": 330
    },
    {
      "epoch": 2.1202531645569622,
      "grad_norm": 0.8054972887039185,
      "learning_rate": 0.00011603053435114504,
      "loss": 0.2126,
      "step": 335
    },
    {
      "epoch": 2.151898734177215,
      "grad_norm": 0.9218589663505554,
      "learning_rate": 0.00011475826972010179,
      "loss": 0.2042,
      "step": 340
    },
    {
      "epoch": 2.1835443037974684,
      "grad_norm": 0.9257758259773254,
      "learning_rate": 0.00011348600508905853,
      "loss": 0.2102,
      "step": 345
    },
    {
      "epoch": 2.2151898734177213,
      "grad_norm": 0.9863210320472717,
      "learning_rate": 0.00011221374045801527,
      "loss": 0.219,
      "step": 350
    },
    {
      "epoch": 2.2468354430379747,
      "grad_norm": 0.8986596465110779,
      "learning_rate": 0.000110941475826972,
      "loss": 0.2145,
      "step": 355
    },
    {
      "epoch": 2.278481012658228,
      "grad_norm": 0.869886040687561,
      "learning_rate": 0.00010966921119592877,
      "loss": 0.1967,
      "step": 360
    },
    {
      "epoch": 2.310126582278481,
      "grad_norm": 1.0244789123535156,
      "learning_rate": 0.0001083969465648855,
      "loss": 0.2095,
      "step": 365
    },
    {
      "epoch": 2.3417721518987342,
      "grad_norm": 0.7236781120300293,
      "learning_rate": 0.00010712468193384224,
      "loss": 0.2123,
      "step": 370
    },
    {
      "epoch": 2.3734177215189876,
      "grad_norm": 0.7103443145751953,
      "learning_rate": 0.00010585241730279898,
      "loss": 0.2205,
      "step": 375
    },
    {
      "epoch": 2.4050632911392404,
      "grad_norm": 0.9352710247039795,
      "learning_rate": 0.00010458015267175574,
      "loss": 0.2259,
      "step": 380
    },
    {
      "epoch": 2.4367088607594938,
      "grad_norm": 0.8048036098480225,
      "learning_rate": 0.00010330788804071248,
      "loss": 0.2138,
      "step": 385
    },
    {
      "epoch": 2.4683544303797467,
      "grad_norm": 0.814346194267273,
      "learning_rate": 0.00010203562340966922,
      "loss": 0.2311,
      "step": 390
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.9042171835899353,
      "learning_rate": 0.00010076335877862595,
      "loss": 0.2278,
      "step": 395
    },
    {
      "epoch": 2.5316455696202533,
      "grad_norm": 0.7023847103118896,
      "learning_rate": 9.94910941475827e-05,
      "loss": 0.2357,
      "step": 400
    },
    {
      "epoch": 2.5632911392405062,
      "grad_norm": 0.9368842840194702,
      "learning_rate": 9.821882951653944e-05,
      "loss": 0.2182,
      "step": 405
    },
    {
      "epoch": 2.5949367088607596,
      "grad_norm": 0.9225996136665344,
      "learning_rate": 9.694656488549618e-05,
      "loss": 0.2228,
      "step": 410
    },
    {
      "epoch": 2.6265822784810124,
      "grad_norm": 0.7295313477516174,
      "learning_rate": 9.567430025445293e-05,
      "loss": 0.2143,
      "step": 415
    },
    {
      "epoch": 2.6582278481012658,
      "grad_norm": 0.9664236903190613,
      "learning_rate": 9.440203562340968e-05,
      "loss": 0.2152,
      "step": 420
    },
    {
      "epoch": 2.689873417721519,
      "grad_norm": 0.8742689490318298,
      "learning_rate": 9.312977099236642e-05,
      "loss": 0.2182,
      "step": 425
    },
    {
      "epoch": 2.721518987341772,
      "grad_norm": 0.8087453842163086,
      "learning_rate": 9.185750636132316e-05,
      "loss": 0.2184,
      "step": 430
    },
    {
      "epoch": 2.7531645569620253,
      "grad_norm": 1.062659502029419,
      "learning_rate": 9.05852417302799e-05,
      "loss": 0.2156,
      "step": 435
    },
    {
      "epoch": 2.7848101265822782,
      "grad_norm": 0.9411716461181641,
      "learning_rate": 8.931297709923665e-05,
      "loss": 0.2213,
      "step": 440
    },
    {
      "epoch": 2.8164556962025316,
      "grad_norm": 0.993147075176239,
      "learning_rate": 8.804071246819339e-05,
      "loss": 0.2127,
      "step": 445
    },
    {
      "epoch": 2.848101265822785,
      "grad_norm": 0.8353611826896667,
      "learning_rate": 8.676844783715013e-05,
      "loss": 0.2116,
      "step": 450
    },
    {
      "epoch": 2.879746835443038,
      "grad_norm": 0.9915521740913391,
      "learning_rate": 8.549618320610687e-05,
      "loss": 0.2299,
      "step": 455
    },
    {
      "epoch": 2.911392405063291,
      "grad_norm": 0.9111132621765137,
      "learning_rate": 8.422391857506363e-05,
      "loss": 0.2113,
      "step": 460
    },
    {
      "epoch": 2.9430379746835444,
      "grad_norm": 0.8857221007347107,
      "learning_rate": 8.295165394402035e-05,
      "loss": 0.2285,
      "step": 465
    },
    {
      "epoch": 2.9746835443037973,
      "grad_norm": 0.8553436398506165,
      "learning_rate": 8.167938931297711e-05,
      "loss": 0.233,
      "step": 470
    },
    {
      "epoch": 3.0063291139240507,
      "grad_norm": 0.5474989414215088,
      "learning_rate": 8.040712468193385e-05,
      "loss": 0.1938,
      "step": 475
    },
    {
      "epoch": 3.037974683544304,
      "grad_norm": 0.703250527381897,
      "learning_rate": 7.913486005089059e-05,
      "loss": 0.131,
      "step": 480
    },
    {
      "epoch": 3.069620253164557,
      "grad_norm": 1.2964314222335815,
      "learning_rate": 7.786259541984733e-05,
      "loss": 0.1256,
      "step": 485
    },
    {
      "epoch": 3.1012658227848102,
      "grad_norm": 0.7699221968650818,
      "learning_rate": 7.659033078880407e-05,
      "loss": 0.1247,
      "step": 490
    },
    {
      "epoch": 3.132911392405063,
      "grad_norm": 0.6273168325424194,
      "learning_rate": 7.531806615776081e-05,
      "loss": 0.1173,
      "step": 495
    },
    {
      "epoch": 3.1645569620253164,
      "grad_norm": 0.7778182029724121,
      "learning_rate": 7.404580152671756e-05,
      "loss": 0.1263,
      "step": 500
    },
    {
      "epoch": 3.1962025316455698,
      "grad_norm": 1.197022795677185,
      "learning_rate": 7.27735368956743e-05,
      "loss": 0.1278,
      "step": 505
    },
    {
      "epoch": 3.2278481012658227,
      "grad_norm": 0.7795239090919495,
      "learning_rate": 7.150127226463105e-05,
      "loss": 0.1253,
      "step": 510
    },
    {
      "epoch": 3.259493670886076,
      "grad_norm": 0.8459110856056213,
      "learning_rate": 7.022900763358778e-05,
      "loss": 0.1245,
      "step": 515
    },
    {
      "epoch": 3.291139240506329,
      "grad_norm": 0.6801343560218811,
      "learning_rate": 6.895674300254454e-05,
      "loss": 0.1284,
      "step": 520
    },
    {
      "epoch": 3.3227848101265822,
      "grad_norm": 1.0283461809158325,
      "learning_rate": 6.768447837150128e-05,
      "loss": 0.1289,
      "step": 525
    },
    {
      "epoch": 3.3544303797468356,
      "grad_norm": 1.1402161121368408,
      "learning_rate": 6.641221374045802e-05,
      "loss": 0.1335,
      "step": 530
    },
    {
      "epoch": 3.3860759493670884,
      "grad_norm": 0.8805460333824158,
      "learning_rate": 6.513994910941476e-05,
      "loss": 0.127,
      "step": 535
    },
    {
      "epoch": 3.4177215189873418,
      "grad_norm": 0.8641778230667114,
      "learning_rate": 6.38676844783715e-05,
      "loss": 0.1253,
      "step": 540
    },
    {
      "epoch": 3.449367088607595,
      "grad_norm": 0.9324259161949158,
      "learning_rate": 6.259541984732826e-05,
      "loss": 0.1244,
      "step": 545
    },
    {
      "epoch": 3.481012658227848,
      "grad_norm": 0.8999868035316467,
      "learning_rate": 6.132315521628498e-05,
      "loss": 0.1294,
      "step": 550
    },
    {
      "epoch": 3.5126582278481013,
      "grad_norm": 0.8019403219223022,
      "learning_rate": 6.005089058524174e-05,
      "loss": 0.1243,
      "step": 555
    },
    {
      "epoch": 3.5443037974683547,
      "grad_norm": 0.9356296062469482,
      "learning_rate": 5.877862595419847e-05,
      "loss": 0.1296,
      "step": 560
    },
    {
      "epoch": 3.5759493670886076,
      "grad_norm": 0.8532143831253052,
      "learning_rate": 5.750636132315522e-05,
      "loss": 0.124,
      "step": 565
    },
    {
      "epoch": 3.607594936708861,
      "grad_norm": 1.1260122060775757,
      "learning_rate": 5.6234096692111956e-05,
      "loss": 0.1209,
      "step": 570
    },
    {
      "epoch": 3.6392405063291138,
      "grad_norm": 0.7926989793777466,
      "learning_rate": 5.496183206106871e-05,
      "loss": 0.1265,
      "step": 575
    },
    {
      "epoch": 3.670886075949367,
      "grad_norm": 0.8992180824279785,
      "learning_rate": 5.3689567430025446e-05,
      "loss": 0.1311,
      "step": 580
    },
    {
      "epoch": 3.7025316455696204,
      "grad_norm": 0.7314108610153198,
      "learning_rate": 5.2417302798982194e-05,
      "loss": 0.1254,
      "step": 585
    },
    {
      "epoch": 3.7341772151898733,
      "grad_norm": 0.9207622408866882,
      "learning_rate": 5.114503816793893e-05,
      "loss": 0.1289,
      "step": 590
    },
    {
      "epoch": 3.7658227848101267,
      "grad_norm": 0.622431218624115,
      "learning_rate": 4.9872773536895677e-05,
      "loss": 0.1251,
      "step": 595
    },
    {
      "epoch": 3.7974683544303796,
      "grad_norm": 1.0110617876052856,
      "learning_rate": 4.860050890585242e-05,
      "loss": 0.1312,
      "step": 600
    },
    {
      "epoch": 3.829113924050633,
      "grad_norm": 0.699611246585846,
      "learning_rate": 4.7328244274809166e-05,
      "loss": 0.1263,
      "step": 605
    },
    {
      "epoch": 3.8607594936708862,
      "grad_norm": 0.7877194881439209,
      "learning_rate": 4.605597964376591e-05,
      "loss": 0.1304,
      "step": 610
    },
    {
      "epoch": 3.892405063291139,
      "grad_norm": 0.8100650906562805,
      "learning_rate": 4.478371501272265e-05,
      "loss": 0.1311,
      "step": 615
    },
    {
      "epoch": 3.9240506329113924,
      "grad_norm": 0.6674991250038147,
      "learning_rate": 4.351145038167939e-05,
      "loss": 0.1303,
      "step": 620
    },
    {
      "epoch": 3.9556962025316453,
      "grad_norm": 0.8028637170791626,
      "learning_rate": 4.223918575063613e-05,
      "loss": 0.1304,
      "step": 625
    },
    {
      "epoch": 3.9873417721518987,
      "grad_norm": 1.6102169752120972,
      "learning_rate": 4.096692111959288e-05,
      "loss": 0.125,
      "step": 630
    },
    {
      "epoch": 4.018987341772152,
      "grad_norm": 0.4470888376235962,
      "learning_rate": 3.969465648854962e-05,
      "loss": 0.106,
      "step": 635
    },
    {
      "epoch": 4.050632911392405,
      "grad_norm": 0.40415889024734497,
      "learning_rate": 3.842239185750636e-05,
      "loss": 0.0871,
      "step": 640
    },
    {
      "epoch": 4.082278481012658,
      "grad_norm": 0.48266398906707764,
      "learning_rate": 3.7150127226463104e-05,
      "loss": 0.0859,
      "step": 645
    },
    {
      "epoch": 4.113924050632911,
      "grad_norm": 0.836426854133606,
      "learning_rate": 3.5877862595419845e-05,
      "loss": 0.0883,
      "step": 650
    },
    {
      "epoch": 4.1455696202531644,
      "grad_norm": 0.6731426119804382,
      "learning_rate": 3.4605597964376594e-05,
      "loss": 0.0876,
      "step": 655
    },
    {
      "epoch": 4.177215189873418,
      "grad_norm": 0.5741623640060425,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 0.0911,
      "step": 660
    },
    {
      "epoch": 4.208860759493671,
      "grad_norm": 0.5007706880569458,
      "learning_rate": 3.2061068702290076e-05,
      "loss": 0.0882,
      "step": 665
    },
    {
      "epoch": 4.2405063291139244,
      "grad_norm": 0.5870316028594971,
      "learning_rate": 3.078880407124682e-05,
      "loss": 0.0891,
      "step": 670
    },
    {
      "epoch": 4.272151898734177,
      "grad_norm": 0.698828935623169,
      "learning_rate": 2.9516539440203562e-05,
      "loss": 0.0912,
      "step": 675
    },
    {
      "epoch": 4.30379746835443,
      "grad_norm": 0.5611512064933777,
      "learning_rate": 2.824427480916031e-05,
      "loss": 0.0881,
      "step": 680
    },
    {
      "epoch": 4.3354430379746836,
      "grad_norm": 0.9599896669387817,
      "learning_rate": 2.6972010178117052e-05,
      "loss": 0.0875,
      "step": 685
    },
    {
      "epoch": 4.367088607594937,
      "grad_norm": 0.6073245406150818,
      "learning_rate": 2.5699745547073793e-05,
      "loss": 0.0887,
      "step": 690
    },
    {
      "epoch": 4.39873417721519,
      "grad_norm": 0.6183071136474609,
      "learning_rate": 2.4427480916030535e-05,
      "loss": 0.0902,
      "step": 695
    },
    {
      "epoch": 4.430379746835443,
      "grad_norm": 0.4458979368209839,
      "learning_rate": 2.3155216284987276e-05,
      "loss": 0.088,
      "step": 700
    },
    {
      "epoch": 4.462025316455696,
      "grad_norm": 0.6202102303504944,
      "learning_rate": 2.1882951653944024e-05,
      "loss": 0.0905,
      "step": 705
    },
    {
      "epoch": 4.493670886075949,
      "grad_norm": 0.46292412281036377,
      "learning_rate": 2.0610687022900766e-05,
      "loss": 0.0895,
      "step": 710
    },
    {
      "epoch": 4.525316455696203,
      "grad_norm": 0.6506438255310059,
      "learning_rate": 1.9338422391857507e-05,
      "loss": 0.0931,
      "step": 715
    },
    {
      "epoch": 4.556962025316456,
      "grad_norm": 0.5219342112541199,
      "learning_rate": 1.8066157760814252e-05,
      "loss": 0.0916,
      "step": 720
    },
    {
      "epoch": 4.588607594936709,
      "grad_norm": 0.47599899768829346,
      "learning_rate": 1.6793893129770993e-05,
      "loss": 0.0867,
      "step": 725
    },
    {
      "epoch": 4.620253164556962,
      "grad_norm": 0.5680922865867615,
      "learning_rate": 1.5521628498727735e-05,
      "loss": 0.0878,
      "step": 730
    },
    {
      "epoch": 4.651898734177215,
      "grad_norm": 0.5268383622169495,
      "learning_rate": 1.424936386768448e-05,
      "loss": 0.0881,
      "step": 735
    },
    {
      "epoch": 4.6835443037974684,
      "grad_norm": 0.6063334345817566,
      "learning_rate": 1.2977099236641221e-05,
      "loss": 0.0904,
      "step": 740
    },
    {
      "epoch": 4.715189873417722,
      "grad_norm": 0.5388665795326233,
      "learning_rate": 1.1704834605597966e-05,
      "loss": 0.0877,
      "step": 745
    },
    {
      "epoch": 4.746835443037975,
      "grad_norm": 0.5125636458396912,
      "learning_rate": 1.0432569974554709e-05,
      "loss": 0.0927,
      "step": 750
    },
    {
      "epoch": 4.7784810126582276,
      "grad_norm": 0.5058565139770508,
      "learning_rate": 9.16030534351145e-06,
      "loss": 0.0885,
      "step": 755
    },
    {
      "epoch": 4.810126582278481,
      "grad_norm": 0.39005881547927856,
      "learning_rate": 7.888040712468193e-06,
      "loss": 0.0892,
      "step": 760
    },
    {
      "epoch": 4.841772151898734,
      "grad_norm": 0.45494306087493896,
      "learning_rate": 6.615776081424936e-06,
      "loss": 0.0926,
      "step": 765
    },
    {
      "epoch": 4.8734177215189876,
      "grad_norm": 0.5130964517593384,
      "learning_rate": 5.343511450381679e-06,
      "loss": 0.0902,
      "step": 770
    },
    {
      "epoch": 4.905063291139241,
      "grad_norm": 0.6438283324241638,
      "learning_rate": 4.0712468193384225e-06,
      "loss": 0.092,
      "step": 775
    },
    {
      "epoch": 4.936708860759493,
      "grad_norm": 0.4781509041786194,
      "learning_rate": 2.7989821882951656e-06,
      "loss": 0.0912,
      "step": 780
    },
    {
      "epoch": 4.968354430379747,
      "grad_norm": 0.42383071780204773,
      "learning_rate": 1.5267175572519084e-06,
      "loss": 0.0866,
      "step": 785
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.7937325835227966,
      "learning_rate": 2.544529262086514e-07,
      "loss": 0.0851,
      "step": 790
    }
  ],
  "logging_steps": 5,
  "max_steps": 790,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.804508938412032e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}
|
|