| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9998997292690264, | |
| "eval_steps": 500, | |
| "global_step": 831, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0012032487716835455, | |
| "grad_norm": 1.0745393392299216, | |
| "learning_rate": 1.1904761904761906e-07, | |
| "loss": 0.7396, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.006016243858417728, | |
| "grad_norm": 1.0096667298005468, | |
| "learning_rate": 5.952380952380953e-07, | |
| "loss": 0.7345, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.012032487716835455, | |
| "grad_norm": 0.5432840285599178, | |
| "learning_rate": 1.1904761904761906e-06, | |
| "loss": 0.6995, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.018048731575253184, | |
| "grad_norm": 0.5379885594965839, | |
| "learning_rate": 1.7857142857142859e-06, | |
| "loss": 0.5364, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.02406497543367091, | |
| "grad_norm": 0.28279328593216746, | |
| "learning_rate": 2.380952380952381e-06, | |
| "loss": 0.3438, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.03008121929208864, | |
| "grad_norm": 0.11670701576343243, | |
| "learning_rate": 2.9761904761904763e-06, | |
| "loss": 0.2194, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.03609746315050637, | |
| "grad_norm": 0.09598554828698612, | |
| "learning_rate": 3.5714285714285718e-06, | |
| "loss": 0.1655, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0421137070089241, | |
| "grad_norm": 0.07370159012965125, | |
| "learning_rate": 4.166666666666667e-06, | |
| "loss": 0.1433, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.04812995086734182, | |
| "grad_norm": 0.05410564768989329, | |
| "learning_rate": 4.761904761904762e-06, | |
| "loss": 0.1227, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.05414619472575955, | |
| "grad_norm": 0.049586846542590546, | |
| "learning_rate": 5.357142857142857e-06, | |
| "loss": 0.1097, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.06016243858417728, | |
| "grad_norm": 0.048103082558905504, | |
| "learning_rate": 5.9523809523809525e-06, | |
| "loss": 0.0978, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.066178682442595, | |
| "grad_norm": 0.05669271340624084, | |
| "learning_rate": 6.547619047619048e-06, | |
| "loss": 0.0817, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.07219492630101274, | |
| "grad_norm": 0.054915884901381114, | |
| "learning_rate": 7.1428571428571436e-06, | |
| "loss": 0.0801, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.07821117015943047, | |
| "grad_norm": 0.06021281263850561, | |
| "learning_rate": 7.738095238095238e-06, | |
| "loss": 0.0702, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.0842274140178482, | |
| "grad_norm": 0.039388545955733205, | |
| "learning_rate": 8.333333333333334e-06, | |
| "loss": 0.0619, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.09024365787626591, | |
| "grad_norm": 0.044736289577753, | |
| "learning_rate": 8.92857142857143e-06, | |
| "loss": 0.0556, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.09625990173468364, | |
| "grad_norm": 0.04237369775091706, | |
| "learning_rate": 9.523809523809525e-06, | |
| "loss": 0.0523, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.10227614559310137, | |
| "grad_norm": 0.04699046038689833, | |
| "learning_rate": 9.999955782120656e-06, | |
| "loss": 0.0497, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.1082923894515191, | |
| "grad_norm": 0.03964650441405405, | |
| "learning_rate": 9.99840823846134e-06, | |
| "loss": 0.0458, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.11430863330993683, | |
| "grad_norm": 0.03621563631742996, | |
| "learning_rate": 9.994650582860978e-06, | |
| "loss": 0.0477, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.12032487716835456, | |
| "grad_norm": 0.0268237826645417, | |
| "learning_rate": 9.98868447681642e-06, | |
| "loss": 0.0439, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.12634112102677228, | |
| "grad_norm": 0.028555915124659773, | |
| "learning_rate": 9.980512558319915e-06, | |
| "loss": 0.0409, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.13235736488519, | |
| "grad_norm": 0.026049496996790083, | |
| "learning_rate": 9.970138440692706e-06, | |
| "loss": 0.0414, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.13837360874360774, | |
| "grad_norm": 0.03147552348817025, | |
| "learning_rate": 9.957566710987338e-06, | |
| "loss": 0.0406, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.14438985260202547, | |
| "grad_norm": 0.027902049215008537, | |
| "learning_rate": 9.942802927959444e-06, | |
| "loss": 0.0392, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.1504060964604432, | |
| "grad_norm": 0.028174099496676555, | |
| "learning_rate": 9.925853619609858e-06, | |
| "loss": 0.0339, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.15642234031886093, | |
| "grad_norm": 0.025957912802832783, | |
| "learning_rate": 9.906726280298185e-06, | |
| "loss": 0.0365, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.16243858417727866, | |
| "grad_norm": 0.026511579347961597, | |
| "learning_rate": 9.885429367429062e-06, | |
| "loss": 0.0365, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.1684548280356964, | |
| "grad_norm": 0.02801670507212045, | |
| "learning_rate": 9.861972297712606e-06, | |
| "loss": 0.0343, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.17447107189411412, | |
| "grad_norm": 0.023525196780612226, | |
| "learning_rate": 9.836365443000697e-06, | |
| "loss": 0.0331, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.18048731575253182, | |
| "grad_norm": 0.02456235083949215, | |
| "learning_rate": 9.808620125700925e-06, | |
| "loss": 0.0335, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.18650355961094955, | |
| "grad_norm": 0.021953324848861307, | |
| "learning_rate": 9.778748613770234e-06, | |
| "loss": 0.0313, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.19251980346936728, | |
| "grad_norm": 0.02914752771577575, | |
| "learning_rate": 9.746764115290496e-06, | |
| "loss": 0.0354, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.19853604732778501, | |
| "grad_norm": 0.0241041372740356, | |
| "learning_rate": 9.712680772628365e-06, | |
| "loss": 0.0338, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.20455229118620274, | |
| "grad_norm": 0.02385726337646924, | |
| "learning_rate": 9.676513656182059e-06, | |
| "loss": 0.0343, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.21056853504462048, | |
| "grad_norm": 0.021532109435525824, | |
| "learning_rate": 9.63827875771778e-06, | |
| "loss": 0.0317, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.2165847789030382, | |
| "grad_norm": 0.02462556259506109, | |
| "learning_rate": 9.597992983298748e-06, | |
| "loss": 0.0299, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.22260102276145594, | |
| "grad_norm": 0.022316517368094545, | |
| "learning_rate": 9.55567414580995e-06, | |
| "loss": 0.0323, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.22861726661987367, | |
| "grad_norm": 0.02288293026305173, | |
| "learning_rate": 9.511340957081957e-06, | |
| "loss": 0.0307, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.2346335104782914, | |
| "grad_norm": 0.022260065504874377, | |
| "learning_rate": 9.46501301961723e-06, | |
| "loss": 0.0345, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.24064975433670913, | |
| "grad_norm": 0.021736470284370146, | |
| "learning_rate": 9.416710817922615e-06, | |
| "loss": 0.0311, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.24666599819512683, | |
| "grad_norm": 0.02550413340783063, | |
| "learning_rate": 9.366455709451857e-06, | |
| "loss": 0.0329, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.25268224205354456, | |
| "grad_norm": 0.02201788645460498, | |
| "learning_rate": 9.314269915162115e-06, | |
| "loss": 0.0309, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.2586984859119623, | |
| "grad_norm": 0.02936137915292986, | |
| "learning_rate": 9.260176509688673e-06, | |
| "loss": 0.0285, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.26471472977038, | |
| "grad_norm": 0.023538798060004095, | |
| "learning_rate": 9.204199411142196e-06, | |
| "loss": 0.0293, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.2707309736287978, | |
| "grad_norm": 0.018492280335574894, | |
| "learning_rate": 9.146363370533004e-06, | |
| "loss": 0.0281, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.2767472174872155, | |
| "grad_norm": 0.023852820186103463, | |
| "learning_rate": 9.086693960827106e-06, | |
| "loss": 0.028, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.2827634613456332, | |
| "grad_norm": 0.021207055501016953, | |
| "learning_rate": 9.025217565638766e-06, | |
| "loss": 0.0291, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.28877970520405094, | |
| "grad_norm": 0.01877956343623577, | |
| "learning_rate": 8.961961367564652e-06, | |
| "loss": 0.0282, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.29479594906246864, | |
| "grad_norm": 0.019669983451369902, | |
| "learning_rate": 8.89695333616467e-06, | |
| "loss": 0.0259, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.3008121929208864, | |
| "grad_norm": 0.026680075575420455, | |
| "learning_rate": 8.83022221559489e-06, | |
| "loss": 0.0289, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.3068284367793041, | |
| "grad_norm": 0.01856152788906122, | |
| "learning_rate": 8.761797511897907e-06, | |
| "loss": 0.0241, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.31284468063772186, | |
| "grad_norm": 0.01860271376372244, | |
| "learning_rate": 8.691709479956373e-06, | |
| "loss": 0.0272, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.31886092449613956, | |
| "grad_norm": 0.019865593703402643, | |
| "learning_rate": 8.619989110115398e-06, | |
| "loss": 0.0257, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.3248771683545573, | |
| "grad_norm": 0.02048991665947615, | |
| "learning_rate": 8.546668114479769e-06, | |
| "loss": 0.029, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.330893412212975, | |
| "grad_norm": 0.017217185351734676, | |
| "learning_rate": 8.471778912892008e-06, | |
| "loss": 0.0252, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.3369096560713928, | |
| "grad_norm": 0.01813603892384963, | |
| "learning_rate": 8.395354618597533e-06, | |
| "loss": 0.0268, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.3429258999298105, | |
| "grad_norm": 0.0238318384115611, | |
| "learning_rate": 8.31742902360319e-06, | |
| "loss": 0.0282, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.34894214378822824, | |
| "grad_norm": 0.019064179482916437, | |
| "learning_rate": 8.238036583735673e-06, | |
| "loss": 0.0271, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.35495838764664595, | |
| "grad_norm": 0.0222590787343455, | |
| "learning_rate": 8.157212403406424e-06, | |
| "loss": 0.0257, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.36097463150506365, | |
| "grad_norm": 0.01848676924848829, | |
| "learning_rate": 8.07499222008977e-06, | |
| "loss": 0.0251, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.3669908753634814, | |
| "grad_norm": 0.01993669886796683, | |
| "learning_rate": 7.991412388521108e-06, | |
| "loss": 0.0261, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.3730071192218991, | |
| "grad_norm": 0.019508670102946216, | |
| "learning_rate": 7.906509864622202e-06, | |
| "loss": 0.0258, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.37902336308031687, | |
| "grad_norm": 0.02181687850379784, | |
| "learning_rate": 7.820322189160618e-06, | |
| "loss": 0.0219, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.38503960693873457, | |
| "grad_norm": 0.020435112312527343, | |
| "learning_rate": 7.732887471150589e-06, | |
| "loss": 0.0258, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.3910558507971523, | |
| "grad_norm": 0.016792623507338397, | |
| "learning_rate": 7.644244371002619e-06, | |
| "loss": 0.0259, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.39707209465557003, | |
| "grad_norm": 0.025243263772643947, | |
| "learning_rate": 7.554432083429253e-06, | |
| "loss": 0.0239, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.4030883385139878, | |
| "grad_norm": 0.017995141016588313, | |
| "learning_rate": 7.463490320114646e-06, | |
| "loss": 0.023, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.4091045823724055, | |
| "grad_norm": 0.016947239802050783, | |
| "learning_rate": 7.371459292155501e-06, | |
| "loss": 0.0227, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.41512082623082325, | |
| "grad_norm": 0.01945334381344522, | |
| "learning_rate": 7.278379692281209e-06, | |
| "loss": 0.0236, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.42113707008924095, | |
| "grad_norm": 0.018112579661825487, | |
| "learning_rate": 7.184292676861024e-06, | |
| "loss": 0.0262, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.42715331394765865, | |
| "grad_norm": 0.01698932526214814, | |
| "learning_rate": 7.0892398477062375e-06, | |
| "loss": 0.024, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.4331695578060764, | |
| "grad_norm": 0.018750963386521588, | |
| "learning_rate": 6.99326323367538e-06, | |
| "loss": 0.0245, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.4391858016644941, | |
| "grad_norm": 0.02081900306974978, | |
| "learning_rate": 6.8964052720906175e-06, | |
| "loss": 0.026, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.44520204552291187, | |
| "grad_norm": 0.02082309821957481, | |
| "learning_rate": 6.798708789973527e-06, | |
| "loss": 0.0255, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.4512182893813296, | |
| "grad_norm": 0.023227812045822693, | |
| "learning_rate": 6.700216985108568e-06, | |
| "loss": 0.0243, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.45723453323974733, | |
| "grad_norm": 0.01688396438835524, | |
| "learning_rate": 6.600973406942617e-06, | |
| "loss": 0.0235, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.46325077709816503, | |
| "grad_norm": 0.020269716613110212, | |
| "learning_rate": 6.501021937328992e-06, | |
| "loss": 0.0215, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.4692670209565828, | |
| "grad_norm": 0.018025367141900818, | |
| "learning_rate": 6.4004067711245366e-06, | |
| "loss": 0.0221, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.4752832648150005, | |
| "grad_norm": 0.020858778838367335, | |
| "learning_rate": 6.29917239664826e-06, | |
| "loss": 0.0232, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.48129950867341825, | |
| "grad_norm": 0.022426616797026122, | |
| "learning_rate": 6.1973635760102645e-06, | |
| "loss": 0.0232, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.48731575253183596, | |
| "grad_norm": 0.02208916714792084, | |
| "learning_rate": 6.0950253253195656e-06, | |
| "loss": 0.0277, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.49333199639025366, | |
| "grad_norm": 0.020289821733336374, | |
| "learning_rate": 5.9922028947796495e-06, | |
| "loss": 0.0226, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.4993482402486714, | |
| "grad_norm": 0.022132901095444316, | |
| "learning_rate": 5.888941748680484e-06, | |
| "loss": 0.023, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.5053644841070891, | |
| "grad_norm": 0.02080443746491033, | |
| "learning_rate": 5.785287545295895e-06, | |
| "loss": 0.0214, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.5113807279655068, | |
| "grad_norm": 0.018676657765971747, | |
| "learning_rate": 5.681286116695155e-06, | |
| "loss": 0.0225, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.5173969718239246, | |
| "grad_norm": 0.020395399948370366, | |
| "learning_rate": 5.5769834484777344e-06, | |
| "loss": 0.0222, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.5234132156823423, | |
| "grad_norm": 0.01859239650955675, | |
| "learning_rate": 5.472425659440157e-06, | |
| "loss": 0.0205, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.52942945954076, | |
| "grad_norm": 0.01868449550665156, | |
| "learning_rate": 5.367658981183979e-06, | |
| "loss": 0.0232, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.5354457033991777, | |
| "grad_norm": 0.015161045404289166, | |
| "learning_rate": 5.2627297376738674e-06, | |
| "loss": 0.0203, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.5414619472575956, | |
| "grad_norm": 0.016845793697294737, | |
| "learning_rate": 5.157684324754858e-06, | |
| "loss": 0.0211, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.5474781911160133, | |
| "grad_norm": 0.0194074719508428, | |
| "learning_rate": 5.052569189637813e-06, | |
| "loss": 0.0238, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.553494434974431, | |
| "grad_norm": 0.01936893905137615, | |
| "learning_rate": 4.947430810362188e-06, | |
| "loss": 0.0216, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.5595106788328487, | |
| "grad_norm": 0.01859222507454881, | |
| "learning_rate": 4.842315675245144e-06, | |
| "loss": 0.0195, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.5655269226912664, | |
| "grad_norm": 0.022318020720072922, | |
| "learning_rate": 4.737270262326134e-06, | |
| "loss": 0.0214, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.5715431665496842, | |
| "grad_norm": 0.01657420129142331, | |
| "learning_rate": 4.632341018816023e-06, | |
| "loss": 0.0213, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.5775594104081019, | |
| "grad_norm": 0.02035658875684946, | |
| "learning_rate": 4.527574340559844e-06, | |
| "loss": 0.0226, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.5835756542665196, | |
| "grad_norm": 0.016236086655299523, | |
| "learning_rate": 4.423016551522268e-06, | |
| "loss": 0.0203, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.5895918981249373, | |
| "grad_norm": 0.017728340188220976, | |
| "learning_rate": 4.318713883304846e-06, | |
| "loss": 0.0227, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.5956081419833551, | |
| "grad_norm": 0.019076149038680715, | |
| "learning_rate": 4.214712454704107e-06, | |
| "loss": 0.0212, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.6016243858417728, | |
| "grad_norm": 0.015355569764402307, | |
| "learning_rate": 4.111058251319517e-06, | |
| "loss": 0.0201, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.6076406297001905, | |
| "grad_norm": 0.018415895096337096, | |
| "learning_rate": 4.007797105220352e-06, | |
| "loss": 0.0191, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.6136568735586082, | |
| "grad_norm": 0.017521505257648343, | |
| "learning_rate": 3.904974674680436e-06, | |
| "loss": 0.0206, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.619673117417026, | |
| "grad_norm": 0.017605011135453034, | |
| "learning_rate": 3.802636423989738e-06, | |
| "loss": 0.0212, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.6256893612754437, | |
| "grad_norm": 0.01785825295985011, | |
| "learning_rate": 3.70082760335174e-06, | |
| "loss": 0.019, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.6317056051338614, | |
| "grad_norm": 0.01652138449465044, | |
| "learning_rate": 3.5995932288754655e-06, | |
| "loss": 0.0181, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.6377218489922791, | |
| "grad_norm": 0.018648556241997245, | |
| "learning_rate": 3.4989780626710103e-06, | |
| "loss": 0.0205, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.6437380928506968, | |
| "grad_norm": 0.01827077862731449, | |
| "learning_rate": 3.3990265930573863e-06, | |
| "loss": 0.0169, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.6497543367091146, | |
| "grad_norm": 0.015591039180393907, | |
| "learning_rate": 3.2997830148914316e-06, | |
| "loss": 0.0206, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.6557705805675323, | |
| "grad_norm": 0.019792404271656995, | |
| "learning_rate": 3.2012912100264743e-06, | |
| "loss": 0.0202, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.66178682442595, | |
| "grad_norm": 0.018533196717200034, | |
| "learning_rate": 3.1035947279093846e-06, | |
| "loss": 0.021, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.6678030682843678, | |
| "grad_norm": 0.019636694754151293, | |
| "learning_rate": 3.006736766324623e-06, | |
| "loss": 0.0221, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.6738193121427856, | |
| "grad_norm": 0.014256034059804634, | |
| "learning_rate": 2.9107601522937638e-06, | |
| "loss": 0.0202, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.6798355560012033, | |
| "grad_norm": 0.016107622432284356, | |
| "learning_rate": 2.8157073231389752e-06, | |
| "loss": 0.0187, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.685851799859621, | |
| "grad_norm": 0.019189045539841333, | |
| "learning_rate": 2.721620307718793e-06, | |
| "loss": 0.0187, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.6918680437180387, | |
| "grad_norm": 0.014474735470360907, | |
| "learning_rate": 2.6285407078445015e-06, | |
| "loss": 0.021, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.6978842875764565, | |
| "grad_norm": 0.019262771622277575, | |
| "learning_rate": 2.536509679885355e-06, | |
| "loss": 0.0209, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.7039005314348742, | |
| "grad_norm": 0.01673939208310001, | |
| "learning_rate": 2.4455679165707473e-06, | |
| "loss": 0.0205, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.7099167752932919, | |
| "grad_norm": 0.014946535263918313, | |
| "learning_rate": 2.3557556289973838e-06, | |
| "loss": 0.0205, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.7159330191517096, | |
| "grad_norm": 0.01596788205464162, | |
| "learning_rate": 2.2671125288494123e-06, | |
| "loss": 0.0189, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.7219492630101273, | |
| "grad_norm": 0.016621734500822712, | |
| "learning_rate": 2.1796778108393824e-06, | |
| "loss": 0.0198, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.7279655068685451, | |
| "grad_norm": 0.018148193058810227, | |
| "learning_rate": 2.0934901353777994e-06, | |
| "loss": 0.0203, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.7339817507269628, | |
| "grad_norm": 0.016498667065988057, | |
| "learning_rate": 2.008587611478894e-06, | |
| "loss": 0.0193, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.7399979945853805, | |
| "grad_norm": 0.017876917503246166, | |
| "learning_rate": 1.9250077799102323e-06, | |
| "loss": 0.0197, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.7460142384437982, | |
| "grad_norm": 0.01737881401071936, | |
| "learning_rate": 1.842787596593576e-06, | |
| "loss": 0.02, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.752030482302216, | |
| "grad_norm": 0.01669399114670848, | |
| "learning_rate": 1.761963416264329e-06, | |
| "loss": 0.0198, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.7580467261606337, | |
| "grad_norm": 0.01963624059543007, | |
| "learning_rate": 1.6825709763968112e-06, | |
| "loss": 0.0193, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.7640629700190514, | |
| "grad_norm": 0.016744844031857532, | |
| "learning_rate": 1.6046453814024671e-06, | |
| "loss": 0.0194, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.7700792138774691, | |
| "grad_norm": 0.015338845138862633, | |
| "learning_rate": 1.5282210871079929e-06, | |
| "loss": 0.0192, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.7760954577358868, | |
| "grad_norm": 0.014792090312693856, | |
| "learning_rate": 1.453331885520234e-06, | |
| "loss": 0.0164, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.7821117015943047, | |
| "grad_norm": 0.01431108236231679, | |
| "learning_rate": 1.3800108898846022e-06, | |
| "loss": 0.0193, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.7881279454527224, | |
| "grad_norm": 0.01517595387399891, | |
| "learning_rate": 1.3082905200436291e-06, | |
| "loss": 0.0208, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.7941441893111401, | |
| "grad_norm": 0.01585759719142669, | |
| "learning_rate": 1.2382024881020937e-06, | |
| "loss": 0.019, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.8001604331695578, | |
| "grad_norm": 0.01823159719823014, | |
| "learning_rate": 1.1697777844051105e-06, | |
| "loss": 0.0198, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.8061766770279756, | |
| "grad_norm": 0.013515790000071682, | |
| "learning_rate": 1.1030466638353293e-06, | |
| "loss": 0.0187, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.8121929208863933, | |
| "grad_norm": 0.016225422506154382, | |
| "learning_rate": 1.0380386324353508e-06, | |
| "loss": 0.0175, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.818209164744811, | |
| "grad_norm": 0.018290959272687618, | |
| "learning_rate": 9.74782434361234e-07, | |
| "loss": 0.0191, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.8242254086032287, | |
| "grad_norm": 0.014122550894005206, | |
| "learning_rate": 9.133060391728965e-07, | |
| "loss": 0.0186, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.8302416524616465, | |
| "grad_norm": 0.01479861703335419, | |
| "learning_rate": 8.536366294669979e-07, | |
| "loss": 0.017, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.8362578963200642, | |
| "grad_norm": 0.01704145841348508, | |
| "learning_rate": 7.958005888578063e-07, | |
| "loss": 0.0189, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.8422741401784819, | |
| "grad_norm": 0.014330474670880548, | |
| "learning_rate": 7.398234903113266e-07, | |
| "loss": 0.0177, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.8482903840368996, | |
| "grad_norm": 0.016122466793252455, | |
| "learning_rate": 6.857300848378857e-07, | |
| "loss": 0.0186, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.8543066278953173, | |
| "grad_norm": 0.016531884356419155, | |
| "learning_rate": 6.335442905481442e-07, | |
| "loss": 0.0183, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.8603228717537351, | |
| "grad_norm": 0.017990075667932776, | |
| "learning_rate": 5.832891820773868e-07, | |
| "loss": 0.0201, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.8663391156121528, | |
| "grad_norm": 0.01787947586996767, | |
| "learning_rate": 5.349869803827717e-07, | |
| "loss": 0.0168, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.8723553594705705, | |
| "grad_norm": 0.016104542603383502, | |
| "learning_rate": 4.886590429180426e-07, | |
| "loss": 0.0179, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.8783716033289882, | |
| "grad_norm": 0.01865954701869269, | |
| "learning_rate": 4.443258541900508e-07, | |
| "loss": 0.0179, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.884387847187406, | |
| "grad_norm": 0.014772814789547404, | |
| "learning_rate": 4.020070167012541e-07, | |
| "loss": 0.0166, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.8904040910458237, | |
| "grad_norm": 0.0189883024943083, | |
| "learning_rate": 3.6172124228221914e-07, | |
| "loss": 0.0185, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.8964203349042414, | |
| "grad_norm": 0.01755215014494978, | |
| "learning_rate": 3.23486343817942e-07, | |
| "loss": 0.0174, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.9024365787626591, | |
| "grad_norm": 0.021255269979833603, | |
| "learning_rate": 2.873192273716369e-07, | |
| "loss": 0.0198, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.908452822621077, | |
| "grad_norm": 0.018993801870159418, | |
| "learning_rate": 2.532358847095051e-07, | |
| "loss": 0.0181, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.9144690664794947, | |
| "grad_norm": 0.01729608956020829, | |
| "learning_rate": 2.2125138622976494e-07, | |
| "loss": 0.019, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.9204853103379124, | |
| "grad_norm": 0.015801337315760878, | |
| "learning_rate": 1.9137987429907635e-07, | |
| "loss": 0.0168, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.9265015541963301, | |
| "grad_norm": 0.016592823438343365, | |
| "learning_rate": 1.636345569993042e-07, | |
| "loss": 0.0172, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.9325177980547478, | |
| "grad_norm": 0.013326130117293207, | |
| "learning_rate": 1.3802770228739547e-07, | |
| "loss": 0.0162, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.9385340419131656, | |
| "grad_norm": 0.017403164818775768, | |
| "learning_rate": 1.1457063257093892e-07, | |
| "loss": 0.0181, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.9445502857715833, | |
| "grad_norm": 0.01717367430718781, | |
| "learning_rate": 9.32737197018152e-08, | |
| "loss": 0.0193, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.950566529630001, | |
| "grad_norm": 0.01932159740421986, | |
| "learning_rate": 7.414638039014266e-08, | |
| "loss": 0.0189, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.9565827734884187, | |
| "grad_norm": 0.014545477047193084, | |
| "learning_rate": 5.7197072040557356e-08, | |
| "loss": 0.0175, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.9625990173468365, | |
| "grad_norm": 0.01471817818854388, | |
| "learning_rate": 4.243328901266219e-08, | |
| "loss": 0.0184, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.9686152612052542, | |
| "grad_norm": 0.016067984603921493, | |
| "learning_rate": 2.986155930729484e-08, | |
| "loss": 0.0163, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.9746315050636719, | |
| "grad_norm": 0.014988902741589531, | |
| "learning_rate": 1.9487441680084983e-08, | |
| "loss": 0.0174, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.9806477489220896, | |
| "grad_norm": 0.016141274215466517, | |
| "learning_rate": 1.1315523183581534e-08, | |
| "loss": 0.0186, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.9866639927805073, | |
| "grad_norm": 0.016494560923804934, | |
| "learning_rate": 5.349417139022816e-09, | |
| "loss": 0.0186, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.9926802366389251, | |
| "grad_norm": 0.019570420674046143, | |
| "learning_rate": 1.591761538662362e-09, | |
| "loss": 0.0203, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.9986964804973428, | |
| "grad_norm": 0.014921909514844484, | |
| "learning_rate": 4.4217879344166104e-11, | |
| "loss": 0.0162, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.9998997292690264, | |
| "step": 831, | |
| "total_flos": 4.483822414295204e+18, | |
| "train_loss": 0.043213658636630875, | |
| "train_runtime": 13633.5754, | |
| "train_samples_per_second": 2.926, | |
| "train_steps_per_second": 0.061 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 831, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.483822414295204e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |