{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.0,
  "eval_steps": 500,
  "global_step": 420,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.047619047619047616,
      "grad_norm": 1.4257669918070304,
      "learning_rate": 5.9523809523809525e-06,
      "loss": 0.5346,
      "mean_token_accuracy": 0.8659344732761383,
      "step": 5
    },
    {
      "epoch": 0.09523809523809523,
      "grad_norm": 0.7751540276308181,
      "learning_rate": 1.1904761904761905e-05,
      "loss": 0.4535,
      "mean_token_accuracy": 0.8758046269416809,
      "step": 10
    },
    {
      "epoch": 0.14285714285714285,
      "grad_norm": 0.3688156015884892,
      "learning_rate": 1.785714285714286e-05,
      "loss": 0.4071,
      "mean_token_accuracy": 0.8853001713752746,
      "step": 15
    },
    {
      "epoch": 0.19047619047619047,
      "grad_norm": 0.2571090114484444,
      "learning_rate": 2.380952380952381e-05,
      "loss": 0.3811,
      "mean_token_accuracy": 0.8898558735847473,
      "step": 20
    },
    {
      "epoch": 0.23809523809523808,
      "grad_norm": 0.2445335136114927,
      "learning_rate": 2.9761904761904762e-05,
      "loss": 0.3714,
      "mean_token_accuracy": 0.8922576308250427,
      "step": 25
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 0.25497240173448055,
      "learning_rate": 3.571428571428572e-05,
      "loss": 0.3621,
      "mean_token_accuracy": 0.8938062608242034,
      "step": 30
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.22758565353000912,
      "learning_rate": 4.166666666666667e-05,
      "loss": 0.3572,
      "mean_token_accuracy": 0.8950600028038025,
      "step": 35
    },
    {
      "epoch": 0.38095238095238093,
      "grad_norm": 0.21108211135784025,
      "learning_rate": 4.761904761904762e-05,
      "loss": 0.3463,
      "mean_token_accuracy": 0.8972749292850495,
      "step": 40
    },
    {
      "epoch": 0.42857142857142855,
      "grad_norm": 0.2207672207804811,
      "learning_rate": 4.999300659501837e-05,
      "loss": 0.3443,
      "mean_token_accuracy": 0.8975539267063141,
      "step": 45
    },
    {
      "epoch": 0.47619047619047616,
      "grad_norm": 0.21926886992248296,
      "learning_rate": 4.995028486261366e-05,
      "loss": 0.3378,
      "mean_token_accuracy": 0.899033111333847,
      "step": 50
    },
    {
      "epoch": 0.5238095238095238,
      "grad_norm": 0.224235174049935,
      "learning_rate": 4.986880029700503e-05,
      "loss": 0.3403,
      "mean_token_accuracy": 0.8979385972023011,
      "step": 55
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.21434998232271477,
      "learning_rate": 4.974869359006539e-05,
      "loss": 0.3297,
      "mean_token_accuracy": 0.9011363804340362,
      "step": 60
    },
    {
      "epoch": 0.6190476190476191,
      "grad_norm": 0.22126533026421416,
      "learning_rate": 4.959017211895173e-05,
      "loss": 0.3329,
      "mean_token_accuracy": 0.8999340653419494,
      "step": 65
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.2122647717862151,
      "learning_rate": 4.9393509588046036e-05,
      "loss": 0.3306,
      "mean_token_accuracy": 0.9009318828582764,
      "step": 70
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.20764297035844217,
      "learning_rate": 4.915904555637527e-05,
      "loss": 0.33,
      "mean_token_accuracy": 0.9004346609115601,
      "step": 75
    },
    {
      "epoch": 0.7619047619047619,
      "grad_norm": 0.20816077188373128,
      "learning_rate": 4.8887184851326155e-05,
      "loss": 0.3202,
      "mean_token_accuracy": 0.9034851253032684,
      "step": 80
    },
    {
      "epoch": 0.8095238095238095,
      "grad_norm": 0.22148078953488418,
      "learning_rate": 4.8578396869667095e-05,
      "loss": 0.3252,
      "mean_token_accuracy": 0.9017885386943817,
      "step": 85
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 0.21406123617550696,
      "learning_rate": 4.823321476708417e-05,
      "loss": 0.3207,
      "mean_token_accuracy": 0.9030977070331574,
      "step": 90
    },
    {
      "epoch": 0.9047619047619048,
      "grad_norm": 0.20063414494959075,
      "learning_rate": 4.7852234537630435e-05,
      "loss": 0.3151,
      "mean_token_accuracy": 0.9040939033031463,
      "step": 95
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 0.20157178980104304,
      "learning_rate": 4.7436113984678024e-05,
      "loss": 0.3133,
      "mean_token_accuracy": 0.9044515311717987,
      "step": 100
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.21946571056086206,
      "learning_rate": 4.6985571585149876e-05,
      "loss": 0.3088,
      "mean_token_accuracy": 0.9057298839092255,
      "step": 105
    },
    {
      "epoch": 1.0476190476190477,
      "grad_norm": 0.22243550466680892,
      "learning_rate": 4.650138524899193e-05,
      "loss": 0.2589,
      "mean_token_accuracy": 0.9184723615646362,
      "step": 110
    },
    {
      "epoch": 1.0952380952380953,
      "grad_norm": 0.21782596690450573,
      "learning_rate": 4.5984390976027946e-05,
      "loss": 0.2467,
      "mean_token_accuracy": 0.9217934429645538,
      "step": 115
    },
    {
      "epoch": 1.1428571428571428,
      "grad_norm": 0.20047942498513227,
      "learning_rate": 4.5435481412515755e-05,
      "loss": 0.2458,
      "mean_token_accuracy": 0.9213689804077149,
      "step": 120
    },
    {
      "epoch": 1.1904761904761905,
      "grad_norm": 0.20323781517968,
      "learning_rate": 4.4855604309897496e-05,
      "loss": 0.2533,
      "mean_token_accuracy": 0.9193067252635956,
      "step": 125
    },
    {
      "epoch": 1.2380952380952381,
      "grad_norm": 0.2089213813676996,
      "learning_rate": 4.4245760888404665e-05,
      "loss": 0.2451,
      "mean_token_accuracy": 0.921397477388382,
      "step": 130
    },
    {
      "epoch": 1.2857142857142856,
      "grad_norm": 0.20871576941556397,
      "learning_rate": 4.360700410834367e-05,
      "loss": 0.2487,
      "mean_token_accuracy": 0.9207526803016662,
      "step": 135
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.20056126493311283,
      "learning_rate": 4.294043685204651e-05,
      "loss": 0.2361,
      "mean_token_accuracy": 0.924414473772049,
      "step": 140
    },
    {
      "epoch": 1.380952380952381,
      "grad_norm": 0.2115275835238231,
      "learning_rate": 4.224721001962573e-05,
      "loss": 0.2409,
      "mean_token_accuracy": 0.9230122208595276,
      "step": 145
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.20541500240177765,
      "learning_rate": 4.1528520541821506e-05,
      "loss": 0.2398,
      "mean_token_accuracy": 0.9232278406620026,
      "step": 150
    },
    {
      "epoch": 1.4761904761904763,
      "grad_norm": 0.20798560704125774,
      "learning_rate": 4.078560931337187e-05,
      "loss": 0.2471,
      "mean_token_accuracy": 0.9215823113918304,
      "step": 155
    },
    {
      "epoch": 1.5238095238095237,
      "grad_norm": 0.1939929705798728,
      "learning_rate": 4.001975905047442e-05,
      "loss": 0.2401,
      "mean_token_accuracy": 0.923470401763916,
      "step": 160
    },
    {
      "epoch": 1.5714285714285714,
      "grad_norm": 0.21344717934526472,
      "learning_rate": 3.923229207603871e-05,
      "loss": 0.2396,
      "mean_token_accuracy": 0.9238272488117218,
      "step": 165
    },
    {
      "epoch": 1.619047619047619,
      "grad_norm": 0.2191275580698787,
      "learning_rate": 3.842456803655342e-05,
      "loss": 0.2406,
      "mean_token_accuracy": 0.923265916109085,
      "step": 170
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.19421469398044988,
      "learning_rate": 3.75979815545104e-05,
      "loss": 0.2359,
      "mean_token_accuracy": 0.924458909034729,
      "step": 175
    },
    {
      "epoch": 1.7142857142857144,
      "grad_norm": 0.19470949781451338,
      "learning_rate": 3.6753959820438764e-05,
      "loss": 0.2321,
      "mean_token_accuracy": 0.9256431758403778,
      "step": 180
    },
    {
      "epoch": 1.7619047619047619,
      "grad_norm": 0.1876708972166408,
      "learning_rate": 3.589396012870687e-05,
      "loss": 0.2291,
      "mean_token_accuracy": 0.9266167640686035,
      "step": 185
    },
    {
      "epoch": 1.8095238095238095,
      "grad_norm": 0.2147218447176946,
      "learning_rate": 3.5019467361346724e-05,
      "loss": 0.2329,
      "mean_token_accuracy": 0.925212299823761,
      "step": 190
    },
    {
      "epoch": 1.8571428571428572,
      "grad_norm": 0.19962238819866412,
      "learning_rate": 3.413199142424535e-05,
      "loss": 0.2296,
      "mean_token_accuracy": 0.9259015142917633,
      "step": 195
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 0.19475591470139603,
      "learning_rate": 3.3233064640129734e-05,
      "loss": 0.2277,
      "mean_token_accuracy": 0.92720667719841,
      "step": 200
    },
    {
      "epoch": 1.9523809523809523,
      "grad_norm": 0.20861501443310124,
      "learning_rate": 3.232423910284672e-05,
      "loss": 0.2284,
      "mean_token_accuracy": 0.9266561150550843,
      "step": 205
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.19761114102987562,
      "learning_rate": 3.140708399750594e-05,
      "loss": 0.2291,
      "mean_token_accuracy": 0.9258988976478577,
      "step": 210
    },
    {
      "epoch": 2.0476190476190474,
      "grad_norm": 0.24523566793415144,
      "learning_rate": 3.048318289111279e-05,
      "loss": 0.1563,
      "mean_token_accuracy": 0.9481804072856903,
      "step": 215
    },
    {
      "epoch": 2.0952380952380953,
      "grad_norm": 0.204067402376235,
      "learning_rate": 2.955413099836959e-05,
      "loss": 0.1542,
      "mean_token_accuracy": 0.9486050844192505,
      "step": 220
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 0.19593572116438304,
      "learning_rate": 2.8621532427365687e-05,
      "loss": 0.1553,
      "mean_token_accuracy": 0.9480521976947784,
      "step": 225
    },
    {
      "epoch": 2.1904761904761907,
      "grad_norm": 0.19800587405132705,
      "learning_rate": 2.7686997409912192e-05,
      "loss": 0.1566,
      "mean_token_accuracy": 0.9476682603359222,
      "step": 230
    },
    {
      "epoch": 2.238095238095238,
      "grad_norm": 0.19599017566764934,
      "learning_rate": 2.6752139521303403e-05,
      "loss": 0.1525,
      "mean_token_accuracy": 0.9489890992641449,
      "step": 235
    },
    {
      "epoch": 2.2857142857142856,
      "grad_norm": 0.1957327582988075,
      "learning_rate": 2.5818572894305453e-05,
      "loss": 0.1515,
      "mean_token_accuracy": 0.9491177141666413,
      "step": 240
    },
    {
      "epoch": 2.3333333333333335,
      "grad_norm": 0.19017281050653162,
      "learning_rate": 2.4887909432182316e-05,
      "loss": 0.1512,
      "mean_token_accuracy": 0.9494254469871521,
      "step": 245
    },
    {
      "epoch": 2.380952380952381,
      "grad_norm": 0.19356138909333898,
      "learning_rate": 2.3961756025571336e-05,
      "loss": 0.1535,
      "mean_token_accuracy": 0.9488085567951202,
      "step": 250
    },
    {
      "epoch": 2.4285714285714284,
      "grad_norm": 0.21519917588614698,
      "learning_rate": 2.304171177801356e-05,
      "loss": 0.1521,
      "mean_token_accuracy": 0.9488667666912078,
      "step": 255
    },
    {
      "epoch": 2.4761904761904763,
      "grad_norm": 0.18356462457323924,
      "learning_rate": 2.2129365244929402e-05,
      "loss": 0.1503,
      "mean_token_accuracy": 0.9497200846672058,
      "step": 260
    },
    {
      "epoch": 2.5238095238095237,
      "grad_norm": 0.19727737940368512,
      "learning_rate": 2.1226291690806715e-05,
      "loss": 0.1502,
      "mean_token_accuracy": 0.9498027801513672,
      "step": 265
    },
    {
      "epoch": 2.571428571428571,
      "grad_norm": 0.18491161543383636,
      "learning_rate": 2.0334050369337104e-05,
      "loss": 0.1493,
      "mean_token_accuracy": 0.9500907003879547,
      "step": 270
    },
    {
      "epoch": 2.619047619047619,
      "grad_norm": 0.180689936763116,
      "learning_rate": 1.945418183119656e-05,
      "loss": 0.1461,
      "mean_token_accuracy": 0.9510073781013488,
      "step": 275
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.18357579016167094,
      "learning_rate": 1.8588205264118974e-05,
      "loss": 0.1522,
      "mean_token_accuracy": 0.9489753544330597,
      "step": 280
    },
    {
      "epoch": 2.7142857142857144,
      "grad_norm": 0.17455870908089519,
      "learning_rate": 1.7737615869854944e-05,
      "loss": 0.1476,
      "mean_token_accuracy": 0.9504713833332061,
      "step": 285
    },
    {
      "epoch": 2.761904761904762,
      "grad_norm": 0.1895734172134673,
      "learning_rate": 1.6903882282545055e-05,
      "loss": 0.1471,
      "mean_token_accuracy": 0.9508952736854553,
      "step": 290
    },
    {
      "epoch": 2.8095238095238093,
      "grad_norm": 0.1814138039625087,
      "learning_rate": 1.6088444032964923e-05,
      "loss": 0.1471,
      "mean_token_accuracy": 0.9506540060043335,
      "step": 295
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.18505343239566618,
      "learning_rate": 1.5292709063020415e-05,
      "loss": 0.1453,
      "mean_token_accuracy": 0.9513262331485748,
      "step": 300
    },
    {
      "epoch": 2.9047619047619047,
      "grad_norm": 0.1818729025239862,
      "learning_rate": 1.4518051294784384e-05,
      "loss": 0.1427,
      "mean_token_accuracy": 0.9522208392620086,
      "step": 305
    },
    {
      "epoch": 2.9523809523809526,
      "grad_norm": 0.18282477277115136,
      "learning_rate": 1.3765808258272334e-05,
      "loss": 0.1457,
      "mean_token_accuracy": 0.9513007164001465,
      "step": 310
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.17250893276801252,
      "learning_rate": 1.3037278782052863e-05,
      "loss": 0.1448,
      "mean_token_accuracy": 0.9508862257003784,
      "step": 315
    },
    {
      "epoch": 3.0476190476190474,
      "grad_norm": 0.21843020721408246,
      "learning_rate": 1.2333720750680403e-05,
      "loss": 0.0972,
      "mean_token_accuracy": 0.9678010582923889,
      "step": 320
    },
    {
      "epoch": 3.0952380952380953,
      "grad_norm": 0.18713956774840773,
      "learning_rate": 1.1656348932822086e-05,
      "loss": 0.0951,
      "mean_token_accuracy": 0.9680294811725616,
      "step": 325
    },
    {
      "epoch": 3.142857142857143,
      "grad_norm": 0.17099224072656777,
      "learning_rate": 1.1006332883828913e-05,
      "loss": 0.0928,
      "mean_token_accuracy": 0.9688285231590271,
      "step": 330
    },
    {
      "epoch": 3.1904761904761907,
      "grad_norm": 0.1889152084893897,
      "learning_rate": 1.0384794926372563e-05,
      "loss": 0.0945,
      "mean_token_accuracy": 0.9682107090950012,
      "step": 335
    },
    {
      "epoch": 3.238095238095238,
      "grad_norm": 0.16909157911032757,
      "learning_rate": 9.792808212634502e-06,
      "loss": 0.0911,
      "mean_token_accuracy": 0.9692863464355469,
      "step": 340
    },
    {
      "epoch": 3.2857142857142856,
      "grad_norm": 0.16462195486055214,
      "learning_rate": 9.231394871393228e-06,
      "loss": 0.0926,
      "mean_token_accuracy": 0.9688502609729767,
      "step": 345
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.16694354195202218,
      "learning_rate": 8.701524243208935e-06,
      "loss": 0.0941,
      "mean_token_accuracy": 0.968300586938858,
      "step": 350
    },
    {
      "epoch": 3.380952380952381,
      "grad_norm": 0.1692222626696623,
      "learning_rate": 8.204111206752663e-06,
      "loss": 0.0918,
      "mean_token_accuracy": 0.9689784705638885,
      "step": 355
    },
    {
      "epoch": 3.4285714285714284,
      "grad_norm": 0.17221192973282243,
      "learning_rate": 7.740014599169857e-06,
      "loss": 0.0904,
      "mean_token_accuracy": 0.9694355607032776,
      "step": 360
    },
    {
      "epoch": 3.4761904761904763,
      "grad_norm": 0.1741810830817914,
      "learning_rate": 7.3100357332055624e-06,
      "loss": 0.0896,
      "mean_token_accuracy": 0.9696675717830658,
      "step": 365
    },
    {
      "epoch": 3.5238095238095237,
      "grad_norm": 0.17501221405018258,
      "learning_rate": 6.914917013651723e-06,
      "loss": 0.092,
      "mean_token_accuracy": 0.968971711397171,
      "step": 370
    },
    {
      "epoch": 3.571428571428571,
      "grad_norm": 0.16570291902290668,
      "learning_rate": 6.555340655505407e-06,
      "loss": 0.0917,
      "mean_token_accuracy": 0.9693442165851593,
      "step": 375
    },
    {
      "epoch": 3.619047619047619,
      "grad_norm": 0.15923829992676775,
      "learning_rate": 6.231927506051192e-06,
      "loss": 0.0915,
      "mean_token_accuracy": 0.969234949350357,
      "step": 380
    },
    {
      "epoch": 3.6666666666666665,
      "grad_norm": 0.17338149522120277,
      "learning_rate": 5.9452359729015004e-06,
      "loss": 0.0907,
      "mean_token_accuracy": 0.9692964613437652,
      "step": 385
    },
    {
      "epoch": 3.7142857142857144,
      "grad_norm": 0.17158993366626976,
      "learning_rate": 5.695761059845749e-06,
      "loss": 0.0902,
      "mean_token_accuracy": 0.9696565389633178,
      "step": 390
    },
    {
      "epoch": 3.761904761904762,
      "grad_norm": 0.17688545350709856,
      "learning_rate": 5.483933512173022e-06,
      "loss": 0.0918,
      "mean_token_accuracy": 0.9692005813121796,
      "step": 395
    },
    {
      "epoch": 3.8095238095238093,
      "grad_norm": 0.1621556558547238,
      "learning_rate": 5.310119072943991e-06,
      "loss": 0.0892,
      "mean_token_accuracy": 0.9700611054897308,
      "step": 400
    },
    {
      "epoch": 3.857142857142857,
      "grad_norm": 0.16839822104636895,
      "learning_rate": 5.174617851496128e-06,
      "loss": 0.0904,
      "mean_token_accuracy": 0.9694457828998566,
      "step": 405
    },
    {
      "epoch": 3.9047619047619047,
      "grad_norm": 0.16305557552247782,
      "learning_rate": 5.077663805272652e-06,
      "loss": 0.0902,
      "mean_token_accuracy": 0.9695923388004303,
      "step": 410
    },
    {
      "epoch": 3.9523809523809526,
      "grad_norm": 0.16067720695857812,
      "learning_rate": 5.019424335869808e-06,
      "loss": 0.0903,
      "mean_token_accuracy": 0.9695243299007416,
      "step": 415
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.1603132269708069,
      "learning_rate": 5e-06,
      "loss": 0.0903,
      "mean_token_accuracy": 0.9692174196243286,
      "step": 420
    },
    {
      "epoch": 4.0,
      "step": 420,
      "total_flos": 216761967837184.0,
      "train_loss": 0.20915592369579133,
      "train_runtime": 2440.0132,
      "train_samples_per_second": 2.749,
      "train_steps_per_second": 0.172
    }
  ],
  "logging_steps": 5,
  "max_steps": 420,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 216761967837184.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}