{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 420, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.047619047619047616, "grad_norm": 1.4257669918070304, "learning_rate": 5.9523809523809525e-06, "loss": 0.5346, "mean_token_accuracy": 0.8659344732761383, "step": 5 }, { "epoch": 0.09523809523809523, "grad_norm": 0.7751540276308181, "learning_rate": 1.1904761904761905e-05, "loss": 0.4535, "mean_token_accuracy": 0.8758046269416809, "step": 10 }, { "epoch": 0.14285714285714285, "grad_norm": 0.3688156015884892, "learning_rate": 1.785714285714286e-05, "loss": 0.4071, "mean_token_accuracy": 0.8853001713752746, "step": 15 }, { "epoch": 0.19047619047619047, "grad_norm": 0.2571090114484444, "learning_rate": 2.380952380952381e-05, "loss": 0.3811, "mean_token_accuracy": 0.8898558735847473, "step": 20 }, { "epoch": 0.23809523809523808, "grad_norm": 0.2445335136114927, "learning_rate": 2.9761904761904762e-05, "loss": 0.3714, "mean_token_accuracy": 0.8922576308250427, "step": 25 }, { "epoch": 0.2857142857142857, "grad_norm": 0.25497240173448055, "learning_rate": 3.571428571428572e-05, "loss": 0.3621, "mean_token_accuracy": 0.8938062608242034, "step": 30 }, { "epoch": 0.3333333333333333, "grad_norm": 0.22758565353000912, "learning_rate": 4.166666666666667e-05, "loss": 0.3572, "mean_token_accuracy": 0.8950600028038025, "step": 35 }, { "epoch": 0.38095238095238093, "grad_norm": 0.21108211135784025, "learning_rate": 4.761904761904762e-05, "loss": 0.3463, "mean_token_accuracy": 0.8972749292850495, "step": 40 }, { "epoch": 0.42857142857142855, "grad_norm": 0.2207672207804811, "learning_rate": 4.999300659501837e-05, "loss": 0.3443, "mean_token_accuracy": 0.8975539267063141, "step": 45 }, { "epoch": 0.47619047619047616, "grad_norm": 0.21926886992248296, "learning_rate": 4.995028486261366e-05, "loss": 0.3378, "mean_token_accuracy": 0.899033111333847, "step": 50 }, { "epoch": 0.5238095238095238, "grad_norm": 0.224235174049935, "learning_rate": 4.986880029700503e-05, "loss": 0.3403, "mean_token_accuracy": 0.8979385972023011, "step": 55 }, { "epoch": 0.5714285714285714, "grad_norm": 0.21434998232271477, "learning_rate": 4.974869359006539e-05, "loss": 0.3297, "mean_token_accuracy": 0.9011363804340362, "step": 60 }, { "epoch": 0.6190476190476191, "grad_norm": 0.22126533026421416, "learning_rate": 4.959017211895173e-05, "loss": 0.3329, "mean_token_accuracy": 0.8999340653419494, "step": 65 }, { "epoch": 0.6666666666666666, "grad_norm": 0.2122647717862151, "learning_rate": 4.9393509588046036e-05, "loss": 0.3306, "mean_token_accuracy": 0.9009318828582764, "step": 70 }, { "epoch": 0.7142857142857143, "grad_norm": 0.20764297035844217, "learning_rate": 4.915904555637527e-05, "loss": 0.33, "mean_token_accuracy": 0.9004346609115601, "step": 75 }, { "epoch": 0.7619047619047619, "grad_norm": 0.20816077188373128, "learning_rate": 4.8887184851326155e-05, "loss": 0.3202, "mean_token_accuracy": 0.9034851253032684, "step": 80 }, { "epoch": 0.8095238095238095, "grad_norm": 0.22148078953488418, "learning_rate": 4.8578396869667095e-05, "loss": 0.3252, "mean_token_accuracy": 0.9017885386943817, "step": 85 }, { "epoch": 0.8571428571428571, "grad_norm": 0.21406123617550696, "learning_rate": 4.823321476708417e-05, "loss": 0.3207, "mean_token_accuracy": 0.9030977070331574, "step": 90 }, { "epoch": 0.9047619047619048, "grad_norm": 0.20063414494959075, "learning_rate": 4.7852234537630435e-05, "loss": 0.3151, "mean_token_accuracy": 0.9040939033031463, "step": 95 }, { "epoch": 0.9523809523809523, "grad_norm": 0.20157178980104304, "learning_rate": 4.7436113984678024e-05, "loss": 0.3133, "mean_token_accuracy": 0.9044515311717987, "step": 100 }, { "epoch": 1.0, "grad_norm": 0.21946571056086206, "learning_rate": 4.6985571585149876e-05, "loss": 0.3088, "mean_token_accuracy": 0.9057298839092255, "step": 105 }, { "epoch": 1.0476190476190477, "grad_norm": 0.22243550466680892, "learning_rate": 4.650138524899193e-05, "loss": 0.2589, "mean_token_accuracy": 0.9184723615646362, "step": 110 }, { "epoch": 1.0952380952380953, "grad_norm": 0.21782596690450573, "learning_rate": 4.5984390976027946e-05, "loss": 0.2467, "mean_token_accuracy": 0.9217934429645538, "step": 115 }, { "epoch": 1.1428571428571428, "grad_norm": 0.20047942498513227, "learning_rate": 4.5435481412515755e-05, "loss": 0.2458, "mean_token_accuracy": 0.9213689804077149, "step": 120 }, { "epoch": 1.1904761904761905, "grad_norm": 0.20323781517968, "learning_rate": 4.4855604309897496e-05, "loss": 0.2533, "mean_token_accuracy": 0.9193067252635956, "step": 125 }, { "epoch": 1.2380952380952381, "grad_norm": 0.2089213813676996, "learning_rate": 4.4245760888404665e-05, "loss": 0.2451, "mean_token_accuracy": 0.921397477388382, "step": 130 }, { "epoch": 1.2857142857142856, "grad_norm": 0.20871576941556397, "learning_rate": 4.360700410834367e-05, "loss": 0.2487, "mean_token_accuracy": 0.9207526803016662, "step": 135 }, { "epoch": 1.3333333333333333, "grad_norm": 0.20056126493311283, "learning_rate": 4.294043685204651e-05, "loss": 0.2361, "mean_token_accuracy": 0.924414473772049, "step": 140 }, { "epoch": 1.380952380952381, "grad_norm": 0.2115275835238231, "learning_rate": 4.224721001962573e-05, "loss": 0.2409, "mean_token_accuracy": 0.9230122208595276, "step": 145 }, { "epoch": 1.4285714285714286, "grad_norm": 0.20541500240177765, "learning_rate": 4.1528520541821506e-05, "loss": 0.2398, "mean_token_accuracy": 0.9232278406620026, "step": 150 }, { "epoch": 1.4761904761904763, "grad_norm": 0.20798560704125774, "learning_rate": 4.078560931337187e-05, "loss": 0.2471, "mean_token_accuracy": 0.9215823113918304, "step": 155 }, { "epoch": 1.5238095238095237, "grad_norm": 0.1939929705798728, "learning_rate": 4.001975905047442e-05, "loss": 0.2401, "mean_token_accuracy": 0.923470401763916, "step": 160 }, { "epoch": 1.5714285714285714, "grad_norm": 0.21344717934526472, "learning_rate": 3.923229207603871e-05, "loss": 0.2396, "mean_token_accuracy": 0.9238272488117218, "step": 165 }, { "epoch": 1.619047619047619, "grad_norm": 0.2191275580698787, "learning_rate": 3.842456803655342e-05, "loss": 0.2406, "mean_token_accuracy": 0.923265916109085, "step": 170 }, { "epoch": 1.6666666666666665, "grad_norm": 0.19421469398044988, "learning_rate": 3.75979815545104e-05, "loss": 0.2359, "mean_token_accuracy": 0.924458909034729, "step": 175 }, { "epoch": 1.7142857142857144, "grad_norm": 0.19470949781451338, "learning_rate": 3.6753959820438764e-05, "loss": 0.2321, "mean_token_accuracy": 0.9256431758403778, "step": 180 }, { "epoch": 1.7619047619047619, "grad_norm": 0.1876708972166408, "learning_rate": 3.589396012870687e-05, "loss": 0.2291, "mean_token_accuracy": 0.9266167640686035, "step": 185 }, { "epoch": 1.8095238095238095, "grad_norm": 0.2147218447176946, "learning_rate": 3.5019467361346724e-05, "loss": 0.2329, "mean_token_accuracy": 0.925212299823761, "step": 190 }, { "epoch": 1.8571428571428572, "grad_norm": 0.19962238819866412, "learning_rate": 3.413199142424535e-05, "loss": 0.2296, "mean_token_accuracy": 0.9259015142917633, "step": 195 }, { "epoch": 1.9047619047619047, "grad_norm": 0.19475591470139603, "learning_rate": 3.3233064640129734e-05, "loss": 0.2277, "mean_token_accuracy": 0.92720667719841, "step": 200 }, { "epoch": 1.9523809523809523, "grad_norm": 0.20861501443310124, "learning_rate": 3.232423910284672e-05, "loss": 0.2284, "mean_token_accuracy": 0.9266561150550843, "step": 205 }, { "epoch": 2.0, "grad_norm": 0.19761114102987562, "learning_rate": 3.140708399750594e-05, "loss": 0.2291, "mean_token_accuracy": 0.9258988976478577, "step": 210 }, { "epoch": 2.0476190476190474, "grad_norm": 0.24523566793415144, "learning_rate": 3.048318289111279e-05, "loss": 0.1563, "mean_token_accuracy": 0.9481804072856903, "step": 215 }, { "epoch": 2.0952380952380953, "grad_norm": 0.204067402376235, "learning_rate": 2.955413099836959e-05, "loss": 0.1542, "mean_token_accuracy": 0.9486050844192505, "step": 220 }, { "epoch": 2.142857142857143, "grad_norm": 0.19593572116438304, "learning_rate": 2.8621532427365687e-05, "loss": 0.1553, "mean_token_accuracy": 0.9480521976947784, "step": 225 }, { "epoch": 2.1904761904761907, "grad_norm": 0.19800587405132705, "learning_rate": 2.7686997409912192e-05, "loss": 0.1566, "mean_token_accuracy": 0.9476682603359222, "step": 230 }, { "epoch": 2.238095238095238, "grad_norm": 0.19599017566764934, "learning_rate": 2.6752139521303403e-05, "loss": 0.1525, "mean_token_accuracy": 0.9489890992641449, "step": 235 }, { "epoch": 2.2857142857142856, "grad_norm": 0.1957327582988075, "learning_rate": 2.5818572894305453e-05, "loss": 0.1515, "mean_token_accuracy": 0.9491177141666413, "step": 240 }, { "epoch": 2.3333333333333335, "grad_norm": 0.19017281050653162, "learning_rate": 2.4887909432182316e-05, "loss": 0.1512, "mean_token_accuracy": 0.9494254469871521, "step": 245 }, { "epoch": 2.380952380952381, "grad_norm": 0.19356138909333898, "learning_rate": 2.3961756025571336e-05, "loss": 0.1535, "mean_token_accuracy": 0.9488085567951202, "step": 250 }, { "epoch": 2.4285714285714284, "grad_norm": 0.21519917588614698, "learning_rate": 2.304171177801356e-05, "loss": 0.1521, "mean_token_accuracy": 0.9488667666912078, "step": 255 }, { "epoch": 2.4761904761904763, "grad_norm": 0.18356462457323924, "learning_rate": 2.2129365244929402e-05, "loss": 0.1503, "mean_token_accuracy": 0.9497200846672058, "step": 260 }, { "epoch": 2.5238095238095237, "grad_norm": 0.19727737940368512, "learning_rate": 2.1226291690806715e-05, "loss": 0.1502, "mean_token_accuracy": 0.9498027801513672, "step": 265 }, { "epoch": 2.571428571428571, "grad_norm": 0.18491161543383636, "learning_rate": 2.0334050369337104e-05, "loss": 0.1493, "mean_token_accuracy": 0.9500907003879547, "step": 270 }, { "epoch": 2.619047619047619, "grad_norm": 0.180689936763116, "learning_rate": 1.945418183119656e-05, "loss": 0.1461, "mean_token_accuracy": 0.9510073781013488, "step": 275 }, { "epoch": 2.6666666666666665, "grad_norm": 0.18357579016167094, "learning_rate": 1.8588205264118974e-05, "loss": 0.1522, "mean_token_accuracy": 0.9489753544330597, "step": 280 }, { "epoch": 2.7142857142857144, "grad_norm": 0.17455870908089519, "learning_rate": 1.7737615869854944e-05, "loss": 0.1476, "mean_token_accuracy": 0.9504713833332061, "step": 285 }, { "epoch": 2.761904761904762, "grad_norm": 0.1895734172134673, "learning_rate": 1.6903882282545055e-05, "loss": 0.1471, "mean_token_accuracy": 0.9508952736854553, "step": 290 }, { "epoch": 2.8095238095238093, "grad_norm": 0.1814138039625087, "learning_rate": 1.6088444032964923e-05, "loss": 0.1471, "mean_token_accuracy": 0.9506540060043335, "step": 295 }, { "epoch": 2.857142857142857, "grad_norm": 0.18505343239566618, "learning_rate": 1.5292709063020415e-05, "loss": 0.1453, "mean_token_accuracy": 0.9513262331485748, "step": 300 }, { "epoch": 2.9047619047619047, "grad_norm": 0.1818729025239862, "learning_rate": 1.4518051294784384e-05, "loss": 0.1427, "mean_token_accuracy": 0.9522208392620086, "step": 305 }, { "epoch": 2.9523809523809526, "grad_norm": 0.18282477277115136, "learning_rate": 1.3765808258272334e-05, "loss": 0.1457, "mean_token_accuracy": 0.9513007164001465, "step": 310 }, { "epoch": 3.0, "grad_norm": 0.17250893276801252, "learning_rate": 1.3037278782052863e-05, "loss": 0.1448, "mean_token_accuracy": 0.9508862257003784, "step": 315 }, { "epoch": 3.0476190476190474, "grad_norm": 0.21843020721408246, "learning_rate": 1.2333720750680403e-05, "loss": 0.0972, "mean_token_accuracy": 0.9678010582923889, "step": 320 }, { "epoch": 3.0952380952380953, "grad_norm": 0.18713956774840773, "learning_rate": 1.1656348932822086e-05, "loss": 0.0951, "mean_token_accuracy": 0.9680294811725616, "step": 325 }, { "epoch": 3.142857142857143, "grad_norm": 0.17099224072656777, "learning_rate": 1.1006332883828913e-05, "loss": 0.0928, "mean_token_accuracy": 0.9688285231590271, "step": 330 }, { "epoch": 3.1904761904761907, "grad_norm": 0.1889152084893897, "learning_rate": 1.0384794926372563e-05, "loss": 0.0945, "mean_token_accuracy": 0.9682107090950012, "step": 335 }, { "epoch": 3.238095238095238, "grad_norm": 0.16909157911032757, "learning_rate": 9.792808212634502e-06, "loss": 0.0911, "mean_token_accuracy": 0.9692863464355469, "step": 340 }, { "epoch": 3.2857142857142856, "grad_norm": 0.16462195486055214, "learning_rate": 9.231394871393228e-06, "loss": 0.0926, "mean_token_accuracy": 0.9688502609729767, "step": 345 }, { "epoch": 3.3333333333333335, "grad_norm": 0.16694354195202218, "learning_rate": 8.701524243208935e-06, "loss": 0.0941, "mean_token_accuracy": 0.968300586938858, "step": 350 }, { "epoch": 3.380952380952381, "grad_norm": 0.1692222626696623, "learning_rate": 8.204111206752663e-06, "loss": 0.0918, "mean_token_accuracy": 0.9689784705638885, "step": 355 }, { "epoch": 3.4285714285714284, "grad_norm": 0.17221192973282243, "learning_rate": 7.740014599169857e-06, "loss": 0.0904, "mean_token_accuracy": 0.9694355607032776, "step": 360 }, { "epoch": 3.4761904761904763, "grad_norm": 0.1741810830817914, "learning_rate": 7.3100357332055624e-06, "loss": 0.0896, "mean_token_accuracy": 0.9696675717830658, "step": 365 }, { "epoch": 3.5238095238095237, "grad_norm": 0.17501221405018258, "learning_rate": 6.914917013651723e-06, "loss": 0.092, "mean_token_accuracy": 0.968971711397171, "step": 370 }, { "epoch": 3.571428571428571, "grad_norm": 0.16570291902290668, "learning_rate": 6.555340655505407e-06, "loss": 0.0917, "mean_token_accuracy": 0.9693442165851593, "step": 375 }, { "epoch": 3.619047619047619, "grad_norm": 0.15923829992676775, "learning_rate": 6.231927506051192e-06, "loss": 0.0915, "mean_token_accuracy": 0.969234949350357, "step": 380 }, { "epoch": 3.6666666666666665, "grad_norm": 0.17338149522120277, "learning_rate": 5.9452359729015004e-06, "loss": 0.0907, "mean_token_accuracy": 0.9692964613437652, "step": 385 }, { "epoch": 3.7142857142857144, "grad_norm": 0.17158993366626976, "learning_rate": 5.695761059845749e-06, "loss": 0.0902, "mean_token_accuracy": 0.9696565389633178, "step": 390 }, { "epoch": 3.761904761904762, "grad_norm": 0.17688545350709856, "learning_rate": 5.483933512173022e-06, "loss": 0.0918, "mean_token_accuracy": 0.9692005813121796, "step": 395 }, { "epoch": 3.8095238095238093, "grad_norm": 0.1621556558547238, "learning_rate": 5.310119072943991e-06, "loss": 0.0892, "mean_token_accuracy": 0.9700611054897308, "step": 400 }, { "epoch": 3.857142857142857, "grad_norm": 0.16839822104636895, "learning_rate": 5.174617851496128e-06, "loss": 0.0904, "mean_token_accuracy": 0.9694457828998566, "step": 405 }, { "epoch": 3.9047619047619047, "grad_norm": 0.16305557552247782, "learning_rate": 5.077663805272652e-06, "loss": 0.0902, "mean_token_accuracy": 0.9695923388004303, "step": 410 }, { "epoch": 3.9523809523809526, "grad_norm": 0.16067720695857812, "learning_rate": 5.019424335869808e-06, "loss": 0.0903, "mean_token_accuracy": 0.9695243299007416, "step": 415 }, { "epoch": 4.0, "grad_norm": 0.1603132269708069, "learning_rate": 5e-06, "loss": 0.0903, "mean_token_accuracy": 0.9692174196243286, "step": 420 }, { "epoch": 4.0, "step": 420, "total_flos": 216761967837184.0, "train_loss": 0.20915592369579133, "train_runtime": 2440.0132, "train_samples_per_second": 2.749, "train_steps_per_second": 0.172 } ], "logging_steps": 5, "max_steps": 420, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 216761967837184.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }