| { |
| "best_global_step": 3144, |
| "best_metric": 1.1658307313919067, |
| "best_model_checkpoint": "models/gemma-3-4b-sft-full/checkpoint-3144", |
| "epoch": 10.0, |
| "eval_steps": 500, |
| "global_step": 15720, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0006361323155216285, |
| "grad_norm": 31.319606519809206, |
| "learning_rate": 1.2722646310432571e-08, |
| "loss": 2.0248, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.031806615776081425, |
| "grad_norm": 13.2120799093862, |
| "learning_rate": 6.361323155216286e-07, |
| "loss": 1.9103, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.06361323155216285, |
| "grad_norm": 2.8305292907887694, |
| "learning_rate": 1.2722646310432571e-06, |
| "loss": 1.4434, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.09541984732824428, |
| "grad_norm": 2.3110831279704738, |
| "learning_rate": 1.908396946564886e-06, |
| "loss": 1.3196, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.1272264631043257, |
| "grad_norm": 2.3096762225567056, |
| "learning_rate": 2.5445292620865143e-06, |
| "loss": 1.3039, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.15903307888040713, |
| "grad_norm": 2.2782572106396306, |
| "learning_rate": 3.1806615776081427e-06, |
| "loss": 1.2618, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.19083969465648856, |
| "grad_norm": 2.2420875359580132, |
| "learning_rate": 3.816793893129772e-06, |
| "loss": 1.2501, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.22264631043256997, |
| "grad_norm": 2.0330974831105215, |
| "learning_rate": 4.4529262086514e-06, |
| "loss": 1.2541, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.2544529262086514, |
| "grad_norm": 2.1026569258639043, |
| "learning_rate": 5.0890585241730285e-06, |
| "loss": 1.2278, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.2862595419847328, |
| "grad_norm": 2.0803892623652196, |
| "learning_rate": 5.725190839694656e-06, |
| "loss": 1.2173, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.31806615776081426, |
| "grad_norm": 2.5887368846264684, |
| "learning_rate": 6.3613231552162854e-06, |
| "loss": 1.2241, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.34987277353689566, |
| "grad_norm": 1.8616362535769346, |
| "learning_rate": 6.997455470737914e-06, |
| "loss": 1.1954, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.3816793893129771, |
| "grad_norm": 2.2198054581544153, |
| "learning_rate": 7.633587786259543e-06, |
| "loss": 1.2207, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.41348600508905853, |
| "grad_norm": 1.9458297082843083, |
| "learning_rate": 8.26972010178117e-06, |
| "loss": 1.2104, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.44529262086513993, |
| "grad_norm": 1.754441697698081, |
| "learning_rate": 8.9058524173028e-06, |
| "loss": 1.1954, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.4770992366412214, |
| "grad_norm": 1.9516730885650273, |
| "learning_rate": 9.54198473282443e-06, |
| "loss": 1.1962, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.5089058524173028, |
| "grad_norm": 1.8788578410476755, |
| "learning_rate": 1.0178117048346057e-05, |
| "loss": 1.1955, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.5407124681933843, |
| "grad_norm": 1.7660250423975214, |
| "learning_rate": 1.0814249363867686e-05, |
| "loss": 1.2029, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.5725190839694656, |
| "grad_norm": 1.7319081555721738, |
| "learning_rate": 1.1450381679389312e-05, |
| "loss": 1.201, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.6043256997455471, |
| "grad_norm": 1.7433405435098388, |
| "learning_rate": 1.2086513994910942e-05, |
| "loss": 1.1945, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.6361323155216285, |
| "grad_norm": 1.6584922549922605, |
| "learning_rate": 1.2722646310432571e-05, |
| "loss": 1.188, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.6679389312977099, |
| "grad_norm": 1.694934894298546, |
| "learning_rate": 1.3358778625954198e-05, |
| "loss": 1.1853, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.6997455470737913, |
| "grad_norm": 1.8972752727827624, |
| "learning_rate": 1.3994910941475828e-05, |
| "loss": 1.1796, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.7315521628498728, |
| "grad_norm": 1.7794214888108801, |
| "learning_rate": 1.4631043256997457e-05, |
| "loss": 1.1879, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.7633587786259542, |
| "grad_norm": 1.7080167758006621, |
| "learning_rate": 1.5267175572519086e-05, |
| "loss": 1.2033, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.7951653944020356, |
| "grad_norm": 1.6732561680746716, |
| "learning_rate": 1.5903307888040712e-05, |
| "loss": 1.1729, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.8269720101781171, |
| "grad_norm": 2.0115920286242472, |
| "learning_rate": 1.653944020356234e-05, |
| "loss": 1.1798, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.8587786259541985, |
| "grad_norm": 1.5883913583214553, |
| "learning_rate": 1.717557251908397e-05, |
| "loss": 1.1761, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.8905852417302799, |
| "grad_norm": 1.5615231326127277, |
| "learning_rate": 1.78117048346056e-05, |
| "loss": 1.1807, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.9223918575063613, |
| "grad_norm": 1.6052692336601109, |
| "learning_rate": 1.844783715012723e-05, |
| "loss": 1.1872, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.9541984732824428, |
| "grad_norm": 1.6293394603925617, |
| "learning_rate": 1.908396946564886e-05, |
| "loss": 1.1821, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.9860050890585241, |
| "grad_norm": 1.9511097309507746, |
| "learning_rate": 1.9720101781170485e-05, |
| "loss": 1.193, |
| "step": 1550 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 1.1915197372436523, |
| "eval_runtime": 50.604, |
| "eval_samples_per_second": 55.45, |
| "eval_steps_per_second": 1.739, |
| "step": 1572 |
| }, |
| { |
| "epoch": 1.0178117048346056, |
| "grad_norm": 1.699246916566911, |
| "learning_rate": 1.9999806716709255e-05, |
| "loss": 1.0668, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.049618320610687, |
| "grad_norm": 1.6215378174021484, |
| "learning_rate": 1.999850011488216e-05, |
| "loss": 0.9829, |
| "step": 1650 |
| }, |
| { |
| "epoch": 1.0814249363867685, |
| "grad_norm": 1.7868804206457551, |
| "learning_rate": 1.9995961032584046e-05, |
| "loss": 0.9782, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.11323155216285, |
| "grad_norm": 1.824863693326858, |
| "learning_rate": 1.9992189782798795e-05, |
| "loss": 0.9649, |
| "step": 1750 |
| }, |
| { |
| "epoch": 1.1450381679389312, |
| "grad_norm": 1.9389315988555975, |
| "learning_rate": 1.99871868303953e-05, |
| "loss": 0.9859, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.1768447837150127, |
| "grad_norm": 1.8613552819265144, |
| "learning_rate": 1.9980952792070175e-05, |
| "loss": 0.97, |
| "step": 1850 |
| }, |
| { |
| "epoch": 1.2086513994910941, |
| "grad_norm": 1.6290767219311002, |
| "learning_rate": 1.9973488436271728e-05, |
| "loss": 0.9898, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.2404580152671756, |
| "grad_norm": 1.9280005053128177, |
| "learning_rate": 1.996479468310524e-05, |
| "loss": 0.977, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.272264631043257, |
| "grad_norm": 1.8021715712875992, |
| "learning_rate": 1.9954872604219543e-05, |
| "loss": 0.9778, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.3040712468193385, |
| "grad_norm": 1.778983300178611, |
| "learning_rate": 1.994372342267493e-05, |
| "loss": 0.9754, |
| "step": 2050 |
| }, |
| { |
| "epoch": 1.33587786259542, |
| "grad_norm": 1.6139758020504216, |
| "learning_rate": 1.993134851279238e-05, |
| "loss": 0.9768, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.3676844783715012, |
| "grad_norm": 1.6159993769878525, |
| "learning_rate": 1.991774939998417e-05, |
| "loss": 0.977, |
| "step": 2150 |
| }, |
| { |
| "epoch": 1.3994910941475827, |
| "grad_norm": 1.7346584119107982, |
| "learning_rate": 1.9902927760565824e-05, |
| "loss": 1.0021, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.4312977099236641, |
| "grad_norm": 1.6348257679838059, |
| "learning_rate": 1.988688542154948e-05, |
| "loss": 0.9911, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.4631043256997456, |
| "grad_norm": 2.005161271222442, |
| "learning_rate": 1.98696243604187e-05, |
| "loss": 0.98, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.494910941475827, |
| "grad_norm": 1.6947935478149847, |
| "learning_rate": 1.9851146704884684e-05, |
| "loss": 0.9933, |
| "step": 2350 |
| }, |
| { |
| "epoch": 1.5267175572519083, |
| "grad_norm": 1.559288613818951, |
| "learning_rate": 1.9831454732624023e-05, |
| "loss": 0.9812, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.55852417302799, |
| "grad_norm": 1.6147458399643977, |
| "learning_rate": 1.9810550870997914e-05, |
| "loss": 0.9829, |
| "step": 2450 |
| }, |
| { |
| "epoch": 1.5903307888040712, |
| "grad_norm": 1.7200525728774254, |
| "learning_rate": 1.9788437696752965e-05, |
| "loss": 0.9827, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.6221374045801527, |
| "grad_norm": 1.5679464105011003, |
| "learning_rate": 1.9765117935703556e-05, |
| "loss": 0.9918, |
| "step": 2550 |
| }, |
| { |
| "epoch": 1.6539440203562341, |
| "grad_norm": 1.5684761038610553, |
| "learning_rate": 1.9740594462395844e-05, |
| "loss": 1.0035, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.6857506361323156, |
| "grad_norm": 1.6525710526384763, |
| "learning_rate": 1.9714870299753425e-05, |
| "loss": 0.9757, |
| "step": 2650 |
| }, |
| { |
| "epoch": 1.717557251908397, |
| "grad_norm": 1.61635439544328, |
| "learning_rate": 1.9687948618704713e-05, |
| "loss": 0.9878, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.7493638676844783, |
| "grad_norm": 1.552931766301823, |
| "learning_rate": 1.9659832737792065e-05, |
| "loss": 0.9926, |
| "step": 2750 |
| }, |
| { |
| "epoch": 1.78117048346056, |
| "grad_norm": 1.7462958660917196, |
| "learning_rate": 1.963052612276272e-05, |
| "loss": 0.9923, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.8129770992366412, |
| "grad_norm": 1.541467107392074, |
| "learning_rate": 1.9600032386141578e-05, |
| "loss": 0.9883, |
| "step": 2850 |
| }, |
| { |
| "epoch": 1.8447837150127226, |
| "grad_norm": 1.60142808575721, |
| "learning_rate": 1.9568355286785916e-05, |
| "loss": 0.9848, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.876590330788804, |
| "grad_norm": 1.628212808465854, |
| "learning_rate": 1.9535498729422034e-05, |
| "loss": 0.981, |
| "step": 2950 |
| }, |
| { |
| "epoch": 1.9083969465648853, |
| "grad_norm": 1.589079219019998, |
| "learning_rate": 1.950146676416393e-05, |
| "loss": 0.9938, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.940203562340967, |
| "grad_norm": 1.5927647305457868, |
| "learning_rate": 1.9466263586014062e-05, |
| "loss": 0.9831, |
| "step": 3050 |
| }, |
| { |
| "epoch": 1.9720101781170483, |
| "grad_norm": 1.6181088935396841, |
| "learning_rate": 1.9429893534346248e-05, |
| "loss": 0.9738, |
| "step": 3100 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 1.1658307313919067, |
| "eval_runtime": 57.664, |
| "eval_samples_per_second": 48.661, |
| "eval_steps_per_second": 1.526, |
| "step": 3144 |
| }, |
| { |
| "epoch": 2.00381679389313, |
| "grad_norm": 2.8157430444252833, |
| "learning_rate": 1.9392361092370756e-05, |
| "loss": 0.9372, |
| "step": 3150 |
| }, |
| { |
| "epoch": 2.035623409669211, |
| "grad_norm": 1.8202205896718766, |
| "learning_rate": 1.9353670886581683e-05, |
| "loss": 0.6118, |
| "step": 3200 |
| }, |
| { |
| "epoch": 2.0674300254452924, |
| "grad_norm": 1.8024719083066718, |
| "learning_rate": 1.9313827686186664e-05, |
| "loss": 0.5956, |
| "step": 3250 |
| }, |
| { |
| "epoch": 2.099236641221374, |
| "grad_norm": 1.8065831151097012, |
| "learning_rate": 1.927283640251898e-05, |
| "loss": 0.615, |
| "step": 3300 |
| }, |
| { |
| "epoch": 2.1310432569974553, |
| "grad_norm": 1.93182684100521, |
| "learning_rate": 1.923070208843216e-05, |
| "loss": 0.6079, |
| "step": 3350 |
| }, |
| { |
| "epoch": 2.162849872773537, |
| "grad_norm": 1.8738788734317153, |
| "learning_rate": 1.9187429937677136e-05, |
| "loss": 0.607, |
| "step": 3400 |
| }, |
| { |
| "epoch": 2.1946564885496183, |
| "grad_norm": 1.8040300513160983, |
| "learning_rate": 1.9143025284262022e-05, |
| "loss": 0.6085, |
| "step": 3450 |
| }, |
| { |
| "epoch": 2.2264631043257, |
| "grad_norm": 1.8986773569695647, |
| "learning_rate": 1.909749360179461e-05, |
| "loss": 0.6145, |
| "step": 3500 |
| }, |
| { |
| "epoch": 2.258269720101781, |
| "grad_norm": 1.9163165829127622, |
| "learning_rate": 1.9050840502807665e-05, |
| "loss": 0.6169, |
| "step": 3550 |
| }, |
| { |
| "epoch": 2.2900763358778624, |
| "grad_norm": 2.0342511222836657, |
| "learning_rate": 1.9003071738067073e-05, |
| "loss": 0.6181, |
| "step": 3600 |
| }, |
| { |
| "epoch": 2.321882951653944, |
| "grad_norm": 1.9022311954341746, |
| "learning_rate": 1.895419319586298e-05, |
| "loss": 0.6322, |
| "step": 3650 |
| }, |
| { |
| "epoch": 2.3536895674300253, |
| "grad_norm": 1.947735727576319, |
| "learning_rate": 1.890421090128395e-05, |
| "loss": 0.6261, |
| "step": 3700 |
| }, |
| { |
| "epoch": 2.385496183206107, |
| "grad_norm": 1.8908602175645888, |
| "learning_rate": 1.8853131015474278e-05, |
| "loss": 0.6241, |
| "step": 3750 |
| }, |
| { |
| "epoch": 2.4173027989821882, |
| "grad_norm": 1.8428847331642595, |
| "learning_rate": 1.8800959834874534e-05, |
| "loss": 0.6247, |
| "step": 3800 |
| }, |
| { |
| "epoch": 2.4491094147582695, |
| "grad_norm": 1.9386784496016072, |
| "learning_rate": 1.8747703790445412e-05, |
| "loss": 0.6369, |
| "step": 3850 |
| }, |
| { |
| "epoch": 2.480916030534351, |
| "grad_norm": 1.8110474855626102, |
| "learning_rate": 1.8693369446875008e-05, |
| "loss": 0.6352, |
| "step": 3900 |
| }, |
| { |
| "epoch": 2.5127226463104324, |
| "grad_norm": 1.8744360271519491, |
| "learning_rate": 1.8637963501769625e-05, |
| "loss": 0.6402, |
| "step": 3950 |
| }, |
| { |
| "epoch": 2.544529262086514, |
| "grad_norm": 1.858724398900357, |
| "learning_rate": 1.858149278482817e-05, |
| "loss": 0.6459, |
| "step": 4000 |
| }, |
| { |
| "epoch": 2.5763358778625953, |
| "grad_norm": 1.8627524401678055, |
| "learning_rate": 1.8523964257000288e-05, |
| "loss": 0.6276, |
| "step": 4050 |
| }, |
| { |
| "epoch": 2.608142493638677, |
| "grad_norm": 1.9220180062265788, |
| "learning_rate": 1.8465385009628308e-05, |
| "loss": 0.6481, |
| "step": 4100 |
| }, |
| { |
| "epoch": 2.6399491094147582, |
| "grad_norm": 1.9319620445548449, |
| "learning_rate": 1.8405762263573108e-05, |
| "loss": 0.6344, |
| "step": 4150 |
| }, |
| { |
| "epoch": 2.67175572519084, |
| "grad_norm": 1.8442743167506148, |
| "learning_rate": 1.834510336832405e-05, |
| "loss": 0.6418, |
| "step": 4200 |
| }, |
| { |
| "epoch": 2.703562340966921, |
| "grad_norm": 1.8919128966016239, |
| "learning_rate": 1.8283415801093007e-05, |
| "loss": 0.6455, |
| "step": 4250 |
| }, |
| { |
| "epoch": 2.7353689567430024, |
| "grad_norm": 1.79572731114352, |
| "learning_rate": 1.8220707165892682e-05, |
| "loss": 0.6474, |
| "step": 4300 |
| }, |
| { |
| "epoch": 2.767175572519084, |
| "grad_norm": 1.8916208552532916, |
| "learning_rate": 1.815698519259929e-05, |
| "loss": 0.6479, |
| "step": 4350 |
| }, |
| { |
| "epoch": 2.7989821882951653, |
| "grad_norm": 1.8754600469553322, |
| "learning_rate": 1.8092257735999734e-05, |
| "loss": 0.6549, |
| "step": 4400 |
| }, |
| { |
| "epoch": 2.830788804071247, |
| "grad_norm": 1.8972086051601613, |
| "learning_rate": 1.8026532774823343e-05, |
| "loss": 0.6397, |
| "step": 4450 |
| }, |
| { |
| "epoch": 2.8625954198473282, |
| "grad_norm": 1.8335920924146587, |
| "learning_rate": 1.7959818410758395e-05, |
| "loss": 0.6379, |
| "step": 4500 |
| }, |
| { |
| "epoch": 2.8944020356234095, |
| "grad_norm": 2.010899629666033, |
| "learning_rate": 1.789212286745342e-05, |
| "loss": 0.645, |
| "step": 4550 |
| }, |
| { |
| "epoch": 2.926208651399491, |
| "grad_norm": 1.854046640562392, |
| "learning_rate": 1.7823454489503526e-05, |
| "loss": 0.6491, |
| "step": 4600 |
| }, |
| { |
| "epoch": 2.9580152671755724, |
| "grad_norm": 1.9582134927711392, |
| "learning_rate": 1.775382174142177e-05, |
| "loss": 0.6542, |
| "step": 4650 |
| }, |
| { |
| "epoch": 2.989821882951654, |
| "grad_norm": 1.851650468210065, |
| "learning_rate": 1.768323320659578e-05, |
| "loss": 0.6542, |
| "step": 4700 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 1.25302255153656, |
| "eval_runtime": 57.2963, |
| "eval_samples_per_second": 48.973, |
| "eval_steps_per_second": 1.536, |
| "step": 4716 |
| }, |
| { |
| "epoch": 3.0216284987277353, |
| "grad_norm": 1.9776148010713264, |
| "learning_rate": 1.7611697586229695e-05, |
| "loss": 0.4254, |
| "step": 4750 |
| }, |
| { |
| "epoch": 3.053435114503817, |
| "grad_norm": 2.1156326922920994, |
| "learning_rate": 1.753922369827162e-05, |
| "loss": 0.3248, |
| "step": 4800 |
| }, |
| { |
| "epoch": 3.0852417302798982, |
| "grad_norm": 1.927754824109064, |
| "learning_rate": 1.7465820476326656e-05, |
| "loss": 0.328, |
| "step": 4850 |
| }, |
| { |
| "epoch": 3.1170483460559795, |
| "grad_norm": 1.984505722526154, |
| "learning_rate": 1.7391496968555667e-05, |
| "loss": 0.3325, |
| "step": 4900 |
| }, |
| { |
| "epoch": 3.148854961832061, |
| "grad_norm": 2.070364334889199, |
| "learning_rate": 1.7316262336559978e-05, |
| "loss": 0.3348, |
| "step": 4950 |
| }, |
| { |
| "epoch": 3.1806615776081424, |
| "grad_norm": 1.9450509022734594, |
| "learning_rate": 1.7240125854252043e-05, |
| "loss": 0.3413, |
| "step": 5000 |
| }, |
| { |
| "epoch": 3.212468193384224, |
| "grad_norm": 2.0171683252659323, |
| "learning_rate": 1.7163096906712267e-05, |
| "loss": 0.3353, |
| "step": 5050 |
| }, |
| { |
| "epoch": 3.2442748091603053, |
| "grad_norm": 2.0051713529489046, |
| "learning_rate": 1.708518498903216e-05, |
| "loss": 0.3411, |
| "step": 5100 |
| }, |
| { |
| "epoch": 3.276081424936387, |
| "grad_norm": 2.0973923846500213, |
| "learning_rate": 1.7006399705143905e-05, |
| "loss": 0.3421, |
| "step": 5150 |
| }, |
| { |
| "epoch": 3.3078880407124682, |
| "grad_norm": 2.0572445637964103, |
| "learning_rate": 1.692675076663651e-05, |
| "loss": 0.338, |
| "step": 5200 |
| }, |
| { |
| "epoch": 3.3396946564885495, |
| "grad_norm": 2.1760429565142223, |
| "learning_rate": 1.6846247991558686e-05, |
| "loss": 0.3506, |
| "step": 5250 |
| }, |
| { |
| "epoch": 3.371501272264631, |
| "grad_norm": 1.9297734638867776, |
| "learning_rate": 1.6764901303208632e-05, |
| "loss": 0.344, |
| "step": 5300 |
| }, |
| { |
| "epoch": 3.4033078880407124, |
| "grad_norm": 2.051646120204668, |
| "learning_rate": 1.6682720728910815e-05, |
| "loss": 0.3531, |
| "step": 5350 |
| }, |
| { |
| "epoch": 3.435114503816794, |
| "grad_norm": 2.053007809243884, |
| "learning_rate": 1.659971639877992e-05, |
| "loss": 0.356, |
| "step": 5400 |
| }, |
| { |
| "epoch": 3.4669211195928753, |
| "grad_norm": 2.113810517440616, |
| "learning_rate": 1.6515898544472172e-05, |
| "loss": 0.3544, |
| "step": 5450 |
| }, |
| { |
| "epoch": 3.4987277353689565, |
| "grad_norm": 2.0372048460483207, |
| "learning_rate": 1.6431277497924093e-05, |
| "loss": 0.3461, |
| "step": 5500 |
| }, |
| { |
| "epoch": 3.530534351145038, |
| "grad_norm": 2.0430461667046753, |
| "learning_rate": 1.6345863690078942e-05, |
| "loss": 0.3527, |
| "step": 5550 |
| }, |
| { |
| "epoch": 3.5623409669211195, |
| "grad_norm": 2.0399250101984485, |
| "learning_rate": 1.6259667649600907e-05, |
| "loss": 0.3584, |
| "step": 5600 |
| }, |
| { |
| "epoch": 3.594147582697201, |
| "grad_norm": 2.1539893126413165, |
| "learning_rate": 1.6172700001577286e-05, |
| "loss": 0.3599, |
| "step": 5650 |
| }, |
| { |
| "epoch": 3.6259541984732824, |
| "grad_norm": 2.158545963475489, |
| "learning_rate": 1.6084971466208764e-05, |
| "loss": 0.3639, |
| "step": 5700 |
| }, |
| { |
| "epoch": 3.6577608142493636, |
| "grad_norm": 2.133349813061679, |
| "learning_rate": 1.599649285748798e-05, |
| "loss": 0.3604, |
| "step": 5750 |
| }, |
| { |
| "epoch": 3.6895674300254453, |
| "grad_norm": 2.1126907045423136, |
| "learning_rate": 1.5907275081866504e-05, |
| "loss": 0.3572, |
| "step": 5800 |
| }, |
| { |
| "epoch": 3.721374045801527, |
| "grad_norm": 2.1536592654492006, |
| "learning_rate": 1.5817329136910463e-05, |
| "loss": 0.3597, |
| "step": 5850 |
| }, |
| { |
| "epoch": 3.753180661577608, |
| "grad_norm": 1.9820017146238096, |
| "learning_rate": 1.5726666109944887e-05, |
| "loss": 0.366, |
| "step": 5900 |
| }, |
| { |
| "epoch": 3.7849872773536894, |
| "grad_norm": 2.0394074001285167, |
| "learning_rate": 1.563529717668702e-05, |
| "loss": 0.3586, |
| "step": 5950 |
| }, |
| { |
| "epoch": 3.816793893129771, |
| "grad_norm": 2.0797434739007548, |
| "learning_rate": 1.5543233599868744e-05, |
| "loss": 0.3611, |
| "step": 6000 |
| }, |
| { |
| "epoch": 3.8486005089058524, |
| "grad_norm": 1.9847509894940687, |
| "learning_rate": 1.5450486727848217e-05, |
| "loss": 0.3682, |
| "step": 6050 |
| }, |
| { |
| "epoch": 3.880407124681934, |
| "grad_norm": 2.0762467778933806, |
| "learning_rate": 1.535706799321106e-05, |
| "loss": 0.367, |
| "step": 6100 |
| }, |
| { |
| "epoch": 3.9122137404580153, |
| "grad_norm": 2.08727307746701, |
| "learning_rate": 1.526298891136105e-05, |
| "loss": 0.3661, |
| "step": 6150 |
| }, |
| { |
| "epoch": 3.9440203562340965, |
| "grad_norm": 2.2966942638832544, |
| "learning_rate": 1.5168261079100695e-05, |
| "loss": 0.362, |
| "step": 6200 |
| }, |
| { |
| "epoch": 3.975826972010178, |
| "grad_norm": 2.052981094095626, |
| "learning_rate": 1.5072896173201697e-05, |
| "loss": 0.3692, |
| "step": 6250 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 1.4977455139160156, |
| "eval_runtime": 51.4362, |
| "eval_samples_per_second": 54.553, |
| "eval_steps_per_second": 1.711, |
| "step": 6288 |
| }, |
| { |
| "epoch": 4.00763358778626, |
| "grad_norm": 2.107958347254252, |
| "learning_rate": 1.4976905948965637e-05, |
| "loss": 0.3142, |
| "step": 6300 |
| }, |
| { |
| "epoch": 4.039440203562341, |
| "grad_norm": 1.9505610066311394, |
| "learning_rate": 1.4880302238774911e-05, |
| "loss": 0.1694, |
| "step": 6350 |
| }, |
| { |
| "epoch": 4.071246819338422, |
| "grad_norm": 1.8629359372754337, |
| "learning_rate": 1.4783096950634211e-05, |
| "loss": 0.1727, |
| "step": 6400 |
| }, |
| { |
| "epoch": 4.103053435114504, |
| "grad_norm": 1.8272931243832953, |
| "learning_rate": 1.468530206670265e-05, |
| "loss": 0.1707, |
| "step": 6450 |
| }, |
| { |
| "epoch": 4.134860050890585, |
| "grad_norm": 1.982558356194926, |
| "learning_rate": 1.4586929641816783e-05, |
| "loss": 0.1757, |
| "step": 6500 |
| }, |
| { |
| "epoch": 4.166666666666667, |
| "grad_norm": 1.7448326286779277, |
| "learning_rate": 1.4487991802004625e-05, |
| "loss": 0.1777, |
| "step": 6550 |
| }, |
| { |
| "epoch": 4.198473282442748, |
| "grad_norm": 1.9280680517187911, |
| "learning_rate": 1.4388500742990934e-05, |
| "loss": 0.1785, |
| "step": 6600 |
| }, |
| { |
| "epoch": 4.230279898218829, |
| "grad_norm": 1.9089795547388508, |
| "learning_rate": 1.4288468728693889e-05, |
| "loss": 0.181, |
| "step": 6650 |
| }, |
| { |
| "epoch": 4.262086513994911, |
| "grad_norm": 2.0037836680476566, |
| "learning_rate": 1.4187908089713348e-05, |
| "loss": 0.1823, |
| "step": 6700 |
| }, |
| { |
| "epoch": 4.293893129770993, |
| "grad_norm": 1.8739930764604456, |
| "learning_rate": 1.4086831221810897e-05, |
| "loss": 0.1812, |
| "step": 6750 |
| }, |
| { |
| "epoch": 4.325699745547074, |
| "grad_norm": 1.9039560558559352, |
| "learning_rate": 1.3985250584381884e-05, |
| "loss": 0.1848, |
| "step": 6800 |
| }, |
| { |
| "epoch": 4.357506361323155, |
| "grad_norm": 1.979148934201314, |
| "learning_rate": 1.3883178698919578e-05, |
| "loss": 0.183, |
| "step": 6850 |
| }, |
| { |
| "epoch": 4.3893129770992365, |
| "grad_norm": 2.037387611365519, |
| "learning_rate": 1.378062814747168e-05, |
| "loss": 0.1858, |
| "step": 6900 |
| }, |
| { |
| "epoch": 4.421119592875318, |
| "grad_norm": 2.081816352916778, |
| "learning_rate": 1.3677611571089406e-05, |
| "loss": 0.1889, |
| "step": 6950 |
| }, |
| { |
| "epoch": 4.4529262086514, |
| "grad_norm": 2.0722431553233993, |
| "learning_rate": 1.3574141668269235e-05, |
| "loss": 0.1876, |
| "step": 7000 |
| }, |
| { |
| "epoch": 4.484732824427481, |
| "grad_norm": 1.893880597719728, |
| "learning_rate": 1.3470231193387639e-05, |
| "loss": 0.1868, |
| "step": 7050 |
| }, |
| { |
| "epoch": 4.516539440203562, |
| "grad_norm": 2.136980665093255, |
| "learning_rate": 1.3365892955128876e-05, |
| "loss": 0.1925, |
| "step": 7100 |
| }, |
| { |
| "epoch": 4.548346055979644, |
| "grad_norm": 2.2126003717690144, |
| "learning_rate": 1.326113981490611e-05, |
| "loss": 0.1868, |
| "step": 7150 |
| }, |
| { |
| "epoch": 4.580152671755725, |
| "grad_norm": 2.295568251011161, |
| "learning_rate": 1.315598468527604e-05, |
| "loss": 0.1855, |
| "step": 7200 |
| }, |
| { |
| "epoch": 4.611959287531807, |
| "grad_norm": 1.886004237984628, |
| "learning_rate": 1.30504405283472e-05, |
| "loss": 0.1891, |
| "step": 7250 |
| }, |
| { |
| "epoch": 4.643765903307888, |
| "grad_norm": 2.0731100810122065, |
| "learning_rate": 1.294452035418218e-05, |
| "loss": 0.1901, |
| "step": 7300 |
| }, |
| { |
| "epoch": 4.675572519083969, |
| "grad_norm": 1.8758598657873944, |
| "learning_rate": 1.2838237219193897e-05, |
| "loss": 0.1898, |
| "step": 7350 |
| }, |
| { |
| "epoch": 4.707379134860051, |
| "grad_norm": 1.9515874051409456, |
| "learning_rate": 1.2731604224536208e-05, |
| "loss": 0.1895, |
| "step": 7400 |
| }, |
| { |
| "epoch": 4.739185750636132, |
| "grad_norm": 1.972736566907763, |
| "learning_rate": 1.262463451448895e-05, |
| "loss": 0.1888, |
| "step": 7450 |
| }, |
| { |
| "epoch": 4.770992366412214, |
| "grad_norm": 1.9482783400297305, |
| "learning_rate": 1.2517341274837702e-05, |
| "loss": 0.1931, |
| "step": 7500 |
| }, |
| { |
| "epoch": 4.802798982188295, |
| "grad_norm": 1.8471591343739595, |
| "learning_rate": 1.2409737731248418e-05, |
| "loss": 0.1903, |
| "step": 7550 |
| }, |
| { |
| "epoch": 4.8346055979643765, |
| "grad_norm": 2.025410052096641, |
| "learning_rate": 1.2301837147637137e-05, |
| "loss": 0.1926, |
| "step": 7600 |
| }, |
| { |
| "epoch": 4.866412213740458, |
| "grad_norm": 1.8904775058623584, |
| "learning_rate": 1.2193652824535e-05, |
| "loss": 0.1909, |
| "step": 7650 |
| }, |
| { |
| "epoch": 4.898218829516539, |
| "grad_norm": 2.0404591880691405, |
| "learning_rate": 1.2085198097448732e-05, |
| "loss": 0.1909, |
| "step": 7700 |
| }, |
| { |
| "epoch": 4.930025445292621, |
| "grad_norm": 1.9452174397216788, |
| "learning_rate": 1.197648633521681e-05, |
| "loss": 0.19, |
| "step": 7750 |
| }, |
| { |
| "epoch": 4.961832061068702, |
| "grad_norm": 2.098087417526516, |
| "learning_rate": 1.1867530938361557e-05, |
| "loss": 0.1953, |
| "step": 7800 |
| }, |
| { |
| "epoch": 4.993638676844784, |
| "grad_norm": 2.028699161436086, |
| "learning_rate": 1.1758345337437284e-05, |
| "loss": 0.1932, |
| "step": 7850 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 1.913898229598999, |
| "eval_runtime": 54.0702, |
| "eval_samples_per_second": 51.896, |
| "eval_steps_per_second": 1.628, |
| "step": 7860 |
| }, |
| { |
| "epoch": 5.025445292620865, |
| "grad_norm": 1.4793683076824136, |
| "learning_rate": 1.164894299137476e-05, |
| "loss": 0.1166, |
| "step": 7900 |
| }, |
| { |
| "epoch": 5.057251908396947, |
| "grad_norm": 1.5651227153942533, |
| "learning_rate": 1.1539337385822179e-05, |
| "loss": 0.1006, |
| "step": 7950 |
| }, |
| { |
| "epoch": 5.089058524173028, |
| "grad_norm": 1.7359295760816373, |
| "learning_rate": 1.1429542031482828e-05, |
| "loss": 0.1037, |
| "step": 8000 |
| }, |
| { |
| "epoch": 5.120865139949109, |
| "grad_norm": 1.5516562484547498, |
| "learning_rate": 1.1319570462449664e-05, |
| "loss": 0.1073, |
| "step": 8050 |
| }, |
| { |
| "epoch": 5.152671755725191, |
| "grad_norm": 1.7147421352890893, |
| "learning_rate": 1.120943623453703e-05, |
| "loss": 0.1048, |
| "step": 8100 |
| }, |
| { |
| "epoch": 5.184478371501272, |
| "grad_norm": 1.588020821919146, |
| "learning_rate": 1.1099152923609654e-05, |
| "loss": 0.1046, |
| "step": 8150 |
| }, |
| { |
| "epoch": 5.216284987277354, |
| "grad_norm": 1.4744529210130621, |
| "learning_rate": 1.0988734123909218e-05, |
| "loss": 0.1053, |
| "step": 8200 |
| }, |
| { |
| "epoch": 5.248091603053435, |
| "grad_norm": 1.5149467562017078, |
| "learning_rate": 1.0878193446378633e-05, |
| "loss": 0.1067, |
| "step": 8250 |
| }, |
| { |
| "epoch": 5.2798982188295165, |
| "grad_norm": 1.641740858676932, |
| "learning_rate": 1.076754451698427e-05, |
| "loss": 0.1052, |
| "step": 8300 |
| }, |
| { |
| "epoch": 5.311704834605598, |
| "grad_norm": 1.9805138911324567, |
| "learning_rate": 1.0656800975036328e-05, |
| "loss": 0.1073, |
| "step": 8350 |
| }, |
| { |
| "epoch": 5.34351145038168, |
| "grad_norm": 1.538007502423292, |
| "learning_rate": 1.0545976471507573e-05, |
| "loss": 0.1092, |
| "step": 8400 |
| }, |
| { |
| "epoch": 5.375318066157761, |
| "grad_norm": 1.5971589586731525, |
| "learning_rate": 1.0435084667350619e-05, |
| "loss": 0.1075, |
| "step": 8450 |
| }, |
| { |
| "epoch": 5.407124681933842, |
| "grad_norm": 1.690135861247903, |
| "learning_rate": 1.0324139231813997e-05, |
| "loss": 0.1074, |
| "step": 8500 |
| }, |
| { |
| "epoch": 5.438931297709924, |
| "grad_norm": 1.6897524323080446, |
| "learning_rate": 1.0213153840757198e-05, |
| "loss": 0.1099, |
| "step": 8550 |
| }, |
| { |
| "epoch": 5.470737913486005, |
| "grad_norm": 1.6020111115696878, |
| "learning_rate": 1.0102142174964883e-05, |
| "loss": 0.1089, |
| "step": 8600 |
| }, |
| { |
| "epoch": 5.502544529262087, |
| "grad_norm": 1.721827372559917, |
| "learning_rate": 9.991117918460518e-06, |
| "loss": 0.1085, |
| "step": 8650 |
| }, |
| { |
| "epoch": 5.534351145038168, |
| "grad_norm": 1.5374549183894768, |
| "learning_rate": 9.880094756819572e-06, |
| "loss": 0.1088, |
| "step": 8700 |
| }, |
| { |
| "epoch": 5.566157760814249, |
| "grad_norm": 1.63155554721324, |
| "learning_rate": 9.769086375482561e-06, |
| "loss": 0.1095, |
| "step": 8750 |
| }, |
| { |
| "epoch": 5.597964376590331, |
| "grad_norm": 1.6722391499109936, |
| "learning_rate": 9.658106458068086e-06, |
| "loss": 0.1097, |
| "step": 8800 |
| }, |
| { |
| "epoch": 5.629770992366412, |
| "grad_norm": 1.6914082156675518, |
| "learning_rate": 9.547168684686088e-06, |
| "loss": 0.1092, |
| "step": 8850 |
| }, |
| { |
| "epoch": 5.661577608142494, |
| "grad_norm": 1.6060594589970834, |
| "learning_rate": 9.436286730251568e-06, |
| "loss": 0.1109, |
| "step": 8900 |
| }, |
| { |
| "epoch": 5.693384223918575, |
| "grad_norm": 1.6457178552761271, |
| "learning_rate": 9.32547426279892e-06, |
| "loss": 0.1101, |
| "step": 8950 |
| }, |
| { |
| "epoch": 5.7251908396946565, |
| "grad_norm": 1.6736941545247992, |
| "learning_rate": 9.214744941797115e-06, |
| "loss": 0.1087, |
| "step": 9000 |
| }, |
| { |
| "epoch": 5.756997455470738, |
| "grad_norm": 1.6655682373835654, |
| "learning_rate": 9.104112416465949e-06, |
| "loss": 0.1072, |
| "step": 9050 |
| }, |
| { |
| "epoch": 5.788804071246819, |
| "grad_norm": 1.716312631900298, |
| "learning_rate": 8.993590324093548e-06, |
| "loss": 0.1096, |
| "step": 9100 |
| }, |
| { |
| "epoch": 5.820610687022901, |
| "grad_norm": 1.6302159083496506, |
| "learning_rate": 8.883192288355362e-06, |
| "loss": 0.1093, |
| "step": 9150 |
| }, |
| { |
| "epoch": 5.852417302798982, |
| "grad_norm": 1.6265328321818522, |
| "learning_rate": 8.772931917634792e-06, |
| "loss": 0.1101, |
| "step": 9200 |
| }, |
| { |
| "epoch": 5.8842239185750635, |
| "grad_norm": 1.611412246032975, |
| "learning_rate": 8.662822803345762e-06, |
| "loss": 0.1082, |
| "step": 9250 |
| }, |
| { |
| "epoch": 5.916030534351145, |
| "grad_norm": 1.523038058167952, |
| "learning_rate": 8.552878518257335e-06, |
| "loss": 0.1098, |
| "step": 9300 |
| }, |
| { |
| "epoch": 5.947837150127226, |
| "grad_norm": 1.652523299337976, |
| "learning_rate": 8.44311261482065e-06, |
| "loss": 0.1093, |
| "step": 9350 |
| }, |
| { |
| "epoch": 5.979643765903308, |
| "grad_norm": 1.6687963754361035, |
| "learning_rate": 8.333538623498357e-06, |
| "loss": 0.1083, |
| "step": 9400 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 2.39013671875, |
| "eval_runtime": 57.4954, |
| "eval_samples_per_second": 48.804, |
| "eval_steps_per_second": 1.531, |
| "step": 9432 |
| }, |
| { |
| "epoch": 6.011450381679389, |
| "grad_norm": 1.1640024621937262, |
| "learning_rate": 8.224170051096769e-06, |
| "loss": 0.0926, |
| "step": 9450 |
| }, |
| { |
| "epoch": 6.043256997455471, |
| "grad_norm": 1.0821873479243533, |
| "learning_rate": 8.115020379100913e-06, |
| "loss": 0.0686, |
| "step": 9500 |
| }, |
| { |
| "epoch": 6.075063613231552, |
| "grad_norm": 1.0429744336325093, |
| "learning_rate": 8.006103062012725e-06, |
| "loss": 0.0709, |
| "step": 9550 |
| }, |
| { |
| "epoch": 6.106870229007634, |
| "grad_norm": 1.0982634834958693, |
| "learning_rate": 7.897431525692557e-06, |
| "loss": 0.0699, |
| "step": 9600 |
| }, |
| { |
| "epoch": 6.138676844783715, |
| "grad_norm": 1.1260866233749833, |
| "learning_rate": 7.789019165704218e-06, |
| "loss": 0.0721, |
| "step": 9650 |
| }, |
| { |
| "epoch": 6.1704834605597965, |
| "grad_norm": 1.2094196230310894, |
| "learning_rate": 7.680879345663745e-06, |
| "loss": 0.0717, |
| "step": 9700 |
| }, |
| { |
| "epoch": 6.202290076335878, |
| "grad_norm": 1.1634423083927459, |
| "learning_rate": 7.573025395592125e-06, |
| "loss": 0.0715, |
| "step": 9750 |
| }, |
| { |
| "epoch": 6.234096692111959, |
| "grad_norm": 1.2985863539363118, |
| "learning_rate": 7.4654706102721405e-06, |
| "loss": 0.0728, |
| "step": 9800 |
| }, |
| { |
| "epoch": 6.265903307888041, |
| "grad_norm": 1.2070636203209166, |
| "learning_rate": 7.358228247609569e-06, |
| "loss": 0.0733, |
| "step": 9850 |
| }, |
| { |
| "epoch": 6.297709923664122, |
| "grad_norm": 1.160714182767047, |
| "learning_rate": 7.251311526998934e-06, |
| "loss": 0.0721, |
| "step": 9900 |
| }, |
| { |
| "epoch": 6.3295165394402035, |
| "grad_norm": 1.2885727319815767, |
| "learning_rate": 7.1447336276939915e-06, |
| "loss": 0.073, |
| "step": 9950 |
| }, |
| { |
| "epoch": 6.361323155216285, |
| "grad_norm": 1.1465122232062648, |
| "learning_rate": 7.038507687183167e-06, |
| "loss": 0.0718, |
| "step": 10000 |
| }, |
| { |
| "epoch": 6.393129770992366, |
| "grad_norm": 1.2099805716340462, |
| "learning_rate": 6.932646799570144e-06, |
| "loss": 0.0744, |
| "step": 10050 |
| }, |
| { |
| "epoch": 6.424936386768448, |
| "grad_norm": 1.1587707031129502, |
| "learning_rate": 6.827164013959805e-06, |
| "loss": 0.0725, |
| "step": 10100 |
| }, |
| { |
| "epoch": 6.456743002544529, |
| "grad_norm": 1.182892310121918, |
| "learning_rate": 6.722072332849697e-06, |
| "loss": 0.0735, |
| "step": 10150 |
| }, |
| { |
| "epoch": 6.488549618320611, |
| "grad_norm": 1.2622584554849223, |
| "learning_rate": 6.617384710527282e-06, |
| "loss": 0.0733, |
| "step": 10200 |
| }, |
| { |
| "epoch": 6.520356234096692, |
| "grad_norm": 1.3226325470322322, |
| "learning_rate": 6.513114051473094e-06, |
| "loss": 0.0742, |
| "step": 10250 |
| }, |
| { |
| "epoch": 6.552162849872774, |
| "grad_norm": 1.3036627900457751, |
| "learning_rate": 6.409273208770039e-06, |
| "loss": 0.0733, |
| "step": 10300 |
| }, |
| { |
| "epoch": 6.583969465648855, |
| "grad_norm": 1.2326022863178456, |
| "learning_rate": 6.305874982519064e-06, |
| "loss": 0.0748, |
| "step": 10350 |
| }, |
| { |
| "epoch": 6.6157760814249365, |
| "grad_norm": 1.1888251941386172, |
| "learning_rate": 6.202932118261309e-06, |
| "loss": 0.0735, |
| "step": 10400 |
| }, |
| { |
| "epoch": 6.647582697201018, |
| "grad_norm": 1.2979512300254676, |
| "learning_rate": 6.100457305407024e-06, |
| "loss": 0.0742, |
| "step": 10450 |
| }, |
| { |
| "epoch": 6.679389312977099, |
| "grad_norm": 1.2201598913375877, |
| "learning_rate": 5.998463175671382e-06, |
| "loss": 0.0731, |
| "step": 10500 |
| }, |
| { |
| "epoch": 6.711195928753181, |
| "grad_norm": 1.074538323760813, |
| "learning_rate": 5.896962301517415e-06, |
| "loss": 0.072, |
| "step": 10550 |
| }, |
| { |
| "epoch": 6.743002544529262, |
| "grad_norm": 1.154410840799108, |
| "learning_rate": 5.795967194606249e-06, |
| "loss": 0.0733, |
| "step": 10600 |
| }, |
| { |
| "epoch": 6.7748091603053435, |
| "grad_norm": 1.1746045784185184, |
| "learning_rate": 5.695490304254825e-06, |
| "loss": 0.0722, |
| "step": 10650 |
| }, |
| { |
| "epoch": 6.806615776081425, |
| "grad_norm": 1.1366587055886472, |
| "learning_rate": 5.59554401590134e-06, |
| "loss": 0.072, |
| "step": 10700 |
| }, |
| { |
| "epoch": 6.838422391857506, |
| "grad_norm": 1.1414915705802602, |
| "learning_rate": 5.496140649578507e-06, |
| "loss": 0.073, |
| "step": 10750 |
| }, |
| { |
| "epoch": 6.870229007633588, |
| "grad_norm": 1.0658028472882173, |
| "learning_rate": 5.397292458394923e-06, |
| "loss": 0.0711, |
| "step": 10800 |
| }, |
| { |
| "epoch": 6.902035623409669, |
| "grad_norm": 1.13284721647744, |
| "learning_rate": 5.2990116270246795e-06, |
| "loss": 0.0713, |
| "step": 10850 |
| }, |
| { |
| "epoch": 6.933842239185751, |
| "grad_norm": 1.1850532152524302, |
| "learning_rate": 5.201310270205375e-06, |
| "loss": 0.0714, |
| "step": 10900 |
| }, |
| { |
| "epoch": 6.965648854961832, |
| "grad_norm": 1.4139501578857867, |
| "learning_rate": 5.104200431244802e-06, |
| "loss": 0.0716, |
| "step": 10950 |
| }, |
| { |
| "epoch": 6.997455470737913, |
| "grad_norm": 1.3206202357844146, |
| "learning_rate": 5.007694080536379e-06, |
| "loss": 0.0721, |
| "step": 11000 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 2.679978132247925, |
| "eval_runtime": 55.9717, |
| "eval_samples_per_second": 50.132, |
| "eval_steps_per_second": 1.572, |
| "step": 11004 |
| }, |
| { |
| "epoch": 7.029262086513995, |
| "grad_norm": 0.7594215283173359, |
| "learning_rate": 4.911803114083635e-06, |
| "loss": 0.0532, |
| "step": 11050 |
| }, |
| { |
| "epoch": 7.061068702290076, |
| "grad_norm": 0.7570171860957418, |
| "learning_rate": 4.816539352033806e-06, |
| "loss": 0.0509, |
| "step": 11100 |
| }, |
| { |
| "epoch": 7.092875318066158, |
| "grad_norm": 0.7549673159769185, |
| "learning_rate": 4.721914537220807e-06, |
| "loss": 0.0522, |
| "step": 11150 |
| }, |
| { |
| "epoch": 7.124681933842239, |
| "grad_norm": 0.8233969961008896, |
| "learning_rate": 4.627940333717758e-06, |
| "loss": 0.0527, |
| "step": 11200 |
| }, |
| { |
| "epoch": 7.156488549618321, |
| "grad_norm": 0.8166861593202606, |
| "learning_rate": 4.534628325399157e-06, |
| "loss": 0.052, |
| "step": 11250 |
| }, |
| { |
| "epoch": 7.188295165394402, |
| "grad_norm": 0.9851243027512826, |
| "learning_rate": 4.441990014513016e-06, |
| "loss": 0.0524, |
| "step": 11300 |
| }, |
| { |
| "epoch": 7.2201017811704835, |
| "grad_norm": 0.8447465274214352, |
| "learning_rate": 4.3500368202629775e-06, |
| "loss": 0.0522, |
| "step": 11350 |
| }, |
| { |
| "epoch": 7.251908396946565, |
| "grad_norm": 0.750732927712184, |
| "learning_rate": 4.2587800774007485e-06, |
| "loss": 0.0528, |
| "step": 11400 |
| }, |
| { |
| "epoch": 7.283715012722646, |
| "grad_norm": 0.7801419309981809, |
| "learning_rate": 4.168231034828873e-06, |
| "loss": 0.0528, |
| "step": 11450 |
| }, |
| { |
| "epoch": 7.315521628498728, |
| "grad_norm": 0.6873017424208359, |
| "learning_rate": 4.078400854214136e-06, |
| "loss": 0.052, |
| "step": 11500 |
| }, |
| { |
| "epoch": 7.347328244274809, |
| "grad_norm": 0.8809132112218713, |
| "learning_rate": 3.989300608611709e-06, |
| "loss": 0.0538, |
| "step": 11550 |
| }, |
| { |
| "epoch": 7.379134860050891, |
| "grad_norm": 0.8614536390106059, |
| "learning_rate": 3.90094128110018e-06, |
| "loss": 0.0532, |
| "step": 11600 |
| }, |
| { |
| "epoch": 7.410941475826972, |
| "grad_norm": 0.7727143215643426, |
| "learning_rate": 3.8133337634277556e-06, |
| "loss": 0.0534, |
| "step": 11650 |
| }, |
| { |
| "epoch": 7.442748091603053, |
| "grad_norm": 0.8389043129325592, |
| "learning_rate": 3.726488854669631e-06, |
| "loss": 0.0535, |
| "step": 11700 |
| }, |
| { |
| "epoch": 7.474554707379135, |
| "grad_norm": 0.7673983015748496, |
| "learning_rate": 3.640417259896856e-06, |
| "loss": 0.0525, |
| "step": 11750 |
| }, |
| { |
| "epoch": 7.506361323155216, |
| "grad_norm": 0.8586943513972485, |
| "learning_rate": 3.5551295888567304e-06, |
| "loss": 0.0527, |
| "step": 11800 |
| }, |
| { |
| "epoch": 7.538167938931298, |
| "grad_norm": 0.8250658330704663, |
| "learning_rate": 3.470636354665006e-06, |
| "loss": 0.0528, |
| "step": 11850 |
| }, |
| { |
| "epoch": 7.569974554707379, |
| "grad_norm": 0.9181462088501849, |
| "learning_rate": 3.386947972509944e-06, |
| "loss": 0.0531, |
| "step": 11900 |
| }, |
| { |
| "epoch": 7.601781170483461, |
| "grad_norm": 0.7612511386277396, |
| "learning_rate": 3.3040747583684864e-06, |
| "loss": 0.0534, |
| "step": 11950 |
| }, |
| { |
| "epoch": 7.633587786259542, |
| "grad_norm": 0.8374071182256133, |
| "learning_rate": 3.2220269277346437e-06, |
| "loss": 0.0525, |
| "step": 12000 |
| }, |
| { |
| "epoch": 7.6653944020356235, |
| "grad_norm": 0.6843857705551388, |
| "learning_rate": 3.140814594360254e-06, |
| "loss": 0.0532, |
| "step": 12050 |
| }, |
| { |
| "epoch": 7.697201017811705, |
| "grad_norm": 0.7472725928560873, |
| "learning_rate": 3.060447769008311e-06, |
| "loss": 0.053, |
| "step": 12100 |
| }, |
| { |
| "epoch": 7.729007633587786, |
| "grad_norm": 0.6742611901517995, |
| "learning_rate": 2.980936358218951e-06, |
| "loss": 0.0521, |
| "step": 12150 |
| }, |
| { |
| "epoch": 7.760814249363868, |
| "grad_norm": 0.7447558302371876, |
| "learning_rate": 2.902290163088334e-06, |
| "loss": 0.0523, |
| "step": 12200 |
| }, |
| { |
| "epoch": 7.792620865139949, |
| "grad_norm": 0.8357691068645717, |
| "learning_rate": 2.824518878060475e-06, |
| "loss": 0.0522, |
| "step": 12250 |
| }, |
| { |
| "epoch": 7.824427480916031, |
| "grad_norm": 0.7615949867158718, |
| "learning_rate": 2.7476320897322507e-06, |
| "loss": 0.0528, |
| "step": 12300 |
| }, |
| { |
| "epoch": 7.856234096692112, |
| "grad_norm": 0.6736658957587341, |
| "learning_rate": 2.6716392756717025e-06, |
| "loss": 0.0528, |
| "step": 12350 |
| }, |
| { |
| "epoch": 7.888040712468193, |
| "grad_norm": 0.8585452571929305, |
| "learning_rate": 2.596549803249748e-06, |
| "loss": 0.0523, |
| "step": 12400 |
| }, |
| { |
| "epoch": 7.919847328244275, |
| "grad_norm": 0.7813934967634093, |
| "learning_rate": 2.522372928485526e-06, |
| "loss": 0.052, |
| "step": 12450 |
| }, |
| { |
| "epoch": 7.951653944020356, |
| "grad_norm": 0.6924594723451242, |
| "learning_rate": 2.4491177949054066e-06, |
| "loss": 0.0526, |
| "step": 12500 |
| }, |
| { |
| "epoch": 7.983460559796438, |
| "grad_norm": 0.797811753846389, |
| "learning_rate": 2.376793432415935e-06, |
| "loss": 0.0521, |
| "step": 12550 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 2.9158363342285156, |
| "eval_runtime": 54.0838, |
| "eval_samples_per_second": 51.882, |
| "eval_steps_per_second": 1.627, |
| "step": 12576 |
| }, |
| { |
| "epoch": 8.01526717557252, |
| "grad_norm": 0.47699575150025136, |
| "learning_rate": 2.3054087561907133e-06, |
| "loss": 0.0476, |
| "step": 12600 |
| }, |
| { |
| "epoch": 8.0470737913486, |
| "grad_norm": 0.44472989078715447, |
| "learning_rate": 2.2349725655714784e-06, |
| "loss": 0.0408, |
| "step": 12650 |
| }, |
| { |
| "epoch": 8.078880407124682, |
| "grad_norm": 0.4166634413980581, |
| "learning_rate": 2.165493542983439e-06, |
| "loss": 0.04, |
| "step": 12700 |
| }, |
| { |
| "epoch": 8.110687022900763, |
| "grad_norm": 0.4826499882370959, |
| "learning_rate": 2.0969802528650052e-06, |
| "loss": 0.0406, |
| "step": 12750 |
| }, |
| { |
| "epoch": 8.142493638676845, |
| "grad_norm": 0.5525649043803899, |
| "learning_rate": 2.0294411406121017e-06, |
| "loss": 0.0403, |
| "step": 12800 |
| }, |
| { |
| "epoch": 8.174300254452927, |
| "grad_norm": 0.6770167907516378, |
| "learning_rate": 1.9628845315371135e-06, |
| "loss": 0.0416, |
| "step": 12850 |
| }, |
| { |
| "epoch": 8.206106870229007, |
| "grad_norm": 0.5609786348847312, |
| "learning_rate": 1.8973186298426715e-06, |
| "loss": 0.04, |
| "step": 12900 |
| }, |
| { |
| "epoch": 8.23791348600509, |
| "grad_norm": 1.2377244048034484, |
| "learning_rate": 1.8327515176103339e-06, |
| "loss": 0.0409, |
| "step": 12950 |
| }, |
| { |
| "epoch": 8.26972010178117, |
| "grad_norm": 0.5811829212233038, |
| "learning_rate": 1.7691911538043426e-06, |
| "loss": 0.0414, |
| "step": 13000 |
| }, |
| { |
| "epoch": 8.301526717557252, |
| "grad_norm": 0.404578030905123, |
| "learning_rate": 1.7066453732905497e-06, |
| "loss": 0.0406, |
| "step": 13050 |
| }, |
| { |
| "epoch": 8.333333333333334, |
| "grad_norm": 0.5099055257542866, |
| "learning_rate": 1.6451218858706374e-06, |
| "loss": 0.0413, |
| "step": 13100 |
| }, |
| { |
| "epoch": 8.365139949109414, |
| "grad_norm": 0.4999951785478943, |
| "learning_rate": 1.5846282753317665e-06, |
| "loss": 0.0403, |
| "step": 13150 |
| }, |
| { |
| "epoch": 8.396946564885496, |
| "grad_norm": 0.48048243299669846, |
| "learning_rate": 1.525171998511733e-06, |
| "loss": 0.0404, |
| "step": 13200 |
| }, |
| { |
| "epoch": 8.428753180661577, |
| "grad_norm": 0.5268405348490426, |
| "learning_rate": 1.4667603843798106e-06, |
| "loss": 0.0412, |
| "step": 13250 |
| }, |
| { |
| "epoch": 8.460559796437659, |
| "grad_norm": 0.48606935124003653, |
| "learning_rate": 1.40940063313331e-06, |
| "loss": 0.0407, |
| "step": 13300 |
| }, |
| { |
| "epoch": 8.492366412213741, |
| "grad_norm": 0.4586801501366044, |
| "learning_rate": 1.3530998153100584e-06, |
| "loss": 0.041, |
| "step": 13350 |
| }, |
| { |
| "epoch": 8.524173027989821, |
| "grad_norm": 0.46212200000809905, |
| "learning_rate": 1.2978648709168218e-06, |
| "loss": 0.0415, |
| "step": 13400 |
| }, |
| { |
| "epoch": 8.555979643765903, |
| "grad_norm": 0.5230101492785741, |
| "learning_rate": 1.2437026085738413e-06, |
| "loss": 0.0412, |
| "step": 13450 |
| }, |
| { |
| "epoch": 8.587786259541986, |
| "grad_norm": 0.5010710891320053, |
| "learning_rate": 1.190619704675564e-06, |
| "loss": 0.0407, |
| "step": 13500 |
| }, |
| { |
| "epoch": 8.619592875318066, |
| "grad_norm": 0.5337226775595767, |
| "learning_rate": 1.1386227025676533e-06, |
| "loss": 0.0411, |
| "step": 13550 |
| }, |
| { |
| "epoch": 8.651399491094148, |
| "grad_norm": 0.49748450214540363, |
| "learning_rate": 1.0877180117404262e-06, |
| "loss": 0.041, |
| "step": 13600 |
| }, |
| { |
| "epoch": 8.683206106870228, |
| "grad_norm": 0.5478966127927775, |
| "learning_rate": 1.0379119070387678e-06, |
| "loss": 0.0408, |
| "step": 13650 |
| }, |
| { |
| "epoch": 8.71501272264631, |
| "grad_norm": 0.5724410891992745, |
| "learning_rate": 9.892105278886633e-07, |
| "loss": 0.0413, |
| "step": 13700 |
| }, |
| { |
| "epoch": 8.746819338422393, |
| "grad_norm": 0.5129565444894096, |
| "learning_rate": 9.416198775403995e-07, |
| "loss": 0.0416, |
| "step": 13750 |
| }, |
| { |
| "epoch": 8.778625954198473, |
| "grad_norm": 0.4919521234854214, |
| "learning_rate": 8.951458223285747e-07, |
| "loss": 0.041, |
| "step": 13800 |
| }, |
| { |
| "epoch": 8.810432569974555, |
| "grad_norm": 0.5202619264425508, |
| "learning_rate": 8.497940909489766e-07, |
| "loss": 0.0409, |
| "step": 13850 |
| }, |
| { |
| "epoch": 8.842239185750635, |
| "grad_norm": 0.5737719301526714, |
| "learning_rate": 8.05570273752414e-07, |
| "loss": 0.0409, |
| "step": 13900 |
| }, |
| { |
| "epoch": 8.874045801526718, |
| "grad_norm": 0.5125697208888668, |
| "learning_rate": 7.624798220556307e-07, |
| "loss": 0.0416, |
| "step": 13950 |
| }, |
| { |
| "epoch": 8.9058524173028, |
| "grad_norm": 0.4752912481647863, |
| "learning_rate": 7.205280474693255e-07, |
| "loss": 0.0408, |
| "step": 14000 |
| }, |
| { |
| "epoch": 8.93765903307888, |
| "grad_norm": 0.46889034532949214, |
| "learning_rate": 6.797201212434179e-07, |
| "loss": 0.0414, |
| "step": 14050 |
| }, |
| { |
| "epoch": 8.969465648854962, |
| "grad_norm": 0.5191849271721021, |
| "learning_rate": 6.40061073629602e-07, |
| "loss": 0.0407, |
| "step": 14100 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 3.1603267192840576, |
| "eval_runtime": 41.6298, |
| "eval_samples_per_second": 67.404, |
| "eval_steps_per_second": 2.114, |
| "step": 14148 |
| }, |
| { |
| "epoch": 9.001272264631043, |
| "grad_norm": 0.3119408951667528, |
| "learning_rate": 6.015557932612814e-07, |
| "loss": 0.0406, |
| "step": 14150 |
| }, |
| { |
| "epoch": 9.033078880407125, |
| "grad_norm": 0.40226216369582235, |
| "learning_rate": 5.642090265509758e-07, |
| "loss": 0.0339, |
| "step": 14200 |
| }, |
| { |
| "epoch": 9.064885496183207, |
| "grad_norm": 0.4523677357722474, |
| "learning_rate": 5.280253771052335e-07, |
| "loss": 0.0345, |
| "step": 14250 |
| }, |
| { |
| "epoch": 9.096692111959287, |
| "grad_norm": 0.4305698289148337, |
| "learning_rate": 4.930093051571727e-07, |
| "loss": 0.034, |
| "step": 14300 |
| }, |
| { |
| "epoch": 9.12849872773537, |
| "grad_norm": 0.391062079240868, |
| "learning_rate": 4.5916512701667194e-07, |
| "loss": 0.0331, |
| "step": 14350 |
| }, |
| { |
| "epoch": 9.16030534351145, |
| "grad_norm": 0.4320929587892914, |
| "learning_rate": 4.264970145383218e-07, |
| "loss": 0.0338, |
| "step": 14400 |
| }, |
| { |
| "epoch": 9.192111959287532, |
| "grad_norm": 0.4600987494145122, |
| "learning_rate": 3.950089946071789e-07, |
| "loss": 0.0338, |
| "step": 14450 |
| }, |
| { |
| "epoch": 9.223918575063614, |
| "grad_norm": 0.4522601727238207, |
| "learning_rate": 3.647049486423715e-07, |
| "loss": 0.0341, |
| "step": 14500 |
| }, |
| { |
| "epoch": 9.255725190839694, |
| "grad_norm": 0.5578268300468956, |
| "learning_rate": 3.3558861211866465e-07, |
| "loss": 0.034, |
| "step": 14550 |
| }, |
| { |
| "epoch": 9.287531806615776, |
| "grad_norm": 0.4469673191217148, |
| "learning_rate": 3.076635741059897e-07, |
| "loss": 0.0339, |
| "step": 14600 |
| }, |
| { |
| "epoch": 9.319338422391857, |
| "grad_norm": 0.3715355931773751, |
| "learning_rate": 2.8093327682704455e-07, |
| "loss": 0.0336, |
| "step": 14650 |
| }, |
| { |
| "epoch": 9.351145038167939, |
| "grad_norm": 0.45563517363253375, |
| "learning_rate": 2.554010152329689e-07, |
| "loss": 0.0338, |
| "step": 14700 |
| }, |
| { |
| "epoch": 9.382951653944021, |
| "grad_norm": 0.439182546045708, |
| "learning_rate": 2.3106993659719912e-07, |
| "loss": 0.0341, |
| "step": 14750 |
| }, |
| { |
| "epoch": 9.414758269720101, |
| "grad_norm": 0.477249284039544, |
| "learning_rate": 2.079430401275062e-07, |
| "loss": 0.0337, |
| "step": 14800 |
| }, |
| { |
| "epoch": 9.446564885496183, |
| "grad_norm": 0.46561188379582136, |
| "learning_rate": 1.8602317659630164e-07, |
| "loss": 0.0341, |
| "step": 14850 |
| }, |
| { |
| "epoch": 9.478371501272264, |
| "grad_norm": 0.43267746862828027, |
| "learning_rate": 1.6531304798922175e-07, |
| "loss": 0.0334, |
| "step": 14900 |
| }, |
| { |
| "epoch": 9.510178117048346, |
| "grad_norm": 0.4469433294529979, |
| "learning_rate": 1.4581520717207532e-07, |
| "loss": 0.0338, |
| "step": 14950 |
| }, |
| { |
| "epoch": 9.541984732824428, |
| "grad_norm": 0.471710246057724, |
| "learning_rate": 1.2753205757615184e-07, |
| "loss": 0.0342, |
| "step": 15000 |
| }, |
| { |
| "epoch": 9.573791348600508, |
| "grad_norm": 0.45324554344467166, |
| "learning_rate": 1.1046585290196199e-07, |
| "loss": 0.0344, |
| "step": 15050 |
| }, |
| { |
| "epoch": 9.60559796437659, |
| "grad_norm": 0.44380469046920124, |
| "learning_rate": 9.461869684143532e-08, |
| "loss": 0.0337, |
| "step": 15100 |
| }, |
| { |
| "epoch": 9.637404580152673, |
| "grad_norm": 0.43526204289234377, |
| "learning_rate": 7.99925428185988e-08, |
| "loss": 0.0336, |
| "step": 15150 |
| }, |
| { |
| "epoch": 9.669211195928753, |
| "grad_norm": 0.48131870263398113, |
| "learning_rate": 6.658919374878836e-08, |
| "loss": 0.0338, |
| "step": 15200 |
| }, |
| { |
| "epoch": 9.701017811704835, |
| "grad_norm": 0.51605738105091, |
| "learning_rate": 5.4410301816409936e-08, |
| "loss": 0.0346, |
| "step": 15250 |
| }, |
| { |
| "epoch": 9.732824427480915, |
| "grad_norm": 0.5246279101214187, |
| "learning_rate": 4.345736827128133e-08, |
| "loss": 0.0338, |
| "step": 15300 |
| }, |
| { |
| "epoch": 9.764631043256998, |
| "grad_norm": 0.4253195146423773, |
| "learning_rate": 3.373174324357464e-08, |
| "loss": 0.034, |
| "step": 15350 |
| }, |
| { |
| "epoch": 9.796437659033078, |
| "grad_norm": 0.4147726333463597, |
| "learning_rate": 2.523462557739609e-08, |
| "loss": 0.034, |
| "step": 15400 |
| }, |
| { |
| "epoch": 9.82824427480916, |
| "grad_norm": 0.4712720308828737, |
| "learning_rate": 1.7967062683001968e-08, |
| "loss": 0.0338, |
| "step": 15450 |
| }, |
| { |
| "epoch": 9.860050890585242, |
| "grad_norm": 0.4668311393279567, |
| "learning_rate": 1.192995040769307e-08, |
| "loss": 0.0331, |
| "step": 15500 |
| }, |
| { |
| "epoch": 9.891857506361323, |
| "grad_norm": 0.5257019319363953, |
| "learning_rate": 7.1240329253852155e-09, |
| "loss": 0.0342, |
| "step": 15550 |
| }, |
| { |
| "epoch": 9.923664122137405, |
| "grad_norm": 0.5539605952740027, |
| "learning_rate": 3.5499026448782006e-09, |
| "loss": 0.0337, |
| "step": 15600 |
| }, |
| { |
| "epoch": 9.955470737913487, |
| "grad_norm": 0.4268133146505169, |
| "learning_rate": 1.2080001368286466e-09, |
| "loss": 0.0339, |
| "step": 15650 |
| }, |
| { |
| "epoch": 9.987277353689567, |
| "grad_norm": 0.47264194038520635, |
| "learning_rate": 9.861407944566914e-11, |
| "loss": 0.034, |
| "step": 15700 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 3.2902276515960693, |
| "eval_runtime": 52.6381, |
| "eval_samples_per_second": 53.307, |
| "eval_steps_per_second": 1.672, |
| "step": 15720 |
| }, |
| { |
| "epoch": 10.0, |
| "step": 15720, |
| "total_flos": 1137765022433280.0, |
| "train_loss": 0.3699118922533273, |
| "train_runtime": 20589.7922, |
| "train_samples_per_second": 24.417, |
| "train_steps_per_second": 0.763 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 15720, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1137765022433280.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|