| { |
| "best_global_step": 89000, |
| "best_metric": 3.5322518348693848, |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/resemble_to_hit_frequency_1001/checkpoint-40000", |
| "epoch": 31.75990675990676, |
| "eval_steps": 1000, |
| "global_step": 109000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.014568764568764568, |
| "grad_norm": 1.0800504684448242, |
| "learning_rate": 0.000294, |
| "loss": 8.4423, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.029137529137529136, |
| "grad_norm": 1.1450026035308838, |
| "learning_rate": 0.0005939999999999999, |
| "loss": 6.716, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.043706293706293704, |
| "grad_norm": 0.8036891222000122, |
| "learning_rate": 0.0005998285714285713, |
| "loss": 6.3382, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.05827505827505827, |
| "grad_norm": 0.45410433411598206, |
| "learning_rate": 0.0005996536443148687, |
| "loss": 6.1321, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.07284382284382285, |
| "grad_norm": 0.4067866802215576, |
| "learning_rate": 0.0005994787172011662, |
| "loss": 6.0112, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.08741258741258741, |
| "grad_norm": 0.5059288144111633, |
| "learning_rate": 0.0005993037900874635, |
| "loss": 5.8763, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.10198135198135198, |
| "grad_norm": 0.4744279086589813, |
| "learning_rate": 0.0005991288629737609, |
| "loss": 5.7396, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.11655011655011654, |
| "grad_norm": 0.4487614035606384, |
| "learning_rate": 0.0005989539358600582, |
| "loss": 5.6258, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.13111888111888112, |
| "grad_norm": 0.49402862787246704, |
| "learning_rate": 0.0005987790087463557, |
| "loss": 5.5118, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.1456876456876457, |
| "grad_norm": 0.5154410004615784, |
| "learning_rate": 0.000598604081632653, |
| "loss": 5.4117, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.16025641025641027, |
| "grad_norm": 0.42797785997390747, |
| "learning_rate": 0.0005984291545189504, |
| "loss": 5.3483, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.17482517482517482, |
| "grad_norm": 0.40208497643470764, |
| "learning_rate": 0.0005982542274052477, |
| "loss": 5.2713, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.1893939393939394, |
| "grad_norm": 0.4364264905452728, |
| "learning_rate": 0.0005980793002915452, |
| "loss": 5.1872, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.20396270396270397, |
| "grad_norm": 0.43082669377326965, |
| "learning_rate": 0.0005979043731778425, |
| "loss": 5.1489, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.21853146853146854, |
| "grad_norm": 0.42554041743278503, |
| "learning_rate": 0.0005977294460641399, |
| "loss": 5.1058, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.2331002331002331, |
| "grad_norm": 0.4088204801082611, |
| "learning_rate": 0.0005975545189504372, |
| "loss": 5.0338, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.24766899766899766, |
| "grad_norm": 0.46788325905799866, |
| "learning_rate": 0.0005973795918367347, |
| "loss": 4.9637, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.26223776223776224, |
| "grad_norm": 0.4362371861934662, |
| "learning_rate": 0.000597204664723032, |
| "loss": 4.9335, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.2768065268065268, |
| "grad_norm": 0.4068501591682434, |
| "learning_rate": 0.0005970297376093294, |
| "loss": 4.8877, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.2913752913752914, |
| "grad_norm": 0.43599948287010193, |
| "learning_rate": 0.0005968548104956268, |
| "loss": 4.8439, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.2913752913752914, |
| "eval_accuracy": 0.25089175031884764, |
| "eval_loss": 4.7751851081848145, |
| "eval_runtime": 179.8397, |
| "eval_samples_per_second": 92.538, |
| "eval_steps_per_second": 5.788, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.30594405594405594, |
| "grad_norm": 0.43511125445365906, |
| "learning_rate": 0.0005966798833819242, |
| "loss": 4.8042, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.32051282051282054, |
| "grad_norm": 0.5234044790267944, |
| "learning_rate": 0.0005965049562682215, |
| "loss": 4.7645, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.3350815850815851, |
| "grad_norm": 0.45317813754081726, |
| "learning_rate": 0.0005963300291545189, |
| "loss": 4.7255, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.34965034965034963, |
| "grad_norm": 0.46833401918411255, |
| "learning_rate": 0.0005961551020408162, |
| "loss": 4.6673, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.36421911421911424, |
| "grad_norm": 0.39988529682159424, |
| "learning_rate": 0.0005959801749271137, |
| "loss": 4.6489, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.3787878787878788, |
| "grad_norm": 0.46132174134254456, |
| "learning_rate": 0.000595805247813411, |
| "loss": 4.6077, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.39335664335664333, |
| "grad_norm": 0.5386888980865479, |
| "learning_rate": 0.0005956303206997084, |
| "loss": 4.6052, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.40792540792540793, |
| "grad_norm": 0.44581151008605957, |
| "learning_rate": 0.0005954553935860059, |
| "loss": 4.5552, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.4224941724941725, |
| "grad_norm": 0.4631711542606354, |
| "learning_rate": 0.0005952804664723032, |
| "loss": 4.5286, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.4370629370629371, |
| "grad_norm": 0.441871702671051, |
| "learning_rate": 0.0005951055393586005, |
| "loss": 4.5022, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.45163170163170163, |
| "grad_norm": 0.4177614152431488, |
| "learning_rate": 0.0005949306122448979, |
| "loss": 4.4857, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.4662004662004662, |
| "grad_norm": 0.4227793216705322, |
| "learning_rate": 0.0005947556851311952, |
| "loss": 4.4736, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.4807692307692308, |
| "grad_norm": 0.4034147262573242, |
| "learning_rate": 0.0005945807580174927, |
| "loss": 4.441, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.49533799533799533, |
| "grad_norm": 0.4163498282432556, |
| "learning_rate": 0.00059440583090379, |
| "loss": 4.4279, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.5099067599067599, |
| "grad_norm": 0.42639055848121643, |
| "learning_rate": 0.0005942309037900874, |
| "loss": 4.4093, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.5244755244755245, |
| "grad_norm": 0.4284451901912689, |
| "learning_rate": 0.0005940559766763847, |
| "loss": 4.3972, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.539044289044289, |
| "grad_norm": 0.4254048764705658, |
| "learning_rate": 0.0005938810495626822, |
| "loss": 4.3764, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.5536130536130536, |
| "grad_norm": 0.40332749485969543, |
| "learning_rate": 0.0005937061224489796, |
| "loss": 4.3664, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.5681818181818182, |
| "grad_norm": 0.41593775153160095, |
| "learning_rate": 0.0005935311953352769, |
| "loss": 4.3491, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.5827505827505828, |
| "grad_norm": 0.3947841227054596, |
| "learning_rate": 0.0005933562682215743, |
| "loss": 4.329, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5827505827505828, |
| "eval_accuracy": 0.3000407334753674, |
| "eval_loss": 4.277864933013916, |
| "eval_runtime": 179.9996, |
| "eval_samples_per_second": 92.456, |
| "eval_steps_per_second": 5.783, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5973193473193473, |
| "grad_norm": 0.3777885437011719, |
| "learning_rate": 0.0005931813411078717, |
| "loss": 4.3248, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.6118881118881119, |
| "grad_norm": 0.4356587827205658, |
| "learning_rate": 0.000593006413994169, |
| "loss": 4.3067, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.6264568764568764, |
| "grad_norm": 0.3846246898174286, |
| "learning_rate": 0.0005928314868804664, |
| "loss": 4.3008, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.6410256410256411, |
| "grad_norm": 0.3826352655887604, |
| "learning_rate": 0.0005926565597667638, |
| "loss": 4.2847, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.6555944055944056, |
| "grad_norm": 0.35496312379837036, |
| "learning_rate": 0.0005924816326530612, |
| "loss": 4.2659, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.6701631701631702, |
| "grad_norm": 0.3881349563598633, |
| "learning_rate": 0.0005923067055393586, |
| "loss": 4.2684, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.6847319347319347, |
| "grad_norm": 0.3730715215206146, |
| "learning_rate": 0.0005921317784256559, |
| "loss": 4.2634, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.6993006993006993, |
| "grad_norm": 0.38490796089172363, |
| "learning_rate": 0.0005919568513119533, |
| "loss": 4.242, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.7138694638694638, |
| "grad_norm": 0.3891685903072357, |
| "learning_rate": 0.0005917819241982507, |
| "loss": 4.2256, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.7284382284382285, |
| "grad_norm": 0.4048764705657959, |
| "learning_rate": 0.000591606997084548, |
| "loss": 4.2221, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.743006993006993, |
| "grad_norm": 0.3975249230861664, |
| "learning_rate": 0.0005914320699708454, |
| "loss": 4.2187, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.7575757575757576, |
| "grad_norm": 0.38864725828170776, |
| "learning_rate": 0.0005912571428571428, |
| "loss": 4.2054, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.7721445221445221, |
| "grad_norm": 0.4100123345851898, |
| "learning_rate": 0.0005910822157434402, |
| "loss": 4.2029, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.7867132867132867, |
| "grad_norm": 0.3994918763637543, |
| "learning_rate": 0.0005909072886297376, |
| "loss": 4.1821, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.8012820512820513, |
| "grad_norm": 0.4012759327888489, |
| "learning_rate": 0.0005907323615160349, |
| "loss": 4.1892, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.8158508158508159, |
| "grad_norm": 0.40569165349006653, |
| "learning_rate": 0.0005905574344023324, |
| "loss": 4.1752, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.8304195804195804, |
| "grad_norm": 0.4140167832374573, |
| "learning_rate": 0.0005903825072886297, |
| "loss": 4.1679, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.844988344988345, |
| "grad_norm": 0.40095993876457214, |
| "learning_rate": 0.000590207580174927, |
| "loss": 4.152, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.8595571095571095, |
| "grad_norm": 0.3606632351875305, |
| "learning_rate": 0.0005900326530612244, |
| "loss": 4.1414, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.8741258741258742, |
| "grad_norm": 0.3624541163444519, |
| "learning_rate": 0.0005898577259475218, |
| "loss": 4.1511, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8741258741258742, |
| "eval_accuracy": 0.31668007594488373, |
| "eval_loss": 4.091818809509277, |
| "eval_runtime": 179.9935, |
| "eval_samples_per_second": 92.459, |
| "eval_steps_per_second": 5.784, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8886946386946387, |
| "grad_norm": 0.37599942088127136, |
| "learning_rate": 0.0005896827988338192, |
| "loss": 4.1237, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.9032634032634033, |
| "grad_norm": 0.37333548069000244, |
| "learning_rate": 0.0005895078717201166, |
| "loss": 4.1181, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.9178321678321678, |
| "grad_norm": 0.3791821300983429, |
| "learning_rate": 0.000589332944606414, |
| "loss": 4.1161, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.9324009324009324, |
| "grad_norm": 0.34895625710487366, |
| "learning_rate": 0.0005891580174927114, |
| "loss": 4.1138, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.946969696969697, |
| "grad_norm": 0.4230435788631439, |
| "learning_rate": 0.0005889830903790087, |
| "loss": 4.0998, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.9615384615384616, |
| "grad_norm": 0.356535404920578, |
| "learning_rate": 0.000588808163265306, |
| "loss": 4.1032, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.9761072261072261, |
| "grad_norm": 0.39397764205932617, |
| "learning_rate": 0.0005886332361516035, |
| "loss": 4.0832, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.9906759906759907, |
| "grad_norm": 0.34773239493370056, |
| "learning_rate": 0.0005884583090379008, |
| "loss": 4.0945, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.0052447552447552, |
| "grad_norm": 0.3444010019302368, |
| "learning_rate": 0.0005882833819241982, |
| "loss": 4.0365, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.0198135198135199, |
| "grad_norm": 0.3705681264400482, |
| "learning_rate": 0.0005881084548104955, |
| "loss": 4.0123, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.0343822843822843, |
| "grad_norm": 0.351367712020874, |
| "learning_rate": 0.000587933527696793, |
| "loss": 4.0021, |
| "step": 3550 |
| }, |
| { |
| "epoch": 1.048951048951049, |
| "grad_norm": 0.34913498163223267, |
| "learning_rate": 0.0005877586005830904, |
| "loss": 4.0137, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.0635198135198136, |
| "grad_norm": 0.35815438628196716, |
| "learning_rate": 0.0005875836734693877, |
| "loss": 4.0028, |
| "step": 3650 |
| }, |
| { |
| "epoch": 1.078088578088578, |
| "grad_norm": 0.3701878488063812, |
| "learning_rate": 0.0005874087463556851, |
| "loss": 4.0022, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.0926573426573427, |
| "grad_norm": 0.3556549847126007, |
| "learning_rate": 0.0005872338192419825, |
| "loss": 4.011, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.1072261072261071, |
| "grad_norm": 0.3409753739833832, |
| "learning_rate": 0.0005870588921282798, |
| "loss": 4.0026, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.1217948717948718, |
| "grad_norm": 0.3481835126876831, |
| "learning_rate": 0.0005868839650145772, |
| "loss": 3.992, |
| "step": 3850 |
| }, |
| { |
| "epoch": 1.1363636363636362, |
| "grad_norm": 0.3449511229991913, |
| "learning_rate": 0.0005867090379008745, |
| "loss": 3.9925, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.150932400932401, |
| "grad_norm": 0.3285890817642212, |
| "learning_rate": 0.000586534110787172, |
| "loss": 3.996, |
| "step": 3950 |
| }, |
| { |
| "epoch": 1.1655011655011656, |
| "grad_norm": 0.34176114201545715, |
| "learning_rate": 0.0005863591836734694, |
| "loss": 3.9927, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.1655011655011656, |
| "eval_accuracy": 0.32579760119340617, |
| "eval_loss": 3.9809024333953857, |
| "eval_runtime": 179.99, |
| "eval_samples_per_second": 92.461, |
| "eval_steps_per_second": 5.784, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.18006993006993, |
| "grad_norm": 0.33748361468315125, |
| "learning_rate": 0.0005861842565597667, |
| "loss": 3.9866, |
| "step": 4050 |
| }, |
| { |
| "epoch": 1.1946386946386947, |
| "grad_norm": 0.3346062898635864, |
| "learning_rate": 0.0005860093294460641, |
| "loss": 3.9696, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.2092074592074593, |
| "grad_norm": 0.3950088322162628, |
| "learning_rate": 0.0005858344023323615, |
| "loss": 3.9875, |
| "step": 4150 |
| }, |
| { |
| "epoch": 1.2237762237762237, |
| "grad_norm": 0.3502199053764343, |
| "learning_rate": 0.0005856594752186588, |
| "loss": 3.9619, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.2383449883449884, |
| "grad_norm": 0.35575905442237854, |
| "learning_rate": 0.0005854845481049562, |
| "loss": 3.9642, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.2529137529137528, |
| "grad_norm": 0.3766721487045288, |
| "learning_rate": 0.0005853096209912535, |
| "loss": 3.973, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.2674825174825175, |
| "grad_norm": 0.3426792621612549, |
| "learning_rate": 0.000585134693877551, |
| "loss": 3.965, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.282051282051282, |
| "grad_norm": 0.32944539189338684, |
| "learning_rate": 0.0005849597667638484, |
| "loss": 3.9634, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.2966200466200466, |
| "grad_norm": 0.3352791368961334, |
| "learning_rate": 0.0005847848396501457, |
| "loss": 3.953, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.3111888111888113, |
| "grad_norm": 0.3523617386817932, |
| "learning_rate": 0.0005846099125364432, |
| "loss": 3.9577, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.3257575757575757, |
| "grad_norm": 0.35192176699638367, |
| "learning_rate": 0.0005844349854227405, |
| "loss": 3.9518, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.3403263403263403, |
| "grad_norm": 0.3310588598251343, |
| "learning_rate": 0.0005842600583090379, |
| "loss": 3.9505, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.354895104895105, |
| "grad_norm": 0.3588995933532715, |
| "learning_rate": 0.0005840851311953352, |
| "loss": 3.9455, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.3694638694638694, |
| "grad_norm": 0.36037853360176086, |
| "learning_rate": 0.0005839102040816325, |
| "loss": 3.929, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.384032634032634, |
| "grad_norm": 0.335283488035202, |
| "learning_rate": 0.00058373527696793, |
| "loss": 3.9381, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.3986013986013985, |
| "grad_norm": 0.4042842388153076, |
| "learning_rate": 0.0005835603498542273, |
| "loss": 3.9542, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.4131701631701632, |
| "grad_norm": 0.3445352613925934, |
| "learning_rate": 0.0005833854227405247, |
| "loss": 3.9421, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.4277389277389276, |
| "grad_norm": 0.33620062470436096, |
| "learning_rate": 0.0005832104956268222, |
| "loss": 3.9228, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.4423076923076923, |
| "grad_norm": 0.3565351665019989, |
| "learning_rate": 0.0005830355685131195, |
| "loss": 3.9231, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.456876456876457, |
| "grad_norm": 0.3392309248447418, |
| "learning_rate": 0.0005828606413994169, |
| "loss": 3.9166, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.456876456876457, |
| "eval_accuracy": 0.33273405109228976, |
| "eval_loss": 3.90755033493042, |
| "eval_runtime": 180.1089, |
| "eval_samples_per_second": 92.4, |
| "eval_steps_per_second": 5.78, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.4714452214452214, |
| "grad_norm": 0.33691731095314026, |
| "learning_rate": 0.0005826857142857142, |
| "loss": 3.919, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.486013986013986, |
| "grad_norm": 0.34635353088378906, |
| "learning_rate": 0.0005825107871720116, |
| "loss": 3.9288, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.5005827505827507, |
| "grad_norm": 0.33720529079437256, |
| "learning_rate": 0.000582335860058309, |
| "loss": 3.9096, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.5151515151515151, |
| "grad_norm": 0.33896803855895996, |
| "learning_rate": 0.0005821609329446063, |
| "loss": 3.9164, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.5297202797202796, |
| "grad_norm": 0.3564818799495697, |
| "learning_rate": 0.0005819860058309037, |
| "loss": 3.923, |
| "step": 5250 |
| }, |
| { |
| "epoch": 1.5442890442890445, |
| "grad_norm": 0.3230993151664734, |
| "learning_rate": 0.0005818110787172012, |
| "loss": 3.9313, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.558857808857809, |
| "grad_norm": 0.3241836428642273, |
| "learning_rate": 0.0005816361516034985, |
| "loss": 3.9051, |
| "step": 5350 |
| }, |
| { |
| "epoch": 1.5734265734265733, |
| "grad_norm": 0.3641699254512787, |
| "learning_rate": 0.0005814612244897959, |
| "loss": 3.8919, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.587995337995338, |
| "grad_norm": 0.33417633175849915, |
| "learning_rate": 0.0005812862973760932, |
| "loss": 3.9001, |
| "step": 5450 |
| }, |
| { |
| "epoch": 1.6025641025641026, |
| "grad_norm": 0.3381204903125763, |
| "learning_rate": 0.0005811113702623907, |
| "loss": 3.8948, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.617132867132867, |
| "grad_norm": 0.33705613017082214, |
| "learning_rate": 0.000580936443148688, |
| "loss": 3.9058, |
| "step": 5550 |
| }, |
| { |
| "epoch": 1.6317016317016317, |
| "grad_norm": 0.33371835947036743, |
| "learning_rate": 0.0005807615160349853, |
| "loss": 3.8844, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.6462703962703964, |
| "grad_norm": 0.3282848596572876, |
| "learning_rate": 0.0005805865889212827, |
| "loss": 3.8905, |
| "step": 5650 |
| }, |
| { |
| "epoch": 1.6608391608391608, |
| "grad_norm": 0.33143365383148193, |
| "learning_rate": 0.0005804116618075802, |
| "loss": 3.8805, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.6754079254079253, |
| "grad_norm": 0.3147096335887909, |
| "learning_rate": 0.0005802367346938775, |
| "loss": 3.8818, |
| "step": 5750 |
| }, |
| { |
| "epoch": 1.68997668997669, |
| "grad_norm": 0.3499462902545929, |
| "learning_rate": 0.0005800618075801749, |
| "loss": 3.887, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.7045454545454546, |
| "grad_norm": 0.3366160988807678, |
| "learning_rate": 0.0005798868804664722, |
| "loss": 3.8708, |
| "step": 5850 |
| }, |
| { |
| "epoch": 1.719114219114219, |
| "grad_norm": 0.33289647102355957, |
| "learning_rate": 0.0005797119533527697, |
| "loss": 3.881, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.7336829836829837, |
| "grad_norm": 0.34272122383117676, |
| "learning_rate": 0.000579537026239067, |
| "loss": 3.888, |
| "step": 5950 |
| }, |
| { |
| "epoch": 1.7482517482517483, |
| "grad_norm": 0.33425042033195496, |
| "learning_rate": 0.0005793620991253643, |
| "loss": 3.8693, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.7482517482517483, |
| "eval_accuracy": 0.3378537221389026, |
| "eval_loss": 3.8500373363494873, |
| "eval_runtime": 180.0024, |
| "eval_samples_per_second": 92.454, |
| "eval_steps_per_second": 5.783, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.7628205128205128, |
| "grad_norm": 0.33192870020866394, |
| "learning_rate": 0.0005791871720116617, |
| "loss": 3.8684, |
| "step": 6050 |
| }, |
| { |
| "epoch": 1.7773892773892774, |
| "grad_norm": 0.3180764615535736, |
| "learning_rate": 0.0005790122448979591, |
| "loss": 3.8647, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.791958041958042, |
| "grad_norm": 0.3597005009651184, |
| "learning_rate": 0.0005788373177842565, |
| "loss": 3.8541, |
| "step": 6150 |
| }, |
| { |
| "epoch": 1.8065268065268065, |
| "grad_norm": 0.30698323249816895, |
| "learning_rate": 0.0005786623906705539, |
| "loss": 3.8699, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.821095571095571, |
| "grad_norm": 0.32494407892227173, |
| "learning_rate": 0.0005784874635568512, |
| "loss": 3.8687, |
| "step": 6250 |
| }, |
| { |
| "epoch": 1.8356643356643356, |
| "grad_norm": 0.32841014862060547, |
| "learning_rate": 0.0005783125364431487, |
| "loss": 3.8528, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.8502331002331003, |
| "grad_norm": 0.3171062171459198, |
| "learning_rate": 0.000578137609329446, |
| "loss": 3.8483, |
| "step": 6350 |
| }, |
| { |
| "epoch": 1.8648018648018647, |
| "grad_norm": 0.3196788728237152, |
| "learning_rate": 0.0005779626822157434, |
| "loss": 3.8493, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.8793706293706294, |
| "grad_norm": 0.324875146150589, |
| "learning_rate": 0.0005777877551020408, |
| "loss": 3.8649, |
| "step": 6450 |
| }, |
| { |
| "epoch": 1.893939393939394, |
| "grad_norm": 0.3102283775806427, |
| "learning_rate": 0.0005776128279883381, |
| "loss": 3.8619, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.9085081585081585, |
| "grad_norm": 0.34474343061447144, |
| "learning_rate": 0.0005774379008746355, |
| "loss": 3.8556, |
| "step": 6550 |
| }, |
| { |
| "epoch": 1.9230769230769231, |
| "grad_norm": 0.3284788429737091, |
| "learning_rate": 0.0005772629737609329, |
| "loss": 3.8463, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.9376456876456878, |
| "grad_norm": 0.31729623675346375, |
| "learning_rate": 0.0005770880466472303, |
| "loss": 3.837, |
| "step": 6650 |
| }, |
| { |
| "epoch": 1.9522144522144522, |
| "grad_norm": 0.3377760946750641, |
| "learning_rate": 0.0005769131195335277, |
| "loss": 3.8615, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.9667832167832167, |
| "grad_norm": 0.31680819392204285, |
| "learning_rate": 0.000576738192419825, |
| "loss": 3.8351, |
| "step": 6750 |
| }, |
| { |
| "epoch": 1.9813519813519813, |
| "grad_norm": 0.3283533453941345, |
| "learning_rate": 0.0005765632653061224, |
| "loss": 3.8376, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.995920745920746, |
| "grad_norm": 0.32663795351982117, |
| "learning_rate": 0.0005763883381924198, |
| "loss": 3.8481, |
| "step": 6850 |
| }, |
| { |
| "epoch": 2.0104895104895104, |
| "grad_norm": 0.3307703733444214, |
| "learning_rate": 0.0005762134110787171, |
| "loss": 3.7685, |
| "step": 6900 |
| }, |
| { |
| "epoch": 2.025058275058275, |
| "grad_norm": 0.3428336977958679, |
| "learning_rate": 0.0005760384839650145, |
| "loss": 3.734, |
| "step": 6950 |
| }, |
| { |
| "epoch": 2.0396270396270397, |
| "grad_norm": 0.3403928577899933, |
| "learning_rate": 0.0005758635568513119, |
| "loss": 3.7327, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.0396270396270397, |
| "eval_accuracy": 0.342164485630514, |
| "eval_loss": 3.808783769607544, |
| "eval_runtime": 180.0566, |
| "eval_samples_per_second": 92.427, |
| "eval_steps_per_second": 5.782, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.054195804195804, |
| "grad_norm": 0.3355982303619385, |
| "learning_rate": 0.0005756886297376093, |
| "loss": 3.7526, |
| "step": 7050 |
| }, |
| { |
| "epoch": 2.0687645687645686, |
| "grad_norm": 0.34529364109039307, |
| "learning_rate": 0.0005755137026239067, |
| "loss": 3.7421, |
| "step": 7100 |
| }, |
| { |
| "epoch": 2.0833333333333335, |
| "grad_norm": 0.3153204023838043, |
| "learning_rate": 0.000575338775510204, |
| "loss": 3.752, |
| "step": 7150 |
| }, |
| { |
| "epoch": 2.097902097902098, |
| "grad_norm": 0.32373619079589844, |
| "learning_rate": 0.0005751638483965014, |
| "loss": 3.737, |
| "step": 7200 |
| }, |
| { |
| "epoch": 2.1124708624708624, |
| "grad_norm": 0.32524994015693665, |
| "learning_rate": 0.0005749889212827988, |
| "loss": 3.7506, |
| "step": 7250 |
| }, |
| { |
| "epoch": 2.1270396270396272, |
| "grad_norm": 0.33004939556121826, |
| "learning_rate": 0.0005748139941690962, |
| "loss": 3.7495, |
| "step": 7300 |
| }, |
| { |
| "epoch": 2.1416083916083917, |
| "grad_norm": 0.3301999270915985, |
| "learning_rate": 0.0005746390670553935, |
| "loss": 3.7555, |
| "step": 7350 |
| }, |
| { |
| "epoch": 2.156177156177156, |
| "grad_norm": 0.31961825489997864, |
| "learning_rate": 0.000574464139941691, |
| "loss": 3.7478, |
| "step": 7400 |
| }, |
| { |
| "epoch": 2.1707459207459205, |
| "grad_norm": 0.3425387144088745, |
| "learning_rate": 0.0005742892128279883, |
| "loss": 3.7535, |
| "step": 7450 |
| }, |
| { |
| "epoch": 2.1853146853146854, |
| "grad_norm": 0.33788540959358215, |
| "learning_rate": 0.0005741142857142857, |
| "loss": 3.7545, |
| "step": 7500 |
| }, |
| { |
| "epoch": 2.19988344988345, |
| "grad_norm": 0.32931485772132874, |
| "learning_rate": 0.000573939358600583, |
| "loss": 3.7457, |
| "step": 7550 |
| }, |
| { |
| "epoch": 2.2144522144522143, |
| "grad_norm": 0.3231325149536133, |
| "learning_rate": 0.0005737644314868805, |
| "loss": 3.7522, |
| "step": 7600 |
| }, |
| { |
| "epoch": 2.229020979020979, |
| "grad_norm": 0.30641666054725647, |
| "learning_rate": 0.0005735895043731778, |
| "loss": 3.7573, |
| "step": 7650 |
| }, |
| { |
| "epoch": 2.2435897435897436, |
| "grad_norm": 0.3495963513851166, |
| "learning_rate": 0.0005734145772594752, |
| "loss": 3.7483, |
| "step": 7700 |
| }, |
| { |
| "epoch": 2.258158508158508, |
| "grad_norm": 0.3273448646068573, |
| "learning_rate": 0.0005732396501457726, |
| "loss": 3.7492, |
| "step": 7750 |
| }, |
| { |
| "epoch": 2.2727272727272725, |
| "grad_norm": 0.3312014639377594, |
| "learning_rate": 0.0005730647230320698, |
| "loss": 3.7369, |
| "step": 7800 |
| }, |
| { |
| "epoch": 2.2872960372960374, |
| "grad_norm": 0.31098538637161255, |
| "learning_rate": 0.0005728897959183673, |
| "loss": 3.7632, |
| "step": 7850 |
| }, |
| { |
| "epoch": 2.301864801864802, |
| "grad_norm": 0.3138977885246277, |
| "learning_rate": 0.0005727148688046647, |
| "loss": 3.725, |
| "step": 7900 |
| }, |
| { |
| "epoch": 2.3164335664335667, |
| "grad_norm": 0.3393580913543701, |
| "learning_rate": 0.000572539941690962, |
| "loss": 3.7624, |
| "step": 7950 |
| }, |
| { |
| "epoch": 2.331002331002331, |
| "grad_norm": 0.3338955044746399, |
| "learning_rate": 0.0005723650145772595, |
| "loss": 3.7487, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.331002331002331, |
| "eval_accuracy": 0.3450254713570997, |
| "eval_loss": 3.777087450027466, |
| "eval_runtime": 180.0959, |
| "eval_samples_per_second": 92.406, |
| "eval_steps_per_second": 5.78, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.3455710955710956, |
| "grad_norm": 0.3354654014110565, |
| "learning_rate": 0.0005721900874635568, |
| "loss": 3.7489, |
| "step": 8050 |
| }, |
| { |
| "epoch": 2.36013986013986, |
| "grad_norm": 0.31632018089294434, |
| "learning_rate": 0.0005720151603498542, |
| "loss": 3.748, |
| "step": 8100 |
| }, |
| { |
| "epoch": 2.374708624708625, |
| "grad_norm": 0.31123656034469604, |
| "learning_rate": 0.0005718402332361515, |
| "loss": 3.7349, |
| "step": 8150 |
| }, |
| { |
| "epoch": 2.3892773892773893, |
| "grad_norm": 0.3092937469482422, |
| "learning_rate": 0.000571665306122449, |
| "loss": 3.7501, |
| "step": 8200 |
| }, |
| { |
| "epoch": 2.4038461538461537, |
| "grad_norm": 0.3420781195163727, |
| "learning_rate": 0.0005714903790087463, |
| "loss": 3.7408, |
| "step": 8250 |
| }, |
| { |
| "epoch": 2.4184149184149186, |
| "grad_norm": 0.3158037066459656, |
| "learning_rate": 0.0005713154518950437, |
| "loss": 3.7381, |
| "step": 8300 |
| }, |
| { |
| "epoch": 2.432983682983683, |
| "grad_norm": 0.32453060150146484, |
| "learning_rate": 0.000571140524781341, |
| "loss": 3.7396, |
| "step": 8350 |
| }, |
| { |
| "epoch": 2.4475524475524475, |
| "grad_norm": 0.3126128613948822, |
| "learning_rate": 0.0005709655976676385, |
| "loss": 3.7409, |
| "step": 8400 |
| }, |
| { |
| "epoch": 2.462121212121212, |
| "grad_norm": 0.31964123249053955, |
| "learning_rate": 0.0005707906705539358, |
| "loss": 3.7503, |
| "step": 8450 |
| }, |
| { |
| "epoch": 2.476689976689977, |
| "grad_norm": 0.3062613606452942, |
| "learning_rate": 0.0005706157434402332, |
| "loss": 3.7406, |
| "step": 8500 |
| }, |
| { |
| "epoch": 2.4912587412587412, |
| "grad_norm": 0.31508180499076843, |
| "learning_rate": 0.0005704408163265305, |
| "loss": 3.7365, |
| "step": 8550 |
| }, |
| { |
| "epoch": 2.5058275058275057, |
| "grad_norm": 0.3289349377155304, |
| "learning_rate": 0.000570265889212828, |
| "loss": 3.7355, |
| "step": 8600 |
| }, |
| { |
| "epoch": 2.5203962703962706, |
| "grad_norm": 0.3557591736316681, |
| "learning_rate": 0.0005700909620991253, |
| "loss": 3.7401, |
| "step": 8650 |
| }, |
| { |
| "epoch": 2.534965034965035, |
| "grad_norm": 0.3184148073196411, |
| "learning_rate": 0.0005699160349854227, |
| "loss": 3.7459, |
| "step": 8700 |
| }, |
| { |
| "epoch": 2.5495337995337994, |
| "grad_norm": 0.33620187640190125, |
| "learning_rate": 0.00056974110787172, |
| "loss": 3.7431, |
| "step": 8750 |
| }, |
| { |
| "epoch": 2.564102564102564, |
| "grad_norm": 0.31378746032714844, |
| "learning_rate": 0.0005695661807580175, |
| "loss": 3.756, |
| "step": 8800 |
| }, |
| { |
| "epoch": 2.5786713286713288, |
| "grad_norm": 0.32717159390449524, |
| "learning_rate": 0.0005693912536443148, |
| "loss": 3.7267, |
| "step": 8850 |
| }, |
| { |
| "epoch": 2.593240093240093, |
| "grad_norm": 0.2987724244594574, |
| "learning_rate": 0.0005692163265306122, |
| "loss": 3.732, |
| "step": 8900 |
| }, |
| { |
| "epoch": 2.607808857808858, |
| "grad_norm": 0.3157244324684143, |
| "learning_rate": 0.0005690413994169095, |
| "loss": 3.7428, |
| "step": 8950 |
| }, |
| { |
| "epoch": 2.6223776223776225, |
| "grad_norm": 0.307952344417572, |
| "learning_rate": 0.000568866472303207, |
| "loss": 3.7285, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.6223776223776225, |
| "eval_accuracy": 0.3477683958560039, |
| "eval_loss": 3.749556064605713, |
| "eval_runtime": 180.156, |
| "eval_samples_per_second": 92.375, |
| "eval_steps_per_second": 5.778, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.636946386946387, |
| "grad_norm": 0.3186863660812378, |
| "learning_rate": 0.0005686915451895044, |
| "loss": 3.7367, |
| "step": 9050 |
| }, |
| { |
| "epoch": 2.6515151515151514, |
| "grad_norm": 0.33932939171791077, |
| "learning_rate": 0.0005685166180758016, |
| "loss": 3.7358, |
| "step": 9100 |
| }, |
| { |
| "epoch": 2.666083916083916, |
| "grad_norm": 0.3142889142036438, |
| "learning_rate": 0.000568341690962099, |
| "loss": 3.7354, |
| "step": 9150 |
| }, |
| { |
| "epoch": 2.6806526806526807, |
| "grad_norm": 0.3227601647377014, |
| "learning_rate": 0.0005681667638483965, |
| "loss": 3.7269, |
| "step": 9200 |
| }, |
| { |
| "epoch": 2.695221445221445, |
| "grad_norm": 0.36613523960113525, |
| "learning_rate": 0.0005679918367346938, |
| "loss": 3.7232, |
| "step": 9250 |
| }, |
| { |
| "epoch": 2.70979020979021, |
| "grad_norm": 0.3181409239768982, |
| "learning_rate": 0.0005678169096209912, |
| "loss": 3.7404, |
| "step": 9300 |
| }, |
| { |
| "epoch": 2.7243589743589745, |
| "grad_norm": 0.32013779878616333, |
| "learning_rate": 0.0005676419825072885, |
| "loss": 3.7268, |
| "step": 9350 |
| }, |
| { |
| "epoch": 2.738927738927739, |
| "grad_norm": 0.31618836522102356, |
| "learning_rate": 0.000567467055393586, |
| "loss": 3.723, |
| "step": 9400 |
| }, |
| { |
| "epoch": 2.7534965034965033, |
| "grad_norm": 0.34557044506073, |
| "learning_rate": 0.0005672921282798833, |
| "loss": 3.7266, |
| "step": 9450 |
| }, |
| { |
| "epoch": 2.768065268065268, |
| "grad_norm": 0.3225458264350891, |
| "learning_rate": 0.0005671172011661807, |
| "loss": 3.7298, |
| "step": 9500 |
| }, |
| { |
| "epoch": 2.7826340326340326, |
| "grad_norm": 0.3175308406352997, |
| "learning_rate": 0.000566942274052478, |
| "loss": 3.7272, |
| "step": 9550 |
| }, |
| { |
| "epoch": 2.797202797202797, |
| "grad_norm": 0.3606358468532562, |
| "learning_rate": 0.0005667673469387755, |
| "loss": 3.7187, |
| "step": 9600 |
| }, |
| { |
| "epoch": 2.811771561771562, |
| "grad_norm": 0.32616934180259705, |
| "learning_rate": 0.0005665924198250728, |
| "loss": 3.7128, |
| "step": 9650 |
| }, |
| { |
| "epoch": 2.8263403263403264, |
| "grad_norm": 0.30753543972969055, |
| "learning_rate": 0.0005664174927113702, |
| "loss": 3.7311, |
| "step": 9700 |
| }, |
| { |
| "epoch": 2.840909090909091, |
| "grad_norm": 0.3168071508407593, |
| "learning_rate": 0.0005662425655976676, |
| "loss": 3.7225, |
| "step": 9750 |
| }, |
| { |
| "epoch": 2.8554778554778553, |
| "grad_norm": 0.3064960837364197, |
| "learning_rate": 0.000566067638483965, |
| "loss": 3.714, |
| "step": 9800 |
| }, |
| { |
| "epoch": 2.87004662004662, |
| "grad_norm": 0.3164961338043213, |
| "learning_rate": 0.0005658927113702623, |
| "loss": 3.7264, |
| "step": 9850 |
| }, |
| { |
| "epoch": 2.8846153846153846, |
| "grad_norm": 0.32540807127952576, |
| "learning_rate": 0.0005657177842565597, |
| "loss": 3.7172, |
| "step": 9900 |
| }, |
| { |
| "epoch": 2.8991841491841495, |
| "grad_norm": 0.318560391664505, |
| "learning_rate": 0.0005655428571428572, |
| "loss": 3.7118, |
| "step": 9950 |
| }, |
| { |
| "epoch": 2.913752913752914, |
| "grad_norm": 0.3398500084877014, |
| "learning_rate": 0.0005653679300291545, |
| "loss": 3.7212, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.913752913752914, |
| "eval_accuracy": 0.35038244076771785, |
| "eval_loss": 3.7192089557647705, |
| "eval_runtime": 180.0126, |
| "eval_samples_per_second": 92.449, |
| "eval_steps_per_second": 5.783, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.9283216783216783, |
| "grad_norm": 0.32436761260032654, |
| "learning_rate": 0.0005651930029154518, |
| "loss": 3.7122, |
| "step": 10050 |
| }, |
| { |
| "epoch": 2.9428904428904428, |
| "grad_norm": 0.3328739106655121, |
| "learning_rate": 0.0005650180758017492, |
| "loss": 3.7132, |
| "step": 10100 |
| }, |
| { |
| "epoch": 2.957459207459207, |
| "grad_norm": 0.31366440653800964, |
| "learning_rate": 0.0005648431486880466, |
| "loss": 3.7084, |
| "step": 10150 |
| }, |
| { |
| "epoch": 2.972027972027972, |
| "grad_norm": 0.3094634711742401, |
| "learning_rate": 0.000564668221574344, |
| "loss": 3.711, |
| "step": 10200 |
| }, |
| { |
| "epoch": 2.9865967365967365, |
| "grad_norm": 0.3342944383621216, |
| "learning_rate": 0.0005644932944606413, |
| "loss": 3.7073, |
| "step": 10250 |
| }, |
| { |
| "epoch": 3.001165501165501, |
| "grad_norm": 0.3029869496822357, |
| "learning_rate": 0.0005643183673469387, |
| "loss": 3.7038, |
| "step": 10300 |
| }, |
| { |
| "epoch": 3.015734265734266, |
| "grad_norm": 0.314151406288147, |
| "learning_rate": 0.0005641434402332362, |
| "loss": 3.5974, |
| "step": 10350 |
| }, |
| { |
| "epoch": 3.0303030303030303, |
| "grad_norm": 0.31103676557540894, |
| "learning_rate": 0.0005639685131195335, |
| "loss": 3.6124, |
| "step": 10400 |
| }, |
| { |
| "epoch": 3.0448717948717947, |
| "grad_norm": 0.3061332702636719, |
| "learning_rate": 0.0005637935860058308, |
| "loss": 3.6153, |
| "step": 10450 |
| }, |
| { |
| "epoch": 3.0594405594405596, |
| "grad_norm": 0.3213406205177307, |
| "learning_rate": 0.0005636186588921282, |
| "loss": 3.6026, |
| "step": 10500 |
| }, |
| { |
| "epoch": 3.074009324009324, |
| "grad_norm": 0.3310888111591339, |
| "learning_rate": 0.0005634437317784256, |
| "loss": 3.6145, |
| "step": 10550 |
| }, |
| { |
| "epoch": 3.0885780885780885, |
| "grad_norm": 0.3187083601951599, |
| "learning_rate": 0.000563268804664723, |
| "loss": 3.6114, |
| "step": 10600 |
| }, |
| { |
| "epoch": 3.1031468531468533, |
| "grad_norm": 0.3135937750339508, |
| "learning_rate": 0.0005630938775510203, |
| "loss": 3.6184, |
| "step": 10650 |
| }, |
| { |
| "epoch": 3.117715617715618, |
| "grad_norm": 0.3369690179824829, |
| "learning_rate": 0.0005629189504373177, |
| "loss": 3.6178, |
| "step": 10700 |
| }, |
| { |
| "epoch": 3.132284382284382, |
| "grad_norm": 0.3287160098552704, |
| "learning_rate": 0.0005627440233236151, |
| "loss": 3.6235, |
| "step": 10750 |
| }, |
| { |
| "epoch": 3.1468531468531467, |
| "grad_norm": 0.3253456652164459, |
| "learning_rate": 0.0005625690962099125, |
| "loss": 3.6301, |
| "step": 10800 |
| }, |
| { |
| "epoch": 3.1614219114219115, |
| "grad_norm": 0.3379002511501312, |
| "learning_rate": 0.0005623941690962099, |
| "loss": 3.6268, |
| "step": 10850 |
| }, |
| { |
| "epoch": 3.175990675990676, |
| "grad_norm": 0.3228496313095093, |
| "learning_rate": 0.0005622192419825073, |
| "loss": 3.623, |
| "step": 10900 |
| }, |
| { |
| "epoch": 3.1905594405594404, |
| "grad_norm": 0.32255882024765015, |
| "learning_rate": 0.0005620443148688046, |
| "loss": 3.6165, |
| "step": 10950 |
| }, |
| { |
| "epoch": 3.2051282051282053, |
| "grad_norm": 0.3215540945529938, |
| "learning_rate": 0.000561869387755102, |
| "loss": 3.6257, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.2051282051282053, |
| "eval_accuracy": 0.3524533334775781, |
| "eval_loss": 3.709486484527588, |
| "eval_runtime": 179.9331, |
| "eval_samples_per_second": 92.49, |
| "eval_steps_per_second": 5.785, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.2196969696969697, |
| "grad_norm": 0.31145378947257996, |
| "learning_rate": 0.0005616944606413993, |
| "loss": 3.6378, |
| "step": 11050 |
| }, |
| { |
| "epoch": 3.234265734265734, |
| "grad_norm": 0.3215027451515198, |
| "learning_rate": 0.0005615195335276968, |
| "loss": 3.6347, |
| "step": 11100 |
| }, |
| { |
| "epoch": 3.248834498834499, |
| "grad_norm": 0.32689639925956726, |
| "learning_rate": 0.0005613446064139941, |
| "loss": 3.6273, |
| "step": 11150 |
| }, |
| { |
| "epoch": 3.2634032634032635, |
| "grad_norm": 0.3098812401294708, |
| "learning_rate": 0.0005611696793002915, |
| "loss": 3.6261, |
| "step": 11200 |
| }, |
| { |
| "epoch": 3.277972027972028, |
| "grad_norm": 0.3190302848815918, |
| "learning_rate": 0.0005609947521865889, |
| "loss": 3.6275, |
| "step": 11250 |
| }, |
| { |
| "epoch": 3.2925407925407923, |
| "grad_norm": 0.31954291462898254, |
| "learning_rate": 0.0005608198250728863, |
| "loss": 3.6323, |
| "step": 11300 |
| }, |
| { |
| "epoch": 3.3071095571095572, |
| "grad_norm": 0.33193185925483704, |
| "learning_rate": 0.0005606448979591836, |
| "loss": 3.6185, |
| "step": 11350 |
| }, |
| { |
| "epoch": 3.3216783216783217, |
| "grad_norm": 0.30325186252593994, |
| "learning_rate": 0.000560469970845481, |
| "loss": 3.6268, |
| "step": 11400 |
| }, |
| { |
| "epoch": 3.336247086247086, |
| "grad_norm": 0.311393678188324, |
| "learning_rate": 0.0005602950437317783, |
| "loss": 3.6327, |
| "step": 11450 |
| }, |
| { |
| "epoch": 3.350815850815851, |
| "grad_norm": 0.33085450530052185, |
| "learning_rate": 0.0005601201166180758, |
| "loss": 3.6412, |
| "step": 11500 |
| }, |
| { |
| "epoch": 3.3653846153846154, |
| "grad_norm": 0.3296162486076355, |
| "learning_rate": 0.0005599451895043731, |
| "loss": 3.6398, |
| "step": 11550 |
| }, |
| { |
| "epoch": 3.37995337995338, |
| "grad_norm": 0.3392165005207062, |
| "learning_rate": 0.0005597702623906705, |
| "loss": 3.6299, |
| "step": 11600 |
| }, |
| { |
| "epoch": 3.3945221445221447, |
| "grad_norm": 0.3435153663158417, |
| "learning_rate": 0.0005595953352769679, |
| "loss": 3.6408, |
| "step": 11650 |
| }, |
| { |
| "epoch": 3.409090909090909, |
| "grad_norm": 0.3271493911743164, |
| "learning_rate": 0.0005594204081632653, |
| "loss": 3.6544, |
| "step": 11700 |
| }, |
| { |
| "epoch": 3.4236596736596736, |
| "grad_norm": 0.3151093125343323, |
| "learning_rate": 0.0005592454810495627, |
| "loss": 3.6352, |
| "step": 11750 |
| }, |
| { |
| "epoch": 3.438228438228438, |
| "grad_norm": 0.3355579674243927, |
| "learning_rate": 0.00055907055393586, |
| "loss": 3.6333, |
| "step": 11800 |
| }, |
| { |
| "epoch": 3.452797202797203, |
| "grad_norm": 0.3261067271232605, |
| "learning_rate": 0.0005588956268221573, |
| "loss": 3.6588, |
| "step": 11850 |
| }, |
| { |
| "epoch": 3.4673659673659674, |
| "grad_norm": 0.3100610673427582, |
| "learning_rate": 0.0005587206997084548, |
| "loss": 3.6338, |
| "step": 11900 |
| }, |
| { |
| "epoch": 3.481934731934732, |
| "grad_norm": 0.3188706636428833, |
| "learning_rate": 0.0005585457725947521, |
| "loss": 3.6304, |
| "step": 11950 |
| }, |
| { |
| "epoch": 3.4965034965034967, |
| "grad_norm": 0.3274799585342407, |
| "learning_rate": 0.0005583708454810495, |
| "loss": 3.6324, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.4965034965034967, |
| "eval_accuracy": 0.3537938693297391, |
| "eval_loss": 3.6916778087615967, |
| "eval_runtime": 180.0372, |
| "eval_samples_per_second": 92.436, |
| "eval_steps_per_second": 5.782, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.511072261072261, |
| "grad_norm": 0.3254401981830597, |
| "learning_rate": 0.0005581959183673468, |
| "loss": 3.636, |
| "step": 12050 |
| }, |
| { |
| "epoch": 3.5256410256410255, |
| "grad_norm": 0.3294544517993927, |
| "learning_rate": 0.0005580209912536443, |
| "loss": 3.6423, |
| "step": 12100 |
| }, |
| { |
| "epoch": 3.54020979020979, |
| "grad_norm": 0.32317084074020386, |
| "learning_rate": 0.0005578460641399417, |
| "loss": 3.644, |
| "step": 12150 |
| }, |
| { |
| "epoch": 3.554778554778555, |
| "grad_norm": 0.3209782540798187, |
| "learning_rate": 0.000557671137026239, |
| "loss": 3.6373, |
| "step": 12200 |
| }, |
| { |
| "epoch": 3.5693473193473193, |
| "grad_norm": 0.3246396780014038, |
| "learning_rate": 0.0005574962099125363, |
| "loss": 3.6479, |
| "step": 12250 |
| }, |
| { |
| "epoch": 3.583916083916084, |
| "grad_norm": 0.32658424973487854, |
| "learning_rate": 0.0005573212827988338, |
| "loss": 3.6435, |
| "step": 12300 |
| }, |
| { |
| "epoch": 3.5984848484848486, |
| "grad_norm": 0.34126392006874084, |
| "learning_rate": 0.0005571463556851311, |
| "loss": 3.6474, |
| "step": 12350 |
| }, |
| { |
| "epoch": 3.613053613053613, |
| "grad_norm": 0.31840071082115173, |
| "learning_rate": 0.0005569714285714285, |
| "loss": 3.6364, |
| "step": 12400 |
| }, |
| { |
| "epoch": 3.6276223776223775, |
| "grad_norm": 0.32670649886131287, |
| "learning_rate": 0.0005567965014577258, |
| "loss": 3.6318, |
| "step": 12450 |
| }, |
| { |
| "epoch": 3.642191142191142, |
| "grad_norm": 0.3206935524940491, |
| "learning_rate": 0.0005566215743440233, |
| "loss": 3.6369, |
| "step": 12500 |
| }, |
| { |
| "epoch": 3.656759906759907, |
| "grad_norm": 0.3333989083766937, |
| "learning_rate": 0.0005564466472303207, |
| "loss": 3.6437, |
| "step": 12550 |
| }, |
| { |
| "epoch": 3.6713286713286712, |
| "grad_norm": 0.33308637142181396, |
| "learning_rate": 0.000556271720116618, |
| "loss": 3.6476, |
| "step": 12600 |
| }, |
| { |
| "epoch": 3.685897435897436, |
| "grad_norm": 0.30366429686546326, |
| "learning_rate": 0.0005560967930029155, |
| "loss": 3.6361, |
| "step": 12650 |
| }, |
| { |
| "epoch": 3.7004662004662006, |
| "grad_norm": 0.3257286250591278, |
| "learning_rate": 0.0005559218658892128, |
| "loss": 3.6359, |
| "step": 12700 |
| }, |
| { |
| "epoch": 3.715034965034965, |
| "grad_norm": 0.3141056299209595, |
| "learning_rate": 0.0005557469387755101, |
| "loss": 3.6392, |
| "step": 12750 |
| }, |
| { |
| "epoch": 3.7296037296037294, |
| "grad_norm": 0.321039617061615, |
| "learning_rate": 0.0005555720116618075, |
| "loss": 3.6265, |
| "step": 12800 |
| }, |
| { |
| "epoch": 3.7441724941724943, |
| "grad_norm": 0.30979907512664795, |
| "learning_rate": 0.0005553970845481049, |
| "loss": 3.6325, |
| "step": 12850 |
| }, |
| { |
| "epoch": 3.7587412587412588, |
| "grad_norm": 0.31813862919807434, |
| "learning_rate": 0.0005552221574344023, |
| "loss": 3.6491, |
| "step": 12900 |
| }, |
| { |
| "epoch": 3.773310023310023, |
| "grad_norm": 0.32558414340019226, |
| "learning_rate": 0.0005550472303206997, |
| "loss": 3.6395, |
| "step": 12950 |
| }, |
| { |
| "epoch": 3.787878787878788, |
| "grad_norm": 0.3136826157569885, |
| "learning_rate": 0.000554872303206997, |
| "loss": 3.6403, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.787878787878788, |
| "eval_accuracy": 0.35539592726393576, |
| "eval_loss": 3.6737773418426514, |
| "eval_runtime": 180.0124, |
| "eval_samples_per_second": 92.449, |
| "eval_steps_per_second": 5.783, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.8024475524475525, |
| "grad_norm": 0.2937915325164795, |
| "learning_rate": 0.0005546973760932945, |
| "loss": 3.638, |
| "step": 13050 |
| }, |
| { |
| "epoch": 3.817016317016317, |
| "grad_norm": 0.3118761479854584, |
| "learning_rate": 0.0005545224489795918, |
| "loss": 3.638, |
| "step": 13100 |
| }, |
| { |
| "epoch": 3.8315850815850814, |
| "grad_norm": 0.32508912682533264, |
| "learning_rate": 0.0005543475218658891, |
| "loss": 3.6426, |
| "step": 13150 |
| }, |
| { |
| "epoch": 3.8461538461538463, |
| "grad_norm": 0.3132587969303131, |
| "learning_rate": 0.0005541725947521865, |
| "loss": 3.6273, |
| "step": 13200 |
| }, |
| { |
| "epoch": 3.8607226107226107, |
| "grad_norm": 0.32538720965385437, |
| "learning_rate": 0.0005539976676384839, |
| "loss": 3.6287, |
| "step": 13250 |
| }, |
| { |
| "epoch": 3.875291375291375, |
| "grad_norm": 0.3058031499385834, |
| "learning_rate": 0.0005538227405247813, |
| "loss": 3.6354, |
| "step": 13300 |
| }, |
| { |
| "epoch": 3.88986013986014, |
| "grad_norm": 0.3083172142505646, |
| "learning_rate": 0.0005536478134110787, |
| "loss": 3.6299, |
| "step": 13350 |
| }, |
| { |
| "epoch": 3.9044289044289044, |
| "grad_norm": 0.31901004910469055, |
| "learning_rate": 0.000553472886297376, |
| "loss": 3.6346, |
| "step": 13400 |
| }, |
| { |
| "epoch": 3.918997668997669, |
| "grad_norm": 0.30737531185150146, |
| "learning_rate": 0.0005532979591836735, |
| "loss": 3.6519, |
| "step": 13450 |
| }, |
| { |
| "epoch": 3.9335664335664333, |
| "grad_norm": 0.3150396943092346, |
| "learning_rate": 0.0005531230320699708, |
| "loss": 3.6308, |
| "step": 13500 |
| }, |
| { |
| "epoch": 3.948135198135198, |
| "grad_norm": 0.30913490056991577, |
| "learning_rate": 0.0005529481049562682, |
| "loss": 3.6363, |
| "step": 13550 |
| }, |
| { |
| "epoch": 3.9627039627039626, |
| "grad_norm": 0.32307544350624084, |
| "learning_rate": 0.0005527731778425655, |
| "loss": 3.628, |
| "step": 13600 |
| }, |
| { |
| "epoch": 3.9772727272727275, |
| "grad_norm": 0.32591161131858826, |
| "learning_rate": 0.0005525982507288629, |
| "loss": 3.6293, |
| "step": 13650 |
| }, |
| { |
| "epoch": 3.991841491841492, |
| "grad_norm": 0.35084959864616394, |
| "learning_rate": 0.0005524233236151603, |
| "loss": 3.6342, |
| "step": 13700 |
| }, |
| { |
| "epoch": 4.006410256410256, |
| "grad_norm": 0.3362487554550171, |
| "learning_rate": 0.0005522483965014576, |
| "loss": 3.577, |
| "step": 13750 |
| }, |
| { |
| "epoch": 4.020979020979021, |
| "grad_norm": 0.33940762281417847, |
| "learning_rate": 0.000552073469387755, |
| "loss": 3.5236, |
| "step": 13800 |
| }, |
| { |
| "epoch": 4.035547785547785, |
| "grad_norm": 0.3365685045719147, |
| "learning_rate": 0.0005518985422740525, |
| "loss": 3.5171, |
| "step": 13850 |
| }, |
| { |
| "epoch": 4.05011655011655, |
| "grad_norm": 0.32239189743995667, |
| "learning_rate": 0.0005517236151603498, |
| "loss": 3.5349, |
| "step": 13900 |
| }, |
| { |
| "epoch": 4.064685314685315, |
| "grad_norm": 0.3359779715538025, |
| "learning_rate": 0.0005515486880466472, |
| "loss": 3.5388, |
| "step": 13950 |
| }, |
| { |
| "epoch": 4.0792540792540795, |
| "grad_norm": 0.3400496542453766, |
| "learning_rate": 0.0005513737609329446, |
| "loss": 3.5471, |
| "step": 14000 |
| }, |
| { |
| "epoch": 4.0792540792540795, |
| "eval_accuracy": 0.3563893348849056, |
| "eval_loss": 3.6670174598693848, |
| "eval_runtime": 180.0, |
| "eval_samples_per_second": 92.456, |
| "eval_steps_per_second": 5.783, |
| "step": 14000 |
| }, |
| { |
| "epoch": 4.093822843822844, |
| "grad_norm": 0.3247426450252533, |
| "learning_rate": 0.0005511988338192419, |
| "loss": 3.5358, |
| "step": 14050 |
| }, |
| { |
| "epoch": 4.108391608391608, |
| "grad_norm": 0.32067304849624634, |
| "learning_rate": 0.0005510239067055393, |
| "loss": 3.5265, |
| "step": 14100 |
| }, |
| { |
| "epoch": 4.122960372960373, |
| "grad_norm": 0.31784480810165405, |
| "learning_rate": 0.0005508489795918366, |
| "loss": 3.5411, |
| "step": 14150 |
| }, |
| { |
| "epoch": 4.137529137529137, |
| "grad_norm": 0.30831626057624817, |
| "learning_rate": 0.0005506740524781341, |
| "loss": 3.5379, |
| "step": 14200 |
| }, |
| { |
| "epoch": 4.1520979020979025, |
| "grad_norm": 0.3176884651184082, |
| "learning_rate": 0.0005504991253644315, |
| "loss": 3.5452, |
| "step": 14250 |
| }, |
| { |
| "epoch": 4.166666666666667, |
| "grad_norm": 0.34430864453315735, |
| "learning_rate": 0.0005503241982507288, |
| "loss": 3.5519, |
| "step": 14300 |
| }, |
| { |
| "epoch": 4.181235431235431, |
| "grad_norm": 0.3338887691497803, |
| "learning_rate": 0.0005501492711370262, |
| "loss": 3.5369, |
| "step": 14350 |
| }, |
| { |
| "epoch": 4.195804195804196, |
| "grad_norm": 0.3119693100452423, |
| "learning_rate": 0.0005499743440233236, |
| "loss": 3.5491, |
| "step": 14400 |
| }, |
| { |
| "epoch": 4.21037296037296, |
| "grad_norm": 0.3248231112957001, |
| "learning_rate": 0.000549799416909621, |
| "loss": 3.5541, |
| "step": 14450 |
| }, |
| { |
| "epoch": 4.224941724941725, |
| "grad_norm": 0.32441389560699463, |
| "learning_rate": 0.0005496244897959183, |
| "loss": 3.5611, |
| "step": 14500 |
| }, |
| { |
| "epoch": 4.239510489510489, |
| "grad_norm": 0.3318880498409271, |
| "learning_rate": 0.0005494495626822156, |
| "loss": 3.5484, |
| "step": 14550 |
| }, |
| { |
| "epoch": 4.2540792540792545, |
| "grad_norm": 0.3246316611766815, |
| "learning_rate": 0.0005492746355685131, |
| "loss": 3.5641, |
| "step": 14600 |
| }, |
| { |
| "epoch": 4.268648018648019, |
| "grad_norm": 0.32765793800354004, |
| "learning_rate": 0.0005490997084548105, |
| "loss": 3.5597, |
| "step": 14650 |
| }, |
| { |
| "epoch": 4.283216783216783, |
| "grad_norm": 0.3181726336479187, |
| "learning_rate": 0.0005489247813411078, |
| "loss": 3.5587, |
| "step": 14700 |
| }, |
| { |
| "epoch": 4.297785547785548, |
| "grad_norm": 0.3304060697555542, |
| "learning_rate": 0.0005487498542274052, |
| "loss": 3.5593, |
| "step": 14750 |
| }, |
| { |
| "epoch": 4.312354312354312, |
| "grad_norm": 0.3318939507007599, |
| "learning_rate": 0.0005485749271137026, |
| "loss": 3.5622, |
| "step": 14800 |
| }, |
| { |
| "epoch": 4.326923076923077, |
| "grad_norm": 0.31229400634765625, |
| "learning_rate": 0.0005484, |
| "loss": 3.555, |
| "step": 14850 |
| }, |
| { |
| "epoch": 4.341491841491841, |
| "grad_norm": 0.3215874433517456, |
| "learning_rate": 0.0005482250728862973, |
| "loss": 3.5703, |
| "step": 14900 |
| }, |
| { |
| "epoch": 4.356060606060606, |
| "grad_norm": 0.3406248092651367, |
| "learning_rate": 0.0005480501457725946, |
| "loss": 3.5592, |
| "step": 14950 |
| }, |
| { |
| "epoch": 4.370629370629371, |
| "grad_norm": 0.3115122318267822, |
| "learning_rate": 0.0005478752186588921, |
| "loss": 3.5627, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.370629370629371, |
| "eval_accuracy": 0.3577426881412671, |
| "eval_loss": 3.655911922454834, |
| "eval_runtime": 180.1218, |
| "eval_samples_per_second": 92.393, |
| "eval_steps_per_second": 5.779, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.385198135198135, |
| "grad_norm": 0.32183435559272766, |
| "learning_rate": 0.0005477002915451894, |
| "loss": 3.5637, |
| "step": 15050 |
| }, |
| { |
| "epoch": 4.3997668997669, |
| "grad_norm": 0.3230733573436737, |
| "learning_rate": 0.0005475253644314868, |
| "loss": 3.5635, |
| "step": 15100 |
| }, |
| { |
| "epoch": 4.414335664335664, |
| "grad_norm": 0.3347214162349701, |
| "learning_rate": 0.0005473504373177842, |
| "loss": 3.5735, |
| "step": 15150 |
| }, |
| { |
| "epoch": 4.428904428904429, |
| "grad_norm": 0.3242194950580597, |
| "learning_rate": 0.0005471755102040816, |
| "loss": 3.5625, |
| "step": 15200 |
| }, |
| { |
| "epoch": 4.443473193473194, |
| "grad_norm": 0.31296148896217346, |
| "learning_rate": 0.000547000583090379, |
| "loss": 3.5629, |
| "step": 15250 |
| }, |
| { |
| "epoch": 4.458041958041958, |
| "grad_norm": 0.3409155607223511, |
| "learning_rate": 0.0005468256559766763, |
| "loss": 3.5572, |
| "step": 15300 |
| }, |
| { |
| "epoch": 4.472610722610723, |
| "grad_norm": 0.3338817059993744, |
| "learning_rate": 0.0005466507288629738, |
| "loss": 3.5601, |
| "step": 15350 |
| }, |
| { |
| "epoch": 4.487179487179487, |
| "grad_norm": 0.32212719321250916, |
| "learning_rate": 0.0005464758017492711, |
| "loss": 3.5644, |
| "step": 15400 |
| }, |
| { |
| "epoch": 4.501748251748252, |
| "grad_norm": 0.32063987851142883, |
| "learning_rate": 0.0005463008746355684, |
| "loss": 3.5586, |
| "step": 15450 |
| }, |
| { |
| "epoch": 4.516317016317016, |
| "grad_norm": 0.3092406392097473, |
| "learning_rate": 0.0005461259475218658, |
| "loss": 3.5675, |
| "step": 15500 |
| }, |
| { |
| "epoch": 4.5308857808857805, |
| "grad_norm": 0.31964099407196045, |
| "learning_rate": 0.0005459510204081633, |
| "loss": 3.5691, |
| "step": 15550 |
| }, |
| { |
| "epoch": 4.545454545454545, |
| "grad_norm": 0.31656357645988464, |
| "learning_rate": 0.0005457760932944606, |
| "loss": 3.5724, |
| "step": 15600 |
| }, |
| { |
| "epoch": 4.56002331002331, |
| "grad_norm": 0.3061751425266266, |
| "learning_rate": 0.000545601166180758, |
| "loss": 3.5734, |
| "step": 15650 |
| }, |
| { |
| "epoch": 4.574592074592075, |
| "grad_norm": 0.3436771333217621, |
| "learning_rate": 0.0005454262390670553, |
| "loss": 3.5662, |
| "step": 15700 |
| }, |
| { |
| "epoch": 4.589160839160839, |
| "grad_norm": 0.33170390129089355, |
| "learning_rate": 0.0005452513119533528, |
| "loss": 3.5748, |
| "step": 15750 |
| }, |
| { |
| "epoch": 4.603729603729604, |
| "grad_norm": 0.3121337294578552, |
| "learning_rate": 0.0005450763848396501, |
| "loss": 3.5799, |
| "step": 15800 |
| }, |
| { |
| "epoch": 4.618298368298368, |
| "grad_norm": 0.3196841776371002, |
| "learning_rate": 0.0005449014577259474, |
| "loss": 3.5751, |
| "step": 15850 |
| }, |
| { |
| "epoch": 4.632867132867133, |
| "grad_norm": 0.32077154517173767, |
| "learning_rate": 0.0005447265306122448, |
| "loss": 3.566, |
| "step": 15900 |
| }, |
| { |
| "epoch": 4.647435897435898, |
| "grad_norm": 0.34822878241539, |
| "learning_rate": 0.0005445516034985423, |
| "loss": 3.5787, |
| "step": 15950 |
| }, |
| { |
| "epoch": 4.662004662004662, |
| "grad_norm": 0.35842180252075195, |
| "learning_rate": 0.0005443766763848396, |
| "loss": 3.5772, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.662004662004662, |
| "eval_accuracy": 0.35919034927073673, |
| "eval_loss": 3.6416175365448, |
| "eval_runtime": 180.1041, |
| "eval_samples_per_second": 92.402, |
| "eval_steps_per_second": 5.78, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.676573426573427, |
| "grad_norm": 0.3213863670825958, |
| "learning_rate": 0.000544201749271137, |
| "loss": 3.5688, |
| "step": 16050 |
| }, |
| { |
| "epoch": 4.691142191142191, |
| "grad_norm": 0.3245983421802521, |
| "learning_rate": 0.0005440268221574343, |
| "loss": 3.5757, |
| "step": 16100 |
| }, |
| { |
| "epoch": 4.7057109557109555, |
| "grad_norm": 0.3060520887374878, |
| "learning_rate": 0.0005438518950437318, |
| "loss": 3.5767, |
| "step": 16150 |
| }, |
| { |
| "epoch": 4.72027972027972, |
| "grad_norm": 0.3220885097980499, |
| "learning_rate": 0.0005436769679300291, |
| "loss": 3.5617, |
| "step": 16200 |
| }, |
| { |
| "epoch": 4.734848484848484, |
| "grad_norm": 0.30982065200805664, |
| "learning_rate": 0.0005435020408163265, |
| "loss": 3.5732, |
| "step": 16250 |
| }, |
| { |
| "epoch": 4.74941724941725, |
| "grad_norm": 0.31846100091934204, |
| "learning_rate": 0.0005433271137026238, |
| "loss": 3.5751, |
| "step": 16300 |
| }, |
| { |
| "epoch": 4.763986013986014, |
| "grad_norm": 0.3250206410884857, |
| "learning_rate": 0.0005431521865889212, |
| "loss": 3.5711, |
| "step": 16350 |
| }, |
| { |
| "epoch": 4.778554778554779, |
| "grad_norm": 0.3231157064437866, |
| "learning_rate": 0.0005429772594752186, |
| "loss": 3.5673, |
| "step": 16400 |
| }, |
| { |
| "epoch": 4.793123543123543, |
| "grad_norm": 0.34479278326034546, |
| "learning_rate": 0.000542802332361516, |
| "loss": 3.5883, |
| "step": 16450 |
| }, |
| { |
| "epoch": 4.8076923076923075, |
| "grad_norm": 0.2996126413345337, |
| "learning_rate": 0.0005426274052478133, |
| "loss": 3.5754, |
| "step": 16500 |
| }, |
| { |
| "epoch": 4.822261072261072, |
| "grad_norm": 0.3211113214492798, |
| "learning_rate": 0.0005424524781341108, |
| "loss": 3.5794, |
| "step": 16550 |
| }, |
| { |
| "epoch": 4.836829836829837, |
| "grad_norm": 0.33680838346481323, |
| "learning_rate": 0.0005422775510204081, |
| "loss": 3.567, |
| "step": 16600 |
| }, |
| { |
| "epoch": 4.851398601398602, |
| "grad_norm": 0.2969048321247101, |
| "learning_rate": 0.0005421026239067055, |
| "loss": 3.5801, |
| "step": 16650 |
| }, |
| { |
| "epoch": 4.865967365967366, |
| "grad_norm": 0.31932759284973145, |
| "learning_rate": 0.0005419276967930028, |
| "loss": 3.5793, |
| "step": 16700 |
| }, |
| { |
| "epoch": 4.880536130536131, |
| "grad_norm": 0.2942551076412201, |
| "learning_rate": 0.0005417527696793002, |
| "loss": 3.5725, |
| "step": 16750 |
| }, |
| { |
| "epoch": 4.895104895104895, |
| "grad_norm": 0.30885228514671326, |
| "learning_rate": 0.0005415778425655976, |
| "loss": 3.5668, |
| "step": 16800 |
| }, |
| { |
| "epoch": 4.909673659673659, |
| "grad_norm": 0.29992324113845825, |
| "learning_rate": 0.000541402915451895, |
| "loss": 3.5678, |
| "step": 16850 |
| }, |
| { |
| "epoch": 4.924242424242424, |
| "grad_norm": 0.31865811347961426, |
| "learning_rate": 0.0005412279883381923, |
| "loss": 3.5728, |
| "step": 16900 |
| }, |
| { |
| "epoch": 4.938811188811189, |
| "grad_norm": 0.3378613293170929, |
| "learning_rate": 0.0005410530612244898, |
| "loss": 3.5738, |
| "step": 16950 |
| }, |
| { |
| "epoch": 4.953379953379954, |
| "grad_norm": 0.3021279275417328, |
| "learning_rate": 0.0005408781341107871, |
| "loss": 3.5756, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.953379953379954, |
| "eval_accuracy": 0.36047597018930483, |
| "eval_loss": 3.6271886825561523, |
| "eval_runtime": 180.0908, |
| "eval_samples_per_second": 92.409, |
| "eval_steps_per_second": 5.78, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.967948717948718, |
| "grad_norm": 0.3146043121814728, |
| "learning_rate": 0.0005407032069970845, |
| "loss": 3.5633, |
| "step": 17050 |
| }, |
| { |
| "epoch": 4.9825174825174825, |
| "grad_norm": 0.3014509677886963, |
| "learning_rate": 0.0005405282798833819, |
| "loss": 3.5643, |
| "step": 17100 |
| }, |
| { |
| "epoch": 4.997086247086247, |
| "grad_norm": 0.3128616213798523, |
| "learning_rate": 0.0005403533527696793, |
| "loss": 3.572, |
| "step": 17150 |
| }, |
| { |
| "epoch": 5.011655011655011, |
| "grad_norm": 0.34456515312194824, |
| "learning_rate": 0.0005401784256559766, |
| "loss": 3.4792, |
| "step": 17200 |
| }, |
| { |
| "epoch": 5.026223776223776, |
| "grad_norm": 0.32703524827957153, |
| "learning_rate": 0.000540003498542274, |
| "loss": 3.4653, |
| "step": 17250 |
| }, |
| { |
| "epoch": 5.040792540792541, |
| "grad_norm": 0.32900843024253845, |
| "learning_rate": 0.0005398285714285714, |
| "loss": 3.463, |
| "step": 17300 |
| }, |
| { |
| "epoch": 5.055361305361306, |
| "grad_norm": 0.31689518690109253, |
| "learning_rate": 0.0005396536443148688, |
| "loss": 3.4594, |
| "step": 17350 |
| }, |
| { |
| "epoch": 5.06993006993007, |
| "grad_norm": 0.3287120461463928, |
| "learning_rate": 0.0005394787172011661, |
| "loss": 3.4681, |
| "step": 17400 |
| }, |
| { |
| "epoch": 5.084498834498834, |
| "grad_norm": 0.3206745684146881, |
| "learning_rate": 0.0005393037900874635, |
| "loss": 3.4808, |
| "step": 17450 |
| }, |
| { |
| "epoch": 5.099067599067599, |
| "grad_norm": 0.3196023404598236, |
| "learning_rate": 0.0005391288629737609, |
| "loss": 3.4808, |
| "step": 17500 |
| }, |
| { |
| "epoch": 5.113636363636363, |
| "grad_norm": 0.33133959770202637, |
| "learning_rate": 0.0005389539358600583, |
| "loss": 3.4862, |
| "step": 17550 |
| }, |
| { |
| "epoch": 5.128205128205128, |
| "grad_norm": 0.3225027024745941, |
| "learning_rate": 0.0005387790087463557, |
| "loss": 3.4919, |
| "step": 17600 |
| }, |
| { |
| "epoch": 5.142773892773893, |
| "grad_norm": 0.3661787211894989, |
| "learning_rate": 0.0005386040816326529, |
| "loss": 3.4776, |
| "step": 17650 |
| }, |
| { |
| "epoch": 5.1573426573426575, |
| "grad_norm": 0.31561416387557983, |
| "learning_rate": 0.0005384291545189504, |
| "loss": 3.4934, |
| "step": 17700 |
| }, |
| { |
| "epoch": 5.171911421911422, |
| "grad_norm": 0.32750192284584045, |
| "learning_rate": 0.0005382542274052478, |
| "loss": 3.4987, |
| "step": 17750 |
| }, |
| { |
| "epoch": 5.186480186480186, |
| "grad_norm": 0.36426979303359985, |
| "learning_rate": 0.0005380793002915451, |
| "loss": 3.4982, |
| "step": 17800 |
| }, |
| { |
| "epoch": 5.201048951048951, |
| "grad_norm": 0.32267603278160095, |
| "learning_rate": 0.0005379043731778425, |
| "loss": 3.5052, |
| "step": 17850 |
| }, |
| { |
| "epoch": 5.215617715617715, |
| "grad_norm": 0.32066959142684937, |
| "learning_rate": 0.0005377294460641399, |
| "loss": 3.4943, |
| "step": 17900 |
| }, |
| { |
| "epoch": 5.230186480186481, |
| "grad_norm": 0.3105967044830322, |
| "learning_rate": 0.0005375545189504373, |
| "loss": 3.5037, |
| "step": 17950 |
| }, |
| { |
| "epoch": 5.244755244755245, |
| "grad_norm": 0.3213440179824829, |
| "learning_rate": 0.0005373795918367346, |
| "loss": 3.4948, |
| "step": 18000 |
| }, |
| { |
| "epoch": 5.244755244755245, |
| "eval_accuracy": 0.36040694435200493, |
| "eval_loss": 3.630403995513916, |
| "eval_runtime": 180.0023, |
| "eval_samples_per_second": 92.454, |
| "eval_steps_per_second": 5.783, |
| "step": 18000 |
| }, |
| { |
| "epoch": 5.2593240093240095, |
| "grad_norm": 0.33134472370147705, |
| "learning_rate": 0.000537204664723032, |
| "loss": 3.5058, |
| "step": 18050 |
| }, |
| { |
| "epoch": 5.273892773892774, |
| "grad_norm": 0.32371455430984497, |
| "learning_rate": 0.0005370297376093294, |
| "loss": 3.4951, |
| "step": 18100 |
| }, |
| { |
| "epoch": 5.288461538461538, |
| "grad_norm": 0.3160519301891327, |
| "learning_rate": 0.0005368548104956268, |
| "loss": 3.5056, |
| "step": 18150 |
| }, |
| { |
| "epoch": 5.303030303030303, |
| "grad_norm": 0.32032284140586853, |
| "learning_rate": 0.0005366798833819241, |
| "loss": 3.4996, |
| "step": 18200 |
| }, |
| { |
| "epoch": 5.317599067599067, |
| "grad_norm": 0.344394326210022, |
| "learning_rate": 0.0005365049562682215, |
| "loss": 3.5106, |
| "step": 18250 |
| }, |
| { |
| "epoch": 5.3321678321678325, |
| "grad_norm": 0.3405134379863739, |
| "learning_rate": 0.0005363300291545189, |
| "loss": 3.5028, |
| "step": 18300 |
| }, |
| { |
| "epoch": 5.346736596736597, |
| "grad_norm": 0.32548317313194275, |
| "learning_rate": 0.0005361551020408163, |
| "loss": 3.5205, |
| "step": 18350 |
| }, |
| { |
| "epoch": 5.361305361305361, |
| "grad_norm": 0.3317105174064636, |
| "learning_rate": 0.0005359801749271136, |
| "loss": 3.4986, |
| "step": 18400 |
| }, |
| { |
| "epoch": 5.375874125874126, |
| "grad_norm": 0.3342733383178711, |
| "learning_rate": 0.000535805247813411, |
| "loss": 3.5151, |
| "step": 18450 |
| }, |
| { |
| "epoch": 5.39044289044289, |
| "grad_norm": 0.33692413568496704, |
| "learning_rate": 0.0005356303206997085, |
| "loss": 3.5003, |
| "step": 18500 |
| }, |
| { |
| "epoch": 5.405011655011655, |
| "grad_norm": 0.31811004877090454, |
| "learning_rate": 0.0005354553935860058, |
| "loss": 3.5257, |
| "step": 18550 |
| }, |
| { |
| "epoch": 5.41958041958042, |
| "grad_norm": 0.3129310607910156, |
| "learning_rate": 0.0005352804664723031, |
| "loss": 3.5172, |
| "step": 18600 |
| }, |
| { |
| "epoch": 5.4341491841491845, |
| "grad_norm": 0.30926746129989624, |
| "learning_rate": 0.0005351055393586006, |
| "loss": 3.5206, |
| "step": 18650 |
| }, |
| { |
| "epoch": 5.448717948717949, |
| "grad_norm": 0.3198590576648712, |
| "learning_rate": 0.0005349306122448979, |
| "loss": 3.5003, |
| "step": 18700 |
| }, |
| { |
| "epoch": 5.463286713286713, |
| "grad_norm": 0.30657318234443665, |
| "learning_rate": 0.0005347556851311953, |
| "loss": 3.5242, |
| "step": 18750 |
| }, |
| { |
| "epoch": 5.477855477855478, |
| "grad_norm": 0.3101734519004822, |
| "learning_rate": 0.0005345807580174926, |
| "loss": 3.5127, |
| "step": 18800 |
| }, |
| { |
| "epoch": 5.492424242424242, |
| "grad_norm": 0.3366558253765106, |
| "learning_rate": 0.0005344058309037901, |
| "loss": 3.5179, |
| "step": 18850 |
| }, |
| { |
| "epoch": 5.506993006993007, |
| "grad_norm": 0.3367293179035187, |
| "learning_rate": 0.0005342309037900875, |
| "loss": 3.5164, |
| "step": 18900 |
| }, |
| { |
| "epoch": 5.521561771561771, |
| "grad_norm": 0.3203129172325134, |
| "learning_rate": 0.0005340559766763848, |
| "loss": 3.5129, |
| "step": 18950 |
| }, |
| { |
| "epoch": 5.536130536130536, |
| "grad_norm": 0.3326050341129303, |
| "learning_rate": 0.0005338810495626821, |
| "loss": 3.5348, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.536130536130536, |
| "eval_accuracy": 0.36132097813962316, |
| "eval_loss": 3.622004508972168, |
| "eval_runtime": 180.0706, |
| "eval_samples_per_second": 92.419, |
| "eval_steps_per_second": 5.781, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.550699300699301, |
| "grad_norm": 0.3246619701385498, |
| "learning_rate": 0.0005337061224489796, |
| "loss": 3.5178, |
| "step": 19050 |
| }, |
| { |
| "epoch": 5.565268065268065, |
| "grad_norm": 0.3267384171485901, |
| "learning_rate": 0.0005335311953352769, |
| "loss": 3.5183, |
| "step": 19100 |
| }, |
| { |
| "epoch": 5.57983682983683, |
| "grad_norm": 0.3426792323589325, |
| "learning_rate": 0.0005333562682215743, |
| "loss": 3.5213, |
| "step": 19150 |
| }, |
| { |
| "epoch": 5.594405594405594, |
| "grad_norm": 0.3265758454799652, |
| "learning_rate": 0.0005331813411078716, |
| "loss": 3.5194, |
| "step": 19200 |
| }, |
| { |
| "epoch": 5.608974358974359, |
| "grad_norm": 0.32385995984077454, |
| "learning_rate": 0.0005330064139941691, |
| "loss": 3.5222, |
| "step": 19250 |
| }, |
| { |
| "epoch": 5.623543123543124, |
| "grad_norm": 0.3385801911354065, |
| "learning_rate": 0.0005328314868804665, |
| "loss": 3.5114, |
| "step": 19300 |
| }, |
| { |
| "epoch": 5.638111888111888, |
| "grad_norm": 0.30696603655815125, |
| "learning_rate": 0.0005326565597667638, |
| "loss": 3.5162, |
| "step": 19350 |
| }, |
| { |
| "epoch": 5.652680652680653, |
| "grad_norm": 0.3337629735469818, |
| "learning_rate": 0.0005324816326530612, |
| "loss": 3.5178, |
| "step": 19400 |
| }, |
| { |
| "epoch": 5.667249417249417, |
| "grad_norm": 0.3224729895591736, |
| "learning_rate": 0.0005323067055393586, |
| "loss": 3.5234, |
| "step": 19450 |
| }, |
| { |
| "epoch": 5.681818181818182, |
| "grad_norm": 0.3160954713821411, |
| "learning_rate": 0.0005321317784256559, |
| "loss": 3.5084, |
| "step": 19500 |
| }, |
| { |
| "epoch": 5.696386946386946, |
| "grad_norm": 0.3182397186756134, |
| "learning_rate": 0.0005319568513119533, |
| "loss": 3.5209, |
| "step": 19550 |
| }, |
| { |
| "epoch": 5.7109557109557105, |
| "grad_norm": 0.3317508399486542, |
| "learning_rate": 0.0005317819241982506, |
| "loss": 3.5349, |
| "step": 19600 |
| }, |
| { |
| "epoch": 5.725524475524476, |
| "grad_norm": 0.3387415409088135, |
| "learning_rate": 0.0005316069970845481, |
| "loss": 3.5275, |
| "step": 19650 |
| }, |
| { |
| "epoch": 5.74009324009324, |
| "grad_norm": 0.33118101954460144, |
| "learning_rate": 0.0005314320699708454, |
| "loss": 3.5235, |
| "step": 19700 |
| }, |
| { |
| "epoch": 5.754662004662005, |
| "grad_norm": 0.32198289036750793, |
| "learning_rate": 0.0005312571428571428, |
| "loss": 3.5296, |
| "step": 19750 |
| }, |
| { |
| "epoch": 5.769230769230769, |
| "grad_norm": 0.3228715658187866, |
| "learning_rate": 0.0005310822157434403, |
| "loss": 3.5169, |
| "step": 19800 |
| }, |
| { |
| "epoch": 5.783799533799534, |
| "grad_norm": 0.3381294012069702, |
| "learning_rate": 0.0005309072886297376, |
| "loss": 3.5367, |
| "step": 19850 |
| }, |
| { |
| "epoch": 5.798368298368298, |
| "grad_norm": 0.3143836557865143, |
| "learning_rate": 0.0005307323615160349, |
| "loss": 3.5199, |
| "step": 19900 |
| }, |
| { |
| "epoch": 5.812937062937063, |
| "grad_norm": 0.32376548647880554, |
| "learning_rate": 0.0005305574344023323, |
| "loss": 3.5133, |
| "step": 19950 |
| }, |
| { |
| "epoch": 5.827505827505828, |
| "grad_norm": 0.31885069608688354, |
| "learning_rate": 0.0005303825072886296, |
| "loss": 3.5111, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.827505827505828, |
| "eval_accuracy": 0.36265845662931434, |
| "eval_loss": 3.611976146697998, |
| "eval_runtime": 180.1718, |
| "eval_samples_per_second": 92.367, |
| "eval_steps_per_second": 5.778, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.842074592074592, |
| "grad_norm": 0.3121870458126068, |
| "learning_rate": 0.0005302075801749271, |
| "loss": 3.524, |
| "step": 20050 |
| }, |
| { |
| "epoch": 5.856643356643357, |
| "grad_norm": 0.3262169063091278, |
| "learning_rate": 0.0005300326530612244, |
| "loss": 3.5309, |
| "step": 20100 |
| }, |
| { |
| "epoch": 5.871212121212121, |
| "grad_norm": 0.3347600996494293, |
| "learning_rate": 0.0005298577259475218, |
| "loss": 3.5402, |
| "step": 20150 |
| }, |
| { |
| "epoch": 5.8857808857808855, |
| "grad_norm": 0.31512966752052307, |
| "learning_rate": 0.0005296827988338193, |
| "loss": 3.5253, |
| "step": 20200 |
| }, |
| { |
| "epoch": 5.90034965034965, |
| "grad_norm": 0.3156508505344391, |
| "learning_rate": 0.0005295078717201166, |
| "loss": 3.5208, |
| "step": 20250 |
| }, |
| { |
| "epoch": 5.914918414918415, |
| "grad_norm": 0.31857651472091675, |
| "learning_rate": 0.000529332944606414, |
| "loss": 3.5183, |
| "step": 20300 |
| }, |
| { |
| "epoch": 5.92948717948718, |
| "grad_norm": 0.32981452345848083, |
| "learning_rate": 0.0005291580174927113, |
| "loss": 3.5222, |
| "step": 20350 |
| }, |
| { |
| "epoch": 5.944055944055944, |
| "grad_norm": 0.3362169563770294, |
| "learning_rate": 0.0005289830903790087, |
| "loss": 3.5272, |
| "step": 20400 |
| }, |
| { |
| "epoch": 5.958624708624709, |
| "grad_norm": 0.32992255687713623, |
| "learning_rate": 0.0005288081632653061, |
| "loss": 3.5311, |
| "step": 20450 |
| }, |
| { |
| "epoch": 5.973193473193473, |
| "grad_norm": 0.31938815116882324, |
| "learning_rate": 0.0005286332361516034, |
| "loss": 3.5149, |
| "step": 20500 |
| }, |
| { |
| "epoch": 5.9877622377622375, |
| "grad_norm": 0.33160415291786194, |
| "learning_rate": 0.0005284583090379008, |
| "loss": 3.5177, |
| "step": 20550 |
| }, |
| { |
| "epoch": 6.002331002331002, |
| "grad_norm": 0.3441227674484253, |
| "learning_rate": 0.0005282833819241983, |
| "loss": 3.51, |
| "step": 20600 |
| }, |
| { |
| "epoch": 6.016899766899767, |
| "grad_norm": 0.32598426938056946, |
| "learning_rate": 0.0005281084548104956, |
| "loss": 3.414, |
| "step": 20650 |
| }, |
| { |
| "epoch": 6.031468531468532, |
| "grad_norm": 0.31953006982803345, |
| "learning_rate": 0.000527933527696793, |
| "loss": 3.4213, |
| "step": 20700 |
| }, |
| { |
| "epoch": 6.046037296037296, |
| "grad_norm": 0.3276713192462921, |
| "learning_rate": 0.0005277586005830903, |
| "loss": 3.4137, |
| "step": 20750 |
| }, |
| { |
| "epoch": 6.0606060606060606, |
| "grad_norm": 0.338102251291275, |
| "learning_rate": 0.0005275836734693877, |
| "loss": 3.4243, |
| "step": 20800 |
| }, |
| { |
| "epoch": 6.075174825174825, |
| "grad_norm": 0.328328013420105, |
| "learning_rate": 0.0005274087463556851, |
| "loss": 3.4284, |
| "step": 20850 |
| }, |
| { |
| "epoch": 6.089743589743589, |
| "grad_norm": 0.33343490958213806, |
| "learning_rate": 0.0005272338192419824, |
| "loss": 3.4311, |
| "step": 20900 |
| }, |
| { |
| "epoch": 6.104312354312355, |
| "grad_norm": 0.3318384289741516, |
| "learning_rate": 0.0005270588921282798, |
| "loss": 3.4394, |
| "step": 20950 |
| }, |
| { |
| "epoch": 6.118881118881119, |
| "grad_norm": 0.34948527812957764, |
| "learning_rate": 0.0005268839650145772, |
| "loss": 3.4294, |
| "step": 21000 |
| }, |
| { |
| "epoch": 6.118881118881119, |
| "eval_accuracy": 0.36256932275423204, |
| "eval_loss": 3.613757371902466, |
| "eval_runtime": 180.2204, |
| "eval_samples_per_second": 92.342, |
| "eval_steps_per_second": 5.776, |
| "step": 21000 |
| }, |
| { |
| "epoch": 6.133449883449884, |
| "grad_norm": 0.3257012665271759, |
| "learning_rate": 0.0005267090379008746, |
| "loss": 3.4437, |
| "step": 21050 |
| }, |
| { |
| "epoch": 6.148018648018648, |
| "grad_norm": 0.3224816620349884, |
| "learning_rate": 0.000526534110787172, |
| "loss": 3.4454, |
| "step": 21100 |
| }, |
| { |
| "epoch": 6.1625874125874125, |
| "grad_norm": 0.3164311349391937, |
| "learning_rate": 0.0005263591836734693, |
| "loss": 3.4504, |
| "step": 21150 |
| }, |
| { |
| "epoch": 6.177156177156177, |
| "grad_norm": 0.3344584107398987, |
| "learning_rate": 0.0005261842565597668, |
| "loss": 3.4456, |
| "step": 21200 |
| }, |
| { |
| "epoch": 6.191724941724941, |
| "grad_norm": 0.32673484086990356, |
| "learning_rate": 0.0005260093294460641, |
| "loss": 3.4504, |
| "step": 21250 |
| }, |
| { |
| "epoch": 6.206293706293707, |
| "grad_norm": 0.33216020464897156, |
| "learning_rate": 0.0005258344023323614, |
| "loss": 3.4525, |
| "step": 21300 |
| }, |
| { |
| "epoch": 6.220862470862471, |
| "grad_norm": 0.3255676031112671, |
| "learning_rate": 0.0005256594752186588, |
| "loss": 3.4533, |
| "step": 21350 |
| }, |
| { |
| "epoch": 6.235431235431236, |
| "grad_norm": 0.3329181373119354, |
| "learning_rate": 0.0005254845481049562, |
| "loss": 3.456, |
| "step": 21400 |
| }, |
| { |
| "epoch": 6.25, |
| "grad_norm": 0.3463157117366791, |
| "learning_rate": 0.0005253096209912536, |
| "loss": 3.463, |
| "step": 21450 |
| }, |
| { |
| "epoch": 6.264568764568764, |
| "grad_norm": 0.3387112319469452, |
| "learning_rate": 0.000525134693877551, |
| "loss": 3.4693, |
| "step": 21500 |
| }, |
| { |
| "epoch": 6.279137529137529, |
| "grad_norm": 0.35688215494155884, |
| "learning_rate": 0.0005249597667638484, |
| "loss": 3.4673, |
| "step": 21550 |
| }, |
| { |
| "epoch": 6.293706293706293, |
| "grad_norm": 0.32070285081863403, |
| "learning_rate": 0.0005247848396501458, |
| "loss": 3.4506, |
| "step": 21600 |
| }, |
| { |
| "epoch": 6.308275058275059, |
| "grad_norm": 0.3204960525035858, |
| "learning_rate": 0.0005246099125364431, |
| "loss": 3.4585, |
| "step": 21650 |
| }, |
| { |
| "epoch": 6.322843822843823, |
| "grad_norm": 0.33638399839401245, |
| "learning_rate": 0.0005244349854227404, |
| "loss": 3.4617, |
| "step": 21700 |
| }, |
| { |
| "epoch": 6.3374125874125875, |
| "grad_norm": 0.3176703155040741, |
| "learning_rate": 0.0005242600583090379, |
| "loss": 3.4561, |
| "step": 21750 |
| }, |
| { |
| "epoch": 6.351981351981352, |
| "grad_norm": 0.3235324025154114, |
| "learning_rate": 0.0005240851311953352, |
| "loss": 3.4705, |
| "step": 21800 |
| }, |
| { |
| "epoch": 6.366550116550116, |
| "grad_norm": 0.34393689036369324, |
| "learning_rate": 0.0005239102040816326, |
| "loss": 3.4627, |
| "step": 21850 |
| }, |
| { |
| "epoch": 6.381118881118881, |
| "grad_norm": 0.3094463646411896, |
| "learning_rate": 0.00052373527696793, |
| "loss": 3.4661, |
| "step": 21900 |
| }, |
| { |
| "epoch": 6.395687645687646, |
| "grad_norm": 0.33415213227272034, |
| "learning_rate": 0.0005235603498542274, |
| "loss": 3.462, |
| "step": 21950 |
| }, |
| { |
| "epoch": 6.410256410256411, |
| "grad_norm": 0.32387012243270874, |
| "learning_rate": 0.0005233854227405248, |
| "loss": 3.468, |
| "step": 22000 |
| }, |
| { |
| "epoch": 6.410256410256411, |
| "eval_accuracy": 0.3633268431015672, |
| "eval_loss": 3.607569694519043, |
| "eval_runtime": 180.2515, |
| "eval_samples_per_second": 92.327, |
| "eval_steps_per_second": 5.775, |
| "step": 22000 |
| }, |
| { |
| "epoch": 6.424825174825175, |
| "grad_norm": 0.33040571212768555, |
| "learning_rate": 0.0005232104956268221, |
| "loss": 3.47, |
| "step": 22050 |
| }, |
| { |
| "epoch": 6.4393939393939394, |
| "grad_norm": 0.3241094648838043, |
| "learning_rate": 0.0005230355685131195, |
| "loss": 3.4797, |
| "step": 22100 |
| }, |
| { |
| "epoch": 6.453962703962704, |
| "grad_norm": 0.32219818234443665, |
| "learning_rate": 0.0005228606413994169, |
| "loss": 3.4814, |
| "step": 22150 |
| }, |
| { |
| "epoch": 6.468531468531468, |
| "grad_norm": 0.29795998334884644, |
| "learning_rate": 0.0005226857142857142, |
| "loss": 3.4686, |
| "step": 22200 |
| }, |
| { |
| "epoch": 6.483100233100233, |
| "grad_norm": 0.32461127638816833, |
| "learning_rate": 0.0005225107871720116, |
| "loss": 3.4817, |
| "step": 22250 |
| }, |
| { |
| "epoch": 6.497668997668998, |
| "grad_norm": 0.3403743803501129, |
| "learning_rate": 0.0005223358600583089, |
| "loss": 3.469, |
| "step": 22300 |
| }, |
| { |
| "epoch": 6.5122377622377625, |
| "grad_norm": 0.3380977213382721, |
| "learning_rate": 0.0005221609329446064, |
| "loss": 3.4631, |
| "step": 22350 |
| }, |
| { |
| "epoch": 6.526806526806527, |
| "grad_norm": 0.3517080545425415, |
| "learning_rate": 0.0005219860058309038, |
| "loss": 3.4956, |
| "step": 22400 |
| }, |
| { |
| "epoch": 6.541375291375291, |
| "grad_norm": 0.3424749970436096, |
| "learning_rate": 0.0005218110787172011, |
| "loss": 3.4799, |
| "step": 22450 |
| }, |
| { |
| "epoch": 6.555944055944056, |
| "grad_norm": 0.33320173621177673, |
| "learning_rate": 0.0005216361516034985, |
| "loss": 3.4824, |
| "step": 22500 |
| }, |
| { |
| "epoch": 6.57051282051282, |
| "grad_norm": 0.3647104799747467, |
| "learning_rate": 0.0005214612244897959, |
| "loss": 3.4817, |
| "step": 22550 |
| }, |
| { |
| "epoch": 6.585081585081585, |
| "grad_norm": 0.32451170682907104, |
| "learning_rate": 0.0005212862973760932, |
| "loss": 3.4947, |
| "step": 22600 |
| }, |
| { |
| "epoch": 6.59965034965035, |
| "grad_norm": 0.3204374313354492, |
| "learning_rate": 0.0005211113702623906, |
| "loss": 3.4697, |
| "step": 22650 |
| }, |
| { |
| "epoch": 6.6142191142191145, |
| "grad_norm": 0.35725289583206177, |
| "learning_rate": 0.0005209364431486879, |
| "loss": 3.4834, |
| "step": 22700 |
| }, |
| { |
| "epoch": 6.628787878787879, |
| "grad_norm": 0.3210899531841278, |
| "learning_rate": 0.0005207615160349854, |
| "loss": 3.4881, |
| "step": 22750 |
| }, |
| { |
| "epoch": 6.643356643356643, |
| "grad_norm": 0.32716649770736694, |
| "learning_rate": 0.0005205865889212828, |
| "loss": 3.4757, |
| "step": 22800 |
| }, |
| { |
| "epoch": 6.657925407925408, |
| "grad_norm": 0.30155670642852783, |
| "learning_rate": 0.0005204116618075801, |
| "loss": 3.4726, |
| "step": 22850 |
| }, |
| { |
| "epoch": 6.672494172494172, |
| "grad_norm": 0.3163397014141083, |
| "learning_rate": 0.0005202367346938776, |
| "loss": 3.4822, |
| "step": 22900 |
| }, |
| { |
| "epoch": 6.687062937062937, |
| "grad_norm": 0.32445988059043884, |
| "learning_rate": 0.0005200618075801749, |
| "loss": 3.4772, |
| "step": 22950 |
| }, |
| { |
| "epoch": 6.701631701631702, |
| "grad_norm": 0.3531341254711151, |
| "learning_rate": 0.0005198868804664723, |
| "loss": 3.463, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.701631701631702, |
| "eval_accuracy": 0.3641228156615039, |
| "eval_loss": 3.5994231700897217, |
| "eval_runtime": 180.3181, |
| "eval_samples_per_second": 92.292, |
| "eval_steps_per_second": 5.773, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.716200466200466, |
| "grad_norm": 0.31323182582855225, |
| "learning_rate": 0.0005197119533527696, |
| "loss": 3.4779, |
| "step": 23050 |
| }, |
| { |
| "epoch": 6.730769230769231, |
| "grad_norm": 0.33791857957839966, |
| "learning_rate": 0.000519537026239067, |
| "loss": 3.4815, |
| "step": 23100 |
| }, |
| { |
| "epoch": 6.745337995337995, |
| "grad_norm": 0.32510659098625183, |
| "learning_rate": 0.0005193620991253644, |
| "loss": 3.4744, |
| "step": 23150 |
| }, |
| { |
| "epoch": 6.75990675990676, |
| "grad_norm": 0.3223019540309906, |
| "learning_rate": 0.0005191871720116618, |
| "loss": 3.4814, |
| "step": 23200 |
| }, |
| { |
| "epoch": 6.774475524475524, |
| "grad_norm": 0.33018162846565247, |
| "learning_rate": 0.0005190122448979591, |
| "loss": 3.4914, |
| "step": 23250 |
| }, |
| { |
| "epoch": 6.7890442890442895, |
| "grad_norm": 0.34106600284576416, |
| "learning_rate": 0.0005188373177842566, |
| "loss": 3.4826, |
| "step": 23300 |
| }, |
| { |
| "epoch": 6.803613053613054, |
| "grad_norm": 0.3386887013912201, |
| "learning_rate": 0.0005186623906705539, |
| "loss": 3.4854, |
| "step": 23350 |
| }, |
| { |
| "epoch": 6.818181818181818, |
| "grad_norm": 0.3332427442073822, |
| "learning_rate": 0.0005184874635568513, |
| "loss": 3.4823, |
| "step": 23400 |
| }, |
| { |
| "epoch": 6.832750582750583, |
| "grad_norm": 0.3226874768733978, |
| "learning_rate": 0.0005183125364431486, |
| "loss": 3.4834, |
| "step": 23450 |
| }, |
| { |
| "epoch": 6.847319347319347, |
| "grad_norm": 0.3233667016029358, |
| "learning_rate": 0.000518137609329446, |
| "loss": 3.4903, |
| "step": 23500 |
| }, |
| { |
| "epoch": 6.861888111888112, |
| "grad_norm": 0.3243621289730072, |
| "learning_rate": 0.0005179626822157434, |
| "loss": 3.4887, |
| "step": 23550 |
| }, |
| { |
| "epoch": 6.876456876456876, |
| "grad_norm": 0.31626027822494507, |
| "learning_rate": 0.0005177877551020407, |
| "loss": 3.4999, |
| "step": 23600 |
| }, |
| { |
| "epoch": 6.891025641025641, |
| "grad_norm": 0.32455411553382874, |
| "learning_rate": 0.0005176128279883381, |
| "loss": 3.4913, |
| "step": 23650 |
| }, |
| { |
| "epoch": 6.905594405594406, |
| "grad_norm": 0.33513808250427246, |
| "learning_rate": 0.0005174379008746356, |
| "loss": 3.4868, |
| "step": 23700 |
| }, |
| { |
| "epoch": 6.92016317016317, |
| "grad_norm": 0.3218197226524353, |
| "learning_rate": 0.0005172629737609329, |
| "loss": 3.4826, |
| "step": 23750 |
| }, |
| { |
| "epoch": 6.934731934731935, |
| "grad_norm": 0.3316863477230072, |
| "learning_rate": 0.0005170880466472303, |
| "loss": 3.4793, |
| "step": 23800 |
| }, |
| { |
| "epoch": 6.949300699300699, |
| "grad_norm": 0.3216456472873688, |
| "learning_rate": 0.0005169131195335276, |
| "loss": 3.4953, |
| "step": 23850 |
| }, |
| { |
| "epoch": 6.963869463869464, |
| "grad_norm": 0.3314974308013916, |
| "learning_rate": 0.0005167381924198251, |
| "loss": 3.4799, |
| "step": 23900 |
| }, |
| { |
| "epoch": 6.978438228438229, |
| "grad_norm": 0.3145802319049835, |
| "learning_rate": 0.0005165632653061224, |
| "loss": 3.4943, |
| "step": 23950 |
| }, |
| { |
| "epoch": 6.993006993006993, |
| "grad_norm": 0.3357449471950531, |
| "learning_rate": 0.0005163883381924197, |
| "loss": 3.4931, |
| "step": 24000 |
| }, |
| { |
| "epoch": 6.993006993006993, |
| "eval_accuracy": 0.3651081095128422, |
| "eval_loss": 3.587930202484131, |
| "eval_runtime": 180.1844, |
| "eval_samples_per_second": 92.361, |
| "eval_steps_per_second": 5.777, |
| "step": 24000 |
| }, |
| { |
| "epoch": 7.007575757575758, |
| "grad_norm": 0.33467987179756165, |
| "learning_rate": 0.0005162134110787171, |
| "loss": 3.4232, |
| "step": 24050 |
| }, |
| { |
| "epoch": 7.022144522144522, |
| "grad_norm": 0.34825414419174194, |
| "learning_rate": 0.0005160384839650146, |
| "loss": 3.3667, |
| "step": 24100 |
| }, |
| { |
| "epoch": 7.036713286713287, |
| "grad_norm": 0.3400607407093048, |
| "learning_rate": 0.0005158635568513119, |
| "loss": 3.3739, |
| "step": 24150 |
| }, |
| { |
| "epoch": 7.051282051282051, |
| "grad_norm": 0.3337096869945526, |
| "learning_rate": 0.0005156886297376093, |
| "loss": 3.3744, |
| "step": 24200 |
| }, |
| { |
| "epoch": 7.0658508158508155, |
| "grad_norm": 0.32601627707481384, |
| "learning_rate": 0.0005155137026239066, |
| "loss": 3.3788, |
| "step": 24250 |
| }, |
| { |
| "epoch": 7.08041958041958, |
| "grad_norm": 0.3523777723312378, |
| "learning_rate": 0.0005153387755102041, |
| "loss": 3.3793, |
| "step": 24300 |
| }, |
| { |
| "epoch": 7.094988344988345, |
| "grad_norm": 0.3176375925540924, |
| "learning_rate": 0.0005151638483965014, |
| "loss": 3.3979, |
| "step": 24350 |
| }, |
| { |
| "epoch": 7.10955710955711, |
| "grad_norm": 0.34010785818099976, |
| "learning_rate": 0.0005149889212827987, |
| "loss": 3.3969, |
| "step": 24400 |
| }, |
| { |
| "epoch": 7.124125874125874, |
| "grad_norm": 0.3091638684272766, |
| "learning_rate": 0.0005148139941690961, |
| "loss": 3.3991, |
| "step": 24450 |
| }, |
| { |
| "epoch": 7.138694638694639, |
| "grad_norm": 0.33341875672340393, |
| "learning_rate": 0.0005146390670553936, |
| "loss": 3.3976, |
| "step": 24500 |
| }, |
| { |
| "epoch": 7.153263403263403, |
| "grad_norm": 0.32438334822654724, |
| "learning_rate": 0.0005144641399416909, |
| "loss": 3.4227, |
| "step": 24550 |
| }, |
| { |
| "epoch": 7.1678321678321675, |
| "grad_norm": 0.32416701316833496, |
| "learning_rate": 0.0005142892128279883, |
| "loss": 3.4131, |
| "step": 24600 |
| }, |
| { |
| "epoch": 7.182400932400933, |
| "grad_norm": 0.31941843032836914, |
| "learning_rate": 0.0005141142857142856, |
| "loss": 3.4218, |
| "step": 24650 |
| }, |
| { |
| "epoch": 7.196969696969697, |
| "grad_norm": 0.3458154499530792, |
| "learning_rate": 0.0005139393586005831, |
| "loss": 3.406, |
| "step": 24700 |
| }, |
| { |
| "epoch": 7.211538461538462, |
| "grad_norm": 0.3377438485622406, |
| "learning_rate": 0.0005137644314868804, |
| "loss": 3.4271, |
| "step": 24750 |
| }, |
| { |
| "epoch": 7.226107226107226, |
| "grad_norm": 0.32163214683532715, |
| "learning_rate": 0.0005135895043731778, |
| "loss": 3.4073, |
| "step": 24800 |
| }, |
| { |
| "epoch": 7.2406759906759905, |
| "grad_norm": 0.3218829333782196, |
| "learning_rate": 0.0005134145772594752, |
| "loss": 3.4237, |
| "step": 24850 |
| }, |
| { |
| "epoch": 7.255244755244755, |
| "grad_norm": 0.32908493280410767, |
| "learning_rate": 0.0005132396501457726, |
| "loss": 3.4279, |
| "step": 24900 |
| }, |
| { |
| "epoch": 7.269813519813519, |
| "grad_norm": 0.3292107880115509, |
| "learning_rate": 0.0005130647230320699, |
| "loss": 3.4245, |
| "step": 24950 |
| }, |
| { |
| "epoch": 7.284382284382285, |
| "grad_norm": 0.33104875683784485, |
| "learning_rate": 0.0005128897959183673, |
| "loss": 3.4302, |
| "step": 25000 |
| }, |
| { |
| "epoch": 7.284382284382285, |
| "eval_accuracy": 0.36487504441994895, |
| "eval_loss": 3.5962696075439453, |
| "eval_runtime": 180.2058, |
| "eval_samples_per_second": 92.35, |
| "eval_steps_per_second": 5.777, |
| "step": 25000 |
| }, |
| { |
| "epoch": 7.298951048951049, |
| "grad_norm": 0.3441121578216553, |
| "learning_rate": 0.0005127148688046647, |
| "loss": 3.4167, |
| "step": 25050 |
| }, |
| { |
| "epoch": 7.313519813519814, |
| "grad_norm": 0.3398892283439636, |
| "learning_rate": 0.0005125399416909621, |
| "loss": 3.4279, |
| "step": 25100 |
| }, |
| { |
| "epoch": 7.328088578088578, |
| "grad_norm": 0.31465670466423035, |
| "learning_rate": 0.0005123650145772594, |
| "loss": 3.4331, |
| "step": 25150 |
| }, |
| { |
| "epoch": 7.3426573426573425, |
| "grad_norm": 0.3508657217025757, |
| "learning_rate": 0.0005121900874635568, |
| "loss": 3.4321, |
| "step": 25200 |
| }, |
| { |
| "epoch": 7.357226107226107, |
| "grad_norm": 0.3288611173629761, |
| "learning_rate": 0.0005120151603498543, |
| "loss": 3.4375, |
| "step": 25250 |
| }, |
| { |
| "epoch": 7.371794871794872, |
| "grad_norm": 0.3296453356742859, |
| "learning_rate": 0.0005118402332361515, |
| "loss": 3.4207, |
| "step": 25300 |
| }, |
| { |
| "epoch": 7.386363636363637, |
| "grad_norm": 0.3522392511367798, |
| "learning_rate": 0.0005116653061224489, |
| "loss": 3.4314, |
| "step": 25350 |
| }, |
| { |
| "epoch": 7.400932400932401, |
| "grad_norm": 0.3410415053367615, |
| "learning_rate": 0.0005114903790087463, |
| "loss": 3.441, |
| "step": 25400 |
| }, |
| { |
| "epoch": 7.415501165501166, |
| "grad_norm": 0.3186326324939728, |
| "learning_rate": 0.0005113154518950437, |
| "loss": 3.4412, |
| "step": 25450 |
| }, |
| { |
| "epoch": 7.43006993006993, |
| "grad_norm": 0.34362727403640747, |
| "learning_rate": 0.0005111405247813411, |
| "loss": 3.4385, |
| "step": 25500 |
| }, |
| { |
| "epoch": 7.444638694638694, |
| "grad_norm": 0.3459414541721344, |
| "learning_rate": 0.0005109655976676384, |
| "loss": 3.4456, |
| "step": 25550 |
| }, |
| { |
| "epoch": 7.459207459207459, |
| "grad_norm": 0.32094958424568176, |
| "learning_rate": 0.0005107906705539358, |
| "loss": 3.4242, |
| "step": 25600 |
| }, |
| { |
| "epoch": 7.473776223776224, |
| "grad_norm": 0.33854368329048157, |
| "learning_rate": 0.0005106157434402332, |
| "loss": 3.4381, |
| "step": 25650 |
| }, |
| { |
| "epoch": 7.488344988344989, |
| "grad_norm": 0.3441680371761322, |
| "learning_rate": 0.0005104408163265306, |
| "loss": 3.4411, |
| "step": 25700 |
| }, |
| { |
| "epoch": 7.502913752913753, |
| "grad_norm": 0.34269794821739197, |
| "learning_rate": 0.0005102658892128279, |
| "loss": 3.4422, |
| "step": 25750 |
| }, |
| { |
| "epoch": 7.5174825174825175, |
| "grad_norm": 0.3172236382961273, |
| "learning_rate": 0.0005100909620991253, |
| "loss": 3.4436, |
| "step": 25800 |
| }, |
| { |
| "epoch": 7.532051282051282, |
| "grad_norm": 0.3277212977409363, |
| "learning_rate": 0.0005099160349854227, |
| "loss": 3.4417, |
| "step": 25850 |
| }, |
| { |
| "epoch": 7.546620046620046, |
| "grad_norm": 0.3453097641468048, |
| "learning_rate": 0.0005097411078717201, |
| "loss": 3.441, |
| "step": 25900 |
| }, |
| { |
| "epoch": 7.561188811188811, |
| "grad_norm": 0.33524438738822937, |
| "learning_rate": 0.0005095661807580174, |
| "loss": 3.4466, |
| "step": 25950 |
| }, |
| { |
| "epoch": 7.575757575757576, |
| "grad_norm": 0.3289114832878113, |
| "learning_rate": 0.0005093912536443149, |
| "loss": 3.4563, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.575757575757576, |
| "eval_accuracy": 0.36541772625834573, |
| "eval_loss": 3.5879688262939453, |
| "eval_runtime": 180.0683, |
| "eval_samples_per_second": 92.42, |
| "eval_steps_per_second": 5.781, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.590326340326341, |
| "grad_norm": 0.32762327790260315, |
| "learning_rate": 0.0005092163265306122, |
| "loss": 3.4424, |
| "step": 26050 |
| }, |
| { |
| "epoch": 7.604895104895105, |
| "grad_norm": 0.32108092308044434, |
| "learning_rate": 0.0005090413994169096, |
| "loss": 3.4444, |
| "step": 26100 |
| }, |
| { |
| "epoch": 7.619463869463869, |
| "grad_norm": 0.31724825501441956, |
| "learning_rate": 0.000508866472303207, |
| "loss": 3.4457, |
| "step": 26150 |
| }, |
| { |
| "epoch": 7.634032634032634, |
| "grad_norm": 0.3468092083930969, |
| "learning_rate": 0.0005086915451895044, |
| "loss": 3.4433, |
| "step": 26200 |
| }, |
| { |
| "epoch": 7.648601398601398, |
| "grad_norm": 0.3435969054698944, |
| "learning_rate": 0.0005085166180758017, |
| "loss": 3.4572, |
| "step": 26250 |
| }, |
| { |
| "epoch": 7.663170163170163, |
| "grad_norm": 0.34128132462501526, |
| "learning_rate": 0.0005083416909620991, |
| "loss": 3.4489, |
| "step": 26300 |
| }, |
| { |
| "epoch": 7.677738927738928, |
| "grad_norm": 0.33892133831977844, |
| "learning_rate": 0.0005081667638483964, |
| "loss": 3.4451, |
| "step": 26350 |
| }, |
| { |
| "epoch": 7.6923076923076925, |
| "grad_norm": 0.3309405744075775, |
| "learning_rate": 0.0005079918367346939, |
| "loss": 3.4451, |
| "step": 26400 |
| }, |
| { |
| "epoch": 7.706876456876457, |
| "grad_norm": 0.3350067138671875, |
| "learning_rate": 0.0005078169096209912, |
| "loss": 3.4521, |
| "step": 26450 |
| }, |
| { |
| "epoch": 7.721445221445221, |
| "grad_norm": 0.3235504627227783, |
| "learning_rate": 0.0005076419825072886, |
| "loss": 3.4527, |
| "step": 26500 |
| }, |
| { |
| "epoch": 7.736013986013986, |
| "grad_norm": 0.3295065462589264, |
| "learning_rate": 0.000507467055393586, |
| "loss": 3.4627, |
| "step": 26550 |
| }, |
| { |
| "epoch": 7.75058275058275, |
| "grad_norm": 0.3342365622520447, |
| "learning_rate": 0.0005072921282798834, |
| "loss": 3.4602, |
| "step": 26600 |
| }, |
| { |
| "epoch": 7.765151515151516, |
| "grad_norm": 0.33766648173332214, |
| "learning_rate": 0.0005071172011661807, |
| "loss": 3.4665, |
| "step": 26650 |
| }, |
| { |
| "epoch": 7.77972027972028, |
| "grad_norm": 0.33531129360198975, |
| "learning_rate": 0.0005069422740524781, |
| "loss": 3.4529, |
| "step": 26700 |
| }, |
| { |
| "epoch": 7.7942890442890445, |
| "grad_norm": 0.31520190834999084, |
| "learning_rate": 0.0005067673469387754, |
| "loss": 3.4565, |
| "step": 26750 |
| }, |
| { |
| "epoch": 7.808857808857809, |
| "grad_norm": 0.319269061088562, |
| "learning_rate": 0.0005065924198250729, |
| "loss": 3.4587, |
| "step": 26800 |
| }, |
| { |
| "epoch": 7.823426573426573, |
| "grad_norm": 0.33063361048698425, |
| "learning_rate": 0.0005064174927113702, |
| "loss": 3.4538, |
| "step": 26850 |
| }, |
| { |
| "epoch": 7.837995337995338, |
| "grad_norm": 0.3421882390975952, |
| "learning_rate": 0.0005062425655976676, |
| "loss": 3.4524, |
| "step": 26900 |
| }, |
| { |
| "epoch": 7.852564102564102, |
| "grad_norm": 0.3269641399383545, |
| "learning_rate": 0.0005060676384839649, |
| "loss": 3.4436, |
| "step": 26950 |
| }, |
| { |
| "epoch": 7.867132867132867, |
| "grad_norm": 0.3379534184932709, |
| "learning_rate": 0.0005058927113702624, |
| "loss": 3.4508, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.867132867132867, |
| "eval_accuracy": 0.36653001824304665, |
| "eval_loss": 3.576610803604126, |
| "eval_runtime": 180.045, |
| "eval_samples_per_second": 92.432, |
| "eval_steps_per_second": 5.782, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.881701631701632, |
| "grad_norm": 0.32535165548324585, |
| "learning_rate": 0.0005057177842565598, |
| "loss": 3.4509, |
| "step": 27050 |
| }, |
| { |
| "epoch": 7.896270396270396, |
| "grad_norm": 0.32828956842422485, |
| "learning_rate": 0.0005055428571428571, |
| "loss": 3.4521, |
| "step": 27100 |
| }, |
| { |
| "epoch": 7.910839160839161, |
| "grad_norm": 0.3364267945289612, |
| "learning_rate": 0.0005053679300291544, |
| "loss": 3.4542, |
| "step": 27150 |
| }, |
| { |
| "epoch": 7.925407925407925, |
| "grad_norm": 0.3117345869541168, |
| "learning_rate": 0.0005051930029154519, |
| "loss": 3.4532, |
| "step": 27200 |
| }, |
| { |
| "epoch": 7.93997668997669, |
| "grad_norm": 0.34424829483032227, |
| "learning_rate": 0.0005050180758017492, |
| "loss": 3.4511, |
| "step": 27250 |
| }, |
| { |
| "epoch": 7.954545454545455, |
| "grad_norm": 0.3215528130531311, |
| "learning_rate": 0.0005048431486880466, |
| "loss": 3.4591, |
| "step": 27300 |
| }, |
| { |
| "epoch": 7.9691142191142195, |
| "grad_norm": 0.32633933424949646, |
| "learning_rate": 0.0005046682215743439, |
| "loss": 3.4638, |
| "step": 27350 |
| }, |
| { |
| "epoch": 7.983682983682984, |
| "grad_norm": 0.3218749165534973, |
| "learning_rate": 0.0005044932944606414, |
| "loss": 3.4643, |
| "step": 27400 |
| }, |
| { |
| "epoch": 7.998251748251748, |
| "grad_norm": 0.3505411446094513, |
| "learning_rate": 0.0005043183673469388, |
| "loss": 3.471, |
| "step": 27450 |
| }, |
| { |
| "epoch": 8.012820512820513, |
| "grad_norm": 0.32962340116500854, |
| "learning_rate": 0.0005041434402332361, |
| "loss": 3.3527, |
| "step": 27500 |
| }, |
| { |
| "epoch": 8.027389277389277, |
| "grad_norm": 0.3332984447479248, |
| "learning_rate": 0.0005039685131195334, |
| "loss": 3.3579, |
| "step": 27550 |
| }, |
| { |
| "epoch": 8.041958041958042, |
| "grad_norm": 0.3268524706363678, |
| "learning_rate": 0.0005037935860058309, |
| "loss": 3.3558, |
| "step": 27600 |
| }, |
| { |
| "epoch": 8.056526806526806, |
| "grad_norm": 0.326473593711853, |
| "learning_rate": 0.0005036186588921282, |
| "loss": 3.3529, |
| "step": 27650 |
| }, |
| { |
| "epoch": 8.07109557109557, |
| "grad_norm": 0.3332017958164215, |
| "learning_rate": 0.0005034437317784256, |
| "loss": 3.3669, |
| "step": 27700 |
| }, |
| { |
| "epoch": 8.085664335664335, |
| "grad_norm": 0.34896111488342285, |
| "learning_rate": 0.000503268804664723, |
| "loss": 3.353, |
| "step": 27750 |
| }, |
| { |
| "epoch": 8.1002331002331, |
| "grad_norm": 0.3163587749004364, |
| "learning_rate": 0.0005030938775510204, |
| "loss": 3.3777, |
| "step": 27800 |
| }, |
| { |
| "epoch": 8.114801864801866, |
| "grad_norm": 0.3226380944252014, |
| "learning_rate": 0.0005029189504373178, |
| "loss": 3.3687, |
| "step": 27850 |
| }, |
| { |
| "epoch": 8.12937062937063, |
| "grad_norm": 0.3180595934391022, |
| "learning_rate": 0.0005027440233236151, |
| "loss": 3.3655, |
| "step": 27900 |
| }, |
| { |
| "epoch": 8.143939393939394, |
| "grad_norm": 0.3455767035484314, |
| "learning_rate": 0.0005025690962099126, |
| "loss": 3.3818, |
| "step": 27950 |
| }, |
| { |
| "epoch": 8.158508158508159, |
| "grad_norm": 0.3457156717777252, |
| "learning_rate": 0.0005023941690962099, |
| "loss": 3.3689, |
| "step": 28000 |
| }, |
| { |
| "epoch": 8.158508158508159, |
| "eval_accuracy": 0.36624509557903034, |
| "eval_loss": 3.5875673294067383, |
| "eval_runtime": 180.2264, |
| "eval_samples_per_second": 92.339, |
| "eval_steps_per_second": 5.776, |
| "step": 28000 |
| }, |
| { |
| "epoch": 8.173076923076923, |
| "grad_norm": 0.33135128021240234, |
| "learning_rate": 0.0005022192419825072, |
| "loss": 3.3743, |
| "step": 28050 |
| }, |
| { |
| "epoch": 8.187645687645688, |
| "grad_norm": 0.34318405389785767, |
| "learning_rate": 0.0005020443148688046, |
| "loss": 3.3838, |
| "step": 28100 |
| }, |
| { |
| "epoch": 8.202214452214452, |
| "grad_norm": 0.3445442318916321, |
| "learning_rate": 0.000501869387755102, |
| "loss": 3.3952, |
| "step": 28150 |
| }, |
| { |
| "epoch": 8.216783216783217, |
| "grad_norm": 0.33991461992263794, |
| "learning_rate": 0.0005016944606413994, |
| "loss": 3.39, |
| "step": 28200 |
| }, |
| { |
| "epoch": 8.231351981351981, |
| "grad_norm": 0.30327895283699036, |
| "learning_rate": 0.0005015195335276967, |
| "loss": 3.3729, |
| "step": 28250 |
| }, |
| { |
| "epoch": 8.245920745920746, |
| "grad_norm": 0.3228490650653839, |
| "learning_rate": 0.0005013446064139941, |
| "loss": 3.3843, |
| "step": 28300 |
| }, |
| { |
| "epoch": 8.26048951048951, |
| "grad_norm": 0.3313380777835846, |
| "learning_rate": 0.0005011696793002916, |
| "loss": 3.3977, |
| "step": 28350 |
| }, |
| { |
| "epoch": 8.275058275058274, |
| "grad_norm": 0.31908831000328064, |
| "learning_rate": 0.0005009947521865889, |
| "loss": 3.3963, |
| "step": 28400 |
| }, |
| { |
| "epoch": 8.289627039627039, |
| "grad_norm": 0.3229847252368927, |
| "learning_rate": 0.0005008198250728862, |
| "loss": 3.3839, |
| "step": 28450 |
| }, |
| { |
| "epoch": 8.304195804195805, |
| "grad_norm": 0.3282051086425781, |
| "learning_rate": 0.0005006448979591836, |
| "loss": 3.3839, |
| "step": 28500 |
| }, |
| { |
| "epoch": 8.31876456876457, |
| "grad_norm": 0.33158057928085327, |
| "learning_rate": 0.000500469970845481, |
| "loss": 3.3931, |
| "step": 28550 |
| }, |
| { |
| "epoch": 8.333333333333334, |
| "grad_norm": 0.33471545577049255, |
| "learning_rate": 0.0005002950437317784, |
| "loss": 3.401, |
| "step": 28600 |
| }, |
| { |
| "epoch": 8.347902097902098, |
| "grad_norm": 0.3325505554676056, |
| "learning_rate": 0.0005001201166180757, |
| "loss": 3.4004, |
| "step": 28650 |
| }, |
| { |
| "epoch": 8.362470862470863, |
| "grad_norm": 0.3467610776424408, |
| "learning_rate": 0.0004999451895043731, |
| "loss": 3.4166, |
| "step": 28700 |
| }, |
| { |
| "epoch": 8.377039627039627, |
| "grad_norm": 0.3516916334629059, |
| "learning_rate": 0.0004997702623906706, |
| "loss": 3.41, |
| "step": 28750 |
| }, |
| { |
| "epoch": 8.391608391608392, |
| "grad_norm": 0.3250814378261566, |
| "learning_rate": 0.0004995953352769679, |
| "loss": 3.3877, |
| "step": 28800 |
| }, |
| { |
| "epoch": 8.406177156177156, |
| "grad_norm": 0.3190094530582428, |
| "learning_rate": 0.0004994204081632653, |
| "loss": 3.4102, |
| "step": 28850 |
| }, |
| { |
| "epoch": 8.42074592074592, |
| "grad_norm": 0.32494157552719116, |
| "learning_rate": 0.0004992454810495626, |
| "loss": 3.3945, |
| "step": 28900 |
| }, |
| { |
| "epoch": 8.435314685314685, |
| "grad_norm": 0.32460781931877136, |
| "learning_rate": 0.00049907055393586, |
| "loss": 3.4025, |
| "step": 28950 |
| }, |
| { |
| "epoch": 8.44988344988345, |
| "grad_norm": 0.33918076753616333, |
| "learning_rate": 0.0004988956268221574, |
| "loss": 3.409, |
| "step": 29000 |
| }, |
| { |
| "epoch": 8.44988344988345, |
| "eval_accuracy": 0.36715372018689424, |
| "eval_loss": 3.5787601470947266, |
| "eval_runtime": 179.9871, |
| "eval_samples_per_second": 92.462, |
| "eval_steps_per_second": 5.784, |
| "step": 29000 |
| }, |
| { |
| "epoch": 8.464452214452214, |
| "grad_norm": 0.33270788192749023, |
| "learning_rate": 0.0004987206997084547, |
| "loss": 3.4046, |
| "step": 29050 |
| }, |
| { |
| "epoch": 8.479020979020978, |
| "grad_norm": 0.3704804480075836, |
| "learning_rate": 0.0004985457725947521, |
| "loss": 3.4126, |
| "step": 29100 |
| }, |
| { |
| "epoch": 8.493589743589745, |
| "grad_norm": 0.32601258158683777, |
| "learning_rate": 0.0004983708454810496, |
| "loss": 3.4148, |
| "step": 29150 |
| }, |
| { |
| "epoch": 8.508158508158509, |
| "grad_norm": 0.333025723695755, |
| "learning_rate": 0.0004981959183673469, |
| "loss": 3.4101, |
| "step": 29200 |
| }, |
| { |
| "epoch": 8.522727272727273, |
| "grad_norm": 0.37352901697158813, |
| "learning_rate": 0.0004980209912536443, |
| "loss": 3.4143, |
| "step": 29250 |
| }, |
| { |
| "epoch": 8.537296037296038, |
| "grad_norm": 0.33815521001815796, |
| "learning_rate": 0.0004978460641399417, |
| "loss": 3.4212, |
| "step": 29300 |
| }, |
| { |
| "epoch": 8.551864801864802, |
| "grad_norm": 0.33571699261665344, |
| "learning_rate": 0.000497671137026239, |
| "loss": 3.4171, |
| "step": 29350 |
| }, |
| { |
| "epoch": 8.566433566433567, |
| "grad_norm": 0.33257025480270386, |
| "learning_rate": 0.0004974962099125364, |
| "loss": 3.4078, |
| "step": 29400 |
| }, |
| { |
| "epoch": 8.581002331002331, |
| "grad_norm": 0.3310984671115875, |
| "learning_rate": 0.0004973212827988337, |
| "loss": 3.4084, |
| "step": 29450 |
| }, |
| { |
| "epoch": 8.595571095571096, |
| "grad_norm": 0.3633224368095398, |
| "learning_rate": 0.0004971463556851312, |
| "loss": 3.417, |
| "step": 29500 |
| }, |
| { |
| "epoch": 8.61013986013986, |
| "grad_norm": 0.3331892192363739, |
| "learning_rate": 0.0004969714285714286, |
| "loss": 3.4142, |
| "step": 29550 |
| }, |
| { |
| "epoch": 8.624708624708624, |
| "grad_norm": 0.3235609829425812, |
| "learning_rate": 0.0004967965014577259, |
| "loss": 3.4237, |
| "step": 29600 |
| }, |
| { |
| "epoch": 8.639277389277389, |
| "grad_norm": 0.3281348943710327, |
| "learning_rate": 0.0004966215743440233, |
| "loss": 3.417, |
| "step": 29650 |
| }, |
| { |
| "epoch": 8.653846153846153, |
| "grad_norm": 0.3530293405056, |
| "learning_rate": 0.0004964466472303207, |
| "loss": 3.4244, |
| "step": 29700 |
| }, |
| { |
| "epoch": 8.668414918414918, |
| "grad_norm": 0.3330443501472473, |
| "learning_rate": 0.000496271720116618, |
| "loss": 3.4279, |
| "step": 29750 |
| }, |
| { |
| "epoch": 8.682983682983682, |
| "grad_norm": 0.3074372112751007, |
| "learning_rate": 0.0004960967930029154, |
| "loss": 3.4178, |
| "step": 29800 |
| }, |
| { |
| "epoch": 8.697552447552448, |
| "grad_norm": 0.317383348941803, |
| "learning_rate": 0.0004959218658892127, |
| "loss": 3.4187, |
| "step": 29850 |
| }, |
| { |
| "epoch": 8.712121212121213, |
| "grad_norm": 0.3294063210487366, |
| "learning_rate": 0.0004957469387755102, |
| "loss": 3.4365, |
| "step": 29900 |
| }, |
| { |
| "epoch": 8.726689976689977, |
| "grad_norm": 0.32321542501449585, |
| "learning_rate": 0.0004955720116618075, |
| "loss": 3.4291, |
| "step": 29950 |
| }, |
| { |
| "epoch": 8.741258741258742, |
| "grad_norm": 0.33648818731307983, |
| "learning_rate": 0.0004953970845481049, |
| "loss": 3.4213, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.741258741258742, |
| "eval_accuracy": 0.3674972031012944, |
| "eval_loss": 3.5736160278320312, |
| "eval_runtime": 184.4878, |
| "eval_samples_per_second": 90.206, |
| "eval_steps_per_second": 5.643, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.755827505827506, |
| "grad_norm": 0.34832367300987244, |
| "learning_rate": 0.0004952221574344023, |
| "loss": 3.4256, |
| "step": 30050 |
| }, |
| { |
| "epoch": 8.77039627039627, |
| "grad_norm": 0.35622331500053406, |
| "learning_rate": 0.0004950472303206997, |
| "loss": 3.4305, |
| "step": 30100 |
| }, |
| { |
| "epoch": 8.784965034965035, |
| "grad_norm": 0.3356943726539612, |
| "learning_rate": 0.0004948723032069971, |
| "loss": 3.4211, |
| "step": 30150 |
| }, |
| { |
| "epoch": 8.7995337995338, |
| "grad_norm": 0.3233150243759155, |
| "learning_rate": 0.0004946973760932944, |
| "loss": 3.4292, |
| "step": 30200 |
| }, |
| { |
| "epoch": 8.814102564102564, |
| "grad_norm": 0.321884423494339, |
| "learning_rate": 0.0004945224489795917, |
| "loss": 3.4113, |
| "step": 30250 |
| }, |
| { |
| "epoch": 8.828671328671328, |
| "grad_norm": 0.3420848846435547, |
| "learning_rate": 0.0004943475218658892, |
| "loss": 3.4163, |
| "step": 30300 |
| }, |
| { |
| "epoch": 8.843240093240093, |
| "grad_norm": 0.33607736229896545, |
| "learning_rate": 0.0004941725947521865, |
| "loss": 3.4269, |
| "step": 30350 |
| }, |
| { |
| "epoch": 8.857808857808857, |
| "grad_norm": 0.327867329120636, |
| "learning_rate": 0.0004939976676384839, |
| "loss": 3.428, |
| "step": 30400 |
| }, |
| { |
| "epoch": 8.872377622377622, |
| "grad_norm": 0.33073338866233826, |
| "learning_rate": 0.0004938227405247813, |
| "loss": 3.4269, |
| "step": 30450 |
| }, |
| { |
| "epoch": 8.886946386946388, |
| "grad_norm": 0.3265506625175476, |
| "learning_rate": 0.0004936478134110787, |
| "loss": 3.427, |
| "step": 30500 |
| }, |
| { |
| "epoch": 8.901515151515152, |
| "grad_norm": 0.3573986291885376, |
| "learning_rate": 0.0004934728862973761, |
| "loss": 3.4453, |
| "step": 30550 |
| }, |
| { |
| "epoch": 8.916083916083917, |
| "grad_norm": 0.31229108572006226, |
| "learning_rate": 0.0004932979591836734, |
| "loss": 3.4326, |
| "step": 30600 |
| }, |
| { |
| "epoch": 8.930652680652681, |
| "grad_norm": 0.31914326548576355, |
| "learning_rate": 0.0004931230320699707, |
| "loss": 3.4377, |
| "step": 30650 |
| }, |
| { |
| "epoch": 8.945221445221446, |
| "grad_norm": 0.3208966851234436, |
| "learning_rate": 0.0004929481049562682, |
| "loss": 3.4355, |
| "step": 30700 |
| }, |
| { |
| "epoch": 8.95979020979021, |
| "grad_norm": 0.33020609617233276, |
| "learning_rate": 0.0004927731778425655, |
| "loss": 3.4301, |
| "step": 30750 |
| }, |
| { |
| "epoch": 8.974358974358974, |
| "grad_norm": 0.3164729177951813, |
| "learning_rate": 0.0004925982507288629, |
| "loss": 3.4351, |
| "step": 30800 |
| }, |
| { |
| "epoch": 8.988927738927739, |
| "grad_norm": 0.3445264399051666, |
| "learning_rate": 0.0004924233236151604, |
| "loss": 3.4289, |
| "step": 30850 |
| }, |
| { |
| "epoch": 9.003496503496503, |
| "grad_norm": 0.35110875964164734, |
| "learning_rate": 0.0004922483965014577, |
| "loss": 3.41, |
| "step": 30900 |
| }, |
| { |
| "epoch": 9.018065268065268, |
| "grad_norm": 0.34962424635887146, |
| "learning_rate": 0.0004920734693877551, |
| "loss": 3.3223, |
| "step": 30950 |
| }, |
| { |
| "epoch": 9.032634032634032, |
| "grad_norm": 0.349322110414505, |
| "learning_rate": 0.0004918985422740524, |
| "loss": 3.3257, |
| "step": 31000 |
| }, |
| { |
| "epoch": 9.032634032634032, |
| "eval_accuracy": 0.3677295626490023, |
| "eval_loss": 3.5775063037872314, |
| "eval_runtime": 184.5342, |
| "eval_samples_per_second": 90.184, |
| "eval_steps_per_second": 5.641, |
| "step": 31000 |
| }, |
| { |
| "epoch": 9.047202797202797, |
| "grad_norm": 0.33628126978874207, |
| "learning_rate": 0.0004917236151603499, |
| "loss": 3.3398, |
| "step": 31050 |
| }, |
| { |
| "epoch": 9.061771561771561, |
| "grad_norm": 0.3321647047996521, |
| "learning_rate": 0.0004915486880466472, |
| "loss": 3.3387, |
| "step": 31100 |
| }, |
| { |
| "epoch": 9.076340326340326, |
| "grad_norm": 0.3293364346027374, |
| "learning_rate": 0.0004913737609329445, |
| "loss": 3.3316, |
| "step": 31150 |
| }, |
| { |
| "epoch": 9.090909090909092, |
| "grad_norm": 0.36083984375, |
| "learning_rate": 0.0004911988338192419, |
| "loss": 3.3328, |
| "step": 31200 |
| }, |
| { |
| "epoch": 9.105477855477856, |
| "grad_norm": 0.3213653862476349, |
| "learning_rate": 0.0004910239067055393, |
| "loss": 3.3368, |
| "step": 31250 |
| }, |
| { |
| "epoch": 9.12004662004662, |
| "grad_norm": 0.328110009431839, |
| "learning_rate": 0.0004908489795918367, |
| "loss": 3.3341, |
| "step": 31300 |
| }, |
| { |
| "epoch": 9.134615384615385, |
| "grad_norm": 0.34902945160865784, |
| "learning_rate": 0.0004906740524781341, |
| "loss": 3.3506, |
| "step": 31350 |
| }, |
| { |
| "epoch": 9.14918414918415, |
| "grad_norm": 0.3422456979751587, |
| "learning_rate": 0.0004904991253644314, |
| "loss": 3.3336, |
| "step": 31400 |
| }, |
| { |
| "epoch": 9.163752913752914, |
| "grad_norm": 0.32111626863479614, |
| "learning_rate": 0.0004903241982507289, |
| "loss": 3.3448, |
| "step": 31450 |
| }, |
| { |
| "epoch": 9.178321678321678, |
| "grad_norm": 0.3301600515842438, |
| "learning_rate": 0.0004901492711370262, |
| "loss": 3.3602, |
| "step": 31500 |
| }, |
| { |
| "epoch": 9.192890442890443, |
| "grad_norm": 0.3596358895301819, |
| "learning_rate": 0.0004899743440233235, |
| "loss": 3.3425, |
| "step": 31550 |
| }, |
| { |
| "epoch": 9.207459207459207, |
| "grad_norm": 0.34408465027809143, |
| "learning_rate": 0.0004897994169096209, |
| "loss": 3.3586, |
| "step": 31600 |
| }, |
| { |
| "epoch": 9.222027972027972, |
| "grad_norm": 0.29452061653137207, |
| "learning_rate": 0.0004896244897959183, |
| "loss": 3.3461, |
| "step": 31650 |
| }, |
| { |
| "epoch": 9.236596736596736, |
| "grad_norm": 0.3462780714035034, |
| "learning_rate": 0.0004894495626822157, |
| "loss": 3.3546, |
| "step": 31700 |
| }, |
| { |
| "epoch": 9.2511655011655, |
| "grad_norm": 0.3364809453487396, |
| "learning_rate": 0.0004892746355685131, |
| "loss": 3.3604, |
| "step": 31750 |
| }, |
| { |
| "epoch": 9.265734265734265, |
| "grad_norm": 0.37016913294792175, |
| "learning_rate": 0.0004890997084548104, |
| "loss": 3.3713, |
| "step": 31800 |
| }, |
| { |
| "epoch": 9.280303030303031, |
| "grad_norm": 0.35441073775291443, |
| "learning_rate": 0.0004889247813411079, |
| "loss": 3.3603, |
| "step": 31850 |
| }, |
| { |
| "epoch": 9.294871794871796, |
| "grad_norm": 0.37994956970214844, |
| "learning_rate": 0.0004887498542274052, |
| "loss": 3.3599, |
| "step": 31900 |
| }, |
| { |
| "epoch": 9.30944055944056, |
| "grad_norm": 0.32939761877059937, |
| "learning_rate": 0.0004885749271137026, |
| "loss": 3.3698, |
| "step": 31950 |
| }, |
| { |
| "epoch": 9.324009324009324, |
| "grad_norm": 0.3467860817909241, |
| "learning_rate": 0.0004883999999999999, |
| "loss": 3.3722, |
| "step": 32000 |
| }, |
| { |
| "epoch": 9.324009324009324, |
| "eval_accuracy": 0.3676854660749181, |
| "eval_loss": 3.57747483253479, |
| "eval_runtime": 184.5767, |
| "eval_samples_per_second": 90.163, |
| "eval_steps_per_second": 5.64, |
| "step": 32000 |
| }, |
| { |
| "epoch": 9.338578088578089, |
| "grad_norm": 0.3501215875148773, |
| "learning_rate": 0.0004882250728862973, |
| "loss": 3.3923, |
| "step": 32050 |
| }, |
| { |
| "epoch": 9.353146853146853, |
| "grad_norm": 0.3472752571105957, |
| "learning_rate": 0.0004880501457725947, |
| "loss": 3.3728, |
| "step": 32100 |
| }, |
| { |
| "epoch": 9.367715617715618, |
| "grad_norm": 0.33848997950553894, |
| "learning_rate": 0.00048787521865889207, |
| "loss": 3.3655, |
| "step": 32150 |
| }, |
| { |
| "epoch": 9.382284382284382, |
| "grad_norm": 0.33940359950065613, |
| "learning_rate": 0.00048770029154518945, |
| "loss": 3.3753, |
| "step": 32200 |
| }, |
| { |
| "epoch": 9.396853146853147, |
| "grad_norm": 0.33698955178260803, |
| "learning_rate": 0.0004875253644314868, |
| "loss": 3.3753, |
| "step": 32250 |
| }, |
| { |
| "epoch": 9.411421911421911, |
| "grad_norm": 0.38616305589675903, |
| "learning_rate": 0.0004873504373177842, |
| "loss": 3.3864, |
| "step": 32300 |
| }, |
| { |
| "epoch": 9.425990675990676, |
| "grad_norm": 0.3368024528026581, |
| "learning_rate": 0.00048717551020408163, |
| "loss": 3.3848, |
| "step": 32350 |
| }, |
| { |
| "epoch": 9.44055944055944, |
| "grad_norm": 0.3477652370929718, |
| "learning_rate": 0.000487000583090379, |
| "loss": 3.3911, |
| "step": 32400 |
| }, |
| { |
| "epoch": 9.455128205128204, |
| "grad_norm": 0.3296109437942505, |
| "learning_rate": 0.00048682565597667633, |
| "loss": 3.3774, |
| "step": 32450 |
| }, |
| { |
| "epoch": 9.469696969696969, |
| "grad_norm": 0.3162249028682709, |
| "learning_rate": 0.0004866507288629737, |
| "loss": 3.389, |
| "step": 32500 |
| }, |
| { |
| "epoch": 9.484265734265735, |
| "grad_norm": 0.35383546352386475, |
| "learning_rate": 0.0004864758017492711, |
| "loss": 3.3884, |
| "step": 32550 |
| }, |
| { |
| "epoch": 9.4988344988345, |
| "grad_norm": 0.36289462447166443, |
| "learning_rate": 0.00048630087463556845, |
| "loss": 3.3902, |
| "step": 32600 |
| }, |
| { |
| "epoch": 9.513403263403264, |
| "grad_norm": 0.3470366299152374, |
| "learning_rate": 0.00048612594752186583, |
| "loss": 3.3957, |
| "step": 32650 |
| }, |
| { |
| "epoch": 9.527972027972028, |
| "grad_norm": 0.35183480381965637, |
| "learning_rate": 0.0004859510204081632, |
| "loss": 3.3927, |
| "step": 32700 |
| }, |
| { |
| "epoch": 9.542540792540793, |
| "grad_norm": 0.35853511095046997, |
| "learning_rate": 0.00048577609329446064, |
| "loss": 3.3904, |
| "step": 32750 |
| }, |
| { |
| "epoch": 9.557109557109557, |
| "grad_norm": 0.36935216188430786, |
| "learning_rate": 0.000485601166180758, |
| "loss": 3.3844, |
| "step": 32800 |
| }, |
| { |
| "epoch": 9.571678321678322, |
| "grad_norm": 0.339053213596344, |
| "learning_rate": 0.0004854262390670554, |
| "loss": 3.3895, |
| "step": 32850 |
| }, |
| { |
| "epoch": 9.586247086247086, |
| "grad_norm": 0.34945929050445557, |
| "learning_rate": 0.0004852513119533527, |
| "loss": 3.4071, |
| "step": 32900 |
| }, |
| { |
| "epoch": 9.60081585081585, |
| "grad_norm": 0.3158731758594513, |
| "learning_rate": 0.0004850763848396501, |
| "loss": 3.401, |
| "step": 32950 |
| }, |
| { |
| "epoch": 9.615384615384615, |
| "grad_norm": 0.33528274297714233, |
| "learning_rate": 0.00048490145772594746, |
| "loss": 3.3982, |
| "step": 33000 |
| }, |
| { |
| "epoch": 9.615384615384615, |
| "eval_accuracy": 0.36842334874792776, |
| "eval_loss": 3.5674285888671875, |
| "eval_runtime": 181.5104, |
| "eval_samples_per_second": 91.686, |
| "eval_steps_per_second": 5.735, |
| "step": 33000 |
| }, |
| { |
| "epoch": 9.62995337995338, |
| "grad_norm": 0.3466652035713196, |
| "learning_rate": 0.00048472653061224484, |
| "loss": 3.3889, |
| "step": 33050 |
| }, |
| { |
| "epoch": 9.644522144522144, |
| "grad_norm": 0.34382832050323486, |
| "learning_rate": 0.0004845516034985422, |
| "loss": 3.3952, |
| "step": 33100 |
| }, |
| { |
| "epoch": 9.659090909090908, |
| "grad_norm": 0.33218619227409363, |
| "learning_rate": 0.0004843766763848396, |
| "loss": 3.3955, |
| "step": 33150 |
| }, |
| { |
| "epoch": 9.673659673659674, |
| "grad_norm": 0.3392280638217926, |
| "learning_rate": 0.000484201749271137, |
| "loss": 3.4007, |
| "step": 33200 |
| }, |
| { |
| "epoch": 9.688228438228439, |
| "grad_norm": 0.32414713501930237, |
| "learning_rate": 0.0004840268221574344, |
| "loss": 3.3993, |
| "step": 33250 |
| }, |
| { |
| "epoch": 9.702797202797203, |
| "grad_norm": 0.3343014121055603, |
| "learning_rate": 0.00048385189504373177, |
| "loss": 3.4024, |
| "step": 33300 |
| }, |
| { |
| "epoch": 9.717365967365968, |
| "grad_norm": 0.3456258177757263, |
| "learning_rate": 0.0004836769679300291, |
| "loss": 3.4074, |
| "step": 33350 |
| }, |
| { |
| "epoch": 9.731934731934732, |
| "grad_norm": 0.33294492959976196, |
| "learning_rate": 0.00048350204081632647, |
| "loss": 3.3968, |
| "step": 33400 |
| }, |
| { |
| "epoch": 9.746503496503497, |
| "grad_norm": 0.32060715556144714, |
| "learning_rate": 0.00048332711370262384, |
| "loss": 3.398, |
| "step": 33450 |
| }, |
| { |
| "epoch": 9.761072261072261, |
| "grad_norm": 0.34071505069732666, |
| "learning_rate": 0.0004831521865889212, |
| "loss": 3.4091, |
| "step": 33500 |
| }, |
| { |
| "epoch": 9.775641025641026, |
| "grad_norm": 0.3401913046836853, |
| "learning_rate": 0.0004829772594752186, |
| "loss": 3.3941, |
| "step": 33550 |
| }, |
| { |
| "epoch": 9.79020979020979, |
| "grad_norm": 0.3486825227737427, |
| "learning_rate": 0.00048280233236151597, |
| "loss": 3.4085, |
| "step": 33600 |
| }, |
| { |
| "epoch": 9.804778554778554, |
| "grad_norm": 0.32629451155662537, |
| "learning_rate": 0.0004826274052478134, |
| "loss": 3.3988, |
| "step": 33650 |
| }, |
| { |
| "epoch": 9.819347319347319, |
| "grad_norm": 0.3566981256008148, |
| "learning_rate": 0.0004824524781341108, |
| "loss": 3.3986, |
| "step": 33700 |
| }, |
| { |
| "epoch": 9.833916083916083, |
| "grad_norm": 0.3448028564453125, |
| "learning_rate": 0.00048227755102040815, |
| "loss": 3.3956, |
| "step": 33750 |
| }, |
| { |
| "epoch": 9.848484848484848, |
| "grad_norm": 0.32237598299980164, |
| "learning_rate": 0.0004821026239067055, |
| "loss": 3.4087, |
| "step": 33800 |
| }, |
| { |
| "epoch": 9.863053613053612, |
| "grad_norm": 0.3462821841239929, |
| "learning_rate": 0.00048192769679300285, |
| "loss": 3.4071, |
| "step": 33850 |
| }, |
| { |
| "epoch": 9.877622377622378, |
| "grad_norm": 0.3197895288467407, |
| "learning_rate": 0.0004817527696793002, |
| "loss": 3.4081, |
| "step": 33900 |
| }, |
| { |
| "epoch": 9.892191142191143, |
| "grad_norm": 0.33993223309516907, |
| "learning_rate": 0.0004815778425655976, |
| "loss": 3.4168, |
| "step": 33950 |
| }, |
| { |
| "epoch": 9.906759906759907, |
| "grad_norm": 0.3364954888820648, |
| "learning_rate": 0.000481402915451895, |
| "loss": 3.4083, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.906759906759907, |
| "eval_accuracy": 0.36888501048087374, |
| "eval_loss": 3.5607762336730957, |
| "eval_runtime": 180.1014, |
| "eval_samples_per_second": 92.404, |
| "eval_steps_per_second": 5.78, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.921328671328672, |
| "grad_norm": 0.37011784315109253, |
| "learning_rate": 0.0004812279883381924, |
| "loss": 3.4251, |
| "step": 34050 |
| }, |
| { |
| "epoch": 9.935897435897436, |
| "grad_norm": 0.33454248309135437, |
| "learning_rate": 0.0004810530612244898, |
| "loss": 3.4185, |
| "step": 34100 |
| }, |
| { |
| "epoch": 9.9504662004662, |
| "grad_norm": 0.3181527853012085, |
| "learning_rate": 0.00048087813411078716, |
| "loss": 3.406, |
| "step": 34150 |
| }, |
| { |
| "epoch": 9.965034965034965, |
| "grad_norm": 0.34176966547966003, |
| "learning_rate": 0.00048070320699708453, |
| "loss": 3.4085, |
| "step": 34200 |
| }, |
| { |
| "epoch": 9.97960372960373, |
| "grad_norm": 0.32492902874946594, |
| "learning_rate": 0.00048052827988338186, |
| "loss": 3.4045, |
| "step": 34250 |
| }, |
| { |
| "epoch": 9.994172494172494, |
| "grad_norm": 0.345251202583313, |
| "learning_rate": 0.00048035335276967923, |
| "loss": 3.3892, |
| "step": 34300 |
| }, |
| { |
| "epoch": 10.008741258741258, |
| "grad_norm": 0.364636093378067, |
| "learning_rate": 0.0004801784256559766, |
| "loss": 3.3166, |
| "step": 34350 |
| }, |
| { |
| "epoch": 10.023310023310023, |
| "grad_norm": 0.3602936267852783, |
| "learning_rate": 0.000480003498542274, |
| "loss": 3.294, |
| "step": 34400 |
| }, |
| { |
| "epoch": 10.037878787878787, |
| "grad_norm": 0.32505398988723755, |
| "learning_rate": 0.00047982857142857136, |
| "loss": 3.3059, |
| "step": 34450 |
| }, |
| { |
| "epoch": 10.052447552447552, |
| "grad_norm": 0.3503585457801819, |
| "learning_rate": 0.0004796536443148688, |
| "loss": 3.292, |
| "step": 34500 |
| }, |
| { |
| "epoch": 10.067016317016318, |
| "grad_norm": 0.35856735706329346, |
| "learning_rate": 0.00047947871720116616, |
| "loss": 3.3011, |
| "step": 34550 |
| }, |
| { |
| "epoch": 10.081585081585082, |
| "grad_norm": 0.32884514331817627, |
| "learning_rate": 0.00047930379008746354, |
| "loss": 3.3162, |
| "step": 34600 |
| }, |
| { |
| "epoch": 10.096153846153847, |
| "grad_norm": 0.34173399209976196, |
| "learning_rate": 0.0004791288629737609, |
| "loss": 3.3267, |
| "step": 34650 |
| }, |
| { |
| "epoch": 10.110722610722611, |
| "grad_norm": 0.32754212617874146, |
| "learning_rate": 0.00047895393586005824, |
| "loss": 3.3069, |
| "step": 34700 |
| }, |
| { |
| "epoch": 10.125291375291376, |
| "grad_norm": 0.35599884390830994, |
| "learning_rate": 0.0004787790087463556, |
| "loss": 3.3275, |
| "step": 34750 |
| }, |
| { |
| "epoch": 10.13986013986014, |
| "grad_norm": 0.3487524092197418, |
| "learning_rate": 0.000478604081632653, |
| "loss": 3.3185, |
| "step": 34800 |
| }, |
| { |
| "epoch": 10.154428904428904, |
| "grad_norm": 0.35253459215164185, |
| "learning_rate": 0.00047842915451895037, |
| "loss": 3.326, |
| "step": 34850 |
| }, |
| { |
| "epoch": 10.168997668997669, |
| "grad_norm": 0.37047427892684937, |
| "learning_rate": 0.0004782542274052478, |
| "loss": 3.3348, |
| "step": 34900 |
| }, |
| { |
| "epoch": 10.183566433566433, |
| "grad_norm": 0.3368969261646271, |
| "learning_rate": 0.00047807930029154517, |
| "loss": 3.3406, |
| "step": 34950 |
| }, |
| { |
| "epoch": 10.198135198135198, |
| "grad_norm": 0.3401995897293091, |
| "learning_rate": 0.00047790437317784255, |
| "loss": 3.3294, |
| "step": 35000 |
| }, |
| { |
| "epoch": 10.198135198135198, |
| "eval_accuracy": 0.3686502991158813, |
| "eval_loss": 3.5734663009643555, |
| "eval_runtime": 179.963, |
| "eval_samples_per_second": 92.475, |
| "eval_steps_per_second": 5.785, |
| "step": 35000 |
| }, |
| { |
| "epoch": 10.212703962703962, |
| "grad_norm": 0.3233661949634552, |
| "learning_rate": 0.0004777294460641399, |
| "loss": 3.3418, |
| "step": 35050 |
| }, |
| { |
| "epoch": 10.227272727272727, |
| "grad_norm": 0.35350361466407776, |
| "learning_rate": 0.0004775545189504373, |
| "loss": 3.3428, |
| "step": 35100 |
| }, |
| { |
| "epoch": 10.241841491841491, |
| "grad_norm": 0.3602052628993988, |
| "learning_rate": 0.0004773795918367346, |
| "loss": 3.3347, |
| "step": 35150 |
| }, |
| { |
| "epoch": 10.256410256410255, |
| "grad_norm": 0.3384787142276764, |
| "learning_rate": 0.000477204664723032, |
| "loss": 3.335, |
| "step": 35200 |
| }, |
| { |
| "epoch": 10.270979020979022, |
| "grad_norm": 0.35919293761253357, |
| "learning_rate": 0.00047702973760932937, |
| "loss": 3.3324, |
| "step": 35250 |
| }, |
| { |
| "epoch": 10.285547785547786, |
| "grad_norm": 0.3487773835659027, |
| "learning_rate": 0.00047685481049562675, |
| "loss": 3.3435, |
| "step": 35300 |
| }, |
| { |
| "epoch": 10.30011655011655, |
| "grad_norm": 0.33635222911834717, |
| "learning_rate": 0.0004766798833819242, |
| "loss": 3.3492, |
| "step": 35350 |
| }, |
| { |
| "epoch": 10.314685314685315, |
| "grad_norm": 0.3445112705230713, |
| "learning_rate": 0.00047650495626822155, |
| "loss": 3.3516, |
| "step": 35400 |
| }, |
| { |
| "epoch": 10.32925407925408, |
| "grad_norm": 0.3628615438938141, |
| "learning_rate": 0.00047633002915451893, |
| "loss": 3.3535, |
| "step": 35450 |
| }, |
| { |
| "epoch": 10.343822843822844, |
| "grad_norm": 0.3586277663707733, |
| "learning_rate": 0.0004761551020408163, |
| "loss": 3.3652, |
| "step": 35500 |
| }, |
| { |
| "epoch": 10.358391608391608, |
| "grad_norm": 0.3344104588031769, |
| "learning_rate": 0.0004759801749271137, |
| "loss": 3.3564, |
| "step": 35550 |
| }, |
| { |
| "epoch": 10.372960372960373, |
| "grad_norm": 0.32970529794692993, |
| "learning_rate": 0.000475805247813411, |
| "loss": 3.3439, |
| "step": 35600 |
| }, |
| { |
| "epoch": 10.387529137529137, |
| "grad_norm": 0.3633076250553131, |
| "learning_rate": 0.0004756303206997084, |
| "loss": 3.3533, |
| "step": 35650 |
| }, |
| { |
| "epoch": 10.402097902097902, |
| "grad_norm": 0.34395188093185425, |
| "learning_rate": 0.00047545539358600575, |
| "loss": 3.3582, |
| "step": 35700 |
| }, |
| { |
| "epoch": 10.416666666666666, |
| "grad_norm": 0.3351273536682129, |
| "learning_rate": 0.00047528046647230313, |
| "loss": 3.3544, |
| "step": 35750 |
| }, |
| { |
| "epoch": 10.43123543123543, |
| "grad_norm": 0.34297794103622437, |
| "learning_rate": 0.00047510553935860056, |
| "loss": 3.3573, |
| "step": 35800 |
| }, |
| { |
| "epoch": 10.445804195804195, |
| "grad_norm": 0.3346062898635864, |
| "learning_rate": 0.00047493061224489794, |
| "loss": 3.3565, |
| "step": 35850 |
| }, |
| { |
| "epoch": 10.460372960372961, |
| "grad_norm": 0.3574366271495819, |
| "learning_rate": 0.0004747556851311953, |
| "loss": 3.3548, |
| "step": 35900 |
| }, |
| { |
| "epoch": 10.474941724941726, |
| "grad_norm": 0.35288873314857483, |
| "learning_rate": 0.0004745807580174927, |
| "loss": 3.3584, |
| "step": 35950 |
| }, |
| { |
| "epoch": 10.48951048951049, |
| "grad_norm": 0.32899612188339233, |
| "learning_rate": 0.00047440583090379006, |
| "loss": 3.3591, |
| "step": 36000 |
| }, |
| { |
| "epoch": 10.48951048951049, |
| "eval_accuracy": 0.3689316940539709, |
| "eval_loss": 3.566610336303711, |
| "eval_runtime": 179.9576, |
| "eval_samples_per_second": 92.477, |
| "eval_steps_per_second": 5.785, |
| "step": 36000 |
| }, |
| { |
| "epoch": 10.504079254079254, |
| "grad_norm": 0.3387123942375183, |
| "learning_rate": 0.0004742309037900874, |
| "loss": 3.3697, |
| "step": 36050 |
| }, |
| { |
| "epoch": 10.518648018648019, |
| "grad_norm": 0.3552062213420868, |
| "learning_rate": 0.00047405597667638476, |
| "loss": 3.3692, |
| "step": 36100 |
| }, |
| { |
| "epoch": 10.533216783216783, |
| "grad_norm": 0.30987149477005005, |
| "learning_rate": 0.00047388104956268214, |
| "loss": 3.3757, |
| "step": 36150 |
| }, |
| { |
| "epoch": 10.547785547785548, |
| "grad_norm": 0.38266798853874207, |
| "learning_rate": 0.00047370612244897957, |
| "loss": 3.371, |
| "step": 36200 |
| }, |
| { |
| "epoch": 10.562354312354312, |
| "grad_norm": 0.35734835267066956, |
| "learning_rate": 0.00047353119533527694, |
| "loss": 3.3526, |
| "step": 36250 |
| }, |
| { |
| "epoch": 10.576923076923077, |
| "grad_norm": 0.33983373641967773, |
| "learning_rate": 0.0004733562682215743, |
| "loss": 3.3765, |
| "step": 36300 |
| }, |
| { |
| "epoch": 10.591491841491841, |
| "grad_norm": 0.34860071539878845, |
| "learning_rate": 0.0004731813411078717, |
| "loss": 3.3727, |
| "step": 36350 |
| }, |
| { |
| "epoch": 10.606060606060606, |
| "grad_norm": 0.3304286301136017, |
| "learning_rate": 0.00047300641399416907, |
| "loss": 3.3738, |
| "step": 36400 |
| }, |
| { |
| "epoch": 10.62062937062937, |
| "grad_norm": 0.36452704668045044, |
| "learning_rate": 0.00047283148688046645, |
| "loss": 3.3727, |
| "step": 36450 |
| }, |
| { |
| "epoch": 10.635198135198134, |
| "grad_norm": 0.33509254455566406, |
| "learning_rate": 0.00047265655976676377, |
| "loss": 3.3678, |
| "step": 36500 |
| }, |
| { |
| "epoch": 10.649766899766899, |
| "grad_norm": 0.3406018912792206, |
| "learning_rate": 0.00047248163265306114, |
| "loss": 3.3738, |
| "step": 36550 |
| }, |
| { |
| "epoch": 10.664335664335665, |
| "grad_norm": 0.3569091856479645, |
| "learning_rate": 0.0004723067055393585, |
| "loss": 3.3718, |
| "step": 36600 |
| }, |
| { |
| "epoch": 10.67890442890443, |
| "grad_norm": 0.31912165880203247, |
| "learning_rate": 0.00047213177842565595, |
| "loss": 3.3627, |
| "step": 36650 |
| }, |
| { |
| "epoch": 10.693473193473194, |
| "grad_norm": 0.3501559793949127, |
| "learning_rate": 0.0004719568513119533, |
| "loss": 3.3774, |
| "step": 36700 |
| }, |
| { |
| "epoch": 10.708041958041958, |
| "grad_norm": 0.36387351155281067, |
| "learning_rate": 0.0004717819241982507, |
| "loss": 3.3652, |
| "step": 36750 |
| }, |
| { |
| "epoch": 10.722610722610723, |
| "grad_norm": 0.3268325924873352, |
| "learning_rate": 0.0004716069970845481, |
| "loss": 3.3754, |
| "step": 36800 |
| }, |
| { |
| "epoch": 10.737179487179487, |
| "grad_norm": 0.3691311776638031, |
| "learning_rate": 0.00047143206997084545, |
| "loss": 3.3789, |
| "step": 36850 |
| }, |
| { |
| "epoch": 10.751748251748252, |
| "grad_norm": 0.34827283024787903, |
| "learning_rate": 0.00047125714285714283, |
| "loss": 3.3962, |
| "step": 36900 |
| }, |
| { |
| "epoch": 10.766317016317016, |
| "grad_norm": 0.3422998785972595, |
| "learning_rate": 0.00047108221574344015, |
| "loss": 3.3801, |
| "step": 36950 |
| }, |
| { |
| "epoch": 10.78088578088578, |
| "grad_norm": 0.3483596444129944, |
| "learning_rate": 0.0004709072886297375, |
| "loss": 3.3894, |
| "step": 37000 |
| }, |
| { |
| "epoch": 10.78088578088578, |
| "eval_accuracy": 0.3693519638027098, |
| "eval_loss": 3.556098699569702, |
| "eval_runtime": 180.0859, |
| "eval_samples_per_second": 92.411, |
| "eval_steps_per_second": 5.781, |
| "step": 37000 |
| }, |
| { |
| "epoch": 10.795454545454545, |
| "grad_norm": 0.3572978973388672, |
| "learning_rate": 0.00047073236151603495, |
| "loss": 3.3816, |
| "step": 37050 |
| }, |
| { |
| "epoch": 10.81002331002331, |
| "grad_norm": 0.3622797727584839, |
| "learning_rate": 0.00047055743440233233, |
| "loss": 3.389, |
| "step": 37100 |
| }, |
| { |
| "epoch": 10.824592074592074, |
| "grad_norm": 0.3194954991340637, |
| "learning_rate": 0.0004703825072886297, |
| "loss": 3.3866, |
| "step": 37150 |
| }, |
| { |
| "epoch": 10.83916083916084, |
| "grad_norm": 0.34308746457099915, |
| "learning_rate": 0.0004702075801749271, |
| "loss": 3.3863, |
| "step": 37200 |
| }, |
| { |
| "epoch": 10.853729603729604, |
| "grad_norm": 0.33468618988990784, |
| "learning_rate": 0.00047003265306122446, |
| "loss": 3.3797, |
| "step": 37250 |
| }, |
| { |
| "epoch": 10.868298368298369, |
| "grad_norm": 0.3478671908378601, |
| "learning_rate": 0.00046985772594752183, |
| "loss": 3.3934, |
| "step": 37300 |
| }, |
| { |
| "epoch": 10.882867132867133, |
| "grad_norm": 0.34233909845352173, |
| "learning_rate": 0.0004696827988338192, |
| "loss": 3.3845, |
| "step": 37350 |
| }, |
| { |
| "epoch": 10.897435897435898, |
| "grad_norm": 0.3444744646549225, |
| "learning_rate": 0.00046950787172011653, |
| "loss": 3.3938, |
| "step": 37400 |
| }, |
| { |
| "epoch": 10.912004662004662, |
| "grad_norm": 0.3458085060119629, |
| "learning_rate": 0.0004693329446064139, |
| "loss": 3.3931, |
| "step": 37450 |
| }, |
| { |
| "epoch": 10.926573426573427, |
| "grad_norm": 0.3380485773086548, |
| "learning_rate": 0.00046915801749271134, |
| "loss": 3.3893, |
| "step": 37500 |
| }, |
| { |
| "epoch": 10.941142191142191, |
| "grad_norm": 0.33901792764663696, |
| "learning_rate": 0.0004689830903790087, |
| "loss": 3.3976, |
| "step": 37550 |
| }, |
| { |
| "epoch": 10.955710955710956, |
| "grad_norm": 0.3939083516597748, |
| "learning_rate": 0.0004688081632653061, |
| "loss": 3.3945, |
| "step": 37600 |
| }, |
| { |
| "epoch": 10.97027972027972, |
| "grad_norm": 0.3543170392513275, |
| "learning_rate": 0.00046863323615160346, |
| "loss": 3.3905, |
| "step": 37650 |
| }, |
| { |
| "epoch": 10.984848484848484, |
| "grad_norm": 0.33849748969078064, |
| "learning_rate": 0.00046845830903790084, |
| "loss": 3.3845, |
| "step": 37700 |
| }, |
| { |
| "epoch": 10.999417249417249, |
| "grad_norm": 0.3542396128177643, |
| "learning_rate": 0.0004682833819241982, |
| "loss": 3.3943, |
| "step": 37750 |
| }, |
| { |
| "epoch": 11.013986013986013, |
| "grad_norm": 0.3594779968261719, |
| "learning_rate": 0.0004681084548104956, |
| "loss": 3.2759, |
| "step": 37800 |
| }, |
| { |
| "epoch": 11.028554778554778, |
| "grad_norm": 0.335800439119339, |
| "learning_rate": 0.0004679335276967929, |
| "loss": 3.2729, |
| "step": 37850 |
| }, |
| { |
| "epoch": 11.043123543123544, |
| "grad_norm": 0.3539378345012665, |
| "learning_rate": 0.0004677586005830903, |
| "loss": 3.2811, |
| "step": 37900 |
| }, |
| { |
| "epoch": 11.057692307692308, |
| "grad_norm": 0.3634989857673645, |
| "learning_rate": 0.0004675836734693877, |
| "loss": 3.2793, |
| "step": 37950 |
| }, |
| { |
| "epoch": 11.072261072261073, |
| "grad_norm": 0.35441911220550537, |
| "learning_rate": 0.0004674087463556851, |
| "loss": 3.2944, |
| "step": 38000 |
| }, |
| { |
| "epoch": 11.072261072261073, |
| "eval_accuracy": 0.3693041043209704, |
| "eval_loss": 3.5667061805725098, |
| "eval_runtime": 179.9313, |
| "eval_samples_per_second": 92.491, |
| "eval_steps_per_second": 5.786, |
| "step": 38000 |
| }, |
| { |
| "epoch": 11.086829836829837, |
| "grad_norm": 0.33456510305404663, |
| "learning_rate": 0.00046723381924198247, |
| "loss": 3.3043, |
| "step": 38050 |
| }, |
| { |
| "epoch": 11.101398601398602, |
| "grad_norm": 0.35062792897224426, |
| "learning_rate": 0.00046705889212827985, |
| "loss": 3.2976, |
| "step": 38100 |
| }, |
| { |
| "epoch": 11.115967365967366, |
| "grad_norm": 0.35433903336524963, |
| "learning_rate": 0.0004668839650145772, |
| "loss": 3.3004, |
| "step": 38150 |
| }, |
| { |
| "epoch": 11.13053613053613, |
| "grad_norm": 0.34735456109046936, |
| "learning_rate": 0.0004667090379008746, |
| "loss": 3.3141, |
| "step": 38200 |
| }, |
| { |
| "epoch": 11.145104895104895, |
| "grad_norm": 0.36027348041534424, |
| "learning_rate": 0.000466534110787172, |
| "loss": 3.3177, |
| "step": 38250 |
| }, |
| { |
| "epoch": 11.15967365967366, |
| "grad_norm": 0.3495638370513916, |
| "learning_rate": 0.0004663591836734693, |
| "loss": 3.3033, |
| "step": 38300 |
| }, |
| { |
| "epoch": 11.174242424242424, |
| "grad_norm": 0.352952778339386, |
| "learning_rate": 0.0004661842565597667, |
| "loss": 3.3067, |
| "step": 38350 |
| }, |
| { |
| "epoch": 11.188811188811188, |
| "grad_norm": 0.35756683349609375, |
| "learning_rate": 0.0004660093294460641, |
| "loss": 3.3134, |
| "step": 38400 |
| }, |
| { |
| "epoch": 11.203379953379953, |
| "grad_norm": 0.3577975630760193, |
| "learning_rate": 0.0004658344023323615, |
| "loss": 3.3004, |
| "step": 38450 |
| }, |
| { |
| "epoch": 11.217948717948717, |
| "grad_norm": 0.32897868752479553, |
| "learning_rate": 0.00046565947521865885, |
| "loss": 3.3096, |
| "step": 38500 |
| }, |
| { |
| "epoch": 11.232517482517483, |
| "grad_norm": 0.32986345887184143, |
| "learning_rate": 0.00046548454810495623, |
| "loss": 3.3042, |
| "step": 38550 |
| }, |
| { |
| "epoch": 11.247086247086248, |
| "grad_norm": 0.3170894384384155, |
| "learning_rate": 0.0004653096209912536, |
| "loss": 3.3199, |
| "step": 38600 |
| }, |
| { |
| "epoch": 11.261655011655012, |
| "grad_norm": 0.3499142527580261, |
| "learning_rate": 0.000465134693877551, |
| "loss": 3.3325, |
| "step": 38650 |
| }, |
| { |
| "epoch": 11.276223776223777, |
| "grad_norm": 0.34623247385025024, |
| "learning_rate": 0.0004649597667638484, |
| "loss": 3.3242, |
| "step": 38700 |
| }, |
| { |
| "epoch": 11.290792540792541, |
| "grad_norm": 0.3531145453453064, |
| "learning_rate": 0.0004647848396501457, |
| "loss": 3.3253, |
| "step": 38750 |
| }, |
| { |
| "epoch": 11.305361305361306, |
| "grad_norm": 0.3613262474536896, |
| "learning_rate": 0.0004646099125364431, |
| "loss": 3.318, |
| "step": 38800 |
| }, |
| { |
| "epoch": 11.31993006993007, |
| "grad_norm": 0.34619051218032837, |
| "learning_rate": 0.0004644349854227405, |
| "loss": 3.3252, |
| "step": 38850 |
| }, |
| { |
| "epoch": 11.334498834498834, |
| "grad_norm": 0.34788867831230164, |
| "learning_rate": 0.00046426005830903786, |
| "loss": 3.3346, |
| "step": 38900 |
| }, |
| { |
| "epoch": 11.349067599067599, |
| "grad_norm": 0.35362058877944946, |
| "learning_rate": 0.00046408513119533523, |
| "loss": 3.3304, |
| "step": 38950 |
| }, |
| { |
| "epoch": 11.363636363636363, |
| "grad_norm": 0.32547181844711304, |
| "learning_rate": 0.0004639102040816326, |
| "loss": 3.3381, |
| "step": 39000 |
| }, |
| { |
| "epoch": 11.363636363636363, |
| "eval_accuracy": 0.36909796753598456, |
| "eval_loss": 3.563558578491211, |
| "eval_runtime": 180.0115, |
| "eval_samples_per_second": 92.45, |
| "eval_steps_per_second": 5.783, |
| "step": 39000 |
| }, |
| { |
| "epoch": 11.378205128205128, |
| "grad_norm": 0.37591269612312317, |
| "learning_rate": 0.00046373527696793, |
| "loss": 3.3343, |
| "step": 39050 |
| }, |
| { |
| "epoch": 11.392773892773892, |
| "grad_norm": 0.3608076870441437, |
| "learning_rate": 0.00046356034985422736, |
| "loss": 3.3386, |
| "step": 39100 |
| }, |
| { |
| "epoch": 11.407342657342657, |
| "grad_norm": 0.3387145400047302, |
| "learning_rate": 0.0004633854227405248, |
| "loss": 3.3433, |
| "step": 39150 |
| }, |
| { |
| "epoch": 11.421911421911421, |
| "grad_norm": 0.338716596364975, |
| "learning_rate": 0.0004632104956268221, |
| "loss": 3.3425, |
| "step": 39200 |
| }, |
| { |
| "epoch": 11.436480186480187, |
| "grad_norm": 0.3434636890888214, |
| "learning_rate": 0.0004630355685131195, |
| "loss": 3.3399, |
| "step": 39250 |
| }, |
| { |
| "epoch": 11.451048951048952, |
| "grad_norm": 0.3447709083557129, |
| "learning_rate": 0.00046286064139941687, |
| "loss": 3.3545, |
| "step": 39300 |
| }, |
| { |
| "epoch": 11.465617715617716, |
| "grad_norm": 0.3637690842151642, |
| "learning_rate": 0.00046268571428571424, |
| "loss": 3.3408, |
| "step": 39350 |
| }, |
| { |
| "epoch": 11.48018648018648, |
| "grad_norm": 0.3768569529056549, |
| "learning_rate": 0.0004625107871720116, |
| "loss": 3.3461, |
| "step": 39400 |
| }, |
| { |
| "epoch": 11.494755244755245, |
| "grad_norm": 0.33349111676216125, |
| "learning_rate": 0.000462335860058309, |
| "loss": 3.3351, |
| "step": 39450 |
| }, |
| { |
| "epoch": 11.50932400932401, |
| "grad_norm": 0.3535425364971161, |
| "learning_rate": 0.00046216093294460637, |
| "loss": 3.3506, |
| "step": 39500 |
| }, |
| { |
| "epoch": 11.523892773892774, |
| "grad_norm": 0.35613155364990234, |
| "learning_rate": 0.0004619860058309038, |
| "loss": 3.3595, |
| "step": 39550 |
| }, |
| { |
| "epoch": 11.538461538461538, |
| "grad_norm": 0.3751663565635681, |
| "learning_rate": 0.0004618110787172012, |
| "loss": 3.3434, |
| "step": 39600 |
| }, |
| { |
| "epoch": 11.553030303030303, |
| "grad_norm": 0.38919779658317566, |
| "learning_rate": 0.0004616361516034985, |
| "loss": 3.3436, |
| "step": 39650 |
| }, |
| { |
| "epoch": 11.567599067599067, |
| "grad_norm": 0.3445196747779846, |
| "learning_rate": 0.00046146122448979587, |
| "loss": 3.3509, |
| "step": 39700 |
| }, |
| { |
| "epoch": 11.582167832167832, |
| "grad_norm": 0.3505731523036957, |
| "learning_rate": 0.00046128629737609325, |
| "loss": 3.3374, |
| "step": 39750 |
| }, |
| { |
| "epoch": 11.596736596736596, |
| "grad_norm": 0.35699549317359924, |
| "learning_rate": 0.0004611113702623906, |
| "loss": 3.3467, |
| "step": 39800 |
| }, |
| { |
| "epoch": 11.61130536130536, |
| "grad_norm": 0.33985093235969543, |
| "learning_rate": 0.000460936443148688, |
| "loss": 3.3476, |
| "step": 39850 |
| }, |
| { |
| "epoch": 11.625874125874127, |
| "grad_norm": 0.37056589126586914, |
| "learning_rate": 0.0004607615160349854, |
| "loss": 3.3499, |
| "step": 39900 |
| }, |
| { |
| "epoch": 11.640442890442891, |
| "grad_norm": 0.3587411046028137, |
| "learning_rate": 0.00046058658892128275, |
| "loss": 3.3645, |
| "step": 39950 |
| }, |
| { |
| "epoch": 11.655011655011656, |
| "grad_norm": 0.3274442255496979, |
| "learning_rate": 0.0004604116618075802, |
| "loss": 3.3591, |
| "step": 40000 |
| }, |
| { |
| "epoch": 11.655011655011656, |
| "eval_accuracy": 0.37000906155199714, |
| "eval_loss": 3.5557005405426025, |
| "eval_runtime": 179.9923, |
| "eval_samples_per_second": 92.46, |
| "eval_steps_per_second": 5.784, |
| "step": 40000 |
| }, |
| { |
| "epoch": 11.66958041958042, |
| "grad_norm": 0.33285725116729736, |
| "learning_rate": 0.00046023673469387756, |
| "loss": 3.3547, |
| "step": 40050 |
| }, |
| { |
| "epoch": 11.684149184149184, |
| "grad_norm": 0.35055187344551086, |
| "learning_rate": 0.0004600618075801749, |
| "loss": 3.3695, |
| "step": 40100 |
| }, |
| { |
| "epoch": 11.698717948717949, |
| "grad_norm": 0.33380094170570374, |
| "learning_rate": 0.00045988688046647225, |
| "loss": 3.3513, |
| "step": 40150 |
| }, |
| { |
| "epoch": 11.713286713286713, |
| "grad_norm": 0.36289268732070923, |
| "learning_rate": 0.00045971195335276963, |
| "loss": 3.3489, |
| "step": 40200 |
| }, |
| { |
| "epoch": 11.727855477855478, |
| "grad_norm": 0.33579185605049133, |
| "learning_rate": 0.000459537026239067, |
| "loss": 3.3538, |
| "step": 40250 |
| }, |
| { |
| "epoch": 11.742424242424242, |
| "grad_norm": 0.3782014846801758, |
| "learning_rate": 0.0004593620991253644, |
| "loss": 3.3616, |
| "step": 40300 |
| }, |
| { |
| "epoch": 11.756993006993007, |
| "grad_norm": 0.34368279576301575, |
| "learning_rate": 0.00045918717201166176, |
| "loss": 3.3674, |
| "step": 40350 |
| }, |
| { |
| "epoch": 11.771561771561771, |
| "grad_norm": 0.3444232940673828, |
| "learning_rate": 0.00045901224489795913, |
| "loss": 3.3633, |
| "step": 40400 |
| }, |
| { |
| "epoch": 11.786130536130536, |
| "grad_norm": 0.33604544401168823, |
| "learning_rate": 0.00045883731778425656, |
| "loss": 3.3594, |
| "step": 40450 |
| }, |
| { |
| "epoch": 11.8006993006993, |
| "grad_norm": 0.33371636271476746, |
| "learning_rate": 0.00045866239067055394, |
| "loss": 3.3687, |
| "step": 40500 |
| }, |
| { |
| "epoch": 11.815268065268064, |
| "grad_norm": 0.3588174879550934, |
| "learning_rate": 0.00045848746355685126, |
| "loss": 3.3681, |
| "step": 40550 |
| }, |
| { |
| "epoch": 11.82983682983683, |
| "grad_norm": 0.3513983190059662, |
| "learning_rate": 0.00045831253644314864, |
| "loss": 3.3679, |
| "step": 40600 |
| }, |
| { |
| "epoch": 11.844405594405595, |
| "grad_norm": 0.3378084897994995, |
| "learning_rate": 0.000458137609329446, |
| "loss": 3.3578, |
| "step": 40650 |
| }, |
| { |
| "epoch": 11.85897435897436, |
| "grad_norm": 0.3297298550605774, |
| "learning_rate": 0.0004579626822157434, |
| "loss": 3.3709, |
| "step": 40700 |
| }, |
| { |
| "epoch": 11.873543123543124, |
| "grad_norm": 0.360269695520401, |
| "learning_rate": 0.00045778775510204076, |
| "loss": 3.3568, |
| "step": 40750 |
| }, |
| { |
| "epoch": 11.888111888111888, |
| "grad_norm": 0.3312649130821228, |
| "learning_rate": 0.00045761282798833814, |
| "loss": 3.3656, |
| "step": 40800 |
| }, |
| { |
| "epoch": 11.902680652680653, |
| "grad_norm": 0.36778295040130615, |
| "learning_rate": 0.00045743790087463557, |
| "loss": 3.3674, |
| "step": 40850 |
| }, |
| { |
| "epoch": 11.917249417249417, |
| "grad_norm": 0.33830592036247253, |
| "learning_rate": 0.00045726297376093294, |
| "loss": 3.3699, |
| "step": 40900 |
| }, |
| { |
| "epoch": 11.931818181818182, |
| "grad_norm": 0.37597140669822693, |
| "learning_rate": 0.0004570880466472303, |
| "loss": 3.3764, |
| "step": 40950 |
| }, |
| { |
| "epoch": 11.946386946386946, |
| "grad_norm": 0.34776413440704346, |
| "learning_rate": 0.00045691311953352764, |
| "loss": 3.357, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.946386946386946, |
| "eval_accuracy": 0.3705675005662, |
| "eval_loss": 3.548083782196045, |
| "eval_runtime": 179.9881, |
| "eval_samples_per_second": 92.462, |
| "eval_steps_per_second": 5.784, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.96095571095571, |
| "grad_norm": 0.35101014375686646, |
| "learning_rate": 0.000456738192419825, |
| "loss": 3.3761, |
| "step": 41050 |
| }, |
| { |
| "epoch": 11.975524475524475, |
| "grad_norm": 0.3674442172050476, |
| "learning_rate": 0.0004565632653061224, |
| "loss": 3.3691, |
| "step": 41100 |
| }, |
| { |
| "epoch": 11.99009324009324, |
| "grad_norm": 0.3214508891105652, |
| "learning_rate": 0.00045638833819241977, |
| "loss": 3.3713, |
| "step": 41150 |
| }, |
| { |
| "epoch": 12.004662004662004, |
| "grad_norm": 0.3173612952232361, |
| "learning_rate": 0.00045621341107871715, |
| "loss": 3.3369, |
| "step": 41200 |
| }, |
| { |
| "epoch": 12.01923076923077, |
| "grad_norm": 0.3481275737285614, |
| "learning_rate": 0.0004560384839650145, |
| "loss": 3.242, |
| "step": 41250 |
| }, |
| { |
| "epoch": 12.033799533799534, |
| "grad_norm": 0.33980754017829895, |
| "learning_rate": 0.00045586355685131195, |
| "loss": 3.2611, |
| "step": 41300 |
| }, |
| { |
| "epoch": 12.048368298368299, |
| "grad_norm": 0.35299932956695557, |
| "learning_rate": 0.0004556886297376093, |
| "loss": 3.2661, |
| "step": 41350 |
| }, |
| { |
| "epoch": 12.062937062937063, |
| "grad_norm": 0.3485766649246216, |
| "learning_rate": 0.0004555137026239067, |
| "loss": 3.2656, |
| "step": 41400 |
| }, |
| { |
| "epoch": 12.077505827505828, |
| "grad_norm": 0.33770278096199036, |
| "learning_rate": 0.000455338775510204, |
| "loss": 3.2791, |
| "step": 41450 |
| }, |
| { |
| "epoch": 12.092074592074592, |
| "grad_norm": 0.34751835465431213, |
| "learning_rate": 0.0004551638483965014, |
| "loss": 3.2841, |
| "step": 41500 |
| }, |
| { |
| "epoch": 12.106643356643357, |
| "grad_norm": 0.37227487564086914, |
| "learning_rate": 0.0004549889212827988, |
| "loss": 3.2799, |
| "step": 41550 |
| }, |
| { |
| "epoch": 12.121212121212121, |
| "grad_norm": 0.34621065855026245, |
| "learning_rate": 0.00045481399416909615, |
| "loss": 3.282, |
| "step": 41600 |
| }, |
| { |
| "epoch": 12.135780885780886, |
| "grad_norm": 0.3268602788448334, |
| "learning_rate": 0.00045463906705539353, |
| "loss": 3.2881, |
| "step": 41650 |
| }, |
| { |
| "epoch": 12.15034965034965, |
| "grad_norm": 0.39036524295806885, |
| "learning_rate": 0.0004544641399416909, |
| "loss": 3.2841, |
| "step": 41700 |
| }, |
| { |
| "epoch": 12.164918414918414, |
| "grad_norm": 0.35717472434043884, |
| "learning_rate": 0.00045428921282798833, |
| "loss": 3.2958, |
| "step": 41750 |
| }, |
| { |
| "epoch": 12.179487179487179, |
| "grad_norm": 0.35328230261802673, |
| "learning_rate": 0.0004541142857142857, |
| "loss": 3.2796, |
| "step": 41800 |
| }, |
| { |
| "epoch": 12.194055944055943, |
| "grad_norm": 0.33877119421958923, |
| "learning_rate": 0.0004539393586005831, |
| "loss": 3.2983, |
| "step": 41850 |
| }, |
| { |
| "epoch": 12.20862470862471, |
| "grad_norm": 0.3424377143383026, |
| "learning_rate": 0.0004537644314868804, |
| "loss": 3.2956, |
| "step": 41900 |
| }, |
| { |
| "epoch": 12.223193473193474, |
| "grad_norm": 0.35039380192756653, |
| "learning_rate": 0.0004535895043731778, |
| "loss": 3.3151, |
| "step": 41950 |
| }, |
| { |
| "epoch": 12.237762237762238, |
| "grad_norm": 0.36922869086265564, |
| "learning_rate": 0.00045341457725947516, |
| "loss": 3.3056, |
| "step": 42000 |
| }, |
| { |
| "epoch": 12.237762237762238, |
| "eval_accuracy": 0.3698839448724621, |
| "eval_loss": 3.5648937225341797, |
| "eval_runtime": 180.0535, |
| "eval_samples_per_second": 92.428, |
| "eval_steps_per_second": 5.782, |
| "step": 42000 |
| }, |
| { |
| "epoch": 12.252331002331003, |
| "grad_norm": 0.33937689661979675, |
| "learning_rate": 0.00045323965014577253, |
| "loss": 3.3066, |
| "step": 42050 |
| }, |
| { |
| "epoch": 12.266899766899767, |
| "grad_norm": 0.36293527483940125, |
| "learning_rate": 0.0004530647230320699, |
| "loss": 3.3046, |
| "step": 42100 |
| }, |
| { |
| "epoch": 12.281468531468532, |
| "grad_norm": 0.36257535219192505, |
| "learning_rate": 0.00045288979591836734, |
| "loss": 3.3095, |
| "step": 42150 |
| }, |
| { |
| "epoch": 12.296037296037296, |
| "grad_norm": 0.38248223066329956, |
| "learning_rate": 0.0004527148688046647, |
| "loss": 3.3109, |
| "step": 42200 |
| }, |
| { |
| "epoch": 12.31060606060606, |
| "grad_norm": 0.35680779814720154, |
| "learning_rate": 0.0004525399416909621, |
| "loss": 3.3174, |
| "step": 42250 |
| }, |
| { |
| "epoch": 12.325174825174825, |
| "grad_norm": 0.3464656174182892, |
| "learning_rate": 0.00045236501457725947, |
| "loss": 3.3103, |
| "step": 42300 |
| }, |
| { |
| "epoch": 12.33974358974359, |
| "grad_norm": 0.36527353525161743, |
| "learning_rate": 0.0004521900874635568, |
| "loss": 3.3141, |
| "step": 42350 |
| }, |
| { |
| "epoch": 12.354312354312354, |
| "grad_norm": 0.34814345836639404, |
| "learning_rate": 0.00045201516034985416, |
| "loss": 3.3219, |
| "step": 42400 |
| }, |
| { |
| "epoch": 12.368881118881118, |
| "grad_norm": 0.3569382131099701, |
| "learning_rate": 0.00045184023323615154, |
| "loss": 3.318, |
| "step": 42450 |
| }, |
| { |
| "epoch": 12.383449883449883, |
| "grad_norm": 0.3441168963909149, |
| "learning_rate": 0.0004516653061224489, |
| "loss": 3.3209, |
| "step": 42500 |
| }, |
| { |
| "epoch": 12.398018648018647, |
| "grad_norm": 0.3545966148376465, |
| "learning_rate": 0.0004514903790087463, |
| "loss": 3.3156, |
| "step": 42550 |
| }, |
| { |
| "epoch": 12.412587412587413, |
| "grad_norm": 0.349949449300766, |
| "learning_rate": 0.0004513154518950437, |
| "loss": 3.3224, |
| "step": 42600 |
| }, |
| { |
| "epoch": 12.427156177156178, |
| "grad_norm": 0.375348836183548, |
| "learning_rate": 0.0004511405247813411, |
| "loss": 3.3445, |
| "step": 42650 |
| }, |
| { |
| "epoch": 12.441724941724942, |
| "grad_norm": 0.3431892395019531, |
| "learning_rate": 0.0004509655976676385, |
| "loss": 3.3242, |
| "step": 42700 |
| }, |
| { |
| "epoch": 12.456293706293707, |
| "grad_norm": 0.3887154161930084, |
| "learning_rate": 0.00045079067055393585, |
| "loss": 3.3199, |
| "step": 42750 |
| }, |
| { |
| "epoch": 12.470862470862471, |
| "grad_norm": 0.3668091297149658, |
| "learning_rate": 0.00045061574344023317, |
| "loss": 3.3249, |
| "step": 42800 |
| }, |
| { |
| "epoch": 12.485431235431236, |
| "grad_norm": 0.35282227396965027, |
| "learning_rate": 0.00045044081632653055, |
| "loss": 3.323, |
| "step": 42850 |
| }, |
| { |
| "epoch": 12.5, |
| "grad_norm": 0.34184911847114563, |
| "learning_rate": 0.0004502658892128279, |
| "loss": 3.3244, |
| "step": 42900 |
| }, |
| { |
| "epoch": 12.514568764568764, |
| "grad_norm": 0.3672352135181427, |
| "learning_rate": 0.0004500909620991253, |
| "loss": 3.3266, |
| "step": 42950 |
| }, |
| { |
| "epoch": 12.529137529137529, |
| "grad_norm": 0.3395889103412628, |
| "learning_rate": 0.00044991603498542273, |
| "loss": 3.3235, |
| "step": 43000 |
| }, |
| { |
| "epoch": 12.529137529137529, |
| "eval_accuracy": 0.37045567165432236, |
| "eval_loss": 3.553213119506836, |
| "eval_runtime": 180.0175, |
| "eval_samples_per_second": 92.447, |
| "eval_steps_per_second": 5.783, |
| "step": 43000 |
| }, |
| { |
| "epoch": 12.543706293706293, |
| "grad_norm": 0.3517553508281708, |
| "learning_rate": 0.0004497411078717201, |
| "loss": 3.3239, |
| "step": 43050 |
| }, |
| { |
| "epoch": 12.558275058275058, |
| "grad_norm": 0.3382112383842468, |
| "learning_rate": 0.0004495661807580175, |
| "loss": 3.3364, |
| "step": 43100 |
| }, |
| { |
| "epoch": 12.572843822843822, |
| "grad_norm": 0.3736812472343445, |
| "learning_rate": 0.00044939125364431486, |
| "loss": 3.3348, |
| "step": 43150 |
| }, |
| { |
| "epoch": 12.587412587412587, |
| "grad_norm": 0.36248350143432617, |
| "learning_rate": 0.00044921632653061223, |
| "loss": 3.3275, |
| "step": 43200 |
| }, |
| { |
| "epoch": 12.601981351981351, |
| "grad_norm": 0.3704957365989685, |
| "learning_rate": 0.00044904139941690955, |
| "loss": 3.3323, |
| "step": 43250 |
| }, |
| { |
| "epoch": 12.616550116550117, |
| "grad_norm": 0.36183616518974304, |
| "learning_rate": 0.00044886647230320693, |
| "loss": 3.329, |
| "step": 43300 |
| }, |
| { |
| "epoch": 12.631118881118882, |
| "grad_norm": 0.3404022455215454, |
| "learning_rate": 0.0004486915451895043, |
| "loss": 3.3414, |
| "step": 43350 |
| }, |
| { |
| "epoch": 12.645687645687646, |
| "grad_norm": 0.3847573399543762, |
| "learning_rate": 0.0004485166180758017, |
| "loss": 3.3339, |
| "step": 43400 |
| }, |
| { |
| "epoch": 12.66025641025641, |
| "grad_norm": 0.3649556636810303, |
| "learning_rate": 0.0004483416909620991, |
| "loss": 3.3361, |
| "step": 43450 |
| }, |
| { |
| "epoch": 12.674825174825175, |
| "grad_norm": 0.3577769696712494, |
| "learning_rate": 0.0004481667638483965, |
| "loss": 3.3328, |
| "step": 43500 |
| }, |
| { |
| "epoch": 12.68939393939394, |
| "grad_norm": 0.3543531000614166, |
| "learning_rate": 0.00044799183673469386, |
| "loss": 3.3446, |
| "step": 43550 |
| }, |
| { |
| "epoch": 12.703962703962704, |
| "grad_norm": 0.3769163191318512, |
| "learning_rate": 0.00044781690962099124, |
| "loss": 3.3362, |
| "step": 43600 |
| }, |
| { |
| "epoch": 12.718531468531468, |
| "grad_norm": 0.36154523491859436, |
| "learning_rate": 0.0004476419825072886, |
| "loss": 3.338, |
| "step": 43650 |
| }, |
| { |
| "epoch": 12.733100233100233, |
| "grad_norm": 0.3562755584716797, |
| "learning_rate": 0.00044746705539358593, |
| "loss": 3.3321, |
| "step": 43700 |
| }, |
| { |
| "epoch": 12.747668997668997, |
| "grad_norm": 0.34634169936180115, |
| "learning_rate": 0.0004472921282798833, |
| "loss": 3.3404, |
| "step": 43750 |
| }, |
| { |
| "epoch": 12.762237762237762, |
| "grad_norm": 0.35489341616630554, |
| "learning_rate": 0.0004471172011661807, |
| "loss": 3.3451, |
| "step": 43800 |
| }, |
| { |
| "epoch": 12.776806526806526, |
| "grad_norm": 0.349941223859787, |
| "learning_rate": 0.00044694227405247806, |
| "loss": 3.3459, |
| "step": 43850 |
| }, |
| { |
| "epoch": 12.791375291375292, |
| "grad_norm": 0.3710970878601074, |
| "learning_rate": 0.0004467673469387755, |
| "loss": 3.3494, |
| "step": 43900 |
| }, |
| { |
| "epoch": 12.805944055944057, |
| "grad_norm": 0.3861698806285858, |
| "learning_rate": 0.00044659241982507287, |
| "loss": 3.343, |
| "step": 43950 |
| }, |
| { |
| "epoch": 12.820512820512821, |
| "grad_norm": 0.3910358250141144, |
| "learning_rate": 0.00044641749271137024, |
| "loss": 3.3355, |
| "step": 44000 |
| }, |
| { |
| "epoch": 12.820512820512821, |
| "eval_accuracy": 0.3705464518015038, |
| "eval_loss": 3.54892635345459, |
| "eval_runtime": 179.9708, |
| "eval_samples_per_second": 92.471, |
| "eval_steps_per_second": 5.784, |
| "step": 44000 |
| }, |
| { |
| "epoch": 12.835081585081586, |
| "grad_norm": 0.3828602135181427, |
| "learning_rate": 0.0004462425655976676, |
| "loss": 3.3659, |
| "step": 44050 |
| }, |
| { |
| "epoch": 12.84965034965035, |
| "grad_norm": 0.3822254538536072, |
| "learning_rate": 0.000446067638483965, |
| "loss": 3.3387, |
| "step": 44100 |
| }, |
| { |
| "epoch": 12.864219114219114, |
| "grad_norm": 0.3691919445991516, |
| "learning_rate": 0.0004458927113702623, |
| "loss": 3.3439, |
| "step": 44150 |
| }, |
| { |
| "epoch": 12.878787878787879, |
| "grad_norm": 0.36081933975219727, |
| "learning_rate": 0.0004457177842565597, |
| "loss": 3.3431, |
| "step": 44200 |
| }, |
| { |
| "epoch": 12.893356643356643, |
| "grad_norm": 0.36241626739501953, |
| "learning_rate": 0.00044554285714285707, |
| "loss": 3.3497, |
| "step": 44250 |
| }, |
| { |
| "epoch": 12.907925407925408, |
| "grad_norm": 0.3698984980583191, |
| "learning_rate": 0.0004453679300291545, |
| "loss": 3.3471, |
| "step": 44300 |
| }, |
| { |
| "epoch": 12.922494172494172, |
| "grad_norm": 0.3605395555496216, |
| "learning_rate": 0.0004451930029154519, |
| "loss": 3.3469, |
| "step": 44350 |
| }, |
| { |
| "epoch": 12.937062937062937, |
| "grad_norm": 0.3236188590526581, |
| "learning_rate": 0.00044501807580174925, |
| "loss": 3.3546, |
| "step": 44400 |
| }, |
| { |
| "epoch": 12.951631701631701, |
| "grad_norm": 0.3255074620246887, |
| "learning_rate": 0.0004448431486880466, |
| "loss": 3.3495, |
| "step": 44450 |
| }, |
| { |
| "epoch": 12.966200466200466, |
| "grad_norm": 0.34724318981170654, |
| "learning_rate": 0.000444668221574344, |
| "loss": 3.3554, |
| "step": 44500 |
| }, |
| { |
| "epoch": 12.98076923076923, |
| "grad_norm": 0.3772111237049103, |
| "learning_rate": 0.0004444932944606414, |
| "loss": 3.3509, |
| "step": 44550 |
| }, |
| { |
| "epoch": 12.995337995337996, |
| "grad_norm": 0.3767688572406769, |
| "learning_rate": 0.0004443183673469387, |
| "loss": 3.3577, |
| "step": 44600 |
| }, |
| { |
| "epoch": 13.00990675990676, |
| "grad_norm": 0.35660919547080994, |
| "learning_rate": 0.0004441434402332361, |
| "loss": 3.27, |
| "step": 44650 |
| }, |
| { |
| "epoch": 13.024475524475525, |
| "grad_norm": 0.3383883833885193, |
| "learning_rate": 0.00044396851311953345, |
| "loss": 3.2467, |
| "step": 44700 |
| }, |
| { |
| "epoch": 13.03904428904429, |
| "grad_norm": 0.3448445796966553, |
| "learning_rate": 0.0004437935860058309, |
| "loss": 3.24, |
| "step": 44750 |
| }, |
| { |
| "epoch": 13.053613053613054, |
| "grad_norm": 0.387921541929245, |
| "learning_rate": 0.00044361865889212826, |
| "loss": 3.254, |
| "step": 44800 |
| }, |
| { |
| "epoch": 13.068181818181818, |
| "grad_norm": 0.3401538133621216, |
| "learning_rate": 0.00044344373177842563, |
| "loss": 3.2603, |
| "step": 44850 |
| }, |
| { |
| "epoch": 13.082750582750583, |
| "grad_norm": 0.3475644886493683, |
| "learning_rate": 0.000443268804664723, |
| "loss": 3.2626, |
| "step": 44900 |
| }, |
| { |
| "epoch": 13.097319347319347, |
| "grad_norm": 0.4108971357345581, |
| "learning_rate": 0.0004430938775510204, |
| "loss": 3.2448, |
| "step": 44950 |
| }, |
| { |
| "epoch": 13.111888111888112, |
| "grad_norm": 0.3426775336265564, |
| "learning_rate": 0.00044291895043731776, |
| "loss": 3.2631, |
| "step": 45000 |
| }, |
| { |
| "epoch": 13.111888111888112, |
| "eval_accuracy": 0.37012735796140717, |
| "eval_loss": 3.561819553375244, |
| "eval_runtime": 179.9972, |
| "eval_samples_per_second": 92.457, |
| "eval_steps_per_second": 5.783, |
| "step": 45000 |
| }, |
| { |
| "epoch": 13.126456876456876, |
| "grad_norm": 0.35371506214141846, |
| "learning_rate": 0.0004427440233236151, |
| "loss": 3.2639, |
| "step": 45050 |
| }, |
| { |
| "epoch": 13.14102564102564, |
| "grad_norm": 0.3580614924430847, |
| "learning_rate": 0.00044256909620991246, |
| "loss": 3.2704, |
| "step": 45100 |
| }, |
| { |
| "epoch": 13.155594405594405, |
| "grad_norm": 0.34704920649528503, |
| "learning_rate": 0.0004423941690962099, |
| "loss": 3.2731, |
| "step": 45150 |
| }, |
| { |
| "epoch": 13.17016317016317, |
| "grad_norm": 0.3504481613636017, |
| "learning_rate": 0.00044221924198250726, |
| "loss": 3.2719, |
| "step": 45200 |
| }, |
| { |
| "epoch": 13.184731934731936, |
| "grad_norm": 0.33558207750320435, |
| "learning_rate": 0.00044204431486880464, |
| "loss": 3.2841, |
| "step": 45250 |
| }, |
| { |
| "epoch": 13.1993006993007, |
| "grad_norm": 0.3562605679035187, |
| "learning_rate": 0.000441869387755102, |
| "loss": 3.2794, |
| "step": 45300 |
| }, |
| { |
| "epoch": 13.213869463869464, |
| "grad_norm": 0.34473833441734314, |
| "learning_rate": 0.0004416944606413994, |
| "loss": 3.2687, |
| "step": 45350 |
| }, |
| { |
| "epoch": 13.228438228438229, |
| "grad_norm": 0.35833731293678284, |
| "learning_rate": 0.00044151953352769677, |
| "loss": 3.2814, |
| "step": 45400 |
| }, |
| { |
| "epoch": 13.243006993006993, |
| "grad_norm": 0.37955713272094727, |
| "learning_rate": 0.00044134460641399414, |
| "loss": 3.286, |
| "step": 45450 |
| }, |
| { |
| "epoch": 13.257575757575758, |
| "grad_norm": 0.3702819049358368, |
| "learning_rate": 0.00044116967930029146, |
| "loss": 3.2925, |
| "step": 45500 |
| }, |
| { |
| "epoch": 13.272144522144522, |
| "grad_norm": 0.36276885867118835, |
| "learning_rate": 0.00044099475218658884, |
| "loss": 3.2913, |
| "step": 45550 |
| }, |
| { |
| "epoch": 13.286713286713287, |
| "grad_norm": 0.36508500576019287, |
| "learning_rate": 0.00044081982507288627, |
| "loss": 3.2984, |
| "step": 45600 |
| }, |
| { |
| "epoch": 13.301282051282051, |
| "grad_norm": 0.3717924952507019, |
| "learning_rate": 0.00044064489795918365, |
| "loss": 3.2987, |
| "step": 45650 |
| }, |
| { |
| "epoch": 13.315850815850816, |
| "grad_norm": 0.3577413856983185, |
| "learning_rate": 0.000440469970845481, |
| "loss": 3.295, |
| "step": 45700 |
| }, |
| { |
| "epoch": 13.33041958041958, |
| "grad_norm": 0.38441523909568787, |
| "learning_rate": 0.0004402950437317784, |
| "loss": 3.3003, |
| "step": 45750 |
| }, |
| { |
| "epoch": 13.344988344988344, |
| "grad_norm": 0.33631208539009094, |
| "learning_rate": 0.00044012011661807577, |
| "loss": 3.2865, |
| "step": 45800 |
| }, |
| { |
| "epoch": 13.359557109557109, |
| "grad_norm": 0.35587334632873535, |
| "learning_rate": 0.00043994518950437315, |
| "loss": 3.3047, |
| "step": 45850 |
| }, |
| { |
| "epoch": 13.374125874125873, |
| "grad_norm": 0.37436777353286743, |
| "learning_rate": 0.0004397702623906705, |
| "loss": 3.3059, |
| "step": 45900 |
| }, |
| { |
| "epoch": 13.38869463869464, |
| "grad_norm": 0.399141401052475, |
| "learning_rate": 0.00043959533527696785, |
| "loss": 3.296, |
| "step": 45950 |
| }, |
| { |
| "epoch": 13.403263403263404, |
| "grad_norm": 0.385282427072525, |
| "learning_rate": 0.0004394204081632652, |
| "loss": 3.3058, |
| "step": 46000 |
| }, |
| { |
| "epoch": 13.403263403263404, |
| "eval_accuracy": 0.37068544420301736, |
| "eval_loss": 3.5576846599578857, |
| "eval_runtime": 179.9529, |
| "eval_samples_per_second": 92.48, |
| "eval_steps_per_second": 5.785, |
| "step": 46000 |
| }, |
| { |
| "epoch": 13.417832167832168, |
| "grad_norm": 0.3681379556655884, |
| "learning_rate": 0.00043924548104956265, |
| "loss": 3.3096, |
| "step": 46050 |
| }, |
| { |
| "epoch": 13.432400932400933, |
| "grad_norm": 0.38421115279197693, |
| "learning_rate": 0.00043907055393586003, |
| "loss": 3.3126, |
| "step": 46100 |
| }, |
| { |
| "epoch": 13.446969696969697, |
| "grad_norm": 0.3728811740875244, |
| "learning_rate": 0.0004388956268221574, |
| "loss": 3.3038, |
| "step": 46150 |
| }, |
| { |
| "epoch": 13.461538461538462, |
| "grad_norm": 0.3609063923358917, |
| "learning_rate": 0.0004387206997084548, |
| "loss": 3.3158, |
| "step": 46200 |
| }, |
| { |
| "epoch": 13.476107226107226, |
| "grad_norm": 0.3440368175506592, |
| "learning_rate": 0.00043854577259475215, |
| "loss": 3.3067, |
| "step": 46250 |
| }, |
| { |
| "epoch": 13.49067599067599, |
| "grad_norm": 0.33095043897628784, |
| "learning_rate": 0.00043837084548104953, |
| "loss": 3.324, |
| "step": 46300 |
| }, |
| { |
| "epoch": 13.505244755244755, |
| "grad_norm": 0.32994940876960754, |
| "learning_rate": 0.0004381959183673469, |
| "loss": 3.3235, |
| "step": 46350 |
| }, |
| { |
| "epoch": 13.51981351981352, |
| "grad_norm": 0.34858837723731995, |
| "learning_rate": 0.00043802099125364423, |
| "loss": 3.3207, |
| "step": 46400 |
| }, |
| { |
| "epoch": 13.534382284382284, |
| "grad_norm": 0.3519839942455292, |
| "learning_rate": 0.00043784606413994166, |
| "loss": 3.3126, |
| "step": 46450 |
| }, |
| { |
| "epoch": 13.548951048951048, |
| "grad_norm": 0.3784167170524597, |
| "learning_rate": 0.00043767113702623903, |
| "loss": 3.3128, |
| "step": 46500 |
| }, |
| { |
| "epoch": 13.563519813519813, |
| "grad_norm": 0.35100221633911133, |
| "learning_rate": 0.0004374962099125364, |
| "loss": 3.3071, |
| "step": 46550 |
| }, |
| { |
| "epoch": 13.578088578088579, |
| "grad_norm": 0.37419000267982483, |
| "learning_rate": 0.0004373212827988338, |
| "loss": 3.3138, |
| "step": 46600 |
| }, |
| { |
| "epoch": 13.592657342657343, |
| "grad_norm": 0.34843066334724426, |
| "learning_rate": 0.00043714635568513116, |
| "loss": 3.3234, |
| "step": 46650 |
| }, |
| { |
| "epoch": 13.607226107226108, |
| "grad_norm": 0.3874765634536743, |
| "learning_rate": 0.00043697142857142854, |
| "loss": 3.3175, |
| "step": 46700 |
| }, |
| { |
| "epoch": 13.621794871794872, |
| "grad_norm": 0.3625902235507965, |
| "learning_rate": 0.0004367965014577259, |
| "loss": 3.3113, |
| "step": 46750 |
| }, |
| { |
| "epoch": 13.636363636363637, |
| "grad_norm": 0.362448513507843, |
| "learning_rate": 0.00043662157434402334, |
| "loss": 3.3231, |
| "step": 46800 |
| }, |
| { |
| "epoch": 13.650932400932401, |
| "grad_norm": 0.3466891348361969, |
| "learning_rate": 0.0004364466472303206, |
| "loss": 3.3209, |
| "step": 46850 |
| }, |
| { |
| "epoch": 13.665501165501166, |
| "grad_norm": 0.3769454061985016, |
| "learning_rate": 0.00043627172011661804, |
| "loss": 3.3104, |
| "step": 46900 |
| }, |
| { |
| "epoch": 13.68006993006993, |
| "grad_norm": 0.42102763056755066, |
| "learning_rate": 0.0004360967930029154, |
| "loss": 3.3251, |
| "step": 46950 |
| }, |
| { |
| "epoch": 13.694638694638694, |
| "grad_norm": 0.35508814454078674, |
| "learning_rate": 0.0004359218658892128, |
| "loss": 3.3095, |
| "step": 47000 |
| }, |
| { |
| "epoch": 13.694638694638694, |
| "eval_accuracy": 0.3713083230108153, |
| "eval_loss": 3.5499608516693115, |
| "eval_runtime": 180.0856, |
| "eval_samples_per_second": 92.412, |
| "eval_steps_per_second": 5.781, |
| "step": 47000 |
| }, |
| { |
| "epoch": 13.709207459207459, |
| "grad_norm": 0.3345799446105957, |
| "learning_rate": 0.00043574693877551017, |
| "loss": 3.3378, |
| "step": 47050 |
| }, |
| { |
| "epoch": 13.723776223776223, |
| "grad_norm": 0.33664801716804504, |
| "learning_rate": 0.00043557201166180754, |
| "loss": 3.3381, |
| "step": 47100 |
| }, |
| { |
| "epoch": 13.738344988344988, |
| "grad_norm": 0.35207465291023254, |
| "learning_rate": 0.0004353970845481049, |
| "loss": 3.3346, |
| "step": 47150 |
| }, |
| { |
| "epoch": 13.752913752913752, |
| "grad_norm": 0.34713587164878845, |
| "learning_rate": 0.0004352221574344023, |
| "loss": 3.3475, |
| "step": 47200 |
| }, |
| { |
| "epoch": 13.767482517482517, |
| "grad_norm": 0.37446174025535583, |
| "learning_rate": 0.0004350472303206997, |
| "loss": 3.3309, |
| "step": 47250 |
| }, |
| { |
| "epoch": 13.782051282051283, |
| "grad_norm": 0.33329087495803833, |
| "learning_rate": 0.00043487230320699705, |
| "loss": 3.3289, |
| "step": 47300 |
| }, |
| { |
| "epoch": 13.796620046620047, |
| "grad_norm": 0.38883328437805176, |
| "learning_rate": 0.0004346973760932944, |
| "loss": 3.3214, |
| "step": 47350 |
| }, |
| { |
| "epoch": 13.811188811188812, |
| "grad_norm": 0.388072669506073, |
| "learning_rate": 0.0004345224489795918, |
| "loss": 3.3401, |
| "step": 47400 |
| }, |
| { |
| "epoch": 13.825757575757576, |
| "grad_norm": 0.3695523142814636, |
| "learning_rate": 0.0004343475218658892, |
| "loss": 3.3128, |
| "step": 47450 |
| }, |
| { |
| "epoch": 13.84032634032634, |
| "grad_norm": 0.37850746512413025, |
| "learning_rate": 0.00043417259475218655, |
| "loss": 3.3294, |
| "step": 47500 |
| }, |
| { |
| "epoch": 13.854895104895105, |
| "grad_norm": 0.3716648519039154, |
| "learning_rate": 0.0004339976676384839, |
| "loss": 3.3223, |
| "step": 47550 |
| }, |
| { |
| "epoch": 13.86946386946387, |
| "grad_norm": 0.38063937425613403, |
| "learning_rate": 0.0004338227405247813, |
| "loss": 3.3279, |
| "step": 47600 |
| }, |
| { |
| "epoch": 13.884032634032634, |
| "grad_norm": 0.39428770542144775, |
| "learning_rate": 0.00043364781341107873, |
| "loss": 3.3375, |
| "step": 47650 |
| }, |
| { |
| "epoch": 13.898601398601398, |
| "grad_norm": 0.34782904386520386, |
| "learning_rate": 0.0004334728862973761, |
| "loss": 3.3189, |
| "step": 47700 |
| }, |
| { |
| "epoch": 13.913170163170163, |
| "grad_norm": 0.37662190198898315, |
| "learning_rate": 0.00043329795918367343, |
| "loss": 3.3221, |
| "step": 47750 |
| }, |
| { |
| "epoch": 13.927738927738927, |
| "grad_norm": 0.36181744933128357, |
| "learning_rate": 0.0004331230320699708, |
| "loss": 3.3231, |
| "step": 47800 |
| }, |
| { |
| "epoch": 13.942307692307692, |
| "grad_norm": 0.34516653418540955, |
| "learning_rate": 0.0004329481049562682, |
| "loss": 3.3375, |
| "step": 47850 |
| }, |
| { |
| "epoch": 13.956876456876456, |
| "grad_norm": 0.38795268535614014, |
| "learning_rate": 0.00043277317784256556, |
| "loss": 3.343, |
| "step": 47900 |
| }, |
| { |
| "epoch": 13.971445221445222, |
| "grad_norm": 0.3683021664619446, |
| "learning_rate": 0.00043259825072886293, |
| "loss": 3.3304, |
| "step": 47950 |
| }, |
| { |
| "epoch": 13.986013986013987, |
| "grad_norm": 0.3830035328865051, |
| "learning_rate": 0.0004324233236151603, |
| "loss": 3.3335, |
| "step": 48000 |
| }, |
| { |
| "epoch": 13.986013986013987, |
| "eval_accuracy": 0.37180502682130023, |
| "eval_loss": 3.538908004760742, |
| "eval_runtime": 180.2136, |
| "eval_samples_per_second": 92.346, |
| "eval_steps_per_second": 5.776, |
| "step": 48000 |
| }, |
| { |
| "epoch": 14.000582750582751, |
| "grad_norm": 0.3480665981769562, |
| "learning_rate": 0.0004322483965014577, |
| "loss": 3.3392, |
| "step": 48050 |
| }, |
| { |
| "epoch": 14.015151515151516, |
| "grad_norm": 0.37040701508522034, |
| "learning_rate": 0.0004320734693877551, |
| "loss": 3.2289, |
| "step": 48100 |
| }, |
| { |
| "epoch": 14.02972027972028, |
| "grad_norm": 0.3557523190975189, |
| "learning_rate": 0.0004318985422740525, |
| "loss": 3.2138, |
| "step": 48150 |
| }, |
| { |
| "epoch": 14.044289044289044, |
| "grad_norm": 0.35745498538017273, |
| "learning_rate": 0.0004317236151603498, |
| "loss": 3.2378, |
| "step": 48200 |
| }, |
| { |
| "epoch": 14.058857808857809, |
| "grad_norm": 0.35629206895828247, |
| "learning_rate": 0.0004315486880466472, |
| "loss": 3.2303, |
| "step": 48250 |
| }, |
| { |
| "epoch": 14.073426573426573, |
| "grad_norm": 0.386960506439209, |
| "learning_rate": 0.00043137376093294456, |
| "loss": 3.2403, |
| "step": 48300 |
| }, |
| { |
| "epoch": 14.087995337995338, |
| "grad_norm": 0.3604874312877655, |
| "learning_rate": 0.00043119883381924194, |
| "loss": 3.2408, |
| "step": 48350 |
| }, |
| { |
| "epoch": 14.102564102564102, |
| "grad_norm": 0.3432071805000305, |
| "learning_rate": 0.0004310239067055393, |
| "loss": 3.2521, |
| "step": 48400 |
| }, |
| { |
| "epoch": 14.117132867132867, |
| "grad_norm": 0.367196261882782, |
| "learning_rate": 0.0004308489795918367, |
| "loss": 3.2488, |
| "step": 48450 |
| }, |
| { |
| "epoch": 14.131701631701631, |
| "grad_norm": 0.39279282093048096, |
| "learning_rate": 0.00043067405247813407, |
| "loss": 3.2613, |
| "step": 48500 |
| }, |
| { |
| "epoch": 14.146270396270396, |
| "grad_norm": 0.38095659017562866, |
| "learning_rate": 0.0004304991253644315, |
| "loss": 3.2409, |
| "step": 48550 |
| }, |
| { |
| "epoch": 14.16083916083916, |
| "grad_norm": 0.3556550145149231, |
| "learning_rate": 0.00043032419825072887, |
| "loss": 3.2639, |
| "step": 48600 |
| }, |
| { |
| "epoch": 14.175407925407926, |
| "grad_norm": 0.36897987127304077, |
| "learning_rate": 0.0004301492711370262, |
| "loss": 3.2696, |
| "step": 48650 |
| }, |
| { |
| "epoch": 14.18997668997669, |
| "grad_norm": 0.4175944924354553, |
| "learning_rate": 0.00042997434402332357, |
| "loss": 3.2581, |
| "step": 48700 |
| }, |
| { |
| "epoch": 14.204545454545455, |
| "grad_norm": 0.3549362123012543, |
| "learning_rate": 0.00042979941690962094, |
| "loss": 3.2691, |
| "step": 48750 |
| }, |
| { |
| "epoch": 14.21911421911422, |
| "grad_norm": 0.36222416162490845, |
| "learning_rate": 0.0004296244897959183, |
| "loss": 3.2571, |
| "step": 48800 |
| }, |
| { |
| "epoch": 14.233682983682984, |
| "grad_norm": 0.372361958026886, |
| "learning_rate": 0.0004294495626822157, |
| "loss": 3.2698, |
| "step": 48850 |
| }, |
| { |
| "epoch": 14.248251748251748, |
| "grad_norm": 0.3558199405670166, |
| "learning_rate": 0.00042927463556851307, |
| "loss": 3.276, |
| "step": 48900 |
| }, |
| { |
| "epoch": 14.262820512820513, |
| "grad_norm": 0.35009899735450745, |
| "learning_rate": 0.0004290997084548105, |
| "loss": 3.2863, |
| "step": 48950 |
| }, |
| { |
| "epoch": 14.277389277389277, |
| "grad_norm": 0.37568387389183044, |
| "learning_rate": 0.0004289247813411079, |
| "loss": 3.2777, |
| "step": 49000 |
| }, |
| { |
| "epoch": 14.277389277389277, |
| "eval_accuracy": 0.3711598057492996, |
| "eval_loss": 3.5574018955230713, |
| "eval_runtime": 180.0118, |
| "eval_samples_per_second": 92.45, |
| "eval_steps_per_second": 5.783, |
| "step": 49000 |
| }, |
| { |
| "epoch": 14.291958041958042, |
| "grad_norm": 0.36017411947250366, |
| "learning_rate": 0.00042874985422740525, |
| "loss": 3.2748, |
| "step": 49050 |
| }, |
| { |
| "epoch": 14.306526806526806, |
| "grad_norm": 0.34823670983314514, |
| "learning_rate": 0.0004285749271137026, |
| "loss": 3.2609, |
| "step": 49100 |
| }, |
| { |
| "epoch": 14.32109557109557, |
| "grad_norm": 0.39780181646347046, |
| "learning_rate": 0.00042839999999999995, |
| "loss": 3.2832, |
| "step": 49150 |
| }, |
| { |
| "epoch": 14.335664335664335, |
| "grad_norm": 0.3572824001312256, |
| "learning_rate": 0.0004282250728862973, |
| "loss": 3.2827, |
| "step": 49200 |
| }, |
| { |
| "epoch": 14.3502331002331, |
| "grad_norm": 0.37133660912513733, |
| "learning_rate": 0.0004280501457725947, |
| "loss": 3.2703, |
| "step": 49250 |
| }, |
| { |
| "epoch": 14.364801864801866, |
| "grad_norm": 0.38326296210289, |
| "learning_rate": 0.0004278752186588921, |
| "loss": 3.2889, |
| "step": 49300 |
| }, |
| { |
| "epoch": 14.37937062937063, |
| "grad_norm": 0.339304655790329, |
| "learning_rate": 0.00042770029154518945, |
| "loss": 3.2866, |
| "step": 49350 |
| }, |
| { |
| "epoch": 14.393939393939394, |
| "grad_norm": 0.37460795044898987, |
| "learning_rate": 0.0004275253644314869, |
| "loss": 3.2895, |
| "step": 49400 |
| }, |
| { |
| "epoch": 14.408508158508159, |
| "grad_norm": 0.357552170753479, |
| "learning_rate": 0.00042735043731778426, |
| "loss": 3.2825, |
| "step": 49450 |
| }, |
| { |
| "epoch": 14.423076923076923, |
| "grad_norm": 0.3532392978668213, |
| "learning_rate": 0.00042717551020408164, |
| "loss": 3.2863, |
| "step": 49500 |
| }, |
| { |
| "epoch": 14.437645687645688, |
| "grad_norm": 0.3605949878692627, |
| "learning_rate": 0.00042700058309037896, |
| "loss": 3.2907, |
| "step": 49550 |
| }, |
| { |
| "epoch": 14.452214452214452, |
| "grad_norm": 0.3634662628173828, |
| "learning_rate": 0.00042682565597667633, |
| "loss": 3.3028, |
| "step": 49600 |
| }, |
| { |
| "epoch": 14.466783216783217, |
| "grad_norm": 0.3738766312599182, |
| "learning_rate": 0.0004266507288629737, |
| "loss": 3.2952, |
| "step": 49650 |
| }, |
| { |
| "epoch": 14.481351981351981, |
| "grad_norm": 0.3741609752178192, |
| "learning_rate": 0.0004264758017492711, |
| "loss": 3.279, |
| "step": 49700 |
| }, |
| { |
| "epoch": 14.495920745920746, |
| "grad_norm": 0.33341366052627563, |
| "learning_rate": 0.00042630087463556846, |
| "loss": 3.3027, |
| "step": 49750 |
| }, |
| { |
| "epoch": 14.51048951048951, |
| "grad_norm": 0.35941943526268005, |
| "learning_rate": 0.00042612594752186584, |
| "loss": 3.2978, |
| "step": 49800 |
| }, |
| { |
| "epoch": 14.525058275058274, |
| "grad_norm": 0.3725701868534088, |
| "learning_rate": 0.00042595102040816327, |
| "loss": 3.3026, |
| "step": 49850 |
| }, |
| { |
| "epoch": 14.539627039627039, |
| "grad_norm": 0.36116182804107666, |
| "learning_rate": 0.00042577609329446064, |
| "loss": 3.2948, |
| "step": 49900 |
| }, |
| { |
| "epoch": 14.554195804195803, |
| "grad_norm": 0.3436543345451355, |
| "learning_rate": 0.000425601166180758, |
| "loss": 3.3013, |
| "step": 49950 |
| }, |
| { |
| "epoch": 14.56876456876457, |
| "grad_norm": 0.3576006293296814, |
| "learning_rate": 0.00042542623906705534, |
| "loss": 3.313, |
| "step": 50000 |
| }, |
| { |
| "epoch": 14.56876456876457, |
| "eval_accuracy": 0.3715792523619889, |
| "eval_loss": 3.5501418113708496, |
| "eval_runtime": 180.1333, |
| "eval_samples_per_second": 92.387, |
| "eval_steps_per_second": 5.779, |
| "step": 50000 |
| }, |
| { |
| "epoch": 14.583333333333334, |
| "grad_norm": 0.36011427640914917, |
| "learning_rate": 0.0004252513119533527, |
| "loss": 3.308, |
| "step": 50050 |
| }, |
| { |
| "epoch": 14.597902097902098, |
| "grad_norm": 0.3479318916797638, |
| "learning_rate": 0.0004250763848396501, |
| "loss": 3.3094, |
| "step": 50100 |
| }, |
| { |
| "epoch": 14.612470862470863, |
| "grad_norm": 0.3761695623397827, |
| "learning_rate": 0.00042490145772594747, |
| "loss": 3.3024, |
| "step": 50150 |
| }, |
| { |
| "epoch": 14.627039627039627, |
| "grad_norm": 0.3592838644981384, |
| "learning_rate": 0.00042472653061224484, |
| "loss": 3.3189, |
| "step": 50200 |
| }, |
| { |
| "epoch": 14.641608391608392, |
| "grad_norm": 0.38095730543136597, |
| "learning_rate": 0.00042455160349854227, |
| "loss": 3.2947, |
| "step": 50250 |
| }, |
| { |
| "epoch": 14.656177156177156, |
| "grad_norm": 0.3472435772418976, |
| "learning_rate": 0.00042437667638483965, |
| "loss": 3.3113, |
| "step": 50300 |
| }, |
| { |
| "epoch": 14.67074592074592, |
| "grad_norm": 0.3782998323440552, |
| "learning_rate": 0.000424201749271137, |
| "loss": 3.3189, |
| "step": 50350 |
| }, |
| { |
| "epoch": 14.685314685314685, |
| "grad_norm": 0.35338160395622253, |
| "learning_rate": 0.0004240268221574344, |
| "loss": 3.309, |
| "step": 50400 |
| }, |
| { |
| "epoch": 14.69988344988345, |
| "grad_norm": 0.35487911105155945, |
| "learning_rate": 0.0004238518950437317, |
| "loss": 3.2962, |
| "step": 50450 |
| }, |
| { |
| "epoch": 14.714452214452214, |
| "grad_norm": 0.38159093260765076, |
| "learning_rate": 0.0004236769679300291, |
| "loss": 3.3062, |
| "step": 50500 |
| }, |
| { |
| "epoch": 14.729020979020978, |
| "grad_norm": 0.3594988286495209, |
| "learning_rate": 0.00042350204081632647, |
| "loss": 3.3045, |
| "step": 50550 |
| }, |
| { |
| "epoch": 14.743589743589745, |
| "grad_norm": 0.3737533390522003, |
| "learning_rate": 0.00042332711370262385, |
| "loss": 3.317, |
| "step": 50600 |
| }, |
| { |
| "epoch": 14.758158508158509, |
| "grad_norm": 0.3727569282054901, |
| "learning_rate": 0.0004231521865889212, |
| "loss": 3.3045, |
| "step": 50650 |
| }, |
| { |
| "epoch": 14.772727272727273, |
| "grad_norm": 0.3523760437965393, |
| "learning_rate": 0.00042297725947521865, |
| "loss": 3.3158, |
| "step": 50700 |
| }, |
| { |
| "epoch": 14.787296037296038, |
| "grad_norm": 0.3428592383861542, |
| "learning_rate": 0.00042280233236151603, |
| "loss": 3.2976, |
| "step": 50750 |
| }, |
| { |
| "epoch": 14.801864801864802, |
| "grad_norm": 0.3490734100341797, |
| "learning_rate": 0.0004226274052478134, |
| "loss": 3.3028, |
| "step": 50800 |
| }, |
| { |
| "epoch": 14.816433566433567, |
| "grad_norm": 0.3931259512901306, |
| "learning_rate": 0.0004224524781341108, |
| "loss": 3.3063, |
| "step": 50850 |
| }, |
| { |
| "epoch": 14.831002331002331, |
| "grad_norm": 0.35205090045928955, |
| "learning_rate": 0.0004222775510204081, |
| "loss": 3.3259, |
| "step": 50900 |
| }, |
| { |
| "epoch": 14.845571095571096, |
| "grad_norm": 0.3528735637664795, |
| "learning_rate": 0.0004221026239067055, |
| "loss": 3.3185, |
| "step": 50950 |
| }, |
| { |
| "epoch": 14.86013986013986, |
| "grad_norm": 0.38072121143341064, |
| "learning_rate": 0.00042192769679300285, |
| "loss": 3.3186, |
| "step": 51000 |
| }, |
| { |
| "epoch": 14.86013986013986, |
| "eval_accuracy": 0.37217014645471774, |
| "eval_loss": 3.5429024696350098, |
| "eval_runtime": 180.2191, |
| "eval_samples_per_second": 92.343, |
| "eval_steps_per_second": 5.776, |
| "step": 51000 |
| }, |
| { |
| "epoch": 14.874708624708624, |
| "grad_norm": 0.3460468649864197, |
| "learning_rate": 0.00042175276967930023, |
| "loss": 3.3226, |
| "step": 51050 |
| }, |
| { |
| "epoch": 14.889277389277389, |
| "grad_norm": 0.3763303756713867, |
| "learning_rate": 0.00042157784256559766, |
| "loss": 3.3287, |
| "step": 51100 |
| }, |
| { |
| "epoch": 14.903846153846153, |
| "grad_norm": 0.36650410294532776, |
| "learning_rate": 0.00042140291545189504, |
| "loss": 3.3108, |
| "step": 51150 |
| }, |
| { |
| "epoch": 14.918414918414918, |
| "grad_norm": 0.34479567408561707, |
| "learning_rate": 0.0004212279883381924, |
| "loss": 3.3206, |
| "step": 51200 |
| }, |
| { |
| "epoch": 14.932983682983682, |
| "grad_norm": 0.36906757950782776, |
| "learning_rate": 0.0004210530612244898, |
| "loss": 3.3259, |
| "step": 51250 |
| }, |
| { |
| "epoch": 14.947552447552448, |
| "grad_norm": 0.35507652163505554, |
| "learning_rate": 0.0004208781341107871, |
| "loss": 3.3299, |
| "step": 51300 |
| }, |
| { |
| "epoch": 14.962121212121213, |
| "grad_norm": 0.3915591239929199, |
| "learning_rate": 0.0004207032069970845, |
| "loss": 3.3257, |
| "step": 51350 |
| }, |
| { |
| "epoch": 14.976689976689977, |
| "grad_norm": 0.37498611211776733, |
| "learning_rate": 0.00042052827988338186, |
| "loss": 3.327, |
| "step": 51400 |
| }, |
| { |
| "epoch": 14.991258741258742, |
| "grad_norm": 0.3631887435913086, |
| "learning_rate": 0.00042035335276967924, |
| "loss": 3.3221, |
| "step": 51450 |
| }, |
| { |
| "epoch": 15.005827505827506, |
| "grad_norm": 0.3742659091949463, |
| "learning_rate": 0.0004201784256559766, |
| "loss": 3.2863, |
| "step": 51500 |
| }, |
| { |
| "epoch": 15.02039627039627, |
| "grad_norm": 0.36948665976524353, |
| "learning_rate": 0.00042000349854227404, |
| "loss": 3.2117, |
| "step": 51550 |
| }, |
| { |
| "epoch": 15.034965034965035, |
| "grad_norm": 0.35238027572631836, |
| "learning_rate": 0.0004198285714285714, |
| "loss": 3.2095, |
| "step": 51600 |
| }, |
| { |
| "epoch": 15.0495337995338, |
| "grad_norm": 0.36405453085899353, |
| "learning_rate": 0.0004196536443148688, |
| "loss": 3.2157, |
| "step": 51650 |
| }, |
| { |
| "epoch": 15.064102564102564, |
| "grad_norm": 0.3587688207626343, |
| "learning_rate": 0.00041947871720116617, |
| "loss": 3.2258, |
| "step": 51700 |
| }, |
| { |
| "epoch": 15.078671328671328, |
| "grad_norm": 0.38245049118995667, |
| "learning_rate": 0.0004193037900874635, |
| "loss": 3.2297, |
| "step": 51750 |
| }, |
| { |
| "epoch": 15.093240093240093, |
| "grad_norm": 0.3895741105079651, |
| "learning_rate": 0.00041912886297376087, |
| "loss": 3.2331, |
| "step": 51800 |
| }, |
| { |
| "epoch": 15.107808857808857, |
| "grad_norm": 0.36344262957572937, |
| "learning_rate": 0.00041895393586005824, |
| "loss": 3.246, |
| "step": 51850 |
| }, |
| { |
| "epoch": 15.122377622377622, |
| "grad_norm": 0.3528017997741699, |
| "learning_rate": 0.0004187790087463556, |
| "loss": 3.2407, |
| "step": 51900 |
| }, |
| { |
| "epoch": 15.136946386946388, |
| "grad_norm": 0.3691173791885376, |
| "learning_rate": 0.000418604081632653, |
| "loss": 3.2275, |
| "step": 51950 |
| }, |
| { |
| "epoch": 15.151515151515152, |
| "grad_norm": 0.3804149031639099, |
| "learning_rate": 0.0004184291545189504, |
| "loss": 3.2468, |
| "step": 52000 |
| }, |
| { |
| "epoch": 15.151515151515152, |
| "eval_accuracy": 0.37138205248268413, |
| "eval_loss": 3.5521976947784424, |
| "eval_runtime": 180.2046, |
| "eval_samples_per_second": 92.351, |
| "eval_steps_per_second": 5.777, |
| "step": 52000 |
| }, |
| { |
| "epoch": 15.166083916083917, |
| "grad_norm": 0.36506277322769165, |
| "learning_rate": 0.0004182542274052478, |
| "loss": 3.2333, |
| "step": 52050 |
| }, |
| { |
| "epoch": 15.180652680652681, |
| "grad_norm": 0.36347970366477966, |
| "learning_rate": 0.0004180793002915452, |
| "loss": 3.2548, |
| "step": 52100 |
| }, |
| { |
| "epoch": 15.195221445221446, |
| "grad_norm": 0.3790263831615448, |
| "learning_rate": 0.00041790437317784255, |
| "loss": 3.2381, |
| "step": 52150 |
| }, |
| { |
| "epoch": 15.20979020979021, |
| "grad_norm": 0.35307011008262634, |
| "learning_rate": 0.0004177294460641399, |
| "loss": 3.2399, |
| "step": 52200 |
| }, |
| { |
| "epoch": 15.224358974358974, |
| "grad_norm": 0.38718825578689575, |
| "learning_rate": 0.00041755451895043725, |
| "loss": 3.2514, |
| "step": 52250 |
| }, |
| { |
| "epoch": 15.238927738927739, |
| "grad_norm": 0.3450746238231659, |
| "learning_rate": 0.0004173795918367346, |
| "loss": 3.2618, |
| "step": 52300 |
| }, |
| { |
| "epoch": 15.253496503496503, |
| "grad_norm": 0.3694665729999542, |
| "learning_rate": 0.000417204664723032, |
| "loss": 3.2559, |
| "step": 52350 |
| }, |
| { |
| "epoch": 15.268065268065268, |
| "grad_norm": 0.3642129600048065, |
| "learning_rate": 0.00041702973760932943, |
| "loss": 3.2472, |
| "step": 52400 |
| }, |
| { |
| "epoch": 15.282634032634032, |
| "grad_norm": 0.36244091391563416, |
| "learning_rate": 0.0004168548104956268, |
| "loss": 3.2634, |
| "step": 52450 |
| }, |
| { |
| "epoch": 15.297202797202797, |
| "grad_norm": 0.3727215826511383, |
| "learning_rate": 0.0004166798833819242, |
| "loss": 3.2626, |
| "step": 52500 |
| }, |
| { |
| "epoch": 15.311771561771561, |
| "grad_norm": 0.3629833459854126, |
| "learning_rate": 0.00041650495626822156, |
| "loss": 3.2536, |
| "step": 52550 |
| }, |
| { |
| "epoch": 15.326340326340326, |
| "grad_norm": 0.39329707622528076, |
| "learning_rate": 0.00041633002915451893, |
| "loss": 3.2716, |
| "step": 52600 |
| }, |
| { |
| "epoch": 15.340909090909092, |
| "grad_norm": 0.38280272483825684, |
| "learning_rate": 0.00041615510204081626, |
| "loss": 3.2711, |
| "step": 52650 |
| }, |
| { |
| "epoch": 15.355477855477856, |
| "grad_norm": 0.38633573055267334, |
| "learning_rate": 0.00041598017492711363, |
| "loss": 3.2793, |
| "step": 52700 |
| }, |
| { |
| "epoch": 15.37004662004662, |
| "grad_norm": 0.3785382807254791, |
| "learning_rate": 0.000415805247813411, |
| "loss": 3.2758, |
| "step": 52750 |
| }, |
| { |
| "epoch": 15.384615384615385, |
| "grad_norm": 0.3668574094772339, |
| "learning_rate": 0.0004156303206997084, |
| "loss": 3.2892, |
| "step": 52800 |
| }, |
| { |
| "epoch": 15.39918414918415, |
| "grad_norm": 0.37236839532852173, |
| "learning_rate": 0.0004154553935860058, |
| "loss": 3.2606, |
| "step": 52850 |
| }, |
| { |
| "epoch": 15.413752913752914, |
| "grad_norm": 0.35925427079200745, |
| "learning_rate": 0.0004152804664723032, |
| "loss": 3.2751, |
| "step": 52900 |
| }, |
| { |
| "epoch": 15.428321678321678, |
| "grad_norm": 0.3505415618419647, |
| "learning_rate": 0.00041510553935860056, |
| "loss": 3.2763, |
| "step": 52950 |
| }, |
| { |
| "epoch": 15.442890442890443, |
| "grad_norm": 0.3869665563106537, |
| "learning_rate": 0.00041493061224489794, |
| "loss": 3.2767, |
| "step": 53000 |
| }, |
| { |
| "epoch": 15.442890442890443, |
| "eval_accuracy": 0.3715333919249413, |
| "eval_loss": 3.549450635910034, |
| "eval_runtime": 180.2062, |
| "eval_samples_per_second": 92.35, |
| "eval_steps_per_second": 5.777, |
| "step": 53000 |
| }, |
| { |
| "epoch": 15.457459207459207, |
| "grad_norm": 0.3662952780723572, |
| "learning_rate": 0.0004147556851311953, |
| "loss": 3.2921, |
| "step": 53050 |
| }, |
| { |
| "epoch": 15.472027972027972, |
| "grad_norm": 0.3680059313774109, |
| "learning_rate": 0.00041458075801749264, |
| "loss": 3.2848, |
| "step": 53100 |
| }, |
| { |
| "epoch": 15.486596736596736, |
| "grad_norm": 0.3896573781967163, |
| "learning_rate": 0.00041440583090379, |
| "loss": 3.2915, |
| "step": 53150 |
| }, |
| { |
| "epoch": 15.5011655011655, |
| "grad_norm": 0.35986337065696716, |
| "learning_rate": 0.0004142309037900874, |
| "loss": 3.2853, |
| "step": 53200 |
| }, |
| { |
| "epoch": 15.515734265734265, |
| "grad_norm": 0.3684069514274597, |
| "learning_rate": 0.0004140559766763848, |
| "loss": 3.283, |
| "step": 53250 |
| }, |
| { |
| "epoch": 15.530303030303031, |
| "grad_norm": 0.3770783841609955, |
| "learning_rate": 0.0004138810495626822, |
| "loss": 3.2694, |
| "step": 53300 |
| }, |
| { |
| "epoch": 15.544871794871796, |
| "grad_norm": 0.370087593793869, |
| "learning_rate": 0.00041370612244897957, |
| "loss": 3.2819, |
| "step": 53350 |
| }, |
| { |
| "epoch": 15.55944055944056, |
| "grad_norm": 0.42769378423690796, |
| "learning_rate": 0.00041353119533527695, |
| "loss": 3.2821, |
| "step": 53400 |
| }, |
| { |
| "epoch": 15.574009324009324, |
| "grad_norm": 0.3953687250614166, |
| "learning_rate": 0.0004133562682215743, |
| "loss": 3.2944, |
| "step": 53450 |
| }, |
| { |
| "epoch": 15.588578088578089, |
| "grad_norm": 0.38645675778388977, |
| "learning_rate": 0.0004131813411078717, |
| "loss": 3.289, |
| "step": 53500 |
| }, |
| { |
| "epoch": 15.603146853146853, |
| "grad_norm": 0.34176626801490784, |
| "learning_rate": 0.000413006413994169, |
| "loss": 3.2847, |
| "step": 53550 |
| }, |
| { |
| "epoch": 15.617715617715618, |
| "grad_norm": 0.3668389320373535, |
| "learning_rate": 0.0004128314868804664, |
| "loss": 3.2873, |
| "step": 53600 |
| }, |
| { |
| "epoch": 15.632284382284382, |
| "grad_norm": 0.38792678713798523, |
| "learning_rate": 0.00041265655976676377, |
| "loss": 3.2895, |
| "step": 53650 |
| }, |
| { |
| "epoch": 15.646853146853147, |
| "grad_norm": 0.3726765811443329, |
| "learning_rate": 0.0004124816326530612, |
| "loss": 3.2879, |
| "step": 53700 |
| }, |
| { |
| "epoch": 15.661421911421911, |
| "grad_norm": 0.38674214482307434, |
| "learning_rate": 0.0004123067055393586, |
| "loss": 3.2967, |
| "step": 53750 |
| }, |
| { |
| "epoch": 15.675990675990676, |
| "grad_norm": 0.3406377136707306, |
| "learning_rate": 0.00041213177842565595, |
| "loss": 3.2872, |
| "step": 53800 |
| }, |
| { |
| "epoch": 15.69055944055944, |
| "grad_norm": 0.368076354265213, |
| "learning_rate": 0.00041195685131195333, |
| "loss": 3.2957, |
| "step": 53850 |
| }, |
| { |
| "epoch": 15.705128205128204, |
| "grad_norm": 0.3570297956466675, |
| "learning_rate": 0.0004117819241982507, |
| "loss": 3.2905, |
| "step": 53900 |
| }, |
| { |
| "epoch": 15.719696969696969, |
| "grad_norm": 0.37296998500823975, |
| "learning_rate": 0.0004116069970845481, |
| "loss": 3.308, |
| "step": 53950 |
| }, |
| { |
| "epoch": 15.734265734265735, |
| "grad_norm": 0.35054120421409607, |
| "learning_rate": 0.0004114320699708454, |
| "loss": 3.2932, |
| "step": 54000 |
| }, |
| { |
| "epoch": 15.734265734265735, |
| "eval_accuracy": 0.3724629477066371, |
| "eval_loss": 3.539015054702759, |
| "eval_runtime": 179.9569, |
| "eval_samples_per_second": 92.478, |
| "eval_steps_per_second": 5.785, |
| "step": 54000 |
| }, |
| { |
| "epoch": 15.7488344988345, |
| "grad_norm": 0.3462640345096588, |
| "learning_rate": 0.0004112571428571428, |
| "loss": 3.3019, |
| "step": 54050 |
| }, |
| { |
| "epoch": 15.763403263403264, |
| "grad_norm": 0.38437676429748535, |
| "learning_rate": 0.00041108221574344015, |
| "loss": 3.3003, |
| "step": 54100 |
| }, |
| { |
| "epoch": 15.777972027972028, |
| "grad_norm": 0.37581923604011536, |
| "learning_rate": 0.0004109072886297376, |
| "loss": 3.2989, |
| "step": 54150 |
| }, |
| { |
| "epoch": 15.792540792540793, |
| "grad_norm": 0.4052378237247467, |
| "learning_rate": 0.00041073236151603496, |
| "loss": 3.2972, |
| "step": 54200 |
| }, |
| { |
| "epoch": 15.807109557109557, |
| "grad_norm": 0.4042377471923828, |
| "learning_rate": 0.00041055743440233234, |
| "loss": 3.2974, |
| "step": 54250 |
| }, |
| { |
| "epoch": 15.821678321678322, |
| "grad_norm": 0.3874850869178772, |
| "learning_rate": 0.0004103825072886297, |
| "loss": 3.3083, |
| "step": 54300 |
| }, |
| { |
| "epoch": 15.836247086247086, |
| "grad_norm": 0.3619961440563202, |
| "learning_rate": 0.0004102075801749271, |
| "loss": 3.2968, |
| "step": 54350 |
| }, |
| { |
| "epoch": 15.85081585081585, |
| "grad_norm": 0.3616982400417328, |
| "learning_rate": 0.00041003265306122446, |
| "loss": 3.305, |
| "step": 54400 |
| }, |
| { |
| "epoch": 15.865384615384615, |
| "grad_norm": 0.39646637439727783, |
| "learning_rate": 0.0004098577259475218, |
| "loss": 3.3048, |
| "step": 54450 |
| }, |
| { |
| "epoch": 15.87995337995338, |
| "grad_norm": 0.3735204041004181, |
| "learning_rate": 0.00040968279883381916, |
| "loss": 3.3097, |
| "step": 54500 |
| }, |
| { |
| "epoch": 15.894522144522144, |
| "grad_norm": 0.38079050183296204, |
| "learning_rate": 0.0004095078717201166, |
| "loss": 3.321, |
| "step": 54550 |
| }, |
| { |
| "epoch": 15.909090909090908, |
| "grad_norm": 0.3562343120574951, |
| "learning_rate": 0.00040933294460641397, |
| "loss": 3.312, |
| "step": 54600 |
| }, |
| { |
| "epoch": 15.923659673659674, |
| "grad_norm": 0.3720053434371948, |
| "learning_rate": 0.00040915801749271134, |
| "loss": 3.3141, |
| "step": 54650 |
| }, |
| { |
| "epoch": 15.938228438228439, |
| "grad_norm": 0.35950398445129395, |
| "learning_rate": 0.0004089830903790087, |
| "loss": 3.3188, |
| "step": 54700 |
| }, |
| { |
| "epoch": 15.952797202797203, |
| "grad_norm": 0.35162949562072754, |
| "learning_rate": 0.0004088081632653061, |
| "loss": 3.3067, |
| "step": 54750 |
| }, |
| { |
| "epoch": 15.967365967365968, |
| "grad_norm": 0.368741512298584, |
| "learning_rate": 0.00040863323615160347, |
| "loss": 3.2972, |
| "step": 54800 |
| }, |
| { |
| "epoch": 15.981934731934732, |
| "grad_norm": 0.3791120648384094, |
| "learning_rate": 0.00040845830903790085, |
| "loss": 3.2925, |
| "step": 54850 |
| }, |
| { |
| "epoch": 15.996503496503497, |
| "grad_norm": 0.38239946961402893, |
| "learning_rate": 0.00040828338192419817, |
| "loss": 3.31, |
| "step": 54900 |
| }, |
| { |
| "epoch": 16.01107226107226, |
| "grad_norm": 0.3918370306491852, |
| "learning_rate": 0.00040810845481049554, |
| "loss": 3.2242, |
| "step": 54950 |
| }, |
| { |
| "epoch": 16.025641025641026, |
| "grad_norm": 0.36443474888801575, |
| "learning_rate": 0.00040793352769679297, |
| "loss": 3.2057, |
| "step": 55000 |
| }, |
| { |
| "epoch": 16.025641025641026, |
| "eval_accuracy": 0.3720653729946936, |
| "eval_loss": 3.5517778396606445, |
| "eval_runtime": 179.9903, |
| "eval_samples_per_second": 92.461, |
| "eval_steps_per_second": 5.784, |
| "step": 55000 |
| }, |
| { |
| "epoch": 16.04020979020979, |
| "grad_norm": 0.3803104758262634, |
| "learning_rate": 0.00040775860058309035, |
| "loss": 3.2087, |
| "step": 55050 |
| }, |
| { |
| "epoch": 16.054778554778554, |
| "grad_norm": 0.37503015995025635, |
| "learning_rate": 0.0004075836734693877, |
| "loss": 3.2064, |
| "step": 55100 |
| }, |
| { |
| "epoch": 16.06934731934732, |
| "grad_norm": 0.422076553106308, |
| "learning_rate": 0.0004074087463556851, |
| "loss": 3.2159, |
| "step": 55150 |
| }, |
| { |
| "epoch": 16.083916083916083, |
| "grad_norm": 0.3793075382709503, |
| "learning_rate": 0.0004072338192419825, |
| "loss": 3.2157, |
| "step": 55200 |
| }, |
| { |
| "epoch": 16.098484848484848, |
| "grad_norm": 0.35776856541633606, |
| "learning_rate": 0.00040705889212827985, |
| "loss": 3.2258, |
| "step": 55250 |
| }, |
| { |
| "epoch": 16.113053613053612, |
| "grad_norm": 0.37177175283432007, |
| "learning_rate": 0.00040688396501457723, |
| "loss": 3.2206, |
| "step": 55300 |
| }, |
| { |
| "epoch": 16.127622377622377, |
| "grad_norm": 0.3849240243434906, |
| "learning_rate": 0.00040670903790087455, |
| "loss": 3.2165, |
| "step": 55350 |
| }, |
| { |
| "epoch": 16.14219114219114, |
| "grad_norm": 0.37367263436317444, |
| "learning_rate": 0.0004065341107871719, |
| "loss": 3.2369, |
| "step": 55400 |
| }, |
| { |
| "epoch": 16.156759906759905, |
| "grad_norm": 0.3775652050971985, |
| "learning_rate": 0.00040635918367346935, |
| "loss": 3.2323, |
| "step": 55450 |
| }, |
| { |
| "epoch": 16.17132867132867, |
| "grad_norm": 0.3845197260379791, |
| "learning_rate": 0.00040618425655976673, |
| "loss": 3.2327, |
| "step": 55500 |
| }, |
| { |
| "epoch": 16.185897435897434, |
| "grad_norm": 0.3939662575721741, |
| "learning_rate": 0.0004060093294460641, |
| "loss": 3.2396, |
| "step": 55550 |
| }, |
| { |
| "epoch": 16.2004662004662, |
| "grad_norm": 0.36457160115242004, |
| "learning_rate": 0.0004058344023323615, |
| "loss": 3.2348, |
| "step": 55600 |
| }, |
| { |
| "epoch": 16.215034965034967, |
| "grad_norm": 0.3628428876399994, |
| "learning_rate": 0.00040565947521865886, |
| "loss": 3.2387, |
| "step": 55650 |
| }, |
| { |
| "epoch": 16.22960372960373, |
| "grad_norm": 0.39392805099487305, |
| "learning_rate": 0.00040548454810495623, |
| "loss": 3.2455, |
| "step": 55700 |
| }, |
| { |
| "epoch": 16.244172494172496, |
| "grad_norm": 0.38273295760154724, |
| "learning_rate": 0.0004053096209912536, |
| "loss": 3.2404, |
| "step": 55750 |
| }, |
| { |
| "epoch": 16.25874125874126, |
| "grad_norm": 0.38725781440734863, |
| "learning_rate": 0.00040513469387755093, |
| "loss": 3.2394, |
| "step": 55800 |
| }, |
| { |
| "epoch": 16.273310023310025, |
| "grad_norm": 0.3701291084289551, |
| "learning_rate": 0.00040495976676384836, |
| "loss": 3.2477, |
| "step": 55850 |
| }, |
| { |
| "epoch": 16.28787878787879, |
| "grad_norm": 0.35124650597572327, |
| "learning_rate": 0.00040478483965014574, |
| "loss": 3.2511, |
| "step": 55900 |
| }, |
| { |
| "epoch": 16.302447552447553, |
| "grad_norm": 0.3882130980491638, |
| "learning_rate": 0.0004046099125364431, |
| "loss": 3.2493, |
| "step": 55950 |
| }, |
| { |
| "epoch": 16.317016317016318, |
| "grad_norm": 0.39306777715682983, |
| "learning_rate": 0.0004044349854227405, |
| "loss": 3.2652, |
| "step": 56000 |
| }, |
| { |
| "epoch": 16.317016317016318, |
| "eval_accuracy": 0.37204632327468923, |
| "eval_loss": 3.5508227348327637, |
| "eval_runtime": 180.1019, |
| "eval_samples_per_second": 92.403, |
| "eval_steps_per_second": 5.78, |
| "step": 56000 |
| }, |
| { |
| "epoch": 16.331585081585082, |
| "grad_norm": 0.36082884669303894, |
| "learning_rate": 0.00040426005830903786, |
| "loss": 3.2585, |
| "step": 56050 |
| }, |
| { |
| "epoch": 16.346153846153847, |
| "grad_norm": 0.38729801774024963, |
| "learning_rate": 0.00040408513119533524, |
| "loss": 3.2564, |
| "step": 56100 |
| }, |
| { |
| "epoch": 16.36072261072261, |
| "grad_norm": 0.35457730293273926, |
| "learning_rate": 0.0004039102040816326, |
| "loss": 3.2492, |
| "step": 56150 |
| }, |
| { |
| "epoch": 16.375291375291376, |
| "grad_norm": 0.3973914682865143, |
| "learning_rate": 0.00040373527696793005, |
| "loss": 3.2576, |
| "step": 56200 |
| }, |
| { |
| "epoch": 16.38986013986014, |
| "grad_norm": 0.40199264883995056, |
| "learning_rate": 0.0004035603498542273, |
| "loss": 3.2601, |
| "step": 56250 |
| }, |
| { |
| "epoch": 16.404428904428904, |
| "grad_norm": 0.3823801577091217, |
| "learning_rate": 0.00040338542274052474, |
| "loss": 3.2549, |
| "step": 56300 |
| }, |
| { |
| "epoch": 16.41899766899767, |
| "grad_norm": 0.3930020034313202, |
| "learning_rate": 0.0004032104956268221, |
| "loss": 3.2594, |
| "step": 56350 |
| }, |
| { |
| "epoch": 16.433566433566433, |
| "grad_norm": 0.3808688521385193, |
| "learning_rate": 0.0004030355685131195, |
| "loss": 3.2749, |
| "step": 56400 |
| }, |
| { |
| "epoch": 16.448135198135198, |
| "grad_norm": 0.36190223693847656, |
| "learning_rate": 0.00040286064139941687, |
| "loss": 3.2618, |
| "step": 56450 |
| }, |
| { |
| "epoch": 16.462703962703962, |
| "grad_norm": 0.37956106662750244, |
| "learning_rate": 0.00040268571428571425, |
| "loss": 3.2701, |
| "step": 56500 |
| }, |
| { |
| "epoch": 16.477272727272727, |
| "grad_norm": 0.349180668592453, |
| "learning_rate": 0.0004025107871720116, |
| "loss": 3.2726, |
| "step": 56550 |
| }, |
| { |
| "epoch": 16.49184149184149, |
| "grad_norm": 0.40453752875328064, |
| "learning_rate": 0.000402335860058309, |
| "loss": 3.2637, |
| "step": 56600 |
| }, |
| { |
| "epoch": 16.506410256410255, |
| "grad_norm": 0.3742610216140747, |
| "learning_rate": 0.00040216093294460643, |
| "loss": 3.2705, |
| "step": 56650 |
| }, |
| { |
| "epoch": 16.52097902097902, |
| "grad_norm": 0.35317522287368774, |
| "learning_rate": 0.00040198600583090375, |
| "loss": 3.2734, |
| "step": 56700 |
| }, |
| { |
| "epoch": 16.535547785547784, |
| "grad_norm": 0.3735811710357666, |
| "learning_rate": 0.0004018110787172011, |
| "loss": 3.2781, |
| "step": 56750 |
| }, |
| { |
| "epoch": 16.55011655011655, |
| "grad_norm": 0.37040281295776367, |
| "learning_rate": 0.0004016361516034985, |
| "loss": 3.2757, |
| "step": 56800 |
| }, |
| { |
| "epoch": 16.564685314685313, |
| "grad_norm": 0.3771756887435913, |
| "learning_rate": 0.0004014612244897959, |
| "loss": 3.2827, |
| "step": 56850 |
| }, |
| { |
| "epoch": 16.579254079254078, |
| "grad_norm": 0.4176678955554962, |
| "learning_rate": 0.00040128629737609325, |
| "loss": 3.2724, |
| "step": 56900 |
| }, |
| { |
| "epoch": 16.593822843822842, |
| "grad_norm": 0.38978853821754456, |
| "learning_rate": 0.00040111137026239063, |
| "loss": 3.2715, |
| "step": 56950 |
| }, |
| { |
| "epoch": 16.60839160839161, |
| "grad_norm": 0.3670736849308014, |
| "learning_rate": 0.000400936443148688, |
| "loss": 3.2886, |
| "step": 57000 |
| }, |
| { |
| "epoch": 16.60839160839161, |
| "eval_accuracy": 0.3724863482886178, |
| "eval_loss": 3.5458335876464844, |
| "eval_runtime": 180.2316, |
| "eval_samples_per_second": 92.337, |
| "eval_steps_per_second": 5.776, |
| "step": 57000 |
| }, |
| { |
| "epoch": 16.622960372960375, |
| "grad_norm": 0.33764857053756714, |
| "learning_rate": 0.00040076151603498543, |
| "loss": 3.2677, |
| "step": 57050 |
| }, |
| { |
| "epoch": 16.63752913752914, |
| "grad_norm": 0.3740048408508301, |
| "learning_rate": 0.0004005865889212828, |
| "loss": 3.29, |
| "step": 57100 |
| }, |
| { |
| "epoch": 16.652097902097903, |
| "grad_norm": 0.5407046675682068, |
| "learning_rate": 0.00040041166180758013, |
| "loss": 3.2744, |
| "step": 57150 |
| }, |
| { |
| "epoch": 16.666666666666668, |
| "grad_norm": 0.34932196140289307, |
| "learning_rate": 0.0004002367346938775, |
| "loss": 3.2688, |
| "step": 57200 |
| }, |
| { |
| "epoch": 16.681235431235432, |
| "grad_norm": 0.4044182300567627, |
| "learning_rate": 0.0004000618075801749, |
| "loss": 3.2732, |
| "step": 57250 |
| }, |
| { |
| "epoch": 16.695804195804197, |
| "grad_norm": 0.3779875338077545, |
| "learning_rate": 0.00039988688046647226, |
| "loss": 3.2768, |
| "step": 57300 |
| }, |
| { |
| "epoch": 16.71037296037296, |
| "grad_norm": 0.3720763027667999, |
| "learning_rate": 0.00039971195335276963, |
| "loss": 3.2873, |
| "step": 57350 |
| }, |
| { |
| "epoch": 16.724941724941726, |
| "grad_norm": 0.3826303482055664, |
| "learning_rate": 0.000399537026239067, |
| "loss": 3.2898, |
| "step": 57400 |
| }, |
| { |
| "epoch": 16.73951048951049, |
| "grad_norm": 0.38142111897468567, |
| "learning_rate": 0.0003993620991253644, |
| "loss": 3.2731, |
| "step": 57450 |
| }, |
| { |
| "epoch": 16.754079254079254, |
| "grad_norm": 0.3760949373245239, |
| "learning_rate": 0.0003991871720116618, |
| "loss": 3.2895, |
| "step": 57500 |
| }, |
| { |
| "epoch": 16.76864801864802, |
| "grad_norm": 0.370598703622818, |
| "learning_rate": 0.0003990122448979592, |
| "loss": 3.2929, |
| "step": 57550 |
| }, |
| { |
| "epoch": 16.783216783216783, |
| "grad_norm": 0.37887001037597656, |
| "learning_rate": 0.0003988373177842565, |
| "loss": 3.2806, |
| "step": 57600 |
| }, |
| { |
| "epoch": 16.797785547785548, |
| "grad_norm": 0.37170445919036865, |
| "learning_rate": 0.0003986623906705539, |
| "loss": 3.3034, |
| "step": 57650 |
| }, |
| { |
| "epoch": 16.812354312354312, |
| "grad_norm": 0.37460795044898987, |
| "learning_rate": 0.00039848746355685127, |
| "loss": 3.2842, |
| "step": 57700 |
| }, |
| { |
| "epoch": 16.826923076923077, |
| "grad_norm": 0.36762702465057373, |
| "learning_rate": 0.00039831253644314864, |
| "loss": 3.2856, |
| "step": 57750 |
| }, |
| { |
| "epoch": 16.84149184149184, |
| "grad_norm": 0.3489190936088562, |
| "learning_rate": 0.000398137609329446, |
| "loss": 3.2822, |
| "step": 57800 |
| }, |
| { |
| "epoch": 16.856060606060606, |
| "grad_norm": 0.3669528663158417, |
| "learning_rate": 0.0003979626822157434, |
| "loss": 3.2862, |
| "step": 57850 |
| }, |
| { |
| "epoch": 16.87062937062937, |
| "grad_norm": 0.38453635573387146, |
| "learning_rate": 0.00039778775510204077, |
| "loss": 3.2791, |
| "step": 57900 |
| }, |
| { |
| "epoch": 16.885198135198134, |
| "grad_norm": 0.4126264750957489, |
| "learning_rate": 0.0003976128279883382, |
| "loss": 3.2916, |
| "step": 57950 |
| }, |
| { |
| "epoch": 16.8997668997669, |
| "grad_norm": 0.354192316532135, |
| "learning_rate": 0.0003974379008746356, |
| "loss": 3.2946, |
| "step": 58000 |
| }, |
| { |
| "epoch": 16.8997668997669, |
| "eval_accuracy": 0.37311945750160336, |
| "eval_loss": 3.5339043140411377, |
| "eval_runtime": 180.1703, |
| "eval_samples_per_second": 92.368, |
| "eval_steps_per_second": 5.778, |
| "step": 58000 |
| }, |
| { |
| "epoch": 16.914335664335663, |
| "grad_norm": 0.36521029472351074, |
| "learning_rate": 0.0003972629737609329, |
| "loss": 3.2874, |
| "step": 58050 |
| }, |
| { |
| "epoch": 16.928904428904428, |
| "grad_norm": 0.375409334897995, |
| "learning_rate": 0.00039708804664723027, |
| "loss": 3.2946, |
| "step": 58100 |
| }, |
| { |
| "epoch": 16.943473193473192, |
| "grad_norm": 0.3432232439517975, |
| "learning_rate": 0.00039691311953352765, |
| "loss": 3.2923, |
| "step": 58150 |
| }, |
| { |
| "epoch": 16.958041958041957, |
| "grad_norm": 0.35378479957580566, |
| "learning_rate": 0.000396738192419825, |
| "loss": 3.2955, |
| "step": 58200 |
| }, |
| { |
| "epoch": 16.97261072261072, |
| "grad_norm": 0.38814520835876465, |
| "learning_rate": 0.0003965632653061224, |
| "loss": 3.2975, |
| "step": 58250 |
| }, |
| { |
| "epoch": 16.98717948717949, |
| "grad_norm": 0.3771551251411438, |
| "learning_rate": 0.0003963883381924198, |
| "loss": 3.2997, |
| "step": 58300 |
| }, |
| { |
| "epoch": 17.001748251748253, |
| "grad_norm": 0.4017852246761322, |
| "learning_rate": 0.0003962134110787172, |
| "loss": 3.285, |
| "step": 58350 |
| }, |
| { |
| "epoch": 17.016317016317018, |
| "grad_norm": 0.3747924268245697, |
| "learning_rate": 0.0003960384839650146, |
| "loss": 3.1918, |
| "step": 58400 |
| }, |
| { |
| "epoch": 17.030885780885782, |
| "grad_norm": 0.3842487335205078, |
| "learning_rate": 0.00039586355685131196, |
| "loss": 3.1884, |
| "step": 58450 |
| }, |
| { |
| "epoch": 17.045454545454547, |
| "grad_norm": 0.38249558210372925, |
| "learning_rate": 0.0003956886297376093, |
| "loss": 3.1965, |
| "step": 58500 |
| }, |
| { |
| "epoch": 17.06002331002331, |
| "grad_norm": 0.3720807135105133, |
| "learning_rate": 0.00039551370262390665, |
| "loss": 3.2026, |
| "step": 58550 |
| }, |
| { |
| "epoch": 17.074592074592076, |
| "grad_norm": 0.36453333497047424, |
| "learning_rate": 0.00039533877551020403, |
| "loss": 3.1978, |
| "step": 58600 |
| }, |
| { |
| "epoch": 17.08916083916084, |
| "grad_norm": 0.37213221192359924, |
| "learning_rate": 0.0003951638483965014, |
| "loss": 3.193, |
| "step": 58650 |
| }, |
| { |
| "epoch": 17.103729603729604, |
| "grad_norm": 0.3810282051563263, |
| "learning_rate": 0.0003949889212827988, |
| "loss": 3.2161, |
| "step": 58700 |
| }, |
| { |
| "epoch": 17.11829836829837, |
| "grad_norm": 0.4005163311958313, |
| "learning_rate": 0.00039481399416909616, |
| "loss": 3.2006, |
| "step": 58750 |
| }, |
| { |
| "epoch": 17.132867132867133, |
| "grad_norm": 0.3493218719959259, |
| "learning_rate": 0.0003946390670553936, |
| "loss": 3.1989, |
| "step": 58800 |
| }, |
| { |
| "epoch": 17.147435897435898, |
| "grad_norm": 0.4085027575492859, |
| "learning_rate": 0.00039446413994169096, |
| "loss": 3.2141, |
| "step": 58850 |
| }, |
| { |
| "epoch": 17.162004662004662, |
| "grad_norm": 0.3723236918449402, |
| "learning_rate": 0.00039428921282798834, |
| "loss": 3.2122, |
| "step": 58900 |
| }, |
| { |
| "epoch": 17.176573426573427, |
| "grad_norm": 0.365602046251297, |
| "learning_rate": 0.00039411428571428566, |
| "loss": 3.2235, |
| "step": 58950 |
| }, |
| { |
| "epoch": 17.19114219114219, |
| "grad_norm": 0.3900095522403717, |
| "learning_rate": 0.00039393935860058304, |
| "loss": 3.2236, |
| "step": 59000 |
| }, |
| { |
| "epoch": 17.19114219114219, |
| "eval_accuracy": 0.37191403355243646, |
| "eval_loss": 3.5545146465301514, |
| "eval_runtime": 180.0902, |
| "eval_samples_per_second": 92.409, |
| "eval_steps_per_second": 5.78, |
| "step": 59000 |
| }, |
| { |
| "epoch": 17.205710955710956, |
| "grad_norm": 0.39965546131134033, |
| "learning_rate": 0.0003937644314868804, |
| "loss": 3.2383, |
| "step": 59050 |
| }, |
| { |
| "epoch": 17.22027972027972, |
| "grad_norm": 0.3811628818511963, |
| "learning_rate": 0.0003935895043731778, |
| "loss": 3.2326, |
| "step": 59100 |
| }, |
| { |
| "epoch": 17.234848484848484, |
| "grad_norm": 0.3733785152435303, |
| "learning_rate": 0.00039341457725947516, |
| "loss": 3.2351, |
| "step": 59150 |
| }, |
| { |
| "epoch": 17.24941724941725, |
| "grad_norm": 0.3800208568572998, |
| "learning_rate": 0.0003932396501457726, |
| "loss": 3.2306, |
| "step": 59200 |
| }, |
| { |
| "epoch": 17.263986013986013, |
| "grad_norm": 0.40144771337509155, |
| "learning_rate": 0.00039306472303206997, |
| "loss": 3.2401, |
| "step": 59250 |
| }, |
| { |
| "epoch": 17.278554778554778, |
| "grad_norm": 0.3901655375957489, |
| "learning_rate": 0.00039288979591836734, |
| "loss": 3.2329, |
| "step": 59300 |
| }, |
| { |
| "epoch": 17.293123543123542, |
| "grad_norm": 0.37534698843955994, |
| "learning_rate": 0.0003927148688046647, |
| "loss": 3.2392, |
| "step": 59350 |
| }, |
| { |
| "epoch": 17.307692307692307, |
| "grad_norm": 0.3891315460205078, |
| "learning_rate": 0.00039253994169096204, |
| "loss": 3.2275, |
| "step": 59400 |
| }, |
| { |
| "epoch": 17.32226107226107, |
| "grad_norm": 0.38900765776634216, |
| "learning_rate": 0.0003923650145772594, |
| "loss": 3.2505, |
| "step": 59450 |
| }, |
| { |
| "epoch": 17.336829836829835, |
| "grad_norm": 0.40090858936309814, |
| "learning_rate": 0.0003921900874635568, |
| "loss": 3.2417, |
| "step": 59500 |
| }, |
| { |
| "epoch": 17.3513986013986, |
| "grad_norm": 0.4910936653614044, |
| "learning_rate": 0.00039201516034985417, |
| "loss": 3.2483, |
| "step": 59550 |
| }, |
| { |
| "epoch": 17.365967365967364, |
| "grad_norm": 0.38942986726760864, |
| "learning_rate": 0.00039184023323615155, |
| "loss": 3.2518, |
| "step": 59600 |
| }, |
| { |
| "epoch": 17.38053613053613, |
| "grad_norm": 0.39612892270088196, |
| "learning_rate": 0.000391665306122449, |
| "loss": 3.2501, |
| "step": 59650 |
| }, |
| { |
| "epoch": 17.395104895104897, |
| "grad_norm": 0.37708619236946106, |
| "learning_rate": 0.00039149037900874635, |
| "loss": 3.2523, |
| "step": 59700 |
| }, |
| { |
| "epoch": 17.40967365967366, |
| "grad_norm": 0.3658439517021179, |
| "learning_rate": 0.0003913154518950437, |
| "loss": 3.2484, |
| "step": 59750 |
| }, |
| { |
| "epoch": 17.424242424242426, |
| "grad_norm": 0.37678831815719604, |
| "learning_rate": 0.0003911405247813411, |
| "loss": 3.2475, |
| "step": 59800 |
| }, |
| { |
| "epoch": 17.43881118881119, |
| "grad_norm": 0.375609815120697, |
| "learning_rate": 0.0003909655976676384, |
| "loss": 3.2691, |
| "step": 59850 |
| }, |
| { |
| "epoch": 17.453379953379955, |
| "grad_norm": 0.3910381495952606, |
| "learning_rate": 0.0003907906705539358, |
| "loss": 3.2618, |
| "step": 59900 |
| }, |
| { |
| "epoch": 17.46794871794872, |
| "grad_norm": 0.3733561038970947, |
| "learning_rate": 0.0003906157434402332, |
| "loss": 3.2444, |
| "step": 59950 |
| }, |
| { |
| "epoch": 17.482517482517483, |
| "grad_norm": 0.40049925446510315, |
| "learning_rate": 0.00039044081632653055, |
| "loss": 3.268, |
| "step": 60000 |
| }, |
| { |
| "epoch": 17.482517482517483, |
| "eval_accuracy": 0.37249316855874287, |
| "eval_loss": 3.5441601276397705, |
| "eval_runtime": 258.063, |
| "eval_samples_per_second": 64.488, |
| "eval_steps_per_second": 4.034, |
| "step": 60000 |
| }, |
| { |
| "epoch": 17.497086247086248, |
| "grad_norm": 0.3814074695110321, |
| "learning_rate": 0.00039026588921282793, |
| "loss": 3.2626, |
| "step": 60050 |
| }, |
| { |
| "epoch": 17.511655011655012, |
| "grad_norm": 0.39018502831459045, |
| "learning_rate": 0.00039009096209912536, |
| "loss": 3.2578, |
| "step": 60100 |
| }, |
| { |
| "epoch": 17.526223776223777, |
| "grad_norm": 0.41158032417297363, |
| "learning_rate": 0.00038991603498542273, |
| "loss": 3.2538, |
| "step": 60150 |
| }, |
| { |
| "epoch": 17.54079254079254, |
| "grad_norm": 0.3606468141078949, |
| "learning_rate": 0.0003897411078717201, |
| "loss": 3.2476, |
| "step": 60200 |
| }, |
| { |
| "epoch": 17.555361305361306, |
| "grad_norm": 0.3680814802646637, |
| "learning_rate": 0.0003895661807580175, |
| "loss": 3.2732, |
| "step": 60250 |
| }, |
| { |
| "epoch": 17.56993006993007, |
| "grad_norm": 0.4254201054573059, |
| "learning_rate": 0.0003893912536443148, |
| "loss": 3.2638, |
| "step": 60300 |
| }, |
| { |
| "epoch": 17.584498834498834, |
| "grad_norm": 0.3743131160736084, |
| "learning_rate": 0.0003892163265306122, |
| "loss": 3.2699, |
| "step": 60350 |
| }, |
| { |
| "epoch": 17.5990675990676, |
| "grad_norm": 0.40694037079811096, |
| "learning_rate": 0.00038904139941690956, |
| "loss": 3.2678, |
| "step": 60400 |
| }, |
| { |
| "epoch": 17.613636363636363, |
| "grad_norm": 0.3833000957965851, |
| "learning_rate": 0.00038886647230320693, |
| "loss": 3.2568, |
| "step": 60450 |
| }, |
| { |
| "epoch": 17.628205128205128, |
| "grad_norm": 0.38820087909698486, |
| "learning_rate": 0.00038869154518950436, |
| "loss": 3.2669, |
| "step": 60500 |
| }, |
| { |
| "epoch": 17.642773892773892, |
| "grad_norm": 0.3931123614311218, |
| "learning_rate": 0.00038851661807580174, |
| "loss": 3.2692, |
| "step": 60550 |
| }, |
| { |
| "epoch": 17.657342657342657, |
| "grad_norm": 0.4032643735408783, |
| "learning_rate": 0.0003883416909620991, |
| "loss": 3.2646, |
| "step": 60600 |
| }, |
| { |
| "epoch": 17.67191142191142, |
| "grad_norm": 0.3966495394706726, |
| "learning_rate": 0.0003881667638483965, |
| "loss": 3.2693, |
| "step": 60650 |
| }, |
| { |
| "epoch": 17.686480186480185, |
| "grad_norm": 0.3854696452617645, |
| "learning_rate": 0.00038799183673469387, |
| "loss": 3.2577, |
| "step": 60700 |
| }, |
| { |
| "epoch": 17.70104895104895, |
| "grad_norm": 0.4019714891910553, |
| "learning_rate": 0.0003878169096209912, |
| "loss": 3.2724, |
| "step": 60750 |
| }, |
| { |
| "epoch": 17.715617715617714, |
| "grad_norm": 0.3742389380931854, |
| "learning_rate": 0.00038764198250728856, |
| "loss": 3.2664, |
| "step": 60800 |
| }, |
| { |
| "epoch": 17.73018648018648, |
| "grad_norm": 0.40025609731674194, |
| "learning_rate": 0.00038746705539358594, |
| "loss": 3.2683, |
| "step": 60850 |
| }, |
| { |
| "epoch": 17.744755244755243, |
| "grad_norm": 0.36912432312965393, |
| "learning_rate": 0.0003872921282798833, |
| "loss": 3.2726, |
| "step": 60900 |
| }, |
| { |
| "epoch": 17.759324009324008, |
| "grad_norm": 0.37890687584877014, |
| "learning_rate": 0.00038711720116618075, |
| "loss": 3.2663, |
| "step": 60950 |
| }, |
| { |
| "epoch": 17.773892773892776, |
| "grad_norm": 0.38784855604171753, |
| "learning_rate": 0.0003869422740524781, |
| "loss": 3.2783, |
| "step": 61000 |
| }, |
| { |
| "epoch": 17.773892773892776, |
| "eval_accuracy": 0.37300621749935503, |
| "eval_loss": 3.5378918647766113, |
| "eval_runtime": 180.0537, |
| "eval_samples_per_second": 92.428, |
| "eval_steps_per_second": 5.782, |
| "step": 61000 |
| }, |
| { |
| "epoch": 17.78846153846154, |
| "grad_norm": 0.3923535943031311, |
| "learning_rate": 0.0003867673469387755, |
| "loss": 3.2814, |
| "step": 61050 |
| }, |
| { |
| "epoch": 17.803030303030305, |
| "grad_norm": 0.39278125762939453, |
| "learning_rate": 0.0003865924198250729, |
| "loss": 3.275, |
| "step": 61100 |
| }, |
| { |
| "epoch": 17.81759906759907, |
| "grad_norm": 0.3636791706085205, |
| "learning_rate": 0.00038641749271137025, |
| "loss": 3.2685, |
| "step": 61150 |
| }, |
| { |
| "epoch": 17.832167832167833, |
| "grad_norm": 0.3580476939678192, |
| "learning_rate": 0.00038624256559766757, |
| "loss": 3.2727, |
| "step": 61200 |
| }, |
| { |
| "epoch": 17.846736596736598, |
| "grad_norm": 0.3807421922683716, |
| "learning_rate": 0.00038606763848396495, |
| "loss": 3.2808, |
| "step": 61250 |
| }, |
| { |
| "epoch": 17.861305361305362, |
| "grad_norm": 0.4115810990333557, |
| "learning_rate": 0.0003858927113702623, |
| "loss": 3.2744, |
| "step": 61300 |
| }, |
| { |
| "epoch": 17.875874125874127, |
| "grad_norm": 0.3622573912143707, |
| "learning_rate": 0.00038571778425655975, |
| "loss": 3.2848, |
| "step": 61350 |
| }, |
| { |
| "epoch": 17.89044289044289, |
| "grad_norm": 0.41785427927970886, |
| "learning_rate": 0.00038554285714285713, |
| "loss": 3.2763, |
| "step": 61400 |
| }, |
| { |
| "epoch": 17.905011655011656, |
| "grad_norm": 0.40546101331710815, |
| "learning_rate": 0.0003853679300291545, |
| "loss": 3.2897, |
| "step": 61450 |
| }, |
| { |
| "epoch": 17.91958041958042, |
| "grad_norm": 0.36965155601501465, |
| "learning_rate": 0.0003851930029154519, |
| "loss": 3.2803, |
| "step": 61500 |
| }, |
| { |
| "epoch": 17.934149184149184, |
| "grad_norm": 0.38562914729118347, |
| "learning_rate": 0.00038501807580174926, |
| "loss": 3.2726, |
| "step": 61550 |
| }, |
| { |
| "epoch": 17.94871794871795, |
| "grad_norm": 0.4070318937301636, |
| "learning_rate": 0.00038484314868804663, |
| "loss": 3.2823, |
| "step": 61600 |
| }, |
| { |
| "epoch": 17.963286713286713, |
| "grad_norm": 0.36294493079185486, |
| "learning_rate": 0.00038466822157434395, |
| "loss": 3.2837, |
| "step": 61650 |
| }, |
| { |
| "epoch": 17.977855477855478, |
| "grad_norm": 0.402779757976532, |
| "learning_rate": 0.00038449329446064133, |
| "loss": 3.2822, |
| "step": 61700 |
| }, |
| { |
| "epoch": 17.992424242424242, |
| "grad_norm": 0.3932071626186371, |
| "learning_rate": 0.0003843183673469387, |
| "loss": 3.2756, |
| "step": 61750 |
| }, |
| { |
| "epoch": 18.006993006993007, |
| "grad_norm": 0.4182881712913513, |
| "learning_rate": 0.00038414344023323613, |
| "loss": 3.2423, |
| "step": 61800 |
| }, |
| { |
| "epoch": 18.02156177156177, |
| "grad_norm": 0.41027215123176575, |
| "learning_rate": 0.0003839685131195335, |
| "loss": 3.1628, |
| "step": 61850 |
| }, |
| { |
| "epoch": 18.036130536130536, |
| "grad_norm": 0.3935529291629791, |
| "learning_rate": 0.0003837935860058309, |
| "loss": 3.1793, |
| "step": 61900 |
| }, |
| { |
| "epoch": 18.0506993006993, |
| "grad_norm": 0.4026806950569153, |
| "learning_rate": 0.00038361865889212826, |
| "loss": 3.1907, |
| "step": 61950 |
| }, |
| { |
| "epoch": 18.065268065268064, |
| "grad_norm": 0.39838963747024536, |
| "learning_rate": 0.00038344373177842564, |
| "loss": 3.1921, |
| "step": 62000 |
| }, |
| { |
| "epoch": 18.065268065268064, |
| "eval_accuracy": 0.3720812477613639, |
| "eval_loss": 3.5565507411956787, |
| "eval_runtime": 180.0846, |
| "eval_samples_per_second": 92.412, |
| "eval_steps_per_second": 5.781, |
| "step": 62000 |
| }, |
| { |
| "epoch": 18.07983682983683, |
| "grad_norm": 0.3916713297367096, |
| "learning_rate": 0.000383268804664723, |
| "loss": 3.1856, |
| "step": 62050 |
| }, |
| { |
| "epoch": 18.094405594405593, |
| "grad_norm": 0.3870500922203064, |
| "learning_rate": 0.00038309387755102034, |
| "loss": 3.1768, |
| "step": 62100 |
| }, |
| { |
| "epoch": 18.108974358974358, |
| "grad_norm": 0.43737277388572693, |
| "learning_rate": 0.0003829189504373177, |
| "loss": 3.2043, |
| "step": 62150 |
| }, |
| { |
| "epoch": 18.123543123543122, |
| "grad_norm": 0.3725016415119171, |
| "learning_rate": 0.0003827440233236151, |
| "loss": 3.1973, |
| "step": 62200 |
| }, |
| { |
| "epoch": 18.138111888111887, |
| "grad_norm": 0.39706695079803467, |
| "learning_rate": 0.0003825690962099125, |
| "loss": 3.2012, |
| "step": 62250 |
| }, |
| { |
| "epoch": 18.15268065268065, |
| "grad_norm": 0.4028768241405487, |
| "learning_rate": 0.0003823941690962099, |
| "loss": 3.2017, |
| "step": 62300 |
| }, |
| { |
| "epoch": 18.16724941724942, |
| "grad_norm": 0.386943519115448, |
| "learning_rate": 0.00038221924198250727, |
| "loss": 3.2188, |
| "step": 62350 |
| }, |
| { |
| "epoch": 18.181818181818183, |
| "grad_norm": 0.37898704409599304, |
| "learning_rate": 0.00038204431486880464, |
| "loss": 3.2086, |
| "step": 62400 |
| }, |
| { |
| "epoch": 18.196386946386948, |
| "grad_norm": 0.39674118161201477, |
| "learning_rate": 0.000381869387755102, |
| "loss": 3.226, |
| "step": 62450 |
| }, |
| { |
| "epoch": 18.210955710955712, |
| "grad_norm": 0.38656550645828247, |
| "learning_rate": 0.0003816944606413994, |
| "loss": 3.2116, |
| "step": 62500 |
| }, |
| { |
| "epoch": 18.225524475524477, |
| "grad_norm": 0.434226393699646, |
| "learning_rate": 0.0003815195335276967, |
| "loss": 3.2205, |
| "step": 62550 |
| }, |
| { |
| "epoch": 18.24009324009324, |
| "grad_norm": 0.38330623507499695, |
| "learning_rate": 0.0003813446064139941, |
| "loss": 3.2152, |
| "step": 62600 |
| }, |
| { |
| "epoch": 18.254662004662006, |
| "grad_norm": 0.3713931739330292, |
| "learning_rate": 0.0003811696793002915, |
| "loss": 3.2287, |
| "step": 62650 |
| }, |
| { |
| "epoch": 18.26923076923077, |
| "grad_norm": 0.4108925759792328, |
| "learning_rate": 0.0003809947521865889, |
| "loss": 3.2264, |
| "step": 62700 |
| }, |
| { |
| "epoch": 18.283799533799534, |
| "grad_norm": 0.40365439653396606, |
| "learning_rate": 0.0003808198250728863, |
| "loss": 3.2186, |
| "step": 62750 |
| }, |
| { |
| "epoch": 18.2983682983683, |
| "grad_norm": 0.36954963207244873, |
| "learning_rate": 0.00038064489795918365, |
| "loss": 3.2303, |
| "step": 62800 |
| }, |
| { |
| "epoch": 18.312937062937063, |
| "grad_norm": 0.3932103216648102, |
| "learning_rate": 0.000380469970845481, |
| "loss": 3.2229, |
| "step": 62850 |
| }, |
| { |
| "epoch": 18.327505827505828, |
| "grad_norm": 0.3911256194114685, |
| "learning_rate": 0.0003802950437317784, |
| "loss": 3.2326, |
| "step": 62900 |
| }, |
| { |
| "epoch": 18.342074592074592, |
| "grad_norm": 0.3954258859157562, |
| "learning_rate": 0.0003801201166180758, |
| "loss": 3.2245, |
| "step": 62950 |
| }, |
| { |
| "epoch": 18.356643356643357, |
| "grad_norm": 0.39383465051651, |
| "learning_rate": 0.0003799451895043731, |
| "loss": 3.2344, |
| "step": 63000 |
| }, |
| { |
| "epoch": 18.356643356643357, |
| "eval_accuracy": 0.372702597887927, |
| "eval_loss": 3.5481696128845215, |
| "eval_runtime": 180.1418, |
| "eval_samples_per_second": 92.383, |
| "eval_steps_per_second": 5.779, |
| "step": 63000 |
| }, |
| { |
| "epoch": 18.37121212121212, |
| "grad_norm": 0.3588085472583771, |
| "learning_rate": 0.0003797702623906705, |
| "loss": 3.241, |
| "step": 63050 |
| }, |
| { |
| "epoch": 18.385780885780886, |
| "grad_norm": 0.3684539198875427, |
| "learning_rate": 0.0003795953352769679, |
| "loss": 3.2332, |
| "step": 63100 |
| }, |
| { |
| "epoch": 18.40034965034965, |
| "grad_norm": 0.3738718032836914, |
| "learning_rate": 0.0003794204081632653, |
| "loss": 3.2348, |
| "step": 63150 |
| }, |
| { |
| "epoch": 18.414918414918414, |
| "grad_norm": 0.41179683804512024, |
| "learning_rate": 0.00037924548104956266, |
| "loss": 3.2366, |
| "step": 63200 |
| }, |
| { |
| "epoch": 18.42948717948718, |
| "grad_norm": 0.3886624574661255, |
| "learning_rate": 0.00037907055393586003, |
| "loss": 3.2426, |
| "step": 63250 |
| }, |
| { |
| "epoch": 18.444055944055943, |
| "grad_norm": 0.40785521268844604, |
| "learning_rate": 0.0003788956268221574, |
| "loss": 3.2447, |
| "step": 63300 |
| }, |
| { |
| "epoch": 18.458624708624708, |
| "grad_norm": 0.36944928765296936, |
| "learning_rate": 0.0003787206997084548, |
| "loss": 3.2366, |
| "step": 63350 |
| }, |
| { |
| "epoch": 18.473193473193472, |
| "grad_norm": 0.4326910078525543, |
| "learning_rate": 0.00037854577259475216, |
| "loss": 3.2408, |
| "step": 63400 |
| }, |
| { |
| "epoch": 18.487762237762237, |
| "grad_norm": 0.3922458291053772, |
| "learning_rate": 0.0003783708454810495, |
| "loss": 3.2536, |
| "step": 63450 |
| }, |
| { |
| "epoch": 18.502331002331, |
| "grad_norm": 0.404136598110199, |
| "learning_rate": 0.00037819591836734686, |
| "loss": 3.2394, |
| "step": 63500 |
| }, |
| { |
| "epoch": 18.516899766899765, |
| "grad_norm": 0.37804481387138367, |
| "learning_rate": 0.0003780209912536443, |
| "loss": 3.2318, |
| "step": 63550 |
| }, |
| { |
| "epoch": 18.53146853146853, |
| "grad_norm": 0.38064321875572205, |
| "learning_rate": 0.00037784606413994166, |
| "loss": 3.25, |
| "step": 63600 |
| }, |
| { |
| "epoch": 18.546037296037294, |
| "grad_norm": 0.4081238806247711, |
| "learning_rate": 0.00037767113702623904, |
| "loss": 3.2407, |
| "step": 63650 |
| }, |
| { |
| "epoch": 18.560606060606062, |
| "grad_norm": 0.4321068227291107, |
| "learning_rate": 0.0003774962099125364, |
| "loss": 3.2478, |
| "step": 63700 |
| }, |
| { |
| "epoch": 18.575174825174827, |
| "grad_norm": 0.38897502422332764, |
| "learning_rate": 0.0003773212827988338, |
| "loss": 3.24, |
| "step": 63750 |
| }, |
| { |
| "epoch": 18.58974358974359, |
| "grad_norm": 0.39090046286582947, |
| "learning_rate": 0.00037714635568513117, |
| "loss": 3.2659, |
| "step": 63800 |
| }, |
| { |
| "epoch": 18.604312354312356, |
| "grad_norm": 0.3871738016605377, |
| "learning_rate": 0.00037697142857142854, |
| "loss": 3.2508, |
| "step": 63850 |
| }, |
| { |
| "epoch": 18.61888111888112, |
| "grad_norm": 0.4042744040489197, |
| "learning_rate": 0.00037679650145772586, |
| "loss": 3.2437, |
| "step": 63900 |
| }, |
| { |
| "epoch": 18.633449883449885, |
| "grad_norm": 0.3949747085571289, |
| "learning_rate": 0.0003766215743440233, |
| "loss": 3.2632, |
| "step": 63950 |
| }, |
| { |
| "epoch": 18.64801864801865, |
| "grad_norm": 0.39369118213653564, |
| "learning_rate": 0.00037644664723032067, |
| "loss": 3.2715, |
| "step": 64000 |
| }, |
| { |
| "epoch": 18.64801864801865, |
| "eval_accuracy": 0.3731554403060561, |
| "eval_loss": 3.5396413803100586, |
| "eval_runtime": 180.0063, |
| "eval_samples_per_second": 92.452, |
| "eval_steps_per_second": 5.783, |
| "step": 64000 |
| }, |
| { |
| "epoch": 18.662587412587413, |
| "grad_norm": 0.3882533609867096, |
| "learning_rate": 0.00037627172011661805, |
| "loss": 3.248, |
| "step": 64050 |
| }, |
| { |
| "epoch": 18.677156177156178, |
| "grad_norm": 0.38155344128608704, |
| "learning_rate": 0.0003760967930029154, |
| "loss": 3.258, |
| "step": 64100 |
| }, |
| { |
| "epoch": 18.691724941724942, |
| "grad_norm": 0.4137474298477173, |
| "learning_rate": 0.0003759218658892128, |
| "loss": 3.259, |
| "step": 64150 |
| }, |
| { |
| "epoch": 18.706293706293707, |
| "grad_norm": 0.38933834433555603, |
| "learning_rate": 0.00037574693877551017, |
| "loss": 3.2653, |
| "step": 64200 |
| }, |
| { |
| "epoch": 18.72086247086247, |
| "grad_norm": 0.3694373071193695, |
| "learning_rate": 0.00037557201166180755, |
| "loss": 3.2588, |
| "step": 64250 |
| }, |
| { |
| "epoch": 18.735431235431236, |
| "grad_norm": 0.36894160509109497, |
| "learning_rate": 0.000375397084548105, |
| "loss": 3.2735, |
| "step": 64300 |
| }, |
| { |
| "epoch": 18.75, |
| "grad_norm": 0.38931140303611755, |
| "learning_rate": 0.00037522215743440225, |
| "loss": 3.2714, |
| "step": 64350 |
| }, |
| { |
| "epoch": 18.764568764568764, |
| "grad_norm": 0.38522276282310486, |
| "learning_rate": 0.0003750472303206997, |
| "loss": 3.2577, |
| "step": 64400 |
| }, |
| { |
| "epoch": 18.77913752913753, |
| "grad_norm": 0.38880297541618347, |
| "learning_rate": 0.00037487230320699705, |
| "loss": 3.251, |
| "step": 64450 |
| }, |
| { |
| "epoch": 18.793706293706293, |
| "grad_norm": 0.3946090340614319, |
| "learning_rate": 0.00037469737609329443, |
| "loss": 3.2674, |
| "step": 64500 |
| }, |
| { |
| "epoch": 18.808275058275058, |
| "grad_norm": 0.38031676411628723, |
| "learning_rate": 0.0003745224489795918, |
| "loss": 3.2692, |
| "step": 64550 |
| }, |
| { |
| "epoch": 18.822843822843822, |
| "grad_norm": 0.4162115156650543, |
| "learning_rate": 0.0003743475218658892, |
| "loss": 3.2606, |
| "step": 64600 |
| }, |
| { |
| "epoch": 18.837412587412587, |
| "grad_norm": 0.37284985184669495, |
| "learning_rate": 0.00037417259475218655, |
| "loss": 3.2631, |
| "step": 64650 |
| }, |
| { |
| "epoch": 18.85198135198135, |
| "grad_norm": 0.37252700328826904, |
| "learning_rate": 0.00037399766763848393, |
| "loss": 3.2592, |
| "step": 64700 |
| }, |
| { |
| "epoch": 18.866550116550115, |
| "grad_norm": 0.39544638991355896, |
| "learning_rate": 0.00037382274052478136, |
| "loss": 3.2621, |
| "step": 64750 |
| }, |
| { |
| "epoch": 18.88111888111888, |
| "grad_norm": 0.3822038769721985, |
| "learning_rate": 0.0003736478134110787, |
| "loss": 3.2757, |
| "step": 64800 |
| }, |
| { |
| "epoch": 18.895687645687644, |
| "grad_norm": 0.37755459547042847, |
| "learning_rate": 0.00037347288629737606, |
| "loss": 3.2773, |
| "step": 64850 |
| }, |
| { |
| "epoch": 18.91025641025641, |
| "grad_norm": 0.37911224365234375, |
| "learning_rate": 0.00037329795918367343, |
| "loss": 3.2818, |
| "step": 64900 |
| }, |
| { |
| "epoch": 18.924825174825173, |
| "grad_norm": 0.380943238735199, |
| "learning_rate": 0.0003731230320699708, |
| "loss": 3.2727, |
| "step": 64950 |
| }, |
| { |
| "epoch": 18.939393939393938, |
| "grad_norm": 0.3881559669971466, |
| "learning_rate": 0.0003729481049562682, |
| "loss": 3.2683, |
| "step": 65000 |
| }, |
| { |
| "epoch": 18.939393939393938, |
| "eval_accuracy": 0.373618630720237, |
| "eval_loss": 3.5344443321228027, |
| "eval_runtime": 180.0973, |
| "eval_samples_per_second": 92.406, |
| "eval_steps_per_second": 5.78, |
| "step": 65000 |
| }, |
| { |
| "epoch": 18.953962703962706, |
| "grad_norm": 0.37991875410079956, |
| "learning_rate": 0.00037277317784256556, |
| "loss": 3.2692, |
| "step": 65050 |
| }, |
| { |
| "epoch": 18.96853146853147, |
| "grad_norm": 0.38167208433151245, |
| "learning_rate": 0.00037259825072886294, |
| "loss": 3.2683, |
| "step": 65100 |
| }, |
| { |
| "epoch": 18.983100233100235, |
| "grad_norm": 0.4169837236404419, |
| "learning_rate": 0.00037242332361516037, |
| "loss": 3.2738, |
| "step": 65150 |
| }, |
| { |
| "epoch": 18.997668997669, |
| "grad_norm": 0.377718985080719, |
| "learning_rate": 0.00037224839650145774, |
| "loss": 3.2681, |
| "step": 65200 |
| }, |
| { |
| "epoch": 19.012237762237763, |
| "grad_norm": 0.3844805061817169, |
| "learning_rate": 0.00037207346938775506, |
| "loss": 3.1864, |
| "step": 65250 |
| }, |
| { |
| "epoch": 19.026806526806528, |
| "grad_norm": 0.3732326626777649, |
| "learning_rate": 0.00037189854227405244, |
| "loss": 3.1594, |
| "step": 65300 |
| }, |
| { |
| "epoch": 19.041375291375292, |
| "grad_norm": 0.4207264184951782, |
| "learning_rate": 0.0003717236151603498, |
| "loss": 3.1827, |
| "step": 65350 |
| }, |
| { |
| "epoch": 19.055944055944057, |
| "grad_norm": 0.3737161457538605, |
| "learning_rate": 0.0003715486880466472, |
| "loss": 3.1722, |
| "step": 65400 |
| }, |
| { |
| "epoch": 19.07051282051282, |
| "grad_norm": 0.39451220631599426, |
| "learning_rate": 0.00037137376093294457, |
| "loss": 3.1706, |
| "step": 65450 |
| }, |
| { |
| "epoch": 19.085081585081586, |
| "grad_norm": 0.3852495849132538, |
| "learning_rate": 0.00037119883381924194, |
| "loss": 3.18, |
| "step": 65500 |
| }, |
| { |
| "epoch": 19.09965034965035, |
| "grad_norm": 0.39817318320274353, |
| "learning_rate": 0.0003710239067055393, |
| "loss": 3.1732, |
| "step": 65550 |
| }, |
| { |
| "epoch": 19.114219114219114, |
| "grad_norm": 0.3873465955257416, |
| "learning_rate": 0.00037084897959183675, |
| "loss": 3.1912, |
| "step": 65600 |
| }, |
| { |
| "epoch": 19.12878787878788, |
| "grad_norm": 0.4169006049633026, |
| "learning_rate": 0.0003706740524781341, |
| "loss": 3.1931, |
| "step": 65650 |
| }, |
| { |
| "epoch": 19.143356643356643, |
| "grad_norm": 0.39785993099212646, |
| "learning_rate": 0.00037049912536443145, |
| "loss": 3.1855, |
| "step": 65700 |
| }, |
| { |
| "epoch": 19.157925407925408, |
| "grad_norm": 0.40643396973609924, |
| "learning_rate": 0.0003703241982507288, |
| "loss": 3.1954, |
| "step": 65750 |
| }, |
| { |
| "epoch": 19.172494172494172, |
| "grad_norm": 0.37612253427505493, |
| "learning_rate": 0.0003701492711370262, |
| "loss": 3.1991, |
| "step": 65800 |
| }, |
| { |
| "epoch": 19.187062937062937, |
| "grad_norm": 0.3855384290218353, |
| "learning_rate": 0.0003699743440233236, |
| "loss": 3.1975, |
| "step": 65850 |
| }, |
| { |
| "epoch": 19.2016317016317, |
| "grad_norm": 0.3889921009540558, |
| "learning_rate": 0.00036979941690962095, |
| "loss": 3.2026, |
| "step": 65900 |
| }, |
| { |
| "epoch": 19.216200466200466, |
| "grad_norm": 0.3912866413593292, |
| "learning_rate": 0.0003696244897959183, |
| "loss": 3.2027, |
| "step": 65950 |
| }, |
| { |
| "epoch": 19.23076923076923, |
| "grad_norm": 0.38603660464286804, |
| "learning_rate": 0.0003694495626822157, |
| "loss": 3.2061, |
| "step": 66000 |
| }, |
| { |
| "epoch": 19.23076923076923, |
| "eval_accuracy": 0.37278585221979804, |
| "eval_loss": 3.546868085861206, |
| "eval_runtime": 180.1726, |
| "eval_samples_per_second": 92.367, |
| "eval_steps_per_second": 5.778, |
| "step": 66000 |
| }, |
| { |
| "epoch": 19.245337995337994, |
| "grad_norm": 0.4257270395755768, |
| "learning_rate": 0.00036927463556851313, |
| "loss": 3.1954, |
| "step": 66050 |
| }, |
| { |
| "epoch": 19.25990675990676, |
| "grad_norm": 0.3850635290145874, |
| "learning_rate": 0.0003690997084548105, |
| "loss": 3.1969, |
| "step": 66100 |
| }, |
| { |
| "epoch": 19.274475524475523, |
| "grad_norm": 0.3987645208835602, |
| "learning_rate": 0.00036892478134110783, |
| "loss": 3.21, |
| "step": 66150 |
| }, |
| { |
| "epoch": 19.289044289044288, |
| "grad_norm": 0.3949647843837738, |
| "learning_rate": 0.0003687498542274052, |
| "loss": 3.2137, |
| "step": 66200 |
| }, |
| { |
| "epoch": 19.303613053613052, |
| "grad_norm": 0.3926634192466736, |
| "learning_rate": 0.0003685749271137026, |
| "loss": 3.2339, |
| "step": 66250 |
| }, |
| { |
| "epoch": 19.318181818181817, |
| "grad_norm": 0.379131942987442, |
| "learning_rate": 0.00036839999999999996, |
| "loss": 3.2059, |
| "step": 66300 |
| }, |
| { |
| "epoch": 19.33275058275058, |
| "grad_norm": 0.39936667680740356, |
| "learning_rate": 0.00036822507288629733, |
| "loss": 3.2171, |
| "step": 66350 |
| }, |
| { |
| "epoch": 19.34731934731935, |
| "grad_norm": 0.401324599981308, |
| "learning_rate": 0.0003680501457725947, |
| "loss": 3.2303, |
| "step": 66400 |
| }, |
| { |
| "epoch": 19.361888111888113, |
| "grad_norm": 0.4343133270740509, |
| "learning_rate": 0.00036787521865889214, |
| "loss": 3.2231, |
| "step": 66450 |
| }, |
| { |
| "epoch": 19.376456876456878, |
| "grad_norm": 0.408704549074173, |
| "learning_rate": 0.0003677002915451895, |
| "loss": 3.219, |
| "step": 66500 |
| }, |
| { |
| "epoch": 19.391025641025642, |
| "grad_norm": 0.4143206775188446, |
| "learning_rate": 0.0003675253644314869, |
| "loss": 3.23, |
| "step": 66550 |
| }, |
| { |
| "epoch": 19.405594405594407, |
| "grad_norm": 0.4262617528438568, |
| "learning_rate": 0.0003673504373177842, |
| "loss": 3.2392, |
| "step": 66600 |
| }, |
| { |
| "epoch": 19.42016317016317, |
| "grad_norm": 0.42696934938430786, |
| "learning_rate": 0.0003671755102040816, |
| "loss": 3.2251, |
| "step": 66650 |
| }, |
| { |
| "epoch": 19.434731934731936, |
| "grad_norm": 0.3789410889148712, |
| "learning_rate": 0.00036700058309037896, |
| "loss": 3.2156, |
| "step": 66700 |
| }, |
| { |
| "epoch": 19.4493006993007, |
| "grad_norm": 0.38625243306159973, |
| "learning_rate": 0.00036682565597667634, |
| "loss": 3.2374, |
| "step": 66750 |
| }, |
| { |
| "epoch": 19.463869463869464, |
| "grad_norm": 0.43200168013572693, |
| "learning_rate": 0.0003666507288629737, |
| "loss": 3.2323, |
| "step": 66800 |
| }, |
| { |
| "epoch": 19.47843822843823, |
| "grad_norm": 0.4356413185596466, |
| "learning_rate": 0.0003664758017492711, |
| "loss": 3.2489, |
| "step": 66850 |
| }, |
| { |
| "epoch": 19.493006993006993, |
| "grad_norm": 0.38407424092292786, |
| "learning_rate": 0.0003663008746355685, |
| "loss": 3.2306, |
| "step": 66900 |
| }, |
| { |
| "epoch": 19.507575757575758, |
| "grad_norm": 0.39188191294670105, |
| "learning_rate": 0.0003661259475218659, |
| "loss": 3.2295, |
| "step": 66950 |
| }, |
| { |
| "epoch": 19.522144522144522, |
| "grad_norm": 0.39774879813194275, |
| "learning_rate": 0.00036595102040816327, |
| "loss": 3.2432, |
| "step": 67000 |
| }, |
| { |
| "epoch": 19.522144522144522, |
| "eval_accuracy": 0.373129217543334, |
| "eval_loss": 3.542863368988037, |
| "eval_runtime": 180.0798, |
| "eval_samples_per_second": 92.415, |
| "eval_steps_per_second": 5.781, |
| "step": 67000 |
| }, |
| { |
| "epoch": 19.536713286713287, |
| "grad_norm": 0.41966238617897034, |
| "learning_rate": 0.0003657760932944606, |
| "loss": 3.2393, |
| "step": 67050 |
| }, |
| { |
| "epoch": 19.55128205128205, |
| "grad_norm": 0.3979637026786804, |
| "learning_rate": 0.00036560116618075797, |
| "loss": 3.2318, |
| "step": 67100 |
| }, |
| { |
| "epoch": 19.565850815850816, |
| "grad_norm": 0.3898719847202301, |
| "learning_rate": 0.00036542623906705534, |
| "loss": 3.2325, |
| "step": 67150 |
| }, |
| { |
| "epoch": 19.58041958041958, |
| "grad_norm": 0.3797769546508789, |
| "learning_rate": 0.0003652513119533527, |
| "loss": 3.2352, |
| "step": 67200 |
| }, |
| { |
| "epoch": 19.594988344988344, |
| "grad_norm": 0.4092084765434265, |
| "learning_rate": 0.0003650763848396501, |
| "loss": 3.2359, |
| "step": 67250 |
| }, |
| { |
| "epoch": 19.60955710955711, |
| "grad_norm": 0.3929194509983063, |
| "learning_rate": 0.0003649014577259475, |
| "loss": 3.2403, |
| "step": 67300 |
| }, |
| { |
| "epoch": 19.624125874125873, |
| "grad_norm": 0.3677727282047272, |
| "learning_rate": 0.0003647265306122449, |
| "loss": 3.2368, |
| "step": 67350 |
| }, |
| { |
| "epoch": 19.638694638694638, |
| "grad_norm": 0.40046238899230957, |
| "learning_rate": 0.0003645516034985423, |
| "loss": 3.2353, |
| "step": 67400 |
| }, |
| { |
| "epoch": 19.653263403263402, |
| "grad_norm": 0.41111862659454346, |
| "learning_rate": 0.00036437667638483965, |
| "loss": 3.2407, |
| "step": 67450 |
| }, |
| { |
| "epoch": 19.667832167832167, |
| "grad_norm": 0.39563626050949097, |
| "learning_rate": 0.000364201749271137, |
| "loss": 3.2616, |
| "step": 67500 |
| }, |
| { |
| "epoch": 19.68240093240093, |
| "grad_norm": 0.3748413324356079, |
| "learning_rate": 0.00036402682215743435, |
| "loss": 3.2507, |
| "step": 67550 |
| }, |
| { |
| "epoch": 19.696969696969695, |
| "grad_norm": 0.40953975915908813, |
| "learning_rate": 0.0003638518950437317, |
| "loss": 3.2504, |
| "step": 67600 |
| }, |
| { |
| "epoch": 19.71153846153846, |
| "grad_norm": 0.40304210782051086, |
| "learning_rate": 0.0003636769679300291, |
| "loss": 3.2546, |
| "step": 67650 |
| }, |
| { |
| "epoch": 19.726107226107224, |
| "grad_norm": 0.3886299133300781, |
| "learning_rate": 0.0003635020408163265, |
| "loss": 3.2447, |
| "step": 67700 |
| }, |
| { |
| "epoch": 19.740675990675992, |
| "grad_norm": 0.38597121834754944, |
| "learning_rate": 0.0003633271137026239, |
| "loss": 3.2565, |
| "step": 67750 |
| }, |
| { |
| "epoch": 19.755244755244757, |
| "grad_norm": 0.36891189217567444, |
| "learning_rate": 0.0003631521865889213, |
| "loss": 3.2651, |
| "step": 67800 |
| }, |
| { |
| "epoch": 19.76981351981352, |
| "grad_norm": 0.39409512281417847, |
| "learning_rate": 0.00036297725947521866, |
| "loss": 3.253, |
| "step": 67850 |
| }, |
| { |
| "epoch": 19.784382284382286, |
| "grad_norm": 0.4034635126590729, |
| "learning_rate": 0.00036280233236151604, |
| "loss": 3.2613, |
| "step": 67900 |
| }, |
| { |
| "epoch": 19.79895104895105, |
| "grad_norm": 0.3696857690811157, |
| "learning_rate": 0.00036262740524781336, |
| "loss": 3.2566, |
| "step": 67950 |
| }, |
| { |
| "epoch": 19.813519813519815, |
| "grad_norm": 0.4099896550178528, |
| "learning_rate": 0.00036245247813411073, |
| "loss": 3.2545, |
| "step": 68000 |
| }, |
| { |
| "epoch": 19.813519813519815, |
| "eval_accuracy": 0.3735982875007261, |
| "eval_loss": 3.5354998111724854, |
| "eval_runtime": 180.0799, |
| "eval_samples_per_second": 92.415, |
| "eval_steps_per_second": 5.781, |
| "step": 68000 |
| }, |
| { |
| "epoch": 19.82808857808858, |
| "grad_norm": 0.40911927819252014, |
| "learning_rate": 0.0003622775510204081, |
| "loss": 3.2756, |
| "step": 68050 |
| }, |
| { |
| "epoch": 19.842657342657343, |
| "grad_norm": 0.36278921365737915, |
| "learning_rate": 0.0003621026239067055, |
| "loss": 3.2448, |
| "step": 68100 |
| }, |
| { |
| "epoch": 19.857226107226108, |
| "grad_norm": 0.4001983404159546, |
| "learning_rate": 0.00036192769679300286, |
| "loss": 3.2533, |
| "step": 68150 |
| }, |
| { |
| "epoch": 19.871794871794872, |
| "grad_norm": 0.40843650698661804, |
| "learning_rate": 0.0003617527696793003, |
| "loss": 3.2475, |
| "step": 68200 |
| }, |
| { |
| "epoch": 19.886363636363637, |
| "grad_norm": 0.38101208209991455, |
| "learning_rate": 0.00036157784256559767, |
| "loss": 3.2538, |
| "step": 68250 |
| }, |
| { |
| "epoch": 19.9009324009324, |
| "grad_norm": 0.4274073541164398, |
| "learning_rate": 0.00036140291545189504, |
| "loss": 3.2605, |
| "step": 68300 |
| }, |
| { |
| "epoch": 19.915501165501166, |
| "grad_norm": 0.3906668424606323, |
| "learning_rate": 0.0003612279883381924, |
| "loss": 3.2643, |
| "step": 68350 |
| }, |
| { |
| "epoch": 19.93006993006993, |
| "grad_norm": 0.35871779918670654, |
| "learning_rate": 0.00036105306122448974, |
| "loss": 3.2659, |
| "step": 68400 |
| }, |
| { |
| "epoch": 19.944638694638694, |
| "grad_norm": 0.3644237518310547, |
| "learning_rate": 0.0003608781341107871, |
| "loss": 3.2685, |
| "step": 68450 |
| }, |
| { |
| "epoch": 19.95920745920746, |
| "grad_norm": 0.45362988114356995, |
| "learning_rate": 0.0003607032069970845, |
| "loss": 3.2506, |
| "step": 68500 |
| }, |
| { |
| "epoch": 19.973776223776223, |
| "grad_norm": 0.38506874442100525, |
| "learning_rate": 0.00036052827988338187, |
| "loss": 3.2658, |
| "step": 68550 |
| }, |
| { |
| "epoch": 19.988344988344988, |
| "grad_norm": 0.445112407207489, |
| "learning_rate": 0.0003603533527696793, |
| "loss": 3.2594, |
| "step": 68600 |
| }, |
| { |
| "epoch": 20.002913752913752, |
| "grad_norm": 0.3915383219718933, |
| "learning_rate": 0.00036017842565597667, |
| "loss": 3.2511, |
| "step": 68650 |
| }, |
| { |
| "epoch": 20.017482517482517, |
| "grad_norm": 0.39854714274406433, |
| "learning_rate": 0.00036000349854227405, |
| "loss": 3.156, |
| "step": 68700 |
| }, |
| { |
| "epoch": 20.03205128205128, |
| "grad_norm": 0.4197991192340851, |
| "learning_rate": 0.0003598285714285714, |
| "loss": 3.1582, |
| "step": 68750 |
| }, |
| { |
| "epoch": 20.046620046620045, |
| "grad_norm": 0.415543794631958, |
| "learning_rate": 0.0003596536443148688, |
| "loss": 3.1528, |
| "step": 68800 |
| }, |
| { |
| "epoch": 20.06118881118881, |
| "grad_norm": 0.3774547874927521, |
| "learning_rate": 0.0003594787172011661, |
| "loss": 3.1522, |
| "step": 68850 |
| }, |
| { |
| "epoch": 20.075757575757574, |
| "grad_norm": 0.37995073199272156, |
| "learning_rate": 0.0003593037900874635, |
| "loss": 3.1586, |
| "step": 68900 |
| }, |
| { |
| "epoch": 20.09032634032634, |
| "grad_norm": 0.3786678612232208, |
| "learning_rate": 0.00035912886297376087, |
| "loss": 3.1593, |
| "step": 68950 |
| }, |
| { |
| "epoch": 20.104895104895103, |
| "grad_norm": 0.4058922231197357, |
| "learning_rate": 0.00035895393586005825, |
| "loss": 3.1751, |
| "step": 69000 |
| }, |
| { |
| "epoch": 20.104895104895103, |
| "eval_accuracy": 0.373238106683606, |
| "eval_loss": 3.54990553855896, |
| "eval_runtime": 180.2945, |
| "eval_samples_per_second": 92.305, |
| "eval_steps_per_second": 5.774, |
| "step": 69000 |
| }, |
| { |
| "epoch": 20.11946386946387, |
| "grad_norm": 0.4163013994693756, |
| "learning_rate": 0.0003587790087463557, |
| "loss": 3.1827, |
| "step": 69050 |
| }, |
| { |
| "epoch": 20.134032634032636, |
| "grad_norm": 0.3935016691684723, |
| "learning_rate": 0.00035860408163265305, |
| "loss": 3.1895, |
| "step": 69100 |
| }, |
| { |
| "epoch": 20.1486013986014, |
| "grad_norm": 0.3995712399482727, |
| "learning_rate": 0.00035842915451895043, |
| "loss": 3.193, |
| "step": 69150 |
| }, |
| { |
| "epoch": 20.163170163170165, |
| "grad_norm": 0.38892659544944763, |
| "learning_rate": 0.0003582542274052478, |
| "loss": 3.1873, |
| "step": 69200 |
| }, |
| { |
| "epoch": 20.17773892773893, |
| "grad_norm": 0.4174294173717499, |
| "learning_rate": 0.0003580793002915452, |
| "loss": 3.1912, |
| "step": 69250 |
| }, |
| { |
| "epoch": 20.192307692307693, |
| "grad_norm": 0.4112229645252228, |
| "learning_rate": 0.0003579043731778425, |
| "loss": 3.1825, |
| "step": 69300 |
| }, |
| { |
| "epoch": 20.206876456876458, |
| "grad_norm": 0.405185729265213, |
| "learning_rate": 0.0003577294460641399, |
| "loss": 3.1985, |
| "step": 69350 |
| }, |
| { |
| "epoch": 20.221445221445222, |
| "grad_norm": 0.3787309527397156, |
| "learning_rate": 0.00035755451895043725, |
| "loss": 3.1956, |
| "step": 69400 |
| }, |
| { |
| "epoch": 20.236013986013987, |
| "grad_norm": 0.38647204637527466, |
| "learning_rate": 0.00035737959183673463, |
| "loss": 3.1993, |
| "step": 69450 |
| }, |
| { |
| "epoch": 20.25058275058275, |
| "grad_norm": 0.39363858103752136, |
| "learning_rate": 0.00035720466472303206, |
| "loss": 3.1986, |
| "step": 69500 |
| }, |
| { |
| "epoch": 20.265151515151516, |
| "grad_norm": 0.38450032472610474, |
| "learning_rate": 0.00035702973760932944, |
| "loss": 3.2011, |
| "step": 69550 |
| }, |
| { |
| "epoch": 20.27972027972028, |
| "grad_norm": 0.4013044834136963, |
| "learning_rate": 0.0003568548104956268, |
| "loss": 3.2051, |
| "step": 69600 |
| }, |
| { |
| "epoch": 20.294289044289044, |
| "grad_norm": 0.385503888130188, |
| "learning_rate": 0.0003566798833819242, |
| "loss": 3.2117, |
| "step": 69650 |
| }, |
| { |
| "epoch": 20.30885780885781, |
| "grad_norm": 0.39147505164146423, |
| "learning_rate": 0.00035650495626822156, |
| "loss": 3.2079, |
| "step": 69700 |
| }, |
| { |
| "epoch": 20.323426573426573, |
| "grad_norm": 0.4091346561908722, |
| "learning_rate": 0.0003563300291545189, |
| "loss": 3.2091, |
| "step": 69750 |
| }, |
| { |
| "epoch": 20.337995337995338, |
| "grad_norm": 0.38685789704322815, |
| "learning_rate": 0.00035615510204081626, |
| "loss": 3.2058, |
| "step": 69800 |
| }, |
| { |
| "epoch": 20.352564102564102, |
| "grad_norm": 0.3918103873729706, |
| "learning_rate": 0.00035598017492711364, |
| "loss": 3.201, |
| "step": 69850 |
| }, |
| { |
| "epoch": 20.367132867132867, |
| "grad_norm": 0.37603530287742615, |
| "learning_rate": 0.00035580524781341107, |
| "loss": 3.2128, |
| "step": 69900 |
| }, |
| { |
| "epoch": 20.38170163170163, |
| "grad_norm": 0.3842366933822632, |
| "learning_rate": 0.00035563032069970844, |
| "loss": 3.212, |
| "step": 69950 |
| }, |
| { |
| "epoch": 20.396270396270396, |
| "grad_norm": 0.3946133852005005, |
| "learning_rate": 0.0003554553935860058, |
| "loss": 3.2055, |
| "step": 70000 |
| }, |
| { |
| "epoch": 20.396270396270396, |
| "eval_accuracy": 0.3729368388894625, |
| "eval_loss": 3.549053192138672, |
| "eval_runtime": 180.0767, |
| "eval_samples_per_second": 92.416, |
| "eval_steps_per_second": 5.781, |
| "step": 70000 |
| }, |
| { |
| "epoch": 20.41083916083916, |
| "grad_norm": 0.4046531915664673, |
| "learning_rate": 0.0003552804664723032, |
| "loss": 3.2123, |
| "step": 70050 |
| }, |
| { |
| "epoch": 20.425407925407924, |
| "grad_norm": 0.39396217465400696, |
| "learning_rate": 0.00035510553935860057, |
| "loss": 3.2179, |
| "step": 70100 |
| }, |
| { |
| "epoch": 20.43997668997669, |
| "grad_norm": 0.4202251434326172, |
| "learning_rate": 0.00035493061224489795, |
| "loss": 3.2314, |
| "step": 70150 |
| }, |
| { |
| "epoch": 20.454545454545453, |
| "grad_norm": 0.4146822392940521, |
| "learning_rate": 0.00035475568513119527, |
| "loss": 3.2219, |
| "step": 70200 |
| }, |
| { |
| "epoch": 20.469114219114218, |
| "grad_norm": 0.44110000133514404, |
| "learning_rate": 0.00035458075801749264, |
| "loss": 3.2281, |
| "step": 70250 |
| }, |
| { |
| "epoch": 20.483682983682982, |
| "grad_norm": 0.43832480907440186, |
| "learning_rate": 0.00035440583090379, |
| "loss": 3.2258, |
| "step": 70300 |
| }, |
| { |
| "epoch": 20.498251748251747, |
| "grad_norm": 0.4042571783065796, |
| "learning_rate": 0.00035423090379008745, |
| "loss": 3.2261, |
| "step": 70350 |
| }, |
| { |
| "epoch": 20.51282051282051, |
| "grad_norm": 0.4489721953868866, |
| "learning_rate": 0.0003540559766763848, |
| "loss": 3.2331, |
| "step": 70400 |
| }, |
| { |
| "epoch": 20.52738927738928, |
| "grad_norm": 0.4016249179840088, |
| "learning_rate": 0.0003538810495626822, |
| "loss": 3.2284, |
| "step": 70450 |
| }, |
| { |
| "epoch": 20.541958041958043, |
| "grad_norm": 0.5365473628044128, |
| "learning_rate": 0.0003537061224489796, |
| "loss": 3.2255, |
| "step": 70500 |
| }, |
| { |
| "epoch": 20.556526806526808, |
| "grad_norm": 0.4111057221889496, |
| "learning_rate": 0.00035353119533527695, |
| "loss": 3.2203, |
| "step": 70550 |
| }, |
| { |
| "epoch": 20.571095571095572, |
| "grad_norm": 0.4217143654823303, |
| "learning_rate": 0.00035335626822157433, |
| "loss": 3.2344, |
| "step": 70600 |
| }, |
| { |
| "epoch": 20.585664335664337, |
| "grad_norm": 0.41283461451530457, |
| "learning_rate": 0.00035318134110787165, |
| "loss": 3.2391, |
| "step": 70650 |
| }, |
| { |
| "epoch": 20.6002331002331, |
| "grad_norm": 0.4114687442779541, |
| "learning_rate": 0.000353006413994169, |
| "loss": 3.2305, |
| "step": 70700 |
| }, |
| { |
| "epoch": 20.614801864801866, |
| "grad_norm": 0.4181511402130127, |
| "learning_rate": 0.00035283148688046646, |
| "loss": 3.2326, |
| "step": 70750 |
| }, |
| { |
| "epoch": 20.62937062937063, |
| "grad_norm": 0.4368336796760559, |
| "learning_rate": 0.00035265655976676383, |
| "loss": 3.2398, |
| "step": 70800 |
| }, |
| { |
| "epoch": 20.643939393939394, |
| "grad_norm": 0.38254714012145996, |
| "learning_rate": 0.0003524816326530612, |
| "loss": 3.2424, |
| "step": 70850 |
| }, |
| { |
| "epoch": 20.65850815850816, |
| "grad_norm": 0.3811262249946594, |
| "learning_rate": 0.0003523067055393586, |
| "loss": 3.2214, |
| "step": 70900 |
| }, |
| { |
| "epoch": 20.673076923076923, |
| "grad_norm": 0.38296961784362793, |
| "learning_rate": 0.00035213177842565596, |
| "loss": 3.2346, |
| "step": 70950 |
| }, |
| { |
| "epoch": 20.687645687645688, |
| "grad_norm": 0.42211851477622986, |
| "learning_rate": 0.00035195685131195333, |
| "loss": 3.2364, |
| "step": 71000 |
| }, |
| { |
| "epoch": 20.687645687645688, |
| "eval_accuracy": 0.37363732766764873, |
| "eval_loss": 3.5381784439086914, |
| "eval_runtime": 180.1083, |
| "eval_samples_per_second": 92.4, |
| "eval_steps_per_second": 5.78, |
| "step": 71000 |
| }, |
| { |
| "epoch": 20.702214452214452, |
| "grad_norm": 0.39245641231536865, |
| "learning_rate": 0.0003517819241982507, |
| "loss": 3.2294, |
| "step": 71050 |
| }, |
| { |
| "epoch": 20.716783216783217, |
| "grad_norm": 0.4119685888290405, |
| "learning_rate": 0.00035160699708454803, |
| "loss": 3.2396, |
| "step": 71100 |
| }, |
| { |
| "epoch": 20.73135198135198, |
| "grad_norm": 0.41568517684936523, |
| "learning_rate": 0.0003514320699708454, |
| "loss": 3.2331, |
| "step": 71150 |
| }, |
| { |
| "epoch": 20.745920745920746, |
| "grad_norm": 0.4322853684425354, |
| "learning_rate": 0.00035125714285714284, |
| "loss": 3.2411, |
| "step": 71200 |
| }, |
| { |
| "epoch": 20.76048951048951, |
| "grad_norm": 0.41902053356170654, |
| "learning_rate": 0.0003510822157434402, |
| "loss": 3.2489, |
| "step": 71250 |
| }, |
| { |
| "epoch": 20.775058275058274, |
| "grad_norm": 0.4021133482456207, |
| "learning_rate": 0.0003509072886297376, |
| "loss": 3.2422, |
| "step": 71300 |
| }, |
| { |
| "epoch": 20.78962703962704, |
| "grad_norm": 0.3975924849510193, |
| "learning_rate": 0.00035073236151603497, |
| "loss": 3.2368, |
| "step": 71350 |
| }, |
| { |
| "epoch": 20.804195804195803, |
| "grad_norm": 0.38075828552246094, |
| "learning_rate": 0.00035055743440233234, |
| "loss": 3.2414, |
| "step": 71400 |
| }, |
| { |
| "epoch": 20.818764568764568, |
| "grad_norm": 0.4107215702533722, |
| "learning_rate": 0.0003503825072886297, |
| "loss": 3.2484, |
| "step": 71450 |
| }, |
| { |
| "epoch": 20.833333333333332, |
| "grad_norm": 0.39930886030197144, |
| "learning_rate": 0.0003502075801749271, |
| "loss": 3.2399, |
| "step": 71500 |
| }, |
| { |
| "epoch": 20.847902097902097, |
| "grad_norm": 0.43924176692962646, |
| "learning_rate": 0.0003500326530612244, |
| "loss": 3.238, |
| "step": 71550 |
| }, |
| { |
| "epoch": 20.86247086247086, |
| "grad_norm": 0.4071701467037201, |
| "learning_rate": 0.0003498577259475218, |
| "loss": 3.2547, |
| "step": 71600 |
| }, |
| { |
| "epoch": 20.877039627039625, |
| "grad_norm": 0.3740857243537903, |
| "learning_rate": 0.0003496827988338192, |
| "loss": 3.2425, |
| "step": 71650 |
| }, |
| { |
| "epoch": 20.89160839160839, |
| "grad_norm": 0.4022287428379059, |
| "learning_rate": 0.0003495078717201166, |
| "loss": 3.2539, |
| "step": 71700 |
| }, |
| { |
| "epoch": 20.906177156177158, |
| "grad_norm": 0.43108558654785156, |
| "learning_rate": 0.00034933294460641397, |
| "loss": 3.2453, |
| "step": 71750 |
| }, |
| { |
| "epoch": 20.920745920745922, |
| "grad_norm": 0.38944077491760254, |
| "learning_rate": 0.00034915801749271135, |
| "loss": 3.2474, |
| "step": 71800 |
| }, |
| { |
| "epoch": 20.935314685314687, |
| "grad_norm": 0.3937079906463623, |
| "learning_rate": 0.0003489830903790087, |
| "loss": 3.255, |
| "step": 71850 |
| }, |
| { |
| "epoch": 20.94988344988345, |
| "grad_norm": 0.45583540201187134, |
| "learning_rate": 0.0003488081632653061, |
| "loss": 3.2412, |
| "step": 71900 |
| }, |
| { |
| "epoch": 20.964452214452216, |
| "grad_norm": 0.40046775341033936, |
| "learning_rate": 0.0003486332361516035, |
| "loss": 3.2441, |
| "step": 71950 |
| }, |
| { |
| "epoch": 20.97902097902098, |
| "grad_norm": 0.4097670018672943, |
| "learning_rate": 0.0003484583090379008, |
| "loss": 3.2603, |
| "step": 72000 |
| }, |
| { |
| "epoch": 20.97902097902098, |
| "eval_accuracy": 0.374076647136392, |
| "eval_loss": 3.532677173614502, |
| "eval_runtime": 180.1507, |
| "eval_samples_per_second": 92.378, |
| "eval_steps_per_second": 5.778, |
| "step": 72000 |
| }, |
| { |
| "epoch": 20.993589743589745, |
| "grad_norm": 0.4118557274341583, |
| "learning_rate": 0.0003482833819241982, |
| "loss": 3.2535, |
| "step": 72050 |
| }, |
| { |
| "epoch": 21.00815850815851, |
| "grad_norm": 0.4162599444389343, |
| "learning_rate": 0.0003481084548104956, |
| "loss": 3.1958, |
| "step": 72100 |
| }, |
| { |
| "epoch": 21.022727272727273, |
| "grad_norm": 0.40431153774261475, |
| "learning_rate": 0.000347933527696793, |
| "loss": 3.1384, |
| "step": 72150 |
| }, |
| { |
| "epoch": 21.037296037296038, |
| "grad_norm": 0.40858593583106995, |
| "learning_rate": 0.00034775860058309035, |
| "loss": 3.1558, |
| "step": 72200 |
| }, |
| { |
| "epoch": 21.051864801864802, |
| "grad_norm": 0.4380705654621124, |
| "learning_rate": 0.00034758367346938773, |
| "loss": 3.1546, |
| "step": 72250 |
| }, |
| { |
| "epoch": 21.066433566433567, |
| "grad_norm": 0.39787471294403076, |
| "learning_rate": 0.0003474087463556851, |
| "loss": 3.1561, |
| "step": 72300 |
| }, |
| { |
| "epoch": 21.08100233100233, |
| "grad_norm": 0.38657239079475403, |
| "learning_rate": 0.0003472338192419825, |
| "loss": 3.1574, |
| "step": 72350 |
| }, |
| { |
| "epoch": 21.095571095571096, |
| "grad_norm": 0.38952600955963135, |
| "learning_rate": 0.0003470588921282799, |
| "loss": 3.1672, |
| "step": 72400 |
| }, |
| { |
| "epoch": 21.11013986013986, |
| "grad_norm": 0.41245877742767334, |
| "learning_rate": 0.0003468839650145772, |
| "loss": 3.175, |
| "step": 72450 |
| }, |
| { |
| "epoch": 21.124708624708624, |
| "grad_norm": 0.40463095903396606, |
| "learning_rate": 0.0003467090379008746, |
| "loss": 3.1695, |
| "step": 72500 |
| }, |
| { |
| "epoch": 21.13927738927739, |
| "grad_norm": 0.41737812757492065, |
| "learning_rate": 0.000346534110787172, |
| "loss": 3.1721, |
| "step": 72550 |
| }, |
| { |
| "epoch": 21.153846153846153, |
| "grad_norm": 0.5267341136932373, |
| "learning_rate": 0.00034635918367346936, |
| "loss": 3.1717, |
| "step": 72600 |
| }, |
| { |
| "epoch": 21.168414918414918, |
| "grad_norm": 0.37345781922340393, |
| "learning_rate": 0.00034618425655976674, |
| "loss": 3.1725, |
| "step": 72650 |
| }, |
| { |
| "epoch": 21.182983682983682, |
| "grad_norm": 0.4185006618499756, |
| "learning_rate": 0.0003460093294460641, |
| "loss": 3.1749, |
| "step": 72700 |
| }, |
| { |
| "epoch": 21.197552447552447, |
| "grad_norm": 0.3790314495563507, |
| "learning_rate": 0.0003458344023323615, |
| "loss": 3.1786, |
| "step": 72750 |
| }, |
| { |
| "epoch": 21.21212121212121, |
| "grad_norm": 0.39847561717033386, |
| "learning_rate": 0.00034565947521865886, |
| "loss": 3.1873, |
| "step": 72800 |
| }, |
| { |
| "epoch": 21.226689976689975, |
| "grad_norm": 0.4208274781703949, |
| "learning_rate": 0.0003454845481049562, |
| "loss": 3.1832, |
| "step": 72850 |
| }, |
| { |
| "epoch": 21.24125874125874, |
| "grad_norm": 0.411825567483902, |
| "learning_rate": 0.0003453096209912536, |
| "loss": 3.1955, |
| "step": 72900 |
| }, |
| { |
| "epoch": 21.255827505827504, |
| "grad_norm": 0.42742517590522766, |
| "learning_rate": 0.000345134693877551, |
| "loss": 3.1811, |
| "step": 72950 |
| }, |
| { |
| "epoch": 21.27039627039627, |
| "grad_norm": 0.40830036997795105, |
| "learning_rate": 0.00034495976676384837, |
| "loss": 3.1853, |
| "step": 73000 |
| }, |
| { |
| "epoch": 21.27039627039627, |
| "eval_accuracy": 0.37328361434806095, |
| "eval_loss": 3.5470807552337646, |
| "eval_runtime": 180.3065, |
| "eval_samples_per_second": 92.298, |
| "eval_steps_per_second": 5.774, |
| "step": 73000 |
| }, |
| { |
| "epoch": 21.284965034965033, |
| "grad_norm": 0.39452317357063293, |
| "learning_rate": 0.00034478483965014574, |
| "loss": 3.1955, |
| "step": 73050 |
| }, |
| { |
| "epoch": 21.2995337995338, |
| "grad_norm": 0.39910146594047546, |
| "learning_rate": 0.0003446099125364431, |
| "loss": 3.1821, |
| "step": 73100 |
| }, |
| { |
| "epoch": 21.314102564102566, |
| "grad_norm": 0.3868776559829712, |
| "learning_rate": 0.0003444349854227405, |
| "loss": 3.2021, |
| "step": 73150 |
| }, |
| { |
| "epoch": 21.32867132867133, |
| "grad_norm": 0.3973531126976013, |
| "learning_rate": 0.00034426005830903787, |
| "loss": 3.1984, |
| "step": 73200 |
| }, |
| { |
| "epoch": 21.343240093240095, |
| "grad_norm": 0.40514352917671204, |
| "learning_rate": 0.0003440851311953353, |
| "loss": 3.2046, |
| "step": 73250 |
| }, |
| { |
| "epoch": 21.35780885780886, |
| "grad_norm": 0.41221529245376587, |
| "learning_rate": 0.00034391020408163257, |
| "loss": 3.2093, |
| "step": 73300 |
| }, |
| { |
| "epoch": 21.372377622377623, |
| "grad_norm": 0.4092659056186676, |
| "learning_rate": 0.00034373527696793, |
| "loss": 3.2081, |
| "step": 73350 |
| }, |
| { |
| "epoch": 21.386946386946388, |
| "grad_norm": 0.38245689868927, |
| "learning_rate": 0.00034356034985422737, |
| "loss": 3.1956, |
| "step": 73400 |
| }, |
| { |
| "epoch": 21.401515151515152, |
| "grad_norm": 0.4299090802669525, |
| "learning_rate": 0.00034338542274052475, |
| "loss": 3.2053, |
| "step": 73450 |
| }, |
| { |
| "epoch": 21.416083916083917, |
| "grad_norm": 0.37501996755599976, |
| "learning_rate": 0.0003432104956268221, |
| "loss": 3.2119, |
| "step": 73500 |
| }, |
| { |
| "epoch": 21.43065268065268, |
| "grad_norm": 0.4129965305328369, |
| "learning_rate": 0.0003430355685131195, |
| "loss": 3.2079, |
| "step": 73550 |
| }, |
| { |
| "epoch": 21.445221445221446, |
| "grad_norm": 0.4159983694553375, |
| "learning_rate": 0.0003428606413994169, |
| "loss": 3.2124, |
| "step": 73600 |
| }, |
| { |
| "epoch": 21.45979020979021, |
| "grad_norm": 0.4268476665019989, |
| "learning_rate": 0.00034268571428571425, |
| "loss": 3.2133, |
| "step": 73650 |
| }, |
| { |
| "epoch": 21.474358974358974, |
| "grad_norm": 0.40658941864967346, |
| "learning_rate": 0.0003425107871720117, |
| "loss": 3.2121, |
| "step": 73700 |
| }, |
| { |
| "epoch": 21.48892773892774, |
| "grad_norm": 0.4047592580318451, |
| "learning_rate": 0.00034233586005830895, |
| "loss": 3.204, |
| "step": 73750 |
| }, |
| { |
| "epoch": 21.503496503496503, |
| "grad_norm": 0.39788851141929626, |
| "learning_rate": 0.0003421609329446064, |
| "loss": 3.212, |
| "step": 73800 |
| }, |
| { |
| "epoch": 21.518065268065268, |
| "grad_norm": 0.4571487307548523, |
| "learning_rate": 0.00034198600583090375, |
| "loss": 3.2139, |
| "step": 73850 |
| }, |
| { |
| "epoch": 21.532634032634032, |
| "grad_norm": 0.4046323597431183, |
| "learning_rate": 0.00034181107871720113, |
| "loss": 3.2159, |
| "step": 73900 |
| }, |
| { |
| "epoch": 21.547202797202797, |
| "grad_norm": 0.4000145196914673, |
| "learning_rate": 0.0003416361516034985, |
| "loss": 3.2115, |
| "step": 73950 |
| }, |
| { |
| "epoch": 21.56177156177156, |
| "grad_norm": 0.39644888043403625, |
| "learning_rate": 0.0003414612244897959, |
| "loss": 3.2229, |
| "step": 74000 |
| }, |
| { |
| "epoch": 21.56177156177156, |
| "eval_accuracy": 0.3736430896199957, |
| "eval_loss": 3.542844772338867, |
| "eval_runtime": 180.201, |
| "eval_samples_per_second": 92.352, |
| "eval_steps_per_second": 5.777, |
| "step": 74000 |
| }, |
| { |
| "epoch": 21.576340326340326, |
| "grad_norm": 0.38864246010780334, |
| "learning_rate": 0.00034128629737609326, |
| "loss": 3.2182, |
| "step": 74050 |
| }, |
| { |
| "epoch": 21.59090909090909, |
| "grad_norm": 0.39571088552474976, |
| "learning_rate": 0.00034111137026239063, |
| "loss": 3.2163, |
| "step": 74100 |
| }, |
| { |
| "epoch": 21.605477855477854, |
| "grad_norm": 0.40646892786026, |
| "learning_rate": 0.00034093644314868806, |
| "loss": 3.2255, |
| "step": 74150 |
| }, |
| { |
| "epoch": 21.62004662004662, |
| "grad_norm": 0.41352179646492004, |
| "learning_rate": 0.0003407615160349854, |
| "loss": 3.2206, |
| "step": 74200 |
| }, |
| { |
| "epoch": 21.634615384615383, |
| "grad_norm": 0.4505552053451538, |
| "learning_rate": 0.00034058658892128276, |
| "loss": 3.2268, |
| "step": 74250 |
| }, |
| { |
| "epoch": 21.649184149184148, |
| "grad_norm": 0.41021692752838135, |
| "learning_rate": 0.00034041166180758014, |
| "loss": 3.2119, |
| "step": 74300 |
| }, |
| { |
| "epoch": 21.663752913752912, |
| "grad_norm": 0.42527827620506287, |
| "learning_rate": 0.0003402367346938775, |
| "loss": 3.2275, |
| "step": 74350 |
| }, |
| { |
| "epoch": 21.67832167832168, |
| "grad_norm": 0.3895317018032074, |
| "learning_rate": 0.0003400618075801749, |
| "loss": 3.2306, |
| "step": 74400 |
| }, |
| { |
| "epoch": 21.692890442890445, |
| "grad_norm": 0.3877924978733063, |
| "learning_rate": 0.00033988688046647226, |
| "loss": 3.2244, |
| "step": 74450 |
| }, |
| { |
| "epoch": 21.70745920745921, |
| "grad_norm": 0.3887571394443512, |
| "learning_rate": 0.00033971195335276964, |
| "loss": 3.2274, |
| "step": 74500 |
| }, |
| { |
| "epoch": 21.722027972027973, |
| "grad_norm": 0.401986300945282, |
| "learning_rate": 0.00033953702623906707, |
| "loss": 3.2308, |
| "step": 74550 |
| }, |
| { |
| "epoch": 21.736596736596738, |
| "grad_norm": 0.421645849943161, |
| "learning_rate": 0.00033936209912536445, |
| "loss": 3.2348, |
| "step": 74600 |
| }, |
| { |
| "epoch": 21.751165501165502, |
| "grad_norm": 0.40199247002601624, |
| "learning_rate": 0.00033918717201166177, |
| "loss": 3.2305, |
| "step": 74650 |
| }, |
| { |
| "epoch": 21.765734265734267, |
| "grad_norm": 0.456487238407135, |
| "learning_rate": 0.00033901224489795914, |
| "loss": 3.2305, |
| "step": 74700 |
| }, |
| { |
| "epoch": 21.78030303030303, |
| "grad_norm": 0.4060775935649872, |
| "learning_rate": 0.0003388373177842565, |
| "loss": 3.2375, |
| "step": 74750 |
| }, |
| { |
| "epoch": 21.794871794871796, |
| "grad_norm": 0.40119364857673645, |
| "learning_rate": 0.0003386623906705539, |
| "loss": 3.2358, |
| "step": 74800 |
| }, |
| { |
| "epoch": 21.80944055944056, |
| "grad_norm": 0.42308083176612854, |
| "learning_rate": 0.00033848746355685127, |
| "loss": 3.2385, |
| "step": 74850 |
| }, |
| { |
| "epoch": 21.824009324009324, |
| "grad_norm": 0.42308709025382996, |
| "learning_rate": 0.00033831253644314865, |
| "loss": 3.2376, |
| "step": 74900 |
| }, |
| { |
| "epoch": 21.83857808857809, |
| "grad_norm": 0.42288511991500854, |
| "learning_rate": 0.000338137609329446, |
| "loss": 3.2356, |
| "step": 74950 |
| }, |
| { |
| "epoch": 21.853146853146853, |
| "grad_norm": 0.4140841066837311, |
| "learning_rate": 0.00033796268221574345, |
| "loss": 3.2503, |
| "step": 75000 |
| }, |
| { |
| "epoch": 21.853146853146853, |
| "eval_accuracy": 0.3742927791448369, |
| "eval_loss": 3.533069610595703, |
| "eval_runtime": 180.2744, |
| "eval_samples_per_second": 92.315, |
| "eval_steps_per_second": 5.775, |
| "step": 75000 |
| }, |
| { |
| "epoch": 21.867715617715618, |
| "grad_norm": 0.4126766622066498, |
| "learning_rate": 0.00033778775510204083, |
| "loss": 3.2405, |
| "step": 75050 |
| }, |
| { |
| "epoch": 21.882284382284382, |
| "grad_norm": 0.3706146776676178, |
| "learning_rate": 0.00033761282798833815, |
| "loss": 3.2475, |
| "step": 75100 |
| }, |
| { |
| "epoch": 21.896853146853147, |
| "grad_norm": 0.40189969539642334, |
| "learning_rate": 0.0003374379008746355, |
| "loss": 3.2266, |
| "step": 75150 |
| }, |
| { |
| "epoch": 21.91142191142191, |
| "grad_norm": 0.39839887619018555, |
| "learning_rate": 0.0003372629737609329, |
| "loss": 3.24, |
| "step": 75200 |
| }, |
| { |
| "epoch": 21.925990675990676, |
| "grad_norm": 0.4024837911128998, |
| "learning_rate": 0.0003370880466472303, |
| "loss": 3.2304, |
| "step": 75250 |
| }, |
| { |
| "epoch": 21.94055944055944, |
| "grad_norm": 0.4002145230770111, |
| "learning_rate": 0.00033691311953352765, |
| "loss": 3.2409, |
| "step": 75300 |
| }, |
| { |
| "epoch": 21.955128205128204, |
| "grad_norm": 0.4018450975418091, |
| "learning_rate": 0.00033673819241982503, |
| "loss": 3.2406, |
| "step": 75350 |
| }, |
| { |
| "epoch": 21.96969696969697, |
| "grad_norm": 0.39322006702423096, |
| "learning_rate": 0.00033656326530612246, |
| "loss": 3.2375, |
| "step": 75400 |
| }, |
| { |
| "epoch": 21.984265734265733, |
| "grad_norm": 0.40275838971138, |
| "learning_rate": 0.00033638833819241983, |
| "loss": 3.236, |
| "step": 75450 |
| }, |
| { |
| "epoch": 21.998834498834498, |
| "grad_norm": 0.4216180741786957, |
| "learning_rate": 0.0003362134110787172, |
| "loss": 3.2382, |
| "step": 75500 |
| }, |
| { |
| "epoch": 22.013403263403262, |
| "grad_norm": 0.41659364104270935, |
| "learning_rate": 0.00033603848396501453, |
| "loss": 3.1556, |
| "step": 75550 |
| }, |
| { |
| "epoch": 22.027972027972027, |
| "grad_norm": 0.4074558615684509, |
| "learning_rate": 0.0003358635568513119, |
| "loss": 3.1399, |
| "step": 75600 |
| }, |
| { |
| "epoch": 22.04254079254079, |
| "grad_norm": 0.4153924584388733, |
| "learning_rate": 0.0003356886297376093, |
| "loss": 3.1437, |
| "step": 75650 |
| }, |
| { |
| "epoch": 22.057109557109555, |
| "grad_norm": 0.4084555506706238, |
| "learning_rate": 0.00033551370262390666, |
| "loss": 3.1462, |
| "step": 75700 |
| }, |
| { |
| "epoch": 22.071678321678323, |
| "grad_norm": 0.4191485047340393, |
| "learning_rate": 0.00033533877551020403, |
| "loss": 3.1581, |
| "step": 75750 |
| }, |
| { |
| "epoch": 22.086247086247088, |
| "grad_norm": 0.4273689091205597, |
| "learning_rate": 0.0003351638483965014, |
| "loss": 3.1518, |
| "step": 75800 |
| }, |
| { |
| "epoch": 22.100815850815852, |
| "grad_norm": 0.413409948348999, |
| "learning_rate": 0.00033498892128279884, |
| "loss": 3.1435, |
| "step": 75850 |
| }, |
| { |
| "epoch": 22.115384615384617, |
| "grad_norm": 0.39928290247917175, |
| "learning_rate": 0.0003348139941690962, |
| "loss": 3.1549, |
| "step": 75900 |
| }, |
| { |
| "epoch": 22.12995337995338, |
| "grad_norm": 0.41116443276405334, |
| "learning_rate": 0.0003346390670553936, |
| "loss": 3.1695, |
| "step": 75950 |
| }, |
| { |
| "epoch": 22.144522144522146, |
| "grad_norm": 0.4264084994792938, |
| "learning_rate": 0.0003344641399416909, |
| "loss": 3.1733, |
| "step": 76000 |
| }, |
| { |
| "epoch": 22.144522144522146, |
| "eval_accuracy": 0.3729999851835511, |
| "eval_loss": 3.551360607147217, |
| "eval_runtime": 181.2907, |
| "eval_samples_per_second": 91.797, |
| "eval_steps_per_second": 5.742, |
| "step": 76000 |
| }, |
| { |
| "epoch": 22.15909090909091, |
| "grad_norm": 0.4089738428592682, |
| "learning_rate": 0.0003342892128279883, |
| "loss": 3.1636, |
| "step": 76050 |
| }, |
| { |
| "epoch": 22.173659673659674, |
| "grad_norm": 0.4810738265514374, |
| "learning_rate": 0.00033411428571428567, |
| "loss": 3.1689, |
| "step": 76100 |
| }, |
| { |
| "epoch": 22.18822843822844, |
| "grad_norm": 0.43918928503990173, |
| "learning_rate": 0.00033393935860058304, |
| "loss": 3.1725, |
| "step": 76150 |
| }, |
| { |
| "epoch": 22.202797202797203, |
| "grad_norm": 0.4179990291595459, |
| "learning_rate": 0.0003337644314868804, |
| "loss": 3.1798, |
| "step": 76200 |
| }, |
| { |
| "epoch": 22.217365967365968, |
| "grad_norm": 0.4197443127632141, |
| "learning_rate": 0.0003335895043731778, |
| "loss": 3.1672, |
| "step": 76250 |
| }, |
| { |
| "epoch": 22.231934731934732, |
| "grad_norm": 0.3869195282459259, |
| "learning_rate": 0.0003334145772594752, |
| "loss": 3.1915, |
| "step": 76300 |
| }, |
| { |
| "epoch": 22.246503496503497, |
| "grad_norm": 0.3932842016220093, |
| "learning_rate": 0.0003332396501457726, |
| "loss": 3.169, |
| "step": 76350 |
| }, |
| { |
| "epoch": 22.26107226107226, |
| "grad_norm": 0.4352983832359314, |
| "learning_rate": 0.00033306472303207, |
| "loss": 3.1863, |
| "step": 76400 |
| }, |
| { |
| "epoch": 22.275641025641026, |
| "grad_norm": 0.3932501971721649, |
| "learning_rate": 0.0003328897959183673, |
| "loss": 3.1677, |
| "step": 76450 |
| }, |
| { |
| "epoch": 22.29020979020979, |
| "grad_norm": 0.4138025939464569, |
| "learning_rate": 0.00033271486880466467, |
| "loss": 3.1914, |
| "step": 76500 |
| }, |
| { |
| "epoch": 22.304778554778554, |
| "grad_norm": 0.4232082664966583, |
| "learning_rate": 0.00033253994169096205, |
| "loss": 3.1857, |
| "step": 76550 |
| }, |
| { |
| "epoch": 22.31934731934732, |
| "grad_norm": 0.39019161462783813, |
| "learning_rate": 0.0003323650145772594, |
| "loss": 3.1815, |
| "step": 76600 |
| }, |
| { |
| "epoch": 22.333916083916083, |
| "grad_norm": 0.3833393454551697, |
| "learning_rate": 0.0003321900874635568, |
| "loss": 3.1873, |
| "step": 76650 |
| }, |
| { |
| "epoch": 22.348484848484848, |
| "grad_norm": 0.4164026975631714, |
| "learning_rate": 0.00033201516034985423, |
| "loss": 3.1863, |
| "step": 76700 |
| }, |
| { |
| "epoch": 22.363053613053612, |
| "grad_norm": 0.41754239797592163, |
| "learning_rate": 0.0003318402332361516, |
| "loss": 3.1991, |
| "step": 76750 |
| }, |
| { |
| "epoch": 22.377622377622377, |
| "grad_norm": 0.41156837344169617, |
| "learning_rate": 0.000331665306122449, |
| "loss": 3.207, |
| "step": 76800 |
| }, |
| { |
| "epoch": 22.39219114219114, |
| "grad_norm": 0.4196581244468689, |
| "learning_rate": 0.00033149037900874636, |
| "loss": 3.1954, |
| "step": 76850 |
| }, |
| { |
| "epoch": 22.406759906759905, |
| "grad_norm": 0.41179725527763367, |
| "learning_rate": 0.0003313154518950437, |
| "loss": 3.2009, |
| "step": 76900 |
| }, |
| { |
| "epoch": 22.42132867132867, |
| "grad_norm": 0.41211971640586853, |
| "learning_rate": 0.00033114052478134105, |
| "loss": 3.1883, |
| "step": 76950 |
| }, |
| { |
| "epoch": 22.435897435897434, |
| "grad_norm": 0.4655284583568573, |
| "learning_rate": 0.00033096559766763843, |
| "loss": 3.1996, |
| "step": 77000 |
| }, |
| { |
| "epoch": 22.435897435897434, |
| "eval_accuracy": 0.3736674309288902, |
| "eval_loss": 3.5417394638061523, |
| "eval_runtime": 179.8191, |
| "eval_samples_per_second": 92.549, |
| "eval_steps_per_second": 5.789, |
| "step": 77000 |
| }, |
| { |
| "epoch": 22.4504662004662, |
| "grad_norm": 0.38444826006889343, |
| "learning_rate": 0.0003307906705539358, |
| "loss": 3.2122, |
| "step": 77050 |
| }, |
| { |
| "epoch": 22.465034965034967, |
| "grad_norm": 0.4191334545612335, |
| "learning_rate": 0.0003306157434402332, |
| "loss": 3.1952, |
| "step": 77100 |
| }, |
| { |
| "epoch": 22.47960372960373, |
| "grad_norm": 0.4504798352718353, |
| "learning_rate": 0.0003304408163265306, |
| "loss": 3.2073, |
| "step": 77150 |
| }, |
| { |
| "epoch": 22.494172494172496, |
| "grad_norm": 0.4140982925891876, |
| "learning_rate": 0.000330265889212828, |
| "loss": 3.1971, |
| "step": 77200 |
| }, |
| { |
| "epoch": 22.50874125874126, |
| "grad_norm": 0.4141624867916107, |
| "learning_rate": 0.00033009096209912536, |
| "loss": 3.2065, |
| "step": 77250 |
| }, |
| { |
| "epoch": 22.523310023310025, |
| "grad_norm": 0.39601314067840576, |
| "learning_rate": 0.00032991603498542274, |
| "loss": 3.206, |
| "step": 77300 |
| }, |
| { |
| "epoch": 22.53787878787879, |
| "grad_norm": 0.3927428126335144, |
| "learning_rate": 0.00032974110787172006, |
| "loss": 3.1968, |
| "step": 77350 |
| }, |
| { |
| "epoch": 22.552447552447553, |
| "grad_norm": 0.42265579104423523, |
| "learning_rate": 0.00032956618075801744, |
| "loss": 3.2056, |
| "step": 77400 |
| }, |
| { |
| "epoch": 22.567016317016318, |
| "grad_norm": 0.39944592118263245, |
| "learning_rate": 0.0003293912536443148, |
| "loss": 3.213, |
| "step": 77450 |
| }, |
| { |
| "epoch": 22.581585081585082, |
| "grad_norm": 0.4446455240249634, |
| "learning_rate": 0.0003292163265306122, |
| "loss": 3.2058, |
| "step": 77500 |
| }, |
| { |
| "epoch": 22.596153846153847, |
| "grad_norm": 0.41780418157577515, |
| "learning_rate": 0.00032904139941690956, |
| "loss": 3.2082, |
| "step": 77550 |
| }, |
| { |
| "epoch": 22.61072261072261, |
| "grad_norm": 0.3988536596298218, |
| "learning_rate": 0.000328866472303207, |
| "loss": 3.2213, |
| "step": 77600 |
| }, |
| { |
| "epoch": 22.625291375291376, |
| "grad_norm": 0.4125400185585022, |
| "learning_rate": 0.00032869154518950437, |
| "loss": 3.2192, |
| "step": 77650 |
| }, |
| { |
| "epoch": 22.63986013986014, |
| "grad_norm": 0.43208998441696167, |
| "learning_rate": 0.00032851661807580174, |
| "loss": 3.2108, |
| "step": 77700 |
| }, |
| { |
| "epoch": 22.654428904428904, |
| "grad_norm": 0.4006592631340027, |
| "learning_rate": 0.0003283416909620991, |
| "loss": 3.2152, |
| "step": 77750 |
| }, |
| { |
| "epoch": 22.66899766899767, |
| "grad_norm": 0.40716752409935, |
| "learning_rate": 0.00032816676384839644, |
| "loss": 3.2174, |
| "step": 77800 |
| }, |
| { |
| "epoch": 22.683566433566433, |
| "grad_norm": 0.4060366451740265, |
| "learning_rate": 0.0003279918367346938, |
| "loss": 3.2187, |
| "step": 77850 |
| }, |
| { |
| "epoch": 22.698135198135198, |
| "grad_norm": 0.38326430320739746, |
| "learning_rate": 0.0003278169096209912, |
| "loss": 3.2087, |
| "step": 77900 |
| }, |
| { |
| "epoch": 22.712703962703962, |
| "grad_norm": 0.3733854591846466, |
| "learning_rate": 0.00032764198250728857, |
| "loss": 3.2059, |
| "step": 77950 |
| }, |
| { |
| "epoch": 22.727272727272727, |
| "grad_norm": 0.4183722138404846, |
| "learning_rate": 0.000327467055393586, |
| "loss": 3.2241, |
| "step": 78000 |
| }, |
| { |
| "epoch": 22.727272727272727, |
| "eval_accuracy": 0.3741929445011102, |
| "eval_loss": 3.53352952003479, |
| "eval_runtime": 179.8425, |
| "eval_samples_per_second": 92.537, |
| "eval_steps_per_second": 5.788, |
| "step": 78000 |
| }, |
| { |
| "epoch": 22.74184149184149, |
| "grad_norm": 0.41641032695770264, |
| "learning_rate": 0.0003272921282798834, |
| "loss": 3.2211, |
| "step": 78050 |
| }, |
| { |
| "epoch": 22.756410256410255, |
| "grad_norm": 0.40354758501052856, |
| "learning_rate": 0.00032711720116618075, |
| "loss": 3.2277, |
| "step": 78100 |
| }, |
| { |
| "epoch": 22.77097902097902, |
| "grad_norm": 0.41052916646003723, |
| "learning_rate": 0.0003269422740524781, |
| "loss": 3.2228, |
| "step": 78150 |
| }, |
| { |
| "epoch": 22.785547785547784, |
| "grad_norm": 0.38687601685523987, |
| "learning_rate": 0.0003267673469387755, |
| "loss": 3.2047, |
| "step": 78200 |
| }, |
| { |
| "epoch": 22.80011655011655, |
| "grad_norm": 0.42369911074638367, |
| "learning_rate": 0.0003265924198250728, |
| "loss": 3.2269, |
| "step": 78250 |
| }, |
| { |
| "epoch": 22.814685314685313, |
| "grad_norm": 0.43428516387939453, |
| "learning_rate": 0.0003264174927113702, |
| "loss": 3.2192, |
| "step": 78300 |
| }, |
| { |
| "epoch": 22.829254079254078, |
| "grad_norm": 0.3913278877735138, |
| "learning_rate": 0.0003262425655976676, |
| "loss": 3.2241, |
| "step": 78350 |
| }, |
| { |
| "epoch": 22.843822843822842, |
| "grad_norm": 0.39617830514907837, |
| "learning_rate": 0.00032606763848396495, |
| "loss": 3.2347, |
| "step": 78400 |
| }, |
| { |
| "epoch": 22.85839160839161, |
| "grad_norm": 0.4081778824329376, |
| "learning_rate": 0.0003258927113702624, |
| "loss": 3.2338, |
| "step": 78450 |
| }, |
| { |
| "epoch": 22.872960372960375, |
| "grad_norm": 0.41960251331329346, |
| "learning_rate": 0.00032571778425655976, |
| "loss": 3.2207, |
| "step": 78500 |
| }, |
| { |
| "epoch": 22.88752913752914, |
| "grad_norm": 0.39189615845680237, |
| "learning_rate": 0.00032554285714285713, |
| "loss": 3.2217, |
| "step": 78550 |
| }, |
| { |
| "epoch": 22.902097902097903, |
| "grad_norm": 0.41321077942848206, |
| "learning_rate": 0.0003253679300291545, |
| "loss": 3.2203, |
| "step": 78600 |
| }, |
| { |
| "epoch": 22.916666666666668, |
| "grad_norm": 0.38586828112602234, |
| "learning_rate": 0.0003251930029154519, |
| "loss": 3.2301, |
| "step": 78650 |
| }, |
| { |
| "epoch": 22.931235431235432, |
| "grad_norm": 0.40068861842155457, |
| "learning_rate": 0.0003250180758017492, |
| "loss": 3.2131, |
| "step": 78700 |
| }, |
| { |
| "epoch": 22.945804195804197, |
| "grad_norm": 0.40055349469184875, |
| "learning_rate": 0.0003248431486880466, |
| "loss": 3.2253, |
| "step": 78750 |
| }, |
| { |
| "epoch": 22.96037296037296, |
| "grad_norm": 0.416927695274353, |
| "learning_rate": 0.00032466822157434396, |
| "loss": 3.2276, |
| "step": 78800 |
| }, |
| { |
| "epoch": 22.974941724941726, |
| "grad_norm": 0.39231857657432556, |
| "learning_rate": 0.0003244932944606414, |
| "loss": 3.2387, |
| "step": 78850 |
| }, |
| { |
| "epoch": 22.98951048951049, |
| "grad_norm": 0.39868852496147156, |
| "learning_rate": 0.00032431836734693876, |
| "loss": 3.2249, |
| "step": 78900 |
| }, |
| { |
| "epoch": 23.004079254079254, |
| "grad_norm": 0.4433835744857788, |
| "learning_rate": 0.00032414344023323614, |
| "loss": 3.2051, |
| "step": 78950 |
| }, |
| { |
| "epoch": 23.01864801864802, |
| "grad_norm": 0.43138596415519714, |
| "learning_rate": 0.0003239685131195335, |
| "loss": 3.1207, |
| "step": 79000 |
| }, |
| { |
| "epoch": 23.01864801864802, |
| "eval_accuracy": 0.37376632484570316, |
| "eval_loss": 3.548415184020996, |
| "eval_runtime": 216.5967, |
| "eval_samples_per_second": 76.834, |
| "eval_steps_per_second": 4.806, |
| "step": 79000 |
| }, |
| { |
| "epoch": 23.033216783216783, |
| "grad_norm": 0.4261009097099304, |
| "learning_rate": 0.0003237935860058309, |
| "loss": 3.1319, |
| "step": 79050 |
| }, |
| { |
| "epoch": 23.047785547785548, |
| "grad_norm": 0.3840440809726715, |
| "learning_rate": 0.00032361865889212827, |
| "loss": 3.1392, |
| "step": 79100 |
| }, |
| { |
| "epoch": 23.062354312354312, |
| "grad_norm": 0.40722066164016724, |
| "learning_rate": 0.0003234437317784256, |
| "loss": 3.142, |
| "step": 79150 |
| }, |
| { |
| "epoch": 23.076923076923077, |
| "grad_norm": 0.4106970727443695, |
| "learning_rate": 0.00032326880466472296, |
| "loss": 3.1438, |
| "step": 79200 |
| }, |
| { |
| "epoch": 23.09149184149184, |
| "grad_norm": 0.42081838846206665, |
| "learning_rate": 0.00032309387755102034, |
| "loss": 3.146, |
| "step": 79250 |
| }, |
| { |
| "epoch": 23.106060606060606, |
| "grad_norm": 0.41058728098869324, |
| "learning_rate": 0.00032291895043731777, |
| "loss": 3.1509, |
| "step": 79300 |
| }, |
| { |
| "epoch": 23.12062937062937, |
| "grad_norm": 0.45436742901802063, |
| "learning_rate": 0.00032274402332361515, |
| "loss": 3.1516, |
| "step": 79350 |
| }, |
| { |
| "epoch": 23.135198135198134, |
| "grad_norm": 0.4210570752620697, |
| "learning_rate": 0.0003225690962099125, |
| "loss": 3.1577, |
| "step": 79400 |
| }, |
| { |
| "epoch": 23.1497668997669, |
| "grad_norm": 0.3977571129798889, |
| "learning_rate": 0.0003223941690962099, |
| "loss": 3.1589, |
| "step": 79450 |
| }, |
| { |
| "epoch": 23.164335664335663, |
| "grad_norm": 0.4383814036846161, |
| "learning_rate": 0.0003222192419825073, |
| "loss": 3.1622, |
| "step": 79500 |
| }, |
| { |
| "epoch": 23.178904428904428, |
| "grad_norm": 0.40101122856140137, |
| "learning_rate": 0.00032204431486880465, |
| "loss": 3.1485, |
| "step": 79550 |
| }, |
| { |
| "epoch": 23.193473193473192, |
| "grad_norm": 0.4000990390777588, |
| "learning_rate": 0.00032186938775510197, |
| "loss": 3.1623, |
| "step": 79600 |
| }, |
| { |
| "epoch": 23.208041958041957, |
| "grad_norm": 0.3868107199668884, |
| "learning_rate": 0.00032169446064139935, |
| "loss": 3.1657, |
| "step": 79650 |
| }, |
| { |
| "epoch": 23.22261072261072, |
| "grad_norm": 0.43203461170196533, |
| "learning_rate": 0.0003215195335276967, |
| "loss": 3.1601, |
| "step": 79700 |
| }, |
| { |
| "epoch": 23.237179487179485, |
| "grad_norm": 0.4248724579811096, |
| "learning_rate": 0.00032134460641399415, |
| "loss": 3.1676, |
| "step": 79750 |
| }, |
| { |
| "epoch": 23.251748251748253, |
| "grad_norm": 0.4065852761268616, |
| "learning_rate": 0.00032116967930029153, |
| "loss": 3.1693, |
| "step": 79800 |
| }, |
| { |
| "epoch": 23.266317016317018, |
| "grad_norm": 0.43213963508605957, |
| "learning_rate": 0.0003209947521865889, |
| "loss": 3.1721, |
| "step": 79850 |
| }, |
| { |
| "epoch": 23.280885780885782, |
| "grad_norm": 0.4326905310153961, |
| "learning_rate": 0.0003208198250728863, |
| "loss": 3.1854, |
| "step": 79900 |
| }, |
| { |
| "epoch": 23.295454545454547, |
| "grad_norm": 0.4199202060699463, |
| "learning_rate": 0.00032064489795918366, |
| "loss": 3.1652, |
| "step": 79950 |
| }, |
| { |
| "epoch": 23.31002331002331, |
| "grad_norm": 0.40087878704071045, |
| "learning_rate": 0.00032046997084548103, |
| "loss": 3.1867, |
| "step": 80000 |
| }, |
| { |
| "epoch": 23.31002331002331, |
| "eval_accuracy": 0.3736945944185261, |
| "eval_loss": 3.550523519515991, |
| "eval_runtime": 242.1111, |
| "eval_samples_per_second": 68.737, |
| "eval_steps_per_second": 4.3, |
| "step": 80000 |
| }, |
| { |
| "epoch": 23.324592074592076, |
| "grad_norm": 0.39655831456184387, |
| "learning_rate": 0.00032029504373177835, |
| "loss": 3.1418, |
| "step": 80050 |
| }, |
| { |
| "epoch": 23.33916083916084, |
| "grad_norm": 0.4348229467868805, |
| "learning_rate": 0.00032012011661807573, |
| "loss": 3.1364, |
| "step": 80100 |
| }, |
| { |
| "epoch": 23.353729603729604, |
| "grad_norm": 0.42798683047294617, |
| "learning_rate": 0.00031994518950437316, |
| "loss": 3.1377, |
| "step": 80150 |
| }, |
| { |
| "epoch": 23.36829836829837, |
| "grad_norm": 0.4182716906070709, |
| "learning_rate": 0.00031977026239067053, |
| "loss": 3.1419, |
| "step": 80200 |
| }, |
| { |
| "epoch": 23.382867132867133, |
| "grad_norm": 0.4539523720741272, |
| "learning_rate": 0.0003195953352769679, |
| "loss": 3.157, |
| "step": 80250 |
| }, |
| { |
| "epoch": 23.397435897435898, |
| "grad_norm": 0.42409515380859375, |
| "learning_rate": 0.0003194204081632653, |
| "loss": 3.1638, |
| "step": 80300 |
| }, |
| { |
| "epoch": 23.412004662004662, |
| "grad_norm": 0.4075915813446045, |
| "learning_rate": 0.00031924548104956266, |
| "loss": 3.1426, |
| "step": 80350 |
| }, |
| { |
| "epoch": 23.426573426573427, |
| "grad_norm": 0.4082591235637665, |
| "learning_rate": 0.00031907055393586004, |
| "loss": 3.1518, |
| "step": 80400 |
| }, |
| { |
| "epoch": 23.44114219114219, |
| "grad_norm": 0.40404626727104187, |
| "learning_rate": 0.0003188956268221574, |
| "loss": 3.1526, |
| "step": 80450 |
| }, |
| { |
| "epoch": 23.455710955710956, |
| "grad_norm": 0.4418063461780548, |
| "learning_rate": 0.00031872069970845474, |
| "loss": 3.1499, |
| "step": 80500 |
| }, |
| { |
| "epoch": 23.47027972027972, |
| "grad_norm": 0.39059650897979736, |
| "learning_rate": 0.0003185457725947521, |
| "loss": 3.1609, |
| "step": 80550 |
| }, |
| { |
| "epoch": 23.484848484848484, |
| "grad_norm": 0.4021977186203003, |
| "learning_rate": 0.00031837084548104954, |
| "loss": 3.165, |
| "step": 80600 |
| }, |
| { |
| "epoch": 23.49941724941725, |
| "grad_norm": 0.4297217130661011, |
| "learning_rate": 0.0003181959183673469, |
| "loss": 3.17, |
| "step": 80650 |
| }, |
| { |
| "epoch": 23.513986013986013, |
| "grad_norm": 0.41383132338523865, |
| "learning_rate": 0.0003180209912536443, |
| "loss": 3.1562, |
| "step": 80700 |
| }, |
| { |
| "epoch": 23.528554778554778, |
| "grad_norm": 0.4195021986961365, |
| "learning_rate": 0.00031784606413994167, |
| "loss": 3.1654, |
| "step": 80750 |
| }, |
| { |
| "epoch": 23.543123543123542, |
| "grad_norm": 0.40288063883781433, |
| "learning_rate": 0.00031767113702623904, |
| "loss": 3.1678, |
| "step": 80800 |
| }, |
| { |
| "epoch": 23.557692307692307, |
| "grad_norm": 0.43353042006492615, |
| "learning_rate": 0.0003174962099125364, |
| "loss": 3.1641, |
| "step": 80850 |
| }, |
| { |
| "epoch": 23.57226107226107, |
| "grad_norm": 0.44536879658699036, |
| "learning_rate": 0.0003173212827988338, |
| "loss": 3.1672, |
| "step": 80900 |
| }, |
| { |
| "epoch": 23.586829836829835, |
| "grad_norm": 0.4158283472061157, |
| "learning_rate": 0.0003171463556851311, |
| "loss": 3.1782, |
| "step": 80950 |
| }, |
| { |
| "epoch": 23.6013986013986, |
| "grad_norm": 0.40795543789863586, |
| "learning_rate": 0.00031697142857142855, |
| "loss": 3.1736, |
| "step": 81000 |
| }, |
| { |
| "epoch": 23.6013986013986, |
| "eval_accuracy": 0.37375009730644015, |
| "eval_loss": 3.5509092807769775, |
| "eval_runtime": 181.2172, |
| "eval_samples_per_second": 91.835, |
| "eval_steps_per_second": 5.744, |
| "step": 81000 |
| }, |
| { |
| "epoch": 23.615967365967364, |
| "grad_norm": 0.44809621572494507, |
| "learning_rate": 0.0003167965014577259, |
| "loss": 3.1694, |
| "step": 81050 |
| }, |
| { |
| "epoch": 23.63053613053613, |
| "grad_norm": 0.43004634976387024, |
| "learning_rate": 0.0003166215743440233, |
| "loss": 3.1852, |
| "step": 81100 |
| }, |
| { |
| "epoch": 23.645104895104897, |
| "grad_norm": 0.4156273901462555, |
| "learning_rate": 0.0003164466472303207, |
| "loss": 3.1741, |
| "step": 81150 |
| }, |
| { |
| "epoch": 23.65967365967366, |
| "grad_norm": 0.40418311953544617, |
| "learning_rate": 0.00031627172011661805, |
| "loss": 3.1762, |
| "step": 81200 |
| }, |
| { |
| "epoch": 23.674242424242426, |
| "grad_norm": 0.42274951934814453, |
| "learning_rate": 0.0003160967930029154, |
| "loss": 3.1866, |
| "step": 81250 |
| }, |
| { |
| "epoch": 23.68881118881119, |
| "grad_norm": 0.43228334188461304, |
| "learning_rate": 0.0003159218658892128, |
| "loss": 3.1837, |
| "step": 81300 |
| }, |
| { |
| "epoch": 23.703379953379955, |
| "grad_norm": 0.40298640727996826, |
| "learning_rate": 0.00031574693877551023, |
| "loss": 3.1876, |
| "step": 81350 |
| }, |
| { |
| "epoch": 23.71794871794872, |
| "grad_norm": 0.3895956873893738, |
| "learning_rate": 0.0003155720116618075, |
| "loss": 3.1807, |
| "step": 81400 |
| }, |
| { |
| "epoch": 23.732517482517483, |
| "grad_norm": 0.40143319964408875, |
| "learning_rate": 0.00031539708454810493, |
| "loss": 3.1806, |
| "step": 81450 |
| }, |
| { |
| "epoch": 23.747086247086248, |
| "grad_norm": 0.4256174564361572, |
| "learning_rate": 0.0003152221574344023, |
| "loss": 3.1951, |
| "step": 81500 |
| }, |
| { |
| "epoch": 23.761655011655012, |
| "grad_norm": 0.43005815148353577, |
| "learning_rate": 0.0003150472303206997, |
| "loss": 3.1962, |
| "step": 81550 |
| }, |
| { |
| "epoch": 23.776223776223777, |
| "grad_norm": 0.42560434341430664, |
| "learning_rate": 0.00031487230320699706, |
| "loss": 3.1905, |
| "step": 81600 |
| }, |
| { |
| "epoch": 23.79079254079254, |
| "grad_norm": 0.4113454520702362, |
| "learning_rate": 0.00031469737609329443, |
| "loss": 3.2006, |
| "step": 81650 |
| }, |
| { |
| "epoch": 23.805361305361306, |
| "grad_norm": 0.41583073139190674, |
| "learning_rate": 0.0003145224489795918, |
| "loss": 3.2035, |
| "step": 81700 |
| }, |
| { |
| "epoch": 23.81993006993007, |
| "grad_norm": 0.46412721276283264, |
| "learning_rate": 0.0003143475218658892, |
| "loss": 3.1954, |
| "step": 81750 |
| }, |
| { |
| "epoch": 23.834498834498834, |
| "grad_norm": 0.4346821904182434, |
| "learning_rate": 0.0003141725947521866, |
| "loss": 3.1997, |
| "step": 81800 |
| }, |
| { |
| "epoch": 23.8490675990676, |
| "grad_norm": 0.4530637562274933, |
| "learning_rate": 0.0003139976676384839, |
| "loss": 3.1965, |
| "step": 81850 |
| }, |
| { |
| "epoch": 23.863636363636363, |
| "grad_norm": 0.39855870604515076, |
| "learning_rate": 0.0003138227405247813, |
| "loss": 3.1931, |
| "step": 81900 |
| }, |
| { |
| "epoch": 23.878205128205128, |
| "grad_norm": 0.41262272000312805, |
| "learning_rate": 0.0003136478134110787, |
| "loss": 3.2012, |
| "step": 81950 |
| }, |
| { |
| "epoch": 23.892773892773892, |
| "grad_norm": 0.39622053503990173, |
| "learning_rate": 0.00031347288629737606, |
| "loss": 3.1804, |
| "step": 82000 |
| }, |
| { |
| "epoch": 23.892773892773892, |
| "eval_accuracy": 0.37398633735266745, |
| "eval_loss": 3.5458834171295166, |
| "eval_runtime": 179.9531, |
| "eval_samples_per_second": 92.48, |
| "eval_steps_per_second": 5.785, |
| "step": 82000 |
| }, |
| { |
| "epoch": 23.907342657342657, |
| "grad_norm": 0.520991861820221, |
| "learning_rate": 0.00031329795918367344, |
| "loss": 3.1926, |
| "step": 82050 |
| }, |
| { |
| "epoch": 23.92191142191142, |
| "grad_norm": 0.4120796322822571, |
| "learning_rate": 0.0003131230320699708, |
| "loss": 3.1929, |
| "step": 82100 |
| }, |
| { |
| "epoch": 23.936480186480185, |
| "grad_norm": 0.3996407389640808, |
| "learning_rate": 0.0003129481049562682, |
| "loss": 3.1937, |
| "step": 82150 |
| }, |
| { |
| "epoch": 23.95104895104895, |
| "grad_norm": 0.4584140181541443, |
| "learning_rate": 0.00031277317784256557, |
| "loss": 3.1978, |
| "step": 82200 |
| }, |
| { |
| "epoch": 23.965617715617714, |
| "grad_norm": 0.41645756363868713, |
| "learning_rate": 0.000312598250728863, |
| "loss": 3.1941, |
| "step": 82250 |
| }, |
| { |
| "epoch": 23.98018648018648, |
| "grad_norm": 0.3965514898300171, |
| "learning_rate": 0.0003124233236151603, |
| "loss": 3.2015, |
| "step": 82300 |
| }, |
| { |
| "epoch": 23.994755244755243, |
| "grad_norm": 0.4470701217651367, |
| "learning_rate": 0.0003122483965014577, |
| "loss": 3.2066, |
| "step": 82350 |
| }, |
| { |
| "epoch": 24.009324009324008, |
| "grad_norm": 0.4205404818058014, |
| "learning_rate": 0.00031207346938775507, |
| "loss": 3.1394, |
| "step": 82400 |
| }, |
| { |
| "epoch": 24.023892773892776, |
| "grad_norm": 0.3919592499732971, |
| "learning_rate": 0.00031189854227405245, |
| "loss": 3.1313, |
| "step": 82450 |
| }, |
| { |
| "epoch": 24.03846153846154, |
| "grad_norm": 0.420136034488678, |
| "learning_rate": 0.0003117236151603498, |
| "loss": 3.1255, |
| "step": 82500 |
| }, |
| { |
| "epoch": 24.053030303030305, |
| "grad_norm": 0.4368348717689514, |
| "learning_rate": 0.0003115486880466472, |
| "loss": 3.1299, |
| "step": 82550 |
| }, |
| { |
| "epoch": 24.06759906759907, |
| "grad_norm": 0.43539777398109436, |
| "learning_rate": 0.00031137376093294457, |
| "loss": 3.1331, |
| "step": 82600 |
| }, |
| { |
| "epoch": 24.082167832167833, |
| "grad_norm": 0.3938283920288086, |
| "learning_rate": 0.000311198833819242, |
| "loss": 3.1423, |
| "step": 82650 |
| }, |
| { |
| "epoch": 24.096736596736598, |
| "grad_norm": 0.43436935544013977, |
| "learning_rate": 0.0003110239067055394, |
| "loss": 3.1361, |
| "step": 82700 |
| }, |
| { |
| "epoch": 24.111305361305362, |
| "grad_norm": 0.40530428290367126, |
| "learning_rate": 0.0003108489795918367, |
| "loss": 3.1474, |
| "step": 82750 |
| }, |
| { |
| "epoch": 24.125874125874127, |
| "grad_norm": 0.42183616757392883, |
| "learning_rate": 0.0003106740524781341, |
| "loss": 3.1602, |
| "step": 82800 |
| }, |
| { |
| "epoch": 24.14044289044289, |
| "grad_norm": 0.4034741520881653, |
| "learning_rate": 0.00031049912536443145, |
| "loss": 3.1481, |
| "step": 82850 |
| }, |
| { |
| "epoch": 24.155011655011656, |
| "grad_norm": 0.4140460789203644, |
| "learning_rate": 0.00031032419825072883, |
| "loss": 3.1438, |
| "step": 82900 |
| }, |
| { |
| "epoch": 24.16958041958042, |
| "grad_norm": 0.4160782992839813, |
| "learning_rate": 0.0003101492711370262, |
| "loss": 3.1535, |
| "step": 82950 |
| }, |
| { |
| "epoch": 24.184149184149184, |
| "grad_norm": 0.4291095435619354, |
| "learning_rate": 0.0003099743440233236, |
| "loss": 3.166, |
| "step": 83000 |
| }, |
| { |
| "epoch": 24.184149184149184, |
| "eval_accuracy": 0.37344577214982677, |
| "eval_loss": 3.5539300441741943, |
| "eval_runtime": 187.6495, |
| "eval_samples_per_second": 88.687, |
| "eval_steps_per_second": 5.548, |
| "step": 83000 |
| }, |
| { |
| "epoch": 24.19871794871795, |
| "grad_norm": 0.4372076988220215, |
| "learning_rate": 0.00030979941690962095, |
| "loss": 3.1622, |
| "step": 83050 |
| }, |
| { |
| "epoch": 24.213286713286713, |
| "grad_norm": 0.43101027607917786, |
| "learning_rate": 0.0003096244897959184, |
| "loss": 3.1568, |
| "step": 83100 |
| }, |
| { |
| "epoch": 24.227855477855478, |
| "grad_norm": 0.43224048614501953, |
| "learning_rate": 0.00030944956268221576, |
| "loss": 3.1681, |
| "step": 83150 |
| }, |
| { |
| "epoch": 24.242424242424242, |
| "grad_norm": 0.3828674852848053, |
| "learning_rate": 0.0003092746355685131, |
| "loss": 3.1676, |
| "step": 83200 |
| }, |
| { |
| "epoch": 24.256993006993007, |
| "grad_norm": 0.41439077258110046, |
| "learning_rate": 0.00030909970845481046, |
| "loss": 3.1635, |
| "step": 83250 |
| }, |
| { |
| "epoch": 24.27156177156177, |
| "grad_norm": 0.4204312562942505, |
| "learning_rate": 0.00030892478134110783, |
| "loss": 3.1724, |
| "step": 83300 |
| }, |
| { |
| "epoch": 24.286130536130536, |
| "grad_norm": 0.42064785957336426, |
| "learning_rate": 0.0003087498542274052, |
| "loss": 3.1548, |
| "step": 83350 |
| }, |
| { |
| "epoch": 24.3006993006993, |
| "grad_norm": 0.42346125841140747, |
| "learning_rate": 0.0003085749271137026, |
| "loss": 3.1665, |
| "step": 83400 |
| }, |
| { |
| "epoch": 24.315268065268064, |
| "grad_norm": 0.41940489411354065, |
| "learning_rate": 0.00030839999999999996, |
| "loss": 3.1784, |
| "step": 83450 |
| }, |
| { |
| "epoch": 24.32983682983683, |
| "grad_norm": 0.40471509099006653, |
| "learning_rate": 0.0003082250728862974, |
| "loss": 3.1763, |
| "step": 83500 |
| }, |
| { |
| "epoch": 24.344405594405593, |
| "grad_norm": 0.401151567697525, |
| "learning_rate": 0.00030805014577259477, |
| "loss": 3.1704, |
| "step": 83550 |
| }, |
| { |
| "epoch": 24.358974358974358, |
| "grad_norm": 0.4783649742603302, |
| "learning_rate": 0.00030787521865889214, |
| "loss": 3.1734, |
| "step": 83600 |
| }, |
| { |
| "epoch": 24.373543123543122, |
| "grad_norm": 0.4198293089866638, |
| "learning_rate": 0.00030770029154518946, |
| "loss": 3.182, |
| "step": 83650 |
| }, |
| { |
| "epoch": 24.388111888111887, |
| "grad_norm": 0.42297860980033875, |
| "learning_rate": 0.00030752536443148684, |
| "loss": 3.1696, |
| "step": 83700 |
| }, |
| { |
| "epoch": 24.40268065268065, |
| "grad_norm": 0.45142224431037903, |
| "learning_rate": 0.0003073504373177842, |
| "loss": 3.1804, |
| "step": 83750 |
| }, |
| { |
| "epoch": 24.41724941724942, |
| "grad_norm": 0.40544140338897705, |
| "learning_rate": 0.0003071755102040816, |
| "loss": 3.1888, |
| "step": 83800 |
| }, |
| { |
| "epoch": 24.431818181818183, |
| "grad_norm": 0.42886918783187866, |
| "learning_rate": 0.00030700058309037897, |
| "loss": 3.1823, |
| "step": 83850 |
| }, |
| { |
| "epoch": 24.446386946386948, |
| "grad_norm": 0.41986921429634094, |
| "learning_rate": 0.00030682565597667634, |
| "loss": 3.1728, |
| "step": 83900 |
| }, |
| { |
| "epoch": 24.460955710955712, |
| "grad_norm": 0.4673488140106201, |
| "learning_rate": 0.0003066507288629738, |
| "loss": 3.1857, |
| "step": 83950 |
| }, |
| { |
| "epoch": 24.475524475524477, |
| "grad_norm": 0.4163152873516083, |
| "learning_rate": 0.00030647580174927115, |
| "loss": 3.193, |
| "step": 84000 |
| }, |
| { |
| "epoch": 24.475524475524477, |
| "eval_accuracy": 0.37401114902501886, |
| "eval_loss": 3.5467607975006104, |
| "eval_runtime": 180.4483, |
| "eval_samples_per_second": 92.226, |
| "eval_steps_per_second": 5.769, |
| "step": 84000 |
| }, |
| { |
| "epoch": 24.49009324009324, |
| "grad_norm": 0.4326895475387573, |
| "learning_rate": 0.0003063008746355685, |
| "loss": 3.1831, |
| "step": 84050 |
| }, |
| { |
| "epoch": 24.504662004662006, |
| "grad_norm": 0.4216519892215729, |
| "learning_rate": 0.00030612594752186585, |
| "loss": 3.1873, |
| "step": 84100 |
| }, |
| { |
| "epoch": 24.51923076923077, |
| "grad_norm": 0.4140453636646271, |
| "learning_rate": 0.0003059510204081632, |
| "loss": 3.1893, |
| "step": 84150 |
| }, |
| { |
| "epoch": 24.533799533799534, |
| "grad_norm": 0.41393721103668213, |
| "learning_rate": 0.0003057760932944606, |
| "loss": 3.1888, |
| "step": 84200 |
| }, |
| { |
| "epoch": 24.5483682983683, |
| "grad_norm": 0.40341731905937195, |
| "learning_rate": 0.000305601166180758, |
| "loss": 3.1879, |
| "step": 84250 |
| }, |
| { |
| "epoch": 24.562937062937063, |
| "grad_norm": 0.4132257103919983, |
| "learning_rate": 0.00030542623906705535, |
| "loss": 3.1901, |
| "step": 84300 |
| }, |
| { |
| "epoch": 24.577505827505828, |
| "grad_norm": 0.42835623025894165, |
| "learning_rate": 0.0003052513119533527, |
| "loss": 3.1866, |
| "step": 84350 |
| }, |
| { |
| "epoch": 24.592074592074592, |
| "grad_norm": 0.43141621351242065, |
| "learning_rate": 0.00030507638483965016, |
| "loss": 3.2013, |
| "step": 84400 |
| }, |
| { |
| "epoch": 24.606643356643357, |
| "grad_norm": 0.41042980551719666, |
| "learning_rate": 0.00030490145772594753, |
| "loss": 3.1867, |
| "step": 84450 |
| }, |
| { |
| "epoch": 24.62121212121212, |
| "grad_norm": 0.4320753216743469, |
| "learning_rate": 0.0003047265306122449, |
| "loss": 3.1914, |
| "step": 84500 |
| }, |
| { |
| "epoch": 24.635780885780886, |
| "grad_norm": 0.43569475412368774, |
| "learning_rate": 0.00030455160349854223, |
| "loss": 3.2025, |
| "step": 84550 |
| }, |
| { |
| "epoch": 24.65034965034965, |
| "grad_norm": 0.4299875795841217, |
| "learning_rate": 0.0003043766763848396, |
| "loss": 3.1958, |
| "step": 84600 |
| }, |
| { |
| "epoch": 24.664918414918414, |
| "grad_norm": 0.40073278546333313, |
| "learning_rate": 0.000304201749271137, |
| "loss": 3.2059, |
| "step": 84650 |
| }, |
| { |
| "epoch": 24.67948717948718, |
| "grad_norm": 0.43635815382003784, |
| "learning_rate": 0.00030402682215743436, |
| "loss": 3.1934, |
| "step": 84700 |
| }, |
| { |
| "epoch": 24.694055944055943, |
| "grad_norm": 0.438896507024765, |
| "learning_rate": 0.00030385189504373173, |
| "loss": 3.2093, |
| "step": 84750 |
| }, |
| { |
| "epoch": 24.708624708624708, |
| "grad_norm": 0.4136168360710144, |
| "learning_rate": 0.00030367696793002916, |
| "loss": 3.2136, |
| "step": 84800 |
| }, |
| { |
| "epoch": 24.723193473193472, |
| "grad_norm": 0.3988734483718872, |
| "learning_rate": 0.00030350204081632654, |
| "loss": 3.1998, |
| "step": 84850 |
| }, |
| { |
| "epoch": 24.737762237762237, |
| "grad_norm": 0.4054337441921234, |
| "learning_rate": 0.0003033271137026239, |
| "loss": 3.2049, |
| "step": 84900 |
| }, |
| { |
| "epoch": 24.752331002331, |
| "grad_norm": 0.4305388927459717, |
| "learning_rate": 0.0003031521865889213, |
| "loss": 3.2119, |
| "step": 84950 |
| }, |
| { |
| "epoch": 24.766899766899765, |
| "grad_norm": 0.4009264409542084, |
| "learning_rate": 0.0003029772594752186, |
| "loss": 3.2055, |
| "step": 85000 |
| }, |
| { |
| "epoch": 24.766899766899765, |
| "eval_accuracy": 0.3745446587760061, |
| "eval_loss": 3.5372345447540283, |
| "eval_runtime": 180.1149, |
| "eval_samples_per_second": 92.397, |
| "eval_steps_per_second": 5.78, |
| "step": 85000 |
| }, |
| { |
| "epoch": 24.78146853146853, |
| "grad_norm": 0.429372102022171, |
| "learning_rate": 0.000302802332361516, |
| "loss": 3.2005, |
| "step": 85050 |
| }, |
| { |
| "epoch": 24.796037296037294, |
| "grad_norm": 0.43524783849716187, |
| "learning_rate": 0.00030262740524781336, |
| "loss": 3.2103, |
| "step": 85100 |
| }, |
| { |
| "epoch": 24.810606060606062, |
| "grad_norm": 0.39188215136528015, |
| "learning_rate": 0.00030245247813411074, |
| "loss": 3.2086, |
| "step": 85150 |
| }, |
| { |
| "epoch": 24.825174825174827, |
| "grad_norm": 0.43128883838653564, |
| "learning_rate": 0.0003022775510204081, |
| "loss": 3.1966, |
| "step": 85200 |
| }, |
| { |
| "epoch": 24.83974358974359, |
| "grad_norm": 0.42142587900161743, |
| "learning_rate": 0.00030210262390670554, |
| "loss": 3.2024, |
| "step": 85250 |
| }, |
| { |
| "epoch": 24.854312354312356, |
| "grad_norm": 0.41260868310928345, |
| "learning_rate": 0.0003019276967930029, |
| "loss": 3.2134, |
| "step": 85300 |
| }, |
| { |
| "epoch": 24.86888111888112, |
| "grad_norm": 0.420828253030777, |
| "learning_rate": 0.0003017527696793003, |
| "loss": 3.2118, |
| "step": 85350 |
| }, |
| { |
| "epoch": 24.883449883449885, |
| "grad_norm": 0.40663793683052063, |
| "learning_rate": 0.00030157784256559767, |
| "loss": 3.2062, |
| "step": 85400 |
| }, |
| { |
| "epoch": 24.89801864801865, |
| "grad_norm": 0.4167376756668091, |
| "learning_rate": 0.000301402915451895, |
| "loss": 3.2122, |
| "step": 85450 |
| }, |
| { |
| "epoch": 24.912587412587413, |
| "grad_norm": 0.41143983602523804, |
| "learning_rate": 0.00030122798833819237, |
| "loss": 3.2072, |
| "step": 85500 |
| }, |
| { |
| "epoch": 24.927156177156178, |
| "grad_norm": 0.43470826745033264, |
| "learning_rate": 0.00030105306122448974, |
| "loss": 3.2142, |
| "step": 85550 |
| }, |
| { |
| "epoch": 24.941724941724942, |
| "grad_norm": 0.44198209047317505, |
| "learning_rate": 0.0003008781341107871, |
| "loss": 3.2156, |
| "step": 85600 |
| }, |
| { |
| "epoch": 24.956293706293707, |
| "grad_norm": 0.40351226925849915, |
| "learning_rate": 0.0003007032069970845, |
| "loss": 3.2137, |
| "step": 85650 |
| }, |
| { |
| "epoch": 24.97086247086247, |
| "grad_norm": 0.3993280529975891, |
| "learning_rate": 0.0003005282798833819, |
| "loss": 3.2199, |
| "step": 85700 |
| }, |
| { |
| "epoch": 24.985431235431236, |
| "grad_norm": 0.42026591300964355, |
| "learning_rate": 0.0003003533527696793, |
| "loss": 3.2142, |
| "step": 85750 |
| }, |
| { |
| "epoch": 25.0, |
| "grad_norm": 0.4102586805820465, |
| "learning_rate": 0.0003001784256559767, |
| "loss": 3.2235, |
| "step": 85800 |
| }, |
| { |
| "epoch": 25.014568764568764, |
| "grad_norm": 0.43756303191185, |
| "learning_rate": 0.00030000349854227405, |
| "loss": 3.1136, |
| "step": 85850 |
| }, |
| { |
| "epoch": 25.02913752913753, |
| "grad_norm": 0.472674697637558, |
| "learning_rate": 0.00029982857142857143, |
| "loss": 3.1127, |
| "step": 85900 |
| }, |
| { |
| "epoch": 25.043706293706293, |
| "grad_norm": 0.40704378485679626, |
| "learning_rate": 0.0002996536443148688, |
| "loss": 3.1196, |
| "step": 85950 |
| }, |
| { |
| "epoch": 25.058275058275058, |
| "grad_norm": 0.42157062888145447, |
| "learning_rate": 0.0002994787172011661, |
| "loss": 3.125, |
| "step": 86000 |
| }, |
| { |
| "epoch": 25.058275058275058, |
| "eval_accuracy": 0.3741105133052887, |
| "eval_loss": 3.549722194671631, |
| "eval_runtime": 179.5998, |
| "eval_samples_per_second": 92.662, |
| "eval_steps_per_second": 5.796, |
| "step": 86000 |
| }, |
| { |
| "epoch": 25.072843822843822, |
| "grad_norm": 0.38535892963409424, |
| "learning_rate": 0.0002993037900874635, |
| "loss": 3.1172, |
| "step": 86050 |
| }, |
| { |
| "epoch": 25.087412587412587, |
| "grad_norm": 0.39269933104515076, |
| "learning_rate": 0.00029912886297376093, |
| "loss": 3.1283, |
| "step": 86100 |
| }, |
| { |
| "epoch": 25.10198135198135, |
| "grad_norm": 0.4526817500591278, |
| "learning_rate": 0.0002989539358600583, |
| "loss": 3.1342, |
| "step": 86150 |
| }, |
| { |
| "epoch": 25.116550116550115, |
| "grad_norm": 0.4041959047317505, |
| "learning_rate": 0.0002987790087463557, |
| "loss": 3.1267, |
| "step": 86200 |
| }, |
| { |
| "epoch": 25.13111888111888, |
| "grad_norm": 0.4244619905948639, |
| "learning_rate": 0.000298604081632653, |
| "loss": 3.1408, |
| "step": 86250 |
| }, |
| { |
| "epoch": 25.145687645687644, |
| "grad_norm": 0.42524653673171997, |
| "learning_rate": 0.00029842915451895044, |
| "loss": 3.138, |
| "step": 86300 |
| }, |
| { |
| "epoch": 25.16025641025641, |
| "grad_norm": 0.41081714630126953, |
| "learning_rate": 0.0002982542274052478, |
| "loss": 3.1426, |
| "step": 86350 |
| }, |
| { |
| "epoch": 25.174825174825173, |
| "grad_norm": 0.42908602952957153, |
| "learning_rate": 0.0002980793002915452, |
| "loss": 3.1439, |
| "step": 86400 |
| }, |
| { |
| "epoch": 25.189393939393938, |
| "grad_norm": 0.4357958734035492, |
| "learning_rate": 0.0002979043731778425, |
| "loss": 3.1352, |
| "step": 86450 |
| }, |
| { |
| "epoch": 25.203962703962706, |
| "grad_norm": 0.404898077249527, |
| "learning_rate": 0.0002977294460641399, |
| "loss": 3.1462, |
| "step": 86500 |
| }, |
| { |
| "epoch": 25.21853146853147, |
| "grad_norm": 0.4236783981323242, |
| "learning_rate": 0.0002975545189504373, |
| "loss": 3.1655, |
| "step": 86550 |
| }, |
| { |
| "epoch": 25.233100233100235, |
| "grad_norm": 0.4077592194080353, |
| "learning_rate": 0.0002973795918367347, |
| "loss": 3.1619, |
| "step": 86600 |
| }, |
| { |
| "epoch": 25.247668997669, |
| "grad_norm": 0.41832277178764343, |
| "learning_rate": 0.00029720466472303207, |
| "loss": 3.1489, |
| "step": 86650 |
| }, |
| { |
| "epoch": 25.262237762237763, |
| "grad_norm": 0.446024090051651, |
| "learning_rate": 0.0002970297376093294, |
| "loss": 3.1604, |
| "step": 86700 |
| }, |
| { |
| "epoch": 25.276806526806528, |
| "grad_norm": 0.4226219952106476, |
| "learning_rate": 0.0002968548104956268, |
| "loss": 3.1565, |
| "step": 86750 |
| }, |
| { |
| "epoch": 25.291375291375292, |
| "grad_norm": 0.4449532628059387, |
| "learning_rate": 0.0002966798833819242, |
| "loss": 3.1703, |
| "step": 86800 |
| }, |
| { |
| "epoch": 25.305944055944057, |
| "grad_norm": 0.4353093206882477, |
| "learning_rate": 0.00029650495626822157, |
| "loss": 3.1603, |
| "step": 86850 |
| }, |
| { |
| "epoch": 25.32051282051282, |
| "grad_norm": 0.41738250851631165, |
| "learning_rate": 0.0002963300291545189, |
| "loss": 3.1538, |
| "step": 86900 |
| }, |
| { |
| "epoch": 25.335081585081586, |
| "grad_norm": 0.4185565412044525, |
| "learning_rate": 0.0002961551020408163, |
| "loss": 3.1657, |
| "step": 86950 |
| }, |
| { |
| "epoch": 25.34965034965035, |
| "grad_norm": 0.438228577375412, |
| "learning_rate": 0.0002959801749271137, |
| "loss": 3.1831, |
| "step": 87000 |
| }, |
| { |
| "epoch": 25.34965034965035, |
| "eval_accuracy": 0.3737184653639637, |
| "eval_loss": 3.5500197410583496, |
| "eval_runtime": 179.576, |
| "eval_samples_per_second": 92.674, |
| "eval_steps_per_second": 5.797, |
| "step": 87000 |
| }, |
| { |
| "epoch": 25.364219114219114, |
| "grad_norm": 0.3924265205860138, |
| "learning_rate": 0.00029580524781341107, |
| "loss": 3.1609, |
| "step": 87050 |
| }, |
| { |
| "epoch": 25.37878787878788, |
| "grad_norm": 0.41711100935935974, |
| "learning_rate": 0.00029563032069970845, |
| "loss": 3.1553, |
| "step": 87100 |
| }, |
| { |
| "epoch": 25.393356643356643, |
| "grad_norm": 0.4473123550415039, |
| "learning_rate": 0.00029545539358600577, |
| "loss": 3.1595, |
| "step": 87150 |
| }, |
| { |
| "epoch": 25.407925407925408, |
| "grad_norm": 0.42639219760894775, |
| "learning_rate": 0.0002952804664723032, |
| "loss": 3.1708, |
| "step": 87200 |
| }, |
| { |
| "epoch": 25.422494172494172, |
| "grad_norm": 0.4454313814640045, |
| "learning_rate": 0.0002951055393586006, |
| "loss": 3.1641, |
| "step": 87250 |
| }, |
| { |
| "epoch": 25.437062937062937, |
| "grad_norm": 0.4151012599468231, |
| "learning_rate": 0.00029493061224489795, |
| "loss": 3.1709, |
| "step": 87300 |
| }, |
| { |
| "epoch": 25.4516317016317, |
| "grad_norm": 0.4197435975074768, |
| "learning_rate": 0.00029475568513119527, |
| "loss": 3.1739, |
| "step": 87350 |
| }, |
| { |
| "epoch": 25.466200466200466, |
| "grad_norm": 0.4248720705509186, |
| "learning_rate": 0.0002945807580174927, |
| "loss": 3.1689, |
| "step": 87400 |
| }, |
| { |
| "epoch": 25.48076923076923, |
| "grad_norm": 0.4041939079761505, |
| "learning_rate": 0.0002944058309037901, |
| "loss": 3.1715, |
| "step": 87450 |
| }, |
| { |
| "epoch": 25.495337995337994, |
| "grad_norm": 0.4290682077407837, |
| "learning_rate": 0.00029423090379008745, |
| "loss": 3.1731, |
| "step": 87500 |
| }, |
| { |
| "epoch": 25.50990675990676, |
| "grad_norm": 0.4535789489746094, |
| "learning_rate": 0.00029405597667638483, |
| "loss": 3.1684, |
| "step": 87550 |
| }, |
| { |
| "epoch": 25.524475524475523, |
| "grad_norm": 0.41663211584091187, |
| "learning_rate": 0.0002938810495626822, |
| "loss": 3.1749, |
| "step": 87600 |
| }, |
| { |
| "epoch": 25.539044289044288, |
| "grad_norm": 0.39012715220451355, |
| "learning_rate": 0.0002937061224489796, |
| "loss": 3.1681, |
| "step": 87650 |
| }, |
| { |
| "epoch": 25.553613053613052, |
| "grad_norm": 0.39378008246421814, |
| "learning_rate": 0.00029353119533527696, |
| "loss": 3.1925, |
| "step": 87700 |
| }, |
| { |
| "epoch": 25.568181818181817, |
| "grad_norm": 0.40518417954444885, |
| "learning_rate": 0.00029335626822157433, |
| "loss": 3.1886, |
| "step": 87750 |
| }, |
| { |
| "epoch": 25.582750582750585, |
| "grad_norm": 0.4220457673072815, |
| "learning_rate": 0.00029318134110787166, |
| "loss": 3.1855, |
| "step": 87800 |
| }, |
| { |
| "epoch": 25.59731934731935, |
| "grad_norm": 0.4504868686199188, |
| "learning_rate": 0.0002930064139941691, |
| "loss": 3.1812, |
| "step": 87850 |
| }, |
| { |
| "epoch": 25.611888111888113, |
| "grad_norm": 0.41079261898994446, |
| "learning_rate": 0.00029283148688046646, |
| "loss": 3.1858, |
| "step": 87900 |
| }, |
| { |
| "epoch": 25.626456876456878, |
| "grad_norm": 0.39636367559432983, |
| "learning_rate": 0.00029265655976676384, |
| "loss": 3.1923, |
| "step": 87950 |
| }, |
| { |
| "epoch": 25.641025641025642, |
| "grad_norm": 0.4257923364639282, |
| "learning_rate": 0.0002924816326530612, |
| "loss": 3.1967, |
| "step": 88000 |
| }, |
| { |
| "epoch": 25.641025641025642, |
| "eval_accuracy": 0.37449691688513087, |
| "eval_loss": 3.5418219566345215, |
| "eval_runtime": 181.8935, |
| "eval_samples_per_second": 91.493, |
| "eval_steps_per_second": 5.723, |
| "step": 88000 |
| }, |
| { |
| "epoch": 25.655594405594407, |
| "grad_norm": 0.42990806698799133, |
| "learning_rate": 0.0002923067055393586, |
| "loss": 3.1933, |
| "step": 88050 |
| }, |
| { |
| "epoch": 25.67016317016317, |
| "grad_norm": 0.41336822509765625, |
| "learning_rate": 0.00029213177842565596, |
| "loss": 3.1863, |
| "step": 88100 |
| }, |
| { |
| "epoch": 25.684731934731936, |
| "grad_norm": 0.42269113659858704, |
| "learning_rate": 0.00029195685131195334, |
| "loss": 3.1963, |
| "step": 88150 |
| }, |
| { |
| "epoch": 25.6993006993007, |
| "grad_norm": 0.42279052734375, |
| "learning_rate": 0.0002917819241982507, |
| "loss": 3.1905, |
| "step": 88200 |
| }, |
| { |
| "epoch": 25.713869463869464, |
| "grad_norm": 0.4442167580127716, |
| "learning_rate": 0.0002916069970845481, |
| "loss": 3.1855, |
| "step": 88250 |
| }, |
| { |
| "epoch": 25.72843822843823, |
| "grad_norm": 0.44783613085746765, |
| "learning_rate": 0.00029143206997084547, |
| "loss": 3.1926, |
| "step": 88300 |
| }, |
| { |
| "epoch": 25.743006993006993, |
| "grad_norm": 0.4339323341846466, |
| "learning_rate": 0.00029125714285714284, |
| "loss": 3.2009, |
| "step": 88350 |
| }, |
| { |
| "epoch": 25.757575757575758, |
| "grad_norm": 0.4134320616722107, |
| "learning_rate": 0.0002910822157434402, |
| "loss": 3.1883, |
| "step": 88400 |
| }, |
| { |
| "epoch": 25.772144522144522, |
| "grad_norm": 0.40038609504699707, |
| "learning_rate": 0.0002909072886297376, |
| "loss": 3.2002, |
| "step": 88450 |
| }, |
| { |
| "epoch": 25.786713286713287, |
| "grad_norm": 0.40860867500305176, |
| "learning_rate": 0.00029073236151603497, |
| "loss": 3.1973, |
| "step": 88500 |
| }, |
| { |
| "epoch": 25.80128205128205, |
| "grad_norm": 0.4466297924518585, |
| "learning_rate": 0.00029055743440233235, |
| "loss": 3.1971, |
| "step": 88550 |
| }, |
| { |
| "epoch": 25.815850815850816, |
| "grad_norm": 0.3885636329650879, |
| "learning_rate": 0.0002903825072886297, |
| "loss": 3.1982, |
| "step": 88600 |
| }, |
| { |
| "epoch": 25.83041958041958, |
| "grad_norm": 0.4263664186000824, |
| "learning_rate": 0.0002902075801749271, |
| "loss": 3.1932, |
| "step": 88650 |
| }, |
| { |
| "epoch": 25.844988344988344, |
| "grad_norm": 0.4211662709712982, |
| "learning_rate": 0.0002900326530612245, |
| "loss": 3.1943, |
| "step": 88700 |
| }, |
| { |
| "epoch": 25.85955710955711, |
| "grad_norm": 0.408115953207016, |
| "learning_rate": 0.00028985772594752185, |
| "loss": 3.1981, |
| "step": 88750 |
| }, |
| { |
| "epoch": 25.874125874125873, |
| "grad_norm": 0.41231685876846313, |
| "learning_rate": 0.0002896827988338192, |
| "loss": 3.2069, |
| "step": 88800 |
| }, |
| { |
| "epoch": 25.888694638694638, |
| "grad_norm": 0.42815014719963074, |
| "learning_rate": 0.0002895078717201166, |
| "loss": 3.2032, |
| "step": 88850 |
| }, |
| { |
| "epoch": 25.903263403263402, |
| "grad_norm": 0.3903138339519501, |
| "learning_rate": 0.000289332944606414, |
| "loss": 3.2072, |
| "step": 88900 |
| }, |
| { |
| "epoch": 25.917832167832167, |
| "grad_norm": 0.4448038339614868, |
| "learning_rate": 0.00028915801749271135, |
| "loss": 3.2024, |
| "step": 88950 |
| }, |
| { |
| "epoch": 25.93240093240093, |
| "grad_norm": 0.4322460889816284, |
| "learning_rate": 0.00028898309037900873, |
| "loss": 3.198, |
| "step": 89000 |
| }, |
| { |
| "epoch": 25.93240093240093, |
| "eval_accuracy": 0.3751864697129442, |
| "eval_loss": 3.5322518348693848, |
| "eval_runtime": 181.8154, |
| "eval_samples_per_second": 91.532, |
| "eval_steps_per_second": 5.726, |
| "step": 89000 |
| }, |
| { |
| "epoch": 25.946969696969695, |
| "grad_norm": 0.40827956795692444, |
| "learning_rate": 0.0002888081632653061, |
| "loss": 3.1979, |
| "step": 89050 |
| }, |
| { |
| "epoch": 25.96153846153846, |
| "grad_norm": 0.4115292727947235, |
| "learning_rate": 0.0002886332361516035, |
| "loss": 3.1955, |
| "step": 89100 |
| }, |
| { |
| "epoch": 25.976107226107224, |
| "grad_norm": 0.42888209223747253, |
| "learning_rate": 0.00028845830903790086, |
| "loss": 3.2055, |
| "step": 89150 |
| }, |
| { |
| "epoch": 25.990675990675992, |
| "grad_norm": 0.411482572555542, |
| "learning_rate": 0.00028828338192419823, |
| "loss": 3.2077, |
| "step": 89200 |
| }, |
| { |
| "epoch": 26.005244755244757, |
| "grad_norm": 0.3968612551689148, |
| "learning_rate": 0.0002881084548104956, |
| "loss": 3.1665, |
| "step": 89250 |
| }, |
| { |
| "epoch": 26.01981351981352, |
| "grad_norm": 0.42154043912887573, |
| "learning_rate": 0.000287933527696793, |
| "loss": 3.1057, |
| "step": 89300 |
| }, |
| { |
| "epoch": 26.034382284382286, |
| "grad_norm": 0.4301404356956482, |
| "learning_rate": 0.00028775860058309036, |
| "loss": 3.0975, |
| "step": 89350 |
| }, |
| { |
| "epoch": 26.04895104895105, |
| "grad_norm": 0.4084946811199188, |
| "learning_rate": 0.00028758367346938773, |
| "loss": 3.1097, |
| "step": 89400 |
| }, |
| { |
| "epoch": 26.063519813519815, |
| "grad_norm": 0.41912487149238586, |
| "learning_rate": 0.0002874087463556851, |
| "loss": 3.1278, |
| "step": 89450 |
| }, |
| { |
| "epoch": 26.07808857808858, |
| "grad_norm": 0.4204862713813782, |
| "learning_rate": 0.0002872338192419825, |
| "loss": 3.1172, |
| "step": 89500 |
| }, |
| { |
| "epoch": 26.092657342657343, |
| "grad_norm": 0.41860130429267883, |
| "learning_rate": 0.00028705889212827986, |
| "loss": 3.124, |
| "step": 89550 |
| }, |
| { |
| "epoch": 26.107226107226108, |
| "grad_norm": 0.41904857754707336, |
| "learning_rate": 0.00028688396501457724, |
| "loss": 3.1286, |
| "step": 89600 |
| }, |
| { |
| "epoch": 26.121794871794872, |
| "grad_norm": 0.418357789516449, |
| "learning_rate": 0.0002867090379008746, |
| "loss": 3.1165, |
| "step": 89650 |
| }, |
| { |
| "epoch": 26.136363636363637, |
| "grad_norm": 0.4240556061267853, |
| "learning_rate": 0.000286534110787172, |
| "loss": 3.1232, |
| "step": 89700 |
| }, |
| { |
| "epoch": 26.1509324009324, |
| "grad_norm": 0.41585004329681396, |
| "learning_rate": 0.00028635918367346937, |
| "loss": 3.1326, |
| "step": 89750 |
| }, |
| { |
| "epoch": 26.165501165501166, |
| "grad_norm": 0.4257110357284546, |
| "learning_rate": 0.00028618425655976674, |
| "loss": 3.1437, |
| "step": 89800 |
| }, |
| { |
| "epoch": 26.18006993006993, |
| "grad_norm": 0.41216740012168884, |
| "learning_rate": 0.0002860093294460641, |
| "loss": 3.1363, |
| "step": 89850 |
| }, |
| { |
| "epoch": 26.194638694638694, |
| "grad_norm": 0.4310924708843231, |
| "learning_rate": 0.0002858344023323615, |
| "loss": 3.1374, |
| "step": 89900 |
| }, |
| { |
| "epoch": 26.20920745920746, |
| "grad_norm": 0.44105252623558044, |
| "learning_rate": 0.00028565947521865887, |
| "loss": 3.1325, |
| "step": 89950 |
| }, |
| { |
| "epoch": 26.223776223776223, |
| "grad_norm": 0.4135071933269501, |
| "learning_rate": 0.00028548454810495624, |
| "loss": 3.1476, |
| "step": 90000 |
| }, |
| { |
| "epoch": 26.223776223776223, |
| "eval_accuracy": 0.37405395209959663, |
| "eval_loss": 3.5517945289611816, |
| "eval_runtime": 181.7859, |
| "eval_samples_per_second": 91.547, |
| "eval_steps_per_second": 5.727, |
| "step": 90000 |
| }, |
| { |
| "epoch": 26.238344988344988, |
| "grad_norm": 0.3974347412586212, |
| "learning_rate": 0.0002853096209912536, |
| "loss": 3.1361, |
| "step": 90050 |
| }, |
| { |
| "epoch": 26.252913752913752, |
| "grad_norm": 0.42860710620880127, |
| "learning_rate": 0.000285134693877551, |
| "loss": 3.1546, |
| "step": 90100 |
| }, |
| { |
| "epoch": 26.267482517482517, |
| "grad_norm": 0.41412895917892456, |
| "learning_rate": 0.00028495976676384837, |
| "loss": 3.1557, |
| "step": 90150 |
| }, |
| { |
| "epoch": 26.28205128205128, |
| "grad_norm": 0.41339340806007385, |
| "learning_rate": 0.00028478483965014575, |
| "loss": 3.1351, |
| "step": 90200 |
| }, |
| { |
| "epoch": 26.296620046620045, |
| "grad_norm": 0.42824968695640564, |
| "learning_rate": 0.0002846099125364431, |
| "loss": 3.1567, |
| "step": 90250 |
| }, |
| { |
| "epoch": 26.31118881118881, |
| "grad_norm": 0.40359318256378174, |
| "learning_rate": 0.0002844349854227405, |
| "loss": 3.1551, |
| "step": 90300 |
| }, |
| { |
| "epoch": 26.325757575757574, |
| "grad_norm": 0.4394044578075409, |
| "learning_rate": 0.0002842600583090379, |
| "loss": 3.157, |
| "step": 90350 |
| }, |
| { |
| "epoch": 26.34032634032634, |
| "grad_norm": 0.4745209515094757, |
| "learning_rate": 0.00028408513119533525, |
| "loss": 3.1352, |
| "step": 90400 |
| }, |
| { |
| "epoch": 26.354895104895103, |
| "grad_norm": 0.4229678213596344, |
| "learning_rate": 0.0002839102040816326, |
| "loss": 3.1542, |
| "step": 90450 |
| }, |
| { |
| "epoch": 26.36946386946387, |
| "grad_norm": 0.44974929094314575, |
| "learning_rate": 0.00028373527696793, |
| "loss": 3.1574, |
| "step": 90500 |
| }, |
| { |
| "epoch": 26.384032634032636, |
| "grad_norm": 0.4348454177379608, |
| "learning_rate": 0.0002835603498542274, |
| "loss": 3.162, |
| "step": 90550 |
| }, |
| { |
| "epoch": 26.3986013986014, |
| "grad_norm": 0.42422547936439514, |
| "learning_rate": 0.00028338542274052475, |
| "loss": 3.1615, |
| "step": 90600 |
| }, |
| { |
| "epoch": 26.413170163170165, |
| "grad_norm": 0.4377591609954834, |
| "learning_rate": 0.00028321049562682213, |
| "loss": 3.171, |
| "step": 90650 |
| }, |
| { |
| "epoch": 26.42773892773893, |
| "grad_norm": 0.4380994737148285, |
| "learning_rate": 0.0002830355685131195, |
| "loss": 3.1649, |
| "step": 90700 |
| }, |
| { |
| "epoch": 26.442307692307693, |
| "grad_norm": 0.4348939061164856, |
| "learning_rate": 0.0002828606413994169, |
| "loss": 3.1701, |
| "step": 90750 |
| }, |
| { |
| "epoch": 26.456876456876458, |
| "grad_norm": 0.4366742968559265, |
| "learning_rate": 0.00028268571428571426, |
| "loss": 3.1557, |
| "step": 90800 |
| }, |
| { |
| "epoch": 26.471445221445222, |
| "grad_norm": 0.44443434476852417, |
| "learning_rate": 0.00028251078717201163, |
| "loss": 3.1722, |
| "step": 90850 |
| }, |
| { |
| "epoch": 26.486013986013987, |
| "grad_norm": 0.4162462055683136, |
| "learning_rate": 0.000282335860058309, |
| "loss": 3.1655, |
| "step": 90900 |
| }, |
| { |
| "epoch": 26.50058275058275, |
| "grad_norm": 0.4146740138530731, |
| "learning_rate": 0.0002821609329446064, |
| "loss": 3.1597, |
| "step": 90950 |
| }, |
| { |
| "epoch": 26.515151515151516, |
| "grad_norm": 0.4478785991668701, |
| "learning_rate": 0.00028198600583090376, |
| "loss": 3.1699, |
| "step": 91000 |
| }, |
| { |
| "epoch": 26.515151515151516, |
| "eval_accuracy": 0.37451432033303617, |
| "eval_loss": 3.5433578491210938, |
| "eval_runtime": 181.9511, |
| "eval_samples_per_second": 91.464, |
| "eval_steps_per_second": 5.721, |
| "step": 91000 |
| }, |
| { |
| "epoch": 26.52972027972028, |
| "grad_norm": 0.43394702672958374, |
| "learning_rate": 0.00028181107871720114, |
| "loss": 3.1615, |
| "step": 91050 |
| }, |
| { |
| "epoch": 26.544289044289044, |
| "grad_norm": 0.4158869683742523, |
| "learning_rate": 0.0002816361516034985, |
| "loss": 3.1802, |
| "step": 91100 |
| }, |
| { |
| "epoch": 26.55885780885781, |
| "grad_norm": 0.43913793563842773, |
| "learning_rate": 0.0002814612244897959, |
| "loss": 3.1835, |
| "step": 91150 |
| }, |
| { |
| "epoch": 26.573426573426573, |
| "grad_norm": 0.4561229944229126, |
| "learning_rate": 0.00028128629737609326, |
| "loss": 3.1617, |
| "step": 91200 |
| }, |
| { |
| "epoch": 26.587995337995338, |
| "grad_norm": 0.4013047218322754, |
| "learning_rate": 0.00028111137026239064, |
| "loss": 3.1696, |
| "step": 91250 |
| }, |
| { |
| "epoch": 26.602564102564102, |
| "grad_norm": 0.4297322928905487, |
| "learning_rate": 0.000280936443148688, |
| "loss": 3.1684, |
| "step": 91300 |
| }, |
| { |
| "epoch": 26.617132867132867, |
| "grad_norm": 0.44311001896858215, |
| "learning_rate": 0.0002807615160349854, |
| "loss": 3.1867, |
| "step": 91350 |
| }, |
| { |
| "epoch": 26.63170163170163, |
| "grad_norm": 0.4296179413795471, |
| "learning_rate": 0.00028058658892128277, |
| "loss": 3.1862, |
| "step": 91400 |
| }, |
| { |
| "epoch": 26.646270396270396, |
| "grad_norm": 0.41797104477882385, |
| "learning_rate": 0.00028041166180758014, |
| "loss": 3.1884, |
| "step": 91450 |
| }, |
| { |
| "epoch": 26.66083916083916, |
| "grad_norm": 0.4100271165370941, |
| "learning_rate": 0.0002802367346938775, |
| "loss": 3.1727, |
| "step": 91500 |
| }, |
| { |
| "epoch": 26.675407925407924, |
| "grad_norm": 0.4347117245197296, |
| "learning_rate": 0.0002800618075801749, |
| "loss": 3.1848, |
| "step": 91550 |
| }, |
| { |
| "epoch": 26.68997668997669, |
| "grad_norm": 0.4233982563018799, |
| "learning_rate": 0.00027988688046647227, |
| "loss": 3.1763, |
| "step": 91600 |
| }, |
| { |
| "epoch": 26.704545454545453, |
| "grad_norm": 0.4804072976112366, |
| "learning_rate": 0.00027971195335276965, |
| "loss": 3.1964, |
| "step": 91650 |
| }, |
| { |
| "epoch": 26.719114219114218, |
| "grad_norm": 0.4588761627674103, |
| "learning_rate": 0.000279537026239067, |
| "loss": 3.185, |
| "step": 91700 |
| }, |
| { |
| "epoch": 26.733682983682982, |
| "grad_norm": 0.45718541741371155, |
| "learning_rate": 0.0002793620991253644, |
| "loss": 3.1868, |
| "step": 91750 |
| }, |
| { |
| "epoch": 26.748251748251747, |
| "grad_norm": 0.43570733070373535, |
| "learning_rate": 0.00027918717201166177, |
| "loss": 3.1869, |
| "step": 91800 |
| }, |
| { |
| "epoch": 26.76282051282051, |
| "grad_norm": 0.4332759976387024, |
| "learning_rate": 0.00027901224489795915, |
| "loss": 3.1816, |
| "step": 91850 |
| }, |
| { |
| "epoch": 26.77738927738928, |
| "grad_norm": 0.42579400539398193, |
| "learning_rate": 0.0002788373177842565, |
| "loss": 3.1852, |
| "step": 91900 |
| }, |
| { |
| "epoch": 26.791958041958043, |
| "grad_norm": 0.41861170530319214, |
| "learning_rate": 0.0002786623906705539, |
| "loss": 3.1825, |
| "step": 91950 |
| }, |
| { |
| "epoch": 26.806526806526808, |
| "grad_norm": 0.4397330582141876, |
| "learning_rate": 0.0002784874635568513, |
| "loss": 3.1932, |
| "step": 92000 |
| }, |
| { |
| "epoch": 26.806526806526808, |
| "eval_accuracy": 0.3745991033461421, |
| "eval_loss": 3.5361201763153076, |
| "eval_runtime": 182.0117, |
| "eval_samples_per_second": 91.434, |
| "eval_steps_per_second": 5.719, |
| "step": 92000 |
| }, |
| { |
| "epoch": 26.821095571095572, |
| "grad_norm": 0.4330262839794159, |
| "learning_rate": 0.00027831253644314865, |
| "loss": 3.1821, |
| "step": 92050 |
| }, |
| { |
| "epoch": 26.835664335664337, |
| "grad_norm": 0.4355219602584839, |
| "learning_rate": 0.00027813760932944603, |
| "loss": 3.1778, |
| "step": 92100 |
| }, |
| { |
| "epoch": 26.8502331002331, |
| "grad_norm": 0.4293540418148041, |
| "learning_rate": 0.0002779626822157434, |
| "loss": 3.1707, |
| "step": 92150 |
| }, |
| { |
| "epoch": 26.864801864801866, |
| "grad_norm": 0.41315534710884094, |
| "learning_rate": 0.0002777877551020408, |
| "loss": 3.1928, |
| "step": 92200 |
| }, |
| { |
| "epoch": 26.87937062937063, |
| "grad_norm": 0.42750847339630127, |
| "learning_rate": 0.0002776128279883382, |
| "loss": 3.1829, |
| "step": 92250 |
| }, |
| { |
| "epoch": 26.893939393939394, |
| "grad_norm": 0.4304085671901703, |
| "learning_rate": 0.00027743790087463553, |
| "loss": 3.193, |
| "step": 92300 |
| }, |
| { |
| "epoch": 26.90850815850816, |
| "grad_norm": 0.4276728332042694, |
| "learning_rate": 0.0002772629737609329, |
| "loss": 3.1917, |
| "step": 92350 |
| }, |
| { |
| "epoch": 26.923076923076923, |
| "grad_norm": 0.4612192213535309, |
| "learning_rate": 0.0002770880466472303, |
| "loss": 3.1975, |
| "step": 92400 |
| }, |
| { |
| "epoch": 26.937645687645688, |
| "grad_norm": 0.4436318874359131, |
| "learning_rate": 0.00027691311953352766, |
| "loss": 3.1836, |
| "step": 92450 |
| }, |
| { |
| "epoch": 26.952214452214452, |
| "grad_norm": 0.4181881844997406, |
| "learning_rate": 0.00027673819241982503, |
| "loss": 3.1878, |
| "step": 92500 |
| }, |
| { |
| "epoch": 26.966783216783217, |
| "grad_norm": 0.420480340719223, |
| "learning_rate": 0.0002765632653061224, |
| "loss": 3.1883, |
| "step": 92550 |
| }, |
| { |
| "epoch": 26.98135198135198, |
| "grad_norm": 0.3930075764656067, |
| "learning_rate": 0.0002763883381924198, |
| "loss": 3.1982, |
| "step": 92600 |
| }, |
| { |
| "epoch": 26.995920745920746, |
| "grad_norm": 0.43089744448661804, |
| "learning_rate": 0.00027621341107871716, |
| "loss": 3.1935, |
| "step": 92650 |
| }, |
| { |
| "epoch": 27.01048951048951, |
| "grad_norm": 0.4508107006549835, |
| "learning_rate": 0.0002760384839650146, |
| "loss": 3.1204, |
| "step": 92700 |
| }, |
| { |
| "epoch": 27.025058275058274, |
| "grad_norm": 0.4305284321308136, |
| "learning_rate": 0.0002758635568513119, |
| "loss": 3.1035, |
| "step": 92750 |
| }, |
| { |
| "epoch": 27.03962703962704, |
| "grad_norm": 0.42484739422798157, |
| "learning_rate": 0.0002756886297376093, |
| "loss": 3.0986, |
| "step": 92800 |
| }, |
| { |
| "epoch": 27.054195804195803, |
| "grad_norm": 0.44576549530029297, |
| "learning_rate": 0.00027551370262390666, |
| "loss": 3.102, |
| "step": 92850 |
| }, |
| { |
| "epoch": 27.068764568764568, |
| "grad_norm": 0.42311638593673706, |
| "learning_rate": 0.0002753387755102041, |
| "loss": 3.1047, |
| "step": 92900 |
| }, |
| { |
| "epoch": 27.083333333333332, |
| "grad_norm": 0.43278342485427856, |
| "learning_rate": 0.0002751638483965014, |
| "loss": 3.1077, |
| "step": 92950 |
| }, |
| { |
| "epoch": 27.097902097902097, |
| "grad_norm": 0.4202318787574768, |
| "learning_rate": 0.0002749889212827988, |
| "loss": 3.129, |
| "step": 93000 |
| }, |
| { |
| "epoch": 27.097902097902097, |
| "eval_accuracy": 0.37368542233111657, |
| "eval_loss": 3.5532684326171875, |
| "eval_runtime": 179.8211, |
| "eval_samples_per_second": 92.548, |
| "eval_steps_per_second": 5.789, |
| "step": 93000 |
| }, |
| { |
| "epoch": 27.11247086247086, |
| "grad_norm": 0.4148189127445221, |
| "learning_rate": 0.00027481399416909617, |
| "loss": 3.1146, |
| "step": 93050 |
| }, |
| { |
| "epoch": 27.127039627039625, |
| "grad_norm": 0.4284904897212982, |
| "learning_rate": 0.00027463906705539354, |
| "loss": 3.1226, |
| "step": 93100 |
| }, |
| { |
| "epoch": 27.14160839160839, |
| "grad_norm": 0.4357090890407562, |
| "learning_rate": 0.000274464139941691, |
| "loss": 3.1164, |
| "step": 93150 |
| }, |
| { |
| "epoch": 27.156177156177158, |
| "grad_norm": 0.4211357831954956, |
| "learning_rate": 0.0002742892128279883, |
| "loss": 3.1247, |
| "step": 93200 |
| }, |
| { |
| "epoch": 27.170745920745922, |
| "grad_norm": 0.4347440302371979, |
| "learning_rate": 0.00027411428571428567, |
| "loss": 3.1199, |
| "step": 93250 |
| }, |
| { |
| "epoch": 27.185314685314687, |
| "grad_norm": 0.4083610475063324, |
| "learning_rate": 0.00027393935860058305, |
| "loss": 3.1151, |
| "step": 93300 |
| }, |
| { |
| "epoch": 27.19988344988345, |
| "grad_norm": 0.451709121465683, |
| "learning_rate": 0.0002737644314868805, |
| "loss": 3.1266, |
| "step": 93350 |
| }, |
| { |
| "epoch": 27.214452214452216, |
| "grad_norm": 0.44590309262275696, |
| "learning_rate": 0.0002735895043731778, |
| "loss": 3.1305, |
| "step": 93400 |
| }, |
| { |
| "epoch": 27.22902097902098, |
| "grad_norm": 0.4306364357471466, |
| "learning_rate": 0.0002734145772594752, |
| "loss": 3.127, |
| "step": 93450 |
| }, |
| { |
| "epoch": 27.243589743589745, |
| "grad_norm": 0.4174492359161377, |
| "learning_rate": 0.00027323965014577255, |
| "loss": 3.1401, |
| "step": 93500 |
| }, |
| { |
| "epoch": 27.25815850815851, |
| "grad_norm": 0.4140374958515167, |
| "learning_rate": 0.00027306472303207, |
| "loss": 3.1344, |
| "step": 93550 |
| }, |
| { |
| "epoch": 27.272727272727273, |
| "grad_norm": 0.4186951518058777, |
| "learning_rate": 0.00027288979591836736, |
| "loss": 3.1323, |
| "step": 93600 |
| }, |
| { |
| "epoch": 27.287296037296038, |
| "grad_norm": 0.4227970838546753, |
| "learning_rate": 0.0002727148688046647, |
| "loss": 3.1298, |
| "step": 93650 |
| }, |
| { |
| "epoch": 27.301864801864802, |
| "grad_norm": 0.4474739730358124, |
| "learning_rate": 0.00027253994169096205, |
| "loss": 3.1429, |
| "step": 93700 |
| }, |
| { |
| "epoch": 27.316433566433567, |
| "grad_norm": 0.399031400680542, |
| "learning_rate": 0.00027236501457725943, |
| "loss": 3.1415, |
| "step": 93750 |
| }, |
| { |
| "epoch": 27.33100233100233, |
| "grad_norm": 0.4408739507198334, |
| "learning_rate": 0.00027219008746355686, |
| "loss": 3.1598, |
| "step": 93800 |
| }, |
| { |
| "epoch": 27.345571095571096, |
| "grad_norm": 0.4364926218986511, |
| "learning_rate": 0.0002720151603498542, |
| "loss": 3.1415, |
| "step": 93850 |
| }, |
| { |
| "epoch": 27.36013986013986, |
| "grad_norm": 0.43380430340766907, |
| "learning_rate": 0.00027184023323615156, |
| "loss": 3.1496, |
| "step": 93900 |
| }, |
| { |
| "epoch": 27.374708624708624, |
| "grad_norm": 0.4094713032245636, |
| "learning_rate": 0.00027166530612244893, |
| "loss": 3.152, |
| "step": 93950 |
| }, |
| { |
| "epoch": 27.38927738927739, |
| "grad_norm": 0.43629366159439087, |
| "learning_rate": 0.00027149037900874636, |
| "loss": 3.1585, |
| "step": 94000 |
| }, |
| { |
| "epoch": 27.38927738927739, |
| "eval_accuracy": 0.374065123231698, |
| "eval_loss": 3.546992301940918, |
| "eval_runtime": 179.6616, |
| "eval_samples_per_second": 92.63, |
| "eval_steps_per_second": 5.794, |
| "step": 94000 |
| }, |
| { |
| "epoch": 27.403846153846153, |
| "grad_norm": 0.4074999690055847, |
| "learning_rate": 0.00027131545189504374, |
| "loss": 3.1506, |
| "step": 94050 |
| }, |
| { |
| "epoch": 27.418414918414918, |
| "grad_norm": 0.45839694142341614, |
| "learning_rate": 0.00027114052478134106, |
| "loss": 3.1496, |
| "step": 94100 |
| }, |
| { |
| "epoch": 27.432983682983682, |
| "grad_norm": 0.4555986821651459, |
| "learning_rate": 0.00027096559766763843, |
| "loss": 3.1427, |
| "step": 94150 |
| }, |
| { |
| "epoch": 27.447552447552447, |
| "grad_norm": 0.4265489876270294, |
| "learning_rate": 0.00027079067055393586, |
| "loss": 3.1546, |
| "step": 94200 |
| }, |
| { |
| "epoch": 27.46212121212121, |
| "grad_norm": 0.4634106159210205, |
| "learning_rate": 0.00027061574344023324, |
| "loss": 3.1474, |
| "step": 94250 |
| }, |
| { |
| "epoch": 27.476689976689975, |
| "grad_norm": 0.46226605772972107, |
| "learning_rate": 0.00027044081632653056, |
| "loss": 3.16, |
| "step": 94300 |
| }, |
| { |
| "epoch": 27.49125874125874, |
| "grad_norm": 0.4321894347667694, |
| "learning_rate": 0.00027026588921282794, |
| "loss": 3.1583, |
| "step": 94350 |
| }, |
| { |
| "epoch": 27.505827505827504, |
| "grad_norm": 0.44746580719947815, |
| "learning_rate": 0.0002700909620991253, |
| "loss": 3.1516, |
| "step": 94400 |
| }, |
| { |
| "epoch": 27.52039627039627, |
| "grad_norm": 0.4335636794567108, |
| "learning_rate": 0.00026991603498542274, |
| "loss": 3.1583, |
| "step": 94450 |
| }, |
| { |
| "epoch": 27.534965034965033, |
| "grad_norm": 0.4515124559402466, |
| "learning_rate": 0.0002697411078717201, |
| "loss": 3.1625, |
| "step": 94500 |
| }, |
| { |
| "epoch": 27.5495337995338, |
| "grad_norm": 0.47119760513305664, |
| "learning_rate": 0.00026956618075801744, |
| "loss": 3.1551, |
| "step": 94550 |
| }, |
| { |
| "epoch": 27.564102564102566, |
| "grad_norm": 0.42898598313331604, |
| "learning_rate": 0.0002693912536443148, |
| "loss": 3.1534, |
| "step": 94600 |
| }, |
| { |
| "epoch": 27.57867132867133, |
| "grad_norm": 0.4289074242115021, |
| "learning_rate": 0.00026921632653061225, |
| "loss": 3.1637, |
| "step": 94650 |
| }, |
| { |
| "epoch": 27.593240093240095, |
| "grad_norm": 0.4532720148563385, |
| "learning_rate": 0.0002690413994169096, |
| "loss": 3.1754, |
| "step": 94700 |
| }, |
| { |
| "epoch": 27.60780885780886, |
| "grad_norm": 0.4546985626220703, |
| "learning_rate": 0.00026886647230320694, |
| "loss": 3.1657, |
| "step": 94750 |
| }, |
| { |
| "epoch": 27.622377622377623, |
| "grad_norm": 0.42070233821868896, |
| "learning_rate": 0.0002686915451895043, |
| "loss": 3.1667, |
| "step": 94800 |
| }, |
| { |
| "epoch": 27.636946386946388, |
| "grad_norm": 0.4291580021381378, |
| "learning_rate": 0.00026851661807580175, |
| "loss": 3.1627, |
| "step": 94850 |
| }, |
| { |
| "epoch": 27.651515151515152, |
| "grad_norm": 0.4327968657016754, |
| "learning_rate": 0.0002683416909620991, |
| "loss": 3.1766, |
| "step": 94900 |
| }, |
| { |
| "epoch": 27.666083916083917, |
| "grad_norm": 0.4629499912261963, |
| "learning_rate": 0.0002681667638483965, |
| "loss": 3.1685, |
| "step": 94950 |
| }, |
| { |
| "epoch": 27.68065268065268, |
| "grad_norm": 0.4146984815597534, |
| "learning_rate": 0.0002679918367346938, |
| "loss": 3.1692, |
| "step": 95000 |
| }, |
| { |
| "epoch": 27.68065268065268, |
| "eval_accuracy": 0.37507440561933814, |
| "eval_loss": 3.5368077754974365, |
| "eval_runtime": 179.6334, |
| "eval_samples_per_second": 92.644, |
| "eval_steps_per_second": 5.795, |
| "step": 95000 |
| }, |
| { |
| "epoch": 27.695221445221446, |
| "grad_norm": 0.41717877984046936, |
| "learning_rate": 0.00026781690962099125, |
| "loss": 3.1761, |
| "step": 95050 |
| }, |
| { |
| "epoch": 27.70979020979021, |
| "grad_norm": 0.43753674626350403, |
| "learning_rate": 0.00026764198250728863, |
| "loss": 3.1782, |
| "step": 95100 |
| }, |
| { |
| "epoch": 27.724358974358974, |
| "grad_norm": 0.42679542303085327, |
| "learning_rate": 0.000267467055393586, |
| "loss": 3.1722, |
| "step": 95150 |
| }, |
| { |
| "epoch": 27.73892773892774, |
| "grad_norm": 0.42229217290878296, |
| "learning_rate": 0.0002672921282798833, |
| "loss": 3.1785, |
| "step": 95200 |
| }, |
| { |
| "epoch": 27.753496503496503, |
| "grad_norm": 0.40234848856925964, |
| "learning_rate": 0.0002671172011661807, |
| "loss": 3.1675, |
| "step": 95250 |
| }, |
| { |
| "epoch": 27.768065268065268, |
| "grad_norm": 0.4103754758834839, |
| "learning_rate": 0.00026694227405247813, |
| "loss": 3.1787, |
| "step": 95300 |
| }, |
| { |
| "epoch": 27.782634032634032, |
| "grad_norm": 0.4386822581291199, |
| "learning_rate": 0.0002667673469387755, |
| "loss": 3.1718, |
| "step": 95350 |
| }, |
| { |
| "epoch": 27.797202797202797, |
| "grad_norm": 0.41249123215675354, |
| "learning_rate": 0.0002665924198250729, |
| "loss": 3.1797, |
| "step": 95400 |
| }, |
| { |
| "epoch": 27.81177156177156, |
| "grad_norm": 0.41915491223335266, |
| "learning_rate": 0.0002664174927113702, |
| "loss": 3.1828, |
| "step": 95450 |
| }, |
| { |
| "epoch": 27.826340326340326, |
| "grad_norm": 0.42128297686576843, |
| "learning_rate": 0.00026624256559766764, |
| "loss": 3.1821, |
| "step": 95500 |
| }, |
| { |
| "epoch": 27.84090909090909, |
| "grad_norm": 0.42073553800582886, |
| "learning_rate": 0.000266067638483965, |
| "loss": 3.2004, |
| "step": 95550 |
| }, |
| { |
| "epoch": 27.855477855477854, |
| "grad_norm": 0.4234536588191986, |
| "learning_rate": 0.0002658927113702624, |
| "loss": 3.1694, |
| "step": 95600 |
| }, |
| { |
| "epoch": 27.87004662004662, |
| "grad_norm": 0.4323881268501282, |
| "learning_rate": 0.0002657177842565597, |
| "loss": 3.183, |
| "step": 95650 |
| }, |
| { |
| "epoch": 27.884615384615383, |
| "grad_norm": 0.4549892544746399, |
| "learning_rate": 0.00026554285714285714, |
| "loss": 3.1858, |
| "step": 95700 |
| }, |
| { |
| "epoch": 27.899184149184148, |
| "grad_norm": 0.4438720941543579, |
| "learning_rate": 0.0002653679300291545, |
| "loss": 3.1879, |
| "step": 95750 |
| }, |
| { |
| "epoch": 27.913752913752912, |
| "grad_norm": 0.42953628301620483, |
| "learning_rate": 0.0002651930029154519, |
| "loss": 3.1892, |
| "step": 95800 |
| }, |
| { |
| "epoch": 27.92832167832168, |
| "grad_norm": 0.4279513657093048, |
| "learning_rate": 0.00026501807580174927, |
| "loss": 3.1857, |
| "step": 95850 |
| }, |
| { |
| "epoch": 27.942890442890445, |
| "grad_norm": 0.44362860918045044, |
| "learning_rate": 0.0002648431486880466, |
| "loss": 3.1813, |
| "step": 95900 |
| }, |
| { |
| "epoch": 27.95745920745921, |
| "grad_norm": 0.4112043082714081, |
| "learning_rate": 0.000264668221574344, |
| "loss": 3.1902, |
| "step": 95950 |
| }, |
| { |
| "epoch": 27.972027972027973, |
| "grad_norm": 0.4335898756980896, |
| "learning_rate": 0.0002644932944606414, |
| "loss": 3.2099, |
| "step": 96000 |
| }, |
| { |
| "epoch": 27.972027972027973, |
| "eval_accuracy": 0.37546492487942823, |
| "eval_loss": 3.533925771713257, |
| "eval_runtime": 179.0974, |
| "eval_samples_per_second": 92.922, |
| "eval_steps_per_second": 5.812, |
| "step": 96000 |
| }, |
| { |
| "epoch": 27.986596736596738, |
| "grad_norm": 0.3973131477832794, |
| "learning_rate": 0.00026431836734693877, |
| "loss": 3.1847, |
| "step": 96050 |
| }, |
| { |
| "epoch": 28.001165501165502, |
| "grad_norm": 0.4298485219478607, |
| "learning_rate": 0.0002641434402332361, |
| "loss": 3.1814, |
| "step": 96100 |
| }, |
| { |
| "epoch": 28.015734265734267, |
| "grad_norm": 0.4456147253513336, |
| "learning_rate": 0.0002639685131195335, |
| "loss": 3.0832, |
| "step": 96150 |
| }, |
| { |
| "epoch": 28.03030303030303, |
| "grad_norm": 0.4482695162296295, |
| "learning_rate": 0.0002637935860058309, |
| "loss": 3.0932, |
| "step": 96200 |
| }, |
| { |
| "epoch": 28.044871794871796, |
| "grad_norm": 0.4353080689907074, |
| "learning_rate": 0.00026361865889212827, |
| "loss": 3.0925, |
| "step": 96250 |
| }, |
| { |
| "epoch": 28.05944055944056, |
| "grad_norm": 0.438823938369751, |
| "learning_rate": 0.00026344373177842565, |
| "loss": 3.0935, |
| "step": 96300 |
| }, |
| { |
| "epoch": 28.074009324009324, |
| "grad_norm": 0.43021658062934875, |
| "learning_rate": 0.000263268804664723, |
| "loss": 3.1006, |
| "step": 96350 |
| }, |
| { |
| "epoch": 28.08857808857809, |
| "grad_norm": 0.4441169202327728, |
| "learning_rate": 0.0002630938775510204, |
| "loss": 3.101, |
| "step": 96400 |
| }, |
| { |
| "epoch": 28.103146853146853, |
| "grad_norm": 0.4555242359638214, |
| "learning_rate": 0.0002629189504373178, |
| "loss": 3.1125, |
| "step": 96450 |
| }, |
| { |
| "epoch": 28.117715617715618, |
| "grad_norm": 0.47698360681533813, |
| "learning_rate": 0.00026274402332361515, |
| "loss": 3.1132, |
| "step": 96500 |
| }, |
| { |
| "epoch": 28.132284382284382, |
| "grad_norm": 0.46771177649497986, |
| "learning_rate": 0.0002625690962099125, |
| "loss": 3.0982, |
| "step": 96550 |
| }, |
| { |
| "epoch": 28.146853146853147, |
| "grad_norm": 0.3934149146080017, |
| "learning_rate": 0.0002623941690962099, |
| "loss": 3.1162, |
| "step": 96600 |
| }, |
| { |
| "epoch": 28.16142191142191, |
| "grad_norm": 0.4455907642841339, |
| "learning_rate": 0.0002622192419825073, |
| "loss": 3.1177, |
| "step": 96650 |
| }, |
| { |
| "epoch": 28.175990675990676, |
| "grad_norm": 0.42018404603004456, |
| "learning_rate": 0.00026204431486880465, |
| "loss": 3.1113, |
| "step": 96700 |
| }, |
| { |
| "epoch": 28.19055944055944, |
| "grad_norm": 0.4048352837562561, |
| "learning_rate": 0.00026186938775510203, |
| "loss": 3.1236, |
| "step": 96750 |
| }, |
| { |
| "epoch": 28.205128205128204, |
| "grad_norm": 0.4386017620563507, |
| "learning_rate": 0.0002616944606413994, |
| "loss": 3.1282, |
| "step": 96800 |
| }, |
| { |
| "epoch": 28.21969696969697, |
| "grad_norm": 0.4359148144721985, |
| "learning_rate": 0.0002615195335276968, |
| "loss": 3.1246, |
| "step": 96850 |
| }, |
| { |
| "epoch": 28.234265734265733, |
| "grad_norm": 0.44523295760154724, |
| "learning_rate": 0.00026134460641399416, |
| "loss": 3.123, |
| "step": 96900 |
| }, |
| { |
| "epoch": 28.248834498834498, |
| "grad_norm": 0.43056222796440125, |
| "learning_rate": 0.00026116967930029153, |
| "loss": 3.1311, |
| "step": 96950 |
| }, |
| { |
| "epoch": 28.263403263403262, |
| "grad_norm": 0.42719486355781555, |
| "learning_rate": 0.0002609947521865889, |
| "loss": 3.1312, |
| "step": 97000 |
| }, |
| { |
| "epoch": 28.263403263403262, |
| "eval_accuracy": 0.37481805753532843, |
| "eval_loss": 3.547030448913574, |
| "eval_runtime": 179.1721, |
| "eval_samples_per_second": 92.883, |
| "eval_steps_per_second": 5.81, |
| "step": 97000 |
| }, |
| { |
| "epoch": 28.277972027972027, |
| "grad_norm": 0.437563955783844, |
| "learning_rate": 0.0002608198250728863, |
| "loss": 3.1513, |
| "step": 97050 |
| }, |
| { |
| "epoch": 28.29254079254079, |
| "grad_norm": 0.4412456452846527, |
| "learning_rate": 0.00026064489795918366, |
| "loss": 3.1316, |
| "step": 97100 |
| }, |
| { |
| "epoch": 28.307109557109555, |
| "grad_norm": 0.418857216835022, |
| "learning_rate": 0.00026046997084548104, |
| "loss": 3.1284, |
| "step": 97150 |
| }, |
| { |
| "epoch": 28.32167832167832, |
| "grad_norm": 0.42892611026763916, |
| "learning_rate": 0.0002602950437317784, |
| "loss": 3.1356, |
| "step": 97200 |
| }, |
| { |
| "epoch": 28.336247086247088, |
| "grad_norm": 0.42652612924575806, |
| "learning_rate": 0.0002601201166180758, |
| "loss": 3.1433, |
| "step": 97250 |
| }, |
| { |
| "epoch": 28.350815850815852, |
| "grad_norm": 0.45473426580429077, |
| "learning_rate": 0.00025994518950437316, |
| "loss": 3.146, |
| "step": 97300 |
| }, |
| { |
| "epoch": 28.365384615384617, |
| "grad_norm": 0.47524523735046387, |
| "learning_rate": 0.00025977026239067054, |
| "loss": 3.1339, |
| "step": 97350 |
| }, |
| { |
| "epoch": 28.37995337995338, |
| "grad_norm": 0.4518192708492279, |
| "learning_rate": 0.0002595953352769679, |
| "loss": 3.1333, |
| "step": 97400 |
| }, |
| { |
| "epoch": 28.394522144522146, |
| "grad_norm": 0.46246474981307983, |
| "learning_rate": 0.0002594204081632653, |
| "loss": 3.1494, |
| "step": 97450 |
| }, |
| { |
| "epoch": 28.40909090909091, |
| "grad_norm": 0.40630584955215454, |
| "learning_rate": 0.00025924548104956267, |
| "loss": 3.1528, |
| "step": 97500 |
| }, |
| { |
| "epoch": 28.423659673659674, |
| "grad_norm": 0.45706915855407715, |
| "learning_rate": 0.00025907055393586004, |
| "loss": 3.1553, |
| "step": 97550 |
| }, |
| { |
| "epoch": 28.43822843822844, |
| "grad_norm": 0.4858422875404358, |
| "learning_rate": 0.0002588956268221574, |
| "loss": 3.1473, |
| "step": 97600 |
| }, |
| { |
| "epoch": 28.452797202797203, |
| "grad_norm": 0.4400869905948639, |
| "learning_rate": 0.0002587206997084548, |
| "loss": 3.1509, |
| "step": 97650 |
| }, |
| { |
| "epoch": 28.467365967365968, |
| "grad_norm": 0.4365074336528778, |
| "learning_rate": 0.00025854577259475217, |
| "loss": 3.1461, |
| "step": 97700 |
| }, |
| { |
| "epoch": 28.481934731934732, |
| "grad_norm": 0.4489445686340332, |
| "learning_rate": 0.00025837084548104955, |
| "loss": 3.1589, |
| "step": 97750 |
| }, |
| { |
| "epoch": 28.496503496503497, |
| "grad_norm": 0.45960038900375366, |
| "learning_rate": 0.0002581959183673469, |
| "loss": 3.1521, |
| "step": 97800 |
| }, |
| { |
| "epoch": 28.51107226107226, |
| "grad_norm": 0.4620950222015381, |
| "learning_rate": 0.0002580209912536443, |
| "loss": 3.1591, |
| "step": 97850 |
| }, |
| { |
| "epoch": 28.525641025641026, |
| "grad_norm": 0.4320875108242035, |
| "learning_rate": 0.0002578460641399417, |
| "loss": 3.155, |
| "step": 97900 |
| }, |
| { |
| "epoch": 28.54020979020979, |
| "grad_norm": 0.42469337582588196, |
| "learning_rate": 0.00025767113702623905, |
| "loss": 3.1591, |
| "step": 97950 |
| }, |
| { |
| "epoch": 28.554778554778554, |
| "grad_norm": 0.44742336869239807, |
| "learning_rate": 0.0002574962099125364, |
| "loss": 3.1501, |
| "step": 98000 |
| }, |
| { |
| "epoch": 28.554778554778554, |
| "eval_accuracy": 0.3747974791340891, |
| "eval_loss": 3.5462825298309326, |
| "eval_runtime": 179.092, |
| "eval_samples_per_second": 92.924, |
| "eval_steps_per_second": 5.813, |
| "step": 98000 |
| }, |
| { |
| "epoch": 28.56934731934732, |
| "grad_norm": 0.45411232113838196, |
| "learning_rate": 0.0002573212827988338, |
| "loss": 3.1527, |
| "step": 98050 |
| }, |
| { |
| "epoch": 28.583916083916083, |
| "grad_norm": 0.43855708837509155, |
| "learning_rate": 0.0002571463556851312, |
| "loss": 3.1614, |
| "step": 98100 |
| }, |
| { |
| "epoch": 28.598484848484848, |
| "grad_norm": 0.4427168369293213, |
| "learning_rate": 0.00025697142857142855, |
| "loss": 3.1581, |
| "step": 98150 |
| }, |
| { |
| "epoch": 28.613053613053612, |
| "grad_norm": 0.4582144618034363, |
| "learning_rate": 0.00025679650145772593, |
| "loss": 3.1501, |
| "step": 98200 |
| }, |
| { |
| "epoch": 28.627622377622377, |
| "grad_norm": 0.43789151310920715, |
| "learning_rate": 0.0002566215743440233, |
| "loss": 3.1521, |
| "step": 98250 |
| }, |
| { |
| "epoch": 28.64219114219114, |
| "grad_norm": 0.4331110417842865, |
| "learning_rate": 0.0002564466472303207, |
| "loss": 3.1604, |
| "step": 98300 |
| }, |
| { |
| "epoch": 28.656759906759905, |
| "grad_norm": 0.4316922128200531, |
| "learning_rate": 0.00025627172011661806, |
| "loss": 3.1523, |
| "step": 98350 |
| }, |
| { |
| "epoch": 28.67132867132867, |
| "grad_norm": 0.4386139512062073, |
| "learning_rate": 0.00025609679300291543, |
| "loss": 3.1586, |
| "step": 98400 |
| }, |
| { |
| "epoch": 28.685897435897434, |
| "grad_norm": 0.41735491156578064, |
| "learning_rate": 0.0002559218658892128, |
| "loss": 3.1663, |
| "step": 98450 |
| }, |
| { |
| "epoch": 28.7004662004662, |
| "grad_norm": 0.4343891441822052, |
| "learning_rate": 0.0002557469387755102, |
| "loss": 3.1526, |
| "step": 98500 |
| }, |
| { |
| "epoch": 28.715034965034967, |
| "grad_norm": 0.4205450415611267, |
| "learning_rate": 0.00025557201166180756, |
| "loss": 3.1696, |
| "step": 98550 |
| }, |
| { |
| "epoch": 28.72960372960373, |
| "grad_norm": 0.416477769613266, |
| "learning_rate": 0.00025539708454810493, |
| "loss": 3.1599, |
| "step": 98600 |
| }, |
| { |
| "epoch": 28.744172494172496, |
| "grad_norm": 0.4437694549560547, |
| "learning_rate": 0.0002552221574344023, |
| "loss": 3.1663, |
| "step": 98650 |
| }, |
| { |
| "epoch": 28.75874125874126, |
| "grad_norm": 0.4323379695415497, |
| "learning_rate": 0.0002550472303206997, |
| "loss": 3.1713, |
| "step": 98700 |
| }, |
| { |
| "epoch": 28.773310023310025, |
| "grad_norm": 0.4636428952217102, |
| "learning_rate": 0.00025487230320699706, |
| "loss": 3.1653, |
| "step": 98750 |
| }, |
| { |
| "epoch": 28.78787878787879, |
| "grad_norm": 0.45252299308776855, |
| "learning_rate": 0.00025469737609329444, |
| "loss": 3.1766, |
| "step": 98800 |
| }, |
| { |
| "epoch": 28.802447552447553, |
| "grad_norm": 0.44343459606170654, |
| "learning_rate": 0.0002545224489795918, |
| "loss": 3.157, |
| "step": 98850 |
| }, |
| { |
| "epoch": 28.817016317016318, |
| "grad_norm": 0.5136388540267944, |
| "learning_rate": 0.0002543475218658892, |
| "loss": 3.1567, |
| "step": 98900 |
| }, |
| { |
| "epoch": 28.831585081585082, |
| "grad_norm": 0.42482438683509827, |
| "learning_rate": 0.00025417259475218657, |
| "loss": 3.1711, |
| "step": 98950 |
| }, |
| { |
| "epoch": 28.846153846153847, |
| "grad_norm": 0.40038368105888367, |
| "learning_rate": 0.00025399766763848394, |
| "loss": 3.1644, |
| "step": 99000 |
| }, |
| { |
| "epoch": 28.846153846153847, |
| "eval_accuracy": 0.37489166941633306, |
| "eval_loss": 3.5355660915374756, |
| "eval_runtime": 181.4571, |
| "eval_samples_per_second": 91.713, |
| "eval_steps_per_second": 5.737, |
| "step": 99000 |
| }, |
| { |
| "epoch": 28.86072261072261, |
| "grad_norm": 0.4373658001422882, |
| "learning_rate": 0.0002538227405247813, |
| "loss": 3.1621, |
| "step": 99050 |
| }, |
| { |
| "epoch": 28.875291375291376, |
| "grad_norm": 0.40323692560195923, |
| "learning_rate": 0.0002536478134110787, |
| "loss": 3.1735, |
| "step": 99100 |
| }, |
| { |
| "epoch": 28.88986013986014, |
| "grad_norm": 0.4565911293029785, |
| "learning_rate": 0.00025347288629737607, |
| "loss": 3.1726, |
| "step": 99150 |
| }, |
| { |
| "epoch": 28.904428904428904, |
| "grad_norm": 0.41919973492622375, |
| "learning_rate": 0.00025329795918367344, |
| "loss": 3.1786, |
| "step": 99200 |
| }, |
| { |
| "epoch": 28.91899766899767, |
| "grad_norm": 0.4275195002555847, |
| "learning_rate": 0.0002531230320699708, |
| "loss": 3.1762, |
| "step": 99250 |
| }, |
| { |
| "epoch": 28.933566433566433, |
| "grad_norm": 0.41187289357185364, |
| "learning_rate": 0.0002529481049562682, |
| "loss": 3.175, |
| "step": 99300 |
| }, |
| { |
| "epoch": 28.948135198135198, |
| "grad_norm": 0.4252161383628845, |
| "learning_rate": 0.00025277317784256557, |
| "loss": 3.1768, |
| "step": 99350 |
| }, |
| { |
| "epoch": 28.962703962703962, |
| "grad_norm": 0.44886091351509094, |
| "learning_rate": 0.00025259825072886295, |
| "loss": 3.1724, |
| "step": 99400 |
| }, |
| { |
| "epoch": 28.977272727272727, |
| "grad_norm": 0.4043419361114502, |
| "learning_rate": 0.0002524233236151603, |
| "loss": 3.1809, |
| "step": 99450 |
| }, |
| { |
| "epoch": 28.99184149184149, |
| "grad_norm": 0.44792184233665466, |
| "learning_rate": 0.0002522483965014577, |
| "loss": 3.1773, |
| "step": 99500 |
| }, |
| { |
| "epoch": 29.006410256410255, |
| "grad_norm": 0.4384615421295166, |
| "learning_rate": 0.0002520734693877551, |
| "loss": 3.127, |
| "step": 99550 |
| }, |
| { |
| "epoch": 29.02097902097902, |
| "grad_norm": 0.453096479177475, |
| "learning_rate": 0.00025189854227405245, |
| "loss": 3.0883, |
| "step": 99600 |
| }, |
| { |
| "epoch": 29.035547785547784, |
| "grad_norm": 0.4151376187801361, |
| "learning_rate": 0.0002517236151603498, |
| "loss": 3.0826, |
| "step": 99650 |
| }, |
| { |
| "epoch": 29.05011655011655, |
| "grad_norm": 0.44585660099983215, |
| "learning_rate": 0.0002515486880466472, |
| "loss": 3.0854, |
| "step": 99700 |
| }, |
| { |
| "epoch": 29.064685314685313, |
| "grad_norm": 0.42738720774650574, |
| "learning_rate": 0.0002513737609329446, |
| "loss": 3.0954, |
| "step": 99750 |
| }, |
| { |
| "epoch": 29.079254079254078, |
| "grad_norm": 0.42302823066711426, |
| "learning_rate": 0.00025119883381924195, |
| "loss": 3.0988, |
| "step": 99800 |
| }, |
| { |
| "epoch": 29.093822843822842, |
| "grad_norm": 0.4460339844226837, |
| "learning_rate": 0.00025102390670553933, |
| "loss": 3.1087, |
| "step": 99850 |
| }, |
| { |
| "epoch": 29.10839160839161, |
| "grad_norm": 0.4409368634223938, |
| "learning_rate": 0.0002508489795918367, |
| "loss": 3.109, |
| "step": 99900 |
| }, |
| { |
| "epoch": 29.122960372960375, |
| "grad_norm": 0.429755836725235, |
| "learning_rate": 0.0002506740524781341, |
| "loss": 3.0967, |
| "step": 99950 |
| }, |
| { |
| "epoch": 29.13752913752914, |
| "grad_norm": 0.41378381848335266, |
| "learning_rate": 0.00025049912536443146, |
| "loss": 3.1114, |
| "step": 100000 |
| }, |
| { |
| "epoch": 29.13752913752914, |
| "eval_accuracy": 0.37418765291222006, |
| "eval_loss": 3.5503756999969482, |
| "eval_runtime": 180.604, |
| "eval_samples_per_second": 92.146, |
| "eval_steps_per_second": 5.764, |
| "step": 100000 |
| }, |
| { |
| "epoch": 29.152097902097903, |
| "grad_norm": 0.4601365327835083, |
| "learning_rate": 0.00025032419825072883, |
| "loss": 3.1114, |
| "step": 100050 |
| }, |
| { |
| "epoch": 29.166666666666668, |
| "grad_norm": 0.4332452714443207, |
| "learning_rate": 0.0002501492711370262, |
| "loss": 3.1138, |
| "step": 100100 |
| }, |
| { |
| "epoch": 29.181235431235432, |
| "grad_norm": 0.44746899604797363, |
| "learning_rate": 0.0002499743440233236, |
| "loss": 3.1031, |
| "step": 100150 |
| }, |
| { |
| "epoch": 29.195804195804197, |
| "grad_norm": 0.4199642539024353, |
| "learning_rate": 0.00024979941690962096, |
| "loss": 3.1175, |
| "step": 100200 |
| }, |
| { |
| "epoch": 29.21037296037296, |
| "grad_norm": 0.4360141456127167, |
| "learning_rate": 0.00024962448979591834, |
| "loss": 3.1169, |
| "step": 100250 |
| }, |
| { |
| "epoch": 29.224941724941726, |
| "grad_norm": 0.41213443875312805, |
| "learning_rate": 0.0002494495626822157, |
| "loss": 3.1143, |
| "step": 100300 |
| }, |
| { |
| "epoch": 29.23951048951049, |
| "grad_norm": 0.46815258264541626, |
| "learning_rate": 0.0002492746355685131, |
| "loss": 3.116, |
| "step": 100350 |
| }, |
| { |
| "epoch": 29.254079254079254, |
| "grad_norm": 0.4512742757797241, |
| "learning_rate": 0.00024909970845481046, |
| "loss": 3.1304, |
| "step": 100400 |
| }, |
| { |
| "epoch": 29.26864801864802, |
| "grad_norm": 0.4723387658596039, |
| "learning_rate": 0.00024892478134110784, |
| "loss": 3.1248, |
| "step": 100450 |
| }, |
| { |
| "epoch": 29.283216783216783, |
| "grad_norm": 0.4225069284439087, |
| "learning_rate": 0.0002487498542274052, |
| "loss": 3.1198, |
| "step": 100500 |
| }, |
| { |
| "epoch": 29.297785547785548, |
| "grad_norm": 0.4604575037956238, |
| "learning_rate": 0.0002485749271137026, |
| "loss": 3.125, |
| "step": 100550 |
| }, |
| { |
| "epoch": 29.312354312354312, |
| "grad_norm": 0.4449054002761841, |
| "learning_rate": 0.00024839999999999997, |
| "loss": 3.1151, |
| "step": 100600 |
| }, |
| { |
| "epoch": 29.326923076923077, |
| "grad_norm": 0.4241639971733093, |
| "learning_rate": 0.00024822507288629734, |
| "loss": 3.1222, |
| "step": 100650 |
| }, |
| { |
| "epoch": 29.34149184149184, |
| "grad_norm": 0.4389256238937378, |
| "learning_rate": 0.0002480501457725947, |
| "loss": 3.1388, |
| "step": 100700 |
| }, |
| { |
| "epoch": 29.356060606060606, |
| "grad_norm": 0.4419405460357666, |
| "learning_rate": 0.0002478752186588921, |
| "loss": 3.1262, |
| "step": 100750 |
| }, |
| { |
| "epoch": 29.37062937062937, |
| "grad_norm": 0.4478071630001068, |
| "learning_rate": 0.00024770029154518947, |
| "loss": 3.1314, |
| "step": 100800 |
| }, |
| { |
| "epoch": 29.385198135198134, |
| "grad_norm": 0.4380553960800171, |
| "learning_rate": 0.00024752536443148685, |
| "loss": 3.1442, |
| "step": 100850 |
| }, |
| { |
| "epoch": 29.3997668997669, |
| "grad_norm": 0.45655930042266846, |
| "learning_rate": 0.0002473504373177842, |
| "loss": 3.1258, |
| "step": 100900 |
| }, |
| { |
| "epoch": 29.414335664335663, |
| "grad_norm": 0.4610624611377716, |
| "learning_rate": 0.0002471755102040816, |
| "loss": 3.1336, |
| "step": 100950 |
| }, |
| { |
| "epoch": 29.428904428904428, |
| "grad_norm": 0.4348837733268738, |
| "learning_rate": 0.000247000583090379, |
| "loss": 3.1421, |
| "step": 101000 |
| }, |
| { |
| "epoch": 29.428904428904428, |
| "eval_accuracy": 0.37460063202737703, |
| "eval_loss": 3.5466175079345703, |
| "eval_runtime": 180.3672, |
| "eval_samples_per_second": 92.267, |
| "eval_steps_per_second": 5.772, |
| "step": 101000 |
| }, |
| { |
| "epoch": 29.443473193473192, |
| "grad_norm": 0.42182600498199463, |
| "learning_rate": 0.00024682565597667635, |
| "loss": 3.1319, |
| "step": 101050 |
| }, |
| { |
| "epoch": 29.458041958041957, |
| "grad_norm": 0.4220890402793884, |
| "learning_rate": 0.0002466507288629737, |
| "loss": 3.1315, |
| "step": 101100 |
| }, |
| { |
| "epoch": 29.47261072261072, |
| "grad_norm": 0.45002591609954834, |
| "learning_rate": 0.0002464758017492711, |
| "loss": 3.1372, |
| "step": 101150 |
| }, |
| { |
| "epoch": 29.487179487179485, |
| "grad_norm": 0.44506534934043884, |
| "learning_rate": 0.0002463008746355685, |
| "loss": 3.1375, |
| "step": 101200 |
| }, |
| { |
| "epoch": 29.501748251748253, |
| "grad_norm": 0.44931530952453613, |
| "learning_rate": 0.00024612594752186585, |
| "loss": 3.1525, |
| "step": 101250 |
| }, |
| { |
| "epoch": 29.516317016317018, |
| "grad_norm": 0.4678424298763275, |
| "learning_rate": 0.00024595102040816323, |
| "loss": 3.1535, |
| "step": 101300 |
| }, |
| { |
| "epoch": 29.530885780885782, |
| "grad_norm": 0.4373933672904968, |
| "learning_rate": 0.0002457760932944606, |
| "loss": 3.1546, |
| "step": 101350 |
| }, |
| { |
| "epoch": 29.545454545454547, |
| "grad_norm": 0.4528273940086365, |
| "learning_rate": 0.000245601166180758, |
| "loss": 3.1348, |
| "step": 101400 |
| }, |
| { |
| "epoch": 29.56002331002331, |
| "grad_norm": 0.44172629714012146, |
| "learning_rate": 0.0002454262390670554, |
| "loss": 3.1459, |
| "step": 101450 |
| }, |
| { |
| "epoch": 29.574592074592076, |
| "grad_norm": 0.43117034435272217, |
| "learning_rate": 0.00024525131195335273, |
| "loss": 3.1472, |
| "step": 101500 |
| }, |
| { |
| "epoch": 29.58916083916084, |
| "grad_norm": 0.42502203583717346, |
| "learning_rate": 0.0002450763848396501, |
| "loss": 3.1463, |
| "step": 101550 |
| }, |
| { |
| "epoch": 29.603729603729604, |
| "grad_norm": 0.4515135586261749, |
| "learning_rate": 0.0002449014577259475, |
| "loss": 3.1579, |
| "step": 101600 |
| }, |
| { |
| "epoch": 29.61829836829837, |
| "grad_norm": 0.47481200098991394, |
| "learning_rate": 0.0002447265306122449, |
| "loss": 3.1538, |
| "step": 101650 |
| }, |
| { |
| "epoch": 29.632867132867133, |
| "grad_norm": 0.43279948830604553, |
| "learning_rate": 0.00024455160349854223, |
| "loss": 3.1372, |
| "step": 101700 |
| }, |
| { |
| "epoch": 29.647435897435898, |
| "grad_norm": 0.4356144964694977, |
| "learning_rate": 0.0002443766763848396, |
| "loss": 3.1568, |
| "step": 101750 |
| }, |
| { |
| "epoch": 29.662004662004662, |
| "grad_norm": 0.43911242485046387, |
| "learning_rate": 0.000244201749271137, |
| "loss": 3.1467, |
| "step": 101800 |
| }, |
| { |
| "epoch": 29.676573426573427, |
| "grad_norm": 0.420939564704895, |
| "learning_rate": 0.0002440268221574344, |
| "loss": 3.1472, |
| "step": 101850 |
| }, |
| { |
| "epoch": 29.69114219114219, |
| "grad_norm": 0.433510959148407, |
| "learning_rate": 0.00024385189504373176, |
| "loss": 3.1612, |
| "step": 101900 |
| }, |
| { |
| "epoch": 29.705710955710956, |
| "grad_norm": 0.4388565719127655, |
| "learning_rate": 0.0002436769679300291, |
| "loss": 3.1629, |
| "step": 101950 |
| }, |
| { |
| "epoch": 29.72027972027972, |
| "grad_norm": 0.43283745646476746, |
| "learning_rate": 0.00024350204081632652, |
| "loss": 3.1544, |
| "step": 102000 |
| }, |
| { |
| "epoch": 29.72027972027972, |
| "eval_accuracy": 0.37507475839193083, |
| "eval_loss": 3.542076826095581, |
| "eval_runtime": 180.2077, |
| "eval_samples_per_second": 92.349, |
| "eval_steps_per_second": 5.777, |
| "step": 102000 |
| }, |
| { |
| "epoch": 29.734848484848484, |
| "grad_norm": 0.43232461810112, |
| "learning_rate": 0.0002433271137026239, |
| "loss": 3.1606, |
| "step": 102050 |
| }, |
| { |
| "epoch": 29.74941724941725, |
| "grad_norm": 0.4560996890068054, |
| "learning_rate": 0.00024315218658892127, |
| "loss": 3.1453, |
| "step": 102100 |
| }, |
| { |
| "epoch": 29.763986013986013, |
| "grad_norm": 0.4297953248023987, |
| "learning_rate": 0.00024297725947521862, |
| "loss": 3.1618, |
| "step": 102150 |
| }, |
| { |
| "epoch": 29.778554778554778, |
| "grad_norm": 0.4410262405872345, |
| "learning_rate": 0.00024280233236151602, |
| "loss": 3.1584, |
| "step": 102200 |
| }, |
| { |
| "epoch": 29.793123543123542, |
| "grad_norm": 0.45969489216804504, |
| "learning_rate": 0.0002426274052478134, |
| "loss": 3.1546, |
| "step": 102250 |
| }, |
| { |
| "epoch": 29.807692307692307, |
| "grad_norm": 0.432098388671875, |
| "learning_rate": 0.00024245247813411077, |
| "loss": 3.159, |
| "step": 102300 |
| }, |
| { |
| "epoch": 29.82226107226107, |
| "grad_norm": 0.43258893489837646, |
| "learning_rate": 0.00024227755102040815, |
| "loss": 3.1638, |
| "step": 102350 |
| }, |
| { |
| "epoch": 29.836829836829835, |
| "grad_norm": 0.42827901244163513, |
| "learning_rate": 0.0002421026239067055, |
| "loss": 3.1682, |
| "step": 102400 |
| }, |
| { |
| "epoch": 29.8513986013986, |
| "grad_norm": 0.42459243535995483, |
| "learning_rate": 0.0002419276967930029, |
| "loss": 3.168, |
| "step": 102450 |
| }, |
| { |
| "epoch": 29.865967365967364, |
| "grad_norm": 0.4142208993434906, |
| "learning_rate": 0.00024175276967930027, |
| "loss": 3.1613, |
| "step": 102500 |
| }, |
| { |
| "epoch": 29.88053613053613, |
| "grad_norm": 0.4322669804096222, |
| "learning_rate": 0.00024157784256559765, |
| "loss": 3.1545, |
| "step": 102550 |
| }, |
| { |
| "epoch": 29.895104895104897, |
| "grad_norm": 0.47667908668518066, |
| "learning_rate": 0.000241402915451895, |
| "loss": 3.1543, |
| "step": 102600 |
| }, |
| { |
| "epoch": 29.90967365967366, |
| "grad_norm": 0.41840967535972595, |
| "learning_rate": 0.0002412279883381924, |
| "loss": 3.16, |
| "step": 102650 |
| }, |
| { |
| "epoch": 29.924242424242426, |
| "grad_norm": 0.41584140062332153, |
| "learning_rate": 0.00024105306122448978, |
| "loss": 3.1566, |
| "step": 102700 |
| }, |
| { |
| "epoch": 29.93881118881119, |
| "grad_norm": 0.43997734785079956, |
| "learning_rate": 0.00024087813411078715, |
| "loss": 3.1648, |
| "step": 102750 |
| }, |
| { |
| "epoch": 29.953379953379955, |
| "grad_norm": 0.44305679202079773, |
| "learning_rate": 0.00024070320699708453, |
| "loss": 3.174, |
| "step": 102800 |
| }, |
| { |
| "epoch": 29.96794871794872, |
| "grad_norm": 0.4263700246810913, |
| "learning_rate": 0.0002405282798833819, |
| "loss": 3.1631, |
| "step": 102850 |
| }, |
| { |
| "epoch": 29.982517482517483, |
| "grad_norm": 0.4472276270389557, |
| "learning_rate": 0.00024035335276967928, |
| "loss": 3.1648, |
| "step": 102900 |
| }, |
| { |
| "epoch": 29.997086247086248, |
| "grad_norm": 0.44121333956718445, |
| "learning_rate": 0.00024017842565597666, |
| "loss": 3.1706, |
| "step": 102950 |
| }, |
| { |
| "epoch": 30.011655011655012, |
| "grad_norm": 0.44513824582099915, |
| "learning_rate": 0.00024000349854227403, |
| "loss": 3.1002, |
| "step": 103000 |
| }, |
| { |
| "epoch": 30.011655011655012, |
| "eval_accuracy": 0.3748071215849555, |
| "eval_loss": 3.5492284297943115, |
| "eval_runtime": 180.2511, |
| "eval_samples_per_second": 92.327, |
| "eval_steps_per_second": 5.775, |
| "step": 103000 |
| }, |
| { |
| "epoch": 30.026223776223777, |
| "grad_norm": 0.45389237999916077, |
| "learning_rate": 0.00023982857142857138, |
| "loss": 3.0733, |
| "step": 103050 |
| }, |
| { |
| "epoch": 30.04079254079254, |
| "grad_norm": 0.45321691036224365, |
| "learning_rate": 0.00023965364431486878, |
| "loss": 3.0796, |
| "step": 103100 |
| }, |
| { |
| "epoch": 30.055361305361306, |
| "grad_norm": 0.43791627883911133, |
| "learning_rate": 0.00023947871720116616, |
| "loss": 3.0821, |
| "step": 103150 |
| }, |
| { |
| "epoch": 30.06993006993007, |
| "grad_norm": 0.43642351031303406, |
| "learning_rate": 0.00023930379008746353, |
| "loss": 3.0828, |
| "step": 103200 |
| }, |
| { |
| "epoch": 30.084498834498834, |
| "grad_norm": 0.45049965381622314, |
| "learning_rate": 0.00023912886297376094, |
| "loss": 3.0869, |
| "step": 103250 |
| }, |
| { |
| "epoch": 30.0990675990676, |
| "grad_norm": 0.42755648493766785, |
| "learning_rate": 0.00023895393586005829, |
| "loss": 3.0856, |
| "step": 103300 |
| }, |
| { |
| "epoch": 30.113636363636363, |
| "grad_norm": 0.43771249055862427, |
| "learning_rate": 0.00023877900874635566, |
| "loss": 3.0936, |
| "step": 103350 |
| }, |
| { |
| "epoch": 30.128205128205128, |
| "grad_norm": 0.44602829217910767, |
| "learning_rate": 0.00023860408163265304, |
| "loss": 3.1002, |
| "step": 103400 |
| }, |
| { |
| "epoch": 30.142773892773892, |
| "grad_norm": 0.46566757559776306, |
| "learning_rate": 0.00023842915451895041, |
| "loss": 3.099, |
| "step": 103450 |
| }, |
| { |
| "epoch": 30.157342657342657, |
| "grad_norm": 0.42642471194267273, |
| "learning_rate": 0.0002382542274052478, |
| "loss": 3.1071, |
| "step": 103500 |
| }, |
| { |
| "epoch": 30.17191142191142, |
| "grad_norm": 0.43897151947021484, |
| "learning_rate": 0.00023807930029154517, |
| "loss": 3.1069, |
| "step": 103550 |
| }, |
| { |
| "epoch": 30.186480186480185, |
| "grad_norm": 0.4233187139034271, |
| "learning_rate": 0.00023790437317784254, |
| "loss": 3.1073, |
| "step": 103600 |
| }, |
| { |
| "epoch": 30.20104895104895, |
| "grad_norm": 0.44739827513694763, |
| "learning_rate": 0.00023772944606413992, |
| "loss": 3.1098, |
| "step": 103650 |
| }, |
| { |
| "epoch": 30.215617715617714, |
| "grad_norm": 0.42843517661094666, |
| "learning_rate": 0.00023755451895043732, |
| "loss": 3.1015, |
| "step": 103700 |
| }, |
| { |
| "epoch": 30.23018648018648, |
| "grad_norm": 0.44605836272239685, |
| "learning_rate": 0.00023737959183673467, |
| "loss": 3.1081, |
| "step": 103750 |
| }, |
| { |
| "epoch": 30.244755244755243, |
| "grad_norm": 0.42383548617362976, |
| "learning_rate": 0.00023720466472303204, |
| "loss": 3.1034, |
| "step": 103800 |
| }, |
| { |
| "epoch": 30.259324009324008, |
| "grad_norm": 0.4467232823371887, |
| "learning_rate": 0.00023702973760932942, |
| "loss": 3.1198, |
| "step": 103850 |
| }, |
| { |
| "epoch": 30.273892773892776, |
| "grad_norm": 0.44068634510040283, |
| "learning_rate": 0.00023685481049562682, |
| "loss": 3.1312, |
| "step": 103900 |
| }, |
| { |
| "epoch": 30.28846153846154, |
| "grad_norm": 0.43057772517204285, |
| "learning_rate": 0.00023667988338192417, |
| "loss": 3.124, |
| "step": 103950 |
| }, |
| { |
| "epoch": 30.303030303030305, |
| "grad_norm": 0.461153507232666, |
| "learning_rate": 0.00023650495626822155, |
| "loss": 3.1165, |
| "step": 104000 |
| }, |
| { |
| "epoch": 30.303030303030305, |
| "eval_accuracy": 0.3746876492669033, |
| "eval_loss": 3.5488526821136475, |
| "eval_runtime": 180.1429, |
| "eval_samples_per_second": 92.382, |
| "eval_steps_per_second": 5.779, |
| "step": 104000 |
| }, |
| { |
| "epoch": 30.31759906759907, |
| "grad_norm": 0.44713693857192993, |
| "learning_rate": 0.00023633002915451892, |
| "loss": 3.1326, |
| "step": 104050 |
| }, |
| { |
| "epoch": 30.332167832167833, |
| "grad_norm": 0.44030696153640747, |
| "learning_rate": 0.00023615510204081633, |
| "loss": 3.121, |
| "step": 104100 |
| }, |
| { |
| "epoch": 30.346736596736598, |
| "grad_norm": 0.43711525201797485, |
| "learning_rate": 0.0002359801749271137, |
| "loss": 3.1182, |
| "step": 104150 |
| }, |
| { |
| "epoch": 30.361305361305362, |
| "grad_norm": 0.4294222295284271, |
| "learning_rate": 0.00023580524781341105, |
| "loss": 3.1187, |
| "step": 104200 |
| }, |
| { |
| "epoch": 30.375874125874127, |
| "grad_norm": 0.43076273798942566, |
| "learning_rate": 0.00023563032069970843, |
| "loss": 3.12, |
| "step": 104250 |
| }, |
| { |
| "epoch": 30.39044289044289, |
| "grad_norm": 0.4725896120071411, |
| "learning_rate": 0.0002354553935860058, |
| "loss": 3.1244, |
| "step": 104300 |
| }, |
| { |
| "epoch": 30.405011655011656, |
| "grad_norm": 0.45083412528038025, |
| "learning_rate": 0.0002352804664723032, |
| "loss": 3.1204, |
| "step": 104350 |
| }, |
| { |
| "epoch": 30.41958041958042, |
| "grad_norm": 0.4314779043197632, |
| "learning_rate": 0.00023510553935860055, |
| "loss": 3.1339, |
| "step": 104400 |
| }, |
| { |
| "epoch": 30.434149184149184, |
| "grad_norm": 0.4679701626300812, |
| "learning_rate": 0.00023493061224489793, |
| "loss": 3.1318, |
| "step": 104450 |
| }, |
| { |
| "epoch": 30.44871794871795, |
| "grad_norm": 0.47724515199661255, |
| "learning_rate": 0.0002347556851311953, |
| "loss": 3.1144, |
| "step": 104500 |
| }, |
| { |
| "epoch": 30.463286713286713, |
| "grad_norm": 0.46651652455329895, |
| "learning_rate": 0.0002345807580174927, |
| "loss": 3.1216, |
| "step": 104550 |
| }, |
| { |
| "epoch": 30.477855477855478, |
| "grad_norm": 0.3993292450904846, |
| "learning_rate": 0.00023440583090379008, |
| "loss": 3.1345, |
| "step": 104600 |
| }, |
| { |
| "epoch": 30.492424242424242, |
| "grad_norm": 0.4892179071903229, |
| "learning_rate": 0.00023423090379008743, |
| "loss": 3.1346, |
| "step": 104650 |
| }, |
| { |
| "epoch": 30.506993006993007, |
| "grad_norm": 0.4761413633823395, |
| "learning_rate": 0.0002340559766763848, |
| "loss": 3.1327, |
| "step": 104700 |
| }, |
| { |
| "epoch": 30.52156177156177, |
| "grad_norm": 0.4577150344848633, |
| "learning_rate": 0.0002338810495626822, |
| "loss": 3.1338, |
| "step": 104750 |
| }, |
| { |
| "epoch": 30.536130536130536, |
| "grad_norm": 0.4509866535663605, |
| "learning_rate": 0.0002337061224489796, |
| "loss": 3.134, |
| "step": 104800 |
| }, |
| { |
| "epoch": 30.5506993006993, |
| "grad_norm": 0.4465436339378357, |
| "learning_rate": 0.00023353119533527694, |
| "loss": 3.1374, |
| "step": 104850 |
| }, |
| { |
| "epoch": 30.565268065268064, |
| "grad_norm": 0.4383067786693573, |
| "learning_rate": 0.0002333562682215743, |
| "loss": 3.1411, |
| "step": 104900 |
| }, |
| { |
| "epoch": 30.57983682983683, |
| "grad_norm": 0.4641270041465759, |
| "learning_rate": 0.0002331813411078717, |
| "loss": 3.1356, |
| "step": 104950 |
| }, |
| { |
| "epoch": 30.594405594405593, |
| "grad_norm": 0.47275739908218384, |
| "learning_rate": 0.0002330064139941691, |
| "loss": 3.1392, |
| "step": 105000 |
| }, |
| { |
| "epoch": 30.594405594405593, |
| "eval_accuracy": 0.37494376216918457, |
| "eval_loss": 3.54292893409729, |
| "eval_runtime": 179.8448, |
| "eval_samples_per_second": 92.535, |
| "eval_steps_per_second": 5.788, |
| "step": 105000 |
| }, |
| { |
| "epoch": 30.608974358974358, |
| "grad_norm": 0.4609619081020355, |
| "learning_rate": 0.00023283148688046647, |
| "loss": 3.1408, |
| "step": 105050 |
| }, |
| { |
| "epoch": 30.623543123543122, |
| "grad_norm": 0.4698541462421417, |
| "learning_rate": 0.00023265655976676381, |
| "loss": 3.1478, |
| "step": 105100 |
| }, |
| { |
| "epoch": 30.638111888111887, |
| "grad_norm": 0.42247274518013, |
| "learning_rate": 0.0002324816326530612, |
| "loss": 3.1404, |
| "step": 105150 |
| }, |
| { |
| "epoch": 30.65268065268065, |
| "grad_norm": 0.4531615078449249, |
| "learning_rate": 0.0002323067055393586, |
| "loss": 3.1442, |
| "step": 105200 |
| }, |
| { |
| "epoch": 30.667249417249415, |
| "grad_norm": 0.44655829668045044, |
| "learning_rate": 0.00023213177842565597, |
| "loss": 3.1468, |
| "step": 105250 |
| }, |
| { |
| "epoch": 30.681818181818183, |
| "grad_norm": 0.45186808705329895, |
| "learning_rate": 0.00023195685131195332, |
| "loss": 3.1486, |
| "step": 105300 |
| }, |
| { |
| "epoch": 30.696386946386948, |
| "grad_norm": 0.43957528471946716, |
| "learning_rate": 0.0002317819241982507, |
| "loss": 3.1417, |
| "step": 105350 |
| }, |
| { |
| "epoch": 30.710955710955712, |
| "grad_norm": 0.46366026997566223, |
| "learning_rate": 0.0002316069970845481, |
| "loss": 3.1572, |
| "step": 105400 |
| }, |
| { |
| "epoch": 30.725524475524477, |
| "grad_norm": 0.4653265178203583, |
| "learning_rate": 0.00023143206997084547, |
| "loss": 3.1405, |
| "step": 105450 |
| }, |
| { |
| "epoch": 30.74009324009324, |
| "grad_norm": 0.46687573194503784, |
| "learning_rate": 0.00023125714285714285, |
| "loss": 3.1464, |
| "step": 105500 |
| }, |
| { |
| "epoch": 30.754662004662006, |
| "grad_norm": 0.4489416480064392, |
| "learning_rate": 0.0002310822157434402, |
| "loss": 3.15, |
| "step": 105550 |
| }, |
| { |
| "epoch": 30.76923076923077, |
| "grad_norm": 0.45375025272369385, |
| "learning_rate": 0.00023090728862973757, |
| "loss": 3.1586, |
| "step": 105600 |
| }, |
| { |
| "epoch": 30.783799533799534, |
| "grad_norm": 0.44260141253471375, |
| "learning_rate": 0.00023073236151603498, |
| "loss": 3.143, |
| "step": 105650 |
| }, |
| { |
| "epoch": 30.7983682983683, |
| "grad_norm": 0.42545315623283386, |
| "learning_rate": 0.00023055743440233235, |
| "loss": 3.1566, |
| "step": 105700 |
| }, |
| { |
| "epoch": 30.812937062937063, |
| "grad_norm": 0.4167237877845764, |
| "learning_rate": 0.0002303825072886297, |
| "loss": 3.1505, |
| "step": 105750 |
| }, |
| { |
| "epoch": 30.827505827505828, |
| "grad_norm": 0.414233922958374, |
| "learning_rate": 0.00023020758017492708, |
| "loss": 3.1588, |
| "step": 105800 |
| }, |
| { |
| "epoch": 30.842074592074592, |
| "grad_norm": 0.4713982939720154, |
| "learning_rate": 0.00023003265306122448, |
| "loss": 3.1509, |
| "step": 105850 |
| }, |
| { |
| "epoch": 30.856643356643357, |
| "grad_norm": 0.4120693504810333, |
| "learning_rate": 0.00022985772594752185, |
| "loss": 3.1542, |
| "step": 105900 |
| }, |
| { |
| "epoch": 30.87121212121212, |
| "grad_norm": 0.4330739974975586, |
| "learning_rate": 0.00022968279883381923, |
| "loss": 3.1513, |
| "step": 105950 |
| }, |
| { |
| "epoch": 30.885780885780886, |
| "grad_norm": 0.48839515447616577, |
| "learning_rate": 0.00022950787172011658, |
| "loss": 3.1409, |
| "step": 106000 |
| }, |
| { |
| "epoch": 30.885780885780886, |
| "eval_accuracy": 0.3753570940569342, |
| "eval_loss": 3.541912794113159, |
| "eval_runtime": 179.7698, |
| "eval_samples_per_second": 92.574, |
| "eval_steps_per_second": 5.791, |
| "step": 106000 |
| }, |
| { |
| "epoch": 30.90034965034965, |
| "grad_norm": 0.4531843662261963, |
| "learning_rate": 0.00022933294460641398, |
| "loss": 3.1547, |
| "step": 106050 |
| }, |
| { |
| "epoch": 30.914918414918414, |
| "grad_norm": 0.4466068744659424, |
| "learning_rate": 0.00022915801749271136, |
| "loss": 3.1528, |
| "step": 106100 |
| }, |
| { |
| "epoch": 30.92948717948718, |
| "grad_norm": 0.4656772017478943, |
| "learning_rate": 0.00022898309037900873, |
| "loss": 3.1565, |
| "step": 106150 |
| }, |
| { |
| "epoch": 30.944055944055943, |
| "grad_norm": 0.43916213512420654, |
| "learning_rate": 0.00022880816326530608, |
| "loss": 3.1612, |
| "step": 106200 |
| }, |
| { |
| "epoch": 30.958624708624708, |
| "grad_norm": 0.4404742419719696, |
| "learning_rate": 0.00022863323615160349, |
| "loss": 3.1599, |
| "step": 106250 |
| }, |
| { |
| "epoch": 30.973193473193472, |
| "grad_norm": 0.46946898102760315, |
| "learning_rate": 0.00022845830903790086, |
| "loss": 3.1581, |
| "step": 106300 |
| }, |
| { |
| "epoch": 30.987762237762237, |
| "grad_norm": 0.46753212809562683, |
| "learning_rate": 0.00022828338192419824, |
| "loss": 3.1671, |
| "step": 106350 |
| }, |
| { |
| "epoch": 31.002331002331, |
| "grad_norm": 0.4661099910736084, |
| "learning_rate": 0.0002281084548104956, |
| "loss": 3.1437, |
| "step": 106400 |
| }, |
| { |
| "epoch": 31.016899766899765, |
| "grad_norm": 0.49687930941581726, |
| "learning_rate": 0.00022793352769679296, |
| "loss": 3.0715, |
| "step": 106450 |
| }, |
| { |
| "epoch": 31.03146853146853, |
| "grad_norm": 0.44716677069664, |
| "learning_rate": 0.00022775860058309036, |
| "loss": 3.0721, |
| "step": 106500 |
| }, |
| { |
| "epoch": 31.046037296037294, |
| "grad_norm": 0.4831984341144562, |
| "learning_rate": 0.00022758367346938774, |
| "loss": 3.0636, |
| "step": 106550 |
| }, |
| { |
| "epoch": 31.060606060606062, |
| "grad_norm": 0.42915815114974976, |
| "learning_rate": 0.00022740874635568512, |
| "loss": 3.0796, |
| "step": 106600 |
| }, |
| { |
| "epoch": 31.075174825174827, |
| "grad_norm": 0.46241822838783264, |
| "learning_rate": 0.00022723381924198246, |
| "loss": 3.0771, |
| "step": 106650 |
| }, |
| { |
| "epoch": 31.08974358974359, |
| "grad_norm": 0.447934627532959, |
| "learning_rate": 0.00022705889212827987, |
| "loss": 3.0837, |
| "step": 106700 |
| }, |
| { |
| "epoch": 31.104312354312356, |
| "grad_norm": 0.44860297441482544, |
| "learning_rate": 0.00022688396501457724, |
| "loss": 3.0931, |
| "step": 106750 |
| }, |
| { |
| "epoch": 31.11888111888112, |
| "grad_norm": 0.41188955307006836, |
| "learning_rate": 0.00022670903790087462, |
| "loss": 3.0743, |
| "step": 106800 |
| }, |
| { |
| "epoch": 31.133449883449885, |
| "grad_norm": 0.4889490604400635, |
| "learning_rate": 0.000226534110787172, |
| "loss": 3.0849, |
| "step": 106850 |
| }, |
| { |
| "epoch": 31.14801864801865, |
| "grad_norm": 0.4424610435962677, |
| "learning_rate": 0.00022635918367346937, |
| "loss": 3.0869, |
| "step": 106900 |
| }, |
| { |
| "epoch": 31.162587412587413, |
| "grad_norm": 0.4582122564315796, |
| "learning_rate": 0.00022618425655976675, |
| "loss": 3.1015, |
| "step": 106950 |
| }, |
| { |
| "epoch": 31.177156177156178, |
| "grad_norm": 0.44879457354545593, |
| "learning_rate": 0.00022600932944606412, |
| "loss": 3.101, |
| "step": 107000 |
| }, |
| { |
| "epoch": 31.177156177156178, |
| "eval_accuracy": 0.37432429349644913, |
| "eval_loss": 3.5554325580596924, |
| "eval_runtime": 179.6724, |
| "eval_samples_per_second": 92.624, |
| "eval_steps_per_second": 5.794, |
| "step": 107000 |
| }, |
| { |
| "epoch": 31.191724941724942, |
| "grad_norm": 0.4423729181289673, |
| "learning_rate": 0.0002258344023323615, |
| "loss": 3.1023, |
| "step": 107050 |
| }, |
| { |
| "epoch": 31.206293706293707, |
| "grad_norm": 0.4656815528869629, |
| "learning_rate": 0.00022565947521865885, |
| "loss": 3.0914, |
| "step": 107100 |
| }, |
| { |
| "epoch": 31.22086247086247, |
| "grad_norm": 0.4330950975418091, |
| "learning_rate": 0.00022548454810495625, |
| "loss": 3.105, |
| "step": 107150 |
| }, |
| { |
| "epoch": 31.235431235431236, |
| "grad_norm": 0.48956480622291565, |
| "learning_rate": 0.00022530962099125363, |
| "loss": 3.1062, |
| "step": 107200 |
| }, |
| { |
| "epoch": 31.25, |
| "grad_norm": 0.4430369436740875, |
| "learning_rate": 0.000225134693877551, |
| "loss": 3.0958, |
| "step": 107250 |
| }, |
| { |
| "epoch": 31.264568764568764, |
| "grad_norm": 0.43993812799453735, |
| "learning_rate": 0.0002249597667638484, |
| "loss": 3.1047, |
| "step": 107300 |
| }, |
| { |
| "epoch": 31.27913752913753, |
| "grad_norm": 0.4541182518005371, |
| "learning_rate": 0.00022478483965014575, |
| "loss": 3.1035, |
| "step": 107350 |
| }, |
| { |
| "epoch": 31.293706293706293, |
| "grad_norm": 0.4626857340335846, |
| "learning_rate": 0.00022460991253644313, |
| "loss": 3.1051, |
| "step": 107400 |
| }, |
| { |
| "epoch": 31.308275058275058, |
| "grad_norm": 0.46208930015563965, |
| "learning_rate": 0.0002244349854227405, |
| "loss": 3.1121, |
| "step": 107450 |
| }, |
| { |
| "epoch": 31.322843822843822, |
| "grad_norm": 0.4782877266407013, |
| "learning_rate": 0.00022426005830903788, |
| "loss": 3.1049, |
| "step": 107500 |
| }, |
| { |
| "epoch": 31.337412587412587, |
| "grad_norm": 0.498509019613266, |
| "learning_rate": 0.00022408513119533526, |
| "loss": 3.1114, |
| "step": 107550 |
| }, |
| { |
| "epoch": 31.35198135198135, |
| "grad_norm": 0.44582095742225647, |
| "learning_rate": 0.00022391020408163263, |
| "loss": 3.1155, |
| "step": 107600 |
| }, |
| { |
| "epoch": 31.366550116550115, |
| "grad_norm": 0.4745688736438751, |
| "learning_rate": 0.00022373527696793, |
| "loss": 3.1096, |
| "step": 107650 |
| }, |
| { |
| "epoch": 31.38111888111888, |
| "grad_norm": 0.4498319625854492, |
| "learning_rate": 0.00022356034985422738, |
| "loss": 3.1138, |
| "step": 107700 |
| }, |
| { |
| "epoch": 31.395687645687644, |
| "grad_norm": 0.4679555594921112, |
| "learning_rate": 0.00022338542274052479, |
| "loss": 3.1141, |
| "step": 107750 |
| }, |
| { |
| "epoch": 31.41025641025641, |
| "grad_norm": 0.44934287667274475, |
| "learning_rate": 0.00022321049562682213, |
| "loss": 3.1055, |
| "step": 107800 |
| }, |
| { |
| "epoch": 31.424825174825173, |
| "grad_norm": 0.4695224165916443, |
| "learning_rate": 0.0002230355685131195, |
| "loss": 3.1201, |
| "step": 107850 |
| }, |
| { |
| "epoch": 31.439393939393938, |
| "grad_norm": 0.46642690896987915, |
| "learning_rate": 0.00022286064139941689, |
| "loss": 3.1202, |
| "step": 107900 |
| }, |
| { |
| "epoch": 31.453962703962706, |
| "grad_norm": 0.4324790835380554, |
| "learning_rate": 0.0002226857142857143, |
| "loss": 3.131, |
| "step": 107950 |
| }, |
| { |
| "epoch": 31.46853146853147, |
| "grad_norm": 0.4190344214439392, |
| "learning_rate": 0.00022251078717201164, |
| "loss": 3.1215, |
| "step": 108000 |
| }, |
| { |
| "epoch": 31.46853146853147, |
| "eval_accuracy": 0.37509957006428224, |
| "eval_loss": 3.5474321842193604, |
| "eval_runtime": 179.0221, |
| "eval_samples_per_second": 92.961, |
| "eval_steps_per_second": 5.815, |
| "step": 108000 |
| }, |
| { |
| "epoch": 31.483100233100235, |
| "grad_norm": 0.45015689730644226, |
| "learning_rate": 0.00022233586005830901, |
| "loss": 3.1334, |
| "step": 108050 |
| }, |
| { |
| "epoch": 31.497668997669, |
| "grad_norm": 0.45660534501075745, |
| "learning_rate": 0.0002221609329446064, |
| "loss": 3.1244, |
| "step": 108100 |
| }, |
| { |
| "epoch": 31.512237762237763, |
| "grad_norm": 0.4564882218837738, |
| "learning_rate": 0.0002219860058309038, |
| "loss": 3.1267, |
| "step": 108150 |
| }, |
| { |
| "epoch": 31.526806526806528, |
| "grad_norm": 0.4678419828414917, |
| "learning_rate": 0.00022181107871720117, |
| "loss": 3.1248, |
| "step": 108200 |
| }, |
| { |
| "epoch": 31.541375291375292, |
| "grad_norm": 0.4461950957775116, |
| "learning_rate": 0.00022163615160349852, |
| "loss": 3.138, |
| "step": 108250 |
| }, |
| { |
| "epoch": 31.555944055944057, |
| "grad_norm": 0.4868103563785553, |
| "learning_rate": 0.0002214612244897959, |
| "loss": 3.1406, |
| "step": 108300 |
| }, |
| { |
| "epoch": 31.57051282051282, |
| "grad_norm": 0.4507032036781311, |
| "learning_rate": 0.00022128629737609327, |
| "loss": 3.1398, |
| "step": 108350 |
| }, |
| { |
| "epoch": 31.585081585081586, |
| "grad_norm": 0.44092482328414917, |
| "learning_rate": 0.00022111137026239067, |
| "loss": 3.1261, |
| "step": 108400 |
| }, |
| { |
| "epoch": 31.59965034965035, |
| "grad_norm": 0.43679946660995483, |
| "learning_rate": 0.00022093644314868802, |
| "loss": 3.1314, |
| "step": 108450 |
| }, |
| { |
| "epoch": 31.614219114219114, |
| "grad_norm": 0.4783121645450592, |
| "learning_rate": 0.0002207615160349854, |
| "loss": 3.1387, |
| "step": 108500 |
| }, |
| { |
| "epoch": 31.62878787878788, |
| "grad_norm": 0.4393003284931183, |
| "learning_rate": 0.00022058658892128277, |
| "loss": 3.1268, |
| "step": 108550 |
| }, |
| { |
| "epoch": 31.643356643356643, |
| "grad_norm": 0.45997878909111023, |
| "learning_rate": 0.00022041166180758017, |
| "loss": 3.1362, |
| "step": 108600 |
| }, |
| { |
| "epoch": 31.657925407925408, |
| "grad_norm": 0.4729020297527313, |
| "learning_rate": 0.00022023673469387755, |
| "loss": 3.1391, |
| "step": 108650 |
| }, |
| { |
| "epoch": 31.672494172494172, |
| "grad_norm": 0.43747490644454956, |
| "learning_rate": 0.0002200618075801749, |
| "loss": 3.1279, |
| "step": 108700 |
| }, |
| { |
| "epoch": 31.687062937062937, |
| "grad_norm": 0.45657721161842346, |
| "learning_rate": 0.00021988688046647227, |
| "loss": 3.1444, |
| "step": 108750 |
| }, |
| { |
| "epoch": 31.7016317016317, |
| "grad_norm": 0.43069857358932495, |
| "learning_rate": 0.00021971195335276968, |
| "loss": 3.1376, |
| "step": 108800 |
| }, |
| { |
| "epoch": 31.716200466200466, |
| "grad_norm": 0.4645999073982239, |
| "learning_rate": 0.00021953702623906705, |
| "loss": 3.1327, |
| "step": 108850 |
| }, |
| { |
| "epoch": 31.73076923076923, |
| "grad_norm": 0.44925370812416077, |
| "learning_rate": 0.0002193620991253644, |
| "loss": 3.1401, |
| "step": 108900 |
| }, |
| { |
| "epoch": 31.745337995337994, |
| "grad_norm": 0.49291902780532837, |
| "learning_rate": 0.00021918717201166178, |
| "loss": 3.1433, |
| "step": 108950 |
| }, |
| { |
| "epoch": 31.75990675990676, |
| "grad_norm": 0.4575839042663574, |
| "learning_rate": 0.00021901224489795915, |
| "loss": 3.1317, |
| "step": 109000 |
| }, |
| { |
| "epoch": 31.75990675990676, |
| "eval_accuracy": 0.37544023079794103, |
| "eval_loss": 3.538872003555298, |
| "eval_runtime": 179.1437, |
| "eval_samples_per_second": 92.898, |
| "eval_steps_per_second": 5.811, |
| "step": 109000 |
| }, |
| { |
| "epoch": 31.75990675990676, |
| "step": 109000, |
| "total_flos": 2.278434118828032e+18, |
| "train_loss": 0.8388565721424348, |
| "train_runtime": 57880.4577, |
| "train_samples_per_second": 237.175, |
| "train_steps_per_second": 2.965 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 171600, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 50, |
| "save_steps": 10000, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 20, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 11 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.278434118828032e+18, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|