| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.996706915477497, | |
| "eval_steps": 500, | |
| "global_step": 1365, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0021953896816684962, | |
| "grad_norm": 37.74675866275334, | |
| "learning_rate": 3.64963503649635e-09, | |
| "loss": 1.4854, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.010976948408342482, | |
| "grad_norm": 34.58078409329761, | |
| "learning_rate": 1.824817518248175e-08, | |
| "loss": 1.4947, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.021953896816684963, | |
| "grad_norm": 33.64622164796075, | |
| "learning_rate": 3.64963503649635e-08, | |
| "loss": 1.4655, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.03293084522502744, | |
| "grad_norm": 33.96621368420792, | |
| "learning_rate": 5.474452554744526e-08, | |
| "loss": 1.4454, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.043907793633369926, | |
| "grad_norm": 32.64608823456708, | |
| "learning_rate": 7.2992700729927e-08, | |
| "loss": 1.441, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.054884742041712405, | |
| "grad_norm": 28.7407961705511, | |
| "learning_rate": 9.124087591240875e-08, | |
| "loss": 1.4399, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.06586169045005488, | |
| "grad_norm": 26.49133383604883, | |
| "learning_rate": 1.0948905109489052e-07, | |
| "loss": 1.3907, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.07683863885839737, | |
| "grad_norm": 28.558137483756873, | |
| "learning_rate": 1.2773722627737227e-07, | |
| "loss": 1.3469, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.08781558726673985, | |
| "grad_norm": 16.332214000825115, | |
| "learning_rate": 1.45985401459854e-07, | |
| "loss": 1.2666, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.09879253567508232, | |
| "grad_norm": 10.940594279582514, | |
| "learning_rate": 1.6423357664233575e-07, | |
| "loss": 1.2482, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.10976948408342481, | |
| "grad_norm": 5.741537178926412, | |
| "learning_rate": 1.824817518248175e-07, | |
| "loss": 1.2249, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.1207464324917673, | |
| "grad_norm": 5.018901761568715, | |
| "learning_rate": 2.0072992700729928e-07, | |
| "loss": 1.1839, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.13172338090010977, | |
| "grad_norm": 4.321418760394092, | |
| "learning_rate": 2.1897810218978103e-07, | |
| "loss": 1.1806, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.14270032930845225, | |
| "grad_norm": 3.6868934108802405, | |
| "learning_rate": 2.3722627737226276e-07, | |
| "loss": 1.1367, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.15367727771679474, | |
| "grad_norm": 3.6056758938325633, | |
| "learning_rate": 2.5547445255474454e-07, | |
| "loss": 1.1194, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.16465422612513722, | |
| "grad_norm": 2.9888643007945923, | |
| "learning_rate": 2.737226277372263e-07, | |
| "loss": 1.1277, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.1756311745334797, | |
| "grad_norm": 3.1811524089107737, | |
| "learning_rate": 2.91970802919708e-07, | |
| "loss": 1.1041, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.18660812294182216, | |
| "grad_norm": 2.7324489147282356, | |
| "learning_rate": 3.102189781021898e-07, | |
| "loss": 1.1085, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.19758507135016465, | |
| "grad_norm": 2.612737685100592, | |
| "learning_rate": 3.284671532846715e-07, | |
| "loss": 1.0872, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.20856201975850713, | |
| "grad_norm": 2.7784872016274806, | |
| "learning_rate": 3.467153284671533e-07, | |
| "loss": 1.1009, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.21953896816684962, | |
| "grad_norm": 2.6806669714481695, | |
| "learning_rate": 3.64963503649635e-07, | |
| "loss": 1.0742, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2305159165751921, | |
| "grad_norm": 2.8589613220847774, | |
| "learning_rate": 3.8321167883211675e-07, | |
| "loss": 1.0928, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.2414928649835346, | |
| "grad_norm": 2.6851891388693887, | |
| "learning_rate": 4.0145985401459856e-07, | |
| "loss": 1.079, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.2524698133918771, | |
| "grad_norm": 2.3561189558185136, | |
| "learning_rate": 4.1970802919708026e-07, | |
| "loss": 1.0441, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.26344676180021953, | |
| "grad_norm": 2.3680695140089796, | |
| "learning_rate": 4.3795620437956206e-07, | |
| "loss": 1.077, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.27442371020856204, | |
| "grad_norm": 2.9917242722942095, | |
| "learning_rate": 4.5620437956204376e-07, | |
| "loss": 1.0486, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.2854006586169045, | |
| "grad_norm": 2.331882690510773, | |
| "learning_rate": 4.744525547445255e-07, | |
| "loss": 1.0478, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.29637760702524696, | |
| "grad_norm": 2.54506147430991, | |
| "learning_rate": 4.927007299270073e-07, | |
| "loss": 1.0501, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.30735455543358947, | |
| "grad_norm": 3.343713651464699, | |
| "learning_rate": 4.999926370237027e-07, | |
| "loss": 1.0464, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.31833150384193193, | |
| "grad_norm": 2.345761304992979, | |
| "learning_rate": 4.999476426280587e-07, | |
| "loss": 1.039, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.32930845225027444, | |
| "grad_norm": 2.5391392458703286, | |
| "learning_rate": 4.998617517322294e-07, | |
| "loss": 1.0316, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.3402854006586169, | |
| "grad_norm": 2.472489561391851, | |
| "learning_rate": 4.997349783897061e-07, | |
| "loss": 1.0417, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.3512623490669594, | |
| "grad_norm": 2.3737303149457016, | |
| "learning_rate": 4.99567343343177e-07, | |
| "loss": 1.0195, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.36223929747530187, | |
| "grad_norm": 2.3800187550942047, | |
| "learning_rate": 4.993588740211331e-07, | |
| "loss": 1.0474, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.3732162458836443, | |
| "grad_norm": 2.330985495803907, | |
| "learning_rate": 4.991096045333809e-07, | |
| "loss": 1.0262, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.38419319429198684, | |
| "grad_norm": 2.4303555873767935, | |
| "learning_rate": 4.988195756654605e-07, | |
| "loss": 1.0142, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.3951701427003293, | |
| "grad_norm": 2.4094455102172585, | |
| "learning_rate": 4.984888348719731e-07, | |
| "loss": 1.0188, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.4061470911086718, | |
| "grad_norm": 2.1476465324301057, | |
| "learning_rate": 4.981174362688158e-07, | |
| "loss": 1.0142, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.41712403951701427, | |
| "grad_norm": 2.2526911826436615, | |
| "learning_rate": 4.977054406243274e-07, | |
| "loss": 1.0328, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.4281009879253567, | |
| "grad_norm": 2.3224686636469443, | |
| "learning_rate": 4.972529153493455e-07, | |
| "loss": 0.9933, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.43907793633369924, | |
| "grad_norm": 2.4302653074531437, | |
| "learning_rate": 4.967599344861768e-07, | |
| "loss": 0.9954, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.4500548847420417, | |
| "grad_norm": 2.4835737176654904, | |
| "learning_rate": 4.96226578696482e-07, | |
| "loss": 0.9985, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.4610318331503842, | |
| "grad_norm": 2.4230184538015354, | |
| "learning_rate": 4.956529352480782e-07, | |
| "loss": 0.9987, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.47200878155872666, | |
| "grad_norm": 2.329153987703765, | |
| "learning_rate": 4.950390980006599e-07, | |
| "loss": 1.0098, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.4829857299670692, | |
| "grad_norm": 2.1643982961769574, | |
| "learning_rate": 4.943851673904419e-07, | |
| "loss": 1.0133, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.49396267837541163, | |
| "grad_norm": 2.3197403816872977, | |
| "learning_rate": 4.936912504137257e-07, | |
| "loss": 1.0142, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.5049396267837541, | |
| "grad_norm": 2.34747990249977, | |
| "learning_rate": 4.929574606093926e-07, | |
| "loss": 0.9976, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.5159165751920965, | |
| "grad_norm": 2.2485105183919107, | |
| "learning_rate": 4.92183918040327e-07, | |
| "loss": 0.9891, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.5268935236004391, | |
| "grad_norm": 2.2790091103112977, | |
| "learning_rate": 4.913707492737708e-07, | |
| "loss": 1.002, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.5378704720087816, | |
| "grad_norm": 2.239563511547285, | |
| "learning_rate": 4.905180873606156e-07, | |
| "loss": 0.9884, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.5488474204171241, | |
| "grad_norm": 2.201197024971912, | |
| "learning_rate": 4.896260718136315e-07, | |
| "loss": 1.0053, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5598243688254665, | |
| "grad_norm": 2.4477006618500976, | |
| "learning_rate": 4.886948485846413e-07, | |
| "loss": 0.9905, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.570801317233809, | |
| "grad_norm": 2.1778270375289357, | |
| "learning_rate": 4.877245700406389e-07, | |
| "loss": 1.006, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.5817782656421515, | |
| "grad_norm": 2.470318023588166, | |
| "learning_rate": 4.867153949388592e-07, | |
| "loss": 1.0001, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.5927552140504939, | |
| "grad_norm": 2.246482390471278, | |
| "learning_rate": 4.856674884008027e-07, | |
| "loss": 1.0016, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.6037321624588364, | |
| "grad_norm": 2.555761228606553, | |
| "learning_rate": 4.845810218852175e-07, | |
| "loss": 0.9909, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.6147091108671789, | |
| "grad_norm": 2.6187160779641405, | |
| "learning_rate": 4.834561731600457e-07, | |
| "loss": 0.9879, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.6256860592755215, | |
| "grad_norm": 2.2540909179670288, | |
| "learning_rate": 4.822931262733367e-07, | |
| "loss": 0.9723, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.6366630076838639, | |
| "grad_norm": 2.3392602017439468, | |
| "learning_rate": 4.810920715231334e-07, | |
| "loss": 0.9813, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.6476399560922064, | |
| "grad_norm": 2.5885537238335607, | |
| "learning_rate": 4.798532054263356e-07, | |
| "loss": 0.9672, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.6586169045005489, | |
| "grad_norm": 2.3642433721473712, | |
| "learning_rate": 4.785767306865457e-07, | |
| "loss": 0.9774, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6695938529088913, | |
| "grad_norm": 2.408185221870625, | |
| "learning_rate": 4.772628561609021e-07, | |
| "loss": 0.985, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.6805708013172338, | |
| "grad_norm": 2.2324134890318246, | |
| "learning_rate": 4.7591179682590644e-07, | |
| "loss": 0.9725, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.6915477497255763, | |
| "grad_norm": 2.322340624031069, | |
| "learning_rate": 4.7452377374224874e-07, | |
| "loss": 0.9806, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.7025246981339188, | |
| "grad_norm": 2.211381205162829, | |
| "learning_rate": 4.730990140186373e-07, | |
| "loss": 0.999, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.7135016465422612, | |
| "grad_norm": 2.2193259266838155, | |
| "learning_rate": 4.7163775077463963e-07, | |
| "loss": 0.9728, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.7244785949506037, | |
| "grad_norm": 2.307395076998798, | |
| "learning_rate": 4.7014022310253875e-07, | |
| "loss": 0.9924, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.7354555433589463, | |
| "grad_norm": 2.242602418548775, | |
| "learning_rate": 4.6860667602821316e-07, | |
| "loss": 0.9861, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.7464324917672887, | |
| "grad_norm": 2.2686164540827547, | |
| "learning_rate": 4.670373604710456e-07, | |
| "loss": 0.9803, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.7574094401756312, | |
| "grad_norm": 2.1375492767276048, | |
| "learning_rate": 4.6543253320286755e-07, | |
| "loss": 0.9617, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.7683863885839737, | |
| "grad_norm": 2.2558323943911684, | |
| "learning_rate": 4.6379245680594625e-07, | |
| "loss": 0.9531, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.7793633369923162, | |
| "grad_norm": 2.39500346963894, | |
| "learning_rate": 4.621173996300207e-07, | |
| "loss": 0.9897, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.7903402854006586, | |
| "grad_norm": 2.636774268293073, | |
| "learning_rate": 4.6040763574839435e-07, | |
| "loss": 0.9899, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.8013172338090011, | |
| "grad_norm": 2.3125464385328236, | |
| "learning_rate": 4.5866344491309104e-07, | |
| "loss": 0.9844, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.8122941822173436, | |
| "grad_norm": 2.158764874076257, | |
| "learning_rate": 4.568851125090822e-07, | |
| "loss": 0.9789, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.823271130625686, | |
| "grad_norm": 2.146303203312366, | |
| "learning_rate": 4.550729295075918e-07, | |
| "loss": 0.9894, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.8342480790340285, | |
| "grad_norm": 2.1234461105972686, | |
| "learning_rate": 4.532271924184874e-07, | |
| "loss": 0.9573, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.845225027442371, | |
| "grad_norm": 2.5991666840318524, | |
| "learning_rate": 4.513482032417655e-07, | |
| "loss": 0.978, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.8562019758507134, | |
| "grad_norm": 2.459654668779688, | |
| "learning_rate": 4.494362694181384e-07, | |
| "loss": 0.9774, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.867178924259056, | |
| "grad_norm": 2.247292046481084, | |
| "learning_rate": 4.474917037787297e-07, | |
| "loss": 0.9753, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.8781558726673985, | |
| "grad_norm": 2.2211212349255574, | |
| "learning_rate": 4.455148244938901e-07, | |
| "loss": 0.9789, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.889132821075741, | |
| "grad_norm": 2.3018670472595644, | |
| "learning_rate": 4.435059550211371e-07, | |
| "loss": 0.9748, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.9001097694840834, | |
| "grad_norm": 2.1721337174418784, | |
| "learning_rate": 4.414654240522315e-07, | |
| "loss": 0.9741, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.9110867178924259, | |
| "grad_norm": 2.28819746756527, | |
| "learning_rate": 4.3939356545939677e-07, | |
| "loss": 0.9756, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.9220636663007684, | |
| "grad_norm": 2.4856430019366806, | |
| "learning_rate": 4.372907182406902e-07, | |
| "loss": 0.9599, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.9330406147091108, | |
| "grad_norm": 2.279118517164672, | |
| "learning_rate": 4.3515722646453657e-07, | |
| "loss": 0.9766, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.9440175631174533, | |
| "grad_norm": 2.283587407407711, | |
| "learning_rate": 4.329934392134314e-07, | |
| "loss": 0.9708, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.9549945115257958, | |
| "grad_norm": 2.3436121330856676, | |
| "learning_rate": 4.3079971052682387e-07, | |
| "loss": 0.9752, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.9659714599341384, | |
| "grad_norm": 2.33617606208673, | |
| "learning_rate": 4.2857639934318874e-07, | |
| "loss": 0.9719, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.9769484083424808, | |
| "grad_norm": 2.2203473882036966, | |
| "learning_rate": 4.26323869441297e-07, | |
| "loss": 0.9613, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.9879253567508233, | |
| "grad_norm": 2.5027842654915005, | |
| "learning_rate": 4.240424893806941e-07, | |
| "loss": 0.9864, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.9989023051591658, | |
| "grad_norm": 2.333812012249256, | |
| "learning_rate": 4.217326324413962e-07, | |
| "loss": 0.9643, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.9989023051591658, | |
| "eval_loss": 1.019766926765442, | |
| "eval_runtime": 4.8917, | |
| "eval_samples_per_second": 54.786, | |
| "eval_steps_per_second": 1.022, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.0098792535675083, | |
| "grad_norm": 2.6384758953681495, | |
| "learning_rate": 4.1939467656281387e-07, | |
| "loss": 0.9411, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.0208562019758507, | |
| "grad_norm": 2.4368918808468263, | |
| "learning_rate": 4.170290042819137e-07, | |
| "loss": 0.9304, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.031833150384193, | |
| "grad_norm": 2.606275797390668, | |
| "learning_rate": 4.146360026706276e-07, | |
| "loss": 0.9401, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.0428100987925357, | |
| "grad_norm": 2.3703029555878175, | |
| "learning_rate": 4.122160632725195e-07, | |
| "loss": 0.9411, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 1.0537870472008781, | |
| "grad_norm": 2.3433873192058785, | |
| "learning_rate": 4.097695820387216e-07, | |
| "loss": 0.9314, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.0647639956092205, | |
| "grad_norm": 2.3059951796342757, | |
| "learning_rate": 4.0729695926314813e-07, | |
| "loss": 0.9212, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 1.0757409440175631, | |
| "grad_norm": 2.246905867491328, | |
| "learning_rate": 4.0479859951699934e-07, | |
| "loss": 0.9299, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.0867178924259056, | |
| "grad_norm": 2.332204582181345, | |
| "learning_rate": 4.0227491158256564e-07, | |
| "loss": 0.9283, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 1.0976948408342482, | |
| "grad_norm": 2.3052192318083096, | |
| "learning_rate": 3.9972630838634236e-07, | |
| "loss": 0.9081, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.1086717892425906, | |
| "grad_norm": 2.2938816727121174, | |
| "learning_rate": 3.9715320693146653e-07, | |
| "loss": 0.9244, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 1.119648737650933, | |
| "grad_norm": 2.3376274658971794, | |
| "learning_rate": 3.9455602822948695e-07, | |
| "loss": 0.9356, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.1306256860592756, | |
| "grad_norm": 2.2234966705346437, | |
| "learning_rate": 3.9193519723147795e-07, | |
| "loss": 0.937, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 1.141602634467618, | |
| "grad_norm": 2.3167888423276897, | |
| "learning_rate": 3.892911427585089e-07, | |
| "loss": 0.9253, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.1525795828759604, | |
| "grad_norm": 2.2822984548495, | |
| "learning_rate": 3.8662429743148046e-07, | |
| "loss": 0.93, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 1.163556531284303, | |
| "grad_norm": 2.3315853272899885, | |
| "learning_rate": 3.839350976003386e-07, | |
| "loss": 0.9307, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.1745334796926454, | |
| "grad_norm": 2.225438330230274, | |
| "learning_rate": 3.8122398327267956e-07, | |
| "loss": 0.9191, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 1.1855104281009878, | |
| "grad_norm": 2.282678706637911, | |
| "learning_rate": 3.7849139804175527e-07, | |
| "loss": 0.9242, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.1964873765093305, | |
| "grad_norm": 2.4151431361330262, | |
| "learning_rate": 3.7573778901389264e-07, | |
| "loss": 0.9129, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 1.2074643249176729, | |
| "grad_norm": 2.1432543425535986, | |
| "learning_rate": 3.729636067353377e-07, | |
| "loss": 0.9352, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.2184412733260155, | |
| "grad_norm": 2.2103634060126045, | |
| "learning_rate": 3.701693051185375e-07, | |
| "loss": 0.9217, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 1.2294182217343579, | |
| "grad_norm": 2.29979398895129, | |
| "learning_rate": 3.6735534136787034e-07, | |
| "loss": 0.9241, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.2403951701427003, | |
| "grad_norm": 2.211918385111886, | |
| "learning_rate": 3.6452217590483843e-07, | |
| "loss": 0.9316, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 1.2513721185510427, | |
| "grad_norm": 2.188755024370649, | |
| "learning_rate": 3.6167027229273296e-07, | |
| "loss": 0.9089, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.2623490669593853, | |
| "grad_norm": 2.4206253706110576, | |
| "learning_rate": 3.5880009716078643e-07, | |
| "loss": 0.9285, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 1.2733260153677277, | |
| "grad_norm": 2.2945442413712684, | |
| "learning_rate": 3.559121201278219e-07, | |
| "loss": 0.918, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.2843029637760703, | |
| "grad_norm": 2.345758700673061, | |
| "learning_rate": 3.5300681372541473e-07, | |
| "loss": 0.9305, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 1.2952799121844127, | |
| "grad_norm": 2.2789322285102314, | |
| "learning_rate": 3.5008465332057594e-07, | |
| "loss": 0.9281, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.3062568605927551, | |
| "grad_norm": 2.243522596217181, | |
| "learning_rate": 3.471461170379732e-07, | |
| "loss": 0.914, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 1.3172338090010978, | |
| "grad_norm": 2.323467820978964, | |
| "learning_rate": 3.4419168568169935e-07, | |
| "loss": 0.9361, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.3282107574094402, | |
| "grad_norm": 2.194684758387474, | |
| "learning_rate": 3.4122184265660395e-07, | |
| "loss": 0.9311, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 1.3391877058177828, | |
| "grad_norm": 2.2140499236992603, | |
| "learning_rate": 3.3823707388919736e-07, | |
| "loss": 0.9125, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.3501646542261252, | |
| "grad_norm": 2.258349553066923, | |
| "learning_rate": 3.352378677481444e-07, | |
| "loss": 0.9426, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 1.3611416026344676, | |
| "grad_norm": 2.358073970259772, | |
| "learning_rate": 3.3222471496435643e-07, | |
| "loss": 0.9188, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.37211855104281, | |
| "grad_norm": 2.293914944788209, | |
| "learning_rate": 3.291981085506986e-07, | |
| "loss": 0.9046, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 1.3830954994511526, | |
| "grad_norm": 2.239065960199277, | |
| "learning_rate": 3.2615854372132226e-07, | |
| "loss": 0.924, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.394072447859495, | |
| "grad_norm": 2.236539103492653, | |
| "learning_rate": 3.231065178106384e-07, | |
| "loss": 0.9229, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 1.4050493962678376, | |
| "grad_norm": 2.1945349736134707, | |
| "learning_rate": 3.2004253019194324e-07, | |
| "loss": 0.9272, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.41602634467618, | |
| "grad_norm": 2.2398945956325513, | |
| "learning_rate": 3.169670821957112e-07, | |
| "loss": 0.9102, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 1.4270032930845225, | |
| "grad_norm": 2.289801618738276, | |
| "learning_rate": 3.1388067702756655e-07, | |
| "loss": 0.9429, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.4379802414928649, | |
| "grad_norm": 2.245541374176715, | |
| "learning_rate": 3.1078381968594895e-07, | |
| "loss": 0.9115, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 1.4489571899012075, | |
| "grad_norm": 2.278894612214528, | |
| "learning_rate": 3.076770168794854e-07, | |
| "loss": 0.9398, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.4599341383095499, | |
| "grad_norm": 2.3065149845422392, | |
| "learning_rate": 3.045607769440829e-07, | |
| "loss": 0.8946, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 1.4709110867178925, | |
| "grad_norm": 2.3594569504860696, | |
| "learning_rate": 3.014356097597535e-07, | |
| "loss": 0.9137, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.481888035126235, | |
| "grad_norm": 2.4349120794769354, | |
| "learning_rate": 2.983020266671886e-07, | |
| "loss": 0.9113, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 1.4928649835345773, | |
| "grad_norm": 2.2209808300678318, | |
| "learning_rate": 2.951605403840921e-07, | |
| "loss": 0.8983, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.5038419319429197, | |
| "grad_norm": 2.3286466229872635, | |
| "learning_rate": 2.920116649212909e-07, | |
| "loss": 0.9312, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 1.5148188803512623, | |
| "grad_norm": 2.238157735155838, | |
| "learning_rate": 2.888559154986307e-07, | |
| "loss": 0.921, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.525795828759605, | |
| "grad_norm": 2.2522331987696944, | |
| "learning_rate": 2.856938084606769e-07, | |
| "loss": 0.9027, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 1.5367727771679474, | |
| "grad_norm": 2.225248865709702, | |
| "learning_rate": 2.825258611922292e-07, | |
| "loss": 0.9186, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.5477497255762898, | |
| "grad_norm": 2.2372307174396036, | |
| "learning_rate": 2.793525920336677e-07, | |
| "loss": 0.9179, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 1.5587266739846322, | |
| "grad_norm": 2.227278818178374, | |
| "learning_rate": 2.7617452019614144e-07, | |
| "loss": 0.9068, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.5697036223929748, | |
| "grad_norm": 2.1887205060374133, | |
| "learning_rate": 2.7299216567661503e-07, | |
| "loss": 0.9128, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 1.5806805708013172, | |
| "grad_norm": 2.1730778041819487, | |
| "learning_rate": 2.6980604917278675e-07, | |
| "loss": 0.9072, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.5916575192096598, | |
| "grad_norm": 2.306897540604203, | |
| "learning_rate": 2.6661669199789174e-07, | |
| "loss": 0.9241, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 1.6026344676180022, | |
| "grad_norm": 2.2513680878615956, | |
| "learning_rate": 2.6342461599540456e-07, | |
| "loss": 0.9183, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.6136114160263446, | |
| "grad_norm": 2.2683353014176504, | |
| "learning_rate": 2.6023034345365515e-07, | |
| "loss": 0.9101, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 1.624588364434687, | |
| "grad_norm": 2.3773000711715793, | |
| "learning_rate": 2.5703439702037155e-07, | |
| "loss": 0.9226, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.6355653128430296, | |
| "grad_norm": 2.2707149058977047, | |
| "learning_rate": 2.538372996171648e-07, | |
| "loss": 0.9099, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 1.6465422612513723, | |
| "grad_norm": 2.3321984673401808, | |
| "learning_rate": 2.506395743539677e-07, | |
| "loss": 0.9053, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.6575192096597147, | |
| "grad_norm": 2.289265422242154, | |
| "learning_rate": 2.474417444434436e-07, | |
| "loss": 0.9167, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 1.668496158068057, | |
| "grad_norm": 2.29419980236518, | |
| "learning_rate": 2.442443331153783e-07, | |
| "loss": 0.9255, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.6794731064763995, | |
| "grad_norm": 2.2919378273617785, | |
| "learning_rate": 2.4104786353106926e-07, | |
| "loss": 0.9039, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 1.690450054884742, | |
| "grad_norm": 2.2503081710593347, | |
| "learning_rate": 2.3785285869772525e-07, | |
| "loss": 0.908, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.7014270032930845, | |
| "grad_norm": 2.273983108472163, | |
| "learning_rate": 2.3465984138289237e-07, | |
| "loss": 0.9178, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 1.7124039517014271, | |
| "grad_norm": 2.2199735303427333, | |
| "learning_rate": 2.3146933402891812e-07, | |
| "loss": 0.9174, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.7233809001097695, | |
| "grad_norm": 2.246832371307188, | |
| "learning_rate": 2.2828185866746987e-07, | |
| "loss": 0.9063, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 1.734357848518112, | |
| "grad_norm": 2.262285299215957, | |
| "learning_rate": 2.2509793683411932e-07, | |
| "loss": 0.9282, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.7453347969264543, | |
| "grad_norm": 2.179628826722484, | |
| "learning_rate": 2.2191808948300904e-07, | |
| "loss": 0.8962, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 1.756311745334797, | |
| "grad_norm": 2.363447036072627, | |
| "learning_rate": 2.1874283690161364e-07, | |
| "loss": 0.9288, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.7672886937431396, | |
| "grad_norm": 2.2413506946996025, | |
| "learning_rate": 2.1557269862561042e-07, | |
| "loss": 0.9105, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 1.778265642151482, | |
| "grad_norm": 2.213099523042175, | |
| "learning_rate": 2.124081933538726e-07, | |
| "loss": 0.9152, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.7892425905598244, | |
| "grad_norm": 2.3911262169971232, | |
| "learning_rate": 2.0924983886359966e-07, | |
| "loss": 0.9114, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 1.8002195389681668, | |
| "grad_norm": 2.3443041782008, | |
| "learning_rate": 2.0609815192559863e-07, | |
| "loss": 0.906, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.8111964873765092, | |
| "grad_norm": 2.2448633071392736, | |
| "learning_rate": 2.0295364821972993e-07, | |
| "loss": 0.9229, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 1.8221734357848518, | |
| "grad_norm": 2.335369643205432, | |
| "learning_rate": 1.998168422505315e-07, | |
| "loss": 0.9055, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.8331503841931944, | |
| "grad_norm": 2.260199044110363, | |
| "learning_rate": 1.966882472630356e-07, | |
| "loss": 0.916, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 1.8441273326015368, | |
| "grad_norm": 2.1940831038305624, | |
| "learning_rate": 1.9356837515879137e-07, | |
| "loss": 0.9128, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.8551042810098792, | |
| "grad_norm": 2.2395348963129176, | |
| "learning_rate": 1.904577364121077e-07, | |
| "loss": 0.9137, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 1.8660812294182216, | |
| "grad_norm": 2.2788272374999288, | |
| "learning_rate": 1.873568399865288e-07, | |
| "loss": 0.8989, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.8770581778265643, | |
| "grad_norm": 2.364940762438768, | |
| "learning_rate": 1.8426619325155772e-07, | |
| "loss": 0.9136, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 1.8880351262349067, | |
| "grad_norm": 2.248184524351583, | |
| "learning_rate": 1.8118630189964055e-07, | |
| "loss": 0.9234, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.8990120746432493, | |
| "grad_norm": 2.265297755363968, | |
| "learning_rate": 1.7811766986342457e-07, | |
| "loss": 0.9003, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 1.9099890230515917, | |
| "grad_norm": 2.3424580199936447, | |
| "learning_rate": 1.7506079923330492e-07, | |
| "loss": 0.918, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.920965971459934, | |
| "grad_norm": 2.262902517301985, | |
| "learning_rate": 1.7201619017527224e-07, | |
| "loss": 0.9305, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 1.9319429198682765, | |
| "grad_norm": 2.2767358868319385, | |
| "learning_rate": 1.6898434084907548e-07, | |
| "loss": 0.9004, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.942919868276619, | |
| "grad_norm": 2.233676995391588, | |
| "learning_rate": 1.6596574732671286e-07, | |
| "loss": 0.9107, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 1.9538968166849617, | |
| "grad_norm": 2.2349418347267984, | |
| "learning_rate": 1.6296090351126445e-07, | |
| "loss": 0.9012, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.9648737650933041, | |
| "grad_norm": 2.2644252517319043, | |
| "learning_rate": 1.5997030105607967e-07, | |
| "loss": 0.9055, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 1.9758507135016465, | |
| "grad_norm": 2.3639914159405824, | |
| "learning_rate": 1.5699442928433255e-07, | |
| "loss": 0.8966, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.986827661909989, | |
| "grad_norm": 2.2554267100734346, | |
| "learning_rate": 1.5403377510895898e-07, | |
| "loss": 0.9192, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 1.9978046103183313, | |
| "grad_norm": 2.202331515306409, | |
| "learning_rate": 1.5108882295298748e-07, | |
| "loss": 0.9077, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.9618459939956665, | |
| "eval_runtime": 3.545, | |
| "eval_samples_per_second": 75.599, | |
| "eval_steps_per_second": 1.41, | |
| "step": 911 | |
| }, | |
| { | |
| "epoch": 2.008781558726674, | |
| "grad_norm": 2.3776213420255745, | |
| "learning_rate": 1.4816005467027793e-07, | |
| "loss": 0.8907, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 2.0197585071350166, | |
| "grad_norm": 2.2333706796571384, | |
| "learning_rate": 1.452479494666809e-07, | |
| "loss": 0.8927, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 2.030735455543359, | |
| "grad_norm": 2.2112426814454964, | |
| "learning_rate": 1.4235298382162897e-07, | |
| "loss": 0.8852, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 2.0417124039517014, | |
| "grad_norm": 2.236063023873273, | |
| "learning_rate": 1.3947563141017593e-07, | |
| "loss": 0.8955, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.052689352360044, | |
| "grad_norm": 2.329499119971109, | |
| "learning_rate": 1.3661636302549355e-07, | |
| "loss": 0.8726, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 2.063666300768386, | |
| "grad_norm": 2.226335466772911, | |
| "learning_rate": 1.3377564650184008e-07, | |
| "loss": 0.8844, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 2.074643249176729, | |
| "grad_norm": 2.359491075414422, | |
| "learning_rate": 1.3095394663801344e-07, | |
| "loss": 0.8888, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 2.0856201975850714, | |
| "grad_norm": 2.2422498605446894, | |
| "learning_rate": 1.2815172512130079e-07, | |
| "loss": 0.8946, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.096597145993414, | |
| "grad_norm": 2.34035910547768, | |
| "learning_rate": 1.2536944045193644e-07, | |
| "loss": 0.8925, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 2.1075740944017562, | |
| "grad_norm": 2.3092350865810105, | |
| "learning_rate": 1.2260754786808286e-07, | |
| "loss": 0.8769, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.1185510428100987, | |
| "grad_norm": 2.285345952327998, | |
| "learning_rate": 1.198664992713437e-07, | |
| "loss": 0.8637, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 2.129527991218441, | |
| "grad_norm": 2.31950118158217, | |
| "learning_rate": 1.1714674315282406e-07, | |
| "loss": 0.8851, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 2.140504939626784, | |
| "grad_norm": 2.260100588550131, | |
| "learning_rate": 1.144487245197481e-07, | |
| "loss": 0.877, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 2.1514818880351263, | |
| "grad_norm": 2.28406902577705, | |
| "learning_rate": 1.1177288482264652e-07, | |
| "loss": 0.8731, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 2.1624588364434687, | |
| "grad_norm": 2.349769500619024, | |
| "learning_rate": 1.091196618831268e-07, | |
| "loss": 0.8741, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 2.173435784851811, | |
| "grad_norm": 2.319775552144385, | |
| "learning_rate": 1.0648948982223657e-07, | |
| "loss": 0.9009, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.1844127332601535, | |
| "grad_norm": 2.854032904166289, | |
| "learning_rate": 1.0388279898943222e-07, | |
| "loss": 0.8867, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 2.1953896816684964, | |
| "grad_norm": 2.3639396187236508, | |
| "learning_rate": 1.0130001589216566e-07, | |
| "loss": 0.8848, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.2063666300768388, | |
| "grad_norm": 2.3793062354194143, | |
| "learning_rate": 9.874156312609835e-08, | |
| "loss": 0.8811, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 2.217343578485181, | |
| "grad_norm": 2.287306409448599, | |
| "learning_rate": 9.620785930595679e-08, | |
| "loss": 0.8742, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 2.2283205268935236, | |
| "grad_norm": 2.308427796984264, | |
| "learning_rate": 9.369931899703821e-08, | |
| "loss": 0.8879, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 2.239297475301866, | |
| "grad_norm": 2.266095319232965, | |
| "learning_rate": 9.121635264737923e-08, | |
| "loss": 0.8976, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 2.2502744237102084, | |
| "grad_norm": 2.290278874944272, | |
| "learning_rate": 8.87593665205987e-08, | |
| "loss": 0.8957, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 2.261251372118551, | |
| "grad_norm": 2.351021776661905, | |
| "learning_rate": 8.632876262942415e-08, | |
| "loss": 0.8833, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 2.2722283205268936, | |
| "grad_norm": 2.290071235509179, | |
| "learning_rate": 8.392493866991487e-08, | |
| "loss": 0.8728, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 2.283205268935236, | |
| "grad_norm": 2.2366625219176037, | |
| "learning_rate": 8.154828795639057e-08, | |
| "loss": 0.8925, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 2.2941822173435784, | |
| "grad_norm": 2.2935122329416253, | |
| "learning_rate": 7.9199199357077e-08, | |
| "loss": 0.8954, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 2.305159165751921, | |
| "grad_norm": 2.2174582268909346, | |
| "learning_rate": 7.687805723047952e-08, | |
| "loss": 0.8755, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.3161361141602637, | |
| "grad_norm": 2.320147905008012, | |
| "learning_rate": 7.45852413624943e-08, | |
| "loss": 0.8672, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 2.327113062568606, | |
| "grad_norm": 2.311040151424324, | |
| "learning_rate": 7.232112690426712e-08, | |
| "loss": 0.8732, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 2.3380900109769485, | |
| "grad_norm": 2.286645866168397, | |
| "learning_rate": 7.008608431081179e-08, | |
| "loss": 0.8862, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 2.349066959385291, | |
| "grad_norm": 2.243167376226312, | |
| "learning_rate": 6.78804792803955e-08, | |
| "loss": 0.8708, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 2.3600439077936333, | |
| "grad_norm": 2.315245531135358, | |
| "learning_rate": 6.570467269470375e-08, | |
| "loss": 0.8822, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 2.3710208562019757, | |
| "grad_norm": 2.268683761726241, | |
| "learning_rate": 6.355902055979253e-08, | |
| "loss": 0.879, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 2.3819978046103185, | |
| "grad_norm": 2.312610053338687, | |
| "learning_rate": 6.144387394783829e-08, | |
| "loss": 0.8647, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 2.392974753018661, | |
| "grad_norm": 2.2428629980309616, | |
| "learning_rate": 5.935957893969587e-08, | |
| "loss": 0.8762, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 2.4039517014270033, | |
| "grad_norm": 2.35765208556055, | |
| "learning_rate": 5.730647656827242e-08, | |
| "loss": 0.8841, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 2.4149286498353457, | |
| "grad_norm": 2.2494376360363164, | |
| "learning_rate": 5.528490276272732e-08, | |
| "loss": 0.8711, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.425905598243688, | |
| "grad_norm": 2.2596581762379793, | |
| "learning_rate": 5.329518829350788e-08, | |
| "loss": 0.8724, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 2.436882546652031, | |
| "grad_norm": 2.3221019724494982, | |
| "learning_rate": 5.1337658718227926e-08, | |
| "loss": 0.8716, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 2.4478594950603734, | |
| "grad_norm": 2.2548297711291467, | |
| "learning_rate": 4.941263432840062e-08, | |
| "loss": 0.8703, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 2.4588364434687158, | |
| "grad_norm": 2.2559502603970776, | |
| "learning_rate": 4.7520430097031855e-08, | |
| "loss": 0.891, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 2.469813391877058, | |
| "grad_norm": 2.348427252684778, | |
| "learning_rate": 4.566135562708437e-08, | |
| "loss": 0.8697, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 2.4807903402854006, | |
| "grad_norm": 2.216073741504324, | |
| "learning_rate": 4.383571510082051e-08, | |
| "loss": 0.8682, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 2.491767288693743, | |
| "grad_norm": 2.295163968424724, | |
| "learning_rate": 4.20438072300319e-08, | |
| "loss": 0.8837, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 2.5027442371020854, | |
| "grad_norm": 2.2516483968280343, | |
| "learning_rate": 4.028592520716387e-08, | |
| "loss": 0.8638, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 2.5137211855104282, | |
| "grad_norm": 2.269953195416105, | |
| "learning_rate": 3.8562356657343584e-08, | |
| "loss": 0.8788, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 2.5246981339187706, | |
| "grad_norm": 2.3024875486631133, | |
| "learning_rate": 3.6873383591318394e-08, | |
| "loss": 0.8714, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.535675082327113, | |
| "grad_norm": 2.2241713120206037, | |
| "learning_rate": 3.521928235931346e-08, | |
| "loss": 0.887, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 2.5466520307354554, | |
| "grad_norm": 2.2936145333162994, | |
| "learning_rate": 3.3600323605815107e-08, | |
| "loss": 0.87, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 2.5576289791437983, | |
| "grad_norm": 2.32545502497619, | |
| "learning_rate": 3.201677222528784e-08, | |
| "loss": 0.8686, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 2.5686059275521407, | |
| "grad_norm": 2.2489169681563195, | |
| "learning_rate": 3.0468887318832406e-08, | |
| "loss": 0.8746, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 2.579582875960483, | |
| "grad_norm": 2.2858684159165734, | |
| "learning_rate": 2.8956922151791547e-08, | |
| "loss": 0.8894, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 2.5905598243688255, | |
| "grad_norm": 2.2940920483922382, | |
| "learning_rate": 2.748112411231046e-08, | |
| "loss": 0.8682, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 2.601536772777168, | |
| "grad_norm": 2.2496855474263366, | |
| "learning_rate": 2.6041734670859488e-08, | |
| "loss": 0.8694, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 2.6125137211855103, | |
| "grad_norm": 2.3184342673634055, | |
| "learning_rate": 2.463898934072417e-08, | |
| "loss": 0.8742, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 2.6234906695938527, | |
| "grad_norm": 2.2891655132723576, | |
| "learning_rate": 2.3273117639470958e-08, | |
| "loss": 0.8692, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 2.6344676180021955, | |
| "grad_norm": 2.186621510859105, | |
| "learning_rate": 2.1944343051393173e-08, | |
| "loss": 0.8871, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.645444566410538, | |
| "grad_norm": 2.1909337313948467, | |
| "learning_rate": 2.0652882990944532e-08, | |
| "loss": 0.882, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 2.6564215148188803, | |
| "grad_norm": 2.3021129840080827, | |
| "learning_rate": 1.9398948767165774e-08, | |
| "loss": 0.8798, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 2.6673984632272227, | |
| "grad_norm": 2.233026500612387, | |
| "learning_rate": 1.818274554911034e-08, | |
| "loss": 0.87, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 2.6783754116355656, | |
| "grad_norm": 2.3034061687210925, | |
| "learning_rate": 1.7004472332274117e-08, | |
| "loss": 0.8878, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 2.689352360043908, | |
| "grad_norm": 2.2448202497203957, | |
| "learning_rate": 1.586432190603626e-08, | |
| "loss": 0.8924, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 2.7003293084522504, | |
| "grad_norm": 2.297719442799476, | |
| "learning_rate": 1.4762480822114731e-08, | |
| "loss": 0.8702, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 2.711306256860593, | |
| "grad_norm": 2.2378202672720273, | |
| "learning_rate": 1.3699129364042522e-08, | |
| "loss": 0.8907, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 2.722283205268935, | |
| "grad_norm": 2.2880262820978703, | |
| "learning_rate": 1.267444151766986e-08, | |
| "loss": 0.9005, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.7332601536772776, | |
| "grad_norm": 2.338012677352183, | |
| "learning_rate": 1.1688584942696366e-08, | |
| "loss": 0.8904, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 2.74423710208562, | |
| "grad_norm": 2.289922529284634, | |
| "learning_rate": 1.0741720945238731e-08, | |
| "loss": 0.8986, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.755214050493963, | |
| "grad_norm": 2.2470416957804273, | |
| "learning_rate": 9.834004451437699e-09, | |
| "loss": 0.8728, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 2.7661909989023052, | |
| "grad_norm": 2.2478830819509303, | |
| "learning_rate": 8.965583982108865e-09, | |
| "loss": 0.8754, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.7771679473106476, | |
| "grad_norm": 2.244470092578961, | |
| "learning_rate": 8.136601628441875e-09, | |
| "loss": 0.8828, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 2.78814489571899, | |
| "grad_norm": 2.2299781074531326, | |
| "learning_rate": 7.347193028751364e-09, | |
| "loss": 0.8755, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 2.7991218441273324, | |
| "grad_norm": 2.379183267890298, | |
| "learning_rate": 6.597487346283626e-09, | |
| "loss": 0.8697, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 2.8100987925356753, | |
| "grad_norm": 2.3021772414641837, | |
| "learning_rate": 5.8876072480831264e-09, | |
| "loss": 0.8849, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 2.8210757409440177, | |
| "grad_norm": 2.2739127597741144, | |
| "learning_rate": 5.217668884921505e-09, | |
| "loss": 0.8889, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 2.83205268935236, | |
| "grad_norm": 2.239037460712834, | |
| "learning_rate": 4.587781872293056e-09, | |
| "loss": 0.8636, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.8430296377607025, | |
| "grad_norm": 2.3065882078621436, | |
| "learning_rate": 3.998049272479431e-09, | |
| "loss": 0.8643, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 2.854006586169045, | |
| "grad_norm": 2.2987652140661265, | |
| "learning_rate": 3.4485675776863843e-09, | |
| "loss": 0.882, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.8649835345773873, | |
| "grad_norm": 2.2776905750235956, | |
| "learning_rate": 2.9394266942558976e-09, | |
| "loss": 0.8883, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 2.8759604829857297, | |
| "grad_norm": 2.2719810844272406, | |
| "learning_rate": 2.4707099279556164e-09, | |
| "loss": 0.9002, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 2.8869374313940726, | |
| "grad_norm": 2.393629212210359, | |
| "learning_rate": 2.04249397034828e-09, | |
| "loss": 0.8737, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 2.897914379802415, | |
| "grad_norm": 2.318705736365544, | |
| "learning_rate": 1.6548488862435095e-09, | |
| "loss": 0.8641, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.9088913282107574, | |
| "grad_norm": 2.1750311426072604, | |
| "learning_rate": 1.3078381022336715e-09, | |
| "loss": 0.8636, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 2.9198682766190998, | |
| "grad_norm": 2.2319410876071166, | |
| "learning_rate": 1.0015183963161811e-09, | |
| "loss": 0.8631, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 2.9308452250274426, | |
| "grad_norm": 2.2205987624431143, | |
| "learning_rate": 7.359398886032653e-10, | |
| "loss": 0.8888, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 2.941822173435785, | |
| "grad_norm": 2.255451203517086, | |
| "learning_rate": 5.111460331214124e-10, | |
| "loss": 0.8845, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 2.9527991218441274, | |
| "grad_norm": 2.360197105909993, | |
| "learning_rate": 3.271736107015033e-10, | |
| "loss": 0.8999, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 2.96377607025247, | |
| "grad_norm": 2.2839001061008553, | |
| "learning_rate": 1.8405272296045937e-10, | |
| "loss": 0.8876, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.974753018660812, | |
| "grad_norm": 2.240462357412488, | |
| "learning_rate": 8.180678737629287e-11, | |
| "loss": 0.8671, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 2.9857299670691546, | |
| "grad_norm": 2.282162777030651, | |
| "learning_rate": 2.0452533456311037e-11, | |
| "loss": 0.875, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.996706915477497, | |
| "grad_norm": 2.320056946149888, | |
| "learning_rate": 0.0, | |
| "loss": 0.8919, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 2.996706915477497, | |
| "eval_loss": 0.9529117345809937, | |
| "eval_runtime": 3.471, | |
| "eval_samples_per_second": 77.211, | |
| "eval_steps_per_second": 1.44, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 2.996706915477497, | |
| "step": 1365, | |
| "total_flos": 571501770178560.0, | |
| "train_loss": 0.9509169136648213, | |
| "train_runtime": 15780.8295, | |
| "train_samples_per_second": 22.162, | |
| "train_steps_per_second": 0.086 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1365, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 571501770178560.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |