| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9992094861660079, |
| "eval_steps": 500, |
| "global_step": 948, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.001054018445322793, |
| "grad_norm": 0.39404706483981056, |
| "learning_rate": 2.105263157894737e-06, |
| "loss": 1.5147, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.005270092226613966, |
| "grad_norm": 0.40857024450725205, |
| "learning_rate": 1.0526315789473684e-05, |
| "loss": 1.4886, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.010540184453227932, |
| "grad_norm": 0.453453261928735, |
| "learning_rate": 2.105263157894737e-05, |
| "loss": 1.4972, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.015810276679841896, |
| "grad_norm": 0.3090923813519492, |
| "learning_rate": 3.157894736842105e-05, |
| "loss": 1.5027, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.021080368906455864, |
| "grad_norm": 0.22055096317618933, |
| "learning_rate": 4.210526315789474e-05, |
| "loss": 1.4481, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.026350461133069828, |
| "grad_norm": 0.17983601541246372, |
| "learning_rate": 5.2631578947368424e-05, |
| "loss": 1.4153, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.03162055335968379, |
| "grad_norm": 0.1874536292590189, |
| "learning_rate": 6.31578947368421e-05, |
| "loss": 1.3757, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.03689064558629776, |
| "grad_norm": 0.12837222377985866, |
| "learning_rate": 7.368421052631579e-05, |
| "loss": 1.3629, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.04216073781291173, |
| "grad_norm": 0.1498631297125495, |
| "learning_rate": 8.421052631578948e-05, |
| "loss": 1.3106, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.04743083003952569, |
| "grad_norm": 0.09189369689728918, |
| "learning_rate": 9.473684210526316e-05, |
| "loss": 1.3424, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.052700922266139656, |
| "grad_norm": 0.08752874670761675, |
| "learning_rate": 0.00010526315789473685, |
| "loss": 1.3074, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.057971014492753624, |
| "grad_norm": 0.08630425604693764, |
| "learning_rate": 0.00011578947368421053, |
| "loss": 1.2875, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.06324110671936758, |
| "grad_norm": 0.07094463605256358, |
| "learning_rate": 0.0001263157894736842, |
| "loss": 1.2732, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.06851119894598155, |
| "grad_norm": 0.07758160354755479, |
| "learning_rate": 0.0001368421052631579, |
| "loss": 1.2569, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.07378129117259552, |
| "grad_norm": 0.08435231995433538, |
| "learning_rate": 0.00014736842105263158, |
| "loss": 1.2585, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.07905138339920949, |
| "grad_norm": 0.08934371249105144, |
| "learning_rate": 0.00015789473684210527, |
| "loss": 1.2204, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.08432147562582346, |
| "grad_norm": 0.07972601302596631, |
| "learning_rate": 0.00016842105263157895, |
| "loss": 1.2439, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.08959156785243742, |
| "grad_norm": 0.07061187876194241, |
| "learning_rate": 0.00017894736842105264, |
| "loss": 1.241, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.09486166007905138, |
| "grad_norm": 0.05628834298654776, |
| "learning_rate": 0.00018947368421052632, |
| "loss": 1.2285, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.10013175230566534, |
| "grad_norm": 0.06883977860396077, |
| "learning_rate": 0.0002, |
| "loss": 1.2223, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.10540184453227931, |
| "grad_norm": 0.06410218479217554, |
| "learning_rate": 0.00019998304493640002, |
| "loss": 1.2036, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.11067193675889328, |
| "grad_norm": 0.05911226834818556, |
| "learning_rate": 0.00019993218549508364, |
| "loss": 1.2039, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.11594202898550725, |
| "grad_norm": 0.08109814152684167, |
| "learning_rate": 0.0001998474389225522, |
| "loss": 1.1754, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.12121212121212122, |
| "grad_norm": 0.07284116557802132, |
| "learning_rate": 0.00019972883395647615, |
| "loss": 1.2187, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.12648221343873517, |
| "grad_norm": 0.07120114794969945, |
| "learning_rate": 0.00019957641081595043, |
| "loss": 1.1874, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.13175230566534915, |
| "grad_norm": 0.07493327249468582, |
| "learning_rate": 0.0001993902211878558, |
| "loss": 1.1929, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.1370223978919631, |
| "grad_norm": 0.07074388935975784, |
| "learning_rate": 0.000199170328209332, |
| "loss": 1.2024, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.1422924901185771, |
| "grad_norm": 0.06546615696707399, |
| "learning_rate": 0.00019891680644636782, |
| "loss": 1.1766, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.14756258234519104, |
| "grad_norm": 0.07629909100315264, |
| "learning_rate": 0.00019862974186851548, |
| "loss": 1.1809, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.152832674571805, |
| "grad_norm": 0.07791403098733285, |
| "learning_rate": 0.0001983092318197385, |
| "loss": 1.1846, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.15810276679841898, |
| "grad_norm": 0.1194107102069711, |
| "learning_rate": 0.0001979553849854021, |
| "loss": 1.2033, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.16337285902503293, |
| "grad_norm": 0.07429281190181045, |
| "learning_rate": 0.00019756832135541796, |
| "loss": 1.2156, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.1686429512516469, |
| "grad_norm": 0.07778983439544027, |
| "learning_rate": 0.00019714817218355525, |
| "loss": 1.1861, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.17391304347826086, |
| "grad_norm": 0.0700122776712239, |
| "learning_rate": 0.00019669507994293266, |
| "loss": 1.177, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.17918313570487485, |
| "grad_norm": 0.07173341867164325, |
| "learning_rate": 0.0001962091982777053, |
| "loss": 1.1865, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.1844532279314888, |
| "grad_norm": 0.08131438952683417, |
| "learning_rate": 0.00019569069195096386, |
| "loss": 1.2056, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.18972332015810275, |
| "grad_norm": 0.07216710714795249, |
| "learning_rate": 0.0001951397367888633, |
| "loss": 1.1912, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.19499341238471674, |
| "grad_norm": 0.08765620347237527, |
| "learning_rate": 0.00019455651962099987, |
| "loss": 1.1766, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.2002635046113307, |
| "grad_norm": 0.0727173970892306, |
| "learning_rate": 0.00019394123821705713, |
| "loss": 1.19, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.20553359683794467, |
| "grad_norm": 0.10812659541318984, |
| "learning_rate": 0.0001932941012197417, |
| "loss": 1.1709, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.21080368906455862, |
| "grad_norm": 0.07720565285012457, |
| "learning_rate": 0.0001926153280740326, |
| "loss": 1.1792, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.2160737812911726, |
| "grad_norm": 0.08265713642469576, |
| "learning_rate": 0.00019190514895276687, |
| "loss": 1.1734, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.22134387351778656, |
| "grad_norm": 0.0841714036634629, |
| "learning_rate": 0.00019116380467858792, |
| "loss": 1.1692, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.22661396574440051, |
| "grad_norm": 0.0699684857746126, |
| "learning_rate": 0.00019039154664228213, |
| "loss": 1.1849, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.2318840579710145, |
| "grad_norm": 0.07747524061551442, |
| "learning_rate": 0.00018958863671753192, |
| "loss": 1.1777, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.23715415019762845, |
| "grad_norm": 0.06784812842240001, |
| "learning_rate": 0.000188755347172114, |
| "loss": 1.1843, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.24242424242424243, |
| "grad_norm": 0.06912548481398251, |
| "learning_rate": 0.00018789196057557325, |
| "loss": 1.185, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.24769433465085638, |
| "grad_norm": 0.06634358007027238, |
| "learning_rate": 0.00018699876970340278, |
| "loss": 1.1771, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.25296442687747034, |
| "grad_norm": 0.09303081500021829, |
| "learning_rate": 0.00018607607743776345, |
| "loss": 1.1625, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.25823451910408435, |
| "grad_norm": 0.09380407779557924, |
| "learning_rate": 0.0001851241966647762, |
| "loss": 1.1868, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.2635046113306983, |
| "grad_norm": 0.08035007580379026, |
| "learning_rate": 0.00018414345016842196, |
| "loss": 1.1755, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.26877470355731226, |
| "grad_norm": 0.0697105936452379, |
| "learning_rate": 0.00018313417052108513, |
| "loss": 1.1896, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.2740447957839262, |
| "grad_norm": 0.07028139619720467, |
| "learning_rate": 0.00018209669997077795, |
| "loss": 1.1867, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.27931488801054016, |
| "grad_norm": 0.08005551531383928, |
| "learning_rate": 0.0001810313903250837, |
| "loss": 1.172, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.2845849802371542, |
| "grad_norm": 0.06863200323099397, |
| "learning_rate": 0.0001799386028318583, |
| "loss": 1.1769, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.2898550724637681, |
| "grad_norm": 0.06181177265942269, |
| "learning_rate": 0.0001788187080567307, |
| "loss": 1.1501, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.2951251646903821, |
| "grad_norm": 0.07265216626352897, |
| "learning_rate": 0.00017767208575744368, |
| "loss": 1.1422, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.30039525691699603, |
| "grad_norm": 0.07072216223689005, |
| "learning_rate": 0.00017649912475507744, |
| "loss": 1.163, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.30566534914361, |
| "grad_norm": 0.06855120798778974, |
| "learning_rate": 0.00017530022280219987, |
| "loss": 1.165, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.310935441370224, |
| "grad_norm": 0.06589205274201403, |
| "learning_rate": 0.00017407578644798818, |
| "loss": 1.1832, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.31620553359683795, |
| "grad_norm": 0.06773952553025948, |
| "learning_rate": 0.0001728262309003676, |
| "loss": 1.1473, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.3214756258234519, |
| "grad_norm": 0.06360929861844895, |
| "learning_rate": 0.00017155197988521375, |
| "loss": 1.1745, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.32674571805006586, |
| "grad_norm": 0.09173249469158068, |
| "learning_rate": 0.00017025346550266667, |
| "loss": 1.1668, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.33201581027667987, |
| "grad_norm": 0.077917756437729, |
| "learning_rate": 0.00016893112808060527, |
| "loss": 1.1713, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.3372859025032938, |
| "grad_norm": 0.06755300520679709, |
| "learning_rate": 0.00016758541602533136, |
| "loss": 1.1486, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.3425559947299078, |
| "grad_norm": 0.06672353792622543, |
| "learning_rate": 0.0001662167856695146, |
| "loss": 1.1564, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.34782608695652173, |
| "grad_norm": 0.07337028048547245, |
| "learning_rate": 0.00016482570111744956, |
| "loss": 1.1724, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.3530961791831357, |
| "grad_norm": 0.08156438942818965, |
| "learning_rate": 0.00016341263408767732, |
| "loss": 1.1545, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.3583662714097497, |
| "grad_norm": 0.07740600091044697, |
| "learning_rate": 0.00016197806375302504, |
| "loss": 1.1554, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.36363636363636365, |
| "grad_norm": 0.07470926789946018, |
| "learning_rate": 0.00016052247657811805, |
| "loss": 1.1714, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.3689064558629776, |
| "grad_norm": 0.06531221589022533, |
| "learning_rate": 0.00015904636615441886, |
| "loss": 1.1798, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.37417654808959155, |
| "grad_norm": 0.06690445775706796, |
| "learning_rate": 0.00015755023303284972, |
| "loss": 1.1732, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.3794466403162055, |
| "grad_norm": 0.06742765638533584, |
| "learning_rate": 0.00015603458455405522, |
| "loss": 1.1547, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.3847167325428195, |
| "grad_norm": 0.06707326098047016, |
| "learning_rate": 0.00015449993467636248, |
| "loss": 1.1619, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.38998682476943347, |
| "grad_norm": 0.06593399580900151, |
| "learning_rate": 0.0001529468038014971, |
| "loss": 1.1524, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.3952569169960474, |
| "grad_norm": 0.0752943731160187, |
| "learning_rate": 0.00015137571859811426, |
| "loss": 1.1657, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.4005270092226614, |
| "grad_norm": 0.06613310198573072, |
| "learning_rate": 0.00014978721182320489, |
| "loss": 1.141, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.4057971014492754, |
| "grad_norm": 0.06461052548902453, |
| "learning_rate": 0.00014818182214143696, |
| "loss": 1.1428, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.41106719367588934, |
| "grad_norm": 0.0662307711187639, |
| "learning_rate": 0.00014656009394249357, |
| "loss": 1.1541, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.4163372859025033, |
| "grad_norm": 0.06622974227557915, |
| "learning_rate": 0.0001449225771564699, |
| "loss": 1.1605, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.42160737812911725, |
| "grad_norm": 0.06532140522118222, |
| "learning_rate": 0.0001432698270673909, |
| "loss": 1.1748, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.4268774703557312, |
| "grad_norm": 0.06519413205191296, |
| "learning_rate": 0.00014160240412491417, |
| "loss": 1.1855, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.4321475625823452, |
| "grad_norm": 0.065127665509989, |
| "learning_rate": 0.0001399208737542804, |
| "loss": 1.1627, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.43741765480895917, |
| "grad_norm": 0.06758425908458236, |
| "learning_rate": 0.00013822580616457722, |
| "loss": 1.1679, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.4426877470355731, |
| "grad_norm": 0.07290843170304945, |
| "learning_rate": 0.0001365177761553804, |
| "loss": 1.1652, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.4479578392621871, |
| "grad_norm": 0.0688402351797127, |
| "learning_rate": 0.0001347973629218387, |
| "loss": 1.156, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.45322793148880103, |
| "grad_norm": 0.06985990308743745, |
| "learning_rate": 0.00013306514985826794, |
| "loss": 1.1481, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.45849802371541504, |
| "grad_norm": 0.07511059181574245, |
| "learning_rate": 0.0001313217243603214, |
| "loss": 1.1672, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.463768115942029, |
| "grad_norm": 0.08242275823498015, |
| "learning_rate": 0.00012956767762580305, |
| "loss": 1.1635, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.46903820816864294, |
| "grad_norm": 0.0822523100333473, |
| "learning_rate": 0.00012780360445419165, |
| "loss": 1.1553, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.4743083003952569, |
| "grad_norm": 0.07104777230281678, |
| "learning_rate": 0.00012603010304494368, |
| "loss": 1.1604, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.4795783926218709, |
| "grad_norm": 0.07220370861806837, |
| "learning_rate": 0.0001242477747946429, |
| "loss": 1.1768, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.48484848484848486, |
| "grad_norm": 0.07512849348769339, |
| "learning_rate": 0.00012245722409306607, |
| "loss": 1.1687, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.4901185770750988, |
| "grad_norm": 0.06808362624998583, |
| "learning_rate": 0.00012065905811823372, |
| "loss": 1.148, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.49538866930171277, |
| "grad_norm": 0.07337281775749502, |
| "learning_rate": 0.00011885388663051515, |
| "loss": 1.17, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.5006587615283268, |
| "grad_norm": 0.06535144834985215, |
| "learning_rate": 0.00011704232176585801, |
| "loss": 1.1602, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.5059288537549407, |
| "grad_norm": 0.08038169398057385, |
| "learning_rate": 0.00011522497782821226, |
| "loss": 1.155, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.5111989459815547, |
| "grad_norm": 0.06948348795422524, |
| "learning_rate": 0.0001134024710812188, |
| "loss": 1.1466, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.5164690382081687, |
| "grad_norm": 0.06443312773095472, |
| "learning_rate": 0.00011157541953923369, |
| "loss": 1.1503, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.5217391304347826, |
| "grad_norm": 0.06658965218069979, |
| "learning_rate": 0.00010974444275775898, |
| "loss": 1.1512, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.5270092226613966, |
| "grad_norm": 0.06309555633899999, |
| "learning_rate": 0.00010791016162335012, |
| "loss": 1.1582, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.5322793148880105, |
| "grad_norm": 0.06263957199925645, |
| "learning_rate": 0.00010607319814307309, |
| "loss": 1.1495, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.5375494071146245, |
| "grad_norm": 0.06490327122238214, |
| "learning_rate": 0.00010423417523358061, |
| "loss": 1.14, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.5428194993412385, |
| "grad_norm": 0.06598136970382121, |
| "learning_rate": 0.00010239371650988061, |
| "loss": 1.1468, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.5480895915678524, |
| "grad_norm": 0.07318689438224826, |
| "learning_rate": 0.00010055244607386725, |
| "loss": 1.1743, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.5533596837944664, |
| "grad_norm": 0.06584183327376071, |
| "learning_rate": 9.871098830268751e-05, |
| "loss": 1.1508, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.5586297760210803, |
| "grad_norm": 0.06627963834560362, |
| "learning_rate": 9.686996763701401e-05, |
| "loss": 1.1541, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.5638998682476943, |
| "grad_norm": 0.07147807488637166, |
| "learning_rate": 9.503000836929617e-05, |
| "loss": 1.152, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.5691699604743083, |
| "grad_norm": 0.06945069029665552, |
| "learning_rate": 9.319173443206213e-05, |
| "loss": 1.17, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.5744400527009222, |
| "grad_norm": 0.06357283430309349, |
| "learning_rate": 9.135576918634231e-05, |
| "loss": 1.1345, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.5797101449275363, |
| "grad_norm": 0.0648394813973767, |
| "learning_rate": 8.952273521028682e-05, |
| "loss": 1.1682, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.5849802371541502, |
| "grad_norm": 0.06117798032020935, |
| "learning_rate": 8.769325408804864e-05, |
| "loss": 1.1479, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.5902503293807642, |
| "grad_norm": 0.06457098726991788, |
| "learning_rate": 8.586794619900335e-05, |
| "loss": 1.1372, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.5955204216073782, |
| "grad_norm": 0.06262155689743844, |
| "learning_rate": 8.404743050737797e-05, |
| "loss": 1.1424, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.6007905138339921, |
| "grad_norm": 0.06555228325330152, |
| "learning_rate": 8.22323243523592e-05, |
| "loss": 1.1653, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.6060606060606061, |
| "grad_norm": 0.06036031193443284, |
| "learning_rate": 8.042324323875306e-05, |
| "loss": 1.1578, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.61133069828722, |
| "grad_norm": 0.07045247796462163, |
| "learning_rate": 7.862080062826627e-05, |
| "loss": 1.1471, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.616600790513834, |
| "grad_norm": 0.06334737777469358, |
| "learning_rate": 7.682560773148075e-05, |
| "loss": 1.1378, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.621870882740448, |
| "grad_norm": 0.06550800893698057, |
| "learning_rate": 7.503827330059134e-05, |
| "loss": 1.1628, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.6271409749670619, |
| "grad_norm": 0.062246738751270736, |
| "learning_rate": 7.325940342297697e-05, |
| "loss": 1.1502, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.6324110671936759, |
| "grad_norm": 0.0732931801527146, |
| "learning_rate": 7.148960131567597e-05, |
| "loss": 1.1484, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.6376811594202898, |
| "grad_norm": 0.07313278054554617, |
| "learning_rate": 6.9729467120834e-05, |
| "loss": 1.1669, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.6429512516469038, |
| "grad_norm": 0.0720018824446065, |
| "learning_rate": 6.797959770219548e-05, |
| "loss": 1.1714, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.6482213438735178, |
| "grad_norm": 0.06521254410551518, |
| "learning_rate": 6.624058644270613e-05, |
| "loss": 1.1516, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.6534914361001317, |
| "grad_norm": 0.06326771011144902, |
| "learning_rate": 6.451302304329597e-05, |
| "loss": 1.1515, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.6587615283267457, |
| "grad_norm": 0.07081901979627483, |
| "learning_rate": 6.279749332291129e-05, |
| "loss": 1.1528, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.6640316205533597, |
| "grad_norm": 0.06636113517143104, |
| "learning_rate": 6.109457901986238e-05, |
| "loss": 1.1453, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.6693017127799736, |
| "grad_norm": 0.07398567475177478, |
| "learning_rate": 5.940485759455556e-05, |
| "loss": 1.1583, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.6745718050065876, |
| "grad_norm": 0.06667357818098503, |
| "learning_rate": 5.77289020336754e-05, |
| "loss": 1.1507, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.6798418972332015, |
| "grad_norm": 0.07109662430845431, |
| "learning_rate": 5.606728065588447e-05, |
| "loss": 1.1575, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.6851119894598156, |
| "grad_norm": 0.06409037384972226, |
| "learning_rate": 5.442055691910557e-05, |
| "loss": 1.1498, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.6903820816864296, |
| "grad_norm": 0.0671240827602366, |
| "learning_rate": 5.278928922945243e-05, |
| "loss": 1.1525, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.6956521739130435, |
| "grad_norm": 0.061226401388187794, |
| "learning_rate": 5.1174030751873604e-05, |
| "loss": 1.1419, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.7009222661396575, |
| "grad_norm": 0.06728191241600981, |
| "learning_rate": 4.9575329222573444e-05, |
| "loss": 1.1271, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.7061923583662714, |
| "grad_norm": 0.06517202399426715, |
| "learning_rate": 4.799372676327409e-05, |
| "loss": 1.1476, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.7114624505928854, |
| "grad_norm": 0.06161897030170573, |
| "learning_rate": 4.642975969738128e-05, |
| "loss": 1.1348, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.7167325428194994, |
| "grad_norm": 0.06558609233456765, |
| "learning_rate": 4.4883958368116444e-05, |
| "loss": 1.1742, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.7220026350461133, |
| "grad_norm": 0.06373257188725644, |
| "learning_rate": 4.335684695867652e-05, |
| "loss": 1.1532, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.7272727272727273, |
| "grad_norm": 0.0650407336427327, |
| "learning_rate": 4.184894331448305e-05, |
| "loss": 1.1394, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.7325428194993412, |
| "grad_norm": 0.06397771203208771, |
| "learning_rate": 4.036075876757981e-05, |
| "loss": 1.1497, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.7378129117259552, |
| "grad_norm": 0.06079746681576419, |
| "learning_rate": 3.889279796323951e-05, |
| "loss": 1.1534, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.7430830039525692, |
| "grad_norm": 0.06479753103269026, |
| "learning_rate": 3.744555868883828e-05, |
| "loss": 1.1497, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.7483530961791831, |
| "grad_norm": 0.06581609418762477, |
| "learning_rate": 3.60195317050549e-05, |
| "loss": 1.1535, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.7536231884057971, |
| "grad_norm": 0.06666358480682857, |
| "learning_rate": 3.461520057945349e-05, |
| "loss": 1.148, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.758893280632411, |
| "grad_norm": 0.06647663836880385, |
| "learning_rate": 3.323304152250504e-05, |
| "loss": 1.1371, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.764163372859025, |
| "grad_norm": 0.06594824599007991, |
| "learning_rate": 3.187352322610387e-05, |
| "loss": 1.1624, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.769433465085639, |
| "grad_norm": 0.06670953963578341, |
| "learning_rate": 3.0537106704633576e-05, |
| "loss": 1.1324, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.7747035573122529, |
| "grad_norm": 0.0648144007582327, |
| "learning_rate": 2.9224245138636563e-05, |
| "loss": 1.1496, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.7799736495388669, |
| "grad_norm": 0.0925144186806725, |
| "learning_rate": 2.793538372113983e-05, |
| "loss": 1.1518, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.7852437417654808, |
| "grad_norm": 0.06930233703557911, |
| "learning_rate": 2.6670959506689373e-05, |
| "loss": 1.1558, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.7905138339920948, |
| "grad_norm": 0.06382392263545923, |
| "learning_rate": 2.5431401263144773e-05, |
| "loss": 1.1524, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.7957839262187089, |
| "grad_norm": 0.062313014975029925, |
| "learning_rate": 2.4217129326283083e-05, |
| "loss": 1.1597, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.8010540184453228, |
| "grad_norm": 0.06230846793052726, |
| "learning_rate": 2.3028555457262648e-05, |
| "loss": 1.1686, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.8063241106719368, |
| "grad_norm": 0.06223876855085951, |
| "learning_rate": 2.186608270299434e-05, |
| "loss": 1.168, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.8115942028985508, |
| "grad_norm": 0.06420690983295349, |
| "learning_rate": 2.0730105259467768e-05, |
| "loss": 1.1348, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.8168642951251647, |
| "grad_norm": 0.06375634397653643, |
| "learning_rate": 1.9621008338079017e-05, |
| "loss": 1.1404, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.8221343873517787, |
| "grad_norm": 0.06571314443226632, |
| "learning_rate": 1.8539168035004862e-05, |
| "loss": 1.174, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.8274044795783926, |
| "grad_norm": 0.06575457012483443, |
| "learning_rate": 1.7484951203668265e-05, |
| "loss": 1.168, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.8326745718050066, |
| "grad_norm": 0.06445557120587224, |
| "learning_rate": 1.645871533033758e-05, |
| "loss": 1.1582, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.8379446640316206, |
| "grad_norm": 0.062499489435608076, |
| "learning_rate": 1.5460808412902915e-05, |
| "loss": 1.1613, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.8432147562582345, |
| "grad_norm": 0.06482158243635854, |
| "learning_rate": 1.4491568842869285e-05, |
| "loss": 1.1419, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.8484848484848485, |
| "grad_norm": 0.06231433585747808, |
| "learning_rate": 1.3551325290607741e-05, |
| "loss": 1.1418, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.8537549407114624, |
| "grad_norm": 0.0634776786608025, |
| "learning_rate": 1.264039659390287e-05, |
| "loss": 1.1453, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.8590250329380764, |
| "grad_norm": 0.06386493701128639, |
| "learning_rate": 1.1759091649834398e-05, |
| "loss": 1.1587, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.8642951251646904, |
| "grad_norm": 0.06316991877085024, |
| "learning_rate": 1.0907709310029912e-05, |
| "loss": 1.1448, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.8695652173913043, |
| "grad_norm": 0.0647662180858074, |
| "learning_rate": 1.0086538279323742e-05, |
| "loss": 1.1454, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.8748353096179183, |
| "grad_norm": 0.06320482976344752, |
| "learning_rate": 9.295857017857024e-06, |
| "loss": 1.1464, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.8801054018445322, |
| "grad_norm": 0.06412760146925889, |
| "learning_rate": 8.535933646651162e-06, |
| "loss": 1.14, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.8853754940711462, |
| "grad_norm": 0.062132072998396924, |
| "learning_rate": 7.807025856688034e-06, |
| "loss": 1.1531, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.8906455862977603, |
| "grad_norm": 0.061898725911480094, |
| "learning_rate": 7.109380821526346e-06, |
| "loss": 1.1528, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.8959156785243741, |
| "grad_norm": 0.06186789888197545, |
| "learning_rate": 6.443235113484947e-06, |
| "loss": 1.16, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.9011857707509882, |
| "grad_norm": 0.06098802414462668, |
| "learning_rate": 5.8088146234207555e-06, |
| "loss": 1.1336, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.9064558629776021, |
| "grad_norm": 0.06532077132983789, |
| "learning_rate": 5.2063344841289385e-06, |
| "loss": 1.1609, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.9117259552042161, |
| "grad_norm": 0.06361827494551503, |
| "learning_rate": 4.6359989973911e-06, |
| "loss": 1.1463, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.9169960474308301, |
| "grad_norm": 0.06301560746292482, |
| "learning_rate": 4.0980015646962185e-06, |
| "loss": 1.1532, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.922266139657444, |
| "grad_norm": 0.06361543297830634, |
| "learning_rate": 3.592524621658111e-06, |
| "loss": 1.1692, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.927536231884058, |
| "grad_norm": 0.06206734430080948, |
| "learning_rate": 3.119739576151082e-06, |
| "loss": 1.142, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.932806324110672, |
| "grad_norm": 0.062312182913471736, |
| "learning_rate": 2.679806750185465e-06, |
| "loss": 1.1541, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.9380764163372859, |
| "grad_norm": 0.06186965071031368, |
| "learning_rate": 2.272875325542145e-06, |
| "loss": 1.157, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.9433465085638999, |
| "grad_norm": 0.06556483912601045, |
| "learning_rate": 1.8990832931848224e-06, |
| "loss": 1.1346, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.9486166007905138, |
| "grad_norm": 0.06271951543094142, |
| "learning_rate": 1.5585574064671315e-06, |
| "loss": 1.1578, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.9538866930171278, |
| "grad_norm": 0.06249606932946689, |
| "learning_rate": 1.2514131381504256e-06, |
| "loss": 1.1394, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.9591567852437418, |
| "grad_norm": 0.06061803805618644, |
| "learning_rate": 9.77754641246753e-07, |
| "loss": 1.1632, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.9644268774703557, |
| "grad_norm": 0.06139174351996552, |
| "learning_rate": 7.376747137005202e-07, |
| "loss": 1.1395, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.9696969696969697, |
| "grad_norm": 0.06086778717293208, |
| "learning_rate": 5.312547669205526e-07, |
| "loss": 1.1657, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.9749670619235836, |
| "grad_norm": 0.06144011195020565, |
| "learning_rate": 3.585647981733442e-07, |
| "loss": 1.1427, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.9802371541501976, |
| "grad_norm": 0.06080565899530839, |
| "learning_rate": 2.196633668469783e-07, |
| "loss": 1.1487, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.9855072463768116, |
| "grad_norm": 0.06189880080446516, |
| "learning_rate": 1.1459757459350018e-07, |
| "loss": 1.1672, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.9907773386034255, |
| "grad_norm": 0.064621601430901, |
| "learning_rate": 4.3403049356693926e-08, |
| "loss": 1.1301, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.9960474308300395, |
| "grad_norm": 0.06340860836612439, |
| "learning_rate": 6.103933290624309e-09, |
| "loss": 1.157, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.9992094861660079, |
| "eval_loss": 1.1358933448791504, |
| "eval_runtime": 1187.7751, |
| "eval_samples_per_second": 5.631, |
| "eval_steps_per_second": 0.352, |
| "step": 948 |
| }, |
| { |
| "epoch": 0.9992094861660079, |
| "step": 948, |
| "total_flos": 2.384330639592653e+16, |
| "train_loss": 1.1779414593921935, |
| "train_runtime": 35791.4311, |
| "train_samples_per_second": 1.696, |
| "train_steps_per_second": 0.026 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 948, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 25, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.384330639592653e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|