{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 30.0, "eval_steps": 500, "global_step": 26250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11428571428571428, "grad_norm": 1.9886202812194824, "learning_rate": 0.00029886857142857144, "loss": 1.2287, "step": 100 }, { "epoch": 0.22857142857142856, "grad_norm": 1.3383790254592896, "learning_rate": 0.0002977257142857143, "loss": 1.0151, "step": 200 }, { "epoch": 0.34285714285714286, "grad_norm": 1.4753262996673584, "learning_rate": 0.0002965828571428571, "loss": 0.9255, "step": 300 }, { "epoch": 0.45714285714285713, "grad_norm": 1.6120792627334595, "learning_rate": 0.00029544, "loss": 0.9156, "step": 400 }, { "epoch": 0.5714285714285714, "grad_norm": 1.7478028535842896, "learning_rate": 0.00029429714285714284, "loss": 0.8849, "step": 500 }, { "epoch": 0.6857142857142857, "grad_norm": 1.4654852151870728, "learning_rate": 0.0002931542857142857, "loss": 0.847, "step": 600 }, { "epoch": 0.8, "grad_norm": 1.6262428760528564, "learning_rate": 0.0002920114285714285, "loss": 0.8322, "step": 700 }, { "epoch": 0.9142857142857143, "grad_norm": 1.2275465726852417, "learning_rate": 0.0002908685714285714, "loss": 0.8382, "step": 800 }, { "epoch": 1.0285714285714285, "grad_norm": 0.8290926218032837, "learning_rate": 0.00028972571428571424, "loss": 0.8152, "step": 900 }, { "epoch": 1.1428571428571428, "grad_norm": 1.5321491956710815, "learning_rate": 0.0002885828571428571, "loss": 0.766, "step": 1000 }, { "epoch": 1.2571428571428571, "grad_norm": 1.1432169675827026, "learning_rate": 0.00028743999999999997, "loss": 0.7438, "step": 1100 }, { "epoch": 1.3714285714285714, "grad_norm": 3.4796738624572754, "learning_rate": 0.00028629714285714286, "loss": 0.7422, "step": 1200 }, { "epoch": 1.4857142857142858, "grad_norm": 0.8977236747741699, "learning_rate": 0.0002851542857142857, "loss": 0.7047, "step": 1300 }, { "epoch": 1.6, "grad_norm": 0.8644102811813354, "learning_rate": 0.00028401142857142854, "loss": 0.7545, "step": 1400 }, { "epoch": 1.7142857142857144, "grad_norm": 1.1881957054138184, "learning_rate": 0.00028286857142857143, "loss": 0.7159, "step": 1500 }, { "epoch": 1.8285714285714287, "grad_norm": 1.2069973945617676, "learning_rate": 0.00028172571428571427, "loss": 0.6974, "step": 1600 }, { "epoch": 1.9428571428571428, "grad_norm": 1.1324719190597534, "learning_rate": 0.0002805828571428571, "loss": 0.7014, "step": 1700 }, { "epoch": 2.057142857142857, "grad_norm": 1.6258182525634766, "learning_rate": 0.00027944, "loss": 0.6531, "step": 1800 }, { "epoch": 2.1714285714285713, "grad_norm": 1.6277265548706055, "learning_rate": 0.00027829714285714283, "loss": 0.6167, "step": 1900 }, { "epoch": 2.2857142857142856, "grad_norm": 1.05016028881073, "learning_rate": 0.00027715428571428567, "loss": 0.6365, "step": 2000 }, { "epoch": 2.4, "grad_norm": 1.4809651374816895, "learning_rate": 0.00027601142857142856, "loss": 0.64, "step": 2100 }, { "epoch": 2.5142857142857142, "grad_norm": 0.9632180333137512, "learning_rate": 0.0002748685714285714, "loss": 0.631, "step": 2200 }, { "epoch": 2.6285714285714286, "grad_norm": 1.4610211849212646, "learning_rate": 0.00027372571428571423, "loss": 0.6183, "step": 2300 }, { "epoch": 2.742857142857143, "grad_norm": 0.95488440990448, "learning_rate": 0.0002725828571428571, "loss": 0.6116, "step": 2400 }, { "epoch": 2.857142857142857, "grad_norm": 1.4659332036972046, "learning_rate": 0.00027144, "loss": 0.6281, "step": 2500 }, { "epoch": 2.9714285714285715, "grad_norm": 1.0329852104187012, "learning_rate": 0.00027029714285714285, "loss": 0.6289, "step": 2600 }, { "epoch": 3.085714285714286, "grad_norm": 1.1207853555679321, "learning_rate": 0.0002691542857142857, "loss": 0.5658, "step": 2700 }, { "epoch": 3.2, "grad_norm": 1.1021409034729004, "learning_rate": 0.00026801142857142853, "loss": 0.5508, "step": 2800 }, { "epoch": 3.314285714285714, "grad_norm": 1.3904248476028442, "learning_rate": 0.0002668685714285714, "loss": 0.5705, "step": 2900 }, { "epoch": 3.4285714285714284, "grad_norm": 1.0779317617416382, "learning_rate": 0.00026572571428571426, "loss": 0.5586, "step": 3000 }, { "epoch": 3.5428571428571427, "grad_norm": 1.5445743799209595, "learning_rate": 0.0002645828571428571, "loss": 0.5457, "step": 3100 }, { "epoch": 3.657142857142857, "grad_norm": 1.100846290588379, "learning_rate": 0.00026344, "loss": 0.5495, "step": 3200 }, { "epoch": 3.7714285714285714, "grad_norm": 1.7807058095932007, "learning_rate": 0.0002622971428571428, "loss": 0.5752, "step": 3300 }, { "epoch": 3.8857142857142857, "grad_norm": 1.18954336643219, "learning_rate": 0.00026115428571428566, "loss": 0.5443, "step": 3400 }, { "epoch": 4.0, "grad_norm": 1.0324475765228271, "learning_rate": 0.00026001142857142855, "loss": 0.5344, "step": 3500 }, { "epoch": 4.114285714285714, "grad_norm": 1.2776315212249756, "learning_rate": 0.0002588685714285714, "loss": 0.4963, "step": 3600 }, { "epoch": 4.228571428571429, "grad_norm": 1.8203849792480469, "learning_rate": 0.0002577257142857142, "loss": 0.4796, "step": 3700 }, { "epoch": 4.3428571428571425, "grad_norm": 1.0043076276779175, "learning_rate": 0.0002565828571428571, "loss": 0.4828, "step": 3800 }, { "epoch": 4.457142857142857, "grad_norm": 1.506948471069336, "learning_rate": 0.00025544, "loss": 0.5112, "step": 3900 }, { "epoch": 4.571428571428571, "grad_norm": 1.1585667133331299, "learning_rate": 0.00025429714285714284, "loss": 0.4957, "step": 4000 }, { "epoch": 4.685714285714286, "grad_norm": 1.3756364583969116, "learning_rate": 0.0002531542857142857, "loss": 0.5106, "step": 4100 }, { "epoch": 4.8, "grad_norm": 0.9732717871665955, "learning_rate": 0.00025201142857142857, "loss": 0.483, "step": 4200 }, { "epoch": 4.914285714285715, "grad_norm": 1.2413655519485474, "learning_rate": 0.0002508685714285714, "loss": 0.5, "step": 4300 }, { "epoch": 5.0285714285714285, "grad_norm": 1.0629197359085083, "learning_rate": 0.00024972571428571425, "loss": 0.4751, "step": 4400 }, { "epoch": 5.142857142857143, "grad_norm": 1.3967541456222534, "learning_rate": 0.00024858285714285714, "loss": 0.4427, "step": 4500 }, { "epoch": 5.257142857142857, "grad_norm": 1.1337262392044067, "learning_rate": 0.00024744, "loss": 0.4413, "step": 4600 }, { "epoch": 5.371428571428572, "grad_norm": 1.4365642070770264, "learning_rate": 0.0002462971428571428, "loss": 0.4342, "step": 4700 }, { "epoch": 5.485714285714286, "grad_norm": 1.1386431455612183, "learning_rate": 0.0002451542857142857, "loss": 0.4302, "step": 4800 }, { "epoch": 5.6, "grad_norm": 1.6757099628448486, "learning_rate": 0.00024401142857142854, "loss": 0.448, "step": 4900 }, { "epoch": 5.714285714285714, "grad_norm": 1.2465338706970215, "learning_rate": 0.00024286857142857143, "loss": 0.4401, "step": 5000 }, { "epoch": 5.828571428571428, "grad_norm": 1.482226014137268, "learning_rate": 0.00024172571428571427, "loss": 0.4613, "step": 5100 }, { "epoch": 5.942857142857143, "grad_norm": 2.177501678466797, "learning_rate": 0.00024058285714285713, "loss": 0.4364, "step": 5200 }, { "epoch": 6.057142857142857, "grad_norm": 1.069575309753418, "learning_rate": 0.00023944, "loss": 0.4228, "step": 5300 }, { "epoch": 6.171428571428572, "grad_norm": 1.0357013940811157, "learning_rate": 0.00023829714285714283, "loss": 0.3895, "step": 5400 }, { "epoch": 6.285714285714286, "grad_norm": 1.3480335474014282, "learning_rate": 0.0002371542857142857, "loss": 0.4087, "step": 5500 }, { "epoch": 6.4, "grad_norm": 1.4253255128860474, "learning_rate": 0.00023601142857142856, "loss": 0.4058, "step": 5600 }, { "epoch": 6.514285714285714, "grad_norm": 1.293028712272644, "learning_rate": 0.0002348685714285714, "loss": 0.4066, "step": 5700 }, { "epoch": 6.628571428571428, "grad_norm": 1.0373749732971191, "learning_rate": 0.00023372571428571426, "loss": 0.4091, "step": 5800 }, { "epoch": 6.742857142857143, "grad_norm": 1.2542062997817993, "learning_rate": 0.0002325828571428571, "loss": 0.3926, "step": 5900 }, { "epoch": 6.857142857142857, "grad_norm": 1.4273713827133179, "learning_rate": 0.00023143999999999997, "loss": 0.3899, "step": 6000 }, { "epoch": 6.9714285714285715, "grad_norm": 1.169870376586914, "learning_rate": 0.00023029714285714283, "loss": 0.4092, "step": 6100 }, { "epoch": 7.085714285714285, "grad_norm": 1.1924947500228882, "learning_rate": 0.00022915428571428567, "loss": 0.3754, "step": 6200 }, { "epoch": 7.2, "grad_norm": 1.349892497062683, "learning_rate": 0.00022801142857142856, "loss": 0.3537, "step": 6300 }, { "epoch": 7.314285714285714, "grad_norm": 1.2282274961471558, "learning_rate": 0.00022686857142857142, "loss": 0.3639, "step": 6400 }, { "epoch": 7.428571428571429, "grad_norm": 1.1712229251861572, "learning_rate": 0.0002257257142857143, "loss": 0.3688, "step": 6500 }, { "epoch": 7.542857142857143, "grad_norm": 2.2879111766815186, "learning_rate": 0.00022458285714285712, "loss": 0.3559, "step": 6600 }, { "epoch": 7.6571428571428575, "grad_norm": 1.1137748956680298, "learning_rate": 0.00022344, "loss": 0.3574, "step": 6700 }, { "epoch": 7.771428571428571, "grad_norm": 1.289184808731079, "learning_rate": 0.00022229714285714285, "loss": 0.3653, "step": 6800 }, { "epoch": 7.885714285714286, "grad_norm": 1.6281535625457764, "learning_rate": 0.0002211542857142857, "loss": 0.3476, "step": 6900 }, { "epoch": 8.0, "grad_norm": 1.3460326194763184, "learning_rate": 0.00022001142857142855, "loss": 0.3606, "step": 7000 }, { "epoch": 8.114285714285714, "grad_norm": 1.2988694906234741, "learning_rate": 0.0002188685714285714, "loss": 0.3064, "step": 7100 }, { "epoch": 8.228571428571428, "grad_norm": 1.059658408164978, "learning_rate": 0.00021772571428571426, "loss": 0.333, "step": 7200 }, { "epoch": 8.342857142857143, "grad_norm": 1.6318204402923584, "learning_rate": 0.00021658285714285712, "loss": 0.3313, "step": 7300 }, { "epoch": 8.457142857142857, "grad_norm": 1.5393931865692139, "learning_rate": 0.00021543999999999996, "loss": 0.3133, "step": 7400 }, { "epoch": 8.571428571428571, "grad_norm": 1.6614785194396973, "learning_rate": 0.00021429714285714282, "loss": 0.3393, "step": 7500 }, { "epoch": 8.685714285714285, "grad_norm": 1.206910490989685, "learning_rate": 0.0002131542857142857, "loss": 0.3279, "step": 7600 }, { "epoch": 8.8, "grad_norm": 1.3397912979125977, "learning_rate": 0.00021201142857142858, "loss": 0.3342, "step": 7700 }, { "epoch": 8.914285714285715, "grad_norm": 1.1242660284042358, "learning_rate": 0.0002108685714285714, "loss": 0.3237, "step": 7800 }, { "epoch": 9.028571428571428, "grad_norm": 1.088550329208374, "learning_rate": 0.00020972571428571428, "loss": 0.3128, "step": 7900 }, { "epoch": 9.142857142857142, "grad_norm": 1.238797664642334, "learning_rate": 0.00020858285714285714, "loss": 0.2904, "step": 8000 }, { "epoch": 9.257142857142856, "grad_norm": 1.502020239830017, "learning_rate": 0.00020743999999999998, "loss": 0.2967, "step": 8100 }, { "epoch": 9.371428571428572, "grad_norm": 1.6706186532974243, "learning_rate": 0.00020629714285714284, "loss": 0.2889, "step": 8200 }, { "epoch": 9.485714285714286, "grad_norm": 1.4780162572860718, "learning_rate": 0.00020515428571428568, "loss": 0.3039, "step": 8300 }, { "epoch": 9.6, "grad_norm": 1.3221495151519775, "learning_rate": 0.00020401142857142854, "loss": 0.3087, "step": 8400 }, { "epoch": 9.714285714285714, "grad_norm": 1.9556879997253418, "learning_rate": 0.0002028685714285714, "loss": 0.3006, "step": 8500 }, { "epoch": 9.82857142857143, "grad_norm": 2.5484800338745117, "learning_rate": 0.00020172571428571425, "loss": 0.2918, "step": 8600 }, { "epoch": 9.942857142857143, "grad_norm": 1.202545166015625, "learning_rate": 0.0002005828571428571, "loss": 0.2955, "step": 8700 }, { "epoch": 10.057142857142857, "grad_norm": 1.2312058210372925, "learning_rate": 0.00019943999999999997, "loss": 0.2711, "step": 8800 }, { "epoch": 10.17142857142857, "grad_norm": 0.9658439755439758, "learning_rate": 0.00019829714285714287, "loss": 0.2536, "step": 8900 }, { "epoch": 10.285714285714286, "grad_norm": 1.4016692638397217, "learning_rate": 0.0001971542857142857, "loss": 0.2712, "step": 9000 }, { "epoch": 10.4, "grad_norm": 1.0595059394836426, "learning_rate": 0.00019601142857142857, "loss": 0.2631, "step": 9100 }, { "epoch": 10.514285714285714, "grad_norm": 1.6047881841659546, "learning_rate": 0.00019486857142857143, "loss": 0.2755, "step": 9200 }, { "epoch": 10.628571428571428, "grad_norm": 1.8473031520843506, "learning_rate": 0.00019372571428571427, "loss": 0.2783, "step": 9300 }, { "epoch": 10.742857142857144, "grad_norm": 1.9216639995574951, "learning_rate": 0.00019258285714285713, "loss": 0.2734, "step": 9400 }, { "epoch": 10.857142857142858, "grad_norm": 1.0926100015640259, "learning_rate": 0.00019143999999999997, "loss": 0.2778, "step": 9500 }, { "epoch": 10.971428571428572, "grad_norm": 1.4277852773666382, "learning_rate": 0.00019029714285714283, "loss": 0.2678, "step": 9600 }, { "epoch": 11.085714285714285, "grad_norm": 1.273311734199524, "learning_rate": 0.0001891542857142857, "loss": 0.2468, "step": 9700 }, { "epoch": 11.2, "grad_norm": 1.50382399559021, "learning_rate": 0.00018801142857142854, "loss": 0.2328, "step": 9800 }, { "epoch": 11.314285714285715, "grad_norm": 2.2691490650177, "learning_rate": 0.0001868685714285714, "loss": 0.246, "step": 9900 }, { "epoch": 11.428571428571429, "grad_norm": 1.2265568971633911, "learning_rate": 0.00018572571428571426, "loss": 0.2492, "step": 10000 }, { "epoch": 11.542857142857143, "grad_norm": 1.1587599515914917, "learning_rate": 0.0001845828571428571, "loss": 0.2453, "step": 10100 }, { "epoch": 11.657142857142857, "grad_norm": 1.1462079286575317, "learning_rate": 0.00018344, "loss": 0.253, "step": 10200 }, { "epoch": 11.771428571428572, "grad_norm": 1.9905078411102295, "learning_rate": 0.00018229714285714286, "loss": 0.2556, "step": 10300 }, { "epoch": 11.885714285714286, "grad_norm": 1.667157769203186, "learning_rate": 0.00018115428571428572, "loss": 0.249, "step": 10400 }, { "epoch": 12.0, "grad_norm": 1.08194899559021, "learning_rate": 0.00018001142857142856, "loss": 0.2404, "step": 10500 }, { "epoch": 12.114285714285714, "grad_norm": 1.2508606910705566, "learning_rate": 0.00017886857142857142, "loss": 0.2171, "step": 10600 }, { "epoch": 12.228571428571428, "grad_norm": 1.10196053981781, "learning_rate": 0.00017772571428571426, "loss": 0.2162, "step": 10700 }, { "epoch": 12.342857142857143, "grad_norm": 1.325040340423584, "learning_rate": 0.00017658285714285712, "loss": 0.2165, "step": 10800 }, { "epoch": 12.457142857142857, "grad_norm": 1.4882842302322388, "learning_rate": 0.00017544, "loss": 0.2207, "step": 10900 }, { "epoch": 12.571428571428571, "grad_norm": 1.2574632167816162, "learning_rate": 0.00017429714285714282, "loss": 0.2292, "step": 11000 }, { "epoch": 12.685714285714285, "grad_norm": 1.5161538124084473, "learning_rate": 0.0001731542857142857, "loss": 0.2216, "step": 11100 }, { "epoch": 12.8, "grad_norm": 1.5018984079360962, "learning_rate": 0.00017201142857142855, "loss": 0.2273, "step": 11200 }, { "epoch": 12.914285714285715, "grad_norm": 1.2884104251861572, "learning_rate": 0.0001708685714285714, "loss": 0.2243, "step": 11300 }, { "epoch": 13.028571428571428, "grad_norm": 1.378460168838501, "learning_rate": 0.00016972571428571428, "loss": 0.225, "step": 11400 }, { "epoch": 13.142857142857142, "grad_norm": 1.5688245296478271, "learning_rate": 0.00016858285714285715, "loss": 0.1992, "step": 11500 }, { "epoch": 13.257142857142856, "grad_norm": 1.3006786108016968, "learning_rate": 0.00016744, "loss": 0.2068, "step": 11600 }, { "epoch": 13.371428571428572, "grad_norm": 1.047890543937683, "learning_rate": 0.00016629714285714285, "loss": 0.1957, "step": 11700 }, { "epoch": 13.485714285714286, "grad_norm": 0.9967881441116333, "learning_rate": 0.0001651542857142857, "loss": 0.2058, "step": 11800 }, { "epoch": 13.6, "grad_norm": 1.4264742136001587, "learning_rate": 0.00016401142857142855, "loss": 0.2063, "step": 11900 }, { "epoch": 13.714285714285714, "grad_norm": 1.3013545274734497, "learning_rate": 0.0001628685714285714, "loss": 0.2032, "step": 12000 }, { "epoch": 13.82857142857143, "grad_norm": 1.3055994510650635, "learning_rate": 0.00016172571428571428, "loss": 0.2162, "step": 12100 }, { "epoch": 13.942857142857143, "grad_norm": 1.4893743991851807, "learning_rate": 0.00016058285714285711, "loss": 0.2081, "step": 12200 }, { "epoch": 14.057142857142857, "grad_norm": 1.4137383699417114, "learning_rate": 0.00015943999999999998, "loss": 0.1841, "step": 12300 }, { "epoch": 14.17142857142857, "grad_norm": 1.8859280347824097, "learning_rate": 0.00015829714285714284, "loss": 0.1857, "step": 12400 }, { "epoch": 14.285714285714286, "grad_norm": 1.5282500982284546, "learning_rate": 0.00015715428571428568, "loss": 0.1904, "step": 12500 }, { "epoch": 14.4, "grad_norm": 0.9001047015190125, "learning_rate": 0.00015601142857142854, "loss": 0.1853, "step": 12600 }, { "epoch": 14.514285714285714, "grad_norm": 1.1927658319473267, "learning_rate": 0.00015486857142857143, "loss": 0.1874, "step": 12700 }, { "epoch": 14.628571428571428, "grad_norm": 1.1758664846420288, "learning_rate": 0.0001537257142857143, "loss": 0.1754, "step": 12800 }, { "epoch": 14.742857142857144, "grad_norm": 1.1734734773635864, "learning_rate": 0.00015258285714285714, "loss": 0.1868, "step": 12900 }, { "epoch": 14.857142857142858, "grad_norm": 0.8678969740867615, "learning_rate": 0.00015144, "loss": 0.1795, "step": 13000 }, { "epoch": 14.971428571428572, "grad_norm": 2.2901735305786133, "learning_rate": 0.00015029714285714284, "loss": 0.1965, "step": 13100 }, { "epoch": 15.085714285714285, "grad_norm": 1.0252338647842407, "learning_rate": 0.0001491542857142857, "loss": 0.1649, "step": 13200 }, { "epoch": 15.2, "grad_norm": 1.1025043725967407, "learning_rate": 0.00014801142857142857, "loss": 0.1722, "step": 13300 }, { "epoch": 15.314285714285715, "grad_norm": 1.2872519493103027, "learning_rate": 0.0001468685714285714, "loss": 0.1694, "step": 13400 }, { "epoch": 15.428571428571429, "grad_norm": 2.815004348754883, "learning_rate": 0.00014572571428571427, "loss": 0.1758, "step": 13500 }, { "epoch": 15.542857142857143, "grad_norm": 1.7336875200271606, "learning_rate": 0.00014458285714285713, "loss": 0.1698, "step": 13600 }, { "epoch": 15.657142857142857, "grad_norm": 1.5906660556793213, "learning_rate": 0.00014344, "loss": 0.1554, "step": 13700 }, { "epoch": 15.771428571428572, "grad_norm": 1.0536751747131348, "learning_rate": 0.00014229714285714286, "loss": 0.1667, "step": 13800 }, { "epoch": 15.885714285714286, "grad_norm": 2.1150689125061035, "learning_rate": 0.0001411542857142857, "loss": 0.1725, "step": 13900 }, { "epoch": 16.0, "grad_norm": 2.3918333053588867, "learning_rate": 0.00014001142857142856, "loss": 0.1855, "step": 14000 }, { "epoch": 16.114285714285714, "grad_norm": 1.5179935693740845, "learning_rate": 0.00013886857142857143, "loss": 0.1509, "step": 14100 }, { "epoch": 16.228571428571428, "grad_norm": 1.0404243469238281, "learning_rate": 0.00013772571428571426, "loss": 0.1545, "step": 14200 }, { "epoch": 16.34285714285714, "grad_norm": 1.7319324016571045, "learning_rate": 0.00013658285714285713, "loss": 0.1553, "step": 14300 }, { "epoch": 16.457142857142856, "grad_norm": 1.694320797920227, "learning_rate": 0.00013544, "loss": 0.1543, "step": 14400 }, { "epoch": 16.571428571428573, "grad_norm": 1.1056307554244995, "learning_rate": 0.00013429714285714285, "loss": 0.1639, "step": 14500 }, { "epoch": 16.685714285714287, "grad_norm": 1.5444873571395874, "learning_rate": 0.0001331542857142857, "loss": 0.1575, "step": 14600 }, { "epoch": 16.8, "grad_norm": 1.3893969058990479, "learning_rate": 0.00013201142857142856, "loss": 0.163, "step": 14700 }, { "epoch": 16.914285714285715, "grad_norm": 1.6132880449295044, "learning_rate": 0.00013086857142857142, "loss": 0.1537, "step": 14800 }, { "epoch": 17.02857142857143, "grad_norm": 1.5396114587783813, "learning_rate": 0.00012972571428571426, "loss": 0.1548, "step": 14900 }, { "epoch": 17.142857142857142, "grad_norm": 1.0118181705474854, "learning_rate": 0.00012858285714285715, "loss": 0.1417, "step": 15000 }, { "epoch": 17.257142857142856, "grad_norm": 1.0827256441116333, "learning_rate": 0.00012743999999999999, "loss": 0.1418, "step": 15100 }, { "epoch": 17.37142857142857, "grad_norm": 1.5309821367263794, "learning_rate": 0.00012629714285714285, "loss": 0.1375, "step": 15200 }, { "epoch": 17.485714285714284, "grad_norm": 1.1401481628417969, "learning_rate": 0.00012515428571428571, "loss": 0.1465, "step": 15300 }, { "epoch": 17.6, "grad_norm": 1.150075912475586, "learning_rate": 0.00012401142857142855, "loss": 0.1421, "step": 15400 }, { "epoch": 17.714285714285715, "grad_norm": 1.1666033267974854, "learning_rate": 0.00012286857142857142, "loss": 0.1527, "step": 15500 }, { "epoch": 17.82857142857143, "grad_norm": 0.8309689164161682, "learning_rate": 0.00012172571428571428, "loss": 0.1505, "step": 15600 }, { "epoch": 17.942857142857143, "grad_norm": 1.5989408493041992, "learning_rate": 0.00012058285714285713, "loss": 0.1436, "step": 15700 }, { "epoch": 18.057142857142857, "grad_norm": 0.6489440202713013, "learning_rate": 0.00011944, "loss": 0.1375, "step": 15800 }, { "epoch": 18.17142857142857, "grad_norm": 1.6196086406707764, "learning_rate": 0.00011829714285714285, "loss": 0.1352, "step": 15900 }, { "epoch": 18.285714285714285, "grad_norm": 1.2241395711898804, "learning_rate": 0.0001171542857142857, "loss": 0.1313, "step": 16000 }, { "epoch": 18.4, "grad_norm": 0.9825499653816223, "learning_rate": 0.00011601142857142856, "loss": 0.1346, "step": 16100 }, { "epoch": 18.514285714285712, "grad_norm": 1.8467905521392822, "learning_rate": 0.00011486857142857142, "loss": 0.1334, "step": 16200 }, { "epoch": 18.62857142857143, "grad_norm": 1.0591373443603516, "learning_rate": 0.00011372571428571428, "loss": 0.1392, "step": 16300 }, { "epoch": 18.742857142857144, "grad_norm": 2.0883259773254395, "learning_rate": 0.00011258285714285714, "loss": 0.134, "step": 16400 }, { "epoch": 18.857142857142858, "grad_norm": 1.0738496780395508, "learning_rate": 0.00011143999999999999, "loss": 0.1338, "step": 16500 }, { "epoch": 18.97142857142857, "grad_norm": 1.3434749841690063, "learning_rate": 0.00011029714285714284, "loss": 0.1296, "step": 16600 }, { "epoch": 19.085714285714285, "grad_norm": 0.6826351284980774, "learning_rate": 0.0001091542857142857, "loss": 0.1221, "step": 16700 }, { "epoch": 19.2, "grad_norm": 1.7247623205184937, "learning_rate": 0.00010801142857142856, "loss": 0.1212, "step": 16800 }, { "epoch": 19.314285714285713, "grad_norm": 1.4077801704406738, "learning_rate": 0.00010686857142857142, "loss": 0.1204, "step": 16900 }, { "epoch": 19.428571428571427, "grad_norm": 0.789215624332428, "learning_rate": 0.00010572571428571428, "loss": 0.1184, "step": 17000 }, { "epoch": 19.542857142857144, "grad_norm": 2.175265073776245, "learning_rate": 0.00010458285714285713, "loss": 0.1231, "step": 17100 }, { "epoch": 19.65714285714286, "grad_norm": 1.1125848293304443, "learning_rate": 0.00010343999999999999, "loss": 0.1227, "step": 17200 }, { "epoch": 19.771428571428572, "grad_norm": 1.0844342708587646, "learning_rate": 0.00010229714285714285, "loss": 0.1255, "step": 17300 }, { "epoch": 19.885714285714286, "grad_norm": 0.6973736882209778, "learning_rate": 0.0001011542857142857, "loss": 0.1277, "step": 17400 }, { "epoch": 20.0, "grad_norm": 0.9680613279342651, "learning_rate": 0.00010001142857142856, "loss": 0.1254, "step": 17500 }, { "epoch": 20.114285714285714, "grad_norm": 1.4217584133148193, "learning_rate": 9.886857142857143e-05, "loss": 0.1108, "step": 17600 }, { "epoch": 20.228571428571428, "grad_norm": 1.2597243785858154, "learning_rate": 9.772571428571428e-05, "loss": 0.1209, "step": 17700 }, { "epoch": 20.34285714285714, "grad_norm": 1.3436779975891113, "learning_rate": 9.658285714285713e-05, "loss": 0.1206, "step": 17800 }, { "epoch": 20.457142857142856, "grad_norm": 1.175439715385437, "learning_rate": 9.544e-05, "loss": 0.1164, "step": 17900 }, { "epoch": 20.571428571428573, "grad_norm": 1.3990012407302856, "learning_rate": 9.429714285714284e-05, "loss": 0.1144, "step": 18000 }, { "epoch": 20.685714285714287, "grad_norm": 1.0105007886886597, "learning_rate": 9.31542857142857e-05, "loss": 0.1135, "step": 18100 }, { "epoch": 20.8, "grad_norm": 1.1308010816574097, "learning_rate": 9.201142857142857e-05, "loss": 0.1141, "step": 18200 }, { "epoch": 20.914285714285715, "grad_norm": 0.7414535284042358, "learning_rate": 9.086857142857142e-05, "loss": 0.1208, "step": 18300 }, { "epoch": 21.02857142857143, "grad_norm": 0.8291124701499939, "learning_rate": 8.972571428571427e-05, "loss": 0.1049, "step": 18400 }, { "epoch": 21.142857142857142, "grad_norm": 0.7733851671218872, "learning_rate": 8.858285714285714e-05, "loss": 0.1071, "step": 18500 }, { "epoch": 21.257142857142856, "grad_norm": 2.2193784713745117, "learning_rate": 8.743999999999999e-05, "loss": 0.102, "step": 18600 }, { "epoch": 21.37142857142857, "grad_norm": 1.0695987939834595, "learning_rate": 8.629714285714284e-05, "loss": 0.106, "step": 18700 }, { "epoch": 21.485714285714284, "grad_norm": 1.0461671352386475, "learning_rate": 8.515428571428572e-05, "loss": 0.107, "step": 18800 }, { "epoch": 21.6, "grad_norm": 1.5757765769958496, "learning_rate": 8.401142857142857e-05, "loss": 0.106, "step": 18900 }, { "epoch": 21.714285714285715, "grad_norm": 1.9472708702087402, "learning_rate": 8.286857142857142e-05, "loss": 0.108, "step": 19000 }, { "epoch": 21.82857142857143, "grad_norm": 0.9918070435523987, "learning_rate": 8.172571428571428e-05, "loss": 0.1152, "step": 19100 }, { "epoch": 21.942857142857143, "grad_norm": 1.0311241149902344, "learning_rate": 8.058285714285713e-05, "loss": 0.1047, "step": 19200 }, { "epoch": 22.057142857142857, "grad_norm": 1.0554375648498535, "learning_rate": 7.943999999999998e-05, "loss": 0.1076, "step": 19300 }, { "epoch": 22.17142857142857, "grad_norm": 0.7874680757522583, "learning_rate": 7.829714285714286e-05, "loss": 0.0997, "step": 19400 }, { "epoch": 22.285714285714285, "grad_norm": 1.1105536222457886, "learning_rate": 7.715428571428571e-05, "loss": 0.1, "step": 19500 }, { "epoch": 22.4, "grad_norm": 1.0579336881637573, "learning_rate": 7.601142857142856e-05, "loss": 0.0963, "step": 19600 }, { "epoch": 22.514285714285712, "grad_norm": 1.0726921558380127, "learning_rate": 7.486857142857143e-05, "loss": 0.1005, "step": 19700 }, { "epoch": 22.62857142857143, "grad_norm": 1.0421086549758911, "learning_rate": 7.372571428571428e-05, "loss": 0.0976, "step": 19800 }, { "epoch": 22.742857142857144, "grad_norm": 1.198748230934143, "learning_rate": 7.258285714285714e-05, "loss": 0.1059, "step": 19900 }, { "epoch": 22.857142857142858, "grad_norm": 1.479467749595642, "learning_rate": 7.144e-05, "loss": 0.0993, "step": 20000 }, { "epoch": 22.97142857142857, "grad_norm": 1.1370179653167725, "learning_rate": 7.029714285714284e-05, "loss": 0.1021, "step": 20100 }, { "epoch": 23.085714285714285, "grad_norm": 0.9663624167442322, "learning_rate": 6.915428571428571e-05, "loss": 0.0956, "step": 20200 }, { "epoch": 23.2, "grad_norm": 0.6943888664245605, "learning_rate": 6.801142857142857e-05, "loss": 0.0994, "step": 20300 }, { "epoch": 23.314285714285713, "grad_norm": 1.3604183197021484, "learning_rate": 6.686857142857142e-05, "loss": 0.0963, "step": 20400 }, { "epoch": 23.428571428571427, "grad_norm": 1.3610256910324097, "learning_rate": 6.572571428571427e-05, "loss": 0.0956, "step": 20500 }, { "epoch": 23.542857142857144, "grad_norm": 1.2277686595916748, "learning_rate": 6.458285714285714e-05, "loss": 0.0953, "step": 20600 }, { "epoch": 23.65714285714286, "grad_norm": 1.1448625326156616, "learning_rate": 6.343999999999999e-05, "loss": 0.0954, "step": 20700 }, { "epoch": 23.771428571428572, "grad_norm": 0.8833436369895935, "learning_rate": 6.229714285714285e-05, "loss": 0.0968, "step": 20800 }, { "epoch": 23.885714285714286, "grad_norm": 1.0425817966461182, "learning_rate": 6.115428571428572e-05, "loss": 0.094, "step": 20900 }, { "epoch": 24.0, "grad_norm": 1.7617619037628174, "learning_rate": 6.001142857142857e-05, "loss": 0.098, "step": 21000 }, { "epoch": 24.114285714285714, "grad_norm": 0.7041512131690979, "learning_rate": 5.886857142857142e-05, "loss": 0.0891, "step": 21100 }, { "epoch": 24.228571428571428, "grad_norm": 1.3548294305801392, "learning_rate": 5.772571428571428e-05, "loss": 0.0937, "step": 21200 }, { "epoch": 24.34285714285714, "grad_norm": 1.0486685037612915, "learning_rate": 5.658285714285714e-05, "loss": 0.0947, "step": 21300 }, { "epoch": 24.457142857142856, "grad_norm": 1.6215624809265137, "learning_rate": 5.543999999999999e-05, "loss": 0.0934, "step": 21400 }, { "epoch": 24.571428571428573, "grad_norm": 1.0335862636566162, "learning_rate": 5.4297142857142855e-05, "loss": 0.088, "step": 21500 }, { "epoch": 24.685714285714287, "grad_norm": 1.3150044679641724, "learning_rate": 5.315428571428571e-05, "loss": 0.0957, "step": 21600 }, { "epoch": 24.8, "grad_norm": 1.3469773530960083, "learning_rate": 5.201142857142856e-05, "loss": 0.0893, "step": 21700 }, { "epoch": 24.914285714285715, "grad_norm": 0.7123535871505737, "learning_rate": 5.086857142857143e-05, "loss": 0.0833, "step": 21800 }, { "epoch": 25.02857142857143, "grad_norm": 0.9019558429718018, "learning_rate": 4.9725714285714285e-05, "loss": 0.084, "step": 21900 }, { "epoch": 25.142857142857142, "grad_norm": 1.2943990230560303, "learning_rate": 4.8582857142857136e-05, "loss": 0.0786, "step": 22000 }, { "epoch": 25.257142857142856, "grad_norm": 1.431429386138916, "learning_rate": 4.743999999999999e-05, "loss": 0.0882, "step": 22100 }, { "epoch": 25.37142857142857, "grad_norm": 0.8068431615829468, "learning_rate": 4.629714285714286e-05, "loss": 0.0846, "step": 22200 }, { "epoch": 25.485714285714284, "grad_norm": 1.0088834762573242, "learning_rate": 4.515428571428571e-05, "loss": 0.0832, "step": 22300 }, { "epoch": 25.6, "grad_norm": 1.1686701774597168, "learning_rate": 4.4011428571428565e-05, "loss": 0.0761, "step": 22400 }, { "epoch": 25.714285714285715, "grad_norm": 1.5640618801116943, "learning_rate": 4.286857142857143e-05, "loss": 0.0894, "step": 22500 }, { "epoch": 25.82857142857143, "grad_norm": 1.4497385025024414, "learning_rate": 4.172571428571428e-05, "loss": 0.0917, "step": 22600 }, { "epoch": 25.942857142857143, "grad_norm": 1.151632308959961, "learning_rate": 4.058285714285714e-05, "loss": 0.0826, "step": 22700 }, { "epoch": 26.057142857142857, "grad_norm": 1.5499285459518433, "learning_rate": 3.944e-05, "loss": 0.0826, "step": 22800 }, { "epoch": 26.17142857142857, "grad_norm": 1.4620537757873535, "learning_rate": 3.829714285714285e-05, "loss": 0.0843, "step": 22900 }, { "epoch": 26.285714285714285, "grad_norm": 0.7379088401794434, "learning_rate": 3.715428571428571e-05, "loss": 0.0748, "step": 23000 }, { "epoch": 26.4, "grad_norm": 0.5435966849327087, "learning_rate": 3.601142857142857e-05, "loss": 0.0758, "step": 23100 }, { "epoch": 26.514285714285712, "grad_norm": 0.7340735197067261, "learning_rate": 3.4868571428571425e-05, "loss": 0.0836, "step": 23200 }, { "epoch": 26.62857142857143, "grad_norm": 0.8306871056556702, "learning_rate": 3.372571428571428e-05, "loss": 0.0848, "step": 23300 }, { "epoch": 26.742857142857144, "grad_norm": 1.6395269632339478, "learning_rate": 3.258285714285714e-05, "loss": 0.0804, "step": 23400 }, { "epoch": 26.857142857142858, "grad_norm": 1.2032523155212402, "learning_rate": 3.144e-05, "loss": 0.0804, "step": 23500 }, { "epoch": 26.97142857142857, "grad_norm": 1.6825361251831055, "learning_rate": 3.0297142857142855e-05, "loss": 0.0827, "step": 23600 }, { "epoch": 27.085714285714285, "grad_norm": 1.4704478979110718, "learning_rate": 2.9154285714285712e-05, "loss": 0.0804, "step": 23700 }, { "epoch": 27.2, "grad_norm": 1.5141932964324951, "learning_rate": 2.801142857142857e-05, "loss": 0.0748, "step": 23800 }, { "epoch": 27.314285714285713, "grad_norm": 0.6547297835350037, "learning_rate": 2.6868571428571427e-05, "loss": 0.0791, "step": 23900 }, { "epoch": 27.428571428571427, "grad_norm": 1.112829327583313, "learning_rate": 2.5725714285714284e-05, "loss": 0.0781, "step": 24000 }, { "epoch": 27.542857142857144, "grad_norm": 1.1256695985794067, "learning_rate": 2.458285714285714e-05, "loss": 0.0828, "step": 24100 }, { "epoch": 27.65714285714286, "grad_norm": 1.4872969388961792, "learning_rate": 2.344e-05, "loss": 0.0793, "step": 24200 }, { "epoch": 27.771428571428572, "grad_norm": 1.0138152837753296, "learning_rate": 2.2297142857142857e-05, "loss": 0.0794, "step": 24300 }, { "epoch": 27.885714285714286, "grad_norm": 1.5701348781585693, "learning_rate": 2.115428571428571e-05, "loss": 0.079, "step": 24400 }, { "epoch": 28.0, "grad_norm": 1.4342325925827026, "learning_rate": 2.001142857142857e-05, "loss": 0.0815, "step": 24500 }, { "epoch": 28.114285714285714, "grad_norm": 0.6444075107574463, "learning_rate": 1.886857142857143e-05, "loss": 0.082, "step": 24600 }, { "epoch": 28.228571428571428, "grad_norm": 1.483933448791504, "learning_rate": 1.7725714285714283e-05, "loss": 0.0743, "step": 24700 }, { "epoch": 28.34285714285714, "grad_norm": 0.6548141241073608, "learning_rate": 1.6582857142857144e-05, "loss": 0.0755, "step": 24800 }, { "epoch": 28.457142857142856, "grad_norm": 1.0112509727478027, "learning_rate": 1.5439999999999998e-05, "loss": 0.0752, "step": 24900 }, { "epoch": 28.571428571428573, "grad_norm": 0.8533725738525391, "learning_rate": 1.4297142857142855e-05, "loss": 0.0819, "step": 25000 }, { "epoch": 28.685714285714287, "grad_norm": 1.2526988983154297, "learning_rate": 1.3154285714285713e-05, "loss": 0.0725, "step": 25100 }, { "epoch": 28.8, "grad_norm": 0.8007093667984009, "learning_rate": 1.2011428571428572e-05, "loss": 0.0764, "step": 25200 }, { "epoch": 28.914285714285715, "grad_norm": 0.9422992467880249, "learning_rate": 1.0868571428571428e-05, "loss": 0.0743, "step": 25300 }, { "epoch": 29.02857142857143, "grad_norm": 0.9128634333610535, "learning_rate": 9.725714285714285e-06, "loss": 0.0754, "step": 25400 }, { "epoch": 29.142857142857142, "grad_norm": 1.241011619567871, "learning_rate": 8.582857142857142e-06, "loss": 0.077, "step": 25500 }, { "epoch": 29.257142857142856, "grad_norm": 1.006628394126892, "learning_rate": 7.439999999999999e-06, "loss": 0.0689, "step": 25600 }, { "epoch": 29.37142857142857, "grad_norm": 1.3787076473236084, "learning_rate": 6.2971428571428565e-06, "loss": 0.0789, "step": 25700 }, { "epoch": 29.485714285714284, "grad_norm": 1.1843293905258179, "learning_rate": 5.154285714285714e-06, "loss": 0.0746, "step": 25800 }, { "epoch": 29.6, "grad_norm": 0.8705450296401978, "learning_rate": 4.0114285714285705e-06, "loss": 0.0754, "step": 25900 }, { "epoch": 29.714285714285715, "grad_norm": 1.1954331398010254, "learning_rate": 2.868571428571428e-06, "loss": 0.0716, "step": 26000 }, { "epoch": 29.82857142857143, "grad_norm": 0.8840579986572266, "learning_rate": 1.7257142857142856e-06, "loss": 0.0802, "step": 26100 }, { "epoch": 29.942857142857143, "grad_norm": 0.8856578469276428, "learning_rate": 5.828571428571428e-07, "loss": 0.076, "step": 26200 } ], "logging_steps": 100, "max_steps": 26250, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7105444577280000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }