| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.53, |
| "eval_steps": 500, |
| "global_step": 10600, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0025, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.00025, |
| "loss": 10.4938, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 0.90625, |
| "learning_rate": 0.0005, |
| "loss": 9.1324, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.0075, |
| "grad_norm": 0.62890625, |
| "learning_rate": 0.00075, |
| "loss": 7.883, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 0.42578125, |
| "learning_rate": 0.001, |
| "loss": 7.3925, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.0125, |
| "grad_norm": 0.6171875, |
| "learning_rate": 0.0009999842657116666, |
| "loss": 7.062, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 0.498046875, |
| "learning_rate": 0.0009999370638369377, |
| "loss": 6.8396, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.0175, |
| "grad_norm": 0.4296875, |
| "learning_rate": 0.0009998583973465647, |
| "loss": 6.6879, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.435546875, |
| "learning_rate": 0.0009997482711915926, |
| "loss": 6.6086, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.0225, |
| "grad_norm": 0.63671875, |
| "learning_rate": 0.0009996066923030483, |
| "loss": 6.5239, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 0.37109375, |
| "learning_rate": 0.000999433669591504, |
| "loss": 6.552, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.0275, |
| "grad_norm": 0.5234375, |
| "learning_rate": 0.0009992292139465165, |
| "loss": 6.4607, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 0.61328125, |
| "learning_rate": 0.0009989933382359422, |
| "loss": 6.3974, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.0325, |
| "grad_norm": 0.67578125, |
| "learning_rate": 0.0009987260573051267, |
| "loss": 6.3249, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 0.51953125, |
| "learning_rate": 0.0009984273879759713, |
| "loss": 6.3374, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.0375, |
| "grad_norm": 0.482421875, |
| "learning_rate": 0.0009980973490458728, |
| "loss": 6.3157, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.55859375, |
| "learning_rate": 0.0009977359612865424, |
| "loss": 6.2525, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.0425, |
| "grad_norm": 0.53125, |
| "learning_rate": 0.0009973432474426967, |
| "loss": 6.2116, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 0.439453125, |
| "learning_rate": 0.000996919232230627, |
| "loss": 6.1993, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.0475, |
| "grad_norm": 0.384765625, |
| "learning_rate": 0.0009964639423366442, |
| "loss": 6.0863, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.349609375, |
| "learning_rate": 0.0009959774064153978, |
| "loss": 6.049, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.0525, |
| "grad_norm": 0.44140625, |
| "learning_rate": 0.0009954596550880734, |
| "loss": 6.0222, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 0.453125, |
| "learning_rate": 0.0009949107209404665, |
| "loss": 6.0078, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.0575, |
| "grad_norm": 0.494140625, |
| "learning_rate": 0.000994330638520929, |
| "loss": 6.0114, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.46484375, |
| "learning_rate": 0.0009937194443381972, |
| "loss": 5.9764, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.0625, |
| "grad_norm": 0.53515625, |
| "learning_rate": 0.0009930771768590933, |
| "loss": 5.9146, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.065, |
| "grad_norm": 0.65625, |
| "learning_rate": 0.000992403876506104, |
| "loss": 5.9021, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.0675, |
| "grad_norm": 0.396484375, |
| "learning_rate": 0.0009916995856548369, |
| "loss": 5.9053, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.49609375, |
| "learning_rate": 0.0009909643486313534, |
| "loss": 5.8727, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.0725, |
| "grad_norm": 0.484375, |
| "learning_rate": 0.0009901982117093786, |
| "loss": 5.8429, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 0.6640625, |
| "learning_rate": 0.0009894012231073895, |
| "loss": 5.8162, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.0775, |
| "grad_norm": 0.48828125, |
| "learning_rate": 0.0009885734329855799, |
| "loss": 5.8881, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.56640625, |
| "learning_rate": 0.0009877148934427035, |
| "loss": 5.7903, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.0825, |
| "grad_norm": 0.5390625, |
| "learning_rate": 0.0009868256585127955, |
| "loss": 5.7843, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.085, |
| "grad_norm": 0.52734375, |
| "learning_rate": 0.000985905784161771, |
| "loss": 5.7443, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.0875, |
| "grad_norm": 0.69921875, |
| "learning_rate": 0.0009849553282839025, |
| "loss": 5.749, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 0.37109375, |
| "learning_rate": 0.0009839743506981783, |
| "loss": 5.8367, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.0925, |
| "grad_norm": 0.5703125, |
| "learning_rate": 0.0009829629131445341, |
| "loss": 5.7485, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.095, |
| "grad_norm": 0.5625, |
| "learning_rate": 0.000981921079279971, |
| "loss": 5.7068, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.0975, |
| "grad_norm": 0.474609375, |
| "learning_rate": 0.0009808489146745465, |
| "loss": 5.7167, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.462890625, |
| "learning_rate": 0.0009797464868072487, |
| "loss": 5.7298, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.1025, |
| "grad_norm": 0.53515625, |
| "learning_rate": 0.0009786138650617494, |
| "loss": 5.7332, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.105, |
| "grad_norm": 0.50390625, |
| "learning_rate": 0.0009774511207220368, |
| "loss": 5.6218, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.1075, |
| "grad_norm": 0.55859375, |
| "learning_rate": 0.0009762583269679303, |
| "loss": 5.662, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 0.435546875, |
| "learning_rate": 0.0009750355588704727, |
| "loss": 5.6434, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.1125, |
| "grad_norm": 0.49609375, |
| "learning_rate": 0.0009737828933872075, |
| "loss": 5.585, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.115, |
| "grad_norm": 0.46484375, |
| "learning_rate": 0.0009725004093573342, |
| "loss": 5.537, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.1175, |
| "grad_norm": 0.51953125, |
| "learning_rate": 0.000971188187496747, |
| "loss": 5.605, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.431640625, |
| "learning_rate": 0.0009698463103929542, |
| "loss": 5.5706, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.1225, |
| "grad_norm": 0.58984375, |
| "learning_rate": 0.000968474862499881, |
| "loss": 5.5877, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 0.60546875, |
| "learning_rate": 0.0009670739301325534, |
| "loss": 5.61, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.1275, |
| "grad_norm": 0.70703125, |
| "learning_rate": 0.000965643601461667, |
| "loss": 5.6129, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.4375, |
| "learning_rate": 0.0009641839665080363, |
| "loss": 5.5661, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.1325, |
| "grad_norm": 0.396484375, |
| "learning_rate": 0.0009626951171369304, |
| "loss": 5.6271, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.135, |
| "grad_norm": 0.51171875, |
| "learning_rate": 0.0009611771470522907, |
| "loss": 5.5214, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.1375, |
| "grad_norm": 0.5234375, |
| "learning_rate": 0.0009596301517908328, |
| "loss": 5.625, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.453125, |
| "learning_rate": 0.0009580542287160348, |
| "loss": 5.5264, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.1425, |
| "grad_norm": 0.453125, |
| "learning_rate": 0.0009564494770120089, |
| "loss": 5.5348, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.145, |
| "grad_norm": 0.462890625, |
| "learning_rate": 0.0009548159976772592, |
| "loss": 5.4634, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.1475, |
| "grad_norm": 0.48828125, |
| "learning_rate": 0.0009531538935183251, |
| "loss": 5.4421, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 0.439453125, |
| "learning_rate": 0.0009514632691433108, |
| "loss": 5.47, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.1525, |
| "grad_norm": 0.84375, |
| "learning_rate": 0.0009497442309553016, |
| "loss": 5.4528, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.155, |
| "grad_norm": 0.61328125, |
| "learning_rate": 0.0009479968871456679, |
| "loss": 5.3831, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.1575, |
| "grad_norm": 0.439453125, |
| "learning_rate": 0.000946221347687255, |
| "loss": 5.3801, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.474609375, |
| "learning_rate": 0.0009444177243274617, |
| "loss": 5.394, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.1625, |
| "grad_norm": 0.90234375, |
| "learning_rate": 0.0009425861305812082, |
| "loss": 5.4232, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.165, |
| "grad_norm": 0.5, |
| "learning_rate": 0.000940726681723791, |
| "loss": 5.4551, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.1675, |
| "grad_norm": 0.5625, |
| "learning_rate": 0.0009388394947836278, |
| "loss": 5.4354, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.455078125, |
| "learning_rate": 0.0009369246885348925, |
| "loss": 5.4606, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.1725, |
| "grad_norm": 0.474609375, |
| "learning_rate": 0.0009349823834900395, |
| "loss": 5.3785, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.175, |
| "grad_norm": 0.6328125, |
| "learning_rate": 0.0009330127018922195, |
| "loss": 5.3472, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.1775, |
| "grad_norm": 0.40625, |
| "learning_rate": 0.0009310157677075847, |
| "loss": 5.4156, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.423828125, |
| "learning_rate": 0.0009289917066174886, |
| "loss": 5.2857, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.1825, |
| "grad_norm": 0.470703125, |
| "learning_rate": 0.000926940646010574, |
| "loss": 5.3815, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.185, |
| "grad_norm": 0.46484375, |
| "learning_rate": 0.0009248627149747573, |
| "loss": 5.3596, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.1875, |
| "grad_norm": 0.455078125, |
| "learning_rate": 0.0009227580442891022, |
| "loss": 5.3197, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.49609375, |
| "learning_rate": 0.0009206267664155906, |
| "loss": 5.2859, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.1925, |
| "grad_norm": 0.66015625, |
| "learning_rate": 0.0009184690154907849, |
| "loss": 5.3134, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.195, |
| "grad_norm": 0.625, |
| "learning_rate": 0.0009162849273173857, |
| "loss": 5.218, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.1975, |
| "grad_norm": 0.9296875, |
| "learning_rate": 0.0009140746393556853, |
| "loss": 5.3191, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.416015625, |
| "learning_rate": 0.0009118382907149164, |
| "loss": 5.2908, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.2025, |
| "grad_norm": 0.498046875, |
| "learning_rate": 0.0009095760221444959, |
| "loss": 5.1909, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.205, |
| "grad_norm": 1.0234375, |
| "learning_rate": 0.0009072879760251679, |
| "loss": 5.2562, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.2075, |
| "grad_norm": 0.5859375, |
| "learning_rate": 0.0009049742963600418, |
| "loss": 5.2542, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 0.43359375, |
| "learning_rate": 0.0009026351287655293, |
| "loss": 5.1161, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.2125, |
| "grad_norm": 0.51953125, |
| "learning_rate": 0.0009002706204621802, |
| "loss": 5.1941, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.215, |
| "grad_norm": 0.6015625, |
| "learning_rate": 0.0008978809202654162, |
| "loss": 5.2478, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.2175, |
| "grad_norm": 0.46875, |
| "learning_rate": 0.0008954661785761646, |
| "loss": 5.2331, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.5078125, |
| "learning_rate": 0.0008930265473713938, |
| "loss": 5.1296, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.2225, |
| "grad_norm": 0.4765625, |
| "learning_rate": 0.0008905621801945467, |
| "loss": 5.194, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.225, |
| "grad_norm": 0.48828125, |
| "learning_rate": 0.0008880732321458784, |
| "loss": 5.1855, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.2275, |
| "grad_norm": 0.42578125, |
| "learning_rate": 0.0008855598598726938, |
| "loss": 5.1613, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 0.578125, |
| "learning_rate": 0.000883022221559489, |
| "loss": 5.1545, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.2325, |
| "grad_norm": 0.41015625, |
| "learning_rate": 0.0008804604769179958, |
| "loss": 5.1522, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.235, |
| "grad_norm": 0.44140625, |
| "learning_rate": 0.0008778747871771292, |
| "loss": 5.1246, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.2375, |
| "grad_norm": 0.625, |
| "learning_rate": 0.0008752653150728412, |
| "loss": 5.1371, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.50390625, |
| "learning_rate": 0.0008726322248378774, |
| "loss": 5.0847, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.2425, |
| "grad_norm": 0.44140625, |
| "learning_rate": 0.0008699756821914419, |
| "loss": 5.0916, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.245, |
| "grad_norm": 0.59765625, |
| "learning_rate": 0.0008672958543287666, |
| "loss": 5.0772, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.2475, |
| "grad_norm": 0.51953125, |
| "learning_rate": 0.0008645929099105886, |
| "loss": 5.1363, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.451171875, |
| "learning_rate": 0.000861867019052535, |
| "loss": 5.075, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.2525, |
| "grad_norm": 0.51171875, |
| "learning_rate": 0.0008591183533144171, |
| "loss": 5.1377, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.255, |
| "grad_norm": 0.470703125, |
| "learning_rate": 0.0008563470856894315, |
| "loss": 5.0829, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.2575, |
| "grad_norm": 0.59375, |
| "learning_rate": 0.0008535533905932737, |
| "loss": 5.0808, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 0.49609375, |
| "learning_rate": 0.0008507374438531607, |
| "loss": 5.0014, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.2625, |
| "grad_norm": 0.5859375, |
| "learning_rate": 0.0008478994226967638, |
| "loss": 5.0243, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.265, |
| "grad_norm": 0.4140625, |
| "learning_rate": 0.000845039505741056, |
| "loss": 5.128, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.2675, |
| "grad_norm": 0.453125, |
| "learning_rate": 0.0008421578729810691, |
| "loss": 5.0646, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 0.50390625, |
| "learning_rate": 0.0008392547057785661, |
| "loss": 5.0641, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.2725, |
| "grad_norm": 0.48046875, |
| "learning_rate": 0.0008363301868506264, |
| "loss": 5.0466, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.275, |
| "grad_norm": 0.51171875, |
| "learning_rate": 0.0008333845002581458, |
| "loss": 5.0462, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.2775, |
| "grad_norm": 0.5390625, |
| "learning_rate": 0.0008304178313942535, |
| "loss": 5.1451, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 0.5859375, |
| "learning_rate": 0.0008274303669726426, |
| "loss": 5.1001, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.2825, |
| "grad_norm": 0.4921875, |
| "learning_rate": 0.0008244222950158193, |
| "loss": 4.9816, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.285, |
| "grad_norm": 0.466796875, |
| "learning_rate": 0.0008213938048432696, |
| "loss": 5.0636, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.2875, |
| "grad_norm": 0.60546875, |
| "learning_rate": 0.0008183450870595441, |
| "loss": 4.977, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 0.52734375, |
| "learning_rate": 0.0008152763335422613, |
| "loss": 4.9954, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.2925, |
| "grad_norm": 0.5546875, |
| "learning_rate": 0.0008121877374300317, |
| "loss": 4.9429, |
| "step": 5850 |
| }, |
| { |
| "epoch": 0.295, |
| "grad_norm": 0.51171875, |
| "learning_rate": 0.0008090794931103026, |
| "loss": 4.9819, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.2975, |
| "grad_norm": 0.57421875, |
| "learning_rate": 0.0008059517962071233, |
| "loss": 5.0432, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.50390625, |
| "learning_rate": 0.0008028048435688333, |
| "loss": 4.8642, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.3025, |
| "grad_norm": 0.7421875, |
| "learning_rate": 0.0007996388332556734, |
| "loss": 5.0357, |
| "step": 6050 |
| }, |
| { |
| "epoch": 0.305, |
| "grad_norm": 0.46484375, |
| "learning_rate": 0.0007964539645273203, |
| "loss": 4.9587, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.3075, |
| "grad_norm": 0.470703125, |
| "learning_rate": 0.0007932504378303451, |
| "loss": 4.9245, |
| "step": 6150 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 0.625, |
| "learning_rate": 0.0007900284547855992, |
| "loss": 4.9813, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.3125, |
| "grad_norm": 0.51171875, |
| "learning_rate": 0.0007867882181755231, |
| "loss": 5.0188, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.315, |
| "grad_norm": 0.6953125, |
| "learning_rate": 0.0007835299319313853, |
| "loss": 4.9338, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.3175, |
| "grad_norm": 0.59375, |
| "learning_rate": 0.000780253801120447, |
| "loss": 4.8996, |
| "step": 6350 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.671875, |
| "learning_rate": 0.0007769600319330552, |
| "loss": 4.9161, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.3225, |
| "grad_norm": 0.5078125, |
| "learning_rate": 0.0007736488316696662, |
| "loss": 4.9648, |
| "step": 6450 |
| }, |
| { |
| "epoch": 0.325, |
| "grad_norm": 0.625, |
| "learning_rate": 0.0007703204087277988, |
| "loss": 4.908, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.3275, |
| "grad_norm": 0.50390625, |
| "learning_rate": 0.0007669749725889182, |
| "loss": 4.9536, |
| "step": 6550 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 0.87890625, |
| "learning_rate": 0.0007636127338052513, |
| "loss": 4.8755, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.3325, |
| "grad_norm": 0.5546875, |
| "learning_rate": 0.0007602339039865362, |
| "loss": 4.8819, |
| "step": 6650 |
| }, |
| { |
| "epoch": 0.335, |
| "grad_norm": 0.53125, |
| "learning_rate": 0.0007568386957867032, |
| "loss": 4.8511, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.3375, |
| "grad_norm": 0.515625, |
| "learning_rate": 0.0007534273228904916, |
| "loss": 4.9324, |
| "step": 6750 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.58203125, |
| "learning_rate": 0.00075, |
| "loss": 4.8898, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.3425, |
| "grad_norm": 0.63671875, |
| "learning_rate": 0.0007465569428211752, |
| "loss": 4.9447, |
| "step": 6850 |
| }, |
| { |
| "epoch": 0.345, |
| "grad_norm": 0.58203125, |
| "learning_rate": 0.0007430983680502344, |
| "loss": 4.8954, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.3475, |
| "grad_norm": 0.435546875, |
| "learning_rate": 0.0007396244933600284, |
| "loss": 4.8977, |
| "step": 6950 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 0.7734375, |
| "learning_rate": 0.0007361355373863414, |
| "loss": 4.7837, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.3525, |
| "grad_norm": 0.458984375, |
| "learning_rate": 0.0007326317197141304, |
| "loss": 4.9241, |
| "step": 7050 |
| }, |
| { |
| "epoch": 0.355, |
| "grad_norm": 0.58203125, |
| "learning_rate": 0.0007291132608637052, |
| "loss": 4.9113, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.3575, |
| "grad_norm": 0.65234375, |
| "learning_rate": 0.0007255803822768504, |
| "loss": 4.9187, |
| "step": 7150 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.60546875, |
| "learning_rate": 0.0007220333063028871, |
| "loss": 4.8632, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.3625, |
| "grad_norm": 0.58203125, |
| "learning_rate": 0.0007184722561846798, |
| "loss": 5.0078, |
| "step": 7250 |
| }, |
| { |
| "epoch": 0.365, |
| "grad_norm": 0.50390625, |
| "learning_rate": 0.0007148974560445859, |
| "loss": 4.7923, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.3675, |
| "grad_norm": 0.52734375, |
| "learning_rate": 0.0007113091308703497, |
| "loss": 4.8948, |
| "step": 7350 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 0.466796875, |
| "learning_rate": 0.0007077075065009433, |
| "loss": 4.8259, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.3725, |
| "grad_norm": 0.55859375, |
| "learning_rate": 0.0007040928096123516, |
| "loss": 4.8009, |
| "step": 7450 |
| }, |
| { |
| "epoch": 0.375, |
| "grad_norm": 0.4375, |
| "learning_rate": 0.0007004652677033068, |
| "loss": 4.847, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.3775, |
| "grad_norm": 0.486328125, |
| "learning_rate": 0.0006968251090809707, |
| "loss": 4.835, |
| "step": 7550 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.455078125, |
| "learning_rate": 0.0006931725628465643, |
| "loss": 4.8838, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.3825, |
| "grad_norm": 0.68359375, |
| "learning_rate": 0.0006895078588809502, |
| "loss": 4.8705, |
| "step": 7650 |
| }, |
| { |
| "epoch": 0.385, |
| "grad_norm": 0.46875, |
| "learning_rate": 0.0006858312278301637, |
| "loss": 4.763, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.3875, |
| "grad_norm": 0.53125, |
| "learning_rate": 0.0006821429010908972, |
| "loss": 4.8961, |
| "step": 7750 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 0.53125, |
| "learning_rate": 0.0006784431107959359, |
| "loss": 4.8141, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.3925, |
| "grad_norm": 0.5078125, |
| "learning_rate": 0.0006747320897995492, |
| "loss": 4.8718, |
| "step": 7850 |
| }, |
| { |
| "epoch": 0.395, |
| "grad_norm": 0.5390625, |
| "learning_rate": 0.0006710100716628344, |
| "loss": 4.8394, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.3975, |
| "grad_norm": 0.65234375, |
| "learning_rate": 0.0006672772906390176, |
| "loss": 4.8264, |
| "step": 7950 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.80078125, |
| "learning_rate": 0.0006635339816587109, |
| "loss": 4.769, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.4025, |
| "grad_norm": 0.61328125, |
| "learning_rate": 0.000659780380315125, |
| "loss": 4.8428, |
| "step": 8050 |
| }, |
| { |
| "epoch": 0.405, |
| "grad_norm": 0.515625, |
| "learning_rate": 0.0006560167228492435, |
| "loss": 4.8591, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.4075, |
| "grad_norm": 0.80078125, |
| "learning_rate": 0.0006522432461349536, |
| "loss": 4.9224, |
| "step": 8150 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 0.640625, |
| "learning_rate": 0.0006484601876641375, |
| "loss": 4.7773, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.4125, |
| "grad_norm": 0.5703125, |
| "learning_rate": 0.0006446677855317265, |
| "loss": 4.8107, |
| "step": 8250 |
| }, |
| { |
| "epoch": 0.415, |
| "grad_norm": 0.53515625, |
| "learning_rate": 0.0006408662784207149, |
| "loss": 4.8252, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.4175, |
| "grad_norm": 0.546875, |
| "learning_rate": 0.0006370559055871389, |
| "loss": 4.8562, |
| "step": 8350 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 0.5625, |
| "learning_rate": 0.0006332369068450174, |
| "loss": 4.7781, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.4225, |
| "grad_norm": 0.462890625, |
| "learning_rate": 0.0006294095225512603, |
| "loss": 4.8325, |
| "step": 8450 |
| }, |
| { |
| "epoch": 0.425, |
| "grad_norm": 0.45703125, |
| "learning_rate": 0.0006255739935905395, |
| "loss": 4.8363, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.4275, |
| "grad_norm": 0.6015625, |
| "learning_rate": 0.0006217305613601295, |
| "loss": 4.8252, |
| "step": 8550 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 0.609375, |
| "learning_rate": 0.0006178794677547138, |
| "loss": 4.8079, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.4325, |
| "grad_norm": 0.470703125, |
| "learning_rate": 0.0006140209551511608, |
| "loss": 4.8508, |
| "step": 8650 |
| }, |
| { |
| "epoch": 0.435, |
| "grad_norm": 0.53515625, |
| "learning_rate": 0.0006101552663932703, |
| "loss": 4.8389, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.4375, |
| "grad_norm": 0.66796875, |
| "learning_rate": 0.0006062826447764884, |
| "loss": 4.7695, |
| "step": 8750 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 0.455078125, |
| "learning_rate": 0.0006024033340325954, |
| "loss": 4.7953, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.4425, |
| "grad_norm": 0.56640625, |
| "learning_rate": 0.0005985175783143666, |
| "loss": 4.8144, |
| "step": 8850 |
| }, |
| { |
| "epoch": 0.445, |
| "grad_norm": 0.48046875, |
| "learning_rate": 0.0005946256221802051, |
| "loss": 4.8123, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.4475, |
| "grad_norm": 0.52734375, |
| "learning_rate": 0.0005907277105787513, |
| "loss": 4.7778, |
| "step": 8950 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.53515625, |
| "learning_rate": 0.0005868240888334653, |
| "loss": 4.7237, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.4525, |
| "grad_norm": 0.58984375, |
| "learning_rate": 0.0005829150026271871, |
| "loss": 4.7938, |
| "step": 9050 |
| }, |
| { |
| "epoch": 0.455, |
| "grad_norm": 0.56640625, |
| "learning_rate": 0.000579000697986675, |
| "loss": 4.7879, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.4575, |
| "grad_norm": 0.578125, |
| "learning_rate": 0.0005750814212671201, |
| "loss": 4.7317, |
| "step": 9150 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 0.578125, |
| "learning_rate": 0.0005711574191366427, |
| "loss": 4.7819, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.4625, |
| "grad_norm": 0.61328125, |
| "learning_rate": 0.0005672289385607659, |
| "loss": 4.8393, |
| "step": 9250 |
| }, |
| { |
| "epoch": 0.465, |
| "grad_norm": 0.69921875, |
| "learning_rate": 0.0005632962267868747, |
| "loss": 4.7797, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.4675, |
| "grad_norm": 0.71484375, |
| "learning_rate": 0.0005593595313286526, |
| "loss": 4.764, |
| "step": 9350 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 0.4765625, |
| "learning_rate": 0.0005554190999505056, |
| "loss": 4.7639, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.4725, |
| "grad_norm": 0.53125, |
| "learning_rate": 0.0005514751806519673, |
| "loss": 4.8052, |
| "step": 9450 |
| }, |
| { |
| "epoch": 0.475, |
| "grad_norm": 0.6328125, |
| "learning_rate": 0.0005475280216520913, |
| "loss": 4.8682, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.4775, |
| "grad_norm": 0.640625, |
| "learning_rate": 0.0005435778713738292, |
| "loss": 4.748, |
| "step": 9550 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.66015625, |
| "learning_rate": 0.0005396249784283942, |
| "loss": 4.7625, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.4825, |
| "grad_norm": 0.51171875, |
| "learning_rate": 0.0005356695915996161, |
| "loss": 4.824, |
| "step": 9650 |
| }, |
| { |
| "epoch": 0.485, |
| "grad_norm": 0.54296875, |
| "learning_rate": 0.0005317119598282822, |
| "loss": 4.7946, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.4875, |
| "grad_norm": 0.671875, |
| "learning_rate": 0.0005277523321964701, |
| "loss": 4.8476, |
| "step": 9750 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 0.55078125, |
| "learning_rate": 0.0005237909579118712, |
| "loss": 4.735, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.4925, |
| "grad_norm": 0.47265625, |
| "learning_rate": 0.0005198280862921062, |
| "loss": 4.8119, |
| "step": 9850 |
| }, |
| { |
| "epoch": 0.495, |
| "grad_norm": 0.49609375, |
| "learning_rate": 0.0005158639667490339, |
| "loss": 4.794, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.4975, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.0005118988487730537, |
| "loss": 4.7226, |
| "step": 9950 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.56640625, |
| "learning_rate": 0.000507932981917404, |
| "loss": 4.9084, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.5025, |
| "grad_norm": 0.54296875, |
| "learning_rate": 0.0005039666157824549, |
| "loss": 4.6737, |
| "step": 10050 |
| }, |
| { |
| "epoch": 0.505, |
| "grad_norm": 0.59765625, |
| "learning_rate": 0.0005, |
| "loss": 4.7808, |
| "step": 10100 |
| }, |
| { |
| "epoch": 0.5075, |
| "grad_norm": 0.86328125, |
| "learning_rate": 0.0004960333842175453, |
| "loss": 4.8202, |
| "step": 10150 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 0.546875, |
| "learning_rate": 0.000492067018082596, |
| "loss": 4.7873, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.5125, |
| "grad_norm": 0.58984375, |
| "learning_rate": 0.00048810115122694634, |
| "loss": 4.7753, |
| "step": 10250 |
| }, |
| { |
| "epoch": 0.515, |
| "grad_norm": 0.6796875, |
| "learning_rate": 0.0004841360332509663, |
| "loss": 4.767, |
| "step": 10300 |
| }, |
| { |
| "epoch": 0.5175, |
| "grad_norm": 0.60546875, |
| "learning_rate": 0.00048017191370789385, |
| "loss": 4.7159, |
| "step": 10350 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.53125, |
| "learning_rate": 0.0004762090420881289, |
| "loss": 4.8013, |
| "step": 10400 |
| }, |
| { |
| "epoch": 0.5225, |
| "grad_norm": 0.55859375, |
| "learning_rate": 0.00047224766780353, |
| "loss": 4.7983, |
| "step": 10450 |
| }, |
| { |
| "epoch": 0.525, |
| "grad_norm": 0.5703125, |
| "learning_rate": 0.00046828804017171776, |
| "loss": 4.7126, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.5275, |
| "grad_norm": 0.474609375, |
| "learning_rate": 0.00046433040840038387, |
| "loss": 4.7993, |
| "step": 10550 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 0.78515625, |
| "learning_rate": 0.00046037502157160573, |
| "loss": 4.6986, |
| "step": 10600 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 20000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.54329059180544e+16, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|