| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 5000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.01, |
| "grad_norm": 32.03642272949219, |
| "learning_rate": 1.9904e-05, |
| "loss": 2.5441, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 11.376370429992676, |
| "learning_rate": 1.9804000000000002e-05, |
| "loss": 2.3283, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 16.60682487487793, |
| "learning_rate": 1.9704e-05, |
| "loss": 2.246, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 15.391746520996094, |
| "learning_rate": 1.9604e-05, |
| "loss": 2.2186, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 12.73359203338623, |
| "learning_rate": 1.9504e-05, |
| "loss": 2.3115, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 12.478874206542969, |
| "learning_rate": 1.9404e-05, |
| "loss": 2.3705, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 14.907727241516113, |
| "learning_rate": 1.9304e-05, |
| "loss": 2.2694, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 19.148876190185547, |
| "learning_rate": 1.9204e-05, |
| "loss": 2.2233, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 11.990699768066406, |
| "learning_rate": 1.9104000000000002e-05, |
| "loss": 2.2177, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 11.667814254760742, |
| "learning_rate": 1.9004000000000003e-05, |
| "loss": 2.2965, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 10.88569164276123, |
| "learning_rate": 1.8904000000000004e-05, |
| "loss": 2.2782, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 10.427560806274414, |
| "learning_rate": 1.8804e-05, |
| "loss": 2.319, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 16.95577049255371, |
| "learning_rate": 1.8704000000000003e-05, |
| "loss": 2.2264, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 13.226414680480957, |
| "learning_rate": 1.8604000000000003e-05, |
| "loss": 2.2142, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 10.91146183013916, |
| "learning_rate": 1.8504e-05, |
| "loss": 2.2503, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 12.8374605178833, |
| "learning_rate": 1.8404000000000002e-05, |
| "loss": 2.179, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 12.140827178955078, |
| "learning_rate": 1.8304000000000003e-05, |
| "loss": 2.4125, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 10.9592924118042, |
| "learning_rate": 1.8204e-05, |
| "loss": 2.166, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 9.604344367980957, |
| "learning_rate": 1.8104e-05, |
| "loss": 2.2307, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 13.164838790893555, |
| "learning_rate": 1.8004000000000002e-05, |
| "loss": 2.3573, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 12.212058067321777, |
| "learning_rate": 1.7904e-05, |
| "loss": 2.2179, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 9.600943565368652, |
| "learning_rate": 1.7804e-05, |
| "loss": 2.2763, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 12.947696685791016, |
| "learning_rate": 1.7704000000000002e-05, |
| "loss": 2.0638, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 17.80840301513672, |
| "learning_rate": 1.7604e-05, |
| "loss": 2.2055, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 12.0412015914917, |
| "learning_rate": 1.7504e-05, |
| "loss": 2.1173, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 10.081609725952148, |
| "learning_rate": 1.7404e-05, |
| "loss": 2.2154, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 9.901471138000488, |
| "learning_rate": 1.7304000000000002e-05, |
| "loss": 2.0941, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 11.670356750488281, |
| "learning_rate": 1.7204e-05, |
| "loss": 2.1167, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 14.8759126663208, |
| "learning_rate": 1.7104e-05, |
| "loss": 2.1551, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 11.987970352172852, |
| "learning_rate": 1.7004000000000002e-05, |
| "loss": 2.2229, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 13.350611686706543, |
| "learning_rate": 1.6904e-05, |
| "loss": 2.1713, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 10.762746810913086, |
| "learning_rate": 1.6804e-05, |
| "loss": 2.0882, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 9.588376998901367, |
| "learning_rate": 1.6704e-05, |
| "loss": 2.0899, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 12.84415054321289, |
| "learning_rate": 1.6604000000000002e-05, |
| "loss": 2.1902, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 11.016281127929688, |
| "learning_rate": 1.6504000000000003e-05, |
| "loss": 1.961, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 11.485596656799316, |
| "learning_rate": 1.6404e-05, |
| "loss": 2.0134, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 9.29987907409668, |
| "learning_rate": 1.6304000000000002e-05, |
| "loss": 2.2747, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 12.311944961547852, |
| "learning_rate": 1.6204000000000003e-05, |
| "loss": 2.0809, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 14.393691062927246, |
| "learning_rate": 1.6104e-05, |
| "loss": 2.0551, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 8.741480827331543, |
| "learning_rate": 1.6004e-05, |
| "loss": 1.8158, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 15.93017864227295, |
| "learning_rate": 1.5904000000000002e-05, |
| "loss": 2.1485, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 9.9660062789917, |
| "learning_rate": 1.5804000000000003e-05, |
| "loss": 2.1187, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 12.61098575592041, |
| "learning_rate": 1.5704e-05, |
| "loss": 1.9836, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 11.482502937316895, |
| "learning_rate": 1.5604000000000002e-05, |
| "loss": 2.0814, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 11.588091850280762, |
| "learning_rate": 1.5504000000000003e-05, |
| "loss": 2.0365, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 12.21688175201416, |
| "learning_rate": 1.5404e-05, |
| "loss": 2.0287, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 8.9583740234375, |
| "learning_rate": 1.5304e-05, |
| "loss": 1.9626, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 9.103066444396973, |
| "learning_rate": 1.5204e-05, |
| "loss": 2.0527, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 10.230585098266602, |
| "learning_rate": 1.5104000000000001e-05, |
| "loss": 1.9494, |
| "step": 1225 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 9.652959823608398, |
| "learning_rate": 1.5004e-05, |
| "loss": 1.9135, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 11.546061515808105, |
| "learning_rate": 1.4904e-05, |
| "loss": 2.0405, |
| "step": 1275 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 12.885430335998535, |
| "learning_rate": 1.4804000000000001e-05, |
| "loss": 2.1517, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 10.972395896911621, |
| "learning_rate": 1.4704e-05, |
| "loss": 2.1219, |
| "step": 1325 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 10.801010131835938, |
| "learning_rate": 1.4604000000000001e-05, |
| "loss": 1.9599, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 10.134483337402344, |
| "learning_rate": 1.4504e-05, |
| "loss": 2.0494, |
| "step": 1375 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 13.115943908691406, |
| "learning_rate": 1.4404e-05, |
| "loss": 1.9955, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 10.545869827270508, |
| "learning_rate": 1.4304e-05, |
| "loss": 2.0235, |
| "step": 1425 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 8.695992469787598, |
| "learning_rate": 1.4204000000000002e-05, |
| "loss": 1.8495, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 9.978080749511719, |
| "learning_rate": 1.4104000000000003e-05, |
| "loss": 1.8959, |
| "step": 1475 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 10.295175552368164, |
| "learning_rate": 1.4004000000000002e-05, |
| "loss": 1.9118, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.61, |
| "grad_norm": 13.270401954650879, |
| "learning_rate": 1.3904000000000003e-05, |
| "loss": 2.1106, |
| "step": 1525 |
| }, |
| { |
| "epoch": 0.62, |
| "grad_norm": 13.775677680969238, |
| "learning_rate": 1.3804000000000002e-05, |
| "loss": 1.8277, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.63, |
| "grad_norm": 8.527593612670898, |
| "learning_rate": 1.3704000000000001e-05, |
| "loss": 1.7639, |
| "step": 1575 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 11.629280090332031, |
| "learning_rate": 1.3604000000000002e-05, |
| "loss": 1.9064, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 11.989888191223145, |
| "learning_rate": 1.3504000000000001e-05, |
| "loss": 1.9594, |
| "step": 1625 |
| }, |
| { |
| "epoch": 0.66, |
| "grad_norm": 10.411887168884277, |
| "learning_rate": 1.3404e-05, |
| "loss": 2.1262, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.67, |
| "grad_norm": 12.676020622253418, |
| "learning_rate": 1.3304000000000002e-05, |
| "loss": 1.676, |
| "step": 1675 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 11.182994842529297, |
| "learning_rate": 1.3204000000000001e-05, |
| "loss": 1.9182, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.69, |
| "grad_norm": 10.866786003112793, |
| "learning_rate": 1.3104000000000002e-05, |
| "loss": 1.9195, |
| "step": 1725 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 9.576918601989746, |
| "learning_rate": 1.3004000000000001e-05, |
| "loss": 1.9578, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.71, |
| "grad_norm": 10.5067138671875, |
| "learning_rate": 1.2904e-05, |
| "loss": 1.8355, |
| "step": 1775 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 10.727161407470703, |
| "learning_rate": 1.2804000000000001e-05, |
| "loss": 1.9025, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 13.11409854888916, |
| "learning_rate": 1.2704e-05, |
| "loss": 1.7382, |
| "step": 1825 |
| }, |
| { |
| "epoch": 0.74, |
| "grad_norm": 11.595176696777344, |
| "learning_rate": 1.2604e-05, |
| "loss": 1.9146, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 9.093597412109375, |
| "learning_rate": 1.2504000000000001e-05, |
| "loss": 1.8007, |
| "step": 1875 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 9.586053848266602, |
| "learning_rate": 1.2404e-05, |
| "loss": 1.845, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.77, |
| "grad_norm": 10.637762069702148, |
| "learning_rate": 1.2304000000000001e-05, |
| "loss": 2.0743, |
| "step": 1925 |
| }, |
| { |
| "epoch": 0.78, |
| "grad_norm": 13.100274085998535, |
| "learning_rate": 1.2204e-05, |
| "loss": 2.0598, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.79, |
| "grad_norm": 10.020994186401367, |
| "learning_rate": 1.2104e-05, |
| "loss": 2.0117, |
| "step": 1975 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 8.455347061157227, |
| "learning_rate": 1.2004e-05, |
| "loss": 1.6429, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.81, |
| "grad_norm": 11.668612480163574, |
| "learning_rate": 1.1904e-05, |
| "loss": 2.0177, |
| "step": 2025 |
| }, |
| { |
| "epoch": 0.82, |
| "grad_norm": 13.308269500732422, |
| "learning_rate": 1.1803999999999999e-05, |
| "loss": 1.8674, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 12.84695053100586, |
| "learning_rate": 1.1704000000000002e-05, |
| "loss": 1.8027, |
| "step": 2075 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 9.068670272827148, |
| "learning_rate": 1.1604000000000003e-05, |
| "loss": 1.7137, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 8.16296672821045, |
| "learning_rate": 1.1504000000000002e-05, |
| "loss": 1.9627, |
| "step": 2125 |
| }, |
| { |
| "epoch": 0.86, |
| "grad_norm": 15.622565269470215, |
| "learning_rate": 1.1404000000000001e-05, |
| "loss": 2.005, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 9.689506530761719, |
| "learning_rate": 1.1304000000000002e-05, |
| "loss": 2.0241, |
| "step": 2175 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 18.542600631713867, |
| "learning_rate": 1.1204000000000001e-05, |
| "loss": 1.8033, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 10.676534652709961, |
| "learning_rate": 1.1104e-05, |
| "loss": 1.8776, |
| "step": 2225 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 15.894271850585938, |
| "learning_rate": 1.1004000000000002e-05, |
| "loss": 1.5818, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.91, |
| "grad_norm": 8.65552043914795, |
| "learning_rate": 1.0904000000000001e-05, |
| "loss": 1.9338, |
| "step": 2275 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 10.770137786865234, |
| "learning_rate": 1.0804000000000002e-05, |
| "loss": 1.7558, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 10.38111400604248, |
| "learning_rate": 1.0704000000000001e-05, |
| "loss": 1.7643, |
| "step": 2325 |
| }, |
| { |
| "epoch": 0.94, |
| "grad_norm": 10.971551895141602, |
| "learning_rate": 1.0604e-05, |
| "loss": 1.6931, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 9.95477294921875, |
| "learning_rate": 1.0504000000000001e-05, |
| "loss": 1.8131, |
| "step": 2375 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 11.37536334991455, |
| "learning_rate": 1.0404e-05, |
| "loss": 1.7392, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.97, |
| "grad_norm": 10.042652130126953, |
| "learning_rate": 1.0304e-05, |
| "loss": 1.7464, |
| "step": 2425 |
| }, |
| { |
| "epoch": 0.98, |
| "grad_norm": 9.678910255432129, |
| "learning_rate": 1.0204000000000001e-05, |
| "loss": 1.6348, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.99, |
| "grad_norm": 12.561325073242188, |
| "learning_rate": 1.0104e-05, |
| "loss": 1.5169, |
| "step": 2475 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 14.051156044006348, |
| "learning_rate": 1.0004000000000001e-05, |
| "loss": 1.7342, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.01, |
| "grad_norm": 12.802934646606445, |
| "learning_rate": 9.904e-06, |
| "loss": 1.1948, |
| "step": 2525 |
| }, |
| { |
| "epoch": 1.02, |
| "grad_norm": 9.442876815795898, |
| "learning_rate": 9.804000000000001e-06, |
| "loss": 1.2588, |
| "step": 2550 |
| }, |
| { |
| "epoch": 1.03, |
| "grad_norm": 11.199936866760254, |
| "learning_rate": 9.704e-06, |
| "loss": 1.1248, |
| "step": 2575 |
| }, |
| { |
| "epoch": 1.04, |
| "grad_norm": 9.727993965148926, |
| "learning_rate": 9.604000000000002e-06, |
| "loss": 1.1481, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.05, |
| "grad_norm": 11.783479690551758, |
| "learning_rate": 9.504e-06, |
| "loss": 1.0526, |
| "step": 2625 |
| }, |
| { |
| "epoch": 1.06, |
| "grad_norm": 12.407696723937988, |
| "learning_rate": 9.404e-06, |
| "loss": 1.1352, |
| "step": 2650 |
| }, |
| { |
| "epoch": 1.07, |
| "grad_norm": 14.621116638183594, |
| "learning_rate": 9.304000000000001e-06, |
| "loss": 1.1035, |
| "step": 2675 |
| }, |
| { |
| "epoch": 1.08, |
| "grad_norm": 13.569993019104004, |
| "learning_rate": 9.204e-06, |
| "loss": 1.2436, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.09, |
| "grad_norm": 13.364871978759766, |
| "learning_rate": 9.104000000000001e-06, |
| "loss": 1.186, |
| "step": 2725 |
| }, |
| { |
| "epoch": 1.1, |
| "grad_norm": 9.892740249633789, |
| "learning_rate": 9.004e-06, |
| "loss": 1.1549, |
| "step": 2750 |
| }, |
| { |
| "epoch": 1.11, |
| "grad_norm": 10.509928703308105, |
| "learning_rate": 8.904e-06, |
| "loss": 1.1382, |
| "step": 2775 |
| }, |
| { |
| "epoch": 1.12, |
| "grad_norm": 11.294106483459473, |
| "learning_rate": 8.804e-06, |
| "loss": 1.1838, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.13, |
| "grad_norm": 17.69602394104004, |
| "learning_rate": 8.704e-06, |
| "loss": 1.0854, |
| "step": 2825 |
| }, |
| { |
| "epoch": 1.1400000000000001, |
| "grad_norm": 11.137506484985352, |
| "learning_rate": 8.604000000000001e-06, |
| "loss": 1.1801, |
| "step": 2850 |
| }, |
| { |
| "epoch": 1.15, |
| "grad_norm": 8.684813499450684, |
| "learning_rate": 8.504000000000002e-06, |
| "loss": 1.1309, |
| "step": 2875 |
| }, |
| { |
| "epoch": 1.16, |
| "grad_norm": 18.216867446899414, |
| "learning_rate": 8.404000000000001e-06, |
| "loss": 1.1336, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.17, |
| "grad_norm": 13.37553882598877, |
| "learning_rate": 8.304e-06, |
| "loss": 1.0348, |
| "step": 2925 |
| }, |
| { |
| "epoch": 1.18, |
| "grad_norm": 12.126663208007812, |
| "learning_rate": 8.204000000000001e-06, |
| "loss": 1.0356, |
| "step": 2950 |
| }, |
| { |
| "epoch": 1.19, |
| "grad_norm": 9.19808292388916, |
| "learning_rate": 8.104e-06, |
| "loss": 1.1679, |
| "step": 2975 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 11.421396255493164, |
| "learning_rate": 8.004e-06, |
| "loss": 1.1297, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.21, |
| "grad_norm": 13.185827255249023, |
| "learning_rate": 7.904000000000001e-06, |
| "loss": 1.2146, |
| "step": 3025 |
| }, |
| { |
| "epoch": 1.22, |
| "grad_norm": 10.685110092163086, |
| "learning_rate": 7.804e-06, |
| "loss": 1.2583, |
| "step": 3050 |
| }, |
| { |
| "epoch": 1.23, |
| "grad_norm": 12.375425338745117, |
| "learning_rate": 7.704000000000001e-06, |
| "loss": 1.1996, |
| "step": 3075 |
| }, |
| { |
| "epoch": 1.24, |
| "grad_norm": 14.291858673095703, |
| "learning_rate": 7.604e-06, |
| "loss": 1.2209, |
| "step": 3100 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 15.820944786071777, |
| "learning_rate": 7.5040000000000005e-06, |
| "loss": 1.3115, |
| "step": 3125 |
| }, |
| { |
| "epoch": 1.26, |
| "grad_norm": 11.021650314331055, |
| "learning_rate": 7.404e-06, |
| "loss": 1.025, |
| "step": 3150 |
| }, |
| { |
| "epoch": 1.27, |
| "grad_norm": 9.467336654663086, |
| "learning_rate": 7.304000000000001e-06, |
| "loss": 1.1548, |
| "step": 3175 |
| }, |
| { |
| "epoch": 1.28, |
| "grad_norm": 11.907764434814453, |
| "learning_rate": 7.204000000000001e-06, |
| "loss": 1.2011, |
| "step": 3200 |
| }, |
| { |
| "epoch": 1.29, |
| "grad_norm": 8.38372802734375, |
| "learning_rate": 7.104000000000001e-06, |
| "loss": 1.1483, |
| "step": 3225 |
| }, |
| { |
| "epoch": 1.3, |
| "grad_norm": 14.970967292785645, |
| "learning_rate": 7.004000000000001e-06, |
| "loss": 1.1487, |
| "step": 3250 |
| }, |
| { |
| "epoch": 1.31, |
| "grad_norm": 11.194636344909668, |
| "learning_rate": 6.904e-06, |
| "loss": 1.1764, |
| "step": 3275 |
| }, |
| { |
| "epoch": 1.32, |
| "grad_norm": 16.364320755004883, |
| "learning_rate": 6.804e-06, |
| "loss": 1.2492, |
| "step": 3300 |
| }, |
| { |
| "epoch": 1.33, |
| "grad_norm": 15.269165992736816, |
| "learning_rate": 6.7040000000000005e-06, |
| "loss": 1.0329, |
| "step": 3325 |
| }, |
| { |
| "epoch": 1.34, |
| "grad_norm": 11.500872611999512, |
| "learning_rate": 6.604000000000001e-06, |
| "loss": 1.092, |
| "step": 3350 |
| }, |
| { |
| "epoch": 1.35, |
| "grad_norm": 8.786423683166504, |
| "learning_rate": 6.504e-06, |
| "loss": 1.2249, |
| "step": 3375 |
| }, |
| { |
| "epoch": 1.3599999999999999, |
| "grad_norm": 13.731120109558105, |
| "learning_rate": 6.404e-06, |
| "loss": 1.1118, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.37, |
| "grad_norm": 16.703580856323242, |
| "learning_rate": 6.304e-06, |
| "loss": 1.0879, |
| "step": 3425 |
| }, |
| { |
| "epoch": 1.38, |
| "grad_norm": 10.39274787902832, |
| "learning_rate": 6.204e-06, |
| "loss": 1.0874, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.3900000000000001, |
| "grad_norm": 14.778815269470215, |
| "learning_rate": 6.104000000000001e-06, |
| "loss": 1.0982, |
| "step": 3475 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 15.760618209838867, |
| "learning_rate": 6.004000000000001e-06, |
| "loss": 0.9911, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.41, |
| "grad_norm": 10.676987648010254, |
| "learning_rate": 5.9040000000000006e-06, |
| "loss": 0.9779, |
| "step": 3525 |
| }, |
| { |
| "epoch": 1.42, |
| "grad_norm": 11.369353294372559, |
| "learning_rate": 5.804000000000001e-06, |
| "loss": 0.9832, |
| "step": 3550 |
| }, |
| { |
| "epoch": 1.43, |
| "grad_norm": 11.173103332519531, |
| "learning_rate": 5.704000000000001e-06, |
| "loss": 1.1413, |
| "step": 3575 |
| }, |
| { |
| "epoch": 1.44, |
| "grad_norm": 14.497014999389648, |
| "learning_rate": 5.604000000000001e-06, |
| "loss": 1.0769, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.45, |
| "grad_norm": 10.242537498474121, |
| "learning_rate": 5.504e-06, |
| "loss": 1.077, |
| "step": 3625 |
| }, |
| { |
| "epoch": 1.46, |
| "grad_norm": 14.58755874633789, |
| "learning_rate": 5.404e-06, |
| "loss": 1.0362, |
| "step": 3650 |
| }, |
| { |
| "epoch": 1.47, |
| "grad_norm": 11.268484115600586, |
| "learning_rate": 5.304e-06, |
| "loss": 1.1701, |
| "step": 3675 |
| }, |
| { |
| "epoch": 1.48, |
| "grad_norm": 12.811514854431152, |
| "learning_rate": 5.2040000000000005e-06, |
| "loss": 1.1419, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.49, |
| "grad_norm": 13.174732208251953, |
| "learning_rate": 5.104e-06, |
| "loss": 0.9834, |
| "step": 3725 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 14.920330047607422, |
| "learning_rate": 5.004e-06, |
| "loss": 1.1624, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.51, |
| "grad_norm": 10.778399467468262, |
| "learning_rate": 4.904000000000001e-06, |
| "loss": 1.0573, |
| "step": 3775 |
| }, |
| { |
| "epoch": 1.52, |
| "grad_norm": 13.889476776123047, |
| "learning_rate": 4.804e-06, |
| "loss": 1.1256, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.53, |
| "grad_norm": 14.520740509033203, |
| "learning_rate": 4.704e-06, |
| "loss": 1.0834, |
| "step": 3825 |
| }, |
| { |
| "epoch": 1.54, |
| "grad_norm": 12.798078536987305, |
| "learning_rate": 4.604e-06, |
| "loss": 1.0705, |
| "step": 3850 |
| }, |
| { |
| "epoch": 1.55, |
| "grad_norm": 11.737931251525879, |
| "learning_rate": 4.504e-06, |
| "loss": 1.1076, |
| "step": 3875 |
| }, |
| { |
| "epoch": 1.56, |
| "grad_norm": 12.435004234313965, |
| "learning_rate": 4.4040000000000005e-06, |
| "loss": 1.0675, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.5699999999999998, |
| "grad_norm": 11.721802711486816, |
| "learning_rate": 4.304000000000001e-06, |
| "loss": 1.1715, |
| "step": 3925 |
| }, |
| { |
| "epoch": 1.58, |
| "grad_norm": 16.310001373291016, |
| "learning_rate": 4.204e-06, |
| "loss": 1.0577, |
| "step": 3950 |
| }, |
| { |
| "epoch": 1.5899999999999999, |
| "grad_norm": 9.448426246643066, |
| "learning_rate": 4.104e-06, |
| "loss": 0.9881, |
| "step": 3975 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 15.235527992248535, |
| "learning_rate": 4.004e-06, |
| "loss": 0.9509, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.6099999999999999, |
| "grad_norm": 11.758926391601562, |
| "learning_rate": 3.904e-06, |
| "loss": 1.017, |
| "step": 4025 |
| }, |
| { |
| "epoch": 1.62, |
| "grad_norm": 12.346951484680176, |
| "learning_rate": 3.8040000000000003e-06, |
| "loss": 1.1987, |
| "step": 4050 |
| }, |
| { |
| "epoch": 1.63, |
| "grad_norm": 10.503011703491211, |
| "learning_rate": 3.7040000000000005e-06, |
| "loss": 1.0665, |
| "step": 4075 |
| }, |
| { |
| "epoch": 1.6400000000000001, |
| "grad_norm": 9.963557243347168, |
| "learning_rate": 3.604e-06, |
| "loss": 1.1667, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.65, |
| "grad_norm": 15.800553321838379, |
| "learning_rate": 3.5040000000000002e-06, |
| "loss": 1.0871, |
| "step": 4125 |
| }, |
| { |
| "epoch": 1.6600000000000001, |
| "grad_norm": 14.133501052856445, |
| "learning_rate": 3.404e-06, |
| "loss": 1.0588, |
| "step": 4150 |
| }, |
| { |
| "epoch": 1.67, |
| "grad_norm": 15.049243927001953, |
| "learning_rate": 3.3040000000000005e-06, |
| "loss": 1.1755, |
| "step": 4175 |
| }, |
| { |
| "epoch": 1.6800000000000002, |
| "grad_norm": 11.534449577331543, |
| "learning_rate": 3.2040000000000006e-06, |
| "loss": 1.0809, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.69, |
| "grad_norm": 12.926283836364746, |
| "learning_rate": 3.1040000000000003e-06, |
| "loss": 1.1184, |
| "step": 4225 |
| }, |
| { |
| "epoch": 1.7, |
| "grad_norm": 9.016489028930664, |
| "learning_rate": 3.0040000000000004e-06, |
| "loss": 0.9284, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.71, |
| "grad_norm": 16.352996826171875, |
| "learning_rate": 2.904e-06, |
| "loss": 0.8954, |
| "step": 4275 |
| }, |
| { |
| "epoch": 1.72, |
| "grad_norm": 11.342592239379883, |
| "learning_rate": 2.804e-06, |
| "loss": 1.0614, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.73, |
| "grad_norm": 11.883009910583496, |
| "learning_rate": 2.704e-06, |
| "loss": 1.006, |
| "step": 4325 |
| }, |
| { |
| "epoch": 1.74, |
| "grad_norm": 11.741077423095703, |
| "learning_rate": 2.6040000000000004e-06, |
| "loss": 1.1116, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.75, |
| "grad_norm": 11.227665901184082, |
| "learning_rate": 2.5040000000000005e-06, |
| "loss": 1.0802, |
| "step": 4375 |
| }, |
| { |
| "epoch": 1.76, |
| "grad_norm": 12.50661849975586, |
| "learning_rate": 2.404e-06, |
| "loss": 0.9816, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.77, |
| "grad_norm": 13.356459617614746, |
| "learning_rate": 2.3040000000000003e-06, |
| "loss": 1.0844, |
| "step": 4425 |
| }, |
| { |
| "epoch": 1.78, |
| "grad_norm": 10.1768159866333, |
| "learning_rate": 2.2040000000000004e-06, |
| "loss": 0.9909, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.79, |
| "grad_norm": 9.657759666442871, |
| "learning_rate": 2.104e-06, |
| "loss": 0.9877, |
| "step": 4475 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 14.901777267456055, |
| "learning_rate": 2.004e-06, |
| "loss": 0.9987, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.81, |
| "grad_norm": 17.132478713989258, |
| "learning_rate": 1.9040000000000003e-06, |
| "loss": 1.0573, |
| "step": 4525 |
| }, |
| { |
| "epoch": 1.8199999999999998, |
| "grad_norm": 13.090239524841309, |
| "learning_rate": 1.8040000000000002e-06, |
| "loss": 0.9955, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.83, |
| "grad_norm": 7.524806022644043, |
| "learning_rate": 1.7040000000000001e-06, |
| "loss": 1.0188, |
| "step": 4575 |
| }, |
| { |
| "epoch": 1.8399999999999999, |
| "grad_norm": 14.137269973754883, |
| "learning_rate": 1.604e-06, |
| "loss": 0.9507, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.85, |
| "grad_norm": 15.662611961364746, |
| "learning_rate": 1.5040000000000001e-06, |
| "loss": 1.0512, |
| "step": 4625 |
| }, |
| { |
| "epoch": 1.8599999999999999, |
| "grad_norm": 13.104068756103516, |
| "learning_rate": 1.404e-06, |
| "loss": 1.0257, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.87, |
| "grad_norm": 8.180632591247559, |
| "learning_rate": 1.304e-06, |
| "loss": 1.1014, |
| "step": 4675 |
| }, |
| { |
| "epoch": 1.88, |
| "grad_norm": 9.715453147888184, |
| "learning_rate": 1.204e-06, |
| "loss": 0.9764, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.8900000000000001, |
| "grad_norm": 10.140015602111816, |
| "learning_rate": 1.1040000000000001e-06, |
| "loss": 1.1436, |
| "step": 4725 |
| }, |
| { |
| "epoch": 1.9, |
| "grad_norm": 13.225520133972168, |
| "learning_rate": 1.004e-06, |
| "loss": 1.021, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.9100000000000001, |
| "grad_norm": 11.760268211364746, |
| "learning_rate": 9.04e-07, |
| "loss": 1.0674, |
| "step": 4775 |
| }, |
| { |
| "epoch": 1.92, |
| "grad_norm": 10.296557426452637, |
| "learning_rate": 8.04e-07, |
| "loss": 1.044, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.9300000000000002, |
| "grad_norm": 11.622244834899902, |
| "learning_rate": 7.040000000000001e-07, |
| "loss": 1.0317, |
| "step": 4825 |
| }, |
| { |
| "epoch": 1.94, |
| "grad_norm": 14.334859848022461, |
| "learning_rate": 6.040000000000001e-07, |
| "loss": 1.1214, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.95, |
| "grad_norm": 10.96068286895752, |
| "learning_rate": 5.040000000000001e-07, |
| "loss": 1.1726, |
| "step": 4875 |
| }, |
| { |
| "epoch": 1.96, |
| "grad_norm": 14.571761131286621, |
| "learning_rate": 4.04e-07, |
| "loss": 0.9106, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.97, |
| "grad_norm": 11.797457695007324, |
| "learning_rate": 3.04e-07, |
| "loss": 1.1328, |
| "step": 4925 |
| }, |
| { |
| "epoch": 1.98, |
| "grad_norm": 11.585701942443848, |
| "learning_rate": 2.0400000000000003e-07, |
| "loss": 1.0677, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.99, |
| "grad_norm": 15.56652545928955, |
| "learning_rate": 1.04e-07, |
| "loss": 1.0312, |
| "step": 4975 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 8.480283737182617, |
| "learning_rate": 4e-09, |
| "loss": 0.8741, |
| "step": 5000 |
| } |
| ], |
| "logging_steps": 25, |
| "max_steps": 5000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.1142437888e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|