{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 32.03642272949219, "learning_rate": 1.9904e-05, "loss": 2.5441, "step": 25 }, { "epoch": 0.02, "grad_norm": 11.376370429992676, "learning_rate": 1.9804000000000002e-05, "loss": 2.3283, "step": 50 }, { "epoch": 0.03, "grad_norm": 16.60682487487793, "learning_rate": 1.9704e-05, "loss": 2.246, "step": 75 }, { "epoch": 0.04, "grad_norm": 15.391746520996094, "learning_rate": 1.9604e-05, "loss": 2.2186, "step": 100 }, { "epoch": 0.05, "grad_norm": 12.73359203338623, "learning_rate": 1.9504e-05, "loss": 2.3115, "step": 125 }, { "epoch": 0.06, "grad_norm": 12.478874206542969, "learning_rate": 1.9404e-05, "loss": 2.3705, "step": 150 }, { "epoch": 0.07, "grad_norm": 14.907727241516113, "learning_rate": 1.9304e-05, "loss": 2.2694, "step": 175 }, { "epoch": 0.08, "grad_norm": 19.148876190185547, "learning_rate": 1.9204e-05, "loss": 2.2233, "step": 200 }, { "epoch": 0.09, "grad_norm": 11.990699768066406, "learning_rate": 1.9104000000000002e-05, "loss": 2.2177, "step": 225 }, { "epoch": 0.1, "grad_norm": 11.667814254760742, "learning_rate": 1.9004000000000003e-05, "loss": 2.2965, "step": 250 }, { "epoch": 0.11, "grad_norm": 10.88569164276123, "learning_rate": 1.8904000000000004e-05, "loss": 2.2782, "step": 275 }, { "epoch": 0.12, "grad_norm": 10.427560806274414, "learning_rate": 1.8804e-05, "loss": 2.319, "step": 300 }, { "epoch": 0.13, "grad_norm": 16.95577049255371, "learning_rate": 1.8704000000000003e-05, "loss": 2.2264, "step": 325 }, { "epoch": 0.14, "grad_norm": 13.226414680480957, "learning_rate": 1.8604000000000003e-05, "loss": 2.2142, "step": 350 }, { "epoch": 0.15, "grad_norm": 10.91146183013916, "learning_rate": 1.8504e-05, "loss": 2.2503, "step": 375 }, { "epoch": 0.16, "grad_norm": 12.8374605178833, "learning_rate": 1.8404000000000002e-05, "loss": 2.179, "step": 400 }, { "epoch": 0.17, "grad_norm": 12.140827178955078, "learning_rate": 1.8304000000000003e-05, "loss": 2.4125, "step": 425 }, { "epoch": 0.18, "grad_norm": 10.9592924118042, "learning_rate": 1.8204e-05, "loss": 2.166, "step": 450 }, { "epoch": 0.19, "grad_norm": 9.604344367980957, "learning_rate": 1.8104e-05, "loss": 2.2307, "step": 475 }, { "epoch": 0.2, "grad_norm": 13.164838790893555, "learning_rate": 1.8004000000000002e-05, "loss": 2.3573, "step": 500 }, { "epoch": 0.21, "grad_norm": 12.212058067321777, "learning_rate": 1.7904e-05, "loss": 2.2179, "step": 525 }, { "epoch": 0.22, "grad_norm": 9.600943565368652, "learning_rate": 1.7804e-05, "loss": 2.2763, "step": 550 }, { "epoch": 0.23, "grad_norm": 12.947696685791016, "learning_rate": 1.7704000000000002e-05, "loss": 2.0638, "step": 575 }, { "epoch": 0.24, "grad_norm": 17.80840301513672, "learning_rate": 1.7604e-05, "loss": 2.2055, "step": 600 }, { "epoch": 0.25, "grad_norm": 12.0412015914917, "learning_rate": 1.7504e-05, "loss": 2.1173, "step": 625 }, { "epoch": 0.26, "grad_norm": 10.081609725952148, "learning_rate": 1.7404e-05, "loss": 2.2154, "step": 650 }, { "epoch": 0.27, "grad_norm": 9.901471138000488, "learning_rate": 1.7304000000000002e-05, "loss": 2.0941, "step": 675 }, { "epoch": 0.28, "grad_norm": 11.670356750488281, "learning_rate": 1.7204e-05, "loss": 2.1167, "step": 700 }, { "epoch": 0.29, "grad_norm": 14.8759126663208, "learning_rate": 1.7104e-05, "loss": 2.1551, "step": 725 }, { "epoch": 0.3, "grad_norm": 11.987970352172852, "learning_rate": 1.7004000000000002e-05, "loss": 2.2229, "step": 750 }, { "epoch": 0.31, "grad_norm": 13.350611686706543, "learning_rate": 1.6904e-05, "loss": 2.1713, "step": 775 }, { "epoch": 0.32, "grad_norm": 10.762746810913086, "learning_rate": 1.6804e-05, "loss": 2.0882, "step": 800 }, { "epoch": 0.33, "grad_norm": 9.588376998901367, "learning_rate": 1.6704e-05, "loss": 2.0899, "step": 825 }, { "epoch": 0.34, "grad_norm": 12.84415054321289, "learning_rate": 1.6604000000000002e-05, "loss": 2.1902, "step": 850 }, { "epoch": 0.35, "grad_norm": 11.016281127929688, "learning_rate": 1.6504000000000003e-05, "loss": 1.961, "step": 875 }, { "epoch": 0.36, "grad_norm": 11.485596656799316, "learning_rate": 1.6404e-05, "loss": 2.0134, "step": 900 }, { "epoch": 0.37, "grad_norm": 9.29987907409668, "learning_rate": 1.6304000000000002e-05, "loss": 2.2747, "step": 925 }, { "epoch": 0.38, "grad_norm": 12.311944961547852, "learning_rate": 1.6204000000000003e-05, "loss": 2.0809, "step": 950 }, { "epoch": 0.39, "grad_norm": 14.393691062927246, "learning_rate": 1.6104e-05, "loss": 2.0551, "step": 975 }, { "epoch": 0.4, "grad_norm": 8.741480827331543, "learning_rate": 1.6004e-05, "loss": 1.8158, "step": 1000 }, { "epoch": 0.41, "grad_norm": 15.93017864227295, "learning_rate": 1.5904000000000002e-05, "loss": 2.1485, "step": 1025 }, { "epoch": 0.42, "grad_norm": 9.9660062789917, "learning_rate": 1.5804000000000003e-05, "loss": 2.1187, "step": 1050 }, { "epoch": 0.43, "grad_norm": 12.61098575592041, "learning_rate": 1.5704e-05, "loss": 1.9836, "step": 1075 }, { "epoch": 0.44, "grad_norm": 11.482502937316895, "learning_rate": 1.5604000000000002e-05, "loss": 2.0814, "step": 1100 }, { "epoch": 0.45, "grad_norm": 11.588091850280762, "learning_rate": 1.5504000000000003e-05, "loss": 2.0365, "step": 1125 }, { "epoch": 0.46, "grad_norm": 12.21688175201416, "learning_rate": 1.5404e-05, "loss": 2.0287, "step": 1150 }, { "epoch": 0.47, "grad_norm": 8.9583740234375, "learning_rate": 1.5304e-05, "loss": 1.9626, "step": 1175 }, { "epoch": 0.48, "grad_norm": 9.103066444396973, "learning_rate": 1.5204e-05, "loss": 2.0527, "step": 1200 }, { "epoch": 0.49, "grad_norm": 10.230585098266602, "learning_rate": 1.5104000000000001e-05, "loss": 1.9494, "step": 1225 }, { "epoch": 0.5, "grad_norm": 9.652959823608398, "learning_rate": 1.5004e-05, "loss": 1.9135, "step": 1250 }, { "epoch": 0.51, "grad_norm": 11.546061515808105, "learning_rate": 1.4904e-05, "loss": 2.0405, "step": 1275 }, { "epoch": 0.52, "grad_norm": 12.885430335998535, "learning_rate": 1.4804000000000001e-05, "loss": 2.1517, "step": 1300 }, { "epoch": 0.53, "grad_norm": 10.972395896911621, "learning_rate": 1.4704e-05, "loss": 2.1219, "step": 1325 }, { "epoch": 0.54, "grad_norm": 10.801010131835938, "learning_rate": 1.4604000000000001e-05, "loss": 1.9599, "step": 1350 }, { "epoch": 0.55, "grad_norm": 10.134483337402344, "learning_rate": 1.4504e-05, "loss": 2.0494, "step": 1375 }, { "epoch": 0.56, "grad_norm": 13.115943908691406, "learning_rate": 1.4404e-05, "loss": 1.9955, "step": 1400 }, { "epoch": 0.57, "grad_norm": 10.545869827270508, "learning_rate": 1.4304e-05, "loss": 2.0235, "step": 1425 }, { "epoch": 0.58, "grad_norm": 8.695992469787598, "learning_rate": 1.4204000000000002e-05, "loss": 1.8495, "step": 1450 }, { "epoch": 0.59, "grad_norm": 9.978080749511719, "learning_rate": 1.4104000000000003e-05, "loss": 1.8959, "step": 1475 }, { "epoch": 0.6, "grad_norm": 10.295175552368164, "learning_rate": 1.4004000000000002e-05, "loss": 1.9118, "step": 1500 }, { "epoch": 0.61, "grad_norm": 13.270401954650879, "learning_rate": 1.3904000000000003e-05, "loss": 2.1106, "step": 1525 }, { "epoch": 0.62, "grad_norm": 13.775677680969238, "learning_rate": 1.3804000000000002e-05, "loss": 1.8277, "step": 1550 }, { "epoch": 0.63, "grad_norm": 8.527593612670898, "learning_rate": 1.3704000000000001e-05, "loss": 1.7639, "step": 1575 }, { "epoch": 0.64, "grad_norm": 11.629280090332031, "learning_rate": 1.3604000000000002e-05, "loss": 1.9064, "step": 1600 }, { "epoch": 0.65, "grad_norm": 11.989888191223145, "learning_rate": 1.3504000000000001e-05, "loss": 1.9594, "step": 1625 }, { "epoch": 0.66, "grad_norm": 10.411887168884277, "learning_rate": 1.3404e-05, "loss": 2.1262, "step": 1650 }, { "epoch": 0.67, "grad_norm": 12.676020622253418, "learning_rate": 1.3304000000000002e-05, "loss": 1.676, "step": 1675 }, { "epoch": 0.68, "grad_norm": 11.182994842529297, "learning_rate": 1.3204000000000001e-05, "loss": 1.9182, "step": 1700 }, { "epoch": 0.69, "grad_norm": 10.866786003112793, "learning_rate": 1.3104000000000002e-05, "loss": 1.9195, "step": 1725 }, { "epoch": 0.7, "grad_norm": 9.576918601989746, "learning_rate": 1.3004000000000001e-05, "loss": 1.9578, "step": 1750 }, { "epoch": 0.71, "grad_norm": 10.5067138671875, "learning_rate": 1.2904e-05, "loss": 1.8355, "step": 1775 }, { "epoch": 0.72, "grad_norm": 10.727161407470703, "learning_rate": 1.2804000000000001e-05, "loss": 1.9025, "step": 1800 }, { "epoch": 0.73, "grad_norm": 13.11409854888916, "learning_rate": 1.2704e-05, "loss": 1.7382, "step": 1825 }, { "epoch": 0.74, "grad_norm": 11.595176696777344, "learning_rate": 1.2604e-05, "loss": 1.9146, "step": 1850 }, { "epoch": 0.75, "grad_norm": 9.093597412109375, "learning_rate": 1.2504000000000001e-05, "loss": 1.8007, "step": 1875 }, { "epoch": 0.76, "grad_norm": 9.586053848266602, "learning_rate": 1.2404e-05, "loss": 1.845, "step": 1900 }, { "epoch": 0.77, "grad_norm": 10.637762069702148, "learning_rate": 1.2304000000000001e-05, "loss": 2.0743, "step": 1925 }, { "epoch": 0.78, "grad_norm": 13.100274085998535, "learning_rate": 1.2204e-05, "loss": 2.0598, "step": 1950 }, { "epoch": 0.79, "grad_norm": 10.020994186401367, "learning_rate": 1.2104e-05, "loss": 2.0117, "step": 1975 }, { "epoch": 0.8, "grad_norm": 8.455347061157227, "learning_rate": 1.2004e-05, "loss": 1.6429, "step": 2000 }, { "epoch": 0.81, "grad_norm": 11.668612480163574, "learning_rate": 1.1904e-05, "loss": 2.0177, "step": 2025 }, { "epoch": 0.82, "grad_norm": 13.308269500732422, "learning_rate": 1.1803999999999999e-05, "loss": 1.8674, "step": 2050 }, { "epoch": 0.83, "grad_norm": 12.84695053100586, "learning_rate": 1.1704000000000002e-05, "loss": 1.8027, "step": 2075 }, { "epoch": 0.84, "grad_norm": 9.068670272827148, "learning_rate": 1.1604000000000003e-05, "loss": 1.7137, "step": 2100 }, { "epoch": 0.85, "grad_norm": 8.16296672821045, "learning_rate": 1.1504000000000002e-05, "loss": 1.9627, "step": 2125 }, { "epoch": 0.86, "grad_norm": 15.622565269470215, "learning_rate": 1.1404000000000001e-05, "loss": 2.005, "step": 2150 }, { "epoch": 0.87, "grad_norm": 9.689506530761719, "learning_rate": 1.1304000000000002e-05, "loss": 2.0241, "step": 2175 }, { "epoch": 0.88, "grad_norm": 18.542600631713867, "learning_rate": 1.1204000000000001e-05, "loss": 1.8033, "step": 2200 }, { "epoch": 0.89, "grad_norm": 10.676534652709961, "learning_rate": 1.1104e-05, "loss": 1.8776, "step": 2225 }, { "epoch": 0.9, "grad_norm": 15.894271850585938, "learning_rate": 1.1004000000000002e-05, "loss": 1.5818, "step": 2250 }, { "epoch": 0.91, "grad_norm": 8.65552043914795, "learning_rate": 1.0904000000000001e-05, "loss": 1.9338, "step": 2275 }, { "epoch": 0.92, "grad_norm": 10.770137786865234, "learning_rate": 1.0804000000000002e-05, "loss": 1.7558, "step": 2300 }, { "epoch": 0.93, "grad_norm": 10.38111400604248, "learning_rate": 1.0704000000000001e-05, "loss": 1.7643, "step": 2325 }, { "epoch": 0.94, "grad_norm": 10.971551895141602, "learning_rate": 1.0604e-05, "loss": 1.6931, "step": 2350 }, { "epoch": 0.95, "grad_norm": 9.95477294921875, "learning_rate": 1.0504000000000001e-05, "loss": 1.8131, "step": 2375 }, { "epoch": 0.96, "grad_norm": 11.37536334991455, "learning_rate": 1.0404e-05, "loss": 1.7392, "step": 2400 }, { "epoch": 0.97, "grad_norm": 10.042652130126953, "learning_rate": 1.0304e-05, "loss": 1.7464, "step": 2425 }, { "epoch": 0.98, "grad_norm": 9.678910255432129, "learning_rate": 1.0204000000000001e-05, "loss": 1.6348, "step": 2450 }, { "epoch": 0.99, "grad_norm": 12.561325073242188, "learning_rate": 1.0104e-05, "loss": 1.5169, "step": 2475 }, { "epoch": 1.0, "grad_norm": 14.051156044006348, "learning_rate": 1.0004000000000001e-05, "loss": 1.7342, "step": 2500 }, { "epoch": 1.01, "grad_norm": 12.802934646606445, "learning_rate": 9.904e-06, "loss": 1.1948, "step": 2525 }, { "epoch": 1.02, "grad_norm": 9.442876815795898, "learning_rate": 9.804000000000001e-06, "loss": 1.2588, "step": 2550 }, { "epoch": 1.03, "grad_norm": 11.199936866760254, "learning_rate": 9.704e-06, "loss": 1.1248, "step": 2575 }, { "epoch": 1.04, "grad_norm": 9.727993965148926, "learning_rate": 9.604000000000002e-06, "loss": 1.1481, "step": 2600 }, { "epoch": 1.05, "grad_norm": 11.783479690551758, "learning_rate": 9.504e-06, "loss": 1.0526, "step": 2625 }, { "epoch": 1.06, "grad_norm": 12.407696723937988, "learning_rate": 9.404e-06, "loss": 1.1352, "step": 2650 }, { "epoch": 1.07, "grad_norm": 14.621116638183594, "learning_rate": 9.304000000000001e-06, "loss": 1.1035, "step": 2675 }, { "epoch": 1.08, "grad_norm": 13.569993019104004, "learning_rate": 9.204e-06, "loss": 1.2436, "step": 2700 }, { "epoch": 1.09, "grad_norm": 13.364871978759766, "learning_rate": 9.104000000000001e-06, "loss": 1.186, "step": 2725 }, { "epoch": 1.1, "grad_norm": 9.892740249633789, "learning_rate": 9.004e-06, "loss": 1.1549, "step": 2750 }, { "epoch": 1.11, "grad_norm": 10.509928703308105, "learning_rate": 8.904e-06, "loss": 1.1382, "step": 2775 }, { "epoch": 1.12, "grad_norm": 11.294106483459473, "learning_rate": 8.804e-06, "loss": 1.1838, "step": 2800 }, { "epoch": 1.13, "grad_norm": 17.69602394104004, "learning_rate": 8.704e-06, "loss": 1.0854, "step": 2825 }, { "epoch": 1.1400000000000001, "grad_norm": 11.137506484985352, "learning_rate": 8.604000000000001e-06, "loss": 1.1801, "step": 2850 }, { "epoch": 1.15, "grad_norm": 8.684813499450684, "learning_rate": 8.504000000000002e-06, "loss": 1.1309, "step": 2875 }, { "epoch": 1.16, "grad_norm": 18.216867446899414, "learning_rate": 8.404000000000001e-06, "loss": 1.1336, "step": 2900 }, { "epoch": 1.17, "grad_norm": 13.37553882598877, "learning_rate": 8.304e-06, "loss": 1.0348, "step": 2925 }, { "epoch": 1.18, "grad_norm": 12.126663208007812, "learning_rate": 8.204000000000001e-06, "loss": 1.0356, "step": 2950 }, { "epoch": 1.19, "grad_norm": 9.19808292388916, "learning_rate": 8.104e-06, "loss": 1.1679, "step": 2975 }, { "epoch": 1.2, "grad_norm": 11.421396255493164, "learning_rate": 8.004e-06, "loss": 1.1297, "step": 3000 }, { "epoch": 1.21, "grad_norm": 13.185827255249023, "learning_rate": 7.904000000000001e-06, "loss": 1.2146, "step": 3025 }, { "epoch": 1.22, "grad_norm": 10.685110092163086, "learning_rate": 7.804e-06, "loss": 1.2583, "step": 3050 }, { "epoch": 1.23, "grad_norm": 12.375425338745117, "learning_rate": 7.704000000000001e-06, "loss": 1.1996, "step": 3075 }, { "epoch": 1.24, "grad_norm": 14.291858673095703, "learning_rate": 7.604e-06, "loss": 1.2209, "step": 3100 }, { "epoch": 1.25, "grad_norm": 15.820944786071777, "learning_rate": 7.5040000000000005e-06, "loss": 1.3115, "step": 3125 }, { "epoch": 1.26, "grad_norm": 11.021650314331055, "learning_rate": 7.404e-06, "loss": 1.025, "step": 3150 }, { "epoch": 1.27, "grad_norm": 9.467336654663086, "learning_rate": 7.304000000000001e-06, "loss": 1.1548, "step": 3175 }, { "epoch": 1.28, "grad_norm": 11.907764434814453, "learning_rate": 7.204000000000001e-06, "loss": 1.2011, "step": 3200 }, { "epoch": 1.29, "grad_norm": 8.38372802734375, "learning_rate": 7.104000000000001e-06, "loss": 1.1483, "step": 3225 }, { "epoch": 1.3, "grad_norm": 14.970967292785645, "learning_rate": 7.004000000000001e-06, "loss": 1.1487, "step": 3250 }, { "epoch": 1.31, "grad_norm": 11.194636344909668, "learning_rate": 6.904e-06, "loss": 1.1764, "step": 3275 }, { "epoch": 1.32, "grad_norm": 16.364320755004883, "learning_rate": 6.804e-06, "loss": 1.2492, "step": 3300 }, { "epoch": 1.33, "grad_norm": 15.269165992736816, "learning_rate": 6.7040000000000005e-06, "loss": 1.0329, "step": 3325 }, { "epoch": 1.34, "grad_norm": 11.500872611999512, "learning_rate": 6.604000000000001e-06, "loss": 1.092, "step": 3350 }, { "epoch": 1.35, "grad_norm": 8.786423683166504, "learning_rate": 6.504e-06, "loss": 1.2249, "step": 3375 }, { "epoch": 1.3599999999999999, "grad_norm": 13.731120109558105, "learning_rate": 6.404e-06, "loss": 1.1118, "step": 3400 }, { "epoch": 1.37, "grad_norm": 16.703580856323242, "learning_rate": 6.304e-06, "loss": 1.0879, "step": 3425 }, { "epoch": 1.38, "grad_norm": 10.39274787902832, "learning_rate": 6.204e-06, "loss": 1.0874, "step": 3450 }, { "epoch": 1.3900000000000001, "grad_norm": 14.778815269470215, "learning_rate": 6.104000000000001e-06, "loss": 1.0982, "step": 3475 }, { "epoch": 1.4, "grad_norm": 15.760618209838867, "learning_rate": 6.004000000000001e-06, "loss": 0.9911, "step": 3500 }, { "epoch": 1.41, "grad_norm": 10.676987648010254, "learning_rate": 5.9040000000000006e-06, "loss": 0.9779, "step": 3525 }, { "epoch": 1.42, "grad_norm": 11.369353294372559, "learning_rate": 5.804000000000001e-06, "loss": 0.9832, "step": 3550 }, { "epoch": 1.43, "grad_norm": 11.173103332519531, "learning_rate": 5.704000000000001e-06, "loss": 1.1413, "step": 3575 }, { "epoch": 1.44, "grad_norm": 14.497014999389648, "learning_rate": 5.604000000000001e-06, "loss": 1.0769, "step": 3600 }, { "epoch": 1.45, "grad_norm": 10.242537498474121, "learning_rate": 5.504e-06, "loss": 1.077, "step": 3625 }, { "epoch": 1.46, "grad_norm": 14.58755874633789, "learning_rate": 5.404e-06, "loss": 1.0362, "step": 3650 }, { "epoch": 1.47, "grad_norm": 11.268484115600586, "learning_rate": 5.304e-06, "loss": 1.1701, "step": 3675 }, { "epoch": 1.48, "grad_norm": 12.811514854431152, "learning_rate": 5.2040000000000005e-06, "loss": 1.1419, "step": 3700 }, { "epoch": 1.49, "grad_norm": 13.174732208251953, "learning_rate": 5.104e-06, "loss": 0.9834, "step": 3725 }, { "epoch": 1.5, "grad_norm": 14.920330047607422, "learning_rate": 5.004e-06, "loss": 1.1624, "step": 3750 }, { "epoch": 1.51, "grad_norm": 10.778399467468262, "learning_rate": 4.904000000000001e-06, "loss": 1.0573, "step": 3775 }, { "epoch": 1.52, "grad_norm": 13.889476776123047, "learning_rate": 4.804e-06, "loss": 1.1256, "step": 3800 }, { "epoch": 1.53, "grad_norm": 14.520740509033203, "learning_rate": 4.704e-06, "loss": 1.0834, "step": 3825 }, { "epoch": 1.54, "grad_norm": 12.798078536987305, "learning_rate": 4.604e-06, "loss": 1.0705, "step": 3850 }, { "epoch": 1.55, "grad_norm": 11.737931251525879, "learning_rate": 4.504e-06, "loss": 1.1076, "step": 3875 }, { "epoch": 1.56, "grad_norm": 12.435004234313965, "learning_rate": 4.4040000000000005e-06, "loss": 1.0675, "step": 3900 }, { "epoch": 1.5699999999999998, "grad_norm": 11.721802711486816, "learning_rate": 4.304000000000001e-06, "loss": 1.1715, "step": 3925 }, { "epoch": 1.58, "grad_norm": 16.310001373291016, "learning_rate": 4.204e-06, "loss": 1.0577, "step": 3950 }, { "epoch": 1.5899999999999999, "grad_norm": 9.448426246643066, "learning_rate": 4.104e-06, "loss": 0.9881, "step": 3975 }, { "epoch": 1.6, "grad_norm": 15.235527992248535, "learning_rate": 4.004e-06, "loss": 0.9509, "step": 4000 }, { "epoch": 1.6099999999999999, "grad_norm": 11.758926391601562, "learning_rate": 3.904e-06, "loss": 1.017, "step": 4025 }, { "epoch": 1.62, "grad_norm": 12.346951484680176, "learning_rate": 3.8040000000000003e-06, "loss": 1.1987, "step": 4050 }, { "epoch": 1.63, "grad_norm": 10.503011703491211, "learning_rate": 3.7040000000000005e-06, "loss": 1.0665, "step": 4075 }, { "epoch": 1.6400000000000001, "grad_norm": 9.963557243347168, "learning_rate": 3.604e-06, "loss": 1.1667, "step": 4100 }, { "epoch": 1.65, "grad_norm": 15.800553321838379, "learning_rate": 3.5040000000000002e-06, "loss": 1.0871, "step": 4125 }, { "epoch": 1.6600000000000001, "grad_norm": 14.133501052856445, "learning_rate": 3.404e-06, "loss": 1.0588, "step": 4150 }, { "epoch": 1.67, "grad_norm": 15.049243927001953, "learning_rate": 3.3040000000000005e-06, "loss": 1.1755, "step": 4175 }, { "epoch": 1.6800000000000002, "grad_norm": 11.534449577331543, "learning_rate": 3.2040000000000006e-06, "loss": 1.0809, "step": 4200 }, { "epoch": 1.69, "grad_norm": 12.926283836364746, "learning_rate": 3.1040000000000003e-06, "loss": 1.1184, "step": 4225 }, { "epoch": 1.7, "grad_norm": 9.016489028930664, "learning_rate": 3.0040000000000004e-06, "loss": 0.9284, "step": 4250 }, { "epoch": 1.71, "grad_norm": 16.352996826171875, "learning_rate": 2.904e-06, "loss": 0.8954, "step": 4275 }, { "epoch": 1.72, "grad_norm": 11.342592239379883, "learning_rate": 2.804e-06, "loss": 1.0614, "step": 4300 }, { "epoch": 1.73, "grad_norm": 11.883009910583496, "learning_rate": 2.704e-06, "loss": 1.006, "step": 4325 }, { "epoch": 1.74, "grad_norm": 11.741077423095703, "learning_rate": 2.6040000000000004e-06, "loss": 1.1116, "step": 4350 }, { "epoch": 1.75, "grad_norm": 11.227665901184082, "learning_rate": 2.5040000000000005e-06, "loss": 1.0802, "step": 4375 }, { "epoch": 1.76, "grad_norm": 12.50661849975586, "learning_rate": 2.404e-06, "loss": 0.9816, "step": 4400 }, { "epoch": 1.77, "grad_norm": 13.356459617614746, "learning_rate": 2.3040000000000003e-06, "loss": 1.0844, "step": 4425 }, { "epoch": 1.78, "grad_norm": 10.1768159866333, "learning_rate": 2.2040000000000004e-06, "loss": 0.9909, "step": 4450 }, { "epoch": 1.79, "grad_norm": 9.657759666442871, "learning_rate": 2.104e-06, "loss": 0.9877, "step": 4475 }, { "epoch": 1.8, "grad_norm": 14.901777267456055, "learning_rate": 2.004e-06, "loss": 0.9987, "step": 4500 }, { "epoch": 1.81, "grad_norm": 17.132478713989258, "learning_rate": 1.9040000000000003e-06, "loss": 1.0573, "step": 4525 }, { "epoch": 1.8199999999999998, "grad_norm": 13.090239524841309, "learning_rate": 1.8040000000000002e-06, "loss": 0.9955, "step": 4550 }, { "epoch": 1.83, "grad_norm": 7.524806022644043, "learning_rate": 1.7040000000000001e-06, "loss": 1.0188, "step": 4575 }, { "epoch": 1.8399999999999999, "grad_norm": 14.137269973754883, "learning_rate": 1.604e-06, "loss": 0.9507, "step": 4600 }, { "epoch": 1.85, "grad_norm": 15.662611961364746, "learning_rate": 1.5040000000000001e-06, "loss": 1.0512, "step": 4625 }, { "epoch": 1.8599999999999999, "grad_norm": 13.104068756103516, "learning_rate": 1.404e-06, "loss": 1.0257, "step": 4650 }, { "epoch": 1.87, "grad_norm": 8.180632591247559, "learning_rate": 1.304e-06, "loss": 1.1014, "step": 4675 }, { "epoch": 1.88, "grad_norm": 9.715453147888184, "learning_rate": 1.204e-06, "loss": 0.9764, "step": 4700 }, { "epoch": 1.8900000000000001, "grad_norm": 10.140015602111816, "learning_rate": 1.1040000000000001e-06, "loss": 1.1436, "step": 4725 }, { "epoch": 1.9, "grad_norm": 13.225520133972168, "learning_rate": 1.004e-06, "loss": 1.021, "step": 4750 }, { "epoch": 1.9100000000000001, "grad_norm": 11.760268211364746, "learning_rate": 9.04e-07, "loss": 1.0674, "step": 4775 }, { "epoch": 1.92, "grad_norm": 10.296557426452637, "learning_rate": 8.04e-07, "loss": 1.044, "step": 4800 }, { "epoch": 1.9300000000000002, "grad_norm": 11.622244834899902, "learning_rate": 7.040000000000001e-07, "loss": 1.0317, "step": 4825 }, { "epoch": 1.94, "grad_norm": 14.334859848022461, "learning_rate": 6.040000000000001e-07, "loss": 1.1214, "step": 4850 }, { "epoch": 1.95, "grad_norm": 10.96068286895752, "learning_rate": 5.040000000000001e-07, "loss": 1.1726, "step": 4875 }, { "epoch": 1.96, "grad_norm": 14.571761131286621, "learning_rate": 4.04e-07, "loss": 0.9106, "step": 4900 }, { "epoch": 1.97, "grad_norm": 11.797457695007324, "learning_rate": 3.04e-07, "loss": 1.1328, "step": 4925 }, { "epoch": 1.98, "grad_norm": 11.585701942443848, "learning_rate": 2.0400000000000003e-07, "loss": 1.0677, "step": 4950 }, { "epoch": 1.99, "grad_norm": 15.56652545928955, "learning_rate": 1.04e-07, "loss": 1.0312, "step": 4975 }, { "epoch": 2.0, "grad_norm": 8.480283737182617, "learning_rate": 4e-09, "loss": 0.8741, "step": 5000 } ], "logging_steps": 25, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1142437888e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }