{ "best_global_step": 7500, "best_metric": 0.04980416223406792, "best_model_checkpoint": "byt5-xl-ocr-finetuned/checkpoint-7500", "epoch": 3.0, "eval_steps": 500, "global_step": 8235, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018214936247723135, "grad_norm": 69.06980895996094, "learning_rate": 5.946601941747574e-06, "loss": 5.2749, "step": 50 }, { "epoch": 0.03642987249544627, "grad_norm": 16.915973663330078, "learning_rate": 1.2014563106796117e-05, "loss": 1.5737, "step": 100 }, { "epoch": 0.0546448087431694, "grad_norm": 1.5618306398391724, "learning_rate": 1.808252427184466e-05, "loss": 0.5764, "step": 150 }, { "epoch": 0.07285974499089254, "grad_norm": 1.9782780408859253, "learning_rate": 2.4150485436893205e-05, "loss": 0.3942, "step": 200 }, { "epoch": 0.09107468123861566, "grad_norm": 2.0355050563812256, "learning_rate": 3.0218446601941746e-05, "loss": 0.2724, "step": 250 }, { "epoch": 0.1092896174863388, "grad_norm": 0.4324137568473816, "learning_rate": 3.62864077669903e-05, "loss": 0.2329, "step": 300 }, { "epoch": 0.12750455373406194, "grad_norm": 0.41908925771713257, "learning_rate": 4.235436893203884e-05, "loss": 0.2006, "step": 350 }, { "epoch": 0.14571948998178508, "grad_norm": 0.7745330333709717, "learning_rate": 4.8422330097087385e-05, "loss": 0.1838, "step": 400 }, { "epoch": 0.16393442622950818, "grad_norm": 0.4542683959007263, "learning_rate": 4.999724032108263e-05, "loss": 0.1702, "step": 450 }, { "epoch": 0.18214936247723132, "grad_norm": 0.34284254908561707, "learning_rate": 4.998474341173774e-05, "loss": 0.157, "step": 500 }, { "epoch": 0.18214936247723132, "eval_loss": 0.1169700026512146, "eval_runtime": 487.9364, "eval_samples_per_second": 20.003, "eval_steps_per_second": 5.001, "step": 500 }, { "epoch": 0.20036429872495445, "grad_norm": 0.3513261079788208, "learning_rate": 4.996217362852348e-05, "loss": 0.1655, "step": 550 }, { "epoch": 0.2185792349726776, "grad_norm": 0.2953558564186096, "learning_rate": 4.992954007069597e-05, "loss": 0.149, "step": 600 }, { "epoch": 0.23679417122040072, "grad_norm": 0.23343749344348907, "learning_rate": 4.988685589483267e-05, "loss": 0.1448, "step": 650 }, { "epoch": 0.2550091074681239, "grad_norm": 0.17609573900699615, "learning_rate": 4.9834138309528155e-05, "loss": 0.1311, "step": 700 }, { "epoch": 0.273224043715847, "grad_norm": 0.22480589151382446, "learning_rate": 4.977140856845625e-05, "loss": 0.1345, "step": 750 }, { "epoch": 0.29143897996357016, "grad_norm": 0.2684943974018097, "learning_rate": 4.969869196180143e-05, "loss": 0.1335, "step": 800 }, { "epoch": 0.30965391621129323, "grad_norm": 0.22488351166248322, "learning_rate": 4.9616017806062764e-05, "loss": 0.1274, "step": 850 }, { "epoch": 0.32786885245901637, "grad_norm": 0.16293711960315704, "learning_rate": 4.952341943223466e-05, "loss": 0.1219, "step": 900 }, { "epoch": 0.3460837887067395, "grad_norm": 0.2496214509010315, "learning_rate": 4.942093417236912e-05, "loss": 0.1208, "step": 950 }, { "epoch": 0.36429872495446264, "grad_norm": 0.21169240772724152, "learning_rate": 4.930860334452487e-05, "loss": 0.1191, "step": 1000 }, { "epoch": 0.36429872495446264, "eval_loss": 0.08914350718259811, "eval_runtime": 489.8598, "eval_samples_per_second": 19.924, "eval_steps_per_second": 4.981, "step": 1000 }, { "epoch": 0.3825136612021858, "grad_norm": 0.1340503990650177, "learning_rate": 4.918647223610961e-05, "loss": 0.1182, "step": 1050 }, { "epoch": 0.4007285974499089, "grad_norm": 0.47607916593551636, "learning_rate": 4.905459008562181e-05, "loss": 0.1137, "step": 1100 }, { "epoch": 0.41894353369763204, "grad_norm": 0.17456993460655212, "learning_rate": 4.8913010062799726e-05, "loss": 0.1125, "step": 1150 }, { "epoch": 0.4371584699453552, "grad_norm": 0.21866342425346375, "learning_rate": 4.8761789247185405e-05, "loss": 0.1132, "step": 1200 }, { "epoch": 0.4553734061930783, "grad_norm": 0.29610705375671387, "learning_rate": 4.860098860511248e-05, "loss": 0.1086, "step": 1250 }, { "epoch": 0.47358834244080145, "grad_norm": 0.21859906613826752, "learning_rate": 4.8430672965126864e-05, "loss": 0.1053, "step": 1300 }, { "epoch": 0.4918032786885246, "grad_norm": 0.28005993366241455, "learning_rate": 4.8250910991850444e-05, "loss": 0.1039, "step": 1350 }, { "epoch": 0.5100182149362478, "grad_norm": 0.24446171522140503, "learning_rate": 4.806177515829821e-05, "loss": 0.1018, "step": 1400 }, { "epoch": 0.5282331511839709, "grad_norm": 0.2147260159254074, "learning_rate": 4.7863341716659904e-05, "loss": 0.1001, "step": 1450 }, { "epoch": 0.546448087431694, "grad_norm": 0.33932870626449585, "learning_rate": 4.7655690667558156e-05, "loss": 0.1061, "step": 1500 }, { "epoch": 0.546448087431694, "eval_loss": 0.0756722092628479, "eval_runtime": 490.276, "eval_samples_per_second": 19.907, "eval_steps_per_second": 4.977, "step": 1500 }, { "epoch": 0.5646630236794171, "grad_norm": 0.14253345131874084, "learning_rate": 4.743890572779534e-05, "loss": 0.0966, "step": 1550 }, { "epoch": 0.5828779599271403, "grad_norm": 0.5406768918037415, "learning_rate": 4.72130742966022e-05, "loss": 0.1063, "step": 1600 }, { "epoch": 0.6010928961748634, "grad_norm": 0.13504968583583832, "learning_rate": 4.697828742040194e-05, "loss": 0.0989, "step": 1650 }, { "epoch": 0.6193078324225865, "grad_norm": 0.20972897112369537, "learning_rate": 4.673463975610385e-05, "loss": 0.1004, "step": 1700 }, { "epoch": 0.6375227686703097, "grad_norm": 0.15182198584079742, "learning_rate": 4.648222953294127e-05, "loss": 0.0949, "step": 1750 }, { "epoch": 0.6557377049180327, "grad_norm": 0.17607465386390686, "learning_rate": 4.622115851286945e-05, "loss": 0.0947, "step": 1800 }, { "epoch": 0.6739526411657559, "grad_norm": 0.14970912039279938, "learning_rate": 4.5951531949539126e-05, "loss": 0.1002, "step": 1850 }, { "epoch": 0.692167577413479, "grad_norm": 0.1299428790807724, "learning_rate": 4.5673458545862266e-05, "loss": 0.087, "step": 1900 }, { "epoch": 0.7103825136612022, "grad_norm": 0.16123345494270325, "learning_rate": 4.53870504101874e-05, "loss": 0.0939, "step": 1950 }, { "epoch": 0.7285974499089253, "grad_norm": 0.15291787683963776, "learning_rate": 4.50924230111018e-05, "loss": 0.0938, "step": 2000 }, { "epoch": 0.7285974499089253, "eval_loss": 0.06830848753452301, "eval_runtime": 490.1531, "eval_samples_per_second": 19.912, "eval_steps_per_second": 4.978, "step": 2000 }, { "epoch": 0.7468123861566485, "grad_norm": 0.16285483539104462, "learning_rate": 4.4789695130879156e-05, "loss": 0.0948, "step": 2050 }, { "epoch": 0.7650273224043715, "grad_norm": 0.1297728419303894, "learning_rate": 4.447898881759111e-05, "loss": 0.0827, "step": 2100 }, { "epoch": 0.7832422586520947, "grad_norm": 0.17550143599510193, "learning_rate": 4.416042933590229e-05, "loss": 0.0868, "step": 2150 }, { "epoch": 0.8014571948998178, "grad_norm": 0.14913193881511688, "learning_rate": 4.383414511656846e-05, "loss": 0.0905, "step": 2200 }, { "epoch": 0.819672131147541, "grad_norm": 0.1405882090330124, "learning_rate": 4.350026770465826e-05, "loss": 0.088, "step": 2250 }, { "epoch": 0.8378870673952641, "grad_norm": 0.17031477391719818, "learning_rate": 4.3158931706519345e-05, "loss": 0.0813, "step": 2300 }, { "epoch": 0.8561020036429873, "grad_norm": 0.14652223885059357, "learning_rate": 4.281027473551039e-05, "loss": 0.087, "step": 2350 }, { "epoch": 0.8743169398907104, "grad_norm": 0.1687142699956894, "learning_rate": 4.245443735652073e-05, "loss": 0.0849, "step": 2400 }, { "epoch": 0.8925318761384335, "grad_norm": 0.1347888559103012, "learning_rate": 4.209156302930006e-05, "loss": 0.0815, "step": 2450 }, { "epoch": 0.9107468123861566, "grad_norm": 0.18572378158569336, "learning_rate": 4.172179805062113e-05, "loss": 0.0852, "step": 2500 }, { "epoch": 0.9107468123861566, "eval_loss": 0.0633399486541748, "eval_runtime": 488.2149, "eval_samples_per_second": 19.991, "eval_steps_per_second": 4.998, "step": 2500 }, { "epoch": 0.9289617486338798, "grad_norm": 0.23590320348739624, "learning_rate": 4.134529149529852e-05, "loss": 0.0821, "step": 2550 }, { "epoch": 0.9471766848816029, "grad_norm": 0.15503446757793427, "learning_rate": 4.096219515608751e-05, "loss": 0.0794, "step": 2600 }, { "epoch": 0.9653916211293261, "grad_norm": 0.13011206686496735, "learning_rate": 4.05726634824872e-05, "loss": 0.082, "step": 2650 }, { "epoch": 0.9836065573770492, "grad_norm": 0.16497080028057098, "learning_rate": 4.017685351847245e-05, "loss": 0.0777, "step": 2700 }, { "epoch": 1.0018214936247722, "grad_norm": 0.1528683304786682, "learning_rate": 3.977492483917988e-05, "loss": 0.0877, "step": 2750 }, { "epoch": 1.0200364298724955, "grad_norm": 0.13187021017074585, "learning_rate": 3.9367039486573446e-05, "loss": 0.0762, "step": 2800 }, { "epoch": 1.0382513661202186, "grad_norm": 0.1285446584224701, "learning_rate": 3.895336190411539e-05, "loss": 0.0692, "step": 2850 }, { "epoch": 1.0564663023679417, "grad_norm": 0.1444154679775238, "learning_rate": 3.8534058870469095e-05, "loss": 0.0726, "step": 2900 }, { "epoch": 1.0746812386156648, "grad_norm": 0.13176533579826355, "learning_rate": 3.8109299432260356e-05, "loss": 0.0792, "step": 2950 }, { "epoch": 1.092896174863388, "grad_norm": 0.17461608350276947, "learning_rate": 3.767925483592448e-05, "loss": 0.0817, "step": 3000 }, { "epoch": 1.092896174863388, "eval_loss": 0.060991112142801285, "eval_runtime": 489.3008, "eval_samples_per_second": 19.947, "eval_steps_per_second": 4.987, "step": 3000 }, { "epoch": 1.1111111111111112, "grad_norm": 0.09941103309392929, "learning_rate": 3.7244098458666334e-05, "loss": 0.0763, "step": 3050 }, { "epoch": 1.1293260473588342, "grad_norm": 0.15530474483966827, "learning_rate": 3.6804005738561456e-05, "loss": 0.0734, "step": 3100 }, { "epoch": 1.1475409836065573, "grad_norm": 0.15263348817825317, "learning_rate": 3.6359154103826205e-05, "loss": 0.0725, "step": 3150 }, { "epoch": 1.1657559198542806, "grad_norm": 0.1578466147184372, "learning_rate": 3.590972290128571e-05, "loss": 0.0759, "step": 3200 }, { "epoch": 1.1839708561020037, "grad_norm": 0.12527093291282654, "learning_rate": 3.545589332406819e-05, "loss": 0.0703, "step": 3250 }, { "epoch": 1.2021857923497268, "grad_norm": 0.15813329815864563, "learning_rate": 3.499784833855492e-05, "loss": 0.0681, "step": 3300 }, { "epoch": 1.2204007285974499, "grad_norm": 0.11386945098638535, "learning_rate": 3.453577261061537e-05, "loss": 0.0708, "step": 3350 }, { "epoch": 1.238615664845173, "grad_norm": 0.14177338778972626, "learning_rate": 3.4069852431157117e-05, "loss": 0.0711, "step": 3400 }, { "epoch": 1.2568306010928962, "grad_norm": 0.1626819372177124, "learning_rate": 3.3600275641020605e-05, "loss": 0.0694, "step": 3450 }, { "epoch": 1.2750455373406193, "grad_norm": 0.1577647477388382, "learning_rate": 3.312723155524906e-05, "loss": 0.0695, "step": 3500 }, { "epoch": 1.2750455373406193, "eval_loss": 0.05769478902220726, "eval_runtime": 488.439, "eval_samples_per_second": 19.982, "eval_steps_per_second": 4.996, "step": 3500 }, { "epoch": 1.2932604735883424, "grad_norm": 0.15615570545196533, "learning_rate": 3.265091088676406e-05, "loss": 0.0699, "step": 3550 }, { "epoch": 1.3114754098360657, "grad_norm": 0.12867802381515503, "learning_rate": 3.217150566947749e-05, "loss": 0.0641, "step": 3600 }, { "epoch": 1.3296903460837888, "grad_norm": 0.10840287804603577, "learning_rate": 3.168920918087099e-05, "loss": 0.0676, "step": 3650 }, { "epoch": 1.3479052823315119, "grad_norm": 0.17157740890979767, "learning_rate": 3.1204215864074006e-05, "loss": 0.0608, "step": 3700 }, { "epoch": 1.366120218579235, "grad_norm": 0.08710721135139465, "learning_rate": 3.0716721249471905e-05, "loss": 0.0626, "step": 3750 }, { "epoch": 1.384335154826958, "grad_norm": 0.173573300242424, "learning_rate": 3.022692187587576e-05, "loss": 0.0638, "step": 3800 }, { "epoch": 1.4025500910746813, "grad_norm": 0.1484445184469223, "learning_rate": 2.9735015211285528e-05, "loss": 0.0709, "step": 3850 }, { "epoch": 1.4207650273224044, "grad_norm": 0.1358584463596344, "learning_rate": 2.9241199573278734e-05, "loss": 0.0693, "step": 3900 }, { "epoch": 1.4389799635701275, "grad_norm": 0.11952122300863266, "learning_rate": 2.8745674049056486e-05, "loss": 0.0695, "step": 3950 }, { "epoch": 1.4571948998178508, "grad_norm": 0.07695221155881882, "learning_rate": 2.8248638415179308e-05, "loss": 0.0625, "step": 4000 }, { "epoch": 1.4571948998178508, "eval_loss": 0.056611545383930206, "eval_runtime": 489.9552, "eval_samples_per_second": 19.92, "eval_steps_per_second": 4.98, "step": 4000 }, { "epoch": 1.4754098360655736, "grad_norm": 0.1474759578704834, "learning_rate": 2.7750293057025035e-05, "loss": 0.0727, "step": 4050 }, { "epoch": 1.493624772313297, "grad_norm": 0.11704400926828384, "learning_rate": 2.725083888800124e-05, "loss": 0.0628, "step": 4100 }, { "epoch": 1.51183970856102, "grad_norm": 0.18205316364765167, "learning_rate": 2.6750477268544777e-05, "loss": 0.0663, "step": 4150 }, { "epoch": 1.530054644808743, "grad_norm": 0.13981305062770844, "learning_rate": 2.6249409924941104e-05, "loss": 0.0647, "step": 4200 }, { "epoch": 1.5482695810564664, "grad_norm": 0.15816594660282135, "learning_rate": 2.5747838867996154e-05, "loss": 0.0713, "step": 4250 }, { "epoch": 1.5664845173041895, "grad_norm": 0.13389204442501068, "learning_rate": 2.5245966311593405e-05, "loss": 0.066, "step": 4300 }, { "epoch": 1.5846994535519126, "grad_norm": 0.1280335932970047, "learning_rate": 2.474399459116916e-05, "loss": 0.0658, "step": 4350 }, { "epoch": 1.6029143897996359, "grad_norm": 0.15096668899059296, "learning_rate": 2.4242126082138785e-05, "loss": 0.0683, "step": 4400 }, { "epoch": 1.6211293260473587, "grad_norm": 0.14366798102855682, "learning_rate": 2.3740563118306826e-05, "loss": 0.0628, "step": 4450 }, { "epoch": 1.639344262295082, "grad_norm": 0.13515126705169678, "learning_rate": 2.323950791029397e-05, "loss": 0.0664, "step": 4500 }, { "epoch": 1.639344262295082, "eval_loss": 0.0539417639374733, "eval_runtime": 489.8066, "eval_samples_per_second": 19.926, "eval_steps_per_second": 4.982, "step": 4500 }, { "epoch": 1.657559198542805, "grad_norm": 0.1308237463235855, "learning_rate": 2.2739162464013526e-05, "loss": 0.0657, "step": 4550 }, { "epoch": 1.6757741347905282, "grad_norm": 0.14330258965492249, "learning_rate": 2.2239728499230543e-05, "loss": 0.0689, "step": 4600 }, { "epoch": 1.6939890710382515, "grad_norm": 0.16095995903015137, "learning_rate": 2.1741407368236353e-05, "loss": 0.0698, "step": 4650 }, { "epoch": 1.7122040072859745, "grad_norm": 0.1826489120721817, "learning_rate": 2.1244399974671055e-05, "loss": 0.0662, "step": 4700 }, { "epoch": 1.7304189435336976, "grad_norm": 0.11087053269147873, "learning_rate": 2.0748906692527108e-05, "loss": 0.0585, "step": 4750 }, { "epoch": 1.748633879781421, "grad_norm": 0.1652335375547409, "learning_rate": 2.0255127285366263e-05, "loss": 0.0667, "step": 4800 }, { "epoch": 1.7668488160291438, "grad_norm": 0.10961362719535828, "learning_rate": 1.97632608257828e-05, "loss": 0.0644, "step": 4850 }, { "epoch": 1.785063752276867, "grad_norm": 0.14005999267101288, "learning_rate": 1.927350561514512e-05, "loss": 0.0638, "step": 4900 }, { "epoch": 1.8032786885245902, "grad_norm": 0.16552738845348358, "learning_rate": 1.8786059103648417e-05, "loss": 0.0635, "step": 4950 }, { "epoch": 1.8214936247723132, "grad_norm": 0.11428548395633698, "learning_rate": 1.830111781071047e-05, "loss": 0.0666, "step": 5000 }, { "epoch": 1.8214936247723132, "eval_loss": 0.05203519016504288, "eval_runtime": 489.7939, "eval_samples_per_second": 19.927, "eval_steps_per_second": 4.982, "step": 5000 }, { "epoch": 1.8397085610200365, "grad_norm": 0.1313815712928772, "learning_rate": 1.781887724574265e-05, "loss": 0.061, "step": 5050 }, { "epoch": 1.8579234972677594, "grad_norm": 0.14089354872703552, "learning_rate": 1.7339531829328163e-05, "loss": 0.0593, "step": 5100 }, { "epoch": 1.8761384335154827, "grad_norm": 0.145121231675148, "learning_rate": 1.6863274814839282e-05, "loss": 0.0635, "step": 5150 }, { "epoch": 1.8943533697632058, "grad_norm": 0.14890490472316742, "learning_rate": 1.6390298210525095e-05, "loss": 0.0609, "step": 5200 }, { "epoch": 1.9125683060109289, "grad_norm": 0.10599125921726227, "learning_rate": 1.5920792702101273e-05, "loss": 0.0655, "step": 5250 }, { "epoch": 1.9307832422586522, "grad_norm": 0.13904713094234467, "learning_rate": 1.5454947575873034e-05, "loss": 0.0614, "step": 5300 }, { "epoch": 1.9489981785063752, "grad_norm": 0.11440698057413101, "learning_rate": 1.499295064242229e-05, "loss": 0.0609, "step": 5350 }, { "epoch": 1.9672131147540983, "grad_norm": 0.11769060790538788, "learning_rate": 1.4534988160889767e-05, "loss": 0.0597, "step": 5400 }, { "epoch": 1.9854280510018216, "grad_norm": 0.1365710347890854, "learning_rate": 1.4081244763882529e-05, "loss": 0.0598, "step": 5450 }, { "epoch": 2.0036429872495445, "grad_norm": 0.089080311357975, "learning_rate": 1.363190338303737e-05, "loss": 0.0612, "step": 5500 }, { "epoch": 2.0036429872495445, "eval_loss": 0.05140851065516472, "eval_runtime": 489.7113, "eval_samples_per_second": 19.93, "eval_steps_per_second": 4.983, "step": 5500 }, { "epoch": 2.021857923497268, "grad_norm": 0.11200718581676483, "learning_rate": 1.3187145175269893e-05, "loss": 0.0547, "step": 5550 }, { "epoch": 2.040072859744991, "grad_norm": 0.19140245020389557, "learning_rate": 1.274714944973912e-05, "loss": 0.0571, "step": 5600 }, { "epoch": 2.058287795992714, "grad_norm": 0.34514617919921875, "learning_rate": 1.2312093595557001e-05, "loss": 0.056, "step": 5650 }, { "epoch": 2.0765027322404372, "grad_norm": 0.14967010915279388, "learning_rate": 1.1882153010272049e-05, "loss": 0.0565, "step": 5700 }, { "epoch": 2.09471766848816, "grad_norm": 0.13327553868293762, "learning_rate": 1.1457501029155978e-05, "loss": 0.0546, "step": 5750 }, { "epoch": 2.1129326047358834, "grad_norm": 0.10935332626104355, "learning_rate": 1.1038308855321542e-05, "loss": 0.0582, "step": 5800 }, { "epoch": 2.1311475409836067, "grad_norm": 0.09595629572868347, "learning_rate": 1.0624745490700228e-05, "loss": 0.0563, "step": 5850 }, { "epoch": 2.1493624772313296, "grad_norm": 0.14973677694797516, "learning_rate": 1.0216977667907232e-05, "loss": 0.0587, "step": 5900 }, { "epoch": 2.167577413479053, "grad_norm": 0.12422586977481842, "learning_rate": 9.815169783021347e-06, "loss": 0.0569, "step": 5950 }, { "epoch": 2.185792349726776, "grad_norm": 0.1235915794968605, "learning_rate": 9.419483829306938e-06, "loss": 0.0558, "step": 6000 }, { "epoch": 2.185792349726776, "eval_loss": 0.0504976250231266, "eval_runtime": 489.7715, "eval_samples_per_second": 19.928, "eval_steps_per_second": 4.982, "step": 6000 }, { "epoch": 2.204007285974499, "grad_norm": 0.14746901392936707, "learning_rate": 9.030079331904512e-06, "loss": 0.0528, "step": 6050 }, { "epoch": 2.2222222222222223, "grad_norm": 0.12787629663944244, "learning_rate": 8.647113283516454e-06, "loss": 0.055, "step": 6100 }, { "epoch": 2.240437158469945, "grad_norm": 0.14677022397518158, "learning_rate": 8.270740081113684e-06, "loss": 0.0568, "step": 6150 }, { "epoch": 2.2586520947176685, "grad_norm": 0.15499144792556763, "learning_rate": 7.90111146368878e-06, "loss": 0.0551, "step": 6200 }, { "epoch": 2.276867030965392, "grad_norm": 0.13909883797168732, "learning_rate": 7.5383764510807975e-06, "loss": 0.057, "step": 6250 }, { "epoch": 2.2950819672131146, "grad_norm": 0.11617127805948257, "learning_rate": 7.182681283896253e-06, "loss": 0.0537, "step": 6300 }, { "epoch": 2.313296903460838, "grad_norm": 0.14818504452705383, "learning_rate": 6.834169364550597e-06, "loss": 0.052, "step": 6350 }, { "epoch": 2.3315118397085612, "grad_norm": 0.10602527111768723, "learning_rate": 6.492981199453996e-06, "loss": 0.0525, "step": 6400 }, { "epoch": 2.349726775956284, "grad_norm": 0.10226523876190186, "learning_rate": 6.159254342364609e-06, "loss": 0.0538, "step": 6450 }, { "epoch": 2.3679417122040074, "grad_norm": 0.10851209610700607, "learning_rate": 5.833123338932256e-06, "loss": 0.0571, "step": 6500 }, { "epoch": 2.3679417122040074, "eval_loss": 0.05036979168653488, "eval_runtime": 489.7412, "eval_samples_per_second": 19.929, "eval_steps_per_second": 4.982, "step": 6500 }, { "epoch": 2.3861566484517303, "grad_norm": 0.1328704059123993, "learning_rate": 5.51471967245491e-06, "loss": 0.0556, "step": 6550 }, { "epoch": 2.4043715846994536, "grad_norm": 0.09755656123161316, "learning_rate": 5.2041717108697065e-06, "loss": 0.0514, "step": 6600 }, { "epoch": 2.422586520947177, "grad_norm": 0.10794492810964584, "learning_rate": 4.90160465500005e-06, "loss": 0.0529, "step": 6650 }, { "epoch": 2.4408014571948997, "grad_norm": 0.11240658164024353, "learning_rate": 4.607140488079492e-06, "loss": 0.0536, "step": 6700 }, { "epoch": 2.459016393442623, "grad_norm": 0.11631111800670624, "learning_rate": 4.320897926572853e-06, "loss": 0.0522, "step": 6750 }, { "epoch": 2.477231329690346, "grad_norm": 0.08789575099945068, "learning_rate": 4.0429923723143915e-06, "loss": 0.0489, "step": 6800 }, { "epoch": 2.495446265938069, "grad_norm": 0.10507268458604813, "learning_rate": 3.7735358659822752e-06, "loss": 0.05, "step": 6850 }, { "epoch": 2.5136612021857925, "grad_norm": 0.10618982464075089, "learning_rate": 3.5126370419281436e-06, "loss": 0.0605, "step": 6900 }, { "epoch": 2.5318761384335153, "grad_norm": 0.1154901459813118, "learning_rate": 3.260401084379991e-06, "loss": 0.053, "step": 6950 }, { "epoch": 2.5500910746812386, "grad_norm": 0.1327841877937317, "learning_rate": 3.0169296850359878e-06, "loss": 0.056, "step": 7000 }, { "epoch": 2.5500910746812386, "eval_loss": 0.049849580973386765, "eval_runtime": 489.6947, "eval_samples_per_second": 19.931, "eval_steps_per_second": 4.983, "step": 7000 }, { "epoch": 2.5683060109289615, "grad_norm": 0.12928417325019836, "learning_rate": 2.782321002066332e-06, "loss": 0.0568, "step": 7050 }, { "epoch": 2.586520947176685, "grad_norm": 0.14363490045070648, "learning_rate": 2.556669620539734e-06, "loss": 0.054, "step": 7100 }, { "epoch": 2.604735883424408, "grad_norm": 0.12806904315948486, "learning_rate": 2.3400665142903927e-06, "loss": 0.0525, "step": 7150 }, { "epoch": 2.6229508196721314, "grad_norm": 0.11711638420820236, "learning_rate": 2.132599009240882e-06, "loss": 0.0544, "step": 7200 }, { "epoch": 2.6411657559198543, "grad_norm": 0.11255411803722382, "learning_rate": 1.9343507481957846e-06, "loss": 0.0507, "step": 7250 }, { "epoch": 2.6593806921675776, "grad_norm": 0.13773201406002045, "learning_rate": 1.7454016571201186e-06, "loss": 0.0534, "step": 7300 }, { "epoch": 2.6775956284153004, "grad_norm": 0.11689828336238861, "learning_rate": 1.5658279129163706e-06, "loss": 0.0562, "step": 7350 }, { "epoch": 2.6958105646630237, "grad_norm": 0.1306321620941162, "learning_rate": 1.3957019127128851e-06, "loss": 0.0493, "step": 7400 }, { "epoch": 2.714025500910747, "grad_norm": 0.12783832848072052, "learning_rate": 1.235092244676192e-06, "loss": 0.0541, "step": 7450 }, { "epoch": 2.73224043715847, "grad_norm": 0.11963380128145218, "learning_rate": 1.0840636603589444e-06, "loss": 0.0541, "step": 7500 }, { "epoch": 2.73224043715847, "eval_loss": 0.04980416223406792, "eval_runtime": 489.8577, "eval_samples_per_second": 19.924, "eval_steps_per_second": 4.981, "step": 7500 }, { "epoch": 2.750455373406193, "grad_norm": 0.12381980568170547, "learning_rate": 9.426770485945924e-07, "loss": 0.0541, "step": 7550 }, { "epoch": 2.768670309653916, "grad_norm": 0.14063633978366852, "learning_rate": 8.109894109493976e-07, "loss": 0.0511, "step": 7600 }, { "epoch": 2.7868852459016393, "grad_norm": 0.10304458439350128, "learning_rate": 6.890538387416212e-07, "loss": 0.0533, "step": 7650 }, { "epoch": 2.8051001821493626, "grad_norm": 0.16216354072093964, "learning_rate": 5.76919491637179e-07, "loss": 0.0598, "step": 7700 }, { "epoch": 2.8233151183970855, "grad_norm": 0.13660581409931183, "learning_rate": 4.746315778303756e-07, "loss": 0.0531, "step": 7750 }, { "epoch": 2.841530054644809, "grad_norm": 0.28685104846954346, "learning_rate": 3.8223133581772595e-07, "loss": 0.0542, "step": 7800 }, { "epoch": 2.8597449908925316, "grad_norm": 0.11297158896923065, "learning_rate": 2.9975601777219863e-07, "loss": 0.0453, "step": 7850 }, { "epoch": 2.877959927140255, "grad_norm": 0.1289464831352234, "learning_rate": 2.2723887452461013e-07, "loss": 0.0527, "step": 7900 }, { "epoch": 2.8961748633879782, "grad_norm": 0.13939346373081207, "learning_rate": 1.6470914215816758e-07, "loss": 0.0544, "step": 7950 }, { "epoch": 2.9143897996357016, "grad_norm": 0.12807384133338928, "learning_rate": 1.1219203022162505e-07, "loss": 0.0558, "step": 8000 }, { "epoch": 2.9143897996357016, "eval_loss": 0.04983741417527199, "eval_runtime": 489.7256, "eval_samples_per_second": 19.93, "eval_steps_per_second": 4.982, "step": 8000 }, { "epoch": 2.9326047358834244, "grad_norm": 0.11168187856674194, "learning_rate": 6.970871156578573e-08, "loss": 0.0513, "step": 8050 }, { "epoch": 2.9508196721311473, "grad_norm": 0.11235981434583664, "learning_rate": 3.7276313807418917e-08, "loss": 0.0513, "step": 8100 }, { "epoch": 2.9690346083788706, "grad_norm": 0.16754215955734253, "learning_rate": 1.4907912424091952e-08, "loss": 0.0528, "step": 8150 }, { "epoch": 2.987249544626594, "grad_norm": 0.12086477875709534, "learning_rate": 2.612525482631467e-09, "loss": 0.0515, "step": 8200 } ], "logging_steps": 50, "max_steps": 8235, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4247978274909594e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }