| { |
| "best_global_step": 7500, |
| "best_metric": 0.04980416223406792, |
| "best_model_checkpoint": "byt5-xl-ocr-finetuned/checkpoint-7500", |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 8235, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.018214936247723135, |
| "grad_norm": 69.06980895996094, |
| "learning_rate": 5.946601941747574e-06, |
| "loss": 5.2749, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.03642987249544627, |
| "grad_norm": 16.915973663330078, |
| "learning_rate": 1.2014563106796117e-05, |
| "loss": 1.5737, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.0546448087431694, |
| "grad_norm": 1.5618306398391724, |
| "learning_rate": 1.808252427184466e-05, |
| "loss": 0.5764, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.07285974499089254, |
| "grad_norm": 1.9782780408859253, |
| "learning_rate": 2.4150485436893205e-05, |
| "loss": 0.3942, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.09107468123861566, |
| "grad_norm": 2.0355050563812256, |
| "learning_rate": 3.0218446601941746e-05, |
| "loss": 0.2724, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.1092896174863388, |
| "grad_norm": 0.4324137568473816, |
| "learning_rate": 3.62864077669903e-05, |
| "loss": 0.2329, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.12750455373406194, |
| "grad_norm": 0.41908925771713257, |
| "learning_rate": 4.235436893203884e-05, |
| "loss": 0.2006, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.14571948998178508, |
| "grad_norm": 0.7745330333709717, |
| "learning_rate": 4.8422330097087385e-05, |
| "loss": 0.1838, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.16393442622950818, |
| "grad_norm": 0.4542683959007263, |
| "learning_rate": 4.999724032108263e-05, |
| "loss": 0.1702, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.18214936247723132, |
| "grad_norm": 0.34284254908561707, |
| "learning_rate": 4.998474341173774e-05, |
| "loss": 0.157, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.18214936247723132, |
| "eval_loss": 0.1169700026512146, |
| "eval_runtime": 487.9364, |
| "eval_samples_per_second": 20.003, |
| "eval_steps_per_second": 5.001, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.20036429872495445, |
| "grad_norm": 0.3513261079788208, |
| "learning_rate": 4.996217362852348e-05, |
| "loss": 0.1655, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.2185792349726776, |
| "grad_norm": 0.2953558564186096, |
| "learning_rate": 4.992954007069597e-05, |
| "loss": 0.149, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.23679417122040072, |
| "grad_norm": 0.23343749344348907, |
| "learning_rate": 4.988685589483267e-05, |
| "loss": 0.1448, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.2550091074681239, |
| "grad_norm": 0.17609573900699615, |
| "learning_rate": 4.9834138309528155e-05, |
| "loss": 0.1311, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.273224043715847, |
| "grad_norm": 0.22480589151382446, |
| "learning_rate": 4.977140856845625e-05, |
| "loss": 0.1345, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.29143897996357016, |
| "grad_norm": 0.2684943974018097, |
| "learning_rate": 4.969869196180143e-05, |
| "loss": 0.1335, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.30965391621129323, |
| "grad_norm": 0.22488351166248322, |
| "learning_rate": 4.9616017806062764e-05, |
| "loss": 0.1274, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.32786885245901637, |
| "grad_norm": 0.16293711960315704, |
| "learning_rate": 4.952341943223466e-05, |
| "loss": 0.1219, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.3460837887067395, |
| "grad_norm": 0.2496214509010315, |
| "learning_rate": 4.942093417236912e-05, |
| "loss": 0.1208, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.36429872495446264, |
| "grad_norm": 0.21169240772724152, |
| "learning_rate": 4.930860334452487e-05, |
| "loss": 0.1191, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.36429872495446264, |
| "eval_loss": 0.08914350718259811, |
| "eval_runtime": 489.8598, |
| "eval_samples_per_second": 19.924, |
| "eval_steps_per_second": 4.981, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.3825136612021858, |
| "grad_norm": 0.1340503990650177, |
| "learning_rate": 4.918647223610961e-05, |
| "loss": 0.1182, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.4007285974499089, |
| "grad_norm": 0.47607916593551636, |
| "learning_rate": 4.905459008562181e-05, |
| "loss": 0.1137, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.41894353369763204, |
| "grad_norm": 0.17456993460655212, |
| "learning_rate": 4.8913010062799726e-05, |
| "loss": 0.1125, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.4371584699453552, |
| "grad_norm": 0.21866342425346375, |
| "learning_rate": 4.8761789247185405e-05, |
| "loss": 0.1132, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.4553734061930783, |
| "grad_norm": 0.29610705375671387, |
| "learning_rate": 4.860098860511248e-05, |
| "loss": 0.1086, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.47358834244080145, |
| "grad_norm": 0.21859906613826752, |
| "learning_rate": 4.8430672965126864e-05, |
| "loss": 0.1053, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.4918032786885246, |
| "grad_norm": 0.28005993366241455, |
| "learning_rate": 4.8250910991850444e-05, |
| "loss": 0.1039, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.5100182149362478, |
| "grad_norm": 0.24446171522140503, |
| "learning_rate": 4.806177515829821e-05, |
| "loss": 0.1018, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.5282331511839709, |
| "grad_norm": 0.2147260159254074, |
| "learning_rate": 4.7863341716659904e-05, |
| "loss": 0.1001, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.546448087431694, |
| "grad_norm": 0.33932870626449585, |
| "learning_rate": 4.7655690667558156e-05, |
| "loss": 0.1061, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.546448087431694, |
| "eval_loss": 0.0756722092628479, |
| "eval_runtime": 490.276, |
| "eval_samples_per_second": 19.907, |
| "eval_steps_per_second": 4.977, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.5646630236794171, |
| "grad_norm": 0.14253345131874084, |
| "learning_rate": 4.743890572779534e-05, |
| "loss": 0.0966, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.5828779599271403, |
| "grad_norm": 0.5406768918037415, |
| "learning_rate": 4.72130742966022e-05, |
| "loss": 0.1063, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.6010928961748634, |
| "grad_norm": 0.13504968583583832, |
| "learning_rate": 4.697828742040194e-05, |
| "loss": 0.0989, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.6193078324225865, |
| "grad_norm": 0.20972897112369537, |
| "learning_rate": 4.673463975610385e-05, |
| "loss": 0.1004, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.6375227686703097, |
| "grad_norm": 0.15182198584079742, |
| "learning_rate": 4.648222953294127e-05, |
| "loss": 0.0949, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.6557377049180327, |
| "grad_norm": 0.17607465386390686, |
| "learning_rate": 4.622115851286945e-05, |
| "loss": 0.0947, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.6739526411657559, |
| "grad_norm": 0.14970912039279938, |
| "learning_rate": 4.5951531949539126e-05, |
| "loss": 0.1002, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.692167577413479, |
| "grad_norm": 0.1299428790807724, |
| "learning_rate": 4.5673458545862266e-05, |
| "loss": 0.087, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.7103825136612022, |
| "grad_norm": 0.16123345494270325, |
| "learning_rate": 4.53870504101874e-05, |
| "loss": 0.0939, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.7285974499089253, |
| "grad_norm": 0.15291787683963776, |
| "learning_rate": 4.50924230111018e-05, |
| "loss": 0.0938, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.7285974499089253, |
| "eval_loss": 0.06830848753452301, |
| "eval_runtime": 490.1531, |
| "eval_samples_per_second": 19.912, |
| "eval_steps_per_second": 4.978, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.7468123861566485, |
| "grad_norm": 0.16285483539104462, |
| "learning_rate": 4.4789695130879156e-05, |
| "loss": 0.0948, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.7650273224043715, |
| "grad_norm": 0.1297728419303894, |
| "learning_rate": 4.447898881759111e-05, |
| "loss": 0.0827, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.7832422586520947, |
| "grad_norm": 0.17550143599510193, |
| "learning_rate": 4.416042933590229e-05, |
| "loss": 0.0868, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.8014571948998178, |
| "grad_norm": 0.14913193881511688, |
| "learning_rate": 4.383414511656846e-05, |
| "loss": 0.0905, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.819672131147541, |
| "grad_norm": 0.1405882090330124, |
| "learning_rate": 4.350026770465826e-05, |
| "loss": 0.088, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.8378870673952641, |
| "grad_norm": 0.17031477391719818, |
| "learning_rate": 4.3158931706519345e-05, |
| "loss": 0.0813, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.8561020036429873, |
| "grad_norm": 0.14652223885059357, |
| "learning_rate": 4.281027473551039e-05, |
| "loss": 0.087, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.8743169398907104, |
| "grad_norm": 0.1687142699956894, |
| "learning_rate": 4.245443735652073e-05, |
| "loss": 0.0849, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.8925318761384335, |
| "grad_norm": 0.1347888559103012, |
| "learning_rate": 4.209156302930006e-05, |
| "loss": 0.0815, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.9107468123861566, |
| "grad_norm": 0.18572378158569336, |
| "learning_rate": 4.172179805062113e-05, |
| "loss": 0.0852, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.9107468123861566, |
| "eval_loss": 0.0633399486541748, |
| "eval_runtime": 488.2149, |
| "eval_samples_per_second": 19.991, |
| "eval_steps_per_second": 4.998, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.9289617486338798, |
| "grad_norm": 0.23590320348739624, |
| "learning_rate": 4.134529149529852e-05, |
| "loss": 0.0821, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.9471766848816029, |
| "grad_norm": 0.15503446757793427, |
| "learning_rate": 4.096219515608751e-05, |
| "loss": 0.0794, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.9653916211293261, |
| "grad_norm": 0.13011206686496735, |
| "learning_rate": 4.05726634824872e-05, |
| "loss": 0.082, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.9836065573770492, |
| "grad_norm": 0.16497080028057098, |
| "learning_rate": 4.017685351847245e-05, |
| "loss": 0.0777, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.0018214936247722, |
| "grad_norm": 0.1528683304786682, |
| "learning_rate": 3.977492483917988e-05, |
| "loss": 0.0877, |
| "step": 2750 |
| }, |
| { |
| "epoch": 1.0200364298724955, |
| "grad_norm": 0.13187021017074585, |
| "learning_rate": 3.9367039486573446e-05, |
| "loss": 0.0762, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.0382513661202186, |
| "grad_norm": 0.1285446584224701, |
| "learning_rate": 3.895336190411539e-05, |
| "loss": 0.0692, |
| "step": 2850 |
| }, |
| { |
| "epoch": 1.0564663023679417, |
| "grad_norm": 0.1444154679775238, |
| "learning_rate": 3.8534058870469095e-05, |
| "loss": 0.0726, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.0746812386156648, |
| "grad_norm": 0.13176533579826355, |
| "learning_rate": 3.8109299432260356e-05, |
| "loss": 0.0792, |
| "step": 2950 |
| }, |
| { |
| "epoch": 1.092896174863388, |
| "grad_norm": 0.17461608350276947, |
| "learning_rate": 3.767925483592448e-05, |
| "loss": 0.0817, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.092896174863388, |
| "eval_loss": 0.060991112142801285, |
| "eval_runtime": 489.3008, |
| "eval_samples_per_second": 19.947, |
| "eval_steps_per_second": 4.987, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.1111111111111112, |
| "grad_norm": 0.09941103309392929, |
| "learning_rate": 3.7244098458666334e-05, |
| "loss": 0.0763, |
| "step": 3050 |
| }, |
| { |
| "epoch": 1.1293260473588342, |
| "grad_norm": 0.15530474483966827, |
| "learning_rate": 3.6804005738561456e-05, |
| "loss": 0.0734, |
| "step": 3100 |
| }, |
| { |
| "epoch": 1.1475409836065573, |
| "grad_norm": 0.15263348817825317, |
| "learning_rate": 3.6359154103826205e-05, |
| "loss": 0.0725, |
| "step": 3150 |
| }, |
| { |
| "epoch": 1.1657559198542806, |
| "grad_norm": 0.1578466147184372, |
| "learning_rate": 3.590972290128571e-05, |
| "loss": 0.0759, |
| "step": 3200 |
| }, |
| { |
| "epoch": 1.1839708561020037, |
| "grad_norm": 0.12527093291282654, |
| "learning_rate": 3.545589332406819e-05, |
| "loss": 0.0703, |
| "step": 3250 |
| }, |
| { |
| "epoch": 1.2021857923497268, |
| "grad_norm": 0.15813329815864563, |
| "learning_rate": 3.499784833855492e-05, |
| "loss": 0.0681, |
| "step": 3300 |
| }, |
| { |
| "epoch": 1.2204007285974499, |
| "grad_norm": 0.11386945098638535, |
| "learning_rate": 3.453577261061537e-05, |
| "loss": 0.0708, |
| "step": 3350 |
| }, |
| { |
| "epoch": 1.238615664845173, |
| "grad_norm": 0.14177338778972626, |
| "learning_rate": 3.4069852431157117e-05, |
| "loss": 0.0711, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.2568306010928962, |
| "grad_norm": 0.1626819372177124, |
| "learning_rate": 3.3600275641020605e-05, |
| "loss": 0.0694, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.2750455373406193, |
| "grad_norm": 0.1577647477388382, |
| "learning_rate": 3.312723155524906e-05, |
| "loss": 0.0695, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.2750455373406193, |
| "eval_loss": 0.05769478902220726, |
| "eval_runtime": 488.439, |
| "eval_samples_per_second": 19.982, |
| "eval_steps_per_second": 4.996, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.2932604735883424, |
| "grad_norm": 0.15615570545196533, |
| "learning_rate": 3.265091088676406e-05, |
| "loss": 0.0699, |
| "step": 3550 |
| }, |
| { |
| "epoch": 1.3114754098360657, |
| "grad_norm": 0.12867802381515503, |
| "learning_rate": 3.217150566947749e-05, |
| "loss": 0.0641, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.3296903460837888, |
| "grad_norm": 0.10840287804603577, |
| "learning_rate": 3.168920918087099e-05, |
| "loss": 0.0676, |
| "step": 3650 |
| }, |
| { |
| "epoch": 1.3479052823315119, |
| "grad_norm": 0.17157740890979767, |
| "learning_rate": 3.1204215864074006e-05, |
| "loss": 0.0608, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.366120218579235, |
| "grad_norm": 0.08710721135139465, |
| "learning_rate": 3.0716721249471905e-05, |
| "loss": 0.0626, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.384335154826958, |
| "grad_norm": 0.173573300242424, |
| "learning_rate": 3.022692187587576e-05, |
| "loss": 0.0638, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.4025500910746813, |
| "grad_norm": 0.1484445184469223, |
| "learning_rate": 2.9735015211285528e-05, |
| "loss": 0.0709, |
| "step": 3850 |
| }, |
| { |
| "epoch": 1.4207650273224044, |
| "grad_norm": 0.1358584463596344, |
| "learning_rate": 2.9241199573278734e-05, |
| "loss": 0.0693, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.4389799635701275, |
| "grad_norm": 0.11952122300863266, |
| "learning_rate": 2.8745674049056486e-05, |
| "loss": 0.0695, |
| "step": 3950 |
| }, |
| { |
| "epoch": 1.4571948998178508, |
| "grad_norm": 0.07695221155881882, |
| "learning_rate": 2.8248638415179308e-05, |
| "loss": 0.0625, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.4571948998178508, |
| "eval_loss": 0.056611545383930206, |
| "eval_runtime": 489.9552, |
| "eval_samples_per_second": 19.92, |
| "eval_steps_per_second": 4.98, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.4754098360655736, |
| "grad_norm": 0.1474759578704834, |
| "learning_rate": 2.7750293057025035e-05, |
| "loss": 0.0727, |
| "step": 4050 |
| }, |
| { |
| "epoch": 1.493624772313297, |
| "grad_norm": 0.11704400926828384, |
| "learning_rate": 2.725083888800124e-05, |
| "loss": 0.0628, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.51183970856102, |
| "grad_norm": 0.18205316364765167, |
| "learning_rate": 2.6750477268544777e-05, |
| "loss": 0.0663, |
| "step": 4150 |
| }, |
| { |
| "epoch": 1.530054644808743, |
| "grad_norm": 0.13981305062770844, |
| "learning_rate": 2.6249409924941104e-05, |
| "loss": 0.0647, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.5482695810564664, |
| "grad_norm": 0.15816594660282135, |
| "learning_rate": 2.5747838867996154e-05, |
| "loss": 0.0713, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.5664845173041895, |
| "grad_norm": 0.13389204442501068, |
| "learning_rate": 2.5245966311593405e-05, |
| "loss": 0.066, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.5846994535519126, |
| "grad_norm": 0.1280335932970047, |
| "learning_rate": 2.474399459116916e-05, |
| "loss": 0.0658, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.6029143897996359, |
| "grad_norm": 0.15096668899059296, |
| "learning_rate": 2.4242126082138785e-05, |
| "loss": 0.0683, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.6211293260473587, |
| "grad_norm": 0.14366798102855682, |
| "learning_rate": 2.3740563118306826e-05, |
| "loss": 0.0628, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.639344262295082, |
| "grad_norm": 0.13515126705169678, |
| "learning_rate": 2.323950791029397e-05, |
| "loss": 0.0664, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.639344262295082, |
| "eval_loss": 0.0539417639374733, |
| "eval_runtime": 489.8066, |
| "eval_samples_per_second": 19.926, |
| "eval_steps_per_second": 4.982, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.657559198542805, |
| "grad_norm": 0.1308237463235855, |
| "learning_rate": 2.2739162464013526e-05, |
| "loss": 0.0657, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.6757741347905282, |
| "grad_norm": 0.14330258965492249, |
| "learning_rate": 2.2239728499230543e-05, |
| "loss": 0.0689, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.6939890710382515, |
| "grad_norm": 0.16095995903015137, |
| "learning_rate": 2.1741407368236353e-05, |
| "loss": 0.0698, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.7122040072859745, |
| "grad_norm": 0.1826489120721817, |
| "learning_rate": 2.1244399974671055e-05, |
| "loss": 0.0662, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.7304189435336976, |
| "grad_norm": 0.11087053269147873, |
| "learning_rate": 2.0748906692527108e-05, |
| "loss": 0.0585, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.748633879781421, |
| "grad_norm": 0.1652335375547409, |
| "learning_rate": 2.0255127285366263e-05, |
| "loss": 0.0667, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.7668488160291438, |
| "grad_norm": 0.10961362719535828, |
| "learning_rate": 1.97632608257828e-05, |
| "loss": 0.0644, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.785063752276867, |
| "grad_norm": 0.14005999267101288, |
| "learning_rate": 1.927350561514512e-05, |
| "loss": 0.0638, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.8032786885245902, |
| "grad_norm": 0.16552738845348358, |
| "learning_rate": 1.8786059103648417e-05, |
| "loss": 0.0635, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.8214936247723132, |
| "grad_norm": 0.11428548395633698, |
| "learning_rate": 1.830111781071047e-05, |
| "loss": 0.0666, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.8214936247723132, |
| "eval_loss": 0.05203519016504288, |
| "eval_runtime": 489.7939, |
| "eval_samples_per_second": 19.927, |
| "eval_steps_per_second": 4.982, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.8397085610200365, |
| "grad_norm": 0.1313815712928772, |
| "learning_rate": 1.781887724574265e-05, |
| "loss": 0.061, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.8579234972677594, |
| "grad_norm": 0.14089354872703552, |
| "learning_rate": 1.7339531829328163e-05, |
| "loss": 0.0593, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.8761384335154827, |
| "grad_norm": 0.145121231675148, |
| "learning_rate": 1.6863274814839282e-05, |
| "loss": 0.0635, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.8943533697632058, |
| "grad_norm": 0.14890490472316742, |
| "learning_rate": 1.6390298210525095e-05, |
| "loss": 0.0609, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.9125683060109289, |
| "grad_norm": 0.10599125921726227, |
| "learning_rate": 1.5920792702101273e-05, |
| "loss": 0.0655, |
| "step": 5250 |
| }, |
| { |
| "epoch": 1.9307832422586522, |
| "grad_norm": 0.13904713094234467, |
| "learning_rate": 1.5454947575873034e-05, |
| "loss": 0.0614, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.9489981785063752, |
| "grad_norm": 0.11440698057413101, |
| "learning_rate": 1.499295064242229e-05, |
| "loss": 0.0609, |
| "step": 5350 |
| }, |
| { |
| "epoch": 1.9672131147540983, |
| "grad_norm": 0.11769060790538788, |
| "learning_rate": 1.4534988160889767e-05, |
| "loss": 0.0597, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.9854280510018216, |
| "grad_norm": 0.1365710347890854, |
| "learning_rate": 1.4081244763882529e-05, |
| "loss": 0.0598, |
| "step": 5450 |
| }, |
| { |
| "epoch": 2.0036429872495445, |
| "grad_norm": 0.089080311357975, |
| "learning_rate": 1.363190338303737e-05, |
| "loss": 0.0612, |
| "step": 5500 |
| }, |
| { |
| "epoch": 2.0036429872495445, |
| "eval_loss": 0.05140851065516472, |
| "eval_runtime": 489.7113, |
| "eval_samples_per_second": 19.93, |
| "eval_steps_per_second": 4.983, |
| "step": 5500 |
| }, |
| { |
| "epoch": 2.021857923497268, |
| "grad_norm": 0.11200718581676483, |
| "learning_rate": 1.3187145175269893e-05, |
| "loss": 0.0547, |
| "step": 5550 |
| }, |
| { |
| "epoch": 2.040072859744991, |
| "grad_norm": 0.19140245020389557, |
| "learning_rate": 1.274714944973912e-05, |
| "loss": 0.0571, |
| "step": 5600 |
| }, |
| { |
| "epoch": 2.058287795992714, |
| "grad_norm": 0.34514617919921875, |
| "learning_rate": 1.2312093595557001e-05, |
| "loss": 0.056, |
| "step": 5650 |
| }, |
| { |
| "epoch": 2.0765027322404372, |
| "grad_norm": 0.14967010915279388, |
| "learning_rate": 1.1882153010272049e-05, |
| "loss": 0.0565, |
| "step": 5700 |
| }, |
| { |
| "epoch": 2.09471766848816, |
| "grad_norm": 0.13327553868293762, |
| "learning_rate": 1.1457501029155978e-05, |
| "loss": 0.0546, |
| "step": 5750 |
| }, |
| { |
| "epoch": 2.1129326047358834, |
| "grad_norm": 0.10935332626104355, |
| "learning_rate": 1.1038308855321542e-05, |
| "loss": 0.0582, |
| "step": 5800 |
| }, |
| { |
| "epoch": 2.1311475409836067, |
| "grad_norm": 0.09595629572868347, |
| "learning_rate": 1.0624745490700228e-05, |
| "loss": 0.0563, |
| "step": 5850 |
| }, |
| { |
| "epoch": 2.1493624772313296, |
| "grad_norm": 0.14973677694797516, |
| "learning_rate": 1.0216977667907232e-05, |
| "loss": 0.0587, |
| "step": 5900 |
| }, |
| { |
| "epoch": 2.167577413479053, |
| "grad_norm": 0.12422586977481842, |
| "learning_rate": 9.815169783021347e-06, |
| "loss": 0.0569, |
| "step": 5950 |
| }, |
| { |
| "epoch": 2.185792349726776, |
| "grad_norm": 0.1235915794968605, |
| "learning_rate": 9.419483829306938e-06, |
| "loss": 0.0558, |
| "step": 6000 |
| }, |
| { |
| "epoch": 2.185792349726776, |
| "eval_loss": 0.0504976250231266, |
| "eval_runtime": 489.7715, |
| "eval_samples_per_second": 19.928, |
| "eval_steps_per_second": 4.982, |
| "step": 6000 |
| }, |
| { |
| "epoch": 2.204007285974499, |
| "grad_norm": 0.14746901392936707, |
| "learning_rate": 9.030079331904512e-06, |
| "loss": 0.0528, |
| "step": 6050 |
| }, |
| { |
| "epoch": 2.2222222222222223, |
| "grad_norm": 0.12787629663944244, |
| "learning_rate": 8.647113283516454e-06, |
| "loss": 0.055, |
| "step": 6100 |
| }, |
| { |
| "epoch": 2.240437158469945, |
| "grad_norm": 0.14677022397518158, |
| "learning_rate": 8.270740081113684e-06, |
| "loss": 0.0568, |
| "step": 6150 |
| }, |
| { |
| "epoch": 2.2586520947176685, |
| "grad_norm": 0.15499144792556763, |
| "learning_rate": 7.90111146368878e-06, |
| "loss": 0.0551, |
| "step": 6200 |
| }, |
| { |
| "epoch": 2.276867030965392, |
| "grad_norm": 0.13909883797168732, |
| "learning_rate": 7.5383764510807975e-06, |
| "loss": 0.057, |
| "step": 6250 |
| }, |
| { |
| "epoch": 2.2950819672131146, |
| "grad_norm": 0.11617127805948257, |
| "learning_rate": 7.182681283896253e-06, |
| "loss": 0.0537, |
| "step": 6300 |
| }, |
| { |
| "epoch": 2.313296903460838, |
| "grad_norm": 0.14818504452705383, |
| "learning_rate": 6.834169364550597e-06, |
| "loss": 0.052, |
| "step": 6350 |
| }, |
| { |
| "epoch": 2.3315118397085612, |
| "grad_norm": 0.10602527111768723, |
| "learning_rate": 6.492981199453996e-06, |
| "loss": 0.0525, |
| "step": 6400 |
| }, |
| { |
| "epoch": 2.349726775956284, |
| "grad_norm": 0.10226523876190186, |
| "learning_rate": 6.159254342364609e-06, |
| "loss": 0.0538, |
| "step": 6450 |
| }, |
| { |
| "epoch": 2.3679417122040074, |
| "grad_norm": 0.10851209610700607, |
| "learning_rate": 5.833123338932256e-06, |
| "loss": 0.0571, |
| "step": 6500 |
| }, |
| { |
| "epoch": 2.3679417122040074, |
| "eval_loss": 0.05036979168653488, |
| "eval_runtime": 489.7412, |
| "eval_samples_per_second": 19.929, |
| "eval_steps_per_second": 4.982, |
| "step": 6500 |
| }, |
| { |
| "epoch": 2.3861566484517303, |
| "grad_norm": 0.1328704059123993, |
| "learning_rate": 5.51471967245491e-06, |
| "loss": 0.0556, |
| "step": 6550 |
| }, |
| { |
| "epoch": 2.4043715846994536, |
| "grad_norm": 0.09755656123161316, |
| "learning_rate": 5.2041717108697065e-06, |
| "loss": 0.0514, |
| "step": 6600 |
| }, |
| { |
| "epoch": 2.422586520947177, |
| "grad_norm": 0.10794492810964584, |
| "learning_rate": 4.90160465500005e-06, |
| "loss": 0.0529, |
| "step": 6650 |
| }, |
| { |
| "epoch": 2.4408014571948997, |
| "grad_norm": 0.11240658164024353, |
| "learning_rate": 4.607140488079492e-06, |
| "loss": 0.0536, |
| "step": 6700 |
| }, |
| { |
| "epoch": 2.459016393442623, |
| "grad_norm": 0.11631111800670624, |
| "learning_rate": 4.320897926572853e-06, |
| "loss": 0.0522, |
| "step": 6750 |
| }, |
| { |
| "epoch": 2.477231329690346, |
| "grad_norm": 0.08789575099945068, |
| "learning_rate": 4.0429923723143915e-06, |
| "loss": 0.0489, |
| "step": 6800 |
| }, |
| { |
| "epoch": 2.495446265938069, |
| "grad_norm": 0.10507268458604813, |
| "learning_rate": 3.7735358659822752e-06, |
| "loss": 0.05, |
| "step": 6850 |
| }, |
| { |
| "epoch": 2.5136612021857925, |
| "grad_norm": 0.10618982464075089, |
| "learning_rate": 3.5126370419281436e-06, |
| "loss": 0.0605, |
| "step": 6900 |
| }, |
| { |
| "epoch": 2.5318761384335153, |
| "grad_norm": 0.1154901459813118, |
| "learning_rate": 3.260401084379991e-06, |
| "loss": 0.053, |
| "step": 6950 |
| }, |
| { |
| "epoch": 2.5500910746812386, |
| "grad_norm": 0.1327841877937317, |
| "learning_rate": 3.0169296850359878e-06, |
| "loss": 0.056, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.5500910746812386, |
| "eval_loss": 0.049849580973386765, |
| "eval_runtime": 489.6947, |
| "eval_samples_per_second": 19.931, |
| "eval_steps_per_second": 4.983, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.5683060109289615, |
| "grad_norm": 0.12928417325019836, |
| "learning_rate": 2.782321002066332e-06, |
| "loss": 0.0568, |
| "step": 7050 |
| }, |
| { |
| "epoch": 2.586520947176685, |
| "grad_norm": 0.14363490045070648, |
| "learning_rate": 2.556669620539734e-06, |
| "loss": 0.054, |
| "step": 7100 |
| }, |
| { |
| "epoch": 2.604735883424408, |
| "grad_norm": 0.12806904315948486, |
| "learning_rate": 2.3400665142903927e-06, |
| "loss": 0.0525, |
| "step": 7150 |
| }, |
| { |
| "epoch": 2.6229508196721314, |
| "grad_norm": 0.11711638420820236, |
| "learning_rate": 2.132599009240882e-06, |
| "loss": 0.0544, |
| "step": 7200 |
| }, |
| { |
| "epoch": 2.6411657559198543, |
| "grad_norm": 0.11255411803722382, |
| "learning_rate": 1.9343507481957846e-06, |
| "loss": 0.0507, |
| "step": 7250 |
| }, |
| { |
| "epoch": 2.6593806921675776, |
| "grad_norm": 0.13773201406002045, |
| "learning_rate": 1.7454016571201186e-06, |
| "loss": 0.0534, |
| "step": 7300 |
| }, |
| { |
| "epoch": 2.6775956284153004, |
| "grad_norm": 0.11689828336238861, |
| "learning_rate": 1.5658279129163706e-06, |
| "loss": 0.0562, |
| "step": 7350 |
| }, |
| { |
| "epoch": 2.6958105646630237, |
| "grad_norm": 0.1306321620941162, |
| "learning_rate": 1.3957019127128851e-06, |
| "loss": 0.0493, |
| "step": 7400 |
| }, |
| { |
| "epoch": 2.714025500910747, |
| "grad_norm": 0.12783832848072052, |
| "learning_rate": 1.235092244676192e-06, |
| "loss": 0.0541, |
| "step": 7450 |
| }, |
| { |
| "epoch": 2.73224043715847, |
| "grad_norm": 0.11963380128145218, |
| "learning_rate": 1.0840636603589444e-06, |
| "loss": 0.0541, |
| "step": 7500 |
| }, |
| { |
| "epoch": 2.73224043715847, |
| "eval_loss": 0.04980416223406792, |
| "eval_runtime": 489.8577, |
| "eval_samples_per_second": 19.924, |
| "eval_steps_per_second": 4.981, |
| "step": 7500 |
| }, |
| { |
| "epoch": 2.750455373406193, |
| "grad_norm": 0.12381980568170547, |
| "learning_rate": 9.426770485945924e-07, |
| "loss": 0.0541, |
| "step": 7550 |
| }, |
| { |
| "epoch": 2.768670309653916, |
| "grad_norm": 0.14063633978366852, |
| "learning_rate": 8.109894109493976e-07, |
| "loss": 0.0511, |
| "step": 7600 |
| }, |
| { |
| "epoch": 2.7868852459016393, |
| "grad_norm": 0.10304458439350128, |
| "learning_rate": 6.890538387416212e-07, |
| "loss": 0.0533, |
| "step": 7650 |
| }, |
| { |
| "epoch": 2.8051001821493626, |
| "grad_norm": 0.16216354072093964, |
| "learning_rate": 5.76919491637179e-07, |
| "loss": 0.0598, |
| "step": 7700 |
| }, |
| { |
| "epoch": 2.8233151183970855, |
| "grad_norm": 0.13660581409931183, |
| "learning_rate": 4.746315778303756e-07, |
| "loss": 0.0531, |
| "step": 7750 |
| }, |
| { |
| "epoch": 2.841530054644809, |
| "grad_norm": 0.28685104846954346, |
| "learning_rate": 3.8223133581772595e-07, |
| "loss": 0.0542, |
| "step": 7800 |
| }, |
| { |
| "epoch": 2.8597449908925316, |
| "grad_norm": 0.11297158896923065, |
| "learning_rate": 2.9975601777219863e-07, |
| "loss": 0.0453, |
| "step": 7850 |
| }, |
| { |
| "epoch": 2.877959927140255, |
| "grad_norm": 0.1289464831352234, |
| "learning_rate": 2.2723887452461013e-07, |
| "loss": 0.0527, |
| "step": 7900 |
| }, |
| { |
| "epoch": 2.8961748633879782, |
| "grad_norm": 0.13939346373081207, |
| "learning_rate": 1.6470914215816758e-07, |
| "loss": 0.0544, |
| "step": 7950 |
| }, |
| { |
| "epoch": 2.9143897996357016, |
| "grad_norm": 0.12807384133338928, |
| "learning_rate": 1.1219203022162505e-07, |
| "loss": 0.0558, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.9143897996357016, |
| "eval_loss": 0.04983741417527199, |
| "eval_runtime": 489.7256, |
| "eval_samples_per_second": 19.93, |
| "eval_steps_per_second": 4.982, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.9326047358834244, |
| "grad_norm": 0.11168187856674194, |
| "learning_rate": 6.970871156578573e-08, |
| "loss": 0.0513, |
| "step": 8050 |
| }, |
| { |
| "epoch": 2.9508196721311473, |
| "grad_norm": 0.11235981434583664, |
| "learning_rate": 3.7276313807418917e-08, |
| "loss": 0.0513, |
| "step": 8100 |
| }, |
| { |
| "epoch": 2.9690346083788706, |
| "grad_norm": 0.16754215955734253, |
| "learning_rate": 1.4907912424091952e-08, |
| "loss": 0.0528, |
| "step": 8150 |
| }, |
| { |
| "epoch": 2.987249544626594, |
| "grad_norm": 0.12086477875709534, |
| "learning_rate": 2.612525482631467e-09, |
| "loss": 0.0515, |
| "step": 8200 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 8235, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.4247978274909594e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|