{
"best_global_step": 7500,
"best_metric": 0.04980416223406792,
"best_model_checkpoint": "byt5-xl-ocr-finetuned/checkpoint-7500",
"epoch": 3.0,
"eval_steps": 500,
"global_step": 8235,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.018214936247723135,
"grad_norm": 69.06980895996094,
"learning_rate": 5.946601941747574e-06,
"loss": 5.2749,
"step": 50
},
{
"epoch": 0.03642987249544627,
"grad_norm": 16.915973663330078,
"learning_rate": 1.2014563106796117e-05,
"loss": 1.5737,
"step": 100
},
{
"epoch": 0.0546448087431694,
"grad_norm": 1.5618306398391724,
"learning_rate": 1.808252427184466e-05,
"loss": 0.5764,
"step": 150
},
{
"epoch": 0.07285974499089254,
"grad_norm": 1.9782780408859253,
"learning_rate": 2.4150485436893205e-05,
"loss": 0.3942,
"step": 200
},
{
"epoch": 0.09107468123861566,
"grad_norm": 2.0355050563812256,
"learning_rate": 3.0218446601941746e-05,
"loss": 0.2724,
"step": 250
},
{
"epoch": 0.1092896174863388,
"grad_norm": 0.4324137568473816,
"learning_rate": 3.62864077669903e-05,
"loss": 0.2329,
"step": 300
},
{
"epoch": 0.12750455373406194,
"grad_norm": 0.41908925771713257,
"learning_rate": 4.235436893203884e-05,
"loss": 0.2006,
"step": 350
},
{
"epoch": 0.14571948998178508,
"grad_norm": 0.7745330333709717,
"learning_rate": 4.8422330097087385e-05,
"loss": 0.1838,
"step": 400
},
{
"epoch": 0.16393442622950818,
"grad_norm": 0.4542683959007263,
"learning_rate": 4.999724032108263e-05,
"loss": 0.1702,
"step": 450
},
{
"epoch": 0.18214936247723132,
"grad_norm": 0.34284254908561707,
"learning_rate": 4.998474341173774e-05,
"loss": 0.157,
"step": 500
},
{
"epoch": 0.18214936247723132,
"eval_loss": 0.1169700026512146,
"eval_runtime": 487.9364,
"eval_samples_per_second": 20.003,
"eval_steps_per_second": 5.001,
"step": 500
},
{
"epoch": 0.20036429872495445,
"grad_norm": 0.3513261079788208,
"learning_rate": 4.996217362852348e-05,
"loss": 0.1655,
"step": 550
},
{
"epoch": 0.2185792349726776,
"grad_norm": 0.2953558564186096,
"learning_rate": 4.992954007069597e-05,
"loss": 0.149,
"step": 600
},
{
"epoch": 0.23679417122040072,
"grad_norm": 0.23343749344348907,
"learning_rate": 4.988685589483267e-05,
"loss": 0.1448,
"step": 650
},
{
"epoch": 0.2550091074681239,
"grad_norm": 0.17609573900699615,
"learning_rate": 4.9834138309528155e-05,
"loss": 0.1311,
"step": 700
},
{
"epoch": 0.273224043715847,
"grad_norm": 0.22480589151382446,
"learning_rate": 4.977140856845625e-05,
"loss": 0.1345,
"step": 750
},
{
"epoch": 0.29143897996357016,
"grad_norm": 0.2684943974018097,
"learning_rate": 4.969869196180143e-05,
"loss": 0.1335,
"step": 800
},
{
"epoch": 0.30965391621129323,
"grad_norm": 0.22488351166248322,
"learning_rate": 4.9616017806062764e-05,
"loss": 0.1274,
"step": 850
},
{
"epoch": 0.32786885245901637,
"grad_norm": 0.16293711960315704,
"learning_rate": 4.952341943223466e-05,
"loss": 0.1219,
"step": 900
},
{
"epoch": 0.3460837887067395,
"grad_norm": 0.2496214509010315,
"learning_rate": 4.942093417236912e-05,
"loss": 0.1208,
"step": 950
},
{
"epoch": 0.36429872495446264,
"grad_norm": 0.21169240772724152,
"learning_rate": 4.930860334452487e-05,
"loss": 0.1191,
"step": 1000
},
{
"epoch": 0.36429872495446264,
"eval_loss": 0.08914350718259811,
"eval_runtime": 489.8598,
"eval_samples_per_second": 19.924,
"eval_steps_per_second": 4.981,
"step": 1000
},
{
"epoch": 0.3825136612021858,
"grad_norm": 0.1340503990650177,
"learning_rate": 4.918647223610961e-05,
"loss": 0.1182,
"step": 1050
},
{
"epoch": 0.4007285974499089,
"grad_norm": 0.47607916593551636,
"learning_rate": 4.905459008562181e-05,
"loss": 0.1137,
"step": 1100
},
{
"epoch": 0.41894353369763204,
"grad_norm": 0.17456993460655212,
"learning_rate": 4.8913010062799726e-05,
"loss": 0.1125,
"step": 1150
},
{
"epoch": 0.4371584699453552,
"grad_norm": 0.21866342425346375,
"learning_rate": 4.8761789247185405e-05,
"loss": 0.1132,
"step": 1200
},
{
"epoch": 0.4553734061930783,
"grad_norm": 0.29610705375671387,
"learning_rate": 4.860098860511248e-05,
"loss": 0.1086,
"step": 1250
},
{
"epoch": 0.47358834244080145,
"grad_norm": 0.21859906613826752,
"learning_rate": 4.8430672965126864e-05,
"loss": 0.1053,
"step": 1300
},
{
"epoch": 0.4918032786885246,
"grad_norm": 0.28005993366241455,
"learning_rate": 4.8250910991850444e-05,
"loss": 0.1039,
"step": 1350
},
{
"epoch": 0.5100182149362478,
"grad_norm": 0.24446171522140503,
"learning_rate": 4.806177515829821e-05,
"loss": 0.1018,
"step": 1400
},
{
"epoch": 0.5282331511839709,
"grad_norm": 0.2147260159254074,
"learning_rate": 4.7863341716659904e-05,
"loss": 0.1001,
"step": 1450
},
{
"epoch": 0.546448087431694,
"grad_norm": 0.33932870626449585,
"learning_rate": 4.7655690667558156e-05,
"loss": 0.1061,
"step": 1500
},
{
"epoch": 0.546448087431694,
"eval_loss": 0.0756722092628479,
"eval_runtime": 490.276,
"eval_samples_per_second": 19.907,
"eval_steps_per_second": 4.977,
"step": 1500
},
{
"epoch": 0.5646630236794171,
"grad_norm": 0.14253345131874084,
"learning_rate": 4.743890572779534e-05,
"loss": 0.0966,
"step": 1550
},
{
"epoch": 0.5828779599271403,
"grad_norm": 0.5406768918037415,
"learning_rate": 4.72130742966022e-05,
"loss": 0.1063,
"step": 1600
},
{
"epoch": 0.6010928961748634,
"grad_norm": 0.13504968583583832,
"learning_rate": 4.697828742040194e-05,
"loss": 0.0989,
"step": 1650
},
{
"epoch": 0.6193078324225865,
"grad_norm": 0.20972897112369537,
"learning_rate": 4.673463975610385e-05,
"loss": 0.1004,
"step": 1700
},
{
"epoch": 0.6375227686703097,
"grad_norm": 0.15182198584079742,
"learning_rate": 4.648222953294127e-05,
"loss": 0.0949,
"step": 1750
},
{
"epoch": 0.6557377049180327,
"grad_norm": 0.17607465386390686,
"learning_rate": 4.622115851286945e-05,
"loss": 0.0947,
"step": 1800
},
{
"epoch": 0.6739526411657559,
"grad_norm": 0.14970912039279938,
"learning_rate": 4.5951531949539126e-05,
"loss": 0.1002,
"step": 1850
},
{
"epoch": 0.692167577413479,
"grad_norm": 0.1299428790807724,
"learning_rate": 4.5673458545862266e-05,
"loss": 0.087,
"step": 1900
},
{
"epoch": 0.7103825136612022,
"grad_norm": 0.16123345494270325,
"learning_rate": 4.53870504101874e-05,
"loss": 0.0939,
"step": 1950
},
{
"epoch": 0.7285974499089253,
"grad_norm": 0.15291787683963776,
"learning_rate": 4.50924230111018e-05,
"loss": 0.0938,
"step": 2000
},
{
"epoch": 0.7285974499089253,
"eval_loss": 0.06830848753452301,
"eval_runtime": 490.1531,
"eval_samples_per_second": 19.912,
"eval_steps_per_second": 4.978,
"step": 2000
},
{
"epoch": 0.7468123861566485,
"grad_norm": 0.16285483539104462,
"learning_rate": 4.4789695130879156e-05,
"loss": 0.0948,
"step": 2050
},
{
"epoch": 0.7650273224043715,
"grad_norm": 0.1297728419303894,
"learning_rate": 4.447898881759111e-05,
"loss": 0.0827,
"step": 2100
},
{
"epoch": 0.7832422586520947,
"grad_norm": 0.17550143599510193,
"learning_rate": 4.416042933590229e-05,
"loss": 0.0868,
"step": 2150
},
{
"epoch": 0.8014571948998178,
"grad_norm": 0.14913193881511688,
"learning_rate": 4.383414511656846e-05,
"loss": 0.0905,
"step": 2200
},
{
"epoch": 0.819672131147541,
"grad_norm": 0.1405882090330124,
"learning_rate": 4.350026770465826e-05,
"loss": 0.088,
"step": 2250
},
{
"epoch": 0.8378870673952641,
"grad_norm": 0.17031477391719818,
"learning_rate": 4.3158931706519345e-05,
"loss": 0.0813,
"step": 2300
},
{
"epoch": 0.8561020036429873,
"grad_norm": 0.14652223885059357,
"learning_rate": 4.281027473551039e-05,
"loss": 0.087,
"step": 2350
},
{
"epoch": 0.8743169398907104,
"grad_norm": 0.1687142699956894,
"learning_rate": 4.245443735652073e-05,
"loss": 0.0849,
"step": 2400
},
{
"epoch": 0.8925318761384335,
"grad_norm": 0.1347888559103012,
"learning_rate": 4.209156302930006e-05,
"loss": 0.0815,
"step": 2450
},
{
"epoch": 0.9107468123861566,
"grad_norm": 0.18572378158569336,
"learning_rate": 4.172179805062113e-05,
"loss": 0.0852,
"step": 2500
},
{
"epoch": 0.9107468123861566,
"eval_loss": 0.0633399486541748,
"eval_runtime": 488.2149,
"eval_samples_per_second": 19.991,
"eval_steps_per_second": 4.998,
"step": 2500
},
{
"epoch": 0.9289617486338798,
"grad_norm": 0.23590320348739624,
"learning_rate": 4.134529149529852e-05,
"loss": 0.0821,
"step": 2550
},
{
"epoch": 0.9471766848816029,
"grad_norm": 0.15503446757793427,
"learning_rate": 4.096219515608751e-05,
"loss": 0.0794,
"step": 2600
},
{
"epoch": 0.9653916211293261,
"grad_norm": 0.13011206686496735,
"learning_rate": 4.05726634824872e-05,
"loss": 0.082,
"step": 2650
},
{
"epoch": 0.9836065573770492,
"grad_norm": 0.16497080028057098,
"learning_rate": 4.017685351847245e-05,
"loss": 0.0777,
"step": 2700
},
{
"epoch": 1.0018214936247722,
"grad_norm": 0.1528683304786682,
"learning_rate": 3.977492483917988e-05,
"loss": 0.0877,
"step": 2750
},
{
"epoch": 1.0200364298724955,
"grad_norm": 0.13187021017074585,
"learning_rate": 3.9367039486573446e-05,
"loss": 0.0762,
"step": 2800
},
{
"epoch": 1.0382513661202186,
"grad_norm": 0.1285446584224701,
"learning_rate": 3.895336190411539e-05,
"loss": 0.0692,
"step": 2850
},
{
"epoch": 1.0564663023679417,
"grad_norm": 0.1444154679775238,
"learning_rate": 3.8534058870469095e-05,
"loss": 0.0726,
"step": 2900
},
{
"epoch": 1.0746812386156648,
"grad_norm": 0.13176533579826355,
"learning_rate": 3.8109299432260356e-05,
"loss": 0.0792,
"step": 2950
},
{
"epoch": 1.092896174863388,
"grad_norm": 0.17461608350276947,
"learning_rate": 3.767925483592448e-05,
"loss": 0.0817,
"step": 3000
},
{
"epoch": 1.092896174863388,
"eval_loss": 0.060991112142801285,
"eval_runtime": 489.3008,
"eval_samples_per_second": 19.947,
"eval_steps_per_second": 4.987,
"step": 3000
},
{
"epoch": 1.1111111111111112,
"grad_norm": 0.09941103309392929,
"learning_rate": 3.7244098458666334e-05,
"loss": 0.0763,
"step": 3050
},
{
"epoch": 1.1293260473588342,
"grad_norm": 0.15530474483966827,
"learning_rate": 3.6804005738561456e-05,
"loss": 0.0734,
"step": 3100
},
{
"epoch": 1.1475409836065573,
"grad_norm": 0.15263348817825317,
"learning_rate": 3.6359154103826205e-05,
"loss": 0.0725,
"step": 3150
},
{
"epoch": 1.1657559198542806,
"grad_norm": 0.1578466147184372,
"learning_rate": 3.590972290128571e-05,
"loss": 0.0759,
"step": 3200
},
{
"epoch": 1.1839708561020037,
"grad_norm": 0.12527093291282654,
"learning_rate": 3.545589332406819e-05,
"loss": 0.0703,
"step": 3250
},
{
"epoch": 1.2021857923497268,
"grad_norm": 0.15813329815864563,
"learning_rate": 3.499784833855492e-05,
"loss": 0.0681,
"step": 3300
},
{
"epoch": 1.2204007285974499,
"grad_norm": 0.11386945098638535,
"learning_rate": 3.453577261061537e-05,
"loss": 0.0708,
"step": 3350
},
{
"epoch": 1.238615664845173,
"grad_norm": 0.14177338778972626,
"learning_rate": 3.4069852431157117e-05,
"loss": 0.0711,
"step": 3400
},
{
"epoch": 1.2568306010928962,
"grad_norm": 0.1626819372177124,
"learning_rate": 3.3600275641020605e-05,
"loss": 0.0694,
"step": 3450
},
{
"epoch": 1.2750455373406193,
"grad_norm": 0.1577647477388382,
"learning_rate": 3.312723155524906e-05,
"loss": 0.0695,
"step": 3500
},
{
"epoch": 1.2750455373406193,
"eval_loss": 0.05769478902220726,
"eval_runtime": 488.439,
"eval_samples_per_second": 19.982,
"eval_steps_per_second": 4.996,
"step": 3500
},
{
"epoch": 1.2932604735883424,
"grad_norm": 0.15615570545196533,
"learning_rate": 3.265091088676406e-05,
"loss": 0.0699,
"step": 3550
},
{
"epoch": 1.3114754098360657,
"grad_norm": 0.12867802381515503,
"learning_rate": 3.217150566947749e-05,
"loss": 0.0641,
"step": 3600
},
{
"epoch": 1.3296903460837888,
"grad_norm": 0.10840287804603577,
"learning_rate": 3.168920918087099e-05,
"loss": 0.0676,
"step": 3650
},
{
"epoch": 1.3479052823315119,
"grad_norm": 0.17157740890979767,
"learning_rate": 3.1204215864074006e-05,
"loss": 0.0608,
"step": 3700
},
{
"epoch": 1.366120218579235,
"grad_norm": 0.08710721135139465,
"learning_rate": 3.0716721249471905e-05,
"loss": 0.0626,
"step": 3750
},
{
"epoch": 1.384335154826958,
"grad_norm": 0.173573300242424,
"learning_rate": 3.022692187587576e-05,
"loss": 0.0638,
"step": 3800
},
{
"epoch": 1.4025500910746813,
"grad_norm": 0.1484445184469223,
"learning_rate": 2.9735015211285528e-05,
"loss": 0.0709,
"step": 3850
},
{
"epoch": 1.4207650273224044,
"grad_norm": 0.1358584463596344,
"learning_rate": 2.9241199573278734e-05,
"loss": 0.0693,
"step": 3900
},
{
"epoch": 1.4389799635701275,
"grad_norm": 0.11952122300863266,
"learning_rate": 2.8745674049056486e-05,
"loss": 0.0695,
"step": 3950
},
{
"epoch": 1.4571948998178508,
"grad_norm": 0.07695221155881882,
"learning_rate": 2.8248638415179308e-05,
"loss": 0.0625,
"step": 4000
},
{
"epoch": 1.4571948998178508,
"eval_loss": 0.056611545383930206,
"eval_runtime": 489.9552,
"eval_samples_per_second": 19.92,
"eval_steps_per_second": 4.98,
"step": 4000
},
{
"epoch": 1.4754098360655736,
"grad_norm": 0.1474759578704834,
"learning_rate": 2.7750293057025035e-05,
"loss": 0.0727,
"step": 4050
},
{
"epoch": 1.493624772313297,
"grad_norm": 0.11704400926828384,
"learning_rate": 2.725083888800124e-05,
"loss": 0.0628,
"step": 4100
},
{
"epoch": 1.51183970856102,
"grad_norm": 0.18205316364765167,
"learning_rate": 2.6750477268544777e-05,
"loss": 0.0663,
"step": 4150
},
{
"epoch": 1.530054644808743,
"grad_norm": 0.13981305062770844,
"learning_rate": 2.6249409924941104e-05,
"loss": 0.0647,
"step": 4200
},
{
"epoch": 1.5482695810564664,
"grad_norm": 0.15816594660282135,
"learning_rate": 2.5747838867996154e-05,
"loss": 0.0713,
"step": 4250
},
{
"epoch": 1.5664845173041895,
"grad_norm": 0.13389204442501068,
"learning_rate": 2.5245966311593405e-05,
"loss": 0.066,
"step": 4300
},
{
"epoch": 1.5846994535519126,
"grad_norm": 0.1280335932970047,
"learning_rate": 2.474399459116916e-05,
"loss": 0.0658,
"step": 4350
},
{
"epoch": 1.6029143897996359,
"grad_norm": 0.15096668899059296,
"learning_rate": 2.4242126082138785e-05,
"loss": 0.0683,
"step": 4400
},
{
"epoch": 1.6211293260473587,
"grad_norm": 0.14366798102855682,
"learning_rate": 2.3740563118306826e-05,
"loss": 0.0628,
"step": 4450
},
{
"epoch": 1.639344262295082,
"grad_norm": 0.13515126705169678,
"learning_rate": 2.323950791029397e-05,
"loss": 0.0664,
"step": 4500
},
{
"epoch": 1.639344262295082,
"eval_loss": 0.0539417639374733,
"eval_runtime": 489.8066,
"eval_samples_per_second": 19.926,
"eval_steps_per_second": 4.982,
"step": 4500
},
{
"epoch": 1.657559198542805,
"grad_norm": 0.1308237463235855,
"learning_rate": 2.2739162464013526e-05,
"loss": 0.0657,
"step": 4550
},
{
"epoch": 1.6757741347905282,
"grad_norm": 0.14330258965492249,
"learning_rate": 2.2239728499230543e-05,
"loss": 0.0689,
"step": 4600
},
{
"epoch": 1.6939890710382515,
"grad_norm": 0.16095995903015137,
"learning_rate": 2.1741407368236353e-05,
"loss": 0.0698,
"step": 4650
},
{
"epoch": 1.7122040072859745,
"grad_norm": 0.1826489120721817,
"learning_rate": 2.1244399974671055e-05,
"loss": 0.0662,
"step": 4700
},
{
"epoch": 1.7304189435336976,
"grad_norm": 0.11087053269147873,
"learning_rate": 2.0748906692527108e-05,
"loss": 0.0585,
"step": 4750
},
{
"epoch": 1.748633879781421,
"grad_norm": 0.1652335375547409,
"learning_rate": 2.0255127285366263e-05,
"loss": 0.0667,
"step": 4800
},
{
"epoch": 1.7668488160291438,
"grad_norm": 0.10961362719535828,
"learning_rate": 1.97632608257828e-05,
"loss": 0.0644,
"step": 4850
},
{
"epoch": 1.785063752276867,
"grad_norm": 0.14005999267101288,
"learning_rate": 1.927350561514512e-05,
"loss": 0.0638,
"step": 4900
},
{
"epoch": 1.8032786885245902,
"grad_norm": 0.16552738845348358,
"learning_rate": 1.8786059103648417e-05,
"loss": 0.0635,
"step": 4950
},
{
"epoch": 1.8214936247723132,
"grad_norm": 0.11428548395633698,
"learning_rate": 1.830111781071047e-05,
"loss": 0.0666,
"step": 5000
},
{
"epoch": 1.8214936247723132,
"eval_loss": 0.05203519016504288,
"eval_runtime": 489.7939,
"eval_samples_per_second": 19.927,
"eval_steps_per_second": 4.982,
"step": 5000
},
{
"epoch": 1.8397085610200365,
"grad_norm": 0.1313815712928772,
"learning_rate": 1.781887724574265e-05,
"loss": 0.061,
"step": 5050
},
{
"epoch": 1.8579234972677594,
"grad_norm": 0.14089354872703552,
"learning_rate": 1.7339531829328163e-05,
"loss": 0.0593,
"step": 5100
},
{
"epoch": 1.8761384335154827,
"grad_norm": 0.145121231675148,
"learning_rate": 1.6863274814839282e-05,
"loss": 0.0635,
"step": 5150
},
{
"epoch": 1.8943533697632058,
"grad_norm": 0.14890490472316742,
"learning_rate": 1.6390298210525095e-05,
"loss": 0.0609,
"step": 5200
},
{
"epoch": 1.9125683060109289,
"grad_norm": 0.10599125921726227,
"learning_rate": 1.5920792702101273e-05,
"loss": 0.0655,
"step": 5250
},
{
"epoch": 1.9307832422586522,
"grad_norm": 0.13904713094234467,
"learning_rate": 1.5454947575873034e-05,
"loss": 0.0614,
"step": 5300
},
{
"epoch": 1.9489981785063752,
"grad_norm": 0.11440698057413101,
"learning_rate": 1.499295064242229e-05,
"loss": 0.0609,
"step": 5350
},
{
"epoch": 1.9672131147540983,
"grad_norm": 0.11769060790538788,
"learning_rate": 1.4534988160889767e-05,
"loss": 0.0597,
"step": 5400
},
{
"epoch": 1.9854280510018216,
"grad_norm": 0.1365710347890854,
"learning_rate": 1.4081244763882529e-05,
"loss": 0.0598,
"step": 5450
},
{
"epoch": 2.0036429872495445,
"grad_norm": 0.089080311357975,
"learning_rate": 1.363190338303737e-05,
"loss": 0.0612,
"step": 5500
},
{
"epoch": 2.0036429872495445,
"eval_loss": 0.05140851065516472,
"eval_runtime": 489.7113,
"eval_samples_per_second": 19.93,
"eval_steps_per_second": 4.983,
"step": 5500
},
{
"epoch": 2.021857923497268,
"grad_norm": 0.11200718581676483,
"learning_rate": 1.3187145175269893e-05,
"loss": 0.0547,
"step": 5550
},
{
"epoch": 2.040072859744991,
"grad_norm": 0.19140245020389557,
"learning_rate": 1.274714944973912e-05,
"loss": 0.0571,
"step": 5600
},
{
"epoch": 2.058287795992714,
"grad_norm": 0.34514617919921875,
"learning_rate": 1.2312093595557001e-05,
"loss": 0.056,
"step": 5650
},
{
"epoch": 2.0765027322404372,
"grad_norm": 0.14967010915279388,
"learning_rate": 1.1882153010272049e-05,
"loss": 0.0565,
"step": 5700
},
{
"epoch": 2.09471766848816,
"grad_norm": 0.13327553868293762,
"learning_rate": 1.1457501029155978e-05,
"loss": 0.0546,
"step": 5750
},
{
"epoch": 2.1129326047358834,
"grad_norm": 0.10935332626104355,
"learning_rate": 1.1038308855321542e-05,
"loss": 0.0582,
"step": 5800
},
{
"epoch": 2.1311475409836067,
"grad_norm": 0.09595629572868347,
"learning_rate": 1.0624745490700228e-05,
"loss": 0.0563,
"step": 5850
},
{
"epoch": 2.1493624772313296,
"grad_norm": 0.14973677694797516,
"learning_rate": 1.0216977667907232e-05,
"loss": 0.0587,
"step": 5900
},
{
"epoch": 2.167577413479053,
"grad_norm": 0.12422586977481842,
"learning_rate": 9.815169783021347e-06,
"loss": 0.0569,
"step": 5950
},
{
"epoch": 2.185792349726776,
"grad_norm": 0.1235915794968605,
"learning_rate": 9.419483829306938e-06,
"loss": 0.0558,
"step": 6000
},
{
"epoch": 2.185792349726776,
"eval_loss": 0.0504976250231266,
"eval_runtime": 489.7715,
"eval_samples_per_second": 19.928,
"eval_steps_per_second": 4.982,
"step": 6000
},
{
"epoch": 2.204007285974499,
"grad_norm": 0.14746901392936707,
"learning_rate": 9.030079331904512e-06,
"loss": 0.0528,
"step": 6050
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.12787629663944244,
"learning_rate": 8.647113283516454e-06,
"loss": 0.055,
"step": 6100
},
{
"epoch": 2.240437158469945,
"grad_norm": 0.14677022397518158,
"learning_rate": 8.270740081113684e-06,
"loss": 0.0568,
"step": 6150
},
{
"epoch": 2.2586520947176685,
"grad_norm": 0.15499144792556763,
"learning_rate": 7.90111146368878e-06,
"loss": 0.0551,
"step": 6200
},
{
"epoch": 2.276867030965392,
"grad_norm": 0.13909883797168732,
"learning_rate": 7.5383764510807975e-06,
"loss": 0.057,
"step": 6250
},
{
"epoch": 2.2950819672131146,
"grad_norm": 0.11617127805948257,
"learning_rate": 7.182681283896253e-06,
"loss": 0.0537,
"step": 6300
},
{
"epoch": 2.313296903460838,
"grad_norm": 0.14818504452705383,
"learning_rate": 6.834169364550597e-06,
"loss": 0.052,
"step": 6350
},
{
"epoch": 2.3315118397085612,
"grad_norm": 0.10602527111768723,
"learning_rate": 6.492981199453996e-06,
"loss": 0.0525,
"step": 6400
},
{
"epoch": 2.349726775956284,
"grad_norm": 0.10226523876190186,
"learning_rate": 6.159254342364609e-06,
"loss": 0.0538,
"step": 6450
},
{
"epoch": 2.3679417122040074,
"grad_norm": 0.10851209610700607,
"learning_rate": 5.833123338932256e-06,
"loss": 0.0571,
"step": 6500
},
{
"epoch": 2.3679417122040074,
"eval_loss": 0.05036979168653488,
"eval_runtime": 489.7412,
"eval_samples_per_second": 19.929,
"eval_steps_per_second": 4.982,
"step": 6500
},
{
"epoch": 2.3861566484517303,
"grad_norm": 0.1328704059123993,
"learning_rate": 5.51471967245491e-06,
"loss": 0.0556,
"step": 6550
},
{
"epoch": 2.4043715846994536,
"grad_norm": 0.09755656123161316,
"learning_rate": 5.2041717108697065e-06,
"loss": 0.0514,
"step": 6600
},
{
"epoch": 2.422586520947177,
"grad_norm": 0.10794492810964584,
"learning_rate": 4.90160465500005e-06,
"loss": 0.0529,
"step": 6650
},
{
"epoch": 2.4408014571948997,
"grad_norm": 0.11240658164024353,
"learning_rate": 4.607140488079492e-06,
"loss": 0.0536,
"step": 6700
},
{
"epoch": 2.459016393442623,
"grad_norm": 0.11631111800670624,
"learning_rate": 4.320897926572853e-06,
"loss": 0.0522,
"step": 6750
},
{
"epoch": 2.477231329690346,
"grad_norm": 0.08789575099945068,
"learning_rate": 4.0429923723143915e-06,
"loss": 0.0489,
"step": 6800
},
{
"epoch": 2.495446265938069,
"grad_norm": 0.10507268458604813,
"learning_rate": 3.7735358659822752e-06,
"loss": 0.05,
"step": 6850
},
{
"epoch": 2.5136612021857925,
"grad_norm": 0.10618982464075089,
"learning_rate": 3.5126370419281436e-06,
"loss": 0.0605,
"step": 6900
},
{
"epoch": 2.5318761384335153,
"grad_norm": 0.1154901459813118,
"learning_rate": 3.260401084379991e-06,
"loss": 0.053,
"step": 6950
},
{
"epoch": 2.5500910746812386,
"grad_norm": 0.1327841877937317,
"learning_rate": 3.0169296850359878e-06,
"loss": 0.056,
"step": 7000
},
{
"epoch": 2.5500910746812386,
"eval_loss": 0.049849580973386765,
"eval_runtime": 489.6947,
"eval_samples_per_second": 19.931,
"eval_steps_per_second": 4.983,
"step": 7000
},
{
"epoch": 2.5683060109289615,
"grad_norm": 0.12928417325019836,
"learning_rate": 2.782321002066332e-06,
"loss": 0.0568,
"step": 7050
},
{
"epoch": 2.586520947176685,
"grad_norm": 0.14363490045070648,
"learning_rate": 2.556669620539734e-06,
"loss": 0.054,
"step": 7100
},
{
"epoch": 2.604735883424408,
"grad_norm": 0.12806904315948486,
"learning_rate": 2.3400665142903927e-06,
"loss": 0.0525,
"step": 7150
},
{
"epoch": 2.6229508196721314,
"grad_norm": 0.11711638420820236,
"learning_rate": 2.132599009240882e-06,
"loss": 0.0544,
"step": 7200
},
{
"epoch": 2.6411657559198543,
"grad_norm": 0.11255411803722382,
"learning_rate": 1.9343507481957846e-06,
"loss": 0.0507,
"step": 7250
},
{
"epoch": 2.6593806921675776,
"grad_norm": 0.13773201406002045,
"learning_rate": 1.7454016571201186e-06,
"loss": 0.0534,
"step": 7300
},
{
"epoch": 2.6775956284153004,
"grad_norm": 0.11689828336238861,
"learning_rate": 1.5658279129163706e-06,
"loss": 0.0562,
"step": 7350
},
{
"epoch": 2.6958105646630237,
"grad_norm": 0.1306321620941162,
"learning_rate": 1.3957019127128851e-06,
"loss": 0.0493,
"step": 7400
},
{
"epoch": 2.714025500910747,
"grad_norm": 0.12783832848072052,
"learning_rate": 1.235092244676192e-06,
"loss": 0.0541,
"step": 7450
},
{
"epoch": 2.73224043715847,
"grad_norm": 0.11963380128145218,
"learning_rate": 1.0840636603589444e-06,
"loss": 0.0541,
"step": 7500
},
{
"epoch": 2.73224043715847,
"eval_loss": 0.04980416223406792,
"eval_runtime": 489.8577,
"eval_samples_per_second": 19.924,
"eval_steps_per_second": 4.981,
"step": 7500
},
{
"epoch": 2.750455373406193,
"grad_norm": 0.12381980568170547,
"learning_rate": 9.426770485945924e-07,
"loss": 0.0541,
"step": 7550
},
{
"epoch": 2.768670309653916,
"grad_norm": 0.14063633978366852,
"learning_rate": 8.109894109493976e-07,
"loss": 0.0511,
"step": 7600
},
{
"epoch": 2.7868852459016393,
"grad_norm": 0.10304458439350128,
"learning_rate": 6.890538387416212e-07,
"loss": 0.0533,
"step": 7650
},
{
"epoch": 2.8051001821493626,
"grad_norm": 0.16216354072093964,
"learning_rate": 5.76919491637179e-07,
"loss": 0.0598,
"step": 7700
},
{
"epoch": 2.8233151183970855,
"grad_norm": 0.13660581409931183,
"learning_rate": 4.746315778303756e-07,
"loss": 0.0531,
"step": 7750
},
{
"epoch": 2.841530054644809,
"grad_norm": 0.28685104846954346,
"learning_rate": 3.8223133581772595e-07,
"loss": 0.0542,
"step": 7800
},
{
"epoch": 2.8597449908925316,
"grad_norm": 0.11297158896923065,
"learning_rate": 2.9975601777219863e-07,
"loss": 0.0453,
"step": 7850
},
{
"epoch": 2.877959927140255,
"grad_norm": 0.1289464831352234,
"learning_rate": 2.2723887452461013e-07,
"loss": 0.0527,
"step": 7900
},
{
"epoch": 2.8961748633879782,
"grad_norm": 0.13939346373081207,
"learning_rate": 1.6470914215816758e-07,
"loss": 0.0544,
"step": 7950
},
{
"epoch": 2.9143897996357016,
"grad_norm": 0.12807384133338928,
"learning_rate": 1.1219203022162505e-07,
"loss": 0.0558,
"step": 8000
},
{
"epoch": 2.9143897996357016,
"eval_loss": 0.04983741417527199,
"eval_runtime": 489.7256,
"eval_samples_per_second": 19.93,
"eval_steps_per_second": 4.982,
"step": 8000
},
{
"epoch": 2.9326047358834244,
"grad_norm": 0.11168187856674194,
"learning_rate": 6.970871156578573e-08,
"loss": 0.0513,
"step": 8050
},
{
"epoch": 2.9508196721311473,
"grad_norm": 0.11235981434583664,
"learning_rate": 3.7276313807418917e-08,
"loss": 0.0513,
"step": 8100
},
{
"epoch": 2.9690346083788706,
"grad_norm": 0.16754215955734253,
"learning_rate": 1.4907912424091952e-08,
"loss": 0.0528,
"step": 8150
},
{
"epoch": 2.987249544626594,
"grad_norm": 0.12086477875709534,
"learning_rate": 2.612525482631467e-09,
"loss": 0.0515,
"step": 8200
}
],
"logging_steps": 50,
"max_steps": 8235,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4247978274909594e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}