{ "best_global_step": 13660, "best_metric": 0.5841874084919473, "best_model_checkpoint": "./saved_models/starencoder/checkpoint-13660", "epoch": 5.0, "eval_steps": 500, "global_step": 13660, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018301610541727673, "grad_norm": 6.326382637023926, "learning_rate": 1.992825768667643e-05, "loss": 0.7093, "step": 50 }, { "epoch": 0.036603221083455345, "grad_norm": 1.9646369218826294, "learning_rate": 1.9855051244509516e-05, "loss": 0.6877, "step": 100 }, { "epoch": 0.05490483162518302, "grad_norm": 3.757148027420044, "learning_rate": 1.9781844802342606e-05, "loss": 0.7064, "step": 150 }, { "epoch": 0.07320644216691069, "grad_norm": 2.2845559120178223, "learning_rate": 1.97086383601757e-05, "loss": 0.6941, "step": 200 }, { "epoch": 0.09150805270863836, "grad_norm": 3.454932689666748, "learning_rate": 1.9635431918008785e-05, "loss": 0.697, "step": 250 }, { "epoch": 0.10980966325036604, "grad_norm": 2.188014268875122, "learning_rate": 1.9562225475841875e-05, "loss": 0.6939, "step": 300 }, { "epoch": 0.1281112737920937, "grad_norm": 3.7213497161865234, "learning_rate": 1.9489019033674965e-05, "loss": 0.7021, "step": 350 }, { "epoch": 0.14641288433382138, "grad_norm": 4.130824565887451, "learning_rate": 1.9415812591508055e-05, "loss": 0.6829, "step": 400 }, { "epoch": 0.16471449487554904, "grad_norm": 2.1340243816375732, "learning_rate": 1.9342606149341144e-05, "loss": 0.7163, "step": 450 }, { "epoch": 0.18301610541727673, "grad_norm": 2.3844826221466064, "learning_rate": 1.926939970717423e-05, "loss": 0.6817, "step": 500 }, { "epoch": 0.20131771595900438, "grad_norm": 5.518948078155518, "learning_rate": 1.919619326500732e-05, "loss": 0.6755, "step": 550 }, { "epoch": 0.21961932650073207, "grad_norm": 7.737420082092285, "learning_rate": 1.9122986822840414e-05, "loss": 0.6717, "step": 600 }, { "epoch": 0.23792093704245973, "grad_norm": 4.545921802520752, "learning_rate": 1.90497803806735e-05, "loss": 0.6749, "step": 650 }, { "epoch": 0.2562225475841874, "grad_norm": 2.528022289276123, "learning_rate": 1.897657393850659e-05, "loss": 0.6701, "step": 700 }, { "epoch": 0.2745241581259151, "grad_norm": 2.498490571975708, "learning_rate": 1.890336749633968e-05, "loss": 0.6808, "step": 750 }, { "epoch": 0.29282576866764276, "grad_norm": 2.387111186981201, "learning_rate": 1.883016105417277e-05, "loss": 0.6925, "step": 800 }, { "epoch": 0.31112737920937045, "grad_norm": 4.5703253746032715, "learning_rate": 1.875695461200586e-05, "loss": 0.71, "step": 850 }, { "epoch": 0.3294289897510981, "grad_norm": 2.2322046756744385, "learning_rate": 1.8683748169838946e-05, "loss": 0.6978, "step": 900 }, { "epoch": 0.34773060029282576, "grad_norm": 3.186796188354492, "learning_rate": 1.8610541727672035e-05, "loss": 0.6999, "step": 950 }, { "epoch": 0.36603221083455345, "grad_norm": 2.268221855163574, "learning_rate": 1.853733528550513e-05, "loss": 0.7027, "step": 1000 }, { "epoch": 0.38433382137628114, "grad_norm": 3.345116138458252, "learning_rate": 1.8464128843338215e-05, "loss": 0.687, "step": 1050 }, { "epoch": 0.40263543191800877, "grad_norm": 3.4372918605804443, "learning_rate": 1.8390922401171305e-05, "loss": 0.6892, "step": 1100 }, { "epoch": 0.42093704245973645, "grad_norm": 4.888953685760498, "learning_rate": 1.8317715959004394e-05, "loss": 0.688, "step": 1150 }, { "epoch": 0.43923865300146414, "grad_norm": 3.7842814922332764, "learning_rate": 1.8244509516837484e-05, "loss": 0.6893, "step": 1200 }, { "epoch": 0.4575402635431918, "grad_norm": 2.004110336303711, "learning_rate": 1.817130307467057e-05, "loss": 0.6965, "step": 1250 }, { "epoch": 0.47584187408491946, "grad_norm": 3.093764543533325, "learning_rate": 1.809809663250366e-05, "loss": 0.6946, "step": 1300 }, { "epoch": 0.49414348462664714, "grad_norm": 4.060361385345459, "learning_rate": 1.802489019033675e-05, "loss": 0.6926, "step": 1350 }, { "epoch": 0.5124450951683748, "grad_norm": 6.572564601898193, "learning_rate": 1.795168374816984e-05, "loss": 0.6926, "step": 1400 }, { "epoch": 0.5307467057101025, "grad_norm": 5.1151251792907715, "learning_rate": 1.787847730600293e-05, "loss": 0.6985, "step": 1450 }, { "epoch": 0.5490483162518301, "grad_norm": 7.343419075012207, "learning_rate": 1.780527086383602e-05, "loss": 0.6954, "step": 1500 }, { "epoch": 0.5673499267935578, "grad_norm": 4.611226558685303, "learning_rate": 1.773206442166911e-05, "loss": 0.683, "step": 1550 }, { "epoch": 0.5856515373352855, "grad_norm": 1.5302119255065918, "learning_rate": 1.76588579795022e-05, "loss": 0.6915, "step": 1600 }, { "epoch": 0.6039531478770132, "grad_norm": 1.781995415687561, "learning_rate": 1.7585651537335285e-05, "loss": 0.7022, "step": 1650 }, { "epoch": 0.6222547584187409, "grad_norm": 1.6124660968780518, "learning_rate": 1.7512445095168375e-05, "loss": 0.6913, "step": 1700 }, { "epoch": 0.6405563689604685, "grad_norm": 4.99559211730957, "learning_rate": 1.7439238653001465e-05, "loss": 0.6996, "step": 1750 }, { "epoch": 0.6588579795021962, "grad_norm": 9.72033405303955, "learning_rate": 1.7366032210834554e-05, "loss": 0.7121, "step": 1800 }, { "epoch": 0.6771595900439239, "grad_norm": 2.2888541221618652, "learning_rate": 1.7292825768667644e-05, "loss": 0.6852, "step": 1850 }, { "epoch": 0.6954612005856515, "grad_norm": 7.631497383117676, "learning_rate": 1.7219619326500734e-05, "loss": 0.6971, "step": 1900 }, { "epoch": 0.7137628111273792, "grad_norm": 1.9625825881958008, "learning_rate": 1.7146412884333824e-05, "loss": 0.6881, "step": 1950 }, { "epoch": 0.7320644216691069, "grad_norm": 10.556265830993652, "learning_rate": 1.7073206442166913e-05, "loss": 0.6978, "step": 2000 }, { "epoch": 0.7503660322108345, "grad_norm": 5.823647499084473, "learning_rate": 1.7e-05, "loss": 0.6974, "step": 2050 }, { "epoch": 0.7686676427525623, "grad_norm": 5.508345603942871, "learning_rate": 1.692679355783309e-05, "loss": 0.7045, "step": 2100 }, { "epoch": 0.7869692532942899, "grad_norm": 7.553188800811768, "learning_rate": 1.685358711566618e-05, "loss": 0.7011, "step": 2150 }, { "epoch": 0.8052708638360175, "grad_norm": 3.6769418716430664, "learning_rate": 1.678038067349927e-05, "loss": 0.6934, "step": 2200 }, { "epoch": 0.8235724743777453, "grad_norm": 2.2349374294281006, "learning_rate": 1.670717423133236e-05, "loss": 0.6951, "step": 2250 }, { "epoch": 0.8418740849194729, "grad_norm": 1.6531909704208374, "learning_rate": 1.663396778916545e-05, "loss": 0.7001, "step": 2300 }, { "epoch": 0.8601756954612005, "grad_norm": 4.906118869781494, "learning_rate": 1.656076134699854e-05, "loss": 0.6966, "step": 2350 }, { "epoch": 0.8784773060029283, "grad_norm": 7.1544013023376465, "learning_rate": 1.6487554904831625e-05, "loss": 0.6882, "step": 2400 }, { "epoch": 0.8967789165446559, "grad_norm": 1.9310983419418335, "learning_rate": 1.6414348462664715e-05, "loss": 0.7183, "step": 2450 }, { "epoch": 0.9150805270863837, "grad_norm": 5.646407127380371, "learning_rate": 1.6341142020497804e-05, "loss": 0.6884, "step": 2500 }, { "epoch": 0.9333821376281113, "grad_norm": 1.564105749130249, "learning_rate": 1.6267935578330894e-05, "loss": 0.6979, "step": 2550 }, { "epoch": 0.9516837481698389, "grad_norm": 2.552931308746338, "learning_rate": 1.6194729136163984e-05, "loss": 0.6921, "step": 2600 }, { "epoch": 0.9699853587115667, "grad_norm": 4.336668968200684, "learning_rate": 1.6121522693997074e-05, "loss": 0.6956, "step": 2650 }, { "epoch": 0.9882869692532943, "grad_norm": 1.3156540393829346, "learning_rate": 1.6048316251830163e-05, "loss": 0.6962, "step": 2700 }, { "epoch": 1.0, "eval_accuracy": 0.5655197657393851, "eval_loss": 0.6866211295127869, "eval_runtime": 30.1911, "eval_samples_per_second": 90.49, "eval_steps_per_second": 5.664, "step": 2732 }, { "epoch": 1.006588579795022, "grad_norm": 4.257384777069092, "learning_rate": 1.5975109809663253e-05, "loss": 0.6824, "step": 2750 }, { "epoch": 1.0248901903367496, "grad_norm": 4.433679580688477, "learning_rate": 1.590190336749634e-05, "loss": 0.6886, "step": 2800 }, { "epoch": 1.0431918008784773, "grad_norm": 2.416440010070801, "learning_rate": 1.582869692532943e-05, "loss": 0.7093, "step": 2850 }, { "epoch": 1.061493411420205, "grad_norm": 3.4436511993408203, "learning_rate": 1.575549048316252e-05, "loss": 0.6854, "step": 2900 }, { "epoch": 1.0797950219619326, "grad_norm": 3.391012668609619, "learning_rate": 1.568228404099561e-05, "loss": 0.6916, "step": 2950 }, { "epoch": 1.0980966325036603, "grad_norm": 2.20050311088562, "learning_rate": 1.56090775988287e-05, "loss": 0.6928, "step": 3000 }, { "epoch": 1.116398243045388, "grad_norm": 3.6996657848358154, "learning_rate": 1.5535871156661788e-05, "loss": 0.6978, "step": 3050 }, { "epoch": 1.1346998535871156, "grad_norm": 5.966168403625488, "learning_rate": 1.5462664714494878e-05, "loss": 0.6835, "step": 3100 }, { "epoch": 1.1530014641288433, "grad_norm": 7.096467971801758, "learning_rate": 1.5389458272327968e-05, "loss": 0.7008, "step": 3150 }, { "epoch": 1.171303074670571, "grad_norm": 2.978032350540161, "learning_rate": 1.5316251830161054e-05, "loss": 0.6923, "step": 3200 }, { "epoch": 1.1896046852122986, "grad_norm": 2.5770692825317383, "learning_rate": 1.5243045387994144e-05, "loss": 0.6988, "step": 3250 }, { "epoch": 1.2079062957540263, "grad_norm": 1.8154376745224, "learning_rate": 1.5169838945827234e-05, "loss": 0.693, "step": 3300 }, { "epoch": 1.226207906295754, "grad_norm": 5.659831523895264, "learning_rate": 1.5096632503660322e-05, "loss": 0.6951, "step": 3350 }, { "epoch": 1.2445095168374818, "grad_norm": 3.904559373855591, "learning_rate": 1.5023426061493413e-05, "loss": 0.6901, "step": 3400 }, { "epoch": 1.2628111273792093, "grad_norm": 3.0381088256835938, "learning_rate": 1.4950219619326503e-05, "loss": 0.6871, "step": 3450 }, { "epoch": 1.281112737920937, "grad_norm": 3.667318344116211, "learning_rate": 1.4877013177159591e-05, "loss": 0.6912, "step": 3500 }, { "epoch": 1.2994143484626648, "grad_norm": 6.238102912902832, "learning_rate": 1.480380673499268e-05, "loss": 0.6983, "step": 3550 }, { "epoch": 1.3177159590043923, "grad_norm": 4.712520599365234, "learning_rate": 1.4730600292825769e-05, "loss": 0.6898, "step": 3600 }, { "epoch": 1.33601756954612, "grad_norm": 4.511780261993408, "learning_rate": 1.4657393850658859e-05, "loss": 0.6972, "step": 3650 }, { "epoch": 1.3543191800878478, "grad_norm": 2.4388058185577393, "learning_rate": 1.4584187408491948e-05, "loss": 0.6962, "step": 3700 }, { "epoch": 1.3726207906295755, "grad_norm": 2.305985927581787, "learning_rate": 1.4510980966325036e-05, "loss": 0.696, "step": 3750 }, { "epoch": 1.390922401171303, "grad_norm": 5.038622856140137, "learning_rate": 1.4437774524158128e-05, "loss": 0.6945, "step": 3800 }, { "epoch": 1.4092240117130308, "grad_norm": 1.7405680418014526, "learning_rate": 1.4364568081991218e-05, "loss": 0.6975, "step": 3850 }, { "epoch": 1.4275256222547585, "grad_norm": 2.3643574714660645, "learning_rate": 1.4291361639824306e-05, "loss": 0.6961, "step": 3900 }, { "epoch": 1.445827232796486, "grad_norm": 5.455554008483887, "learning_rate": 1.4218155197657395e-05, "loss": 0.6974, "step": 3950 }, { "epoch": 1.4641288433382138, "grad_norm": 1.3421140909194946, "learning_rate": 1.4144948755490484e-05, "loss": 0.6929, "step": 4000 }, { "epoch": 1.4824304538799415, "grad_norm": 2.995839834213257, "learning_rate": 1.4071742313323573e-05, "loss": 0.6974, "step": 4050 }, { "epoch": 1.500732064421669, "grad_norm": 4.329990386962891, "learning_rate": 1.3998535871156661e-05, "loss": 0.6968, "step": 4100 }, { "epoch": 1.5190336749633968, "grad_norm": 2.4263505935668945, "learning_rate": 1.3925329428989751e-05, "loss": 0.694, "step": 4150 }, { "epoch": 1.5373352855051245, "grad_norm": 5.850309371948242, "learning_rate": 1.3852122986822843e-05, "loss": 0.6975, "step": 4200 }, { "epoch": 1.555636896046852, "grad_norm": 1.353864073753357, "learning_rate": 1.377891654465593e-05, "loss": 0.7005, "step": 4250 }, { "epoch": 1.5739385065885798, "grad_norm": 2.510244369506836, "learning_rate": 1.370571010248902e-05, "loss": 0.6906, "step": 4300 }, { "epoch": 1.5922401171303076, "grad_norm": 3.187347888946533, "learning_rate": 1.363250366032211e-05, "loss": 0.6902, "step": 4350 }, { "epoch": 1.610541727672035, "grad_norm": 2.019270420074463, "learning_rate": 1.3559297218155198e-05, "loss": 0.6959, "step": 4400 }, { "epoch": 1.6288433382137628, "grad_norm": 3.3284361362457275, "learning_rate": 1.3486090775988288e-05, "loss": 0.6901, "step": 4450 }, { "epoch": 1.6471449487554906, "grad_norm": 2.3892571926116943, "learning_rate": 1.3412884333821376e-05, "loss": 0.6831, "step": 4500 }, { "epoch": 1.665446559297218, "grad_norm": 2.138828992843628, "learning_rate": 1.3339677891654466e-05, "loss": 0.6961, "step": 4550 }, { "epoch": 1.6837481698389458, "grad_norm": 2.1902518272399902, "learning_rate": 1.3266471449487557e-05, "loss": 0.7011, "step": 4600 }, { "epoch": 1.7020497803806736, "grad_norm": 1.9349195957183838, "learning_rate": 1.3193265007320645e-05, "loss": 0.6964, "step": 4650 }, { "epoch": 1.720351390922401, "grad_norm": 2.414310932159424, "learning_rate": 1.3120058565153735e-05, "loss": 0.6993, "step": 4700 }, { "epoch": 1.7386530014641288, "grad_norm": 2.0503060817718506, "learning_rate": 1.3046852122986823e-05, "loss": 0.6945, "step": 4750 }, { "epoch": 1.7569546120058566, "grad_norm": 3.2496321201324463, "learning_rate": 1.2973645680819913e-05, "loss": 0.6984, "step": 4800 }, { "epoch": 1.775256222547584, "grad_norm": 4.175572395324707, "learning_rate": 1.2900439238653003e-05, "loss": 0.6945, "step": 4850 }, { "epoch": 1.7935578330893118, "grad_norm": 1.1053935289382935, "learning_rate": 1.282723279648609e-05, "loss": 0.6947, "step": 4900 }, { "epoch": 1.8118594436310396, "grad_norm": 4.932358741760254, "learning_rate": 1.2754026354319182e-05, "loss": 0.6893, "step": 4950 }, { "epoch": 1.830161054172767, "grad_norm": 1.429621934890747, "learning_rate": 1.2680819912152272e-05, "loss": 0.6711, "step": 5000 }, { "epoch": 1.8484626647144948, "grad_norm": 3.933276891708374, "learning_rate": 1.260761346998536e-05, "loss": 0.6942, "step": 5050 }, { "epoch": 1.8667642752562226, "grad_norm": 1.696254849433899, "learning_rate": 1.253440702781845e-05, "loss": 0.6922, "step": 5100 }, { "epoch": 1.88506588579795, "grad_norm": 2.4555442333221436, "learning_rate": 1.2461200585651538e-05, "loss": 0.6923, "step": 5150 }, { "epoch": 1.903367496339678, "grad_norm": 3.334120035171509, "learning_rate": 1.2387994143484628e-05, "loss": 0.6872, "step": 5200 }, { "epoch": 1.9216691068814056, "grad_norm": 1.637233018875122, "learning_rate": 1.2314787701317716e-05, "loss": 0.6832, "step": 5250 }, { "epoch": 1.939970717423133, "grad_norm": 1.511348009109497, "learning_rate": 1.2241581259150805e-05, "loss": 0.6892, "step": 5300 }, { "epoch": 1.958272327964861, "grad_norm": 3.0342915058135986, "learning_rate": 1.2168374816983897e-05, "loss": 0.6877, "step": 5350 }, { "epoch": 1.9765739385065886, "grad_norm": 4.36331033706665, "learning_rate": 1.2095168374816985e-05, "loss": 0.7022, "step": 5400 }, { "epoch": 1.994875549048316, "grad_norm": 1.6790521144866943, "learning_rate": 1.2021961932650075e-05, "loss": 0.689, "step": 5450 }, { "epoch": 2.0, "eval_accuracy": 0.43448023426061494, "eval_loss": 0.6969389319419861, "eval_runtime": 9.7517, "eval_samples_per_second": 280.155, "eval_steps_per_second": 17.535, "step": 5464 }, { "epoch": 2.013177159590044, "grad_norm": 2.8888535499572754, "learning_rate": 1.1948755490483164e-05, "loss": 0.6928, "step": 5500 }, { "epoch": 2.0314787701317716, "grad_norm": 1.8324801921844482, "learning_rate": 1.1875549048316253e-05, "loss": 0.6899, "step": 5550 }, { "epoch": 2.049780380673499, "grad_norm": 1.6425199508666992, "learning_rate": 1.1802342606149342e-05, "loss": 0.6923, "step": 5600 }, { "epoch": 2.068081991215227, "grad_norm": 2.4402670860290527, "learning_rate": 1.172913616398243e-05, "loss": 0.6882, "step": 5650 }, { "epoch": 2.0863836017569546, "grad_norm": 1.4472464323043823, "learning_rate": 1.165592972181552e-05, "loss": 0.6899, "step": 5700 }, { "epoch": 2.104685212298682, "grad_norm": 2.157292127609253, "learning_rate": 1.1582723279648612e-05, "loss": 0.7026, "step": 5750 }, { "epoch": 2.12298682284041, "grad_norm": 3.735875129699707, "learning_rate": 1.15095168374817e-05, "loss": 0.6818, "step": 5800 }, { "epoch": 2.1412884333821376, "grad_norm": 3.0418055057525635, "learning_rate": 1.143631039531479e-05, "loss": 0.6909, "step": 5850 }, { "epoch": 2.159590043923865, "grad_norm": 1.1220214366912842, "learning_rate": 1.1363103953147877e-05, "loss": 0.6977, "step": 5900 }, { "epoch": 2.177891654465593, "grad_norm": 1.926895022392273, "learning_rate": 1.1289897510980967e-05, "loss": 0.6937, "step": 5950 }, { "epoch": 2.1961932650073206, "grad_norm": 1.1366084814071655, "learning_rate": 1.1216691068814057e-05, "loss": 0.6981, "step": 6000 }, { "epoch": 2.214494875549048, "grad_norm": 2.684913396835327, "learning_rate": 1.1143484626647145e-05, "loss": 0.6923, "step": 6050 }, { "epoch": 2.232796486090776, "grad_norm": 1.1763278245925903, "learning_rate": 1.1070278184480235e-05, "loss": 0.6884, "step": 6100 }, { "epoch": 2.2510980966325036, "grad_norm": 0.9922837615013123, "learning_rate": 1.0997071742313326e-05, "loss": 0.6972, "step": 6150 }, { "epoch": 2.269399707174231, "grad_norm": 1.7432096004486084, "learning_rate": 1.0923865300146414e-05, "loss": 0.6956, "step": 6200 }, { "epoch": 2.287701317715959, "grad_norm": 5.431227207183838, "learning_rate": 1.0850658857979504e-05, "loss": 0.6949, "step": 6250 }, { "epoch": 2.3060029282576866, "grad_norm": 2.5441296100616455, "learning_rate": 1.0777452415812592e-05, "loss": 0.6922, "step": 6300 }, { "epoch": 2.3243045387994146, "grad_norm": 2.507460832595825, "learning_rate": 1.0704245973645682e-05, "loss": 0.6776, "step": 6350 }, { "epoch": 2.342606149341142, "grad_norm": 4.670897006988525, "learning_rate": 1.063103953147877e-05, "loss": 0.6888, "step": 6400 }, { "epoch": 2.3609077598828696, "grad_norm": 5.1314167976379395, "learning_rate": 1.055783308931186e-05, "loss": 0.6966, "step": 6450 }, { "epoch": 2.379209370424597, "grad_norm": 2.4066569805145264, "learning_rate": 1.048462664714495e-05, "loss": 0.6983, "step": 6500 }, { "epoch": 2.397510980966325, "grad_norm": 3.4087448120117188, "learning_rate": 1.041142020497804e-05, "loss": 0.6973, "step": 6550 }, { "epoch": 2.4158125915080526, "grad_norm": 1.7125242948532104, "learning_rate": 1.0338213762811129e-05, "loss": 0.6948, "step": 6600 }, { "epoch": 2.4341142020497806, "grad_norm": 3.0995583534240723, "learning_rate": 1.0265007320644219e-05, "loss": 0.6973, "step": 6650 }, { "epoch": 2.452415812591508, "grad_norm": 4.187213897705078, "learning_rate": 1.0191800878477307e-05, "loss": 0.6839, "step": 6700 }, { "epoch": 2.4707174231332356, "grad_norm": 2.849050521850586, "learning_rate": 1.0118594436310397e-05, "loss": 0.6927, "step": 6750 }, { "epoch": 2.4890190336749636, "grad_norm": 3.120654821395874, "learning_rate": 1.0045387994143485e-05, "loss": 0.6887, "step": 6800 }, { "epoch": 2.507320644216691, "grad_norm": 3.0376622676849365, "learning_rate": 9.972181551976574e-06, "loss": 0.6786, "step": 6850 }, { "epoch": 2.5256222547584186, "grad_norm": 2.4937405586242676, "learning_rate": 9.898975109809664e-06, "loss": 0.7005, "step": 6900 }, { "epoch": 2.5439238653001466, "grad_norm": 1.2871942520141602, "learning_rate": 9.825768667642754e-06, "loss": 0.6933, "step": 6950 }, { "epoch": 2.562225475841874, "grad_norm": 1.9713603258132935, "learning_rate": 9.752562225475842e-06, "loss": 0.6986, "step": 7000 }, { "epoch": 2.5805270863836016, "grad_norm": 3.823519706726074, "learning_rate": 9.679355783308932e-06, "loss": 0.6926, "step": 7050 }, { "epoch": 2.5988286969253296, "grad_norm": 1.7224080562591553, "learning_rate": 9.606149341142022e-06, "loss": 0.6894, "step": 7100 }, { "epoch": 2.617130307467057, "grad_norm": 1.2425187826156616, "learning_rate": 9.532942898975111e-06, "loss": 0.6898, "step": 7150 }, { "epoch": 2.6354319180087846, "grad_norm": 3.008572816848755, "learning_rate": 9.4597364568082e-06, "loss": 0.6902, "step": 7200 }, { "epoch": 2.6537335285505126, "grad_norm": 2.5645570755004883, "learning_rate": 9.386530014641289e-06, "loss": 0.6828, "step": 7250 }, { "epoch": 2.67203513909224, "grad_norm": 1.7012056112289429, "learning_rate": 9.313323572474379e-06, "loss": 0.6969, "step": 7300 }, { "epoch": 2.6903367496339676, "grad_norm": 6.18578577041626, "learning_rate": 9.240117130307467e-06, "loss": 0.68, "step": 7350 }, { "epoch": 2.7086383601756956, "grad_norm": 2.9371464252471924, "learning_rate": 9.166910688140557e-06, "loss": 0.7052, "step": 7400 }, { "epoch": 2.726939970717423, "grad_norm": 1.306884527206421, "learning_rate": 9.093704245973646e-06, "loss": 0.6945, "step": 7450 }, { "epoch": 2.745241581259151, "grad_norm": 2.4502739906311035, "learning_rate": 9.020497803806736e-06, "loss": 0.6998, "step": 7500 }, { "epoch": 2.7635431918008786, "grad_norm": 3.148775100708008, "learning_rate": 8.947291361639824e-06, "loss": 0.6931, "step": 7550 }, { "epoch": 2.781844802342606, "grad_norm": 2.4222412109375, "learning_rate": 8.874084919472914e-06, "loss": 0.6939, "step": 7600 }, { "epoch": 2.8001464128843336, "grad_norm": 2.543954610824585, "learning_rate": 8.800878477306004e-06, "loss": 0.6934, "step": 7650 }, { "epoch": 2.8184480234260616, "grad_norm": 1.2464910745620728, "learning_rate": 8.727672035139094e-06, "loss": 0.6919, "step": 7700 }, { "epoch": 2.836749633967789, "grad_norm": 3.3286757469177246, "learning_rate": 8.654465592972182e-06, "loss": 0.6955, "step": 7750 }, { "epoch": 2.855051244509517, "grad_norm": 1.269946575164795, "learning_rate": 8.581259150805271e-06, "loss": 0.6904, "step": 7800 }, { "epoch": 2.8733528550512446, "grad_norm": 2.8248860836029053, "learning_rate": 8.508052708638361e-06, "loss": 0.6876, "step": 7850 }, { "epoch": 2.891654465592972, "grad_norm": 4.0137553215026855, "learning_rate": 8.434846266471451e-06, "loss": 0.6901, "step": 7900 }, { "epoch": 2.9099560761346996, "grad_norm": 4.750382900238037, "learning_rate": 8.361639824304539e-06, "loss": 0.6986, "step": 7950 }, { "epoch": 2.9282576866764276, "grad_norm": 3.539860963821411, "learning_rate": 8.288433382137629e-06, "loss": 0.6838, "step": 8000 }, { "epoch": 2.946559297218155, "grad_norm": 2.6545543670654297, "learning_rate": 8.215226939970719e-06, "loss": 0.6942, "step": 8050 }, { "epoch": 2.964860907759883, "grad_norm": 2.351973056793213, "learning_rate": 8.142020497803808e-06, "loss": 0.6991, "step": 8100 }, { "epoch": 2.9831625183016106, "grad_norm": 1.9938989877700806, "learning_rate": 8.068814055636896e-06, "loss": 0.6928, "step": 8150 }, { "epoch": 3.0, "eval_accuracy": 0.5655197657393851, "eval_loss": 0.6848952770233154, "eval_runtime": 9.8156, "eval_samples_per_second": 278.332, "eval_steps_per_second": 17.421, "step": 8196 }, { "epoch": 3.001464128843338, "grad_norm": 3.9596476554870605, "learning_rate": 7.995607613469986e-06, "loss": 0.6903, "step": 8200 }, { "epoch": 3.019765739385066, "grad_norm": 4.190953254699707, "learning_rate": 7.922401171303076e-06, "loss": 0.6778, "step": 8250 }, { "epoch": 3.0380673499267936, "grad_norm": 3.9022440910339355, "learning_rate": 7.849194729136164e-06, "loss": 0.7014, "step": 8300 }, { "epoch": 3.056368960468521, "grad_norm": 3.035680055618286, "learning_rate": 7.775988286969254e-06, "loss": 0.6879, "step": 8350 }, { "epoch": 3.074670571010249, "grad_norm": 6.70428466796875, "learning_rate": 7.702781844802343e-06, "loss": 0.6871, "step": 8400 }, { "epoch": 3.0929721815519766, "grad_norm": 2.6576058864593506, "learning_rate": 7.629575402635433e-06, "loss": 0.6956, "step": 8450 }, { "epoch": 3.111273792093704, "grad_norm": 3.4227051734924316, "learning_rate": 7.556368960468522e-06, "loss": 0.6949, "step": 8500 }, { "epoch": 3.129575402635432, "grad_norm": 6.5497145652771, "learning_rate": 7.483162518301611e-06, "loss": 0.6882, "step": 8550 }, { "epoch": 3.1478770131771596, "grad_norm": 1.9692039489746094, "learning_rate": 7.4099560761347e-06, "loss": 0.6857, "step": 8600 }, { "epoch": 3.166178623718887, "grad_norm": 1.5063157081604004, "learning_rate": 7.3367496339677906e-06, "loss": 0.703, "step": 8650 }, { "epoch": 3.184480234260615, "grad_norm": 2.114001750946045, "learning_rate": 7.2635431918008795e-06, "loss": 0.6931, "step": 8700 }, { "epoch": 3.2027818448023426, "grad_norm": 4.059252738952637, "learning_rate": 7.190336749633968e-06, "loss": 0.7012, "step": 8750 }, { "epoch": 3.22108345534407, "grad_norm": 1.4372018575668335, "learning_rate": 7.117130307467057e-06, "loss": 0.6911, "step": 8800 }, { "epoch": 3.239385065885798, "grad_norm": 1.4479070901870728, "learning_rate": 7.043923865300147e-06, "loss": 0.6925, "step": 8850 }, { "epoch": 3.2576866764275256, "grad_norm": 1.4054046869277954, "learning_rate": 6.970717423133237e-06, "loss": 0.6948, "step": 8900 }, { "epoch": 3.275988286969253, "grad_norm": 2.7354447841644287, "learning_rate": 6.897510980966326e-06, "loss": 0.6931, "step": 8950 }, { "epoch": 3.294289897510981, "grad_norm": 6.485820293426514, "learning_rate": 6.824304538799415e-06, "loss": 0.6924, "step": 9000 }, { "epoch": 3.3125915080527086, "grad_norm": 2.2250216007232666, "learning_rate": 6.751098096632504e-06, "loss": 0.6955, "step": 9050 }, { "epoch": 3.330893118594436, "grad_norm": 3.7703425884246826, "learning_rate": 6.677891654465593e-06, "loss": 0.6978, "step": 9100 }, { "epoch": 3.349194729136164, "grad_norm": 1.3543486595153809, "learning_rate": 6.604685212298682e-06, "loss": 0.6851, "step": 9150 }, { "epoch": 3.3674963396778916, "grad_norm": 1.9245567321777344, "learning_rate": 6.531478770131772e-06, "loss": 0.6971, "step": 9200 }, { "epoch": 3.385797950219619, "grad_norm": 1.5131146907806396, "learning_rate": 6.458272327964862e-06, "loss": 0.6898, "step": 9250 }, { "epoch": 3.404099560761347, "grad_norm": 1.524381399154663, "learning_rate": 6.385065885797951e-06, "loss": 0.6881, "step": 9300 }, { "epoch": 3.4224011713030746, "grad_norm": 2.4183127880096436, "learning_rate": 6.3118594436310396e-06, "loss": 0.6957, "step": 9350 }, { "epoch": 3.440702781844802, "grad_norm": 6.259820461273193, "learning_rate": 6.2386530014641285e-06, "loss": 0.6945, "step": 9400 }, { "epoch": 3.45900439238653, "grad_norm": 4.742863655090332, "learning_rate": 6.165446559297219e-06, "loss": 0.6918, "step": 9450 }, { "epoch": 3.4773060029282576, "grad_norm": 3.144963026046753, "learning_rate": 6.092240117130308e-06, "loss": 0.6862, "step": 9500 }, { "epoch": 3.4956076134699856, "grad_norm": 2.9106500148773193, "learning_rate": 6.019033674963397e-06, "loss": 0.6889, "step": 9550 }, { "epoch": 3.513909224011713, "grad_norm": 6.509891510009766, "learning_rate": 5.945827232796486e-06, "loss": 0.6945, "step": 9600 }, { "epoch": 3.5322108345534406, "grad_norm": 2.27295184135437, "learning_rate": 5.8726207906295764e-06, "loss": 0.6943, "step": 9650 }, { "epoch": 3.550512445095168, "grad_norm": 7.141515731811523, "learning_rate": 5.799414348462665e-06, "loss": 0.6876, "step": 9700 }, { "epoch": 3.568814055636896, "grad_norm": 2.371335744857788, "learning_rate": 5.726207906295754e-06, "loss": 0.6988, "step": 9750 }, { "epoch": 3.5871156661786237, "grad_norm": 1.0689224004745483, "learning_rate": 5.653001464128843e-06, "loss": 0.6956, "step": 9800 }, { "epoch": 3.6054172767203516, "grad_norm": 3.6092991828918457, "learning_rate": 5.579795021961934e-06, "loss": 0.6926, "step": 9850 }, { "epoch": 3.623718887262079, "grad_norm": 1.2732123136520386, "learning_rate": 5.506588579795023e-06, "loss": 0.6901, "step": 9900 }, { "epoch": 3.6420204978038067, "grad_norm": 1.149488925933838, "learning_rate": 5.433382137628112e-06, "loss": 0.6947, "step": 9950 }, { "epoch": 3.660322108345534, "grad_norm": 1.580837607383728, "learning_rate": 5.3601756954612005e-06, "loss": 0.6931, "step": 10000 }, { "epoch": 3.678623718887262, "grad_norm": 4.544098854064941, "learning_rate": 5.28696925329429e-06, "loss": 0.6885, "step": 10050 }, { "epoch": 3.6969253294289897, "grad_norm": 2.011023998260498, "learning_rate": 5.21376281112738e-06, "loss": 0.6962, "step": 10100 }, { "epoch": 3.7152269399707176, "grad_norm": 5.078649997711182, "learning_rate": 5.140556368960469e-06, "loss": 0.6936, "step": 10150 }, { "epoch": 3.733528550512445, "grad_norm": 2.257343053817749, "learning_rate": 5.067349926793558e-06, "loss": 0.6945, "step": 10200 }, { "epoch": 3.7518301610541727, "grad_norm": 1.252966046333313, "learning_rate": 4.994143484626648e-06, "loss": 0.6892, "step": 10250 }, { "epoch": 3.7701317715959, "grad_norm": 2.1973466873168945, "learning_rate": 4.9209370424597365e-06, "loss": 0.6868, "step": 10300 }, { "epoch": 3.788433382137628, "grad_norm": 1.9186662435531616, "learning_rate": 4.847730600292826e-06, "loss": 0.6854, "step": 10350 }, { "epoch": 3.8067349926793557, "grad_norm": 3.1210763454437256, "learning_rate": 4.774524158125915e-06, "loss": 0.6977, "step": 10400 }, { "epoch": 3.8250366032210836, "grad_norm": 2.7756359577178955, "learning_rate": 4.701317715959005e-06, "loss": 0.696, "step": 10450 }, { "epoch": 3.843338213762811, "grad_norm": 1.0838842391967773, "learning_rate": 4.628111273792094e-06, "loss": 0.6921, "step": 10500 }, { "epoch": 3.8616398243045387, "grad_norm": 3.4335124492645264, "learning_rate": 4.554904831625183e-06, "loss": 0.6956, "step": 10550 }, { "epoch": 3.8799414348462666, "grad_norm": 1.28254234790802, "learning_rate": 4.4816983894582726e-06, "loss": 0.691, "step": 10600 }, { "epoch": 3.898243045387994, "grad_norm": 2.6828558444976807, "learning_rate": 4.4084919472913615e-06, "loss": 0.6926, "step": 10650 }, { "epoch": 3.9165446559297217, "grad_norm": 1.131003499031067, "learning_rate": 4.335285505124451e-06, "loss": 0.6894, "step": 10700 }, { "epoch": 3.9348462664714496, "grad_norm": 3.3838775157928467, "learning_rate": 4.26207906295754e-06, "loss": 0.6936, "step": 10750 }, { "epoch": 3.953147877013177, "grad_norm": 1.3978344202041626, "learning_rate": 4.18887262079063e-06, "loss": 0.6873, "step": 10800 }, { "epoch": 3.9714494875549047, "grad_norm": 4.850183010101318, "learning_rate": 4.115666178623719e-06, "loss": 0.6855, "step": 10850 }, { "epoch": 3.9897510980966326, "grad_norm": 3.063927412033081, "learning_rate": 4.0424597364568086e-06, "loss": 0.6942, "step": 10900 }, { "epoch": 4.0, "eval_accuracy": 0.5655197657393851, "eval_loss": 0.6846572756767273, "eval_runtime": 9.7626, "eval_samples_per_second": 279.843, "eval_steps_per_second": 17.516, "step": 10928 }, { "epoch": 4.008052708638361, "grad_norm": 2.9117279052734375, "learning_rate": 3.9692532942898975e-06, "loss": 0.6924, "step": 10950 }, { "epoch": 4.026354319180088, "grad_norm": 1.5743687152862549, "learning_rate": 3.896046852122987e-06, "loss": 0.6929, "step": 11000 }, { "epoch": 4.044655929721816, "grad_norm": 1.4719305038452148, "learning_rate": 3.822840409956076e-06, "loss": 0.6887, "step": 11050 }, { "epoch": 4.062957540263543, "grad_norm": 1.4639744758605957, "learning_rate": 3.749633967789166e-06, "loss": 0.6786, "step": 11100 }, { "epoch": 4.081259150805271, "grad_norm": 6.5527729988098145, "learning_rate": 3.676427525622255e-06, "loss": 0.6868, "step": 11150 }, { "epoch": 4.099560761346998, "grad_norm": 1.1454309225082397, "learning_rate": 3.6032210834553446e-06, "loss": 0.6991, "step": 11200 }, { "epoch": 4.117862371888727, "grad_norm": 2.6722428798675537, "learning_rate": 3.5300146412884335e-06, "loss": 0.6871, "step": 11250 }, { "epoch": 4.136163982430454, "grad_norm": 1.562593936920166, "learning_rate": 3.456808199121523e-06, "loss": 0.6911, "step": 11300 }, { "epoch": 4.154465592972182, "grad_norm": 1.5078984498977661, "learning_rate": 3.383601756954612e-06, "loss": 0.699, "step": 11350 }, { "epoch": 4.172767203513909, "grad_norm": 4.862717151641846, "learning_rate": 3.3103953147877015e-06, "loss": 0.6848, "step": 11400 }, { "epoch": 4.191068814055637, "grad_norm": 1.2064334154129028, "learning_rate": 3.2371888726207904e-06, "loss": 0.6784, "step": 11450 }, { "epoch": 4.209370424597364, "grad_norm": 1.4410967826843262, "learning_rate": 3.16398243045388e-06, "loss": 0.6867, "step": 11500 }, { "epoch": 4.227672035139093, "grad_norm": 1.5525456666946411, "learning_rate": 3.090775988286969e-06, "loss": 0.6878, "step": 11550 }, { "epoch": 4.24597364568082, "grad_norm": 2.721654176712036, "learning_rate": 3.017569546120059e-06, "loss": 0.6911, "step": 11600 }, { "epoch": 4.264275256222548, "grad_norm": 2.7285444736480713, "learning_rate": 2.944363103953148e-06, "loss": 0.6843, "step": 11650 }, { "epoch": 4.282576866764275, "grad_norm": 1.2089800834655762, "learning_rate": 2.8711566617862375e-06, "loss": 0.6989, "step": 11700 }, { "epoch": 4.300878477306003, "grad_norm": 3.368563175201416, "learning_rate": 2.797950219619327e-06, "loss": 0.683, "step": 11750 }, { "epoch": 4.31918008784773, "grad_norm": 1.4972318410873413, "learning_rate": 2.724743777452416e-06, "loss": 0.6897, "step": 11800 }, { "epoch": 4.337481698389459, "grad_norm": 3.92740797996521, "learning_rate": 2.6515373352855055e-06, "loss": 0.6876, "step": 11850 }, { "epoch": 4.355783308931186, "grad_norm": 2.367027521133423, "learning_rate": 2.5783308931185944e-06, "loss": 0.6884, "step": 11900 }, { "epoch": 4.374084919472914, "grad_norm": 3.318098306655884, "learning_rate": 2.505124450951684e-06, "loss": 0.6867, "step": 11950 }, { "epoch": 4.392386530014641, "grad_norm": 2.5479702949523926, "learning_rate": 2.431918008784773e-06, "loss": 0.6787, "step": 12000 }, { "epoch": 4.410688140556369, "grad_norm": 2.361260175704956, "learning_rate": 2.3587115666178625e-06, "loss": 0.6904, "step": 12050 }, { "epoch": 4.428989751098096, "grad_norm": 2.41310715675354, "learning_rate": 2.285505124450952e-06, "loss": 0.6814, "step": 12100 }, { "epoch": 4.447291361639825, "grad_norm": 2.390275716781616, "learning_rate": 2.212298682284041e-06, "loss": 0.667, "step": 12150 }, { "epoch": 4.465592972181552, "grad_norm": 1.9900853633880615, "learning_rate": 2.1390922401171305e-06, "loss": 0.6873, "step": 12200 }, { "epoch": 4.48389458272328, "grad_norm": 5.233725547790527, "learning_rate": 2.06588579795022e-06, "loss": 0.6844, "step": 12250 }, { "epoch": 4.502196193265007, "grad_norm": 2.4165773391723633, "learning_rate": 1.992679355783309e-06, "loss": 0.6803, "step": 12300 }, { "epoch": 4.520497803806735, "grad_norm": 1.9948047399520874, "learning_rate": 1.9194729136163985e-06, "loss": 0.6885, "step": 12350 }, { "epoch": 4.538799414348462, "grad_norm": 1.5085363388061523, "learning_rate": 1.8462664714494876e-06, "loss": 0.6883, "step": 12400 }, { "epoch": 4.557101024890191, "grad_norm": 5.91818380355835, "learning_rate": 1.773060029282577e-06, "loss": 0.6696, "step": 12450 }, { "epoch": 4.575402635431918, "grad_norm": 3.4706273078918457, "learning_rate": 1.6998535871156663e-06, "loss": 0.6798, "step": 12500 }, { "epoch": 4.593704245973646, "grad_norm": 3.0562493801116943, "learning_rate": 1.6266471449487556e-06, "loss": 0.6763, "step": 12550 }, { "epoch": 4.612005856515373, "grad_norm": 5.213473796844482, "learning_rate": 1.553440702781845e-06, "loss": 0.6793, "step": 12600 }, { "epoch": 4.630307467057101, "grad_norm": 2.782263994216919, "learning_rate": 1.4802342606149343e-06, "loss": 0.6748, "step": 12650 }, { "epoch": 4.648609077598829, "grad_norm": 12.179971694946289, "learning_rate": 1.4070278184480234e-06, "loss": 0.6653, "step": 12700 }, { "epoch": 4.666910688140557, "grad_norm": 5.823268413543701, "learning_rate": 1.3338213762811127e-06, "loss": 0.6916, "step": 12750 }, { "epoch": 4.685212298682284, "grad_norm": 4.365823745727539, "learning_rate": 1.260614934114202e-06, "loss": 0.6892, "step": 12800 }, { "epoch": 4.703513909224012, "grad_norm": 6.108221530914307, "learning_rate": 1.1874084919472914e-06, "loss": 0.6896, "step": 12850 }, { "epoch": 4.721815519765739, "grad_norm": 6.130873203277588, "learning_rate": 1.1142020497803808e-06, "loss": 0.6818, "step": 12900 }, { "epoch": 4.740117130307467, "grad_norm": 4.07578706741333, "learning_rate": 1.04099560761347e-06, "loss": 0.6795, "step": 12950 }, { "epoch": 4.758418740849194, "grad_norm": 3.392340898513794, "learning_rate": 9.677891654465594e-07, "loss": 0.6655, "step": 13000 }, { "epoch": 4.776720351390923, "grad_norm": 6.404242992401123, "learning_rate": 8.945827232796487e-07, "loss": 0.6769, "step": 13050 }, { "epoch": 4.79502196193265, "grad_norm": 3.1065897941589355, "learning_rate": 8.213762811127379e-07, "loss": 0.683, "step": 13100 }, { "epoch": 4.813323572474378, "grad_norm": 6.093231201171875, "learning_rate": 7.481698389458272e-07, "loss": 0.6798, "step": 13150 }, { "epoch": 4.831625183016105, "grad_norm": 7.247190952301025, "learning_rate": 6.749633967789166e-07, "loss": 0.6737, "step": 13200 }, { "epoch": 4.849926793557833, "grad_norm": 5.564077377319336, "learning_rate": 6.017569546120059e-07, "loss": 0.6805, "step": 13250 }, { "epoch": 4.868228404099561, "grad_norm": 3.6535987854003906, "learning_rate": 5.285505124450952e-07, "loss": 0.6612, "step": 13300 }, { "epoch": 4.886530014641289, "grad_norm": 3.1486198902130127, "learning_rate": 4.553440702781845e-07, "loss": 0.6833, "step": 13350 }, { "epoch": 4.904831625183016, "grad_norm": 2.8693318367004395, "learning_rate": 3.821376281112738e-07, "loss": 0.6775, "step": 13400 }, { "epoch": 4.923133235724744, "grad_norm": 3.7927544116973877, "learning_rate": 3.0893118594436313e-07, "loss": 0.673, "step": 13450 }, { "epoch": 4.941434846266471, "grad_norm": 3.532392740249634, "learning_rate": 2.3572474377745242e-07, "loss": 0.6713, "step": 13500 }, { "epoch": 4.959736456808199, "grad_norm": 3.34582257270813, "learning_rate": 1.6251830161054173e-07, "loss": 0.6542, "step": 13550 }, { "epoch": 4.978038067349927, "grad_norm": 4.675662517547607, "learning_rate": 8.931185944363105e-08, "loss": 0.6832, "step": 13600 }, { "epoch": 4.996339677891655, "grad_norm": 3.3549957275390625, "learning_rate": 1.610541727672035e-08, "loss": 0.6633, "step": 13650 }, { "epoch": 5.0, "eval_accuracy": 0.5841874084919473, "eval_loss": 0.6651186347007751, "eval_runtime": 9.7608, "eval_samples_per_second": 279.896, "eval_steps_per_second": 17.519, "step": 13660 } ], "logging_steps": 50, "max_steps": 13660, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7187536254796800.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }