{
  "best_global_step": 13660,
  "best_metric": 0.5841874084919473,
  "best_model_checkpoint": "./saved_models/starencoder/checkpoint-13660",
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 13660,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.018301610541727673,
      "grad_norm": 6.326382637023926,
      "learning_rate": 1.992825768667643e-05,
      "loss": 0.7093,
      "step": 50
    },
    {
      "epoch": 0.036603221083455345,
      "grad_norm": 1.9646369218826294,
      "learning_rate": 1.9855051244509516e-05,
      "loss": 0.6877,
      "step": 100
    },
    {
      "epoch": 0.05490483162518302,
      "grad_norm": 3.757148027420044,
      "learning_rate": 1.9781844802342606e-05,
      "loss": 0.7064,
      "step": 150
    },
    {
      "epoch": 0.07320644216691069,
      "grad_norm": 2.2845559120178223,
      "learning_rate": 1.97086383601757e-05,
      "loss": 0.6941,
      "step": 200
    },
    {
      "epoch": 0.09150805270863836,
      "grad_norm": 3.454932689666748,
      "learning_rate": 1.9635431918008785e-05,
      "loss": 0.697,
      "step": 250
    },
    {
      "epoch": 0.10980966325036604,
      "grad_norm": 2.188014268875122,
      "learning_rate": 1.9562225475841875e-05,
      "loss": 0.6939,
      "step": 300
    },
    {
      "epoch": 0.1281112737920937,
      "grad_norm": 3.7213497161865234,
      "learning_rate": 1.9489019033674965e-05,
      "loss": 0.7021,
      "step": 350
    },
    {
      "epoch": 0.14641288433382138,
      "grad_norm": 4.130824565887451,
      "learning_rate": 1.9415812591508055e-05,
      "loss": 0.6829,
      "step": 400
    },
    {
      "epoch": 0.16471449487554904,
      "grad_norm": 2.1340243816375732,
      "learning_rate": 1.9342606149341144e-05,
      "loss": 0.7163,
      "step": 450
    },
    {
      "epoch": 0.18301610541727673,
      "grad_norm": 2.3844826221466064,
      "learning_rate": 1.926939970717423e-05,
      "loss": 0.6817,
      "step": 500
    },
    {
      "epoch": 0.20131771595900438,
      "grad_norm": 5.518948078155518,
      "learning_rate": 1.919619326500732e-05,
      "loss": 0.6755,
      "step": 550
    },
    {
      "epoch": 0.21961932650073207,
      "grad_norm": 7.737420082092285,
      "learning_rate": 1.9122986822840414e-05,
      "loss": 0.6717,
      "step": 600
    },
    {
      "epoch": 0.23792093704245973,
      "grad_norm": 4.545921802520752,
      "learning_rate": 1.90497803806735e-05,
      "loss": 0.6749,
      "step": 650
    },
    {
      "epoch": 0.2562225475841874,
      "grad_norm": 2.528022289276123,
      "learning_rate": 1.897657393850659e-05,
      "loss": 0.6701,
      "step": 700
    },
    {
      "epoch": 0.2745241581259151,
      "grad_norm": 2.498490571975708,
      "learning_rate": 1.890336749633968e-05,
      "loss": 0.6808,
      "step": 750
    },
    {
      "epoch": 0.29282576866764276,
      "grad_norm": 2.387111186981201,
      "learning_rate": 1.883016105417277e-05,
      "loss": 0.6925,
      "step": 800
    },
    {
      "epoch": 0.31112737920937045,
      "grad_norm": 4.5703253746032715,
      "learning_rate": 1.875695461200586e-05,
      "loss": 0.71,
      "step": 850
    },
    {
      "epoch": 0.3294289897510981,
      "grad_norm": 2.2322046756744385,
      "learning_rate": 1.8683748169838946e-05,
      "loss": 0.6978,
      "step": 900
    },
    {
      "epoch": 0.34773060029282576,
      "grad_norm": 3.186796188354492,
      "learning_rate": 1.8610541727672035e-05,
      "loss": 0.6999,
      "step": 950
    },
    {
      "epoch": 0.36603221083455345,
      "grad_norm": 2.268221855163574,
      "learning_rate": 1.853733528550513e-05,
      "loss": 0.7027,
      "step": 1000
    },
    {
      "epoch": 0.38433382137628114,
      "grad_norm": 3.345116138458252,
      "learning_rate": 1.8464128843338215e-05,
      "loss": 0.687,
      "step": 1050
    },
    {
      "epoch": 0.40263543191800877,
      "grad_norm": 3.4372918605804443,
      "learning_rate": 1.8390922401171305e-05,
      "loss": 0.6892,
      "step": 1100
    },
    {
      "epoch": 0.42093704245973645,
      "grad_norm": 4.888953685760498,
      "learning_rate": 1.8317715959004394e-05,
      "loss": 0.688,
      "step": 1150
    },
    {
      "epoch": 0.43923865300146414,
      "grad_norm": 3.7842814922332764,
      "learning_rate": 1.8244509516837484e-05,
      "loss": 0.6893,
      "step": 1200
    },
    {
      "epoch": 0.4575402635431918,
      "grad_norm": 2.004110336303711,
      "learning_rate": 1.817130307467057e-05,
      "loss": 0.6965,
      "step": 1250
    },
    {
      "epoch": 0.47584187408491946,
      "grad_norm": 3.093764543533325,
      "learning_rate": 1.809809663250366e-05,
      "loss": 0.6946,
      "step": 1300
    },
    {
      "epoch": 0.49414348462664714,
      "grad_norm": 4.060361385345459,
      "learning_rate": 1.802489019033675e-05,
      "loss": 0.6926,
      "step": 1350
    },
    {
      "epoch": 0.5124450951683748,
      "grad_norm": 6.572564601898193,
      "learning_rate": 1.795168374816984e-05,
      "loss": 0.6926,
      "step": 1400
    },
    {
      "epoch": 0.5307467057101025,
      "grad_norm": 5.1151251792907715,
      "learning_rate": 1.787847730600293e-05,
      "loss": 0.6985,
      "step": 1450
    },
    {
      "epoch": 0.5490483162518301,
      "grad_norm": 7.343419075012207,
      "learning_rate": 1.780527086383602e-05,
      "loss": 0.6954,
      "step": 1500
    },
    {
      "epoch": 0.5673499267935578,
      "grad_norm": 4.611226558685303,
      "learning_rate": 1.773206442166911e-05,
      "loss": 0.683,
      "step": 1550
    },
    {
      "epoch": 0.5856515373352855,
      "grad_norm": 1.5302119255065918,
      "learning_rate": 1.76588579795022e-05,
      "loss": 0.6915,
      "step": 1600
    },
    {
      "epoch": 0.6039531478770132,
      "grad_norm": 1.781995415687561,
      "learning_rate": 1.7585651537335285e-05,
      "loss": 0.7022,
      "step": 1650
    },
    {
      "epoch": 0.6222547584187409,
      "grad_norm": 1.6124660968780518,
      "learning_rate": 1.7512445095168375e-05,
      "loss": 0.6913,
      "step": 1700
    },
    {
      "epoch": 0.6405563689604685,
      "grad_norm": 4.99559211730957,
      "learning_rate": 1.7439238653001465e-05,
      "loss": 0.6996,
      "step": 1750
    },
    {
      "epoch": 0.6588579795021962,
      "grad_norm": 9.72033405303955,
      "learning_rate": 1.7366032210834554e-05,
      "loss": 0.7121,
      "step": 1800
    },
    {
      "epoch": 0.6771595900439239,
      "grad_norm": 2.2888541221618652,
      "learning_rate": 1.7292825768667644e-05,
      "loss": 0.6852,
      "step": 1850
    },
    {
      "epoch": 0.6954612005856515,
      "grad_norm": 7.631497383117676,
      "learning_rate": 1.7219619326500734e-05,
      "loss": 0.6971,
      "step": 1900
    },
    {
      "epoch": 0.7137628111273792,
      "grad_norm": 1.9625825881958008,
      "learning_rate": 1.7146412884333824e-05,
      "loss": 0.6881,
      "step": 1950
    },
    {
      "epoch": 0.7320644216691069,
      "grad_norm": 10.556265830993652,
      "learning_rate": 1.7073206442166913e-05,
      "loss": 0.6978,
      "step": 2000
    },
    {
      "epoch": 0.7503660322108345,
      "grad_norm": 5.823647499084473,
      "learning_rate": 1.7e-05,
      "loss": 0.6974,
      "step": 2050
    },
    {
      "epoch": 0.7686676427525623,
      "grad_norm": 5.508345603942871,
      "learning_rate": 1.692679355783309e-05,
      "loss": 0.7045,
      "step": 2100
    },
    {
      "epoch": 0.7869692532942899,
      "grad_norm": 7.553188800811768,
      "learning_rate": 1.685358711566618e-05,
      "loss": 0.7011,
      "step": 2150
    },
    {
      "epoch": 0.8052708638360175,
      "grad_norm": 3.6769418716430664,
      "learning_rate": 1.678038067349927e-05,
      "loss": 0.6934,
      "step": 2200
    },
    {
      "epoch": 0.8235724743777453,
      "grad_norm": 2.2349374294281006,
      "learning_rate": 1.670717423133236e-05,
      "loss": 0.6951,
      "step": 2250
    },
    {
      "epoch": 0.8418740849194729,
      "grad_norm": 1.6531909704208374,
      "learning_rate": 1.663396778916545e-05,
      "loss": 0.7001,
      "step": 2300
    },
    {
      "epoch": 0.8601756954612005,
      "grad_norm": 4.906118869781494,
      "learning_rate": 1.656076134699854e-05,
      "loss": 0.6966,
      "step": 2350
    },
    {
      "epoch": 0.8784773060029283,
      "grad_norm": 7.1544013023376465,
      "learning_rate": 1.6487554904831625e-05,
      "loss": 0.6882,
      "step": 2400
    },
    {
      "epoch": 0.8967789165446559,
      "grad_norm": 1.9310983419418335,
      "learning_rate": 1.6414348462664715e-05,
      "loss": 0.7183,
      "step": 2450
    },
    {
      "epoch": 0.9150805270863837,
      "grad_norm": 5.646407127380371,
      "learning_rate": 1.6341142020497804e-05,
      "loss": 0.6884,
      "step": 2500
    },
    {
      "epoch": 0.9333821376281113,
      "grad_norm": 1.564105749130249,
      "learning_rate": 1.6267935578330894e-05,
      "loss": 0.6979,
      "step": 2550
    },
    {
      "epoch": 0.9516837481698389,
      "grad_norm": 2.552931308746338,
      "learning_rate": 1.6194729136163984e-05,
      "loss": 0.6921,
      "step": 2600
    },
    {
      "epoch": 0.9699853587115667,
      "grad_norm": 4.336668968200684,
      "learning_rate": 1.6121522693997074e-05,
      "loss": 0.6956,
      "step": 2650
    },
    {
      "epoch": 0.9882869692532943,
      "grad_norm": 1.3156540393829346,
      "learning_rate": 1.6048316251830163e-05,
      "loss": 0.6962,
      "step": 2700
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.5655197657393851,
      "eval_loss": 0.6866211295127869,
      "eval_runtime": 30.1911,
      "eval_samples_per_second": 90.49,
      "eval_steps_per_second": 5.664,
      "step": 2732
    },
    {
      "epoch": 1.006588579795022,
      "grad_norm": 4.257384777069092,
      "learning_rate": 1.5975109809663253e-05,
      "loss": 0.6824,
      "step": 2750
    },
    {
      "epoch": 1.0248901903367496,
      "grad_norm": 4.433679580688477,
      "learning_rate": 1.590190336749634e-05,
      "loss": 0.6886,
      "step": 2800
    },
    {
      "epoch": 1.0431918008784773,
      "grad_norm": 2.416440010070801,
      "learning_rate": 1.582869692532943e-05,
      "loss": 0.7093,
      "step": 2850
    },
    {
      "epoch": 1.061493411420205,
      "grad_norm": 3.4436511993408203,
      "learning_rate": 1.575549048316252e-05,
      "loss": 0.6854,
      "step": 2900
    },
    {
      "epoch": 1.0797950219619326,
      "grad_norm": 3.391012668609619,
      "learning_rate": 1.568228404099561e-05,
      "loss": 0.6916,
      "step": 2950
    },
    {
      "epoch": 1.0980966325036603,
      "grad_norm": 2.20050311088562,
      "learning_rate": 1.56090775988287e-05,
      "loss": 0.6928,
      "step": 3000
    },
    {
      "epoch": 1.116398243045388,
      "grad_norm": 3.6996657848358154,
      "learning_rate": 1.5535871156661788e-05,
      "loss": 0.6978,
      "step": 3050
    },
    {
      "epoch": 1.1346998535871156,
      "grad_norm": 5.966168403625488,
      "learning_rate": 1.5462664714494878e-05,
      "loss": 0.6835,
      "step": 3100
    },
    {
      "epoch": 1.1530014641288433,
      "grad_norm": 7.096467971801758,
      "learning_rate": 1.5389458272327968e-05,
      "loss": 0.7008,
      "step": 3150
    },
    {
      "epoch": 1.171303074670571,
      "grad_norm": 2.978032350540161,
      "learning_rate": 1.5316251830161054e-05,
      "loss": 0.6923,
      "step": 3200
    },
    {
      "epoch": 1.1896046852122986,
      "grad_norm": 2.5770692825317383,
      "learning_rate": 1.5243045387994144e-05,
      "loss": 0.6988,
      "step": 3250
    },
    {
      "epoch": 1.2079062957540263,
      "grad_norm": 1.8154376745224,
      "learning_rate": 1.5169838945827234e-05,
      "loss": 0.693,
      "step": 3300
    },
    {
      "epoch": 1.226207906295754,
      "grad_norm": 5.659831523895264,
      "learning_rate": 1.5096632503660322e-05,
      "loss": 0.6951,
      "step": 3350
    },
    {
      "epoch": 1.2445095168374818,
      "grad_norm": 3.904559373855591,
      "learning_rate": 1.5023426061493413e-05,
      "loss": 0.6901,
      "step": 3400
    },
    {
      "epoch": 1.2628111273792093,
      "grad_norm": 3.0381088256835938,
      "learning_rate": 1.4950219619326503e-05,
      "loss": 0.6871,
      "step": 3450
    },
    {
      "epoch": 1.281112737920937,
      "grad_norm": 3.667318344116211,
      "learning_rate": 1.4877013177159591e-05,
      "loss": 0.6912,
      "step": 3500
    },
    {
      "epoch": 1.2994143484626648,
      "grad_norm": 6.238102912902832,
      "learning_rate": 1.480380673499268e-05,
      "loss": 0.6983,
      "step": 3550
    },
    {
      "epoch": 1.3177159590043923,
      "grad_norm": 4.712520599365234,
      "learning_rate": 1.4730600292825769e-05,
      "loss": 0.6898,
      "step": 3600
    },
    {
      "epoch": 1.33601756954612,
      "grad_norm": 4.511780261993408,
      "learning_rate": 1.4657393850658859e-05,
      "loss": 0.6972,
      "step": 3650
    },
    {
      "epoch": 1.3543191800878478,
      "grad_norm": 2.4388058185577393,
      "learning_rate": 1.4584187408491948e-05,
      "loss": 0.6962,
      "step": 3700
    },
    {
      "epoch": 1.3726207906295755,
      "grad_norm": 2.305985927581787,
      "learning_rate": 1.4510980966325036e-05,
      "loss": 0.696,
      "step": 3750
    },
    {
      "epoch": 1.390922401171303,
      "grad_norm": 5.038622856140137,
      "learning_rate": 1.4437774524158128e-05,
      "loss": 0.6945,
      "step": 3800
    },
    {
      "epoch": 1.4092240117130308,
      "grad_norm": 1.7405680418014526,
      "learning_rate": 1.4364568081991218e-05,
      "loss": 0.6975,
      "step": 3850
    },
    {
      "epoch": 1.4275256222547585,
      "grad_norm": 2.3643574714660645,
      "learning_rate": 1.4291361639824306e-05,
      "loss": 0.6961,
      "step": 3900
    },
    {
      "epoch": 1.445827232796486,
      "grad_norm": 5.455554008483887,
      "learning_rate": 1.4218155197657395e-05,
      "loss": 0.6974,
      "step": 3950
    },
    {
      "epoch": 1.4641288433382138,
      "grad_norm": 1.3421140909194946,
      "learning_rate": 1.4144948755490484e-05,
      "loss": 0.6929,
      "step": 4000
    },
    {
      "epoch": 1.4824304538799415,
      "grad_norm": 2.995839834213257,
      "learning_rate": 1.4071742313323573e-05,
      "loss": 0.6974,
      "step": 4050
    },
    {
      "epoch": 1.500732064421669,
      "grad_norm": 4.329990386962891,
      "learning_rate": 1.3998535871156661e-05,
      "loss": 0.6968,
      "step": 4100
    },
    {
      "epoch": 1.5190336749633968,
      "grad_norm": 2.4263505935668945,
      "learning_rate": 1.3925329428989751e-05,
      "loss": 0.694,
      "step": 4150
    },
    {
      "epoch": 1.5373352855051245,
      "grad_norm": 5.850309371948242,
      "learning_rate": 1.3852122986822843e-05,
      "loss": 0.6975,
      "step": 4200
    },
    {
      "epoch": 1.555636896046852,
      "grad_norm": 1.353864073753357,
      "learning_rate": 1.377891654465593e-05,
      "loss": 0.7005,
      "step": 4250
    },
    {
      "epoch": 1.5739385065885798,
      "grad_norm": 2.510244369506836,
      "learning_rate": 1.370571010248902e-05,
      "loss": 0.6906,
      "step": 4300
    },
    {
      "epoch": 1.5922401171303076,
      "grad_norm": 3.187347888946533,
      "learning_rate": 1.363250366032211e-05,
      "loss": 0.6902,
      "step": 4350
    },
    {
      "epoch": 1.610541727672035,
      "grad_norm": 2.019270420074463,
      "learning_rate": 1.3559297218155198e-05,
      "loss": 0.6959,
      "step": 4400
    },
    {
      "epoch": 1.6288433382137628,
      "grad_norm": 3.3284361362457275,
      "learning_rate": 1.3486090775988288e-05,
      "loss": 0.6901,
      "step": 4450
    },
    {
      "epoch": 1.6471449487554906,
      "grad_norm": 2.3892571926116943,
      "learning_rate": 1.3412884333821376e-05,
      "loss": 0.6831,
      "step": 4500
    },
    {
      "epoch": 1.665446559297218,
      "grad_norm": 2.138828992843628,
      "learning_rate": 1.3339677891654466e-05,
      "loss": 0.6961,
      "step": 4550
    },
    {
      "epoch": 1.6837481698389458,
      "grad_norm": 2.1902518272399902,
      "learning_rate": 1.3266471449487557e-05,
      "loss": 0.7011,
      "step": 4600
    },
    {
      "epoch": 1.7020497803806736,
      "grad_norm": 1.9349195957183838,
      "learning_rate": 1.3193265007320645e-05,
      "loss": 0.6964,
      "step": 4650
    },
    {
      "epoch": 1.720351390922401,
      "grad_norm": 2.414310932159424,
      "learning_rate": 1.3120058565153735e-05,
      "loss": 0.6993,
      "step": 4700
    },
    {
      "epoch": 1.7386530014641288,
      "grad_norm": 2.0503060817718506,
      "learning_rate": 1.3046852122986823e-05,
      "loss": 0.6945,
      "step": 4750
    },
    {
      "epoch": 1.7569546120058566,
      "grad_norm": 3.2496321201324463,
      "learning_rate": 1.2973645680819913e-05,
      "loss": 0.6984,
      "step": 4800
    },
    {
      "epoch": 1.775256222547584,
      "grad_norm": 4.175572395324707,
      "learning_rate": 1.2900439238653003e-05,
      "loss": 0.6945,
      "step": 4850
    },
    {
      "epoch": 1.7935578330893118,
      "grad_norm": 1.1053935289382935,
      "learning_rate": 1.282723279648609e-05,
      "loss": 0.6947,
      "step": 4900
    },
    {
      "epoch": 1.8118594436310396,
      "grad_norm": 4.932358741760254,
      "learning_rate": 1.2754026354319182e-05,
      "loss": 0.6893,
      "step": 4950
    },
    {
      "epoch": 1.830161054172767,
      "grad_norm": 1.429621934890747,
      "learning_rate": 1.2680819912152272e-05,
      "loss": 0.6711,
      "step": 5000
    },
    {
      "epoch": 1.8484626647144948,
      "grad_norm": 3.933276891708374,
      "learning_rate": 1.260761346998536e-05,
      "loss": 0.6942,
      "step": 5050
    },
    {
      "epoch": 1.8667642752562226,
      "grad_norm": 1.696254849433899,
      "learning_rate": 1.253440702781845e-05,
      "loss": 0.6922,
      "step": 5100
    },
    {
      "epoch": 1.88506588579795,
      "grad_norm": 2.4555442333221436,
      "learning_rate": 1.2461200585651538e-05,
      "loss": 0.6923,
      "step": 5150
    },
    {
      "epoch": 1.903367496339678,
      "grad_norm": 3.334120035171509,
      "learning_rate": 1.2387994143484628e-05,
      "loss": 0.6872,
      "step": 5200
    },
    {
      "epoch": 1.9216691068814056,
      "grad_norm": 1.637233018875122,
      "learning_rate": 1.2314787701317716e-05,
      "loss": 0.6832,
      "step": 5250
    },
    {
      "epoch": 1.939970717423133,
      "grad_norm": 1.511348009109497,
      "learning_rate": 1.2241581259150805e-05,
      "loss": 0.6892,
      "step": 5300
    },
    {
      "epoch": 1.958272327964861,
      "grad_norm": 3.0342915058135986,
      "learning_rate": 1.2168374816983897e-05,
      "loss": 0.6877,
      "step": 5350
    },
    {
      "epoch": 1.9765739385065886,
      "grad_norm": 4.36331033706665,
      "learning_rate": 1.2095168374816985e-05,
      "loss": 0.7022,
      "step": 5400
    },
    {
      "epoch": 1.994875549048316,
      "grad_norm": 1.6790521144866943,
      "learning_rate": 1.2021961932650075e-05,
      "loss": 0.689,
      "step": 5450
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.43448023426061494,
      "eval_loss": 0.6969389319419861,
      "eval_runtime": 9.7517,
      "eval_samples_per_second": 280.155,
      "eval_steps_per_second": 17.535,
      "step": 5464
    },
    {
      "epoch": 2.013177159590044,
      "grad_norm": 2.8888535499572754,
      "learning_rate": 1.1948755490483164e-05,
      "loss": 0.6928,
      "step": 5500
    },
    {
      "epoch": 2.0314787701317716,
      "grad_norm": 1.8324801921844482,
      "learning_rate": 1.1875549048316253e-05,
      "loss": 0.6899,
      "step": 5550
    },
    {
      "epoch": 2.049780380673499,
      "grad_norm": 1.6425199508666992,
      "learning_rate": 1.1802342606149342e-05,
      "loss": 0.6923,
      "step": 5600
    },
    {
      "epoch": 2.068081991215227,
      "grad_norm": 2.4402670860290527,
      "learning_rate": 1.172913616398243e-05,
      "loss": 0.6882,
      "step": 5650
    },
    {
      "epoch": 2.0863836017569546,
      "grad_norm": 1.4472464323043823,
      "learning_rate": 1.165592972181552e-05,
      "loss": 0.6899,
      "step": 5700
    },
    {
      "epoch": 2.104685212298682,
      "grad_norm": 2.157292127609253,
      "learning_rate": 1.1582723279648612e-05,
      "loss": 0.7026,
      "step": 5750
    },
    {
      "epoch": 2.12298682284041,
      "grad_norm": 3.735875129699707,
      "learning_rate": 1.15095168374817e-05,
      "loss": 0.6818,
      "step": 5800
    },
    {
      "epoch": 2.1412884333821376,
      "grad_norm": 3.0418055057525635,
      "learning_rate": 1.143631039531479e-05,
      "loss": 0.6909,
      "step": 5850
    },
    {
      "epoch": 2.159590043923865,
      "grad_norm": 1.1220214366912842,
      "learning_rate": 1.1363103953147877e-05,
      "loss": 0.6977,
      "step": 5900
    },
    {
      "epoch": 2.177891654465593,
      "grad_norm": 1.926895022392273,
      "learning_rate": 1.1289897510980967e-05,
      "loss": 0.6937,
      "step": 5950
    },
    {
      "epoch": 2.1961932650073206,
      "grad_norm": 1.1366084814071655,
      "learning_rate": 1.1216691068814057e-05,
      "loss": 0.6981,
      "step": 6000
    },
    {
      "epoch": 2.214494875549048,
      "grad_norm": 2.684913396835327,
      "learning_rate": 1.1143484626647145e-05,
      "loss": 0.6923,
      "step": 6050
    },
    {
      "epoch": 2.232796486090776,
      "grad_norm": 1.1763278245925903,
      "learning_rate": 1.1070278184480235e-05,
      "loss": 0.6884,
      "step": 6100
    },
    {
      "epoch": 2.2510980966325036,
      "grad_norm": 0.9922837615013123,
      "learning_rate": 1.0997071742313326e-05,
      "loss": 0.6972,
      "step": 6150
    },
    {
      "epoch": 2.269399707174231,
      "grad_norm": 1.7432096004486084,
      "learning_rate": 1.0923865300146414e-05,
      "loss": 0.6956,
      "step": 6200
    },
    {
      "epoch": 2.287701317715959,
      "grad_norm": 5.431227207183838,
      "learning_rate": 1.0850658857979504e-05,
      "loss": 0.6949,
      "step": 6250
    },
    {
      "epoch": 2.3060029282576866,
      "grad_norm": 2.5441296100616455,
      "learning_rate": 1.0777452415812592e-05,
      "loss": 0.6922,
      "step": 6300
    },
    {
      "epoch": 2.3243045387994146,
      "grad_norm": 2.507460832595825,
      "learning_rate": 1.0704245973645682e-05,
      "loss": 0.6776,
      "step": 6350
    },
    {
      "epoch": 2.342606149341142,
      "grad_norm": 4.670897006988525,
      "learning_rate": 1.063103953147877e-05,
      "loss": 0.6888,
      "step": 6400
    },
    {
      "epoch": 2.3609077598828696,
      "grad_norm": 5.1314167976379395,
      "learning_rate": 1.055783308931186e-05,
      "loss": 0.6966,
      "step": 6450
    },
    {
      "epoch": 2.379209370424597,
      "grad_norm": 2.4066569805145264,
      "learning_rate": 1.048462664714495e-05,
      "loss": 0.6983,
      "step": 6500
    },
    {
      "epoch": 2.397510980966325,
      "grad_norm": 3.4087448120117188,
      "learning_rate": 1.041142020497804e-05,
      "loss": 0.6973,
      "step": 6550
    },
    {
      "epoch": 2.4158125915080526,
      "grad_norm": 1.7125242948532104,
      "learning_rate": 1.0338213762811129e-05,
      "loss": 0.6948,
      "step": 6600
    },
    {
      "epoch": 2.4341142020497806,
      "grad_norm": 3.0995583534240723,
      "learning_rate": 1.0265007320644219e-05,
      "loss": 0.6973,
      "step": 6650
    },
    {
      "epoch": 2.452415812591508,
      "grad_norm": 4.187213897705078,
      "learning_rate": 1.0191800878477307e-05,
      "loss": 0.6839,
      "step": 6700
    },
    {
      "epoch": 2.4707174231332356,
      "grad_norm": 2.849050521850586,
      "learning_rate": 1.0118594436310397e-05,
      "loss": 0.6927,
      "step": 6750
    },
    {
      "epoch": 2.4890190336749636,
      "grad_norm": 3.120654821395874,
      "learning_rate": 1.0045387994143485e-05,
      "loss": 0.6887,
      "step": 6800
    },
    {
      "epoch": 2.507320644216691,
      "grad_norm": 3.0376622676849365,
      "learning_rate": 9.972181551976574e-06,
      "loss": 0.6786,
      "step": 6850
    },
    {
      "epoch": 2.5256222547584186,
      "grad_norm": 2.4937405586242676,
      "learning_rate": 9.898975109809664e-06,
      "loss": 0.7005,
      "step": 6900
    },
    {
      "epoch": 2.5439238653001466,
      "grad_norm": 1.2871942520141602,
      "learning_rate": 9.825768667642754e-06,
      "loss": 0.6933,
      "step": 6950
    },
    {
      "epoch": 2.562225475841874,
      "grad_norm": 1.9713603258132935,
      "learning_rate": 9.752562225475842e-06,
      "loss": 0.6986,
      "step": 7000
    },
    {
      "epoch": 2.5805270863836016,
      "grad_norm": 3.823519706726074,
      "learning_rate": 9.679355783308932e-06,
      "loss": 0.6926,
      "step": 7050
    },
    {
      "epoch": 2.5988286969253296,
      "grad_norm": 1.7224080562591553,
      "learning_rate": 9.606149341142022e-06,
      "loss": 0.6894,
      "step": 7100
    },
    {
      "epoch": 2.617130307467057,
      "grad_norm": 1.2425187826156616,
      "learning_rate": 9.532942898975111e-06,
      "loss": 0.6898,
      "step": 7150
    },
    {
      "epoch": 2.6354319180087846,
      "grad_norm": 3.008572816848755,
      "learning_rate": 9.4597364568082e-06,
      "loss": 0.6902,
      "step": 7200
    },
    {
      "epoch": 2.6537335285505126,
      "grad_norm": 2.5645570755004883,
      "learning_rate": 9.386530014641289e-06,
      "loss": 0.6828,
      "step": 7250
    },
    {
      "epoch": 2.67203513909224,
      "grad_norm": 1.7012056112289429,
      "learning_rate": 9.313323572474379e-06,
      "loss": 0.6969,
      "step": 7300
    },
    {
      "epoch": 2.6903367496339676,
      "grad_norm": 6.18578577041626,
      "learning_rate": 9.240117130307467e-06,
      "loss": 0.68,
      "step": 7350
    },
    {
      "epoch": 2.7086383601756956,
      "grad_norm": 2.9371464252471924,
      "learning_rate": 9.166910688140557e-06,
      "loss": 0.7052,
      "step": 7400
    },
    {
      "epoch": 2.726939970717423,
      "grad_norm": 1.306884527206421,
      "learning_rate": 9.093704245973646e-06,
      "loss": 0.6945,
      "step": 7450
    },
    {
      "epoch": 2.745241581259151,
      "grad_norm": 2.4502739906311035,
      "learning_rate": 9.020497803806736e-06,
      "loss": 0.6998,
      "step": 7500
    },
    {
      "epoch": 2.7635431918008786,
      "grad_norm": 3.148775100708008,
      "learning_rate": 8.947291361639824e-06,
      "loss": 0.6931,
      "step": 7550
    },
    {
      "epoch": 2.781844802342606,
      "grad_norm": 2.4222412109375,
      "learning_rate": 8.874084919472914e-06,
      "loss": 0.6939,
      "step": 7600
    },
    {
      "epoch": 2.8001464128843336,
      "grad_norm": 2.543954610824585,
      "learning_rate": 8.800878477306004e-06,
      "loss": 0.6934,
      "step": 7650
    },
    {
      "epoch": 2.8184480234260616,
      "grad_norm": 1.2464910745620728,
      "learning_rate": 8.727672035139094e-06,
      "loss": 0.6919,
      "step": 7700
    },
    {
      "epoch": 2.836749633967789,
      "grad_norm": 3.3286757469177246,
      "learning_rate": 8.654465592972182e-06,
      "loss": 0.6955,
      "step": 7750
    },
    {
      "epoch": 2.855051244509517,
      "grad_norm": 1.269946575164795,
      "learning_rate": 8.581259150805271e-06,
      "loss": 0.6904,
      "step": 7800
    },
    {
      "epoch": 2.8733528550512446,
      "grad_norm": 2.8248860836029053,
      "learning_rate": 8.508052708638361e-06,
      "loss": 0.6876,
      "step": 7850
    },
    {
      "epoch": 2.891654465592972,
      "grad_norm": 4.0137553215026855,
      "learning_rate": 8.434846266471451e-06,
      "loss": 0.6901,
      "step": 7900
    },
    {
      "epoch": 2.9099560761346996,
      "grad_norm": 4.750382900238037,
      "learning_rate": 8.361639824304539e-06,
      "loss": 0.6986,
      "step": 7950
    },
    {
      "epoch": 2.9282576866764276,
      "grad_norm": 3.539860963821411,
      "learning_rate": 8.288433382137629e-06,
      "loss": 0.6838,
      "step": 8000
    },
    {
      "epoch": 2.946559297218155,
      "grad_norm": 2.6545543670654297,
      "learning_rate": 8.215226939970719e-06,
      "loss": 0.6942,
      "step": 8050
    },
    {
      "epoch": 2.964860907759883,
      "grad_norm": 2.351973056793213,
      "learning_rate": 8.142020497803808e-06,
      "loss": 0.6991,
      "step": 8100
    },
    {
      "epoch": 2.9831625183016106,
      "grad_norm": 1.9938989877700806,
      "learning_rate": 8.068814055636896e-06,
      "loss": 0.6928,
      "step": 8150
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.5655197657393851,
      "eval_loss": 0.6848952770233154,
      "eval_runtime": 9.8156,
      "eval_samples_per_second": 278.332,
      "eval_steps_per_second": 17.421,
      "step": 8196
    },
    {
      "epoch": 3.001464128843338,
      "grad_norm": 3.9596476554870605,
      "learning_rate": 7.995607613469986e-06,
      "loss": 0.6903,
      "step": 8200
    },
    {
      "epoch": 3.019765739385066,
      "grad_norm": 4.190953254699707,
      "learning_rate": 7.922401171303076e-06,
      "loss": 0.6778,
      "step": 8250
    },
    {
      "epoch": 3.0380673499267936,
      "grad_norm": 3.9022440910339355,
      "learning_rate": 7.849194729136164e-06,
      "loss": 0.7014,
      "step": 8300
    },
    {
      "epoch": 3.056368960468521,
      "grad_norm": 3.035680055618286,
      "learning_rate": 7.775988286969254e-06,
      "loss": 0.6879,
      "step": 8350
    },
    {
      "epoch": 3.074670571010249,
      "grad_norm": 6.70428466796875,
      "learning_rate": 7.702781844802343e-06,
      "loss": 0.6871,
      "step": 8400
    },
    {
      "epoch": 3.0929721815519766,
      "grad_norm": 2.6576058864593506,
      "learning_rate": 7.629575402635433e-06,
      "loss": 0.6956,
      "step": 8450
    },
    {
      "epoch": 3.111273792093704,
      "grad_norm": 3.4227051734924316,
      "learning_rate": 7.556368960468522e-06,
      "loss": 0.6949,
      "step": 8500
    },
    {
      "epoch": 3.129575402635432,
      "grad_norm": 6.5497145652771,
      "learning_rate": 7.483162518301611e-06,
      "loss": 0.6882,
      "step": 8550
    },
    {
      "epoch": 3.1478770131771596,
      "grad_norm": 1.9692039489746094,
      "learning_rate": 7.4099560761347e-06,
      "loss": 0.6857,
      "step": 8600
    },
    {
      "epoch": 3.166178623718887,
      "grad_norm": 1.5063157081604004,
      "learning_rate": 7.3367496339677906e-06,
      "loss": 0.703,
      "step": 8650
    },
    {
      "epoch": 3.184480234260615,
      "grad_norm": 2.114001750946045,
      "learning_rate": 7.2635431918008795e-06,
      "loss": 0.6931,
      "step": 8700
    },
    {
      "epoch": 3.2027818448023426,
      "grad_norm": 4.059252738952637,
      "learning_rate": 7.190336749633968e-06,
      "loss": 0.7012,
      "step": 8750
    },
    {
      "epoch": 3.22108345534407,
      "grad_norm": 1.4372018575668335,
      "learning_rate": 7.117130307467057e-06,
      "loss": 0.6911,
      "step": 8800
    },
    {
      "epoch": 3.239385065885798,
      "grad_norm": 1.4479070901870728,
      "learning_rate": 7.043923865300147e-06,
      "loss": 0.6925,
      "step": 8850
    },
    {
      "epoch": 3.2576866764275256,
      "grad_norm": 1.4054046869277954,
      "learning_rate": 6.970717423133237e-06,
      "loss": 0.6948,
      "step": 8900
    },
    {
      "epoch": 3.275988286969253,
      "grad_norm": 2.7354447841644287,
      "learning_rate": 6.897510980966326e-06,
      "loss": 0.6931,
      "step": 8950
    },
    {
      "epoch": 3.294289897510981,
      "grad_norm": 6.485820293426514,
      "learning_rate": 6.824304538799415e-06,
      "loss": 0.6924,
      "step": 9000
    },
    {
      "epoch": 3.3125915080527086,
      "grad_norm": 2.2250216007232666,
      "learning_rate": 6.751098096632504e-06,
      "loss": 0.6955,
      "step": 9050
    },
    {
      "epoch": 3.330893118594436,
      "grad_norm": 3.7703425884246826,
      "learning_rate": 6.677891654465593e-06,
      "loss": 0.6978,
      "step": 9100
    },
    {
      "epoch": 3.349194729136164,
      "grad_norm": 1.3543486595153809,
      "learning_rate": 6.604685212298682e-06,
      "loss": 0.6851,
      "step": 9150
    },
    {
      "epoch": 3.3674963396778916,
      "grad_norm": 1.9245567321777344,
      "learning_rate": 6.531478770131772e-06,
      "loss": 0.6971,
      "step": 9200
    },
    {
      "epoch": 3.385797950219619,
      "grad_norm": 1.5131146907806396,
      "learning_rate": 6.458272327964862e-06,
      "loss": 0.6898,
      "step": 9250
    },
    {
      "epoch": 3.404099560761347,
      "grad_norm": 1.524381399154663,
      "learning_rate": 6.385065885797951e-06,
      "loss": 0.6881,
      "step": 9300
    },
    {
      "epoch": 3.4224011713030746,
      "grad_norm": 2.4183127880096436,
      "learning_rate": 6.3118594436310396e-06,
      "loss": 0.6957,
      "step": 9350
    },
    {
      "epoch": 3.440702781844802,
      "grad_norm": 6.259820461273193,
      "learning_rate": 6.2386530014641285e-06,
      "loss": 0.6945,
      "step": 9400
    },
    {
      "epoch": 3.45900439238653,
      "grad_norm": 4.742863655090332,
      "learning_rate": 6.165446559297219e-06,
      "loss": 0.6918,
      "step": 9450
    },
    {
      "epoch": 3.4773060029282576,
      "grad_norm": 3.144963026046753,
      "learning_rate": 6.092240117130308e-06,
      "loss": 0.6862,
      "step": 9500
    },
    {
      "epoch": 3.4956076134699856,
      "grad_norm": 2.9106500148773193,
      "learning_rate": 6.019033674963397e-06,
      "loss": 0.6889,
      "step": 9550
    },
    {
      "epoch": 3.513909224011713,
      "grad_norm": 6.509891510009766,
      "learning_rate": 5.945827232796486e-06,
      "loss": 0.6945,
      "step": 9600
    },
    {
      "epoch": 3.5322108345534406,
      "grad_norm": 2.27295184135437,
      "learning_rate": 5.8726207906295764e-06,
      "loss": 0.6943,
      "step": 9650
    },
    {
      "epoch": 3.550512445095168,
      "grad_norm": 7.141515731811523,
      "learning_rate": 5.799414348462665e-06,
      "loss": 0.6876,
      "step": 9700
    },
    {
      "epoch": 3.568814055636896,
      "grad_norm": 2.371335744857788,
      "learning_rate": 5.726207906295754e-06,
      "loss": 0.6988,
      "step": 9750
    },
    {
      "epoch": 3.5871156661786237,
      "grad_norm": 1.0689224004745483,
      "learning_rate": 5.653001464128843e-06,
      "loss": 0.6956,
      "step": 9800
    },
    {
      "epoch": 3.6054172767203516,
      "grad_norm": 3.6092991828918457,
      "learning_rate": 5.579795021961934e-06,
      "loss": 0.6926,
      "step": 9850
    },
    {
      "epoch": 3.623718887262079,
      "grad_norm": 1.2732123136520386,
      "learning_rate": 5.506588579795023e-06,
      "loss": 0.6901,
      "step": 9900
    },
    {
      "epoch": 3.6420204978038067,
      "grad_norm": 1.149488925933838,
      "learning_rate": 5.433382137628112e-06,
      "loss": 0.6947,
      "step": 9950
    },
    {
      "epoch": 3.660322108345534,
      "grad_norm": 1.580837607383728,
      "learning_rate": 5.3601756954612005e-06,
      "loss": 0.6931,
      "step": 10000
    },
    {
      "epoch": 3.678623718887262,
      "grad_norm": 4.544098854064941,
      "learning_rate": 5.28696925329429e-06,
      "loss": 0.6885,
      "step": 10050
    },
    {
      "epoch": 3.6969253294289897,
      "grad_norm": 2.011023998260498,
      "learning_rate": 5.21376281112738e-06,
      "loss": 0.6962,
      "step": 10100
    },
    {
      "epoch": 3.7152269399707176,
      "grad_norm": 5.078649997711182,
      "learning_rate": 5.140556368960469e-06,
      "loss": 0.6936,
      "step": 10150
    },
    {
      "epoch": 3.733528550512445,
      "grad_norm": 2.257343053817749,
      "learning_rate": 5.067349926793558e-06,
      "loss": 0.6945,
      "step": 10200
    },
    {
      "epoch": 3.7518301610541727,
      "grad_norm": 1.252966046333313,
      "learning_rate": 4.994143484626648e-06,
      "loss": 0.6892,
      "step": 10250
    },
    {
      "epoch": 3.7701317715959,
      "grad_norm": 2.1973466873168945,
      "learning_rate": 4.9209370424597365e-06,
      "loss": 0.6868,
      "step": 10300
    },
    {
      "epoch": 3.788433382137628,
      "grad_norm": 1.9186662435531616,
      "learning_rate": 4.847730600292826e-06,
      "loss": 0.6854,
      "step": 10350
    },
    {
      "epoch": 3.8067349926793557,
      "grad_norm": 3.1210763454437256,
      "learning_rate": 4.774524158125915e-06,
      "loss": 0.6977,
      "step": 10400
    },
    {
      "epoch": 3.8250366032210836,
      "grad_norm": 2.7756359577178955,
      "learning_rate": 4.701317715959005e-06,
      "loss": 0.696,
      "step": 10450
    },
    {
      "epoch": 3.843338213762811,
      "grad_norm": 1.0838842391967773,
      "learning_rate": 4.628111273792094e-06,
      "loss": 0.6921,
      "step": 10500
    },
    {
      "epoch": 3.8616398243045387,
      "grad_norm": 3.4335124492645264,
      "learning_rate": 4.554904831625183e-06,
      "loss": 0.6956,
      "step": 10550
    },
    {
      "epoch": 3.8799414348462666,
      "grad_norm": 1.28254234790802,
      "learning_rate": 4.4816983894582726e-06,
      "loss": 0.691,
      "step": 10600
    },
    {
      "epoch": 3.898243045387994,
      "grad_norm": 2.6828558444976807,
      "learning_rate": 4.4084919472913615e-06,
      "loss": 0.6926,
      "step": 10650
    },
    {
      "epoch": 3.9165446559297217,
      "grad_norm": 1.131003499031067,
      "learning_rate": 4.335285505124451e-06,
      "loss": 0.6894,
      "step": 10700
    },
    {
      "epoch": 3.9348462664714496,
      "grad_norm": 3.3838775157928467,
      "learning_rate": 4.26207906295754e-06,
      "loss": 0.6936,
      "step": 10750
    },
    {
      "epoch": 3.953147877013177,
      "grad_norm": 1.3978344202041626,
      "learning_rate": 4.18887262079063e-06,
      "loss": 0.6873,
      "step": 10800
    },
    {
      "epoch": 3.9714494875549047,
      "grad_norm": 4.850183010101318,
      "learning_rate": 4.115666178623719e-06,
      "loss": 0.6855,
      "step": 10850
    },
    {
      "epoch": 3.9897510980966326,
      "grad_norm": 3.063927412033081,
      "learning_rate": 4.0424597364568086e-06,
      "loss": 0.6942,
      "step": 10900
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.5655197657393851,
      "eval_loss": 0.6846572756767273,
      "eval_runtime": 9.7626,
      "eval_samples_per_second": 279.843,
      "eval_steps_per_second": 17.516,
      "step": 10928
    },
    {
      "epoch": 4.008052708638361,
      "grad_norm": 2.9117279052734375,
      "learning_rate": 3.9692532942898975e-06,
      "loss": 0.6924,
      "step": 10950
    },
    {
      "epoch": 4.026354319180088,
      "grad_norm": 1.5743687152862549,
      "learning_rate": 3.896046852122987e-06,
      "loss": 0.6929,
      "step": 11000
    },
    {
      "epoch": 4.044655929721816,
      "grad_norm": 1.4719305038452148,
      "learning_rate": 3.822840409956076e-06,
      "loss": 0.6887,
      "step": 11050
    },
    {
      "epoch": 4.062957540263543,
      "grad_norm": 1.4639744758605957,
      "learning_rate": 3.749633967789166e-06,
      "loss": 0.6786,
      "step": 11100
    },
    {
      "epoch": 4.081259150805271,
      "grad_norm": 6.5527729988098145,
      "learning_rate": 3.676427525622255e-06,
      "loss": 0.6868,
      "step": 11150
    },
    {
      "epoch": 4.099560761346998,
      "grad_norm": 1.1454309225082397,
      "learning_rate": 3.6032210834553446e-06,
      "loss": 0.6991,
      "step": 11200
    },
    {
      "epoch": 4.117862371888727,
      "grad_norm": 2.6722428798675537,
      "learning_rate": 3.5300146412884335e-06,
      "loss": 0.6871,
      "step": 11250
    },
    {
      "epoch": 4.136163982430454,
      "grad_norm": 1.562593936920166,
      "learning_rate": 3.456808199121523e-06,
      "loss": 0.6911,
      "step": 11300
    },
    {
      "epoch": 4.154465592972182,
      "grad_norm": 1.5078984498977661,
      "learning_rate": 3.383601756954612e-06,
      "loss": 0.699,
      "step": 11350
    },
    {
      "epoch": 4.172767203513909,
      "grad_norm": 4.862717151641846,
      "learning_rate": 3.3103953147877015e-06,
      "loss": 0.6848,
      "step": 11400
    },
    {
      "epoch": 4.191068814055637,
      "grad_norm": 1.2064334154129028,
      "learning_rate": 3.2371888726207904e-06,
      "loss": 0.6784,
      "step": 11450
    },
    {
      "epoch": 4.209370424597364,
      "grad_norm": 1.4410967826843262,
      "learning_rate": 3.16398243045388e-06,
      "loss": 0.6867,
      "step": 11500
    },
    {
      "epoch": 4.227672035139093,
      "grad_norm": 1.5525456666946411,
      "learning_rate": 3.090775988286969e-06,
      "loss": 0.6878,
      "step": 11550
    },
    {
      "epoch": 4.24597364568082,
      "grad_norm": 2.721654176712036,
      "learning_rate": 3.017569546120059e-06,
      "loss": 0.6911,
      "step": 11600
    },
    {
      "epoch": 4.264275256222548,
      "grad_norm": 2.7285444736480713,
      "learning_rate": 2.944363103953148e-06,
      "loss": 0.6843,
      "step": 11650
    },
    {
      "epoch": 4.282576866764275,
      "grad_norm": 1.2089800834655762,
      "learning_rate": 2.8711566617862375e-06,
      "loss": 0.6989,
      "step": 11700
    },
    {
      "epoch": 4.300878477306003,
      "grad_norm": 3.368563175201416,
      "learning_rate": 2.797950219619327e-06,
      "loss": 0.683,
      "step": 11750
    },
    {
      "epoch": 4.31918008784773,
      "grad_norm": 1.4972318410873413,
      "learning_rate": 2.724743777452416e-06,
      "loss": 0.6897,
      "step": 11800
    },
    {
      "epoch": 4.337481698389459,
      "grad_norm": 3.92740797996521,
      "learning_rate": 2.6515373352855055e-06,
      "loss": 0.6876,
      "step": 11850
    },
    {
      "epoch": 4.355783308931186,
      "grad_norm": 2.367027521133423,
      "learning_rate": 2.5783308931185944e-06,
      "loss": 0.6884,
      "step": 11900
    },
    {
      "epoch": 4.374084919472914,
      "grad_norm": 3.318098306655884,
      "learning_rate": 2.505124450951684e-06,
      "loss": 0.6867,
      "step": 11950
    },
    {
      "epoch": 4.392386530014641,
      "grad_norm": 2.5479702949523926,
      "learning_rate": 2.431918008784773e-06,
      "loss": 0.6787,
      "step": 12000
    },
    {
      "epoch": 4.410688140556369,
      "grad_norm": 2.361260175704956,
      "learning_rate": 2.3587115666178625e-06,
      "loss": 0.6904,
      "step": 12050
    },
    {
      "epoch": 4.428989751098096,
      "grad_norm": 2.41310715675354,
      "learning_rate": 2.285505124450952e-06,
      "loss": 0.6814,
      "step": 12100
    },
    {
      "epoch": 4.447291361639825,
      "grad_norm": 2.390275716781616,
      "learning_rate": 2.212298682284041e-06,
      "loss": 0.667,
      "step": 12150
    },
    {
      "epoch": 4.465592972181552,
      "grad_norm": 1.9900853633880615,
      "learning_rate": 2.1390922401171305e-06,
      "loss": 0.6873,
      "step": 12200
    },
    {
      "epoch": 4.48389458272328,
      "grad_norm": 5.233725547790527,
      "learning_rate": 2.06588579795022e-06,
      "loss": 0.6844,
      "step": 12250
    },
    {
      "epoch": 4.502196193265007,
      "grad_norm": 2.4165773391723633,
      "learning_rate": 1.992679355783309e-06,
      "loss": 0.6803,
      "step": 12300
    },
    {
      "epoch": 4.520497803806735,
      "grad_norm": 1.9948047399520874,
      "learning_rate": 1.9194729136163985e-06,
      "loss": 0.6885,
      "step": 12350
    },
    {
      "epoch": 4.538799414348462,
      "grad_norm": 1.5085363388061523,
      "learning_rate": 1.8462664714494876e-06,
      "loss": 0.6883,
      "step": 12400
    },
    {
      "epoch": 4.557101024890191,
      "grad_norm": 5.91818380355835,
      "learning_rate": 1.773060029282577e-06,
      "loss": 0.6696,
      "step": 12450
    },
    {
      "epoch": 4.575402635431918,
      "grad_norm": 3.4706273078918457,
      "learning_rate": 1.6998535871156663e-06,
      "loss": 0.6798,
      "step": 12500
    },
    {
      "epoch": 4.593704245973646,
      "grad_norm": 3.0562493801116943,
      "learning_rate": 1.6266471449487556e-06,
      "loss": 0.6763,
      "step": 12550
    },
    {
      "epoch": 4.612005856515373,
      "grad_norm": 5.213473796844482,
      "learning_rate": 1.553440702781845e-06,
      "loss": 0.6793,
      "step": 12600
    },
    {
      "epoch": 4.630307467057101,
      "grad_norm": 2.782263994216919,
      "learning_rate": 1.4802342606149343e-06,
      "loss": 0.6748,
      "step": 12650
    },
    {
      "epoch": 4.648609077598829,
      "grad_norm": 12.179971694946289,
      "learning_rate": 1.4070278184480234e-06,
      "loss": 0.6653,
      "step": 12700
    },
    {
      "epoch": 4.666910688140557,
      "grad_norm": 5.823268413543701,
      "learning_rate": 1.3338213762811127e-06,
      "loss": 0.6916,
      "step": 12750
    },
    {
      "epoch": 4.685212298682284,
      "grad_norm": 4.365823745727539,
      "learning_rate": 1.260614934114202e-06,
      "loss": 0.6892,
      "step": 12800
    },
    {
      "epoch": 4.703513909224012,
      "grad_norm": 6.108221530914307,
      "learning_rate": 1.1874084919472914e-06,
      "loss": 0.6896,
      "step": 12850
    },
    {
      "epoch": 4.721815519765739,
      "grad_norm": 6.130873203277588,
      "learning_rate": 1.1142020497803808e-06,
      "loss": 0.6818,
      "step": 12900
    },
    {
      "epoch": 4.740117130307467,
      "grad_norm": 4.07578706741333,
      "learning_rate": 1.04099560761347e-06,
      "loss": 0.6795,
      "step": 12950
    },
    {
      "epoch": 4.758418740849194,
      "grad_norm": 3.392340898513794,
      "learning_rate": 9.677891654465594e-07,
      "loss": 0.6655,
      "step": 13000
    },
    {
      "epoch": 4.776720351390923,
      "grad_norm": 6.404242992401123,
      "learning_rate": 8.945827232796487e-07,
      "loss": 0.6769,
      "step": 13050
    },
    {
      "epoch": 4.79502196193265,
      "grad_norm": 3.1065897941589355,
      "learning_rate": 8.213762811127379e-07,
      "loss": 0.683,
      "step": 13100
    },
    {
      "epoch": 4.813323572474378,
      "grad_norm": 6.093231201171875,
      "learning_rate": 7.481698389458272e-07,
      "loss": 0.6798,
      "step": 13150
    },
    {
      "epoch": 4.831625183016105,
      "grad_norm": 7.247190952301025,
      "learning_rate": 6.749633967789166e-07,
      "loss": 0.6737,
      "step": 13200
    },
    {
      "epoch": 4.849926793557833,
      "grad_norm": 5.564077377319336,
      "learning_rate": 6.017569546120059e-07,
      "loss": 0.6805,
      "step": 13250
    },
    {
      "epoch": 4.868228404099561,
      "grad_norm": 3.6535987854003906,
      "learning_rate": 5.285505124450952e-07,
      "loss": 0.6612,
      "step": 13300
    },
    {
      "epoch": 4.886530014641289,
      "grad_norm": 3.1486198902130127,
      "learning_rate": 4.553440702781845e-07,
      "loss": 0.6833,
      "step": 13350
    },
    {
      "epoch": 4.904831625183016,
      "grad_norm": 2.8693318367004395,
      "learning_rate": 3.821376281112738e-07,
      "loss": 0.6775,
      "step": 13400
    },
    {
      "epoch": 4.923133235724744,
      "grad_norm": 3.7927544116973877,
      "learning_rate": 3.0893118594436313e-07,
      "loss": 0.673,
      "step": 13450
    },
    {
      "epoch": 4.941434846266471,
      "grad_norm": 3.532392740249634,
      "learning_rate": 2.3572474377745242e-07,
      "loss": 0.6713,
      "step": 13500
    },
    {
      "epoch": 4.959736456808199,
      "grad_norm": 3.34582257270813,
      "learning_rate": 1.6251830161054173e-07,
      "loss": 0.6542,
      "step": 13550
    },
    {
      "epoch": 4.978038067349927,
      "grad_norm": 4.675662517547607,
      "learning_rate": 8.931185944363105e-08,
      "loss": 0.6832,
      "step": 13600
    },
    {
      "epoch": 4.996339677891655,
      "grad_norm": 3.3549957275390625,
      "learning_rate": 1.610541727672035e-08,
      "loss": 0.6633,
      "step": 13650
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.5841874084919473,
      "eval_loss": 0.6651186347007751,
      "eval_runtime": 9.7608,
      "eval_samples_per_second": 279.896,
      "eval_steps_per_second": 17.519,
      "step": 13660
    }
  ],
  "logging_steps": 50,
  "max_steps": 13660,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7187536254796800.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}