| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 48.16824966078698, | |
| "eval_steps": 3538, | |
| "global_step": 71000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.060379918588873815, | |
| "grad_norm": 9.192076683044434, | |
| "learning_rate": 3.353428786737001e-06, | |
| "loss": 4.0695, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.12075983717774763, | |
| "grad_norm": 6.064570903778076, | |
| "learning_rate": 6.706857573474002e-06, | |
| "loss": 3.4544, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.18113975576662145, | |
| "grad_norm": 7.58292293548584, | |
| "learning_rate": 1.0060286360211004e-05, | |
| "loss": 3.2275, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.24151967435549526, | |
| "grad_norm": 12.527491569519043, | |
| "learning_rate": 1.3413715146948003e-05, | |
| "loss": 3.09, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.3018995929443691, | |
| "grad_norm": 12.255288124084473, | |
| "learning_rate": 1.6767143933685002e-05, | |
| "loss": 3.0322, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.3622795115332429, | |
| "grad_norm": 6.611302375793457, | |
| "learning_rate": 2.0120572720422008e-05, | |
| "loss": 2.9608, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 0.4226594301221167, | |
| "grad_norm": 12.716562271118164, | |
| "learning_rate": 2.3474001507159007e-05, | |
| "loss": 2.892, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 0.4830393487109905, | |
| "grad_norm": 6.532482624053955, | |
| "learning_rate": 2.6827430293896006e-05, | |
| "loss": 2.8621, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 0.5434192672998643, | |
| "grad_norm": 25.10944175720215, | |
| "learning_rate": 3.0180859080633005e-05, | |
| "loss": 2.754, | |
| "step": 801 | |
| }, | |
| { | |
| "epoch": 0.6037991858887382, | |
| "grad_norm": 7.85612154006958, | |
| "learning_rate": 3.3534287867370005e-05, | |
| "loss": 2.7605, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.664179104477612, | |
| "grad_norm": 9.118956565856934, | |
| "learning_rate": 3.688771665410701e-05, | |
| "loss": 2.7105, | |
| "step": 979 | |
| }, | |
| { | |
| "epoch": 0.7245590230664858, | |
| "grad_norm": 8.141679763793945, | |
| "learning_rate": 4.0241145440844016e-05, | |
| "loss": 2.6589, | |
| "step": 1068 | |
| }, | |
| { | |
| "epoch": 0.7849389416553596, | |
| "grad_norm": 18.962980270385742, | |
| "learning_rate": 4.3594574227581015e-05, | |
| "loss": 2.6437, | |
| "step": 1157 | |
| }, | |
| { | |
| "epoch": 0.8453188602442334, | |
| "grad_norm": 7.252344608306885, | |
| "learning_rate": 4.6948003014318015e-05, | |
| "loss": 2.6, | |
| "step": 1246 | |
| }, | |
| { | |
| "epoch": 0.9056987788331072, | |
| "grad_norm": 9.572624206542969, | |
| "learning_rate": 5.030143180105501e-05, | |
| "loss": 2.6181, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 0.966078697421981, | |
| "grad_norm": 7.212714672088623, | |
| "learning_rate": 5.365486058779201e-05, | |
| "loss": 2.5157, | |
| "step": 1424 | |
| }, | |
| { | |
| "epoch": 1.0264586160108549, | |
| "grad_norm": 8.035208702087402, | |
| "learning_rate": 5.700828937452901e-05, | |
| "loss": 2.5217, | |
| "step": 1513 | |
| }, | |
| { | |
| "epoch": 1.0868385345997287, | |
| "grad_norm": 10.060546875, | |
| "learning_rate": 6.036171816126601e-05, | |
| "loss": 2.4495, | |
| "step": 1602 | |
| }, | |
| { | |
| "epoch": 1.1472184531886025, | |
| "grad_norm": 8.2612943649292, | |
| "learning_rate": 6.371514694800301e-05, | |
| "loss": 2.4271, | |
| "step": 1691 | |
| }, | |
| { | |
| "epoch": 1.2075983717774763, | |
| "grad_norm": 7.717799186706543, | |
| "learning_rate": 6.706857573474001e-05, | |
| "loss": 2.4397, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.2679782903663501, | |
| "grad_norm": 11.520405769348145, | |
| "learning_rate": 7.042200452147701e-05, | |
| "loss": 2.4099, | |
| "step": 1869 | |
| }, | |
| { | |
| "epoch": 1.328358208955224, | |
| "grad_norm": 8.826777458190918, | |
| "learning_rate": 7.377543330821402e-05, | |
| "loss": 2.3349, | |
| "step": 1958 | |
| }, | |
| { | |
| "epoch": 1.3887381275440978, | |
| "grad_norm": 7.810181140899658, | |
| "learning_rate": 7.712886209495102e-05, | |
| "loss": 2.3491, | |
| "step": 2047 | |
| }, | |
| { | |
| "epoch": 1.4491180461329716, | |
| "grad_norm": 9.809256553649902, | |
| "learning_rate": 8.048229088168803e-05, | |
| "loss": 2.3778, | |
| "step": 2136 | |
| }, | |
| { | |
| "epoch": 1.5094979647218452, | |
| "grad_norm": 8.396034240722656, | |
| "learning_rate": 8.383571966842503e-05, | |
| "loss": 2.3717, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 1.5698778833107192, | |
| "grad_norm": 10.409805297851562, | |
| "learning_rate": 8.718914845516203e-05, | |
| "loss": 2.3207, | |
| "step": 2314 | |
| }, | |
| { | |
| "epoch": 1.6302578018995928, | |
| "grad_norm": 7.1885786056518555, | |
| "learning_rate": 9.054257724189903e-05, | |
| "loss": 2.3651, | |
| "step": 2403 | |
| }, | |
| { | |
| "epoch": 1.6906377204884668, | |
| "grad_norm": 7.768437385559082, | |
| "learning_rate": 9.389600602863603e-05, | |
| "loss": 2.3313, | |
| "step": 2492 | |
| }, | |
| { | |
| "epoch": 1.7510176390773404, | |
| "grad_norm": 5.661167144775391, | |
| "learning_rate": 9.724943481537303e-05, | |
| "loss": 2.2961, | |
| "step": 2581 | |
| }, | |
| { | |
| "epoch": 1.8113975576662145, | |
| "grad_norm": 8.26041030883789, | |
| "learning_rate": 9.999999141684668e-05, | |
| "loss": 2.2966, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 1.871777476255088, | |
| "grad_norm": 9.347060203552246, | |
| "learning_rate": 9.999963035487687e-05, | |
| "loss": 2.2934, | |
| "step": 2759 | |
| }, | |
| { | |
| "epoch": 1.932157394843962, | |
| "grad_norm": 9.274127006530762, | |
| "learning_rate": 9.999873814762094e-05, | |
| "loss": 2.2813, | |
| "step": 2848 | |
| }, | |
| { | |
| "epoch": 1.9925373134328357, | |
| "grad_norm": 8.107317924499512, | |
| "learning_rate": 9.999731480455674e-05, | |
| "loss": 2.2005, | |
| "step": 2937 | |
| }, | |
| { | |
| "epoch": 2.0529172320217097, | |
| "grad_norm": 8.258145332336426, | |
| "learning_rate": 9.999536034080447e-05, | |
| "loss": 2.1059, | |
| "step": 3026 | |
| }, | |
| { | |
| "epoch": 2.1132971506105833, | |
| "grad_norm": 6.388680458068848, | |
| "learning_rate": 9.999287477712633e-05, | |
| "loss": 2.0797, | |
| "step": 3115 | |
| }, | |
| { | |
| "epoch": 2.1736770691994574, | |
| "grad_norm": 6.251929759979248, | |
| "learning_rate": 9.998985813992645e-05, | |
| "loss": 2.0776, | |
| "step": 3204 | |
| }, | |
| { | |
| "epoch": 2.234056987788331, | |
| "grad_norm": 7.3807172775268555, | |
| "learning_rate": 9.998631046125051e-05, | |
| "loss": 2.0028, | |
| "step": 3293 | |
| }, | |
| { | |
| "epoch": 2.294436906377205, | |
| "grad_norm": 8.793547630310059, | |
| "learning_rate": 9.998223177878545e-05, | |
| "loss": 2.0789, | |
| "step": 3382 | |
| }, | |
| { | |
| "epoch": 2.3548168249660786, | |
| "grad_norm": 8.737523078918457, | |
| "learning_rate": 9.997762213585903e-05, | |
| "loss": 2.0322, | |
| "step": 3471 | |
| }, | |
| { | |
| "epoch": 2.400271370420624, | |
| "eval_accuracy": 0.193359375, | |
| "eval_loss": 3.569305419921875, | |
| "eval_runtime": 19.2577, | |
| "eval_samples_per_second": 26.587, | |
| "eval_steps_per_second": 0.208, | |
| "step": 3538 | |
| }, | |
| { | |
| "epoch": 2.4151967435549526, | |
| "grad_norm": 13.652155876159668, | |
| "learning_rate": 9.997248158143945e-05, | |
| "loss": 1.9617, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 2.475576662143826, | |
| "grad_norm": 6.501727104187012, | |
| "learning_rate": 9.99668101701347e-05, | |
| "loss": 2.0389, | |
| "step": 3649 | |
| }, | |
| { | |
| "epoch": 2.5359565807327002, | |
| "grad_norm": 11.862299919128418, | |
| "learning_rate": 9.99606079621921e-05, | |
| "loss": 2.031, | |
| "step": 3738 | |
| }, | |
| { | |
| "epoch": 2.596336499321574, | |
| "grad_norm": 7.8563551902771, | |
| "learning_rate": 9.995387502349764e-05, | |
| "loss": 1.9729, | |
| "step": 3827 | |
| }, | |
| { | |
| "epoch": 2.656716417910448, | |
| "grad_norm": 9.843242645263672, | |
| "learning_rate": 9.99466114255752e-05, | |
| "loss": 1.9323, | |
| "step": 3916 | |
| }, | |
| { | |
| "epoch": 2.7170963364993215, | |
| "grad_norm": 6.065842628479004, | |
| "learning_rate": 9.993881724558587e-05, | |
| "loss": 1.9465, | |
| "step": 4005 | |
| }, | |
| { | |
| "epoch": 2.7774762550881955, | |
| "grad_norm": 11.29686450958252, | |
| "learning_rate": 9.993049256632708e-05, | |
| "loss": 1.8912, | |
| "step": 4094 | |
| }, | |
| { | |
| "epoch": 2.837856173677069, | |
| "grad_norm": 11.959461212158203, | |
| "learning_rate": 9.99216374762318e-05, | |
| "loss": 1.9665, | |
| "step": 4183 | |
| }, | |
| { | |
| "epoch": 2.898236092265943, | |
| "grad_norm": 8.116125106811523, | |
| "learning_rate": 9.991225206936747e-05, | |
| "loss": 1.9158, | |
| "step": 4272 | |
| }, | |
| { | |
| "epoch": 2.9586160108548167, | |
| "grad_norm": 7.0811309814453125, | |
| "learning_rate": 9.990233644543517e-05, | |
| "loss": 1.929, | |
| "step": 4361 | |
| }, | |
| { | |
| "epoch": 3.0189959294436908, | |
| "grad_norm": 7.11262845993042, | |
| "learning_rate": 9.989189070976839e-05, | |
| "loss": 1.8259, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 3.0793758480325644, | |
| "grad_norm": 10.385732650756836, | |
| "learning_rate": 9.988091497333202e-05, | |
| "loss": 1.6678, | |
| "step": 4539 | |
| }, | |
| { | |
| "epoch": 3.1397557666214384, | |
| "grad_norm": 8.934700965881348, | |
| "learning_rate": 9.986940935272113e-05, | |
| "loss": 1.7278, | |
| "step": 4628 | |
| }, | |
| { | |
| "epoch": 3.200135685210312, | |
| "grad_norm": 11.203325271606445, | |
| "learning_rate": 9.985737397015975e-05, | |
| "loss": 1.6957, | |
| "step": 4717 | |
| }, | |
| { | |
| "epoch": 3.260515603799186, | |
| "grad_norm": 10.464749336242676, | |
| "learning_rate": 9.984480895349955e-05, | |
| "loss": 1.6743, | |
| "step": 4806 | |
| }, | |
| { | |
| "epoch": 3.3208955223880596, | |
| "grad_norm": 9.995988845825195, | |
| "learning_rate": 9.983171443621853e-05, | |
| "loss": 1.692, | |
| "step": 4895 | |
| }, | |
| { | |
| "epoch": 3.3812754409769337, | |
| "grad_norm": 11.267080307006836, | |
| "learning_rate": 9.981809055741953e-05, | |
| "loss": 1.6836, | |
| "step": 4984 | |
| }, | |
| { | |
| "epoch": 3.4416553595658073, | |
| "grad_norm": 9.267989158630371, | |
| "learning_rate": 9.980393746182879e-05, | |
| "loss": 1.6307, | |
| "step": 5073 | |
| }, | |
| { | |
| "epoch": 3.5020352781546813, | |
| "grad_norm": 10.1551513671875, | |
| "learning_rate": 9.978925529979441e-05, | |
| "loss": 1.6547, | |
| "step": 5162 | |
| }, | |
| { | |
| "epoch": 3.562415196743555, | |
| "grad_norm": 8.809422492980957, | |
| "learning_rate": 9.97740442272848e-05, | |
| "loss": 1.6293, | |
| "step": 5251 | |
| }, | |
| { | |
| "epoch": 3.622795115332429, | |
| "grad_norm": 9.327820777893066, | |
| "learning_rate": 9.975830440588692e-05, | |
| "loss": 1.6611, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 3.6831750339213025, | |
| "grad_norm": 9.966196060180664, | |
| "learning_rate": 9.974203600280465e-05, | |
| "loss": 1.6296, | |
| "step": 5429 | |
| }, | |
| { | |
| "epoch": 3.743554952510176, | |
| "grad_norm": 9.004570007324219, | |
| "learning_rate": 9.972523919085699e-05, | |
| "loss": 1.6335, | |
| "step": 5518 | |
| }, | |
| { | |
| "epoch": 3.80393487109905, | |
| "grad_norm": 8.515008926391602, | |
| "learning_rate": 9.97079141484762e-05, | |
| "loss": 1.6038, | |
| "step": 5607 | |
| }, | |
| { | |
| "epoch": 3.864314789687924, | |
| "grad_norm": 8.961952209472656, | |
| "learning_rate": 9.969006105970593e-05, | |
| "loss": 1.6298, | |
| "step": 5696 | |
| }, | |
| { | |
| "epoch": 3.924694708276798, | |
| "grad_norm": 11.294893264770508, | |
| "learning_rate": 9.967168011419927e-05, | |
| "loss": 1.588, | |
| "step": 5785 | |
| }, | |
| { | |
| "epoch": 3.9850746268656714, | |
| "grad_norm": 9.631956100463867, | |
| "learning_rate": 9.965277150721669e-05, | |
| "loss": 1.5871, | |
| "step": 5874 | |
| }, | |
| { | |
| "epoch": 4.045454545454546, | |
| "grad_norm": 7.810765266418457, | |
| "learning_rate": 9.963333543962405e-05, | |
| "loss": 1.4341, | |
| "step": 5963 | |
| }, | |
| { | |
| "epoch": 4.1058344640434195, | |
| "grad_norm": 10.34420394897461, | |
| "learning_rate": 9.961337211789039e-05, | |
| "loss": 1.4289, | |
| "step": 6052 | |
| }, | |
| { | |
| "epoch": 4.166214382632293, | |
| "grad_norm": 10.042189598083496, | |
| "learning_rate": 9.959288175408577e-05, | |
| "loss": 1.392, | |
| "step": 6141 | |
| }, | |
| { | |
| "epoch": 4.226594301221167, | |
| "grad_norm": 9.480945587158203, | |
| "learning_rate": 9.957186456587896e-05, | |
| "loss": 1.4407, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 4.286974219810041, | |
| "grad_norm": 10.059048652648926, | |
| "learning_rate": 9.955032077653525e-05, | |
| "loss": 1.4126, | |
| "step": 6319 | |
| }, | |
| { | |
| "epoch": 4.347354138398915, | |
| "grad_norm": 8.111236572265625, | |
| "learning_rate": 9.9528250614914e-05, | |
| "loss": 1.3697, | |
| "step": 6408 | |
| }, | |
| { | |
| "epoch": 4.407734056987788, | |
| "grad_norm": 7.836842060089111, | |
| "learning_rate": 9.950565431546612e-05, | |
| "loss": 1.4165, | |
| "step": 6497 | |
| }, | |
| { | |
| "epoch": 4.468113975576662, | |
| "grad_norm": 10.462437629699707, | |
| "learning_rate": 9.948253211823182e-05, | |
| "loss": 1.3629, | |
| "step": 6586 | |
| }, | |
| { | |
| "epoch": 4.5284938941655355, | |
| "grad_norm": 8.220168113708496, | |
| "learning_rate": 9.945888426883778e-05, | |
| "loss": 1.4402, | |
| "step": 6675 | |
| }, | |
| { | |
| "epoch": 4.58887381275441, | |
| "grad_norm": 8.81052017211914, | |
| "learning_rate": 9.943471101849477e-05, | |
| "loss": 1.4194, | |
| "step": 6764 | |
| }, | |
| { | |
| "epoch": 4.649253731343284, | |
| "grad_norm": 13.403641700744629, | |
| "learning_rate": 9.941001262399482e-05, | |
| "loss": 1.3943, | |
| "step": 6853 | |
| }, | |
| { | |
| "epoch": 4.709633649932157, | |
| "grad_norm": 8.589107513427734, | |
| "learning_rate": 9.938478934770861e-05, | |
| "loss": 1.3888, | |
| "step": 6942 | |
| }, | |
| { | |
| "epoch": 4.770013568521032, | |
| "grad_norm": 12.76234245300293, | |
| "learning_rate": 9.935904145758259e-05, | |
| "loss": 1.415, | |
| "step": 7031 | |
| }, | |
| { | |
| "epoch": 4.800542740841248, | |
| "eval_accuracy": 0.19140625, | |
| "eval_loss": 3.4613265991210938, | |
| "eval_runtime": 18.4, | |
| "eval_samples_per_second": 27.826, | |
| "eval_steps_per_second": 0.217, | |
| "step": 7076 | |
| }, | |
| { | |
| "epoch": 4.830393487109905, | |
| "grad_norm": 12.034464836120605, | |
| "learning_rate": 9.933276922713619e-05, | |
| "loss": 1.3772, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 4.890773405698779, | |
| "grad_norm": 10.00120735168457, | |
| "learning_rate": 9.930597293545891e-05, | |
| "loss": 1.3427, | |
| "step": 7209 | |
| }, | |
| { | |
| "epoch": 4.951153324287652, | |
| "grad_norm": 10.592118263244629, | |
| "learning_rate": 9.927865286720734e-05, | |
| "loss": 1.3681, | |
| "step": 7298 | |
| }, | |
| { | |
| "epoch": 5.011533242876526, | |
| "grad_norm": 10.557718276977539, | |
| "learning_rate": 9.925080931260211e-05, | |
| "loss": 1.3345, | |
| "step": 7387 | |
| }, | |
| { | |
| "epoch": 5.0719131614654005, | |
| "grad_norm": 8.036050796508789, | |
| "learning_rate": 9.922244256742491e-05, | |
| "loss": 1.1945, | |
| "step": 7476 | |
| }, | |
| { | |
| "epoch": 5.132293080054274, | |
| "grad_norm": 8.104681015014648, | |
| "learning_rate": 9.919355293301515e-05, | |
| "loss": 1.191, | |
| "step": 7565 | |
| }, | |
| { | |
| "epoch": 5.192672998643148, | |
| "grad_norm": 8.685461044311523, | |
| "learning_rate": 9.916414071626704e-05, | |
| "loss": 1.1867, | |
| "step": 7654 | |
| }, | |
| { | |
| "epoch": 5.253052917232022, | |
| "grad_norm": 7.221011638641357, | |
| "learning_rate": 9.913420622962606e-05, | |
| "loss": 1.1737, | |
| "step": 7743 | |
| }, | |
| { | |
| "epoch": 5.313432835820896, | |
| "grad_norm": 9.594326972961426, | |
| "learning_rate": 9.910374979108579e-05, | |
| "loss": 1.2058, | |
| "step": 7832 | |
| }, | |
| { | |
| "epoch": 5.373812754409769, | |
| "grad_norm": 8.146512031555176, | |
| "learning_rate": 9.907277172418449e-05, | |
| "loss": 1.2173, | |
| "step": 7921 | |
| }, | |
| { | |
| "epoch": 5.434192672998643, | |
| "grad_norm": 8.147337913513184, | |
| "learning_rate": 9.904127235800169e-05, | |
| "loss": 1.2047, | |
| "step": 8010 | |
| }, | |
| { | |
| "epoch": 5.4945725915875165, | |
| "grad_norm": 10.820967674255371, | |
| "learning_rate": 9.900925202715468e-05, | |
| "loss": 1.2287, | |
| "step": 8099 | |
| }, | |
| { | |
| "epoch": 5.554952510176391, | |
| "grad_norm": 10.343106269836426, | |
| "learning_rate": 9.897671107179488e-05, | |
| "loss": 1.1927, | |
| "step": 8188 | |
| }, | |
| { | |
| "epoch": 5.615332428765265, | |
| "grad_norm": 10.41408920288086, | |
| "learning_rate": 9.894364983760439e-05, | |
| "loss": 1.2321, | |
| "step": 8277 | |
| }, | |
| { | |
| "epoch": 5.675712347354138, | |
| "grad_norm": 9.299535751342773, | |
| "learning_rate": 9.891006867579217e-05, | |
| "loss": 1.2012, | |
| "step": 8366 | |
| }, | |
| { | |
| "epoch": 5.736092265943013, | |
| "grad_norm": 10.728792190551758, | |
| "learning_rate": 9.887596794309035e-05, | |
| "loss": 1.1812, | |
| "step": 8455 | |
| }, | |
| { | |
| "epoch": 5.796472184531886, | |
| "grad_norm": 7.432964324951172, | |
| "learning_rate": 9.884134800175053e-05, | |
| "loss": 1.1521, | |
| "step": 8544 | |
| }, | |
| { | |
| "epoch": 5.85685210312076, | |
| "grad_norm": 7.614875316619873, | |
| "learning_rate": 9.880620921953974e-05, | |
| "loss": 1.1487, | |
| "step": 8633 | |
| }, | |
| { | |
| "epoch": 5.9172320217096335, | |
| "grad_norm": 10.49835205078125, | |
| "learning_rate": 9.877055196973674e-05, | |
| "loss": 1.2014, | |
| "step": 8722 | |
| }, | |
| { | |
| "epoch": 5.977611940298507, | |
| "grad_norm": 7.404662609100342, | |
| "learning_rate": 9.873437663112794e-05, | |
| "loss": 1.1821, | |
| "step": 8811 | |
| }, | |
| { | |
| "epoch": 6.0379918588873815, | |
| "grad_norm": 9.661059379577637, | |
| "learning_rate": 9.869768358800339e-05, | |
| "loss": 1.0712, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 6.098371777476255, | |
| "grad_norm": 11.615382194519043, | |
| "learning_rate": 9.866047323015269e-05, | |
| "loss": 1.0516, | |
| "step": 8989 | |
| }, | |
| { | |
| "epoch": 6.158751696065129, | |
| "grad_norm": 10.21226978302002, | |
| "learning_rate": 9.86227459528609e-05, | |
| "loss": 1.0813, | |
| "step": 9078 | |
| }, | |
| { | |
| "epoch": 6.219131614654002, | |
| "grad_norm": 7.748682975769043, | |
| "learning_rate": 9.85845021569043e-05, | |
| "loss": 1.0604, | |
| "step": 9167 | |
| }, | |
| { | |
| "epoch": 6.279511533242877, | |
| "grad_norm": 10.797855377197266, | |
| "learning_rate": 9.854574224854611e-05, | |
| "loss": 1.0417, | |
| "step": 9256 | |
| }, | |
| { | |
| "epoch": 6.33989145183175, | |
| "grad_norm": 9.862196922302246, | |
| "learning_rate": 9.850646663953227e-05, | |
| "loss": 1.0171, | |
| "step": 9345 | |
| }, | |
| { | |
| "epoch": 6.400271370420624, | |
| "grad_norm": 10.341273307800293, | |
| "learning_rate": 9.84666757470869e-05, | |
| "loss": 1.0216, | |
| "step": 9434 | |
| }, | |
| { | |
| "epoch": 6.460651289009498, | |
| "grad_norm": 7.858868598937988, | |
| "learning_rate": 9.842636999390807e-05, | |
| "loss": 1.0705, | |
| "step": 9523 | |
| }, | |
| { | |
| "epoch": 6.521031207598372, | |
| "grad_norm": 10.367132186889648, | |
| "learning_rate": 9.838554980816312e-05, | |
| "loss": 1.0489, | |
| "step": 9612 | |
| }, | |
| { | |
| "epoch": 6.581411126187246, | |
| "grad_norm": 13.918916702270508, | |
| "learning_rate": 9.834421562348428e-05, | |
| "loss": 1.0753, | |
| "step": 9701 | |
| }, | |
| { | |
| "epoch": 6.641791044776119, | |
| "grad_norm": 9.345829010009766, | |
| "learning_rate": 9.830236787896391e-05, | |
| "loss": 1.0584, | |
| "step": 9790 | |
| }, | |
| { | |
| "epoch": 6.702170963364993, | |
| "grad_norm": 12.244129180908203, | |
| "learning_rate": 9.826000701914998e-05, | |
| "loss": 1.0402, | |
| "step": 9879 | |
| }, | |
| { | |
| "epoch": 6.762550881953867, | |
| "grad_norm": 8.918442726135254, | |
| "learning_rate": 9.821713349404119e-05, | |
| "loss": 1.0522, | |
| "step": 9968 | |
| }, | |
| { | |
| "epoch": 6.822930800542741, | |
| "grad_norm": 8.40239143371582, | |
| "learning_rate": 9.817374775908237e-05, | |
| "loss": 1.0277, | |
| "step": 10057 | |
| }, | |
| { | |
| "epoch": 6.8833107191316145, | |
| "grad_norm": 12.844498634338379, | |
| "learning_rate": 9.812985027515947e-05, | |
| "loss": 1.077, | |
| "step": 10146 | |
| }, | |
| { | |
| "epoch": 6.943690637720488, | |
| "grad_norm": 8.832013130187988, | |
| "learning_rate": 9.808544150859476e-05, | |
| "loss": 1.0239, | |
| "step": 10235 | |
| }, | |
| { | |
| "epoch": 7.004070556309363, | |
| "grad_norm": 5.591742038726807, | |
| "learning_rate": 9.804052193114189e-05, | |
| "loss": 1.0128, | |
| "step": 10324 | |
| }, | |
| { | |
| "epoch": 7.064450474898236, | |
| "grad_norm": 6.905234336853027, | |
| "learning_rate": 9.799509201998083e-05, | |
| "loss": 0.9019, | |
| "step": 10413 | |
| }, | |
| { | |
| "epoch": 7.12483039348711, | |
| "grad_norm": 9.313871383666992, | |
| "learning_rate": 9.794915225771279e-05, | |
| "loss": 0.9515, | |
| "step": 10502 | |
| }, | |
| { | |
| "epoch": 7.185210312075983, | |
| "grad_norm": 8.015510559082031, | |
| "learning_rate": 9.790270313235517e-05, | |
| "loss": 0.9301, | |
| "step": 10591 | |
| }, | |
| { | |
| "epoch": 7.200814111261873, | |
| "eval_accuracy": 0.17578125, | |
| "eval_loss": 3.7450790405273438, | |
| "eval_runtime": 18.7511, | |
| "eval_samples_per_second": 27.305, | |
| "eval_steps_per_second": 0.213, | |
| "step": 10614 | |
| }, | |
| { | |
| "epoch": 7.245590230664858, | |
| "grad_norm": 7.311409950256348, | |
| "learning_rate": 9.785574513733625e-05, | |
| "loss": 0.9172, | |
| "step": 10680 | |
| }, | |
| { | |
| "epoch": 7.3059701492537314, | |
| "grad_norm": 5.454108238220215, | |
| "learning_rate": 9.780827877149013e-05, | |
| "loss": 0.9372, | |
| "step": 10769 | |
| }, | |
| { | |
| "epoch": 7.366350067842605, | |
| "grad_norm": 5.830528736114502, | |
| "learning_rate": 9.776030453905122e-05, | |
| "loss": 0.9163, | |
| "step": 10858 | |
| }, | |
| { | |
| "epoch": 7.426729986431479, | |
| "grad_norm": 9.309490203857422, | |
| "learning_rate": 9.771182294964905e-05, | |
| "loss": 0.9528, | |
| "step": 10947 | |
| }, | |
| { | |
| "epoch": 7.487109905020353, | |
| "grad_norm": 11.420437812805176, | |
| "learning_rate": 9.76628345183028e-05, | |
| "loss": 0.9198, | |
| "step": 11036 | |
| }, | |
| { | |
| "epoch": 7.547489823609227, | |
| "grad_norm": 11.052990913391113, | |
| "learning_rate": 9.761333976541578e-05, | |
| "loss": 0.9231, | |
| "step": 11125 | |
| }, | |
| { | |
| "epoch": 7.6078697421981, | |
| "grad_norm": 7.378238201141357, | |
| "learning_rate": 9.756333921676999e-05, | |
| "loss": 0.9452, | |
| "step": 11214 | |
| }, | |
| { | |
| "epoch": 7.668249660786974, | |
| "grad_norm": 11.708273887634277, | |
| "learning_rate": 9.751283340352044e-05, | |
| "loss": 0.9163, | |
| "step": 11303 | |
| }, | |
| { | |
| "epoch": 7.728629579375848, | |
| "grad_norm": 5.919505596160889, | |
| "learning_rate": 9.746182286218964e-05, | |
| "loss": 0.9254, | |
| "step": 11392 | |
| }, | |
| { | |
| "epoch": 7.789009497964722, | |
| "grad_norm": 10.179853439331055, | |
| "learning_rate": 9.741030813466172e-05, | |
| "loss": 0.9317, | |
| "step": 11481 | |
| }, | |
| { | |
| "epoch": 7.849389416553596, | |
| "grad_norm": 8.873759269714355, | |
| "learning_rate": 9.735828976817683e-05, | |
| "loss": 0.9474, | |
| "step": 11570 | |
| }, | |
| { | |
| "epoch": 7.909769335142469, | |
| "grad_norm": 6.65983772277832, | |
| "learning_rate": 9.730576831532528e-05, | |
| "loss": 0.9013, | |
| "step": 11659 | |
| }, | |
| { | |
| "epoch": 7.970149253731344, | |
| "grad_norm": 7.311088562011719, | |
| "learning_rate": 9.725274433404164e-05, | |
| "loss": 0.9119, | |
| "step": 11748 | |
| }, | |
| { | |
| "epoch": 8.030529172320216, | |
| "grad_norm": 10.026205062866211, | |
| "learning_rate": 9.719921838759878e-05, | |
| "loss": 0.876, | |
| "step": 11837 | |
| }, | |
| { | |
| "epoch": 8.090909090909092, | |
| "grad_norm": 8.08633804321289, | |
| "learning_rate": 9.714519104460202e-05, | |
| "loss": 0.8151, | |
| "step": 11926 | |
| }, | |
| { | |
| "epoch": 8.151289009497965, | |
| "grad_norm": 6.680150508880615, | |
| "learning_rate": 9.709066287898298e-05, | |
| "loss": 0.8111, | |
| "step": 12015 | |
| }, | |
| { | |
| "epoch": 8.211668928086839, | |
| "grad_norm": 8.399514198303223, | |
| "learning_rate": 9.70356344699935e-05, | |
| "loss": 0.8207, | |
| "step": 12104 | |
| }, | |
| { | |
| "epoch": 8.272048846675712, | |
| "grad_norm": 10.127174377441406, | |
| "learning_rate": 9.698010640219951e-05, | |
| "loss": 0.84, | |
| "step": 12193 | |
| }, | |
| { | |
| "epoch": 8.332428765264586, | |
| "grad_norm": 7.315372943878174, | |
| "learning_rate": 9.692407926547478e-05, | |
| "loss": 0.8473, | |
| "step": 12282 | |
| }, | |
| { | |
| "epoch": 8.39280868385346, | |
| "grad_norm": 11.611318588256836, | |
| "learning_rate": 9.686755365499471e-05, | |
| "loss": 0.8423, | |
| "step": 12371 | |
| }, | |
| { | |
| "epoch": 8.453188602442333, | |
| "grad_norm": 7.9076008796691895, | |
| "learning_rate": 9.681053017122996e-05, | |
| "loss": 0.8445, | |
| "step": 12460 | |
| }, | |
| { | |
| "epoch": 8.513568521031207, | |
| "grad_norm": 9.092277526855469, | |
| "learning_rate": 9.675300941994012e-05, | |
| "loss": 0.8652, | |
| "step": 12549 | |
| }, | |
| { | |
| "epoch": 8.573948439620082, | |
| "grad_norm": 8.704888343811035, | |
| "learning_rate": 9.669499201216723e-05, | |
| "loss": 0.8312, | |
| "step": 12638 | |
| }, | |
| { | |
| "epoch": 8.634328358208956, | |
| "grad_norm": 13.215127944946289, | |
| "learning_rate": 9.663647856422928e-05, | |
| "loss": 0.8306, | |
| "step": 12727 | |
| }, | |
| { | |
| "epoch": 8.69470827679783, | |
| "grad_norm": 6.171853542327881, | |
| "learning_rate": 9.657746969771371e-05, | |
| "loss": 0.8504, | |
| "step": 12816 | |
| }, | |
| { | |
| "epoch": 8.755088195386703, | |
| "grad_norm": 9.066251754760742, | |
| "learning_rate": 9.651796603947076e-05, | |
| "loss": 0.8711, | |
| "step": 12905 | |
| }, | |
| { | |
| "epoch": 8.815468113975577, | |
| "grad_norm": 7.504266262054443, | |
| "learning_rate": 9.645796822160691e-05, | |
| "loss": 0.8312, | |
| "step": 12994 | |
| }, | |
| { | |
| "epoch": 8.87584803256445, | |
| "grad_norm": 11.219298362731934, | |
| "learning_rate": 9.639747688147798e-05, | |
| "loss": 0.8264, | |
| "step": 13083 | |
| }, | |
| { | |
| "epoch": 8.936227951153324, | |
| "grad_norm": 9.841562271118164, | |
| "learning_rate": 9.633649266168256e-05, | |
| "loss": 0.8097, | |
| "step": 13172 | |
| }, | |
| { | |
| "epoch": 8.996607869742197, | |
| "grad_norm": 6.924744606018066, | |
| "learning_rate": 9.627501621005505e-05, | |
| "loss": 0.8315, | |
| "step": 13261 | |
| }, | |
| { | |
| "epoch": 9.056987788331073, | |
| "grad_norm": 12.85659408569336, | |
| "learning_rate": 9.62130481796588e-05, | |
| "loss": 0.7768, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 9.117367706919946, | |
| "grad_norm": 7.802920341491699, | |
| "learning_rate": 9.615058922877926e-05, | |
| "loss": 0.7363, | |
| "step": 13439 | |
| }, | |
| { | |
| "epoch": 9.17774762550882, | |
| "grad_norm": 5.512497425079346, | |
| "learning_rate": 9.608764002091686e-05, | |
| "loss": 0.7568, | |
| "step": 13528 | |
| }, | |
| { | |
| "epoch": 9.238127544097694, | |
| "grad_norm": 7.84502649307251, | |
| "learning_rate": 9.602420122478004e-05, | |
| "loss": 0.7754, | |
| "step": 13617 | |
| }, | |
| { | |
| "epoch": 9.298507462686567, | |
| "grad_norm": 7.394598484039307, | |
| "learning_rate": 9.596027351427814e-05, | |
| "loss": 0.7862, | |
| "step": 13706 | |
| }, | |
| { | |
| "epoch": 9.35888738127544, | |
| "grad_norm": 8.552702903747559, | |
| "learning_rate": 9.589585756851422e-05, | |
| "loss": 0.7404, | |
| "step": 13795 | |
| }, | |
| { | |
| "epoch": 9.419267299864314, | |
| "grad_norm": 8.93039608001709, | |
| "learning_rate": 9.583095407177788e-05, | |
| "loss": 0.7368, | |
| "step": 13884 | |
| }, | |
| { | |
| "epoch": 9.479647218453188, | |
| "grad_norm": 8.229623794555664, | |
| "learning_rate": 9.576556371353791e-05, | |
| "loss": 0.7699, | |
| "step": 13973 | |
| }, | |
| { | |
| "epoch": 9.540027137042063, | |
| "grad_norm": 10.284710884094238, | |
| "learning_rate": 9.569968718843507e-05, | |
| "loss": 0.7811, | |
| "step": 14062 | |
| }, | |
| { | |
| "epoch": 9.600407055630937, | |
| "grad_norm": 5.939275741577148, | |
| "learning_rate": 9.563332519627466e-05, | |
| "loss": 0.7419, | |
| "step": 14151 | |
| }, | |
| { | |
| "epoch": 9.601085481682496, | |
| "eval_accuracy": 0.20703125, | |
| "eval_loss": 3.737224578857422, | |
| "eval_runtime": 17.1346, | |
| "eval_samples_per_second": 29.881, | |
| "eval_steps_per_second": 0.233, | |
| "step": 14152 | |
| }, | |
| { | |
| "epoch": 9.66078697421981, | |
| "grad_norm": 7.720785140991211, | |
| "learning_rate": 9.556647844201908e-05, | |
| "loss": 0.7578, | |
| "step": 14240 | |
| }, | |
| { | |
| "epoch": 9.721166892808684, | |
| "grad_norm": 7.313141345977783, | |
| "learning_rate": 9.549914763578031e-05, | |
| "loss": 0.7662, | |
| "step": 14329 | |
| }, | |
| { | |
| "epoch": 9.781546811397558, | |
| "grad_norm": 10.582131385803223, | |
| "learning_rate": 9.543133349281248e-05, | |
| "loss": 0.7503, | |
| "step": 14418 | |
| }, | |
| { | |
| "epoch": 9.841926729986431, | |
| "grad_norm": 5.272374153137207, | |
| "learning_rate": 9.536303673350415e-05, | |
| "loss": 0.7729, | |
| "step": 14507 | |
| }, | |
| { | |
| "epoch": 9.902306648575305, | |
| "grad_norm": 6.4560370445251465, | |
| "learning_rate": 9.529425808337074e-05, | |
| "loss": 0.7659, | |
| "step": 14596 | |
| }, | |
| { | |
| "epoch": 9.962686567164178, | |
| "grad_norm": 4.996959686279297, | |
| "learning_rate": 9.522499827304674e-05, | |
| "loss": 0.7348, | |
| "step": 14685 | |
| }, | |
| { | |
| "epoch": 10.023066485753052, | |
| "grad_norm": 5.831302165985107, | |
| "learning_rate": 9.515525803827803e-05, | |
| "loss": 0.7534, | |
| "step": 14774 | |
| }, | |
| { | |
| "epoch": 10.083446404341927, | |
| "grad_norm": 6.166038990020752, | |
| "learning_rate": 9.508503811991405e-05, | |
| "loss": 0.7, | |
| "step": 14863 | |
| }, | |
| { | |
| "epoch": 10.143826322930801, | |
| "grad_norm": 9.589017868041992, | |
| "learning_rate": 9.501433926389986e-05, | |
| "loss": 0.6585, | |
| "step": 14952 | |
| }, | |
| { | |
| "epoch": 10.204206241519675, | |
| "grad_norm": 8.026691436767578, | |
| "learning_rate": 9.49431622212683e-05, | |
| "loss": 0.6973, | |
| "step": 15041 | |
| }, | |
| { | |
| "epoch": 10.264586160108548, | |
| "grad_norm": 8.68213939666748, | |
| "learning_rate": 9.487150774813198e-05, | |
| "loss": 0.698, | |
| "step": 15130 | |
| }, | |
| { | |
| "epoch": 10.324966078697422, | |
| "grad_norm": 11.472238540649414, | |
| "learning_rate": 9.479937660567523e-05, | |
| "loss": 0.7192, | |
| "step": 15219 | |
| }, | |
| { | |
| "epoch": 10.385345997286295, | |
| "grad_norm": 6.372411251068115, | |
| "learning_rate": 9.472676956014605e-05, | |
| "loss": 0.6859, | |
| "step": 15308 | |
| }, | |
| { | |
| "epoch": 10.445725915875169, | |
| "grad_norm": 5.333731174468994, | |
| "learning_rate": 9.465368738284794e-05, | |
| "loss": 0.7025, | |
| "step": 15397 | |
| }, | |
| { | |
| "epoch": 10.506105834464044, | |
| "grad_norm": 7.277047157287598, | |
| "learning_rate": 9.458013085013173e-05, | |
| "loss": 0.7102, | |
| "step": 15486 | |
| }, | |
| { | |
| "epoch": 10.566485753052918, | |
| "grad_norm": 10.157328605651855, | |
| "learning_rate": 9.45061007433873e-05, | |
| "loss": 0.6814, | |
| "step": 15575 | |
| }, | |
| { | |
| "epoch": 10.626865671641792, | |
| "grad_norm": 5.025580883026123, | |
| "learning_rate": 9.443159784903528e-05, | |
| "loss": 0.7038, | |
| "step": 15664 | |
| }, | |
| { | |
| "epoch": 10.687245590230665, | |
| "grad_norm": 7.037330627441406, | |
| "learning_rate": 9.43566229585188e-05, | |
| "loss": 0.6886, | |
| "step": 15753 | |
| }, | |
| { | |
| "epoch": 10.747625508819539, | |
| "grad_norm": 8.00758171081543, | |
| "learning_rate": 9.42811768682949e-05, | |
| "loss": 0.6988, | |
| "step": 15842 | |
| }, | |
| { | |
| "epoch": 10.808005427408412, | |
| "grad_norm": 6.200064659118652, | |
| "learning_rate": 9.42052603798262e-05, | |
| "loss": 0.6872, | |
| "step": 15931 | |
| }, | |
| { | |
| "epoch": 10.868385345997286, | |
| "grad_norm": 7.785628795623779, | |
| "learning_rate": 9.412887429957241e-05, | |
| "loss": 0.7191, | |
| "step": 16020 | |
| }, | |
| { | |
| "epoch": 10.92876526458616, | |
| "grad_norm": 5.606222629547119, | |
| "learning_rate": 9.405201943898162e-05, | |
| "loss": 0.6933, | |
| "step": 16109 | |
| }, | |
| { | |
| "epoch": 10.989145183175033, | |
| "grad_norm": 6.9870147705078125, | |
| "learning_rate": 9.397469661448182e-05, | |
| "loss": 0.6873, | |
| "step": 16198 | |
| }, | |
| { | |
| "epoch": 11.049525101763908, | |
| "grad_norm": 7.700918674468994, | |
| "learning_rate": 9.389690664747214e-05, | |
| "loss": 0.6515, | |
| "step": 16287 | |
| }, | |
| { | |
| "epoch": 11.109905020352782, | |
| "grad_norm": 4.668413162231445, | |
| "learning_rate": 9.38186503643142e-05, | |
| "loss": 0.6484, | |
| "step": 16376 | |
| }, | |
| { | |
| "epoch": 11.170284938941656, | |
| "grad_norm": 9.098540306091309, | |
| "learning_rate": 9.373992859632324e-05, | |
| "loss": 0.6479, | |
| "step": 16465 | |
| }, | |
| { | |
| "epoch": 11.23066485753053, | |
| "grad_norm": 7.96748161315918, | |
| "learning_rate": 9.366074217975938e-05, | |
| "loss": 0.6351, | |
| "step": 16554 | |
| }, | |
| { | |
| "epoch": 11.291044776119403, | |
| "grad_norm": 5.657280921936035, | |
| "learning_rate": 9.358109195581866e-05, | |
| "loss": 0.6362, | |
| "step": 16643 | |
| }, | |
| { | |
| "epoch": 11.351424694708276, | |
| "grad_norm": 7.184754371643066, | |
| "learning_rate": 9.350097877062418e-05, | |
| "loss": 0.6527, | |
| "step": 16732 | |
| }, | |
| { | |
| "epoch": 11.41180461329715, | |
| "grad_norm": 6.7868523597717285, | |
| "learning_rate": 9.342040347521702e-05, | |
| "loss": 0.667, | |
| "step": 16821 | |
| }, | |
| { | |
| "epoch": 11.472184531886024, | |
| "grad_norm": 7.017992973327637, | |
| "learning_rate": 9.333936692554729e-05, | |
| "loss": 0.633, | |
| "step": 16910 | |
| }, | |
| { | |
| "epoch": 11.532564450474899, | |
| "grad_norm": 6.653933048248291, | |
| "learning_rate": 9.325786998246498e-05, | |
| "loss": 0.6404, | |
| "step": 16999 | |
| }, | |
| { | |
| "epoch": 11.592944369063773, | |
| "grad_norm": 6.6855058670043945, | |
| "learning_rate": 9.317591351171082e-05, | |
| "loss": 0.6776, | |
| "step": 17088 | |
| }, | |
| { | |
| "epoch": 11.653324287652646, | |
| "grad_norm": 8.127620697021484, | |
| "learning_rate": 9.309349838390711e-05, | |
| "loss": 0.6385, | |
| "step": 17177 | |
| }, | |
| { | |
| "epoch": 11.71370420624152, | |
| "grad_norm": 7.420390605926514, | |
| "learning_rate": 9.301062547454849e-05, | |
| "loss": 0.6395, | |
| "step": 17266 | |
| }, | |
| { | |
| "epoch": 11.774084124830393, | |
| "grad_norm": 7.517685413360596, | |
| "learning_rate": 9.292729566399252e-05, | |
| "loss": 0.6335, | |
| "step": 17355 | |
| }, | |
| { | |
| "epoch": 11.834464043419267, | |
| "grad_norm": 7.267749786376953, | |
| "learning_rate": 9.284350983745049e-05, | |
| "loss": 0.6607, | |
| "step": 17444 | |
| }, | |
| { | |
| "epoch": 11.89484396200814, | |
| "grad_norm": 7.73004150390625, | |
| "learning_rate": 9.275926888497792e-05, | |
| "loss": 0.6671, | |
| "step": 17533 | |
| }, | |
| { | |
| "epoch": 11.955223880597014, | |
| "grad_norm": 7.934135913848877, | |
| "learning_rate": 9.267457370146513e-05, | |
| "loss": 0.6207, | |
| "step": 17622 | |
| }, | |
| { | |
| "epoch": 12.00135685210312, | |
| "eval_accuracy": 0.19140625, | |
| "eval_loss": 3.8003501892089844, | |
| "eval_runtime": 19.7102, | |
| "eval_samples_per_second": 25.976, | |
| "eval_steps_per_second": 0.203, | |
| "step": 17690 | |
| }, | |
| { | |
| "epoch": 12.01560379918589, | |
| "grad_norm": 5.052128314971924, | |
| "learning_rate": 9.25894251866277e-05, | |
| "loss": 0.6211, | |
| "step": 17711 | |
| }, | |
| { | |
| "epoch": 12.075983717774763, | |
| "grad_norm": 5.490070343017578, | |
| "learning_rate": 9.250382424499698e-05, | |
| "loss": 0.6037, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 12.136363636363637, | |
| "grad_norm": 6.631565570831299, | |
| "learning_rate": 9.241777178591043e-05, | |
| "loss": 0.6032, | |
| "step": 17889 | |
| }, | |
| { | |
| "epoch": 12.19674355495251, | |
| "grad_norm": 6.181819438934326, | |
| "learning_rate": 9.233126872350193e-05, | |
| "loss": 0.5988, | |
| "step": 17978 | |
| }, | |
| { | |
| "epoch": 12.257123473541384, | |
| "grad_norm": 5.3416361808776855, | |
| "learning_rate": 9.224431597669219e-05, | |
| "loss": 0.612, | |
| "step": 18067 | |
| }, | |
| { | |
| "epoch": 12.317503392130257, | |
| "grad_norm": 9.972622871398926, | |
| "learning_rate": 9.215691446917885e-05, | |
| "loss": 0.5976, | |
| "step": 18156 | |
| }, | |
| { | |
| "epoch": 12.377883310719131, | |
| "grad_norm": 6.693090915679932, | |
| "learning_rate": 9.206906512942676e-05, | |
| "loss": 0.6127, | |
| "step": 18245 | |
| }, | |
| { | |
| "epoch": 12.438263229308005, | |
| "grad_norm": 5.006298065185547, | |
| "learning_rate": 9.198076889065806e-05, | |
| "loss": 0.614, | |
| "step": 18334 | |
| }, | |
| { | |
| "epoch": 12.49864314789688, | |
| "grad_norm": 4.5717668533325195, | |
| "learning_rate": 9.189202669084233e-05, | |
| "loss": 0.6026, | |
| "step": 18423 | |
| }, | |
| { | |
| "epoch": 12.559023066485754, | |
| "grad_norm": 7.7340989112854, | |
| "learning_rate": 9.180283947268653e-05, | |
| "loss": 0.589, | |
| "step": 18512 | |
| }, | |
| { | |
| "epoch": 12.619402985074627, | |
| "grad_norm": 6.45162296295166, | |
| "learning_rate": 9.17132081836251e-05, | |
| "loss": 0.5889, | |
| "step": 18601 | |
| }, | |
| { | |
| "epoch": 12.6797829036635, | |
| "grad_norm": 7.008767604827881, | |
| "learning_rate": 9.162313377580979e-05, | |
| "loss": 0.5783, | |
| "step": 18690 | |
| }, | |
| { | |
| "epoch": 12.740162822252374, | |
| "grad_norm": 7.15552282333374, | |
| "learning_rate": 9.153261720609963e-05, | |
| "loss": 0.5953, | |
| "step": 18779 | |
| }, | |
| { | |
| "epoch": 12.800542740841248, | |
| "grad_norm": 5.7486748695373535, | |
| "learning_rate": 9.144165943605072e-05, | |
| "loss": 0.5965, | |
| "step": 18868 | |
| }, | |
| { | |
| "epoch": 12.860922659430122, | |
| "grad_norm": 5.747917652130127, | |
| "learning_rate": 9.135026143190601e-05, | |
| "loss": 0.5875, | |
| "step": 18957 | |
| }, | |
| { | |
| "epoch": 12.921302578018995, | |
| "grad_norm": 7.039977550506592, | |
| "learning_rate": 9.125842416458506e-05, | |
| "loss": 0.5954, | |
| "step": 19046 | |
| }, | |
| { | |
| "epoch": 12.98168249660787, | |
| "grad_norm": 3.8854663372039795, | |
| "learning_rate": 9.116614860967372e-05, | |
| "loss": 0.5818, | |
| "step": 19135 | |
| }, | |
| { | |
| "epoch": 13.042062415196744, | |
| "grad_norm": 5.661801815032959, | |
| "learning_rate": 9.107343574741374e-05, | |
| "loss": 0.5619, | |
| "step": 19224 | |
| }, | |
| { | |
| "epoch": 13.102442333785618, | |
| "grad_norm": 6.757572174072266, | |
| "learning_rate": 9.098028656269243e-05, | |
| "loss": 0.5639, | |
| "step": 19313 | |
| }, | |
| { | |
| "epoch": 13.162822252374491, | |
| "grad_norm": 7.3293352127075195, | |
| "learning_rate": 9.088670204503208e-05, | |
| "loss": 0.5633, | |
| "step": 19402 | |
| }, | |
| { | |
| "epoch": 13.223202170963365, | |
| "grad_norm": 7.053752899169922, | |
| "learning_rate": 9.079268318857957e-05, | |
| "loss": 0.5487, | |
| "step": 19491 | |
| }, | |
| { | |
| "epoch": 13.283582089552239, | |
| "grad_norm": 5.139120101928711, | |
| "learning_rate": 9.069823099209571e-05, | |
| "loss": 0.543, | |
| "step": 19580 | |
| }, | |
| { | |
| "epoch": 13.343962008141112, | |
| "grad_norm": 7.9965314865112305, | |
| "learning_rate": 9.060334645894472e-05, | |
| "loss": 0.5521, | |
| "step": 19669 | |
| }, | |
| { | |
| "epoch": 13.404341926729986, | |
| "grad_norm": 7.904087543487549, | |
| "learning_rate": 9.050803059708348e-05, | |
| "loss": 0.5763, | |
| "step": 19758 | |
| }, | |
| { | |
| "epoch": 13.464721845318861, | |
| "grad_norm": 4.6150221824646, | |
| "learning_rate": 9.041228441905092e-05, | |
| "loss": 0.5492, | |
| "step": 19847 | |
| }, | |
| { | |
| "epoch": 13.525101763907735, | |
| "grad_norm": 4.3521857261657715, | |
| "learning_rate": 9.031610894195715e-05, | |
| "loss": 0.5544, | |
| "step": 19936 | |
| }, | |
| { | |
| "epoch": 13.585481682496608, | |
| "grad_norm": 6.906470775604248, | |
| "learning_rate": 9.021950518747276e-05, | |
| "loss": 0.5922, | |
| "step": 20025 | |
| }, | |
| { | |
| "epoch": 13.645861601085482, | |
| "grad_norm": 7.304365158081055, | |
| "learning_rate": 9.012247418181792e-05, | |
| "loss": 0.5473, | |
| "step": 20114 | |
| }, | |
| { | |
| "epoch": 13.706241519674355, | |
| "grad_norm": 5.015029430389404, | |
| "learning_rate": 9.002501695575148e-05, | |
| "loss": 0.5843, | |
| "step": 20203 | |
| }, | |
| { | |
| "epoch": 13.766621438263229, | |
| "grad_norm": 5.353032112121582, | |
| "learning_rate": 8.992713454455999e-05, | |
| "loss": 0.5423, | |
| "step": 20292 | |
| }, | |
| { | |
| "epoch": 13.827001356852103, | |
| "grad_norm": 4.505341529846191, | |
| "learning_rate": 8.98288279880468e-05, | |
| "loss": 0.5511, | |
| "step": 20381 | |
| }, | |
| { | |
| "epoch": 13.887381275440976, | |
| "grad_norm": 6.68435525894165, | |
| "learning_rate": 8.973009833052087e-05, | |
| "loss": 0.5429, | |
| "step": 20470 | |
| }, | |
| { | |
| "epoch": 13.947761194029852, | |
| "grad_norm": 4.248044490814209, | |
| "learning_rate": 8.963094662078583e-05, | |
| "loss": 0.5637, | |
| "step": 20559 | |
| }, | |
| { | |
| "epoch": 14.008141112618725, | |
| "grad_norm": 4.230225563049316, | |
| "learning_rate": 8.953137391212875e-05, | |
| "loss": 0.5551, | |
| "step": 20648 | |
| }, | |
| { | |
| "epoch": 14.068521031207599, | |
| "grad_norm": 4.81500768661499, | |
| "learning_rate": 8.94313812623089e-05, | |
| "loss": 0.5027, | |
| "step": 20737 | |
| }, | |
| { | |
| "epoch": 14.128900949796472, | |
| "grad_norm": 6.79054594039917, | |
| "learning_rate": 8.933096973354664e-05, | |
| "loss": 0.4904, | |
| "step": 20826 | |
| }, | |
| { | |
| "epoch": 14.189280868385346, | |
| "grad_norm": 4.661177635192871, | |
| "learning_rate": 8.923014039251208e-05, | |
| "loss": 0.5076, | |
| "step": 20915 | |
| }, | |
| { | |
| "epoch": 14.24966078697422, | |
| "grad_norm": 10.014252662658691, | |
| "learning_rate": 8.91288943103137e-05, | |
| "loss": 0.5068, | |
| "step": 21004 | |
| }, | |
| { | |
| "epoch": 14.310040705563093, | |
| "grad_norm": 8.030250549316406, | |
| "learning_rate": 8.902723256248704e-05, | |
| "loss": 0.521, | |
| "step": 21093 | |
| }, | |
| { | |
| "epoch": 14.370420624151967, | |
| "grad_norm": 5.514551162719727, | |
| "learning_rate": 8.892515622898326e-05, | |
| "loss": 0.5053, | |
| "step": 21182 | |
| }, | |
| { | |
| "epoch": 14.401628222523746, | |
| "eval_accuracy": 0.193359375, | |
| "eval_loss": 3.79229736328125, | |
| "eval_runtime": 41.777, | |
| "eval_samples_per_second": 12.256, | |
| "eval_steps_per_second": 0.096, | |
| "step": 21228 | |
| }, | |
| { | |
| "epoch": 14.43080054274084, | |
| "grad_norm": 5.649023056030273, | |
| "learning_rate": 8.882266639415763e-05, | |
| "loss": 0.5103, | |
| "step": 21271 | |
| }, | |
| { | |
| "epoch": 14.491180461329716, | |
| "grad_norm": 6.628403663635254, | |
| "learning_rate": 8.871976414675805e-05, | |
| "loss": 0.5238, | |
| "step": 21360 | |
| }, | |
| { | |
| "epoch": 14.55156037991859, | |
| "grad_norm": 5.387028217315674, | |
| "learning_rate": 8.86164505799135e-05, | |
| "loss": 0.5278, | |
| "step": 21449 | |
| }, | |
| { | |
| "epoch": 14.611940298507463, | |
| "grad_norm": 5.111924171447754, | |
| "learning_rate": 8.851272679112234e-05, | |
| "loss": 0.5269, | |
| "step": 21538 | |
| }, | |
| { | |
| "epoch": 14.672320217096336, | |
| "grad_norm": 5.967355728149414, | |
| "learning_rate": 8.840859388224076e-05, | |
| "loss": 0.5188, | |
| "step": 21627 | |
| }, | |
| { | |
| "epoch": 14.73270013568521, | |
| "grad_norm": 5.387267589569092, | |
| "learning_rate": 8.830405295947102e-05, | |
| "loss": 0.5161, | |
| "step": 21716 | |
| }, | |
| { | |
| "epoch": 14.793080054274084, | |
| "grad_norm": 4.254080772399902, | |
| "learning_rate": 8.81991051333497e-05, | |
| "loss": 0.5228, | |
| "step": 21805 | |
| }, | |
| { | |
| "epoch": 14.853459972862957, | |
| "grad_norm": 3.855088233947754, | |
| "learning_rate": 8.809375151873589e-05, | |
| "loss": 0.5091, | |
| "step": 21894 | |
| }, | |
| { | |
| "epoch": 14.913839891451833, | |
| "grad_norm": 5.05858039855957, | |
| "learning_rate": 8.798799323479938e-05, | |
| "loss": 0.5259, | |
| "step": 21983 | |
| }, | |
| { | |
| "epoch": 14.974219810040706, | |
| "grad_norm": 8.726083755493164, | |
| "learning_rate": 8.788183140500874e-05, | |
| "loss": 0.5171, | |
| "step": 22072 | |
| }, | |
| { | |
| "epoch": 15.03459972862958, | |
| "grad_norm": 5.312582492828369, | |
| "learning_rate": 8.777526715711946e-05, | |
| "loss": 0.4804, | |
| "step": 22161 | |
| }, | |
| { | |
| "epoch": 15.094979647218453, | |
| "grad_norm": 4.794472694396973, | |
| "learning_rate": 8.766830162316183e-05, | |
| "loss": 0.4814, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 15.155359565807327, | |
| "grad_norm": 6.440197944641113, | |
| "learning_rate": 8.756093593942905e-05, | |
| "loss": 0.4829, | |
| "step": 22339 | |
| }, | |
| { | |
| "epoch": 15.2157394843962, | |
| "grad_norm": 4.757099151611328, | |
| "learning_rate": 8.745317124646508e-05, | |
| "loss": 0.4572, | |
| "step": 22428 | |
| }, | |
| { | |
| "epoch": 15.276119402985074, | |
| "grad_norm": 5.3460235595703125, | |
| "learning_rate": 8.734500868905258e-05, | |
| "loss": 0.476, | |
| "step": 22517 | |
| }, | |
| { | |
| "epoch": 15.336499321573948, | |
| "grad_norm": 4.173645496368408, | |
| "learning_rate": 8.723644941620065e-05, | |
| "loss": 0.4829, | |
| "step": 22606 | |
| }, | |
| { | |
| "epoch": 15.396879240162821, | |
| "grad_norm": 8.921795845031738, | |
| "learning_rate": 8.71274945811328e-05, | |
| "loss": 0.4758, | |
| "step": 22695 | |
| }, | |
| { | |
| "epoch": 15.457259158751697, | |
| "grad_norm": 5.059213161468506, | |
| "learning_rate": 8.701814534127446e-05, | |
| "loss": 0.4516, | |
| "step": 22784 | |
| }, | |
| { | |
| "epoch": 15.51763907734057, | |
| "grad_norm": 6.460654258728027, | |
| "learning_rate": 8.690840285824094e-05, | |
| "loss": 0.4946, | |
| "step": 22873 | |
| }, | |
| { | |
| "epoch": 15.578018995929444, | |
| "grad_norm": 5.588746547698975, | |
| "learning_rate": 8.679826829782485e-05, | |
| "loss": 0.5096, | |
| "step": 22962 | |
| }, | |
| { | |
| "epoch": 15.638398914518318, | |
| "grad_norm": 4.974047660827637, | |
| "learning_rate": 8.668774282998394e-05, | |
| "loss": 0.491, | |
| "step": 23051 | |
| }, | |
| { | |
| "epoch": 15.698778833107191, | |
| "grad_norm": 4.4067463874816895, | |
| "learning_rate": 8.65768276288285e-05, | |
| "loss": 0.487, | |
| "step": 23140 | |
| }, | |
| { | |
| "epoch": 15.759158751696065, | |
| "grad_norm": 5.659997463226318, | |
| "learning_rate": 8.646552387260898e-05, | |
| "loss": 0.4895, | |
| "step": 23229 | |
| }, | |
| { | |
| "epoch": 15.819538670284938, | |
| "grad_norm": 5.777614593505859, | |
| "learning_rate": 8.635383274370341e-05, | |
| "loss": 0.4951, | |
| "step": 23318 | |
| }, | |
| { | |
| "epoch": 15.879918588873814, | |
| "grad_norm": 6.594443321228027, | |
| "learning_rate": 8.62417554286049e-05, | |
| "loss": 0.4871, | |
| "step": 23407 | |
| }, | |
| { | |
| "epoch": 15.940298507462687, | |
| "grad_norm": 4.5751237869262695, | |
| "learning_rate": 8.612929311790899e-05, | |
| "loss": 0.5005, | |
| "step": 23496 | |
| }, | |
| { | |
| "epoch": 16.00067842605156, | |
| "grad_norm": 4.56909704208374, | |
| "learning_rate": 8.601644700630107e-05, | |
| "loss": 0.4875, | |
| "step": 23585 | |
| }, | |
| { | |
| "epoch": 16.061058344640433, | |
| "grad_norm": 5.793113708496094, | |
| "learning_rate": 8.590321829254358e-05, | |
| "loss": 0.4592, | |
| "step": 23674 | |
| }, | |
| { | |
| "epoch": 16.121438263229308, | |
| "grad_norm": 3.888392686843872, | |
| "learning_rate": 8.578960817946338e-05, | |
| "loss": 0.4343, | |
| "step": 23763 | |
| }, | |
| { | |
| "epoch": 16.181818181818183, | |
| "grad_norm": 3.910721778869629, | |
| "learning_rate": 8.567561787393888e-05, | |
| "loss": 0.4499, | |
| "step": 23852 | |
| }, | |
| { | |
| "epoch": 16.242198100407055, | |
| "grad_norm": 7.085721492767334, | |
| "learning_rate": 8.556124858688734e-05, | |
| "loss": 0.4391, | |
| "step": 23941 | |
| }, | |
| { | |
| "epoch": 16.30257801899593, | |
| "grad_norm": 6.454195022583008, | |
| "learning_rate": 8.54465015332519e-05, | |
| "loss": 0.4378, | |
| "step": 24030 | |
| }, | |
| { | |
| "epoch": 16.362957937584802, | |
| "grad_norm": 3.5428030490875244, | |
| "learning_rate": 8.533137793198866e-05, | |
| "loss": 0.4511, | |
| "step": 24119 | |
| }, | |
| { | |
| "epoch": 16.423337856173678, | |
| "grad_norm": 3.401646614074707, | |
| "learning_rate": 8.521587900605385e-05, | |
| "loss": 0.4642, | |
| "step": 24208 | |
| }, | |
| { | |
| "epoch": 16.48371777476255, | |
| "grad_norm": 6.838740825653076, | |
| "learning_rate": 8.510000598239075e-05, | |
| "loss": 0.4584, | |
| "step": 24297 | |
| }, | |
| { | |
| "epoch": 16.544097693351425, | |
| "grad_norm": 5.186567306518555, | |
| "learning_rate": 8.498376009191665e-05, | |
| "loss": 0.4741, | |
| "step": 24386 | |
| }, | |
| { | |
| "epoch": 16.604477611940297, | |
| "grad_norm": 3.8350930213928223, | |
| "learning_rate": 8.486714256950983e-05, | |
| "loss": 0.4475, | |
| "step": 24475 | |
| }, | |
| { | |
| "epoch": 16.664857530529172, | |
| "grad_norm": 5.290257453918457, | |
| "learning_rate": 8.475015465399638e-05, | |
| "loss": 0.4544, | |
| "step": 24564 | |
| }, | |
| { | |
| "epoch": 16.725237449118048, | |
| "grad_norm": 5.533965587615967, | |
| "learning_rate": 8.463279758813711e-05, | |
| "loss": 0.457, | |
| "step": 24653 | |
| }, | |
| { | |
| "epoch": 16.78561736770692, | |
| "grad_norm": 5.372981071472168, | |
| "learning_rate": 8.451507261861425e-05, | |
| "loss": 0.4537, | |
| "step": 24742 | |
| }, | |
| { | |
| "epoch": 16.80189959294437, | |
| "eval_accuracy": 0.16015625, | |
| "eval_loss": 3.9037704467773438, | |
| "eval_runtime": 39.2353, | |
| "eval_samples_per_second": 13.049, | |
| "eval_steps_per_second": 0.102, | |
| "step": 24766 | |
| }, | |
| { | |
| "epoch": 16.845997286295795, | |
| "grad_norm": 5.011608600616455, | |
| "learning_rate": 8.439698099601831e-05, | |
| "loss": 0.452, | |
| "step": 24831 | |
| }, | |
| { | |
| "epoch": 16.906377204884667, | |
| "grad_norm": 5.051270484924316, | |
| "learning_rate": 8.427852397483475e-05, | |
| "loss": 0.4493, | |
| "step": 24920 | |
| }, | |
| { | |
| "epoch": 16.966757123473542, | |
| "grad_norm": 3.670827627182007, | |
| "learning_rate": 8.415970281343061e-05, | |
| "loss": 0.4476, | |
| "step": 25009 | |
| }, | |
| { | |
| "epoch": 17.027137042062414, | |
| "grad_norm": 2.6706955432891846, | |
| "learning_rate": 8.404051877404126e-05, | |
| "loss": 0.4478, | |
| "step": 25098 | |
| }, | |
| { | |
| "epoch": 17.08751696065129, | |
| "grad_norm": 7.5127787590026855, | |
| "learning_rate": 8.392097312275686e-05, | |
| "loss": 0.4244, | |
| "step": 25187 | |
| }, | |
| { | |
| "epoch": 17.147896879240164, | |
| "grad_norm": 3.7548723220825195, | |
| "learning_rate": 8.380106712950896e-05, | |
| "loss": 0.4289, | |
| "step": 25276 | |
| }, | |
| { | |
| "epoch": 17.208276797829036, | |
| "grad_norm": 3.9628028869628906, | |
| "learning_rate": 8.368080206805706e-05, | |
| "loss": 0.4337, | |
| "step": 25365 | |
| }, | |
| { | |
| "epoch": 17.26865671641791, | |
| "grad_norm": 4.179431915283203, | |
| "learning_rate": 8.3560179215975e-05, | |
| "loss": 0.4147, | |
| "step": 25454 | |
| }, | |
| { | |
| "epoch": 17.329036635006783, | |
| "grad_norm": 3.3942129611968994, | |
| "learning_rate": 8.343919985463745e-05, | |
| "loss": 0.4175, | |
| "step": 25543 | |
| }, | |
| { | |
| "epoch": 17.38941655359566, | |
| "grad_norm": 4.166045665740967, | |
| "learning_rate": 8.331786526920626e-05, | |
| "loss": 0.423, | |
| "step": 25632 | |
| }, | |
| { | |
| "epoch": 17.44979647218453, | |
| "grad_norm": 3.0724310874938965, | |
| "learning_rate": 8.319617674861682e-05, | |
| "loss": 0.41, | |
| "step": 25721 | |
| }, | |
| { | |
| "epoch": 17.510176390773406, | |
| "grad_norm": 6.462100028991699, | |
| "learning_rate": 8.307413558556437e-05, | |
| "loss": 0.4125, | |
| "step": 25810 | |
| }, | |
| { | |
| "epoch": 17.570556309362278, | |
| "grad_norm": 4.838727951049805, | |
| "learning_rate": 8.295174307649024e-05, | |
| "loss": 0.4254, | |
| "step": 25899 | |
| }, | |
| { | |
| "epoch": 17.630936227951153, | |
| "grad_norm": 3.9609103202819824, | |
| "learning_rate": 8.282900052156817e-05, | |
| "loss": 0.4141, | |
| "step": 25988 | |
| }, | |
| { | |
| "epoch": 17.69131614654003, | |
| "grad_norm": 3.537935972213745, | |
| "learning_rate": 8.270590922469037e-05, | |
| "loss": 0.4189, | |
| "step": 26077 | |
| }, | |
| { | |
| "epoch": 17.7516960651289, | |
| "grad_norm": 5.015251159667969, | |
| "learning_rate": 8.258247049345373e-05, | |
| "loss": 0.439, | |
| "step": 26166 | |
| }, | |
| { | |
| "epoch": 17.812075983717776, | |
| "grad_norm": 4.997931957244873, | |
| "learning_rate": 8.245868563914598e-05, | |
| "loss": 0.4079, | |
| "step": 26255 | |
| }, | |
| { | |
| "epoch": 17.872455902306648, | |
| "grad_norm": 5.362955093383789, | |
| "learning_rate": 8.233455597673165e-05, | |
| "loss": 0.4165, | |
| "step": 26344 | |
| }, | |
| { | |
| "epoch": 17.932835820895523, | |
| "grad_norm": 6.1235880851745605, | |
| "learning_rate": 8.22100828248382e-05, | |
| "loss": 0.4121, | |
| "step": 26433 | |
| }, | |
| { | |
| "epoch": 17.993215739484395, | |
| "grad_norm": 4.939189434051514, | |
| "learning_rate": 8.208526750574199e-05, | |
| "loss": 0.4191, | |
| "step": 26522 | |
| }, | |
| { | |
| "epoch": 18.05359565807327, | |
| "grad_norm": 4.338520050048828, | |
| "learning_rate": 8.196011134535416e-05, | |
| "loss": 0.369, | |
| "step": 26611 | |
| }, | |
| { | |
| "epoch": 18.113975576662146, | |
| "grad_norm": 4.328836441040039, | |
| "learning_rate": 8.183461567320662e-05, | |
| "loss": 0.3939, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 18.174355495251017, | |
| "grad_norm": 3.7861499786376953, | |
| "learning_rate": 8.170878182243792e-05, | |
| "loss": 0.3841, | |
| "step": 26789 | |
| }, | |
| { | |
| "epoch": 18.234735413839893, | |
| "grad_norm": 4.84774112701416, | |
| "learning_rate": 8.158261112977913e-05, | |
| "loss": 0.3702, | |
| "step": 26878 | |
| }, | |
| { | |
| "epoch": 18.295115332428765, | |
| "grad_norm": 7.082802772521973, | |
| "learning_rate": 8.145610493553948e-05, | |
| "loss": 0.4059, | |
| "step": 26967 | |
| }, | |
| { | |
| "epoch": 18.35549525101764, | |
| "grad_norm": 2.84909987449646, | |
| "learning_rate": 8.13292645835923e-05, | |
| "loss": 0.41, | |
| "step": 27056 | |
| }, | |
| { | |
| "epoch": 18.41587516960651, | |
| "grad_norm": 4.116001605987549, | |
| "learning_rate": 8.120209142136065e-05, | |
| "loss": 0.4014, | |
| "step": 27145 | |
| }, | |
| { | |
| "epoch": 18.476255088195387, | |
| "grad_norm": 4.0977783203125, | |
| "learning_rate": 8.107458679980302e-05, | |
| "loss": 0.4041, | |
| "step": 27234 | |
| }, | |
| { | |
| "epoch": 18.53663500678426, | |
| "grad_norm": 9.48543930053711, | |
| "learning_rate": 8.0946752073399e-05, | |
| "loss": 0.3979, | |
| "step": 27323 | |
| }, | |
| { | |
| "epoch": 18.597014925373134, | |
| "grad_norm": 3.692593574523926, | |
| "learning_rate": 8.081858860013488e-05, | |
| "loss": 0.4034, | |
| "step": 27412 | |
| }, | |
| { | |
| "epoch": 18.65739484396201, | |
| "grad_norm": 3.500662326812744, | |
| "learning_rate": 8.069009774148923e-05, | |
| "loss": 0.3884, | |
| "step": 27501 | |
| }, | |
| { | |
| "epoch": 18.71777476255088, | |
| "grad_norm": 3.7085442543029785, | |
| "learning_rate": 8.056128086241841e-05, | |
| "loss": 0.3829, | |
| "step": 27590 | |
| }, | |
| { | |
| "epoch": 18.778154681139757, | |
| "grad_norm": 4.753846168518066, | |
| "learning_rate": 8.043213933134208e-05, | |
| "loss": 0.4079, | |
| "step": 27679 | |
| }, | |
| { | |
| "epoch": 18.83853459972863, | |
| "grad_norm": 3.4297168254852295, | |
| "learning_rate": 8.030267452012872e-05, | |
| "loss": 0.3934, | |
| "step": 27768 | |
| }, | |
| { | |
| "epoch": 18.898914518317504, | |
| "grad_norm": 5.62887716293335, | |
| "learning_rate": 8.017288780408096e-05, | |
| "loss": 0.4036, | |
| "step": 27857 | |
| }, | |
| { | |
| "epoch": 18.959294436906376, | |
| "grad_norm": 3.0904860496520996, | |
| "learning_rate": 8.004278056192107e-05, | |
| "loss": 0.3933, | |
| "step": 27946 | |
| }, | |
| { | |
| "epoch": 19.01967435549525, | |
| "grad_norm": 4.35064697265625, | |
| "learning_rate": 7.991235417577621e-05, | |
| "loss": 0.3759, | |
| "step": 28035 | |
| }, | |
| { | |
| "epoch": 19.080054274084127, | |
| "grad_norm": 5.101808547973633, | |
| "learning_rate": 7.978161003116382e-05, | |
| "loss": 0.3693, | |
| "step": 28124 | |
| }, | |
| { | |
| "epoch": 19.140434192673, | |
| "grad_norm": 4.391759395599365, | |
| "learning_rate": 7.96505495169769e-05, | |
| "loss": 0.3472, | |
| "step": 28213 | |
| }, | |
| { | |
| "epoch": 19.200814111261874, | |
| "grad_norm": 4.793941974639893, | |
| "learning_rate": 7.951917402546926e-05, | |
| "loss": 0.3551, | |
| "step": 28302 | |
| }, | |
| { | |
| "epoch": 19.202170963364992, | |
| "eval_accuracy": 0.19140625, | |
| "eval_loss": 3.9524879455566406, | |
| "eval_runtime": 19.8893, | |
| "eval_samples_per_second": 25.742, | |
| "eval_steps_per_second": 0.201, | |
| "step": 28304 | |
| }, | |
| { | |
| "epoch": 19.261194029850746, | |
| "grad_norm": 3.726491689682007, | |
| "learning_rate": 7.938748495224061e-05, | |
| "loss": 0.3555, | |
| "step": 28391 | |
| }, | |
| { | |
| "epoch": 19.32157394843962, | |
| "grad_norm": 3.4001190662384033, | |
| "learning_rate": 7.925548369622199e-05, | |
| "loss": 0.361, | |
| "step": 28480 | |
| }, | |
| { | |
| "epoch": 19.381953867028493, | |
| "grad_norm": 4.480808258056641, | |
| "learning_rate": 7.912317165966059e-05, | |
| "loss": 0.3656, | |
| "step": 28569 | |
| }, | |
| { | |
| "epoch": 19.442333785617368, | |
| "grad_norm": 3.043093681335449, | |
| "learning_rate": 7.899055024810511e-05, | |
| "loss": 0.3819, | |
| "step": 28658 | |
| }, | |
| { | |
| "epoch": 19.50271370420624, | |
| "grad_norm": 3.813091516494751, | |
| "learning_rate": 7.885762087039075e-05, | |
| "loss": 0.3939, | |
| "step": 28747 | |
| }, | |
| { | |
| "epoch": 19.563093622795115, | |
| "grad_norm": 4.2613525390625, | |
| "learning_rate": 7.872438493862415e-05, | |
| "loss": 0.353, | |
| "step": 28836 | |
| }, | |
| { | |
| "epoch": 19.62347354138399, | |
| "grad_norm": 2.884284734725952, | |
| "learning_rate": 7.859084386816854e-05, | |
| "loss": 0.3696, | |
| "step": 28925 | |
| }, | |
| { | |
| "epoch": 19.683853459972863, | |
| "grad_norm": 6.607941627502441, | |
| "learning_rate": 7.845699907762862e-05, | |
| "loss": 0.3869, | |
| "step": 29014 | |
| }, | |
| { | |
| "epoch": 19.744233378561738, | |
| "grad_norm": 6.069945335388184, | |
| "learning_rate": 7.832285198883548e-05, | |
| "loss": 0.3688, | |
| "step": 29103 | |
| }, | |
| { | |
| "epoch": 19.80461329715061, | |
| "grad_norm": 2.9537928104400635, | |
| "learning_rate": 7.818840402683151e-05, | |
| "loss": 0.3624, | |
| "step": 29192 | |
| }, | |
| { | |
| "epoch": 19.864993215739485, | |
| "grad_norm": 4.354130268096924, | |
| "learning_rate": 7.805365661985535e-05, | |
| "loss": 0.3589, | |
| "step": 29281 | |
| }, | |
| { | |
| "epoch": 19.925373134328357, | |
| "grad_norm": 3.5923469066619873, | |
| "learning_rate": 7.791861119932652e-05, | |
| "loss": 0.3432, | |
| "step": 29370 | |
| }, | |
| { | |
| "epoch": 19.985753052917232, | |
| "grad_norm": 3.5997955799102783, | |
| "learning_rate": 7.778326919983046e-05, | |
| "loss": 0.3611, | |
| "step": 29459 | |
| }, | |
| { | |
| "epoch": 20.046132971506104, | |
| "grad_norm": 2.281196355819702, | |
| "learning_rate": 7.764763205910304e-05, | |
| "loss": 0.3296, | |
| "step": 29548 | |
| }, | |
| { | |
| "epoch": 20.10651289009498, | |
| "grad_norm": 7.429330348968506, | |
| "learning_rate": 7.75117012180155e-05, | |
| "loss": 0.34, | |
| "step": 29637 | |
| }, | |
| { | |
| "epoch": 20.166892808683855, | |
| "grad_norm": 7.913335800170898, | |
| "learning_rate": 7.737547812055901e-05, | |
| "loss": 0.3428, | |
| "step": 29726 | |
| }, | |
| { | |
| "epoch": 20.227272727272727, | |
| "grad_norm": 2.8572380542755127, | |
| "learning_rate": 7.723896421382942e-05, | |
| "loss": 0.3394, | |
| "step": 29815 | |
| }, | |
| { | |
| "epoch": 20.287652645861602, | |
| "grad_norm": 2.90544056892395, | |
| "learning_rate": 7.710216094801179e-05, | |
| "loss": 0.3322, | |
| "step": 29904 | |
| }, | |
| { | |
| "epoch": 20.348032564450474, | |
| "grad_norm": 7.801008224487305, | |
| "learning_rate": 7.696506977636506e-05, | |
| "loss": 0.343, | |
| "step": 29993 | |
| }, | |
| { | |
| "epoch": 20.40841248303935, | |
| "grad_norm": 4.56928014755249, | |
| "learning_rate": 7.682769215520658e-05, | |
| "loss": 0.3513, | |
| "step": 30082 | |
| }, | |
| { | |
| "epoch": 20.46879240162822, | |
| "grad_norm": 3.2972512245178223, | |
| "learning_rate": 7.669002954389668e-05, | |
| "loss": 0.3361, | |
| "step": 30171 | |
| }, | |
| { | |
| "epoch": 20.529172320217096, | |
| "grad_norm": 2.6529455184936523, | |
| "learning_rate": 7.65520834048231e-05, | |
| "loss": 0.3481, | |
| "step": 30260 | |
| }, | |
| { | |
| "epoch": 20.58955223880597, | |
| "grad_norm": 2.281811475753784, | |
| "learning_rate": 7.641385520338551e-05, | |
| "loss": 0.3439, | |
| "step": 30349 | |
| }, | |
| { | |
| "epoch": 20.649932157394844, | |
| "grad_norm": 5.415365695953369, | |
| "learning_rate": 7.627534640797991e-05, | |
| "loss": 0.3426, | |
| "step": 30438 | |
| }, | |
| { | |
| "epoch": 20.71031207598372, | |
| "grad_norm": 4.79844856262207, | |
| "learning_rate": 7.613655848998305e-05, | |
| "loss": 0.3237, | |
| "step": 30527 | |
| }, | |
| { | |
| "epoch": 20.77069199457259, | |
| "grad_norm": 4.5184855461120605, | |
| "learning_rate": 7.599749292373679e-05, | |
| "loss": 0.3433, | |
| "step": 30616 | |
| }, | |
| { | |
| "epoch": 20.831071913161466, | |
| "grad_norm": 3.099209785461426, | |
| "learning_rate": 7.585815118653248e-05, | |
| "loss": 0.329, | |
| "step": 30705 | |
| }, | |
| { | |
| "epoch": 20.891451831750338, | |
| "grad_norm": 2.415534257888794, | |
| "learning_rate": 7.571853475859519e-05, | |
| "loss": 0.3377, | |
| "step": 30794 | |
| }, | |
| { | |
| "epoch": 20.951831750339213, | |
| "grad_norm": 4.010440349578857, | |
| "learning_rate": 7.557864512306802e-05, | |
| "loss": 0.3375, | |
| "step": 30883 | |
| }, | |
| { | |
| "epoch": 21.012211668928085, | |
| "grad_norm": 3.8156368732452393, | |
| "learning_rate": 7.543848376599637e-05, | |
| "loss": 0.3216, | |
| "step": 30972 | |
| }, | |
| { | |
| "epoch": 21.07259158751696, | |
| "grad_norm": 8.568528175354004, | |
| "learning_rate": 7.529805217631214e-05, | |
| "loss": 0.3043, | |
| "step": 31061 | |
| }, | |
| { | |
| "epoch": 21.132971506105836, | |
| "grad_norm": 5.376992225646973, | |
| "learning_rate": 7.515735184581791e-05, | |
| "loss": 0.3175, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 21.193351424694708, | |
| "grad_norm": 2.6105728149414062, | |
| "learning_rate": 7.501638426917106e-05, | |
| "loss": 0.3105, | |
| "step": 31239 | |
| }, | |
| { | |
| "epoch": 21.253731343283583, | |
| "grad_norm": 2.6053969860076904, | |
| "learning_rate": 7.487515094386792e-05, | |
| "loss": 0.3002, | |
| "step": 31328 | |
| }, | |
| { | |
| "epoch": 21.314111261872455, | |
| "grad_norm": 2.5073657035827637, | |
| "learning_rate": 7.473365337022791e-05, | |
| "loss": 0.3172, | |
| "step": 31417 | |
| }, | |
| { | |
| "epoch": 21.37449118046133, | |
| "grad_norm": 2.63193941116333, | |
| "learning_rate": 7.459189305137751e-05, | |
| "loss": 0.3183, | |
| "step": 31506 | |
| }, | |
| { | |
| "epoch": 21.434871099050202, | |
| "grad_norm": 2.9518582820892334, | |
| "learning_rate": 7.444987149323435e-05, | |
| "loss": 0.302, | |
| "step": 31595 | |
| }, | |
| { | |
| "epoch": 21.495251017639077, | |
| "grad_norm": 3.682440757751465, | |
| "learning_rate": 7.430759020449123e-05, | |
| "loss": 0.3106, | |
| "step": 31684 | |
| }, | |
| { | |
| "epoch": 21.555630936227953, | |
| "grad_norm": 3.564025402069092, | |
| "learning_rate": 7.416505069660003e-05, | |
| "loss": 0.3114, | |
| "step": 31773 | |
| }, | |
| { | |
| "epoch": 21.602442333785618, | |
| "eval_accuracy": 0.1953125, | |
| "eval_loss": 3.8780479431152344, | |
| "eval_runtime": 40.0838, | |
| "eval_samples_per_second": 12.773, | |
| "eval_steps_per_second": 0.1, | |
| "step": 31842 | |
| }, | |
| { | |
| "epoch": 21.616010854816825, | |
| "grad_norm": 4.089629173278809, | |
| "learning_rate": 7.402225448375569e-05, | |
| "loss": 0.3152, | |
| "step": 31862 | |
| }, | |
| { | |
| "epoch": 21.6763907734057, | |
| "grad_norm": 4.699454307556152, | |
| "learning_rate": 7.387920308288014e-05, | |
| "loss": 0.3094, | |
| "step": 31951 | |
| }, | |
| { | |
| "epoch": 21.736770691994572, | |
| "grad_norm": 3.2713539600372314, | |
| "learning_rate": 7.373589801360616e-05, | |
| "loss": 0.3276, | |
| "step": 32040 | |
| }, | |
| { | |
| "epoch": 21.797150610583447, | |
| "grad_norm": 1.9568812847137451, | |
| "learning_rate": 7.359234079826123e-05, | |
| "loss": 0.3181, | |
| "step": 32129 | |
| }, | |
| { | |
| "epoch": 21.85753052917232, | |
| "grad_norm": 2.7409889698028564, | |
| "learning_rate": 7.344853296185141e-05, | |
| "loss": 0.3023, | |
| "step": 32218 | |
| }, | |
| { | |
| "epoch": 21.917910447761194, | |
| "grad_norm": 2.9756550788879395, | |
| "learning_rate": 7.330447603204507e-05, | |
| "loss": 0.3162, | |
| "step": 32307 | |
| }, | |
| { | |
| "epoch": 21.978290366350066, | |
| "grad_norm": 3.314568281173706, | |
| "learning_rate": 7.316017153915671e-05, | |
| "loss": 0.2991, | |
| "step": 32396 | |
| }, | |
| { | |
| "epoch": 22.03867028493894, | |
| "grad_norm": 4.315303802490234, | |
| "learning_rate": 7.301562101613068e-05, | |
| "loss": 0.305, | |
| "step": 32485 | |
| }, | |
| { | |
| "epoch": 22.099050203527817, | |
| "grad_norm": 4.505661487579346, | |
| "learning_rate": 7.287082599852493e-05, | |
| "loss": 0.2807, | |
| "step": 32574 | |
| }, | |
| { | |
| "epoch": 22.15943012211669, | |
| "grad_norm": 3.841827392578125, | |
| "learning_rate": 7.272578802449464e-05, | |
| "loss": 0.2742, | |
| "step": 32663 | |
| }, | |
| { | |
| "epoch": 22.219810040705564, | |
| "grad_norm": 4.61216926574707, | |
| "learning_rate": 7.25805086347759e-05, | |
| "loss": 0.2994, | |
| "step": 32752 | |
| }, | |
| { | |
| "epoch": 22.280189959294436, | |
| "grad_norm": 2.9822754859924316, | |
| "learning_rate": 7.243498937266943e-05, | |
| "loss": 0.2854, | |
| "step": 32841 | |
| }, | |
| { | |
| "epoch": 22.34056987788331, | |
| "grad_norm": 3.7797086238861084, | |
| "learning_rate": 7.228923178402403e-05, | |
| "loss": 0.2967, | |
| "step": 32930 | |
| }, | |
| { | |
| "epoch": 22.400949796472183, | |
| "grad_norm": 2.8511717319488525, | |
| "learning_rate": 7.214323741722027e-05, | |
| "loss": 0.2772, | |
| "step": 33019 | |
| }, | |
| { | |
| "epoch": 22.46132971506106, | |
| "grad_norm": 2.439438581466675, | |
| "learning_rate": 7.199700782315403e-05, | |
| "loss": 0.2957, | |
| "step": 33108 | |
| }, | |
| { | |
| "epoch": 22.521709633649934, | |
| "grad_norm": 2.507317066192627, | |
| "learning_rate": 7.185054455521994e-05, | |
| "loss": 0.2883, | |
| "step": 33197 | |
| }, | |
| { | |
| "epoch": 22.582089552238806, | |
| "grad_norm": 2.963704824447632, | |
| "learning_rate": 7.170384916929504e-05, | |
| "loss": 0.2892, | |
| "step": 33286 | |
| }, | |
| { | |
| "epoch": 22.64246947082768, | |
| "grad_norm": 3.137892007827759, | |
| "learning_rate": 7.155692322372208e-05, | |
| "loss": 0.2936, | |
| "step": 33375 | |
| }, | |
| { | |
| "epoch": 22.702849389416553, | |
| "grad_norm": 2.860560178756714, | |
| "learning_rate": 7.140976827929308e-05, | |
| "loss": 0.2719, | |
| "step": 33464 | |
| }, | |
| { | |
| "epoch": 22.763229308005428, | |
| "grad_norm": 3.778202533721924, | |
| "learning_rate": 7.126238589923269e-05, | |
| "loss": 0.2909, | |
| "step": 33553 | |
| }, | |
| { | |
| "epoch": 22.8236092265943, | |
| "grad_norm": 8.442693710327148, | |
| "learning_rate": 7.111477764918159e-05, | |
| "loss": 0.2957, | |
| "step": 33642 | |
| }, | |
| { | |
| "epoch": 22.883989145183175, | |
| "grad_norm": 2.855881452560425, | |
| "learning_rate": 7.096694509717994e-05, | |
| "loss": 0.2893, | |
| "step": 33731 | |
| }, | |
| { | |
| "epoch": 22.944369063772047, | |
| "grad_norm": 3.649304151535034, | |
| "learning_rate": 7.081888981365062e-05, | |
| "loss": 0.3019, | |
| "step": 33820 | |
| }, | |
| { | |
| "epoch": 23.004748982360923, | |
| "grad_norm": 3.577422857284546, | |
| "learning_rate": 7.067061337138249e-05, | |
| "loss": 0.2794, | |
| "step": 33909 | |
| }, | |
| { | |
| "epoch": 23.065128900949798, | |
| "grad_norm": 3.4041476249694824, | |
| "learning_rate": 7.052211734551398e-05, | |
| "loss": 0.2653, | |
| "step": 33998 | |
| }, | |
| { | |
| "epoch": 23.12550881953867, | |
| "grad_norm": 3.21398663520813, | |
| "learning_rate": 7.037340331351592e-05, | |
| "loss": 0.2635, | |
| "step": 34087 | |
| }, | |
| { | |
| "epoch": 23.185888738127545, | |
| "grad_norm": 3.606840133666992, | |
| "learning_rate": 7.022447285517522e-05, | |
| "loss": 0.2612, | |
| "step": 34176 | |
| }, | |
| { | |
| "epoch": 23.246268656716417, | |
| "grad_norm": 3.4414963722229004, | |
| "learning_rate": 7.007532755257776e-05, | |
| "loss": 0.2621, | |
| "step": 34265 | |
| }, | |
| { | |
| "epoch": 23.306648575305292, | |
| "grad_norm": 3.429677724838257, | |
| "learning_rate": 6.992596899009174e-05, | |
| "loss": 0.2627, | |
| "step": 34354 | |
| }, | |
| { | |
| "epoch": 23.367028493894164, | |
| "grad_norm": 2.394657850265503, | |
| "learning_rate": 6.977639875435082e-05, | |
| "loss": 0.2651, | |
| "step": 34443 | |
| }, | |
| { | |
| "epoch": 23.42740841248304, | |
| "grad_norm": 3.796799421310425, | |
| "learning_rate": 6.962661843423725e-05, | |
| "loss": 0.2575, | |
| "step": 34532 | |
| }, | |
| { | |
| "epoch": 23.487788331071915, | |
| "grad_norm": 1.8303537368774414, | |
| "learning_rate": 6.947662962086506e-05, | |
| "loss": 0.2656, | |
| "step": 34621 | |
| }, | |
| { | |
| "epoch": 23.548168249660787, | |
| "grad_norm": 5.206216335296631, | |
| "learning_rate": 6.932643390756298e-05, | |
| "loss": 0.2789, | |
| "step": 34710 | |
| }, | |
| { | |
| "epoch": 23.608548168249662, | |
| "grad_norm": 2.8069159984588623, | |
| "learning_rate": 6.917603288985775e-05, | |
| "loss": 0.2679, | |
| "step": 34799 | |
| }, | |
| { | |
| "epoch": 23.668928086838534, | |
| "grad_norm": 2.3087520599365234, | |
| "learning_rate": 6.902542816545701e-05, | |
| "loss": 0.2625, | |
| "step": 34888 | |
| }, | |
| { | |
| "epoch": 23.72930800542741, | |
| "grad_norm": 3.139498472213745, | |
| "learning_rate": 6.887462133423237e-05, | |
| "loss": 0.2722, | |
| "step": 34977 | |
| }, | |
| { | |
| "epoch": 23.78968792401628, | |
| "grad_norm": 2.9781806468963623, | |
| "learning_rate": 6.872361399820245e-05, | |
| "loss": 0.2633, | |
| "step": 35066 | |
| }, | |
| { | |
| "epoch": 23.850067842605156, | |
| "grad_norm": 3.456528425216675, | |
| "learning_rate": 6.857240776151576e-05, | |
| "loss": 0.2767, | |
| "step": 35155 | |
| }, | |
| { | |
| "epoch": 23.91044776119403, | |
| "grad_norm": 2.8766520023345947, | |
| "learning_rate": 6.842100423043381e-05, | |
| "loss": 0.2655, | |
| "step": 35244 | |
| }, | |
| { | |
| "epoch": 23.970827679782904, | |
| "grad_norm": 2.811938524246216, | |
| "learning_rate": 6.826940501331391e-05, | |
| "loss": 0.26, | |
| "step": 35333 | |
| }, | |
| { | |
| "epoch": 24.00271370420624, | |
| "eval_accuracy": 0.1875, | |
| "eval_loss": 3.9947586059570312, | |
| "eval_runtime": 23.8781, | |
| "eval_samples_per_second": 21.442, | |
| "eval_steps_per_second": 0.168, | |
| "step": 35380 | |
| }, | |
| { | |
| "epoch": 24.03120759837178, | |
| "grad_norm": 3.4529502391815186, | |
| "learning_rate": 6.811761172059213e-05, | |
| "loss": 0.2424, | |
| "step": 35422 | |
| }, | |
| { | |
| "epoch": 24.09158751696065, | |
| "grad_norm": 7.157485485076904, | |
| "learning_rate": 6.796562596476629e-05, | |
| "loss": 0.2328, | |
| "step": 35511 | |
| }, | |
| { | |
| "epoch": 24.151967435549526, | |
| "grad_norm": 2.098388433456421, | |
| "learning_rate": 6.781344936037864e-05, | |
| "loss": 0.2368, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 24.212347354138398, | |
| "grad_norm": 2.5846946239471436, | |
| "learning_rate": 6.766108352399885e-05, | |
| "loss": 0.252, | |
| "step": 35689 | |
| }, | |
| { | |
| "epoch": 24.272727272727273, | |
| "grad_norm": 3.213495969772339, | |
| "learning_rate": 6.750853007420684e-05, | |
| "loss": 0.2563, | |
| "step": 35778 | |
| }, | |
| { | |
| "epoch": 24.333107191316145, | |
| "grad_norm": 5.0729498863220215, | |
| "learning_rate": 6.735579063157545e-05, | |
| "loss": 0.2623, | |
| "step": 35867 | |
| }, | |
| { | |
| "epoch": 24.39348710990502, | |
| "grad_norm": 2.973792791366577, | |
| "learning_rate": 6.720286681865339e-05, | |
| "loss": 0.2558, | |
| "step": 35956 | |
| }, | |
| { | |
| "epoch": 24.453867028493896, | |
| "grad_norm": 1.9252829551696777, | |
| "learning_rate": 6.704976025994796e-05, | |
| "loss": 0.2486, | |
| "step": 36045 | |
| }, | |
| { | |
| "epoch": 24.514246947082768, | |
| "grad_norm": 3.5804240703582764, | |
| "learning_rate": 6.689647258190768e-05, | |
| "loss": 0.2493, | |
| "step": 36134 | |
| }, | |
| { | |
| "epoch": 24.574626865671643, | |
| "grad_norm": 3.92348575592041, | |
| "learning_rate": 6.674300541290517e-05, | |
| "loss": 0.2447, | |
| "step": 36223 | |
| }, | |
| { | |
| "epoch": 24.635006784260515, | |
| "grad_norm": 2.7622110843658447, | |
| "learning_rate": 6.658936038321971e-05, | |
| "loss": 0.2381, | |
| "step": 36312 | |
| }, | |
| { | |
| "epoch": 24.69538670284939, | |
| "grad_norm": 2.5953946113586426, | |
| "learning_rate": 6.643553912502007e-05, | |
| "loss": 0.2467, | |
| "step": 36401 | |
| }, | |
| { | |
| "epoch": 24.755766621438262, | |
| "grad_norm": 2.8284683227539062, | |
| "learning_rate": 6.628154327234704e-05, | |
| "loss": 0.2435, | |
| "step": 36490 | |
| }, | |
| { | |
| "epoch": 24.816146540027137, | |
| "grad_norm": 2.8667030334472656, | |
| "learning_rate": 6.612737446109614e-05, | |
| "loss": 0.2476, | |
| "step": 36579 | |
| }, | |
| { | |
| "epoch": 24.87652645861601, | |
| "grad_norm": 2.5920257568359375, | |
| "learning_rate": 6.597303432900021e-05, | |
| "loss": 0.248, | |
| "step": 36668 | |
| }, | |
| { | |
| "epoch": 24.936906377204885, | |
| "grad_norm": 3.2936460971832275, | |
| "learning_rate": 6.581852451561207e-05, | |
| "loss": 0.2545, | |
| "step": 36757 | |
| }, | |
| { | |
| "epoch": 24.99728629579376, | |
| "grad_norm": 2.2897655963897705, | |
| "learning_rate": 6.5663846662287e-05, | |
| "loss": 0.2405, | |
| "step": 36846 | |
| }, | |
| { | |
| "epoch": 25.057666214382632, | |
| "grad_norm": 2.2279489040374756, | |
| "learning_rate": 6.550900241216545e-05, | |
| "loss": 0.2235, | |
| "step": 36935 | |
| }, | |
| { | |
| "epoch": 25.118046132971507, | |
| "grad_norm": 1.6091116666793823, | |
| "learning_rate": 6.535399341015543e-05, | |
| "loss": 0.2345, | |
| "step": 37024 | |
| }, | |
| { | |
| "epoch": 25.17842605156038, | |
| "grad_norm": 2.490220308303833, | |
| "learning_rate": 6.51988213029151e-05, | |
| "loss": 0.2264, | |
| "step": 37113 | |
| }, | |
| { | |
| "epoch": 25.238805970149254, | |
| "grad_norm": 2.3575713634490967, | |
| "learning_rate": 6.504348773883534e-05, | |
| "loss": 0.2384, | |
| "step": 37202 | |
| }, | |
| { | |
| "epoch": 25.299185888738126, | |
| "grad_norm": 2.0898985862731934, | |
| "learning_rate": 6.488799436802216e-05, | |
| "loss": 0.2332, | |
| "step": 37291 | |
| }, | |
| { | |
| "epoch": 25.359565807327, | |
| "grad_norm": 4.023237705230713, | |
| "learning_rate": 6.473234284227919e-05, | |
| "loss": 0.2186, | |
| "step": 37380 | |
| }, | |
| { | |
| "epoch": 25.419945725915873, | |
| "grad_norm": 1.7770565748214722, | |
| "learning_rate": 6.45765348150901e-05, | |
| "loss": 0.2318, | |
| "step": 37469 | |
| }, | |
| { | |
| "epoch": 25.48032564450475, | |
| "grad_norm": 3.1752917766571045, | |
| "learning_rate": 6.442057194160116e-05, | |
| "loss": 0.2234, | |
| "step": 37558 | |
| }, | |
| { | |
| "epoch": 25.540705563093624, | |
| "grad_norm": 3.1734275817871094, | |
| "learning_rate": 6.42644558786035e-05, | |
| "loss": 0.2388, | |
| "step": 37647 | |
| }, | |
| { | |
| "epoch": 25.601085481682496, | |
| "grad_norm": 3.916975259780884, | |
| "learning_rate": 6.410818828451557e-05, | |
| "loss": 0.227, | |
| "step": 37736 | |
| }, | |
| { | |
| "epoch": 25.66146540027137, | |
| "grad_norm": 2.7766647338867188, | |
| "learning_rate": 6.395177081936562e-05, | |
| "loss": 0.23, | |
| "step": 37825 | |
| }, | |
| { | |
| "epoch": 25.721845318860243, | |
| "grad_norm": 3.657627820968628, | |
| "learning_rate": 6.379520514477388e-05, | |
| "loss": 0.2329, | |
| "step": 37914 | |
| }, | |
| { | |
| "epoch": 25.78222523744912, | |
| "grad_norm": 4.11094331741333, | |
| "learning_rate": 6.363849292393507e-05, | |
| "loss": 0.2241, | |
| "step": 38003 | |
| }, | |
| { | |
| "epoch": 25.84260515603799, | |
| "grad_norm": 2.6179704666137695, | |
| "learning_rate": 6.348163582160062e-05, | |
| "loss": 0.2268, | |
| "step": 38092 | |
| }, | |
| { | |
| "epoch": 25.902985074626866, | |
| "grad_norm": 3.4568240642547607, | |
| "learning_rate": 6.332463550406107e-05, | |
| "loss": 0.2197, | |
| "step": 38181 | |
| }, | |
| { | |
| "epoch": 25.96336499321574, | |
| "grad_norm": 1.789491057395935, | |
| "learning_rate": 6.316749363912833e-05, | |
| "loss": 0.2087, | |
| "step": 38270 | |
| }, | |
| { | |
| "epoch": 26.023744911804613, | |
| "grad_norm": 2.606367588043213, | |
| "learning_rate": 6.301021189611793e-05, | |
| "loss": 0.2153, | |
| "step": 38359 | |
| }, | |
| { | |
| "epoch": 26.08412483039349, | |
| "grad_norm": 2.6728904247283936, | |
| "learning_rate": 6.28527919458314e-05, | |
| "loss": 0.2043, | |
| "step": 38448 | |
| }, | |
| { | |
| "epoch": 26.14450474898236, | |
| "grad_norm": 2.2943668365478516, | |
| "learning_rate": 6.269523546053832e-05, | |
| "loss": 0.2123, | |
| "step": 38537 | |
| }, | |
| { | |
| "epoch": 26.204884667571235, | |
| "grad_norm": 3.1198699474334717, | |
| "learning_rate": 6.253754411395882e-05, | |
| "loss": 0.2128, | |
| "step": 38626 | |
| }, | |
| { | |
| "epoch": 26.265264586160107, | |
| "grad_norm": 2.269235372543335, | |
| "learning_rate": 6.237971958124559e-05, | |
| "loss": 0.2213, | |
| "step": 38715 | |
| }, | |
| { | |
| "epoch": 26.325644504748983, | |
| "grad_norm": 3.090557098388672, | |
| "learning_rate": 6.22217635389661e-05, | |
| "loss": 0.2253, | |
| "step": 38804 | |
| }, | |
| { | |
| "epoch": 26.386024423337854, | |
| "grad_norm": 4.030007839202881, | |
| "learning_rate": 6.206367766508497e-05, | |
| "loss": 0.2104, | |
| "step": 38893 | |
| }, | |
| { | |
| "epoch": 26.402985074626866, | |
| "eval_accuracy": 0.181640625, | |
| "eval_loss": 4.0000152587890625, | |
| "eval_runtime": 20.3217, | |
| "eval_samples_per_second": 25.195, | |
| "eval_steps_per_second": 0.197, | |
| "step": 38918 | |
| }, | |
| { | |
| "epoch": 26.44640434192673, | |
| "grad_norm": 3.711073637008667, | |
| "learning_rate": 6.190546363894589e-05, | |
| "loss": 0.2019, | |
| "step": 38982 | |
| }, | |
| { | |
| "epoch": 26.506784260515605, | |
| "grad_norm": 4.125629901885986, | |
| "learning_rate": 6.1747123141254e-05, | |
| "loss": 0.218, | |
| "step": 39071 | |
| }, | |
| { | |
| "epoch": 26.567164179104477, | |
| "grad_norm": 2.719214916229248, | |
| "learning_rate": 6.158865785405792e-05, | |
| "loss": 0.2138, | |
| "step": 39160 | |
| }, | |
| { | |
| "epoch": 26.627544097693352, | |
| "grad_norm": 5.083952903747559, | |
| "learning_rate": 6.143006946073187e-05, | |
| "loss": 0.2098, | |
| "step": 39249 | |
| }, | |
| { | |
| "epoch": 26.687924016282224, | |
| "grad_norm": 2.9340269565582275, | |
| "learning_rate": 6.127135964595789e-05, | |
| "loss": 0.2004, | |
| "step": 39338 | |
| }, | |
| { | |
| "epoch": 26.7483039348711, | |
| "grad_norm": 1.930010437965393, | |
| "learning_rate": 6.111253009570781e-05, | |
| "loss": 0.2212, | |
| "step": 39427 | |
| }, | |
| { | |
| "epoch": 26.80868385345997, | |
| "grad_norm": 3.872161388397217, | |
| "learning_rate": 6.095358249722548e-05, | |
| "loss": 0.2116, | |
| "step": 39516 | |
| }, | |
| { | |
| "epoch": 26.869063772048847, | |
| "grad_norm": 2.4142067432403564, | |
| "learning_rate": 6.0794518539008716e-05, | |
| "loss": 0.2223, | |
| "step": 39605 | |
| }, | |
| { | |
| "epoch": 26.929443690637722, | |
| "grad_norm": 2.2030022144317627, | |
| "learning_rate": 6.063533991079143e-05, | |
| "loss": 0.2155, | |
| "step": 39694 | |
| }, | |
| { | |
| "epoch": 26.989823609226594, | |
| "grad_norm": 3.7845208644866943, | |
| "learning_rate": 6.0476048303525725e-05, | |
| "loss": 0.2177, | |
| "step": 39783 | |
| }, | |
| { | |
| "epoch": 27.05020352781547, | |
| "grad_norm": 2.8146162033081055, | |
| "learning_rate": 6.0316645409363794e-05, | |
| "loss": 0.1945, | |
| "step": 39872 | |
| }, | |
| { | |
| "epoch": 27.11058344640434, | |
| "grad_norm": 2.4782633781433105, | |
| "learning_rate": 6.015713292164008e-05, | |
| "loss": 0.1978, | |
| "step": 39961 | |
| }, | |
| { | |
| "epoch": 27.170963364993217, | |
| "grad_norm": 1.6334956884384155, | |
| "learning_rate": 5.999751253485325e-05, | |
| "loss": 0.1949, | |
| "step": 40050 | |
| }, | |
| { | |
| "epoch": 27.23134328358209, | |
| "grad_norm": 3.610597610473633, | |
| "learning_rate": 5.983778594464814e-05, | |
| "loss": 0.1997, | |
| "step": 40139 | |
| }, | |
| { | |
| "epoch": 27.291723202170964, | |
| "grad_norm": 3.140693426132202, | |
| "learning_rate": 5.967795484779781e-05, | |
| "loss": 0.1905, | |
| "step": 40228 | |
| }, | |
| { | |
| "epoch": 27.352103120759836, | |
| "grad_norm": 2.56771183013916, | |
| "learning_rate": 5.9518020942185494e-05, | |
| "loss": 0.1893, | |
| "step": 40317 | |
| }, | |
| { | |
| "epoch": 27.41248303934871, | |
| "grad_norm": 1.921730875968933, | |
| "learning_rate": 5.935798592678653e-05, | |
| "loss": 0.1972, | |
| "step": 40406 | |
| }, | |
| { | |
| "epoch": 27.472862957937586, | |
| "grad_norm": 2.7568604946136475, | |
| "learning_rate": 5.91978515016504e-05, | |
| "loss": 0.2038, | |
| "step": 40495 | |
| }, | |
| { | |
| "epoch": 27.533242876526458, | |
| "grad_norm": 3.526125192642212, | |
| "learning_rate": 5.903761936788255e-05, | |
| "loss": 0.1881, | |
| "step": 40584 | |
| }, | |
| { | |
| "epoch": 27.593622795115333, | |
| "grad_norm": 2.400557279586792, | |
| "learning_rate": 5.887729122762644e-05, | |
| "loss": 0.1908, | |
| "step": 40673 | |
| }, | |
| { | |
| "epoch": 27.654002713704205, | |
| "grad_norm": 2.814988374710083, | |
| "learning_rate": 5.8716868784045374e-05, | |
| "loss": 0.1946, | |
| "step": 40762 | |
| }, | |
| { | |
| "epoch": 27.71438263229308, | |
| "grad_norm": 3.351440906524658, | |
| "learning_rate": 5.855635374130442e-05, | |
| "loss": 0.199, | |
| "step": 40851 | |
| }, | |
| { | |
| "epoch": 27.774762550881952, | |
| "grad_norm": 3.108304023742676, | |
| "learning_rate": 5.839574780455239e-05, | |
| "loss": 0.2009, | |
| "step": 40940 | |
| }, | |
| { | |
| "epoch": 27.835142469470828, | |
| "grad_norm": 3.37080979347229, | |
| "learning_rate": 5.823505267990359e-05, | |
| "loss": 0.1929, | |
| "step": 41029 | |
| }, | |
| { | |
| "epoch": 27.895522388059703, | |
| "grad_norm": 2.852602005004883, | |
| "learning_rate": 5.807427007441981e-05, | |
| "loss": 0.1946, | |
| "step": 41118 | |
| }, | |
| { | |
| "epoch": 27.955902306648575, | |
| "grad_norm": 2.15985369682312, | |
| "learning_rate": 5.791340169609214e-05, | |
| "loss": 0.1997, | |
| "step": 41207 | |
| }, | |
| { | |
| "epoch": 28.01628222523745, | |
| "grad_norm": 0.9773418307304382, | |
| "learning_rate": 5.7752449253822815e-05, | |
| "loss": 0.1789, | |
| "step": 41296 | |
| }, | |
| { | |
| "epoch": 28.076662143826322, | |
| "grad_norm": 2.572413444519043, | |
| "learning_rate": 5.759141445740713e-05, | |
| "loss": 0.1816, | |
| "step": 41385 | |
| }, | |
| { | |
| "epoch": 28.137042062415198, | |
| "grad_norm": 1.8453723192214966, | |
| "learning_rate": 5.7430299017515166e-05, | |
| "loss": 0.1795, | |
| "step": 41474 | |
| }, | |
| { | |
| "epoch": 28.19742198100407, | |
| "grad_norm": 2.09143328666687, | |
| "learning_rate": 5.726910464567371e-05, | |
| "loss": 0.1748, | |
| "step": 41563 | |
| }, | |
| { | |
| "epoch": 28.257801899592945, | |
| "grad_norm": 4.368978977203369, | |
| "learning_rate": 5.710783305424804e-05, | |
| "loss": 0.1865, | |
| "step": 41652 | |
| }, | |
| { | |
| "epoch": 28.318181818181817, | |
| "grad_norm": 1.7974387407302856, | |
| "learning_rate": 5.694648595642372e-05, | |
| "loss": 0.1878, | |
| "step": 41741 | |
| }, | |
| { | |
| "epoch": 28.378561736770692, | |
| "grad_norm": 3.7262039184570312, | |
| "learning_rate": 5.6785065066188446e-05, | |
| "loss": 0.1873, | |
| "step": 41830 | |
| }, | |
| { | |
| "epoch": 28.438941655359567, | |
| "grad_norm": 5.437527656555176, | |
| "learning_rate": 5.662357209831378e-05, | |
| "loss": 0.1958, | |
| "step": 41919 | |
| }, | |
| { | |
| "epoch": 28.49932157394844, | |
| "grad_norm": 2.2726356983184814, | |
| "learning_rate": 5.646200876833699e-05, | |
| "loss": 0.1818, | |
| "step": 42008 | |
| }, | |
| { | |
| "epoch": 28.559701492537314, | |
| "grad_norm": 3.234407663345337, | |
| "learning_rate": 5.630037679254278e-05, | |
| "loss": 0.1893, | |
| "step": 42097 | |
| }, | |
| { | |
| "epoch": 28.620081411126186, | |
| "grad_norm": 2.5418026447296143, | |
| "learning_rate": 5.613867788794508e-05, | |
| "loss": 0.185, | |
| "step": 42186 | |
| }, | |
| { | |
| "epoch": 28.68046132971506, | |
| "grad_norm": 2.394573926925659, | |
| "learning_rate": 5.5976913772268823e-05, | |
| "loss": 0.1803, | |
| "step": 42275 | |
| }, | |
| { | |
| "epoch": 28.740841248303933, | |
| "grad_norm": 2.4603261947631836, | |
| "learning_rate": 5.581508616393165e-05, | |
| "loss": 0.1793, | |
| "step": 42364 | |
| }, | |
| { | |
| "epoch": 28.80122116689281, | |
| "grad_norm": 3.139146566390991, | |
| "learning_rate": 5.5653196782025696e-05, | |
| "loss": 0.1797, | |
| "step": 42453 | |
| }, | |
| { | |
| "epoch": 28.80325644504749, | |
| "eval_accuracy": 0.18359375, | |
| "eval_loss": 4.114618301391602, | |
| "eval_runtime": 29.6243, | |
| "eval_samples_per_second": 17.283, | |
| "eval_steps_per_second": 0.135, | |
| "step": 42456 | |
| }, | |
| { | |
| "epoch": 28.86160108548168, | |
| "grad_norm": 1.7056379318237305, | |
| "learning_rate": 5.5491247346299334e-05, | |
| "loss": 0.1811, | |
| "step": 42542 | |
| }, | |
| { | |
| "epoch": 28.921981004070556, | |
| "grad_norm": 1.6604520082473755, | |
| "learning_rate": 5.532923957713885e-05, | |
| "loss": 0.1751, | |
| "step": 42631 | |
| }, | |
| { | |
| "epoch": 28.98236092265943, | |
| "grad_norm": 2.6219496726989746, | |
| "learning_rate": 5.5167175195550235e-05, | |
| "loss": 0.1814, | |
| "step": 42720 | |
| }, | |
| { | |
| "epoch": 29.042740841248303, | |
| "grad_norm": 1.9368810653686523, | |
| "learning_rate": 5.500505592314086e-05, | |
| "loss": 0.1732, | |
| "step": 42809 | |
| }, | |
| { | |
| "epoch": 29.10312075983718, | |
| "grad_norm": 2.2955291271209717, | |
| "learning_rate": 5.484288348210121e-05, | |
| "loss": 0.1611, | |
| "step": 42898 | |
| }, | |
| { | |
| "epoch": 29.16350067842605, | |
| "grad_norm": 1.331339716911316, | |
| "learning_rate": 5.468065959518656e-05, | |
| "loss": 0.1682, | |
| "step": 42987 | |
| }, | |
| { | |
| "epoch": 29.223880597014926, | |
| "grad_norm": 1.5474261045455933, | |
| "learning_rate": 5.4518385985698714e-05, | |
| "loss": 0.1652, | |
| "step": 43076 | |
| }, | |
| { | |
| "epoch": 29.284260515603798, | |
| "grad_norm": 1.5148978233337402, | |
| "learning_rate": 5.4356064377467684e-05, | |
| "loss": 0.18, | |
| "step": 43165 | |
| }, | |
| { | |
| "epoch": 29.344640434192673, | |
| "grad_norm": 5.3867878913879395, | |
| "learning_rate": 5.4193696494833346e-05, | |
| "loss": 0.1693, | |
| "step": 43254 | |
| }, | |
| { | |
| "epoch": 29.40502035278155, | |
| "grad_norm": 2.20180082321167, | |
| "learning_rate": 5.4031284062627165e-05, | |
| "loss": 0.1599, | |
| "step": 43343 | |
| }, | |
| { | |
| "epoch": 29.46540027137042, | |
| "grad_norm": 2.1975841522216797, | |
| "learning_rate": 5.386882880615383e-05, | |
| "loss": 0.171, | |
| "step": 43432 | |
| }, | |
| { | |
| "epoch": 29.525780189959296, | |
| "grad_norm": 2.5658628940582275, | |
| "learning_rate": 5.3706332451173006e-05, | |
| "loss": 0.1714, | |
| "step": 43521 | |
| }, | |
| { | |
| "epoch": 29.586160108548167, | |
| "grad_norm": 2.7179007530212402, | |
| "learning_rate": 5.354379672388089e-05, | |
| "loss": 0.1713, | |
| "step": 43610 | |
| }, | |
| { | |
| "epoch": 29.646540027137043, | |
| "grad_norm": 1.867160677909851, | |
| "learning_rate": 5.338122335089196e-05, | |
| "loss": 0.1684, | |
| "step": 43699 | |
| }, | |
| { | |
| "epoch": 29.706919945725915, | |
| "grad_norm": 1.5570918321609497, | |
| "learning_rate": 5.321861405922063e-05, | |
| "loss": 0.1713, | |
| "step": 43788 | |
| }, | |
| { | |
| "epoch": 29.76729986431479, | |
| "grad_norm": 3.943268060684204, | |
| "learning_rate": 5.305597057626279e-05, | |
| "loss": 0.1714, | |
| "step": 43877 | |
| }, | |
| { | |
| "epoch": 29.827679782903665, | |
| "grad_norm": 1.6523535251617432, | |
| "learning_rate": 5.2893294629777644e-05, | |
| "loss": 0.1754, | |
| "step": 43966 | |
| }, | |
| { | |
| "epoch": 29.888059701492537, | |
| "grad_norm": 2.623303174972534, | |
| "learning_rate": 5.273058794786918e-05, | |
| "loss": 0.1724, | |
| "step": 44055 | |
| }, | |
| { | |
| "epoch": 29.948439620081412, | |
| "grad_norm": 1.8316419124603271, | |
| "learning_rate": 5.256785225896794e-05, | |
| "loss": 0.17, | |
| "step": 44144 | |
| }, | |
| { | |
| "epoch": 30.008819538670284, | |
| "grad_norm": 2.2553136348724365, | |
| "learning_rate": 5.240508929181258e-05, | |
| "loss": 0.1766, | |
| "step": 44233 | |
| }, | |
| { | |
| "epoch": 30.06919945725916, | |
| "grad_norm": 1.168664574623108, | |
| "learning_rate": 5.224230077543153e-05, | |
| "loss": 0.1523, | |
| "step": 44322 | |
| }, | |
| { | |
| "epoch": 30.12957937584803, | |
| "grad_norm": 1.4800312519073486, | |
| "learning_rate": 5.2079488439124644e-05, | |
| "loss": 0.1553, | |
| "step": 44411 | |
| }, | |
| { | |
| "epoch": 30.189959294436907, | |
| "grad_norm": 1.983797550201416, | |
| "learning_rate": 5.1916654012444796e-05, | |
| "loss": 0.1605, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 30.25033921302578, | |
| "grad_norm": 1.0819350481033325, | |
| "learning_rate": 5.1753799225179545e-05, | |
| "loss": 0.1592, | |
| "step": 44589 | |
| }, | |
| { | |
| "epoch": 30.310719131614654, | |
| "grad_norm": 2.143650531768799, | |
| "learning_rate": 5.159092580733276e-05, | |
| "loss": 0.1608, | |
| "step": 44678 | |
| }, | |
| { | |
| "epoch": 30.37109905020353, | |
| "grad_norm": 1.6740977764129639, | |
| "learning_rate": 5.142803548910614e-05, | |
| "loss": 0.1591, | |
| "step": 44767 | |
| }, | |
| { | |
| "epoch": 30.4314789687924, | |
| "grad_norm": 2.7269814014434814, | |
| "learning_rate": 5.126513000088101e-05, | |
| "loss": 0.1624, | |
| "step": 44856 | |
| }, | |
| { | |
| "epoch": 30.491858887381277, | |
| "grad_norm": 2.9973506927490234, | |
| "learning_rate": 5.1102211073199805e-05, | |
| "loss": 0.1597, | |
| "step": 44945 | |
| }, | |
| { | |
| "epoch": 30.55223880597015, | |
| "grad_norm": 1.9262616634368896, | |
| "learning_rate": 5.093928043674772e-05, | |
| "loss": 0.1517, | |
| "step": 45034 | |
| }, | |
| { | |
| "epoch": 30.612618724559024, | |
| "grad_norm": 2.577742099761963, | |
| "learning_rate": 5.077633982233433e-05, | |
| "loss": 0.1668, | |
| "step": 45123 | |
| }, | |
| { | |
| "epoch": 30.672998643147896, | |
| "grad_norm": 1.0925939083099365, | |
| "learning_rate": 5.061339096087523e-05, | |
| "loss": 0.1611, | |
| "step": 45212 | |
| }, | |
| { | |
| "epoch": 30.73337856173677, | |
| "grad_norm": 1.5580718517303467, | |
| "learning_rate": 5.0450435583373624e-05, | |
| "loss": 0.1691, | |
| "step": 45301 | |
| }, | |
| { | |
| "epoch": 30.793758480325643, | |
| "grad_norm": 1.7016775608062744, | |
| "learning_rate": 5.028747542090189e-05, | |
| "loss": 0.1565, | |
| "step": 45390 | |
| }, | |
| { | |
| "epoch": 30.854138398914518, | |
| "grad_norm": 2.930467128753662, | |
| "learning_rate": 5.012451220458328e-05, | |
| "loss": 0.1685, | |
| "step": 45479 | |
| }, | |
| { | |
| "epoch": 30.914518317503394, | |
| "grad_norm": 2.0711212158203125, | |
| "learning_rate": 4.996154766557351e-05, | |
| "loss": 0.1606, | |
| "step": 45568 | |
| }, | |
| { | |
| "epoch": 30.974898236092265, | |
| "grad_norm": 1.6559313535690308, | |
| "learning_rate": 4.9798583535042254e-05, | |
| "loss": 0.1695, | |
| "step": 45657 | |
| }, | |
| { | |
| "epoch": 31.03527815468114, | |
| "grad_norm": 2.794700860977173, | |
| "learning_rate": 4.9635621544154945e-05, | |
| "loss": 0.1506, | |
| "step": 45746 | |
| }, | |
| { | |
| "epoch": 31.095658073270013, | |
| "grad_norm": 2.3707473278045654, | |
| "learning_rate": 4.947266342405424e-05, | |
| "loss": 0.1474, | |
| "step": 45835 | |
| }, | |
| { | |
| "epoch": 31.156037991858888, | |
| "grad_norm": 1.6921839714050293, | |
| "learning_rate": 4.930971090584168e-05, | |
| "loss": 0.1468, | |
| "step": 45924 | |
| }, | |
| { | |
| "epoch": 31.203527815468114, | |
| "eval_accuracy": 0.181640625, | |
| "eval_loss": 4.099109649658203, | |
| "eval_runtime": 19.3439, | |
| "eval_samples_per_second": 26.468, | |
| "eval_steps_per_second": 0.207, | |
| "step": 45994 | |
| }, | |
| { | |
| "epoch": 31.21641791044776, | |
| "grad_norm": 1.611038327217102, | |
| "learning_rate": 4.91467657205593e-05, | |
| "loss": 0.1511, | |
| "step": 46013 | |
| }, | |
| { | |
| "epoch": 31.276797829036635, | |
| "grad_norm": 1.53565514087677, | |
| "learning_rate": 4.8983829599171235e-05, | |
| "loss": 0.1545, | |
| "step": 46102 | |
| }, | |
| { | |
| "epoch": 31.33717774762551, | |
| "grad_norm": 1.6248897314071655, | |
| "learning_rate": 4.8820904272545336e-05, | |
| "loss": 0.1456, | |
| "step": 46191 | |
| }, | |
| { | |
| "epoch": 31.397557666214382, | |
| "grad_norm": 1.318975806236267, | |
| "learning_rate": 4.865799147143479e-05, | |
| "loss": 0.1483, | |
| "step": 46280 | |
| }, | |
| { | |
| "epoch": 31.457937584803258, | |
| "grad_norm": 1.2955539226531982, | |
| "learning_rate": 4.8495092926459736e-05, | |
| "loss": 0.1515, | |
| "step": 46369 | |
| }, | |
| { | |
| "epoch": 31.51831750339213, | |
| "grad_norm": 3.0391619205474854, | |
| "learning_rate": 4.833221036808882e-05, | |
| "loss": 0.1479, | |
| "step": 46458 | |
| }, | |
| { | |
| "epoch": 31.578697421981005, | |
| "grad_norm": 1.7275387048721313, | |
| "learning_rate": 4.81693455266209e-05, | |
| "loss": 0.1517, | |
| "step": 46547 | |
| }, | |
| { | |
| "epoch": 31.639077340569877, | |
| "grad_norm": 2.1065945625305176, | |
| "learning_rate": 4.8006500132166625e-05, | |
| "loss": 0.1501, | |
| "step": 46636 | |
| }, | |
| { | |
| "epoch": 31.699457259158752, | |
| "grad_norm": 0.9785634875297546, | |
| "learning_rate": 4.784367591463008e-05, | |
| "loss": 0.1501, | |
| "step": 46725 | |
| }, | |
| { | |
| "epoch": 31.759837177747624, | |
| "grad_norm": 1.2350496053695679, | |
| "learning_rate": 4.768087460369036e-05, | |
| "loss": 0.146, | |
| "step": 46814 | |
| }, | |
| { | |
| "epoch": 31.8202170963365, | |
| "grad_norm": 1.5443971157073975, | |
| "learning_rate": 4.75180979287832e-05, | |
| "loss": 0.1514, | |
| "step": 46903 | |
| }, | |
| { | |
| "epoch": 31.880597014925375, | |
| "grad_norm": 1.1282203197479248, | |
| "learning_rate": 4.735534761908267e-05, | |
| "loss": 0.1478, | |
| "step": 46992 | |
| }, | |
| { | |
| "epoch": 31.940976933514246, | |
| "grad_norm": 1.1595454216003418, | |
| "learning_rate": 4.719262540348275e-05, | |
| "loss": 0.15, | |
| "step": 47081 | |
| }, | |
| { | |
| "epoch": 32.00135685210312, | |
| "grad_norm": 1.392354130744934, | |
| "learning_rate": 4.702993301057897e-05, | |
| "loss": 0.1402, | |
| "step": 47170 | |
| }, | |
| { | |
| "epoch": 32.061736770691994, | |
| "grad_norm": 1.6813993453979492, | |
| "learning_rate": 4.686727216865008e-05, | |
| "loss": 0.1458, | |
| "step": 47259 | |
| }, | |
| { | |
| "epoch": 32.122116689280865, | |
| "grad_norm": 2.200620174407959, | |
| "learning_rate": 4.6704644605639617e-05, | |
| "loss": 0.1426, | |
| "step": 47348 | |
| }, | |
| { | |
| "epoch": 32.182496607869744, | |
| "grad_norm": 1.1454344987869263, | |
| "learning_rate": 4.654205204913762e-05, | |
| "loss": 0.1417, | |
| "step": 47437 | |
| }, | |
| { | |
| "epoch": 32.242876526458616, | |
| "grad_norm": 1.6104034185409546, | |
| "learning_rate": 4.6379496226362285e-05, | |
| "loss": 0.1364, | |
| "step": 47526 | |
| }, | |
| { | |
| "epoch": 32.30325644504749, | |
| "grad_norm": 2.7888503074645996, | |
| "learning_rate": 4.621697886414152e-05, | |
| "loss": 0.1415, | |
| "step": 47615 | |
| }, | |
| { | |
| "epoch": 32.36363636363637, | |
| "grad_norm": 1.4862406253814697, | |
| "learning_rate": 4.605450168889475e-05, | |
| "loss": 0.1449, | |
| "step": 47704 | |
| }, | |
| { | |
| "epoch": 32.42401628222524, | |
| "grad_norm": 1.396264672279358, | |
| "learning_rate": 4.5892066426614426e-05, | |
| "loss": 0.1351, | |
| "step": 47793 | |
| }, | |
| { | |
| "epoch": 32.48439620081411, | |
| "grad_norm": 0.8358775973320007, | |
| "learning_rate": 4.572967480284777e-05, | |
| "loss": 0.1478, | |
| "step": 47882 | |
| }, | |
| { | |
| "epoch": 32.54477611940298, | |
| "grad_norm": 1.150931477546692, | |
| "learning_rate": 4.556732854267846e-05, | |
| "loss": 0.1388, | |
| "step": 47971 | |
| }, | |
| { | |
| "epoch": 32.60515603799186, | |
| "grad_norm": 1.9280314445495605, | |
| "learning_rate": 4.540502937070826e-05, | |
| "loss": 0.1336, | |
| "step": 48060 | |
| }, | |
| { | |
| "epoch": 32.66553595658073, | |
| "grad_norm": 1.9232927560806274, | |
| "learning_rate": 4.5242779011038746e-05, | |
| "loss": 0.1357, | |
| "step": 48149 | |
| }, | |
| { | |
| "epoch": 32.725915875169605, | |
| "grad_norm": 1.9297000169754028, | |
| "learning_rate": 4.5080579187252875e-05, | |
| "loss": 0.1434, | |
| "step": 48238 | |
| }, | |
| { | |
| "epoch": 32.786295793758484, | |
| "grad_norm": 1.3162543773651123, | |
| "learning_rate": 4.491843162239686e-05, | |
| "loss": 0.1357, | |
| "step": 48327 | |
| }, | |
| { | |
| "epoch": 32.846675712347356, | |
| "grad_norm": 2.112964391708374, | |
| "learning_rate": 4.4756338038961734e-05, | |
| "loss": 0.1347, | |
| "step": 48416 | |
| }, | |
| { | |
| "epoch": 32.90705563093623, | |
| "grad_norm": 2.025836944580078, | |
| "learning_rate": 4.459430015886507e-05, | |
| "loss": 0.1361, | |
| "step": 48505 | |
| }, | |
| { | |
| "epoch": 32.9674355495251, | |
| "grad_norm": 2.231003999710083, | |
| "learning_rate": 4.443231970343273e-05, | |
| "loss": 0.1493, | |
| "step": 48594 | |
| }, | |
| { | |
| "epoch": 33.02781546811398, | |
| "grad_norm": 0.858778715133667, | |
| "learning_rate": 4.427039839338051e-05, | |
| "loss": 0.1335, | |
| "step": 48683 | |
| }, | |
| { | |
| "epoch": 33.08819538670285, | |
| "grad_norm": 1.7645868062973022, | |
| "learning_rate": 4.410853794879596e-05, | |
| "loss": 0.1318, | |
| "step": 48772 | |
| }, | |
| { | |
| "epoch": 33.14857530529172, | |
| "grad_norm": 3.9215147495269775, | |
| "learning_rate": 4.3946740089120036e-05, | |
| "loss": 0.1289, | |
| "step": 48861 | |
| }, | |
| { | |
| "epoch": 33.208955223880594, | |
| "grad_norm": 1.0674065351486206, | |
| "learning_rate": 4.378500653312886e-05, | |
| "loss": 0.1314, | |
| "step": 48950 | |
| }, | |
| { | |
| "epoch": 33.26933514246947, | |
| "grad_norm": 1.1910934448242188, | |
| "learning_rate": 4.362333899891545e-05, | |
| "loss": 0.1285, | |
| "step": 49039 | |
| }, | |
| { | |
| "epoch": 33.329715061058344, | |
| "grad_norm": 3.3156814575195312, | |
| "learning_rate": 4.346173920387146e-05, | |
| "loss": 0.1314, | |
| "step": 49128 | |
| }, | |
| { | |
| "epoch": 33.390094979647216, | |
| "grad_norm": 1.518210530281067, | |
| "learning_rate": 4.330020886466898e-05, | |
| "loss": 0.1327, | |
| "step": 49217 | |
| }, | |
| { | |
| "epoch": 33.450474898236095, | |
| "grad_norm": 2.037992238998413, | |
| "learning_rate": 4.313874969724227e-05, | |
| "loss": 0.1294, | |
| "step": 49306 | |
| }, | |
| { | |
| "epoch": 33.51085481682497, | |
| "grad_norm": 1.0530787706375122, | |
| "learning_rate": 4.2977363416769495e-05, | |
| "loss": 0.1338, | |
| "step": 49395 | |
| }, | |
| { | |
| "epoch": 33.57123473541384, | |
| "grad_norm": 2.6281962394714355, | |
| "learning_rate": 4.281605173765462e-05, | |
| "loss": 0.1385, | |
| "step": 49484 | |
| }, | |
| { | |
| "epoch": 33.60379918588874, | |
| "eval_accuracy": 0.177734375, | |
| "eval_loss": 4.106353759765625, | |
| "eval_runtime": 43.8825, | |
| "eval_samples_per_second": 11.668, | |
| "eval_steps_per_second": 0.091, | |
| "step": 49532 | |
| }, | |
| { | |
| "epoch": 33.63161465400271, | |
| "grad_norm": 1.659097671508789, | |
| "learning_rate": 4.265481637350902e-05, | |
| "loss": 0.1334, | |
| "step": 49573 | |
| }, | |
| { | |
| "epoch": 33.69199457259159, | |
| "grad_norm": 1.2055881023406982, | |
| "learning_rate": 4.249365903713345e-05, | |
| "loss": 0.1277, | |
| "step": 49662 | |
| }, | |
| { | |
| "epoch": 33.75237449118046, | |
| "grad_norm": 1.3534148931503296, | |
| "learning_rate": 4.2332581440499765e-05, | |
| "loss": 0.1241, | |
| "step": 49751 | |
| }, | |
| { | |
| "epoch": 33.81275440976933, | |
| "grad_norm": 1.6355328559875488, | |
| "learning_rate": 4.217158529473275e-05, | |
| "loss": 0.1309, | |
| "step": 49840 | |
| }, | |
| { | |
| "epoch": 33.87313432835821, | |
| "grad_norm": 1.2613086700439453, | |
| "learning_rate": 4.2010672310091895e-05, | |
| "loss": 0.1306, | |
| "step": 49929 | |
| }, | |
| { | |
| "epoch": 33.933514246947084, | |
| "grad_norm": 2.427302837371826, | |
| "learning_rate": 4.1849844195953314e-05, | |
| "loss": 0.1335, | |
| "step": 50018 | |
| }, | |
| { | |
| "epoch": 33.993894165535956, | |
| "grad_norm": 1.0683902502059937, | |
| "learning_rate": 4.1689102660791536e-05, | |
| "loss": 0.137, | |
| "step": 50107 | |
| }, | |
| { | |
| "epoch": 34.05427408412483, | |
| "grad_norm": 1.184240460395813, | |
| "learning_rate": 4.1528449412161375e-05, | |
| "loss": 0.1206, | |
| "step": 50196 | |
| }, | |
| { | |
| "epoch": 34.114654002713706, | |
| "grad_norm": 2.108067512512207, | |
| "learning_rate": 4.136788615667974e-05, | |
| "loss": 0.125, | |
| "step": 50285 | |
| }, | |
| { | |
| "epoch": 34.17503392130258, | |
| "grad_norm": 1.4755454063415527, | |
| "learning_rate": 4.120741460000758e-05, | |
| "loss": 0.1283, | |
| "step": 50374 | |
| }, | |
| { | |
| "epoch": 34.23541383989145, | |
| "grad_norm": 1.8144526481628418, | |
| "learning_rate": 4.1047036446831686e-05, | |
| "loss": 0.1279, | |
| "step": 50463 | |
| }, | |
| { | |
| "epoch": 34.29579375848033, | |
| "grad_norm": 1.2851365804672241, | |
| "learning_rate": 4.088675340084668e-05, | |
| "loss": 0.1207, | |
| "step": 50552 | |
| }, | |
| { | |
| "epoch": 34.3561736770692, | |
| "grad_norm": 1.1482937335968018, | |
| "learning_rate": 4.072656716473684e-05, | |
| "loss": 0.1251, | |
| "step": 50641 | |
| }, | |
| { | |
| "epoch": 34.41655359565807, | |
| "grad_norm": 1.2348805665969849, | |
| "learning_rate": 4.0566479440158036e-05, | |
| "loss": 0.1235, | |
| "step": 50730 | |
| }, | |
| { | |
| "epoch": 34.476933514246944, | |
| "grad_norm": 1.1819324493408203, | |
| "learning_rate": 4.040649192771962e-05, | |
| "loss": 0.132, | |
| "step": 50819 | |
| }, | |
| { | |
| "epoch": 34.53731343283582, | |
| "grad_norm": 1.1830766201019287, | |
| "learning_rate": 4.0246606326966425e-05, | |
| "loss": 0.1176, | |
| "step": 50908 | |
| }, | |
| { | |
| "epoch": 34.597693351424695, | |
| "grad_norm": 1.9532086849212646, | |
| "learning_rate": 4.0086824336360676e-05, | |
| "loss": 0.1231, | |
| "step": 50997 | |
| }, | |
| { | |
| "epoch": 34.65807327001357, | |
| "grad_norm": 1.529571294784546, | |
| "learning_rate": 3.992714765326396e-05, | |
| "loss": 0.1242, | |
| "step": 51086 | |
| }, | |
| { | |
| "epoch": 34.71845318860244, | |
| "grad_norm": 1.2561233043670654, | |
| "learning_rate": 3.9767577973919146e-05, | |
| "loss": 0.1255, | |
| "step": 51175 | |
| }, | |
| { | |
| "epoch": 34.77883310719132, | |
| "grad_norm": 1.7090590000152588, | |
| "learning_rate": 3.960811699343243e-05, | |
| "loss": 0.1215, | |
| "step": 51264 | |
| }, | |
| { | |
| "epoch": 34.83921302578019, | |
| "grad_norm": 2.106395959854126, | |
| "learning_rate": 3.94487664057553e-05, | |
| "loss": 0.1285, | |
| "step": 51353 | |
| }, | |
| { | |
| "epoch": 34.89959294436906, | |
| "grad_norm": 1.165230393409729, | |
| "learning_rate": 3.928952790366654e-05, | |
| "loss": 0.1216, | |
| "step": 51442 | |
| }, | |
| { | |
| "epoch": 34.95997286295794, | |
| "grad_norm": 1.2061336040496826, | |
| "learning_rate": 3.913040317875424e-05, | |
| "loss": 0.1164, | |
| "step": 51531 | |
| }, | |
| { | |
| "epoch": 35.02035278154681, | |
| "grad_norm": 1.160407304763794, | |
| "learning_rate": 3.897139392139788e-05, | |
| "loss": 0.1258, | |
| "step": 51620 | |
| }, | |
| { | |
| "epoch": 35.080732700135684, | |
| "grad_norm": 0.8674483299255371, | |
| "learning_rate": 3.881250182075026e-05, | |
| "loss": 0.1129, | |
| "step": 51709 | |
| }, | |
| { | |
| "epoch": 35.141112618724556, | |
| "grad_norm": 1.4497802257537842, | |
| "learning_rate": 3.8653728564719674e-05, | |
| "loss": 0.1244, | |
| "step": 51798 | |
| }, | |
| { | |
| "epoch": 35.201492537313435, | |
| "grad_norm": 1.649856448173523, | |
| "learning_rate": 3.8495075839951937e-05, | |
| "loss": 0.1157, | |
| "step": 51887 | |
| }, | |
| { | |
| "epoch": 35.26187245590231, | |
| "grad_norm": 1.97478187084198, | |
| "learning_rate": 3.833654533181244e-05, | |
| "loss": 0.1182, | |
| "step": 51976 | |
| }, | |
| { | |
| "epoch": 35.32225237449118, | |
| "grad_norm": 1.4241811037063599, | |
| "learning_rate": 3.8178138724368275e-05, | |
| "loss": 0.1195, | |
| "step": 52065 | |
| }, | |
| { | |
| "epoch": 35.38263229308006, | |
| "grad_norm": 1.9427152872085571, | |
| "learning_rate": 3.8019857700370345e-05, | |
| "loss": 0.1214, | |
| "step": 52154 | |
| }, | |
| { | |
| "epoch": 35.44301221166893, | |
| "grad_norm": 1.2185932397842407, | |
| "learning_rate": 3.7861703941235444e-05, | |
| "loss": 0.1149, | |
| "step": 52243 | |
| }, | |
| { | |
| "epoch": 35.5033921302578, | |
| "grad_norm": 1.1983317136764526, | |
| "learning_rate": 3.770367912702849e-05, | |
| "loss": 0.1182, | |
| "step": 52332 | |
| }, | |
| { | |
| "epoch": 35.56377204884667, | |
| "grad_norm": 0.9646018147468567, | |
| "learning_rate": 3.7545784936444605e-05, | |
| "loss": 0.1272, | |
| "step": 52421 | |
| }, | |
| { | |
| "epoch": 35.62415196743555, | |
| "grad_norm": 1.189382791519165, | |
| "learning_rate": 3.73880230467913e-05, | |
| "loss": 0.1139, | |
| "step": 52510 | |
| }, | |
| { | |
| "epoch": 35.68453188602442, | |
| "grad_norm": 1.0490000247955322, | |
| "learning_rate": 3.7230395133970595e-05, | |
| "loss": 0.1179, | |
| "step": 52599 | |
| }, | |
| { | |
| "epoch": 35.744911804613295, | |
| "grad_norm": 1.055656909942627, | |
| "learning_rate": 3.7072902872461365e-05, | |
| "loss": 0.1184, | |
| "step": 52688 | |
| }, | |
| { | |
| "epoch": 35.805291723202174, | |
| "grad_norm": 1.564658522605896, | |
| "learning_rate": 3.691554793530143e-05, | |
| "loss": 0.12, | |
| "step": 52777 | |
| }, | |
| { | |
| "epoch": 35.865671641791046, | |
| "grad_norm": 1.054408311843872, | |
| "learning_rate": 3.6758331994069784e-05, | |
| "loss": 0.1145, | |
| "step": 52866 | |
| }, | |
| { | |
| "epoch": 35.92605156037992, | |
| "grad_norm": 1.5454896688461304, | |
| "learning_rate": 3.660125671886892e-05, | |
| "loss": 0.1104, | |
| "step": 52955 | |
| }, | |
| { | |
| "epoch": 35.98643147896879, | |
| "grad_norm": 0.9646552801132202, | |
| "learning_rate": 3.6444323778307e-05, | |
| "loss": 0.1192, | |
| "step": 53044 | |
| }, | |
| { | |
| "epoch": 36.004070556309365, | |
| "eval_accuracy": 0.181640625, | |
| "eval_loss": 4.21491813659668, | |
| "eval_runtime": 20.6978, | |
| "eval_samples_per_second": 24.737, | |
| "eval_steps_per_second": 0.193, | |
| "step": 53070 | |
| }, | |
| { | |
| "epoch": 36.04681139755767, | |
| "grad_norm": 2.2136874198913574, | |
| "learning_rate": 3.628753483948017e-05, | |
| "loss": 0.115, | |
| "step": 53133 | |
| }, | |
| { | |
| "epoch": 36.10719131614654, | |
| "grad_norm": 0.8969342708587646, | |
| "learning_rate": 3.613089156795489e-05, | |
| "loss": 0.111, | |
| "step": 53222 | |
| }, | |
| { | |
| "epoch": 36.16757123473541, | |
| "grad_norm": 1.5373083353042603, | |
| "learning_rate": 3.5974395627750136e-05, | |
| "loss": 0.1181, | |
| "step": 53311 | |
| }, | |
| { | |
| "epoch": 36.22795115332429, | |
| "grad_norm": 1.0511338710784912, | |
| "learning_rate": 3.581804868131986e-05, | |
| "loss": 0.1089, | |
| "step": 53400 | |
| }, | |
| { | |
| "epoch": 36.28833107191316, | |
| "grad_norm": 0.6941206455230713, | |
| "learning_rate": 3.566185238953516e-05, | |
| "loss": 0.1133, | |
| "step": 53489 | |
| }, | |
| { | |
| "epoch": 36.348710990502035, | |
| "grad_norm": 1.0698457956314087, | |
| "learning_rate": 3.5505808411666805e-05, | |
| "loss": 0.1046, | |
| "step": 53578 | |
| }, | |
| { | |
| "epoch": 36.40909090909091, | |
| "grad_norm": 1.1524955034255981, | |
| "learning_rate": 3.5349918405367533e-05, | |
| "loss": 0.1111, | |
| "step": 53667 | |
| }, | |
| { | |
| "epoch": 36.469470827679785, | |
| "grad_norm": 0.7653555274009705, | |
| "learning_rate": 3.519418402665441e-05, | |
| "loss": 0.1102, | |
| "step": 53756 | |
| }, | |
| { | |
| "epoch": 36.52985074626866, | |
| "grad_norm": 0.7626907229423523, | |
| "learning_rate": 3.503860692989129e-05, | |
| "loss": 0.1109, | |
| "step": 53845 | |
| }, | |
| { | |
| "epoch": 36.59023066485753, | |
| "grad_norm": 1.2246617078781128, | |
| "learning_rate": 3.4883188767771235e-05, | |
| "loss": 0.1087, | |
| "step": 53934 | |
| }, | |
| { | |
| "epoch": 36.6506105834464, | |
| "grad_norm": 0.8445035815238953, | |
| "learning_rate": 3.472793119129891e-05, | |
| "loss": 0.1104, | |
| "step": 54023 | |
| }, | |
| { | |
| "epoch": 36.71099050203528, | |
| "grad_norm": 0.4783117175102234, | |
| "learning_rate": 3.4572835849773124e-05, | |
| "loss": 0.1101, | |
| "step": 54112 | |
| }, | |
| { | |
| "epoch": 36.77137042062415, | |
| "grad_norm": 0.6431951522827148, | |
| "learning_rate": 3.441790439076924e-05, | |
| "loss": 0.1128, | |
| "step": 54201 | |
| }, | |
| { | |
| "epoch": 36.83175033921302, | |
| "grad_norm": 0.8060305118560791, | |
| "learning_rate": 3.426313846012174e-05, | |
| "loss": 0.1077, | |
| "step": 54290 | |
| }, | |
| { | |
| "epoch": 36.8921302578019, | |
| "grad_norm": 1.309480905532837, | |
| "learning_rate": 3.410853970190662e-05, | |
| "loss": 0.1094, | |
| "step": 54379 | |
| }, | |
| { | |
| "epoch": 36.952510176390774, | |
| "grad_norm": 0.7138769030570984, | |
| "learning_rate": 3.395410975842408e-05, | |
| "loss": 0.1119, | |
| "step": 54468 | |
| }, | |
| { | |
| "epoch": 37.012890094979646, | |
| "grad_norm": 1.4216080904006958, | |
| "learning_rate": 3.379985027018098e-05, | |
| "loss": 0.1117, | |
| "step": 54557 | |
| }, | |
| { | |
| "epoch": 37.07327001356852, | |
| "grad_norm": 1.4457802772521973, | |
| "learning_rate": 3.3645762875873415e-05, | |
| "loss": 0.1024, | |
| "step": 54646 | |
| }, | |
| { | |
| "epoch": 37.1336499321574, | |
| "grad_norm": 0.7809485793113708, | |
| "learning_rate": 3.349184921236939e-05, | |
| "loss": 0.1054, | |
| "step": 54735 | |
| }, | |
| { | |
| "epoch": 37.19402985074627, | |
| "grad_norm": 1.7159395217895508, | |
| "learning_rate": 3.333811091469129e-05, | |
| "loss": 0.1028, | |
| "step": 54824 | |
| }, | |
| { | |
| "epoch": 37.25440976933514, | |
| "grad_norm": 0.48482632637023926, | |
| "learning_rate": 3.318454961599864e-05, | |
| "loss": 0.105, | |
| "step": 54913 | |
| }, | |
| { | |
| "epoch": 37.31478968792402, | |
| "grad_norm": 1.0585776567459106, | |
| "learning_rate": 3.30311669475707e-05, | |
| "loss": 0.0995, | |
| "step": 55002 | |
| }, | |
| { | |
| "epoch": 37.37516960651289, | |
| "grad_norm": 0.7327682971954346, | |
| "learning_rate": 3.2877964538789154e-05, | |
| "loss": 0.1072, | |
| "step": 55091 | |
| }, | |
| { | |
| "epoch": 37.43554952510176, | |
| "grad_norm": 1.583203911781311, | |
| "learning_rate": 3.272494401712078e-05, | |
| "loss": 0.104, | |
| "step": 55180 | |
| }, | |
| { | |
| "epoch": 37.495929443690635, | |
| "grad_norm": 1.2305749654769897, | |
| "learning_rate": 3.257210700810015e-05, | |
| "loss": 0.1038, | |
| "step": 55269 | |
| }, | |
| { | |
| "epoch": 37.556309362279514, | |
| "grad_norm": 8.245156288146973, | |
| "learning_rate": 3.241945513531241e-05, | |
| "loss": 0.1087, | |
| "step": 55358 | |
| }, | |
| { | |
| "epoch": 37.616689280868385, | |
| "grad_norm": 0.8825012445449829, | |
| "learning_rate": 3.226699002037602e-05, | |
| "loss": 0.109, | |
| "step": 55447 | |
| }, | |
| { | |
| "epoch": 37.67706919945726, | |
| "grad_norm": 1.1218957901000977, | |
| "learning_rate": 3.2114713282925466e-05, | |
| "loss": 0.1038, | |
| "step": 55536 | |
| }, | |
| { | |
| "epoch": 37.737449118046136, | |
| "grad_norm": 1.4190541505813599, | |
| "learning_rate": 3.196262654059419e-05, | |
| "loss": 0.108, | |
| "step": 55625 | |
| }, | |
| { | |
| "epoch": 37.79782903663501, | |
| "grad_norm": 0.5339131951332092, | |
| "learning_rate": 3.1810731408997185e-05, | |
| "loss": 0.1103, | |
| "step": 55714 | |
| }, | |
| { | |
| "epoch": 37.85820895522388, | |
| "grad_norm": 1.3955272436141968, | |
| "learning_rate": 3.1659029501714077e-05, | |
| "loss": 0.0993, | |
| "step": 55803 | |
| }, | |
| { | |
| "epoch": 37.91858887381275, | |
| "grad_norm": 1.7271915674209595, | |
| "learning_rate": 3.150752243027185e-05, | |
| "loss": 0.1081, | |
| "step": 55892 | |
| }, | |
| { | |
| "epoch": 37.97896879240163, | |
| "grad_norm": 1.272377848625183, | |
| "learning_rate": 3.1356211804127726e-05, | |
| "loss": 0.0988, | |
| "step": 55981 | |
| }, | |
| { | |
| "epoch": 38.0393487109905, | |
| "grad_norm": 1.4303677082061768, | |
| "learning_rate": 3.1205099230652134e-05, | |
| "loss": 0.0947, | |
| "step": 56070 | |
| }, | |
| { | |
| "epoch": 38.099728629579374, | |
| "grad_norm": 0.8582963347434998, | |
| "learning_rate": 3.105418631511151e-05, | |
| "loss": 0.1023, | |
| "step": 56159 | |
| }, | |
| { | |
| "epoch": 38.16010854816825, | |
| "grad_norm": 0.9735032916069031, | |
| "learning_rate": 3.090347466065141e-05, | |
| "loss": 0.0994, | |
| "step": 56248 | |
| }, | |
| { | |
| "epoch": 38.220488466757125, | |
| "grad_norm": 0.8023036122322083, | |
| "learning_rate": 3.075296586827938e-05, | |
| "loss": 0.0968, | |
| "step": 56337 | |
| }, | |
| { | |
| "epoch": 38.280868385346, | |
| "grad_norm": 0.8283627033233643, | |
| "learning_rate": 3.060266153684792e-05, | |
| "loss": 0.0988, | |
| "step": 56426 | |
| }, | |
| { | |
| "epoch": 38.34124830393487, | |
| "grad_norm": 1.1911463737487793, | |
| "learning_rate": 3.045256326303762e-05, | |
| "loss": 0.1015, | |
| "step": 56515 | |
| }, | |
| { | |
| "epoch": 38.40162822252375, | |
| "grad_norm": 1.3183075189590454, | |
| "learning_rate": 3.030267264134003e-05, | |
| "loss": 0.1014, | |
| "step": 56604 | |
| }, | |
| { | |
| "epoch": 38.404341926729984, | |
| "eval_accuracy": 0.169921875, | |
| "eval_loss": 4.234889984130859, | |
| "eval_runtime": 20.5487, | |
| "eval_samples_per_second": 24.916, | |
| "eval_steps_per_second": 0.195, | |
| "step": 56608 | |
| }, | |
| { | |
| "epoch": 38.46200814111262, | |
| "grad_norm": 0.9125858545303345, | |
| "learning_rate": 3.0152991264040888e-05, | |
| "loss": 0.101, | |
| "step": 56693 | |
| }, | |
| { | |
| "epoch": 38.52238805970149, | |
| "grad_norm": 0.5899451971054077, | |
| "learning_rate": 3.0003520721203106e-05, | |
| "loss": 0.0969, | |
| "step": 56782 | |
| }, | |
| { | |
| "epoch": 38.58276797829036, | |
| "grad_norm": 1.2424014806747437, | |
| "learning_rate": 2.9854262600649907e-05, | |
| "loss": 0.1017, | |
| "step": 56871 | |
| }, | |
| { | |
| "epoch": 38.64314789687924, | |
| "grad_norm": 1.250267744064331, | |
| "learning_rate": 2.9705218487947984e-05, | |
| "loss": 0.0982, | |
| "step": 56960 | |
| }, | |
| { | |
| "epoch": 38.703527815468114, | |
| "grad_norm": 1.153306245803833, | |
| "learning_rate": 2.9556389966390552e-05, | |
| "loss": 0.1006, | |
| "step": 57049 | |
| }, | |
| { | |
| "epoch": 38.763907734056986, | |
| "grad_norm": 1.2941042184829712, | |
| "learning_rate": 2.940777861698068e-05, | |
| "loss": 0.0975, | |
| "step": 57138 | |
| }, | |
| { | |
| "epoch": 38.824287652645864, | |
| "grad_norm": 1.0015143156051636, | |
| "learning_rate": 2.9259386018414396e-05, | |
| "loss": 0.1054, | |
| "step": 57227 | |
| }, | |
| { | |
| "epoch": 38.884667571234736, | |
| "grad_norm": 0.8103719353675842, | |
| "learning_rate": 2.9111213747063915e-05, | |
| "loss": 0.1004, | |
| "step": 57316 | |
| }, | |
| { | |
| "epoch": 38.94504748982361, | |
| "grad_norm": 2.045173406600952, | |
| "learning_rate": 2.896326337696098e-05, | |
| "loss": 0.0993, | |
| "step": 57405 | |
| }, | |
| { | |
| "epoch": 39.00542740841248, | |
| "grad_norm": 0.9834128022193909, | |
| "learning_rate": 2.8815536479780014e-05, | |
| "loss": 0.0971, | |
| "step": 57494 | |
| }, | |
| { | |
| "epoch": 39.06580732700136, | |
| "grad_norm": 0.6491034030914307, | |
| "learning_rate": 2.8668034624821514e-05, | |
| "loss": 0.0957, | |
| "step": 57583 | |
| }, | |
| { | |
| "epoch": 39.12618724559023, | |
| "grad_norm": 1.0920275449752808, | |
| "learning_rate": 2.852075937899541e-05, | |
| "loss": 0.0938, | |
| "step": 57672 | |
| }, | |
| { | |
| "epoch": 39.1865671641791, | |
| "grad_norm": 0.9111031293869019, | |
| "learning_rate": 2.8373712306804267e-05, | |
| "loss": 0.0954, | |
| "step": 57761 | |
| }, | |
| { | |
| "epoch": 39.24694708276798, | |
| "grad_norm": 0.7507003545761108, | |
| "learning_rate": 2.8226894970326856e-05, | |
| "loss": 0.0926, | |
| "step": 57850 | |
| }, | |
| { | |
| "epoch": 39.30732700135685, | |
| "grad_norm": 1.0884746313095093, | |
| "learning_rate": 2.8080308929201392e-05, | |
| "loss": 0.0946, | |
| "step": 57939 | |
| }, | |
| { | |
| "epoch": 39.367706919945725, | |
| "grad_norm": 0.7752851843833923, | |
| "learning_rate": 2.793395574060911e-05, | |
| "loss": 0.0925, | |
| "step": 58028 | |
| }, | |
| { | |
| "epoch": 39.4280868385346, | |
| "grad_norm": 0.8282026052474976, | |
| "learning_rate": 2.7787836959257617e-05, | |
| "loss": 0.0954, | |
| "step": 58117 | |
| }, | |
| { | |
| "epoch": 39.488466757123476, | |
| "grad_norm": 0.7554723620414734, | |
| "learning_rate": 2.764195413736444e-05, | |
| "loss": 0.0965, | |
| "step": 58206 | |
| }, | |
| { | |
| "epoch": 39.54884667571235, | |
| "grad_norm": 1.461937427520752, | |
| "learning_rate": 2.7496308824640505e-05, | |
| "loss": 0.0963, | |
| "step": 58295 | |
| }, | |
| { | |
| "epoch": 39.60922659430122, | |
| "grad_norm": 1.260448694229126, | |
| "learning_rate": 2.735090256827365e-05, | |
| "loss": 0.0901, | |
| "step": 58384 | |
| }, | |
| { | |
| "epoch": 39.6696065128901, | |
| "grad_norm": 0.5917372703552246, | |
| "learning_rate": 2.720573691291226e-05, | |
| "loss": 0.0912, | |
| "step": 58473 | |
| }, | |
| { | |
| "epoch": 39.72998643147897, | |
| "grad_norm": 1.0899447202682495, | |
| "learning_rate": 2.70608134006488e-05, | |
| "loss": 0.0971, | |
| "step": 58562 | |
| }, | |
| { | |
| "epoch": 39.79036635006784, | |
| "grad_norm": 0.700945258140564, | |
| "learning_rate": 2.691613357100348e-05, | |
| "loss": 0.0959, | |
| "step": 58651 | |
| }, | |
| { | |
| "epoch": 39.850746268656714, | |
| "grad_norm": 0.563937783241272, | |
| "learning_rate": 2.6771698960907844e-05, | |
| "loss": 0.0924, | |
| "step": 58740 | |
| }, | |
| { | |
| "epoch": 39.91112618724559, | |
| "grad_norm": 1.2287607192993164, | |
| "learning_rate": 2.6627511104688463e-05, | |
| "loss": 0.0915, | |
| "step": 58829 | |
| }, | |
| { | |
| "epoch": 39.971506105834465, | |
| "grad_norm": 1.0432151556015015, | |
| "learning_rate": 2.6483571534050684e-05, | |
| "loss": 0.094, | |
| "step": 58918 | |
| }, | |
| { | |
| "epoch": 40.031886024423336, | |
| "grad_norm": 0.9087603092193604, | |
| "learning_rate": 2.6339881778062286e-05, | |
| "loss": 0.0914, | |
| "step": 59007 | |
| }, | |
| { | |
| "epoch": 40.09226594301221, | |
| "grad_norm": 1.0434340238571167, | |
| "learning_rate": 2.6196443363137295e-05, | |
| "loss": 0.0932, | |
| "step": 59096 | |
| }, | |
| { | |
| "epoch": 40.15264586160109, | |
| "grad_norm": 1.4416966438293457, | |
| "learning_rate": 2.6053257813019756e-05, | |
| "loss": 0.0951, | |
| "step": 59185 | |
| }, | |
| { | |
| "epoch": 40.21302578018996, | |
| "grad_norm": 0.5194874405860901, | |
| "learning_rate": 2.5910326648767464e-05, | |
| "loss": 0.0909, | |
| "step": 59274 | |
| }, | |
| { | |
| "epoch": 40.27340569877883, | |
| "grad_norm": 0.4782836139202118, | |
| "learning_rate": 2.5767651388735976e-05, | |
| "loss": 0.0917, | |
| "step": 59363 | |
| }, | |
| { | |
| "epoch": 40.33378561736771, | |
| "grad_norm": 0.7723681926727295, | |
| "learning_rate": 2.5625233548562288e-05, | |
| "loss": 0.0928, | |
| "step": 59452 | |
| }, | |
| { | |
| "epoch": 40.39416553595658, | |
| "grad_norm": 0.5637179017066956, | |
| "learning_rate": 2.5483074641148896e-05, | |
| "loss": 0.095, | |
| "step": 59541 | |
| }, | |
| { | |
| "epoch": 40.45454545454545, | |
| "grad_norm": 0.9517094492912292, | |
| "learning_rate": 2.534117617664766e-05, | |
| "loss": 0.0857, | |
| "step": 59630 | |
| }, | |
| { | |
| "epoch": 40.514925373134325, | |
| "grad_norm": 1.0360537767410278, | |
| "learning_rate": 2.5199539662443683e-05, | |
| "loss": 0.0923, | |
| "step": 59719 | |
| }, | |
| { | |
| "epoch": 40.575305291723204, | |
| "grad_norm": 0.993859589099884, | |
| "learning_rate": 2.5058166603139453e-05, | |
| "loss": 0.0918, | |
| "step": 59808 | |
| }, | |
| { | |
| "epoch": 40.635685210312076, | |
| "grad_norm": 0.5905105471611023, | |
| "learning_rate": 2.491705850053876e-05, | |
| "loss": 0.0914, | |
| "step": 59897 | |
| }, | |
| { | |
| "epoch": 40.69606512890095, | |
| "grad_norm": 1.8507524728775024, | |
| "learning_rate": 2.4776216853630747e-05, | |
| "loss": 0.0948, | |
| "step": 59986 | |
| }, | |
| { | |
| "epoch": 40.75644504748983, | |
| "grad_norm": 0.8569918274879456, | |
| "learning_rate": 2.4635643158574034e-05, | |
| "loss": 0.0933, | |
| "step": 60075 | |
| }, | |
| { | |
| "epoch": 40.80461329715061, | |
| "eval_accuracy": 0.17578125, | |
| "eval_loss": 4.287986755371094, | |
| "eval_runtime": 29.4248, | |
| "eval_samples_per_second": 17.4, | |
| "eval_steps_per_second": 0.136, | |
| "step": 60146 | |
| }, | |
| { | |
| "epoch": 40.8168249660787, | |
| "grad_norm": 1.2466926574707031, | |
| "learning_rate": 2.4495338908680733e-05, | |
| "loss": 0.0884, | |
| "step": 60164 | |
| }, | |
| { | |
| "epoch": 40.87720488466757, | |
| "grad_norm": 1.0967109203338623, | |
| "learning_rate": 2.4355305594400703e-05, | |
| "loss": 0.0885, | |
| "step": 60253 | |
| }, | |
| { | |
| "epoch": 40.93758480325644, | |
| "grad_norm": 3.7335941791534424, | |
| "learning_rate": 2.4215544703305624e-05, | |
| "loss": 0.0863, | |
| "step": 60342 | |
| }, | |
| { | |
| "epoch": 40.99796472184532, | |
| "grad_norm": 0.7128244638442993, | |
| "learning_rate": 2.4076057720073263e-05, | |
| "loss": 0.0916, | |
| "step": 60431 | |
| }, | |
| { | |
| "epoch": 41.05834464043419, | |
| "grad_norm": 0.6948025226593018, | |
| "learning_rate": 2.393684612647165e-05, | |
| "loss": 0.0907, | |
| "step": 60520 | |
| }, | |
| { | |
| "epoch": 41.118724559023065, | |
| "grad_norm": 0.9347543716430664, | |
| "learning_rate": 2.3797911401343324e-05, | |
| "loss": 0.0863, | |
| "step": 60609 | |
| }, | |
| { | |
| "epoch": 41.17910447761194, | |
| "grad_norm": 0.6577604413032532, | |
| "learning_rate": 2.3659255020589693e-05, | |
| "loss": 0.0893, | |
| "step": 60698 | |
| }, | |
| { | |
| "epoch": 41.239484396200815, | |
| "grad_norm": 1.0613411664962769, | |
| "learning_rate": 2.3520878457155317e-05, | |
| "loss": 0.0907, | |
| "step": 60787 | |
| }, | |
| { | |
| "epoch": 41.29986431478969, | |
| "grad_norm": 0.7223649024963379, | |
| "learning_rate": 2.338278318101224e-05, | |
| "loss": 0.0858, | |
| "step": 60876 | |
| }, | |
| { | |
| "epoch": 41.36024423337856, | |
| "grad_norm": 0.6473923325538635, | |
| "learning_rate": 2.3244970659144434e-05, | |
| "loss": 0.0881, | |
| "step": 60965 | |
| }, | |
| { | |
| "epoch": 41.42062415196744, | |
| "grad_norm": 0.6310983300209045, | |
| "learning_rate": 2.3107442355532105e-05, | |
| "loss": 0.0866, | |
| "step": 61054 | |
| }, | |
| { | |
| "epoch": 41.48100407055631, | |
| "grad_norm": 1.2830203771591187, | |
| "learning_rate": 2.2970199731136305e-05, | |
| "loss": 0.0882, | |
| "step": 61143 | |
| }, | |
| { | |
| "epoch": 41.54138398914518, | |
| "grad_norm": 0.6028885245323181, | |
| "learning_rate": 2.2833244243883222e-05, | |
| "loss": 0.0861, | |
| "step": 61232 | |
| }, | |
| { | |
| "epoch": 41.60176390773406, | |
| "grad_norm": 1.1787885427474976, | |
| "learning_rate": 2.2696577348648867e-05, | |
| "loss": 0.0897, | |
| "step": 61321 | |
| }, | |
| { | |
| "epoch": 41.66214382632293, | |
| "grad_norm": 0.5341454148292542, | |
| "learning_rate": 2.2560200497243537e-05, | |
| "loss": 0.0871, | |
| "step": 61410 | |
| }, | |
| { | |
| "epoch": 41.722523744911804, | |
| "grad_norm": 1.4164313077926636, | |
| "learning_rate": 2.2424115138396336e-05, | |
| "loss": 0.0924, | |
| "step": 61499 | |
| }, | |
| { | |
| "epoch": 41.782903663500676, | |
| "grad_norm": 0.7035442590713501, | |
| "learning_rate": 2.2288322717739912e-05, | |
| "loss": 0.088, | |
| "step": 61588 | |
| }, | |
| { | |
| "epoch": 41.843283582089555, | |
| "grad_norm": 0.6574503779411316, | |
| "learning_rate": 2.2152824677795003e-05, | |
| "loss": 0.0868, | |
| "step": 61677 | |
| }, | |
| { | |
| "epoch": 41.90366350067843, | |
| "grad_norm": 0.4766522943973541, | |
| "learning_rate": 2.201762245795516e-05, | |
| "loss": 0.0887, | |
| "step": 61766 | |
| }, | |
| { | |
| "epoch": 41.9640434192673, | |
| "grad_norm": 2.5811030864715576, | |
| "learning_rate": 2.188271749447146e-05, | |
| "loss": 0.0872, | |
| "step": 61855 | |
| }, | |
| { | |
| "epoch": 42.02442333785617, | |
| "grad_norm": 0.7208371758460999, | |
| "learning_rate": 2.1748111220437163e-05, | |
| "loss": 0.0825, | |
| "step": 61944 | |
| }, | |
| { | |
| "epoch": 42.08480325644505, | |
| "grad_norm": 0.7155792713165283, | |
| "learning_rate": 2.161380506577262e-05, | |
| "loss": 0.0913, | |
| "step": 62033 | |
| }, | |
| { | |
| "epoch": 42.14518317503392, | |
| "grad_norm": 0.7777039408683777, | |
| "learning_rate": 2.147980045720999e-05, | |
| "loss": 0.0837, | |
| "step": 62122 | |
| }, | |
| { | |
| "epoch": 42.20556309362279, | |
| "grad_norm": 0.5456185340881348, | |
| "learning_rate": 2.134609881827813e-05, | |
| "loss": 0.0825, | |
| "step": 62211 | |
| }, | |
| { | |
| "epoch": 42.26594301221167, | |
| "grad_norm": 0.614791750907898, | |
| "learning_rate": 2.1212701569287463e-05, | |
| "loss": 0.078, | |
| "step": 62300 | |
| }, | |
| { | |
| "epoch": 42.32632293080054, | |
| "grad_norm": 0.9303745627403259, | |
| "learning_rate": 2.1079610127314827e-05, | |
| "loss": 0.0815, | |
| "step": 62389 | |
| }, | |
| { | |
| "epoch": 42.386702849389415, | |
| "grad_norm": 0.6811819672584534, | |
| "learning_rate": 2.094682590618852e-05, | |
| "loss": 0.0842, | |
| "step": 62478 | |
| }, | |
| { | |
| "epoch": 42.44708276797829, | |
| "grad_norm": 0.7549321055412292, | |
| "learning_rate": 2.081435031647326e-05, | |
| "loss": 0.0834, | |
| "step": 62567 | |
| }, | |
| { | |
| "epoch": 42.507462686567166, | |
| "grad_norm": 0.6525147557258606, | |
| "learning_rate": 2.0682184765455143e-05, | |
| "loss": 0.0853, | |
| "step": 62656 | |
| }, | |
| { | |
| "epoch": 42.56784260515604, | |
| "grad_norm": 0.7095387578010559, | |
| "learning_rate": 2.0550330657126715e-05, | |
| "loss": 0.0873, | |
| "step": 62745 | |
| }, | |
| { | |
| "epoch": 42.62822252374491, | |
| "grad_norm": 0.8400213122367859, | |
| "learning_rate": 2.041878939217211e-05, | |
| "loss": 0.0875, | |
| "step": 62834 | |
| }, | |
| { | |
| "epoch": 42.68860244233379, | |
| "grad_norm": 0.9360200762748718, | |
| "learning_rate": 2.028756236795213e-05, | |
| "loss": 0.0843, | |
| "step": 62923 | |
| }, | |
| { | |
| "epoch": 42.74898236092266, | |
| "grad_norm": 0.5572984218597412, | |
| "learning_rate": 2.015665097848935e-05, | |
| "loss": 0.0813, | |
| "step": 63012 | |
| }, | |
| { | |
| "epoch": 42.80936227951153, | |
| "grad_norm": 0.9234522581100464, | |
| "learning_rate": 2.002605661445342e-05, | |
| "loss": 0.083, | |
| "step": 63101 | |
| }, | |
| { | |
| "epoch": 42.869742198100404, | |
| "grad_norm": 0.5887913107872009, | |
| "learning_rate": 1.989578066314623e-05, | |
| "loss": 0.0839, | |
| "step": 63190 | |
| }, | |
| { | |
| "epoch": 42.93012211668928, | |
| "grad_norm": 0.8760083913803101, | |
| "learning_rate": 1.9765824508487125e-05, | |
| "loss": 0.085, | |
| "step": 63279 | |
| }, | |
| { | |
| "epoch": 42.990502035278155, | |
| "grad_norm": 0.7094123959541321, | |
| "learning_rate": 1.9636189530998307e-05, | |
| "loss": 0.0798, | |
| "step": 63368 | |
| }, | |
| { | |
| "epoch": 43.05088195386703, | |
| "grad_norm": 0.5656801462173462, | |
| "learning_rate": 1.95068771077901e-05, | |
| "loss": 0.0785, | |
| "step": 63457 | |
| }, | |
| { | |
| "epoch": 43.111261872455906, | |
| "grad_norm": 0.8483113646507263, | |
| "learning_rate": 1.937788861254634e-05, | |
| "loss": 0.081, | |
| "step": 63546 | |
| }, | |
| { | |
| "epoch": 43.17164179104478, | |
| "grad_norm": 0.5962135791778564, | |
| "learning_rate": 1.9249225415509807e-05, | |
| "loss": 0.0832, | |
| "step": 63635 | |
| }, | |
| { | |
| "epoch": 43.204884667571235, | |
| "eval_accuracy": 0.177734375, | |
| "eval_loss": 4.267856597900391, | |
| "eval_runtime": 19.6828, | |
| "eval_samples_per_second": 26.013, | |
| "eval_steps_per_second": 0.203, | |
| "step": 63684 | |
| }, | |
| { | |
| "epoch": 43.23202170963365, | |
| "grad_norm": 1.1795192956924438, | |
| "learning_rate": 1.9120888883467574e-05, | |
| "loss": 0.0881, | |
| "step": 63724 | |
| }, | |
| { | |
| "epoch": 43.29240162822252, | |
| "grad_norm": 1.2301242351531982, | |
| "learning_rate": 1.899288037973662e-05, | |
| "loss": 0.0779, | |
| "step": 63813 | |
| }, | |
| { | |
| "epoch": 43.3527815468114, | |
| "grad_norm": 0.5368560552597046, | |
| "learning_rate": 1.8865201264149267e-05, | |
| "loss": 0.0793, | |
| "step": 63902 | |
| }, | |
| { | |
| "epoch": 43.41316146540027, | |
| "grad_norm": 1.1031700372695923, | |
| "learning_rate": 1.873785289303875e-05, | |
| "loss": 0.0824, | |
| "step": 63991 | |
| }, | |
| { | |
| "epoch": 43.473541383989144, | |
| "grad_norm": 0.9713082313537598, | |
| "learning_rate": 1.861083661922482e-05, | |
| "loss": 0.0766, | |
| "step": 64080 | |
| }, | |
| { | |
| "epoch": 43.53392130257802, | |
| "grad_norm": 0.681328296661377, | |
| "learning_rate": 1.8484153791999326e-05, | |
| "loss": 0.0799, | |
| "step": 64169 | |
| }, | |
| { | |
| "epoch": 43.594301221166894, | |
| "grad_norm": 0.8199315071105957, | |
| "learning_rate": 1.8357805757111966e-05, | |
| "loss": 0.0811, | |
| "step": 64258 | |
| }, | |
| { | |
| "epoch": 43.654681139755766, | |
| "grad_norm": 1.327650785446167, | |
| "learning_rate": 1.823179385675593e-05, | |
| "loss": 0.08, | |
| "step": 64347 | |
| }, | |
| { | |
| "epoch": 43.71506105834464, | |
| "grad_norm": 0.9341023564338684, | |
| "learning_rate": 1.810611942955365e-05, | |
| "loss": 0.0787, | |
| "step": 64436 | |
| }, | |
| { | |
| "epoch": 43.77544097693352, | |
| "grad_norm": 0.5767560601234436, | |
| "learning_rate": 1.7980783810542577e-05, | |
| "loss": 0.0812, | |
| "step": 64525 | |
| }, | |
| { | |
| "epoch": 43.83582089552239, | |
| "grad_norm": 0.5114635229110718, | |
| "learning_rate": 1.785578833116104e-05, | |
| "loss": 0.0823, | |
| "step": 64614 | |
| }, | |
| { | |
| "epoch": 43.89620081411126, | |
| "grad_norm": 0.5436065196990967, | |
| "learning_rate": 1.7731134319234016e-05, | |
| "loss": 0.0819, | |
| "step": 64703 | |
| }, | |
| { | |
| "epoch": 43.95658073270013, | |
| "grad_norm": 0.4684976041316986, | |
| "learning_rate": 1.760682309895913e-05, | |
| "loss": 0.0842, | |
| "step": 64792 | |
| }, | |
| { | |
| "epoch": 44.01696065128901, | |
| "grad_norm": 1.0648964643478394, | |
| "learning_rate": 1.7482855990892517e-05, | |
| "loss": 0.0822, | |
| "step": 64881 | |
| }, | |
| { | |
| "epoch": 44.07734056987788, | |
| "grad_norm": 0.6211819648742676, | |
| "learning_rate": 1.735923431193483e-05, | |
| "loss": 0.0797, | |
| "step": 64970 | |
| }, | |
| { | |
| "epoch": 44.137720488466755, | |
| "grad_norm": 0.4334025979042053, | |
| "learning_rate": 1.7235959375317185e-05, | |
| "loss": 0.0759, | |
| "step": 65059 | |
| }, | |
| { | |
| "epoch": 44.198100407055634, | |
| "grad_norm": 1.0753127336502075, | |
| "learning_rate": 1.711303249058731e-05, | |
| "loss": 0.0756, | |
| "step": 65148 | |
| }, | |
| { | |
| "epoch": 44.258480325644506, | |
| "grad_norm": 0.6846993565559387, | |
| "learning_rate": 1.6990454963595577e-05, | |
| "loss": 0.0795, | |
| "step": 65237 | |
| }, | |
| { | |
| "epoch": 44.31886024423338, | |
| "grad_norm": 0.44795066118240356, | |
| "learning_rate": 1.6868228096481104e-05, | |
| "loss": 0.0815, | |
| "step": 65326 | |
| }, | |
| { | |
| "epoch": 44.37924016282225, | |
| "grad_norm": 1.4556400775909424, | |
| "learning_rate": 1.674635318765801e-05, | |
| "loss": 0.0789, | |
| "step": 65415 | |
| }, | |
| { | |
| "epoch": 44.43962008141113, | |
| "grad_norm": 0.6817762851715088, | |
| "learning_rate": 1.66248315318015e-05, | |
| "loss": 0.0734, | |
| "step": 65504 | |
| }, | |
| { | |
| "epoch": 44.5, | |
| "grad_norm": 0.8499571681022644, | |
| "learning_rate": 1.6503664419834215e-05, | |
| "loss": 0.0798, | |
| "step": 65593 | |
| }, | |
| { | |
| "epoch": 44.56037991858887, | |
| "grad_norm": 0.5608311891555786, | |
| "learning_rate": 1.6382853138912485e-05, | |
| "loss": 0.0759, | |
| "step": 65682 | |
| }, | |
| { | |
| "epoch": 44.62075983717775, | |
| "grad_norm": 1.1510560512542725, | |
| "learning_rate": 1.6262398972412644e-05, | |
| "loss": 0.0774, | |
| "step": 65771 | |
| }, | |
| { | |
| "epoch": 44.68113975576662, | |
| "grad_norm": 0.591827392578125, | |
| "learning_rate": 1.614230319991743e-05, | |
| "loss": 0.0827, | |
| "step": 65860 | |
| }, | |
| { | |
| "epoch": 44.741519674355494, | |
| "grad_norm": 0.7560341358184814, | |
| "learning_rate": 1.60225670972023e-05, | |
| "loss": 0.0752, | |
| "step": 65949 | |
| }, | |
| { | |
| "epoch": 44.801899592944366, | |
| "grad_norm": 1.0043483972549438, | |
| "learning_rate": 1.5903191936222016e-05, | |
| "loss": 0.0794, | |
| "step": 66038 | |
| }, | |
| { | |
| "epoch": 44.862279511533245, | |
| "grad_norm": 1.1438446044921875, | |
| "learning_rate": 1.5784178985097024e-05, | |
| "loss": 0.08, | |
| "step": 66127 | |
| }, | |
| { | |
| "epoch": 44.92265943012212, | |
| "grad_norm": 0.39144688844680786, | |
| "learning_rate": 1.5665529508100052e-05, | |
| "loss": 0.0729, | |
| "step": 66216 | |
| }, | |
| { | |
| "epoch": 44.98303934871099, | |
| "grad_norm": 0.7558673620223999, | |
| "learning_rate": 1.5547244765642588e-05, | |
| "loss": 0.0759, | |
| "step": 66305 | |
| }, | |
| { | |
| "epoch": 45.04341926729987, | |
| "grad_norm": 0.6959690451622009, | |
| "learning_rate": 1.5429326014261632e-05, | |
| "loss": 0.0817, | |
| "step": 66394 | |
| }, | |
| { | |
| "epoch": 45.10379918588874, | |
| "grad_norm": 0.41576260328292847, | |
| "learning_rate": 1.531177450660618e-05, | |
| "loss": 0.0738, | |
| "step": 66483 | |
| }, | |
| { | |
| "epoch": 45.16417910447761, | |
| "grad_norm": 0.8202412724494934, | |
| "learning_rate": 1.5194591491424064e-05, | |
| "loss": 0.0763, | |
| "step": 66572 | |
| }, | |
| { | |
| "epoch": 45.22455902306648, | |
| "grad_norm": 1.1920087337493896, | |
| "learning_rate": 1.5077778213548622e-05, | |
| "loss": 0.0756, | |
| "step": 66661 | |
| }, | |
| { | |
| "epoch": 45.28493894165536, | |
| "grad_norm": 0.6442920565605164, | |
| "learning_rate": 1.496133591388547e-05, | |
| "loss": 0.0795, | |
| "step": 66750 | |
| }, | |
| { | |
| "epoch": 45.345318860244234, | |
| "grad_norm": 0.7332776784896851, | |
| "learning_rate": 1.4845265829399296e-05, | |
| "loss": 0.0766, | |
| "step": 66839 | |
| }, | |
| { | |
| "epoch": 45.405698778833106, | |
| "grad_norm": 0.887069821357727, | |
| "learning_rate": 1.4729569193100795e-05, | |
| "loss": 0.0756, | |
| "step": 66928 | |
| }, | |
| { | |
| "epoch": 45.46607869742198, | |
| "grad_norm": 0.6151465177536011, | |
| "learning_rate": 1.4614247234033518e-05, | |
| "loss": 0.0793, | |
| "step": 67017 | |
| }, | |
| { | |
| "epoch": 45.526458616010856, | |
| "grad_norm": 0.7770605087280273, | |
| "learning_rate": 1.449930117726081e-05, | |
| "loss": 0.0793, | |
| "step": 67106 | |
| }, | |
| { | |
| "epoch": 45.58683853459973, | |
| "grad_norm": 0.5736819505691528, | |
| "learning_rate": 1.438473224385285e-05, | |
| "loss": 0.0728, | |
| "step": 67195 | |
| }, | |
| { | |
| "epoch": 45.60515603799186, | |
| "eval_accuracy": 0.185546875, | |
| "eval_loss": 4.269733428955078, | |
| "eval_runtime": 40.0503, | |
| "eval_samples_per_second": 12.784, | |
| "eval_steps_per_second": 0.1, | |
| "step": 67222 | |
| }, | |
| { | |
| "epoch": 45.6472184531886, | |
| "grad_norm": 0.4615430533885956, | |
| "learning_rate": 1.4270541650873582e-05, | |
| "loss": 0.0706, | |
| "step": 67284 | |
| }, | |
| { | |
| "epoch": 45.70759837177748, | |
| "grad_norm": 0.7554183006286621, | |
| "learning_rate": 1.415673061136788e-05, | |
| "loss": 0.0788, | |
| "step": 67373 | |
| }, | |
| { | |
| "epoch": 45.76797829036635, | |
| "grad_norm": 0.6309983134269714, | |
| "learning_rate": 1.4043300334348641e-05, | |
| "loss": 0.0779, | |
| "step": 67462 | |
| }, | |
| { | |
| "epoch": 45.82835820895522, | |
| "grad_norm": 0.4782220125198364, | |
| "learning_rate": 1.3930252024783903e-05, | |
| "loss": 0.0769, | |
| "step": 67551 | |
| }, | |
| { | |
| "epoch": 45.888738127544094, | |
| "grad_norm": 0.5289342403411865, | |
| "learning_rate": 1.3817586883584094e-05, | |
| "loss": 0.0768, | |
| "step": 67640 | |
| }, | |
| { | |
| "epoch": 45.94911804613297, | |
| "grad_norm": 0.5275683403015137, | |
| "learning_rate": 1.370530610758921e-05, | |
| "loss": 0.0743, | |
| "step": 67729 | |
| }, | |
| { | |
| "epoch": 46.009497964721845, | |
| "grad_norm": 0.3685113787651062, | |
| "learning_rate": 1.359341088955618e-05, | |
| "loss": 0.0734, | |
| "step": 67818 | |
| }, | |
| { | |
| "epoch": 46.06987788331072, | |
| "grad_norm": 0.6584441661834717, | |
| "learning_rate": 1.3481902418146154e-05, | |
| "loss": 0.0742, | |
| "step": 67907 | |
| }, | |
| { | |
| "epoch": 46.130257801899596, | |
| "grad_norm": 0.7138823866844177, | |
| "learning_rate": 1.3370781877911842e-05, | |
| "loss": 0.0695, | |
| "step": 67996 | |
| }, | |
| { | |
| "epoch": 46.19063772048847, | |
| "grad_norm": 0.39327022433280945, | |
| "learning_rate": 1.326005044928501e-05, | |
| "loss": 0.0717, | |
| "step": 68085 | |
| }, | |
| { | |
| "epoch": 46.25101763907734, | |
| "grad_norm": 0.4522133469581604, | |
| "learning_rate": 1.3149709308563901e-05, | |
| "loss": 0.0749, | |
| "step": 68174 | |
| }, | |
| { | |
| "epoch": 46.31139755766621, | |
| "grad_norm": 0.6930340528488159, | |
| "learning_rate": 1.3039759627900672e-05, | |
| "loss": 0.074, | |
| "step": 68263 | |
| }, | |
| { | |
| "epoch": 46.37177747625509, | |
| "grad_norm": 2.3860812187194824, | |
| "learning_rate": 1.293020257528908e-05, | |
| "loss": 0.0756, | |
| "step": 68352 | |
| }, | |
| { | |
| "epoch": 46.43215739484396, | |
| "grad_norm": 0.8091538548469543, | |
| "learning_rate": 1.2821039314551958e-05, | |
| "loss": 0.0765, | |
| "step": 68441 | |
| }, | |
| { | |
| "epoch": 46.492537313432834, | |
| "grad_norm": 0.6948747038841248, | |
| "learning_rate": 1.2712271005328924e-05, | |
| "loss": 0.0746, | |
| "step": 68530 | |
| }, | |
| { | |
| "epoch": 46.55291723202171, | |
| "grad_norm": 1.2013221979141235, | |
| "learning_rate": 1.260389880306399e-05, | |
| "loss": 0.0709, | |
| "step": 68619 | |
| }, | |
| { | |
| "epoch": 46.613297150610585, | |
| "grad_norm": 1.0223325490951538, | |
| "learning_rate": 1.2495923858993364e-05, | |
| "loss": 0.076, | |
| "step": 68708 | |
| }, | |
| { | |
| "epoch": 46.67367706919946, | |
| "grad_norm": 0.7184458374977112, | |
| "learning_rate": 1.2388347320133182e-05, | |
| "loss": 0.0684, | |
| "step": 68797 | |
| }, | |
| { | |
| "epoch": 46.73405698778833, | |
| "grad_norm": 0.4814877510070801, | |
| "learning_rate": 1.2281170329267322e-05, | |
| "loss": 0.0724, | |
| "step": 68886 | |
| }, | |
| { | |
| "epoch": 46.79443690637721, | |
| "grad_norm": 0.5036719441413879, | |
| "learning_rate": 1.2174394024935281e-05, | |
| "loss": 0.0704, | |
| "step": 68975 | |
| }, | |
| { | |
| "epoch": 46.85481682496608, | |
| "grad_norm": 0.5806756019592285, | |
| "learning_rate": 1.2068019541420033e-05, | |
| "loss": 0.0723, | |
| "step": 69064 | |
| }, | |
| { | |
| "epoch": 46.91519674355495, | |
| "grad_norm": 1.2670601606369019, | |
| "learning_rate": 1.1962048008736053e-05, | |
| "loss": 0.0706, | |
| "step": 69153 | |
| }, | |
| { | |
| "epoch": 46.97557666214383, | |
| "grad_norm": 0.5702329277992249, | |
| "learning_rate": 1.1856480552617272e-05, | |
| "loss": 0.0702, | |
| "step": 69242 | |
| }, | |
| { | |
| "epoch": 47.0359565807327, | |
| "grad_norm": 0.49773919582366943, | |
| "learning_rate": 1.1751318294505104e-05, | |
| "loss": 0.0738, | |
| "step": 69331 | |
| }, | |
| { | |
| "epoch": 47.09633649932157, | |
| "grad_norm": 0.5580993294715881, | |
| "learning_rate": 1.1646562351536589e-05, | |
| "loss": 0.0714, | |
| "step": 69420 | |
| }, | |
| { | |
| "epoch": 47.156716417910445, | |
| "grad_norm": 0.47159460186958313, | |
| "learning_rate": 1.1542213836532417e-05, | |
| "loss": 0.0736, | |
| "step": 69509 | |
| }, | |
| { | |
| "epoch": 47.217096336499324, | |
| "grad_norm": 0.6028949618339539, | |
| "learning_rate": 1.1438273857985244e-05, | |
| "loss": 0.0748, | |
| "step": 69598 | |
| }, | |
| { | |
| "epoch": 47.277476255088196, | |
| "grad_norm": 0.7113878130912781, | |
| "learning_rate": 1.1334743520047836e-05, | |
| "loss": 0.0753, | |
| "step": 69687 | |
| }, | |
| { | |
| "epoch": 47.33785617367707, | |
| "grad_norm": 0.3303639888763428, | |
| "learning_rate": 1.1231623922521317e-05, | |
| "loss": 0.0716, | |
| "step": 69776 | |
| }, | |
| { | |
| "epoch": 47.39823609226594, | |
| "grad_norm": 1.0966421365737915, | |
| "learning_rate": 1.1128916160843578e-05, | |
| "loss": 0.0733, | |
| "step": 69865 | |
| }, | |
| { | |
| "epoch": 47.45861601085482, | |
| "grad_norm": 0.575943648815155, | |
| "learning_rate": 1.1026621326077525e-05, | |
| "loss": 0.0725, | |
| "step": 69954 | |
| }, | |
| { | |
| "epoch": 47.51899592944369, | |
| "grad_norm": 0.8768503665924072, | |
| "learning_rate": 1.0924740504899584e-05, | |
| "loss": 0.0704, | |
| "step": 70043 | |
| }, | |
| { | |
| "epoch": 47.57937584803256, | |
| "grad_norm": 0.6844857931137085, | |
| "learning_rate": 1.0823274779588122e-05, | |
| "loss": 0.0746, | |
| "step": 70132 | |
| }, | |
| { | |
| "epoch": 47.63975576662144, | |
| "grad_norm": 0.5367492437362671, | |
| "learning_rate": 1.0722225228011946e-05, | |
| "loss": 0.0714, | |
| "step": 70221 | |
| }, | |
| { | |
| "epoch": 47.70013568521031, | |
| "grad_norm": 0.5591740012168884, | |
| "learning_rate": 1.0621592923618856e-05, | |
| "loss": 0.0662, | |
| "step": 70310 | |
| }, | |
| { | |
| "epoch": 47.760515603799185, | |
| "grad_norm": 0.4710708558559418, | |
| "learning_rate": 1.0521378935424214e-05, | |
| "loss": 0.0743, | |
| "step": 70399 | |
| }, | |
| { | |
| "epoch": 47.82089552238806, | |
| "grad_norm": 0.7445366382598877, | |
| "learning_rate": 1.0421584327999651e-05, | |
| "loss": 0.0689, | |
| "step": 70488 | |
| }, | |
| { | |
| "epoch": 47.881275440976935, | |
| "grad_norm": 0.8262448906898499, | |
| "learning_rate": 1.0322210161461715e-05, | |
| "loss": 0.0763, | |
| "step": 70577 | |
| }, | |
| { | |
| "epoch": 47.94165535956581, | |
| "grad_norm": 0.5951725840568542, | |
| "learning_rate": 1.0223257491460608e-05, | |
| "loss": 0.0706, | |
| "step": 70666 | |
| }, | |
| { | |
| "epoch": 48.00203527815468, | |
| "grad_norm": 0.6799793243408203, | |
| "learning_rate": 1.0124727369169002e-05, | |
| "loss": 0.074, | |
| "step": 70755 | |
| }, | |
| { | |
| "epoch": 48.00542740841248, | |
| "eval_accuracy": 0.18359375, | |
| "eval_loss": 4.300548553466797, | |
| "eval_runtime": 21.4171, | |
| "eval_samples_per_second": 23.906, | |
| "eval_steps_per_second": 0.187, | |
| "step": 70760 | |
| }, | |
| { | |
| "epoch": 48.06241519674356, | |
| "grad_norm": 0.7727463841438293, | |
| "learning_rate": 1.0026620841270807e-05, | |
| "loss": 0.0711, | |
| "step": 70844 | |
| }, | |
| { | |
| "epoch": 48.12279511533243, | |
| "grad_norm": 0.641099214553833, | |
| "learning_rate": 9.928938949950133e-06, | |
| "loss": 0.0716, | |
| "step": 70933 | |
| } | |
| ], | |
| "logging_steps": 89, | |
| "max_steps": 88440, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 60, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2159619256203346e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |