diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4342 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.3161361141602634, + "eval_steps": 50, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0021953896816684962, + "grad_norm": 163.60501272566069, + "learning_rate": 0.0, + "loss": 1.3923, + "step": 1 + }, + { + "epoch": 0.0043907793633369925, + "grad_norm": 170.58640230921233, + "learning_rate": 7.2992700729927e-09, + "loss": 1.281, + "step": 2 + }, + { + "epoch": 0.006586169045005488, + "grad_norm": 131.10623897546836, + "learning_rate": 1.45985401459854e-08, + "loss": 1.3551, + "step": 3 + }, + { + "epoch": 0.008781558726673985, + "grad_norm": 178.96806532547603, + "learning_rate": 2.1897810218978102e-08, + "loss": 1.3577, + "step": 4 + }, + { + "epoch": 0.010976948408342482, + "grad_norm": 132.31431642741137, + "learning_rate": 2.91970802919708e-08, + "loss": 1.3135, + "step": 5 + }, + { + "epoch": 0.013172338090010977, + "grad_norm": 135.62398872133667, + "learning_rate": 3.64963503649635e-08, + "loss": 1.305, + "step": 6 + }, + { + "epoch": 0.015367727771679473, + "grad_norm": 128.75782005700225, + "learning_rate": 4.3795620437956203e-08, + "loss": 1.4486, + "step": 7 + }, + { + "epoch": 0.01756311745334797, + "grad_norm": 138.3771195817014, + "learning_rate": 5.10948905109489e-08, + "loss": 1.5554, + "step": 8 + }, + { + "epoch": 0.019758507135016465, + "grad_norm": 132.69136904564107, + "learning_rate": 5.83941605839416e-08, + "loss": 1.4041, + "step": 9 + }, + { + "epoch": 0.021953896816684963, + "grad_norm": 135.7212626140043, + "learning_rate": 6.569343065693431e-08, + "loss": 1.2969, + "step": 10 + }, + { + "epoch": 0.024149286498353458, + "grad_norm": 150.3565406204981, + "learning_rate": 7.2992700729927e-08, + "loss": 1.3152, + "step": 11 + }, + { + "epoch": 0.026344676180021953, + "grad_norm": 151.3406121199072, + "learning_rate": 8.029197080291971e-08, + "loss": 1.488, + "step": 12 + }, + { + "epoch": 0.02854006586169045, + "grad_norm": 124.37914565637091, + "learning_rate": 8.759124087591241e-08, + "loss": 1.244, + "step": 13 + }, + { + "epoch": 0.030735455543358946, + "grad_norm": 142.5907569544806, + "learning_rate": 9.48905109489051e-08, + "loss": 1.2863, + "step": 14 + }, + { + "epoch": 0.03293084522502744, + "grad_norm": 122.30421005497061, + "learning_rate": 1.021897810218978e-07, + "loss": 1.3103, + "step": 15 + }, + { + "epoch": 0.03512623490669594, + "grad_norm": 200.79885863608268, + "learning_rate": 1.0948905109489052e-07, + "loss": 1.4004, + "step": 16 + }, + { + "epoch": 0.03732162458836443, + "grad_norm": 167.14991660211373, + "learning_rate": 1.167883211678832e-07, + "loss": 1.2775, + "step": 17 + }, + { + "epoch": 0.03951701427003293, + "grad_norm": 131.28993539446094, + "learning_rate": 1.240875912408759e-07, + "loss": 1.3523, + "step": 18 + }, + { + "epoch": 0.04171240395170143, + "grad_norm": 162.3600505958806, + "learning_rate": 1.3138686131386862e-07, + "loss": 1.3361, + "step": 19 + }, + { + "epoch": 0.043907793633369926, + "grad_norm": 133.12292640519846, + "learning_rate": 1.386861313868613e-07, + "loss": 1.2538, + "step": 20 + }, + { + "epoch": 0.04610318331503842, + "grad_norm": 138.78896167890522, + "learning_rate": 1.45985401459854e-07, + "loss": 1.352, + "step": 21 + }, + { + "epoch": 0.048298572996706916, + "grad_norm": 139.69189042882115, + "learning_rate": 1.532846715328467e-07, + "loss": 1.5154, + "step": 22 + }, + { + "epoch": 0.050493962678375415, + "grad_norm": 137.39489076785355, + "learning_rate": 1.6058394160583942e-07, + "loss": 1.4735, + "step": 23 + }, + { + "epoch": 0.052689352360043906, + "grad_norm": 129.9057654211869, + "learning_rate": 1.678832116788321e-07, + "loss": 1.4751, + "step": 24 + }, + { + "epoch": 0.054884742041712405, + "grad_norm": 142.17880454005854, + "learning_rate": 1.7518248175182481e-07, + "loss": 1.4576, + "step": 25 + }, + { + "epoch": 0.0570801317233809, + "grad_norm": 213.94665712809794, + "learning_rate": 1.824817518248175e-07, + "loss": 1.5012, + "step": 26 + }, + { + "epoch": 0.059275521405049394, + "grad_norm": 150.33832122564692, + "learning_rate": 1.897810218978102e-07, + "loss": 1.4739, + "step": 27 + }, + { + "epoch": 0.06147091108671789, + "grad_norm": 135.59843657422903, + "learning_rate": 1.9708029197080292e-07, + "loss": 1.3503, + "step": 28 + }, + { + "epoch": 0.06366630076838639, + "grad_norm": 140.7138855818933, + "learning_rate": 2.043795620437956e-07, + "loss": 1.2969, + "step": 29 + }, + { + "epoch": 0.06586169045005488, + "grad_norm": 203.85681204065455, + "learning_rate": 2.116788321167883e-07, + "loss": 1.4389, + "step": 30 + }, + { + "epoch": 0.06805708013172337, + "grad_norm": 129.70436787634256, + "learning_rate": 2.1897810218978103e-07, + "loss": 1.352, + "step": 31 + }, + { + "epoch": 0.07025246981339188, + "grad_norm": 170.170655877601, + "learning_rate": 2.2627737226277372e-07, + "loss": 1.4311, + "step": 32 + }, + { + "epoch": 0.07244785949506037, + "grad_norm": 167.7752606625615, + "learning_rate": 2.335766423357664e-07, + "loss": 1.2723, + "step": 33 + }, + { + "epoch": 0.07464324917672886, + "grad_norm": 130.878840340516, + "learning_rate": 2.408759124087591e-07, + "loss": 1.3537, + "step": 34 + }, + { + "epoch": 0.07683863885839737, + "grad_norm": 137.56018328091912, + "learning_rate": 2.481751824817518e-07, + "loss": 1.4013, + "step": 35 + }, + { + "epoch": 0.07903402854006586, + "grad_norm": 110.2306130917618, + "learning_rate": 2.5547445255474454e-07, + "loss": 1.2338, + "step": 36 + }, + { + "epoch": 0.08122941822173436, + "grad_norm": 140.88497788079806, + "learning_rate": 2.6277372262773725e-07, + "loss": 1.2442, + "step": 37 + }, + { + "epoch": 0.08342480790340286, + "grad_norm": 131.1603732007104, + "learning_rate": 2.700729927007299e-07, + "loss": 1.3306, + "step": 38 + }, + { + "epoch": 0.08562019758507135, + "grad_norm": 125.8912594433526, + "learning_rate": 2.773722627737226e-07, + "loss": 1.2538, + "step": 39 + }, + { + "epoch": 0.08781558726673985, + "grad_norm": 173.62590699447912, + "learning_rate": 2.846715328467153e-07, + "loss": 1.282, + "step": 40 + }, + { + "epoch": 0.09001097694840834, + "grad_norm": 140.0348709144397, + "learning_rate": 2.91970802919708e-07, + "loss": 1.2933, + "step": 41 + }, + { + "epoch": 0.09220636663007684, + "grad_norm": 138.34191099990304, + "learning_rate": 2.9927007299270075e-07, + "loss": 1.2246, + "step": 42 + }, + { + "epoch": 0.09440175631174534, + "grad_norm": 133.3567358083278, + "learning_rate": 3.065693430656934e-07, + "loss": 1.2049, + "step": 43 + }, + { + "epoch": 0.09659714599341383, + "grad_norm": 149.86829216030878, + "learning_rate": 3.138686131386861e-07, + "loss": 1.1484, + "step": 44 + }, + { + "epoch": 0.09879253567508232, + "grad_norm": 139.54448207721754, + "learning_rate": 3.2116788321167883e-07, + "loss": 1.1161, + "step": 45 + }, + { + "epoch": 0.10098792535675083, + "grad_norm": 144.73507422182286, + "learning_rate": 3.284671532846715e-07, + "loss": 1.2278, + "step": 46 + }, + { + "epoch": 0.10318331503841932, + "grad_norm": 121.6383629486138, + "learning_rate": 3.357664233576642e-07, + "loss": 1.2288, + "step": 47 + }, + { + "epoch": 0.10537870472008781, + "grad_norm": 98.86387010070742, + "learning_rate": 3.4306569343065697e-07, + "loss": 1.1108, + "step": 48 + }, + { + "epoch": 0.10757409440175632, + "grad_norm": 163.06719307818943, + "learning_rate": 3.5036496350364963e-07, + "loss": 1.1668, + "step": 49 + }, + { + "epoch": 0.10976948408342481, + "grad_norm": 115.37025925475122, + "learning_rate": 3.5766423357664234e-07, + "loss": 1.0684, + "step": 50 + }, + { + "epoch": 0.10976948408342481, + "eval_accuracy": 0.506, + "eval_loss": 1.019887089729309, + "eval_runtime": 62.4289, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 1.009, + "step": 50 + }, + { + "epoch": 0.1119648737650933, + "grad_norm": 115.00520545818907, + "learning_rate": 3.64963503649635e-07, + "loss": 1.1397, + "step": 51 + }, + { + "epoch": 0.1141602634467618, + "grad_norm": 102.41196522989259, + "learning_rate": 3.722627737226277e-07, + "loss": 1.0745, + "step": 52 + }, + { + "epoch": 0.1163556531284303, + "grad_norm": 117.26316852602802, + "learning_rate": 3.795620437956204e-07, + "loss": 1.2356, + "step": 53 + }, + { + "epoch": 0.11855104281009879, + "grad_norm": 120.91380597609361, + "learning_rate": 3.8686131386861313e-07, + "loss": 1.2119, + "step": 54 + }, + { + "epoch": 0.1207464324917673, + "grad_norm": 92.29300818472724, + "learning_rate": 3.9416058394160584e-07, + "loss": 1.1304, + "step": 55 + }, + { + "epoch": 0.12294182217343579, + "grad_norm": 92.82345907233494, + "learning_rate": 4.0145985401459856e-07, + "loss": 1.1531, + "step": 56 + }, + { + "epoch": 0.1251372118551043, + "grad_norm": 85.5717973777339, + "learning_rate": 4.087591240875912e-07, + "loss": 0.9308, + "step": 57 + }, + { + "epoch": 0.12733260153677278, + "grad_norm": 85.43269620489475, + "learning_rate": 4.160583941605839e-07, + "loss": 1.1233, + "step": 58 + }, + { + "epoch": 0.12952799121844127, + "grad_norm": 115.80515321077907, + "learning_rate": 4.233576642335766e-07, + "loss": 1.1797, + "step": 59 + }, + { + "epoch": 0.13172338090010977, + "grad_norm": 84.2918940049183, + "learning_rate": 4.306569343065693e-07, + "loss": 1.092, + "step": 60 + }, + { + "epoch": 0.13391877058177826, + "grad_norm": 99.39269845007587, + "learning_rate": 4.3795620437956206e-07, + "loss": 1.1176, + "step": 61 + }, + { + "epoch": 0.13611416026344675, + "grad_norm": 70.59454501984223, + "learning_rate": 4.452554744525547e-07, + "loss": 1.0813, + "step": 62 + }, + { + "epoch": 0.13830954994511527, + "grad_norm": 109.5923399556689, + "learning_rate": 4.5255474452554743e-07, + "loss": 0.9878, + "step": 63 + }, + { + "epoch": 0.14050493962678376, + "grad_norm": 63.030131831059876, + "learning_rate": 4.5985401459854014e-07, + "loss": 0.9483, + "step": 64 + }, + { + "epoch": 0.14270032930845225, + "grad_norm": 87.02770783815946, + "learning_rate": 4.671532846715328e-07, + "loss": 0.9885, + "step": 65 + }, + { + "epoch": 0.14489571899012074, + "grad_norm": 70.75751318092878, + "learning_rate": 4.744525547445255e-07, + "loss": 0.8505, + "step": 66 + }, + { + "epoch": 0.14709110867178923, + "grad_norm": 84.18711981429041, + "learning_rate": 4.817518248175182e-07, + "loss": 0.9274, + "step": 67 + }, + { + "epoch": 0.14928649835345773, + "grad_norm": 93.24415466346244, + "learning_rate": 4.89051094890511e-07, + "loss": 1.0028, + "step": 68 + }, + { + "epoch": 0.15148188803512624, + "grad_norm": 58.159526228156615, + "learning_rate": 4.963503649635036e-07, + "loss": 0.8635, + "step": 69 + }, + { + "epoch": 0.15367727771679474, + "grad_norm": 69.94129826644978, + "learning_rate": 5.036496350364964e-07, + "loss": 0.9774, + "step": 70 + }, + { + "epoch": 0.15587266739846323, + "grad_norm": 69.03950330444896, + "learning_rate": 5.109489051094891e-07, + "loss": 0.9669, + "step": 71 + }, + { + "epoch": 0.15806805708013172, + "grad_norm": 76.92567375028155, + "learning_rate": 5.182481751824817e-07, + "loss": 0.86, + "step": 72 + }, + { + "epoch": 0.1602634467618002, + "grad_norm": 74.59715692707641, + "learning_rate": 5.255474452554745e-07, + "loss": 0.9128, + "step": 73 + }, + { + "epoch": 0.16245883644346873, + "grad_norm": 82.61633303553909, + "learning_rate": 5.328467153284672e-07, + "loss": 0.9546, + "step": 74 + }, + { + "epoch": 0.16465422612513722, + "grad_norm": 58.685512250843665, + "learning_rate": 5.401459854014598e-07, + "loss": 0.875, + "step": 75 + }, + { + "epoch": 0.1668496158068057, + "grad_norm": 58.60891793684997, + "learning_rate": 5.474452554744526e-07, + "loss": 0.8729, + "step": 76 + }, + { + "epoch": 0.1690450054884742, + "grad_norm": 68.60986432356273, + "learning_rate": 5.547445255474452e-07, + "loss": 0.8208, + "step": 77 + }, + { + "epoch": 0.1712403951701427, + "grad_norm": 58.35034555605016, + "learning_rate": 5.620437956204379e-07, + "loss": 0.903, + "step": 78 + }, + { + "epoch": 0.1734357848518112, + "grad_norm": 98.61406200927433, + "learning_rate": 5.693430656934306e-07, + "loss": 0.9472, + "step": 79 + }, + { + "epoch": 0.1756311745334797, + "grad_norm": 81.08439956625773, + "learning_rate": 5.766423357664233e-07, + "loss": 0.9163, + "step": 80 + }, + { + "epoch": 0.1778265642151482, + "grad_norm": 53.88557720126228, + "learning_rate": 5.83941605839416e-07, + "loss": 0.868, + "step": 81 + }, + { + "epoch": 0.1800219538968167, + "grad_norm": 53.143301158626926, + "learning_rate": 5.912408759124087e-07, + "loss": 0.88, + "step": 82 + }, + { + "epoch": 0.18221734357848518, + "grad_norm": 49.19404393103411, + "learning_rate": 5.985401459854015e-07, + "loss": 0.8291, + "step": 83 + }, + { + "epoch": 0.18441273326015367, + "grad_norm": 49.9632858603646, + "learning_rate": 6.058394160583942e-07, + "loss": 0.8155, + "step": 84 + }, + { + "epoch": 0.18660812294182216, + "grad_norm": 45.62330278251693, + "learning_rate": 6.131386861313868e-07, + "loss": 0.7906, + "step": 85 + }, + { + "epoch": 0.18880351262349068, + "grad_norm": 41.69982248481481, + "learning_rate": 6.204379562043796e-07, + "loss": 0.7877, + "step": 86 + }, + { + "epoch": 0.19099890230515917, + "grad_norm": 36.534122434280455, + "learning_rate": 6.277372262773722e-07, + "loss": 0.7527, + "step": 87 + }, + { + "epoch": 0.19319429198682767, + "grad_norm": 41.51590795972736, + "learning_rate": 6.350364963503649e-07, + "loss": 0.7701, + "step": 88 + }, + { + "epoch": 0.19538968166849616, + "grad_norm": 50.9447076217186, + "learning_rate": 6.423357664233577e-07, + "loss": 0.8, + "step": 89 + }, + { + "epoch": 0.19758507135016465, + "grad_norm": 39.73817349854984, + "learning_rate": 6.496350364963503e-07, + "loss": 0.7236, + "step": 90 + }, + { + "epoch": 0.19978046103183314, + "grad_norm": 38.43809365198889, + "learning_rate": 6.56934306569343e-07, + "loss": 0.6534, + "step": 91 + }, + { + "epoch": 0.20197585071350166, + "grad_norm": 37.357871240275045, + "learning_rate": 6.642335766423358e-07, + "loss": 0.7343, + "step": 92 + }, + { + "epoch": 0.20417124039517015, + "grad_norm": 44.96345470889994, + "learning_rate": 6.715328467153284e-07, + "loss": 0.6828, + "step": 93 + }, + { + "epoch": 0.20636663007683864, + "grad_norm": 37.388274455357625, + "learning_rate": 6.788321167883211e-07, + "loss": 0.6308, + "step": 94 + }, + { + "epoch": 0.20856201975850713, + "grad_norm": 30.201936308225815, + "learning_rate": 6.861313868613139e-07, + "loss": 0.6127, + "step": 95 + }, + { + "epoch": 0.21075740944017562, + "grad_norm": 35.90933604523665, + "learning_rate": 6.934306569343066e-07, + "loss": 0.6911, + "step": 96 + }, + { + "epoch": 0.21295279912184412, + "grad_norm": 29.700092316262612, + "learning_rate": 7.007299270072993e-07, + "loss": 0.6556, + "step": 97 + }, + { + "epoch": 0.21514818880351264, + "grad_norm": 31.842570539615227, + "learning_rate": 7.080291970802919e-07, + "loss": 0.6896, + "step": 98 + }, + { + "epoch": 0.21734357848518113, + "grad_norm": 31.238683970651707, + "learning_rate": 7.153284671532847e-07, + "loss": 0.6506, + "step": 99 + }, + { + "epoch": 0.21953896816684962, + "grad_norm": 35.48873900248551, + "learning_rate": 7.226277372262773e-07, + "loss": 0.7495, + "step": 100 + }, + { + "epoch": 0.21953896816684962, + "eval_accuracy": 0.682, + "eval_loss": 0.562687873840332, + "eval_runtime": 62.4373, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 1.009, + "step": 100 + }, + { + "epoch": 0.2217343578485181, + "grad_norm": 34.153901482286706, + "learning_rate": 7.2992700729927e-07, + "loss": 0.7136, + "step": 101 + }, + { + "epoch": 0.2239297475301866, + "grad_norm": 27.9624148195877, + "learning_rate": 7.372262773722628e-07, + "loss": 0.7178, + "step": 102 + }, + { + "epoch": 0.2261251372118551, + "grad_norm": 22.100202938968764, + "learning_rate": 7.445255474452554e-07, + "loss": 0.6749, + "step": 103 + }, + { + "epoch": 0.2283205268935236, + "grad_norm": 28.18545960285919, + "learning_rate": 7.518248175182481e-07, + "loss": 0.666, + "step": 104 + }, + { + "epoch": 0.2305159165751921, + "grad_norm": 22.264174917923164, + "learning_rate": 7.591240875912408e-07, + "loss": 0.6312, + "step": 105 + }, + { + "epoch": 0.2327113062568606, + "grad_norm": 17.37357275601335, + "learning_rate": 7.664233576642335e-07, + "loss": 0.6065, + "step": 106 + }, + { + "epoch": 0.2349066959385291, + "grad_norm": 19.49426500716658, + "learning_rate": 7.737226277372263e-07, + "loss": 0.6502, + "step": 107 + }, + { + "epoch": 0.23710208562019758, + "grad_norm": 22.022008695087543, + "learning_rate": 7.81021897810219e-07, + "loss": 0.5885, + "step": 108 + }, + { + "epoch": 0.23929747530186607, + "grad_norm": 21.515467941901086, + "learning_rate": 7.883211678832117e-07, + "loss": 0.5583, + "step": 109 + }, + { + "epoch": 0.2414928649835346, + "grad_norm": 25.78728071293923, + "learning_rate": 7.956204379562043e-07, + "loss": 0.6243, + "step": 110 + }, + { + "epoch": 0.24368825466520308, + "grad_norm": 18.18532044027736, + "learning_rate": 8.029197080291971e-07, + "loss": 0.617, + "step": 111 + }, + { + "epoch": 0.24588364434687157, + "grad_norm": 25.226965635992663, + "learning_rate": 8.102189781021898e-07, + "loss": 0.579, + "step": 112 + }, + { + "epoch": 0.24807903402854006, + "grad_norm": 22.512756730293077, + "learning_rate": 8.175182481751824e-07, + "loss": 0.5939, + "step": 113 + }, + { + "epoch": 0.2502744237102086, + "grad_norm": 18.596439780374983, + "learning_rate": 8.248175182481751e-07, + "loss": 0.6323, + "step": 114 + }, + { + "epoch": 0.2524698133918771, + "grad_norm": 17.02105845004187, + "learning_rate": 8.321167883211679e-07, + "loss": 0.5594, + "step": 115 + }, + { + "epoch": 0.25466520307354557, + "grad_norm": 16.250583027072732, + "learning_rate": 8.394160583941605e-07, + "loss": 0.577, + "step": 116 + }, + { + "epoch": 0.25686059275521406, + "grad_norm": 18.5715233786033, + "learning_rate": 8.467153284671532e-07, + "loss": 0.478, + "step": 117 + }, + { + "epoch": 0.25905598243688255, + "grad_norm": 17.614587096288357, + "learning_rate": 8.540145985401459e-07, + "loss": 0.5567, + "step": 118 + }, + { + "epoch": 0.26125137211855104, + "grad_norm": 21.00826003065434, + "learning_rate": 8.613138686131386e-07, + "loss": 0.5687, + "step": 119 + }, + { + "epoch": 0.26344676180021953, + "grad_norm": 15.607731096994556, + "learning_rate": 8.686131386861314e-07, + "loss": 0.5608, + "step": 120 + }, + { + "epoch": 0.265642151481888, + "grad_norm": 15.277425156097365, + "learning_rate": 8.759124087591241e-07, + "loss": 0.5288, + "step": 121 + }, + { + "epoch": 0.2678375411635565, + "grad_norm": 19.651993472199724, + "learning_rate": 8.832116788321168e-07, + "loss": 0.5026, + "step": 122 + }, + { + "epoch": 0.270032930845225, + "grad_norm": 24.891420468239335, + "learning_rate": 8.905109489051094e-07, + "loss": 0.5651, + "step": 123 + }, + { + "epoch": 0.2722283205268935, + "grad_norm": 17.627056937663315, + "learning_rate": 8.978102189781022e-07, + "loss": 0.5665, + "step": 124 + }, + { + "epoch": 0.27442371020856204, + "grad_norm": 16.656331513263602, + "learning_rate": 9.051094890510949e-07, + "loss": 0.4395, + "step": 125 + }, + { + "epoch": 0.27661909989023054, + "grad_norm": 15.567024748783735, + "learning_rate": 9.124087591240875e-07, + "loss": 0.4625, + "step": 126 + }, + { + "epoch": 0.278814489571899, + "grad_norm": 15.002053723893589, + "learning_rate": 9.197080291970803e-07, + "loss": 0.5267, + "step": 127 + }, + { + "epoch": 0.2810098792535675, + "grad_norm": 13.834884625752059, + "learning_rate": 9.270072992700729e-07, + "loss": 0.5017, + "step": 128 + }, + { + "epoch": 0.283205268935236, + "grad_norm": 15.615717349478732, + "learning_rate": 9.343065693430656e-07, + "loss": 0.4742, + "step": 129 + }, + { + "epoch": 0.2854006586169045, + "grad_norm": 13.471813706001054, + "learning_rate": 9.416058394160583e-07, + "loss": 0.512, + "step": 130 + }, + { + "epoch": 0.287596048298573, + "grad_norm": 15.094349701329602, + "learning_rate": 9.48905109489051e-07, + "loss": 0.5327, + "step": 131 + }, + { + "epoch": 0.2897914379802415, + "grad_norm": 13.19342166454616, + "learning_rate": 9.562043795620438e-07, + "loss": 0.47, + "step": 132 + }, + { + "epoch": 0.29198682766191, + "grad_norm": 12.139844355160385, + "learning_rate": 9.635036496350364e-07, + "loss": 0.5429, + "step": 133 + }, + { + "epoch": 0.29418221734357847, + "grad_norm": 13.352063594802008, + "learning_rate": 9.708029197080291e-07, + "loss": 0.4865, + "step": 134 + }, + { + "epoch": 0.29637760702524696, + "grad_norm": 14.225936575772286, + "learning_rate": 9.78102189781022e-07, + "loss": 0.4784, + "step": 135 + }, + { + "epoch": 0.29857299670691545, + "grad_norm": 12.461695683400633, + "learning_rate": 9.854014598540146e-07, + "loss": 0.4518, + "step": 136 + }, + { + "epoch": 0.300768386388584, + "grad_norm": 12.867009884092825, + "learning_rate": 9.927007299270073e-07, + "loss": 0.4426, + "step": 137 + }, + { + "epoch": 0.3029637760702525, + "grad_norm": 20.246601888258336, + "learning_rate": 1e-06, + "loss": 0.5418, + "step": 138 + }, + { + "epoch": 0.305159165751921, + "grad_norm": 19.62917306684865, + "learning_rate": 9.999983717412808e-07, + "loss": 0.5246, + "step": 139 + }, + { + "epoch": 0.30735455543358947, + "grad_norm": 14.70577630004396, + "learning_rate": 9.999934869757278e-07, + "loss": 0.4996, + "step": 140 + }, + { + "epoch": 0.30954994511525796, + "grad_norm": 11.895324662976222, + "learning_rate": 9.999853457351558e-07, + "loss": 0.4846, + "step": 141 + }, + { + "epoch": 0.31174533479692645, + "grad_norm": 11.772224391975989, + "learning_rate": 9.999739480725893e-07, + "loss": 0.4882, + "step": 142 + }, + { + "epoch": 0.31394072447859495, + "grad_norm": 12.342437917454156, + "learning_rate": 9.999592940622613e-07, + "loss": 0.4973, + "step": 143 + }, + { + "epoch": 0.31613611416026344, + "grad_norm": 9.63333593523558, + "learning_rate": 9.999413837996137e-07, + "loss": 0.4657, + "step": 144 + }, + { + "epoch": 0.31833150384193193, + "grad_norm": 12.051946387691274, + "learning_rate": 9.999202174012972e-07, + "loss": 0.4778, + "step": 145 + }, + { + "epoch": 0.3205268935236004, + "grad_norm": 11.959976548769715, + "learning_rate": 9.99895795005169e-07, + "loss": 0.4527, + "step": 146 + }, + { + "epoch": 0.3227222832052689, + "grad_norm": 10.113920097674713, + "learning_rate": 9.99868116770293e-07, + "loss": 0.4078, + "step": 147 + }, + { + "epoch": 0.32491767288693746, + "grad_norm": 12.435033880307502, + "learning_rate": 9.998371828769384e-07, + "loss": 0.4894, + "step": 148 + }, + { + "epoch": 0.32711306256860595, + "grad_norm": 14.939081919754774, + "learning_rate": 9.99802993526579e-07, + "loss": 0.5065, + "step": 149 + }, + { + "epoch": 0.32930845225027444, + "grad_norm": 14.254154244717148, + "learning_rate": 9.997655489418912e-07, + "loss": 0.494, + "step": 150 + }, + { + "epoch": 0.32930845225027444, + "eval_accuracy": 0.786, + "eval_loss": 0.435116708278656, + "eval_runtime": 62.3466, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 1.01, + "step": 150 + }, + { + "epoch": 0.33150384193194293, + "grad_norm": 12.039017936319885, + "learning_rate": 9.997248493667527e-07, + "loss": 0.4839, + "step": 151 + }, + { + "epoch": 0.3336992316136114, + "grad_norm": 11.070313457195804, + "learning_rate": 9.996808950662413e-07, + "loss": 0.4378, + "step": 152 + }, + { + "epoch": 0.3358946212952799, + "grad_norm": 9.336009234528, + "learning_rate": 9.99633686326633e-07, + "loss": 0.4351, + "step": 153 + }, + { + "epoch": 0.3380900109769484, + "grad_norm": 11.55439798512532, + "learning_rate": 9.995832234554e-07, + "loss": 0.4553, + "step": 154 + }, + { + "epoch": 0.3402854006586169, + "grad_norm": 12.154145816173985, + "learning_rate": 9.995295067812083e-07, + "loss": 0.4964, + "step": 155 + }, + { + "epoch": 0.3424807903402854, + "grad_norm": 9.628643083965512, + "learning_rate": 9.99472536653917e-07, + "loss": 0.5015, + "step": 156 + }, + { + "epoch": 0.3446761800219539, + "grad_norm": 11.610322067423159, + "learning_rate": 9.994123134445746e-07, + "loss": 0.4712, + "step": 157 + }, + { + "epoch": 0.3468715697036224, + "grad_norm": 12.74943545259812, + "learning_rate": 9.993488375454165e-07, + "loss": 0.4549, + "step": 158 + }, + { + "epoch": 0.34906695938529086, + "grad_norm": 11.741546873031675, + "learning_rate": 9.992821093698636e-07, + "loss": 0.5129, + "step": 159 + }, + { + "epoch": 0.3512623490669594, + "grad_norm": 10.451389621211415, + "learning_rate": 9.992121293525188e-07, + "loss": 0.4884, + "step": 160 + }, + { + "epoch": 0.3534577387486279, + "grad_norm": 9.48305774210746, + "learning_rate": 9.991388979491646e-07, + "loss": 0.4085, + "step": 161 + }, + { + "epoch": 0.3556531284302964, + "grad_norm": 14.783454444270212, + "learning_rate": 9.990624156367596e-07, + "loss": 0.5289, + "step": 162 + }, + { + "epoch": 0.3578485181119649, + "grad_norm": 8.468180083819155, + "learning_rate": 9.989826829134356e-07, + "loss": 0.449, + "step": 163 + }, + { + "epoch": 0.3600439077936334, + "grad_norm": 11.676809759593661, + "learning_rate": 9.988997002984949e-07, + "loss": 0.5199, + "step": 164 + }, + { + "epoch": 0.36223929747530187, + "grad_norm": 11.110035365580961, + "learning_rate": 9.988134683324058e-07, + "loss": 0.4988, + "step": 165 + }, + { + "epoch": 0.36443468715697036, + "grad_norm": 9.194590897066561, + "learning_rate": 9.987239875768006e-07, + "loss": 0.4447, + "step": 166 + }, + { + "epoch": 0.36663007683863885, + "grad_norm": 10.829285977863856, + "learning_rate": 9.9863125861447e-07, + "loss": 0.4645, + "step": 167 + }, + { + "epoch": 0.36882546652030734, + "grad_norm": 11.15768970639428, + "learning_rate": 9.985352820493614e-07, + "loss": 0.4429, + "step": 168 + }, + { + "epoch": 0.37102085620197583, + "grad_norm": 10.615797787300718, + "learning_rate": 9.984360585065733e-07, + "loss": 0.4236, + "step": 169 + }, + { + "epoch": 0.3732162458836443, + "grad_norm": 11.596646960772174, + "learning_rate": 9.983335886323524e-07, + "loss": 0.4212, + "step": 170 + }, + { + "epoch": 0.3754116355653128, + "grad_norm": 11.60172095267533, + "learning_rate": 9.98227873094088e-07, + "loss": 0.4796, + "step": 171 + }, + { + "epoch": 0.37760702524698136, + "grad_norm": 14.055785993956126, + "learning_rate": 9.981189125803095e-07, + "loss": 0.4841, + "step": 172 + }, + { + "epoch": 0.37980241492864986, + "grad_norm": 9.572819362372918, + "learning_rate": 9.980067078006804e-07, + "loss": 0.3985, + "step": 173 + }, + { + "epoch": 0.38199780461031835, + "grad_norm": 11.59813843253133, + "learning_rate": 9.978912594859946e-07, + "loss": 0.4564, + "step": 174 + }, + { + "epoch": 0.38419319429198684, + "grad_norm": 13.737142414943285, + "learning_rate": 9.977725683881707e-07, + "loss": 0.4522, + "step": 175 + }, + { + "epoch": 0.38638858397365533, + "grad_norm": 9.246882570201656, + "learning_rate": 9.97650635280248e-07, + "loss": 0.4097, + "step": 176 + }, + { + "epoch": 0.3885839736553238, + "grad_norm": 9.136180662179372, + "learning_rate": 9.97525460956381e-07, + "loss": 0.4141, + "step": 177 + }, + { + "epoch": 0.3907793633369923, + "grad_norm": 9.921404583485003, + "learning_rate": 9.973970462318349e-07, + "loss": 0.4357, + "step": 178 + }, + { + "epoch": 0.3929747530186608, + "grad_norm": 11.601008865801447, + "learning_rate": 9.972653919429788e-07, + "loss": 0.4594, + "step": 179 + }, + { + "epoch": 0.3951701427003293, + "grad_norm": 10.36094679422873, + "learning_rate": 9.971304989472817e-07, + "loss": 0.4177, + "step": 180 + }, + { + "epoch": 0.3973655323819978, + "grad_norm": 10.948879112431765, + "learning_rate": 9.969923681233066e-07, + "loss": 0.5064, + "step": 181 + }, + { + "epoch": 0.3995609220636663, + "grad_norm": 11.834429314578246, + "learning_rate": 9.968510003707042e-07, + "loss": 0.4665, + "step": 182 + }, + { + "epoch": 0.40175631174533477, + "grad_norm": 11.392606754798651, + "learning_rate": 9.967063966102079e-07, + "loss": 0.3952, + "step": 183 + }, + { + "epoch": 0.4039517014270033, + "grad_norm": 9.253848276057784, + "learning_rate": 9.965585577836264e-07, + "loss": 0.4712, + "step": 184 + }, + { + "epoch": 0.4061470911086718, + "grad_norm": 9.08494168337149, + "learning_rate": 9.9640748485384e-07, + "loss": 0.4026, + "step": 185 + }, + { + "epoch": 0.4083424807903403, + "grad_norm": 9.664013692503126, + "learning_rate": 9.962531788047913e-07, + "loss": 0.3652, + "step": 186 + }, + { + "epoch": 0.4105378704720088, + "grad_norm": 12.118857418627702, + "learning_rate": 9.960956406414813e-07, + "loss": 0.4189, + "step": 187 + }, + { + "epoch": 0.4127332601536773, + "grad_norm": 12.320330458732224, + "learning_rate": 9.959348713899613e-07, + "loss": 0.4341, + "step": 188 + }, + { + "epoch": 0.4149286498353458, + "grad_norm": 12.403712297845825, + "learning_rate": 9.957708720973273e-07, + "loss": 0.4016, + "step": 189 + }, + { + "epoch": 0.41712403951701427, + "grad_norm": 16.26187132736173, + "learning_rate": 9.956036438317123e-07, + "loss": 0.4644, + "step": 190 + }, + { + "epoch": 0.41931942919868276, + "grad_norm": 12.170054665158572, + "learning_rate": 9.954331876822798e-07, + "loss": 0.4374, + "step": 191 + }, + { + "epoch": 0.42151481888035125, + "grad_norm": 14.02551196287224, + "learning_rate": 9.952595047592167e-07, + "loss": 0.5103, + "step": 192 + }, + { + "epoch": 0.42371020856201974, + "grad_norm": 9.533566927751629, + "learning_rate": 9.950825961937257e-07, + "loss": 0.4551, + "step": 193 + }, + { + "epoch": 0.42590559824368823, + "grad_norm": 12.533660392960899, + "learning_rate": 9.949024631380189e-07, + "loss": 0.4751, + "step": 194 + }, + { + "epoch": 0.4281009879253567, + "grad_norm": 11.451058008246223, + "learning_rate": 9.94719106765309e-07, + "loss": 0.4058, + "step": 195 + }, + { + "epoch": 0.43029637760702527, + "grad_norm": 11.13982911192037, + "learning_rate": 9.945325282698022e-07, + "loss": 0.4395, + "step": 196 + }, + { + "epoch": 0.43249176728869376, + "grad_norm": 9.220791965183867, + "learning_rate": 9.94342728866691e-07, + "loss": 0.4104, + "step": 197 + }, + { + "epoch": 0.43468715697036225, + "grad_norm": 10.323390638102808, + "learning_rate": 9.941497097921456e-07, + "loss": 0.4384, + "step": 198 + }, + { + "epoch": 0.43688254665203075, + "grad_norm": 9.058003354224194, + "learning_rate": 9.939534723033057e-07, + "loss": 0.4268, + "step": 199 + }, + { + "epoch": 0.43907793633369924, + "grad_norm": 11.887165620636456, + "learning_rate": 9.937540176782731e-07, + "loss": 0.4492, + "step": 200 + }, + { + "epoch": 0.43907793633369924, + "eval_accuracy": 0.794, + "eval_loss": 0.4044578969478607, + "eval_runtime": 62.3322, + "eval_samples_per_second": 8.022, + "eval_steps_per_second": 1.011, + "step": 200 + }, + { + "epoch": 0.44127332601536773, + "grad_norm": 12.441278365943637, + "learning_rate": 9.935513472161026e-07, + "loss": 0.4653, + "step": 201 + }, + { + "epoch": 0.4434687156970362, + "grad_norm": 14.201601234233598, + "learning_rate": 9.93345462236794e-07, + "loss": 0.4865, + "step": 202 + }, + { + "epoch": 0.4456641053787047, + "grad_norm": 11.347903109722713, + "learning_rate": 9.931363640812837e-07, + "loss": 0.4097, + "step": 203 + }, + { + "epoch": 0.4478594950603732, + "grad_norm": 11.66358658856107, + "learning_rate": 9.929240541114347e-07, + "loss": 0.4315, + "step": 204 + }, + { + "epoch": 0.4500548847420417, + "grad_norm": 10.618630242743638, + "learning_rate": 9.927085337100298e-07, + "loss": 0.4678, + "step": 205 + }, + { + "epoch": 0.4522502744237102, + "grad_norm": 7.801209288203623, + "learning_rate": 9.924898042807604e-07, + "loss": 0.4143, + "step": 206 + }, + { + "epoch": 0.4544456641053787, + "grad_norm": 7.639533498673639, + "learning_rate": 9.922678672482192e-07, + "loss": 0.4286, + "step": 207 + }, + { + "epoch": 0.4566410537870472, + "grad_norm": 9.76898599627511, + "learning_rate": 9.920427240578898e-07, + "loss": 0.4785, + "step": 208 + }, + { + "epoch": 0.4588364434687157, + "grad_norm": 11.869232979224398, + "learning_rate": 9.918143761761376e-07, + "loss": 0.4192, + "step": 209 + }, + { + "epoch": 0.4610318331503842, + "grad_norm": 7.372984753013456, + "learning_rate": 9.915828250902003e-07, + "loss": 0.3777, + "step": 210 + }, + { + "epoch": 0.4632272228320527, + "grad_norm": 11.121453453807499, + "learning_rate": 9.913480723081782e-07, + "loss": 0.4398, + "step": 211 + }, + { + "epoch": 0.4654226125137212, + "grad_norm": 16.17068331984467, + "learning_rate": 9.911101193590243e-07, + "loss": 0.4948, + "step": 212 + }, + { + "epoch": 0.4676180021953897, + "grad_norm": 7.731820210763722, + "learning_rate": 9.908689677925347e-07, + "loss": 0.3828, + "step": 213 + }, + { + "epoch": 0.4698133918770582, + "grad_norm": 6.955625100083833, + "learning_rate": 9.906246191793378e-07, + "loss": 0.3839, + "step": 214 + }, + { + "epoch": 0.47200878155872666, + "grad_norm": 8.102074000516577, + "learning_rate": 9.903770751108845e-07, + "loss": 0.4021, + "step": 215 + }, + { + "epoch": 0.47420417124039516, + "grad_norm": 9.020918504573277, + "learning_rate": 9.901263371994381e-07, + "loss": 0.4212, + "step": 216 + }, + { + "epoch": 0.47639956092206365, + "grad_norm": 9.885678723860156, + "learning_rate": 9.898724070780636e-07, + "loss": 0.3961, + "step": 217 + }, + { + "epoch": 0.47859495060373214, + "grad_norm": 8.301941197636124, + "learning_rate": 9.896152864006163e-07, + "loss": 0.4615, + "step": 218 + }, + { + "epoch": 0.4807903402854007, + "grad_norm": 6.8025751469318845, + "learning_rate": 9.893549768417324e-07, + "loss": 0.4108, + "step": 219 + }, + { + "epoch": 0.4829857299670692, + "grad_norm": 7.269628718399149, + "learning_rate": 9.89091480096817e-07, + "loss": 0.4103, + "step": 220 + }, + { + "epoch": 0.48518111964873767, + "grad_norm": 7.581903789475218, + "learning_rate": 9.888247978820336e-07, + "loss": 0.4316, + "step": 221 + }, + { + "epoch": 0.48737650933040616, + "grad_norm": 5.990696348928147, + "learning_rate": 9.88554931934293e-07, + "loss": 0.3884, + "step": 222 + }, + { + "epoch": 0.48957189901207465, + "grad_norm": 9.189550396992603, + "learning_rate": 9.882818840112412e-07, + "loss": 0.4433, + "step": 223 + }, + { + "epoch": 0.49176728869374314, + "grad_norm": 7.254234068635307, + "learning_rate": 9.88005655891249e-07, + "loss": 0.3745, + "step": 224 + }, + { + "epoch": 0.49396267837541163, + "grad_norm": 7.202880981381821, + "learning_rate": 9.877262493734e-07, + "loss": 0.3223, + "step": 225 + }, + { + "epoch": 0.4961580680570801, + "grad_norm": 7.173182667515704, + "learning_rate": 9.874436662774781e-07, + "loss": 0.403, + "step": 226 + }, + { + "epoch": 0.4983534577387486, + "grad_norm": 8.612010220051477, + "learning_rate": 9.871579084439573e-07, + "loss": 0.4335, + "step": 227 + }, + { + "epoch": 0.5005488474204172, + "grad_norm": 9.74935576526992, + "learning_rate": 9.868689777339882e-07, + "loss": 0.3635, + "step": 228 + }, + { + "epoch": 0.5027442371020856, + "grad_norm": 8.08908514021729, + "learning_rate": 9.865768760293865e-07, + "loss": 0.3858, + "step": 229 + }, + { + "epoch": 0.5049396267837541, + "grad_norm": 7.235671126052564, + "learning_rate": 9.862816052326207e-07, + "loss": 0.3883, + "step": 230 + }, + { + "epoch": 0.5071350164654226, + "grad_norm": 7.987873641702222, + "learning_rate": 9.859831672668001e-07, + "loss": 0.4294, + "step": 231 + }, + { + "epoch": 0.5093304061470911, + "grad_norm": 8.372151998510336, + "learning_rate": 9.856815640756614e-07, + "loss": 0.3888, + "step": 232 + }, + { + "epoch": 0.5115257958287596, + "grad_norm": 6.161281036256101, + "learning_rate": 9.85376797623557e-07, + "loss": 0.4193, + "step": 233 + }, + { + "epoch": 0.5137211855104281, + "grad_norm": 9.631004836143994, + "learning_rate": 9.850688698954408e-07, + "loss": 0.4649, + "step": 234 + }, + { + "epoch": 0.5159165751920965, + "grad_norm": 6.271647506242595, + "learning_rate": 9.847577828968574e-07, + "loss": 0.3697, + "step": 235 + }, + { + "epoch": 0.5181119648737651, + "grad_norm": 8.270621201274638, + "learning_rate": 9.84443538653927e-07, + "loss": 0.4409, + "step": 236 + }, + { + "epoch": 0.5203073545554336, + "grad_norm": 6.618976245336643, + "learning_rate": 9.841261392133334e-07, + "loss": 0.4548, + "step": 237 + }, + { + "epoch": 0.5225027442371021, + "grad_norm": 5.99945752806192, + "learning_rate": 9.838055866423101e-07, + "loss": 0.3965, + "step": 238 + }, + { + "epoch": 0.5246981339187706, + "grad_norm": 6.170267518062791, + "learning_rate": 9.834818830286274e-07, + "loss": 0.38, + "step": 239 + }, + { + "epoch": 0.5268935236004391, + "grad_norm": 6.8738919806439025, + "learning_rate": 9.83155030480578e-07, + "loss": 0.4275, + "step": 240 + }, + { + "epoch": 0.5290889132821076, + "grad_norm": 6.858809296180418, + "learning_rate": 9.82825031126964e-07, + "loss": 0.4014, + "step": 241 + }, + { + "epoch": 0.531284302963776, + "grad_norm": 8.082830935574739, + "learning_rate": 9.82491887117083e-07, + "loss": 0.3369, + "step": 242 + }, + { + "epoch": 0.5334796926454446, + "grad_norm": 8.12372840208911, + "learning_rate": 9.821556006207131e-07, + "loss": 0.445, + "step": 243 + }, + { + "epoch": 0.535675082327113, + "grad_norm": 7.289328702001413, + "learning_rate": 9.818161738281003e-07, + "loss": 0.3874, + "step": 244 + }, + { + "epoch": 0.5378704720087816, + "grad_norm": 7.954325034186702, + "learning_rate": 9.81473608949943e-07, + "loss": 0.3915, + "step": 245 + }, + { + "epoch": 0.54006586169045, + "grad_norm": 9.55420544614635, + "learning_rate": 9.811279082173783e-07, + "loss": 0.4023, + "step": 246 + }, + { + "epoch": 0.5422612513721186, + "grad_norm": 9.209132221937256, + "learning_rate": 9.80779073881967e-07, + "loss": 0.4711, + "step": 247 + }, + { + "epoch": 0.544456641053787, + "grad_norm": 10.926492351335712, + "learning_rate": 9.804271082156792e-07, + "loss": 0.4764, + "step": 248 + }, + { + "epoch": 0.5466520307354555, + "grad_norm": 10.270432343850993, + "learning_rate": 9.800720135108798e-07, + "loss": 0.4473, + "step": 249 + }, + { + "epoch": 0.5488474204171241, + "grad_norm": 6.965551874562504, + "learning_rate": 9.79713792080313e-07, + "loss": 0.358, + "step": 250 + }, + { + "epoch": 0.5488474204171241, + "eval_accuracy": 0.786, + "eval_loss": 0.3991122543811798, + "eval_runtime": 62.3075, + "eval_samples_per_second": 8.025, + "eval_steps_per_second": 1.011, + "step": 250 + }, + { + "epoch": 0.5510428100987925, + "grad_norm": 5.5648923276243645, + "learning_rate": 9.793524462570874e-07, + "loss": 0.4018, + "step": 251 + }, + { + "epoch": 0.5532381997804611, + "grad_norm": 6.449792094094442, + "learning_rate": 9.78987978394661e-07, + "loss": 0.4422, + "step": 252 + }, + { + "epoch": 0.5554335894621295, + "grad_norm": 8.297585054184967, + "learning_rate": 9.786203908668255e-07, + "loss": 0.4425, + "step": 253 + }, + { + "epoch": 0.557628979143798, + "grad_norm": 6.0399607888851605, + "learning_rate": 9.78249686067691e-07, + "loss": 0.3961, + "step": 254 + }, + { + "epoch": 0.5598243688254665, + "grad_norm": 5.915525652384803, + "learning_rate": 9.778758664116717e-07, + "loss": 0.4248, + "step": 255 + }, + { + "epoch": 0.562019758507135, + "grad_norm": 7.340368964125289, + "learning_rate": 9.774989343334675e-07, + "loss": 0.4029, + "step": 256 + }, + { + "epoch": 0.5642151481888035, + "grad_norm": 6.723469259580046, + "learning_rate": 9.771188922880501e-07, + "loss": 0.4067, + "step": 257 + }, + { + "epoch": 0.566410537870472, + "grad_norm": 7.335423961287345, + "learning_rate": 9.76735742750647e-07, + "loss": 0.392, + "step": 258 + }, + { + "epoch": 0.5686059275521405, + "grad_norm": 6.368966519220958, + "learning_rate": 9.763494882167238e-07, + "loss": 0.4085, + "step": 259 + }, + { + "epoch": 0.570801317233809, + "grad_norm": 7.236440549041838, + "learning_rate": 9.759601312019705e-07, + "loss": 0.4253, + "step": 260 + }, + { + "epoch": 0.5729967069154775, + "grad_norm": 7.858966758509941, + "learning_rate": 9.755676742422824e-07, + "loss": 0.3497, + "step": 261 + }, + { + "epoch": 0.575192096597146, + "grad_norm": 6.220466022528224, + "learning_rate": 9.751721198937457e-07, + "loss": 0.3581, + "step": 262 + }, + { + "epoch": 0.5773874862788145, + "grad_norm": 7.112804748553098, + "learning_rate": 9.747734707326194e-07, + "loss": 0.3295, + "step": 263 + }, + { + "epoch": 0.579582875960483, + "grad_norm": 6.352762083606537, + "learning_rate": 9.743717293553197e-07, + "loss": 0.3378, + "step": 264 + }, + { + "epoch": 0.5817782656421515, + "grad_norm": 6.836533783139976, + "learning_rate": 9.73966898378402e-07, + "loss": 0.3405, + "step": 265 + }, + { + "epoch": 0.58397365532382, + "grad_norm": 12.555084006443275, + "learning_rate": 9.735589804385445e-07, + "loss": 0.4567, + "step": 266 + }, + { + "epoch": 0.5861690450054885, + "grad_norm": 7.870737080940946, + "learning_rate": 9.731479781925308e-07, + "loss": 0.344, + "step": 267 + }, + { + "epoch": 0.5883644346871569, + "grad_norm": 6.307480660613861, + "learning_rate": 9.727338943172335e-07, + "loss": 0.3541, + "step": 268 + }, + { + "epoch": 0.5905598243688255, + "grad_norm": 7.226896694611319, + "learning_rate": 9.723167315095947e-07, + "loss": 0.384, + "step": 269 + }, + { + "epoch": 0.5927552140504939, + "grad_norm": 11.54903734204424, + "learning_rate": 9.718964924866108e-07, + "loss": 0.5079, + "step": 270 + }, + { + "epoch": 0.5949506037321625, + "grad_norm": 6.907145915491046, + "learning_rate": 9.71473179985313e-07, + "loss": 0.4009, + "step": 271 + }, + { + "epoch": 0.5971459934138309, + "grad_norm": 7.0396191618231425, + "learning_rate": 9.710467967627502e-07, + "loss": 0.466, + "step": 272 + }, + { + "epoch": 0.5993413830954994, + "grad_norm": 6.766494453919865, + "learning_rate": 9.706173455959713e-07, + "loss": 0.3855, + "step": 273 + }, + { + "epoch": 0.601536772777168, + "grad_norm": 6.281456670193716, + "learning_rate": 9.701848292820069e-07, + "loss": 0.396, + "step": 274 + }, + { + "epoch": 0.6037321624588364, + "grad_norm": 8.289418010510577, + "learning_rate": 9.697492506378507e-07, + "loss": 0.4686, + "step": 275 + }, + { + "epoch": 0.605927552140505, + "grad_norm": 5.89839143614937, + "learning_rate": 9.693106125004416e-07, + "loss": 0.3974, + "step": 276 + }, + { + "epoch": 0.6081229418221734, + "grad_norm": 6.733592968899608, + "learning_rate": 9.688689177266452e-07, + "loss": 0.4243, + "step": 277 + }, + { + "epoch": 0.610318331503842, + "grad_norm": 6.051915856327664, + "learning_rate": 9.684241691932347e-07, + "loss": 0.4216, + "step": 278 + }, + { + "epoch": 0.6125137211855104, + "grad_norm": 7.552117598173567, + "learning_rate": 9.679763697968732e-07, + "loss": 0.4042, + "step": 279 + }, + { + "epoch": 0.6147091108671789, + "grad_norm": 5.316578030214541, + "learning_rate": 9.675255224540934e-07, + "loss": 0.3462, + "step": 280 + }, + { + "epoch": 0.6169045005488474, + "grad_norm": 8.421241423628498, + "learning_rate": 9.6707163010128e-07, + "loss": 0.4272, + "step": 281 + }, + { + "epoch": 0.6190998902305159, + "grad_norm": 8.77535805726173, + "learning_rate": 9.666146956946496e-07, + "loss": 0.4288, + "step": 282 + }, + { + "epoch": 0.6212952799121844, + "grad_norm": 6.601900738237074, + "learning_rate": 9.661547222102321e-07, + "loss": 0.3846, + "step": 283 + }, + { + "epoch": 0.6234906695938529, + "grad_norm": 7.4390146186640465, + "learning_rate": 9.656917126438508e-07, + "loss": 0.4568, + "step": 284 + }, + { + "epoch": 0.6256860592755215, + "grad_norm": 5.772511915683897, + "learning_rate": 9.65225670011103e-07, + "loss": 0.3924, + "step": 285 + }, + { + "epoch": 0.6278814489571899, + "grad_norm": 6.955674433545656, + "learning_rate": 9.647565973473407e-07, + "loss": 0.4033, + "step": 286 + }, + { + "epoch": 0.6300768386388584, + "grad_norm": 8.177359289353676, + "learning_rate": 9.642844977076507e-07, + "loss": 0.4666, + "step": 287 + }, + { + "epoch": 0.6322722283205269, + "grad_norm": 6.069507174731214, + "learning_rate": 9.63809374166834e-07, + "loss": 0.4373, + "step": 288 + }, + { + "epoch": 0.6344676180021954, + "grad_norm": 7.849034116126457, + "learning_rate": 9.633312298193871e-07, + "loss": 0.447, + "step": 289 + }, + { + "epoch": 0.6366630076838639, + "grad_norm": 6.926773533221862, + "learning_rate": 9.62850067779481e-07, + "loss": 0.4244, + "step": 290 + }, + { + "epoch": 0.6388583973655324, + "grad_norm": 7.158445163028628, + "learning_rate": 9.623658911809404e-07, + "loss": 0.3881, + "step": 291 + }, + { + "epoch": 0.6410537870472008, + "grad_norm": 5.7467162895599415, + "learning_rate": 9.618787031772245e-07, + "loss": 0.3568, + "step": 292 + }, + { + "epoch": 0.6432491767288694, + "grad_norm": 6.928058648989376, + "learning_rate": 9.61388506941406e-07, + "loss": 0.4281, + "step": 293 + }, + { + "epoch": 0.6454445664105378, + "grad_norm": 6.593161246770192, + "learning_rate": 9.6089530566615e-07, + "loss": 0.359, + "step": 294 + }, + { + "epoch": 0.6476399560922064, + "grad_norm": 7.614859979484859, + "learning_rate": 9.603991025636933e-07, + "loss": 0.3895, + "step": 295 + }, + { + "epoch": 0.6498353457738749, + "grad_norm": 9.246559759408955, + "learning_rate": 9.598999008658241e-07, + "loss": 0.3959, + "step": 296 + }, + { + "epoch": 0.6520307354555434, + "grad_norm": 10.624131439351855, + "learning_rate": 9.59397703823861e-07, + "loss": 0.4265, + "step": 297 + }, + { + "epoch": 0.6542261251372119, + "grad_norm": 7.146670140793182, + "learning_rate": 9.588925147086303e-07, + "loss": 0.3586, + "step": 298 + }, + { + "epoch": 0.6564215148188803, + "grad_norm": 5.708868849662112, + "learning_rate": 9.583843368104464e-07, + "loss": 0.3444, + "step": 299 + }, + { + "epoch": 0.6586169045005489, + "grad_norm": 6.538247134761959, + "learning_rate": 9.578731734390898e-07, + "loss": 0.3721, + "step": 300 + }, + { + "epoch": 0.6586169045005489, + "eval_accuracy": 0.8, + "eval_loss": 0.3827642798423767, + "eval_runtime": 62.403, + "eval_samples_per_second": 8.012, + "eval_steps_per_second": 1.01, + "step": 300 + }, + { + "epoch": 0.6608122941822173, + "grad_norm": 5.4614975487617174, + "learning_rate": 9.573590279237854e-07, + "loss": 0.3739, + "step": 301 + }, + { + "epoch": 0.6630076838638859, + "grad_norm": 5.616403801157149, + "learning_rate": 9.568419036131807e-07, + "loss": 0.3802, + "step": 302 + }, + { + "epoch": 0.6652030735455543, + "grad_norm": 6.54627994415682, + "learning_rate": 9.563218038753245e-07, + "loss": 0.369, + "step": 303 + }, + { + "epoch": 0.6673984632272228, + "grad_norm": 7.562917302782545, + "learning_rate": 9.557987320976446e-07, + "loss": 0.4048, + "step": 304 + }, + { + "epoch": 0.6695938529088913, + "grad_norm": 5.723183604217507, + "learning_rate": 9.552726916869254e-07, + "loss": 0.3875, + "step": 305 + }, + { + "epoch": 0.6717892425905598, + "grad_norm": 5.431663961962029, + "learning_rate": 9.547436860692869e-07, + "loss": 0.4294, + "step": 306 + }, + { + "epoch": 0.6739846322722283, + "grad_norm": 5.515569016162807, + "learning_rate": 9.542117186901608e-07, + "loss": 0.3534, + "step": 307 + }, + { + "epoch": 0.6761800219538968, + "grad_norm": 5.225609276477867, + "learning_rate": 9.536767930142692e-07, + "loss": 0.3497, + "step": 308 + }, + { + "epoch": 0.6783754116355654, + "grad_norm": 5.982849222319877, + "learning_rate": 9.53138912525602e-07, + "loss": 0.3831, + "step": 309 + }, + { + "epoch": 0.6805708013172338, + "grad_norm": 9.316032754601094, + "learning_rate": 9.525980807273933e-07, + "loss": 0.4859, + "step": 310 + }, + { + "epoch": 0.6827661909989023, + "grad_norm": 7.62002827563512, + "learning_rate": 9.520543011420994e-07, + "loss": 0.4667, + "step": 311 + }, + { + "epoch": 0.6849615806805708, + "grad_norm": 7.476256847848758, + "learning_rate": 9.515075773113758e-07, + "loss": 0.3561, + "step": 312 + }, + { + "epoch": 0.6871569703622393, + "grad_norm": 6.0952453407784395, + "learning_rate": 9.509579127960541e-07, + "loss": 0.3759, + "step": 313 + }, + { + "epoch": 0.6893523600439078, + "grad_norm": 5.264194015456075, + "learning_rate": 9.504053111761183e-07, + "loss": 0.3553, + "step": 314 + }, + { + "epoch": 0.6915477497255763, + "grad_norm": 5.722501060128661, + "learning_rate": 9.498497760506819e-07, + "loss": 0.41, + "step": 315 + }, + { + "epoch": 0.6937431394072447, + "grad_norm": 5.385616225560054, + "learning_rate": 9.492913110379647e-07, + "loss": 0.3811, + "step": 316 + }, + { + "epoch": 0.6959385290889133, + "grad_norm": 6.8590494144069565, + "learning_rate": 9.487299197752687e-07, + "loss": 0.4238, + "step": 317 + }, + { + "epoch": 0.6981339187705817, + "grad_norm": 8.374420284758637, + "learning_rate": 9.481656059189549e-07, + "loss": 0.4383, + "step": 318 + }, + { + "epoch": 0.7003293084522503, + "grad_norm": 6.74093160128879, + "learning_rate": 9.475983731444191e-07, + "loss": 0.4053, + "step": 319 + }, + { + "epoch": 0.7025246981339188, + "grad_norm": 6.3600960708427445, + "learning_rate": 9.47028225146068e-07, + "loss": 0.4135, + "step": 320 + }, + { + "epoch": 0.7047200878155873, + "grad_norm": 5.118236300502877, + "learning_rate": 9.464551656372955e-07, + "loss": 0.3437, + "step": 321 + }, + { + "epoch": 0.7069154774972558, + "grad_norm": 6.0512731733617775, + "learning_rate": 9.458791983504581e-07, + "loss": 0.3719, + "step": 322 + }, + { + "epoch": 0.7091108671789242, + "grad_norm": 4.689013131515306, + "learning_rate": 9.453003270368509e-07, + "loss": 0.4211, + "step": 323 + }, + { + "epoch": 0.7113062568605928, + "grad_norm": 5.847735855655551, + "learning_rate": 9.44718555466683e-07, + "loss": 0.4383, + "step": 324 + }, + { + "epoch": 0.7135016465422612, + "grad_norm": 5.665362800637276, + "learning_rate": 9.44133887429053e-07, + "loss": 0.3934, + "step": 325 + }, + { + "epoch": 0.7156970362239298, + "grad_norm": 5.874604417597781, + "learning_rate": 9.435463267319239e-07, + "loss": 0.3583, + "step": 326 + }, + { + "epoch": 0.7178924259055982, + "grad_norm": 5.159270630284724, + "learning_rate": 9.429558772020992e-07, + "loss": 0.4252, + "step": 327 + }, + { + "epoch": 0.7200878155872668, + "grad_norm": 5.706691193740145, + "learning_rate": 9.423625426851973e-07, + "loss": 0.3727, + "step": 328 + }, + { + "epoch": 0.7222832052689352, + "grad_norm": 5.633325508736258, + "learning_rate": 9.417663270456267e-07, + "loss": 0.3591, + "step": 329 + }, + { + "epoch": 0.7244785949506037, + "grad_norm": 6.114919710777612, + "learning_rate": 9.411672341665604e-07, + "loss": 0.3658, + "step": 330 + }, + { + "epoch": 0.7266739846322722, + "grad_norm": 7.068818492445254, + "learning_rate": 9.405652679499115e-07, + "loss": 0.3954, + "step": 331 + }, + { + "epoch": 0.7288693743139407, + "grad_norm": 6.02224410411034, + "learning_rate": 9.399604323163068e-07, + "loss": 0.346, + "step": 332 + }, + { + "epoch": 0.7310647639956093, + "grad_norm": 8.959081519304984, + "learning_rate": 9.393527312050617e-07, + "loss": 0.4459, + "step": 333 + }, + { + "epoch": 0.7332601536772777, + "grad_norm": 7.542390397020765, + "learning_rate": 9.387421685741552e-07, + "loss": 0.4022, + "step": 334 + }, + { + "epoch": 0.7354555433589463, + "grad_norm": 6.8197296180264555, + "learning_rate": 9.381287484002027e-07, + "loss": 0.3836, + "step": 335 + }, + { + "epoch": 0.7376509330406147, + "grad_norm": 5.343272686170338, + "learning_rate": 9.375124746784311e-07, + "loss": 0.3561, + "step": 336 + }, + { + "epoch": 0.7398463227222832, + "grad_norm": 6.461455373048154, + "learning_rate": 9.368933514226529e-07, + "loss": 0.4096, + "step": 337 + }, + { + "epoch": 0.7420417124039517, + "grad_norm": 4.854050051488241, + "learning_rate": 9.362713826652392e-07, + "loss": 0.3814, + "step": 338 + }, + { + "epoch": 0.7442371020856202, + "grad_norm": 5.286657056414475, + "learning_rate": 9.356465724570943e-07, + "loss": 0.3989, + "step": 339 + }, + { + "epoch": 0.7464324917672887, + "grad_norm": 5.696140592098042, + "learning_rate": 9.350189248676292e-07, + "loss": 0.4114, + "step": 340 + }, + { + "epoch": 0.7486278814489572, + "grad_norm": 5.963578851614639, + "learning_rate": 9.34388443984734e-07, + "loss": 0.4103, + "step": 341 + }, + { + "epoch": 0.7508232711306256, + "grad_norm": 5.167078850111144, + "learning_rate": 9.33755133914753e-07, + "loss": 0.3987, + "step": 342 + }, + { + "epoch": 0.7530186608122942, + "grad_norm": 5.365795093462018, + "learning_rate": 9.331189987824568e-07, + "loss": 0.414, + "step": 343 + }, + { + "epoch": 0.7552140504939627, + "grad_norm": 5.155962282558951, + "learning_rate": 9.324800427310155e-07, + "loss": 0.4107, + "step": 344 + }, + { + "epoch": 0.7574094401756312, + "grad_norm": 4.685610227798769, + "learning_rate": 9.318382699219722e-07, + "loss": 0.383, + "step": 345 + }, + { + "epoch": 0.7596048298572997, + "grad_norm": 5.451910902227303, + "learning_rate": 9.311936845352157e-07, + "loss": 0.3701, + "step": 346 + }, + { + "epoch": 0.7618002195389681, + "grad_norm": 4.878532494519716, + "learning_rate": 9.305462907689532e-07, + "loss": 0.3889, + "step": 347 + }, + { + "epoch": 0.7639956092206367, + "grad_norm": 6.488604304889017, + "learning_rate": 9.298960928396826e-07, + "loss": 0.3769, + "step": 348 + }, + { + "epoch": 0.7661909989023051, + "grad_norm": 7.230743847370782, + "learning_rate": 9.292430949821659e-07, + "loss": 0.3918, + "step": 349 + }, + { + "epoch": 0.7683863885839737, + "grad_norm": 8.84856037173542, + "learning_rate": 9.285873014494008e-07, + "loss": 0.4351, + "step": 350 + }, + { + "epoch": 0.7683863885839737, + "eval_accuracy": 0.818, + "eval_loss": 0.36839187145233154, + "eval_runtime": 62.4628, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 1.009, + "step": 350 + }, + { + "epoch": 0.7705817782656421, + "grad_norm": 5.617691309816291, + "learning_rate": 9.279287165125936e-07, + "loss": 0.3329, + "step": 351 + }, + { + "epoch": 0.7727771679473107, + "grad_norm": 10.749385714028035, + "learning_rate": 9.272673444611308e-07, + "loss": 0.4227, + "step": 352 + }, + { + "epoch": 0.7749725576289791, + "grad_norm": 6.679314682583184, + "learning_rate": 9.266031896025516e-07, + "loss": 0.3981, + "step": 353 + }, + { + "epoch": 0.7771679473106476, + "grad_norm": 7.906535171339716, + "learning_rate": 9.259362562625199e-07, + "loss": 0.3457, + "step": 354 + }, + { + "epoch": 0.7793633369923162, + "grad_norm": 6.476745284566722, + "learning_rate": 9.252665487847957e-07, + "loss": 0.351, + "step": 355 + }, + { + "epoch": 0.7815587266739846, + "grad_norm": 7.490252907466211, + "learning_rate": 9.245940715312074e-07, + "loss": 0.3614, + "step": 356 + }, + { + "epoch": 0.7837541163556532, + "grad_norm": 10.294959027911519, + "learning_rate": 9.239188288816226e-07, + "loss": 0.4772, + "step": 357 + }, + { + "epoch": 0.7859495060373216, + "grad_norm": 6.749458938354777, + "learning_rate": 9.232408252339201e-07, + "loss": 0.4137, + "step": 358 + }, + { + "epoch": 0.7881448957189902, + "grad_norm": 5.601238558514638, + "learning_rate": 9.225600650039615e-07, + "loss": 0.4017, + "step": 359 + }, + { + "epoch": 0.7903402854006586, + "grad_norm": 5.914207463946434, + "learning_rate": 9.218765526255619e-07, + "loss": 0.3949, + "step": 360 + }, + { + "epoch": 0.7925356750823271, + "grad_norm": 5.60976526230607, + "learning_rate": 9.211902925504613e-07, + "loss": 0.3947, + "step": 361 + }, + { + "epoch": 0.7947310647639956, + "grad_norm": 6.905598884857724, + "learning_rate": 9.205012892482952e-07, + "loss": 0.3593, + "step": 362 + }, + { + "epoch": 0.7969264544456641, + "grad_norm": 5.708597195424006, + "learning_rate": 9.198095472065667e-07, + "loss": 0.3845, + "step": 363 + }, + { + "epoch": 0.7991218441273326, + "grad_norm": 6.3826311537728415, + "learning_rate": 9.191150709306155e-07, + "loss": 0.426, + "step": 364 + }, + { + "epoch": 0.8013172338090011, + "grad_norm": 5.704176371314256, + "learning_rate": 9.184178649435896e-07, + "loss": 0.3623, + "step": 365 + }, + { + "epoch": 0.8035126234906695, + "grad_norm": 6.538927998752732, + "learning_rate": 9.177179337864163e-07, + "loss": 0.4081, + "step": 366 + }, + { + "epoch": 0.8057080131723381, + "grad_norm": 5.417066774084007, + "learning_rate": 9.170152820177714e-07, + "loss": 0.326, + "step": 367 + }, + { + "epoch": 0.8079034028540066, + "grad_norm": 5.7045823769473145, + "learning_rate": 9.163099142140505e-07, + "loss": 0.3464, + "step": 368 + }, + { + "epoch": 0.8100987925356751, + "grad_norm": 6.709508312095118, + "learning_rate": 9.156018349693386e-07, + "loss": 0.3343, + "step": 369 + }, + { + "epoch": 0.8122941822173436, + "grad_norm": 8.175912377684108, + "learning_rate": 9.148910488953807e-07, + "loss": 0.4461, + "step": 370 + }, + { + "epoch": 0.814489571899012, + "grad_norm": 6.172712289986408, + "learning_rate": 9.141775606215512e-07, + "loss": 0.3778, + "step": 371 + }, + { + "epoch": 0.8166849615806806, + "grad_norm": 9.110817197454034, + "learning_rate": 9.134613747948238e-07, + "loss": 0.4277, + "step": 372 + }, + { + "epoch": 0.818880351262349, + "grad_norm": 7.730875175896815, + "learning_rate": 9.127424960797423e-07, + "loss": 0.3732, + "step": 373 + }, + { + "epoch": 0.8210757409440176, + "grad_norm": 6.560594986117142, + "learning_rate": 9.120209291583885e-07, + "loss": 0.413, + "step": 374 + }, + { + "epoch": 0.823271130625686, + "grad_norm": 7.2498236273803, + "learning_rate": 9.11296678730353e-07, + "loss": 0.4097, + "step": 375 + }, + { + "epoch": 0.8254665203073546, + "grad_norm": 5.069648552101249, + "learning_rate": 9.10569749512704e-07, + "loss": 0.415, + "step": 376 + }, + { + "epoch": 0.827661909989023, + "grad_norm": 6.5382746464808355, + "learning_rate": 9.098401462399572e-07, + "loss": 0.3679, + "step": 377 + }, + { + "epoch": 0.8298572996706916, + "grad_norm": 7.063308742163228, + "learning_rate": 9.091078736640438e-07, + "loss": 0.3834, + "step": 378 + }, + { + "epoch": 0.8320526893523601, + "grad_norm": 5.3524000614593445, + "learning_rate": 9.083729365542807e-07, + "loss": 0.4322, + "step": 379 + }, + { + "epoch": 0.8342480790340285, + "grad_norm": 5.251326032637674, + "learning_rate": 9.076353396973391e-07, + "loss": 0.3737, + "step": 380 + }, + { + "epoch": 0.8364434687156971, + "grad_norm": 5.606890380106128, + "learning_rate": 9.068950878972128e-07, + "loss": 0.3916, + "step": 381 + }, + { + "epoch": 0.8386388583973655, + "grad_norm": 5.608911929639908, + "learning_rate": 9.06152185975188e-07, + "loss": 0.3895, + "step": 382 + }, + { + "epoch": 0.8408342480790341, + "grad_norm": 5.326991715289215, + "learning_rate": 9.054066387698103e-07, + "loss": 0.3505, + "step": 383 + }, + { + "epoch": 0.8430296377607025, + "grad_norm": 5.002446774906431, + "learning_rate": 9.04658451136855e-07, + "loss": 0.3567, + "step": 384 + }, + { + "epoch": 0.845225027442371, + "grad_norm": 6.2353029098708355, + "learning_rate": 9.039076279492938e-07, + "loss": 0.3654, + "step": 385 + }, + { + "epoch": 0.8474204171240395, + "grad_norm": 5.383798331488236, + "learning_rate": 9.03154174097265e-07, + "loss": 0.3528, + "step": 386 + }, + { + "epoch": 0.849615806805708, + "grad_norm": 7.522503969731689, + "learning_rate": 9.023980944880395e-07, + "loss": 0.3802, + "step": 387 + }, + { + "epoch": 0.8518111964873765, + "grad_norm": 5.624816423254138, + "learning_rate": 9.016393940459901e-07, + "loss": 0.3226, + "step": 388 + }, + { + "epoch": 0.854006586169045, + "grad_norm": 9.036186780868068, + "learning_rate": 9.008780777125592e-07, + "loss": 0.3459, + "step": 389 + }, + { + "epoch": 0.8562019758507134, + "grad_norm": 7.771864470871327, + "learning_rate": 9.001141504462267e-07, + "loss": 0.3945, + "step": 390 + }, + { + "epoch": 0.858397365532382, + "grad_norm": 10.948074987698849, + "learning_rate": 8.993476172224776e-07, + "loss": 0.3845, + "step": 391 + }, + { + "epoch": 0.8605927552140505, + "grad_norm": 6.279681321387117, + "learning_rate": 8.985784830337694e-07, + "loss": 0.3346, + "step": 392 + }, + { + "epoch": 0.862788144895719, + "grad_norm": 6.22711354144465, + "learning_rate": 8.978067528895001e-07, + "loss": 0.3589, + "step": 393 + }, + { + "epoch": 0.8649835345773875, + "grad_norm": 8.754649020649845, + "learning_rate": 8.970324318159747e-07, + "loss": 0.4202, + "step": 394 + }, + { + "epoch": 0.867178924259056, + "grad_norm": 6.308329054889705, + "learning_rate": 8.962555248563737e-07, + "loss": 0.3964, + "step": 395 + }, + { + "epoch": 0.8693743139407245, + "grad_norm": 5.393641757738077, + "learning_rate": 8.95476037070719e-07, + "loss": 0.4054, + "step": 396 + }, + { + "epoch": 0.8715697036223929, + "grad_norm": 5.6298700405971065, + "learning_rate": 8.94693973535842e-07, + "loss": 0.3947, + "step": 397 + }, + { + "epoch": 0.8737650933040615, + "grad_norm": 4.205803068097396, + "learning_rate": 8.939093393453494e-07, + "loss": 0.3618, + "step": 398 + }, + { + "epoch": 0.8759604829857299, + "grad_norm": 5.473714761916757, + "learning_rate": 8.931221396095914e-07, + "loss": 0.3509, + "step": 399 + }, + { + "epoch": 0.8781558726673985, + "grad_norm": 7.034796071540623, + "learning_rate": 8.92332379455627e-07, + "loss": 0.3482, + "step": 400 + }, + { + "epoch": 0.8781558726673985, + "eval_accuracy": 0.79, + "eval_loss": 0.38279932737350464, + "eval_runtime": 62.3937, + "eval_samples_per_second": 8.014, + "eval_steps_per_second": 1.01, + "step": 400 + }, + { + "epoch": 0.8803512623490669, + "grad_norm": 5.9849568856182955, + "learning_rate": 8.91540064027192e-07, + "loss": 0.3945, + "step": 401 + }, + { + "epoch": 0.8825466520307355, + "grad_norm": 4.900349286406815, + "learning_rate": 8.907451984846642e-07, + "loss": 0.3374, + "step": 402 + }, + { + "epoch": 0.884742041712404, + "grad_norm": 7.015373419320435, + "learning_rate": 8.899477880050305e-07, + "loss": 0.4297, + "step": 403 + }, + { + "epoch": 0.8869374313940724, + "grad_norm": 5.211500475863296, + "learning_rate": 8.891478377818533e-07, + "loss": 0.3425, + "step": 404 + }, + { + "epoch": 0.889132821075741, + "grad_norm": 4.850386411589254, + "learning_rate": 8.883453530252363e-07, + "loss": 0.3479, + "step": 405 + }, + { + "epoch": 0.8913282107574094, + "grad_norm": 5.755909710211362, + "learning_rate": 8.875403389617909e-07, + "loss": 0.4106, + "step": 406 + }, + { + "epoch": 0.893523600439078, + "grad_norm": 6.753928081610862, + "learning_rate": 8.867328008346012e-07, + "loss": 0.3862, + "step": 407 + }, + { + "epoch": 0.8957189901207464, + "grad_norm": 5.767435648190633, + "learning_rate": 8.859227439031917e-07, + "loss": 0.389, + "step": 408 + }, + { + "epoch": 0.897914379802415, + "grad_norm": 5.3650473270965975, + "learning_rate": 8.851101734434916e-07, + "loss": 0.3696, + "step": 409 + }, + { + "epoch": 0.9001097694840834, + "grad_norm": 6.082270492915926, + "learning_rate": 8.842950947478001e-07, + "loss": 0.3817, + "step": 410 + }, + { + "epoch": 0.9023051591657519, + "grad_norm": 4.627051115673926, + "learning_rate": 8.834775131247534e-07, + "loss": 0.3615, + "step": 411 + }, + { + "epoch": 0.9045005488474204, + "grad_norm": 4.703510731345825, + "learning_rate": 8.826574338992893e-07, + "loss": 0.3487, + "step": 412 + }, + { + "epoch": 0.9066959385290889, + "grad_norm": 4.403248729861, + "learning_rate": 8.818348624126122e-07, + "loss": 0.3409, + "step": 413 + }, + { + "epoch": 0.9088913282107574, + "grad_norm": 7.226848694927689, + "learning_rate": 8.810098040221588e-07, + "loss": 0.3674, + "step": 414 + }, + { + "epoch": 0.9110867178924259, + "grad_norm": 6.0854109691994145, + "learning_rate": 8.801822641015635e-07, + "loss": 0.3669, + "step": 415 + }, + { + "epoch": 0.9132821075740944, + "grad_norm": 5.0811223784714725, + "learning_rate": 8.793522480406223e-07, + "loss": 0.3236, + "step": 416 + }, + { + "epoch": 0.9154774972557629, + "grad_norm": 7.237396760486593, + "learning_rate": 8.785197612452591e-07, + "loss": 0.4279, + "step": 417 + }, + { + "epoch": 0.9176728869374314, + "grad_norm": 5.505858472667137, + "learning_rate": 8.776848091374892e-07, + "loss": 0.3658, + "step": 418 + }, + { + "epoch": 0.9198682766190999, + "grad_norm": 7.040175733229395, + "learning_rate": 8.768473971553847e-07, + "loss": 0.3852, + "step": 419 + }, + { + "epoch": 0.9220636663007684, + "grad_norm": 5.592000526909508, + "learning_rate": 8.760075307530392e-07, + "loss": 0.3306, + "step": 420 + }, + { + "epoch": 0.9242590559824369, + "grad_norm": 6.366945719939024, + "learning_rate": 8.75165215400532e-07, + "loss": 0.3906, + "step": 421 + }, + { + "epoch": 0.9264544456641054, + "grad_norm": 7.370004771820366, + "learning_rate": 8.743204565838922e-07, + "loss": 0.3853, + "step": 422 + }, + { + "epoch": 0.9286498353457738, + "grad_norm": 6.381558651368943, + "learning_rate": 8.734732598050636e-07, + "loss": 0.4168, + "step": 423 + }, + { + "epoch": 0.9308452250274424, + "grad_norm": 5.053966639028662, + "learning_rate": 8.726236305818681e-07, + "loss": 0.3842, + "step": 424 + }, + { + "epoch": 0.9330406147091108, + "grad_norm": 5.398688693219092, + "learning_rate": 8.717715744479706e-07, + "loss": 0.3838, + "step": 425 + }, + { + "epoch": 0.9352360043907794, + "grad_norm": 5.2483617600811625, + "learning_rate": 8.709170969528425e-07, + "loss": 0.4371, + "step": 426 + }, + { + "epoch": 0.9374313940724479, + "grad_norm": 4.227523591223431, + "learning_rate": 8.700602036617253e-07, + "loss": 0.3653, + "step": 427 + }, + { + "epoch": 0.9396267837541163, + "grad_norm": 5.346394544482726, + "learning_rate": 8.692009001555951e-07, + "loss": 0.4071, + "step": 428 + }, + { + "epoch": 0.9418221734357849, + "grad_norm": 5.868340936821356, + "learning_rate": 8.683391920311256e-07, + "loss": 0.415, + "step": 429 + }, + { + "epoch": 0.9440175631174533, + "grad_norm": 3.8517100242733346, + "learning_rate": 8.674750849006518e-07, + "loss": 0.3518, + "step": 430 + }, + { + "epoch": 0.9462129527991219, + "grad_norm": 5.514252217717242, + "learning_rate": 8.666085843921337e-07, + "loss": 0.3563, + "step": 431 + }, + { + "epoch": 0.9484083424807903, + "grad_norm": 4.306700297128568, + "learning_rate": 8.65739696149119e-07, + "loss": 0.3731, + "step": 432 + }, + { + "epoch": 0.9506037321624589, + "grad_norm": 5.318133544247306, + "learning_rate": 8.648684258307075e-07, + "loss": 0.4034, + "step": 433 + }, + { + "epoch": 0.9527991218441273, + "grad_norm": 6.306808553382266, + "learning_rate": 8.639947791115131e-07, + "loss": 0.3843, + "step": 434 + }, + { + "epoch": 0.9549945115257958, + "grad_norm": 4.683006509880547, + "learning_rate": 8.631187616816271e-07, + "loss": 0.3426, + "step": 435 + }, + { + "epoch": 0.9571899012074643, + "grad_norm": 5.272777856944059, + "learning_rate": 8.622403792465819e-07, + "loss": 0.3919, + "step": 436 + }, + { + "epoch": 0.9593852908891328, + "grad_norm": 6.567051623139762, + "learning_rate": 8.613596375273127e-07, + "loss": 0.3334, + "step": 437 + }, + { + "epoch": 0.9615806805708014, + "grad_norm": 4.661181804857022, + "learning_rate": 8.604765422601213e-07, + "loss": 0.3393, + "step": 438 + }, + { + "epoch": 0.9637760702524698, + "grad_norm": 7.723055911423496, + "learning_rate": 8.595910991966375e-07, + "loss": 0.3987, + "step": 439 + }, + { + "epoch": 0.9659714599341384, + "grad_norm": 5.1917875018161705, + "learning_rate": 8.587033141037833e-07, + "loss": 0.3833, + "step": 440 + }, + { + "epoch": 0.9681668496158068, + "grad_norm": 4.93039231657318, + "learning_rate": 8.578131927637339e-07, + "loss": 0.321, + "step": 441 + }, + { + "epoch": 0.9703622392974753, + "grad_norm": 5.662266928236786, + "learning_rate": 8.569207409738804e-07, + "loss": 0.3442, + "step": 442 + }, + { + "epoch": 0.9725576289791438, + "grad_norm": 7.530776092189444, + "learning_rate": 8.560259645467927e-07, + "loss": 0.3823, + "step": 443 + }, + { + "epoch": 0.9747530186608123, + "grad_norm": 5.722097050543397, + "learning_rate": 8.551288693101808e-07, + "loss": 0.3381, + "step": 444 + }, + { + "epoch": 0.9769484083424808, + "grad_norm": 5.2154253758252365, + "learning_rate": 8.542294611068573e-07, + "loss": 0.355, + "step": 445 + }, + { + "epoch": 0.9791437980241493, + "grad_norm": 6.183565635622652, + "learning_rate": 8.533277457946988e-07, + "loss": 0.3469, + "step": 446 + }, + { + "epoch": 0.9813391877058177, + "grad_norm": 6.199123575604012, + "learning_rate": 8.524237292466092e-07, + "loss": 0.365, + "step": 447 + }, + { + "epoch": 0.9835345773874863, + "grad_norm": 5.22531850121621, + "learning_rate": 8.515174173504795e-07, + "loss": 0.3699, + "step": 448 + }, + { + "epoch": 0.9857299670691547, + "grad_norm": 4.777247067342119, + "learning_rate": 8.506088160091506e-07, + "loss": 0.3778, + "step": 449 + }, + { + "epoch": 0.9879253567508233, + "grad_norm": 5.7270931969707854, + "learning_rate": 8.49697931140375e-07, + "loss": 0.3722, + "step": 450 + }, + { + "epoch": 0.9879253567508233, + "eval_accuracy": 0.812, + "eval_loss": 0.3796866834163666, + "eval_runtime": 62.6515, + "eval_samples_per_second": 7.981, + "eval_steps_per_second": 1.006, + "step": 450 + }, + { + "epoch": 0.9901207464324918, + "grad_norm": 5.722657890993041, + "learning_rate": 8.487847686767771e-07, + "loss": 0.3671, + "step": 451 + }, + { + "epoch": 0.9923161361141603, + "grad_norm": 6.144552426739494, + "learning_rate": 8.478693345658165e-07, + "loss": 0.3654, + "step": 452 + }, + { + "epoch": 0.9945115257958288, + "grad_norm": 6.225509000256566, + "learning_rate": 8.469516347697472e-07, + "loss": 0.3878, + "step": 453 + }, + { + "epoch": 0.9967069154774972, + "grad_norm": 6.652501158992634, + "learning_rate": 8.460316752655798e-07, + "loss": 0.3277, + "step": 454 + }, + { + "epoch": 0.9989023051591658, + "grad_norm": 4.827682071757918, + "learning_rate": 8.451094620450431e-07, + "loss": 0.3394, + "step": 455 + }, + { + "epoch": 1.0, + "grad_norm": 4.827682071757918, + "learning_rate": 8.441850011145435e-07, + "loss": 0.3908, + "step": 456 + }, + { + "epoch": 1.0021953896816684, + "grad_norm": 8.237448030205368, + "learning_rate": 8.432582984951276e-07, + "loss": 0.3649, + "step": 457 + }, + { + "epoch": 1.004390779363337, + "grad_norm": 5.244969465316055, + "learning_rate": 8.423293602224417e-07, + "loss": 0.3766, + "step": 458 + }, + { + "epoch": 1.0065861690450055, + "grad_norm": 5.735807992043271, + "learning_rate": 8.413981923466932e-07, + "loss": 0.3734, + "step": 459 + }, + { + "epoch": 1.008781558726674, + "grad_norm": 6.279435381847301, + "learning_rate": 8.404648009326111e-07, + "loss": 0.3716, + "step": 460 + }, + { + "epoch": 1.0109769484083424, + "grad_norm": 4.020301331975729, + "learning_rate": 8.395291920594061e-07, + "loss": 0.3068, + "step": 461 + }, + { + "epoch": 1.013172338090011, + "grad_norm": 5.932445501255306, + "learning_rate": 8.385913718207313e-07, + "loss": 0.3635, + "step": 462 + }, + { + "epoch": 1.0153677277716795, + "grad_norm": 6.129385754463756, + "learning_rate": 8.376513463246429e-07, + "loss": 0.3688, + "step": 463 + }, + { + "epoch": 1.017563117453348, + "grad_norm": 5.640316298247342, + "learning_rate": 8.367091216935596e-07, + "loss": 0.3777, + "step": 464 + }, + { + "epoch": 1.0197585071350164, + "grad_norm": 5.53344558624335, + "learning_rate": 8.357647040642231e-07, + "loss": 0.318, + "step": 465 + }, + { + "epoch": 1.021953896816685, + "grad_norm": 5.528084412484368, + "learning_rate": 8.348180995876587e-07, + "loss": 0.344, + "step": 466 + }, + { + "epoch": 1.0241492864983535, + "grad_norm": 4.720679027078674, + "learning_rate": 8.338693144291342e-07, + "loss": 0.3476, + "step": 467 + }, + { + "epoch": 1.026344676180022, + "grad_norm": 4.8918294986912585, + "learning_rate": 8.329183547681205e-07, + "loss": 0.3584, + "step": 468 + }, + { + "epoch": 1.0285400658616906, + "grad_norm": 6.727667695338807, + "learning_rate": 8.319652267982508e-07, + "loss": 0.3377, + "step": 469 + }, + { + "epoch": 1.030735455543359, + "grad_norm": 5.867502190382494, + "learning_rate": 8.310099367272812e-07, + "loss": 0.3591, + "step": 470 + }, + { + "epoch": 1.0329308452250274, + "grad_norm": 6.568389714898798, + "learning_rate": 8.30052490777049e-07, + "loss": 0.434, + "step": 471 + }, + { + "epoch": 1.0351262349066959, + "grad_norm": 5.375130488935029, + "learning_rate": 8.29092895183433e-07, + "loss": 0.3371, + "step": 472 + }, + { + "epoch": 1.0373216245883645, + "grad_norm": 5.0716837196261935, + "learning_rate": 8.281311561963129e-07, + "loss": 0.3583, + "step": 473 + }, + { + "epoch": 1.039517014270033, + "grad_norm": 4.9710453814081035, + "learning_rate": 8.271672800795284e-07, + "loss": 0.3454, + "step": 474 + }, + { + "epoch": 1.0417124039517014, + "grad_norm": 5.372729941376803, + "learning_rate": 8.26201273110838e-07, + "loss": 0.4327, + "step": 475 + }, + { + "epoch": 1.0439077936333698, + "grad_norm": 6.047019069783535, + "learning_rate": 8.252331415818788e-07, + "loss": 0.3166, + "step": 476 + }, + { + "epoch": 1.0461031833150385, + "grad_norm": 4.894109855984722, + "learning_rate": 8.242628917981253e-07, + "loss": 0.3486, + "step": 477 + }, + { + "epoch": 1.048298572996707, + "grad_norm": 4.960287165829203, + "learning_rate": 8.232905300788484e-07, + "loss": 0.3683, + "step": 478 + }, + { + "epoch": 1.0504939626783754, + "grad_norm": 5.9743260148319415, + "learning_rate": 8.223160627570736e-07, + "loss": 0.2962, + "step": 479 + }, + { + "epoch": 1.0526893523600438, + "grad_norm": 4.421002426118984, + "learning_rate": 8.213394961795406e-07, + "loss": 0.353, + "step": 480 + }, + { + "epoch": 1.0548847420417125, + "grad_norm": 5.826816362211787, + "learning_rate": 8.203608367066615e-07, + "loss": 0.4134, + "step": 481 + }, + { + "epoch": 1.057080131723381, + "grad_norm": 6.328500780012271, + "learning_rate": 8.193800907124798e-07, + "loss": 0.3944, + "step": 482 + }, + { + "epoch": 1.0592755214050493, + "grad_norm": 5.761570603930255, + "learning_rate": 8.183972645846282e-07, + "loss": 0.3999, + "step": 483 + }, + { + "epoch": 1.061470911086718, + "grad_norm": 7.269393542940046, + "learning_rate": 8.174123647242877e-07, + "loss": 0.3465, + "step": 484 + }, + { + "epoch": 1.0636663007683864, + "grad_norm": 5.531846580927394, + "learning_rate": 8.164253975461453e-07, + "loss": 0.3688, + "step": 485 + }, + { + "epoch": 1.0658616904500549, + "grad_norm": 4.776075343743403, + "learning_rate": 8.154363694783526e-07, + "loss": 0.3257, + "step": 486 + }, + { + "epoch": 1.0680570801317233, + "grad_norm": 5.378257513897468, + "learning_rate": 8.14445286962484e-07, + "loss": 0.3389, + "step": 487 + }, + { + "epoch": 1.070252469813392, + "grad_norm": 6.510185696868045, + "learning_rate": 8.134521564534947e-07, + "loss": 0.3559, + "step": 488 + }, + { + "epoch": 1.0724478594950604, + "grad_norm": 6.155857107945347, + "learning_rate": 8.124569844196779e-07, + "loss": 0.3617, + "step": 489 + }, + { + "epoch": 1.0746432491767288, + "grad_norm": 6.779049354813868, + "learning_rate": 8.11459777342624e-07, + "loss": 0.3594, + "step": 490 + }, + { + "epoch": 1.0768386388583973, + "grad_norm": 7.620793847029099, + "learning_rate": 8.104605417171776e-07, + "loss": 0.3984, + "step": 491 + }, + { + "epoch": 1.079034028540066, + "grad_norm": 5.064790231482452, + "learning_rate": 8.094592840513949e-07, + "loss": 0.3915, + "step": 492 + }, + { + "epoch": 1.0812294182217344, + "grad_norm": 5.422927842364402, + "learning_rate": 8.084560108665023e-07, + "loss": 0.3322, + "step": 493 + }, + { + "epoch": 1.0834248079034028, + "grad_norm": 4.466736209109948, + "learning_rate": 8.074507286968528e-07, + "loss": 0.3352, + "step": 494 + }, + { + "epoch": 1.0856201975850714, + "grad_norm": 6.338480704389545, + "learning_rate": 8.064434440898844e-07, + "loss": 0.3471, + "step": 495 + }, + { + "epoch": 1.0878155872667399, + "grad_norm": 5.4065627028591425, + "learning_rate": 8.054341636060766e-07, + "loss": 0.3549, + "step": 496 + }, + { + "epoch": 1.0900109769484083, + "grad_norm": 6.850990410700852, + "learning_rate": 8.044228938189088e-07, + "loss": 0.3903, + "step": 497 + }, + { + "epoch": 1.0922063666300768, + "grad_norm": 6.120484367930399, + "learning_rate": 8.034096413148161e-07, + "loss": 0.3097, + "step": 498 + }, + { + "epoch": 1.0944017563117454, + "grad_norm": 4.464128285510355, + "learning_rate": 8.023944126931475e-07, + "loss": 0.3447, + "step": 499 + }, + { + "epoch": 1.0965971459934138, + "grad_norm": 6.600601913970912, + "learning_rate": 8.013772145661224e-07, + "loss": 0.3887, + "step": 500 + }, + { + "epoch": 1.0965971459934138, + "eval_accuracy": 0.818, + "eval_loss": 0.36472970247268677, + "eval_runtime": 62.4385, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 1.009, + "step": 500 + }, + { + "epoch": 1.0987925356750823, + "grad_norm": 5.402138515205016, + "learning_rate": 8.003580535587877e-07, + "loss": 0.3415, + "step": 501 + }, + { + "epoch": 1.1009879253567507, + "grad_norm": 7.628594037642485, + "learning_rate": 7.993369363089745e-07, + "loss": 0.3728, + "step": 502 + }, + { + "epoch": 1.1031833150384194, + "grad_norm": 4.697016196200385, + "learning_rate": 7.983138694672551e-07, + "loss": 0.3638, + "step": 503 + }, + { + "epoch": 1.1053787047200878, + "grad_norm": 6.2040879102322215, + "learning_rate": 7.972888596968996e-07, + "loss": 0.3458, + "step": 504 + }, + { + "epoch": 1.1075740944017562, + "grad_norm": 5.283464383567096, + "learning_rate": 7.962619136738324e-07, + "loss": 0.297, + "step": 505 + }, + { + "epoch": 1.109769484083425, + "grad_norm": 5.685282017291933, + "learning_rate": 7.952330380865887e-07, + "loss": 0.3328, + "step": 506 + }, + { + "epoch": 1.1119648737650933, + "grad_norm": 6.504450967378091, + "learning_rate": 7.942022396362711e-07, + "loss": 0.373, + "step": 507 + }, + { + "epoch": 1.1141602634467618, + "grad_norm": 6.224136967697534, + "learning_rate": 7.931695250365057e-07, + "loss": 0.3587, + "step": 508 + }, + { + "epoch": 1.1163556531284302, + "grad_norm": 6.524542381480522, + "learning_rate": 7.921349010133989e-07, + "loss": 0.3534, + "step": 509 + }, + { + "epoch": 1.1185510428100989, + "grad_norm": 6.508769827549269, + "learning_rate": 7.910983743054931e-07, + "loss": 0.3718, + "step": 510 + }, + { + "epoch": 1.1207464324917673, + "grad_norm": 7.148678198675181, + "learning_rate": 7.900599516637228e-07, + "loss": 0.3331, + "step": 511 + }, + { + "epoch": 1.1229418221734357, + "grad_norm": 5.983605133474822, + "learning_rate": 7.890196398513707e-07, + "loss": 0.353, + "step": 512 + }, + { + "epoch": 1.1251372118551042, + "grad_norm": 7.153049742139248, + "learning_rate": 7.879774456440242e-07, + "loss": 0.348, + "step": 513 + }, + { + "epoch": 1.1273326015367728, + "grad_norm": 7.392639449061027, + "learning_rate": 7.869333758295304e-07, + "loss": 0.3006, + "step": 514 + }, + { + "epoch": 1.1295279912184413, + "grad_norm": 6.288847784247055, + "learning_rate": 7.858874372079522e-07, + "loss": 0.3126, + "step": 515 + }, + { + "epoch": 1.1317233809001097, + "grad_norm": 6.530822602581628, + "learning_rate": 7.848396365915248e-07, + "loss": 0.3757, + "step": 516 + }, + { + "epoch": 1.1339187705817784, + "grad_norm": 7.180396451090792, + "learning_rate": 7.8378998080461e-07, + "loss": 0.3765, + "step": 517 + }, + { + "epoch": 1.1361141602634468, + "grad_norm": 7.893013021100936, + "learning_rate": 7.827384766836523e-07, + "loss": 0.3395, + "step": 518 + }, + { + "epoch": 1.1383095499451152, + "grad_norm": 6.419804870104756, + "learning_rate": 7.81685131077135e-07, + "loss": 0.3196, + "step": 519 + }, + { + "epoch": 1.1405049396267837, + "grad_norm": 7.441481132760177, + "learning_rate": 7.806299508455345e-07, + "loss": 0.4138, + "step": 520 + }, + { + "epoch": 1.1427003293084523, + "grad_norm": 5.097261411128716, + "learning_rate": 7.795729428612768e-07, + "loss": 0.322, + "step": 521 + }, + { + "epoch": 1.1448957189901208, + "grad_norm": 5.716684381877557, + "learning_rate": 7.785141140086913e-07, + "loss": 0.3636, + "step": 522 + }, + { + "epoch": 1.1470911086717892, + "grad_norm": 5.867715208154881, + "learning_rate": 7.774534711839677e-07, + "loss": 0.2981, + "step": 523 + }, + { + "epoch": 1.1492864983534576, + "grad_norm": 4.658345922311703, + "learning_rate": 7.763910212951095e-07, + "loss": 0.3478, + "step": 524 + }, + { + "epoch": 1.1514818880351263, + "grad_norm": 4.168344564692843, + "learning_rate": 7.753267712618898e-07, + "loss": 0.3468, + "step": 525 + }, + { + "epoch": 1.1536772777167947, + "grad_norm": 4.7538599995042645, + "learning_rate": 7.742607280158064e-07, + "loss": 0.3637, + "step": 526 + }, + { + "epoch": 1.1558726673984632, + "grad_norm": 6.571603396212342, + "learning_rate": 7.731928985000359e-07, + "loss": 0.375, + "step": 527 + }, + { + "epoch": 1.1580680570801318, + "grad_norm": 5.753219982389443, + "learning_rate": 7.721232896693894e-07, + "loss": 0.3427, + "step": 528 + }, + { + "epoch": 1.1602634467618003, + "grad_norm": 5.225412701161947, + "learning_rate": 7.710519084902663e-07, + "loss": 0.337, + "step": 529 + }, + { + "epoch": 1.1624588364434687, + "grad_norm": 5.484596435215429, + "learning_rate": 7.699787619406098e-07, + "loss": 0.3408, + "step": 530 + }, + { + "epoch": 1.1646542261251371, + "grad_norm": 5.282076505790076, + "learning_rate": 7.689038570098608e-07, + "loss": 0.3288, + "step": 531 + }, + { + "epoch": 1.1668496158068058, + "grad_norm": 4.775504582805927, + "learning_rate": 7.678272006989124e-07, + "loss": 0.2762, + "step": 532 + }, + { + "epoch": 1.1690450054884742, + "grad_norm": 5.78149113535585, + "learning_rate": 7.667488000200648e-07, + "loss": 0.4126, + "step": 533 + }, + { + "epoch": 1.1712403951701427, + "grad_norm": 5.897322988891718, + "learning_rate": 7.656686619969794e-07, + "loss": 0.3295, + "step": 534 + }, + { + "epoch": 1.173435784851811, + "grad_norm": 5.36707442483951, + "learning_rate": 7.645867936646327e-07, + "loss": 0.3358, + "step": 535 + }, + { + "epoch": 1.1756311745334798, + "grad_norm": 6.350922404290376, + "learning_rate": 7.635032020692706e-07, + "loss": 0.3665, + "step": 536 + }, + { + "epoch": 1.1778265642151482, + "grad_norm": 6.672225216206268, + "learning_rate": 7.624178942683634e-07, + "loss": 0.3237, + "step": 537 + }, + { + "epoch": 1.1800219538968166, + "grad_norm": 5.244066732628582, + "learning_rate": 7.613308773305584e-07, + "loss": 0.3546, + "step": 538 + }, + { + "epoch": 1.1822173435784853, + "grad_norm": 5.640267324413999, + "learning_rate": 7.60242158335635e-07, + "loss": 0.3763, + "step": 539 + }, + { + "epoch": 1.1844127332601537, + "grad_norm": 5.735242776747761, + "learning_rate": 7.591517443744577e-07, + "loss": 0.28, + "step": 540 + }, + { + "epoch": 1.1866081229418222, + "grad_norm": 5.370731176540545, + "learning_rate": 7.58059642548931e-07, + "loss": 0.3005, + "step": 541 + }, + { + "epoch": 1.1888035126234906, + "grad_norm": 5.792287386806627, + "learning_rate": 7.569658599719519e-07, + "loss": 0.3118, + "step": 542 + }, + { + "epoch": 1.1909989023051593, + "grad_norm": 6.798904382545207, + "learning_rate": 7.558704037673647e-07, + "loss": 0.3714, + "step": 543 + }, + { + "epoch": 1.1931942919868277, + "grad_norm": 5.46397725868398, + "learning_rate": 7.547732810699137e-07, + "loss": 0.3581, + "step": 544 + }, + { + "epoch": 1.1953896816684961, + "grad_norm": 5.999655519044568, + "learning_rate": 7.536744990251973e-07, + "loss": 0.3468, + "step": 545 + }, + { + "epoch": 1.1975850713501646, + "grad_norm": 5.809032437933095, + "learning_rate": 7.525740647896213e-07, + "loss": 0.3496, + "step": 546 + }, + { + "epoch": 1.1997804610318332, + "grad_norm": 5.6213191897838035, + "learning_rate": 7.514719855303524e-07, + "loss": 0.2637, + "step": 547 + }, + { + "epoch": 1.2019758507135017, + "grad_norm": 6.309518734364315, + "learning_rate": 7.503682684252711e-07, + "loss": 0.2972, + "step": 548 + }, + { + "epoch": 1.20417124039517, + "grad_norm": 5.10655426718776, + "learning_rate": 7.492629206629255e-07, + "loss": 0.395, + "step": 549 + }, + { + "epoch": 1.2063666300768388, + "grad_norm": 6.985855074156219, + "learning_rate": 7.481559494424839e-07, + "loss": 0.3901, + "step": 550 + }, + { + "epoch": 1.2063666300768388, + "eval_accuracy": 0.818, + "eval_loss": 0.3550068438053131, + "eval_runtime": 62.4228, + "eval_samples_per_second": 8.01, + "eval_steps_per_second": 1.009, + "step": 550 + }, + { + "epoch": 1.2085620197585072, + "grad_norm": 5.452884623196212, + "learning_rate": 7.470473619736889e-07, + "loss": 0.3635, + "step": 551 + }, + { + "epoch": 1.2107574094401756, + "grad_norm": 6.963272384448837, + "learning_rate": 7.459371654768088e-07, + "loss": 0.3905, + "step": 552 + }, + { + "epoch": 1.212952799121844, + "grad_norm": 4.543092820790928, + "learning_rate": 7.448253671825926e-07, + "loss": 0.3023, + "step": 553 + }, + { + "epoch": 1.2151481888035127, + "grad_norm": 5.793455953413061, + "learning_rate": 7.43711974332221e-07, + "loss": 0.3553, + "step": 554 + }, + { + "epoch": 1.2173435784851812, + "grad_norm": 5.5971357032313085, + "learning_rate": 7.425969941772606e-07, + "loss": 0.3217, + "step": 555 + }, + { + "epoch": 1.2195389681668496, + "grad_norm": 7.432739740334462, + "learning_rate": 7.414804339796161e-07, + "loss": 0.3662, + "step": 556 + }, + { + "epoch": 1.221734357848518, + "grad_norm": 5.748364577860177, + "learning_rate": 7.403623010114831e-07, + "loss": 0.3456, + "step": 557 + }, + { + "epoch": 1.2239297475301867, + "grad_norm": 5.661380628505925, + "learning_rate": 7.392426025553002e-07, + "loss": 0.3484, + "step": 558 + }, + { + "epoch": 1.2261251372118551, + "grad_norm": 7.677386392401964, + "learning_rate": 7.381213459037031e-07, + "loss": 0.3324, + "step": 559 + }, + { + "epoch": 1.2283205268935236, + "grad_norm": 5.815978817257181, + "learning_rate": 7.36998538359475e-07, + "loss": 0.3306, + "step": 560 + }, + { + "epoch": 1.2305159165751922, + "grad_norm": 7.157642715894916, + "learning_rate": 7.358741872355008e-07, + "loss": 0.356, + "step": 561 + }, + { + "epoch": 1.2327113062568607, + "grad_norm": 9.217680532850702, + "learning_rate": 7.347482998547184e-07, + "loss": 0.3601, + "step": 562 + }, + { + "epoch": 1.234906695938529, + "grad_norm": 10.045847811800988, + "learning_rate": 7.336208835500719e-07, + "loss": 0.3745, + "step": 563 + }, + { + "epoch": 1.2371020856201975, + "grad_norm": 6.241693811898233, + "learning_rate": 7.324919456644627e-07, + "loss": 0.3575, + "step": 564 + }, + { + "epoch": 1.239297475301866, + "grad_norm": 6.430610540834492, + "learning_rate": 7.313614935507028e-07, + "loss": 0.3293, + "step": 565 + }, + { + "epoch": 1.2414928649835346, + "grad_norm": 5.841827763430915, + "learning_rate": 7.302295345714662e-07, + "loss": 0.3194, + "step": 566 + }, + { + "epoch": 1.243688254665203, + "grad_norm": 5.840825760143234, + "learning_rate": 7.290960760992413e-07, + "loss": 0.3401, + "step": 567 + }, + { + "epoch": 1.2458836443468715, + "grad_norm": 10.441987786035702, + "learning_rate": 7.279611255162824e-07, + "loss": 0.4262, + "step": 568 + }, + { + "epoch": 1.2480790340285401, + "grad_norm": 11.92569950751005, + "learning_rate": 7.268246902145625e-07, + "loss": 0.3483, + "step": 569 + }, + { + "epoch": 1.2502744237102086, + "grad_norm": 6.453124049655114, + "learning_rate": 7.256867775957243e-07, + "loss": 0.384, + "step": 570 + }, + { + "epoch": 1.252469813391877, + "grad_norm": 5.70168194447466, + "learning_rate": 7.245473950710322e-07, + "loss": 0.3201, + "step": 571 + }, + { + "epoch": 1.2546652030735457, + "grad_norm": 6.001337956114435, + "learning_rate": 7.234065500613244e-07, + "loss": 0.3473, + "step": 572 + }, + { + "epoch": 1.2568605927552141, + "grad_norm": 5.271757451994717, + "learning_rate": 7.222642499969644e-07, + "loss": 0.3644, + "step": 573 + }, + { + "epoch": 1.2590559824368825, + "grad_norm": 5.806447016385563, + "learning_rate": 7.211205023177923e-07, + "loss": 0.3146, + "step": 574 + }, + { + "epoch": 1.261251372118551, + "grad_norm": 5.047476182935009, + "learning_rate": 7.199753144730765e-07, + "loss": 0.3406, + "step": 575 + }, + { + "epoch": 1.2634467618002194, + "grad_norm": 5.853247665884473, + "learning_rate": 7.188286939214656e-07, + "loss": 0.3006, + "step": 576 + }, + { + "epoch": 1.265642151481888, + "grad_norm": 6.805598773740254, + "learning_rate": 7.176806481309387e-07, + "loss": 0.3891, + "step": 577 + }, + { + "epoch": 1.2678375411635565, + "grad_norm": 6.946161315608133, + "learning_rate": 7.165311845787587e-07, + "loss": 0.3392, + "step": 578 + }, + { + "epoch": 1.270032930845225, + "grad_norm": 6.46125490152312, + "learning_rate": 7.153803107514217e-07, + "loss": 0.3478, + "step": 579 + }, + { + "epoch": 1.2722283205268936, + "grad_norm": 5.096980092514172, + "learning_rate": 7.142280341446089e-07, + "loss": 0.3449, + "step": 580 + }, + { + "epoch": 1.274423710208562, + "grad_norm": 7.089646471526524, + "learning_rate": 7.130743622631378e-07, + "loss": 0.3008, + "step": 581 + }, + { + "epoch": 1.2766190998902305, + "grad_norm": 5.7888088906336055, + "learning_rate": 7.119193026209142e-07, + "loss": 0.3902, + "step": 582 + }, + { + "epoch": 1.2788144895718991, + "grad_norm": 5.824834458347463, + "learning_rate": 7.107628627408813e-07, + "loss": 0.3129, + "step": 583 + }, + { + "epoch": 1.2810098792535676, + "grad_norm": 8.738486775778084, + "learning_rate": 7.096050501549727e-07, + "loss": 0.4085, + "step": 584 + }, + { + "epoch": 1.283205268935236, + "grad_norm": 6.048998032376583, + "learning_rate": 7.084458724040621e-07, + "loss": 0.3345, + "step": 585 + }, + { + "epoch": 1.2854006586169044, + "grad_norm": 9.605936387748194, + "learning_rate": 7.072853370379144e-07, + "loss": 0.3393, + "step": 586 + }, + { + "epoch": 1.2875960482985729, + "grad_norm": 6.865950862731977, + "learning_rate": 7.061234516151371e-07, + "loss": 0.3301, + "step": 587 + }, + { + "epoch": 1.2897914379802415, + "grad_norm": 6.267095187214198, + "learning_rate": 7.049602237031305e-07, + "loss": 0.2994, + "step": 588 + }, + { + "epoch": 1.29198682766191, + "grad_norm": 5.6232557271177, + "learning_rate": 7.037956608780385e-07, + "loss": 0.3094, + "step": 589 + }, + { + "epoch": 1.2941822173435784, + "grad_norm": 5.801703901127963, + "learning_rate": 7.026297707246993e-07, + "loss": 0.3549, + "step": 590 + }, + { + "epoch": 1.296377607025247, + "grad_norm": 6.414391032155044, + "learning_rate": 7.014625608365962e-07, + "loss": 0.3183, + "step": 591 + }, + { + "epoch": 1.2985729967069155, + "grad_norm": 6.651766488870556, + "learning_rate": 7.002940388158083e-07, + "loss": 0.3676, + "step": 592 + }, + { + "epoch": 1.300768386388584, + "grad_norm": 7.417392797535265, + "learning_rate": 6.991242122729596e-07, + "loss": 0.4093, + "step": 593 + }, + { + "epoch": 1.3029637760702526, + "grad_norm": 6.527075497004787, + "learning_rate": 6.979530888271716e-07, + "loss": 0.3604, + "step": 594 + }, + { + "epoch": 1.305159165751921, + "grad_norm": 6.940696529099327, + "learning_rate": 6.96780676106012e-07, + "loss": 0.2738, + "step": 595 + }, + { + "epoch": 1.3073545554335895, + "grad_norm": 6.972366954094688, + "learning_rate": 6.956069817454458e-07, + "loss": 0.3495, + "step": 596 + }, + { + "epoch": 1.309549945115258, + "grad_norm": 6.427760417094834, + "learning_rate": 6.944320133897855e-07, + "loss": 0.3368, + "step": 597 + }, + { + "epoch": 1.3117453347969263, + "grad_norm": 5.47206109550229, + "learning_rate": 6.932557786916406e-07, + "loss": 0.3293, + "step": 598 + }, + { + "epoch": 1.313940724478595, + "grad_norm": 4.918239274946065, + "learning_rate": 6.920782853118689e-07, + "loss": 0.3746, + "step": 599 + }, + { + "epoch": 1.3161361141602634, + "grad_norm": 6.295802272512743, + "learning_rate": 6.908995409195258e-07, + "loss": 0.3043, + "step": 600 + }, + { + "epoch": 1.3161361141602634, + "eval_accuracy": 0.826, + "eval_loss": 0.35173022747039795, + "eval_runtime": 62.3581, + "eval_samples_per_second": 8.018, + "eval_steps_per_second": 1.01, + "step": 600 + } + ], + "logging_steps": 1, + "max_steps": 1368, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}