| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0965971459934138, | |
| "eval_steps": 50, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0021953896816684962, | |
| "grad_norm": 63.43226149633802, | |
| "learning_rate": 0.0, | |
| "loss": 0.9349, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0043907793633369925, | |
| "grad_norm": 36.34002028523472, | |
| "learning_rate": 7.2992700729927e-09, | |
| "loss": 0.84, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.006586169045005488, | |
| "grad_norm": 35.076123786065985, | |
| "learning_rate": 1.45985401459854e-08, | |
| "loss": 0.872, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.008781558726673985, | |
| "grad_norm": 35.02301842850996, | |
| "learning_rate": 2.1897810218978102e-08, | |
| "loss": 0.851, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.010976948408342482, | |
| "grad_norm": 27.48208643895236, | |
| "learning_rate": 2.91970802919708e-08, | |
| "loss": 0.8809, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.013172338090010977, | |
| "grad_norm": 45.477345508153874, | |
| "learning_rate": 3.64963503649635e-08, | |
| "loss": 0.9287, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.015367727771679473, | |
| "grad_norm": 56.90118203461904, | |
| "learning_rate": 4.3795620437956203e-08, | |
| "loss": 0.9007, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.01756311745334797, | |
| "grad_norm": 46.42558807941359, | |
| "learning_rate": 5.10948905109489e-08, | |
| "loss": 0.9807, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.019758507135016465, | |
| "grad_norm": 42.603618137740504, | |
| "learning_rate": 5.83941605839416e-08, | |
| "loss": 0.9002, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.021953896816684963, | |
| "grad_norm": 31.238590091540356, | |
| "learning_rate": 6.569343065693431e-08, | |
| "loss": 0.8321, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.024149286498353458, | |
| "grad_norm": 74.05095323835384, | |
| "learning_rate": 7.2992700729927e-08, | |
| "loss": 0.7992, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.026344676180021953, | |
| "grad_norm": 37.54325755059656, | |
| "learning_rate": 8.029197080291971e-08, | |
| "loss": 0.8845, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.02854006586169045, | |
| "grad_norm": 42.06630970394279, | |
| "learning_rate": 8.759124087591241e-08, | |
| "loss": 0.9717, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.030735455543358946, | |
| "grad_norm": 46.71572391565178, | |
| "learning_rate": 9.48905109489051e-08, | |
| "loss": 0.868, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.03293084522502744, | |
| "grad_norm": 47.53527688382936, | |
| "learning_rate": 1.021897810218978e-07, | |
| "loss": 0.9339, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.03512623490669594, | |
| "grad_norm": 30.17176017806663, | |
| "learning_rate": 1.0948905109489052e-07, | |
| "loss": 0.8985, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.03732162458836443, | |
| "grad_norm": 53.77317643602209, | |
| "learning_rate": 1.167883211678832e-07, | |
| "loss": 0.9014, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.03951701427003293, | |
| "grad_norm": 37.069232452020195, | |
| "learning_rate": 1.240875912408759e-07, | |
| "loss": 0.8917, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.04171240395170143, | |
| "grad_norm": 57.37762629699942, | |
| "learning_rate": 1.3138686131386862e-07, | |
| "loss": 0.8932, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.043907793633369926, | |
| "grad_norm": 35.926837420049885, | |
| "learning_rate": 1.386861313868613e-07, | |
| "loss": 0.903, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.04610318331503842, | |
| "grad_norm": 32.248278685728465, | |
| "learning_rate": 1.45985401459854e-07, | |
| "loss": 0.8963, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.048298572996706916, | |
| "grad_norm": 29.26799952212303, | |
| "learning_rate": 1.532846715328467e-07, | |
| "loss": 0.8644, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.050493962678375415, | |
| "grad_norm": 30.785195210511173, | |
| "learning_rate": 1.6058394160583942e-07, | |
| "loss": 0.8437, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.052689352360043906, | |
| "grad_norm": 33.71197663612937, | |
| "learning_rate": 1.678832116788321e-07, | |
| "loss": 0.8897, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.054884742041712405, | |
| "grad_norm": 35.96552084220703, | |
| "learning_rate": 1.7518248175182481e-07, | |
| "loss": 0.8967, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.0570801317233809, | |
| "grad_norm": 41.50138484689535, | |
| "learning_rate": 1.824817518248175e-07, | |
| "loss": 0.9074, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.059275521405049394, | |
| "grad_norm": 38.797876016859426, | |
| "learning_rate": 1.897810218978102e-07, | |
| "loss": 0.9287, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.06147091108671789, | |
| "grad_norm": 32.19034408841759, | |
| "learning_rate": 1.9708029197080292e-07, | |
| "loss": 0.8716, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.06366630076838639, | |
| "grad_norm": 51.76045022284941, | |
| "learning_rate": 2.043795620437956e-07, | |
| "loss": 0.8812, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.06586169045005488, | |
| "grad_norm": 31.359275794152296, | |
| "learning_rate": 2.116788321167883e-07, | |
| "loss": 0.8557, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.06805708013172337, | |
| "grad_norm": 30.678686692428276, | |
| "learning_rate": 2.1897810218978103e-07, | |
| "loss": 0.8758, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.07025246981339188, | |
| "grad_norm": 32.41276845704741, | |
| "learning_rate": 2.2627737226277372e-07, | |
| "loss": 0.8713, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.07244785949506037, | |
| "grad_norm": 28.950024669686137, | |
| "learning_rate": 2.335766423357664e-07, | |
| "loss": 0.8627, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.07464324917672886, | |
| "grad_norm": 18.417431019302867, | |
| "learning_rate": 2.408759124087591e-07, | |
| "loss": 0.7561, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.07683863885839737, | |
| "grad_norm": 30.826596106550824, | |
| "learning_rate": 2.481751824817518e-07, | |
| "loss": 0.9044, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.07903402854006586, | |
| "grad_norm": 27.86009502146877, | |
| "learning_rate": 2.5547445255474454e-07, | |
| "loss": 0.9158, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.08122941822173436, | |
| "grad_norm": 33.10171663561771, | |
| "learning_rate": 2.6277372262773725e-07, | |
| "loss": 0.8973, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.08342480790340286, | |
| "grad_norm": 33.54784642550598, | |
| "learning_rate": 2.700729927007299e-07, | |
| "loss": 0.8854, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.08562019758507135, | |
| "grad_norm": 31.94225571174728, | |
| "learning_rate": 2.773722627737226e-07, | |
| "loss": 0.8634, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.08781558726673985, | |
| "grad_norm": 18.667616381853037, | |
| "learning_rate": 2.846715328467153e-07, | |
| "loss": 0.8365, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.09001097694840834, | |
| "grad_norm": 33.72334370651476, | |
| "learning_rate": 2.91970802919708e-07, | |
| "loss": 0.8826, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.09220636663007684, | |
| "grad_norm": 26.43734180083069, | |
| "learning_rate": 2.9927007299270075e-07, | |
| "loss": 0.8895, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.09440175631174534, | |
| "grad_norm": 31.963757242976843, | |
| "learning_rate": 3.065693430656934e-07, | |
| "loss": 0.8623, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.09659714599341383, | |
| "grad_norm": 21.0251515948571, | |
| "learning_rate": 3.138686131386861e-07, | |
| "loss": 0.7966, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.09879253567508232, | |
| "grad_norm": 31.038625878390096, | |
| "learning_rate": 3.2116788321167883e-07, | |
| "loss": 0.8834, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.10098792535675083, | |
| "grad_norm": 20.27262840408739, | |
| "learning_rate": 3.284671532846715e-07, | |
| "loss": 0.833, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.10318331503841932, | |
| "grad_norm": 27.9353455887378, | |
| "learning_rate": 3.357664233576642e-07, | |
| "loss": 0.8401, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.10537870472008781, | |
| "grad_norm": 24.712611262961317, | |
| "learning_rate": 3.4306569343065697e-07, | |
| "loss": 0.759, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.10757409440175632, | |
| "grad_norm": 19.616432703856, | |
| "learning_rate": 3.5036496350364963e-07, | |
| "loss": 0.7371, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.10976948408342481, | |
| "grad_norm": 34.3573339583084, | |
| "learning_rate": 3.5766423357664234e-07, | |
| "loss": 0.8114, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.10976948408342481, | |
| "eval_accuracy": 0.564, | |
| "eval_loss": 0.6804020404815674, | |
| "eval_runtime": 51.4851, | |
| "eval_samples_per_second": 9.712, | |
| "eval_steps_per_second": 1.224, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.1119648737650933, | |
| "grad_norm": 52.44727313113604, | |
| "learning_rate": 3.64963503649635e-07, | |
| "loss": 0.8058, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.1141602634467618, | |
| "grad_norm": 29.635805263155994, | |
| "learning_rate": 3.722627737226277e-07, | |
| "loss": 0.7798, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.1163556531284303, | |
| "grad_norm": 16.05086854109441, | |
| "learning_rate": 3.795620437956204e-07, | |
| "loss": 0.7593, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.11855104281009879, | |
| "grad_norm": 29.823000715758084, | |
| "learning_rate": 3.8686131386861313e-07, | |
| "loss": 0.7561, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.1207464324917673, | |
| "grad_norm": 14.127126775727975, | |
| "learning_rate": 3.9416058394160584e-07, | |
| "loss": 0.7271, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.12294182217343579, | |
| "grad_norm": 21.593202970179426, | |
| "learning_rate": 4.0145985401459856e-07, | |
| "loss": 0.7764, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.1251372118551043, | |
| "grad_norm": 14.951837963554619, | |
| "learning_rate": 4.087591240875912e-07, | |
| "loss": 0.6718, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.12733260153677278, | |
| "grad_norm": 15.378678774753151, | |
| "learning_rate": 4.160583941605839e-07, | |
| "loss": 0.7698, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.12952799121844127, | |
| "grad_norm": 24.73055485970318, | |
| "learning_rate": 4.233576642335766e-07, | |
| "loss": 0.7673, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.13172338090010977, | |
| "grad_norm": 17.57321911038973, | |
| "learning_rate": 4.306569343065693e-07, | |
| "loss": 0.7157, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.13391877058177826, | |
| "grad_norm": 16.77467268690982, | |
| "learning_rate": 4.3795620437956206e-07, | |
| "loss": 0.7432, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.13611416026344675, | |
| "grad_norm": 27.713531401351116, | |
| "learning_rate": 4.452554744525547e-07, | |
| "loss": 0.8336, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.13830954994511527, | |
| "grad_norm": 18.28886043174935, | |
| "learning_rate": 4.5255474452554743e-07, | |
| "loss": 0.6868, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.14050493962678376, | |
| "grad_norm": 17.56165424803153, | |
| "learning_rate": 4.5985401459854014e-07, | |
| "loss": 0.8134, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.14270032930845225, | |
| "grad_norm": 16.76906700049957, | |
| "learning_rate": 4.671532846715328e-07, | |
| "loss": 0.7071, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.14489571899012074, | |
| "grad_norm": 22.604350303541462, | |
| "learning_rate": 4.744525547445255e-07, | |
| "loss": 0.7717, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.14709110867178923, | |
| "grad_norm": 15.57570075158043, | |
| "learning_rate": 4.817518248175182e-07, | |
| "loss": 0.6858, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.14928649835345773, | |
| "grad_norm": 23.99766199274266, | |
| "learning_rate": 4.89051094890511e-07, | |
| "loss": 0.6925, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.15148188803512624, | |
| "grad_norm": 14.622811969684838, | |
| "learning_rate": 4.963503649635036e-07, | |
| "loss": 0.6772, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.15367727771679474, | |
| "grad_norm": 11.873629512549707, | |
| "learning_rate": 5.036496350364964e-07, | |
| "loss": 0.7201, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.15587266739846323, | |
| "grad_norm": 14.971650059257875, | |
| "learning_rate": 5.109489051094891e-07, | |
| "loss": 0.7144, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.15806805708013172, | |
| "grad_norm": 14.89771761097792, | |
| "learning_rate": 5.182481751824817e-07, | |
| "loss": 0.6823, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.1602634467618002, | |
| "grad_norm": 26.145706119308308, | |
| "learning_rate": 5.255474452554745e-07, | |
| "loss": 0.6815, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.16245883644346873, | |
| "grad_norm": 25.218475252582063, | |
| "learning_rate": 5.328467153284672e-07, | |
| "loss": 0.6683, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.16465422612513722, | |
| "grad_norm": 24.795553870705557, | |
| "learning_rate": 5.401459854014598e-07, | |
| "loss": 0.6869, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.1668496158068057, | |
| "grad_norm": 20.06359220796973, | |
| "learning_rate": 5.474452554744526e-07, | |
| "loss": 0.6776, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.1690450054884742, | |
| "grad_norm": 14.605622927723077, | |
| "learning_rate": 5.547445255474452e-07, | |
| "loss": 0.6169, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.1712403951701427, | |
| "grad_norm": 13.398369259423399, | |
| "learning_rate": 5.620437956204379e-07, | |
| "loss": 0.5911, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.1734357848518112, | |
| "grad_norm": 22.20812822444505, | |
| "learning_rate": 5.693430656934306e-07, | |
| "loss": 0.6641, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.1756311745334797, | |
| "grad_norm": 11.530488794545967, | |
| "learning_rate": 5.766423357664233e-07, | |
| "loss": 0.6546, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.1778265642151482, | |
| "grad_norm": 19.45472907398047, | |
| "learning_rate": 5.83941605839416e-07, | |
| "loss": 0.6346, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.1800219538968167, | |
| "grad_norm": 14.402272095344948, | |
| "learning_rate": 5.912408759124087e-07, | |
| "loss": 0.6157, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.18221734357848518, | |
| "grad_norm": 20.486016309108074, | |
| "learning_rate": 5.985401459854015e-07, | |
| "loss": 0.6353, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.18441273326015367, | |
| "grad_norm": 21.724039606610585, | |
| "learning_rate": 6.058394160583942e-07, | |
| "loss": 0.6669, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.18660812294182216, | |
| "grad_norm": 23.550204927202188, | |
| "learning_rate": 6.131386861313868e-07, | |
| "loss": 0.6573, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.18880351262349068, | |
| "grad_norm": 25.850348423694676, | |
| "learning_rate": 6.204379562043796e-07, | |
| "loss": 0.6096, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.19099890230515917, | |
| "grad_norm": 15.668802168044579, | |
| "learning_rate": 6.277372262773722e-07, | |
| "loss": 0.6074, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.19319429198682767, | |
| "grad_norm": 11.15207045876449, | |
| "learning_rate": 6.350364963503649e-07, | |
| "loss": 0.5861, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.19538968166849616, | |
| "grad_norm": 30.989418951503534, | |
| "learning_rate": 6.423357664233577e-07, | |
| "loss": 0.5803, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.19758507135016465, | |
| "grad_norm": 37.87369273674589, | |
| "learning_rate": 6.496350364963503e-07, | |
| "loss": 0.5607, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.19978046103183314, | |
| "grad_norm": 16.304752909448357, | |
| "learning_rate": 6.56934306569343e-07, | |
| "loss": 0.4855, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.20197585071350166, | |
| "grad_norm": 46.813965107422554, | |
| "learning_rate": 6.642335766423358e-07, | |
| "loss": 0.5277, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.20417124039517015, | |
| "grad_norm": 16.25834561476597, | |
| "learning_rate": 6.715328467153284e-07, | |
| "loss": 0.5671, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.20636663007683864, | |
| "grad_norm": 24.571054002712096, | |
| "learning_rate": 6.788321167883211e-07, | |
| "loss": 0.5485, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.20856201975850713, | |
| "grad_norm": 17.16346868917258, | |
| "learning_rate": 6.861313868613139e-07, | |
| "loss": 0.6168, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.21075740944017562, | |
| "grad_norm": 24.939402022765716, | |
| "learning_rate": 6.934306569343066e-07, | |
| "loss": 0.5097, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.21295279912184412, | |
| "grad_norm": 35.04290652380563, | |
| "learning_rate": 7.007299270072993e-07, | |
| "loss": 0.511, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.21514818880351264, | |
| "grad_norm": 19.369038528744387, | |
| "learning_rate": 7.080291970802919e-07, | |
| "loss": 0.501, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.21734357848518113, | |
| "grad_norm": 14.504342290526917, | |
| "learning_rate": 7.153284671532847e-07, | |
| "loss": 0.5576, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.21953896816684962, | |
| "grad_norm": 22.831165380560154, | |
| "learning_rate": 7.226277372262773e-07, | |
| "loss": 0.6357, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.21953896816684962, | |
| "eval_accuracy": 0.716, | |
| "eval_loss": 0.5146071910858154, | |
| "eval_runtime": 51.4544, | |
| "eval_samples_per_second": 9.717, | |
| "eval_steps_per_second": 1.224, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2217343578485181, | |
| "grad_norm": 17.214759546075644, | |
| "learning_rate": 7.2992700729927e-07, | |
| "loss": 0.5879, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.2239297475301866, | |
| "grad_norm": 28.14453156116428, | |
| "learning_rate": 7.372262773722628e-07, | |
| "loss": 0.6077, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.2261251372118551, | |
| "grad_norm": 36.81857394774189, | |
| "learning_rate": 7.445255474452554e-07, | |
| "loss": 0.5656, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.2283205268935236, | |
| "grad_norm": 19.720256889573427, | |
| "learning_rate": 7.518248175182481e-07, | |
| "loss": 0.5779, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.2305159165751921, | |
| "grad_norm": 13.404265085713362, | |
| "learning_rate": 7.591240875912408e-07, | |
| "loss": 0.6194, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.2327113062568606, | |
| "grad_norm": 10.223644936756514, | |
| "learning_rate": 7.664233576642335e-07, | |
| "loss": 0.5653, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.2349066959385291, | |
| "grad_norm": 11.802424231339492, | |
| "learning_rate": 7.737226277372263e-07, | |
| "loss": 0.5243, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.23710208562019758, | |
| "grad_norm": 15.21599780255988, | |
| "learning_rate": 7.81021897810219e-07, | |
| "loss": 0.5237, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.23929747530186607, | |
| "grad_norm": 9.817905688792953, | |
| "learning_rate": 7.883211678832117e-07, | |
| "loss": 0.5559, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.2414928649835346, | |
| "grad_norm": 14.019441837913432, | |
| "learning_rate": 7.956204379562043e-07, | |
| "loss": 0.5634, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.24368825466520308, | |
| "grad_norm": 11.287304916292086, | |
| "learning_rate": 8.029197080291971e-07, | |
| "loss": 0.5169, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.24588364434687157, | |
| "grad_norm": 13.731826260486923, | |
| "learning_rate": 8.102189781021898e-07, | |
| "loss": 0.4922, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.24807903402854006, | |
| "grad_norm": 10.127768690583599, | |
| "learning_rate": 8.175182481751824e-07, | |
| "loss": 0.4939, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.2502744237102086, | |
| "grad_norm": 10.643603728793044, | |
| "learning_rate": 8.248175182481751e-07, | |
| "loss": 0.5433, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.2524698133918771, | |
| "grad_norm": 16.423886102329853, | |
| "learning_rate": 8.321167883211679e-07, | |
| "loss": 0.4931, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.25466520307354557, | |
| "grad_norm": 11.683580419342695, | |
| "learning_rate": 8.394160583941605e-07, | |
| "loss": 0.6014, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.25686059275521406, | |
| "grad_norm": 9.964987145456663, | |
| "learning_rate": 8.467153284671532e-07, | |
| "loss": 0.5111, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.25905598243688255, | |
| "grad_norm": 11.994223750763792, | |
| "learning_rate": 8.540145985401459e-07, | |
| "loss": 0.5427, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.26125137211855104, | |
| "grad_norm": 17.08922954256959, | |
| "learning_rate": 8.613138686131386e-07, | |
| "loss": 0.5454, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.26344676180021953, | |
| "grad_norm": 9.233224929973781, | |
| "learning_rate": 8.686131386861314e-07, | |
| "loss": 0.4425, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.265642151481888, | |
| "grad_norm": 11.158793551879755, | |
| "learning_rate": 8.759124087591241e-07, | |
| "loss": 0.4592, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.2678375411635565, | |
| "grad_norm": 9.984032512868838, | |
| "learning_rate": 8.832116788321168e-07, | |
| "loss": 0.4619, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.270032930845225, | |
| "grad_norm": 13.6121955869425, | |
| "learning_rate": 8.905109489051094e-07, | |
| "loss": 0.5102, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.2722283205268935, | |
| "grad_norm": 16.38565463082547, | |
| "learning_rate": 8.978102189781022e-07, | |
| "loss": 0.5414, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.27442371020856204, | |
| "grad_norm": 13.72322184659144, | |
| "learning_rate": 9.051094890510949e-07, | |
| "loss": 0.4404, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.27661909989023054, | |
| "grad_norm": 9.728231053042531, | |
| "learning_rate": 9.124087591240875e-07, | |
| "loss": 0.4891, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.278814489571899, | |
| "grad_norm": 19.653134550591925, | |
| "learning_rate": 9.197080291970803e-07, | |
| "loss": 0.4911, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.2810098792535675, | |
| "grad_norm": 15.233058767794994, | |
| "learning_rate": 9.270072992700729e-07, | |
| "loss": 0.4482, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.283205268935236, | |
| "grad_norm": 14.85049447902684, | |
| "learning_rate": 9.343065693430656e-07, | |
| "loss": 0.4381, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.2854006586169045, | |
| "grad_norm": 18.23588962213726, | |
| "learning_rate": 9.416058394160583e-07, | |
| "loss": 0.5267, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.287596048298573, | |
| "grad_norm": 13.687385484203132, | |
| "learning_rate": 9.48905109489051e-07, | |
| "loss": 0.533, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.2897914379802415, | |
| "grad_norm": 16.31348614065883, | |
| "learning_rate": 9.562043795620438e-07, | |
| "loss": 0.4592, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.29198682766191, | |
| "grad_norm": 10.619144950897025, | |
| "learning_rate": 9.635036496350364e-07, | |
| "loss": 0.5337, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.29418221734357847, | |
| "grad_norm": 10.964180780818202, | |
| "learning_rate": 9.708029197080291e-07, | |
| "loss": 0.531, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.29637760702524696, | |
| "grad_norm": 9.68554990094299, | |
| "learning_rate": 9.78102189781022e-07, | |
| "loss": 0.4466, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.29857299670691545, | |
| "grad_norm": 11.193131252386175, | |
| "learning_rate": 9.854014598540146e-07, | |
| "loss": 0.4518, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.300768386388584, | |
| "grad_norm": 7.652184708715949, | |
| "learning_rate": 9.927007299270073e-07, | |
| "loss": 0.4514, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.3029637760702525, | |
| "grad_norm": 9.282129557780532, | |
| "learning_rate": 1e-06, | |
| "loss": 0.4952, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.305159165751921, | |
| "grad_norm": 9.514439267127576, | |
| "learning_rate": 9.999983717412808e-07, | |
| "loss": 0.5277, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.30735455543358947, | |
| "grad_norm": 8.04642891691016, | |
| "learning_rate": 9.999934869757278e-07, | |
| "loss": 0.5323, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.30954994511525796, | |
| "grad_norm": 14.24768033484951, | |
| "learning_rate": 9.999853457351558e-07, | |
| "loss": 0.4996, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.31174533479692645, | |
| "grad_norm": 11.061860342712375, | |
| "learning_rate": 9.999739480725893e-07, | |
| "loss": 0.4849, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.31394072447859495, | |
| "grad_norm": 15.443833167332631, | |
| "learning_rate": 9.999592940622613e-07, | |
| "loss": 0.4957, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.31613611416026344, | |
| "grad_norm": 9.488802082671176, | |
| "learning_rate": 9.999413837996137e-07, | |
| "loss": 0.495, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.31833150384193193, | |
| "grad_norm": 7.092185690265562, | |
| "learning_rate": 9.999202174012972e-07, | |
| "loss": 0.4189, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.3205268935236004, | |
| "grad_norm": 8.422031666334862, | |
| "learning_rate": 9.99895795005169e-07, | |
| "loss": 0.4548, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.3227222832052689, | |
| "grad_norm": 9.442638766406834, | |
| "learning_rate": 9.99868116770293e-07, | |
| "loss": 0.4385, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.32491767288693746, | |
| "grad_norm": 7.687755924564992, | |
| "learning_rate": 9.998371828769384e-07, | |
| "loss": 0.4726, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.32711306256860595, | |
| "grad_norm": 11.01823455774795, | |
| "learning_rate": 9.99802993526579e-07, | |
| "loss": 0.5062, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.32930845225027444, | |
| "grad_norm": 8.828666504433839, | |
| "learning_rate": 9.997655489418912e-07, | |
| "loss": 0.5211, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.32930845225027444, | |
| "eval_accuracy": 0.774, | |
| "eval_loss": 0.4343988001346588, | |
| "eval_runtime": 52.0075, | |
| "eval_samples_per_second": 9.614, | |
| "eval_steps_per_second": 1.211, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.33150384193194293, | |
| "grad_norm": 9.826049631904308, | |
| "learning_rate": 9.997248493667527e-07, | |
| "loss": 0.5144, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.3336992316136114, | |
| "grad_norm": 9.822644682623359, | |
| "learning_rate": 9.996808950662413e-07, | |
| "loss": 0.46, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.3358946212952799, | |
| "grad_norm": 7.448251385258531, | |
| "learning_rate": 9.99633686326633e-07, | |
| "loss": 0.4837, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.3380900109769484, | |
| "grad_norm": 8.66825069576008, | |
| "learning_rate": 9.995832234554e-07, | |
| "loss": 0.4442, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.3402854006586169, | |
| "grad_norm": 9.445663098800049, | |
| "learning_rate": 9.995295067812083e-07, | |
| "loss": 0.4984, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.3424807903402854, | |
| "grad_norm": 7.886446457789178, | |
| "learning_rate": 9.99472536653917e-07, | |
| "loss": 0.4888, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.3446761800219539, | |
| "grad_norm": 6.5270562387929285, | |
| "learning_rate": 9.994123134445746e-07, | |
| "loss": 0.4608, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.3468715697036224, | |
| "grad_norm": 8.44851437549508, | |
| "learning_rate": 9.993488375454165e-07, | |
| "loss": 0.4319, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.34906695938529086, | |
| "grad_norm": 8.945522697664286, | |
| "learning_rate": 9.992821093698636e-07, | |
| "loss": 0.52, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.3512623490669594, | |
| "grad_norm": 6.9004623022662965, | |
| "learning_rate": 9.992121293525188e-07, | |
| "loss": 0.5153, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.3534577387486279, | |
| "grad_norm": 9.258099865422958, | |
| "learning_rate": 9.991388979491646e-07, | |
| "loss": 0.4248, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.3556531284302964, | |
| "grad_norm": 9.52300325877657, | |
| "learning_rate": 9.990624156367596e-07, | |
| "loss": 0.4744, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.3578485181119649, | |
| "grad_norm": 8.87937529404168, | |
| "learning_rate": 9.989826829134356e-07, | |
| "loss": 0.4804, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.3600439077936334, | |
| "grad_norm": 8.218898687145828, | |
| "learning_rate": 9.988997002984949e-07, | |
| "loss": 0.5154, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.36223929747530187, | |
| "grad_norm": 9.404715926766505, | |
| "learning_rate": 9.988134683324058e-07, | |
| "loss": 0.4595, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.36443468715697036, | |
| "grad_norm": 8.404212842327881, | |
| "learning_rate": 9.987239875768006e-07, | |
| "loss": 0.4122, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.36663007683863885, | |
| "grad_norm": 12.927034279467245, | |
| "learning_rate": 9.9863125861447e-07, | |
| "loss": 0.458, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.36882546652030734, | |
| "grad_norm": 9.395688075375638, | |
| "learning_rate": 9.985352820493614e-07, | |
| "loss": 0.4442, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.37102085620197583, | |
| "grad_norm": 12.027605468945074, | |
| "learning_rate": 9.984360585065733e-07, | |
| "loss": 0.4342, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.3732162458836443, | |
| "grad_norm": 11.101898867057535, | |
| "learning_rate": 9.983335886323524e-07, | |
| "loss": 0.4457, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.3754116355653128, | |
| "grad_norm": 9.788751810099042, | |
| "learning_rate": 9.98227873094088e-07, | |
| "loss": 0.4773, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.37760702524698136, | |
| "grad_norm": 8.772604827049461, | |
| "learning_rate": 9.981189125803095e-07, | |
| "loss": 0.4763, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.37980241492864986, | |
| "grad_norm": 9.88650795305363, | |
| "learning_rate": 9.980067078006804e-07, | |
| "loss": 0.5062, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.38199780461031835, | |
| "grad_norm": 8.73207633341288, | |
| "learning_rate": 9.978912594859946e-07, | |
| "loss": 0.5079, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.38419319429198684, | |
| "grad_norm": 7.037929236057823, | |
| "learning_rate": 9.977725683881707e-07, | |
| "loss": 0.4444, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.38638858397365533, | |
| "grad_norm": 14.22447093808163, | |
| "learning_rate": 9.97650635280248e-07, | |
| "loss": 0.4683, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.3885839736553238, | |
| "grad_norm": 6.850018676344095, | |
| "learning_rate": 9.97525460956381e-07, | |
| "loss": 0.4544, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.3907793633369923, | |
| "grad_norm": 9.295362171330144, | |
| "learning_rate": 9.973970462318349e-07, | |
| "loss": 0.4168, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.3929747530186608, | |
| "grad_norm": 9.498462196078586, | |
| "learning_rate": 9.972653919429788e-07, | |
| "loss": 0.4762, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.3951701427003293, | |
| "grad_norm": 14.51893007863248, | |
| "learning_rate": 9.971304989472817e-07, | |
| "loss": 0.429, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.3973655323819978, | |
| "grad_norm": 10.861523669617597, | |
| "learning_rate": 9.969923681233066e-07, | |
| "loss": 0.5009, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.3995609220636663, | |
| "grad_norm": 8.71310472180412, | |
| "learning_rate": 9.968510003707042e-07, | |
| "loss": 0.4562, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.40175631174533477, | |
| "grad_norm": 7.504849700550564, | |
| "learning_rate": 9.967063966102079e-07, | |
| "loss": 0.4103, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.4039517014270033, | |
| "grad_norm": 11.160809122945897, | |
| "learning_rate": 9.965585577836264e-07, | |
| "loss": 0.4868, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.4061470911086718, | |
| "grad_norm": 9.173442546613508, | |
| "learning_rate": 9.9640748485384e-07, | |
| "loss": 0.4178, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.4083424807903403, | |
| "grad_norm": 9.946380701067211, | |
| "learning_rate": 9.962531788047913e-07, | |
| "loss": 0.4075, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.4105378704720088, | |
| "grad_norm": 6.7742539591322775, | |
| "learning_rate": 9.960956406414813e-07, | |
| "loss": 0.4228, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.4127332601536773, | |
| "grad_norm": 6.667418828739606, | |
| "learning_rate": 9.959348713899613e-07, | |
| "loss": 0.4313, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.4149286498353458, | |
| "grad_norm": 8.5318733439587, | |
| "learning_rate": 9.957708720973273e-07, | |
| "loss": 0.432, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.41712403951701427, | |
| "grad_norm": 8.281669822513821, | |
| "learning_rate": 9.956036438317123e-07, | |
| "loss": 0.4454, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.41931942919868276, | |
| "grad_norm": 11.094591963482126, | |
| "learning_rate": 9.954331876822798e-07, | |
| "loss": 0.4472, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.42151481888035125, | |
| "grad_norm": 10.576004038338146, | |
| "learning_rate": 9.952595047592167e-07, | |
| "loss": 0.5284, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.42371020856201974, | |
| "grad_norm": 7.579127144334551, | |
| "learning_rate": 9.950825961937257e-07, | |
| "loss": 0.4835, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.42590559824368823, | |
| "grad_norm": 9.83385395859545, | |
| "learning_rate": 9.949024631380189e-07, | |
| "loss": 0.4513, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.4281009879253567, | |
| "grad_norm": 9.260566516176205, | |
| "learning_rate": 9.94719106765309e-07, | |
| "loss": 0.4215, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.43029637760702527, | |
| "grad_norm": 9.7492885786622, | |
| "learning_rate": 9.945325282698022e-07, | |
| "loss": 0.4338, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.43249176728869376, | |
| "grad_norm": 8.278244359603962, | |
| "learning_rate": 9.94342728866691e-07, | |
| "loss": 0.4249, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.43468715697036225, | |
| "grad_norm": 9.668030105849532, | |
| "learning_rate": 9.941497097921456e-07, | |
| "loss": 0.4328, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.43688254665203075, | |
| "grad_norm": 9.271666205840873, | |
| "learning_rate": 9.939534723033057e-07, | |
| "loss": 0.4424, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.43907793633369924, | |
| "grad_norm": 9.508865253140504, | |
| "learning_rate": 9.937540176782731e-07, | |
| "loss": 0.4076, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.43907793633369924, | |
| "eval_accuracy": 0.778, | |
| "eval_loss": 0.41148170828819275, | |
| "eval_runtime": 52.0039, | |
| "eval_samples_per_second": 9.615, | |
| "eval_steps_per_second": 1.211, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.44127332601536773, | |
| "grad_norm": 15.641606615350042, | |
| "learning_rate": 9.935513472161026e-07, | |
| "loss": 0.5175, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.4434687156970362, | |
| "grad_norm": 6.583946691921041, | |
| "learning_rate": 9.93345462236794e-07, | |
| "loss": 0.4612, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.4456641053787047, | |
| "grad_norm": 7.317576490413668, | |
| "learning_rate": 9.931363640812837e-07, | |
| "loss": 0.4319, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.4478594950603732, | |
| "grad_norm": 8.58238734593965, | |
| "learning_rate": 9.929240541114347e-07, | |
| "loss": 0.4362, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.4500548847420417, | |
| "grad_norm": 7.645878462772526, | |
| "learning_rate": 9.927085337100298e-07, | |
| "loss": 0.4449, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.4522502744237102, | |
| "grad_norm": 8.128944249572761, | |
| "learning_rate": 9.924898042807604e-07, | |
| "loss": 0.4359, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.4544456641053787, | |
| "grad_norm": 6.323254265724658, | |
| "learning_rate": 9.922678672482192e-07, | |
| "loss": 0.4349, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.4566410537870472, | |
| "grad_norm": 8.343466628765478, | |
| "learning_rate": 9.920427240578898e-07, | |
| "loss": 0.4601, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.4588364434687157, | |
| "grad_norm": 9.94252397747147, | |
| "learning_rate": 9.918143761761376e-07, | |
| "loss": 0.4275, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.4610318331503842, | |
| "grad_norm": 6.996773271668393, | |
| "learning_rate": 9.915828250902003e-07, | |
| "loss": 0.4092, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.4632272228320527, | |
| "grad_norm": 7.702375964154107, | |
| "learning_rate": 9.913480723081782e-07, | |
| "loss": 0.4039, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.4654226125137212, | |
| "grad_norm": 12.275736080317706, | |
| "learning_rate": 9.911101193590243e-07, | |
| "loss": 0.483, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.4676180021953897, | |
| "grad_norm": 13.554222149681413, | |
| "learning_rate": 9.908689677925347e-07, | |
| "loss": 0.408, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.4698133918770582, | |
| "grad_norm": 10.859860574414233, | |
| "learning_rate": 9.906246191793378e-07, | |
| "loss": 0.4032, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.47200878155872666, | |
| "grad_norm": 13.78109413813406, | |
| "learning_rate": 9.903770751108845e-07, | |
| "loss": 0.414, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.47420417124039516, | |
| "grad_norm": 7.982093750602254, | |
| "learning_rate": 9.901263371994381e-07, | |
| "loss": 0.4097, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.47639956092206365, | |
| "grad_norm": 7.226081741797403, | |
| "learning_rate": 9.898724070780636e-07, | |
| "loss": 0.424, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.47859495060373214, | |
| "grad_norm": 6.5505528789866005, | |
| "learning_rate": 9.896152864006163e-07, | |
| "loss": 0.4117, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.4807903402854007, | |
| "grad_norm": 7.238329656366918, | |
| "learning_rate": 9.893549768417324e-07, | |
| "loss": 0.426, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.4829857299670692, | |
| "grad_norm": 9.018371675867607, | |
| "learning_rate": 9.89091480096817e-07, | |
| "loss": 0.4301, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.48518111964873767, | |
| "grad_norm": 6.873575031102029, | |
| "learning_rate": 9.888247978820336e-07, | |
| "loss": 0.4412, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.48737650933040616, | |
| "grad_norm": 7.71741695116787, | |
| "learning_rate": 9.88554931934293e-07, | |
| "loss": 0.401, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.48957189901207465, | |
| "grad_norm": 6.450074931709192, | |
| "learning_rate": 9.882818840112412e-07, | |
| "loss": 0.4453, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.49176728869374314, | |
| "grad_norm": 10.18694339380834, | |
| "learning_rate": 9.88005655891249e-07, | |
| "loss": 0.4499, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.49396267837541163, | |
| "grad_norm": 8.526454527609461, | |
| "learning_rate": 9.877262493734e-07, | |
| "loss": 0.3529, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.4961580680570801, | |
| "grad_norm": 7.8436191187666795, | |
| "learning_rate": 9.874436662774781e-07, | |
| "loss": 0.3862, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.4983534577387486, | |
| "grad_norm": 12.16130920239228, | |
| "learning_rate": 9.871579084439573e-07, | |
| "loss": 0.4494, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.5005488474204172, | |
| "grad_norm": 13.22347671983063, | |
| "learning_rate": 9.868689777339882e-07, | |
| "loss": 0.3777, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.5027442371020856, | |
| "grad_norm": 10.714878837498341, | |
| "learning_rate": 9.865768760293865e-07, | |
| "loss": 0.4392, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.5049396267837541, | |
| "grad_norm": 10.989931098186185, | |
| "learning_rate": 9.862816052326207e-07, | |
| "loss": 0.4342, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.5071350164654226, | |
| "grad_norm": 9.420235479586577, | |
| "learning_rate": 9.859831672668001e-07, | |
| "loss": 0.449, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.5093304061470911, | |
| "grad_norm": 7.8352789241534895, | |
| "learning_rate": 9.856815640756614e-07, | |
| "loss": 0.4066, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.5115257958287596, | |
| "grad_norm": 6.528093022715006, | |
| "learning_rate": 9.85376797623557e-07, | |
| "loss": 0.4076, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.5137211855104281, | |
| "grad_norm": 9.599510824703753, | |
| "learning_rate": 9.850688698954408e-07, | |
| "loss": 0.4656, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.5159165751920965, | |
| "grad_norm": 7.327102993121418, | |
| "learning_rate": 9.847577828968574e-07, | |
| "loss": 0.3928, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.5181119648737651, | |
| "grad_norm": 8.47025842420312, | |
| "learning_rate": 9.84443538653927e-07, | |
| "loss": 0.4459, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.5203073545554336, | |
| "grad_norm": 8.808300848288402, | |
| "learning_rate": 9.841261392133334e-07, | |
| "loss": 0.4699, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.5225027442371021, | |
| "grad_norm": 6.4472715749678535, | |
| "learning_rate": 9.838055866423101e-07, | |
| "loss": 0.3927, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.5246981339187706, | |
| "grad_norm": 8.71789150223208, | |
| "learning_rate": 9.834818830286274e-07, | |
| "loss": 0.4112, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.5268935236004391, | |
| "grad_norm": 6.520504611614625, | |
| "learning_rate": 9.83155030480578e-07, | |
| "loss": 0.4221, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.5290889132821076, | |
| "grad_norm": 7.822762966760966, | |
| "learning_rate": 9.82825031126964e-07, | |
| "loss": 0.4085, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.531284302963776, | |
| "grad_norm": 8.42042192814293, | |
| "learning_rate": 9.82491887117083e-07, | |
| "loss": 0.3344, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.5334796926454446, | |
| "grad_norm": 8.795849170631488, | |
| "learning_rate": 9.821556006207131e-07, | |
| "loss": 0.4706, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.535675082327113, | |
| "grad_norm": 11.237650950766433, | |
| "learning_rate": 9.818161738281003e-07, | |
| "loss": 0.3922, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.5378704720087816, | |
| "grad_norm": 8.79761175477303, | |
| "learning_rate": 9.81473608949943e-07, | |
| "loss": 0.3798, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.54006586169045, | |
| "grad_norm": 9.224818726295643, | |
| "learning_rate": 9.811279082173783e-07, | |
| "loss": 0.4045, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.5422612513721186, | |
| "grad_norm": 11.451420498389481, | |
| "learning_rate": 9.80779073881967e-07, | |
| "loss": 0.4767, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.544456641053787, | |
| "grad_norm": 14.204545837436886, | |
| "learning_rate": 9.804271082156792e-07, | |
| "loss": 0.5448, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.5466520307354555, | |
| "grad_norm": 8.162056177470248, | |
| "learning_rate": 9.800720135108798e-07, | |
| "loss": 0.4662, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.5488474204171241, | |
| "grad_norm": 5.928770792444553, | |
| "learning_rate": 9.79713792080313e-07, | |
| "loss": 0.3764, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5488474204171241, | |
| "eval_accuracy": 0.764, | |
| "eval_loss": 0.41615644097328186, | |
| "eval_runtime": 51.876, | |
| "eval_samples_per_second": 9.638, | |
| "eval_steps_per_second": 1.214, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5510428100987925, | |
| "grad_norm": 6.184202969509846, | |
| "learning_rate": 9.793524462570874e-07, | |
| "loss": 0.3895, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.5532381997804611, | |
| "grad_norm": 7.749574593570711, | |
| "learning_rate": 9.78987978394661e-07, | |
| "loss": 0.4515, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.5554335894621295, | |
| "grad_norm": 8.854833970278174, | |
| "learning_rate": 9.786203908668255e-07, | |
| "loss": 0.4269, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.557628979143798, | |
| "grad_norm": 6.18988995877324, | |
| "learning_rate": 9.78249686067691e-07, | |
| "loss": 0.4204, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.5598243688254665, | |
| "grad_norm": 6.0975770712127, | |
| "learning_rate": 9.778758664116717e-07, | |
| "loss": 0.4312, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.562019758507135, | |
| "grad_norm": 7.860127371723332, | |
| "learning_rate": 9.774989343334675e-07, | |
| "loss": 0.4056, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.5642151481888035, | |
| "grad_norm": 6.287357081202248, | |
| "learning_rate": 9.771188922880501e-07, | |
| "loss": 0.431, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.566410537870472, | |
| "grad_norm": 7.482025843040615, | |
| "learning_rate": 9.76735742750647e-07, | |
| "loss": 0.4237, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.5686059275521405, | |
| "grad_norm": 6.626343732308425, | |
| "learning_rate": 9.763494882167238e-07, | |
| "loss": 0.4258, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.570801317233809, | |
| "grad_norm": 6.379142997039637, | |
| "learning_rate": 9.759601312019705e-07, | |
| "loss": 0.4496, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.5729967069154775, | |
| "grad_norm": 7.786859033495975, | |
| "learning_rate": 9.755676742422824e-07, | |
| "loss": 0.3697, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.575192096597146, | |
| "grad_norm": 7.474508144431387, | |
| "learning_rate": 9.751721198937457e-07, | |
| "loss": 0.3805, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.5773874862788145, | |
| "grad_norm": 5.797750979716916, | |
| "learning_rate": 9.747734707326194e-07, | |
| "loss": 0.3832, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.579582875960483, | |
| "grad_norm": 10.801633788158538, | |
| "learning_rate": 9.743717293553197e-07, | |
| "loss": 0.3489, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.5817782656421515, | |
| "grad_norm": 8.81184130305926, | |
| "learning_rate": 9.73966898378402e-07, | |
| "loss": 0.3933, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.58397365532382, | |
| "grad_norm": 10.99886571046786, | |
| "learning_rate": 9.735589804385445e-07, | |
| "loss": 0.4582, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.5861690450054885, | |
| "grad_norm": 8.989645189656823, | |
| "learning_rate": 9.731479781925308e-07, | |
| "loss": 0.39, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.5883644346871569, | |
| "grad_norm": 9.04325471249987, | |
| "learning_rate": 9.727338943172335e-07, | |
| "loss": 0.402, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.5905598243688255, | |
| "grad_norm": 8.32075801154753, | |
| "learning_rate": 9.723167315095947e-07, | |
| "loss": 0.3896, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.5927552140504939, | |
| "grad_norm": 11.735779106191535, | |
| "learning_rate": 9.718964924866108e-07, | |
| "loss": 0.4977, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.5949506037321625, | |
| "grad_norm": 6.9875015531767515, | |
| "learning_rate": 9.71473179985313e-07, | |
| "loss": 0.4142, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.5971459934138309, | |
| "grad_norm": 7.256558656488309, | |
| "learning_rate": 9.710467967627502e-07, | |
| "loss": 0.4729, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.5993413830954994, | |
| "grad_norm": 6.63170478635617, | |
| "learning_rate": 9.706173455959713e-07, | |
| "loss": 0.3926, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.601536772777168, | |
| "grad_norm": 7.736941306994274, | |
| "learning_rate": 9.701848292820069e-07, | |
| "loss": 0.4269, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.6037321624588364, | |
| "grad_norm": 7.141227825990968, | |
| "learning_rate": 9.697492506378507e-07, | |
| "loss": 0.4746, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.605927552140505, | |
| "grad_norm": 6.175085064537569, | |
| "learning_rate": 9.693106125004416e-07, | |
| "loss": 0.3916, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.6081229418221734, | |
| "grad_norm": 8.496732166784003, | |
| "learning_rate": 9.688689177266452e-07, | |
| "loss": 0.4195, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.610318331503842, | |
| "grad_norm": 5.940495748803936, | |
| "learning_rate": 9.684241691932347e-07, | |
| "loss": 0.4218, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.6125137211855104, | |
| "grad_norm": 6.794809207027829, | |
| "learning_rate": 9.679763697968732e-07, | |
| "loss": 0.4064, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.6147091108671789, | |
| "grad_norm": 7.232988324397075, | |
| "learning_rate": 9.675255224540934e-07, | |
| "loss": 0.3699, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.6169045005488474, | |
| "grad_norm": 7.07939070544042, | |
| "learning_rate": 9.6707163010128e-07, | |
| "loss": 0.4168, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.6190998902305159, | |
| "grad_norm": 8.718610382883304, | |
| "learning_rate": 9.666146956946496e-07, | |
| "loss": 0.4097, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.6212952799121844, | |
| "grad_norm": 9.093471494946654, | |
| "learning_rate": 9.661547222102321e-07, | |
| "loss": 0.4252, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.6234906695938529, | |
| "grad_norm": 13.940230416519052, | |
| "learning_rate": 9.656917126438508e-07, | |
| "loss": 0.4567, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.6256860592755215, | |
| "grad_norm": 8.779803873571954, | |
| "learning_rate": 9.65225670011103e-07, | |
| "loss": 0.383, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.6278814489571899, | |
| "grad_norm": 7.5708107372450035, | |
| "learning_rate": 9.647565973473407e-07, | |
| "loss": 0.4124, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.6300768386388584, | |
| "grad_norm": 13.174216612702878, | |
| "learning_rate": 9.642844977076507e-07, | |
| "loss": 0.4502, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.6322722283205269, | |
| "grad_norm": 8.49295896068912, | |
| "learning_rate": 9.63809374166834e-07, | |
| "loss": 0.4369, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.6344676180021954, | |
| "grad_norm": 9.345508533381187, | |
| "learning_rate": 9.633312298193871e-07, | |
| "loss": 0.4625, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.6366630076838639, | |
| "grad_norm": 9.696466925170553, | |
| "learning_rate": 9.62850067779481e-07, | |
| "loss": 0.4442, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.6388583973655324, | |
| "grad_norm": 8.896625991218926, | |
| "learning_rate": 9.623658911809404e-07, | |
| "loss": 0.3991, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.6410537870472008, | |
| "grad_norm": 9.525265923938584, | |
| "learning_rate": 9.618787031772245e-07, | |
| "loss": 0.3849, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.6432491767288694, | |
| "grad_norm": 5.576065012637179, | |
| "learning_rate": 9.61388506941406e-07, | |
| "loss": 0.4233, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.6454445664105378, | |
| "grad_norm": 7.207790760174152, | |
| "learning_rate": 9.6089530566615e-07, | |
| "loss": 0.3935, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.6476399560922064, | |
| "grad_norm": 6.129075752774988, | |
| "learning_rate": 9.603991025636933e-07, | |
| "loss": 0.3904, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.6498353457738749, | |
| "grad_norm": 7.223439076532464, | |
| "learning_rate": 9.598999008658241e-07, | |
| "loss": 0.3842, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.6520307354555434, | |
| "grad_norm": 8.143383508497742, | |
| "learning_rate": 9.59397703823861e-07, | |
| "loss": 0.4269, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.6542261251372119, | |
| "grad_norm": 9.02225167943478, | |
| "learning_rate": 9.588925147086303e-07, | |
| "loss": 0.3878, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.6564215148188803, | |
| "grad_norm": 7.737039238378963, | |
| "learning_rate": 9.583843368104464e-07, | |
| "loss": 0.3508, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.6586169045005489, | |
| "grad_norm": 6.743123024533224, | |
| "learning_rate": 9.578731734390898e-07, | |
| "loss": 0.3808, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6586169045005489, | |
| "eval_accuracy": 0.77, | |
| "eval_loss": 0.41097283363342285, | |
| "eval_runtime": 52.0828, | |
| "eval_samples_per_second": 9.6, | |
| "eval_steps_per_second": 1.21, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6608122941822173, | |
| "grad_norm": 7.137538658423445, | |
| "learning_rate": 9.573590279237854e-07, | |
| "loss": 0.3938, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.6630076838638859, | |
| "grad_norm": 6.92894058529831, | |
| "learning_rate": 9.568419036131807e-07, | |
| "loss": 0.3729, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.6652030735455543, | |
| "grad_norm": 8.370178365332173, | |
| "learning_rate": 9.563218038753245e-07, | |
| "loss": 0.4004, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.6673984632272228, | |
| "grad_norm": 14.386262804761378, | |
| "learning_rate": 9.557987320976446e-07, | |
| "loss": 0.4211, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.6695938529088913, | |
| "grad_norm": 7.7875110721670975, | |
| "learning_rate": 9.552726916869254e-07, | |
| "loss": 0.388, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.6717892425905598, | |
| "grad_norm": 8.49964853234254, | |
| "learning_rate": 9.547436860692869e-07, | |
| "loss": 0.4369, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.6739846322722283, | |
| "grad_norm": 6.5964314472495404, | |
| "learning_rate": 9.542117186901608e-07, | |
| "loss": 0.3423, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.6761800219538968, | |
| "grad_norm": 7.6505759115431085, | |
| "learning_rate": 9.536767930142692e-07, | |
| "loss": 0.3602, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.6783754116355654, | |
| "grad_norm": 7.556175808157708, | |
| "learning_rate": 9.53138912525602e-07, | |
| "loss": 0.4385, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.6805708013172338, | |
| "grad_norm": 9.165395898058962, | |
| "learning_rate": 9.525980807273933e-07, | |
| "loss": 0.4516, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.6827661909989023, | |
| "grad_norm": 7.637449593962663, | |
| "learning_rate": 9.520543011420994e-07, | |
| "loss": 0.4289, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.6849615806805708, | |
| "grad_norm": 9.18857916606884, | |
| "learning_rate": 9.515075773113758e-07, | |
| "loss": 0.3785, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.6871569703622393, | |
| "grad_norm": 7.821219676613859, | |
| "learning_rate": 9.509579127960541e-07, | |
| "loss": 0.4158, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.6893523600439078, | |
| "grad_norm": 6.571343391685928, | |
| "learning_rate": 9.504053111761183e-07, | |
| "loss": 0.3609, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.6915477497255763, | |
| "grad_norm": 6.225595749149676, | |
| "learning_rate": 9.498497760506819e-07, | |
| "loss": 0.4359, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.6937431394072447, | |
| "grad_norm": 6.950774628316192, | |
| "learning_rate": 9.492913110379647e-07, | |
| "loss": 0.4417, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.6959385290889133, | |
| "grad_norm": 5.612528566630172, | |
| "learning_rate": 9.487299197752687e-07, | |
| "loss": 0.4052, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.6981339187705817, | |
| "grad_norm": 8.404168783130382, | |
| "learning_rate": 9.481656059189549e-07, | |
| "loss": 0.4162, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.7003293084522503, | |
| "grad_norm": 8.031865429035927, | |
| "learning_rate": 9.475983731444191e-07, | |
| "loss": 0.4125, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.7025246981339188, | |
| "grad_norm": 10.321047597755841, | |
| "learning_rate": 9.47028225146068e-07, | |
| "loss": 0.4505, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.7047200878155873, | |
| "grad_norm": 7.740999393168428, | |
| "learning_rate": 9.464551656372955e-07, | |
| "loss": 0.3765, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.7069154774972558, | |
| "grad_norm": 8.215346652475102, | |
| "learning_rate": 9.458791983504581e-07, | |
| "loss": 0.401, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.7091108671789242, | |
| "grad_norm": 7.4542945597526336, | |
| "learning_rate": 9.453003270368509e-07, | |
| "loss": 0.4796, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.7113062568605928, | |
| "grad_norm": 7.054708809148231, | |
| "learning_rate": 9.44718555466683e-07, | |
| "loss": 0.4497, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.7135016465422612, | |
| "grad_norm": 7.460280928904919, | |
| "learning_rate": 9.44133887429053e-07, | |
| "loss": 0.3926, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.7156970362239298, | |
| "grad_norm": 8.265245053890029, | |
| "learning_rate": 9.435463267319239e-07, | |
| "loss": 0.3805, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.7178924259055982, | |
| "grad_norm": 7.800641885443644, | |
| "learning_rate": 9.429558772020992e-07, | |
| "loss": 0.4424, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.7200878155872668, | |
| "grad_norm": 8.978144769546304, | |
| "learning_rate": 9.423625426851973e-07, | |
| "loss": 0.4159, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.7222832052689352, | |
| "grad_norm": 6.867455347578303, | |
| "learning_rate": 9.417663270456267e-07, | |
| "loss": 0.402, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.7244785949506037, | |
| "grad_norm": 7.15816324078702, | |
| "learning_rate": 9.411672341665604e-07, | |
| "loss": 0.3814, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.7266739846322722, | |
| "grad_norm": 7.444204700173956, | |
| "learning_rate": 9.405652679499115e-07, | |
| "loss": 0.3782, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.7288693743139407, | |
| "grad_norm": 6.789459839990034, | |
| "learning_rate": 9.399604323163068e-07, | |
| "loss": 0.3484, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.7310647639956093, | |
| "grad_norm": 14.478524238054472, | |
| "learning_rate": 9.393527312050617e-07, | |
| "loss": 0.464, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.7332601536772777, | |
| "grad_norm": 20.668512412883626, | |
| "learning_rate": 9.387421685741552e-07, | |
| "loss": 0.4503, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.7354555433589463, | |
| "grad_norm": 12.197800422910193, | |
| "learning_rate": 9.381287484002027e-07, | |
| "loss": 0.4281, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.7376509330406147, | |
| "grad_norm": 6.445277090460248, | |
| "learning_rate": 9.375124746784311e-07, | |
| "loss": 0.3648, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.7398463227222832, | |
| "grad_norm": 11.96859661106773, | |
| "learning_rate": 9.368933514226529e-07, | |
| "loss": 0.4608, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.7420417124039517, | |
| "grad_norm": 5.939377322140176, | |
| "learning_rate": 9.362713826652392e-07, | |
| "loss": 0.4006, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.7442371020856202, | |
| "grad_norm": 8.561222995237147, | |
| "learning_rate": 9.356465724570943e-07, | |
| "loss": 0.4163, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.7464324917672887, | |
| "grad_norm": 6.033330356983505, | |
| "learning_rate": 9.350189248676292e-07, | |
| "loss": 0.4144, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.7486278814489572, | |
| "grad_norm": 6.205787168601971, | |
| "learning_rate": 9.34388443984734e-07, | |
| "loss": 0.4258, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.7508232711306256, | |
| "grad_norm": 7.631254905556204, | |
| "learning_rate": 9.33755133914753e-07, | |
| "loss": 0.4065, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.7530186608122942, | |
| "grad_norm": 6.32445637747235, | |
| "learning_rate": 9.331189987824568e-07, | |
| "loss": 0.4389, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.7552140504939627, | |
| "grad_norm": 5.65488048442344, | |
| "learning_rate": 9.324800427310155e-07, | |
| "loss": 0.4363, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.7574094401756312, | |
| "grad_norm": 6.166240836463136, | |
| "learning_rate": 9.318382699219722e-07, | |
| "loss": 0.4091, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.7596048298572997, | |
| "grad_norm": 7.489797283268898, | |
| "learning_rate": 9.311936845352157e-07, | |
| "loss": 0.3923, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.7618002195389681, | |
| "grad_norm": 7.098266537437995, | |
| "learning_rate": 9.305462907689532e-07, | |
| "loss": 0.4363, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.7639956092206367, | |
| "grad_norm": 7.518882790839629, | |
| "learning_rate": 9.298960928396826e-07, | |
| "loss": 0.3989, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.7661909989023051, | |
| "grad_norm": 7.535014992332507, | |
| "learning_rate": 9.292430949821659e-07, | |
| "loss": 0.3939, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.7683863885839737, | |
| "grad_norm": 7.793847428801205, | |
| "learning_rate": 9.285873014494008e-07, | |
| "loss": 0.4238, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.7683863885839737, | |
| "eval_accuracy": 0.78, | |
| "eval_loss": 0.3999713063240051, | |
| "eval_runtime": 52.2328, | |
| "eval_samples_per_second": 9.573, | |
| "eval_steps_per_second": 1.206, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.7705817782656421, | |
| "grad_norm": 6.9078346633967405, | |
| "learning_rate": 9.279287165125936e-07, | |
| "loss": 0.3417, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.7727771679473107, | |
| "grad_norm": 8.452074250696336, | |
| "learning_rate": 9.272673444611308e-07, | |
| "loss": 0.426, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.7749725576289791, | |
| "grad_norm": 6.7224178173856215, | |
| "learning_rate": 9.266031896025516e-07, | |
| "loss": 0.421, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.7771679473106476, | |
| "grad_norm": 8.735363673640215, | |
| "learning_rate": 9.259362562625199e-07, | |
| "loss": 0.3596, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.7793633369923162, | |
| "grad_norm": 7.949892246537311, | |
| "learning_rate": 9.252665487847957e-07, | |
| "loss": 0.3922, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.7815587266739846, | |
| "grad_norm": 12.194367861210461, | |
| "learning_rate": 9.245940715312074e-07, | |
| "loss": 0.4538, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.7837541163556532, | |
| "grad_norm": 10.45987571002838, | |
| "learning_rate": 9.239188288816226e-07, | |
| "loss": 0.4928, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.7859495060373216, | |
| "grad_norm": 6.764374108633695, | |
| "learning_rate": 9.232408252339201e-07, | |
| "loss": 0.4226, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.7881448957189902, | |
| "grad_norm": 6.394843611556007, | |
| "learning_rate": 9.225600650039615e-07, | |
| "loss": 0.4096, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.7903402854006586, | |
| "grad_norm": 6.971937381598142, | |
| "learning_rate": 9.218765526255619e-07, | |
| "loss": 0.4162, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.7925356750823271, | |
| "grad_norm": 5.9443940673499185, | |
| "learning_rate": 9.211902925504613e-07, | |
| "loss": 0.4161, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.7947310647639956, | |
| "grad_norm": 5.483471181116708, | |
| "learning_rate": 9.205012892482952e-07, | |
| "loss": 0.3627, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.7969264544456641, | |
| "grad_norm": 6.649198037257637, | |
| "learning_rate": 9.198095472065667e-07, | |
| "loss": 0.445, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.7991218441273326, | |
| "grad_norm": 7.566125395640005, | |
| "learning_rate": 9.191150709306155e-07, | |
| "loss": 0.4352, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.8013172338090011, | |
| "grad_norm": 5.877051153754724, | |
| "learning_rate": 9.184178649435896e-07, | |
| "loss": 0.3855, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.8035126234906695, | |
| "grad_norm": 6.82408167677956, | |
| "learning_rate": 9.177179337864163e-07, | |
| "loss": 0.44, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.8057080131723381, | |
| "grad_norm": 5.76805550171372, | |
| "learning_rate": 9.170152820177714e-07, | |
| "loss": 0.3722, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.8079034028540066, | |
| "grad_norm": 5.37322049457271, | |
| "learning_rate": 9.163099142140505e-07, | |
| "loss": 0.39, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.8100987925356751, | |
| "grad_norm": 6.652734325876614, | |
| "learning_rate": 9.156018349693386e-07, | |
| "loss": 0.33, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.8122941822173436, | |
| "grad_norm": 8.078887258485842, | |
| "learning_rate": 9.148910488953807e-07, | |
| "loss": 0.4495, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.814489571899012, | |
| "grad_norm": 6.057501262886995, | |
| "learning_rate": 9.141775606215512e-07, | |
| "loss": 0.3793, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.8166849615806806, | |
| "grad_norm": 8.349159344050063, | |
| "learning_rate": 9.134613747948238e-07, | |
| "loss": 0.4165, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.818880351262349, | |
| "grad_norm": 6.093265741529057, | |
| "learning_rate": 9.127424960797423e-07, | |
| "loss": 0.3683, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.8210757409440176, | |
| "grad_norm": 7.626403918604381, | |
| "learning_rate": 9.120209291583885e-07, | |
| "loss": 0.4478, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.823271130625686, | |
| "grad_norm": 6.286457043534019, | |
| "learning_rate": 9.11296678730353e-07, | |
| "loss": 0.4432, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.8254665203073546, | |
| "grad_norm": 6.1102728691763195, | |
| "learning_rate": 9.10569749512704e-07, | |
| "loss": 0.4408, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.827661909989023, | |
| "grad_norm": 5.8435296710540765, | |
| "learning_rate": 9.098401462399572e-07, | |
| "loss": 0.3889, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.8298572996706916, | |
| "grad_norm": 6.8221898822791776, | |
| "learning_rate": 9.091078736640438e-07, | |
| "loss": 0.3924, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.8320526893523601, | |
| "grad_norm": 5.449250972840727, | |
| "learning_rate": 9.083729365542807e-07, | |
| "loss": 0.4544, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.8342480790340285, | |
| "grad_norm": 6.531068732007626, | |
| "learning_rate": 9.076353396973391e-07, | |
| "loss": 0.3824, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.8364434687156971, | |
| "grad_norm": 6.253901506150054, | |
| "learning_rate": 9.068950878972128e-07, | |
| "loss": 0.393, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.8386388583973655, | |
| "grad_norm": 6.104772949983095, | |
| "learning_rate": 9.06152185975188e-07, | |
| "loss": 0.3857, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.8408342480790341, | |
| "grad_norm": 5.459509353786465, | |
| "learning_rate": 9.054066387698103e-07, | |
| "loss": 0.3748, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.8430296377607025, | |
| "grad_norm": 6.221629121824156, | |
| "learning_rate": 9.04658451136855e-07, | |
| "loss": 0.3869, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.845225027442371, | |
| "grad_norm": 11.117787770421263, | |
| "learning_rate": 9.039076279492938e-07, | |
| "loss": 0.428, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.8474204171240395, | |
| "grad_norm": 7.6402881827025695, | |
| "learning_rate": 9.03154174097265e-07, | |
| "loss": 0.3685, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.849615806805708, | |
| "grad_norm": 8.329862332560323, | |
| "learning_rate": 9.023980944880395e-07, | |
| "loss": 0.3888, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.8518111964873765, | |
| "grad_norm": 9.379807197876737, | |
| "learning_rate": 9.016393940459901e-07, | |
| "loss": 0.3228, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.854006586169045, | |
| "grad_norm": 9.497050528776587, | |
| "learning_rate": 9.008780777125592e-07, | |
| "loss": 0.3643, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.8562019758507134, | |
| "grad_norm": 11.317540635350868, | |
| "learning_rate": 9.001141504462267e-07, | |
| "loss": 0.3849, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.858397365532382, | |
| "grad_norm": 9.649076333363197, | |
| "learning_rate": 8.993476172224776e-07, | |
| "loss": 0.4216, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.8605927552140505, | |
| "grad_norm": 5.870968468017432, | |
| "learning_rate": 8.985784830337694e-07, | |
| "loss": 0.3512, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.862788144895719, | |
| "grad_norm": 6.63746206403619, | |
| "learning_rate": 8.978067528895001e-07, | |
| "loss": 0.4034, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.8649835345773875, | |
| "grad_norm": 7.9614797293095805, | |
| "learning_rate": 8.970324318159747e-07, | |
| "loss": 0.4374, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.867178924259056, | |
| "grad_norm": 6.6259673033312305, | |
| "learning_rate": 8.962555248563737e-07, | |
| "loss": 0.4034, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.8693743139407245, | |
| "grad_norm": 5.874087728977652, | |
| "learning_rate": 8.95476037070719e-07, | |
| "loss": 0.4381, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.8715697036223929, | |
| "grad_norm": 5.374289997565401, | |
| "learning_rate": 8.94693973535842e-07, | |
| "loss": 0.4015, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.8737650933040615, | |
| "grad_norm": 5.135571917512859, | |
| "learning_rate": 8.939093393453494e-07, | |
| "loss": 0.3764, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.8759604829857299, | |
| "grad_norm": 6.320197335683808, | |
| "learning_rate": 8.931221396095914e-07, | |
| "loss": 0.3573, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.8781558726673985, | |
| "grad_norm": 5.934124946312254, | |
| "learning_rate": 8.92332379455627e-07, | |
| "loss": 0.3985, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.8781558726673985, | |
| "eval_accuracy": 0.78, | |
| "eval_loss": 0.3908381462097168, | |
| "eval_runtime": 52.2922, | |
| "eval_samples_per_second": 9.562, | |
| "eval_steps_per_second": 1.205, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.8803512623490669, | |
| "grad_norm": 5.227662807462205, | |
| "learning_rate": 8.91540064027192e-07, | |
| "loss": 0.3802, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.8825466520307355, | |
| "grad_norm": 5.4312238244768105, | |
| "learning_rate": 8.907451984846642e-07, | |
| "loss": 0.357, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.884742041712404, | |
| "grad_norm": 6.5227729333700255, | |
| "learning_rate": 8.899477880050305e-07, | |
| "loss": 0.4599, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.8869374313940724, | |
| "grad_norm": 6.320159868176597, | |
| "learning_rate": 8.891478377818533e-07, | |
| "loss": 0.3456, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.889132821075741, | |
| "grad_norm": 6.595230869297455, | |
| "learning_rate": 8.883453530252363e-07, | |
| "loss": 0.3385, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.8913282107574094, | |
| "grad_norm": 8.372111940632312, | |
| "learning_rate": 8.875403389617909e-07, | |
| "loss": 0.4228, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.893523600439078, | |
| "grad_norm": 7.273966750571457, | |
| "learning_rate": 8.867328008346012e-07, | |
| "loss": 0.378, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.8957189901207464, | |
| "grad_norm": 9.685872652254469, | |
| "learning_rate": 8.859227439031917e-07, | |
| "loss": 0.4191, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.897914379802415, | |
| "grad_norm": 9.466266479949322, | |
| "learning_rate": 8.851101734434916e-07, | |
| "loss": 0.3949, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.9001097694840834, | |
| "grad_norm": 8.975473353994817, | |
| "learning_rate": 8.842950947478001e-07, | |
| "loss": 0.4137, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.9023051591657519, | |
| "grad_norm": 10.051513089446058, | |
| "learning_rate": 8.834775131247534e-07, | |
| "loss": 0.4055, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.9045005488474204, | |
| "grad_norm": 7.417826600936023, | |
| "learning_rate": 8.826574338992893e-07, | |
| "loss": 0.4014, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.9066959385290889, | |
| "grad_norm": 7.7383905505396475, | |
| "learning_rate": 8.818348624126122e-07, | |
| "loss": 0.3484, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.9088913282107574, | |
| "grad_norm": 7.082975830005012, | |
| "learning_rate": 8.810098040221588e-07, | |
| "loss": 0.3709, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.9110867178924259, | |
| "grad_norm": 6.727376213025829, | |
| "learning_rate": 8.801822641015635e-07, | |
| "loss": 0.3843, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.9132821075740944, | |
| "grad_norm": 7.275555271868482, | |
| "learning_rate": 8.793522480406223e-07, | |
| "loss": 0.356, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.9154774972557629, | |
| "grad_norm": 5.944537715719952, | |
| "learning_rate": 8.785197612452591e-07, | |
| "loss": 0.4293, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.9176728869374314, | |
| "grad_norm": 6.472465668641822, | |
| "learning_rate": 8.776848091374892e-07, | |
| "loss": 0.353, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.9198682766190999, | |
| "grad_norm": 7.887473362612184, | |
| "learning_rate": 8.768473971553847e-07, | |
| "loss": 0.417, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.9220636663007684, | |
| "grad_norm": 5.72620013047636, | |
| "learning_rate": 8.760075307530392e-07, | |
| "loss": 0.3725, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.9242590559824369, | |
| "grad_norm": 6.518816397700794, | |
| "learning_rate": 8.75165215400532e-07, | |
| "loss": 0.3819, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.9264544456641054, | |
| "grad_norm": 7.2515922225795695, | |
| "learning_rate": 8.743204565838922e-07, | |
| "loss": 0.4082, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.9286498353457738, | |
| "grad_norm": 6.156643476702157, | |
| "learning_rate": 8.734732598050636e-07, | |
| "loss": 0.4111, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.9308452250274424, | |
| "grad_norm": 6.1131283071130635, | |
| "learning_rate": 8.726236305818681e-07, | |
| "loss": 0.3849, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.9330406147091108, | |
| "grad_norm": 8.044334742394684, | |
| "learning_rate": 8.717715744479706e-07, | |
| "loss": 0.4006, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.9352360043907794, | |
| "grad_norm": 6.327277854929185, | |
| "learning_rate": 8.709170969528425e-07, | |
| "loss": 0.4525, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.9374313940724479, | |
| "grad_norm": 6.919003187239627, | |
| "learning_rate": 8.700602036617253e-07, | |
| "loss": 0.3976, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.9396267837541163, | |
| "grad_norm": 6.750362117722639, | |
| "learning_rate": 8.692009001555951e-07, | |
| "loss": 0.4214, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.9418221734357849, | |
| "grad_norm": 5.309527010131869, | |
| "learning_rate": 8.683391920311256e-07, | |
| "loss": 0.3837, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.9440175631174533, | |
| "grad_norm": 5.756043416124202, | |
| "learning_rate": 8.674750849006518e-07, | |
| "loss": 0.3748, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.9462129527991219, | |
| "grad_norm": 5.947118773957151, | |
| "learning_rate": 8.666085843921337e-07, | |
| "loss": 0.3656, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.9484083424807903, | |
| "grad_norm": 6.803175272777888, | |
| "learning_rate": 8.65739696149119e-07, | |
| "loss": 0.3871, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.9506037321624589, | |
| "grad_norm": 11.872613980622894, | |
| "learning_rate": 8.648684258307075e-07, | |
| "loss": 0.4402, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.9527991218441273, | |
| "grad_norm": 11.845458798682339, | |
| "learning_rate": 8.639947791115131e-07, | |
| "loss": 0.398, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.9549945115257958, | |
| "grad_norm": 7.885958732125603, | |
| "learning_rate": 8.631187616816271e-07, | |
| "loss": 0.3649, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.9571899012074643, | |
| "grad_norm": 7.087463703257775, | |
| "learning_rate": 8.622403792465819e-07, | |
| "loss": 0.3938, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.9593852908891328, | |
| "grad_norm": 6.677319825795207, | |
| "learning_rate": 8.613596375273127e-07, | |
| "loss": 0.379, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.9615806805708014, | |
| "grad_norm": 5.657521052821894, | |
| "learning_rate": 8.604765422601213e-07, | |
| "loss": 0.3482, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.9637760702524698, | |
| "grad_norm": 5.630660484188195, | |
| "learning_rate": 8.595910991966375e-07, | |
| "loss": 0.4039, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.9659714599341384, | |
| "grad_norm": 6.274352234189354, | |
| "learning_rate": 8.587033141037833e-07, | |
| "loss": 0.3926, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.9681668496158068, | |
| "grad_norm": 6.263591446228187, | |
| "learning_rate": 8.578131927637339e-07, | |
| "loss": 0.3528, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.9703622392974753, | |
| "grad_norm": 7.115149701513277, | |
| "learning_rate": 8.569207409738804e-07, | |
| "loss": 0.3812, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.9725576289791438, | |
| "grad_norm": 6.770204900870094, | |
| "learning_rate": 8.560259645467927e-07, | |
| "loss": 0.3994, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.9747530186608123, | |
| "grad_norm": 6.36111561811909, | |
| "learning_rate": 8.551288693101808e-07, | |
| "loss": 0.3578, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.9769484083424808, | |
| "grad_norm": 5.460698588059828, | |
| "learning_rate": 8.542294611068573e-07, | |
| "loss": 0.357, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.9791437980241493, | |
| "grad_norm": 7.059061442819567, | |
| "learning_rate": 8.533277457946988e-07, | |
| "loss": 0.3943, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.9813391877058177, | |
| "grad_norm": 5.455920326056064, | |
| "learning_rate": 8.524237292466092e-07, | |
| "loss": 0.3498, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.9835345773874863, | |
| "grad_norm": 8.145756413409373, | |
| "learning_rate": 8.515174173504795e-07, | |
| "loss": 0.4005, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.9857299670691547, | |
| "grad_norm": 6.554023882646871, | |
| "learning_rate": 8.506088160091506e-07, | |
| "loss": 0.4014, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.9879253567508233, | |
| "grad_norm": 7.592409507167033, | |
| "learning_rate": 8.49697931140375e-07, | |
| "loss": 0.3976, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.9879253567508233, | |
| "eval_accuracy": 0.776, | |
| "eval_loss": 0.38674676418304443, | |
| "eval_runtime": 52.2498, | |
| "eval_samples_per_second": 9.569, | |
| "eval_steps_per_second": 1.206, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.9901207464324918, | |
| "grad_norm": 6.445935731601792, | |
| "learning_rate": 8.487847686767771e-07, | |
| "loss": 0.3376, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.9923161361141603, | |
| "grad_norm": 7.661577003586348, | |
| "learning_rate": 8.478693345658165e-07, | |
| "loss": 0.3673, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.9945115257958288, | |
| "grad_norm": 6.07812754087199, | |
| "learning_rate": 8.469516347697472e-07, | |
| "loss": 0.3901, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.9967069154774972, | |
| "grad_norm": 6.985188252913582, | |
| "learning_rate": 8.460316752655798e-07, | |
| "loss": 0.3532, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.9989023051591658, | |
| "grad_norm": 6.960412464204759, | |
| "learning_rate": 8.451094620450431e-07, | |
| "loss": 0.3584, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 6.960412464204759, | |
| "learning_rate": 8.441850011145435e-07, | |
| "loss": 0.3955, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 1.0021953896816684, | |
| "grad_norm": 10.21658360278486, | |
| "learning_rate": 8.432582984951276e-07, | |
| "loss": 0.3445, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 1.004390779363337, | |
| "grad_norm": 5.546577685721332, | |
| "learning_rate": 8.423293602224417e-07, | |
| "loss": 0.4039, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 1.0065861690450055, | |
| "grad_norm": 5.367108573957618, | |
| "learning_rate": 8.413981923466932e-07, | |
| "loss": 0.4015, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 1.008781558726674, | |
| "grad_norm": 7.677188675606839, | |
| "learning_rate": 8.404648009326111e-07, | |
| "loss": 0.4446, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.0109769484083424, | |
| "grad_norm": 5.110493371628498, | |
| "learning_rate": 8.395291920594061e-07, | |
| "loss": 0.3263, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 1.013172338090011, | |
| "grad_norm": 5.952324335313491, | |
| "learning_rate": 8.385913718207313e-07, | |
| "loss": 0.3865, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 1.0153677277716795, | |
| "grad_norm": 6.4399342076113495, | |
| "learning_rate": 8.376513463246429e-07, | |
| "loss": 0.3821, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 1.017563117453348, | |
| "grad_norm": 5.698634912941117, | |
| "learning_rate": 8.367091216935596e-07, | |
| "loss": 0.4065, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 1.0197585071350164, | |
| "grad_norm": 6.689636508179602, | |
| "learning_rate": 8.357647040642231e-07, | |
| "loss": 0.3466, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.021953896816685, | |
| "grad_norm": 5.443917132674302, | |
| "learning_rate": 8.348180995876587e-07, | |
| "loss": 0.3785, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 1.0241492864983535, | |
| "grad_norm": 5.443111782189951, | |
| "learning_rate": 8.338693144291342e-07, | |
| "loss": 0.3985, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 1.026344676180022, | |
| "grad_norm": 5.887332270970953, | |
| "learning_rate": 8.329183547681205e-07, | |
| "loss": 0.3742, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 1.0285400658616906, | |
| "grad_norm": 6.187509225148951, | |
| "learning_rate": 8.319652267982508e-07, | |
| "loss": 0.3716, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 1.030735455543359, | |
| "grad_norm": 6.121985990165869, | |
| "learning_rate": 8.310099367272812e-07, | |
| "loss": 0.429, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.0329308452250274, | |
| "grad_norm": 11.523110274065871, | |
| "learning_rate": 8.30052490777049e-07, | |
| "loss": 0.4404, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 1.0351262349066959, | |
| "grad_norm": 5.907917517927115, | |
| "learning_rate": 8.29092895183433e-07, | |
| "loss": 0.3681, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 1.0373216245883645, | |
| "grad_norm": 5.901926465788518, | |
| "learning_rate": 8.281311561963129e-07, | |
| "loss": 0.3975, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 1.039517014270033, | |
| "grad_norm": 5.906068428010956, | |
| "learning_rate": 8.271672800795284e-07, | |
| "loss": 0.3665, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 1.0417124039517014, | |
| "grad_norm": 5.294577975700778, | |
| "learning_rate": 8.26201273110838e-07, | |
| "loss": 0.4487, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 1.0439077936333698, | |
| "grad_norm": 5.978237084104336, | |
| "learning_rate": 8.252331415818788e-07, | |
| "loss": 0.333, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 1.0461031833150385, | |
| "grad_norm": 5.498099104199104, | |
| "learning_rate": 8.242628917981253e-07, | |
| "loss": 0.3685, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 1.048298572996707, | |
| "grad_norm": 6.789539587285106, | |
| "learning_rate": 8.232905300788484e-07, | |
| "loss": 0.3657, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 1.0504939626783754, | |
| "grad_norm": 7.957958255256437, | |
| "learning_rate": 8.223160627570736e-07, | |
| "loss": 0.3326, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 1.0526893523600438, | |
| "grad_norm": 5.600567494048053, | |
| "learning_rate": 8.213394961795406e-07, | |
| "loss": 0.3681, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.0548847420417125, | |
| "grad_norm": 7.539054348927877, | |
| "learning_rate": 8.203608367066615e-07, | |
| "loss": 0.4324, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 1.057080131723381, | |
| "grad_norm": 8.903288366040119, | |
| "learning_rate": 8.193800907124798e-07, | |
| "loss": 0.4113, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 1.0592755214050493, | |
| "grad_norm": 8.111853302663592, | |
| "learning_rate": 8.183972645846282e-07, | |
| "loss": 0.4387, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 1.061470911086718, | |
| "grad_norm": 9.450203257580068, | |
| "learning_rate": 8.174123647242877e-07, | |
| "loss": 0.3703, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 1.0636663007683864, | |
| "grad_norm": 7.7233909149018025, | |
| "learning_rate": 8.164253975461453e-07, | |
| "loss": 0.3967, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 1.0658616904500549, | |
| "grad_norm": 6.1052811281532176, | |
| "learning_rate": 8.154363694783526e-07, | |
| "loss": 0.3528, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 1.0680570801317233, | |
| "grad_norm": 6.985081806859129, | |
| "learning_rate": 8.14445286962484e-07, | |
| "loss": 0.3402, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 1.070252469813392, | |
| "grad_norm": 6.737047071565736, | |
| "learning_rate": 8.134521564534947e-07, | |
| "loss": 0.3484, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 1.0724478594950604, | |
| "grad_norm": 8.024277201928818, | |
| "learning_rate": 8.124569844196779e-07, | |
| "loss": 0.4149, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 1.0746432491767288, | |
| "grad_norm": 6.684001008171824, | |
| "learning_rate": 8.11459777342624e-07, | |
| "loss": 0.3545, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.0768386388583973, | |
| "grad_norm": 8.174576979591736, | |
| "learning_rate": 8.104605417171776e-07, | |
| "loss": 0.4058, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 1.079034028540066, | |
| "grad_norm": 6.567306113947398, | |
| "learning_rate": 8.094592840513949e-07, | |
| "loss": 0.3559, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 1.0812294182217344, | |
| "grad_norm": 5.861714140854109, | |
| "learning_rate": 8.084560108665023e-07, | |
| "loss": 0.3705, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 1.0834248079034028, | |
| "grad_norm": 5.948783706558882, | |
| "learning_rate": 8.074507286968528e-07, | |
| "loss": 0.3855, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 1.0856201975850714, | |
| "grad_norm": 9.048770250869833, | |
| "learning_rate": 8.064434440898844e-07, | |
| "loss": 0.3838, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 1.0878155872667399, | |
| "grad_norm": 6.4254457902019375, | |
| "learning_rate": 8.054341636060766e-07, | |
| "loss": 0.3616, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 1.0900109769484083, | |
| "grad_norm": 8.363722938696041, | |
| "learning_rate": 8.044228938189088e-07, | |
| "loss": 0.4117, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 1.0922063666300768, | |
| "grad_norm": 8.80748939543165, | |
| "learning_rate": 8.034096413148161e-07, | |
| "loss": 0.3498, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 1.0944017563117454, | |
| "grad_norm": 6.142541366225709, | |
| "learning_rate": 8.023944126931475e-07, | |
| "loss": 0.3959, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 1.0965971459934138, | |
| "grad_norm": 6.217108697055721, | |
| "learning_rate": 8.013772145661224e-07, | |
| "loss": 0.4058, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.0965971459934138, | |
| "eval_accuracy": 0.79, | |
| "eval_loss": 0.3937423825263977, | |
| "eval_runtime": 52.5967, | |
| "eval_samples_per_second": 9.506, | |
| "eval_steps_per_second": 1.198, | |
| "step": 500 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 1368, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |