{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 48.16824966078698,
"eval_steps": 3538,
"global_step": 71000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.060379918588873815,
"grad_norm": 9.192076683044434,
"learning_rate": 3.353428786737001e-06,
"loss": 4.0695,
"step": 89
},
{
"epoch": 0.12075983717774763,
"grad_norm": 6.064570903778076,
"learning_rate": 6.706857573474002e-06,
"loss": 3.4544,
"step": 178
},
{
"epoch": 0.18113975576662145,
"grad_norm": 7.58292293548584,
"learning_rate": 1.0060286360211004e-05,
"loss": 3.2275,
"step": 267
},
{
"epoch": 0.24151967435549526,
"grad_norm": 12.527491569519043,
"learning_rate": 1.3413715146948003e-05,
"loss": 3.09,
"step": 356
},
{
"epoch": 0.3018995929443691,
"grad_norm": 12.255288124084473,
"learning_rate": 1.6767143933685002e-05,
"loss": 3.0322,
"step": 445
},
{
"epoch": 0.3622795115332429,
"grad_norm": 6.611302375793457,
"learning_rate": 2.0120572720422008e-05,
"loss": 2.9608,
"step": 534
},
{
"epoch": 0.4226594301221167,
"grad_norm": 12.716562271118164,
"learning_rate": 2.3474001507159007e-05,
"loss": 2.892,
"step": 623
},
{
"epoch": 0.4830393487109905,
"grad_norm": 6.532482624053955,
"learning_rate": 2.6827430293896006e-05,
"loss": 2.8621,
"step": 712
},
{
"epoch": 0.5434192672998643,
"grad_norm": 25.10944175720215,
"learning_rate": 3.0180859080633005e-05,
"loss": 2.754,
"step": 801
},
{
"epoch": 0.6037991858887382,
"grad_norm": 7.85612154006958,
"learning_rate": 3.3534287867370005e-05,
"loss": 2.7605,
"step": 890
},
{
"epoch": 0.664179104477612,
"grad_norm": 9.118956565856934,
"learning_rate": 3.688771665410701e-05,
"loss": 2.7105,
"step": 979
},
{
"epoch": 0.7245590230664858,
"grad_norm": 8.141679763793945,
"learning_rate": 4.0241145440844016e-05,
"loss": 2.6589,
"step": 1068
},
{
"epoch": 0.7849389416553596,
"grad_norm": 18.962980270385742,
"learning_rate": 4.3594574227581015e-05,
"loss": 2.6437,
"step": 1157
},
{
"epoch": 0.8453188602442334,
"grad_norm": 7.252344608306885,
"learning_rate": 4.6948003014318015e-05,
"loss": 2.6,
"step": 1246
},
{
"epoch": 0.9056987788331072,
"grad_norm": 9.572624206542969,
"learning_rate": 5.030143180105501e-05,
"loss": 2.6181,
"step": 1335
},
{
"epoch": 0.966078697421981,
"grad_norm": 7.212714672088623,
"learning_rate": 5.365486058779201e-05,
"loss": 2.5157,
"step": 1424
},
{
"epoch": 1.0264586160108549,
"grad_norm": 8.035208702087402,
"learning_rate": 5.700828937452901e-05,
"loss": 2.5217,
"step": 1513
},
{
"epoch": 1.0868385345997287,
"grad_norm": 10.060546875,
"learning_rate": 6.036171816126601e-05,
"loss": 2.4495,
"step": 1602
},
{
"epoch": 1.1472184531886025,
"grad_norm": 8.2612943649292,
"learning_rate": 6.371514694800301e-05,
"loss": 2.4271,
"step": 1691
},
{
"epoch": 1.2075983717774763,
"grad_norm": 7.717799186706543,
"learning_rate": 6.706857573474001e-05,
"loss": 2.4397,
"step": 1780
},
{
"epoch": 1.2679782903663501,
"grad_norm": 11.520405769348145,
"learning_rate": 7.042200452147701e-05,
"loss": 2.4099,
"step": 1869
},
{
"epoch": 1.328358208955224,
"grad_norm": 8.826777458190918,
"learning_rate": 7.377543330821402e-05,
"loss": 2.3349,
"step": 1958
},
{
"epoch": 1.3887381275440978,
"grad_norm": 7.810181140899658,
"learning_rate": 7.712886209495102e-05,
"loss": 2.3491,
"step": 2047
},
{
"epoch": 1.4491180461329716,
"grad_norm": 9.809256553649902,
"learning_rate": 8.048229088168803e-05,
"loss": 2.3778,
"step": 2136
},
{
"epoch": 1.5094979647218452,
"grad_norm": 8.396034240722656,
"learning_rate": 8.383571966842503e-05,
"loss": 2.3717,
"step": 2225
},
{
"epoch": 1.5698778833107192,
"grad_norm": 10.409805297851562,
"learning_rate": 8.718914845516203e-05,
"loss": 2.3207,
"step": 2314
},
{
"epoch": 1.6302578018995928,
"grad_norm": 7.1885786056518555,
"learning_rate": 9.054257724189903e-05,
"loss": 2.3651,
"step": 2403
},
{
"epoch": 1.6906377204884668,
"grad_norm": 7.768437385559082,
"learning_rate": 9.389600602863603e-05,
"loss": 2.3313,
"step": 2492
},
{
"epoch": 1.7510176390773404,
"grad_norm": 5.661167144775391,
"learning_rate": 9.724943481537303e-05,
"loss": 2.2961,
"step": 2581
},
{
"epoch": 1.8113975576662145,
"grad_norm": 8.26041030883789,
"learning_rate": 9.999999141684668e-05,
"loss": 2.2966,
"step": 2670
},
{
"epoch": 1.871777476255088,
"grad_norm": 9.347060203552246,
"learning_rate": 9.999963035487687e-05,
"loss": 2.2934,
"step": 2759
},
{
"epoch": 1.932157394843962,
"grad_norm": 9.274127006530762,
"learning_rate": 9.999873814762094e-05,
"loss": 2.2813,
"step": 2848
},
{
"epoch": 1.9925373134328357,
"grad_norm": 8.107317924499512,
"learning_rate": 9.999731480455674e-05,
"loss": 2.2005,
"step": 2937
},
{
"epoch": 2.0529172320217097,
"grad_norm": 8.258145332336426,
"learning_rate": 9.999536034080447e-05,
"loss": 2.1059,
"step": 3026
},
{
"epoch": 2.1132971506105833,
"grad_norm": 6.388680458068848,
"learning_rate": 9.999287477712633e-05,
"loss": 2.0797,
"step": 3115
},
{
"epoch": 2.1736770691994574,
"grad_norm": 6.251929759979248,
"learning_rate": 9.998985813992645e-05,
"loss": 2.0776,
"step": 3204
},
{
"epoch": 2.234056987788331,
"grad_norm": 7.3807172775268555,
"learning_rate": 9.998631046125051e-05,
"loss": 2.0028,
"step": 3293
},
{
"epoch": 2.294436906377205,
"grad_norm": 8.793547630310059,
"learning_rate": 9.998223177878545e-05,
"loss": 2.0789,
"step": 3382
},
{
"epoch": 2.3548168249660786,
"grad_norm": 8.737523078918457,
"learning_rate": 9.997762213585903e-05,
"loss": 2.0322,
"step": 3471
},
{
"epoch": 2.400271370420624,
"eval_accuracy": 0.193359375,
"eval_loss": 3.569305419921875,
"eval_runtime": 19.2577,
"eval_samples_per_second": 26.587,
"eval_steps_per_second": 0.208,
"step": 3538
},
{
"epoch": 2.4151967435549526,
"grad_norm": 13.652155876159668,
"learning_rate": 9.997248158143945e-05,
"loss": 1.9617,
"step": 3560
},
{
"epoch": 2.475576662143826,
"grad_norm": 6.501727104187012,
"learning_rate": 9.99668101701347e-05,
"loss": 2.0389,
"step": 3649
},
{
"epoch": 2.5359565807327002,
"grad_norm": 11.862299919128418,
"learning_rate": 9.99606079621921e-05,
"loss": 2.031,
"step": 3738
},
{
"epoch": 2.596336499321574,
"grad_norm": 7.8563551902771,
"learning_rate": 9.995387502349764e-05,
"loss": 1.9729,
"step": 3827
},
{
"epoch": 2.656716417910448,
"grad_norm": 9.843242645263672,
"learning_rate": 9.99466114255752e-05,
"loss": 1.9323,
"step": 3916
},
{
"epoch": 2.7170963364993215,
"grad_norm": 6.065842628479004,
"learning_rate": 9.993881724558587e-05,
"loss": 1.9465,
"step": 4005
},
{
"epoch": 2.7774762550881955,
"grad_norm": 11.29686450958252,
"learning_rate": 9.993049256632708e-05,
"loss": 1.8912,
"step": 4094
},
{
"epoch": 2.837856173677069,
"grad_norm": 11.959461212158203,
"learning_rate": 9.99216374762318e-05,
"loss": 1.9665,
"step": 4183
},
{
"epoch": 2.898236092265943,
"grad_norm": 8.116125106811523,
"learning_rate": 9.991225206936747e-05,
"loss": 1.9158,
"step": 4272
},
{
"epoch": 2.9586160108548167,
"grad_norm": 7.0811309814453125,
"learning_rate": 9.990233644543517e-05,
"loss": 1.929,
"step": 4361
},
{
"epoch": 3.0189959294436908,
"grad_norm": 7.11262845993042,
"learning_rate": 9.989189070976839e-05,
"loss": 1.8259,
"step": 4450
},
{
"epoch": 3.0793758480325644,
"grad_norm": 10.385732650756836,
"learning_rate": 9.988091497333202e-05,
"loss": 1.6678,
"step": 4539
},
{
"epoch": 3.1397557666214384,
"grad_norm": 8.934700965881348,
"learning_rate": 9.986940935272113e-05,
"loss": 1.7278,
"step": 4628
},
{
"epoch": 3.200135685210312,
"grad_norm": 11.203325271606445,
"learning_rate": 9.985737397015975e-05,
"loss": 1.6957,
"step": 4717
},
{
"epoch": 3.260515603799186,
"grad_norm": 10.464749336242676,
"learning_rate": 9.984480895349955e-05,
"loss": 1.6743,
"step": 4806
},
{
"epoch": 3.3208955223880596,
"grad_norm": 9.995988845825195,
"learning_rate": 9.983171443621853e-05,
"loss": 1.692,
"step": 4895
},
{
"epoch": 3.3812754409769337,
"grad_norm": 11.267080307006836,
"learning_rate": 9.981809055741953e-05,
"loss": 1.6836,
"step": 4984
},
{
"epoch": 3.4416553595658073,
"grad_norm": 9.267989158630371,
"learning_rate": 9.980393746182879e-05,
"loss": 1.6307,
"step": 5073
},
{
"epoch": 3.5020352781546813,
"grad_norm": 10.1551513671875,
"learning_rate": 9.978925529979441e-05,
"loss": 1.6547,
"step": 5162
},
{
"epoch": 3.562415196743555,
"grad_norm": 8.809422492980957,
"learning_rate": 9.97740442272848e-05,
"loss": 1.6293,
"step": 5251
},
{
"epoch": 3.622795115332429,
"grad_norm": 9.327820777893066,
"learning_rate": 9.975830440588692e-05,
"loss": 1.6611,
"step": 5340
},
{
"epoch": 3.6831750339213025,
"grad_norm": 9.966196060180664,
"learning_rate": 9.974203600280465e-05,
"loss": 1.6296,
"step": 5429
},
{
"epoch": 3.743554952510176,
"grad_norm": 9.004570007324219,
"learning_rate": 9.972523919085699e-05,
"loss": 1.6335,
"step": 5518
},
{
"epoch": 3.80393487109905,
"grad_norm": 8.515008926391602,
"learning_rate": 9.97079141484762e-05,
"loss": 1.6038,
"step": 5607
},
{
"epoch": 3.864314789687924,
"grad_norm": 8.961952209472656,
"learning_rate": 9.969006105970593e-05,
"loss": 1.6298,
"step": 5696
},
{
"epoch": 3.924694708276798,
"grad_norm": 11.294893264770508,
"learning_rate": 9.967168011419927e-05,
"loss": 1.588,
"step": 5785
},
{
"epoch": 3.9850746268656714,
"grad_norm": 9.631956100463867,
"learning_rate": 9.965277150721669e-05,
"loss": 1.5871,
"step": 5874
},
{
"epoch": 4.045454545454546,
"grad_norm": 7.810765266418457,
"learning_rate": 9.963333543962405e-05,
"loss": 1.4341,
"step": 5963
},
{
"epoch": 4.1058344640434195,
"grad_norm": 10.34420394897461,
"learning_rate": 9.961337211789039e-05,
"loss": 1.4289,
"step": 6052
},
{
"epoch": 4.166214382632293,
"grad_norm": 10.042189598083496,
"learning_rate": 9.959288175408577e-05,
"loss": 1.392,
"step": 6141
},
{
"epoch": 4.226594301221167,
"grad_norm": 9.480945587158203,
"learning_rate": 9.957186456587896e-05,
"loss": 1.4407,
"step": 6230
},
{
"epoch": 4.286974219810041,
"grad_norm": 10.059048652648926,
"learning_rate": 9.955032077653525e-05,
"loss": 1.4126,
"step": 6319
},
{
"epoch": 4.347354138398915,
"grad_norm": 8.111236572265625,
"learning_rate": 9.9528250614914e-05,
"loss": 1.3697,
"step": 6408
},
{
"epoch": 4.407734056987788,
"grad_norm": 7.836842060089111,
"learning_rate": 9.950565431546612e-05,
"loss": 1.4165,
"step": 6497
},
{
"epoch": 4.468113975576662,
"grad_norm": 10.462437629699707,
"learning_rate": 9.948253211823182e-05,
"loss": 1.3629,
"step": 6586
},
{
"epoch": 4.5284938941655355,
"grad_norm": 8.220168113708496,
"learning_rate": 9.945888426883778e-05,
"loss": 1.4402,
"step": 6675
},
{
"epoch": 4.58887381275441,
"grad_norm": 8.81052017211914,
"learning_rate": 9.943471101849477e-05,
"loss": 1.4194,
"step": 6764
},
{
"epoch": 4.649253731343284,
"grad_norm": 13.403641700744629,
"learning_rate": 9.941001262399482e-05,
"loss": 1.3943,
"step": 6853
},
{
"epoch": 4.709633649932157,
"grad_norm": 8.589107513427734,
"learning_rate": 9.938478934770861e-05,
"loss": 1.3888,
"step": 6942
},
{
"epoch": 4.770013568521032,
"grad_norm": 12.76234245300293,
"learning_rate": 9.935904145758259e-05,
"loss": 1.415,
"step": 7031
},
{
"epoch": 4.800542740841248,
"eval_accuracy": 0.19140625,
"eval_loss": 3.4613265991210938,
"eval_runtime": 18.4,
"eval_samples_per_second": 27.826,
"eval_steps_per_second": 0.217,
"step": 7076
},
{
"epoch": 4.830393487109905,
"grad_norm": 12.034464836120605,
"learning_rate": 9.933276922713619e-05,
"loss": 1.3772,
"step": 7120
},
{
"epoch": 4.890773405698779,
"grad_norm": 10.00120735168457,
"learning_rate": 9.930597293545891e-05,
"loss": 1.3427,
"step": 7209
},
{
"epoch": 4.951153324287652,
"grad_norm": 10.592118263244629,
"learning_rate": 9.927865286720734e-05,
"loss": 1.3681,
"step": 7298
},
{
"epoch": 5.011533242876526,
"grad_norm": 10.557718276977539,
"learning_rate": 9.925080931260211e-05,
"loss": 1.3345,
"step": 7387
},
{
"epoch": 5.0719131614654005,
"grad_norm": 8.036050796508789,
"learning_rate": 9.922244256742491e-05,
"loss": 1.1945,
"step": 7476
},
{
"epoch": 5.132293080054274,
"grad_norm": 8.104681015014648,
"learning_rate": 9.919355293301515e-05,
"loss": 1.191,
"step": 7565
},
{
"epoch": 5.192672998643148,
"grad_norm": 8.685461044311523,
"learning_rate": 9.916414071626704e-05,
"loss": 1.1867,
"step": 7654
},
{
"epoch": 5.253052917232022,
"grad_norm": 7.221011638641357,
"learning_rate": 9.913420622962606e-05,
"loss": 1.1737,
"step": 7743
},
{
"epoch": 5.313432835820896,
"grad_norm": 9.594326972961426,
"learning_rate": 9.910374979108579e-05,
"loss": 1.2058,
"step": 7832
},
{
"epoch": 5.373812754409769,
"grad_norm": 8.146512031555176,
"learning_rate": 9.907277172418449e-05,
"loss": 1.2173,
"step": 7921
},
{
"epoch": 5.434192672998643,
"grad_norm": 8.147337913513184,
"learning_rate": 9.904127235800169e-05,
"loss": 1.2047,
"step": 8010
},
{
"epoch": 5.4945725915875165,
"grad_norm": 10.820967674255371,
"learning_rate": 9.900925202715468e-05,
"loss": 1.2287,
"step": 8099
},
{
"epoch": 5.554952510176391,
"grad_norm": 10.343106269836426,
"learning_rate": 9.897671107179488e-05,
"loss": 1.1927,
"step": 8188
},
{
"epoch": 5.615332428765265,
"grad_norm": 10.41408920288086,
"learning_rate": 9.894364983760439e-05,
"loss": 1.2321,
"step": 8277
},
{
"epoch": 5.675712347354138,
"grad_norm": 9.299535751342773,
"learning_rate": 9.891006867579217e-05,
"loss": 1.2012,
"step": 8366
},
{
"epoch": 5.736092265943013,
"grad_norm": 10.728792190551758,
"learning_rate": 9.887596794309035e-05,
"loss": 1.1812,
"step": 8455
},
{
"epoch": 5.796472184531886,
"grad_norm": 7.432964324951172,
"learning_rate": 9.884134800175053e-05,
"loss": 1.1521,
"step": 8544
},
{
"epoch": 5.85685210312076,
"grad_norm": 7.614875316619873,
"learning_rate": 9.880620921953974e-05,
"loss": 1.1487,
"step": 8633
},
{
"epoch": 5.9172320217096335,
"grad_norm": 10.49835205078125,
"learning_rate": 9.877055196973674e-05,
"loss": 1.2014,
"step": 8722
},
{
"epoch": 5.977611940298507,
"grad_norm": 7.404662609100342,
"learning_rate": 9.873437663112794e-05,
"loss": 1.1821,
"step": 8811
},
{
"epoch": 6.0379918588873815,
"grad_norm": 9.661059379577637,
"learning_rate": 9.869768358800339e-05,
"loss": 1.0712,
"step": 8900
},
{
"epoch": 6.098371777476255,
"grad_norm": 11.615382194519043,
"learning_rate": 9.866047323015269e-05,
"loss": 1.0516,
"step": 8989
},
{
"epoch": 6.158751696065129,
"grad_norm": 10.21226978302002,
"learning_rate": 9.86227459528609e-05,
"loss": 1.0813,
"step": 9078
},
{
"epoch": 6.219131614654002,
"grad_norm": 7.748682975769043,
"learning_rate": 9.85845021569043e-05,
"loss": 1.0604,
"step": 9167
},
{
"epoch": 6.279511533242877,
"grad_norm": 10.797855377197266,
"learning_rate": 9.854574224854611e-05,
"loss": 1.0417,
"step": 9256
},
{
"epoch": 6.33989145183175,
"grad_norm": 9.862196922302246,
"learning_rate": 9.850646663953227e-05,
"loss": 1.0171,
"step": 9345
},
{
"epoch": 6.400271370420624,
"grad_norm": 10.341273307800293,
"learning_rate": 9.84666757470869e-05,
"loss": 1.0216,
"step": 9434
},
{
"epoch": 6.460651289009498,
"grad_norm": 7.858868598937988,
"learning_rate": 9.842636999390807e-05,
"loss": 1.0705,
"step": 9523
},
{
"epoch": 6.521031207598372,
"grad_norm": 10.367132186889648,
"learning_rate": 9.838554980816312e-05,
"loss": 1.0489,
"step": 9612
},
{
"epoch": 6.581411126187246,
"grad_norm": 13.918916702270508,
"learning_rate": 9.834421562348428e-05,
"loss": 1.0753,
"step": 9701
},
{
"epoch": 6.641791044776119,
"grad_norm": 9.345829010009766,
"learning_rate": 9.830236787896391e-05,
"loss": 1.0584,
"step": 9790
},
{
"epoch": 6.702170963364993,
"grad_norm": 12.244129180908203,
"learning_rate": 9.826000701914998e-05,
"loss": 1.0402,
"step": 9879
},
{
"epoch": 6.762550881953867,
"grad_norm": 8.918442726135254,
"learning_rate": 9.821713349404119e-05,
"loss": 1.0522,
"step": 9968
},
{
"epoch": 6.822930800542741,
"grad_norm": 8.40239143371582,
"learning_rate": 9.817374775908237e-05,
"loss": 1.0277,
"step": 10057
},
{
"epoch": 6.8833107191316145,
"grad_norm": 12.844498634338379,
"learning_rate": 9.812985027515947e-05,
"loss": 1.077,
"step": 10146
},
{
"epoch": 6.943690637720488,
"grad_norm": 8.832013130187988,
"learning_rate": 9.808544150859476e-05,
"loss": 1.0239,
"step": 10235
},
{
"epoch": 7.004070556309363,
"grad_norm": 5.591742038726807,
"learning_rate": 9.804052193114189e-05,
"loss": 1.0128,
"step": 10324
},
{
"epoch": 7.064450474898236,
"grad_norm": 6.905234336853027,
"learning_rate": 9.799509201998083e-05,
"loss": 0.9019,
"step": 10413
},
{
"epoch": 7.12483039348711,
"grad_norm": 9.313871383666992,
"learning_rate": 9.794915225771279e-05,
"loss": 0.9515,
"step": 10502
},
{
"epoch": 7.185210312075983,
"grad_norm": 8.015510559082031,
"learning_rate": 9.790270313235517e-05,
"loss": 0.9301,
"step": 10591
},
{
"epoch": 7.200814111261873,
"eval_accuracy": 0.17578125,
"eval_loss": 3.7450790405273438,
"eval_runtime": 18.7511,
"eval_samples_per_second": 27.305,
"eval_steps_per_second": 0.213,
"step": 10614
},
{
"epoch": 7.245590230664858,
"grad_norm": 7.311409950256348,
"learning_rate": 9.785574513733625e-05,
"loss": 0.9172,
"step": 10680
},
{
"epoch": 7.3059701492537314,
"grad_norm": 5.454108238220215,
"learning_rate": 9.780827877149013e-05,
"loss": 0.9372,
"step": 10769
},
{
"epoch": 7.366350067842605,
"grad_norm": 5.830528736114502,
"learning_rate": 9.776030453905122e-05,
"loss": 0.9163,
"step": 10858
},
{
"epoch": 7.426729986431479,
"grad_norm": 9.309490203857422,
"learning_rate": 9.771182294964905e-05,
"loss": 0.9528,
"step": 10947
},
{
"epoch": 7.487109905020353,
"grad_norm": 11.420437812805176,
"learning_rate": 9.76628345183028e-05,
"loss": 0.9198,
"step": 11036
},
{
"epoch": 7.547489823609227,
"grad_norm": 11.052990913391113,
"learning_rate": 9.761333976541578e-05,
"loss": 0.9231,
"step": 11125
},
{
"epoch": 7.6078697421981,
"grad_norm": 7.378238201141357,
"learning_rate": 9.756333921676999e-05,
"loss": 0.9452,
"step": 11214
},
{
"epoch": 7.668249660786974,
"grad_norm": 11.708273887634277,
"learning_rate": 9.751283340352044e-05,
"loss": 0.9163,
"step": 11303
},
{
"epoch": 7.728629579375848,
"grad_norm": 5.919505596160889,
"learning_rate": 9.746182286218964e-05,
"loss": 0.9254,
"step": 11392
},
{
"epoch": 7.789009497964722,
"grad_norm": 10.179853439331055,
"learning_rate": 9.741030813466172e-05,
"loss": 0.9317,
"step": 11481
},
{
"epoch": 7.849389416553596,
"grad_norm": 8.873759269714355,
"learning_rate": 9.735828976817683e-05,
"loss": 0.9474,
"step": 11570
},
{
"epoch": 7.909769335142469,
"grad_norm": 6.65983772277832,
"learning_rate": 9.730576831532528e-05,
"loss": 0.9013,
"step": 11659
},
{
"epoch": 7.970149253731344,
"grad_norm": 7.311088562011719,
"learning_rate": 9.725274433404164e-05,
"loss": 0.9119,
"step": 11748
},
{
"epoch": 8.030529172320216,
"grad_norm": 10.026205062866211,
"learning_rate": 9.719921838759878e-05,
"loss": 0.876,
"step": 11837
},
{
"epoch": 8.090909090909092,
"grad_norm": 8.08633804321289,
"learning_rate": 9.714519104460202e-05,
"loss": 0.8151,
"step": 11926
},
{
"epoch": 8.151289009497965,
"grad_norm": 6.680150508880615,
"learning_rate": 9.709066287898298e-05,
"loss": 0.8111,
"step": 12015
},
{
"epoch": 8.211668928086839,
"grad_norm": 8.399514198303223,
"learning_rate": 9.70356344699935e-05,
"loss": 0.8207,
"step": 12104
},
{
"epoch": 8.272048846675712,
"grad_norm": 10.127174377441406,
"learning_rate": 9.698010640219951e-05,
"loss": 0.84,
"step": 12193
},
{
"epoch": 8.332428765264586,
"grad_norm": 7.315372943878174,
"learning_rate": 9.692407926547478e-05,
"loss": 0.8473,
"step": 12282
},
{
"epoch": 8.39280868385346,
"grad_norm": 11.611318588256836,
"learning_rate": 9.686755365499471e-05,
"loss": 0.8423,
"step": 12371
},
{
"epoch": 8.453188602442333,
"grad_norm": 7.9076008796691895,
"learning_rate": 9.681053017122996e-05,
"loss": 0.8445,
"step": 12460
},
{
"epoch": 8.513568521031207,
"grad_norm": 9.092277526855469,
"learning_rate": 9.675300941994012e-05,
"loss": 0.8652,
"step": 12549
},
{
"epoch": 8.573948439620082,
"grad_norm": 8.704888343811035,
"learning_rate": 9.669499201216723e-05,
"loss": 0.8312,
"step": 12638
},
{
"epoch": 8.634328358208956,
"grad_norm": 13.215127944946289,
"learning_rate": 9.663647856422928e-05,
"loss": 0.8306,
"step": 12727
},
{
"epoch": 8.69470827679783,
"grad_norm": 6.171853542327881,
"learning_rate": 9.657746969771371e-05,
"loss": 0.8504,
"step": 12816
},
{
"epoch": 8.755088195386703,
"grad_norm": 9.066251754760742,
"learning_rate": 9.651796603947076e-05,
"loss": 0.8711,
"step": 12905
},
{
"epoch": 8.815468113975577,
"grad_norm": 7.504266262054443,
"learning_rate": 9.645796822160691e-05,
"loss": 0.8312,
"step": 12994
},
{
"epoch": 8.87584803256445,
"grad_norm": 11.219298362731934,
"learning_rate": 9.639747688147798e-05,
"loss": 0.8264,
"step": 13083
},
{
"epoch": 8.936227951153324,
"grad_norm": 9.841562271118164,
"learning_rate": 9.633649266168256e-05,
"loss": 0.8097,
"step": 13172
},
{
"epoch": 8.996607869742197,
"grad_norm": 6.924744606018066,
"learning_rate": 9.627501621005505e-05,
"loss": 0.8315,
"step": 13261
},
{
"epoch": 9.056987788331073,
"grad_norm": 12.85659408569336,
"learning_rate": 9.62130481796588e-05,
"loss": 0.7768,
"step": 13350
},
{
"epoch": 9.117367706919946,
"grad_norm": 7.802920341491699,
"learning_rate": 9.615058922877926e-05,
"loss": 0.7363,
"step": 13439
},
{
"epoch": 9.17774762550882,
"grad_norm": 5.512497425079346,
"learning_rate": 9.608764002091686e-05,
"loss": 0.7568,
"step": 13528
},
{
"epoch": 9.238127544097694,
"grad_norm": 7.84502649307251,
"learning_rate": 9.602420122478004e-05,
"loss": 0.7754,
"step": 13617
},
{
"epoch": 9.298507462686567,
"grad_norm": 7.394598484039307,
"learning_rate": 9.596027351427814e-05,
"loss": 0.7862,
"step": 13706
},
{
"epoch": 9.35888738127544,
"grad_norm": 8.552702903747559,
"learning_rate": 9.589585756851422e-05,
"loss": 0.7404,
"step": 13795
},
{
"epoch": 9.419267299864314,
"grad_norm": 8.93039608001709,
"learning_rate": 9.583095407177788e-05,
"loss": 0.7368,
"step": 13884
},
{
"epoch": 9.479647218453188,
"grad_norm": 8.229623794555664,
"learning_rate": 9.576556371353791e-05,
"loss": 0.7699,
"step": 13973
},
{
"epoch": 9.540027137042063,
"grad_norm": 10.284710884094238,
"learning_rate": 9.569968718843507e-05,
"loss": 0.7811,
"step": 14062
},
{
"epoch": 9.600407055630937,
"grad_norm": 5.939275741577148,
"learning_rate": 9.563332519627466e-05,
"loss": 0.7419,
"step": 14151
},
{
"epoch": 9.601085481682496,
"eval_accuracy": 0.20703125,
"eval_loss": 3.737224578857422,
"eval_runtime": 17.1346,
"eval_samples_per_second": 29.881,
"eval_steps_per_second": 0.233,
"step": 14152
},
{
"epoch": 9.66078697421981,
"grad_norm": 7.720785140991211,
"learning_rate": 9.556647844201908e-05,
"loss": 0.7578,
"step": 14240
},
{
"epoch": 9.721166892808684,
"grad_norm": 7.313141345977783,
"learning_rate": 9.549914763578031e-05,
"loss": 0.7662,
"step": 14329
},
{
"epoch": 9.781546811397558,
"grad_norm": 10.582131385803223,
"learning_rate": 9.543133349281248e-05,
"loss": 0.7503,
"step": 14418
},
{
"epoch": 9.841926729986431,
"grad_norm": 5.272374153137207,
"learning_rate": 9.536303673350415e-05,
"loss": 0.7729,
"step": 14507
},
{
"epoch": 9.902306648575305,
"grad_norm": 6.4560370445251465,
"learning_rate": 9.529425808337074e-05,
"loss": 0.7659,
"step": 14596
},
{
"epoch": 9.962686567164178,
"grad_norm": 4.996959686279297,
"learning_rate": 9.522499827304674e-05,
"loss": 0.7348,
"step": 14685
},
{
"epoch": 10.023066485753052,
"grad_norm": 5.831302165985107,
"learning_rate": 9.515525803827803e-05,
"loss": 0.7534,
"step": 14774
},
{
"epoch": 10.083446404341927,
"grad_norm": 6.166038990020752,
"learning_rate": 9.508503811991405e-05,
"loss": 0.7,
"step": 14863
},
{
"epoch": 10.143826322930801,
"grad_norm": 9.589017868041992,
"learning_rate": 9.501433926389986e-05,
"loss": 0.6585,
"step": 14952
},
{
"epoch": 10.204206241519675,
"grad_norm": 8.026691436767578,
"learning_rate": 9.49431622212683e-05,
"loss": 0.6973,
"step": 15041
},
{
"epoch": 10.264586160108548,
"grad_norm": 8.68213939666748,
"learning_rate": 9.487150774813198e-05,
"loss": 0.698,
"step": 15130
},
{
"epoch": 10.324966078697422,
"grad_norm": 11.472238540649414,
"learning_rate": 9.479937660567523e-05,
"loss": 0.7192,
"step": 15219
},
{
"epoch": 10.385345997286295,
"grad_norm": 6.372411251068115,
"learning_rate": 9.472676956014605e-05,
"loss": 0.6859,
"step": 15308
},
{
"epoch": 10.445725915875169,
"grad_norm": 5.333731174468994,
"learning_rate": 9.465368738284794e-05,
"loss": 0.7025,
"step": 15397
},
{
"epoch": 10.506105834464044,
"grad_norm": 7.277047157287598,
"learning_rate": 9.458013085013173e-05,
"loss": 0.7102,
"step": 15486
},
{
"epoch": 10.566485753052918,
"grad_norm": 10.157328605651855,
"learning_rate": 9.45061007433873e-05,
"loss": 0.6814,
"step": 15575
},
{
"epoch": 10.626865671641792,
"grad_norm": 5.025580883026123,
"learning_rate": 9.443159784903528e-05,
"loss": 0.7038,
"step": 15664
},
{
"epoch": 10.687245590230665,
"grad_norm": 7.037330627441406,
"learning_rate": 9.43566229585188e-05,
"loss": 0.6886,
"step": 15753
},
{
"epoch": 10.747625508819539,
"grad_norm": 8.00758171081543,
"learning_rate": 9.42811768682949e-05,
"loss": 0.6988,
"step": 15842
},
{
"epoch": 10.808005427408412,
"grad_norm": 6.200064659118652,
"learning_rate": 9.42052603798262e-05,
"loss": 0.6872,
"step": 15931
},
{
"epoch": 10.868385345997286,
"grad_norm": 7.785628795623779,
"learning_rate": 9.412887429957241e-05,
"loss": 0.7191,
"step": 16020
},
{
"epoch": 10.92876526458616,
"grad_norm": 5.606222629547119,
"learning_rate": 9.405201943898162e-05,
"loss": 0.6933,
"step": 16109
},
{
"epoch": 10.989145183175033,
"grad_norm": 6.9870147705078125,
"learning_rate": 9.397469661448182e-05,
"loss": 0.6873,
"step": 16198
},
{
"epoch": 11.049525101763908,
"grad_norm": 7.700918674468994,
"learning_rate": 9.389690664747214e-05,
"loss": 0.6515,
"step": 16287
},
{
"epoch": 11.109905020352782,
"grad_norm": 4.668413162231445,
"learning_rate": 9.38186503643142e-05,
"loss": 0.6484,
"step": 16376
},
{
"epoch": 11.170284938941656,
"grad_norm": 9.098540306091309,
"learning_rate": 9.373992859632324e-05,
"loss": 0.6479,
"step": 16465
},
{
"epoch": 11.23066485753053,
"grad_norm": 7.96748161315918,
"learning_rate": 9.366074217975938e-05,
"loss": 0.6351,
"step": 16554
},
{
"epoch": 11.291044776119403,
"grad_norm": 5.657280921936035,
"learning_rate": 9.358109195581866e-05,
"loss": 0.6362,
"step": 16643
},
{
"epoch": 11.351424694708276,
"grad_norm": 7.184754371643066,
"learning_rate": 9.350097877062418e-05,
"loss": 0.6527,
"step": 16732
},
{
"epoch": 11.41180461329715,
"grad_norm": 6.7868523597717285,
"learning_rate": 9.342040347521702e-05,
"loss": 0.667,
"step": 16821
},
{
"epoch": 11.472184531886024,
"grad_norm": 7.017992973327637,
"learning_rate": 9.333936692554729e-05,
"loss": 0.633,
"step": 16910
},
{
"epoch": 11.532564450474899,
"grad_norm": 6.653933048248291,
"learning_rate": 9.325786998246498e-05,
"loss": 0.6404,
"step": 16999
},
{
"epoch": 11.592944369063773,
"grad_norm": 6.6855058670043945,
"learning_rate": 9.317591351171082e-05,
"loss": 0.6776,
"step": 17088
},
{
"epoch": 11.653324287652646,
"grad_norm": 8.127620697021484,
"learning_rate": 9.309349838390711e-05,
"loss": 0.6385,
"step": 17177
},
{
"epoch": 11.71370420624152,
"grad_norm": 7.420390605926514,
"learning_rate": 9.301062547454849e-05,
"loss": 0.6395,
"step": 17266
},
{
"epoch": 11.774084124830393,
"grad_norm": 7.517685413360596,
"learning_rate": 9.292729566399252e-05,
"loss": 0.6335,
"step": 17355
},
{
"epoch": 11.834464043419267,
"grad_norm": 7.267749786376953,
"learning_rate": 9.284350983745049e-05,
"loss": 0.6607,
"step": 17444
},
{
"epoch": 11.89484396200814,
"grad_norm": 7.73004150390625,
"learning_rate": 9.275926888497792e-05,
"loss": 0.6671,
"step": 17533
},
{
"epoch": 11.955223880597014,
"grad_norm": 7.934135913848877,
"learning_rate": 9.267457370146513e-05,
"loss": 0.6207,
"step": 17622
},
{
"epoch": 12.00135685210312,
"eval_accuracy": 0.19140625,
"eval_loss": 3.8003501892089844,
"eval_runtime": 19.7102,
"eval_samples_per_second": 25.976,
"eval_steps_per_second": 0.203,
"step": 17690
},
{
"epoch": 12.01560379918589,
"grad_norm": 5.052128314971924,
"learning_rate": 9.25894251866277e-05,
"loss": 0.6211,
"step": 17711
},
{
"epoch": 12.075983717774763,
"grad_norm": 5.490070343017578,
"learning_rate": 9.250382424499698e-05,
"loss": 0.6037,
"step": 17800
},
{
"epoch": 12.136363636363637,
"grad_norm": 6.631565570831299,
"learning_rate": 9.241777178591043e-05,
"loss": 0.6032,
"step": 17889
},
{
"epoch": 12.19674355495251,
"grad_norm": 6.181819438934326,
"learning_rate": 9.233126872350193e-05,
"loss": 0.5988,
"step": 17978
},
{
"epoch": 12.257123473541384,
"grad_norm": 5.3416361808776855,
"learning_rate": 9.224431597669219e-05,
"loss": 0.612,
"step": 18067
},
{
"epoch": 12.317503392130257,
"grad_norm": 9.972622871398926,
"learning_rate": 9.215691446917885e-05,
"loss": 0.5976,
"step": 18156
},
{
"epoch": 12.377883310719131,
"grad_norm": 6.693090915679932,
"learning_rate": 9.206906512942676e-05,
"loss": 0.6127,
"step": 18245
},
{
"epoch": 12.438263229308005,
"grad_norm": 5.006298065185547,
"learning_rate": 9.198076889065806e-05,
"loss": 0.614,
"step": 18334
},
{
"epoch": 12.49864314789688,
"grad_norm": 4.5717668533325195,
"learning_rate": 9.189202669084233e-05,
"loss": 0.6026,
"step": 18423
},
{
"epoch": 12.559023066485754,
"grad_norm": 7.7340989112854,
"learning_rate": 9.180283947268653e-05,
"loss": 0.589,
"step": 18512
},
{
"epoch": 12.619402985074627,
"grad_norm": 6.45162296295166,
"learning_rate": 9.17132081836251e-05,
"loss": 0.5889,
"step": 18601
},
{
"epoch": 12.6797829036635,
"grad_norm": 7.008767604827881,
"learning_rate": 9.162313377580979e-05,
"loss": 0.5783,
"step": 18690
},
{
"epoch": 12.740162822252374,
"grad_norm": 7.15552282333374,
"learning_rate": 9.153261720609963e-05,
"loss": 0.5953,
"step": 18779
},
{
"epoch": 12.800542740841248,
"grad_norm": 5.7486748695373535,
"learning_rate": 9.144165943605072e-05,
"loss": 0.5965,
"step": 18868
},
{
"epoch": 12.860922659430122,
"grad_norm": 5.747917652130127,
"learning_rate": 9.135026143190601e-05,
"loss": 0.5875,
"step": 18957
},
{
"epoch": 12.921302578018995,
"grad_norm": 7.039977550506592,
"learning_rate": 9.125842416458506e-05,
"loss": 0.5954,
"step": 19046
},
{
"epoch": 12.98168249660787,
"grad_norm": 3.8854663372039795,
"learning_rate": 9.116614860967372e-05,
"loss": 0.5818,
"step": 19135
},
{
"epoch": 13.042062415196744,
"grad_norm": 5.661801815032959,
"learning_rate": 9.107343574741374e-05,
"loss": 0.5619,
"step": 19224
},
{
"epoch": 13.102442333785618,
"grad_norm": 6.757572174072266,
"learning_rate": 9.098028656269243e-05,
"loss": 0.5639,
"step": 19313
},
{
"epoch": 13.162822252374491,
"grad_norm": 7.3293352127075195,
"learning_rate": 9.088670204503208e-05,
"loss": 0.5633,
"step": 19402
},
{
"epoch": 13.223202170963365,
"grad_norm": 7.053752899169922,
"learning_rate": 9.079268318857957e-05,
"loss": 0.5487,
"step": 19491
},
{
"epoch": 13.283582089552239,
"grad_norm": 5.139120101928711,
"learning_rate": 9.069823099209571e-05,
"loss": 0.543,
"step": 19580
},
{
"epoch": 13.343962008141112,
"grad_norm": 7.9965314865112305,
"learning_rate": 9.060334645894472e-05,
"loss": 0.5521,
"step": 19669
},
{
"epoch": 13.404341926729986,
"grad_norm": 7.904087543487549,
"learning_rate": 9.050803059708348e-05,
"loss": 0.5763,
"step": 19758
},
{
"epoch": 13.464721845318861,
"grad_norm": 4.6150221824646,
"learning_rate": 9.041228441905092e-05,
"loss": 0.5492,
"step": 19847
},
{
"epoch": 13.525101763907735,
"grad_norm": 4.3521857261657715,
"learning_rate": 9.031610894195715e-05,
"loss": 0.5544,
"step": 19936
},
{
"epoch": 13.585481682496608,
"grad_norm": 6.906470775604248,
"learning_rate": 9.021950518747276e-05,
"loss": 0.5922,
"step": 20025
},
{
"epoch": 13.645861601085482,
"grad_norm": 7.304365158081055,
"learning_rate": 9.012247418181792e-05,
"loss": 0.5473,
"step": 20114
},
{
"epoch": 13.706241519674355,
"grad_norm": 5.015029430389404,
"learning_rate": 9.002501695575148e-05,
"loss": 0.5843,
"step": 20203
},
{
"epoch": 13.766621438263229,
"grad_norm": 5.353032112121582,
"learning_rate": 8.992713454455999e-05,
"loss": 0.5423,
"step": 20292
},
{
"epoch": 13.827001356852103,
"grad_norm": 4.505341529846191,
"learning_rate": 8.98288279880468e-05,
"loss": 0.5511,
"step": 20381
},
{
"epoch": 13.887381275440976,
"grad_norm": 6.68435525894165,
"learning_rate": 8.973009833052087e-05,
"loss": 0.5429,
"step": 20470
},
{
"epoch": 13.947761194029852,
"grad_norm": 4.248044490814209,
"learning_rate": 8.963094662078583e-05,
"loss": 0.5637,
"step": 20559
},
{
"epoch": 14.008141112618725,
"grad_norm": 4.230225563049316,
"learning_rate": 8.953137391212875e-05,
"loss": 0.5551,
"step": 20648
},
{
"epoch": 14.068521031207599,
"grad_norm": 4.81500768661499,
"learning_rate": 8.94313812623089e-05,
"loss": 0.5027,
"step": 20737
},
{
"epoch": 14.128900949796472,
"grad_norm": 6.79054594039917,
"learning_rate": 8.933096973354664e-05,
"loss": 0.4904,
"step": 20826
},
{
"epoch": 14.189280868385346,
"grad_norm": 4.661177635192871,
"learning_rate": 8.923014039251208e-05,
"loss": 0.5076,
"step": 20915
},
{
"epoch": 14.24966078697422,
"grad_norm": 10.014252662658691,
"learning_rate": 8.91288943103137e-05,
"loss": 0.5068,
"step": 21004
},
{
"epoch": 14.310040705563093,
"grad_norm": 8.030250549316406,
"learning_rate": 8.902723256248704e-05,
"loss": 0.521,
"step": 21093
},
{
"epoch": 14.370420624151967,
"grad_norm": 5.514551162719727,
"learning_rate": 8.892515622898326e-05,
"loss": 0.5053,
"step": 21182
},
{
"epoch": 14.401628222523746,
"eval_accuracy": 0.193359375,
"eval_loss": 3.79229736328125,
"eval_runtime": 41.777,
"eval_samples_per_second": 12.256,
"eval_steps_per_second": 0.096,
"step": 21228
},
{
"epoch": 14.43080054274084,
"grad_norm": 5.649023056030273,
"learning_rate": 8.882266639415763e-05,
"loss": 0.5103,
"step": 21271
},
{
"epoch": 14.491180461329716,
"grad_norm": 6.628403663635254,
"learning_rate": 8.871976414675805e-05,
"loss": 0.5238,
"step": 21360
},
{
"epoch": 14.55156037991859,
"grad_norm": 5.387028217315674,
"learning_rate": 8.86164505799135e-05,
"loss": 0.5278,
"step": 21449
},
{
"epoch": 14.611940298507463,
"grad_norm": 5.111924171447754,
"learning_rate": 8.851272679112234e-05,
"loss": 0.5269,
"step": 21538
},
{
"epoch": 14.672320217096336,
"grad_norm": 5.967355728149414,
"learning_rate": 8.840859388224076e-05,
"loss": 0.5188,
"step": 21627
},
{
"epoch": 14.73270013568521,
"grad_norm": 5.387267589569092,
"learning_rate": 8.830405295947102e-05,
"loss": 0.5161,
"step": 21716
},
{
"epoch": 14.793080054274084,
"grad_norm": 4.254080772399902,
"learning_rate": 8.81991051333497e-05,
"loss": 0.5228,
"step": 21805
},
{
"epoch": 14.853459972862957,
"grad_norm": 3.855088233947754,
"learning_rate": 8.809375151873589e-05,
"loss": 0.5091,
"step": 21894
},
{
"epoch": 14.913839891451833,
"grad_norm": 5.05858039855957,
"learning_rate": 8.798799323479938e-05,
"loss": 0.5259,
"step": 21983
},
{
"epoch": 14.974219810040706,
"grad_norm": 8.726083755493164,
"learning_rate": 8.788183140500874e-05,
"loss": 0.5171,
"step": 22072
},
{
"epoch": 15.03459972862958,
"grad_norm": 5.312582492828369,
"learning_rate": 8.777526715711946e-05,
"loss": 0.4804,
"step": 22161
},
{
"epoch": 15.094979647218453,
"grad_norm": 4.794472694396973,
"learning_rate": 8.766830162316183e-05,
"loss": 0.4814,
"step": 22250
},
{
"epoch": 15.155359565807327,
"grad_norm": 6.440197944641113,
"learning_rate": 8.756093593942905e-05,
"loss": 0.4829,
"step": 22339
},
{
"epoch": 15.2157394843962,
"grad_norm": 4.757099151611328,
"learning_rate": 8.745317124646508e-05,
"loss": 0.4572,
"step": 22428
},
{
"epoch": 15.276119402985074,
"grad_norm": 5.3460235595703125,
"learning_rate": 8.734500868905258e-05,
"loss": 0.476,
"step": 22517
},
{
"epoch": 15.336499321573948,
"grad_norm": 4.173645496368408,
"learning_rate": 8.723644941620065e-05,
"loss": 0.4829,
"step": 22606
},
{
"epoch": 15.396879240162821,
"grad_norm": 8.921795845031738,
"learning_rate": 8.71274945811328e-05,
"loss": 0.4758,
"step": 22695
},
{
"epoch": 15.457259158751697,
"grad_norm": 5.059213161468506,
"learning_rate": 8.701814534127446e-05,
"loss": 0.4516,
"step": 22784
},
{
"epoch": 15.51763907734057,
"grad_norm": 6.460654258728027,
"learning_rate": 8.690840285824094e-05,
"loss": 0.4946,
"step": 22873
},
{
"epoch": 15.578018995929444,
"grad_norm": 5.588746547698975,
"learning_rate": 8.679826829782485e-05,
"loss": 0.5096,
"step": 22962
},
{
"epoch": 15.638398914518318,
"grad_norm": 4.974047660827637,
"learning_rate": 8.668774282998394e-05,
"loss": 0.491,
"step": 23051
},
{
"epoch": 15.698778833107191,
"grad_norm": 4.4067463874816895,
"learning_rate": 8.65768276288285e-05,
"loss": 0.487,
"step": 23140
},
{
"epoch": 15.759158751696065,
"grad_norm": 5.659997463226318,
"learning_rate": 8.646552387260898e-05,
"loss": 0.4895,
"step": 23229
},
{
"epoch": 15.819538670284938,
"grad_norm": 5.777614593505859,
"learning_rate": 8.635383274370341e-05,
"loss": 0.4951,
"step": 23318
},
{
"epoch": 15.879918588873814,
"grad_norm": 6.594443321228027,
"learning_rate": 8.62417554286049e-05,
"loss": 0.4871,
"step": 23407
},
{
"epoch": 15.940298507462687,
"grad_norm": 4.5751237869262695,
"learning_rate": 8.612929311790899e-05,
"loss": 0.5005,
"step": 23496
},
{
"epoch": 16.00067842605156,
"grad_norm": 4.56909704208374,
"learning_rate": 8.601644700630107e-05,
"loss": 0.4875,
"step": 23585
},
{
"epoch": 16.061058344640433,
"grad_norm": 5.793113708496094,
"learning_rate": 8.590321829254358e-05,
"loss": 0.4592,
"step": 23674
},
{
"epoch": 16.121438263229308,
"grad_norm": 3.888392686843872,
"learning_rate": 8.578960817946338e-05,
"loss": 0.4343,
"step": 23763
},
{
"epoch": 16.181818181818183,
"grad_norm": 3.910721778869629,
"learning_rate": 8.567561787393888e-05,
"loss": 0.4499,
"step": 23852
},
{
"epoch": 16.242198100407055,
"grad_norm": 7.085721492767334,
"learning_rate": 8.556124858688734e-05,
"loss": 0.4391,
"step": 23941
},
{
"epoch": 16.30257801899593,
"grad_norm": 6.454195022583008,
"learning_rate": 8.54465015332519e-05,
"loss": 0.4378,
"step": 24030
},
{
"epoch": 16.362957937584802,
"grad_norm": 3.5428030490875244,
"learning_rate": 8.533137793198866e-05,
"loss": 0.4511,
"step": 24119
},
{
"epoch": 16.423337856173678,
"grad_norm": 3.401646614074707,
"learning_rate": 8.521587900605385e-05,
"loss": 0.4642,
"step": 24208
},
{
"epoch": 16.48371777476255,
"grad_norm": 6.838740825653076,
"learning_rate": 8.510000598239075e-05,
"loss": 0.4584,
"step": 24297
},
{
"epoch": 16.544097693351425,
"grad_norm": 5.186567306518555,
"learning_rate": 8.498376009191665e-05,
"loss": 0.4741,
"step": 24386
},
{
"epoch": 16.604477611940297,
"grad_norm": 3.8350930213928223,
"learning_rate": 8.486714256950983e-05,
"loss": 0.4475,
"step": 24475
},
{
"epoch": 16.664857530529172,
"grad_norm": 5.290257453918457,
"learning_rate": 8.475015465399638e-05,
"loss": 0.4544,
"step": 24564
},
{
"epoch": 16.725237449118048,
"grad_norm": 5.533965587615967,
"learning_rate": 8.463279758813711e-05,
"loss": 0.457,
"step": 24653
},
{
"epoch": 16.78561736770692,
"grad_norm": 5.372981071472168,
"learning_rate": 8.451507261861425e-05,
"loss": 0.4537,
"step": 24742
},
{
"epoch": 16.80189959294437,
"eval_accuracy": 0.16015625,
"eval_loss": 3.9037704467773438,
"eval_runtime": 39.2353,
"eval_samples_per_second": 13.049,
"eval_steps_per_second": 0.102,
"step": 24766
},
{
"epoch": 16.845997286295795,
"grad_norm": 5.011608600616455,
"learning_rate": 8.439698099601831e-05,
"loss": 0.452,
"step": 24831
},
{
"epoch": 16.906377204884667,
"grad_norm": 5.051270484924316,
"learning_rate": 8.427852397483475e-05,
"loss": 0.4493,
"step": 24920
},
{
"epoch": 16.966757123473542,
"grad_norm": 3.670827627182007,
"learning_rate": 8.415970281343061e-05,
"loss": 0.4476,
"step": 25009
},
{
"epoch": 17.027137042062414,
"grad_norm": 2.6706955432891846,
"learning_rate": 8.404051877404126e-05,
"loss": 0.4478,
"step": 25098
},
{
"epoch": 17.08751696065129,
"grad_norm": 7.5127787590026855,
"learning_rate": 8.392097312275686e-05,
"loss": 0.4244,
"step": 25187
},
{
"epoch": 17.147896879240164,
"grad_norm": 3.7548723220825195,
"learning_rate": 8.380106712950896e-05,
"loss": 0.4289,
"step": 25276
},
{
"epoch": 17.208276797829036,
"grad_norm": 3.9628028869628906,
"learning_rate": 8.368080206805706e-05,
"loss": 0.4337,
"step": 25365
},
{
"epoch": 17.26865671641791,
"grad_norm": 4.179431915283203,
"learning_rate": 8.3560179215975e-05,
"loss": 0.4147,
"step": 25454
},
{
"epoch": 17.329036635006783,
"grad_norm": 3.3942129611968994,
"learning_rate": 8.343919985463745e-05,
"loss": 0.4175,
"step": 25543
},
{
"epoch": 17.38941655359566,
"grad_norm": 4.166045665740967,
"learning_rate": 8.331786526920626e-05,
"loss": 0.423,
"step": 25632
},
{
"epoch": 17.44979647218453,
"grad_norm": 3.0724310874938965,
"learning_rate": 8.319617674861682e-05,
"loss": 0.41,
"step": 25721
},
{
"epoch": 17.510176390773406,
"grad_norm": 6.462100028991699,
"learning_rate": 8.307413558556437e-05,
"loss": 0.4125,
"step": 25810
},
{
"epoch": 17.570556309362278,
"grad_norm": 4.838727951049805,
"learning_rate": 8.295174307649024e-05,
"loss": 0.4254,
"step": 25899
},
{
"epoch": 17.630936227951153,
"grad_norm": 3.9609103202819824,
"learning_rate": 8.282900052156817e-05,
"loss": 0.4141,
"step": 25988
},
{
"epoch": 17.69131614654003,
"grad_norm": 3.537935972213745,
"learning_rate": 8.270590922469037e-05,
"loss": 0.4189,
"step": 26077
},
{
"epoch": 17.7516960651289,
"grad_norm": 5.015251159667969,
"learning_rate": 8.258247049345373e-05,
"loss": 0.439,
"step": 26166
},
{
"epoch": 17.812075983717776,
"grad_norm": 4.997931957244873,
"learning_rate": 8.245868563914598e-05,
"loss": 0.4079,
"step": 26255
},
{
"epoch": 17.872455902306648,
"grad_norm": 5.362955093383789,
"learning_rate": 8.233455597673165e-05,
"loss": 0.4165,
"step": 26344
},
{
"epoch": 17.932835820895523,
"grad_norm": 6.1235880851745605,
"learning_rate": 8.22100828248382e-05,
"loss": 0.4121,
"step": 26433
},
{
"epoch": 17.993215739484395,
"grad_norm": 4.939189434051514,
"learning_rate": 8.208526750574199e-05,
"loss": 0.4191,
"step": 26522
},
{
"epoch": 18.05359565807327,
"grad_norm": 4.338520050048828,
"learning_rate": 8.196011134535416e-05,
"loss": 0.369,
"step": 26611
},
{
"epoch": 18.113975576662146,
"grad_norm": 4.328836441040039,
"learning_rate": 8.183461567320662e-05,
"loss": 0.3939,
"step": 26700
},
{
"epoch": 18.174355495251017,
"grad_norm": 3.7861499786376953,
"learning_rate": 8.170878182243792e-05,
"loss": 0.3841,
"step": 26789
},
{
"epoch": 18.234735413839893,
"grad_norm": 4.84774112701416,
"learning_rate": 8.158261112977913e-05,
"loss": 0.3702,
"step": 26878
},
{
"epoch": 18.295115332428765,
"grad_norm": 7.082802772521973,
"learning_rate": 8.145610493553948e-05,
"loss": 0.4059,
"step": 26967
},
{
"epoch": 18.35549525101764,
"grad_norm": 2.84909987449646,
"learning_rate": 8.13292645835923e-05,
"loss": 0.41,
"step": 27056
},
{
"epoch": 18.41587516960651,
"grad_norm": 4.116001605987549,
"learning_rate": 8.120209142136065e-05,
"loss": 0.4014,
"step": 27145
},
{
"epoch": 18.476255088195387,
"grad_norm": 4.0977783203125,
"learning_rate": 8.107458679980302e-05,
"loss": 0.4041,
"step": 27234
},
{
"epoch": 18.53663500678426,
"grad_norm": 9.48543930053711,
"learning_rate": 8.0946752073399e-05,
"loss": 0.3979,
"step": 27323
},
{
"epoch": 18.597014925373134,
"grad_norm": 3.692593574523926,
"learning_rate": 8.081858860013488e-05,
"loss": 0.4034,
"step": 27412
},
{
"epoch": 18.65739484396201,
"grad_norm": 3.500662326812744,
"learning_rate": 8.069009774148923e-05,
"loss": 0.3884,
"step": 27501
},
{
"epoch": 18.71777476255088,
"grad_norm": 3.7085442543029785,
"learning_rate": 8.056128086241841e-05,
"loss": 0.3829,
"step": 27590
},
{
"epoch": 18.778154681139757,
"grad_norm": 4.753846168518066,
"learning_rate": 8.043213933134208e-05,
"loss": 0.4079,
"step": 27679
},
{
"epoch": 18.83853459972863,
"grad_norm": 3.4297168254852295,
"learning_rate": 8.030267452012872e-05,
"loss": 0.3934,
"step": 27768
},
{
"epoch": 18.898914518317504,
"grad_norm": 5.62887716293335,
"learning_rate": 8.017288780408096e-05,
"loss": 0.4036,
"step": 27857
},
{
"epoch": 18.959294436906376,
"grad_norm": 3.0904860496520996,
"learning_rate": 8.004278056192107e-05,
"loss": 0.3933,
"step": 27946
},
{
"epoch": 19.01967435549525,
"grad_norm": 4.35064697265625,
"learning_rate": 7.991235417577621e-05,
"loss": 0.3759,
"step": 28035
},
{
"epoch": 19.080054274084127,
"grad_norm": 5.101808547973633,
"learning_rate": 7.978161003116382e-05,
"loss": 0.3693,
"step": 28124
},
{
"epoch": 19.140434192673,
"grad_norm": 4.391759395599365,
"learning_rate": 7.96505495169769e-05,
"loss": 0.3472,
"step": 28213
},
{
"epoch": 19.200814111261874,
"grad_norm": 4.793941974639893,
"learning_rate": 7.951917402546926e-05,
"loss": 0.3551,
"step": 28302
},
{
"epoch": 19.202170963364992,
"eval_accuracy": 0.19140625,
"eval_loss": 3.9524879455566406,
"eval_runtime": 19.8893,
"eval_samples_per_second": 25.742,
"eval_steps_per_second": 0.201,
"step": 28304
},
{
"epoch": 19.261194029850746,
"grad_norm": 3.726491689682007,
"learning_rate": 7.938748495224061e-05,
"loss": 0.3555,
"step": 28391
},
{
"epoch": 19.32157394843962,
"grad_norm": 3.4001190662384033,
"learning_rate": 7.925548369622199e-05,
"loss": 0.361,
"step": 28480
},
{
"epoch": 19.381953867028493,
"grad_norm": 4.480808258056641,
"learning_rate": 7.912317165966059e-05,
"loss": 0.3656,
"step": 28569
},
{
"epoch": 19.442333785617368,
"grad_norm": 3.043093681335449,
"learning_rate": 7.899055024810511e-05,
"loss": 0.3819,
"step": 28658
},
{
"epoch": 19.50271370420624,
"grad_norm": 3.813091516494751,
"learning_rate": 7.885762087039075e-05,
"loss": 0.3939,
"step": 28747
},
{
"epoch": 19.563093622795115,
"grad_norm": 4.2613525390625,
"learning_rate": 7.872438493862415e-05,
"loss": 0.353,
"step": 28836
},
{
"epoch": 19.62347354138399,
"grad_norm": 2.884284734725952,
"learning_rate": 7.859084386816854e-05,
"loss": 0.3696,
"step": 28925
},
{
"epoch": 19.683853459972863,
"grad_norm": 6.607941627502441,
"learning_rate": 7.845699907762862e-05,
"loss": 0.3869,
"step": 29014
},
{
"epoch": 19.744233378561738,
"grad_norm": 6.069945335388184,
"learning_rate": 7.832285198883548e-05,
"loss": 0.3688,
"step": 29103
},
{
"epoch": 19.80461329715061,
"grad_norm": 2.9537928104400635,
"learning_rate": 7.818840402683151e-05,
"loss": 0.3624,
"step": 29192
},
{
"epoch": 19.864993215739485,
"grad_norm": 4.354130268096924,
"learning_rate": 7.805365661985535e-05,
"loss": 0.3589,
"step": 29281
},
{
"epoch": 19.925373134328357,
"grad_norm": 3.5923469066619873,
"learning_rate": 7.791861119932652e-05,
"loss": 0.3432,
"step": 29370
},
{
"epoch": 19.985753052917232,
"grad_norm": 3.5997955799102783,
"learning_rate": 7.778326919983046e-05,
"loss": 0.3611,
"step": 29459
},
{
"epoch": 20.046132971506104,
"grad_norm": 2.281196355819702,
"learning_rate": 7.764763205910304e-05,
"loss": 0.3296,
"step": 29548
},
{
"epoch": 20.10651289009498,
"grad_norm": 7.429330348968506,
"learning_rate": 7.75117012180155e-05,
"loss": 0.34,
"step": 29637
},
{
"epoch": 20.166892808683855,
"grad_norm": 7.913335800170898,
"learning_rate": 7.737547812055901e-05,
"loss": 0.3428,
"step": 29726
},
{
"epoch": 20.227272727272727,
"grad_norm": 2.8572380542755127,
"learning_rate": 7.723896421382942e-05,
"loss": 0.3394,
"step": 29815
},
{
"epoch": 20.287652645861602,
"grad_norm": 2.90544056892395,
"learning_rate": 7.710216094801179e-05,
"loss": 0.3322,
"step": 29904
},
{
"epoch": 20.348032564450474,
"grad_norm": 7.801008224487305,
"learning_rate": 7.696506977636506e-05,
"loss": 0.343,
"step": 29993
},
{
"epoch": 20.40841248303935,
"grad_norm": 4.56928014755249,
"learning_rate": 7.682769215520658e-05,
"loss": 0.3513,
"step": 30082
},
{
"epoch": 20.46879240162822,
"grad_norm": 3.2972512245178223,
"learning_rate": 7.669002954389668e-05,
"loss": 0.3361,
"step": 30171
},
{
"epoch": 20.529172320217096,
"grad_norm": 2.6529455184936523,
"learning_rate": 7.65520834048231e-05,
"loss": 0.3481,
"step": 30260
},
{
"epoch": 20.58955223880597,
"grad_norm": 2.281811475753784,
"learning_rate": 7.641385520338551e-05,
"loss": 0.3439,
"step": 30349
},
{
"epoch": 20.649932157394844,
"grad_norm": 5.415365695953369,
"learning_rate": 7.627534640797991e-05,
"loss": 0.3426,
"step": 30438
},
{
"epoch": 20.71031207598372,
"grad_norm": 4.79844856262207,
"learning_rate": 7.613655848998305e-05,
"loss": 0.3237,
"step": 30527
},
{
"epoch": 20.77069199457259,
"grad_norm": 4.5184855461120605,
"learning_rate": 7.599749292373679e-05,
"loss": 0.3433,
"step": 30616
},
{
"epoch": 20.831071913161466,
"grad_norm": 3.099209785461426,
"learning_rate": 7.585815118653248e-05,
"loss": 0.329,
"step": 30705
},
{
"epoch": 20.891451831750338,
"grad_norm": 2.415534257888794,
"learning_rate": 7.571853475859519e-05,
"loss": 0.3377,
"step": 30794
},
{
"epoch": 20.951831750339213,
"grad_norm": 4.010440349578857,
"learning_rate": 7.557864512306802e-05,
"loss": 0.3375,
"step": 30883
},
{
"epoch": 21.012211668928085,
"grad_norm": 3.8156368732452393,
"learning_rate": 7.543848376599637e-05,
"loss": 0.3216,
"step": 30972
},
{
"epoch": 21.07259158751696,
"grad_norm": 8.568528175354004,
"learning_rate": 7.529805217631214e-05,
"loss": 0.3043,
"step": 31061
},
{
"epoch": 21.132971506105836,
"grad_norm": 5.376992225646973,
"learning_rate": 7.515735184581791e-05,
"loss": 0.3175,
"step": 31150
},
{
"epoch": 21.193351424694708,
"grad_norm": 2.6105728149414062,
"learning_rate": 7.501638426917106e-05,
"loss": 0.3105,
"step": 31239
},
{
"epoch": 21.253731343283583,
"grad_norm": 2.6053969860076904,
"learning_rate": 7.487515094386792e-05,
"loss": 0.3002,
"step": 31328
},
{
"epoch": 21.314111261872455,
"grad_norm": 2.5073657035827637,
"learning_rate": 7.473365337022791e-05,
"loss": 0.3172,
"step": 31417
},
{
"epoch": 21.37449118046133,
"grad_norm": 2.63193941116333,
"learning_rate": 7.459189305137751e-05,
"loss": 0.3183,
"step": 31506
},
{
"epoch": 21.434871099050202,
"grad_norm": 2.9518582820892334,
"learning_rate": 7.444987149323435e-05,
"loss": 0.302,
"step": 31595
},
{
"epoch": 21.495251017639077,
"grad_norm": 3.682440757751465,
"learning_rate": 7.430759020449123e-05,
"loss": 0.3106,
"step": 31684
},
{
"epoch": 21.555630936227953,
"grad_norm": 3.564025402069092,
"learning_rate": 7.416505069660003e-05,
"loss": 0.3114,
"step": 31773
},
{
"epoch": 21.602442333785618,
"eval_accuracy": 0.1953125,
"eval_loss": 3.8780479431152344,
"eval_runtime": 40.0838,
"eval_samples_per_second": 12.773,
"eval_steps_per_second": 0.1,
"step": 31842
},
{
"epoch": 21.616010854816825,
"grad_norm": 4.089629173278809,
"learning_rate": 7.402225448375569e-05,
"loss": 0.3152,
"step": 31862
},
{
"epoch": 21.6763907734057,
"grad_norm": 4.699454307556152,
"learning_rate": 7.387920308288014e-05,
"loss": 0.3094,
"step": 31951
},
{
"epoch": 21.736770691994572,
"grad_norm": 3.2713539600372314,
"learning_rate": 7.373589801360616e-05,
"loss": 0.3276,
"step": 32040
},
{
"epoch": 21.797150610583447,
"grad_norm": 1.9568812847137451,
"learning_rate": 7.359234079826123e-05,
"loss": 0.3181,
"step": 32129
},
{
"epoch": 21.85753052917232,
"grad_norm": 2.7409889698028564,
"learning_rate": 7.344853296185141e-05,
"loss": 0.3023,
"step": 32218
},
{
"epoch": 21.917910447761194,
"grad_norm": 2.9756550788879395,
"learning_rate": 7.330447603204507e-05,
"loss": 0.3162,
"step": 32307
},
{
"epoch": 21.978290366350066,
"grad_norm": 3.314568281173706,
"learning_rate": 7.316017153915671e-05,
"loss": 0.2991,
"step": 32396
},
{
"epoch": 22.03867028493894,
"grad_norm": 4.315303802490234,
"learning_rate": 7.301562101613068e-05,
"loss": 0.305,
"step": 32485
},
{
"epoch": 22.099050203527817,
"grad_norm": 4.505661487579346,
"learning_rate": 7.287082599852493e-05,
"loss": 0.2807,
"step": 32574
},
{
"epoch": 22.15943012211669,
"grad_norm": 3.841827392578125,
"learning_rate": 7.272578802449464e-05,
"loss": 0.2742,
"step": 32663
},
{
"epoch": 22.219810040705564,
"grad_norm": 4.61216926574707,
"learning_rate": 7.25805086347759e-05,
"loss": 0.2994,
"step": 32752
},
{
"epoch": 22.280189959294436,
"grad_norm": 2.9822754859924316,
"learning_rate": 7.243498937266943e-05,
"loss": 0.2854,
"step": 32841
},
{
"epoch": 22.34056987788331,
"grad_norm": 3.7797086238861084,
"learning_rate": 7.228923178402403e-05,
"loss": 0.2967,
"step": 32930
},
{
"epoch": 22.400949796472183,
"grad_norm": 2.8511717319488525,
"learning_rate": 7.214323741722027e-05,
"loss": 0.2772,
"step": 33019
},
{
"epoch": 22.46132971506106,
"grad_norm": 2.439438581466675,
"learning_rate": 7.199700782315403e-05,
"loss": 0.2957,
"step": 33108
},
{
"epoch": 22.521709633649934,
"grad_norm": 2.507317066192627,
"learning_rate": 7.185054455521994e-05,
"loss": 0.2883,
"step": 33197
},
{
"epoch": 22.582089552238806,
"grad_norm": 2.963704824447632,
"learning_rate": 7.170384916929504e-05,
"loss": 0.2892,
"step": 33286
},
{
"epoch": 22.64246947082768,
"grad_norm": 3.137892007827759,
"learning_rate": 7.155692322372208e-05,
"loss": 0.2936,
"step": 33375
},
{
"epoch": 22.702849389416553,
"grad_norm": 2.860560178756714,
"learning_rate": 7.140976827929308e-05,
"loss": 0.2719,
"step": 33464
},
{
"epoch": 22.763229308005428,
"grad_norm": 3.778202533721924,
"learning_rate": 7.126238589923269e-05,
"loss": 0.2909,
"step": 33553
},
{
"epoch": 22.8236092265943,
"grad_norm": 8.442693710327148,
"learning_rate": 7.111477764918159e-05,
"loss": 0.2957,
"step": 33642
},
{
"epoch": 22.883989145183175,
"grad_norm": 2.855881452560425,
"learning_rate": 7.096694509717994e-05,
"loss": 0.2893,
"step": 33731
},
{
"epoch": 22.944369063772047,
"grad_norm": 3.649304151535034,
"learning_rate": 7.081888981365062e-05,
"loss": 0.3019,
"step": 33820
},
{
"epoch": 23.004748982360923,
"grad_norm": 3.577422857284546,
"learning_rate": 7.067061337138249e-05,
"loss": 0.2794,
"step": 33909
},
{
"epoch": 23.065128900949798,
"grad_norm": 3.4041476249694824,
"learning_rate": 7.052211734551398e-05,
"loss": 0.2653,
"step": 33998
},
{
"epoch": 23.12550881953867,
"grad_norm": 3.21398663520813,
"learning_rate": 7.037340331351592e-05,
"loss": 0.2635,
"step": 34087
},
{
"epoch": 23.185888738127545,
"grad_norm": 3.606840133666992,
"learning_rate": 7.022447285517522e-05,
"loss": 0.2612,
"step": 34176
},
{
"epoch": 23.246268656716417,
"grad_norm": 3.4414963722229004,
"learning_rate": 7.007532755257776e-05,
"loss": 0.2621,
"step": 34265
},
{
"epoch": 23.306648575305292,
"grad_norm": 3.429677724838257,
"learning_rate": 6.992596899009174e-05,
"loss": 0.2627,
"step": 34354
},
{
"epoch": 23.367028493894164,
"grad_norm": 2.394657850265503,
"learning_rate": 6.977639875435082e-05,
"loss": 0.2651,
"step": 34443
},
{
"epoch": 23.42740841248304,
"grad_norm": 3.796799421310425,
"learning_rate": 6.962661843423725e-05,
"loss": 0.2575,
"step": 34532
},
{
"epoch": 23.487788331071915,
"grad_norm": 1.8303537368774414,
"learning_rate": 6.947662962086506e-05,
"loss": 0.2656,
"step": 34621
},
{
"epoch": 23.548168249660787,
"grad_norm": 5.206216335296631,
"learning_rate": 6.932643390756298e-05,
"loss": 0.2789,
"step": 34710
},
{
"epoch": 23.608548168249662,
"grad_norm": 2.8069159984588623,
"learning_rate": 6.917603288985775e-05,
"loss": 0.2679,
"step": 34799
},
{
"epoch": 23.668928086838534,
"grad_norm": 2.3087520599365234,
"learning_rate": 6.902542816545701e-05,
"loss": 0.2625,
"step": 34888
},
{
"epoch": 23.72930800542741,
"grad_norm": 3.139498472213745,
"learning_rate": 6.887462133423237e-05,
"loss": 0.2722,
"step": 34977
},
{
"epoch": 23.78968792401628,
"grad_norm": 2.9781806468963623,
"learning_rate": 6.872361399820245e-05,
"loss": 0.2633,
"step": 35066
},
{
"epoch": 23.850067842605156,
"grad_norm": 3.456528425216675,
"learning_rate": 6.857240776151576e-05,
"loss": 0.2767,
"step": 35155
},
{
"epoch": 23.91044776119403,
"grad_norm": 2.8766520023345947,
"learning_rate": 6.842100423043381e-05,
"loss": 0.2655,
"step": 35244
},
{
"epoch": 23.970827679782904,
"grad_norm": 2.811938524246216,
"learning_rate": 6.826940501331391e-05,
"loss": 0.26,
"step": 35333
},
{
"epoch": 24.00271370420624,
"eval_accuracy": 0.1875,
"eval_loss": 3.9947586059570312,
"eval_runtime": 23.8781,
"eval_samples_per_second": 21.442,
"eval_steps_per_second": 0.168,
"step": 35380
},
{
"epoch": 24.03120759837178,
"grad_norm": 3.4529502391815186,
"learning_rate": 6.811761172059213e-05,
"loss": 0.2424,
"step": 35422
},
{
"epoch": 24.09158751696065,
"grad_norm": 7.157485485076904,
"learning_rate": 6.796562596476629e-05,
"loss": 0.2328,
"step": 35511
},
{
"epoch": 24.151967435549526,
"grad_norm": 2.098388433456421,
"learning_rate": 6.781344936037864e-05,
"loss": 0.2368,
"step": 35600
},
{
"epoch": 24.212347354138398,
"grad_norm": 2.5846946239471436,
"learning_rate": 6.766108352399885e-05,
"loss": 0.252,
"step": 35689
},
{
"epoch": 24.272727272727273,
"grad_norm": 3.213495969772339,
"learning_rate": 6.750853007420684e-05,
"loss": 0.2563,
"step": 35778
},
{
"epoch": 24.333107191316145,
"grad_norm": 5.0729498863220215,
"learning_rate": 6.735579063157545e-05,
"loss": 0.2623,
"step": 35867
},
{
"epoch": 24.39348710990502,
"grad_norm": 2.973792791366577,
"learning_rate": 6.720286681865339e-05,
"loss": 0.2558,
"step": 35956
},
{
"epoch": 24.453867028493896,
"grad_norm": 1.9252829551696777,
"learning_rate": 6.704976025994796e-05,
"loss": 0.2486,
"step": 36045
},
{
"epoch": 24.514246947082768,
"grad_norm": 3.5804240703582764,
"learning_rate": 6.689647258190768e-05,
"loss": 0.2493,
"step": 36134
},
{
"epoch": 24.574626865671643,
"grad_norm": 3.92348575592041,
"learning_rate": 6.674300541290517e-05,
"loss": 0.2447,
"step": 36223
},
{
"epoch": 24.635006784260515,
"grad_norm": 2.7622110843658447,
"learning_rate": 6.658936038321971e-05,
"loss": 0.2381,
"step": 36312
},
{
"epoch": 24.69538670284939,
"grad_norm": 2.5953946113586426,
"learning_rate": 6.643553912502007e-05,
"loss": 0.2467,
"step": 36401
},
{
"epoch": 24.755766621438262,
"grad_norm": 2.8284683227539062,
"learning_rate": 6.628154327234704e-05,
"loss": 0.2435,
"step": 36490
},
{
"epoch": 24.816146540027137,
"grad_norm": 2.8667030334472656,
"learning_rate": 6.612737446109614e-05,
"loss": 0.2476,
"step": 36579
},
{
"epoch": 24.87652645861601,
"grad_norm": 2.5920257568359375,
"learning_rate": 6.597303432900021e-05,
"loss": 0.248,
"step": 36668
},
{
"epoch": 24.936906377204885,
"grad_norm": 3.2936460971832275,
"learning_rate": 6.581852451561207e-05,
"loss": 0.2545,
"step": 36757
},
{
"epoch": 24.99728629579376,
"grad_norm": 2.2897655963897705,
"learning_rate": 6.5663846662287e-05,
"loss": 0.2405,
"step": 36846
},
{
"epoch": 25.057666214382632,
"grad_norm": 2.2279489040374756,
"learning_rate": 6.550900241216545e-05,
"loss": 0.2235,
"step": 36935
},
{
"epoch": 25.118046132971507,
"grad_norm": 1.6091116666793823,
"learning_rate": 6.535399341015543e-05,
"loss": 0.2345,
"step": 37024
},
{
"epoch": 25.17842605156038,
"grad_norm": 2.490220308303833,
"learning_rate": 6.51988213029151e-05,
"loss": 0.2264,
"step": 37113
},
{
"epoch": 25.238805970149254,
"grad_norm": 2.3575713634490967,
"learning_rate": 6.504348773883534e-05,
"loss": 0.2384,
"step": 37202
},
{
"epoch": 25.299185888738126,
"grad_norm": 2.0898985862731934,
"learning_rate": 6.488799436802216e-05,
"loss": 0.2332,
"step": 37291
},
{
"epoch": 25.359565807327,
"grad_norm": 4.023237705230713,
"learning_rate": 6.473234284227919e-05,
"loss": 0.2186,
"step": 37380
},
{
"epoch": 25.419945725915873,
"grad_norm": 1.7770565748214722,
"learning_rate": 6.45765348150901e-05,
"loss": 0.2318,
"step": 37469
},
{
"epoch": 25.48032564450475,
"grad_norm": 3.1752917766571045,
"learning_rate": 6.442057194160116e-05,
"loss": 0.2234,
"step": 37558
},
{
"epoch": 25.540705563093624,
"grad_norm": 3.1734275817871094,
"learning_rate": 6.42644558786035e-05,
"loss": 0.2388,
"step": 37647
},
{
"epoch": 25.601085481682496,
"grad_norm": 3.916975259780884,
"learning_rate": 6.410818828451557e-05,
"loss": 0.227,
"step": 37736
},
{
"epoch": 25.66146540027137,
"grad_norm": 2.7766647338867188,
"learning_rate": 6.395177081936562e-05,
"loss": 0.23,
"step": 37825
},
{
"epoch": 25.721845318860243,
"grad_norm": 3.657627820968628,
"learning_rate": 6.379520514477388e-05,
"loss": 0.2329,
"step": 37914
},
{
"epoch": 25.78222523744912,
"grad_norm": 4.11094331741333,
"learning_rate": 6.363849292393507e-05,
"loss": 0.2241,
"step": 38003
},
{
"epoch": 25.84260515603799,
"grad_norm": 2.6179704666137695,
"learning_rate": 6.348163582160062e-05,
"loss": 0.2268,
"step": 38092
},
{
"epoch": 25.902985074626866,
"grad_norm": 3.4568240642547607,
"learning_rate": 6.332463550406107e-05,
"loss": 0.2197,
"step": 38181
},
{
"epoch": 25.96336499321574,
"grad_norm": 1.789491057395935,
"learning_rate": 6.316749363912833e-05,
"loss": 0.2087,
"step": 38270
},
{
"epoch": 26.023744911804613,
"grad_norm": 2.606367588043213,
"learning_rate": 6.301021189611793e-05,
"loss": 0.2153,
"step": 38359
},
{
"epoch": 26.08412483039349,
"grad_norm": 2.6728904247283936,
"learning_rate": 6.28527919458314e-05,
"loss": 0.2043,
"step": 38448
},
{
"epoch": 26.14450474898236,
"grad_norm": 2.2943668365478516,
"learning_rate": 6.269523546053832e-05,
"loss": 0.2123,
"step": 38537
},
{
"epoch": 26.204884667571235,
"grad_norm": 3.1198699474334717,
"learning_rate": 6.253754411395882e-05,
"loss": 0.2128,
"step": 38626
},
{
"epoch": 26.265264586160107,
"grad_norm": 2.269235372543335,
"learning_rate": 6.237971958124559e-05,
"loss": 0.2213,
"step": 38715
},
{
"epoch": 26.325644504748983,
"grad_norm": 3.090557098388672,
"learning_rate": 6.22217635389661e-05,
"loss": 0.2253,
"step": 38804
},
{
"epoch": 26.386024423337854,
"grad_norm": 4.030007839202881,
"learning_rate": 6.206367766508497e-05,
"loss": 0.2104,
"step": 38893
},
{
"epoch": 26.402985074626866,
"eval_accuracy": 0.181640625,
"eval_loss": 4.0000152587890625,
"eval_runtime": 20.3217,
"eval_samples_per_second": 25.195,
"eval_steps_per_second": 0.197,
"step": 38918
},
{
"epoch": 26.44640434192673,
"grad_norm": 3.711073637008667,
"learning_rate": 6.190546363894589e-05,
"loss": 0.2019,
"step": 38982
},
{
"epoch": 26.506784260515605,
"grad_norm": 4.125629901885986,
"learning_rate": 6.1747123141254e-05,
"loss": 0.218,
"step": 39071
},
{
"epoch": 26.567164179104477,
"grad_norm": 2.719214916229248,
"learning_rate": 6.158865785405792e-05,
"loss": 0.2138,
"step": 39160
},
{
"epoch": 26.627544097693352,
"grad_norm": 5.083952903747559,
"learning_rate": 6.143006946073187e-05,
"loss": 0.2098,
"step": 39249
},
{
"epoch": 26.687924016282224,
"grad_norm": 2.9340269565582275,
"learning_rate": 6.127135964595789e-05,
"loss": 0.2004,
"step": 39338
},
{
"epoch": 26.7483039348711,
"grad_norm": 1.930010437965393,
"learning_rate": 6.111253009570781e-05,
"loss": 0.2212,
"step": 39427
},
{
"epoch": 26.80868385345997,
"grad_norm": 3.872161388397217,
"learning_rate": 6.095358249722548e-05,
"loss": 0.2116,
"step": 39516
},
{
"epoch": 26.869063772048847,
"grad_norm": 2.4142067432403564,
"learning_rate": 6.0794518539008716e-05,
"loss": 0.2223,
"step": 39605
},
{
"epoch": 26.929443690637722,
"grad_norm": 2.2030022144317627,
"learning_rate": 6.063533991079143e-05,
"loss": 0.2155,
"step": 39694
},
{
"epoch": 26.989823609226594,
"grad_norm": 3.7845208644866943,
"learning_rate": 6.0476048303525725e-05,
"loss": 0.2177,
"step": 39783
},
{
"epoch": 27.05020352781547,
"grad_norm": 2.8146162033081055,
"learning_rate": 6.0316645409363794e-05,
"loss": 0.1945,
"step": 39872
},
{
"epoch": 27.11058344640434,
"grad_norm": 2.4782633781433105,
"learning_rate": 6.015713292164008e-05,
"loss": 0.1978,
"step": 39961
},
{
"epoch": 27.170963364993217,
"grad_norm": 1.6334956884384155,
"learning_rate": 5.999751253485325e-05,
"loss": 0.1949,
"step": 40050
},
{
"epoch": 27.23134328358209,
"grad_norm": 3.610597610473633,
"learning_rate": 5.983778594464814e-05,
"loss": 0.1997,
"step": 40139
},
{
"epoch": 27.291723202170964,
"grad_norm": 3.140693426132202,
"learning_rate": 5.967795484779781e-05,
"loss": 0.1905,
"step": 40228
},
{
"epoch": 27.352103120759836,
"grad_norm": 2.56771183013916,
"learning_rate": 5.9518020942185494e-05,
"loss": 0.1893,
"step": 40317
},
{
"epoch": 27.41248303934871,
"grad_norm": 1.921730875968933,
"learning_rate": 5.935798592678653e-05,
"loss": 0.1972,
"step": 40406
},
{
"epoch": 27.472862957937586,
"grad_norm": 2.7568604946136475,
"learning_rate": 5.91978515016504e-05,
"loss": 0.2038,
"step": 40495
},
{
"epoch": 27.533242876526458,
"grad_norm": 3.526125192642212,
"learning_rate": 5.903761936788255e-05,
"loss": 0.1881,
"step": 40584
},
{
"epoch": 27.593622795115333,
"grad_norm": 2.400557279586792,
"learning_rate": 5.887729122762644e-05,
"loss": 0.1908,
"step": 40673
},
{
"epoch": 27.654002713704205,
"grad_norm": 2.814988374710083,
"learning_rate": 5.8716868784045374e-05,
"loss": 0.1946,
"step": 40762
},
{
"epoch": 27.71438263229308,
"grad_norm": 3.351440906524658,
"learning_rate": 5.855635374130442e-05,
"loss": 0.199,
"step": 40851
},
{
"epoch": 27.774762550881952,
"grad_norm": 3.108304023742676,
"learning_rate": 5.839574780455239e-05,
"loss": 0.2009,
"step": 40940
},
{
"epoch": 27.835142469470828,
"grad_norm": 3.37080979347229,
"learning_rate": 5.823505267990359e-05,
"loss": 0.1929,
"step": 41029
},
{
"epoch": 27.895522388059703,
"grad_norm": 2.852602005004883,
"learning_rate": 5.807427007441981e-05,
"loss": 0.1946,
"step": 41118
},
{
"epoch": 27.955902306648575,
"grad_norm": 2.15985369682312,
"learning_rate": 5.791340169609214e-05,
"loss": 0.1997,
"step": 41207
},
{
"epoch": 28.01628222523745,
"grad_norm": 0.9773418307304382,
"learning_rate": 5.7752449253822815e-05,
"loss": 0.1789,
"step": 41296
},
{
"epoch": 28.076662143826322,
"grad_norm": 2.572413444519043,
"learning_rate": 5.759141445740713e-05,
"loss": 0.1816,
"step": 41385
},
{
"epoch": 28.137042062415198,
"grad_norm": 1.8453723192214966,
"learning_rate": 5.7430299017515166e-05,
"loss": 0.1795,
"step": 41474
},
{
"epoch": 28.19742198100407,
"grad_norm": 2.09143328666687,
"learning_rate": 5.726910464567371e-05,
"loss": 0.1748,
"step": 41563
},
{
"epoch": 28.257801899592945,
"grad_norm": 4.368978977203369,
"learning_rate": 5.710783305424804e-05,
"loss": 0.1865,
"step": 41652
},
{
"epoch": 28.318181818181817,
"grad_norm": 1.7974387407302856,
"learning_rate": 5.694648595642372e-05,
"loss": 0.1878,
"step": 41741
},
{
"epoch": 28.378561736770692,
"grad_norm": 3.7262039184570312,
"learning_rate": 5.6785065066188446e-05,
"loss": 0.1873,
"step": 41830
},
{
"epoch": 28.438941655359567,
"grad_norm": 5.437527656555176,
"learning_rate": 5.662357209831378e-05,
"loss": 0.1958,
"step": 41919
},
{
"epoch": 28.49932157394844,
"grad_norm": 2.2726356983184814,
"learning_rate": 5.646200876833699e-05,
"loss": 0.1818,
"step": 42008
},
{
"epoch": 28.559701492537314,
"grad_norm": 3.234407663345337,
"learning_rate": 5.630037679254278e-05,
"loss": 0.1893,
"step": 42097
},
{
"epoch": 28.620081411126186,
"grad_norm": 2.5418026447296143,
"learning_rate": 5.613867788794508e-05,
"loss": 0.185,
"step": 42186
},
{
"epoch": 28.68046132971506,
"grad_norm": 2.394573926925659,
"learning_rate": 5.5976913772268823e-05,
"loss": 0.1803,
"step": 42275
},
{
"epoch": 28.740841248303933,
"grad_norm": 2.4603261947631836,
"learning_rate": 5.581508616393165e-05,
"loss": 0.1793,
"step": 42364
},
{
"epoch": 28.80122116689281,
"grad_norm": 3.139146566390991,
"learning_rate": 5.5653196782025696e-05,
"loss": 0.1797,
"step": 42453
},
{
"epoch": 28.80325644504749,
"eval_accuracy": 0.18359375,
"eval_loss": 4.114618301391602,
"eval_runtime": 29.6243,
"eval_samples_per_second": 17.283,
"eval_steps_per_second": 0.135,
"step": 42456
},
{
"epoch": 28.86160108548168,
"grad_norm": 1.7056379318237305,
"learning_rate": 5.5491247346299334e-05,
"loss": 0.1811,
"step": 42542
},
{
"epoch": 28.921981004070556,
"grad_norm": 1.6604520082473755,
"learning_rate": 5.532923957713885e-05,
"loss": 0.1751,
"step": 42631
},
{
"epoch": 28.98236092265943,
"grad_norm": 2.6219496726989746,
"learning_rate": 5.5167175195550235e-05,
"loss": 0.1814,
"step": 42720
},
{
"epoch": 29.042740841248303,
"grad_norm": 1.9368810653686523,
"learning_rate": 5.500505592314086e-05,
"loss": 0.1732,
"step": 42809
},
{
"epoch": 29.10312075983718,
"grad_norm": 2.2955291271209717,
"learning_rate": 5.484288348210121e-05,
"loss": 0.1611,
"step": 42898
},
{
"epoch": 29.16350067842605,
"grad_norm": 1.331339716911316,
"learning_rate": 5.468065959518656e-05,
"loss": 0.1682,
"step": 42987
},
{
"epoch": 29.223880597014926,
"grad_norm": 1.5474261045455933,
"learning_rate": 5.4518385985698714e-05,
"loss": 0.1652,
"step": 43076
},
{
"epoch": 29.284260515603798,
"grad_norm": 1.5148978233337402,
"learning_rate": 5.4356064377467684e-05,
"loss": 0.18,
"step": 43165
},
{
"epoch": 29.344640434192673,
"grad_norm": 5.3867878913879395,
"learning_rate": 5.4193696494833346e-05,
"loss": 0.1693,
"step": 43254
},
{
"epoch": 29.40502035278155,
"grad_norm": 2.20180082321167,
"learning_rate": 5.4031284062627165e-05,
"loss": 0.1599,
"step": 43343
},
{
"epoch": 29.46540027137042,
"grad_norm": 2.1975841522216797,
"learning_rate": 5.386882880615383e-05,
"loss": 0.171,
"step": 43432
},
{
"epoch": 29.525780189959296,
"grad_norm": 2.5658628940582275,
"learning_rate": 5.3706332451173006e-05,
"loss": 0.1714,
"step": 43521
},
{
"epoch": 29.586160108548167,
"grad_norm": 2.7179007530212402,
"learning_rate": 5.354379672388089e-05,
"loss": 0.1713,
"step": 43610
},
{
"epoch": 29.646540027137043,
"grad_norm": 1.867160677909851,
"learning_rate": 5.338122335089196e-05,
"loss": 0.1684,
"step": 43699
},
{
"epoch": 29.706919945725915,
"grad_norm": 1.5570918321609497,
"learning_rate": 5.321861405922063e-05,
"loss": 0.1713,
"step": 43788
},
{
"epoch": 29.76729986431479,
"grad_norm": 3.943268060684204,
"learning_rate": 5.305597057626279e-05,
"loss": 0.1714,
"step": 43877
},
{
"epoch": 29.827679782903665,
"grad_norm": 1.6523535251617432,
"learning_rate": 5.2893294629777644e-05,
"loss": 0.1754,
"step": 43966
},
{
"epoch": 29.888059701492537,
"grad_norm": 2.623303174972534,
"learning_rate": 5.273058794786918e-05,
"loss": 0.1724,
"step": 44055
},
{
"epoch": 29.948439620081412,
"grad_norm": 1.8316419124603271,
"learning_rate": 5.256785225896794e-05,
"loss": 0.17,
"step": 44144
},
{
"epoch": 30.008819538670284,
"grad_norm": 2.2553136348724365,
"learning_rate": 5.240508929181258e-05,
"loss": 0.1766,
"step": 44233
},
{
"epoch": 30.06919945725916,
"grad_norm": 1.168664574623108,
"learning_rate": 5.224230077543153e-05,
"loss": 0.1523,
"step": 44322
},
{
"epoch": 30.12957937584803,
"grad_norm": 1.4800312519073486,
"learning_rate": 5.2079488439124644e-05,
"loss": 0.1553,
"step": 44411
},
{
"epoch": 30.189959294436907,
"grad_norm": 1.983797550201416,
"learning_rate": 5.1916654012444796e-05,
"loss": 0.1605,
"step": 44500
},
{
"epoch": 30.25033921302578,
"grad_norm": 1.0819350481033325,
"learning_rate": 5.1753799225179545e-05,
"loss": 0.1592,
"step": 44589
},
{
"epoch": 30.310719131614654,
"grad_norm": 2.143650531768799,
"learning_rate": 5.159092580733276e-05,
"loss": 0.1608,
"step": 44678
},
{
"epoch": 30.37109905020353,
"grad_norm": 1.6740977764129639,
"learning_rate": 5.142803548910614e-05,
"loss": 0.1591,
"step": 44767
},
{
"epoch": 30.4314789687924,
"grad_norm": 2.7269814014434814,
"learning_rate": 5.126513000088101e-05,
"loss": 0.1624,
"step": 44856
},
{
"epoch": 30.491858887381277,
"grad_norm": 2.9973506927490234,
"learning_rate": 5.1102211073199805e-05,
"loss": 0.1597,
"step": 44945
},
{
"epoch": 30.55223880597015,
"grad_norm": 1.9262616634368896,
"learning_rate": 5.093928043674772e-05,
"loss": 0.1517,
"step": 45034
},
{
"epoch": 30.612618724559024,
"grad_norm": 2.577742099761963,
"learning_rate": 5.077633982233433e-05,
"loss": 0.1668,
"step": 45123
},
{
"epoch": 30.672998643147896,
"grad_norm": 1.0925939083099365,
"learning_rate": 5.061339096087523e-05,
"loss": 0.1611,
"step": 45212
},
{
"epoch": 30.73337856173677,
"grad_norm": 1.5580718517303467,
"learning_rate": 5.0450435583373624e-05,
"loss": 0.1691,
"step": 45301
},
{
"epoch": 30.793758480325643,
"grad_norm": 1.7016775608062744,
"learning_rate": 5.028747542090189e-05,
"loss": 0.1565,
"step": 45390
},
{
"epoch": 30.854138398914518,
"grad_norm": 2.930467128753662,
"learning_rate": 5.012451220458328e-05,
"loss": 0.1685,
"step": 45479
},
{
"epoch": 30.914518317503394,
"grad_norm": 2.0711212158203125,
"learning_rate": 4.996154766557351e-05,
"loss": 0.1606,
"step": 45568
},
{
"epoch": 30.974898236092265,
"grad_norm": 1.6559313535690308,
"learning_rate": 4.9798583535042254e-05,
"loss": 0.1695,
"step": 45657
},
{
"epoch": 31.03527815468114,
"grad_norm": 2.794700860977173,
"learning_rate": 4.9635621544154945e-05,
"loss": 0.1506,
"step": 45746
},
{
"epoch": 31.095658073270013,
"grad_norm": 2.3707473278045654,
"learning_rate": 4.947266342405424e-05,
"loss": 0.1474,
"step": 45835
},
{
"epoch": 31.156037991858888,
"grad_norm": 1.6921839714050293,
"learning_rate": 4.930971090584168e-05,
"loss": 0.1468,
"step": 45924
},
{
"epoch": 31.203527815468114,
"eval_accuracy": 0.181640625,
"eval_loss": 4.099109649658203,
"eval_runtime": 19.3439,
"eval_samples_per_second": 26.468,
"eval_steps_per_second": 0.207,
"step": 45994
},
{
"epoch": 31.21641791044776,
"grad_norm": 1.611038327217102,
"learning_rate": 4.91467657205593e-05,
"loss": 0.1511,
"step": 46013
},
{
"epoch": 31.276797829036635,
"grad_norm": 1.53565514087677,
"learning_rate": 4.8983829599171235e-05,
"loss": 0.1545,
"step": 46102
},
{
"epoch": 31.33717774762551,
"grad_norm": 1.6248897314071655,
"learning_rate": 4.8820904272545336e-05,
"loss": 0.1456,
"step": 46191
},
{
"epoch": 31.397557666214382,
"grad_norm": 1.318975806236267,
"learning_rate": 4.865799147143479e-05,
"loss": 0.1483,
"step": 46280
},
{
"epoch": 31.457937584803258,
"grad_norm": 1.2955539226531982,
"learning_rate": 4.8495092926459736e-05,
"loss": 0.1515,
"step": 46369
},
{
"epoch": 31.51831750339213,
"grad_norm": 3.0391619205474854,
"learning_rate": 4.833221036808882e-05,
"loss": 0.1479,
"step": 46458
},
{
"epoch": 31.578697421981005,
"grad_norm": 1.7275387048721313,
"learning_rate": 4.81693455266209e-05,
"loss": 0.1517,
"step": 46547
},
{
"epoch": 31.639077340569877,
"grad_norm": 2.1065945625305176,
"learning_rate": 4.8006500132166625e-05,
"loss": 0.1501,
"step": 46636
},
{
"epoch": 31.699457259158752,
"grad_norm": 0.9785634875297546,
"learning_rate": 4.784367591463008e-05,
"loss": 0.1501,
"step": 46725
},
{
"epoch": 31.759837177747624,
"grad_norm": 1.2350496053695679,
"learning_rate": 4.768087460369036e-05,
"loss": 0.146,
"step": 46814
},
{
"epoch": 31.8202170963365,
"grad_norm": 1.5443971157073975,
"learning_rate": 4.75180979287832e-05,
"loss": 0.1514,
"step": 46903
},
{
"epoch": 31.880597014925375,
"grad_norm": 1.1282203197479248,
"learning_rate": 4.735534761908267e-05,
"loss": 0.1478,
"step": 46992
},
{
"epoch": 31.940976933514246,
"grad_norm": 1.1595454216003418,
"learning_rate": 4.719262540348275e-05,
"loss": 0.15,
"step": 47081
},
{
"epoch": 32.00135685210312,
"grad_norm": 1.392354130744934,
"learning_rate": 4.702993301057897e-05,
"loss": 0.1402,
"step": 47170
},
{
"epoch": 32.061736770691994,
"grad_norm": 1.6813993453979492,
"learning_rate": 4.686727216865008e-05,
"loss": 0.1458,
"step": 47259
},
{
"epoch": 32.122116689280865,
"grad_norm": 2.200620174407959,
"learning_rate": 4.6704644605639617e-05,
"loss": 0.1426,
"step": 47348
},
{
"epoch": 32.182496607869744,
"grad_norm": 1.1454344987869263,
"learning_rate": 4.654205204913762e-05,
"loss": 0.1417,
"step": 47437
},
{
"epoch": 32.242876526458616,
"grad_norm": 1.6104034185409546,
"learning_rate": 4.6379496226362285e-05,
"loss": 0.1364,
"step": 47526
},
{
"epoch": 32.30325644504749,
"grad_norm": 2.7888503074645996,
"learning_rate": 4.621697886414152e-05,
"loss": 0.1415,
"step": 47615
},
{
"epoch": 32.36363636363637,
"grad_norm": 1.4862406253814697,
"learning_rate": 4.605450168889475e-05,
"loss": 0.1449,
"step": 47704
},
{
"epoch": 32.42401628222524,
"grad_norm": 1.396264672279358,
"learning_rate": 4.5892066426614426e-05,
"loss": 0.1351,
"step": 47793
},
{
"epoch": 32.48439620081411,
"grad_norm": 0.8358775973320007,
"learning_rate": 4.572967480284777e-05,
"loss": 0.1478,
"step": 47882
},
{
"epoch": 32.54477611940298,
"grad_norm": 1.150931477546692,
"learning_rate": 4.556732854267846e-05,
"loss": 0.1388,
"step": 47971
},
{
"epoch": 32.60515603799186,
"grad_norm": 1.9280314445495605,
"learning_rate": 4.540502937070826e-05,
"loss": 0.1336,
"step": 48060
},
{
"epoch": 32.66553595658073,
"grad_norm": 1.9232927560806274,
"learning_rate": 4.5242779011038746e-05,
"loss": 0.1357,
"step": 48149
},
{
"epoch": 32.725915875169605,
"grad_norm": 1.9297000169754028,
"learning_rate": 4.5080579187252875e-05,
"loss": 0.1434,
"step": 48238
},
{
"epoch": 32.786295793758484,
"grad_norm": 1.3162543773651123,
"learning_rate": 4.491843162239686e-05,
"loss": 0.1357,
"step": 48327
},
{
"epoch": 32.846675712347356,
"grad_norm": 2.112964391708374,
"learning_rate": 4.4756338038961734e-05,
"loss": 0.1347,
"step": 48416
},
{
"epoch": 32.90705563093623,
"grad_norm": 2.025836944580078,
"learning_rate": 4.459430015886507e-05,
"loss": 0.1361,
"step": 48505
},
{
"epoch": 32.9674355495251,
"grad_norm": 2.231003999710083,
"learning_rate": 4.443231970343273e-05,
"loss": 0.1493,
"step": 48594
},
{
"epoch": 33.02781546811398,
"grad_norm": 0.858778715133667,
"learning_rate": 4.427039839338051e-05,
"loss": 0.1335,
"step": 48683
},
{
"epoch": 33.08819538670285,
"grad_norm": 1.7645868062973022,
"learning_rate": 4.410853794879596e-05,
"loss": 0.1318,
"step": 48772
},
{
"epoch": 33.14857530529172,
"grad_norm": 3.9215147495269775,
"learning_rate": 4.3946740089120036e-05,
"loss": 0.1289,
"step": 48861
},
{
"epoch": 33.208955223880594,
"grad_norm": 1.0674065351486206,
"learning_rate": 4.378500653312886e-05,
"loss": 0.1314,
"step": 48950
},
{
"epoch": 33.26933514246947,
"grad_norm": 1.1910934448242188,
"learning_rate": 4.362333899891545e-05,
"loss": 0.1285,
"step": 49039
},
{
"epoch": 33.329715061058344,
"grad_norm": 3.3156814575195312,
"learning_rate": 4.346173920387146e-05,
"loss": 0.1314,
"step": 49128
},
{
"epoch": 33.390094979647216,
"grad_norm": 1.518210530281067,
"learning_rate": 4.330020886466898e-05,
"loss": 0.1327,
"step": 49217
},
{
"epoch": 33.450474898236095,
"grad_norm": 2.037992238998413,
"learning_rate": 4.313874969724227e-05,
"loss": 0.1294,
"step": 49306
},
{
"epoch": 33.51085481682497,
"grad_norm": 1.0530787706375122,
"learning_rate": 4.2977363416769495e-05,
"loss": 0.1338,
"step": 49395
},
{
"epoch": 33.57123473541384,
"grad_norm": 2.6281962394714355,
"learning_rate": 4.281605173765462e-05,
"loss": 0.1385,
"step": 49484
},
{
"epoch": 33.60379918588874,
"eval_accuracy": 0.177734375,
"eval_loss": 4.106353759765625,
"eval_runtime": 43.8825,
"eval_samples_per_second": 11.668,
"eval_steps_per_second": 0.091,
"step": 49532
},
{
"epoch": 33.63161465400271,
"grad_norm": 1.659097671508789,
"learning_rate": 4.265481637350902e-05,
"loss": 0.1334,
"step": 49573
},
{
"epoch": 33.69199457259159,
"grad_norm": 1.2055881023406982,
"learning_rate": 4.249365903713345e-05,
"loss": 0.1277,
"step": 49662
},
{
"epoch": 33.75237449118046,
"grad_norm": 1.3534148931503296,
"learning_rate": 4.2332581440499765e-05,
"loss": 0.1241,
"step": 49751
},
{
"epoch": 33.81275440976933,
"grad_norm": 1.6355328559875488,
"learning_rate": 4.217158529473275e-05,
"loss": 0.1309,
"step": 49840
},
{
"epoch": 33.87313432835821,
"grad_norm": 1.2613086700439453,
"learning_rate": 4.2010672310091895e-05,
"loss": 0.1306,
"step": 49929
},
{
"epoch": 33.933514246947084,
"grad_norm": 2.427302837371826,
"learning_rate": 4.1849844195953314e-05,
"loss": 0.1335,
"step": 50018
},
{
"epoch": 33.993894165535956,
"grad_norm": 1.0683902502059937,
"learning_rate": 4.1689102660791536e-05,
"loss": 0.137,
"step": 50107
},
{
"epoch": 34.05427408412483,
"grad_norm": 1.184240460395813,
"learning_rate": 4.1528449412161375e-05,
"loss": 0.1206,
"step": 50196
},
{
"epoch": 34.114654002713706,
"grad_norm": 2.108067512512207,
"learning_rate": 4.136788615667974e-05,
"loss": 0.125,
"step": 50285
},
{
"epoch": 34.17503392130258,
"grad_norm": 1.4755454063415527,
"learning_rate": 4.120741460000758e-05,
"loss": 0.1283,
"step": 50374
},
{
"epoch": 34.23541383989145,
"grad_norm": 1.8144526481628418,
"learning_rate": 4.1047036446831686e-05,
"loss": 0.1279,
"step": 50463
},
{
"epoch": 34.29579375848033,
"grad_norm": 1.2851365804672241,
"learning_rate": 4.088675340084668e-05,
"loss": 0.1207,
"step": 50552
},
{
"epoch": 34.3561736770692,
"grad_norm": 1.1482937335968018,
"learning_rate": 4.072656716473684e-05,
"loss": 0.1251,
"step": 50641
},
{
"epoch": 34.41655359565807,
"grad_norm": 1.2348805665969849,
"learning_rate": 4.0566479440158036e-05,
"loss": 0.1235,
"step": 50730
},
{
"epoch": 34.476933514246944,
"grad_norm": 1.1819324493408203,
"learning_rate": 4.040649192771962e-05,
"loss": 0.132,
"step": 50819
},
{
"epoch": 34.53731343283582,
"grad_norm": 1.1830766201019287,
"learning_rate": 4.0246606326966425e-05,
"loss": 0.1176,
"step": 50908
},
{
"epoch": 34.597693351424695,
"grad_norm": 1.9532086849212646,
"learning_rate": 4.0086824336360676e-05,
"loss": 0.1231,
"step": 50997
},
{
"epoch": 34.65807327001357,
"grad_norm": 1.529571294784546,
"learning_rate": 3.992714765326396e-05,
"loss": 0.1242,
"step": 51086
},
{
"epoch": 34.71845318860244,
"grad_norm": 1.2561233043670654,
"learning_rate": 3.9767577973919146e-05,
"loss": 0.1255,
"step": 51175
},
{
"epoch": 34.77883310719132,
"grad_norm": 1.7090590000152588,
"learning_rate": 3.960811699343243e-05,
"loss": 0.1215,
"step": 51264
},
{
"epoch": 34.83921302578019,
"grad_norm": 2.106395959854126,
"learning_rate": 3.94487664057553e-05,
"loss": 0.1285,
"step": 51353
},
{
"epoch": 34.89959294436906,
"grad_norm": 1.165230393409729,
"learning_rate": 3.928952790366654e-05,
"loss": 0.1216,
"step": 51442
},
{
"epoch": 34.95997286295794,
"grad_norm": 1.2061336040496826,
"learning_rate": 3.913040317875424e-05,
"loss": 0.1164,
"step": 51531
},
{
"epoch": 35.02035278154681,
"grad_norm": 1.160407304763794,
"learning_rate": 3.897139392139788e-05,
"loss": 0.1258,
"step": 51620
},
{
"epoch": 35.080732700135684,
"grad_norm": 0.8674483299255371,
"learning_rate": 3.881250182075026e-05,
"loss": 0.1129,
"step": 51709
},
{
"epoch": 35.141112618724556,
"grad_norm": 1.4497802257537842,
"learning_rate": 3.8653728564719674e-05,
"loss": 0.1244,
"step": 51798
},
{
"epoch": 35.201492537313435,
"grad_norm": 1.649856448173523,
"learning_rate": 3.8495075839951937e-05,
"loss": 0.1157,
"step": 51887
},
{
"epoch": 35.26187245590231,
"grad_norm": 1.97478187084198,
"learning_rate": 3.833654533181244e-05,
"loss": 0.1182,
"step": 51976
},
{
"epoch": 35.32225237449118,
"grad_norm": 1.4241811037063599,
"learning_rate": 3.8178138724368275e-05,
"loss": 0.1195,
"step": 52065
},
{
"epoch": 35.38263229308006,
"grad_norm": 1.9427152872085571,
"learning_rate": 3.8019857700370345e-05,
"loss": 0.1214,
"step": 52154
},
{
"epoch": 35.44301221166893,
"grad_norm": 1.2185932397842407,
"learning_rate": 3.7861703941235444e-05,
"loss": 0.1149,
"step": 52243
},
{
"epoch": 35.5033921302578,
"grad_norm": 1.1983317136764526,
"learning_rate": 3.770367912702849e-05,
"loss": 0.1182,
"step": 52332
},
{
"epoch": 35.56377204884667,
"grad_norm": 0.9646018147468567,
"learning_rate": 3.7545784936444605e-05,
"loss": 0.1272,
"step": 52421
},
{
"epoch": 35.62415196743555,
"grad_norm": 1.189382791519165,
"learning_rate": 3.73880230467913e-05,
"loss": 0.1139,
"step": 52510
},
{
"epoch": 35.68453188602442,
"grad_norm": 1.0490000247955322,
"learning_rate": 3.7230395133970595e-05,
"loss": 0.1179,
"step": 52599
},
{
"epoch": 35.744911804613295,
"grad_norm": 1.055656909942627,
"learning_rate": 3.7072902872461365e-05,
"loss": 0.1184,
"step": 52688
},
{
"epoch": 35.805291723202174,
"grad_norm": 1.564658522605896,
"learning_rate": 3.691554793530143e-05,
"loss": 0.12,
"step": 52777
},
{
"epoch": 35.865671641791046,
"grad_norm": 1.054408311843872,
"learning_rate": 3.6758331994069784e-05,
"loss": 0.1145,
"step": 52866
},
{
"epoch": 35.92605156037992,
"grad_norm": 1.5454896688461304,
"learning_rate": 3.660125671886892e-05,
"loss": 0.1104,
"step": 52955
},
{
"epoch": 35.98643147896879,
"grad_norm": 0.9646552801132202,
"learning_rate": 3.6444323778307e-05,
"loss": 0.1192,
"step": 53044
},
{
"epoch": 36.004070556309365,
"eval_accuracy": 0.181640625,
"eval_loss": 4.21491813659668,
"eval_runtime": 20.6978,
"eval_samples_per_second": 24.737,
"eval_steps_per_second": 0.193,
"step": 53070
},
{
"epoch": 36.04681139755767,
"grad_norm": 2.2136874198913574,
"learning_rate": 3.628753483948017e-05,
"loss": 0.115,
"step": 53133
},
{
"epoch": 36.10719131614654,
"grad_norm": 0.8969342708587646,
"learning_rate": 3.613089156795489e-05,
"loss": 0.111,
"step": 53222
},
{
"epoch": 36.16757123473541,
"grad_norm": 1.5373083353042603,
"learning_rate": 3.5974395627750136e-05,
"loss": 0.1181,
"step": 53311
},
{
"epoch": 36.22795115332429,
"grad_norm": 1.0511338710784912,
"learning_rate": 3.581804868131986e-05,
"loss": 0.1089,
"step": 53400
},
{
"epoch": 36.28833107191316,
"grad_norm": 0.6941206455230713,
"learning_rate": 3.566185238953516e-05,
"loss": 0.1133,
"step": 53489
},
{
"epoch": 36.348710990502035,
"grad_norm": 1.0698457956314087,
"learning_rate": 3.5505808411666805e-05,
"loss": 0.1046,
"step": 53578
},
{
"epoch": 36.40909090909091,
"grad_norm": 1.1524955034255981,
"learning_rate": 3.5349918405367533e-05,
"loss": 0.1111,
"step": 53667
},
{
"epoch": 36.469470827679785,
"grad_norm": 0.7653555274009705,
"learning_rate": 3.519418402665441e-05,
"loss": 0.1102,
"step": 53756
},
{
"epoch": 36.52985074626866,
"grad_norm": 0.7626907229423523,
"learning_rate": 3.503860692989129e-05,
"loss": 0.1109,
"step": 53845
},
{
"epoch": 36.59023066485753,
"grad_norm": 1.2246617078781128,
"learning_rate": 3.4883188767771235e-05,
"loss": 0.1087,
"step": 53934
},
{
"epoch": 36.6506105834464,
"grad_norm": 0.8445035815238953,
"learning_rate": 3.472793119129891e-05,
"loss": 0.1104,
"step": 54023
},
{
"epoch": 36.71099050203528,
"grad_norm": 0.4783117175102234,
"learning_rate": 3.4572835849773124e-05,
"loss": 0.1101,
"step": 54112
},
{
"epoch": 36.77137042062415,
"grad_norm": 0.6431951522827148,
"learning_rate": 3.441790439076924e-05,
"loss": 0.1128,
"step": 54201
},
{
"epoch": 36.83175033921302,
"grad_norm": 0.8060305118560791,
"learning_rate": 3.426313846012174e-05,
"loss": 0.1077,
"step": 54290
},
{
"epoch": 36.8921302578019,
"grad_norm": 1.309480905532837,
"learning_rate": 3.410853970190662e-05,
"loss": 0.1094,
"step": 54379
},
{
"epoch": 36.952510176390774,
"grad_norm": 0.7138769030570984,
"learning_rate": 3.395410975842408e-05,
"loss": 0.1119,
"step": 54468
},
{
"epoch": 37.012890094979646,
"grad_norm": 1.4216080904006958,
"learning_rate": 3.379985027018098e-05,
"loss": 0.1117,
"step": 54557
},
{
"epoch": 37.07327001356852,
"grad_norm": 1.4457802772521973,
"learning_rate": 3.3645762875873415e-05,
"loss": 0.1024,
"step": 54646
},
{
"epoch": 37.1336499321574,
"grad_norm": 0.7809485793113708,
"learning_rate": 3.349184921236939e-05,
"loss": 0.1054,
"step": 54735
},
{
"epoch": 37.19402985074627,
"grad_norm": 1.7159395217895508,
"learning_rate": 3.333811091469129e-05,
"loss": 0.1028,
"step": 54824
},
{
"epoch": 37.25440976933514,
"grad_norm": 0.48482632637023926,
"learning_rate": 3.318454961599864e-05,
"loss": 0.105,
"step": 54913
},
{
"epoch": 37.31478968792402,
"grad_norm": 1.0585776567459106,
"learning_rate": 3.30311669475707e-05,
"loss": 0.0995,
"step": 55002
},
{
"epoch": 37.37516960651289,
"grad_norm": 0.7327682971954346,
"learning_rate": 3.2877964538789154e-05,
"loss": 0.1072,
"step": 55091
},
{
"epoch": 37.43554952510176,
"grad_norm": 1.583203911781311,
"learning_rate": 3.272494401712078e-05,
"loss": 0.104,
"step": 55180
},
{
"epoch": 37.495929443690635,
"grad_norm": 1.2305749654769897,
"learning_rate": 3.257210700810015e-05,
"loss": 0.1038,
"step": 55269
},
{
"epoch": 37.556309362279514,
"grad_norm": 8.245156288146973,
"learning_rate": 3.241945513531241e-05,
"loss": 0.1087,
"step": 55358
},
{
"epoch": 37.616689280868385,
"grad_norm": 0.8825012445449829,
"learning_rate": 3.226699002037602e-05,
"loss": 0.109,
"step": 55447
},
{
"epoch": 37.67706919945726,
"grad_norm": 1.1218957901000977,
"learning_rate": 3.2114713282925466e-05,
"loss": 0.1038,
"step": 55536
},
{
"epoch": 37.737449118046136,
"grad_norm": 1.4190541505813599,
"learning_rate": 3.196262654059419e-05,
"loss": 0.108,
"step": 55625
},
{
"epoch": 37.79782903663501,
"grad_norm": 0.5339131951332092,
"learning_rate": 3.1810731408997185e-05,
"loss": 0.1103,
"step": 55714
},
{
"epoch": 37.85820895522388,
"grad_norm": 1.3955272436141968,
"learning_rate": 3.1659029501714077e-05,
"loss": 0.0993,
"step": 55803
},
{
"epoch": 37.91858887381275,
"grad_norm": 1.7271915674209595,
"learning_rate": 3.150752243027185e-05,
"loss": 0.1081,
"step": 55892
},
{
"epoch": 37.97896879240163,
"grad_norm": 1.272377848625183,
"learning_rate": 3.1356211804127726e-05,
"loss": 0.0988,
"step": 55981
},
{
"epoch": 38.0393487109905,
"grad_norm": 1.4303677082061768,
"learning_rate": 3.1205099230652134e-05,
"loss": 0.0947,
"step": 56070
},
{
"epoch": 38.099728629579374,
"grad_norm": 0.8582963347434998,
"learning_rate": 3.105418631511151e-05,
"loss": 0.1023,
"step": 56159
},
{
"epoch": 38.16010854816825,
"grad_norm": 0.9735032916069031,
"learning_rate": 3.090347466065141e-05,
"loss": 0.0994,
"step": 56248
},
{
"epoch": 38.220488466757125,
"grad_norm": 0.8023036122322083,
"learning_rate": 3.075296586827938e-05,
"loss": 0.0968,
"step": 56337
},
{
"epoch": 38.280868385346,
"grad_norm": 0.8283627033233643,
"learning_rate": 3.060266153684792e-05,
"loss": 0.0988,
"step": 56426
},
{
"epoch": 38.34124830393487,
"grad_norm": 1.1911463737487793,
"learning_rate": 3.045256326303762e-05,
"loss": 0.1015,
"step": 56515
},
{
"epoch": 38.40162822252375,
"grad_norm": 1.3183075189590454,
"learning_rate": 3.030267264134003e-05,
"loss": 0.1014,
"step": 56604
},
{
"epoch": 38.404341926729984,
"eval_accuracy": 0.169921875,
"eval_loss": 4.234889984130859,
"eval_runtime": 20.5487,
"eval_samples_per_second": 24.916,
"eval_steps_per_second": 0.195,
"step": 56608
},
{
"epoch": 38.46200814111262,
"grad_norm": 0.9125858545303345,
"learning_rate": 3.0152991264040888e-05,
"loss": 0.101,
"step": 56693
},
{
"epoch": 38.52238805970149,
"grad_norm": 0.5899451971054077,
"learning_rate": 3.0003520721203106e-05,
"loss": 0.0969,
"step": 56782
},
{
"epoch": 38.58276797829036,
"grad_norm": 1.2424014806747437,
"learning_rate": 2.9854262600649907e-05,
"loss": 0.1017,
"step": 56871
},
{
"epoch": 38.64314789687924,
"grad_norm": 1.250267744064331,
"learning_rate": 2.9705218487947984e-05,
"loss": 0.0982,
"step": 56960
},
{
"epoch": 38.703527815468114,
"grad_norm": 1.153306245803833,
"learning_rate": 2.9556389966390552e-05,
"loss": 0.1006,
"step": 57049
},
{
"epoch": 38.763907734056986,
"grad_norm": 1.2941042184829712,
"learning_rate": 2.940777861698068e-05,
"loss": 0.0975,
"step": 57138
},
{
"epoch": 38.824287652645864,
"grad_norm": 1.0015143156051636,
"learning_rate": 2.9259386018414396e-05,
"loss": 0.1054,
"step": 57227
},
{
"epoch": 38.884667571234736,
"grad_norm": 0.8103719353675842,
"learning_rate": 2.9111213747063915e-05,
"loss": 0.1004,
"step": 57316
},
{
"epoch": 38.94504748982361,
"grad_norm": 2.045173406600952,
"learning_rate": 2.896326337696098e-05,
"loss": 0.0993,
"step": 57405
},
{
"epoch": 39.00542740841248,
"grad_norm": 0.9834128022193909,
"learning_rate": 2.8815536479780014e-05,
"loss": 0.0971,
"step": 57494
},
{
"epoch": 39.06580732700136,
"grad_norm": 0.6491034030914307,
"learning_rate": 2.8668034624821514e-05,
"loss": 0.0957,
"step": 57583
},
{
"epoch": 39.12618724559023,
"grad_norm": 1.0920275449752808,
"learning_rate": 2.852075937899541e-05,
"loss": 0.0938,
"step": 57672
},
{
"epoch": 39.1865671641791,
"grad_norm": 0.9111031293869019,
"learning_rate": 2.8373712306804267e-05,
"loss": 0.0954,
"step": 57761
},
{
"epoch": 39.24694708276798,
"grad_norm": 0.7507003545761108,
"learning_rate": 2.8226894970326856e-05,
"loss": 0.0926,
"step": 57850
},
{
"epoch": 39.30732700135685,
"grad_norm": 1.0884746313095093,
"learning_rate": 2.8080308929201392e-05,
"loss": 0.0946,
"step": 57939
},
{
"epoch": 39.367706919945725,
"grad_norm": 0.7752851843833923,
"learning_rate": 2.793395574060911e-05,
"loss": 0.0925,
"step": 58028
},
{
"epoch": 39.4280868385346,
"grad_norm": 0.8282026052474976,
"learning_rate": 2.7787836959257617e-05,
"loss": 0.0954,
"step": 58117
},
{
"epoch": 39.488466757123476,
"grad_norm": 0.7554723620414734,
"learning_rate": 2.764195413736444e-05,
"loss": 0.0965,
"step": 58206
},
{
"epoch": 39.54884667571235,
"grad_norm": 1.461937427520752,
"learning_rate": 2.7496308824640505e-05,
"loss": 0.0963,
"step": 58295
},
{
"epoch": 39.60922659430122,
"grad_norm": 1.260448694229126,
"learning_rate": 2.735090256827365e-05,
"loss": 0.0901,
"step": 58384
},
{
"epoch": 39.6696065128901,
"grad_norm": 0.5917372703552246,
"learning_rate": 2.720573691291226e-05,
"loss": 0.0912,
"step": 58473
},
{
"epoch": 39.72998643147897,
"grad_norm": 1.0899447202682495,
"learning_rate": 2.70608134006488e-05,
"loss": 0.0971,
"step": 58562
},
{
"epoch": 39.79036635006784,
"grad_norm": 0.700945258140564,
"learning_rate": 2.691613357100348e-05,
"loss": 0.0959,
"step": 58651
},
{
"epoch": 39.850746268656714,
"grad_norm": 0.563937783241272,
"learning_rate": 2.6771698960907844e-05,
"loss": 0.0924,
"step": 58740
},
{
"epoch": 39.91112618724559,
"grad_norm": 1.2287607192993164,
"learning_rate": 2.6627511104688463e-05,
"loss": 0.0915,
"step": 58829
},
{
"epoch": 39.971506105834465,
"grad_norm": 1.0432151556015015,
"learning_rate": 2.6483571534050684e-05,
"loss": 0.094,
"step": 58918
},
{
"epoch": 40.031886024423336,
"grad_norm": 0.9087603092193604,
"learning_rate": 2.6339881778062286e-05,
"loss": 0.0914,
"step": 59007
},
{
"epoch": 40.09226594301221,
"grad_norm": 1.0434340238571167,
"learning_rate": 2.6196443363137295e-05,
"loss": 0.0932,
"step": 59096
},
{
"epoch": 40.15264586160109,
"grad_norm": 1.4416966438293457,
"learning_rate": 2.6053257813019756e-05,
"loss": 0.0951,
"step": 59185
},
{
"epoch": 40.21302578018996,
"grad_norm": 0.5194874405860901,
"learning_rate": 2.5910326648767464e-05,
"loss": 0.0909,
"step": 59274
},
{
"epoch": 40.27340569877883,
"grad_norm": 0.4782836139202118,
"learning_rate": 2.5767651388735976e-05,
"loss": 0.0917,
"step": 59363
},
{
"epoch": 40.33378561736771,
"grad_norm": 0.7723681926727295,
"learning_rate": 2.5625233548562288e-05,
"loss": 0.0928,
"step": 59452
},
{
"epoch": 40.39416553595658,
"grad_norm": 0.5637179017066956,
"learning_rate": 2.5483074641148896e-05,
"loss": 0.095,
"step": 59541
},
{
"epoch": 40.45454545454545,
"grad_norm": 0.9517094492912292,
"learning_rate": 2.534117617664766e-05,
"loss": 0.0857,
"step": 59630
},
{
"epoch": 40.514925373134325,
"grad_norm": 1.0360537767410278,
"learning_rate": 2.5199539662443683e-05,
"loss": 0.0923,
"step": 59719
},
{
"epoch": 40.575305291723204,
"grad_norm": 0.993859589099884,
"learning_rate": 2.5058166603139453e-05,
"loss": 0.0918,
"step": 59808
},
{
"epoch": 40.635685210312076,
"grad_norm": 0.5905105471611023,
"learning_rate": 2.491705850053876e-05,
"loss": 0.0914,
"step": 59897
},
{
"epoch": 40.69606512890095,
"grad_norm": 1.8507524728775024,
"learning_rate": 2.4776216853630747e-05,
"loss": 0.0948,
"step": 59986
},
{
"epoch": 40.75644504748983,
"grad_norm": 0.8569918274879456,
"learning_rate": 2.4635643158574034e-05,
"loss": 0.0933,
"step": 60075
},
{
"epoch": 40.80461329715061,
"eval_accuracy": 0.17578125,
"eval_loss": 4.287986755371094,
"eval_runtime": 29.4248,
"eval_samples_per_second": 17.4,
"eval_steps_per_second": 0.136,
"step": 60146
},
{
"epoch": 40.8168249660787,
"grad_norm": 1.2466926574707031,
"learning_rate": 2.4495338908680733e-05,
"loss": 0.0884,
"step": 60164
},
{
"epoch": 40.87720488466757,
"grad_norm": 1.0967109203338623,
"learning_rate": 2.4355305594400703e-05,
"loss": 0.0885,
"step": 60253
},
{
"epoch": 40.93758480325644,
"grad_norm": 3.7335941791534424,
"learning_rate": 2.4215544703305624e-05,
"loss": 0.0863,
"step": 60342
},
{
"epoch": 40.99796472184532,
"grad_norm": 0.7128244638442993,
"learning_rate": 2.4076057720073263e-05,
"loss": 0.0916,
"step": 60431
},
{
"epoch": 41.05834464043419,
"grad_norm": 0.6948025226593018,
"learning_rate": 2.393684612647165e-05,
"loss": 0.0907,
"step": 60520
},
{
"epoch": 41.118724559023065,
"grad_norm": 0.9347543716430664,
"learning_rate": 2.3797911401343324e-05,
"loss": 0.0863,
"step": 60609
},
{
"epoch": 41.17910447761194,
"grad_norm": 0.6577604413032532,
"learning_rate": 2.3659255020589693e-05,
"loss": 0.0893,
"step": 60698
},
{
"epoch": 41.239484396200815,
"grad_norm": 1.0613411664962769,
"learning_rate": 2.3520878457155317e-05,
"loss": 0.0907,
"step": 60787
},
{
"epoch": 41.29986431478969,
"grad_norm": 0.7223649024963379,
"learning_rate": 2.338278318101224e-05,
"loss": 0.0858,
"step": 60876
},
{
"epoch": 41.36024423337856,
"grad_norm": 0.6473923325538635,
"learning_rate": 2.3244970659144434e-05,
"loss": 0.0881,
"step": 60965
},
{
"epoch": 41.42062415196744,
"grad_norm": 0.6310983300209045,
"learning_rate": 2.3107442355532105e-05,
"loss": 0.0866,
"step": 61054
},
{
"epoch": 41.48100407055631,
"grad_norm": 1.2830203771591187,
"learning_rate": 2.2970199731136305e-05,
"loss": 0.0882,
"step": 61143
},
{
"epoch": 41.54138398914518,
"grad_norm": 0.6028885245323181,
"learning_rate": 2.2833244243883222e-05,
"loss": 0.0861,
"step": 61232
},
{
"epoch": 41.60176390773406,
"grad_norm": 1.1787885427474976,
"learning_rate": 2.2696577348648867e-05,
"loss": 0.0897,
"step": 61321
},
{
"epoch": 41.66214382632293,
"grad_norm": 0.5341454148292542,
"learning_rate": 2.2560200497243537e-05,
"loss": 0.0871,
"step": 61410
},
{
"epoch": 41.722523744911804,
"grad_norm": 1.4164313077926636,
"learning_rate": 2.2424115138396336e-05,
"loss": 0.0924,
"step": 61499
},
{
"epoch": 41.782903663500676,
"grad_norm": 0.7035442590713501,
"learning_rate": 2.2288322717739912e-05,
"loss": 0.088,
"step": 61588
},
{
"epoch": 41.843283582089555,
"grad_norm": 0.6574503779411316,
"learning_rate": 2.2152824677795003e-05,
"loss": 0.0868,
"step": 61677
},
{
"epoch": 41.90366350067843,
"grad_norm": 0.4766522943973541,
"learning_rate": 2.201762245795516e-05,
"loss": 0.0887,
"step": 61766
},
{
"epoch": 41.9640434192673,
"grad_norm": 2.5811030864715576,
"learning_rate": 2.188271749447146e-05,
"loss": 0.0872,
"step": 61855
},
{
"epoch": 42.02442333785617,
"grad_norm": 0.7208371758460999,
"learning_rate": 2.1748111220437163e-05,
"loss": 0.0825,
"step": 61944
},
{
"epoch": 42.08480325644505,
"grad_norm": 0.7155792713165283,
"learning_rate": 2.161380506577262e-05,
"loss": 0.0913,
"step": 62033
},
{
"epoch": 42.14518317503392,
"grad_norm": 0.7777039408683777,
"learning_rate": 2.147980045720999e-05,
"loss": 0.0837,
"step": 62122
},
{
"epoch": 42.20556309362279,
"grad_norm": 0.5456185340881348,
"learning_rate": 2.134609881827813e-05,
"loss": 0.0825,
"step": 62211
},
{
"epoch": 42.26594301221167,
"grad_norm": 0.614791750907898,
"learning_rate": 2.1212701569287463e-05,
"loss": 0.078,
"step": 62300
},
{
"epoch": 42.32632293080054,
"grad_norm": 0.9303745627403259,
"learning_rate": 2.1079610127314827e-05,
"loss": 0.0815,
"step": 62389
},
{
"epoch": 42.386702849389415,
"grad_norm": 0.6811819672584534,
"learning_rate": 2.094682590618852e-05,
"loss": 0.0842,
"step": 62478
},
{
"epoch": 42.44708276797829,
"grad_norm": 0.7549321055412292,
"learning_rate": 2.081435031647326e-05,
"loss": 0.0834,
"step": 62567
},
{
"epoch": 42.507462686567166,
"grad_norm": 0.6525147557258606,
"learning_rate": 2.0682184765455143e-05,
"loss": 0.0853,
"step": 62656
},
{
"epoch": 42.56784260515604,
"grad_norm": 0.7095387578010559,
"learning_rate": 2.0550330657126715e-05,
"loss": 0.0873,
"step": 62745
},
{
"epoch": 42.62822252374491,
"grad_norm": 0.8400213122367859,
"learning_rate": 2.041878939217211e-05,
"loss": 0.0875,
"step": 62834
},
{
"epoch": 42.68860244233379,
"grad_norm": 0.9360200762748718,
"learning_rate": 2.028756236795213e-05,
"loss": 0.0843,
"step": 62923
},
{
"epoch": 42.74898236092266,
"grad_norm": 0.5572984218597412,
"learning_rate": 2.015665097848935e-05,
"loss": 0.0813,
"step": 63012
},
{
"epoch": 42.80936227951153,
"grad_norm": 0.9234522581100464,
"learning_rate": 2.002605661445342e-05,
"loss": 0.083,
"step": 63101
},
{
"epoch": 42.869742198100404,
"grad_norm": 0.5887913107872009,
"learning_rate": 1.989578066314623e-05,
"loss": 0.0839,
"step": 63190
},
{
"epoch": 42.93012211668928,
"grad_norm": 0.8760083913803101,
"learning_rate": 1.9765824508487125e-05,
"loss": 0.085,
"step": 63279
},
{
"epoch": 42.990502035278155,
"grad_norm": 0.7094123959541321,
"learning_rate": 1.9636189530998307e-05,
"loss": 0.0798,
"step": 63368
},
{
"epoch": 43.05088195386703,
"grad_norm": 0.5656801462173462,
"learning_rate": 1.95068771077901e-05,
"loss": 0.0785,
"step": 63457
},
{
"epoch": 43.111261872455906,
"grad_norm": 0.8483113646507263,
"learning_rate": 1.937788861254634e-05,
"loss": 0.081,
"step": 63546
},
{
"epoch": 43.17164179104478,
"grad_norm": 0.5962135791778564,
"learning_rate": 1.9249225415509807e-05,
"loss": 0.0832,
"step": 63635
},
{
"epoch": 43.204884667571235,
"eval_accuracy": 0.177734375,
"eval_loss": 4.267856597900391,
"eval_runtime": 19.6828,
"eval_samples_per_second": 26.013,
"eval_steps_per_second": 0.203,
"step": 63684
},
{
"epoch": 43.23202170963365,
"grad_norm": 1.1795192956924438,
"learning_rate": 1.9120888883467574e-05,
"loss": 0.0881,
"step": 63724
},
{
"epoch": 43.29240162822252,
"grad_norm": 1.2301242351531982,
"learning_rate": 1.899288037973662e-05,
"loss": 0.0779,
"step": 63813
},
{
"epoch": 43.3527815468114,
"grad_norm": 0.5368560552597046,
"learning_rate": 1.8865201264149267e-05,
"loss": 0.0793,
"step": 63902
},
{
"epoch": 43.41316146540027,
"grad_norm": 1.1031700372695923,
"learning_rate": 1.873785289303875e-05,
"loss": 0.0824,
"step": 63991
},
{
"epoch": 43.473541383989144,
"grad_norm": 0.9713082313537598,
"learning_rate": 1.861083661922482e-05,
"loss": 0.0766,
"step": 64080
},
{
"epoch": 43.53392130257802,
"grad_norm": 0.681328296661377,
"learning_rate": 1.8484153791999326e-05,
"loss": 0.0799,
"step": 64169
},
{
"epoch": 43.594301221166894,
"grad_norm": 0.8199315071105957,
"learning_rate": 1.8357805757111966e-05,
"loss": 0.0811,
"step": 64258
},
{
"epoch": 43.654681139755766,
"grad_norm": 1.327650785446167,
"learning_rate": 1.823179385675593e-05,
"loss": 0.08,
"step": 64347
},
{
"epoch": 43.71506105834464,
"grad_norm": 0.9341023564338684,
"learning_rate": 1.810611942955365e-05,
"loss": 0.0787,
"step": 64436
},
{
"epoch": 43.77544097693352,
"grad_norm": 0.5767560601234436,
"learning_rate": 1.7980783810542577e-05,
"loss": 0.0812,
"step": 64525
},
{
"epoch": 43.83582089552239,
"grad_norm": 0.5114635229110718,
"learning_rate": 1.785578833116104e-05,
"loss": 0.0823,
"step": 64614
},
{
"epoch": 43.89620081411126,
"grad_norm": 0.5436065196990967,
"learning_rate": 1.7731134319234016e-05,
"loss": 0.0819,
"step": 64703
},
{
"epoch": 43.95658073270013,
"grad_norm": 0.4684976041316986,
"learning_rate": 1.760682309895913e-05,
"loss": 0.0842,
"step": 64792
},
{
"epoch": 44.01696065128901,
"grad_norm": 1.0648964643478394,
"learning_rate": 1.7482855990892517e-05,
"loss": 0.0822,
"step": 64881
},
{
"epoch": 44.07734056987788,
"grad_norm": 0.6211819648742676,
"learning_rate": 1.735923431193483e-05,
"loss": 0.0797,
"step": 64970
},
{
"epoch": 44.137720488466755,
"grad_norm": 0.4334025979042053,
"learning_rate": 1.7235959375317185e-05,
"loss": 0.0759,
"step": 65059
},
{
"epoch": 44.198100407055634,
"grad_norm": 1.0753127336502075,
"learning_rate": 1.711303249058731e-05,
"loss": 0.0756,
"step": 65148
},
{
"epoch": 44.258480325644506,
"grad_norm": 0.6846993565559387,
"learning_rate": 1.6990454963595577e-05,
"loss": 0.0795,
"step": 65237
},
{
"epoch": 44.31886024423338,
"grad_norm": 0.44795066118240356,
"learning_rate": 1.6868228096481104e-05,
"loss": 0.0815,
"step": 65326
},
{
"epoch": 44.37924016282225,
"grad_norm": 1.4556400775909424,
"learning_rate": 1.674635318765801e-05,
"loss": 0.0789,
"step": 65415
},
{
"epoch": 44.43962008141113,
"grad_norm": 0.6817762851715088,
"learning_rate": 1.66248315318015e-05,
"loss": 0.0734,
"step": 65504
},
{
"epoch": 44.5,
"grad_norm": 0.8499571681022644,
"learning_rate": 1.6503664419834215e-05,
"loss": 0.0798,
"step": 65593
},
{
"epoch": 44.56037991858887,
"grad_norm": 0.5608311891555786,
"learning_rate": 1.6382853138912485e-05,
"loss": 0.0759,
"step": 65682
},
{
"epoch": 44.62075983717775,
"grad_norm": 1.1510560512542725,
"learning_rate": 1.6262398972412644e-05,
"loss": 0.0774,
"step": 65771
},
{
"epoch": 44.68113975576662,
"grad_norm": 0.591827392578125,
"learning_rate": 1.614230319991743e-05,
"loss": 0.0827,
"step": 65860
},
{
"epoch": 44.741519674355494,
"grad_norm": 0.7560341358184814,
"learning_rate": 1.60225670972023e-05,
"loss": 0.0752,
"step": 65949
},
{
"epoch": 44.801899592944366,
"grad_norm": 1.0043483972549438,
"learning_rate": 1.5903191936222016e-05,
"loss": 0.0794,
"step": 66038
},
{
"epoch": 44.862279511533245,
"grad_norm": 1.1438446044921875,
"learning_rate": 1.5784178985097024e-05,
"loss": 0.08,
"step": 66127
},
{
"epoch": 44.92265943012212,
"grad_norm": 0.39144688844680786,
"learning_rate": 1.5665529508100052e-05,
"loss": 0.0729,
"step": 66216
},
{
"epoch": 44.98303934871099,
"grad_norm": 0.7558673620223999,
"learning_rate": 1.5547244765642588e-05,
"loss": 0.0759,
"step": 66305
},
{
"epoch": 45.04341926729987,
"grad_norm": 0.6959690451622009,
"learning_rate": 1.5429326014261632e-05,
"loss": 0.0817,
"step": 66394
},
{
"epoch": 45.10379918588874,
"grad_norm": 0.41576260328292847,
"learning_rate": 1.531177450660618e-05,
"loss": 0.0738,
"step": 66483
},
{
"epoch": 45.16417910447761,
"grad_norm": 0.8202412724494934,
"learning_rate": 1.5194591491424064e-05,
"loss": 0.0763,
"step": 66572
},
{
"epoch": 45.22455902306648,
"grad_norm": 1.1920087337493896,
"learning_rate": 1.5077778213548622e-05,
"loss": 0.0756,
"step": 66661
},
{
"epoch": 45.28493894165536,
"grad_norm": 0.6442920565605164,
"learning_rate": 1.496133591388547e-05,
"loss": 0.0795,
"step": 66750
},
{
"epoch": 45.345318860244234,
"grad_norm": 0.7332776784896851,
"learning_rate": 1.4845265829399296e-05,
"loss": 0.0766,
"step": 66839
},
{
"epoch": 45.405698778833106,
"grad_norm": 0.887069821357727,
"learning_rate": 1.4729569193100795e-05,
"loss": 0.0756,
"step": 66928
},
{
"epoch": 45.46607869742198,
"grad_norm": 0.6151465177536011,
"learning_rate": 1.4614247234033518e-05,
"loss": 0.0793,
"step": 67017
},
{
"epoch": 45.526458616010856,
"grad_norm": 0.7770605087280273,
"learning_rate": 1.449930117726081e-05,
"loss": 0.0793,
"step": 67106
},
{
"epoch": 45.58683853459973,
"grad_norm": 0.5736819505691528,
"learning_rate": 1.438473224385285e-05,
"loss": 0.0728,
"step": 67195
},
{
"epoch": 45.60515603799186,
"eval_accuracy": 0.185546875,
"eval_loss": 4.269733428955078,
"eval_runtime": 40.0503,
"eval_samples_per_second": 12.784,
"eval_steps_per_second": 0.1,
"step": 67222
},
{
"epoch": 45.6472184531886,
"grad_norm": 0.4615430533885956,
"learning_rate": 1.4270541650873582e-05,
"loss": 0.0706,
"step": 67284
},
{
"epoch": 45.70759837177748,
"grad_norm": 0.7554183006286621,
"learning_rate": 1.415673061136788e-05,
"loss": 0.0788,
"step": 67373
},
{
"epoch": 45.76797829036635,
"grad_norm": 0.6309983134269714,
"learning_rate": 1.4043300334348641e-05,
"loss": 0.0779,
"step": 67462
},
{
"epoch": 45.82835820895522,
"grad_norm": 0.4782220125198364,
"learning_rate": 1.3930252024783903e-05,
"loss": 0.0769,
"step": 67551
},
{
"epoch": 45.888738127544094,
"grad_norm": 0.5289342403411865,
"learning_rate": 1.3817586883584094e-05,
"loss": 0.0768,
"step": 67640
},
{
"epoch": 45.94911804613297,
"grad_norm": 0.5275683403015137,
"learning_rate": 1.370530610758921e-05,
"loss": 0.0743,
"step": 67729
},
{
"epoch": 46.009497964721845,
"grad_norm": 0.3685113787651062,
"learning_rate": 1.359341088955618e-05,
"loss": 0.0734,
"step": 67818
},
{
"epoch": 46.06987788331072,
"grad_norm": 0.6584441661834717,
"learning_rate": 1.3481902418146154e-05,
"loss": 0.0742,
"step": 67907
},
{
"epoch": 46.130257801899596,
"grad_norm": 0.7138823866844177,
"learning_rate": 1.3370781877911842e-05,
"loss": 0.0695,
"step": 67996
},
{
"epoch": 46.19063772048847,
"grad_norm": 0.39327022433280945,
"learning_rate": 1.326005044928501e-05,
"loss": 0.0717,
"step": 68085
},
{
"epoch": 46.25101763907734,
"grad_norm": 0.4522133469581604,
"learning_rate": 1.3149709308563901e-05,
"loss": 0.0749,
"step": 68174
},
{
"epoch": 46.31139755766621,
"grad_norm": 0.6930340528488159,
"learning_rate": 1.3039759627900672e-05,
"loss": 0.074,
"step": 68263
},
{
"epoch": 46.37177747625509,
"grad_norm": 2.3860812187194824,
"learning_rate": 1.293020257528908e-05,
"loss": 0.0756,
"step": 68352
},
{
"epoch": 46.43215739484396,
"grad_norm": 0.8091538548469543,
"learning_rate": 1.2821039314551958e-05,
"loss": 0.0765,
"step": 68441
},
{
"epoch": 46.492537313432834,
"grad_norm": 0.6948747038841248,
"learning_rate": 1.2712271005328924e-05,
"loss": 0.0746,
"step": 68530
},
{
"epoch": 46.55291723202171,
"grad_norm": 1.2013221979141235,
"learning_rate": 1.260389880306399e-05,
"loss": 0.0709,
"step": 68619
},
{
"epoch": 46.613297150610585,
"grad_norm": 1.0223325490951538,
"learning_rate": 1.2495923858993364e-05,
"loss": 0.076,
"step": 68708
},
{
"epoch": 46.67367706919946,
"grad_norm": 0.7184458374977112,
"learning_rate": 1.2388347320133182e-05,
"loss": 0.0684,
"step": 68797
},
{
"epoch": 46.73405698778833,
"grad_norm": 0.4814877510070801,
"learning_rate": 1.2281170329267322e-05,
"loss": 0.0724,
"step": 68886
},
{
"epoch": 46.79443690637721,
"grad_norm": 0.5036719441413879,
"learning_rate": 1.2174394024935281e-05,
"loss": 0.0704,
"step": 68975
},
{
"epoch": 46.85481682496608,
"grad_norm": 0.5806756019592285,
"learning_rate": 1.2068019541420033e-05,
"loss": 0.0723,
"step": 69064
},
{
"epoch": 46.91519674355495,
"grad_norm": 1.2670601606369019,
"learning_rate": 1.1962048008736053e-05,
"loss": 0.0706,
"step": 69153
},
{
"epoch": 46.97557666214383,
"grad_norm": 0.5702329277992249,
"learning_rate": 1.1856480552617272e-05,
"loss": 0.0702,
"step": 69242
},
{
"epoch": 47.0359565807327,
"grad_norm": 0.49773919582366943,
"learning_rate": 1.1751318294505104e-05,
"loss": 0.0738,
"step": 69331
},
{
"epoch": 47.09633649932157,
"grad_norm": 0.5580993294715881,
"learning_rate": 1.1646562351536589e-05,
"loss": 0.0714,
"step": 69420
},
{
"epoch": 47.156716417910445,
"grad_norm": 0.47159460186958313,
"learning_rate": 1.1542213836532417e-05,
"loss": 0.0736,
"step": 69509
},
{
"epoch": 47.217096336499324,
"grad_norm": 0.6028949618339539,
"learning_rate": 1.1438273857985244e-05,
"loss": 0.0748,
"step": 69598
},
{
"epoch": 47.277476255088196,
"grad_norm": 0.7113878130912781,
"learning_rate": 1.1334743520047836e-05,
"loss": 0.0753,
"step": 69687
},
{
"epoch": 47.33785617367707,
"grad_norm": 0.3303639888763428,
"learning_rate": 1.1231623922521317e-05,
"loss": 0.0716,
"step": 69776
},
{
"epoch": 47.39823609226594,
"grad_norm": 1.0966421365737915,
"learning_rate": 1.1128916160843578e-05,
"loss": 0.0733,
"step": 69865
},
{
"epoch": 47.45861601085482,
"grad_norm": 0.575943648815155,
"learning_rate": 1.1026621326077525e-05,
"loss": 0.0725,
"step": 69954
},
{
"epoch": 47.51899592944369,
"grad_norm": 0.8768503665924072,
"learning_rate": 1.0924740504899584e-05,
"loss": 0.0704,
"step": 70043
},
{
"epoch": 47.57937584803256,
"grad_norm": 0.6844857931137085,
"learning_rate": 1.0823274779588122e-05,
"loss": 0.0746,
"step": 70132
},
{
"epoch": 47.63975576662144,
"grad_norm": 0.5367492437362671,
"learning_rate": 1.0722225228011946e-05,
"loss": 0.0714,
"step": 70221
},
{
"epoch": 47.70013568521031,
"grad_norm": 0.5591740012168884,
"learning_rate": 1.0621592923618856e-05,
"loss": 0.0662,
"step": 70310
},
{
"epoch": 47.760515603799185,
"grad_norm": 0.4710708558559418,
"learning_rate": 1.0521378935424214e-05,
"loss": 0.0743,
"step": 70399
},
{
"epoch": 47.82089552238806,
"grad_norm": 0.7445366382598877,
"learning_rate": 1.0421584327999651e-05,
"loss": 0.0689,
"step": 70488
},
{
"epoch": 47.881275440976935,
"grad_norm": 0.8262448906898499,
"learning_rate": 1.0322210161461715e-05,
"loss": 0.0763,
"step": 70577
},
{
"epoch": 47.94165535956581,
"grad_norm": 0.5951725840568542,
"learning_rate": 1.0223257491460608e-05,
"loss": 0.0706,
"step": 70666
},
{
"epoch": 48.00203527815468,
"grad_norm": 0.6799793243408203,
"learning_rate": 1.0124727369169002e-05,
"loss": 0.074,
"step": 70755
},
{
"epoch": 48.00542740841248,
"eval_accuracy": 0.18359375,
"eval_loss": 4.300548553466797,
"eval_runtime": 21.4171,
"eval_samples_per_second": 23.906,
"eval_steps_per_second": 0.187,
"step": 70760
},
{
"epoch": 48.06241519674356,
"grad_norm": 0.7727463841438293,
"learning_rate": 1.0026620841270807e-05,
"loss": 0.0711,
"step": 70844
},
{
"epoch": 48.12279511533243,
"grad_norm": 0.641099214553833,
"learning_rate": 9.928938949950133e-06,
"loss": 0.0716,
"step": 70933
}
],
"logging_steps": 89,
"max_steps": 88440,
"num_input_tokens_seen": 0,
"num_train_epochs": 60,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.2159619256203346e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}