{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.4995408631772268,
"eval_steps": 136,
"global_step": 272,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0018365472910927456,
"grad_norm": 0.49180246368465724,
"learning_rate": 0.0,
"loss": 1.3574,
"step": 1
},
{
"epoch": 0.0018365472910927456,
"eval_loss": 2.4259753227233887,
"eval_runtime": 39.8169,
"eval_samples_per_second": 5.149,
"eval_steps_per_second": 0.452,
"step": 1
},
{
"epoch": 0.0036730945821854912,
"grad_norm": 0.3994587341086747,
"learning_rate": 2.5000000000000004e-07,
"loss": 1.444,
"step": 2
},
{
"epoch": 0.005509641873278237,
"grad_norm": 0.39017597145072364,
"learning_rate": 5.000000000000001e-07,
"loss": 1.5149,
"step": 3
},
{
"epoch": 0.0073461891643709825,
"grad_norm": 0.47444439480391987,
"learning_rate": 7.5e-07,
"loss": 1.5855,
"step": 4
},
{
"epoch": 0.009182736455463728,
"grad_norm": 0.794434599507836,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.6531,
"step": 5
},
{
"epoch": 0.011019283746556474,
"grad_norm": 0.7108153394196348,
"learning_rate": 1.25e-06,
"loss": 1.7881,
"step": 6
},
{
"epoch": 0.012855831037649219,
"grad_norm": 0.5204076611421804,
"learning_rate": 1.5e-06,
"loss": 1.5592,
"step": 7
},
{
"epoch": 0.014692378328741965,
"grad_norm": 0.7569351115766059,
"learning_rate": 1.75e-06,
"loss": 1.5735,
"step": 8
},
{
"epoch": 0.01652892561983471,
"grad_norm": 0.5678098307469198,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.5934,
"step": 9
},
{
"epoch": 0.018365472910927456,
"grad_norm": 0.40098167184732914,
"learning_rate": 2.25e-06,
"loss": 1.688,
"step": 10
},
{
"epoch": 0.020202020202020204,
"grad_norm": 0.4268491869708594,
"learning_rate": 2.5e-06,
"loss": 1.6286,
"step": 11
},
{
"epoch": 0.02203856749311295,
"grad_norm": 0.5835872767276206,
"learning_rate": 2.7500000000000004e-06,
"loss": 1.679,
"step": 12
},
{
"epoch": 0.023875114784205693,
"grad_norm": 0.41861495783037117,
"learning_rate": 3e-06,
"loss": 1.7093,
"step": 13
},
{
"epoch": 0.025711662075298437,
"grad_norm": 0.3375113108661923,
"learning_rate": 3.2500000000000002e-06,
"loss": 1.7431,
"step": 14
},
{
"epoch": 0.027548209366391185,
"grad_norm": 0.48885555509704504,
"learning_rate": 3.5e-06,
"loss": 1.4873,
"step": 15
},
{
"epoch": 0.02938475665748393,
"grad_norm": 0.4483288005954393,
"learning_rate": 3.7500000000000005e-06,
"loss": 1.6619,
"step": 16
},
{
"epoch": 0.031221303948576674,
"grad_norm": 0.4299802045225574,
"learning_rate": 4.000000000000001e-06,
"loss": 1.6997,
"step": 17
},
{
"epoch": 0.03305785123966942,
"grad_norm": 0.5824485228901973,
"learning_rate": 4.25e-06,
"loss": 1.691,
"step": 18
},
{
"epoch": 0.03489439853076217,
"grad_norm": 0.32860558551239133,
"learning_rate": 4.5e-06,
"loss": 1.5414,
"step": 19
},
{
"epoch": 0.03673094582185491,
"grad_norm": 1.7514560949718712,
"learning_rate": 4.75e-06,
"loss": 1.522,
"step": 20
},
{
"epoch": 0.03856749311294766,
"grad_norm": 0.3844467204577219,
"learning_rate": 5e-06,
"loss": 1.6948,
"step": 21
},
{
"epoch": 0.04040404040404041,
"grad_norm": 0.329345071029554,
"learning_rate": 5.2500000000000006e-06,
"loss": 1.509,
"step": 22
},
{
"epoch": 0.04224058769513315,
"grad_norm": 0.28268786055882505,
"learning_rate": 5.500000000000001e-06,
"loss": 1.5563,
"step": 23
},
{
"epoch": 0.0440771349862259,
"grad_norm": 0.3422617324754064,
"learning_rate": 5.75e-06,
"loss": 1.5615,
"step": 24
},
{
"epoch": 0.04591368227731864,
"grad_norm": 0.33860326348440456,
"learning_rate": 6e-06,
"loss": 1.7008,
"step": 25
},
{
"epoch": 0.047750229568411386,
"grad_norm": 0.2789219845798442,
"learning_rate": 6.25e-06,
"loss": 1.5667,
"step": 26
},
{
"epoch": 0.049586776859504134,
"grad_norm": 0.37047693270617005,
"learning_rate": 6.5000000000000004e-06,
"loss": 1.5557,
"step": 27
},
{
"epoch": 0.051423324150596875,
"grad_norm": 0.3368470842339362,
"learning_rate": 6.750000000000001e-06,
"loss": 1.6045,
"step": 28
},
{
"epoch": 0.05325987144168962,
"grad_norm": 0.30344837893854465,
"learning_rate": 7e-06,
"loss": 1.6766,
"step": 29
},
{
"epoch": 0.05509641873278237,
"grad_norm": 0.6359004570618099,
"learning_rate": 7.25e-06,
"loss": 1.6499,
"step": 30
},
{
"epoch": 0.05693296602387511,
"grad_norm": 0.3258591226646554,
"learning_rate": 7.500000000000001e-06,
"loss": 1.6565,
"step": 31
},
{
"epoch": 0.05876951331496786,
"grad_norm": 0.3697834149605013,
"learning_rate": 7.75e-06,
"loss": 1.4883,
"step": 32
},
{
"epoch": 0.06060606060606061,
"grad_norm": 0.265668853003883,
"learning_rate": 8.000000000000001e-06,
"loss": 1.8289,
"step": 33
},
{
"epoch": 0.06244260789715335,
"grad_norm": 0.39987679350668315,
"learning_rate": 8.25e-06,
"loss": 1.6633,
"step": 34
},
{
"epoch": 0.0642791551882461,
"grad_norm": 0.39636333420956993,
"learning_rate": 8.5e-06,
"loss": 1.2298,
"step": 35
},
{
"epoch": 0.06611570247933884,
"grad_norm": 0.3464362348603123,
"learning_rate": 8.750000000000001e-06,
"loss": 1.485,
"step": 36
},
{
"epoch": 0.06795224977043159,
"grad_norm": 0.31616445355046285,
"learning_rate": 9e-06,
"loss": 1.5553,
"step": 37
},
{
"epoch": 0.06978879706152434,
"grad_norm": 0.4441888412468937,
"learning_rate": 9.250000000000001e-06,
"loss": 1.5967,
"step": 38
},
{
"epoch": 0.07162534435261708,
"grad_norm": 0.3547318554597848,
"learning_rate": 9.5e-06,
"loss": 1.5006,
"step": 39
},
{
"epoch": 0.07346189164370982,
"grad_norm": 0.27557088472944596,
"learning_rate": 9.75e-06,
"loss": 1.5673,
"step": 40
},
{
"epoch": 0.07529843893480258,
"grad_norm": 0.2956312497066916,
"learning_rate": 1e-05,
"loss": 1.4936,
"step": 41
},
{
"epoch": 0.07713498622589532,
"grad_norm": 0.2322611308466898,
"learning_rate": 9.999994591993822e-06,
"loss": 1.4759,
"step": 42
},
{
"epoch": 0.07897153351698806,
"grad_norm": 0.22039029040435731,
"learning_rate": 9.999978367986988e-06,
"loss": 1.6625,
"step": 43
},
{
"epoch": 0.08080808080808081,
"grad_norm": 0.3446344059315139,
"learning_rate": 9.999951328014591e-06,
"loss": 1.6309,
"step": 44
},
{
"epoch": 0.08264462809917356,
"grad_norm": 0.23666900781940328,
"learning_rate": 9.999913472135126e-06,
"loss": 1.5347,
"step": 45
},
{
"epoch": 0.0844811753902663,
"grad_norm": 0.2840328883351318,
"learning_rate": 9.999864800430482e-06,
"loss": 1.5115,
"step": 46
},
{
"epoch": 0.08631772268135904,
"grad_norm": 0.2407651293731456,
"learning_rate": 9.999805313005946e-06,
"loss": 1.4199,
"step": 47
},
{
"epoch": 0.0881542699724518,
"grad_norm": 0.24925631476018442,
"learning_rate": 9.999735009990202e-06,
"loss": 1.7008,
"step": 48
},
{
"epoch": 0.08999081726354453,
"grad_norm": 0.2502728573485842,
"learning_rate": 9.99965389153533e-06,
"loss": 1.6181,
"step": 49
},
{
"epoch": 0.09182736455463728,
"grad_norm": 0.23272569963887466,
"learning_rate": 9.999561957816803e-06,
"loss": 1.4668,
"step": 50
},
{
"epoch": 0.09366391184573003,
"grad_norm": 0.23492521900860228,
"learning_rate": 9.999459209033495e-06,
"loss": 1.6884,
"step": 51
},
{
"epoch": 0.09550045913682277,
"grad_norm": 0.24231529936066795,
"learning_rate": 9.999345645407671e-06,
"loss": 1.3811,
"step": 52
},
{
"epoch": 0.09733700642791551,
"grad_norm": 0.23749137313908725,
"learning_rate": 9.999221267184993e-06,
"loss": 1.6599,
"step": 53
},
{
"epoch": 0.09917355371900827,
"grad_norm": 0.2631749052017168,
"learning_rate": 9.999086074634516e-06,
"loss": 1.5321,
"step": 54
},
{
"epoch": 0.10101010101010101,
"grad_norm": 0.265780364913351,
"learning_rate": 9.998940068048688e-06,
"loss": 1.7954,
"step": 55
},
{
"epoch": 0.10284664830119375,
"grad_norm": 0.23373755925772408,
"learning_rate": 9.998783247743353e-06,
"loss": 1.6503,
"step": 56
},
{
"epoch": 0.1046831955922865,
"grad_norm": 0.26345499971500486,
"learning_rate": 9.998615614057743e-06,
"loss": 1.6842,
"step": 57
},
{
"epoch": 0.10651974288337925,
"grad_norm": 0.22858688890828285,
"learning_rate": 9.998437167354485e-06,
"loss": 1.6053,
"step": 58
},
{
"epoch": 0.10835629017447199,
"grad_norm": 0.22732836541277007,
"learning_rate": 9.998247908019594e-06,
"loss": 1.5357,
"step": 59
},
{
"epoch": 0.11019283746556474,
"grad_norm": 0.21575052353545032,
"learning_rate": 9.998047836462476e-06,
"loss": 1.5875,
"step": 60
},
{
"epoch": 0.11202938475665748,
"grad_norm": 0.3308350003540223,
"learning_rate": 9.997836953115927e-06,
"loss": 1.9333,
"step": 61
},
{
"epoch": 0.11386593204775022,
"grad_norm": 0.24160639887541288,
"learning_rate": 9.99761525843613e-06,
"loss": 1.6121,
"step": 62
},
{
"epoch": 0.11570247933884298,
"grad_norm": 0.23537591666009566,
"learning_rate": 9.997382752902658e-06,
"loss": 1.6341,
"step": 63
},
{
"epoch": 0.11753902662993572,
"grad_norm": 0.30577270648255184,
"learning_rate": 9.997139437018463e-06,
"loss": 1.4635,
"step": 64
},
{
"epoch": 0.11937557392102846,
"grad_norm": 0.24971896873950858,
"learning_rate": 9.996885311309892e-06,
"loss": 1.8186,
"step": 65
},
{
"epoch": 0.12121212121212122,
"grad_norm": 0.25847791225683553,
"learning_rate": 9.996620376326667e-06,
"loss": 1.491,
"step": 66
},
{
"epoch": 0.12304866850321396,
"grad_norm": 0.24031460889715148,
"learning_rate": 9.996344632641895e-06,
"loss": 1.6563,
"step": 67
},
{
"epoch": 0.1248852157943067,
"grad_norm": 0.2550524831797301,
"learning_rate": 9.996058080852067e-06,
"loss": 1.6427,
"step": 68
},
{
"epoch": 0.12672176308539945,
"grad_norm": 0.22467108780123965,
"learning_rate": 9.995760721577053e-06,
"loss": 1.4595,
"step": 69
},
{
"epoch": 0.1285583103764922,
"grad_norm": 0.282181674918973,
"learning_rate": 9.995452555460098e-06,
"loss": 1.6837,
"step": 70
},
{
"epoch": 0.13039485766758493,
"grad_norm": 0.5727450301366364,
"learning_rate": 9.995133583167833e-06,
"loss": 1.6003,
"step": 71
},
{
"epoch": 0.1322314049586777,
"grad_norm": 0.22744917091853156,
"learning_rate": 9.994803805390257e-06,
"loss": 1.3544,
"step": 72
},
{
"epoch": 0.13406795224977044,
"grad_norm": 0.22395196659772004,
"learning_rate": 9.994463222840748e-06,
"loss": 1.6198,
"step": 73
},
{
"epoch": 0.13590449954086317,
"grad_norm": 0.2098292344869194,
"learning_rate": 9.994111836256049e-06,
"loss": 1.5059,
"step": 74
},
{
"epoch": 0.13774104683195593,
"grad_norm": 0.22329332454983955,
"learning_rate": 9.993749646396286e-06,
"loss": 1.4666,
"step": 75
},
{
"epoch": 0.13957759412304868,
"grad_norm": 1.131446258847722,
"learning_rate": 9.993376654044948e-06,
"loss": 1.5164,
"step": 76
},
{
"epoch": 0.1414141414141414,
"grad_norm": 0.22716717526603358,
"learning_rate": 9.992992860008893e-06,
"loss": 1.4978,
"step": 77
},
{
"epoch": 0.14325068870523416,
"grad_norm": 0.24179796579866508,
"learning_rate": 9.992598265118344e-06,
"loss": 1.3147,
"step": 78
},
{
"epoch": 0.14508723599632692,
"grad_norm": 0.2127244611718112,
"learning_rate": 9.99219287022689e-06,
"loss": 1.4498,
"step": 79
},
{
"epoch": 0.14692378328741965,
"grad_norm": 0.27213431233903435,
"learning_rate": 9.991776676211483e-06,
"loss": 1.8822,
"step": 80
},
{
"epoch": 0.1487603305785124,
"grad_norm": 0.22988463490869612,
"learning_rate": 9.991349683972435e-06,
"loss": 1.4995,
"step": 81
},
{
"epoch": 0.15059687786960516,
"grad_norm": 0.23481878477688978,
"learning_rate": 9.990911894433415e-06,
"loss": 1.5132,
"step": 82
},
{
"epoch": 0.15243342516069788,
"grad_norm": 0.37026228973170716,
"learning_rate": 9.990463308541452e-06,
"loss": 1.5846,
"step": 83
},
{
"epoch": 0.15426997245179064,
"grad_norm": 0.34328184021255104,
"learning_rate": 9.990003927266928e-06,
"loss": 1.5882,
"step": 84
},
{
"epoch": 0.1561065197428834,
"grad_norm": 0.2239251502463014,
"learning_rate": 9.989533751603578e-06,
"loss": 1.5294,
"step": 85
},
{
"epoch": 0.15794306703397612,
"grad_norm": 0.2531093496713428,
"learning_rate": 9.989052782568484e-06,
"loss": 1.4136,
"step": 86
},
{
"epoch": 0.15977961432506887,
"grad_norm": 0.22960377831673873,
"learning_rate": 9.988561021202083e-06,
"loss": 1.5179,
"step": 87
},
{
"epoch": 0.16161616161616163,
"grad_norm": 0.2353475082843009,
"learning_rate": 9.988058468568154e-06,
"loss": 1.5986,
"step": 88
},
{
"epoch": 0.16345270890725436,
"grad_norm": 0.2095228695568511,
"learning_rate": 9.987545125753818e-06,
"loss": 1.7077,
"step": 89
},
{
"epoch": 0.1652892561983471,
"grad_norm": 0.25954954331361235,
"learning_rate": 9.987020993869543e-06,
"loss": 1.5786,
"step": 90
},
{
"epoch": 0.16712580348943984,
"grad_norm": 0.3475458310303135,
"learning_rate": 9.986486074049131e-06,
"loss": 1.6145,
"step": 91
},
{
"epoch": 0.1689623507805326,
"grad_norm": 0.26936951415786353,
"learning_rate": 9.98594036744972e-06,
"loss": 1.7652,
"step": 92
},
{
"epoch": 0.17079889807162535,
"grad_norm": 0.21373420050407635,
"learning_rate": 9.985383875251783e-06,
"loss": 1.6871,
"step": 93
},
{
"epoch": 0.17263544536271808,
"grad_norm": 0.23205711833790327,
"learning_rate": 9.98481659865913e-06,
"loss": 1.4672,
"step": 94
},
{
"epoch": 0.17447199265381083,
"grad_norm": 0.26381334544086116,
"learning_rate": 9.98423853889889e-06,
"loss": 1.6741,
"step": 95
},
{
"epoch": 0.1763085399449036,
"grad_norm": 0.24396664978240085,
"learning_rate": 9.983649697221528e-06,
"loss": 1.5546,
"step": 96
},
{
"epoch": 0.1781450872359963,
"grad_norm": 0.2533801841511916,
"learning_rate": 9.983050074900824e-06,
"loss": 1.6902,
"step": 97
},
{
"epoch": 0.17998163452708907,
"grad_norm": 0.2115479895852038,
"learning_rate": 9.982439673233885e-06,
"loss": 1.6094,
"step": 98
},
{
"epoch": 0.18181818181818182,
"grad_norm": 0.22659674725493387,
"learning_rate": 9.98181849354113e-06,
"loss": 1.3953,
"step": 99
},
{
"epoch": 0.18365472910927455,
"grad_norm": 0.4346020173346875,
"learning_rate": 9.981186537166301e-06,
"loss": 1.4842,
"step": 100
},
{
"epoch": 0.1854912764003673,
"grad_norm": 0.22752027491511684,
"learning_rate": 9.980543805476447e-06,
"loss": 1.6125,
"step": 101
},
{
"epoch": 0.18732782369146006,
"grad_norm": 0.2178158645117292,
"learning_rate": 9.979890299861923e-06,
"loss": 1.3843,
"step": 102
},
{
"epoch": 0.1891643709825528,
"grad_norm": 0.2629457576326313,
"learning_rate": 9.979226021736396e-06,
"loss": 1.5034,
"step": 103
},
{
"epoch": 0.19100091827364554,
"grad_norm": 0.2638758563035785,
"learning_rate": 9.978550972536834e-06,
"loss": 1.2882,
"step": 104
},
{
"epoch": 0.1928374655647383,
"grad_norm": 0.25957055432676646,
"learning_rate": 9.977865153723508e-06,
"loss": 1.5669,
"step": 105
},
{
"epoch": 0.19467401285583102,
"grad_norm": 0.2107272760638706,
"learning_rate": 9.977168566779976e-06,
"loss": 1.4316,
"step": 106
},
{
"epoch": 0.19651056014692378,
"grad_norm": 0.23996444946668347,
"learning_rate": 9.976461213213104e-06,
"loss": 1.5958,
"step": 107
},
{
"epoch": 0.19834710743801653,
"grad_norm": 0.3866853138626082,
"learning_rate": 9.975743094553037e-06,
"loss": 1.6301,
"step": 108
},
{
"epoch": 0.20018365472910926,
"grad_norm": 0.2659575860373734,
"learning_rate": 9.975014212353212e-06,
"loss": 1.6435,
"step": 109
},
{
"epoch": 0.20202020202020202,
"grad_norm": 0.25698703101518205,
"learning_rate": 9.974274568190349e-06,
"loss": 1.4947,
"step": 110
},
{
"epoch": 0.20385674931129477,
"grad_norm": 0.23394729883248447,
"learning_rate": 9.973524163664447e-06,
"loss": 1.5486,
"step": 111
},
{
"epoch": 0.2056932966023875,
"grad_norm": 0.2682338169486529,
"learning_rate": 9.972763000398784e-06,
"loss": 1.562,
"step": 112
},
{
"epoch": 0.20752984389348025,
"grad_norm": 0.22594240276292787,
"learning_rate": 9.971991080039912e-06,
"loss": 1.6665,
"step": 113
},
{
"epoch": 0.209366391184573,
"grad_norm": 0.2716099274981758,
"learning_rate": 9.971208404257647e-06,
"loss": 1.6069,
"step": 114
},
{
"epoch": 0.21120293847566574,
"grad_norm": 0.215642600558555,
"learning_rate": 9.970414974745077e-06,
"loss": 1.426,
"step": 115
},
{
"epoch": 0.2130394857667585,
"grad_norm": 0.26192594390837787,
"learning_rate": 9.96961079321855e-06,
"loss": 1.4929,
"step": 116
},
{
"epoch": 0.21487603305785125,
"grad_norm": 0.24076680870279565,
"learning_rate": 9.968795861417676e-06,
"loss": 1.5116,
"step": 117
},
{
"epoch": 0.21671258034894397,
"grad_norm": 0.20785778608018918,
"learning_rate": 9.967970181105315e-06,
"loss": 1.4824,
"step": 118
},
{
"epoch": 0.21854912764003673,
"grad_norm": 0.2474137833375562,
"learning_rate": 9.967133754067581e-06,
"loss": 1.6394,
"step": 119
},
{
"epoch": 0.22038567493112948,
"grad_norm": 0.2415950163286452,
"learning_rate": 9.966286582113838e-06,
"loss": 1.4747,
"step": 120
},
{
"epoch": 0.2222222222222222,
"grad_norm": 0.25397048864143756,
"learning_rate": 9.965428667076687e-06,
"loss": 1.626,
"step": 121
},
{
"epoch": 0.22405876951331496,
"grad_norm": 0.27597092429735764,
"learning_rate": 9.964560010811972e-06,
"loss": 1.4853,
"step": 122
},
{
"epoch": 0.22589531680440772,
"grad_norm": 0.2773282618369793,
"learning_rate": 9.963680615198774e-06,
"loss": 1.2673,
"step": 123
},
{
"epoch": 0.22773186409550045,
"grad_norm": 0.2863982446521439,
"learning_rate": 9.962790482139402e-06,
"loss": 1.4531,
"step": 124
},
{
"epoch": 0.2295684113865932,
"grad_norm": 0.24940341962059176,
"learning_rate": 9.961889613559396e-06,
"loss": 1.5392,
"step": 125
},
{
"epoch": 0.23140495867768596,
"grad_norm": 0.2692207944659178,
"learning_rate": 9.960978011407516e-06,
"loss": 1.3452,
"step": 126
},
{
"epoch": 0.23324150596877868,
"grad_norm": 0.2394265357414062,
"learning_rate": 9.960055677655743e-06,
"loss": 1.4267,
"step": 127
},
{
"epoch": 0.23507805325987144,
"grad_norm": 0.2396211231483559,
"learning_rate": 9.95912261429927e-06,
"loss": 1.669,
"step": 128
},
{
"epoch": 0.2369146005509642,
"grad_norm": 0.22480647337089882,
"learning_rate": 9.958178823356503e-06,
"loss": 1.3276,
"step": 129
},
{
"epoch": 0.23875114784205692,
"grad_norm": 0.2322223276315898,
"learning_rate": 9.957224306869053e-06,
"loss": 1.6457,
"step": 130
},
{
"epoch": 0.24058769513314968,
"grad_norm": 0.28675358511507826,
"learning_rate": 9.956259066901733e-06,
"loss": 1.4844,
"step": 131
},
{
"epoch": 0.24242424242424243,
"grad_norm": 0.23506904729263256,
"learning_rate": 9.955283105542551e-06,
"loss": 1.5401,
"step": 132
},
{
"epoch": 0.24426078971533516,
"grad_norm": 0.25409979083106216,
"learning_rate": 9.954296424902709e-06,
"loss": 1.5796,
"step": 133
},
{
"epoch": 0.2460973370064279,
"grad_norm": 0.27537727017693314,
"learning_rate": 9.953299027116598e-06,
"loss": 1.7015,
"step": 134
},
{
"epoch": 0.24793388429752067,
"grad_norm": 0.255527421712665,
"learning_rate": 9.95229091434179e-06,
"loss": 1.6943,
"step": 135
},
{
"epoch": 0.2497704315886134,
"grad_norm": 0.2758931212786201,
"learning_rate": 9.95127208875904e-06,
"loss": 1.6373,
"step": 136
},
{
"epoch": 0.2497704315886134,
"eval_loss": 2.332951307296753,
"eval_runtime": 38.5842,
"eval_samples_per_second": 5.313,
"eval_steps_per_second": 0.467,
"step": 136
},
{
"epoch": 0.2516069788797062,
"grad_norm": 0.4849948087122718,
"learning_rate": 9.950242552572272e-06,
"loss": 1.5843,
"step": 137
},
{
"epoch": 0.2534435261707989,
"grad_norm": 0.27082023040786063,
"learning_rate": 9.949202308008581e-06,
"loss": 1.466,
"step": 138
},
{
"epoch": 0.25528007346189163,
"grad_norm": 0.21412319589684112,
"learning_rate": 9.948151357318228e-06,
"loss": 1.5575,
"step": 139
},
{
"epoch": 0.2571166207529844,
"grad_norm": 0.2630300060870272,
"learning_rate": 9.94708970277463e-06,
"loss": 1.8459,
"step": 140
},
{
"epoch": 0.25895316804407714,
"grad_norm": 0.29584228360860926,
"learning_rate": 9.946017346674362e-06,
"loss": 1.4765,
"step": 141
},
{
"epoch": 0.26078971533516987,
"grad_norm": 0.20420849217608372,
"learning_rate": 9.944934291337146e-06,
"loss": 1.3963,
"step": 142
},
{
"epoch": 0.26262626262626265,
"grad_norm": 0.2383087119690396,
"learning_rate": 9.943840539105853e-06,
"loss": 1.6948,
"step": 143
},
{
"epoch": 0.2644628099173554,
"grad_norm": 0.22707287878559146,
"learning_rate": 9.942736092346487e-06,
"loss": 1.5627,
"step": 144
},
{
"epoch": 0.2662993572084481,
"grad_norm": 0.2209791613669926,
"learning_rate": 9.941620953448195e-06,
"loss": 1.6011,
"step": 145
},
{
"epoch": 0.2681359044995409,
"grad_norm": 0.23620122895922513,
"learning_rate": 9.940495124823241e-06,
"loss": 1.4172,
"step": 146
},
{
"epoch": 0.2699724517906336,
"grad_norm": 0.4575809432111337,
"learning_rate": 9.939358608907026e-06,
"loss": 1.8045,
"step": 147
},
{
"epoch": 0.27180899908172634,
"grad_norm": 0.25576228816691954,
"learning_rate": 9.938211408158063e-06,
"loss": 1.5559,
"step": 148
},
{
"epoch": 0.2736455463728191,
"grad_norm": 0.32262207355286887,
"learning_rate": 9.937053525057977e-06,
"loss": 1.6491,
"step": 149
},
{
"epoch": 0.27548209366391185,
"grad_norm": 0.29602023759033497,
"learning_rate": 9.935884962111506e-06,
"loss": 1.6518,
"step": 150
},
{
"epoch": 0.2773186409550046,
"grad_norm": 0.24182886602369053,
"learning_rate": 9.934705721846487e-06,
"loss": 1.5457,
"step": 151
},
{
"epoch": 0.27915518824609736,
"grad_norm": 0.2247571699952089,
"learning_rate": 9.933515806813856e-06,
"loss": 1.481,
"step": 152
},
{
"epoch": 0.2809917355371901,
"grad_norm": 0.24890915769392072,
"learning_rate": 9.932315219587641e-06,
"loss": 1.618,
"step": 153
},
{
"epoch": 0.2828282828282828,
"grad_norm": 0.2597194625874549,
"learning_rate": 9.931103962764955e-06,
"loss": 1.5653,
"step": 154
},
{
"epoch": 0.2846648301193756,
"grad_norm": 0.23725030845640346,
"learning_rate": 9.92988203896599e-06,
"loss": 1.4677,
"step": 155
},
{
"epoch": 0.2865013774104683,
"grad_norm": 0.20619150412894088,
"learning_rate": 9.928649450834015e-06,
"loss": 1.6779,
"step": 156
},
{
"epoch": 0.28833792470156105,
"grad_norm": 0.23919303859135752,
"learning_rate": 9.927406201035368e-06,
"loss": 1.4417,
"step": 157
},
{
"epoch": 0.29017447199265384,
"grad_norm": 0.2244668514529571,
"learning_rate": 9.926152292259452e-06,
"loss": 1.7272,
"step": 158
},
{
"epoch": 0.29201101928374656,
"grad_norm": 0.26954841337527125,
"learning_rate": 9.924887727218724e-06,
"loss": 1.6068,
"step": 159
},
{
"epoch": 0.2938475665748393,
"grad_norm": 0.32951953455251004,
"learning_rate": 9.923612508648693e-06,
"loss": 1.406,
"step": 160
},
{
"epoch": 0.2956841138659321,
"grad_norm": 0.2698529792514822,
"learning_rate": 9.922326639307918e-06,
"loss": 1.683,
"step": 161
},
{
"epoch": 0.2975206611570248,
"grad_norm": 0.22420186125700636,
"learning_rate": 9.921030121977992e-06,
"loss": 1.5398,
"step": 162
},
{
"epoch": 0.29935720844811753,
"grad_norm": 0.24335042870850987,
"learning_rate": 9.919722959463545e-06,
"loss": 1.5752,
"step": 163
},
{
"epoch": 0.3011937557392103,
"grad_norm": 0.272993009874352,
"learning_rate": 9.918405154592234e-06,
"loss": 1.6791,
"step": 164
},
{
"epoch": 0.30303030303030304,
"grad_norm": 0.36447396005561716,
"learning_rate": 9.917076710214739e-06,
"loss": 1.8303,
"step": 165
},
{
"epoch": 0.30486685032139577,
"grad_norm": 0.266743073761148,
"learning_rate": 9.915737629204754e-06,
"loss": 1.652,
"step": 166
},
{
"epoch": 0.30670339761248855,
"grad_norm": 0.22394219745904015,
"learning_rate": 9.914387914458983e-06,
"loss": 1.5615,
"step": 167
},
{
"epoch": 0.3085399449035813,
"grad_norm": 0.23075889412968112,
"learning_rate": 9.91302756889713e-06,
"loss": 1.6447,
"step": 168
},
{
"epoch": 0.310376492194674,
"grad_norm": 0.2441975693564735,
"learning_rate": 9.911656595461899e-06,
"loss": 1.5981,
"step": 169
},
{
"epoch": 0.3122130394857668,
"grad_norm": 0.21377545765414097,
"learning_rate": 9.910274997118982e-06,
"loss": 1.6066,
"step": 170
},
{
"epoch": 0.3140495867768595,
"grad_norm": 0.21518096806340806,
"learning_rate": 9.908882776857057e-06,
"loss": 1.199,
"step": 171
},
{
"epoch": 0.31588613406795224,
"grad_norm": 0.22989533865542952,
"learning_rate": 9.907479937687779e-06,
"loss": 1.6019,
"step": 172
},
{
"epoch": 0.317722681359045,
"grad_norm": 0.24187157970795195,
"learning_rate": 9.906066482645774e-06,
"loss": 1.5453,
"step": 173
},
{
"epoch": 0.31955922865013775,
"grad_norm": 0.28880156295741904,
"learning_rate": 9.904642414788627e-06,
"loss": 1.6518,
"step": 174
},
{
"epoch": 0.3213957759412305,
"grad_norm": 0.3064989299701211,
"learning_rate": 9.903207737196892e-06,
"loss": 1.5614,
"step": 175
},
{
"epoch": 0.32323232323232326,
"grad_norm": 0.5501951504862674,
"learning_rate": 9.90176245297406e-06,
"loss": 1.3988,
"step": 176
},
{
"epoch": 0.325068870523416,
"grad_norm": 0.23086867624049656,
"learning_rate": 9.900306565246579e-06,
"loss": 1.4688,
"step": 177
},
{
"epoch": 0.3269054178145087,
"grad_norm": 0.24109267951104685,
"learning_rate": 9.898840077163824e-06,
"loss": 1.342,
"step": 178
},
{
"epoch": 0.3287419651056015,
"grad_norm": 0.221519141054226,
"learning_rate": 9.89736299189811e-06,
"loss": 1.6468,
"step": 179
},
{
"epoch": 0.3305785123966942,
"grad_norm": 0.2134481965910651,
"learning_rate": 9.89587531264467e-06,
"loss": 1.4762,
"step": 180
},
{
"epoch": 0.33241505968778695,
"grad_norm": 0.30273218459666634,
"learning_rate": 9.894377042621654e-06,
"loss": 1.2952,
"step": 181
},
{
"epoch": 0.3342516069788797,
"grad_norm": 0.2643249012553608,
"learning_rate": 9.892868185070125e-06,
"loss": 1.5922,
"step": 182
},
{
"epoch": 0.33608815426997246,
"grad_norm": 0.3017798778461224,
"learning_rate": 9.891348743254046e-06,
"loss": 1.3327,
"step": 183
},
{
"epoch": 0.3379247015610652,
"grad_norm": 0.198099058736144,
"learning_rate": 9.889818720460281e-06,
"loss": 1.4528,
"step": 184
},
{
"epoch": 0.3397612488521579,
"grad_norm": 0.2342720457991802,
"learning_rate": 9.888278119998573e-06,
"loss": 1.396,
"step": 185
},
{
"epoch": 0.3415977961432507,
"grad_norm": 0.30360601805904236,
"learning_rate": 9.886726945201556e-06,
"loss": 1.5982,
"step": 186
},
{
"epoch": 0.3434343434343434,
"grad_norm": 0.2890684354336481,
"learning_rate": 9.885165199424738e-06,
"loss": 1.3935,
"step": 187
},
{
"epoch": 0.34527089072543615,
"grad_norm": 0.2574557316071565,
"learning_rate": 9.883592886046486e-06,
"loss": 1.449,
"step": 188
},
{
"epoch": 0.34710743801652894,
"grad_norm": 0.23195985985184547,
"learning_rate": 9.882010008468038e-06,
"loss": 1.4909,
"step": 189
},
{
"epoch": 0.34894398530762166,
"grad_norm": 0.2539011848151687,
"learning_rate": 9.880416570113472e-06,
"loss": 1.5123,
"step": 190
},
{
"epoch": 0.3507805325987144,
"grad_norm": 0.26954455617431194,
"learning_rate": 9.878812574429722e-06,
"loss": 1.4445,
"step": 191
},
{
"epoch": 0.3526170798898072,
"grad_norm": 0.23141695311529595,
"learning_rate": 9.877198024886553e-06,
"loss": 1.9045,
"step": 192
},
{
"epoch": 0.3544536271808999,
"grad_norm": 0.2750943021523168,
"learning_rate": 9.875572924976568e-06,
"loss": 1.5868,
"step": 193
},
{
"epoch": 0.3562901744719926,
"grad_norm": 0.23421966078059409,
"learning_rate": 9.873937278215181e-06,
"loss": 1.7534,
"step": 194
},
{
"epoch": 0.3581267217630854,
"grad_norm": 0.24409550991932608,
"learning_rate": 9.87229108814063e-06,
"loss": 1.4753,
"step": 195
},
{
"epoch": 0.35996326905417814,
"grad_norm": 0.24998063066925763,
"learning_rate": 9.870634358313956e-06,
"loss": 1.6574,
"step": 196
},
{
"epoch": 0.36179981634527086,
"grad_norm": 0.2630317204109116,
"learning_rate": 9.868967092319003e-06,
"loss": 1.6794,
"step": 197
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.21473616516429722,
"learning_rate": 9.867289293762403e-06,
"loss": 1.7626,
"step": 198
},
{
"epoch": 0.3654729109274564,
"grad_norm": 0.3020374891635284,
"learning_rate": 9.865600966273576e-06,
"loss": 1.5234,
"step": 199
},
{
"epoch": 0.3673094582185491,
"grad_norm": 0.24122137539024896,
"learning_rate": 9.863902113504713e-06,
"loss": 1.5807,
"step": 200
},
{
"epoch": 0.3691460055096419,
"grad_norm": 0.2240345557985464,
"learning_rate": 9.86219273913078e-06,
"loss": 1.4635,
"step": 201
},
{
"epoch": 0.3709825528007346,
"grad_norm": 0.2561505890958098,
"learning_rate": 9.860472846849498e-06,
"loss": 1.5781,
"step": 202
},
{
"epoch": 0.37281910009182734,
"grad_norm": 0.2629334369467521,
"learning_rate": 9.858742440381343e-06,
"loss": 1.3698,
"step": 203
},
{
"epoch": 0.3746556473829201,
"grad_norm": 0.2643820816427679,
"learning_rate": 9.857001523469534e-06,
"loss": 1.6831,
"step": 204
},
{
"epoch": 0.37649219467401285,
"grad_norm": 0.27568924761133784,
"learning_rate": 9.855250099880026e-06,
"loss": 1.5586,
"step": 205
},
{
"epoch": 0.3783287419651056,
"grad_norm": 0.23562831853645116,
"learning_rate": 9.853488173401504e-06,
"loss": 1.4529,
"step": 206
},
{
"epoch": 0.38016528925619836,
"grad_norm": 0.24270051421435648,
"learning_rate": 9.851715747845372e-06,
"loss": 1.5449,
"step": 207
},
{
"epoch": 0.3820018365472911,
"grad_norm": 0.3141794855298513,
"learning_rate": 9.849932827045746e-06,
"loss": 1.6011,
"step": 208
},
{
"epoch": 0.3838383838383838,
"grad_norm": 0.2347451989106533,
"learning_rate": 9.848139414859441e-06,
"loss": 1.7994,
"step": 209
},
{
"epoch": 0.3856749311294766,
"grad_norm": 0.22998314428761488,
"learning_rate": 9.846335515165974e-06,
"loss": 1.5024,
"step": 210
},
{
"epoch": 0.3875114784205693,
"grad_norm": 0.2914601119925799,
"learning_rate": 9.844521131867546e-06,
"loss": 1.6718,
"step": 211
},
{
"epoch": 0.38934802571166205,
"grad_norm": 0.27307242607178106,
"learning_rate": 9.842696268889032e-06,
"loss": 1.6705,
"step": 212
},
{
"epoch": 0.39118457300275483,
"grad_norm": 0.26989044058601575,
"learning_rate": 9.840860930177984e-06,
"loss": 1.5762,
"step": 213
},
{
"epoch": 0.39302112029384756,
"grad_norm": 0.23544376195311703,
"learning_rate": 9.839015119704607e-06,
"loss": 1.4928,
"step": 214
},
{
"epoch": 0.3948576675849403,
"grad_norm": 0.40380733954206144,
"learning_rate": 9.837158841461767e-06,
"loss": 1.5587,
"step": 215
},
{
"epoch": 0.39669421487603307,
"grad_norm": 0.2416750029017525,
"learning_rate": 9.835292099464965e-06,
"loss": 1.4265,
"step": 216
},
{
"epoch": 0.3985307621671258,
"grad_norm": 0.20943162169896912,
"learning_rate": 9.833414897752346e-06,
"loss": 1.5836,
"step": 217
},
{
"epoch": 0.4003673094582185,
"grad_norm": 0.24319317049956413,
"learning_rate": 9.831527240384677e-06,
"loss": 1.6154,
"step": 218
},
{
"epoch": 0.4022038567493113,
"grad_norm": 0.2375469014345421,
"learning_rate": 9.829629131445342e-06,
"loss": 1.5652,
"step": 219
},
{
"epoch": 0.40404040404040403,
"grad_norm": 0.27739989413008637,
"learning_rate": 9.827720575040335e-06,
"loss": 1.5439,
"step": 220
},
{
"epoch": 0.40587695133149676,
"grad_norm": 0.24247147004672653,
"learning_rate": 9.825801575298248e-06,
"loss": 1.4613,
"step": 221
},
{
"epoch": 0.40771349862258954,
"grad_norm": 0.29413816874548104,
"learning_rate": 9.82387213637027e-06,
"loss": 1.4005,
"step": 222
},
{
"epoch": 0.40955004591368227,
"grad_norm": 0.21318983016138546,
"learning_rate": 9.821932262430164e-06,
"loss": 1.4747,
"step": 223
},
{
"epoch": 0.411386593204775,
"grad_norm": 0.24655890709082884,
"learning_rate": 9.819981957674273e-06,
"loss": 1.6599,
"step": 224
},
{
"epoch": 0.4132231404958678,
"grad_norm": 0.23420376177876423,
"learning_rate": 9.818021226321502e-06,
"loss": 1.5039,
"step": 225
},
{
"epoch": 0.4150596877869605,
"grad_norm": 0.23379966460981552,
"learning_rate": 9.816050072613306e-06,
"loss": 1.775,
"step": 226
},
{
"epoch": 0.41689623507805323,
"grad_norm": 0.30210714633151925,
"learning_rate": 9.814068500813692e-06,
"loss": 1.5692,
"step": 227
},
{
"epoch": 0.418732782369146,
"grad_norm": 0.24124025669836516,
"learning_rate": 9.812076515209201e-06,
"loss": 1.5682,
"step": 228
},
{
"epoch": 0.42056932966023874,
"grad_norm": 0.40694547415364124,
"learning_rate": 9.8100741201089e-06,
"loss": 1.3886,
"step": 229
},
{
"epoch": 0.42240587695133147,
"grad_norm": 0.5017797555556736,
"learning_rate": 9.808061319844376e-06,
"loss": 1.5096,
"step": 230
},
{
"epoch": 0.42424242424242425,
"grad_norm": 0.2395927998641603,
"learning_rate": 9.806038118769724e-06,
"loss": 1.6511,
"step": 231
},
{
"epoch": 0.426078971533517,
"grad_norm": 0.26346897129489716,
"learning_rate": 9.804004521261537e-06,
"loss": 1.4852,
"step": 232
},
{
"epoch": 0.4279155188246097,
"grad_norm": 0.21927054884906805,
"learning_rate": 9.801960531718898e-06,
"loss": 1.3796,
"step": 233
},
{
"epoch": 0.4297520661157025,
"grad_norm": 0.21216613988150154,
"learning_rate": 9.79990615456337e-06,
"loss": 1.3236,
"step": 234
},
{
"epoch": 0.4315886134067952,
"grad_norm": 0.2439837140444034,
"learning_rate": 9.797841394238987e-06,
"loss": 1.4752,
"step": 235
},
{
"epoch": 0.43342516069788795,
"grad_norm": 0.20264650772176668,
"learning_rate": 9.795766255212242e-06,
"loss": 1.4633,
"step": 236
},
{
"epoch": 0.43526170798898073,
"grad_norm": 0.2214142026219714,
"learning_rate": 9.793680741972084e-06,
"loss": 1.524,
"step": 237
},
{
"epoch": 0.43709825528007346,
"grad_norm": 0.32400463641987054,
"learning_rate": 9.791584859029901e-06,
"loss": 1.7717,
"step": 238
},
{
"epoch": 0.4389348025711662,
"grad_norm": 0.25353704234527624,
"learning_rate": 9.789478610919508e-06,
"loss": 1.4503,
"step": 239
},
{
"epoch": 0.44077134986225897,
"grad_norm": 0.28666088570706005,
"learning_rate": 9.787362002197147e-06,
"loss": 1.5374,
"step": 240
},
{
"epoch": 0.4426078971533517,
"grad_norm": 0.2575712788223055,
"learning_rate": 9.785235037441473e-06,
"loss": 1.3976,
"step": 241
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.235205219080783,
"learning_rate": 9.783097721253543e-06,
"loss": 1.6037,
"step": 242
},
{
"epoch": 0.4462809917355372,
"grad_norm": 0.23780080068966725,
"learning_rate": 9.780950058256802e-06,
"loss": 1.679,
"step": 243
},
{
"epoch": 0.44811753902662993,
"grad_norm": 0.3404150158565561,
"learning_rate": 9.778792053097079e-06,
"loss": 1.3573,
"step": 244
},
{
"epoch": 0.44995408631772266,
"grad_norm": 0.21082516759926104,
"learning_rate": 9.77662371044258e-06,
"loss": 1.4932,
"step": 245
},
{
"epoch": 0.45179063360881544,
"grad_norm": 0.24031601796602936,
"learning_rate": 9.774445034983864e-06,
"loss": 1.7397,
"step": 246
},
{
"epoch": 0.45362718089990817,
"grad_norm": 0.2207009467162113,
"learning_rate": 9.77225603143385e-06,
"loss": 1.585,
"step": 247
},
{
"epoch": 0.4554637281910009,
"grad_norm": 0.2551403718915401,
"learning_rate": 9.770056704527797e-06,
"loss": 1.4924,
"step": 248
},
{
"epoch": 0.4573002754820937,
"grad_norm": 0.23525171894308203,
"learning_rate": 9.767847059023292e-06,
"loss": 1.5658,
"step": 249
},
{
"epoch": 0.4591368227731864,
"grad_norm": 0.2340311514550259,
"learning_rate": 9.765627099700248e-06,
"loss": 1.7361,
"step": 250
},
{
"epoch": 0.46097337006427913,
"grad_norm": 0.3425610884720096,
"learning_rate": 9.763396831360884e-06,
"loss": 1.4723,
"step": 251
},
{
"epoch": 0.4628099173553719,
"grad_norm": 0.232643381531555,
"learning_rate": 9.761156258829723e-06,
"loss": 1.5543,
"step": 252
},
{
"epoch": 0.46464646464646464,
"grad_norm": 0.3601600310692427,
"learning_rate": 9.75890538695358e-06,
"loss": 1.4195,
"step": 253
},
{
"epoch": 0.46648301193755737,
"grad_norm": 0.2625087171937245,
"learning_rate": 9.756644220601541e-06,
"loss": 1.7209,
"step": 254
},
{
"epoch": 0.46831955922865015,
"grad_norm": 0.36091521146763844,
"learning_rate": 9.75437276466497e-06,
"loss": 1.4435,
"step": 255
},
{
"epoch": 0.4701561065197429,
"grad_norm": 0.22970766162190284,
"learning_rate": 9.752091024057485e-06,
"loss": 1.3894,
"step": 256
},
{
"epoch": 0.4719926538108356,
"grad_norm": 0.2708039374301605,
"learning_rate": 9.749799003714954e-06,
"loss": 1.4717,
"step": 257
},
{
"epoch": 0.4738292011019284,
"grad_norm": 0.22940138762480203,
"learning_rate": 9.747496708595482e-06,
"loss": 1.532,
"step": 258
},
{
"epoch": 0.4756657483930211,
"grad_norm": 0.27196638223072084,
"learning_rate": 9.745184143679398e-06,
"loss": 1.4425,
"step": 259
},
{
"epoch": 0.47750229568411384,
"grad_norm": 0.24883578158794614,
"learning_rate": 9.742861313969246e-06,
"loss": 1.4882,
"step": 260
},
{
"epoch": 0.4793388429752066,
"grad_norm": 0.24541193855388124,
"learning_rate": 9.74052822448978e-06,
"loss": 1.4851,
"step": 261
},
{
"epoch": 0.48117539026629935,
"grad_norm": 0.249310294245501,
"learning_rate": 9.738184880287946e-06,
"loss": 1.6463,
"step": 262
},
{
"epoch": 0.4830119375573921,
"grad_norm": 0.25707806827316415,
"learning_rate": 9.735831286432869e-06,
"loss": 1.6118,
"step": 263
},
{
"epoch": 0.48484848484848486,
"grad_norm": 0.22547510302041615,
"learning_rate": 9.733467448015849e-06,
"loss": 1.3343,
"step": 264
},
{
"epoch": 0.4866850321395776,
"grad_norm": 0.21415297945638267,
"learning_rate": 9.731093370150349e-06,
"loss": 1.8938,
"step": 265
},
{
"epoch": 0.4885215794306703,
"grad_norm": 0.25553884577553115,
"learning_rate": 9.728709057971979e-06,
"loss": 1.6681,
"step": 266
},
{
"epoch": 0.4903581267217631,
"grad_norm": 0.23479399192488148,
"learning_rate": 9.72631451663849e-06,
"loss": 1.3781,
"step": 267
},
{
"epoch": 0.4921946740128558,
"grad_norm": 0.2549932602973557,
"learning_rate": 9.723909751329759e-06,
"loss": 1.444,
"step": 268
},
{
"epoch": 0.49403122130394855,
"grad_norm": 0.24071196589287586,
"learning_rate": 9.721494767247779e-06,
"loss": 1.5817,
"step": 269
},
{
"epoch": 0.49586776859504134,
"grad_norm": 0.23040782897953827,
"learning_rate": 9.719069569616653e-06,
"loss": 1.2528,
"step": 270
},
{
"epoch": 0.49770431588613406,
"grad_norm": 0.2371564859647457,
"learning_rate": 9.71663416368257e-06,
"loss": 1.7053,
"step": 271
},
{
"epoch": 0.4995408631772268,
"grad_norm": 0.2489215879444709,
"learning_rate": 9.71418855471381e-06,
"loss": 1.4637,
"step": 272
},
{
"epoch": 0.4995408631772268,
"eval_loss": 2.313441038131714,
"eval_runtime": 39.8034,
"eval_samples_per_second": 5.15,
"eval_steps_per_second": 0.452,
"step": 272
}
],
"logging_steps": 1,
"max_steps": 2176,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 272,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 235355418132480.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}