9b-44 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
abad898 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 532,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.015037593984962405,
"grad_norm": 0.8192565441131592,
"learning_rate": 1.111111111111111e-06,
"loss": 1.936692237854004,
"step": 2
},
{
"epoch": 0.03007518796992481,
"grad_norm": 0.7943888306617737,
"learning_rate": 3.3333333333333333e-06,
"loss": 2.246655225753784,
"step": 4
},
{
"epoch": 0.045112781954887216,
"grad_norm": 0.5120864510536194,
"learning_rate": 5.555555555555555e-06,
"loss": 2.0245468616485596,
"step": 6
},
{
"epoch": 0.06015037593984962,
"grad_norm": 0.3112846314907074,
"learning_rate": 7.777777777777777e-06,
"loss": 1.894791603088379,
"step": 8
},
{
"epoch": 0.07518796992481203,
"grad_norm": 0.7659847140312195,
"learning_rate": 9.999999999999999e-06,
"loss": 1.8956142663955688,
"step": 10
},
{
"epoch": 0.09022556390977443,
"grad_norm": 1.0534412860870361,
"learning_rate": 1.2222222222222222e-05,
"loss": 2.20853590965271,
"step": 12
},
{
"epoch": 0.10526315789473684,
"grad_norm": 0.2340962290763855,
"learning_rate": 1.4444444444444444e-05,
"loss": 1.858487606048584,
"step": 14
},
{
"epoch": 0.12030075187969924,
"grad_norm": 1.1463453769683838,
"learning_rate": 1.6666666666666667e-05,
"loss": 2.3683369159698486,
"step": 16
},
{
"epoch": 0.13533834586466165,
"grad_norm": 1.6465355157852173,
"learning_rate": 1.888888888888889e-05,
"loss": 2.907562017440796,
"step": 18
},
{
"epoch": 0.15037593984962405,
"grad_norm": 0.883372962474823,
"learning_rate": 2.111111111111111e-05,
"loss": 1.6304150819778442,
"step": 20
},
{
"epoch": 0.16541353383458646,
"grad_norm": 0.7589054703712463,
"learning_rate": 2.3333333333333336e-05,
"loss": 1.670052170753479,
"step": 22
},
{
"epoch": 0.18045112781954886,
"grad_norm": 0.5909481048583984,
"learning_rate": 2.5555555555555557e-05,
"loss": 1.6498050689697266,
"step": 24
},
{
"epoch": 0.19548872180451127,
"grad_norm": 1.89938223361969,
"learning_rate": 2.777777777777778e-05,
"loss": 1.5983651876449585,
"step": 26
},
{
"epoch": 0.21052631578947367,
"grad_norm": 0.8610926270484924,
"learning_rate": 3e-05,
"loss": 1.25473952293396,
"step": 28
},
{
"epoch": 0.22556390977443608,
"grad_norm": 1.2555012702941895,
"learning_rate": 2.999111925794138e-05,
"loss": 1.228848934173584,
"step": 30
},
{
"epoch": 0.24060150375939848,
"grad_norm": 1.1694159507751465,
"learning_rate": 2.996448940315055e-05,
"loss": 1.39642333984375,
"step": 32
},
{
"epoch": 0.2556390977443609,
"grad_norm": 0.687818169593811,
"learning_rate": 2.9920147532548513e-05,
"loss": 1.2702100276947021,
"step": 34
},
{
"epoch": 0.2706766917293233,
"grad_norm": 0.33861589431762695,
"learning_rate": 2.9858155416914135e-05,
"loss": 1.326142430305481,
"step": 36
},
{
"epoch": 0.2857142857142857,
"grad_norm": 2.371178388595581,
"learning_rate": 2.9778599414833865e-05,
"loss": 1.4221186637878418,
"step": 38
},
{
"epoch": 0.3007518796992481,
"grad_norm": 0.7431687712669373,
"learning_rate": 2.9681590352399252e-05,
"loss": 1.0404337644577026,
"step": 40
},
{
"epoch": 0.3157894736842105,
"grad_norm": 0.42008474469184875,
"learning_rate": 2.956726336881985e-05,
"loss": 1.2850358486175537,
"step": 42
},
{
"epoch": 0.3308270676691729,
"grad_norm": 0.24258685111999512,
"learning_rate": 2.9435777728166477e-05,
"loss": 0.9888750314712524,
"step": 44
},
{
"epoch": 0.3458646616541353,
"grad_norm": 0.19806115329265594,
"learning_rate": 2.928731659750722e-05,
"loss": 1.389718770980835,
"step": 46
},
{
"epoch": 0.3609022556390977,
"grad_norm": 0.5233106017112732,
"learning_rate": 2.912208679174516e-05,
"loss": 1.0381746292114258,
"step": 48
},
{
"epoch": 0.37593984962406013,
"grad_norm": 0.3707257807254791,
"learning_rate": 2.8940318485513296e-05,
"loss": 1.0249922275543213,
"step": 50
},
{
"epoch": 0.39097744360902253,
"grad_norm": 0.29312264919281006,
"learning_rate": 2.8742264892528024e-05,
"loss": 0.9782328009605408,
"step": 52
},
{
"epoch": 0.40601503759398494,
"grad_norm": 0.34875017404556274,
"learning_rate": 2.8528201912847877e-05,
"loss": 1.0573807954788208,
"step": 54
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.14175978302955627,
"learning_rate": 2.829842774852883e-05,
"loss": 1.005712628364563,
"step": 56
},
{
"epoch": 0.43609022556390975,
"grad_norm": 0.3452085852622986,
"learning_rate": 2.805326248821166e-05,
"loss": 0.9127753376960754,
"step": 58
},
{
"epoch": 0.45112781954887216,
"grad_norm": 0.27766284346580505,
"learning_rate": 2.7793047661220094e-05,
"loss": 1.0905134677886963,
"step": 60
},
{
"epoch": 0.46616541353383456,
"grad_norm": 0.4942661225795746,
"learning_rate": 2.751814576179072e-05,
"loss": 0.8560956120491028,
"step": 62
},
{
"epoch": 0.48120300751879697,
"grad_norm": 0.225164532661438,
"learning_rate": 2.722893974409769e-05,
"loss": 1.1211824417114258,
"step": 64
},
{
"epoch": 0.49624060150375937,
"grad_norm": 0.5361406803131104,
"learning_rate": 2.6925832488775517e-05,
"loss": 1.101810336112976,
"step": 66
},
{
"epoch": 0.5112781954887218,
"grad_norm": 0.1772085279226303,
"learning_rate": 2.660924624168312e-05,
"loss": 1.2826346158981323,
"step": 68
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.7415258288383484,
"learning_rate": 2.627962202569103e-05,
"loss": 1.0522770881652832,
"step": 70
},
{
"epoch": 0.5413533834586466,
"grad_norm": 0.14701074361801147,
"learning_rate": 2.593741902631119e-05,
"loss": 0.7640881538391113,
"step": 72
},
{
"epoch": 0.556390977443609,
"grad_norm": 0.2193162888288498,
"learning_rate": 2.558311395202502e-05,
"loss": 0.8525770306587219,
"step": 74
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.38675349950790405,
"learning_rate": 2.5217200370201126e-05,
"loss": 1.0316098928451538,
"step": 76
},
{
"epoch": 0.5864661654135338,
"grad_norm": 0.15427573025226593,
"learning_rate": 2.4840188019527494e-05,
"loss": 1.2194627523422241,
"step": 78
},
{
"epoch": 0.6015037593984962,
"grad_norm": 1.0054173469543457,
"learning_rate": 2.445260209991616e-05,
"loss": 0.6321249008178711,
"step": 80
},
{
"epoch": 0.6165413533834586,
"grad_norm": 0.3735978603363037,
"learning_rate": 2.4054982540869497e-05,
"loss": 1.1763536930084229,
"step": 82
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.1597137153148651,
"learning_rate": 2.3647883249327334e-05,
"loss": 1.271316409111023,
"step": 84
},
{
"epoch": 0.6466165413533834,
"grad_norm": 0.20840084552764893,
"learning_rate": 2.3231871338042668e-05,
"loss": 0.9115048050880432,
"step": 86
},
{
"epoch": 0.6616541353383458,
"grad_norm": 0.11203482747077942,
"learning_rate": 2.280752633556098e-05,
"loss": 1.0730034112930298,
"step": 88
},
{
"epoch": 0.6766917293233082,
"grad_norm": 0.2053004801273346,
"learning_rate": 2.2375439378903597e-05,
"loss": 1.1552447080612183,
"step": 90
},
{
"epoch": 0.6917293233082706,
"grad_norm": 0.1656394600868225,
"learning_rate": 2.1936212390079758e-05,
"loss": 1.035262107849121,
"step": 92
},
{
"epoch": 0.706766917293233,
"grad_norm": 0.18118023872375488,
"learning_rate": 2.1490457237574638e-05,
"loss": 0.961626410484314,
"step": 94
},
{
"epoch": 0.7218045112781954,
"grad_norm": 0.3013463020324707,
"learning_rate": 2.103879488398128e-05,
"loss": 1.3001712560653687,
"step": 96
},
{
"epoch": 0.7368421052631579,
"grad_norm": 0.21438409388065338,
"learning_rate": 2.058185452096397e-05,
"loss": 1.1097919940948486,
"step": 98
},
{
"epoch": 0.7518796992481203,
"grad_norm": 0.4920536279678345,
"learning_rate": 2.0120272692758044e-05,
"loss": 0.6178168058395386,
"step": 100
},
{
"epoch": 0.7669172932330827,
"grad_norm": 0.2382662147283554,
"learning_rate": 1.965469240942704e-05,
"loss": 1.3048324584960938,
"step": 102
},
{
"epoch": 0.7819548872180451,
"grad_norm": 0.18292629718780518,
"learning_rate": 1.918576225111276e-05,
"loss": 0.9727452397346497,
"step": 104
},
{
"epoch": 0.7969924812030075,
"grad_norm": 0.35222965478897095,
"learning_rate": 1.8714135464525706e-05,
"loss": 0.6771279573440552,
"step": 106
},
{
"epoch": 0.8120300751879699,
"grad_norm": 0.32725006341934204,
"learning_rate": 1.824046905293483e-05,
"loss": 0.9753497242927551,
"step": 108
},
{
"epoch": 0.8270676691729323,
"grad_norm": 0.40115198493003845,
"learning_rate": 1.7765422860924167e-05,
"loss": 0.8510618209838867,
"step": 110
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.26472556591033936,
"learning_rate": 1.7289658655191308e-05,
"loss": 0.7452026605606079,
"step": 112
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.6424808502197266,
"learning_rate": 1.6813839202668314e-05,
"loss": 0.9618666768074036,
"step": 114
},
{
"epoch": 0.8721804511278195,
"grad_norm": 0.2608170509338379,
"learning_rate": 1.6338627347249194e-05,
"loss": 1.3540914058685303,
"step": 116
},
{
"epoch": 0.8872180451127819,
"grad_norm": 0.16167515516281128,
"learning_rate": 1.5864685086410205e-05,
"loss": 1.023390293121338,
"step": 118
},
{
"epoch": 0.9022556390977443,
"grad_norm": 0.36847585439682007,
"learning_rate": 1.539267264900926e-05,
"loss": 0.8938322067260742,
"step": 120
},
{
"epoch": 0.9172932330827067,
"grad_norm": 0.42921480536460876,
"learning_rate": 1.4923247575549108e-05,
"loss": 0.8056025505065918,
"step": 122
},
{
"epoch": 0.9323308270676691,
"grad_norm": 0.6019521355628967,
"learning_rate": 1.4457063802185558e-05,
"loss": 0.8153986930847168,
"step": 124
},
{
"epoch": 0.9473684210526315,
"grad_norm": 0.17627808451652527,
"learning_rate": 1.3994770749756746e-05,
"loss": 0.7411991953849792,
"step": 126
},
{
"epoch": 0.9624060150375939,
"grad_norm": 0.27609097957611084,
"learning_rate": 1.3537012419102535e-05,
"loss": 1.0465192794799805,
"step": 128
},
{
"epoch": 0.9774436090225563,
"grad_norm": 0.7103683948516846,
"learning_rate": 1.3084426493934257e-05,
"loss": 0.8794819712638855,
"step": 130
},
{
"epoch": 0.9924812030075187,
"grad_norm": 0.13337653875350952,
"learning_rate": 1.2637643452504579e-05,
"loss": 1.019758701324463,
"step": 132
},
{
"epoch": 1.0075187969924813,
"grad_norm": 0.15754307806491852,
"learning_rate": 1.2197285689315004e-05,
"loss": 0.7352499961853027,
"step": 134
},
{
"epoch": 1.0225563909774436,
"grad_norm": 0.1713666021823883,
"learning_rate": 1.1763966648084505e-05,
"loss": 0.8829557299613953,
"step": 136
},
{
"epoch": 1.037593984962406,
"grad_norm": 0.3357762098312378,
"learning_rate": 1.1338289967187079e-05,
"loss": 0.6713441610336304,
"step": 138
},
{
"epoch": 1.0526315789473684,
"grad_norm": 0.17261233925819397,
"learning_rate": 1.0920848638748748e-05,
"loss": 0.7187601327896118,
"step": 140
},
{
"epoch": 1.0676691729323309,
"grad_norm": 0.13400448858737946,
"learning_rate": 1.0512224182575395e-05,
"loss": 0.7740556597709656,
"step": 142
},
{
"epoch": 1.0827067669172932,
"grad_norm": 0.3450721502304077,
"learning_rate": 1.0112985836062175e-05,
"loss": 0.969446063041687,
"step": 144
},
{
"epoch": 1.0977443609022557,
"grad_norm": 0.2375430464744568,
"learning_rate": 9.723689761213051e-06,
"loss": 0.9913895130157471,
"step": 146
},
{
"epoch": 1.112781954887218,
"grad_norm": 0.15823714435100555,
"learning_rate": 9.34487826987512e-06,
"loss": 0.925875186920166,
"step": 148
},
{
"epoch": 1.1278195488721805,
"grad_norm": 0.23397411406040192,
"learning_rate": 8.97707906826694e-06,
"loss": 0.8095348477363586,
"step": 150
},
{
"epoch": 1.1428571428571428,
"grad_norm": 0.23221950232982635,
"learning_rate": 8.620804521853441e-06,
"loss": 0.9493626952171326,
"step": 152
},
{
"epoch": 1.1578947368421053,
"grad_norm": 0.1775089055299759,
"learning_rate": 8.27655094159128e-06,
"loss": 0.9873220920562744,
"step": 154
},
{
"epoch": 1.1729323308270676,
"grad_norm": 0.22347131371498108,
"learning_rate": 7.944797892539146e-06,
"loss": 0.9379909634590149,
"step": 156
},
{
"epoch": 1.1879699248120301,
"grad_norm": 0.17521372437477112,
"learning_rate": 7.626007525795976e-06,
"loss": 0.9363319277763367,
"step": 158
},
{
"epoch": 1.2030075187969924,
"grad_norm": 0.5944448113441467,
"learning_rate": 7.320623934697899e-06,
"loss": 0.5006011724472046,
"step": 160
},
{
"epoch": 1.218045112781955,
"grad_norm": 0.21691644191741943,
"learning_rate": 7.029072536170642e-06,
"loss": 0.877805233001709,
"step": 162
},
{
"epoch": 1.2330827067669172,
"grad_norm": 0.19128523766994476,
"learning_rate": 6.751759478099246e-06,
"loss": 1.0667612552642822,
"step": 164
},
{
"epoch": 1.2481203007518797,
"grad_norm": 0.32106316089630127,
"learning_rate": 6.489071073540686e-06,
"loss": 0.8215808868408203,
"step": 166
},
{
"epoch": 1.263157894736842,
"grad_norm": 0.1843944638967514,
"learning_rate": 6.241373262567537e-06,
"loss": 0.6570966243743896,
"step": 168
},
{
"epoch": 1.2781954887218046,
"grad_norm": 0.24442797899246216,
"learning_rate": 6.009011102492393e-06,
"loss": 0.7164343595504761,
"step": 170
},
{
"epoch": 1.2932330827067668,
"grad_norm": 0.17113906145095825,
"learning_rate": 5.7923082871831375e-06,
"loss": 0.8579428791999817,
"step": 172
},
{
"epoch": 1.3082706766917294,
"grad_norm": 0.24567635357379913,
"learning_rate": 5.591566696138772e-06,
"loss": 0.8993586301803589,
"step": 174
},
{
"epoch": 1.3233082706766917,
"grad_norm": 0.19532179832458496,
"learning_rate": 5.407065973953888e-06,
"loss": 0.6733898520469666,
"step": 176
},
{
"epoch": 1.3383458646616542,
"grad_norm": 0.3204295337200165,
"learning_rate": 5.239063140757639e-06,
"loss": 0.6425676345825195,
"step": 178
},
{
"epoch": 1.3533834586466165,
"grad_norm": 0.22146816551685333,
"learning_rate": 5.0877922341699066e-06,
"loss": 1.1642075777053833,
"step": 180
},
{
"epoch": 1.368421052631579,
"grad_norm": 0.2698806822299957,
"learning_rate": 4.953463983273412e-06,
"loss": 0.9253040552139282,
"step": 182
},
{
"epoch": 1.3834586466165413,
"grad_norm": 0.2668271064758301,
"learning_rate": 4.836265515055985e-06,
"loss": 0.7467199563980103,
"step": 184
},
{
"epoch": 1.3984962406015038,
"grad_norm": 0.17395268380641937,
"learning_rate": 4.736360093731884e-06,
"loss": 1.0783255100250244,
"step": 186
},
{
"epoch": 1.413533834586466,
"grad_norm": 0.20097728073596954,
"learning_rate": 4.653886893305353e-06,
"loss": 0.5329846143722534,
"step": 188
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.2769680917263031,
"learning_rate": 4.588960803693209e-06,
"loss": 0.8872597813606262,
"step": 190
},
{
"epoch": 1.443609022556391,
"grad_norm": 0.1900765597820282,
"learning_rate": 4.5416722706765875e-06,
"loss": 0.653458297252655,
"step": 192
},
{
"epoch": 1.4586466165413534,
"grad_norm": 0.299067884683609,
"learning_rate": 4.512087169904754e-06,
"loss": 0.7420106530189514,
"step": 194
},
{
"epoch": 1.4736842105263157,
"grad_norm": 0.19370242953300476,
"learning_rate": 4.500246715126523e-06,
"loss": 0.8862230181694031,
"step": 196
},
{
"epoch": 1.4887218045112782,
"grad_norm": 0.17850318551063538,
"learning_rate": 4.506167400777152e-06,
"loss": 0.8613809943199158,
"step": 198
},
{
"epoch": 1.5037593984962405,
"grad_norm": 0.17290696501731873,
"learning_rate": 4.52984097900063e-06,
"loss": 0.8784961104393005,
"step": 200
},
{
"epoch": 1.518796992481203,
"grad_norm": 0.3493019938468933,
"learning_rate": 4.5712344711394154e-06,
"loss": 1.2700152397155762,
"step": 202
},
{
"epoch": 1.5338345864661656,
"grad_norm": 0.42357122898101807,
"learning_rate": 4.630290213675614e-06,
"loss": 0.9580332636833191,
"step": 204
},
{
"epoch": 1.5488721804511278,
"grad_norm": 1.1557518243789673,
"learning_rate": 4.706925938559573e-06,
"loss": 0.7860268354415894,
"step": 206
},
{
"epoch": 1.5639097744360901,
"grad_norm": 0.28890499472618103,
"learning_rate": 4.801034887814009e-06,
"loss": 0.9093602895736694,
"step": 208
},
{
"epoch": 1.5789473684210527,
"grad_norm": 0.5986707210540771,
"learning_rate": 4.912485962254024e-06,
"loss": 0.8598864674568176,
"step": 210
},
{
"epoch": 1.5939849624060152,
"grad_norm": 0.17273662984371185,
"learning_rate": 5.04112390411581e-06,
"loss": 0.5818964242935181,
"step": 212
},
{
"epoch": 1.6090225563909775,
"grad_norm": 0.21565358340740204,
"learning_rate": 5.186769513339663e-06,
"loss": 0.884915292263031,
"step": 214
},
{
"epoch": 1.6240601503759398,
"grad_norm": 0.26930728554725647,
"learning_rate": 5.349219897205977e-06,
"loss": 0.9705126881599426,
"step": 216
},
{
"epoch": 1.6390977443609023,
"grad_norm": 0.1285410076379776,
"learning_rate": 5.5282487529764855e-06,
"loss": 0.7298458218574524,
"step": 218
},
{
"epoch": 1.6541353383458648,
"grad_norm": 0.21168453991413116,
"learning_rate": 5.7236066831470105e-06,
"loss": 0.7564178109169006,
"step": 220
},
{
"epoch": 1.669172932330827,
"grad_norm": 0.5188248753547668,
"learning_rate": 5.935021542872539e-06,
"loss": 0.8646745681762695,
"step": 222
},
{
"epoch": 1.6842105263157894,
"grad_norm": 0.24589960277080536,
"learning_rate": 6.162198819080668e-06,
"loss": 0.699385941028595,
"step": 224
},
{
"epoch": 1.699248120300752,
"grad_norm": 0.3245724141597748,
"learning_rate": 6.404822040745263e-06,
"loss": 1.0145379304885864,
"step": 226
},
{
"epoch": 1.7142857142857144,
"grad_norm": 0.24885950982570648,
"learning_rate": 6.662553219748833e-06,
"loss": 0.7830167412757874,
"step": 228
},
{
"epoch": 1.7293233082706767,
"grad_norm": 0.38476553559303284,
"learning_rate": 6.935033321719419e-06,
"loss": 0.9040583372116089,
"step": 230
},
{
"epoch": 1.744360902255639,
"grad_norm": 0.3446056544780731,
"learning_rate": 7.2218827661861725e-06,
"loss": 1.0128272771835327,
"step": 232
},
{
"epoch": 1.7593984962406015,
"grad_norm": 0.15947787463665009,
"learning_rate": 7.522701955356779e-06,
"loss": 0.9765535593032837,
"step": 234
},
{
"epoch": 1.774436090225564,
"grad_norm": 0.2729548513889313,
"learning_rate": 7.837071830780217e-06,
"loss": 1.0480151176452637,
"step": 236
},
{
"epoch": 1.7894736842105263,
"grad_norm": 0.3705751597881317,
"learning_rate": 8.164554457119286e-06,
"loss": 0.5091387033462524,
"step": 238
},
{
"epoch": 1.8045112781954886,
"grad_norm": 0.3041347563266754,
"learning_rate": 8.504693632219755e-06,
"loss": 0.9318640232086182,
"step": 240
},
{
"epoch": 1.8195488721804511,
"grad_norm": 0.3558288812637329,
"learning_rate": 8.857015522626238e-06,
"loss": 0.6968544721603394,
"step": 242
},
{
"epoch": 1.8345864661654137,
"grad_norm": 0.2972772717475891,
"learning_rate": 9.221029323659478e-06,
"loss": 1.047217607498169,
"step": 244
},
{
"epoch": 1.849624060150376,
"grad_norm": 0.1722293198108673,
"learning_rate": 9.596227943135503e-06,
"loss": 1.0219006538391113,
"step": 246
},
{
"epoch": 1.8646616541353382,
"grad_norm": 1.3821203708648682,
"learning_rate": 9.982088707774262e-06,
"loss": 0.9025890827178955,
"step": 248
},
{
"epoch": 1.8796992481203008,
"grad_norm": 0.15428081154823303,
"learning_rate": 1.0378074091313615e-05,
"loss": 1.1540802717208862,
"step": 250
},
{
"epoch": 1.8947368421052633,
"grad_norm": 0.16087524592876434,
"learning_rate": 1.0783632463314283e-05,
"loss": 0.9519784450531006,
"step": 252
},
{
"epoch": 1.9097744360902256,
"grad_norm": 0.1820048987865448,
"learning_rate": 1.1198198857612926e-05,
"loss": 1.1188257932662964,
"step": 254
},
{
"epoch": 1.9248120300751879,
"grad_norm": 0.8227013349533081,
"learning_rate": 1.1621195759352438e-05,
"loss": 0.8220981955528259,
"step": 256
},
{
"epoch": 1.9398496240601504,
"grad_norm": 0.39509618282318115,
"learning_rate": 1.2052033909493471e-05,
"loss": 0.5426740646362305,
"step": 258
},
{
"epoch": 1.954887218045113,
"grad_norm": 0.2340533286333084,
"learning_rate": 1.2490113125686138e-05,
"loss": 0.8964567184448242,
"step": 260
},
{
"epoch": 1.9699248120300752,
"grad_norm": 0.1862659901380539,
"learning_rate": 1.2934823138358649e-05,
"loss": 0.9239405393600464,
"step": 262
},
{
"epoch": 1.9849624060150375,
"grad_norm": 0.21919912099838257,
"learning_rate": 1.338554444085792e-05,
"loss": 0.8809694051742554,
"step": 264
},
{
"epoch": 2.0,
"grad_norm": 0.2186099886894226,
"learning_rate": 1.3841649152458003e-05,
"loss": 0.7980599403381348,
"step": 266
},
{
"epoch": 2.0150375939849625,
"grad_norm": 0.38081610202789307,
"learning_rate": 1.430250189303413e-05,
"loss": 0.463468998670578,
"step": 268
},
{
"epoch": 2.030075187969925,
"grad_norm": 0.2002028077840805,
"learning_rate": 1.4767460668183795e-05,
"loss": 0.6159178614616394,
"step": 270
},
{
"epoch": 2.045112781954887,
"grad_norm": 0.21730108559131622,
"learning_rate": 1.523587776356188e-05,
"loss": 0.46370548009872437,
"step": 272
},
{
"epoch": 2.0601503759398496,
"grad_norm": 1.0244005918502808,
"learning_rate": 1.5707100647184093e-05,
"loss": 0.897263765335083,
"step": 274
},
{
"epoch": 2.075187969924812,
"grad_norm": 0.1658545732498169,
"learning_rate": 1.6180472878441575e-05,
"loss": 0.7874804735183716,
"step": 276
},
{
"epoch": 2.090225563909774,
"grad_norm": 0.47336888313293457,
"learning_rate": 1.6655335022560423e-05,
"loss": 0.7593191266059875,
"step": 278
},
{
"epoch": 2.1052631578947367,
"grad_norm": 0.3983185589313507,
"learning_rate": 1.7131025569232362e-05,
"loss": 0.8093394637107849,
"step": 280
},
{
"epoch": 2.1203007518796992,
"grad_norm": 0.5871224999427795,
"learning_rate": 1.7606881854136644e-05,
"loss": 0.8642159104347229,
"step": 282
},
{
"epoch": 2.1353383458646618,
"grad_norm": 0.16395071148872375,
"learning_rate": 1.8082240982069634e-05,
"loss": 0.5777812004089355,
"step": 284
},
{
"epoch": 2.1503759398496243,
"grad_norm": 0.266190767288208,
"learning_rate": 1.8556440750395985e-05,
"loss": 0.8966842889785767,
"step": 286
},
{
"epoch": 2.1654135338345863,
"grad_norm": 0.26495200395584106,
"learning_rate": 1.9028820571535015e-05,
"loss": 1.0453461408615112,
"step": 288
},
{
"epoch": 2.180451127819549,
"grad_norm": 0.19011439383029938,
"learning_rate": 1.949872239319729e-05,
"loss": 1.0706809759140015,
"step": 290
},
{
"epoch": 2.1954887218045114,
"grad_norm": 0.25460541248321533,
"learning_rate": 1.996549161508929e-05,
"loss": 0.6951987743377686,
"step": 292
},
{
"epoch": 2.2105263157894735,
"grad_norm": 0.2636259198188782,
"learning_rate": 2.042847800080939e-05,
"loss": 0.8458771705627441,
"step": 294
},
{
"epoch": 2.225563909774436,
"grad_norm": 0.3756290674209595,
"learning_rate": 2.0887036583664505e-05,
"loss": 0.3105054199695587,
"step": 296
},
{
"epoch": 2.2406015037593985,
"grad_norm": 0.23921579122543335,
"learning_rate": 2.1340528565145932e-05,
"loss": 1.1170181035995483,
"step": 298
},
{
"epoch": 2.255639097744361,
"grad_norm": 0.2471323013305664,
"learning_rate": 2.1788322204812397e-05,
"loss": 0.9951118230819702,
"step": 300
},
{
"epoch": 2.2706766917293235,
"grad_norm": 0.2469598799943924,
"learning_rate": 2.2229793700340833e-05,
"loss": 1.0403016805648804,
"step": 302
},
{
"epoch": 2.2857142857142856,
"grad_norm": 0.34822267293930054,
"learning_rate": 2.2664328056519028e-05,
"loss": 0.7423543334007263,
"step": 304
},
{
"epoch": 2.300751879699248,
"grad_norm": 0.5992878079414368,
"learning_rate": 2.3091319941969266e-05,
"loss": 0.7819874286651611,
"step": 306
},
{
"epoch": 2.3157894736842106,
"grad_norm": 0.31358832120895386,
"learning_rate": 2.3510174532409867e-05,
"loss": 1.109780192375183,
"step": 308
},
{
"epoch": 2.3308270676691727,
"grad_norm": 0.656645655632019,
"learning_rate": 2.392030833927959e-05,
"loss": 0.4651540219783783,
"step": 310
},
{
"epoch": 2.345864661654135,
"grad_norm": 0.20808559656143188,
"learning_rate": 2.4321150022570873e-05,
"loss": 0.8532482385635376,
"step": 312
},
{
"epoch": 2.3609022556390977,
"grad_norm": 0.20093803107738495,
"learning_rate": 2.471214118673929e-05,
"loss": 0.568276584148407,
"step": 314
},
{
"epoch": 2.3759398496240602,
"grad_norm": 0.2839231491088867,
"learning_rate": 2.509273715858074e-05,
"loss": 0.9199910163879395,
"step": 316
},
{
"epoch": 2.3909774436090228,
"grad_norm": 0.19820740818977356,
"learning_rate": 2.546240774599257e-05,
"loss": 0.8895071744918823,
"step": 318
},
{
"epoch": 2.406015037593985,
"grad_norm": 0.26256436109542847,
"learning_rate": 2.582063797656167e-05,
"loss": 1.0534682273864746,
"step": 320
},
{
"epoch": 2.4210526315789473,
"grad_norm": 0.18550805747509003,
"learning_rate": 2.6166928814950743e-05,
"loss": 1.1147539615631104,
"step": 322
},
{
"epoch": 2.43609022556391,
"grad_norm": 0.509353518486023,
"learning_rate": 2.6500797858083262e-05,
"loss": 0.9222637414932251,
"step": 324
},
{
"epoch": 2.451127819548872,
"grad_norm": 0.18731118738651276,
"learning_rate": 2.682178000715866e-05,
"loss": 1.0500245094299316,
"step": 326
},
{
"epoch": 2.4661654135338344,
"grad_norm": 0.29536113142967224,
"learning_rate": 2.712942811556184e-05,
"loss": 0.8539433479309082,
"step": 328
},
{
"epoch": 2.481203007518797,
"grad_norm": 0.3995274007320404,
"learning_rate": 2.7423313611764086e-05,
"loss": 0.6855474710464478,
"step": 330
},
{
"epoch": 2.4962406015037595,
"grad_norm": 0.23789720237255096,
"learning_rate": 2.77030270963479e-05,
"loss": 1.0410560369491577,
"step": 332
},
{
"epoch": 2.511278195488722,
"grad_norm": 0.21356765925884247,
"learning_rate": 2.796817891232397e-05,
"loss": 0.6004407405853271,
"step": 334
},
{
"epoch": 2.526315789473684,
"grad_norm": 1.3617724180221558,
"learning_rate": 2.8218399687945758e-05,
"loss": 0.7526741027832031,
"step": 336
},
{
"epoch": 2.5413533834586466,
"grad_norm": 0.6006386876106262,
"learning_rate": 2.8453340851265676e-05,
"loss": 0.6869713664054871,
"step": 338
},
{
"epoch": 2.556390977443609,
"grad_norm": 0.3373733460903168,
"learning_rate": 2.8672675115715806e-05,
"loss": 1.0165461301803589,
"step": 340
},
{
"epoch": 2.571428571428571,
"grad_norm": 0.5051329731941223,
"learning_rate": 2.887609693603699e-05,
"loss": 0.9631428718566895,
"step": 342
},
{
"epoch": 2.5864661654135337,
"grad_norm": 0.3386491537094116,
"learning_rate": 2.906332293392093e-05,
"loss": 0.8245996236801147,
"step": 344
},
{
"epoch": 2.601503759398496,
"grad_norm": 0.21825748682022095,
"learning_rate": 2.92340922927725e-05,
"loss": 0.5915822386741638,
"step": 346
},
{
"epoch": 2.6165413533834587,
"grad_norm": 0.24130862951278687,
"learning_rate": 2.9388167121042307e-05,
"loss": 0.7320323586463928,
"step": 348
},
{
"epoch": 2.6315789473684212,
"grad_norm": 0.5413809418678284,
"learning_rate": 2.952533278362327e-05,
"loss": 0.8300567269325256,
"step": 350
},
{
"epoch": 2.6466165413533833,
"grad_norm": 0.25651729106903076,
"learning_rate": 2.9645398200849713e-05,
"loss": 0.7235583066940308,
"step": 352
},
{
"epoch": 2.661654135338346,
"grad_norm": 0.15655282139778137,
"learning_rate": 2.9748196114682335e-05,
"loss": 1.0085736513137817,
"step": 354
},
{
"epoch": 2.6766917293233083,
"grad_norm": 0.7222322225570679,
"learning_rate": 2.983358332170829e-05,
"loss": 0.7790261507034302,
"step": 356
},
{
"epoch": 2.6917293233082704,
"grad_norm": 0.21224772930145264,
"learning_rate": 2.9901440872631778e-05,
"loss": 0.42803671956062317,
"step": 358
},
{
"epoch": 2.706766917293233,
"grad_norm": 0.22365406155586243,
"learning_rate": 2.9951674237977273e-05,
"loss": 1.0629819631576538,
"step": 360
},
{
"epoch": 2.7218045112781954,
"grad_norm": 0.2804076373577118,
"learning_rate": 2.998421343977452e-05,
"loss": 0.550415575504303,
"step": 362
},
{
"epoch": 2.736842105263158,
"grad_norm": 0.2370826154947281,
"learning_rate": 2.9999013149041885e-05,
"loss": 0.721561074256897,
"step": 364
},
{
"epoch": 2.7518796992481205,
"grad_norm": 0.835011899471283,
"learning_rate": 2.999605274893222e-05,
"loss": 0.8219574689865112,
"step": 366
},
{
"epoch": 2.7669172932330826,
"grad_norm": 0.14573420584201813,
"learning_rate": 2.9975336363453326e-05,
"loss": 0.7218166589736938,
"step": 368
},
{
"epoch": 2.781954887218045,
"grad_norm": 3.228212356567383,
"learning_rate": 2.993689285172299e-05,
"loss": 0.8398270010948181,
"step": 370
},
{
"epoch": 2.7969924812030076,
"grad_norm": 0.5311354994773865,
"learning_rate": 2.9880775767766535e-05,
"loss": 0.8649424314498901,
"step": 372
},
{
"epoch": 2.8120300751879697,
"grad_norm": 0.44514158368110657,
"learning_rate": 2.980706328591302e-05,
"loss": 0.7094336152076721,
"step": 374
},
{
"epoch": 2.827067669172932,
"grad_norm": 0.41514015197753906,
"learning_rate": 2.971585809189387e-05,
"loss": 0.9906347393989563,
"step": 376
},
{
"epoch": 2.8421052631578947,
"grad_norm": 0.14243760704994202,
"learning_rate": 2.9607287239795747e-05,
"loss": 1.0890015363693237,
"step": 378
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.4031289517879486,
"learning_rate": 2.94815019750669e-05,
"loss": 0.7638394832611084,
"step": 380
},
{
"epoch": 2.8721804511278197,
"grad_norm": 0.2597931921482086,
"learning_rate": 2.933867752382353e-05,
"loss": 0.9143038392066956,
"step": 382
},
{
"epoch": 2.887218045112782,
"grad_norm": 0.3925493061542511,
"learning_rate": 2.917901284874975e-05,
"loss": 0.9326249957084656,
"step": 384
},
{
"epoch": 2.9022556390977443,
"grad_norm": 0.31625744700431824,
"learning_rate": 2.9002730371931074e-05,
"loss": 0.6936108469963074,
"step": 386
},
{
"epoch": 2.917293233082707,
"grad_norm": 0.2891203463077545,
"learning_rate": 2.881007566500768e-05,
"loss": 0.9043726921081543,
"step": 388
},
{
"epoch": 2.932330827067669,
"grad_norm": 0.30182725191116333,
"learning_rate": 2.8601317107078944e-05,
"loss": 0.8188687562942505,
"step": 390
},
{
"epoch": 2.9473684210526314,
"grad_norm": 0.43725159764289856,
"learning_rate": 2.8376745510835926e-05,
"loss": 0.9015698432922363,
"step": 392
},
{
"epoch": 2.962406015037594,
"grad_norm": 0.39825642108917236,
"learning_rate": 2.813667371744254e-05,
"loss": 0.7247455716133118,
"step": 394
},
{
"epoch": 2.9774436090225564,
"grad_norm": 0.15052802860736847,
"learning_rate": 2.7881436160729783e-05,
"loss": 0.9713034629821777,
"step": 396
},
{
"epoch": 2.992481203007519,
"grad_norm": 0.4360320270061493,
"learning_rate": 2.7611388401310196e-05,
"loss": 0.7928329706192017,
"step": 398
},
{
"epoch": 3.007518796992481,
"grad_norm": 0.20822873711585999,
"learning_rate": 2.7326906631261394e-05,
"loss": 0.7827808856964111,
"step": 400
},
{
"epoch": 3.0225563909774436,
"grad_norm": 0.09618931263685226,
"learning_rate": 2.7028387150068913e-05,
"loss": 0.6030799150466919,
"step": 402
},
{
"epoch": 3.037593984962406,
"grad_norm": 0.3093872666358948,
"learning_rate": 2.6716245812558134e-05,
"loss": 0.7962419390678406,
"step": 404
},
{
"epoch": 3.0526315789473686,
"grad_norm": 0.3837755024433136,
"learning_rate": 2.6390917449584653e-05,
"loss": 0.6203740239143372,
"step": 406
},
{
"epoch": 3.0676691729323307,
"grad_norm": 0.14874151349067688,
"learning_rate": 2.605285526228978e-05,
"loss": 0.51124107837677,
"step": 408
},
{
"epoch": 3.082706766917293,
"grad_norm": 0.21628743410110474,
"learning_rate": 2.570253019076529e-05,
"loss": 0.7276190519332886,
"step": 410
},
{
"epoch": 3.0977443609022557,
"grad_norm": 0.284242182970047,
"learning_rate": 2.5340430258006786e-05,
"loss": 0.5125940442085266,
"step": 412
},
{
"epoch": 3.112781954887218,
"grad_norm": 0.25154373049736023,
"learning_rate": 2.496705989006952e-05,
"loss": 0.8815844058990479,
"step": 414
},
{
"epoch": 3.1278195488721803,
"grad_norm": 0.14138440787792206,
"learning_rate": 2.4582939213373886e-05,
"loss": 0.37600424885749817,
"step": 416
},
{
"epoch": 3.142857142857143,
"grad_norm": 0.3758879005908966,
"learning_rate": 2.4188603330139344e-05,
"loss": 0.6696433424949646,
"step": 418
},
{
"epoch": 3.1578947368421053,
"grad_norm": 0.16141177713871002,
"learning_rate": 2.378460157295626e-05,
"loss": 0.6787968277931213,
"step": 420
},
{
"epoch": 3.172932330827068,
"grad_norm": 0.29796668887138367,
"learning_rate": 2.3371496739533913e-05,
"loss": 0.5915691256523132,
"step": 422
},
{
"epoch": 3.18796992481203,
"grad_norm": 0.4774704575538635,
"learning_rate": 2.294986430869094e-05,
"loss": 0.733458399772644,
"step": 424
},
{
"epoch": 3.2030075187969924,
"grad_norm": 0.36931291222572327,
"learning_rate": 2.252029163868019e-05,
"loss": 0.6868959069252014,
"step": 426
},
{
"epoch": 3.218045112781955,
"grad_norm": 0.45511841773986816,
"learning_rate": 2.208337714896483e-05,
"loss": 0.569706380367279,
"step": 428
},
{
"epoch": 3.2330827067669174,
"grad_norm": 1.0468262434005737,
"learning_rate": 2.1639729486585647e-05,
"loss": 0.4343474209308624,
"step": 430
},
{
"epoch": 3.2481203007518795,
"grad_norm": 0.131326824426651,
"learning_rate": 2.1189966678280585e-05,
"loss": 0.4525618553161621,
"step": 432
},
{
"epoch": 3.263157894736842,
"grad_norm": 0.1383962780237198,
"learning_rate": 2.0734715269537963e-05,
"loss": 0.44801121950149536,
"step": 434
},
{
"epoch": 3.2781954887218046,
"grad_norm": 0.34697940945625305,
"learning_rate": 2.0274609451782568e-05,
"loss": 0.42984333634376526,
"step": 436
},
{
"epoch": 3.293233082706767,
"grad_norm": 0.10286783427000046,
"learning_rate": 1.9810290178910406e-05,
"loss": 0.4518528878688812,
"step": 438
},
{
"epoch": 3.308270676691729,
"grad_norm": 0.18340881168842316,
"learning_rate": 1.934240427440311e-05,
"loss": 0.9285587072372437,
"step": 440
},
{
"epoch": 3.3233082706766917,
"grad_norm": 0.18978752195835114,
"learning_rate": 1.8871603530265477e-05,
"loss": 0.39083921909332275,
"step": 442
},
{
"epoch": 3.338345864661654,
"grad_norm": 0.21235691010951996,
"learning_rate": 1.8398543799041773e-05,
"loss": 0.6497979760169983,
"step": 444
},
{
"epoch": 3.3533834586466167,
"grad_norm": 0.9397839903831482,
"learning_rate": 1.792388408017536e-05,
"loss": 0.5017030239105225,
"step": 446
},
{
"epoch": 3.3684210526315788,
"grad_norm": 0.23365262150764465,
"learning_rate": 1.744828560198448e-05,
"loss": 0.7379826903343201,
"step": 448
},
{
"epoch": 3.3834586466165413,
"grad_norm": 0.42739665508270264,
"learning_rate": 1.697241090053319e-05,
"loss": 0.7720116972923279,
"step": 450
},
{
"epoch": 3.398496240601504,
"grad_norm": 0.2359744757413864,
"learning_rate": 1.6496922896680423e-05,
"loss": 0.7877475619316101,
"step": 452
},
{
"epoch": 3.4135338345864663,
"grad_norm": 0.4221789240837097,
"learning_rate": 1.6022483972593128e-05,
"loss": 0.7371859550476074,
"step": 454
},
{
"epoch": 3.4285714285714284,
"grad_norm": 0.46123459935188293,
"learning_rate": 1.5549755049009714e-05,
"loss": 0.731837809085846,
"step": 456
},
{
"epoch": 3.443609022556391,
"grad_norm": 0.20335260033607483,
"learning_rate": 1.5079394664539421e-05,
"loss": 0.48273712396621704,
"step": 458
},
{
"epoch": 3.4586466165413534,
"grad_norm": 0.2316899299621582,
"learning_rate": 1.4612058058280153e-05,
"loss": 0.7381947636604309,
"step": 460
},
{
"epoch": 3.473684210526316,
"grad_norm": 0.3751467168331146,
"learning_rate": 1.4148396257032674e-05,
"loss": 0.769965648651123,
"step": 462
},
{
"epoch": 3.488721804511278,
"grad_norm": 0.5033459663391113,
"learning_rate": 1.3689055168382717e-05,
"loss": 0.6628371477127075,
"step": 464
},
{
"epoch": 3.5037593984962405,
"grad_norm": 0.30648085474967957,
"learning_rate": 1.3234674680914651e-05,
"loss": 0.7021836638450623,
"step": 466
},
{
"epoch": 3.518796992481203,
"grad_norm": 0.19549153745174408,
"learning_rate": 1.2785887772809783e-05,
"loss": 0.5976605415344238,
"step": 468
},
{
"epoch": 3.5338345864661656,
"grad_norm": 0.47317057847976685,
"learning_rate": 1.2343319630071227e-05,
"loss": 0.678418755531311,
"step": 470
},
{
"epoch": 3.548872180451128,
"grad_norm": 0.3564242720603943,
"learning_rate": 1.1907586775603957e-05,
"loss": 0.6626768708229065,
"step": 472
},
{
"epoch": 3.56390977443609,
"grad_norm": 0.33226093649864197,
"learning_rate": 1.147929621036279e-05,
"loss": 0.7116915583610535,
"step": 474
},
{
"epoch": 3.5789473684210527,
"grad_norm": 0.3433665931224823,
"learning_rate": 1.1059044567765164e-05,
"loss": 0.36730286478996277,
"step": 476
},
{
"epoch": 3.593984962406015,
"grad_norm": 0.17942893505096436,
"learning_rate": 1.0647417282546353e-05,
"loss": 0.3575655221939087,
"step": 478
},
{
"epoch": 3.6090225563909772,
"grad_norm": 0.14913895726203918,
"learning_rate": 1.024498777521529e-05,
"loss": 0.751462996006012,
"step": 480
},
{
"epoch": 3.6240601503759398,
"grad_norm": 0.6876167058944702,
"learning_rate": 9.852316653246724e-06,
"loss": 0.7515479922294617,
"step": 482
},
{
"epoch": 3.6390977443609023,
"grad_norm": 0.30825933814048767,
"learning_rate": 9.469950930122665e-06,
"loss": 0.6766018867492676,
"step": 484
},
{
"epoch": 3.654135338345865,
"grad_norm": 0.3747425675392151,
"learning_rate": 9.098423263311226e-06,
"loss": 0.3269270956516266,
"step": 486
},
{
"epoch": 3.6691729323308273,
"grad_norm": 0.19600830972194672,
"learning_rate": 8.738251212244036e-06,
"loss": 0.6345582008361816,
"step": 488
},
{
"epoch": 3.6842105263157894,
"grad_norm": 0.21161755919456482,
"learning_rate": 8.389936517326165e-06,
"loss": 0.8583235144615173,
"step": 490
},
{
"epoch": 3.699248120300752,
"grad_norm": 2.3246822357177734,
"learning_rate": 8.053964400982803e-06,
"loss": 0.7647910714149475,
"step": 492
},
{
"epoch": 3.7142857142857144,
"grad_norm": 0.3167845606803894,
"learning_rate": 7.730802891716579e-06,
"loss": 0.3876282870769501,
"step": 494
},
{
"epoch": 3.7293233082706765,
"grad_norm": 0.16611672937870026,
"learning_rate": 7.420902172116848e-06,
"loss": 0.8268077969551086,
"step": 496
},
{
"epoch": 3.744360902255639,
"grad_norm": 0.27863407135009766,
"learning_rate": 7.124693951729393e-06,
"loss": 0.9286668300628662,
"step": 498
},
{
"epoch": 3.7593984962406015,
"grad_norm": 0.37909525632858276,
"learning_rate": 6.842590865660255e-06,
"loss": 0.6480289101600647,
"step": 500
},
{
"epoch": 3.774436090225564,
"grad_norm": 0.2283431440591812,
"learning_rate": 6.574985899751219e-06,
"loss": 0.576987624168396,
"step": 502
},
{
"epoch": 3.7894736842105265,
"grad_norm": 0.6403146982192993,
"learning_rate": 6.322251843127883e-06,
"loss": 0.5665578842163086,
"step": 504
},
{
"epoch": 3.8045112781954886,
"grad_norm": 0.24255450069904327,
"learning_rate": 6.0847407688830226e-06,
"loss": 0.44220831990242004,
"step": 506
},
{
"epoch": 3.819548872180451,
"grad_norm": 0.17681249976158142,
"learning_rate": 5.862783543618414e-06,
"loss": 0.6706622242927551,
"step": 508
},
{
"epoch": 3.8345864661654137,
"grad_norm": 0.6901529431343079,
"learning_rate": 5.65668936652867e-06,
"loss": 0.45284244418144226,
"step": 510
},
{
"epoch": 3.8496240601503757,
"grad_norm": 0.2059166431427002,
"learning_rate": 5.466745338668931e-06,
"loss": 0.6849936246871948,
"step": 512
},
{
"epoch": 3.8646616541353382,
"grad_norm": 0.17384979128837585,
"learning_rate": 5.293216063006581e-06,
"loss": 0.6412226557731628,
"step": 514
},
{
"epoch": 3.8796992481203008,
"grad_norm": 1.0503121614456177,
"learning_rate": 5.136343275814039e-06,
"loss": 0.8608755469322205,
"step": 516
},
{
"epoch": 3.8947368421052633,
"grad_norm": 0.19333027303218842,
"learning_rate": 4.9963455099162615e-06,
"loss": 0.5098147392272949,
"step": 518
},
{
"epoch": 3.909774436090226,
"grad_norm": 0.14984972774982452,
"learning_rate": 4.8734177902619205e-06,
"loss": 0.7260234951972961,
"step": 520
},
{
"epoch": 3.924812030075188,
"grad_norm": 0.14300547540187836,
"learning_rate": 4.7677313622423905e-06,
"loss": 0.8742654919624329,
"step": 522
},
{
"epoch": 3.9398496240601504,
"grad_norm": 0.1549258679151535,
"learning_rate": 4.6794334531371056e-06,
"loss": 0.9179413318634033,
"step": 524
},
{
"epoch": 3.954887218045113,
"grad_norm": 3.7465577125549316,
"learning_rate": 4.608647067017448e-06,
"loss": 0.8616862297058105,
"step": 526
},
{
"epoch": 3.969924812030075,
"grad_norm": 0.1470378190279007,
"learning_rate": 4.555470813395014e-06,
"loss": 0.5497387647628784,
"step": 528
},
{
"epoch": 3.9849624060150375,
"grad_norm": 0.2949855327606201,
"learning_rate": 4.519978769852865e-06,
"loss": 0.42557334899902344,
"step": 530
},
{
"epoch": 4.0,
"grad_norm": 0.2642301321029663,
"learning_rate": 4.502220378851213e-06,
"loss": 0.6198008060455322,
"step": 532
},
{
"epoch": 4.0,
"step": 532,
"total_flos": 3.873354436822696e+18,
"train_loss": 0.8859393315431767,
"train_runtime": 11385.0199,
"train_samples_per_second": 5.607,
"train_steps_per_second": 0.047
}
],
"logging_steps": 2,
"max_steps": 532,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.873354436822696e+18,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}