{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9724310776942353,
"eval_steps": 500,
"global_step": 147,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.020050125313283207,
"grad_norm": 6.100090649437154,
"learning_rate": 5.333333333333334e-06,
"loss": 1.0088,
"step": 1
},
{
"epoch": 0.040100250626566414,
"grad_norm": 6.165069480911681,
"learning_rate": 1.0666666666666667e-05,
"loss": 1.0202,
"step": 2
},
{
"epoch": 0.06015037593984962,
"grad_norm": 4.472412116195948,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.9496,
"step": 3
},
{
"epoch": 0.08020050125313283,
"grad_norm": 4.963164130948997,
"learning_rate": 2.1333333333333335e-05,
"loss": 0.9406,
"step": 4
},
{
"epoch": 0.10025062656641603,
"grad_norm": 4.041819143127636,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.8794,
"step": 5
},
{
"epoch": 0.12030075187969924,
"grad_norm": 3.719380540370943,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.9034,
"step": 6
},
{
"epoch": 0.14035087719298245,
"grad_norm": 2.3309985890149765,
"learning_rate": 3.733333333333334e-05,
"loss": 0.8102,
"step": 7
},
{
"epoch": 0.16040100250626566,
"grad_norm": 2.2467101719324454,
"learning_rate": 4.266666666666667e-05,
"loss": 0.786,
"step": 8
},
{
"epoch": 0.18045112781954886,
"grad_norm": 2.8539093369835227,
"learning_rate": 4.8e-05,
"loss": 0.7531,
"step": 9
},
{
"epoch": 0.20050125313283207,
"grad_norm": 2.0083048527276404,
"learning_rate": 5.333333333333333e-05,
"loss": 0.7468,
"step": 10
},
{
"epoch": 0.22055137844611528,
"grad_norm": 2.165125600420533,
"learning_rate": 5.8666666666666665e-05,
"loss": 0.7391,
"step": 11
},
{
"epoch": 0.24060150375939848,
"grad_norm": 2.122858303676939,
"learning_rate": 6.400000000000001e-05,
"loss": 0.7233,
"step": 12
},
{
"epoch": 0.2606516290726817,
"grad_norm": 2.2789276233273843,
"learning_rate": 6.933333333333334e-05,
"loss": 0.7286,
"step": 13
},
{
"epoch": 0.2807017543859649,
"grad_norm": 2.1616002933600393,
"learning_rate": 7.466666666666667e-05,
"loss": 0.7199,
"step": 14
},
{
"epoch": 0.3007518796992481,
"grad_norm": 3.2259560404881986,
"learning_rate": 8e-05,
"loss": 0.7223,
"step": 15
},
{
"epoch": 0.3208020050125313,
"grad_norm": 1.5369895434247212,
"learning_rate": 7.998867178772517e-05,
"loss": 0.7063,
"step": 16
},
{
"epoch": 0.3408521303258145,
"grad_norm": 3.28272311228189,
"learning_rate": 7.995469356732033e-05,
"loss": 0.7174,
"step": 17
},
{
"epoch": 0.3609022556390977,
"grad_norm": 2.1611599200246974,
"learning_rate": 7.989808458441014e-05,
"loss": 0.6981,
"step": 18
},
{
"epoch": 0.38095238095238093,
"grad_norm": 1.9324451852274291,
"learning_rate": 7.981887690292339e-05,
"loss": 0.6943,
"step": 19
},
{
"epoch": 0.40100250626566414,
"grad_norm": 18.248869277300834,
"learning_rate": 7.971711538693153e-05,
"loss": 0.6998,
"step": 20
},
{
"epoch": 0.42105263157894735,
"grad_norm": 2.181882821404686,
"learning_rate": 7.959285767523732e-05,
"loss": 0.7193,
"step": 21
},
{
"epoch": 0.44110275689223055,
"grad_norm": 1.2480300021039636,
"learning_rate": 7.944617414872747e-05,
"loss": 0.6843,
"step": 22
},
{
"epoch": 0.46115288220551376,
"grad_norm": 2.3412336103636693,
"learning_rate": 7.927714789050826e-05,
"loss": 0.7089,
"step": 23
},
{
"epoch": 0.48120300751879697,
"grad_norm": 1.9642689883848843,
"learning_rate": 7.908587463884638e-05,
"loss": 0.6787,
"step": 24
},
{
"epoch": 0.5012531328320802,
"grad_norm": 1.3434579333848315,
"learning_rate": 7.887246273294167e-05,
"loss": 0.6773,
"step": 25
},
{
"epoch": 0.5213032581453634,
"grad_norm": 1.440602627293297,
"learning_rate": 7.863703305156273e-05,
"loss": 0.6792,
"step": 26
},
{
"epoch": 0.5413533834586466,
"grad_norm": 1.3175913095028617,
"learning_rate": 7.837971894457991e-05,
"loss": 0.6654,
"step": 27
},
{
"epoch": 0.5614035087719298,
"grad_norm": 1.3198703652450279,
"learning_rate": 7.810066615743443e-05,
"loss": 0.6524,
"step": 28
},
{
"epoch": 0.581453634085213,
"grad_norm": 0.781557000061513,
"learning_rate": 7.780003274858674e-05,
"loss": 0.6573,
"step": 29
},
{
"epoch": 0.6015037593984962,
"grad_norm": 1.1674794127969026,
"learning_rate": 7.747798899999048e-05,
"loss": 0.6664,
"step": 30
},
{
"epoch": 0.6215538847117794,
"grad_norm": 1.623871556077205,
"learning_rate": 7.71347173206429e-05,
"loss": 0.6722,
"step": 31
},
{
"epoch": 0.6416040100250626,
"grad_norm": 1.1713900589893373,
"learning_rate": 7.677041214326663e-05,
"loss": 0.6403,
"step": 32
},
{
"epoch": 0.6616541353383458,
"grad_norm": 1.1207108511375592,
"learning_rate": 7.638527981418075e-05,
"loss": 0.6427,
"step": 33
},
{
"epoch": 0.681704260651629,
"grad_norm": 1.6390101502728163,
"learning_rate": 7.597953847642413e-05,
"loss": 0.6451,
"step": 34
},
{
"epoch": 0.7017543859649122,
"grad_norm": 0.9703477640239104,
"learning_rate": 7.555341794619695e-05,
"loss": 0.6371,
"step": 35
},
{
"epoch": 0.7218045112781954,
"grad_norm": 1.888557130236824,
"learning_rate": 7.510715958269023e-05,
"loss": 0.6385,
"step": 36
},
{
"epoch": 0.7418546365914787,
"grad_norm": 1.4709265925596289,
"learning_rate": 7.464101615137756e-05,
"loss": 0.6468,
"step": 37
},
{
"epoch": 0.7619047619047619,
"grad_norm": 1.3340842858015245,
"learning_rate": 7.415525168084593e-05,
"loss": 0.636,
"step": 38
},
{
"epoch": 0.7819548872180451,
"grad_norm": 1.2331491459930695,
"learning_rate": 7.365014131324725e-05,
"loss": 0.6423,
"step": 39
},
{
"epoch": 0.8020050125313283,
"grad_norm": 1.0459942770103978,
"learning_rate": 7.312597114845483e-05,
"loss": 0.6405,
"step": 40
},
{
"epoch": 0.8220551378446115,
"grad_norm": 0.9742677496779505,
"learning_rate": 7.258303808201343e-05,
"loss": 0.619,
"step": 41
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.9126761857775636,
"learning_rate": 7.202164963697442e-05,
"loss": 0.6237,
"step": 42
},
{
"epoch": 0.8621553884711779,
"grad_norm": 1.1034224988906396,
"learning_rate": 7.144212378971151e-05,
"loss": 0.6126,
"step": 43
},
{
"epoch": 0.8822055137844611,
"grad_norm": 0.9050299139188852,
"learning_rate": 7.084478878981552e-05,
"loss": 0.6199,
"step": 44
},
{
"epoch": 0.9022556390977443,
"grad_norm": 0.9484346415934313,
"learning_rate": 7.022998297417034e-05,
"loss": 0.6242,
"step": 45
},
{
"epoch": 0.9223057644110275,
"grad_norm": 0.6761984153684619,
"learning_rate": 6.959805457531536e-05,
"loss": 0.6271,
"step": 46
},
{
"epoch": 0.9423558897243107,
"grad_norm": 0.7734163089859195,
"learning_rate": 6.89493615242028e-05,
"loss": 0.6057,
"step": 47
},
{
"epoch": 0.9624060150375939,
"grad_norm": 0.7126127374462361,
"learning_rate": 6.828427124746191e-05,
"loss": 0.6143,
"step": 48
},
{
"epoch": 0.9824561403508771,
"grad_norm": 0.7040138292374798,
"learning_rate": 6.760316045928449e-05,
"loss": 0.5971,
"step": 49
},
{
"epoch": 1.0150375939849625,
"grad_norm": 1.0358340840041869,
"learning_rate": 6.690641494805011e-05,
"loss": 1.0623,
"step": 50
},
{
"epoch": 1.0350877192982457,
"grad_norm": 0.975942175036343,
"learning_rate": 6.619442935781141e-05,
"loss": 0.5949,
"step": 51
},
{
"epoch": 1.055137844611529,
"grad_norm": 1.3817333278949258,
"learning_rate": 6.546760696476354e-05,
"loss": 0.5965,
"step": 52
},
{
"epoch": 1.0751879699248121,
"grad_norm": 0.5594198623207998,
"learning_rate": 6.472635944882421e-05,
"loss": 0.5817,
"step": 53
},
{
"epoch": 1.0952380952380953,
"grad_norm": 1.3502604906380835,
"learning_rate": 6.397110666045388e-05,
"loss": 0.5936,
"step": 54
},
{
"epoch": 1.1152882205513786,
"grad_norm": 0.662382371248644,
"learning_rate": 6.320227638284793e-05,
"loss": 0.597,
"step": 55
},
{
"epoch": 1.1353383458646618,
"grad_norm": 1.0092828128239282,
"learning_rate": 6.242030408963576e-05,
"loss": 0.5895,
"step": 56
},
{
"epoch": 1.155388471177945,
"grad_norm": 0.8148251028605993,
"learning_rate": 6.162563269822391e-05,
"loss": 0.5796,
"step": 57
},
{
"epoch": 1.1754385964912282,
"grad_norm": 0.7504641683568708,
"learning_rate": 6.0818712318922894e-05,
"loss": 0.5756,
"step": 58
},
{
"epoch": 1.1954887218045114,
"grad_norm": 0.7075215261142646,
"learning_rate": 6.000000000000001e-05,
"loss": 0.5899,
"step": 59
},
{
"epoch": 1.2155388471177946,
"grad_norm": 0.616627011160271,
"learning_rate": 5.916995946880228e-05,
"loss": 0.5756,
"step": 60
},
{
"epoch": 1.2355889724310778,
"grad_norm": 0.6224665456566449,
"learning_rate": 5.832906086909642e-05,
"loss": 0.5717,
"step": 61
},
{
"epoch": 1.255639097744361,
"grad_norm": 0.5410026994188678,
"learning_rate": 5.747778049477438e-05,
"loss": 0.5719,
"step": 62
},
{
"epoch": 1.2756892230576442,
"grad_norm": 0.5138816389771951,
"learning_rate": 5.661660052007547e-05,
"loss": 0.5767,
"step": 63
},
{
"epoch": 1.2957393483709274,
"grad_norm": 0.538651466379072,
"learning_rate": 5.574600872647766e-05,
"loss": 0.5754,
"step": 64
},
{
"epoch": 1.3157894736842106,
"grad_norm": 0.39832101327162095,
"learning_rate": 5.48664982264131e-05,
"loss": 0.5806,
"step": 65
},
{
"epoch": 1.3358395989974938,
"grad_norm": 0.46665463498566045,
"learning_rate": 5.397856718396394e-05,
"loss": 0.5622,
"step": 66
},
{
"epoch": 1.355889724310777,
"grad_norm": 0.33596753910393834,
"learning_rate": 5.3082718532696874e-05,
"loss": 0.5635,
"step": 67
},
{
"epoch": 1.3759398496240602,
"grad_norm": 0.36330721064185245,
"learning_rate": 5.217945969079629e-05,
"loss": 0.5728,
"step": 68
},
{
"epoch": 1.3959899749373434,
"grad_norm": 0.31668248946183414,
"learning_rate": 5.1269302273657195e-05,
"loss": 0.5829,
"step": 69
},
{
"epoch": 1.4160401002506267,
"grad_norm": 0.3108858835369522,
"learning_rate": 5.0352761804100835e-05,
"loss": 0.5797,
"step": 70
},
{
"epoch": 1.4360902255639099,
"grad_norm": 0.37076038809913947,
"learning_rate": 4.94303574203771e-05,
"loss": 0.5678,
"step": 71
},
{
"epoch": 1.456140350877193,
"grad_norm": 0.27669921803024705,
"learning_rate": 4.8502611582119065e-05,
"loss": 0.5644,
"step": 72
},
{
"epoch": 1.4761904761904763,
"grad_norm": 0.3206960394723747,
"learning_rate": 4.7570049774416414e-05,
"loss": 0.5696,
"step": 73
},
{
"epoch": 1.4962406015037595,
"grad_norm": 0.35836009606291436,
"learning_rate": 4.663320021017497e-05,
"loss": 0.5574,
"step": 74
},
{
"epoch": 1.5162907268170427,
"grad_norm": 0.26025491223433994,
"learning_rate": 4.5692593530931416e-05,
"loss": 0.5683,
"step": 75
},
{
"epoch": 1.536340852130326,
"grad_norm": 0.25457026883587025,
"learning_rate": 4.474876250629221e-05,
"loss": 0.565,
"step": 76
},
{
"epoch": 1.556390977443609,
"grad_norm": 0.32769122481932667,
"learning_rate": 4.38022417321673e-05,
"loss": 0.5641,
"step": 77
},
{
"epoch": 1.5764411027568923,
"grad_norm": 0.22460593199796938,
"learning_rate": 4.2853567327969296e-05,
"loss": 0.557,
"step": 78
},
{
"epoch": 1.5964912280701755,
"grad_norm": 0.21753199007833202,
"learning_rate": 4.19032766329497e-05,
"loss": 0.5578,
"step": 79
},
{
"epoch": 1.6165413533834587,
"grad_norm": 0.23083331300054297,
"learning_rate": 4.0951907901844296e-05,
"loss": 0.5622,
"step": 80
},
{
"epoch": 1.636591478696742,
"grad_norm": 0.2564157643733938,
"learning_rate": 4e-05,
"loss": 0.5657,
"step": 81
},
{
"epoch": 1.6566416040100251,
"grad_norm": 0.27813133557097336,
"learning_rate": 3.904809209815571e-05,
"loss": 0.5603,
"step": 82
},
{
"epoch": 1.6766917293233083,
"grad_norm": 0.2038258932695245,
"learning_rate": 3.809672336705031e-05,
"loss": 0.5572,
"step": 83
},
{
"epoch": 1.6967418546365916,
"grad_norm": 0.29155489774909854,
"learning_rate": 3.714643267203071e-05,
"loss": 0.5544,
"step": 84
},
{
"epoch": 1.7167919799498748,
"grad_norm": 0.16516627488221486,
"learning_rate": 3.6197758267832705e-05,
"loss": 0.5584,
"step": 85
},
{
"epoch": 1.736842105263158,
"grad_norm": 0.29249607068570543,
"learning_rate": 3.5251237493707804e-05,
"loss": 0.5677,
"step": 86
},
{
"epoch": 1.7568922305764412,
"grad_norm": 0.19265408478071472,
"learning_rate": 3.4307406469068604e-05,
"loss": 0.5632,
"step": 87
},
{
"epoch": 1.7769423558897244,
"grad_norm": 0.21254433897953356,
"learning_rate": 3.3366799789825044e-05,
"loss": 0.5512,
"step": 88
},
{
"epoch": 1.7969924812030076,
"grad_norm": 0.2170294921675785,
"learning_rate": 3.2429950225583606e-05,
"loss": 0.5493,
"step": 89
},
{
"epoch": 1.8170426065162908,
"grad_norm": 0.17517457106296594,
"learning_rate": 3.1497388417880935e-05,
"loss": 0.5522,
"step": 90
},
{
"epoch": 1.837092731829574,
"grad_norm": 0.21184436677480925,
"learning_rate": 3.0569642579622905e-05,
"loss": 0.5533,
"step": 91
},
{
"epoch": 1.8571428571428572,
"grad_norm": 0.1566054507779003,
"learning_rate": 2.9647238195899168e-05,
"loss": 0.538,
"step": 92
},
{
"epoch": 1.8771929824561404,
"grad_norm": 0.19291518917014827,
"learning_rate": 2.873069772634281e-05,
"loss": 0.5613,
"step": 93
},
{
"epoch": 1.8972431077694236,
"grad_norm": 0.14403155629811726,
"learning_rate": 2.7820540309203728e-05,
"loss": 0.5561,
"step": 94
},
{
"epoch": 1.9172932330827068,
"grad_norm": 0.18577775134211605,
"learning_rate": 2.691728146730314e-05,
"loss": 0.5619,
"step": 95
},
{
"epoch": 1.93734335839599,
"grad_norm": 0.17655395819783837,
"learning_rate": 2.6021432816036073e-05,
"loss": 0.557,
"step": 96
},
{
"epoch": 1.9573934837092732,
"grad_norm": 0.15678742149650277,
"learning_rate": 2.5133501773586905e-05,
"loss": 0.55,
"step": 97
},
{
"epoch": 1.9774436090225564,
"grad_norm": 0.147011080540491,
"learning_rate": 2.425399127352235e-05,
"loss": 0.5615,
"step": 98
},
{
"epoch": 2.0100250626566414,
"grad_norm": 0.2896327588272941,
"learning_rate": 2.338339947992455e-05,
"loss": 0.984,
"step": 99
},
{
"epoch": 2.030075187969925,
"grad_norm": 0.17749955956373084,
"learning_rate": 2.2522219505225627e-05,
"loss": 0.5472,
"step": 100
},
{
"epoch": 2.050125313283208,
"grad_norm": 0.18960883016778624,
"learning_rate": 2.1670939130903585e-05,
"loss": 0.5246,
"step": 101
},
{
"epoch": 2.0701754385964914,
"grad_norm": 0.19580310466277323,
"learning_rate": 2.0830040531197744e-05,
"loss": 0.5333,
"step": 102
},
{
"epoch": 2.090225563909774,
"grad_norm": 0.17002438913970971,
"learning_rate": 2.0000000000000012e-05,
"loss": 0.5232,
"step": 103
},
{
"epoch": 2.110275689223058,
"grad_norm": 0.23635746246402964,
"learning_rate": 1.9181287681077116e-05,
"loss": 0.5299,
"step": 104
},
{
"epoch": 2.1303258145363406,
"grad_norm": 0.15533319903966208,
"learning_rate": 1.8374367301776112e-05,
"loss": 0.5193,
"step": 105
},
{
"epoch": 2.1503759398496243,
"grad_norm": 0.2082307249817901,
"learning_rate": 1.7579695910364235e-05,
"loss": 0.5342,
"step": 106
},
{
"epoch": 2.170426065162907,
"grad_norm": 0.14435259557657618,
"learning_rate": 1.679772361715208e-05,
"loss": 0.5361,
"step": 107
},
{
"epoch": 2.1904761904761907,
"grad_norm": 0.20712238757176837,
"learning_rate": 1.6028893339546122e-05,
"loss": 0.5331,
"step": 108
},
{
"epoch": 2.2105263157894735,
"grad_norm": 0.13522587328433971,
"learning_rate": 1.527364055117579e-05,
"loss": 0.5329,
"step": 109
},
{
"epoch": 2.230576441102757,
"grad_norm": 0.1593070945975799,
"learning_rate": 1.4532393035236477e-05,
"loss": 0.5323,
"step": 110
},
{
"epoch": 2.25062656641604,
"grad_norm": 0.1461967123983933,
"learning_rate": 1.3805570642188602e-05,
"loss": 0.5162,
"step": 111
},
{
"epoch": 2.2706766917293235,
"grad_norm": 0.11139864548024211,
"learning_rate": 1.30935850519499e-05,
"loss": 0.5258,
"step": 112
},
{
"epoch": 2.2907268170426063,
"grad_norm": 0.1459949648258527,
"learning_rate": 1.2396839540715528e-05,
"loss": 0.5249,
"step": 113
},
{
"epoch": 2.31077694235589,
"grad_norm": 0.1134072731915256,
"learning_rate": 1.1715728752538103e-05,
"loss": 0.5283,
"step": 114
},
{
"epoch": 2.3308270676691727,
"grad_norm": 0.10061797227153219,
"learning_rate": 1.1050638475797193e-05,
"loss": 0.5264,
"step": 115
},
{
"epoch": 2.3508771929824563,
"grad_norm": 0.11967704006720019,
"learning_rate": 1.0401945424684653e-05,
"loss": 0.5258,
"step": 116
},
{
"epoch": 2.370927318295739,
"grad_norm": 0.10081258865949869,
"learning_rate": 9.770017025829675e-06,
"loss": 0.5125,
"step": 117
},
{
"epoch": 2.3909774436090228,
"grad_norm": 0.0910736424605034,
"learning_rate": 9.155211210184495e-06,
"loss": 0.5215,
"step": 118
},
{
"epoch": 2.4110275689223055,
"grad_norm": 0.10320752546874508,
"learning_rate": 8.55787621028851e-06,
"loss": 0.5162,
"step": 119
},
{
"epoch": 2.431077694235589,
"grad_norm": 0.09217800293365436,
"learning_rate": 7.978350363025588e-06,
"loss": 0.5343,
"step": 120
},
{
"epoch": 2.451127819548872,
"grad_norm": 0.08490382451555073,
"learning_rate": 7.416961917986572e-06,
"loss": 0.5219,
"step": 121
},
{
"epoch": 2.4711779448621556,
"grad_norm": 0.08656662969137564,
"learning_rate": 6.874028851545174e-06,
"loss": 0.5212,
"step": 122
},
{
"epoch": 2.4912280701754383,
"grad_norm": 0.09050314726798651,
"learning_rate": 6.349858686752748e-06,
"loss": 0.5328,
"step": 123
},
{
"epoch": 2.511278195488722,
"grad_norm": 0.0824254065764698,
"learning_rate": 5.8447483191540784e-06,
"loss": 0.5282,
"step": 124
},
{
"epoch": 2.5313283208020048,
"grad_norm": 0.09004753737634522,
"learning_rate": 5.358983848622452e-06,
"loss": 0.5291,
"step": 125
},
{
"epoch": 2.5513784461152884,
"grad_norm": 0.08491851685766005,
"learning_rate": 4.892840417309775e-06,
"loss": 0.5174,
"step": 126
},
{
"epoch": 2.571428571428571,
"grad_norm": 0.08667374099676277,
"learning_rate": 4.446582053803066e-06,
"loss": 0.5269,
"step": 127
},
{
"epoch": 2.591478696741855,
"grad_norm": 0.08634185611814611,
"learning_rate": 4.020461523575873e-06,
"loss": 0.5404,
"step": 128
},
{
"epoch": 2.6115288220551376,
"grad_norm": 0.08142061930820212,
"learning_rate": 3.6147201858192627e-06,
"loss": 0.5297,
"step": 129
},
{
"epoch": 2.6315789473684212,
"grad_norm": 0.08234802409772482,
"learning_rate": 3.2295878567333784e-06,
"loss": 0.5347,
"step": 130
},
{
"epoch": 2.651629072681704,
"grad_norm": 0.0850700303547024,
"learning_rate": 2.8652826793570975e-06,
"loss": 0.5309,
"step": 131
},
{
"epoch": 2.6716791979949877,
"grad_norm": 0.08576766617100663,
"learning_rate": 2.5220110000095366e-06,
"loss": 0.529,
"step": 132
},
{
"epoch": 2.6917293233082704,
"grad_norm": 0.08621611962612956,
"learning_rate": 2.199967251413262e-06,
"loss": 0.526,
"step": 133
},
{
"epoch": 2.711779448621554,
"grad_norm": 0.07973648331165978,
"learning_rate": 1.8993338425655805e-06,
"loss": 0.5291,
"step": 134
},
{
"epoch": 2.731829573934837,
"grad_norm": 0.07607751613838545,
"learning_rate": 1.6202810554201099e-06,
"loss": 0.5287,
"step": 135
},
{
"epoch": 2.7518796992481205,
"grad_norm": 0.0764140719381241,
"learning_rate": 1.3629669484372722e-06,
"loss": 0.519,
"step": 136
},
{
"epoch": 2.7719298245614032,
"grad_norm": 0.07473144353146483,
"learning_rate": 1.127537267058334e-06,
"loss": 0.5299,
"step": 137
},
{
"epoch": 2.791979949874687,
"grad_norm": 0.07476164790510335,
"learning_rate": 9.141253611536238e-07,
"loss": 0.5335,
"step": 138
},
{
"epoch": 2.8120300751879697,
"grad_norm": 0.07472989997236351,
"learning_rate": 7.228521094917318e-07,
"loss": 0.5156,
"step": 139
},
{
"epoch": 2.8320802005012533,
"grad_norm": 0.07763611917758945,
"learning_rate": 5.538258512725403e-07,
"loss": 0.528,
"step": 140
},
{
"epoch": 2.852130325814536,
"grad_norm": 0.07864532159225761,
"learning_rate": 4.0714232476269265e-07,
"loss": 0.5205,
"step": 141
},
{
"epoch": 2.8721804511278197,
"grad_norm": 0.07157416039875991,
"learning_rate": 2.8288461306846817e-07,
"loss": 0.5251,
"step": 142
},
{
"epoch": 2.8922305764411025,
"grad_norm": 0.07874362826484563,
"learning_rate": 1.8112309707661647e-07,
"loss": 0.5326,
"step": 143
},
{
"epoch": 2.912280701754386,
"grad_norm": 0.07678251046065998,
"learning_rate": 1.019154155898594e-07,
"loss": 0.5325,
"step": 144
},
{
"epoch": 2.932330827067669,
"grad_norm": 0.07928853233638587,
"learning_rate": 4.530643267968149e-08,
"loss": 0.5283,
"step": 145
},
{
"epoch": 2.9523809523809526,
"grad_norm": 0.07049261679058248,
"learning_rate": 1.1328212274839267e-08,
"loss": 0.5307,
"step": 146
},
{
"epoch": 2.9724310776942353,
"grad_norm": 0.07518275904349948,
"learning_rate": 0.0,
"loss": 0.5333,
"step": 147
},
{
"epoch": 2.9724310776942353,
"step": 147,
"total_flos": 3.782746824809382e+18,
"train_loss": 0.6079183743924511,
"train_runtime": 22677.177,
"train_samples_per_second": 3.374,
"train_steps_per_second": 0.006
}
],
"logging_steps": 1.0,
"max_steps": 147,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.782746824809382e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}