{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9985974754558204,
"eval_steps": 45,
"global_step": 178,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005610098176718092,
"grad_norm": 12.789434532326155,
"learning_rate": 3.0000000000000004e-07,
"loss": 1.6735,
"step": 1
},
{
"epoch": 0.005610098176718092,
"eval_loss": 1.6279159784317017,
"eval_runtime": 411.6995,
"eval_samples_per_second": 3.998,
"eval_steps_per_second": 0.5,
"step": 1
},
{
"epoch": 0.011220196353436185,
"grad_norm": 12.449110229764761,
"learning_rate": 6.000000000000001e-07,
"loss": 1.6787,
"step": 2
},
{
"epoch": 0.016830294530154277,
"grad_norm": 12.965521915327457,
"learning_rate": 9e-07,
"loss": 1.7002,
"step": 3
},
{
"epoch": 0.02244039270687237,
"grad_norm": 12.312461520787544,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.6735,
"step": 4
},
{
"epoch": 0.028050490883590462,
"grad_norm": 16.45705379890311,
"learning_rate": 1.5e-06,
"loss": 1.6845,
"step": 5
},
{
"epoch": 0.033660589060308554,
"grad_norm": 18.324079461896748,
"learning_rate": 1.8e-06,
"loss": 1.7064,
"step": 6
},
{
"epoch": 0.03927068723702665,
"grad_norm": 9.790851828818997,
"learning_rate": 2.1e-06,
"loss": 1.6267,
"step": 7
},
{
"epoch": 0.04488078541374474,
"grad_norm": 7.050739527602004,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.6552,
"step": 8
},
{
"epoch": 0.05049088359046283,
"grad_norm": 4.33578468991953,
"learning_rate": 2.7e-06,
"loss": 1.6194,
"step": 9
},
{
"epoch": 0.056100981767180924,
"grad_norm": 5.030821312124895,
"learning_rate": 3e-06,
"loss": 1.5938,
"step": 10
},
{
"epoch": 0.061711079943899017,
"grad_norm": 5.650505437860036,
"learning_rate": 2.99973774136912e-06,
"loss": 1.5772,
"step": 11
},
{
"epoch": 0.06732117812061711,
"grad_norm": 4.829608912708524,
"learning_rate": 2.998951057182598e-06,
"loss": 1.5624,
"step": 12
},
{
"epoch": 0.0729312762973352,
"grad_norm": 4.067559287459243,
"learning_rate": 2.997640222526725e-06,
"loss": 1.5824,
"step": 13
},
{
"epoch": 0.0785413744740533,
"grad_norm": 2.856039166477444,
"learning_rate": 2.99580569577177e-06,
"loss": 1.5468,
"step": 14
},
{
"epoch": 0.08415147265077139,
"grad_norm": 1.5746141536824405,
"learning_rate": 2.9934481184117006e-06,
"loss": 1.5761,
"step": 15
},
{
"epoch": 0.08976157082748948,
"grad_norm": 1.594708904780719,
"learning_rate": 2.9905683148398643e-06,
"loss": 1.5417,
"step": 16
},
{
"epoch": 0.09537166900420757,
"grad_norm": 2.285228899703148,
"learning_rate": 2.9871672920607156e-06,
"loss": 1.5515,
"step": 17
},
{
"epoch": 0.10098176718092566,
"grad_norm": 2.767131287855067,
"learning_rate": 2.9832462393376928e-06,
"loss": 1.5464,
"step": 18
},
{
"epoch": 0.10659186535764376,
"grad_norm": 2.8597567602325418,
"learning_rate": 2.9788065277773537e-06,
"loss": 1.5283,
"step": 19
},
{
"epoch": 0.11220196353436185,
"grad_norm": 1.8265836755261031,
"learning_rate": 2.9738497098499328e-06,
"loss": 1.5256,
"step": 20
},
{
"epoch": 0.11781206171107994,
"grad_norm": 1.4609854239411417,
"learning_rate": 2.968377518846473e-06,
"loss": 1.5102,
"step": 21
},
{
"epoch": 0.12342215988779803,
"grad_norm": 1.4893607868634404,
"learning_rate": 2.9623918682727352e-06,
"loss": 1.5025,
"step": 22
},
{
"epoch": 0.12903225806451613,
"grad_norm": 1.4138175059246425,
"learning_rate": 2.9558948511800864e-06,
"loss": 1.5052,
"step": 23
},
{
"epoch": 0.13464235624123422,
"grad_norm": 1.0946413298956141,
"learning_rate": 2.9488887394336023e-06,
"loss": 1.48,
"step": 24
},
{
"epoch": 0.1402524544179523,
"grad_norm": 1.4436793505790257,
"learning_rate": 2.9413759829176495e-06,
"loss": 1.4985,
"step": 25
},
{
"epoch": 0.1458625525946704,
"grad_norm": 1.12305310745146,
"learning_rate": 2.933359208679211e-06,
"loss": 1.5052,
"step": 26
},
{
"epoch": 0.1514726507713885,
"grad_norm": 1.2555515662683936,
"learning_rate": 2.924841220009269e-06,
"loss": 1.4739,
"step": 27
},
{
"epoch": 0.1570827489481066,
"grad_norm": 1.2671751142906982,
"learning_rate": 2.9158249954625514e-06,
"loss": 1.4685,
"step": 28
},
{
"epoch": 0.16269284712482468,
"grad_norm": 1.0644419598203905,
"learning_rate": 2.906313687815999e-06,
"loss": 1.4829,
"step": 29
},
{
"epoch": 0.16830294530154277,
"grad_norm": 0.9862228471334081,
"learning_rate": 2.8963106229663065e-06,
"loss": 1.4973,
"step": 30
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.9461750827879526,
"learning_rate": 2.88581929876693e-06,
"loss": 1.5025,
"step": 31
},
{
"epoch": 0.17952314165497896,
"grad_norm": 1.0011739456144289,
"learning_rate": 2.8748433838049645e-06,
"loss": 1.4622,
"step": 32
},
{
"epoch": 0.18513323983169705,
"grad_norm": 0.7946864477883866,
"learning_rate": 2.8633867161183166e-06,
"loss": 1.4547,
"step": 33
},
{
"epoch": 0.19074333800841514,
"grad_norm": 0.9031467465213865,
"learning_rate": 2.851453301853629e-06,
"loss": 1.4632,
"step": 34
},
{
"epoch": 0.19635343618513323,
"grad_norm": 0.8215549924647519,
"learning_rate": 2.839047313865417e-06,
"loss": 1.4513,
"step": 35
},
{
"epoch": 0.20196353436185133,
"grad_norm": 0.6987065221709342,
"learning_rate": 2.8261730902569146e-06,
"loss": 1.4159,
"step": 36
},
{
"epoch": 0.20757363253856942,
"grad_norm": 0.7968200327984795,
"learning_rate": 2.8128351328631308e-06,
"loss": 1.4227,
"step": 37
},
{
"epoch": 0.2131837307152875,
"grad_norm": 0.7608131707190037,
"learning_rate": 2.7990381056766585e-06,
"loss": 1.4261,
"step": 38
},
{
"epoch": 0.2187938288920056,
"grad_norm": 0.8694558959167964,
"learning_rate": 2.7847868332167773e-06,
"loss": 1.4577,
"step": 39
},
{
"epoch": 0.2244039270687237,
"grad_norm": 0.8245063288142666,
"learning_rate": 2.7700862988424264e-06,
"loss": 1.4384,
"step": 40
},
{
"epoch": 0.2300140252454418,
"grad_norm": 0.7268091063750695,
"learning_rate": 2.7549416430096295e-06,
"loss": 1.4452,
"step": 41
},
{
"epoch": 0.23562412342215988,
"grad_norm": 0.8453387994411659,
"learning_rate": 2.7393581614739926e-06,
"loss": 1.4571,
"step": 42
},
{
"epoch": 0.24123422159887797,
"grad_norm": 0.7332728453209687,
"learning_rate": 2.7233413034388904e-06,
"loss": 1.4106,
"step": 43
},
{
"epoch": 0.24684431977559607,
"grad_norm": 0.7653926321695257,
"learning_rate": 2.7068966696500025e-06,
"loss": 1.4177,
"step": 44
},
{
"epoch": 0.25245441795231416,
"grad_norm": 0.7101618711873879,
"learning_rate": 2.690030010436853e-06,
"loss": 1.4032,
"step": 45
},
{
"epoch": 0.25245441795231416,
"eval_loss": 1.3188974857330322,
"eval_runtime": 411.4591,
"eval_samples_per_second": 4.0,
"eval_steps_per_second": 0.501,
"step": 45
},
{
"epoch": 0.25806451612903225,
"grad_norm": 0.6872418512994495,
"learning_rate": 2.6727472237020448e-06,
"loss": 1.4613,
"step": 46
},
{
"epoch": 0.26367461430575034,
"grad_norm": 0.8079045785150919,
"learning_rate": 2.6550543528588946e-06,
"loss": 1.4195,
"step": 47
},
{
"epoch": 0.26928471248246844,
"grad_norm": 0.7835286370246854,
"learning_rate": 2.6369575847181795e-06,
"loss": 1.438,
"step": 48
},
{
"epoch": 0.27489481065918653,
"grad_norm": 0.7388557016632363,
"learning_rate": 2.6184632473247484e-06,
"loss": 1.4499,
"step": 49
},
{
"epoch": 0.2805049088359046,
"grad_norm": 0.7379003000881167,
"learning_rate": 2.5995778077447395e-06,
"loss": 1.4335,
"step": 50
},
{
"epoch": 0.2861150070126227,
"grad_norm": 0.7208955742904049,
"learning_rate": 2.58030786980419e-06,
"loss": 1.4219,
"step": 51
},
{
"epoch": 0.2917251051893408,
"grad_norm": 0.7240715420334977,
"learning_rate": 2.5606601717798212e-06,
"loss": 1.3834,
"step": 52
},
{
"epoch": 0.2973352033660589,
"grad_norm": 0.6878143254748253,
"learning_rate": 2.5406415840428124e-06,
"loss": 1.41,
"step": 53
},
{
"epoch": 0.302945301542777,
"grad_norm": 0.702576965354967,
"learning_rate": 2.520259106656379e-06,
"loss": 1.423,
"step": 54
},
{
"epoch": 0.3085553997194951,
"grad_norm": 0.6940628737560203,
"learning_rate": 2.499519866928006e-06,
"loss": 1.4233,
"step": 55
},
{
"epoch": 0.3141654978962132,
"grad_norm": 0.7706923504861696,
"learning_rate": 2.4784311169171817e-06,
"loss": 1.4052,
"step": 56
},
{
"epoch": 0.31977559607293127,
"grad_norm": 0.7234464811104628,
"learning_rate": 2.457000230899513e-06,
"loss": 1.3801,
"step": 57
},
{
"epoch": 0.32538569424964936,
"grad_norm": 0.7217889248580507,
"learning_rate": 2.4352347027881005e-06,
"loss": 1.4094,
"step": 58
},
{
"epoch": 0.33099579242636745,
"grad_norm": 0.7024631703571308,
"learning_rate": 2.4131421435130812e-06,
"loss": 1.4145,
"step": 59
},
{
"epoch": 0.33660589060308554,
"grad_norm": 0.6852493385331954,
"learning_rate": 2.390730278360252e-06,
"loss": 1.3909,
"step": 60
},
{
"epoch": 0.34221598877980364,
"grad_norm": 0.6951742071723167,
"learning_rate": 2.368006944269709e-06,
"loss": 1.3818,
"step": 61
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.6431308485618643,
"learning_rate": 2.344980087095433e-06,
"loss": 1.4271,
"step": 62
},
{
"epoch": 0.3534361851332398,
"grad_norm": 0.7100647383746512,
"learning_rate": 2.321657758826807e-06,
"loss": 1.4262,
"step": 63
},
{
"epoch": 0.3590462833099579,
"grad_norm": 0.7213767487790479,
"learning_rate": 2.298048114773005e-06,
"loss": 1.3896,
"step": 64
},
{
"epoch": 0.364656381486676,
"grad_norm": 0.6448491692515501,
"learning_rate": 2.27415941071126e-06,
"loss": 1.3985,
"step": 65
},
{
"epoch": 0.3702664796633941,
"grad_norm": 0.6409510040219449,
"learning_rate": 2.25e-06,
"loss": 1.3947,
"step": 66
},
{
"epoch": 0.3758765778401122,
"grad_norm": 0.6347781401680477,
"learning_rate": 2.22557833065786e-06,
"loss": 1.3912,
"step": 67
},
{
"epoch": 0.3814866760168303,
"grad_norm": 0.6448895288436695,
"learning_rate": 2.200902942409593e-06,
"loss": 1.3961,
"step": 68
},
{
"epoch": 0.3870967741935484,
"grad_norm": 0.6681259500416868,
"learning_rate": 2.175982463699918e-06,
"loss": 1.3908,
"step": 69
},
{
"epoch": 0.39270687237026647,
"grad_norm": 0.6645485721507981,
"learning_rate": 2.150825608676337e-06,
"loss": 1.3841,
"step": 70
},
{
"epoch": 0.39831697054698456,
"grad_norm": 0.6566579644017881,
"learning_rate": 2.1254411741419925e-06,
"loss": 1.4203,
"step": 71
},
{
"epoch": 0.40392706872370265,
"grad_norm": 0.6725026800665824,
"learning_rate": 2.0998380364796113e-06,
"loss": 1.3682,
"step": 72
},
{
"epoch": 0.40953716690042075,
"grad_norm": 0.6488827604064351,
"learning_rate": 2.074025148547635e-06,
"loss": 1.3813,
"step": 73
},
{
"epoch": 0.41514726507713884,
"grad_norm": 0.719525409048368,
"learning_rate": 2.048011536549593e-06,
"loss": 1.3905,
"step": 74
},
{
"epoch": 0.42075736325385693,
"grad_norm": 0.6834354719235972,
"learning_rate": 2.0218062968778406e-06,
"loss": 1.3495,
"step": 75
},
{
"epoch": 0.426367461430575,
"grad_norm": 0.6906956307555222,
"learning_rate": 1.9954185929327507e-06,
"loss": 1.4199,
"step": 76
},
{
"epoch": 0.4319775596072931,
"grad_norm": 0.6568235975258354,
"learning_rate": 1.9688576519184667e-06,
"loss": 1.3895,
"step": 77
},
{
"epoch": 0.4375876577840112,
"grad_norm": 0.6386011841528437,
"learning_rate": 1.9421327616163564e-06,
"loss": 1.3956,
"step": 78
},
{
"epoch": 0.4431977559607293,
"grad_norm": 0.6891639766601969,
"learning_rate": 1.915253267137274e-06,
"loss": 1.3674,
"step": 79
},
{
"epoch": 0.4488078541374474,
"grad_norm": 0.6656583761906066,
"learning_rate": 1.888228567653781e-06,
"loss": 1.3872,
"step": 80
},
{
"epoch": 0.4544179523141655,
"grad_norm": 0.6515567085010856,
"learning_rate": 1.8610681131134598e-06,
"loss": 1.4106,
"step": 81
},
{
"epoch": 0.4600280504908836,
"grad_norm": 0.6363151573327763,
"learning_rate": 1.8337814009344715e-06,
"loss": 1.3741,
"step": 82
},
{
"epoch": 0.46563814866760167,
"grad_norm": 0.6412726533083525,
"learning_rate": 1.8063779726845207e-06,
"loss": 1.3622,
"step": 83
},
{
"epoch": 0.47124824684431976,
"grad_norm": 0.6306344422672963,
"learning_rate": 1.778867410744372e-06,
"loss": 1.3962,
"step": 84
},
{
"epoch": 0.47685834502103785,
"grad_norm": 0.6430451206374499,
"learning_rate": 1.7512593349571046e-06,
"loss": 1.3781,
"step": 85
},
{
"epoch": 0.48246844319775595,
"grad_norm": 0.6972355527865212,
"learning_rate": 1.7235633992642616e-06,
"loss": 1.3768,
"step": 86
},
{
"epoch": 0.48807854137447404,
"grad_norm": 0.6512242836910552,
"learning_rate": 1.6957892883300778e-06,
"loss": 1.374,
"step": 87
},
{
"epoch": 0.49368863955119213,
"grad_norm": 0.6424119203317555,
"learning_rate": 1.6679467141549617e-06,
"loss": 1.3632,
"step": 88
},
{
"epoch": 0.4992987377279102,
"grad_norm": 0.6244684823196682,
"learning_rate": 1.6400454126794258e-06,
"loss": 1.3795,
"step": 89
},
{
"epoch": 0.5049088359046283,
"grad_norm": 0.6597628935764579,
"learning_rate": 1.6120951403796365e-06,
"loss": 1.3647,
"step": 90
},
{
"epoch": 0.5049088359046283,
"eval_loss": 1.2600828409194946,
"eval_runtime": 411.695,
"eval_samples_per_second": 3.998,
"eval_steps_per_second": 0.5,
"step": 90
},
{
"epoch": 0.5105189340813464,
"grad_norm": 0.6314706351932109,
"learning_rate": 1.5841056708557877e-06,
"loss": 1.3898,
"step": 91
},
{
"epoch": 0.5161290322580645,
"grad_norm": 0.6204474126915034,
"learning_rate": 1.5560867914144889e-06,
"loss": 1.3316,
"step": 92
},
{
"epoch": 0.5217391304347826,
"grad_norm": 0.6452070749940022,
"learning_rate": 1.5280482996463535e-06,
"loss": 1.365,
"step": 93
},
{
"epoch": 0.5273492286115007,
"grad_norm": 0.623770196055313,
"learning_rate": 1.5e-06,
"loss": 1.367,
"step": 94
},
{
"epoch": 0.5329593267882188,
"grad_norm": 0.6294036193527197,
"learning_rate": 1.471951700353647e-06,
"loss": 1.3445,
"step": 95
},
{
"epoch": 0.5385694249649369,
"grad_norm": 0.6442245103204152,
"learning_rate": 1.4439132085855116e-06,
"loss": 1.3814,
"step": 96
},
{
"epoch": 0.544179523141655,
"grad_norm": 0.6225427313645252,
"learning_rate": 1.4158943291442122e-06,
"loss": 1.3695,
"step": 97
},
{
"epoch": 0.5497896213183731,
"grad_norm": 0.6552086842949354,
"learning_rate": 1.3879048596203636e-06,
"loss": 1.3786,
"step": 98
},
{
"epoch": 0.5553997194950911,
"grad_norm": 0.62874676463444,
"learning_rate": 1.3599545873205742e-06,
"loss": 1.3822,
"step": 99
},
{
"epoch": 0.5610098176718092,
"grad_norm": 0.6574014422172325,
"learning_rate": 1.3320532858450384e-06,
"loss": 1.3618,
"step": 100
},
{
"epoch": 0.5666199158485273,
"grad_norm": 0.6383976290918701,
"learning_rate": 1.304210711669923e-06,
"loss": 1.3588,
"step": 101
},
{
"epoch": 0.5722300140252454,
"grad_norm": 0.6270735205120959,
"learning_rate": 1.2764366007357383e-06,
"loss": 1.3777,
"step": 102
},
{
"epoch": 0.5778401122019635,
"grad_norm": 0.6351754233235616,
"learning_rate": 1.2487406650428957e-06,
"loss": 1.3828,
"step": 103
},
{
"epoch": 0.5834502103786816,
"grad_norm": 0.6527833020722423,
"learning_rate": 1.2211325892556282e-06,
"loss": 1.3954,
"step": 104
},
{
"epoch": 0.5890603085553997,
"grad_norm": 0.6427076558908459,
"learning_rate": 1.1936220273154798e-06,
"loss": 1.4066,
"step": 105
},
{
"epoch": 0.5946704067321178,
"grad_norm": 0.6691035857323574,
"learning_rate": 1.1662185990655286e-06,
"loss": 1.3558,
"step": 106
},
{
"epoch": 0.6002805049088359,
"grad_norm": 0.6342212523114369,
"learning_rate": 1.138931886886541e-06,
"loss": 1.3616,
"step": 107
},
{
"epoch": 0.605890603085554,
"grad_norm": 0.6425719617099173,
"learning_rate": 1.1117714323462188e-06,
"loss": 1.3708,
"step": 108
},
{
"epoch": 0.6115007012622721,
"grad_norm": 0.6372645993247724,
"learning_rate": 1.084746732862726e-06,
"loss": 1.3392,
"step": 109
},
{
"epoch": 0.6171107994389902,
"grad_norm": 0.6808155418004899,
"learning_rate": 1.0578672383836437e-06,
"loss": 1.3685,
"step": 110
},
{
"epoch": 0.6227208976157083,
"grad_norm": 0.625992555276094,
"learning_rate": 1.0311423480815335e-06,
"loss": 1.3819,
"step": 111
},
{
"epoch": 0.6283309957924264,
"grad_norm": 0.6502367036299962,
"learning_rate": 1.0045814070672498e-06,
"loss": 1.3529,
"step": 112
},
{
"epoch": 0.6339410939691444,
"grad_norm": 0.6806106708733632,
"learning_rate": 9.78193703122159e-07,
"loss": 1.3678,
"step": 113
},
{
"epoch": 0.6395511921458625,
"grad_norm": 0.6207625932802451,
"learning_rate": 9.519884634504075e-07,
"loss": 1.361,
"step": 114
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.6316217262222912,
"learning_rate": 9.259748514523654e-07,
"loss": 1.3581,
"step": 115
},
{
"epoch": 0.6507713884992987,
"grad_norm": 0.6903717607402587,
"learning_rate": 9.001619635203888e-07,
"loss": 1.3457,
"step": 116
},
{
"epoch": 0.6563814866760168,
"grad_norm": 0.6430969718754993,
"learning_rate": 8.745588258580084e-07,
"loss": 1.3689,
"step": 117
},
{
"epoch": 0.6619915848527349,
"grad_norm": 0.6191251274798687,
"learning_rate": 8.49174391323663e-07,
"loss": 1.3652,
"step": 118
},
{
"epoch": 0.667601683029453,
"grad_norm": 0.6607471005683063,
"learning_rate": 8.240175363000819e-07,
"loss": 1.3715,
"step": 119
},
{
"epoch": 0.6732117812061711,
"grad_norm": 0.642069328346416,
"learning_rate": 7.99097057590407e-07,
"loss": 1.3533,
"step": 120
},
{
"epoch": 0.6788218793828892,
"grad_norm": 0.6140370733400766,
"learning_rate": 7.744216693421403e-07,
"loss": 1.3517,
"step": 121
},
{
"epoch": 0.6844319775596073,
"grad_norm": 0.6340075524580069,
"learning_rate": 7.500000000000003e-07,
"loss": 1.3818,
"step": 122
},
{
"epoch": 0.6900420757363254,
"grad_norm": 0.645243182347647,
"learning_rate": 7.258405892887399e-07,
"loss": 1.3672,
"step": 123
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.6702639753033074,
"learning_rate": 7.019518852269954e-07,
"loss": 1.3539,
"step": 124
},
{
"epoch": 0.7012622720897616,
"grad_norm": 0.6387297942086914,
"learning_rate": 6.783422411731932e-07,
"loss": 1.3604,
"step": 125
},
{
"epoch": 0.7068723702664796,
"grad_norm": 0.620394148061845,
"learning_rate": 6.550199129045669e-07,
"loss": 1.383,
"step": 126
},
{
"epoch": 0.7124824684431977,
"grad_norm": 0.6327577478686799,
"learning_rate": 6.319930557302914e-07,
"loss": 1.3602,
"step": 127
},
{
"epoch": 0.7180925666199158,
"grad_norm": 0.6230687543936638,
"learning_rate": 6.092697216397478e-07,
"loss": 1.3563,
"step": 128
},
{
"epoch": 0.7237026647966339,
"grad_norm": 0.6138116069294336,
"learning_rate": 5.868578564869191e-07,
"loss": 1.3562,
"step": 129
},
{
"epoch": 0.729312762973352,
"grad_norm": 0.6378183688587699,
"learning_rate": 5.647652972118998e-07,
"loss": 1.3466,
"step": 130
},
{
"epoch": 0.7349228611500701,
"grad_norm": 0.611795840417991,
"learning_rate": 5.429997691004874e-07,
"loss": 1.3373,
"step": 131
},
{
"epoch": 0.7405329593267882,
"grad_norm": 0.6281810459467961,
"learning_rate": 5.215688830828188e-07,
"loss": 1.3747,
"step": 132
},
{
"epoch": 0.7461430575035063,
"grad_norm": 0.6349646039018948,
"learning_rate": 5.004801330719941e-07,
"loss": 1.3683,
"step": 133
},
{
"epoch": 0.7517531556802244,
"grad_norm": 0.629745084121343,
"learning_rate": 4.797408933436207e-07,
"loss": 1.3565,
"step": 134
},
{
"epoch": 0.7573632538569425,
"grad_norm": 0.6269791171808803,
"learning_rate": 4.5935841595718754e-07,
"loss": 1.3609,
"step": 135
},
{
"epoch": 0.7573632538569425,
"eval_loss": 1.2410979270935059,
"eval_runtime": 413.3103,
"eval_samples_per_second": 3.982,
"eval_steps_per_second": 0.498,
"step": 135
},
{
"epoch": 0.7629733520336606,
"grad_norm": 0.6267450715270643,
"learning_rate": 4.3933982822017883e-07,
"loss": 1.3764,
"step": 136
},
{
"epoch": 0.7685834502103787,
"grad_norm": 0.6372731343606247,
"learning_rate": 4.196921301958104e-07,
"loss": 1.3639,
"step": 137
},
{
"epoch": 0.7741935483870968,
"grad_norm": 0.6290588594803654,
"learning_rate": 4.0042219225526084e-07,
"loss": 1.3475,
"step": 138
},
{
"epoch": 0.7798036465638148,
"grad_norm": 0.6282644727559273,
"learning_rate": 3.8153675267525163e-07,
"loss": 1.3802,
"step": 139
},
{
"epoch": 0.7854137447405329,
"grad_norm": 0.6333972099731577,
"learning_rate": 3.6304241528182033e-07,
"loss": 1.3695,
"step": 140
},
{
"epoch": 0.791023842917251,
"grad_norm": 0.615703161986596,
"learning_rate": 3.449456471411058e-07,
"loss": 1.3712,
"step": 141
},
{
"epoch": 0.7966339410939691,
"grad_norm": 0.6341051573545761,
"learning_rate": 3.272527762979553e-07,
"loss": 1.3432,
"step": 142
},
{
"epoch": 0.8022440392706872,
"grad_norm": 0.6146652686424743,
"learning_rate": 3.0996998956314745e-07,
"loss": 1.351,
"step": 143
},
{
"epoch": 0.8078541374474053,
"grad_norm": 0.6013902465160864,
"learning_rate": 2.9310333034999746e-07,
"loss": 1.337,
"step": 144
},
{
"epoch": 0.8134642356241234,
"grad_norm": 0.6173433844669265,
"learning_rate": 2.7665869656110975e-07,
"loss": 1.3672,
"step": 145
},
{
"epoch": 0.8190743338008415,
"grad_norm": 0.6391927308858952,
"learning_rate": 2.6064183852600797e-07,
"loss": 1.3472,
"step": 146
},
{
"epoch": 0.8246844319775596,
"grad_norm": 0.6341150140648005,
"learning_rate": 2.4505835699037037e-07,
"loss": 1.3729,
"step": 147
},
{
"epoch": 0.8302945301542777,
"grad_norm": 0.642511533531499,
"learning_rate": 2.299137011575738e-07,
"loss": 1.3734,
"step": 148
},
{
"epoch": 0.8359046283309958,
"grad_norm": 0.6268984544418437,
"learning_rate": 2.15213166783223e-07,
"loss": 1.3552,
"step": 149
},
{
"epoch": 0.8415147265077139,
"grad_norm": 0.599379702979861,
"learning_rate": 2.0096189432334195e-07,
"loss": 1.3036,
"step": 150
},
{
"epoch": 0.847124824684432,
"grad_norm": 0.6046862952380841,
"learning_rate": 1.8716486713686948e-07,
"loss": 1.3505,
"step": 151
},
{
"epoch": 0.85273492286115,
"grad_norm": 0.6115539109578072,
"learning_rate": 1.7382690974308551e-07,
"loss": 1.3102,
"step": 152
},
{
"epoch": 0.8583450210378681,
"grad_norm": 0.6012756116742142,
"learning_rate": 1.6095268613458302e-07,
"loss": 1.3474,
"step": 153
},
{
"epoch": 0.8639551192145862,
"grad_norm": 0.5895604903788673,
"learning_rate": 1.4854669814637145e-07,
"loss": 1.3893,
"step": 154
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.608459654122053,
"learning_rate": 1.3661328388168358e-07,
"loss": 1.3721,
"step": 155
},
{
"epoch": 0.8751753155680224,
"grad_norm": 0.6357118654920566,
"learning_rate": 1.251566161950357e-07,
"loss": 1.3614,
"step": 156
},
{
"epoch": 0.8807854137447405,
"grad_norm": 0.6224055669530941,
"learning_rate": 1.141807012330699e-07,
"loss": 1.3448,
"step": 157
},
{
"epoch": 0.8863955119214586,
"grad_norm": 0.6303897910712531,
"learning_rate": 1.036893770336938e-07,
"loss": 1.3708,
"step": 158
},
{
"epoch": 0.8920056100981767,
"grad_norm": 0.6361734072038168,
"learning_rate": 9.368631218400137e-08,
"loss": 1.339,
"step": 159
},
{
"epoch": 0.8976157082748948,
"grad_norm": 0.6146505272733668,
"learning_rate": 8.417500453744864e-08,
"loss": 1.374,
"step": 160
},
{
"epoch": 0.9032258064516129,
"grad_norm": 0.6153978550286938,
"learning_rate": 7.515877999073101e-08,
"loss": 1.365,
"step": 161
},
{
"epoch": 0.908835904628331,
"grad_norm": 0.6345016503291776,
"learning_rate": 6.664079132078882e-08,
"loss": 1.3761,
"step": 162
},
{
"epoch": 0.9144460028050491,
"grad_norm": 0.6151090819609036,
"learning_rate": 5.8624017082350765e-08,
"loss": 1.3381,
"step": 163
},
{
"epoch": 0.9200561009817672,
"grad_norm": 0.6155169224032129,
"learning_rate": 5.11112605663977e-08,
"loss": 1.3534,
"step": 164
},
{
"epoch": 0.9256661991584852,
"grad_norm": 0.6227407416490074,
"learning_rate": 4.4105148819913564e-08,
"loss": 1.393,
"step": 165
},
{
"epoch": 0.9312762973352033,
"grad_norm": 0.6234769604342408,
"learning_rate": 3.7608131727264573e-08,
"loss": 1.3618,
"step": 166
},
{
"epoch": 0.9368863955119214,
"grad_norm": 0.6200580565294606,
"learning_rate": 3.162248115352745e-08,
"loss": 1.3641,
"step": 167
},
{
"epoch": 0.9424964936886395,
"grad_norm": 0.6007414904648882,
"learning_rate": 2.6150290150067592e-08,
"loss": 1.3281,
"step": 168
},
{
"epoch": 0.9481065918653576,
"grad_norm": 0.6452144194523336,
"learning_rate": 2.1193472222646172e-08,
"loss": 1.3445,
"step": 169
},
{
"epoch": 0.9537166900420757,
"grad_norm": 0.5953544138269807,
"learning_rate": 1.6753760662307216e-08,
"loss": 1.3349,
"step": 170
},
{
"epoch": 0.9593267882187938,
"grad_norm": 0.591032939002349,
"learning_rate": 1.2832707939284426e-08,
"loss": 1.3163,
"step": 171
},
{
"epoch": 0.9649368863955119,
"grad_norm": 0.6080867514880087,
"learning_rate": 9.431685160136094e-09,
"loss": 1.3439,
"step": 172
},
{
"epoch": 0.97054698457223,
"grad_norm": 0.5948670418509701,
"learning_rate": 6.55188158829928e-09,
"loss": 1.3127,
"step": 173
},
{
"epoch": 0.9761570827489481,
"grad_norm": 0.5908769690929122,
"learning_rate": 4.194304228229806e-09,
"loss": 1.3366,
"step": 174
},
{
"epoch": 0.9817671809256662,
"grad_norm": 0.6166895839754339,
"learning_rate": 2.359777473275093e-09,
"loss": 1.3558,
"step": 175
},
{
"epoch": 0.9873772791023843,
"grad_norm": 0.604759073223834,
"learning_rate": 1.0489428174020875e-09,
"loss": 1.3575,
"step": 176
},
{
"epoch": 0.9929873772791024,
"grad_norm": 0.6214697548686083,
"learning_rate": 2.622586308803632e-10,
"loss": 1.3599,
"step": 177
},
{
"epoch": 0.9985974754558204,
"grad_norm": 0.6024720195804787,
"learning_rate": 0.0,
"loss": 1.3521,
"step": 178
}
],
"logging_steps": 1,
"max_steps": 178,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 223460437524480.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}