{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 249,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012048192771084338,
"grad_norm": 40.75222396850586,
"learning_rate": 9.99960204377842e-06,
"loss": 0.8283,
"step": 1
},
{
"epoch": 0.024096385542168676,
"grad_norm": 7.6217546463012695,
"learning_rate": 9.99840823846134e-06,
"loss": 0.7463,
"step": 2
},
{
"epoch": 0.03614457831325301,
"grad_norm": 4.7737321853637695,
"learning_rate": 9.996418774081658e-06,
"loss": 0.6773,
"step": 3
},
{
"epoch": 0.04819277108433735,
"grad_norm": 7.380457401275635,
"learning_rate": 9.99363396732727e-06,
"loss": 0.7069,
"step": 4
},
{
"epoch": 0.060240963855421686,
"grad_norm": 6.07143497467041,
"learning_rate": 9.990054261490643e-06,
"loss": 0.8157,
"step": 5
},
{
"epoch": 0.07228915662650602,
"grad_norm": 32.628204345703125,
"learning_rate": 9.985680226398261e-06,
"loss": 0.7604,
"step": 6
},
{
"epoch": 0.08433734939759036,
"grad_norm": 7.0896759033203125,
"learning_rate": 9.980512558319915e-06,
"loss": 0.6947,
"step": 7
},
{
"epoch": 0.0963855421686747,
"grad_norm": 4.166346549987793,
"learning_rate": 9.974552079857873e-06,
"loss": 0.5901,
"step": 8
},
{
"epoch": 0.10843373493975904,
"grad_norm": 5.307025909423828,
"learning_rate": 9.967799739815925e-06,
"loss": 0.6768,
"step": 9
},
{
"epoch": 0.12048192771084337,
"grad_norm": 5.205200672149658,
"learning_rate": 9.960256613048367e-06,
"loss": 0.7401,
"step": 10
},
{
"epoch": 0.13253012048192772,
"grad_norm": 4.746809482574463,
"learning_rate": 9.951923900288888e-06,
"loss": 0.603,
"step": 11
},
{
"epoch": 0.14457831325301204,
"grad_norm": 4.554243087768555,
"learning_rate": 9.942802927959444e-06,
"loss": 0.5951,
"step": 12
},
{
"epoch": 0.1566265060240964,
"grad_norm": 4.13496732711792,
"learning_rate": 9.932895147959106e-06,
"loss": 0.6246,
"step": 13
},
{
"epoch": 0.1686746987951807,
"grad_norm": 5.653600692749023,
"learning_rate": 9.922202137432954e-06,
"loss": 0.8116,
"step": 14
},
{
"epoch": 0.18072289156626506,
"grad_norm": 4.084902286529541,
"learning_rate": 9.910725598521014e-06,
"loss": 0.5243,
"step": 15
},
{
"epoch": 0.1927710843373494,
"grad_norm": 4.84393835067749,
"learning_rate": 9.89846735808731e-06,
"loss": 0.5911,
"step": 16
},
{
"epoch": 0.20481927710843373,
"grad_norm": 3.5985801219940186,
"learning_rate": 9.885429367429062e-06,
"loss": 0.5873,
"step": 17
},
{
"epoch": 0.21686746987951808,
"grad_norm": 4.133760452270508,
"learning_rate": 9.871613701966067e-06,
"loss": 0.58,
"step": 18
},
{
"epoch": 0.2289156626506024,
"grad_norm": 5.736385345458984,
"learning_rate": 9.857022560910338e-06,
"loss": 0.6884,
"step": 19
},
{
"epoch": 0.24096385542168675,
"grad_norm": 5.400482177734375,
"learning_rate": 9.84165826691602e-06,
"loss": 0.7507,
"step": 20
},
{
"epoch": 0.25301204819277107,
"grad_norm": 3.2082321643829346,
"learning_rate": 9.825523265709667e-06,
"loss": 0.4539,
"step": 21
},
{
"epoch": 0.26506024096385544,
"grad_norm": 3.9605965614318848,
"learning_rate": 9.808620125700925e-06,
"loss": 0.5744,
"step": 22
},
{
"epoch": 0.27710843373493976,
"grad_norm": 3.652902603149414,
"learning_rate": 9.790951537573686e-06,
"loss": 0.4361,
"step": 23
},
{
"epoch": 0.2891566265060241,
"grad_norm": 3.5659713745117188,
"learning_rate": 9.772520313857777e-06,
"loss": 0.4565,
"step": 24
},
{
"epoch": 0.30120481927710846,
"grad_norm": 5.866443157196045,
"learning_rate": 9.753329388481261e-06,
"loss": 0.6564,
"step": 25
},
{
"epoch": 0.3132530120481928,
"grad_norm": 5.043295383453369,
"learning_rate": 9.733381816303395e-06,
"loss": 0.5905,
"step": 26
},
{
"epoch": 0.3253012048192771,
"grad_norm": 4.576389789581299,
"learning_rate": 9.712680772628365e-06,
"loss": 0.5458,
"step": 27
},
{
"epoch": 0.3373493975903614,
"grad_norm": 2.964594602584839,
"learning_rate": 9.691229552699817e-06,
"loss": 0.4196,
"step": 28
},
{
"epoch": 0.3493975903614458,
"grad_norm": 3.668825387954712,
"learning_rate": 9.669031571176322e-06,
"loss": 0.5939,
"step": 29
},
{
"epoch": 0.3614457831325301,
"grad_norm": 3.4135804176330566,
"learning_rate": 9.646090361587828e-06,
"loss": 0.4942,
"step": 30
},
{
"epoch": 0.37349397590361444,
"grad_norm": 3.3271186351776123,
"learning_rate": 9.622409575773162e-06,
"loss": 0.4447,
"step": 31
},
{
"epoch": 0.3855421686746988,
"grad_norm": 3.8561484813690186,
"learning_rate": 9.597992983298748e-06,
"loss": 0.5443,
"step": 32
},
{
"epoch": 0.39759036144578314,
"grad_norm": 3.4959912300109863,
"learning_rate": 9.572844470858537e-06,
"loss": 0.5228,
"step": 33
},
{
"epoch": 0.40963855421686746,
"grad_norm": 4.416797637939453,
"learning_rate": 9.546968041655326e-06,
"loss": 0.5745,
"step": 34
},
{
"epoch": 0.42168674698795183,
"grad_norm": 3.1685950756073,
"learning_rate": 9.520367814763514e-06,
"loss": 0.5249,
"step": 35
},
{
"epoch": 0.43373493975903615,
"grad_norm": 3.5792479515075684,
"learning_rate": 9.493048024473413e-06,
"loss": 0.5533,
"step": 36
},
{
"epoch": 0.4457831325301205,
"grad_norm": 3.136587619781494,
"learning_rate": 9.46501301961723e-06,
"loss": 0.5707,
"step": 37
},
{
"epoch": 0.4578313253012048,
"grad_norm": 6.66333532333374,
"learning_rate": 9.436267262876808e-06,
"loss": 0.5537,
"step": 38
},
{
"epoch": 0.46987951807228917,
"grad_norm": 3.710054397583008,
"learning_rate": 9.406815330073244e-06,
"loss": 0.5072,
"step": 39
},
{
"epoch": 0.4819277108433735,
"grad_norm": 2.439741611480713,
"learning_rate": 9.376661909438496e-06,
"loss": 0.4088,
"step": 40
},
{
"epoch": 0.4939759036144578,
"grad_norm": 2.6984646320343018,
"learning_rate": 9.3458118008691e-06,
"loss": 0.548,
"step": 41
},
{
"epoch": 0.5060240963855421,
"grad_norm": 2.626049757003784,
"learning_rate": 9.314269915162115e-06,
"loss": 0.541,
"step": 42
},
{
"epoch": 0.5180722891566265,
"grad_norm": 31.189899444580078,
"learning_rate": 9.282041273233402e-06,
"loss": 0.5461,
"step": 43
},
{
"epoch": 0.5301204819277109,
"grad_norm": 4.356227397918701,
"learning_rate": 9.249131005318388e-06,
"loss": 0.6082,
"step": 44
},
{
"epoch": 0.5421686746987951,
"grad_norm": 10.281394958496094,
"learning_rate": 9.215544350155423e-06,
"loss": 0.6193,
"step": 45
},
{
"epoch": 0.5542168674698795,
"grad_norm": 81.10453796386719,
"learning_rate": 9.18128665415186e-06,
"loss": 0.4795,
"step": 46
},
{
"epoch": 0.5662650602409639,
"grad_norm": 136.2274932861328,
"learning_rate": 9.146363370533004e-06,
"loss": 0.5669,
"step": 47
},
{
"epoch": 0.5783132530120482,
"grad_norm": 24.73008155822754,
"learning_rate": 9.110780058474052e-06,
"loss": 0.4712,
"step": 48
},
{
"epoch": 0.5903614457831325,
"grad_norm": 3.0569868087768555,
"learning_rate": 9.07454238221517e-06,
"loss": 0.4934,
"step": 49
},
{
"epoch": 0.6024096385542169,
"grad_norm": 3.192237615585327,
"learning_rate": 9.03765611015985e-06,
"loss": 0.5427,
"step": 50
},
{
"epoch": 0.6144578313253012,
"grad_norm": 1.920320749282837,
"learning_rate": 9.000127113956673e-06,
"loss": 0.4182,
"step": 51
},
{
"epoch": 0.6265060240963856,
"grad_norm": 3.1197104454040527,
"learning_rate": 8.961961367564652e-06,
"loss": 0.5577,
"step": 52
},
{
"epoch": 0.6385542168674698,
"grad_norm": 2.1309397220611572,
"learning_rate": 8.923164946302274e-06,
"loss": 0.5111,
"step": 53
},
{
"epoch": 0.6506024096385542,
"grad_norm": 2.3042995929718018,
"learning_rate": 8.883744025880429e-06,
"loss": 0.5015,
"step": 54
},
{
"epoch": 0.6626506024096386,
"grad_norm": 2.4492433071136475,
"learning_rate": 8.843704881419333e-06,
"loss": 0.3826,
"step": 55
},
{
"epoch": 0.6746987951807228,
"grad_norm": 2.3031723499298096,
"learning_rate": 8.803053886449644e-06,
"loss": 0.4694,
"step": 56
},
{
"epoch": 0.6867469879518072,
"grad_norm": 3.1464896202087402,
"learning_rate": 8.761797511897907e-06,
"loss": 0.5708,
"step": 57
},
{
"epoch": 0.6987951807228916,
"grad_norm": 2.5254249572753906,
"learning_rate": 8.719942325056496e-06,
"loss": 0.5605,
"step": 58
},
{
"epoch": 0.7108433734939759,
"grad_norm": 2.614318370819092,
"learning_rate": 8.67749498853821e-06,
"loss": 0.5702,
"step": 59
},
{
"epoch": 0.7228915662650602,
"grad_norm": 2.1782386302948,
"learning_rate": 8.634462259215719e-06,
"loss": 0.5409,
"step": 60
},
{
"epoch": 0.7349397590361446,
"grad_norm": 2.084237813949585,
"learning_rate": 8.590850987145964e-06,
"loss": 0.4923,
"step": 61
},
{
"epoch": 0.7469879518072289,
"grad_norm": 2.4142396450042725,
"learning_rate": 8.546668114479769e-06,
"loss": 0.6142,
"step": 62
},
{
"epoch": 0.7590361445783133,
"grad_norm": 1.6900039911270142,
"learning_rate": 8.501920674356755e-06,
"loss": 0.4445,
"step": 63
},
{
"epoch": 0.7710843373493976,
"grad_norm": 1.9757111072540283,
"learning_rate": 8.456615789785804e-06,
"loss": 0.491,
"step": 64
},
{
"epoch": 0.7831325301204819,
"grad_norm": 2.328930139541626,
"learning_rate": 8.410760672511188e-06,
"loss": 0.5563,
"step": 65
},
{
"epoch": 0.7951807228915663,
"grad_norm": 2.8067822456359863,
"learning_rate": 8.364362621864595e-06,
"loss": 0.6574,
"step": 66
},
{
"epoch": 0.8072289156626506,
"grad_norm": 2.0766549110412598,
"learning_rate": 8.31742902360319e-06,
"loss": 0.5063,
"step": 67
},
{
"epoch": 0.8192771084337349,
"grad_norm": 2.085911989212036,
"learning_rate": 8.269967348733947e-06,
"loss": 0.5504,
"step": 68
},
{
"epoch": 0.8313253012048193,
"grad_norm": 1.8254350423812866,
"learning_rate": 8.221985152324385e-06,
"loss": 0.4678,
"step": 69
},
{
"epoch": 0.8433734939759037,
"grad_norm": 2.208496332168579,
"learning_rate": 8.17349007229994e-06,
"loss": 0.5589,
"step": 70
},
{
"epoch": 0.8554216867469879,
"grad_norm": 2.833843469619751,
"learning_rate": 8.124489828228136e-06,
"loss": 0.6464,
"step": 71
},
{
"epoch": 0.8674698795180723,
"grad_norm": 2.181140661239624,
"learning_rate": 8.07499222008977e-06,
"loss": 0.6037,
"step": 72
},
{
"epoch": 0.8795180722891566,
"grad_norm": 1.5879639387130737,
"learning_rate": 8.025005127037282e-06,
"loss": 0.4077,
"step": 73
},
{
"epoch": 0.891566265060241,
"grad_norm": 1.94895601272583,
"learning_rate": 7.974536506140546e-06,
"loss": 0.4523,
"step": 74
},
{
"epoch": 0.9036144578313253,
"grad_norm": 2.282900810241699,
"learning_rate": 7.923594391120237e-06,
"loss": 0.4889,
"step": 75
},
{
"epoch": 0.9156626506024096,
"grad_norm": 1.8225998878479004,
"learning_rate": 7.872186891068997e-06,
"loss": 0.4483,
"step": 76
},
{
"epoch": 0.927710843373494,
"grad_norm": 2.1921205520629883,
"learning_rate": 7.820322189160618e-06,
"loss": 0.4848,
"step": 77
},
{
"epoch": 0.9397590361445783,
"grad_norm": 1.9695558547973633,
"learning_rate": 7.768008541347423e-06,
"loss": 0.4577,
"step": 78
},
{
"epoch": 0.9518072289156626,
"grad_norm": 2.367926836013794,
"learning_rate": 7.715254275046062e-06,
"loss": 0.6004,
"step": 79
},
{
"epoch": 0.963855421686747,
"grad_norm": 1.95900297164917,
"learning_rate": 7.66206778781193e-06,
"loss": 0.5161,
"step": 80
},
{
"epoch": 0.9759036144578314,
"grad_norm": 4.2675557136535645,
"learning_rate": 7.608457546002423e-06,
"loss": 0.4645,
"step": 81
},
{
"epoch": 0.9879518072289156,
"grad_norm": 2.129870891571045,
"learning_rate": 7.554432083429253e-06,
"loss": 0.5267,
"step": 82
},
{
"epoch": 1.0,
"grad_norm": 1.7695404291152954,
"learning_rate": 7.500000000000001e-06,
"loss": 0.3909,
"step": 83
},
{
"epoch": 1.0120481927710843,
"grad_norm": 2.0876364707946777,
"learning_rate": 7.445169960349167e-06,
"loss": 0.3333,
"step": 84
},
{
"epoch": 1.0240963855421688,
"grad_norm": 1.5992554426193237,
"learning_rate": 7.389950692458916e-06,
"loss": 0.3103,
"step": 85
},
{
"epoch": 1.036144578313253,
"grad_norm": 2.081721544265747,
"learning_rate": 7.3343509862697295e-06,
"loss": 0.286,
"step": 86
},
{
"epoch": 1.0481927710843373,
"grad_norm": 1.5453327894210815,
"learning_rate": 7.278379692281209e-06,
"loss": 0.2851,
"step": 87
},
{
"epoch": 1.0602409638554218,
"grad_norm": 1.6960233449935913,
"learning_rate": 7.22204572014322e-06,
"loss": 0.3118,
"step": 88
},
{
"epoch": 1.072289156626506,
"grad_norm": 1.6961935758590698,
"learning_rate": 7.165358037237644e-06,
"loss": 0.3024,
"step": 89
},
{
"epoch": 1.0843373493975903,
"grad_norm": 1.9473631381988525,
"learning_rate": 7.10832566725092e-06,
"loss": 0.3262,
"step": 90
},
{
"epoch": 1.0963855421686748,
"grad_norm": 1.5019605159759521,
"learning_rate": 7.0509576887376375e-06,
"loss": 0.23,
"step": 91
},
{
"epoch": 1.108433734939759,
"grad_norm": 1.7088998556137085,
"learning_rate": 6.99326323367538e-06,
"loss": 0.2511,
"step": 92
},
{
"epoch": 1.1204819277108433,
"grad_norm": 2.8957417011260986,
"learning_rate": 6.9352514860110876e-06,
"loss": 0.3191,
"step": 93
},
{
"epoch": 1.1325301204819278,
"grad_norm": 1.71742844581604,
"learning_rate": 6.876931680199121e-06,
"loss": 0.2792,
"step": 94
},
{
"epoch": 1.144578313253012,
"grad_norm": 1.615378975868225,
"learning_rate": 6.818313099731308e-06,
"loss": 0.2653,
"step": 95
},
{
"epoch": 1.1566265060240963,
"grad_norm": 1.4427539110183716,
"learning_rate": 6.759405075659165e-06,
"loss": 0.2909,
"step": 96
},
{
"epoch": 1.1686746987951806,
"grad_norm": 1.1839165687561035,
"learning_rate": 6.700216985108568e-06,
"loss": 0.1959,
"step": 97
},
{
"epoch": 1.180722891566265,
"grad_norm": 1.7143460512161255,
"learning_rate": 6.640758249787067e-06,
"loss": 0.2841,
"step": 98
},
{
"epoch": 1.1927710843373494,
"grad_norm": 1.3873624801635742,
"learning_rate": 6.58103833448412e-06,
"loss": 0.2838,
"step": 99
},
{
"epoch": 1.2048192771084336,
"grad_norm": 1.8592312335968018,
"learning_rate": 6.521066745564467e-06,
"loss": 0.2963,
"step": 100
},
{
"epoch": 1.216867469879518,
"grad_norm": 1.608494758605957,
"learning_rate": 6.460853029454879e-06,
"loss": 0.2877,
"step": 101
},
{
"epoch": 1.2289156626506024,
"grad_norm": 1.8831335306167603,
"learning_rate": 6.4004067711245366e-06,
"loss": 0.3066,
"step": 102
},
{
"epoch": 1.2409638554216866,
"grad_norm": 1.743905782699585,
"learning_rate": 6.3397375925592675e-06,
"loss": 0.3099,
"step": 103
},
{
"epoch": 1.2530120481927711,
"grad_norm": 1.8759677410125732,
"learning_rate": 6.2788551512299014e-06,
"loss": 0.2914,
"step": 104
},
{
"epoch": 1.2650602409638554,
"grad_norm": 1.7082366943359375,
"learning_rate": 6.2177691385549595e-06,
"loss": 0.2931,
"step": 105
},
{
"epoch": 1.2771084337349397,
"grad_norm": 1.519975185394287,
"learning_rate": 6.156489278357967e-06,
"loss": 0.2499,
"step": 106
},
{
"epoch": 1.2891566265060241,
"grad_norm": 1.8293309211730957,
"learning_rate": 6.0950253253195656e-06,
"loss": 0.3611,
"step": 107
},
{
"epoch": 1.3012048192771084,
"grad_norm": 1.728571891784668,
"learning_rate": 6.033387063424765e-06,
"loss": 0.3017,
"step": 108
},
{
"epoch": 1.3132530120481927,
"grad_norm": 1.6766902208328247,
"learning_rate": 5.971584304405489e-06,
"loss": 0.2823,
"step": 109
},
{
"epoch": 1.3253012048192772,
"grad_norm": 1.7143419981002808,
"learning_rate": 5.909626886178721e-06,
"loss": 0.2307,
"step": 110
},
{
"epoch": 1.3373493975903614,
"grad_norm": 1.5373152494430542,
"learning_rate": 5.8475246712804845e-06,
"loss": 0.2963,
"step": 111
},
{
"epoch": 1.3493975903614457,
"grad_norm": 1.8781455755233765,
"learning_rate": 5.785287545295895e-06,
"loss": 0.2874,
"step": 112
},
{
"epoch": 1.3614457831325302,
"grad_norm": 1.824504017829895,
"learning_rate": 5.722925415285555e-06,
"loss": 0.2454,
"step": 113
},
{
"epoch": 1.3734939759036144,
"grad_norm": 1.7806376218795776,
"learning_rate": 5.660448208208513e-06,
"loss": 0.3654,
"step": 114
},
{
"epoch": 1.3855421686746987,
"grad_norm": 1.5633933544158936,
"learning_rate": 5.597865869342075e-06,
"loss": 0.2931,
"step": 115
},
{
"epoch": 1.3975903614457832,
"grad_norm": 1.8875840902328491,
"learning_rate": 5.535188360698687e-06,
"loss": 0.331,
"step": 116
},
{
"epoch": 1.4096385542168675,
"grad_norm": 1.404435634613037,
"learning_rate": 5.472425659440157e-06,
"loss": 0.246,
"step": 117
},
{
"epoch": 1.4216867469879517,
"grad_norm": 1.4050829410552979,
"learning_rate": 5.409587756289462e-06,
"loss": 0.2689,
"step": 118
},
{
"epoch": 1.4337349397590362,
"grad_norm": 1.5876859426498413,
"learning_rate": 5.346684653940408e-06,
"loss": 0.2645,
"step": 119
},
{
"epoch": 1.4457831325301205,
"grad_norm": 1.6692218780517578,
"learning_rate": 5.2837263654653715e-06,
"loss": 0.3155,
"step": 120
},
{
"epoch": 1.4578313253012047,
"grad_norm": 1.2533305883407593,
"learning_rate": 5.2207229127213866e-06,
"loss": 0.2112,
"step": 121
},
{
"epoch": 1.4698795180722892,
"grad_norm": 1.5980626344680786,
"learning_rate": 5.157684324754858e-06,
"loss": 0.2441,
"step": 122
},
{
"epoch": 1.4819277108433735,
"grad_norm": 1.6085745096206665,
"learning_rate": 5.094620636205096e-06,
"loss": 0.3087,
"step": 123
},
{
"epoch": 1.4939759036144578,
"grad_norm": 1.7097792625427246,
"learning_rate": 5.031541885706987e-06,
"loss": 0.2499,
"step": 124
},
{
"epoch": 1.5060240963855422,
"grad_norm": 1.4703900814056396,
"learning_rate": 4.9684581142930135e-06,
"loss": 0.2413,
"step": 125
},
{
"epoch": 1.5180722891566265,
"grad_norm": 2.3154144287109375,
"learning_rate": 4.905379363794907e-06,
"loss": 0.3701,
"step": 126
},
{
"epoch": 1.5301204819277108,
"grad_norm": 1.665852427482605,
"learning_rate": 4.842315675245144e-06,
"loss": 0.2791,
"step": 127
},
{
"epoch": 1.5421686746987953,
"grad_norm": 1.7872849702835083,
"learning_rate": 4.779277087278615e-06,
"loss": 0.3303,
"step": 128
},
{
"epoch": 1.5542168674698795,
"grad_norm": 1.4255069494247437,
"learning_rate": 4.71627363453463e-06,
"loss": 0.2462,
"step": 129
},
{
"epoch": 1.5662650602409638,
"grad_norm": 1.8723397254943848,
"learning_rate": 4.653315346059592e-06,
"loss": 0.3083,
"step": 130
},
{
"epoch": 1.5783132530120483,
"grad_norm": 1.6238393783569336,
"learning_rate": 4.5904122437105384e-06,
"loss": 0.2947,
"step": 131
},
{
"epoch": 1.5903614457831325,
"grad_norm": 1.5982369184494019,
"learning_rate": 4.527574340559844e-06,
"loss": 0.3114,
"step": 132
},
{
"epoch": 1.6024096385542168,
"grad_norm": 1.7584006786346436,
"learning_rate": 4.464811639301314e-06,
"loss": 0.3335,
"step": 133
},
{
"epoch": 1.6144578313253013,
"grad_norm": 1.7169082164764404,
"learning_rate": 4.402134130657925e-06,
"loss": 0.2783,
"step": 134
},
{
"epoch": 1.6265060240963856,
"grad_norm": 1.6119632720947266,
"learning_rate": 4.33955179179149e-06,
"loss": 0.252,
"step": 135
},
{
"epoch": 1.6385542168674698,
"grad_norm": 1.5756961107254028,
"learning_rate": 4.277074584714447e-06,
"loss": 0.2825,
"step": 136
},
{
"epoch": 1.6506024096385543,
"grad_norm": 1.511651873588562,
"learning_rate": 4.214712454704107e-06,
"loss": 0.2479,
"step": 137
},
{
"epoch": 1.6626506024096386,
"grad_norm": 1.354615330696106,
"learning_rate": 4.152475328719517e-06,
"loss": 0.2192,
"step": 138
},
{
"epoch": 1.6746987951807228,
"grad_norm": 1.821956753730774,
"learning_rate": 4.090373113821281e-06,
"loss": 0.2735,
"step": 139
},
{
"epoch": 1.6867469879518073,
"grad_norm": 1.4524273872375488,
"learning_rate": 4.028415695594512e-06,
"loss": 0.2222,
"step": 140
},
{
"epoch": 1.6987951807228916,
"grad_norm": 1.6997952461242676,
"learning_rate": 3.966612936575235e-06,
"loss": 0.2841,
"step": 141
},
{
"epoch": 1.7108433734939759,
"grad_norm": 1.5502634048461914,
"learning_rate": 3.904974674680436e-06,
"loss": 0.281,
"step": 142
},
{
"epoch": 1.7228915662650603,
"grad_norm": 1.6944836378097534,
"learning_rate": 3.843510721642036e-06,
"loss": 0.19,
"step": 143
},
{
"epoch": 1.7349397590361446,
"grad_norm": 1.958292007446289,
"learning_rate": 3.782230861445041e-06,
"loss": 0.3143,
"step": 144
},
{
"epoch": 1.7469879518072289,
"grad_norm": 1.9379884004592896,
"learning_rate": 3.7211448487701002e-06,
"loss": 0.2964,
"step": 145
},
{
"epoch": 1.7590361445783134,
"grad_norm": 1.6362128257751465,
"learning_rate": 3.6602624074407354e-06,
"loss": 0.2749,
"step": 146
},
{
"epoch": 1.7710843373493976,
"grad_norm": 1.740090250968933,
"learning_rate": 3.5995932288754655e-06,
"loss": 0.2572,
"step": 147
},
{
"epoch": 1.783132530120482,
"grad_norm": 1.3941646814346313,
"learning_rate": 3.539146970545124e-06,
"loss": 0.2476,
"step": 148
},
{
"epoch": 1.7951807228915664,
"grad_norm": 1.6419267654418945,
"learning_rate": 3.478933254435534e-06,
"loss": 0.2902,
"step": 149
},
{
"epoch": 1.8072289156626506,
"grad_norm": 1.825861930847168,
"learning_rate": 3.4189616655158803e-06,
"loss": 0.3345,
"step": 150
},
{
"epoch": 1.819277108433735,
"grad_norm": 1.749080777168274,
"learning_rate": 3.359241750212934e-06,
"loss": 0.314,
"step": 151
},
{
"epoch": 1.8313253012048194,
"grad_norm": 1.2390449047088623,
"learning_rate": 3.2997830148914316e-06,
"loss": 0.214,
"step": 152
},
{
"epoch": 1.8433734939759037,
"grad_norm": 1.6753946542739868,
"learning_rate": 3.240594924340835e-06,
"loss": 0.2988,
"step": 153
},
{
"epoch": 1.855421686746988,
"grad_norm": 1.6091383695602417,
"learning_rate": 3.181686900268694e-06,
"loss": 0.2481,
"step": 154
},
{
"epoch": 1.8674698795180724,
"grad_norm": 1.438892126083374,
"learning_rate": 3.1230683198008817e-06,
"loss": 0.2702,
"step": 155
},
{
"epoch": 1.8795180722891565,
"grad_norm": 1.931443691253662,
"learning_rate": 3.0647485139889145e-06,
"loss": 0.2957,
"step": 156
},
{
"epoch": 1.891566265060241,
"grad_norm": 1.5713204145431519,
"learning_rate": 3.006736766324623e-06,
"loss": 0.2815,
"step": 157
},
{
"epoch": 1.9036144578313254,
"grad_norm": 1.5962169170379639,
"learning_rate": 2.9490423112623646e-06,
"loss": 0.2791,
"step": 158
},
{
"epoch": 1.9156626506024095,
"grad_norm": 2.0020360946655273,
"learning_rate": 2.89167433274908e-06,
"loss": 0.3897,
"step": 159
},
{
"epoch": 1.927710843373494,
"grad_norm": 1.6599327325820923,
"learning_rate": 2.834641962762358e-06,
"loss": 0.2742,
"step": 160
},
{
"epoch": 1.9397590361445785,
"grad_norm": 1.6006088256835938,
"learning_rate": 2.7779542798567804e-06,
"loss": 0.2678,
"step": 161
},
{
"epoch": 1.9518072289156625,
"grad_norm": 1.5215158462524414,
"learning_rate": 2.721620307718793e-06,
"loss": 0.3035,
"step": 162
},
{
"epoch": 1.963855421686747,
"grad_norm": 1.8756093978881836,
"learning_rate": 2.66564901373027e-06,
"loss": 0.3407,
"step": 163
},
{
"epoch": 1.9759036144578315,
"grad_norm": 1.5014938116073608,
"learning_rate": 2.610049307541085e-06,
"loss": 0.2533,
"step": 164
},
{
"epoch": 1.9879518072289155,
"grad_norm": 1.6140003204345703,
"learning_rate": 2.554830039650834e-06,
"loss": 0.2369,
"step": 165
},
{
"epoch": 2.0,
"grad_norm": 1.5059895515441895,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.1612,
"step": 166
},
{
"epoch": 2.0120481927710845,
"grad_norm": 1.2760642766952515,
"learning_rate": 2.4455679165707473e-06,
"loss": 0.1247,
"step": 167
},
{
"epoch": 2.0240963855421685,
"grad_norm": 1.3720568418502808,
"learning_rate": 2.391542453997578e-06,
"loss": 0.1618,
"step": 168
},
{
"epoch": 2.036144578313253,
"grad_norm": 1.4044466018676758,
"learning_rate": 2.337932212188073e-06,
"loss": 0.1427,
"step": 169
},
{
"epoch": 2.0481927710843375,
"grad_norm": 1.2212741374969482,
"learning_rate": 2.284745724953939e-06,
"loss": 0.1587,
"step": 170
},
{
"epoch": 2.0602409638554215,
"grad_norm": 1.1166741847991943,
"learning_rate": 2.2319914586525776e-06,
"loss": 0.1169,
"step": 171
},
{
"epoch": 2.072289156626506,
"grad_norm": 1.2007352113723755,
"learning_rate": 2.1796778108393824e-06,
"loss": 0.1232,
"step": 172
},
{
"epoch": 2.0843373493975905,
"grad_norm": 1.4228880405426025,
"learning_rate": 2.127813108931007e-06,
"loss": 0.1646,
"step": 173
},
{
"epoch": 2.0963855421686746,
"grad_norm": 1.2214866876602173,
"learning_rate": 2.0764056088797646e-06,
"loss": 0.1058,
"step": 174
},
{
"epoch": 2.108433734939759,
"grad_norm": 1.8072195053100586,
"learning_rate": 2.0254634938594555e-06,
"loss": 0.1579,
"step": 175
},
{
"epoch": 2.1204819277108435,
"grad_norm": 1.872309684753418,
"learning_rate": 1.9749948729627188e-06,
"loss": 0.138,
"step": 176
},
{
"epoch": 2.1325301204819276,
"grad_norm": 1.8318668603897095,
"learning_rate": 1.9250077799102323e-06,
"loss": 0.1331,
"step": 177
},
{
"epoch": 2.144578313253012,
"grad_norm": 2.1385316848754883,
"learning_rate": 1.875510171771865e-06,
"loss": 0.1635,
"step": 178
},
{
"epoch": 2.1566265060240966,
"grad_norm": 1.719831943511963,
"learning_rate": 1.8265099277000614e-06,
"loss": 0.1561,
"step": 179
},
{
"epoch": 2.1686746987951806,
"grad_norm": 1.7940328121185303,
"learning_rate": 1.7780148476756148e-06,
"loss": 0.14,
"step": 180
},
{
"epoch": 2.180722891566265,
"grad_norm": 1.3721911907196045,
"learning_rate": 1.7300326512660542e-06,
"loss": 0.1233,
"step": 181
},
{
"epoch": 2.1927710843373496,
"grad_norm": 1.2797173261642456,
"learning_rate": 1.6825709763968112e-06,
"loss": 0.0936,
"step": 182
},
{
"epoch": 2.2048192771084336,
"grad_norm": 1.5839323997497559,
"learning_rate": 1.6356373781354058e-06,
"loss": 0.1648,
"step": 183
},
{
"epoch": 2.216867469879518,
"grad_norm": 1.3700120449066162,
"learning_rate": 1.589239327488812e-06,
"loss": 0.126,
"step": 184
},
{
"epoch": 2.2289156626506026,
"grad_norm": 1.5171151161193848,
"learning_rate": 1.543384210214196e-06,
"loss": 0.1212,
"step": 185
},
{
"epoch": 2.2409638554216866,
"grad_norm": 1.6373289823532104,
"learning_rate": 1.4980793256432474e-06,
"loss": 0.1509,
"step": 186
},
{
"epoch": 2.253012048192771,
"grad_norm": 1.30360746383667,
"learning_rate": 1.453331885520234e-06,
"loss": 0.12,
"step": 187
},
{
"epoch": 2.2650602409638556,
"grad_norm": 1.395431399345398,
"learning_rate": 1.4091490128540374e-06,
"loss": 0.1406,
"step": 188
},
{
"epoch": 2.2771084337349397,
"grad_norm": 1.3656375408172607,
"learning_rate": 1.3655377407842813e-06,
"loss": 0.1706,
"step": 189
},
{
"epoch": 2.289156626506024,
"grad_norm": 1.189477562904358,
"learning_rate": 1.32250501146179e-06,
"loss": 0.1243,
"step": 190
},
{
"epoch": 2.3012048192771086,
"grad_norm": 1.273803949356079,
"learning_rate": 1.2800576749435068e-06,
"loss": 0.1132,
"step": 191
},
{
"epoch": 2.3132530120481927,
"grad_norm": 1.249987244606018,
"learning_rate": 1.2382024881020937e-06,
"loss": 0.133,
"step": 192
},
{
"epoch": 2.325301204819277,
"grad_norm": 1.2117363214492798,
"learning_rate": 1.1969461135503573e-06,
"loss": 0.1153,
"step": 193
},
{
"epoch": 2.337349397590361,
"grad_norm": 1.115524172782898,
"learning_rate": 1.1562951185806675e-06,
"loss": 0.1068,
"step": 194
},
{
"epoch": 2.3493975903614457,
"grad_norm": 1.2410939931869507,
"learning_rate": 1.1162559741195733e-06,
"loss": 0.0926,
"step": 195
},
{
"epoch": 2.36144578313253,
"grad_norm": 1.0989357233047485,
"learning_rate": 1.076835053697728e-06,
"loss": 0.1147,
"step": 196
},
{
"epoch": 2.3734939759036147,
"grad_norm": 1.2773900032043457,
"learning_rate": 1.0380386324353508e-06,
"loss": 0.131,
"step": 197
},
{
"epoch": 2.3855421686746987,
"grad_norm": 1.2643158435821533,
"learning_rate": 9.998728860433277e-07,
"loss": 0.1377,
"step": 198
},
{
"epoch": 2.397590361445783,
"grad_norm": 1.3157423734664917,
"learning_rate": 9.62343889840151e-07,
"loss": 0.127,
"step": 199
},
{
"epoch": 2.4096385542168672,
"grad_norm": 1.1823986768722534,
"learning_rate": 9.254576177848313e-07,
"loss": 0.1039,
"step": 200
},
{
"epoch": 2.4216867469879517,
"grad_norm": 1.2062366008758545,
"learning_rate": 8.892199415259501e-07,
"loss": 0.1137,
"step": 201
},
{
"epoch": 2.433734939759036,
"grad_norm": 1.358426570892334,
"learning_rate": 8.536366294669979e-07,
"loss": 0.1188,
"step": 202
},
{
"epoch": 2.4457831325301207,
"grad_norm": 1.4414290189743042,
"learning_rate": 8.187133458481416e-07,
"loss": 0.1393,
"step": 203
},
{
"epoch": 2.4578313253012047,
"grad_norm": 1.2111995220184326,
"learning_rate": 7.844556498445788e-07,
"loss": 0.1088,
"step": 204
},
{
"epoch": 2.4698795180722892,
"grad_norm": 1.0797122716903687,
"learning_rate": 7.508689946816128e-07,
"loss": 0.1012,
"step": 205
},
{
"epoch": 2.4819277108433733,
"grad_norm": 1.2910206317901611,
"learning_rate": 7.179587267665999e-07,
"loss": 0.1283,
"step": 206
},
{
"epoch": 2.4939759036144578,
"grad_norm": 1.76227867603302,
"learning_rate": 6.857300848378857e-07,
"loss": 0.1773,
"step": 207
},
{
"epoch": 2.5060240963855422,
"grad_norm": 1.2892178297042847,
"learning_rate": 6.541881991309013e-07,
"loss": 0.1003,
"step": 208
},
{
"epoch": 2.5180722891566267,
"grad_norm": 1.2142372131347656,
"learning_rate": 6.233380905615049e-07,
"loss": 0.1059,
"step": 209
},
{
"epoch": 2.5301204819277108,
"grad_norm": 1.3028932809829712,
"learning_rate": 5.931846699267558e-07,
"loss": 0.0997,
"step": 210
},
{
"epoch": 2.5421686746987953,
"grad_norm": 1.2703701257705688,
"learning_rate": 5.637327371231921e-07,
"loss": 0.1074,
"step": 211
},
{
"epoch": 2.5542168674698793,
"grad_norm": 1.6055101156234741,
"learning_rate": 5.349869803827717e-07,
"loss": 0.1635,
"step": 212
},
{
"epoch": 2.566265060240964,
"grad_norm": 1.2764710187911987,
"learning_rate": 5.0695197552659e-07,
"loss": 0.1394,
"step": 213
},
{
"epoch": 2.5783132530120483,
"grad_norm": 1.3518632650375366,
"learning_rate": 4.796321852364877e-07,
"loss": 0.1363,
"step": 214
},
{
"epoch": 2.5903614457831328,
"grad_norm": 1.3571412563323975,
"learning_rate": 4.5303195834467463e-07,
"loss": 0.1326,
"step": 215
},
{
"epoch": 2.602409638554217,
"grad_norm": 1.4019449949264526,
"learning_rate": 4.271555291414636e-07,
"loss": 0.1222,
"step": 216
},
{
"epoch": 2.6144578313253013,
"grad_norm": 1.184061050415039,
"learning_rate": 4.020070167012541e-07,
"loss": 0.0845,
"step": 217
},
{
"epoch": 2.6265060240963853,
"grad_norm": 1.5907222032546997,
"learning_rate": 3.775904242268391e-07,
"loss": 0.1353,
"step": 218
},
{
"epoch": 2.63855421686747,
"grad_norm": 1.3479151725769043,
"learning_rate": 3.539096384121743e-07,
"loss": 0.1445,
"step": 219
},
{
"epoch": 2.6506024096385543,
"grad_norm": 1.36601722240448,
"learning_rate": 3.309684288236775e-07,
"loss": 0.1386,
"step": 220
},
{
"epoch": 2.662650602409639,
"grad_norm": 1.6582006216049194,
"learning_rate": 3.0877044730018515e-07,
"loss": 0.1237,
"step": 221
},
{
"epoch": 2.674698795180723,
"grad_norm": 1.505927324295044,
"learning_rate": 2.873192273716369e-07,
"loss": 0.153,
"step": 222
},
{
"epoch": 2.6867469879518073,
"grad_norm": 1.2739795446395874,
"learning_rate": 2.666181836966053e-07,
"loss": 0.1038,
"step": 223
},
{
"epoch": 2.6987951807228914,
"grad_norm": 1.3373569250106812,
"learning_rate": 2.466706115187406e-07,
"loss": 0.1208,
"step": 224
},
{
"epoch": 2.710843373493976,
"grad_norm": 1.3513188362121582,
"learning_rate": 2.274796861422246e-07,
"loss": 0.1209,
"step": 225
},
{
"epoch": 2.7228915662650603,
"grad_norm": 1.4020378589630127,
"learning_rate": 2.090484624263167e-07,
"loss": 0.1323,
"step": 226
},
{
"epoch": 2.734939759036145,
"grad_norm": 1.4146372079849243,
"learning_rate": 1.9137987429907635e-07,
"loss": 0.1304,
"step": 227
},
{
"epoch": 2.746987951807229,
"grad_norm": 1.3225347995758057,
"learning_rate": 1.7447673429033361e-07,
"loss": 0.1149,
"step": 228
},
{
"epoch": 2.7590361445783134,
"grad_norm": 1.3890403509140015,
"learning_rate": 1.583417330839798e-07,
"loss": 0.1557,
"step": 229
},
{
"epoch": 2.7710843373493974,
"grad_norm": 1.466339349746704,
"learning_rate": 1.4297743908966212e-07,
"loss": 0.1489,
"step": 230
},
{
"epoch": 2.783132530120482,
"grad_norm": 1.2367849349975586,
"learning_rate": 1.2838629803393343e-07,
"loss": 0.0997,
"step": 231
},
{
"epoch": 2.7951807228915664,
"grad_norm": 1.390717625617981,
"learning_rate": 1.1457063257093892e-07,
"loss": 0.1218,
"step": 232
},
{
"epoch": 2.807228915662651,
"grad_norm": 1.2239187955856323,
"learning_rate": 1.0153264191269052e-07,
"loss": 0.1135,
"step": 233
},
{
"epoch": 2.819277108433735,
"grad_norm": 1.0472311973571777,
"learning_rate": 8.927440147898703e-08,
"loss": 0.1065,
"step": 234
},
{
"epoch": 2.8313253012048194,
"grad_norm": 1.2322300672531128,
"learning_rate": 7.779786256704669e-08,
"loss": 0.1016,
"step": 235
},
{
"epoch": 2.8433734939759034,
"grad_norm": 1.3203635215759277,
"learning_rate": 6.710485204089456e-08,
"loss": 0.1239,
"step": 236
},
{
"epoch": 2.855421686746988,
"grad_norm": 1.2621276378631592,
"learning_rate": 5.7197072040557356e-08,
"loss": 0.1358,
"step": 237
},
{
"epoch": 2.8674698795180724,
"grad_norm": 1.3743759393692017,
"learning_rate": 4.807609971111238e-08,
"loss": 0.1337,
"step": 238
},
{
"epoch": 2.8795180722891565,
"grad_norm": 1.0450865030288696,
"learning_rate": 3.974338695163393e-08,
"loss": 0.0945,
"step": 239
},
{
"epoch": 2.891566265060241,
"grad_norm": 1.5270490646362305,
"learning_rate": 3.220026018407541e-08,
"loss": 0.0994,
"step": 240
},
{
"epoch": 2.9036144578313254,
"grad_norm": 1.694990873336792,
"learning_rate": 2.5447920142128712e-08,
"loss": 0.1689,
"step": 241
},
{
"epoch": 2.9156626506024095,
"grad_norm": 1.4968199729919434,
"learning_rate": 1.9487441680084983e-08,
"loss": 0.1219,
"step": 242
},
{
"epoch": 2.927710843373494,
"grad_norm": 1.332356572151184,
"learning_rate": 1.431977360173975e-08,
"loss": 0.1137,
"step": 243
},
{
"epoch": 2.9397590361445785,
"grad_norm": 1.4295134544372559,
"learning_rate": 9.945738509358205e-09,
"loss": 0.1498,
"step": 244
},
{
"epoch": 2.9518072289156625,
"grad_norm": 1.1830252408981323,
"learning_rate": 6.366032672731059e-09,
"loss": 0.1002,
"step": 245
},
{
"epoch": 2.963855421686747,
"grad_norm": 1.3982295989990234,
"learning_rate": 3.5812259183426457e-09,
"loss": 0.1247,
"step": 246
},
{
"epoch": 2.9759036144578315,
"grad_norm": 1.467788577079773,
"learning_rate": 1.591761538662362e-09,
"loss": 0.1098,
"step": 247
},
{
"epoch": 2.9879518072289155,
"grad_norm": 1.2710858583450317,
"learning_rate": 3.9795622158111945e-10,
"loss": 0.1335,
"step": 248
},
{
"epoch": 3.0,
"grad_norm": 0.9258020520210266,
"learning_rate": 0.0,
"loss": 0.0612,
"step": 249
},
{
"epoch": 3.0,
"step": 249,
"total_flos": 3.4614492313052774e+17,
"train_loss": 0.32225324711706266,
"train_runtime": 433.6826,
"train_samples_per_second": 4.545,
"train_steps_per_second": 0.574
}
],
"logging_steps": 1,
"max_steps": 249,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.4614492313052774e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}