{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.989090909090909,
"eval_steps": 500,
"global_step": 255,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.019393939393939394,
"grad_norm": 90.48059844970703,
"learning_rate": 0.0,
"loss": 2.5757,
"step": 1
},
{
"epoch": 0.03878787878787879,
"grad_norm": 92.12358093261719,
"learning_rate": 2.5e-05,
"loss": 2.5426,
"step": 2
},
{
"epoch": 0.05818181818181818,
"grad_norm": 38.16162109375,
"learning_rate": 5e-05,
"loss": 1.1907,
"step": 3
},
{
"epoch": 0.07757575757575758,
"grad_norm": 35.746028900146484,
"learning_rate": 7.500000000000001e-05,
"loss": 0.936,
"step": 4
},
{
"epoch": 0.09696969696969697,
"grad_norm": 31.175495147705078,
"learning_rate": 0.0001,
"loss": 1.557,
"step": 5
},
{
"epoch": 0.11636363636363636,
"grad_norm": 97.79412078857422,
"learning_rate": 0.000125,
"loss": 2.0208,
"step": 6
},
{
"epoch": 0.13575757575757577,
"grad_norm": 96.80253601074219,
"learning_rate": 0.00015000000000000001,
"loss": 1.925,
"step": 7
},
{
"epoch": 0.15515151515151515,
"grad_norm": 211.5525665283203,
"learning_rate": 0.000175,
"loss": 1.3849,
"step": 8
},
{
"epoch": 0.17454545454545456,
"grad_norm": 28.884292602539062,
"learning_rate": 0.0002,
"loss": 1.1013,
"step": 9
},
{
"epoch": 0.19393939393939394,
"grad_norm": 77.68536376953125,
"learning_rate": 0.0001999919114627769,
"loss": 2.9445,
"step": 10
},
{
"epoch": 0.21333333333333335,
"grad_norm": 55.35911178588867,
"learning_rate": 0.00019996764715959618,
"loss": 1.4681,
"step": 11
},
{
"epoch": 0.23272727272727273,
"grad_norm": 42.826019287109375,
"learning_rate": 0.00019992721101571236,
"loss": 1.7236,
"step": 12
},
{
"epoch": 0.25212121212121213,
"grad_norm": 25.61245346069336,
"learning_rate": 0.00019987060957251047,
"loss": 1.4904,
"step": 13
},
{
"epoch": 0.27151515151515154,
"grad_norm": 33.56877136230469,
"learning_rate": 0.00019979785198644806,
"loss": 1.4225,
"step": 14
},
{
"epoch": 0.2909090909090909,
"grad_norm": 26.03700065612793,
"learning_rate": 0.00019970895002757413,
"loss": 1.0714,
"step": 15
},
{
"epoch": 0.3103030303030303,
"grad_norm": 3.605429172515869,
"learning_rate": 0.00019960391807762463,
"loss": 0.8201,
"step": 16
},
{
"epoch": 0.3296969696969697,
"grad_norm": 27.68207359313965,
"learning_rate": 0.0001994827731276963,
"loss": 0.8841,
"step": 17
},
{
"epoch": 0.3490909090909091,
"grad_norm": 33.42802047729492,
"learning_rate": 0.00019934553477549794,
"loss": 0.9123,
"step": 18
},
{
"epoch": 0.36848484848484847,
"grad_norm": 3.847541093826294,
"learning_rate": 0.00019919222522217996,
"loss": 0.7937,
"step": 19
},
{
"epoch": 0.3878787878787879,
"grad_norm": 190.63131713867188,
"learning_rate": 0.0001990228692687429,
"loss": 2.0077,
"step": 20
},
{
"epoch": 0.4072727272727273,
"grad_norm": 20.3385066986084,
"learning_rate": 0.0001988374943120254,
"loss": 0.8601,
"step": 21
},
{
"epoch": 0.4266666666666667,
"grad_norm": 1.9217650890350342,
"learning_rate": 0.00019863613034027224,
"loss": 0.7884,
"step": 22
},
{
"epoch": 0.44606060606060605,
"grad_norm": 620.0447998046875,
"learning_rate": 0.00019841880992828306,
"loss": 1.9,
"step": 23
},
{
"epoch": 0.46545454545454545,
"grad_norm": 34.09405517578125,
"learning_rate": 0.00019818556823214268,
"loss": 2.6336,
"step": 24
},
{
"epoch": 0.48484848484848486,
"grad_norm": 160.58441162109375,
"learning_rate": 0.0001979364429835339,
"loss": 2.5285,
"step": 25
},
{
"epoch": 0.5042424242424243,
"grad_norm": 38.78010940551758,
"learning_rate": 0.00019767147448363366,
"loss": 2.2263,
"step": 26
},
{
"epoch": 0.5236363636363637,
"grad_norm": 13.957600593566895,
"learning_rate": 0.00019739070559659347,
"loss": 1.563,
"step": 27
},
{
"epoch": 0.5430303030303031,
"grad_norm": 51.03530502319336,
"learning_rate": 0.0001970941817426052,
"loss": 1.4006,
"step": 28
},
{
"epoch": 0.5624242424242424,
"grad_norm": 15.889177322387695,
"learning_rate": 0.00019678195089055346,
"loss": 1.1385,
"step": 29
},
{
"epoch": 0.5818181818181818,
"grad_norm": 9.053852081298828,
"learning_rate": 0.00019645406355025565,
"loss": 0.8776,
"step": 30
},
{
"epoch": 0.6012121212121212,
"grad_norm": 12.983526229858398,
"learning_rate": 0.00019611057276429085,
"loss": 0.8769,
"step": 31
},
{
"epoch": 0.6206060606060606,
"grad_norm": 4.714831352233887,
"learning_rate": 0.0001957515340994193,
"loss": 0.8055,
"step": 32
},
{
"epoch": 0.64,
"grad_norm": 5.783924102783203,
"learning_rate": 0.00019537700563759304,
"loss": 0.7976,
"step": 33
},
{
"epoch": 0.6593939393939394,
"grad_norm": 3.041721820831299,
"learning_rate": 0.00019498704796656018,
"loss": 0.7942,
"step": 34
},
{
"epoch": 0.6787878787878788,
"grad_norm": 4.065732955932617,
"learning_rate": 0.00019458172417006347,
"loss": 0.7701,
"step": 35
},
{
"epoch": 0.6981818181818182,
"grad_norm": 4.388843059539795,
"learning_rate": 0.00019416109981763526,
"loss": 0.7835,
"step": 36
},
{
"epoch": 0.7175757575757575,
"grad_norm": 3.284567356109619,
"learning_rate": 0.00019372524295399013,
"loss": 0.7665,
"step": 37
},
{
"epoch": 0.7369696969696969,
"grad_norm": 2.197493553161621,
"learning_rate": 0.00019327422408801744,
"loss": 0.7675,
"step": 38
},
{
"epoch": 0.7563636363636363,
"grad_norm": 230.88824462890625,
"learning_rate": 0.00019280811618137484,
"loss": 0.7732,
"step": 39
},
{
"epoch": 0.7757575757575758,
"grad_norm": 4.613775253295898,
"learning_rate": 0.00019232699463668542,
"loss": 0.7681,
"step": 40
},
{
"epoch": 0.7951515151515152,
"grad_norm": 3.3040153980255127,
"learning_rate": 0.00019183093728533966,
"loss": 0.7729,
"step": 41
},
{
"epoch": 0.8145454545454546,
"grad_norm": 3.864863872528076,
"learning_rate": 0.00019132002437490458,
"loss": 0.7712,
"step": 42
},
{
"epoch": 0.833939393939394,
"grad_norm": 3.212442636489868,
"learning_rate": 0.00019079433855614201,
"loss": 0.7461,
"step": 43
},
{
"epoch": 0.8533333333333334,
"grad_norm": 3.442129611968994,
"learning_rate": 0.00019025396486963827,
"loss": 0.7566,
"step": 44
},
{
"epoch": 0.8727272727272727,
"grad_norm": 3.749457359313965,
"learning_rate": 0.00018969899073204686,
"loss": 0.7522,
"step": 45
},
{
"epoch": 0.8921212121212121,
"grad_norm": 1.6893056631088257,
"learning_rate": 0.0001891295059219472,
"loss": 0.7671,
"step": 46
},
{
"epoch": 0.9115151515151515,
"grad_norm": 2.853459119796753,
"learning_rate": 0.000188545602565321,
"loss": 0.7381,
"step": 47
},
{
"epoch": 0.9309090909090909,
"grad_norm": 1.8483091592788696,
"learning_rate": 0.0001879473751206489,
"loss": 0.7374,
"step": 48
},
{
"epoch": 0.9503030303030303,
"grad_norm": 2.6860077381134033,
"learning_rate": 0.00018733492036363005,
"loss": 0.7537,
"step": 49
},
{
"epoch": 0.9696969696969697,
"grad_norm": 2.142801284790039,
"learning_rate": 0.0001867083373715264,
"loss": 0.7547,
"step": 50
},
{
"epoch": 0.9890909090909091,
"grad_norm": 3.1881351470947266,
"learning_rate": 0.00018606772750713504,
"loss": 0.7617,
"step": 51
},
{
"epoch": 1.0193939393939393,
"grad_norm": 3.6325433254241943,
"learning_rate": 0.00018541319440239066,
"loss": 1.0347,
"step": 52
},
{
"epoch": 1.0387878787878788,
"grad_norm": 2.459991455078125,
"learning_rate": 0.0001847448439416009,
"loss": 0.7477,
"step": 53
},
{
"epoch": 1.0581818181818181,
"grad_norm": 2.6495184898376465,
"learning_rate": 0.00018406278424431736,
"loss": 0.7358,
"step": 54
},
{
"epoch": 1.0775757575757576,
"grad_norm": 1.890399694442749,
"learning_rate": 0.00018336712564784503,
"loss": 0.7449,
"step": 55
},
{
"epoch": 1.096969696969697,
"grad_norm": 2.979245662689209,
"learning_rate": 0.00018265798068939294,
"loss": 0.7396,
"step": 56
},
{
"epoch": 1.1163636363636364,
"grad_norm": 3.2054319381713867,
"learning_rate": 0.00018193546408786898,
"loss": 0.7479,
"step": 57
},
{
"epoch": 1.1357575757575757,
"grad_norm": 2.0698201656341553,
"learning_rate": 0.00018119969272532166,
"loss": 0.7433,
"step": 58
},
{
"epoch": 1.1551515151515153,
"grad_norm": 2.5635318756103516,
"learning_rate": 0.00018045078562803203,
"loss": 0.741,
"step": 59
},
{
"epoch": 1.1745454545454546,
"grad_norm": 2.7970149517059326,
"learning_rate": 0.00017968886394725874,
"loss": 0.7366,
"step": 60
},
{
"epoch": 1.1939393939393939,
"grad_norm": 2.812278985977173,
"learning_rate": 0.00017891405093963938,
"loss": 0.7463,
"step": 61
},
{
"epoch": 1.2133333333333334,
"grad_norm": 1.9571610689163208,
"learning_rate": 0.00017812647194725094,
"loss": 0.7193,
"step": 62
},
{
"epoch": 1.2327272727272727,
"grad_norm": 2.2157516479492188,
"learning_rate": 0.00017732625437733335,
"loss": 0.7389,
"step": 63
},
{
"epoch": 1.2521212121212122,
"grad_norm": 2.3364241123199463,
"learning_rate": 0.0001765135276816787,
"loss": 0.7223,
"step": 64
},
{
"epoch": 1.2715151515151515,
"grad_norm": 2.172851324081421,
"learning_rate": 0.00017568842333568952,
"loss": 0.7343,
"step": 65
},
{
"epoch": 1.290909090909091,
"grad_norm": 2.2759182453155518,
"learning_rate": 0.00017485107481711012,
"loss": 0.7384,
"step": 66
},
{
"epoch": 1.3103030303030303,
"grad_norm": 2.428541898727417,
"learning_rate": 0.00017400161758443375,
"loss": 0.7298,
"step": 67
},
{
"epoch": 1.3296969696969696,
"grad_norm": 2.293830394744873,
"learning_rate": 0.00017314018905498931,
"loss": 0.742,
"step": 68
},
{
"epoch": 1.3490909090909091,
"grad_norm": 2.105808973312378,
"learning_rate": 0.00017226692858271134,
"loss": 0.7261,
"step": 69
},
{
"epoch": 1.3684848484848484,
"grad_norm": 2.2897746562957764,
"learning_rate": 0.00017138197743559654,
"loss": 0.7318,
"step": 70
},
{
"epoch": 1.387878787878788,
"grad_norm": 1.595840573310852,
"learning_rate": 0.00017048547877285077,
"loss": 0.7307,
"step": 71
},
{
"epoch": 1.4072727272727272,
"grad_norm": 1.8382622003555298,
"learning_rate": 0.0001695775776217301,
"loss": 0.7365,
"step": 72
},
{
"epoch": 1.4266666666666667,
"grad_norm": 2.425427198410034,
"learning_rate": 0.0001686584208540797,
"loss": 0.7382,
"step": 73
},
{
"epoch": 1.446060606060606,
"grad_norm": 1.973556399345398,
"learning_rate": 0.00016772815716257412,
"loss": 0.7261,
"step": 74
},
{
"epoch": 1.4654545454545453,
"grad_norm": 2.711993455886841,
"learning_rate": 0.00016678693703666325,
"loss": 0.721,
"step": 75
},
{
"epoch": 1.4848484848484849,
"grad_norm": 1.8177202939987183,
"learning_rate": 0.00016583491273822765,
"loss": 0.729,
"step": 76
},
{
"epoch": 1.5042424242424244,
"grad_norm": 2.3554399013519287,
"learning_rate": 0.00016487223827694672,
"loss": 0.7232,
"step": 77
},
{
"epoch": 1.5236363636363637,
"grad_norm": 1.8325488567352295,
"learning_rate": 0.0001638990693853848,
"loss": 0.7369,
"step": 78
},
{
"epoch": 1.543030303030303,
"grad_norm": 2.512336254119873,
"learning_rate": 0.00016291556349379795,
"loss": 0.7337,
"step": 79
},
{
"epoch": 1.5624242424242425,
"grad_norm": 2.4152209758758545,
"learning_rate": 0.00016192187970466644,
"loss": 0.719,
"step": 80
},
{
"epoch": 1.5818181818181818,
"grad_norm": 2.310332775115967,
"learning_rate": 0.00016091817876695655,
"loss": 0.7165,
"step": 81
},
{
"epoch": 1.601212121212121,
"grad_norm": 2.328439712524414,
"learning_rate": 0.0001599046230501163,
"loss": 0.728,
"step": 82
},
{
"epoch": 1.6206060606060606,
"grad_norm": 1.824378490447998,
"learning_rate": 0.00015888137651780845,
"loss": 0.7347,
"step": 83
},
{
"epoch": 1.6400000000000001,
"grad_norm": 2.719447612762451,
"learning_rate": 0.00015784860470138633,
"loss": 0.738,
"step": 84
},
{
"epoch": 1.6593939393939394,
"grad_norm": 1.9578591585159302,
"learning_rate": 0.00015680647467311557,
"loss": 0.7195,
"step": 85
},
{
"epoch": 1.6787878787878787,
"grad_norm": 1.436728835105896,
"learning_rate": 0.00015575515501914668,
"loss": 0.7351,
"step": 86
},
{
"epoch": 1.6981818181818182,
"grad_norm": 1.9513907432556152,
"learning_rate": 0.00015469481581224272,
"loss": 0.7404,
"step": 87
},
{
"epoch": 1.7175757575757575,
"grad_norm": 2.2551791667938232,
"learning_rate": 0.00015362562858426654,
"loss": 0.7307,
"step": 88
},
{
"epoch": 1.7369696969696968,
"grad_norm": 2.2636687755584717,
"learning_rate": 0.00015254776629843205,
"loss": 0.7192,
"step": 89
},
{
"epoch": 1.7563636363636363,
"grad_norm": 1.4845975637435913,
"learning_rate": 0.00015146140332132358,
"loss": 0.7191,
"step": 90
},
{
"epoch": 1.7757575757575759,
"grad_norm": 2.420228958129883,
"learning_rate": 0.00015036671539468878,
"loss": 0.7348,
"step": 91
},
{
"epoch": 1.7951515151515152,
"grad_norm": 1.9322905540466309,
"learning_rate": 0.00014926387960700842,
"loss": 0.7333,
"step": 92
},
{
"epoch": 1.8145454545454545,
"grad_norm": 2.3328347206115723,
"learning_rate": 0.00014815307436484898,
"loss": 0.7116,
"step": 93
},
{
"epoch": 1.833939393939394,
"grad_norm": 2.307257890701294,
"learning_rate": 0.00014703447936400134,
"loss": 0.729,
"step": 94
},
{
"epoch": 1.8533333333333335,
"grad_norm": 1.4340693950653076,
"learning_rate": 0.00014590827556041158,
"loss": 0.7175,
"step": 95
},
{
"epoch": 1.8727272727272726,
"grad_norm": 2.0661492347717285,
"learning_rate": 0.00014477464514090743,
"loss": 0.7324,
"step": 96
},
{
"epoch": 1.892121212121212,
"grad_norm": 2.7764408588409424,
"learning_rate": 0.00014363377149372584,
"loss": 0.7337,
"step": 97
},
{
"epoch": 1.9115151515151516,
"grad_norm": 2.023960590362549,
"learning_rate": 0.00014248583917884594,
"loss": 0.7137,
"step": 98
},
{
"epoch": 1.930909090909091,
"grad_norm": 1.200355887413025,
"learning_rate": 0.00014133103389813302,
"loss": 0.7194,
"step": 99
},
{
"epoch": 1.9503030303030302,
"grad_norm": 2.1049160957336426,
"learning_rate": 0.00014016954246529696,
"loss": 0.7221,
"step": 100
},
{
"epoch": 1.9696969696969697,
"grad_norm": 2.351930618286133,
"learning_rate": 0.00013900155277567157,
"loss": 0.7288,
"step": 101
},
{
"epoch": 1.9890909090909092,
"grad_norm": 1.6542885303497314,
"learning_rate": 0.00013782725377581848,
"loss": 0.715,
"step": 102
},
{
"epoch": 2.0193939393939395,
"grad_norm": 1.6560240983963013,
"learning_rate": 0.00013664683543296112,
"loss": 0.9868,
"step": 103
},
{
"epoch": 2.0387878787878786,
"grad_norm": 2.0962977409362793,
"learning_rate": 0.00013546048870425356,
"loss": 0.7302,
"step": 104
},
{
"epoch": 2.058181818181818,
"grad_norm": 2.4153010845184326,
"learning_rate": 0.00013426840550588933,
"loss": 0.7068,
"step": 105
},
{
"epoch": 2.0775757575757576,
"grad_norm": 2.4013113975524902,
"learning_rate": 0.00013307077868205487,
"loss": 0.7231,
"step": 106
},
{
"epoch": 2.096969696969697,
"grad_norm": 2.382469415664673,
"learning_rate": 0.00013186780197373306,
"loss": 0.7241,
"step": 107
},
{
"epoch": 2.1163636363636362,
"grad_norm": 3.5327625274658203,
"learning_rate": 0.00013065966998736155,
"loss": 0.7179,
"step": 108
},
{
"epoch": 2.1357575757575757,
"grad_norm": 1.7616554498672485,
"learning_rate": 0.00012944657816335123,
"loss": 0.7262,
"step": 109
},
{
"epoch": 2.1551515151515153,
"grad_norm": 1.7108697891235352,
"learning_rate": 0.00012822872274446958,
"loss": 0.7349,
"step": 110
},
{
"epoch": 2.174545454545455,
"grad_norm": 1.8982540369033813,
"learning_rate": 0.00012700630074409427,
"loss": 0.7282,
"step": 111
},
{
"epoch": 2.193939393939394,
"grad_norm": 3.6565070152282715,
"learning_rate": 0.00012577950991434248,
"loss": 0.7313,
"step": 112
},
{
"epoch": 2.2133333333333334,
"grad_norm": 2.186326503753662,
"learning_rate": 0.00012454854871407994,
"loss": 0.7187,
"step": 113
},
{
"epoch": 2.232727272727273,
"grad_norm": 2.0525989532470703,
"learning_rate": 0.00012331361627681645,
"loss": 0.724,
"step": 114
},
{
"epoch": 2.252121212121212,
"grad_norm": 3.1588134765625,
"learning_rate": 0.00012207491237849172,
"loss": 0.7215,
"step": 115
},
{
"epoch": 2.2715151515151515,
"grad_norm": 1.7911540269851685,
"learning_rate": 0.00012083263740515765,
"loss": 0.7301,
"step": 116
},
{
"epoch": 2.290909090909091,
"grad_norm": 2.4920434951782227,
"learning_rate": 0.00011958699232056134,
"loss": 0.7066,
"step": 117
},
{
"epoch": 2.3103030303030305,
"grad_norm": 1.8157334327697754,
"learning_rate": 0.00011833817863363564,
"loss": 0.7261,
"step": 118
},
{
"epoch": 2.3296969696969696,
"grad_norm": 1.7181109189987183,
"learning_rate": 0.00011708639836590023,
"loss": 0.7234,
"step": 119
},
{
"epoch": 2.349090909090909,
"grad_norm": 2.088304281234741,
"learning_rate": 0.00011583185401878101,
"loss": 0.7205,
"step": 120
},
{
"epoch": 2.3684848484848486,
"grad_norm": 1.6647237539291382,
"learning_rate": 0.00011457474854085096,
"loss": 0.727,
"step": 121
},
{
"epoch": 2.3878787878787877,
"grad_norm": 2.0125958919525146,
"learning_rate": 0.00011331528529499909,
"loss": 0.7249,
"step": 122
},
{
"epoch": 2.4072727272727272,
"grad_norm": 1.881591558456421,
"learning_rate": 0.0001120536680255323,
"loss": 0.7254,
"step": 123
},
{
"epoch": 2.4266666666666667,
"grad_norm": 1.6693066358566284,
"learning_rate": 0.00011079010082521557,
"loss": 0.717,
"step": 124
},
{
"epoch": 2.4460606060606063,
"grad_norm": 1.1339530944824219,
"learning_rate": 0.00010952478810225548,
"loss": 0.7147,
"step": 125
},
{
"epoch": 2.4654545454545453,
"grad_norm": 1.7603578567504883,
"learning_rate": 0.00010825793454723325,
"loss": 0.7245,
"step": 126
},
{
"epoch": 2.484848484848485,
"grad_norm": 2.23610782623291,
"learning_rate": 0.00010698974509999158,
"loss": 0.72,
"step": 127
},
{
"epoch": 2.5042424242424244,
"grad_norm": 1.5710601806640625,
"learning_rate": 0.00010572042491648149,
"loss": 0.7204,
"step": 128
},
{
"epoch": 2.5236363636363635,
"grad_norm": 1.6866320371627808,
"learning_rate": 0.00010445017933557404,
"loss": 0.7185,
"step": 129
},
{
"epoch": 2.543030303030303,
"grad_norm": 1.427932620048523,
"learning_rate": 0.00010317921384584244,
"loss": 0.7193,
"step": 130
},
{
"epoch": 2.5624242424242425,
"grad_norm": 1.7711869478225708,
"learning_rate": 0.00010190773405232024,
"loss": 0.7167,
"step": 131
},
{
"epoch": 2.581818181818182,
"grad_norm": 1.4821707010269165,
"learning_rate": 0.00010063594564324012,
"loss": 0.7128,
"step": 132
},
{
"epoch": 2.601212121212121,
"grad_norm": 1.8503328561782837,
"learning_rate": 9.93640543567599e-05,
"loss": 0.7241,
"step": 133
},
{
"epoch": 2.6206060606060606,
"grad_norm": 1.7489912509918213,
"learning_rate": 9.809226594767978e-05,
"loss": 0.7292,
"step": 134
},
{
"epoch": 2.64,
"grad_norm": 1.7079439163208008,
"learning_rate": 9.682078615415754e-05,
"loss": 0.7082,
"step": 135
},
{
"epoch": 2.659393939393939,
"grad_norm": 1.3097881078720093,
"learning_rate": 9.5549820664426e-05,
"loss": 0.7175,
"step": 136
},
{
"epoch": 2.6787878787878787,
"grad_norm": 2.135385751724243,
"learning_rate": 9.427957508351852e-05,
"loss": 0.7176,
"step": 137
},
{
"epoch": 2.6981818181818182,
"grad_norm": 2.1517341136932373,
"learning_rate": 9.301025490000841e-05,
"loss": 0.7236,
"step": 138
},
{
"epoch": 2.7175757575757578,
"grad_norm": 1.3631223440170288,
"learning_rate": 9.174206545276677e-05,
"loss": 0.7108,
"step": 139
},
{
"epoch": 2.736969696969697,
"grad_norm": 1.5596174001693726,
"learning_rate": 9.047521189774455e-05,
"loss": 0.7097,
"step": 140
},
{
"epoch": 2.7563636363636363,
"grad_norm": 1.211077094078064,
"learning_rate": 8.920989917478447e-05,
"loss": 0.7159,
"step": 141
},
{
"epoch": 2.775757575757576,
"grad_norm": 1.354500651359558,
"learning_rate": 8.79463319744677e-05,
"loss": 0.7232,
"step": 142
},
{
"epoch": 2.795151515151515,
"grad_norm": 1.613215684890747,
"learning_rate": 8.668471470500095e-05,
"loss": 0.7187,
"step": 143
},
{
"epoch": 2.8145454545454545,
"grad_norm": 1.8118970394134521,
"learning_rate": 8.542525145914905e-05,
"loss": 0.7066,
"step": 144
},
{
"epoch": 2.833939393939394,
"grad_norm": 1.9452157020568848,
"learning_rate": 8.4168145981219e-05,
"loss": 0.7297,
"step": 145
},
{
"epoch": 2.8533333333333335,
"grad_norm": 1.0430099964141846,
"learning_rate": 8.291360163409978e-05,
"loss": 0.7136,
"step": 146
},
{
"epoch": 2.8727272727272726,
"grad_norm": 1.285246729850769,
"learning_rate": 8.16618213663644e-05,
"loss": 0.7157,
"step": 147
},
{
"epoch": 2.892121212121212,
"grad_norm": 1.107146143913269,
"learning_rate": 8.041300767943867e-05,
"loss": 0.7142,
"step": 148
},
{
"epoch": 2.9115151515151516,
"grad_norm": 1.6460542678833008,
"learning_rate": 7.916736259484239e-05,
"loss": 0.7286,
"step": 149
},
{
"epoch": 2.9309090909090907,
"grad_norm": 1.8070957660675049,
"learning_rate": 7.792508762150833e-05,
"loss": 0.7149,
"step": 150
},
{
"epoch": 2.95030303030303,
"grad_norm": 1.2644233703613281,
"learning_rate": 7.668638372318359e-05,
"loss": 0.7255,
"step": 151
},
{
"epoch": 2.9696969696969697,
"grad_norm": 1.6092453002929688,
"learning_rate": 7.54514512859201e-05,
"loss": 0.7104,
"step": 152
},
{
"epoch": 2.9890909090909092,
"grad_norm": 1.6419603824615479,
"learning_rate": 7.422049008565757e-05,
"loss": 0.711,
"step": 153
},
{
"epoch": 3.0193939393939395,
"grad_norm": 1.1830960512161255,
"learning_rate": 7.299369925590574e-05,
"loss": 0.9748,
"step": 154
},
{
"epoch": 3.0387878787878786,
"grad_norm": 1.928208589553833,
"learning_rate": 7.177127725553045e-05,
"loss": 0.7168,
"step": 155
},
{
"epoch": 3.058181818181818,
"grad_norm": 1.4579167366027832,
"learning_rate": 7.05534218366488e-05,
"loss": 0.7039,
"step": 156
},
{
"epoch": 3.0775757575757576,
"grad_norm": 1.7855753898620605,
"learning_rate": 6.934033001263847e-05,
"loss": 0.7204,
"step": 157
},
{
"epoch": 3.096969696969697,
"grad_norm": 1.458349585533142,
"learning_rate": 6.813219802626698e-05,
"loss": 0.7155,
"step": 158
},
{
"epoch": 3.1163636363636362,
"grad_norm": 1.2767351865768433,
"learning_rate": 6.692922131794517e-05,
"loss": 0.7161,
"step": 159
},
{
"epoch": 3.1357575757575757,
"grad_norm": 1.5871436595916748,
"learning_rate": 6.57315944941107e-05,
"loss": 0.7234,
"step": 160
},
{
"epoch": 3.1551515151515153,
"grad_norm": 1.7959891557693481,
"learning_rate": 6.453951129574644e-05,
"loss": 0.7117,
"step": 161
},
{
"epoch": 3.174545454545455,
"grad_norm": 1.654347538948059,
"learning_rate": 6.33531645670389e-05,
"loss": 0.7221,
"step": 162
},
{
"epoch": 3.193939393939394,
"grad_norm": 1.482458233833313,
"learning_rate": 6.217274622418153e-05,
"loss": 0.716,
"step": 163
},
{
"epoch": 3.2133333333333334,
"grad_norm": 1.868960976600647,
"learning_rate": 6.099844722432843e-05,
"loss": 0.7116,
"step": 164
},
{
"epoch": 3.232727272727273,
"grad_norm": 1.4825832843780518,
"learning_rate": 5.983045753470308e-05,
"loss": 0.7143,
"step": 165
},
{
"epoch": 3.252121212121212,
"grad_norm": 0.9575663805007935,
"learning_rate": 5.8668966101867005e-05,
"loss": 0.714,
"step": 166
},
{
"epoch": 3.2715151515151515,
"grad_norm": 1.4929826259613037,
"learning_rate": 5.751416082115408e-05,
"loss": 0.7123,
"step": 167
},
{
"epoch": 3.290909090909091,
"grad_norm": 1.765995740890503,
"learning_rate": 5.63662285062742e-05,
"loss": 0.7167,
"step": 168
},
{
"epoch": 3.3103030303030305,
"grad_norm": 0.9527196884155273,
"learning_rate": 5.522535485909257e-05,
"loss": 0.7117,
"step": 169
},
{
"epoch": 3.3296969696969696,
"grad_norm": 1.6173697710037231,
"learning_rate": 5.409172443958843e-05,
"loss": 0.7113,
"step": 170
},
{
"epoch": 3.349090909090909,
"grad_norm": 1.3506577014923096,
"learning_rate": 5.296552063599868e-05,
"loss": 0.7107,
"step": 171
},
{
"epoch": 3.3684848484848486,
"grad_norm": 1.3540844917297363,
"learning_rate": 5.1846925635151045e-05,
"loss": 0.7034,
"step": 172
},
{
"epoch": 3.3878787878787877,
"grad_norm": 1.2271199226379395,
"learning_rate": 5.073612039299157e-05,
"loss": 0.6967,
"step": 173
},
{
"epoch": 3.4072727272727272,
"grad_norm": 1.2509523630142212,
"learning_rate": 4.963328460531127e-05,
"loss": 0.7158,
"step": 174
},
{
"epoch": 3.4266666666666667,
"grad_norm": 1.36748206615448,
"learning_rate": 4.8538596678676406e-05,
"loss": 0.698,
"step": 175
},
{
"epoch": 3.4460606060606063,
"grad_norm": 2.118276834487915,
"learning_rate": 4.745223370156797e-05,
"loss": 0.713,
"step": 176
},
{
"epoch": 3.4654545454545453,
"grad_norm": 1.510697603225708,
"learning_rate": 4.6374371415733496e-05,
"loss": 0.7173,
"step": 177
},
{
"epoch": 3.484848484848485,
"grad_norm": 1.0324652194976807,
"learning_rate": 4.530518418775733e-05,
"loss": 0.7103,
"step": 178
},
{
"epoch": 3.5042424242424244,
"grad_norm": 1.728299617767334,
"learning_rate": 4.424484498085335e-05,
"loss": 0.7201,
"step": 179
},
{
"epoch": 3.5236363636363635,
"grad_norm": 1.0601152181625366,
"learning_rate": 4.3193525326884435e-05,
"loss": 0.7023,
"step": 180
},
{
"epoch": 3.543030303030303,
"grad_norm": 1.0438584089279175,
"learning_rate": 4.215139529861367e-05,
"loss": 0.7076,
"step": 181
},
{
"epoch": 3.5624242424242425,
"grad_norm": 1.3601257801055908,
"learning_rate": 4.111862348219158e-05,
"loss": 0.7116,
"step": 182
},
{
"epoch": 3.581818181818182,
"grad_norm": 1.2380729913711548,
"learning_rate": 4.009537694988372e-05,
"loss": 0.7123,
"step": 183
},
{
"epoch": 3.601212121212121,
"grad_norm": 1.5108550786972046,
"learning_rate": 3.9081821233043436e-05,
"loss": 0.723,
"step": 184
},
{
"epoch": 3.6206060606060606,
"grad_norm": 1.560524821281433,
"learning_rate": 3.807812029533362e-05,
"loss": 0.6999,
"step": 185
},
{
"epoch": 3.64,
"grad_norm": 1.6480681896209717,
"learning_rate": 3.708443650620206e-05,
"loss": 0.718,
"step": 186
},
{
"epoch": 3.659393939393939,
"grad_norm": 1.3738715648651123,
"learning_rate": 3.6100930614615205e-05,
"loss": 0.6958,
"step": 187
},
{
"epoch": 3.6787878787878787,
"grad_norm": 1.5863178968429565,
"learning_rate": 3.512776172305331e-05,
"loss": 0.707,
"step": 188
},
{
"epoch": 3.6981818181818182,
"grad_norm": 1.5687789916992188,
"learning_rate": 3.41650872617724e-05,
"loss": 0.6985,
"step": 189
},
{
"epoch": 3.7175757575757578,
"grad_norm": 2.139619827270508,
"learning_rate": 3.321306296333673e-05,
"loss": 0.7014,
"step": 190
},
{
"epoch": 3.736969696969697,
"grad_norm": 1.9615211486816406,
"learning_rate": 3.227184283742591e-05,
"loss": 0.7005,
"step": 191
},
{
"epoch": 3.7563636363636363,
"grad_norm": 1.5449055433273315,
"learning_rate": 3.134157914592032e-05,
"loss": 0.7198,
"step": 192
},
{
"epoch": 3.775757575757576,
"grad_norm": 1.0495221614837646,
"learning_rate": 3.042242237826991e-05,
"loss": 0.705,
"step": 193
},
{
"epoch": 3.795151515151515,
"grad_norm": 1.448522925376892,
"learning_rate": 2.951452122714926e-05,
"loss": 0.7047,
"step": 194
},
{
"epoch": 3.8145454545454545,
"grad_norm": 1.6612343788146973,
"learning_rate": 2.861802256440348e-05,
"loss": 0.706,
"step": 195
},
{
"epoch": 3.833939393939394,
"grad_norm": 1.2114323377609253,
"learning_rate": 2.773307141728867e-05,
"loss": 0.7036,
"step": 196
},
{
"epoch": 3.8533333333333335,
"grad_norm": 1.5746084451675415,
"learning_rate": 2.685981094501069e-05,
"loss": 0.7004,
"step": 197
},
{
"epoch": 3.8727272727272726,
"grad_norm": 1.3881734609603882,
"learning_rate": 2.599838241556626e-05,
"loss": 0.7165,
"step": 198
},
{
"epoch": 3.892121212121212,
"grad_norm": 1.5346940755844116,
"learning_rate": 2.514892518288988e-05,
"loss": 0.6957,
"step": 199
},
{
"epoch": 3.9115151515151516,
"grad_norm": 1.664461612701416,
"learning_rate": 2.431157666431052e-05,
"loss": 0.6943,
"step": 200
},
{
"epoch": 3.9309090909090907,
"grad_norm": 2.282188653945923,
"learning_rate": 2.3486472318321307e-05,
"loss": 0.7005,
"step": 201
},
{
"epoch": 3.95030303030303,
"grad_norm": 1.741380214691162,
"learning_rate": 2.267374562266662e-05,
"loss": 0.7015,
"step": 202
},
{
"epoch": 3.9696969696969697,
"grad_norm": 0.8635579943656921,
"learning_rate": 2.1873528052749092e-05,
"loss": 0.6991,
"step": 203
},
{
"epoch": 3.9890909090909092,
"grad_norm": 1.5674488544464111,
"learning_rate": 2.1085949060360654e-05,
"loss": 0.6957,
"step": 204
},
{
"epoch": 4.0193939393939395,
"grad_norm": 1.245754599571228,
"learning_rate": 2.0311136052741277e-05,
"loss": 0.9636,
"step": 205
},
{
"epoch": 4.038787878787879,
"grad_norm": 1.461623191833496,
"learning_rate": 1.9549214371968004e-05,
"loss": 0.6993,
"step": 206
},
{
"epoch": 4.058181818181819,
"grad_norm": 1.416317105293274,
"learning_rate": 1.8800307274678364e-05,
"loss": 0.7026,
"step": 207
},
{
"epoch": 4.077575757575757,
"grad_norm": 1.5567563772201538,
"learning_rate": 1.806453591213103e-05,
"loss": 0.7,
"step": 208
},
{
"epoch": 4.096969696969697,
"grad_norm": 1.345225214958191,
"learning_rate": 1.734201931060706e-05,
"loss": 0.6913,
"step": 209
},
{
"epoch": 4.116363636363636,
"grad_norm": 1.3206875324249268,
"learning_rate": 1.663287435215498e-05,
"loss": 0.6944,
"step": 210
},
{
"epoch": 4.135757575757576,
"grad_norm": 2.078009843826294,
"learning_rate": 1.5937215755682665e-05,
"loss": 0.7074,
"step": 211
},
{
"epoch": 4.155151515151515,
"grad_norm": 1.833019733428955,
"learning_rate": 1.5255156058399122e-05,
"loss": 0.7026,
"step": 212
},
{
"epoch": 4.174545454545455,
"grad_norm": 1.4473344087600708,
"learning_rate": 1.4586805597609331e-05,
"loss": 0.6917,
"step": 213
},
{
"epoch": 4.193939393939394,
"grad_norm": 1.2081327438354492,
"learning_rate": 1.3932272492864984e-05,
"loss": 0.6926,
"step": 214
},
{
"epoch": 4.213333333333333,
"grad_norm": 1.1429706811904907,
"learning_rate": 1.3291662628473633e-05,
"loss": 0.6991,
"step": 215
},
{
"epoch": 4.2327272727272724,
"grad_norm": 1.9675796031951904,
"learning_rate": 1.2665079636369969e-05,
"loss": 0.7054,
"step": 216
},
{
"epoch": 4.252121212121212,
"grad_norm": 1.7647342681884766,
"learning_rate": 1.2052624879351104e-05,
"loss": 0.6975,
"step": 217
},
{
"epoch": 4.2715151515151515,
"grad_norm": 1.6249873638153076,
"learning_rate": 1.1454397434679021e-05,
"loss": 0.6979,
"step": 218
},
{
"epoch": 4.290909090909091,
"grad_norm": 1.4342982769012451,
"learning_rate": 1.0870494078052796e-05,
"loss": 0.7147,
"step": 219
},
{
"epoch": 4.3103030303030305,
"grad_norm": 2.3979902267456055,
"learning_rate": 1.0301009267953143e-05,
"loss": 0.6939,
"step": 220
},
{
"epoch": 4.32969696969697,
"grad_norm": 1.076079249382019,
"learning_rate": 9.746035130361742e-06,
"loss": 0.6899,
"step": 221
},
{
"epoch": 4.34909090909091,
"grad_norm": 1.58867347240448,
"learning_rate": 9.205661443857994e-06,
"loss": 0.6902,
"step": 222
},
{
"epoch": 4.368484848484848,
"grad_norm": 1.9176396131515503,
"learning_rate": 8.67997562509546e-06,
"loss": 0.689,
"step": 223
},
{
"epoch": 4.387878787878788,
"grad_norm": 1.5462384223937988,
"learning_rate": 8.169062714660346e-06,
"loss": 0.7039,
"step": 224
},
{
"epoch": 4.407272727272727,
"grad_norm": 1.3088514804840088,
"learning_rate": 7.673005363314579e-06,
"loss": 0.6991,
"step": 225
},
{
"epoch": 4.426666666666667,
"grad_norm": 1.4000589847564697,
"learning_rate": 7.191883818625189e-06,
"loss": 0.7088,
"step": 226
},
{
"epoch": 4.446060606060606,
"grad_norm": 0.8691507577896118,
"learning_rate": 6.725775911982601e-06,
"loss": 0.6854,
"step": 227
},
{
"epoch": 4.465454545454546,
"grad_norm": 1.6848564147949219,
"learning_rate": 6.274757046009871e-06,
"loss": 0.6967,
"step": 228
},
{
"epoch": 4.484848484848484,
"grad_norm": 1.418238639831543,
"learning_rate": 5.83890018236476e-06,
"loss": 0.6972,
"step": 229
},
{
"epoch": 4.504242424242424,
"grad_norm": 1.3531463146209717,
"learning_rate": 5.418275829936537e-06,
"loss": 0.6951,
"step": 230
},
{
"epoch": 4.5236363636363635,
"grad_norm": 1.5666790008544922,
"learning_rate": 5.012952033439844e-06,
"loss": 0.6938,
"step": 231
},
{
"epoch": 4.543030303030303,
"grad_norm": 1.2016419172286987,
"learning_rate": 4.622994362406996e-06,
"loss": 0.6994,
"step": 232
},
{
"epoch": 4.5624242424242425,
"grad_norm": 1.440388560295105,
"learning_rate": 4.248465900580734e-06,
"loss": 0.6948,
"step": 233
},
{
"epoch": 4.581818181818182,
"grad_norm": 1.5158016681671143,
"learning_rate": 3.889427235709153e-06,
"loss": 0.6848,
"step": 234
},
{
"epoch": 4.6012121212121215,
"grad_norm": 1.3135181665420532,
"learning_rate": 3.5459364497443694e-06,
"loss": 0.7015,
"step": 235
},
{
"epoch": 4.620606060606061,
"grad_norm": 1.0853769779205322,
"learning_rate": 3.2180491094465415e-06,
"loss": 0.7022,
"step": 236
},
{
"epoch": 4.64,
"grad_norm": 1.0479052066802979,
"learning_rate": 2.905818257394799e-06,
"loss": 0.6974,
"step": 237
},
{
"epoch": 4.659393939393939,
"grad_norm": 1.171940803527832,
"learning_rate": 2.609294403406537e-06,
"loss": 0.691,
"step": 238
},
{
"epoch": 4.678787878787879,
"grad_norm": 2.0728201866149902,
"learning_rate": 2.3285255163663532e-06,
"loss": 0.7053,
"step": 239
},
{
"epoch": 4.698181818181818,
"grad_norm": 1.5811423063278198,
"learning_rate": 2.063557016466111e-06,
"loss": 0.7003,
"step": 240
},
{
"epoch": 4.717575757575758,
"grad_norm": 1.1456055641174316,
"learning_rate": 1.8144317678573497e-06,
"loss": 0.7007,
"step": 241
},
{
"epoch": 4.736969696969697,
"grad_norm": 1.3572897911071777,
"learning_rate": 1.5811900717169538e-06,
"loss": 0.6815,
"step": 242
},
{
"epoch": 4.756363636363636,
"grad_norm": 1.8911006450653076,
"learning_rate": 1.3638696597277679e-06,
"loss": 0.6883,
"step": 243
},
{
"epoch": 4.775757575757575,
"grad_norm": 1.2662522792816162,
"learning_rate": 1.1625056879746133e-06,
"loss": 0.6864,
"step": 244
},
{
"epoch": 4.795151515151515,
"grad_norm": 1.2779842615127563,
"learning_rate": 9.771307312571254e-07,
"loss": 0.6906,
"step": 245
},
{
"epoch": 4.8145454545454545,
"grad_norm": 1.303374171257019,
"learning_rate": 8.077747778200473e-07,
"loss": 0.694,
"step": 246
},
{
"epoch": 4.833939393939394,
"grad_norm": 1.74978506565094,
"learning_rate": 6.544652245020433e-07,
"loss": 0.7097,
"step": 247
},
{
"epoch": 4.8533333333333335,
"grad_norm": 1.2199087142944336,
"learning_rate": 5.172268723036999e-07,
"loss": 0.6931,
"step": 248
},
{
"epoch": 4.872727272727273,
"grad_norm": 1.7115343809127808,
"learning_rate": 3.96081922375402e-07,
"loss": 0.6899,
"step": 249
},
{
"epoch": 4.8921212121212125,
"grad_norm": 1.1172815561294556,
"learning_rate": 2.9104997242590527e-07,
"loss": 0.6867,
"step": 250
},
{
"epoch": 4.911515151515151,
"grad_norm": 1.2331411838531494,
"learning_rate": 2.0214801355192824e-07,
"loss": 0.6999,
"step": 251
},
{
"epoch": 4.930909090909091,
"grad_norm": 1.2371054887771606,
"learning_rate": 1.2939042748955077e-07,
"loss": 0.6921,
"step": 252
},
{
"epoch": 4.95030303030303,
"grad_norm": 1.7776347398757935,
"learning_rate": 7.27889842876417e-08,
"loss": 0.6836,
"step": 253
},
{
"epoch": 4.96969696969697,
"grad_norm": 1.3677096366882324,
"learning_rate": 3.2352840403804264e-08,
"loss": 0.6996,
"step": 254
},
{
"epoch": 4.989090909090909,
"grad_norm": 2.0456924438476562,
"learning_rate": 8.088537223116532e-09,
"loss": 0.7067,
"step": 255
},
{
"epoch": 4.989090909090909,
"step": 255,
"total_flos": 8.698530825739698e+18,
"train_loss": 0.8222096034124786,
"train_runtime": 33944.2014,
"train_samples_per_second": 0.972,
"train_steps_per_second": 0.008
}
],
"logging_steps": 1.0,
"max_steps": 255,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.698530825739698e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}