{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0526315789473686,
"eval_steps": 500,
"global_step": 174,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017543859649122806,
"grad_norm": 0.2277653039057954,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.0079,
"step": 1
},
{
"epoch": 0.03508771929824561,
"grad_norm": 0.22929541131469036,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.0155,
"step": 2
},
{
"epoch": 0.05263157894736842,
"grad_norm": 0.23320532182877252,
"learning_rate": 5e-06,
"loss": 1.042,
"step": 3
},
{
"epoch": 0.07017543859649122,
"grad_norm": 0.2327235097386226,
"learning_rate": 6.666666666666667e-06,
"loss": 1.0147,
"step": 4
},
{
"epoch": 0.08771929824561403,
"grad_norm": 0.22899683750189437,
"learning_rate": 8.333333333333334e-06,
"loss": 1.0155,
"step": 5
},
{
"epoch": 0.10526315789473684,
"grad_norm": 0.2346984759899663,
"learning_rate": 1e-05,
"loss": 1.0471,
"step": 6
},
{
"epoch": 0.12280701754385964,
"grad_norm": 0.22126116958454167,
"learning_rate": 1.1666666666666668e-05,
"loss": 1.0295,
"step": 7
},
{
"epoch": 0.14035087719298245,
"grad_norm": 0.21457628974392648,
"learning_rate": 1.3333333333333333e-05,
"loss": 1.0319,
"step": 8
},
{
"epoch": 0.15789473684210525,
"grad_norm": 0.21253345072366,
"learning_rate": 1.5e-05,
"loss": 1.0166,
"step": 9
},
{
"epoch": 0.17543859649122806,
"grad_norm": 0.21642169088434604,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.0331,
"step": 10
},
{
"epoch": 0.19298245614035087,
"grad_norm": 0.18849235769492945,
"learning_rate": 1.8333333333333333e-05,
"loss": 1.0142,
"step": 11
},
{
"epoch": 0.21052631578947367,
"grad_norm": 0.17642654464303906,
"learning_rate": 2e-05,
"loss": 0.9902,
"step": 12
},
{
"epoch": 0.22807017543859648,
"grad_norm": 0.17187933882719988,
"learning_rate": 2.1666666666666667e-05,
"loss": 1.017,
"step": 13
},
{
"epoch": 0.24561403508771928,
"grad_norm": 0.17103598555992858,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.9751,
"step": 14
},
{
"epoch": 0.2631578947368421,
"grad_norm": 0.16014487415950107,
"learning_rate": 2.5e-05,
"loss": 0.9881,
"step": 15
},
{
"epoch": 0.2807017543859649,
"grad_norm": 0.14028695923022452,
"learning_rate": 2.4998640395219987e-05,
"loss": 0.9778,
"step": 16
},
{
"epoch": 0.2982456140350877,
"grad_norm": 0.12551729140438972,
"learning_rate": 2.499456187664396e-05,
"loss": 0.9689,
"step": 17
},
{
"epoch": 0.3157894736842105,
"grad_norm": 0.1251340971956454,
"learning_rate": 2.4987765331499672e-05,
"loss": 0.9429,
"step": 18
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.1296210035785423,
"learning_rate": 2.497825223828555e-05,
"loss": 0.946,
"step": 19
},
{
"epoch": 0.3508771929824561,
"grad_norm": 0.11329484685623345,
"learning_rate": 2.4966024666449125e-05,
"loss": 0.9366,
"step": 20
},
{
"epoch": 0.3684210526315789,
"grad_norm": 0.10321338855040195,
"learning_rate": 2.495108527593681e-05,
"loss": 0.9259,
"step": 21
},
{
"epoch": 0.38596491228070173,
"grad_norm": 0.09404432805330766,
"learning_rate": 2.493343731661529e-05,
"loss": 0.9482,
"step": 22
},
{
"epoch": 0.40350877192982454,
"grad_norm": 0.09243083734470846,
"learning_rate": 2.4913084627564535e-05,
"loss": 0.9065,
"step": 23
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.08906226913721381,
"learning_rate": 2.4890031636242685e-05,
"loss": 0.8938,
"step": 24
},
{
"epoch": 0.43859649122807015,
"grad_norm": 0.08389891645958811,
"learning_rate": 2.486428335752288e-05,
"loss": 0.916,
"step": 25
},
{
"epoch": 0.45614035087719296,
"grad_norm": 0.08425667259579685,
"learning_rate": 2.483584539260238e-05,
"loss": 0.8779,
"step": 26
},
{
"epoch": 0.47368421052631576,
"grad_norm": 0.07744492406671652,
"learning_rate": 2.480472392778407e-05,
"loss": 0.8834,
"step": 27
},
{
"epoch": 0.49122807017543857,
"grad_norm": 0.08767059933162678,
"learning_rate": 2.4770925733130725e-05,
"loss": 0.9148,
"step": 28
},
{
"epoch": 0.5087719298245614,
"grad_norm": 0.07726724919511256,
"learning_rate": 2.473445816099226e-05,
"loss": 0.9088,
"step": 29
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.07836879806212735,
"learning_rate": 2.4695329144406337e-05,
"loss": 0.8944,
"step": 30
},
{
"epoch": 0.543859649122807,
"grad_norm": 0.07205578520489196,
"learning_rate": 2.465354719537264e-05,
"loss": 0.8966,
"step": 31
},
{
"epoch": 0.5614035087719298,
"grad_norm": 0.0732976565919632,
"learning_rate": 2.460912140300119e-05,
"loss": 0.8933,
"step": 32
},
{
"epoch": 0.5789473684210527,
"grad_norm": 0.06413069817944542,
"learning_rate": 2.4562061431535128e-05,
"loss": 0.8687,
"step": 33
},
{
"epoch": 0.5964912280701754,
"grad_norm": 0.061655130915948715,
"learning_rate": 2.4512377518248398e-05,
"loss": 0.8757,
"step": 34
},
{
"epoch": 0.6140350877192983,
"grad_norm": 0.06005143040792398,
"learning_rate": 2.4460080471218766e-05,
"loss": 0.8763,
"step": 35
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.059268901460994255,
"learning_rate": 2.4405181666976646e-05,
"loss": 0.8691,
"step": 36
},
{
"epoch": 0.6491228070175439,
"grad_norm": 0.0632968952247683,
"learning_rate": 2.43476930480303e-05,
"loss": 0.876,
"step": 37
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.05479140319445762,
"learning_rate": 2.428762712026792e-05,
"loss": 0.8682,
"step": 38
},
{
"epoch": 0.6842105263157895,
"grad_norm": 0.05563512390428038,
"learning_rate": 2.4224996950237093e-05,
"loss": 0.8841,
"step": 39
},
{
"epoch": 0.7017543859649122,
"grad_norm": 0.05247972036076721,
"learning_rate": 2.4159816162302394e-05,
"loss": 0.8787,
"step": 40
},
{
"epoch": 0.7192982456140351,
"grad_norm": 0.061183366717530226,
"learning_rate": 2.4092098935681556e-05,
"loss": 0.8549,
"step": 41
},
{
"epoch": 0.7368421052631579,
"grad_norm": 0.05375757865160034,
"learning_rate": 2.402186000136098e-05,
"loss": 0.8528,
"step": 42
},
{
"epoch": 0.7543859649122807,
"grad_norm": 0.05057045758521559,
"learning_rate": 2.39491146388912e-05,
"loss": 0.8536,
"step": 43
},
{
"epoch": 0.7719298245614035,
"grad_norm": 0.04649702726117417,
"learning_rate": 2.387387867306302e-05,
"loss": 0.8488,
"step": 44
},
{
"epoch": 0.7894736842105263,
"grad_norm": 0.04787518315340449,
"learning_rate": 2.379616847046505e-05,
"loss": 0.8573,
"step": 45
},
{
"epoch": 0.8070175438596491,
"grad_norm": 0.045213502324060025,
"learning_rate": 2.371600093592335e-05,
"loss": 0.8727,
"step": 46
},
{
"epoch": 0.8245614035087719,
"grad_norm": 0.04738126021150567,
"learning_rate": 2.3633393508824022e-05,
"loss": 0.8633,
"step": 47
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.043833518738075415,
"learning_rate": 2.3548364159319513e-05,
"loss": 0.868,
"step": 48
},
{
"epoch": 0.8596491228070176,
"grad_norm": 0.042510793627425734,
"learning_rate": 2.3460931384419427e-05,
"loss": 0.852,
"step": 49
},
{
"epoch": 0.8771929824561403,
"grad_norm": 0.0408258194280563,
"learning_rate": 2.3371114203966756e-05,
"loss": 0.8595,
"step": 50
},
{
"epoch": 0.8947368421052632,
"grad_norm": 0.041284443987598174,
"learning_rate": 2.3278932156500348e-05,
"loss": 0.8701,
"step": 51
},
{
"epoch": 0.9122807017543859,
"grad_norm": 0.04015399468491511,
"learning_rate": 2.3184405295004592e-05,
"loss": 0.8378,
"step": 52
},
{
"epoch": 0.9298245614035088,
"grad_norm": 0.0470194493755998,
"learning_rate": 2.3087554182547123e-05,
"loss": 0.8522,
"step": 53
},
{
"epoch": 0.9473684210526315,
"grad_norm": 0.040346271463175765,
"learning_rate": 2.298839988780561e-05,
"loss": 0.8571,
"step": 54
},
{
"epoch": 0.9649122807017544,
"grad_norm": 0.03907150168014673,
"learning_rate": 2.288696398048455e-05,
"loss": 0.8389,
"step": 55
},
{
"epoch": 0.9824561403508771,
"grad_norm": 0.03966173521753987,
"learning_rate": 2.278326852662305e-05,
"loss": 0.8473,
"step": 56
},
{
"epoch": 1.0,
"grad_norm": 0.03792422830368885,
"learning_rate": 2.267733608379468e-05,
"loss": 0.8308,
"step": 57
},
{
"epoch": 1.0175438596491229,
"grad_norm": 0.039916995591762185,
"learning_rate": 2.2569189696200327e-05,
"loss": 0.8363,
"step": 58
},
{
"epoch": 1.0350877192982457,
"grad_norm": 0.04222978922823946,
"learning_rate": 2.2458852889655284e-05,
"loss": 0.8248,
"step": 59
},
{
"epoch": 1.0526315789473684,
"grad_norm": 0.037766008270626816,
"learning_rate": 2.234634966647148e-05,
"loss": 0.8108,
"step": 60
},
{
"epoch": 1.0701754385964912,
"grad_norm": 0.03910137890933386,
"learning_rate": 2.2231704500236117e-05,
"loss": 0.8111,
"step": 61
},
{
"epoch": 1.087719298245614,
"grad_norm": 0.03723938187511672,
"learning_rate": 2.211494233048776e-05,
"loss": 0.7961,
"step": 62
},
{
"epoch": 1.1052631578947367,
"grad_norm": 0.03536618534586841,
"learning_rate": 2.1996088557291062e-05,
"loss": 0.8083,
"step": 63
},
{
"epoch": 1.1228070175438596,
"grad_norm": 0.03509557198121232,
"learning_rate": 2.1875169035711335e-05,
"loss": 0.8301,
"step": 64
},
{
"epoch": 1.1403508771929824,
"grad_norm": 0.03631360422996243,
"learning_rate": 2.1752210070190106e-05,
"loss": 0.8119,
"step": 65
},
{
"epoch": 1.1578947368421053,
"grad_norm": 0.034923838848099804,
"learning_rate": 2.162723840882293e-05,
"loss": 0.8351,
"step": 66
},
{
"epoch": 1.1754385964912282,
"grad_norm": 0.034222477158642954,
"learning_rate": 2.150028123754072e-05,
"loss": 0.8396,
"step": 67
},
{
"epoch": 1.1929824561403508,
"grad_norm": 0.044099006560021574,
"learning_rate": 2.137136617419578e-05,
"loss": 0.8132,
"step": 68
},
{
"epoch": 1.2105263157894737,
"grad_norm": 0.03760194901086737,
"learning_rate": 2.1240521262553927e-05,
"loss": 0.8277,
"step": 69
},
{
"epoch": 1.2280701754385965,
"grad_norm": 0.034132230637497686,
"learning_rate": 2.1107774966193932e-05,
"loss": 0.8231,
"step": 70
},
{
"epoch": 1.2456140350877192,
"grad_norm": 0.034145371494878535,
"learning_rate": 2.097315616231564e-05,
"loss": 0.8116,
"step": 71
},
{
"epoch": 1.263157894736842,
"grad_norm": 0.03402745474331636,
"learning_rate": 2.0836694135458136e-05,
"loss": 0.8283,
"step": 72
},
{
"epoch": 1.280701754385965,
"grad_norm": 0.046074062113807,
"learning_rate": 2.0698418571129255e-05,
"loss": 0.8161,
"step": 73
},
{
"epoch": 1.2982456140350878,
"grad_norm": 0.03409175484451008,
"learning_rate": 2.055835954934791e-05,
"loss": 0.8056,
"step": 74
},
{
"epoch": 1.3157894736842106,
"grad_norm": 0.03601398897730395,
"learning_rate": 2.041654753810059e-05,
"loss": 0.8139,
"step": 75
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.035605249354124874,
"learning_rate": 2.027301338671342e-05,
"loss": 0.7993,
"step": 76
},
{
"epoch": 1.3508771929824561,
"grad_norm": 0.03851629705577501,
"learning_rate": 2.0127788319141345e-05,
"loss": 0.8192,
"step": 77
},
{
"epoch": 1.368421052631579,
"grad_norm": 0.03560123297297108,
"learning_rate": 1.998090392717572e-05,
"loss": 0.8194,
"step": 78
},
{
"epoch": 1.3859649122807016,
"grad_norm": 0.03338440818080332,
"learning_rate": 1.9832392163571977e-05,
"loss": 0.823,
"step": 79
},
{
"epoch": 1.4035087719298245,
"grad_norm": 0.039320599103418945,
"learning_rate": 1.968228533509871e-05,
"loss": 0.7991,
"step": 80
},
{
"epoch": 1.4210526315789473,
"grad_norm": 0.03413703166680613,
"learning_rate": 1.953061609550976e-05,
"loss": 0.8122,
"step": 81
},
{
"epoch": 1.4385964912280702,
"grad_norm": 0.03632660780988978,
"learning_rate": 1.937741743844082e-05,
"loss": 0.8051,
"step": 82
},
{
"epoch": 1.456140350877193,
"grad_norm": 0.2720784291107051,
"learning_rate": 1.9222722690232124e-05,
"loss": 0.7982,
"step": 83
},
{
"epoch": 1.4736842105263157,
"grad_norm": 0.033948193629099205,
"learning_rate": 1.9066565502678735e-05,
"loss": 0.8244,
"step": 84
},
{
"epoch": 1.4912280701754386,
"grad_norm": 0.04015231276799685,
"learning_rate": 1.8908979845710028e-05,
"loss": 0.802,
"step": 85
},
{
"epoch": 1.5087719298245614,
"grad_norm": 0.0334280910595663,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.7944,
"step": 86
},
{
"epoch": 1.526315789473684,
"grad_norm": 0.033433004016842426,
"learning_rate": 1.8589660549509958e-05,
"loss": 0.8086,
"step": 87
},
{
"epoch": 1.543859649122807,
"grad_norm": 0.03647767785675323,
"learning_rate": 1.842799637396523e-05,
"loss": 0.8005,
"step": 88
},
{
"epoch": 1.5614035087719298,
"grad_norm": 0.034851294076943595,
"learning_rate": 1.8265042641267543e-05,
"loss": 0.7697,
"step": 89
},
{
"epoch": 1.5789473684210527,
"grad_norm": 0.035934237530251795,
"learning_rate": 1.8100834799844733e-05,
"loss": 0.8017,
"step": 90
},
{
"epoch": 1.5964912280701755,
"grad_norm": 0.035583328708530516,
"learning_rate": 1.793540857093937e-05,
"loss": 0.8035,
"step": 91
},
{
"epoch": 1.6140350877192984,
"grad_norm": 0.035602801094138097,
"learning_rate": 1.77687999408381e-05,
"loss": 0.7785,
"step": 92
},
{
"epoch": 1.631578947368421,
"grad_norm": 0.03338943294215932,
"learning_rate": 1.760104515304331e-05,
"loss": 0.809,
"step": 93
},
{
"epoch": 1.6491228070175439,
"grad_norm": 0.03440455754366396,
"learning_rate": 1.743218070038882e-05,
"loss": 0.7835,
"step": 94
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.03413755186702014,
"learning_rate": 1.7262243317101342e-05,
"loss": 0.7857,
"step": 95
},
{
"epoch": 1.6842105263157894,
"grad_norm": 0.03450977935766268,
"learning_rate": 1.709126997080946e-05,
"loss": 0.8045,
"step": 96
},
{
"epoch": 1.7017543859649122,
"grad_norm": 0.03521585021316178,
"learning_rate": 1.6919297854501793e-05,
"loss": 0.7935,
"step": 97
},
{
"epoch": 1.719298245614035,
"grad_norm": 0.03493887488862163,
"learning_rate": 1.674636437843616e-05,
"loss": 0.798,
"step": 98
},
{
"epoch": 1.736842105263158,
"grad_norm": 0.035529760900503735,
"learning_rate": 1.6572507162001472e-05,
"loss": 0.799,
"step": 99
},
{
"epoch": 1.7543859649122808,
"grad_norm": 0.033513087382252796,
"learning_rate": 1.6397764025534122e-05,
"loss": 0.7894,
"step": 100
},
{
"epoch": 1.7719298245614035,
"grad_norm": 0.19122994636460844,
"learning_rate": 1.6222172982090696e-05,
"loss": 0.7804,
"step": 101
},
{
"epoch": 1.7894736842105263,
"grad_norm": 0.035807687623747184,
"learning_rate": 1.604577222917871e-05,
"loss": 0.7951,
"step": 102
},
{
"epoch": 1.807017543859649,
"grad_norm": 0.03259107722786136,
"learning_rate": 1.586860014044726e-05,
"loss": 0.7781,
"step": 103
},
{
"epoch": 1.8245614035087718,
"grad_norm": 0.036809691518080494,
"learning_rate": 1.5690695257339348e-05,
"loss": 0.8008,
"step": 104
},
{
"epoch": 1.8421052631578947,
"grad_norm": 0.035038305488987835,
"learning_rate": 1.551209628070768e-05,
"loss": 0.7753,
"step": 105
},
{
"epoch": 1.8596491228070176,
"grad_norm": 0.03563704680282362,
"learning_rate": 1.5332842062395837e-05,
"loss": 0.8109,
"step": 106
},
{
"epoch": 1.8771929824561404,
"grad_norm": 0.03337909268033773,
"learning_rate": 1.5152971596786539e-05,
"loss": 0.8074,
"step": 107
},
{
"epoch": 1.8947368421052633,
"grad_norm": 0.03364182376321439,
"learning_rate": 1.4972524012318968e-05,
"loss": 0.7814,
"step": 108
},
{
"epoch": 1.912280701754386,
"grad_norm": 0.03394899793236395,
"learning_rate": 1.4791538562976858e-05,
"loss": 0.8046,
"step": 109
},
{
"epoch": 1.9298245614035088,
"grad_norm": 0.03276078625186682,
"learning_rate": 1.4610054619749335e-05,
"loss": 0.7923,
"step": 110
},
{
"epoch": 1.9473684210526314,
"grad_norm": 0.035403995130927304,
"learning_rate": 1.442811166206628e-05,
"loss": 0.8036,
"step": 111
},
{
"epoch": 1.9649122807017543,
"grad_norm": 0.036133327261938186,
"learning_rate": 1.4245749269210077e-05,
"loss": 0.7875,
"step": 112
},
{
"epoch": 1.9824561403508771,
"grad_norm": 0.036600172448979534,
"learning_rate": 1.40630071117057e-05,
"loss": 0.7697,
"step": 113
},
{
"epoch": 2.0,
"grad_norm": 0.037601350569099724,
"learning_rate": 1.3879924942690875e-05,
"loss": 0.8189,
"step": 114
},
{
"epoch": 2.017543859649123,
"grad_norm": 0.03302530011499694,
"learning_rate": 1.3696542589268343e-05,
"loss": 0.7611,
"step": 115
},
{
"epoch": 2.0350877192982457,
"grad_norm": 0.03512883427203999,
"learning_rate": 1.3512899943842001e-05,
"loss": 0.8027,
"step": 116
},
{
"epoch": 2.0526315789473686,
"grad_norm": 0.032765823734136675,
"learning_rate": 1.3329036955438801e-05,
"loss": 0.7739,
"step": 117
},
{
"epoch": 2.0701754385964914,
"grad_norm": 0.0325717443826363,
"learning_rate": 1.3144993621018414e-05,
"loss": 0.7749,
"step": 118
},
{
"epoch": 2.087719298245614,
"grad_norm": 0.03351967483422044,
"learning_rate": 1.2960809976772395e-05,
"loss": 0.7776,
"step": 119
},
{
"epoch": 2.1052631578947367,
"grad_norm": 0.03412426117127494,
"learning_rate": 1.2776526089414836e-05,
"loss": 0.7604,
"step": 120
},
{
"epoch": 2.1228070175438596,
"grad_norm": 0.034817647751256633,
"learning_rate": 1.2592182047466405e-05,
"loss": 0.7701,
"step": 121
},
{
"epoch": 2.1403508771929824,
"grad_norm": 0.03260110561553477,
"learning_rate": 1.2407817952533594e-05,
"loss": 0.7741,
"step": 122
},
{
"epoch": 2.1578947368421053,
"grad_norm": 0.03193494436049472,
"learning_rate": 1.2223473910585165e-05,
"loss": 0.7645,
"step": 123
},
{
"epoch": 2.175438596491228,
"grad_norm": 0.03312398775341158,
"learning_rate": 1.2039190023227611e-05,
"loss": 0.7585,
"step": 124
},
{
"epoch": 2.192982456140351,
"grad_norm": 0.03343964045395972,
"learning_rate": 1.1855006378981588e-05,
"loss": 0.7921,
"step": 125
},
{
"epoch": 2.2105263157894735,
"grad_norm": 0.033686542752496544,
"learning_rate": 1.1670963044561205e-05,
"loss": 0.7827,
"step": 126
},
{
"epoch": 2.2280701754385963,
"grad_norm": 0.03497709388430689,
"learning_rate": 1.1487100056158e-05,
"loss": 0.7867,
"step": 127
},
{
"epoch": 2.245614035087719,
"grad_norm": 0.03909574941588132,
"learning_rate": 1.1303457410731658e-05,
"loss": 0.7651,
"step": 128
},
{
"epoch": 2.263157894736842,
"grad_norm": 0.03295131774552763,
"learning_rate": 1.112007505730913e-05,
"loss": 0.7716,
"step": 129
},
{
"epoch": 2.280701754385965,
"grad_norm": 0.03535332720570186,
"learning_rate": 1.0936992888294304e-05,
"loss": 0.7519,
"step": 130
},
{
"epoch": 2.2982456140350878,
"grad_norm": 0.034260157940805745,
"learning_rate": 1.0754250730789925e-05,
"loss": 0.7778,
"step": 131
},
{
"epoch": 2.3157894736842106,
"grad_norm": 0.03778502943480454,
"learning_rate": 1.057188833793372e-05,
"loss": 0.7785,
"step": 132
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.03282521968583762,
"learning_rate": 1.0389945380250666e-05,
"loss": 0.7822,
"step": 133
},
{
"epoch": 2.3508771929824563,
"grad_norm": 0.03439059832810125,
"learning_rate": 1.0208461437023146e-05,
"loss": 0.7774,
"step": 134
},
{
"epoch": 2.3684210526315788,
"grad_norm": 0.03579842875417821,
"learning_rate": 1.0027475987681033e-05,
"loss": 0.7626,
"step": 135
},
{
"epoch": 2.3859649122807016,
"grad_norm": 0.04487526535583229,
"learning_rate": 9.847028403213464e-06,
"loss": 0.785,
"step": 136
},
{
"epoch": 2.4035087719298245,
"grad_norm": 0.03582408613012423,
"learning_rate": 9.667157937604165e-06,
"loss": 0.772,
"step": 137
},
{
"epoch": 2.4210526315789473,
"grad_norm": 0.035176969293327615,
"learning_rate": 9.487903719292321e-06,
"loss": 0.7777,
"step": 138
},
{
"epoch": 2.43859649122807,
"grad_norm": 0.03346578608050542,
"learning_rate": 9.309304742660656e-06,
"loss": 0.7577,
"step": 139
},
{
"epoch": 2.456140350877193,
"grad_norm": 0.039503770377912154,
"learning_rate": 9.131399859552739e-06,
"loss": 0.7901,
"step": 140
},
{
"epoch": 2.473684210526316,
"grad_norm": 0.03677116076069421,
"learning_rate": 8.954227770821292e-06,
"loss": 0.7723,
"step": 141
},
{
"epoch": 2.4912280701754383,
"grad_norm": 0.03338240458340637,
"learning_rate": 8.77782701790931e-06,
"loss": 0.7617,
"step": 142
},
{
"epoch": 2.5087719298245617,
"grad_norm": 0.033837659971756015,
"learning_rate": 8.60223597446588e-06,
"loss": 0.7713,
"step": 143
},
{
"epoch": 2.526315789473684,
"grad_norm": 0.03820384430127108,
"learning_rate": 8.427492837998533e-06,
"loss": 0.7557,
"step": 144
},
{
"epoch": 2.543859649122807,
"grad_norm": 0.03252410748125962,
"learning_rate": 8.25363562156384e-06,
"loss": 0.7805,
"step": 145
},
{
"epoch": 2.56140350877193,
"grad_norm": 0.03706815897768721,
"learning_rate": 8.080702145498206e-06,
"loss": 0.7645,
"step": 146
},
{
"epoch": 2.5789473684210527,
"grad_norm": 0.035598168147733984,
"learning_rate": 7.908730029190544e-06,
"loss": 0.7877,
"step": 147
},
{
"epoch": 2.5964912280701755,
"grad_norm": 0.0333688858025551,
"learning_rate": 7.737756682898659e-06,
"loss": 0.7591,
"step": 148
},
{
"epoch": 2.6140350877192984,
"grad_norm": 0.044640638601682346,
"learning_rate": 7.567819299611184e-06,
"loss": 0.7658,
"step": 149
},
{
"epoch": 2.6315789473684212,
"grad_norm": 0.033158357578421456,
"learning_rate": 7.398954846956688e-06,
"loss": 0.7719,
"step": 150
},
{
"epoch": 2.6491228070175437,
"grad_norm": 0.033194259963536865,
"learning_rate": 7.231200059161899e-06,
"loss": 0.7806,
"step": 151
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.03518173471203294,
"learning_rate": 7.064591429060635e-06,
"loss": 0.7679,
"step": 152
},
{
"epoch": 2.6842105263157894,
"grad_norm": 0.03276207653785537,
"learning_rate": 6.8991652001552695e-06,
"loss": 0.7728,
"step": 153
},
{
"epoch": 2.7017543859649122,
"grad_norm": 0.0347231674496661,
"learning_rate": 6.734957358732458e-06,
"loss": 0.7741,
"step": 154
},
{
"epoch": 2.719298245614035,
"grad_norm": 0.033258910497780264,
"learning_rate": 6.572003626034776e-06,
"loss": 0.7728,
"step": 155
},
{
"epoch": 2.736842105263158,
"grad_norm": 0.033923018163736,
"learning_rate": 6.410339450490047e-06,
"loss": 0.7838,
"step": 156
},
{
"epoch": 2.754385964912281,
"grad_norm": 0.03499101652185436,
"learning_rate": 6.250000000000003e-06,
"loss": 0.7836,
"step": 157
},
{
"epoch": 2.7719298245614032,
"grad_norm": 0.037455923489755065,
"learning_rate": 6.091020154289971e-06,
"loss": 0.786,
"step": 158
},
{
"epoch": 2.7894736842105265,
"grad_norm": 0.03406798967875397,
"learning_rate": 5.933434497321268e-06,
"loss": 0.7607,
"step": 159
},
{
"epoch": 2.807017543859649,
"grad_norm": 0.03263055408168355,
"learning_rate": 5.777277309767873e-06,
"loss": 0.7835,
"step": 160
},
{
"epoch": 2.824561403508772,
"grad_norm": 0.0370087111535257,
"learning_rate": 5.62258256155918e-06,
"loss": 0.7506,
"step": 161
},
{
"epoch": 2.8421052631578947,
"grad_norm": 0.03253506069052104,
"learning_rate": 5.469383904490243e-06,
"loss": 0.7849,
"step": 162
},
{
"epoch": 2.8596491228070176,
"grad_norm": 0.034164574715283676,
"learning_rate": 5.317714664901289e-06,
"loss": 0.7665,
"step": 163
},
{
"epoch": 2.8771929824561404,
"grad_norm": 0.03375521327460909,
"learning_rate": 5.167607836428023e-06,
"loss": 0.7497,
"step": 164
},
{
"epoch": 2.8947368421052633,
"grad_norm": 0.035848240948616814,
"learning_rate": 5.0190960728242834e-06,
"loss": 0.7904,
"step": 165
},
{
"epoch": 2.912280701754386,
"grad_norm": 0.03266391216652421,
"learning_rate": 4.872211680858662e-06,
"loss": 0.7592,
"step": 166
},
{
"epoch": 2.9298245614035086,
"grad_norm": 0.03349205497732682,
"learning_rate": 4.726986613286583e-06,
"loss": 0.7666,
"step": 167
},
{
"epoch": 2.9473684210526314,
"grad_norm": 0.034893605290688134,
"learning_rate": 4.5834524618994106e-06,
"loss": 0.7676,
"step": 168
},
{
"epoch": 2.9649122807017543,
"grad_norm": 0.034664258032328234,
"learning_rate": 4.441640450652093e-06,
"loss": 0.7675,
"step": 169
},
{
"epoch": 2.982456140350877,
"grad_norm": 0.032567147685664447,
"learning_rate": 4.30158142887075e-06,
"loss": 0.7607,
"step": 170
},
{
"epoch": 3.0,
"grad_norm": 0.034965409248096775,
"learning_rate": 4.163305864541865e-06,
"loss": 0.7622,
"step": 171
},
{
"epoch": 3.017543859649123,
"grad_norm": 0.034735445589064905,
"learning_rate": 4.026843837684359e-06,
"loss": 0.7767,
"step": 172
},
{
"epoch": 3.0350877192982457,
"grad_norm": 0.061447912336751793,
"learning_rate": 3.89222503380607e-06,
"loss": 0.7423,
"step": 173
},
{
"epoch": 3.0526315789473686,
"grad_norm": 0.03296491017047373,
"learning_rate": 3.7594787374460747e-06,
"loss": 0.7608,
"step": 174
}
],
"logging_steps": 1,
"max_steps": 228,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 29,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.1115148672434176e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}