{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.899521531100478,
"eval_steps": 500,
"global_step": 312,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.019138755980861243,
"grad_norm": 3.7146408557891846,
"learning_rate": 2.2222222222222223e-05,
"loss": 4.4869,
"step": 1
},
{
"epoch": 0.03827751196172249,
"grad_norm": 3.3118133544921875,
"learning_rate": 4.4444444444444447e-05,
"loss": 4.1867,
"step": 2
},
{
"epoch": 0.05741626794258373,
"grad_norm": 2.972708225250244,
"learning_rate": 6.666666666666667e-05,
"loss": 4.001,
"step": 3
},
{
"epoch": 0.07655502392344497,
"grad_norm": 4.938202381134033,
"learning_rate": 8.888888888888889e-05,
"loss": 5.0582,
"step": 4
},
{
"epoch": 0.09569377990430622,
"grad_norm": 3.5732812881469727,
"learning_rate": 0.00011111111111111112,
"loss": 4.5871,
"step": 5
},
{
"epoch": 0.11483253588516747,
"grad_norm": 3.350315570831299,
"learning_rate": 0.00013333333333333334,
"loss": 4.0071,
"step": 6
},
{
"epoch": 0.1339712918660287,
"grad_norm": 3.4415643215179443,
"learning_rate": 0.00015555555555555556,
"loss": 3.9791,
"step": 7
},
{
"epoch": 0.15311004784688995,
"grad_norm": 2.558781385421753,
"learning_rate": 0.00017777777777777779,
"loss": 3.6497,
"step": 8
},
{
"epoch": 0.1722488038277512,
"grad_norm": 2.3021087646484375,
"learning_rate": 0.0002,
"loss": 3.5205,
"step": 9
},
{
"epoch": 0.19138755980861244,
"grad_norm": 2.301999568939209,
"learning_rate": 0.00019999462497359466,
"loss": 4.112,
"step": 10
},
{
"epoch": 0.21052631578947367,
"grad_norm": 3.0552637577056885,
"learning_rate": 0.0001999785004721968,
"loss": 3.8723,
"step": 11
},
{
"epoch": 0.22966507177033493,
"grad_norm": 2.5972537994384766,
"learning_rate": 0.00019995162822919883,
"loss": 3.8135,
"step": 12
},
{
"epoch": 0.24880382775119617,
"grad_norm": 2.0281920433044434,
"learning_rate": 0.00019991401113338104,
"loss": 3.8702,
"step": 13
},
{
"epoch": 0.2679425837320574,
"grad_norm": 1.7147849798202515,
"learning_rate": 0.00019986565322860115,
"loss": 3.463,
"step": 14
},
{
"epoch": 0.28708133971291866,
"grad_norm": 2.082582473754883,
"learning_rate": 0.00019980655971335945,
"loss": 3.3816,
"step": 15
},
{
"epoch": 0.3062200956937799,
"grad_norm": 2.1299426555633545,
"learning_rate": 0.00019973673694024,
"loss": 3.698,
"step": 16
},
{
"epoch": 0.3253588516746411,
"grad_norm": 1.8626389503479004,
"learning_rate": 0.0001996561924152278,
"loss": 3.3583,
"step": 17
},
{
"epoch": 0.3444976076555024,
"grad_norm": 2.452871322631836,
"learning_rate": 0.0001995649347969019,
"loss": 3.4957,
"step": 18
},
{
"epoch": 0.36363636363636365,
"grad_norm": 2.265108585357666,
"learning_rate": 0.00019946297389550433,
"loss": 3.1115,
"step": 19
},
{
"epoch": 0.3827751196172249,
"grad_norm": 1.996728777885437,
"learning_rate": 0.0001993503206718859,
"loss": 3.4159,
"step": 20
},
{
"epoch": 0.4019138755980861,
"grad_norm": 1.913594365119934,
"learning_rate": 0.00019922698723632767,
"loss": 3.3288,
"step": 21
},
{
"epoch": 0.42105263157894735,
"grad_norm": 2.4316132068634033,
"learning_rate": 0.00019909298684723904,
"loss": 3.4245,
"step": 22
},
{
"epoch": 0.44019138755980863,
"grad_norm": 1.998693823814392,
"learning_rate": 0.00019894833390973266,
"loss": 3.1687,
"step": 23
},
{
"epoch": 0.45933014354066987,
"grad_norm": 2.21382737159729,
"learning_rate": 0.0001987930439740757,
"loss": 3.4307,
"step": 24
},
{
"epoch": 0.4784688995215311,
"grad_norm": 2.586013078689575,
"learning_rate": 0.0001986271337340182,
"loss": 3.3596,
"step": 25
},
{
"epoch": 0.49760765550239233,
"grad_norm": 2.8244550228118896,
"learning_rate": 0.0001984506210249986,
"loss": 3.3107,
"step": 26
},
{
"epoch": 0.5167464114832536,
"grad_norm": 2.0228700637817383,
"learning_rate": 0.00019826352482222638,
"loss": 3.1749,
"step": 27
},
{
"epoch": 0.5358851674641149,
"grad_norm": 2.7035820484161377,
"learning_rate": 0.0001980658652386421,
"loss": 3.0995,
"step": 28
},
{
"epoch": 0.5550239234449761,
"grad_norm": 2.119741916656494,
"learning_rate": 0.00019785766352275542,
"loss": 3.225,
"step": 29
},
{
"epoch": 0.5741626794258373,
"grad_norm": 2.5071310997009277,
"learning_rate": 0.00019763894205636072,
"loss": 3.0066,
"step": 30
},
{
"epoch": 0.5933014354066986,
"grad_norm": 2.992201566696167,
"learning_rate": 0.00019740972435213115,
"loss": 3.0412,
"step": 31
},
{
"epoch": 0.6124401913875598,
"grad_norm": 2.820875883102417,
"learning_rate": 0.00019717003505109095,
"loss": 3.3575,
"step": 32
},
{
"epoch": 0.631578947368421,
"grad_norm": 2.7096059322357178,
"learning_rate": 0.00019691989991996663,
"loss": 3.3531,
"step": 33
},
{
"epoch": 0.6507177033492823,
"grad_norm": 2.172783374786377,
"learning_rate": 0.00019665934584841682,
"loss": 3.2852,
"step": 34
},
{
"epoch": 0.6698564593301436,
"grad_norm": 3.238025188446045,
"learning_rate": 0.00019638840084614182,
"loss": 3.3818,
"step": 35
},
{
"epoch": 0.6889952153110048,
"grad_norm": 2.92851185798645,
"learning_rate": 0.00019610709403987246,
"loss": 3.1,
"step": 36
},
{
"epoch": 0.7081339712918661,
"grad_norm": 2.514800786972046,
"learning_rate": 0.000195815455670239,
"loss": 3.0883,
"step": 37
},
{
"epoch": 0.7272727272727273,
"grad_norm": 3.264613151550293,
"learning_rate": 0.0001955135170885202,
"loss": 2.9932,
"step": 38
},
{
"epoch": 0.7464114832535885,
"grad_norm": 2.4111247062683105,
"learning_rate": 0.00019520131075327298,
"loss": 2.974,
"step": 39
},
{
"epoch": 0.7655502392344498,
"grad_norm": 2.692473888397217,
"learning_rate": 0.00019487887022684336,
"loss": 2.8822,
"step": 40
},
{
"epoch": 0.784688995215311,
"grad_norm": 3.3863365650177,
"learning_rate": 0.00019454623017175812,
"loss": 3.0255,
"step": 41
},
{
"epoch": 0.8038277511961722,
"grad_norm": 2.2267720699310303,
"learning_rate": 0.0001942034263469989,
"loss": 3.1883,
"step": 42
},
{
"epoch": 0.8229665071770335,
"grad_norm": 2.31858491897583,
"learning_rate": 0.00019385049560415794,
"loss": 2.9882,
"step": 43
},
{
"epoch": 0.8421052631578947,
"grad_norm": 2.3098323345184326,
"learning_rate": 0.00019348747588347637,
"loss": 2.8801,
"step": 44
},
{
"epoch": 0.861244019138756,
"grad_norm": 3.3286585807800293,
"learning_rate": 0.00019311440620976597,
"loss": 2.9762,
"step": 45
},
{
"epoch": 0.8803827751196173,
"grad_norm": 3.1082146167755127,
"learning_rate": 0.00019273132668821364,
"loss": 2.8514,
"step": 46
},
{
"epoch": 0.8995215311004785,
"grad_norm": 2.2908411026000977,
"learning_rate": 0.00019233827850007027,
"loss": 3.2993,
"step": 47
},
{
"epoch": 0.9186602870813397,
"grad_norm": 2.1068387031555176,
"learning_rate": 0.00019193530389822363,
"loss": 3.0606,
"step": 48
},
{
"epoch": 0.937799043062201,
"grad_norm": 2.951885938644409,
"learning_rate": 0.0001915224462026563,
"loss": 3.042,
"step": 49
},
{
"epoch": 0.9569377990430622,
"grad_norm": 2.2476351261138916,
"learning_rate": 0.0001910997497957885,
"loss": 2.9928,
"step": 50
},
{
"epoch": 0.9760765550239234,
"grad_norm": 1.9801242351531982,
"learning_rate": 0.00019066726011770726,
"loss": 2.8911,
"step": 51
},
{
"epoch": 0.9952153110047847,
"grad_norm": 2.5246548652648926,
"learning_rate": 0.00019022502366128135,
"loss": 3.2457,
"step": 52
},
{
"epoch": 1.0,
"grad_norm": 5.682666778564453,
"learning_rate": 0.0001897730879671634,
"loss": 2.4435,
"step": 53
},
{
"epoch": 1.0191387559808613,
"grad_norm": 2.66831374168396,
"learning_rate": 0.00018931150161867916,
"loss": 2.7807,
"step": 54
},
{
"epoch": 1.0382775119617225,
"grad_norm": 2.5246026515960693,
"learning_rate": 0.0001888403142366049,
"loss": 2.7599,
"step": 55
},
{
"epoch": 1.0574162679425838,
"grad_norm": 1.959625244140625,
"learning_rate": 0.00018835957647383303,
"loss": 2.9087,
"step": 56
},
{
"epoch": 1.076555023923445,
"grad_norm": 2.277261257171631,
"learning_rate": 0.00018786934000992688,
"loss": 3.2283,
"step": 57
},
{
"epoch": 1.0956937799043063,
"grad_norm": 3.0258898735046387,
"learning_rate": 0.00018736965754556528,
"loss": 3.0084,
"step": 58
},
{
"epoch": 1.1148325358851674,
"grad_norm": 2.4277517795562744,
"learning_rate": 0.00018686058279687698,
"loss": 2.7159,
"step": 59
},
{
"epoch": 1.1339712918660287,
"grad_norm": 3.0732321739196777,
"learning_rate": 0.00018634217048966637,
"loss": 2.9704,
"step": 60
},
{
"epoch": 1.1531100478468899,
"grad_norm": 3.0256996154785156,
"learning_rate": 0.0001858144763535302,
"loss": 3.0254,
"step": 61
},
{
"epoch": 1.1722488038277512,
"grad_norm": 2.7575695514678955,
"learning_rate": 0.00018527755711586678,
"loss": 2.4907,
"step": 62
},
{
"epoch": 1.1913875598086126,
"grad_norm": 2.813037157058716,
"learning_rate": 0.00018473147049577774,
"loss": 2.8598,
"step": 63
},
{
"epoch": 1.2105263157894737,
"grad_norm": 2.197244644165039,
"learning_rate": 0.00018417627519786315,
"loss": 2.965,
"step": 64
},
{
"epoch": 1.229665071770335,
"grad_norm": 2.0711350440979004,
"learning_rate": 0.00018361203090591071,
"loss": 2.9582,
"step": 65
},
{
"epoch": 1.2488038277511961,
"grad_norm": 2.7295780181884766,
"learning_rate": 0.00018303879827647975,
"loss": 2.5148,
"step": 66
},
{
"epoch": 1.2679425837320575,
"grad_norm": 2.511603593826294,
"learning_rate": 0.00018245663893238075,
"loss": 2.7535,
"step": 67
},
{
"epoch": 1.2870813397129186,
"grad_norm": 3.695086717605591,
"learning_rate": 0.00018186561545605054,
"loss": 2.6003,
"step": 68
},
{
"epoch": 1.30622009569378,
"grad_norm": 3.2395761013031006,
"learning_rate": 0.00018126579138282503,
"loss": 2.7334,
"step": 69
},
{
"epoch": 1.325358851674641,
"grad_norm": 3.004142999649048,
"learning_rate": 0.00018065723119410884,
"loss": 2.7788,
"step": 70
},
{
"epoch": 1.3444976076555024,
"grad_norm": 2.964301824569702,
"learning_rate": 0.0001800400003104436,
"loss": 2.734,
"step": 71
},
{
"epoch": 1.3636363636363638,
"grad_norm": 3.981093645095825,
"learning_rate": 0.00017941416508447536,
"loss": 2.7476,
"step": 72
},
{
"epoch": 1.3827751196172249,
"grad_norm": 3.2536420822143555,
"learning_rate": 0.00017877979279382135,
"loss": 2.5386,
"step": 73
},
{
"epoch": 1.401913875598086,
"grad_norm": 3.6163337230682373,
"learning_rate": 0.0001781369516338378,
"loss": 2.6767,
"step": 74
},
{
"epoch": 1.4210526315789473,
"grad_norm": 3.6883926391601562,
"learning_rate": 0.000177485710710289,
"loss": 2.5656,
"step": 75
},
{
"epoch": 1.4401913875598087,
"grad_norm": 3.5389018058776855,
"learning_rate": 0.00017682614003191807,
"loss": 2.6626,
"step": 76
},
{
"epoch": 1.4593301435406698,
"grad_norm": 2.324506998062134,
"learning_rate": 0.0001761583105029213,
"loss": 2.6479,
"step": 77
},
{
"epoch": 1.4784688995215312,
"grad_norm": 2.271515130996704,
"learning_rate": 0.00017548229391532572,
"loss": 2.874,
"step": 78
},
{
"epoch": 1.4976076555023923,
"grad_norm": 3.023533821105957,
"learning_rate": 0.00017479816294127152,
"loss": 2.4017,
"step": 79
},
{
"epoch": 1.5167464114832536,
"grad_norm": 4.101243495941162,
"learning_rate": 0.0001741059911251997,
"loss": 3.0185,
"step": 80
},
{
"epoch": 1.535885167464115,
"grad_norm": 3.056877374649048,
"learning_rate": 0.00017340585287594604,
"loss": 2.7875,
"step": 81
},
{
"epoch": 1.555023923444976,
"grad_norm": 3.0255823135375977,
"learning_rate": 0.00017269782345874203,
"loss": 2.8453,
"step": 82
},
{
"epoch": 1.5741626794258372,
"grad_norm": 3.57423734664917,
"learning_rate": 0.00017198197898712404,
"loss": 2.6948,
"step": 83
},
{
"epoch": 1.5933014354066986,
"grad_norm": 3.436167001724243,
"learning_rate": 0.00017125839641475072,
"loss": 2.6287,
"step": 84
},
{
"epoch": 1.61244019138756,
"grad_norm": 3.1058871746063232,
"learning_rate": 0.00017052715352713075,
"loss": 2.5887,
"step": 85
},
{
"epoch": 1.631578947368421,
"grad_norm": 2.1073200702667236,
"learning_rate": 0.00016978832893326074,
"loss": 2.9573,
"step": 86
},
{
"epoch": 1.6507177033492821,
"grad_norm": 2.8039920330047607,
"learning_rate": 0.0001690420020571747,
"loss": 2.9652,
"step": 87
},
{
"epoch": 1.6698564593301435,
"grad_norm": 2.8494677543640137,
"learning_rate": 0.00016828825312940592,
"loss": 2.6263,
"step": 88
},
{
"epoch": 1.6889952153110048,
"grad_norm": 2.3521246910095215,
"learning_rate": 0.00016752716317836229,
"loss": 2.683,
"step": 89
},
{
"epoch": 1.7081339712918662,
"grad_norm": 2.5750181674957275,
"learning_rate": 0.00016675881402161536,
"loss": 2.611,
"step": 90
},
{
"epoch": 1.7272727272727273,
"grad_norm": 2.687619924545288,
"learning_rate": 0.00016598328825710533,
"loss": 2.523,
"step": 91
},
{
"epoch": 1.7464114832535884,
"grad_norm": 3.112954616546631,
"learning_rate": 0.00016520066925426144,
"loss": 2.6558,
"step": 92
},
{
"epoch": 1.7655502392344498,
"grad_norm": 3.4932713508605957,
"learning_rate": 0.0001644110411450398,
"loss": 2.6962,
"step": 93
},
{
"epoch": 1.784688995215311,
"grad_norm": 2.564894437789917,
"learning_rate": 0.00016361448881487914,
"loss": 2.8202,
"step": 94
},
{
"epoch": 1.8038277511961722,
"grad_norm": 3.1496503353118896,
"learning_rate": 0.0001628110978935756,
"loss": 2.2734,
"step": 95
},
{
"epoch": 1.8229665071770333,
"grad_norm": 2.6274123191833496,
"learning_rate": 0.00016200095474607753,
"loss": 2.5264,
"step": 96
},
{
"epoch": 1.8421052631578947,
"grad_norm": 2.374180555343628,
"learning_rate": 0.0001611841464632011,
"loss": 2.8034,
"step": 97
},
{
"epoch": 1.861244019138756,
"grad_norm": 2.691254138946533,
"learning_rate": 0.00016036076085226814,
"loss": 2.5935,
"step": 98
},
{
"epoch": 1.8803827751196174,
"grad_norm": 2.9795515537261963,
"learning_rate": 0.0001595308864276666,
"loss": 2.8707,
"step": 99
},
{
"epoch": 1.8995215311004785,
"grad_norm": 3.1781864166259766,
"learning_rate": 0.0001586946124013354,
"loss": 2.458,
"step": 100
},
{
"epoch": 1.9186602870813396,
"grad_norm": 2.8759453296661377,
"learning_rate": 0.00015785202867317407,
"loss": 2.5201,
"step": 101
},
{
"epoch": 1.937799043062201,
"grad_norm": 3.2317118644714355,
"learning_rate": 0.00015700322582137827,
"loss": 2.5585,
"step": 102
},
{
"epoch": 1.9569377990430623,
"grad_norm": 3.463688373565674,
"learning_rate": 0.0001561482950927029,
"loss": 2.4652,
"step": 103
},
{
"epoch": 1.9760765550239234,
"grad_norm": 2.4766316413879395,
"learning_rate": 0.00015528732839265272,
"loss": 2.5966,
"step": 104
},
{
"epoch": 1.9952153110047846,
"grad_norm": 2.8042709827423096,
"learning_rate": 0.00015442041827560274,
"loss": 2.5278,
"step": 105
},
{
"epoch": 2.0,
"grad_norm": 8.298028945922852,
"learning_rate": 0.00015354765793484834,
"loss": 2.8732,
"step": 106
},
{
"epoch": 2.0191387559808613,
"grad_norm": 3.808393716812134,
"learning_rate": 0.000152669141192587,
"loss": 2.1442,
"step": 107
},
{
"epoch": 2.0382775119617227,
"grad_norm": 3.3381223678588867,
"learning_rate": 0.00015178496248983254,
"loss": 2.6125,
"step": 108
},
{
"epoch": 2.0574162679425836,
"grad_norm": 4.778241157531738,
"learning_rate": 0.00015089521687626243,
"loss": 2.399,
"step": 109
},
{
"epoch": 2.076555023923445,
"grad_norm": 2.613919973373413,
"learning_rate": 0.00015000000000000001,
"loss": 2.5189,
"step": 110
},
{
"epoch": 2.0956937799043063,
"grad_norm": 3.6656932830810547,
"learning_rate": 0.00014909940809733222,
"loss": 2.2785,
"step": 111
},
{
"epoch": 2.1148325358851676,
"grad_norm": 2.968078136444092,
"learning_rate": 0.00014819353798236427,
"loss": 2.3605,
"step": 112
},
{
"epoch": 2.1339712918660285,
"grad_norm": 2.7252070903778076,
"learning_rate": 0.00014728248703661182,
"loss": 2.173,
"step": 113
},
{
"epoch": 2.15311004784689,
"grad_norm": 3.9389491081237793,
"learning_rate": 0.00014636635319853275,
"loss": 2.457,
"step": 114
},
{
"epoch": 2.172248803827751,
"grad_norm": 3.658862590789795,
"learning_rate": 0.00014544523495299842,
"loss": 2.6971,
"step": 115
},
{
"epoch": 2.1913875598086126,
"grad_norm": 3.303403377532959,
"learning_rate": 0.0001445192313207067,
"loss": 2.7851,
"step": 116
},
{
"epoch": 2.2105263157894735,
"grad_norm": 3.910428047180176,
"learning_rate": 0.00014358844184753712,
"loss": 2.1422,
"step": 117
},
{
"epoch": 2.229665071770335,
"grad_norm": 3.3043367862701416,
"learning_rate": 0.00014265296659384956,
"loss": 2.5404,
"step": 118
},
{
"epoch": 2.248803827751196,
"grad_norm": 2.9098987579345703,
"learning_rate": 0.0001417129061237278,
"loss": 2.567,
"step": 119
},
{
"epoch": 2.2679425837320575,
"grad_norm": 4.142232894897461,
"learning_rate": 0.00014076836149416887,
"loss": 2.4179,
"step": 120
},
{
"epoch": 2.287081339712919,
"grad_norm": 2.110104560852051,
"learning_rate": 0.00013981943424421932,
"loss": 2.4976,
"step": 121
},
{
"epoch": 2.3062200956937797,
"grad_norm": 2.6828229427337646,
"learning_rate": 0.00013886622638405952,
"loss": 2.5762,
"step": 122
},
{
"epoch": 2.325358851674641,
"grad_norm": 3.0066471099853516,
"learning_rate": 0.00013790884038403795,
"loss": 2.2882,
"step": 123
},
{
"epoch": 2.3444976076555024,
"grad_norm": 3.791444778442383,
"learning_rate": 0.00013694737916365517,
"loss": 2.1788,
"step": 124
},
{
"epoch": 2.3636363636363638,
"grad_norm": 2.78275203704834,
"learning_rate": 0.0001359819460805001,
"loss": 2.6037,
"step": 125
},
{
"epoch": 2.382775119617225,
"grad_norm": 4.18953275680542,
"learning_rate": 0.00013501264491913906,
"loss": 2.3284,
"step": 126
},
{
"epoch": 2.401913875598086,
"grad_norm": 2.925140142440796,
"learning_rate": 0.00013403957987995882,
"loss": 2.4364,
"step": 127
},
{
"epoch": 2.4210526315789473,
"grad_norm": 4.545037746429443,
"learning_rate": 0.00013306285556796495,
"loss": 2.4096,
"step": 128
},
{
"epoch": 2.4401913875598087,
"grad_norm": 3.785428524017334,
"learning_rate": 0.00013208257698153677,
"loss": 2.1047,
"step": 129
},
{
"epoch": 2.45933014354067,
"grad_norm": 3.6228346824645996,
"learning_rate": 0.00013109884950114007,
"loss": 2.5744,
"step": 130
},
{
"epoch": 2.478468899521531,
"grad_norm": 2.9221742153167725,
"learning_rate": 0.00013011177887799845,
"loss": 2.5266,
"step": 131
},
{
"epoch": 2.4976076555023923,
"grad_norm": 3.659484386444092,
"learning_rate": 0.00012912147122272523,
"loss": 2.3707,
"step": 132
},
{
"epoch": 2.5167464114832536,
"grad_norm": 3.5442514419555664,
"learning_rate": 0.00012812803299391628,
"loss": 2.4695,
"step": 133
},
{
"epoch": 2.535885167464115,
"grad_norm": 3.1291420459747314,
"learning_rate": 0.0001271315709867059,
"loss": 2.687,
"step": 134
},
{
"epoch": 2.555023923444976,
"grad_norm": 4.138225078582764,
"learning_rate": 0.00012613219232128608,
"loss": 2.2378,
"step": 135
},
{
"epoch": 2.574162679425837,
"grad_norm": 2.8483548164367676,
"learning_rate": 0.00012513000443139112,
"loss": 2.4044,
"step": 136
},
{
"epoch": 2.5933014354066986,
"grad_norm": 2.434741497039795,
"learning_rate": 0.00012412511505274844,
"loss": 2.5664,
"step": 137
},
{
"epoch": 2.61244019138756,
"grad_norm": 3.9319725036621094,
"learning_rate": 0.000123117632211497,
"loss": 2.2586,
"step": 138
},
{
"epoch": 2.6315789473684212,
"grad_norm": 3.4802486896514893,
"learning_rate": 0.0001221076642125742,
"loss": 2.0743,
"step": 139
},
{
"epoch": 2.650717703349282,
"grad_norm": 3.1535286903381348,
"learning_rate": 0.00012109531962807332,
"loss": 2.302,
"step": 140
},
{
"epoch": 2.6698564593301435,
"grad_norm": 2.9818458557128906,
"learning_rate": 0.00012008070728557186,
"loss": 2.4418,
"step": 141
},
{
"epoch": 2.688995215311005,
"grad_norm": 4.8768630027771,
"learning_rate": 0.00011906393625643244,
"loss": 2.5083,
"step": 142
},
{
"epoch": 2.708133971291866,
"grad_norm": 3.8520619869232178,
"learning_rate": 0.00011804511584407763,
"loss": 1.9994,
"step": 143
},
{
"epoch": 2.7272727272727275,
"grad_norm": 3.784248113632202,
"learning_rate": 0.00011702435557223987,
"loss": 2.2376,
"step": 144
},
{
"epoch": 2.7464114832535884,
"grad_norm": 4.1650800704956055,
"learning_rate": 0.00011600176517318741,
"loss": 2.3886,
"step": 145
},
{
"epoch": 2.7655502392344498,
"grad_norm": 4.099468231201172,
"learning_rate": 0.00011497745457592816,
"loss": 2.5978,
"step": 146
},
{
"epoch": 2.784688995215311,
"grad_norm": 4.268674850463867,
"learning_rate": 0.00011395153389439233,
"loss": 2.4882,
"step": 147
},
{
"epoch": 2.803827751196172,
"grad_norm": 4.081464767456055,
"learning_rate": 0.0001129241134155949,
"loss": 2.6547,
"step": 148
},
{
"epoch": 2.8229665071770333,
"grad_norm": 3.1537716388702393,
"learning_rate": 0.00011189530358778005,
"loss": 2.5361,
"step": 149
},
{
"epoch": 2.8421052631578947,
"grad_norm": 4.182295322418213,
"learning_rate": 0.00011086521500854745,
"loss": 2.385,
"step": 150
},
{
"epoch": 2.861244019138756,
"grad_norm": 2.5511474609375,
"learning_rate": 0.00010983395841296348,
"loss": 2.4617,
"step": 151
},
{
"epoch": 2.8803827751196174,
"grad_norm": 3.1007962226867676,
"learning_rate": 0.00010880164466165674,
"loss": 2.5788,
"step": 152
},
{
"epoch": 2.8995215311004783,
"grad_norm": 4.509490966796875,
"learning_rate": 0.00010776838472890065,
"loss": 2.1361,
"step": 153
},
{
"epoch": 2.9186602870813396,
"grad_norm": 2.6765851974487305,
"learning_rate": 0.00010673428969068364,
"loss": 2.3922,
"step": 154
},
{
"epoch": 2.937799043062201,
"grad_norm": 3.704310894012451,
"learning_rate": 0.00010569947071276847,
"loss": 2.6924,
"step": 155
},
{
"epoch": 2.9569377990430623,
"grad_norm": 3.935804843902588,
"learning_rate": 0.00010466403903874176,
"loss": 2.2886,
"step": 156
},
{
"epoch": 2.9760765550239237,
"grad_norm": 4.105613708496094,
"learning_rate": 0.00010362810597805526,
"loss": 2.3865,
"step": 157
},
{
"epoch": 2.9952153110047846,
"grad_norm": 3.669766664505005,
"learning_rate": 0.00010259178289406011,
"loss": 2.2158,
"step": 158
},
{
"epoch": 3.0,
"grad_norm": 8.930411338806152,
"learning_rate": 0.0001015551811920351,
"loss": 2.5214,
"step": 159
},
{
"epoch": 3.0191387559808613,
"grad_norm": 3.3217484951019287,
"learning_rate": 0.00010051841230721065,
"loss": 2.5409,
"step": 160
},
{
"epoch": 3.0382775119617227,
"grad_norm": 3.8041253089904785,
"learning_rate": 9.948158769278939e-05,
"loss": 1.8309,
"step": 161
},
{
"epoch": 3.0574162679425836,
"grad_norm": 3.892636775970459,
"learning_rate": 9.844481880796491e-05,
"loss": 2.0955,
"step": 162
},
{
"epoch": 3.076555023923445,
"grad_norm": 3.4822261333465576,
"learning_rate": 9.740821710593989e-05,
"loss": 2.2865,
"step": 163
},
{
"epoch": 3.0956937799043063,
"grad_norm": 3.033822774887085,
"learning_rate": 9.637189402194476e-05,
"loss": 2.3693,
"step": 164
},
{
"epoch": 3.1148325358851676,
"grad_norm": 3.693204641342163,
"learning_rate": 9.533596096125825e-05,
"loss": 1.984,
"step": 165
},
{
"epoch": 3.1339712918660285,
"grad_norm": 3.3877508640289307,
"learning_rate": 9.430052928723153e-05,
"loss": 2.0891,
"step": 166
},
{
"epoch": 3.15311004784689,
"grad_norm": 4.376189708709717,
"learning_rate": 9.326571030931637e-05,
"loss": 2.1974,
"step": 167
},
{
"epoch": 3.172248803827751,
"grad_norm": 3.557032823562622,
"learning_rate": 9.223161527109937e-05,
"loss": 1.9757,
"step": 168
},
{
"epoch": 3.1913875598086126,
"grad_norm": 2.733353853225708,
"learning_rate": 9.119835533834331e-05,
"loss": 2.4171,
"step": 169
},
{
"epoch": 3.2105263157894735,
"grad_norm": 2.7016165256500244,
"learning_rate": 9.016604158703654e-05,
"loss": 2.2992,
"step": 170
},
{
"epoch": 3.229665071770335,
"grad_norm": 3.997654438018799,
"learning_rate": 8.913478499145254e-05,
"loss": 2.1117,
"step": 171
},
{
"epoch": 3.248803827751196,
"grad_norm": 4.044878005981445,
"learning_rate": 8.810469641222001e-05,
"loss": 2.245,
"step": 172
},
{
"epoch": 3.2679425837320575,
"grad_norm": 3.080991506576538,
"learning_rate": 8.707588658440511e-05,
"loss": 2.1612,
"step": 173
},
{
"epoch": 3.287081339712919,
"grad_norm": 3.295807123184204,
"learning_rate": 8.604846610560771e-05,
"loss": 2.3217,
"step": 174
},
{
"epoch": 3.3062200956937797,
"grad_norm": 3.5904176235198975,
"learning_rate": 8.502254542407186e-05,
"loss": 2.3138,
"step": 175
},
{
"epoch": 3.325358851674641,
"grad_norm": 4.395754814147949,
"learning_rate": 8.399823482681262e-05,
"loss": 1.9074,
"step": 176
},
{
"epoch": 3.3444976076555024,
"grad_norm": 3.2221572399139404,
"learning_rate": 8.297564442776014e-05,
"loss": 2.2977,
"step": 177
},
{
"epoch": 3.3636363636363638,
"grad_norm": 2.9927215576171875,
"learning_rate": 8.195488415592238e-05,
"loss": 2.3785,
"step": 178
},
{
"epoch": 3.382775119617225,
"grad_norm": 3.9036011695861816,
"learning_rate": 8.093606374356759e-05,
"loss": 1.9962,
"step": 179
},
{
"epoch": 3.401913875598086,
"grad_norm": 4.485937595367432,
"learning_rate": 7.991929271442817e-05,
"loss": 1.7251,
"step": 180
},
{
"epoch": 3.4210526315789473,
"grad_norm": 4.750828742980957,
"learning_rate": 7.89046803719267e-05,
"loss": 2.1263,
"step": 181
},
{
"epoch": 3.4401913875598087,
"grad_norm": 4.138678550720215,
"learning_rate": 7.789233578742582e-05,
"loss": 2.0091,
"step": 182
},
{
"epoch": 3.45933014354067,
"grad_norm": 3.6726274490356445,
"learning_rate": 7.688236778850306e-05,
"loss": 2.3806,
"step": 183
},
{
"epoch": 3.478468899521531,
"grad_norm": 4.481295108795166,
"learning_rate": 7.587488494725157e-05,
"loss": 2.1338,
"step": 184
},
{
"epoch": 3.4976076555023923,
"grad_norm": 3.9401016235351562,
"learning_rate": 7.48699955686089e-05,
"loss": 2.1403,
"step": 185
},
{
"epoch": 3.5167464114832536,
"grad_norm": 4.227544784545898,
"learning_rate": 7.386780767871397e-05,
"loss": 2.3207,
"step": 186
},
{
"epoch": 3.535885167464115,
"grad_norm": 3.4885573387145996,
"learning_rate": 7.286842901329412e-05,
"loss": 2.2671,
"step": 187
},
{
"epoch": 3.555023923444976,
"grad_norm": 4.438218593597412,
"learning_rate": 7.187196700608373e-05,
"loss": 2.0748,
"step": 188
},
{
"epoch": 3.574162679425837,
"grad_norm": 3.766284465789795,
"learning_rate": 7.087852877727481e-05,
"loss": 2.5101,
"step": 189
},
{
"epoch": 3.5933014354066986,
"grad_norm": 4.027716636657715,
"learning_rate": 6.988822112200156e-05,
"loss": 2.3361,
"step": 190
},
{
"epoch": 3.61244019138756,
"grad_norm": 4.409999370574951,
"learning_rate": 6.890115049885994e-05,
"loss": 2.2492,
"step": 191
},
{
"epoch": 3.6315789473684212,
"grad_norm": 3.596459150314331,
"learning_rate": 6.791742301846326e-05,
"loss": 2.2855,
"step": 192
},
{
"epoch": 3.650717703349282,
"grad_norm": 4.667017459869385,
"learning_rate": 6.693714443203507e-05,
"loss": 2.083,
"step": 193
},
{
"epoch": 3.6698564593301435,
"grad_norm": 4.831173896789551,
"learning_rate": 6.59604201200412e-05,
"loss": 2.1568,
"step": 194
},
{
"epoch": 3.688995215311005,
"grad_norm": 3.5013201236724854,
"learning_rate": 6.498735508086093e-05,
"loss": 2.108,
"step": 195
},
{
"epoch": 3.708133971291866,
"grad_norm": 4.176932334899902,
"learning_rate": 6.40180539194999e-05,
"loss": 1.8315,
"step": 196
},
{
"epoch": 3.7272727272727275,
"grad_norm": 5.187565803527832,
"learning_rate": 6.305262083634488e-05,
"loss": 2.3541,
"step": 197
},
{
"epoch": 3.7464114832535884,
"grad_norm": 4.090083599090576,
"learning_rate": 6.209115961596208e-05,
"loss": 2.0691,
"step": 198
},
{
"epoch": 3.7655502392344498,
"grad_norm": 3.806030750274658,
"learning_rate": 6.113377361594049e-05,
"loss": 2.0471,
"step": 199
},
{
"epoch": 3.784688995215311,
"grad_norm": 4.668728828430176,
"learning_rate": 6.018056575578075e-05,
"loss": 2.335,
"step": 200
},
{
"epoch": 3.803827751196172,
"grad_norm": 4.811546325683594,
"learning_rate": 5.923163850583113e-05,
"loss": 2.2242,
"step": 201
},
{
"epoch": 3.8229665071770333,
"grad_norm": 5.359763145446777,
"learning_rate": 5.828709387627218e-05,
"loss": 2.3298,
"step": 202
},
{
"epoch": 3.8421052631578947,
"grad_norm": 3.5501046180725098,
"learning_rate": 5.73470334061505e-05,
"loss": 2.1297,
"step": 203
},
{
"epoch": 3.861244019138756,
"grad_norm": 3.4878952503204346,
"learning_rate": 5.6411558152462894e-05,
"loss": 2.3615,
"step": 204
},
{
"epoch": 3.8803827751196174,
"grad_norm": 4.381737232208252,
"learning_rate": 5.54807686792933e-05,
"loss": 2.0084,
"step": 205
},
{
"epoch": 3.8995215311004783,
"grad_norm": 5.2298359870910645,
"learning_rate": 5.4554765047001613e-05,
"loss": 2.362,
"step": 206
},
{
"epoch": 3.9186602870813396,
"grad_norm": 3.3613922595977783,
"learning_rate": 5.363364680146725e-05,
"loss": 2.1292,
"step": 207
},
{
"epoch": 3.937799043062201,
"grad_norm": 4.079115867614746,
"learning_rate": 5.271751296338823e-05,
"loss": 2.3561,
"step": 208
},
{
"epoch": 3.9569377990430623,
"grad_norm": 4.030163764953613,
"learning_rate": 5.180646201763577e-05,
"loss": 2.1954,
"step": 209
},
{
"epoch": 3.9760765550239237,
"grad_norm": 4.383935928344727,
"learning_rate": 5.090059190266779e-05,
"loss": 2.0793,
"step": 210
},
{
"epoch": 3.9952153110047846,
"grad_norm": 5.1565775871276855,
"learning_rate": 5.000000000000002e-05,
"loss": 2.0134,
"step": 211
},
{
"epoch": 4.0,
"grad_norm": 8.855152130126953,
"learning_rate": 4.9104783123737566e-05,
"loss": 2.4084,
"step": 212
},
{
"epoch": 4.019138755980861,
"grad_norm": 3.997187614440918,
"learning_rate": 4.821503751016746e-05,
"loss": 1.978,
"step": 213
},
{
"epoch": 4.038277511961723,
"grad_norm": 5.068262100219727,
"learning_rate": 4.733085880741301e-05,
"loss": 2.199,
"step": 214
},
{
"epoch": 4.057416267942584,
"grad_norm": 3.602715015411377,
"learning_rate": 4.645234206515171e-05,
"loss": 2.0417,
"step": 215
},
{
"epoch": 4.076555023923445,
"grad_norm": 4.461487293243408,
"learning_rate": 4.5579581724397255e-05,
"loss": 1.7777,
"step": 216
},
{
"epoch": 4.095693779904306,
"grad_norm": 3.1041159629821777,
"learning_rate": 4.471267160734731e-05,
"loss": 2.1061,
"step": 217
},
{
"epoch": 4.114832535885167,
"grad_norm": 3.8727328777313232,
"learning_rate": 4.385170490729712e-05,
"loss": 2.4006,
"step": 218
},
{
"epoch": 4.133971291866029,
"grad_norm": 3.868997097015381,
"learning_rate": 4.2996774178621736e-05,
"loss": 2.0803,
"step": 219
},
{
"epoch": 4.15311004784689,
"grad_norm": 3.5627689361572266,
"learning_rate": 4.2147971326825966e-05,
"loss": 2.0853,
"step": 220
},
{
"epoch": 4.172248803827751,
"grad_norm": 4.383954048156738,
"learning_rate": 4.130538759866457e-05,
"loss": 1.7176,
"step": 221
},
{
"epoch": 4.1913875598086126,
"grad_norm": 4.526845932006836,
"learning_rate": 4.046911357233343e-05,
"loss": 2.0527,
"step": 222
},
{
"epoch": 4.2105263157894735,
"grad_norm": 2.923349380493164,
"learning_rate": 3.963923914773187e-05,
"loss": 2.1541,
"step": 223
},
{
"epoch": 4.229665071770335,
"grad_norm": 4.575229167938232,
"learning_rate": 3.8815853536798904e-05,
"loss": 1.986,
"step": 224
},
{
"epoch": 4.248803827751196,
"grad_norm": 3.529787540435791,
"learning_rate": 3.79990452539225e-05,
"loss": 2.0539,
"step": 225
},
{
"epoch": 4.267942583732057,
"grad_norm": 4.581504821777344,
"learning_rate": 3.7188902106424416e-05,
"loss": 1.9526,
"step": 226
},
{
"epoch": 4.287081339712919,
"grad_norm": 3.2781484127044678,
"learning_rate": 3.638551118512089e-05,
"loss": 2.1485,
"step": 227
},
{
"epoch": 4.30622009569378,
"grad_norm": 3.7174124717712402,
"learning_rate": 3.558895885496023e-05,
"loss": 2.3293,
"step": 228
},
{
"epoch": 4.3253588516746415,
"grad_norm": 4.923449993133545,
"learning_rate": 3.479933074573858e-05,
"loss": 2.0144,
"step": 229
},
{
"epoch": 4.344497607655502,
"grad_norm": 4.567214488983154,
"learning_rate": 3.401671174289469e-05,
"loss": 1.8614,
"step": 230
},
{
"epoch": 4.363636363636363,
"grad_norm": 3.7625460624694824,
"learning_rate": 3.324118597838464e-05,
"loss": 2.1933,
"step": 231
},
{
"epoch": 4.382775119617225,
"grad_norm": 5.30003023147583,
"learning_rate": 3.2472836821637744e-05,
"loss": 2.0038,
"step": 232
},
{
"epoch": 4.401913875598086,
"grad_norm": 4.20980167388916,
"learning_rate": 3.1711746870594086e-05,
"loss": 1.9264,
"step": 233
},
{
"epoch": 4.421052631578947,
"grad_norm": 4.678532600402832,
"learning_rate": 3.0957997942825336e-05,
"loss": 1.9475,
"step": 234
},
{
"epoch": 4.440191387559809,
"grad_norm": 4.418569564819336,
"learning_rate": 3.021167106673928e-05,
"loss": 2.062,
"step": 235
},
{
"epoch": 4.45933014354067,
"grad_norm": 4.576781272888184,
"learning_rate": 2.9472846472869298e-05,
"loss": 2.2673,
"step": 236
},
{
"epoch": 4.478468899521531,
"grad_norm": 5.059473037719727,
"learning_rate": 2.874160358524931e-05,
"loss": 2.2399,
"step": 237
},
{
"epoch": 4.497607655502392,
"grad_norm": 5.032463073730469,
"learning_rate": 2.8018021012875994e-05,
"loss": 1.8512,
"step": 238
},
{
"epoch": 4.516746411483254,
"grad_norm": 4.410358428955078,
"learning_rate": 2.7302176541257986e-05,
"loss": 1.8909,
"step": 239
},
{
"epoch": 4.535885167464115,
"grad_norm": 4.2732319831848145,
"learning_rate": 2.659414712405398e-05,
"loss": 1.833,
"step": 240
},
{
"epoch": 4.555023923444976,
"grad_norm": 4.440384387969971,
"learning_rate": 2.5894008874800325e-05,
"loss": 1.8964,
"step": 241
},
{
"epoch": 4.574162679425838,
"grad_norm": 4.8430891036987305,
"learning_rate": 2.5201837058728505e-05,
"loss": 1.7943,
"step": 242
},
{
"epoch": 4.5933014354066986,
"grad_norm": 3.676851987838745,
"learning_rate": 2.451770608467432e-05,
"loss": 2.0328,
"step": 243
},
{
"epoch": 4.6124401913875595,
"grad_norm": 4.80816650390625,
"learning_rate": 2.3841689497078746e-05,
"loss": 2.1791,
"step": 244
},
{
"epoch": 4.631578947368421,
"grad_norm": 4.105157852172852,
"learning_rate": 2.3173859968081944e-05,
"loss": 2.2402,
"step": 245
},
{
"epoch": 4.650717703349282,
"grad_norm": 5.055697441101074,
"learning_rate": 2.251428928971102e-05,
"loss": 2.2174,
"step": 246
},
{
"epoch": 4.669856459330144,
"grad_norm": 5.220304012298584,
"learning_rate": 2.1863048366162208e-05,
"loss": 2.163,
"step": 247
},
{
"epoch": 4.688995215311005,
"grad_norm": 5.349198818206787,
"learning_rate": 2.1220207206178688e-05,
"loss": 1.8591,
"step": 248
},
{
"epoch": 4.708133971291866,
"grad_norm": 3.800992012023926,
"learning_rate": 2.058583491552465e-05,
"loss": 2.1511,
"step": 249
},
{
"epoch": 4.7272727272727275,
"grad_norm": 4.178462982177734,
"learning_rate": 1.995999968955641e-05,
"loss": 2.2553,
"step": 250
},
{
"epoch": 4.746411483253588,
"grad_norm": 5.495607852935791,
"learning_rate": 1.9342768805891178e-05,
"loss": 2.022,
"step": 251
},
{
"epoch": 4.76555023923445,
"grad_norm": 4.614135265350342,
"learning_rate": 1.8734208617174988e-05,
"loss": 2.1751,
"step": 252
},
{
"epoch": 4.784688995215311,
"grad_norm": 3.8945748805999756,
"learning_rate": 1.8134384543949478e-05,
"loss": 2.0986,
"step": 253
},
{
"epoch": 4.803827751196172,
"grad_norm": 5.491265773773193,
"learning_rate": 1.754336106761927e-05,
"loss": 1.8482,
"step": 254
},
{
"epoch": 4.822966507177034,
"grad_norm": 5.249953269958496,
"learning_rate": 1.696120172352025e-05,
"loss": 1.8416,
"step": 255
},
{
"epoch": 4.842105263157895,
"grad_norm": 4.254781246185303,
"learning_rate": 1.6387969094089316e-05,
"loss": 2.0863,
"step": 256
},
{
"epoch": 4.861244019138756,
"grad_norm": 5.3179779052734375,
"learning_rate": 1.5823724802136865e-05,
"loss": 2.2049,
"step": 257
},
{
"epoch": 4.880382775119617,
"grad_norm": 5.007632732391357,
"learning_rate": 1.526852950422226e-05,
"loss": 1.9023,
"step": 258
},
{
"epoch": 4.899521531100478,
"grad_norm": 3.3414082527160645,
"learning_rate": 1.4722442884133214e-05,
"loss": 2.0638,
"step": 259
},
{
"epoch": 4.91866028708134,
"grad_norm": 4.421596050262451,
"learning_rate": 1.4185523646469822e-05,
"loss": 2.0366,
"step": 260
},
{
"epoch": 4.937799043062201,
"grad_norm": 5.466579914093018,
"learning_rate": 1.3657829510333654e-05,
"loss": 1.8725,
"step": 261
},
{
"epoch": 4.956937799043062,
"grad_norm": 3.617340326309204,
"learning_rate": 1.3139417203123027e-05,
"loss": 2.0564,
"step": 262
},
{
"epoch": 4.976076555023924,
"grad_norm": 3.936239719390869,
"learning_rate": 1.263034245443473e-05,
"loss": 1.8447,
"step": 263
},
{
"epoch": 4.9952153110047846,
"grad_norm": 5.094753265380859,
"learning_rate": 1.2130659990073146e-05,
"loss": 1.7893,
"step": 264
},
{
"epoch": 5.0,
"grad_norm": 8.643269538879395,
"learning_rate": 1.1640423526166988e-05,
"loss": 1.6151,
"step": 265
},
{
"epoch": 5.019138755980861,
"grad_norm": 3.2149994373321533,
"learning_rate": 1.1159685763395111e-05,
"loss": 2.0046,
"step": 266
},
{
"epoch": 5.038277511961723,
"grad_norm": 4.97629976272583,
"learning_rate": 1.0688498381320855e-05,
"loss": 1.8201,
"step": 267
},
{
"epoch": 5.057416267942584,
"grad_norm": 3.756946325302124,
"learning_rate": 1.0226912032836611e-05,
"loss": 2.0823,
"step": 268
},
{
"epoch": 5.076555023923445,
"grad_norm": 3.5998761653900146,
"learning_rate": 9.774976338718677e-06,
"loss": 2.1409,
"step": 269
},
{
"epoch": 5.095693779904306,
"grad_norm": 4.599725246429443,
"learning_rate": 9.332739882292752e-06,
"loss": 1.9638,
"step": 270
},
{
"epoch": 5.114832535885167,
"grad_norm": 5.403920650482178,
"learning_rate": 8.900250204211514e-06,
"loss": 1.9577,
"step": 271
},
{
"epoch": 5.133971291866029,
"grad_norm": 4.915902137756348,
"learning_rate": 8.47755379734373e-06,
"loss": 2.0126,
"step": 272
},
{
"epoch": 5.15311004784689,
"grad_norm": 5.105212688446045,
"learning_rate": 8.064696101776358e-06,
"loss": 1.9987,
"step": 273
},
{
"epoch": 5.172248803827751,
"grad_norm": 4.95185661315918,
"learning_rate": 7.661721499929753e-06,
"loss": 1.7857,
"step": 274
},
{
"epoch": 5.1913875598086126,
"grad_norm": 4.504748344421387,
"learning_rate": 7.2686733117863784e-06,
"loss": 1.8317,
"step": 275
},
{
"epoch": 5.2105263157894735,
"grad_norm": 4.897287845611572,
"learning_rate": 6.8855937902340576e-06,
"loss": 1.8711,
"step": 276
},
{
"epoch": 5.229665071770335,
"grad_norm": 4.072137355804443,
"learning_rate": 6.512524116523633e-06,
"loss": 2.0629,
"step": 277
},
{
"epoch": 5.248803827751196,
"grad_norm": 3.6332151889801025,
"learning_rate": 6.149504395842087e-06,
"loss": 2.1024,
"step": 278
},
{
"epoch": 5.267942583732057,
"grad_norm": 3.8086438179016113,
"learning_rate": 5.7965736530010916e-06,
"loss": 2.247,
"step": 279
},
{
"epoch": 5.287081339712919,
"grad_norm": 3.1464338302612305,
"learning_rate": 5.453769828241872e-06,
"loss": 2.205,
"step": 280
},
{
"epoch": 5.30622009569378,
"grad_norm": 4.133326530456543,
"learning_rate": 5.121129773156663e-06,
"loss": 1.9466,
"step": 281
},
{
"epoch": 5.3253588516746415,
"grad_norm": 3.292668342590332,
"learning_rate": 4.798689246727006e-06,
"loss": 2.1296,
"step": 282
},
{
"epoch": 5.344497607655502,
"grad_norm": 3.0857577323913574,
"learning_rate": 4.486482911479839e-06,
"loss": 2.1429,
"step": 283
},
{
"epoch": 5.363636363636363,
"grad_norm": 3.311474084854126,
"learning_rate": 4.184544329761009e-06,
"loss": 1.9996,
"step": 284
},
{
"epoch": 5.382775119617225,
"grad_norm": 4.887283802032471,
"learning_rate": 3.892905960127546e-06,
"loss": 1.9941,
"step": 285
},
{
"epoch": 5.401913875598086,
"grad_norm": 4.46961784362793,
"learning_rate": 3.611599153858214e-06,
"loss": 2.0069,
"step": 286
},
{
"epoch": 5.421052631578947,
"grad_norm": 4.479908466339111,
"learning_rate": 3.3406541515832003e-06,
"loss": 1.9726,
"step": 287
},
{
"epoch": 5.440191387559809,
"grad_norm": 4.271525859832764,
"learning_rate": 3.0801000800333877e-06,
"loss": 2.0064,
"step": 288
},
{
"epoch": 5.45933014354067,
"grad_norm": 4.767016410827637,
"learning_rate": 2.8299649489090475e-06,
"loss": 1.7359,
"step": 289
},
{
"epoch": 5.478468899521531,
"grad_norm": 4.151036262512207,
"learning_rate": 2.590275647868867e-06,
"loss": 1.9747,
"step": 290
},
{
"epoch": 5.497607655502392,
"grad_norm": 5.108813285827637,
"learning_rate": 2.3610579436393e-06,
"loss": 1.9633,
"step": 291
},
{
"epoch": 5.516746411483254,
"grad_norm": 5.201232433319092,
"learning_rate": 2.1423364772445887e-06,
"loss": 1.9408,
"step": 292
},
{
"epoch": 5.535885167464115,
"grad_norm": 3.8995492458343506,
"learning_rate": 1.9341347613579087e-06,
"loss": 1.9715,
"step": 293
},
{
"epoch": 5.555023923444976,
"grad_norm": 5.370357036590576,
"learning_rate": 1.7364751777736332e-06,
"loss": 1.9897,
"step": 294
},
{
"epoch": 5.574162679425838,
"grad_norm": 3.702716588973999,
"learning_rate": 1.5493789750014031e-06,
"loss": 1.8739,
"step": 295
},
{
"epoch": 5.5933014354066986,
"grad_norm": 4.484430313110352,
"learning_rate": 1.3728662659818204e-06,
"loss": 2.0162,
"step": 296
},
{
"epoch": 5.6124401913875595,
"grad_norm": 4.100718975067139,
"learning_rate": 1.2069560259243328e-06,
"loss": 2.0228,
"step": 297
},
{
"epoch": 5.631578947368421,
"grad_norm": 4.891741752624512,
"learning_rate": 1.0516660902673448e-06,
"loss": 1.9772,
"step": 298
},
{
"epoch": 5.650717703349282,
"grad_norm": 4.323902130126953,
"learning_rate": 9.070131527609604e-07,
"loss": 1.8804,
"step": 299
},
{
"epoch": 5.669856459330144,
"grad_norm": 5.198728561401367,
"learning_rate": 7.730127636723539e-07,
"loss": 1.6451,
"step": 300
},
{
"epoch": 5.688995215311005,
"grad_norm": 4.846747398376465,
"learning_rate": 6.496793281141056e-07,
"loss": 1.9766,
"step": 301
},
{
"epoch": 5.708133971291866,
"grad_norm": 5.043095588684082,
"learning_rate": 5.370261044956971e-07,
"loss": 2.1393,
"step": 302
},
{
"epoch": 5.7272727272727275,
"grad_norm": 4.933630466461182,
"learning_rate": 4.3506520309813947e-07,
"loss": 1.8928,
"step": 303
},
{
"epoch": 5.746411483253588,
"grad_norm": 4.211745738983154,
"learning_rate": 3.4380758477219333e-07,
"loss": 1.9026,
"step": 304
},
{
"epoch": 5.76555023923445,
"grad_norm": 5.295810222625732,
"learning_rate": 2.6326305976001055e-07,
"loss": 2.0479,
"step": 305
},
{
"epoch": 5.784688995215311,
"grad_norm": 4.567193984985352,
"learning_rate": 1.9344028664056713e-07,
"loss": 2.1354,
"step": 306
},
{
"epoch": 5.803827751196172,
"grad_norm": 4.380620002746582,
"learning_rate": 1.3434677139885222e-07,
"loss": 1.952,
"step": 307
},
{
"epoch": 5.822966507177034,
"grad_norm": 4.634738922119141,
"learning_rate": 8.598886661895788e-08,
"loss": 1.6821,
"step": 308
},
{
"epoch": 5.842105263157895,
"grad_norm": 4.653122901916504,
"learning_rate": 4.837177080119215e-08,
"loss": 1.8293,
"step": 309
},
{
"epoch": 5.861244019138756,
"grad_norm": 5.309375286102295,
"learning_rate": 2.1499527803214846e-08,
"loss": 1.7619,
"step": 310
},
{
"epoch": 5.880382775119617,
"grad_norm": 5.267163276672363,
"learning_rate": 5.375026405352035e-09,
"loss": 1.9069,
"step": 311
},
{
"epoch": 5.899521531100478,
"grad_norm": 3.548534870147705,
"learning_rate": 0.0,
"loss": 2.1263,
"step": 312
}
],
"logging_steps": 1,
"max_steps": 312,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 876589621248000.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}