Safetensors
llama
LightThinker-Plus-Llama / trainer_state.json
Yukirsh's picture
Upload folder using huggingface_hub
b996f58 verified
Raw
History Blame Contribute Delete
50.2 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 240,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0125,
"grad_norm": 10.18331757027076,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.8960644006729126,
"num_input_tokens_seen": 0,
"step": 1
},
{
"epoch": 0.025,
"grad_norm": 9.16263025394979,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.8488595485687256,
"num_input_tokens_seen": 0,
"step": 2
},
{
"epoch": 0.0375,
"grad_norm": 8.92834992115862,
"learning_rate": 3e-06,
"loss": 0.8538516759872437,
"num_input_tokens_seen": 0,
"step": 3
},
{
"epoch": 0.05,
"grad_norm": 9.282181600009132,
"learning_rate": 4.000000000000001e-06,
"loss": 0.856260359287262,
"num_input_tokens_seen": 0,
"step": 4
},
{
"epoch": 0.0625,
"grad_norm": 5.998755609205955,
"learning_rate": 5e-06,
"loss": 0.7300517559051514,
"num_input_tokens_seen": 0,
"step": 5
},
{
"epoch": 0.075,
"grad_norm": 4.585755656488519,
"learning_rate": 6e-06,
"loss": 0.6810914278030396,
"num_input_tokens_seen": 0,
"step": 6
},
{
"epoch": 0.0875,
"grad_norm": 3.5581298112309763,
"learning_rate": 7e-06,
"loss": 0.6221330761909485,
"num_input_tokens_seen": 0,
"step": 7
},
{
"epoch": 0.1,
"grad_norm": 4.186415400005925,
"learning_rate": 8.000000000000001e-06,
"loss": 0.5942986011505127,
"num_input_tokens_seen": 0,
"step": 8
},
{
"epoch": 0.1125,
"grad_norm": 2.0815923930377633,
"learning_rate": 9e-06,
"loss": 0.5573145151138306,
"num_input_tokens_seen": 0,
"step": 9
},
{
"epoch": 0.125,
"grad_norm": 1.538737138740704,
"learning_rate": 1e-05,
"loss": 0.5063657164573669,
"num_input_tokens_seen": 0,
"step": 10
},
{
"epoch": 0.1375,
"grad_norm": 1.3167709252541377,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.5080946087837219,
"num_input_tokens_seen": 0,
"step": 11
},
{
"epoch": 0.15,
"grad_norm": 1.0093046488935542,
"learning_rate": 1.2e-05,
"loss": 0.5118667483329773,
"num_input_tokens_seen": 0,
"step": 12
},
{
"epoch": 0.1625,
"grad_norm": 1.0176825029370344,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.4826335608959198,
"num_input_tokens_seen": 0,
"step": 13
},
{
"epoch": 0.175,
"grad_norm": 0.868692377639643,
"learning_rate": 1.4e-05,
"loss": 0.4561425447463989,
"num_input_tokens_seen": 0,
"step": 14
},
{
"epoch": 0.1875,
"grad_norm": 0.7576207830400876,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.46192091703414917,
"num_input_tokens_seen": 0,
"step": 15
},
{
"epoch": 0.2,
"grad_norm": 0.6607030167581055,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.436120867729187,
"num_input_tokens_seen": 0,
"step": 16
},
{
"epoch": 0.2125,
"grad_norm": 0.6388324060478595,
"learning_rate": 1.7e-05,
"loss": 0.43788450956344604,
"num_input_tokens_seen": 0,
"step": 17
},
{
"epoch": 0.225,
"grad_norm": 0.6522444786796912,
"learning_rate": 1.8e-05,
"loss": 0.42514723539352417,
"num_input_tokens_seen": 0,
"step": 18
},
{
"epoch": 0.2375,
"grad_norm": 0.5939148285210124,
"learning_rate": 1.9e-05,
"loss": 0.41062527894973755,
"num_input_tokens_seen": 0,
"step": 19
},
{
"epoch": 0.25,
"grad_norm": 0.6228429094034043,
"learning_rate": 2e-05,
"loss": 0.3994959890842438,
"num_input_tokens_seen": 0,
"step": 20
},
{
"epoch": 0.2625,
"grad_norm": 0.5940088612229671,
"learning_rate": 1.9999658256641746e-05,
"loss": 0.41216734051704407,
"num_input_tokens_seen": 0,
"step": 21
},
{
"epoch": 0.275,
"grad_norm": 0.5920054647245636,
"learning_rate": 1.9998633049924693e-05,
"loss": 0.41810792684555054,
"num_input_tokens_seen": 0,
"step": 22
},
{
"epoch": 0.2875,
"grad_norm": 0.5480649368192407,
"learning_rate": 1.999692444992035e-05,
"loss": 0.4036347568035126,
"num_input_tokens_seen": 0,
"step": 23
},
{
"epoch": 0.3,
"grad_norm": 0.5565757348287612,
"learning_rate": 1.999453257340926e-05,
"loss": 0.419705331325531,
"num_input_tokens_seen": 0,
"step": 24
},
{
"epoch": 0.3125,
"grad_norm": 0.5548087155409337,
"learning_rate": 1.999145758387301e-05,
"loss": 0.40048182010650635,
"num_input_tokens_seen": 0,
"step": 25
},
{
"epoch": 0.325,
"grad_norm": 0.5338888290743757,
"learning_rate": 1.998769969148305e-05,
"loss": 0.40444415807724,
"num_input_tokens_seen": 0,
"step": 26
},
{
"epoch": 0.3375,
"grad_norm": 0.5018287159150987,
"learning_rate": 1.9983259153086328e-05,
"loss": 0.3908202052116394,
"num_input_tokens_seen": 0,
"step": 27
},
{
"epoch": 0.35,
"grad_norm": 0.5666363779267786,
"learning_rate": 1.9978136272187745e-05,
"loss": 0.3917597532272339,
"num_input_tokens_seen": 0,
"step": 28
},
{
"epoch": 0.3625,
"grad_norm": 0.5311731782089377,
"learning_rate": 1.997233139892941e-05,
"loss": 0.38186532258987427,
"num_input_tokens_seen": 0,
"step": 29
},
{
"epoch": 0.375,
"grad_norm": 0.5017076164418427,
"learning_rate": 1.99658449300667e-05,
"loss": 0.3953409492969513,
"num_input_tokens_seen": 0,
"step": 30
},
{
"epoch": 0.3875,
"grad_norm": 0.5191236527322374,
"learning_rate": 1.995867730894114e-05,
"loss": 0.3810531198978424,
"num_input_tokens_seen": 0,
"step": 31
},
{
"epoch": 0.4,
"grad_norm": 0.479389601325034,
"learning_rate": 1.9950829025450116e-05,
"loss": 0.3787350058555603,
"num_input_tokens_seen": 0,
"step": 32
},
{
"epoch": 0.4125,
"grad_norm": 0.5052682473625102,
"learning_rate": 1.9942300616013378e-05,
"loss": 0.3784908056259155,
"num_input_tokens_seen": 0,
"step": 33
},
{
"epoch": 0.425,
"grad_norm": 0.5093747529791427,
"learning_rate": 1.9933092663536384e-05,
"loss": 0.377264142036438,
"num_input_tokens_seen": 0,
"step": 34
},
{
"epoch": 0.4375,
"grad_norm": 0.5496127833092357,
"learning_rate": 1.992320579737045e-05,
"loss": 0.4013134837150574,
"num_input_tokens_seen": 0,
"step": 35
},
{
"epoch": 0.45,
"grad_norm": 0.5194182976027281,
"learning_rate": 1.9912640693269754e-05,
"loss": 0.3868390917778015,
"num_input_tokens_seen": 0,
"step": 36
},
{
"epoch": 0.4625,
"grad_norm": 0.4772201831075947,
"learning_rate": 1.990139807334512e-05,
"loss": 0.3554729223251343,
"num_input_tokens_seen": 0,
"step": 37
},
{
"epoch": 0.475,
"grad_norm": 0.5411145901050805,
"learning_rate": 1.9889478706014687e-05,
"loss": 0.3768259882926941,
"num_input_tokens_seen": 0,
"step": 38
},
{
"epoch": 0.4875,
"grad_norm": 0.47151248168784593,
"learning_rate": 1.9876883405951378e-05,
"loss": 0.3637596368789673,
"num_input_tokens_seen": 0,
"step": 39
},
{
"epoch": 0.5,
"grad_norm": 0.4900043722490537,
"learning_rate": 1.9863613034027224e-05,
"loss": 0.3731005787849426,
"num_input_tokens_seen": 0,
"step": 40
},
{
"epoch": 0.5125,
"grad_norm": 0.4740418320125202,
"learning_rate": 1.984966849725452e-05,
"loss": 0.36029529571533203,
"num_input_tokens_seen": 0,
"step": 41
},
{
"epoch": 0.525,
"grad_norm": 0.5282467622205699,
"learning_rate": 1.9835050748723826e-05,
"loss": 0.371197909116745,
"num_input_tokens_seen": 0,
"step": 42
},
{
"epoch": 0.5375,
"grad_norm": 0.4755646348335201,
"learning_rate": 1.981976078753884e-05,
"loss": 0.34178704023361206,
"num_input_tokens_seen": 0,
"step": 43
},
{
"epoch": 0.55,
"grad_norm": 0.507685264776996,
"learning_rate": 1.9803799658748096e-05,
"loss": 0.3663065731525421,
"num_input_tokens_seen": 0,
"step": 44
},
{
"epoch": 0.5625,
"grad_norm": 0.5265705406067266,
"learning_rate": 1.9787168453273546e-05,
"loss": 0.3737153708934784,
"num_input_tokens_seen": 0,
"step": 45
},
{
"epoch": 0.575,
"grad_norm": 0.5571656899248943,
"learning_rate": 1.9769868307835996e-05,
"loss": 0.3628424406051636,
"num_input_tokens_seen": 0,
"step": 46
},
{
"epoch": 0.5875,
"grad_norm": 0.569279834268425,
"learning_rate": 1.97519004048774e-05,
"loss": 0.37988948822021484,
"num_input_tokens_seen": 0,
"step": 47
},
{
"epoch": 0.6,
"grad_norm": 0.4969114483932452,
"learning_rate": 1.973326597248006e-05,
"loss": 0.3636181950569153,
"num_input_tokens_seen": 0,
"step": 48
},
{
"epoch": 0.6125,
"grad_norm": 0.5439018148028109,
"learning_rate": 1.9713966284282677e-05,
"loss": 0.3687742352485657,
"num_input_tokens_seen": 0,
"step": 49
},
{
"epoch": 0.625,
"grad_norm": 0.47096960108233016,
"learning_rate": 1.9694002659393306e-05,
"loss": 0.3656526207923889,
"num_input_tokens_seen": 0,
"step": 50
},
{
"epoch": 0.6375,
"grad_norm": 0.49069042740477836,
"learning_rate": 1.9673376462299186e-05,
"loss": 0.3719867467880249,
"num_input_tokens_seen": 0,
"step": 51
},
{
"epoch": 0.65,
"grad_norm": 0.5230240054949966,
"learning_rate": 1.9652089102773487e-05,
"loss": 0.3670026361942291,
"num_input_tokens_seen": 0,
"step": 52
},
{
"epoch": 0.6625,
"grad_norm": 0.539667408085974,
"learning_rate": 1.963014203577896e-05,
"loss": 0.3777502775192261,
"num_input_tokens_seen": 0,
"step": 53
},
{
"epoch": 0.675,
"grad_norm": 0.5303927925423055,
"learning_rate": 1.9607536761368484e-05,
"loss": 0.360956609249115,
"num_input_tokens_seen": 0,
"step": 54
},
{
"epoch": 0.6875,
"grad_norm": 0.5669762895801259,
"learning_rate": 1.958427482458253e-05,
"loss": 0.3712082505226135,
"num_input_tokens_seen": 0,
"step": 55
},
{
"epoch": 0.7,
"grad_norm": 0.4924233132486405,
"learning_rate": 1.9560357815343577e-05,
"loss": 0.34069931507110596,
"num_input_tokens_seen": 0,
"step": 56
},
{
"epoch": 0.7125,
"grad_norm": 0.4939181685825231,
"learning_rate": 1.9535787368347444e-05,
"loss": 0.3445311188697815,
"num_input_tokens_seen": 0,
"step": 57
},
{
"epoch": 0.725,
"grad_norm": 0.5132891832051437,
"learning_rate": 1.9510565162951538e-05,
"loss": 0.352506160736084,
"num_input_tokens_seen": 0,
"step": 58
},
{
"epoch": 0.7375,
"grad_norm": 0.5228074950046403,
"learning_rate": 1.9484692923060095e-05,
"loss": 0.352910578250885,
"num_input_tokens_seen": 0,
"step": 59
},
{
"epoch": 0.75,
"grad_norm": 0.48331453046149664,
"learning_rate": 1.9458172417006347e-05,
"loss": 0.3518715500831604,
"num_input_tokens_seen": 0,
"step": 60
},
{
"epoch": 0.7625,
"grad_norm": 0.4971643896288794,
"learning_rate": 1.9431005457431654e-05,
"loss": 0.3532984256744385,
"num_input_tokens_seen": 0,
"step": 61
},
{
"epoch": 0.775,
"grad_norm": 0.5331008024051833,
"learning_rate": 1.9403193901161614e-05,
"loss": 0.3648609220981598,
"num_input_tokens_seen": 0,
"step": 62
},
{
"epoch": 0.7875,
"grad_norm": 1.2839239169737044,
"learning_rate": 1.9374739649079155e-05,
"loss": 0.3517247438430786,
"num_input_tokens_seen": 0,
"step": 63
},
{
"epoch": 0.8,
"grad_norm": 0.5877379242744403,
"learning_rate": 1.934564464599461e-05,
"loss": 0.3657301962375641,
"num_input_tokens_seen": 0,
"step": 64
},
{
"epoch": 0.8125,
"grad_norm": 0.4867340909281986,
"learning_rate": 1.9315910880512792e-05,
"loss": 0.36049771308898926,
"num_input_tokens_seen": 0,
"step": 65
},
{
"epoch": 0.825,
"grad_norm": 0.49356286585265735,
"learning_rate": 1.9285540384897073e-05,
"loss": 0.34778010845184326,
"num_input_tokens_seen": 0,
"step": 66
},
{
"epoch": 0.8375,
"grad_norm": 0.5535250380039742,
"learning_rate": 1.9254535234930486e-05,
"loss": 0.36141228675842285,
"num_input_tokens_seen": 0,
"step": 67
},
{
"epoch": 0.85,
"grad_norm": 0.5243933357988406,
"learning_rate": 1.922289754977385e-05,
"loss": 0.3524037301540375,
"num_input_tokens_seen": 0,
"step": 68
},
{
"epoch": 0.8625,
"grad_norm": 0.5324496372479985,
"learning_rate": 1.919062949182091e-05,
"loss": 0.36538389325141907,
"num_input_tokens_seen": 0,
"step": 69
},
{
"epoch": 0.875,
"grad_norm": 0.49491533066436794,
"learning_rate": 1.9157733266550577e-05,
"loss": 0.35084018111228943,
"num_input_tokens_seen": 0,
"step": 70
},
{
"epoch": 0.8875,
"grad_norm": 0.5287693804586798,
"learning_rate": 1.9124211122376138e-05,
"loss": 0.3459678292274475,
"num_input_tokens_seen": 0,
"step": 71
},
{
"epoch": 0.9,
"grad_norm": 0.4675049739386483,
"learning_rate": 1.909006535049163e-05,
"loss": 0.34027597308158875,
"num_input_tokens_seen": 0,
"step": 72
},
{
"epoch": 0.9125,
"grad_norm": 0.5068359141834208,
"learning_rate": 1.9055298284715192e-05,
"loss": 0.34234559535980225,
"num_input_tokens_seen": 0,
"step": 73
},
{
"epoch": 0.925,
"grad_norm": 0.5454693950983837,
"learning_rate": 1.9019912301329593e-05,
"loss": 0.3697267770767212,
"num_input_tokens_seen": 0,
"step": 74
},
{
"epoch": 0.9375,
"grad_norm": 0.48781498197167644,
"learning_rate": 1.898390981891979e-05,
"loss": 0.3399868607521057,
"num_input_tokens_seen": 0,
"step": 75
},
{
"epoch": 0.95,
"grad_norm": 0.467529345150941,
"learning_rate": 1.8947293298207637e-05,
"loss": 0.3481365442276001,
"num_input_tokens_seen": 0,
"step": 76
},
{
"epoch": 0.9625,
"grad_norm": 0.5173617613436983,
"learning_rate": 1.891006524188368e-05,
"loss": 0.35163643956184387,
"num_input_tokens_seen": 0,
"step": 77
},
{
"epoch": 0.975,
"grad_norm": 0.4964553718237553,
"learning_rate": 1.887222819443612e-05,
"loss": 0.3495897650718689,
"num_input_tokens_seen": 0,
"step": 78
},
{
"epoch": 0.9875,
"grad_norm": 0.5266061782006969,
"learning_rate": 1.883378474197689e-05,
"loss": 0.3525278568267822,
"num_input_tokens_seen": 0,
"step": 79
},
{
"epoch": 1.0,
"grad_norm": 0.48903026352892565,
"learning_rate": 1.879473751206489e-05,
"loss": 0.3540067672729492,
"num_input_tokens_seen": 0,
"step": 80
},
{
"epoch": 1.0125,
"grad_norm": 0.8791291129207356,
"learning_rate": 1.875508917352643e-05,
"loss": 0.26043254137039185,
"num_input_tokens_seen": 0,
"step": 81
},
{
"epoch": 1.025,
"grad_norm": 0.6487054540761681,
"learning_rate": 1.8714842436272774e-05,
"loss": 0.2440587878227234,
"num_input_tokens_seen": 0,
"step": 82
},
{
"epoch": 1.0375,
"grad_norm": 0.6795282145993521,
"learning_rate": 1.8674000051114953e-05,
"loss": 0.2597944140434265,
"num_input_tokens_seen": 0,
"step": 83
},
{
"epoch": 1.05,
"grad_norm": 0.6670449632773737,
"learning_rate": 1.863256480957574e-05,
"loss": 0.23764365911483765,
"num_input_tokens_seen": 0,
"step": 84
},
{
"epoch": 1.0625,
"grad_norm": 0.8445000033015495,
"learning_rate": 1.8590539543698852e-05,
"loss": 0.22128596901893616,
"num_input_tokens_seen": 0,
"step": 85
},
{
"epoch": 1.075,
"grad_norm": 0.7319387611539306,
"learning_rate": 1.854792712585539e-05,
"loss": 0.22451584041118622,
"num_input_tokens_seen": 0,
"step": 86
},
{
"epoch": 1.0875,
"grad_norm": 0.7347266658746741,
"learning_rate": 1.8504730468547508e-05,
"loss": 0.23484283685684204,
"num_input_tokens_seen": 0,
"step": 87
},
{
"epoch": 1.1,
"grad_norm": 0.684429737748084,
"learning_rate": 1.8460952524209355e-05,
"loss": 0.22682486474514008,
"num_input_tokens_seen": 0,
"step": 88
},
{
"epoch": 1.1125,
"grad_norm": 0.6253196189027981,
"learning_rate": 1.8416596285005274e-05,
"loss": 0.23638179898262024,
"num_input_tokens_seen": 0,
"step": 89
},
{
"epoch": 1.125,
"grad_norm": 0.7140008070317728,
"learning_rate": 1.8371664782625287e-05,
"loss": 0.22379158437252045,
"num_input_tokens_seen": 0,
"step": 90
},
{
"epoch": 1.1375,
"grad_norm": 0.5609948172492557,
"learning_rate": 1.8326161088077905e-05,
"loss": 0.23043715953826904,
"num_input_tokens_seen": 0,
"step": 91
},
{
"epoch": 1.15,
"grad_norm": 0.5762843622395748,
"learning_rate": 1.8280088311480203e-05,
"loss": 0.22419373691082,
"num_input_tokens_seen": 0,
"step": 92
},
{
"epoch": 1.1625,
"grad_norm": 0.7331777727867734,
"learning_rate": 1.823344960184526e-05,
"loss": 0.22210514545440674,
"num_input_tokens_seen": 0,
"step": 93
},
{
"epoch": 1.175,
"grad_norm": 0.6205497447918622,
"learning_rate": 1.8186248146866928e-05,
"loss": 0.23843634128570557,
"num_input_tokens_seen": 0,
"step": 94
},
{
"epoch": 1.1875,
"grad_norm": 0.645067422698941,
"learning_rate": 1.813848717270195e-05,
"loss": 0.22561757266521454,
"num_input_tokens_seen": 0,
"step": 95
},
{
"epoch": 1.2,
"grad_norm": 0.5305489735340998,
"learning_rate": 1.8090169943749477e-05,
"loss": 0.23090419173240662,
"num_input_tokens_seen": 0,
"step": 96
},
{
"epoch": 1.2125,
"grad_norm": 0.652421080289407,
"learning_rate": 1.804129976242792e-05,
"loss": 0.21994651854038239,
"num_input_tokens_seen": 0,
"step": 97
},
{
"epoch": 1.225,
"grad_norm": 0.546464902083412,
"learning_rate": 1.7991879968949248e-05,
"loss": 0.2290341556072235,
"num_input_tokens_seen": 0,
"step": 98
},
{
"epoch": 1.2375,
"grad_norm": 0.6476920457708351,
"learning_rate": 1.7941913941090712e-05,
"loss": 0.24561816453933716,
"num_input_tokens_seen": 0,
"step": 99
},
{
"epoch": 1.25,
"grad_norm": 0.5782245623531915,
"learning_rate": 1.789140509396394e-05,
"loss": 0.23482097685337067,
"num_input_tokens_seen": 0,
"step": 100
},
{
"epoch": 1.2625,
"grad_norm": 0.6348611730944753,
"learning_rate": 1.784035687978153e-05,
"loss": 0.21998700499534607,
"num_input_tokens_seen": 0,
"step": 101
},
{
"epoch": 1.275,
"grad_norm": 0.5855684501933186,
"learning_rate": 1.7788772787621126e-05,
"loss": 0.22731050848960876,
"num_input_tokens_seen": 0,
"step": 102
},
{
"epoch": 1.2875,
"grad_norm": 0.5849783819104052,
"learning_rate": 1.7736656343186897e-05,
"loss": 0.21847045421600342,
"num_input_tokens_seen": 0,
"step": 103
},
{
"epoch": 1.3,
"grad_norm": 0.6468451663174432,
"learning_rate": 1.7684011108568593e-05,
"loss": 0.23889131844043732,
"num_input_tokens_seen": 0,
"step": 104
},
{
"epoch": 1.3125,
"grad_norm": 0.5379228455435396,
"learning_rate": 1.7630840681998068e-05,
"loss": 0.22917379438877106,
"num_input_tokens_seen": 0,
"step": 105
},
{
"epoch": 1.325,
"grad_norm": 0.6386415797352409,
"learning_rate": 1.757714869760335e-05,
"loss": 0.22476345300674438,
"num_input_tokens_seen": 0,
"step": 106
},
{
"epoch": 1.3375,
"grad_norm": 0.5432607518100792,
"learning_rate": 1.752293882516025e-05,
"loss": 0.23026630282402039,
"num_input_tokens_seen": 0,
"step": 107
},
{
"epoch": 1.35,
"grad_norm": 0.6251676560462321,
"learning_rate": 1.7468214769841542e-05,
"loss": 0.22793394327163696,
"num_input_tokens_seen": 0,
"step": 108
},
{
"epoch": 1.3625,
"grad_norm": 0.5786323989132912,
"learning_rate": 1.7412980271963712e-05,
"loss": 0.24610519409179688,
"num_input_tokens_seen": 0,
"step": 109
},
{
"epoch": 1.375,
"grad_norm": 0.5735661628137917,
"learning_rate": 1.735723910673132e-05,
"loss": 0.21196551620960236,
"num_input_tokens_seen": 0,
"step": 110
},
{
"epoch": 1.3875,
"grad_norm": 0.5881031218701362,
"learning_rate": 1.7300995083978965e-05,
"loss": 0.22360339760780334,
"num_input_tokens_seen": 0,
"step": 111
},
{
"epoch": 1.4,
"grad_norm": 0.5603841737931269,
"learning_rate": 1.7244252047910893e-05,
"loss": 0.22254161536693573,
"num_input_tokens_seen": 0,
"step": 112
},
{
"epoch": 1.4125,
"grad_norm": 0.566304468507664,
"learning_rate": 1.718701387683824e-05,
"loss": 0.21939656138420105,
"num_input_tokens_seen": 0,
"step": 113
},
{
"epoch": 1.425,
"grad_norm": 0.532198517126917,
"learning_rate": 1.7129284482913973e-05,
"loss": 0.2217104434967041,
"num_input_tokens_seen": 0,
"step": 114
},
{
"epoch": 1.4375,
"grad_norm": 0.5486267196897048,
"learning_rate": 1.7071067811865477e-05,
"loss": 0.22763556241989136,
"num_input_tokens_seen": 0,
"step": 115
},
{
"epoch": 1.45,
"grad_norm": 0.5318598905653756,
"learning_rate": 1.7012367842724887e-05,
"loss": 0.22226908802986145,
"num_input_tokens_seen": 0,
"step": 116
},
{
"epoch": 1.4625,
"grad_norm": 0.5219241851069808,
"learning_rate": 1.6953188587557122e-05,
"loss": 0.2251712530851364,
"num_input_tokens_seen": 0,
"step": 117
},
{
"epoch": 1.475,
"grad_norm": 0.5576204648175208,
"learning_rate": 1.6893534091185658e-05,
"loss": 0.2300577610731125,
"num_input_tokens_seen": 0,
"step": 118
},
{
"epoch": 1.4875,
"grad_norm": 0.5157156531813916,
"learning_rate": 1.6833408430916085e-05,
"loss": 0.23422931134700775,
"num_input_tokens_seen": 0,
"step": 119
},
{
"epoch": 1.5,
"grad_norm": 0.5525368224210166,
"learning_rate": 1.6772815716257414e-05,
"loss": 0.23097172379493713,
"num_input_tokens_seen": 0,
"step": 120
},
{
"epoch": 1.5125,
"grad_norm": 0.5671209735467264,
"learning_rate": 1.6711760088641197e-05,
"loss": 0.22700801491737366,
"num_input_tokens_seen": 0,
"step": 121
},
{
"epoch": 1.525,
"grad_norm": 0.49340662518168044,
"learning_rate": 1.6650245721138483e-05,
"loss": 0.21679332852363586,
"num_input_tokens_seen": 0,
"step": 122
},
{
"epoch": 1.5375,
"grad_norm": 0.5241279113240838,
"learning_rate": 1.658827681817458e-05,
"loss": 0.23128211498260498,
"num_input_tokens_seen": 0,
"step": 123
},
{
"epoch": 1.55,
"grad_norm": 0.5605937616757047,
"learning_rate": 1.6525857615241686e-05,
"loss": 0.2341003566980362,
"num_input_tokens_seen": 0,
"step": 124
},
{
"epoch": 1.5625,
"grad_norm": 0.5731377051701606,
"learning_rate": 1.646299237860941e-05,
"loss": 0.24362272024154663,
"num_input_tokens_seen": 0,
"step": 125
},
{
"epoch": 1.575,
"grad_norm": 0.4973323542505726,
"learning_rate": 1.6399685405033168e-05,
"loss": 0.21342068910598755,
"num_input_tokens_seen": 0,
"step": 126
},
{
"epoch": 1.5875,
"grad_norm": 0.5599176772369371,
"learning_rate": 1.6335941021460507e-05,
"loss": 0.2271297574043274,
"num_input_tokens_seen": 0,
"step": 127
},
{
"epoch": 1.6,
"grad_norm": 0.5203662408979232,
"learning_rate": 1.6271763584735373e-05,
"loss": 0.22479626536369324,
"num_input_tokens_seen": 0,
"step": 128
},
{
"epoch": 1.6125,
"grad_norm": 0.5095863344896252,
"learning_rate": 1.6207157481300315e-05,
"loss": 0.22810155153274536,
"num_input_tokens_seen": 0,
"step": 129
},
{
"epoch": 1.625,
"grad_norm": 0.5317196453792732,
"learning_rate": 1.6142127126896682e-05,
"loss": 0.21053889393806458,
"num_input_tokens_seen": 0,
"step": 130
},
{
"epoch": 1.6375,
"grad_norm": 0.5425148287230596,
"learning_rate": 1.6076676966262815e-05,
"loss": 0.23062941431999207,
"num_input_tokens_seen": 0,
"step": 131
},
{
"epoch": 1.65,
"grad_norm": 0.6023208751142557,
"learning_rate": 1.6010811472830253e-05,
"loss": 0.2463783472776413,
"num_input_tokens_seen": 0,
"step": 132
},
{
"epoch": 1.6625,
"grad_norm": 0.5550292129482939,
"learning_rate": 1.5944535148417982e-05,
"loss": 0.23075446486473083,
"num_input_tokens_seen": 0,
"step": 133
},
{
"epoch": 1.675,
"grad_norm": 0.5643030708792179,
"learning_rate": 1.5877852522924733e-05,
"loss": 0.22998251020908356,
"num_input_tokens_seen": 0,
"step": 134
},
{
"epoch": 1.6875,
"grad_norm": 0.6739206936197925,
"learning_rate": 1.5810768154019386e-05,
"loss": 0.23184943199157715,
"num_input_tokens_seen": 0,
"step": 135
},
{
"epoch": 1.7,
"grad_norm": 0.5322077543412791,
"learning_rate": 1.5743286626829437e-05,
"loss": 0.24749401211738586,
"num_input_tokens_seen": 0,
"step": 136
},
{
"epoch": 1.7125,
"grad_norm": 0.6117204460062114,
"learning_rate": 1.5675412553627638e-05,
"loss": 0.22619368135929108,
"num_input_tokens_seen": 0,
"step": 137
},
{
"epoch": 1.725,
"grad_norm": 0.5683821158495919,
"learning_rate": 1.560715057351673e-05,
"loss": 0.2415420114994049,
"num_input_tokens_seen": 0,
"step": 138
},
{
"epoch": 1.7375,
"grad_norm": 0.5531774081547891,
"learning_rate": 1.5538505352112373e-05,
"loss": 0.23251904547214508,
"num_input_tokens_seen": 0,
"step": 139
},
{
"epoch": 1.75,
"grad_norm": 0.5656996064712688,
"learning_rate": 1.5469481581224274e-05,
"loss": 0.2148549109697342,
"num_input_tokens_seen": 0,
"step": 140
},
{
"epoch": 1.7625,
"grad_norm": 0.5342290575425301,
"learning_rate": 1.5400083978535475e-05,
"loss": 0.221043199300766,
"num_input_tokens_seen": 0,
"step": 141
},
{
"epoch": 1.775,
"grad_norm": 0.5312824045538664,
"learning_rate": 1.533031728727994e-05,
"loss": 0.22295251488685608,
"num_input_tokens_seen": 0,
"step": 142
},
{
"epoch": 1.7875,
"grad_norm": 0.5532281742783289,
"learning_rate": 1.526018627591834e-05,
"loss": 0.23230503499507904,
"num_input_tokens_seen": 0,
"step": 143
},
{
"epoch": 1.8,
"grad_norm": 0.5486342376292719,
"learning_rate": 1.5189695737812153e-05,
"loss": 0.22794640064239502,
"num_input_tokens_seen": 0,
"step": 144
},
{
"epoch": 1.8125,
"grad_norm": 0.5282686765938053,
"learning_rate": 1.5118850490896012e-05,
"loss": 0.23277482390403748,
"num_input_tokens_seen": 0,
"step": 145
},
{
"epoch": 1.825,
"grad_norm": 0.5221771559817829,
"learning_rate": 1.504765537734844e-05,
"loss": 0.23067018389701843,
"num_input_tokens_seen": 0,
"step": 146
},
{
"epoch": 1.8375,
"grad_norm": 0.5641718599884086,
"learning_rate": 1.4976115263260876e-05,
"loss": 0.21835649013519287,
"num_input_tokens_seen": 0,
"step": 147
},
{
"epoch": 1.85,
"grad_norm": 0.5505224831003699,
"learning_rate": 1.4904235038305084e-05,
"loss": 0.2256629765033722,
"num_input_tokens_seen": 0,
"step": 148
},
{
"epoch": 1.8625,
"grad_norm": 0.5686783195373294,
"learning_rate": 1.4832019615398962e-05,
"loss": 0.215658038854599,
"num_input_tokens_seen": 0,
"step": 149
},
{
"epoch": 1.875,
"grad_norm": 0.5570580083339357,
"learning_rate": 1.4759473930370738e-05,
"loss": 0.23637929558753967,
"num_input_tokens_seen": 0,
"step": 150
},
{
"epoch": 1.8875,
"grad_norm": 0.5613938118868574,
"learning_rate": 1.4686602941621618e-05,
"loss": 0.2198466658592224,
"num_input_tokens_seen": 0,
"step": 151
},
{
"epoch": 1.9,
"grad_norm": 0.5554584703399501,
"learning_rate": 1.461341162978688e-05,
"loss": 0.2246859222650528,
"num_input_tokens_seen": 0,
"step": 152
},
{
"epoch": 1.9125,
"grad_norm": 0.5909451217927424,
"learning_rate": 1.4539904997395468e-05,
"loss": 0.24911729991436005,
"num_input_tokens_seen": 0,
"step": 153
},
{
"epoch": 1.925,
"grad_norm": 0.5842777476606686,
"learning_rate": 1.4466088068528068e-05,
"loss": 0.23788337409496307,
"num_input_tokens_seen": 0,
"step": 154
},
{
"epoch": 1.9375,
"grad_norm": 0.4981326579110442,
"learning_rate": 1.4391965888473705e-05,
"loss": 0.2209474742412567,
"num_input_tokens_seen": 0,
"step": 155
},
{
"epoch": 1.95,
"grad_norm": 0.5397167568674605,
"learning_rate": 1.4317543523384928e-05,
"loss": 0.23926502466201782,
"num_input_tokens_seen": 0,
"step": 156
},
{
"epoch": 1.9625,
"grad_norm": 0.5359703981598113,
"learning_rate": 1.4242826059931538e-05,
"loss": 0.2312660813331604,
"num_input_tokens_seen": 0,
"step": 157
},
{
"epoch": 1.975,
"grad_norm": 0.5386500827778109,
"learning_rate": 1.4167818604952906e-05,
"loss": 0.2199496328830719,
"num_input_tokens_seen": 0,
"step": 158
},
{
"epoch": 1.9875,
"grad_norm": 0.5538059936709024,
"learning_rate": 1.409252628510894e-05,
"loss": 0.23412078619003296,
"num_input_tokens_seen": 0,
"step": 159
},
{
"epoch": 2.0,
"grad_norm": 0.540376667893175,
"learning_rate": 1.4016954246529697e-05,
"loss": 0.2249927967786789,
"num_input_tokens_seen": 0,
"step": 160
},
{
"epoch": 2.0125,
"grad_norm": 1.0198244871776765,
"learning_rate": 1.3941107654463619e-05,
"loss": 0.12236860394477844,
"num_input_tokens_seen": 0,
"step": 161
},
{
"epoch": 2.025,
"grad_norm": 0.8629260127173042,
"learning_rate": 1.3864991692924524e-05,
"loss": 0.11816570162773132,
"num_input_tokens_seen": 0,
"step": 162
},
{
"epoch": 2.0375,
"grad_norm": 0.6404376890511378,
"learning_rate": 1.3788611564337277e-05,
"loss": 0.10550174862146378,
"num_input_tokens_seen": 0,
"step": 163
},
{
"epoch": 2.05,
"grad_norm": 1.140833870667296,
"learning_rate": 1.3711972489182208e-05,
"loss": 0.10521072149276733,
"num_input_tokens_seen": 0,
"step": 164
},
{
"epoch": 2.0625,
"grad_norm": 1.004887415099318,
"learning_rate": 1.3635079705638298e-05,
"loss": 0.10304014384746552,
"num_input_tokens_seen": 0,
"step": 165
},
{
"epoch": 2.075,
"grad_norm": 0.7658325943968483,
"learning_rate": 1.3557938469225167e-05,
"loss": 0.10292299091815948,
"num_input_tokens_seen": 0,
"step": 166
},
{
"epoch": 2.0875,
"grad_norm": 0.7338849003327649,
"learning_rate": 1.3480554052443847e-05,
"loss": 0.10219870507717133,
"num_input_tokens_seen": 0,
"step": 167
},
{
"epoch": 2.1,
"grad_norm": 0.7745676732651274,
"learning_rate": 1.3402931744416432e-05,
"loss": 0.10130374133586884,
"num_input_tokens_seen": 0,
"step": 168
},
{
"epoch": 2.1125,
"grad_norm": 0.6584333483269132,
"learning_rate": 1.332507685052457e-05,
"loss": 0.10410261154174805,
"num_input_tokens_seen": 0,
"step": 169
},
{
"epoch": 2.125,
"grad_norm": 0.6821199419154345,
"learning_rate": 1.3246994692046837e-05,
"loss": 0.09977763891220093,
"num_input_tokens_seen": 0,
"step": 170
},
{
"epoch": 2.1375,
"grad_norm": 0.6358679555892452,
"learning_rate": 1.3168690605795044e-05,
"loss": 0.09835107624530792,
"num_input_tokens_seen": 0,
"step": 171
},
{
"epoch": 2.15,
"grad_norm": 0.6632837484026789,
"learning_rate": 1.3090169943749475e-05,
"loss": 0.09917747974395752,
"num_input_tokens_seen": 0,
"step": 172
},
{
"epoch": 2.1625,
"grad_norm": 0.6342553931938368,
"learning_rate": 1.3011438072693077e-05,
"loss": 0.09428848326206207,
"num_input_tokens_seen": 0,
"step": 173
},
{
"epoch": 2.175,
"grad_norm": 0.6387577881812592,
"learning_rate": 1.293250037384465e-05,
"loss": 0.0958920419216156,
"num_input_tokens_seen": 0,
"step": 174
},
{
"epoch": 2.1875,
"grad_norm": 0.6954184373882712,
"learning_rate": 1.2853362242491054e-05,
"loss": 0.09917563199996948,
"num_input_tokens_seen": 0,
"step": 175
},
{
"epoch": 2.2,
"grad_norm": 0.5974385656105545,
"learning_rate": 1.2774029087618448e-05,
"loss": 0.09344319999217987,
"num_input_tokens_seen": 0,
"step": 176
},
{
"epoch": 2.2125,
"grad_norm": 0.6472498757039972,
"learning_rate": 1.269450633154258e-05,
"loss": 0.10768131911754608,
"num_input_tokens_seen": 0,
"step": 177
},
{
"epoch": 2.225,
"grad_norm": 0.6685276002639324,
"learning_rate": 1.26147994095382e-05,
"loss": 0.10777394473552704,
"num_input_tokens_seen": 0,
"step": 178
},
{
"epoch": 2.2375,
"grad_norm": 0.6340976446307695,
"learning_rate": 1.253491376946754e-05,
"loss": 0.10363772511482239,
"num_input_tokens_seen": 0,
"step": 179
},
{
"epoch": 2.25,
"grad_norm": 0.5829966761174877,
"learning_rate": 1.2454854871407993e-05,
"loss": 0.09860391169786453,
"num_input_tokens_seen": 0,
"step": 180
},
{
"epoch": 2.2625,
"grad_norm": 0.6211718617191284,
"learning_rate": 1.2374628187278888e-05,
"loss": 0.09785199165344238,
"num_input_tokens_seen": 0,
"step": 181
},
{
"epoch": 2.275,
"grad_norm": 0.5692692956767395,
"learning_rate": 1.2294239200467516e-05,
"loss": 0.09915725886821747,
"num_input_tokens_seen": 0,
"step": 182
},
{
"epoch": 2.2875,
"grad_norm": 0.6193608649665566,
"learning_rate": 1.2213693405454345e-05,
"loss": 0.09728467464447021,
"num_input_tokens_seen": 0,
"step": 183
},
{
"epoch": 2.3,
"grad_norm": 0.6644514294612655,
"learning_rate": 1.213299630743747e-05,
"loss": 0.10535012185573578,
"num_input_tokens_seen": 0,
"step": 184
},
{
"epoch": 2.3125,
"grad_norm": 0.5826634498573604,
"learning_rate": 1.2052153421956343e-05,
"loss": 0.09781628847122192,
"num_input_tokens_seen": 0,
"step": 185
},
{
"epoch": 2.325,
"grad_norm": 0.6951420866310761,
"learning_rate": 1.1971170274514802e-05,
"loss": 0.09797348082065582,
"num_input_tokens_seen": 0,
"step": 186
},
{
"epoch": 2.3375,
"grad_norm": 0.6449424096000549,
"learning_rate": 1.1890052400203405e-05,
"loss": 0.11403355002403259,
"num_input_tokens_seen": 0,
"step": 187
},
{
"epoch": 2.35,
"grad_norm": 0.6977055112415091,
"learning_rate": 1.1808805343321102e-05,
"loss": 0.09498105198144913,
"num_input_tokens_seen": 0,
"step": 188
},
{
"epoch": 2.3625,
"grad_norm": 0.6309628403695112,
"learning_rate": 1.1727434656996306e-05,
"loss": 0.0992412120103836,
"num_input_tokens_seen": 0,
"step": 189
},
{
"epoch": 2.375,
"grad_norm": 0.6731776940075981,
"learning_rate": 1.164594590280734e-05,
"loss": 0.11293548345565796,
"num_input_tokens_seen": 0,
"step": 190
},
{
"epoch": 2.3875,
"grad_norm": 0.6741092344709629,
"learning_rate": 1.156434465040231e-05,
"loss": 0.094619981944561,
"num_input_tokens_seen": 0,
"step": 191
},
{
"epoch": 2.4,
"grad_norm": 0.5967386003242268,
"learning_rate": 1.148263647711842e-05,
"loss": 0.09347110986709595,
"num_input_tokens_seen": 0,
"step": 192
},
{
"epoch": 2.4125,
"grad_norm": 0.7159915413192838,
"learning_rate": 1.140082696760078e-05,
"loss": 0.10074266791343689,
"num_input_tokens_seen": 0,
"step": 193
},
{
"epoch": 2.425,
"grad_norm": 0.5968881913694516,
"learning_rate": 1.1318921713420691e-05,
"loss": 0.10465800762176514,
"num_input_tokens_seen": 0,
"step": 194
},
{
"epoch": 2.4375,
"grad_norm": 0.6589415029348668,
"learning_rate": 1.123692631269348e-05,
"loss": 0.10594508051872253,
"num_input_tokens_seen": 0,
"step": 195
},
{
"epoch": 2.45,
"grad_norm": 0.663988394877507,
"learning_rate": 1.1154846369695864e-05,
"loss": 0.09538954496383667,
"num_input_tokens_seen": 0,
"step": 196
},
{
"epoch": 2.4625,
"grad_norm": 0.6228831383495524,
"learning_rate": 1.107268749448292e-05,
"loss": 0.09765101969242096,
"num_input_tokens_seen": 0,
"step": 197
},
{
"epoch": 2.475,
"grad_norm": 0.61772733625651,
"learning_rate": 1.099045530250463e-05,
"loss": 0.09396866708993912,
"num_input_tokens_seen": 0,
"step": 198
},
{
"epoch": 2.4875,
"grad_norm": 0.5929697542542042,
"learning_rate": 1.0908155414222083e-05,
"loss": 0.09863300621509552,
"num_input_tokens_seen": 0,
"step": 199
},
{
"epoch": 2.5,
"grad_norm": 0.6899961462901135,
"learning_rate": 1.0825793454723325e-05,
"loss": 0.09633656591176987,
"num_input_tokens_seen": 0,
"step": 200
},
{
"epoch": 2.5125,
"grad_norm": 0.5997938148312979,
"learning_rate": 1.0743375053338879e-05,
"loss": 0.10011985152959824,
"num_input_tokens_seen": 0,
"step": 201
},
{
"epoch": 2.525,
"grad_norm": 0.6332752356735514,
"learning_rate": 1.0660905843256995e-05,
"loss": 0.09651218354701996,
"num_input_tokens_seen": 0,
"step": 202
},
{
"epoch": 2.5375,
"grad_norm": 0.6204192543359215,
"learning_rate": 1.0578391461138642e-05,
"loss": 0.0994209498167038,
"num_input_tokens_seen": 0,
"step": 203
},
{
"epoch": 2.55,
"grad_norm": 0.6132942561027718,
"learning_rate": 1.0495837546732224e-05,
"loss": 0.10166719555854797,
"num_input_tokens_seen": 0,
"step": 204
},
{
"epoch": 2.5625,
"grad_norm": 0.5964143279459994,
"learning_rate": 1.0413249742488132e-05,
"loss": 0.10079332441091537,
"num_input_tokens_seen": 0,
"step": 205
},
{
"epoch": 2.575,
"grad_norm": 0.6090535510158961,
"learning_rate": 1.0330633693173083e-05,
"loss": 0.10682345926761627,
"num_input_tokens_seen": 0,
"step": 206
},
{
"epoch": 2.5875,
"grad_norm": 0.5757143060836244,
"learning_rate": 1.0247995045484303e-05,
"loss": 0.10448747873306274,
"num_input_tokens_seen": 0,
"step": 207
},
{
"epoch": 2.6,
"grad_norm": 0.5318320587544387,
"learning_rate": 1.0165339447663586e-05,
"loss": 0.09677817672491074,
"num_input_tokens_seen": 0,
"step": 208
},
{
"epoch": 2.6125,
"grad_norm": 0.5653047610868341,
"learning_rate": 1.008267254911125e-05,
"loss": 0.10041716694831848,
"num_input_tokens_seen": 0,
"step": 209
},
{
"epoch": 2.625,
"grad_norm": 0.6709564219802151,
"learning_rate": 1e-05,
"loss": 0.11834404617547989,
"num_input_tokens_seen": 0,
"step": 210
},
{
"epoch": 2.6375,
"grad_norm": 0.59364936824183,
"learning_rate": 9.917327450888751e-06,
"loss": 0.10398055613040924,
"num_input_tokens_seen": 0,
"step": 211
},
{
"epoch": 2.65,
"grad_norm": 0.6135322123850998,
"learning_rate": 9.834660552336415e-06,
"loss": 0.09955007582902908,
"num_input_tokens_seen": 0,
"step": 212
},
{
"epoch": 2.6625,
"grad_norm": 0.5771675846362164,
"learning_rate": 9.7520049545157e-06,
"loss": 0.09840433299541473,
"num_input_tokens_seen": 0,
"step": 213
},
{
"epoch": 2.675,
"grad_norm": 0.6231353540589576,
"learning_rate": 9.669366306826919e-06,
"loss": 0.10380570590496063,
"num_input_tokens_seen": 0,
"step": 214
},
{
"epoch": 2.6875,
"grad_norm": 0.5948673639371316,
"learning_rate": 9.586750257511868e-06,
"loss": 0.0970538780093193,
"num_input_tokens_seen": 0,
"step": 215
},
{
"epoch": 2.7,
"grad_norm": 0.6060511739998478,
"learning_rate": 9.504162453267776e-06,
"loss": 0.10509540140628815,
"num_input_tokens_seen": 0,
"step": 216
},
{
"epoch": 2.7125,
"grad_norm": 0.5510731357722317,
"learning_rate": 9.421608538861361e-06,
"loss": 0.09432905912399292,
"num_input_tokens_seen": 0,
"step": 217
},
{
"epoch": 2.725,
"grad_norm": 0.6639639280503086,
"learning_rate": 9.339094156743007e-06,
"loss": 0.09837593883275986,
"num_input_tokens_seen": 0,
"step": 218
},
{
"epoch": 2.7375,
"grad_norm": 0.6270361275419634,
"learning_rate": 9.256624946661126e-06,
"loss": 0.10134841501712799,
"num_input_tokens_seen": 0,
"step": 219
},
{
"epoch": 2.75,
"grad_norm": 0.6771440915713338,
"learning_rate": 9.174206545276678e-06,
"loss": 0.11078701913356781,
"num_input_tokens_seen": 0,
"step": 220
},
{
"epoch": 2.7625,
"grad_norm": 0.705959945496674,
"learning_rate": 9.091844585777919e-06,
"loss": 0.1001749038696289,
"num_input_tokens_seen": 0,
"step": 221
},
{
"epoch": 2.775,
"grad_norm": 0.60937291192483,
"learning_rate": 9.009544697495373e-06,
"loss": 0.10557325184345245,
"num_input_tokens_seen": 0,
"step": 222
},
{
"epoch": 2.7875,
"grad_norm": 0.5893171675572725,
"learning_rate": 8.927312505517086e-06,
"loss": 0.10320694744586945,
"num_input_tokens_seen": 0,
"step": 223
},
{
"epoch": 2.8,
"grad_norm": 0.6133977099413181,
"learning_rate": 8.84515363030414e-06,
"loss": 0.09662497788667679,
"num_input_tokens_seen": 0,
"step": 224
},
{
"epoch": 2.8125,
"grad_norm": 0.6010610432879008,
"learning_rate": 8.763073687306523e-06,
"loss": 0.10481660068035126,
"num_input_tokens_seen": 0,
"step": 225
},
{
"epoch": 2.825,
"grad_norm": 0.6248420832412895,
"learning_rate": 8.68107828657931e-06,
"loss": 0.10239937901496887,
"num_input_tokens_seen": 0,
"step": 226
},
{
"epoch": 2.8375,
"grad_norm": 0.5868088627329976,
"learning_rate": 8.599173032399222e-06,
"loss": 0.10687953978776932,
"num_input_tokens_seen": 0,
"step": 227
},
{
"epoch": 2.85,
"grad_norm": 0.6268898716250957,
"learning_rate": 8.51736352288158e-06,
"loss": 0.10268527269363403,
"num_input_tokens_seen": 0,
"step": 228
},
{
"epoch": 2.8625,
"grad_norm": 0.6191681347772287,
"learning_rate": 8.43565534959769e-06,
"loss": 0.10145796835422516,
"num_input_tokens_seen": 0,
"step": 229
},
{
"epoch": 2.875,
"grad_norm": 0.6058846171541453,
"learning_rate": 8.35405409719266e-06,
"loss": 0.104601189494133,
"num_input_tokens_seen": 0,
"step": 230
},
{
"epoch": 2.8875,
"grad_norm": 0.5661519092091898,
"learning_rate": 8.2725653430037e-06,
"loss": 0.09631498157978058,
"num_input_tokens_seen": 0,
"step": 231
},
{
"epoch": 2.9,
"grad_norm": 0.5902711791447997,
"learning_rate": 8.191194656678905e-06,
"loss": 0.0936303585767746,
"num_input_tokens_seen": 0,
"step": 232
},
{
"epoch": 2.9125,
"grad_norm": 0.5638822103591541,
"learning_rate": 8.109947599796599e-06,
"loss": 0.1050085574388504,
"num_input_tokens_seen": 0,
"step": 233
},
{
"epoch": 2.925,
"grad_norm": 0.5703156532416298,
"learning_rate": 8.0288297254852e-06,
"loss": 0.09568509459495544,
"num_input_tokens_seen": 0,
"step": 234
},
{
"epoch": 2.9375,
"grad_norm": 0.6177421968590858,
"learning_rate": 7.947846578043658e-06,
"loss": 0.10282063484191895,
"num_input_tokens_seen": 0,
"step": 235
},
{
"epoch": 2.95,
"grad_norm": 0.5724279751966526,
"learning_rate": 7.867003692562533e-06,
"loss": 0.10039152950048447,
"num_input_tokens_seen": 0,
"step": 236
},
{
"epoch": 2.9625,
"grad_norm": 0.665494938328959,
"learning_rate": 7.786306594545658e-06,
"loss": 0.09484650194644928,
"num_input_tokens_seen": 0,
"step": 237
},
{
"epoch": 2.975,
"grad_norm": 0.5909403346328018,
"learning_rate": 7.705760799532485e-06,
"loss": 0.0970713198184967,
"num_input_tokens_seen": 0,
"step": 238
},
{
"epoch": 2.9875,
"grad_norm": 0.5930301687493515,
"learning_rate": 7.625371812721115e-06,
"loss": 0.1040472462773323,
"num_input_tokens_seen": 0,
"step": 239
},
{
"epoch": 3.0,
"grad_norm": 0.5975944410229318,
"learning_rate": 7.545145128592009e-06,
"loss": 0.10229268670082092,
"num_input_tokens_seen": 0,
"step": 240
}
],
"logging_steps": 1,
"max_steps": 400,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 200924346515456.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}