{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 5,
"global_step": 201,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014925373134328358,
"grad_norm": 7.200786590576172,
"learning_rate": 4.5454545454545457e-07,
"loss": 0.1719,
"step": 1
},
{
"epoch": 0.029850746268656716,
"grad_norm": 7.886128902435303,
"learning_rate": 9.090909090909091e-07,
"loss": 0.1847,
"step": 2
},
{
"epoch": 0.04477611940298507,
"grad_norm": 8.476142883300781,
"learning_rate": 1.3636363636363636e-06,
"loss": 0.2012,
"step": 3
},
{
"epoch": 0.05970149253731343,
"grad_norm": 4.055701732635498,
"learning_rate": 1.8181818181818183e-06,
"loss": 0.1325,
"step": 4
},
{
"epoch": 0.07462686567164178,
"grad_norm": 3.249504566192627,
"learning_rate": 2.2727272727272728e-06,
"loss": 0.0968,
"step": 5
},
{
"epoch": 0.07462686567164178,
"eval_loss": 0.06899096071720123,
"eval_runtime": 14.448,
"eval_samples_per_second": 8.236,
"eval_steps_per_second": 0.277,
"step": 5
},
{
"epoch": 0.08955223880597014,
"grad_norm": 1.5543451309204102,
"learning_rate": 2.7272727272727272e-06,
"loss": 0.0721,
"step": 6
},
{
"epoch": 0.1044776119402985,
"grad_norm": 1.0983480215072632,
"learning_rate": 3.181818181818182e-06,
"loss": 0.0506,
"step": 7
},
{
"epoch": 0.11940298507462686,
"grad_norm": 0.9362279176712036,
"learning_rate": 3.6363636363636366e-06,
"loss": 0.0397,
"step": 8
},
{
"epoch": 0.13432835820895522,
"grad_norm": 0.84356290102005,
"learning_rate": 4.0909090909090915e-06,
"loss": 0.0427,
"step": 9
},
{
"epoch": 0.14925373134328357,
"grad_norm": 0.7953129410743713,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.0346,
"step": 10
},
{
"epoch": 0.14925373134328357,
"eval_loss": 0.03658153489232063,
"eval_runtime": 8.2566,
"eval_samples_per_second": 14.413,
"eval_steps_per_second": 0.484,
"step": 10
},
{
"epoch": 0.16417910447761194,
"grad_norm": 0.6031929850578308,
"learning_rate": 5e-06,
"loss": 0.0325,
"step": 11
},
{
"epoch": 0.1791044776119403,
"grad_norm": 0.6844140291213989,
"learning_rate": 4.999658262481173e-06,
"loss": 0.0372,
"step": 12
},
{
"epoch": 0.19402985074626866,
"grad_norm": 0.6683032512664795,
"learning_rate": 4.998633143352315e-06,
"loss": 0.0309,
"step": 13
},
{
"epoch": 0.208955223880597,
"grad_norm": 0.5927382111549377,
"learning_rate": 4.9969249228707625e-06,
"loss": 0.0265,
"step": 14
},
{
"epoch": 0.22388059701492538,
"grad_norm": 0.6329669952392578,
"learning_rate": 4.994534068046936e-06,
"loss": 0.0256,
"step": 15
},
{
"epoch": 0.22388059701492538,
"eval_loss": 0.03213270381093025,
"eval_runtime": 8.2934,
"eval_samples_per_second": 14.349,
"eval_steps_per_second": 0.482,
"step": 15
},
{
"epoch": 0.23880597014925373,
"grad_norm": 0.689909040927887,
"learning_rate": 4.991461232516675e-06,
"loss": 0.0393,
"step": 16
},
{
"epoch": 0.2537313432835821,
"grad_norm": 0.623458743095398,
"learning_rate": 4.987707256362529e-06,
"loss": 0.0329,
"step": 17
},
{
"epoch": 0.26865671641791045,
"grad_norm": 0.622241735458374,
"learning_rate": 4.983273165884096e-06,
"loss": 0.0311,
"step": 18
},
{
"epoch": 0.2835820895522388,
"grad_norm": 0.5380865335464478,
"learning_rate": 4.978160173317439e-06,
"loss": 0.0272,
"step": 19
},
{
"epoch": 0.29850746268656714,
"grad_norm": 0.6085941791534424,
"learning_rate": 4.972369676503672e-06,
"loss": 0.0328,
"step": 20
},
{
"epoch": 0.29850746268656714,
"eval_loss": 0.030199836939573288,
"eval_runtime": 8.2944,
"eval_samples_per_second": 14.347,
"eval_steps_per_second": 0.482,
"step": 20
},
{
"epoch": 0.31343283582089554,
"grad_norm": 0.5220608711242676,
"learning_rate": 4.965903258506806e-06,
"loss": 0.0286,
"step": 21
},
{
"epoch": 0.3283582089552239,
"grad_norm": 0.5969937443733215,
"learning_rate": 4.9587626871809564e-06,
"loss": 0.0294,
"step": 22
},
{
"epoch": 0.34328358208955223,
"grad_norm": 0.4778025448322296,
"learning_rate": 4.950949914687024e-06,
"loss": 0.0238,
"step": 23
},
{
"epoch": 0.3582089552238806,
"grad_norm": 0.41100114583969116,
"learning_rate": 4.942467076958999e-06,
"loss": 0.022,
"step": 24
},
{
"epoch": 0.373134328358209,
"grad_norm": 0.4831908941268921,
"learning_rate": 4.933316493120015e-06,
"loss": 0.0287,
"step": 25
},
{
"epoch": 0.373134328358209,
"eval_loss": 0.02796478196978569,
"eval_runtime": 8.2814,
"eval_samples_per_second": 14.37,
"eval_steps_per_second": 0.483,
"step": 25
},
{
"epoch": 0.3880597014925373,
"grad_norm": 0.5058245062828064,
"learning_rate": 4.923500664848327e-06,
"loss": 0.0262,
"step": 26
},
{
"epoch": 0.40298507462686567,
"grad_norm": 0.5201866626739502,
"learning_rate": 4.913022275693372e-06,
"loss": 0.029,
"step": 27
},
{
"epoch": 0.417910447761194,
"grad_norm": 0.46352681517601013,
"learning_rate": 4.901884190342121e-06,
"loss": 0.0261,
"step": 28
},
{
"epoch": 0.43283582089552236,
"grad_norm": 0.5337287187576294,
"learning_rate": 4.890089453835894e-06,
"loss": 0.0258,
"step": 29
},
{
"epoch": 0.44776119402985076,
"grad_norm": 0.43212640285491943,
"learning_rate": 4.8776412907378845e-06,
"loss": 0.0221,
"step": 30
},
{
"epoch": 0.44776119402985076,
"eval_loss": 0.027510978281497955,
"eval_runtime": 8.3069,
"eval_samples_per_second": 14.325,
"eval_steps_per_second": 0.482,
"step": 30
},
{
"epoch": 0.4626865671641791,
"grad_norm": 0.5198773145675659,
"learning_rate": 4.864543104251587e-06,
"loss": 0.0261,
"step": 31
},
{
"epoch": 0.47761194029850745,
"grad_norm": 0.46835553646087646,
"learning_rate": 4.850798475290403e-06,
"loss": 0.0238,
"step": 32
},
{
"epoch": 0.4925373134328358,
"grad_norm": 0.521562397480011,
"learning_rate": 4.836411161498653e-06,
"loss": 0.0311,
"step": 33
},
{
"epoch": 0.5074626865671642,
"grad_norm": 0.3529301881790161,
"learning_rate": 4.821385096224268e-06,
"loss": 0.0216,
"step": 34
},
{
"epoch": 0.5223880597014925,
"grad_norm": 0.559899091720581,
"learning_rate": 4.8057243874434625e-06,
"loss": 0.0347,
"step": 35
},
{
"epoch": 0.5223880597014925,
"eval_loss": 0.026492305099964142,
"eval_runtime": 8.3008,
"eval_samples_per_second": 14.336,
"eval_steps_per_second": 0.482,
"step": 35
},
{
"epoch": 0.5373134328358209,
"grad_norm": 0.42942824959754944,
"learning_rate": 4.789433316637644e-06,
"loss": 0.0253,
"step": 36
},
{
"epoch": 0.5522388059701493,
"grad_norm": 0.432647168636322,
"learning_rate": 4.772516337622907e-06,
"loss": 0.0209,
"step": 37
},
{
"epoch": 0.5671641791044776,
"grad_norm": 0.47124215960502625,
"learning_rate": 4.754978075332398e-06,
"loss": 0.0213,
"step": 38
},
{
"epoch": 0.582089552238806,
"grad_norm": 0.41911983489990234,
"learning_rate": 4.736823324551909e-06,
"loss": 0.0209,
"step": 39
},
{
"epoch": 0.5970149253731343,
"grad_norm": 0.3859071731567383,
"learning_rate": 4.71805704860903e-06,
"loss": 0.0212,
"step": 40
},
{
"epoch": 0.5970149253731343,
"eval_loss": 0.02548597753047943,
"eval_runtime": 8.3185,
"eval_samples_per_second": 14.306,
"eval_steps_per_second": 0.481,
"step": 40
},
{
"epoch": 0.6119402985074627,
"grad_norm": 0.5006839036941528,
"learning_rate": 4.698684378016223e-06,
"loss": 0.0237,
"step": 41
},
{
"epoch": 0.6268656716417911,
"grad_norm": 0.35737383365631104,
"learning_rate": 4.678710609068193e-06,
"loss": 0.0218,
"step": 42
},
{
"epoch": 0.6417910447761194,
"grad_norm": 0.41281658411026,
"learning_rate": 4.658141202393935e-06,
"loss": 0.0193,
"step": 43
},
{
"epoch": 0.6567164179104478,
"grad_norm": 0.4524123966693878,
"learning_rate": 4.636981781463848e-06,
"loss": 0.0314,
"step": 44
},
{
"epoch": 0.6716417910447762,
"grad_norm": 0.4154703617095947,
"learning_rate": 4.615238131052339e-06,
"loss": 0.0245,
"step": 45
},
{
"epoch": 0.6716417910447762,
"eval_loss": 0.025405339896678925,
"eval_runtime": 8.3109,
"eval_samples_per_second": 14.318,
"eval_steps_per_second": 0.481,
"step": 45
},
{
"epoch": 0.6865671641791045,
"grad_norm": 0.45151591300964355,
"learning_rate": 4.592916195656322e-06,
"loss": 0.0273,
"step": 46
},
{
"epoch": 0.7014925373134329,
"grad_norm": 0.3298991918563843,
"learning_rate": 4.570022077870051e-06,
"loss": 0.0204,
"step": 47
},
{
"epoch": 0.7164179104477612,
"grad_norm": 0.4888248145580292,
"learning_rate": 4.546562036716732e-06,
"loss": 0.0269,
"step": 48
},
{
"epoch": 0.7313432835820896,
"grad_norm": 0.3940542936325073,
"learning_rate": 4.522542485937369e-06,
"loss": 0.0235,
"step": 49
},
{
"epoch": 0.746268656716418,
"grad_norm": 0.3737621307373047,
"learning_rate": 4.497969992237312e-06,
"loss": 0.0209,
"step": 50
},
{
"epoch": 0.746268656716418,
"eval_loss": 0.024418316781520844,
"eval_runtime": 8.3141,
"eval_samples_per_second": 14.313,
"eval_steps_per_second": 0.481,
"step": 50
},
{
"epoch": 0.7611940298507462,
"grad_norm": 0.32089656591415405,
"learning_rate": 4.472851273490985e-06,
"loss": 0.0182,
"step": 51
},
{
"epoch": 0.7761194029850746,
"grad_norm": 0.4259447455406189,
"learning_rate": 4.4471931969052816e-06,
"loss": 0.0229,
"step": 52
},
{
"epoch": 0.7910447761194029,
"grad_norm": 0.33431047201156616,
"learning_rate": 4.421002777142148e-06,
"loss": 0.0191,
"step": 53
},
{
"epoch": 0.8059701492537313,
"grad_norm": 0.3420027196407318,
"learning_rate": 4.394287174400838e-06,
"loss": 0.0187,
"step": 54
},
{
"epoch": 0.8208955223880597,
"grad_norm": 0.4479522407054901,
"learning_rate": 4.3670536924603855e-06,
"loss": 0.0242,
"step": 55
},
{
"epoch": 0.8208955223880597,
"eval_loss": 0.023885194212198257,
"eval_runtime": 8.3377,
"eval_samples_per_second": 14.273,
"eval_steps_per_second": 0.48,
"step": 55
},
{
"epoch": 0.835820895522388,
"grad_norm": 0.46404945850372314,
"learning_rate": 4.33930977668283e-06,
"loss": 0.0226,
"step": 56
},
{
"epoch": 0.8507462686567164,
"grad_norm": 0.49134090542793274,
"learning_rate": 4.311063011977723e-06,
"loss": 0.0277,
"step": 57
},
{
"epoch": 0.8656716417910447,
"grad_norm": 0.373234361410141,
"learning_rate": 4.282321120728493e-06,
"loss": 0.0199,
"step": 58
},
{
"epoch": 0.8805970149253731,
"grad_norm": 0.31845158338546753,
"learning_rate": 4.253091960681222e-06,
"loss": 0.0196,
"step": 59
},
{
"epoch": 0.8955223880597015,
"grad_norm": 0.4006720781326294,
"learning_rate": 4.2233835227964145e-06,
"loss": 0.0226,
"step": 60
},
{
"epoch": 0.8955223880597015,
"eval_loss": 0.023553457111120224,
"eval_runtime": 8.2827,
"eval_samples_per_second": 14.367,
"eval_steps_per_second": 0.483,
"step": 60
},
{
"epoch": 0.9104477611940298,
"grad_norm": 0.395221084356308,
"learning_rate": 4.1932039290643534e-06,
"loss": 0.0238,
"step": 61
},
{
"epoch": 0.9253731343283582,
"grad_norm": 0.40594613552093506,
"learning_rate": 4.162561430284621e-06,
"loss": 0.0235,
"step": 62
},
{
"epoch": 0.9402985074626866,
"grad_norm": 0.32194435596466064,
"learning_rate": 4.1314644038104215e-06,
"loss": 0.0188,
"step": 63
},
{
"epoch": 0.9552238805970149,
"grad_norm": 0.36028629541397095,
"learning_rate": 4.099921351258292e-06,
"loss": 0.0207,
"step": 64
},
{
"epoch": 0.9701492537313433,
"grad_norm": 0.3143925368785858,
"learning_rate": 4.067940896183843e-06,
"loss": 0.0208,
"step": 65
},
{
"epoch": 0.9701492537313433,
"eval_loss": 0.023211363703012466,
"eval_runtime": 8.272,
"eval_samples_per_second": 14.386,
"eval_steps_per_second": 0.484,
"step": 65
},
{
"epoch": 0.9850746268656716,
"grad_norm": 0.4360805153846741,
"learning_rate": 4.0355317817241705e-06,
"loss": 0.0195,
"step": 66
},
{
"epoch": 1.0,
"grad_norm": 0.424087256193161,
"learning_rate": 4.002702868207563e-06,
"loss": 0.0243,
"step": 67
},
{
"epoch": 1.0149253731343284,
"grad_norm": 0.3290930986404419,
"learning_rate": 3.969463130731183e-06,
"loss": 0.0169,
"step": 68
},
{
"epoch": 1.0298507462686568,
"grad_norm": 0.34683364629745483,
"learning_rate": 3.935821656707359e-06,
"loss": 0.0188,
"step": 69
},
{
"epoch": 1.044776119402985,
"grad_norm": 0.32021239399909973,
"learning_rate": 3.901787643379183e-06,
"loss": 0.0135,
"step": 70
},
{
"epoch": 1.044776119402985,
"eval_loss": 0.022930506616830826,
"eval_runtime": 8.303,
"eval_samples_per_second": 14.332,
"eval_steps_per_second": 0.482,
"step": 70
},
{
"epoch": 1.0597014925373134,
"grad_norm": 0.2900172472000122,
"learning_rate": 3.8673703953060685e-06,
"loss": 0.0169,
"step": 71
},
{
"epoch": 1.0746268656716418,
"grad_norm": 0.35844284296035767,
"learning_rate": 3.832579321819985e-06,
"loss": 0.0152,
"step": 72
},
{
"epoch": 1.0895522388059702,
"grad_norm": 0.352897584438324,
"learning_rate": 3.797423934453038e-06,
"loss": 0.0168,
"step": 73
},
{
"epoch": 1.1044776119402986,
"grad_norm": 0.35191500186920166,
"learning_rate": 3.76191384433711e-06,
"loss": 0.0173,
"step": 74
},
{
"epoch": 1.1194029850746268,
"grad_norm": 0.28631043434143066,
"learning_rate": 3.726058759576271e-06,
"loss": 0.0141,
"step": 75
},
{
"epoch": 1.1194029850746268,
"eval_loss": 0.023363711312413216,
"eval_runtime": 8.2771,
"eval_samples_per_second": 14.377,
"eval_steps_per_second": 0.483,
"step": 75
},
{
"epoch": 1.1343283582089552,
"grad_norm": 0.36245712637901306,
"learning_rate": 3.6898684825926845e-06,
"loss": 0.0133,
"step": 76
},
{
"epoch": 1.1492537313432836,
"grad_norm": 0.3419128954410553,
"learning_rate": 3.65335290744672e-06,
"loss": 0.0139,
"step": 77
},
{
"epoch": 1.164179104477612,
"grad_norm": 0.3986120820045471,
"learning_rate": 3.616522017132017e-06,
"loss": 0.0168,
"step": 78
},
{
"epoch": 1.1791044776119404,
"grad_norm": 0.3793441951274872,
"learning_rate": 3.579385880846232e-06,
"loss": 0.0165,
"step": 79
},
{
"epoch": 1.1940298507462686,
"grad_norm": 0.36774227023124695,
"learning_rate": 3.5419546512382264e-06,
"loss": 0.0165,
"step": 80
},
{
"epoch": 1.1940298507462686,
"eval_loss": 0.023542851209640503,
"eval_runtime": 8.316,
"eval_samples_per_second": 14.31,
"eval_steps_per_second": 0.481,
"step": 80
},
{
"epoch": 1.208955223880597,
"grad_norm": 0.36220625042915344,
"learning_rate": 3.5042385616324243e-06,
"loss": 0.0189,
"step": 81
},
{
"epoch": 1.2238805970149254,
"grad_norm": 0.3013781011104584,
"learning_rate": 3.466247923231131e-06,
"loss": 0.0141,
"step": 82
},
{
"epoch": 1.2388059701492538,
"grad_norm": 0.359733521938324,
"learning_rate": 3.427993122295552e-06,
"loss": 0.0161,
"step": 83
},
{
"epoch": 1.2537313432835822,
"grad_norm": 0.39510107040405273,
"learning_rate": 3.3894846173062917e-06,
"loss": 0.0153,
"step": 84
},
{
"epoch": 1.2686567164179103,
"grad_norm": 0.38427668809890747,
"learning_rate": 3.350732936104108e-06,
"loss": 0.0173,
"step": 85
},
{
"epoch": 1.2686567164179103,
"eval_loss": 0.023341603577136993,
"eval_runtime": 8.5378,
"eval_samples_per_second": 13.938,
"eval_steps_per_second": 0.469,
"step": 85
},
{
"epoch": 1.2835820895522387,
"grad_norm": 0.30994802713394165,
"learning_rate": 3.3117486730117092e-06,
"loss": 0.0134,
"step": 86
},
{
"epoch": 1.2985074626865671,
"grad_norm": 0.3489951193332672,
"learning_rate": 3.272542485937369e-06,
"loss": 0.0169,
"step": 87
},
{
"epoch": 1.3134328358208955,
"grad_norm": 0.31990131735801697,
"learning_rate": 3.2331250934611623e-06,
"loss": 0.0169,
"step": 88
},
{
"epoch": 1.328358208955224,
"grad_norm": 0.29082977771759033,
"learning_rate": 3.193507271904612e-06,
"loss": 0.0121,
"step": 89
},
{
"epoch": 1.3432835820895521,
"grad_norm": 0.3279978334903717,
"learning_rate": 3.15369985238455e-06,
"loss": 0.0123,
"step": 90
},
{
"epoch": 1.3432835820895521,
"eval_loss": 0.02307475358247757,
"eval_runtime": 8.3253,
"eval_samples_per_second": 14.294,
"eval_steps_per_second": 0.48,
"step": 90
},
{
"epoch": 1.3582089552238805,
"grad_norm": 0.30484575033187866,
"learning_rate": 3.1137137178519983e-06,
"loss": 0.0153,
"step": 91
},
{
"epoch": 1.373134328358209,
"grad_norm": 0.37242934107780457,
"learning_rate": 3.073559800116879e-06,
"loss": 0.0189,
"step": 92
},
{
"epoch": 1.3880597014925373,
"grad_norm": 0.33749932050704956,
"learning_rate": 3.0332490768593676e-06,
"loss": 0.02,
"step": 93
},
{
"epoch": 1.4029850746268657,
"grad_norm": 0.322444349527359,
"learning_rate": 2.9927925686287006e-06,
"loss": 0.0135,
"step": 94
},
{
"epoch": 1.417910447761194,
"grad_norm": 0.3586093783378601,
"learning_rate": 2.9522013358302754e-06,
"loss": 0.0145,
"step": 95
},
{
"epoch": 1.417910447761194,
"eval_loss": 0.02322915382683277,
"eval_runtime": 8.3124,
"eval_samples_per_second": 14.316,
"eval_steps_per_second": 0.481,
"step": 95
},
{
"epoch": 1.4328358208955223,
"grad_norm": 0.29217156767845154,
"learning_rate": 2.911486475701835e-06,
"loss": 0.0132,
"step": 96
},
{
"epoch": 1.4477611940298507,
"grad_norm": 0.36368077993392944,
"learning_rate": 2.870659119279605e-06,
"loss": 0.0157,
"step": 97
},
{
"epoch": 1.462686567164179,
"grad_norm": 0.44833701848983765,
"learning_rate": 2.829730428355173e-06,
"loss": 0.0163,
"step": 98
},
{
"epoch": 1.4776119402985075,
"grad_norm": 0.3234724700450897,
"learning_rate": 2.788711592423966e-06,
"loss": 0.0126,
"step": 99
},
{
"epoch": 1.4925373134328357,
"grad_norm": 0.3331272006034851,
"learning_rate": 2.7476138256261575e-06,
"loss": 0.0154,
"step": 100
},
{
"epoch": 1.4925373134328357,
"eval_loss": 0.02257104031741619,
"eval_runtime": 8.3116,
"eval_samples_per_second": 14.317,
"eval_steps_per_second": 0.481,
"step": 100
},
{
"epoch": 1.5074626865671643,
"grad_norm": 0.2923891544342041,
"learning_rate": 2.7064483636808314e-06,
"loss": 0.012,
"step": 101
},
{
"epoch": 1.5223880597014925,
"grad_norm": 0.4166359007358551,
"learning_rate": 2.6652264608142487e-06,
"loss": 0.0207,
"step": 102
},
{
"epoch": 1.537313432835821,
"grad_norm": 0.3134080469608307,
"learning_rate": 2.623959386683056e-06,
"loss": 0.0129,
"step": 103
},
{
"epoch": 1.5522388059701493,
"grad_norm": 0.33056458830833435,
"learning_rate": 2.5826584232932707e-06,
"loss": 0.0141,
"step": 104
},
{
"epoch": 1.5671641791044775,
"grad_norm": 0.36771681904792786,
"learning_rate": 2.5413348619158966e-06,
"loss": 0.0147,
"step": 105
},
{
"epoch": 1.5671641791044775,
"eval_loss": 0.022433940321207047,
"eval_runtime": 8.2998,
"eval_samples_per_second": 14.338,
"eval_steps_per_second": 0.482,
"step": 105
},
{
"epoch": 1.582089552238806,
"grad_norm": 0.36414942145347595,
"learning_rate": 2.5e-06,
"loss": 0.017,
"step": 106
},
{
"epoch": 1.5970149253731343,
"grad_norm": 0.37462717294692993,
"learning_rate": 2.458665138084104e-06,
"loss": 0.0152,
"step": 107
},
{
"epoch": 1.6119402985074627,
"grad_norm": 0.3305724263191223,
"learning_rate": 2.4173415767067297e-06,
"loss": 0.0147,
"step": 108
},
{
"epoch": 1.626865671641791,
"grad_norm": 0.3597583472728729,
"learning_rate": 2.376040613316944e-06,
"loss": 0.0152,
"step": 109
},
{
"epoch": 1.6417910447761193,
"grad_norm": 0.32973572611808777,
"learning_rate": 2.3347735391857517e-06,
"loss": 0.0132,
"step": 110
},
{
"epoch": 1.6417910447761193,
"eval_loss": 0.022819483652710915,
"eval_runtime": 8.3196,
"eval_samples_per_second": 14.304,
"eval_steps_per_second": 0.481,
"step": 110
},
{
"epoch": 1.6567164179104479,
"grad_norm": 0.28718486428260803,
"learning_rate": 2.2935516363191695e-06,
"loss": 0.0115,
"step": 111
},
{
"epoch": 1.671641791044776,
"grad_norm": 0.3861285150051117,
"learning_rate": 2.2523861743738433e-06,
"loss": 0.0159,
"step": 112
},
{
"epoch": 1.6865671641791045,
"grad_norm": 0.465282678604126,
"learning_rate": 2.211288407576035e-06,
"loss": 0.0187,
"step": 113
},
{
"epoch": 1.7014925373134329,
"grad_norm": 0.31902506947517395,
"learning_rate": 2.1702695716448276e-06,
"loss": 0.0142,
"step": 114
},
{
"epoch": 1.716417910447761,
"grad_norm": 0.3674122989177704,
"learning_rate": 2.129340880720395e-06,
"loss": 0.0155,
"step": 115
},
{
"epoch": 1.716417910447761,
"eval_loss": 0.022671934217214584,
"eval_runtime": 8.2807,
"eval_samples_per_second": 14.371,
"eval_steps_per_second": 0.483,
"step": 115
},
{
"epoch": 1.7313432835820897,
"grad_norm": 0.4284082353115082,
"learning_rate": 2.088513524298165e-06,
"loss": 0.0185,
"step": 116
},
{
"epoch": 1.7462686567164178,
"grad_norm": 0.3359587490558624,
"learning_rate": 2.0477986641697263e-06,
"loss": 0.0161,
"step": 117
},
{
"epoch": 1.7611940298507462,
"grad_norm": 0.33396944403648376,
"learning_rate": 2.0072074313713e-06,
"loss": 0.0109,
"step": 118
},
{
"epoch": 1.7761194029850746,
"grad_norm": 0.3261159658432007,
"learning_rate": 1.9667509231406332e-06,
"loss": 0.0142,
"step": 119
},
{
"epoch": 1.7910447761194028,
"grad_norm": 0.35204392671585083,
"learning_rate": 1.9264401998831213e-06,
"loss": 0.0149,
"step": 120
},
{
"epoch": 1.7910447761194028,
"eval_loss": 0.022122090682387352,
"eval_runtime": 8.2768,
"eval_samples_per_second": 14.378,
"eval_steps_per_second": 0.483,
"step": 120
},
{
"epoch": 1.8059701492537314,
"grad_norm": 0.320769727230072,
"learning_rate": 1.8862862821480023e-06,
"loss": 0.0145,
"step": 121
},
{
"epoch": 1.8208955223880596,
"grad_norm": 0.34036341309547424,
"learning_rate": 1.8463001476154508e-06,
"loss": 0.0142,
"step": 122
},
{
"epoch": 1.835820895522388,
"grad_norm": 0.2602291405200958,
"learning_rate": 1.8064927280953893e-06,
"loss": 0.0117,
"step": 123
},
{
"epoch": 1.8507462686567164,
"grad_norm": 0.3351423442363739,
"learning_rate": 1.7668749065388385e-06,
"loss": 0.0132,
"step": 124
},
{
"epoch": 1.8656716417910446,
"grad_norm": 0.4113386869430542,
"learning_rate": 1.7274575140626318e-06,
"loss": 0.0169,
"step": 125
},
{
"epoch": 1.8656716417910446,
"eval_loss": 0.02193240076303482,
"eval_runtime": 8.2973,
"eval_samples_per_second": 14.342,
"eval_steps_per_second": 0.482,
"step": 125
},
{
"epoch": 1.8805970149253732,
"grad_norm": 0.3114112615585327,
"learning_rate": 1.6882513269882916e-06,
"loss": 0.0134,
"step": 126
},
{
"epoch": 1.8955223880597014,
"grad_norm": 0.39908191561698914,
"learning_rate": 1.6492670638958924e-06,
"loss": 0.0195,
"step": 127
},
{
"epoch": 1.9104477611940298,
"grad_norm": 0.3014167845249176,
"learning_rate": 1.6105153826937087e-06,
"loss": 0.0147,
"step": 128
},
{
"epoch": 1.9253731343283582,
"grad_norm": 0.3079487681388855,
"learning_rate": 1.5720068777044479e-06,
"loss": 0.0134,
"step": 129
},
{
"epoch": 1.9402985074626866,
"grad_norm": 0.28381192684173584,
"learning_rate": 1.53375207676887e-06,
"loss": 0.0136,
"step": 130
},
{
"epoch": 1.9402985074626866,
"eval_loss": 0.02178417146205902,
"eval_runtime": 8.276,
"eval_samples_per_second": 14.379,
"eval_steps_per_second": 0.483,
"step": 130
},
{
"epoch": 1.955223880597015,
"grad_norm": 0.344938188791275,
"learning_rate": 1.495761438367577e-06,
"loss": 0.0137,
"step": 131
},
{
"epoch": 1.9701492537313432,
"grad_norm": 0.3104920983314514,
"learning_rate": 1.4580453487617747e-06,
"loss": 0.0146,
"step": 132
},
{
"epoch": 1.9850746268656716,
"grad_norm": 0.3058537244796753,
"learning_rate": 1.4206141191537681e-06,
"loss": 0.0141,
"step": 133
},
{
"epoch": 2.0,
"grad_norm": 0.311489999294281,
"learning_rate": 1.383477982867984e-06,
"loss": 0.0127,
"step": 134
},
{
"epoch": 2.014925373134328,
"grad_norm": 0.35948511958122253,
"learning_rate": 1.346647092553281e-06,
"loss": 0.0139,
"step": 135
},
{
"epoch": 2.014925373134328,
"eval_loss": 0.02187744900584221,
"eval_runtime": 8.2762,
"eval_samples_per_second": 14.379,
"eval_steps_per_second": 0.483,
"step": 135
},
{
"epoch": 2.029850746268657,
"grad_norm": 0.24142009019851685,
"learning_rate": 1.3101315174073162e-06,
"loss": 0.011,
"step": 136
},
{
"epoch": 2.044776119402985,
"grad_norm": 0.2628946602344513,
"learning_rate": 1.2739412404237306e-06,
"loss": 0.0107,
"step": 137
},
{
"epoch": 2.0597014925373136,
"grad_norm": 0.2656816244125366,
"learning_rate": 1.2380861556628915e-06,
"loss": 0.0087,
"step": 138
},
{
"epoch": 2.074626865671642,
"grad_norm": 0.23779849708080292,
"learning_rate": 1.2025760655469629e-06,
"loss": 0.0096,
"step": 139
},
{
"epoch": 2.08955223880597,
"grad_norm": 0.2537413537502289,
"learning_rate": 1.1674206781800162e-06,
"loss": 0.0101,
"step": 140
},
{
"epoch": 2.08955223880597,
"eval_loss": 0.02204073593020439,
"eval_runtime": 8.28,
"eval_samples_per_second": 14.372,
"eval_steps_per_second": 0.483,
"step": 140
},
{
"epoch": 2.1044776119402986,
"grad_norm": 0.22632823884487152,
"learning_rate": 1.1326296046939334e-06,
"loss": 0.0097,
"step": 141
},
{
"epoch": 2.1194029850746268,
"grad_norm": 0.2632769048213959,
"learning_rate": 1.0982123566208187e-06,
"loss": 0.012,
"step": 142
},
{
"epoch": 2.1343283582089554,
"grad_norm": 0.2738955616950989,
"learning_rate": 1.0641783432926412e-06,
"loss": 0.0109,
"step": 143
},
{
"epoch": 2.1492537313432836,
"grad_norm": 0.30985602736473083,
"learning_rate": 1.0305368692688175e-06,
"loss": 0.011,
"step": 144
},
{
"epoch": 2.1641791044776117,
"grad_norm": 0.2866548001766205,
"learning_rate": 9.972971317924373e-07,
"loss": 0.0087,
"step": 145
},
{
"epoch": 2.1641791044776117,
"eval_loss": 0.02224661409854889,
"eval_runtime": 8.27,
"eval_samples_per_second": 14.389,
"eval_steps_per_second": 0.484,
"step": 145
},
{
"epoch": 2.1791044776119404,
"grad_norm": 0.29876846075057983,
"learning_rate": 9.644682182758305e-07,
"loss": 0.0114,
"step": 146
},
{
"epoch": 2.1940298507462686,
"grad_norm": 0.2641301453113556,
"learning_rate": 9.320591038161575e-07,
"loss": 0.0099,
"step": 147
},
{
"epoch": 2.208955223880597,
"grad_norm": 0.3317098915576935,
"learning_rate": 9.000786487417084e-07,
"loss": 0.0115,
"step": 148
},
{
"epoch": 2.2238805970149254,
"grad_norm": 0.22708454728126526,
"learning_rate": 8.685355961895783e-07,
"loss": 0.0088,
"step": 149
},
{
"epoch": 2.2388059701492535,
"grad_norm": 0.24335134029388428,
"learning_rate": 8.374385697153792e-07,
"loss": 0.0089,
"step": 150
},
{
"epoch": 2.2388059701492535,
"eval_loss": 0.022272665053606033,
"eval_runtime": 8.2936,
"eval_samples_per_second": 14.348,
"eval_steps_per_second": 0.482,
"step": 150
},
{
"epoch": 2.253731343283582,
"grad_norm": 0.26413631439208984,
"learning_rate": 8.067960709356479e-07,
"loss": 0.0101,
"step": 151
},
{
"epoch": 2.2686567164179103,
"grad_norm": 0.24371112883090973,
"learning_rate": 7.766164772035856e-07,
"loss": 0.0079,
"step": 152
},
{
"epoch": 2.283582089552239,
"grad_norm": 0.25570055842399597,
"learning_rate": 7.469080393187786e-07,
"loss": 0.0089,
"step": 153
},
{
"epoch": 2.298507462686567,
"grad_norm": 0.26083242893218994,
"learning_rate": 7.176788792715076e-07,
"loss": 0.008,
"step": 154
},
{
"epoch": 2.3134328358208958,
"grad_norm": 0.36927682161331177,
"learning_rate": 6.889369880222776e-07,
"loss": 0.0112,
"step": 155
},
{
"epoch": 2.3134328358208958,
"eval_loss": 0.0225172471255064,
"eval_runtime": 8.2668,
"eval_samples_per_second": 14.395,
"eval_steps_per_second": 0.484,
"step": 155
},
{
"epoch": 2.328358208955224,
"grad_norm": 0.2831665873527527,
"learning_rate": 6.60690223317171e-07,
"loss": 0.0093,
"step": 156
},
{
"epoch": 2.343283582089552,
"grad_norm": 0.2767029106616974,
"learning_rate": 6.329463075396161e-07,
"loss": 0.0093,
"step": 157
},
{
"epoch": 2.3582089552238807,
"grad_norm": 0.22897273302078247,
"learning_rate": 6.057128255991637e-07,
"loss": 0.007,
"step": 158
},
{
"epoch": 2.373134328358209,
"grad_norm": 0.2247919887304306,
"learning_rate": 5.78997222857853e-07,
"loss": 0.0081,
"step": 159
},
{
"epoch": 2.388059701492537,
"grad_norm": 0.25745531916618347,
"learning_rate": 5.528068030947193e-07,
"loss": 0.0083,
"step": 160
},
{
"epoch": 2.388059701492537,
"eval_loss": 0.022666901350021362,
"eval_runtime": 8.287,
"eval_samples_per_second": 14.36,
"eval_steps_per_second": 0.483,
"step": 160
},
{
"epoch": 2.4029850746268657,
"grad_norm": 0.2648358643054962,
"learning_rate": 5.271487265090163e-07,
"loss": 0.009,
"step": 161
},
{
"epoch": 2.417910447761194,
"grad_norm": 0.28286054730415344,
"learning_rate": 5.020300077626883e-07,
"loss": 0.0101,
"step": 162
},
{
"epoch": 2.4328358208955225,
"grad_norm": 0.28459128737449646,
"learning_rate": 4.774575140626317e-07,
"loss": 0.0087,
"step": 163
},
{
"epoch": 2.4477611940298507,
"grad_norm": 0.21343541145324707,
"learning_rate": 4.534379632832692e-07,
"loss": 0.0079,
"step": 164
},
{
"epoch": 2.4626865671641793,
"grad_norm": 0.2659505009651184,
"learning_rate": 4.299779221299499e-07,
"loss": 0.008,
"step": 165
},
{
"epoch": 2.4626865671641793,
"eval_loss": 0.022702785208821297,
"eval_runtime": 8.2796,
"eval_samples_per_second": 14.373,
"eval_steps_per_second": 0.483,
"step": 165
},
{
"epoch": 2.4776119402985075,
"grad_norm": 0.29091840982437134,
"learning_rate": 4.070838043436787e-07,
"loss": 0.0093,
"step": 166
},
{
"epoch": 2.4925373134328357,
"grad_norm": 0.24148190021514893,
"learning_rate": 3.847618689476612e-07,
"loss": 0.0083,
"step": 167
},
{
"epoch": 2.5074626865671643,
"grad_norm": 0.33075305819511414,
"learning_rate": 3.630182185361522e-07,
"loss": 0.0081,
"step": 168
},
{
"epoch": 2.5223880597014925,
"grad_norm": 0.24389711022377014,
"learning_rate": 3.4185879760606525e-07,
"loss": 0.0079,
"step": 169
},
{
"epoch": 2.5373134328358207,
"grad_norm": 0.32192865014076233,
"learning_rate": 3.2128939093180654e-07,
"loss": 0.0109,
"step": 170
},
{
"epoch": 2.5373134328358207,
"eval_loss": 0.022760972380638123,
"eval_runtime": 8.2746,
"eval_samples_per_second": 14.381,
"eval_steps_per_second": 0.483,
"step": 170
},
{
"epoch": 2.5522388059701493,
"grad_norm": 0.3323243260383606,
"learning_rate": 3.0131562198377763e-07,
"loss": 0.0074,
"step": 171
},
{
"epoch": 2.5671641791044775,
"grad_norm": 0.209702268242836,
"learning_rate": 2.819429513909705e-07,
"loss": 0.006,
"step": 172
},
{
"epoch": 2.582089552238806,
"grad_norm": 0.29595962166786194,
"learning_rate": 2.6317667544809135e-07,
"loss": 0.0102,
"step": 173
},
{
"epoch": 2.5970149253731343,
"grad_norm": 0.23989610373973846,
"learning_rate": 2.450219246676028e-07,
"loss": 0.0073,
"step": 174
},
{
"epoch": 2.611940298507463,
"grad_norm": 0.3013235330581665,
"learning_rate": 2.2748366237709374e-07,
"loss": 0.0103,
"step": 175
},
{
"epoch": 2.611940298507463,
"eval_loss": 0.02282480150461197,
"eval_runtime": 8.2894,
"eval_samples_per_second": 14.356,
"eval_steps_per_second": 0.483,
"step": 175
},
{
"epoch": 2.626865671641791,
"grad_norm": 0.23502513766288757,
"learning_rate": 2.1056668336235624e-07,
"loss": 0.0089,
"step": 176
},
{
"epoch": 2.6417910447761193,
"grad_norm": 0.26030927896499634,
"learning_rate": 1.9427561255653816e-07,
"loss": 0.0082,
"step": 177
},
{
"epoch": 2.656716417910448,
"grad_norm": 0.27521783113479614,
"learning_rate": 1.786149037757326e-07,
"loss": 0.0089,
"step": 178
},
{
"epoch": 2.671641791044776,
"grad_norm": 0.3519136905670166,
"learning_rate": 1.6358883850134815e-07,
"loss": 0.0121,
"step": 179
},
{
"epoch": 2.6865671641791042,
"grad_norm": 0.25255823135375977,
"learning_rate": 1.492015247095971e-07,
"loss": 0.008,
"step": 180
},
{
"epoch": 2.6865671641791042,
"eval_loss": 0.02286040224134922,
"eval_runtime": 8.2767,
"eval_samples_per_second": 14.378,
"eval_steps_per_second": 0.483,
"step": 180
},
{
"epoch": 2.701492537313433,
"grad_norm": 0.24869883060455322,
"learning_rate": 1.3545689574841341e-07,
"loss": 0.0072,
"step": 181
},
{
"epoch": 2.716417910447761,
"grad_norm": 0.3076685070991516,
"learning_rate": 1.223587092621162e-07,
"loss": 0.009,
"step": 182
},
{
"epoch": 2.7313432835820897,
"grad_norm": 0.2468510866165161,
"learning_rate": 1.099105461641059e-07,
"loss": 0.0091,
"step": 183
},
{
"epoch": 2.746268656716418,
"grad_norm": 0.4652602970600128,
"learning_rate": 9.811580965787965e-08,
"loss": 0.0078,
"step": 184
},
{
"epoch": 2.7611940298507465,
"grad_norm": 0.30688929557800293,
"learning_rate": 8.697772430662859e-08,
"loss": 0.0103,
"step": 185
},
{
"epoch": 2.7611940298507465,
"eval_loss": 0.022866524755954742,
"eval_runtime": 8.2907,
"eval_samples_per_second": 14.353,
"eval_steps_per_second": 0.482,
"step": 185
},
{
"epoch": 2.7761194029850746,
"grad_norm": 0.3009447753429413,
"learning_rate": 7.649933515167407e-08,
"loss": 0.0077,
"step": 186
},
{
"epoch": 2.791044776119403,
"grad_norm": 0.3035335838794708,
"learning_rate": 6.668350687998565e-08,
"loss": 0.0104,
"step": 187
},
{
"epoch": 2.8059701492537314,
"grad_norm": 0.2949415445327759,
"learning_rate": 5.753292304100183e-08,
"loss": 0.0084,
"step": 188
},
{
"epoch": 2.8208955223880596,
"grad_norm": 0.26432371139526367,
"learning_rate": 4.905008531297661e-08,
"loss": 0.0085,
"step": 189
},
{
"epoch": 2.835820895522388,
"grad_norm": 0.25077345967292786,
"learning_rate": 4.123731281904408e-08,
"loss": 0.008,
"step": 190
},
{
"epoch": 2.835820895522388,
"eval_loss": 0.02286113053560257,
"eval_runtime": 8.2705,
"eval_samples_per_second": 14.389,
"eval_steps_per_second": 0.484,
"step": 190
},
{
"epoch": 2.8507462686567164,
"grad_norm": 0.2702929675579071,
"learning_rate": 3.4096741493194196e-08,
"loss": 0.0096,
"step": 191
},
{
"epoch": 2.8656716417910446,
"grad_norm": 0.3009881377220154,
"learning_rate": 2.763032349632877e-08,
"loss": 0.0098,
"step": 192
},
{
"epoch": 2.8805970149253732,
"grad_norm": 0.24812233448028564,
"learning_rate": 2.1839826682562015e-08,
"loss": 0.0081,
"step": 193
},
{
"epoch": 2.8955223880597014,
"grad_norm": 0.2779428958892822,
"learning_rate": 1.6726834115904645e-08,
"loss": 0.008,
"step": 194
},
{
"epoch": 2.91044776119403,
"grad_norm": 0.3340453803539276,
"learning_rate": 1.2292743637471461e-08,
"loss": 0.0109,
"step": 195
},
{
"epoch": 2.91044776119403,
"eval_loss": 0.022852875292301178,
"eval_runtime": 8.2644,
"eval_samples_per_second": 14.399,
"eval_steps_per_second": 0.484,
"step": 195
},
{
"epoch": 2.925373134328358,
"grad_norm": 0.2553161382675171,
"learning_rate": 8.538767483325384e-09,
"loss": 0.0068,
"step": 196
},
{
"epoch": 2.9402985074626864,
"grad_norm": 0.2572329044342041,
"learning_rate": 5.465931953063663e-09,
"loss": 0.0097,
"step": 197
},
{
"epoch": 2.955223880597015,
"grad_norm": 0.25870975852012634,
"learning_rate": 3.0750771292381575e-09,
"loss": 0.0093,
"step": 198
},
{
"epoch": 2.970149253731343,
"grad_norm": 0.21340855956077576,
"learning_rate": 1.3668566476848777e-09,
"loss": 0.0075,
"step": 199
},
{
"epoch": 2.9850746268656714,
"grad_norm": 0.3591207265853882,
"learning_rate": 3.4173751882748964e-10,
"loss": 0.009,
"step": 200
},
{
"epoch": 2.9850746268656714,
"eval_loss": 0.022861387580633163,
"eval_runtime": 8.2928,
"eval_samples_per_second": 14.35,
"eval_steps_per_second": 0.482,
"step": 200
},
{
"epoch": 3.0,
"grad_norm": 0.2385999709367752,
"learning_rate": 0.0,
"loss": 0.0082,
"step": 201
},
{
"epoch": 3.0,
"step": 201,
"total_flos": 21438131871744.0,
"train_loss": 0.020334236270548842,
"train_runtime": 4074.9852,
"train_samples_per_second": 0.786,
"train_steps_per_second": 0.049
}
],
"logging_steps": 1,
"max_steps": 201,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 81,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 21438131871744.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}