{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 400,
"global_step": 375,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016,
"grad_norm": 0.5721463561058044,
"learning_rate": 0.000997326203208556,
"loss": 22.6723,
"step": 2
},
{
"epoch": 0.032,
"grad_norm": 1.5221903324127197,
"learning_rate": 0.0009919786096256684,
"loss": 22.7502,
"step": 4
},
{
"epoch": 0.048,
"grad_norm": 3.269012212753296,
"learning_rate": 0.0009866310160427808,
"loss": 22.2706,
"step": 6
},
{
"epoch": 0.064,
"grad_norm": 4.567020416259766,
"learning_rate": 0.0009812834224598931,
"loss": 21.3625,
"step": 8
},
{
"epoch": 0.08,
"grad_norm": 7.019204139709473,
"learning_rate": 0.0009759358288770054,
"loss": 20.7279,
"step": 10
},
{
"epoch": 0.096,
"grad_norm": 8.498096466064453,
"learning_rate": 0.0009705882352941176,
"loss": 20.8221,
"step": 12
},
{
"epoch": 0.112,
"grad_norm": 7.8151397705078125,
"learning_rate": 0.00096524064171123,
"loss": 20.4136,
"step": 14
},
{
"epoch": 0.128,
"grad_norm": 8.028499603271484,
"learning_rate": 0.0009598930481283422,
"loss": 20.2719,
"step": 16
},
{
"epoch": 0.144,
"grad_norm": 8.516434669494629,
"learning_rate": 0.0009545454545454546,
"loss": 20.1681,
"step": 18
},
{
"epoch": 0.16,
"grad_norm": 8.52490520477295,
"learning_rate": 0.0009491978609625669,
"loss": 19.8895,
"step": 20
},
{
"epoch": 0.176,
"grad_norm": 6.709629058837891,
"learning_rate": 0.0009438502673796791,
"loss": 19.93,
"step": 22
},
{
"epoch": 0.192,
"grad_norm": 6.038687705993652,
"learning_rate": 0.0009385026737967914,
"loss": 19.6312,
"step": 24
},
{
"epoch": 0.208,
"grad_norm": 5.785665512084961,
"learning_rate": 0.0009331550802139037,
"loss": 19.7683,
"step": 26
},
{
"epoch": 0.224,
"grad_norm": 5.79067850112915,
"learning_rate": 0.0009278074866310161,
"loss": 19.6965,
"step": 28
},
{
"epoch": 0.24,
"grad_norm": 5.166928291320801,
"learning_rate": 0.0009224598930481284,
"loss": 19.4005,
"step": 30
},
{
"epoch": 0.256,
"grad_norm": 4.578023433685303,
"learning_rate": 0.0009171122994652407,
"loss": 19.3963,
"step": 32
},
{
"epoch": 0.272,
"grad_norm": 4.7540693283081055,
"learning_rate": 0.0009117647058823529,
"loss": 19.4129,
"step": 34
},
{
"epoch": 0.288,
"grad_norm": 5.394408226013184,
"learning_rate": 0.0009064171122994653,
"loss": 19.5821,
"step": 36
},
{
"epoch": 0.304,
"grad_norm": 4.4902753829956055,
"learning_rate": 0.0009010695187165776,
"loss": 19.6562,
"step": 38
},
{
"epoch": 0.32,
"grad_norm": 5.49019193649292,
"learning_rate": 0.0008957219251336899,
"loss": 19.3588,
"step": 40
},
{
"epoch": 0.336,
"grad_norm": 4.184142589569092,
"learning_rate": 0.0008903743315508022,
"loss": 18.9032,
"step": 42
},
{
"epoch": 0.352,
"grad_norm": 3.98618483543396,
"learning_rate": 0.0008850267379679144,
"loss": 19.1882,
"step": 44
},
{
"epoch": 0.368,
"grad_norm": 4.851687908172607,
"learning_rate": 0.0008796791443850267,
"loss": 19.4565,
"step": 46
},
{
"epoch": 0.384,
"grad_norm": 4.108444690704346,
"learning_rate": 0.0008743315508021391,
"loss": 19.6149,
"step": 48
},
{
"epoch": 0.4,
"grad_norm": 3.7055838108062744,
"learning_rate": 0.0008689839572192514,
"loss": 18.9573,
"step": 50
},
{
"epoch": 0.416,
"grad_norm": 4.930137634277344,
"learning_rate": 0.0008636363636363636,
"loss": 19.4389,
"step": 52
},
{
"epoch": 0.432,
"grad_norm": 3.910098075866699,
"learning_rate": 0.000858288770053476,
"loss": 19.1465,
"step": 54
},
{
"epoch": 0.448,
"grad_norm": 4.0127716064453125,
"learning_rate": 0.0008529411764705882,
"loss": 19.5038,
"step": 56
},
{
"epoch": 0.464,
"grad_norm": 4.495028018951416,
"learning_rate": 0.0008475935828877005,
"loss": 19.3252,
"step": 58
},
{
"epoch": 0.48,
"grad_norm": 3.7703821659088135,
"learning_rate": 0.0008422459893048129,
"loss": 19.0238,
"step": 60
},
{
"epoch": 0.496,
"grad_norm": 3.6335291862487793,
"learning_rate": 0.0008368983957219252,
"loss": 19.1296,
"step": 62
},
{
"epoch": 0.512,
"grad_norm": 3.819183588027954,
"learning_rate": 0.0008315508021390374,
"loss": 18.4946,
"step": 64
},
{
"epoch": 0.528,
"grad_norm": 3.3171255588531494,
"learning_rate": 0.0008262032085561497,
"loss": 18.8054,
"step": 66
},
{
"epoch": 0.544,
"grad_norm": 4.316566467285156,
"learning_rate": 0.000820855614973262,
"loss": 19.162,
"step": 68
},
{
"epoch": 0.56,
"grad_norm": 3.39648175239563,
"learning_rate": 0.0008155080213903744,
"loss": 18.5671,
"step": 70
},
{
"epoch": 0.576,
"grad_norm": 3.7200136184692383,
"learning_rate": 0.0008101604278074867,
"loss": 18.9179,
"step": 72
},
{
"epoch": 0.592,
"grad_norm": 3.6730430126190186,
"learning_rate": 0.0008048128342245989,
"loss": 18.7162,
"step": 74
},
{
"epoch": 0.608,
"grad_norm": 3.5580945014953613,
"learning_rate": 0.0007994652406417113,
"loss": 19.0574,
"step": 76
},
{
"epoch": 0.624,
"grad_norm": 3.4793589115142822,
"learning_rate": 0.0007941176470588235,
"loss": 18.8649,
"step": 78
},
{
"epoch": 0.64,
"grad_norm": 4.074679374694824,
"learning_rate": 0.0007887700534759359,
"loss": 18.5553,
"step": 80
},
{
"epoch": 0.656,
"grad_norm": 3.315810441970825,
"learning_rate": 0.0007834224598930482,
"loss": 18.2136,
"step": 82
},
{
"epoch": 0.672,
"grad_norm": 4.288172721862793,
"learning_rate": 0.0007780748663101605,
"loss": 18.6089,
"step": 84
},
{
"epoch": 0.688,
"grad_norm": 3.5749149322509766,
"learning_rate": 0.0007727272727272727,
"loss": 18.8697,
"step": 86
},
{
"epoch": 0.704,
"grad_norm": 3.608825206756592,
"learning_rate": 0.000767379679144385,
"loss": 18.4129,
"step": 88
},
{
"epoch": 0.72,
"grad_norm": 3.5199592113494873,
"learning_rate": 0.0007620320855614974,
"loss": 18.1619,
"step": 90
},
{
"epoch": 0.736,
"grad_norm": 3.5022549629211426,
"learning_rate": 0.0007566844919786096,
"loss": 18.7368,
"step": 92
},
{
"epoch": 0.752,
"grad_norm": 3.6002230644226074,
"learning_rate": 0.000751336898395722,
"loss": 18.7792,
"step": 94
},
{
"epoch": 0.768,
"grad_norm": 4.682362079620361,
"learning_rate": 0.0007459893048128342,
"loss": 18.5495,
"step": 96
},
{
"epoch": 0.784,
"grad_norm": 3.6108767986297607,
"learning_rate": 0.0007406417112299465,
"loss": 18.7077,
"step": 98
},
{
"epoch": 0.8,
"grad_norm": 3.4719815254211426,
"learning_rate": 0.0007352941176470589,
"loss": 18.3262,
"step": 100
},
{
"epoch": 0.816,
"grad_norm": 4.4115986824035645,
"learning_rate": 0.0007299465240641712,
"loss": 18.3416,
"step": 102
},
{
"epoch": 0.832,
"grad_norm": 3.324169158935547,
"learning_rate": 0.0007245989304812834,
"loss": 18.7297,
"step": 104
},
{
"epoch": 0.848,
"grad_norm": 3.4287421703338623,
"learning_rate": 0.0007192513368983958,
"loss": 18.4499,
"step": 106
},
{
"epoch": 0.864,
"grad_norm": 3.9451239109039307,
"learning_rate": 0.000713903743315508,
"loss": 18.2669,
"step": 108
},
{
"epoch": 0.88,
"grad_norm": 3.5031988620758057,
"learning_rate": 0.0007085561497326202,
"loss": 18.8895,
"step": 110
},
{
"epoch": 0.896,
"grad_norm": 3.5174903869628906,
"learning_rate": 0.0007032085561497327,
"loss": 18.2961,
"step": 112
},
{
"epoch": 0.912,
"grad_norm": 4.080729961395264,
"learning_rate": 0.0006978609625668449,
"loss": 18.5613,
"step": 114
},
{
"epoch": 0.928,
"grad_norm": 3.7523930072784424,
"learning_rate": 0.0006925133689839572,
"loss": 18.5538,
"step": 116
},
{
"epoch": 0.944,
"grad_norm": 3.066669225692749,
"learning_rate": 0.0006871657754010695,
"loss": 18.6904,
"step": 118
},
{
"epoch": 0.96,
"grad_norm": 4.274256706237793,
"learning_rate": 0.0006818181818181818,
"loss": 18.6147,
"step": 120
},
{
"epoch": 0.976,
"grad_norm": 3.690139055252075,
"learning_rate": 0.0006764705882352942,
"loss": 18.1693,
"step": 122
},
{
"epoch": 0.992,
"grad_norm": 3.6681807041168213,
"learning_rate": 0.0006711229946524065,
"loss": 18.2498,
"step": 124
},
{
"epoch": 1.008,
"grad_norm": 3.5203354358673096,
"learning_rate": 0.0006657754010695187,
"loss": 18.4522,
"step": 126
},
{
"epoch": 1.024,
"grad_norm": 4.650991439819336,
"learning_rate": 0.000660427807486631,
"loss": 18.2839,
"step": 128
},
{
"epoch": 1.04,
"grad_norm": 3.7944228649139404,
"learning_rate": 0.0006550802139037433,
"loss": 18.051,
"step": 130
},
{
"epoch": 1.056,
"grad_norm": 3.2437500953674316,
"learning_rate": 0.0006497326203208556,
"loss": 18.1842,
"step": 132
},
{
"epoch": 1.072,
"grad_norm": 3.2863543033599854,
"learning_rate": 0.000644385026737968,
"loss": 18.2304,
"step": 134
},
{
"epoch": 1.088,
"grad_norm": 3.553260326385498,
"learning_rate": 0.0006390374331550802,
"loss": 18.1385,
"step": 136
},
{
"epoch": 1.104,
"grad_norm": 3.4277195930480957,
"learning_rate": 0.0006336898395721925,
"loss": 18.1337,
"step": 138
},
{
"epoch": 1.12,
"grad_norm": 3.974073886871338,
"learning_rate": 0.0006283422459893048,
"loss": 18.0326,
"step": 140
},
{
"epoch": 1.1360000000000001,
"grad_norm": 3.3450510501861572,
"learning_rate": 0.0006229946524064172,
"loss": 18.2695,
"step": 142
},
{
"epoch": 1.152,
"grad_norm": 3.2181997299194336,
"learning_rate": 0.0006176470588235294,
"loss": 18.0315,
"step": 144
},
{
"epoch": 1.168,
"grad_norm": 3.8346364498138428,
"learning_rate": 0.0006122994652406418,
"loss": 18.4272,
"step": 146
},
{
"epoch": 1.184,
"grad_norm": 3.2085418701171875,
"learning_rate": 0.000606951871657754,
"loss": 18.1768,
"step": 148
},
{
"epoch": 1.2,
"grad_norm": 3.462108850479126,
"learning_rate": 0.0006016042780748662,
"loss": 18.1731,
"step": 150
},
{
"epoch": 1.216,
"grad_norm": 3.444965362548828,
"learning_rate": 0.0005962566844919787,
"loss": 18.3599,
"step": 152
},
{
"epoch": 1.232,
"grad_norm": 3.3701171875,
"learning_rate": 0.0005909090909090909,
"loss": 18.1495,
"step": 154
},
{
"epoch": 1.248,
"grad_norm": 3.5145843029022217,
"learning_rate": 0.0005855614973262032,
"loss": 18.0835,
"step": 156
},
{
"epoch": 1.264,
"grad_norm": 3.4785313606262207,
"learning_rate": 0.0005802139037433155,
"loss": 17.8138,
"step": 158
},
{
"epoch": 1.28,
"grad_norm": 3.9735538959503174,
"learning_rate": 0.0005748663101604278,
"loss": 18.0071,
"step": 160
},
{
"epoch": 1.296,
"grad_norm": 3.650447368621826,
"learning_rate": 0.00056951871657754,
"loss": 18.0124,
"step": 162
},
{
"epoch": 1.312,
"grad_norm": 3.6459813117980957,
"learning_rate": 0.0005641711229946525,
"loss": 18.0059,
"step": 164
},
{
"epoch": 1.328,
"grad_norm": 3.2154831886291504,
"learning_rate": 0.0005588235294117647,
"loss": 17.9694,
"step": 166
},
{
"epoch": 1.3439999999999999,
"grad_norm": 3.367403507232666,
"learning_rate": 0.0005534759358288771,
"loss": 17.6557,
"step": 168
},
{
"epoch": 1.3599999999999999,
"grad_norm": 3.9948298931121826,
"learning_rate": 0.0005481283422459893,
"loss": 18.1942,
"step": 170
},
{
"epoch": 1.376,
"grad_norm": 3.3495073318481445,
"learning_rate": 0.0005427807486631015,
"loss": 18.2016,
"step": 172
},
{
"epoch": 1.392,
"grad_norm": 3.373162269592285,
"learning_rate": 0.000537433155080214,
"loss": 18.0422,
"step": 174
},
{
"epoch": 1.408,
"grad_norm": 4.063633441925049,
"learning_rate": 0.0005320855614973262,
"loss": 18.0809,
"step": 176
},
{
"epoch": 1.424,
"grad_norm": 3.4912514686584473,
"learning_rate": 0.0005267379679144385,
"loss": 18.0674,
"step": 178
},
{
"epoch": 1.44,
"grad_norm": 3.5900015830993652,
"learning_rate": 0.0005213903743315508,
"loss": 17.9285,
"step": 180
},
{
"epoch": 1.456,
"grad_norm": 4.066802024841309,
"learning_rate": 0.0005160427807486631,
"loss": 18.1551,
"step": 182
},
{
"epoch": 1.472,
"grad_norm": 3.9782357215881348,
"learning_rate": 0.0005106951871657754,
"loss": 18.0509,
"step": 184
},
{
"epoch": 1.488,
"grad_norm": 3.314682960510254,
"learning_rate": 0.0005053475935828878,
"loss": 17.7608,
"step": 186
},
{
"epoch": 1.504,
"grad_norm": 3.3548595905303955,
"learning_rate": 0.0005,
"loss": 17.8103,
"step": 188
},
{
"epoch": 1.52,
"grad_norm": 3.3475797176361084,
"learning_rate": 0.0004946524064171123,
"loss": 17.9465,
"step": 190
},
{
"epoch": 1.536,
"grad_norm": 3.4256432056427,
"learning_rate": 0.0004893048128342246,
"loss": 17.6619,
"step": 192
},
{
"epoch": 1.552,
"grad_norm": 3.390056848526001,
"learning_rate": 0.0004839572192513369,
"loss": 17.9681,
"step": 194
},
{
"epoch": 1.568,
"grad_norm": 3.4441208839416504,
"learning_rate": 0.00047860962566844924,
"loss": 17.9407,
"step": 196
},
{
"epoch": 1.584,
"grad_norm": 3.2374165058135986,
"learning_rate": 0.0004732620320855615,
"loss": 17.7235,
"step": 198
},
{
"epoch": 1.6,
"grad_norm": 3.5628514289855957,
"learning_rate": 0.0004679144385026738,
"loss": 18.1743,
"step": 200
},
{
"epoch": 1.616,
"grad_norm": 3.41139554977417,
"learning_rate": 0.00046256684491978613,
"loss": 17.8456,
"step": 202
},
{
"epoch": 1.6320000000000001,
"grad_norm": 3.423110008239746,
"learning_rate": 0.0004572192513368984,
"loss": 17.6656,
"step": 204
},
{
"epoch": 1.6480000000000001,
"grad_norm": 3.3344337940216064,
"learning_rate": 0.00045187165775401067,
"loss": 17.962,
"step": 206
},
{
"epoch": 1.6640000000000001,
"grad_norm": 3.5036981105804443,
"learning_rate": 0.000446524064171123,
"loss": 18.0875,
"step": 208
},
{
"epoch": 1.6800000000000002,
"grad_norm": 3.4953839778900146,
"learning_rate": 0.0004411764705882353,
"loss": 17.3435,
"step": 210
},
{
"epoch": 1.696,
"grad_norm": 3.6864068508148193,
"learning_rate": 0.0004358288770053476,
"loss": 17.9087,
"step": 212
},
{
"epoch": 1.712,
"grad_norm": 3.4755449295043945,
"learning_rate": 0.0004304812834224599,
"loss": 17.5076,
"step": 214
},
{
"epoch": 1.728,
"grad_norm": 3.8116891384124756,
"learning_rate": 0.0004251336898395722,
"loss": 17.9272,
"step": 216
},
{
"epoch": 1.744,
"grad_norm": 3.18284010887146,
"learning_rate": 0.0004197860962566845,
"loss": 17.7148,
"step": 218
},
{
"epoch": 1.76,
"grad_norm": 3.2884979248046875,
"learning_rate": 0.0004144385026737968,
"loss": 17.8813,
"step": 220
},
{
"epoch": 1.776,
"grad_norm": 3.3735768795013428,
"learning_rate": 0.00040909090909090913,
"loss": 18.0372,
"step": 222
},
{
"epoch": 1.792,
"grad_norm": 3.2611794471740723,
"learning_rate": 0.00040374331550802143,
"loss": 17.3771,
"step": 224
},
{
"epoch": 1.808,
"grad_norm": 3.3338570594787598,
"learning_rate": 0.00039839572192513367,
"loss": 18.4657,
"step": 226
},
{
"epoch": 1.8239999999999998,
"grad_norm": 3.405127763748169,
"learning_rate": 0.000393048128342246,
"loss": 17.9076,
"step": 228
},
{
"epoch": 1.8399999999999999,
"grad_norm": 3.561793565750122,
"learning_rate": 0.0003877005347593583,
"loss": 17.8996,
"step": 230
},
{
"epoch": 1.8559999999999999,
"grad_norm": 3.5615479946136475,
"learning_rate": 0.00038235294117647055,
"loss": 17.6746,
"step": 232
},
{
"epoch": 1.8719999999999999,
"grad_norm": 3.4306275844573975,
"learning_rate": 0.0003770053475935829,
"loss": 17.7182,
"step": 234
},
{
"epoch": 1.888,
"grad_norm": 3.5057003498077393,
"learning_rate": 0.0003716577540106952,
"loss": 17.8058,
"step": 236
},
{
"epoch": 1.904,
"grad_norm": 3.3117101192474365,
"learning_rate": 0.0003663101604278075,
"loss": 17.8643,
"step": 238
},
{
"epoch": 1.92,
"grad_norm": 3.6897945404052734,
"learning_rate": 0.0003609625668449198,
"loss": 17.8266,
"step": 240
},
{
"epoch": 1.936,
"grad_norm": 3.7577505111694336,
"learning_rate": 0.0003556149732620321,
"loss": 18.6381,
"step": 242
},
{
"epoch": 1.952,
"grad_norm": 3.2401480674743652,
"learning_rate": 0.0003502673796791444,
"loss": 17.6933,
"step": 244
},
{
"epoch": 1.968,
"grad_norm": 3.6619515419006348,
"learning_rate": 0.0003449197860962567,
"loss": 18.0547,
"step": 246
},
{
"epoch": 1.984,
"grad_norm": 3.8387668132781982,
"learning_rate": 0.000339572192513369,
"loss": 17.7932,
"step": 248
},
{
"epoch": 2.0,
"grad_norm": 3.390653371810913,
"learning_rate": 0.0003342245989304813,
"loss": 17.2655,
"step": 250
},
{
"epoch": 2.016,
"grad_norm": 3.40058970451355,
"learning_rate": 0.00032887700534759356,
"loss": 17.703,
"step": 252
},
{
"epoch": 2.032,
"grad_norm": 3.568702220916748,
"learning_rate": 0.0003235294117647059,
"loss": 17.2042,
"step": 254
},
{
"epoch": 2.048,
"grad_norm": 3.529431104660034,
"learning_rate": 0.0003181818181818182,
"loss": 17.5732,
"step": 256
},
{
"epoch": 2.064,
"grad_norm": 3.3919003009796143,
"learning_rate": 0.00031283422459893044,
"loss": 17.6191,
"step": 258
},
{
"epoch": 2.08,
"grad_norm": 3.878042459487915,
"learning_rate": 0.0003074866310160428,
"loss": 17.4911,
"step": 260
},
{
"epoch": 2.096,
"grad_norm": 3.772318124771118,
"learning_rate": 0.0003021390374331551,
"loss": 17.7258,
"step": 262
},
{
"epoch": 2.112,
"grad_norm": 3.4453060626983643,
"learning_rate": 0.0002967914438502674,
"loss": 17.4906,
"step": 264
},
{
"epoch": 2.128,
"grad_norm": 3.4957454204559326,
"learning_rate": 0.0002914438502673797,
"loss": 17.5716,
"step": 266
},
{
"epoch": 2.144,
"grad_norm": 3.530831813812256,
"learning_rate": 0.000286096256684492,
"loss": 17.4089,
"step": 268
},
{
"epoch": 2.16,
"grad_norm": 3.7524755001068115,
"learning_rate": 0.0002807486631016043,
"loss": 17.7712,
"step": 270
},
{
"epoch": 2.176,
"grad_norm": 3.297961711883545,
"learning_rate": 0.00027540106951871656,
"loss": 17.4408,
"step": 272
},
{
"epoch": 2.192,
"grad_norm": 3.3661088943481445,
"learning_rate": 0.0002700534759358289,
"loss": 17.6753,
"step": 274
},
{
"epoch": 2.208,
"grad_norm": 3.646210193634033,
"learning_rate": 0.0002647058823529412,
"loss": 17.7821,
"step": 276
},
{
"epoch": 2.224,
"grad_norm": 3.475140333175659,
"learning_rate": 0.00025935828877005345,
"loss": 17.6129,
"step": 278
},
{
"epoch": 2.24,
"grad_norm": 3.4734578132629395,
"learning_rate": 0.0002540106951871658,
"loss": 17.6856,
"step": 280
},
{
"epoch": 2.2560000000000002,
"grad_norm": 3.491572380065918,
"learning_rate": 0.0002486631016042781,
"loss": 17.6071,
"step": 282
},
{
"epoch": 2.2720000000000002,
"grad_norm": 3.4102542400360107,
"learning_rate": 0.0002433155080213904,
"loss": 17.352,
"step": 284
},
{
"epoch": 2.288,
"grad_norm": 3.393477439880371,
"learning_rate": 0.00023796791443850268,
"loss": 17.2612,
"step": 286
},
{
"epoch": 2.304,
"grad_norm": 3.112462282180786,
"learning_rate": 0.000232620320855615,
"loss": 17.3272,
"step": 288
},
{
"epoch": 2.32,
"grad_norm": 3.3398191928863525,
"learning_rate": 0.00022727272727272727,
"loss": 17.5815,
"step": 290
},
{
"epoch": 2.336,
"grad_norm": 3.5039889812469482,
"learning_rate": 0.00022192513368983957,
"loss": 17.7557,
"step": 292
},
{
"epoch": 2.352,
"grad_norm": 3.532892942428589,
"learning_rate": 0.0002165775401069519,
"loss": 18.0523,
"step": 294
},
{
"epoch": 2.368,
"grad_norm": 3.2969062328338623,
"learning_rate": 0.00021122994652406418,
"loss": 17.7496,
"step": 296
},
{
"epoch": 2.384,
"grad_norm": 3.262855291366577,
"learning_rate": 0.00020588235294117645,
"loss": 17.793,
"step": 298
},
{
"epoch": 2.4,
"grad_norm": 3.459914445877075,
"learning_rate": 0.00020053475935828877,
"loss": 17.9245,
"step": 300
},
{
"epoch": 2.416,
"grad_norm": 3.6749696731567383,
"learning_rate": 0.00019518716577540107,
"loss": 17.7125,
"step": 302
},
{
"epoch": 2.432,
"grad_norm": 3.266754150390625,
"learning_rate": 0.0001898395721925134,
"loss": 17.5905,
"step": 304
},
{
"epoch": 2.448,
"grad_norm": 3.1848971843719482,
"learning_rate": 0.00018449197860962566,
"loss": 17.523,
"step": 306
},
{
"epoch": 2.464,
"grad_norm": 3.2962844371795654,
"learning_rate": 0.00017914438502673795,
"loss": 17.5297,
"step": 308
},
{
"epoch": 2.48,
"grad_norm": 3.4688000679016113,
"learning_rate": 0.00017379679144385028,
"loss": 17.6315,
"step": 310
},
{
"epoch": 2.496,
"grad_norm": 3.4146833419799805,
"learning_rate": 0.00016844919786096257,
"loss": 17.5776,
"step": 312
},
{
"epoch": 2.512,
"grad_norm": 3.3122944831848145,
"learning_rate": 0.0001631016042780749,
"loss": 17.7264,
"step": 314
},
{
"epoch": 2.528,
"grad_norm": 3.2939462661743164,
"learning_rate": 0.00015775401069518716,
"loss": 17.48,
"step": 316
},
{
"epoch": 2.544,
"grad_norm": 3.8504631519317627,
"learning_rate": 0.00015240641711229946,
"loss": 17.3854,
"step": 318
},
{
"epoch": 2.56,
"grad_norm": 4.062356948852539,
"learning_rate": 0.00014705882352941178,
"loss": 17.6811,
"step": 320
},
{
"epoch": 2.576,
"grad_norm": 3.741989850997925,
"learning_rate": 0.00014171122994652407,
"loss": 17.4078,
"step": 322
},
{
"epoch": 2.592,
"grad_norm": 3.7287967205047607,
"learning_rate": 0.00013636363636363637,
"loss": 17.3517,
"step": 324
},
{
"epoch": 2.608,
"grad_norm": 3.6224465370178223,
"learning_rate": 0.00013101604278074866,
"loss": 17.254,
"step": 326
},
{
"epoch": 2.624,
"grad_norm": 3.5674147605895996,
"learning_rate": 0.00012566844919786096,
"loss": 17.869,
"step": 328
},
{
"epoch": 2.64,
"grad_norm": 3.722736358642578,
"learning_rate": 0.00012032085561497325,
"loss": 17.7399,
"step": 330
},
{
"epoch": 2.656,
"grad_norm": 3.6463096141815186,
"learning_rate": 0.00011497326203208556,
"loss": 17.5016,
"step": 332
},
{
"epoch": 2.672,
"grad_norm": 3.5358524322509766,
"learning_rate": 0.00010962566844919786,
"loss": 17.0355,
"step": 334
},
{
"epoch": 2.6879999999999997,
"grad_norm": 3.5321309566497803,
"learning_rate": 0.00010427807486631017,
"loss": 17.5089,
"step": 336
},
{
"epoch": 2.7039999999999997,
"grad_norm": 3.4019291400909424,
"learning_rate": 9.893048128342247e-05,
"loss": 17.3768,
"step": 338
},
{
"epoch": 2.7199999999999998,
"grad_norm": 3.4486570358276367,
"learning_rate": 9.358288770053476e-05,
"loss": 17.488,
"step": 340
},
{
"epoch": 2.7359999999999998,
"grad_norm": 3.7740256786346436,
"learning_rate": 8.823529411764706e-05,
"loss": 17.5768,
"step": 342
},
{
"epoch": 2.752,
"grad_norm": 3.5659339427948,
"learning_rate": 8.288770053475936e-05,
"loss": 17.6865,
"step": 344
},
{
"epoch": 2.768,
"grad_norm": 3.3678972721099854,
"learning_rate": 7.754010695187167e-05,
"loss": 17.4687,
"step": 346
},
{
"epoch": 2.784,
"grad_norm": 3.585134506225586,
"learning_rate": 7.219251336898395e-05,
"loss": 17.536,
"step": 348
},
{
"epoch": 2.8,
"grad_norm": 3.6471846103668213,
"learning_rate": 6.684491978609626e-05,
"loss": 17.6269,
"step": 350
},
{
"epoch": 2.816,
"grad_norm": 3.533790111541748,
"learning_rate": 6.149732620320857e-05,
"loss": 17.5771,
"step": 352
},
{
"epoch": 2.832,
"grad_norm": 3.7971367835998535,
"learning_rate": 5.614973262032086e-05,
"loss": 17.874,
"step": 354
},
{
"epoch": 2.848,
"grad_norm": 3.391874074935913,
"learning_rate": 5.080213903743316e-05,
"loss": 17.2528,
"step": 356
},
{
"epoch": 2.864,
"grad_norm": 3.069033145904541,
"learning_rate": 4.545454545454546e-05,
"loss": 17.6175,
"step": 358
},
{
"epoch": 2.88,
"grad_norm": 3.780275821685791,
"learning_rate": 4.0106951871657754e-05,
"loss": 17.2663,
"step": 360
},
{
"epoch": 2.896,
"grad_norm": 3.3377978801727295,
"learning_rate": 3.4759358288770055e-05,
"loss": 17.3711,
"step": 362
},
{
"epoch": 2.912,
"grad_norm": 3.356203317642212,
"learning_rate": 2.9411764705882354e-05,
"loss": 17.6077,
"step": 364
},
{
"epoch": 2.928,
"grad_norm": 3.302241563796997,
"learning_rate": 2.4064171122994652e-05,
"loss": 17.4777,
"step": 366
},
{
"epoch": 2.944,
"grad_norm": 3.73811411857605,
"learning_rate": 1.871657754010695e-05,
"loss": 17.3149,
"step": 368
},
{
"epoch": 2.96,
"grad_norm": 3.392902135848999,
"learning_rate": 1.336898395721925e-05,
"loss": 17.8118,
"step": 370
},
{
"epoch": 2.976,
"grad_norm": 3.8080010414123535,
"learning_rate": 8.021390374331552e-06,
"loss": 17.1875,
"step": 372
},
{
"epoch": 2.992,
"grad_norm": 3.5202646255493164,
"learning_rate": 2.67379679144385e-06,
"loss": 17.7556,
"step": 374
},
{
"epoch": 3.0,
"step": 375,
"total_flos": 2.6461914289864704e+17,
"train_loss": 18.264725362141927,
"train_runtime": 1944.3243,
"train_samples_per_second": 24.687,
"train_steps_per_second": 0.193
},
{
"epoch": 3.0,
"eval_loss": 2.2290163040161133,
"eval_runtime": 83.3238,
"eval_samples_per_second": 24.003,
"eval_steps_per_second": 3.0,
"step": 375
},
{
"epoch": 3.0,
"eval_loss": 2.226619243621826,
"eval_runtime": 83.9815,
"eval_samples_per_second": 23.815,
"eval_steps_per_second": 2.977,
"step": 375
}
],
"logging_steps": 2,
"max_steps": 375,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.6461914289864704e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}