Training in progress, step 2300, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +711 -3

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:34e7dd2d9b6f0970cb6393fa01c4d5b46e08a118b2ca11c92398326a18aca9b6
 size 2066752

 version https://git-lfs.github.com/spec/v1
+oid sha256:d403900b6e4e06d1060ea96c9f9125452e44e25ecb0fe98a4888dab20918096a
 size 2066752

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8ea6c445c08f665e093952010dc41c9cfe5bc6fd09fae8a9ddc99dbd25132738
 size 4121235

 version https://git-lfs.github.com/spec/v1
+oid sha256:27d100c984f8d641346f5da5c506557f408847bc183236db48c58f564b4d2d81
 size 4121235

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:606f8ae83137b5e17dffec803b5eb8d484f9023ac65a91db2b3909da806f7963
 size 14391

 version https://git-lfs.github.com/spec/v1
+oid sha256:60fe173f9860062ebc60b002a64ae72dc915d76f9849b9cb85632a7a607221b5
 size 14391

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:93112c230b7ca5a82c24435d90248d5e745b06d96f80a988308b962666674dd0
 size 1401

 version https://git-lfs.github.com/spec/v1
+oid sha256:2e16073f3e3321f4e6a7e2a6eca78556f1d80fe032e948e4721dde289f8623b3
 size 1401

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.09501597996026605,
   "eval_steps": 100,
-  "global_step": 2200,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -15584,6 +15584,714 @@
       "eval_samples_per_second": 1.719,
       "eval_steps_per_second": 0.215,
       "step": 2200
     }
   ],
   "logging_steps": 1,
@@ -15603,7 +16311,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 7032412569600.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.09933488814027813,
   "eval_steps": 100,
+  "global_step": 2300,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 1.719,
       "eval_steps_per_second": 0.215,
       "step": 2200
+    },
+    {
+      "epoch": 0.09505916904206617,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0009944730436502519,
+      "loss": 8.6376,
+      "step": 2201
+    },
+    {
+      "epoch": 0.09510235812386629,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.0009944624498326824,
+      "loss": 8.4071,
+      "step": 2202
+    },
+    {
+      "epoch": 0.09514554720566641,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.0009944518459284934,
+      "loss": 8.4233,
+      "step": 2203
+    },
+    {
+      "epoch": 0.09518873628746653,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.000994441231937901,
+      "loss": 8.4952,
+      "step": 2204
+    },
+    {
+      "epoch": 0.09523192536926665,
+      "grad_norm": 0.67578125,
+      "learning_rate": 0.0009944306078611223,
+      "loss": 8.4472,
+      "step": 2205
+    },
+    {
+      "epoch": 0.09527511445106678,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0009944199736983733,
+      "loss": 8.186,
+      "step": 2206
+    },
+    {
+      "epoch": 0.0953183035328669,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.0009944093294498714,
+      "loss": 8.4121,
+      "step": 2207
+    },
+    {
+      "epoch": 0.09536149261466702,
+      "grad_norm": 0.58984375,
+      "learning_rate": 0.0009943986751158335,
+      "loss": 8.2663,
+      "step": 2208
+    },
+    {
+      "epoch": 0.09540468169646714,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.000994388010696477,
+      "loss": 8.202,
+      "step": 2209
+    },
+    {
+      "epoch": 0.09544787077826726,
+      "grad_norm": 0.59375,
+      "learning_rate": 0.0009943773361920198,
+      "loss": 8.2491,
+      "step": 2210
+    },
+    {
+      "epoch": 0.09549105986006738,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.000994366651602679,
+      "loss": 8.4205,
+      "step": 2211
+    },
+    {
+      "epoch": 0.0955342489418675,
+      "grad_norm": 0.6640625,
+      "learning_rate": 0.0009943559569286732,
+      "loss": 8.2114,
+      "step": 2212
+    },
+    {
+      "epoch": 0.09557743802366761,
+      "grad_norm": 0.65234375,
+      "learning_rate": 0.0009943452521702198,
+      "loss": 8.2514,
+      "step": 2213
+    },
+    {
+      "epoch": 0.09562062710546773,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.000994334537327538,
+      "loss": 8.3806,
+      "step": 2214
+    },
+    {
+      "epoch": 0.09566381618726785,
+      "grad_norm": 0.65625,
+      "learning_rate": 0.000994323812400846,
+      "loss": 8.4841,
+      "step": 2215
+    },
+    {
+      "epoch": 0.09570700526906797,
+      "grad_norm": 0.6015625,
+      "learning_rate": 0.0009943130773903623,
+      "loss": 8.4155,
+      "step": 2216
+    },
+    {
+      "epoch": 0.0957501943508681,
+      "grad_norm": 0.4609375,
+      "learning_rate": 0.0009943023322963062,
+      "loss": 8.2637,
+      "step": 2217
+    },
+    {
+      "epoch": 0.09579338343266822,
+      "grad_norm": 0.60546875,
+      "learning_rate": 0.000994291577118897,
+      "loss": 8.4826,
+      "step": 2218
+    },
+    {
+      "epoch": 0.09583657251446834,
+      "grad_norm": 0.5390625,
+      "learning_rate": 0.000994280811858354,
+      "loss": 8.322,
+      "step": 2219
+    },
+    {
+      "epoch": 0.09587976159626846,
+      "grad_norm": 0.56640625,
+      "learning_rate": 0.0009942700365148964,
+      "loss": 8.1471,
+      "step": 2220
+    },
+    {
+      "epoch": 0.09592295067806858,
+      "grad_norm": 0.625,
+      "learning_rate": 0.0009942592510887448,
+      "loss": 8.4959,
+      "step": 2221
+    },
+    {
+      "epoch": 0.0959661397598687,
+      "grad_norm": 0.61328125,
+      "learning_rate": 0.0009942484555801184,
+      "loss": 8.2918,
+      "step": 2222
+    },
+    {
+      "epoch": 0.09600932884166882,
+      "grad_norm": 1.1796875,
+      "learning_rate": 0.000994237649989238,
+      "loss": 8.5336,
+      "step": 2223
+    },
+    {
+      "epoch": 0.09605251792346894,
+      "grad_norm": 0.494140625,
+      "learning_rate": 0.0009942268343163237,
+      "loss": 8.5094,
+      "step": 2224
+    },
+    {
+      "epoch": 0.09609570700526907,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.0009942160085615961,
+      "loss": 8.1301,
+      "step": 2225
+    },
+    {
+      "epoch": 0.09613889608706919,
+      "grad_norm": 0.60546875,
+      "learning_rate": 0.0009942051727252765,
+      "loss": 8.2714,
+      "step": 2226
+    },
+    {
+      "epoch": 0.09618208516886931,
+      "grad_norm": 0.416015625,
+      "learning_rate": 0.0009941943268075854,
+      "loss": 8.4242,
+      "step": 2227
+    },
+    {
+      "epoch": 0.09622527425066943,
+      "grad_norm": 0.6640625,
+      "learning_rate": 0.0009941834708087445,
+      "loss": 8.463,
+      "step": 2228
+    },
+    {
+      "epoch": 0.09626846333246955,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.0009941726047289748,
+      "loss": 8.305,
+      "step": 2229
+    },
+    {
+      "epoch": 0.09631165241426967,
+      "grad_norm": 0.58984375,
+      "learning_rate": 0.0009941617285684982,
+      "loss": 8.2656,
+      "step": 2230
+    },
+    {
+      "epoch": 0.09635484149606979,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.0009941508423275366,
+      "loss": 8.3025,
+      "step": 2231
+    },
+    {
+      "epoch": 0.09639803057786991,
+      "grad_norm": 0.5625,
+      "learning_rate": 0.000994139946006312,
+      "loss": 8.2322,
+      "step": 2232
+    },
+    {
+      "epoch": 0.09644121965967004,
+      "grad_norm": 0.51171875,
+      "learning_rate": 0.0009941290396050467,
+      "loss": 8.2482,
+      "step": 2233
+    },
+    {
+      "epoch": 0.09648440874147016,
+      "grad_norm": 0.56640625,
+      "learning_rate": 0.000994118123123963,
+      "loss": 8.2431,
+      "step": 2234
+    },
+    {
+      "epoch": 0.09652759782327028,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.000994107196563284,
+      "loss": 8.7743,
+      "step": 2235
+    },
+    {
+      "epoch": 0.0965707869050704,
+      "grad_norm": 1.046875,
+      "learning_rate": 0.0009940962599232323,
+      "loss": 7.9271,
+      "step": 2236
+    },
+    {
+      "epoch": 0.09661397598687052,
+      "grad_norm": 0.5859375,
+      "learning_rate": 0.000994085313204031,
+      "loss": 8.279,
+      "step": 2237
+    },
+    {
+      "epoch": 0.09665716506867064,
+      "grad_norm": 0.60546875,
+      "learning_rate": 0.0009940743564059037,
+      "loss": 8.4154,
+      "step": 2238
+    },
+    {
+      "epoch": 0.09670035415047076,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0009940633895290732,
+      "loss": 8.3311,
+      "step": 2239
+    },
+    {
+      "epoch": 0.09674354323227088,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.000994052412573764,
+      "loss": 8.6155,
+      "step": 2240
+    },
+    {
+      "epoch": 0.096786732314071,
+      "grad_norm": 0.56640625,
+      "learning_rate": 0.0009940414255401996,
+      "loss": 8.4563,
+      "step": 2241
+    },
+    {
+      "epoch": 0.09682992139587113,
+      "grad_norm": 0.60546875,
+      "learning_rate": 0.0009940304284286044,
+      "loss": 8.4334,
+      "step": 2242
+    },
+    {
+      "epoch": 0.09687311047767125,
+      "grad_norm": 0.7109375,
+      "learning_rate": 0.0009940194212392022,
+      "loss": 8.3314,
+      "step": 2243
+    },
+    {
+      "epoch": 0.09691629955947137,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.000994008403972218,
+      "loss": 8.7214,
+      "step": 2244
+    },
+    {
+      "epoch": 0.09695948864127149,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0009939973766278766,
+      "loss": 8.297,
+      "step": 2245
+    },
+    {
+      "epoch": 0.09700267772307161,
+      "grad_norm": 0.59765625,
+      "learning_rate": 0.0009939863392064029,
+      "loss": 8.4925,
+      "step": 2246
+    },
+    {
+      "epoch": 0.09704586680487173,
+      "grad_norm": 0.447265625,
+      "learning_rate": 0.0009939752917080217,
+      "loss": 8.2892,
+      "step": 2247
+    },
+    {
+      "epoch": 0.09708905588667185,
+      "grad_norm": 0.69140625,
+      "learning_rate": 0.0009939642341329586,
+      "loss": 8.5275,
+      "step": 2248
+    },
+    {
+      "epoch": 0.09713224496847198,
+      "grad_norm": 2.390625,
+      "learning_rate": 0.0009939531664814392,
+      "loss": 8.694,
+      "step": 2249
+    },
+    {
+      "epoch": 0.0971754340502721,
+      "grad_norm": 0.453125,
+      "learning_rate": 0.0009939420887536893,
+      "loss": 8.4036,
+      "step": 2250
+    },
+    {
+      "epoch": 0.09721862313207222,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.0009939310009499348,
+      "loss": 8.3543,
+      "step": 2251
+    },
+    {
+      "epoch": 0.09726181221387234,
+      "grad_norm": 0.474609375,
+      "learning_rate": 0.0009939199030704019,
+      "loss": 8.3396,
+      "step": 2252
+    },
+    {
+      "epoch": 0.09730500129567246,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0009939087951153168,
+      "loss": 8.3102,
+      "step": 2253
+    },
+    {
+      "epoch": 0.09734819037747257,
+      "grad_norm": 0.4921875,
+      "learning_rate": 0.0009938976770849065,
+      "loss": 8.236,
+      "step": 2254
+    },
+    {
+      "epoch": 0.09739137945927269,
+      "grad_norm": 0.61328125,
+      "learning_rate": 0.0009938865489793976,
+      "loss": 8.1535,
+      "step": 2255
+    },
+    {
+      "epoch": 0.09743456854107281,
+      "grad_norm": 0.4765625,
+      "learning_rate": 0.000993875410799017,
+      "loss": 8.3056,
+      "step": 2256
+    },
+    {
+      "epoch": 0.09747775762287293,
+      "grad_norm": 0.53125,
+      "learning_rate": 0.0009938642625439917,
+      "loss": 8.2276,
+      "step": 2257
+    },
+    {
+      "epoch": 0.09752094670467305,
+      "grad_norm": 0.482421875,
+      "learning_rate": 0.0009938531042145498,
+      "loss": 8.4162,
+      "step": 2258
+    },
+    {
+      "epoch": 0.09756413578647317,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.0009938419358109182,
+      "loss": 8.2504,
+      "step": 2259
+    },
+    {
+      "epoch": 0.0976073248682733,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.0009938307573333254,
+      "loss": 8.595,
+      "step": 2260
+    },
+    {
+      "epoch": 0.09765051395007342,
+      "grad_norm": 0.49609375,
+      "learning_rate": 0.0009938195687819989,
+      "loss": 8.3727,
+      "step": 2261
+    },
+    {
+      "epoch": 0.09769370303187354,
+      "grad_norm": 0.48828125,
+      "learning_rate": 0.0009938083701571672,
+      "loss": 8.4693,
+      "step": 2262
+    },
+    {
+      "epoch": 0.09773689211367366,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0009937971614590586,
+      "loss": 8.383,
+      "step": 2263
+    },
+    {
+      "epoch": 0.09778008119547378,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.0009937859426879018,
+      "loss": 8.1874,
+      "step": 2264
+    },
+    {
+      "epoch": 0.0978232702772739,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.0009937747138439256,
+      "loss": 8.5885,
+      "step": 2265
+    },
+    {
+      "epoch": 0.09786645935907402,
+      "grad_norm": 0.69140625,
+      "learning_rate": 0.000993763474927359,
+      "loss": 8.2082,
+      "step": 2266
+    },
+    {
+      "epoch": 0.09790964844087414,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.0009937522259384317,
+      "loss": 8.4838,
+      "step": 2267
+    },
+    {
+      "epoch": 0.09795283752267427,
+      "grad_norm": 0.48046875,
+      "learning_rate": 0.0009937409668773728,
+      "loss": 8.301,
+      "step": 2268
+    },
+    {
+      "epoch": 0.09799602660447439,
+      "grad_norm": 0.58984375,
+      "learning_rate": 0.000993729697744412,
+      "loss": 8.5776,
+      "step": 2269
+    },
+    {
+      "epoch": 0.09803921568627451,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.000993718418539779,
+      "loss": 8.8736,
+      "step": 2270
+    },
+    {
+      "epoch": 0.09808240476807463,
+      "grad_norm": 0.61328125,
+      "learning_rate": 0.0009937071292637043,
+      "loss": 8.0556,
+      "step": 2271
+    },
+    {
+      "epoch": 0.09812559384987475,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.000993695829916418,
+      "loss": 7.9003,
+      "step": 2272
+    },
+    {
+      "epoch": 0.09816878293167487,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.0009936845204981505,
+      "loss": 8.4324,
+      "step": 2273
+    },
+    {
+      "epoch": 0.098211972013475,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0009936732010091328,
+      "loss": 8.5961,
+      "step": 2274
+    },
+    {
+      "epoch": 0.09825516109527511,
+      "grad_norm": 0.51171875,
+      "learning_rate": 0.0009936618714495953,
+      "loss": 8.6324,
+      "step": 2275
+    },
+    {
+      "epoch": 0.09829835017707524,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0009936505318197694,
+      "loss": 8.0815,
+      "step": 2276
+    },
+    {
+      "epoch": 0.09834153925887536,
+      "grad_norm": 0.48828125,
+      "learning_rate": 0.0009936391821198866,
+      "loss": 8.584,
+      "step": 2277
+    },
+    {
+      "epoch": 0.09838472834067548,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.0009936278223501782,
+      "loss": 8.3207,
+      "step": 2278
+    },
+    {
+      "epoch": 0.0984279174224756,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0009936164525108761,
+      "loss": 8.617,
+      "step": 2279
+    },
+    {
+      "epoch": 0.09847110650427572,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.000993605072602212,
+      "loss": 8.7178,
+      "step": 2280
+    },
+    {
+      "epoch": 0.09851429558607584,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.0009935936826244182,
+      "loss": 8.3508,
+      "step": 2281
+    },
+    {
+      "epoch": 0.09855748466787596,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.000993582282577727,
+      "loss": 9.0207,
+      "step": 2282
+    },
+    {
+      "epoch": 0.09860067374967608,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0009935708724623708,
+      "loss": 8.4822,
+      "step": 2283
+    },
+    {
+      "epoch": 0.0986438628314762,
+      "grad_norm": 0.462890625,
+      "learning_rate": 0.0009935594522785826,
+      "loss": 8.537,
+      "step": 2284
+    },
+    {
+      "epoch": 0.09868705191327633,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0009935480220265955,
+      "loss": 7.9235,
+      "step": 2285
+    },
+    {
+      "epoch": 0.09873024099507645,
+      "grad_norm": 0.66015625,
+      "learning_rate": 0.0009935365817066422,
+      "loss": 8.3552,
+      "step": 2286
+    },
+    {
+      "epoch": 0.09877343007687657,
+      "grad_norm": 0.64453125,
+      "learning_rate": 0.0009935251313189565,
+      "loss": 8.199,
+      "step": 2287
+    },
+    {
+      "epoch": 0.09881661915867669,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.0009935136708637716,
+      "loss": 8.3347,
+      "step": 2288
+    },
+    {
+      "epoch": 0.09885980824047681,
+      "grad_norm": 0.61328125,
+      "learning_rate": 0.0009935022003413217,
+      "loss": 8.1595,
+      "step": 2289
+    },
+    {
+      "epoch": 0.09890299732227693,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.0009934907197518405,
+      "loss": 8.3218,
+      "step": 2290
+    },
+    {
+      "epoch": 0.09894618640407706,
+      "grad_norm": 0.4921875,
+      "learning_rate": 0.0009934792290955622,
+      "loss": 8.3675,
+      "step": 2291
+    },
+    {
+      "epoch": 0.09898937548587718,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.0009934677283727211,
+      "loss": 8.1193,
+      "step": 2292
+    },
+    {
+      "epoch": 0.0990325645676773,
+      "grad_norm": 0.5390625,
+      "learning_rate": 0.000993456217583552,
+      "loss": 8.5143,
+      "step": 2293
+    },
+    {
+      "epoch": 0.09907575364947742,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0009934446967282899,
+      "loss": 7.9545,
+      "step": 2294
+    },
+    {
+      "epoch": 0.09911894273127753,
+      "grad_norm": 0.6796875,
+      "learning_rate": 0.0009934331658071694,
+      "loss": 8.4399,
+      "step": 2295
+    },
+    {
+      "epoch": 0.09916213181307765,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.000993421624820426,
+      "loss": 8.3492,
+      "step": 2296
+    },
+    {
+      "epoch": 0.09920532089487777,
+      "grad_norm": 0.75,
+      "learning_rate": 0.0009934100737682952,
+      "loss": 8.4884,
+      "step": 2297
+    },
+    {
+      "epoch": 0.09924850997667789,
+      "grad_norm": 0.5703125,
+      "learning_rate": 0.0009933985126510123,
+      "loss": 8.0807,
+      "step": 2298
+    },
+    {
+      "epoch": 0.09929169905847801,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.0009933869414688132,
+      "loss": 8.3986,
+      "step": 2299
+    },
+    {
+      "epoch": 0.09933488814027813,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.0009933753602219342,
+      "loss": 8.331,
+      "step": 2300
+    },
+    {
+      "epoch": 0.09933488814027813,
+      "eval_loss": 8.394790649414062,
+      "eval_runtime": 14.132,
+      "eval_samples_per_second": 1.698,
+      "eval_steps_per_second": 0.212,
+      "step": 2300
     }
   ],
   "logging_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 7352067686400.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null