Training in progress, step 2400, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +711 -3

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d403900b6e4e06d1060ea96c9f9125452e44e25ecb0fe98a4888dab20918096a
 size 2066752

 version https://git-lfs.github.com/spec/v1
+oid sha256:e24a0310c4bf5f98acaaef3be18a2ca4d2a87e738c09732e426a40950ed1a048
 size 2066752

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:27d100c984f8d641346f5da5c506557f408847bc183236db48c58f564b4d2d81
 size 4121235

 version https://git-lfs.github.com/spec/v1
+oid sha256:cd1af5c861590ed1aa8a2c671dc6f425bb6ed5438470f7ad02d8e2b79717ef16
 size 4121235

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:60fe173f9860062ebc60b002a64ae72dc915d76f9849b9cb85632a7a607221b5
 size 14391

 version https://git-lfs.github.com/spec/v1
+oid sha256:945f86d8abac1aa8354820af0171c56d0716798844a742f05ee2e852dae77534
 size 14391

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2e16073f3e3321f4e6a7e2a6eca78556f1d80fe032e948e4721dde289f8623b3
 size 1401

 version https://git-lfs.github.com/spec/v1
+oid sha256:d80fe92cd59dacf4b7d6a34254c209dfa9333b3830ad0abdd0afa76d827d8203
 size 1401

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.09933488814027813,
   "eval_steps": 100,
-  "global_step": 2300,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -16292,6 +16292,714 @@
       "eval_samples_per_second": 1.698,
       "eval_steps_per_second": 0.212,
       "step": 2300
     }
   ],
   "logging_steps": 1,
@@ -16311,7 +17019,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 7352067686400.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.10365379632029023,
   "eval_steps": 100,
+  "global_step": 2400,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 1.698,
       "eval_steps_per_second": 0.212,
       "step": 2300
+    },
+    {
+      "epoch": 0.09937807722207825,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0009933637689106113,
+      "loss": 8.1346,
+      "step": 2301
+    },
+    {
+      "epoch": 0.09942126630387838,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.0009933521675350812,
+      "loss": 8.3205,
+      "step": 2302
+    },
+    {
+      "epoch": 0.0994644553856785,
+      "grad_norm": 0.59765625,
+      "learning_rate": 0.0009933405560955803,
+      "loss": 8.3031,
+      "step": 2303
+    },
+    {
+      "epoch": 0.09950764446747862,
+      "grad_norm": 0.447265625,
+      "learning_rate": 0.0009933289345923457,
+      "loss": 8.3805,
+      "step": 2304
+    },
+    {
+      "epoch": 0.09955083354927874,
+      "grad_norm": 0.57421875,
+      "learning_rate": 0.0009933173030256142,
+      "loss": 8.191,
+      "step": 2305
+    },
+    {
+      "epoch": 0.09959402263107886,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.0009933056613956233,
+      "loss": 8.427,
+      "step": 2306
+    },
+    {
+      "epoch": 0.09963721171287898,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.0009932940097026105,
+      "loss": 8.2676,
+      "step": 2307
+    },
+    {
+      "epoch": 0.0996804007946791,
+      "grad_norm": 0.671875,
+      "learning_rate": 0.0009932823479468131,
+      "loss": 8.4411,
+      "step": 2308
+    },
+    {
+      "epoch": 0.09972358987647922,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0009932706761284695,
+      "loss": 8.1711,
+      "step": 2309
+    },
+    {
+      "epoch": 0.09976677895827935,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0009932589942478174,
+      "loss": 8.4505,
+      "step": 2310
+    },
+    {
+      "epoch": 0.09980996804007947,
+      "grad_norm": 0.59765625,
+      "learning_rate": 0.0009932473023050955,
+      "loss": 8.1172,
+      "step": 2311
+    },
+    {
+      "epoch": 0.09985315712187959,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0009932356003005418,
+      "loss": 8.5325,
+      "step": 2312
+    },
+    {
+      "epoch": 0.09989634620367971,
+      "grad_norm": 0.462890625,
+      "learning_rate": 0.0009932238882343956,
+      "loss": 8.4303,
+      "step": 2313
+    },
+    {
+      "epoch": 0.09993953528547983,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.0009932121661068952,
+      "loss": 8.434,
+      "step": 2314
+    },
+    {
+      "epoch": 0.09998272436727995,
+      "grad_norm": 0.64453125,
+      "learning_rate": 0.0009932004339182803,
+      "loss": 8.3212,
+      "step": 2315
+    },
+    {
+      "epoch": 0.10002591344908007,
+      "grad_norm": 0.55859375,
+      "learning_rate": 0.0009931886916687896,
+      "loss": 8.3366,
+      "step": 2316
+    },
+    {
+      "epoch": 0.1000691025308802,
+      "grad_norm": 0.4921875,
+      "learning_rate": 0.0009931769393586632,
+      "loss": 8.3998,
+      "step": 2317
+    },
+    {
+      "epoch": 0.10011229161268032,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.0009931651769881408,
+      "loss": 8.4232,
+      "step": 2318
+    },
+    {
+      "epoch": 0.10015548069448044,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0009931534045574616,
+      "loss": 8.5407,
+      "step": 2319
+    },
+    {
+      "epoch": 0.10019866977628056,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0009931416220668669,
+      "loss": 8.2877,
+      "step": 2320
+    },
+    {
+      "epoch": 0.10024185885808068,
+      "grad_norm": 0.4921875,
+      "learning_rate": 0.0009931298295165962,
+      "loss": 8.4644,
+      "step": 2321
+    },
+    {
+      "epoch": 0.1002850479398808,
+      "grad_norm": 0.578125,
+      "learning_rate": 0.0009931180269068904,
+      "loss": 8.3041,
+      "step": 2322
+    },
+    {
+      "epoch": 0.10032823702168092,
+      "grad_norm": 0.6640625,
+      "learning_rate": 0.00099310621423799,
+      "loss": 7.9393,
+      "step": 2323
+    },
+    {
+      "epoch": 0.10037142610348104,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.0009930943915101363,
+      "loss": 8.2464,
+      "step": 2324
+    },
+    {
+      "epoch": 0.10041461518528116,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0009930825587235704,
+      "loss": 8.3575,
+      "step": 2325
+    },
+    {
+      "epoch": 0.10045780426708129,
+      "grad_norm": 0.71875,
+      "learning_rate": 0.0009930707158785335,
+      "loss": 8.5688,
+      "step": 2326
+    },
+    {
+      "epoch": 0.1005009933488814,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.0009930588629752672,
+      "loss": 8.2146,
+      "step": 2327
+    },
+    {
+      "epoch": 0.10054418243068153,
+      "grad_norm": 0.58203125,
+      "learning_rate": 0.0009930470000140135,
+      "loss": 8.5153,
+      "step": 2328
+    },
+    {
+      "epoch": 0.10058737151248165,
+      "grad_norm": 0.462890625,
+      "learning_rate": 0.0009930351269950143,
+      "loss": 8.2966,
+      "step": 2329
+    },
+    {
+      "epoch": 0.10063056059428177,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.0009930232439185117,
+      "loss": 8.4384,
+      "step": 2330
+    },
+    {
+      "epoch": 0.10067374967608189,
+      "grad_norm": 0.4765625,
+      "learning_rate": 0.0009930113507847483,
+      "loss": 8.4281,
+      "step": 2331
+    },
+    {
+      "epoch": 0.10071693875788201,
+      "grad_norm": 0.6640625,
+      "learning_rate": 0.0009929994475939665,
+      "loss": 8.3741,
+      "step": 2332
+    },
+    {
+      "epoch": 0.10076012783968213,
+      "grad_norm": 0.53515625,
+      "learning_rate": 0.000992987534346409,
+      "loss": 8.6786,
+      "step": 2333
+    },
+    {
+      "epoch": 0.10080331692148226,
+      "grad_norm": 0.4453125,
+      "learning_rate": 0.0009929756110423193,
+      "loss": 8.4003,
+      "step": 2334
+    },
+    {
+      "epoch": 0.10084650600328238,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.0009929636776819402,
+      "loss": 8.349,
+      "step": 2335
+    },
+    {
+      "epoch": 0.1008896950850825,
+      "grad_norm": 0.5703125,
+      "learning_rate": 0.0009929517342655155,
+      "loss": 8.2853,
+      "step": 2336
+    },
+    {
+      "epoch": 0.1009328841668826,
+      "grad_norm": 0.52734375,
+      "learning_rate": 0.0009929397807932883,
+      "loss": 8.4347,
+      "step": 2337
+    },
+    {
+      "epoch": 0.10097607324868273,
+      "grad_norm": 0.5390625,
+      "learning_rate": 0.0009929278172655029,
+      "loss": 8.1882,
+      "step": 2338
+    },
+    {
+      "epoch": 0.10101926233048285,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.0009929158436824033,
+      "loss": 8.5276,
+      "step": 2339
+    },
+    {
+      "epoch": 0.10106245141228297,
+      "grad_norm": 0.6015625,
+      "learning_rate": 0.0009929038600442336,
+      "loss": 8.2453,
+      "step": 2340
+    },
+    {
+      "epoch": 0.10110564049408309,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0009928918663512382,
+      "loss": 8.5133,
+      "step": 2341
+    },
+    {
+      "epoch": 0.10114882957588321,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.000992879862603662,
+      "loss": 8.1969,
+      "step": 2342
+    },
+    {
+      "epoch": 0.10119201865768333,
+      "grad_norm": 0.466796875,
+      "learning_rate": 0.0009928678488017497,
+      "loss": 8.2737,
+      "step": 2343
+    },
+    {
+      "epoch": 0.10123520773948345,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.0009928558249457462,
+      "loss": 8.333,
+      "step": 2344
+    },
+    {
+      "epoch": 0.10127839682128358,
+      "grad_norm": 0.439453125,
+      "learning_rate": 0.0009928437910358971,
+      "loss": 8.2923,
+      "step": 2345
+    },
+    {
+      "epoch": 0.1013215859030837,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.000992831747072448,
+      "loss": 8.5941,
+      "step": 2346
+    },
+    {
+      "epoch": 0.10136477498488382,
+      "grad_norm": 0.62109375,
+      "learning_rate": 0.0009928196930556442,
+      "loss": 8.2137,
+      "step": 2347
+    },
+    {
+      "epoch": 0.10140796406668394,
+      "grad_norm": 0.5546875,
+      "learning_rate": 0.0009928076289857318,
+      "loss": 8.5716,
+      "step": 2348
+    },
+    {
+      "epoch": 0.10145115314848406,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0009927955548629567,
+      "loss": 8.4401,
+      "step": 2349
+    },
+    {
+      "epoch": 0.10149434223028418,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.0009927834706875654,
+      "loss": 8.3506,
+      "step": 2350
+    },
+    {
+      "epoch": 0.1015375313120843,
+      "grad_norm": 1.859375,
+      "learning_rate": 0.0009927713764598042,
+      "loss": 8.1892,
+      "step": 2351
+    },
+    {
+      "epoch": 0.10158072039388442,
+      "grad_norm": 0.71484375,
+      "learning_rate": 0.0009927592721799203,
+      "loss": 8.5624,
+      "step": 2352
+    },
+    {
+      "epoch": 0.10162390947568455,
+      "grad_norm": 0.62109375,
+      "learning_rate": 0.0009927471578481599,
+      "loss": 8.0008,
+      "step": 2353
+    },
+    {
+      "epoch": 0.10166709855748467,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.000992735033464771,
+      "loss": 8.2854,
+      "step": 2354
+    },
+    {
+      "epoch": 0.10171028763928479,
+      "grad_norm": 0.64453125,
+      "learning_rate": 0.0009927228990299999,
+      "loss": 8.2719,
+      "step": 2355
+    },
+    {
+      "epoch": 0.10175347672108491,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.000992710754544095,
+      "loss": 8.3854,
+      "step": 2356
+    },
+    {
+      "epoch": 0.10179666580288503,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.0009926986000073036,
+      "loss": 8.4913,
+      "step": 2357
+    },
+    {
+      "epoch": 0.10183985488468515,
+      "grad_norm": 0.62109375,
+      "learning_rate": 0.0009926864354198738,
+      "loss": 8.4161,
+      "step": 2358
+    },
+    {
+      "epoch": 0.10188304396648527,
+      "grad_norm": 0.62109375,
+      "learning_rate": 0.0009926742607820535,
+      "loss": 8.365,
+      "step": 2359
+    },
+    {
+      "epoch": 0.1019262330482854,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0009926620760940914,
+      "loss": 8.2836,
+      "step": 2360
+    },
+    {
+      "epoch": 0.10196942213008552,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.0009926498813562358,
+      "loss": 8.7514,
+      "step": 2361
+    },
+    {
+      "epoch": 0.10201261121188564,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.0009926376765687357,
+      "loss": 8.4432,
+      "step": 2362
+    },
+    {
+      "epoch": 0.10205580029368576,
+      "grad_norm": 0.6875,
+      "learning_rate": 0.0009926254617318398,
+      "loss": 8.3274,
+      "step": 2363
+    },
+    {
+      "epoch": 0.10209898937548588,
+      "grad_norm": 0.55859375,
+      "learning_rate": 0.0009926132368457974,
+      "loss": 8.3462,
+      "step": 2364
+    },
+    {
+      "epoch": 0.102142178457286,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.0009926010019108578,
+      "loss": 8.3675,
+      "step": 2365
+    },
+    {
+      "epoch": 0.10218536753908612,
+      "grad_norm": 0.65625,
+      "learning_rate": 0.0009925887569272708,
+      "loss": 8.2461,
+      "step": 2366
+    },
+    {
+      "epoch": 0.10222855662088624,
+      "grad_norm": 0.53515625,
+      "learning_rate": 0.0009925765018952862,
+      "loss": 8.1833,
+      "step": 2367
+    },
+    {
+      "epoch": 0.10227174570268636,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.0009925642368151535,
+      "loss": 8.0176,
+      "step": 2368
+    },
+    {
+      "epoch": 0.10231493478448649,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0009925519616871235,
+      "loss": 8.4266,
+      "step": 2369
+    },
+    {
+      "epoch": 0.10235812386628661,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0009925396765114462,
+      "loss": 8.3391,
+      "step": 2370
+    },
+    {
+      "epoch": 0.10240131294808673,
+      "grad_norm": 0.45703125,
+      "learning_rate": 0.0009925273812883724,
+      "loss": 8.6259,
+      "step": 2371
+    },
+    {
+      "epoch": 0.10244450202988685,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.000992515076018153,
+      "loss": 8.2038,
+      "step": 2372
+    },
+    {
+      "epoch": 0.10248769111168697,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0009925027607010385,
+      "loss": 8.5486,
+      "step": 2373
+    },
+    {
+      "epoch": 0.10253088019348709,
+      "grad_norm": 0.5,
+      "learning_rate": 0.0009924904353372807,
+      "loss": 8.4328,
+      "step": 2374
+    },
+    {
+      "epoch": 0.10257406927528721,
+      "grad_norm": 0.4296875,
+      "learning_rate": 0.000992478099927131,
+      "loss": 8.3761,
+      "step": 2375
+    },
+    {
+      "epoch": 0.10261725835708733,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.0009924657544708407,
+      "loss": 8.4608,
+      "step": 2376
+    },
+    {
+      "epoch": 0.10266044743888746,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.0009924533989686616,
+      "loss": 8.3795,
+      "step": 2377
+    },
+    {
+      "epoch": 0.10270363652068756,
+      "grad_norm": 0.51171875,
+      "learning_rate": 0.0009924410334208463,
+      "loss": 8.2331,
+      "step": 2378
+    },
+    {
+      "epoch": 0.10274682560248768,
+      "grad_norm": 0.55078125,
+      "learning_rate": 0.0009924286578276464,
+      "loss": 8.6885,
+      "step": 2379
+    },
+    {
+      "epoch": 0.1027900146842878,
+      "grad_norm": 0.50390625,
+      "learning_rate": 0.0009924162721893148,
+      "loss": 8.3229,
+      "step": 2380
+    },
+    {
+      "epoch": 0.10283320376608793,
+      "grad_norm": 0.494140625,
+      "learning_rate": 0.000992403876506104,
+      "loss": 8.5207,
+      "step": 2381
+    },
+    {
+      "epoch": 0.10287639284788805,
+      "grad_norm": 0.5390625,
+      "learning_rate": 0.0009923914707782669,
+      "loss": 8.5089,
+      "step": 2382
+    },
+    {
+      "epoch": 0.10291958192968817,
+      "grad_norm": 0.49609375,
+      "learning_rate": 0.0009923790550060564,
+      "loss": 8.4575,
+      "step": 2383
+    },
+    {
+      "epoch": 0.10296277101148829,
+      "grad_norm": 0.58203125,
+      "learning_rate": 0.0009923666291897259,
+      "loss": 8.3337,
+      "step": 2384
+    },
+    {
+      "epoch": 0.10300596009328841,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.0009923541933295288,
+      "loss": 8.537,
+      "step": 2385
+    },
+    {
+      "epoch": 0.10304914917508853,
+      "grad_norm": 0.486328125,
+      "learning_rate": 0.000992341747425719,
+      "loss": 8.3868,
+      "step": 2386
+    },
+    {
+      "epoch": 0.10309233825688865,
+      "grad_norm": 0.458984375,
+      "learning_rate": 0.0009923292914785502,
+      "loss": 8.5046,
+      "step": 2387
+    },
+    {
+      "epoch": 0.10313552733868878,
+      "grad_norm": 0.55859375,
+      "learning_rate": 0.0009923168254882763,
+      "loss": 8.3372,
+      "step": 2388
+    },
+    {
+      "epoch": 0.1031787164204889,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0009923043494551522,
+      "loss": 8.3775,
+      "step": 2389
+    },
+    {
+      "epoch": 0.10322190550228902,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.0009922918633794317,
+      "loss": 8.665,
+      "step": 2390
+    },
+    {
+      "epoch": 0.10326509458408914,
+      "grad_norm": 0.470703125,
+      "learning_rate": 0.00099227936726137,
+      "loss": 8.502,
+      "step": 2391
+    },
+    {
+      "epoch": 0.10330828366588926,
+      "grad_norm": 0.9453125,
+      "learning_rate": 0.0009922668611012217,
+      "loss": 8.6291,
+      "step": 2392
+    },
+    {
+      "epoch": 0.10335147274768938,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.0009922543448992423,
+      "loss": 8.2589,
+      "step": 2393
+    },
+    {
+      "epoch": 0.1033946618294895,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0009922418186556867,
+      "loss": 7.6237,
+      "step": 2394
+    },
+    {
+      "epoch": 0.10343785091128962,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0009922292823708106,
+      "loss": 8.5899,
+      "step": 2395
+    },
+    {
+      "epoch": 0.10348103999308975,
+      "grad_norm": 0.55859375,
+      "learning_rate": 0.0009922167360448698,
+      "loss": 8.184,
+      "step": 2396
+    },
+    {
+      "epoch": 0.10352422907488987,
+      "grad_norm": 0.9921875,
+      "learning_rate": 0.00099220417967812,
+      "loss": 8.4565,
+      "step": 2397
+    },
+    {
+      "epoch": 0.10356741815668999,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.0009921916132708177,
+      "loss": 8.3024,
+      "step": 2398
+    },
+    {
+      "epoch": 0.10361060723849011,
+      "grad_norm": 0.67578125,
+      "learning_rate": 0.0009921790368232189,
+      "loss": 8.7929,
+      "step": 2399
+    },
+    {
+      "epoch": 0.10365379632029023,
+      "grad_norm": 0.61328125,
+      "learning_rate": 0.0009921664503355803,
+      "loss": 8.1994,
+      "step": 2400
+    },
+    {
+      "epoch": 0.10365379632029023,
+      "eval_loss": 8.382716178894043,
+      "eval_runtime": 13.8499,
+      "eval_samples_per_second": 1.733,
+      "eval_steps_per_second": 0.217,
+      "step": 2400
     }
   ],
   "logging_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 7671722803200.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null