Training in progress, step 2500, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +711 -3

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e24a0310c4bf5f98acaaef3be18a2ca4d2a87e738c09732e426a40950ed1a048
 size 2066752

 version https://git-lfs.github.com/spec/v1
+oid sha256:412ae50cdeb5cca99c6d46aab796b0711066e3d9f4b41a911e3eb9d3dc6de17f
 size 2066752

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cd1af5c861590ed1aa8a2c671dc6f425bb6ed5438470f7ad02d8e2b79717ef16
 size 4121235

 version https://git-lfs.github.com/spec/v1
+oid sha256:836b014a8c276d1f5618f6f4807f34376d58eccee7d3467c4acd8af2f036f8f3
 size 4121235

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:945f86d8abac1aa8354820af0171c56d0716798844a742f05ee2e852dae77534
 size 14391

 version https://git-lfs.github.com/spec/v1
+oid sha256:ed9d71331f73f26faac079d17a5f8873c17bceffe8dbf3eb835123619d3824be
 size 14391

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d80fe92cd59dacf4b7d6a34254c209dfa9333b3830ad0abdd0afa76d827d8203
 size 1401

 version https://git-lfs.github.com/spec/v1
+oid sha256:28a833366aa970d3c976fd14c1ac36f1a287b5de565f4adb4a55d51debbe07ea
 size 1401

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.10365379632029023,
   "eval_steps": 100,
-  "global_step": 2400,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -17000,6 +17000,714 @@
       "eval_samples_per_second": 1.733,
       "eval_steps_per_second": 0.217,
       "step": 2400
     }
   ],
   "logging_steps": 1,
@@ -17019,7 +17727,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 7671722803200.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.10797270450030233,
   "eval_steps": 100,
+  "global_step": 2500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 1.733,
       "eval_steps_per_second": 0.217,
       "step": 2400
+    },
+    {
+      "epoch": 0.10369698540209035,
+      "grad_norm": 0.70703125,
+      "learning_rate": 0.0009921538538081587,
+      "loss": 8.4378,
+      "step": 2401
+    },
+    {
+      "epoch": 0.10374017448389047,
+      "grad_norm": 0.75,
+      "learning_rate": 0.000992141247241211,
+      "loss": 8.099,
+      "step": 2402
+    },
+    {
+      "epoch": 0.1037833635656906,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.0009921286306349944,
+      "loss": 8.683,
+      "step": 2403
+    },
+    {
+      "epoch": 0.10382655264749072,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0009921160039897661,
+      "loss": 8.6846,
+      "step": 2404
+    },
+    {
+      "epoch": 0.10386974172929084,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.000992103367305784,
+      "loss": 7.8544,
+      "step": 2405
+    },
+    {
+      "epoch": 0.10391293081109096,
+      "grad_norm": 0.5859375,
+      "learning_rate": 0.0009920907205833056,
+      "loss": 8.1833,
+      "step": 2406
+    },
+    {
+      "epoch": 0.10395611989289108,
+      "grad_norm": 0.5703125,
+      "learning_rate": 0.0009920780638225891,
+      "loss": 8.2644,
+      "step": 2407
+    },
+    {
+      "epoch": 0.1039993089746912,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0009920653970238924,
+      "loss": 8.0629,
+      "step": 2408
+    },
+    {
+      "epoch": 0.10404249805649132,
+      "grad_norm": 0.64453125,
+      "learning_rate": 0.000992052720187474,
+      "loss": 8.1571,
+      "step": 2409
+    },
+    {
+      "epoch": 0.10408568713829144,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.0009920400333135926,
+      "loss": 8.3833,
+      "step": 2410
+    },
+    {
+      "epoch": 0.10412887622009157,
+      "grad_norm": 0.65234375,
+      "learning_rate": 0.000992027336402507,
+      "loss": 8.6215,
+      "step": 2411
+    },
+    {
+      "epoch": 0.10417206530189169,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.0009920146294544762,
+      "loss": 8.2631,
+      "step": 2412
+    },
+    {
+      "epoch": 0.10421525438369181,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.0009920019124697592,
+      "loss": 8.0769,
+      "step": 2413
+    },
+    {
+      "epoch": 0.10425844346549193,
+      "grad_norm": 0.75,
+      "learning_rate": 0.0009919891854486159,
+      "loss": 8.5708,
+      "step": 2414
+    },
+    {
+      "epoch": 0.10430163254729205,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.000991976448391305,
+      "loss": 7.9082,
+      "step": 2415
+    },
+    {
+      "epoch": 0.10434482162909217,
+      "grad_norm": 0.57421875,
+      "learning_rate": 0.0009919637012980875,
+      "loss": 8.4871,
+      "step": 2416
+    },
+    {
+      "epoch": 0.10438801071089229,
+      "grad_norm": 0.57421875,
+      "learning_rate": 0.0009919509441692227,
+      "loss": 8.658,
+      "step": 2417
+    },
+    {
+      "epoch": 0.10443119979269241,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.000991938177004971,
+      "loss": 8.1744,
+      "step": 2418
+    },
+    {
+      "epoch": 0.10447438887449252,
+      "grad_norm": 0.67578125,
+      "learning_rate": 0.0009919253998055928,
+      "loss": 8.255,
+      "step": 2419
+    },
+    {
+      "epoch": 0.10451757795629264,
+      "grad_norm": 0.69140625,
+      "learning_rate": 0.0009919126125713489,
+      "loss": 8.3867,
+      "step": 2420
+    },
+    {
+      "epoch": 0.10456076703809276,
+      "grad_norm": 0.68359375,
+      "learning_rate": 0.0009918998153024999,
+      "loss": 8.0586,
+      "step": 2421
+    },
+    {
+      "epoch": 0.10460395611989289,
+      "grad_norm": 0.61328125,
+      "learning_rate": 0.0009918870079993068,
+      "loss": 8.2327,
+      "step": 2422
+    },
+    {
+      "epoch": 0.104647145201693,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0009918741906620313,
+      "loss": 8.5985,
+      "step": 2423
+    },
+    {
+      "epoch": 0.10469033428349313,
+      "grad_norm": 3.375,
+      "learning_rate": 0.0009918613632909346,
+      "loss": 9.423,
+      "step": 2424
+    },
+    {
+      "epoch": 0.10473352336529325,
+      "grad_norm": 0.6484375,
+      "learning_rate": 0.0009918485258862781,
+      "loss": 8.1833,
+      "step": 2425
+    },
+    {
+      "epoch": 0.10477671244709337,
+      "grad_norm": 0.4375,
+      "learning_rate": 0.0009918356784483242,
+      "loss": 8.5462,
+      "step": 2426
+    },
+    {
+      "epoch": 0.10481990152889349,
+      "grad_norm": 0.60546875,
+      "learning_rate": 0.0009918228209773346,
+      "loss": 8.5186,
+      "step": 2427
+    },
+    {
+      "epoch": 0.10486309061069361,
+      "grad_norm": 0.498046875,
+      "learning_rate": 0.0009918099534735718,
+      "loss": 8.562,
+      "step": 2428
+    },
+    {
+      "epoch": 0.10490627969249373,
+      "grad_norm": 0.64453125,
+      "learning_rate": 0.000991797075937298,
+      "loss": 8.1776,
+      "step": 2429
+    },
+    {
+      "epoch": 0.10494946877429386,
+      "grad_norm": 0.62890625,
+      "learning_rate": 0.0009917841883687764,
+      "loss": 8.3763,
+      "step": 2430
+    },
+    {
+      "epoch": 0.10499265785609398,
+      "grad_norm": 0.6015625,
+      "learning_rate": 0.0009917712907682693,
+      "loss": 8.2266,
+      "step": 2431
+    },
+    {
+      "epoch": 0.1050358469378941,
+      "grad_norm": 0.5,
+      "learning_rate": 0.0009917583831360402,
+      "loss": 8.4739,
+      "step": 2432
+    },
+    {
+      "epoch": 0.10507903601969422,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0009917454654723523,
+      "loss": 8.3661,
+      "step": 2433
+    },
+    {
+      "epoch": 0.10512222510149434,
+      "grad_norm": 0.57421875,
+      "learning_rate": 0.0009917325377774688,
+      "loss": 8.2533,
+      "step": 2434
+    },
+    {
+      "epoch": 0.10516541418329446,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.000991719600051654,
+      "loss": 8.4593,
+      "step": 2435
+    },
+    {
+      "epoch": 0.10520860326509458,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.0009917066522951714,
+      "loss": 8.2202,
+      "step": 2436
+    },
+    {
+      "epoch": 0.1052517923468947,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.0009916936945082854,
+      "loss": 8.0726,
+      "step": 2437
+    },
+    {
+      "epoch": 0.10529498142869483,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.00099168072669126,
+      "loss": 8.3176,
+      "step": 2438
+    },
+    {
+      "epoch": 0.10533817051049495,
+      "grad_norm": 0.65234375,
+      "learning_rate": 0.00099166774884436,
+      "loss": 8.4613,
+      "step": 2439
+    },
+    {
+      "epoch": 0.10538135959229507,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00099165476096785,
+      "loss": 8.4511,
+      "step": 2440
+    },
+    {
+      "epoch": 0.10542454867409519,
+      "grad_norm": 0.58203125,
+      "learning_rate": 0.000991641763061995,
+      "loss": 8.1388,
+      "step": 2441
+    },
+    {
+      "epoch": 0.10546773775589531,
+      "grad_norm": 0.51171875,
+      "learning_rate": 0.0009916287551270599,
+      "loss": 8.254,
+      "step": 2442
+    },
+    {
+      "epoch": 0.10551092683769543,
+      "grad_norm": 0.5625,
+      "learning_rate": 0.0009916157371633106,
+      "loss": 8.3485,
+      "step": 2443
+    },
+    {
+      "epoch": 0.10555411591949555,
+      "grad_norm": 0.474609375,
+      "learning_rate": 0.0009916027091710123,
+      "loss": 8.303,
+      "step": 2444
+    },
+    {
+      "epoch": 0.10559730500129567,
+      "grad_norm": 0.55078125,
+      "learning_rate": 0.0009915896711504306,
+      "loss": 8.7418,
+      "step": 2445
+    },
+    {
+      "epoch": 0.1056404940830958,
+      "grad_norm": 0.578125,
+      "learning_rate": 0.0009915766231018317,
+      "loss": 8.2505,
+      "step": 2446
+    },
+    {
+      "epoch": 0.10568368316489592,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.000991563565025482,
+      "loss": 8.5745,
+      "step": 2447
+    },
+    {
+      "epoch": 0.10572687224669604,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.0009915504969216472,
+      "loss": 8.242,
+      "step": 2448
+    },
+    {
+      "epoch": 0.10577006132849616,
+      "grad_norm": 0.58203125,
+      "learning_rate": 0.0009915374187905945,
+      "loss": 8.4203,
+      "step": 2449
+    },
+    {
+      "epoch": 0.10581325041029628,
+      "grad_norm": 0.5390625,
+      "learning_rate": 0.0009915243306325905,
+      "loss": 8.1327,
+      "step": 2450
+    },
+    {
+      "epoch": 0.1058564394920964,
+      "grad_norm": 0.65234375,
+      "learning_rate": 0.0009915112324479021,
+      "loss": 8.4274,
+      "step": 2451
+    },
+    {
+      "epoch": 0.10589962857389652,
+      "grad_norm": 0.62890625,
+      "learning_rate": 0.0009914981242367966,
+      "loss": 8.141,
+      "step": 2452
+    },
+    {
+      "epoch": 0.10594281765569664,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.0009914850059995412,
+      "loss": 8.222,
+      "step": 2453
+    },
+    {
+      "epoch": 0.10598600673749677,
+      "grad_norm": 0.447265625,
+      "learning_rate": 0.0009914718777364038,
+      "loss": 8.2166,
+      "step": 2454
+    },
+    {
+      "epoch": 0.10602919581929689,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.000991458739447652,
+      "loss": 8.2447,
+      "step": 2455
+    },
+    {
+      "epoch": 0.10607238490109701,
+      "grad_norm": 0.60546875,
+      "learning_rate": 0.0009914455911335537,
+      "loss": 8.471,
+      "step": 2456
+    },
+    {
+      "epoch": 0.10611557398289713,
+      "grad_norm": 0.55078125,
+      "learning_rate": 0.0009914324327943774,
+      "loss": 8.376,
+      "step": 2457
+    },
+    {
+      "epoch": 0.10615876306469725,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0009914192644303915,
+      "loss": 8.4581,
+      "step": 2458
+    },
+    {
+      "epoch": 0.10620195214649737,
+      "grad_norm": 0.421875,
+      "learning_rate": 0.0009914060860418644,
+      "loss": 8.6168,
+      "step": 2459
+    },
+    {
+      "epoch": 0.1062451412282975,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0009913928976290648,
+      "loss": 8.2619,
+      "step": 2460
+    },
+    {
+      "epoch": 0.1062883303100976,
+      "grad_norm": 0.443359375,
+      "learning_rate": 0.0009913796991922624,
+      "loss": 8.5319,
+      "step": 2461
+    },
+    {
+      "epoch": 0.10633151939189772,
+      "grad_norm": 0.5625,
+      "learning_rate": 0.000991366490731726,
+      "loss": 8.2694,
+      "step": 2462
+    },
+    {
+      "epoch": 0.10637470847369784,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.0009913532722477247,
+      "loss": 8.5479,
+      "step": 2463
+    },
+    {
+      "epoch": 0.10641789755549796,
+      "grad_norm": 0.4765625,
+      "learning_rate": 0.0009913400437405286,
+      "loss": 8.3825,
+      "step": 2464
+    },
+    {
+      "epoch": 0.10646108663729809,
+      "grad_norm": 0.62890625,
+      "learning_rate": 0.0009913268052104077,
+      "loss": 8.3031,
+      "step": 2465
+    },
+    {
+      "epoch": 0.10650427571909821,
+      "grad_norm": 0.408203125,
+      "learning_rate": 0.0009913135566576314,
+      "loss": 8.3329,
+      "step": 2466
+    },
+    {
+      "epoch": 0.10654746480089833,
+      "grad_norm": 0.431640625,
+      "learning_rate": 0.000991300298082471,
+      "loss": 8.1919,
+      "step": 2467
+    },
+    {
+      "epoch": 0.10659065388269845,
+      "grad_norm": 0.703125,
+      "learning_rate": 0.0009912870294851957,
+      "loss": 8.265,
+      "step": 2468
+    },
+    {
+      "epoch": 0.10663384296449857,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.000991273750866077,
+      "loss": 8.1909,
+      "step": 2469
+    },
+    {
+      "epoch": 0.10667703204629869,
+      "grad_norm": 0.64453125,
+      "learning_rate": 0.0009912604622253857,
+      "loss": 8.4647,
+      "step": 2470
+    },
+    {
+      "epoch": 0.10672022112809881,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0009912471635633924,
+      "loss": 8.006,
+      "step": 2471
+    },
+    {
+      "epoch": 0.10676341020989893,
+      "grad_norm": 0.69921875,
+      "learning_rate": 0.000991233854880369,
+      "loss": 8.237,
+      "step": 2472
+    },
+    {
+      "epoch": 0.10680659929169906,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0009912205361765868,
+      "loss": 8.4347,
+      "step": 2473
+    },
+    {
+      "epoch": 0.10684978837349918,
+      "grad_norm": 0.466796875,
+      "learning_rate": 0.0009912072074523173,
+      "loss": 8.6241,
+      "step": 2474
+    },
+    {
+      "epoch": 0.1068929774552993,
+      "grad_norm": 0.470703125,
+      "learning_rate": 0.0009911938687078323,
+      "loss": 8.4256,
+      "step": 2475
+    },
+    {
+      "epoch": 0.10693616653709942,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0009911805199434044,
+      "loss": 8.4895,
+      "step": 2476
+    },
+    {
+      "epoch": 0.10697935561889954,
+      "grad_norm": 0.59375,
+      "learning_rate": 0.0009911671611593056,
+      "loss": 8.3152,
+      "step": 2477
+    },
+    {
+      "epoch": 0.10702254470069966,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0009911537923558082,
+      "loss": 8.2683,
+      "step": 2478
+    },
+    {
+      "epoch": 0.10706573378249978,
+      "grad_norm": 0.53515625,
+      "learning_rate": 0.0009911404135331853,
+      "loss": 8.4738,
+      "step": 2479
+    },
+    {
+      "epoch": 0.1071089228642999,
+      "grad_norm": 0.462890625,
+      "learning_rate": 0.0009911270246917095,
+      "loss": 8.2726,
+      "step": 2480
+    },
+    {
+      "epoch": 0.10715211194610003,
+      "grad_norm": 0.447265625,
+      "learning_rate": 0.000991113625831654,
+      "loss": 8.3151,
+      "step": 2481
+    },
+    {
+      "epoch": 0.10719530102790015,
+      "grad_norm": 0.451171875,
+      "learning_rate": 0.0009911002169532923,
+      "loss": 8.4751,
+      "step": 2482
+    },
+    {
+      "epoch": 0.10723849010970027,
+      "grad_norm": 0.578125,
+      "learning_rate": 0.0009910867980568979,
+      "loss": 8.5141,
+      "step": 2483
+    },
+    {
+      "epoch": 0.10728167919150039,
+      "grad_norm": 0.61328125,
+      "learning_rate": 0.0009910733691427442,
+      "loss": 8.4462,
+      "step": 2484
+    },
+    {
+      "epoch": 0.10732486827330051,
+      "grad_norm": 0.5546875,
+      "learning_rate": 0.0009910599302111057,
+      "loss": 8.232,
+      "step": 2485
+    },
+    {
+      "epoch": 0.10736805735510063,
+      "grad_norm": 0.58203125,
+      "learning_rate": 0.000991046481262256,
+      "loss": 8.1304,
+      "step": 2486
+    },
+    {
+      "epoch": 0.10741124643690075,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.00099103302229647,
+      "loss": 8.3622,
+      "step": 2487
+    },
+    {
+      "epoch": 0.10745443551870087,
+      "grad_norm": 0.51171875,
+      "learning_rate": 0.0009910195533140214,
+      "loss": 8.3612,
+      "step": 2488
+    },
+    {
+      "epoch": 0.107497624600501,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.000991006074315186,
+      "loss": 8.3691,
+      "step": 2489
+    },
+    {
+      "epoch": 0.10754081368230112,
+      "grad_norm": 0.55859375,
+      "learning_rate": 0.000990992585300238,
+      "loss": 8.752,
+      "step": 2490
+    },
+    {
+      "epoch": 0.10758400276410124,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.0009909790862694529,
+      "loss": 8.3788,
+      "step": 2491
+    },
+    {
+      "epoch": 0.10762719184590136,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.0009909655772231056,
+      "loss": 8.378,
+      "step": 2492
+    },
+    {
+      "epoch": 0.10767038092770148,
+      "grad_norm": 0.58203125,
+      "learning_rate": 0.0009909520581614724,
+      "loss": 8.2585,
+      "step": 2493
+    },
+    {
+      "epoch": 0.1077135700095016,
+      "grad_norm": 0.453125,
+      "learning_rate": 0.0009909385290848287,
+      "loss": 8.3038,
+      "step": 2494
+    },
+    {
+      "epoch": 0.10775675909130172,
+      "grad_norm": 0.466796875,
+      "learning_rate": 0.0009909249899934505,
+      "loss": 8.5207,
+      "step": 2495
+    },
+    {
+      "epoch": 0.10779994817310184,
+      "grad_norm": 0.6796875,
+      "learning_rate": 0.000990911440887614,
+      "loss": 7.9057,
+      "step": 2496
+    },
+    {
+      "epoch": 0.10784313725490197,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.0009908978817675957,
+      "loss": 8.4006,
+      "step": 2497
+    },
+    {
+      "epoch": 0.10788632633670209,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.000990884312633672,
+      "loss": 8.3619,
+      "step": 2498
+    },
+    {
+      "epoch": 0.10792951541850221,
+      "grad_norm": 0.53515625,
+      "learning_rate": 0.0009908707334861197,
+      "loss": 8.5389,
+      "step": 2499
+    },
+    {
+      "epoch": 0.10797270450030233,
+      "grad_norm": 0.65625,
+      "learning_rate": 0.000990857144325216,
+      "loss": 8.3051,
+      "step": 2500
+    },
+    {
+      "epoch": 0.10797270450030233,
+      "eval_loss": 8.355310440063477,
+      "eval_runtime": 21.5458,
+      "eval_samples_per_second": 1.114,
+      "eval_steps_per_second": 0.139,
+      "step": 2500
     }
   ],
   "logging_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 7991377920000.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null