Training in progress, step 300, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +711 -3

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d11d0d50a5f319c00547314089e917e81cf7d4025ad36d223d9654b7fd00af18
 size 40001880

 version https://git-lfs.github.com/spec/v1
+oid sha256:f8af9875dba3abc72b08d5cc40e93518bd82b3e85f63b6cad6ce774111d3dff6
 size 40001880

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:daad3398daf503ff56b15ae48fea43ae113a26da4f3b05c6bdf650fa55832975
 size 40043787

 version https://git-lfs.github.com/spec/v1
+oid sha256:3c652b40dc1376b5f1c6481c56235751d13ff2f60b4ee715e4f48755095005c2
 size 40043787

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3f122671057b196d87919746ab31fde7751674c4ff91d4edbe8291727f123568
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:793829d79b248c3a7b8954f2cd95073c2ba034f6ee2bb0edff8ce8fef88cb5ad
 size 14645

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c1974eed1e05a84c5cdcaa5ca9e0c0e7cecb70b73f4e5db163c1db3a32122b7e
 size 1401

 version https://git-lfs.github.com/spec/v1
+oid sha256:954462bb708bcdba3c0732881adee53bb51d512303049568372c51a54b8cb129
 size 1401

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.14144271570014144,
   "eval_steps": 100,
-  "global_step": 200,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1424,6 +1424,714 @@
       "eval_samples_per_second": 149.98,
       "eval_steps_per_second": 18.953,
       "step": 200
     }
   ],
   "logging_steps": 1,
@@ -1443,7 +2151,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.4851460038656e+16,
   "train_batch_size": 128,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.21216407355021216,
   "eval_steps": 100,
+  "global_step": 300,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 149.98,
       "eval_steps_per_second": 18.953,
       "step": 200
+    },
+    {
+      "epoch": 0.14214992927864215,
+      "grad_norm": 0.76953125,
+      "learning_rate": 0.0008796665258991866,
+      "loss": 10.2804,
+      "step": 201
+    },
+    {
+      "epoch": 0.14285714285714285,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0008793524987925326,
+      "loss": 10.2891,
+      "step": 202
+    },
+    {
+      "epoch": 0.14356435643564355,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0008790361222612515,
+      "loss": 10.2691,
+      "step": 203
+    },
+    {
+      "epoch": 0.14427157001414428,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0008787173980365612,
+      "loss": 10.2514,
+      "step": 204
+    },
+    {
+      "epoch": 0.144978783592645,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0008783963278625267,
+      "loss": 10.264,
+      "step": 205
+    },
+    {
+      "epoch": 0.1456859971711457,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0008780729134960495,
+      "loss": 10.2424,
+      "step": 206
+    },
+    {
+      "epoch": 0.1463932107496464,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.000877747156706859,
+      "loss": 10.2754,
+      "step": 207
+    },
+    {
+      "epoch": 0.1471004243281471,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.0008774190592775022,
+      "loss": 10.2663,
+      "step": 208
+    },
+    {
+      "epoch": 0.1478076379066478,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.0008770886230033342,
+      "loss": 10.255,
+      "step": 209
+    },
+    {
+      "epoch": 0.1485148514851485,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0008767558496925081,
+      "loss": 10.2529,
+      "step": 210
+    },
+    {
+      "epoch": 0.1492220650636492,
+      "grad_norm": 0.76953125,
+      "learning_rate": 0.0008764207411659658,
+      "loss": 10.2576,
+      "step": 211
+    },
+    {
+      "epoch": 0.14992927864214992,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0008760832992574269,
+      "loss": 10.243,
+      "step": 212
+    },
+    {
+      "epoch": 0.15063649222065065,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.0008757435258133798,
+      "loss": 10.2374,
+      "step": 213
+    },
+    {
+      "epoch": 0.15134370579915135,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0008754014226930707,
+      "loss": 10.248,
+      "step": 214
+    },
+    {
+      "epoch": 0.15205091937765206,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.0008750569917684944,
+      "loss": 10.2509,
+      "step": 215
+    },
+    {
+      "epoch": 0.15275813295615276,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0008747102349243827,
+      "loss": 10.2583,
+      "step": 216
+    },
+    {
+      "epoch": 0.15346534653465346,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0008743611540581957,
+      "loss": 10.2646,
+      "step": 217
+    },
+    {
+      "epoch": 0.15417256011315417,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.00087400975108011,
+      "loss": 10.2393,
+      "step": 218
+    },
+    {
+      "epoch": 0.15487977369165487,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0008736560279130091,
+      "loss": 10.2518,
+      "step": 219
+    },
+    {
+      "epoch": 0.15558698727015557,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.0008732999864924726,
+      "loss": 10.2396,
+      "step": 220
+    },
+    {
+      "epoch": 0.1562942008486563,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.0008729416287667654,
+      "loss": 10.2418,
+      "step": 221
+    },
+    {
+      "epoch": 0.157001414427157,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0008725809566968277,
+      "loss": 10.2397,
+      "step": 222
+    },
+    {
+      "epoch": 0.15770862800565771,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0008722179722562636,
+      "loss": 10.2309,
+      "step": 223
+    },
+    {
+      "epoch": 0.15841584158415842,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0008718526774313301,
+      "loss": 10.248,
+      "step": 224
+    },
+    {
+      "epoch": 0.15912305516265912,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0008714850742209275,
+      "loss": 10.232,
+      "step": 225
+    },
+    {
+      "epoch": 0.15983026874115983,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.000871115164636587,
+      "loss": 10.261,
+      "step": 226
+    },
+    {
+      "epoch": 0.16053748231966053,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.0008707429507024607,
+      "loss": 10.2448,
+      "step": 227
+    },
+    {
+      "epoch": 0.16124469589816123,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.0008703684344553098,
+      "loss": 10.2303,
+      "step": 228
+    },
+    {
+      "epoch": 0.16195190947666196,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.000869991617944494,
+      "loss": 10.2336,
+      "step": 229
+    },
+    {
+      "epoch": 0.16265912305516267,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0008696125032319601,
+      "loss": 10.2486,
+      "step": 230
+    },
+    {
+      "epoch": 0.16336633663366337,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0008692310923922306,
+      "loss": 10.2299,
+      "step": 231
+    },
+    {
+      "epoch": 0.16407355021216408,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0008688473875123925,
+      "loss": 10.2432,
+      "step": 232
+    },
+    {
+      "epoch": 0.16478076379066478,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.000868461390692086,
+      "loss": 10.2641,
+      "step": 233
+    },
+    {
+      "epoch": 0.16548797736916548,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0008680731040434925,
+      "loss": 10.2143,
+      "step": 234
+    },
+    {
+      "epoch": 0.1661951909476662,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0008676825296913235,
+      "loss": 10.2482,
+      "step": 235
+    },
+    {
+      "epoch": 0.1669024045261669,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0008672896697728091,
+      "loss": 10.2266,
+      "step": 236
+    },
+    {
+      "epoch": 0.1676096181046676,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0008668945264376857,
+      "loss": 10.2357,
+      "step": 237
+    },
+    {
+      "epoch": 0.16831683168316833,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0008664971018481848,
+      "loss": 10.2237,
+      "step": 238
+    },
+    {
+      "epoch": 0.16902404526166903,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.000866097398179021,
+      "loss": 10.2447,
+      "step": 239
+    },
+    {
+      "epoch": 0.16973125884016974,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.00086569541761738,
+      "loss": 10.2407,
+      "step": 240
+    },
+    {
+      "epoch": 0.17043847241867044,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.0008652911623629067,
+      "loss": 10.2269,
+      "step": 241
+    },
+    {
+      "epoch": 0.17114568599717114,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.000864884634627693,
+      "loss": 10.2356,
+      "step": 242
+    },
+    {
+      "epoch": 0.17185289957567185,
+      "grad_norm": 0.78125,
+      "learning_rate": 0.0008644758366362661,
+      "loss": 10.2402,
+      "step": 243
+    },
+    {
+      "epoch": 0.17256011315417255,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0008640647706255762,
+      "loss": 10.2485,
+      "step": 244
+    },
+    {
+      "epoch": 0.17326732673267325,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.0008636514388449835,
+      "loss": 10.2711,
+      "step": 245
+    },
+    {
+      "epoch": 0.173974540311174,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0008632358435562474,
+      "loss": 10.2419,
+      "step": 246
+    },
+    {
+      "epoch": 0.1746817538896747,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.0008628179870335125,
+      "loss": 10.2475,
+      "step": 247
+    },
+    {
+      "epoch": 0.1753889674681754,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0008623978715632973,
+      "loss": 10.2354,
+      "step": 248
+    },
+    {
+      "epoch": 0.1760961810466761,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0008619754994444814,
+      "loss": 10.2518,
+      "step": 249
+    },
+    {
+      "epoch": 0.1768033946251768,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0008615508729882928,
+      "loss": 10.2316,
+      "step": 250
+    },
+    {
+      "epoch": 0.1775106082036775,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0008611239945182946,
+      "loss": 10.2365,
+      "step": 251
+    },
+    {
+      "epoch": 0.1782178217821782,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0008606948663703739,
+      "loss": 10.2641,
+      "step": 252
+    },
+    {
+      "epoch": 0.1789250353606789,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0008602634908927277,
+      "loss": 10.2601,
+      "step": 253
+    },
+    {
+      "epoch": 0.17963224893917965,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0008598298704458502,
+      "loss": 10.2662,
+      "step": 254
+    },
+    {
+      "epoch": 0.18033946251768035,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0008593940074025203,
+      "loss": 10.2693,
+      "step": 255
+    },
+    {
+      "epoch": 0.18104667609618105,
+      "grad_norm": 0.78515625,
+      "learning_rate": 0.0008589559041477887,
+      "loss": 10.2418,
+      "step": 256
+    },
+    {
+      "epoch": 0.18175388967468176,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.000858515563078964,
+      "loss": 10.2431,
+      "step": 257
+    },
+    {
+      "epoch": 0.18246110325318246,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0008580729866056009,
+      "loss": 10.2722,
+      "step": 258
+    },
+    {
+      "epoch": 0.18316831683168316,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0008576281771494854,
+      "loss": 10.2565,
+      "step": 259
+    },
+    {
+      "epoch": 0.18387553041018387,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0008571811371446231,
+      "loss": 10.2524,
+      "step": 260
+    },
+    {
+      "epoch": 0.18458274398868457,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0008567318690372251,
+      "loss": 10.2671,
+      "step": 261
+    },
+    {
+      "epoch": 0.18528995756718528,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.0008562803752856944,
+      "loss": 10.267,
+      "step": 262
+    },
+    {
+      "epoch": 0.185997171145686,
+      "grad_norm": 0.81640625,
+      "learning_rate": 0.000855826658360613,
+      "loss": 10.2668,
+      "step": 263
+    },
+    {
+      "epoch": 0.1867043847241867,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.000855370720744728,
+      "loss": 10.2517,
+      "step": 264
+    },
+    {
+      "epoch": 0.18741159830268742,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0008549125649329386,
+      "loss": 10.256,
+      "step": 265
+    },
+    {
+      "epoch": 0.18811881188118812,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.0008544521934322814,
+      "loss": 10.2607,
+      "step": 266
+    },
+    {
+      "epoch": 0.18882602545968882,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0008539896087619176,
+      "loss": 10.2778,
+      "step": 267
+    },
+    {
+      "epoch": 0.18953323903818953,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0008535248134531189,
+      "loss": 10.2601,
+      "step": 268
+    },
+    {
+      "epoch": 0.19024045261669023,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0008530578100492538,
+      "loss": 10.2399,
+      "step": 269
+    },
+    {
+      "epoch": 0.19094766619519093,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.000852588601105773,
+      "loss": 10.2647,
+      "step": 270
+    },
+    {
+      "epoch": 0.19165487977369167,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0008521171891901965,
+      "loss": 10.265,
+      "step": 271
+    },
+    {
+      "epoch": 0.19236209335219237,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.000851643576882099,
+      "loss": 10.2793,
+      "step": 272
+    },
+    {
+      "epoch": 0.19306930693069307,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0008511677667730952,
+      "loss": 10.283,
+      "step": 273
+    },
+    {
+      "epoch": 0.19377652050919378,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.000850689761466827,
+      "loss": 10.2648,
+      "step": 274
+    },
+    {
+      "epoch": 0.19448373408769448,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0008502095635789478,
+      "loss": 10.2946,
+      "step": 275
+    },
+    {
+      "epoch": 0.19519094766619519,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0008497271757371093,
+      "loss": 10.2982,
+      "step": 276
+    },
+    {
+      "epoch": 0.1958981612446959,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.0008492426005809464,
+      "loss": 10.2918,
+      "step": 277
+    },
+    {
+      "epoch": 0.1966053748231966,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0008487558407620629,
+      "loss": 10.2779,
+      "step": 278
+    },
+    {
+      "epoch": 0.19731258840169733,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0008482668989440178,
+      "loss": 10.294,
+      "step": 279
+    },
+    {
+      "epoch": 0.19801980198019803,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0008477757778023092,
+      "loss": 10.2892,
+      "step": 280
+    },
+    {
+      "epoch": 0.19872701555869873,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0008472824800243608,
+      "loss": 10.3013,
+      "step": 281
+    },
+    {
+      "epoch": 0.19943422913719944,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0008467870083095073,
+      "loss": 10.3026,
+      "step": 282
+    },
+    {
+      "epoch": 0.20014144271570014,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0008462893653689785,
+      "loss": 10.3071,
+      "step": 283
+    },
+    {
+      "epoch": 0.20084865629420084,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.0008457895539258857,
+      "loss": 10.3179,
+      "step": 284
+    },
+    {
+      "epoch": 0.20155586987270155,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.0008452875767152062,
+      "loss": 10.2782,
+      "step": 285
+    },
+    {
+      "epoch": 0.20226308345120225,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0008447834364837685,
+      "loss": 10.3004,
+      "step": 286
+    },
+    {
+      "epoch": 0.20297029702970298,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0008442771359902366,
+      "loss": 10.317,
+      "step": 287
+    },
+    {
+      "epoch": 0.2036775106082037,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0008437686780050964,
+      "loss": 10.3256,
+      "step": 288
+    },
+    {
+      "epoch": 0.2043847241867044,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0008432580653106389,
+      "loss": 10.3102,
+      "step": 289
+    },
+    {
+      "epoch": 0.2050919377652051,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.000842745300700946,
+      "loss": 10.3125,
+      "step": 290
+    },
+    {
+      "epoch": 0.2057991513437058,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0008422303869818752,
+      "loss": 10.325,
+      "step": 291
+    },
+    {
+      "epoch": 0.2065063649222065,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.0008417133269710432,
+      "loss": 10.3293,
+      "step": 292
+    },
+    {
+      "epoch": 0.2072135785007072,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0008411941234978122,
+      "loss": 10.3085,
+      "step": 293
+    },
+    {
+      "epoch": 0.2079207920792079,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0008406727794032725,
+      "loss": 10.3144,
+      "step": 294
+    },
+    {
+      "epoch": 0.20862800565770862,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0008401492975402288,
+      "loss": 10.3095,
+      "step": 295
+    },
+    {
+      "epoch": 0.20933521923620935,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0008396236807731831,
+      "loss": 10.3412,
+      "step": 296
+    },
+    {
+      "epoch": 0.21004243281471005,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00083909593197832,
+      "loss": 10.3543,
+      "step": 297
+    },
+    {
+      "epoch": 0.21074964639321075,
+      "grad_norm": 0.86328125,
+      "learning_rate": 0.0008385660540434904,
+      "loss": 10.3612,
+      "step": 298
+    },
+    {
+      "epoch": 0.21145685997171146,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.0008380340498681957,
+      "loss": 10.3614,
+      "step": 299
+    },
+    {
+      "epoch": 0.21216407355021216,
+      "grad_norm": 0.94140625,
+      "learning_rate": 0.0008374999223635726,
+      "loss": 10.3179,
+      "step": 300
+    },
+    {
+      "epoch": 0.21216407355021216,
+      "eval_loss": 10.361818313598633,
+      "eval_runtime": 1.2522,
+      "eval_samples_per_second": 145.349,
+      "eval_steps_per_second": 18.368,
+      "step": 300
     }
   ],
   "logging_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 2.2277190057984e+16,
   "train_batch_size": 128,
   "trial_name": null,
   "trial_params": null