Azrail commited on
Commit
5597448
·
verified ·
1 Parent(s): 9dc85f2

Training in progress, step 64000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d0f52abb8596fb1c55e5609ec97ec3ea8479c701d5763f12612f03207baebfdc
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:567849b3336c60bd2ca86c0e32d8fa276a554db52049aae022ae3912ae149f08
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc42234c3f4bb7923a06f1e41810d1e801108c51e07feed1ea66a8af7c05bc5a
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe74b2d737ce2dc3386b2964624b6ffd7d46aa98c026d78df24bca83b7a5f473
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1a97095234a7b82e99cd1b23ba4db26c35942b8b4622876b166d0ce65b7c7110
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f03ef68c121377c551657263f23acf972b60bf546b00ad9803912e5c78e5ecd
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ca0e8dbf69c9810c713183e067be8112924d576870302a9fb3c526f389826e7
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a987661a10dd2abc0dca231a45c2e361e0f28b82da18aba64a79545986bd62dc
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.4237694290715918,
6
  "eval_steps": 500,
7
- "global_step": 63000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -11222,11 +11222,189 @@
11222
  "eval_steps_per_second": 23.421,
11223
  "num_input_tokens_seen": 16515072000,
11224
  "step": 63000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11225
  }
11226
  ],
11227
  "logging_steps": 50,
11228
  "max_steps": 70000,
11229
- "num_input_tokens_seen": 16515072000,
11230
  "num_train_epochs": 1,
11231
  "save_steps": 1000,
11232
  "stateful_callbacks": {
@@ -11241,7 +11419,7 @@
11241
  "attributes": {}
11242
  }
11243
  },
11244
- "total_flos": 4.41794294710272e+18,
11245
  "train_batch_size": 64,
11246
  "trial_name": null,
11247
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.43049592794574404,
6
  "eval_steps": 500,
7
+ "global_step": 64000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
11222
  "eval_steps_per_second": 23.421,
11223
  "num_input_tokens_seen": 16515072000,
11224
  "step": 63000
11225
+ },
11226
+ {
11227
+ "epoch": 0.42410575401529943,
11228
+ "grad_norm": 0.15995506942272186,
11229
+ "learning_rate": 0.00039764519864531023,
11230
+ "loss": 2.9898,
11231
+ "num_input_tokens_seen": 16528179200,
11232
+ "step": 63050
11233
+ },
11234
+ {
11235
+ "epoch": 0.42444207895900704,
11236
+ "grad_norm": 0.16034817695617676,
11237
+ "learning_rate": 0.0003928454234674747,
11238
+ "loss": 2.9884,
11239
+ "num_input_tokens_seen": 16541286400,
11240
+ "step": 63100
11241
+ },
11242
+ {
11243
+ "epoch": 0.42477840390271465,
11244
+ "grad_norm": 0.17681469023227692,
11245
+ "learning_rate": 0.00038805597607075075,
11246
+ "loss": 2.9952,
11247
+ "num_input_tokens_seen": 16554393600,
11248
+ "step": 63150
11249
+ },
11250
+ {
11251
+ "epoch": 0.42511472884642226,
11252
+ "grad_norm": 0.18527273833751678,
11253
+ "learning_rate": 0.00038327731807204744,
11254
+ "loss": 2.9947,
11255
+ "num_input_tokens_seen": 16567500800,
11256
+ "step": 63200
11257
+ },
11258
+ {
11259
+ "epoch": 0.4254510537901299,
11260
+ "grad_norm": 0.16262546181678772,
11261
+ "learning_rate": 0.0003785099100483681,
11262
+ "loss": 2.9972,
11263
+ "num_input_tokens_seen": 16580608000,
11264
+ "step": 63250
11265
+ },
11266
+ {
11267
+ "epoch": 0.4257873787338375,
11268
+ "grad_norm": 0.1709870994091034,
11269
+ "learning_rate": 0.00037375421149242103,
11270
+ "loss": 2.999,
11271
+ "num_input_tokens_seen": 16593715200,
11272
+ "step": 63300
11273
+ },
11274
+ {
11275
+ "epoch": 0.4261237036775451,
11276
+ "grad_norm": 0.1716383844614029,
11277
+ "learning_rate": 0.0003690106807683313,
11278
+ "loss": 2.9964,
11279
+ "num_input_tokens_seen": 16606822400,
11280
+ "step": 63350
11281
+ },
11282
+ {
11283
+ "epoch": 0.4264600286212527,
11284
+ "grad_norm": 0.18682868778705597,
11285
+ "learning_rate": 0.0003642797750674629,
11286
+ "loss": 3.0037,
11287
+ "num_input_tokens_seen": 16619929600,
11288
+ "step": 63400
11289
+ },
11290
+ {
11291
+ "epoch": 0.4267963535649603,
11292
+ "grad_norm": 0.16003596782684326,
11293
+ "learning_rate": 0.00035956195036435405,
11294
+ "loss": 2.9893,
11295
+ "num_input_tokens_seen": 16633036800,
11296
+ "step": 63450
11297
+ },
11298
+ {
11299
+ "epoch": 0.42713267850866793,
11300
+ "grad_norm": 0.17876048386096954,
11301
+ "learning_rate": 0.0003548576613727689,
11302
+ "loss": 3.0004,
11303
+ "num_input_tokens_seen": 16646144000,
11304
+ "step": 63500
11305
+ },
11306
+ {
11307
+ "epoch": 0.42713267850866793,
11308
+ "eval_loss": 2.8903579711914062,
11309
+ "eval_runtime": 53.0482,
11310
+ "eval_samples_per_second": 94.254,
11311
+ "eval_steps_per_second": 23.563,
11312
+ "num_input_tokens_seen": 16646144000,
11313
+ "step": 63500
11314
+ },
11315
+ {
11316
+ "epoch": 0.42746900345237554,
11317
+ "grad_norm": 0.21229425072669983,
11318
+ "learning_rate": 0.00035016736150187165,
11319
+ "loss": 2.9925,
11320
+ "num_input_tokens_seen": 16659251200,
11321
+ "step": 63550
11322
+ },
11323
+ {
11324
+ "epoch": 0.42780532839608315,
11325
+ "grad_norm": 0.19477584958076477,
11326
+ "learning_rate": 0.00034549150281252633,
11327
+ "loss": 2.9892,
11328
+ "num_input_tokens_seen": 16672358400,
11329
+ "step": 63600
11330
+ },
11331
+ {
11332
+ "epoch": 0.42814165333979076,
11333
+ "grad_norm": 0.1866609901189804,
11334
+ "learning_rate": 0.0003408305359737252,
11335
+ "loss": 2.9913,
11336
+ "num_input_tokens_seen": 16685465600,
11337
+ "step": 63650
11338
+ },
11339
+ {
11340
+ "epoch": 0.4284779782834984,
11341
+ "grad_norm": 0.19487887620925903,
11342
+ "learning_rate": 0.0003361849102191533,
11343
+ "loss": 2.9875,
11344
+ "num_input_tokens_seen": 16698572800,
11345
+ "step": 63700
11346
+ },
11347
+ {
11348
+ "epoch": 0.428814303227206,
11349
+ "grad_norm": 0.15979841351509094,
11350
+ "learning_rate": 0.00033155507330389,
11351
+ "loss": 2.9894,
11352
+ "num_input_tokens_seen": 16711680000,
11353
+ "step": 63750
11354
+ },
11355
+ {
11356
+ "epoch": 0.4291506281709136,
11357
+ "grad_norm": 0.1749998778104782,
11358
+ "learning_rate": 0.0003269414714612534,
11359
+ "loss": 2.9945,
11360
+ "num_input_tokens_seen": 16724787200,
11361
+ "step": 63800
11362
+ },
11363
+ {
11364
+ "epoch": 0.4294869531146212,
11365
+ "grad_norm": 0.16839075088500977,
11366
+ "learning_rate": 0.00032234454935979205,
11367
+ "loss": 2.9989,
11368
+ "num_input_tokens_seen": 16737894400,
11369
+ "step": 63850
11370
+ },
11371
+ {
11372
+ "epoch": 0.4298232780583288,
11373
+ "grad_norm": 0.19226372241973877,
11374
+ "learning_rate": 0.0003177647500604252,
11375
+ "loss": 2.9854,
11376
+ "num_input_tokens_seen": 16751001600,
11377
+ "step": 63900
11378
+ },
11379
+ {
11380
+ "epoch": 0.43015960300203643,
11381
+ "grad_norm": 0.15530380606651306,
11382
+ "learning_rate": 0.0003132025149737419,
11383
+ "loss": 2.9903,
11384
+ "num_input_tokens_seen": 16764108800,
11385
+ "step": 63950
11386
+ },
11387
+ {
11388
+ "epoch": 0.43049592794574404,
11389
+ "grad_norm": 0.17773845791816711,
11390
+ "learning_rate": 0.0003086582838174551,
11391
+ "loss": 2.9839,
11392
+ "num_input_tokens_seen": 16777216000,
11393
+ "step": 64000
11394
+ },
11395
+ {
11396
+ "epoch": 0.43049592794574404,
11397
+ "eval_loss": 2.8860437870025635,
11398
+ "eval_runtime": 53.1514,
11399
+ "eval_samples_per_second": 94.071,
11400
+ "eval_steps_per_second": 23.518,
11401
+ "num_input_tokens_seen": 16777216000,
11402
+ "step": 64000
11403
  }
11404
  ],
11405
  "logging_steps": 50,
11406
  "max_steps": 70000,
11407
+ "num_input_tokens_seen": 16777216000,
11408
  "num_train_epochs": 1,
11409
  "save_steps": 1000,
11410
  "stateful_callbacks": {
 
11419
  "attributes": {}
11420
  }
11421
  },
11422
+ "total_flos": 4.48806902562816e+18,
11423
  "train_batch_size": 64,
11424
  "trial_name": null,
11425
  "trial_params": null