Azrail commited on
Commit
b9e6be8
·
verified ·
1 Parent(s): 613f415

Training in progress, step 64000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a92901e6dc98a2f43e5ab06e2e35886c7f4c68e401e8be0d01acd281cd82349c
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4464ecdd36ba9fcbb768fb530bac7125d2ebc3403bceccb2b1857ab10495094
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34104db96694bb116cf3048bcf68919612a7c6c79ff646c13c6e8d5a81aff8f6
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca3203148d8c928e5e1184f1534f1177f73f6ec7ebdc7acfe3b62c2af0779f9d
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1a97095234a7b82e99cd1b23ba4db26c35942b8b4622876b166d0ce65b7c7110
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f03ef68c121377c551657263f23acf972b60bf546b00ad9803912e5c78e5ecd
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d1de5c681ac3c8b6bb5235a71c5b6efd72fc9171aa2c9c6e093b8695c8a08b8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a5e9561ab6074dc857170aae9d2b27d70afb0686bb61ba701f52af71ad4d4a9
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.30051158519861193,
6
  "eval_steps": 500,
7
- "global_step": 63000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -11222,11 +11222,189 @@
11222
  "eval_steps_per_second": 23.044,
11223
  "num_input_tokens_seen": 16515067456,
11224
  "step": 63000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11225
  }
11226
  ],
11227
  "logging_steps": 50,
11228
  "max_steps": 70000,
11229
- "num_input_tokens_seen": 16515067456,
11230
  "num_train_epochs": 1,
11231
  "save_steps": 1000,
11232
  "stateful_callbacks": {
@@ -11241,7 +11419,7 @@
11241
  "attributes": {}
11242
  }
11243
  },
11244
- "total_flos": 4.4179417315383706e+18,
11245
  "train_batch_size": 64,
11246
  "trial_name": null,
11247
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.30528161036049467,
6
  "eval_steps": 500,
7
+ "global_step": 64000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
11222
  "eval_steps_per_second": 23.044,
11223
  "num_input_tokens_seen": 16515067456,
11224
  "step": 63000
11225
+ },
11226
+ {
11227
+ "epoch": 0.30075008645670603,
11228
+ "grad_norm": 0.20152603089809418,
11229
+ "learning_rate": 0.0004943901308226771,
11230
+ "loss": 2.5562,
11231
+ "num_input_tokens_seen": 16528174656,
11232
+ "step": 63050
11233
+ },
11234
+ {
11235
+ "epoch": 0.3009885877148002,
11236
+ "grad_norm": 0.18534454703330994,
11237
+ "learning_rate": 0.0004887809678520976,
11238
+ "loss": 2.5559,
11239
+ "num_input_tokens_seen": 16541281856,
11240
+ "step": 63100
11241
+ },
11242
+ {
11243
+ "epoch": 0.30122708897289435,
11244
+ "grad_norm": 0.18770301342010498,
11245
+ "learning_rate": 0.0004831732172061032,
11246
+ "loss": 2.5538,
11247
+ "num_input_tokens_seen": 16554389056,
11248
+ "step": 63150
11249
+ },
11250
+ {
11251
+ "epoch": 0.30146559023098846,
11252
+ "grad_norm": 0.19565705955028534,
11253
+ "learning_rate": 0.0004775675848247427,
11254
+ "loss": 2.5593,
11255
+ "num_input_tokens_seen": 16567496256,
11256
+ "step": 63200
11257
+ },
11258
+ {
11259
+ "epoch": 0.3017040914890826,
11260
+ "grad_norm": 0.1954822540283203,
11261
+ "learning_rate": 0.00047196477638140405,
11262
+ "loss": 2.5694,
11263
+ "num_input_tokens_seen": 16580603456,
11264
+ "step": 63250
11265
+ },
11266
+ {
11267
+ "epoch": 0.3019425927471767,
11268
+ "grad_norm": 0.18120840191841125,
11269
+ "learning_rate": 0.0004663654971939802,
11270
+ "loss": 2.5622,
11271
+ "num_input_tokens_seen": 16593710656,
11272
+ "step": 63300
11273
+ },
11274
+ {
11275
+ "epoch": 0.3021810940052709,
11276
+ "grad_norm": 0.18100927770137787,
11277
+ "learning_rate": 0.0004607704521360776,
11278
+ "loss": 2.5437,
11279
+ "num_input_tokens_seen": 16606817856,
11280
+ "step": 63350
11281
+ },
11282
+ {
11283
+ "epoch": 0.30241959526336504,
11284
+ "grad_norm": 0.20565176010131836,
11285
+ "learning_rate": 0.0004551803455482833,
11286
+ "loss": 2.5463,
11287
+ "num_input_tokens_seen": 16619925056,
11288
+ "step": 63400
11289
+ },
11290
+ {
11291
+ "epoch": 0.30265809652145914,
11292
+ "grad_norm": 0.18989761173725128,
11293
+ "learning_rate": 0.0004495958811494978,
11294
+ "loss": 2.5609,
11295
+ "num_input_tokens_seen": 16633032256,
11296
+ "step": 63450
11297
+ },
11298
+ {
11299
+ "epoch": 0.3028965977795533,
11300
+ "grad_norm": 0.1870686262845993,
11301
+ "learning_rate": 0.0004440177619483461,
11302
+ "loss": 2.5554,
11303
+ "num_input_tokens_seen": 16646139456,
11304
+ "step": 63500
11305
+ },
11306
+ {
11307
+ "epoch": 0.3028965977795533,
11308
+ "eval_loss": 2.4395649433135986,
11309
+ "eval_runtime": 53.4665,
11310
+ "eval_samples_per_second": 93.516,
11311
+ "eval_steps_per_second": 23.379,
11312
+ "num_input_tokens_seen": 16646139456,
11313
+ "step": 63500
11314
+ },
11315
+ {
11316
+ "epoch": 0.3031350990376474,
11317
+ "grad_norm": 0.1891048699617386,
11318
+ "learning_rate": 0.00043844669015467863,
11319
+ "loss": 2.5627,
11320
+ "num_input_tokens_seen": 16659246656,
11321
+ "step": 63550
11322
+ },
11323
+ {
11324
+ "epoch": 0.30337360029574156,
11325
+ "grad_norm": 0.18591411411762238,
11326
+ "learning_rate": 0.0004328833670911724,
11327
+ "loss": 2.5545,
11328
+ "num_input_tokens_seen": 16672353856,
11329
+ "step": 63600
11330
+ },
11331
+ {
11332
+ "epoch": 0.3036121015538357,
11333
+ "grad_norm": 0.18640951812267303,
11334
+ "learning_rate": 0.0004273284931050438,
11335
+ "loss": 2.5672,
11336
+ "num_input_tokens_seen": 16685461056,
11337
+ "step": 63650
11338
+ },
11339
+ {
11340
+ "epoch": 0.3038506028119298,
11341
+ "grad_norm": 0.1919756680727005,
11342
+ "learning_rate": 0.0004217827674798845,
11343
+ "loss": 2.5492,
11344
+ "num_input_tokens_seen": 16698568256,
11345
+ "step": 63700
11346
+ },
11347
+ {
11348
+ "epoch": 0.304089104070024,
11349
+ "grad_norm": 0.18388938903808594,
11350
+ "learning_rate": 0.00041624688834763184,
11351
+ "loss": 2.5487,
11352
+ "num_input_tokens_seen": 16711675456,
11353
+ "step": 63750
11354
+ },
11355
+ {
11356
+ "epoch": 0.3043276053281181,
11357
+ "grad_norm": 0.1851562261581421,
11358
+ "learning_rate": 0.0004107215526006817,
11359
+ "loss": 2.5539,
11360
+ "num_input_tokens_seen": 16724782656,
11361
+ "step": 63800
11362
+ },
11363
+ {
11364
+ "epoch": 0.30456610658621225,
11365
+ "grad_norm": 0.17315496504306793,
11366
+ "learning_rate": 0.0004052074558041608,
11367
+ "loss": 2.5544,
11368
+ "num_input_tokens_seen": 16737889856,
11369
+ "step": 63850
11370
+ },
11371
+ {
11372
+ "epoch": 0.30480460784430635,
11373
+ "grad_norm": 0.17985352873802185,
11374
+ "learning_rate": 0.00039970529210836363,
11375
+ "loss": 2.5511,
11376
+ "num_input_tokens_seen": 16750997056,
11377
+ "step": 63900
11378
+ },
11379
+ {
11380
+ "epoch": 0.3050431091024005,
11381
+ "grad_norm": 0.20455212891101837,
11382
+ "learning_rate": 0.0003942157541613686,
11383
+ "loss": 2.5593,
11384
+ "num_input_tokens_seen": 16764104256,
11385
+ "step": 63950
11386
+ },
11387
+ {
11388
+ "epoch": 0.30528161036049467,
11389
+ "grad_norm": 0.1965632140636444,
11390
+ "learning_rate": 0.00038873953302184284,
11391
+ "loss": 2.5599,
11392
+ "num_input_tokens_seen": 16777211456,
11393
+ "step": 64000
11394
+ },
11395
+ {
11396
+ "epoch": 0.30528161036049467,
11397
+ "eval_loss": 2.437380790710449,
11398
+ "eval_runtime": 53.2524,
11399
+ "eval_samples_per_second": 93.893,
11400
+ "eval_steps_per_second": 23.473,
11401
+ "num_input_tokens_seen": 16777211456,
11402
+ "step": 64000
11403
  }
11404
  ],
11405
  "logging_steps": 50,
11406
  "max_steps": 70000,
11407
+ "num_input_tokens_seen": 16777211456,
11408
  "num_train_epochs": 1,
11409
  "save_steps": 1000,
11410
  "stateful_callbacks": {
 
11419
  "attributes": {}
11420
  }
11421
  },
11422
+ "total_flos": 4.4880678100638106e+18,
11423
  "train_batch_size": 64,
11424
  "trial_name": null,
11425
  "trial_params": null