Azrail commited on
Commit
a13a28f
·
verified ·
1 Parent(s): d2f60d9

Training in progress, step 70000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:255db0ba45691d582a9ee109acc81eb21c645c6dd171a1bf2c3e231a0982d734
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2cd2d83e8bcf2e24c3dfb835a421fc560d30e2495b3907943e618272cae7419
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:78af01c29bc4dc4815cd8cb2a0e12aac4f0221e2ad669f18caa4315a15ef83d7
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:976f02141679563933b488383c3b486345d46612fec4531d1a034013ee84ec05
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1397d04798a1fd86f4b074ba5cc769a269eab9bb0994d2bcfee86faa58f609a6
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f91f0395ad8bb44fd81f1444330dede040f6b66dbc15e61e2a7fe4c1ef60aa2a
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16992573f9fe212ca32ce6bacf3d51d66103db65f351073047f24fab4f0f55af
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c3a675e6db9104dd282c679a41cb4bdc17a98118d756c948d809458e24a6b37
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.3291317361699083,
6
  "eval_steps": 500,
7
- "global_step": 69000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -12290,11 +12290,189 @@
12290
  "eval_steps_per_second": 23.618,
12291
  "num_input_tokens_seen": 18087931456,
12292
  "step": 69000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12293
  }
12294
  ],
12295
  "logging_steps": 50,
12296
  "max_steps": 70000,
12297
- "num_input_tokens_seen": 18087931456,
12298
  "num_train_epochs": 1,
12299
  "save_steps": 1000,
12300
  "stateful_callbacks": {
@@ -12304,12 +12482,12 @@
12304
  "should_evaluate": false,
12305
  "should_log": false,
12306
  "should_save": true,
12307
- "should_training_stop": false
12308
  },
12309
  "attributes": {}
12310
  }
12311
  },
12312
- "total_flos": 4.838698202691011e+18,
12313
  "train_batch_size": 64,
12314
  "trial_name": null,
12315
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.333901761331791,
6
  "eval_steps": 500,
7
+ "global_step": 70000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
12290
  "eval_steps_per_second": 23.618,
12291
  "num_input_tokens_seen": 18087931456,
12292
  "step": 69000
12293
+ },
12294
+ {
12295
+ "epoch": 0.3293702374280024,
12296
+ "grad_norm": 0.14565595984458923,
12297
+ "learning_rate": 1.1318413143740436e-05,
12298
+ "loss": 2.5358,
12299
+ "num_input_tokens_seen": 18101038656,
12300
+ "step": 69050
12301
+ },
12302
+ {
12303
+ "epoch": 0.3296087386860966,
12304
+ "grad_norm": 0.15810008347034454,
12305
+ "learning_rate": 1.0162300788382261e-05,
12306
+ "loss": 2.5288,
12307
+ "num_input_tokens_seen": 18114145856,
12308
+ "step": 69100
12309
+ },
12310
+ {
12311
+ "epoch": 0.3298472399441907,
12312
+ "grad_norm": 0.14960281550884247,
12313
+ "learning_rate": 9.0678523819408e-06,
12314
+ "loss": 2.5267,
12315
+ "num_input_tokens_seen": 18127253056,
12316
+ "step": 69150
12317
+ },
12318
+ {
12319
+ "epoch": 0.33008574120228484,
12320
+ "grad_norm": 0.14473624527454376,
12321
+ "learning_rate": 8.035205700685167e-06,
12322
+ "loss": 2.5133,
12323
+ "num_input_tokens_seen": 18140360256,
12324
+ "step": 69200
12325
+ },
12326
+ {
12327
+ "epoch": 0.330324242460379,
12328
+ "grad_norm": 0.1450708657503128,
12329
+ "learning_rate": 7.064490740882057e-06,
12330
+ "loss": 2.5302,
12331
+ "num_input_tokens_seen": 18153467456,
12332
+ "step": 69250
12333
+ },
12334
+ {
12335
+ "epoch": 0.3305627437184731,
12336
+ "grad_norm": 0.14883211255073547,
12337
+ "learning_rate": 6.15582970243117e-06,
12338
+ "loss": 2.5307,
12339
+ "num_input_tokens_seen": 18166574656,
12340
+ "step": 69300
12341
+ },
12342
+ {
12343
+ "epoch": 0.33080124497656727,
12344
+ "grad_norm": 0.15696081519126892,
12345
+ "learning_rate": 5.309336973481682e-06,
12346
+ "loss": 2.5341,
12347
+ "num_input_tokens_seen": 18179681856,
12348
+ "step": 69350
12349
+ },
12350
+ {
12351
+ "epoch": 0.33103974623466137,
12352
+ "grad_norm": 0.1564367264509201,
12353
+ "learning_rate": 4.52511911603265e-06,
12354
+ "loss": 2.5299,
12355
+ "num_input_tokens_seen": 18192789056,
12356
+ "step": 69400
12357
+ },
12358
+ {
12359
+ "epoch": 0.33127824749275553,
12360
+ "grad_norm": 0.15558916330337524,
12361
+ "learning_rate": 3.803274852517968e-06,
12362
+ "loss": 2.5197,
12363
+ "num_input_tokens_seen": 18205896256,
12364
+ "step": 69450
12365
+ },
12366
+ {
12367
+ "epoch": 0.3315167487508497,
12368
+ "grad_norm": 0.1532556265592575,
12369
+ "learning_rate": 3.143895053378698e-06,
12370
+ "loss": 2.5176,
12371
+ "num_input_tokens_seen": 18219003456,
12372
+ "step": 69500
12373
+ },
12374
+ {
12375
+ "epoch": 0.3315167487508497,
12376
+ "eval_loss": 2.412046194076538,
12377
+ "eval_runtime": 53.2476,
12378
+ "eval_samples_per_second": 93.901,
12379
+ "eval_steps_per_second": 23.475,
12380
+ "num_input_tokens_seen": 18219003456,
12381
+ "step": 69500
12382
+ },
12383
+ {
12384
+ "epoch": 0.3317552500089438,
12385
+ "grad_norm": 0.1502823829650879,
12386
+ "learning_rate": 2.547062725623828e-06,
12387
+ "loss": 2.5207,
12388
+ "num_input_tokens_seen": 18232110656,
12389
+ "step": 69550
12390
+ },
12391
+ {
12392
+ "epoch": 0.33199375126703795,
12393
+ "grad_norm": 0.1560440957546234,
12394
+ "learning_rate": 2.012853002380466e-06,
12395
+ "loss": 2.5078,
12396
+ "num_input_tokens_seen": 18245217856,
12397
+ "step": 69600
12398
+ },
12399
+ {
12400
+ "epoch": 0.33223225252513205,
12401
+ "grad_norm": 0.15284490585327148,
12402
+ "learning_rate": 1.541333133436018e-06,
12403
+ "loss": 2.5404,
12404
+ "num_input_tokens_seen": 18258325056,
12405
+ "step": 69650
12406
+ },
12407
+ {
12408
+ "epoch": 0.3324707537832262,
12409
+ "grad_norm": 0.14594900608062744,
12410
+ "learning_rate": 1.132562476771959e-06,
12411
+ "loss": 2.5267,
12412
+ "num_input_tokens_seen": 18271432256,
12413
+ "step": 69700
12414
+ },
12415
+ {
12416
+ "epoch": 0.3327092550413203,
12417
+ "grad_norm": 0.15198394656181335,
12418
+ "learning_rate": 7.865924910916978e-07,
12419
+ "loss": 2.5232,
12420
+ "num_input_tokens_seen": 18284539456,
12421
+ "step": 69750
12422
+ },
12423
+ {
12424
+ "epoch": 0.3329477562994145,
12425
+ "grad_norm": 0.15011271834373474,
12426
+ "learning_rate": 5.034667293427053e-07,
12427
+ "loss": 2.5308,
12428
+ "num_input_tokens_seen": 18297646656,
12429
+ "step": 69800
12430
+ },
12431
+ {
12432
+ "epoch": 0.33318625755750864,
12433
+ "grad_norm": 0.147654727101326,
12434
+ "learning_rate": 2.8322083323334415e-07,
12435
+ "loss": 2.5281,
12436
+ "num_input_tokens_seen": 18310753856,
12437
+ "step": 69850
12438
+ },
12439
+ {
12440
+ "epoch": 0.33342475881560274,
12441
+ "grad_norm": 0.15056386590003967,
12442
+ "learning_rate": 1.2588252874673466e-07,
12443
+ "loss": 2.5112,
12444
+ "num_input_tokens_seen": 18323861056,
12445
+ "step": 69900
12446
+ },
12447
+ {
12448
+ "epoch": 0.3336632600736969,
12449
+ "grad_norm": 0.14858213067054749,
12450
+ "learning_rate": 3.147162264971471e-08,
12451
+ "loss": 2.5226,
12452
+ "num_input_tokens_seen": 18336968256,
12453
+ "step": 69950
12454
+ },
12455
+ {
12456
+ "epoch": 0.333901761331791,
12457
+ "grad_norm": 0.1534891128540039,
12458
+ "learning_rate": 0.0,
12459
+ "loss": 2.5303,
12460
+ "num_input_tokens_seen": 18350075456,
12461
+ "step": 70000
12462
+ },
12463
+ {
12464
+ "epoch": 0.333901761331791,
12465
+ "eval_loss": 2.411842107772827,
12466
+ "eval_runtime": 53.9812,
12467
+ "eval_samples_per_second": 92.625,
12468
+ "eval_steps_per_second": 23.156,
12469
+ "num_input_tokens_seen": 18350075456,
12470
+ "step": 70000
12471
  }
12472
  ],
12473
  "logging_steps": 50,
12474
  "max_steps": 70000,
12475
+ "num_input_tokens_seen": 18350075456,
12476
  "num_train_epochs": 1,
12477
  "save_steps": 1000,
12478
  "stateful_callbacks": {
 
12482
  "should_evaluate": false,
12483
  "should_log": false,
12484
  "should_save": true,
12485
+ "should_training_stop": true
12486
  },
12487
  "attributes": {}
12488
  }
12489
  },
12490
+ "total_flos": 4.908824281216451e+18,
12491
  "train_batch_size": 64,
12492
  "trial_name": null,
12493
  "trial_params": null