Azrail commited on
Commit
4d41ac6
·
verified ·
1 Parent(s): 8c30211

Training in progress, step 121000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e70907b0d675ee2643842e014ed6c972c9663ac94c350f0ab42a0be8632152c
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71fae22dcd21758bd18c93255be6587d157b9938e670e9b4e1e58707f826293b
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3a65d04a4bb9bbc428894a0e56fe5a8ff86920144b87270537f75bf5b3558c9
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c78fd0c407d20f07636b49b2421a64b67521b73a2c07508922e8bab006631080
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d8a9a435a8fb7efaea34ed653a04299793c4ab23d440f306a1001d1a5e2fe4d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d05682589c4464dbd9ebcfc283944f7611626ce7745ad85f4042e5c5171b5198
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f14fb013cc682f88bd394d32631eff6723ea097f4e238bec79824a853a5616c4
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5601bca8adb9619336ad1a8f8dd5a3bb4b196a7ee7870568f8cb821d9554477
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.1448084238644358,
6
  "eval_steps": 500,
7
- "global_step": 120000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -21368,11 +21368,189 @@
21368
  "eval_steps_per_second": 15.087,
21369
  "num_input_tokens_seen": 62904447680,
21370
  "step": 120000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21371
  }
21372
  ],
21373
  "logging_steps": 50,
21374
  "max_steps": 140000,
21375
- "num_input_tokens_seen": 62904447680,
21376
  "num_train_epochs": 2,
21377
  "save_steps": 1000,
21378
  "stateful_callbacks": {
@@ -21387,7 +21565,7 @@
21387
  "attributes": {}
21388
  }
21389
  },
21390
- "total_flos": 1.1132939862234317e+20,
21391
  "train_batch_size": 32,
21392
  "trial_name": null,
21393
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.1543484741882013,
6
  "eval_steps": 500,
7
+ "global_step": 121000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
21368
  "eval_steps_per_second": 15.087,
21369
  "num_input_tokens_seen": 62904447680,
21370
  "step": 120000
21371
+ },
21372
+ {
21373
+ "epoch": 1.1452854263806242,
21374
+ "grad_norm": 0.15100175142288208,
21375
+ "learning_rate": 0.0008095469746549171,
21376
+ "loss": 2.0793,
21377
+ "num_input_tokens_seen": 62930656352,
21378
+ "step": 120050
21379
+ },
21380
+ {
21381
+ "epoch": 1.1457624288968125,
21382
+ "grad_norm": 0.14095434546470642,
21383
+ "learning_rate": 0.0008073393063582386,
21384
+ "loss": 2.0828,
21385
+ "num_input_tokens_seen": 62956868576,
21386
+ "step": 120100
21387
+ },
21388
+ {
21389
+ "epoch": 1.1462394314130007,
21390
+ "grad_norm": 0.15013264119625092,
21391
+ "learning_rate": 0.0008051219655187818,
21392
+ "loss": 2.0711,
21393
+ "num_input_tokens_seen": 62983080544,
21394
+ "step": 120150
21395
+ },
21396
+ {
21397
+ "epoch": 1.146716433929189,
21398
+ "grad_norm": 0.1443673074245453,
21399
+ "learning_rate": 0.00080289502192041,
21400
+ "loss": 2.0764,
21401
+ "num_input_tokens_seen": 63009276608,
21402
+ "step": 120200
21403
+ },
21404
+ {
21405
+ "epoch": 1.1471934364453773,
21406
+ "grad_norm": 0.13627703487873077,
21407
+ "learning_rate": 0.0008006585456492029,
21408
+ "loss": 2.0805,
21409
+ "num_input_tokens_seen": 63035488032,
21410
+ "step": 120250
21411
+ },
21412
+ {
21413
+ "epoch": 1.1476704389615655,
21414
+ "grad_norm": 0.14744721353054047,
21415
+ "learning_rate": 0.0007984126070912518,
21416
+ "loss": 2.0691,
21417
+ "num_input_tokens_seen": 63061701600,
21418
+ "step": 120300
21419
+ },
21420
+ {
21421
+ "epoch": 1.1481474414777537,
21422
+ "grad_norm": 0.14301970601081848,
21423
+ "learning_rate": 0.0007961572769304437,
21424
+ "loss": 2.0788,
21425
+ "num_input_tokens_seen": 63087914624,
21426
+ "step": 120350
21427
+ },
21428
+ {
21429
+ "epoch": 1.1486244439939421,
21430
+ "grad_norm": 0.13261480629444122,
21431
+ "learning_rate": 0.0007938926261462366,
21432
+ "loss": 2.0802,
21433
+ "num_input_tokens_seen": 63114128096,
21434
+ "step": 120400
21435
+ },
21436
+ {
21437
+ "epoch": 1.1491014465101304,
21438
+ "grad_norm": 0.14857733249664307,
21439
+ "learning_rate": 0.0007916187260114262,
21440
+ "loss": 2.0773,
21441
+ "num_input_tokens_seen": 63140341024,
21442
+ "step": 120450
21443
+ },
21444
+ {
21445
+ "epoch": 1.1495784490263186,
21446
+ "grad_norm": 0.13263733685016632,
21447
+ "learning_rate": 0.000789335648089903,
21448
+ "loss": 2.0796,
21449
+ "num_input_tokens_seen": 63166554368,
21450
+ "step": 120500
21451
+ },
21452
+ {
21453
+ "epoch": 1.1495784490263186,
21454
+ "eval_loss": 1.9961134195327759,
21455
+ "eval_runtime": 82.5305,
21456
+ "eval_samples_per_second": 60.584,
21457
+ "eval_steps_per_second": 15.146,
21458
+ "num_input_tokens_seen": 63166554368,
21459
+ "step": 120500
21460
+ },
21461
+ {
21462
+ "epoch": 1.150055451542507,
21463
+ "grad_norm": 0.13879702985286713,
21464
+ "learning_rate": 0.0007870434642343984,
21465
+ "loss": 2.0783,
21466
+ "num_input_tokens_seen": 63192764288,
21467
+ "step": 120550
21468
+ },
21469
+ {
21470
+ "epoch": 1.1505324540586952,
21471
+ "grad_norm": 0.13164860010147095,
21472
+ "learning_rate": 0.000784742246584226,
21473
+ "loss": 2.081,
21474
+ "num_input_tokens_seen": 63218969504,
21475
+ "step": 120600
21476
+ },
21477
+ {
21478
+ "epoch": 1.1510094565748834,
21479
+ "grad_norm": 0.1406654268503189,
21480
+ "learning_rate": 0.0007824320675630089,
21481
+ "loss": 2.0704,
21482
+ "num_input_tokens_seen": 63245179680,
21483
+ "step": 120650
21484
+ },
21485
+ {
21486
+ "epoch": 1.1514864590910716,
21487
+ "grad_norm": 0.13722951710224152,
21488
+ "learning_rate": 0.0007801129998764014,
21489
+ "loss": 2.0693,
21490
+ "num_input_tokens_seen": 63271389024,
21491
+ "step": 120700
21492
+ },
21493
+ {
21494
+ "epoch": 1.15196346160726,
21495
+ "grad_norm": 0.15168820321559906,
21496
+ "learning_rate": 0.0007777851165098011,
21497
+ "loss": 2.0813,
21498
+ "num_input_tokens_seen": 63297594624,
21499
+ "step": 120750
21500
+ },
21501
+ {
21502
+ "epoch": 1.1524404641234482,
21503
+ "grad_norm": 0.13907547295093536,
21504
+ "learning_rate": 0.0007754484907260512,
21505
+ "loss": 2.0747,
21506
+ "num_input_tokens_seen": 63323809024,
21507
+ "step": 120800
21508
+ },
21509
+ {
21510
+ "epoch": 1.1529174666396365,
21511
+ "grad_norm": 0.13827022910118103,
21512
+ "learning_rate": 0.0007731031960631354,
21513
+ "loss": 2.079,
21514
+ "num_input_tokens_seen": 63350015808,
21515
+ "step": 120850
21516
+ },
21517
+ {
21518
+ "epoch": 1.1533944691558249,
21519
+ "grad_norm": 0.1326221376657486,
21520
+ "learning_rate": 0.0007707493063318629,
21521
+ "loss": 2.0856,
21522
+ "num_input_tokens_seen": 63376227968,
21523
+ "step": 120900
21524
+ },
21525
+ {
21526
+ "epoch": 1.153871471672013,
21527
+ "grad_norm": 0.13669894635677338,
21528
+ "learning_rate": 0.000768386895613546,
21529
+ "loss": 2.0691,
21530
+ "num_input_tokens_seen": 63402433504,
21531
+ "step": 120950
21532
+ },
21533
+ {
21534
+ "epoch": 1.1543484741882013,
21535
+ "grad_norm": 0.1403321623802185,
21536
+ "learning_rate": 0.0007660160382576683,
21537
+ "loss": 2.077,
21538
+ "num_input_tokens_seen": 63428647904,
21539
+ "step": 121000
21540
+ },
21541
+ {
21542
+ "epoch": 1.1543484741882013,
21543
+ "eval_loss": 1.9939944744110107,
21544
+ "eval_runtime": 82.7663,
21545
+ "eval_samples_per_second": 60.411,
21546
+ "eval_steps_per_second": 15.103,
21547
+ "num_input_tokens_seen": 63428647904,
21548
+ "step": 121000
21549
  }
21550
  ],
21551
  "logging_steps": 50,
21552
  "max_steps": 140000,
21553
+ "num_input_tokens_seen": 63428647904,
21554
  "num_train_epochs": 2,
21555
  "save_steps": 1000,
21556
  "stateful_callbacks": {
 
21565
  "attributes": {}
21566
  }
21567
  },
21568
+ "total_flos": 1.1225713740470231e+20,
21569
  "train_batch_size": 32,
21570
  "trial_name": null,
21571
  "trial_params": null