FormlessAI commited on
Commit
f0fbf33
·
verified ·
1 Parent(s): 9236b6d

Training in progress, epoch 0, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e0d4d53d47cf148bf090bf85a00e8d8ce95def5a92d928f35b159ba03df5b14b
3
  size 1037269336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f12ce7779d96a024c4dd4f58d076b05867f31b520868639145f1b25c63bf1906
3
  size 1037269336
last-checkpoint/global_step3600/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b616c059747eaccce047c3112e296ed72258ba5d5394b0108a0212546122845
3
+ size 781993445
last-checkpoint/global_step3600/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3a0874b23896609b2b966f82c616126aa647394a112baa83b98e5d2062c88c9
3
+ size 781993509
last-checkpoint/global_step3600/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48fa9a4ed997374aade9c8737caed2aecf614bb3b3706b3d5f2e66eaf40351ff
3
+ size 781993509
last-checkpoint/global_step3600/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67535a7ee65e672147ce20bec6495dce79103690259ea40e50c21fc33d4a4953
3
+ size 781993509
last-checkpoint/global_step3600/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a25fb5ddf9c1af2d5420efdae26d4b9821cf64a076b842481731c6ebb07d1b3d
3
+ size 2610290277
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step3400
 
1
+ global_step3600
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a0dae475a2e74a7eba2183dafd43f7f364f1783f2c428d16e9ebda71fcc129b
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4560423a884b4db453d7d1b748155a1cd58f131c7e355290b17af66a745e3b19
3
  size 15429
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c92dedf94907603b200c04a410bb8278e46b6a33fdfe1169ce038f2d7b3407de
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8985cb46ba2723280ee973b265fe66bd4b26b2de0dd0dbbe501d8869c79c0a4c
3
  size 15429
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d828406bc3d74852e5371798f20aafaca25c8bf291eef03f86935f60052b842
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5ad915f857347045218dcd0e5ba757cb7b726c9623b0b49d51280ce11e9c427
3
  size 15429
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a55ca83ba43de275fbe01b00f1c525156344f01bb28f4ae23e7fcc92b0451a6
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a2a0ea073b71609cb1e29e7290795e9416839e4a2e0d6ed6b40688c05f20303
3
  size 15429
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb5c50f2f5f52986bf42683d33595b37ca91b5dd7eae281283c2ab87207f371d
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab73268050baa090e15858a3a718e94ea470376fdd081db33931bc775fd1484a
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "best_global_step": null,
3
- "best_metric": 1.9858521223068237,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.4942578863206861,
6
  "eval_steps": 50,
7
- "global_step": 3400,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5312,6 +5312,318 @@
5312
  "eval_samples_per_second": 174.212,
5313
  "eval_steps_per_second": 10.925,
5314
  "step": 3400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5315
  }
5316
  ],
5317
  "logging_steps": 5,
@@ -5340,7 +5652,7 @@
5340
  "attributes": {}
5341
  }
5342
  },
5343
- "total_flos": 8.857243019724718e+17,
5344
  "train_batch_size": 4,
5345
  "trial_name": null,
5346
  "trial_params": null
 
1
  {
2
  "best_global_step": null,
3
+ "best_metric": 1.9708884954452515,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.5233318796336677,
6
  "eval_steps": 50,
7
+ "global_step": 3600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5312
  "eval_samples_per_second": 174.212,
5313
  "eval_steps_per_second": 10.925,
5314
  "step": 3400
5315
+ },
5316
+ {
5317
+ "epoch": 0.4949847361535107,
5318
+ "grad_norm": 2.4753811359405518,
5319
+ "learning_rate": 7.509245947645659e-05,
5320
+ "loss": 1.9676,
5321
+ "step": 3405
5322
+ },
5323
+ {
5324
+ "epoch": 0.49571158598633525,
5325
+ "grad_norm": 2.552577495574951,
5326
+ "learning_rate": 7.502353796314939e-05,
5327
+ "loss": 2.0703,
5328
+ "step": 3410
5329
+ },
5330
+ {
5331
+ "epoch": 0.49643843581915975,
5332
+ "grad_norm": 2.2608258724212646,
5333
+ "learning_rate": 7.495455370075547e-05,
5334
+ "loss": 2.0442,
5335
+ "step": 3415
5336
+ },
5337
+ {
5338
+ "epoch": 0.4971652856519843,
5339
+ "grad_norm": 2.7708072662353516,
5340
+ "learning_rate": 7.488550686294263e-05,
5341
+ "loss": 2.0061,
5342
+ "step": 3420
5343
+ },
5344
+ {
5345
+ "epoch": 0.4978921354848088,
5346
+ "grad_norm": 2.265629768371582,
5347
+ "learning_rate": 7.481639762353621e-05,
5348
+ "loss": 2.0098,
5349
+ "step": 3425
5350
+ },
5351
+ {
5352
+ "epoch": 0.4986189853176334,
5353
+ "grad_norm": 2.200986623764038,
5354
+ "learning_rate": 7.474722615651865e-05,
5355
+ "loss": 2.0711,
5356
+ "step": 3430
5357
+ },
5358
+ {
5359
+ "epoch": 0.49934583515045794,
5360
+ "grad_norm": 2.6096930503845215,
5361
+ "learning_rate": 7.4677992636029e-05,
5362
+ "loss": 2.0267,
5363
+ "step": 3435
5364
+ },
5365
+ {
5366
+ "epoch": 0.5000726849832825,
5367
+ "grad_norm": 2.679610013961792,
5368
+ "learning_rate": 7.460869723636259e-05,
5369
+ "loss": 2.0392,
5370
+ "step": 3440
5371
+ },
5372
+ {
5373
+ "epoch": 0.500799534816107,
5374
+ "grad_norm": 2.4646713733673096,
5375
+ "learning_rate": 7.45393401319705e-05,
5376
+ "loss": 1.9999,
5377
+ "step": 3445
5378
+ },
5379
+ {
5380
+ "epoch": 0.5015263846489315,
5381
+ "grad_norm": 2.341169834136963,
5382
+ "learning_rate": 7.446992149745914e-05,
5383
+ "loss": 2.0061,
5384
+ "step": 3450
5385
+ },
5386
+ {
5387
+ "epoch": 0.5015263846489315,
5388
+ "eval_loss": 1.9797232151031494,
5389
+ "eval_runtime": 21.8402,
5390
+ "eval_samples_per_second": 151.144,
5391
+ "eval_steps_per_second": 9.478,
5392
+ "step": 3450
5393
+ },
5394
+ {
5395
+ "epoch": 0.5022532344817561,
5396
+ "grad_norm": 2.2362568378448486,
5397
+ "learning_rate": 7.440044150758987e-05,
5398
+ "loss": 1.8974,
5399
+ "step": 3455
5400
+ },
5401
+ {
5402
+ "epoch": 0.5029800843145806,
5403
+ "grad_norm": 2.497943878173828,
5404
+ "learning_rate": 7.433090033727847e-05,
5405
+ "loss": 2.178,
5406
+ "step": 3460
5407
+ },
5408
+ {
5409
+ "epoch": 0.5037069341474052,
5410
+ "grad_norm": 2.9573802947998047,
5411
+ "learning_rate": 7.426129816159475e-05,
5412
+ "loss": 2.0595,
5413
+ "step": 3465
5414
+ },
5415
+ {
5416
+ "epoch": 0.5044337839802296,
5417
+ "grad_norm": 2.28165602684021,
5418
+ "learning_rate": 7.419163515576209e-05,
5419
+ "loss": 2.2754,
5420
+ "step": 3470
5421
+ },
5422
+ {
5423
+ "epoch": 0.5051606338130542,
5424
+ "grad_norm": 2.49424147605896,
5425
+ "learning_rate": 7.412191149515707e-05,
5426
+ "loss": 2.1558,
5427
+ "step": 3475
5428
+ },
5429
+ {
5430
+ "epoch": 0.5058874836458788,
5431
+ "grad_norm": 3.0092170238494873,
5432
+ "learning_rate": 7.405212735530888e-05,
5433
+ "loss": 2.1079,
5434
+ "step": 3480
5435
+ },
5436
+ {
5437
+ "epoch": 0.5066143334787033,
5438
+ "grad_norm": 2.4972879886627197,
5439
+ "learning_rate": 7.398228291189901e-05,
5440
+ "loss": 2.181,
5441
+ "step": 3485
5442
+ },
5443
+ {
5444
+ "epoch": 0.5073411833115279,
5445
+ "grad_norm": 2.6351096630096436,
5446
+ "learning_rate": 7.391237834076077e-05,
5447
+ "loss": 1.9635,
5448
+ "step": 3490
5449
+ },
5450
+ {
5451
+ "epoch": 0.5080680331443523,
5452
+ "grad_norm": 2.5686097145080566,
5453
+ "learning_rate": 7.384241381787888e-05,
5454
+ "loss": 2.1353,
5455
+ "step": 3495
5456
+ },
5457
+ {
5458
+ "epoch": 0.5087948829771769,
5459
+ "grad_norm": 2.4493703842163086,
5460
+ "learning_rate": 7.377238951938886e-05,
5461
+ "loss": 2.1474,
5462
+ "step": 3500
5463
+ },
5464
+ {
5465
+ "epoch": 0.5087948829771769,
5466
+ "eval_loss": 1.9865467548370361,
5467
+ "eval_runtime": 19.1097,
5468
+ "eval_samples_per_second": 172.74,
5469
+ "eval_steps_per_second": 10.832,
5470
+ "step": 3500
5471
+ },
5472
+ {
5473
+ "epoch": 0.5095217328100015,
5474
+ "grad_norm": 2.4284589290618896,
5475
+ "learning_rate": 7.370230562157685e-05,
5476
+ "loss": 2.0678,
5477
+ "step": 3505
5478
+ },
5479
+ {
5480
+ "epoch": 0.510248582642826,
5481
+ "grad_norm": 2.635737657546997,
5482
+ "learning_rate": 7.363216230087898e-05,
5483
+ "loss": 2.2497,
5484
+ "step": 3510
5485
+ },
5486
+ {
5487
+ "epoch": 0.5109754324756506,
5488
+ "grad_norm": 2.3156023025512695,
5489
+ "learning_rate": 7.356195973388096e-05,
5490
+ "loss": 2.1084,
5491
+ "step": 3515
5492
+ },
5493
+ {
5494
+ "epoch": 0.511702282308475,
5495
+ "grad_norm": 2.362034559249878,
5496
+ "learning_rate": 7.349169809731767e-05,
5497
+ "loss": 1.9663,
5498
+ "step": 3520
5499
+ },
5500
+ {
5501
+ "epoch": 0.5124291321412996,
5502
+ "grad_norm": 2.198225975036621,
5503
+ "learning_rate": 7.342137756807273e-05,
5504
+ "loss": 1.9753,
5505
+ "step": 3525
5506
+ },
5507
+ {
5508
+ "epoch": 0.5131559819741242,
5509
+ "grad_norm": 2.3297581672668457,
5510
+ "learning_rate": 7.335099832317792e-05,
5511
+ "loss": 1.9516,
5512
+ "step": 3530
5513
+ },
5514
+ {
5515
+ "epoch": 0.5138828318069487,
5516
+ "grad_norm": 2.580559492111206,
5517
+ "learning_rate": 7.328056053981296e-05,
5518
+ "loss": 2.1125,
5519
+ "step": 3535
5520
+ },
5521
+ {
5522
+ "epoch": 0.5146096816397733,
5523
+ "grad_norm": 2.454136371612549,
5524
+ "learning_rate": 7.321006439530488e-05,
5525
+ "loss": 2.1955,
5526
+ "step": 3540
5527
+ },
5528
+ {
5529
+ "epoch": 0.5153365314725977,
5530
+ "grad_norm": 2.720200300216675,
5531
+ "learning_rate": 7.313951006712762e-05,
5532
+ "loss": 2.1802,
5533
+ "step": 3545
5534
+ },
5535
+ {
5536
+ "epoch": 0.5160633813054223,
5537
+ "grad_norm": 2.2702293395996094,
5538
+ "learning_rate": 7.306889773290163e-05,
5539
+ "loss": 2.0275,
5540
+ "step": 3550
5541
+ },
5542
+ {
5543
+ "epoch": 0.5160633813054223,
5544
+ "eval_loss": 1.9806544780731201,
5545
+ "eval_runtime": 19.2538,
5546
+ "eval_samples_per_second": 171.447,
5547
+ "eval_steps_per_second": 10.751,
5548
+ "step": 3550
5549
+ },
5550
+ {
5551
+ "epoch": 0.5167902311382468,
5552
+ "grad_norm": 2.6502344608306885,
5553
+ "learning_rate": 7.299822757039339e-05,
5554
+ "loss": 2.2931,
5555
+ "step": 3555
5556
+ },
5557
+ {
5558
+ "epoch": 0.5175170809710714,
5559
+ "grad_norm": 2.4326069355010986,
5560
+ "learning_rate": 7.292749975751491e-05,
5561
+ "loss": 2.0597,
5562
+ "step": 3560
5563
+ },
5564
+ {
5565
+ "epoch": 0.518243930803896,
5566
+ "grad_norm": 2.45497465133667,
5567
+ "learning_rate": 7.285671447232342e-05,
5568
+ "loss": 2.1446,
5569
+ "step": 3565
5570
+ },
5571
+ {
5572
+ "epoch": 0.5189707806367204,
5573
+ "grad_norm": 2.320857048034668,
5574
+ "learning_rate": 7.278587189302076e-05,
5575
+ "loss": 2.1279,
5576
+ "step": 3570
5577
+ },
5578
+ {
5579
+ "epoch": 0.519697630469545,
5580
+ "grad_norm": 2.6278252601623535,
5581
+ "learning_rate": 7.271497219795305e-05,
5582
+ "loss": 1.9936,
5583
+ "step": 3575
5584
+ },
5585
+ {
5586
+ "epoch": 0.5204244803023695,
5587
+ "grad_norm": 2.3981995582580566,
5588
+ "learning_rate": 7.264401556561019e-05,
5589
+ "loss": 1.9534,
5590
+ "step": 3580
5591
+ },
5592
+ {
5593
+ "epoch": 0.5211513301351941,
5594
+ "grad_norm": 2.486588716506958,
5595
+ "learning_rate": 7.257300217462541e-05,
5596
+ "loss": 2.291,
5597
+ "step": 3585
5598
+ },
5599
+ {
5600
+ "epoch": 0.5218781799680187,
5601
+ "grad_norm": 2.3635659217834473,
5602
+ "learning_rate": 7.250193220377486e-05,
5603
+ "loss": 1.9516,
5604
+ "step": 3590
5605
+ },
5606
+ {
5607
+ "epoch": 0.5226050298008431,
5608
+ "grad_norm": 2.548090934753418,
5609
+ "learning_rate": 7.243080583197707e-05,
5610
+ "loss": 2.0224,
5611
+ "step": 3595
5612
+ },
5613
+ {
5614
+ "epoch": 0.5233318796336677,
5615
+ "grad_norm": 2.575667142868042,
5616
+ "learning_rate": 7.235962323829262e-05,
5617
+ "loss": 2.3508,
5618
+ "step": 3600
5619
+ },
5620
+ {
5621
+ "epoch": 0.5233318796336677,
5622
+ "eval_loss": 1.9708884954452515,
5623
+ "eval_runtime": 18.8558,
5624
+ "eval_samples_per_second": 175.066,
5625
+ "eval_steps_per_second": 10.978,
5626
+ "step": 3600
5627
  }
5628
  ],
5629
  "logging_steps": 5,
 
5652
  "attributes": {}
5653
  }
5654
  },
5655
+ "total_flos": 9.384889709651558e+17,
5656
  "train_batch_size": 4,
5657
  "trial_name": null,
5658
  "trial_params": null