mohammadmahdinouri commited on
Commit
f51a643
·
verified ·
1 Parent(s): 66576f7

Training in progress, step 45000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2fcd37382582cb8125ec6d2b4c1830cfd9571895d7c529b9e0a164d8e6840c5f
3
  size 304481530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c34146894dfe342922a0c8a606eac0350f76f11ea9e61c107c4fbf6ed4906e82
3
  size 304481530
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e21cff1a1a482dfd42e5de94ed9f704de413d72174366b200ada5de0ca1e11ea
3
  size 402029570
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:502f558a0369d8b367137ad2a3eafab0d0eba581e23c553e257aa247277dbe02
3
  size 402029570
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bcdfb03c2359167be102884353768ec17e50976fbf8f4e087f73ce920ca2083a
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5bd1df9dd287561b432a4ed1887fdcf8336e4c007fc54d3406e19e31c0bf33c
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ec4a5179b74ef8eb313eb14817a74f7fc913bdcaa29bebe58410f2cfcc1721c
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a04ed282bfc2791246883061552e93b53278d67ed9004819c40202da8064598
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f47d801e0e1e17041030e161c1b98962f17c60b9ac26c132a50dbfe0dbb10fd9
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06efddacd0797f2209b470dd13b726f0378cede9676bfe8789e9ef52e5513689
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:908ba09c9714e54813137013872e64f7f3d3ddeb4ee5dff69a0168885135e1ba
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f815d3a5b83aa01364285e2a8f42845f4c04a057e51ff2194a37de3386990687
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2235aa419a253926d81708d5bb132f65ffe83cce4aceb17112e87544e0f82b3d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a0f1ff729c197851c973f7c1a73abcd6f67c6109dc7b232bef28f6096173a7b
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.08579883099092775,
6
  "eval_steps": 500,
7
- "global_step": 44000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -15408,10 +15408,360 @@
15408
  "learning_rate": 0.0004858584078736993,
15409
  "loss": 17.2386,
15410
  "step": 44000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15411
  }
15412
  ],
15413
  "logging_steps": 20,
15414
- "max_steps": 1538481,
15415
  "num_input_tokens_seen": 0,
15416
  "num_train_epochs": 3,
15417
  "save_steps": 1000,
@@ -15427,7 +15777,7 @@
15427
  "attributes": {}
15428
  }
15429
  },
15430
- "total_flos": 3.2346940087260414e+19,
15431
  "train_batch_size": 48,
15432
  "trial_name": null,
15433
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.06665916133887148,
6
  "eval_steps": 500,
7
+ "global_step": 45000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
15408
  "learning_rate": 0.0004858584078736993,
15409
  "loss": 17.2386,
15410
  "step": 44000
15411
+ },
15412
+ {
15413
+ "epoch": 0.0652074729363805,
15414
+ "grad_norm": 7.21875,
15415
+ "learning_rate": 0.0004892531243702858,
15416
+ "loss": 18.4759,
15417
+ "step": 44020
15418
+ },
15419
+ {
15420
+ "epoch": 0.06523709923030889,
15421
+ "grad_norm": 7.5,
15422
+ "learning_rate": 0.0004892481854352782,
15423
+ "loss": 18.408,
15424
+ "step": 44040
15425
+ },
15426
+ {
15427
+ "epoch": 0.06526672552423728,
15428
+ "grad_norm": 7.90625,
15429
+ "learning_rate": 0.0004892432465002707,
15430
+ "loss": 18.2598,
15431
+ "step": 44060
15432
+ },
15433
+ {
15434
+ "epoch": 0.06529635181816566,
15435
+ "grad_norm": 7.625,
15436
+ "learning_rate": 0.0004892383075652631,
15437
+ "loss": 18.2414,
15438
+ "step": 44080
15439
+ },
15440
+ {
15441
+ "epoch": 0.06532597811209405,
15442
+ "grad_norm": 7.5,
15443
+ "learning_rate": 0.0004892333686302555,
15444
+ "loss": 18.2764,
15445
+ "step": 44100
15446
+ },
15447
+ {
15448
+ "epoch": 0.06535560440602244,
15449
+ "grad_norm": 6.59375,
15450
+ "learning_rate": 0.000489228429695248,
15451
+ "loss": 18.2161,
15452
+ "step": 44120
15453
+ },
15454
+ {
15455
+ "epoch": 0.06538523069995082,
15456
+ "grad_norm": 7.125,
15457
+ "learning_rate": 0.0004892234907602404,
15458
+ "loss": 18.2661,
15459
+ "step": 44140
15460
+ },
15461
+ {
15462
+ "epoch": 0.06541485699387921,
15463
+ "grad_norm": 7.0625,
15464
+ "learning_rate": 0.0004892185518252329,
15465
+ "loss": 18.2414,
15466
+ "step": 44160
15467
+ },
15468
+ {
15469
+ "epoch": 0.0654444832878076,
15470
+ "grad_norm": 7.34375,
15471
+ "learning_rate": 0.0004892136128902253,
15472
+ "loss": 18.1743,
15473
+ "step": 44180
15474
+ },
15475
+ {
15476
+ "epoch": 0.06547410958173598,
15477
+ "grad_norm": 7.34375,
15478
+ "learning_rate": 0.0004892086739552177,
15479
+ "loss": 18.1813,
15480
+ "step": 44200
15481
+ },
15482
+ {
15483
+ "epoch": 0.06550373587566437,
15484
+ "grad_norm": 7.03125,
15485
+ "learning_rate": 0.0004892037350202102,
15486
+ "loss": 18.239,
15487
+ "step": 44220
15488
+ },
15489
+ {
15490
+ "epoch": 0.06553336216959275,
15491
+ "grad_norm": 7.375,
15492
+ "learning_rate": 0.0004891987960852026,
15493
+ "loss": 18.084,
15494
+ "step": 44240
15495
+ },
15496
+ {
15497
+ "epoch": 0.06556298846352114,
15498
+ "grad_norm": 7.375,
15499
+ "learning_rate": 0.000489193857150195,
15500
+ "loss": 18.1419,
15501
+ "step": 44260
15502
+ },
15503
+ {
15504
+ "epoch": 0.06559261475744953,
15505
+ "grad_norm": 8.1875,
15506
+ "learning_rate": 0.0004891889182151874,
15507
+ "loss": 18.1111,
15508
+ "step": 44280
15509
+ },
15510
+ {
15511
+ "epoch": 0.06562224105137791,
15512
+ "grad_norm": 7.09375,
15513
+ "learning_rate": 0.0004891839792801799,
15514
+ "loss": 18.0775,
15515
+ "step": 44300
15516
+ },
15517
+ {
15518
+ "epoch": 0.0656518673453063,
15519
+ "grad_norm": 7.34375,
15520
+ "learning_rate": 0.0004891790403451723,
15521
+ "loss": 18.1751,
15522
+ "step": 44320
15523
+ },
15524
+ {
15525
+ "epoch": 0.0656814936392347,
15526
+ "grad_norm": 6.96875,
15527
+ "learning_rate": 0.0004891741014101648,
15528
+ "loss": 18.1515,
15529
+ "step": 44340
15530
+ },
15531
+ {
15532
+ "epoch": 0.06571111993316309,
15533
+ "grad_norm": 7.6875,
15534
+ "learning_rate": 0.0004891691624751572,
15535
+ "loss": 18.1534,
15536
+ "step": 44360
15537
+ },
15538
+ {
15539
+ "epoch": 0.06574074622709147,
15540
+ "grad_norm": 6.96875,
15541
+ "learning_rate": 0.0004891642235401497,
15542
+ "loss": 18.1406,
15543
+ "step": 44380
15544
+ },
15545
+ {
15546
+ "epoch": 0.06577037252101986,
15547
+ "grad_norm": 8.625,
15548
+ "learning_rate": 0.000489159284605142,
15549
+ "loss": 18.1022,
15550
+ "step": 44400
15551
+ },
15552
+ {
15553
+ "epoch": 0.06579999881494825,
15554
+ "grad_norm": 7.9375,
15555
+ "learning_rate": 0.0004891543456701344,
15556
+ "loss": 18.0375,
15557
+ "step": 44420
15558
+ },
15559
+ {
15560
+ "epoch": 0.06582962510887663,
15561
+ "grad_norm": 8.5625,
15562
+ "learning_rate": 0.0004891494067351269,
15563
+ "loss": 18.0538,
15564
+ "step": 44440
15565
+ },
15566
+ {
15567
+ "epoch": 0.06585925140280502,
15568
+ "grad_norm": 6.90625,
15569
+ "learning_rate": 0.0004891444678001193,
15570
+ "loss": 18.0487,
15571
+ "step": 44460
15572
+ },
15573
+ {
15574
+ "epoch": 0.0658888776967334,
15575
+ "grad_norm": 7.8125,
15576
+ "learning_rate": 0.0004891395288651117,
15577
+ "loss": 18.1347,
15578
+ "step": 44480
15579
+ },
15580
+ {
15581
+ "epoch": 0.06591850399066179,
15582
+ "grad_norm": 7.21875,
15583
+ "learning_rate": 0.0004891345899301041,
15584
+ "loss": 18.0459,
15585
+ "step": 44500
15586
+ },
15587
+ {
15588
+ "epoch": 0.06594813028459018,
15589
+ "grad_norm": 8.6875,
15590
+ "learning_rate": 0.0004891296509950966,
15591
+ "loss": 18.0135,
15592
+ "step": 44520
15593
+ },
15594
+ {
15595
+ "epoch": 0.06597775657851856,
15596
+ "grad_norm": 7.8125,
15597
+ "learning_rate": 0.000489124712060089,
15598
+ "loss": 18.0667,
15599
+ "step": 44540
15600
+ },
15601
+ {
15602
+ "epoch": 0.06600738287244695,
15603
+ "grad_norm": 7.34375,
15604
+ "learning_rate": 0.0004891197731250815,
15605
+ "loss": 18.0663,
15606
+ "step": 44560
15607
+ },
15608
+ {
15609
+ "epoch": 0.06603700916637534,
15610
+ "grad_norm": 7.5,
15611
+ "learning_rate": 0.0004891148341900739,
15612
+ "loss": 18.035,
15613
+ "step": 44580
15614
+ },
15615
+ {
15616
+ "epoch": 0.06606663546030372,
15617
+ "grad_norm": 7.28125,
15618
+ "learning_rate": 0.0004891098952550664,
15619
+ "loss": 18.0706,
15620
+ "step": 44600
15621
+ },
15622
+ {
15623
+ "epoch": 0.06609626175423211,
15624
+ "grad_norm": 6.6875,
15625
+ "learning_rate": 0.0004891049563200588,
15626
+ "loss": 18.0513,
15627
+ "step": 44620
15628
+ },
15629
+ {
15630
+ "epoch": 0.0661258880481605,
15631
+ "grad_norm": 7.78125,
15632
+ "learning_rate": 0.0004891000173850512,
15633
+ "loss": 18.0185,
15634
+ "step": 44640
15635
+ },
15636
+ {
15637
+ "epoch": 0.0661555143420889,
15638
+ "grad_norm": 6.90625,
15639
+ "learning_rate": 0.0004890950784500437,
15640
+ "loss": 17.9851,
15641
+ "step": 44660
15642
+ },
15643
+ {
15644
+ "epoch": 0.06618514063601728,
15645
+ "grad_norm": 6.65625,
15646
+ "learning_rate": 0.0004890901395150361,
15647
+ "loss": 18.0043,
15648
+ "step": 44680
15649
+ },
15650
+ {
15651
+ "epoch": 0.06621476692994567,
15652
+ "grad_norm": 7.28125,
15653
+ "learning_rate": 0.0004890852005800285,
15654
+ "loss": 17.9712,
15655
+ "step": 44700
15656
+ },
15657
+ {
15658
+ "epoch": 0.06624439322387406,
15659
+ "grad_norm": 7.34375,
15660
+ "learning_rate": 0.000489080261645021,
15661
+ "loss": 18.0501,
15662
+ "step": 44720
15663
+ },
15664
+ {
15665
+ "epoch": 0.06627401951780244,
15666
+ "grad_norm": 7.59375,
15667
+ "learning_rate": 0.0004890753227100134,
15668
+ "loss": 18.029,
15669
+ "step": 44740
15670
+ },
15671
+ {
15672
+ "epoch": 0.06630364581173083,
15673
+ "grad_norm": 6.5625,
15674
+ "learning_rate": 0.0004890703837750059,
15675
+ "loss": 18.0173,
15676
+ "step": 44760
15677
+ },
15678
+ {
15679
+ "epoch": 0.06633327210565922,
15680
+ "grad_norm": 7.71875,
15681
+ "learning_rate": 0.0004890654448399983,
15682
+ "loss": 17.9833,
15683
+ "step": 44780
15684
+ },
15685
+ {
15686
+ "epoch": 0.0663628983995876,
15687
+ "grad_norm": 7.125,
15688
+ "learning_rate": 0.0004890605059049907,
15689
+ "loss": 18.0,
15690
+ "step": 44800
15691
+ },
15692
+ {
15693
+ "epoch": 0.06639252469351599,
15694
+ "grad_norm": 7.15625,
15695
+ "learning_rate": 0.0004890555669699832,
15696
+ "loss": 17.9968,
15697
+ "step": 44820
15698
+ },
15699
+ {
15700
+ "epoch": 0.06642215098744438,
15701
+ "grad_norm": 6.96875,
15702
+ "learning_rate": 0.0004890506280349756,
15703
+ "loss": 18.009,
15704
+ "step": 44840
15705
+ },
15706
+ {
15707
+ "epoch": 0.06645177728137276,
15708
+ "grad_norm": 6.75,
15709
+ "learning_rate": 0.000489045689099968,
15710
+ "loss": 17.9378,
15711
+ "step": 44860
15712
+ },
15713
+ {
15714
+ "epoch": 0.06648140357530115,
15715
+ "grad_norm": 8.625,
15716
+ "learning_rate": 0.0004890407501649604,
15717
+ "loss": 17.9775,
15718
+ "step": 44880
15719
+ },
15720
+ {
15721
+ "epoch": 0.06651102986922953,
15722
+ "grad_norm": 7.28125,
15723
+ "learning_rate": 0.0004890358112299529,
15724
+ "loss": 17.9533,
15725
+ "step": 44900
15726
+ },
15727
+ {
15728
+ "epoch": 0.06654065616315792,
15729
+ "grad_norm": 6.75,
15730
+ "learning_rate": 0.0004890308722949453,
15731
+ "loss": 18.0079,
15732
+ "step": 44920
15733
+ },
15734
+ {
15735
+ "epoch": 0.06657028245708631,
15736
+ "grad_norm": 6.8125,
15737
+ "learning_rate": 0.0004890259333599378,
15738
+ "loss": 17.9616,
15739
+ "step": 44940
15740
+ },
15741
+ {
15742
+ "epoch": 0.0665999087510147,
15743
+ "grad_norm": 7.28125,
15744
+ "learning_rate": 0.0004890209944249302,
15745
+ "loss": 17.9617,
15746
+ "step": 44960
15747
+ },
15748
+ {
15749
+ "epoch": 0.0666295350449431,
15750
+ "grad_norm": 7.25,
15751
+ "learning_rate": 0.0004890160554899227,
15752
+ "loss": 17.9462,
15753
+ "step": 44980
15754
+ },
15755
+ {
15756
+ "epoch": 0.06665916133887148,
15757
+ "grad_norm": 7.21875,
15758
+ "learning_rate": 0.000489011116554915,
15759
+ "loss": 17.8648,
15760
+ "step": 45000
15761
  }
15762
  ],
15763
  "logging_steps": 20,
15764
+ "max_steps": 2025228,
15765
  "num_input_tokens_seen": 0,
15766
  "num_train_epochs": 3,
15767
  "save_steps": 1000,
 
15777
  "attributes": {}
15778
  }
15779
  },
15780
+ "total_flos": 3.3082299293261365e+19,
15781
  "train_batch_size": 48,
15782
  "trial_name": null,
15783
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1027262c4bd8a4d0ff155140133e02a994449886838662007d5d33ca8bb602f9
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ae6fe7865a6680f0788decd4b8035db04ae39b0ae4392f872489469c00e7d58
3
  size 5432