NBAmine committed on
Commit
3e7237e
·
verified ·
1 Parent(s): 7ea4c77

Training in progress, epoch 2, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "v_proj",
33
- "o_proj",
34
- "k_proj",
35
  "down_proj",
36
- "gate_proj",
37
  "q_proj",
38
- "up_proj"
 
 
 
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
 
32
  "down_proj",
 
33
  "q_proj",
34
+ "o_proj",
35
+ "v_proj",
36
+ "gate_proj",
37
+ "up_proj",
38
+ "k_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b28a611f18c5bf6bd4e94b537636210f4683b5a33960138bf4b7f8759dfcb59e
3
  size 228140600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df1c1aa2916d5972f5efeb6284e8eab0c9c72f3782a534e72ebed006d4a326dc
3
  size 228140600
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b0a7a51782095db67faf17d4424456c08cb55127408d12014dc3c304ffbdab7
3
- size 116484839
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84a3341db1ea0703a3a5bfb4c84d9c6f9c629d7ec814a41d47f66338b4b4316a
3
+ size 117931203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2b0b09bbaedfdbe2893d036820ab6e355fea8a9aab8a443615445767baabde29
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c823b36aa64ec6d5ba470435413c8fa628bdc36879db73fd6bcc786691658d3
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee67534ce5c31fc7fdd40446bc6096b050048bea81431627ee7eb7a4e0420fce
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7db9931cd2bdb0cce107e4058673881f0e4939f11f21f05dabe6ed2ca0118fd7
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba3d9ccd5719a6149375c0d8ad46aab7eeed3ae74f7933879cf60a287f920385
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53f409af08acb24ba2f85422d6d830e93fdc97a01268b4582a53eec3cbfeb20a
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0,
6
  "eval_steps": 500,
7
- "global_step": 438,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -450,6 +450,458 @@
450
  "eval_samples_per_second": 1.56,
451
  "eval_steps_per_second": 0.391,
452
  "step": 438
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
453
  }
454
  ],
455
  "logging_steps": 10,
@@ -469,7 +921,7 @@
469
  "attributes": {}
470
  }
471
  },
472
- "total_flos": 1.8963131243019264e+17,
473
  "train_batch_size": 1,
474
  "trial_name": null,
475
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
  "eval_steps": 500,
7
+ "global_step": 876,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
450
  "eval_samples_per_second": 1.56,
451
  "eval_steps_per_second": 0.391,
452
  "step": 438
453
+ },
454
+ {
455
+ "entropy": 0.19485942274332047,
456
+ "epoch": 1.0045714285714287,
457
+ "grad_norm": 0.5989819169044495,
458
+ "learning_rate": 8.000000000000001e-06,
459
+ "loss": 0.1729,
460
+ "mean_token_accuracy": 0.9562485031783581,
461
+ "num_tokens": 46756.0,
462
+ "step": 440
463
+ },
464
+ {
465
+ "entropy": 0.15740223932079972,
466
+ "epoch": 1.0274285714285714,
467
+ "grad_norm": 0.9629825949668884,
468
+ "learning_rate": 7.95433789954338e-06,
469
+ "loss": 0.1453,
470
+ "mean_token_accuracy": 0.9640702843666077,
471
+ "num_tokens": 162230.0,
472
+ "step": 450
473
+ },
474
+ {
475
+ "entropy": 0.18924359129741788,
476
+ "epoch": 1.0502857142857143,
477
+ "grad_norm": 0.8555030226707458,
478
+ "learning_rate": 7.908675799086758e-06,
479
+ "loss": 0.1647,
480
+ "mean_token_accuracy": 0.9583326201885939,
481
+ "num_tokens": 231685.0,
482
+ "step": 460
483
+ },
484
+ {
485
+ "entropy": 0.22046699812635778,
486
+ "epoch": 1.0731428571428572,
487
+ "grad_norm": 0.6105676293373108,
488
+ "learning_rate": 7.863013698630137e-06,
489
+ "loss": 0.1864,
490
+ "mean_token_accuracy": 0.9536379788070917,
491
+ "num_tokens": 276927.0,
492
+ "step": 470
493
+ },
494
+ {
495
+ "entropy": 0.26470216070301833,
496
+ "epoch": 1.096,
497
+ "grad_norm": 0.6477329134941101,
498
+ "learning_rate": 7.817351598173517e-06,
499
+ "loss": 0.2354,
500
+ "mean_token_accuracy": 0.9424127731472254,
501
+ "num_tokens": 309026.0,
502
+ "step": 480
503
+ },
504
+ {
505
+ "entropy": 0.25112292610574516,
506
+ "epoch": 1.1188571428571428,
507
+ "grad_norm": 0.8889337778091431,
508
+ "learning_rate": 7.771689497716896e-06,
509
+ "loss": 0.222,
510
+ "mean_token_accuracy": 0.9455349139869214,
511
+ "num_tokens": 375120.0,
512
+ "step": 490
513
+ },
514
+ {
515
+ "entropy": 0.15172108153346925,
516
+ "epoch": 1.1417142857142857,
517
+ "grad_norm": 1.6094820499420166,
518
+ "learning_rate": 7.726027397260276e-06,
519
+ "loss": 0.1358,
520
+ "mean_token_accuracy": 0.9661709513515234,
521
+ "num_tokens": 486745.0,
522
+ "step": 500
523
+ },
524
+ {
525
+ "entropy": 0.167777626728639,
526
+ "epoch": 1.1645714285714286,
527
+ "grad_norm": 0.7636669278144836,
528
+ "learning_rate": 7.680365296803653e-06,
529
+ "loss": 0.1456,
530
+ "mean_token_accuracy": 0.964617732539773,
531
+ "num_tokens": 548794.0,
532
+ "step": 510
533
+ },
534
+ {
535
+ "entropy": 0.20727068707346916,
536
+ "epoch": 1.1874285714285715,
537
+ "grad_norm": 0.6218438148498535,
538
+ "learning_rate": 7.634703196347033e-06,
539
+ "loss": 0.1779,
540
+ "mean_token_accuracy": 0.9553128611296415,
541
+ "num_tokens": 590048.0,
542
+ "step": 520
543
+ },
544
+ {
545
+ "entropy": 0.24928416800685227,
546
+ "epoch": 1.2102857142857144,
547
+ "grad_norm": 0.5786827802658081,
548
+ "learning_rate": 7.589041095890411e-06,
549
+ "loss": 0.2181,
550
+ "mean_token_accuracy": 0.945609737932682,
551
+ "num_tokens": 620670.0,
552
+ "step": 530
553
+ },
554
+ {
555
+ "entropy": 0.23164914632216096,
556
+ "epoch": 1.233142857142857,
557
+ "grad_norm": 0.9021991491317749,
558
+ "learning_rate": 7.543378995433791e-06,
559
+ "loss": 0.2029,
560
+ "mean_token_accuracy": 0.9481291055679322,
561
+ "num_tokens": 678029.0,
562
+ "step": 540
563
+ },
564
+ {
565
+ "entropy": 0.11451007889118045,
566
+ "epoch": 1.256,
567
+ "grad_norm": 1.143864631652832,
568
+ "learning_rate": 7.497716894977169e-06,
569
+ "loss": 0.105,
570
+ "mean_token_accuracy": 0.9750772431492806,
571
+ "num_tokens": 789478.0,
572
+ "step": 550
573
+ },
574
+ {
575
+ "entropy": 0.14542806874960662,
576
+ "epoch": 1.278857142857143,
577
+ "grad_norm": 0.6672413349151611,
578
+ "learning_rate": 7.452054794520549e-06,
579
+ "loss": 0.1256,
580
+ "mean_token_accuracy": 0.9681380245834589,
581
+ "num_tokens": 853276.0,
582
+ "step": 560
583
+ },
584
+ {
585
+ "entropy": 0.19598664692603052,
586
+ "epoch": 1.3017142857142856,
587
+ "grad_norm": 0.5779910683631897,
588
+ "learning_rate": 7.406392694063927e-06,
589
+ "loss": 0.1742,
590
+ "mean_token_accuracy": 0.9562454361468553,
591
+ "num_tokens": 894797.0,
592
+ "step": 570
593
+ },
594
+ {
595
+ "entropy": 0.23272629571147263,
596
+ "epoch": 1.3245714285714285,
597
+ "grad_norm": 0.4893546998500824,
598
+ "learning_rate": 7.360730593607307e-06,
599
+ "loss": 0.2026,
600
+ "mean_token_accuracy": 0.9472691085189581,
601
+ "num_tokens": 925575.0,
602
+ "step": 580
603
+ },
604
+ {
605
+ "entropy": 0.22576562578324227,
606
+ "epoch": 1.3474285714285714,
607
+ "grad_norm": 0.47898435592651367,
608
+ "learning_rate": 7.315068493150685e-06,
609
+ "loss": 0.1976,
610
+ "mean_token_accuracy": 0.9479547172784806,
611
+ "num_tokens": 983597.0,
612
+ "step": 590
613
+ },
614
+ {
615
+ "entropy": 0.11407975524198263,
616
+ "epoch": 1.3702857142857143,
617
+ "grad_norm": 1.4250750541687012,
618
+ "learning_rate": 7.269406392694065e-06,
619
+ "loss": 0.1095,
620
+ "mean_token_accuracy": 0.9737830355763435,
621
+ "num_tokens": 1090322.0,
622
+ "step": 600
623
+ },
624
+ {
625
+ "entropy": 0.1459290421102196,
626
+ "epoch": 1.3931428571428572,
627
+ "grad_norm": 0.6979950666427612,
628
+ "learning_rate": 7.223744292237444e-06,
629
+ "loss": 0.1302,
630
+ "mean_token_accuracy": 0.9667541589587927,
631
+ "num_tokens": 1152923.0,
632
+ "step": 610
633
+ },
634
+ {
635
+ "entropy": 0.18885702546685934,
636
+ "epoch": 1.416,
637
+ "grad_norm": 0.5068536996841431,
638
+ "learning_rate": 7.178082191780823e-06,
639
+ "loss": 0.1636,
640
+ "mean_token_accuracy": 0.959310057759285,
641
+ "num_tokens": 1193481.0,
642
+ "step": 620
643
+ },
644
+ {
645
+ "entropy": 0.23108526985161007,
646
+ "epoch": 1.4388571428571428,
647
+ "grad_norm": 0.8436884880065918,
648
+ "learning_rate": 7.132420091324202e-06,
649
+ "loss": 0.2072,
650
+ "mean_token_accuracy": 0.9471893258392811,
651
+ "num_tokens": 1222847.0,
652
+ "step": 630
653
+ },
654
+ {
655
+ "entropy": 0.21841485593467952,
656
+ "epoch": 1.4617142857142857,
657
+ "grad_norm": 0.8229106068611145,
658
+ "learning_rate": 7.086757990867581e-06,
659
+ "loss": 0.1863,
660
+ "mean_token_accuracy": 0.9520192969590425,
661
+ "num_tokens": 1290269.0,
662
+ "step": 640
663
+ },
664
+ {
665
+ "entropy": 0.10626114641781896,
666
+ "epoch": 1.4845714285714287,
667
+ "grad_norm": 1.5995644330978394,
668
+ "learning_rate": 7.0410958904109596e-06,
669
+ "loss": 0.0994,
670
+ "mean_token_accuracy": 0.976220278069377,
671
+ "num_tokens": 1402634.0,
672
+ "step": 650
673
+ },
674
+ {
675
+ "entropy": 0.13410865939222277,
676
+ "epoch": 1.5074285714285716,
677
+ "grad_norm": 0.4029393196105957,
678
+ "learning_rate": 6.995433789954339e-06,
679
+ "loss": 0.1213,
680
+ "mean_token_accuracy": 0.9693065240979195,
681
+ "num_tokens": 1466175.0,
682
+ "step": 660
683
+ },
684
+ {
685
+ "entropy": 0.17447674251161516,
686
+ "epoch": 1.5302857142857142,
687
+ "grad_norm": 0.418222576379776,
688
+ "learning_rate": 6.9497716894977175e-06,
689
+ "loss": 0.1528,
690
+ "mean_token_accuracy": 0.9606836523860693,
691
+ "num_tokens": 1507803.0,
692
+ "step": 670
693
+ },
694
+ {
695
+ "entropy": 0.21088513871654868,
696
+ "epoch": 1.5531428571428572,
697
+ "grad_norm": 0.5335624814033508,
698
+ "learning_rate": 6.904109589041097e-06,
699
+ "loss": 0.1855,
700
+ "mean_token_accuracy": 0.9512796506285668,
701
+ "num_tokens": 1538690.0,
702
+ "step": 680
703
+ },
704
+ {
705
+ "entropy": 0.21284959067124873,
706
+ "epoch": 1.576,
707
+ "grad_norm": 0.7531531453132629,
708
+ "learning_rate": 6.858447488584475e-06,
709
+ "loss": 0.1834,
710
+ "mean_token_accuracy": 0.9524108562618494,
711
+ "num_tokens": 1599601.0,
712
+ "step": 690
713
+ },
714
+ {
715
+ "entropy": 0.08681527464650571,
716
+ "epoch": 1.5988571428571428,
717
+ "grad_norm": 0.6462493538856506,
718
+ "learning_rate": 6.812785388127855e-06,
719
+ "loss": 0.078,
720
+ "mean_token_accuracy": 0.980168628692627,
721
+ "num_tokens": 1713685.0,
722
+ "step": 700
723
+ },
724
+ {
725
+ "entropy": 0.1219312352128327,
726
+ "epoch": 1.6217142857142857,
727
+ "grad_norm": 0.40015217661857605,
728
+ "learning_rate": 6.767123287671233e-06,
729
+ "loss": 0.1132,
730
+ "mean_token_accuracy": 0.9700204558670521,
731
+ "num_tokens": 1776814.0,
732
+ "step": 710
733
+ },
734
+ {
735
+ "entropy": 0.16716388445347546,
736
+ "epoch": 1.6445714285714286,
737
+ "grad_norm": 0.5019240975379944,
738
+ "learning_rate": 6.721461187214613e-06,
739
+ "loss": 0.1529,
740
+ "mean_token_accuracy": 0.9590244695544243,
741
+ "num_tokens": 1818772.0,
742
+ "step": 720
743
+ },
744
+ {
745
+ "entropy": 0.20658068330958484,
746
+ "epoch": 1.6674285714285715,
747
+ "grad_norm": 0.48935461044311523,
748
+ "learning_rate": 6.675799086757991e-06,
749
+ "loss": 0.186,
750
+ "mean_token_accuracy": 0.9494243700057268,
751
+ "num_tokens": 1848794.0,
752
+ "step": 730
753
+ },
754
+ {
755
+ "entropy": 0.21933096905704588,
756
+ "epoch": 1.6902857142857144,
757
+ "grad_norm": 1.0664595365524292,
758
+ "learning_rate": 6.630136986301371e-06,
759
+ "loss": 0.195,
760
+ "mean_token_accuracy": 0.9486182644963265,
761
+ "num_tokens": 1912529.0,
762
+ "step": 740
763
+ },
764
+ {
765
+ "entropy": 0.09219505588989704,
766
+ "epoch": 1.713142857142857,
767
+ "grad_norm": 0.9666043519973755,
768
+ "learning_rate": 6.584474885844749e-06,
769
+ "loss": 0.0842,
770
+ "mean_token_accuracy": 0.9790573690086604,
771
+ "num_tokens": 2023382.0,
772
+ "step": 750
773
+ },
774
+ {
775
+ "entropy": 0.12722355276346206,
776
+ "epoch": 1.736,
777
+ "grad_norm": 0.325158953666687,
778
+ "learning_rate": 6.538812785388129e-06,
779
+ "loss": 0.1151,
780
+ "mean_token_accuracy": 0.9699444197118282,
781
+ "num_tokens": 2088409.0,
782
+ "step": 760
783
+ },
784
+ {
785
+ "entropy": 0.16778573733754457,
786
+ "epoch": 1.758857142857143,
787
+ "grad_norm": 0.4155607521533966,
788
+ "learning_rate": 6.493150684931508e-06,
789
+ "loss": 0.1528,
790
+ "mean_token_accuracy": 0.9595557443797589,
791
+ "num_tokens": 2130039.0,
792
+ "step": 770
793
+ },
794
+ {
795
+ "entropy": 0.2043486479204148,
796
+ "epoch": 1.7817142857142856,
797
+ "grad_norm": 0.4696311354637146,
798
+ "learning_rate": 6.447488584474887e-06,
799
+ "loss": 0.1915,
800
+ "mean_token_accuracy": 0.9497469838708639,
801
+ "num_tokens": 2160612.0,
802
+ "step": 780
803
+ },
804
+ {
805
+ "entropy": 0.2095549178076908,
806
+ "epoch": 1.8045714285714287,
807
+ "grad_norm": 0.49995991587638855,
808
+ "learning_rate": 6.401826484018266e-06,
809
+ "loss": 0.1917,
810
+ "mean_token_accuracy": 0.951378521323204,
811
+ "num_tokens": 2219878.0,
812
+ "step": 790
813
+ },
814
+ {
815
+ "entropy": 0.09752151321154087,
816
+ "epoch": 1.8274285714285714,
817
+ "grad_norm": 0.7513600587844849,
818
+ "learning_rate": 6.356164383561645e-06,
819
+ "loss": 0.0877,
820
+ "mean_token_accuracy": 0.9781307391822338,
821
+ "num_tokens": 2328503.0,
822
+ "step": 800
823
+ },
824
+ {
825
+ "entropy": 0.1315026845317334,
826
+ "epoch": 1.8502857142857143,
827
+ "grad_norm": 0.4865649938583374,
828
+ "learning_rate": 6.3105022831050235e-06,
829
+ "loss": 0.1191,
830
+ "mean_token_accuracy": 0.9693873535841704,
831
+ "num_tokens": 2393270.0,
832
+ "step": 810
833
+ },
834
+ {
835
+ "entropy": 0.15703398073092104,
836
+ "epoch": 1.8731428571428572,
837
+ "grad_norm": 0.46761906147003174,
838
+ "learning_rate": 6.264840182648403e-06,
839
+ "loss": 0.1418,
840
+ "mean_token_accuracy": 0.9627573467791081,
841
+ "num_tokens": 2436335.0,
842
+ "step": 820
843
+ },
844
+ {
845
+ "entropy": 0.19582971301861107,
846
+ "epoch": 1.896,
847
+ "grad_norm": 0.5706267356872559,
848
+ "learning_rate": 6.219178082191781e-06,
849
+ "loss": 0.1771,
850
+ "mean_token_accuracy": 0.9532948363572359,
851
+ "num_tokens": 2467211.0,
852
+ "step": 830
853
+ },
854
+ {
855
+ "entropy": 0.19951584844384343,
856
+ "epoch": 1.9188571428571428,
857
+ "grad_norm": 0.1575632095336914,
858
+ "learning_rate": 6.173515981735161e-06,
859
+ "loss": 0.1733,
860
+ "mean_token_accuracy": 0.9532737210392952,
861
+ "num_tokens": 2527699.0,
862
+ "step": 840
863
+ },
864
+ {
865
+ "entropy": 0.09860867839306593,
866
+ "epoch": 1.9417142857142857,
867
+ "grad_norm": 0.37800732254981995,
868
+ "learning_rate": 6.127853881278539e-06,
869
+ "loss": 0.0922,
870
+ "mean_token_accuracy": 0.9767971355468035,
871
+ "num_tokens": 2624761.0,
872
+ "step": 850
873
+ },
874
+ {
875
+ "entropy": 0.14002426667138934,
876
+ "epoch": 1.9645714285714284,
877
+ "grad_norm": 0.8500357866287231,
878
+ "learning_rate": 6.082191780821919e-06,
879
+ "loss": 0.1284,
880
+ "mean_token_accuracy": 0.9653151527047157,
881
+ "num_tokens": 2674387.0,
882
+ "step": 860
883
+ },
884
+ {
885
+ "entropy": 0.20728676998987794,
886
+ "epoch": 1.9874285714285715,
887
+ "grad_norm": 0.6311262845993042,
888
+ "learning_rate": 6.036529680365297e-06,
889
+ "loss": 0.1892,
890
+ "mean_token_accuracy": 0.9492694169282914,
891
+ "num_tokens": 2705462.0,
892
+ "step": 870
893
+ },
894
+ {
895
+ "epoch": 2.0,
896
+ "eval_accuracy": 0.001506220691455712,
897
+ "eval_entropy": 0.38320175457645106,
898
+ "eval_loss": 1.0289124250411987,
899
+ "eval_mean_token_accuracy": 0.8326533791181203,
900
+ "eval_num_tokens": 2716693.0,
901
+ "eval_runtime": 734.0738,
902
+ "eval_samples_per_second": 1.409,
903
+ "eval_steps_per_second": 0.353,
904
+ "step": 876
905
  }
906
  ],
907
  "logging_steps": 10,
 
921
  "attributes": {}
922
  }
923
  },
924
+ "total_flos": 3.792626248603853e+17,
925
  "train_batch_size": 1,
926
  "trial_name": null,
927
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58259b31d8c49fec76c9044575d2a0dc11fa8080720bce2e7820a1dfbfb8174f
3
  size 6353
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd36270ff585b2d668c6df7d7ada51207c25255f0fc66fa207d06a8a67152786
3
  size 6353