robertou2 committed on
Commit
cb34667
·
verified ·
1 Parent(s): 69bcd4e

Upload folder using huggingface_hub

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:170e9283396f794ac39c141ef58fc732a915723bcc48acda06109764aede853c
3
  size 201361312
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ba743cbcbe7a17a13ffee64e044e449254882634e848aa631f63e6778810b27
3
  size 201361312
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aba8d5f3fb425d691ad00a11ff612f2c4ce2ef2f2350b1aa78ac024098d151a6
3
  size 402868986
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa8c590b229a780debdb448bc28cb8f79b28f2ec2c6ea5636a4abf950ae5a038
3
  size 402868986
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3df287feaf25c6bbc3e39d1e8402382f635590ca96adbe728944eb6f0edd1fc9
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78412adf2dda42daa646069b544a18df9b06cb455b0068bb5473d031abd28e97
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:50d23b4f208a9403528cc4590d75da0ba9842779b9cd25a1b5978ffbe9bcceb1
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45e12526c8172a948234d8cb869935e517c484d36da5eb6ac9a7382e7d268eff
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.6163371205329895,
3
- "best_model_checkpoint": "//outputs/task7_microsoft/Phi-3.5-mini-instruct/checkpoint-200",
4
- "epoch": 22.235294117647058,
5
  "eval_steps": 500,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -471,12 +471,332 @@
471
  "eval_samples_per_second": 4.46,
472
  "eval_steps_per_second": 0.595,
473
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
  }
475
  ],
476
  "logging_steps": 5,
477
- "max_steps": 200,
478
  "num_input_tokens_seen": 0,
479
- "num_train_epochs": 25,
480
  "save_steps": 500,
481
  "stateful_callbacks": {
482
  "TrainerControl": {
@@ -490,7 +810,7 @@
490
  "attributes": {}
491
  }
492
  },
493
- "total_flos": 1.0471069689549005e+17,
494
  "train_batch_size": 2,
495
  "trial_name": null,
496
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.5424160957336426,
3
+ "best_model_checkpoint": "//outputs/task7_microsoft/Phi-3.5-mini-instruct/checkpoint-400",
4
+ "epoch": 8.0,
5
  "eval_steps": 500,
6
+ "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
471
  "eval_samples_per_second": 4.46,
472
  "eval_steps_per_second": 0.595,
473
  "step": 200
474
+ },
475
+ {
476
+ "epoch": 3.586666666666667,
477
+ "eval_loss": 0.5931960940361023,
478
+ "eval_runtime": 28.5506,
479
+ "eval_samples_per_second": 3.503,
480
+ "eval_steps_per_second": 0.455,
481
+ "step": 201
482
+ },
483
+ {
484
+ "epoch": 4.1,
485
+ "grad_norm": 0.8143700957298279,
486
+ "learning_rate": 5.206624871244066e-06,
487
+ "loss": 0.9672,
488
+ "step": 205
489
+ },
490
+ {
491
+ "epoch": 4.2,
492
+ "grad_norm": 0.670274019241333,
493
+ "learning_rate": 5e-06,
494
+ "loss": 0.9171,
495
+ "step": 210
496
+ },
497
+ {
498
+ "epoch": 4.3,
499
+ "grad_norm": 0.5900228023529053,
500
+ "learning_rate": 4.793375128755934e-06,
501
+ "loss": 0.8865,
502
+ "step": 215
503
+ },
504
+ {
505
+ "epoch": 4.4,
506
+ "grad_norm": 0.5981155633926392,
507
+ "learning_rate": 4.587103272638339e-06,
508
+ "loss": 1.1775,
509
+ "step": 220
510
+ },
511
+ {
512
+ "epoch": 4.5,
513
+ "grad_norm": 0.5991724729537964,
514
+ "learning_rate": 4.381536843653262e-06,
515
+ "loss": 0.7489,
516
+ "step": 225
517
+ },
518
+ {
519
+ "epoch": 4.6,
520
+ "grad_norm": 0.5450884103775024,
521
+ "learning_rate": 4.17702704859633e-06,
522
+ "loss": 0.8612,
523
+ "step": 230
524
+ },
525
+ {
526
+ "epoch": 4.7,
527
+ "grad_norm": 0.444416344165802,
528
+ "learning_rate": 3.973923289021829e-06,
529
+ "loss": 0.7293,
530
+ "step": 235
531
+ },
532
+ {
533
+ "epoch": 4.8,
534
+ "grad_norm": 0.3834201395511627,
535
+ "learning_rate": 3.7725725642960047e-06,
536
+ "loss": 0.7699,
537
+ "step": 240
538
+ },
539
+ {
540
+ "epoch": 4.9,
541
+ "grad_norm": 0.3441762924194336,
542
+ "learning_rate": 3.573318878754475e-06,
543
+ "loss": 0.8972,
544
+ "step": 245
545
+ },
546
+ {
547
+ "epoch": 5.0,
548
+ "grad_norm": 0.5351847410202026,
549
+ "learning_rate": 3.3765026539765832e-06,
550
+ "loss": 0.6602,
551
+ "step": 250
552
+ },
553
+ {
554
+ "epoch": 5.0,
555
+ "eval_loss": 0.5578957200050354,
556
+ "eval_runtime": 52.5326,
557
+ "eval_samples_per_second": 3.807,
558
+ "eval_steps_per_second": 0.476,
559
+ "step": 250
560
+ },
561
+ {
562
+ "epoch": 5.1,
563
+ "grad_norm": 0.37455469369888306,
564
+ "learning_rate": 3.1824601471808504e-06,
565
+ "loss": 0.884,
566
+ "step": 255
567
+ },
568
+ {
569
+ "epoch": 5.2,
570
+ "grad_norm": 0.6285215020179749,
571
+ "learning_rate": 2.991522876735154e-06,
572
+ "loss": 0.8042,
573
+ "step": 260
574
+ },
575
+ {
576
+ "epoch": 5.3,
577
+ "grad_norm": 0.37903887033462524,
578
+ "learning_rate": 2.804017055763149e-06,
579
+ "loss": 0.6865,
580
+ "step": 265
581
+ },
582
+ {
583
+ "epoch": 5.4,
584
+ "grad_norm": 0.4468790292739868,
585
+ "learning_rate": 2.6202630348146323e-06,
586
+ "loss": 0.9571,
587
+ "step": 270
588
+ },
589
+ {
590
+ "epoch": 5.5,
591
+ "grad_norm": 2.321368932723999,
592
+ "learning_rate": 2.4405747545519966e-06,
593
+ "loss": 0.7722,
594
+ "step": 275
595
+ },
596
+ {
597
+ "epoch": 5.6,
598
+ "grad_norm": 0.3462996482849121,
599
+ "learning_rate": 2.265259209387867e-06,
600
+ "loss": 0.6575,
601
+ "step": 280
602
+ },
603
+ {
604
+ "epoch": 5.7,
605
+ "grad_norm": 0.7634517550468445,
606
+ "learning_rate": 2.094615922990309e-06,
607
+ "loss": 0.7036,
608
+ "step": 285
609
+ },
610
+ {
611
+ "epoch": 5.8,
612
+ "grad_norm": 0.33972227573394775,
613
+ "learning_rate": 1.928936436551661e-06,
614
+ "loss": 0.6193,
615
+ "step": 290
616
+ },
617
+ {
618
+ "epoch": 5.9,
619
+ "grad_norm": 0.863368570804596,
620
+ "learning_rate": 1.7685038106952952e-06,
621
+ "loss": 0.7429,
622
+ "step": 295
623
+ },
624
+ {
625
+ "epoch": 6.0,
626
+ "grad_norm": 0.8421957492828369,
627
+ "learning_rate": 1.6135921418712959e-06,
628
+ "loss": 0.6177,
629
+ "step": 300
630
+ },
631
+ {
632
+ "epoch": 6.0,
633
+ "eval_loss": 0.5471388697624207,
634
+ "eval_runtime": 52.1971,
635
+ "eval_samples_per_second": 3.832,
636
+ "eval_steps_per_second": 0.479,
637
+ "step": 300
638
+ },
639
+ {
640
+ "epoch": 6.1,
641
+ "grad_norm": 0.42387768626213074,
642
+ "learning_rate": 1.4644660940672628e-06,
643
+ "loss": 0.7107,
644
+ "step": 305
645
+ },
646
+ {
647
+ "epoch": 6.2,
648
+ "grad_norm": 0.40212640166282654,
649
+ "learning_rate": 1.321380446634342e-06,
650
+ "loss": 0.6465,
651
+ "step": 310
652
+ },
653
+ {
654
+ "epoch": 6.3,
655
+ "grad_norm": 0.38275906443595886,
656
+ "learning_rate": 1.1845796590009684e-06,
657
+ "loss": 0.7838,
658
+ "step": 315
659
+ },
660
+ {
661
+ "epoch": 6.4,
662
+ "grad_norm": 0.517331063747406,
663
+ "learning_rate": 1.0542974530180327e-06,
664
+ "loss": 0.6743,
665
+ "step": 320
666
+ },
667
+ {
668
+ "epoch": 6.5,
669
+ "grad_norm": 0.4819343388080597,
670
+ "learning_rate": 9.307564136490255e-07,
671
+ "loss": 0.6544,
672
+ "step": 325
673
+ },
674
+ {
675
+ "epoch": 6.6,
676
+ "grad_norm": 0.5918112397193909,
677
+ "learning_rate": 8.141676086873574e-07,
678
+ "loss": 0.6178,
679
+ "step": 330
680
+ },
681
+ {
682
+ "epoch": 6.7,
683
+ "grad_norm": 0.3847924768924713,
684
+ "learning_rate": 7.047302281505735e-07,
685
+ "loss": 0.5631,
686
+ "step": 335
687
+ },
688
+ {
689
+ "epoch": 6.8,
690
+ "grad_norm": 0.43630239367485046,
691
+ "learning_rate": 6.026312439675553e-07,
692
+ "loss": 0.5709,
693
+ "step": 340
694
+ },
695
+ {
696
+ "epoch": 6.9,
697
+ "grad_norm": 0.6350282430648804,
698
+ "learning_rate": 5.080450905401057e-07,
699
+ "loss": 0.7065,
700
+ "step": 345
701
+ },
702
+ {
703
+ "epoch": 7.0,
704
+ "grad_norm": 0.5881220102310181,
705
+ "learning_rate": 4.211333667247125e-07,
706
+ "loss": 0.6102,
707
+ "step": 350
708
+ },
709
+ {
710
+ "epoch": 7.0,
711
+ "eval_loss": 0.5426855683326721,
712
+ "eval_runtime": 52.2072,
713
+ "eval_samples_per_second": 3.831,
714
+ "eval_steps_per_second": 0.479,
715
+ "step": 350
716
+ },
717
+ {
718
+ "epoch": 7.1,
719
+ "grad_norm": 0.5317939519882202,
720
+ "learning_rate": 3.420445597436056e-07,
721
+ "loss": 0.6632,
722
+ "step": 355
723
+ },
724
+ {
725
+ "epoch": 7.2,
726
+ "grad_norm": 0.5702535510063171,
727
+ "learning_rate": 2.7091379149682683e-07,
728
+ "loss": 0.5992,
729
+ "step": 360
730
+ },
731
+ {
732
+ "epoch": 7.3,
733
+ "grad_norm": 0.6872391104698181,
734
+ "learning_rate": 2.0786258770873647e-07,
735
+ "loss": 0.6422,
736
+ "step": 365
737
+ },
738
+ {
739
+ "epoch": 7.4,
740
+ "grad_norm": 0.32829490303993225,
741
+ "learning_rate": 1.5299867030334815e-07,
742
+ "loss": 0.6811,
743
+ "step": 370
744
+ },
745
+ {
746
+ "epoch": 7.5,
747
+ "grad_norm": 0.5375828742980957,
748
+ "learning_rate": 1.0641577336322761e-07,
749
+ "loss": 0.8423,
750
+ "step": 375
751
+ },
752
+ {
753
+ "epoch": 7.6,
754
+ "grad_norm": 0.6306584477424622,
755
+ "learning_rate": 6.819348298638839e-08,
756
+ "loss": 0.5899,
757
+ "step": 380
758
+ },
759
+ {
760
+ "epoch": 7.7,
761
+ "grad_norm": 0.44418570399284363,
762
+ "learning_rate": 3.839710131477492e-08,
763
+ "loss": 0.6571,
764
+ "step": 385
765
+ },
766
+ {
767
+ "epoch": 7.8,
768
+ "grad_norm": 0.49700650572776794,
769
+ "learning_rate": 1.7077534966650767e-08,
770
+ "loss": 0.6561,
771
+ "step": 390
772
+ },
773
+ {
774
+ "epoch": 7.9,
775
+ "grad_norm": 0.3311610519886017,
776
+ "learning_rate": 4.2712080634949024e-09,
777
+ "loss": 0.6226,
778
+ "step": 395
779
+ },
780
+ {
781
+ "epoch": 8.0,
782
+ "grad_norm": 1.5899903774261475,
783
+ "learning_rate": 0.0,
784
+ "loss": 0.6762,
785
+ "step": 400
786
+ },
787
+ {
788
+ "epoch": 8.0,
789
+ "eval_loss": 0.5424160957336426,
790
+ "eval_runtime": 52.193,
791
+ "eval_samples_per_second": 3.832,
792
+ "eval_steps_per_second": 0.479,
793
+ "step": 400
794
  }
795
  ],
796
  "logging_steps": 5,
797
+ "max_steps": 400,
798
  "num_input_tokens_seen": 0,
799
+ "num_train_epochs": 8,
800
  "save_steps": 500,
801
  "stateful_callbacks": {
802
  "TrainerControl": {
 
810
  "attributes": {}
811
  }
812
  },
813
+ "total_flos": 1.4231605134807245e+17,
814
  "train_batch_size": 2,
815
  "trial_name": null,
816
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:354b637532320af3e4fc7a75a7a30ab3076b3ef28de912201c92125d861c2822
3
  size 5624
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d7ef1ca84158a115fb2ab949b3f781c814c5ef428f591fc8d6d01108daabb83
3
  size 5624