Training in progress, step 120, checkpoint
Browse files
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 83945296
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2ce1d3865491490218207bac75bccedb8c2e28ef95b0cf9a0f748b479a6adc57
|
| 3 |
size 83945296
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 43122580
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8287c39800d6ba02c8dfe28b1ae66f1a0eed7e1907cdd7f1aa84e95e9369848e
|
| 3 |
size 43122580
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:03991d31508c600c9181221b8bb07dd43258c023851238fbb98928b343012b1a
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e2409fb1c3fd1c2de2c63b5459c49a97abc9a50084af4337cc5090c00ef975a6
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
-
"epoch":
|
| 5 |
"eval_steps": 500,
|
| 6 |
-
"global_step":
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
@@ -637,6 +637,224 @@
|
|
| 637 |
"learning_rate": 1.5872342839067306e-05,
|
| 638 |
"loss": 2.7213,
|
| 639 |
"step": 90
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 640 |
}
|
| 641 |
],
|
| 642 |
"logging_steps": 1,
|
|
@@ -651,12 +869,12 @@
|
|
| 651 |
"should_evaluate": false,
|
| 652 |
"should_log": false,
|
| 653 |
"should_save": true,
|
| 654 |
-
"should_training_stop":
|
| 655 |
},
|
| 656 |
"attributes": {}
|
| 657 |
}
|
| 658 |
},
|
| 659 |
-
"total_flos":
|
| 660 |
"train_batch_size": 4,
|
| 661 |
"trial_name": null,
|
| 662 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
+
"epoch": 1.0020876826722338,
|
| 5 |
"eval_steps": 500,
|
| 6 |
+
"global_step": 120,
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
|
|
| 637 |
"learning_rate": 1.5872342839067306e-05,
|
| 638 |
"loss": 2.7213,
|
| 639 |
"step": 90
|
| 640 |
+
},
|
| 641 |
+
{
|
| 642 |
+
"epoch": 0.7599164926931107,
|
| 643 |
+
"grad_norm": 39.1318244934082,
|
| 644 |
+
"learning_rate": 1.4886944624647647e-05,
|
| 645 |
+
"loss": 2.5212,
|
| 646 |
+
"step": 91
|
| 647 |
+
},
|
| 648 |
+
{
|
| 649 |
+
"epoch": 0.7682672233820459,
|
| 650 |
+
"grad_norm": 37.528324127197266,
|
| 651 |
+
"learning_rate": 1.3927749088052217e-05,
|
| 652 |
+
"loss": 3.2661,
|
| 653 |
+
"step": 92
|
| 654 |
+
},
|
| 655 |
+
{
|
| 656 |
+
"epoch": 0.7766179540709812,
|
| 657 |
+
"grad_norm": 33.13786315917969,
|
| 658 |
+
"learning_rate": 1.2995472016937404e-05,
|
| 659 |
+
"loss": 4.3563,
|
| 660 |
+
"step": 93
|
| 661 |
+
},
|
| 662 |
+
{
|
| 663 |
+
"epoch": 0.7849686847599165,
|
| 664 |
+
"grad_norm": 28.756526947021484,
|
| 665 |
+
"learning_rate": 1.209080911139187e-05,
|
| 666 |
+
"loss": 3.6397,
|
| 667 |
+
"step": 94
|
| 668 |
+
},
|
| 669 |
+
{
|
| 670 |
+
"epoch": 0.7933194154488518,
|
| 671 |
+
"grad_norm": 25.469545364379883,
|
| 672 |
+
"learning_rate": 1.1214435464779006e-05,
|
| 673 |
+
"loss": 4.4051,
|
| 674 |
+
"step": 95
|
| 675 |
+
},
|
| 676 |
+
{
|
| 677 |
+
"epoch": 0.8016701461377871,
|
| 678 |
+
"grad_norm": 27.566421508789062,
|
| 679 |
+
"learning_rate": 1.0367005059957096e-05,
|
| 680 |
+
"loss": 5.1261,
|
| 681 |
+
"step": 96
|
| 682 |
+
},
|
| 683 |
+
{
|
| 684 |
+
"epoch": 0.8100208768267223,
|
| 685 |
+
"grad_norm": 18.103004455566406,
|
| 686 |
+
"learning_rate": 9.549150281252633e-06,
|
| 687 |
+
"loss": 3.8833,
|
| 688 |
+
"step": 97
|
| 689 |
+
},
|
| 690 |
+
{
|
| 691 |
+
"epoch": 0.8183716075156576,
|
| 692 |
+
"grad_norm": 13.448304176330566,
|
| 693 |
+
"learning_rate": 8.761481442551573e-06,
|
| 694 |
+
"loss": 5.2468,
|
| 695 |
+
"step": 98
|
| 696 |
+
},
|
| 697 |
+
{
|
| 698 |
+
"epoch": 0.826722338204593,
|
| 699 |
+
"grad_norm": 11.937213897705078,
|
| 700 |
+
"learning_rate": 8.004586331860175e-06,
|
| 701 |
+
"loss": 4.6798,
|
| 702 |
+
"step": 99
|
| 703 |
+
},
|
| 704 |
+
{
|
| 705 |
+
"epoch": 0.8350730688935282,
|
| 706 |
+
"grad_norm": 8.884134292602539,
|
| 707 |
+
"learning_rate": 7.2790297726755716e-06,
|
| 708 |
+
"loss": 4.9037,
|
| 709 |
+
"step": 100
|
| 710 |
+
},
|
| 711 |
+
{
|
| 712 |
+
"epoch": 0.8434237995824635,
|
| 713 |
+
"grad_norm": 9.988716125488281,
|
| 714 |
+
"learning_rate": 6.585353202493322e-06,
|
| 715 |
+
"loss": 5.0198,
|
| 716 |
+
"step": 101
|
| 717 |
+
},
|
| 718 |
+
{
|
| 719 |
+
"epoch": 0.8517745302713987,
|
| 720 |
+
"grad_norm": 9.69389820098877,
|
| 721 |
+
"learning_rate": 5.924074268766422e-06,
|
| 722 |
+
"loss": 3.5639,
|
| 723 |
+
"step": 102
|
| 724 |
+
},
|
| 725 |
+
{
|
| 726 |
+
"epoch": 0.860125260960334,
|
| 727 |
+
"grad_norm": 12.106159210205078,
|
| 728 |
+
"learning_rate": 5.295686442617443e-06,
|
| 729 |
+
"loss": 4.083,
|
| 730 |
+
"step": 103
|
| 731 |
+
},
|
| 732 |
+
{
|
| 733 |
+
"epoch": 0.8684759916492694,
|
| 734 |
+
"grad_norm": 10.521677017211914,
|
| 735 |
+
"learning_rate": 4.700658650591827e-06,
|
| 736 |
+
"loss": 4.8845,
|
| 737 |
+
"step": 104
|
| 738 |
+
},
|
| 739 |
+
{
|
| 740 |
+
"epoch": 0.8768267223382046,
|
| 741 |
+
"grad_norm": 12.082547187805176,
|
| 742 |
+
"learning_rate": 4.139434924727359e-06,
|
| 743 |
+
"loss": 4.4946,
|
| 744 |
+
"step": 105
|
| 745 |
+
},
|
| 746 |
+
{
|
| 747 |
+
"epoch": 0.8851774530271399,
|
| 748 |
+
"grad_norm": 10.821547508239746,
|
| 749 |
+
"learning_rate": 3.612434071200771e-06,
|
| 750 |
+
"loss": 4.7307,
|
| 751 |
+
"step": 106
|
| 752 |
+
},
|
| 753 |
+
{
|
| 754 |
+
"epoch": 0.8935281837160751,
|
| 755 |
+
"grad_norm": 10.070833206176758,
|
| 756 |
+
"learning_rate": 3.1200493577989875e-06,
|
| 757 |
+
"loss": 5.0714,
|
| 758 |
+
"step": 107
|
| 759 |
+
},
|
| 760 |
+
{
|
| 761 |
+
"epoch": 0.9018789144050104,
|
| 762 |
+
"grad_norm": 19.112159729003906,
|
| 763 |
+
"learning_rate": 2.662648220447811e-06,
|
| 764 |
+
"loss": 4.6327,
|
| 765 |
+
"step": 108
|
| 766 |
+
},
|
| 767 |
+
{
|
| 768 |
+
"epoch": 0.9102296450939458,
|
| 769 |
+
"grad_norm": 12.466245651245117,
|
| 770 |
+
"learning_rate": 2.240571989017598e-06,
|
| 771 |
+
"loss": 5.2114,
|
| 772 |
+
"step": 109
|
| 773 |
+
},
|
| 774 |
+
{
|
| 775 |
+
"epoch": 0.918580375782881,
|
| 776 |
+
"grad_norm": 17.981706619262695,
|
| 777 |
+
"learning_rate": 1.8541356326100433e-06,
|
| 778 |
+
"loss": 5.5056,
|
| 779 |
+
"step": 110
|
| 780 |
+
},
|
| 781 |
+
{
|
| 782 |
+
"epoch": 0.9269311064718163,
|
| 783 |
+
"grad_norm": 20.902307510375977,
|
| 784 |
+
"learning_rate": 1.5036275245164377e-06,
|
| 785 |
+
"loss": 4.974,
|
| 786 |
+
"step": 111
|
| 787 |
+
},
|
| 788 |
+
{
|
| 789 |
+
"epoch": 0.9352818371607515,
|
| 790 |
+
"grad_norm": 17.289682388305664,
|
| 791 |
+
"learning_rate": 1.1893092270227724e-06,
|
| 792 |
+
"loss": 4.9496,
|
| 793 |
+
"step": 112
|
| 794 |
+
},
|
| 795 |
+
{
|
| 796 |
+
"epoch": 0.9436325678496869,
|
| 797 |
+
"grad_norm": 17.364238739013672,
|
| 798 |
+
"learning_rate": 9.114152962220735e-07,
|
| 799 |
+
"loss": 6.3092,
|
| 800 |
+
"step": 113
|
| 801 |
+
},
|
| 802 |
+
{
|
| 803 |
+
"epoch": 0.9519832985386222,
|
| 804 |
+
"grad_norm": 30.94085121154785,
|
| 805 |
+
"learning_rate": 6.701531069799038e-07,
|
| 806 |
+
"loss": 5.5584,
|
| 807 |
+
"step": 114
|
| 808 |
+
},
|
| 809 |
+
{
|
| 810 |
+
"epoch": 0.9603340292275574,
|
| 811 |
+
"grad_norm": 27.87859535217285,
|
| 812 |
+
"learning_rate": 4.6570269818346224e-07,
|
| 813 |
+
"loss": 6.069,
|
| 814 |
+
"step": 115
|
| 815 |
+
},
|
| 816 |
+
{
|
| 817 |
+
"epoch": 0.9686847599164927,
|
| 818 |
+
"grad_norm": 30.613059997558594,
|
| 819 |
+
"learning_rate": 2.9821663838981993e-07,
|
| 820 |
+
"loss": 6.3244,
|
| 821 |
+
"step": 116
|
| 822 |
+
},
|
| 823 |
+
{
|
| 824 |
+
"epoch": 0.9770354906054279,
|
| 825 |
+
"grad_norm": 8.08796215057373,
|
| 826 |
+
"learning_rate": 1.6781991197352133e-07,
|
| 827 |
+
"loss": 2.1168,
|
| 828 |
+
"step": 117
|
| 829 |
+
},
|
| 830 |
+
{
|
| 831 |
+
"epoch": 0.9853862212943633,
|
| 832 |
+
"grad_norm": 7.885867595672607,
|
| 833 |
+
"learning_rate": 7.460982585860144e-08,
|
| 834 |
+
"loss": 4.3936,
|
| 835 |
+
"step": 118
|
| 836 |
+
},
|
| 837 |
+
{
|
| 838 |
+
"epoch": 0.9937369519832986,
|
| 839 |
+
"grad_norm": 13.48105525970459,
|
| 840 |
+
"learning_rate": 1.8655936904465875e-08,
|
| 841 |
+
"loss": 5.1137,
|
| 842 |
+
"step": 119
|
| 843 |
+
},
|
| 844 |
+
{
|
| 845 |
+
"epoch": 0.9937369519832986,
|
| 846 |
+
"eval_loss": 0.985297679901123,
|
| 847 |
+
"eval_runtime": 5.2963,
|
| 848 |
+
"eval_samples_per_second": 19.07,
|
| 849 |
+
"eval_steps_per_second": 4.909,
|
| 850 |
+
"step": 119
|
| 851 |
+
},
|
| 852 |
+
{
|
| 853 |
+
"epoch": 1.0020876826722338,
|
| 854 |
+
"grad_norm": 15.896007537841797,
|
| 855 |
+
"learning_rate": 0.0,
|
| 856 |
+
"loss": 4.0913,
|
| 857 |
+
"step": 120
|
| 858 |
}
|
| 859 |
],
|
| 860 |
"logging_steps": 1,
|
|
|
|
| 869 |
"should_evaluate": false,
|
| 870 |
"should_log": false,
|
| 871 |
"should_save": true,
|
| 872 |
+
"should_training_stop": true
|
| 873 |
},
|
| 874 |
"attributes": {}
|
| 875 |
}
|
| 876 |
},
|
| 877 |
+
"total_flos": 8.408442956729549e+16,
|
| 878 |
"train_batch_size": 4,
|
| 879 |
"trial_name": null,
|
| 880 |
"trial_params": null
|