Add new SentenceTransformer model
Browse files
- README.md +43 -107
- model.safetensors +1 -1
README.md
CHANGED
|
@@ -166,7 +166,7 @@ print(query_embeddings.shape, document_embeddings.shape)
|
|
| 166 |
# Get the similarity scores for the embeddings
|
| 167 |
similarities = model.similarity(query_embeddings, document_embeddings)
|
| 168 |
print(similarities)
|
| 169 |
-
# tensor([[ 0.
|
| 170 |
```
|
| 171 |
|
| 172 |
<!--
|
|
@@ -726,13 +726,15 @@ You can finetune this model on your own dataset.
|
|
| 726 |
### Training Hyperparameters
|
| 727 |
#### Non-Default Hyperparameters
|
| 728 |
|
| 729 |
-
- `per_device_train_batch_size`:
|
| 730 |
-
- `learning_rate`:
|
| 731 |
- `weight_decay`: 1e-06
|
| 732 |
-
- `num_train_epochs`:
|
| 733 |
- `warmup_ratio`: 0.1
|
| 734 |
- `fp16`: True
|
| 735 |
- `gradient_checkpointing`: True
|
|
|
|
|
|
|
| 736 |
|
| 737 |
#### All Hyperparameters
|
| 738 |
<details><summary>Click to expand</summary>
|
|
@@ -741,20 +743,20 @@ You can finetune this model on your own dataset.
|
|
| 741 |
- `do_predict`: False
|
| 742 |
- `eval_strategy`: no
|
| 743 |
- `prediction_loss_only`: True
|
| 744 |
-
- `per_device_train_batch_size`:
|
| 745 |
- `per_device_eval_batch_size`: 8
|
| 746 |
- `per_gpu_train_batch_size`: None
|
| 747 |
- `per_gpu_eval_batch_size`: None
|
| 748 |
- `gradient_accumulation_steps`: 1
|
| 749 |
- `eval_accumulation_steps`: None
|
| 750 |
- `torch_empty_cache_steps`: None
|
| 751 |
-
- `learning_rate`:
|
| 752 |
- `weight_decay`: 1e-06
|
| 753 |
- `adam_beta1`: 0.9
|
| 754 |
- `adam_beta2`: 0.999
|
| 755 |
- `adam_epsilon`: 1e-08
|
| 756 |
- `max_grad_norm`: 1.0
|
| 757 |
-
- `num_train_epochs`:
|
| 758 |
- `max_steps`: -1
|
| 759 |
- `lr_scheduler_type`: linear
|
| 760 |
- `lr_scheduler_kwargs`: {}
|
|
@@ -838,8 +840,8 @@ You can finetune this model on your own dataset.
|
|
| 838 |
- `torchdynamo`: None
|
| 839 |
- `ray_scope`: last
|
| 840 |
- `ddp_timeout`: 1800
|
| 841 |
-
- `torch_compile`:
|
| 842 |
-
- `torch_compile_backend`:
|
| 843 |
- `torch_compile_mode`: None
|
| 844 |
- `include_tokens_per_second`: False
|
| 845 |
- `include_num_input_tokens_seen`: no
|
|
@@ -862,104 +864,38 @@ You can finetune this model on your own dataset.
|
|
| 862 |
### Training Logs
|
| 863 |
| Epoch | Step | Training Loss |
|
| 864 |
|:------:|:-----:|:-------------:|
|
| 865 |
-
| 0.
|
| 866 |
-
| 0.
|
| 867 |
-
| 0.
|
| 868 |
-
| 0.
|
| 869 |
-
| 0.
|
| 870 |
-
| 0.
|
| 871 |
-
| 0.
|
| 872 |
-
| 0.
|
| 873 |
-
| 0.
|
| 874 |
-
| 0.
|
| 875 |
-
| 0.
|
| 876 |
-
| 0.
|
| 877 |
-
| 0.
|
| 878 |
-
| 0.
|
| 879 |
-
| 0.
|
| 880 |
-
| 0.
|
| 881 |
-
| 0.
|
| 882 |
-
| 0.
|
| 883 |
-
| 0.
|
| 884 |
-
| 0.
|
| 885 |
-
| 0.
|
| 886 |
-
| 0.
|
| 887 |
-
| 0.
|
| 888 |
-
| 0.
|
| 889 |
-
| 0.
|
| 890 |
-
| 0.
|
| 891 |
-
| 0.
|
| 892 |
-
| 0.
|
| 893 |
-
| 0.
|
| 894 |
-
| 0.
|
| 895 |
-
| 0.
|
| 896 |
-
| 0.
|
| 897 |
-
| 0.6670 | 16500 | 1.743 |
|
| 898 |
-
| 0.6872 | 17000 | 1.5893 |
|
| 899 |
-
| 0.7074 | 17500 | 1.9079 |
|
| 900 |
-
| 0.7276 | 18000 | 1.5885 |
|
| 901 |
-
| 0.7478 | 18500 | 1.9128 |
|
| 902 |
-
| 0.7680 | 19000 | 1.6654 |
|
| 903 |
-
| 0.7882 | 19500 | 1.7099 |
|
| 904 |
-
| 0.8084 | 20000 | 1.4688 |
|
| 905 |
-
| 0.8287 | 20500 | 1.3844 |
|
| 906 |
-
| 0.8489 | 21000 | 1.7908 |
|
| 907 |
-
| 0.8691 | 21500 | 1.7075 |
|
| 908 |
-
| 0.8893 | 22000 | 1.8114 |
|
| 909 |
-
| 0.9095 | 22500 | 1.5198 |
|
| 910 |
-
| 0.9297 | 23000 | 1.8605 |
|
| 911 |
-
| 0.9499 | 23500 | 1.6604 |
|
| 912 |
-
| 0.9701 | 24000 | 1.5891 |
|
| 913 |
-
| 0.9903 | 24500 | 1.5906 |
|
| 914 |
-
| 1.0106 | 25000 | 1.5027 |
|
| 915 |
-
| 1.0308 | 25500 | 1.7599 |
|
| 916 |
-
| 1.0510 | 26000 | 1.4124 |
|
| 917 |
-
| 1.0712 | 26500 | 1.5636 |
|
| 918 |
-
| 1.0914 | 27000 | 1.6126 |
|
| 919 |
-
| 1.1116 | 27500 | 1.4625 |
|
| 920 |
-
| 1.1318 | 28000 | 1.4467 |
|
| 921 |
-
| 1.1520 | 28500 | 1.6898 |
|
| 922 |
-
| 1.1722 | 29000 | 1.5088 |
|
| 923 |
-
| 1.1924 | 29500 | 1.5158 |
|
| 924 |
-
| 1.2127 | 30000 | 1.5266 |
|
| 925 |
-
| 1.2329 | 30500 | 1.465 |
|
| 926 |
-
| 1.2531 | 31000 | 1.5687 |
|
| 927 |
-
| 1.2733 | 31500 | 1.4397 |
|
| 928 |
-
| 1.2935 | 32000 | 1.7929 |
|
| 929 |
-
| 1.3137 | 32500 | 1.5893 |
|
| 930 |
-
| 1.3339 | 33000 | 1.4727 |
|
| 931 |
-
| 1.3541 | 33500 | 1.6007 |
|
| 932 |
-
| 1.3743 | 34000 | 1.2833 |
|
| 933 |
-
| 1.3946 | 34500 | 1.5541 |
|
| 934 |
-
| 1.4148 | 35000 | 1.3354 |
|
| 935 |
-
| 1.4350 | 35500 | 1.4509 |
|
| 936 |
-
| 1.4552 | 36000 | 1.6065 |
|
| 937 |
-
| 1.4754 | 36500 | 1.6393 |
|
| 938 |
-
| 1.4956 | 37000 | 1.3914 |
|
| 939 |
-
| 1.5158 | 37500 | 1.3584 |
|
| 940 |
-
| 1.5360 | 38000 | 1.5504 |
|
| 941 |
-
| 1.5562 | 38500 | 1.2169 |
|
| 942 |
-
| 1.5765 | 39000 | 1.4081 |
|
| 943 |
-
| 1.5967 | 39500 | 1.5506 |
|
| 944 |
-
| 1.6169 | 40000 | 1.473 |
|
| 945 |
-
| 1.6371 | 40500 | 1.2517 |
|
| 946 |
-
| 1.6573 | 41000 | 1.7644 |
|
| 947 |
-
| 1.6775 | 41500 | 1.4237 |
|
| 948 |
-
| 1.6977 | 42000 | 1.295 |
|
| 949 |
-
| 1.7179 | 42500 | 1.4951 |
|
| 950 |
-
| 1.7381 | 43000 | 1.4389 |
|
| 951 |
-
| 1.7584 | 43500 | 1.5742 |
|
| 952 |
-
| 1.7786 | 44000 | 1.4843 |
|
| 953 |
-
| 1.7988 | 44500 | 1.4806 |
|
| 954 |
-
| 1.8190 | 45000 | 1.3674 |
|
| 955 |
-
| 1.8392 | 45500 | 1.329 |
|
| 956 |
-
| 1.8594 | 46000 | 1.7644 |
|
| 957 |
-
| 1.8796 | 46500 | 1.36 |
|
| 958 |
-
| 1.8998 | 47000 | 1.2003 |
|
| 959 |
-
| 1.9200 | 47500 | 1.233 |
|
| 960 |
-
| 1.9403 | 48000 | 1.5147 |
|
| 961 |
-
| 1.9605 | 48500 | 1.3838 |
|
| 962 |
-
| 1.9807 | 49000 | 1.4928 |
|
| 963 |
|
| 964 |
|
| 965 |
### Framework Versions
|
|
|
|
| 166 |
# Get the similarity scores for the embeddings
|
| 167 |
similarities = model.similarity(query_embeddings, document_embeddings)
|
| 168 |
print(similarities)
|
| 169 |
+
# tensor([[ 0.5738, 0.0240, -0.0787]])
|
| 170 |
```
|
| 171 |
|
| 172 |
<!--
|
|
|
|
| 726 |
### Training Hyperparameters
|
| 727 |
#### Non-Default Hyperparameters
|
| 728 |
|
| 729 |
+
- `per_device_train_batch_size`: 384
|
| 730 |
+
- `learning_rate`: 0.0001
|
| 731 |
- `weight_decay`: 1e-06
|
| 732 |
+
- `num_train_epochs`: 1
|
| 733 |
- `warmup_ratio`: 0.1
|
| 734 |
- `fp16`: True
|
| 735 |
- `gradient_checkpointing`: True
|
| 736 |
+
- `torch_compile`: True
|
| 737 |
+
- `torch_compile_backend`: inductor
|
| 738 |
|
| 739 |
#### All Hyperparameters
|
| 740 |
<details><summary>Click to expand</summary>
|
|
|
|
| 743 |
- `do_predict`: False
|
| 744 |
- `eval_strategy`: no
|
| 745 |
- `prediction_loss_only`: True
|
| 746 |
+
- `per_device_train_batch_size`: 384
|
| 747 |
- `per_device_eval_batch_size`: 8
|
| 748 |
- `per_gpu_train_batch_size`: None
|
| 749 |
- `per_gpu_eval_batch_size`: None
|
| 750 |
- `gradient_accumulation_steps`: 1
|
| 751 |
- `eval_accumulation_steps`: None
|
| 752 |
- `torch_empty_cache_steps`: None
|
| 753 |
+
- `learning_rate`: 0.0001
|
| 754 |
- `weight_decay`: 1e-06
|
| 755 |
- `adam_beta1`: 0.9
|
| 756 |
- `adam_beta2`: 0.999
|
| 757 |
- `adam_epsilon`: 1e-08
|
| 758 |
- `max_grad_norm`: 1.0
|
| 759 |
+
- `num_train_epochs`: 1
|
| 760 |
- `max_steps`: -1
|
| 761 |
- `lr_scheduler_type`: linear
|
| 762 |
- `lr_scheduler_kwargs`: {}
|
|
|
|
| 840 |
- `torchdynamo`: None
|
| 841 |
- `ray_scope`: last
|
| 842 |
- `ddp_timeout`: 1800
|
| 843 |
+
- `torch_compile`: True
|
| 844 |
+
- `torch_compile_backend`: inductor
|
| 845 |
- `torch_compile_mode`: None
|
| 846 |
- `include_tokens_per_second`: False
|
| 847 |
- `include_num_input_tokens_seen`: no
|
|
|
|
| 864 |
### Training Logs
|
| 865 |
| Epoch | Step | Training Loss |
|
| 866 |
|:------:|:-----:|:-------------:|
|
| 867 |
+
| 0.0303 | 500 | 4.8473 |
|
| 868 |
+
| 0.0606 | 1000 | 2.6754 |
|
| 869 |
+
| 0.0909 | 1500 | 2.6358 |
|
| 870 |
+
| 0.1212 | 2000 | 2.619 |
|
| 871 |
+
| 0.1515 | 2500 | 2.8342 |
|
| 872 |
+
| 0.1818 | 3000 | 2.2872 |
|
| 873 |
+
| 0.2121 | 3500 | 2.2727 |
|
| 874 |
+
| 0.2424 | 4000 | 2.3469 |
|
| 875 |
+
| 0.2727 | 4500 | 2.1085 |
|
| 876 |
+
| 0.3030 | 5000 | 2.2076 |
|
| 877 |
+
| 0.3334 | 5500 | 2.1161 |
|
| 878 |
+
| 0.3637 | 6000 | 2.2332 |
|
| 879 |
+
| 0.3940 | 6500 | 2.1574 |
|
| 880 |
+
| 0.4243 | 7000 | 2.1012 |
|
| 881 |
+
| 0.4546 | 7500 | 1.946 |
|
| 882 |
+
| 0.4849 | 8000 | 1.7233 |
|
| 883 |
+
| 0.5152 | 8500 | 2.4444 |
|
| 884 |
+
| 0.5455 | 9000 | 2.1055 |
|
| 885 |
+
| 0.5758 | 9500 | 1.9107 |
|
| 886 |
+
| 0.6061 | 10000 | 2.0212 |
|
| 887 |
+
| 0.6364 | 10500 | 2.1029 |
|
| 888 |
+
| 0.6667 | 11000 | 1.8484 |
|
| 889 |
+
| 0.6970 | 11500 | 2.1658 |
|
| 890 |
+
| 0.7273 | 12000 | 2.1007 |
|
| 891 |
+
| 0.7576 | 12500 | 1.9194 |
|
| 892 |
+
| 0.7879 | 13000 | 1.6709 |
|
| 893 |
+
| 0.8182 | 13500 | 1.7653 |
|
| 894 |
+
| 0.8485 | 14000 | 1.952 |
|
| 895 |
+
| 0.8788 | 14500 | 1.8437 |
|
| 896 |
+
| 0.9091 | 15000 | 1.6667 |
|
| 897 |
+
| 0.9395 | 15500 | 1.7433 |
|
| 898 |
+
| 0.9698 | 16000 | 1.7623 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 899 |
|
| 900 |
|
| 901 |
### Framework Versions
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 127538496
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7658290cf36da3d18ee7ebfc328f9c40bd49d23c22c9bf0cd9cb101c1c526c40
|
| 3 |
size 127538496
|