Training in progress, epoch 1, checkpoint
Browse files
checkpoint-2071/README.md
CHANGED
|
@@ -49,7 +49,7 @@ model-index:
|
|
| 49 |
type: unknown
|
| 50 |
metrics:
|
| 51 |
- type: cosine_accuracy
|
| 52 |
-
value: 0.
|
| 53 |
name: Cosine Accuracy
|
| 54 |
---
|
| 55 |
|
|
@@ -114,9 +114,9 @@ print(embeddings.shape)
|
|
| 114 |
# Get the similarity scores for the embeddings
|
| 115 |
similarities = model.similarity(embeddings, embeddings)
|
| 116 |
print(similarities)
|
| 117 |
-
# tensor([[1.0000, 0.
|
| 118 |
-
# [0.
|
| 119 |
-
# [0.
|
| 120 |
```
|
| 121 |
|
| 122 |
<!--
|
|
@@ -153,7 +153,7 @@ You can finetune this model on your own dataset.
|
|
| 153 |
|
| 154 |
| Metric | Value |
|
| 155 |
|:--------------------|:-----------|
|
| 156 |
-
| **cosine_accuracy** | **0.
|
| 157 |
|
| 158 |
<!--
|
| 159 |
## Bias, Risks and Limitations
|
|
@@ -227,9 +227,10 @@ You can finetune this model on your own dataset.
|
|
| 227 |
- `eval_strategy`: steps
|
| 228 |
- `per_device_train_batch_size`: 256
|
| 229 |
- `per_device_eval_batch_size`: 256
|
|
|
|
| 230 |
- `weight_decay`: 0.001
|
| 231 |
-
- `num_train_epochs`:
|
| 232 |
-
- `warmup_ratio`: 0.
|
| 233 |
- `fp16`: True
|
| 234 |
- `dataloader_num_workers`: 1
|
| 235 |
- `dataloader_prefetch_factor`: 2
|
|
@@ -252,17 +253,17 @@ You can finetune this model on your own dataset.
|
|
| 252 |
- `gradient_accumulation_steps`: 1
|
| 253 |
- `eval_accumulation_steps`: None
|
| 254 |
- `torch_empty_cache_steps`: None
|
| 255 |
-
- `learning_rate`:
|
| 256 |
- `weight_decay`: 0.001
|
| 257 |
- `adam_beta1`: 0.9
|
| 258 |
- `adam_beta2`: 0.999
|
| 259 |
- `adam_epsilon`: 1e-08
|
| 260 |
- `max_grad_norm`: 1.0
|
| 261 |
-
- `num_train_epochs`:
|
| 262 |
- `max_steps`: -1
|
| 263 |
- `lr_scheduler_type`: linear
|
| 264 |
- `lr_scheduler_kwargs`: {}
|
| 265 |
-
- `warmup_ratio`: 0.
|
| 266 |
- `warmup_steps`: 0
|
| 267 |
- `log_level`: passive
|
| 268 |
- `log_level_replica`: warning
|
|
@@ -365,8 +366,8 @@ You can finetune this model on your own dataset.
|
|
| 365 |
| Epoch | Step | Training Loss | Validation Loss | cosine_accuracy |
|
| 366 |
|:------:|:----:|:-------------:|:---------------:|:---------------:|
|
| 367 |
| 0.0005 | 1 | 4.1585 | - | - |
|
| 368 |
-
| 0.4829 | 1000 |
|
| 369 |
-
| 0.9657 | 2000 |
|
| 370 |
|
| 371 |
|
| 372 |
### Framework Versions
|
|
|
|
| 49 |
type: unknown
|
| 50 |
metrics:
|
| 51 |
- type: cosine_accuracy
|
| 52 |
+
value: 0.9479440450668335
|
| 53 |
name: Cosine Accuracy
|
| 54 |
---
|
| 55 |
|
|
|
|
| 114 |
# Get the similarity scores for the embeddings
|
| 115 |
similarities = model.similarity(embeddings, embeddings)
|
| 116 |
print(similarities)
|
| 117 |
+
# tensor([[1.0000, 0.9667, 0.2278],
|
| 118 |
+
# [0.9667, 1.0000, 0.2161],
|
| 119 |
+
# [0.2278, 0.2161, 1.0000]])
|
| 120 |
```
|
| 121 |
|
| 122 |
<!--
|
|
|
|
| 153 |
|
| 154 |
| Metric | Value |
|
| 155 |
|:--------------------|:-----------|
|
| 156 |
+
| **cosine_accuracy** | **0.9479** |
|
| 157 |
|
| 158 |
<!--
|
| 159 |
## Bias, Risks and Limitations
|
|
|
|
| 227 |
- `eval_strategy`: steps
|
| 228 |
- `per_device_train_batch_size`: 256
|
| 229 |
- `per_device_eval_batch_size`: 256
|
| 230 |
+
- `learning_rate`: 2e-05
|
| 231 |
- `weight_decay`: 0.001
|
| 232 |
+
- `num_train_epochs`: 8
|
| 233 |
+
- `warmup_ratio`: 0.2
|
| 234 |
- `fp16`: True
|
| 235 |
- `dataloader_num_workers`: 1
|
| 236 |
- `dataloader_prefetch_factor`: 2
|
|
|
|
| 253 |
- `gradient_accumulation_steps`: 1
|
| 254 |
- `eval_accumulation_steps`: None
|
| 255 |
- `torch_empty_cache_steps`: None
|
| 256 |
+
- `learning_rate`: 2e-05
|
| 257 |
- `weight_decay`: 0.001
|
| 258 |
- `adam_beta1`: 0.9
|
| 259 |
- `adam_beta2`: 0.999
|
| 260 |
- `adam_epsilon`: 1e-08
|
| 261 |
- `max_grad_norm`: 1.0
|
| 262 |
+
- `num_train_epochs`: 8
|
| 263 |
- `max_steps`: -1
|
| 264 |
- `lr_scheduler_type`: linear
|
| 265 |
- `lr_scheduler_kwargs`: {}
|
| 266 |
+
- `warmup_ratio`: 0.2
|
| 267 |
- `warmup_steps`: 0
|
| 268 |
- `log_level`: passive
|
| 269 |
- `log_level_replica`: warning
|
|
|
|
| 366 |
| Epoch | Step | Training Loss | Validation Loss | cosine_accuracy |
|
| 367 |
|:------:|:----:|:-------------:|:---------------:|:---------------:|
|
| 368 |
| 0.0005 | 1 | 4.1585 | - | - |
|
| 369 |
+
| 0.4829 | 1000 | 3.2055 | 0.5676 | 0.9401 |
|
| 370 |
+
| 0.9657 | 2000 | 2.0069 | 0.5089 | 0.9479 |
|
| 371 |
|
| 372 |
|
| 373 |
### Framework Versions
|
checkpoint-2071/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 90864192
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb8c69769dbd05c7072f34ea2f05925262d6c023d92e9f985b4dc449c8405505
|
| 3 |
size 90864192
|
checkpoint-2071/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 180607738
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7e5da75eb39176d4a290a09f3f7bf1add65552f6dc6d63eaf67c1b5cb81f0edd
|
| 3 |
size 180607738
|
checkpoint-2071/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:091e83d21e287330b10fb74cc1330244dba58d03818725e35e0230804e5f3346
|
| 3 |
size 1064
|
checkpoint-2071/trainer_state.json
CHANGED
|
@@ -18,41 +18,41 @@
|
|
| 18 |
},
|
| 19 |
{
|
| 20 |
"epoch": 0.48285852245292127,
|
| 21 |
-
"grad_norm": 6.
|
| 22 |
-
"learning_rate":
|
| 23 |
-
"loss":
|
| 24 |
"step": 1000
|
| 25 |
},
|
| 26 |
{
|
| 27 |
"epoch": 0.48285852245292127,
|
| 28 |
-
"eval_cosine_accuracy": 0.
|
| 29 |
-
"eval_loss": 0.
|
| 30 |
-
"eval_runtime": 35.
|
| 31 |
-
"eval_samples_per_second":
|
| 32 |
-
"eval_steps_per_second": 1.
|
| 33 |
"step": 1000
|
| 34 |
},
|
| 35 |
{
|
| 36 |
"epoch": 0.9657170449058425,
|
| 37 |
-
"grad_norm":
|
| 38 |
-
"learning_rate":
|
| 39 |
-
"loss":
|
| 40 |
"step": 2000
|
| 41 |
},
|
| 42 |
{
|
| 43 |
"epoch": 0.9657170449058425,
|
| 44 |
-
"eval_cosine_accuracy": 0.
|
| 45 |
-
"eval_loss": 0.
|
| 46 |
-
"eval_runtime":
|
| 47 |
-
"eval_samples_per_second":
|
| 48 |
-
"eval_steps_per_second": 1.
|
| 49 |
"step": 2000
|
| 50 |
}
|
| 51 |
],
|
| 52 |
"logging_steps": 1000,
|
| 53 |
-
"max_steps":
|
| 54 |
"num_input_tokens_seen": 0,
|
| 55 |
-
"num_train_epochs":
|
| 56 |
"save_steps": 500,
|
| 57 |
"stateful_callbacks": {
|
| 58 |
"TrainerControl": {
|
|
|
|
| 18 |
},
|
| 19 |
{
|
| 20 |
"epoch": 0.48285852245292127,
|
| 21 |
+
"grad_norm": 6.372687339782715,
|
| 22 |
+
"learning_rate": 6.028968014484008e-06,
|
| 23 |
+
"loss": 3.2055,
|
| 24 |
"step": 1000
|
| 25 |
},
|
| 26 |
{
|
| 27 |
"epoch": 0.48285852245292127,
|
| 28 |
+
"eval_cosine_accuracy": 0.9400568008422852,
|
| 29 |
+
"eval_loss": 0.5675864219665527,
|
| 30 |
+
"eval_runtime": 35.3676,
|
| 31 |
+
"eval_samples_per_second": 268.862,
|
| 32 |
+
"eval_steps_per_second": 1.074,
|
| 33 |
"step": 1000
|
| 34 |
},
|
| 35 |
{
|
| 36 |
"epoch": 0.9657170449058425,
|
| 37 |
+
"grad_norm": 8.225760459899902,
|
| 38 |
+
"learning_rate": 1.2063971031985518e-05,
|
| 39 |
+
"loss": 2.0069,
|
| 40 |
"step": 2000
|
| 41 |
},
|
| 42 |
{
|
| 43 |
"epoch": 0.9657170449058425,
|
| 44 |
+
"eval_cosine_accuracy": 0.9479440450668335,
|
| 45 |
+
"eval_loss": 0.5088897347450256,
|
| 46 |
+
"eval_runtime": 35.33,
|
| 47 |
+
"eval_samples_per_second": 269.148,
|
| 48 |
+
"eval_steps_per_second": 1.076,
|
| 49 |
"step": 2000
|
| 50 |
}
|
| 51 |
],
|
| 52 |
"logging_steps": 1000,
|
| 53 |
+
"max_steps": 16568,
|
| 54 |
"num_input_tokens_seen": 0,
|
| 55 |
+
"num_train_epochs": 8,
|
| 56 |
"save_steps": 500,
|
| 57 |
"stateful_callbacks": {
|
| 58 |
"TrainerControl": {
|
checkpoint-2071/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 5752
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:21cb488b39046dd5929796463136d527fa7f4b248e28c84eb80348f28dc5da8a
|
| 3 |
size 5752
|