Training in progress, step 200, checkpoint
Browse files- last-checkpoint/1_Pooling/config.json +1 -1
- last-checkpoint/README.md +29 -37
- last-checkpoint/config.json +16 -52
- last-checkpoint/config_sentence_transformers.json +3 -15
- last-checkpoint/model.safetensors +2 -2
- last-checkpoint/modules.json +1 -13
- last-checkpoint/optimizer.pt +2 -2
- last-checkpoint/rng_state.pth +1 -1
- last-checkpoint/scaler.pt +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/special_tokens_map.json +13 -9
- last-checkpoint/tokenizer.json +2 -2
- last-checkpoint/tokenizer_config.json +0 -0
- last-checkpoint/trainer_state.json +21 -49
- last-checkpoint/training_args.bin +1 -1
- last-checkpoint/vocab.txt +0 -0
last-checkpoint/1_Pooling/config.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"word_embedding_dimension":
|
| 3 |
"pooling_mode_cls_token": false,
|
| 4 |
"pooling_mode_mean_tokens": true,
|
| 5 |
"pooling_mode_max_tokens": false,
|
|
|
|
| 1 |
{
|
| 2 |
+
"word_embedding_dimension": 384,
|
| 3 |
"pooling_mode_cls_token": false,
|
| 4 |
"pooling_mode_mean_tokens": true,
|
| 5 |
"pooling_mode_max_tokens": false,
|
last-checkpoint/README.md
CHANGED
|
@@ -7,7 +7,7 @@ tags:
|
|
| 7 |
- generated_from_trainer
|
| 8 |
- dataset_size:552482
|
| 9 |
- loss:MultipleNegativesRankingLoss
|
| 10 |
-
base_model:
|
| 11 |
widget:
|
| 12 |
- source_sentence: "title: \nCreatto Flashy Fish Silly Swimmers LightUp 3D Puzzle\
|
| 13 |
\ Kit Includes Creatto Puzzle Pieces to Make Illuminated Craft Creations Sting\
|
|
@@ -356,17 +356,17 @@ pipeline_tag: sentence-similarity
|
|
| 356 |
library_name: sentence-transformers
|
| 357 |
---
|
| 358 |
|
| 359 |
-
# SentenceTransformer based on
|
| 360 |
|
| 361 |
-
This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [
|
| 362 |
|
| 363 |
## Model Details
|
| 364 |
|
| 365 |
### Model Description
|
| 366 |
- **Model Type:** Sentence Transformer
|
| 367 |
-
- **Base model:** [
|
| 368 |
- **Maximum Sequence Length:** 512 tokens
|
| 369 |
-
- **Output Dimensionality:**
|
| 370 |
- **Similarity Function:** Cosine Similarity
|
| 371 |
- **Training Dataset:**
|
| 372 |
- [amazon_2023_items_processed_filtered](https://huggingface.co/datasets/guyhadad01/Amazon_2023_items_processed_filtered)
|
|
@@ -383,11 +383,9 @@ This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [g
|
|
| 383 |
|
| 384 |
```
|
| 385 |
SentenceTransformer(
|
| 386 |
-
(0): Transformer({'max_seq_length': 512, 'do_lower_case': False, 'architecture': '
|
| 387 |
-
(1): Pooling({'word_embedding_dimension':
|
| 388 |
-
(2):
|
| 389 |
-
(3): Dense({'in_features': 3072, 'out_features': 768, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity'})
|
| 390 |
-
(4): Normalize()
|
| 391 |
)
|
| 392 |
```
|
| 393 |
|
|
@@ -408,23 +406,21 @@ from sentence_transformers import SentenceTransformer
|
|
| 408 |
# Download from the 🤗 Hub
|
| 409 |
model = SentenceTransformer("guyhadad01/EncodeRec_Toys")
|
| 410 |
# Run inference
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
]
|
| 414 |
-
documents = [
|
| 415 |
'description\ntop bottom main gear for 910108 or 905308 or 9050081 set top bottom main geartop bottom main gear for 910108 or 905308 or 905008',
|
| 416 |
'description\nFrom the Manufacturer\nRound 2s Polar Lights brings back another popular movie car the Time Machine made famous in Back to the Future The kit features every detail to be expected in a Polar Lights kit from the flux capacitor to its gullwing doors A lightning rod hook power assembly is also included as an option The car body features a new authentic looking brushed metal look that captures the brushed aluminum of the real car The tires roll and the snap assembly means the kit can be assembled in no time with great results With this model kit youre sure to hit 88 miles an hour and be outta timeOfficially licensed from the classic film\nFeatures every detail including the Flux Capacitor\nAuthentic brushedmetal look\nFeatures rolling tires\nSnap assembly',
|
| 417 |
-
'description\nLord of the Fries is the followup to Give Me the Brain another game in the Fast Food Restaurant of the Damned The mechanic is simple combine the ingredients in your hand to build combo meals with dishes like the Cowabunga the Meat Munch and of course the Lord of the Fries This is the fourth edition of Lord of the Fries It was introduced in 1998 and has been updated and improved in every edition The game returns to Cheapass Games after a brief vacation at Steve Jackson Games This time the game is designed to expand The core box contains the original restaurant menu 55 cards plus a completely new Coffee Shop menu with allnew art and ingredients also 55 cards Each deck can support up to 6 players and there is a third menu using ingredients from both decks which supports up to 8 players Also arriving on the scene will be four standalone restaurant decks Mexican Chinese Italian and Irish Each deck plays alone for up to 6 players fits into the core game box and can combine with the core deck for up to 8 playersLightweight funny card game make combo meals from random ingredients\nZombies All new card art by Brian Snoddy\nIncludes a complete Coffee Shop expansion\nDesigned by awardwinning game designer James Ernest\nAll new card art by Brian Snoddy',
|
| 418 |
]
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
# [1, 768] [3, 768]
|
| 423 |
|
| 424 |
# Get the similarity scores for the embeddings
|
| 425 |
-
similarities = model.similarity(
|
| 426 |
print(similarities)
|
| 427 |
-
# tensor([[
|
|
|
|
|
|
|
| 428 |
```
|
| 429 |
|
| 430 |
<!--
|
|
@@ -473,10 +469,10 @@ You can finetune this model on your own dataset.
|
|
| 473 |
* Size: 552,482 training samples
|
| 474 |
* Columns: <code>title</code> and <code>description</code>
|
| 475 |
* Approximate statistics based on the first 1000 samples:
|
| 476 |
-
| | title
|
| 477 |
-
|
| 478 |
-
| type | string
|
| 479 |
-
| details | <ul><li>min:
|
| 480 |
* Samples:
|
| 481 |
| title | description |
|
| 482 |
|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|
@@ -495,8 +491,8 @@ You can finetune this model on your own dataset.
|
|
| 495 |
### Training Hyperparameters
|
| 496 |
#### Non-Default Hyperparameters
|
| 497 |
|
| 498 |
-
- `per_device_train_batch_size`:
|
| 499 |
-
- `num_train_epochs`:
|
| 500 |
- `warmup_ratio`: 0.1
|
| 501 |
- `fp16`: True
|
| 502 |
- `push_to_hub`: True
|
|
@@ -510,7 +506,7 @@ You can finetune this model on your own dataset.
|
|
| 510 |
- `do_predict`: False
|
| 511 |
- `eval_strategy`: no
|
| 512 |
- `prediction_loss_only`: True
|
| 513 |
-
- `per_device_train_batch_size`:
|
| 514 |
- `per_device_eval_batch_size`: 8
|
| 515 |
- `per_gpu_train_batch_size`: None
|
| 516 |
- `per_gpu_eval_batch_size`: None
|
|
@@ -523,7 +519,7 @@ You can finetune this model on your own dataset.
|
|
| 523 |
- `adam_beta2`: 0.999
|
| 524 |
- `adam_epsilon`: 1e-08
|
| 525 |
- `max_grad_norm`: 1.0
|
| 526 |
-
- `num_train_epochs`:
|
| 527 |
- `max_steps`: -1
|
| 528 |
- `lr_scheduler_type`: linear
|
| 529 |
- `lr_scheduler_kwargs`: {}
|
|
@@ -629,14 +625,10 @@ You can finetune this model on your own dataset.
|
|
| 629 |
### Training Logs
|
| 630 |
| Epoch | Step | Training Loss |
|
| 631 |
|:------:|:----:|:-------------:|
|
| 632 |
-
| 0.
|
| 633 |
-
| 0.
|
| 634 |
-
| 0.
|
| 635 |
-
| 0.
|
| 636 |
-
| 0.0145 | 250 | 0.0597 |
|
| 637 |
-
| 0.0174 | 300 | 0.0598 |
|
| 638 |
-
| 0.0203 | 350 | 0.0764 |
|
| 639 |
-
| 0.0232 | 400 | 0.0506 |
|
| 640 |
|
| 641 |
|
| 642 |
### Framework Versions
|
|
|
|
| 7 |
- generated_from_trainer
|
| 8 |
- dataset_size:552482
|
| 9 |
- loss:MultipleNegativesRankingLoss
|
| 10 |
+
base_model: sentence-transformers/all-MiniLM-L6-v2
|
| 11 |
widget:
|
| 12 |
- source_sentence: "title: \nCreatto Flashy Fish Silly Swimmers LightUp 3D Puzzle\
|
| 13 |
\ Kit Includes Creatto Puzzle Pieces to Make Illuminated Craft Creations Sting\
|
|
|
|
| 356 |
library_name: sentence-transformers
|
| 357 |
---
|
| 358 |
|
| 359 |
+
# SentenceTransformer based on sentence-transformers/all-MiniLM-L6-v2
|
| 360 |
|
| 361 |
+
This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) on the [amazon_2023_items_processed_filtered](https://huggingface.co/datasets/guyhadad01/Amazon_2023_items_processed_filtered) dataset. It maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
|
| 362 |
|
| 363 |
## Model Details
|
| 364 |
|
| 365 |
### Model Description
|
| 366 |
- **Model Type:** Sentence Transformer
|
| 367 |
+
- **Base model:** [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) <!-- at revision c9745ed1d9f207416be6d2e6f8de32d1f16199bf -->
|
| 368 |
- **Maximum Sequence Length:** 512 tokens
|
| 369 |
+
- **Output Dimensionality:** 384 dimensions
|
| 370 |
- **Similarity Function:** Cosine Similarity
|
| 371 |
- **Training Dataset:**
|
| 372 |
- [amazon_2023_items_processed_filtered](https://huggingface.co/datasets/guyhadad01/Amazon_2023_items_processed_filtered)
|
|
|
|
| 383 |
|
| 384 |
```
|
| 385 |
SentenceTransformer(
|
| 386 |
+
(0): Transformer({'max_seq_length': 512, 'do_lower_case': False, 'architecture': 'BertModel'})
|
| 387 |
+
(1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
|
| 388 |
+
(2): Normalize()
|
|
|
|
|
|
|
| 389 |
)
|
| 390 |
```
|
| 391 |
|
|
|
|
| 406 |
# Download from the 🤗 Hub
|
| 407 |
model = SentenceTransformer("guyhadad01/EncodeRec_Toys")
|
| 408 |
# Run inference
|
| 409 |
+
sentences = [
|
| 410 |
+
'title: \nDH top bottom main gear for 910108 or 905308 or 905008',
|
|
|
|
|
|
|
| 411 |
'description\ntop bottom main gear for 910108 or 905308 or 9050081 set top bottom main geartop bottom main gear for 910108 or 905308 or 905008',
|
| 412 |
'description\nFrom the Manufacturer\nRound 2s Polar Lights brings back another popular movie car the Time Machine made famous in Back to the Future The kit features every detail to be expected in a Polar Lights kit from the flux capacitor to its gullwing doors A lightning rod hook power assembly is also included as an option The car body features a new authentic looking brushed metal look that captures the brushed aluminum of the real car The tires roll and the snap assembly means the kit can be assembled in no time with great results With this model kit youre sure to hit 88 miles an hour and be outta timeOfficially licensed from the classic film\nFeatures every detail including the Flux Capacitor\nAuthentic brushedmetal look\nFeatures rolling tires\nSnap assembly',
|
|
|
|
| 413 |
]
|
| 414 |
+
embeddings = model.encode(sentences)
|
| 415 |
+
print(embeddings.shape)
|
| 416 |
+
# [3, 384]
|
|
|
|
| 417 |
|
| 418 |
# Get the similarity scores for the embeddings
|
| 419 |
+
similarities = model.similarity(embeddings, embeddings)
|
| 420 |
print(similarities)
|
| 421 |
+
# tensor([[1.0000, 0.8211, 0.0629],
|
| 422 |
+
# [0.8211, 1.0000, 0.1016],
|
| 423 |
+
# [0.0629, 0.1016, 1.0000]])
|
| 424 |
```
|
| 425 |
|
| 426 |
<!--
|
|
|
|
| 469 |
* Size: 552,482 training samples
|
| 470 |
* Columns: <code>title</code> and <code>description</code>
|
| 471 |
* Approximate statistics based on the first 1000 samples:
|
| 472 |
+
| | title | description |
|
| 473 |
+
|:--------|:----------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------|
|
| 474 |
+
| type | string | string |
|
| 475 |
+
| details | <ul><li>min: 6 tokens</li><li>mean: 21.46 tokens</li><li>max: 61 tokens</li></ul> | <ul><li>min: 13 tokens</li><li>mean: 193.18 tokens</li><li>max: 512 tokens</li></ul> |
|
| 476 |
* Samples:
|
| 477 |
| title | description |
|
| 478 |
|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|
|
|
| 491 |
### Training Hyperparameters
|
| 492 |
#### Non-Default Hyperparameters
|
| 493 |
|
| 494 |
+
- `per_device_train_batch_size`: 512
|
| 495 |
+
- `num_train_epochs`: 1
|
| 496 |
- `warmup_ratio`: 0.1
|
| 497 |
- `fp16`: True
|
| 498 |
- `push_to_hub`: True
|
|
|
|
| 506 |
- `do_predict`: False
|
| 507 |
- `eval_strategy`: no
|
| 508 |
- `prediction_loss_only`: True
|
| 509 |
+
- `per_device_train_batch_size`: 512
|
| 510 |
- `per_device_eval_batch_size`: 8
|
| 511 |
- `per_gpu_train_batch_size`: None
|
| 512 |
- `per_gpu_eval_batch_size`: None
|
|
|
|
| 519 |
- `adam_beta2`: 0.999
|
| 520 |
- `adam_epsilon`: 1e-08
|
| 521 |
- `max_grad_norm`: 1.0
|
| 522 |
+
- `num_train_epochs`: 1
|
| 523 |
- `max_steps`: -1
|
| 524 |
- `lr_scheduler_type`: linear
|
| 525 |
- `lr_scheduler_kwargs`: {}
|
|
|
|
| 625 |
### Training Logs
|
| 626 |
| Epoch | Step | Training Loss |
|
| 627 |
|:------:|:----:|:-------------:|
|
| 628 |
+
| 0.0463 | 50 | 0.6644 |
|
| 629 |
+
| 0.0926 | 100 | 0.3737 |
|
| 630 |
+
| 0.1389 | 150 | 0.3271 |
|
| 631 |
+
| 0.1852 | 200 | 0.309 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 632 |
|
| 633 |
|
| 634 |
### Framework Versions
|
last-checkpoint/config.json
CHANGED
|
@@ -1,61 +1,25 @@
|
|
| 1 |
{
|
| 2 |
-
"_sliding_window_pattern": 6,
|
| 3 |
"architectures": [
|
| 4 |
-
"
|
| 5 |
],
|
| 6 |
-
"
|
| 7 |
-
"
|
| 8 |
-
"
|
| 9 |
-
"
|
| 10 |
-
"
|
| 11 |
-
"
|
| 12 |
-
"final_logit_softcapping": null,
|
| 13 |
-
"head_dim": 256,
|
| 14 |
-
"hidden_activation": "gelu_pytorch_tanh",
|
| 15 |
-
"hidden_size": 768,
|
| 16 |
"initializer_range": 0.02,
|
| 17 |
-
"intermediate_size":
|
| 18 |
-
"
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
"sliding_attention",
|
| 24 |
-
"full_attention",
|
| 25 |
-
"sliding_attention",
|
| 26 |
-
"sliding_attention",
|
| 27 |
-
"sliding_attention",
|
| 28 |
-
"sliding_attention",
|
| 29 |
-
"sliding_attention",
|
| 30 |
-
"full_attention",
|
| 31 |
-
"sliding_attention",
|
| 32 |
-
"sliding_attention",
|
| 33 |
-
"sliding_attention",
|
| 34 |
-
"sliding_attention",
|
| 35 |
-
"sliding_attention",
|
| 36 |
-
"full_attention",
|
| 37 |
-
"sliding_attention",
|
| 38 |
-
"sliding_attention",
|
| 39 |
-
"sliding_attention",
|
| 40 |
-
"sliding_attention",
|
| 41 |
-
"sliding_attention",
|
| 42 |
-
"full_attention"
|
| 43 |
-
],
|
| 44 |
-
"max_position_embeddings": 2048,
|
| 45 |
-
"model_type": "gemma3_text",
|
| 46 |
-
"num_attention_heads": 3,
|
| 47 |
-
"num_hidden_layers": 24,
|
| 48 |
-
"num_key_value_heads": 1,
|
| 49 |
"pad_token_id": 0,
|
| 50 |
-
"
|
| 51 |
-
"rms_norm_eps": 1e-06,
|
| 52 |
-
"rope_local_base_freq": 10000.0,
|
| 53 |
-
"rope_scaling": null,
|
| 54 |
-
"rope_theta": 1000000.0,
|
| 55 |
-
"sliding_window": 512,
|
| 56 |
"torch_dtype": "float32",
|
| 57 |
"transformers_version": "4.55.2",
|
| 58 |
-
"
|
| 59 |
"use_cache": true,
|
| 60 |
-
"vocab_size":
|
| 61 |
}
|
|
|
|
| 1 |
{
|
|
|
|
| 2 |
"architectures": [
|
| 3 |
+
"BertModel"
|
| 4 |
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"classifier_dropout": null,
|
| 7 |
+
"gradient_checkpointing": false,
|
| 8 |
+
"hidden_act": "gelu",
|
| 9 |
+
"hidden_dropout_prob": 0.1,
|
| 10 |
+
"hidden_size": 384,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
"initializer_range": 0.02,
|
| 12 |
+
"intermediate_size": 1536,
|
| 13 |
+
"layer_norm_eps": 1e-12,
|
| 14 |
+
"max_position_embeddings": 512,
|
| 15 |
+
"model_type": "bert",
|
| 16 |
+
"num_attention_heads": 12,
|
| 17 |
+
"num_hidden_layers": 6,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
"pad_token_id": 0,
|
| 19 |
+
"position_embedding_type": "absolute",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
"torch_dtype": "float32",
|
| 21 |
"transformers_version": "4.55.2",
|
| 22 |
+
"type_vocab_size": 2,
|
| 23 |
"use_cache": true,
|
| 24 |
+
"vocab_size": 30522
|
| 25 |
}
|
last-checkpoint/config_sentence_transformers.json
CHANGED
|
@@ -1,25 +1,13 @@
|
|
| 1 |
{
|
| 2 |
-
"model_type": "SentenceTransformer",
|
| 3 |
"__version__": {
|
| 4 |
"sentence_transformers": "5.1.0",
|
| 5 |
"transformers": "4.55.2",
|
| 6 |
"pytorch": "2.7.1+cu126"
|
| 7 |
},
|
|
|
|
| 8 |
"prompts": {
|
| 9 |
-
"query": "
|
| 10 |
-
"document": "
|
| 11 |
-
"BitextMining": "task: search result | query: ",
|
| 12 |
-
"Clustering": "task: clustering | query: ",
|
| 13 |
-
"Classification": "task: classification | query: ",
|
| 14 |
-
"InstructionRetrieval": "task: code retrieval | query: ",
|
| 15 |
-
"MultilabelClassification": "task: classification | query: ",
|
| 16 |
-
"PairClassification": "task: sentence similarity | query: ",
|
| 17 |
-
"Reranking": "task: search result | query: ",
|
| 18 |
-
"Retrieval": "task: search result | query: ",
|
| 19 |
-
"Retrieval-query": "task: search result | query: ",
|
| 20 |
-
"Retrieval-document": "title: none | text: ",
|
| 21 |
-
"STS": "task: sentence similarity | query: ",
|
| 22 |
-
"Summarization": "task: summarization | query: "
|
| 23 |
},
|
| 24 |
"default_prompt_name": null,
|
| 25 |
"similarity_fn_name": "cosine"
|
|
|
|
| 1 |
{
|
|
|
|
| 2 |
"__version__": {
|
| 3 |
"sentence_transformers": "5.1.0",
|
| 4 |
"transformers": "4.55.2",
|
| 5 |
"pytorch": "2.7.1+cu126"
|
| 6 |
},
|
| 7 |
+
"model_type": "SentenceTransformer",
|
| 8 |
"prompts": {
|
| 9 |
+
"query": "",
|
| 10 |
+
"document": ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
},
|
| 12 |
"default_prompt_name": null,
|
| 13 |
"similarity_fn_name": "cosine"
|
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ee6d5fe7579a8728e8ce1b5a3d16cc6d66c5719392898bc44d8ff0f2fa3b0a9f
|
| 3 |
+
size 90864192
|
last-checkpoint/modules.json
CHANGED
|
@@ -14,19 +14,7 @@
|
|
| 14 |
{
|
| 15 |
"idx": 2,
|
| 16 |
"name": "2",
|
| 17 |
-
"path": "
|
| 18 |
-
"type": "sentence_transformers.models.Dense"
|
| 19 |
-
},
|
| 20 |
-
{
|
| 21 |
-
"idx": 3,
|
| 22 |
-
"name": "3",
|
| 23 |
-
"path": "3_Dense",
|
| 24 |
-
"type": "sentence_transformers.models.Dense"
|
| 25 |
-
},
|
| 26 |
-
{
|
| 27 |
-
"idx": 4,
|
| 28 |
-
"name": "4",
|
| 29 |
-
"path": "4_Normalize",
|
| 30 |
"type": "sentence_transformers.models.Normalize"
|
| 31 |
}
|
| 32 |
]
|
|
|
|
| 14 |
{
|
| 15 |
"idx": 2,
|
| 16 |
"name": "2",
|
| 17 |
+
"path": "2_Normalize",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
"type": "sentence_transformers.models.Normalize"
|
| 19 |
}
|
| 20 |
]
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:30bfc8d92d7169e34af54cf060e06b10ee4e5ac9ddc2b191b9c37e4b367c1665
|
| 3 |
+
size 180608203
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9cb75720d80bc56d0ec34834b2514caa83f7e0d893f31f4afcbf3ca5aa7e264b
|
| 3 |
size 14645
|
last-checkpoint/scaler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1383
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:124625e167eb28acbfc793cfcb3e8a08b32e7fea06501462bc9e420a5e1beb2a
|
| 3 |
size 1383
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:541a8d7a44578a73d37559badc34c3a1afe0d9b1d50ab26f1598001f3a3c1618
|
| 3 |
size 1465
|
last-checkpoint/special_tokens_map.json
CHANGED
|
@@ -1,30 +1,34 @@
|
|
| 1 |
{
|
| 2 |
-
"
|
| 3 |
-
|
| 4 |
-
"content": "<bos>",
|
| 5 |
"lstrip": false,
|
| 6 |
"normalized": false,
|
| 7 |
"rstrip": false,
|
| 8 |
"single_word": false
|
| 9 |
},
|
| 10 |
-
"
|
| 11 |
-
|
| 12 |
-
"content": "<eos>",
|
| 13 |
"lstrip": false,
|
| 14 |
"normalized": false,
|
| 15 |
"rstrip": false,
|
| 16 |
"single_word": false
|
| 17 |
},
|
| 18 |
-
"image_token": "<image_soft_token>",
|
| 19 |
"pad_token": {
|
| 20 |
-
"content": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
"lstrip": false,
|
| 22 |
"normalized": false,
|
| 23 |
"rstrip": false,
|
| 24 |
"single_word": false
|
| 25 |
},
|
| 26 |
"unk_token": {
|
| 27 |
-
"content": "
|
| 28 |
"lstrip": false,
|
| 29 |
"normalized": false,
|
| 30 |
"rstrip": false,
|
|
|
|
| 1 |
{
|
| 2 |
+
"cls_token": {
|
| 3 |
+
"content": "[CLS]",
|
|
|
|
| 4 |
"lstrip": false,
|
| 5 |
"normalized": false,
|
| 6 |
"rstrip": false,
|
| 7 |
"single_word": false
|
| 8 |
},
|
| 9 |
+
"mask_token": {
|
| 10 |
+
"content": "[MASK]",
|
|
|
|
| 11 |
"lstrip": false,
|
| 12 |
"normalized": false,
|
| 13 |
"rstrip": false,
|
| 14 |
"single_word": false
|
| 15 |
},
|
|
|
|
| 16 |
"pad_token": {
|
| 17 |
+
"content": "[PAD]",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
"sep_token": {
|
| 24 |
+
"content": "[SEP]",
|
| 25 |
"lstrip": false,
|
| 26 |
"normalized": false,
|
| 27 |
"rstrip": false,
|
| 28 |
"single_word": false
|
| 29 |
},
|
| 30 |
"unk_token": {
|
| 31 |
+
"content": "[UNK]",
|
| 32 |
"lstrip": false,
|
| 33 |
"normalized": false,
|
| 34 |
"rstrip": false,
|
last-checkpoint/tokenizer.json
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:91f1def9b9391fdabe028cd3f3fcc4efd34e5d1f08c3bf2de513ebb5911a1854
|
| 3 |
+
size 711649
|
last-checkpoint/tokenizer_config.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,74 +2,46 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
-
"epoch": 0.
|
| 14 |
-
"grad_norm":
|
| 15 |
-
"learning_rate":
|
| 16 |
-
"loss": 0.
|
| 17 |
"step": 50
|
| 18 |
},
|
| 19 |
{
|
| 20 |
-
"epoch": 0.
|
| 21 |
-
"grad_norm":
|
| 22 |
-
"learning_rate":
|
| 23 |
-
"loss": 0.
|
| 24 |
"step": 100
|
| 25 |
},
|
| 26 |
{
|
| 27 |
-
"epoch": 0.
|
| 28 |
-
"grad_norm":
|
| 29 |
-
"learning_rate":
|
| 30 |
-
"loss": 0.
|
| 31 |
"step": 150
|
| 32 |
},
|
| 33 |
{
|
| 34 |
-
"epoch": 0.
|
| 35 |
-
"grad_norm":
|
| 36 |
-
"learning_rate":
|
| 37 |
-
"loss": 0.
|
| 38 |
"step": 200
|
| 39 |
-
},
|
| 40 |
-
{
|
| 41 |
-
"epoch": 0.014479323526004865,
|
| 42 |
-
"grad_norm": 4.010167598724365,
|
| 43 |
-
"learning_rate": 3.546612623045744e-06,
|
| 44 |
-
"loss": 0.0597,
|
| 45 |
-
"step": 250
|
| 46 |
-
},
|
| 47 |
-
{
|
| 48 |
-
"epoch": 0.01737518823120584,
|
| 49 |
-
"grad_norm": 10.415971755981445,
|
| 50 |
-
"learning_rate": 4.270411117544876e-06,
|
| 51 |
-
"loss": 0.0598,
|
| 52 |
-
"step": 300
|
| 53 |
-
},
|
| 54 |
-
{
|
| 55 |
-
"epoch": 0.02027105293640681,
|
| 56 |
-
"grad_norm": 2.172306537628174,
|
| 57 |
-
"learning_rate": 4.994209612044008e-06,
|
| 58 |
-
"loss": 0.0764,
|
| 59 |
-
"step": 350
|
| 60 |
-
},
|
| 61 |
-
{
|
| 62 |
-
"epoch": 0.023166917641607783,
|
| 63 |
-
"grad_norm": 15.422524452209473,
|
| 64 |
-
"learning_rate": 5.718008106543138e-06,
|
| 65 |
-
"loss": 0.0506,
|
| 66 |
-
"step": 400
|
| 67 |
}
|
| 68 |
],
|
| 69 |
"logging_steps": 50,
|
| 70 |
-
"max_steps":
|
| 71 |
"num_input_tokens_seen": 0,
|
| 72 |
-
"num_train_epochs":
|
| 73 |
"save_steps": 200,
|
| 74 |
"stateful_callbacks": {
|
| 75 |
"TrainerControl": {
|
|
@@ -84,7 +56,7 @@
|
|
| 84 |
}
|
| 85 |
},
|
| 86 |
"total_flos": 0.0,
|
| 87 |
-
"train_batch_size":
|
| 88 |
"trial_name": null,
|
| 89 |
"trial_params": null
|
| 90 |
}
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.18518518518518517,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 200,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
+
"epoch": 0.046296296296296294,
|
| 14 |
+
"grad_norm": 1.7154957056045532,
|
| 15 |
+
"learning_rate": 2.2685185185185187e-05,
|
| 16 |
+
"loss": 0.6644,
|
| 17 |
"step": 50
|
| 18 |
},
|
| 19 |
{
|
| 20 |
+
"epoch": 0.09259259259259259,
|
| 21 |
+
"grad_norm": 1.6062076091766357,
|
| 22 |
+
"learning_rate": 4.5833333333333334e-05,
|
| 23 |
+
"loss": 0.3737,
|
| 24 |
"step": 100
|
| 25 |
},
|
| 26 |
{
|
| 27 |
+
"epoch": 0.1388888888888889,
|
| 28 |
+
"grad_norm": 1.5934187173843384,
|
| 29 |
+
"learning_rate": 4.7890946502057616e-05,
|
| 30 |
+
"loss": 0.3271,
|
| 31 |
"step": 150
|
| 32 |
},
|
| 33 |
{
|
| 34 |
+
"epoch": 0.18518518518518517,
|
| 35 |
+
"grad_norm": 1.4217034578323364,
|
| 36 |
+
"learning_rate": 4.531893004115226e-05,
|
| 37 |
+
"loss": 0.309,
|
| 38 |
"step": 200
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
}
|
| 40 |
],
|
| 41 |
"logging_steps": 50,
|
| 42 |
+
"max_steps": 1080,
|
| 43 |
"num_input_tokens_seen": 0,
|
| 44 |
+
"num_train_epochs": 1,
|
| 45 |
"save_steps": 200,
|
| 46 |
"stateful_callbacks": {
|
| 47 |
"TrainerControl": {
|
|
|
|
| 56 |
}
|
| 57 |
},
|
| 58 |
"total_flos": 0.0,
|
| 59 |
+
"train_batch_size": 512,
|
| 60 |
"trial_name": null,
|
| 61 |
"trial_params": null
|
| 62 |
}
|
last-checkpoint/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 6097
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d38868219e5f23127c67539cb1aea90b6a40b71d9518f7f8f35a1f4dc71c50de
|
| 3 |
size 6097
|
last-checkpoint/vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|