Push GCN + LTR + CE artifacts
Browse files- .gitattributes +2 -0
- README.md +56 -0
- adapter/adapter_config.json +36 -0
- adapter/adapter_model.safetensors +3 -0
- cross_encoder/README.md +342 -0
- cross_encoder/config.json +34 -0
- cross_encoder/config_sentence_transformers.json +11 -0
- cross_encoder/model.safetensors +3 -0
- cross_encoder/modules.json +8 -0
- cross_encoder/sentence_bert_config.json +10 -0
- cross_encoder/special_tokens_map.json +51 -0
- cross_encoder/tokenizer.json +3 -0
- cross_encoder/tokenizer_config.json +56 -0
- gcn_head.pt +3 -0
- ltr_idf.json +0 -0
- ltr_model.txt +0 -0
- ltr_spec_asins.json +0 -0
- ltr_spec_embs.pt +3 -0
- ltr_subcat_canonical.json +1 -0
- ltr_subcat_embs.pt +3 -0
- ltr_subcat_names.json +1 -0
- tokenizer/special_tokens_map.json +51 -0
- tokenizer/tokenizer.json +3 -0
- tokenizer/tokenizer_config.json +67 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
cross_encoder/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
tags:
|
| 4 |
+
- retrieval
|
| 5 |
+
- search
|
| 6 |
+
- lightgbm
|
| 7 |
+
- cross-encoder
|
| 8 |
+
- bge-m3
|
| 9 |
+
language:
|
| 10 |
+
- en
|
| 11 |
+
- ar
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# Harrir Search Stack v1
|
| 15 |
+
|
| 16 |
+
Two-tower asymmetric GCN search for fashion retail (Harrir catalog ~28k SKUs),
|
| 17 |
+
multilingual EN+AR. Three artifact groups (GCN retriever, LTR reranker, cross-encoder), version-locked together.
|
| 18 |
+
|
| 19 |
+
## Layout
|
| 20 |
+
|
| 21 |
+
```
|
| 22 |
+
.
|
| 23 |
+
├── adapter/ # LoRA adapter on BGE-M3 (LIVE)
|
| 24 |
+
├── tokenizer/ # BGE-M3 tokenizer (LIVE)
|
| 25 |
+
├── gcn_head.pt # W_q / W_p / W_img projection heads (LIVE)
|
| 26 |
+
├── ltr_model.txt # LightGBM LambdaRank champion (LIVE)
|
| 27 |
+
├── ltr_idf.json # BM25/TF-IDF stats over catalog
|
| 28 |
+
├── ltr_subcat_*.{json,pt} # subcategory embeddings (EN+AR)
|
| 29 |
+
├── ltr_spec_*.{json,pt} # product spec embeddings
|
| 30 |
+
└── cross_encoder/ # BAAI/bge-reranker-base fine-tuned
|
| 31 |
+
# NOT loaded by the app today (served from
|
| 32 |
+
# Modal). Kept here for future bake-in.
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
## Usage
|
| 36 |
+
|
| 37 |
+
The Harrir app loads these at startup. Set this repo in `.env`:
|
| 38 |
+
|
| 39 |
+
```
|
| 40 |
+
GCN_HF_REPO=rdxtremity/search-stack-v1
|
| 41 |
+
GCN_ARTIFACTS_DIR=./models/gcn_stage2
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
Then `python download_models.py` snapshot-downloads into `./models/gcn_stage2/`.
|
| 45 |
+
|
| 46 |
+
Or manually:
|
| 47 |
+
|
| 48 |
+
```python
|
| 49 |
+
from huggingface_hub import snapshot_download
|
| 50 |
+
snapshot_download("rdxtremity/search-stack-v1", local_dir="./models/gcn_stage2")
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
## Eval
|
| 54 |
+
|
| 55 |
+
Baseline nDCG (golden_v1, 346 queries / 19,923 graded pairs): **0.7695**
|
| 56 |
+
Latest with LTR+CE rerank: see `Primary.GateRuns` audit log in the app.
|
adapter/adapter_config.json
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alpha_pattern": {},
|
| 3 |
+
"auto_mapping": {
|
| 4 |
+
"base_model_class": "XLMRobertaModel",
|
| 5 |
+
"parent_library": "transformers.models.xlm_roberta.modeling_xlm_roberta"
|
| 6 |
+
},
|
| 7 |
+
"base_model_name_or_path": "BAAI/bge-m3",
|
| 8 |
+
"bias": "none",
|
| 9 |
+
"eva_config": null,
|
| 10 |
+
"exclude_modules": null,
|
| 11 |
+
"fan_in_fan_out": false,
|
| 12 |
+
"inference_mode": true,
|
| 13 |
+
"init_lora_weights": true,
|
| 14 |
+
"layer_replication": null,
|
| 15 |
+
"layers_pattern": null,
|
| 16 |
+
"layers_to_transform": null,
|
| 17 |
+
"loftq_config": {},
|
| 18 |
+
"lora_alpha": 16,
|
| 19 |
+
"lora_bias": false,
|
| 20 |
+
"lora_dropout": 0.05,
|
| 21 |
+
"megatron_config": null,
|
| 22 |
+
"megatron_core": "megatron.core",
|
| 23 |
+
"modules_to_save": null,
|
| 24 |
+
"peft_type": "LORA",
|
| 25 |
+
"r": 8,
|
| 26 |
+
"rank_pattern": {},
|
| 27 |
+
"revision": null,
|
| 28 |
+
"target_modules": [
|
| 29 |
+
"key",
|
| 30 |
+
"query",
|
| 31 |
+
"value"
|
| 32 |
+
],
|
| 33 |
+
"task_type": null,
|
| 34 |
+
"use_dora": false,
|
| 35 |
+
"use_rslora": false
|
| 36 |
+
}
|
adapter/adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a304235dbab8adfc42af4b1b54d811d166b4f35d79714938a5e542b4cfdc630d
|
| 3 |
+
size 4738136
|
cross_encoder/README.md
ADDED
|
@@ -0,0 +1,342 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- sentence-transformers
|
| 4 |
+
- cross-encoder
|
| 5 |
+
- reranker
|
| 6 |
+
- generated_from_trainer
|
| 7 |
+
- dataset_size:31340
|
| 8 |
+
- loss:BinaryCrossEntropyLoss
|
| 9 |
+
base_model: BAAI/bge-reranker-base
|
| 10 |
+
pipeline_tag: text-ranking
|
| 11 |
+
library_name: sentence-transformers
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# CrossEncoder based on BAAI/bge-reranker-base
|
| 15 |
+
|
| 16 |
+
This is a [Cross Encoder](https://www.sbert.net/docs/cross_encoder/usage/usage.html) model finetuned from [BAAI/bge-reranker-base](https://huggingface.co/BAAI/bge-reranker-base) using the [sentence-transformers](https://www.SBERT.net) library. It computes scores for pairs of texts, which can be used for text reranking and semantic search.
|
| 17 |
+
|
| 18 |
+
## Model Details
|
| 19 |
+
|
| 20 |
+
### Model Description
|
| 21 |
+
- **Model Type:** Cross Encoder
|
| 22 |
+
- **Base model:** [BAAI/bge-reranker-base](https://huggingface.co/BAAI/bge-reranker-base) <!-- at revision 2cfc18c9415c912f9d8155881c133215df768a70 -->
|
| 23 |
+
- **Maximum Sequence Length:** 128 tokens
|
| 24 |
+
- **Number of Output Labels:** 1 label
|
| 25 |
+
- **Supported Modality:** Text
|
| 26 |
+
<!-- - **Training Dataset:** Unknown -->
|
| 27 |
+
<!-- - **Language:** Unknown -->
|
| 28 |
+
<!-- - **License:** Unknown -->
|
| 29 |
+
|
| 30 |
+
### Model Sources
|
| 31 |
+
|
| 32 |
+
- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
|
| 33 |
+
- **Documentation:** [Cross Encoder Documentation](https://www.sbert.net/docs/cross_encoder/usage/usage.html)
|
| 34 |
+
- **Repository:** [Sentence Transformers on GitHub](https://github.com/huggingface/sentence-transformers)
|
| 35 |
+
- **Hugging Face:** [Cross Encoders on Hugging Face](https://huggingface.co/models?library=sentence-transformers&other=cross-encoder)
|
| 36 |
+
|
| 37 |
+
### Full Model Architecture
|
| 38 |
+
|
| 39 |
+
```
|
| 40 |
+
CrossEncoder(
|
| 41 |
+
(0): Transformer({'transformer_task': 'sequence-classification', 'modality_config': {'text': {'method': 'forward', 'method_output_name': 'logits'}}, 'module_output_name': 'scores', 'architecture': 'XLMRobertaForSequenceClassification'})
|
| 42 |
+
)
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
## Usage
|
| 46 |
+
|
| 47 |
+
### Direct Usage (Sentence Transformers)
|
| 48 |
+
|
| 49 |
+
First install the Sentence Transformers library:
|
| 50 |
+
|
| 51 |
+
```bash
|
| 52 |
+
pip install -U sentence-transformers
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
Then you can load this model and run inference.
|
| 56 |
+
```python
|
| 57 |
+
from sentence_transformers import CrossEncoder
|
| 58 |
+
|
| 59 |
+
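# Load from the local snapshot of rdxtremity/search-stack-v1 (cross_encoder/ subfolder; see snapshot_download in the root README)
|
| 60 |
+
model = CrossEncoder("./models/gcn_stage2/cross_encoder")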
|
| 61 |
+
# Get scores for pairs of inputs
|
| 62 |
+
pairs = [
|
| 63 |
+
['حقيبة تشانك لوكس', "Globus Women's Textured Vegan Leather Sling Bag Tan | Globus Women Tan Vegan Leather Textured Sling Bag With Detachable Strap | globus | Crossbody Bags | Tan"],
|
| 64 |
+
['حريمية ماسكات كورية', 'Kappa 3-Pack Crew Socks Multicolour | Kappa Pack of 3 Crew Length Socks | Kappa | Socks | Multicolour'],
|
| 65 |
+
['شسي غير مبطنة', 'Fall In Love Unlined Bodysuit | فول إن لوف بودي سوت غير مبطن | DeFacto | Body Suits | Deep Magenta'],
|
| 66 |
+
['كندرة رموش مريحة للستات', 'Lift N Snatch Brow Tint Pen Black | قلم تحديد الحواجب ليفت أند سناتش رمادي أسود | NYX PROFESSIONAL MAKEUP | All Products | Black'],
|
| 67 |
+
['white blouse', '2Xtremz Schiffli Ruffle Cotton Top White | 2Xtremz Regular Fit Cotton Top with Schiffli and Ruffle Detail | 2Xtremz | Blouses | White'],
|
| 68 |
+
]
|
| 69 |
+
scores = model.predict(pairs)
|
| 70 |
+
print(scores)
|
| 71 |
+
# [0.9418 0.0044 0.978 0.2881 0.9463]
|
| 72 |
+
|
| 73 |
+
# Or rank different texts based on similarity to a single text
|
| 74 |
+
ranks = model.rank(
|
| 75 |
+
'حقيبة تشانك لوكس',
|
| 76 |
+
[
|
| 77 |
+
"Globus Women's Textured Vegan Leather Sling Bag Tan | Globus Women Tan Vegan Leather Textured Sling Bag With Detachable Strap | globus | Crossbody Bags | Tan",
|
| 78 |
+
'Kappa 3-Pack Crew Socks Multicolour | Kappa Pack of 3 Crew Length Socks | Kappa | Socks | Multicolour',
|
| 79 |
+
'Fall In Love Unlined Bodysuit | فول إن لوف بودي سوت غير مبطن | DeFacto | Body Suits | Deep Magenta',
|
| 80 |
+
'Lift N Snatch Brow Tint Pen Black | قلم تحديد الحواجب ليفت أند سناتش رمادي أسود | NYX PROFESSIONAL MAKEUP | All Products | Black',
|
| 81 |
+
'2Xtremz Schiffli Ruffle Cotton Top White | 2Xtremz Regular Fit Cotton Top with Schiffli and Ruffle Detail | 2Xtremz | Blouses | White',
|
| 82 |
+
]
|
| 83 |
+
)
|
| 84 |
+
# [{'corpus_id': ..., 'score': ...}, {'corpus_id': ..., 'score': ...}, ...]
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
<!--
|
| 88 |
+
### Direct Usage (Transformers)
|
| 89 |
+
|
| 90 |
+
<details><summary>Click to see the direct usage in Transformers</summary>
|
| 91 |
+
|
| 92 |
+
</details>
|
| 93 |
+
-->
|
| 94 |
+
|
| 95 |
+
<!--
|
| 96 |
+
### Downstream Usage (Sentence Transformers)
|
| 97 |
+
|
| 98 |
+
You can finetune this model on your own dataset.
|
| 99 |
+
|
| 100 |
+
<details><summary>Click to expand</summary>
|
| 101 |
+
|
| 102 |
+
</details>
|
| 103 |
+
-->
|
| 104 |
+
|
| 105 |
+
<!--
|
| 106 |
+
### Out-of-Scope Use
|
| 107 |
+
|
| 108 |
+
*List how the model may foreseeably be misused and address what users ought not to do with the model.*
|
| 109 |
+
-->
|
| 110 |
+
|
| 111 |
+
<!--
|
| 112 |
+
## Bias, Risks and Limitations
|
| 113 |
+
|
| 114 |
+
*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
|
| 115 |
+
-->
|
| 116 |
+
|
| 117 |
+
<!--
|
| 118 |
+
### Recommendations
|
| 119 |
+
|
| 120 |
+
*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
|
| 121 |
+
-->
|
| 122 |
+
|
| 123 |
+
## Training Details
|
| 124 |
+
|
| 125 |
+
### Training Dataset
|
| 126 |
+
|
| 127 |
+
#### Unnamed Dataset
|
| 128 |
+
|
| 129 |
+
* Size: 31,340 training samples
|
| 130 |
+
* Columns: <code>sentence_0</code>, <code>sentence_1</code>, and <code>label</code>
|
| 131 |
+
* Approximate statistics based on the first 100 samples:
|
| 132 |
+
| | sentence_0 | sentence_1 | label |
|
| 133 |
+
|:---------|:---------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------|:---------------------------------------------------------------|
|
| 134 |
+
| type | string | string | float |
|
| 135 |
+
| modality | text | text | |
|
| 136 |
+
| details | <ul><li>min: 3 tokens</li><li>mean: 7.44 tokens</li><li>max: 19 tokens</li></ul> | <ul><li>min: 19 tokens</li><li>mean: 37.84 tokens</li><li>max: 57 tokens</li></ul> | <ul><li>min: 0.0</li><li>mean: 0.69</li><li>max: 1.0</li></ul> |
|
| 137 |
+
* Samples:
|
| 138 |
+
| sentence_0 | sentence_1 | label |
|
| 139 |
+
|:---------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------|
|
| 140 |
+
| <code>حقيبة تشانك لوكس</code> | <code>Globus Women's Textured Vegan Leather Sling Bag Tan \| Globus Women Tan Vegan Leather Textured Sling Bag With Detachable Strap \| globus \| Crossbody Bags \| Tan</code> | <code>1.0</code> |
|
| 141 |
+
| <code>حريمية ماسكات كورية</code> | <code>Kappa 3-Pack Crew Socks Multicolour \| Kappa Pack of 3 Crew Length Socks \| Kappa \| Socks \| Multicolour</code> | <code>0.0</code> |
|
| 142 |
+
| <code>شسي غير مبطنة</code> | <code>Fall In Love Unlined Bodysuit \| فول إن لوف بودي سوت غير مبطن \| DeFacto \| Body Suits \| Deep Magenta</code> | <code>1.0</code> |
|
| 143 |
+
* Loss: [<code>BinaryCrossEntropyLoss</code>](https://sbert.net/docs/package_reference/cross_encoder/losses.html#binarycrossentropyloss) with these parameters:
|
| 144 |
+
```json
|
| 145 |
+
{
|
| 146 |
+
"activation_fn": "torch.nn.modules.linear.Identity",
|
| 147 |
+
"pos_weight": null
|
| 148 |
+
}
|
| 149 |
+
```
|
| 150 |
+
|
| 151 |
+
### Training Hyperparameters
|
| 152 |
+
#### Non-Default Hyperparameters
|
| 153 |
+
|
| 154 |
+
- `per_device_train_batch_size`: 32
|
| 155 |
+
- `per_device_eval_batch_size`: 32
|
| 156 |
+
- `fp16`: True
|
| 157 |
+
- `disable_tqdm`: True
|
| 158 |
+
|
| 159 |
+
#### All Hyperparameters
|
| 160 |
+
<details><summary>Click to expand</summary>
|
| 161 |
+
|
| 162 |
+
- `overwrite_output_dir`: False
|
| 163 |
+
- `do_predict`: False
|
| 164 |
+
- `prediction_loss_only`: True
|
| 165 |
+
- `per_device_train_batch_size`: 32
|
| 166 |
+
- `per_device_eval_batch_size`: 32
|
| 167 |
+
- `per_gpu_train_batch_size`: None
|
| 168 |
+
- `per_gpu_eval_batch_size`: None
|
| 169 |
+
- `gradient_accumulation_steps`: 1
|
| 170 |
+
- `eval_accumulation_steps`: None
|
| 171 |
+
- `torch_empty_cache_steps`: None
|
| 172 |
+
- `learning_rate`: 5e-05
|
| 173 |
+
- `weight_decay`: 0.0
|
| 174 |
+
- `adam_beta1`: 0.9
|
| 175 |
+
- `adam_beta2`: 0.999
|
| 176 |
+
- `adam_epsilon`: 1e-08
|
| 177 |
+
- `max_grad_norm`: 1
|
| 178 |
+
- `num_train_epochs`: 3
|
| 179 |
+
- `max_steps`: -1
|
| 180 |
+
- `lr_scheduler_type`: linear
|
| 181 |
+
- `lr_scheduler_kwargs`: {}
|
| 182 |
+
- `warmup_ratio`: 0.0
|
| 183 |
+
- `warmup_steps`: 0
|
| 184 |
+
- `log_level`: passive
|
| 185 |
+
- `log_level_replica`: warning
|
| 186 |
+
- `log_on_each_node`: True
|
| 187 |
+
- `logging_nan_inf_filter`: True
|
| 188 |
+
- `save_safetensors`: True
|
| 189 |
+
- `save_on_each_node`: False
|
| 190 |
+
- `save_only_model`: False
|
| 191 |
+
- `restore_callback_states_from_checkpoint`: False
|
| 192 |
+
- `no_cuda`: False
|
| 193 |
+
- `use_cpu`: False
|
| 194 |
+
- `use_mps_device`: False
|
| 195 |
+
- `seed`: 42
|
| 196 |
+
- `data_seed`: None
|
| 197 |
+
- `jit_mode_eval`: False
|
| 198 |
+
- `use_ipex`: False
|
| 199 |
+
- `bf16`: False
|
| 200 |
+
- `fp16`: True
|
| 201 |
+
- `fp16_opt_level`: O1
|
| 202 |
+
- `half_precision_backend`: auto
|
| 203 |
+
- `bf16_full_eval`: False
|
| 204 |
+
- `fp16_full_eval`: False
|
| 205 |
+
- `tf32`: None
|
| 206 |
+
- `local_rank`: 0
|
| 207 |
+
- `ddp_backend`: None
|
| 208 |
+
- `tpu_num_cores`: None
|
| 209 |
+
- `tpu_metrics_debug`: False
|
| 210 |
+
- `debug`: []
|
| 211 |
+
- `dataloader_drop_last`: False
|
| 212 |
+
- `dataloader_num_workers`: 0
|
| 213 |
+
- `dataloader_prefetch_factor`: None
|
| 214 |
+
- `past_index`: -1
|
| 215 |
+
- `disable_tqdm`: True
|
| 216 |
+
- `remove_unused_columns`: True
|
| 217 |
+
- `label_names`: None
|
| 218 |
+
- `load_best_model_at_end`: False
|
| 219 |
+
- `ignore_data_skip`: False
|
| 220 |
+
- `fsdp`: []
|
| 221 |
+
- `fsdp_min_num_params`: 0
|
| 222 |
+
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
|
| 223 |
+
- `fsdp_transformer_layer_cls_to_wrap`: None
|
| 224 |
+
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
|
| 225 |
+
- `deepspeed`: None
|
| 226 |
+
- `label_smoothing_factor`: 0.0
|
| 227 |
+
- `optim`: adamw_torch
|
| 228 |
+
- `optim_args`: None
|
| 229 |
+
- `adafactor`: False
|
| 230 |
+
- `group_by_length`: False
|
| 231 |
+
- `length_column_name`: length
|
| 232 |
+
- `ddp_find_unused_parameters`: None
|
| 233 |
+
- `ddp_bucket_cap_mb`: None
|
| 234 |
+
- `ddp_broadcast_buffers`: False
|
| 235 |
+
- `dataloader_pin_memory`: True
|
| 236 |
+
- `dataloader_persistent_workers`: False
|
| 237 |
+
- `skip_memory_metrics`: True
|
| 238 |
+
- `use_legacy_prediction_loop`: False
|
| 239 |
+
- `push_to_hub`: False
|
| 240 |
+
- `resume_from_checkpoint`: None
|
| 241 |
+
- `hub_model_id`: None
|
| 242 |
+
- `hub_strategy`: every_save
|
| 243 |
+
- `hub_private_repo`: None
|
| 244 |
+
- `hub_always_push`: False
|
| 245 |
+
- `gradient_checkpointing`: False
|
| 246 |
+
- `gradient_checkpointing_kwargs`: None
|
| 247 |
+
- `include_inputs_for_metrics`: False
|
| 248 |
+
- `include_for_metrics`: []
|
| 249 |
+
- `eval_do_concat_batches`: True
|
| 250 |
+
- `fp16_backend`: auto
|
| 251 |
+
- `push_to_hub_model_id`: None
|
| 252 |
+
- `push_to_hub_organization`: None
|
| 253 |
+
- `mp_parameters`:
|
| 254 |
+
- `auto_find_batch_size`: False
|
| 255 |
+
- `full_determinism`: False
|
| 256 |
+
- `torchdynamo`: None
|
| 257 |
+
- `ray_scope`: last
|
| 258 |
+
- `ddp_timeout`: 1800
|
| 259 |
+
- `torch_compile`: False
|
| 260 |
+
- `torch_compile_backend`: None
|
| 261 |
+
- `torch_compile_mode`: None
|
| 262 |
+
- `dispatch_batches`: None
|
| 263 |
+
- `split_batches`: None
|
| 264 |
+
- `include_tokens_per_second`: False
|
| 265 |
+
- `include_num_input_tokens_seen`: False
|
| 266 |
+
- `neftune_noise_alpha`: None
|
| 267 |
+
- `optim_target_modules`: None
|
| 268 |
+
- `batch_eval_metrics`: False
|
| 269 |
+
- `eval_on_start`: False
|
| 270 |
+
- `use_liger_kernel`: False
|
| 271 |
+
- `eval_use_gather_object`: False
|
| 272 |
+
- `average_tokens_across_devices`: False
|
| 273 |
+
- `prompts`: None
|
| 274 |
+
- `batch_sampler`: batch_sampler
|
| 275 |
+
- `multi_dataset_batch_sampler`: proportional
|
| 276 |
+
- `router_mapping`: {}
|
| 277 |
+
- `learning_rate_mapping`: {}
|
| 278 |
+
|
| 279 |
+
</details>
|
| 280 |
+
|
| 281 |
+
### Training Logs
|
| 282 |
+
| Epoch | Step | Training Loss |
|
| 283 |
+
|:------:|:----:|:-------------:|
|
| 284 |
+
| 0.5102 | 500 | 0.6826 |
|
| 285 |
+
| 1.0204 | 1000 | 0.4261 |
|
| 286 |
+
| 1.5306 | 1500 | 0.3741 |
|
| 287 |
+
| 2.0408 | 2000 | 0.3523 |
|
| 288 |
+
| 2.5510 | 2500 | 0.33 |
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
### Training Time
|
| 292 |
+
- **Training**: 5.3 minutes
|
| 293 |
+
|
| 294 |
+
### Framework Versions
|
| 295 |
+
- Python: 3.11.12
|
| 296 |
+
- Sentence Transformers: 5.5.1
|
| 297 |
+
- Transformers: 4.49.0
|
| 298 |
+
- PyTorch: 2.7.0+cu128
|
| 299 |
+
- Accelerate: 1.13.0
|
| 300 |
+
- Datasets: 4.8.5
|
| 301 |
+
- Tokenizers: 0.21.4
|
| 302 |
+
|
| 303 |
+
## Additional Resources
|
| 304 |
+
|
| 305 |
+
- [Training and Finetuning Reranker Models with Sentence Transformers](https://huggingface.co/blog/train-reranker): the end-to-end guide for training or finetuning Cross Encoder (reranker) models.
|
| 306 |
+
- [Multimodal Embedding & Reranker Models with Sentence Transformers](https://huggingface.co/blog/multimodal-sentence-transformers): use text, image, audio, and video reranker models through the same API.
|
| 307 |
+
- [Training and Finetuning Multimodal Embedding & Reranker Models with Sentence Transformers](https://huggingface.co/blog/train-multimodal-sentence-transformers): training multimodal Cross Encoders.
|
| 308 |
+
|
| 309 |
+
## Citation
|
| 310 |
+
|
| 311 |
+
### BibTeX
|
| 312 |
+
|
| 313 |
+
#### Sentence Transformers
|
| 314 |
+
```bibtex
|
| 315 |
+
@inproceedings{reimers-2019-sentence-bert,
|
| 316 |
+
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
|
| 317 |
+
author = "Reimers, Nils and Gurevych, Iryna",
|
| 318 |
+
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
|
| 319 |
+
month = "11",
|
| 320 |
+
year = "2019",
|
| 321 |
+
publisher = "Association for Computational Linguistics",
|
| 322 |
+
url = "https://arxiv.org/abs/1908.10084",
|
| 323 |
+
}
|
| 324 |
+
```
|
| 325 |
+
|
| 326 |
+
<!--
|
| 327 |
+
## Glossary
|
| 328 |
+
|
| 329 |
+
*Clearly define terms in order to be accessible across audiences.*
|
| 330 |
+
-->
|
| 331 |
+
|
| 332 |
+
<!--
|
| 333 |
+
## Model Card Authors
|
| 334 |
+
|
| 335 |
+
*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
|
| 336 |
+
-->
|
| 337 |
+
|
| 338 |
+
<!--
|
| 339 |
+
## Model Card Contact
|
| 340 |
+
|
| 341 |
+
*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
|
| 342 |
+
-->
|
cross_encoder/config.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "BAAI/bge-reranker-base",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"XLMRobertaForSequenceClassification"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"bos_token_id": 0,
|
| 8 |
+
"classifier_dropout": null,
|
| 9 |
+
"eos_token_id": 2,
|
| 10 |
+
"hidden_act": "gelu",
|
| 11 |
+
"hidden_dropout_prob": 0.1,
|
| 12 |
+
"hidden_size": 768,
|
| 13 |
+
"id2label": {
|
| 14 |
+
"0": "LABEL_0"
|
| 15 |
+
},
|
| 16 |
+
"initializer_range": 0.02,
|
| 17 |
+
"intermediate_size": 3072,
|
| 18 |
+
"label2id": {
|
| 19 |
+
"LABEL_0": 0
|
| 20 |
+
},
|
| 21 |
+
"layer_norm_eps": 1e-05,
|
| 22 |
+
"max_position_embeddings": 514,
|
| 23 |
+
"model_type": "xlm-roberta",
|
| 24 |
+
"num_attention_heads": 12,
|
| 25 |
+
"num_hidden_layers": 12,
|
| 26 |
+
"output_past": true,
|
| 27 |
+
"pad_token_id": 1,
|
| 28 |
+
"position_embedding_type": "absolute",
|
| 29 |
+
"torch_dtype": "float32",
|
| 30 |
+
"transformers_version": "4.49.0",
|
| 31 |
+
"type_vocab_size": 1,
|
| 32 |
+
"use_cache": true,
|
| 33 |
+
"vocab_size": 250002
|
| 34 |
+
}
|
cross_encoder/config_sentence_transformers.json
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"__version__": {
|
| 3 |
+
"pytorch": "2.7.0+cu128",
|
| 4 |
+
"sentence_transformers": "5.5.1",
|
| 5 |
+
"transformers": "4.49.0"
|
| 6 |
+
},
|
| 7 |
+
"activation_fn": "torch.nn.modules.activation.Sigmoid",
|
| 8 |
+
"default_prompt_name": null,
|
| 9 |
+
"model_type": "CrossEncoder",
|
| 10 |
+
"prompts": {}
|
| 11 |
+
}
|
cross_encoder/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c6ff0ad22d85fff1bf93551097d3653a6cc9fc96103db654fdd2847a3f75400f
|
| 3 |
+
size 1112201932
|
cross_encoder/modules.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"idx": 0,
|
| 4 |
+
"name": "0",
|
| 5 |
+
"path": "",
|
| 6 |
+
"type": "sentence_transformers.base.modules.transformer.Transformer"
|
| 7 |
+
}
|
| 8 |
+
]
|
cross_encoder/sentence_bert_config.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"transformer_task": "sequence-classification",
|
| 3 |
+
"modality_config": {
|
| 4 |
+
"text": {
|
| 5 |
+
"method": "forward",
|
| 6 |
+
"method_output_name": "logits"
|
| 7 |
+
}
|
| 8 |
+
},
|
| 9 |
+
"module_output_name": "scores"
|
| 10 |
+
}
|
cross_encoder/special_tokens_map.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"cls_token": {
|
| 10 |
+
"content": "<s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"eos_token": {
|
| 17 |
+
"content": "</s>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
"mask_token": {
|
| 24 |
+
"content": "<mask>",
|
| 25 |
+
"lstrip": true,
|
| 26 |
+
"normalized": true,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
},
|
| 30 |
+
"pad_token": {
|
| 31 |
+
"content": "<pad>",
|
| 32 |
+
"lstrip": false,
|
| 33 |
+
"normalized": false,
|
| 34 |
+
"rstrip": false,
|
| 35 |
+
"single_word": false
|
| 36 |
+
},
|
| 37 |
+
"sep_token": {
|
| 38 |
+
"content": "</s>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false
|
| 43 |
+
},
|
| 44 |
+
"unk_token": {
|
| 45 |
+
"content": "<unk>",
|
| 46 |
+
"lstrip": false,
|
| 47 |
+
"normalized": false,
|
| 48 |
+
"rstrip": false,
|
| 49 |
+
"single_word": false
|
| 50 |
+
}
|
| 51 |
+
}
|
cross_encoder/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1ae4855312bea5f8100cf97370091e0c5232dca305c756d98ebce45c2ea383cc
|
| 3 |
+
size 17083051
|
cross_encoder/tokenizer_config.json
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<s>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<pad>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "</s>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "<unk>",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"250001": {
|
| 36 |
+
"content": "<mask>",
|
| 37 |
+
"lstrip": true,
|
| 38 |
+
"normalized": true,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"bos_token": "<s>",
|
| 45 |
+
"clean_up_tokenization_spaces": true,
|
| 46 |
+
"cls_token": "<s>",
|
| 47 |
+
"eos_token": "</s>",
|
| 48 |
+
"extra_special_tokens": {},
|
| 49 |
+
"mask_token": "<mask>",
|
| 50 |
+
"model_max_length": 128,
|
| 51 |
+
"pad_token": "<pad>",
|
| 52 |
+
"sep_token": "</s>",
|
| 53 |
+
"sp_model_kwargs": {},
|
| 54 |
+
"tokenizer_class": "XLMRobertaTokenizer",
|
| 55 |
+
"unk_token": "<unk>"
|
| 56 |
+
}
|
gcn_head.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:173f9566f3b9275e9722dd96c861bc77f607c6e37db33dcf600d3c02868514d4
|
| 3 |
+
size 19938337
|
ltr_idf.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ltr_model.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ltr_spec_asins.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
ltr_spec_embs.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:489299985d47e21770912eab0c0a9a8583b0ec7636cd92b4e65a217bd8f37519
|
| 3 |
+
size 114706003
|
ltr_subcat_canonical.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
["Accessories", "Acne & Blemish Care", "All Products", "Anklets", "Athletic Leggings", "Athletic Shorts", "Athletic Tops", "BB & CC Cream", "Backpacks", "Bags", "Barrel", "Belt Bags", "Belts", "Bikinis", "Blazers", "Blouses", "Blush", "Body Suits", "Bodysuits (Casual)", "Bodysuits (Intimate)", "Boots", "Bottoms", "Boyfriend", "Bracelets", "Bras", "Bridal Lingerie", "Bridesmaid Dresses", "Bronzer & Contour", "Bucket Bags", "Bundles", "Cardigans", "Cleansers", "Clutches", "Co-ords", "Coats", "Cocktail Dresses", "Concealer", "Conditioner", "Cosmetic Bags", "Cover-Ups", "Crop Tops", "Crossbody Bags", "Culottes", "Cuticle Care", "Denim Dresses", "Denim Jackets & Coats", "Denim Overalls & Jumpsuits", "Denim Shorts", "Denim Skirts", "Denim Tops", "Denim Two-piece Outfits", "Down Coats", "Dresses", "Dry Shampoo", "Duffle Bags", "Earrings", "Exfoliators & Scrubs", "Eye Cream", "Eye Primer", "Eyebrow Pencils & Gels", "Eyeliner", "Eyeshadow Palettes", "Face Masks", "Face Oils", "Face Primer", "False Lashes", "Faux Fur Coats", "Flats & Slip-Ons", "Formal & Evening Dresses", "Foundation", "Gel & Shellac", "Hair Accessories", "Hair Brushes & Combs", "Hair Color", "Hair Dryers & Stylers", "Hair Masks & Treatments", "Hair Oil", "Hair Styling Products", "Heat Protectant", "Heels", "Highlighter", "Hoodies", "Jackets", "Jeans", "Jumpsuits", "Jumpsuits & Two-pieces", "Knit Tops", "Laptop Bags & Sleeves", "Leggings", "Lightweight Blazers", "Lightweight Cardigans", "Lightweight Jackets", "Lingerie & Underwear", "Lingerie Sets", "Lip Balm & Treatment", "Lip Care", "Lip Gloss", "Lip Liner", "Lipstick", "Loafers", "Long Dresses", "Loungewear", "Makeup", "Makeup Brushes & Tools", "Makeup Remover", "Makeup Sets & Kits", "Makeup Sponges", "Mascara", "Maternity Bottoms", "Maternity Coats & Jackets", "Maternity Denim", "Maternity Dresses", "Maternity Gowns", "Maternity Sweaters", "Maternity Tops", "Maxi Dresses", "Maxi Party Dresses", "Micellar Water", "Midi Dresses", "Mini Bags", "Mini 
Dresses", "Modest Evening Dresses", "Moisturizers", "Mules", "Nail Care Tools", "Nail Polish", "Nail Polish Remover", "Necklaces", "Night Cream", "Nighties", "Nightwear", "Nursing", "One-Pieces", "Other Tops", "Overcoats", "Pant Sets", "Panties", "Pants", "Party Jumpsuits", "Plus Size Activewear", "Plus Size Bottoms", "Plus Size Dresses", "Plus Size Jumpsuits", "Plus Size Knitwear", "Plus Size Lingerie", "Plus Size Outerwear", "Plus Size Tops", "Press-On Nails", "Purses", "Pyjama Pants", "Pyjama Sets", "Ring Sets", "Robes", "Sandals", "Sarongs", "Satchels", "Scalp Care", "Scarves", "Serums", "Setting Powder", "Setting Spray", "Shampoo", "Shapewear", "Sheet Masks", "Shirts & Button-Downs", "Shoe Care", "Shoes", "Shoppers & Totes", "Short Dresses", "Shorts", "Shoulder Bags", "Skin Tools (Rollers, Gua Sha)", "Skincare", "Skincare Sets", "Skirts", "Slides & Flip-Flops", "Slips", "Sneakers", "Socks", "Sports Bags", "Sports Bras", "Sports Gea", "Sports Sets", "Sports Shoes", "Sports Skirts", "Sportswear", "Strapless & Convertible Bras", "Suit Pants", "Suit Sets", "Suits", "Sunglasses", "Sunscreen & SPF", "Sweater Co-ords", "Sweater Dresses", "Sweater Skirts", "Sweater Vests", "Sweaters", "Sweatpants", "Sweatshirts", "Swim Shorts", "Swimwear", "T-shirts", "Tank Tops & Camis", "Tankinis", "The Hat Store", "Toners", "Tops", "Track Pants", "Trench Coats", "Two-piece Outfits", "Unitards", "Watches", "Wedding Dresses", "Wide-Leg Pants", "Windbreakers", "Accessories", "Acne & Blemish Care", "All Products", "Anklets", "Athletic Leggings", "Athletic Shorts", "Athletic Tops", "BB & CC Cream", "Backpacks", "Bags", "Belt Bags", "Belts", "Bikinis", "Blazers", "Blouses", "Blush", "Body Suits", "Bodysuits (Casual)", "Bodysuits (Intimate)", "Boots", "Bottoms", "Bracelets", "Bras", "Bridal Lingerie", "Bridesmaid Dresses", "Bronzer & Contour", "Bucket Bags", "Bundles", "Cardigans", "Cleansers", "Clutches", "Co-ords", "Coats", "Cocktail Dresses", "Concealer", "Conditioner", "Cosmetic 
Bags", "Cover-Ups", "Crop Tops", "Crossbody Bags", "Culottes", "Cuticle Care", "Denim Dresses", "Denim Jackets & Coats", "Denim Overalls & Jumpsuits", "Denim Shorts", "Denim Skirts", "Denim Tops", "Denim Two-piece Outfits", "Down Coats", "Dresses", "Dry Shampoo", "Duffle Bags", "Earrings", "Exfoliators & Scrubs", "Eye Cream", "Eye Primer", "Eyebrow Pencils & Gels", "Eyeliner", "Eyeshadow Palettes", "Face Masks", "Face Oils", "Face Primer", "False Lashes", "Faux Fur Coats", "Flats & Slip-Ons", "Formal & Evening Dresses", "Foundation", "Gel & Shellac", "Hair Accessories", "Hair Brushes & Combs", "Hair Color", "Hair Dryers & Stylers", "Hair Masks & Treatments", "Hair Oil", "Hair Styling Products", "Heat Protectant", "Heels", "Highlighter", "Hoodies", "Jackets", "Jeans", "Jumpsuits", "Jumpsuits & Two-pieces", "Knit Tops", "Laptop Bags & Sleeves", "Leggings", "Lightweight Blazers", "Lightweight Cardigans", "Lightweight Jackets", "Lingerie & Underwear", "Lingerie Sets", "Lip Balm & Treatment", "Lip Care", "Lip Gloss", "Lip Liner", "Lipstick", "Loafers", "Long Dresses", "Loungewear", "Makeup", "Makeup Brushes & Tools", "Makeup Remover", "Makeup Sets & Kits", "Makeup Sponges", "Mascara", "Maternity Bottoms", "Maternity Coats & Jackets", "Maternity Denim", "Maternity Dresses", "Maternity Gowns", "Maternity Sweaters", "Maternity Tops", "Maxi Dresses", "Maxi Party Dresses", "Micellar Water", "Midi Dresses", "Mini Bags", "Mini Dresses", "Modest Evening Dresses", "Moisturizers", "Mules", "Nail Care Tools", "Nail Polish", "Nail Polish Remover", "Necklaces", "Night Cream", "Nighties", "Nightwear", "Nursing", "One-Pieces", "Other Tops", "Other Tops", "Overcoats", "Pant Sets", "Panties", "Pants", "Party Jumpsuits", "Plus Size Activewear", "Plus Size Bottoms", "Plus Size Dresses", "Plus Size Jumpsuits", "Plus Size Knitwear", "Plus Size Lingerie", "Plus Size Outerwear", "Plus Size Tops", "Press-On Nails", "Purses", "Pyjama Pants", "Pyjama Sets", "Ring Sets", "Robes", "Sandals", 
"Sarongs", "Satchels", "Scalp Care", "Scarves", "Serums", "Setting Powder", "Setting Spray", "Shampoo", "Shapewear", "Sheet Masks", "Shirts & Button-Downs", "Shoe Care", "Shoes", "Shoppers & Totes", "Short Dresses", "Shorts", "Shorts", "Shoulder Bags", "Skin Tools (Rollers, Gua Sha)", "Skincare", "Skincare Sets", "Skirts", "Slides & Flip-Flops", "Slips", "Sneakers", "Socks", "Sports Bags", "Sports Bras", "Sports Gea", "Sports Sets", "Sports Shoes", "Sports Skirts", "Sportswear", "Strapless & Convertible Bras", "Suit Pants", "Suit Sets", "Suits", "Sunglasses", "Sunscreen & SPF", "Sweater Co-ords", "Sweater Dresses", "Sweater Skirts", "Sweater Vests", "Sweaters", "Sweatpants", "Sweatshirts", "Swim Shorts", "Swimwear", "T-shirts", "T-shirts", "Tank Tops & Camis", "Tank Tops & Camis", "Tankinis", "The Hat Store", "Toners", "Tops", "Track Pants", "Trench Coats", "Two-piece Outfits", "Unitards", "Watches", "Wedding Dresses", "Wide-Leg Pants", "Windbreakers"]
|
ltr_subcat_embs.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:997ca747f4998bb6be8e5c678bb7d2900943656929ba0d7928181882ffe8ca7d
|
| 3 |
+
size 1771105
|
ltr_subcat_names.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
["Accessories", "Acne & Blemish Care", "All Products", "Anklets", "Athletic Leggings", "Athletic Shorts", "Athletic Tops", "BB & CC Cream", "Backpacks", "Bags", "Barrel", "Belt Bags", "Belts", "Bikinis", "Blazers", "Blouses", "Blush", "Body Suits", "Bodysuits (Casual)", "Bodysuits (Intimate)", "Boots", "Bottoms", "Boyfriend", "Bracelets", "Bras", "Bridal Lingerie", "Bridesmaid Dresses", "Bronzer & Contour", "Bucket Bags", "Bundles", "Cardigans", "Cleansers", "Clutches", "Co-ords", "Coats", "Cocktail Dresses", "Concealer", "Conditioner", "Cosmetic Bags", "Cover-Ups", "Crop Tops", "Crossbody Bags", "Culottes", "Cuticle Care", "Denim Dresses", "Denim Jackets & Coats", "Denim Overalls & Jumpsuits", "Denim Shorts", "Denim Skirts", "Denim Tops", "Denim Two-piece Outfits", "Down Coats", "Dresses", "Dry Shampoo", "Duffle Bags", "Earrings", "Exfoliators & Scrubs", "Eye Cream", "Eye Primer", "Eyebrow Pencils & Gels", "Eyeliner", "Eyeshadow Palettes", "Face Masks", "Face Oils", "Face Primer", "False Lashes", "Faux Fur Coats", "Flats & Slip-Ons", "Formal & Evening Dresses", "Foundation", "Gel & Shellac", "Hair Accessories", "Hair Brushes & Combs", "Hair Color", "Hair Dryers & Stylers", "Hair Masks & Treatments", "Hair Oil", "Hair Styling Products", "Heat Protectant", "Heels", "Highlighter", "Hoodies", "Jackets", "Jeans", "Jumpsuits", "Jumpsuits & Two-pieces", "Knit Tops", "Laptop Bags & Sleeves", "Leggings", "Lightweight Blazers", "Lightweight Cardigans", "Lightweight Jackets", "Lingerie & Underwear", "Lingerie Sets", "Lip Balm & Treatment", "Lip Care", "Lip Gloss", "Lip Liner", "Lipstick", "Loafers", "Long Dresses", "Loungewear", "Makeup", "Makeup Brushes & Tools", "Makeup Remover", "Makeup Sets & Kits", "Makeup Sponges", "Mascara", "Maternity Bottoms", "Maternity Coats & Jackets", "Maternity Denim", "Maternity Dresses", "Maternity Gowns", "Maternity Sweaters", "Maternity Tops", "Maxi Dresses", "Maxi Party Dresses", "Micellar Water", "Midi Dresses", "Mini Bags", "Mini 
Dresses", "Modest Evening Dresses", "Moisturizers", "Mules", "Nail Care Tools", "Nail Polish", "Nail Polish Remover", "Necklaces", "Night Cream", "Nighties", "Nightwear", "Nursing", "One-Pieces", "Other Tops", "Overcoats", "Pant Sets", "Panties", "Pants", "Party Jumpsuits", "Plus Size Activewear", "Plus Size Bottoms", "Plus Size Dresses", "Plus Size Jumpsuits", "Plus Size Knitwear", "Plus Size Lingerie", "Plus Size Outerwear", "Plus Size Tops", "Press-On Nails", "Purses", "Pyjama Pants", "Pyjama Sets", "Ring Sets", "Robes", "Sandals", "Sarongs", "Satchels", "Scalp Care", "Scarves", "Serums", "Setting Powder", "Setting Spray", "Shampoo", "Shapewear", "Sheet Masks", "Shirts & Button-Downs", "Shoe Care", "Shoes", "Shoppers & Totes", "Short Dresses", "Shorts", "Shoulder Bags", "Skin Tools (Rollers, Gua Sha)", "Skincare", "Skincare Sets", "Skirts", "Slides & Flip-Flops", "Slips", "Sneakers", "Socks", "Sports Bags", "Sports Bras", "Sports Gea", "Sports Sets", "Sports Shoes", "Sports Skirts", "Sportswear", "Strapless & Convertible Bras", "Suit Pants", "Suit Sets", "Suits", "Sunglasses", "Sunscreen & SPF", "Sweater Co-ords", "Sweater Dresses", "Sweater Skirts", "Sweater Vests", "Sweaters", "Sweatpants", "Sweatshirts", "Swim Shorts", "Swimwear", "T-shirts", "Tank Tops & Camis", "Tankinis", "The Hat Store", "Toners", "Tops", "Track Pants", "Trench Coats", "Two-piece Outfits", "Unitards", "Watches", "Wedding Dresses", "Wide-Leg Pants", "Windbreakers", "إكسسوارات", "علاج حب الشباب", "كل المنتجات", "خلاخيل", "ليقنز رياضي", "شورتات رياضية", "بلوزات رياضية", "بي بي وسي سي كريم", "حقائب ظهر", "حقائب", "حقائب خصر", "أحزمة", "بيكيني", "بليزرات", "بلوزات", "بلاشر", "بودي سوت", "بودي سوت كاجوال", "بودي سوت لانجري", "بوتات", "بناطيل وتنانير", "أساور", "حمالات صدر", "لانجري عرائس", "فساتين وصيفات العروس", "برونزر وكونتور", "حقائب باكيت", "عروض أحذية", "كارديغان", "غسول الوجه", "كلاتش", "أطقم متناسقة", "معاطف", "فساتين كوكتيل", "كونسيلر", "بلسم", "حقائب مكياج", "أغطية شاطئ", "كروب توب", 
"حقائب كروس", "كيلوت", "العناية بالجلد المحيط بالأظافر", "فساتين جينز", "جاكيتات ومعاطف جينز", "أفرولات وجمبسوت جينز", "شورتات جينز", "تنانير جينز", "بلوزات جينز", "أطقم جينز", "معاطف محشوة", "فساتين", "شامبو جاف", "حقائب دفل", "أقراط", "مقشرات", "كريم العيون", "برايمر العيون", "أقلام وجل الحواجب", "آيلاينر", "باليتات ظلال العيون", "أقنعة الوجه", "زيوت الوجه", "برايمر الوجه", "رموش صناعية", "معاطف فرو صناعي", "أحذية مسطحة", "فساتين رسمية وسهرة", "كريم أساس", "جل وشيلاك", "إكسسوارات شعر", "فرش وأمشاط", "صبغة شعر", "مجففات ومصففات شعر", "أقنعة وعلاجات الشعر", "زيت شعر", "منتجات تصفيف الشعر", "واقي حراري", "كعب عالي", "هايلايتر", "هوديز", "جاكيتات", "جينزات", "جمبسوت", "جمبسوت وأطقم حوامل", "بلوزات تريكو", "حقائب لابتوب", "ليقنز", "بليزرات خفيفة", "كارديغان خفيف", "جاكيتات خفيفة", "ملابس داخلية", "أطقم لانجري", "مرطب وعلاج الشفاه", "العناية بالشفاه", "ملمع شفاه", "محدد شفاه", "أحمر شفاه", "لوفرز", "فساتين طويلة", "ملابس منزلية", "مكياج", "فرش وأدوات مكياج", "مزيل مكياج", "أطقم مكياج", "إسفنجات مكياج", "ماسكارا", "بناطيل حوامل", "معاطف وجاكيتات حوامل", "جينز حوامل", "فساتين حوامل", "فساتين سهرة حوامل", "سويترات حوامل", "بلوزات حوامل", "فساتين ماكسي", "فساتين سهرة طويلة", "ماء ميسيلار", "فساتين ميدي", "حقائب صغيرة", "فساتين ميني", "فساتين سهرة محتشمة", "مرطبات", "ميولز", "أدوات العناية بالأظافر", "طلاء أظافر", "مزيل طلاء الأظافر", "قلادات", "كريم ليلي", "قمصان نوم", "ملابس نوم", "ملابس رضاعة", "مايوه قطعة واحدة", "بلوزات أخرى", "قمم أخرى", "معاطف طويلة", "أطقم بناطيل", "سراويل داخلية", "بناطيل", "جمبسوت حفلات", "ملابس رياضية مقاسات كبيرة", "بناطيل مقاسات كبيرة", "فساتين مقاسات كبيرة", "جمبسوت مقاسات كبيرة", "تريكو مقاسات كبيرة", "لانجري مقاسات كبيرة", "ملابس خارجية مقاسات كبيرة", "بلوزات مقاسات كبيرة", "أظافر لاصقة", "محافظ", "بناطيل بيجامة", "أطقم بيجامات", "أطقم خواتم", "روب", "صنادل", "سارونغ", "حقائب ساتشل", "العناية بفروة الرأس", "أوشحة", "سيروم", "بودرة تثبيت", "سبراي تثبيت", "شامبو", "مشدات الجسم", "أقنعة ورقية", "قمصان وأزرار", "العناية بالأحذية", "أحذية", 
"حقائب تسوق", "فساتين قصيرة", "شورتات", "شورتات نوم", "حقائب كتف", "أدوات البشرة رولر وغوا شا", "العناية بالبشرة", "أطقم عناية بالبشرة", "تنانير", "شباشب", "قمصان نوم داخلية", "سنيكرز", "جوارب", "حقائب رياضية", "حمالات صدر رياضية", "أدوات رياضية", "أطقم رياضية", "أحذية رياضية", "تنانير رياضية", "ملابس رياضية", "حمالات بدون أحزمة وقابلة للتحويل", "بناطيل بدلة", "أطقم بدلات", "بدلات", "نظارات شمسية", "واقي شمس", "أطقم سويتر", "فساتين سويتر", "تنانير سويتر", "سترات سويتر", "سويترات", "بنطلونات رياضية", "سويت شيرت", "شورتات سباحة", "ملابس سباحة", "تي شيرتات", "تيشيرتات", "بلوزات بدون أكمام", "قمم دبابات وكاميس", "تانكيني", "القبعات", "تونر", "بلوزات", "بناطيل رياضية", "معاطف ترنش", "أطقم من قطعتين", "يونيتارد", "ساعات", "فساتين زفاف", "بناطيل واسعة", "جاكيتات واقية من الرياح"]
|
tokenizer/special_tokens_map.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"cls_token": {
|
| 10 |
+
"content": "<s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"eos_token": {
|
| 17 |
+
"content": "</s>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
"mask_token": {
|
| 24 |
+
"content": "<mask>",
|
| 25 |
+
"lstrip": true,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
},
|
| 30 |
+
"pad_token": {
|
| 31 |
+
"content": "<pad>",
|
| 32 |
+
"lstrip": false,
|
| 33 |
+
"normalized": false,
|
| 34 |
+
"rstrip": false,
|
| 35 |
+
"single_word": false
|
| 36 |
+
},
|
| 37 |
+
"sep_token": {
|
| 38 |
+
"content": "</s>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false
|
| 43 |
+
},
|
| 44 |
+
"unk_token": {
|
| 45 |
+
"content": "<unk>",
|
| 46 |
+
"lstrip": false,
|
| 47 |
+
"normalized": false,
|
| 48 |
+
"rstrip": false,
|
| 49 |
+
"single_word": false
|
| 50 |
+
}
|
| 51 |
+
}
|
tokenizer/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:abe9726b02fea865ca71b5b97cd26e57cf6623ac637c24cb85e701bfeacdfad7
|
| 3 |
+
size 17082997
|
tokenizer/tokenizer_config.json
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": true,
|
| 3 |
+
"added_tokens_decoder": {
|
| 4 |
+
"0": {
|
| 5 |
+
"content": "<s>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": false,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"special": true
|
| 11 |
+
},
|
| 12 |
+
"1": {
|
| 13 |
+
"content": "<pad>",
|
| 14 |
+
"lstrip": false,
|
| 15 |
+
"normalized": false,
|
| 16 |
+
"rstrip": false,
|
| 17 |
+
"single_word": false,
|
| 18 |
+
"special": true
|
| 19 |
+
},
|
| 20 |
+
"2": {
|
| 21 |
+
"content": "</s>",
|
| 22 |
+
"lstrip": false,
|
| 23 |
+
"normalized": false,
|
| 24 |
+
"rstrip": false,
|
| 25 |
+
"single_word": false,
|
| 26 |
+
"special": true
|
| 27 |
+
},
|
| 28 |
+
"3": {
|
| 29 |
+
"content": "<unk>",
|
| 30 |
+
"lstrip": false,
|
| 31 |
+
"normalized": false,
|
| 32 |
+
"rstrip": false,
|
| 33 |
+
"single_word": false,
|
| 34 |
+
"special": true
|
| 35 |
+
},
|
| 36 |
+
"250001": {
|
| 37 |
+
"content": "<mask>",
|
| 38 |
+
"lstrip": true,
|
| 39 |
+
"normalized": false,
|
| 40 |
+
"rstrip": false,
|
| 41 |
+
"single_word": false,
|
| 42 |
+
"special": true
|
| 43 |
+
}
|
| 44 |
+
},
|
| 45 |
+
"backend": "tokenizers",
|
| 46 |
+
"bos_token": "<s>",
|
| 47 |
+
"clean_up_tokenization_spaces": true,
|
| 48 |
+
"cls_token": "<s>",
|
| 49 |
+
"eos_token": "</s>",
|
| 50 |
+
"extra_special_tokens": {},
|
| 51 |
+
"is_local": true,
|
| 52 |
+
"local_files_only": false,
|
| 53 |
+
"mask_token": "<mask>",
|
| 54 |
+
"max_length": 64,
|
| 55 |
+
"model_max_length": 8192,
|
| 56 |
+
"pad_to_multiple_of": null,
|
| 57 |
+
"pad_token": "<pad>",
|
| 58 |
+
"pad_token_type_id": 0,
|
| 59 |
+
"padding_side": "right",
|
| 60 |
+
"sep_token": "</s>",
|
| 61 |
+
"sp_model_kwargs": {},
|
| 62 |
+
"stride": 0,
|
| 63 |
+
"tokenizer_class": "XLMRobertaTokenizerFast",
|
| 64 |
+
"truncation_side": "right",
|
| 65 |
+
"truncation_strategy": "longest_first",
|
| 66 |
+
"unk_token": "<unk>"
|
| 67 |
+
}
|