rdxtremity committed on
Commit
206999a
·
verified ·
1 Parent(s): a70dc9a

Push GCN + LTR + CE artifacts

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ cross_encoder/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - retrieval
5
+ - search
6
+ - lightgbm
7
+ - cross-encoder
8
+ - bge-m3
9
+ language:
10
+ - en
11
+ - ar
12
+ ---
13
+
14
+ # Harrir Search Stack v1
15
+
16
+ Two-tower asymmetric GCN search for fashion retail (Harrir catalog ~28k SKUs),
17
+ multilingual EN+AR. Three artifact groups (GCN, LTR, cross-encoder), version-locked together.
18
+
19
+ ## Layout
20
+
21
+ ```
22
+ .
23
+ ├── adapter/ # LoRA adapter on BGE-M3 (LIVE)
24
+ ├── tokenizer/ # BGE-M3 tokenizer (LIVE)
25
+ ├── gcn_head.pt # W_q / W_p / W_img projection heads (LIVE)
26
+ ├── ltr_model.txt # LightGBM LambdaRank champion (LIVE)
27
+ ├── ltr_idf.json # BM25/TF-IDF stats over catalog
28
+ ├── ltr_subcat_*.{json,pt} # subcategory embeddings (EN+AR)
29
+ ├── ltr_spec_*.{json,pt} # product spec embeddings
30
+ └── cross_encoder/ # BAAI/bge-reranker-base fine-tuned
31
+ # NOT loaded by the app today (served from
32
+ # Modal). Kept here for future bake-in.
33
+ ```
34
+
35
+ ## Usage
36
+
37
+ The Harrir app loads these at startup. Set this repo in `.env`:
38
+
39
+ ```
40
+ GCN_HF_REPO=rdxtremity/search-stack-v1
41
+ GCN_ARTIFACTS_DIR=./models/gcn_stage2
42
+ ```
43
+
44
+ Then run `python download_models.py`, which snapshot-downloads the repo into `./models/gcn_stage2/`.
45
+
46
+ Or manually:
47
+
48
+ ```python
49
+ from huggingface_hub import snapshot_download
50
+ snapshot_download("rdxtremity/search-stack-v1", local_dir="./models/gcn_stage2")
51
+ ```
52
+
53
+ ## Eval
54
+
55
+ Baseline nDCG (golden_v1, 346 queries / 19,923 graded pairs): **0.7695**
56
+ Latest with LTR+CE rerank: see `Primary.GateRuns` audit log in the app.
adapter/adapter_config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": {
4
+ "base_model_class": "XLMRobertaModel",
5
+ "parent_library": "transformers.models.xlm_roberta.modeling_xlm_roberta"
6
+ },
7
+ "base_model_name_or_path": "BAAI/bge-m3",
8
+ "bias": "none",
9
+ "eva_config": null,
10
+ "exclude_modules": null,
11
+ "fan_in_fan_out": false,
12
+ "inference_mode": true,
13
+ "init_lora_weights": true,
14
+ "layer_replication": null,
15
+ "layers_pattern": null,
16
+ "layers_to_transform": null,
17
+ "loftq_config": {},
18
+ "lora_alpha": 16,
19
+ "lora_bias": false,
20
+ "lora_dropout": 0.05,
21
+ "megatron_config": null,
22
+ "megatron_core": "megatron.core",
23
+ "modules_to_save": null,
24
+ "peft_type": "LORA",
25
+ "r": 8,
26
+ "rank_pattern": {},
27
+ "revision": null,
28
+ "target_modules": [
29
+ "key",
30
+ "query",
31
+ "value"
32
+ ],
33
+ "task_type": null,
34
+ "use_dora": false,
35
+ "use_rslora": false
36
+ }
adapter/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a304235dbab8adfc42af4b1b54d811d166b4f35d79714938a5e542b4cfdc630d
3
+ size 4738136
cross_encoder/README.md ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - sentence-transformers
4
+ - cross-encoder
5
+ - reranker
6
+ - generated_from_trainer
7
+ - dataset_size:31340
8
+ - loss:BinaryCrossEntropyLoss
9
+ base_model: BAAI/bge-reranker-base
10
+ pipeline_tag: text-ranking
11
+ library_name: sentence-transformers
12
+ ---
13
+
14
+ # CrossEncoder based on BAAI/bge-reranker-base
15
+
16
+ This is a [Cross Encoder](https://www.sbert.net/docs/cross_encoder/usage/usage.html) model finetuned from [BAAI/bge-reranker-base](https://huggingface.co/BAAI/bge-reranker-base) using the [sentence-transformers](https://www.SBERT.net) library. It computes scores for pairs of texts, which can be used for text reranking and semantic search.
17
+
18
+ ## Model Details
19
+
20
+ ### Model Description
21
+ - **Model Type:** Cross Encoder
22
+ - **Base model:** [BAAI/bge-reranker-base](https://huggingface.co/BAAI/bge-reranker-base) <!-- at revision 2cfc18c9415c912f9d8155881c133215df768a70 -->
23
+ - **Maximum Sequence Length:** 128 tokens
24
+ - **Number of Output Labels:** 1 label
25
+ - **Supported Modality:** Text
26
+ <!-- - **Training Dataset:** Unknown -->
27
+ <!-- - **Language:** Unknown -->
28
+ <!-- - **License:** Unknown -->
29
+
30
+ ### Model Sources
31
+
32
+ - **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
33
+ - **Documentation:** [Cross Encoder Documentation](https://www.sbert.net/docs/cross_encoder/usage/usage.html)
34
+ - **Repository:** [Sentence Transformers on GitHub](https://github.com/huggingface/sentence-transformers)
35
+ - **Hugging Face:** [Cross Encoders on Hugging Face](https://huggingface.co/models?library=sentence-transformers&other=cross-encoder)
36
+
37
+ ### Full Model Architecture
38
+
39
+ ```
40
+ CrossEncoder(
41
+ (0): Transformer({'transformer_task': 'sequence-classification', 'modality_config': {'text': {'method': 'forward', 'method_output_name': 'logits'}}, 'module_output_name': 'scores', 'architecture': 'XLMRobertaForSequenceClassification'})
42
+ )
43
+ ```
44
+
45
+ ## Usage
46
+
47
+ ### Direct Usage (Sentence Transformers)
48
+
49
+ First install the Sentence Transformers library:
50
+
51
+ ```bash
52
+ pip install -U sentence-transformers
53
+ ```
54
+
55
+ Then you can load this model and run inference.
56
+ ```python
57
+ from sentence_transformers import CrossEncoder
58
+
59
+ # Load the model: "cross_encoder_model_id" is a placeholder; point it at this repo's cross_encoder/ directory
60
+ model = CrossEncoder("cross_encoder_model_id")
61
+ # Get scores for pairs of inputs
62
+ pairs = [
63
+ ['حقيبة تشانك لوكس', "Globus Women's Textured Vegan Leather Sling Bag Tan | Globus Women Tan Vegan Leather Textured Sling Bag With Detachable Strap | globus | Crossbody Bags | Tan"],
64
+ ['حريمية ماسكات كورية', 'Kappa 3-Pack Crew Socks Multicolour | Kappa Pack of 3 Crew Length Socks | Kappa | Socks | Multicolour'],
65
+ ['شسي غير مبطنة', 'Fall In Love Unlined Bodysuit | فول إن لوف بودي سوت غير مبطن | DeFacto | Body Suits | Deep Magenta'],
66
+ ['كندرة رموش مريحة للستات', 'Lift N Snatch Brow Tint Pen Black | قلم تحديد الحواجب ليفت أند سناتش رمادي أسود | NYX PROFESSIONAL MAKEUP | All Products | Black'],
67
+ ['white blouse', '2Xtremz Schiffli Ruffle Cotton Top White | 2Xtremz Regular Fit Cotton Top with Schiffli and Ruffle Detail | 2Xtremz | Blouses | White'],
68
+ ]
69
+ scores = model.predict(pairs)
70
+ print(scores)
71
+ # [0.9418 0.0044 0.978 0.2881 0.9463]
72
+
73
+ # Or rank different texts based on similarity to a single text
74
+ ranks = model.rank(
75
+ 'حقيبة تشانك لوكس',
76
+ [
77
+ "Globus Women's Textured Vegan Leather Sling Bag Tan | Globus Women Tan Vegan Leather Textured Sling Bag With Detachable Strap | globus | Crossbody Bags | Tan",
78
+ 'Kappa 3-Pack Crew Socks Multicolour | Kappa Pack of 3 Crew Length Socks | Kappa | Socks | Multicolour',
79
+ 'Fall In Love Unlined Bodysuit | فول إن لوف بودي سوت غير مبطن | DeFacto | Body Suits | Deep Magenta',
80
+ 'Lift N Snatch Brow Tint Pen Black | قلم تحديد الحواجب ليفت أند سناتش رمادي أسود | NYX PROFESSIONAL MAKEUP | All Products | Black',
81
+ '2Xtremz Schiffli Ruffle Cotton Top White | 2Xtremz Regular Fit Cotton Top with Schiffli and Ruffle Detail | 2Xtremz | Blouses | White',
82
+ ]
83
+ )
84
+ # [{'corpus_id': ..., 'score': ...}, {'corpus_id': ..., 'score': ...}, ...]
85
+ ```
86
+
87
+ <!--
88
+ ### Direct Usage (Transformers)
89
+
90
+ <details><summary>Click to see the direct usage in Transformers</summary>
91
+
92
+ </details>
93
+ -->
94
+
95
+ <!--
96
+ ### Downstream Usage (Sentence Transformers)
97
+
98
+ You can finetune this model on your own dataset.
99
+
100
+ <details><summary>Click to expand</summary>
101
+
102
+ </details>
103
+ -->
104
+
105
+ <!--
106
+ ### Out-of-Scope Use
107
+
108
+ *List how the model may foreseeably be misused and address what users ought not to do with the model.*
109
+ -->
110
+
111
+ <!--
112
+ ## Bias, Risks and Limitations
113
+
114
+ *What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
115
+ -->
116
+
117
+ <!--
118
+ ### Recommendations
119
+
120
+ *What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
121
+ -->
122
+
123
+ ## Training Details
124
+
125
+ ### Training Dataset
126
+
127
+ #### Unnamed Dataset
128
+
129
+ * Size: 31,340 training samples
130
+ * Columns: <code>sentence_0</code>, <code>sentence_1</code>, and <code>label</code>
131
+ * Approximate statistics based on the first 100 samples:
132
+ | | sentence_0 | sentence_1 | label |
133
+ |:---------|:---------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------|:---------------------------------------------------------------|
134
+ | type | string | string | float |
135
+ | modality | text | text | |
136
+ | details | <ul><li>min: 3 tokens</li><li>mean: 7.44 tokens</li><li>max: 19 tokens</li></ul> | <ul><li>min: 19 tokens</li><li>mean: 37.84 tokens</li><li>max: 57 tokens</li></ul> | <ul><li>min: 0.0</li><li>mean: 0.69</li><li>max: 1.0</li></ul> |
137
+ * Samples:
138
+ | sentence_0 | sentence_1 | label |
139
+ |:---------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------|
140
+ | <code>حقيبة تشانك لوكس</code> | <code>Globus Women's Textured Vegan Leather Sling Bag Tan \| Globus Women Tan Vegan Leather Textured Sling Bag With Detachable Strap \| globus \| Crossbody Bags \| Tan</code> | <code>1.0</code> |
141
+ | <code>حريمية ماسكات كورية</code> | <code>Kappa 3-Pack Crew Socks Multicolour \| Kappa Pack of 3 Crew Length Socks \| Kappa \| Socks \| Multicolour</code> | <code>0.0</code> |
142
+ | <code>شسي غير مبطنة</code> | <code>Fall In Love Unlined Bodysuit \| فول إن لوف بودي سوت غير مبطن \| DeFacto \| Body Suits \| Deep Magenta</code> | <code>1.0</code> |
143
+ * Loss: [<code>BinaryCrossEntropyLoss</code>](https://sbert.net/docs/package_reference/cross_encoder/losses.html#binarycrossentropyloss) with these parameters:
144
+ ```json
145
+ {
146
+ "activation_fn": "torch.nn.modules.linear.Identity",
147
+ "pos_weight": null
148
+ }
149
+ ```
150
+
151
+ ### Training Hyperparameters
152
+ #### Non-Default Hyperparameters
153
+
154
+ - `per_device_train_batch_size`: 32
155
+ - `per_device_eval_batch_size`: 32
156
+ - `fp16`: True
157
+ - `disable_tqdm`: True
158
+
159
+ #### All Hyperparameters
160
+ <details><summary>Click to expand</summary>
161
+
162
+ - `overwrite_output_dir`: False
163
+ - `do_predict`: False
164
+ - `prediction_loss_only`: True
165
+ - `per_device_train_batch_size`: 32
166
+ - `per_device_eval_batch_size`: 32
167
+ - `per_gpu_train_batch_size`: None
168
+ - `per_gpu_eval_batch_size`: None
169
+ - `gradient_accumulation_steps`: 1
170
+ - `eval_accumulation_steps`: None
171
+ - `torch_empty_cache_steps`: None
172
+ - `learning_rate`: 5e-05
173
+ - `weight_decay`: 0.0
174
+ - `adam_beta1`: 0.9
175
+ - `adam_beta2`: 0.999
176
+ - `adam_epsilon`: 1e-08
177
+ - `max_grad_norm`: 1
178
+ - `num_train_epochs`: 3
179
+ - `max_steps`: -1
180
+ - `lr_scheduler_type`: linear
181
+ - `lr_scheduler_kwargs`: {}
182
+ - `warmup_ratio`: 0.0
183
+ - `warmup_steps`: 0
184
+ - `log_level`: passive
185
+ - `log_level_replica`: warning
186
+ - `log_on_each_node`: True
187
+ - `logging_nan_inf_filter`: True
188
+ - `save_safetensors`: True
189
+ - `save_on_each_node`: False
190
+ - `save_only_model`: False
191
+ - `restore_callback_states_from_checkpoint`: False
192
+ - `no_cuda`: False
193
+ - `use_cpu`: False
194
+ - `use_mps_device`: False
195
+ - `seed`: 42
196
+ - `data_seed`: None
197
+ - `jit_mode_eval`: False
198
+ - `use_ipex`: False
199
+ - `bf16`: False
200
+ - `fp16`: True
201
+ - `fp16_opt_level`: O1
202
+ - `half_precision_backend`: auto
203
+ - `bf16_full_eval`: False
204
+ - `fp16_full_eval`: False
205
+ - `tf32`: None
206
+ - `local_rank`: 0
207
+ - `ddp_backend`: None
208
+ - `tpu_num_cores`: None
209
+ - `tpu_metrics_debug`: False
210
+ - `debug`: []
211
+ - `dataloader_drop_last`: False
212
+ - `dataloader_num_workers`: 0
213
+ - `dataloader_prefetch_factor`: None
214
+ - `past_index`: -1
215
+ - `disable_tqdm`: True
216
+ - `remove_unused_columns`: True
217
+ - `label_names`: None
218
+ - `load_best_model_at_end`: False
219
+ - `ignore_data_skip`: False
220
+ - `fsdp`: []
221
+ - `fsdp_min_num_params`: 0
222
+ - `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
223
+ - `fsdp_transformer_layer_cls_to_wrap`: None
224
+ - `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
225
+ - `deepspeed`: None
226
+ - `label_smoothing_factor`: 0.0
227
+ - `optim`: adamw_torch
228
+ - `optim_args`: None
229
+ - `adafactor`: False
230
+ - `group_by_length`: False
231
+ - `length_column_name`: length
232
+ - `ddp_find_unused_parameters`: None
233
+ - `ddp_bucket_cap_mb`: None
234
+ - `ddp_broadcast_buffers`: False
235
+ - `dataloader_pin_memory`: True
236
+ - `dataloader_persistent_workers`: False
237
+ - `skip_memory_metrics`: True
238
+ - `use_legacy_prediction_loop`: False
239
+ - `push_to_hub`: False
240
+ - `resume_from_checkpoint`: None
241
+ - `hub_model_id`: None
242
+ - `hub_strategy`: every_save
243
+ - `hub_private_repo`: None
244
+ - `hub_always_push`: False
245
+ - `gradient_checkpointing`: False
246
+ - `gradient_checkpointing_kwargs`: None
247
+ - `include_inputs_for_metrics`: False
248
+ - `include_for_metrics`: []
249
+ - `eval_do_concat_batches`: True
250
+ - `fp16_backend`: auto
251
+ - `push_to_hub_model_id`: None
252
+ - `push_to_hub_organization`: None
253
+ - `mp_parameters`:
254
+ - `auto_find_batch_size`: False
255
+ - `full_determinism`: False
256
+ - `torchdynamo`: None
257
+ - `ray_scope`: last
258
+ - `ddp_timeout`: 1800
259
+ - `torch_compile`: False
260
+ - `torch_compile_backend`: None
261
+ - `torch_compile_mode`: None
262
+ - `dispatch_batches`: None
263
+ - `split_batches`: None
264
+ - `include_tokens_per_second`: False
265
+ - `include_num_input_tokens_seen`: False
266
+ - `neftune_noise_alpha`: None
267
+ - `optim_target_modules`: None
268
+ - `batch_eval_metrics`: False
269
+ - `eval_on_start`: False
270
+ - `use_liger_kernel`: False
271
+ - `eval_use_gather_object`: False
272
+ - `average_tokens_across_devices`: False
273
+ - `prompts`: None
274
+ - `batch_sampler`: batch_sampler
275
+ - `multi_dataset_batch_sampler`: proportional
276
+ - `router_mapping`: {}
277
+ - `learning_rate_mapping`: {}
278
+
279
+ </details>
280
+
281
+ ### Training Logs
282
+ | Epoch | Step | Training Loss |
283
+ |:------:|:----:|:-------------:|
284
+ | 0.5102 | 500 | 0.6826 |
285
+ | 1.0204 | 1000 | 0.4261 |
286
+ | 1.5306 | 1500 | 0.3741 |
287
+ | 2.0408 | 2000 | 0.3523 |
288
+ | 2.5510 | 2500 | 0.3300 |
289
+
290
+
291
+ ### Training Time
292
+ - **Training**: 5.3 minutes
293
+
294
+ ### Framework Versions
295
+ - Python: 3.11.12
296
+ - Sentence Transformers: 5.5.1
297
+ - Transformers: 4.49.0
298
+ - PyTorch: 2.7.0+cu128
299
+ - Accelerate: 1.13.0
300
+ - Datasets: 4.8.5
301
+ - Tokenizers: 0.21.4
302
+
303
+ ## Additional Resources
304
+
305
+ - [Training and Finetuning Reranker Models with Sentence Transformers](https://huggingface.co/blog/train-reranker): the end-to-end guide for training or finetuning Cross Encoder (reranker) models.
306
+ - [Multimodal Embedding & Reranker Models with Sentence Transformers](https://huggingface.co/blog/multimodal-sentence-transformers): use text, image, audio, and video reranker models through the same API.
307
+ - [Training and Finetuning Multimodal Embedding & Reranker Models with Sentence Transformers](https://huggingface.co/blog/train-multimodal-sentence-transformers): training multimodal Cross Encoders.
308
+
309
+ ## Citation
310
+
311
+ ### BibTeX
312
+
313
+ #### Sentence Transformers
314
+ ```bibtex
315
+ @inproceedings{reimers-2019-sentence-bert,
316
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
317
+ author = "Reimers, Nils and Gurevych, Iryna",
318
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
319
+ month = "11",
320
+ year = "2019",
321
+ publisher = "Association for Computational Linguistics",
322
+ url = "https://arxiv.org/abs/1908.10084",
323
+ }
324
+ ```
325
+
326
+ <!--
327
+ ## Glossary
328
+
329
+ *Clearly define terms in order to be accessible across audiences.*
330
+ -->
331
+
332
+ <!--
333
+ ## Model Card Authors
334
+
335
+ *Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
336
+ -->
337
+
338
+ <!--
339
+ ## Model Card Contact
340
+
341
+ *Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
342
+ -->
cross_encoder/config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "BAAI/bge-reranker-base",
3
+ "architectures": [
4
+ "XLMRobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "id2label": {
14
+ "0": "LABEL_0"
15
+ },
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 3072,
18
+ "label2id": {
19
+ "LABEL_0": 0
20
+ },
21
+ "layer_norm_eps": 1e-05,
22
+ "max_position_embeddings": 514,
23
+ "model_type": "xlm-roberta",
24
+ "num_attention_heads": 12,
25
+ "num_hidden_layers": 12,
26
+ "output_past": true,
27
+ "pad_token_id": 1,
28
+ "position_embedding_type": "absolute",
29
+ "torch_dtype": "float32",
30
+ "transformers_version": "4.49.0",
31
+ "type_vocab_size": 1,
32
+ "use_cache": true,
33
+ "vocab_size": 250002
34
+ }
cross_encoder/config_sentence_transformers.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "pytorch": "2.7.0+cu128",
4
+ "sentence_transformers": "5.5.1",
5
+ "transformers": "4.49.0"
6
+ },
7
+ "activation_fn": "torch.nn.modules.activation.Sigmoid",
8
+ "default_prompt_name": null,
9
+ "model_type": "CrossEncoder",
10
+ "prompts": {}
11
+ }
cross_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6ff0ad22d85fff1bf93551097d3653a6cc9fc96103db654fdd2847a3f75400f
3
+ size 1112201932
cross_encoder/modules.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.base.modules.transformer.Transformer"
7
+ }
8
+ ]
cross_encoder/sentence_bert_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "transformer_task": "sequence-classification",
3
+ "modality_config": {
4
+ "text": {
5
+ "method": "forward",
6
+ "method_output_name": "logits"
7
+ }
8
+ },
9
+ "module_output_name": "scores"
10
+ }
cross_encoder/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
cross_encoder/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ae4855312bea5f8100cf97370091e0c5232dca305c756d98ebce45c2ea383cc
3
+ size 17083051
cross_encoder/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "250001": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": true,
46
+ "cls_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "extra_special_tokens": {},
49
+ "mask_token": "<mask>",
50
+ "model_max_length": 128,
51
+ "pad_token": "<pad>",
52
+ "sep_token": "</s>",
53
+ "sp_model_kwargs": {},
54
+ "tokenizer_class": "XLMRobertaTokenizer",
55
+ "unk_token": "<unk>"
56
+ }
gcn_head.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:173f9566f3b9275e9722dd96c861bc77f607c6e37db33dcf600d3c02868514d4
3
+ size 19938337
ltr_idf.json ADDED
The diff for this file is too large to render. See raw diff
 
ltr_model.txt ADDED
The diff for this file is too large to render. See raw diff
 
ltr_spec_asins.json ADDED
The diff for this file is too large to render. See raw diff
 
ltr_spec_embs.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:489299985d47e21770912eab0c0a9a8583b0ec7636cd92b4e65a217bd8f37519
3
+ size 114706003
ltr_subcat_canonical.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["Accessories", "Acne & Blemish Care", "All Products", "Anklets", "Athletic Leggings", "Athletic Shorts", "Athletic Tops", "BB & CC Cream", "Backpacks", "Bags", "Barrel", "Belt Bags", "Belts", "Bikinis", "Blazers", "Blouses", "Blush", "Body Suits", "Bodysuits (Casual)", "Bodysuits (Intimate)", "Boots", "Bottoms", "Boyfriend", "Bracelets", "Bras", "Bridal Lingerie", "Bridesmaid Dresses", "Bronzer & Contour", "Bucket Bags", "Bundles", "Cardigans", "Cleansers", "Clutches", "Co-ords", "Coats", "Cocktail Dresses", "Concealer", "Conditioner", "Cosmetic Bags", "Cover-Ups", "Crop Tops", "Crossbody Bags", "Culottes", "Cuticle Care", "Denim Dresses", "Denim Jackets & Coats", "Denim Overalls & Jumpsuits", "Denim Shorts", "Denim Skirts", "Denim Tops", "Denim Two-piece Outfits", "Down Coats", "Dresses", "Dry Shampoo", "Duffle Bags", "Earrings", "Exfoliators & Scrubs", "Eye Cream", "Eye Primer", "Eyebrow Pencils & Gels", "Eyeliner", "Eyeshadow Palettes", "Face Masks", "Face Oils", "Face Primer", "False Lashes", "Faux Fur Coats", "Flats & Slip-Ons", "Formal & Evening Dresses", "Foundation", "Gel & Shellac", "Hair Accessories", "Hair Brushes & Combs", "Hair Color", "Hair Dryers & Stylers", "Hair Masks & Treatments", "Hair Oil", "Hair Styling Products", "Heat Protectant", "Heels", "Highlighter", "Hoodies", "Jackets", "Jeans", "Jumpsuits", "Jumpsuits & Two-pieces", "Knit Tops", "Laptop Bags & Sleeves", "Leggings", "Lightweight Blazers", "Lightweight Cardigans", "Lightweight Jackets", "Lingerie & Underwear", "Lingerie Sets", "Lip Balm & Treatment", "Lip Care", "Lip Gloss", "Lip Liner", "Lipstick", "Loafers", "Long Dresses", "Loungewear", "Makeup", "Makeup Brushes & Tools", "Makeup Remover", "Makeup Sets & Kits", "Makeup Sponges", "Mascara", "Maternity Bottoms", "Maternity Coats & Jackets", "Maternity Denim", "Maternity Dresses", "Maternity Gowns", "Maternity Sweaters", "Maternity Tops", "Maxi Dresses", "Maxi Party Dresses", "Micellar Water", "Midi Dresses", "Mini Bags", "Mini 
Dresses", "Modest Evening Dresses", "Moisturizers", "Mules", "Nail Care Tools", "Nail Polish", "Nail Polish Remover", "Necklaces", "Night Cream", "Nighties", "Nightwear", "Nursing", "One-Pieces", "Other Tops", "Overcoats", "Pant Sets", "Panties", "Pants", "Party Jumpsuits", "Plus Size Activewear", "Plus Size Bottoms", "Plus Size Dresses", "Plus Size Jumpsuits", "Plus Size Knitwear", "Plus Size Lingerie", "Plus Size Outerwear", "Plus Size Tops", "Press-On Nails", "Purses", "Pyjama Pants", "Pyjama Sets", "Ring Sets", "Robes", "Sandals", "Sarongs", "Satchels", "Scalp Care", "Scarves", "Serums", "Setting Powder", "Setting Spray", "Shampoo", "Shapewear", "Sheet Masks", "Shirts & Button-Downs", "Shoe Care", "Shoes", "Shoppers & Totes", "Short Dresses", "Shorts", "Shoulder Bags", "Skin Tools (Rollers, Gua Sha)", "Skincare", "Skincare Sets", "Skirts", "Slides & Flip-Flops", "Slips", "Sneakers", "Socks", "Sports Bags", "Sports Bras", "Sports Gea", "Sports Sets", "Sports Shoes", "Sports Skirts", "Sportswear", "Strapless & Convertible Bras", "Suit Pants", "Suit Sets", "Suits", "Sunglasses", "Sunscreen & SPF", "Sweater Co-ords", "Sweater Dresses", "Sweater Skirts", "Sweater Vests", "Sweaters", "Sweatpants", "Sweatshirts", "Swim Shorts", "Swimwear", "T-shirts", "Tank Tops & Camis", "Tankinis", "The Hat Store", "Toners", "Tops", "Track Pants", "Trench Coats", "Two-piece Outfits", "Unitards", "Watches", "Wedding Dresses", "Wide-Leg Pants", "Windbreakers", "Accessories", "Acne & Blemish Care", "All Products", "Anklets", "Athletic Leggings", "Athletic Shorts", "Athletic Tops", "BB & CC Cream", "Backpacks", "Bags", "Belt Bags", "Belts", "Bikinis", "Blazers", "Blouses", "Blush", "Body Suits", "Bodysuits (Casual)", "Bodysuits (Intimate)", "Boots", "Bottoms", "Bracelets", "Bras", "Bridal Lingerie", "Bridesmaid Dresses", "Bronzer & Contour", "Bucket Bags", "Bundles", "Cardigans", "Cleansers", "Clutches", "Co-ords", "Coats", "Cocktail Dresses", "Concealer", "Conditioner", "Cosmetic 
Bags", "Cover-Ups", "Crop Tops", "Crossbody Bags", "Culottes", "Cuticle Care", "Denim Dresses", "Denim Jackets & Coats", "Denim Overalls & Jumpsuits", "Denim Shorts", "Denim Skirts", "Denim Tops", "Denim Two-piece Outfits", "Down Coats", "Dresses", "Dry Shampoo", "Duffle Bags", "Earrings", "Exfoliators & Scrubs", "Eye Cream", "Eye Primer", "Eyebrow Pencils & Gels", "Eyeliner", "Eyeshadow Palettes", "Face Masks", "Face Oils", "Face Primer", "False Lashes", "Faux Fur Coats", "Flats & Slip-Ons", "Formal & Evening Dresses", "Foundation", "Gel & Shellac", "Hair Accessories", "Hair Brushes & Combs", "Hair Color", "Hair Dryers & Stylers", "Hair Masks & Treatments", "Hair Oil", "Hair Styling Products", "Heat Protectant", "Heels", "Highlighter", "Hoodies", "Jackets", "Jeans", "Jumpsuits", "Jumpsuits & Two-pieces", "Knit Tops", "Laptop Bags & Sleeves", "Leggings", "Lightweight Blazers", "Lightweight Cardigans", "Lightweight Jackets", "Lingerie & Underwear", "Lingerie Sets", "Lip Balm & Treatment", "Lip Care", "Lip Gloss", "Lip Liner", "Lipstick", "Loafers", "Long Dresses", "Loungewear", "Makeup", "Makeup Brushes & Tools", "Makeup Remover", "Makeup Sets & Kits", "Makeup Sponges", "Mascara", "Maternity Bottoms", "Maternity Coats & Jackets", "Maternity Denim", "Maternity Dresses", "Maternity Gowns", "Maternity Sweaters", "Maternity Tops", "Maxi Dresses", "Maxi Party Dresses", "Micellar Water", "Midi Dresses", "Mini Bags", "Mini Dresses", "Modest Evening Dresses", "Moisturizers", "Mules", "Nail Care Tools", "Nail Polish", "Nail Polish Remover", "Necklaces", "Night Cream", "Nighties", "Nightwear", "Nursing", "One-Pieces", "Other Tops", "Other Tops", "Overcoats", "Pant Sets", "Panties", "Pants", "Party Jumpsuits", "Plus Size Activewear", "Plus Size Bottoms", "Plus Size Dresses", "Plus Size Jumpsuits", "Plus Size Knitwear", "Plus Size Lingerie", "Plus Size Outerwear", "Plus Size Tops", "Press-On Nails", "Purses", "Pyjama Pants", "Pyjama Sets", "Ring Sets", "Robes", "Sandals", 
"Sarongs", "Satchels", "Scalp Care", "Scarves", "Serums", "Setting Powder", "Setting Spray", "Shampoo", "Shapewear", "Sheet Masks", "Shirts & Button-Downs", "Shoe Care", "Shoes", "Shoppers & Totes", "Short Dresses", "Shorts", "Shorts", "Shoulder Bags", "Skin Tools (Rollers, Gua Sha)", "Skincare", "Skincare Sets", "Skirts", "Slides & Flip-Flops", "Slips", "Sneakers", "Socks", "Sports Bags", "Sports Bras", "Sports Gea", "Sports Sets", "Sports Shoes", "Sports Skirts", "Sportswear", "Strapless & Convertible Bras", "Suit Pants", "Suit Sets", "Suits", "Sunglasses", "Sunscreen & SPF", "Sweater Co-ords", "Sweater Dresses", "Sweater Skirts", "Sweater Vests", "Sweaters", "Sweatpants", "Sweatshirts", "Swim Shorts", "Swimwear", "T-shirts", "T-shirts", "Tank Tops & Camis", "Tank Tops & Camis", "Tankinis", "The Hat Store", "Toners", "Tops", "Track Pants", "Trench Coats", "Two-piece Outfits", "Unitards", "Watches", "Wedding Dresses", "Wide-Leg Pants", "Windbreakers"]
ltr_subcat_embs.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:997ca747f4998bb6be8e5c678bb7d2900943656929ba0d7928181882ffe8ca7d
3
+ size 1771105
ltr_subcat_names.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["Accessories", "Acne & Blemish Care", "All Products", "Anklets", "Athletic Leggings", "Athletic Shorts", "Athletic Tops", "BB & CC Cream", "Backpacks", "Bags", "Barrel", "Belt Bags", "Belts", "Bikinis", "Blazers", "Blouses", "Blush", "Body Suits", "Bodysuits (Casual)", "Bodysuits (Intimate)", "Boots", "Bottoms", "Boyfriend", "Bracelets", "Bras", "Bridal Lingerie", "Bridesmaid Dresses", "Bronzer & Contour", "Bucket Bags", "Bundles", "Cardigans", "Cleansers", "Clutches", "Co-ords", "Coats", "Cocktail Dresses", "Concealer", "Conditioner", "Cosmetic Bags", "Cover-Ups", "Crop Tops", "Crossbody Bags", "Culottes", "Cuticle Care", "Denim Dresses", "Denim Jackets & Coats", "Denim Overalls & Jumpsuits", "Denim Shorts", "Denim Skirts", "Denim Tops", "Denim Two-piece Outfits", "Down Coats", "Dresses", "Dry Shampoo", "Duffle Bags", "Earrings", "Exfoliators & Scrubs", "Eye Cream", "Eye Primer", "Eyebrow Pencils & Gels", "Eyeliner", "Eyeshadow Palettes", "Face Masks", "Face Oils", "Face Primer", "False Lashes", "Faux Fur Coats", "Flats & Slip-Ons", "Formal & Evening Dresses", "Foundation", "Gel & Shellac", "Hair Accessories", "Hair Brushes & Combs", "Hair Color", "Hair Dryers & Stylers", "Hair Masks & Treatments", "Hair Oil", "Hair Styling Products", "Heat Protectant", "Heels", "Highlighter", "Hoodies", "Jackets", "Jeans", "Jumpsuits", "Jumpsuits & Two-pieces", "Knit Tops", "Laptop Bags & Sleeves", "Leggings", "Lightweight Blazers", "Lightweight Cardigans", "Lightweight Jackets", "Lingerie & Underwear", "Lingerie Sets", "Lip Balm & Treatment", "Lip Care", "Lip Gloss", "Lip Liner", "Lipstick", "Loafers", "Long Dresses", "Loungewear", "Makeup", "Makeup Brushes & Tools", "Makeup Remover", "Makeup Sets & Kits", "Makeup Sponges", "Mascara", "Maternity Bottoms", "Maternity Coats & Jackets", "Maternity Denim", "Maternity Dresses", "Maternity Gowns", "Maternity Sweaters", "Maternity Tops", "Maxi Dresses", "Maxi Party Dresses", "Micellar Water", "Midi Dresses", "Mini Bags", "Mini 
Dresses", "Modest Evening Dresses", "Moisturizers", "Mules", "Nail Care Tools", "Nail Polish", "Nail Polish Remover", "Necklaces", "Night Cream", "Nighties", "Nightwear", "Nursing", "One-Pieces", "Other Tops", "Overcoats", "Pant Sets", "Panties", "Pants", "Party Jumpsuits", "Plus Size Activewear", "Plus Size Bottoms", "Plus Size Dresses", "Plus Size Jumpsuits", "Plus Size Knitwear", "Plus Size Lingerie", "Plus Size Outerwear", "Plus Size Tops", "Press-On Nails", "Purses", "Pyjama Pants", "Pyjama Sets", "Ring Sets", "Robes", "Sandals", "Sarongs", "Satchels", "Scalp Care", "Scarves", "Serums", "Setting Powder", "Setting Spray", "Shampoo", "Shapewear", "Sheet Masks", "Shirts & Button-Downs", "Shoe Care", "Shoes", "Shoppers & Totes", "Short Dresses", "Shorts", "Shoulder Bags", "Skin Tools (Rollers, Gua Sha)", "Skincare", "Skincare Sets", "Skirts", "Slides & Flip-Flops", "Slips", "Sneakers", "Socks", "Sports Bags", "Sports Bras", "Sports Gea", "Sports Sets", "Sports Shoes", "Sports Skirts", "Sportswear", "Strapless & Convertible Bras", "Suit Pants", "Suit Sets", "Suits", "Sunglasses", "Sunscreen & SPF", "Sweater Co-ords", "Sweater Dresses", "Sweater Skirts", "Sweater Vests", "Sweaters", "Sweatpants", "Sweatshirts", "Swim Shorts", "Swimwear", "T-shirts", "Tank Tops & Camis", "Tankinis", "The Hat Store", "Toners", "Tops", "Track Pants", "Trench Coats", "Two-piece Outfits", "Unitards", "Watches", "Wedding Dresses", "Wide-Leg Pants", "Windbreakers", "إكسسوارات", "علاج حب الشباب", "كل المنتجات", "خلاخيل", "ليقنز رياضي", "شورتات رياضية", "بلوزات رياضية", "بي بي وسي سي كريم", "حقائب ظهر", "حقائب", "حقائب خصر", "أحزمة", "بيكيني", "بليزرات", "بلوزات", "بلاشر", "بودي سوت", "بودي سوت كاجوال", "بودي سوت لانجري", "بوتات", "بناطيل وتنانير", "أساور", "حمالات صدر", "لانجري عرائس", "فساتين وصيفات العروس", "برونزر وكونتور", "حقائب باكيت", "عروض أحذية", "كارديغان", "غسول الوجه", "كلاتش", "أطقم متناسقة", "معاطف", "فساتين كوكتيل", "كونسيلر", "بلسم", "حقائب مكياج", "أغطية شاطئ", "كروب توب", 
"حقائب كروس", "كيلوت", "العناية بالجلد المحيط بالأظافر", "فساتين جينز", "جاكيتات ومعاطف جينز", "أفرولات وجمبسوت جينز", "شورتات جينز", "تنانير جينز", "بلوزات جينز", "أطقم جينز", "معاطف محشوة", "فساتين", "شامبو جاف", "حقائب دفل", "أقراط", "مقشرات", "كريم العيون", "برايمر العيون", "أقلام وجل الحواجب", "آيلاينر", "باليتات ظلال العيون", "أقنعة الوجه", "زيوت الوجه", "برايمر الوجه", "رموش صناعية", "معاطف فرو صناعي", "أحذية مسطحة", "فساتين رسمية وسهرة", "كريم أساس", "جل وشيلاك", "إكسسوارات شعر", "فرش وأمشاط", "صبغة شعر", "مجففات ومصففات شعر", "أقنعة وعلاجات الشعر", "زيت شعر", "منتجات تصفيف الشعر", "واقي حراري", "كعب عالي", "هايلايتر", "هوديز", "جاكيتات", "جينزات", "جمبسوت", "جمبسوت وأطقم حوامل", "بلوزات تريكو", "حقائب لابتوب", "ليقنز", "بليزرات خفيفة", "كارديغان خفيف", "جاكيتات خفيفة", "ملابس داخلية", "أطقم لانجري", "مرطب وعلاج الشفاه", "العناية بالشفاه", "ملمع شفاه", "محدد شفاه", "أحمر شفاه", "لوفرز", "فساتين طويلة", "ملابس منزلية", "مكياج", "فرش وأدوات مكياج", "مزيل مكياج", "أطقم مكياج", "إسفنجات مكياج", "ماسكارا", "بناطيل حوامل", "معاطف وجاكيتات حوامل", "جينز حوامل", "فساتين حوامل", "فساتين سهرة حوامل", "سويترات حوامل", "بلوزات حوامل", "فساتين ماكسي", "فساتين سهرة طويلة", "ماء ميسيلار", "فساتين ميدي", "حقائب صغيرة", "فساتين ميني", "فساتين سهرة محتشمة", "مرطبات", "ميولز", "أدوات العناية بالأظافر", "طلاء أظافر", "مزيل طلاء الأظافر", "قلادات", "كريم ليلي", "قمصان نوم", "ملابس نوم", "ملابس رضاعة", "مايوه قطعة واحدة", "بلوزات أخرى", "قمم أخرى", "معاطف طويلة", "أطقم بناطيل", "سراويل داخلية", "بناطيل", "جمبسوت حفلات", "ملابس رياضية مقاسات كبيرة", "بناطيل مقاسات كبيرة", "فساتين مقاسات كبيرة", "جمبسوت مقاسات كبيرة", "تريكو مقاسات كبيرة", "لانجري مقاسات كبيرة", "ملابس خارجية مقاسات كبيرة", "بلوزات مقاسات كبيرة", "أظافر لاصقة", "محافظ", "بناطيل بيجامة", "أطقم بيجامات", "أطقم خواتم", "روب", "صنادل", "سارونغ", "حقائب ساتشل", "العناية بفروة الرأس", "أوشحة", "سيروم", "بودرة تثبيت", "سبراي تثبيت", "شامبو", "مشدات الجسم", "أقنعة ورقية", "قمصان وأزرار", "العناية بالأحذية", "أحذية", 
"حقائب تسوق", "فساتين قصيرة", "شورتات", "شورتات نوم", "حقائب كتف", "أدوات البشرة رولر وغوا شا", "العناية بالبشرة", "أطقم عناية بالبشرة", "تنانير", "شباشب", "قمصان نوم داخلية", "سنيكرز", "جوارب", "حقائب رياضية", "حمالات صدر رياضية", "أدوات رياضية", "أطقم رياضية", "أحذية رياضية", "تنانير رياضية", "ملابس رياضية", "حمالات بدون أحزمة وقابلة للتحويل", "بناطيل بدلة", "أطقم بدلات", "بدلات", "نظارات شمسية", "واقي شمس", "أطقم سويتر", "فساتين سويتر", "تنانير سويتر", "سترات سويتر", "سويترات", "بنطلونات رياضية", "سويت شيرت", "شورتات سباحة", "ملابس سباحة", "تي شيرتات", "تيشيرتات", "بلوزات بدون أكمام", "قمم دبابات وكاميس", "تانكيني", "القبعات", "تونر", "بلوزات", "بناطيل رياضية", "معاطف ترنش", "أطقم من قطعتين", "يونيتارد", "ساعات", "فساتين زفاف", "بناطيل واسعة", "جاكيتات واقية من الرياح"]
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abe9726b02fea865ca71b5b97cd26e57cf6623ac637c24cb85e701bfeacdfad7
3
+ size 17082997
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": true,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "250001": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "backend": "tokenizers",
46
+ "bos_token": "<s>",
47
+ "clean_up_tokenization_spaces": true,
48
+ "cls_token": "<s>",
49
+ "eos_token": "</s>",
50
+ "extra_special_tokens": {},
51
+ "is_local": true,
52
+ "local_files_only": false,
53
+ "mask_token": "<mask>",
54
+ "max_length": 64,
55
+ "model_max_length": 8192,
56
+ "pad_to_multiple_of": null,
57
+ "pad_token": "<pad>",
58
+ "pad_token_type_id": 0,
59
+ "padding_side": "right",
60
+ "sep_token": "</s>",
61
+ "sp_model_kwargs": {},
62
+ "stride": 0,
63
+ "tokenizer_class": "XLMRobertaTokenizerFast",
64
+ "truncation_side": "right",
65
+ "truncation_strategy": "longest_first",
66
+ "unk_token": "<unk>"
67
+ }