mmitsui-shopify committed on
Commit
4ad230e
·
verified ·
1 Parent(s): 56c99e7

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +162 -0
  2. checkpoint-1200/config.json +47 -0
  3. checkpoint-1200/model.safetensors +3 -0
  4. checkpoint-1200/optimizer.pt +3 -0
  5. checkpoint-1200/rng_state.pth +3 -0
  6. checkpoint-1200/scheduler.pt +3 -0
  7. checkpoint-1200/tokenizer.json +0 -0
  8. checkpoint-1200/tokenizer_config.json +14 -0
  9. checkpoint-1200/trainer_state.json +166 -0
  10. checkpoint-1200/training_args.bin +3 -0
  11. checkpoint-1500/config.json +47 -0
  12. checkpoint-1500/model.safetensors +3 -0
  13. checkpoint-1500/optimizer.pt +3 -0
  14. checkpoint-1500/rng_state.pth +3 -0
  15. checkpoint-1500/scheduler.pt +3 -0
  16. checkpoint-1500/tokenizer.json +0 -0
  17. checkpoint-1500/tokenizer_config.json +14 -0
  18. checkpoint-1500/trainer_state.json +199 -0
  19. checkpoint-1500/training_args.bin +3 -0
  20. checkpoint-1800/config.json +47 -0
  21. checkpoint-1800/model.safetensors +3 -0
  22. checkpoint-1800/optimizer.pt +3 -0
  23. checkpoint-1800/rng_state.pth +3 -0
  24. checkpoint-1800/scheduler.pt +3 -0
  25. checkpoint-1800/tokenizer.json +0 -0
  26. checkpoint-1800/tokenizer_config.json +14 -0
  27. checkpoint-1800/trainer_state.json +232 -0
  28. checkpoint-1800/training_args.bin +3 -0
  29. checkpoint-2100/config.json +47 -0
  30. checkpoint-2100/model.safetensors +3 -0
  31. checkpoint-2100/optimizer.pt +3 -0
  32. checkpoint-2100/rng_state.pth +3 -0
  33. checkpoint-2100/scheduler.pt +3 -0
  34. checkpoint-2100/tokenizer.json +0 -0
  35. checkpoint-2100/tokenizer_config.json +14 -0
  36. checkpoint-2100/trainer_state.json +265 -0
  37. checkpoint-2100/training_args.bin +3 -0
  38. checkpoint-2400/config.json +47 -0
  39. checkpoint-2400/model.safetensors +3 -0
  40. checkpoint-2400/optimizer.pt +3 -0
  41. checkpoint-2400/rng_state.pth +3 -0
  42. checkpoint-2400/scheduler.pt +3 -0
  43. checkpoint-2400/tokenizer.json +0 -0
  44. checkpoint-2400/tokenizer_config.json +14 -0
  45. checkpoint-2400/trainer_state.json +298 -0
  46. checkpoint-2400/training_args.bin +3 -0
  47. checkpoint-2700/config.json +47 -0
  48. checkpoint-2700/model.safetensors +3 -0
  49. checkpoint-2700/optimizer.pt +3 -0
  50. checkpoint-2700/rng_state.pth +3 -0
README.md ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # UPI matching model
2
+
3
+ Binary classifier for product variant matching (cross-store UPI).
4
+
5
+ ## Model type
6
+
7
+ - **encoder_type**: `cross`
8
+ - **num_labels**: `2`
9
+
10
+ ### Config
11
+
12
+ ```json
13
+ {
14
+ "add_cross_attention": false,
15
+ "architectures": [
16
+ "BERTCrossEncoderClassifier"
17
+ ],
18
+ "attention_probs_dropout_prob": 0.1,
19
+ "bert_projection_dim": null,
20
+ "bos_token_id": null,
21
+ "catboost_dropout": 0.05,
22
+ "catboost_hidden_layers": null,
23
+ "catboost_hidden_size": 256,
24
+ "classifier_dropout": null,
25
+ "dtype": "float32",
26
+ "encoder_type": "cross",
27
+ "eos_token_id": null,
28
+ "exclude_url_in_text": false,
29
+ "gradient_checkpointing": false,
30
+ "hidden_act": "gelu",
31
+ "hidden_dropout_prob": 0.1,
32
+ "hidden_size": 1024,
33
+ "include_avg_price_in_text": false,
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 4096,
36
+ "is_decoder": false,
37
+ "layer_norm_eps": 1e-12,
38
+ "max_position_embeddings": 512,
39
+ "model_type": "bert",
40
+ "num_attention_heads": 16,
41
+ "num_catboost_features": 43,
42
+ "num_hidden_layers": 24,
43
+ "pad_token_id": 0,
44
+ "position_embedding_type": "absolute",
45
+ "preprocess_url_in_text": false,
46
+ "tie_word_embeddings": true,
47
+ "transformers_version": "5.2.0",
48
+ "type_vocab_size": 2,
49
+ "use_batch_norm": false,
50
+ "use_bert_layer_norm": false,
51
+ "use_cache": false,
52
+ "use_catboost_features": false,
53
+ "use_faiss_distance": false,
54
+ "use_standardized_description_in_text": false,
55
+ "use_standardized_title_in_text": false,
56
+ "use_taxonomy_product_category_in_text": false,
57
+ "use_variant_attributes_in_text": false,
58
+ "vocab_size": 30522
59
+ }
60
+ ```
61
+
62
+ ### Input text format
63
+
64
+ Each product is one text string. Fields are key-value pairs joined by ` | `; only non-empty fields are included.
65
+
66
+ **Standard fields (per product):**
67
+ - **Title** — product title (from `title_1` / `title_2`; often `COALESCE(simplified_title, product_title)` in data).
68
+ - **Vendor** — `vendor_1` / `vendor_2` (e.g. product vendor).
69
+ - **Category** — predicted product category (`predicted_category_1` / `predicted_category_2`), or taxonomy category if the run used `use_taxonomy_product_category`.
70
+ - **URL** — product URL (`url_1` / `url_2`); included unless the run used `exclude_url`.
71
+ - **Shop** — shop name (`shop_name_1` / `shop_name_2`), when present in the dataset.
72
+
73
+ **Optional fields (if enabled at training time):** Average price, Standardized title, Standardized description (truncated), Attributes (variant attributes, truncated).
74
+
75
+ **Example (this run: Title | Vendor | Category | URL | Shop):**
76
+
77
+ ```
78
+ Product 1: Title: Blue Cotton Shirt | Vendor: Acme | Category: Apparel > Tops | URL: https://... | Shop: My Store
79
+ Product 2: Title: Blue Cotton Shirt | Vendor: Acme | Category: Apparel > Tops | URL: https://... | Shop: Other Store
80
+ ```
81
+
82
+ **Tokenization:** Cross-encoder input is `[CLS] tokens_product_1 [SEP] tokens_product_2 [SEP]`, with the same tokenizer and `max_length` (e.g. 512) as training. Use the same field order and separators for inference.
83
+
84
+ ## Training
85
+
86
+ - Train samples: 180294
87
+ - Eval samples: 77148
88
+ - Test samples: 43054
89
+
90
+ ### Final metrics
91
+
92
+ - **eval_loss**: 0.014397569000720978
93
+ - **eval_accuracy**: 0.9569399077098564
94
+ - **eval_precision**: 0.7910981883247595
95
+ - **eval_recall**: 0.5969620253164557
96
+ - **eval_f1**: 0.6804540207772221
97
+ - **test_accuracy**: 0.958726250754866
98
+ - **test_precision**: 0.8079390537289495
99
+ - **test_recall**: 0.6082100814971325
100
+ - **test_f1**: 0.6939900120544171
101
+
102
+ ### Trainable components (excerpt)
103
+
104
+ - **encoder_type**: bert
105
+ - **use_focal_loss**: True
106
+ - **use_class_weights**: False
107
+ - **embedding_only**: False
108
+
109
+
110
+ ## Raw metrics (training_metrics.json)
111
+
112
+ ```
113
+ {
114
+ "train": {
115
+ "train_runtime": 11590.1262,
116
+ "train_samples_per_second": 77.779,
117
+ "train_steps_per_second": 0.304,
118
+ "total_flos": 3.535755746865862e+17,
119
+ "train_loss": 0.014852612838677481,
120
+ "epoch": 5.0
121
+ },
122
+ "validation": {
123
+ "eval_loss": 0.014397569000720978,
124
+ "eval_accuracy": 0.9569399077098564,
125
+ "eval_precision": 0.7910981883247595,
126
+ "eval_recall": 0.5969620253164557,
127
+ "eval_f1": 0.6804540207772221,
128
+ "eval_runtime": 248.7283,
129
+ "eval_samples_per_second": 310.17,
130
+ "eval_steps_per_second": 4.849,
131
+ "epoch": 5.0
132
+ },
133
+ "test": {
134
+ "eval_loss": 0.013887105509638786,
135
+ "eval_accuracy": 0.958726250754866,
136
+ "eval_precision": 0.8079390537289495,
137
+ "eval_recall": 0.6082100814971325,
138
+ "eval_f1": 0.6939900120544171,
139
+ "eval_runtime": 140.0606,
140
+ "eval_samples_per_second": 307.396,
141
+ "eval_steps_per_second": 4.805,
142
+ "epoch": 5.0
143
+ }
144
+ }
145
+ ```
146
+
147
+ ## Source
148
+
149
+ Model directory: `gs://sdp-stg-ml-taxonomy/mattmitsui/models/7df743d96b1c660bbfe9ad164066c96e/366c89cfd45d57ecf84823ea3edfebef/bert_only`
150
+
151
+ ## Load
152
+
153
+ This model uses a custom config and loader (not `AutoModel.from_pretrained`).
154
+
155
+ ```python
156
+ # Use scripts/threshold_tuning.py for inference (it handles every encoder_type value):
157
+ # python scripts/threshold_tuning.py --model-path /path/to/model --data-path /path/to/data.parquet ...
158
+ # Or load manually: read config.json, instantiate the class for encoder_type, then
159
+ # torch.load(path / 'pytorch_model.bin', map_location='cpu', weights_only=True)
160
+ ```
161
+
162
+ Inference: `scripts/threshold_tuning.py --model-path <path>`.
checkpoint-1200/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "BERTCrossEncoderClassifier"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bert_projection_dim": null,
8
+ "bos_token_id": null,
9
+ "catboost_dropout": 0.05,
10
+ "catboost_hidden_layers": null,
11
+ "catboost_hidden_size": 256,
12
+ "classifier_dropout": null,
13
+ "dtype": "float32",
14
+ "encoder_type": "cross",
15
+ "eos_token_id": null,
16
+ "exclude_url_in_text": false,
17
+ "gradient_checkpointing": false,
18
+ "hidden_act": "gelu",
19
+ "hidden_dropout_prob": 0.1,
20
+ "hidden_size": 1024,
21
+ "include_avg_price_in_text": false,
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 4096,
24
+ "is_decoder": false,
25
+ "layer_norm_eps": 1e-12,
26
+ "max_position_embeddings": 512,
27
+ "model_type": "bert",
28
+ "num_attention_heads": 16,
29
+ "num_catboost_features": 43,
30
+ "num_hidden_layers": 24,
31
+ "pad_token_id": 0,
32
+ "position_embedding_type": "absolute",
33
+ "preprocess_url_in_text": false,
34
+ "tie_word_embeddings": true,
35
+ "transformers_version": "5.2.0",
36
+ "type_vocab_size": 2,
37
+ "use_batch_norm": false,
38
+ "use_bert_layer_norm": false,
39
+ "use_cache": false,
40
+ "use_catboost_features": false,
41
+ "use_faiss_distance": false,
42
+ "use_standardized_description_in_text": false,
43
+ "use_standardized_title_in_text": false,
44
+ "use_taxonomy_product_category_in_text": false,
45
+ "use_variant_attributes_in_text": false,
46
+ "vocab_size": 30522
47
+ }
checkpoint-1200/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46f25a0bba0d12c0bc39797960e1c4233bca3b05d21008cd4768e071b4589565
3
+ size 1340622760
checkpoint-1200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edb9b5c763125d37949afd4f7a6368773edee2ccd33d9096e0a527d93f2a978e
3
+ size 2673087800
checkpoint-1200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c08822cd68a7301c609b3c6f7ec568cdc5035d72b07a8052a140327e515308eb
3
+ size 14645
checkpoint-1200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a2143d77d39415b00b89630a13cc5f9a530b512abbe6ab7d8c7733ff9670881
3
+ size 1465
checkpoint-1200/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1200/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "is_local": false,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 512,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "BertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }
checkpoint-1200/trainer_state.json ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 900,
3
+ "best_metric": 0.6193430285772399,
4
+ "best_model_checkpoint": "/workspace/models/bert_only/checkpoint-900",
5
+ "epoch": 1.702625975869411,
6
+ "eval_steps": 300,
7
+ "global_step": 1200,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.14194464158977999,
14
+ "grad_norm": 0.06425805389881134,
15
+ "learning_rate": 2.804532577903683e-06,
16
+ "loss": 0.11154786109924317,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.28388928317955997,
21
+ "grad_norm": 0.061825525015592575,
22
+ "learning_rate": 5.637393767705382e-06,
23
+ "loss": 0.0235357666015625,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.42583392476933996,
28
+ "grad_norm": 0.16315294802188873,
29
+ "learning_rate": 8.470254957507082e-06,
30
+ "loss": 0.019212119579315186,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.42583392476933996,
35
+ "eval_accuracy": 0.9339062581013118,
36
+ "eval_f1": 0.2938651156349536,
37
+ "eval_loss": 0.016514133661985397,
38
+ "eval_precision": 0.8186728395061729,
39
+ "eval_recall": 0.1790717299578059,
40
+ "eval_runtime": 247.8026,
41
+ "eval_samples_per_second": 311.328,
42
+ "eval_steps_per_second": 4.867,
43
+ "step": 300
44
+ },
45
+ {
46
+ "epoch": 0.5677785663591199,
47
+ "grad_norm": 0.07577917724847794,
48
+ "learning_rate": 9.854981084489283e-06,
49
+ "loss": 0.017763136625289916,
50
+ "step": 400
51
+ },
52
+ {
53
+ "epoch": 0.7097232079489,
54
+ "grad_norm": 0.07774042338132858,
55
+ "learning_rate": 9.539722572509459e-06,
56
+ "loss": 0.016379492282867433,
57
+ "step": 500
58
+ },
59
+ {
60
+ "epoch": 0.8516678495386799,
61
+ "grad_norm": 0.03525112196803093,
62
+ "learning_rate": 9.224464060529636e-06,
63
+ "loss": 0.015504951477050782,
64
+ "step": 600
65
+ },
66
+ {
67
+ "epoch": 0.8516678495386799,
68
+ "eval_accuracy": 0.9414501996163219,
69
+ "eval_f1": 0.42245237181946044,
70
+ "eval_loss": 0.014568264596164227,
71
+ "eval_precision": 0.8713080168776371,
72
+ "eval_recall": 0.2788185654008439,
73
+ "eval_runtime": 248.1591,
74
+ "eval_samples_per_second": 310.881,
75
+ "eval_steps_per_second": 4.86,
76
+ "step": 600
77
+ },
78
+ {
79
+ "epoch": 0.99361249112846,
80
+ "grad_norm": 0.042659107595682144,
81
+ "learning_rate": 8.909205548549812e-06,
82
+ "loss": 0.015153419971466065,
83
+ "step": 700
84
+ },
85
+ {
86
+ "epoch": 1.134847409510291,
87
+ "grad_norm": 0.06090683117508888,
88
+ "learning_rate": 8.593947036569988e-06,
89
+ "loss": 0.014712293148040772,
90
+ "step": 800
91
+ },
92
+ {
93
+ "epoch": 1.276792051100071,
94
+ "grad_norm": 0.06917829066514969,
95
+ "learning_rate": 8.278688524590165e-06,
96
+ "loss": 0.014011555910110473,
97
+ "step": 900
98
+ },
99
+ {
100
+ "epoch": 1.276792051100071,
101
+ "eval_accuracy": 0.951482864105356,
102
+ "eval_f1": 0.6193430285772399,
103
+ "eval_loss": 0.014504444785416126,
104
+ "eval_precision": 0.7791709314227226,
105
+ "eval_recall": 0.5139240506329114,
106
+ "eval_runtime": 248.1665,
107
+ "eval_samples_per_second": 310.872,
108
+ "eval_steps_per_second": 4.86,
109
+ "step": 900
110
+ },
111
+ {
112
+ "epoch": 1.418736692689851,
113
+ "grad_norm": 0.060546569526195526,
114
+ "learning_rate": 7.963430012610341e-06,
115
+ "loss": 0.013459330797195435,
116
+ "step": 1000
117
+ },
118
+ {
119
+ "epoch": 1.5606813342796308,
120
+ "grad_norm": 0.03910629078745842,
121
+ "learning_rate": 7.648171500630517e-06,
122
+ "loss": 0.013057354688644409,
123
+ "step": 1100
124
+ },
125
+ {
126
+ "epoch": 1.702625975869411,
127
+ "grad_norm": 0.028518082574009895,
128
+ "learning_rate": 7.332912988650695e-06,
129
+ "loss": 0.013156681060791016,
130
+ "step": 1200
131
+ },
132
+ {
133
+ "epoch": 1.702625975869411,
134
+ "eval_accuracy": 0.9492015347125007,
135
+ "eval_f1": 0.5465694781904431,
136
+ "eval_loss": 0.013104956597089767,
137
+ "eval_precision": 0.8690213392200147,
138
+ "eval_recall": 0.39864978902953585,
139
+ "eval_runtime": 247.5183,
140
+ "eval_samples_per_second": 311.686,
141
+ "eval_steps_per_second": 4.872,
142
+ "step": 1200
143
+ }
144
+ ],
145
+ "logging_steps": 100,
146
+ "max_steps": 3525,
147
+ "num_input_tokens_seen": 0,
148
+ "num_train_epochs": 5,
149
+ "save_steps": 300,
150
+ "stateful_callbacks": {
151
+ "TrainerControl": {
152
+ "args": {
153
+ "should_epoch_stop": false,
154
+ "should_evaluate": false,
155
+ "should_log": false,
156
+ "should_save": true,
157
+ "should_training_stop": false
158
+ },
159
+ "attributes": {}
160
+ }
161
+ },
162
+ "total_flos": 1.2038501873091373e+17,
163
+ "train_batch_size": 64,
164
+ "trial_name": null,
165
+ "trial_params": null
166
+ }
checkpoint-1200/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:585c0926506188c343b653866d2ad896a5ab56ac5e6078148d8cadc9d45c9841
3
+ size 5265
checkpoint-1500/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "BERTCrossEncoderClassifier"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bert_projection_dim": null,
8
+ "bos_token_id": null,
9
+ "catboost_dropout": 0.05,
10
+ "catboost_hidden_layers": null,
11
+ "catboost_hidden_size": 256,
12
+ "classifier_dropout": null,
13
+ "dtype": "float32",
14
+ "encoder_type": "cross",
15
+ "eos_token_id": null,
16
+ "exclude_url_in_text": false,
17
+ "gradient_checkpointing": false,
18
+ "hidden_act": "gelu",
19
+ "hidden_dropout_prob": 0.1,
20
+ "hidden_size": 1024,
21
+ "include_avg_price_in_text": false,
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 4096,
24
+ "is_decoder": false,
25
+ "layer_norm_eps": 1e-12,
26
+ "max_position_embeddings": 512,
27
+ "model_type": "bert",
28
+ "num_attention_heads": 16,
29
+ "num_catboost_features": 43,
30
+ "num_hidden_layers": 24,
31
+ "pad_token_id": 0,
32
+ "position_embedding_type": "absolute",
33
+ "preprocess_url_in_text": false,
34
+ "tie_word_embeddings": true,
35
+ "transformers_version": "5.2.0",
36
+ "type_vocab_size": 2,
37
+ "use_batch_norm": false,
38
+ "use_bert_layer_norm": false,
39
+ "use_cache": false,
40
+ "use_catboost_features": false,
41
+ "use_faiss_distance": false,
42
+ "use_standardized_description_in_text": false,
43
+ "use_standardized_title_in_text": false,
44
+ "use_taxonomy_product_category_in_text": false,
45
+ "use_variant_attributes_in_text": false,
46
+ "vocab_size": 30522
47
+ }
checkpoint-1500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1eeb5ca72908b9a122ddf647fc55174b3580b16aadf2ac9495ff34c894b95f4e
3
+ size 1340622760
checkpoint-1500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96352822eced5f5347aebb5e60eb75a51b4f506eb7c832347ad8681b7f577595
3
+ size 2673087800
checkpoint-1500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4afcb43bf773e02399dd7410c940dae962b9ff9c17411d977e4b597ad295d386
3
+ size 14645
checkpoint-1500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea56dbdaa051940ebb3c46394ee873876d802736373239784bb555707c947d8c
3
+ size 1465
checkpoint-1500/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1500/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "is_local": false,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 512,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "BertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }
checkpoint-1500/trainer_state.json ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 900,
3
+ "best_metric": 0.6193430285772399,
4
+ "best_model_checkpoint": "/workspace/models/bert_only/checkpoint-900",
5
+ "epoch": 2.127750177430802,
6
+ "eval_steps": 300,
7
+ "global_step": 1500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.14194464158977999,
14
+ "grad_norm": 0.06425805389881134,
15
+ "learning_rate": 2.804532577903683e-06,
16
+ "loss": 0.11154786109924317,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.28388928317955997,
21
+ "grad_norm": 0.061825525015592575,
22
+ "learning_rate": 5.637393767705382e-06,
23
+ "loss": 0.0235357666015625,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.42583392476933996,
28
+ "grad_norm": 0.16315294802188873,
29
+ "learning_rate": 8.470254957507082e-06,
30
+ "loss": 0.019212119579315186,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.42583392476933996,
35
+ "eval_accuracy": 0.9339062581013118,
36
+ "eval_f1": 0.2938651156349536,
37
+ "eval_loss": 0.016514133661985397,
38
+ "eval_precision": 0.8186728395061729,
39
+ "eval_recall": 0.1790717299578059,
40
+ "eval_runtime": 247.8026,
41
+ "eval_samples_per_second": 311.328,
42
+ "eval_steps_per_second": 4.867,
43
+ "step": 300
44
+ },
45
+ {
46
+ "epoch": 0.5677785663591199,
47
+ "grad_norm": 0.07577917724847794,
48
+ "learning_rate": 9.854981084489283e-06,
49
+ "loss": 0.017763136625289916,
50
+ "step": 400
51
+ },
52
+ {
53
+ "epoch": 0.7097232079489,
54
+ "grad_norm": 0.07774042338132858,
55
+ "learning_rate": 9.539722572509459e-06,
56
+ "loss": 0.016379492282867433,
57
+ "step": 500
58
+ },
59
+ {
60
+ "epoch": 0.8516678495386799,
61
+ "grad_norm": 0.03525112196803093,
62
+ "learning_rate": 9.224464060529636e-06,
63
+ "loss": 0.015504951477050782,
64
+ "step": 600
65
+ },
66
+ {
67
+ "epoch": 0.8516678495386799,
68
+ "eval_accuracy": 0.9414501996163219,
69
+ "eval_f1": 0.42245237181946044,
70
+ "eval_loss": 0.014568264596164227,
71
+ "eval_precision": 0.8713080168776371,
72
+ "eval_recall": 0.2788185654008439,
73
+ "eval_runtime": 248.1591,
74
+ "eval_samples_per_second": 310.881,
75
+ "eval_steps_per_second": 4.86,
76
+ "step": 600
77
+ },
78
+ {
79
+ "epoch": 0.99361249112846,
80
+ "grad_norm": 0.042659107595682144,
81
+ "learning_rate": 8.909205548549812e-06,
82
+ "loss": 0.015153419971466065,
83
+ "step": 700
84
+ },
85
+ {
86
+ "epoch": 1.134847409510291,
87
+ "grad_norm": 0.06090683117508888,
88
+ "learning_rate": 8.593947036569988e-06,
89
+ "loss": 0.014712293148040772,
90
+ "step": 800
91
+ },
92
+ {
93
+ "epoch": 1.276792051100071,
94
+ "grad_norm": 0.06917829066514969,
95
+ "learning_rate": 8.278688524590165e-06,
96
+ "loss": 0.014011555910110473,
97
+ "step": 900
98
+ },
99
+ {
100
+ "epoch": 1.276792051100071,
101
+ "eval_accuracy": 0.951482864105356,
102
+ "eval_f1": 0.6193430285772399,
103
+ "eval_loss": 0.014504444785416126,
104
+ "eval_precision": 0.7791709314227226,
105
+ "eval_recall": 0.5139240506329114,
106
+ "eval_runtime": 248.1665,
107
+ "eval_samples_per_second": 310.872,
108
+ "eval_steps_per_second": 4.86,
109
+ "step": 900
110
+ },
111
+ {
112
+ "epoch": 1.418736692689851,
113
+ "grad_norm": 0.060546569526195526,
114
+ "learning_rate": 7.963430012610341e-06,
115
+ "loss": 0.013459330797195435,
116
+ "step": 1000
117
+ },
118
+ {
119
+ "epoch": 1.5606813342796308,
120
+ "grad_norm": 0.03910629078745842,
121
+ "learning_rate": 7.648171500630517e-06,
122
+ "loss": 0.013057354688644409,
123
+ "step": 1100
124
+ },
125
+ {
126
+ "epoch": 1.702625975869411,
127
+ "grad_norm": 0.028518082574009895,
128
+ "learning_rate": 7.332912988650695e-06,
129
+ "loss": 0.013156681060791016,
130
+ "step": 1200
131
+ },
132
+ {
133
+ "epoch": 1.702625975869411,
134
+ "eval_accuracy": 0.9492015347125007,
135
+ "eval_f1": 0.5465694781904431,
136
+ "eval_loss": 0.013104956597089767,
137
+ "eval_precision": 0.8690213392200147,
138
+ "eval_recall": 0.39864978902953585,
139
+ "eval_runtime": 247.5183,
140
+ "eval_samples_per_second": 311.686,
141
+ "eval_steps_per_second": 4.872,
142
+ "step": 1200
143
+ },
144
+ {
145
+ "epoch": 1.844570617459191,
146
+ "grad_norm": 0.03921860456466675,
147
+ "learning_rate": 7.017654476670871e-06,
148
+ "loss": 0.0128374981880188,
149
+ "step": 1300
150
+ },
151
+ {
152
+ "epoch": 1.986515259048971,
153
+ "grad_norm": 0.06688380986452103,
154
+ "learning_rate": 6.702395964691047e-06,
155
+ "loss": 0.01282167911529541,
156
+ "step": 1400
157
+ },
158
+ {
159
+ "epoch": 2.127750177430802,
160
+ "grad_norm": 0.05814122408628464,
161
+ "learning_rate": 6.387137452711224e-06,
162
+ "loss": 0.011622587442398071,
163
+ "step": 1500
164
+ },
165
+ {
166
+ "epoch": 2.127750177430802,
167
+ "eval_accuracy": 0.9516384092912324,
168
+ "eval_f1": 0.5833612506979341,
169
+ "eval_loss": 0.012981283478438854,
170
+ "eval_precision": 0.862046204620462,
171
+ "eval_recall": 0.4408438818565401,
172
+ "eval_runtime": 247.7101,
173
+ "eval_samples_per_second": 311.445,
174
+ "eval_steps_per_second": 4.869,
175
+ "step": 1500
176
+ }
177
+ ],
178
+ "logging_steps": 100,
179
+ "max_steps": 3525,
180
+ "num_input_tokens_seen": 0,
181
+ "num_train_epochs": 5,
182
+ "save_steps": 300,
183
+ "stateful_callbacks": {
184
+ "TrainerControl": {
185
+ "args": {
186
+ "should_epoch_stop": false,
187
+ "should_evaluate": false,
188
+ "should_log": false,
189
+ "should_save": true,
190
+ "should_training_stop": false
191
+ },
192
+ "attributes": {}
193
+ }
194
+ },
195
+ "total_flos": 1.5040565136991853e+17,
196
+ "train_batch_size": 64,
197
+ "trial_name": null,
198
+ "trial_params": null
199
+ }
checkpoint-1500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:585c0926506188c343b653866d2ad896a5ab56ac5e6078148d8cadc9d45c9841
3
+ size 5265
checkpoint-1800/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "BERTCrossEncoderClassifier"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bert_projection_dim": null,
8
+ "bos_token_id": null,
9
+ "catboost_dropout": 0.05,
10
+ "catboost_hidden_layers": null,
11
+ "catboost_hidden_size": 256,
12
+ "classifier_dropout": null,
13
+ "dtype": "float32",
14
+ "encoder_type": "cross",
15
+ "eos_token_id": null,
16
+ "exclude_url_in_text": false,
17
+ "gradient_checkpointing": false,
18
+ "hidden_act": "gelu",
19
+ "hidden_dropout_prob": 0.1,
20
+ "hidden_size": 1024,
21
+ "include_avg_price_in_text": false,
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 4096,
24
+ "is_decoder": false,
25
+ "layer_norm_eps": 1e-12,
26
+ "max_position_embeddings": 512,
27
+ "model_type": "bert",
28
+ "num_attention_heads": 16,
29
+ "num_catboost_features": 43,
30
+ "num_hidden_layers": 24,
31
+ "pad_token_id": 0,
32
+ "position_embedding_type": "absolute",
33
+ "preprocess_url_in_text": false,
34
+ "tie_word_embeddings": true,
35
+ "transformers_version": "5.2.0",
36
+ "type_vocab_size": 2,
37
+ "use_batch_norm": false,
38
+ "use_bert_layer_norm": false,
39
+ "use_cache": false,
40
+ "use_catboost_features": false,
41
+ "use_faiss_distance": false,
42
+ "use_standardized_description_in_text": false,
43
+ "use_standardized_title_in_text": false,
44
+ "use_taxonomy_product_category_in_text": false,
45
+ "use_variant_attributes_in_text": false,
46
+ "vocab_size": 30522
47
+ }
checkpoint-1800/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b5ddbb8227c9054b777a9740a0cc46df5dba3f2724fb060f9b7328ae9cead57
3
+ size 1340622760
checkpoint-1800/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cc670b376ffb79116b58bee2954aaf300f46d1a3543b15f59a1a1d80d4aa0fa
3
+ size 2673087800
checkpoint-1800/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f006c1ac49f72fb1d83d32073f96e753c01a295e17ddd90a2250099e7f3b8101
3
+ size 14645
checkpoint-1800/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12ec3cb85711df9e48ba46e22eb9af82d920e8a74869e430d996bda6aff298a4
3
+ size 1465
checkpoint-1800/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1800/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "is_local": false,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 512,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "BertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }
checkpoint-1800/trainer_state.json ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 900,
3
+ "best_metric": 0.6193430285772399,
4
+ "best_model_checkpoint": "/workspace/models/bert_only/checkpoint-900",
5
+ "epoch": 2.553584102200142,
6
+ "eval_steps": 300,
7
+ "global_step": 1800,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.14194464158977999,
14
+ "grad_norm": 0.06425805389881134,
15
+ "learning_rate": 2.804532577903683e-06,
16
+ "loss": 0.11154786109924317,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.28388928317955997,
21
+ "grad_norm": 0.061825525015592575,
22
+ "learning_rate": 5.637393767705382e-06,
23
+ "loss": 0.0235357666015625,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.42583392476933996,
28
+ "grad_norm": 0.16315294802188873,
29
+ "learning_rate": 8.470254957507082e-06,
30
+ "loss": 0.019212119579315186,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.42583392476933996,
35
+ "eval_accuracy": 0.9339062581013118,
36
+ "eval_f1": 0.2938651156349536,
37
+ "eval_loss": 0.016514133661985397,
38
+ "eval_precision": 0.8186728395061729,
39
+ "eval_recall": 0.1790717299578059,
40
+ "eval_runtime": 247.8026,
41
+ "eval_samples_per_second": 311.328,
42
+ "eval_steps_per_second": 4.867,
43
+ "step": 300
44
+ },
45
+ {
46
+ "epoch": 0.5677785663591199,
47
+ "grad_norm": 0.07577917724847794,
48
+ "learning_rate": 9.854981084489283e-06,
49
+ "loss": 0.017763136625289916,
50
+ "step": 400
51
+ },
52
+ {
53
+ "epoch": 0.7097232079489,
54
+ "grad_norm": 0.07774042338132858,
55
+ "learning_rate": 9.539722572509459e-06,
56
+ "loss": 0.016379492282867433,
57
+ "step": 500
58
+ },
59
+ {
60
+ "epoch": 0.8516678495386799,
61
+ "grad_norm": 0.03525112196803093,
62
+ "learning_rate": 9.224464060529636e-06,
63
+ "loss": 0.015504951477050782,
64
+ "step": 600
65
+ },
66
+ {
67
+ "epoch": 0.8516678495386799,
68
+ "eval_accuracy": 0.9414501996163219,
69
+ "eval_f1": 0.42245237181946044,
70
+ "eval_loss": 0.014568264596164227,
71
+ "eval_precision": 0.8713080168776371,
72
+ "eval_recall": 0.2788185654008439,
73
+ "eval_runtime": 248.1591,
74
+ "eval_samples_per_second": 310.881,
75
+ "eval_steps_per_second": 4.86,
76
+ "step": 600
77
+ },
78
+ {
79
+ "epoch": 0.99361249112846,
80
+ "grad_norm": 0.042659107595682144,
81
+ "learning_rate": 8.909205548549812e-06,
82
+ "loss": 0.015153419971466065,
83
+ "step": 700
84
+ },
85
+ {
86
+ "epoch": 1.134847409510291,
87
+ "grad_norm": 0.06090683117508888,
88
+ "learning_rate": 8.593947036569988e-06,
89
+ "loss": 0.014712293148040772,
90
+ "step": 800
91
+ },
92
+ {
93
+ "epoch": 1.276792051100071,
94
+ "grad_norm": 0.06917829066514969,
95
+ "learning_rate": 8.278688524590165e-06,
96
+ "loss": 0.014011555910110473,
97
+ "step": 900
98
+ },
99
+ {
100
+ "epoch": 1.276792051100071,
101
+ "eval_accuracy": 0.951482864105356,
102
+ "eval_f1": 0.6193430285772399,
103
+ "eval_loss": 0.014504444785416126,
104
+ "eval_precision": 0.7791709314227226,
105
+ "eval_recall": 0.5139240506329114,
106
+ "eval_runtime": 248.1665,
107
+ "eval_samples_per_second": 310.872,
108
+ "eval_steps_per_second": 4.86,
109
+ "step": 900
110
+ },
111
+ {
112
+ "epoch": 1.418736692689851,
113
+ "grad_norm": 0.060546569526195526,
114
+ "learning_rate": 7.963430012610341e-06,
115
+ "loss": 0.013459330797195435,
116
+ "step": 1000
117
+ },
118
+ {
119
+ "epoch": 1.5606813342796308,
120
+ "grad_norm": 0.03910629078745842,
121
+ "learning_rate": 7.648171500630517e-06,
122
+ "loss": 0.013057354688644409,
123
+ "step": 1100
124
+ },
125
+ {
126
+ "epoch": 1.702625975869411,
127
+ "grad_norm": 0.028518082574009895,
128
+ "learning_rate": 7.332912988650695e-06,
129
+ "loss": 0.013156681060791016,
130
+ "step": 1200
131
+ },
132
+ {
133
+ "epoch": 1.702625975869411,
134
+ "eval_accuracy": 0.9492015347125007,
135
+ "eval_f1": 0.5465694781904431,
136
+ "eval_loss": 0.013104956597089767,
137
+ "eval_precision": 0.8690213392200147,
138
+ "eval_recall": 0.39864978902953585,
139
+ "eval_runtime": 247.5183,
140
+ "eval_samples_per_second": 311.686,
141
+ "eval_steps_per_second": 4.872,
142
+ "step": 1200
143
+ },
144
+ {
145
+ "epoch": 1.844570617459191,
146
+ "grad_norm": 0.03921860456466675,
147
+ "learning_rate": 7.017654476670871e-06,
148
+ "loss": 0.0128374981880188,
149
+ "step": 1300
150
+ },
151
+ {
152
+ "epoch": 1.986515259048971,
153
+ "grad_norm": 0.06688380986452103,
154
+ "learning_rate": 6.702395964691047e-06,
155
+ "loss": 0.01282167911529541,
156
+ "step": 1400
157
+ },
158
+ {
159
+ "epoch": 2.127750177430802,
160
+ "grad_norm": 0.05814122408628464,
161
+ "learning_rate": 6.387137452711224e-06,
162
+ "loss": 0.011622587442398071,
163
+ "step": 1500
164
+ },
165
+ {
166
+ "epoch": 2.127750177430802,
167
+ "eval_accuracy": 0.9516384092912324,
168
+ "eval_f1": 0.5833612506979341,
169
+ "eval_loss": 0.012981283478438854,
170
+ "eval_precision": 0.862046204620462,
171
+ "eval_recall": 0.4408438818565401,
172
+ "eval_runtime": 247.7101,
173
+ "eval_samples_per_second": 311.445,
174
+ "eval_steps_per_second": 4.869,
175
+ "step": 1500
176
+ },
177
+ {
178
+ "epoch": 2.269694819020582,
179
+ "grad_norm": 0.05174160748720169,
180
+ "learning_rate": 6.071878940731401e-06,
181
+ "loss": 0.011430107355117798,
182
+ "step": 1600
183
+ },
184
+ {
185
+ "epoch": 2.411639460610362,
186
+ "grad_norm": 0.04454905912280083,
187
+ "learning_rate": 5.756620428751577e-06,
188
+ "loss": 0.011401889324188232,
189
+ "step": 1700
190
+ },
191
+ {
192
+ "epoch": 2.553584102200142,
193
+ "grad_norm": 0.1080143004655838,
194
+ "learning_rate": 5.441361916771753e-06,
195
+ "loss": 0.011695735454559326,
196
+ "step": 1800
197
+ },
198
+ {
199
+ "epoch": 2.553584102200142,
200
+ "eval_accuracy": 0.9499274122465909,
201
+ "eval_f1": 0.5463300058719907,
202
+ "eval_loss": 0.013252142816781998,
203
+ "eval_precision": 0.8980694980694981,
204
+ "eval_recall": 0.39257383966244724,
205
+ "eval_runtime": 247.9584,
206
+ "eval_samples_per_second": 311.133,
207
+ "eval_steps_per_second": 4.864,
208
+ "step": 1800
209
+ }
210
+ ],
211
+ "logging_steps": 100,
212
+ "max_steps": 3525,
213
+ "num_input_tokens_seen": 0,
214
+ "num_train_epochs": 5,
215
+ "save_steps": 300,
216
+ "stateful_callbacks": {
217
+ "TrainerControl": {
218
+ "args": {
219
+ "should_epoch_stop": false,
220
+ "should_evaluate": false,
221
+ "should_log": false,
222
+ "should_save": true,
223
+ "should_training_stop": false
224
+ },
225
+ "attributes": {}
226
+ }
227
+ },
228
+ "total_flos": 1.802781287642955e+17,
229
+ "train_batch_size": 64,
230
+ "trial_name": null,
231
+ "trial_params": null
232
+ }
checkpoint-1800/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:585c0926506188c343b653866d2ad896a5ab56ac5e6078148d8cadc9d45c9841
3
+ size 5265
checkpoint-2100/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "BERTCrossEncoderClassifier"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bert_projection_dim": null,
8
+ "bos_token_id": null,
9
+ "catboost_dropout": 0.05,
10
+ "catboost_hidden_layers": null,
11
+ "catboost_hidden_size": 256,
12
+ "classifier_dropout": null,
13
+ "dtype": "float32",
14
+ "encoder_type": "cross",
15
+ "eos_token_id": null,
16
+ "exclude_url_in_text": false,
17
+ "gradient_checkpointing": false,
18
+ "hidden_act": "gelu",
19
+ "hidden_dropout_prob": 0.1,
20
+ "hidden_size": 1024,
21
+ "include_avg_price_in_text": false,
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 4096,
24
+ "is_decoder": false,
25
+ "layer_norm_eps": 1e-12,
26
+ "max_position_embeddings": 512,
27
+ "model_type": "bert",
28
+ "num_attention_heads": 16,
29
+ "num_catboost_features": 43,
30
+ "num_hidden_layers": 24,
31
+ "pad_token_id": 0,
32
+ "position_embedding_type": "absolute",
33
+ "preprocess_url_in_text": false,
34
+ "tie_word_embeddings": true,
35
+ "transformers_version": "5.2.0",
36
+ "type_vocab_size": 2,
37
+ "use_batch_norm": false,
38
+ "use_bert_layer_norm": false,
39
+ "use_cache": false,
40
+ "use_catboost_features": false,
41
+ "use_faiss_distance": false,
42
+ "use_standardized_description_in_text": false,
43
+ "use_standardized_title_in_text": false,
44
+ "use_taxonomy_product_category_in_text": false,
45
+ "use_variant_attributes_in_text": false,
46
+ "vocab_size": 30522
47
+ }
checkpoint-2100/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f07540c6387626ddd998177c439a461f2937d408c392a03ec4d26f670f172687
3
+ size 1340622760
checkpoint-2100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e8b65e7372cfd426e73730590d247946bf1660fca93a7ff267242d2e631499d
3
+ size 2673087800
checkpoint-2100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc02ceadcdb8ae8447ec193758f49bc25a9d29d8469e70c5a40ca275c5ef890
3
+ size 14645
checkpoint-2100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca600cbae926b6b9c36e4f50a04cbd029e68a1a4d634572c8882a843e2ef0788
3
+ size 1465
checkpoint-2100/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2100/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "is_local": false,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 512,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "BertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }
checkpoint-2100/trainer_state.json ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 900,
3
+ "best_metric": 0.6193430285772399,
4
+ "best_model_checkpoint": "/workspace/models/bert_only/checkpoint-900",
5
+ "epoch": 2.9794180269694817,
6
+ "eval_steps": 300,
7
+ "global_step": 2100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.14194464158977999,
14
+ "grad_norm": 0.06425805389881134,
15
+ "learning_rate": 2.804532577903683e-06,
16
+ "loss": 0.11154786109924317,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.28388928317955997,
21
+ "grad_norm": 0.061825525015592575,
22
+ "learning_rate": 5.637393767705382e-06,
23
+ "loss": 0.0235357666015625,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.42583392476933996,
28
+ "grad_norm": 0.16315294802188873,
29
+ "learning_rate": 8.470254957507082e-06,
30
+ "loss": 0.019212119579315186,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.42583392476933996,
35
+ "eval_accuracy": 0.9339062581013118,
36
+ "eval_f1": 0.2938651156349536,
37
+ "eval_loss": 0.016514133661985397,
38
+ "eval_precision": 0.8186728395061729,
39
+ "eval_recall": 0.1790717299578059,
40
+ "eval_runtime": 247.8026,
41
+ "eval_samples_per_second": 311.328,
42
+ "eval_steps_per_second": 4.867,
43
+ "step": 300
44
+ },
45
+ {
46
+ "epoch": 0.5677785663591199,
47
+ "grad_norm": 0.07577917724847794,
48
+ "learning_rate": 9.854981084489283e-06,
49
+ "loss": 0.017763136625289916,
50
+ "step": 400
51
+ },
52
+ {
53
+ "epoch": 0.7097232079489,
54
+ "grad_norm": 0.07774042338132858,
55
+ "learning_rate": 9.539722572509459e-06,
56
+ "loss": 0.016379492282867433,
57
+ "step": 500
58
+ },
59
+ {
60
+ "epoch": 0.8516678495386799,
61
+ "grad_norm": 0.03525112196803093,
62
+ "learning_rate": 9.224464060529636e-06,
63
+ "loss": 0.015504951477050782,
64
+ "step": 600
65
+ },
66
+ {
67
+ "epoch": 0.8516678495386799,
68
+ "eval_accuracy": 0.9414501996163219,
69
+ "eval_f1": 0.42245237181946044,
70
+ "eval_loss": 0.014568264596164227,
71
+ "eval_precision": 0.8713080168776371,
72
+ "eval_recall": 0.2788185654008439,
73
+ "eval_runtime": 248.1591,
74
+ "eval_samples_per_second": 310.881,
75
+ "eval_steps_per_second": 4.86,
76
+ "step": 600
77
+ },
78
+ {
79
+ "epoch": 0.99361249112846,
80
+ "grad_norm": 0.042659107595682144,
81
+ "learning_rate": 8.909205548549812e-06,
82
+ "loss": 0.015153419971466065,
83
+ "step": 700
84
+ },
85
+ {
86
+ "epoch": 1.134847409510291,
87
+ "grad_norm": 0.06090683117508888,
88
+ "learning_rate": 8.593947036569988e-06,
89
+ "loss": 0.014712293148040772,
90
+ "step": 800
91
+ },
92
+ {
93
+ "epoch": 1.276792051100071,
94
+ "grad_norm": 0.06917829066514969,
95
+ "learning_rate": 8.278688524590165e-06,
96
+ "loss": 0.014011555910110473,
97
+ "step": 900
98
+ },
99
+ {
100
+ "epoch": 1.276792051100071,
101
+ "eval_accuracy": 0.951482864105356,
102
+ "eval_f1": 0.6193430285772399,
103
+ "eval_loss": 0.014504444785416126,
104
+ "eval_precision": 0.7791709314227226,
105
+ "eval_recall": 0.5139240506329114,
106
+ "eval_runtime": 248.1665,
107
+ "eval_samples_per_second": 310.872,
108
+ "eval_steps_per_second": 4.86,
109
+ "step": 900
110
+ },
111
+ {
112
+ "epoch": 1.418736692689851,
113
+ "grad_norm": 0.060546569526195526,
114
+ "learning_rate": 7.963430012610341e-06,
115
+ "loss": 0.013459330797195435,
116
+ "step": 1000
117
+ },
118
+ {
119
+ "epoch": 1.5606813342796308,
120
+ "grad_norm": 0.03910629078745842,
121
+ "learning_rate": 7.648171500630517e-06,
122
+ "loss": 0.013057354688644409,
123
+ "step": 1100
124
+ },
125
+ {
126
+ "epoch": 1.702625975869411,
127
+ "grad_norm": 0.028518082574009895,
128
+ "learning_rate": 7.332912988650695e-06,
129
+ "loss": 0.013156681060791016,
130
+ "step": 1200
131
+ },
132
+ {
133
+ "epoch": 1.702625975869411,
134
+ "eval_accuracy": 0.9492015347125007,
135
+ "eval_f1": 0.5465694781904431,
136
+ "eval_loss": 0.013104956597089767,
137
+ "eval_precision": 0.8690213392200147,
138
+ "eval_recall": 0.39864978902953585,
139
+ "eval_runtime": 247.5183,
140
+ "eval_samples_per_second": 311.686,
141
+ "eval_steps_per_second": 4.872,
142
+ "step": 1200
143
+ },
144
+ {
145
+ "epoch": 1.844570617459191,
146
+ "grad_norm": 0.03921860456466675,
147
+ "learning_rate": 7.017654476670871e-06,
148
+ "loss": 0.0128374981880188,
149
+ "step": 1300
150
+ },
151
+ {
152
+ "epoch": 1.986515259048971,
153
+ "grad_norm": 0.06688380986452103,
154
+ "learning_rate": 6.702395964691047e-06,
155
+ "loss": 0.01282167911529541,
156
+ "step": 1400
157
+ },
158
+ {
159
+ "epoch": 2.127750177430802,
160
+ "grad_norm": 0.05814122408628464,
161
+ "learning_rate": 6.387137452711224e-06,
162
+ "loss": 0.011622587442398071,
163
+ "step": 1500
164
+ },
165
+ {
166
+ "epoch": 2.127750177430802,
167
+ "eval_accuracy": 0.9516384092912324,
168
+ "eval_f1": 0.5833612506979341,
169
+ "eval_loss": 0.012981283478438854,
170
+ "eval_precision": 0.862046204620462,
171
+ "eval_recall": 0.4408438818565401,
172
+ "eval_runtime": 247.7101,
173
+ "eval_samples_per_second": 311.445,
174
+ "eval_steps_per_second": 4.869,
175
+ "step": 1500
176
+ },
177
+ {
178
+ "epoch": 2.269694819020582,
179
+ "grad_norm": 0.05174160748720169,
180
+ "learning_rate": 6.071878940731401e-06,
181
+ "loss": 0.011430107355117798,
182
+ "step": 1600
183
+ },
184
+ {
185
+ "epoch": 2.411639460610362,
186
+ "grad_norm": 0.04454905912280083,
187
+ "learning_rate": 5.756620428751577e-06,
188
+ "loss": 0.011401889324188232,
189
+ "step": 1700
190
+ },
191
+ {
192
+ "epoch": 2.553584102200142,
193
+ "grad_norm": 0.1080143004655838,
194
+ "learning_rate": 5.441361916771753e-06,
195
+ "loss": 0.011695735454559326,
196
+ "step": 1800
197
+ },
198
+ {
199
+ "epoch": 2.553584102200142,
200
+ "eval_accuracy": 0.9499274122465909,
201
+ "eval_f1": 0.5463300058719907,
202
+ "eval_loss": 0.013252142816781998,
203
+ "eval_precision": 0.8980694980694981,
204
+ "eval_recall": 0.39257383966244724,
205
+ "eval_runtime": 247.9584,
206
+ "eval_samples_per_second": 311.133,
207
+ "eval_steps_per_second": 4.864,
208
+ "step": 1800
209
+ },
210
+ {
211
+ "epoch": 2.695528743789922,
212
+ "grad_norm": 0.04515964910387993,
213
+ "learning_rate": 5.12610340479193e-06,
214
+ "loss": 0.011148114204406739,
215
+ "step": 1900
216
+ },
217
+ {
218
+ "epoch": 2.837473385379702,
219
+ "grad_norm": 0.05003391578793526,
220
+ "learning_rate": 4.810844892812107e-06,
221
+ "loss": 0.011369050741195678,
222
+ "step": 2000
223
+ },
224
+ {
225
+ "epoch": 2.9794180269694817,
226
+ "grad_norm": 0.045471154153347015,
227
+ "learning_rate": 4.495586380832283e-06,
228
+ "loss": 0.011400833129882812,
229
+ "step": 2100
230
+ },
231
+ {
232
+ "epoch": 2.9794180269694817,
233
+ "eval_accuracy": 0.952740187691191,
234
+ "eval_f1": 0.5911639381027136,
235
+ "eval_loss": 0.01266519445925951,
236
+ "eval_precision": 0.880721683929168,
237
+ "eval_recall": 0.4448945147679325,
238
+ "eval_runtime": 247.8326,
239
+ "eval_samples_per_second": 311.291,
240
+ "eval_steps_per_second": 4.866,
241
+ "step": 2100
242
+ }
243
+ ],
244
+ "logging_steps": 100,
245
+ "max_steps": 3525,
246
+ "num_input_tokens_seen": 0,
247
+ "num_train_epochs": 5,
248
+ "save_steps": 300,
249
+ "stateful_callbacks": {
250
+ "TrainerControl": {
251
+ "args": {
252
+ "should_epoch_stop": false,
253
+ "should_evaluate": false,
254
+ "should_log": false,
255
+ "should_save": true,
256
+ "should_training_stop": false
257
+ },
258
+ "attributes": {}
259
+ }
260
+ },
261
+ "total_flos": 2.107318983465483e+17,
262
+ "train_batch_size": 64,
263
+ "trial_name": null,
264
+ "trial_params": null
265
+ }
checkpoint-2100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:585c0926506188c343b653866d2ad896a5ab56ac5e6078148d8cadc9d45c9841
3
+ size 5265
checkpoint-2400/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "BERTCrossEncoderClassifier"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bert_projection_dim": null,
8
+ "bos_token_id": null,
9
+ "catboost_dropout": 0.05,
10
+ "catboost_hidden_layers": null,
11
+ "catboost_hidden_size": 256,
12
+ "classifier_dropout": null,
13
+ "dtype": "float32",
14
+ "encoder_type": "cross",
15
+ "eos_token_id": null,
16
+ "exclude_url_in_text": false,
17
+ "gradient_checkpointing": false,
18
+ "hidden_act": "gelu",
19
+ "hidden_dropout_prob": 0.1,
20
+ "hidden_size": 1024,
21
+ "include_avg_price_in_text": false,
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 4096,
24
+ "is_decoder": false,
25
+ "layer_norm_eps": 1e-12,
26
+ "max_position_embeddings": 512,
27
+ "model_type": "bert",
28
+ "num_attention_heads": 16,
29
+ "num_catboost_features": 43,
30
+ "num_hidden_layers": 24,
31
+ "pad_token_id": 0,
32
+ "position_embedding_type": "absolute",
33
+ "preprocess_url_in_text": false,
34
+ "tie_word_embeddings": true,
35
+ "transformers_version": "5.2.0",
36
+ "type_vocab_size": 2,
37
+ "use_batch_norm": false,
38
+ "use_bert_layer_norm": false,
39
+ "use_cache": false,
40
+ "use_catboost_features": false,
41
+ "use_faiss_distance": false,
42
+ "use_standardized_description_in_text": false,
43
+ "use_standardized_title_in_text": false,
44
+ "use_taxonomy_product_category_in_text": false,
45
+ "use_variant_attributes_in_text": false,
46
+ "vocab_size": 30522
47
+ }
checkpoint-2400/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9389eccc73ce3622db80b09e193e2f7001a9fa4adb66f3d78e2b7385819836ba
3
+ size 1340622760
checkpoint-2400/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b44b0f22272328cef1c52776e3ef9fcfbc1f5a3e819d0ea0c061ae365b606b2e
3
+ size 2673087800
checkpoint-2400/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b4954c9dd20e13b3cd14583345a4b565c137ac8a2b08a68e934ea632d565505
3
+ size 14645
checkpoint-2400/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25249fad0573b83da4e1e90a1939f29c3c4b4a391fe06700f793f2c5720fca31
3
+ size 1465
checkpoint-2400/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2400/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "is_local": false,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 512,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "BertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }
checkpoint-2400/trainer_state.json ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 2400,
3
+ "best_metric": 0.6504048375525264,
4
+ "best_model_checkpoint": "/workspace/models/bert_only/checkpoint-2400",
5
+ "epoch": 3.404542228530873,
6
+ "eval_steps": 300,
7
+ "global_step": 2400,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.14194464158977999,
14
+ "grad_norm": 0.06425805389881134,
15
+ "learning_rate": 2.804532577903683e-06,
16
+ "loss": 0.11154786109924317,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.28388928317955997,
21
+ "grad_norm": 0.061825525015592575,
22
+ "learning_rate": 5.637393767705382e-06,
23
+ "loss": 0.0235357666015625,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.42583392476933996,
28
+ "grad_norm": 0.16315294802188873,
29
+ "learning_rate": 8.470254957507082e-06,
30
+ "loss": 0.019212119579315186,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.42583392476933996,
35
+ "eval_accuracy": 0.9339062581013118,
36
+ "eval_f1": 0.2938651156349536,
37
+ "eval_loss": 0.016514133661985397,
38
+ "eval_precision": 0.8186728395061729,
39
+ "eval_recall": 0.1790717299578059,
40
+ "eval_runtime": 247.8026,
41
+ "eval_samples_per_second": 311.328,
42
+ "eval_steps_per_second": 4.867,
43
+ "step": 300
44
+ },
45
+ {
46
+ "epoch": 0.5677785663591199,
47
+ "grad_norm": 0.07577917724847794,
48
+ "learning_rate": 9.854981084489283e-06,
49
+ "loss": 0.017763136625289916,
50
+ "step": 400
51
+ },
52
+ {
53
+ "epoch": 0.7097232079489,
54
+ "grad_norm": 0.07774042338132858,
55
+ "learning_rate": 9.539722572509459e-06,
56
+ "loss": 0.016379492282867433,
57
+ "step": 500
58
+ },
59
+ {
60
+ "epoch": 0.8516678495386799,
61
+ "grad_norm": 0.03525112196803093,
62
+ "learning_rate": 9.224464060529636e-06,
63
+ "loss": 0.015504951477050782,
64
+ "step": 600
65
+ },
66
+ {
67
+ "epoch": 0.8516678495386799,
68
+ "eval_accuracy": 0.9414501996163219,
69
+ "eval_f1": 0.42245237181946044,
70
+ "eval_loss": 0.014568264596164227,
71
+ "eval_precision": 0.8713080168776371,
72
+ "eval_recall": 0.2788185654008439,
73
+ "eval_runtime": 248.1591,
74
+ "eval_samples_per_second": 310.881,
75
+ "eval_steps_per_second": 4.86,
76
+ "step": 600
77
+ },
78
+ {
79
+ "epoch": 0.99361249112846,
80
+ "grad_norm": 0.042659107595682144,
81
+ "learning_rate": 8.909205548549812e-06,
82
+ "loss": 0.015153419971466065,
83
+ "step": 700
84
+ },
85
+ {
86
+ "epoch": 1.134847409510291,
87
+ "grad_norm": 0.06090683117508888,
88
+ "learning_rate": 8.593947036569988e-06,
89
+ "loss": 0.014712293148040772,
90
+ "step": 800
91
+ },
92
+ {
93
+ "epoch": 1.276792051100071,
94
+ "grad_norm": 0.06917829066514969,
95
+ "learning_rate": 8.278688524590165e-06,
96
+ "loss": 0.014011555910110473,
97
+ "step": 900
98
+ },
99
+ {
100
+ "epoch": 1.276792051100071,
101
+ "eval_accuracy": 0.951482864105356,
102
+ "eval_f1": 0.6193430285772399,
103
+ "eval_loss": 0.014504444785416126,
104
+ "eval_precision": 0.7791709314227226,
105
+ "eval_recall": 0.5139240506329114,
106
+ "eval_runtime": 248.1665,
107
+ "eval_samples_per_second": 310.872,
108
+ "eval_steps_per_second": 4.86,
109
+ "step": 900
110
+ },
111
+ {
112
+ "epoch": 1.418736692689851,
113
+ "grad_norm": 0.060546569526195526,
114
+ "learning_rate": 7.963430012610341e-06,
115
+ "loss": 0.013459330797195435,
116
+ "step": 1000
117
+ },
118
+ {
119
+ "epoch": 1.5606813342796308,
120
+ "grad_norm": 0.03910629078745842,
121
+ "learning_rate": 7.648171500630517e-06,
122
+ "loss": 0.013057354688644409,
123
+ "step": 1100
124
+ },
125
+ {
126
+ "epoch": 1.702625975869411,
127
+ "grad_norm": 0.028518082574009895,
128
+ "learning_rate": 7.332912988650695e-06,
129
+ "loss": 0.013156681060791016,
130
+ "step": 1200
131
+ },
132
+ {
133
+ "epoch": 1.702625975869411,
134
+ "eval_accuracy": 0.9492015347125007,
135
+ "eval_f1": 0.5465694781904431,
136
+ "eval_loss": 0.013104956597089767,
137
+ "eval_precision": 0.8690213392200147,
138
+ "eval_recall": 0.39864978902953585,
139
+ "eval_runtime": 247.5183,
140
+ "eval_samples_per_second": 311.686,
141
+ "eval_steps_per_second": 4.872,
142
+ "step": 1200
143
+ },
144
+ {
145
+ "epoch": 1.844570617459191,
146
+ "grad_norm": 0.03921860456466675,
147
+ "learning_rate": 7.017654476670871e-06,
148
+ "loss": 0.0128374981880188,
149
+ "step": 1300
150
+ },
151
+ {
152
+ "epoch": 1.986515259048971,
153
+ "grad_norm": 0.06688380986452103,
154
+ "learning_rate": 6.702395964691047e-06,
155
+ "loss": 0.01282167911529541,
156
+ "step": 1400
157
+ },
158
+ {
159
+ "epoch": 2.127750177430802,
160
+ "grad_norm": 0.05814122408628464,
161
+ "learning_rate": 6.387137452711224e-06,
162
+ "loss": 0.011622587442398071,
163
+ "step": 1500
164
+ },
165
+ {
166
+ "epoch": 2.127750177430802,
167
+ "eval_accuracy": 0.9516384092912324,
168
+ "eval_f1": 0.5833612506979341,
169
+ "eval_loss": 0.012981283478438854,
170
+ "eval_precision": 0.862046204620462,
171
+ "eval_recall": 0.4408438818565401,
172
+ "eval_runtime": 247.7101,
173
+ "eval_samples_per_second": 311.445,
174
+ "eval_steps_per_second": 4.869,
175
+ "step": 1500
176
+ },
177
+ {
178
+ "epoch": 2.269694819020582,
179
+ "grad_norm": 0.05174160748720169,
180
+ "learning_rate": 6.071878940731401e-06,
181
+ "loss": 0.011430107355117798,
182
+ "step": 1600
183
+ },
184
+ {
185
+ "epoch": 2.411639460610362,
186
+ "grad_norm": 0.04454905912280083,
187
+ "learning_rate": 5.756620428751577e-06,
188
+ "loss": 0.011401889324188232,
189
+ "step": 1700
190
+ },
191
+ {
192
+ "epoch": 2.553584102200142,
193
+ "grad_norm": 0.1080143004655838,
194
+ "learning_rate": 5.441361916771753e-06,
195
+ "loss": 0.011695735454559326,
196
+ "step": 1800
197
+ },
198
+ {
199
+ "epoch": 2.553584102200142,
200
+ "eval_accuracy": 0.9499274122465909,
201
+ "eval_f1": 0.5463300058719907,
202
+ "eval_loss": 0.013252142816781998,
203
+ "eval_precision": 0.8980694980694981,
204
+ "eval_recall": 0.39257383966244724,
205
+ "eval_runtime": 247.9584,
206
+ "eval_samples_per_second": 311.133,
207
+ "eval_steps_per_second": 4.864,
208
+ "step": 1800
209
+ },
210
+ {
211
+ "epoch": 2.695528743789922,
212
+ "grad_norm": 0.04515964910387993,
213
+ "learning_rate": 5.12610340479193e-06,
214
+ "loss": 0.011148114204406739,
215
+ "step": 1900
216
+ },
217
+ {
218
+ "epoch": 2.837473385379702,
219
+ "grad_norm": 0.05003391578793526,
220
+ "learning_rate": 4.810844892812107e-06,
221
+ "loss": 0.011369050741195678,
222
+ "step": 2000
223
+ },
224
+ {
225
+ "epoch": 2.9794180269694817,
226
+ "grad_norm": 0.045471154153347015,
227
+ "learning_rate": 4.495586380832283e-06,
228
+ "loss": 0.011400833129882812,
229
+ "step": 2100
230
+ },
231
+ {
232
+ "epoch": 2.9794180269694817,
233
+ "eval_accuracy": 0.952740187691191,
234
+ "eval_f1": 0.5911639381027136,
235
+ "eval_loss": 0.01266519445925951,
236
+ "eval_precision": 0.880721683929168,
237
+ "eval_recall": 0.4448945147679325,
238
+ "eval_runtime": 247.8326,
239
+ "eval_samples_per_second": 311.291,
240
+ "eval_steps_per_second": 4.866,
241
+ "step": 2100
242
+ },
243
+ {
244
+ "epoch": 3.120652945351313,
245
+ "grad_norm": 0.04946517199277878,
246
+ "learning_rate": 4.180327868852459e-06,
247
+ "loss": 0.009943812489509582,
248
+ "step": 2200
249
+ },
250
+ {
251
+ "epoch": 3.262597586941093,
252
+ "grad_norm": 0.046277161687612534,
253
+ "learning_rate": 3.865069356872636e-06,
254
+ "loss": 0.010015236139297485,
255
+ "step": 2300
256
+ },
257
+ {
258
+ "epoch": 3.404542228530873,
259
+ "grad_norm": 0.04235660284757614,
260
+ "learning_rate": 3.5498108448928128e-06,
261
+ "loss": 0.009706299304962158,
262
+ "step": 2400
263
+ },
264
+ {
265
+ "epoch": 3.404542228530873,
266
+ "eval_accuracy": 0.9557862809146057,
267
+ "eval_f1": 0.6504048375525264,
268
+ "eval_loss": 0.013760806061327457,
269
+ "eval_precision": 0.8280271398747391,
270
+ "eval_recall": 0.5355274261603375,
271
+ "eval_runtime": 247.7549,
272
+ "eval_samples_per_second": 311.388,
273
+ "eval_steps_per_second": 4.868,
274
+ "step": 2400
275
+ }
276
+ ],
277
+ "logging_steps": 100,
278
+ "max_steps": 3525,
279
+ "num_input_tokens_seen": 0,
280
+ "num_train_epochs": 5,
281
+ "save_steps": 300,
282
+ "stateful_callbacks": {
283
+ "TrainerControl": {
284
+ "args": {
285
+ "should_epoch_stop": false,
286
+ "should_evaluate": false,
287
+ "should_log": false,
288
+ "should_save": true,
289
+ "should_training_stop": false
290
+ },
291
+ "attributes": {}
292
+ }
293
+ },
294
+ "total_flos": 2.406777143707909e+17,
295
+ "train_batch_size": 64,
296
+ "trial_name": null,
297
+ "trial_params": null
298
+ }
checkpoint-2400/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:585c0926506188c343b653866d2ad896a5ab56ac5e6078148d8cadc9d45c9841
3
+ size 5265
checkpoint-2700/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "BERTCrossEncoderClassifier"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bert_projection_dim": null,
8
+ "bos_token_id": null,
9
+ "catboost_dropout": 0.05,
10
+ "catboost_hidden_layers": null,
11
+ "catboost_hidden_size": 256,
12
+ "classifier_dropout": null,
13
+ "dtype": "float32",
14
+ "encoder_type": "cross",
15
+ "eos_token_id": null,
16
+ "exclude_url_in_text": false,
17
+ "gradient_checkpointing": false,
18
+ "hidden_act": "gelu",
19
+ "hidden_dropout_prob": 0.1,
20
+ "hidden_size": 1024,
21
+ "include_avg_price_in_text": false,
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 4096,
24
+ "is_decoder": false,
25
+ "layer_norm_eps": 1e-12,
26
+ "max_position_embeddings": 512,
27
+ "model_type": "bert",
28
+ "num_attention_heads": 16,
29
+ "num_catboost_features": 43,
30
+ "num_hidden_layers": 24,
31
+ "pad_token_id": 0,
32
+ "position_embedding_type": "absolute",
33
+ "preprocess_url_in_text": false,
34
+ "tie_word_embeddings": true,
35
+ "transformers_version": "5.2.0",
36
+ "type_vocab_size": 2,
37
+ "use_batch_norm": false,
38
+ "use_bert_layer_norm": false,
39
+ "use_cache": false,
40
+ "use_catboost_features": false,
41
+ "use_faiss_distance": false,
42
+ "use_standardized_description_in_text": false,
43
+ "use_standardized_title_in_text": false,
44
+ "use_taxonomy_product_category_in_text": false,
45
+ "use_variant_attributes_in_text": false,
46
+ "vocab_size": 30522
47
+ }
checkpoint-2700/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0663b6466f97897b955c014f4dfda5e7331bd0b84893ef613c217adc7d63f73
3
+ size 1340622760
checkpoint-2700/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39e6206937214402aa3d2994267aa299071d82b89779468cb5690336760679ef
3
+ size 2673087800
checkpoint-2700/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4e7c593fa5e1e63022c74144afea942ab6080a5a96293b917b4e4b8a1dd70a3
3
+ size 14645