lewmi committed on
Commit
2136fc5
·
verified ·
1 Parent(s): 1984c19

Training in progress, step 500

Browse files
README.md ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags:
4
+ - generated_from_trainer
5
+ model-index:
6
+ - name: calculator_model_test
7
+ results: []
8
+ ---
9
+
10
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
11
+ should probably proofread and complete it, then remove this comment. -->
12
+
13
+ # calculator_model_test
14
+
15
+ This model is a fine-tuned version of an unspecified base model (no base checkpoint was recorded) on an unspecified dataset.
16
+ It achieves the following results on the evaluation set:
17
+ - Loss: 0.2387
18
+
19
+ ## Model description
20
+
21
+ More information needed
22
+
23
+ ## Intended uses & limitations
24
+
25
+ More information needed
26
+
27
+ ## Training and evaluation data
28
+
29
+ More information needed
30
+
31
+ ## Training procedure
32
+
33
+ ### Training hyperparameters
34
+
35
+ The following hyperparameters were used during training:
36
+ - learning_rate: 0.001
37
+ - train_batch_size: 512
38
+ - eval_batch_size: 512
39
+ - seed: 42
40
+ - optimizer: OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999), epsilon=1e-08, and no additional optimizer arguments
41
+ - lr_scheduler_type: linear
42
+ - num_epochs: 40
43
+
44
+ ### Training results
45
+
46
+ | Training Loss | Epoch | Step | Validation Loss |
47
+ |:-------------:|:-----:|:----:|:---------------:|
48
+ | 3.0501 | 1.0 | 6 | 2.2966 |
49
+ | 2.0486 | 2.0 | 12 | 1.7533 |
50
+ | 1.5929 | 3.0 | 18 | 1.3909 |
51
+ | 1.2724 | 4.0 | 24 | 1.1387 |
52
+ | 1.0906 | 5.0 | 30 | 0.9902 |
53
+ | 0.9501 | 6.0 | 36 | 0.8982 |
54
+ | 0.8489 | 7.0 | 42 | 0.8078 |
55
+ | 0.7684 | 8.0 | 48 | 0.7145 |
56
+ | 0.6872 | 9.0 | 54 | 0.6570 |
57
+ | 0.6574 | 10.0 | 60 | 0.6683 |
58
+ | 0.6364 | 11.0 | 66 | 0.6542 |
59
+ | 0.6217 | 12.0 | 72 | 0.5741 |
60
+ | 0.561 | 13.0 | 78 | 0.5594 |
61
+ | 0.548 | 14.0 | 84 | 0.5491 |
62
+ | 0.5447 | 15.0 | 90 | 0.5075 |
63
+ | 0.5114 | 16.0 | 96 | 0.5275 |
64
+ | 0.4897 | 17.0 | 102 | 0.4606 |
65
+ | 0.4736 | 18.0 | 108 | 0.4505 |
66
+ | 0.4536 | 19.0 | 114 | 0.4420 |
67
+ | 0.4438 | 20.0 | 120 | 0.4338 |
68
+ | 0.4178 | 21.0 | 126 | 0.4388 |
69
+ | 0.439 | 22.0 | 132 | 0.4336 |
70
+ | 0.4315 | 23.0 | 138 | 0.3953 |
71
+ | 0.4006 | 24.0 | 144 | 0.3763 |
72
+ | 0.3923 | 25.0 | 150 | 0.3776 |
73
+ | 0.3785 | 26.0 | 156 | 0.3616 |
74
+ | 0.38 | 27.0 | 162 | 0.3504 |
75
+ | 0.3546 | 28.0 | 168 | 0.3411 |
76
+ | 0.3507 | 29.0 | 174 | 0.3395 |
77
+ | 0.3518 | 30.0 | 180 | 0.3256 |
78
+ | 0.3324 | 31.0 | 186 | 0.3084 |
79
+ | 0.3161 | 32.0 | 192 | 0.2991 |
80
+ | 0.3043 | 33.0 | 198 | 0.2820 |
81
+ | 0.2977 | 34.0 | 204 | 0.2706 |
82
+ | 0.2911 | 35.0 | 210 | 0.2644 |
83
+ | 0.2843 | 36.0 | 216 | 0.2583 |
84
+ | 0.2662 | 37.0 | 222 | 0.2491 |
85
+ | 0.2737 | 38.0 | 228 | 0.2438 |
86
+ | 0.2599 | 39.0 | 234 | 0.2397 |
87
+ | 0.2647 | 40.0 | 240 | 0.2387 |
88
+
89
+
90
+ ### Framework versions
91
+
92
+ - Transformers 4.57.2
93
+ - Pytorch 2.9.1+cu128
94
+ - Datasets 4.5.0
95
+ - Tokenizers 0.22.1
config.json CHANGED
@@ -3,92 +3,49 @@
3
  "EncoderDecoderModel"
4
  ],
5
  "decoder": {
6
- "_name_or_path": "",
7
  "add_cross_attention": true,
8
- "architectures": null,
9
  "attention_probs_dropout_prob": 0.1,
10
- "bos_token_id": null,
11
- "chunk_size_feed_forward": 0,
12
  "classifier_dropout": null,
13
- "dtype": null,
14
- "eos_token_id": null,
15
  "hidden_act": "gelu",
16
  "hidden_dropout_prob": 0.1,
17
  "hidden_size": 256,
18
- "id2label": {
19
- "0": "LABEL_0",
20
- "1": "LABEL_1"
21
- },
22
  "initializer_range": 0.02,
23
  "intermediate_size": 1024,
24
  "is_decoder": true,
25
- "is_encoder_decoder": false,
26
- "label2id": {
27
- "LABEL_0": 0,
28
- "LABEL_1": 1
29
- },
30
  "layer_norm_eps": 1e-12,
31
  "max_position_embeddings": 512,
32
  "model_type": "bert",
33
  "num_attention_heads": 4,
34
  "num_hidden_layers": 4,
35
- "output_attentions": false,
36
- "output_hidden_states": false,
37
- "pad_token_id": 0,
38
- "problem_type": null,
39
- "return_dict": true,
40
- "tie_word_embeddings": true,
41
  "type_vocab_size": 2,
42
  "use_cache": true,
43
- "vocab_size": 53
44
  },
45
  "decoder_start_token_id": 2,
46
  "dtype": "float32",
47
  "encoder": {
48
- "_name_or_path": "",
49
- "add_cross_attention": false,
50
- "architectures": null,
51
  "attention_probs_dropout_prob": 0.1,
52
- "bos_token_id": null,
53
- "chunk_size_feed_forward": 0,
54
  "classifier_dropout": null,
55
- "dtype": null,
56
- "eos_token_id": null,
57
  "hidden_act": "gelu",
58
  "hidden_dropout_prob": 0.1,
59
  "hidden_size": 256,
60
- "id2label": {
61
- "0": "LABEL_0",
62
- "1": "LABEL_1"
63
- },
64
  "initializer_range": 0.02,
65
  "intermediate_size": 1024,
66
- "is_decoder": false,
67
- "is_encoder_decoder": false,
68
- "label2id": {
69
- "LABEL_0": 0,
70
- "LABEL_1": 1
71
- },
72
  "layer_norm_eps": 1e-12,
73
  "max_position_embeddings": 512,
74
  "model_type": "bert",
75
  "num_attention_heads": 4,
76
  "num_hidden_layers": 4,
77
- "output_attentions": false,
78
- "output_hidden_states": false,
79
- "pad_token_id": 0,
80
- "problem_type": null,
81
- "return_dict": true,
82
- "tie_word_embeddings": true,
83
  "type_vocab_size": 2,
84
  "use_cache": true,
85
- "vocab_size": 53
86
  },
87
  "eos_token_id": 0,
88
  "is_encoder_decoder": true,
89
  "model_type": "encoder-decoder",
90
  "pad_token_id": 3,
91
- "transformers_version": "5.0.0",
92
- "unk_token_id": null,
93
- "use_cache": false
94
  }
 
3
  "EncoderDecoderModel"
4
  ],
5
  "decoder": {
 
6
  "add_cross_attention": true,
 
7
  "attention_probs_dropout_prob": 0.1,
 
 
8
  "classifier_dropout": null,
 
 
9
  "hidden_act": "gelu",
10
  "hidden_dropout_prob": 0.1,
11
  "hidden_size": 256,
 
 
 
 
12
  "initializer_range": 0.02,
13
  "intermediate_size": 1024,
14
  "is_decoder": true,
 
 
 
 
 
15
  "layer_norm_eps": 1e-12,
16
  "max_position_embeddings": 512,
17
  "model_type": "bert",
18
  "num_attention_heads": 4,
19
  "num_hidden_layers": 4,
20
+ "position_embedding_type": "absolute",
 
 
 
 
 
21
  "type_vocab_size": 2,
22
  "use_cache": true,
23
+ "vocab_size": 79
24
  },
25
  "decoder_start_token_id": 2,
26
  "dtype": "float32",
27
  "encoder": {
 
 
 
28
  "attention_probs_dropout_prob": 0.1,
 
 
29
  "classifier_dropout": null,
 
 
30
  "hidden_act": "gelu",
31
  "hidden_dropout_prob": 0.1,
32
  "hidden_size": 256,
 
 
 
 
33
  "initializer_range": 0.02,
34
  "intermediate_size": 1024,
 
 
 
 
 
 
35
  "layer_norm_eps": 1e-12,
36
  "max_position_embeddings": 512,
37
  "model_type": "bert",
38
  "num_attention_heads": 4,
39
  "num_hidden_layers": 4,
40
+ "position_embedding_type": "absolute",
 
 
 
 
 
41
  "type_vocab_size": 2,
42
  "use_cache": true,
43
+ "vocab_size": 79
44
  },
45
  "eos_token_id": 0,
46
  "is_encoder_decoder": true,
47
  "model_type": "encoder-decoder",
48
  "pad_token_id": 3,
49
+ "transformers_version": "4.57.2",
50
+ "unk_token_id": null
 
51
  }
generation_config.json CHANGED
@@ -4,9 +4,6 @@
4
  "eos_token_id": [
5
  0
6
  ],
7
- "output_attentions": false,
8
- "output_hidden_states": false,
9
  "pad_token_id": 3,
10
- "transformers_version": "5.0.0",
11
- "use_cache": true
12
  }
 
4
  "eos_token_id": [
5
  0
6
  ],
 
 
7
  "pad_token_id": 3,
8
+ "transformers_version": "4.57.2"
 
9
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:103c01ad2911e7bbd4d00a2f155af7fc317efb8edc43a4c8645b1228d97f3204
3
- size 31207604
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:815ea68831b7fc324e3ce92509adacb8c5f8abb0c10333c5ae37c35adfcadf3d
3
+ size 31260956
runs/Mar06_11-40-37_vicm/events.out.tfevents.1772793638.vicm.75786.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e6c0c621399a431ad7a9daa10745ed4400f26b79d4e3732374c11914c6ace55
3
+ size 25246
runs/Mar06_20-32-57_vicm/events.out.tfevents.1772825580.vicm.20324.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:085fb258b20e167a85e888265a4cd61f0a06bc23c69cbbf69c08a783b0eea065
3
+ size 22603
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "[EOS]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json CHANGED
@@ -114,18 +114,18 @@
114
  "8": 14,
115
  "9": 15,
116
  "=": 16,
117
- "10": 17,
118
- "99": 18,
119
- "11": 19,
120
- "98": 20,
121
- "12": 21,
122
- "97": 22,
123
- "13": 23,
124
- "96": 24,
125
- "14": 25,
126
- "95": 26,
127
- "94": 27,
128
- "15": 28,
129
  "93": 29,
130
  "16": 30,
131
  "17": 31,
@@ -134,14 +134,14 @@
134
  "91": 34,
135
  "19": 35,
136
  "90": 36,
137
- "20": 37,
138
- "89": 38,
139
- "88": 39,
140
- "21": 40,
141
  "87": 41,
142
  "22": 42,
143
- "23": 43,
144
- "86": 44,
145
  "85": 45,
146
  "24": 46,
147
  "25": 47,
@@ -149,20 +149,42 @@
149
  "26": 49,
150
  "83": 50,
151
  "27": 51,
152
- "82": 52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  },
154
  "merges": [
155
- [
156
- "1",
157
- "0"
158
- ],
159
  [
160
  "9",
161
  "9"
162
  ],
163
  [
164
  "1",
165
- "1"
166
  ],
167
  [
168
  "9",
@@ -170,7 +192,7 @@
170
  ],
171
  [
172
  "1",
173
- "2"
174
  ],
175
  [
176
  "9",
@@ -178,7 +200,7 @@
178
  ],
179
  [
180
  "1",
181
- "3"
182
  ],
183
  [
184
  "9",
@@ -186,20 +208,24 @@
186
  ],
187
  [
188
  "1",
189
- "4"
190
  ],
191
  [
192
  "9",
193
  "5"
194
  ],
195
  [
196
- "9",
197
  "4"
198
  ],
199
  [
200
  "1",
201
  "5"
202
  ],
 
 
 
 
203
  [
204
  "9",
205
  "3"
@@ -232,17 +258,13 @@
232
  "9",
233
  "0"
234
  ],
235
- [
236
- "2",
237
- "0"
238
- ],
239
  [
240
  "8",
241
  "9"
242
  ],
243
  [
244
- "8",
245
- "8"
246
  ],
247
  [
248
  "2",
@@ -250,20 +272,24 @@
250
  ],
251
  [
252
  "8",
253
- "7"
254
  ],
255
  [
256
- "2",
257
- "2"
258
  ],
259
  [
260
  "2",
261
- "3"
262
  ],
263
  [
264
  "8",
265
  "6"
266
  ],
 
 
 
 
267
  [
268
  "8",
269
  "5"
@@ -295,6 +321,110 @@
295
  [
296
  "8",
297
  "2"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
  ]
299
  ]
300
  }
 
114
  "8": 14,
115
  "9": 15,
116
  "=": 16,
117
+ "99": 17,
118
+ "10": 18,
119
+ "98": 19,
120
+ "11": 20,
121
+ "97": 21,
122
+ "12": 22,
123
+ "96": 23,
124
+ "13": 24,
125
+ "95": 25,
126
+ "14": 26,
127
+ "15": 27,
128
+ "94": 28,
129
  "93": 29,
130
  "16": 30,
131
  "17": 31,
 
134
  "91": 34,
135
  "19": 35,
136
  "90": 36,
137
+ "89": 37,
138
+ "20": 38,
139
+ "21": 39,
140
+ "88": 40,
141
  "87": 41,
142
  "22": 42,
143
+ "86": 43,
144
+ "23": 44,
145
  "85": 45,
146
  "24": 46,
147
  "25": 47,
 
149
  "26": 49,
150
  "83": 50,
151
  "27": 51,
152
+ "82": 52,
153
+ "81": 53,
154
+ "28": 54,
155
+ "29": 55,
156
+ "80": 56,
157
+ "30": 57,
158
+ "79": 58,
159
+ "78": 59,
160
+ "31": 60,
161
+ "34": 61,
162
+ "76": 62,
163
+ "33": 63,
164
+ "75": 64,
165
+ "77": 65,
166
+ "32": 66,
167
+ "74": 67,
168
+ "35": 68,
169
+ "36": 69,
170
+ "38": 70,
171
+ "72": 71,
172
+ "39": 72,
173
+ "71": 73,
174
+ "73": 74,
175
+ "37": 75,
176
+ "40": 76,
177
+ "69": 77,
178
+ "70": 78
179
  },
180
  "merges": [
 
 
 
 
181
  [
182
  "9",
183
  "9"
184
  ],
185
  [
186
  "1",
187
+ "0"
188
  ],
189
  [
190
  "9",
 
192
  ],
193
  [
194
  "1",
195
+ "1"
196
  ],
197
  [
198
  "9",
 
200
  ],
201
  [
202
  "1",
203
+ "2"
204
  ],
205
  [
206
  "9",
 
208
  ],
209
  [
210
  "1",
211
+ "3"
212
  ],
213
  [
214
  "9",
215
  "5"
216
  ],
217
  [
218
+ "1",
219
  "4"
220
  ],
221
  [
222
  "1",
223
  "5"
224
  ],
225
+ [
226
+ "9",
227
+ "4"
228
+ ],
229
  [
230
  "9",
231
  "3"
 
258
  "9",
259
  "0"
260
  ],
 
 
 
 
261
  [
262
  "8",
263
  "9"
264
  ],
265
  [
266
+ "2",
267
+ "0"
268
  ],
269
  [
270
  "2",
 
272
  ],
273
  [
274
  "8",
275
+ "8"
276
  ],
277
  [
278
+ "8",
279
+ "7"
280
  ],
281
  [
282
  "2",
283
+ "2"
284
  ],
285
  [
286
  "8",
287
  "6"
288
  ],
289
+ [
290
+ "2",
291
+ "3"
292
+ ],
293
  [
294
  "8",
295
  "5"
 
321
  [
322
  "8",
323
  "2"
324
+ ],
325
+ [
326
+ "8",
327
+ "1"
328
+ ],
329
+ [
330
+ "2",
331
+ "8"
332
+ ],
333
+ [
334
+ "2",
335
+ "9"
336
+ ],
337
+ [
338
+ "8",
339
+ "0"
340
+ ],
341
+ [
342
+ "3",
343
+ "0"
344
+ ],
345
+ [
346
+ "7",
347
+ "9"
348
+ ],
349
+ [
350
+ "7",
351
+ "8"
352
+ ],
353
+ [
354
+ "3",
355
+ "1"
356
+ ],
357
+ [
358
+ "3",
359
+ "4"
360
+ ],
361
+ [
362
+ "7",
363
+ "6"
364
+ ],
365
+ [
366
+ "3",
367
+ "3"
368
+ ],
369
+ [
370
+ "7",
371
+ "5"
372
+ ],
373
+ [
374
+ "7",
375
+ "7"
376
+ ],
377
+ [
378
+ "3",
379
+ "2"
380
+ ],
381
+ [
382
+ "7",
383
+ "4"
384
+ ],
385
+ [
386
+ "3",
387
+ "5"
388
+ ],
389
+ [
390
+ "3",
391
+ "6"
392
+ ],
393
+ [
394
+ "3",
395
+ "8"
396
+ ],
397
+ [
398
+ "7",
399
+ "2"
400
+ ],
401
+ [
402
+ "3",
403
+ "9"
404
+ ],
405
+ [
406
+ "7",
407
+ "1"
408
+ ],
409
+ [
410
+ "7",
411
+ "3"
412
+ ],
413
+ [
414
+ "3",
415
+ "7"
416
+ ],
417
+ [
418
+ "4",
419
+ "0"
420
+ ],
421
+ [
422
+ "6",
423
+ "9"
424
+ ],
425
+ [
426
+ "7",
427
+ "0"
428
  ]
429
  ]
430
  }
tokenizer_config.json CHANGED
@@ -1,8 +1,43 @@
1
  {
2
- "backend": "tokenizers",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  "cls_token": "[CLS]",
4
  "eos_token": "[EOS]",
 
5
  "model_max_length": 1000000000000000019884624838656,
6
  "pad_token": "[PAD]",
7
- "tokenizer_class": "TokenizersBackend"
8
  }
 
1
  {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[EOS]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[PAD]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "clean_up_tokenization_spaces": false,
37
  "cls_token": "[CLS]",
38
  "eos_token": "[EOS]",
39
+ "extra_special_tokens": {},
40
  "model_max_length": 1000000000000000019884624838656,
41
  "pad_token": "[PAD]",
42
+ "tokenizer_class": "PreTrainedTokenizerFast"
43
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:64e9595e750a35613569607f993afec0f1b3799ad4045f502f7167862c781a8a
3
- size 5329
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dede89be1cd24623150d5bbe219fdf1f9af0bc7b374512bb168ffecdfa31eb58
3
+ size 6033