Mehd212 committed on
Commit
8b50ad3
·
verified ·
1 Parent(s): 2e3d1db

Use bi-encoder FAISS top-K retrieval to train cross-encoder with hard negatives

Browse files
checkpoint-3696/config.json DELETED
@@ -1,28 +0,0 @@
1
- {
2
- "architectures": [
3
- "CamembertForSequenceClassification"
4
- ],
5
- "attention_probs_dropout_prob": 0.1,
6
- "bos_token_id": 5,
7
- "classifier_dropout": null,
8
- "dtype": "float32",
9
- "eos_token_id": 6,
10
- "hidden_act": "gelu",
11
- "hidden_dropout_prob": 0.1,
12
- "hidden_size": 768,
13
- "initializer_range": 0.02,
14
- "intermediate_size": 3072,
15
- "layer_norm_eps": 1e-05,
16
- "max_position_embeddings": 514,
17
- "model_type": "camembert",
18
- "num_attention_heads": 12,
19
- "num_hidden_layers": 12,
20
- "output_past": true,
21
- "pad_token_id": 1,
22
- "position_embedding_type": "absolute",
23
- "problem_type": "single_label_classification",
24
- "transformers_version": "4.57.3",
25
- "type_vocab_size": 1,
26
- "use_cache": true,
27
- "vocab_size": 32005
28
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-3696/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:30438456f15a191a07f2afbfbc4a691c12406b1aebc1a73e3167a45e3be3387c
3
- size 442518104
 
 
 
 
checkpoint-3696/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6154f15e03e2a66ab7ac5b9c342d955a771c7a394f5d1197e58343bef20c8c57
3
- size 885159307
 
 
 
 
checkpoint-3696/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:90edcc18339bcab188fe886c7de8ddb156e19ba42815ef3e77d0d587c505e977
3
- size 14645
 
 
 
 
checkpoint-3696/scaler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:42d95b7b22877f7a7369647179abfb3ef136059915c0ffc1326249b0c778809b
3
- size 1383
 
 
 
 
checkpoint-3696/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6edcf01ce506662fd12ef7f2765fef0277805c6d3c3105814529925f34924e15
3
- size 1465
 
 
 
 
checkpoint-3696/special_tokens_map.json DELETED
@@ -1,56 +0,0 @@
1
- {
2
- "additional_special_tokens": [
3
- "<s>NOTUSED",
4
- "</s>NOTUSED",
5
- "<unk>NOTUSED"
6
- ],
7
- "bos_token": {
8
- "content": "<s>",
9
- "lstrip": false,
10
- "normalized": false,
11
- "rstrip": false,
12
- "single_word": false
13
- },
14
- "cls_token": {
15
- "content": "<s>",
16
- "lstrip": false,
17
- "normalized": false,
18
- "rstrip": false,
19
- "single_word": false
20
- },
21
- "eos_token": {
22
- "content": "</s>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false
27
- },
28
- "mask_token": {
29
- "content": "<mask>",
30
- "lstrip": true,
31
- "normalized": false,
32
- "rstrip": false,
33
- "single_word": false
34
- },
35
- "pad_token": {
36
- "content": "<pad>",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false
41
- },
42
- "sep_token": {
43
- "content": "</s>",
44
- "lstrip": false,
45
- "normalized": false,
46
- "rstrip": false,
47
- "single_word": false
48
- },
49
- "unk_token": {
50
- "content": "<unk>",
51
- "lstrip": false,
52
- "normalized": false,
53
- "rstrip": false,
54
- "single_word": false
55
- }
56
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-3696/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-3696/tokenizer_config.json DELETED
@@ -1,84 +0,0 @@
1
- {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "<s>NOTUSED",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "1": {
12
- "content": "<pad>",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "2": {
20
- "content": "</s>NOTUSED",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "3": {
28
- "content": "<unk>",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- },
35
- "4": {
36
- "content": "<unk>NOTUSED",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false,
41
- "special": true
42
- },
43
- "5": {
44
- "content": "<s>",
45
- "lstrip": false,
46
- "normalized": false,
47
- "rstrip": false,
48
- "single_word": false,
49
- "special": true
50
- },
51
- "6": {
52
- "content": "</s>",
53
- "lstrip": false,
54
- "normalized": false,
55
- "rstrip": false,
56
- "single_word": false,
57
- "special": true
58
- },
59
- "32004": {
60
- "content": "<mask>",
61
- "lstrip": true,
62
- "normalized": false,
63
- "rstrip": false,
64
- "single_word": false,
65
- "special": true
66
- }
67
- },
68
- "additional_special_tokens": [
69
- "<s>NOTUSED",
70
- "</s>NOTUSED",
71
- "<unk>NOTUSED"
72
- ],
73
- "bos_token": "<s>",
74
- "clean_up_tokenization_spaces": true,
75
- "cls_token": "<s>",
76
- "eos_token": "</s>",
77
- "extra_special_tokens": {},
78
- "mask_token": "<mask>",
79
- "model_max_length": 512,
80
- "pad_token": "<pad>",
81
- "sep_token": "</s>",
82
- "tokenizer_class": "CamembertTokenizer",
83
- "unk_token": "<unk>"
84
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-3696/trainer_state.json DELETED
@@ -1,617 +0,0 @@
1
- {
2
- "best_global_step": 3696,
3
- "best_metric": 0.9914414414414414,
4
- "best_model_checkpoint": "../models/camembert-bio-morpho-cross-encoder/checkpoint-3696",
5
- "epoch": 7.0,
6
- "eval_steps": 500,
7
- "global_step": 3696,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.0946969696969697,
14
- "grad_norm": 1.7212085723876953,
15
- "learning_rate": 9.280303030303031e-07,
16
- "loss": 0.6819,
17
- "step": 50
18
- },
19
- {
20
- "epoch": 0.1893939393939394,
21
- "grad_norm": 0.8152473568916321,
22
- "learning_rate": 1.8750000000000003e-06,
23
- "loss": 0.615,
24
- "step": 100
25
- },
26
- {
27
- "epoch": 0.2840909090909091,
28
- "grad_norm": 0.8320032954216003,
29
- "learning_rate": 2.8219696969696973e-06,
30
- "loss": 0.5181,
31
- "step": 150
32
- },
33
- {
34
- "epoch": 0.3787878787878788,
35
- "grad_norm": 1.74077570438385,
36
- "learning_rate": 3.7689393939393944e-06,
37
- "loss": 0.412,
38
- "step": 200
39
- },
40
- {
41
- "epoch": 0.4734848484848485,
42
- "grad_norm": 3.2630276679992676,
43
- "learning_rate": 4.715909090909091e-06,
44
- "loss": 0.2989,
45
- "step": 250
46
- },
47
- {
48
- "epoch": 0.5681818181818182,
49
- "grad_norm": 5.9127020835876465,
50
- "learning_rate": 5.662878787878788e-06,
51
- "loss": 0.2506,
52
- "step": 300
53
- },
54
- {
55
- "epoch": 0.6628787878787878,
56
- "grad_norm": 4.532700538635254,
57
- "learning_rate": 6.6098484848484855e-06,
58
- "loss": 0.2167,
59
- "step": 350
60
- },
61
- {
62
- "epoch": 0.7575757575757576,
63
- "grad_norm": 2.2779574394226074,
64
- "learning_rate": 7.556818181818183e-06,
65
- "loss": 0.2103,
66
- "step": 400
67
- },
68
- {
69
- "epoch": 0.8522727272727273,
70
- "grad_norm": 3.8016679286956787,
71
- "learning_rate": 8.50378787878788e-06,
72
- "loss": 0.1825,
73
- "step": 450
74
- },
75
- {
76
- "epoch": 0.946969696969697,
77
- "grad_norm": 2.031386375427246,
78
- "learning_rate": 9.450757575757576e-06,
79
- "loss": 0.1771,
80
- "step": 500
81
- },
82
- {
83
- "epoch": 1.0,
84
- "eval_accuracy": 0.9608108108108108,
85
- "eval_loss": 0.14045676589012146,
86
- "eval_runtime": 0.9313,
87
- "eval_samples_per_second": 2383.741,
88
- "eval_steps_per_second": 7.516,
89
- "step": 528
90
- },
91
- {
92
- "epoch": 1.0416666666666667,
93
- "grad_norm": 4.0610175132751465,
94
- "learning_rate": 1.0397727272727275e-05,
95
- "loss": 0.1615,
96
- "step": 550
97
- },
98
- {
99
- "epoch": 1.1363636363636362,
100
- "grad_norm": 1.9611369371414185,
101
- "learning_rate": 1.1344696969696971e-05,
102
- "loss": 0.1433,
103
- "step": 600
104
- },
105
- {
106
- "epoch": 1.231060606060606,
107
- "grad_norm": 6.061194896697998,
108
- "learning_rate": 1.2291666666666668e-05,
109
- "loss": 0.1517,
110
- "step": 650
111
- },
112
- {
113
- "epoch": 1.3257575757575757,
114
- "grad_norm": 2.7738304138183594,
115
- "learning_rate": 1.3238636363636366e-05,
116
- "loss": 0.1496,
117
- "step": 700
118
- },
119
- {
120
- "epoch": 1.4204545454545454,
121
- "grad_norm": 2.841794967651367,
122
- "learning_rate": 1.4185606060606061e-05,
123
- "loss": 0.1275,
124
- "step": 750
125
- },
126
- {
127
- "epoch": 1.5151515151515151,
128
- "grad_norm": 5.296891689300537,
129
- "learning_rate": 1.5132575757575758e-05,
130
- "loss": 0.1398,
131
- "step": 800
132
- },
133
- {
134
- "epoch": 1.6098484848484849,
135
- "grad_norm": 2.8792037963867188,
136
- "learning_rate": 1.6079545454545456e-05,
137
- "loss": 0.1062,
138
- "step": 850
139
- },
140
- {
141
- "epoch": 1.7045454545454546,
142
- "grad_norm": 7.044574737548828,
143
- "learning_rate": 1.7026515151515154e-05,
144
- "loss": 0.1209,
145
- "step": 900
146
- },
147
- {
148
- "epoch": 1.7992424242424243,
149
- "grad_norm": 4.128571033477783,
150
- "learning_rate": 1.797348484848485e-05,
151
- "loss": 0.1019,
152
- "step": 950
153
- },
154
- {
155
- "epoch": 1.893939393939394,
156
- "grad_norm": 3.093858242034912,
157
- "learning_rate": 1.8920454545454548e-05,
158
- "loss": 0.0905,
159
- "step": 1000
160
- },
161
- {
162
- "epoch": 1.9886363636363638,
163
- "grad_norm": 1.8322410583496094,
164
- "learning_rate": 1.9867424242424246e-05,
165
- "loss": 0.1046,
166
- "step": 1050
167
- },
168
- {
169
- "epoch": 2.0,
170
- "eval_accuracy": 0.9702702702702702,
171
- "eval_loss": 0.09388745576143265,
172
- "eval_runtime": 0.9328,
173
- "eval_samples_per_second": 2379.885,
174
- "eval_steps_per_second": 7.504,
175
- "step": 1056
176
- },
177
- {
178
- "epoch": 2.0833333333333335,
179
- "grad_norm": 1.9016932249069214,
180
- "learning_rate": 1.999898984854493e-05,
181
- "loss": 0.092,
182
- "step": 1100
183
- },
184
- {
185
- "epoch": 2.178030303030303,
186
- "grad_norm": 3.790273904800415,
187
- "learning_rate": 1.999527514387006e-05,
188
- "loss": 0.0958,
189
- "step": 1150
190
- },
191
- {
192
- "epoch": 2.2727272727272725,
193
- "grad_norm": 1.2855342626571655,
194
- "learning_rate": 1.9988830130412106e-05,
195
- "loss": 0.0775,
196
- "step": 1200
197
- },
198
- {
199
- "epoch": 2.367424242424242,
200
- "grad_norm": 4.88389253616333,
201
- "learning_rate": 1.997965656869057e-05,
202
- "loss": 0.0777,
203
- "step": 1250
204
- },
205
- {
206
- "epoch": 2.462121212121212,
207
- "grad_norm": 4.471704483032227,
208
- "learning_rate": 1.9967756964555044e-05,
209
- "loss": 0.0881,
210
- "step": 1300
211
- },
212
- {
213
- "epoch": 2.5568181818181817,
214
- "grad_norm": 4.597264289855957,
215
- "learning_rate": 1.995313456850071e-05,
216
- "loss": 0.0722,
217
- "step": 1350
218
- },
219
- {
220
- "epoch": 2.6515151515151514,
221
- "grad_norm": 1.7858144044876099,
222
- "learning_rate": 1.9935793374780435e-05,
223
- "loss": 0.0823,
224
- "step": 1400
225
- },
226
- {
227
- "epoch": 2.746212121212121,
228
- "grad_norm": 3.1771280765533447,
229
- "learning_rate": 1.991573812031369e-05,
230
- "loss": 0.0619,
231
- "step": 1450
232
- },
233
- {
234
- "epoch": 2.840909090909091,
235
- "grad_norm": 3.0146450996398926,
236
- "learning_rate": 1.989297428339264e-05,
237
- "loss": 0.0722,
238
- "step": 1500
239
- },
240
- {
241
- "epoch": 2.9356060606060606,
242
- "grad_norm": 1.1221269369125366,
243
- "learning_rate": 1.9867508082185663e-05,
244
- "loss": 0.071,
245
- "step": 1550
246
- },
247
- {
248
- "epoch": 3.0,
249
- "eval_accuracy": 0.9837837837837838,
250
- "eval_loss": 0.05881131812930107,
251
- "eval_runtime": 0.9401,
252
- "eval_samples_per_second": 2361.381,
253
- "eval_steps_per_second": 7.446,
254
- "step": 1584
255
- },
256
- {
257
- "epoch": 3.0303030303030303,
258
- "grad_norm": 0.7528719305992126,
259
- "learning_rate": 1.9839346473038815e-05,
260
- "loss": 0.0676,
261
- "step": 1600
262
- },
263
- {
264
- "epoch": 3.125,
265
- "grad_norm": 0.5066124200820923,
266
- "learning_rate": 1.980849714857563e-05,
267
- "loss": 0.0495,
268
- "step": 1650
269
- },
270
- {
271
- "epoch": 3.2196969696969697,
272
- "grad_norm": 5.64263391494751,
273
- "learning_rate": 1.9774968535595808e-05,
274
- "loss": 0.0626,
275
- "step": 1700
276
- },
277
- {
278
- "epoch": 3.3143939393939394,
279
- "grad_norm": 1.1057571172714233,
280
- "learning_rate": 1.9738769792773338e-05,
281
- "loss": 0.0611,
282
- "step": 1750
283
- },
284
- {
285
- "epoch": 3.409090909090909,
286
- "grad_norm": 0.09228092432022095,
287
- "learning_rate": 1.9699910808154726e-05,
288
- "loss": 0.0576,
289
- "step": 1800
290
- },
291
- {
292
- "epoch": 3.503787878787879,
293
- "grad_norm": 3.1624414920806885,
294
- "learning_rate": 1.965840219645797e-05,
295
- "loss": 0.0575,
296
- "step": 1850
297
- },
298
- {
299
- "epoch": 3.5984848484848486,
300
- "grad_norm": 4.033729553222656,
301
- "learning_rate": 1.961425529617306e-05,
302
- "loss": 0.0656,
303
- "step": 1900
304
- },
305
- {
306
- "epoch": 3.6931818181818183,
307
- "grad_norm": 3.7267255783081055,
308
- "learning_rate": 1.956748216646473e-05,
309
- "loss": 0.0594,
310
- "step": 1950
311
- },
312
- {
313
- "epoch": 3.787878787878788,
314
- "grad_norm": 2.7791688442230225,
315
- "learning_rate": 1.9518095583878406e-05,
316
- "loss": 0.054,
317
- "step": 2000
318
- },
319
- {
320
- "epoch": 3.882575757575758,
321
- "grad_norm": 0.10295089334249496,
322
- "learning_rate": 1.946610903885014e-05,
323
- "loss": 0.04,
324
- "step": 2050
325
- },
326
- {
327
- "epoch": 3.9772727272727275,
328
- "grad_norm": 1.05549156665802,
329
- "learning_rate": 1.941153673202158e-05,
330
- "loss": 0.0441,
331
- "step": 2100
332
- },
333
- {
334
- "epoch": 4.0,
335
- "eval_accuracy": 0.9864864864864865,
336
- "eval_loss": 0.056456033140420914,
337
- "eval_runtime": 0.9408,
338
- "eval_samples_per_second": 2359.707,
339
- "eval_steps_per_second": 7.441,
340
- "step": 2112
341
- },
342
- {
343
- "epoch": 4.071969696969697,
344
- "grad_norm": 2.183342933654785,
345
- "learning_rate": 1.9354393570360924e-05,
346
- "loss": 0.0449,
347
- "step": 2150
348
- },
349
- {
350
- "epoch": 4.166666666666667,
351
- "grad_norm": 3.2201144695281982,
352
- "learning_rate": 1.929469516309092e-05,
353
- "loss": 0.0443,
354
- "step": 2200
355
- },
356
- {
357
- "epoch": 4.261363636363637,
358
- "grad_norm": 2.585134744644165,
359
- "learning_rate": 1.9232457817425058e-05,
360
- "loss": 0.0378,
361
- "step": 2250
362
- },
363
- {
364
- "epoch": 4.356060606060606,
365
- "grad_norm": 0.22277259826660156,
366
- "learning_rate": 1.9167698534113105e-05,
367
- "loss": 0.0418,
368
- "step": 2300
369
- },
370
- {
371
- "epoch": 4.450757575757576,
372
- "grad_norm": 1.6783980131149292,
373
- "learning_rate": 1.910043500279716e-05,
374
- "loss": 0.0357,
375
- "step": 2350
376
- },
377
- {
378
- "epoch": 4.545454545454545,
379
- "grad_norm": 0.4771471619606018,
380
- "learning_rate": 1.903068559717957e-05,
381
- "loss": 0.0345,
382
- "step": 2400
383
- },
384
- {
385
- "epoch": 4.640151515151516,
386
- "grad_norm": 0.10980956256389618,
387
- "learning_rate": 1.8958469370003954e-05,
388
- "loss": 0.026,
389
- "step": 2450
390
- },
391
- {
392
- "epoch": 4.734848484848484,
393
- "grad_norm": 0.514388918876648,
394
- "learning_rate": 1.8883806047850772e-05,
395
- "loss": 0.0425,
396
- "step": 2500
397
- },
398
- {
399
- "epoch": 4.829545454545455,
400
- "grad_norm": 2.0213735103607178,
401
- "learning_rate": 1.8806716025748813e-05,
402
- "loss": 0.0321,
403
- "step": 2550
404
- },
405
- {
406
- "epoch": 4.924242424242424,
407
- "grad_norm": 2.96073842048645,
408
- "learning_rate": 1.872722036160407e-05,
409
- "loss": 0.0324,
410
- "step": 2600
411
- },
412
- {
413
- "epoch": 5.0,
414
- "eval_accuracy": 0.9891891891891892,
415
- "eval_loss": 0.039831362664699554,
416
- "eval_runtime": 0.9492,
417
- "eval_samples_per_second": 2338.807,
418
- "eval_steps_per_second": 7.375,
419
- "step": 2640
420
- },
421
- {
422
- "epoch": 5.018939393939394,
423
- "grad_norm": 5.567176342010498,
424
- "learning_rate": 1.8645340770447595e-05,
425
- "loss": 0.0448,
426
- "step": 2650
427
- },
428
- {
429
- "epoch": 5.113636363636363,
430
- "grad_norm": 0.33689576387405396,
431
- "learning_rate": 1.8561099618503785e-05,
432
- "loss": 0.0398,
433
- "step": 2700
434
- },
435
- {
436
- "epoch": 5.208333333333333,
437
- "grad_norm": 3.0077149868011475,
438
- "learning_rate": 1.8474519917080867e-05,
439
- "loss": 0.0234,
440
- "step": 2750
441
- },
442
- {
443
- "epoch": 5.303030303030303,
444
- "grad_norm": 0.3385215103626251,
445
- "learning_rate": 1.8385625316285095e-05,
446
- "loss": 0.03,
447
- "step": 2800
448
- },
449
- {
450
- "epoch": 5.3977272727272725,
451
- "grad_norm": 3.120093584060669,
452
- "learning_rate": 1.8294440098560508e-05,
453
- "loss": 0.0259,
454
- "step": 2850
455
- },
456
- {
457
- "epoch": 5.492424242424242,
458
- "grad_norm": 5.4590864181518555,
459
- "learning_rate": 1.8200989172055926e-05,
460
- "loss": 0.027,
461
- "step": 2900
462
- },
463
- {
464
- "epoch": 5.587121212121212,
465
- "grad_norm": 0.14214850962162018,
466
- "learning_rate": 1.8105298063821065e-05,
467
- "loss": 0.0396,
468
- "step": 2950
469
- },
470
- {
471
- "epoch": 5.681818181818182,
472
- "grad_norm": 5.496220111846924,
473
- "learning_rate": 1.8007392912833534e-05,
474
- "loss": 0.0386,
475
- "step": 3000
476
- },
477
- {
478
- "epoch": 5.776515151515151,
479
- "grad_norm": 2.2691736221313477,
480
- "learning_rate": 1.7907300462858752e-05,
481
- "loss": 0.0288,
482
- "step": 3050
483
- },
484
- {
485
- "epoch": 5.871212121212121,
486
- "grad_norm": 0.3925817608833313,
487
- "learning_rate": 1.7805048055144584e-05,
488
- "loss": 0.0305,
489
- "step": 3100
490
- },
491
- {
492
- "epoch": 5.965909090909091,
493
- "grad_norm": 0.03450781852006912,
494
- "learning_rate": 1.7700663620952844e-05,
495
- "loss": 0.0234,
496
- "step": 3150
497
- },
498
- {
499
- "epoch": 6.0,
500
- "eval_accuracy": 0.9882882882882883,
501
- "eval_loss": 0.0493258498609066,
502
- "eval_runtime": 0.9634,
503
- "eval_samples_per_second": 2304.357,
504
- "eval_steps_per_second": 7.266,
505
- "step": 3168
506
- },
507
- {
508
- "epoch": 6.0606060606060606,
509
- "grad_norm": 0.05821343883872032,
510
- "learning_rate": 1.7594175673929564e-05,
511
- "loss": 0.0146,
512
- "step": 3200
513
- },
514
- {
515
- "epoch": 6.15530303030303,
516
- "grad_norm": 0.04762452840805054,
517
- "learning_rate": 1.7485613302316226e-05,
518
- "loss": 0.02,
519
- "step": 3250
520
- },
521
- {
522
- "epoch": 6.25,
523
- "grad_norm": 0.05227584019303322,
524
- "learning_rate": 1.7375006161004018e-05,
525
- "loss": 0.0117,
526
- "step": 3300
527
- },
528
- {
529
- "epoch": 6.34469696969697,
530
- "grad_norm": 6.182316780090332,
531
- "learning_rate": 1.7262384463433286e-05,
532
- "loss": 0.0312,
533
- "step": 3350
534
- },
535
- {
536
- "epoch": 6.4393939393939394,
537
- "grad_norm": 2.6263253688812256,
538
- "learning_rate": 1.7147778973340466e-05,
539
- "loss": 0.0273,
540
- "step": 3400
541
- },
542
- {
543
- "epoch": 6.534090909090909,
544
- "grad_norm": 4.184931755065918,
545
- "learning_rate": 1.703122099635463e-05,
546
- "loss": 0.0339,
547
- "step": 3450
548
- },
549
- {
550
- "epoch": 6.628787878787879,
551
- "grad_norm": 4.779547214508057,
552
- "learning_rate": 1.6912742371446068e-05,
553
- "loss": 0.0187,
554
- "step": 3500
555
- },
556
- {
557
- "epoch": 6.723484848484849,
558
- "grad_norm": 4.305413246154785,
559
- "learning_rate": 1.6792375462229132e-05,
560
- "loss": 0.0288,
561
- "step": 3550
562
- },
563
- {
564
- "epoch": 6.818181818181818,
565
- "grad_norm": 0.8370099067687988,
566
- "learning_rate": 1.6670153148121834e-05,
567
- "loss": 0.022,
568
- "step": 3600
569
- },
570
- {
571
- "epoch": 6.912878787878788,
572
- "grad_norm": 0.042217787355184555,
573
- "learning_rate": 1.6546108815364448e-05,
574
- "loss": 0.0165,
575
- "step": 3650
576
- },
577
- {
578
- "epoch": 7.0,
579
- "eval_accuracy": 0.9914414414414414,
580
- "eval_loss": 0.042690977454185486,
581
- "eval_runtime": 1.1165,
582
- "eval_samples_per_second": 1988.281,
583
- "eval_steps_per_second": 6.269,
584
- "step": 3696
585
- }
586
- ],
587
- "logging_steps": 50,
588
- "max_steps": 10560,
589
- "num_input_tokens_seen": 0,
590
- "num_train_epochs": 20,
591
- "save_steps": 500,
592
- "stateful_callbacks": {
593
- "EarlyStoppingCallback": {
594
- "args": {
595
- "early_stopping_patience": 2,
596
- "early_stopping_threshold": 0.0
597
- },
598
- "attributes": {
599
- "early_stopping_patience_counter": 0
600
- }
601
- },
602
- "TrainerControl": {
603
- "args": {
604
- "should_epoch_stop": false,
605
- "should_evaluate": false,
606
- "should_log": false,
607
- "should_save": true,
608
- "should_training_stop": false
609
- },
610
- "attributes": {}
611
- }
612
- },
613
- "total_flos": 7628165050320000.0,
614
- "train_batch_size": 80,
615
- "trial_name": null,
616
- "trial_params": null
617
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-3696/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:03e483228a10899db361a89a78f6d4c066e2522650debdb0af61e02a9f9faa73
3
- size 5905
 
 
 
 
checkpoint-4224/config.json DELETED
@@ -1,28 +0,0 @@
1
- {
2
- "architectures": [
3
- "CamembertForSequenceClassification"
4
- ],
5
- "attention_probs_dropout_prob": 0.1,
6
- "bos_token_id": 5,
7
- "classifier_dropout": null,
8
- "dtype": "float32",
9
- "eos_token_id": 6,
10
- "hidden_act": "gelu",
11
- "hidden_dropout_prob": 0.1,
12
- "hidden_size": 768,
13
- "initializer_range": 0.02,
14
- "intermediate_size": 3072,
15
- "layer_norm_eps": 1e-05,
16
- "max_position_embeddings": 514,
17
- "model_type": "camembert",
18
- "num_attention_heads": 12,
19
- "num_hidden_layers": 12,
20
- "output_past": true,
21
- "pad_token_id": 1,
22
- "position_embedding_type": "absolute",
23
- "problem_type": "single_label_classification",
24
- "transformers_version": "4.57.3",
25
- "type_vocab_size": 1,
26
- "use_cache": true,
27
- "vocab_size": 32005
28
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-4224/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:195734aa47d7048bc0b5ce2aa08bc41782501331182aca59d8fb149370e42d51
3
- size 442518104
 
 
 
 
checkpoint-4224/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a1ff0a7e9da896c796d85a38b00a897c7c64860aa1ba012ea2abdee677ce762
3
- size 885159307
 
 
 
 
checkpoint-4224/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f6ce09fab811a03946d54140f022dcb0f2d669094f78143026b34a3d4a9f00e
3
- size 14645
 
 
 
 
checkpoint-4224/scaler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f1fdbb2b1c1114fc2b99a720d34948dc2aa42cd71b7fa5e0643e0738375279b
3
- size 1383
 
 
 
 
checkpoint-4224/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:dade6b4bafabb9222857aa03dfb48e1b30b2282bd87341bd8220c43dc0086cdf
3
- size 1465
 
 
 
 
checkpoint-4224/special_tokens_map.json DELETED
@@ -1,56 +0,0 @@
1
- {
2
- "additional_special_tokens": [
3
- "<s>NOTUSED",
4
- "</s>NOTUSED",
5
- "<unk>NOTUSED"
6
- ],
7
- "bos_token": {
8
- "content": "<s>",
9
- "lstrip": false,
10
- "normalized": false,
11
- "rstrip": false,
12
- "single_word": false
13
- },
14
- "cls_token": {
15
- "content": "<s>",
16
- "lstrip": false,
17
- "normalized": false,
18
- "rstrip": false,
19
- "single_word": false
20
- },
21
- "eos_token": {
22
- "content": "</s>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false
27
- },
28
- "mask_token": {
29
- "content": "<mask>",
30
- "lstrip": true,
31
- "normalized": false,
32
- "rstrip": false,
33
- "single_word": false
34
- },
35
- "pad_token": {
36
- "content": "<pad>",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false
41
- },
42
- "sep_token": {
43
- "content": "</s>",
44
- "lstrip": false,
45
- "normalized": false,
46
- "rstrip": false,
47
- "single_word": false
48
- },
49
- "unk_token": {
50
- "content": "<unk>",
51
- "lstrip": false,
52
- "normalized": false,
53
- "rstrip": false,
54
- "single_word": false
55
- }
56
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-4224/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-4224/tokenizer_config.json DELETED
@@ -1,84 +0,0 @@
1
- {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "<s>NOTUSED",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "1": {
12
- "content": "<pad>",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "2": {
20
- "content": "</s>NOTUSED",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "3": {
28
- "content": "<unk>",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- },
35
- "4": {
36
- "content": "<unk>NOTUSED",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false,
41
- "special": true
42
- },
43
- "5": {
44
- "content": "<s>",
45
- "lstrip": false,
46
- "normalized": false,
47
- "rstrip": false,
48
- "single_word": false,
49
- "special": true
50
- },
51
- "6": {
52
- "content": "</s>",
53
- "lstrip": false,
54
- "normalized": false,
55
- "rstrip": false,
56
- "single_word": false,
57
- "special": true
58
- },
59
- "32004": {
60
- "content": "<mask>",
61
- "lstrip": true,
62
- "normalized": false,
63
- "rstrip": false,
64
- "single_word": false,
65
- "special": true
66
- }
67
- },
68
- "additional_special_tokens": [
69
- "<s>NOTUSED",
70
- "</s>NOTUSED",
71
- "<unk>NOTUSED"
72
- ],
73
- "bos_token": "<s>",
74
- "clean_up_tokenization_spaces": true,
75
- "cls_token": "<s>",
76
- "eos_token": "</s>",
77
- "extra_special_tokens": {},
78
- "mask_token": "<mask>",
79
- "model_max_length": 512,
80
- "pad_token": "<pad>",
81
- "sep_token": "</s>",
82
- "tokenizer_class": "CamembertTokenizer",
83
- "unk_token": "<unk>"
84
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-4224/trainer_state.json DELETED
@@ -1,703 +0,0 @@
1
- {
2
- "best_global_step": 3696,
3
- "best_metric": 0.9914414414414414,
4
- "best_model_checkpoint": "../models/camembert-bio-morpho-cross-encoder/checkpoint-3696",
5
- "epoch": 8.0,
6
- "eval_steps": 500,
7
- "global_step": 4224,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.0946969696969697,
14
- "grad_norm": 1.7212085723876953,
15
- "learning_rate": 9.280303030303031e-07,
16
- "loss": 0.6819,
17
- "step": 50
18
- },
19
- {
20
- "epoch": 0.1893939393939394,
21
- "grad_norm": 0.8152473568916321,
22
- "learning_rate": 1.8750000000000003e-06,
23
- "loss": 0.615,
24
- "step": 100
25
- },
26
- {
27
- "epoch": 0.2840909090909091,
28
- "grad_norm": 0.8320032954216003,
29
- "learning_rate": 2.8219696969696973e-06,
30
- "loss": 0.5181,
31
- "step": 150
32
- },
33
- {
34
- "epoch": 0.3787878787878788,
35
- "grad_norm": 1.74077570438385,
36
- "learning_rate": 3.7689393939393944e-06,
37
- "loss": 0.412,
38
- "step": 200
39
- },
40
- {
41
- "epoch": 0.4734848484848485,
42
- "grad_norm": 3.2630276679992676,
43
- "learning_rate": 4.715909090909091e-06,
44
- "loss": 0.2989,
45
- "step": 250
46
- },
47
- {
48
- "epoch": 0.5681818181818182,
49
- "grad_norm": 5.9127020835876465,
50
- "learning_rate": 5.662878787878788e-06,
51
- "loss": 0.2506,
52
- "step": 300
53
- },
54
- {
55
- "epoch": 0.6628787878787878,
56
- "grad_norm": 4.532700538635254,
57
- "learning_rate": 6.6098484848484855e-06,
58
- "loss": 0.2167,
59
- "step": 350
60
- },
61
- {
62
- "epoch": 0.7575757575757576,
63
- "grad_norm": 2.2779574394226074,
64
- "learning_rate": 7.556818181818183e-06,
65
- "loss": 0.2103,
66
- "step": 400
67
- },
68
- {
69
- "epoch": 0.8522727272727273,
70
- "grad_norm": 3.8016679286956787,
71
- "learning_rate": 8.50378787878788e-06,
72
- "loss": 0.1825,
73
- "step": 450
74
- },
75
- {
76
- "epoch": 0.946969696969697,
77
- "grad_norm": 2.031386375427246,
78
- "learning_rate": 9.450757575757576e-06,
79
- "loss": 0.1771,
80
- "step": 500
81
- },
82
- {
83
- "epoch": 1.0,
84
- "eval_accuracy": 0.9608108108108108,
85
- "eval_loss": 0.14045676589012146,
86
- "eval_runtime": 0.9313,
87
- "eval_samples_per_second": 2383.741,
88
- "eval_steps_per_second": 7.516,
89
- "step": 528
90
- },
91
- {
92
- "epoch": 1.0416666666666667,
93
- "grad_norm": 4.0610175132751465,
94
- "learning_rate": 1.0397727272727275e-05,
95
- "loss": 0.1615,
96
- "step": 550
97
- },
98
- {
99
- "epoch": 1.1363636363636362,
100
- "grad_norm": 1.9611369371414185,
101
- "learning_rate": 1.1344696969696971e-05,
102
- "loss": 0.1433,
103
- "step": 600
104
- },
105
- {
106
- "epoch": 1.231060606060606,
107
- "grad_norm": 6.061194896697998,
108
- "learning_rate": 1.2291666666666668e-05,
109
- "loss": 0.1517,
110
- "step": 650
111
- },
112
- {
113
- "epoch": 1.3257575757575757,
114
- "grad_norm": 2.7738304138183594,
115
- "learning_rate": 1.3238636363636366e-05,
116
- "loss": 0.1496,
117
- "step": 700
118
- },
119
- {
120
- "epoch": 1.4204545454545454,
121
- "grad_norm": 2.841794967651367,
122
- "learning_rate": 1.4185606060606061e-05,
123
- "loss": 0.1275,
124
- "step": 750
125
- },
126
- {
127
- "epoch": 1.5151515151515151,
128
- "grad_norm": 5.296891689300537,
129
- "learning_rate": 1.5132575757575758e-05,
130
- "loss": 0.1398,
131
- "step": 800
132
- },
133
- {
134
- "epoch": 1.6098484848484849,
135
- "grad_norm": 2.8792037963867188,
136
- "learning_rate": 1.6079545454545456e-05,
137
- "loss": 0.1062,
138
- "step": 850
139
- },
140
- {
141
- "epoch": 1.7045454545454546,
142
- "grad_norm": 7.044574737548828,
143
- "learning_rate": 1.7026515151515154e-05,
144
- "loss": 0.1209,
145
- "step": 900
146
- },
147
- {
148
- "epoch": 1.7992424242424243,
149
- "grad_norm": 4.128571033477783,
150
- "learning_rate": 1.797348484848485e-05,
151
- "loss": 0.1019,
152
- "step": 950
153
- },
154
- {
155
- "epoch": 1.893939393939394,
156
- "grad_norm": 3.093858242034912,
157
- "learning_rate": 1.8920454545454548e-05,
158
- "loss": 0.0905,
159
- "step": 1000
160
- },
161
- {
162
- "epoch": 1.9886363636363638,
163
- "grad_norm": 1.8322410583496094,
164
- "learning_rate": 1.9867424242424246e-05,
165
- "loss": 0.1046,
166
- "step": 1050
167
- },
168
- {
169
- "epoch": 2.0,
170
- "eval_accuracy": 0.9702702702702702,
171
- "eval_loss": 0.09388745576143265,
172
- "eval_runtime": 0.9328,
173
- "eval_samples_per_second": 2379.885,
174
- "eval_steps_per_second": 7.504,
175
- "step": 1056
176
- },
177
- {
178
- "epoch": 2.0833333333333335,
179
- "grad_norm": 1.9016932249069214,
180
- "learning_rate": 1.999898984854493e-05,
181
- "loss": 0.092,
182
- "step": 1100
183
- },
184
- {
185
- "epoch": 2.178030303030303,
186
- "grad_norm": 3.790273904800415,
187
- "learning_rate": 1.999527514387006e-05,
188
- "loss": 0.0958,
189
- "step": 1150
190
- },
191
- {
192
- "epoch": 2.2727272727272725,
193
- "grad_norm": 1.2855342626571655,
194
- "learning_rate": 1.9988830130412106e-05,
195
- "loss": 0.0775,
196
- "step": 1200
197
- },
198
- {
199
- "epoch": 2.367424242424242,
200
- "grad_norm": 4.88389253616333,
201
- "learning_rate": 1.997965656869057e-05,
202
- "loss": 0.0777,
203
- "step": 1250
204
- },
205
- {
206
- "epoch": 2.462121212121212,
207
- "grad_norm": 4.471704483032227,
208
- "learning_rate": 1.9967756964555044e-05,
209
- "loss": 0.0881,
210
- "step": 1300
211
- },
212
- {
213
- "epoch": 2.5568181818181817,
214
- "grad_norm": 4.597264289855957,
215
- "learning_rate": 1.995313456850071e-05,
216
- "loss": 0.0722,
217
- "step": 1350
218
- },
219
- {
220
- "epoch": 2.6515151515151514,
221
- "grad_norm": 1.7858144044876099,
222
- "learning_rate": 1.9935793374780435e-05,
223
- "loss": 0.0823,
224
- "step": 1400
225
- },
226
- {
227
- "epoch": 2.746212121212121,
228
- "grad_norm": 3.1771280765533447,
229
- "learning_rate": 1.991573812031369e-05,
230
- "loss": 0.0619,
231
- "step": 1450
232
- },
233
- {
234
- "epoch": 2.840909090909091,
235
- "grad_norm": 3.0146450996398926,
236
- "learning_rate": 1.989297428339264e-05,
237
- "loss": 0.0722,
238
- "step": 1500
239
- },
240
- {
241
- "epoch": 2.9356060606060606,
242
- "grad_norm": 1.1221269369125366,
243
- "learning_rate": 1.9867508082185663e-05,
244
- "loss": 0.071,
245
- "step": 1550
246
- },
247
- {
248
- "epoch": 3.0,
249
- "eval_accuracy": 0.9837837837837838,
250
- "eval_loss": 0.05881131812930107,
251
- "eval_runtime": 0.9401,
252
- "eval_samples_per_second": 2361.381,
253
- "eval_steps_per_second": 7.446,
254
- "step": 1584
255
- },
256
- {
257
- "epoch": 3.0303030303030303,
258
- "grad_norm": 0.7528719305992126,
259
- "learning_rate": 1.9839346473038815e-05,
260
- "loss": 0.0676,
261
- "step": 1600
262
- },
263
- {
264
- "epoch": 3.125,
265
- "grad_norm": 0.5066124200820923,
266
- "learning_rate": 1.980849714857563e-05,
267
- "loss": 0.0495,
268
- "step": 1650
269
- },
270
- {
271
- "epoch": 3.2196969696969697,
272
- "grad_norm": 5.64263391494751,
273
- "learning_rate": 1.9774968535595808e-05,
274
- "loss": 0.0626,
275
- "step": 1700
276
- },
277
- {
278
- "epoch": 3.3143939393939394,
279
- "grad_norm": 1.1057571172714233,
280
- "learning_rate": 1.9738769792773338e-05,
281
- "loss": 0.0611,
282
- "step": 1750
283
- },
284
- {
285
- "epoch": 3.409090909090909,
286
- "grad_norm": 0.09228092432022095,
287
- "learning_rate": 1.9699910808154726e-05,
288
- "loss": 0.0576,
289
- "step": 1800
290
- },
291
- {
292
- "epoch": 3.503787878787879,
293
- "grad_norm": 3.1624414920806885,
294
- "learning_rate": 1.965840219645797e-05,
295
- "loss": 0.0575,
296
- "step": 1850
297
- },
298
- {
299
- "epoch": 3.5984848484848486,
300
- "grad_norm": 4.033729553222656,
301
- "learning_rate": 1.961425529617306e-05,
302
- "loss": 0.0656,
303
- "step": 1900
304
- },
305
- {
306
- "epoch": 3.6931818181818183,
307
- "grad_norm": 3.7267255783081055,
308
- "learning_rate": 1.956748216646473e-05,
309
- "loss": 0.0594,
310
- "step": 1950
311
- },
312
- {
313
- "epoch": 3.787878787878788,
314
- "grad_norm": 2.7791688442230225,
315
- "learning_rate": 1.9518095583878406e-05,
316
- "loss": 0.054,
317
- "step": 2000
318
- },
319
- {
320
- "epoch": 3.882575757575758,
321
- "grad_norm": 0.10295089334249496,
322
- "learning_rate": 1.946610903885014e-05,
323
- "loss": 0.04,
324
- "step": 2050
325
- },
326
- {
327
- "epoch": 3.9772727272727275,
328
- "grad_norm": 1.05549156665802,
329
- "learning_rate": 1.941153673202158e-05,
330
- "loss": 0.0441,
331
- "step": 2100
332
- },
333
- {
334
- "epoch": 4.0,
335
- "eval_accuracy": 0.9864864864864865,
336
- "eval_loss": 0.056456033140420914,
337
- "eval_runtime": 0.9408,
338
- "eval_samples_per_second": 2359.707,
339
- "eval_steps_per_second": 7.441,
340
- "step": 2112
341
- },
342
- {
343
- "epoch": 4.071969696969697,
344
- "grad_norm": 2.183342933654785,
345
- "learning_rate": 1.9354393570360924e-05,
346
- "loss": 0.0449,
347
- "step": 2150
348
- },
349
- {
350
- "epoch": 4.166666666666667,
351
- "grad_norm": 3.2201144695281982,
352
- "learning_rate": 1.929469516309092e-05,
353
- "loss": 0.0443,
354
- "step": 2200
355
- },
356
- {
357
- "epoch": 4.261363636363637,
358
- "grad_norm": 2.585134744644165,
359
- "learning_rate": 1.9232457817425058e-05,
360
- "loss": 0.0378,
361
- "step": 2250
362
- },
363
- {
364
- "epoch": 4.356060606060606,
365
- "grad_norm": 0.22277259826660156,
366
- "learning_rate": 1.9167698534113105e-05,
367
- "loss": 0.0418,
368
- "step": 2300
369
- },
370
- {
371
- "epoch": 4.450757575757576,
372
- "grad_norm": 1.6783980131149292,
373
- "learning_rate": 1.910043500279716e-05,
374
- "loss": 0.0357,
375
- "step": 2350
376
- },
377
- {
378
- "epoch": 4.545454545454545,
379
- "grad_norm": 0.4771471619606018,
380
- "learning_rate": 1.903068559717957e-05,
381
- "loss": 0.0345,
382
- "step": 2400
383
- },
384
- {
385
- "epoch": 4.640151515151516,
386
- "grad_norm": 0.10980956256389618,
387
- "learning_rate": 1.8958469370003954e-05,
388
- "loss": 0.026,
389
- "step": 2450
390
- },
391
- {
392
- "epoch": 4.734848484848484,
393
- "grad_norm": 0.514388918876648,
394
- "learning_rate": 1.8883806047850772e-05,
395
- "loss": 0.0425,
396
- "step": 2500
397
- },
398
- {
399
- "epoch": 4.829545454545455,
400
- "grad_norm": 2.0213735103607178,
401
- "learning_rate": 1.8806716025748813e-05,
402
- "loss": 0.0321,
403
- "step": 2550
404
- },
405
- {
406
- "epoch": 4.924242424242424,
407
- "grad_norm": 2.96073842048645,
408
- "learning_rate": 1.872722036160407e-05,
409
- "loss": 0.0324,
410
- "step": 2600
411
- },
412
- {
413
- "epoch": 5.0,
414
- "eval_accuracy": 0.9891891891891892,
415
- "eval_loss": 0.039831362664699554,
416
- "eval_runtime": 0.9492,
417
- "eval_samples_per_second": 2338.807,
418
- "eval_steps_per_second": 7.375,
419
- "step": 2640
420
- },
421
- {
422
- "epoch": 5.018939393939394,
423
- "grad_norm": 5.567176342010498,
424
- "learning_rate": 1.8645340770447595e-05,
425
- "loss": 0.0448,
426
- "step": 2650
427
- },
428
- {
429
- "epoch": 5.113636363636363,
430
- "grad_norm": 0.33689576387405396,
431
- "learning_rate": 1.8561099618503785e-05,
432
- "loss": 0.0398,
433
- "step": 2700
434
- },
435
- {
436
- "epoch": 5.208333333333333,
437
- "grad_norm": 3.0077149868011475,
438
- "learning_rate": 1.8474519917080867e-05,
439
- "loss": 0.0234,
440
- "step": 2750
441
- },
442
- {
443
- "epoch": 5.303030303030303,
444
- "grad_norm": 0.3385215103626251,
445
- "learning_rate": 1.8385625316285095e-05,
446
- "loss": 0.03,
447
- "step": 2800
448
- },
449
- {
450
- "epoch": 5.3977272727272725,
451
- "grad_norm": 3.120093584060669,
452
- "learning_rate": 1.8294440098560508e-05,
453
- "loss": 0.0259,
454
- "step": 2850
455
- },
456
- {
457
- "epoch": 5.492424242424242,
458
- "grad_norm": 5.4590864181518555,
459
- "learning_rate": 1.8200989172055926e-05,
460
- "loss": 0.027,
461
- "step": 2900
462
- },
463
- {
464
- "epoch": 5.587121212121212,
465
- "grad_norm": 0.14214850962162018,
466
- "learning_rate": 1.8105298063821065e-05,
467
- "loss": 0.0396,
468
- "step": 2950
469
- },
470
- {
471
- "epoch": 5.681818181818182,
472
- "grad_norm": 5.496220111846924,
473
- "learning_rate": 1.8007392912833534e-05,
474
- "loss": 0.0386,
475
- "step": 3000
476
- },
477
- {
478
- "epoch": 5.776515151515151,
479
- "grad_norm": 2.2691736221313477,
480
- "learning_rate": 1.7907300462858752e-05,
481
- "loss": 0.0288,
482
- "step": 3050
483
- },
484
- {
485
- "epoch": 5.871212121212121,
486
- "grad_norm": 0.3925817608833313,
487
- "learning_rate": 1.7805048055144584e-05,
488
- "loss": 0.0305,
489
- "step": 3100
490
- },
491
- {
492
- "epoch": 5.965909090909091,
493
- "grad_norm": 0.03450781852006912,
494
- "learning_rate": 1.7700663620952844e-05,
495
- "loss": 0.0234,
496
- "step": 3150
497
- },
498
- {
499
- "epoch": 6.0,
500
- "eval_accuracy": 0.9882882882882883,
501
- "eval_loss": 0.0493258498609066,
502
- "eval_runtime": 0.9634,
503
- "eval_samples_per_second": 2304.357,
504
- "eval_steps_per_second": 7.266,
505
- "step": 3168
506
- },
507
- {
508
- "epoch": 6.0606060606060606,
509
- "grad_norm": 0.05821343883872032,
510
- "learning_rate": 1.7594175673929564e-05,
511
- "loss": 0.0146,
512
- "step": 3200
513
- },
514
- {
515
- "epoch": 6.15530303030303,
516
- "grad_norm": 0.04762452840805054,
517
- "learning_rate": 1.7485613302316226e-05,
518
- "loss": 0.02,
519
- "step": 3250
520
- },
521
- {
522
- "epoch": 6.25,
523
- "grad_norm": 0.05227584019303322,
524
- "learning_rate": 1.7375006161004018e-05,
525
- "loss": 0.0117,
526
- "step": 3300
527
- },
528
- {
529
- "epoch": 6.34469696969697,
530
- "grad_norm": 6.182316780090332,
531
- "learning_rate": 1.7262384463433286e-05,
532
- "loss": 0.0312,
533
- "step": 3350
534
- },
535
- {
536
- "epoch": 6.4393939393939394,
537
- "grad_norm": 2.6263253688812256,
538
- "learning_rate": 1.7147778973340466e-05,
539
- "loss": 0.0273,
540
- "step": 3400
541
- },
542
- {
543
- "epoch": 6.534090909090909,
544
- "grad_norm": 4.184931755065918,
545
- "learning_rate": 1.703122099635463e-05,
546
- "loss": 0.0339,
547
- "step": 3450
548
- },
549
- {
550
- "epoch": 6.628787878787879,
551
- "grad_norm": 4.779547214508057,
552
- "learning_rate": 1.6912742371446068e-05,
553
- "loss": 0.0187,
554
- "step": 3500
555
- },
556
- {
557
- "epoch": 6.723484848484849,
558
- "grad_norm": 4.305413246154785,
559
- "learning_rate": 1.6792375462229132e-05,
560
- "loss": 0.0288,
561
- "step": 3550
562
- },
563
- {
564
- "epoch": 6.818181818181818,
565
- "grad_norm": 0.8370099067687988,
566
- "learning_rate": 1.6670153148121834e-05,
567
- "loss": 0.022,
568
- "step": 3600
569
- },
570
- {
571
- "epoch": 6.912878787878788,
572
- "grad_norm": 0.042217787355184555,
573
- "learning_rate": 1.6546108815364448e-05,
574
- "loss": 0.0165,
575
- "step": 3650
576
- },
577
- {
578
- "epoch": 7.0,
579
- "eval_accuracy": 0.9914414414414414,
580
- "eval_loss": 0.042690977454185486,
581
- "eval_runtime": 1.1165,
582
- "eval_samples_per_second": 1988.281,
583
- "eval_steps_per_second": 6.269,
584
- "step": 3696
585
- },
586
- {
587
- "epoch": 7.007575757575758,
588
- "grad_norm": 0.08638785779476166,
589
- "learning_rate": 1.6420276347899776e-05,
590
- "loss": 0.0267,
591
- "step": 3700
592
- },
593
- {
594
- "epoch": 7.1022727272727275,
595
- "grad_norm": 0.019671985879540443,
596
- "learning_rate": 1.6292690118117393e-05,
597
- "loss": 0.015,
598
- "step": 3750
599
- },
600
- {
601
- "epoch": 7.196969696969697,
602
- "grad_norm": 0.10662596672773361,
603
- "learning_rate": 1.6163384977464476e-05,
604
- "loss": 0.0177,
605
- "step": 3800
606
- },
607
- {
608
- "epoch": 7.291666666666667,
609
- "grad_norm": 0.1051739975810051,
610
- "learning_rate": 1.6032396246925806e-05,
611
- "loss": 0.0204,
612
- "step": 3850
613
- },
614
- {
615
- "epoch": 7.386363636363637,
616
- "grad_norm": 0.06513633579015732,
617
- "learning_rate": 1.5899759707375487e-05,
618
- "loss": 0.0146,
619
- "step": 3900
620
- },
621
- {
622
- "epoch": 7.481060606060606,
623
- "grad_norm": 5.043694496154785,
624
- "learning_rate": 1.576551158980302e-05,
625
- "loss": 0.0202,
626
- "step": 3950
627
- },
628
- {
629
- "epoch": 7.575757575757576,
630
- "grad_norm": 0.04632123187184334,
631
- "learning_rate": 1.562968856541648e-05,
632
- "loss": 0.0188,
633
- "step": 4000
634
- },
635
- {
636
- "epoch": 7.670454545454545,
637
- "grad_norm": 0.1046363040804863,
638
- "learning_rate": 1.549232773562539e-05,
639
- "loss": 0.0218,
640
- "step": 4050
641
- },
642
- {
643
- "epoch": 7.765151515151516,
644
- "grad_norm": 10.116546630859375,
645
- "learning_rate": 1.5353466621906113e-05,
646
- "loss": 0.0223,
647
- "step": 4100
648
- },
649
- {
650
- "epoch": 7.859848484848484,
651
- "grad_norm": 0.06823263317346573,
652
- "learning_rate": 1.5213143155552479e-05,
653
- "loss": 0.0234,
654
- "step": 4150
655
- },
656
- {
657
- "epoch": 7.954545454545455,
658
- "grad_norm": 0.10893430560827255,
659
- "learning_rate": 1.5071395667314481e-05,
660
- "loss": 0.0162,
661
- "step": 4200
662
- },
663
- {
664
- "epoch": 8.0,
665
- "eval_accuracy": 0.986936936936937,
666
- "eval_loss": 0.05621395632624626,
667
- "eval_runtime": 1.1546,
668
- "eval_samples_per_second": 1922.822,
669
- "eval_steps_per_second": 6.063,
670
- "step": 4224
671
- }
672
- ],
673
- "logging_steps": 50,
674
- "max_steps": 10560,
675
- "num_input_tokens_seen": 0,
676
- "num_train_epochs": 20,
677
- "save_steps": 500,
678
- "stateful_callbacks": {
679
- "EarlyStoppingCallback": {
680
- "args": {
681
- "early_stopping_patience": 2,
682
- "early_stopping_threshold": 0.0
683
- },
684
- "attributes": {
685
- "early_stopping_patience_counter": 1
686
- }
687
- },
688
- "TrainerControl": {
689
- "args": {
690
- "should_epoch_stop": false,
691
- "should_evaluate": false,
692
- "should_log": false,
693
- "should_save": true,
694
- "should_training_stop": false
695
- },
696
- "attributes": {}
697
- }
698
- },
699
- "total_flos": 8718626763704400.0,
700
- "train_batch_size": 80,
701
- "trial_name": null,
702
- "trial_params": null
703
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-4224/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:03e483228a10899db361a89a78f6d4c066e2522650debdb0af61e02a9f9faa73
3
- size 5905
 
 
 
 
checkpoint-4752/config.json DELETED
@@ -1,28 +0,0 @@
1
- {
2
- "architectures": [
3
- "CamembertForSequenceClassification"
4
- ],
5
- "attention_probs_dropout_prob": 0.1,
6
- "bos_token_id": 5,
7
- "classifier_dropout": null,
8
- "dtype": "float32",
9
- "eos_token_id": 6,
10
- "hidden_act": "gelu",
11
- "hidden_dropout_prob": 0.1,
12
- "hidden_size": 768,
13
- "initializer_range": 0.02,
14
- "intermediate_size": 3072,
15
- "layer_norm_eps": 1e-05,
16
- "max_position_embeddings": 514,
17
- "model_type": "camembert",
18
- "num_attention_heads": 12,
19
- "num_hidden_layers": 12,
20
- "output_past": true,
21
- "pad_token_id": 1,
22
- "position_embedding_type": "absolute",
23
- "problem_type": "single_label_classification",
24
- "transformers_version": "4.57.3",
25
- "type_vocab_size": 1,
26
- "use_cache": true,
27
- "vocab_size": 32005
28
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-4752/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a92f81d0998505b572ae1a4353cb36da7aca685709add9adcef3459b56efc6e8
3
- size 442518104
 
 
 
 
checkpoint-4752/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:164e6340d8057d32a3f432666c7eef3c760d513bfceab5b37c57f5ffb63ac19a
3
- size 885159307
 
 
 
 
checkpoint-4752/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f29eec5ad54e1257873d92553224fc1d69e6d9fd97fe540fc487ea60d1571a33
3
- size 14645
 
 
 
 
checkpoint-4752/scaler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d4b6d35c6c0125cf4871d4cd86dbb7076b3d466839f8f76d76dc8ea8717753d
3
- size 1383
 
 
 
 
checkpoint-4752/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ddb23ad2f31807017b65eb087ce8ca17ea34a977497e8157fd4c85e3fb5b7a26
3
- size 1465
 
 
 
 
checkpoint-4752/special_tokens_map.json DELETED
@@ -1,56 +0,0 @@
1
- {
2
- "additional_special_tokens": [
3
- "<s>NOTUSED",
4
- "</s>NOTUSED",
5
- "<unk>NOTUSED"
6
- ],
7
- "bos_token": {
8
- "content": "<s>",
9
- "lstrip": false,
10
- "normalized": false,
11
- "rstrip": false,
12
- "single_word": false
13
- },
14
- "cls_token": {
15
- "content": "<s>",
16
- "lstrip": false,
17
- "normalized": false,
18
- "rstrip": false,
19
- "single_word": false
20
- },
21
- "eos_token": {
22
- "content": "</s>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false
27
- },
28
- "mask_token": {
29
- "content": "<mask>",
30
- "lstrip": true,
31
- "normalized": false,
32
- "rstrip": false,
33
- "single_word": false
34
- },
35
- "pad_token": {
36
- "content": "<pad>",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false
41
- },
42
- "sep_token": {
43
- "content": "</s>",
44
- "lstrip": false,
45
- "normalized": false,
46
- "rstrip": false,
47
- "single_word": false
48
- },
49
- "unk_token": {
50
- "content": "<unk>",
51
- "lstrip": false,
52
- "normalized": false,
53
- "rstrip": false,
54
- "single_word": false
55
- }
56
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-4752/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-4752/tokenizer_config.json DELETED
@@ -1,84 +0,0 @@
1
- {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "<s>NOTUSED",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "1": {
12
- "content": "<pad>",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "2": {
20
- "content": "</s>NOTUSED",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "3": {
28
- "content": "<unk>",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- },
35
- "4": {
36
- "content": "<unk>NOTUSED",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false,
41
- "special": true
42
- },
43
- "5": {
44
- "content": "<s>",
45
- "lstrip": false,
46
- "normalized": false,
47
- "rstrip": false,
48
- "single_word": false,
49
- "special": true
50
- },
51
- "6": {
52
- "content": "</s>",
53
- "lstrip": false,
54
- "normalized": false,
55
- "rstrip": false,
56
- "single_word": false,
57
- "special": true
58
- },
59
- "32004": {
60
- "content": "<mask>",
61
- "lstrip": true,
62
- "normalized": false,
63
- "rstrip": false,
64
- "single_word": false,
65
- "special": true
66
- }
67
- },
68
- "additional_special_tokens": [
69
- "<s>NOTUSED",
70
- "</s>NOTUSED",
71
- "<unk>NOTUSED"
72
- ],
73
- "bos_token": "<s>",
74
- "clean_up_tokenization_spaces": true,
75
- "cls_token": "<s>",
76
- "eos_token": "</s>",
77
- "extra_special_tokens": {},
78
- "mask_token": "<mask>",
79
- "model_max_length": 512,
80
- "pad_token": "<pad>",
81
- "sep_token": "</s>",
82
- "tokenizer_class": "CamembertTokenizer",
83
- "unk_token": "<unk>"
84
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-4752/trainer_state.json DELETED
@@ -1,789 +0,0 @@
1
- {
2
- "best_global_step": 3696,
3
- "best_metric": 0.9914414414414414,
4
- "best_model_checkpoint": "../models/camembert-bio-morpho-cross-encoder/checkpoint-3696",
5
- "epoch": 9.0,
6
- "eval_steps": 500,
7
- "global_step": 4752,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.0946969696969697,
14
- "grad_norm": 1.7212085723876953,
15
- "learning_rate": 9.280303030303031e-07,
16
- "loss": 0.6819,
17
- "step": 50
18
- },
19
- {
20
- "epoch": 0.1893939393939394,
21
- "grad_norm": 0.8152473568916321,
22
- "learning_rate": 1.8750000000000003e-06,
23
- "loss": 0.615,
24
- "step": 100
25
- },
26
- {
27
- "epoch": 0.2840909090909091,
28
- "grad_norm": 0.8320032954216003,
29
- "learning_rate": 2.8219696969696973e-06,
30
- "loss": 0.5181,
31
- "step": 150
32
- },
33
- {
34
- "epoch": 0.3787878787878788,
35
- "grad_norm": 1.74077570438385,
36
- "learning_rate": 3.7689393939393944e-06,
37
- "loss": 0.412,
38
- "step": 200
39
- },
40
- {
41
- "epoch": 0.4734848484848485,
42
- "grad_norm": 3.2630276679992676,
43
- "learning_rate": 4.715909090909091e-06,
44
- "loss": 0.2989,
45
- "step": 250
46
- },
47
- {
48
- "epoch": 0.5681818181818182,
49
- "grad_norm": 5.9127020835876465,
50
- "learning_rate": 5.662878787878788e-06,
51
- "loss": 0.2506,
52
- "step": 300
53
- },
54
- {
55
- "epoch": 0.6628787878787878,
56
- "grad_norm": 4.532700538635254,
57
- "learning_rate": 6.6098484848484855e-06,
58
- "loss": 0.2167,
59
- "step": 350
60
- },
61
- {
62
- "epoch": 0.7575757575757576,
63
- "grad_norm": 2.2779574394226074,
64
- "learning_rate": 7.556818181818183e-06,
65
- "loss": 0.2103,
66
- "step": 400
67
- },
68
- {
69
- "epoch": 0.8522727272727273,
70
- "grad_norm": 3.8016679286956787,
71
- "learning_rate": 8.50378787878788e-06,
72
- "loss": 0.1825,
73
- "step": 450
74
- },
75
- {
76
- "epoch": 0.946969696969697,
77
- "grad_norm": 2.031386375427246,
78
- "learning_rate": 9.450757575757576e-06,
79
- "loss": 0.1771,
80
- "step": 500
81
- },
82
- {
83
- "epoch": 1.0,
84
- "eval_accuracy": 0.9608108108108108,
85
- "eval_loss": 0.14045676589012146,
86
- "eval_runtime": 0.9313,
87
- "eval_samples_per_second": 2383.741,
88
- "eval_steps_per_second": 7.516,
89
- "step": 528
90
- },
91
- {
92
- "epoch": 1.0416666666666667,
93
- "grad_norm": 4.0610175132751465,
94
- "learning_rate": 1.0397727272727275e-05,
95
- "loss": 0.1615,
96
- "step": 550
97
- },
98
- {
99
- "epoch": 1.1363636363636362,
100
- "grad_norm": 1.9611369371414185,
101
- "learning_rate": 1.1344696969696971e-05,
102
- "loss": 0.1433,
103
- "step": 600
104
- },
105
- {
106
- "epoch": 1.231060606060606,
107
- "grad_norm": 6.061194896697998,
108
- "learning_rate": 1.2291666666666668e-05,
109
- "loss": 0.1517,
110
- "step": 650
111
- },
112
- {
113
- "epoch": 1.3257575757575757,
114
- "grad_norm": 2.7738304138183594,
115
- "learning_rate": 1.3238636363636366e-05,
116
- "loss": 0.1496,
117
- "step": 700
118
- },
119
- {
120
- "epoch": 1.4204545454545454,
121
- "grad_norm": 2.841794967651367,
122
- "learning_rate": 1.4185606060606061e-05,
123
- "loss": 0.1275,
124
- "step": 750
125
- },
126
- {
127
- "epoch": 1.5151515151515151,
128
- "grad_norm": 5.296891689300537,
129
- "learning_rate": 1.5132575757575758e-05,
130
- "loss": 0.1398,
131
- "step": 800
132
- },
133
- {
134
- "epoch": 1.6098484848484849,
135
- "grad_norm": 2.8792037963867188,
136
- "learning_rate": 1.6079545454545456e-05,
137
- "loss": 0.1062,
138
- "step": 850
139
- },
140
- {
141
- "epoch": 1.7045454545454546,
142
- "grad_norm": 7.044574737548828,
143
- "learning_rate": 1.7026515151515154e-05,
144
- "loss": 0.1209,
145
- "step": 900
146
- },
147
- {
148
- "epoch": 1.7992424242424243,
149
- "grad_norm": 4.128571033477783,
150
- "learning_rate": 1.797348484848485e-05,
151
- "loss": 0.1019,
152
- "step": 950
153
- },
154
- {
155
- "epoch": 1.893939393939394,
156
- "grad_norm": 3.093858242034912,
157
- "learning_rate": 1.8920454545454548e-05,
158
- "loss": 0.0905,
159
- "step": 1000
160
- },
161
- {
162
- "epoch": 1.9886363636363638,
163
- "grad_norm": 1.8322410583496094,
164
- "learning_rate": 1.9867424242424246e-05,
165
- "loss": 0.1046,
166
- "step": 1050
167
- },
168
- {
169
- "epoch": 2.0,
170
- "eval_accuracy": 0.9702702702702702,
171
- "eval_loss": 0.09388745576143265,
172
- "eval_runtime": 0.9328,
173
- "eval_samples_per_second": 2379.885,
174
- "eval_steps_per_second": 7.504,
175
- "step": 1056
176
- },
177
- {
178
- "epoch": 2.0833333333333335,
179
- "grad_norm": 1.9016932249069214,
180
- "learning_rate": 1.999898984854493e-05,
181
- "loss": 0.092,
182
- "step": 1100
183
- },
184
- {
185
- "epoch": 2.178030303030303,
186
- "grad_norm": 3.790273904800415,
187
- "learning_rate": 1.999527514387006e-05,
188
- "loss": 0.0958,
189
- "step": 1150
190
- },
191
- {
192
- "epoch": 2.2727272727272725,
193
- "grad_norm": 1.2855342626571655,
194
- "learning_rate": 1.9988830130412106e-05,
195
- "loss": 0.0775,
196
- "step": 1200
197
- },
198
- {
199
- "epoch": 2.367424242424242,
200
- "grad_norm": 4.88389253616333,
201
- "learning_rate": 1.997965656869057e-05,
202
- "loss": 0.0777,
203
- "step": 1250
204
- },
205
- {
206
- "epoch": 2.462121212121212,
207
- "grad_norm": 4.471704483032227,
208
- "learning_rate": 1.9967756964555044e-05,
209
- "loss": 0.0881,
210
- "step": 1300
211
- },
212
- {
213
- "epoch": 2.5568181818181817,
214
- "grad_norm": 4.597264289855957,
215
- "learning_rate": 1.995313456850071e-05,
216
- "loss": 0.0722,
217
- "step": 1350
218
- },
219
- {
220
- "epoch": 2.6515151515151514,
221
- "grad_norm": 1.7858144044876099,
222
- "learning_rate": 1.9935793374780435e-05,
223
- "loss": 0.0823,
224
- "step": 1400
225
- },
226
- {
227
- "epoch": 2.746212121212121,
228
- "grad_norm": 3.1771280765533447,
229
- "learning_rate": 1.991573812031369e-05,
230
- "loss": 0.0619,
231
- "step": 1450
232
- },
233
- {
234
- "epoch": 2.840909090909091,
235
- "grad_norm": 3.0146450996398926,
236
- "learning_rate": 1.989297428339264e-05,
237
- "loss": 0.0722,
238
- "step": 1500
239
- },
240
- {
241
- "epoch": 2.9356060606060606,
242
- "grad_norm": 1.1221269369125366,
243
- "learning_rate": 1.9867508082185663e-05,
244
- "loss": 0.071,
245
- "step": 1550
246
- },
247
- {
248
- "epoch": 3.0,
249
- "eval_accuracy": 0.9837837837837838,
250
- "eval_loss": 0.05881131812930107,
251
- "eval_runtime": 0.9401,
252
- "eval_samples_per_second": 2361.381,
253
- "eval_steps_per_second": 7.446,
254
- "step": 1584
255
- },
256
- {
257
- "epoch": 3.0303030303030303,
258
- "grad_norm": 0.7528719305992126,
259
- "learning_rate": 1.9839346473038815e-05,
260
- "loss": 0.0676,
261
- "step": 1600
262
- },
263
- {
264
- "epoch": 3.125,
265
- "grad_norm": 0.5066124200820923,
266
- "learning_rate": 1.980849714857563e-05,
267
- "loss": 0.0495,
268
- "step": 1650
269
- },
270
- {
271
- "epoch": 3.2196969696969697,
272
- "grad_norm": 5.64263391494751,
273
- "learning_rate": 1.9774968535595808e-05,
274
- "loss": 0.0626,
275
- "step": 1700
276
- },
277
- {
278
- "epoch": 3.3143939393939394,
279
- "grad_norm": 1.1057571172714233,
280
- "learning_rate": 1.9738769792773338e-05,
281
- "loss": 0.0611,
282
- "step": 1750
283
- },
284
- {
285
- "epoch": 3.409090909090909,
286
- "grad_norm": 0.09228092432022095,
287
- "learning_rate": 1.9699910808154726e-05,
288
- "loss": 0.0576,
289
- "step": 1800
290
- },
291
- {
292
- "epoch": 3.503787878787879,
293
- "grad_norm": 3.1624414920806885,
294
- "learning_rate": 1.965840219645797e-05,
295
- "loss": 0.0575,
296
- "step": 1850
297
- },
298
- {
299
- "epoch": 3.5984848484848486,
300
- "grad_norm": 4.033729553222656,
301
- "learning_rate": 1.961425529617306e-05,
302
- "loss": 0.0656,
303
- "step": 1900
304
- },
305
- {
306
- "epoch": 3.6931818181818183,
307
- "grad_norm": 3.7267255783081055,
308
- "learning_rate": 1.956748216646473e-05,
309
- "loss": 0.0594,
310
- "step": 1950
311
- },
312
- {
313
- "epoch": 3.787878787878788,
314
- "grad_norm": 2.7791688442230225,
315
- "learning_rate": 1.9518095583878406e-05,
316
- "loss": 0.054,
317
- "step": 2000
318
- },
319
- {
320
- "epoch": 3.882575757575758,
321
- "grad_norm": 0.10295089334249496,
322
- "learning_rate": 1.946610903885014e-05,
323
- "loss": 0.04,
324
- "step": 2050
325
- },
326
- {
327
- "epoch": 3.9772727272727275,
328
- "grad_norm": 1.05549156665802,
329
- "learning_rate": 1.941153673202158e-05,
330
- "loss": 0.0441,
331
- "step": 2100
332
- },
333
- {
334
- "epoch": 4.0,
335
- "eval_accuracy": 0.9864864864864865,
336
- "eval_loss": 0.056456033140420914,
337
- "eval_runtime": 0.9408,
338
- "eval_samples_per_second": 2359.707,
339
- "eval_steps_per_second": 7.441,
340
- "step": 2112
341
- },
342
- {
343
- "epoch": 4.071969696969697,
344
- "grad_norm": 2.183342933654785,
345
- "learning_rate": 1.9354393570360924e-05,
346
- "loss": 0.0449,
347
- "step": 2150
348
- },
349
- {
350
- "epoch": 4.166666666666667,
351
- "grad_norm": 3.2201144695281982,
352
- "learning_rate": 1.929469516309092e-05,
353
- "loss": 0.0443,
354
- "step": 2200
355
- },
356
- {
357
- "epoch": 4.261363636363637,
358
- "grad_norm": 2.585134744644165,
359
- "learning_rate": 1.9232457817425058e-05,
360
- "loss": 0.0378,
361
- "step": 2250
362
- },
363
- {
364
- "epoch": 4.356060606060606,
365
- "grad_norm": 0.22277259826660156,
366
- "learning_rate": 1.9167698534113105e-05,
367
- "loss": 0.0418,
368
- "step": 2300
369
- },
370
- {
371
- "epoch": 4.450757575757576,
372
- "grad_norm": 1.6783980131149292,
373
- "learning_rate": 1.910043500279716e-05,
374
- "loss": 0.0357,
375
- "step": 2350
376
- },
377
- {
378
- "epoch": 4.545454545454545,
379
- "grad_norm": 0.4771471619606018,
380
- "learning_rate": 1.903068559717957e-05,
381
- "loss": 0.0345,
382
- "step": 2400
383
- },
384
- {
385
- "epoch": 4.640151515151516,
386
- "grad_norm": 0.10980956256389618,
387
- "learning_rate": 1.8958469370003954e-05,
388
- "loss": 0.026,
389
- "step": 2450
390
- },
391
- {
392
- "epoch": 4.734848484848484,
393
- "grad_norm": 0.514388918876648,
394
- "learning_rate": 1.8883806047850772e-05,
395
- "loss": 0.0425,
396
- "step": 2500
397
- },
398
- {
399
- "epoch": 4.829545454545455,
400
- "grad_norm": 2.0213735103607178,
401
- "learning_rate": 1.8806716025748813e-05,
402
- "loss": 0.0321,
403
- "step": 2550
404
- },
405
- {
406
- "epoch": 4.924242424242424,
407
- "grad_norm": 2.96073842048645,
408
- "learning_rate": 1.872722036160407e-05,
409
- "loss": 0.0324,
410
- "step": 2600
411
- },
412
- {
413
- "epoch": 5.0,
414
- "eval_accuracy": 0.9891891891891892,
415
- "eval_loss": 0.039831362664699554,
416
- "eval_runtime": 0.9492,
417
- "eval_samples_per_second": 2338.807,
418
- "eval_steps_per_second": 7.375,
419
- "step": 2640
420
- },
421
- {
422
- "epoch": 5.018939393939394,
423
- "grad_norm": 5.567176342010498,
424
- "learning_rate": 1.8645340770447595e-05,
425
- "loss": 0.0448,
426
- "step": 2650
427
- },
428
- {
429
- "epoch": 5.113636363636363,
430
- "grad_norm": 0.33689576387405396,
431
- "learning_rate": 1.8561099618503785e-05,
432
- "loss": 0.0398,
433
- "step": 2700
434
- },
435
- {
436
- "epoch": 5.208333333333333,
437
- "grad_norm": 3.0077149868011475,
438
- "learning_rate": 1.8474519917080867e-05,
439
- "loss": 0.0234,
440
- "step": 2750
441
- },
442
- {
443
- "epoch": 5.303030303030303,
444
- "grad_norm": 0.3385215103626251,
445
- "learning_rate": 1.8385625316285095e-05,
446
- "loss": 0.03,
447
- "step": 2800
448
- },
449
- {
450
- "epoch": 5.3977272727272725,
451
- "grad_norm": 3.120093584060669,
452
- "learning_rate": 1.8294440098560508e-05,
453
- "loss": 0.0259,
454
- "step": 2850
455
- },
456
- {
457
- "epoch": 5.492424242424242,
458
- "grad_norm": 5.4590864181518555,
459
- "learning_rate": 1.8200989172055926e-05,
460
- "loss": 0.027,
461
- "step": 2900
462
- },
463
- {
464
- "epoch": 5.587121212121212,
465
- "grad_norm": 0.14214850962162018,
466
- "learning_rate": 1.8105298063821065e-05,
467
- "loss": 0.0396,
468
- "step": 2950
469
- },
470
- {
471
- "epoch": 5.681818181818182,
472
- "grad_norm": 5.496220111846924,
473
- "learning_rate": 1.8007392912833534e-05,
474
- "loss": 0.0386,
475
- "step": 3000
476
- },
477
- {
478
- "epoch": 5.776515151515151,
479
- "grad_norm": 2.2691736221313477,
480
- "learning_rate": 1.7907300462858752e-05,
481
- "loss": 0.0288,
482
- "step": 3050
483
- },
484
- {
485
- "epoch": 5.871212121212121,
486
- "grad_norm": 0.3925817608833313,
487
- "learning_rate": 1.7805048055144584e-05,
488
- "loss": 0.0305,
489
- "step": 3100
490
- },
491
- {
492
- "epoch": 5.965909090909091,
493
- "grad_norm": 0.03450781852006912,
494
- "learning_rate": 1.7700663620952844e-05,
495
- "loss": 0.0234,
496
- "step": 3150
497
- },
498
- {
499
- "epoch": 6.0,
500
- "eval_accuracy": 0.9882882882882883,
501
- "eval_loss": 0.0493258498609066,
502
- "eval_runtime": 0.9634,
503
- "eval_samples_per_second": 2304.357,
504
- "eval_steps_per_second": 7.266,
505
- "step": 3168
506
- },
507
- {
508
- "epoch": 6.0606060606060606,
509
- "grad_norm": 0.05821343883872032,
510
- "learning_rate": 1.7594175673929564e-05,
511
- "loss": 0.0146,
512
- "step": 3200
513
- },
514
- {
515
- "epoch": 6.15530303030303,
516
- "grad_norm": 0.04762452840805054,
517
- "learning_rate": 1.7485613302316226e-05,
518
- "loss": 0.02,
519
- "step": 3250
520
- },
521
- {
522
- "epoch": 6.25,
523
- "grad_norm": 0.05227584019303322,
524
- "learning_rate": 1.7375006161004018e-05,
525
- "loss": 0.0117,
526
- "step": 3300
527
- },
528
- {
529
- "epoch": 6.34469696969697,
530
- "grad_norm": 6.182316780090332,
531
- "learning_rate": 1.7262384463433286e-05,
532
- "loss": 0.0312,
533
- "step": 3350
534
- },
535
- {
536
- "epoch": 6.4393939393939394,
537
- "grad_norm": 2.6263253688812256,
538
- "learning_rate": 1.7147778973340466e-05,
539
- "loss": 0.0273,
540
- "step": 3400
541
- },
542
- {
543
- "epoch": 6.534090909090909,
544
- "grad_norm": 4.184931755065918,
545
- "learning_rate": 1.703122099635463e-05,
546
- "loss": 0.0339,
547
- "step": 3450
548
- },
549
- {
550
- "epoch": 6.628787878787879,
551
- "grad_norm": 4.779547214508057,
552
- "learning_rate": 1.6912742371446068e-05,
553
- "loss": 0.0187,
554
- "step": 3500
555
- },
556
- {
557
- "epoch": 6.723484848484849,
558
- "grad_norm": 4.305413246154785,
559
- "learning_rate": 1.6792375462229132e-05,
560
- "loss": 0.0288,
561
- "step": 3550
562
- },
563
- {
564
- "epoch": 6.818181818181818,
565
- "grad_norm": 0.8370099067687988,
566
- "learning_rate": 1.6670153148121834e-05,
567
- "loss": 0.022,
568
- "step": 3600
569
- },
570
- {
571
- "epoch": 6.912878787878788,
572
- "grad_norm": 0.042217787355184555,
573
- "learning_rate": 1.6546108815364448e-05,
574
- "loss": 0.0165,
575
- "step": 3650
576
- },
577
- {
578
- "epoch": 7.0,
579
- "eval_accuracy": 0.9914414414414414,
580
- "eval_loss": 0.042690977454185486,
581
- "eval_runtime": 1.1165,
582
- "eval_samples_per_second": 1988.281,
583
- "eval_steps_per_second": 6.269,
584
- "step": 3696
585
- },
586
- {
587
- "epoch": 7.007575757575758,
588
- "grad_norm": 0.08638785779476166,
589
- "learning_rate": 1.6420276347899776e-05,
590
- "loss": 0.0267,
591
- "step": 3700
592
- },
593
- {
594
- "epoch": 7.1022727272727275,
595
- "grad_norm": 0.019671985879540443,
596
- "learning_rate": 1.6292690118117393e-05,
597
- "loss": 0.015,
598
- "step": 3750
599
- },
600
- {
601
- "epoch": 7.196969696969697,
602
- "grad_norm": 0.10662596672773361,
603
- "learning_rate": 1.6163384977464476e-05,
604
- "loss": 0.0177,
605
- "step": 3800
606
- },
607
- {
608
- "epoch": 7.291666666666667,
609
- "grad_norm": 0.1051739975810051,
610
- "learning_rate": 1.6032396246925806e-05,
611
- "loss": 0.0204,
612
- "step": 3850
613
- },
614
- {
615
- "epoch": 7.386363636363637,
616
- "grad_norm": 0.06513633579015732,
617
- "learning_rate": 1.5899759707375487e-05,
618
- "loss": 0.0146,
619
- "step": 3900
620
- },
621
- {
622
- "epoch": 7.481060606060606,
623
- "grad_norm": 5.043694496154785,
624
- "learning_rate": 1.576551158980302e-05,
625
- "loss": 0.0202,
626
- "step": 3950
627
- },
628
- {
629
- "epoch": 7.575757575757576,
630
- "grad_norm": 0.04632123187184334,
631
- "learning_rate": 1.562968856541648e-05,
632
- "loss": 0.0188,
633
- "step": 4000
634
- },
635
- {
636
- "epoch": 7.670454545454545,
637
- "grad_norm": 0.1046363040804863,
638
- "learning_rate": 1.549232773562539e-05,
639
- "loss": 0.0218,
640
- "step": 4050
641
- },
642
- {
643
- "epoch": 7.765151515151516,
644
- "grad_norm": 10.116546630859375,
645
- "learning_rate": 1.5353466621906113e-05,
646
- "loss": 0.0223,
647
- "step": 4100
648
- },
649
- {
650
- "epoch": 7.859848484848484,
651
- "grad_norm": 0.06823263317346573,
652
- "learning_rate": 1.5213143155552479e-05,
653
- "loss": 0.0234,
654
- "step": 4150
655
- },
656
- {
657
- "epoch": 7.954545454545455,
658
- "grad_norm": 0.10893430560827255,
659
- "learning_rate": 1.5071395667314481e-05,
660
- "loss": 0.0162,
661
- "step": 4200
662
- },
663
- {
664
- "epoch": 8.0,
665
- "eval_accuracy": 0.986936936936937,
666
- "eval_loss": 0.05621395632624626,
667
- "eval_runtime": 1.1546,
668
- "eval_samples_per_second": 1922.822,
669
- "eval_steps_per_second": 6.063,
670
- "step": 4224
671
- },
672
- {
673
- "epoch": 8.049242424242424,
674
- "grad_norm": 3.010509967803955,
675
- "learning_rate": 1.4928262876927855e-05,
676
- "loss": 0.0255,
677
- "step": 4250
678
- },
679
- {
680
- "epoch": 8.143939393939394,
681
- "grad_norm": 0.14104370772838593,
682
- "learning_rate": 1.478378388253738e-05,
683
- "loss": 0.0151,
684
- "step": 4300
685
- },
686
- {
687
- "epoch": 8.238636363636363,
688
- "grad_norm": 0.05722634121775627,
689
- "learning_rate": 1.4637998150016847e-05,
690
- "loss": 0.0122,
691
- "step": 4350
692
- },
693
- {
694
- "epoch": 8.333333333333334,
695
- "grad_norm": 0.0412001870572567,
696
- "learning_rate": 1.4490945502188572e-05,
697
- "loss": 0.02,
698
- "step": 4400
699
- },
700
- {
701
- "epoch": 8.428030303030303,
702
- "grad_norm": 6.0322265625,
703
- "learning_rate": 1.4342666107945362e-05,
704
- "loss": 0.0112,
705
- "step": 4450
706
- },
707
- {
708
- "epoch": 8.522727272727273,
709
- "grad_norm": 0.04742131009697914,
710
- "learning_rate": 1.4193200471278019e-05,
711
- "loss": 0.0095,
712
- "step": 4500
713
- },
714
- {
715
- "epoch": 8.617424242424242,
716
- "grad_norm": 0.47154614329338074,
717
- "learning_rate": 1.4042589420211254e-05,
718
- "loss": 0.0182,
719
- "step": 4550
720
- },
721
- {
722
- "epoch": 8.712121212121213,
723
- "grad_norm": 6.622648239135742,
724
- "learning_rate": 1.3890874095651113e-05,
725
- "loss": 0.0155,
726
- "step": 4600
727
- },
728
- {
729
- "epoch": 8.806818181818182,
730
- "grad_norm": 0.6059552431106567,
731
- "learning_rate": 1.3738095940146916e-05,
732
- "loss": 0.0126,
733
- "step": 4650
734
- },
735
- {
736
- "epoch": 8.901515151515152,
737
- "grad_norm": 0.7394792437553406,
738
- "learning_rate": 1.3584296686570828e-05,
739
- "loss": 0.0129,
740
- "step": 4700
741
- },
742
- {
743
- "epoch": 8.996212121212121,
744
- "grad_norm": 0.03701222315430641,
745
- "learning_rate": 1.3429518346718109e-05,
746
- "loss": 0.0197,
747
- "step": 4750
748
- },
749
- {
750
- "epoch": 9.0,
751
- "eval_accuracy": 0.9891891891891892,
752
- "eval_loss": 0.05412689596414566,
753
- "eval_runtime": 0.9468,
754
- "eval_samples_per_second": 2344.86,
755
- "eval_steps_per_second": 7.394,
756
- "step": 4752
757
- }
758
- ],
759
- "logging_steps": 50,
760
- "max_steps": 10560,
761
- "num_input_tokens_seen": 0,
762
- "num_train_epochs": 20,
763
- "save_steps": 500,
764
- "stateful_callbacks": {
765
- "EarlyStoppingCallback": {
766
- "args": {
767
- "early_stopping_patience": 2,
768
- "early_stopping_threshold": 0.0
769
- },
770
- "attributes": {
771
- "early_stopping_patience_counter": 2
772
- }
773
- },
774
- "TrainerControl": {
775
- "args": {
776
- "should_epoch_stop": false,
777
- "should_evaluate": false,
778
- "should_log": false,
779
- "should_save": true,
780
- "should_training_stop": true
781
- },
782
- "attributes": {}
783
- }
784
- },
785
- "total_flos": 9807783199587600.0,
786
- "train_batch_size": 80,
787
- "trial_name": null,
788
- "trial_params": null
789
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-4752/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:03e483228a10899db361a89a78f6d4c066e2522650debdb0af61e02a9f9faa73
3
- size 5905
 
 
 
 
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30438456f15a191a07f2afbfbc4a691c12406b1aebc1a73e3167a45e3be3387c
3
  size 442518104
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06d4e71f248361094d73ece09b36ab275208ef3436f90a0b0951597e42a72341
3
  size 442518104
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03e483228a10899db361a89a78f6d4c066e2522650debdb0af61e02a9f9faa73
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9c47a27dab82cc7c6005a2d96f6c72daa67abe67559305a0271da3c32ac59a8
3
  size 5905