Anwaarma commited on
Commit
e404cf5
·
verified ·
1 Parent(s): cb8f7af

Training in progress, step 500

Browse files
Files changed (47) hide show
  1. config.json +26 -0
  2. model.safetensors +3 -0
  3. run-0/checkpoint-1000/config.json +26 -0
  4. run-0/checkpoint-1000/model.safetensors +3 -0
  5. run-0/checkpoint-1000/optimizer.pt +3 -0
  6. run-0/checkpoint-1000/rng_state.pth +3 -0
  7. run-0/checkpoint-1000/scaler.pt +3 -0
  8. run-0/checkpoint-1000/scheduler.pt +3 -0
  9. run-0/checkpoint-1000/special_tokens_map.json +51 -0
  10. run-0/checkpoint-1000/tokenizer.json +0 -0
  11. run-0/checkpoint-1000/tokenizer_config.json +62 -0
  12. run-0/checkpoint-1000/trainer_state.json +368 -0
  13. run-0/checkpoint-1000/training_args.bin +3 -0
  14. run-0/checkpoint-500/config.json +26 -0
  15. run-0/checkpoint-500/model.safetensors +3 -0
  16. run-0/checkpoint-500/optimizer.pt +3 -0
  17. run-0/checkpoint-500/rng_state.pth +3 -0
  18. run-0/checkpoint-500/scaler.pt +3 -0
  19. run-0/checkpoint-500/scheduler.pt +3 -0
  20. run-0/checkpoint-500/special_tokens_map.json +51 -0
  21. run-0/checkpoint-500/tokenizer.json +0 -0
  22. run-0/checkpoint-500/tokenizer_config.json +62 -0
  23. run-0/checkpoint-500/trainer_state.json +208 -0
  24. run-0/checkpoint-500/training_args.bin +3 -0
  25. run-1/checkpoint-500/config.json +26 -0
  26. run-1/checkpoint-500/model.safetensors +3 -0
  27. run-1/checkpoint-500/optimizer.pt +3 -0
  28. run-1/checkpoint-500/rng_state.pth +3 -0
  29. run-1/checkpoint-500/scaler.pt +3 -0
  30. run-1/checkpoint-500/scheduler.pt +3 -0
  31. run-1/checkpoint-500/special_tokens_map.json +51 -0
  32. run-1/checkpoint-500/tokenizer.json +0 -0
  33. run-1/checkpoint-500/tokenizer_config.json +62 -0
  34. run-1/checkpoint-500/trainer_state.json +208 -0
  35. run-1/checkpoint-500/training_args.bin +3 -0
  36. runs/May10_21-55-16_kestrel-03/events.out.tfevents.1746910538.kestrel-03.184643.0 +3 -0
  37. runs/May10_22-05-26_kestrel-03/events.out.tfevents.1746911132.kestrel-03.185312.0 +3 -0
  38. runs/May10_22-21-59_eagle-02/events.out.tfevents.1746912125.eagle-02.560110.0 +3 -0
  39. runs/May10_22-25-58_eagle-01/events.out.tfevents.1746912364.eagle-01.185430.0 +3 -0
  40. runs/May10_22-33-08_eagle-01/events.out.tfevents.1746912790.eagle-01.185992.0 +3 -0
  41. runs/May10_22-38-30_eagle-01/events.out.tfevents.1746913111.eagle-01.186318.0 +3 -0
  42. runs/May11_11-11-27_falcon-05/events.out.tfevents.1746958290.falcon-05.265234.0 +3 -0
  43. runs/May11_11-11-27_falcon-05/events.out.tfevents.1746958443.falcon-05.265234.1 +3 -0
  44. special_tokens_map.json +51 -0
  45. tokenizer.json +0 -0
  46. tokenizer_config.json +62 -0
  47. training_args.bin +3 -0
config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 130,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.51.3",
23
+ "type_vocab_size": 1,
24
+ "use_cache": true,
25
+ "vocab_size": 30000
26
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23b32e402598ca88acd5d632ade7854e4a965519b9c271006ca11d9a6fb458a8
3
+ size 435179080
run-0/checkpoint-1000/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 130,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.51.3",
23
+ "type_vocab_size": 1,
24
+ "use_cache": true,
25
+ "vocab_size": 30000
26
+ }
run-0/checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ac632e969d522739039e6a88d009a09be08b04d8f01bc23f3d8f70fb67130e2
3
+ size 435179080
run-0/checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e62af9e74e73f616420b83c02f7cb3b38132a29943deaca12ca25dfea54a6fd
3
+ size 870478475
run-0/checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a381048e72fe1109ff86010097edd3b4d1b6fd2f4426abab234534dc310be96a
3
+ size 14645
run-0/checkpoint-1000/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f87f789072d9b79c9157eb9688b945a42852694f1f091923d2b8df6e7321f08
3
+ size 1383
run-0/checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec4cd42a30394b70aa166ac98c27dae831ccdf8b9e7716e3c455595157266777
3
+ size 1465
run-0/checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
run-0/checkpoint-1000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
run-0/checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "<mask>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "extra_special_tokens": {},
49
+ "mask_token": "<mask>",
50
+ "max_length": 128,
51
+ "model_max_length": 128,
52
+ "pad_to_multiple_of": null,
53
+ "pad_token": "<pad>",
54
+ "pad_token_type_id": 0,
55
+ "padding_side": "right",
56
+ "sep_token": "</s>",
57
+ "stride": 0,
58
+ "tokenizer_class": "PreTrainedTokenizer",
59
+ "truncation_side": "right",
60
+ "truncation_strategy": "longest_first",
61
+ "unk_token": "<unk>"
62
+ }
run-0/checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1000,
3
+ "best_metric": 0.8606300925228798,
4
+ "best_model_checkpoint": "./robertuito-esp/run-0/checkpoint-1000",
5
+ "epoch": 0.15654351909830932,
6
+ "eval_steps": 50,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": true,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.007827175954915467,
14
+ "grad_norm": 19.564104080200195,
15
+ "learning_rate": 1.2465069045357695e-05,
16
+ "loss": 0.7153,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.007827175954915467,
21
+ "eval_f1": 0.5814973293844199,
22
+ "eval_loss": 0.66972416639328,
23
+ "eval_runtime": 2.1927,
24
+ "eval_samples_per_second": 446.933,
25
+ "eval_steps_per_second": 111.733,
26
+ "step": 50
27
+ },
28
+ {
29
+ "epoch": 0.015654351909830933,
30
+ "grad_norm": 6.274869918823242,
31
+ "learning_rate": 1.2451395392864412e-05,
32
+ "loss": 0.723,
33
+ "step": 100
34
+ },
35
+ {
36
+ "epoch": 0.015654351909830933,
37
+ "eval_f1": 0.5240407965031568,
38
+ "eval_loss": 0.6730000376701355,
39
+ "eval_runtime": 2.2102,
40
+ "eval_samples_per_second": 443.407,
41
+ "eval_steps_per_second": 110.852,
42
+ "step": 100
43
+ },
44
+ {
45
+ "epoch": 0.023481527864746398,
46
+ "grad_norm": 12.325215339660645,
47
+ "learning_rate": 1.243744268623861e-05,
48
+ "loss": 0.7258,
49
+ "step": 150
50
+ },
51
+ {
52
+ "epoch": 0.023481527864746398,
53
+ "eval_f1": 0.6702165564749198,
54
+ "eval_loss": 0.618554949760437,
55
+ "eval_runtime": 2.1668,
56
+ "eval_samples_per_second": 452.276,
57
+ "eval_steps_per_second": 113.069,
58
+ "step": 150
59
+ },
60
+ {
61
+ "epoch": 0.031308703819661866,
62
+ "grad_norm": 7.337674140930176,
63
+ "learning_rate": 1.242348997961281e-05,
64
+ "loss": 0.6409,
65
+ "step": 200
66
+ },
67
+ {
68
+ "epoch": 0.031308703819661866,
69
+ "eval_f1": 0.6504815944850184,
70
+ "eval_loss": 0.629406750202179,
71
+ "eval_runtime": 2.2116,
72
+ "eval_samples_per_second": 443.123,
73
+ "eval_steps_per_second": 110.781,
74
+ "step": 200
75
+ },
76
+ {
77
+ "epoch": 0.03913587977457733,
78
+ "grad_norm": 11.70043659210205,
79
+ "learning_rate": 1.240953727298701e-05,
80
+ "loss": 0.6825,
81
+ "step": 250
82
+ },
83
+ {
84
+ "epoch": 0.03913587977457733,
85
+ "eval_f1": 0.6294069289489137,
86
+ "eval_loss": 0.6766626238822937,
87
+ "eval_runtime": 2.1929,
88
+ "eval_samples_per_second": 446.898,
89
+ "eval_steps_per_second": 111.725,
90
+ "step": 250
91
+ },
92
+ {
93
+ "epoch": 0.046963055729492796,
94
+ "grad_norm": 20.191137313842773,
95
+ "learning_rate": 1.239558456636121e-05,
96
+ "loss": 0.6583,
97
+ "step": 300
98
+ },
99
+ {
100
+ "epoch": 0.046963055729492796,
101
+ "eval_f1": 0.7144522144522144,
102
+ "eval_loss": 0.5963508486747742,
103
+ "eval_runtime": 2.1548,
104
+ "eval_samples_per_second": 454.799,
105
+ "eval_steps_per_second": 113.7,
106
+ "step": 300
107
+ },
108
+ {
109
+ "epoch": 0.05479023168440827,
110
+ "grad_norm": 10.917688369750977,
111
+ "learning_rate": 1.2381910913867924e-05,
112
+ "loss": 0.6621,
113
+ "step": 350
114
+ },
115
+ {
116
+ "epoch": 0.05479023168440827,
117
+ "eval_f1": 0.7455362231951733,
118
+ "eval_loss": 0.5801416635513306,
119
+ "eval_runtime": 2.1278,
120
+ "eval_samples_per_second": 460.569,
121
+ "eval_steps_per_second": 115.142,
122
+ "step": 350
123
+ },
124
+ {
125
+ "epoch": 0.06261740763932373,
126
+ "grad_norm": 2.700626850128174,
127
+ "learning_rate": 1.2367958207242124e-05,
128
+ "loss": 0.6045,
129
+ "step": 400
130
+ },
131
+ {
132
+ "epoch": 0.06261740763932373,
133
+ "eval_f1": 0.7608345960078264,
134
+ "eval_loss": 0.5664511919021606,
135
+ "eval_runtime": 2.1741,
136
+ "eval_samples_per_second": 450.751,
137
+ "eval_steps_per_second": 112.688,
138
+ "step": 400
139
+ },
140
+ {
141
+ "epoch": 0.0704445835942392,
142
+ "grad_norm": 41.931392669677734,
143
+ "learning_rate": 1.2354005500616322e-05,
144
+ "loss": 0.642,
145
+ "step": 450
146
+ },
147
+ {
148
+ "epoch": 0.0704445835942392,
149
+ "eval_f1": 0.7535545438244311,
150
+ "eval_loss": 0.6086084842681885,
151
+ "eval_runtime": 2.1637,
152
+ "eval_samples_per_second": 452.937,
153
+ "eval_steps_per_second": 113.234,
154
+ "step": 450
155
+ },
156
+ {
157
+ "epoch": 0.07827175954915466,
158
+ "grad_norm": 14.928444862365723,
159
+ "learning_rate": 1.2340052793990522e-05,
160
+ "loss": 0.5615,
161
+ "step": 500
162
+ },
163
+ {
164
+ "epoch": 0.07827175954915466,
165
+ "eval_f1": 0.7875816993464052,
166
+ "eval_loss": 0.5279112458229065,
167
+ "eval_runtime": 2.1932,
168
+ "eval_samples_per_second": 446.837,
169
+ "eval_steps_per_second": 111.709,
170
+ "step": 500
171
+ },
172
+ {
173
+ "epoch": 0.08609893550407013,
174
+ "grad_norm": 0.15141427516937256,
175
+ "learning_rate": 1.2326100087364722e-05,
176
+ "loss": 0.6715,
177
+ "step": 550
178
+ },
179
+ {
180
+ "epoch": 0.08609893550407013,
181
+ "eval_f1": 0.802000408246581,
182
+ "eval_loss": 0.49675270915031433,
183
+ "eval_runtime": 2.3287,
184
+ "eval_samples_per_second": 420.836,
185
+ "eval_steps_per_second": 105.209,
186
+ "step": 550
187
+ },
188
+ {
189
+ "epoch": 0.09392611145898559,
190
+ "grad_norm": 67.29998779296875,
191
+ "learning_rate": 1.2312147380738922e-05,
192
+ "loss": 0.6121,
193
+ "step": 600
194
+ },
195
+ {
196
+ "epoch": 0.09392611145898559,
197
+ "eval_f1": 0.818196977592496,
198
+ "eval_loss": 0.48247167468070984,
199
+ "eval_runtime": 2.3577,
200
+ "eval_samples_per_second": 415.654,
201
+ "eval_steps_per_second": 103.914,
202
+ "step": 600
203
+ },
204
+ {
205
+ "epoch": 0.10175328741390106,
206
+ "grad_norm": 47.65116882324219,
207
+ "learning_rate": 1.2298194674113121e-05,
208
+ "loss": 0.6235,
209
+ "step": 650
210
+ },
211
+ {
212
+ "epoch": 0.10175328741390106,
213
+ "eval_f1": 0.8135306553911205,
214
+ "eval_loss": 0.46964216232299805,
215
+ "eval_runtime": 2.3043,
216
+ "eval_samples_per_second": 425.285,
217
+ "eval_steps_per_second": 106.321,
218
+ "step": 650
219
+ },
220
+ {
221
+ "epoch": 0.10958046336881654,
222
+ "grad_norm": 64.26823425292969,
223
+ "learning_rate": 1.228424196748732e-05,
224
+ "loss": 0.5202,
225
+ "step": 700
226
+ },
227
+ {
228
+ "epoch": 0.10958046336881654,
229
+ "eval_f1": 0.8239942528735633,
230
+ "eval_loss": 0.4604596197605133,
231
+ "eval_runtime": 2.2844,
232
+ "eval_samples_per_second": 428.995,
233
+ "eval_steps_per_second": 107.249,
234
+ "step": 700
235
+ },
236
+ {
237
+ "epoch": 0.117407639323732,
238
+ "grad_norm": 18.147687911987305,
239
+ "learning_rate": 1.227028926086152e-05,
240
+ "loss": 0.4495,
241
+ "step": 750
242
+ },
243
+ {
244
+ "epoch": 0.117407639323732,
245
+ "eval_f1": 0.8460610477901906,
246
+ "eval_loss": 0.4548790156841278,
247
+ "eval_runtime": 2.2457,
248
+ "eval_samples_per_second": 436.391,
249
+ "eval_steps_per_second": 109.098,
250
+ "step": 750
251
+ },
252
+ {
253
+ "epoch": 0.12523481527864747,
254
+ "grad_norm": 0.5921919941902161,
255
+ "learning_rate": 1.225633655423572e-05,
256
+ "loss": 0.3791,
257
+ "step": 800
258
+ },
259
+ {
260
+ "epoch": 0.12523481527864747,
261
+ "eval_f1": 0.7905613416348929,
262
+ "eval_loss": 0.5360157489776611,
263
+ "eval_runtime": 2.189,
264
+ "eval_samples_per_second": 447.699,
265
+ "eval_steps_per_second": 111.925,
266
+ "step": 800
267
+ },
268
+ {
269
+ "epoch": 0.13306199123356294,
270
+ "grad_norm": 18.999170303344727,
271
+ "learning_rate": 1.2242383847609919e-05,
272
+ "loss": 0.4911,
273
+ "step": 850
274
+ },
275
+ {
276
+ "epoch": 0.13306199123356294,
277
+ "eval_f1": 0.840813674530188,
278
+ "eval_loss": 0.46410810947418213,
279
+ "eval_runtime": 2.1583,
280
+ "eval_samples_per_second": 454.058,
281
+ "eval_steps_per_second": 113.514,
282
+ "step": 850
283
+ },
284
+ {
285
+ "epoch": 0.1408891671884784,
286
+ "grad_norm": 26.950950622558594,
287
+ "learning_rate": 1.2228431140984119e-05,
288
+ "loss": 0.5958,
289
+ "step": 900
290
+ },
291
+ {
292
+ "epoch": 0.1408891671884784,
293
+ "eval_f1": 0.8473905723905724,
294
+ "eval_loss": 0.43761199712753296,
295
+ "eval_runtime": 2.1666,
296
+ "eval_samples_per_second": 452.312,
297
+ "eval_steps_per_second": 113.078,
298
+ "step": 900
299
+ },
300
+ {
301
+ "epoch": 0.14871634314339385,
302
+ "grad_norm": 0.00022970873396843672,
303
+ "learning_rate": 1.2214478434358317e-05,
304
+ "loss": 0.5189,
305
+ "step": 950
306
+ },
307
+ {
308
+ "epoch": 0.14871634314339385,
309
+ "eval_f1": 0.8488183986257734,
310
+ "eval_loss": 0.45094814896583557,
311
+ "eval_runtime": 2.1617,
312
+ "eval_samples_per_second": 453.351,
313
+ "eval_steps_per_second": 113.338,
314
+ "step": 950
315
+ },
316
+ {
317
+ "epoch": 0.15654351909830932,
318
+ "grad_norm": 2.4208905696868896,
319
+ "learning_rate": 1.2200525727732517e-05,
320
+ "loss": 0.4722,
321
+ "step": 1000
322
+ },
323
+ {
324
+ "epoch": 0.15654351909830932,
325
+ "eval_f1": 0.8606300925228798,
326
+ "eval_loss": 0.44244199991226196,
327
+ "eval_runtime": 2.1775,
328
+ "eval_samples_per_second": 450.049,
329
+ "eval_steps_per_second": 112.512,
330
+ "step": 1000
331
+ }
332
+ ],
333
+ "logging_steps": 50,
334
+ "max_steps": 44716,
335
+ "num_input_tokens_seen": 0,
336
+ "num_train_epochs": 7,
337
+ "save_steps": 500,
338
+ "stateful_callbacks": {
339
+ "EarlyStoppingCallback": {
340
+ "args": {
341
+ "early_stopping_patience": 3,
342
+ "early_stopping_threshold": 0.0
343
+ },
344
+ "attributes": {
345
+ "early_stopping_patience_counter": 0
346
+ }
347
+ },
348
+ "TrainerControl": {
349
+ "args": {
350
+ "should_epoch_stop": false,
351
+ "should_evaluate": false,
352
+ "should_log": false,
353
+ "should_save": true,
354
+ "should_training_stop": false
355
+ },
356
+ "attributes": {}
357
+ }
358
+ },
359
+ "total_flos": 21893203694340.0,
360
+ "train_batch_size": 1,
361
+ "trial_name": null,
362
+ "trial_params": {
363
+ "learning_rate": 1.2478184589585948e-05,
364
+ "num_train_epochs": 7,
365
+ "per_device_train_batch_size": 1,
366
+ "weight_decay": 0.05945867605206346
367
+ }
368
+ }
run-0/checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:755cd6f28e7767e2abb9d2f1a8dceef555a9ce6bfa3b12742b43177292af4d13
3
+ size 5777
run-0/checkpoint-500/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 130,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.51.3",
23
+ "type_vocab_size": 1,
24
+ "use_cache": true,
25
+ "vocab_size": 30000
26
+ }
run-0/checkpoint-500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7335ac0549eefd160bda7ef9753f8da182b7892fd483414aec8254da1011ab0
3
+ size 435179080
run-0/checkpoint-500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f0d69a17a9039c18d4344cf22338ca2a838db5cd53546c6694f9df63ab693b0
3
+ size 870478475
run-0/checkpoint-500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13e906f66e903280d52015bfe82ad0b677e22105864a77948b2c10c8d250d8f6
3
+ size 14645
run-0/checkpoint-500/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d265aedd04ff4f51f4f09b5269b650135e64e396997f15e9ed8ca76ab092a354
3
+ size 1383
run-0/checkpoint-500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a532aad0cc7978f5f1963c3fb78c79918442fb54e0d68a2d1a6fbc6eb4707cf
3
+ size 1465
run-0/checkpoint-500/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
run-0/checkpoint-500/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
run-0/checkpoint-500/tokenizer_config.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "<mask>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "extra_special_tokens": {},
49
+ "mask_token": "<mask>",
50
+ "max_length": 128,
51
+ "model_max_length": 128,
52
+ "pad_to_multiple_of": null,
53
+ "pad_token": "<pad>",
54
+ "pad_token_type_id": 0,
55
+ "padding_side": "right",
56
+ "sep_token": "</s>",
57
+ "stride": 0,
58
+ "tokenizer_class": "PreTrainedTokenizer",
59
+ "truncation_side": "right",
60
+ "truncation_strategy": "longest_first",
61
+ "unk_token": "<unk>"
62
+ }
run-0/checkpoint-500/trainer_state.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 500,
3
+ "best_metric": 0.7875816993464052,
4
+ "best_model_checkpoint": "./robertuito-esp/run-0/checkpoint-500",
5
+ "epoch": 0.07827175954915466,
6
+ "eval_steps": 50,
7
+ "global_step": 500,
8
+ "is_hyper_param_search": true,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.007827175954915467,
14
+ "grad_norm": 19.564104080200195,
15
+ "learning_rate": 1.2465069045357695e-05,
16
+ "loss": 0.7153,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.007827175954915467,
21
+ "eval_f1": 0.5814973293844199,
22
+ "eval_loss": 0.66972416639328,
23
+ "eval_runtime": 2.1927,
24
+ "eval_samples_per_second": 446.933,
25
+ "eval_steps_per_second": 111.733,
26
+ "step": 50
27
+ },
28
+ {
29
+ "epoch": 0.015654351909830933,
30
+ "grad_norm": 6.274869918823242,
31
+ "learning_rate": 1.2451395392864412e-05,
32
+ "loss": 0.723,
33
+ "step": 100
34
+ },
35
+ {
36
+ "epoch": 0.015654351909830933,
37
+ "eval_f1": 0.5240407965031568,
38
+ "eval_loss": 0.6730000376701355,
39
+ "eval_runtime": 2.2102,
40
+ "eval_samples_per_second": 443.407,
41
+ "eval_steps_per_second": 110.852,
42
+ "step": 100
43
+ },
44
+ {
45
+ "epoch": 0.023481527864746398,
46
+ "grad_norm": 12.325215339660645,
47
+ "learning_rate": 1.243744268623861e-05,
48
+ "loss": 0.7258,
49
+ "step": 150
50
+ },
51
+ {
52
+ "epoch": 0.023481527864746398,
53
+ "eval_f1": 0.6702165564749198,
54
+ "eval_loss": 0.618554949760437,
55
+ "eval_runtime": 2.1668,
56
+ "eval_samples_per_second": 452.276,
57
+ "eval_steps_per_second": 113.069,
58
+ "step": 150
59
+ },
60
+ {
61
+ "epoch": 0.031308703819661866,
62
+ "grad_norm": 7.337674140930176,
63
+ "learning_rate": 1.242348997961281e-05,
64
+ "loss": 0.6409,
65
+ "step": 200
66
+ },
67
+ {
68
+ "epoch": 0.031308703819661866,
69
+ "eval_f1": 0.6504815944850184,
70
+ "eval_loss": 0.629406750202179,
71
+ "eval_runtime": 2.2116,
72
+ "eval_samples_per_second": 443.123,
73
+ "eval_steps_per_second": 110.781,
74
+ "step": 200
75
+ },
76
+ {
77
+ "epoch": 0.03913587977457733,
78
+ "grad_norm": 11.70043659210205,
79
+ "learning_rate": 1.240953727298701e-05,
80
+ "loss": 0.6825,
81
+ "step": 250
82
+ },
83
+ {
84
+ "epoch": 0.03913587977457733,
85
+ "eval_f1": 0.6294069289489137,
86
+ "eval_loss": 0.6766626238822937,
87
+ "eval_runtime": 2.1929,
88
+ "eval_samples_per_second": 446.898,
89
+ "eval_steps_per_second": 111.725,
90
+ "step": 250
91
+ },
92
+ {
93
+ "epoch": 0.046963055729492796,
94
+ "grad_norm": 20.191137313842773,
95
+ "learning_rate": 1.239558456636121e-05,
96
+ "loss": 0.6583,
97
+ "step": 300
98
+ },
99
+ {
100
+ "epoch": 0.046963055729492796,
101
+ "eval_f1": 0.7144522144522144,
102
+ "eval_loss": 0.5963508486747742,
103
+ "eval_runtime": 2.1548,
104
+ "eval_samples_per_second": 454.799,
105
+ "eval_steps_per_second": 113.7,
106
+ "step": 300
107
+ },
108
+ {
109
+ "epoch": 0.05479023168440827,
110
+ "grad_norm": 10.917688369750977,
111
+ "learning_rate": 1.2381910913867924e-05,
112
+ "loss": 0.6621,
113
+ "step": 350
114
+ },
115
+ {
116
+ "epoch": 0.05479023168440827,
117
+ "eval_f1": 0.7455362231951733,
118
+ "eval_loss": 0.5801416635513306,
119
+ "eval_runtime": 2.1278,
120
+ "eval_samples_per_second": 460.569,
121
+ "eval_steps_per_second": 115.142,
122
+ "step": 350
123
+ },
124
+ {
125
+ "epoch": 0.06261740763932373,
126
+ "grad_norm": 2.700626850128174,
127
+ "learning_rate": 1.2367958207242124e-05,
128
+ "loss": 0.6045,
129
+ "step": 400
130
+ },
131
+ {
132
+ "epoch": 0.06261740763932373,
133
+ "eval_f1": 0.7608345960078264,
134
+ "eval_loss": 0.5664511919021606,
135
+ "eval_runtime": 2.1741,
136
+ "eval_samples_per_second": 450.751,
137
+ "eval_steps_per_second": 112.688,
138
+ "step": 400
139
+ },
140
+ {
141
+ "epoch": 0.0704445835942392,
142
+ "grad_norm": 41.931392669677734,
143
+ "learning_rate": 1.2354005500616322e-05,
144
+ "loss": 0.642,
145
+ "step": 450
146
+ },
147
+ {
148
+ "epoch": 0.0704445835942392,
149
+ "eval_f1": 0.7535545438244311,
150
+ "eval_loss": 0.6086084842681885,
151
+ "eval_runtime": 2.1637,
152
+ "eval_samples_per_second": 452.937,
153
+ "eval_steps_per_second": 113.234,
154
+ "step": 450
155
+ },
156
+ {
157
+ "epoch": 0.07827175954915466,
158
+ "grad_norm": 14.928444862365723,
159
+ "learning_rate": 1.2340052793990522e-05,
160
+ "loss": 0.5615,
161
+ "step": 500
162
+ },
163
+ {
164
+ "epoch": 0.07827175954915466,
165
+ "eval_f1": 0.7875816993464052,
166
+ "eval_loss": 0.5279112458229065,
167
+ "eval_runtime": 2.1932,
168
+ "eval_samples_per_second": 446.837,
169
+ "eval_steps_per_second": 111.709,
170
+ "step": 500
171
+ }
172
+ ],
173
+ "logging_steps": 50,
174
+ "max_steps": 44716,
175
+ "num_input_tokens_seen": 0,
176
+ "num_train_epochs": 7,
177
+ "save_steps": 500,
178
+ "stateful_callbacks": {
179
+ "EarlyStoppingCallback": {
180
+ "args": {
181
+ "early_stopping_patience": 3,
182
+ "early_stopping_threshold": 0.0
183
+ },
184
+ "attributes": {
185
+ "early_stopping_patience_counter": 0
186
+ }
187
+ },
188
+ "TrainerControl": {
189
+ "args": {
190
+ "should_epoch_stop": false,
191
+ "should_evaluate": false,
192
+ "should_log": false,
193
+ "should_save": true,
194
+ "should_training_stop": false
195
+ },
196
+ "attributes": {}
197
+ }
198
+ },
199
+ "total_flos": 11053747657800.0,
200
+ "train_batch_size": 1,
201
+ "trial_name": null,
202
+ "trial_params": {
203
+ "learning_rate": 1.2478184589585948e-05,
204
+ "num_train_epochs": 7,
205
+ "per_device_train_batch_size": 1,
206
+ "weight_decay": 0.05945867605206346
207
+ }
208
+ }
run-0/checkpoint-500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:755cd6f28e7767e2abb9d2f1a8dceef555a9ce6bfa3b12742b43177292af4d13
3
+ size 5777
run-1/checkpoint-500/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 130,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.51.3",
23
+ "type_vocab_size": 1,
24
+ "use_cache": true,
25
+ "vocab_size": 30000
26
+ }
run-1/checkpoint-500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23b32e402598ca88acd5d632ade7854e4a965519b9c271006ca11d9a6fb458a8
3
+ size 435179080
run-1/checkpoint-500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e992a490bfae20571b9b6cf61085460490f94b12700007b1978d88ee6409bc6
3
+ size 870478475
run-1/checkpoint-500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:619776da0951c1c4afacf4e47900ce6d06a79bfce504c0895e1cd1bce504069e
3
+ size 14645
run-1/checkpoint-500/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f3f0dacd62ed0c1d0cf61c392d53c17c34d60cdb864c1f50aae811adb62ad00
3
+ size 1383
run-1/checkpoint-500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32c7835cc2414546ba3315e7af0e069872bc1fb2791ab8b70fe0ab170fe10087
3
+ size 1465
run-1/checkpoint-500/special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
run-1/checkpoint-500/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
run-1/checkpoint-500/tokenizer_config.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "<mask>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "extra_special_tokens": {},
49
+ "mask_token": "<mask>",
50
+ "max_length": 128,
51
+ "model_max_length": 128,
52
+ "pad_to_multiple_of": null,
53
+ "pad_token": "<pad>",
54
+ "pad_token_type_id": 0,
55
+ "padding_side": "right",
56
+ "sep_token": "</s>",
57
+ "stride": 0,
58
+ "tokenizer_class": "PreTrainedTokenizer",
59
+ "truncation_side": "right",
60
+ "truncation_strategy": "longest_first",
61
+ "unk_token": "<unk>"
62
+ }
run-1/checkpoint-500/trainer_state.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 450,
3
+ "best_metric": 0.868959868959869,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.31308703819661865,
6
+ "eval_steps": 50,
7
+ "global_step": 500,
8
+ "is_hyper_param_search": true,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.031308703819661866,
14
+ "grad_norm": 8.278752326965332,
15
+ "learning_rate": 1.4608659906664293e-05,
16
+ "loss": 0.6911,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.031308703819661866,
21
+ "eval_f1": 0.6744135183690473,
22
+ "eval_loss": 0.6388781070709229,
23
+ "eval_runtime": 2.17,
24
+ "eval_samples_per_second": 451.608,
25
+ "eval_steps_per_second": 112.902,
26
+ "step": 50
27
+ },
28
+ {
29
+ "epoch": 0.06261740763932373,
30
+ "grad_norm": 7.287458419799805,
31
+ "learning_rate": 1.4551272022052947e-05,
32
+ "loss": 0.6292,
33
+ "step": 100
34
+ },
35
+ {
36
+ "epoch": 0.06261740763932373,
37
+ "eval_f1": 0.7405039802177549,
38
+ "eval_loss": 0.5760383009910583,
39
+ "eval_runtime": 2.1717,
40
+ "eval_samples_per_second": 451.262,
41
+ "eval_steps_per_second": 112.816,
42
+ "step": 100
43
+ },
44
+ {
45
+ "epoch": 0.09392611145898559,
46
+ "grad_norm": 9.542899131774902,
47
+ "learning_rate": 1.4493884137441602e-05,
48
+ "loss": 0.61,
49
+ "step": 150
50
+ },
51
+ {
52
+ "epoch": 0.09392611145898559,
53
+ "eval_f1": 0.7517948717948717,
54
+ "eval_loss": 0.5416612029075623,
55
+ "eval_runtime": 2.1902,
56
+ "eval_samples_per_second": 447.454,
57
+ "eval_steps_per_second": 111.864,
58
+ "step": 150
59
+ },
60
+ {
61
+ "epoch": 0.12523481527864747,
62
+ "grad_norm": 7.49862813949585,
63
+ "learning_rate": 1.4436496252830255e-05,
64
+ "loss": 0.5278,
65
+ "step": 200
66
+ },
67
+ {
68
+ "epoch": 0.12523481527864747,
69
+ "eval_f1": 0.7796941838408146,
70
+ "eval_loss": 0.4883626699447632,
71
+ "eval_runtime": 2.1941,
72
+ "eval_samples_per_second": 446.649,
73
+ "eval_steps_per_second": 111.662,
74
+ "step": 200
75
+ },
76
+ {
77
+ "epoch": 0.15654351909830932,
78
+ "grad_norm": 10.641839981079102,
79
+ "learning_rate": 1.437910836821891e-05,
80
+ "loss": 0.5119,
81
+ "step": 250
82
+ },
83
+ {
84
+ "epoch": 0.15654351909830932,
85
+ "eval_f1": 0.8223890608660148,
86
+ "eval_loss": 0.45297476649284363,
87
+ "eval_runtime": 2.1824,
88
+ "eval_samples_per_second": 449.042,
89
+ "eval_steps_per_second": 112.261,
90
+ "step": 250
91
+ },
92
+ {
93
+ "epoch": 0.18785222291797118,
94
+ "grad_norm": 12.158267974853516,
95
+ "learning_rate": 1.432286824129979e-05,
96
+ "loss": 0.5144,
97
+ "step": 300
98
+ },
99
+ {
100
+ "epoch": 0.18785222291797118,
101
+ "eval_f1": 0.84075,
102
+ "eval_loss": 0.4445246160030365,
103
+ "eval_runtime": 2.1904,
104
+ "eval_samples_per_second": 447.415,
105
+ "eval_steps_per_second": 111.854,
106
+ "step": 300
107
+ },
108
+ {
109
+ "epoch": 0.21916092673763307,
110
+ "grad_norm": 3.415942668914795,
111
+ "learning_rate": 1.4265480356688443e-05,
112
+ "loss": 0.4357,
113
+ "step": 350
114
+ },
115
+ {
116
+ "epoch": 0.21916092673763307,
117
+ "eval_f1": 0.8448953751832601,
118
+ "eval_loss": 0.4431275427341461,
119
+ "eval_runtime": 2.1418,
120
+ "eval_samples_per_second": 457.551,
121
+ "eval_steps_per_second": 114.388,
122
+ "step": 350
123
+ },
124
+ {
125
+ "epoch": 0.25046963055729493,
126
+ "grad_norm": 10.299009323120117,
127
+ "learning_rate": 1.4208092472077096e-05,
128
+ "loss": 0.506,
129
+ "step": 400
130
+ },
131
+ {
132
+ "epoch": 0.25046963055729493,
133
+ "eval_f1": 0.8667425348624123,
134
+ "eval_loss": 0.423656165599823,
135
+ "eval_runtime": 2.2069,
136
+ "eval_samples_per_second": 444.056,
137
+ "eval_steps_per_second": 111.014,
138
+ "step": 400
139
+ },
140
+ {
141
+ "epoch": 0.2817783343769568,
142
+ "grad_norm": 12.57047176361084,
143
+ "learning_rate": 1.4150704587465751e-05,
144
+ "loss": 0.4393,
145
+ "step": 450
146
+ },
147
+ {
148
+ "epoch": 0.2817783343769568,
149
+ "eval_f1": 0.868959868959869,
150
+ "eval_loss": 0.4268187880516052,
151
+ "eval_runtime": 2.2031,
152
+ "eval_samples_per_second": 444.826,
153
+ "eval_steps_per_second": 111.207,
154
+ "step": 450
155
+ },
156
+ {
157
+ "epoch": 0.31308703819661865,
158
+ "grad_norm": 5.100917339324951,
159
+ "learning_rate": 1.4093316702854404e-05,
160
+ "loss": 0.4728,
161
+ "step": 500
162
+ },
163
+ {
164
+ "epoch": 0.31308703819661865,
165
+ "eval_f1": 0.8650813516896121,
166
+ "eval_loss": 0.41771137714385986,
167
+ "eval_runtime": 2.1879,
168
+ "eval_samples_per_second": 447.908,
169
+ "eval_steps_per_second": 111.977,
170
+ "step": 500
171
+ }
172
+ ],
173
+ "logging_steps": 50,
174
+ "max_steps": 12776,
175
+ "num_input_tokens_seen": 0,
176
+ "num_train_epochs": 8,
177
+ "save_steps": 500,
178
+ "stateful_callbacks": {
179
+ "EarlyStoppingCallback": {
180
+ "args": {
181
+ "early_stopping_patience": 3,
182
+ "early_stopping_threshold": 0.0
183
+ },
184
+ "attributes": {
185
+ "early_stopping_patience_counter": 1
186
+ }
187
+ },
188
+ "TrainerControl": {
189
+ "args": {
190
+ "should_epoch_stop": false,
191
+ "should_evaluate": false,
192
+ "should_log": false,
193
+ "should_save": true,
194
+ "should_training_stop": false
195
+ },
196
+ "attributes": {}
197
+ }
198
+ },
199
+ "total_flos": 64618430752320.0,
200
+ "train_batch_size": 4,
201
+ "trial_name": null,
202
+ "trial_params": {
203
+ "learning_rate": 1.4663752275891186e-05,
204
+ "num_train_epochs": 8,
205
+ "per_device_train_batch_size": 4,
206
+ "weight_decay": 0.09441711486215941
207
+ }
208
+ }
run-1/checkpoint-500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8cd685483891e41b5652487e985c77e8ec27d51adad32aecea426f7780676fe
3
+ size 5777
runs/May10_21-55-16_kestrel-03/events.out.tfevents.1746910538.kestrel-03.184643.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:200cc29838a49de1571799d4daeb95bec672ec5bcdb9b2de3901d88c45e4d3cb
3
+ size 5011
runs/May10_22-05-26_kestrel-03/events.out.tfevents.1746911132.kestrel-03.185312.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b96bbac73a6ab8342cf902fea9a5b7020652695c9e9659ec36bcd596f35db367
3
+ size 5531
runs/May10_22-21-59_eagle-02/events.out.tfevents.1746912125.eagle-02.560110.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a33d067ed160b902bc8a60c83af6c6c1406044832db2a13e982630f191050207
3
+ size 5010
runs/May10_22-25-58_eagle-01/events.out.tfevents.1746912364.eagle-01.185430.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d5acb1dc42b9888b435ae10a5b0ab7f652a2f8c520e7026212acaa00add2412
3
+ size 5009
runs/May10_22-33-08_eagle-01/events.out.tfevents.1746912790.eagle-01.185992.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26aaa070458923bb21ebb8bfd7bd5951eef269f3e89aa369e32b5fedbd2f4066
3
+ size 5008
runs/May10_22-38-30_eagle-01/events.out.tfevents.1746913111.eagle-01.186318.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73599df05e49387918471a0c9059368011d37488904cb9e220550472b72158a2
3
+ size 5010
runs/May11_11-11-27_falcon-05/events.out.tfevents.1746958290.falcon-05.265234.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c41776648afdf73f1f1c13e8bce41d1450adc66d8ece8373a663c50b115699c
3
+ size 20656
runs/May11_11-11-27_falcon-05/events.out.tfevents.1746958443.falcon-05.265234.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a88a4dc3d80abe5dd28b0d9c8ae04c8c258970eaace112e56a0e229e7780b5c7
3
+ size 10481
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "<mask>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "extra_special_tokens": {},
49
+ "mask_token": "<mask>",
50
+ "max_length": 128,
51
+ "model_max_length": 128,
52
+ "pad_to_multiple_of": null,
53
+ "pad_token": "<pad>",
54
+ "pad_token_type_id": 0,
55
+ "padding_side": "right",
56
+ "sep_token": "</s>",
57
+ "stride": 0,
58
+ "tokenizer_class": "PreTrainedTokenizer",
59
+ "truncation_side": "right",
60
+ "truncation_strategy": "longest_first",
61
+ "unk_token": "<unk>"
62
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8cd685483891e41b5652487e985c77e8ec27d51adad32aecea426f7780676fe
3
+ size 5777