contemmcm commited on
Commit
218e56c
·
verified ·
1 Parent(s): b4997cb

Upload 8 files

Browse files
Files changed (8) hide show
  1. config.json +59 -0
  2. model.safetensors +3 -0
  3. optimizer.pt +3 -0
  4. rng_state.pth +3 -0
  5. scaler.pt +3 -0
  6. scheduler.pt +3 -0
  7. trainer_state.json +1699 -0
  8. training_args.bin +3 -0
config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "openai-community/gpt2",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2ForSequenceClassification"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "id2label": {
12
+ "0": "LABEL_0",
13
+ "1": "LABEL_1",
14
+ "2": "LABEL_2",
15
+ "3": "LABEL_3",
16
+ "4": "LABEL_4",
17
+ "5": "LABEL_5",
18
+ "6": "LABEL_6"
19
+ },
20
+ "initializer_range": 0.02,
21
+ "label2id": {
22
+ "LABEL_0": 0,
23
+ "LABEL_1": 1,
24
+ "LABEL_2": 2,
25
+ "LABEL_3": 3,
26
+ "LABEL_4": 4,
27
+ "LABEL_5": 5,
28
+ "LABEL_6": 6
29
+ },
30
+ "layer_norm_epsilon": 1e-05,
31
+ "model_type": "gpt2",
32
+ "n_ctx": 1024,
33
+ "n_embd": 768,
34
+ "n_head": 12,
35
+ "n_inner": null,
36
+ "n_layer": 12,
37
+ "n_positions": 1024,
38
+ "pad_token_id": 50256,
39
+ "problem_type": "single_label_classification",
40
+ "reorder_and_upcast_attn": false,
41
+ "resid_pdrop": 0.1,
42
+ "scale_attn_by_inverse_layer_idx": false,
43
+ "scale_attn_weights": true,
44
+ "summary_activation": null,
45
+ "summary_first_dropout": 0.1,
46
+ "summary_proj_to_labels": true,
47
+ "summary_type": "cls_index",
48
+ "summary_use_proj": true,
49
+ "task_specific_params": {
50
+ "text-generation": {
51
+ "do_sample": true,
52
+ "max_length": 50
53
+ }
54
+ },
55
+ "torch_dtype": "float32",
56
+ "transformers_version": "4.49.0",
57
+ "use_cache": true,
58
+ "vocab_size": 50257
59
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0fc38abd2423000deac57ae4a3d8817ff4bc2330d719a447f94ccd95810b6bc
3
+ size 497795792
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0f69c0da68472420420e7035f0117250e64b5a441a109d859637922cf92ab64
3
+ size 995686138
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:186f463110bb3b61f265e81b827eb6cfe9e9853b320a88016808d64284d272d7
3
+ size 14244
scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05fdd26cf6851b130b583e49ff5643be0d0a64e3b5c5f62158e5106f14e6144d
3
+ size 988
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ab94b495a347b0fdc27086b8b7987c756563e833856d443bc82b8ca4b65f4d1
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,1699 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 1.7819397449493408,
3
+ "best_model_checkpoint": ".cache/models/de3349474c859e664bc22aa79edda407/checkpoint-116484",
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 116484,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.012877305037601731,
13
+ "grad_norm": 9.567660331726074,
14
+ "learning_rate": 6.998222416812609e-06,
15
+ "loss": 2.1235,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.025754610075203462,
20
+ "grad_norm": 8.706146240234375,
21
+ "learning_rate": 6.996419594107345e-06,
22
+ "loss": 1.9258,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.03863191511280519,
27
+ "grad_norm": 11.026541709899902,
28
+ "learning_rate": 6.9946167714020805e-06,
29
+ "loss": 1.8954,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.051509220150406924,
34
+ "grad_norm": 11.393457412719727,
35
+ "learning_rate": 6.992813948696817e-06,
36
+ "loss": 1.8783,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.06438652518800865,
41
+ "grad_norm": 6.881910800933838,
42
+ "learning_rate": 6.991011125991553e-06,
43
+ "loss": 1.8627,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.07726383022561038,
48
+ "grad_norm": 9.6551513671875,
49
+ "learning_rate": 6.989208303286289e-06,
50
+ "loss": 1.8543,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.09014113526321212,
55
+ "grad_norm": 7.362414836883545,
56
+ "learning_rate": 6.987405480581024e-06,
57
+ "loss": 1.825,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.10301844030081385,
62
+ "grad_norm": 7.49143123626709,
63
+ "learning_rate": 6.9856026578757595e-06,
64
+ "loss": 1.822,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.11589574533841558,
69
+ "grad_norm": 5.751263618469238,
70
+ "learning_rate": 6.983799835170495e-06,
71
+ "loss": 1.8325,
72
+ "step": 4500
73
+ },
74
+ {
75
+ "epoch": 0.1287730503760173,
76
+ "grad_norm": 12.193325996398926,
77
+ "learning_rate": 6.981997012465231e-06,
78
+ "loss": 1.8237,
79
+ "step": 5000
80
+ },
81
+ {
82
+ "epoch": 0.14165035541361903,
83
+ "grad_norm": 5.82090950012207,
84
+ "learning_rate": 6.980194189759967e-06,
85
+ "loss": 1.8273,
86
+ "step": 5500
87
+ },
88
+ {
89
+ "epoch": 0.15452766045122077,
90
+ "grad_norm": 10.159476280212402,
91
+ "learning_rate": 6.978391367054702e-06,
92
+ "loss": 1.8281,
93
+ "step": 6000
94
+ },
95
+ {
96
+ "epoch": 0.1674049654888225,
97
+ "grad_norm": 9.086551666259766,
98
+ "learning_rate": 6.9765885443494386e-06,
99
+ "loss": 1.8149,
100
+ "step": 6500
101
+ },
102
+ {
103
+ "epoch": 0.18028227052642423,
104
+ "grad_norm": 6.515509128570557,
105
+ "learning_rate": 6.974785721644174e-06,
106
+ "loss": 1.8216,
107
+ "step": 7000
108
+ },
109
+ {
110
+ "epoch": 0.19315957556402596,
111
+ "grad_norm": 6.047694683074951,
112
+ "learning_rate": 6.97298289893891e-06,
113
+ "loss": 1.8194,
114
+ "step": 7500
115
+ },
116
+ {
117
+ "epoch": 0.2060368806016277,
118
+ "grad_norm": 7.643856525421143,
119
+ "learning_rate": 6.971180076233646e-06,
120
+ "loss": 1.8236,
121
+ "step": 8000
122
+ },
123
+ {
124
+ "epoch": 0.21891418563922943,
125
+ "grad_norm": 7.796109199523926,
126
+ "learning_rate": 6.9693808591737915e-06,
127
+ "loss": 1.8147,
128
+ "step": 8500
129
+ },
130
+ {
131
+ "epoch": 0.23179149067683116,
132
+ "grad_norm": 5.515515327453613,
133
+ "learning_rate": 6.967578036468528e-06,
134
+ "loss": 1.8178,
135
+ "step": 9000
136
+ },
137
+ {
138
+ "epoch": 0.2446687957144329,
139
+ "grad_norm": 8.763411521911621,
140
+ "learning_rate": 6.965775213763264e-06,
141
+ "loss": 1.8173,
142
+ "step": 9500
143
+ },
144
+ {
145
+ "epoch": 0.2575461007520346,
146
+ "grad_norm": 5.124142169952393,
147
+ "learning_rate": 6.963972391058e-06,
148
+ "loss": 1.8086,
149
+ "step": 10000
150
+ },
151
+ {
152
+ "epoch": 0.27042340578963636,
153
+ "grad_norm": 7.082516193389893,
154
+ "learning_rate": 6.962173173998145e-06,
155
+ "loss": 1.8162,
156
+ "step": 10500
157
+ },
158
+ {
159
+ "epoch": 0.28330071082723807,
160
+ "grad_norm": 4.873811721801758,
161
+ "learning_rate": 6.960370351292881e-06,
162
+ "loss": 1.8011,
163
+ "step": 11000
164
+ },
165
+ {
166
+ "epoch": 0.2961780158648398,
167
+ "grad_norm": 5.1067585945129395,
168
+ "learning_rate": 6.958567528587618e-06,
169
+ "loss": 1.8113,
170
+ "step": 11500
171
+ },
172
+ {
173
+ "epoch": 0.30905532090244153,
174
+ "grad_norm": 5.661747932434082,
175
+ "learning_rate": 6.956764705882353e-06,
176
+ "loss": 1.8179,
177
+ "step": 12000
178
+ },
179
+ {
180
+ "epoch": 0.3219326259400433,
181
+ "grad_norm": 6.289585590362549,
182
+ "learning_rate": 6.954965488822499e-06,
183
+ "loss": 1.807,
184
+ "step": 12500
185
+ },
186
+ {
187
+ "epoch": 0.334809930977645,
188
+ "grad_norm": 5.190219402313232,
189
+ "learning_rate": 6.953162666117235e-06,
190
+ "loss": 1.8019,
191
+ "step": 13000
192
+ },
193
+ {
194
+ "epoch": 0.3476872360152467,
195
+ "grad_norm": 7.0948333740234375,
196
+ "learning_rate": 6.951359843411971e-06,
197
+ "loss": 1.8114,
198
+ "step": 13500
199
+ },
200
+ {
201
+ "epoch": 0.36056454105284846,
202
+ "grad_norm": 8.964125633239746,
203
+ "learning_rate": 6.9495570207067066e-06,
204
+ "loss": 1.8003,
205
+ "step": 14000
206
+ },
207
+ {
208
+ "epoch": 0.37344184609045017,
209
+ "grad_norm": 8.000853538513184,
210
+ "learning_rate": 6.947757803646853e-06,
211
+ "loss": 1.7997,
212
+ "step": 14500
213
+ },
214
+ {
215
+ "epoch": 0.38631915112805193,
216
+ "grad_norm": 5.036130428314209,
217
+ "learning_rate": 6.945954980941588e-06,
218
+ "loss": 1.8046,
219
+ "step": 15000
220
+ },
221
+ {
222
+ "epoch": 0.39919645616565363,
223
+ "grad_norm": 5.919955730438232,
224
+ "learning_rate": 6.944152158236324e-06,
225
+ "loss": 1.8084,
226
+ "step": 15500
227
+ },
228
+ {
229
+ "epoch": 0.4120737612032554,
230
+ "grad_norm": 6.051347255706787,
231
+ "learning_rate": 6.9423493355310595e-06,
232
+ "loss": 1.8101,
233
+ "step": 16000
234
+ },
235
+ {
236
+ "epoch": 0.4249510662408571,
237
+ "grad_norm": 5.415486812591553,
238
+ "learning_rate": 6.940550118471206e-06,
239
+ "loss": 1.8002,
240
+ "step": 16500
241
+ },
242
+ {
243
+ "epoch": 0.43782837127845886,
244
+ "grad_norm": 5.6131744384765625,
245
+ "learning_rate": 6.938747295765942e-06,
246
+ "loss": 1.8176,
247
+ "step": 17000
248
+ },
249
+ {
250
+ "epoch": 0.45070567631606057,
251
+ "grad_norm": 5.342150688171387,
252
+ "learning_rate": 6.9369444730606775e-06,
253
+ "loss": 1.8064,
254
+ "step": 17500
255
+ },
256
+ {
257
+ "epoch": 0.4635829813536623,
258
+ "grad_norm": 8.20988655090332,
259
+ "learning_rate": 6.935141650355413e-06,
260
+ "loss": 1.8067,
261
+ "step": 18000
262
+ },
263
+ {
264
+ "epoch": 0.47646028639126403,
265
+ "grad_norm": 6.165362358093262,
266
+ "learning_rate": 6.933338827650149e-06,
267
+ "loss": 1.7982,
268
+ "step": 18500
269
+ },
270
+ {
271
+ "epoch": 0.4893375914288658,
272
+ "grad_norm": 5.723970413208008,
273
+ "learning_rate": 6.9315396105902955e-06,
274
+ "loss": 1.8078,
275
+ "step": 19000
276
+ },
277
+ {
278
+ "epoch": 0.5022148964664676,
279
+ "grad_norm": 5.340003967285156,
280
+ "learning_rate": 6.929740393530442e-06,
281
+ "loss": 1.8059,
282
+ "step": 19500
283
+ },
284
+ {
285
+ "epoch": 0.5150922015040692,
286
+ "grad_norm": 7.968153953552246,
287
+ "learning_rate": 6.927937570825178e-06,
288
+ "loss": 1.8028,
289
+ "step": 20000
290
+ },
291
+ {
292
+ "epoch": 0.527969506541671,
293
+ "grad_norm": 5.750583171844482,
294
+ "learning_rate": 6.9261347481199135e-06,
295
+ "loss": 1.7912,
296
+ "step": 20500
297
+ },
298
+ {
299
+ "epoch": 0.5408468115792727,
300
+ "grad_norm": 6.711771488189697,
301
+ "learning_rate": 6.924331925414649e-06,
302
+ "loss": 1.8043,
303
+ "step": 21000
304
+ },
305
+ {
306
+ "epoch": 0.5537241166168744,
307
+ "grad_norm": 6.380643367767334,
308
+ "learning_rate": 6.922529102709385e-06,
309
+ "loss": 1.7956,
310
+ "step": 21500
311
+ },
312
+ {
313
+ "epoch": 0.5666014216544761,
314
+ "grad_norm": 7.911167621612549,
315
+ "learning_rate": 6.92072628000412e-06,
316
+ "loss": 1.8033,
317
+ "step": 22000
318
+ },
319
+ {
320
+ "epoch": 0.5794787266920779,
321
+ "grad_norm": 5.734021186828613,
322
+ "learning_rate": 6.918923457298856e-06,
323
+ "loss": 1.8028,
324
+ "step": 22500
325
+ },
326
+ {
327
+ "epoch": 0.5923560317296797,
328
+ "grad_norm": 3.4658050537109375,
329
+ "learning_rate": 6.917120634593592e-06,
330
+ "loss": 1.8017,
331
+ "step": 23000
332
+ },
333
+ {
334
+ "epoch": 0.6052333367672813,
335
+ "grad_norm": 5.958466053009033,
336
+ "learning_rate": 6.915321417533738e-06,
337
+ "loss": 1.8048,
338
+ "step": 23500
339
+ },
340
+ {
341
+ "epoch": 0.6181106418048831,
342
+ "grad_norm": 6.3394455909729,
343
+ "learning_rate": 6.913518594828474e-06,
344
+ "loss": 1.7956,
345
+ "step": 24000
346
+ },
347
+ {
348
+ "epoch": 0.6309879468424848,
349
+ "grad_norm": 5.954572677612305,
350
+ "learning_rate": 6.91171577212321e-06,
351
+ "loss": 1.7944,
352
+ "step": 24500
353
+ },
354
+ {
355
+ "epoch": 0.6438652518800866,
356
+ "grad_norm": 7.130244731903076,
357
+ "learning_rate": 6.9099129494179455e-06,
358
+ "loss": 1.7954,
359
+ "step": 25000
360
+ },
361
+ {
362
+ "epoch": 0.6567425569176882,
363
+ "grad_norm": 7.7536725997924805,
364
+ "learning_rate": 6.908110126712681e-06,
365
+ "loss": 1.8015,
366
+ "step": 25500
367
+ },
368
+ {
369
+ "epoch": 0.66961986195529,
370
+ "grad_norm": 8.227217674255371,
371
+ "learning_rate": 6.906307304007418e-06,
372
+ "loss": 1.7917,
373
+ "step": 26000
374
+ },
375
+ {
376
+ "epoch": 0.6824971669928918,
377
+ "grad_norm": 5.820123195648193,
378
+ "learning_rate": 6.904504481302153e-06,
379
+ "loss": 1.7853,
380
+ "step": 26500
381
+ },
382
+ {
383
+ "epoch": 0.6953744720304934,
384
+ "grad_norm": 3.5955381393432617,
385
+ "learning_rate": 6.902705264242299e-06,
386
+ "loss": 1.7995,
387
+ "step": 27000
388
+ },
389
+ {
390
+ "epoch": 0.7082517770680952,
391
+ "grad_norm": 6.445123672485352,
392
+ "learning_rate": 6.900902441537035e-06,
393
+ "loss": 1.795,
394
+ "step": 27500
395
+ },
396
+ {
397
+ "epoch": 0.7211290821056969,
398
+ "grad_norm": 8.052949905395508,
399
+ "learning_rate": 6.89909961883177e-06,
400
+ "loss": 1.7967,
401
+ "step": 28000
402
+ },
403
+ {
404
+ "epoch": 0.7340063871432987,
405
+ "grad_norm": 4.261895179748535,
406
+ "learning_rate": 6.897296796126507e-06,
407
+ "loss": 1.789,
408
+ "step": 28500
409
+ },
410
+ {
411
+ "epoch": 0.7468836921809003,
412
+ "grad_norm": 5.8953633308410645,
413
+ "learning_rate": 6.895497579066653e-06,
414
+ "loss": 1.7828,
415
+ "step": 29000
416
+ },
417
+ {
418
+ "epoch": 0.7597609972185021,
419
+ "grad_norm": 5.249965667724609,
420
+ "learning_rate": 6.893694756361388e-06,
421
+ "loss": 1.8035,
422
+ "step": 29500
423
+ },
424
+ {
425
+ "epoch": 0.7726383022561039,
426
+ "grad_norm": 7.793389320373535,
427
+ "learning_rate": 6.891895539301535e-06,
428
+ "loss": 1.8054,
429
+ "step": 30000
430
+ },
431
+ {
432
+ "epoch": 0.7855156072937056,
433
+ "grad_norm": 5.460562229156494,
434
+ "learning_rate": 6.890092716596271e-06,
435
+ "loss": 1.7936,
436
+ "step": 30500
437
+ },
438
+ {
439
+ "epoch": 0.7983929123313073,
440
+ "grad_norm": 5.304771423339844,
441
+ "learning_rate": 6.888289893891006e-06,
442
+ "loss": 1.783,
443
+ "step": 31000
444
+ },
445
+ {
446
+ "epoch": 0.811270217368909,
447
+ "grad_norm": 5.964851379394531,
448
+ "learning_rate": 6.886487071185742e-06,
449
+ "loss": 1.7881,
450
+ "step": 31500
451
+ },
452
+ {
453
+ "epoch": 0.8241475224065108,
454
+ "grad_norm": 5.454218864440918,
455
+ "learning_rate": 6.884684248480478e-06,
456
+ "loss": 1.7944,
457
+ "step": 32000
458
+ },
459
+ {
460
+ "epoch": 0.8370248274441126,
461
+ "grad_norm": 5.067232131958008,
462
+ "learning_rate": 6.8828814257752135e-06,
463
+ "loss": 1.7939,
464
+ "step": 32500
465
+ },
466
+ {
467
+ "epoch": 0.8499021324817142,
468
+ "grad_norm": 6.87580680847168,
469
+ "learning_rate": 6.881078603069949e-06,
470
+ "loss": 1.7849,
471
+ "step": 33000
472
+ },
473
+ {
474
+ "epoch": 0.862779437519316,
475
+ "grad_norm": 7.284745693206787,
476
+ "learning_rate": 6.879275780364685e-06,
477
+ "loss": 1.7981,
478
+ "step": 33500
479
+ },
480
+ {
481
+ "epoch": 0.8756567425569177,
482
+ "grad_norm": 7.6209540367126465,
483
+ "learning_rate": 6.8774765633048315e-06,
484
+ "loss": 1.8011,
485
+ "step": 34000
486
+ },
487
+ {
488
+ "epoch": 0.8885340475945194,
489
+ "grad_norm": 6.659396648406982,
490
+ "learning_rate": 6.875673740599567e-06,
491
+ "loss": 1.8105,
492
+ "step": 34500
493
+ },
494
+ {
495
+ "epoch": 0.9014113526321211,
496
+ "grad_norm": 5.977092266082764,
497
+ "learning_rate": 6.873870917894302e-06,
498
+ "loss": 1.7939,
499
+ "step": 35000
500
+ },
501
+ {
502
+ "epoch": 0.9142886576697229,
503
+ "grad_norm": 6.564746379852295,
504
+ "learning_rate": 6.872068095189039e-06,
505
+ "loss": 1.7882,
506
+ "step": 35500
507
+ },
508
+ {
509
+ "epoch": 0.9271659627073247,
510
+ "grad_norm": 6.975079536437988,
511
+ "learning_rate": 6.870265272483775e-06,
512
+ "loss": 1.7942,
513
+ "step": 36000
514
+ },
515
+ {
516
+ "epoch": 0.9400432677449263,
517
+ "grad_norm": 8.673888206481934,
518
+ "learning_rate": 6.8684624497785105e-06,
519
+ "loss": 1.7938,
520
+ "step": 36500
521
+ },
522
+ {
523
+ "epoch": 0.9529205727825281,
524
+ "grad_norm": 6.115748405456543,
525
+ "learning_rate": 6.866663232718656e-06,
526
+ "loss": 1.8014,
527
+ "step": 37000
528
+ },
529
+ {
530
+ "epoch": 0.9657978778201298,
531
+ "grad_norm": 5.892397403717041,
532
+ "learning_rate": 6.864860410013392e-06,
533
+ "loss": 1.7955,
534
+ "step": 37500
535
+ },
536
+ {
537
+ "epoch": 0.9786751828577316,
538
+ "grad_norm": 3.719658374786377,
539
+ "learning_rate": 6.863057587308128e-06,
540
+ "loss": 1.773,
541
+ "step": 38000
542
+ },
543
+ {
544
+ "epoch": 0.9915524878953332,
545
+ "grad_norm": 6.752033233642578,
546
+ "learning_rate": 6.861254764602864e-06,
547
+ "loss": 1.7828,
548
+ "step": 38500
549
+ },
550
+ {
551
+ "epoch": 1.0,
552
+ "eval_accuracy": 0.2799319579535046,
553
+ "eval_f1_macro": 0.14142429830939493,
554
+ "eval_f1_micro": 0.2799319579535046,
555
+ "eval_loss": 1.8260160684585571,
556
+ "eval_runtime": 225.2986,
557
+ "eval_samples_per_second": 101.763,
558
+ "eval_steps_per_second": 12.721,
559
+ "step": 38828
560
+ },
561
+ {
562
+ "epoch": 1.0044297929329349,
563
+ "grad_norm": 5.212080955505371,
564
+ "learning_rate": 6.85945554754301e-06,
565
+ "loss": 1.7907,
566
+ "step": 39000
567
+ },
568
+ {
569
+ "epoch": 1.0173070979705368,
570
+ "grad_norm": 3.8510255813598633,
571
+ "learning_rate": 6.857652724837746e-06,
572
+ "loss": 1.8011,
573
+ "step": 39500
574
+ },
575
+ {
576
+ "epoch": 1.0301844030081384,
577
+ "grad_norm": 6.683523654937744,
578
+ "learning_rate": 6.8558499021324815e-06,
579
+ "loss": 1.7797,
580
+ "step": 40000
581
+ },
582
+ {
583
+ "epoch": 1.0430617080457403,
584
+ "grad_norm": 5.834150791168213,
585
+ "learning_rate": 6.854047079427217e-06,
586
+ "loss": 1.7872,
587
+ "step": 40500
588
+ },
589
+ {
590
+ "epoch": 1.055939013083342,
591
+ "grad_norm": 6.447351932525635,
592
+ "learning_rate": 6.852244256721953e-06,
593
+ "loss": 1.7769,
594
+ "step": 41000
595
+ },
596
+ {
597
+ "epoch": 1.0688163181209436,
598
+ "grad_norm": 7.13225793838501,
599
+ "learning_rate": 6.8504450396620995e-06,
600
+ "loss": 1.7762,
601
+ "step": 41500
602
+ },
603
+ {
604
+ "epoch": 1.0816936231585454,
605
+ "grad_norm": 4.302614212036133,
606
+ "learning_rate": 6.848642216956835e-06,
607
+ "loss": 1.7885,
608
+ "step": 42000
609
+ },
610
+ {
611
+ "epoch": 1.094570928196147,
612
+ "grad_norm": 4.33312463760376,
613
+ "learning_rate": 6.84683939425157e-06,
614
+ "loss": 1.7955,
615
+ "step": 42500
616
+ },
617
+ {
618
+ "epoch": 1.1074482332337487,
619
+ "grad_norm": 4.740920066833496,
620
+ "learning_rate": 6.845036571546307e-06,
621
+ "loss": 1.7874,
622
+ "step": 43000
623
+ },
624
+ {
625
+ "epoch": 1.1203255382713506,
626
+ "grad_norm": 3.6999611854553223,
627
+ "learning_rate": 6.843233748841043e-06,
628
+ "loss": 1.7879,
629
+ "step": 43500
630
+ },
631
+ {
632
+ "epoch": 1.1332028433089523,
633
+ "grad_norm": 4.698836803436279,
634
+ "learning_rate": 6.8414309261357785e-06,
635
+ "loss": 1.7902,
636
+ "step": 44000
637
+ },
638
+ {
639
+ "epoch": 1.1460801483465541,
640
+ "grad_norm": 4.617445945739746,
641
+ "learning_rate": 6.839631709075924e-06,
642
+ "loss": 1.7941,
643
+ "step": 44500
644
+ },
645
+ {
646
+ "epoch": 1.1589574533841558,
647
+ "grad_norm": 4.726156234741211,
648
+ "learning_rate": 6.83782888637066e-06,
649
+ "loss": 1.8009,
650
+ "step": 45000
651
+ },
652
+ {
653
+ "epoch": 1.1718347584217574,
654
+ "grad_norm": 7.927763938903809,
655
+ "learning_rate": 6.8360260636653965e-06,
656
+ "loss": 1.7896,
657
+ "step": 45500
658
+ },
659
+ {
660
+ "epoch": 1.1847120634593593,
661
+ "grad_norm": 3.4403719902038574,
662
+ "learning_rate": 6.834223240960132e-06,
663
+ "loss": 1.796,
664
+ "step": 46000
665
+ },
666
+ {
667
+ "epoch": 1.197589368496961,
668
+ "grad_norm": 4.452911376953125,
669
+ "learning_rate": 6.832420418254867e-06,
670
+ "loss": 1.7789,
671
+ "step": 46500
672
+ },
673
+ {
674
+ "epoch": 1.2104666735345626,
675
+ "grad_norm": 6.698709487915039,
676
+ "learning_rate": 6.830621201195014e-06,
677
+ "loss": 1.7847,
678
+ "step": 47000
679
+ },
680
+ {
681
+ "epoch": 1.2233439785721645,
682
+ "grad_norm": 6.807077884674072,
683
+ "learning_rate": 6.8288183784897495e-06,
684
+ "loss": 1.7809,
685
+ "step": 47500
686
+ },
687
+ {
688
+ "epoch": 1.2362212836097661,
689
+ "grad_norm": 5.895312309265137,
690
+ "learning_rate": 6.827015555784485e-06,
691
+ "loss": 1.7701,
692
+ "step": 48000
693
+ },
694
+ {
695
+ "epoch": 1.2490985886473678,
696
+ "grad_norm": 4.329184055328369,
697
+ "learning_rate": 6.825212733079221e-06,
698
+ "loss": 1.7972,
699
+ "step": 48500
700
+ },
701
+ {
702
+ "epoch": 1.2619758936849697,
703
+ "grad_norm": 7.711607456207275,
704
+ "learning_rate": 6.8234135160193675e-06,
705
+ "loss": 1.7655,
706
+ "step": 49000
707
+ },
708
+ {
709
+ "epoch": 1.2748531987225713,
710
+ "grad_norm": 7.793875694274902,
711
+ "learning_rate": 6.8216106933141025e-06,
712
+ "loss": 1.7942,
713
+ "step": 49500
714
+ },
715
+ {
716
+ "epoch": 1.287730503760173,
717
+ "grad_norm": 5.680514812469482,
718
+ "learning_rate": 6.819807870608838e-06,
719
+ "loss": 1.7883,
720
+ "step": 50000
721
+ },
722
+ {
723
+ "epoch": 1.3006078087977748,
724
+ "grad_norm": 5.679476261138916,
725
+ "learning_rate": 6.818005047903575e-06,
726
+ "loss": 1.7771,
727
+ "step": 50500
728
+ },
729
+ {
730
+ "epoch": 1.3134851138353765,
731
+ "grad_norm": 7.666828632354736,
732
+ "learning_rate": 6.816202225198311e-06,
733
+ "loss": 1.7921,
734
+ "step": 51000
735
+ },
736
+ {
737
+ "epoch": 1.3263624188729783,
738
+ "grad_norm": 6.097265243530273,
739
+ "learning_rate": 6.814403008138456e-06,
740
+ "loss": 1.7928,
741
+ "step": 51500
742
+ },
743
+ {
744
+ "epoch": 1.33923972391058,
745
+ "grad_norm": 4.842618942260742,
746
+ "learning_rate": 6.812600185433192e-06,
747
+ "loss": 1.783,
748
+ "step": 52000
749
+ },
750
+ {
751
+ "epoch": 1.3521170289481816,
752
+ "grad_norm": 3.718636989593506,
753
+ "learning_rate": 6.810797362727928e-06,
754
+ "loss": 1.7889,
755
+ "step": 52500
756
+ },
757
+ {
758
+ "epoch": 1.3649943339857835,
759
+ "grad_norm": 5.283341884613037,
760
+ "learning_rate": 6.8089945400226645e-06,
761
+ "loss": 1.7918,
762
+ "step": 53000
763
+ },
764
+ {
765
+ "epoch": 1.3778716390233852,
766
+ "grad_norm": 5.642731189727783,
767
+ "learning_rate": 6.80719532296281e-06,
768
+ "loss": 1.7918,
769
+ "step": 53500
770
+ },
771
+ {
772
+ "epoch": 1.390748944060987,
773
+ "grad_norm": 8.50252914428711,
774
+ "learning_rate": 6.805392500257546e-06,
775
+ "loss": 1.7879,
776
+ "step": 54000
777
+ },
778
+ {
779
+ "epoch": 1.4036262490985887,
780
+ "grad_norm": 7.05935525894165,
781
+ "learning_rate": 6.803589677552282e-06,
782
+ "loss": 1.7857,
783
+ "step": 54500
784
+ },
785
+ {
786
+ "epoch": 1.4165035541361903,
787
+ "grad_norm": 4.355367183685303,
788
+ "learning_rate": 6.8017868548470175e-06,
789
+ "loss": 1.7781,
790
+ "step": 55000
791
+ },
792
+ {
793
+ "epoch": 1.4293808591737922,
794
+ "grad_norm": 3.4202096462249756,
795
+ "learning_rate": 6.799984032141753e-06,
796
+ "loss": 1.7979,
797
+ "step": 55500
798
+ },
799
+ {
800
+ "epoch": 1.4422581642113939,
801
+ "grad_norm": 4.578690052032471,
802
+ "learning_rate": 6.7981848150819e-06,
803
+ "loss": 1.7862,
804
+ "step": 56000
805
+ },
806
+ {
807
+ "epoch": 1.4551354692489955,
808
+ "grad_norm": 6.677925109863281,
809
+ "learning_rate": 6.7963819923766355e-06,
810
+ "loss": 1.7747,
811
+ "step": 56500
812
+ },
813
+ {
814
+ "epoch": 1.4680127742865974,
815
+ "grad_norm": 8.975627899169922,
816
+ "learning_rate": 6.7945791696713704e-06,
817
+ "loss": 1.8019,
818
+ "step": 57000
819
+ },
820
+ {
821
+ "epoch": 1.480890079324199,
822
+ "grad_norm": 4.4174275398254395,
823
+ "learning_rate": 6.792776346966106e-06,
824
+ "loss": 1.8029,
825
+ "step": 57500
826
+ },
827
+ {
828
+ "epoch": 1.4937673843618007,
829
+ "grad_norm": 5.21982479095459,
830
+ "learning_rate": 6.790973524260843e-06,
831
+ "loss": 1.788,
832
+ "step": 58000
833
+ },
834
+ {
835
+ "epoch": 1.5066446893994025,
836
+ "grad_norm": 13.477604866027832,
837
+ "learning_rate": 6.789170701555579e-06,
838
+ "loss": 1.781,
839
+ "step": 58500
840
+ },
841
+ {
842
+ "epoch": 1.5195219944370042,
843
+ "grad_norm": 6.470083236694336,
844
+ "learning_rate": 6.787371484495724e-06,
845
+ "loss": 1.7786,
846
+ "step": 59000
847
+ },
848
+ {
849
+ "epoch": 1.5323992994746058,
850
+ "grad_norm": 6.517702102661133,
851
+ "learning_rate": 6.78556866179046e-06,
852
+ "loss": 1.7833,
853
+ "step": 59500
854
+ },
855
+ {
856
+ "epoch": 1.5452766045122077,
857
+ "grad_norm": 5.956171035766602,
858
+ "learning_rate": 6.783765839085196e-06,
859
+ "loss": 1.7729,
860
+ "step": 60000
861
+ },
862
+ {
863
+ "epoch": 1.5581539095498094,
864
+ "grad_norm": 6.486398696899414,
865
+ "learning_rate": 6.7819630163799325e-06,
866
+ "loss": 1.7973,
867
+ "step": 60500
868
+ },
869
+ {
870
+ "epoch": 1.571031214587411,
871
+ "grad_norm": 4.920289993286133,
872
+ "learning_rate": 6.7801601936746675e-06,
873
+ "loss": 1.781,
874
+ "step": 61000
875
+ },
876
+ {
877
+ "epoch": 1.583908519625013,
878
+ "grad_norm": 5.362635612487793,
879
+ "learning_rate": 6.778357370969403e-06,
880
+ "loss": 1.7889,
881
+ "step": 61500
882
+ },
883
+ {
884
+ "epoch": 1.5967858246626148,
885
+ "grad_norm": 6.79608678817749,
886
+ "learning_rate": 6.776554548264139e-06,
887
+ "loss": 1.7926,
888
+ "step": 62000
889
+ },
890
+ {
891
+ "epoch": 1.6096631297002162,
892
+ "grad_norm": 4.749091625213623,
893
+ "learning_rate": 6.774751725558875e-06,
894
+ "loss": 1.7783,
895
+ "step": 62500
896
+ },
897
+ {
898
+ "epoch": 1.622540434737818,
899
+ "grad_norm": 7.530384540557861,
900
+ "learning_rate": 6.772956114144432e-06,
901
+ "loss": 1.7833,
902
+ "step": 63000
903
+ },
904
+ {
905
+ "epoch": 1.63541773977542,
906
+ "grad_norm": 6.624202251434326,
907
+ "learning_rate": 6.771153291439168e-06,
908
+ "loss": 1.7739,
909
+ "step": 63500
910
+ },
911
+ {
912
+ "epoch": 1.6482950448130216,
913
+ "grad_norm": 3.6536526679992676,
914
+ "learning_rate": 6.769350468733903e-06,
915
+ "loss": 1.7856,
916
+ "step": 64000
917
+ },
918
+ {
919
+ "epoch": 1.6611723498506232,
920
+ "grad_norm": 6.610978603363037,
921
+ "learning_rate": 6.7675476460286384e-06,
922
+ "loss": 1.7854,
923
+ "step": 64500
924
+ },
925
+ {
926
+ "epoch": 1.674049654888225,
927
+ "grad_norm": 5.544578552246094,
928
+ "learning_rate": 6.765744823323375e-06,
929
+ "loss": 1.774,
930
+ "step": 65000
931
+ },
932
+ {
933
+ "epoch": 1.6869269599258268,
934
+ "grad_norm": 7.2254133224487305,
935
+ "learning_rate": 6.763945606263521e-06,
936
+ "loss": 1.7946,
937
+ "step": 65500
938
+ },
939
+ {
940
+ "epoch": 1.6998042649634284,
941
+ "grad_norm": 4.5243730545043945,
942
+ "learning_rate": 6.7621427835582564e-06,
943
+ "loss": 1.8038,
944
+ "step": 66000
945
+ },
946
+ {
947
+ "epoch": 1.7126815700010303,
948
+ "grad_norm": 5.477443218231201,
949
+ "learning_rate": 6.760343566498404e-06,
950
+ "loss": 1.8058,
951
+ "step": 66500
952
+ },
953
+ {
954
+ "epoch": 1.725558875038632,
955
+ "grad_norm": 5.0484771728515625,
956
+ "learning_rate": 6.758540743793139e-06,
957
+ "loss": 1.7836,
958
+ "step": 67000
959
+ },
960
+ {
961
+ "epoch": 1.7384361800762336,
962
+ "grad_norm": 3.814950466156006,
963
+ "learning_rate": 6.7567379210878744e-06,
964
+ "loss": 1.7774,
965
+ "step": 67500
966
+ },
967
+ {
968
+ "epoch": 1.7513134851138354,
969
+ "grad_norm": 4.360231399536133,
970
+ "learning_rate": 6.75493509838261e-06,
971
+ "loss": 1.7959,
972
+ "step": 68000
973
+ },
974
+ {
975
+ "epoch": 1.764190790151437,
976
+ "grad_norm": 4.7067646980285645,
977
+ "learning_rate": 6.753132275677346e-06,
978
+ "loss": 1.7942,
979
+ "step": 68500
980
+ },
981
+ {
982
+ "epoch": 1.7770680951890387,
983
+ "grad_norm": 4.299397945404053,
984
+ "learning_rate": 6.751329452972082e-06,
985
+ "loss": 1.7847,
986
+ "step": 69000
987
+ },
988
+ {
989
+ "epoch": 1.7899454002266406,
990
+ "grad_norm": 4.759008407592773,
991
+ "learning_rate": 6.749526630266818e-06,
992
+ "loss": 1.7788,
993
+ "step": 69500
994
+ },
995
+ {
996
+ "epoch": 1.8028227052642423,
997
+ "grad_norm": 5.2251787185668945,
998
+ "learning_rate": 6.7477238075615535e-06,
999
+ "loss": 1.7998,
1000
+ "step": 70000
1001
+ },
1002
+ {
1003
+ "epoch": 1.815700010301844,
1004
+ "grad_norm": 6.724719524383545,
1005
+ "learning_rate": 6.745920984856289e-06,
1006
+ "loss": 1.7823,
1007
+ "step": 70500
1008
+ },
1009
+ {
1010
+ "epoch": 1.8285773153394458,
1011
+ "grad_norm": 4.814666271209717,
1012
+ "learning_rate": 6.744121767796436e-06,
1013
+ "loss": 1.7828,
1014
+ "step": 71000
1015
+ },
1016
+ {
1017
+ "epoch": 1.8414546203770477,
1018
+ "grad_norm": 4.108809947967529,
1019
+ "learning_rate": 6.742318945091171e-06,
1020
+ "loss": 1.7811,
1021
+ "step": 71500
1022
+ },
1023
+ {
1024
+ "epoch": 1.854331925414649,
1025
+ "grad_norm": 5.329784393310547,
1026
+ "learning_rate": 6.7405161223859064e-06,
1027
+ "loss": 1.7808,
1028
+ "step": 72000
1029
+ },
1030
+ {
1031
+ "epoch": 1.867209230452251,
1032
+ "grad_norm": 5.064823150634766,
1033
+ "learning_rate": 6.738713299680643e-06,
1034
+ "loss": 1.779,
1035
+ "step": 72500
1036
+ },
1037
+ {
1038
+ "epoch": 1.8800865354898528,
1039
+ "grad_norm": 4.863822937011719,
1040
+ "learning_rate": 6.736914082620789e-06,
1041
+ "loss": 1.7885,
1042
+ "step": 73000
1043
+ },
1044
+ {
1045
+ "epoch": 1.8929638405274543,
1046
+ "grad_norm": 8.565622329711914,
1047
+ "learning_rate": 6.7351112599155244e-06,
1048
+ "loss": 1.7841,
1049
+ "step": 73500
1050
+ },
1051
+ {
1052
+ "epoch": 1.9058411455650561,
1053
+ "grad_norm": 5.2519145011901855,
1054
+ "learning_rate": 6.73330843721026e-06,
1055
+ "loss": 1.7882,
1056
+ "step": 74000
1057
+ },
1058
+ {
1059
+ "epoch": 1.918718450602658,
1060
+ "grad_norm": 8.66575813293457,
1061
+ "learning_rate": 6.731505614504996e-06,
1062
+ "loss": 1.771,
1063
+ "step": 74500
1064
+ },
1065
+ {
1066
+ "epoch": 1.9315957556402596,
1067
+ "grad_norm": 9.521093368530273,
1068
+ "learning_rate": 6.729702791799733e-06,
1069
+ "loss": 1.7866,
1070
+ "step": 75000
1071
+ },
1072
+ {
1073
+ "epoch": 1.9444730606778613,
1074
+ "grad_norm": 4.49517297744751,
1075
+ "learning_rate": 6.727899969094468e-06,
1076
+ "loss": 1.7867,
1077
+ "step": 75500
1078
+ },
1079
+ {
1080
+ "epoch": 1.9573503657154632,
1081
+ "grad_norm": 7.0434699058532715,
1082
+ "learning_rate": 6.726100752034614e-06,
1083
+ "loss": 1.7806,
1084
+ "step": 76000
1085
+ },
1086
+ {
1087
+ "epoch": 1.9702276707530648,
1088
+ "grad_norm": 4.631639003753662,
1089
+ "learning_rate": 6.72429792932935e-06,
1090
+ "loss": 1.7965,
1091
+ "step": 76500
1092
+ },
1093
+ {
1094
+ "epoch": 1.9831049757906665,
1095
+ "grad_norm": 6.64503288269043,
1096
+ "learning_rate": 6.722495106624085e-06,
1097
+ "loss": 1.7866,
1098
+ "step": 77000
1099
+ },
1100
+ {
1101
+ "epoch": 1.9959822808282683,
1102
+ "grad_norm": 6.729311943054199,
1103
+ "learning_rate": 6.7206922839188215e-06,
1104
+ "loss": 1.7776,
1105
+ "step": 77500
1106
+ },
1107
+ {
1108
+ "epoch": 2.0,
1109
+ "eval_accuracy": 0.2799319579535046,
1110
+ "eval_f1_macro": 0.14207620906161436,
1111
+ "eval_f1_micro": 0.2799319579535046,
1112
+ "eval_loss": 1.7851710319519043,
1113
+ "eval_runtime": 223.7334,
1114
+ "eval_samples_per_second": 102.475,
1115
+ "eval_steps_per_second": 12.81,
1116
+ "step": 77656
1117
+ },
1118
+ {
1119
+ "epoch": 2.0088595858658698,
1120
+ "grad_norm": 5.343408584594727,
1121
+ "learning_rate": 6.718893066858968e-06,
1122
+ "loss": 1.7612,
1123
+ "step": 78000
1124
+ },
1125
+ {
1126
+ "epoch": 2.0217368909034716,
1127
+ "grad_norm": 7.235130786895752,
1128
+ "learning_rate": 6.717090244153703e-06,
1129
+ "loss": 1.7718,
1130
+ "step": 78500
1131
+ },
1132
+ {
1133
+ "epoch": 2.0346141959410735,
1134
+ "grad_norm": 3.554536819458008,
1135
+ "learning_rate": 6.715287421448439e-06,
1136
+ "loss": 1.7887,
1137
+ "step": 79000
1138
+ },
1139
+ {
1140
+ "epoch": 2.0474915009786754,
1141
+ "grad_norm": 6.864304542541504,
1142
+ "learning_rate": 6.7134845987431744e-06,
1143
+ "loss": 1.7795,
1144
+ "step": 79500
1145
+ },
1146
+ {
1147
+ "epoch": 2.060368806016277,
1148
+ "grad_norm": 4.944849967956543,
1149
+ "learning_rate": 6.711681776037911e-06,
1150
+ "loss": 1.7826,
1151
+ "step": 80000
1152
+ },
1153
+ {
1154
+ "epoch": 2.0732461110538787,
1155
+ "grad_norm": 4.683295726776123,
1156
+ "learning_rate": 6.709878953332647e-06,
1157
+ "loss": 1.7711,
1158
+ "step": 80500
1159
+ },
1160
+ {
1161
+ "epoch": 2.0861234160914806,
1162
+ "grad_norm": 5.293025493621826,
1163
+ "learning_rate": 6.708076130627383e-06,
1164
+ "loss": 1.7863,
1165
+ "step": 81000
1166
+ },
1167
+ {
1168
+ "epoch": 2.099000721129082,
1169
+ "grad_norm": 3.2971994876861572,
1170
+ "learning_rate": 6.706273307922118e-06,
1171
+ "loss": 1.7759,
1172
+ "step": 81500
1173
+ },
1174
+ {
1175
+ "epoch": 2.111878026166684,
1176
+ "grad_norm": 4.406072616577148,
1177
+ "learning_rate": 6.704474090862264e-06,
1178
+ "loss": 1.7828,
1179
+ "step": 82000
1180
+ },
1181
+ {
1182
+ "epoch": 2.1247553312042857,
1183
+ "grad_norm": 5.096445083618164,
1184
+ "learning_rate": 6.7026748738024104e-06,
1185
+ "loss": 1.7764,
1186
+ "step": 82500
1187
+ },
1188
+ {
1189
+ "epoch": 2.137632636241887,
1190
+ "grad_norm": 5.056951522827148,
1191
+ "learning_rate": 6.700872051097146e-06,
1192
+ "loss": 1.7884,
1193
+ "step": 83000
1194
+ },
1195
+ {
1196
+ "epoch": 2.150509941279489,
1197
+ "grad_norm": 5.062712669372559,
1198
+ "learning_rate": 6.699069228391882e-06,
1199
+ "loss": 1.7772,
1200
+ "step": 83500
1201
+ },
1202
+ {
1203
+ "epoch": 2.163387246317091,
1204
+ "grad_norm": 5.221599578857422,
1205
+ "learning_rate": 6.697266405686618e-06,
1206
+ "loss": 1.7845,
1207
+ "step": 84000
1208
+ },
1209
+ {
1210
+ "epoch": 2.1762645513546923,
1211
+ "grad_norm": 6.651246547698975,
1212
+ "learning_rate": 6.695463582981354e-06,
1213
+ "loss": 1.7873,
1214
+ "step": 84500
1215
+ },
1216
+ {
1217
+ "epoch": 2.189141856392294,
1218
+ "grad_norm": 3.7498083114624023,
1219
+ "learning_rate": 6.6936607602760895e-06,
1220
+ "loss": 1.7729,
1221
+ "step": 85000
1222
+ },
1223
+ {
1224
+ "epoch": 2.202019161429896,
1225
+ "grad_norm": 3.9556636810302734,
1226
+ "learning_rate": 6.691857937570825e-06,
1227
+ "loss": 1.7914,
1228
+ "step": 85500
1229
+ },
1230
+ {
1231
+ "epoch": 2.2148964664674975,
1232
+ "grad_norm": 4.400400638580322,
1233
+ "learning_rate": 6.690058720510971e-06,
1234
+ "loss": 1.7878,
1235
+ "step": 86000
1236
+ },
1237
+ {
1238
+ "epoch": 2.2277737715050994,
1239
+ "grad_norm": 5.495286464691162,
1240
+ "learning_rate": 6.688255897805707e-06,
1241
+ "loss": 1.7894,
1242
+ "step": 86500
1243
+ },
1244
+ {
1245
+ "epoch": 2.2406510765427012,
1246
+ "grad_norm": 5.062817573547363,
1247
+ "learning_rate": 6.686453075100443e-06,
1248
+ "loss": 1.78,
1249
+ "step": 87000
1250
+ },
1251
+ {
1252
+ "epoch": 2.2535283815803027,
1253
+ "grad_norm": 7.037624835968018,
1254
+ "learning_rate": 6.684650252395179e-06,
1255
+ "loss": 1.7799,
1256
+ "step": 87500
1257
+ },
1258
+ {
1259
+ "epoch": 2.2664056866179045,
1260
+ "grad_norm": 5.7241291999816895,
1261
+ "learning_rate": 6.682851035335325e-06,
1262
+ "loss": 1.7864,
1263
+ "step": 88000
1264
+ },
1265
+ {
1266
+ "epoch": 2.2792829916555064,
1267
+ "grad_norm": 6.502715587615967,
1268
+ "learning_rate": 6.68104821263006e-06,
1269
+ "loss": 1.7818,
1270
+ "step": 88500
1271
+ },
1272
+ {
1273
+ "epoch": 2.2921602966931083,
1274
+ "grad_norm": 9.763015747070312,
1275
+ "learning_rate": 6.679245389924796e-06,
1276
+ "loss": 1.7673,
1277
+ "step": 89000
1278
+ },
1279
+ {
1280
+ "epoch": 2.3050376017307097,
1281
+ "grad_norm": 6.935070991516113,
1282
+ "learning_rate": 6.677442567219532e-06,
1283
+ "loss": 1.7659,
1284
+ "step": 89500
1285
+ },
1286
+ {
1287
+ "epoch": 2.3179149067683116,
1288
+ "grad_norm": 9.335469245910645,
1289
+ "learning_rate": 6.675639744514268e-06,
1290
+ "loss": 1.7795,
1291
+ "step": 90000
1292
+ },
1293
+ {
1294
+ "epoch": 2.3307922118059134,
1295
+ "grad_norm": 7.891449928283691,
1296
+ "learning_rate": 6.673836921809004e-06,
1297
+ "loss": 1.778,
1298
+ "step": 90500
1299
+ },
1300
+ {
1301
+ "epoch": 2.343669516843515,
1302
+ "grad_norm": 4.85399055480957,
1303
+ "learning_rate": 6.67203770474915e-06,
1304
+ "loss": 1.782,
1305
+ "step": 91000
1306
+ },
1307
+ {
1308
+ "epoch": 2.3565468218811167,
1309
+ "grad_norm": 6.049405574798584,
1310
+ "learning_rate": 6.670234882043885e-06,
1311
+ "loss": 1.7915,
1312
+ "step": 91500
1313
+ },
1314
+ {
1315
+ "epoch": 2.3694241269187186,
1316
+ "grad_norm": 8.361068725585938,
1317
+ "learning_rate": 6.668432059338622e-06,
1318
+ "loss": 1.774,
1319
+ "step": 92000
1320
+ },
1321
+ {
1322
+ "epoch": 2.38230143195632,
1323
+ "grad_norm": 7.679973602294922,
1324
+ "learning_rate": 6.6666292366333575e-06,
1325
+ "loss": 1.7758,
1326
+ "step": 92500
1327
+ },
1328
+ {
1329
+ "epoch": 2.395178736993922,
1330
+ "grad_norm": 7.885756492614746,
1331
+ "learning_rate": 6.664830019573503e-06,
1332
+ "loss": 1.7808,
1333
+ "step": 93000
1334
+ },
1335
+ {
1336
+ "epoch": 2.408056042031524,
1337
+ "grad_norm": 5.197179794311523,
1338
+ "learning_rate": 6.663027196868239e-06,
1339
+ "loss": 1.7829,
1340
+ "step": 93500
1341
+ },
1342
+ {
1343
+ "epoch": 2.420933347069125,
1344
+ "grad_norm": 5.253746509552002,
1345
+ "learning_rate": 6.661224374162975e-06,
1346
+ "loss": 1.774,
1347
+ "step": 94000
1348
+ },
1349
+ {
1350
+ "epoch": 2.433810652106727,
1351
+ "grad_norm": 6.105677604675293,
1352
+ "learning_rate": 6.659421551457711e-06,
1353
+ "loss": 1.7851,
1354
+ "step": 94500
1355
+ },
1356
+ {
1357
+ "epoch": 2.446687957144329,
1358
+ "grad_norm": 4.1233015060424805,
1359
+ "learning_rate": 6.657618728752447e-06,
1360
+ "loss": 1.77,
1361
+ "step": 95000
1362
+ },
1363
+ {
1364
+ "epoch": 2.4595652621819304,
1365
+ "grad_norm": 5.5101728439331055,
1366
+ "learning_rate": 6.655819511692593e-06,
1367
+ "loss": 1.7782,
1368
+ "step": 95500
1369
+ },
1370
+ {
1371
+ "epoch": 2.4724425672195323,
1372
+ "grad_norm": 6.412257194519043,
1373
+ "learning_rate": 6.654016688987328e-06,
1374
+ "loss": 1.7782,
1375
+ "step": 96000
1376
+ },
1377
+ {
1378
+ "epoch": 2.485319872257134,
1379
+ "grad_norm": 7.24741268157959,
1380
+ "learning_rate": 6.652213866282064e-06,
1381
+ "loss": 1.7836,
1382
+ "step": 96500
1383
+ },
1384
+ {
1385
+ "epoch": 2.4981971772947356,
1386
+ "grad_norm": 4.249999046325684,
1387
+ "learning_rate": 6.650411043576801e-06,
1388
+ "loss": 1.7816,
1389
+ "step": 97000
1390
+ },
1391
+ {
1392
+ "epoch": 2.5110744823323374,
1393
+ "grad_norm": 6.146023750305176,
1394
+ "learning_rate": 6.648608220871536e-06,
1395
+ "loss": 1.7716,
1396
+ "step": 97500
1397
+ },
1398
+ {
1399
+ "epoch": 2.5239517873699393,
1400
+ "grad_norm": 5.631804466247559,
1401
+ "learning_rate": 6.646809003811682e-06,
1402
+ "loss": 1.7912,
1403
+ "step": 98000
1404
+ },
1405
+ {
1406
+ "epoch": 2.536829092407541,
1407
+ "grad_norm": 6.987208843231201,
1408
+ "learning_rate": 6.645006181106418e-06,
1409
+ "loss": 1.7704,
1410
+ "step": 98500
1411
+ },
1412
+ {
1413
+ "epoch": 2.5497063974451426,
1414
+ "grad_norm": 7.857710838317871,
1415
+ "learning_rate": 6.643203358401153e-06,
1416
+ "loss": 1.7756,
1417
+ "step": 99000
1418
+ },
1419
+ {
1420
+ "epoch": 2.5625837024827445,
1421
+ "grad_norm": 6.447986602783203,
1422
+ "learning_rate": 6.64140053569589e-06,
1423
+ "loss": 1.7808,
1424
+ "step": 99500
1425
+ },
1426
+ {
1427
+ "epoch": 2.575461007520346,
1428
+ "grad_norm": 7.811335563659668,
1429
+ "learning_rate": 6.6395977129906255e-06,
1430
+ "loss": 1.7759,
1431
+ "step": 100000
1432
+ },
1433
+ {
1434
+ "epoch": 2.5883383125579478,
1435
+ "grad_norm": 5.068165302276611,
1436
+ "learning_rate": 6.637794890285361e-06,
1437
+ "loss": 1.7931,
1438
+ "step": 100500
1439
+ },
1440
+ {
1441
+ "epoch": 2.6012156175955496,
1442
+ "grad_norm": 7.218021392822266,
1443
+ "learning_rate": 6.635992067580097e-06,
1444
+ "loss": 1.7843,
1445
+ "step": 101000
1446
+ },
1447
+ {
1448
+ "epoch": 2.6140929226331515,
1449
+ "grad_norm": 6.719173908233643,
1450
+ "learning_rate": 6.634192850520243e-06,
1451
+ "loss": 1.7702,
1452
+ "step": 101500
1453
+ },
1454
+ {
1455
+ "epoch": 2.626970227670753,
1456
+ "grad_norm": 5.827301025390625,
1457
+ "learning_rate": 6.632390027814979e-06,
1458
+ "loss": 1.7702,
1459
+ "step": 102000
1460
+ },
1461
+ {
1462
+ "epoch": 2.639847532708355,
1463
+ "grad_norm": 3.2344276905059814,
1464
+ "learning_rate": 6.630587205109715e-06,
1465
+ "loss": 1.7823,
1466
+ "step": 102500
1467
+ },
1468
+ {
1469
+ "epoch": 2.6527248377459567,
1470
+ "grad_norm": 3.8477556705474854,
1471
+ "learning_rate": 6.628784382404451e-06,
1472
+ "loss": 1.7838,
1473
+ "step": 103000
1474
+ },
1475
+ {
1476
+ "epoch": 2.665602142783558,
1477
+ "grad_norm": 5.570420265197754,
1478
+ "learning_rate": 6.626981559699186e-06,
1479
+ "loss": 1.7745,
1480
+ "step": 103500
1481
+ },
1482
+ {
1483
+ "epoch": 2.67847944782116,
1484
+ "grad_norm": 4.79215669631958,
1485
+ "learning_rate": 6.625182342639332e-06,
1486
+ "loss": 1.7727,
1487
+ "step": 104000
1488
+ },
1489
+ {
1490
+ "epoch": 2.691356752858762,
1491
+ "grad_norm": 5.203454494476318,
1492
+ "learning_rate": 6.623379519934068e-06,
1493
+ "loss": 1.7863,
1494
+ "step": 104500
1495
+ },
1496
+ {
1497
+ "epoch": 2.7042340578963633,
1498
+ "grad_norm": 6.587026119232178,
1499
+ "learning_rate": 6.621576697228804e-06,
1500
+ "loss": 1.7687,
1501
+ "step": 105000
1502
+ },
1503
+ {
1504
+ "epoch": 2.717111362933965,
1505
+ "grad_norm": 5.685227394104004,
1506
+ "learning_rate": 6.61977387452354e-06,
1507
+ "loss": 1.7764,
1508
+ "step": 105500
1509
+ },
1510
+ {
1511
+ "epoch": 2.729988667971567,
1512
+ "grad_norm": 5.212521076202393,
1513
+ "learning_rate": 6.6179710518182755e-06,
1514
+ "loss": 1.782,
1515
+ "step": 106000
1516
+ },
1517
+ {
1518
+ "epoch": 2.7428659730091685,
1519
+ "grad_norm": 6.606338024139404,
1520
+ "learning_rate": 6.616171834758422e-06,
1521
+ "loss": 1.7778,
1522
+ "step": 106500
1523
+ },
1524
+ {
1525
+ "epoch": 2.7557432780467703,
1526
+ "grad_norm": 5.185567378997803,
1527
+ "learning_rate": 6.614369012053158e-06,
1528
+ "loss": 1.7819,
1529
+ "step": 107000
1530
+ },
1531
+ {
1532
+ "epoch": 2.768620583084372,
1533
+ "grad_norm": 4.298264503479004,
1534
+ "learning_rate": 6.6125661893478934e-06,
1535
+ "loss": 1.7857,
1536
+ "step": 107500
1537
+ },
1538
+ {
1539
+ "epoch": 2.781497888121974,
1540
+ "grad_norm": 4.79362154006958,
1541
+ "learning_rate": 6.610763366642629e-06,
1542
+ "loss": 1.7912,
1543
+ "step": 108000
1544
+ },
1545
+ {
1546
+ "epoch": 2.7943751931595755,
1547
+ "grad_norm": 4.650264739990234,
1548
+ "learning_rate": 6.608960543937365e-06,
1549
+ "loss": 1.7815,
1550
+ "step": 108500
1551
+ },
1552
+ {
1553
+ "epoch": 2.8072524981971774,
1554
+ "grad_norm": 5.515447616577148,
1555
+ "learning_rate": 6.607161326877511e-06,
1556
+ "loss": 1.7736,
1557
+ "step": 109000
1558
+ },
1559
+ {
1560
+ "epoch": 2.820129803234779,
1561
+ "grad_norm": 4.840721130371094,
1562
+ "learning_rate": 6.605358504172247e-06,
1563
+ "loss": 1.7843,
1564
+ "step": 109500
1565
+ },
1566
+ {
1567
+ "epoch": 2.8330071082723807,
1568
+ "grad_norm": 5.676022052764893,
1569
+ "learning_rate": 6.603555681466983e-06,
1570
+ "loss": 1.7732,
1571
+ "step": 110000
1572
+ },
1573
+ {
1574
+ "epoch": 2.8458844133099825,
1575
+ "grad_norm": 6.819264888763428,
1576
+ "learning_rate": 6.601752858761718e-06,
1577
+ "loss": 1.7786,
1578
+ "step": 110500
1579
+ },
1580
+ {
1581
+ "epoch": 2.8587617183475844,
1582
+ "grad_norm": 5.747689723968506,
1583
+ "learning_rate": 6.599953641701864e-06,
1584
+ "loss": 1.7845,
1585
+ "step": 111000
1586
+ },
1587
+ {
1588
+ "epoch": 2.871639023385186,
1589
+ "grad_norm": 6.419643402099609,
1590
+ "learning_rate": 6.5981508189966e-06,
1591
+ "loss": 1.7745,
1592
+ "step": 111500
1593
+ },
1594
+ {
1595
+ "epoch": 2.8845163284227877,
1596
+ "grad_norm": 5.376995086669922,
1597
+ "learning_rate": 6.596347996291336e-06,
1598
+ "loss": 1.7765,
1599
+ "step": 112000
1600
+ },
1601
+ {
1602
+ "epoch": 2.897393633460389,
1603
+ "grad_norm": 6.216523170471191,
1604
+ "learning_rate": 6.594545173586072e-06,
1605
+ "loss": 1.7718,
1606
+ "step": 112500
1607
+ },
1608
+ {
1609
+ "epoch": 2.910270938497991,
1610
+ "grad_norm": 4.209794998168945,
1611
+ "learning_rate": 6.592742350880808e-06,
1612
+ "loss": 1.7656,
1613
+ "step": 113000
1614
+ },
1615
+ {
1616
+ "epoch": 2.923148243535593,
1617
+ "grad_norm": 4.328191757202148,
1618
+ "learning_rate": 6.590943133820953e-06,
1619
+ "loss": 1.7868,
1620
+ "step": 113500
1621
+ },
1622
+ {
1623
+ "epoch": 2.9360255485731948,
1624
+ "grad_norm": 2.272550344467163,
1625
+ "learning_rate": 6.58914031111569e-06,
1626
+ "loss": 1.7769,
1627
+ "step": 114000
1628
+ },
1629
+ {
1630
+ "epoch": 2.948902853610796,
1631
+ "grad_norm": 4.638970375061035,
1632
+ "learning_rate": 6.587337488410426e-06,
1633
+ "loss": 1.7656,
1634
+ "step": 114500
1635
+ },
1636
+ {
1637
+ "epoch": 2.961780158648398,
1638
+ "grad_norm": 5.238265037536621,
1639
+ "learning_rate": 6.585538271350571e-06,
1640
+ "loss": 1.7628,
1641
+ "step": 115000
1642
+ },
1643
+ {
1644
+ "epoch": 2.974657463686,
1645
+ "grad_norm": 5.025523662567139,
1646
+ "learning_rate": 6.583735448645307e-06,
1647
+ "loss": 1.7875,
1648
+ "step": 115500
1649
+ },
1650
+ {
1651
+ "epoch": 2.9875347687236014,
1652
+ "grad_norm": 5.705627918243408,
1653
+ "learning_rate": 6.581932625940043e-06,
1654
+ "loss": 1.7735,
1655
+ "step": 116000
1656
+ },
1657
+ {
1658
+ "epoch": 3.0,
1659
+ "eval_accuracy": 0.28071705848998996,
1660
+ "eval_f1_macro": 0.15207874587429973,
1661
+ "eval_f1_micro": 0.28071705848998996,
1662
+ "eval_loss": 1.7819397449493408,
1663
+ "eval_runtime": 223.6939,
1664
+ "eval_samples_per_second": 102.493,
1665
+ "eval_steps_per_second": 12.812,
1666
+ "step": 116484
1667
+ }
1668
+ ],
1669
+ "logging_steps": 500,
1670
+ "max_steps": 1941400,
1671
+ "num_input_tokens_seen": 0,
1672
+ "num_train_epochs": 50,
1673
+ "save_steps": 500,
1674
+ "stateful_callbacks": {
1675
+ "EarlyStoppingCallback": {
1676
+ "args": {
1677
+ "early_stopping_patience": 3,
1678
+ "early_stopping_threshold": 0.0
1679
+ },
1680
+ "attributes": {
1681
+ "early_stopping_patience_counter": 0
1682
+ }
1683
+ },
1684
+ "TrainerControl": {
1685
+ "args": {
1686
+ "should_epoch_stop": false,
1687
+ "should_evaluate": false,
1688
+ "should_log": false,
1689
+ "should_save": true,
1690
+ "should_training_stop": false
1691
+ },
1692
+ "attributes": {}
1693
+ }
1694
+ },
1695
+ "total_flos": 4.870091010515927e+17,
1696
+ "train_batch_size": 8,
1697
+ "trial_name": null,
1698
+ "trial_params": null
1699
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b39f6e65c04bdafa26d7201f33e0101bcdc3fff3e3674275315697d828819750
3
+ size 5368