Dinh committed on
Commit
79031ed
·
verified ·
1 Parent(s): 6b40012

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ ocr.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- base_model: Qwen/Qwen2.5-VL-3B-Instruct
3
  library_name: peft
4
  ---
5
 
 
1
  ---
2
+ base_model: Qwen/Qwen2.5-VL-7B-Instruct
3
  library_name: peft
4
  ---
5
 
adapter_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
- "base_model_name_or_path": "Qwen/Qwen2.5-VL-3B-Instruct",
5
  "bias": "none",
6
  "corda_config": null,
7
  "eva_config": null,
@@ -24,118 +24,94 @@
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
27
- "layers.28.mlp.up_proj",
28
- "layers.24.mlp.up_proj",
29
- "layers.18.mlp.down_proj",
30
- "layers.21.mlp.down_proj",
31
- "layers.24.mlp.gate_proj",
 
32
  "layers.25.mlp.up_proj",
33
- "layers.28.mlp.gate_proj",
34
- "32.mlp.gate_proj",
35
- "layers.3.mlp.down_proj",
36
- "layers.14.mlp.gate_proj",
37
- "layers.30.mlp.gate_proj",
38
- "layers.22.mlp.up_proj",
39
- "32.mlp.up_proj",
40
- "layers.15.mlp.down_proj",
41
- "layers.7.mlp.gate_proj",
42
- "layers.29.mlp.down_proj",
43
- "34.mlp.down_proj",
44
- "layers.2.mlp.up_proj",
45
  "v_proj",
46
- "layers.17.mlp.down_proj",
 
 
 
47
  "layers.18.mlp.up_proj",
48
- "layers.5.mlp.gate_proj",
49
- "layers.26.mlp.gate_proj",
50
- "layers.3.mlp.gate_proj",
 
 
 
 
 
 
 
51
  "layers.24.mlp.down_proj",
52
- "layers.11.mlp.up_proj",
53
- "layers.2.mlp.down_proj",
 
54
  "layers.27.mlp.down_proj",
55
- "layers.25.mlp.gate_proj",
 
 
56
  "layers.2.mlp.gate_proj",
 
 
 
 
 
 
 
57
  "layers.20.mlp.up_proj",
 
 
 
58
  "layers.20.mlp.gate_proj",
59
- "layers.0.mlp.gate_proj",
60
- "33.mlp.up_proj",
61
- "32.mlp.down_proj",
62
- "layers.25.mlp.down_proj",
63
- "layers.28.mlp.down_proj",
64
- "layers.0.mlp.down_proj",
65
- "35.mlp.down_proj",
66
- "layers.6.mlp.down_proj",
67
- "o_proj",
68
- "layers.29.mlp.up_proj",
69
- "layers.3.mlp.up_proj",
70
- "35.mlp.gate_proj",
71
  "layers.20.mlp.down_proj",
72
- "layers.31.mlp.down_proj",
73
- "layers.13.mlp.gate_proj",
74
- "layers.7.mlp.down_proj",
75
- "layers.5.mlp.down_proj",
76
- "layers.5.mlp.up_proj",
77
- "q_proj",
78
- "layers.21.mlp.up_proj",
79
- "layers.10.mlp.down_proj",
80
- "layers.10.mlp.gate_proj",
81
- "layers.31.mlp.up_proj",
82
- "layers.0.mlp.up_proj",
83
- "layers.14.mlp.up_proj",
84
- "33.mlp.gate_proj",
85
- "35.mlp.up_proj",
86
- "layers.17.mlp.up_proj",
87
- "layers.11.mlp.down_proj",
88
- "layers.30.mlp.down_proj",
89
- "layers.6.mlp.up_proj",
90
- "layers.19.mlp.up_proj",
91
- "layers.4.mlp.up_proj",
92
  "layers.10.mlp.up_proj",
93
- "34.mlp.up_proj",
94
- "33.mlp.down_proj",
95
- "layers.22.mlp.gate_proj",
96
- "k_proj",
97
- "layers.9.mlp.gate_proj",
98
- "layers.9.mlp.up_proj",
99
- "layers.11.mlp.gate_proj",
100
- "layers.23.mlp.down_proj",
101
- "layers.7.mlp.up_proj",
102
  "layers.9.mlp.down_proj",
103
- "layers.21.mlp.gate_proj",
104
- "layers.17.mlp.gate_proj",
105
- "layers.29.mlp.gate_proj",
106
- "layers.13.mlp.down_proj",
107
- "layers.19.mlp.gate_proj",
108
- "layers.6.mlp.gate_proj",
109
- "layers.13.mlp.up_proj",
110
- "layers.26.mlp.down_proj",
111
- "layers.15.mlp.gate_proj",
112
- "layers.22.mlp.down_proj",
113
- "layers.30.mlp.up_proj",
114
- "layers.16.mlp.gate_proj",
115
- "layers.23.mlp.gate_proj",
116
- "layers.31.mlp.gate_proj",
117
- "layers.12.mlp.down_proj",
118
- "layers.1.mlp.up_proj",
119
- "layers.8.mlp.up_proj",
120
  "layers.4.mlp.down_proj",
121
- "layers.27.mlp.gate_proj",
 
 
 
122
  "layers.8.mlp.gate_proj",
123
  "layers.19.mlp.down_proj",
124
- "layers.8.mlp.down_proj",
125
- "34.mlp.gate_proj",
126
- "layers.16.mlp.down_proj",
127
  "layers.16.mlp.up_proj",
128
- "layers.18.mlp.gate_proj",
 
 
 
 
 
 
129
  "layers.12.mlp.up_proj",
130
- "layers.1.mlp.gate_proj",
131
- "layers.23.mlp.up_proj",
132
- "layers.15.mlp.up_proj",
133
- "layers.26.mlp.up_proj",
134
  "layers.1.mlp.down_proj",
135
- "layers.14.mlp.down_proj",
136
- "layers.12.mlp.gate_proj",
137
  "layers.4.mlp.gate_proj",
138
- "layers.27.mlp.up_proj"
 
 
 
 
 
 
 
139
  ],
140
  "task_type": "CAUSAL_LM",
141
  "trainable_token_indices": null,
 
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2.5-VL-7B-Instruct",
5
  "bias": "none",
6
  "corda_config": null,
7
  "eva_config": null,
 
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
27
+ "layers.16.mlp.down_proj",
28
+ "layers.12.mlp.gate_proj",
29
+ "layers.17.mlp.up_proj",
30
+ "layers.3.mlp.up_proj",
31
+ "layers.5.mlp.gate_proj",
32
+ "k_proj",
33
  "layers.25.mlp.up_proj",
 
 
 
 
 
 
 
 
 
 
 
 
34
  "v_proj",
35
+ "layers.8.mlp.down_proj",
36
+ "layers.24.mlp.up_proj",
37
+ "layers.15.mlp.up_proj",
38
+ "layers.7.mlp.gate_proj",
39
  "layers.18.mlp.up_proj",
40
+ "layers.13.mlp.gate_proj",
41
+ "layers.6.mlp.gate_proj",
42
+ "layers.9.mlp.gate_proj",
43
+ "layers.16.mlp.gate_proj",
44
+ "layers.22.mlp.up_proj",
45
+ "layers.21.mlp.up_proj",
46
+ "layers.27.mlp.gate_proj",
47
+ "layers.23.mlp.down_proj",
48
+ "layers.24.mlp.gate_proj",
49
+ "layers.23.mlp.up_proj",
50
  "layers.24.mlp.down_proj",
51
+ "layers.0.mlp.down_proj",
52
+ "layers.0.mlp.up_proj",
53
+ "layers.26.mlp.gate_proj",
54
  "layers.27.mlp.down_proj",
55
+ "q_proj",
56
+ "layers.1.mlp.up_proj",
57
+ "layers.21.mlp.down_proj",
58
  "layers.2.mlp.gate_proj",
59
+ "layers.11.mlp.gate_proj",
60
+ "layers.14.mlp.up_proj",
61
+ "layers.22.mlp.down_proj",
62
+ "layers.5.mlp.down_proj",
63
+ "layers.19.mlp.gate_proj",
64
+ "layers.17.mlp.down_proj",
65
+ "layers.9.mlp.up_proj",
66
  "layers.20.mlp.up_proj",
67
+ "layers.3.mlp.gate_proj",
68
+ "layers.1.mlp.gate_proj",
69
+ "layers.13.mlp.up_proj",
70
  "layers.20.mlp.gate_proj",
71
+ "layers.26.mlp.down_proj",
 
 
 
 
 
 
 
 
 
 
 
72
  "layers.20.mlp.down_proj",
73
+ "layers.14.mlp.down_proj",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  "layers.10.mlp.up_proj",
 
 
 
 
 
 
 
 
 
75
  "layers.9.mlp.down_proj",
76
+ "layers.6.mlp.up_proj",
77
+ "layers.11.mlp.up_proj",
78
+ "layers.10.mlp.gate_proj",
79
+ "layers.4.mlp.up_proj",
80
+ "layers.14.mlp.gate_proj",
81
+ "layers.2.mlp.up_proj",
 
 
 
 
 
 
 
 
 
 
 
82
  "layers.4.mlp.down_proj",
83
+ "layers.7.mlp.down_proj",
84
+ "o_proj",
85
+ "layers.0.mlp.gate_proj",
86
+ "layers.17.mlp.gate_proj",
87
  "layers.8.mlp.gate_proj",
88
  "layers.19.mlp.down_proj",
89
+ "layers.25.mlp.gate_proj",
90
+ "layers.11.mlp.down_proj",
 
91
  "layers.16.mlp.up_proj",
92
+ "layers.2.mlp.down_proj",
93
+ "layers.19.mlp.up_proj",
94
+ "layers.15.mlp.gate_proj",
95
+ "layers.22.mlp.gate_proj",
96
+ "layers.23.mlp.gate_proj",
97
+ "layers.6.mlp.down_proj",
98
+ "layers.10.mlp.down_proj",
99
  "layers.12.mlp.up_proj",
100
+ "layers.8.mlp.up_proj",
101
+ "layers.5.mlp.up_proj",
102
+ "layers.3.mlp.down_proj",
 
103
  "layers.1.mlp.down_proj",
104
+ "layers.21.mlp.gate_proj",
105
+ "layers.26.mlp.up_proj",
106
  "layers.4.mlp.gate_proj",
107
+ "layers.27.mlp.up_proj",
108
+ "layers.12.mlp.down_proj",
109
+ "layers.15.mlp.down_proj",
110
+ "layers.18.mlp.gate_proj",
111
+ "layers.18.mlp.down_proj",
112
+ "layers.7.mlp.up_proj",
113
+ "layers.25.mlp.down_proj",
114
+ "layers.13.mlp.down_proj"
115
  ],
116
  "task_type": "CAUSAL_LM",
117
  "trainable_token_indices": null,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:07f01f8f0b4027eaf41807cbd6678cfc2044054280411ce3145aec238302c90d
3
- size 239536776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c65869936ca775b0b7e34856b673e70c7b88167d895229d6337d74ee4f2a74aa
3
+ size 323014560
latest CHANGED
@@ -1 +1 @@
1
- global_step375
 
1
+ global_step800
ocr.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ad6da340f2f6403e305dac3b8deef53107b7c5de50f73da9c490b8b174d6e40
3
+ size 34218525
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dcffc811ab0ada587a376eecb6fc27cadcbc02597d7ed30d159fba6bf764c2b2
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbb214732a7d2d242635ceda6bf2c890011d7f28b61b690994eae9c7558a5c03
3
  size 1401
trainer_state.json CHANGED
@@ -2,543 +2,1138 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0,
6
  "eval_steps": 500,
7
- "global_step": 375,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.013333333333333334,
14
- "grad_norm": 0.1966240406036377,
15
- "learning_rate": 2.105263157894737e-05,
16
- "loss": 0.0919,
17
  "step": 5
18
  },
19
  {
20
- "epoch": 0.02666666666666667,
21
- "grad_norm": 0.109957255423069,
22
- "learning_rate": 4.736842105263158e-05,
23
- "loss": 0.0647,
24
  "step": 10
25
  },
26
  {
27
- "epoch": 0.04,
28
- "grad_norm": 0.06744097173213959,
29
- "learning_rate": 7.368421052631579e-05,
30
- "loss": 0.053,
31
  "step": 15
32
  },
33
  {
34
- "epoch": 0.05333333333333334,
35
- "grad_norm": 0.054616399109363556,
36
- "learning_rate": 0.0001,
37
- "loss": 0.0415,
38
  "step": 20
39
  },
40
  {
41
- "epoch": 0.06666666666666667,
42
- "grad_norm": 0.07561139017343521,
43
- "learning_rate": 9.995133583167832e-05,
44
- "loss": 0.0447,
45
  "step": 25
46
  },
47
  {
48
- "epoch": 0.08,
49
- "grad_norm": 0.048080697655677795,
50
- "learning_rate": 9.980543805476446e-05,
51
- "loss": 0.0345,
52
  "step": 30
53
  },
54
  {
55
- "epoch": 0.09333333333333334,
56
- "grad_norm": 0.052209969609975815,
57
- "learning_rate": 9.956259066901733e-05,
58
- "loss": 0.0316,
59
  "step": 35
60
  },
61
  {
62
- "epoch": 0.10666666666666667,
63
- "grad_norm": 0.04528443142771721,
64
- "learning_rate": 9.922326639307917e-05,
65
- "loss": 0.0338,
66
  "step": 40
67
  },
68
  {
69
- "epoch": 0.12,
70
- "grad_norm": 0.06078013405203819,
71
- "learning_rate": 9.878812574429721e-05,
72
- "loss": 0.0338,
73
  "step": 45
74
  },
75
  {
76
- "epoch": 0.13333333333333333,
77
- "grad_norm": 0.04745374619960785,
78
- "learning_rate": 9.825801575298248e-05,
79
- "loss": 0.0284,
80
  "step": 50
81
  },
82
  {
83
- "epoch": 0.14666666666666667,
84
- "grad_norm": 0.051620353013277054,
85
- "learning_rate": 9.763396831360884e-05,
86
- "loss": 0.0345,
87
  "step": 55
88
  },
89
  {
90
- "epoch": 0.16,
91
- "grad_norm": 0.049457110464572906,
92
- "learning_rate": 9.691719817616147e-05,
93
- "loss": 0.0317,
94
  "step": 60
95
  },
96
  {
97
- "epoch": 0.17333333333333334,
98
- "grad_norm": 0.04445967078208923,
99
- "learning_rate": 9.61091005815451e-05,
100
- "loss": 0.0278,
101
  "step": 65
102
  },
103
  {
104
- "epoch": 0.18666666666666668,
105
- "grad_norm": 0.04890437796711922,
106
- "learning_rate": 9.521124854565425e-05,
107
- "loss": 0.0316,
108
  "step": 70
109
  },
110
  {
111
- "epoch": 0.2,
112
- "grad_norm": 0.030447857454419136,
113
- "learning_rate": 9.422538979739307e-05,
114
- "loss": 0.0304,
115
  "step": 75
116
  },
117
  {
118
- "epoch": 0.21333333333333335,
119
- "grad_norm": 0.052393049001693726,
120
- "learning_rate": 9.315344337660421e-05,
121
- "loss": 0.032,
122
  "step": 80
123
  },
124
  {
125
- "epoch": 0.22666666666666666,
126
- "grad_norm": 0.05007480829954147,
127
- "learning_rate": 9.19974958985298e-05,
128
- "loss": 0.0259,
129
  "step": 85
130
  },
131
  {
132
- "epoch": 0.24,
133
- "grad_norm": 0.05939432606101036,
134
- "learning_rate": 9.075979749207561e-05,
135
- "loss": 0.0286,
136
  "step": 90
137
  },
138
  {
139
- "epoch": 0.25333333333333335,
140
- "grad_norm": 0.04563208296895027,
141
- "learning_rate": 8.944275741978493e-05,
142
- "loss": 0.025,
143
  "step": 95
144
  },
145
  {
146
- "epoch": 0.26666666666666666,
147
- "grad_norm": 0.05097965523600578,
148
- "learning_rate": 8.80489393880484e-05,
149
- "loss": 0.0267,
150
  "step": 100
151
  },
152
  {
153
- "epoch": 0.28,
154
- "grad_norm": 0.03556825965642929,
155
- "learning_rate": 8.65810565566782e-05,
156
- "loss": 0.0216,
157
  "step": 105
158
  },
159
  {
160
- "epoch": 0.29333333333333333,
161
- "grad_norm": 0.04471671208739281,
162
- "learning_rate": 8.504196625756166e-05,
163
- "loss": 0.0213,
164
  "step": 110
165
  },
166
  {
167
- "epoch": 0.30666666666666664,
168
- "grad_norm": 0.038890305906534195,
169
- "learning_rate": 8.343466443267391e-05,
170
- "loss": 0.0238,
171
  "step": 115
172
  },
173
  {
174
- "epoch": 0.32,
175
- "grad_norm": 0.04462519288063049,
176
- "learning_rate": 8.176227980227694e-05,
177
- "loss": 0.024,
178
  "step": 120
179
  },
180
  {
181
- "epoch": 0.3333333333333333,
182
- "grad_norm": 0.0567193366587162,
183
- "learning_rate": 8.002806777465685e-05,
184
- "loss": 0.0309,
185
  "step": 125
186
  },
187
  {
188
- "epoch": 0.3466666666666667,
189
- "grad_norm": 0.047946710139513016,
190
- "learning_rate": 7.823540410925435e-05,
191
- "loss": 0.0254,
192
  "step": 130
193
  },
194
  {
195
- "epoch": 0.36,
196
- "grad_norm": 0.050420425832271576,
197
- "learning_rate": 7.63877783455237e-05,
198
- "loss": 0.0259,
199
  "step": 135
200
  },
201
  {
202
- "epoch": 0.37333333333333335,
203
- "grad_norm": 0.044385556131601334,
204
- "learning_rate": 7.448878701031142e-05,
205
- "loss": 0.0256,
206
  "step": 140
207
  },
208
  {
209
- "epoch": 0.38666666666666666,
210
- "grad_norm": 0.038669321686029434,
211
- "learning_rate": 7.254212661697659e-05,
212
- "loss": 0.0221,
213
  "step": 145
214
  },
215
  {
216
- "epoch": 0.4,
217
- "grad_norm": 0.039255157113075256,
218
- "learning_rate": 7.055158646988109e-05,
219
- "loss": 0.0209,
220
  "step": 150
221
  },
222
  {
223
- "epoch": 0.41333333333333333,
224
- "grad_norm": 0.052665840834379196,
225
- "learning_rate": 6.85210412882557e-05,
226
- "loss": 0.0254,
227
  "step": 155
228
  },
229
  {
230
- "epoch": 0.4266666666666667,
231
- "grad_norm": 0.05023692920804024,
232
- "learning_rate": 6.64544436638005e-05,
233
- "loss": 0.0328,
234
  "step": 160
235
  },
236
  {
237
- "epoch": 0.44,
238
- "grad_norm": 0.06168365851044655,
239
- "learning_rate": 6.435581636670154e-05,
240
- "loss": 0.0275,
241
  "step": 165
242
  },
243
  {
244
- "epoch": 0.4533333333333333,
245
- "grad_norm": 0.04346410557627678,
246
- "learning_rate": 6.222924451504001e-05,
247
- "loss": 0.0199,
248
  "step": 170
249
  },
250
  {
251
- "epoch": 0.4666666666666667,
252
- "grad_norm": 0.04977494478225708,
253
- "learning_rate": 6.0078867622837395e-05,
254
- "loss": 0.0217,
255
  "step": 175
256
  },
257
  {
258
- "epoch": 0.48,
259
- "grad_norm": 0.048214834183454514,
260
- "learning_rate": 5.79088715422152e-05,
261
- "loss": 0.0217,
262
  "step": 180
263
  },
264
  {
265
- "epoch": 0.49333333333333335,
266
- "grad_norm": 0.04549489915370941,
267
- "learning_rate": 5.572348031535441e-05,
268
- "loss": 0.0243,
269
  "step": 185
270
  },
271
  {
272
- "epoch": 0.5066666666666667,
273
- "grad_norm": 0.03898506984114647,
274
- "learning_rate": 5.352694795211555e-05,
275
- "loss": 0.0225,
276
  "step": 190
277
  },
278
  {
279
- "epoch": 0.52,
280
- "grad_norm": 0.04560156539082527,
281
- "learning_rate": 5.132355014932455e-05,
282
- "loss": 0.0306,
283
  "step": 195
284
  },
285
  {
286
- "epoch": 0.5333333333333333,
287
- "grad_norm": 0.04489945247769356,
288
- "learning_rate": 4.911757596784357e-05,
289
- "loss": 0.0196,
290
  "step": 200
291
  },
292
  {
293
- "epoch": 0.5466666666666666,
294
- "grad_norm": 0.04646344482898712,
295
- "learning_rate": 4.691331948362789e-05,
296
- "loss": 0.022,
297
  "step": 205
298
  },
299
  {
300
- "epoch": 0.56,
301
- "grad_norm": 0.04694559797644615,
302
- "learning_rate": 4.471507142902036e-05,
303
- "loss": 0.0267,
304
  "step": 210
305
  },
306
  {
307
- "epoch": 0.5733333333333334,
308
- "grad_norm": 0.07199534773826599,
309
- "learning_rate": 4.252711084055467e-05,
310
- "loss": 0.0207,
311
  "step": 215
312
  },
313
  {
314
- "epoch": 0.5866666666666667,
315
- "grad_norm": 0.035585273057222366,
316
- "learning_rate": 4.035369672952516e-05,
317
- "loss": 0.0174,
318
  "step": 220
319
  },
320
  {
321
- "epoch": 0.6,
322
- "grad_norm": 0.04401592165231705,
323
- "learning_rate": 3.81990597915371e-05,
324
- "loss": 0.0215,
325
  "step": 225
326
  },
327
  {
328
- "epoch": 0.6133333333333333,
329
- "grad_norm": 0.05258890613913536,
330
- "learning_rate": 3.6067394171175394e-05,
331
- "loss": 0.0302,
332
  "step": 230
333
  },
334
  {
335
- "epoch": 0.6266666666666667,
336
- "grad_norm": 0.03661968186497688,
337
- "learning_rate": 3.3962849297822226e-05,
338
- "loss": 0.024,
339
  "step": 235
340
  },
341
  {
342
- "epoch": 0.64,
343
- "grad_norm": 0.05009055882692337,
344
- "learning_rate": 3.188952180851589e-05,
345
- "loss": 0.0206,
346
  "step": 240
347
  },
348
  {
349
- "epoch": 0.6533333333333333,
350
- "grad_norm": 0.043911464512348175,
351
- "learning_rate": 2.9851447573573384e-05,
352
- "loss": 0.0237,
353
  "step": 245
354
  },
355
  {
356
- "epoch": 0.6666666666666666,
357
- "grad_norm": 0.041712645441293716,
358
- "learning_rate": 2.785259384049959e-05,
359
- "loss": 0.0209,
360
  "step": 250
361
  },
362
  {
363
- "epoch": 0.68,
364
- "grad_norm": 0.046845823526382446,
365
- "learning_rate": 2.5896851511475186e-05,
366
- "loss": 0.0269,
367
  "step": 255
368
  },
369
  {
370
- "epoch": 0.6933333333333334,
371
- "grad_norm": 0.03639785200357437,
372
- "learning_rate": 2.3988027569455895e-05,
373
- "loss": 0.0224,
374
  "step": 260
375
  },
376
  {
377
- "epoch": 0.7066666666666667,
378
- "grad_norm": 0.03725459799170494,
379
- "learning_rate": 2.2129837667626145e-05,
380
- "loss": 0.0224,
381
  "step": 265
382
  },
383
  {
384
- "epoch": 0.72,
385
- "grad_norm": 0.04400669410824776,
386
- "learning_rate": 2.0325898896632177e-05,
387
- "loss": 0.0202,
388
  "step": 270
389
  },
390
  {
391
- "epoch": 0.7333333333333333,
392
- "grad_norm": 0.04929959774017334,
393
- "learning_rate": 1.8579722743673773e-05,
394
- "loss": 0.027,
395
  "step": 275
396
  },
397
  {
398
- "epoch": 0.7466666666666667,
399
- "grad_norm": 0.05031820759177208,
400
- "learning_rate": 1.689470825715998e-05,
401
- "loss": 0.023,
402
  "step": 280
403
  },
404
  {
405
- "epoch": 0.76,
406
- "grad_norm": 0.03400196135044098,
407
- "learning_rate": 1.5274135430234654e-05,
408
- "loss": 0.0222,
409
  "step": 285
410
  },
411
  {
412
- "epoch": 0.7733333333333333,
413
- "grad_norm": 0.04948483780026436,
414
- "learning_rate": 1.3721158816050873e-05,
415
- "loss": 0.0237,
416
  "step": 290
417
  },
418
  {
419
- "epoch": 0.7866666666666666,
420
- "grad_norm": 0.036222368478775024,
421
- "learning_rate": 1.2238801387222714e-05,
422
- "loss": 0.017,
423
  "step": 295
424
  },
425
  {
426
- "epoch": 0.8,
427
- "grad_norm": 0.05248803272843361,
428
- "learning_rate": 1.0829948651407374e-05,
429
- "loss": 0.0225,
430
  "step": 300
431
  },
432
  {
433
- "epoch": 0.8133333333333334,
434
- "grad_norm": 0.04245099052786827,
435
- "learning_rate": 9.497343034471895e-06,
436
- "loss": 0.0206,
437
  "step": 305
438
  },
439
  {
440
- "epoch": 0.8266666666666667,
441
- "grad_norm": 0.054338328540325165,
442
- "learning_rate": 8.243578542178226e-06,
443
- "loss": 0.0263,
444
  "step": 310
445
  },
446
  {
447
- "epoch": 0.84,
448
- "grad_norm": 0.053041160106658936,
449
- "learning_rate": 7.071095710777925e-06,
450
- "loss": 0.0261,
451
  "step": 315
452
  },
453
  {
454
- "epoch": 0.8533333333333334,
455
- "grad_norm": 0.04103247821331024,
456
- "learning_rate": 5.982176856345445e-06,
457
- "loss": 0.0277,
458
  "step": 320
459
  },
460
  {
461
- "epoch": 0.8666666666666667,
462
- "grad_norm": 0.038258735090494156,
463
- "learning_rate": 4.978941632097611e-06,
464
- "loss": 0.0196,
465
  "step": 325
466
  },
467
  {
468
- "epoch": 0.88,
469
- "grad_norm": 0.034113574773073196,
470
- "learning_rate": 4.0633429023472e-06,
471
- "loss": 0.0228,
472
  "step": 330
473
  },
474
  {
475
- "epoch": 0.8933333333333333,
476
- "grad_norm": 0.037502411752939224,
477
- "learning_rate": 3.2371629411221848e-06,
478
- "loss": 0.0216,
479
  "step": 335
480
  },
481
  {
482
- "epoch": 0.9066666666666666,
483
- "grad_norm": 0.041906435042619705,
484
- "learning_rate": 2.50200996285046e-06,
485
- "loss": 0.0245,
486
  "step": 340
487
  },
488
  {
489
- "epoch": 0.92,
490
- "grad_norm": 0.03619668632745743,
491
- "learning_rate": 1.8593149918630925e-06,
492
- "loss": 0.0193,
493
  "step": 345
494
  },
495
  {
496
- "epoch": 0.9333333333333333,
497
- "grad_norm": 0.053711794316768646,
498
- "learning_rate": 1.3103290768099797e-06,
499
- "loss": 0.0209,
500
  "step": 350
501
  },
502
  {
503
- "epoch": 0.9466666666666667,
504
- "grad_norm": 0.04455176740884781,
505
- "learning_rate": 8.561208554101863e-07,
506
- "loss": 0.0211,
507
  "step": 355
508
  },
509
  {
510
- "epoch": 0.96,
511
- "grad_norm": 0.04045643284916878,
512
- "learning_rate": 4.975744742772848e-07,
513
- "loss": 0.0214,
514
  "step": 360
515
  },
516
  {
517
- "epoch": 0.9733333333333334,
518
- "grad_norm": 0.039244893938302994,
519
- "learning_rate": 2.3538786786896915e-07,
520
- "loss": 0.0194,
521
  "step": 365
522
  },
523
  {
524
- "epoch": 0.9866666666666667,
525
- "grad_norm": 0.03764765337109566,
526
- "learning_rate": 7.007139991108135e-08,
527
- "loss": 0.0246,
528
  "step": 370
529
  },
530
  {
531
- "epoch": 1.0,
532
- "grad_norm": 0.04596920311450958,
533
- "learning_rate": 1.9468699405444934e-09,
534
- "loss": 0.0253,
535
  "step": 375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
536
  }
537
  ],
538
  "logging_steps": 5,
539
- "max_steps": 375,
540
  "num_input_tokens_seen": 0,
541
- "num_train_epochs": 1,
542
  "save_steps": 100,
543
  "stateful_callbacks": {
544
  "TrainerControl": {
@@ -547,12 +1142,12 @@
547
  "should_evaluate": false,
548
  "should_log": false,
549
  "should_save": true,
550
- "should_training_stop": true
551
  },
552
  "attributes": {}
553
  }
554
  },
555
- "total_flos": 5.816262106860749e+17,
556
  "train_batch_size": 2,
557
  "trial_name": null,
558
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.8921103986618344,
6
  "eval_steps": 500,
7
+ "global_step": 800,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.005575689991636465,
14
+ "grad_norm": 0.16284966468811035,
15
+ "learning_rate": 7.142857142857143e-06,
16
+ "loss": 0.0626,
17
  "step": 5
18
  },
19
  {
20
+ "epoch": 0.01115137998327293,
21
+ "grad_norm": 0.10163773596286774,
22
+ "learning_rate": 1.6071428571428572e-05,
23
+ "loss": 0.051,
24
  "step": 10
25
  },
26
  {
27
+ "epoch": 0.016727069974909393,
28
+ "grad_norm": 0.12601175904273987,
29
+ "learning_rate": 2.5e-05,
30
+ "loss": 0.041,
31
  "step": 15
32
  },
33
  {
34
+ "epoch": 0.02230275996654586,
35
+ "grad_norm": 0.04396357387304306,
36
+ "learning_rate": 3.392857142857143e-05,
37
+ "loss": 0.0336,
38
  "step": 20
39
  },
40
  {
41
+ "epoch": 0.027878449958182325,
42
+ "grad_norm": 0.04541896656155586,
43
+ "learning_rate": 4.2857142857142856e-05,
44
+ "loss": 0.03,
45
  "step": 25
46
  },
47
  {
48
+ "epoch": 0.03345413994981879,
49
+ "grad_norm": 0.035561174154281616,
50
+ "learning_rate": 5.1785714285714296e-05,
51
+ "loss": 0.026,
52
  "step": 30
53
  },
54
  {
55
+ "epoch": 0.039029829941455256,
56
+ "grad_norm": 0.04301896691322327,
57
+ "learning_rate": 6.0714285714285715e-05,
58
+ "loss": 0.0232,
59
  "step": 35
60
  },
61
  {
62
+ "epoch": 0.04460551993309172,
63
+ "grad_norm": 0.04101714491844177,
64
+ "learning_rate": 6.964285714285715e-05,
65
+ "loss": 0.0234,
66
  "step": 40
67
  },
68
  {
69
+ "epoch": 0.05018120992472819,
70
+ "grad_norm": 0.03225620836019516,
71
+ "learning_rate": 7.857142857142858e-05,
72
+ "loss": 0.0199,
73
  "step": 45
74
  },
75
  {
76
+ "epoch": 0.05575689991636465,
77
+ "grad_norm": 0.03156769648194313,
78
+ "learning_rate": 8.75e-05,
79
+ "loss": 0.02,
80
  "step": 50
81
  },
82
  {
83
+ "epoch": 0.06133258990800112,
84
+ "grad_norm": 0.03408223018050194,
85
+ "learning_rate": 9.642857142857143e-05,
86
+ "loss": 0.0227,
87
  "step": 55
88
  },
89
  {
90
+ "epoch": 0.06690827989963757,
91
+ "grad_norm": 0.031089385971426964,
92
+ "learning_rate": 9.999803846452024e-05,
93
+ "loss": 0.02,
94
  "step": 60
95
  },
96
  {
97
+ "epoch": 0.07248396989127405,
98
+ "grad_norm": 0.033274080604314804,
99
+ "learning_rate": 9.998605186060137e-05,
100
+ "loss": 0.014,
101
  "step": 65
102
  },
103
  {
104
+ "epoch": 0.07805965988291051,
105
+ "grad_norm": 0.02827683836221695,
106
+ "learning_rate": 9.996317100396068e-05,
107
+ "loss": 0.0202,
108
  "step": 70
109
  },
110
  {
111
+ "epoch": 0.08363534987454697,
112
+ "grad_norm": 0.037661969661712646,
113
+ "learning_rate": 9.992940088138597e-05,
114
+ "loss": 0.0222,
115
  "step": 75
116
  },
117
  {
118
+ "epoch": 0.08921103986618344,
119
+ "grad_norm": 0.04279659315943718,
120
+ "learning_rate": 9.988474885293544e-05,
121
+ "loss": 0.0186,
122
  "step": 80
123
  },
124
  {
125
+ "epoch": 0.0947867298578199,
126
+ "grad_norm": 0.026112260296940804,
127
+ "learning_rate": 9.98292246503335e-05,
128
+ "loss": 0.02,
129
  "step": 85
130
  },
131
  {
132
+ "epoch": 0.10036241984945637,
133
+ "grad_norm": 0.029179614037275314,
134
+ "learning_rate": 9.976284037484988e-05,
135
+ "loss": 0.0175,
136
  "step": 90
137
  },
138
  {
139
+ "epoch": 0.10593810984109284,
140
+ "grad_norm": 0.035265687853097916,
141
+ "learning_rate": 9.968561049466214e-05,
142
+ "loss": 0.0163,
143
  "step": 95
144
  },
145
  {
146
+ "epoch": 0.1115137998327293,
147
+ "grad_norm": 0.02659149281680584,
148
+ "learning_rate": 9.95975518417024e-05,
149
+ "loss": 0.0134,
150
  "step": 100
151
  },
152
  {
153
+ "epoch": 0.11708948982436576,
154
+ "grad_norm": 0.03535303473472595,
155
+ "learning_rate": 9.949868360798893e-05,
156
+ "loss": 0.0174,
157
  "step": 105
158
  },
159
  {
160
+ "epoch": 0.12266517981600224,
161
+ "grad_norm": 0.028047222644090652,
162
+ "learning_rate": 9.938902734144326e-05,
163
+ "loss": 0.014,
164
  "step": 110
165
  },
166
  {
167
+ "epoch": 0.12824086980763869,
168
+ "grad_norm": 0.025049524381756783,
169
+ "learning_rate": 9.926860694119398e-05,
170
+ "loss": 0.014,
171
  "step": 115
172
  },
173
  {
174
+ "epoch": 0.13381655979927515,
175
+ "grad_norm": 0.03536759689450264,
176
+ "learning_rate": 9.913744865236798e-05,
177
+ "loss": 0.0161,
178
  "step": 120
179
  },
180
  {
181
+ "epoch": 0.13939224979091164,
182
+ "grad_norm": 0.03142830356955528,
183
+ "learning_rate": 9.899558106037039e-05,
184
+ "loss": 0.016,
185
  "step": 125
186
  },
187
  {
188
+ "epoch": 0.1449679397825481,
189
+ "grad_norm": 0.029837962239980698,
190
+ "learning_rate": 9.884303508465463e-05,
191
+ "loss": 0.017,
192
  "step": 130
193
  },
194
  {
195
+ "epoch": 0.15054362977418456,
196
+ "grad_norm": 0.050843242555856705,
197
+ "learning_rate": 9.867984397198348e-05,
198
+ "loss": 0.0209,
199
  "step": 135
200
  },
201
  {
202
+ "epoch": 0.15611931976582102,
203
+ "grad_norm": 0.022020680829882622,
204
+ "learning_rate": 9.85060432891833e-05,
205
+ "loss": 0.013,
206
  "step": 140
207
  },
208
  {
209
+ "epoch": 0.1616950097574575,
210
+ "grad_norm": 0.03178994357585907,
211
+ "learning_rate": 9.832167091539214e-05,
212
+ "loss": 0.0144,
213
  "step": 145
214
  },
215
  {
216
+ "epoch": 0.16727069974909395,
217
+ "grad_norm": 0.030585451051592827,
218
+ "learning_rate": 9.812676703380433e-05,
219
+ "loss": 0.0144,
220
  "step": 150
221
  },
222
  {
223
+ "epoch": 0.1728463897407304,
224
+ "grad_norm": 0.02348562888801098,
225
+ "learning_rate": 9.792137412291265e-05,
226
+ "loss": 0.0137,
227
  "step": 155
228
  },
229
  {
230
+ "epoch": 0.17842207973236687,
231
+ "grad_norm": 0.028541121631860733,
232
+ "learning_rate": 9.770553694725028e-05,
233
+ "loss": 0.0169,
234
  "step": 160
235
  },
236
  {
237
+ "epoch": 0.18399776972400333,
238
+ "grad_norm": 0.02496708557009697,
239
+ "learning_rate": 9.747930254763467e-05,
240
+ "loss": 0.013,
241
  "step": 165
242
  },
243
  {
244
+ "epoch": 0.1895734597156398,
245
+ "grad_norm": 0.03572074696421623,
246
+ "learning_rate": 9.724272023091503e-05,
247
+ "loss": 0.0164,
248
  "step": 170
249
  },
250
  {
251
+ "epoch": 0.1951491497072763,
252
+ "grad_norm": 0.02730608731508255,
253
+ "learning_rate": 9.699584155922625e-05,
254
+ "loss": 0.0135,
255
  "step": 175
256
  },
257
  {
258
+ "epoch": 0.20072483969891275,
259
+ "grad_norm": 0.03099130466580391,
260
+ "learning_rate": 9.673872033875109e-05,
261
+ "loss": 0.0157,
262
  "step": 180
263
  },
264
  {
265
+ "epoch": 0.2063005296905492,
266
+ "grad_norm": 0.031458914279937744,
267
+ "learning_rate": 9.64714126079933e-05,
268
+ "loss": 0.0138,
269
  "step": 185
270
  },
271
  {
272
+ "epoch": 0.21187621968218567,
273
+ "grad_norm": 0.03125375509262085,
274
+ "learning_rate": 9.619397662556435e-05,
275
+ "loss": 0.0114,
276
  "step": 190
277
  },
278
  {
279
+ "epoch": 0.21745190967382214,
280
+ "grad_norm": 0.031778380274772644,
281
+ "learning_rate": 9.590647285748613e-05,
282
+ "loss": 0.0117,
283
  "step": 195
284
  },
285
  {
286
+ "epoch": 0.2230275996654586,
287
+ "grad_norm": 0.019305897876620293,
288
+ "learning_rate": 9.56089639640127e-05,
289
+ "loss": 0.0143,
290
  "step": 200
291
  },
292
  {
293
+ "epoch": 0.22860328965709506,
294
+ "grad_norm": 0.02124331146478653,
295
+ "learning_rate": 9.530151478597366e-05,
296
+ "loss": 0.0135,
297
  "step": 205
298
  },
299
  {
300
+ "epoch": 0.23417897964873152,
301
+ "grad_norm": 0.033200375735759735,
302
+ "learning_rate": 9.498419233064246e-05,
303
+ "loss": 0.0143,
304
  "step": 210
305
  },
306
  {
307
+ "epoch": 0.23975466964036798,
308
+ "grad_norm": 0.03514528647065163,
309
+ "learning_rate": 9.465706575713236e-05,
310
+ "loss": 0.0204,
311
  "step": 215
312
  },
313
  {
314
+ "epoch": 0.24533035963200447,
315
+ "grad_norm": 0.029311848804354668,
316
+ "learning_rate": 9.432020636132354e-05,
317
+ "loss": 0.0176,
318
  "step": 220
319
  },
320
  {
321
+ "epoch": 0.25090604962364094,
322
+ "grad_norm": 0.030977483838796616,
323
+ "learning_rate": 9.397368756032445e-05,
324
+ "loss": 0.0146,
325
  "step": 225
326
  },
327
  {
328
+ "epoch": 0.25648173961527737,
329
+ "grad_norm": 0.025566186755895615,
330
+ "learning_rate": 9.361758487647082e-05,
331
+ "loss": 0.0136,
332
  "step": 230
333
  },
334
  {
335
+ "epoch": 0.26205742960691386,
336
+ "grad_norm": 0.02457290142774582,
337
+ "learning_rate": 9.32519759208659e-05,
338
+ "loss": 0.0152,
339
  "step": 235
340
  },
341
  {
342
+ "epoch": 0.2676331195985503,
343
+ "grad_norm": 0.023812102153897285,
344
+ "learning_rate": 9.287694037646548e-05,
345
+ "loss": 0.0148,
346
  "step": 240
347
  },
348
  {
349
+ "epoch": 0.2732088095901868,
350
+ "grad_norm": 0.023294365033507347,
351
+ "learning_rate": 9.249255998071126e-05,
352
+ "loss": 0.0123,
353
  "step": 245
354
  },
355
  {
356
+ "epoch": 0.2787844995818233,
357
+ "grad_norm": 0.018993759527802467,
358
+ "learning_rate": 9.209891850771657e-05,
359
+ "loss": 0.0099,
360
  "step": 250
361
  },
362
  {
363
+ "epoch": 0.2843601895734597,
364
+ "grad_norm": 0.034646522253751755,
365
+ "learning_rate": 9.169610175000812e-05,
366
+ "loss": 0.0139,
367
  "step": 255
368
  },
369
  {
370
+ "epoch": 0.2899358795650962,
371
+ "grad_norm": 0.029509609565138817,
372
+ "learning_rate": 9.12841974998278e-05,
373
+ "loss": 0.0117,
374
  "step": 260
375
  },
376
  {
377
+ "epoch": 0.29551156955673263,
378
+ "grad_norm": 0.024864595383405685,
379
+ "learning_rate": 9.086329552999891e-05,
380
+ "loss": 0.0146,
381
  "step": 265
382
  },
383
  {
384
+ "epoch": 0.3010872595483691,
385
+ "grad_norm": 0.023953670635819435,
386
+ "learning_rate": 9.043348757436037e-05,
387
+ "loss": 0.0131,
388
  "step": 270
389
  },
390
  {
391
+ "epoch": 0.30666294954000556,
392
+ "grad_norm": 0.019653376191854477,
393
+ "learning_rate": 8.99948673077738e-05,
394
+ "loss": 0.0107,
395
  "step": 275
396
  },
397
  {
398
+ "epoch": 0.31223863953164205,
399
+ "grad_norm": 0.03767814487218857,
400
+ "learning_rate": 8.954753032570742e-05,
401
+ "loss": 0.0143,
402
  "step": 280
403
  },
404
  {
405
+ "epoch": 0.3178143295232785,
406
+ "grad_norm": 0.021987926214933395,
407
+ "learning_rate": 8.90915741234015e-05,
408
+ "loss": 0.0098,
409
  "step": 285
410
  },
411
  {
412
+ "epoch": 0.323390019514915,
413
+ "grad_norm": 0.021244384348392487,
414
+ "learning_rate": 8.862709807461956e-05,
415
+ "loss": 0.0106,
416
  "step": 290
417
  },
418
  {
419
+ "epoch": 0.32896570950655146,
420
+ "grad_norm": 0.02844163216650486,
421
+ "learning_rate": 8.815420340999033e-05,
422
+ "loss": 0.0162,
423
  "step": 295
424
  },
425
  {
426
+ "epoch": 0.3345413994981879,
427
+ "grad_norm": 0.02485722117125988,
428
+ "learning_rate": 8.767299319494503e-05,
429
+ "loss": 0.0164,
430
  "step": 300
431
  },
432
  {
433
+ "epoch": 0.3401170894898244,
434
+ "grad_norm": 0.019627615809440613,
435
+ "learning_rate": 8.718357230725449e-05,
436
+ "loss": 0.0139,
437
  "step": 305
438
  },
439
  {
440
+ "epoch": 0.3456927794814608,
441
+ "grad_norm": 0.02517726831138134,
442
+ "learning_rate": 8.668604741417171e-05,
443
+ "loss": 0.0128,
444
  "step": 310
445
  },
446
  {
447
+ "epoch": 0.3512684694730973,
448
+ "grad_norm": 0.02175074815750122,
449
+ "learning_rate": 8.618052694918399e-05,
450
+ "loss": 0.0111,
451
  "step": 315
452
  },
453
  {
454
+ "epoch": 0.35684415946473375,
455
+ "grad_norm": 0.021222930401563644,
456
+ "learning_rate": 8.566712108838042e-05,
457
+ "loss": 0.0111,
458
  "step": 320
459
  },
460
  {
461
+ "epoch": 0.36241984945637024,
462
+ "grad_norm": 0.024494808167219162,
463
+ "learning_rate": 8.514594172643934e-05,
464
+ "loss": 0.0138,
465
  "step": 325
466
  },
467
  {
468
+ "epoch": 0.36799553944800667,
469
+ "grad_norm": 0.022174010053277016,
470
+ "learning_rate": 8.461710245224148e-05,
471
+ "loss": 0.0134,
472
  "step": 330
473
  },
474
  {
475
+ "epoch": 0.37357122943964316,
476
+ "grad_norm": 0.01959528774023056,
477
+ "learning_rate": 8.40807185241137e-05,
478
+ "loss": 0.0102,
479
  "step": 335
480
  },
481
  {
482
+ "epoch": 0.3791469194312796,
483
+ "grad_norm": 0.017945902422070503,
484
+ "learning_rate": 8.353690684470884e-05,
485
+ "loss": 0.0143,
486
  "step": 340
487
  },
488
  {
489
+ "epoch": 0.3847226094229161,
490
+ "grad_norm": 0.020864926278591156,
491
+ "learning_rate": 8.298578593552737e-05,
492
+ "loss": 0.0179,
493
  "step": 345
494
  },
495
  {
496
+ "epoch": 0.3902982994145526,
497
+ "grad_norm": 0.027325566858053207,
498
+ "learning_rate": 8.242747591108605e-05,
499
+ "loss": 0.0133,
500
  "step": 350
501
  },
502
  {
503
+ "epoch": 0.395873989406189,
504
+ "grad_norm": 0.019658569246530533,
505
+ "learning_rate": 8.186209845273954e-05,
506
+ "loss": 0.0139,
507
  "step": 355
508
  },
509
  {
510
+ "epoch": 0.4014496793978255,
511
+ "grad_norm": 0.02014886401593685,
512
+ "learning_rate": 8.128977678216039e-05,
513
+ "loss": 0.009,
514
  "step": 360
515
  },
516
  {
517
+ "epoch": 0.40702536938946193,
518
+ "grad_norm": 0.02425803802907467,
519
+ "learning_rate": 8.07106356344834e-05,
520
+ "loss": 0.0125,
521
  "step": 365
522
  },
523
  {
524
+ "epoch": 0.4126010593810984,
525
+ "grad_norm": 0.030235106125473976,
526
+ "learning_rate": 8.012480123112014e-05,
527
+ "loss": 0.0171,
528
  "step": 370
529
  },
530
  {
531
+ "epoch": 0.41817674937273486,
532
+ "grad_norm": 0.022229960188269615,
533
+ "learning_rate": 7.953240125224948e-05,
534
+ "loss": 0.0116,
535
  "step": 375
536
+ },
537
+ {
538
+ "epoch": 0.42375243936437135,
539
+ "grad_norm": 0.025448938831686974,
540
+ "learning_rate": 7.89335648089903e-05,
541
+ "loss": 0.0142,
542
+ "step": 380
543
+ },
544
+ {
545
+ "epoch": 0.4293281293560078,
546
+ "grad_norm": 0.023552658036351204,
547
+ "learning_rate": 7.832842241526212e-05,
548
+ "loss": 0.0147,
549
+ "step": 385
550
+ },
551
+ {
552
+ "epoch": 0.43490381934764427,
553
+ "grad_norm": 0.019487692043185234,
554
+ "learning_rate": 7.77171059593403e-05,
555
+ "loss": 0.0115,
556
+ "step": 390
557
+ },
558
+ {
559
+ "epoch": 0.44047950933928076,
560
+ "grad_norm": 0.021791953593492508,
561
+ "learning_rate": 7.709974867511138e-05,
562
+ "loss": 0.012,
563
+ "step": 395
564
+ },
565
+ {
566
+ "epoch": 0.4460551993309172,
567
+ "grad_norm": 0.02281327173113823,
568
+ "learning_rate": 7.647648511303544e-05,
569
+ "loss": 0.0126,
570
+ "step": 400
571
+ },
572
+ {
573
+ "epoch": 0.4516308893225537,
574
+ "grad_norm": 0.02623576670885086,
575
+ "learning_rate": 7.584745111082127e-05,
576
+ "loss": 0.0128,
577
+ "step": 405
578
+ },
579
+ {
580
+ "epoch": 0.4572065793141901,
581
+ "grad_norm": 0.019894316792488098,
582
+ "learning_rate": 7.521278376382123e-05,
583
+ "loss": 0.0092,
584
+ "step": 410
585
+ },
586
+ {
587
+ "epoch": 0.4627822693058266,
588
+ "grad_norm": 0.022427916526794434,
589
+ "learning_rate": 7.457262139515171e-05,
590
+ "loss": 0.0111,
591
+ "step": 415
592
+ },
593
+ {
594
+ "epoch": 0.46835795929746304,
595
+ "grad_norm": 0.021067511290311813,
596
+ "learning_rate": 7.392710352554641e-05,
597
+ "loss": 0.0099,
598
+ "step": 420
599
+ },
600
+ {
601
+ "epoch": 0.47393364928909953,
602
+ "grad_norm": 0.019623806700110435,
603
+ "learning_rate": 7.327637084294817e-05,
604
+ "loss": 0.012,
605
+ "step": 425
606
+ },
607
+ {
608
+ "epoch": 0.47950933928073597,
609
+ "grad_norm": 0.02039971947669983,
610
+ "learning_rate": 7.262056517184669e-05,
611
+ "loss": 0.0138,
612
+ "step": 430
613
+ },
614
+ {
615
+ "epoch": 0.48508502927237246,
616
+ "grad_norm": 0.021388281136751175,
617
+ "learning_rate": 7.195982944236851e-05,
618
+ "loss": 0.0123,
619
+ "step": 435
620
+ },
621
+ {
622
+ "epoch": 0.49066071926400895,
623
+ "grad_norm": 0.022272834554314613,
624
+ "learning_rate": 7.1294307659126e-05,
625
+ "loss": 0.0126,
626
+ "step": 440
627
+ },
628
+ {
629
+ "epoch": 0.4962364092556454,
630
+ "grad_norm": 0.02803129144012928,
631
+ "learning_rate": 7.062414486983197e-05,
632
+ "loss": 0.0118,
633
+ "step": 445
634
+ },
635
+ {
636
+ "epoch": 0.5018120992472819,
637
+ "grad_norm": 0.025339094921946526,
638
+ "learning_rate": 6.994948713368737e-05,
639
+ "loss": 0.0147,
640
+ "step": 450
641
+ },
642
+ {
643
+ "epoch": 0.5073877892389184,
644
+ "grad_norm": 0.024465398862957954,
645
+ "learning_rate": 6.927048148954812e-05,
646
+ "loss": 0.0118,
647
+ "step": 455
648
+ },
649
+ {
650
+ "epoch": 0.5129634792305547,
651
+ "grad_norm": 0.025315098464488983,
652
+ "learning_rate": 6.858727592387867e-05,
653
+ "loss": 0.0165,
654
+ "step": 460
655
+ },
656
+ {
657
+ "epoch": 0.5185391692221912,
658
+ "grad_norm": 0.020109234377741814,
659
+ "learning_rate": 6.790001933849899e-05,
660
+ "loss": 0.0108,
661
+ "step": 465
662
+ },
663
+ {
664
+ "epoch": 0.5241148592138277,
665
+ "grad_norm": 0.01888495869934559,
666
+ "learning_rate": 6.720886151813194e-05,
667
+ "loss": 0.0097,
668
+ "step": 470
669
+ },
670
+ {
671
+ "epoch": 0.5296905492054642,
672
+ "grad_norm": 0.023433005437254906,
673
+ "learning_rate": 6.651395309775837e-05,
674
+ "loss": 0.0122,
675
+ "step": 475
676
+ },
677
+ {
678
+ "epoch": 0.5352662391971006,
679
+ "grad_norm": 0.019298607483506203,
680
+ "learning_rate": 6.581544552978687e-05,
681
+ "loss": 0.0134,
682
+ "step": 480
683
+ },
684
+ {
685
+ "epoch": 0.5408419291887371,
686
+ "grad_norm": 0.025588713586330414,
687
+ "learning_rate": 6.511349105104534e-05,
688
+ "loss": 0.0108,
689
+ "step": 485
690
+ },
691
+ {
692
+ "epoch": 0.5464176191803736,
693
+ "grad_norm": 0.03148540109395981,
694
+ "learning_rate": 6.440824264960157e-05,
695
+ "loss": 0.0115,
696
+ "step": 490
697
+ },
698
+ {
699
+ "epoch": 0.5519933091720101,
700
+ "grad_norm": 0.01748904027044773,
701
+ "learning_rate": 6.369985403142014e-05,
702
+ "loss": 0.0112,
703
+ "step": 495
704
+ },
705
+ {
706
+ "epoch": 0.5575689991636466,
707
+ "grad_norm": 0.027883267030119896,
708
+ "learning_rate": 6.298847958686283e-05,
709
+ "loss": 0.0125,
710
+ "step": 500
711
+ },
712
+ {
713
+ "epoch": 0.5631446891552829,
714
+ "grad_norm": 0.02129966951906681,
715
+ "learning_rate": 6.227427435703997e-05,
716
+ "loss": 0.0149,
717
+ "step": 505
718
+ },
719
+ {
720
+ "epoch": 0.5687203791469194,
721
+ "grad_norm": 0.02562125027179718,
722
+ "learning_rate": 6.15573940000197e-05,
723
+ "loss": 0.0136,
724
+ "step": 510
725
+ },
726
+ {
727
+ "epoch": 0.5742960691385559,
728
+ "grad_norm": 0.02446940541267395,
729
+ "learning_rate": 6.083799475690309e-05,
730
+ "loss": 0.0112,
731
+ "step": 515
732
+ },
733
+ {
734
+ "epoch": 0.5798717591301924,
735
+ "grad_norm": 0.024061646312475204,
736
+ "learning_rate": 6.0116233417771994e-05,
737
+ "loss": 0.0115,
738
+ "step": 520
739
+ },
740
+ {
741
+ "epoch": 0.5854474491218288,
742
+ "grad_norm": 0.01640748232603073,
743
+ "learning_rate": 5.9392267287517325e-05,
744
+ "loss": 0.0103,
745
+ "step": 525
746
+ },
747
+ {
748
+ "epoch": 0.5910231391134653,
749
+ "grad_norm": 0.023191062733530998,
750
+ "learning_rate": 5.8666254151554976e-05,
751
+ "loss": 0.0113,
752
+ "step": 530
753
+ },
754
+ {
755
+ "epoch": 0.5965988291051018,
756
+ "grad_norm": 0.017516452819108963,
757
+ "learning_rate": 5.7938352241437366e-05,
758
+ "loss": 0.0093,
759
+ "step": 535
760
+ },
761
+ {
762
+ "epoch": 0.6021745190967382,
763
+ "grad_norm": 0.019351812079548836,
764
+ "learning_rate": 5.720872020036734e-05,
765
+ "loss": 0.0125,
766
+ "step": 540
767
+ },
768
+ {
769
+ "epoch": 0.6077502090883747,
770
+ "grad_norm": 0.029706666246056557,
771
+ "learning_rate": 5.647751704862263e-05,
772
+ "loss": 0.008,
773
+ "step": 545
774
+ },
775
+ {
776
+ "epoch": 0.6133258990800111,
777
+ "grad_norm": 0.016750017181038857,
778
+ "learning_rate": 5.5744902148898005e-05,
779
+ "loss": 0.0118,
780
+ "step": 550
781
+ },
782
+ {
783
+ "epoch": 0.6189015890716476,
784
+ "grad_norm": 0.022833596915006638,
785
+ "learning_rate": 5.501103517157288e-05,
786
+ "loss": 0.0088,
787
+ "step": 555
788
+ },
789
+ {
790
+ "epoch": 0.6244772790632841,
791
+ "grad_norm": 0.03475171700119972,
792
+ "learning_rate": 5.427607605991176e-05,
793
+ "loss": 0.0136,
794
+ "step": 560
795
+ },
796
+ {
797
+ "epoch": 0.6300529690549206,
798
+ "grad_norm": 0.021340183913707733,
799
+ "learning_rate": 5.354018499520536e-05,
800
+ "loss": 0.0103,
801
+ "step": 565
802
+ },
803
+ {
804
+ "epoch": 0.635628659046557,
805
+ "grad_norm": 0.024497641250491142,
806
+ "learning_rate": 5.2803522361859594e-05,
807
+ "loss": 0.011,
808
+ "step": 570
809
+ },
810
+ {
811
+ "epoch": 0.6412043490381935,
812
+ "grad_norm": 0.01924068294465542,
813
+ "learning_rate": 5.2066248712440656e-05,
814
+ "loss": 0.0125,
815
+ "step": 575
816
+ },
817
+ {
818
+ "epoch": 0.64678003902983,
819
+ "grad_norm": 0.017638731747865677,
820
+ "learning_rate": 5.1328524732683134e-05,
821
+ "loss": 0.0104,
822
+ "step": 580
823
+ },
824
+ {
825
+ "epoch": 0.6523557290214664,
826
+ "grad_norm": 0.022175751626491547,
827
+ "learning_rate": 5.059051120646924e-05,
828
+ "loss": 0.0128,
829
+ "step": 585
830
+ },
831
+ {
832
+ "epoch": 0.6579314190131029,
833
+ "grad_norm": 0.022414250299334526,
834
+ "learning_rate": 4.985236898078658e-05,
835
+ "loss": 0.0128,
836
+ "step": 590
837
+ },
838
+ {
839
+ "epoch": 0.6635071090047393,
840
+ "grad_norm": 0.020940134301781654,
841
+ "learning_rate": 4.911425893067239e-05,
842
+ "loss": 0.0124,
843
+ "step": 595
844
+ },
845
+ {
846
+ "epoch": 0.6690827989963758,
847
+ "grad_norm": 0.021777737885713577,
848
+ "learning_rate": 4.837634192415128e-05,
849
+ "loss": 0.0126,
850
+ "step": 600
851
+ },
852
+ {
853
+ "epoch": 0.6746584889880123,
854
+ "grad_norm": 0.01768389716744423,
855
+ "learning_rate": 4.763877878717484e-05,
856
+ "loss": 0.0095,
857
+ "step": 605
858
+ },
859
+ {
860
+ "epoch": 0.6802341789796488,
861
+ "grad_norm": 0.02011968567967415,
862
+ "learning_rate": 4.6901730268570275e-05,
863
+ "loss": 0.0093,
864
+ "step": 610
865
+ },
866
+ {
867
+ "epoch": 0.6858098689712852,
868
+ "grad_norm": 0.02239886298775673,
869
+ "learning_rate": 4.616535700500583e-05,
870
+ "loss": 0.0126,
871
+ "step": 615
872
+ },
873
+ {
874
+ "epoch": 0.6913855589629216,
875
+ "grad_norm": 0.022233402356505394,
876
+ "learning_rate": 4.542981948598071e-05,
877
+ "loss": 0.0107,
878
+ "step": 620
879
+ },
880
+ {
881
+ "epoch": 0.6969612489545581,
882
+ "grad_norm": 0.027380308136343956,
883
+ "learning_rate": 4.4695278018847105e-05,
884
+ "loss": 0.0142,
885
+ "step": 625
886
+ },
887
+ {
888
+ "epoch": 0.7025369389461946,
889
+ "grad_norm": 0.025442643091082573,
890
+ "learning_rate": 4.396189269387176e-05,
891
+ "loss": 0.0153,
892
+ "step": 630
893
+ },
894
+ {
895
+ "epoch": 0.708112628937831,
896
+ "grad_norm": 0.01852184161543846,
897
+ "learning_rate": 4.322982334934509e-05,
898
+ "loss": 0.0102,
899
+ "step": 635
900
+ },
901
+ {
902
+ "epoch": 0.7136883189294675,
903
+ "grad_norm": 0.01528929267078638,
904
+ "learning_rate": 4.2499229536744986e-05,
905
+ "loss": 0.0097,
906
+ "step": 640
907
+ },
908
+ {
909
+ "epoch": 0.719264008921104,
910
+ "grad_norm": 0.026008352637290955,
911
+ "learning_rate": 4.17702704859633e-05,
912
+ "loss": 0.0159,
913
+ "step": 645
914
+ },
915
+ {
916
+ "epoch": 0.7248396989127405,
917
+ "grad_norm": 0.018146734684705734,
918
+ "learning_rate": 4.104310507060234e-05,
919
+ "loss": 0.0095,
920
+ "step": 650
921
+ },
922
+ {
923
+ "epoch": 0.730415388904377,
924
+ "grad_norm": 0.022718293592333794,
925
+ "learning_rate": 4.0317891773348946e-05,
926
+ "loss": 0.0095,
927
+ "step": 655
928
+ },
929
+ {
930
+ "epoch": 0.7359910788960133,
931
+ "grad_norm": 0.02410387434065342,
932
+ "learning_rate": 3.959478865143397e-05,
933
+ "loss": 0.0109,
934
+ "step": 660
935
+ },
936
+ {
937
+ "epoch": 0.7415667688876498,
938
+ "grad_norm": 0.017437651753425598,
939
+ "learning_rate": 3.887395330218429e-05,
940
+ "loss": 0.0107,
941
+ "step": 665
942
+ },
943
+ {
944
+ "epoch": 0.7471424588792863,
945
+ "grad_norm": 0.020500419661402702,
946
+ "learning_rate": 3.815554282867513e-05,
947
+ "loss": 0.0107,
948
+ "step": 670
949
+ },
950
+ {
951
+ "epoch": 0.7527181488709228,
952
+ "grad_norm": 0.01553898025304079,
953
+ "learning_rate": 3.743971380549008e-05,
954
+ "loss": 0.0083,
955
+ "step": 675
956
+ },
957
+ {
958
+ "epoch": 0.7582938388625592,
959
+ "grad_norm": 0.020166153088212013,
960
+ "learning_rate": 3.67266222445964e-05,
961
+ "loss": 0.0111,
962
+ "step": 680
963
+ },
964
+ {
965
+ "epoch": 0.7638695288541957,
966
+ "grad_norm": 0.023076798766851425,
967
+ "learning_rate": 3.6016423561342706e-05,
968
+ "loss": 0.0128,
969
+ "step": 685
970
+ },
971
+ {
972
+ "epoch": 0.7694452188458322,
973
+ "grad_norm": 0.02140919119119644,
974
+ "learning_rate": 3.5309272540587e-05,
975
+ "loss": 0.0104,
976
+ "step": 690
977
+ },
978
+ {
979
+ "epoch": 0.7750209088374687,
980
+ "grad_norm": 0.016010567545890808,
981
+ "learning_rate": 3.4605323302961854e-05,
982
+ "loss": 0.0135,
983
+ "step": 695
984
+ },
985
+ {
986
+ "epoch": 0.7805965988291051,
987
+ "grad_norm": 0.017510782927274704,
988
+ "learning_rate": 3.3904729271284473e-05,
989
+ "loss": 0.0115,
990
+ "step": 700
991
+ },
992
+ {
993
+ "epoch": 0.7861722888207415,
994
+ "grad_norm": 0.024468230083584785,
995
+ "learning_rate": 3.3207643137118874e-05,
996
+ "loss": 0.01,
997
+ "step": 705
998
+ },
999
+ {
1000
+ "epoch": 0.791747978812378,
1001
+ "grad_norm": 0.020976202562451363,
1002
+ "learning_rate": 3.251421682749732e-05,
1003
+ "loss": 0.0114,
1004
+ "step": 710
1005
+ },
1006
+ {
1007
+ "epoch": 0.7973236688040145,
1008
+ "grad_norm": 0.02256660722196102,
1009
+ "learning_rate": 3.18246014718085e-05,
1010
+ "loss": 0.0108,
1011
+ "step": 715
1012
+ },
1013
+ {
1014
+ "epoch": 0.802899358795651,
1015
+ "grad_norm": 0.030024701729416847,
1016
+ "learning_rate": 3.113894736885953e-05,
1017
+ "loss": 0.0104,
1018
+ "step": 720
1019
+ },
1020
+ {
1021
+ "epoch": 0.8084750487872874,
1022
+ "grad_norm": 0.019312310963869095,
1023
+ "learning_rate": 3.0457403954118856e-05,
1024
+ "loss": 0.0082,
1025
+ "step": 725
1026
+ },
1027
+ {
1028
+ "epoch": 0.8140507387789239,
1029
+ "grad_norm": 0.02008041925728321,
1030
+ "learning_rate": 2.978011976714753e-05,
1031
+ "loss": 0.0099,
1032
+ "step": 730
1033
+ },
1034
+ {
1035
+ "epoch": 0.8196264287705604,
1036
+ "grad_norm": 0.021896323189139366,
1037
+ "learning_rate": 2.9107242419225577e-05,
1038
+ "loss": 0.0143,
1039
+ "step": 735
1040
+ },
1041
+ {
1042
+ "epoch": 0.8252021187621968,
1043
+ "grad_norm": 0.019579166546463966,
1044
+ "learning_rate": 2.8438918561180634e-05,
1045
+ "loss": 0.0106,
1046
+ "step": 740
1047
+ },
1048
+ {
1049
+ "epoch": 0.8307778087538333,
1050
+ "grad_norm": 0.02049921080470085,
1051
+ "learning_rate": 2.7775293851426232e-05,
1052
+ "loss": 0.0115,
1053
+ "step": 745
1054
+ },
1055
+ {
1056
+ "epoch": 0.8363534987454697,
1057
+ "grad_norm": 0.014969157055020332,
1058
+ "learning_rate": 2.711651292421593e-05,
1059
+ "loss": 0.0101,
1060
+ "step": 750
1061
+ },
1062
+ {
1063
+ "epoch": 0.8419291887371062,
1064
+ "grad_norm": 0.020416075363755226,
1065
+ "learning_rate": 2.646271935812098e-05,
1066
+ "loss": 0.0098,
1067
+ "step": 755
1068
+ },
1069
+ {
1070
+ "epoch": 0.8475048787287427,
1071
+ "grad_norm": 0.018367785960435867,
1072
+ "learning_rate": 2.581405564473801e-05,
1073
+ "loss": 0.0165,
1074
+ "step": 760
1075
+ },
1076
+ {
1077
+ "epoch": 0.8530805687203792,
1078
+ "grad_norm": 0.0190111193805933,
1079
+ "learning_rate": 2.5170663157633477e-05,
1080
+ "loss": 0.0135,
1081
+ "step": 765
1082
+ },
1083
+ {
1084
+ "epoch": 0.8586562587120156,
1085
+ "grad_norm": 0.024806899949908257,
1086
+ "learning_rate": 2.45326821215319e-05,
1087
+ "loss": 0.0116,
1088
+ "step": 770
1089
+ },
1090
+ {
1091
+ "epoch": 0.864231948703652,
1092
+ "grad_norm": 0.02073819749057293,
1093
+ "learning_rate": 2.390025158175458e-05,
1094
+ "loss": 0.0129,
1095
+ "step": 775
1096
+ },
1097
+ {
1098
+ "epoch": 0.8698076386952885,
1099
+ "grad_norm": 0.02042596973478794,
1100
+ "learning_rate": 2.3273509373915093e-05,
1101
+ "loss": 0.0088,
1102
+ "step": 780
1103
+ },
1104
+ {
1105
+ "epoch": 0.875383328686925,
1106
+ "grad_norm": 0.015911240130662918,
1107
+ "learning_rate": 2.2652592093878666e-05,
1108
+ "loss": 0.01,
1109
+ "step": 785
1110
+ },
1111
+ {
1112
+ "epoch": 0.8809590186785615,
1113
+ "grad_norm": 0.023589760065078735,
1114
+ "learning_rate": 2.2037635067991663e-05,
1115
+ "loss": 0.0107,
1116
+ "step": 790
1117
+ },
1118
+ {
1119
+ "epoch": 0.8865347086701979,
1120
+ "grad_norm": 0.017796384170651436,
1121
+ "learning_rate": 2.1428772323587827e-05,
1122
+ "loss": 0.0103,
1123
+ "step": 795
1124
+ },
1125
+ {
1126
+ "epoch": 0.8921103986618344,
1127
+ "grad_norm": 0.018958481028676033,
1128
+ "learning_rate": 2.082613655977745e-05,
1129
+ "loss": 0.0079,
1130
+ "step": 800
1131
  }
1132
  ],
1133
  "logging_steps": 5,
1134
+ "max_steps": 1120,
1135
  "num_input_tokens_seen": 0,
1136
+ "num_train_epochs": 2,
1137
  "save_steps": 100,
1138
  "stateful_callbacks": {
1139
  "TrainerControl": {
 
1142
  "should_evaluate": false,
1143
  "should_log": false,
1144
  "should_save": true,
1145
+ "should_training_stop": false
1146
  },
1147
  "attributes": {}
1148
  }
1149
  },
1150
+ "total_flos": 2.728813755050754e+18,
1151
  "train_batch_size": 2,
1152
  "trial_name": null,
1153
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:51c4eab5dcfa36e42ea1871a443badd7333ec1d0f3b46f9485f900e1f6e3db2c
3
  size 7825
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8568d83effc9ae0e86eecad81d4c2bc1c32496e167d72cebf2a94b86d0aa123c
3
  size 7825