Reyad-Ahmmed commited on
Commit
e708fa1
·
verified ·
1 Parent(s): 22fb5b1

Push getvar generic t5 model

Browse files
Files changed (22) hide show
  1. json_extraction_point_activity/checkpoint-1000/config.json +61 -0
  2. json_extraction_point_activity/checkpoint-1000/generation_config.json +7 -0
  3. json_extraction_point_activity/checkpoint-1000/model.safetensors +3 -0
  4. json_extraction_point_activity/checkpoint-1000/optimizer.pt +3 -0
  5. json_extraction_point_activity/checkpoint-1000/rng_state.pth +3 -0
  6. json_extraction_point_activity/checkpoint-1000/scheduler.pt +3 -0
  7. json_extraction_point_activity/checkpoint-1000/trainer_state.json +1005 -0
  8. json_extraction_point_activity/checkpoint-1000/training_args.bin +3 -0
  9. json_extraction_point_activity/checkpoint-1015/config.json +61 -0
  10. json_extraction_point_activity/checkpoint-1015/generation_config.json +7 -0
  11. json_extraction_point_activity/checkpoint-1015/model.safetensors +3 -0
  12. json_extraction_point_activity/checkpoint-1015/optimizer.pt +3 -0
  13. json_extraction_point_activity/checkpoint-1015/rng_state.pth +3 -0
  14. json_extraction_point_activity/checkpoint-1015/scheduler.pt +3 -0
  15. json_extraction_point_activity/checkpoint-1015/trainer_state.json +1012 -0
  16. json_extraction_point_activity/checkpoint-1015/training_args.bin +3 -0
  17. json_extraction_point_activity/checkpoint-500/model.safetensors +1 -1
  18. json_extraction_point_activity/checkpoint-500/optimizer.pt +1 -1
  19. json_extraction_point_activity/checkpoint-500/rng_state.pth +1 -1
  20. json_extraction_point_activity/checkpoint-500/scheduler.pt +1 -1
  21. json_extraction_point_activity/checkpoint-500/trainer_state.json +312 -336
  22. json_extraction_point_activity/model.safetensors +1 -1
json_extraction_point_activity/checkpoint-1000/config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "t5-large",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 4096,
8
+ "d_kv": 64,
9
+ "d_model": 1024,
10
+ "decoder_start_token_id": 0,
11
+ "dense_act_fn": "relu",
12
+ "dropout_rate": 0.1,
13
+ "eos_token_id": 1,
14
+ "feed_forward_proj": "relu",
15
+ "initializer_factor": 1.0,
16
+ "is_encoder_decoder": true,
17
+ "is_gated_act": false,
18
+ "layer_norm_epsilon": 1e-06,
19
+ "model_type": "t5",
20
+ "n_positions": 512,
21
+ "num_decoder_layers": 24,
22
+ "num_heads": 16,
23
+ "num_layers": 24,
24
+ "output_past": true,
25
+ "pad_token_id": 0,
26
+ "relative_attention_max_distance": 128,
27
+ "relative_attention_num_buckets": 32,
28
+ "task_specific_params": {
29
+ "summarization": {
30
+ "early_stopping": true,
31
+ "length_penalty": 2.0,
32
+ "max_length": 200,
33
+ "min_length": 30,
34
+ "no_repeat_ngram_size": 3,
35
+ "num_beams": 4,
36
+ "prefix": "summarize: "
37
+ },
38
+ "translation_en_to_de": {
39
+ "early_stopping": true,
40
+ "max_length": 300,
41
+ "num_beams": 4,
42
+ "prefix": "translate English to German: "
43
+ },
44
+ "translation_en_to_fr": {
45
+ "early_stopping": true,
46
+ "max_length": 300,
47
+ "num_beams": 4,
48
+ "prefix": "translate English to French: "
49
+ },
50
+ "translation_en_to_ro": {
51
+ "early_stopping": true,
52
+ "max_length": 300,
53
+ "num_beams": 4,
54
+ "prefix": "translate English to Romanian: "
55
+ }
56
+ },
57
+ "torch_dtype": "float32",
58
+ "transformers_version": "4.48.2",
59
+ "use_cache": true,
60
+ "vocab_size": 32128
61
+ }
json_extraction_point_activity/checkpoint-1000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "decoder_start_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.48.2"
7
+ }
json_extraction_point_activity/checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2033e1d91207701dcf2a50652e20805abce211fa98d7df187b671b1ef7e7783
3
+ size 2950734544
json_extraction_point_activity/checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:304f67deadb2dd9d193f2e60ec9ab4c75474742551d5a0b0cdf99c290e5c949d
3
+ size 5901778825
json_extraction_point_activity/checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a17102bba356f793b2331a99300071e6d313d5a727155ed5f9df148159ac27b
3
+ size 14244
json_extraction_point_activity/checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f68200a6cbb8b4b695a170a8030cf83701b3b906f43a823586949eff641b80a
3
+ size 1064
json_extraction_point_activity/checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,1005 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 34.48275862068966,
5
+ "eval_steps": 500,
6
+ "global_step": 1000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.3448275862068966,
13
+ "grad_norm": 174.2040557861328,
14
+ "learning_rate": 1.9802955665024632e-05,
15
+ "loss": 11.9232,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.6896551724137931,
20
+ "grad_norm": 54.92245101928711,
21
+ "learning_rate": 1.9605911330049263e-05,
22
+ "loss": 6.0635,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 1.0,
27
+ "eval_loss": 0.8173177242279053,
28
+ "eval_runtime": 0.1894,
29
+ "eval_samples_per_second": 31.679,
30
+ "eval_steps_per_second": 10.56,
31
+ "step": 29
32
+ },
33
+ {
34
+ "epoch": 1.0344827586206897,
35
+ "grad_norm": 16.176136016845703,
36
+ "learning_rate": 1.9408866995073893e-05,
37
+ "loss": 2.7637,
38
+ "step": 30
39
+ },
40
+ {
41
+ "epoch": 1.3793103448275863,
42
+ "grad_norm": 5.2406907081604,
43
+ "learning_rate": 1.9211822660098524e-05,
44
+ "loss": 1.0566,
45
+ "step": 40
46
+ },
47
+ {
48
+ "epoch": 1.7241379310344827,
49
+ "grad_norm": 3.6556057929992676,
50
+ "learning_rate": 1.9014778325123154e-05,
51
+ "loss": 0.6543,
52
+ "step": 50
53
+ },
54
+ {
55
+ "epoch": 2.0,
56
+ "eval_loss": 0.26999202370643616,
57
+ "eval_runtime": 0.1963,
58
+ "eval_samples_per_second": 30.571,
59
+ "eval_steps_per_second": 10.19,
60
+ "step": 58
61
+ },
62
+ {
63
+ "epoch": 2.0689655172413794,
64
+ "grad_norm": 1.8488504886627197,
65
+ "learning_rate": 1.8817733990147784e-05,
66
+ "loss": 0.4713,
67
+ "step": 60
68
+ },
69
+ {
70
+ "epoch": 2.413793103448276,
71
+ "grad_norm": 2.723362922668457,
72
+ "learning_rate": 1.8620689655172415e-05,
73
+ "loss": 0.3144,
74
+ "step": 70
75
+ },
76
+ {
77
+ "epoch": 2.7586206896551726,
78
+ "grad_norm": 1.8972634077072144,
79
+ "learning_rate": 1.8423645320197045e-05,
80
+ "loss": 0.2604,
81
+ "step": 80
82
+ },
83
+ {
84
+ "epoch": 3.0,
85
+ "eval_loss": 0.09994816780090332,
86
+ "eval_runtime": 0.1912,
87
+ "eval_samples_per_second": 31.381,
88
+ "eval_steps_per_second": 10.46,
89
+ "step": 87
90
+ },
91
+ {
92
+ "epoch": 3.103448275862069,
93
+ "grad_norm": 3.115511417388916,
94
+ "learning_rate": 1.8226600985221676e-05,
95
+ "loss": 0.2067,
96
+ "step": 90
97
+ },
98
+ {
99
+ "epoch": 3.4482758620689653,
100
+ "grad_norm": 1.7388259172439575,
101
+ "learning_rate": 1.8029556650246306e-05,
102
+ "loss": 0.1494,
103
+ "step": 100
104
+ },
105
+ {
106
+ "epoch": 3.793103448275862,
107
+ "grad_norm": 1.2075275182724,
108
+ "learning_rate": 1.7832512315270937e-05,
109
+ "loss": 0.1411,
110
+ "step": 110
111
+ },
112
+ {
113
+ "epoch": 4.0,
114
+ "eval_loss": 0.05977391079068184,
115
+ "eval_runtime": 0.1933,
116
+ "eval_samples_per_second": 31.034,
117
+ "eval_steps_per_second": 10.345,
118
+ "step": 116
119
+ },
120
+ {
121
+ "epoch": 4.137931034482759,
122
+ "grad_norm": 1.7329133749008179,
123
+ "learning_rate": 1.7635467980295567e-05,
124
+ "loss": 0.1196,
125
+ "step": 120
126
+ },
127
+ {
128
+ "epoch": 4.482758620689655,
129
+ "grad_norm": 2.210278272628784,
130
+ "learning_rate": 1.7438423645320198e-05,
131
+ "loss": 0.1132,
132
+ "step": 130
133
+ },
134
+ {
135
+ "epoch": 4.827586206896552,
136
+ "grad_norm": 1.0056334733963013,
137
+ "learning_rate": 1.7241379310344828e-05,
138
+ "loss": 0.0789,
139
+ "step": 140
140
+ },
141
+ {
142
+ "epoch": 5.0,
143
+ "eval_loss": 0.04751617833971977,
144
+ "eval_runtime": 0.1898,
145
+ "eval_samples_per_second": 31.605,
146
+ "eval_steps_per_second": 10.535,
147
+ "step": 145
148
+ },
149
+ {
150
+ "epoch": 5.172413793103448,
151
+ "grad_norm": 0.9742516279220581,
152
+ "learning_rate": 1.704433497536946e-05,
153
+ "loss": 0.0927,
154
+ "step": 150
155
+ },
156
+ {
157
+ "epoch": 5.517241379310345,
158
+ "grad_norm": 1.4696099758148193,
159
+ "learning_rate": 1.684729064039409e-05,
160
+ "loss": 0.0837,
161
+ "step": 160
162
+ },
163
+ {
164
+ "epoch": 5.862068965517241,
165
+ "grad_norm": 1.0493124723434448,
166
+ "learning_rate": 1.665024630541872e-05,
167
+ "loss": 0.0689,
168
+ "step": 170
169
+ },
170
+ {
171
+ "epoch": 6.0,
172
+ "eval_loss": 0.03317258134484291,
173
+ "eval_runtime": 0.2045,
174
+ "eval_samples_per_second": 29.343,
175
+ "eval_steps_per_second": 9.781,
176
+ "step": 174
177
+ },
178
+ {
179
+ "epoch": 6.206896551724138,
180
+ "grad_norm": 0.9956067204475403,
181
+ "learning_rate": 1.645320197044335e-05,
182
+ "loss": 0.0702,
183
+ "step": 180
184
+ },
185
+ {
186
+ "epoch": 6.551724137931035,
187
+ "grad_norm": 0.4664933979511261,
188
+ "learning_rate": 1.625615763546798e-05,
189
+ "loss": 0.0675,
190
+ "step": 190
191
+ },
192
+ {
193
+ "epoch": 6.896551724137931,
194
+ "grad_norm": 1.2444266080856323,
195
+ "learning_rate": 1.605911330049261e-05,
196
+ "loss": 0.0596,
197
+ "step": 200
198
+ },
199
+ {
200
+ "epoch": 7.0,
201
+ "eval_loss": 0.030222313478589058,
202
+ "eval_runtime": 0.1913,
203
+ "eval_samples_per_second": 31.365,
204
+ "eval_steps_per_second": 10.455,
205
+ "step": 203
206
+ },
207
+ {
208
+ "epoch": 7.241379310344827,
209
+ "grad_norm": 0.5432140827178955,
210
+ "learning_rate": 1.586206896551724e-05,
211
+ "loss": 0.045,
212
+ "step": 210
213
+ },
214
+ {
215
+ "epoch": 7.586206896551724,
216
+ "grad_norm": 0.7679450511932373,
217
+ "learning_rate": 1.5665024630541875e-05,
218
+ "loss": 0.0538,
219
+ "step": 220
220
+ },
221
+ {
222
+ "epoch": 7.931034482758621,
223
+ "grad_norm": 0.7759860754013062,
224
+ "learning_rate": 1.5467980295566506e-05,
225
+ "loss": 0.0624,
226
+ "step": 230
227
+ },
228
+ {
229
+ "epoch": 8.0,
230
+ "eval_loss": 0.02808324061334133,
231
+ "eval_runtime": 0.2003,
232
+ "eval_samples_per_second": 29.953,
233
+ "eval_steps_per_second": 9.984,
234
+ "step": 232
235
+ },
236
+ {
237
+ "epoch": 8.275862068965518,
238
+ "grad_norm": 1.7437331676483154,
239
+ "learning_rate": 1.5270935960591133e-05,
240
+ "loss": 0.0369,
241
+ "step": 240
242
+ },
243
+ {
244
+ "epoch": 8.620689655172415,
245
+ "grad_norm": 0.5273000597953796,
246
+ "learning_rate": 1.5073891625615764e-05,
247
+ "loss": 0.0499,
248
+ "step": 250
249
+ },
250
+ {
251
+ "epoch": 8.96551724137931,
252
+ "grad_norm": 0.6120426058769226,
253
+ "learning_rate": 1.4876847290640396e-05,
254
+ "loss": 0.0425,
255
+ "step": 260
256
+ },
257
+ {
258
+ "epoch": 9.0,
259
+ "eval_loss": 0.03035571426153183,
260
+ "eval_runtime": 0.1888,
261
+ "eval_samples_per_second": 31.778,
262
+ "eval_steps_per_second": 10.593,
263
+ "step": 261
264
+ },
265
+ {
266
+ "epoch": 9.310344827586206,
267
+ "grad_norm": 1.587663173675537,
268
+ "learning_rate": 1.4679802955665026e-05,
269
+ "loss": 0.0395,
270
+ "step": 270
271
+ },
272
+ {
273
+ "epoch": 9.655172413793103,
274
+ "grad_norm": 0.7260332703590393,
275
+ "learning_rate": 1.4482758620689657e-05,
276
+ "loss": 0.0493,
277
+ "step": 280
278
+ },
279
+ {
280
+ "epoch": 10.0,
281
+ "grad_norm": 0.9717508554458618,
282
+ "learning_rate": 1.4285714285714287e-05,
283
+ "loss": 0.0424,
284
+ "step": 290
285
+ },
286
+ {
287
+ "epoch": 10.0,
288
+ "eval_loss": 0.026125147938728333,
289
+ "eval_runtime": 0.0989,
290
+ "eval_samples_per_second": 60.684,
291
+ "eval_steps_per_second": 20.228,
292
+ "step": 290
293
+ },
294
+ {
295
+ "epoch": 10.344827586206897,
296
+ "grad_norm": 0.7487574815750122,
297
+ "learning_rate": 1.4088669950738918e-05,
298
+ "loss": 0.0412,
299
+ "step": 300
300
+ },
301
+ {
302
+ "epoch": 10.689655172413794,
303
+ "grad_norm": 1.3717066049575806,
304
+ "learning_rate": 1.3891625615763548e-05,
305
+ "loss": 0.0296,
306
+ "step": 310
307
+ },
308
+ {
309
+ "epoch": 11.0,
310
+ "eval_loss": 0.031011082231998444,
311
+ "eval_runtime": 0.1922,
312
+ "eval_samples_per_second": 31.22,
313
+ "eval_steps_per_second": 10.407,
314
+ "step": 319
315
+ },
316
+ {
317
+ "epoch": 11.03448275862069,
318
+ "grad_norm": 0.6860368847846985,
319
+ "learning_rate": 1.369458128078818e-05,
320
+ "loss": 0.0411,
321
+ "step": 320
322
+ },
323
+ {
324
+ "epoch": 11.379310344827585,
325
+ "grad_norm": 0.9999271035194397,
326
+ "learning_rate": 1.3497536945812807e-05,
327
+ "loss": 0.0358,
328
+ "step": 330
329
+ },
330
+ {
331
+ "epoch": 11.724137931034482,
332
+ "grad_norm": 1.2313721179962158,
333
+ "learning_rate": 1.330049261083744e-05,
334
+ "loss": 0.0324,
335
+ "step": 340
336
+ },
337
+ {
338
+ "epoch": 12.0,
339
+ "eval_loss": 0.029439905658364296,
340
+ "eval_runtime": 0.2015,
341
+ "eval_samples_per_second": 29.784,
342
+ "eval_steps_per_second": 9.928,
343
+ "step": 348
344
+ },
345
+ {
346
+ "epoch": 12.068965517241379,
347
+ "grad_norm": 0.9032502174377441,
348
+ "learning_rate": 1.310344827586207e-05,
349
+ "loss": 0.041,
350
+ "step": 350
351
+ },
352
+ {
353
+ "epoch": 12.413793103448276,
354
+ "grad_norm": 0.612800657749176,
355
+ "learning_rate": 1.29064039408867e-05,
356
+ "loss": 0.0384,
357
+ "step": 360
358
+ },
359
+ {
360
+ "epoch": 12.758620689655173,
361
+ "grad_norm": 0.19512540102005005,
362
+ "learning_rate": 1.2709359605911331e-05,
363
+ "loss": 0.0349,
364
+ "step": 370
365
+ },
366
+ {
367
+ "epoch": 13.0,
368
+ "eval_loss": 0.02956104278564453,
369
+ "eval_runtime": 0.1909,
370
+ "eval_samples_per_second": 31.424,
371
+ "eval_steps_per_second": 10.475,
372
+ "step": 377
373
+ },
374
+ {
375
+ "epoch": 13.10344827586207,
376
+ "grad_norm": 0.8481155633926392,
377
+ "learning_rate": 1.2512315270935961e-05,
378
+ "loss": 0.0271,
379
+ "step": 380
380
+ },
381
+ {
382
+ "epoch": 13.448275862068966,
383
+ "grad_norm": 1.3683249950408936,
384
+ "learning_rate": 1.2315270935960592e-05,
385
+ "loss": 0.0265,
386
+ "step": 390
387
+ },
388
+ {
389
+ "epoch": 13.793103448275861,
390
+ "grad_norm": 0.6365839838981628,
391
+ "learning_rate": 1.2118226600985224e-05,
392
+ "loss": 0.0298,
393
+ "step": 400
394
+ },
395
+ {
396
+ "epoch": 14.0,
397
+ "eval_loss": 0.03038203716278076,
398
+ "eval_runtime": 0.1902,
399
+ "eval_samples_per_second": 31.538,
400
+ "eval_steps_per_second": 10.513,
401
+ "step": 406
402
+ },
403
+ {
404
+ "epoch": 14.137931034482758,
405
+ "grad_norm": 0.7672634124755859,
406
+ "learning_rate": 1.1921182266009855e-05,
407
+ "loss": 0.0335,
408
+ "step": 410
409
+ },
410
+ {
411
+ "epoch": 14.482758620689655,
412
+ "grad_norm": 0.2541676163673401,
413
+ "learning_rate": 1.1724137931034483e-05,
414
+ "loss": 0.0286,
415
+ "step": 420
416
+ },
417
+ {
418
+ "epoch": 14.827586206896552,
419
+ "grad_norm": 0.8434980511665344,
420
+ "learning_rate": 1.1527093596059114e-05,
421
+ "loss": 0.0205,
422
+ "step": 430
423
+ },
424
+ {
425
+ "epoch": 15.0,
426
+ "eval_loss": 0.030394822359085083,
427
+ "eval_runtime": 0.19,
428
+ "eval_samples_per_second": 31.587,
429
+ "eval_steps_per_second": 10.529,
430
+ "step": 435
431
+ },
432
+ {
433
+ "epoch": 15.172413793103448,
434
+ "grad_norm": 0.4303562641143799,
435
+ "learning_rate": 1.1330049261083744e-05,
436
+ "loss": 0.0323,
437
+ "step": 440
438
+ },
439
+ {
440
+ "epoch": 15.517241379310345,
441
+ "grad_norm": 0.42710408568382263,
442
+ "learning_rate": 1.1133004926108375e-05,
443
+ "loss": 0.0227,
444
+ "step": 450
445
+ },
446
+ {
447
+ "epoch": 15.862068965517242,
448
+ "grad_norm": 0.6126664876937866,
449
+ "learning_rate": 1.0935960591133005e-05,
450
+ "loss": 0.0215,
451
+ "step": 460
452
+ },
453
+ {
454
+ "epoch": 16.0,
455
+ "eval_loss": 0.030341001227498055,
456
+ "eval_runtime": 0.1895,
457
+ "eval_samples_per_second": 31.669,
458
+ "eval_steps_per_second": 10.556,
459
+ "step": 464
460
+ },
461
+ {
462
+ "epoch": 16.20689655172414,
463
+ "grad_norm": 0.7011211514472961,
464
+ "learning_rate": 1.0738916256157637e-05,
465
+ "loss": 0.0337,
466
+ "step": 470
467
+ },
468
+ {
469
+ "epoch": 16.551724137931036,
470
+ "grad_norm": 0.5126092433929443,
471
+ "learning_rate": 1.0541871921182268e-05,
472
+ "loss": 0.0233,
473
+ "step": 480
474
+ },
475
+ {
476
+ "epoch": 16.896551724137932,
477
+ "grad_norm": 0.9033933281898499,
478
+ "learning_rate": 1.0344827586206898e-05,
479
+ "loss": 0.0182,
480
+ "step": 490
481
+ },
482
+ {
483
+ "epoch": 17.0,
484
+ "eval_loss": 0.027958964928984642,
485
+ "eval_runtime": 0.1902,
486
+ "eval_samples_per_second": 31.553,
487
+ "eval_steps_per_second": 10.518,
488
+ "step": 493
489
+ },
490
+ {
491
+ "epoch": 17.24137931034483,
492
+ "grad_norm": 0.4326847493648529,
493
+ "learning_rate": 1.0147783251231529e-05,
494
+ "loss": 0.0249,
495
+ "step": 500
496
+ },
497
+ {
498
+ "epoch": 17.586206896551722,
499
+ "grad_norm": 0.4034535586833954,
500
+ "learning_rate": 9.95073891625616e-06,
501
+ "loss": 0.0215,
502
+ "step": 510
503
+ },
504
+ {
505
+ "epoch": 17.93103448275862,
506
+ "grad_norm": 0.4999659061431885,
507
+ "learning_rate": 9.75369458128079e-06,
508
+ "loss": 0.0247,
509
+ "step": 520
510
+ },
511
+ {
512
+ "epoch": 18.0,
513
+ "eval_loss": 0.030091799795627594,
514
+ "eval_runtime": 0.1897,
515
+ "eval_samples_per_second": 31.624,
516
+ "eval_steps_per_second": 10.541,
517
+ "step": 522
518
+ },
519
+ {
520
+ "epoch": 18.275862068965516,
521
+ "grad_norm": 0.3004280924797058,
522
+ "learning_rate": 9.55665024630542e-06,
523
+ "loss": 0.0161,
524
+ "step": 530
525
+ },
526
+ {
527
+ "epoch": 18.620689655172413,
528
+ "grad_norm": 0.5018593072891235,
529
+ "learning_rate": 9.359605911330049e-06,
530
+ "loss": 0.0227,
531
+ "step": 540
532
+ },
533
+ {
534
+ "epoch": 18.96551724137931,
535
+ "grad_norm": 0.5728505849838257,
536
+ "learning_rate": 9.162561576354681e-06,
537
+ "loss": 0.0221,
538
+ "step": 550
539
+ },
540
+ {
541
+ "epoch": 19.0,
542
+ "eval_loss": 0.030765995383262634,
543
+ "eval_runtime": 0.1882,
544
+ "eval_samples_per_second": 31.886,
545
+ "eval_steps_per_second": 10.629,
546
+ "step": 551
547
+ },
548
+ {
549
+ "epoch": 19.310344827586206,
550
+ "grad_norm": 1.1493581533432007,
551
+ "learning_rate": 8.965517241379312e-06,
552
+ "loss": 0.0204,
553
+ "step": 560
554
+ },
555
+ {
556
+ "epoch": 19.655172413793103,
557
+ "grad_norm": 0.3401670753955841,
558
+ "learning_rate": 8.768472906403942e-06,
559
+ "loss": 0.0164,
560
+ "step": 570
561
+ },
562
+ {
563
+ "epoch": 20.0,
564
+ "grad_norm": 0.9875850081443787,
565
+ "learning_rate": 8.571428571428571e-06,
566
+ "loss": 0.0213,
567
+ "step": 580
568
+ },
569
+ {
570
+ "epoch": 20.0,
571
+ "eval_loss": 0.028663409873843193,
572
+ "eval_runtime": 0.1008,
573
+ "eval_samples_per_second": 59.511,
574
+ "eval_steps_per_second": 19.837,
575
+ "step": 580
576
+ },
577
+ {
578
+ "epoch": 20.344827586206897,
579
+ "grad_norm": 0.6725947260856628,
580
+ "learning_rate": 8.374384236453203e-06,
581
+ "loss": 0.0192,
582
+ "step": 590
583
+ },
584
+ {
585
+ "epoch": 20.689655172413794,
586
+ "grad_norm": 0.6594141721725464,
587
+ "learning_rate": 8.177339901477834e-06,
588
+ "loss": 0.0194,
589
+ "step": 600
590
+ },
591
+ {
592
+ "epoch": 21.0,
593
+ "eval_loss": 0.027452999725937843,
594
+ "eval_runtime": 0.1899,
595
+ "eval_samples_per_second": 31.602,
596
+ "eval_steps_per_second": 10.534,
597
+ "step": 609
598
+ },
599
+ {
600
+ "epoch": 21.03448275862069,
601
+ "grad_norm": 0.32469141483306885,
602
+ "learning_rate": 7.980295566502464e-06,
603
+ "loss": 0.0167,
604
+ "step": 610
605
+ },
606
+ {
607
+ "epoch": 21.379310344827587,
608
+ "grad_norm": 0.7029064893722534,
609
+ "learning_rate": 7.783251231527095e-06,
610
+ "loss": 0.0237,
611
+ "step": 620
612
+ },
613
+ {
614
+ "epoch": 21.724137931034484,
615
+ "grad_norm": 0.5634991526603699,
616
+ "learning_rate": 7.586206896551724e-06,
617
+ "loss": 0.0212,
618
+ "step": 630
619
+ },
620
+ {
621
+ "epoch": 22.0,
622
+ "eval_loss": 0.027833983302116394,
623
+ "eval_runtime": 0.1913,
624
+ "eval_samples_per_second": 31.357,
625
+ "eval_steps_per_second": 10.452,
626
+ "step": 638
627
+ },
628
+ {
629
+ "epoch": 22.06896551724138,
630
+ "grad_norm": 0.7171387672424316,
631
+ "learning_rate": 7.3891625615763555e-06,
632
+ "loss": 0.0204,
633
+ "step": 640
634
+ },
635
+ {
636
+ "epoch": 22.413793103448278,
637
+ "grad_norm": 0.3534681797027588,
638
+ "learning_rate": 7.192118226600986e-06,
639
+ "loss": 0.0162,
640
+ "step": 650
641
+ },
642
+ {
643
+ "epoch": 22.75862068965517,
644
+ "grad_norm": 0.2446085512638092,
645
+ "learning_rate": 6.995073891625616e-06,
646
+ "loss": 0.0178,
647
+ "step": 660
648
+ },
649
+ {
650
+ "epoch": 23.0,
651
+ "eval_loss": 0.0313410721719265,
652
+ "eval_runtime": 0.1923,
653
+ "eval_samples_per_second": 31.2,
654
+ "eval_steps_per_second": 10.4,
655
+ "step": 667
656
+ },
657
+ {
658
+ "epoch": 23.103448275862068,
659
+ "grad_norm": 0.3782992959022522,
660
+ "learning_rate": 6.798029556650246e-06,
661
+ "loss": 0.0214,
662
+ "step": 670
663
+ },
664
+ {
665
+ "epoch": 23.448275862068964,
666
+ "grad_norm": 0.2016129344701767,
667
+ "learning_rate": 6.600985221674877e-06,
668
+ "loss": 0.0167,
669
+ "step": 680
670
+ },
671
+ {
672
+ "epoch": 23.79310344827586,
673
+ "grad_norm": 0.3993048667907715,
674
+ "learning_rate": 6.403940886699508e-06,
675
+ "loss": 0.0217,
676
+ "step": 690
677
+ },
678
+ {
679
+ "epoch": 24.0,
680
+ "eval_loss": 0.030709436163306236,
681
+ "eval_runtime": 0.1914,
682
+ "eval_samples_per_second": 31.345,
683
+ "eval_steps_per_second": 10.448,
684
+ "step": 696
685
+ },
686
+ {
687
+ "epoch": 24.137931034482758,
688
+ "grad_norm": 0.7120524048805237,
689
+ "learning_rate": 6.206896551724138e-06,
690
+ "loss": 0.0227,
691
+ "step": 700
692
+ },
693
+ {
694
+ "epoch": 24.482758620689655,
695
+ "grad_norm": 0.3477053940296173,
696
+ "learning_rate": 6.00985221674877e-06,
697
+ "loss": 0.0203,
698
+ "step": 710
699
+ },
700
+ {
701
+ "epoch": 24.82758620689655,
702
+ "grad_norm": 0.30051225423812866,
703
+ "learning_rate": 5.812807881773399e-06,
704
+ "loss": 0.0118,
705
+ "step": 720
706
+ },
707
+ {
708
+ "epoch": 25.0,
709
+ "eval_loss": 0.02974347025156021,
710
+ "eval_runtime": 0.1936,
711
+ "eval_samples_per_second": 30.992,
712
+ "eval_steps_per_second": 10.331,
713
+ "step": 725
714
+ },
715
+ {
716
+ "epoch": 25.17241379310345,
717
+ "grad_norm": 0.38762152194976807,
718
+ "learning_rate": 5.61576354679803e-06,
719
+ "loss": 0.0161,
720
+ "step": 730
721
+ },
722
+ {
723
+ "epoch": 25.517241379310345,
724
+ "grad_norm": 0.48786938190460205,
725
+ "learning_rate": 5.41871921182266e-06,
726
+ "loss": 0.0153,
727
+ "step": 740
728
+ },
729
+ {
730
+ "epoch": 25.862068965517242,
731
+ "grad_norm": 0.273346483707428,
732
+ "learning_rate": 5.2216748768472915e-06,
733
+ "loss": 0.0189,
734
+ "step": 750
735
+ },
736
+ {
737
+ "epoch": 26.0,
738
+ "eval_loss": 0.031065121293067932,
739
+ "eval_runtime": 0.1912,
740
+ "eval_samples_per_second": 31.387,
741
+ "eval_steps_per_second": 10.462,
742
+ "step": 754
743
+ },
744
+ {
745
+ "epoch": 26.20689655172414,
746
+ "grad_norm": 0.8301162123680115,
747
+ "learning_rate": 5.024630541871922e-06,
748
+ "loss": 0.0194,
749
+ "step": 760
750
+ },
751
+ {
752
+ "epoch": 26.551724137931036,
753
+ "grad_norm": 0.46895724534988403,
754
+ "learning_rate": 4.8275862068965525e-06,
755
+ "loss": 0.0149,
756
+ "step": 770
757
+ },
758
+ {
759
+ "epoch": 26.896551724137932,
760
+ "grad_norm": 0.4784580171108246,
761
+ "learning_rate": 4.630541871921182e-06,
762
+ "loss": 0.0185,
763
+ "step": 780
764
+ },
765
+ {
766
+ "epoch": 27.0,
767
+ "eval_loss": 0.029767701402306557,
768
+ "eval_runtime": 0.1921,
769
+ "eval_samples_per_second": 31.24,
770
+ "eval_steps_per_second": 10.413,
771
+ "step": 783
772
+ },
773
+ {
774
+ "epoch": 27.24137931034483,
775
+ "grad_norm": 0.7427147626876831,
776
+ "learning_rate": 4.4334975369458135e-06,
777
+ "loss": 0.0155,
778
+ "step": 790
779
+ },
780
+ {
781
+ "epoch": 27.586206896551722,
782
+ "grad_norm": 0.4144653081893921,
783
+ "learning_rate": 4.236453201970444e-06,
784
+ "loss": 0.0159,
785
+ "step": 800
786
+ },
787
+ {
788
+ "epoch": 27.93103448275862,
789
+ "grad_norm": 0.29942747950553894,
790
+ "learning_rate": 4.039408866995074e-06,
791
+ "loss": 0.021,
792
+ "step": 810
793
+ },
794
+ {
795
+ "epoch": 28.0,
796
+ "eval_loss": 0.029253564774990082,
797
+ "eval_runtime": 0.1914,
798
+ "eval_samples_per_second": 31.352,
799
+ "eval_steps_per_second": 10.451,
800
+ "step": 812
801
+ },
802
+ {
803
+ "epoch": 28.275862068965516,
804
+ "grad_norm": 0.21119730174541473,
805
+ "learning_rate": 3.842364532019705e-06,
806
+ "loss": 0.0153,
807
+ "step": 820
808
+ },
809
+ {
810
+ "epoch": 28.620689655172413,
811
+ "grad_norm": 0.2980373501777649,
812
+ "learning_rate": 3.6453201970443354e-06,
813
+ "loss": 0.0189,
814
+ "step": 830
815
+ },
816
+ {
817
+ "epoch": 28.96551724137931,
818
+ "grad_norm": 0.09113238751888275,
819
+ "learning_rate": 3.448275862068966e-06,
820
+ "loss": 0.0102,
821
+ "step": 840
822
+ },
823
+ {
824
+ "epoch": 29.0,
825
+ "eval_loss": 0.028439467772841454,
826
+ "eval_runtime": 0.1908,
827
+ "eval_samples_per_second": 31.449,
828
+ "eval_steps_per_second": 10.483,
829
+ "step": 841
830
+ },
831
+ {
832
+ "epoch": 29.310344827586206,
833
+ "grad_norm": 0.3905262351036072,
834
+ "learning_rate": 3.2512315270935963e-06,
835
+ "loss": 0.0153,
836
+ "step": 850
837
+ },
838
+ {
839
+ "epoch": 29.655172413793103,
840
+ "grad_norm": 0.36752381920814514,
841
+ "learning_rate": 3.054187192118227e-06,
842
+ "loss": 0.014,
843
+ "step": 860
844
+ },
845
+ {
846
+ "epoch": 30.0,
847
+ "grad_norm": 0.6769506931304932,
848
+ "learning_rate": 2.8571428571428573e-06,
849
+ "loss": 0.018,
850
+ "step": 870
851
+ },
852
+ {
853
+ "epoch": 30.0,
854
+ "eval_loss": 0.02924039028584957,
855
+ "eval_runtime": 0.102,
856
+ "eval_samples_per_second": 58.797,
857
+ "eval_steps_per_second": 19.599,
858
+ "step": 870
859
+ },
860
+ {
861
+ "epoch": 30.344827586206897,
862
+ "grad_norm": 0.676186740398407,
863
+ "learning_rate": 2.660098522167488e-06,
864
+ "loss": 0.0202,
865
+ "step": 880
866
+ },
867
+ {
868
+ "epoch": 30.689655172413794,
869
+ "grad_norm": 0.2909271717071533,
870
+ "learning_rate": 2.4630541871921186e-06,
871
+ "loss": 0.0105,
872
+ "step": 890
873
+ },
874
+ {
875
+ "epoch": 31.0,
876
+ "eval_loss": 0.030303308740258217,
877
+ "eval_runtime": 0.191,
878
+ "eval_samples_per_second": 31.415,
879
+ "eval_steps_per_second": 10.472,
880
+ "step": 899
881
+ },
882
+ {
883
+ "epoch": 31.03448275862069,
884
+ "grad_norm": 0.8155515193939209,
885
+ "learning_rate": 2.266009852216749e-06,
886
+ "loss": 0.0174,
887
+ "step": 900
888
+ },
889
+ {
890
+ "epoch": 31.379310344827587,
891
+ "grad_norm": 0.5190662741661072,
892
+ "learning_rate": 2.0689655172413796e-06,
893
+ "loss": 0.0157,
894
+ "step": 910
895
+ },
896
+ {
897
+ "epoch": 31.724137931034484,
898
+ "grad_norm": 0.8997210264205933,
899
+ "learning_rate": 1.8719211822660098e-06,
900
+ "loss": 0.0155,
901
+ "step": 920
902
+ },
903
+ {
904
+ "epoch": 32.0,
905
+ "eval_loss": 0.030399195849895477,
906
+ "eval_runtime": 0.1899,
907
+ "eval_samples_per_second": 31.593,
908
+ "eval_steps_per_second": 10.531,
909
+ "step": 928
910
+ },
911
+ {
912
+ "epoch": 32.06896551724138,
913
+ "grad_norm": 0.5701755881309509,
914
+ "learning_rate": 1.6748768472906405e-06,
915
+ "loss": 0.0139,
916
+ "step": 930
917
+ },
918
+ {
919
+ "epoch": 32.41379310344828,
920
+ "grad_norm": 0.16824859380722046,
921
+ "learning_rate": 1.4778325123152712e-06,
922
+ "loss": 0.0113,
923
+ "step": 940
924
+ },
925
+ {
926
+ "epoch": 32.758620689655174,
927
+ "grad_norm": 0.5066978931427002,
928
+ "learning_rate": 1.2807881773399017e-06,
929
+ "loss": 0.0127,
930
+ "step": 950
931
+ },
932
+ {
933
+ "epoch": 33.0,
934
+ "eval_loss": 0.030721982941031456,
935
+ "eval_runtime": 0.191,
936
+ "eval_samples_per_second": 31.41,
937
+ "eval_steps_per_second": 10.47,
938
+ "step": 957
939
+ },
940
+ {
941
+ "epoch": 33.10344827586207,
942
+ "grad_norm": 0.5038828253746033,
943
+ "learning_rate": 1.0837438423645322e-06,
944
+ "loss": 0.0138,
945
+ "step": 960
946
+ },
947
+ {
948
+ "epoch": 33.44827586206897,
949
+ "grad_norm": 0.4406326115131378,
950
+ "learning_rate": 8.866995073891626e-07,
951
+ "loss": 0.0116,
952
+ "step": 970
953
+ },
954
+ {
955
+ "epoch": 33.793103448275865,
956
+ "grad_norm": 0.3732337951660156,
957
+ "learning_rate": 6.896551724137931e-07,
958
+ "loss": 0.0171,
959
+ "step": 980
960
+ },
961
+ {
962
+ "epoch": 34.0,
963
+ "eval_loss": 0.03042110800743103,
964
+ "eval_runtime": 0.191,
965
+ "eval_samples_per_second": 31.415,
966
+ "eval_steps_per_second": 10.472,
967
+ "step": 986
968
+ },
969
+ {
970
+ "epoch": 34.13793103448276,
971
+ "grad_norm": 0.32739463448524475,
972
+ "learning_rate": 4.926108374384237e-07,
973
+ "loss": 0.0186,
974
+ "step": 990
975
+ },
976
+ {
977
+ "epoch": 34.48275862068966,
978
+ "grad_norm": 0.45314541459083557,
979
+ "learning_rate": 2.955665024630542e-07,
980
+ "loss": 0.0108,
981
+ "step": 1000
982
+ }
983
+ ],
984
+ "logging_steps": 10,
985
+ "max_steps": 1015,
986
+ "num_input_tokens_seen": 0,
987
+ "num_train_epochs": 35,
988
+ "save_steps": 500,
989
+ "stateful_callbacks": {
990
+ "TrainerControl": {
991
+ "args": {
992
+ "should_epoch_stop": false,
993
+ "should_evaluate": false,
994
+ "should_log": false,
995
+ "should_save": true,
996
+ "should_training_stop": false
997
+ },
998
+ "attributes": {}
999
+ }
1000
+ },
1001
+ "total_flos": 2128241491968000.0,
1002
+ "train_batch_size": 4,
1003
+ "trial_name": null,
1004
+ "trial_params": null
1005
+ }
json_extraction_point_activity/checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:333cc851b675d7a01620eeba59cdfbc1a624d74927d086ab660bd04fef9b6029
3
+ size 5240
json_extraction_point_activity/checkpoint-1015/config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "t5-large",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 4096,
8
+ "d_kv": 64,
9
+ "d_model": 1024,
10
+ "decoder_start_token_id": 0,
11
+ "dense_act_fn": "relu",
12
+ "dropout_rate": 0.1,
13
+ "eos_token_id": 1,
14
+ "feed_forward_proj": "relu",
15
+ "initializer_factor": 1.0,
16
+ "is_encoder_decoder": true,
17
+ "is_gated_act": false,
18
+ "layer_norm_epsilon": 1e-06,
19
+ "model_type": "t5",
20
+ "n_positions": 512,
21
+ "num_decoder_layers": 24,
22
+ "num_heads": 16,
23
+ "num_layers": 24,
24
+ "output_past": true,
25
+ "pad_token_id": 0,
26
+ "relative_attention_max_distance": 128,
27
+ "relative_attention_num_buckets": 32,
28
+ "task_specific_params": {
29
+ "summarization": {
30
+ "early_stopping": true,
31
+ "length_penalty": 2.0,
32
+ "max_length": 200,
33
+ "min_length": 30,
34
+ "no_repeat_ngram_size": 3,
35
+ "num_beams": 4,
36
+ "prefix": "summarize: "
37
+ },
38
+ "translation_en_to_de": {
39
+ "early_stopping": true,
40
+ "max_length": 300,
41
+ "num_beams": 4,
42
+ "prefix": "translate English to German: "
43
+ },
44
+ "translation_en_to_fr": {
45
+ "early_stopping": true,
46
+ "max_length": 300,
47
+ "num_beams": 4,
48
+ "prefix": "translate English to French: "
49
+ },
50
+ "translation_en_to_ro": {
51
+ "early_stopping": true,
52
+ "max_length": 300,
53
+ "num_beams": 4,
54
+ "prefix": "translate English to Romanian: "
55
+ }
56
+ },
57
+ "torch_dtype": "float32",
58
+ "transformers_version": "4.48.2",
59
+ "use_cache": true,
60
+ "vocab_size": 32128
61
+ }
json_extraction_point_activity/checkpoint-1015/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "decoder_start_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.48.2"
7
+ }
json_extraction_point_activity/checkpoint-1015/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a3f3216f61837c04638fd20747110cf419cd0cf7777e19c57b1a79ccbd5cce8
3
+ size 2950734544
json_extraction_point_activity/checkpoint-1015/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f50e5090f6f75af54d40ad57109be3f0f7daaffacb86271dba6ac32dfc076bb
3
+ size 5901778825
json_extraction_point_activity/checkpoint-1015/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2a4fa7b4c0bb14dd8ab49891754dfc89b075c92267608720041d0fd455797e7
3
+ size 14244
json_extraction_point_activity/checkpoint-1015/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2ccfc7b23418fc7ce59db86126d54164c357ffeea106e4ff6ff9ed8cf664181
3
+ size 1064
json_extraction_point_activity/checkpoint-1015/trainer_state.json ADDED
@@ -0,0 +1,1012 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 35.0,
5
+ "eval_steps": 500,
6
+ "global_step": 1015,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.3448275862068966,
13
+ "grad_norm": 174.2040557861328,
14
+ "learning_rate": 1.9802955665024632e-05,
15
+ "loss": 11.9232,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.6896551724137931,
20
+ "grad_norm": 54.92245101928711,
21
+ "learning_rate": 1.9605911330049263e-05,
22
+ "loss": 6.0635,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 1.0,
27
+ "eval_loss": 0.8173177242279053,
28
+ "eval_runtime": 0.1894,
29
+ "eval_samples_per_second": 31.679,
30
+ "eval_steps_per_second": 10.56,
31
+ "step": 29
32
+ },
33
+ {
34
+ "epoch": 1.0344827586206897,
35
+ "grad_norm": 16.176136016845703,
36
+ "learning_rate": 1.9408866995073893e-05,
37
+ "loss": 2.7637,
38
+ "step": 30
39
+ },
40
+ {
41
+ "epoch": 1.3793103448275863,
42
+ "grad_norm": 5.2406907081604,
43
+ "learning_rate": 1.9211822660098524e-05,
44
+ "loss": 1.0566,
45
+ "step": 40
46
+ },
47
+ {
48
+ "epoch": 1.7241379310344827,
49
+ "grad_norm": 3.6556057929992676,
50
+ "learning_rate": 1.9014778325123154e-05,
51
+ "loss": 0.6543,
52
+ "step": 50
53
+ },
54
+ {
55
+ "epoch": 2.0,
56
+ "eval_loss": 0.26999202370643616,
57
+ "eval_runtime": 0.1963,
58
+ "eval_samples_per_second": 30.571,
59
+ "eval_steps_per_second": 10.19,
60
+ "step": 58
61
+ },
62
+ {
63
+ "epoch": 2.0689655172413794,
64
+ "grad_norm": 1.8488504886627197,
65
+ "learning_rate": 1.8817733990147784e-05,
66
+ "loss": 0.4713,
67
+ "step": 60
68
+ },
69
+ {
70
+ "epoch": 2.413793103448276,
71
+ "grad_norm": 2.723362922668457,
72
+ "learning_rate": 1.8620689655172415e-05,
73
+ "loss": 0.3144,
74
+ "step": 70
75
+ },
76
+ {
77
+ "epoch": 2.7586206896551726,
78
+ "grad_norm": 1.8972634077072144,
79
+ "learning_rate": 1.8423645320197045e-05,
80
+ "loss": 0.2604,
81
+ "step": 80
82
+ },
83
+ {
84
+ "epoch": 3.0,
85
+ "eval_loss": 0.09994816780090332,
86
+ "eval_runtime": 0.1912,
87
+ "eval_samples_per_second": 31.381,
88
+ "eval_steps_per_second": 10.46,
89
+ "step": 87
90
+ },
91
+ {
92
+ "epoch": 3.103448275862069,
93
+ "grad_norm": 3.115511417388916,
94
+ "learning_rate": 1.8226600985221676e-05,
95
+ "loss": 0.2067,
96
+ "step": 90
97
+ },
98
+ {
99
+ "epoch": 3.4482758620689653,
100
+ "grad_norm": 1.7388259172439575,
101
+ "learning_rate": 1.8029556650246306e-05,
102
+ "loss": 0.1494,
103
+ "step": 100
104
+ },
105
+ {
106
+ "epoch": 3.793103448275862,
107
+ "grad_norm": 1.2075275182724,
108
+ "learning_rate": 1.7832512315270937e-05,
109
+ "loss": 0.1411,
110
+ "step": 110
111
+ },
112
+ {
113
+ "epoch": 4.0,
114
+ "eval_loss": 0.05977391079068184,
115
+ "eval_runtime": 0.1933,
116
+ "eval_samples_per_second": 31.034,
117
+ "eval_steps_per_second": 10.345,
118
+ "step": 116
119
+ },
120
+ {
121
+ "epoch": 4.137931034482759,
122
+ "grad_norm": 1.7329133749008179,
123
+ "learning_rate": 1.7635467980295567e-05,
124
+ "loss": 0.1196,
125
+ "step": 120
126
+ },
127
+ {
128
+ "epoch": 4.482758620689655,
129
+ "grad_norm": 2.210278272628784,
130
+ "learning_rate": 1.7438423645320198e-05,
131
+ "loss": 0.1132,
132
+ "step": 130
133
+ },
134
+ {
135
+ "epoch": 4.827586206896552,
136
+ "grad_norm": 1.0056334733963013,
137
+ "learning_rate": 1.7241379310344828e-05,
138
+ "loss": 0.0789,
139
+ "step": 140
140
+ },
141
+ {
142
+ "epoch": 5.0,
143
+ "eval_loss": 0.04751617833971977,
144
+ "eval_runtime": 0.1898,
145
+ "eval_samples_per_second": 31.605,
146
+ "eval_steps_per_second": 10.535,
147
+ "step": 145
148
+ },
149
+ {
150
+ "epoch": 5.172413793103448,
151
+ "grad_norm": 0.9742516279220581,
152
+ "learning_rate": 1.704433497536946e-05,
153
+ "loss": 0.0927,
154
+ "step": 150
155
+ },
156
+ {
157
+ "epoch": 5.517241379310345,
158
+ "grad_norm": 1.4696099758148193,
159
+ "learning_rate": 1.684729064039409e-05,
160
+ "loss": 0.0837,
161
+ "step": 160
162
+ },
163
+ {
164
+ "epoch": 5.862068965517241,
165
+ "grad_norm": 1.0493124723434448,
166
+ "learning_rate": 1.665024630541872e-05,
167
+ "loss": 0.0689,
168
+ "step": 170
169
+ },
170
+ {
171
+ "epoch": 6.0,
172
+ "eval_loss": 0.03317258134484291,
173
+ "eval_runtime": 0.2045,
174
+ "eval_samples_per_second": 29.343,
175
+ "eval_steps_per_second": 9.781,
176
+ "step": 174
177
+ },
178
+ {
179
+ "epoch": 6.206896551724138,
180
+ "grad_norm": 0.9956067204475403,
181
+ "learning_rate": 1.645320197044335e-05,
182
+ "loss": 0.0702,
183
+ "step": 180
184
+ },
185
+ {
186
+ "epoch": 6.551724137931035,
187
+ "grad_norm": 0.4664933979511261,
188
+ "learning_rate": 1.625615763546798e-05,
189
+ "loss": 0.0675,
190
+ "step": 190
191
+ },
192
+ {
193
+ "epoch": 6.896551724137931,
194
+ "grad_norm": 1.2444266080856323,
195
+ "learning_rate": 1.605911330049261e-05,
196
+ "loss": 0.0596,
197
+ "step": 200
198
+ },
199
+ {
200
+ "epoch": 7.0,
201
+ "eval_loss": 0.030222313478589058,
202
+ "eval_runtime": 0.1913,
203
+ "eval_samples_per_second": 31.365,
204
+ "eval_steps_per_second": 10.455,
205
+ "step": 203
206
+ },
207
+ {
208
+ "epoch": 7.241379310344827,
209
+ "grad_norm": 0.5432140827178955,
210
+ "learning_rate": 1.586206896551724e-05,
211
+ "loss": 0.045,
212
+ "step": 210
213
+ },
214
+ {
215
+ "epoch": 7.586206896551724,
216
+ "grad_norm": 0.7679450511932373,
217
+ "learning_rate": 1.5665024630541875e-05,
218
+ "loss": 0.0538,
219
+ "step": 220
220
+ },
221
+ {
222
+ "epoch": 7.931034482758621,
223
+ "grad_norm": 0.7759860754013062,
224
+ "learning_rate": 1.5467980295566506e-05,
225
+ "loss": 0.0624,
226
+ "step": 230
227
+ },
228
+ {
229
+ "epoch": 8.0,
230
+ "eval_loss": 0.02808324061334133,
231
+ "eval_runtime": 0.2003,
232
+ "eval_samples_per_second": 29.953,
233
+ "eval_steps_per_second": 9.984,
234
+ "step": 232
235
+ },
236
+ {
237
+ "epoch": 8.275862068965518,
238
+ "grad_norm": 1.7437331676483154,
239
+ "learning_rate": 1.5270935960591133e-05,
240
+ "loss": 0.0369,
241
+ "step": 240
242
+ },
243
+ {
244
+ "epoch": 8.620689655172415,
245
+ "grad_norm": 0.5273000597953796,
246
+ "learning_rate": 1.5073891625615764e-05,
247
+ "loss": 0.0499,
248
+ "step": 250
249
+ },
250
+ {
251
+ "epoch": 8.96551724137931,
252
+ "grad_norm": 0.6120426058769226,
253
+ "learning_rate": 1.4876847290640396e-05,
254
+ "loss": 0.0425,
255
+ "step": 260
256
+ },
257
+ {
258
+ "epoch": 9.0,
259
+ "eval_loss": 0.03035571426153183,
260
+ "eval_runtime": 0.1888,
261
+ "eval_samples_per_second": 31.778,
262
+ "eval_steps_per_second": 10.593,
263
+ "step": 261
264
+ },
265
+ {
266
+ "epoch": 9.310344827586206,
267
+ "grad_norm": 1.587663173675537,
268
+ "learning_rate": 1.4679802955665026e-05,
269
+ "loss": 0.0395,
270
+ "step": 270
271
+ },
272
+ {
273
+ "epoch": 9.655172413793103,
274
+ "grad_norm": 0.7260332703590393,
275
+ "learning_rate": 1.4482758620689657e-05,
276
+ "loss": 0.0493,
277
+ "step": 280
278
+ },
279
+ {
280
+ "epoch": 10.0,
281
+ "grad_norm": 0.9717508554458618,
282
+ "learning_rate": 1.4285714285714287e-05,
283
+ "loss": 0.0424,
284
+ "step": 290
285
+ },
286
+ {
287
+ "epoch": 10.0,
288
+ "eval_loss": 0.026125147938728333,
289
+ "eval_runtime": 0.0989,
290
+ "eval_samples_per_second": 60.684,
291
+ "eval_steps_per_second": 20.228,
292
+ "step": 290
293
+ },
294
+ {
295
+ "epoch": 10.344827586206897,
296
+ "grad_norm": 0.7487574815750122,
297
+ "learning_rate": 1.4088669950738918e-05,
298
+ "loss": 0.0412,
299
+ "step": 300
300
+ },
301
+ {
302
+ "epoch": 10.689655172413794,
303
+ "grad_norm": 1.3717066049575806,
304
+ "learning_rate": 1.3891625615763548e-05,
305
+ "loss": 0.0296,
306
+ "step": 310
307
+ },
308
+ {
309
+ "epoch": 11.0,
310
+ "eval_loss": 0.031011082231998444,
311
+ "eval_runtime": 0.1922,
312
+ "eval_samples_per_second": 31.22,
313
+ "eval_steps_per_second": 10.407,
314
+ "step": 319
315
+ },
316
+ {
317
+ "epoch": 11.03448275862069,
318
+ "grad_norm": 0.6860368847846985,
319
+ "learning_rate": 1.369458128078818e-05,
320
+ "loss": 0.0411,
321
+ "step": 320
322
+ },
323
+ {
324
+ "epoch": 11.379310344827585,
325
+ "grad_norm": 0.9999271035194397,
326
+ "learning_rate": 1.3497536945812807e-05,
327
+ "loss": 0.0358,
328
+ "step": 330
329
+ },
330
+ {
331
+ "epoch": 11.724137931034482,
332
+ "grad_norm": 1.2313721179962158,
333
+ "learning_rate": 1.330049261083744e-05,
334
+ "loss": 0.0324,
335
+ "step": 340
336
+ },
337
+ {
338
+ "epoch": 12.0,
339
+ "eval_loss": 0.029439905658364296,
340
+ "eval_runtime": 0.2015,
341
+ "eval_samples_per_second": 29.784,
342
+ "eval_steps_per_second": 9.928,
343
+ "step": 348
344
+ },
345
+ {
346
+ "epoch": 12.068965517241379,
347
+ "grad_norm": 0.9032502174377441,
348
+ "learning_rate": 1.310344827586207e-05,
349
+ "loss": 0.041,
350
+ "step": 350
351
+ },
352
+ {
353
+ "epoch": 12.413793103448276,
354
+ "grad_norm": 0.612800657749176,
355
+ "learning_rate": 1.29064039408867e-05,
356
+ "loss": 0.0384,
357
+ "step": 360
358
+ },
359
+ {
360
+ "epoch": 12.758620689655173,
361
+ "grad_norm": 0.19512540102005005,
362
+ "learning_rate": 1.2709359605911331e-05,
363
+ "loss": 0.0349,
364
+ "step": 370
365
+ },
366
+ {
367
+ "epoch": 13.0,
368
+ "eval_loss": 0.02956104278564453,
369
+ "eval_runtime": 0.1909,
370
+ "eval_samples_per_second": 31.424,
371
+ "eval_steps_per_second": 10.475,
372
+ "step": 377
373
+ },
374
+ {
375
+ "epoch": 13.10344827586207,
376
+ "grad_norm": 0.8481155633926392,
377
+ "learning_rate": 1.2512315270935961e-05,
378
+ "loss": 0.0271,
379
+ "step": 380
380
+ },
381
+ {
382
+ "epoch": 13.448275862068966,
383
+ "grad_norm": 1.3683249950408936,
384
+ "learning_rate": 1.2315270935960592e-05,
385
+ "loss": 0.0265,
386
+ "step": 390
387
+ },
388
+ {
389
+ "epoch": 13.793103448275861,
390
+ "grad_norm": 0.6365839838981628,
391
+ "learning_rate": 1.2118226600985224e-05,
392
+ "loss": 0.0298,
393
+ "step": 400
394
+ },
395
+ {
396
+ "epoch": 14.0,
397
+ "eval_loss": 0.03038203716278076,
398
+ "eval_runtime": 0.1902,
399
+ "eval_samples_per_second": 31.538,
400
+ "eval_steps_per_second": 10.513,
401
+ "step": 406
402
+ },
403
+ {
404
+ "epoch": 14.137931034482758,
405
+ "grad_norm": 0.7672634124755859,
406
+ "learning_rate": 1.1921182266009855e-05,
407
+ "loss": 0.0335,
408
+ "step": 410
409
+ },
410
+ {
411
+ "epoch": 14.482758620689655,
412
+ "grad_norm": 0.2541676163673401,
413
+ "learning_rate": 1.1724137931034483e-05,
414
+ "loss": 0.0286,
415
+ "step": 420
416
+ },
417
+ {
418
+ "epoch": 14.827586206896552,
419
+ "grad_norm": 0.8434980511665344,
420
+ "learning_rate": 1.1527093596059114e-05,
421
+ "loss": 0.0205,
422
+ "step": 430
423
+ },
424
+ {
425
+ "epoch": 15.0,
426
+ "eval_loss": 0.030394822359085083,
427
+ "eval_runtime": 0.19,
428
+ "eval_samples_per_second": 31.587,
429
+ "eval_steps_per_second": 10.529,
430
+ "step": 435
431
+ },
432
+ {
433
+ "epoch": 15.172413793103448,
434
+ "grad_norm": 0.4303562641143799,
435
+ "learning_rate": 1.1330049261083744e-05,
436
+ "loss": 0.0323,
437
+ "step": 440
438
+ },
439
+ {
440
+ "epoch": 15.517241379310345,
441
+ "grad_norm": 0.42710408568382263,
442
+ "learning_rate": 1.1133004926108375e-05,
443
+ "loss": 0.0227,
444
+ "step": 450
445
+ },
446
+ {
447
+ "epoch": 15.862068965517242,
448
+ "grad_norm": 0.6126664876937866,
449
+ "learning_rate": 1.0935960591133005e-05,
450
+ "loss": 0.0215,
451
+ "step": 460
452
+ },
453
+ {
454
+ "epoch": 16.0,
455
+ "eval_loss": 0.030341001227498055,
456
+ "eval_runtime": 0.1895,
457
+ "eval_samples_per_second": 31.669,
458
+ "eval_steps_per_second": 10.556,
459
+ "step": 464
460
+ },
461
+ {
462
+ "epoch": 16.20689655172414,
463
+ "grad_norm": 0.7011211514472961,
464
+ "learning_rate": 1.0738916256157637e-05,
465
+ "loss": 0.0337,
466
+ "step": 470
467
+ },
468
+ {
469
+ "epoch": 16.551724137931036,
470
+ "grad_norm": 0.5126092433929443,
471
+ "learning_rate": 1.0541871921182268e-05,
472
+ "loss": 0.0233,
473
+ "step": 480
474
+ },
475
+ {
476
+ "epoch": 16.896551724137932,
477
+ "grad_norm": 0.9033933281898499,
478
+ "learning_rate": 1.0344827586206898e-05,
479
+ "loss": 0.0182,
480
+ "step": 490
481
+ },
482
+ {
483
+ "epoch": 17.0,
484
+ "eval_loss": 0.027958964928984642,
485
+ "eval_runtime": 0.1902,
486
+ "eval_samples_per_second": 31.553,
487
+ "eval_steps_per_second": 10.518,
488
+ "step": 493
489
+ },
490
+ {
491
+ "epoch": 17.24137931034483,
492
+ "grad_norm": 0.4326847493648529,
493
+ "learning_rate": 1.0147783251231529e-05,
494
+ "loss": 0.0249,
495
+ "step": 500
496
+ },
497
+ {
498
+ "epoch": 17.586206896551722,
499
+ "grad_norm": 0.4034535586833954,
500
+ "learning_rate": 9.95073891625616e-06,
501
+ "loss": 0.0215,
502
+ "step": 510
503
+ },
504
+ {
505
+ "epoch": 17.93103448275862,
506
+ "grad_norm": 0.4999659061431885,
507
+ "learning_rate": 9.75369458128079e-06,
508
+ "loss": 0.0247,
509
+ "step": 520
510
+ },
511
+ {
512
+ "epoch": 18.0,
513
+ "eval_loss": 0.030091799795627594,
514
+ "eval_runtime": 0.1897,
515
+ "eval_samples_per_second": 31.624,
516
+ "eval_steps_per_second": 10.541,
517
+ "step": 522
518
+ },
519
+ {
520
+ "epoch": 18.275862068965516,
521
+ "grad_norm": 0.3004280924797058,
522
+ "learning_rate": 9.55665024630542e-06,
523
+ "loss": 0.0161,
524
+ "step": 530
525
+ },
526
+ {
527
+ "epoch": 18.620689655172413,
528
+ "grad_norm": 0.5018593072891235,
529
+ "learning_rate": 9.359605911330049e-06,
530
+ "loss": 0.0227,
531
+ "step": 540
532
+ },
533
+ {
534
+ "epoch": 18.96551724137931,
535
+ "grad_norm": 0.5728505849838257,
536
+ "learning_rate": 9.162561576354681e-06,
537
+ "loss": 0.0221,
538
+ "step": 550
539
+ },
540
+ {
541
+ "epoch": 19.0,
542
+ "eval_loss": 0.030765995383262634,
543
+ "eval_runtime": 0.1882,
544
+ "eval_samples_per_second": 31.886,
545
+ "eval_steps_per_second": 10.629,
546
+ "step": 551
547
+ },
548
+ {
549
+ "epoch": 19.310344827586206,
550
+ "grad_norm": 1.1493581533432007,
551
+ "learning_rate": 8.965517241379312e-06,
552
+ "loss": 0.0204,
553
+ "step": 560
554
+ },
555
+ {
556
+ "epoch": 19.655172413793103,
557
+ "grad_norm": 0.3401670753955841,
558
+ "learning_rate": 8.768472906403942e-06,
559
+ "loss": 0.0164,
560
+ "step": 570
561
+ },
562
+ {
563
+ "epoch": 20.0,
564
+ "grad_norm": 0.9875850081443787,
565
+ "learning_rate": 8.571428571428571e-06,
566
+ "loss": 0.0213,
567
+ "step": 580
568
+ },
569
+ {
570
+ "epoch": 20.0,
571
+ "eval_loss": 0.028663409873843193,
572
+ "eval_runtime": 0.1008,
573
+ "eval_samples_per_second": 59.511,
574
+ "eval_steps_per_second": 19.837,
575
+ "step": 580
576
+ },
577
+ {
578
+ "epoch": 20.344827586206897,
579
+ "grad_norm": 0.6725947260856628,
580
+ "learning_rate": 8.374384236453203e-06,
581
+ "loss": 0.0192,
582
+ "step": 590
583
+ },
584
+ {
585
+ "epoch": 20.689655172413794,
586
+ "grad_norm": 0.6594141721725464,
587
+ "learning_rate": 8.177339901477834e-06,
588
+ "loss": 0.0194,
589
+ "step": 600
590
+ },
591
+ {
592
+ "epoch": 21.0,
593
+ "eval_loss": 0.027452999725937843,
594
+ "eval_runtime": 0.1899,
595
+ "eval_samples_per_second": 31.602,
596
+ "eval_steps_per_second": 10.534,
597
+ "step": 609
598
+ },
599
+ {
600
+ "epoch": 21.03448275862069,
601
+ "grad_norm": 0.32469141483306885,
602
+ "learning_rate": 7.980295566502464e-06,
603
+ "loss": 0.0167,
604
+ "step": 610
605
+ },
606
+ {
607
+ "epoch": 21.379310344827587,
608
+ "grad_norm": 0.7029064893722534,
609
+ "learning_rate": 7.783251231527095e-06,
610
+ "loss": 0.0237,
611
+ "step": 620
612
+ },
613
+ {
614
+ "epoch": 21.724137931034484,
615
+ "grad_norm": 0.5634991526603699,
616
+ "learning_rate": 7.586206896551724e-06,
617
+ "loss": 0.0212,
618
+ "step": 630
619
+ },
620
+ {
621
+ "epoch": 22.0,
622
+ "eval_loss": 0.027833983302116394,
623
+ "eval_runtime": 0.1913,
624
+ "eval_samples_per_second": 31.357,
625
+ "eval_steps_per_second": 10.452,
626
+ "step": 638
627
+ },
628
+ {
629
+ "epoch": 22.06896551724138,
630
+ "grad_norm": 0.7171387672424316,
631
+ "learning_rate": 7.3891625615763555e-06,
632
+ "loss": 0.0204,
633
+ "step": 640
634
+ },
635
+ {
636
+ "epoch": 22.413793103448278,
637
+ "grad_norm": 0.3534681797027588,
638
+ "learning_rate": 7.192118226600986e-06,
639
+ "loss": 0.0162,
640
+ "step": 650
641
+ },
642
+ {
643
+ "epoch": 22.75862068965517,
644
+ "grad_norm": 0.2446085512638092,
645
+ "learning_rate": 6.995073891625616e-06,
646
+ "loss": 0.0178,
647
+ "step": 660
648
+ },
649
+ {
650
+ "epoch": 23.0,
651
+ "eval_loss": 0.0313410721719265,
652
+ "eval_runtime": 0.1923,
653
+ "eval_samples_per_second": 31.2,
654
+ "eval_steps_per_second": 10.4,
655
+ "step": 667
656
+ },
657
+ {
658
+ "epoch": 23.103448275862068,
659
+ "grad_norm": 0.3782992959022522,
660
+ "learning_rate": 6.798029556650246e-06,
661
+ "loss": 0.0214,
662
+ "step": 670
663
+ },
664
+ {
665
+ "epoch": 23.448275862068964,
666
+ "grad_norm": 0.2016129344701767,
667
+ "learning_rate": 6.600985221674877e-06,
668
+ "loss": 0.0167,
669
+ "step": 680
670
+ },
671
+ {
672
+ "epoch": 23.79310344827586,
673
+ "grad_norm": 0.3993048667907715,
674
+ "learning_rate": 6.403940886699508e-06,
675
+ "loss": 0.0217,
676
+ "step": 690
677
+ },
678
+ {
679
+ "epoch": 24.0,
680
+ "eval_loss": 0.030709436163306236,
681
+ "eval_runtime": 0.1914,
682
+ "eval_samples_per_second": 31.345,
683
+ "eval_steps_per_second": 10.448,
684
+ "step": 696
685
+ },
686
+ {
687
+ "epoch": 24.137931034482758,
688
+ "grad_norm": 0.7120524048805237,
689
+ "learning_rate": 6.206896551724138e-06,
690
+ "loss": 0.0227,
691
+ "step": 700
692
+ },
693
+ {
694
+ "epoch": 24.482758620689655,
695
+ "grad_norm": 0.3477053940296173,
696
+ "learning_rate": 6.00985221674877e-06,
697
+ "loss": 0.0203,
698
+ "step": 710
699
+ },
700
+ {
701
+ "epoch": 24.82758620689655,
702
+ "grad_norm": 0.30051225423812866,
703
+ "learning_rate": 5.812807881773399e-06,
704
+ "loss": 0.0118,
705
+ "step": 720
706
+ },
707
+ {
708
+ "epoch": 25.0,
709
+ "eval_loss": 0.02974347025156021,
710
+ "eval_runtime": 0.1936,
711
+ "eval_samples_per_second": 30.992,
712
+ "eval_steps_per_second": 10.331,
713
+ "step": 725
714
+ },
715
+ {
716
+ "epoch": 25.17241379310345,
717
+ "grad_norm": 0.38762152194976807,
718
+ "learning_rate": 5.61576354679803e-06,
719
+ "loss": 0.0161,
720
+ "step": 730
721
+ },
722
+ {
723
+ "epoch": 25.517241379310345,
724
+ "grad_norm": 0.48786938190460205,
725
+ "learning_rate": 5.41871921182266e-06,
726
+ "loss": 0.0153,
727
+ "step": 740
728
+ },
729
+ {
730
+ "epoch": 25.862068965517242,
731
+ "grad_norm": 0.273346483707428,
732
+ "learning_rate": 5.2216748768472915e-06,
733
+ "loss": 0.0189,
734
+ "step": 750
735
+ },
736
+ {
737
+ "epoch": 26.0,
738
+ "eval_loss": 0.031065121293067932,
739
+ "eval_runtime": 0.1912,
740
+ "eval_samples_per_second": 31.387,
741
+ "eval_steps_per_second": 10.462,
742
+ "step": 754
743
+ },
744
+ {
745
+ "epoch": 26.20689655172414,
746
+ "grad_norm": 0.8301162123680115,
747
+ "learning_rate": 5.024630541871922e-06,
748
+ "loss": 0.0194,
749
+ "step": 760
750
+ },
751
+ {
752
+ "epoch": 26.551724137931036,
753
+ "grad_norm": 0.46895724534988403,
754
+ "learning_rate": 4.8275862068965525e-06,
755
+ "loss": 0.0149,
756
+ "step": 770
757
+ },
758
+ {
759
+ "epoch": 26.896551724137932,
760
+ "grad_norm": 0.4784580171108246,
761
+ "learning_rate": 4.630541871921182e-06,
762
+ "loss": 0.0185,
763
+ "step": 780
764
+ },
765
+ {
766
+ "epoch": 27.0,
767
+ "eval_loss": 0.029767701402306557,
768
+ "eval_runtime": 0.1921,
769
+ "eval_samples_per_second": 31.24,
770
+ "eval_steps_per_second": 10.413,
771
+ "step": 783
772
+ },
773
+ {
774
+ "epoch": 27.24137931034483,
775
+ "grad_norm": 0.7427147626876831,
776
+ "learning_rate": 4.4334975369458135e-06,
777
+ "loss": 0.0155,
778
+ "step": 790
779
+ },
780
+ {
781
+ "epoch": 27.586206896551722,
782
+ "grad_norm": 0.4144653081893921,
783
+ "learning_rate": 4.236453201970444e-06,
784
+ "loss": 0.0159,
785
+ "step": 800
786
+ },
787
+ {
788
+ "epoch": 27.93103448275862,
789
+ "grad_norm": 0.29942747950553894,
790
+ "learning_rate": 4.039408866995074e-06,
791
+ "loss": 0.021,
792
+ "step": 810
793
+ },
794
+ {
795
+ "epoch": 28.0,
796
+ "eval_loss": 0.029253564774990082,
797
+ "eval_runtime": 0.1914,
798
+ "eval_samples_per_second": 31.352,
799
+ "eval_steps_per_second": 10.451,
800
+ "step": 812
801
+ },
802
+ {
803
+ "epoch": 28.275862068965516,
804
+ "grad_norm": 0.21119730174541473,
805
+ "learning_rate": 3.842364532019705e-06,
806
+ "loss": 0.0153,
807
+ "step": 820
808
+ },
809
+ {
810
+ "epoch": 28.620689655172413,
811
+ "grad_norm": 0.2980373501777649,
812
+ "learning_rate": 3.6453201970443354e-06,
813
+ "loss": 0.0189,
814
+ "step": 830
815
+ },
816
+ {
817
+ "epoch": 28.96551724137931,
818
+ "grad_norm": 0.09113238751888275,
819
+ "learning_rate": 3.448275862068966e-06,
820
+ "loss": 0.0102,
821
+ "step": 840
822
+ },
823
+ {
824
+ "epoch": 29.0,
825
+ "eval_loss": 0.028439467772841454,
826
+ "eval_runtime": 0.1908,
827
+ "eval_samples_per_second": 31.449,
828
+ "eval_steps_per_second": 10.483,
829
+ "step": 841
830
+ },
831
+ {
832
+ "epoch": 29.310344827586206,
833
+ "grad_norm": 0.3905262351036072,
834
+ "learning_rate": 3.2512315270935963e-06,
835
+ "loss": 0.0153,
836
+ "step": 850
837
+ },
838
+ {
839
+ "epoch": 29.655172413793103,
840
+ "grad_norm": 0.36752381920814514,
841
+ "learning_rate": 3.054187192118227e-06,
842
+ "loss": 0.014,
843
+ "step": 860
844
+ },
845
+ {
846
+ "epoch": 30.0,
847
+ "grad_norm": 0.6769506931304932,
848
+ "learning_rate": 2.8571428571428573e-06,
849
+ "loss": 0.018,
850
+ "step": 870
851
+ },
852
+ {
853
+ "epoch": 30.0,
854
+ "eval_loss": 0.02924039028584957,
855
+ "eval_runtime": 0.102,
856
+ "eval_samples_per_second": 58.797,
857
+ "eval_steps_per_second": 19.599,
858
+ "step": 870
859
+ },
860
+ {
861
+ "epoch": 30.344827586206897,
862
+ "grad_norm": 0.676186740398407,
863
+ "learning_rate": 2.660098522167488e-06,
864
+ "loss": 0.0202,
865
+ "step": 880
866
+ },
867
+ {
868
+ "epoch": 30.689655172413794,
869
+ "grad_norm": 0.2909271717071533,
870
+ "learning_rate": 2.4630541871921186e-06,
871
+ "loss": 0.0105,
872
+ "step": 890
873
+ },
874
+ {
875
+ "epoch": 31.0,
876
+ "eval_loss": 0.030303308740258217,
877
+ "eval_runtime": 0.191,
878
+ "eval_samples_per_second": 31.415,
879
+ "eval_steps_per_second": 10.472,
880
+ "step": 899
881
+ },
882
+ {
883
+ "epoch": 31.03448275862069,
884
+ "grad_norm": 0.8155515193939209,
885
+ "learning_rate": 2.266009852216749e-06,
886
+ "loss": 0.0174,
887
+ "step": 900
888
+ },
889
+ {
890
+ "epoch": 31.379310344827587,
891
+ "grad_norm": 0.5190662741661072,
892
+ "learning_rate": 2.0689655172413796e-06,
893
+ "loss": 0.0157,
894
+ "step": 910
895
+ },
896
+ {
897
+ "epoch": 31.724137931034484,
898
+ "grad_norm": 0.8997210264205933,
899
+ "learning_rate": 1.8719211822660098e-06,
900
+ "loss": 0.0155,
901
+ "step": 920
902
+ },
903
+ {
904
+ "epoch": 32.0,
905
+ "eval_loss": 0.030399195849895477,
906
+ "eval_runtime": 0.1899,
907
+ "eval_samples_per_second": 31.593,
908
+ "eval_steps_per_second": 10.531,
909
+ "step": 928
910
+ },
911
+ {
912
+ "epoch": 32.06896551724138,
913
+ "grad_norm": 0.5701755881309509,
914
+ "learning_rate": 1.6748768472906405e-06,
915
+ "loss": 0.0139,
916
+ "step": 930
917
+ },
918
+ {
919
+ "epoch": 32.41379310344828,
920
+ "grad_norm": 0.16824859380722046,
921
+ "learning_rate": 1.4778325123152712e-06,
922
+ "loss": 0.0113,
923
+ "step": 940
924
+ },
925
+ {
926
+ "epoch": 32.758620689655174,
927
+ "grad_norm": 0.5066978931427002,
928
+ "learning_rate": 1.2807881773399017e-06,
929
+ "loss": 0.0127,
930
+ "step": 950
931
+ },
932
+ {
933
+ "epoch": 33.0,
934
+ "eval_loss": 0.030721982941031456,
935
+ "eval_runtime": 0.191,
936
+ "eval_samples_per_second": 31.41,
937
+ "eval_steps_per_second": 10.47,
938
+ "step": 957
939
+ },
940
+ {
941
+ "epoch": 33.10344827586207,
942
+ "grad_norm": 0.5038828253746033,
943
+ "learning_rate": 1.0837438423645322e-06,
944
+ "loss": 0.0138,
945
+ "step": 960
946
+ },
947
+ {
948
+ "epoch": 33.44827586206897,
949
+ "grad_norm": 0.4406326115131378,
950
+ "learning_rate": 8.866995073891626e-07,
951
+ "loss": 0.0116,
952
+ "step": 970
953
+ },
954
+ {
955
+ "epoch": 33.793103448275865,
956
+ "grad_norm": 0.3732337951660156,
957
+ "learning_rate": 6.896551724137931e-07,
958
+ "loss": 0.0171,
959
+ "step": 980
960
+ },
961
+ {
962
+ "epoch": 34.0,
963
+ "eval_loss": 0.03042110800743103,
964
+ "eval_runtime": 0.191,
965
+ "eval_samples_per_second": 31.415,
966
+ "eval_steps_per_second": 10.472,
967
+ "step": 986
968
+ },
969
+ {
970
+ "epoch": 34.13793103448276,
971
+ "grad_norm": 0.32739463448524475,
972
+ "learning_rate": 4.926108374384237e-07,
973
+ "loss": 0.0186,
974
+ "step": 990
975
+ },
976
+ {
977
+ "epoch": 34.48275862068966,
978
+ "grad_norm": 0.45314541459083557,
979
+ "learning_rate": 2.955665024630542e-07,
980
+ "loss": 0.0108,
981
+ "step": 1000
982
+ },
983
+ {
984
+ "epoch": 34.827586206896555,
985
+ "grad_norm": 0.8981533050537109,
986
+ "learning_rate": 9.852216748768474e-08,
987
+ "loss": 0.0157,
988
+ "step": 1010
989
+ }
990
+ ],
991
+ "logging_steps": 10,
992
+ "max_steps": 1015,
993
+ "num_input_tokens_seen": 0,
994
+ "num_train_epochs": 35,
995
+ "save_steps": 500,
996
+ "stateful_callbacks": {
997
+ "TrainerControl": {
998
+ "args": {
999
+ "should_epoch_stop": false,
1000
+ "should_evaluate": false,
1001
+ "should_log": false,
1002
+ "should_save": true,
1003
+ "should_training_stop": true
1004
+ },
1005
+ "attributes": {}
1006
+ }
1007
+ },
1008
+ "total_flos": 2159634677760000.0,
1009
+ "train_batch_size": 4,
1010
+ "trial_name": null,
1011
+ "trial_params": null
1012
+ }
json_extraction_point_activity/checkpoint-1015/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:333cc851b675d7a01620eeba59cdfbc1a624d74927d086ab660bd04fef9b6029
3
+ size 5240
json_extraction_point_activity/checkpoint-500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dac1c68bd5ddfee1bd36e872a6fb47a5f809fc491c8d61b7ee22e498a0b7c3d7
3
  size 2950734544
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87363b3a1b2a51942b6c278f69df9a806706e5d83c60c65870a507e043356124
3
  size 2950734544
json_extraction_point_activity/checkpoint-500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:33915ed8c8c005cb2285418a69ed4291c58470f07ae0d5f91aed2346d287361e
3
  size 5901778825
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26c2d8aae51544bc1ae1adf721bf158fe114281c194c7e88ada3907e4afe56eb
3
  size 5901778825
json_extraction_point_activity/checkpoint-500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9d8b889645e4f402ba04e157cdd08a05cb68dd7d9f11bb493b1bf76fa6eb8a7e
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2533398c3a2262d5247b0295eea340c12039f2768e0c6be8c67c59019ab0553c
3
  size 14244
json_extraction_point_activity/checkpoint-500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03e6c1dc4044f0c3031e90c1485e3784294b3b11dfa0721002bf880df9139935
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fac53bd556f9070ef9763eb8ffdc7241c2002524738db57098477c02123b9841
3
  size 1064
json_extraction_point_activity/checkpoint-500/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 20.833333333333332,
5
  "eval_steps": 500,
6
  "global_step": 500,
7
  "is_hyper_param_search": false,
@@ -9,518 +9,494 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.4166666666666667,
13
- "grad_norm": 57.950931549072266,
14
- "learning_rate": 1.9761904761904763e-05,
15
- "loss": 12.3716,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.8333333333333334,
20
- "grad_norm": 52.104652404785156,
21
- "learning_rate": 1.9523809523809524e-05,
22
- "loss": 7.0679,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 1.0,
27
- "eval_loss": 2.7666285037994385,
28
- "eval_runtime": 0.1913,
29
- "eval_samples_per_second": 26.135,
30
- "eval_steps_per_second": 10.454,
31
- "step": 24
32
  },
33
  {
34
- "epoch": 1.25,
35
- "grad_norm": 31.3590087890625,
36
- "learning_rate": 1.928571428571429e-05,
37
- "loss": 3.499,
38
  "step": 30
39
  },
40
  {
41
- "epoch": 1.6666666666666665,
42
- "grad_norm": 5.804717063903809,
43
- "learning_rate": 1.904761904761905e-05,
44
- "loss": 1.3892,
45
  "step": 40
46
  },
47
  {
48
- "epoch": 2.0,
49
- "eval_loss": 0.43550997972488403,
50
- "eval_runtime": 0.1886,
51
- "eval_samples_per_second": 26.504,
52
- "eval_steps_per_second": 10.602,
53
- "step": 48
54
  },
55
  {
56
- "epoch": 2.0833333333333335,
57
- "grad_norm": 3.528578042984009,
58
- "learning_rate": 1.880952380952381e-05,
59
- "loss": 0.7215,
60
- "step": 50
 
61
  },
62
  {
63
- "epoch": 2.5,
64
- "grad_norm": 2.6735429763793945,
65
- "learning_rate": 1.8571428571428575e-05,
66
- "loss": 0.5029,
67
  "step": 60
68
  },
69
  {
70
- "epoch": 2.9166666666666665,
71
- "grad_norm": 2.6698122024536133,
72
- "learning_rate": 1.8333333333333333e-05,
73
- "loss": 0.3523,
74
  "step": 70
75
  },
76
  {
77
- "epoch": 3.0,
78
- "eval_loss": 0.15193703770637512,
79
- "eval_runtime": 0.1931,
80
- "eval_samples_per_second": 25.899,
81
- "eval_steps_per_second": 10.36,
82
- "step": 72
83
- },
84
- {
85
- "epoch": 3.3333333333333335,
86
- "grad_norm": 1.314430832862854,
87
- "learning_rate": 1.8095238095238097e-05,
88
- "loss": 0.2508,
89
  "step": 80
90
  },
91
  {
92
- "epoch": 3.75,
93
- "grad_norm": 1.6598138809204102,
94
- "learning_rate": 1.785714285714286e-05,
95
- "loss": 0.2041,
96
- "step": 90
 
97
  },
98
  {
99
- "epoch": 4.0,
100
- "eval_loss": 0.09314510226249695,
101
- "eval_runtime": 0.1918,
102
- "eval_samples_per_second": 26.075,
103
- "eval_steps_per_second": 10.43,
104
- "step": 96
105
  },
106
  {
107
- "epoch": 4.166666666666667,
108
- "grad_norm": 1.4283037185668945,
109
- "learning_rate": 1.761904761904762e-05,
110
- "loss": 0.1733,
111
  "step": 100
112
  },
113
  {
114
- "epoch": 4.583333333333333,
115
- "grad_norm": 1.2669694423675537,
116
- "learning_rate": 1.7380952380952384e-05,
117
- "loss": 0.1301,
118
  "step": 110
119
  },
120
  {
121
- "epoch": 5.0,
122
- "grad_norm": 1.077564001083374,
123
- "learning_rate": 1.7142857142857142e-05,
124
- "loss": 0.131,
125
- "step": 120
 
126
  },
127
  {
128
- "epoch": 5.0,
129
- "eval_loss": 0.07609681040048599,
130
- "eval_runtime": 0.0991,
131
- "eval_samples_per_second": 50.479,
132
- "eval_steps_per_second": 20.192,
133
  "step": 120
134
  },
135
  {
136
- "epoch": 5.416666666666667,
137
- "grad_norm": 1.0863076448440552,
138
- "learning_rate": 1.6904761904761906e-05,
139
- "loss": 0.0968,
140
  "step": 130
141
  },
142
  {
143
- "epoch": 5.833333333333333,
144
- "grad_norm": 0.9947916865348816,
145
- "learning_rate": 1.6666666666666667e-05,
146
- "loss": 0.1015,
147
  "step": 140
148
  },
149
  {
150
- "epoch": 6.0,
151
- "eval_loss": 0.06843644380569458,
152
- "eval_runtime": 0.191,
153
- "eval_samples_per_second": 26.175,
154
- "eval_steps_per_second": 10.47,
155
- "step": 144
156
  },
157
  {
158
- "epoch": 6.25,
159
- "grad_norm": 0.8523992896080017,
160
- "learning_rate": 1.642857142857143e-05,
161
- "loss": 0.0789,
162
  "step": 150
163
  },
164
  {
165
- "epoch": 6.666666666666667,
166
- "grad_norm": 2.2475197315216064,
167
- "learning_rate": 1.6190476190476193e-05,
168
- "loss": 0.0831,
169
  "step": 160
170
  },
171
  {
172
- "epoch": 7.0,
173
- "eval_loss": 0.06810685992240906,
174
- "eval_runtime": 0.1922,
175
- "eval_samples_per_second": 26.014,
176
- "eval_steps_per_second": 10.405,
177
- "step": 168
178
  },
179
  {
180
- "epoch": 7.083333333333333,
181
- "grad_norm": 1.283435344696045,
182
- "learning_rate": 1.5952380952380954e-05,
183
- "loss": 0.0726,
184
- "step": 170
 
185
  },
186
  {
187
- "epoch": 7.5,
188
- "grad_norm": 0.7859775424003601,
189
- "learning_rate": 1.5714285714285715e-05,
190
- "loss": 0.0751,
191
  "step": 180
192
  },
193
  {
194
- "epoch": 7.916666666666667,
195
- "grad_norm": 1.2889643907546997,
196
- "learning_rate": 1.5476190476190476e-05,
197
- "loss": 0.0541,
198
  "step": 190
199
  },
200
  {
201
- "epoch": 8.0,
202
- "eval_loss": 0.06697859615087509,
203
- "eval_runtime": 0.1877,
204
- "eval_samples_per_second": 26.637,
205
- "eval_steps_per_second": 10.655,
206
- "step": 192
207
- },
208
- {
209
- "epoch": 8.333333333333334,
210
- "grad_norm": 1.0495927333831787,
211
- "learning_rate": 1.523809523809524e-05,
212
- "loss": 0.0474,
213
  "step": 200
214
  },
215
  {
216
- "epoch": 8.75,
217
- "grad_norm": 0.7832169532775879,
218
- "learning_rate": 1.5000000000000002e-05,
219
- "loss": 0.0565,
220
- "step": 210
 
221
  },
222
  {
223
- "epoch": 9.0,
224
- "eval_loss": 0.06908619403839111,
225
- "eval_runtime": 0.19,
226
- "eval_samples_per_second": 26.312,
227
- "eval_steps_per_second": 10.525,
228
- "step": 216
229
  },
230
  {
231
- "epoch": 9.166666666666666,
232
- "grad_norm": 1.551951289176941,
233
- "learning_rate": 1.4761904761904763e-05,
234
- "loss": 0.0509,
235
  "step": 220
236
  },
237
  {
238
- "epoch": 9.583333333333334,
239
- "grad_norm": 0.6954114437103271,
240
- "learning_rate": 1.4523809523809524e-05,
241
- "loss": 0.0487,
242
  "step": 230
243
  },
244
  {
245
- "epoch": 10.0,
246
- "grad_norm": 1.1518425941467285,
247
- "learning_rate": 1.4285714285714287e-05,
248
- "loss": 0.0421,
249
- "step": 240
 
250
  },
251
  {
252
- "epoch": 10.0,
253
- "eval_loss": 0.06548431515693665,
254
- "eval_runtime": 0.0985,
255
- "eval_samples_per_second": 50.777,
256
- "eval_steps_per_second": 20.311,
257
  "step": 240
258
  },
259
  {
260
- "epoch": 10.416666666666666,
261
- "grad_norm": 0.5938514471054077,
262
- "learning_rate": 1.4047619047619048e-05,
263
- "loss": 0.037,
264
  "step": 250
265
  },
266
  {
267
- "epoch": 10.833333333333334,
268
- "grad_norm": 0.6409568190574646,
269
- "learning_rate": 1.3809523809523811e-05,
270
- "loss": 0.0538,
271
  "step": 260
272
  },
273
  {
274
- "epoch": 11.0,
275
- "eval_loss": 0.06579773128032684,
276
- "eval_runtime": 0.19,
277
- "eval_samples_per_second": 26.317,
278
- "eval_steps_per_second": 10.527,
279
- "step": 264
280
  },
281
  {
282
- "epoch": 11.25,
283
- "grad_norm": 0.4954162836074829,
284
- "learning_rate": 1.3571428571428574e-05,
285
- "loss": 0.0418,
286
  "step": 270
287
  },
288
  {
289
- "epoch": 11.666666666666666,
290
- "grad_norm": 1.2319365739822388,
291
- "learning_rate": 1.3333333333333333e-05,
292
- "loss": 0.0406,
293
  "step": 280
294
  },
295
  {
296
- "epoch": 12.0,
297
- "eval_loss": 0.065009705722332,
298
- "eval_runtime": 0.1877,
299
- "eval_samples_per_second": 26.643,
300
- "eval_steps_per_second": 10.657,
301
- "step": 288
302
  },
303
  {
304
- "epoch": 12.083333333333334,
305
- "grad_norm": 0.4227233827114105,
306
- "learning_rate": 1.3095238095238096e-05,
307
- "loss": 0.0365,
 
308
  "step": 290
309
  },
310
  {
311
- "epoch": 12.5,
312
- "grad_norm": 0.860186755657196,
313
- "learning_rate": 1.2857142857142859e-05,
314
- "loss": 0.0376,
315
  "step": 300
316
  },
317
  {
318
- "epoch": 12.916666666666666,
319
- "grad_norm": 0.6124868392944336,
320
- "learning_rate": 1.261904761904762e-05,
321
- "loss": 0.0389,
322
  "step": 310
323
  },
324
  {
325
- "epoch": 13.0,
326
- "eval_loss": 0.06893934309482574,
327
- "eval_runtime": 0.1868,
328
- "eval_samples_per_second": 26.769,
329
- "eval_steps_per_second": 10.708,
330
- "step": 312
331
  },
332
  {
333
- "epoch": 13.333333333333334,
334
- "grad_norm": 0.3505653738975525,
335
- "learning_rate": 1.2380952380952383e-05,
336
- "loss": 0.0274,
337
  "step": 320
338
  },
339
  {
340
- "epoch": 13.75,
341
- "grad_norm": 0.8498353362083435,
342
- "learning_rate": 1.2142857142857142e-05,
343
- "loss": 0.0322,
344
  "step": 330
345
  },
346
  {
347
- "epoch": 14.0,
348
- "eval_loss": 0.07290869951248169,
349
- "eval_runtime": 0.1947,
350
- "eval_samples_per_second": 25.679,
351
- "eval_steps_per_second": 10.271,
352
- "step": 336
353
- },
354
- {
355
- "epoch": 14.166666666666666,
356
- "grad_norm": 0.6511815786361694,
357
- "learning_rate": 1.1904761904761905e-05,
358
- "loss": 0.0318,
359
  "step": 340
360
  },
361
  {
362
- "epoch": 14.583333333333334,
363
- "grad_norm": 1.0918514728546143,
364
- "learning_rate": 1.1666666666666668e-05,
365
- "loss": 0.0277,
366
- "step": 350
 
367
  },
368
  {
369
- "epoch": 15.0,
370
- "grad_norm": 0.7109629511833191,
371
- "learning_rate": 1.1428571428571429e-05,
372
- "loss": 0.0387,
373
- "step": 360
374
  },
375
  {
376
- "epoch": 15.0,
377
- "eval_loss": 0.07193797826766968,
378
- "eval_runtime": 0.1001,
379
- "eval_samples_per_second": 49.943,
380
- "eval_steps_per_second": 19.977,
381
  "step": 360
382
  },
383
  {
384
- "epoch": 15.416666666666666,
385
- "grad_norm": 1.073655128479004,
386
- "learning_rate": 1.1190476190476192e-05,
387
- "loss": 0.0283,
388
  "step": 370
389
  },
390
  {
391
- "epoch": 15.833333333333334,
392
- "grad_norm": 0.4060705006122589,
393
- "learning_rate": 1.0952380952380955e-05,
394
- "loss": 0.0283,
395
- "step": 380
 
396
  },
397
  {
398
- "epoch": 16.0,
399
- "eval_loss": 0.0718567967414856,
400
- "eval_runtime": 0.1894,
401
- "eval_samples_per_second": 26.402,
402
- "eval_steps_per_second": 10.561,
403
- "step": 384
404
  },
405
  {
406
- "epoch": 16.25,
407
- "grad_norm": 0.6933162212371826,
408
- "learning_rate": 1.0714285714285714e-05,
409
- "loss": 0.0355,
410
  "step": 390
411
  },
412
  {
413
- "epoch": 16.666666666666668,
414
- "grad_norm": 0.9838053584098816,
415
- "learning_rate": 1.0476190476190477e-05,
416
- "loss": 0.0272,
417
  "step": 400
418
  },
419
  {
420
- "epoch": 17.0,
421
- "eval_loss": 0.07115109264850616,
422
- "eval_runtime": 0.1928,
423
- "eval_samples_per_second": 25.937,
424
- "eval_steps_per_second": 10.375,
425
- "step": 408
426
  },
427
  {
428
- "epoch": 17.083333333333332,
429
- "grad_norm": 0.3024757206439972,
430
- "learning_rate": 1.0238095238095238e-05,
431
- "loss": 0.0211,
432
  "step": 410
433
  },
434
  {
435
- "epoch": 17.5,
436
- "grad_norm": 0.606316328048706,
437
- "learning_rate": 1e-05,
438
- "loss": 0.0295,
439
  "step": 420
440
  },
441
  {
442
- "epoch": 17.916666666666668,
443
- "grad_norm": 0.5690513253211975,
444
- "learning_rate": 9.761904761904762e-06,
445
- "loss": 0.0307,
446
  "step": 430
447
  },
448
  {
449
- "epoch": 18.0,
450
- "eval_loss": 0.07331715524196625,
451
- "eval_runtime": 0.1902,
452
- "eval_samples_per_second": 26.283,
453
- "eval_steps_per_second": 10.513,
454
- "step": 432
455
  },
456
  {
457
- "epoch": 18.333333333333332,
458
- "grad_norm": 0.4299153983592987,
459
- "learning_rate": 9.523809523809525e-06,
460
- "loss": 0.0201,
461
  "step": 440
462
  },
463
  {
464
- "epoch": 18.75,
465
- "grad_norm": 0.6338627338409424,
466
- "learning_rate": 9.285714285714288e-06,
467
- "loss": 0.0258,
468
  "step": 450
469
  },
470
  {
471
- "epoch": 19.0,
472
- "eval_loss": 0.0750732421875,
473
- "eval_runtime": 0.188,
474
- "eval_samples_per_second": 26.599,
475
- "eval_steps_per_second": 10.64,
476
- "step": 456
477
  },
478
  {
479
- "epoch": 19.166666666666668,
480
- "grad_norm": 0.5476926565170288,
481
- "learning_rate": 9.047619047619049e-06,
482
- "loss": 0.0291,
483
- "step": 460
 
484
  },
485
  {
486
- "epoch": 19.583333333333332,
487
- "grad_norm": 0.3064030706882477,
488
- "learning_rate": 8.80952380952381e-06,
489
- "loss": 0.0259,
490
  "step": 470
491
  },
492
  {
493
- "epoch": 20.0,
494
- "grad_norm": 0.7045935392379761,
495
- "learning_rate": 8.571428571428571e-06,
496
- "loss": 0.0192,
497
  "step": 480
498
  },
499
  {
500
- "epoch": 20.0,
501
- "eval_loss": 0.07358075678348541,
502
- "eval_runtime": 0.1059,
503
- "eval_samples_per_second": 47.21,
504
- "eval_steps_per_second": 18.884,
505
- "step": 480
506
  },
507
  {
508
- "epoch": 20.416666666666668,
509
- "grad_norm": 0.4414753019809723,
510
- "learning_rate": 8.333333333333334e-06,
511
- "loss": 0.0267,
512
- "step": 490
 
513
  },
514
  {
515
- "epoch": 20.833333333333332,
516
- "grad_norm": 0.2409076988697052,
517
- "learning_rate": 8.095238095238097e-06,
518
- "loss": 0.0188,
519
  "step": 500
520
  }
521
  ],
522
  "logging_steps": 10,
523
- "max_steps": 840,
524
  "num_input_tokens_seen": 0,
525
  "num_train_epochs": 35,
526
  "save_steps": 500,
@@ -536,7 +512,7 @@
536
  "attributes": {}
537
  }
538
  },
539
- "total_flos": 1071698411520000.0,
540
  "train_batch_size": 4,
541
  "trial_name": null,
542
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 17.24137931034483,
5
  "eval_steps": 500,
6
  "global_step": 500,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.3448275862068966,
13
+ "grad_norm": 174.2040557861328,
14
+ "learning_rate": 1.9802955665024632e-05,
15
+ "loss": 11.9232,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.6896551724137931,
20
+ "grad_norm": 54.92245101928711,
21
+ "learning_rate": 1.9605911330049263e-05,
22
+ "loss": 6.0635,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 1.0,
27
+ "eval_loss": 0.8173177242279053,
28
+ "eval_runtime": 0.1894,
29
+ "eval_samples_per_second": 31.679,
30
+ "eval_steps_per_second": 10.56,
31
+ "step": 29
32
  },
33
  {
34
+ "epoch": 1.0344827586206897,
35
+ "grad_norm": 16.176136016845703,
36
+ "learning_rate": 1.9408866995073893e-05,
37
+ "loss": 2.7637,
38
  "step": 30
39
  },
40
  {
41
+ "epoch": 1.3793103448275863,
42
+ "grad_norm": 5.2406907081604,
43
+ "learning_rate": 1.9211822660098524e-05,
44
+ "loss": 1.0566,
45
  "step": 40
46
  },
47
  {
48
+ "epoch": 1.7241379310344827,
49
+ "grad_norm": 3.6556057929992676,
50
+ "learning_rate": 1.9014778325123154e-05,
51
+ "loss": 0.6543,
52
+ "step": 50
 
53
  },
54
  {
55
+ "epoch": 2.0,
56
+ "eval_loss": 0.26999202370643616,
57
+ "eval_runtime": 0.1963,
58
+ "eval_samples_per_second": 30.571,
59
+ "eval_steps_per_second": 10.19,
60
+ "step": 58
61
  },
62
  {
63
+ "epoch": 2.0689655172413794,
64
+ "grad_norm": 1.8488504886627197,
65
+ "learning_rate": 1.8817733990147784e-05,
66
+ "loss": 0.4713,
67
  "step": 60
68
  },
69
  {
70
+ "epoch": 2.413793103448276,
71
+ "grad_norm": 2.723362922668457,
72
+ "learning_rate": 1.8620689655172415e-05,
73
+ "loss": 0.3144,
74
  "step": 70
75
  },
76
  {
77
+ "epoch": 2.7586206896551726,
78
+ "grad_norm": 1.8972634077072144,
79
+ "learning_rate": 1.8423645320197045e-05,
80
+ "loss": 0.2604,
 
 
 
 
 
 
 
 
81
  "step": 80
82
  },
83
  {
84
+ "epoch": 3.0,
85
+ "eval_loss": 0.09994816780090332,
86
+ "eval_runtime": 0.1912,
87
+ "eval_samples_per_second": 31.381,
88
+ "eval_steps_per_second": 10.46,
89
+ "step": 87
90
  },
91
  {
92
+ "epoch": 3.103448275862069,
93
+ "grad_norm": 3.115511417388916,
94
+ "learning_rate": 1.8226600985221676e-05,
95
+ "loss": 0.2067,
96
+ "step": 90
 
97
  },
98
  {
99
+ "epoch": 3.4482758620689653,
100
+ "grad_norm": 1.7388259172439575,
101
+ "learning_rate": 1.8029556650246306e-05,
102
+ "loss": 0.1494,
103
  "step": 100
104
  },
105
  {
106
+ "epoch": 3.793103448275862,
107
+ "grad_norm": 1.2075275182724,
108
+ "learning_rate": 1.7832512315270937e-05,
109
+ "loss": 0.1411,
110
  "step": 110
111
  },
112
  {
113
+ "epoch": 4.0,
114
+ "eval_loss": 0.05977391079068184,
115
+ "eval_runtime": 0.1933,
116
+ "eval_samples_per_second": 31.034,
117
+ "eval_steps_per_second": 10.345,
118
+ "step": 116
119
  },
120
  {
121
+ "epoch": 4.137931034482759,
122
+ "grad_norm": 1.7329133749008179,
123
+ "learning_rate": 1.7635467980295567e-05,
124
+ "loss": 0.1196,
 
125
  "step": 120
126
  },
127
  {
128
+ "epoch": 4.482758620689655,
129
+ "grad_norm": 2.210278272628784,
130
+ "learning_rate": 1.7438423645320198e-05,
131
+ "loss": 0.1132,
132
  "step": 130
133
  },
134
  {
135
+ "epoch": 4.827586206896552,
136
+ "grad_norm": 1.0056334733963013,
137
+ "learning_rate": 1.7241379310344828e-05,
138
+ "loss": 0.0789,
139
  "step": 140
140
  },
141
  {
142
+ "epoch": 5.0,
143
+ "eval_loss": 0.04751617833971977,
144
+ "eval_runtime": 0.1898,
145
+ "eval_samples_per_second": 31.605,
146
+ "eval_steps_per_second": 10.535,
147
+ "step": 145
148
  },
149
  {
150
+ "epoch": 5.172413793103448,
151
+ "grad_norm": 0.9742516279220581,
152
+ "learning_rate": 1.704433497536946e-05,
153
+ "loss": 0.0927,
154
  "step": 150
155
  },
156
  {
157
+ "epoch": 5.517241379310345,
158
+ "grad_norm": 1.4696099758148193,
159
+ "learning_rate": 1.684729064039409e-05,
160
+ "loss": 0.0837,
161
  "step": 160
162
  },
163
  {
164
+ "epoch": 5.862068965517241,
165
+ "grad_norm": 1.0493124723434448,
166
+ "learning_rate": 1.665024630541872e-05,
167
+ "loss": 0.0689,
168
+ "step": 170
 
169
  },
170
  {
171
+ "epoch": 6.0,
172
+ "eval_loss": 0.03317258134484291,
173
+ "eval_runtime": 0.2045,
174
+ "eval_samples_per_second": 29.343,
175
+ "eval_steps_per_second": 9.781,
176
+ "step": 174
177
  },
178
  {
179
+ "epoch": 6.206896551724138,
180
+ "grad_norm": 0.9956067204475403,
181
+ "learning_rate": 1.645320197044335e-05,
182
+ "loss": 0.0702,
183
  "step": 180
184
  },
185
  {
186
+ "epoch": 6.551724137931035,
187
+ "grad_norm": 0.4664933979511261,
188
+ "learning_rate": 1.625615763546798e-05,
189
+ "loss": 0.0675,
190
  "step": 190
191
  },
192
  {
193
+ "epoch": 6.896551724137931,
194
+ "grad_norm": 1.2444266080856323,
195
+ "learning_rate": 1.605911330049261e-05,
196
+ "loss": 0.0596,
 
 
 
 
 
 
 
 
197
  "step": 200
198
  },
199
  {
200
+ "epoch": 7.0,
201
+ "eval_loss": 0.030222313478589058,
202
+ "eval_runtime": 0.1913,
203
+ "eval_samples_per_second": 31.365,
204
+ "eval_steps_per_second": 10.455,
205
+ "step": 203
206
  },
207
  {
208
+ "epoch": 7.241379310344827,
209
+ "grad_norm": 0.5432140827178955,
210
+ "learning_rate": 1.586206896551724e-05,
211
+ "loss": 0.045,
212
+ "step": 210
 
213
  },
214
  {
215
+ "epoch": 7.586206896551724,
216
+ "grad_norm": 0.7679450511932373,
217
+ "learning_rate": 1.5665024630541875e-05,
218
+ "loss": 0.0538,
219
  "step": 220
220
  },
221
  {
222
+ "epoch": 7.931034482758621,
223
+ "grad_norm": 0.7759860754013062,
224
+ "learning_rate": 1.5467980295566506e-05,
225
+ "loss": 0.0624,
226
  "step": 230
227
  },
228
  {
229
+ "epoch": 8.0,
230
+ "eval_loss": 0.02808324061334133,
231
+ "eval_runtime": 0.2003,
232
+ "eval_samples_per_second": 29.953,
233
+ "eval_steps_per_second": 9.984,
234
+ "step": 232
235
  },
236
  {
237
+ "epoch": 8.275862068965518,
238
+ "grad_norm": 1.7437331676483154,
239
+ "learning_rate": 1.5270935960591133e-05,
240
+ "loss": 0.0369,
 
241
  "step": 240
242
  },
243
  {
244
+ "epoch": 8.620689655172415,
245
+ "grad_norm": 0.5273000597953796,
246
+ "learning_rate": 1.5073891625615764e-05,
247
+ "loss": 0.0499,
248
  "step": 250
249
  },
250
  {
251
+ "epoch": 8.96551724137931,
252
+ "grad_norm": 0.6120426058769226,
253
+ "learning_rate": 1.4876847290640396e-05,
254
+ "loss": 0.0425,
255
  "step": 260
256
  },
257
  {
258
+ "epoch": 9.0,
259
+ "eval_loss": 0.03035571426153183,
260
+ "eval_runtime": 0.1888,
261
+ "eval_samples_per_second": 31.778,
262
+ "eval_steps_per_second": 10.593,
263
+ "step": 261
264
  },
265
  {
266
+ "epoch": 9.310344827586206,
267
+ "grad_norm": 1.587663173675537,
268
+ "learning_rate": 1.4679802955665026e-05,
269
+ "loss": 0.0395,
270
  "step": 270
271
  },
272
  {
273
+ "epoch": 9.655172413793103,
274
+ "grad_norm": 0.7260332703590393,
275
+ "learning_rate": 1.4482758620689657e-05,
276
+ "loss": 0.0493,
277
  "step": 280
278
  },
279
  {
280
+ "epoch": 10.0,
281
+ "grad_norm": 0.9717508554458618,
282
+ "learning_rate": 1.4285714285714287e-05,
283
+ "loss": 0.0424,
284
+ "step": 290
 
285
  },
286
  {
287
+ "epoch": 10.0,
288
+ "eval_loss": 0.026125147938728333,
289
+ "eval_runtime": 0.0989,
290
+ "eval_samples_per_second": 60.684,
291
+ "eval_steps_per_second": 20.228,
292
  "step": 290
293
  },
294
  {
295
+ "epoch": 10.344827586206897,
296
+ "grad_norm": 0.7487574815750122,
297
+ "learning_rate": 1.4088669950738918e-05,
298
+ "loss": 0.0412,
299
  "step": 300
300
  },
301
  {
302
+ "epoch": 10.689655172413794,
303
+ "grad_norm": 1.3717066049575806,
304
+ "learning_rate": 1.3891625615763548e-05,
305
+ "loss": 0.0296,
306
  "step": 310
307
  },
308
  {
309
+ "epoch": 11.0,
310
+ "eval_loss": 0.031011082231998444,
311
+ "eval_runtime": 0.1922,
312
+ "eval_samples_per_second": 31.22,
313
+ "eval_steps_per_second": 10.407,
314
+ "step": 319
315
  },
316
  {
317
+ "epoch": 11.03448275862069,
318
+ "grad_norm": 0.6860368847846985,
319
+ "learning_rate": 1.369458128078818e-05,
320
+ "loss": 0.0411,
321
  "step": 320
322
  },
323
  {
324
+ "epoch": 11.379310344827585,
325
+ "grad_norm": 0.9999271035194397,
326
+ "learning_rate": 1.3497536945812807e-05,
327
+ "loss": 0.0358,
328
  "step": 330
329
  },
330
  {
331
+ "epoch": 11.724137931034482,
332
+ "grad_norm": 1.2313721179962158,
333
+ "learning_rate": 1.330049261083744e-05,
334
+ "loss": 0.0324,
 
 
 
 
 
 
 
 
335
  "step": 340
336
  },
337
  {
338
+ "epoch": 12.0,
339
+ "eval_loss": 0.029439905658364296,
340
+ "eval_runtime": 0.2015,
341
+ "eval_samples_per_second": 29.784,
342
+ "eval_steps_per_second": 9.928,
343
+ "step": 348
344
  },
345
  {
346
+ "epoch": 12.068965517241379,
347
+ "grad_norm": 0.9032502174377441,
348
+ "learning_rate": 1.310344827586207e-05,
349
+ "loss": 0.041,
350
+ "step": 350
351
  },
352
  {
353
+ "epoch": 12.413793103448276,
354
+ "grad_norm": 0.612800657749176,
355
+ "learning_rate": 1.29064039408867e-05,
356
+ "loss": 0.0384,
 
357
  "step": 360
358
  },
359
  {
360
+ "epoch": 12.758620689655173,
361
+ "grad_norm": 0.19512540102005005,
362
+ "learning_rate": 1.2709359605911331e-05,
363
+ "loss": 0.0349,
364
  "step": 370
365
  },
366
  {
367
+ "epoch": 13.0,
368
+ "eval_loss": 0.02956104278564453,
369
+ "eval_runtime": 0.1909,
370
+ "eval_samples_per_second": 31.424,
371
+ "eval_steps_per_second": 10.475,
372
+ "step": 377
373
  },
374
  {
375
+ "epoch": 13.10344827586207,
376
+ "grad_norm": 0.8481155633926392,
377
+ "learning_rate": 1.2512315270935961e-05,
378
+ "loss": 0.0271,
379
+ "step": 380
 
380
  },
381
  {
382
+ "epoch": 13.448275862068966,
383
+ "grad_norm": 1.3683249950408936,
384
+ "learning_rate": 1.2315270935960592e-05,
385
+ "loss": 0.0265,
386
  "step": 390
387
  },
388
  {
389
+ "epoch": 13.793103448275861,
390
+ "grad_norm": 0.6365839838981628,
391
+ "learning_rate": 1.2118226600985224e-05,
392
+ "loss": 0.0298,
393
  "step": 400
394
  },
395
  {
396
+ "epoch": 14.0,
397
+ "eval_loss": 0.03038203716278076,
398
+ "eval_runtime": 0.1902,
399
+ "eval_samples_per_second": 31.538,
400
+ "eval_steps_per_second": 10.513,
401
+ "step": 406
402
  },
403
  {
404
+ "epoch": 14.137931034482758,
405
+ "grad_norm": 0.7672634124755859,
406
+ "learning_rate": 1.1921182266009855e-05,
407
+ "loss": 0.0335,
408
  "step": 410
409
  },
410
  {
411
+ "epoch": 14.482758620689655,
412
+ "grad_norm": 0.2541676163673401,
413
+ "learning_rate": 1.1724137931034483e-05,
414
+ "loss": 0.0286,
415
  "step": 420
416
  },
417
  {
418
+ "epoch": 14.827586206896552,
419
+ "grad_norm": 0.8434980511665344,
420
+ "learning_rate": 1.1527093596059114e-05,
421
+ "loss": 0.0205,
422
  "step": 430
423
  },
424
  {
425
+ "epoch": 15.0,
426
+ "eval_loss": 0.030394822359085083,
427
+ "eval_runtime": 0.19,
428
+ "eval_samples_per_second": 31.587,
429
+ "eval_steps_per_second": 10.529,
430
+ "step": 435
431
  },
432
  {
433
+ "epoch": 15.172413793103448,
434
+ "grad_norm": 0.4303562641143799,
435
+ "learning_rate": 1.1330049261083744e-05,
436
+ "loss": 0.0323,
437
  "step": 440
438
  },
439
  {
440
+ "epoch": 15.517241379310345,
441
+ "grad_norm": 0.42710408568382263,
442
+ "learning_rate": 1.1133004926108375e-05,
443
+ "loss": 0.0227,
444
  "step": 450
445
  },
446
  {
447
+ "epoch": 15.862068965517242,
448
+ "grad_norm": 0.6126664876937866,
449
+ "learning_rate": 1.0935960591133005e-05,
450
+ "loss": 0.0215,
451
+ "step": 460
 
452
  },
453
  {
454
+ "epoch": 16.0,
455
+ "eval_loss": 0.030341001227498055,
456
+ "eval_runtime": 0.1895,
457
+ "eval_samples_per_second": 31.669,
458
+ "eval_steps_per_second": 10.556,
459
+ "step": 464
460
  },
461
  {
462
+ "epoch": 16.20689655172414,
463
+ "grad_norm": 0.7011211514472961,
464
+ "learning_rate": 1.0738916256157637e-05,
465
+ "loss": 0.0337,
466
  "step": 470
467
  },
468
  {
469
+ "epoch": 16.551724137931036,
470
+ "grad_norm": 0.5126092433929443,
471
+ "learning_rate": 1.0541871921182268e-05,
472
+ "loss": 0.0233,
473
  "step": 480
474
  },
475
  {
476
+ "epoch": 16.896551724137932,
477
+ "grad_norm": 0.9033933281898499,
478
+ "learning_rate": 1.0344827586206898e-05,
479
+ "loss": 0.0182,
480
+ "step": 490
 
481
  },
482
  {
483
+ "epoch": 17.0,
484
+ "eval_loss": 0.027958964928984642,
485
+ "eval_runtime": 0.1902,
486
+ "eval_samples_per_second": 31.553,
487
+ "eval_steps_per_second": 10.518,
488
+ "step": 493
489
  },
490
  {
491
+ "epoch": 17.24137931034483,
492
+ "grad_norm": 0.4326847493648529,
493
+ "learning_rate": 1.0147783251231529e-05,
494
+ "loss": 0.0249,
495
  "step": 500
496
  }
497
  ],
498
  "logging_steps": 10,
499
+ "max_steps": 1015,
500
  "num_input_tokens_seen": 0,
501
  "num_train_epochs": 35,
502
  "save_steps": 500,
 
512
  "attributes": {}
513
  }
514
  },
515
+ "total_flos": 1064120745984000.0,
516
  "train_batch_size": 4,
517
  "trial_name": null,
518
  "trial_params": null
json_extraction_point_activity/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a8c5865d167b402f00592918cff22eefe5ba34af5683474e67ad686da38ca7c5
3
  size 2950734544
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a3f3216f61837c04638fd20747110cf419cd0cf7777e19c57b1a79ccbd5cce8
3
  size 2950734544