Johnny050407 commited on
Commit
50e6273
·
verified ·
1 Parent(s): d10d2d0

Delete gpt2cocoSAD

Browse files
Files changed (28) hide show
  1. gpt2cocoSAD/model_SAD/checkpoint/added_tokens.json +0 -4
  2. gpt2cocoSAD/model_SAD/checkpoint/config.json +0 -40
  3. gpt2cocoSAD/model_SAD/checkpoint/generation_config.json +0 -9
  4. gpt2cocoSAD/model_SAD/checkpoint/merges.txt +0 -0
  5. gpt2cocoSAD/model_SAD/checkpoint/model.safetensors +0 -3
  6. gpt2cocoSAD/model_SAD/checkpoint/optimizer.pt +0 -3
  7. gpt2cocoSAD/model_SAD/checkpoint/rng_state_0.pth +0 -3
  8. gpt2cocoSAD/model_SAD/checkpoint/rng_state_1.pth +0 -3
  9. gpt2cocoSAD/model_SAD/checkpoint/rng_state_2.pth +0 -3
  10. gpt2cocoSAD/model_SAD/checkpoint/rng_state_3.pth +0 -3
  11. gpt2cocoSAD/model_SAD/checkpoint/rng_state_4.pth +0 -3
  12. gpt2cocoSAD/model_SAD/checkpoint/rng_state_5.pth +0 -3
  13. gpt2cocoSAD/model_SAD/checkpoint/rng_state_6.pth +0 -3
  14. gpt2cocoSAD/model_SAD/checkpoint/rng_state_7.pth +0 -3
  15. gpt2cocoSAD/model_SAD/checkpoint/scaler.pt +0 -3
  16. gpt2cocoSAD/model_SAD/checkpoint/scheduler.pt +0 -3
  17. gpt2cocoSAD/model_SAD/checkpoint/special_tokens_map.json +0 -37
  18. gpt2cocoSAD/model_SAD/checkpoint/tokenizer_config.json +0 -40
  19. gpt2cocoSAD/model_SAD/checkpoint/trainer_state.json +0 -3499
  20. gpt2cocoSAD/model_SAD/checkpoint/training_args.bin +0 -3
  21. gpt2cocoSAD/model_SAD/checkpoint/vocab.json +0 -0
  22. gpt2cocoSAD/tokenized_data_SAD/dataset_dict.json +0 -1
  23. gpt2cocoSAD/tokenized_data_SAD/test/data-00000-of-00001.arrow +0 -3
  24. gpt2cocoSAD/tokenized_data_SAD/test/dataset_info.json +0 -49
  25. gpt2cocoSAD/tokenized_data_SAD/test/state.json +0 -22
  26. gpt2cocoSAD/tokenized_data_SAD/train/data-00000-of-00001.arrow +0 -3
  27. gpt2cocoSAD/tokenized_data_SAD/train/dataset_info.json +0 -49
  28. gpt2cocoSAD/tokenized_data_SAD/train/state.json +0 -22
gpt2cocoSAD/model_SAD/checkpoint/added_tokens.json DELETED
@@ -1,4 +0,0 @@
1
- {
2
- "<pad>": 50258,
3
- "<|startoftext|>": 50257
4
- }
 
 
 
 
 
gpt2cocoSAD/model_SAD/checkpoint/config.json DELETED
@@ -1,40 +0,0 @@
1
- {
2
- "activation_function": "gelu_new",
3
- "architectures": [
4
- "GPT2LMHeadModel"
5
- ],
6
- "attn_pdrop": 0.1,
7
- "bos_token_id": 50256,
8
- "cls_token_id": 50257,
9
- "dtype": "float32",
10
- "embd_pdrop": 0.1,
11
- "eos_token_id": 50256,
12
- "initializer_range": 0.02,
13
- "layer_norm_epsilon": 1e-05,
14
- "model_type": "gpt2",
15
- "n_ctx": 1024,
16
- "n_embd": 768,
17
- "n_head": 12,
18
- "n_inner": null,
19
- "n_layer": 12,
20
- "n_positions": 1024,
21
- "pad_token_id": 50258,
22
- "reorder_and_upcast_attn": false,
23
- "resid_pdrop": 0.1,
24
- "scale_attn_by_inverse_layer_idx": false,
25
- "scale_attn_weights": true,
26
- "summary_activation": null,
27
- "summary_first_dropout": 0.1,
28
- "summary_proj_to_labels": true,
29
- "summary_type": "cls_index",
30
- "summary_use_proj": true,
31
- "task_specific_params": {
32
- "text-generation": {
33
- "do_sample": true,
34
- "max_length": 50
35
- }
36
- },
37
- "transformers_version": "4.57.1",
38
- "use_cache": true,
39
- "vocab_size": 50259
40
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gpt2cocoSAD/model_SAD/checkpoint/generation_config.json DELETED
@@ -1,9 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "bos_token_id": 50256,
4
- "eos_token_id": [
5
- 50256
6
- ],
7
- "pad_token_id": 50258,
8
- "transformers_version": "4.57.1"
9
- }
 
 
 
 
 
 
 
 
 
 
gpt2cocoSAD/model_SAD/checkpoint/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
gpt2cocoSAD/model_SAD/checkpoint/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:83d7ea52920f80121594d61fd152d1c622fc6281af37054c33380b081b2ef277
3
- size 497780352
 
 
 
 
gpt2cocoSAD/model_SAD/checkpoint/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ca6a6f8edba573904f208a82a8a5a807be4eccb032727fcd9833d4f6cc58be9
3
- size 995654586
 
 
 
 
gpt2cocoSAD/model_SAD/checkpoint/rng_state_0.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:aebbbdd9f6b6458006140b9399df973322b39193c3491688f309b28eda7f6516
3
- size 15984
 
 
 
 
gpt2cocoSAD/model_SAD/checkpoint/rng_state_1.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:02092b11072872275f41067881e0ff9c3ebb4f71cf3347a14575ee0c3bb4dc37
3
- size 15984
 
 
 
 
gpt2cocoSAD/model_SAD/checkpoint/rng_state_2.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:834873e0297393e1fa57f1b955d4a9198d35ae71d5b1b5b0661327390e456a6a
3
- size 15984
 
 
 
 
gpt2cocoSAD/model_SAD/checkpoint/rng_state_3.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6756722a563897612ae24bdf113ccc0f63bb646ff00b0de94943e7815ca03cf9
3
- size 15984
 
 
 
 
gpt2cocoSAD/model_SAD/checkpoint/rng_state_4.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:811a877d2d10d1034d381ef47c53f9a523828b58a68b612cd1b0858ee81e6c1e
3
- size 15984
 
 
 
 
gpt2cocoSAD/model_SAD/checkpoint/rng_state_5.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:65538c95dd42ed5ff89d120cad03ed208d8388db3902f5332493f3f25f8ac51a
3
- size 15984
 
 
 
 
gpt2cocoSAD/model_SAD/checkpoint/rng_state_6.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:44952594d0b6ac119c313df4062415014d62be491e303928f3548bb0f2c5c55f
3
- size 15984
 
 
 
 
gpt2cocoSAD/model_SAD/checkpoint/rng_state_7.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2081cb2f3fdd83c5b77b31a5b51f1f3492a4f66d21e8c736d093ea56b1461995
3
- size 15984
 
 
 
 
gpt2cocoSAD/model_SAD/checkpoint/scaler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:52553f41dcf91d35bf246cc9a094ff932d32b1cb29717295de4846f5f25e9a11
3
- size 988
 
 
 
 
gpt2cocoSAD/model_SAD/checkpoint/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f86d4debfeccbc9cc7f34fa8036cc4f47e09cf0625bf34c4a4cb3f392bd25f62
3
- size 1064
 
 
 
 
gpt2cocoSAD/model_SAD/checkpoint/special_tokens_map.json DELETED
@@ -1,37 +0,0 @@
1
- {
2
- "bos_token": {
3
- "content": "<|endoftext|>",
4
- "lstrip": false,
5
- "normalized": true,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "cls_token": {
10
- "content": "<|startoftext|>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "eos_token": {
17
- "content": "<|endoftext|>",
18
- "lstrip": false,
19
- "normalized": false,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
- "pad_token": {
24
- "content": "<pad>",
25
- "lstrip": false,
26
- "normalized": false,
27
- "rstrip": false,
28
- "single_word": false
29
- },
30
- "unk_token": {
31
- "content": "<|endoftext|>",
32
- "lstrip": false,
33
- "normalized": true,
34
- "rstrip": false,
35
- "single_word": false
36
- }
37
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gpt2cocoSAD/model_SAD/checkpoint/tokenizer_config.json DELETED
@@ -1,40 +0,0 @@
1
- {
2
- "add_bos_token": false,
3
- "add_prefix_space": false,
4
- "added_tokens_decoder": {
5
- "50256": {
6
- "content": "<|endoftext|>",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "50257": {
14
- "content": "<|startoftext|>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "50258": {
22
- "content": "<pad>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- }
29
- },
30
- "bos_token": "<|endoftext|>",
31
- "clean_up_tokenization_spaces": false,
32
- "cls_token": "<|startoftext|>",
33
- "eos_token": "<|endoftext|>",
34
- "errors": "replace",
35
- "extra_special_tokens": {},
36
- "model_max_length": 1024,
37
- "pad_token": "<pad>",
38
- "tokenizer_class": "GPT2Tokenizer",
39
- "unk_token": "<|endoftext|>"
40
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gpt2cocoSAD/model_SAD/checkpoint/trainer_state.json DELETED
@@ -1,3499 +0,0 @@
1
- {
2
- "best_global_step": 900,
3
- "best_metric": 2.5223655700683594,
4
- "best_model_checkpoint": "../a1_text/fl/gpt2/pt3lgpt2coco/finetuned_on_pt3lgpt2coco_ICLR_rebuttal_prompt_pt3lgpt2coco_percent_0.01/checkpoint-900",
5
- "epoch": 99.56896551724138,
6
- "eval_steps": 100,
7
- "global_step": 23100,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.43103448275862066,
14
- "grad_norm": 0.3040359318256378,
15
- "learning_rate": 0.00188,
16
- "loss": 3.181,
17
- "step": 100
18
- },
19
- {
20
- "epoch": 0.43103448275862066,
21
- "eval_loss": 2.6918227672576904,
22
- "eval_runtime": 9.2125,
23
- "eval_samples_per_second": 12846.835,
24
- "eval_steps_per_second": 6.296,
25
- "step": 100
26
- },
27
- {
28
- "epoch": 0.8620689655172413,
29
- "grad_norm": 0.3815867006778717,
30
- "learning_rate": 0.0019999182861670534,
31
- "loss": 2.675,
32
- "step": 200
33
- },
34
- {
35
- "epoch": 0.8620689655172413,
36
- "eval_loss": 2.596940517425537,
37
- "eval_runtime": 9.249,
38
- "eval_samples_per_second": 12796.027,
39
- "eval_steps_per_second": 6.271,
40
- "step": 200
41
- },
42
- {
43
- "epoch": 1.293103448275862,
44
- "grad_norm": 0.24237173795700073,
45
- "learning_rate": 0.0019996519640898547,
46
- "loss": 2.5393,
47
- "step": 300
48
- },
49
- {
50
- "epoch": 1.293103448275862,
51
- "eval_loss": 2.5603010654449463,
52
- "eval_runtime": 9.5083,
53
- "eval_samples_per_second": 12447.149,
54
- "eval_steps_per_second": 6.1,
55
- "step": 300
56
- },
57
- {
58
- "epoch": 1.7241379310344827,
59
- "grad_norm": 0.22608302533626556,
60
- "learning_rate": 0.0019992007500999214,
61
- "loss": 2.5055,
62
- "step": 400
63
- },
64
- {
65
- "epoch": 1.7241379310344827,
66
- "eval_loss": 2.5346174240112305,
67
- "eval_runtime": 9.4684,
68
- "eval_samples_per_second": 12499.57,
69
- "eval_steps_per_second": 6.126,
70
- "step": 400
71
- },
72
- {
73
- "epoch": 2.1551724137931036,
74
- "grad_norm": 0.21417607367038727,
75
- "learning_rate": 0.001998564727652117,
76
- "loss": 2.4472,
77
- "step": 500
78
- },
79
- {
80
- "epoch": 2.1551724137931036,
81
- "eval_loss": 2.539055109024048,
82
- "eval_runtime": 9.2538,
83
- "eval_samples_per_second": 12789.421,
84
- "eval_steps_per_second": 6.268,
85
- "step": 500
86
- },
87
- {
88
- "epoch": 2.586206896551724,
89
- "grad_norm": 0.2077101767063141,
90
- "learning_rate": 0.001997744014382789,
91
- "loss": 2.392,
92
- "step": 600
93
- },
94
- {
95
- "epoch": 2.586206896551724,
96
- "eval_loss": 2.5249011516571045,
97
- "eval_runtime": 9.3812,
98
- "eval_samples_per_second": 12615.75,
99
- "eval_steps_per_second": 6.183,
100
- "step": 600
101
- },
102
- {
103
- "epoch": 3.0172413793103448,
104
- "grad_norm": 0.2634534537792206,
105
- "learning_rate": 0.0019967387620880147,
106
- "loss": 2.4016,
107
- "step": 700
108
- },
109
- {
110
- "epoch": 3.0172413793103448,
111
- "eval_loss": 2.549821138381958,
112
- "eval_runtime": 9.2268,
113
- "eval_samples_per_second": 12826.906,
114
- "eval_steps_per_second": 6.286,
115
- "step": 700
116
- },
117
- {
118
- "epoch": 3.4482758620689653,
119
- "grad_norm": 0.21819734573364258,
120
- "learning_rate": 0.0019955491566955234,
121
- "loss": 2.2858,
122
- "step": 800
123
- },
124
- {
125
- "epoch": 3.4482758620689653,
126
- "eval_loss": 2.543490171432495,
127
- "eval_runtime": 9.2266,
128
- "eval_samples_per_second": 12827.175,
129
- "eval_steps_per_second": 6.286,
130
- "step": 800
131
- },
132
- {
133
- "epoch": 3.8793103448275863,
134
- "grad_norm": 0.16630978882312775,
135
- "learning_rate": 0.0019941754182303075,
136
- "loss": 2.3272,
137
- "step": 900
138
- },
139
- {
140
- "epoch": 3.8793103448275863,
141
- "eval_loss": 2.5223655700683594,
142
- "eval_runtime": 9.4402,
143
- "eval_samples_per_second": 12536.875,
144
- "eval_steps_per_second": 6.144,
145
- "step": 900
146
- },
147
- {
148
- "epoch": 4.310344827586207,
149
- "grad_norm": 0.18484139442443848,
150
- "learning_rate": 0.0019926178007739293,
151
- "loss": 2.2288,
152
- "step": 1000
153
- },
154
- {
155
- "epoch": 4.310344827586207,
156
- "eval_loss": 2.5807507038116455,
157
- "eval_runtime": 9.3252,
158
- "eval_samples_per_second": 12691.536,
159
- "eval_steps_per_second": 6.22,
160
- "step": 1000
161
- },
162
- {
163
- "epoch": 4.741379310344827,
164
- "grad_norm": 0.19197525084018707,
165
- "learning_rate": 0.0019908765924175256,
166
- "loss": 2.2444,
167
- "step": 1100
168
- },
169
- {
170
- "epoch": 4.741379310344827,
171
- "eval_loss": 2.553419589996338,
172
- "eval_runtime": 9.2519,
173
- "eval_samples_per_second": 12792.034,
174
- "eval_steps_per_second": 6.269,
175
- "step": 1100
176
- },
177
- {
178
- "epoch": 5.172413793103448,
179
- "grad_norm": 0.21536719799041748,
180
- "learning_rate": 0.001988952115208524,
181
- "loss": 2.1968,
182
- "step": 1200
183
- },
184
- {
185
- "epoch": 5.172413793103448,
186
- "eval_loss": 2.6241047382354736,
187
- "eval_runtime": 9.2642,
188
- "eval_samples_per_second": 12775.054,
189
- "eval_steps_per_second": 6.261,
190
- "step": 1200
191
- },
192
- {
193
- "epoch": 5.603448275862069,
194
- "grad_norm": 0.19505859911441803,
195
- "learning_rate": 0.001986844725091078,
196
- "loss": 2.1572,
197
- "step": 1300
198
- },
199
- {
200
- "epoch": 5.603448275862069,
201
- "eval_loss": 2.598952531814575,
202
- "eval_runtime": 9.2767,
203
- "eval_samples_per_second": 12757.901,
204
- "eval_steps_per_second": 6.252,
205
- "step": 1300
206
- },
207
- {
208
- "epoch": 6.0344827586206895,
209
- "grad_norm": 0.23920711874961853,
210
- "learning_rate": 0.0019845548118402334,
211
- "loss": 2.1771,
212
- "step": 1400
213
- },
214
- {
215
- "epoch": 6.0344827586206895,
216
- "eval_loss": 2.670865535736084,
217
- "eval_runtime": 9.4662,
218
- "eval_samples_per_second": 12502.495,
219
- "eval_steps_per_second": 6.127,
220
- "step": 1400
221
- },
222
- {
223
- "epoch": 6.4655172413793105,
224
- "grad_norm": 0.18848589062690735,
225
- "learning_rate": 0.0019820827989898353,
226
- "loss": 2.0576,
227
- "step": 1500
228
- },
229
- {
230
- "epoch": 6.4655172413793105,
231
- "eval_loss": 2.640592575073242,
232
- "eval_runtime": 9.3904,
233
- "eval_samples_per_second": 12603.362,
234
- "eval_steps_per_second": 6.177,
235
- "step": 1500
236
- },
237
- {
238
- "epoch": 6.896551724137931,
239
- "grad_norm": 0.18979904055595398,
240
- "learning_rate": 0.0019794291437541956,
241
- "loss": 2.1191,
242
- "step": 1600
243
- },
244
- {
245
- "epoch": 6.896551724137931,
246
- "eval_loss": 2.6196117401123047,
247
- "eval_runtime": 9.2558,
248
- "eval_samples_per_second": 12786.7,
249
- "eval_steps_per_second": 6.266,
250
- "step": 1600
251
- },
252
- {
253
- "epoch": 7.327586206896552,
254
- "grad_norm": 0.24726751446723938,
255
- "learning_rate": 0.001976594336943526,
256
- "loss": 2.0069,
257
- "step": 1700
258
- },
259
- {
260
- "epoch": 7.327586206896552,
261
- "eval_loss": 2.7223079204559326,
262
- "eval_runtime": 9.3003,
263
- "eval_samples_per_second": 12725.505,
264
- "eval_steps_per_second": 6.236,
265
- "step": 1700
266
- },
267
- {
268
- "epoch": 7.758620689655173,
269
- "grad_norm": 0.21912235021591187,
270
- "learning_rate": 0.0019735789028731605,
271
- "loss": 2.0379,
272
- "step": 1800
273
- },
274
- {
275
- "epoch": 7.758620689655173,
276
- "eval_loss": 2.6878578662872314,
277
- "eval_runtime": 9.3885,
278
- "eval_samples_per_second": 12605.897,
279
- "eval_steps_per_second": 6.178,
280
- "step": 1800
281
- },
282
- {
283
- "epoch": 8.189655172413794,
284
- "grad_norm": 0.22405335307121277,
285
- "learning_rate": 0.0019703833992665795,
286
- "loss": 1.9835,
287
- "step": 1900
288
- },
289
- {
290
- "epoch": 8.189655172413794,
291
- "eval_loss": 2.8071248531341553,
292
- "eval_runtime": 9.2669,
293
- "eval_samples_per_second": 12771.428,
294
- "eval_steps_per_second": 6.259,
295
- "step": 1900
296
- },
297
- {
298
- "epoch": 8.620689655172415,
299
- "grad_norm": 0.2046048790216446,
300
- "learning_rate": 0.0019670084171522564,
301
- "loss": 1.9511,
302
- "step": 2000
303
- },
304
- {
305
- "epoch": 8.620689655172415,
306
- "eval_loss": 2.7486164569854736,
307
- "eval_runtime": 9.4519,
308
- "eval_samples_per_second": 12521.34,
309
- "eval_steps_per_second": 6.136,
310
- "step": 2000
311
- },
312
- {
313
- "epoch": 9.051724137931034,
314
- "grad_norm": 0.25268155336380005,
315
- "learning_rate": 0.0019634545807543425,
316
- "loss": 1.9748,
317
- "step": 2100
318
- },
319
- {
320
- "epoch": 9.051724137931034,
321
- "eval_loss": 2.8860645294189453,
322
- "eval_runtime": 9.397,
323
- "eval_samples_per_second": 12594.608,
324
- "eval_steps_per_second": 6.172,
325
- "step": 2100
326
- },
327
- {
328
- "epoch": 9.482758620689655,
329
- "grad_norm": 0.21367767453193665,
330
- "learning_rate": 0.0019597225473772122,
331
- "loss": 1.8546,
332
- "step": 2200
333
- },
334
- {
335
- "epoch": 9.482758620689655,
336
- "eval_loss": 2.8317956924438477,
337
- "eval_runtime": 9.3796,
338
- "eval_samples_per_second": 12617.898,
339
- "eval_steps_per_second": 6.184,
340
- "step": 2200
341
- },
342
- {
343
- "epoch": 9.913793103448276,
344
- "grad_norm": 0.20437228679656982,
345
- "learning_rate": 0.001955813007283891,
346
- "loss": 1.9269,
347
- "step": 2300
348
- },
349
- {
350
- "epoch": 9.913793103448276,
351
- "eval_loss": 2.774157762527466,
352
- "eval_runtime": 9.4231,
353
- "eval_samples_per_second": 12559.694,
354
- "eval_steps_per_second": 6.155,
355
- "step": 2300
356
- },
357
- {
358
- "epoch": 10.344827586206897,
359
- "grad_norm": 0.23834232985973358,
360
- "learning_rate": 0.001951726683568388,
361
- "loss": 1.8023,
362
- "step": 2400
363
- },
364
- {
365
- "epoch": 10.344827586206897,
366
- "eval_loss": 2.9143717288970947,
367
- "eval_runtime": 9.2647,
368
- "eval_samples_per_second": 12774.461,
369
- "eval_steps_per_second": 6.26,
370
- "step": 2400
371
- },
372
- {
373
- "epoch": 10.775862068965518,
374
- "grad_norm": 0.24124032258987427,
375
- "learning_rate": 0.0019474643320219532,
376
- "loss": 1.8488,
377
- "step": 2500
378
- },
379
- {
380
- "epoch": 10.775862068965518,
381
- "eval_loss": 2.8489134311676025,
382
- "eval_runtime": 9.438,
383
- "eval_samples_per_second": 12539.884,
384
- "eval_steps_per_second": 6.145,
385
- "step": 2500
386
- },
387
- {
388
- "epoch": 11.206896551724139,
389
- "grad_norm": 0.2650582194328308,
390
- "learning_rate": 0.00194302674099329,
391
- "loss": 1.7888,
392
- "step": 2600
393
- },
394
- {
395
- "epoch": 11.206896551724139,
396
- "eval_loss": 2.997723340988159,
397
- "eval_runtime": 9.2695,
398
- "eval_samples_per_second": 12767.769,
399
- "eval_steps_per_second": 6.257,
400
- "step": 2600
401
- },
402
- {
403
- "epoch": 11.637931034482758,
404
- "grad_norm": 0.23494461178779602,
405
- "learning_rate": 0.0019384147312427463,
406
- "loss": 1.7675,
407
- "step": 2700
408
- },
409
- {
410
- "epoch": 11.637931034482758,
411
- "eval_loss": 2.9378740787506104,
412
- "eval_runtime": 9.4233,
413
- "eval_samples_per_second": 12559.421,
414
- "eval_steps_per_second": 6.155,
415
- "step": 2700
416
- },
417
- {
418
- "epoch": 12.068965517241379,
419
- "grad_norm": 0.28737518191337585,
420
- "learning_rate": 0.001933629155790508,
421
- "loss": 1.7851,
422
- "step": 2800
423
- },
424
- {
425
- "epoch": 12.068965517241379,
426
- "eval_loss": 3.1028385162353516,
427
- "eval_runtime": 9.2265,
428
- "eval_samples_per_second": 12827.292,
429
- "eval_steps_per_second": 6.286,
430
- "step": 2800
431
- },
432
- {
433
- "epoch": 12.5,
434
- "grad_norm": 0.2447143793106079,
435
- "learning_rate": 0.0019286708997588277,
436
- "loss": 1.6799,
437
- "step": 2900
438
- },
439
- {
440
- "epoch": 12.5,
441
- "eval_loss": 3.0367963314056396,
442
- "eval_runtime": 9.4036,
443
- "eval_samples_per_second": 12585.728,
444
- "eval_steps_per_second": 6.168,
445
- "step": 2900
446
- },
447
- {
448
- "epoch": 12.931034482758621,
449
- "grad_norm": 0.23348909616470337,
450
- "learning_rate": 0.0019235408802083177,
451
- "loss": 1.7546,
452
- "step": 3000
453
- },
454
- {
455
- "epoch": 12.931034482758621,
456
- "eval_loss": 2.9751391410827637,
457
- "eval_runtime": 9.3086,
458
- "eval_samples_per_second": 12714.109,
459
- "eval_steps_per_second": 6.231,
460
- "step": 3000
461
- },
462
- {
463
- "epoch": 13.362068965517242,
464
- "grad_norm": 0.23927970230579376,
465
- "learning_rate": 0.0019182400459683319,
466
- "loss": 1.6305,
467
- "step": 3100
468
- },
469
- {
470
- "epoch": 13.362068965517242,
471
- "eval_loss": 3.11104679107666,
472
- "eval_runtime": 9.2164,
473
- "eval_samples_per_second": 12841.37,
474
- "eval_steps_per_second": 6.293,
475
- "step": 3100
476
- },
477
- {
478
- "epoch": 13.793103448275861,
479
- "grad_norm": 0.2492324709892273,
480
- "learning_rate": 0.0019127693774614737,
481
- "loss": 1.6843,
482
- "step": 3200
483
- },
484
- {
485
- "epoch": 13.793103448275861,
486
- "eval_loss": 3.0621368885040283,
487
- "eval_runtime": 9.6341,
488
- "eval_samples_per_second": 12284.569,
489
- "eval_steps_per_second": 6.02,
490
- "step": 3200
491
- },
492
- {
493
- "epoch": 14.224137931034482,
494
- "grad_norm": 0.2568209171295166,
495
- "learning_rate": 0.0019071298865222619,
496
- "loss": 1.6259,
497
- "step": 3300
498
- },
499
- {
500
- "epoch": 14.224137931034482,
501
- "eval_loss": 3.23667311668396,
502
- "eval_runtime": 9.3875,
503
- "eval_samples_per_second": 12607.29,
504
- "eval_steps_per_second": 6.178,
505
- "step": 3300
506
- },
507
- {
508
- "epoch": 14.655172413793103,
509
- "grad_norm": 0.2605755627155304,
510
- "learning_rate": 0.0019013226162099856,
511
- "loss": 1.6142,
512
- "step": 3400
513
- },
514
- {
515
- "epoch": 14.655172413793103,
516
- "eval_loss": 3.167109489440918,
517
- "eval_runtime": 9.2685,
518
- "eval_samples_per_second": 12769.199,
519
- "eval_steps_per_second": 6.258,
520
- "step": 3400
521
- },
522
- {
523
- "epoch": 15.086206896551724,
524
- "grad_norm": 0.23708142340183258,
525
- "learning_rate": 0.001895348640615783,
526
- "loss": 1.6297,
527
- "step": 3500
528
- },
529
- {
530
- "epoch": 15.086206896551724,
531
- "eval_loss": 3.3472063541412354,
532
- "eval_runtime": 9.2633,
533
- "eval_samples_per_second": 12776.346,
534
- "eval_steps_per_second": 6.261,
535
- "step": 3500
536
- },
537
- {
538
- "epoch": 15.517241379310345,
539
- "grad_norm": 0.26707184314727783,
540
- "learning_rate": 0.0018892090646639814,
541
- "loss": 1.5412,
542
- "step": 3600
543
- },
544
- {
545
- "epoch": 15.517241379310345,
546
- "eval_loss": 3.2221648693084717,
547
- "eval_runtime": 9.3835,
548
- "eval_samples_per_second": 12612.691,
549
- "eval_steps_per_second": 6.181,
550
- "step": 3600
551
- },
552
- {
553
- "epoch": 15.948275862068966,
554
- "grad_norm": 0.281376451253891,
555
- "learning_rate": 0.001882905023907735,
556
- "loss": 1.612,
557
- "step": 3700
558
- },
559
- {
560
- "epoch": 15.948275862068966,
561
- "eval_loss": 3.1913859844207764,
562
- "eval_runtime": 9.2915,
563
- "eval_samples_per_second": 12737.598,
564
- "eval_steps_per_second": 6.242,
565
- "step": 3700
566
- },
567
- {
568
- "epoch": 16.379310344827587,
569
- "grad_norm": 0.3034946322441101,
570
- "learning_rate": 0.0018764376843189978,
571
- "loss": 1.4939,
572
- "step": 3800
573
- },
574
- {
575
- "epoch": 16.379310344827587,
576
- "eval_loss": 3.3291513919830322,
577
- "eval_runtime": 9.3993,
578
- "eval_samples_per_second": 12591.436,
579
- "eval_steps_per_second": 6.171,
580
- "step": 3800
581
- },
582
- {
583
- "epoch": 16.810344827586206,
584
- "grad_norm": 0.2752383053302765,
585
- "learning_rate": 0.0018698082420728684,
586
- "loss": 1.5533,
587
- "step": 3900
588
- },
589
- {
590
- "epoch": 16.810344827586206,
591
- "eval_loss": 3.2778160572052,
592
- "eval_runtime": 9.2577,
593
- "eval_samples_per_second": 12784.003,
594
- "eval_steps_per_second": 6.265,
595
- "step": 3900
596
- },
597
- {
598
- "epoch": 17.24137931034483,
599
- "grad_norm": 0.24308009445667267,
600
- "learning_rate": 0.0018630179233263503,
601
- "loss": 1.4946,
602
- "step": 4000
603
- },
604
- {
605
- "epoch": 17.24137931034483,
606
- "eval_loss": 3.4539055824279785,
607
- "eval_runtime": 9.2861,
608
- "eval_samples_per_second": 12744.951,
609
- "eval_steps_per_second": 6.246,
610
- "step": 4000
611
- },
612
- {
613
- "epoch": 17.67241379310345,
614
- "grad_norm": 0.27771443128585815,
615
- "learning_rate": 0.0018560679839915678,
616
- "loss": 1.4915,
617
- "step": 4100
618
- },
619
- {
620
- "epoch": 17.67241379310345,
621
- "eval_loss": 3.3372902870178223,
622
- "eval_runtime": 9.3808,
623
- "eval_samples_per_second": 12616.313,
624
- "eval_steps_per_second": 6.183,
625
- "step": 4100
626
- },
627
- {
628
- "epoch": 18.103448275862068,
629
- "grad_norm": 0.23893053829669952,
630
- "learning_rate": 0.0018489597095034747,
631
- "loss": 1.502,
632
- "step": 4200
633
- },
634
- {
635
- "epoch": 18.103448275862068,
636
- "eval_loss": 3.5338973999023438,
637
- "eval_runtime": 9.3911,
638
- "eval_samples_per_second": 12602.423,
639
- "eval_steps_per_second": 6.176,
640
- "step": 4200
641
- },
642
- {
643
- "epoch": 18.53448275862069,
644
- "grad_norm": 0.27007514238357544,
645
- "learning_rate": 0.0018416944145821076,
646
- "loss": 1.4326,
647
- "step": 4300
648
- },
649
- {
650
- "epoch": 18.53448275862069,
651
- "eval_loss": 3.4548091888427734,
652
- "eval_runtime": 9.4336,
653
- "eval_samples_per_second": 12545.738,
654
- "eval_steps_per_second": 6.148,
655
- "step": 4300
656
- },
657
- {
658
- "epoch": 18.96551724137931,
659
- "grad_norm": 0.2778242528438568,
660
- "learning_rate": 0.0018342734429894186,
661
- "loss": 1.4974,
662
- "step": 4400
663
- },
664
- {
665
- "epoch": 18.96551724137931,
666
- "eval_loss": 3.3584742546081543,
667
- "eval_runtime": 9.3945,
668
- "eval_samples_per_second": 12597.905,
669
- "eval_steps_per_second": 6.174,
670
- "step": 4400
671
- },
672
- {
673
- "epoch": 19.396551724137932,
674
- "grad_norm": 0.2755739390850067,
675
- "learning_rate": 0.0018266981672807382,
676
- "loss": 1.3891,
677
- "step": 4500
678
- },
679
- {
680
- "epoch": 19.396551724137932,
681
- "eval_loss": 3.53035831451416,
682
- "eval_runtime": 9.3977,
683
- "eval_samples_per_second": 12593.626,
684
- "eval_steps_per_second": 6.172,
685
- "step": 4500
686
- },
687
- {
688
- "epoch": 19.82758620689655,
689
- "grad_norm": 0.2956218719482422,
690
- "learning_rate": 0.0018189699885509127,
691
- "loss": 1.4471,
692
- "step": 4600
693
- },
694
- {
695
- "epoch": 19.82758620689655,
696
- "eval_loss": 3.429187536239624,
697
- "eval_runtime": 9.3785,
698
- "eval_samples_per_second": 12619.367,
699
- "eval_steps_per_second": 6.184,
700
- "step": 4600
701
- },
702
- {
703
- "epoch": 20.25862068965517,
704
- "grad_norm": 0.26031580567359924,
705
- "learning_rate": 0.0018110903361751639,
706
- "loss": 1.3933,
707
- "step": 4700
708
- },
709
- {
710
- "epoch": 20.25862068965517,
711
- "eval_loss": 3.6417198181152344,
712
- "eval_runtime": 9.2276,
713
- "eval_samples_per_second": 12825.78,
714
- "eval_steps_per_second": 6.286,
715
- "step": 4700
716
- },
717
- {
718
- "epoch": 20.689655172413794,
719
- "grad_norm": 0.29798653721809387,
720
- "learning_rate": 0.0018030606675447152,
721
- "loss": 1.3977,
722
- "step": 4800
723
- },
724
- {
725
- "epoch": 20.689655172413794,
726
- "eval_loss": 3.537259817123413,
727
- "eval_runtime": 9.3854,
728
- "eval_samples_per_second": 12610.18,
729
- "eval_steps_per_second": 6.18,
730
- "step": 4800
731
- },
732
- {
733
- "epoch": 21.120689655172413,
734
- "grad_norm": 0.2423907071352005,
735
- "learning_rate": 0.0017948824677972398,
736
- "loss": 1.4009,
737
- "step": 4900
738
- },
739
- {
740
- "epoch": 21.120689655172413,
741
- "eval_loss": 3.746032476425171,
742
- "eval_runtime": 9.4268,
743
- "eval_samples_per_second": 12554.68,
744
- "eval_steps_per_second": 6.153,
745
- "step": 4900
746
- },
747
- {
748
- "epoch": 21.551724137931036,
749
- "grad_norm": 0.285411536693573,
750
- "learning_rate": 0.0017865572495421743,
751
- "loss": 1.3483,
752
- "step": 5000
753
- },
754
- {
755
- "epoch": 21.551724137931036,
756
- "eval_loss": 3.626232624053955,
757
- "eval_runtime": 9.2375,
758
- "eval_samples_per_second": 12812.033,
759
- "eval_steps_per_second": 6.279,
760
- "step": 5000
761
- },
762
- {
763
- "epoch": 21.982758620689655,
764
- "grad_norm": 0.2717497944831848,
765
- "learning_rate": 0.0017780865525809514,
766
- "loss": 1.4067,
767
- "step": 5100
768
- },
769
- {
770
- "epoch": 21.982758620689655,
771
- "eval_loss": 3.52826189994812,
772
- "eval_runtime": 9.3933,
773
- "eval_samples_per_second": 12599.514,
774
- "eval_steps_per_second": 6.175,
775
- "step": 5100
776
- },
777
- {
778
- "epoch": 22.413793103448278,
779
- "grad_norm": 0.2960723042488098,
780
- "learning_rate": 0.001769471943622206,
781
- "loss": 1.3058,
782
- "step": 5200
783
- },
784
- {
785
- "epoch": 22.413793103448278,
786
- "eval_loss": 3.7002322673797607,
787
- "eval_runtime": 9.2501,
788
- "eval_samples_per_second": 12794.575,
789
- "eval_steps_per_second": 6.27,
790
- "step": 5200
791
- },
792
- {
793
- "epoch": 22.844827586206897,
794
- "grad_norm": 0.2612112760543823,
795
- "learning_rate": 0.001760715015992,
796
- "loss": 1.3648,
797
- "step": 5300
798
- },
799
- {
800
- "epoch": 22.844827586206897,
801
- "eval_loss": 3.6350343227386475,
802
- "eval_runtime": 9.3548,
803
- "eval_samples_per_second": 12651.31,
804
- "eval_steps_per_second": 6.2,
805
- "step": 5300
806
- },
807
- {
808
- "epoch": 23.275862068965516,
809
- "grad_norm": 0.2936803996562958,
810
- "learning_rate": 0.0017518173893391294,
811
- "loss": 1.3116,
812
- "step": 5400
813
- },
814
- {
815
- "epoch": 23.275862068965516,
816
- "eval_loss": 3.8155357837677,
817
- "eval_runtime": 9.3802,
818
- "eval_samples_per_second": 12617.144,
819
- "eval_steps_per_second": 6.183,
820
- "step": 5400
821
- },
822
- {
823
- "epoch": 23.70689655172414,
824
- "grad_norm": 0.31589367985725403,
825
- "learning_rate": 0.0017427807093355573,
826
- "loss": 1.3237,
827
- "step": 5500
828
- },
829
- {
830
- "epoch": 23.70689655172414,
831
- "eval_loss": 3.7021758556365967,
832
- "eval_runtime": 9.2626,
833
- "eval_samples_per_second": 12777.362,
834
- "eval_steps_per_second": 6.262,
835
- "step": 5500
836
- },
837
- {
838
- "epoch": 24.137931034482758,
839
- "grad_norm": 0.2593901753425598,
840
- "learning_rate": 0.0017336066473720379,
841
- "loss": 1.321,
842
- "step": 5600
843
- },
844
- {
845
- "epoch": 24.137931034482758,
846
- "eval_loss": 3.911588191986084,
847
- "eval_runtime": 9.3553,
848
- "eval_samples_per_second": 12650.682,
849
- "eval_steps_per_second": 6.2,
850
- "step": 5600
851
- },
852
- {
853
- "epoch": 24.56896551724138,
854
- "grad_norm": 0.25670963525772095,
855
- "learning_rate": 0.0017242969002489832,
856
- "loss": 1.2831,
857
- "step": 5700
858
- },
859
- {
860
- "epoch": 24.56896551724138,
861
- "eval_loss": 3.7876436710357666,
862
- "eval_runtime": 9.2098,
863
- "eval_samples_per_second": 12850.53,
864
- "eval_steps_per_second": 6.298,
865
- "step": 5700
866
- },
867
- {
868
- "epoch": 25.0,
869
- "grad_norm": 0.2661011815071106,
870
- "learning_rate": 0.0017148531898626277,
871
- "loss": 1.3339,
872
- "step": 5800
873
- },
874
- {
875
- "epoch": 25.0,
876
- "eval_loss": 3.7584354877471924,
877
- "eval_runtime": 9.3748,
878
- "eval_samples_per_second": 12624.378,
879
- "eval_steps_per_second": 6.187,
880
- "step": 5800
881
- },
882
- {
883
- "epoch": 25.43103448275862,
884
- "grad_norm": 0.28436627984046936,
885
- "learning_rate": 0.0017052772628865526,
886
- "loss": 1.2398,
887
- "step": 5900
888
- },
889
- {
890
- "epoch": 25.43103448275862,
891
- "eval_loss": 3.871803045272827,
892
- "eval_runtime": 9.2847,
893
- "eval_samples_per_second": 12746.93,
894
- "eval_steps_per_second": 6.247,
895
- "step": 5900
896
- },
897
- {
898
- "epoch": 25.862068965517242,
899
- "grad_norm": 0.29255756735801697,
900
- "learning_rate": 0.0016955708904486295,
901
- "loss": 1.2999,
902
- "step": 6000
903
- },
904
- {
905
- "epoch": 25.862068965517242,
906
- "eval_loss": 3.7788214683532715,
907
- "eval_runtime": 9.402,
908
- "eval_samples_per_second": 12587.813,
909
- "eval_steps_per_second": 6.169,
910
- "step": 6000
911
- },
912
- {
913
- "epoch": 26.29310344827586,
914
- "grad_norm": 0.2579859793186188,
915
- "learning_rate": 0.0016857358678034368,
916
- "loss": 1.2468,
917
- "step": 6100
918
- },
919
- {
920
- "epoch": 26.29310344827586,
921
- "eval_loss": 3.988563060760498,
922
- "eval_runtime": 9.3885,
923
- "eval_samples_per_second": 12605.909,
924
- "eval_steps_per_second": 6.178,
925
- "step": 6100
926
- },
927
- {
928
- "epoch": 26.724137931034484,
929
- "grad_norm": 0.2829207479953766,
930
- "learning_rate": 0.0016757740140002178,
931
- "loss": 1.2634,
932
- "step": 6200
933
- },
934
- {
935
- "epoch": 26.724137931034484,
936
- "eval_loss": 3.8294568061828613,
937
- "eval_runtime": 9.2608,
938
- "eval_samples_per_second": 12779.842,
939
- "eval_steps_per_second": 6.263,
940
- "step": 6200
941
- },
942
- {
943
- "epoch": 27.155172413793103,
944
- "grad_norm": 0.26081788539886475,
945
- "learning_rate": 0.0016656871715464352,
946
- "loss": 1.2572,
947
- "step": 6300
948
- },
949
- {
950
- "epoch": 27.155172413793103,
951
- "eval_loss": 4.046365737915039,
952
- "eval_runtime": 9.3759,
953
- "eval_samples_per_second": 12622.851,
954
- "eval_steps_per_second": 6.186,
955
- "step": 6300
956
- },
957
- {
958
- "epoch": 27.586206896551722,
959
- "grad_norm": 0.27960658073425293,
960
- "learning_rate": 0.001655477206066988,
961
- "loss": 1.2288,
962
- "step": 6400
963
- },
964
- {
965
- "epoch": 27.586206896551722,
966
- "eval_loss": 3.938596248626709,
967
- "eval_runtime": 9.2164,
968
- "eval_samples_per_second": 12841.411,
969
- "eval_steps_per_second": 6.293,
970
- "step": 6400
971
- },
972
- {
973
- "epoch": 28.017241379310345,
974
- "grad_norm": 0.283263236284256,
975
- "learning_rate": 0.0016451460059591532,
976
- "loss": 1.2708,
977
- "step": 6500
978
- },
979
- {
980
- "epoch": 28.017241379310345,
981
- "eval_loss": 4.127839088439941,
982
- "eval_runtime": 9.2469,
983
- "eval_samples_per_second": 12799.012,
984
- "eval_steps_per_second": 6.272,
985
- "step": 6500
986
- },
987
- {
988
- "epoch": 28.448275862068964,
989
- "grad_norm": 0.2840973138809204,
990
- "learning_rate": 0.001634695482043313,
991
- "loss": 1.1919,
992
- "step": 6600
993
- },
994
- {
995
- "epoch": 28.448275862068964,
996
- "eval_loss": 4.020354747772217,
997
- "eval_runtime": 9.5176,
998
- "eval_samples_per_second": 12434.937,
999
- "eval_steps_per_second": 6.094,
1000
- "step": 6600
1001
- },
1002
- {
1003
- "epoch": 28.879310344827587,
1004
- "grad_norm": 0.2763742208480835,
1005
- "learning_rate": 0.0016241275672095395,
1006
- "loss": 1.2454,
1007
- "step": 6700
1008
- },
1009
- {
1010
- "epoch": 28.879310344827587,
1011
- "eval_loss": 3.929727792739868,
1012
- "eval_runtime": 9.1984,
1013
- "eval_samples_per_second": 12866.438,
1014
- "eval_steps_per_second": 6.305,
1015
- "step": 6700
1016
- },
1017
- {
1018
- "epoch": 29.310344827586206,
1019
- "grad_norm": 0.24317006766796112,
1020
- "learning_rate": 0.0016134442160600903,
1021
- "loss": 1.1954,
1022
- "step": 6800
1023
- },
1024
- {
1025
- "epoch": 29.310344827586206,
1026
- "eval_loss": 4.102382659912109,
1027
- "eval_runtime": 9.3618,
1028
- "eval_samples_per_second": 12641.871,
1029
- "eval_steps_per_second": 6.195,
1030
- "step": 6800
1031
- },
1032
- {
1033
- "epoch": 29.74137931034483,
1034
- "grad_norm": 0.26351216435432434,
1035
- "learning_rate": 0.0016026474045478978,
1036
- "loss": 1.2149,
1037
- "step": 6900
1038
- },
1039
- {
1040
- "epoch": 29.74137931034483,
1041
- "eval_loss": 4.013133525848389,
1042
- "eval_runtime": 9.2046,
1043
- "eval_samples_per_second": 12857.79,
1044
- "eval_steps_per_second": 6.301,
1045
- "step": 6900
1046
- },
1047
- {
1048
- "epoch": 30.17241379310345,
1049
- "grad_norm": 0.2500752806663513,
1050
- "learning_rate": 0.0015917391296110992,
1051
- "loss": 1.205,
1052
- "step": 7000
1053
- },
1054
- {
1055
- "epoch": 30.17241379310345,
1056
- "eval_loss": 4.178187847137451,
1057
- "eval_runtime": 9.2498,
1058
- "eval_samples_per_second": 12794.992,
1059
- "eval_steps_per_second": 6.27,
1060
- "step": 7000
1061
- },
1062
- {
1063
- "epoch": 30.603448275862068,
1064
- "grad_norm": 0.27639082074165344,
1065
- "learning_rate": 0.0015807214088036938,
1066
- "loss": 1.1858,
1067
- "step": 7100
1068
- },
1069
- {
1070
- "epoch": 30.603448275862068,
1071
- "eval_loss": 4.095057487487793,
1072
- "eval_runtime": 9.5169,
1073
- "eval_samples_per_second": 12435.848,
1074
- "eval_steps_per_second": 6.094,
1075
- "step": 7100
1076
- },
1077
- {
1078
- "epoch": 31.03448275862069,
1079
- "grad_norm": 0.2760908305644989,
1080
- "learning_rate": 0.0015695962799223819,
1081
- "loss": 1.2155,
1082
- "step": 7200
1083
- },
1084
- {
1085
- "epoch": 31.03448275862069,
1086
- "eval_loss": 4.218206405639648,
1087
- "eval_runtime": 9.2226,
1088
- "eval_samples_per_second": 12832.692,
1089
- "eval_steps_per_second": 6.289,
1090
- "step": 7200
1091
- },
1092
- {
1093
- "epoch": 31.46551724137931,
1094
- "grad_norm": 0.28241831064224243,
1095
- "learning_rate": 0.0015583658006296623,
1096
- "loss": 1.1527,
1097
- "step": 7300
1098
- },
1099
- {
1100
- "epoch": 31.46551724137931,
1101
- "eval_loss": 4.151749610900879,
1102
- "eval_runtime": 9.4127,
1103
- "eval_samples_per_second": 12573.563,
1104
- "eval_steps_per_second": 6.162,
1105
- "step": 7300
1106
- },
1107
- {
1108
- "epoch": 31.896551724137932,
1109
- "grad_norm": 0.28293758630752563,
1110
- "learning_rate": 0.0015470320480732548,
1111
- "loss": 1.2015,
1112
- "step": 7400
1113
- },
1114
- {
1115
- "epoch": 31.896551724137932,
1116
- "eval_loss": 4.056530475616455,
1117
- "eval_runtime": 9.2396,
1118
- "eval_samples_per_second": 12809.071,
1119
- "eval_steps_per_second": 6.277,
1120
- "step": 7400
1121
- },
1122
- {
1123
- "epoch": 32.327586206896555,
1124
- "grad_norm": 0.26155322790145874,
1125
- "learning_rate": 0.0015355971185019174,
1126
- "loss": 1.1499,
1127
- "step": 7500
1128
- },
1129
- {
1130
- "epoch": 32.327586206896555,
1131
- "eval_loss": 4.238384246826172,
1132
- "eval_runtime": 9.2212,
1133
- "eval_samples_per_second": 12834.667,
1134
- "eval_steps_per_second": 6.29,
1135
- "step": 7500
1136
- },
1137
- {
1138
- "epoch": 32.758620689655174,
1139
- "grad_norm": 0.26788121461868286,
1140
- "learning_rate": 0.0015240631268777327,
1141
- "loss": 1.1748,
1142
- "step": 7600
1143
- },
1144
- {
1145
- "epoch": 32.758620689655174,
1146
- "eval_loss": 4.0977630615234375,
1147
- "eval_runtime": 9.4085,
1148
- "eval_samples_per_second": 12579.119,
1149
- "eval_steps_per_second": 6.165,
1150
- "step": 7600
1151
- },
1152
- {
1153
- "epoch": 33.189655172413794,
1154
- "grad_norm": 0.24862082302570343,
1155
- "learning_rate": 0.0015124322064849342,
1156
- "loss": 1.1615,
1157
- "step": 7700
1158
- },
1159
- {
1160
- "epoch": 33.189655172413794,
1161
- "eval_loss": 4.326200008392334,
1162
- "eval_runtime": 9.3683,
1163
- "eval_samples_per_second": 12633.085,
1164
- "eval_steps_per_second": 6.191,
1165
- "step": 7700
1166
- },
1167
- {
1168
- "epoch": 33.62068965517241,
1169
- "grad_norm": 0.24566680192947388,
1170
- "learning_rate": 0.0015007065085353387,
1171
- "loss": 1.1482,
1172
- "step": 7800
1173
- },
1174
- {
1175
- "epoch": 33.62068965517241,
1176
- "eval_loss": 4.191620826721191,
1177
- "eval_runtime": 9.5145,
1178
- "eval_samples_per_second": 12439.027,
1179
- "eval_steps_per_second": 6.096,
1180
- "step": 7800
1181
- },
1182
- {
1183
- "epoch": 34.05172413793103,
1184
- "grad_norm": 0.2441052347421646,
1185
- "learning_rate": 0.001488888201770468,
1186
- "loss": 1.1722,
1187
- "step": 7900
1188
- },
1189
- {
1190
- "epoch": 34.05172413793103,
1191
- "eval_loss": 4.365562438964844,
1192
- "eval_runtime": 9.2343,
1193
- "eval_samples_per_second": 12816.408,
1194
- "eval_steps_per_second": 6.281,
1195
- "step": 7900
1196
- },
1197
- {
1198
- "epoch": 34.48275862068966,
1199
- "grad_norm": 0.2535702586174011,
1200
- "learning_rate": 0.0014769794720604263,
1201
- "loss": 1.1194,
1202
- "step": 8000
1203
- },
1204
- {
1205
- "epoch": 34.48275862068966,
1206
- "eval_loss": 4.26139497756958,
1207
- "eval_runtime": 9.2337,
1208
- "eval_samples_per_second": 12817.319,
1209
- "eval_steps_per_second": 6.281,
1210
- "step": 8000
1211
- },
1212
- {
1213
- "epoch": 34.91379310344828,
1214
- "grad_norm": 0.25358447432518005,
1215
- "learning_rate": 0.0014649825219996107,
1216
- "loss": 1.1624,
1217
- "step": 8100
1218
- },
1219
- {
1220
- "epoch": 34.91379310344828,
1221
- "eval_loss": 4.159610748291016,
1222
- "eval_runtime": 9.3698,
1223
- "eval_samples_per_second": 12631.093,
1224
- "eval_steps_per_second": 6.19,
1225
- "step": 8100
1226
- },
1227
- {
1228
- "epoch": 35.3448275862069,
1229
- "grad_norm": 0.2515275776386261,
1230
- "learning_rate": 0.0014528995704993248,
1231
- "loss": 1.1138,
1232
- "step": 8200
1233
- },
1234
- {
1235
- "epoch": 35.3448275862069,
1236
- "eval_loss": 4.359853267669678,
1237
- "eval_runtime": 9.249,
1238
- "eval_samples_per_second": 12796.154,
1239
- "eval_steps_per_second": 6.271,
1240
- "step": 8200
1241
- },
1242
- {
1243
- "epoch": 35.775862068965516,
1244
- "grad_norm": 0.2827693819999695,
1245
- "learning_rate": 0.0014407328523773804,
1246
- "loss": 1.1399,
1247
- "step": 8300
1248
- },
1249
- {
1250
- "epoch": 35.775862068965516,
1251
- "eval_loss": 4.250208377838135,
1252
- "eval_runtime": 9.5293,
1253
- "eval_samples_per_second": 12419.637,
1254
- "eval_steps_per_second": 6.086,
1255
- "step": 8300
1256
- },
1257
- {
1258
- "epoch": 36.206896551724135,
1259
- "grad_norm": 0.25135308504104614,
1260
- "learning_rate": 0.0014284846179447514,
1261
- "loss": 1.1223,
1262
- "step": 8400
1263
- },
1264
- {
1265
- "epoch": 36.206896551724135,
1266
- "eval_loss": 4.41204833984375,
1267
- "eval_runtime": 9.2023,
1268
- "eval_samples_per_second": 12861.028,
1269
- "eval_steps_per_second": 6.303,
1270
- "step": 8400
1271
- },
1272
- {
1273
- "epoch": 36.63793103448276,
1274
- "grad_norm": 0.2558540999889374,
1275
- "learning_rate": 0.0014161571325893665,
1276
- "loss": 1.1157,
1277
- "step": 8500
1278
- },
1279
- {
1280
- "epoch": 36.63793103448276,
1281
- "eval_loss": 4.329861640930176,
1282
- "eval_runtime": 9.2024,
1283
- "eval_samples_per_second": 12860.85,
1284
- "eval_steps_per_second": 6.303,
1285
- "step": 8500
1286
- },
1287
- {
1288
- "epoch": 37.06896551724138,
1289
- "grad_norm": 0.23830156028270721,
1290
- "learning_rate": 0.0014037526763571095,
1291
- "loss": 1.1328,
1292
- "step": 8600
1293
- },
1294
- {
1295
- "epoch": 37.06896551724138,
1296
- "eval_loss": 4.509239673614502,
1297
- "eval_runtime": 9.3506,
1298
- "eval_samples_per_second": 12657.0,
1299
- "eval_steps_per_second": 6.203,
1300
- "step": 8600
1301
- },
1302
- {
1303
- "epoch": 37.5,
1304
- "grad_norm": 0.24350416660308838,
1305
- "learning_rate": 0.0013912735435301126,
1306
- "loss": 1.0913,
1307
- "step": 8700
1308
- },
1309
- {
1310
- "epoch": 37.5,
1311
- "eval_loss": 4.415010452270508,
1312
- "eval_runtime": 9.2551,
1313
- "eval_samples_per_second": 12787.61,
1314
- "eval_steps_per_second": 6.267,
1315
- "step": 8700
1316
- },
1317
- {
1318
- "epoch": 37.93103448275862,
1319
- "grad_norm": 0.27101027965545654,
1320
- "learning_rate": 0.0013787220422024134,
1321
- "loss": 1.1287,
1322
- "step": 8800
1323
- },
1324
- {
1325
- "epoch": 37.93103448275862,
1326
- "eval_loss": 4.311288833618164,
1327
- "eval_runtime": 9.5314,
1328
- "eval_samples_per_second": 12416.995,
1329
- "eval_steps_per_second": 6.085,
1330
- "step": 8800
1331
- },
1332
- {
1333
- "epoch": 38.36206896551724,
1334
- "grad_norm": 0.23868116736412048,
1335
- "learning_rate": 0.0013661004938530573,
1336
- "loss": 1.0804,
1337
- "step": 8900
1338
- },
1339
- {
1340
- "epoch": 38.36206896551724,
1341
- "eval_loss": 4.474818229675293,
1342
- "eval_runtime": 9.2149,
1343
- "eval_samples_per_second": 12843.381,
1344
- "eval_steps_per_second": 6.294,
1345
- "step": 8900
1346
- },
1347
- {
1348
- "epoch": 38.793103448275865,
1349
- "grad_norm": 0.2615257799625397,
1350
- "learning_rate": 0.0013534112329167277,
1351
- "loss": 1.1087,
1352
- "step": 9000
1353
- },
1354
- {
1355
- "epoch": 38.793103448275865,
1356
- "eval_loss": 4.382425308227539,
1357
- "eval_runtime": 9.2362,
1358
- "eval_samples_per_second": 12813.759,
1359
- "eval_steps_per_second": 6.28,
1360
- "step": 9000
1361
- },
1362
- {
1363
- "epoch": 39.224137931034484,
1364
- "grad_norm": 0.23843148350715637,
1365
- "learning_rate": 0.0013406566063519764,
1366
- "loss": 1.0892,
1367
- "step": 9100
1368
- },
1369
- {
1370
- "epoch": 39.224137931034484,
1371
- "eval_loss": 4.5527825355529785,
1372
- "eval_runtime": 9.4035,
1373
- "eval_samples_per_second": 12585.803,
1374
- "eval_steps_per_second": 6.168,
1375
- "step": 9100
1376
- },
1377
- {
1378
- "epoch": 39.6551724137931,
1379
- "grad_norm": 0.2602212131023407,
1380
- "learning_rate": 0.0013278389732071388,
1381
- "loss": 1.0872,
1382
- "step": 9200
1383
- },
1384
- {
1385
- "epoch": 39.6551724137931,
1386
- "eval_loss": 4.4413909912109375,
1387
- "eval_runtime": 9.2185,
1388
- "eval_samples_per_second": 12838.457,
1389
- "eval_steps_per_second": 6.292,
1390
- "step": 9200
1391
- },
1392
- {
1393
- "epoch": 40.08620689655172,
1394
- "grad_norm": 0.2304549515247345,
1395
- "learning_rate": 0.0013149607041840125,
1396
- "loss": 1.0997,
1397
- "step": 9300
1398
- },
1399
- {
1400
- "epoch": 40.08620689655172,
1401
- "eval_loss": 4.610817909240723,
1402
- "eval_runtime": 9.3893,
1403
- "eval_samples_per_second": 12604.936,
1404
- "eval_steps_per_second": 6.177,
1405
- "step": 9300
1406
- },
1407
- {
1408
- "epoch": 40.51724137931034,
1409
- "grad_norm": 0.24227924644947052,
1410
- "learning_rate": 0.0013020241811993831,
1411
- "loss": 1.0649,
1412
- "step": 9400
1413
- },
1414
- {
1415
- "epoch": 40.51724137931034,
1416
- "eval_loss": 4.493175506591797,
1417
- "eval_runtime": 9.3864,
1418
- "eval_samples_per_second": 12608.751,
1419
- "eval_steps_per_second": 6.179,
1420
- "step": 9400
1421
- },
1422
- {
1423
- "epoch": 40.94827586206897,
1424
- "grad_norm": 0.28171467781066895,
1425
- "learning_rate": 0.0012890317969444715,
1426
- "loss": 1.1007,
1427
- "step": 9500
1428
- },
1429
- {
1430
- "epoch": 40.94827586206897,
1431
- "eval_loss": 4.4659504890441895,
1432
- "eval_runtime": 9.3378,
1433
- "eval_samples_per_second": 12674.445,
1434
- "eval_steps_per_second": 6.211,
1435
- "step": 9500
1436
- },
1437
- {
1438
- "epoch": 41.37931034482759,
1439
- "grad_norm": 0.22989529371261597,
1440
- "learning_rate": 0.0012759859544423924,
1441
- "loss": 1.0526,
1442
- "step": 9600
1443
- },
1444
- {
1445
- "epoch": 41.37931034482759,
1446
- "eval_loss": 4.579300880432129,
1447
- "eval_runtime": 9.346,
1448
- "eval_samples_per_second": 12663.266,
1449
- "eval_steps_per_second": 6.206,
1450
- "step": 9600
1451
- },
1452
- {
1453
- "epoch": 41.810344827586206,
1454
- "grad_norm": 0.22905075550079346,
1455
- "learning_rate": 0.001262889066603698,
1456
- "loss": 1.0816,
1457
- "step": 9700
1458
- },
1459
- {
1460
- "epoch": 41.810344827586206,
1461
- "eval_loss": 4.495105743408203,
1462
- "eval_runtime": 9.2447,
1463
- "eval_samples_per_second": 12802.093,
1464
- "eval_steps_per_second": 6.274,
1465
- "step": 9700
1466
- },
1467
- {
1468
- "epoch": 42.241379310344826,
1469
- "grad_norm": 0.23457792401313782,
1470
- "learning_rate": 0.0012497435557800973,
1471
- "loss": 1.0612,
1472
- "step": 9800
1473
- },
1474
- {
1475
- "epoch": 42.241379310344826,
1476
- "eval_loss": 4.662010192871094,
1477
- "eval_runtime": 9.3901,
1478
- "eval_samples_per_second": 12603.862,
1479
- "eval_steps_per_second": 6.177,
1480
- "step": 9800
1481
- },
1482
- {
1483
- "epoch": 42.672413793103445,
1484
- "grad_norm": 0.24168674647808075,
1485
- "learning_rate": 0.0012365518533164264,
1486
- "loss": 1.0609,
1487
- "step": 9900
1488
- },
1489
- {
1490
- "epoch": 42.672413793103445,
1491
- "eval_loss": 4.597413539886475,
1492
- "eval_runtime": 9.2147,
1493
- "eval_samples_per_second": 12843.706,
1494
- "eval_steps_per_second": 6.294,
1495
- "step": 9900
1496
- },
1497
- {
1498
- "epoch": 43.10344827586207,
1499
- "grad_norm": 0.25161978602409363,
1500
- "learning_rate": 0.0012233163991009536,
1501
- "loss": 1.0705,
1502
- "step": 10000
1503
- },
1504
- {
1505
- "epoch": 43.10344827586207,
1506
- "eval_loss": 4.722944736480713,
1507
- "eval_runtime": 9.377,
1508
- "eval_samples_per_second": 12621.406,
1509
- "eval_steps_per_second": 6.185,
1510
- "step": 10000
1511
- },
1512
- {
1513
- "epoch": 43.53448275862069,
1514
- "grad_norm": 0.2435440868139267,
1515
- "learning_rate": 0.0012100396411141104,
1516
- "loss": 1.0413,
1517
- "step": 10100
1518
- },
1519
- {
1520
- "epoch": 43.53448275862069,
1521
- "eval_loss": 4.6760735511779785,
1522
- "eval_runtime": 9.4169,
1523
- "eval_samples_per_second": 12567.971,
1524
- "eval_steps_per_second": 6.159,
1525
- "step": 10100
1526
- },
1527
- {
1528
- "epoch": 43.96551724137931,
1529
- "grad_norm": 0.22854387760162354,
1530
- "learning_rate": 0.0011967240349757202,
1531
- "loss": 1.075,
1532
- "step": 10200
1533
- },
1534
- {
1535
- "epoch": 43.96551724137931,
1536
- "eval_loss": 4.56270170211792,
1537
- "eval_runtime": 9.2186,
1538
- "eval_samples_per_second": 12838.229,
1539
- "eval_steps_per_second": 6.292,
1540
- "step": 10200
1541
- },
1542
- {
1543
- "epoch": 44.39655172413793,
1544
- "grad_norm": 0.22642293572425842,
1545
- "learning_rate": 0.001183372043490815,
1546
- "loss": 1.0275,
1547
- "step": 10300
1548
- },
1549
- {
1550
- "epoch": 44.39655172413793,
1551
- "eval_loss": 4.700027942657471,
1552
- "eval_runtime": 9.3756,
1553
- "eval_samples_per_second": 12623.278,
1554
- "eval_steps_per_second": 6.186,
1555
- "step": 10300
1556
- },
1557
- {
1558
- "epoch": 44.827586206896555,
1559
- "grad_norm": 0.2266691029071808,
1560
- "learning_rate": 0.0011699861361941259,
1561
- "loss": 1.0559,
1562
- "step": 10400
1563
- },
1564
- {
1565
- "epoch": 44.827586206896555,
1566
- "eval_loss": 4.6264777183532715,
1567
- "eval_runtime": 9.2603,
1568
- "eval_samples_per_second": 12780.536,
1569
- "eval_steps_per_second": 6.263,
1570
- "step": 10400
1571
- },
1572
- {
1573
- "epoch": 45.258620689655174,
1574
- "grad_norm": 0.21492455899715424,
1575
- "learning_rate": 0.0011565687888933276,
1576
- "loss": 1.0347,
1577
- "step": 10500
1578
- },
1579
- {
1580
- "epoch": 45.258620689655174,
1581
- "eval_loss": 4.728958606719971,
1582
- "eval_runtime": 9.3904,
1583
- "eval_samples_per_second": 12603.413,
1584
- "eval_steps_per_second": 6.177,
1585
- "step": 10500
1586
- },
1587
- {
1588
- "epoch": 45.689655172413794,
1589
- "grad_norm": 0.21673394739627838,
1590
- "learning_rate": 0.0011431224832111197,
1591
- "loss": 1.0399,
1592
- "step": 10600
1593
- },
1594
- {
1595
- "epoch": 45.689655172413794,
1596
- "eval_loss": 4.708123207092285,
1597
- "eval_runtime": 9.2682,
1598
- "eval_samples_per_second": 12769.564,
1599
- "eval_steps_per_second": 6.258,
1600
- "step": 10600
1601
- },
1602
- {
1603
- "epoch": 46.12068965517241,
1604
- "grad_norm": 0.22129711508750916,
1605
- "learning_rate": 0.0011296497061262364,
1606
- "loss": 1.0426,
1607
- "step": 10700
1608
- },
1609
- {
1610
- "epoch": 46.12068965517241,
1611
- "eval_loss": 4.850292205810547,
1612
- "eval_runtime": 9.2246,
1613
- "eval_samples_per_second": 12829.961,
1614
- "eval_steps_per_second": 6.288,
1615
- "step": 10700
1616
- },
1617
- {
1618
- "epoch": 46.55172413793103,
1619
- "grad_norm": 0.24281208217144012,
1620
- "learning_rate": 0.0011161529495134655,
1621
- "loss": 1.0222,
1622
- "step": 10800
1623
- },
1624
- {
1625
- "epoch": 46.55172413793103,
1626
- "eval_loss": 4.75180196762085,
1627
- "eval_runtime": 9.3686,
1628
- "eval_samples_per_second": 12632.692,
1629
- "eval_steps_per_second": 6.191,
1630
- "step": 10800
1631
- },
1632
- {
1633
- "epoch": 46.98275862068966,
1634
- "grad_norm": 0.24109460413455963,
1635
- "learning_rate": 0.0011026347096827578,
1636
- "loss": 1.05,
1637
- "step": 10900
1638
- },
1639
- {
1640
- "epoch": 46.98275862068966,
1641
- "eval_loss": 4.682476043701172,
1642
- "eval_runtime": 9.3818,
1643
- "eval_samples_per_second": 12614.995,
1644
- "eval_steps_per_second": 6.182,
1645
- "step": 10900
1646
- },
1647
- {
1648
- "epoch": 47.41379310344828,
1649
- "grad_norm": 0.26203250885009766,
1650
- "learning_rate": 0.0010890974869175213,
1651
- "loss": 1.0048,
1652
- "step": 11000
1653
- },
1654
- {
1655
- "epoch": 47.41379310344828,
1656
- "eval_loss": 4.7717461585998535,
1657
- "eval_runtime": 9.2238,
1658
- "eval_samples_per_second": 12831.044,
1659
- "eval_steps_per_second": 6.288,
1660
- "step": 11000
1661
- },
1662
- {
1663
- "epoch": 47.8448275862069,
1664
- "grad_norm": 0.22025622427463531,
1665
- "learning_rate": 0.001075543785012176,
1666
- "loss": 1.0356,
1667
- "step": 11100
1668
- },
1669
- {
1670
- "epoch": 47.8448275862069,
1671
- "eval_loss": 4.745931148529053,
1672
- "eval_runtime": 9.3379,
1673
- "eval_samples_per_second": 12674.319,
1674
- "eval_steps_per_second": 6.211,
1675
- "step": 11100
1676
- },
1677
- {
1678
- "epoch": 48.275862068965516,
1679
- "grad_norm": 0.2139374315738678,
1680
- "learning_rate": 0.0010619761108090632,
1681
- "loss": 1.0127,
1682
- "step": 11200
1683
- },
1684
- {
1685
- "epoch": 48.275862068965516,
1686
- "eval_loss": 4.8763251304626465,
1687
- "eval_runtime": 9.1962,
1688
- "eval_samples_per_second": 12869.532,
1689
- "eval_steps_per_second": 6.307,
1690
- "step": 11200
1691
- },
1692
- {
1693
- "epoch": 48.706896551724135,
1694
- "grad_norm": 0.21663136780261993,
1695
- "learning_rate": 0.0010483969737347882,
1696
- "loss": 1.0203,
1697
- "step": 11300
1698
- },
1699
- {
1700
- "epoch": 48.706896551724135,
1701
- "eval_loss": 4.779572010040283,
1702
- "eval_runtime": 9.2083,
1703
- "eval_samples_per_second": 12852.706,
1704
- "eval_steps_per_second": 6.299,
1705
- "step": 11300
1706
- },
1707
- {
1708
- "epoch": 49.13793103448276,
1709
- "grad_norm": 0.19012530148029327,
1710
- "learning_rate": 0.0010348088853360865,
1711
- "loss": 1.0188,
1712
- "step": 11400
1713
- },
1714
- {
1715
- "epoch": 49.13793103448276,
1716
- "eval_loss": 4.949375629425049,
1717
- "eval_runtime": 9.4015,
1718
- "eval_samples_per_second": 12588.477,
1719
- "eval_steps_per_second": 6.169,
1720
- "step": 11400
1721
- },
1722
- {
1723
- "epoch": 49.56896551724138,
1724
- "grad_norm": 0.21295411884784698,
1725
- "learning_rate": 0.0010212143588152972,
1726
- "loss": 1.0028,
1727
- "step": 11500
1728
- },
1729
- {
1730
- "epoch": 49.56896551724138,
1731
- "eval_loss": 4.887173652648926,
1732
- "eval_runtime": 9.2299,
1733
- "eval_samples_per_second": 12822.585,
1734
- "eval_steps_per_second": 6.284,
1735
- "step": 11500
1736
- },
1737
- {
1738
- "epoch": 50.0,
1739
- "grad_norm": 0.2105313092470169,
1740
- "learning_rate": 0.0010076159085655307,
1741
- "loss": 1.0284,
1742
- "step": 11600
1743
- },
1744
- {
1745
- "epoch": 50.0,
1746
- "eval_loss": 4.840916633605957,
1747
- "eval_runtime": 9.517,
1748
- "eval_samples_per_second": 12435.741,
1749
- "eval_steps_per_second": 6.094,
1750
- "step": 11600
1751
- },
1752
- {
1753
- "epoch": 50.43103448275862,
1754
- "grad_norm": 0.20751339197158813,
1755
- "learning_rate": 0.0009940160497056154,
1756
- "loss": 0.986,
1757
- "step": 11700
1758
- },
1759
- {
1760
- "epoch": 50.43103448275862,
1761
- "eval_loss": 4.903811931610107,
1762
- "eval_runtime": 9.2176,
1763
- "eval_samples_per_second": 12839.722,
1764
- "eval_steps_per_second": 6.292,
1765
- "step": 11700
1766
- },
1767
- {
1768
- "epoch": 50.86206896551724,
1769
- "grad_norm": 0.22046105563640594,
1770
- "learning_rate": 0.0009804172976149107,
1771
- "loss": 1.015,
1772
- "step": 11800
1773
- },
1774
- {
1775
- "epoch": 50.86206896551724,
1776
- "eval_loss": 4.859233379364014,
1777
- "eval_runtime": 9.2588,
1778
- "eval_samples_per_second": 12782.512,
1779
- "eval_steps_per_second": 6.264,
1780
- "step": 11800
1781
- },
1782
- {
1783
- "epoch": 51.293103448275865,
1784
- "grad_norm": 0.20184583961963654,
1785
- "learning_rate": 0.0009668221674680736,
1786
- "loss": 0.9919,
1787
- "step": 11900
1788
- },
1789
- {
1790
- "epoch": 51.293103448275865,
1791
- "eval_loss": 4.959497451782227,
1792
- "eval_runtime": 9.4293,
1793
- "eval_samples_per_second": 12551.434,
1794
- "eval_steps_per_second": 6.151,
1795
- "step": 11900
1796
- },
1797
- {
1798
- "epoch": 51.724137931034484,
1799
- "grad_norm": 0.19977693259716034,
1800
- "learning_rate": 0.0009532331737698578,
1801
- "loss": 1.001,
1802
- "step": 12000
1803
- },
1804
- {
1805
- "epoch": 51.724137931034484,
1806
- "eval_loss": 4.924416542053223,
1807
- "eval_runtime": 9.2287,
1808
- "eval_samples_per_second": 12824.178,
1809
- "eval_steps_per_second": 6.285,
1810
- "step": 12000
1811
- },
1812
- {
1813
- "epoch": 52.1551724137931,
1814
- "grad_norm": 0.20899131894111633,
1815
- "learning_rate": 0.0009396528298900436,
1816
- "loss": 0.9974,
1817
- "step": 12100
1818
- },
1819
- {
1820
- "epoch": 52.1551724137931,
1821
- "eval_loss": 5.01582670211792,
1822
- "eval_runtime": 9.5191,
1823
- "eval_samples_per_second": 12432.95,
1824
- "eval_steps_per_second": 6.093,
1825
- "step": 12100
1826
- },
1827
- {
1828
- "epoch": 52.58620689655172,
1829
- "grad_norm": 0.20686785876750946,
1830
- "learning_rate": 0.0009260836475985731,
1831
- "loss": 0.9863,
1832
- "step": 12200
1833
- },
1834
- {
1835
- "epoch": 52.58620689655172,
1836
- "eval_loss": 4.9655070304870605,
1837
- "eval_runtime": 9.2685,
1838
- "eval_samples_per_second": 12769.132,
1839
- "eval_steps_per_second": 6.258,
1840
- "step": 12200
1841
- },
1842
- {
1843
- "epoch": 53.01724137931034,
1844
- "grad_norm": 0.20469722151756287,
1845
- "learning_rate": 0.0009125281366009814,
1846
- "loss": 1.0061,
1847
- "step": 12300
1848
- },
1849
- {
1850
- "epoch": 53.01724137931034,
1851
- "eval_loss": 5.180333137512207,
1852
- "eval_runtime": 9.266,
1853
- "eval_samples_per_second": 12772.604,
1854
- "eval_steps_per_second": 6.259,
1855
- "step": 12300
1856
- },
1857
- {
1858
- "epoch": 53.44827586206897,
1859
- "grad_norm": 0.1923903524875641,
1860
- "learning_rate": 0.0008989888040742127,
1861
- "loss": 0.971,
1862
- "step": 12400
1863
- },
1864
- {
1865
- "epoch": 53.44827586206897,
1866
- "eval_loss": 4.984365463256836,
1867
- "eval_runtime": 9.4114,
1868
- "eval_samples_per_second": 12575.325,
1869
- "eval_steps_per_second": 6.163,
1870
- "step": 12400
1871
- },
1872
- {
1873
- "epoch": 53.87931034482759,
1874
- "grad_norm": 0.2009354680776596,
1875
- "learning_rate": 0.0008854681542029017,
1876
- "loss": 0.9965,
1877
- "step": 12500
1878
- },
1879
- {
1880
- "epoch": 53.87931034482759,
1881
- "eval_loss": 4.9768877029418945,
1882
- "eval_runtime": 9.2039,
1883
- "eval_samples_per_second": 12858.815,
1884
- "eval_steps_per_second": 6.302,
1885
- "step": 12500
1886
- },
1887
- {
1888
- "epoch": 54.310344827586206,
1889
- "grad_norm": 0.2066151350736618,
1890
- "learning_rate": 0.0008719686877162076,
1891
- "loss": 0.9717,
1892
- "step": 12600
1893
- },
1894
- {
1895
- "epoch": 54.310344827586206,
1896
- "eval_loss": 5.092445373535156,
1897
- "eval_runtime": 9.5207,
1898
- "eval_samples_per_second": 12430.879,
1899
- "eval_steps_per_second": 6.092,
1900
- "step": 12600
1901
- },
1902
- {
1903
- "epoch": 54.741379310344826,
1904
- "grad_norm": 0.1960015892982483,
1905
- "learning_rate": 0.0008584929014252898,
1906
- "loss": 0.9842,
1907
- "step": 12700
1908
- },
1909
- {
1910
- "epoch": 54.741379310344826,
1911
- "eval_loss": 4.997062683105469,
1912
- "eval_runtime": 9.2008,
1913
- "eval_samples_per_second": 12863.189,
1914
- "eval_steps_per_second": 6.304,
1915
- "step": 12700
1916
- },
1917
- {
1918
- "epoch": 55.172413793103445,
1919
- "grad_norm": 0.19153521955013275,
1920
- "learning_rate": 0.0008450432877615072,
1921
- "loss": 0.9782,
1922
- "step": 12800
1923
- },
1924
- {
1925
- "epoch": 55.172413793103445,
1926
- "eval_loss": 5.148665428161621,
1927
- "eval_runtime": 9.203,
1928
- "eval_samples_per_second": 12859.991,
1929
- "eval_steps_per_second": 6.302,
1930
- "step": 12800
1931
- },
1932
- {
1933
- "epoch": 55.60344827586207,
1934
- "grad_norm": 0.19057317078113556,
1935
- "learning_rate": 0.0008316223343154266,
1936
- "loss": 0.9708,
1937
- "step": 12900
1938
- },
1939
- {
1940
- "epoch": 55.60344827586207,
1941
- "eval_loss": 5.028558731079102,
1942
- "eval_runtime": 9.3777,
1943
- "eval_samples_per_second": 12620.494,
1944
- "eval_steps_per_second": 6.185,
1945
- "step": 12900
1946
- },
1947
- {
1948
- "epoch": 56.03448275862069,
1949
- "grad_norm": 0.18864724040031433,
1950
- "learning_rate": 0.0008182325233767267,
1951
- "loss": 0.9854,
1952
- "step": 13000
1953
- },
1954
- {
1955
- "epoch": 56.03448275862069,
1956
- "eval_loss": 5.237986087799072,
1957
- "eval_runtime": 9.2752,
1958
- "eval_samples_per_second": 12759.99,
1959
- "eval_steps_per_second": 6.253,
1960
- "step": 13000
1961
- },
1962
- {
1963
- "epoch": 56.46551724137931,
1964
- "grad_norm": 0.19948884844779968,
1965
- "learning_rate": 0.0008048763314750851,
1966
- "loss": 0.9562,
1967
- "step": 13100
1968
- },
1969
- {
1970
- "epoch": 56.46551724137931,
1971
- "eval_loss": 5.091177940368652,
1972
- "eval_runtime": 9.5374,
1973
- "eval_samples_per_second": 12409.191,
1974
- "eval_steps_per_second": 6.081,
1975
- "step": 13100
1976
- },
1977
- {
1978
- "epoch": 56.89655172413793,
1979
- "grad_norm": 0.1820322722196579,
1980
- "learning_rate": 0.0007915562289221262,
1981
- "loss": 0.9795,
1982
- "step": 13200
1983
- },
1984
- {
1985
- "epoch": 56.89655172413793,
1986
- "eval_loss": 5.0259809494018555,
1987
- "eval_runtime": 9.2875,
1988
- "eval_samples_per_second": 12743.094,
1989
- "eval_steps_per_second": 6.245,
1990
- "step": 13200
1991
- },
1992
- {
1993
- "epoch": 57.327586206896555,
1994
- "grad_norm": 0.18932276964187622,
1995
- "learning_rate": 0.0007782746793545225,
1996
- "loss": 0.9556,
1997
- "step": 13300
1998
- },
1999
- {
2000
- "epoch": 57.327586206896555,
2001
- "eval_loss": 5.16719388961792,
2002
- "eval_runtime": 9.2252,
2003
- "eval_samples_per_second": 12829.131,
2004
- "eval_steps_per_second": 6.287,
2005
- "step": 13300
2006
- },
2007
- {
2008
- "epoch": 57.758620689655174,
2009
- "grad_norm": 0.18779167532920837,
2010
- "learning_rate": 0.0007650341392783306,
2011
- "loss": 0.9689,
2012
- "step": 13400
2013
- },
2014
- {
2015
- "epoch": 57.758620689655174,
2016
- "eval_loss": 5.090976715087891,
2017
- "eval_runtime": 9.3802,
2018
- "eval_samples_per_second": 12617.111,
2019
- "eval_steps_per_second": 6.183,
2020
- "step": 13400
2021
- },
2022
- {
2023
- "epoch": 58.189655172413794,
2024
- "grad_norm": 0.1727251261472702,
2025
- "learning_rate": 0.0007518370576146431,
2026
- "loss": 0.9599,
2027
- "step": 13500
2028
- },
2029
- {
2030
- "epoch": 58.189655172413794,
2031
- "eval_loss": 5.20599889755249,
2032
- "eval_runtime": 9.2626,
2033
- "eval_samples_per_second": 12777.352,
2034
- "eval_steps_per_second": 6.262,
2035
- "step": 13500
2036
- },
2037
- {
2038
- "epoch": 58.62068965517241,
2039
- "grad_norm": 0.18696245551109314,
2040
- "learning_rate": 0.0007386858752466458,
2041
- "loss": 0.9559,
2042
- "step": 13600
2043
- },
2044
- {
2045
- "epoch": 58.62068965517241,
2046
- "eval_loss": 5.134738445281982,
2047
- "eval_runtime": 9.3926,
2048
- "eval_samples_per_second": 12600.491,
2049
- "eval_steps_per_second": 6.175,
2050
- "step": 13600
2051
- },
2052
- {
2053
- "epoch": 59.05172413793103,
2054
- "grad_norm": 0.19362613558769226,
2055
- "learning_rate": 0.0007255830245681625,
2056
- "loss": 0.9677,
2057
- "step": 13700
2058
- },
2059
- {
2060
- "epoch": 59.05172413793103,
2061
- "eval_loss": 5.218345642089844,
2062
- "eval_runtime": 9.2309,
2063
- "eval_samples_per_second": 12821.143,
2064
- "eval_steps_per_second": 6.283,
2065
- "step": 13700
2066
- },
2067
- {
2068
- "epoch": 59.48275862068966,
2069
- "grad_norm": 0.17938446998596191,
2070
- "learning_rate": 0.0007125309290337666,
2071
- "loss": 0.9431,
2072
- "step": 13800
2073
- },
2074
- {
2075
- "epoch": 59.48275862068966,
2076
- "eval_loss": 5.174517631530762,
2077
- "eval_runtime": 9.4671,
2078
- "eval_samples_per_second": 12501.228,
2079
- "eval_steps_per_second": 6.126,
2080
- "step": 13800
2081
- },
2082
- {
2083
- "epoch": 59.91379310344828,
2084
- "grad_norm": 0.17371727526187897,
2085
- "learning_rate": 0.000699532002710548,
2086
- "loss": 0.9641,
2087
- "step": 13900
2088
- },
2089
- {
2090
- "epoch": 59.91379310344828,
2091
- "eval_loss": 5.125094890594482,
2092
- "eval_runtime": 9.3495,
2093
- "eval_samples_per_second": 12658.593,
2094
- "eval_steps_per_second": 6.204,
2095
- "step": 13900
2096
- },
2097
- {
2098
- "epoch": 60.3448275862069,
2099
- "grad_norm": 0.1803039014339447,
2100
- "learning_rate": 0.0006865886498316185,
2101
- "loss": 0.9402,
2102
- "step": 14000
2103
- },
2104
- {
2105
- "epoch": 60.3448275862069,
2106
- "eval_loss": 5.229825496673584,
2107
- "eval_runtime": 9.2198,
2108
- "eval_samples_per_second": 12836.561,
2109
- "eval_steps_per_second": 6.291,
2110
- "step": 14000
2111
- },
2112
- {
2113
- "epoch": 60.775862068965516,
2114
- "grad_norm": 0.17432528734207153,
2115
- "learning_rate": 0.0006737032643514306,
2116
- "loss": 0.9542,
2117
- "step": 14100
2118
- },
2119
- {
2120
- "epoch": 60.775862068965516,
2121
- "eval_loss": 5.173377513885498,
2122
- "eval_runtime": 9.6205,
2123
- "eval_samples_per_second": 12301.935,
2124
- "eval_steps_per_second": 6.029,
2125
- "step": 14100
2126
- },
2127
- {
2128
- "epoch": 61.206896551724135,
2129
- "grad_norm": 0.17738671600818634,
2130
- "learning_rate": 0.000660878229503003,
2131
- "loss": 0.9459,
2132
- "step": 14200
2133
- },
2134
- {
2135
- "epoch": 61.206896551724135,
2136
- "eval_loss": 5.284204006195068,
2137
- "eval_runtime": 9.202,
2138
- "eval_samples_per_second": 12861.408,
2139
- "eval_steps_per_second": 6.303,
2140
- "step": 14200
2141
- },
2142
- {
2143
- "epoch": 61.63793103448276,
2144
- "grad_norm": 0.17412865161895752,
2145
- "learning_rate": 0.0006481159173571249,
2146
- "loss": 0.9431,
2147
- "step": 14300
2148
- },
2149
- {
2150
- "epoch": 61.63793103448276,
2151
- "eval_loss": 5.202380180358887,
2152
- "eval_runtime": 9.2023,
2153
- "eval_samples_per_second": 12861.037,
2154
- "eval_steps_per_second": 6.303,
2155
- "step": 14300
2156
- },
2157
- {
2158
- "epoch": 62.06896551724138,
2159
- "grad_norm": 0.1586960405111313,
2160
- "learning_rate": 0.0006354186883836291,
2161
- "loss": 0.951,
2162
- "step": 14400
2163
- },
2164
- {
2165
- "epoch": 62.06896551724138,
2166
- "eval_loss": 5.338278293609619,
2167
- "eval_runtime": 9.3841,
2168
- "eval_samples_per_second": 12611.844,
2169
- "eval_steps_per_second": 6.181,
2170
- "step": 14400
2171
- },
2172
- {
2173
- "epoch": 62.5,
2174
- "grad_norm": 0.16516196727752686,
2175
- "learning_rate": 0.0006227888910148052,
2176
- "loss": 0.9316,
2177
- "step": 14500
2178
- },
2179
- {
2180
- "epoch": 62.5,
2181
- "eval_loss": 5.300503253936768,
2182
- "eval_runtime": 9.3048,
2183
- "eval_samples_per_second": 12719.348,
2184
- "eval_steps_per_second": 6.233,
2185
- "step": 14500
2186
- },
2187
- {
2188
- "epoch": 62.93103448275862,
2189
- "grad_norm": 0.17453870177268982,
2190
- "learning_rate": 0.000610228861211044,
2191
- "loss": 0.9506,
2192
- "step": 14600
2193
- },
2194
- {
2195
- "epoch": 62.93103448275862,
2196
- "eval_loss": 5.2108259201049805,
2197
- "eval_runtime": 9.373,
2198
- "eval_samples_per_second": 12626.747,
2199
- "eval_steps_per_second": 6.188,
2200
- "step": 14600
2201
- },
2202
- {
2203
- "epoch": 63.36206896551724,
2204
- "grad_norm": 0.18113704025745392,
2205
- "learning_rate": 0.0005977409220287874,
2206
- "loss": 0.9268,
2207
- "step": 14700
2208
- },
2209
- {
2210
- "epoch": 63.36206896551724,
2211
- "eval_loss": 5.307285785675049,
2212
- "eval_runtime": 9.2153,
2213
- "eval_samples_per_second": 12842.815,
2214
- "eval_steps_per_second": 6.294,
2215
- "step": 14700
2216
- },
2217
- {
2218
- "epoch": 63.793103448275865,
2219
- "grad_norm": 0.16791464388370514,
2220
- "learning_rate": 0.0005853273831908637,
2221
- "loss": 0.941,
2222
- "step": 14800
2223
- },
2224
- {
2225
- "epoch": 63.793103448275865,
2226
- "eval_loss": 5.276530742645264,
2227
- "eval_runtime": 9.3621,
2228
- "eval_samples_per_second": 12641.481,
2229
- "eval_steps_per_second": 6.195,
2230
- "step": 14800
2231
- },
2232
- {
2233
- "epoch": 64.22413793103448,
2234
- "grad_norm": 0.18001683056354523,
2235
- "learning_rate": 0.0005729905406592884,
2236
- "loss": 0.9315,
2237
- "step": 14900
2238
- },
2239
- {
2240
- "epoch": 64.22413793103448,
2241
- "eval_loss": 5.365150451660156,
2242
- "eval_runtime": 9.3887,
2243
- "eval_samples_per_second": 12605.671,
2244
- "eval_steps_per_second": 6.178,
2245
- "step": 14900
2246
- },
2247
- {
2248
- "epoch": 64.65517241379311,
2249
- "grad_norm": 0.1623552292585373,
2250
- "learning_rate": 0.0005607326762106162,
2251
- "loss": 0.9308,
2252
- "step": 15000
2253
- },
2254
- {
2255
- "epoch": 64.65517241379311,
2256
- "eval_loss": 5.266965389251709,
2257
- "eval_runtime": 9.2811,
2258
- "eval_samples_per_second": 12751.873,
2259
- "eval_steps_per_second": 6.249,
2260
- "step": 15000
2261
- },
2262
- {
2263
- "epoch": 65.08620689655173,
2264
- "grad_norm": 0.16720953583717346,
2265
- "learning_rate": 0.0005485560570139061,
2266
- "loss": 0.9359,
2267
- "step": 15100
2268
- },
2269
- {
2270
- "epoch": 65.08620689655173,
2271
- "eval_loss": 5.438762664794922,
2272
- "eval_runtime": 9.4208,
2273
- "eval_samples_per_second": 12562.678,
2274
- "eval_steps_per_second": 6.157,
2275
- "step": 15100
2276
- },
2277
- {
2278
- "epoch": 65.51724137931035,
2279
- "grad_norm": 0.1636320948600769,
2280
- "learning_rate": 0.0005364629352113988,
2281
- "loss": 0.9214,
2282
- "step": 15200
2283
- },
2284
- {
2285
- "epoch": 65.51724137931035,
2286
- "eval_loss": 5.348054885864258,
2287
- "eval_runtime": 9.2475,
2288
- "eval_samples_per_second": 12798.204,
2289
- "eval_steps_per_second": 6.272,
2290
- "step": 15200
2291
- },
2292
- {
2293
- "epoch": 65.94827586206897,
2294
- "grad_norm": 0.1635085791349411,
2295
- "learning_rate": 0.0005244555475019666,
2296
- "loss": 0.9376,
2297
- "step": 15300
2298
- },
2299
- {
2300
- "epoch": 65.94827586206897,
2301
- "eval_loss": 5.3104448318481445,
2302
- "eval_runtime": 9.394,
2303
- "eval_samples_per_second": 12598.626,
2304
- "eval_steps_per_second": 6.174,
2305
- "step": 15300
2306
- },
2307
- {
2308
- "epoch": 66.37931034482759,
2309
- "grad_norm": 0.15515539050102234,
2310
- "learning_rate": 0.000512536114727423,
2311
- "loss": 0.914,
2312
- "step": 15400
2313
- },
2314
- {
2315
- "epoch": 66.37931034482759,
2316
- "eval_loss": 5.391454696655273,
2317
- "eval_runtime": 9.4016,
2318
- "eval_samples_per_second": 12588.456,
2319
- "eval_steps_per_second": 6.169,
2320
- "step": 15400
2321
- },
2322
- {
2323
- "epoch": 66.8103448275862,
2324
- "grad_norm": 0.15986432135105133,
2325
- "learning_rate": 0.0005007068414617632,
2326
- "loss": 0.9296,
2327
- "step": 15500
2328
- },
2329
- {
2330
- "epoch": 66.8103448275862,
2331
- "eval_loss": 5.335239887237549,
2332
- "eval_runtime": 9.2358,
2333
- "eval_samples_per_second": 12814.357,
2334
- "eval_steps_per_second": 6.28,
2335
- "step": 15500
2336
- },
2337
- {
2338
- "epoch": 67.24137931034483,
2339
- "grad_norm": 0.15565365552902222,
2340
- "learning_rate": 0.0004889699156034146,
2341
- "loss": 0.9188,
2342
- "step": 15600
2343
- },
2344
- {
2345
- "epoch": 67.24137931034483,
2346
- "eval_loss": 5.442201137542725,
2347
- "eval_runtime": 9.493,
2348
- "eval_samples_per_second": 12467.232,
2349
- "eval_steps_per_second": 6.11,
2350
- "step": 15600
2351
- },
2352
- {
2353
- "epoch": 67.67241379310344,
2354
- "grad_norm": 0.160415917634964,
2355
- "learning_rate": 0.0004773275079705718,
2356
- "loss": 0.9198,
2357
- "step": 15700
2358
- },
2359
- {
2360
- "epoch": 67.67241379310344,
2361
- "eval_loss": 5.36320686340332,
2362
- "eval_runtime": 9.1953,
2363
- "eval_samples_per_second": 12870.809,
2364
- "eval_steps_per_second": 6.308,
2365
- "step": 15700
2366
- },
2367
- {
2368
- "epoch": 68.10344827586206,
2369
- "grad_norm": 0.17050059139728546,
2370
- "learning_rate": 0.00046578177189968904,
2371
- "loss": 0.923,
2372
- "step": 15800
2373
- },
2374
- {
2375
- "epoch": 68.10344827586206,
2376
- "eval_loss": 5.504161357879639,
2377
- "eval_runtime": 9.194,
2378
- "eval_samples_per_second": 12872.672,
2379
- "eval_steps_per_second": 6.308,
2380
- "step": 15800
2381
- },
2382
- {
2383
- "epoch": 68.53448275862068,
2384
- "grad_norm": 0.14325299859046936,
2385
- "learning_rate": 0.0004543348428472082,
2386
- "loss": 0.9104,
2387
- "step": 15900
2388
- },
2389
- {
2390
- "epoch": 68.53448275862068,
2391
- "eval_loss": 5.397451877593994,
2392
- "eval_runtime": 9.3626,
2393
- "eval_samples_per_second": 12640.857,
2394
- "eval_steps_per_second": 6.195,
2395
- "step": 15900
2396
- },
2397
- {
2398
- "epoch": 68.96551724137932,
2399
- "grad_norm": 0.1505531370639801,
2400
- "learning_rate": 0.0004429888379945914,
2401
- "loss": 0.9259,
2402
- "step": 16000
2403
- },
2404
- {
2405
- "epoch": 68.96551724137932,
2406
- "eval_loss": 5.372323036193848,
2407
- "eval_runtime": 9.2765,
2408
- "eval_samples_per_second": 12758.16,
2409
- "eval_steps_per_second": 6.252,
2410
- "step": 16000
2411
- },
2412
- {
2413
- "epoch": 69.39655172413794,
2414
- "grad_norm": 0.14758853614330292,
2415
- "learning_rate": 0.00043174585585673675,
2416
- "loss": 0.9034,
2417
- "step": 16100
2418
- },
2419
- {
2420
- "epoch": 69.39655172413794,
2421
- "eval_loss": 5.451719284057617,
2422
- "eval_runtime": 9.5225,
2423
- "eval_samples_per_second": 12428.515,
2424
- "eval_steps_per_second": 6.091,
2425
- "step": 16100
2426
- },
2427
- {
2428
- "epoch": 69.82758620689656,
2429
- "grad_norm": 0.16002579033374786,
2430
- "learning_rate": 0.00042060797589384325,
2431
- "loss": 0.9185,
2432
- "step": 16200
2433
- },
2434
- {
2435
- "epoch": 69.82758620689656,
2436
- "eval_loss": 5.384088516235352,
2437
- "eval_runtime": 9.2516,
2438
- "eval_samples_per_second": 12792.54,
2439
- "eval_steps_per_second": 6.269,
2440
- "step": 16200
2441
- },
2442
- {
2443
- "epoch": 70.25862068965517,
2444
- "grad_norm": 0.15093061327934265,
2445
- "learning_rate": 0.00040957725812680227,
2446
- "loss": 0.9071,
2447
- "step": 16300
2448
- },
2449
- {
2450
- "epoch": 70.25862068965517,
2451
- "eval_loss": 5.495941638946533,
2452
- "eval_runtime": 9.2246,
2453
- "eval_samples_per_second": 12829.882,
2454
- "eval_steps_per_second": 6.288,
2455
- "step": 16300
2456
- },
2457
- {
2458
- "epoch": 70.6896551724138,
2459
- "grad_norm": 0.14055319130420685,
2460
- "learning_rate": 0.00039865574275618353,
2461
- "loss": 0.9105,
2462
- "step": 16400
2463
- },
2464
- {
2465
- "epoch": 70.6896551724138,
2466
- "eval_loss": 5.418017864227295,
2467
- "eval_runtime": 9.3637,
2468
- "eval_samples_per_second": 12639.396,
2469
- "eval_steps_per_second": 6.194,
2470
- "step": 16400
2471
- },
2472
- {
2473
- "epoch": 71.12068965517241,
2474
- "grad_norm": 0.1445504128932953,
2475
- "learning_rate": 0.00038784544978488756,
2476
- "loss": 0.9127,
2477
- "step": 16500
2478
- },
2479
- {
2480
- "epoch": 71.12068965517241,
2481
- "eval_loss": 5.526876449584961,
2482
- "eval_runtime": 9.2747,
2483
- "eval_samples_per_second": 12760.573,
2484
- "eval_steps_per_second": 6.254,
2485
- "step": 16500
2486
- },
2487
- {
2488
- "epoch": 71.55172413793103,
2489
- "grad_norm": 0.14889004826545715,
2490
- "learning_rate": 0.0003771483786445331,
2491
- "loss": 0.903,
2492
- "step": 16600
2493
- },
2494
- {
2495
- "epoch": 71.55172413793103,
2496
- "eval_loss": 5.482020854949951,
2497
- "eval_runtime": 9.5332,
2498
- "eval_samples_per_second": 12414.639,
2499
- "eval_steps_per_second": 6.084,
2500
- "step": 16600
2501
- },
2502
- {
2503
- "epoch": 71.98275862068965,
2504
- "grad_norm": 0.14940929412841797,
2505
- "learning_rate": 0.00036656650782564983,
2506
- "loss": 0.9144,
2507
- "step": 16700
2508
- },
2509
- {
2510
- "epoch": 71.98275862068965,
2511
- "eval_loss": 5.4448442459106445,
2512
- "eval_runtime": 9.2742,
2513
- "eval_samples_per_second": 12761.28,
2514
- "eval_steps_per_second": 6.254,
2515
- "step": 16700
2516
- },
2517
- {
2518
- "epoch": 72.41379310344827,
2519
- "grad_norm": 0.14626102149486542,
2520
- "learning_rate": 0.00035610179451174307,
2521
- "loss": 0.8945,
2522
- "step": 16800
2523
- },
2524
- {
2525
- "epoch": 72.41379310344827,
2526
- "eval_loss": 5.535341739654541,
2527
- "eval_runtime": 9.2018,
2528
- "eval_samples_per_second": 12861.765,
2529
- "eval_steps_per_second": 6.303,
2530
- "step": 16800
2531
- },
2532
- {
2533
- "epoch": 72.84482758620689,
2534
- "grad_norm": 0.14408394694328308,
2535
- "learning_rate": 0.0003457561742173032,
2536
- "loss": 0.9088,
2537
- "step": 16900
2538
- },
2539
- {
2540
- "epoch": 72.84482758620689,
2541
- "eval_loss": 5.446293354034424,
2542
- "eval_runtime": 9.3364,
2543
- "eval_samples_per_second": 12676.282,
2544
- "eval_steps_per_second": 6.212,
2545
- "step": 16900
2546
- },
2547
- {
2548
- "epoch": 73.27586206896552,
2549
- "grad_norm": 0.14265799522399902,
2550
- "learning_rate": 0.00033553156042981716,
2551
- "loss": 0.8975,
2552
- "step": 17000
2553
- },
2554
- {
2555
- "epoch": 73.27586206896552,
2556
- "eval_loss": 5.535012722015381,
2557
- "eval_runtime": 9.2156,
2558
- "eval_samples_per_second": 12842.519,
2559
- "eval_steps_per_second": 6.294,
2560
- "step": 17000
2561
- },
2562
- {
2563
- "epoch": 73.70689655172414,
2564
- "grad_norm": 0.146506205201149,
2565
- "learning_rate": 0.0003254298442558571,
2566
- "loss": 0.9019,
2567
- "step": 17100
2568
- },
2569
- {
2570
- "epoch": 73.70689655172414,
2571
- "eval_loss": 5.51116418838501,
2572
- "eval_runtime": 9.3669,
2573
- "eval_samples_per_second": 12635.03,
2574
- "eval_steps_per_second": 6.192,
2575
- "step": 17100
2576
- },
2577
- {
2578
- "epoch": 74.13793103448276,
2579
- "grad_norm": 0.14774905145168304,
2580
- "learning_rate": 0.00031545289407131126,
2581
- "loss": 0.9005,
2582
- "step": 17200
2583
- },
2584
- {
2585
- "epoch": 74.13793103448276,
2586
- "eval_loss": 5.60396671295166,
2587
- "eval_runtime": 9.252,
2588
- "eval_samples_per_second": 12791.945,
2589
- "eval_steps_per_second": 6.269,
2590
- "step": 17200
2591
- },
2592
- {
2593
- "epoch": 74.56896551724138,
2594
- "grad_norm": 0.15397560596466064,
2595
- "learning_rate": 0.000305602555175813,
2596
- "loss": 0.8947,
2597
- "step": 17300
2598
- },
2599
- {
2600
- "epoch": 74.56896551724138,
2601
- "eval_loss": 5.518400192260742,
2602
- "eval_runtime": 9.2957,
2603
- "eval_samples_per_second": 12731.779,
2604
- "eval_steps_per_second": 6.239,
2605
- "step": 17300
2606
- },
2607
- {
2608
- "epoch": 75.0,
2609
- "grad_norm": 0.14761576056480408,
2610
- "learning_rate": 0.00029588064945144135,
2611
- "loss": 0.9052,
2612
- "step": 17400
2613
- },
2614
- {
2615
- "epoch": 75.0,
2616
- "eval_loss": 5.501093864440918,
2617
- "eval_runtime": 9.4777,
2618
- "eval_samples_per_second": 12487.311,
2619
- "eval_steps_per_second": 6.12,
2620
- "step": 17400
2621
- },
2622
- {
2623
- "epoch": 75.43103448275862,
2624
- "grad_norm": 0.14727619290351868,
2625
- "learning_rate": 0.0002862889750257551,
2626
- "loss": 0.8863,
2627
- "step": 17500
2628
- },
2629
- {
2630
- "epoch": 75.43103448275862,
2631
- "eval_loss": 5.560401916503906,
2632
- "eval_runtime": 9.1883,
2633
- "eval_samples_per_second": 12880.563,
2634
- "eval_steps_per_second": 6.312,
2635
- "step": 17500
2636
- },
2637
- {
2638
- "epoch": 75.86206896551724,
2639
- "grad_norm": 0.15626391768455505,
2640
- "learning_rate": 0.0002768293059392143,
2641
- "loss": 0.8993,
2642
- "step": 17600
2643
- },
2644
- {
2645
- "epoch": 75.86206896551724,
2646
- "eval_loss": 5.520768642425537,
2647
- "eval_runtime": 9.1909,
2648
- "eval_samples_per_second": 12876.988,
2649
- "eval_steps_per_second": 6.311,
2650
- "step": 17600
2651
- },
2652
- {
2653
- "epoch": 76.29310344827586,
2654
- "grad_norm": 0.14485207200050354,
2655
- "learning_rate": 0.0002675033918170623,
2656
- "loss": 0.8885,
2657
- "step": 17700
2658
- },
2659
- {
2660
- "epoch": 76.29310344827586,
2661
- "eval_loss": 5.576803207397461,
2662
- "eval_runtime": 9.3606,
2663
- "eval_samples_per_second": 12643.536,
2664
- "eval_steps_per_second": 6.196,
2665
- "step": 17700
2666
- },
2667
- {
2668
- "epoch": 76.72413793103448,
2669
- "grad_norm": 0.13398423790931702,
2670
- "learning_rate": 0.0002583129575457227,
2671
- "loss": 0.8936,
2672
- "step": 17800
2673
- },
2674
- {
2675
- "epoch": 76.72413793103448,
2676
- "eval_loss": 5.554868221282959,
2677
- "eval_runtime": 9.2384,
2678
- "eval_samples_per_second": 12810.763,
2679
- "eval_steps_per_second": 6.278,
2680
- "step": 17800
2681
- },
2682
- {
2683
- "epoch": 77.15517241379311,
2684
- "grad_norm": 0.13094207644462585,
2685
- "learning_rate": 0.0002492597029537672,
2686
- "loss": 0.8919,
2687
- "step": 17900
2688
- },
2689
- {
2690
- "epoch": 77.15517241379311,
2691
- "eval_loss": 5.611755847930908,
2692
- "eval_runtime": 9.3911,
2693
- "eval_samples_per_second": 12602.462,
2694
- "eval_steps_per_second": 6.176,
2695
- "step": 17900
2696
- },
2697
- {
2698
- "epoch": 77.58620689655173,
2699
- "grad_norm": 0.13792350888252258,
2700
- "learning_rate": 0.00024034530249752463,
2701
- "loss": 0.887,
2702
- "step": 18000
2703
- },
2704
- {
2705
- "epoch": 77.58620689655173,
2706
- "eval_loss": 5.579117774963379,
2707
- "eval_runtime": 9.36,
2708
- "eval_samples_per_second": 12644.296,
2709
- "eval_steps_per_second": 6.197,
2710
- "step": 18000
2711
- },
2712
- {
2713
- "epoch": 78.01724137931035,
2714
- "grad_norm": 0.13721516728401184,
2715
- "learning_rate": 0.00023157140495137996,
2716
- "loss": 0.8951,
2717
- "step": 18100
2718
- },
2719
- {
2720
- "epoch": 78.01724137931035,
2721
- "eval_loss": 5.630533695220947,
2722
- "eval_runtime": 9.2429,
2723
- "eval_samples_per_second": 12804.499,
2724
- "eval_steps_per_second": 6.275,
2725
- "step": 18100
2726
- },
2727
- {
2728
- "epoch": 78.44827586206897,
2729
- "grad_norm": 0.13419011235237122,
2730
- "learning_rate": 0.00022293963310281994,
2731
- "loss": 0.8804,
2732
- "step": 18200
2733
- },
2734
- {
2735
- "epoch": 78.44827586206897,
2736
- "eval_loss": 5.606773853302002,
2737
- "eval_runtime": 9.2669,
2738
- "eval_samples_per_second": 12771.352,
2739
- "eval_steps_per_second": 6.259,
2740
- "step": 18200
2741
- },
2742
- {
2743
- "epoch": 78.87931034482759,
2744
- "grad_norm": 0.137063667178154,
2745
- "learning_rate": 0.00021445158345229155,
2746
- "loss": 0.8912,
2747
- "step": 18300
2748
- },
2749
- {
2750
- "epoch": 78.87931034482759,
2751
- "eval_loss": 5.56123161315918,
2752
- "eval_runtime": 9.2532,
2753
- "eval_samples_per_second": 12790.212,
2754
- "eval_steps_per_second": 6.268,
2755
- "step": 18300
2756
- },
2757
- {
2758
- "epoch": 79.3103448275862,
2759
- "grad_norm": 0.13209742307662964,
2760
- "learning_rate": 0.00020610882591791857,
2761
- "loss": 0.8813,
2762
- "step": 18400
2763
- },
2764
- {
2765
- "epoch": 79.3103448275862,
2766
- "eval_loss": 5.633670806884766,
2767
- "eval_runtime": 9.3853,
2768
- "eval_samples_per_second": 12610.295,
2769
- "eval_steps_per_second": 6.18,
2770
- "step": 18400
2771
- },
2772
- {
2773
- "epoch": 79.74137931034483,
2774
- "grad_norm": 0.13960996270179749,
2775
- "learning_rate": 0.0001979129035451337,
2776
- "loss": 0.8861,
2777
- "step": 18500
2778
- },
2779
- {
2780
- "epoch": 79.74137931034483,
2781
- "eval_loss": 5.5989484786987305,
2782
- "eval_runtime": 9.2688,
2783
- "eval_samples_per_second": 12768.736,
2784
- "eval_steps_per_second": 6.258,
2785
- "step": 18500
2786
- },
2787
- {
2788
- "epoch": 80.17241379310344,
2789
- "grad_norm": 0.12470439076423645,
2790
- "learning_rate": 0.0001898653322212841,
2791
- "loss": 0.8838,
2792
- "step": 18600
2793
- },
2794
- {
2795
- "epoch": 80.17241379310344,
2796
- "eval_loss": 5.66691780090332,
2797
- "eval_runtime": 9.2463,
2798
- "eval_samples_per_second": 12799.872,
2799
- "eval_steps_per_second": 6.273,
2800
- "step": 18600
2801
- },
2802
- {
2803
- "epoch": 80.60344827586206,
2804
- "grad_norm": 0.13329234719276428,
2805
- "learning_rate": 0.00018204583118856953,
2806
- "loss": 0.8798,
2807
- "step": 18700
2808
- },
2809
- {
2810
- "epoch": 80.60344827586206,
2811
- "eval_loss": 5.619058132171631,
2812
- "eval_runtime": 9.3773,
2813
- "eval_samples_per_second": 12620.958,
2814
- "eval_steps_per_second": 6.185,
2815
- "step": 18700
2816
- },
2817
- {
2818
- "epoch": 81.03448275862068,
2819
- "grad_norm": 0.1379682719707489,
2820
- "learning_rate": 0.00017429787945419606,
2821
- "loss": 0.8873,
2822
- "step": 18800
2823
- },
2824
- {
2825
- "epoch": 81.03448275862068,
2826
- "eval_loss": 5.695350646972656,
2827
- "eval_runtime": 9.2531,
2828
- "eval_samples_per_second": 12790.455,
2829
- "eval_steps_per_second": 6.268,
2830
- "step": 18800
2831
- },
2832
- {
2833
- "epoch": 81.46551724137932,
2834
- "grad_norm": 0.1285446733236313,
2835
- "learning_rate": 0.00016670264651586397,
2836
- "loss": 0.8755,
2837
- "step": 18900
2838
- },
2839
- {
2840
- "epoch": 81.46551724137932,
2841
- "eval_loss": 5.643267631530762,
2842
- "eval_runtime": 9.3848,
2843
- "eval_samples_per_second": 12610.893,
2844
- "eval_steps_per_second": 6.18,
2845
- "step": 18900
2846
- },
2847
- {
2848
- "epoch": 81.89655172413794,
2849
- "grad_norm": 0.12641337513923645,
2850
- "learning_rate": 0.00015926153715963565,
2851
- "loss": 0.8836,
2852
- "step": 19000
2853
- },
2854
- {
2855
- "epoch": 81.89655172413794,
2856
- "eval_loss": 5.626917839050293,
2857
- "eval_runtime": 9.2261,
2858
- "eval_samples_per_second": 12827.886,
2859
- "eval_steps_per_second": 6.287,
2860
- "step": 19000
2861
- },
2862
- {
2863
- "epoch": 82.32758620689656,
2864
- "grad_norm": 0.13148842751979828,
2865
- "learning_rate": 0.00015197592766544755,
2866
- "loss": 0.8756,
2867
- "step": 19100
2868
- },
2869
- {
2870
- "epoch": 82.32758620689656,
2871
- "eval_loss": 5.652148723602295,
2872
- "eval_runtime": 9.2465,
2873
- "eval_samples_per_second": 12799.543,
2874
- "eval_steps_per_second": 6.273,
2875
- "step": 19100
2876
- },
2877
- {
2878
- "epoch": 82.75862068965517,
2879
- "grad_norm": 0.12983547151088715,
2880
- "learning_rate": 0.00014484716555255916,
2881
- "loss": 0.8795,
2882
- "step": 19200
2883
- },
2884
- {
2885
- "epoch": 82.75862068965517,
2886
- "eval_loss": 5.643835544586182,
2887
- "eval_runtime": 9.2246,
2888
- "eval_samples_per_second": 12829.918,
2889
- "eval_steps_per_second": 6.288,
2890
- "step": 19200
2891
- },
2892
- {
2893
- "epoch": 83.1896551724138,
2894
- "grad_norm": 0.13281339406967163,
2895
- "learning_rate": 0.000137876569330322,
2896
- "loss": 0.8766,
2897
- "step": 19300
2898
- },
2899
- {
2900
- "epoch": 83.1896551724138,
2901
- "eval_loss": 5.681993007659912,
2902
- "eval_runtime": 9.223,
2903
- "eval_samples_per_second": 12832.225,
2904
- "eval_steps_per_second": 6.289,
2905
- "step": 19300
2906
- },
2907
- {
2908
- "epoch": 83.62068965517241,
2909
- "grad_norm": 0.1261378824710846,
2910
- "learning_rate": 0.0001310654282543118,
2911
- "loss": 0.8739,
2912
- "step": 19400
2913
- },
2914
- {
2915
- "epoch": 83.62068965517241,
2916
- "eval_loss": 5.645988464355469,
2917
- "eval_runtime": 9.5681,
2918
- "eval_samples_per_second": 12369.283,
2919
- "eval_steps_per_second": 6.062,
2920
- "step": 19400
2921
- },
2922
- {
2923
- "epoch": 84.05172413793103,
2924
- "grad_norm": 0.1342853307723999,
2925
- "learning_rate": 0.00012441500208787225,
2926
- "loss": 0.88,
2927
- "step": 19500
2928
- },
2929
- {
2930
- "epoch": 84.05172413793103,
2931
- "eval_loss": 5.7274885177612305,
2932
- "eval_runtime": 9.2591,
2933
- "eval_samples_per_second": 12782.084,
2934
- "eval_steps_per_second": 6.264,
2935
- "step": 19500
2936
- },
2937
- {
2938
- "epoch": 84.48275862068965,
2939
- "grad_norm": 0.13619554042816162,
2940
- "learning_rate": 0.00011792652086911582,
2941
- "loss": 0.8707,
2942
- "step": 19600
2943
- },
2944
- {
2945
- "epoch": 84.48275862068965,
2946
- "eval_loss": 5.669314384460449,
2947
- "eval_runtime": 9.231,
2948
- "eval_samples_per_second": 12821.08,
2949
- "eval_steps_per_second": 6.283,
2950
- "step": 19600
2951
- },
2952
- {
2953
- "epoch": 84.91379310344827,
2954
- "grad_norm": 0.1309289187192917,
2955
- "learning_rate": 0.00011160118468341863,
2956
- "loss": 0.8778,
2957
- "step": 19700
2958
- },
2959
- {
2960
- "epoch": 84.91379310344827,
2961
- "eval_loss": 5.653628349304199,
2962
- "eval_runtime": 9.2387,
2963
- "eval_samples_per_second": 12810.365,
2964
- "eval_steps_per_second": 6.278,
2965
- "step": 19700
2966
- },
2967
- {
2968
- "epoch": 85.34482758620689,
2969
- "grad_norm": 0.12790584564208984,
2970
- "learning_rate": 0.00010544016344145713,
2971
- "loss": 0.8699,
2972
- "step": 19800
2973
- },
2974
- {
2975
- "epoch": 85.34482758620689,
2976
- "eval_loss": 5.69779634475708,
2977
- "eval_runtime": 9.4304,
2978
- "eval_samples_per_second": 12549.969,
2979
- "eval_steps_per_second": 6.15,
2980
- "step": 19800
2981
- },
2982
- {
2983
- "epoch": 85.77586206896552,
2984
- "grad_norm": 0.125418022274971,
2985
- "learning_rate": 9.944459666282745e-05,
2986
- "loss": 0.8734,
2987
- "step": 19900
2988
- },
2989
- {
2990
- "epoch": 85.77586206896552,
2991
- "eval_loss": 5.671902656555176,
2992
- "eval_runtime": 9.407,
2993
- "eval_samples_per_second": 12581.158,
2994
- "eval_steps_per_second": 6.166,
2995
- "step": 19900
2996
- },
2997
- {
2998
- "epoch": 86.20689655172414,
2999
- "grad_norm": 0.1286143958568573,
3000
- "learning_rate": 9.361559326528213e-05,
3001
- "loss": 0.8713,
3002
- "step": 20000
3003
- },
3004
- {
3005
- "epoch": 86.20689655172414,
3006
- "eval_loss": 5.720515251159668,
3007
- "eval_runtime": 9.2375,
3008
- "eval_samples_per_second": 12812.036,
3009
- "eval_steps_per_second": 6.279,
3010
- "step": 20000
3011
- },
3012
- {
3013
- "epoch": 86.63793103448276,
3014
- "grad_norm": 0.1244572177529335,
3015
- "learning_rate": 8.795423135962966e-05,
3016
- "loss": 0.8702,
3017
- "step": 20100
3018
- },
3019
- {
3020
- "epoch": 86.63793103448276,
3021
- "eval_loss": 5.679359436035156,
3022
- "eval_runtime": 9.3749,
3023
- "eval_samples_per_second": 12624.294,
3024
- "eval_steps_per_second": 6.187,
3025
- "step": 20100
3026
- },
3027
- {
3028
- "epoch": 87.06896551724138,
3029
- "grad_norm": 0.13650184869766235,
3030
- "learning_rate": 8.246155805033218e-05,
3031
- "loss": 0.8729,
3032
- "step": 20200
3033
- },
3034
- {
3035
- "epoch": 87.06896551724138,
3036
- "eval_loss": 5.74306583404541,
3037
- "eval_runtime": 9.2113,
3038
- "eval_samples_per_second": 12848.47,
3039
- "eval_steps_per_second": 6.297,
3040
- "step": 20200
3041
- },
3042
- {
3043
- "epoch": 87.5,
3044
- "grad_norm": 0.1307421624660492,
3045
- "learning_rate": 7.713858924183448e-05,
3046
- "loss": 0.8664,
3047
- "step": 20300
3048
- },
3049
- {
3050
- "epoch": 87.5,
3051
- "eval_loss": 5.708529472351074,
3052
- "eval_runtime": 9.4057,
3053
- "eval_samples_per_second": 12582.911,
3054
- "eval_steps_per_second": 6.166,
3055
- "step": 20300
3056
- },
3057
- {
3058
- "epoch": 87.93103448275862,
3059
- "grad_norm": 0.12468791753053665,
3060
- "learning_rate": 7.198630945066809e-05,
3061
- "loss": 0.8717,
3062
- "step": 20400
3063
- },
3064
- {
3065
- "epoch": 87.93103448275862,
3066
- "eval_loss": 5.68959903717041,
3067
- "eval_runtime": 10.135,
3068
- "eval_samples_per_second": 11677.436,
3069
- "eval_steps_per_second": 5.723,
3070
- "step": 20400
3071
- },
3072
- {
3073
- "epoch": 88.36206896551724,
3074
- "grad_norm": 0.12915968894958496,
3075
- "learning_rate": 6.70056716233587e-05,
3076
- "loss": 0.8662,
3077
- "step": 20500
3078
- },
3079
- {
3080
- "epoch": 88.36206896551724,
3081
- "eval_loss": 5.714221954345703,
3082
- "eval_runtime": 9.9287,
3083
- "eval_samples_per_second": 11920.034,
3084
- "eval_steps_per_second": 5.842,
3085
- "step": 20500
3086
- },
3087
- {
3088
- "epoch": 88.79310344827586,
3089
- "grad_norm": 0.12067535519599915,
3090
- "learning_rate": 6.219759696017113e-05,
3091
- "loss": 0.8686,
3092
- "step": 20600
3093
- },
3094
- {
3095
- "epoch": 88.79310344827586,
3096
- "eval_loss": 5.706583023071289,
3097
- "eval_runtime": 9.7773,
3098
- "eval_samples_per_second": 12104.609,
3099
- "eval_steps_per_second": 5.932,
3100
- "step": 20600
3101
- },
3102
- {
3103
- "epoch": 89.22413793103448,
3104
- "grad_norm": 0.12324505299329758,
3105
- "learning_rate": 5.7562974744729046e-05,
3106
- "loss": 0.867,
3107
- "step": 20700
3108
- },
3109
- {
3110
- "epoch": 89.22413793103448,
3111
- "eval_loss": 5.7304487228393555,
3112
- "eval_runtime": 9.5287,
3113
- "eval_samples_per_second": 12420.512,
3114
- "eval_steps_per_second": 6.087,
3115
- "step": 20700
3116
- },
3117
- {
3118
- "epoch": 89.65517241379311,
3119
- "grad_norm": 0.12154043465852737,
3120
- "learning_rate": 5.3102662179536295e-05,
3121
- "loss": 0.8663,
3122
- "step": 20800
3123
- },
3124
- {
3125
- "epoch": 89.65517241379311,
3126
- "eval_loss": 5.711974143981934,
3127
- "eval_runtime": 9.2625,
3128
- "eval_samples_per_second": 12777.417,
3129
- "eval_steps_per_second": 6.262,
3130
- "step": 20800
3131
- },
3132
- {
3133
- "epoch": 90.08620689655173,
3134
- "grad_norm": 0.12663687765598297,
3135
- "learning_rate": 4.881748422743082e-05,
3136
- "loss": 0.8673,
3137
- "step": 20900
3138
- },
3139
- {
3140
- "epoch": 90.08620689655173,
3141
- "eval_loss": 5.735599994659424,
3142
- "eval_runtime": 9.79,
3143
- "eval_samples_per_second": 12088.923,
3144
- "eval_steps_per_second": 5.924,
3145
- "step": 20900
3146
- },
3147
- {
3148
- "epoch": 90.51724137931035,
3149
- "grad_norm": 0.12401026487350464,
3150
- "learning_rate": 4.474845260451621e-05,
3151
- "loss": 0.864,
3152
- "step": 21000
3153
- },
3154
- {
3155
- "epoch": 90.51724137931035,
3156
- "eval_loss": 5.719780445098877,
3157
- "eval_runtime": 9.8841,
3158
- "eval_samples_per_second": 11973.938,
3159
- "eval_steps_per_second": 5.868,
3160
- "step": 21000
3161
- },
3162
- {
3163
- "epoch": 90.94827586206897,
3164
- "grad_norm": 0.13143125176429749,
3165
- "learning_rate": 4.081411852439809e-05,
3166
- "loss": 0.8669,
3167
- "step": 21100
3168
- },
3169
- {
3170
- "epoch": 90.94827586206897,
3171
- "eval_loss": 5.718562602996826,
3172
- "eval_runtime": 9.4018,
3173
- "eval_samples_per_second": 12588.15,
3174
- "eval_steps_per_second": 6.169,
3175
- "step": 21100
3176
- },
3177
- {
3178
- "epoch": 91.37931034482759,
3179
- "grad_norm": 0.11996802687644958,
3180
- "learning_rate": 3.705719190073964e-05,
3181
- "loss": 0.8635,
3182
- "step": 21200
3183
- },
3184
- {
3185
- "epoch": 91.37931034482759,
3186
- "eval_loss": 5.731278896331787,
3187
- "eval_runtime": 9.8189,
3188
- "eval_samples_per_second": 12053.388,
3189
- "eval_steps_per_second": 5.907,
3190
- "step": 21200
3191
- },
3192
- {
3193
- "epoch": 91.8103448275862,
3194
- "grad_norm": 0.13093972206115723,
3195
- "learning_rate": 3.3478367600728754e-05,
3196
- "loss": 0.8638,
3197
- "step": 21300
3198
- },
3199
- {
3200
- "epoch": 91.8103448275862,
3201
- "eval_loss": 5.722977161407471,
3202
- "eval_runtime": 9.5816,
3203
- "eval_samples_per_second": 12351.934,
3204
- "eval_steps_per_second": 6.053,
3205
- "step": 21300
3206
- },
3207
- {
3208
- "epoch": 92.24137931034483,
3209
- "grad_norm": 0.12292136996984482,
3210
- "learning_rate": 3.0078307550409967e-05,
3211
- "loss": 0.8643,
3212
- "step": 21400
3213
- },
3214
- {
3215
- "epoch": 92.24137931034483,
3216
- "eval_loss": 5.742223262786865,
3217
- "eval_runtime": 10.257,
3218
- "eval_samples_per_second": 11538.52,
3219
- "eval_steps_per_second": 5.655,
3220
- "step": 21400
3221
- },
3222
- {
3223
- "epoch": 92.67241379310344,
3224
- "grad_norm": 0.12908169627189636,
3225
- "learning_rate": 2.685764061225615e-05,
3226
- "loss": 0.8628,
3227
- "step": 21500
3228
- },
3229
- {
3230
- "epoch": 92.67241379310344,
3231
- "eval_loss": 5.730621814727783,
3232
- "eval_runtime": 9.8371,
3233
- "eval_samples_per_second": 12031.124,
3234
- "eval_steps_per_second": 5.896,
3235
- "step": 21500
3236
- },
3237
- {
3238
- "epoch": 93.10344827586206,
3239
- "grad_norm": 0.12206296622753143,
3240
- "learning_rate": 2.381696246885634e-05,
3241
- "loss": 0.8639,
3242
- "step": 21600
3243
- },
3244
- {
3245
- "epoch": 93.10344827586206,
3246
- "eval_loss": 5.747194290161133,
3247
- "eval_runtime": 10.0468,
3248
- "eval_samples_per_second": 11779.952,
3249
- "eval_steps_per_second": 5.773,
3250
- "step": 21600
3251
- },
3252
- {
3253
- "epoch": 93.53448275862068,
3254
- "grad_norm": 0.12943461537361145,
3255
- "learning_rate": 2.0956835512741523e-05,
3256
- "loss": 0.8617,
3257
- "step": 21700
3258
- },
3259
- {
3260
- "epoch": 93.53448275862068,
3261
- "eval_loss": 5.736154556274414,
3262
- "eval_runtime": 9.5983,
3263
- "eval_samples_per_second": 12330.438,
3264
- "eval_steps_per_second": 6.043,
3265
- "step": 21700
3266
- },
3267
- {
3268
- "epoch": 93.96551724137932,
3269
- "grad_norm": 0.11701935529708862,
3270
- "learning_rate": 1.8277788742365964e-05,
3271
- "loss": 0.8638,
3272
- "step": 21800
3273
- },
3274
- {
3275
- "epoch": 93.96551724137932,
3276
- "eval_loss": 5.730475425720215,
3277
- "eval_runtime": 9.2662,
3278
- "eval_samples_per_second": 12772.39,
3279
- "eval_steps_per_second": 6.259,
3280
- "step": 21800
3281
- },
3282
- {
3283
- "epoch": 94.39655172413794,
3284
- "grad_norm": 0.12347536534070969,
3285
- "learning_rate": 1.5780317664264798e-05,
3286
- "loss": 0.8614,
3287
- "step": 21900
3288
- },
3289
- {
3290
- "epoch": 94.39655172413794,
3291
- "eval_loss": 5.749207019805908,
3292
- "eval_runtime": 10.0776,
3293
- "eval_samples_per_second": 11743.987,
3294
- "eval_steps_per_second": 5.755,
3295
- "step": 21900
3296
- },
3297
- {
3298
- "epoch": 94.82758620689656,
3299
- "grad_norm": 0.1210562139749527,
3300
- "learning_rate": 1.3464884201409122e-05,
3301
- "loss": 0.8618,
3302
- "step": 22000
3303
- },
3304
- {
3305
- "epoch": 94.82758620689656,
3306
- "eval_loss": 5.740473747253418,
3307
- "eval_runtime": 9.8545,
3308
- "eval_samples_per_second": 12009.888,
3309
- "eval_steps_per_second": 5.886,
3310
- "step": 22000
3311
- },
3312
- {
3313
- "epoch": 95.25862068965517,
3314
- "grad_norm": 0.1231653168797493,
3315
- "learning_rate": 1.1331916607768001e-05,
3316
- "loss": 0.8614,
3317
- "step": 22100
3318
- },
3319
- {
3320
- "epoch": 95.25862068965517,
3321
- "eval_loss": 5.748401641845703,
3322
- "eval_runtime": 9.3274,
3323
- "eval_samples_per_second": 12688.59,
3324
- "eval_steps_per_second": 6.218,
3325
- "step": 22100
3326
- },
3327
- {
3328
- "epoch": 95.6896551724138,
3329
- "grad_norm": 0.12054872512817383,
3330
- "learning_rate": 9.381809389101825e-06,
3331
- "loss": 0.8606,
3332
- "step": 22200
3333
- },
3334
- {
3335
- "epoch": 95.6896551724138,
3336
- "eval_loss": 5.742131233215332,
3337
- "eval_runtime": 9.7151,
3338
- "eval_samples_per_second": 12182.214,
3339
- "eval_steps_per_second": 5.97,
3340
- "step": 22200
3341
- },
3342
- {
3343
- "epoch": 96.12068965517241,
3344
- "grad_norm": 0.12231267988681793,
3345
- "learning_rate": 7.614923229995796e-06,
3346
- "loss": 0.8617,
3347
- "step": 22300
3348
- },
3349
- {
3350
- "epoch": 96.12068965517241,
3351
- "eval_loss": 5.743917942047119,
3352
- "eval_runtime": 9.8384,
3353
- "eval_samples_per_second": 12029.513,
3354
- "eval_steps_per_second": 5.895,
3355
- "step": 22300
3356
- },
3357
- {
3358
- "epoch": 96.55172413793103,
3359
- "grad_norm": 0.12903857231140137,
3360
- "learning_rate": 6.03158492714806e-06,
3361
- "loss": 0.8606,
3362
- "step": 22400
3363
- },
3364
- {
3365
- "epoch": 96.55172413793103,
3366
- "eval_loss": 5.747518062591553,
3367
- "eval_runtime": 10.0786,
3368
- "eval_samples_per_second": 11742.801,
3369
- "eval_steps_per_second": 5.755,
3370
- "step": 22400
3371
- },
3372
- {
3373
- "epoch": 96.98275862068965,
3374
- "grad_norm": 0.12624970078468323,
3375
- "learning_rate": 4.632087328927947e-06,
3376
- "loss": 0.8612,
3377
- "step": 22500
3378
- },
3379
- {
3380
- "epoch": 96.98275862068965,
3381
- "eval_loss": 5.744797229766846,
3382
- "eval_runtime": 9.9675,
3383
- "eval_samples_per_second": 11873.695,
3384
- "eval_steps_per_second": 5.819,
3385
- "step": 22500
3386
- },
3387
- {
3388
- "epoch": 97.41379310344827,
3389
- "grad_norm": 0.11606816947460175,
3390
- "learning_rate": 3.4166892812107496e-06,
3391
- "loss": 0.8601,
3392
- "step": 22600
3393
- },
3394
- {
3395
- "epoch": 97.41379310344827,
3396
- "eval_loss": 5.747043609619141,
3397
- "eval_runtime": 9.7354,
3398
- "eval_samples_per_second": 12156.826,
3399
- "eval_steps_per_second": 5.958,
3400
- "step": 22600
3401
- },
3402
- {
3403
- "epoch": 97.84482758620689,
3404
- "grad_norm": 0.12005724757909775,
3405
- "learning_rate": 2.3856155795032307e-06,
3406
- "loss": 0.8606,
3407
- "step": 22700
3408
- },
3409
- {
3410
- "epoch": 97.84482758620689,
3411
- "eval_loss": 5.7480788230896,
3412
- "eval_runtime": 9.7932,
3413
- "eval_samples_per_second": 12085.032,
3414
- "eval_steps_per_second": 5.922,
3415
- "step": 22700
3416
- },
3417
- {
3418
- "epoch": 98.27586206896552,
3419
- "grad_norm": 0.12075820565223694,
3420
- "learning_rate": 1.5390569273660005e-06,
3421
- "loss": 0.8598,
3422
- "step": 22800
3423
- },
3424
- {
3425
- "epoch": 98.27586206896552,
3426
- "eval_loss": 5.749686241149902,
3427
- "eval_runtime": 9.95,
3428
- "eval_samples_per_second": 11894.516,
3429
- "eval_steps_per_second": 5.829,
3430
- "step": 22800
3431
- },
3432
- {
3433
- "epoch": 98.70689655172414,
3434
- "grad_norm": 0.12352725863456726,
3435
- "learning_rate": 8.771699011416167e-07,
3436
- "loss": 0.8598,
3437
- "step": 22900
3438
- },
3439
- {
3440
- "epoch": 98.70689655172414,
3441
- "eval_loss": 5.7505316734313965,
3442
- "eval_runtime": 9.3854,
3443
- "eval_samples_per_second": 12610.145,
3444
- "eval_steps_per_second": 6.18,
3445
- "step": 22900
3446
- },
3447
- {
3448
- "epoch": 99.13793103448276,
3449
- "grad_norm": 0.11930395662784576,
3450
- "learning_rate": 4.0007692099508495e-07,
3451
- "loss": 0.8599,
3452
- "step": 23000
3453
- },
3454
- {
3455
- "epoch": 99.13793103448276,
3456
- "eval_loss": 5.750540733337402,
3457
- "eval_runtime": 9.2142,
3458
- "eval_samples_per_second": 12844.449,
3459
- "eval_steps_per_second": 6.295,
3460
- "step": 23000
3461
- },
3462
- {
3463
- "epoch": 99.56896551724138,
3464
- "grad_norm": 0.11672677099704742,
3465
- "learning_rate": 1.0786622827108072e-07,
3466
- "loss": 0.8603,
3467
- "step": 23100
3468
- },
3469
- {
3470
- "epoch": 99.56896551724138,
3471
- "eval_loss": 5.750354766845703,
3472
- "eval_runtime": 9.2285,
3473
- "eval_samples_per_second": 12824.547,
3474
- "eval_steps_per_second": 6.285,
3475
- "step": 23100
3476
- }
3477
- ],
3478
- "logging_steps": 100,
3479
- "max_steps": 23200,
3480
- "num_input_tokens_seen": 0,
3481
- "num_train_epochs": 100,
3482
- "save_steps": 100,
3483
- "stateful_callbacks": {
3484
- "TrainerControl": {
3485
- "args": {
3486
- "should_epoch_stop": false,
3487
- "should_evaluate": false,
3488
- "should_log": false,
3489
- "should_save": true,
3490
- "should_training_stop": false
3491
- },
3492
- "attributes": {}
3493
- }
3494
- },
3495
- "total_flos": 1.5693199771340636e+18,
3496
- "train_batch_size": 256,
3497
- "trial_name": null,
3498
- "trial_params": null
3499
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gpt2cocoSAD/model_SAD/checkpoint/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9617fbe5a0c3abe149306a96ad0a9d4a3ca9a552041eae0dde931fc7e7d5e0bf
3
- size 5432
 
 
 
 
gpt2cocoSAD/model_SAD/checkpoint/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
gpt2cocoSAD/tokenized_data_SAD/dataset_dict.json DELETED
@@ -1 +0,0 @@
1
- {"splits": ["train", "test"]}
 
 
gpt2cocoSAD/tokenized_data_SAD/test/data-00000-of-00001.arrow DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f03321f4766876b1f90d58a01bc178d49183785fb5a314342037dad68d68343f
3
- size 120516976
 
 
 
 
gpt2cocoSAD/tokenized_data_SAD/test/dataset_info.json DELETED
@@ -1,49 +0,0 @@
1
- {
2
- "citation": "",
3
- "description": "",
4
- "features": {
5
- "similarity": {
6
- "dtype": "float32",
7
- "_type": "Value"
8
- },
9
- "aesthetics_score": {
10
- "dtype": "float32",
11
- "_type": "Value"
12
- },
13
- "IQAs": {
14
- "dtype": "float32",
15
- "_type": "Value"
16
- },
17
- "prompt": {
18
- "dtype": "string",
19
- "_type": "Value"
20
- },
21
- "query": {
22
- "dtype": "string",
23
- "_type": "Value"
24
- },
25
- "input_ids": {
26
- "feature": {
27
- "dtype": "int32",
28
- "_type": "Value"
29
- },
30
- "_type": "List"
31
- },
32
- "attention_mask": {
33
- "feature": {
34
- "dtype": "int8",
35
- "_type": "Value"
36
- },
37
- "_type": "List"
38
- },
39
- "labels": {
40
- "feature": {
41
- "dtype": "int64",
42
- "_type": "Value"
43
- },
44
- "_type": "List"
45
- }
46
- },
47
- "homepage": "",
48
- "license": ""
49
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gpt2cocoSAD/tokenized_data_SAD/test/state.json DELETED
@@ -1,22 +0,0 @@
1
- {
2
- "_data_files": [
3
- {
4
- "filename": "data-00000-of-00001.arrow"
5
- }
6
- ],
7
- "_fingerprint": "6f9a3d303ecf4724",
8
- "_format_columns": [
9
- "IQAs",
10
- "aesthetics_score",
11
- "attention_mask",
12
- "input_ids",
13
- "labels",
14
- "prompt",
15
- "query",
16
- "similarity"
17
- ],
18
- "_format_kwargs": {},
19
- "_format_type": "torch",
20
- "_output_all_columns": false,
21
- "_split": null
22
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gpt2cocoSAD/tokenized_data_SAD/train/data-00000-of-00001.arrow DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:69b57887cf3f44daf4894e056d56ae569e8052d1f3095fd0eb15c1b3faf264ed
3
- size 482053608
 
 
 
 
gpt2cocoSAD/tokenized_data_SAD/train/dataset_info.json DELETED
@@ -1,49 +0,0 @@
1
- {
2
- "citation": "",
3
- "description": "",
4
- "features": {
5
- "similarity": {
6
- "dtype": "float32",
7
- "_type": "Value"
8
- },
9
- "aesthetics_score": {
10
- "dtype": "float32",
11
- "_type": "Value"
12
- },
13
- "IQAs": {
14
- "dtype": "float32",
15
- "_type": "Value"
16
- },
17
- "prompt": {
18
- "dtype": "string",
19
- "_type": "Value"
20
- },
21
- "query": {
22
- "dtype": "string",
23
- "_type": "Value"
24
- },
25
- "input_ids": {
26
- "feature": {
27
- "dtype": "int32",
28
- "_type": "Value"
29
- },
30
- "_type": "List"
31
- },
32
- "attention_mask": {
33
- "feature": {
34
- "dtype": "int8",
35
- "_type": "Value"
36
- },
37
- "_type": "List"
38
- },
39
- "labels": {
40
- "feature": {
41
- "dtype": "int64",
42
- "_type": "Value"
43
- },
44
- "_type": "List"
45
- }
46
- },
47
- "homepage": "",
48
- "license": ""
49
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gpt2cocoSAD/tokenized_data_SAD/train/state.json DELETED
@@ -1,22 +0,0 @@
1
- {
2
- "_data_files": [
3
- {
4
- "filename": "data-00000-of-00001.arrow"
5
- }
6
- ],
7
- "_fingerprint": "faf36a2481d8a52b",
8
- "_format_columns": [
9
- "IQAs",
10
- "aesthetics_score",
11
- "attention_mask",
12
- "input_ids",
13
- "labels",
14
- "prompt",
15
- "query",
16
- "similarity"
17
- ],
18
- "_format_kwargs": {},
19
- "_format_type": "torch",
20
- "_output_all_columns": false,
21
- "_split": null
22
- }