MothMalone commited on
Commit
5186da4
·
verified ·
1 Parent(s): 7d1e8a5

Upload folder using huggingface_hub

Browse files
Files changed (48) hide show
  1. .ipynb_checkpoints/eval_metrics-checkpoint.json +6 -0
  2. .ipynb_checkpoints/special_tokens_map-checkpoint.json +30 -0
  3. .ipynb_checkpoints/tokenizer-checkpoint.json +0 -0
  4. checkpoint-1000/config.json +29 -0
  5. checkpoint-1000/generation_config.json +7 -0
  6. checkpoint-1000/model.safetensors +3 -0
  7. checkpoint-1000/optimizer.pt +3 -0
  8. checkpoint-1000/rng_state.pth +3 -0
  9. checkpoint-1000/scheduler.pt +3 -0
  10. checkpoint-1000/special_tokens_map.json +30 -0
  11. checkpoint-1000/tokenizer.json +0 -0
  12. checkpoint-1000/tokenizer_config.json +51 -0
  13. checkpoint-1000/trainer_state.json +734 -0
  14. checkpoint-1000/training_args.bin +3 -0
  15. checkpoint-1125/.ipynb_checkpoints/generation_config-checkpoint.json +7 -0
  16. checkpoint-1125/.ipynb_checkpoints/tokenizer-checkpoint.json +0 -0
  17. checkpoint-1125/.ipynb_checkpoints/trainer_state-checkpoint.json +818 -0
  18. checkpoint-1125/config.json +29 -0
  19. checkpoint-1125/generation_config.json +7 -0
  20. checkpoint-1125/model.safetensors +3 -0
  21. checkpoint-1125/optimizer.pt +3 -0
  22. checkpoint-1125/rng_state.pth +3 -0
  23. checkpoint-1125/scheduler.pt +3 -0
  24. checkpoint-1125/special_tokens_map.json +30 -0
  25. checkpoint-1125/tokenizer.json +0 -0
  26. checkpoint-1125/tokenizer_config.json +51 -0
  27. checkpoint-1125/trainer_state.json +818 -0
  28. checkpoint-1125/training_args.bin +3 -0
  29. checkpoint-500/config.json +29 -0
  30. checkpoint-500/generation_config.json +7 -0
  31. checkpoint-500/model.safetensors +3 -0
  32. checkpoint-500/optimizer.pt +3 -0
  33. checkpoint-500/rng_state.pth +3 -0
  34. checkpoint-500/scheduler.pt +3 -0
  35. checkpoint-500/special_tokens_map.json +30 -0
  36. checkpoint-500/tokenizer.json +0 -0
  37. checkpoint-500/tokenizer_config.json +51 -0
  38. checkpoint-500/trainer_state.json +384 -0
  39. checkpoint-500/training_args.bin +3 -0
  40. config.json +29 -0
  41. eval_metrics.json +6 -0
  42. generation_config.json +7 -0
  43. inference_results.json +43 -0
  44. model.safetensors +3 -0
  45. special_tokens_map.json +30 -0
  46. tokenizer.json +0 -0
  47. tokenizer_config.json +51 -0
  48. training_args.bin +3 -0
.ipynb_checkpoints/eval_metrics-checkpoint.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "eval_runtime": 54.5547,
3
+ "eval_samples_per_second": 18.33,
4
+ "eval_steps_per_second": 2.291,
5
+ "epoch": 1.0
6
+ }
.ipynb_checkpoints/special_tokens_map-checkpoint.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
.ipynb_checkpoints/tokenizer-checkpoint.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 64,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 5632,
14
+ "max_position_embeddings": 2048,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 32,
18
+ "num_hidden_layers": 22,
19
+ "num_key_value_heads": 4,
20
+ "pretraining_tp": 1,
21
+ "rms_norm_eps": 1e-05,
22
+ "rope_scaling": null,
23
+ "rope_theta": 10000.0,
24
+ "tie_word_embeddings": false,
25
+ "torch_dtype": "bfloat16",
26
+ "transformers_version": "4.52.4",
27
+ "use_cache": true,
28
+ "vocab_size": 32001
29
+ }
checkpoint-1000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "max_length": 2048,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.52.4"
7
+ }
checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0aac3c6bc9755e318bb14063db8d7b06cb54c7e242945ae7d054fc134596514a
3
+ size 2200128056
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:377a3b222f683f34e864837094a2e92d6b98c4a4b280ffc054da9551f5d553b2
3
+ size 2626422906
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
3
+ size 14244
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca8f54a5e5f4ad44505113217beae8d0cea6c78024de1e51bb2c964c933a0e8a
3
+ size 1064
checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
checkpoint-1000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "bos_token": "<s>",
40
+ "clean_up_tokenization_spaces": false,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "legacy": false,
44
+ "model_max_length": 1000000000000000019884624838656,
45
+ "pad_token": "[PAD]",
46
+ "padding_side": "right",
47
+ "sp_model_kwargs": {},
48
+ "tokenizer_class": "LlamaTokenizer",
49
+ "unk_token": "<unk>",
50
+ "use_default_system_prompt": false
51
+ }
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,734 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.8888888888888888,
6
+ "eval_steps": 500,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.008888888888888889,
14
+ "grad_norm": 329728.0,
15
+ "learning_rate": 1.592920353982301e-06,
16
+ "loss": 6840.2352,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.017777777777777778,
21
+ "grad_norm": 110080.0,
22
+ "learning_rate": 3.36283185840708e-06,
23
+ "loss": 6186.3184,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.02666666666666667,
28
+ "grad_norm": 67584.0,
29
+ "learning_rate": 5.132743362831859e-06,
30
+ "loss": 5175.825,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.035555555555555556,
35
+ "grad_norm": 28800.0,
36
+ "learning_rate": 6.902654867256637e-06,
37
+ "loss": 4705.709,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.044444444444444446,
42
+ "grad_norm": 31232.0,
43
+ "learning_rate": 8.672566371681418e-06,
44
+ "loss": 4302.4035,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.05333333333333334,
49
+ "grad_norm": 31232.0,
50
+ "learning_rate": 1.0442477876106197e-05,
51
+ "loss": 4149.0633,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.06222222222222222,
56
+ "grad_norm": 32000.0,
57
+ "learning_rate": 1.2212389380530973e-05,
58
+ "loss": 3953.5156,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.07111111111111111,
63
+ "grad_norm": 18432.0,
64
+ "learning_rate": 1.3982300884955752e-05,
65
+ "loss": 3825.391,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.08,
70
+ "grad_norm": 27008.0,
71
+ "learning_rate": 1.5752212389380532e-05,
72
+ "loss": 3717.7289,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.08888888888888889,
77
+ "grad_norm": 33536.0,
78
+ "learning_rate": 1.7522123893805313e-05,
79
+ "loss": 3600.6922,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.09777777777777778,
84
+ "grad_norm": 41984.0,
85
+ "learning_rate": 1.929203539823009e-05,
86
+ "loss": 3577.2379,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.10666666666666667,
91
+ "grad_norm": 22912.0,
92
+ "learning_rate": 1.999826540268562e-05,
93
+ "loss": 3496.2227,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.11555555555555555,
98
+ "grad_norm": 14848.0,
99
+ "learning_rate": 1.998766726491935e-05,
100
+ "loss": 3380.4918,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.12444444444444444,
105
+ "grad_norm": 20352.0,
106
+ "learning_rate": 1.9967444854710313e-05,
107
+ "loss": 3359.9414,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.13333333333333333,
112
+ "grad_norm": 33024.0,
113
+ "learning_rate": 1.9937617658689385e-05,
114
+ "loss": 3332.8012,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.14222222222222222,
119
+ "grad_norm": 16640.0,
120
+ "learning_rate": 1.989821441880933e-05,
121
+ "loss": 3265.1035,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.1511111111111111,
126
+ "grad_norm": 19840.0,
127
+ "learning_rate": 1.9849273104648592e-05,
128
+ "loss": 3195.4633,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.16,
133
+ "grad_norm": 16896.0,
134
+ "learning_rate": 1.979084087682323e-05,
135
+ "loss": 3178.0693,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.1688888888888889,
140
+ "grad_norm": 16064.0,
141
+ "learning_rate": 1.9722974041542205e-05,
142
+ "loss": 3216.1639,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.17777777777777778,
147
+ "grad_norm": 26112.0,
148
+ "learning_rate": 1.9645737996349828e-05,
149
+ "loss": 3163.891,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.18666666666666668,
154
+ "grad_norm": 23040.0,
155
+ "learning_rate": 1.9559207167107684e-05,
156
+ "loss": 3117.5008,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.19555555555555557,
161
+ "grad_norm": 17664.0,
162
+ "learning_rate": 1.9463464936276676e-05,
163
+ "loss": 3096.1451,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.20444444444444446,
168
+ "grad_norm": 16256.0,
169
+ "learning_rate": 1.9358603562568417e-05,
170
+ "loss": 3047.9447,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.21333333333333335,
175
+ "grad_norm": 19584.0,
176
+ "learning_rate": 1.924472409204326e-05,
177
+ "loss": 3054.9166,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.2222222222222222,
182
+ "grad_norm": 18432.0,
183
+ "learning_rate": 1.9121936260740752e-05,
184
+ "loss": 3090.7711,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.2311111111111111,
189
+ "grad_norm": 13056.0,
190
+ "learning_rate": 1.899035838893627e-05,
191
+ "loss": 3075.3973,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.24,
196
+ "grad_norm": 15744.0,
197
+ "learning_rate": 1.885011726712574e-05,
198
+ "loss": 2975.9182,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.24888888888888888,
203
+ "grad_norm": 21632.0,
204
+ "learning_rate": 1.870134803384834e-05,
205
+ "loss": 3013.1484,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.2577777777777778,
210
+ "grad_norm": 13888.0,
211
+ "learning_rate": 1.8544194045464888e-05,
212
+ "loss": 3003.3625,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.26666666666666666,
217
+ "grad_norm": 14592.0,
218
+ "learning_rate": 1.837880673801741e-05,
219
+ "loss": 2980.8666,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.27555555555555555,
224
+ "grad_norm": 19456.0,
225
+ "learning_rate": 1.8205345481303e-05,
226
+ "loss": 2986.1092,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.28444444444444444,
231
+ "grad_norm": 22528.0,
232
+ "learning_rate": 1.802397742530259e-05,
233
+ "loss": 2940.208,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.29333333333333333,
238
+ "grad_norm": 13376.0,
239
+ "learning_rate": 1.7834877339112613e-05,
240
+ "loss": 3002.926,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.3022222222222222,
245
+ "grad_norm": 16192.0,
246
+ "learning_rate": 1.763822744253477e-05,
247
+ "loss": 3008.3018,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.3111111111111111,
252
+ "grad_norm": 19840.0,
253
+ "learning_rate": 1.7434217230486162e-05,
254
+ "loss": 2978.5563,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.32,
259
+ "grad_norm": 19968.0,
260
+ "learning_rate": 1.7223043290399065e-05,
261
+ "loss": 2917.9051,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.3288888888888889,
266
+ "grad_norm": 11968.0,
267
+ "learning_rate": 1.7004909112786142e-05,
268
+ "loss": 2916.0309,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.3377777777777778,
273
+ "grad_norm": 13504.0,
274
+ "learning_rate": 1.6780024895153862e-05,
275
+ "loss": 2964.793,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.3466666666666667,
280
+ "grad_norm": 32256.0,
281
+ "learning_rate": 1.6548607339452853e-05,
282
+ "loss": 2886.1475,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.35555555555555557,
287
+ "grad_norm": 11648.0,
288
+ "learning_rate": 1.631087944326053e-05,
289
+ "loss": 2871.3844,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.36444444444444446,
294
+ "grad_norm": 13504.0,
295
+ "learning_rate": 1.606707028489714e-05,
296
+ "loss": 2951.7568,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.37333333333333335,
301
+ "grad_norm": 9344.0,
302
+ "learning_rate": 1.5817414802682292e-05,
303
+ "loss": 2948.7459,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.38222222222222224,
308
+ "grad_norm": 14336.0,
309
+ "learning_rate": 1.5562153568544753e-05,
310
+ "loss": 2925.3027,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.39111111111111113,
315
+ "grad_norm": 17280.0,
316
+ "learning_rate": 1.5301532556203524e-05,
317
+ "loss": 2861.5111,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.4,
322
+ "grad_norm": 15808.0,
323
+ "learning_rate": 1.503580290414376e-05,
324
+ "loss": 2862.7781,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.4088888888888889,
329
+ "grad_norm": 10880.0,
330
+ "learning_rate": 1.4765220673615774e-05,
331
+ "loss": 2894.2242,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.4177777777777778,
336
+ "grad_norm": 20224.0,
337
+ "learning_rate": 1.4490046601890405e-05,
338
+ "loss": 2876.3199,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.4266666666666667,
343
+ "grad_norm": 12928.0,
344
+ "learning_rate": 1.4210545851008529e-05,
345
+ "loss": 2894.2551,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.43555555555555553,
350
+ "grad_norm": 11776.0,
351
+ "learning_rate": 1.3926987752266733e-05,
352
+ "loss": 2852.1971,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.4444444444444444,
357
+ "grad_norm": 17664.0,
358
+ "learning_rate": 1.36396455466855e-05,
359
+ "loss": 2854.5852,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.4533333333333333,
364
+ "grad_norm": 16768.0,
365
+ "learning_rate": 1.3348796121709862e-05,
366
+ "loss": 2917.9811,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.4622222222222222,
371
+ "grad_norm": 9728.0,
372
+ "learning_rate": 1.3054719744396333e-05,
373
+ "loss": 2848.7223,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.4711111111111111,
378
+ "grad_norm": 14912.0,
379
+ "learning_rate": 1.2757699791343188e-05,
380
+ "loss": 2840.7688,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.48,
385
+ "grad_norm": 21120.0,
386
+ "learning_rate": 1.2458022475624343e-05,
387
+ "loss": 2890.6824,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.4888888888888889,
392
+ "grad_norm": 12544.0,
393
+ "learning_rate": 1.2155976570989949e-05,
394
+ "loss": 2826.4334,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.49777777777777776,
399
+ "grad_norm": 18560.0,
400
+ "learning_rate": 1.1851853133599507e-05,
401
+ "loss": 2879.9736,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.5066666666666667,
406
+ "grad_norm": 12288.0,
407
+ "learning_rate": 1.1545945221555571e-05,
408
+ "loss": 2821.0361,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.5155555555555555,
413
+ "grad_norm": 9536.0,
414
+ "learning_rate": 1.12385476125084e-05,
415
+ "loss": 2807.2881,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.5244444444444445,
420
+ "grad_norm": 10048.0,
421
+ "learning_rate": 1.0929956519603595e-05,
422
+ "loss": 2811.4141,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.5333333333333333,
427
+ "grad_norm": 15296.0,
428
+ "learning_rate": 1.0620469306046473e-05,
429
+ "loss": 2790.8965,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.5422222222222223,
434
+ "grad_norm": 19584.0,
435
+ "learning_rate": 1.0310384198558226e-05,
436
+ "loss": 2852.466,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.5511111111111111,
441
+ "grad_norm": 8160.0,
442
+ "learning_rate": 1e-05,
443
+ "loss": 2758.1283,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.56,
448
+ "grad_norm": 11008.0,
449
+ "learning_rate": 9.689615801441776e-06,
450
+ "loss": 2822.6701,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.5688888888888889,
455
+ "grad_norm": 14080.0,
456
+ "learning_rate": 9.37953069395353e-06,
457
+ "loss": 2809.1648,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.5777777777777777,
462
+ "grad_norm": 15104.0,
463
+ "learning_rate": 9.070043480396404e-06,
464
+ "loss": 2821.9559,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.5866666666666667,
469
+ "grad_norm": 19200.0,
470
+ "learning_rate": 8.761452387491601e-06,
471
+ "loss": 2818.5842,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.5955555555555555,
476
+ "grad_norm": 15808.0,
477
+ "learning_rate": 8.45405477844443e-06,
478
+ "loss": 2798.8832,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.6044444444444445,
483
+ "grad_norm": 11392.0,
484
+ "learning_rate": 8.148146866400498e-06,
485
+ "loss": 2779.3459,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.6133333333333333,
490
+ "grad_norm": 17792.0,
491
+ "learning_rate": 7.844023429010051e-06,
492
+ "loss": 2801.9057,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.6222222222222222,
497
+ "grad_norm": 19200.0,
498
+ "learning_rate": 7.541977524375661e-06,
499
+ "loss": 2828.118,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.6311111111111111,
504
+ "grad_norm": 14016.0,
505
+ "learning_rate": 7.242300208656814e-06,
506
+ "loss": 2800.459,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.64,
511
+ "grad_norm": 15872.0,
512
+ "learning_rate": 6.9452802556036705e-06,
513
+ "loss": 2761.099,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.6488888888888888,
518
+ "grad_norm": 16512.0,
519
+ "learning_rate": 6.651203878290139e-06,
520
+ "loss": 2769.051,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.6577777777777778,
525
+ "grad_norm": 11264.0,
526
+ "learning_rate": 6.360354453314502e-06,
527
+ "loss": 2813.532,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 0.6666666666666666,
532
+ "grad_norm": 12160.0,
533
+ "learning_rate": 6.073012247733267e-06,
534
+ "loss": 2761.1883,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 0.6755555555555556,
539
+ "grad_norm": 11840.0,
540
+ "learning_rate": 5.789454148991477e-06,
541
+ "loss": 2775.6535,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 0.6844444444444444,
546
+ "grad_norm": 14656.0,
547
+ "learning_rate": 5.5099533981095945e-06,
548
+ "loss": 2782.9367,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 0.6933333333333334,
553
+ "grad_norm": 14144.0,
554
+ "learning_rate": 5.234779326384227e-06,
555
+ "loss": 2781.4521,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 0.7022222222222222,
560
+ "grad_norm": 13760.0,
561
+ "learning_rate": 4.964197095856237e-06,
562
+ "loss": 2839.6955,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 0.7111111111111111,
567
+ "grad_norm": 17024.0,
568
+ "learning_rate": 4.698467443796479e-06,
569
+ "loss": 2808.0578,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 0.72,
574
+ "grad_norm": 15488.0,
575
+ "learning_rate": 4.437846431455249e-06,
576
+ "loss": 2783.9123,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 0.7288888888888889,
581
+ "grad_norm": 10560.0,
582
+ "learning_rate": 4.182585197317709e-06,
583
+ "loss": 2823.9398,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 0.7377777777777778,
588
+ "grad_norm": 11840.0,
589
+ "learning_rate": 3.932929715102863e-06,
590
+ "loss": 2776.0838,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 0.7466666666666667,
595
+ "grad_norm": 15424.0,
596
+ "learning_rate": 3.689120556739475e-06,
597
+ "loss": 2765.9592,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 0.7555555555555555,
602
+ "grad_norm": 17536.0,
603
+ "learning_rate": 3.4513926605471504e-06,
604
+ "loss": 2757.6773,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 0.7644444444444445,
609
+ "grad_norm": 10560.0,
610
+ "learning_rate": 3.2199751048461414e-06,
611
+ "loss": 2745.9385,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 0.7733333333333333,
616
+ "grad_norm": 7392.0,
617
+ "learning_rate": 2.9950908872138585e-06,
618
+ "loss": 2768.2516,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 0.7822222222222223,
623
+ "grad_norm": 10880.0,
624
+ "learning_rate": 2.776956709600941e-06,
625
+ "loss": 2775.5641,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 0.7911111111111111,
630
+ "grad_norm": 9792.0,
631
+ "learning_rate": 2.565782769513837e-06,
632
+ "loss": 2746.8189,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 0.8,
637
+ "grad_norm": 13120.0,
638
+ "learning_rate": 2.3617725574652352e-06,
639
+ "loss": 2855.6207,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 0.8088888888888889,
644
+ "grad_norm": 8640.0,
645
+ "learning_rate": 2.1651226608873876e-06,
646
+ "loss": 2792.5463,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 0.8177777777777778,
651
+ "grad_norm": 15808.0,
652
+ "learning_rate": 1.9760225746974136e-06,
653
+ "loss": 2792.7549,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 0.8266666666666667,
658
+ "grad_norm": 12800.0,
659
+ "learning_rate": 1.7946545186970022e-06,
660
+ "loss": 2801.327,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 0.8355555555555556,
665
+ "grad_norm": 11520.0,
666
+ "learning_rate": 1.6211932619825932e-06,
667
+ "loss": 2731.5381,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 0.8444444444444444,
672
+ "grad_norm": 14144.0,
673
+ "learning_rate": 1.4558059545351144e-06,
674
+ "loss": 2826.2076,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 0.8533333333333334,
679
+ "grad_norm": 13184.0,
680
+ "learning_rate": 1.298651966151665e-06,
681
+ "loss": 2842.1312,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 0.8622222222222222,
686
+ "grad_norm": 8256.0,
687
+ "learning_rate": 1.1498827328742623e-06,
688
+ "loss": 2740.9932,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 0.8711111111111111,
693
+ "grad_norm": 8640.0,
694
+ "learning_rate": 1.009641611063732e-06,
695
+ "loss": 2773.5756,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 0.88,
700
+ "grad_norm": 10688.0,
701
+ "learning_rate": 8.780637392592494e-07,
702
+ "loss": 2739.3299,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 0.8888888888888888,
707
+ "grad_norm": 9408.0,
708
+ "learning_rate": 7.552759079567418e-07,
709
+ "loss": 2791.6674,
710
+ "step": 1000
711
+ }
712
+ ],
713
+ "logging_steps": 10,
714
+ "max_steps": 1125,
715
+ "num_input_tokens_seen": 0,
716
+ "num_train_epochs": 1,
717
+ "save_steps": 500,
718
+ "stateful_callbacks": {
719
+ "TrainerControl": {
720
+ "args": {
721
+ "should_epoch_stop": false,
722
+ "should_evaluate": false,
723
+ "should_log": false,
724
+ "should_save": true,
725
+ "should_training_stop": false
726
+ },
727
+ "attributes": {}
728
+ }
729
+ },
730
+ "total_flos": 5.0848453361664e+16,
731
+ "train_batch_size": 1,
732
+ "trial_name": null,
733
+ "trial_params": null
734
+ }
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8e1bafbdfd4c195674c42da05ea52538467616210e2cea66a39948e8046595b
3
+ size 5304
checkpoint-1125/.ipynb_checkpoints/generation_config-checkpoint.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "max_length": 2048,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.52.4"
7
+ }
checkpoint-1125/.ipynb_checkpoints/tokenizer-checkpoint.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1125/.ipynb_checkpoints/trainer_state-checkpoint.json ADDED
@@ -0,0 +1,818 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1125,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.008888888888888889,
14
+ "grad_norm": 329728.0,
15
+ "learning_rate": 1.592920353982301e-06,
16
+ "loss": 6840.2352,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.017777777777777778,
21
+ "grad_norm": 110080.0,
22
+ "learning_rate": 3.36283185840708e-06,
23
+ "loss": 6186.3184,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.02666666666666667,
28
+ "grad_norm": 67584.0,
29
+ "learning_rate": 5.132743362831859e-06,
30
+ "loss": 5175.825,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.035555555555555556,
35
+ "grad_norm": 28800.0,
36
+ "learning_rate": 6.902654867256637e-06,
37
+ "loss": 4705.709,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.044444444444444446,
42
+ "grad_norm": 31232.0,
43
+ "learning_rate": 8.672566371681418e-06,
44
+ "loss": 4302.4035,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.05333333333333334,
49
+ "grad_norm": 31232.0,
50
+ "learning_rate": 1.0442477876106197e-05,
51
+ "loss": 4149.0633,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.06222222222222222,
56
+ "grad_norm": 32000.0,
57
+ "learning_rate": 1.2212389380530973e-05,
58
+ "loss": 3953.5156,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.07111111111111111,
63
+ "grad_norm": 18432.0,
64
+ "learning_rate": 1.3982300884955752e-05,
65
+ "loss": 3825.391,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.08,
70
+ "grad_norm": 27008.0,
71
+ "learning_rate": 1.5752212389380532e-05,
72
+ "loss": 3717.7289,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.08888888888888889,
77
+ "grad_norm": 33536.0,
78
+ "learning_rate": 1.7522123893805313e-05,
79
+ "loss": 3600.6922,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.09777777777777778,
84
+ "grad_norm": 41984.0,
85
+ "learning_rate": 1.929203539823009e-05,
86
+ "loss": 3577.2379,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.10666666666666667,
91
+ "grad_norm": 22912.0,
92
+ "learning_rate": 1.999826540268562e-05,
93
+ "loss": 3496.2227,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.11555555555555555,
98
+ "grad_norm": 14848.0,
99
+ "learning_rate": 1.998766726491935e-05,
100
+ "loss": 3380.4918,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.12444444444444444,
105
+ "grad_norm": 20352.0,
106
+ "learning_rate": 1.9967444854710313e-05,
107
+ "loss": 3359.9414,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.13333333333333333,
112
+ "grad_norm": 33024.0,
113
+ "learning_rate": 1.9937617658689385e-05,
114
+ "loss": 3332.8012,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.14222222222222222,
119
+ "grad_norm": 16640.0,
120
+ "learning_rate": 1.989821441880933e-05,
121
+ "loss": 3265.1035,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.1511111111111111,
126
+ "grad_norm": 19840.0,
127
+ "learning_rate": 1.9849273104648592e-05,
128
+ "loss": 3195.4633,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.16,
133
+ "grad_norm": 16896.0,
134
+ "learning_rate": 1.979084087682323e-05,
135
+ "loss": 3178.0693,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.1688888888888889,
140
+ "grad_norm": 16064.0,
141
+ "learning_rate": 1.9722974041542205e-05,
142
+ "loss": 3216.1639,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.17777777777777778,
147
+ "grad_norm": 26112.0,
148
+ "learning_rate": 1.9645737996349828e-05,
149
+ "loss": 3163.891,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.18666666666666668,
154
+ "grad_norm": 23040.0,
155
+ "learning_rate": 1.9559207167107684e-05,
156
+ "loss": 3117.5008,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.19555555555555557,
161
+ "grad_norm": 17664.0,
162
+ "learning_rate": 1.9463464936276676e-05,
163
+ "loss": 3096.1451,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.20444444444444446,
168
+ "grad_norm": 16256.0,
169
+ "learning_rate": 1.9358603562568417e-05,
170
+ "loss": 3047.9447,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.21333333333333335,
175
+ "grad_norm": 19584.0,
176
+ "learning_rate": 1.924472409204326e-05,
177
+ "loss": 3054.9166,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.2222222222222222,
182
+ "grad_norm": 18432.0,
183
+ "learning_rate": 1.9121936260740752e-05,
184
+ "loss": 3090.7711,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.2311111111111111,
189
+ "grad_norm": 13056.0,
190
+ "learning_rate": 1.899035838893627e-05,
191
+ "loss": 3075.3973,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.24,
196
+ "grad_norm": 15744.0,
197
+ "learning_rate": 1.885011726712574e-05,
198
+ "loss": 2975.9182,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.24888888888888888,
203
+ "grad_norm": 21632.0,
204
+ "learning_rate": 1.870134803384834e-05,
205
+ "loss": 3013.1484,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.2577777777777778,
210
+ "grad_norm": 13888.0,
211
+ "learning_rate": 1.8544194045464888e-05,
212
+ "loss": 3003.3625,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.26666666666666666,
217
+ "grad_norm": 14592.0,
218
+ "learning_rate": 1.837880673801741e-05,
219
+ "loss": 2980.8666,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.27555555555555555,
224
+ "grad_norm": 19456.0,
225
+ "learning_rate": 1.8205345481303e-05,
226
+ "loss": 2986.1092,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.28444444444444444,
231
+ "grad_norm": 22528.0,
232
+ "learning_rate": 1.802397742530259e-05,
233
+ "loss": 2940.208,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.29333333333333333,
238
+ "grad_norm": 13376.0,
239
+ "learning_rate": 1.7834877339112613e-05,
240
+ "loss": 3002.926,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.3022222222222222,
245
+ "grad_norm": 16192.0,
246
+ "learning_rate": 1.763822744253477e-05,
247
+ "loss": 3008.3018,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.3111111111111111,
252
+ "grad_norm": 19840.0,
253
+ "learning_rate": 1.7434217230486162e-05,
254
+ "loss": 2978.5563,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.32,
259
+ "grad_norm": 19968.0,
260
+ "learning_rate": 1.7223043290399065e-05,
261
+ "loss": 2917.9051,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.3288888888888889,
266
+ "grad_norm": 11968.0,
267
+ "learning_rate": 1.7004909112786142e-05,
268
+ "loss": 2916.0309,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.3377777777777778,
273
+ "grad_norm": 13504.0,
274
+ "learning_rate": 1.6780024895153862e-05,
275
+ "loss": 2964.793,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.3466666666666667,
280
+ "grad_norm": 32256.0,
281
+ "learning_rate": 1.6548607339452853e-05,
282
+ "loss": 2886.1475,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.35555555555555557,
287
+ "grad_norm": 11648.0,
288
+ "learning_rate": 1.631087944326053e-05,
289
+ "loss": 2871.3844,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.36444444444444446,
294
+ "grad_norm": 13504.0,
295
+ "learning_rate": 1.606707028489714e-05,
296
+ "loss": 2951.7568,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.37333333333333335,
301
+ "grad_norm": 9344.0,
302
+ "learning_rate": 1.5817414802682292e-05,
303
+ "loss": 2948.7459,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.38222222222222224,
308
+ "grad_norm": 14336.0,
309
+ "learning_rate": 1.5562153568544753e-05,
310
+ "loss": 2925.3027,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.39111111111111113,
315
+ "grad_norm": 17280.0,
316
+ "learning_rate": 1.5301532556203524e-05,
317
+ "loss": 2861.5111,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.4,
322
+ "grad_norm": 15808.0,
323
+ "learning_rate": 1.503580290414376e-05,
324
+ "loss": 2862.7781,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.4088888888888889,
329
+ "grad_norm": 10880.0,
330
+ "learning_rate": 1.4765220673615774e-05,
331
+ "loss": 2894.2242,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.4177777777777778,
336
+ "grad_norm": 20224.0,
337
+ "learning_rate": 1.4490046601890405e-05,
338
+ "loss": 2876.3199,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.4266666666666667,
343
+ "grad_norm": 12928.0,
344
+ "learning_rate": 1.4210545851008529e-05,
345
+ "loss": 2894.2551,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.43555555555555553,
350
+ "grad_norm": 11776.0,
351
+ "learning_rate": 1.3926987752266733e-05,
352
+ "loss": 2852.1971,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.4444444444444444,
357
+ "grad_norm": 17664.0,
358
+ "learning_rate": 1.36396455466855e-05,
359
+ "loss": 2854.5852,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.4533333333333333,
364
+ "grad_norm": 16768.0,
365
+ "learning_rate": 1.3348796121709862e-05,
366
+ "loss": 2917.9811,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.4622222222222222,
371
+ "grad_norm": 9728.0,
372
+ "learning_rate": 1.3054719744396333e-05,
373
+ "loss": 2848.7223,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.4711111111111111,
378
+ "grad_norm": 14912.0,
379
+ "learning_rate": 1.2757699791343188e-05,
380
+ "loss": 2840.7688,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.48,
385
+ "grad_norm": 21120.0,
386
+ "learning_rate": 1.2458022475624343e-05,
387
+ "loss": 2890.6824,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.4888888888888889,
392
+ "grad_norm": 12544.0,
393
+ "learning_rate": 1.2155976570989949e-05,
394
+ "loss": 2826.4334,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.49777777777777776,
399
+ "grad_norm": 18560.0,
400
+ "learning_rate": 1.1851853133599507e-05,
401
+ "loss": 2879.9736,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.5066666666666667,
406
+ "grad_norm": 12288.0,
407
+ "learning_rate": 1.1545945221555571e-05,
408
+ "loss": 2821.0361,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.5155555555555555,
413
+ "grad_norm": 9536.0,
414
+ "learning_rate": 1.12385476125084e-05,
415
+ "loss": 2807.2881,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.5244444444444445,
420
+ "grad_norm": 10048.0,
421
+ "learning_rate": 1.0929956519603595e-05,
422
+ "loss": 2811.4141,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.5333333333333333,
427
+ "grad_norm": 15296.0,
428
+ "learning_rate": 1.0620469306046473e-05,
429
+ "loss": 2790.8965,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.5422222222222223,
434
+ "grad_norm": 19584.0,
435
+ "learning_rate": 1.0310384198558226e-05,
436
+ "loss": 2852.466,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.5511111111111111,
441
+ "grad_norm": 8160.0,
442
+ "learning_rate": 1e-05,
443
+ "loss": 2758.1283,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.56,
448
+ "grad_norm": 11008.0,
449
+ "learning_rate": 9.689615801441776e-06,
450
+ "loss": 2822.6701,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.5688888888888889,
455
+ "grad_norm": 14080.0,
456
+ "learning_rate": 9.37953069395353e-06,
457
+ "loss": 2809.1648,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.5777777777777777,
462
+ "grad_norm": 15104.0,
463
+ "learning_rate": 9.070043480396404e-06,
464
+ "loss": 2821.9559,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.5866666666666667,
469
+ "grad_norm": 19200.0,
470
+ "learning_rate": 8.761452387491601e-06,
471
+ "loss": 2818.5842,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.5955555555555555,
476
+ "grad_norm": 15808.0,
477
+ "learning_rate": 8.45405477844443e-06,
478
+ "loss": 2798.8832,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.6044444444444445,
483
+ "grad_norm": 11392.0,
484
+ "learning_rate": 8.148146866400498e-06,
485
+ "loss": 2779.3459,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.6133333333333333,
490
+ "grad_norm": 17792.0,
491
+ "learning_rate": 7.844023429010051e-06,
492
+ "loss": 2801.9057,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.6222222222222222,
497
+ "grad_norm": 19200.0,
498
+ "learning_rate": 7.541977524375661e-06,
499
+ "loss": 2828.118,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.6311111111111111,
504
+ "grad_norm": 14016.0,
505
+ "learning_rate": 7.242300208656814e-06,
506
+ "loss": 2800.459,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.64,
511
+ "grad_norm": 15872.0,
512
+ "learning_rate": 6.9452802556036705e-06,
513
+ "loss": 2761.099,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.6488888888888888,
518
+ "grad_norm": 16512.0,
519
+ "learning_rate": 6.651203878290139e-06,
520
+ "loss": 2769.051,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.6577777777777778,
525
+ "grad_norm": 11264.0,
526
+ "learning_rate": 6.360354453314502e-06,
527
+ "loss": 2813.532,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 0.6666666666666666,
532
+ "grad_norm": 12160.0,
533
+ "learning_rate": 6.073012247733267e-06,
534
+ "loss": 2761.1883,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 0.6755555555555556,
539
+ "grad_norm": 11840.0,
540
+ "learning_rate": 5.789454148991477e-06,
541
+ "loss": 2775.6535,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 0.6844444444444444,
546
+ "grad_norm": 14656.0,
547
+ "learning_rate": 5.5099533981095945e-06,
548
+ "loss": 2782.9367,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 0.6933333333333334,
553
+ "grad_norm": 14144.0,
554
+ "learning_rate": 5.234779326384227e-06,
555
+ "loss": 2781.4521,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 0.7022222222222222,
560
+ "grad_norm": 13760.0,
561
+ "learning_rate": 4.964197095856237e-06,
562
+ "loss": 2839.6955,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 0.7111111111111111,
567
+ "grad_norm": 17024.0,
568
+ "learning_rate": 4.698467443796479e-06,
569
+ "loss": 2808.0578,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 0.72,
574
+ "grad_norm": 15488.0,
575
+ "learning_rate": 4.437846431455249e-06,
576
+ "loss": 2783.9123,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 0.7288888888888889,
581
+ "grad_norm": 10560.0,
582
+ "learning_rate": 4.182585197317709e-06,
583
+ "loss": 2823.9398,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 0.7377777777777778,
588
+ "grad_norm": 11840.0,
589
+ "learning_rate": 3.932929715102863e-06,
590
+ "loss": 2776.0838,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 0.7466666666666667,
595
+ "grad_norm": 15424.0,
596
+ "learning_rate": 3.689120556739475e-06,
597
+ "loss": 2765.9592,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 0.7555555555555555,
602
+ "grad_norm": 17536.0,
603
+ "learning_rate": 3.4513926605471504e-06,
604
+ "loss": 2757.6773,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 0.7644444444444445,
609
+ "grad_norm": 10560.0,
610
+ "learning_rate": 3.2199751048461414e-06,
611
+ "loss": 2745.9385,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 0.7733333333333333,
616
+ "grad_norm": 7392.0,
617
+ "learning_rate": 2.9950908872138585e-06,
618
+ "loss": 2768.2516,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 0.7822222222222223,
623
+ "grad_norm": 10880.0,
624
+ "learning_rate": 2.776956709600941e-06,
625
+ "loss": 2775.5641,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 0.7911111111111111,
630
+ "grad_norm": 9792.0,
631
+ "learning_rate": 2.565782769513837e-06,
632
+ "loss": 2746.8189,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 0.8,
637
+ "grad_norm": 13120.0,
638
+ "learning_rate": 2.3617725574652352e-06,
639
+ "loss": 2855.6207,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 0.8088888888888889,
644
+ "grad_norm": 8640.0,
645
+ "learning_rate": 2.1651226608873876e-06,
646
+ "loss": 2792.5463,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 0.8177777777777778,
651
+ "grad_norm": 15808.0,
652
+ "learning_rate": 1.9760225746974136e-06,
653
+ "loss": 2792.7549,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 0.8266666666666667,
658
+ "grad_norm": 12800.0,
659
+ "learning_rate": 1.7946545186970022e-06,
660
+ "loss": 2801.327,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 0.8355555555555556,
665
+ "grad_norm": 11520.0,
666
+ "learning_rate": 1.6211932619825932e-06,
667
+ "loss": 2731.5381,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 0.8444444444444444,
672
+ "grad_norm": 14144.0,
673
+ "learning_rate": 1.4558059545351144e-06,
674
+ "loss": 2826.2076,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 0.8533333333333334,
679
+ "grad_norm": 13184.0,
680
+ "learning_rate": 1.298651966151665e-06,
681
+ "loss": 2842.1312,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 0.8622222222222222,
686
+ "grad_norm": 8256.0,
687
+ "learning_rate": 1.1498827328742623e-06,
688
+ "loss": 2740.9932,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 0.8711111111111111,
693
+ "grad_norm": 8640.0,
694
+ "learning_rate": 1.009641611063732e-06,
695
+ "loss": 2773.5756,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 0.88,
700
+ "grad_norm": 10688.0,
701
+ "learning_rate": 8.780637392592494e-07,
702
+ "loss": 2739.3299,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 0.8888888888888888,
707
+ "grad_norm": 9408.0,
708
+ "learning_rate": 7.552759079567418e-07,
709
+ "loss": 2791.6674,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 0.8977777777777778,
714
+ "grad_norm": 13376.0,
715
+ "learning_rate": 6.413964374315851e-07,
716
+ "loss": 2784.6709,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 0.9066666666666666,
721
+ "grad_norm": 11904.0,
722
+ "learning_rate": 5.365350637233236e-07,
723
+ "loss": 2741.3131,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 0.9155555555555556,
728
+ "grad_norm": 14784.0,
729
+ "learning_rate": 4.407928328923194e-07,
730
+ "loss": 2791.416,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 0.9244444444444444,
735
+ "grad_norm": 11072.0,
736
+ "learning_rate": 3.5426200365017207e-07,
737
+ "loss": 2769.5869,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 0.9333333333333333,
742
+ "grad_norm": 13120.0,
743
+ "learning_rate": 2.770259584577972e-07,
744
+ "loss": 2769.8787,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 0.9422222222222222,
749
+ "grad_norm": 7808.0,
750
+ "learning_rate": 2.091591231767709e-07,
751
+ "loss": 2795.082,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 0.9511111111111111,
756
+ "grad_norm": 8640.0,
757
+ "learning_rate": 1.5072689535141072e-07,
758
+ "loss": 2778.0055,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 0.96,
763
+ "grad_norm": 12096.0,
764
+ "learning_rate": 1.0178558119067316e-07,
765
+ "loss": 2744.5266,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 0.9688888888888889,
770
+ "grad_norm": 11584.0,
771
+ "learning_rate": 6.238234131061616e-08,
772
+ "loss": 2765.4174,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 0.9777777777777777,
777
+ "grad_norm": 15488.0,
778
+ "learning_rate": 3.255514528968884e-08,
779
+ "loss": 2780.9643,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 0.9866666666666667,
784
+ "grad_norm": 9728.0,
785
+ "learning_rate": 1.2332735080651248e-08,
786
+ "loss": 2821.4281,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 0.9955555555555555,
791
+ "grad_norm": 25216.0,
792
+ "learning_rate": 1.7345973143789717e-09,
793
+ "loss": 2802.424,
794
+ "step": 1120
795
+ }
796
+ ],
797
+ "logging_steps": 10,
798
+ "max_steps": 1125,
799
+ "num_input_tokens_seen": 0,
800
+ "num_train_epochs": 1,
801
+ "save_steps": 500,
802
+ "stateful_callbacks": {
803
+ "TrainerControl": {
804
+ "args": {
805
+ "should_epoch_stop": false,
806
+ "should_evaluate": false,
807
+ "should_log": false,
808
+ "should_save": true,
809
+ "should_training_stop": true
810
+ },
811
+ "attributes": {}
812
+ }
813
+ },
814
+ "total_flos": 5.7204510031872e+16,
815
+ "train_batch_size": 1,
816
+ "trial_name": null,
817
+ "trial_params": null
818
+ }
checkpoint-1125/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 64,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 5632,
14
+ "max_position_embeddings": 2048,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 32,
18
+ "num_hidden_layers": 22,
19
+ "num_key_value_heads": 4,
20
+ "pretraining_tp": 1,
21
+ "rms_norm_eps": 1e-05,
22
+ "rope_scaling": null,
23
+ "rope_theta": 10000.0,
24
+ "tie_word_embeddings": false,
25
+ "torch_dtype": "bfloat16",
26
+ "transformers_version": "4.52.4",
27
+ "use_cache": true,
28
+ "vocab_size": 32001
29
+ }
checkpoint-1125/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "max_length": 2048,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.52.4"
7
+ }
checkpoint-1125/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bab3f0cfb1cd7054a3230d497dbce246493f1b365a5e2c74a69eff3d3280204a
3
+ size 2200128056
checkpoint-1125/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dff2f6400faed82835d899976dae38dd7650073fe3babf7a93bb08349279f764
3
+ size 2626422906
checkpoint-1125/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
3
+ size 14244
checkpoint-1125/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adf13864cc86b23419d70837ee0cffc319c3afdca5e0912fb61af37d2b8f3989
3
+ size 1064
checkpoint-1125/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
checkpoint-1125/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1125/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "bos_token": "<s>",
40
+ "clean_up_tokenization_spaces": false,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "legacy": false,
44
+ "model_max_length": 1000000000000000019884624838656,
45
+ "pad_token": "[PAD]",
46
+ "padding_side": "right",
47
+ "sp_model_kwargs": {},
48
+ "tokenizer_class": "LlamaTokenizer",
49
+ "unk_token": "<unk>",
50
+ "use_default_system_prompt": false
51
+ }
checkpoint-1125/trainer_state.json ADDED
@@ -0,0 +1,818 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1125,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.008888888888888889,
14
+ "grad_norm": 329728.0,
15
+ "learning_rate": 1.592920353982301e-06,
16
+ "loss": 6840.2352,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.017777777777777778,
21
+ "grad_norm": 110080.0,
22
+ "learning_rate": 3.36283185840708e-06,
23
+ "loss": 6186.3184,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.02666666666666667,
28
+ "grad_norm": 67584.0,
29
+ "learning_rate": 5.132743362831859e-06,
30
+ "loss": 5175.825,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.035555555555555556,
35
+ "grad_norm": 28800.0,
36
+ "learning_rate": 6.902654867256637e-06,
37
+ "loss": 4705.709,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.044444444444444446,
42
+ "grad_norm": 31232.0,
43
+ "learning_rate": 8.672566371681418e-06,
44
+ "loss": 4302.4035,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.05333333333333334,
49
+ "grad_norm": 31232.0,
50
+ "learning_rate": 1.0442477876106197e-05,
51
+ "loss": 4149.0633,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.06222222222222222,
56
+ "grad_norm": 32000.0,
57
+ "learning_rate": 1.2212389380530973e-05,
58
+ "loss": 3953.5156,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.07111111111111111,
63
+ "grad_norm": 18432.0,
64
+ "learning_rate": 1.3982300884955752e-05,
65
+ "loss": 3825.391,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.08,
70
+ "grad_norm": 27008.0,
71
+ "learning_rate": 1.5752212389380532e-05,
72
+ "loss": 3717.7289,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.08888888888888889,
77
+ "grad_norm": 33536.0,
78
+ "learning_rate": 1.7522123893805313e-05,
79
+ "loss": 3600.6922,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.09777777777777778,
84
+ "grad_norm": 41984.0,
85
+ "learning_rate": 1.929203539823009e-05,
86
+ "loss": 3577.2379,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.10666666666666667,
91
+ "grad_norm": 22912.0,
92
+ "learning_rate": 1.999826540268562e-05,
93
+ "loss": 3496.2227,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.11555555555555555,
98
+ "grad_norm": 14848.0,
99
+ "learning_rate": 1.998766726491935e-05,
100
+ "loss": 3380.4918,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.12444444444444444,
105
+ "grad_norm": 20352.0,
106
+ "learning_rate": 1.9967444854710313e-05,
107
+ "loss": 3359.9414,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.13333333333333333,
112
+ "grad_norm": 33024.0,
113
+ "learning_rate": 1.9937617658689385e-05,
114
+ "loss": 3332.8012,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.14222222222222222,
119
+ "grad_norm": 16640.0,
120
+ "learning_rate": 1.989821441880933e-05,
121
+ "loss": 3265.1035,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.1511111111111111,
126
+ "grad_norm": 19840.0,
127
+ "learning_rate": 1.9849273104648592e-05,
128
+ "loss": 3195.4633,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.16,
133
+ "grad_norm": 16896.0,
134
+ "learning_rate": 1.979084087682323e-05,
135
+ "loss": 3178.0693,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.1688888888888889,
140
+ "grad_norm": 16064.0,
141
+ "learning_rate": 1.9722974041542205e-05,
142
+ "loss": 3216.1639,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.17777777777777778,
147
+ "grad_norm": 26112.0,
148
+ "learning_rate": 1.9645737996349828e-05,
149
+ "loss": 3163.891,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.18666666666666668,
154
+ "grad_norm": 23040.0,
155
+ "learning_rate": 1.9559207167107684e-05,
156
+ "loss": 3117.5008,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.19555555555555557,
161
+ "grad_norm": 17664.0,
162
+ "learning_rate": 1.9463464936276676e-05,
163
+ "loss": 3096.1451,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.20444444444444446,
168
+ "grad_norm": 16256.0,
169
+ "learning_rate": 1.9358603562568417e-05,
170
+ "loss": 3047.9447,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.21333333333333335,
175
+ "grad_norm": 19584.0,
176
+ "learning_rate": 1.924472409204326e-05,
177
+ "loss": 3054.9166,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.2222222222222222,
182
+ "grad_norm": 18432.0,
183
+ "learning_rate": 1.9121936260740752e-05,
184
+ "loss": 3090.7711,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.2311111111111111,
189
+ "grad_norm": 13056.0,
190
+ "learning_rate": 1.899035838893627e-05,
191
+ "loss": 3075.3973,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.24,
196
+ "grad_norm": 15744.0,
197
+ "learning_rate": 1.885011726712574e-05,
198
+ "loss": 2975.9182,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.24888888888888888,
203
+ "grad_norm": 21632.0,
204
+ "learning_rate": 1.870134803384834e-05,
205
+ "loss": 3013.1484,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.2577777777777778,
210
+ "grad_norm": 13888.0,
211
+ "learning_rate": 1.8544194045464888e-05,
212
+ "loss": 3003.3625,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.26666666666666666,
217
+ "grad_norm": 14592.0,
218
+ "learning_rate": 1.837880673801741e-05,
219
+ "loss": 2980.8666,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.27555555555555555,
224
+ "grad_norm": 19456.0,
225
+ "learning_rate": 1.8205345481303e-05,
226
+ "loss": 2986.1092,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.28444444444444444,
231
+ "grad_norm": 22528.0,
232
+ "learning_rate": 1.802397742530259e-05,
233
+ "loss": 2940.208,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.29333333333333333,
238
+ "grad_norm": 13376.0,
239
+ "learning_rate": 1.7834877339112613e-05,
240
+ "loss": 3002.926,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.3022222222222222,
245
+ "grad_norm": 16192.0,
246
+ "learning_rate": 1.763822744253477e-05,
247
+ "loss": 3008.3018,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.3111111111111111,
252
+ "grad_norm": 19840.0,
253
+ "learning_rate": 1.7434217230486162e-05,
254
+ "loss": 2978.5563,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.32,
259
+ "grad_norm": 19968.0,
260
+ "learning_rate": 1.7223043290399065e-05,
261
+ "loss": 2917.9051,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.3288888888888889,
266
+ "grad_norm": 11968.0,
267
+ "learning_rate": 1.7004909112786142e-05,
268
+ "loss": 2916.0309,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.3377777777777778,
273
+ "grad_norm": 13504.0,
274
+ "learning_rate": 1.6780024895153862e-05,
275
+ "loss": 2964.793,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.3466666666666667,
280
+ "grad_norm": 32256.0,
281
+ "learning_rate": 1.6548607339452853e-05,
282
+ "loss": 2886.1475,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.35555555555555557,
287
+ "grad_norm": 11648.0,
288
+ "learning_rate": 1.631087944326053e-05,
289
+ "loss": 2871.3844,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.36444444444444446,
294
+ "grad_norm": 13504.0,
295
+ "learning_rate": 1.606707028489714e-05,
296
+ "loss": 2951.7568,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.37333333333333335,
301
+ "grad_norm": 9344.0,
302
+ "learning_rate": 1.5817414802682292e-05,
303
+ "loss": 2948.7459,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.38222222222222224,
308
+ "grad_norm": 14336.0,
309
+ "learning_rate": 1.5562153568544753e-05,
310
+ "loss": 2925.3027,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.39111111111111113,
315
+ "grad_norm": 17280.0,
316
+ "learning_rate": 1.5301532556203524e-05,
317
+ "loss": 2861.5111,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.4,
322
+ "grad_norm": 15808.0,
323
+ "learning_rate": 1.503580290414376e-05,
324
+ "loss": 2862.7781,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.4088888888888889,
329
+ "grad_norm": 10880.0,
330
+ "learning_rate": 1.4765220673615774e-05,
331
+ "loss": 2894.2242,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.4177777777777778,
336
+ "grad_norm": 20224.0,
337
+ "learning_rate": 1.4490046601890405e-05,
338
+ "loss": 2876.3199,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.4266666666666667,
343
+ "grad_norm": 12928.0,
344
+ "learning_rate": 1.4210545851008529e-05,
345
+ "loss": 2894.2551,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.43555555555555553,
350
+ "grad_norm": 11776.0,
351
+ "learning_rate": 1.3926987752266733e-05,
352
+ "loss": 2852.1971,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.4444444444444444,
357
+ "grad_norm": 17664.0,
358
+ "learning_rate": 1.36396455466855e-05,
359
+ "loss": 2854.5852,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.4533333333333333,
364
+ "grad_norm": 16768.0,
365
+ "learning_rate": 1.3348796121709862e-05,
366
+ "loss": 2917.9811,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.4622222222222222,
371
+ "grad_norm": 9728.0,
372
+ "learning_rate": 1.3054719744396333e-05,
373
+ "loss": 2848.7223,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.4711111111111111,
378
+ "grad_norm": 14912.0,
379
+ "learning_rate": 1.2757699791343188e-05,
380
+ "loss": 2840.7688,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.48,
385
+ "grad_norm": 21120.0,
386
+ "learning_rate": 1.2458022475624343e-05,
387
+ "loss": 2890.6824,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.4888888888888889,
392
+ "grad_norm": 12544.0,
393
+ "learning_rate": 1.2155976570989949e-05,
394
+ "loss": 2826.4334,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.49777777777777776,
399
+ "grad_norm": 18560.0,
400
+ "learning_rate": 1.1851853133599507e-05,
401
+ "loss": 2879.9736,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.5066666666666667,
406
+ "grad_norm": 12288.0,
407
+ "learning_rate": 1.1545945221555571e-05,
408
+ "loss": 2821.0361,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.5155555555555555,
413
+ "grad_norm": 9536.0,
414
+ "learning_rate": 1.12385476125084e-05,
415
+ "loss": 2807.2881,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.5244444444444445,
420
+ "grad_norm": 10048.0,
421
+ "learning_rate": 1.0929956519603595e-05,
422
+ "loss": 2811.4141,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.5333333333333333,
427
+ "grad_norm": 15296.0,
428
+ "learning_rate": 1.0620469306046473e-05,
429
+ "loss": 2790.8965,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.5422222222222223,
434
+ "grad_norm": 19584.0,
435
+ "learning_rate": 1.0310384198558226e-05,
436
+ "loss": 2852.466,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.5511111111111111,
441
+ "grad_norm": 8160.0,
442
+ "learning_rate": 1e-05,
443
+ "loss": 2758.1283,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.56,
448
+ "grad_norm": 11008.0,
449
+ "learning_rate": 9.689615801441776e-06,
450
+ "loss": 2822.6701,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.5688888888888889,
455
+ "grad_norm": 14080.0,
456
+ "learning_rate": 9.37953069395353e-06,
457
+ "loss": 2809.1648,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.5777777777777777,
462
+ "grad_norm": 15104.0,
463
+ "learning_rate": 9.070043480396404e-06,
464
+ "loss": 2821.9559,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.5866666666666667,
469
+ "grad_norm": 19200.0,
470
+ "learning_rate": 8.761452387491601e-06,
471
+ "loss": 2818.5842,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.5955555555555555,
476
+ "grad_norm": 15808.0,
477
+ "learning_rate": 8.45405477844443e-06,
478
+ "loss": 2798.8832,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.6044444444444445,
483
+ "grad_norm": 11392.0,
484
+ "learning_rate": 8.148146866400498e-06,
485
+ "loss": 2779.3459,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.6133333333333333,
490
+ "grad_norm": 17792.0,
491
+ "learning_rate": 7.844023429010051e-06,
492
+ "loss": 2801.9057,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.6222222222222222,
497
+ "grad_norm": 19200.0,
498
+ "learning_rate": 7.541977524375661e-06,
499
+ "loss": 2828.118,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.6311111111111111,
504
+ "grad_norm": 14016.0,
505
+ "learning_rate": 7.242300208656814e-06,
506
+ "loss": 2800.459,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.64,
511
+ "grad_norm": 15872.0,
512
+ "learning_rate": 6.9452802556036705e-06,
513
+ "loss": 2761.099,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.6488888888888888,
518
+ "grad_norm": 16512.0,
519
+ "learning_rate": 6.651203878290139e-06,
520
+ "loss": 2769.051,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.6577777777777778,
525
+ "grad_norm": 11264.0,
526
+ "learning_rate": 6.360354453314502e-06,
527
+ "loss": 2813.532,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 0.6666666666666666,
532
+ "grad_norm": 12160.0,
533
+ "learning_rate": 6.073012247733267e-06,
534
+ "loss": 2761.1883,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 0.6755555555555556,
539
+ "grad_norm": 11840.0,
540
+ "learning_rate": 5.789454148991477e-06,
541
+ "loss": 2775.6535,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 0.6844444444444444,
546
+ "grad_norm": 14656.0,
547
+ "learning_rate": 5.5099533981095945e-06,
548
+ "loss": 2782.9367,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 0.6933333333333334,
553
+ "grad_norm": 14144.0,
554
+ "learning_rate": 5.234779326384227e-06,
555
+ "loss": 2781.4521,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 0.7022222222222222,
560
+ "grad_norm": 13760.0,
561
+ "learning_rate": 4.964197095856237e-06,
562
+ "loss": 2839.6955,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 0.7111111111111111,
567
+ "grad_norm": 17024.0,
568
+ "learning_rate": 4.698467443796479e-06,
569
+ "loss": 2808.0578,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 0.72,
574
+ "grad_norm": 15488.0,
575
+ "learning_rate": 4.437846431455249e-06,
576
+ "loss": 2783.9123,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 0.7288888888888889,
581
+ "grad_norm": 10560.0,
582
+ "learning_rate": 4.182585197317709e-06,
583
+ "loss": 2823.9398,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 0.7377777777777778,
588
+ "grad_norm": 11840.0,
589
+ "learning_rate": 3.932929715102863e-06,
590
+ "loss": 2776.0838,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 0.7466666666666667,
595
+ "grad_norm": 15424.0,
596
+ "learning_rate": 3.689120556739475e-06,
597
+ "loss": 2765.9592,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 0.7555555555555555,
602
+ "grad_norm": 17536.0,
603
+ "learning_rate": 3.4513926605471504e-06,
604
+ "loss": 2757.6773,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 0.7644444444444445,
609
+ "grad_norm": 10560.0,
610
+ "learning_rate": 3.2199751048461414e-06,
611
+ "loss": 2745.9385,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 0.7733333333333333,
616
+ "grad_norm": 7392.0,
617
+ "learning_rate": 2.9950908872138585e-06,
618
+ "loss": 2768.2516,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 0.7822222222222223,
623
+ "grad_norm": 10880.0,
624
+ "learning_rate": 2.776956709600941e-06,
625
+ "loss": 2775.5641,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 0.7911111111111111,
630
+ "grad_norm": 9792.0,
631
+ "learning_rate": 2.565782769513837e-06,
632
+ "loss": 2746.8189,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 0.8,
637
+ "grad_norm": 13120.0,
638
+ "learning_rate": 2.3617725574652352e-06,
639
+ "loss": 2855.6207,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 0.8088888888888889,
644
+ "grad_norm": 8640.0,
645
+ "learning_rate": 2.1651226608873876e-06,
646
+ "loss": 2792.5463,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 0.8177777777777778,
651
+ "grad_norm": 15808.0,
652
+ "learning_rate": 1.9760225746974136e-06,
653
+ "loss": 2792.7549,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 0.8266666666666667,
658
+ "grad_norm": 12800.0,
659
+ "learning_rate": 1.7946545186970022e-06,
660
+ "loss": 2801.327,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 0.8355555555555556,
665
+ "grad_norm": 11520.0,
666
+ "learning_rate": 1.6211932619825932e-06,
667
+ "loss": 2731.5381,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 0.8444444444444444,
672
+ "grad_norm": 14144.0,
673
+ "learning_rate": 1.4558059545351144e-06,
674
+ "loss": 2826.2076,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 0.8533333333333334,
679
+ "grad_norm": 13184.0,
680
+ "learning_rate": 1.298651966151665e-06,
681
+ "loss": 2842.1312,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 0.8622222222222222,
686
+ "grad_norm": 8256.0,
687
+ "learning_rate": 1.1498827328742623e-06,
688
+ "loss": 2740.9932,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 0.8711111111111111,
693
+ "grad_norm": 8640.0,
694
+ "learning_rate": 1.009641611063732e-06,
695
+ "loss": 2773.5756,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 0.88,
700
+ "grad_norm": 10688.0,
701
+ "learning_rate": 8.780637392592494e-07,
702
+ "loss": 2739.3299,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 0.8888888888888888,
707
+ "grad_norm": 9408.0,
708
+ "learning_rate": 7.552759079567418e-07,
709
+ "loss": 2791.6674,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 0.8977777777777778,
714
+ "grad_norm": 13376.0,
715
+ "learning_rate": 6.413964374315851e-07,
716
+ "loss": 2784.6709,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 0.9066666666666666,
721
+ "grad_norm": 11904.0,
722
+ "learning_rate": 5.365350637233236e-07,
723
+ "loss": 2741.3131,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 0.9155555555555556,
728
+ "grad_norm": 14784.0,
729
+ "learning_rate": 4.407928328923194e-07,
730
+ "loss": 2791.416,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 0.9244444444444444,
735
+ "grad_norm": 11072.0,
736
+ "learning_rate": 3.5426200365017207e-07,
737
+ "loss": 2769.5869,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 0.9333333333333333,
742
+ "grad_norm": 13120.0,
743
+ "learning_rate": 2.770259584577972e-07,
744
+ "loss": 2769.8787,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 0.9422222222222222,
749
+ "grad_norm": 7808.0,
750
+ "learning_rate": 2.091591231767709e-07,
751
+ "loss": 2795.082,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 0.9511111111111111,
756
+ "grad_norm": 8640.0,
757
+ "learning_rate": 1.5072689535141072e-07,
758
+ "loss": 2778.0055,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 0.96,
763
+ "grad_norm": 12096.0,
764
+ "learning_rate": 1.0178558119067316e-07,
765
+ "loss": 2744.5266,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 0.9688888888888889,
770
+ "grad_norm": 11584.0,
771
+ "learning_rate": 6.238234131061616e-08,
772
+ "loss": 2765.4174,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 0.9777777777777777,
777
+ "grad_norm": 15488.0,
778
+ "learning_rate": 3.255514528968884e-08,
779
+ "loss": 2780.9643,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 0.9866666666666667,
784
+ "grad_norm": 9728.0,
785
+ "learning_rate": 1.2332735080651248e-08,
786
+ "loss": 2821.4281,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 0.9955555555555555,
791
+ "grad_norm": 25216.0,
792
+ "learning_rate": 1.7345973143789717e-09,
793
+ "loss": 2802.424,
794
+ "step": 1120
795
+ }
796
+ ],
797
+ "logging_steps": 10,
798
+ "max_steps": 1125,
799
+ "num_input_tokens_seen": 0,
800
+ "num_train_epochs": 1,
801
+ "save_steps": 500,
802
+ "stateful_callbacks": {
803
+ "TrainerControl": {
804
+ "args": {
805
+ "should_epoch_stop": false,
806
+ "should_evaluate": false,
807
+ "should_log": false,
808
+ "should_save": true,
809
+ "should_training_stop": true
810
+ },
811
+ "attributes": {}
812
+ }
813
+ },
814
+ "total_flos": 5.7204510031872e+16,
815
+ "train_batch_size": 1,
816
+ "trial_name": null,
817
+ "trial_params": null
818
+ }
checkpoint-1125/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8e1bafbdfd4c195674c42da05ea52538467616210e2cea66a39948e8046595b
3
+ size 5304
checkpoint-500/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 64,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 5632,
14
+ "max_position_embeddings": 2048,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 32,
18
+ "num_hidden_layers": 22,
19
+ "num_key_value_heads": 4,
20
+ "pretraining_tp": 1,
21
+ "rms_norm_eps": 1e-05,
22
+ "rope_scaling": null,
23
+ "rope_theta": 10000.0,
24
+ "tie_word_embeddings": false,
25
+ "torch_dtype": "bfloat16",
26
+ "transformers_version": "4.52.4",
27
+ "use_cache": true,
28
+ "vocab_size": 32001
29
+ }
checkpoint-500/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "max_length": 2048,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.52.4"
7
+ }
checkpoint-500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a57285736d21c09bcf334478a4f097ba5e589fbb1db61c12ff966c86538f9f54
3
+ size 2200128056
checkpoint-500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1a01f4c5ee80a41e6b111eba81859b076fa46b40236cad27c08f6124f6cc212
3
+ size 2626422906
checkpoint-500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
3
+ size 14244
checkpoint-500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:043c2dc0da93e29e74dac72d09bd2d2f06d105e3faf0b59aae639601ab53e400
3
+ size 1064
checkpoint-500/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
checkpoint-500/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-500/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "bos_token": "<s>",
40
+ "clean_up_tokenization_spaces": false,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "legacy": false,
44
+ "model_max_length": 1000000000000000019884624838656,
45
+ "pad_token": "[PAD]",
46
+ "padding_side": "right",
47
+ "sp_model_kwargs": {},
48
+ "tokenizer_class": "LlamaTokenizer",
49
+ "unk_token": "<unk>",
50
+ "use_default_system_prompt": false
51
+ }
checkpoint-500/trainer_state.json ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.4444444444444444,
6
+ "eval_steps": 500,
7
+ "global_step": 500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.008888888888888889,
14
+ "grad_norm": 329728.0,
15
+ "learning_rate": 1.592920353982301e-06,
16
+ "loss": 6840.2352,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.017777777777777778,
21
+ "grad_norm": 110080.0,
22
+ "learning_rate": 3.36283185840708e-06,
23
+ "loss": 6186.3184,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.02666666666666667,
28
+ "grad_norm": 67584.0,
29
+ "learning_rate": 5.132743362831859e-06,
30
+ "loss": 5175.825,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.035555555555555556,
35
+ "grad_norm": 28800.0,
36
+ "learning_rate": 6.902654867256637e-06,
37
+ "loss": 4705.709,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.044444444444444446,
42
+ "grad_norm": 31232.0,
43
+ "learning_rate": 8.672566371681418e-06,
44
+ "loss": 4302.4035,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.05333333333333334,
49
+ "grad_norm": 31232.0,
50
+ "learning_rate": 1.0442477876106197e-05,
51
+ "loss": 4149.0633,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.06222222222222222,
56
+ "grad_norm": 32000.0,
57
+ "learning_rate": 1.2212389380530973e-05,
58
+ "loss": 3953.5156,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.07111111111111111,
63
+ "grad_norm": 18432.0,
64
+ "learning_rate": 1.3982300884955752e-05,
65
+ "loss": 3825.391,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.08,
70
+ "grad_norm": 27008.0,
71
+ "learning_rate": 1.5752212389380532e-05,
72
+ "loss": 3717.7289,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.08888888888888889,
77
+ "grad_norm": 33536.0,
78
+ "learning_rate": 1.7522123893805313e-05,
79
+ "loss": 3600.6922,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.09777777777777778,
84
+ "grad_norm": 41984.0,
85
+ "learning_rate": 1.929203539823009e-05,
86
+ "loss": 3577.2379,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.10666666666666667,
91
+ "grad_norm": 22912.0,
92
+ "learning_rate": 1.999826540268562e-05,
93
+ "loss": 3496.2227,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.11555555555555555,
98
+ "grad_norm": 14848.0,
99
+ "learning_rate": 1.998766726491935e-05,
100
+ "loss": 3380.4918,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.12444444444444444,
105
+ "grad_norm": 20352.0,
106
+ "learning_rate": 1.9967444854710313e-05,
107
+ "loss": 3359.9414,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.13333333333333333,
112
+ "grad_norm": 33024.0,
113
+ "learning_rate": 1.9937617658689385e-05,
114
+ "loss": 3332.8012,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.14222222222222222,
119
+ "grad_norm": 16640.0,
120
+ "learning_rate": 1.989821441880933e-05,
121
+ "loss": 3265.1035,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.1511111111111111,
126
+ "grad_norm": 19840.0,
127
+ "learning_rate": 1.9849273104648592e-05,
128
+ "loss": 3195.4633,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.16,
133
+ "grad_norm": 16896.0,
134
+ "learning_rate": 1.979084087682323e-05,
135
+ "loss": 3178.0693,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.1688888888888889,
140
+ "grad_norm": 16064.0,
141
+ "learning_rate": 1.9722974041542205e-05,
142
+ "loss": 3216.1639,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.17777777777777778,
147
+ "grad_norm": 26112.0,
148
+ "learning_rate": 1.9645737996349828e-05,
149
+ "loss": 3163.891,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.18666666666666668,
154
+ "grad_norm": 23040.0,
155
+ "learning_rate": 1.9559207167107684e-05,
156
+ "loss": 3117.5008,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.19555555555555557,
161
+ "grad_norm": 17664.0,
162
+ "learning_rate": 1.9463464936276676e-05,
163
+ "loss": 3096.1451,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.20444444444444446,
168
+ "grad_norm": 16256.0,
169
+ "learning_rate": 1.9358603562568417e-05,
170
+ "loss": 3047.9447,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.21333333333333335,
175
+ "grad_norm": 19584.0,
176
+ "learning_rate": 1.924472409204326e-05,
177
+ "loss": 3054.9166,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.2222222222222222,
182
+ "grad_norm": 18432.0,
183
+ "learning_rate": 1.9121936260740752e-05,
184
+ "loss": 3090.7711,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.2311111111111111,
189
+ "grad_norm": 13056.0,
190
+ "learning_rate": 1.899035838893627e-05,
191
+ "loss": 3075.3973,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.24,
196
+ "grad_norm": 15744.0,
197
+ "learning_rate": 1.885011726712574e-05,
198
+ "loss": 2975.9182,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.24888888888888888,
203
+ "grad_norm": 21632.0,
204
+ "learning_rate": 1.870134803384834e-05,
205
+ "loss": 3013.1484,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.2577777777777778,
210
+ "grad_norm": 13888.0,
211
+ "learning_rate": 1.8544194045464888e-05,
212
+ "loss": 3003.3625,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.26666666666666666,
217
+ "grad_norm": 14592.0,
218
+ "learning_rate": 1.837880673801741e-05,
219
+ "loss": 2980.8666,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.27555555555555555,
224
+ "grad_norm": 19456.0,
225
+ "learning_rate": 1.8205345481303e-05,
226
+ "loss": 2986.1092,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.28444444444444444,
231
+ "grad_norm": 22528.0,
232
+ "learning_rate": 1.802397742530259e-05,
233
+ "loss": 2940.208,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.29333333333333333,
238
+ "grad_norm": 13376.0,
239
+ "learning_rate": 1.7834877339112613e-05,
240
+ "loss": 3002.926,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.3022222222222222,
245
+ "grad_norm": 16192.0,
246
+ "learning_rate": 1.763822744253477e-05,
247
+ "loss": 3008.3018,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.3111111111111111,
252
+ "grad_norm": 19840.0,
253
+ "learning_rate": 1.7434217230486162e-05,
254
+ "loss": 2978.5563,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.32,
259
+ "grad_norm": 19968.0,
260
+ "learning_rate": 1.7223043290399065e-05,
261
+ "loss": 2917.9051,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.3288888888888889,
266
+ "grad_norm": 11968.0,
267
+ "learning_rate": 1.7004909112786142e-05,
268
+ "loss": 2916.0309,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.3377777777777778,
273
+ "grad_norm": 13504.0,
274
+ "learning_rate": 1.6780024895153862e-05,
275
+ "loss": 2964.793,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.3466666666666667,
280
+ "grad_norm": 32256.0,
281
+ "learning_rate": 1.6548607339452853e-05,
282
+ "loss": 2886.1475,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.35555555555555557,
287
+ "grad_norm": 11648.0,
288
+ "learning_rate": 1.631087944326053e-05,
289
+ "loss": 2871.3844,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.36444444444444446,
294
+ "grad_norm": 13504.0,
295
+ "learning_rate": 1.606707028489714e-05,
296
+ "loss": 2951.7568,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.37333333333333335,
301
+ "grad_norm": 9344.0,
302
+ "learning_rate": 1.5817414802682292e-05,
303
+ "loss": 2948.7459,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.38222222222222224,
308
+ "grad_norm": 14336.0,
309
+ "learning_rate": 1.5562153568544753e-05,
310
+ "loss": 2925.3027,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.39111111111111113,
315
+ "grad_norm": 17280.0,
316
+ "learning_rate": 1.5301532556203524e-05,
317
+ "loss": 2861.5111,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.4,
322
+ "grad_norm": 15808.0,
323
+ "learning_rate": 1.503580290414376e-05,
324
+ "loss": 2862.7781,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.4088888888888889,
329
+ "grad_norm": 10880.0,
330
+ "learning_rate": 1.4765220673615774e-05,
331
+ "loss": 2894.2242,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.4177777777777778,
336
+ "grad_norm": 20224.0,
337
+ "learning_rate": 1.4490046601890405e-05,
338
+ "loss": 2876.3199,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.4266666666666667,
343
+ "grad_norm": 12928.0,
344
+ "learning_rate": 1.4210545851008529e-05,
345
+ "loss": 2894.2551,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.43555555555555553,
350
+ "grad_norm": 11776.0,
351
+ "learning_rate": 1.3926987752266733e-05,
352
+ "loss": 2852.1971,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.4444444444444444,
357
+ "grad_norm": 17664.0,
358
+ "learning_rate": 1.36396455466855e-05,
359
+ "loss": 2854.5852,
360
+ "step": 500
361
+ }
362
+ ],
363
+ "logging_steps": 10,
364
+ "max_steps": 1125,
365
+ "num_input_tokens_seen": 0,
366
+ "num_train_epochs": 1,
367
+ "save_steps": 500,
368
+ "stateful_callbacks": {
369
+ "TrainerControl": {
370
+ "args": {
371
+ "should_epoch_stop": false,
372
+ "should_evaluate": false,
373
+ "should_log": false,
374
+ "should_save": true,
375
+ "should_training_stop": false
376
+ },
377
+ "attributes": {}
378
+ }
379
+ },
380
+ "total_flos": 2.5424226680832e+16,
381
+ "train_batch_size": 1,
382
+ "trial_name": null,
383
+ "trial_params": null
384
+ }
checkpoint-500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8e1bafbdfd4c195674c42da05ea52538467616210e2cea66a39948e8046595b
3
+ size 5304
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 64,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 5632,
14
+ "max_position_embeddings": 2048,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 32,
18
+ "num_hidden_layers": 22,
19
+ "num_key_value_heads": 4,
20
+ "pretraining_tp": 1,
21
+ "rms_norm_eps": 1e-05,
22
+ "rope_scaling": null,
23
+ "rope_theta": 10000.0,
24
+ "tie_word_embeddings": false,
25
+ "torch_dtype": "bfloat16",
26
+ "transformers_version": "4.52.4",
27
+ "use_cache": true,
28
+ "vocab_size": 32001
29
+ }
eval_metrics.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "eval_runtime": 54.5547,
3
+ "eval_samples_per_second": 18.33,
4
+ "eval_steps_per_second": 2.291,
5
+ "epoch": 1.0
6
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "max_length": 2048,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.52.4"
7
+ }
inference_results.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "accuracy": 0.20670391061452514,
3
+ "f1_macro": 0.06851851851851852,
4
+ "confusion_matrix": [
5
+ [
6
+ 74,
7
+ 0,
8
+ 0,
9
+ 0,
10
+ 0
11
+ ],
12
+ [
13
+ 64,
14
+ 0,
15
+ 0,
16
+ 0,
17
+ 0
18
+ ],
19
+ [
20
+ 66,
21
+ 0,
22
+ 0,
23
+ 0,
24
+ 0
25
+ ],
26
+ [
27
+ 75,
28
+ 0,
29
+ 0,
30
+ 0,
31
+ 0
32
+ ],
33
+ [
34
+ 79,
35
+ 0,
36
+ 0,
37
+ 0,
38
+ 0
39
+ ]
40
+ ],
41
+ "total_samples": 1000,
42
+ "invalid_predictions": 642
43
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bab3f0cfb1cd7054a3230d497dbce246493f1b365a5e2c74a69eff3d3280204a
3
+ size 2200128056
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "bos_token": "<s>",
40
+ "clean_up_tokenization_spaces": false,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "legacy": false,
44
+ "model_max_length": 1000000000000000019884624838656,
45
+ "pad_token": "[PAD]",
46
+ "padding_side": "right",
47
+ "sp_model_kwargs": {},
48
+ "tokenizer_class": "LlamaTokenizer",
49
+ "unk_token": "<unk>",
50
+ "use_default_system_prompt": false
51
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8e1bafbdfd4c195674c42da05ea52538467616210e2cea66a39948e8046595b
3
+ size 5304