IParraMartin commited on
Commit
02a32a5
·
verified ·
1 Parent(s): 18f5986

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. checkpoint-20000/config.json +32 -0
  2. checkpoint-20000/generation_config.json +9 -0
  3. checkpoint-20000/merges.txt +0 -0
  4. checkpoint-20000/model.safetensors +3 -0
  5. checkpoint-20000/optimizer.pt +3 -0
  6. checkpoint-20000/rng_state.pth +3 -0
  7. checkpoint-20000/scheduler.pt +3 -0
  8. checkpoint-20000/special_tokens_map.json +6 -0
  9. checkpoint-20000/tokenizer.json +0 -0
  10. checkpoint-20000/tokenizer_config.json +21 -0
  11. checkpoint-20000/trainer_state.json +1594 -0
  12. checkpoint-20000/training_args.bin +3 -0
  13. checkpoint-20000/vocab.json +0 -0
  14. checkpoint-80000/config.json +32 -0
  15. checkpoint-80000/generation_config.json +9 -0
  16. checkpoint-80000/merges.txt +0 -0
  17. checkpoint-80000/model.safetensors +3 -0
  18. checkpoint-80000/optimizer.pt +3 -0
  19. checkpoint-80000/rng_state.pth +3 -0
  20. checkpoint-80000/scheduler.pt +3 -0
  21. checkpoint-80000/special_tokens_map.json +6 -0
  22. checkpoint-80000/tokenizer.json +0 -0
  23. checkpoint-80000/tokenizer_config.json +21 -0
  24. checkpoint-80000/trainer_state.json +0 -0
  25. checkpoint-80000/training_args.bin +3 -0
  26. checkpoint-80000/vocab.json +0 -0
  27. checkpoint-85000/config.json +32 -0
  28. checkpoint-85000/generation_config.json +9 -0
  29. checkpoint-85000/merges.txt +0 -0
  30. checkpoint-85000/model.safetensors +3 -0
  31. checkpoint-85000/optimizer.pt +3 -0
  32. checkpoint-85000/rng_state.pth +3 -0
  33. checkpoint-85000/scheduler.pt +3 -0
  34. checkpoint-85000/special_tokens_map.json +6 -0
  35. checkpoint-85000/tokenizer.json +0 -0
  36. checkpoint-85000/tokenizer_config.json +21 -0
  37. checkpoint-85000/trainer_state.json +0 -0
  38. checkpoint-85000/training_args.bin +3 -0
  39. checkpoint-85000/vocab.json +0 -0
  40. checkpoint-90000/config.json +32 -0
  41. checkpoint-90000/generation_config.json +9 -0
  42. checkpoint-90000/merges.txt +0 -0
  43. checkpoint-90000/model.safetensors +3 -0
  44. checkpoint-90000/optimizer.pt +3 -0
  45. checkpoint-90000/rng_state.pth +3 -0
  46. checkpoint-90000/scheduler.pt +3 -0
  47. checkpoint-90000/special_tokens_map.json +6 -0
  48. checkpoint-90000/tokenizer.json +0 -0
  49. checkpoint-90000/tokenizer_config.json +21 -0
  50. checkpoint-90000/trainer_state.json +0 -0
checkpoint-20000/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-12,
13
+ "model_type": "gpt2",
14
+ "n_embd": 768,
15
+ "n_head": 12,
16
+ "n_inner": 3072,
17
+ "n_layer": 12,
18
+ "n_positions": 1024,
19
+ "pad_token_id": 50256,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "transformers_version": "4.57.1",
30
+ "use_cache": false,
31
+ "vocab_size": 50257
32
+ }
checkpoint-20000/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": [
5
+ 50256
6
+ ],
7
+ "pad_token_id": 50256,
8
+ "transformers_version": "4.57.1"
9
+ }
checkpoint-20000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-20000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5d895cdd801d17ee1f853033477f709c328a6f43cd0c184d690add28259e2eb
3
+ size 497774208
checkpoint-20000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82b212f70d9b7820eba6767f4e012abdc58e0c6a143f8eb89d541ff7eb29b54e
3
+ size 995642298
checkpoint-20000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:503f445c78f40dc4ad61e238e038b404e464f79991963ff3bd4b145bca7ec5a2
3
+ size 14244
checkpoint-20000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bba2cd51d1e70cfadf2ee79c9673c9efb2d37ff854d2c5b9dd041586cfdb6711
3
+ size 1064
checkpoint-20000/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
checkpoint-20000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-20000/tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "bos_token": "<|endoftext|>",
14
+ "clean_up_tokenization_spaces": false,
15
+ "eos_token": "<|endoftext|>",
16
+ "extra_special_tokens": {},
17
+ "model_max_length": 1024,
18
+ "pad_token": "<|endoftext|>",
19
+ "tokenizer_class": "GPT2Tokenizer",
20
+ "unk_token": "<|endoftext|>"
21
+ }
checkpoint-20000/trainer_state.json ADDED
@@ -0,0 +1,1594 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 20000,
3
+ "best_metric": 3.3754522800445557,
4
+ "best_model_checkpoint": "models/plausigpt/checkpoint-20000",
5
+ "epoch": 18.86812927577259,
6
+ "eval_steps": 1000,
7
+ "global_step": 20000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.09436187780136825,
14
+ "grad_norm": 2.9101743698120117,
15
+ "learning_rate": 9.9e-07,
16
+ "loss": 10.3242,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.1887237556027365,
21
+ "grad_norm": 2.162980556488037,
22
+ "learning_rate": 1.99e-06,
23
+ "loss": 9.296,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.28308563340410475,
28
+ "grad_norm": 2.0969018936157227,
29
+ "learning_rate": 2.99e-06,
30
+ "loss": 8.7898,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.377447511205473,
35
+ "grad_norm": 1.897030234336853,
36
+ "learning_rate": 3.99e-06,
37
+ "loss": 8.2757,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.4718093890068412,
42
+ "grad_norm": 1.666489601135254,
43
+ "learning_rate": 4.9900000000000005e-06,
44
+ "loss": 7.8218,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.5661712668082095,
49
+ "grad_norm": 1.5309317111968994,
50
+ "learning_rate": 5.99e-06,
51
+ "loss": 7.3494,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.6605331446095777,
56
+ "grad_norm": 1.2604295015335083,
57
+ "learning_rate": 6.990000000000001e-06,
58
+ "loss": 6.8932,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.754895022410946,
63
+ "grad_norm": 1.1508737802505493,
64
+ "learning_rate": 7.99e-06,
65
+ "loss": 6.5159,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.8492569002123143,
70
+ "grad_norm": 1.1227755546569824,
71
+ "learning_rate": 8.99e-06,
72
+ "loss": 6.2373,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.9436187780136824,
77
+ "grad_norm": 1.1233224868774414,
78
+ "learning_rate": 9.990000000000001e-06,
79
+ "loss": 6.053,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.9436187780136824,
84
+ "eval_loss": 5.930422782897949,
85
+ "eval_runtime": 89.9185,
86
+ "eval_samples_per_second": 167.607,
87
+ "eval_steps_per_second": 5.238,
88
+ "step": 1000
89
+ },
90
+ {
91
+ "epoch": 1.0377447511205473,
92
+ "grad_norm": 1.1704257726669312,
93
+ "learning_rate": 1.099e-05,
94
+ "loss": 5.9047,
95
+ "step": 1100
96
+ },
97
+ {
98
+ "epoch": 1.1321066289219155,
99
+ "grad_norm": 1.0163617134094238,
100
+ "learning_rate": 1.199e-05,
101
+ "loss": 5.7717,
102
+ "step": 1200
103
+ },
104
+ {
105
+ "epoch": 1.226468506723284,
106
+ "grad_norm": 0.9280975461006165,
107
+ "learning_rate": 1.299e-05,
108
+ "loss": 5.7047,
109
+ "step": 1300
110
+ },
111
+ {
112
+ "epoch": 1.320830384524652,
113
+ "grad_norm": 1.1254018545150757,
114
+ "learning_rate": 1.399e-05,
115
+ "loss": 5.6121,
116
+ "step": 1400
117
+ },
118
+ {
119
+ "epoch": 1.4151922623260202,
120
+ "grad_norm": 1.5115385055541992,
121
+ "learning_rate": 1.499e-05,
122
+ "loss": 5.5445,
123
+ "step": 1500
124
+ },
125
+ {
126
+ "epoch": 1.5095541401273884,
127
+ "grad_norm": 1.1451691389083862,
128
+ "learning_rate": 1.599e-05,
129
+ "loss": 5.4659,
130
+ "step": 1600
131
+ },
132
+ {
133
+ "epoch": 1.6039160179287568,
134
+ "grad_norm": 1.0520662069320679,
135
+ "learning_rate": 1.699e-05,
136
+ "loss": 5.4362,
137
+ "step": 1700
138
+ },
139
+ {
140
+ "epoch": 1.698277895730125,
141
+ "grad_norm": 1.2363018989562988,
142
+ "learning_rate": 1.7990000000000002e-05,
143
+ "loss": 5.3637,
144
+ "step": 1800
145
+ },
146
+ {
147
+ "epoch": 1.7926397735314934,
148
+ "grad_norm": 1.2105119228363037,
149
+ "learning_rate": 1.8990000000000003e-05,
150
+ "loss": 5.3063,
151
+ "step": 1900
152
+ },
153
+ {
154
+ "epoch": 1.8870016513328616,
155
+ "grad_norm": 1.5523535013198853,
156
+ "learning_rate": 1.999e-05,
157
+ "loss": 5.2543,
158
+ "step": 2000
159
+ },
160
+ {
161
+ "epoch": 1.8870016513328616,
162
+ "eval_loss": 5.177983283996582,
163
+ "eval_runtime": 89.9532,
164
+ "eval_samples_per_second": 167.543,
165
+ "eval_steps_per_second": 5.236,
166
+ "step": 2000
167
+ },
168
+ {
169
+ "epoch": 1.9813635291342298,
170
+ "grad_norm": 1.5004462003707886,
171
+ "learning_rate": 2.099e-05,
172
+ "loss": 5.1994,
173
+ "step": 2100
174
+ },
175
+ {
176
+ "epoch": 2.0754895022410946,
177
+ "grad_norm": 1.4016139507293701,
178
+ "learning_rate": 2.199e-05,
179
+ "loss": 5.1422,
180
+ "step": 2200
181
+ },
182
+ {
183
+ "epoch": 2.169851380042463,
184
+ "grad_norm": 1.5630654096603394,
185
+ "learning_rate": 2.2990000000000002e-05,
186
+ "loss": 5.0975,
187
+ "step": 2300
188
+ },
189
+ {
190
+ "epoch": 2.264213257843831,
191
+ "grad_norm": 1.373801827430725,
192
+ "learning_rate": 2.3990000000000002e-05,
193
+ "loss": 5.0522,
194
+ "step": 2400
195
+ },
196
+ {
197
+ "epoch": 2.358575135645199,
198
+ "grad_norm": 1.3656255006790161,
199
+ "learning_rate": 2.4990000000000003e-05,
200
+ "loss": 5.0166,
201
+ "step": 2500
202
+ },
203
+ {
204
+ "epoch": 2.452937013446568,
205
+ "grad_norm": 1.4151180982589722,
206
+ "learning_rate": 2.5990000000000004e-05,
207
+ "loss": 4.9653,
208
+ "step": 2600
209
+ },
210
+ {
211
+ "epoch": 2.547298891247936,
212
+ "grad_norm": 1.5235018730163574,
213
+ "learning_rate": 2.6989999999999997e-05,
214
+ "loss": 4.9166,
215
+ "step": 2700
216
+ },
217
+ {
218
+ "epoch": 2.641660769049304,
219
+ "grad_norm": 1.2987467050552368,
220
+ "learning_rate": 2.7989999999999998e-05,
221
+ "loss": 4.877,
222
+ "step": 2800
223
+ },
224
+ {
225
+ "epoch": 2.7360226468506723,
226
+ "grad_norm": 1.3841923475265503,
227
+ "learning_rate": 2.8990000000000002e-05,
228
+ "loss": 4.8477,
229
+ "step": 2900
230
+ },
231
+ {
232
+ "epoch": 2.8303845246520405,
233
+ "grad_norm": 1.5736286640167236,
234
+ "learning_rate": 2.9990000000000003e-05,
235
+ "loss": 4.7941,
236
+ "step": 3000
237
+ },
238
+ {
239
+ "epoch": 2.8303845246520405,
240
+ "eval_loss": 4.716542720794678,
241
+ "eval_runtime": 89.9497,
242
+ "eval_samples_per_second": 167.549,
243
+ "eval_steps_per_second": 5.236,
244
+ "step": 3000
245
+ },
246
+ {
247
+ "epoch": 2.9247464024534087,
248
+ "grad_norm": 1.404842734336853,
249
+ "learning_rate": 3.099e-05,
250
+ "loss": 4.7567,
251
+ "step": 3100
252
+ },
253
+ {
254
+ "epoch": 3.0188723755602735,
255
+ "grad_norm": 1.2254925966262817,
256
+ "learning_rate": 3.1990000000000004e-05,
257
+ "loss": 4.7165,
258
+ "step": 3200
259
+ },
260
+ {
261
+ "epoch": 3.1132342533616417,
262
+ "grad_norm": 1.2496081590652466,
263
+ "learning_rate": 3.299e-05,
264
+ "loss": 4.6894,
265
+ "step": 3300
266
+ },
267
+ {
268
+ "epoch": 3.2075961311630103,
269
+ "grad_norm": 1.3087129592895508,
270
+ "learning_rate": 3.399e-05,
271
+ "loss": 4.6382,
272
+ "step": 3400
273
+ },
274
+ {
275
+ "epoch": 3.3019580089643785,
276
+ "grad_norm": 1.2795251607894897,
277
+ "learning_rate": 3.499e-05,
278
+ "loss": 4.6091,
279
+ "step": 3500
280
+ },
281
+ {
282
+ "epoch": 3.3963198867657467,
283
+ "grad_norm": 1.2816106081008911,
284
+ "learning_rate": 3.599e-05,
285
+ "loss": 4.5872,
286
+ "step": 3600
287
+ },
288
+ {
289
+ "epoch": 3.490681764567115,
290
+ "grad_norm": 1.2217532396316528,
291
+ "learning_rate": 3.699e-05,
292
+ "loss": 4.5528,
293
+ "step": 3700
294
+ },
295
+ {
296
+ "epoch": 3.585043642368483,
297
+ "grad_norm": 1.2160422801971436,
298
+ "learning_rate": 3.799e-05,
299
+ "loss": 4.5129,
300
+ "step": 3800
301
+ },
302
+ {
303
+ "epoch": 3.6794055201698512,
304
+ "grad_norm": 1.2504174709320068,
305
+ "learning_rate": 3.8990000000000004e-05,
306
+ "loss": 4.4912,
307
+ "step": 3900
308
+ },
309
+ {
310
+ "epoch": 3.77376739797122,
311
+ "grad_norm": 1.3709900379180908,
312
+ "learning_rate": 3.999e-05,
313
+ "loss": 4.4662,
314
+ "step": 4000
315
+ },
316
+ {
317
+ "epoch": 3.77376739797122,
318
+ "eval_loss": 4.381489276885986,
319
+ "eval_runtime": 89.9497,
320
+ "eval_samples_per_second": 167.549,
321
+ "eval_steps_per_second": 5.236,
322
+ "step": 4000
323
+ },
324
+ {
325
+ "epoch": 3.868129275772588,
326
+ "grad_norm": 1.2039287090301514,
327
+ "learning_rate": 4.099e-05,
328
+ "loss": 4.4265,
329
+ "step": 4100
330
+ },
331
+ {
332
+ "epoch": 3.962491153573956,
333
+ "grad_norm": 1.2931344509124756,
334
+ "learning_rate": 4.199e-05,
335
+ "loss": 4.4062,
336
+ "step": 4200
337
+ },
338
+ {
339
+ "epoch": 4.056617126680821,
340
+ "grad_norm": 1.1824957132339478,
341
+ "learning_rate": 4.299e-05,
342
+ "loss": 4.3693,
343
+ "step": 4300
344
+ },
345
+ {
346
+ "epoch": 4.150979004482189,
347
+ "grad_norm": 1.1529172658920288,
348
+ "learning_rate": 4.3990000000000004e-05,
349
+ "loss": 4.3337,
350
+ "step": 4400
351
+ },
352
+ {
353
+ "epoch": 4.245340882283557,
354
+ "grad_norm": 1.076854944229126,
355
+ "learning_rate": 4.499e-05,
356
+ "loss": 4.3073,
357
+ "step": 4500
358
+ },
359
+ {
360
+ "epoch": 4.339702760084926,
361
+ "grad_norm": 1.2523971796035767,
362
+ "learning_rate": 4.599e-05,
363
+ "loss": 4.2877,
364
+ "step": 4600
365
+ },
366
+ {
367
+ "epoch": 4.434064637886294,
368
+ "grad_norm": 1.19026780128479,
369
+ "learning_rate": 4.699e-05,
370
+ "loss": 4.2684,
371
+ "step": 4700
372
+ },
373
+ {
374
+ "epoch": 4.528426515687662,
375
+ "grad_norm": 1.1093727350234985,
376
+ "learning_rate": 4.799e-05,
377
+ "loss": 4.2301,
378
+ "step": 4800
379
+ },
380
+ {
381
+ "epoch": 4.622788393489031,
382
+ "grad_norm": 1.170032262802124,
383
+ "learning_rate": 4.8990000000000004e-05,
384
+ "loss": 4.2172,
385
+ "step": 4900
386
+ },
387
+ {
388
+ "epoch": 4.717150271290398,
389
+ "grad_norm": 1.1602752208709717,
390
+ "learning_rate": 4.999e-05,
391
+ "loss": 4.1815,
392
+ "step": 5000
393
+ },
394
+ {
395
+ "epoch": 4.717150271290398,
396
+ "eval_loss": 4.115845203399658,
397
+ "eval_runtime": 89.9622,
398
+ "eval_samples_per_second": 167.526,
399
+ "eval_steps_per_second": 5.236,
400
+ "step": 5000
401
+ },
402
+ {
403
+ "epoch": 4.811512149091767,
404
+ "grad_norm": 1.1583232879638672,
405
+ "learning_rate": 5.0990000000000005e-05,
406
+ "loss": 4.1705,
407
+ "step": 5100
408
+ },
409
+ {
410
+ "epoch": 4.905874026893136,
411
+ "grad_norm": 1.1284886598587036,
412
+ "learning_rate": 5.199000000000001e-05,
413
+ "loss": 4.1492,
414
+ "step": 5200
415
+ },
416
+ {
417
+ "epoch": 5.0,
418
+ "grad_norm": 1.2270270586013794,
419
+ "learning_rate": 5.2990000000000006e-05,
420
+ "loss": 4.1234,
421
+ "step": 5300
422
+ },
423
+ {
424
+ "epoch": 5.094361877801369,
425
+ "grad_norm": 1.1221809387207031,
426
+ "learning_rate": 5.399000000000001e-05,
427
+ "loss": 4.0913,
428
+ "step": 5400
429
+ },
430
+ {
431
+ "epoch": 5.188723755602736,
432
+ "grad_norm": 1.1447559595108032,
433
+ "learning_rate": 5.499000000000001e-05,
434
+ "loss": 4.0568,
435
+ "step": 5500
436
+ },
437
+ {
438
+ "epoch": 5.283085633404105,
439
+ "grad_norm": 1.109061598777771,
440
+ "learning_rate": 5.599e-05,
441
+ "loss": 4.0514,
442
+ "step": 5600
443
+ },
444
+ {
445
+ "epoch": 5.377447511205473,
446
+ "grad_norm": 1.2101478576660156,
447
+ "learning_rate": 5.699e-05,
448
+ "loss": 4.0345,
449
+ "step": 5700
450
+ },
451
+ {
452
+ "epoch": 5.471809389006841,
453
+ "grad_norm": 1.0513982772827148,
454
+ "learning_rate": 5.799e-05,
455
+ "loss": 4.0028,
456
+ "step": 5800
457
+ },
458
+ {
459
+ "epoch": 5.56617126680821,
460
+ "grad_norm": 1.1381795406341553,
461
+ "learning_rate": 5.899e-05,
462
+ "loss": 4.0061,
463
+ "step": 5900
464
+ },
465
+ {
466
+ "epoch": 5.660533144609578,
467
+ "grad_norm": 1.102358102798462,
468
+ "learning_rate": 5.999e-05,
469
+ "loss": 3.981,
470
+ "step": 6000
471
+ },
472
+ {
473
+ "epoch": 5.660533144609578,
474
+ "eval_loss": 3.9213197231292725,
475
+ "eval_runtime": 89.9286,
476
+ "eval_samples_per_second": 167.589,
477
+ "eval_steps_per_second": 5.237,
478
+ "step": 6000
479
+ },
480
+ {
481
+ "epoch": 5.754895022410946,
482
+ "grad_norm": 1.0048632621765137,
483
+ "learning_rate": 6.0990000000000004e-05,
484
+ "loss": 3.9619,
485
+ "step": 6100
486
+ },
487
+ {
488
+ "epoch": 5.849256900212314,
489
+ "grad_norm": 1.0199745893478394,
490
+ "learning_rate": 6.199000000000001e-05,
491
+ "loss": 3.96,
492
+ "step": 6200
493
+ },
494
+ {
495
+ "epoch": 5.943618778013683,
496
+ "grad_norm": 1.052060842514038,
497
+ "learning_rate": 6.299e-05,
498
+ "loss": 3.9441,
499
+ "step": 6300
500
+ },
501
+ {
502
+ "epoch": 6.037744751120547,
503
+ "grad_norm": 1.1077136993408203,
504
+ "learning_rate": 6.399e-05,
505
+ "loss": 3.9129,
506
+ "step": 6400
507
+ },
508
+ {
509
+ "epoch": 6.132106628921916,
510
+ "grad_norm": 0.9797239303588867,
511
+ "learning_rate": 6.499000000000001e-05,
512
+ "loss": 3.8972,
513
+ "step": 6500
514
+ },
515
+ {
516
+ "epoch": 6.2264685067232834,
517
+ "grad_norm": 0.9934578537940979,
518
+ "learning_rate": 6.599000000000001e-05,
519
+ "loss": 3.8777,
520
+ "step": 6600
521
+ },
522
+ {
523
+ "epoch": 6.320830384524652,
524
+ "grad_norm": 0.9989880323410034,
525
+ "learning_rate": 6.699000000000001e-05,
526
+ "loss": 3.859,
527
+ "step": 6700
528
+ },
529
+ {
530
+ "epoch": 6.415192262326021,
531
+ "grad_norm": 0.9644502997398376,
532
+ "learning_rate": 6.799e-05,
533
+ "loss": 3.8596,
534
+ "step": 6800
535
+ },
536
+ {
537
+ "epoch": 6.509554140127388,
538
+ "grad_norm": 1.004499912261963,
539
+ "learning_rate": 6.899e-05,
540
+ "loss": 3.8494,
541
+ "step": 6900
542
+ },
543
+ {
544
+ "epoch": 6.603916017928757,
545
+ "grad_norm": 0.9108039736747742,
546
+ "learning_rate": 6.999e-05,
547
+ "loss": 3.8294,
548
+ "step": 7000
549
+ },
550
+ {
551
+ "epoch": 6.603916017928757,
552
+ "eval_loss": 3.7933058738708496,
553
+ "eval_runtime": 89.9278,
554
+ "eval_samples_per_second": 167.59,
555
+ "eval_steps_per_second": 5.238,
556
+ "step": 7000
557
+ },
558
+ {
559
+ "epoch": 6.698277895730125,
560
+ "grad_norm": 0.9396700859069824,
561
+ "learning_rate": 7.099e-05,
562
+ "loss": 3.8365,
563
+ "step": 7100
564
+ },
565
+ {
566
+ "epoch": 6.792639773531493,
567
+ "grad_norm": 0.9836630821228027,
568
+ "learning_rate": 7.199000000000001e-05,
569
+ "loss": 3.8102,
570
+ "step": 7200
571
+ },
572
+ {
573
+ "epoch": 6.887001651332861,
574
+ "grad_norm": 0.9225268959999084,
575
+ "learning_rate": 7.299e-05,
576
+ "loss": 3.8053,
577
+ "step": 7300
578
+ },
579
+ {
580
+ "epoch": 6.98136352913423,
581
+ "grad_norm": 0.9103354215621948,
582
+ "learning_rate": 7.399e-05,
583
+ "loss": 3.8067,
584
+ "step": 7400
585
+ },
586
+ {
587
+ "epoch": 7.075489502241094,
588
+ "grad_norm": 0.9512243270874023,
589
+ "learning_rate": 7.499e-05,
590
+ "loss": 3.7489,
591
+ "step": 7500
592
+ },
593
+ {
594
+ "epoch": 7.169851380042463,
595
+ "grad_norm": 0.9671022295951843,
596
+ "learning_rate": 7.599000000000001e-05,
597
+ "loss": 3.7524,
598
+ "step": 7600
599
+ },
600
+ {
601
+ "epoch": 7.264213257843831,
602
+ "grad_norm": 0.9643733501434326,
603
+ "learning_rate": 7.699e-05,
604
+ "loss": 3.7309,
605
+ "step": 7700
606
+ },
607
+ {
608
+ "epoch": 7.358575135645199,
609
+ "grad_norm": 0.94767165184021,
610
+ "learning_rate": 7.799e-05,
611
+ "loss": 3.7428,
612
+ "step": 7800
613
+ },
614
+ {
615
+ "epoch": 7.452937013446568,
616
+ "grad_norm": 0.9278003573417664,
617
+ "learning_rate": 7.899000000000001e-05,
618
+ "loss": 3.7256,
619
+ "step": 7900
620
+ },
621
+ {
622
+ "epoch": 7.5472988912479355,
623
+ "grad_norm": 0.8693475127220154,
624
+ "learning_rate": 7.999000000000001e-05,
625
+ "loss": 3.7299,
626
+ "step": 8000
627
+ },
628
+ {
629
+ "epoch": 7.5472988912479355,
630
+ "eval_loss": 3.701340675354004,
631
+ "eval_runtime": 89.9644,
632
+ "eval_samples_per_second": 167.522,
633
+ "eval_steps_per_second": 5.235,
634
+ "step": 8000
635
+ },
636
+ {
637
+ "epoch": 7.641660769049304,
638
+ "grad_norm": 0.8692898750305176,
639
+ "learning_rate": 8.099e-05,
640
+ "loss": 3.7201,
641
+ "step": 8100
642
+ },
643
+ {
644
+ "epoch": 7.736022646850673,
645
+ "grad_norm": 0.86644047498703,
646
+ "learning_rate": 8.199e-05,
647
+ "loss": 3.7185,
648
+ "step": 8200
649
+ },
650
+ {
651
+ "epoch": 7.8303845246520405,
652
+ "grad_norm": 0.9059743285179138,
653
+ "learning_rate": 8.299e-05,
654
+ "loss": 3.712,
655
+ "step": 8300
656
+ },
657
+ {
658
+ "epoch": 7.924746402453409,
659
+ "grad_norm": 0.8368768692016602,
660
+ "learning_rate": 8.399e-05,
661
+ "loss": 3.7064,
662
+ "step": 8400
663
+ },
664
+ {
665
+ "epoch": 8.018872375560274,
666
+ "grad_norm": 0.8828296065330505,
667
+ "learning_rate": 8.499e-05,
668
+ "loss": 3.6786,
669
+ "step": 8500
670
+ },
671
+ {
672
+ "epoch": 8.113234253361641,
673
+ "grad_norm": 0.8559228181838989,
674
+ "learning_rate": 8.599000000000001e-05,
675
+ "loss": 3.643,
676
+ "step": 8600
677
+ },
678
+ {
679
+ "epoch": 8.20759613116301,
680
+ "grad_norm": 0.8702303171157837,
681
+ "learning_rate": 8.699e-05,
682
+ "loss": 3.6411,
683
+ "step": 8700
684
+ },
685
+ {
686
+ "epoch": 8.301958008964379,
687
+ "grad_norm": 0.8181409239768982,
688
+ "learning_rate": 8.799e-05,
689
+ "loss": 3.6498,
690
+ "step": 8800
691
+ },
692
+ {
693
+ "epoch": 8.396319886765747,
694
+ "grad_norm": 0.839365541934967,
695
+ "learning_rate": 8.899e-05,
696
+ "loss": 3.641,
697
+ "step": 8900
698
+ },
699
+ {
700
+ "epoch": 8.490681764567114,
701
+ "grad_norm": 0.8675922155380249,
702
+ "learning_rate": 8.999000000000001e-05,
703
+ "loss": 3.6283,
704
+ "step": 9000
705
+ },
706
+ {
707
+ "epoch": 8.490681764567114,
708
+ "eval_loss": 3.6308939456939697,
709
+ "eval_runtime": 89.9234,
710
+ "eval_samples_per_second": 167.598,
711
+ "eval_steps_per_second": 5.238,
712
+ "step": 9000
713
+ },
714
+ {
715
+ "epoch": 8.585043642368483,
716
+ "grad_norm": 0.8916610479354858,
717
+ "learning_rate": 9.099000000000001e-05,
718
+ "loss": 3.6343,
719
+ "step": 9100
720
+ },
721
+ {
722
+ "epoch": 8.679405520169851,
723
+ "grad_norm": 0.81273353099823,
724
+ "learning_rate": 9.199e-05,
725
+ "loss": 3.6309,
726
+ "step": 9200
727
+ },
728
+ {
729
+ "epoch": 8.77376739797122,
730
+ "grad_norm": 0.8205325603485107,
731
+ "learning_rate": 9.299e-05,
732
+ "loss": 3.6322,
733
+ "step": 9300
734
+ },
735
+ {
736
+ "epoch": 8.868129275772588,
737
+ "grad_norm": 0.8169659972190857,
738
+ "learning_rate": 9.399e-05,
739
+ "loss": 3.6216,
740
+ "step": 9400
741
+ },
742
+ {
743
+ "epoch": 8.962491153573955,
744
+ "grad_norm": 0.8198681473731995,
745
+ "learning_rate": 9.499e-05,
746
+ "loss": 3.6054,
747
+ "step": 9500
748
+ },
749
+ {
750
+ "epoch": 9.056617126680822,
751
+ "grad_norm": 0.8319467902183533,
752
+ "learning_rate": 9.599000000000001e-05,
753
+ "loss": 3.5813,
754
+ "step": 9600
755
+ },
756
+ {
757
+ "epoch": 9.150979004482188,
758
+ "grad_norm": 0.814388632774353,
759
+ "learning_rate": 9.699e-05,
760
+ "loss": 3.5636,
761
+ "step": 9700
762
+ },
763
+ {
764
+ "epoch": 9.245340882283557,
765
+ "grad_norm": 0.8515892624855042,
766
+ "learning_rate": 9.799e-05,
767
+ "loss": 3.5551,
768
+ "step": 9800
769
+ },
770
+ {
771
+ "epoch": 9.339702760084926,
772
+ "grad_norm": 0.8092982769012451,
773
+ "learning_rate": 9.899e-05,
774
+ "loss": 3.5585,
775
+ "step": 9900
776
+ },
777
+ {
778
+ "epoch": 9.434064637886294,
779
+ "grad_norm": 0.8135222792625427,
780
+ "learning_rate": 9.999000000000001e-05,
781
+ "loss": 3.5615,
782
+ "step": 10000
783
+ },
784
+ {
785
+ "epoch": 9.434064637886294,
786
+ "eval_loss": 3.57912540435791,
787
+ "eval_runtime": 89.9276,
788
+ "eval_samples_per_second": 167.59,
789
+ "eval_steps_per_second": 5.238,
790
+ "step": 10000
791
+ },
792
+ {
793
+ "epoch": 9.528426515687663,
794
+ "grad_norm": 0.7966075539588928,
795
+ "learning_rate": 9.999970144476398e-05,
796
+ "loss": 3.5535,
797
+ "step": 10100
798
+ },
799
+ {
800
+ "epoch": 9.62278839348903,
801
+ "grad_norm": 0.7658608555793762,
802
+ "learning_rate": 9.999879368940656e-05,
803
+ "loss": 3.5665,
804
+ "step": 10200
805
+ },
806
+ {
807
+ "epoch": 9.717150271290398,
808
+ "grad_norm": 0.7524927854537964,
809
+ "learning_rate": 9.999727671452668e-05,
810
+ "loss": 3.5502,
811
+ "step": 10300
812
+ },
813
+ {
814
+ "epoch": 9.811512149091767,
815
+ "grad_norm": 0.743500292301178,
816
+ "learning_rate": 9.999515053860821e-05,
817
+ "loss": 3.5497,
818
+ "step": 10400
819
+ },
820
+ {
821
+ "epoch": 9.905874026893136,
822
+ "grad_norm": 0.7590805292129517,
823
+ "learning_rate": 9.999241518755793e-05,
824
+ "loss": 3.5467,
825
+ "step": 10500
826
+ },
827
+ {
828
+ "epoch": 10.0,
829
+ "grad_norm": 0.8875275254249573,
830
+ "learning_rate": 9.998907069470524e-05,
831
+ "loss": 3.545,
832
+ "step": 10600
833
+ },
834
+ {
835
+ "epoch": 10.094361877801369,
836
+ "grad_norm": 0.7724853157997131,
837
+ "learning_rate": 9.998511710080171e-05,
838
+ "loss": 3.4858,
839
+ "step": 10700
840
+ },
841
+ {
842
+ "epoch": 10.188723755602737,
843
+ "grad_norm": 0.7321507334709167,
844
+ "learning_rate": 9.998055445402067e-05,
845
+ "loss": 3.4796,
846
+ "step": 10800
847
+ },
848
+ {
849
+ "epoch": 10.283085633404104,
850
+ "grad_norm": 0.7564536333084106,
851
+ "learning_rate": 9.997538280995651e-05,
852
+ "loss": 3.4922,
853
+ "step": 10900
854
+ },
855
+ {
856
+ "epoch": 10.377447511205473,
857
+ "grad_norm": 0.7601178288459778,
858
+ "learning_rate": 9.996960223162406e-05,
859
+ "loss": 3.4881,
860
+ "step": 11000
861
+ },
862
+ {
863
+ "epoch": 10.377447511205473,
864
+ "eval_loss": 3.530785322189331,
865
+ "eval_runtime": 89.9198,
866
+ "eval_samples_per_second": 167.605,
867
+ "eval_steps_per_second": 5.238,
868
+ "step": 11000
869
+ },
870
+ {
871
+ "epoch": 10.471809389006841,
872
+ "grad_norm": 0.7771745324134827,
873
+ "learning_rate": 9.996321278945788e-05,
874
+ "loss": 3.4822,
875
+ "step": 11100
876
+ },
877
+ {
878
+ "epoch": 10.56617126680821,
879
+ "grad_norm": 0.7326973676681519,
880
+ "learning_rate": 9.995621456131128e-05,
881
+ "loss": 3.4838,
882
+ "step": 11200
883
+ },
884
+ {
885
+ "epoch": 10.660533144609577,
886
+ "grad_norm": 0.728434145450592,
887
+ "learning_rate": 9.994860763245549e-05,
888
+ "loss": 3.4832,
889
+ "step": 11300
890
+ },
891
+ {
892
+ "epoch": 10.754895022410945,
893
+ "grad_norm": 0.7702102065086365,
894
+ "learning_rate": 9.99403920955785e-05,
895
+ "loss": 3.4962,
896
+ "step": 11400
897
+ },
898
+ {
899
+ "epoch": 10.849256900212314,
900
+ "grad_norm": 0.718971848487854,
901
+ "learning_rate": 9.993156805078405e-05,
902
+ "loss": 3.486,
903
+ "step": 11500
904
+ },
905
+ {
906
+ "epoch": 10.943618778013683,
907
+ "grad_norm": 0.7548109889030457,
908
+ "learning_rate": 9.992213560559034e-05,
909
+ "loss": 3.484,
910
+ "step": 11600
911
+ },
912
+ {
913
+ "epoch": 11.037744751120547,
914
+ "grad_norm": 0.7601837515830994,
915
+ "learning_rate": 9.991209487492876e-05,
916
+ "loss": 3.4513,
917
+ "step": 11700
918
+ },
919
+ {
920
+ "epoch": 11.132106628921916,
921
+ "grad_norm": 0.7187873721122742,
922
+ "learning_rate": 9.990144598114242e-05,
923
+ "loss": 3.4157,
924
+ "step": 11800
925
+ },
926
+ {
927
+ "epoch": 11.226468506723284,
928
+ "grad_norm": 0.7205685377120972,
929
+ "learning_rate": 9.989018905398473e-05,
930
+ "loss": 3.4232,
931
+ "step": 11900
932
+ },
933
+ {
934
+ "epoch": 11.320830384524651,
935
+ "grad_norm": 0.761542558670044,
936
+ "learning_rate": 9.98783242306178e-05,
937
+ "loss": 3.4295,
938
+ "step": 12000
939
+ },
940
+ {
941
+ "epoch": 11.320830384524651,
942
+ "eval_loss": 3.497931957244873,
943
+ "eval_runtime": 89.9528,
944
+ "eval_samples_per_second": 167.543,
945
+ "eval_steps_per_second": 5.236,
946
+ "step": 12000
947
+ },
948
+ {
949
+ "epoch": 11.41519226232602,
950
+ "grad_norm": 0.7080798149108887,
951
+ "learning_rate": 9.986585165561076e-05,
952
+ "loss": 3.4227,
953
+ "step": 12100
954
+ },
955
+ {
956
+ "epoch": 11.509554140127388,
957
+ "grad_norm": 0.7278120517730713,
958
+ "learning_rate": 9.9852771480938e-05,
959
+ "loss": 3.432,
960
+ "step": 12200
961
+ },
962
+ {
963
+ "epoch": 11.603916017928757,
964
+ "grad_norm": 0.7304459810256958,
965
+ "learning_rate": 9.983908386597732e-05,
966
+ "loss": 3.4355,
967
+ "step": 12300
968
+ },
969
+ {
970
+ "epoch": 11.698277895730126,
971
+ "grad_norm": 0.7287798523902893,
972
+ "learning_rate": 9.9824788977508e-05,
973
+ "loss": 3.4281,
974
+ "step": 12400
975
+ },
976
+ {
977
+ "epoch": 11.792639773531493,
978
+ "grad_norm": 0.6873247027397156,
979
+ "learning_rate": 9.980988698970872e-05,
980
+ "loss": 3.4263,
981
+ "step": 12500
982
+ },
983
+ {
984
+ "epoch": 11.887001651332861,
985
+ "grad_norm": 0.7197590470314026,
986
+ "learning_rate": 9.979437808415552e-05,
987
+ "loss": 3.4261,
988
+ "step": 12600
989
+ },
990
+ {
991
+ "epoch": 11.98136352913423,
992
+ "grad_norm": 0.691047728061676,
993
+ "learning_rate": 9.977826244981952e-05,
994
+ "loss": 3.4262,
995
+ "step": 12700
996
+ },
997
+ {
998
+ "epoch": 12.075489502241094,
999
+ "grad_norm": 0.7147277593612671,
1000
+ "learning_rate": 9.976154028306461e-05,
1001
+ "loss": 3.3695,
1002
+ "step": 12800
1003
+ },
1004
+ {
1005
+ "epoch": 12.169851380042463,
1006
+ "grad_norm": 0.7131621837615967,
1007
+ "learning_rate": 9.974421178764515e-05,
1008
+ "loss": 3.3619,
1009
+ "step": 12900
1010
+ },
1011
+ {
1012
+ "epoch": 12.264213257843831,
1013
+ "grad_norm": 0.6843485832214355,
1014
+ "learning_rate": 9.972627717470337e-05,
1015
+ "loss": 3.3786,
1016
+ "step": 13000
1017
+ },
1018
+ {
1019
+ "epoch": 12.264213257843831,
1020
+ "eval_loss": 3.4681344032287598,
1021
+ "eval_runtime": 89.9337,
1022
+ "eval_samples_per_second": 167.579,
1023
+ "eval_steps_per_second": 5.237,
1024
+ "step": 13000
1025
+ },
1026
+ {
1027
+ "epoch": 12.3585751356452,
1028
+ "grad_norm": 0.7265843152999878,
1029
+ "learning_rate": 9.970773666276686e-05,
1030
+ "loss": 3.3759,
1031
+ "step": 13100
1032
+ },
1033
+ {
1034
+ "epoch": 12.452937013446567,
1035
+ "grad_norm": 0.7135173082351685,
1036
+ "learning_rate": 9.968859047774595e-05,
1037
+ "loss": 3.3757,
1038
+ "step": 13200
1039
+ },
1040
+ {
1041
+ "epoch": 12.547298891247936,
1042
+ "grad_norm": 0.7075929045677185,
1043
+ "learning_rate": 9.966883885293081e-05,
1044
+ "loss": 3.3868,
1045
+ "step": 13300
1046
+ },
1047
+ {
1048
+ "epoch": 12.641660769049304,
1049
+ "grad_norm": 0.6600580811500549,
1050
+ "learning_rate": 9.964848202898879e-05,
1051
+ "loss": 3.3768,
1052
+ "step": 13400
1053
+ },
1054
+ {
1055
+ "epoch": 12.736022646850673,
1056
+ "grad_norm": 0.6909327507019043,
1057
+ "learning_rate": 9.962752025396133e-05,
1058
+ "loss": 3.3761,
1059
+ "step": 13500
1060
+ },
1061
+ {
1062
+ "epoch": 12.830384524652041,
1063
+ "grad_norm": 0.7116390466690063,
1064
+ "learning_rate": 9.96059537832611e-05,
1065
+ "loss": 3.3696,
1066
+ "step": 13600
1067
+ },
1068
+ {
1069
+ "epoch": 12.924746402453408,
1070
+ "grad_norm": 0.6888706088066101,
1071
+ "learning_rate": 9.958378287966868e-05,
1072
+ "loss": 3.3835,
1073
+ "step": 13700
1074
+ },
1075
+ {
1076
+ "epoch": 13.018872375560274,
1077
+ "grad_norm": 0.6996840834617615,
1078
+ "learning_rate": 9.956100781332958e-05,
1079
+ "loss": 3.3644,
1080
+ "step": 13800
1081
+ },
1082
+ {
1083
+ "epoch": 13.113234253361641,
1084
+ "grad_norm": 0.7074296474456787,
1085
+ "learning_rate": 9.953762886175075e-05,
1086
+ "loss": 3.3085,
1087
+ "step": 13900
1088
+ },
1089
+ {
1090
+ "epoch": 13.20759613116301,
1091
+ "grad_norm": 0.7509676218032837,
1092
+ "learning_rate": 9.951364630979738e-05,
1093
+ "loss": 3.324,
1094
+ "step": 14000
1095
+ },
1096
+ {
1097
+ "epoch": 13.20759613116301,
1098
+ "eval_loss": 3.446702241897583,
1099
+ "eval_runtime": 89.9626,
1100
+ "eval_samples_per_second": 167.525,
1101
+ "eval_steps_per_second": 5.236,
1102
+ "step": 14000
1103
+ },
1104
+ {
1105
+ "epoch": 13.301958008964379,
1106
+ "grad_norm": 0.6905140280723572,
1107
+ "learning_rate": 9.948906044968926e-05,
1108
+ "loss": 3.3204,
1109
+ "step": 14100
1110
+ },
1111
+ {
1112
+ "epoch": 13.396319886765747,
1113
+ "grad_norm": 0.6943195462226868,
1114
+ "learning_rate": 9.946387158099738e-05,
1115
+ "loss": 3.3314,
1116
+ "step": 14200
1117
+ },
1118
+ {
1119
+ "epoch": 13.490681764567114,
1120
+ "grad_norm": 0.748652994632721,
1121
+ "learning_rate": 9.943808001064013e-05,
1122
+ "loss": 3.3365,
1123
+ "step": 14300
1124
+ },
1125
+ {
1126
+ "epoch": 13.585043642368483,
1127
+ "grad_norm": 0.6941584944725037,
1128
+ "learning_rate": 9.941168605287965e-05,
1129
+ "loss": 3.3327,
1130
+ "step": 14400
1131
+ },
1132
+ {
1133
+ "epoch": 13.679405520169851,
1134
+ "grad_norm": 0.7011757493019104,
1135
+ "learning_rate": 9.938469002931798e-05,
1136
+ "loss": 3.336,
1137
+ "step": 14500
1138
+ },
1139
+ {
1140
+ "epoch": 13.77376739797122,
1141
+ "grad_norm": 0.6881093978881836,
1142
+ "learning_rate": 9.935709226889319e-05,
1143
+ "loss": 3.3441,
1144
+ "step": 14600
1145
+ },
1146
+ {
1147
+ "epoch": 13.868129275772588,
1148
+ "grad_norm": 0.6721529960632324,
1149
+ "learning_rate": 9.932889310787522e-05,
1150
+ "loss": 3.3355,
1151
+ "step": 14700
1152
+ },
1153
+ {
1154
+ "epoch": 13.962491153573955,
1155
+ "grad_norm": 0.6991400718688965,
1156
+ "learning_rate": 9.9300092889862e-05,
1157
+ "loss": 3.332,
1158
+ "step": 14800
1159
+ },
1160
+ {
1161
+ "epoch": 14.056617126680822,
1162
+ "grad_norm": 0.6730444431304932,
1163
+ "learning_rate": 9.927069196577507e-05,
1164
+ "loss": 3.2893,
1165
+ "step": 14900
1166
+ },
1167
+ {
1168
+ "epoch": 14.150979004482188,
1169
+ "grad_norm": 0.6822571754455566,
1170
+ "learning_rate": 9.924069069385543e-05,
1171
+ "loss": 3.2673,
1172
+ "step": 15000
1173
+ },
1174
+ {
1175
+ "epoch": 14.150979004482188,
1176
+ "eval_loss": 3.428182601928711,
1177
+ "eval_runtime": 89.9364,
1178
+ "eval_samples_per_second": 167.574,
1179
+ "eval_steps_per_second": 5.237,
1180
+ "step": 15000
1181
+ },
1182
+ {
1183
+ "epoch": 14.245340882283557,
1184
+ "grad_norm": 0.6944796442985535,
1185
+ "learning_rate": 9.921008943965908e-05,
1186
+ "loss": 3.2846,
1187
+ "step": 15100
1188
+ },
1189
+ {
1190
+ "epoch": 14.339702760084926,
1191
+ "grad_norm": 0.6927157044410706,
1192
+ "learning_rate": 9.917888857605268e-05,
1193
+ "loss": 3.2856,
1194
+ "step": 15200
1195
+ },
1196
+ {
1197
+ "epoch": 14.434064637886294,
1198
+ "grad_norm": 0.6758902072906494,
1199
+ "learning_rate": 9.91470884832089e-05,
1200
+ "loss": 3.2768,
1201
+ "step": 15300
1202
+ },
1203
+ {
1204
+ "epoch": 14.528426515687663,
1205
+ "grad_norm": 0.7083920836448669,
1206
+ "learning_rate": 9.911468954860181e-05,
1207
+ "loss": 3.2846,
1208
+ "step": 15400
1209
+ },
1210
+ {
1211
+ "epoch": 14.62278839348903,
1212
+ "grad_norm": 0.6658477783203125,
1213
+ "learning_rate": 9.908169216700223e-05,
1214
+ "loss": 3.3001,
1215
+ "step": 15500
1216
+ },
1217
+ {
1218
+ "epoch": 14.717150271290398,
1219
+ "grad_norm": 0.6643409729003906,
1220
+ "learning_rate": 9.904809674047284e-05,
1221
+ "loss": 3.3046,
1222
+ "step": 15600
1223
+ },
1224
+ {
1225
+ "epoch": 14.811512149091767,
1226
+ "grad_norm": 0.6668530106544495,
1227
+ "learning_rate": 9.90139036783633e-05,
1228
+ "loss": 3.3031,
1229
+ "step": 15700
1230
+ },
1231
+ {
1232
+ "epoch": 14.905874026893136,
1233
+ "grad_norm": 0.6760970950126648,
1234
+ "learning_rate": 9.897911339730527e-05,
1235
+ "loss": 3.3031,
1236
+ "step": 15800
1237
+ },
1238
+ {
1239
+ "epoch": 15.0,
1240
+ "grad_norm": 0.8160315155982971,
1241
+ "learning_rate": 9.894372632120738e-05,
1242
+ "loss": 3.3028,
1243
+ "step": 15900
1244
+ },
1245
+ {
1246
+ "epoch": 15.094361877801369,
1247
+ "grad_norm": 0.6879032850265503,
1248
+ "learning_rate": 9.890774288124996e-05,
1249
+ "loss": 3.2276,
1250
+ "step": 16000
1251
+ },
1252
+ {
1253
+ "epoch": 15.094361877801369,
1254
+ "eval_loss": 3.4133388996124268,
1255
+ "eval_runtime": 89.9633,
1256
+ "eval_samples_per_second": 167.524,
1257
+ "eval_steps_per_second": 5.235,
1258
+ "step": 16000
1259
+ },
1260
+ {
1261
+ "epoch": 15.188723755602737,
1262
+ "grad_norm": 0.688949704170227,
1263
+ "learning_rate": 9.887116351587985e-05,
1264
+ "loss": 3.2447,
1265
+ "step": 16100
1266
+ },
1267
+ {
1268
+ "epoch": 15.283085633404104,
1269
+ "grad_norm": 0.6961474418640137,
1270
+ "learning_rate": 9.883398867080513e-05,
1271
+ "loss": 3.2392,
1272
+ "step": 16200
1273
+ },
1274
+ {
1275
+ "epoch": 15.377447511205473,
1276
+ "grad_norm": 0.681828498840332,
1277
+ "learning_rate": 9.87962187989895e-05,
1278
+ "loss": 3.2465,
1279
+ "step": 16300
1280
+ },
1281
+ {
1282
+ "epoch": 15.471809389006841,
1283
+ "grad_norm": 0.6817638874053955,
1284
+ "learning_rate": 9.875785436064697e-05,
1285
+ "loss": 3.2503,
1286
+ "step": 16400
1287
+ },
1288
+ {
1289
+ "epoch": 15.56617126680821,
1290
+ "grad_norm": 0.6779124736785889,
1291
+ "learning_rate": 9.871889582323609e-05,
1292
+ "loss": 3.2555,
1293
+ "step": 16500
1294
+ },
1295
+ {
1296
+ "epoch": 15.660533144609577,
1297
+ "grad_norm": 0.6662207841873169,
1298
+ "learning_rate": 9.867934366145435e-05,
1299
+ "loss": 3.263,
1300
+ "step": 16600
1301
+ },
1302
+ {
1303
+ "epoch": 15.754895022410945,
1304
+ "grad_norm": 0.691040575504303,
1305
+ "learning_rate": 9.863919835723236e-05,
1306
+ "loss": 3.2616,
1307
+ "step": 16700
1308
+ },
1309
+ {
1310
+ "epoch": 15.849256900212314,
1311
+ "grad_norm": 0.6542192101478577,
1312
+ "learning_rate": 9.859846039972798e-05,
1313
+ "loss": 3.2662,
1314
+ "step": 16800
1315
+ },
1316
+ {
1317
+ "epoch": 15.943618778013683,
1318
+ "grad_norm": 0.6532755494117737,
1319
+ "learning_rate": 9.855713028532036e-05,
1320
+ "loss": 3.2684,
1321
+ "step": 16900
1322
+ },
1323
+ {
1324
+ "epoch": 16.03774475112055,
1325
+ "grad_norm": 0.6761746406555176,
1326
+ "learning_rate": 9.851520851760394e-05,
1327
+ "loss": 3.2356,
1328
+ "step": 17000
1329
+ },
1330
+ {
1331
+ "epoch": 16.03774475112055,
1332
+ "eval_loss": 3.402015447616577,
1333
+ "eval_runtime": 89.929,
1334
+ "eval_samples_per_second": 167.588,
1335
+ "eval_steps_per_second": 5.237,
1336
+ "step": 17000
1337
+ },
1338
+ {
1339
+ "epoch": 16.132106628921914,
1340
+ "grad_norm": 0.6820452809333801,
1341
+ "learning_rate": 9.847269560738218e-05,
1342
+ "loss": 3.1936,
1343
+ "step": 17100
1344
+ },
1345
+ {
1346
+ "epoch": 16.226468506723283,
1347
+ "grad_norm": 0.6789988875389099,
1348
+ "learning_rate": 9.842959207266149e-05,
1349
+ "loss": 3.2047,
1350
+ "step": 17200
1351
+ },
1352
+ {
1353
+ "epoch": 16.32083038452465,
1354
+ "grad_norm": 0.6698039174079895,
1355
+ "learning_rate": 9.838589843864484e-05,
1356
+ "loss": 3.2103,
1357
+ "step": 17300
1358
+ },
1359
+ {
1360
+ "epoch": 16.41519226232602,
1361
+ "grad_norm": 0.6566837430000305,
1362
+ "learning_rate": 9.834161523772539e-05,
1363
+ "loss": 3.2203,
1364
+ "step": 17400
1365
+ },
1366
+ {
1367
+ "epoch": 16.50955414012739,
1368
+ "grad_norm": 0.677543044090271,
1369
+ "learning_rate": 9.829674300947993e-05,
1370
+ "loss": 3.222,
1371
+ "step": 17500
1372
+ },
1373
+ {
1374
+ "epoch": 16.603916017928757,
1375
+ "grad_norm": 0.679976761341095,
1376
+ "learning_rate": 9.825128230066244e-05,
1377
+ "loss": 3.2282,
1378
+ "step": 17600
1379
+ },
1380
+ {
1381
+ "epoch": 16.698277895730126,
1382
+ "grad_norm": 0.670319676399231,
1383
+ "learning_rate": 9.82052336651973e-05,
1384
+ "loss": 3.2225,
1385
+ "step": 17700
1386
+ },
1387
+ {
1388
+ "epoch": 16.792639773531494,
1389
+ "grad_norm": 0.6647588610649109,
1390
+ "learning_rate": 9.815859766417257e-05,
1391
+ "loss": 3.2326,
1392
+ "step": 17800
1393
+ },
1394
+ {
1395
+ "epoch": 16.887001651332863,
1396
+ "grad_norm": 0.6643775701522827,
1397
+ "learning_rate": 9.811137486583324e-05,
1398
+ "loss": 3.2256,
1399
+ "step": 17900
1400
+ },
1401
+ {
1402
+ "epoch": 16.981363529134228,
1403
+ "grad_norm": 0.6705678701400757,
1404
+ "learning_rate": 9.806356584557419e-05,
1405
+ "loss": 3.2403,
1406
+ "step": 18000
1407
+ },
1408
+ {
1409
+ "epoch": 16.981363529134228,
1410
+ "eval_loss": 3.387256622314453,
1411
+ "eval_runtime": 89.9338,
1412
+ "eval_samples_per_second": 167.579,
1413
+ "eval_steps_per_second": 5.237,
1414
+ "step": 18000
1415
+ },
1416
+ {
1417
+ "epoch": 17.075489502241094,
1418
+ "grad_norm": 0.6900054216384888,
1419
+ "learning_rate": 9.801517118593327e-05,
1420
+ "loss": 3.1775,
1421
+ "step": 18100
1422
+ },
1423
+ {
1424
+ "epoch": 17.169851380042463,
1425
+ "grad_norm": 0.6650823950767517,
1426
+ "learning_rate": 9.796619147658408e-05,
1427
+ "loss": 3.1641,
1428
+ "step": 18200
1429
+ },
1430
+ {
1431
+ "epoch": 17.26421325784383,
1432
+ "grad_norm": 0.6726897358894348,
1433
+ "learning_rate": 9.791662731432898e-05,
1434
+ "loss": 3.175,
1435
+ "step": 18300
1436
+ },
1437
+ {
1438
+ "epoch": 17.3585751356452,
1439
+ "grad_norm": 0.6691387295722961,
1440
+ "learning_rate": 9.78664793030916e-05,
1441
+ "loss": 3.1834,
1442
+ "step": 18400
1443
+ },
1444
+ {
1445
+ "epoch": 17.45293701344657,
1446
+ "grad_norm": 0.6631948351860046,
1447
+ "learning_rate": 9.781574805390967e-05,
1448
+ "loss": 3.1814,
1449
+ "step": 18500
1450
+ },
1451
+ {
1452
+ "epoch": 17.547298891247937,
1453
+ "grad_norm": 0.6776889562606812,
1454
+ "learning_rate": 9.776443418492744e-05,
1455
+ "loss": 3.1934,
1456
+ "step": 18600
1457
+ },
1458
+ {
1459
+ "epoch": 17.641660769049302,
1460
+ "grad_norm": 0.6866058111190796,
1461
+ "learning_rate": 9.771253832138819e-05,
1462
+ "loss": 3.1933,
1463
+ "step": 18700
1464
+ },
1465
+ {
1466
+ "epoch": 17.73602264685067,
1467
+ "grad_norm": 0.6719706058502197,
1468
+ "learning_rate": 9.766006109562664e-05,
1469
+ "loss": 3.1993,
1470
+ "step": 18800
1471
+ },
1472
+ {
1473
+ "epoch": 17.83038452465204,
1474
+ "grad_norm": 0.6513810753822327,
1475
+ "learning_rate": 9.760700314706125e-05,
1476
+ "loss": 3.21,
1477
+ "step": 18900
1478
+ },
1479
+ {
1480
+ "epoch": 17.924746402453408,
1481
+ "grad_norm": 0.6892839074134827,
1482
+ "learning_rate": 9.755336512218638e-05,
1483
+ "loss": 3.2045,
1484
+ "step": 19000
1485
+ },
1486
+ {
1487
+ "epoch": 17.924746402453408,
1488
+ "eval_loss": 3.3803834915161133,
1489
+ "eval_runtime": 89.9356,
1490
+ "eval_samples_per_second": 167.575,
1491
+ "eval_steps_per_second": 5.237,
1492
+ "step": 19000
1493
+ },
1494
+ {
1495
+ "epoch": 18.018872375560274,
1496
+ "grad_norm": 0.671567440032959,
1497
+ "learning_rate": 9.749914767456441e-05,
1498
+ "loss": 3.1867,
1499
+ "step": 19100
1500
+ },
1501
+ {
1502
+ "epoch": 18.113234253361643,
1503
+ "grad_norm": 0.6859995126724243,
1504
+ "learning_rate": 9.744435146481785e-05,
1505
+ "loss": 3.1267,
1506
+ "step": 19200
1507
+ },
1508
+ {
1509
+ "epoch": 18.20759613116301,
1510
+ "grad_norm": 0.6942476630210876,
1511
+ "learning_rate": 9.738897716062121e-05,
1512
+ "loss": 3.1458,
1513
+ "step": 19300
1514
+ },
1515
+ {
1516
+ "epoch": 18.301958008964377,
1517
+ "grad_norm": 0.6862732768058777,
1518
+ "learning_rate": 9.733302543669291e-05,
1519
+ "loss": 3.151,
1520
+ "step": 19400
1521
+ },
1522
+ {
1523
+ "epoch": 18.396319886765745,
1524
+ "grad_norm": 0.6695058941841125,
1525
+ "learning_rate": 9.727649697478708e-05,
1526
+ "loss": 3.1599,
1527
+ "step": 19500
1528
+ },
1529
+ {
1530
+ "epoch": 18.490681764567114,
1531
+ "grad_norm": 0.6894610524177551,
1532
+ "learning_rate": 9.721939246368515e-05,
1533
+ "loss": 3.1535,
1534
+ "step": 19600
1535
+ },
1536
+ {
1537
+ "epoch": 18.585043642368483,
1538
+ "grad_norm": 0.65924471616745,
1539
+ "learning_rate": 9.716171259918758e-05,
1540
+ "loss": 3.1606,
1541
+ "step": 19700
1542
+ },
1543
+ {
1544
+ "epoch": 18.67940552016985,
1545
+ "grad_norm": 0.6839491724967957,
1546
+ "learning_rate": 9.710345808410532e-05,
1547
+ "loss": 3.1706,
1548
+ "step": 19800
1549
+ },
1550
+ {
1551
+ "epoch": 18.77376739797122,
1552
+ "grad_norm": 0.6813986897468567,
1553
+ "learning_rate": 9.704462962825124e-05,
1554
+ "loss": 3.1755,
1555
+ "step": 19900
1556
+ },
1557
+ {
1558
+ "epoch": 18.86812927577259,
1559
+ "grad_norm": 0.677698016166687,
1560
+ "learning_rate": 9.698522794843154e-05,
1561
+ "loss": 3.1827,
1562
+ "step": 20000
1563
+ },
1564
+ {
1565
+ "epoch": 18.86812927577259,
1566
+ "eval_loss": 3.3754522800445557,
1567
+ "eval_runtime": 89.9428,
1568
+ "eval_samples_per_second": 167.562,
1569
+ "eval_steps_per_second": 5.237,
1570
+ "step": 20000
1571
+ }
1572
+ ],
1573
+ "logging_steps": 100,
1574
+ "max_steps": 100000,
1575
+ "num_input_tokens_seen": 0,
1576
+ "num_train_epochs": 95,
1577
+ "save_steps": 5000,
1578
+ "stateful_callbacks": {
1579
+ "TrainerControl": {
1580
+ "args": {
1581
+ "should_epoch_stop": false,
1582
+ "should_evaluate": false,
1583
+ "should_log": false,
1584
+ "should_save": true,
1585
+ "should_training_stop": false
1586
+ },
1587
+ "attributes": {}
1588
+ }
1589
+ },
1590
+ "total_flos": 6.6869595537408e+17,
1591
+ "train_batch_size": 32,
1592
+ "trial_name": null,
1593
+ "trial_params": null
1594
+ }
checkpoint-20000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:957cda2979c68b0633284b2934027f8314c64d83a5adc00047c3d99cde26be4b
3
+ size 5432
checkpoint-20000/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-80000/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-12,
13
+ "model_type": "gpt2",
14
+ "n_embd": 768,
15
+ "n_head": 12,
16
+ "n_inner": 3072,
17
+ "n_layer": 12,
18
+ "n_positions": 1024,
19
+ "pad_token_id": 50256,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "transformers_version": "4.57.1",
30
+ "use_cache": false,
31
+ "vocab_size": 50257
32
+ }
checkpoint-80000/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": [
5
+ 50256
6
+ ],
7
+ "pad_token_id": 50256,
8
+ "transformers_version": "4.57.1"
9
+ }
checkpoint-80000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-80000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f739c2e20fecc880100925cc6b2ce918af1f005622ef858ee4e45306bd2dcc86
3
+ size 497774208
checkpoint-80000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a398069ba74167566ee6e402d21808ef801da7c29dab9c0fd92ed174aa93a69
3
+ size 995642298
checkpoint-80000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55b50e8305ac657ea79c24b79ab1e4479e249aa89323442c2b87b005533c135f
3
+ size 14244
checkpoint-80000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32ff1c11e4894b5e7e282f5008b74cc430c25fff808f615cdf0f627d1157b43d
3
+ size 1064
checkpoint-80000/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
checkpoint-80000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-80000/tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "bos_token": "<|endoftext|>",
14
+ "clean_up_tokenization_spaces": false,
15
+ "eos_token": "<|endoftext|>",
16
+ "extra_special_tokens": {},
17
+ "model_max_length": 1024,
18
+ "pad_token": "<|endoftext|>",
19
+ "tokenizer_class": "GPT2Tokenizer",
20
+ "unk_token": "<|endoftext|>"
21
+ }
checkpoint-80000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-80000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:957cda2979c68b0633284b2934027f8314c64d83a5adc00047c3d99cde26be4b
3
+ size 5432
checkpoint-80000/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-85000/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-12,
13
+ "model_type": "gpt2",
14
+ "n_embd": 768,
15
+ "n_head": 12,
16
+ "n_inner": 3072,
17
+ "n_layer": 12,
18
+ "n_positions": 1024,
19
+ "pad_token_id": 50256,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "transformers_version": "4.57.1",
30
+ "use_cache": false,
31
+ "vocab_size": 50257
32
+ }
checkpoint-85000/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": [
5
+ 50256
6
+ ],
7
+ "pad_token_id": 50256,
8
+ "transformers_version": "4.57.1"
9
+ }
checkpoint-85000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-85000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02869a0c70ea08672453c9b90f10fd5ada1d4dda2a8b63ffe3dc32d335d5a7b6
3
+ size 497774208
checkpoint-85000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee96fa7338788756ace617c632e0c64b1156ae34253da434afd9237cd29b8f9c
3
+ size 995642298
checkpoint-85000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88127b5daf4718fb4a591df0ff25e4fc8035057dcbb94eb84c9d99a8b6bf684a
3
+ size 14244
checkpoint-85000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee02bcd5f604a40b6bfc7bdb366ad2852db969eff753f24f2a39e11d755c9c48
3
+ size 1064
checkpoint-85000/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
checkpoint-85000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-85000/tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "bos_token": "<|endoftext|>",
14
+ "clean_up_tokenization_spaces": false,
15
+ "eos_token": "<|endoftext|>",
16
+ "extra_special_tokens": {},
17
+ "model_max_length": 1024,
18
+ "pad_token": "<|endoftext|>",
19
+ "tokenizer_class": "GPT2Tokenizer",
20
+ "unk_token": "<|endoftext|>"
21
+ }
checkpoint-85000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-85000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:957cda2979c68b0633284b2934027f8314c64d83a5adc00047c3d99cde26be4b
3
+ size 5432
checkpoint-85000/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-90000/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-12,
13
+ "model_type": "gpt2",
14
+ "n_embd": 768,
15
+ "n_head": 12,
16
+ "n_inner": 3072,
17
+ "n_layer": 12,
18
+ "n_positions": 1024,
19
+ "pad_token_id": 50256,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "transformers_version": "4.57.1",
30
+ "use_cache": false,
31
+ "vocab_size": 50257
32
+ }
checkpoint-90000/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": [
5
+ 50256
6
+ ],
7
+ "pad_token_id": 50256,
8
+ "transformers_version": "4.57.1"
9
+ }
checkpoint-90000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-90000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3237f4700e803890a6e9019e2e0fa8df8b709af12c64793a3adab195589f7d11
3
+ size 497774208
checkpoint-90000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fae2142ee7a722d78f6e8ad559e9903e1e2396b2ed800d305db89edf1911d380
3
+ size 995642298
checkpoint-90000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bc71c2bbc9f5ec50891e422471377fac0e60017caea1d1e31a441cd3ffed609
3
+ size 14244
checkpoint-90000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50300a1a7032aa664901bea90ce70e91cf6ac0ae95bab8681aa8bae002d20396
3
+ size 1064
checkpoint-90000/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
checkpoint-90000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-90000/tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "bos_token": "<|endoftext|>",
14
+ "clean_up_tokenization_spaces": false,
15
+ "eos_token": "<|endoftext|>",
16
+ "extra_special_tokens": {},
17
+ "model_max_length": 1024,
18
+ "pad_token": "<|endoftext|>",
19
+ "tokenizer_class": "GPT2Tokenizer",
20
+ "unk_token": "<|endoftext|>"
21
+ }
checkpoint-90000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff