tomekkorbak commited on
Commit
b6c7f68
·
1 Parent(s): 237b8e0

Training in progress, step 34

Browse files
checkpoint-30/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "gpt2",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMAndValueHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "reorder_and_upcast_attn": true,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "text-generation": {
31
+ "do_sample": true,
32
+ "max_length": 50
33
+ }
34
+ },
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.20.1",
37
+ "use_cache": true,
38
+ "vocab_size": 50257
39
+ }
checkpoint-30/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-30/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29f35a10697a095a25917c4e6963fef15e916b41e15a7b1bba0b9a65f126e390
3
+ size 995603825
checkpoint-30/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6061184716d5a67a373d131bf775f944bc612fcfbe0f39f956e405e54f24a20
3
+ size 510396521
checkpoint-30/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b16c25b896cedad07e9ce24c91d964cdf0d50787d798daadcd393b8640b9744
3
+ size 14503
checkpoint-30/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae6492aeecfb622d4ccee1087e6eba7e09e214319c7745292fc9d5bfaf2c4189
3
+ size 559
checkpoint-30/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d14c8ce4e0a59517ca15865c59f5dcc587caa183c91c8f1a2f9ff639d7b57299
3
+ size 623
checkpoint-30/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
checkpoint-30/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-30/tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<|endoftext|>",
4
+ "eos_token": "<|endoftext|>",
5
+ "model_max_length": 1024,
6
+ "name_or_path": "gpt2",
7
+ "special_tokens_map_file": null,
8
+ "tokenizer_class": "GPT2Tokenizer",
9
+ "unk_token": "<|endoftext|>"
10
+ }
checkpoint-30/trainer_state.json ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.0005957818644000476,
5
+ "global_step": 30,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.0,
12
+ "learning_rate": 0.0001984126984126984,
13
+ "loss": 10.961,
14
+ "theoretical_loss": 20.81281780154715,
15
+ "tokens_seen": 65536
16
+ },
17
+ {
18
+ "epoch": 0.0,
19
+ "learning_rate": 0.0003968253968253968,
20
+ "loss": 10.9749,
21
+ "theoretical_loss": 17.566201104328645,
22
+ "tokens_seen": 131072
23
+ },
24
+ {
25
+ "epoch": 0.0,
26
+ "learning_rate": 0.0005952380952380953,
27
+ "loss": 9.683,
28
+ "theoretical_loss": 15.939477092836569,
29
+ "tokens_seen": 196608
30
+ },
31
+ {
32
+ "epoch": 0.0,
33
+ "learning_rate": 0.0007936507936507937,
34
+ "loss": 11.0117,
35
+ "theoretical_loss": 14.89231675598857,
36
+ "tokens_seen": 262144
37
+ },
38
+ {
39
+ "epoch": 0.0,
40
+ "learning_rate": 0.000992063492063492,
41
+ "loss": 9.1322,
42
+ "theoretical_loss": 14.136216937762974,
43
+ "tokens_seen": 327680
44
+ },
45
+ {
46
+ "epoch": 0.0,
47
+ "learning_rate": 0.0011904761904761906,
48
+ "loss": 9.58,
49
+ "theoretical_loss": 13.552561472550224,
50
+ "tokens_seen": 393216
51
+ },
52
+ {
53
+ "epoch": 0.0,
54
+ "learning_rate": 0.001388888888888889,
55
+ "loss": 8.4208,
56
+ "theoretical_loss": 13.08180900140119,
57
+ "tokens_seen": 458752
58
+ },
59
+ {
60
+ "epoch": 0.0,
61
+ "learning_rate": 0.0015873015873015873,
62
+ "loss": 8.0254,
63
+ "theoretical_loss": 12.690129625483323,
64
+ "tokens_seen": 524288
65
+ },
66
+ {
67
+ "epoch": 0.0,
68
+ "learning_rate": 0.0017857142857142857,
69
+ "loss": 7.547,
70
+ "theoretical_loss": 12.356592463873625,
71
+ "tokens_seen": 589824
72
+ },
73
+ {
74
+ "epoch": 0.0,
75
+ "learning_rate": 0.001984126984126984,
76
+ "loss": 7.1093,
77
+ "theoretical_loss": 12.067412607035077,
78
+ "tokens_seen": 655360
79
+ },
80
+ {
81
+ "epoch": 0.0,
82
+ "learning_rate": 0.0021825396825396826,
83
+ "loss": 7.2169,
84
+ "theoretical_loss": 11.813066231101676,
85
+ "tokens_seen": 720896
86
+ },
87
+ {
88
+ "epoch": 0.0,
89
+ "learning_rate": 0.002380952380952381,
90
+ "loss": 7.4624,
91
+ "theoretical_loss": 11.586719208706729,
92
+ "tokens_seen": 786432
93
+ },
94
+ {
95
+ "epoch": 0.0,
96
+ "objective/train/docs_used": 12399,
97
+ "objective/train/instantaneous_batch_size": 8,
98
+ "objective/train/instantaneous_microbatch_size": 8192,
99
+ "objective/train/original_loss": 8.1644926071167,
100
+ "objective/train/theoretical_loss": 11.482412519286804,
101
+ "objective/train/tokens_used": 21279200,
102
+ "theoretical_loss": 11.482412519286804,
103
+ "tokens_seen": 819200
104
+ },
105
+ {
106
+ "epoch": 0.0,
107
+ "learning_rate": 0.0025793650793650793,
108
+ "loss": 7.5937,
109
+ "theoretical_loss": 11.383314140186787,
110
+ "tokens_seen": 851968
111
+ },
112
+ {
113
+ "epoch": 0.0,
114
+ "learning_rate": 0.002777777777777778,
115
+ "loss": 7.1562,
116
+ "theoretical_loss": 11.199011702111871,
117
+ "tokens_seen": 917504
118
+ },
119
+ {
120
+ "epoch": 0.0,
121
+ "learning_rate": 0.002976190476190476,
122
+ "loss": 7.492,
123
+ "theoretical_loss": 11.030833917977912,
124
+ "tokens_seen": 983040
125
+ },
126
+ {
127
+ "epoch": 0.0,
128
+ "learning_rate": 0.0031746031746031746,
129
+ "loss": 7.5084,
130
+ "theoretical_loss": 10.87642808645695,
131
+ "tokens_seen": 1048576
132
+ },
133
+ {
134
+ "epoch": 0.0,
135
+ "learning_rate": 0.003373015873015873,
136
+ "loss": 7.4717,
137
+ "theoretical_loss": 10.733905740062724,
138
+ "tokens_seen": 1114112
139
+ },
140
+ {
141
+ "epoch": 0.0,
142
+ "learning_rate": 0.0035714285714285713,
143
+ "loss": 7.1015,
144
+ "theoretical_loss": 10.60172987623028,
145
+ "tokens_seen": 1179648
146
+ },
147
+ {
148
+ "epoch": 0.0,
149
+ "learning_rate": 0.00376984126984127,
150
+ "loss": 6.8498,
151
+ "theoretical_loss": 10.478634172356642,
152
+ "tokens_seen": 1245184
153
+ },
154
+ {
155
+ "epoch": 0.0,
156
+ "learning_rate": 0.003968253968253968,
157
+ "loss": 7.3766,
158
+ "theoretical_loss": 10.36356394376333,
159
+ "tokens_seen": 1310720
160
+ },
161
+ {
162
+ "epoch": 0.0,
163
+ "learning_rate": 0.004166666666666667,
164
+ "loss": 7.5166,
165
+ "theoretical_loss": 10.255632220896747,
166
+ "tokens_seen": 1376256
167
+ },
168
+ {
169
+ "epoch": 0.0,
170
+ "learning_rate": 0.004365079365079365,
171
+ "loss": 7.6046,
172
+ "theoretical_loss": 10.15408655327002,
173
+ "tokens_seen": 1441792
174
+ },
175
+ {
176
+ "epoch": 0.0,
177
+ "learning_rate": 0.004563492063492064,
178
+ "loss": 7.584,
179
+ "theoretical_loss": 10.058283561732598,
180
+ "tokens_seen": 1507328
181
+ },
182
+ {
183
+ "epoch": 0.0,
184
+ "learning_rate": 0.004761904761904762,
185
+ "loss": 7.8262,
186
+ "theoretical_loss": 9.967669178840278,
187
+ "tokens_seen": 1572864
188
+ },
189
+ {
190
+ "epoch": 0.0,
191
+ "objective/train/docs_used": 13112,
192
+ "objective/train/instantaneous_batch_size": 8,
193
+ "objective/train/instantaneous_microbatch_size": 8192,
194
+ "objective/train/original_loss": 7.06024694442749,
195
+ "objective/train/theoretical_loss": 9.881763126393109,
196
+ "objective/train/tokens_used": 22098400,
197
+ "theoretical_loss": 9.881763126393109,
198
+ "tokens_seen": 1638400
199
+ },
200
+ {
201
+ "epoch": 0.0,
202
+ "learning_rate": 0.00496031746031746,
203
+ "loss": 7.4158,
204
+ "theoretical_loss": 9.881763126393109,
205
+ "tokens_seen": 1638400
206
+ },
207
+ {
208
+ "epoch": 0.0,
209
+ "learning_rate": 0.005158730158730159,
210
+ "loss": 7.393,
211
+ "theoretical_loss": 9.80014659154056,
212
+ "tokens_seen": 1703936
213
+ },
214
+ {
215
+ "epoch": 0.0,
216
+ "learning_rate": 0.005357142857142857,
217
+ "loss": 7.1589,
218
+ "theoretical_loss": 9.722452346907446,
219
+ "tokens_seen": 1769472
220
+ },
221
+ {
222
+ "epoch": 0.0,
223
+ "learning_rate": 0.005555555555555556,
224
+ "loss": 7.4701,
225
+ "theoretical_loss": 9.648356759081546,
226
+ "tokens_seen": 1835008
227
+ },
228
+ {
229
+ "epoch": 0.0,
230
+ "learning_rate": 0.005753968253968254,
231
+ "loss": 7.3914,
232
+ "theoretical_loss": 9.577573271145639,
233
+ "tokens_seen": 1900544
234
+ },
235
+ {
236
+ "epoch": 0.0,
237
+ "learning_rate": 0.005952380952380952,
238
+ "loss": 7.1136,
239
+ "theoretical_loss": 9.509847046764852,
240
+ "tokens_seen": 1966080
241
+ }
242
+ ],
243
+ "max_steps": 50354,
244
+ "num_train_epochs": 9223372036854775807,
245
+ "total_flos": 1003361402880000.0,
246
+ "trial_name": null,
247
+ "trial_params": null
248
+ }
checkpoint-30/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:861274747fa77141802f480df74fe77f6d6c3bcdf6fc289e9c550ff0ab8c621e
3
+ size 3311
checkpoint-30/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-32/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "gpt2",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMAndValueHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "reorder_and_upcast_attn": true,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "text-generation": {
31
+ "do_sample": true,
32
+ "max_length": 50
33
+ }
34
+ },
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.20.1",
37
+ "use_cache": true,
38
+ "vocab_size": 50257
39
+ }
checkpoint-32/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-32/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c06b7cb0540805270d9dcb9ceeb9e5d0ef17dc0d7e2e3bdf4e2cb65fde8aca0
3
+ size 995603825
checkpoint-32/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ef3283f4c767715b420819bdb19220fc36a4c419aad50182b74f5aaedb0b2cb
3
+ size 510396521
checkpoint-32/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bc5c0c86dfb78efc643eedc2282a14851849952e4e485502760811b052ebc6e
3
+ size 14439
checkpoint-32/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:412d8423553991c0d0f2016856ceaf8ccf9dcc442423d98f152d56c0997dbe44
3
+ size 559
checkpoint-32/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1830b3173e37b421dd6b8636c11660495f63bce2d95756dd10c3facbb5dd5af8
3
+ size 623
checkpoint-32/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
checkpoint-32/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-32/tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<|endoftext|>",
4
+ "eos_token": "<|endoftext|>",
5
+ "model_max_length": 1024,
6
+ "name_or_path": "gpt2",
7
+ "special_tokens_map_file": null,
8
+ "tokenizer_class": "GPT2Tokenizer",
9
+ "unk_token": "<|endoftext|>"
10
+ }
checkpoint-32/trainer_state.json ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.0006355006553600508,
5
+ "global_step": 32,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.0,
12
+ "learning_rate": 0.0001984126984126984,
13
+ "loss": 10.961,
14
+ "theoretical_loss": 20.81281780154715,
15
+ "tokens_seen": 65536
16
+ },
17
+ {
18
+ "epoch": 0.0,
19
+ "learning_rate": 0.0003968253968253968,
20
+ "loss": 10.9749,
21
+ "theoretical_loss": 17.566201104328645,
22
+ "tokens_seen": 131072
23
+ },
24
+ {
25
+ "epoch": 0.0,
26
+ "learning_rate": 0.0005952380952380953,
27
+ "loss": 9.683,
28
+ "theoretical_loss": 15.939477092836569,
29
+ "tokens_seen": 196608
30
+ },
31
+ {
32
+ "epoch": 0.0,
33
+ "learning_rate": 0.0007936507936507937,
34
+ "loss": 11.0117,
35
+ "theoretical_loss": 14.89231675598857,
36
+ "tokens_seen": 262144
37
+ },
38
+ {
39
+ "epoch": 0.0,
40
+ "learning_rate": 0.000992063492063492,
41
+ "loss": 9.1322,
42
+ "theoretical_loss": 14.136216937762974,
43
+ "tokens_seen": 327680
44
+ },
45
+ {
46
+ "epoch": 0.0,
47
+ "learning_rate": 0.0011904761904761906,
48
+ "loss": 9.58,
49
+ "theoretical_loss": 13.552561472550224,
50
+ "tokens_seen": 393216
51
+ },
52
+ {
53
+ "epoch": 0.0,
54
+ "learning_rate": 0.001388888888888889,
55
+ "loss": 8.4208,
56
+ "theoretical_loss": 13.08180900140119,
57
+ "tokens_seen": 458752
58
+ },
59
+ {
60
+ "epoch": 0.0,
61
+ "learning_rate": 0.0015873015873015873,
62
+ "loss": 8.0254,
63
+ "theoretical_loss": 12.690129625483323,
64
+ "tokens_seen": 524288
65
+ },
66
+ {
67
+ "epoch": 0.0,
68
+ "learning_rate": 0.0017857142857142857,
69
+ "loss": 7.547,
70
+ "theoretical_loss": 12.356592463873625,
71
+ "tokens_seen": 589824
72
+ },
73
+ {
74
+ "epoch": 0.0,
75
+ "learning_rate": 0.001984126984126984,
76
+ "loss": 7.1093,
77
+ "theoretical_loss": 12.067412607035077,
78
+ "tokens_seen": 655360
79
+ },
80
+ {
81
+ "epoch": 0.0,
82
+ "learning_rate": 0.0021825396825396826,
83
+ "loss": 7.2169,
84
+ "theoretical_loss": 11.813066231101676,
85
+ "tokens_seen": 720896
86
+ },
87
+ {
88
+ "epoch": 0.0,
89
+ "learning_rate": 0.002380952380952381,
90
+ "loss": 7.4624,
91
+ "theoretical_loss": 11.586719208706729,
92
+ "tokens_seen": 786432
93
+ },
94
+ {
95
+ "epoch": 0.0,
96
+ "objective/train/docs_used": 12399,
97
+ "objective/train/instantaneous_batch_size": 8,
98
+ "objective/train/instantaneous_microbatch_size": 8192,
99
+ "objective/train/original_loss": 8.1644926071167,
100
+ "objective/train/theoretical_loss": 11.482412519286804,
101
+ "objective/train/tokens_used": 21279200,
102
+ "theoretical_loss": 11.482412519286804,
103
+ "tokens_seen": 819200
104
+ },
105
+ {
106
+ "epoch": 0.0,
107
+ "learning_rate": 0.0025793650793650793,
108
+ "loss": 7.5937,
109
+ "theoretical_loss": 11.383314140186787,
110
+ "tokens_seen": 851968
111
+ },
112
+ {
113
+ "epoch": 0.0,
114
+ "learning_rate": 0.002777777777777778,
115
+ "loss": 7.1562,
116
+ "theoretical_loss": 11.199011702111871,
117
+ "tokens_seen": 917504
118
+ },
119
+ {
120
+ "epoch": 0.0,
121
+ "learning_rate": 0.002976190476190476,
122
+ "loss": 7.492,
123
+ "theoretical_loss": 11.030833917977912,
124
+ "tokens_seen": 983040
125
+ },
126
+ {
127
+ "epoch": 0.0,
128
+ "learning_rate": 0.0031746031746031746,
129
+ "loss": 7.5084,
130
+ "theoretical_loss": 10.87642808645695,
131
+ "tokens_seen": 1048576
132
+ },
133
+ {
134
+ "epoch": 0.0,
135
+ "learning_rate": 0.003373015873015873,
136
+ "loss": 7.4717,
137
+ "theoretical_loss": 10.733905740062724,
138
+ "tokens_seen": 1114112
139
+ },
140
+ {
141
+ "epoch": 0.0,
142
+ "learning_rate": 0.0035714285714285713,
143
+ "loss": 7.1015,
144
+ "theoretical_loss": 10.60172987623028,
145
+ "tokens_seen": 1179648
146
+ },
147
+ {
148
+ "epoch": 0.0,
149
+ "learning_rate": 0.00376984126984127,
150
+ "loss": 6.8498,
151
+ "theoretical_loss": 10.478634172356642,
152
+ "tokens_seen": 1245184
153
+ },
154
+ {
155
+ "epoch": 0.0,
156
+ "learning_rate": 0.003968253968253968,
157
+ "loss": 7.3766,
158
+ "theoretical_loss": 10.36356394376333,
159
+ "tokens_seen": 1310720
160
+ },
161
+ {
162
+ "epoch": 0.0,
163
+ "learning_rate": 0.004166666666666667,
164
+ "loss": 7.5166,
165
+ "theoretical_loss": 10.255632220896747,
166
+ "tokens_seen": 1376256
167
+ },
168
+ {
169
+ "epoch": 0.0,
170
+ "learning_rate": 0.004365079365079365,
171
+ "loss": 7.6046,
172
+ "theoretical_loss": 10.15408655327002,
173
+ "tokens_seen": 1441792
174
+ },
175
+ {
176
+ "epoch": 0.0,
177
+ "learning_rate": 0.004563492063492064,
178
+ "loss": 7.584,
179
+ "theoretical_loss": 10.058283561732598,
180
+ "tokens_seen": 1507328
181
+ },
182
+ {
183
+ "epoch": 0.0,
184
+ "learning_rate": 0.004761904761904762,
185
+ "loss": 7.8262,
186
+ "theoretical_loss": 9.967669178840278,
187
+ "tokens_seen": 1572864
188
+ },
189
+ {
190
+ "epoch": 0.0,
191
+ "objective/train/docs_used": 13112,
192
+ "objective/train/instantaneous_batch_size": 8,
193
+ "objective/train/instantaneous_microbatch_size": 8192,
194
+ "objective/train/original_loss": 7.06024694442749,
195
+ "objective/train/theoretical_loss": 9.881763126393109,
196
+ "objective/train/tokens_used": 22098400,
197
+ "theoretical_loss": 9.881763126393109,
198
+ "tokens_seen": 1638400
199
+ },
200
+ {
201
+ "epoch": 0.0,
202
+ "learning_rate": 0.00496031746031746,
203
+ "loss": 7.4158,
204
+ "theoretical_loss": 9.881763126393109,
205
+ "tokens_seen": 1638400
206
+ },
207
+ {
208
+ "epoch": 0.0,
209
+ "learning_rate": 0.005158730158730159,
210
+ "loss": 7.393,
211
+ "theoretical_loss": 9.80014659154056,
212
+ "tokens_seen": 1703936
213
+ },
214
+ {
215
+ "epoch": 0.0,
216
+ "learning_rate": 0.005357142857142857,
217
+ "loss": 7.1589,
218
+ "theoretical_loss": 9.722452346907446,
219
+ "tokens_seen": 1769472
220
+ },
221
+ {
222
+ "epoch": 0.0,
223
+ "learning_rate": 0.005555555555555556,
224
+ "loss": 7.4701,
225
+ "theoretical_loss": 9.648356759081546,
226
+ "tokens_seen": 1835008
227
+ },
228
+ {
229
+ "epoch": 0.0,
230
+ "learning_rate": 0.005753968253968254,
231
+ "loss": 7.3914,
232
+ "theoretical_loss": 9.577573271145639,
233
+ "tokens_seen": 1900544
234
+ },
235
+ {
236
+ "epoch": 0.0,
237
+ "learning_rate": 0.005952380952380952,
238
+ "loss": 7.1136,
239
+ "theoretical_loss": 9.509847046764852,
240
+ "tokens_seen": 1966080
241
+ },
242
+ {
243
+ "epoch": 0.0,
244
+ "learning_rate": 0.006150793650793651,
245
+ "loss": 7.5276,
246
+ "theoretical_loss": 9.444950537631936,
247
+ "tokens_seen": 2031616
248
+ },
249
+ {
250
+ "epoch": 0.0,
251
+ "learning_rate": 0.006349206349206349,
252
+ "loss": 7.1837,
253
+ "theoretical_loss": 9.382679790910457,
254
+ "tokens_seen": 2097152
255
+ }
256
+ ],
257
+ "max_steps": 50354,
258
+ "num_train_epochs": 9223372036854775807,
259
+ "total_flos": 1070252163072000.0,
260
+ "trial_name": null,
261
+ "trial_params": null
262
+ }
checkpoint-32/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:861274747fa77141802f480df74fe77f6d6c3bcdf6fc289e9c550ff0ab8c621e
3
+ size 3311
checkpoint-32/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-34/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "gpt2",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMAndValueHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "reorder_and_upcast_attn": true,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "text-generation": {
31
+ "do_sample": true,
32
+ "max_length": 50
33
+ }
34
+ },
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.20.1",
37
+ "use_cache": true,
38
+ "vocab_size": 50257
39
+ }
checkpoint-34/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-34/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:283261ab731a31a58ca904e1f78b372ba1cfbc7890796394cd3ce60448479b9b
3
+ size 995603825
checkpoint-34/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:058810adf5194c07e6b081a2ed1c35dba558237a8460bbafa66ff1005b9dd61a
3
+ size 510396521
checkpoint-34/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93e3d064f3d702a7f31db602c3cf70042e9b15b03525b8a7339e0b80aa67ac3b
3
+ size 14439
checkpoint-34/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de7098048aeb0fbb1188f12c033c3f3315e2bfd7819e04ebc7f02c4eaf1968b9
3
+ size 559
checkpoint-34/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dc3149ea2c28860823f911c662132f7d03217c356e5f8901942a7b6ea5ee64c
3
+ size 623
checkpoint-34/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
checkpoint-34/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-34/tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<|endoftext|>",
4
+ "eos_token": "<|endoftext|>",
5
+ "model_max_length": 1024,
6
+ "name_or_path": "gpt2",
7
+ "special_tokens_map_file": null,
8
+ "tokenizer_class": "GPT2Tokenizer",
9
+ "unk_token": "<|endoftext|>"
10
+ }
checkpoint-34/trainer_state.json ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.0006752194463200541,
5
+ "global_step": 34,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.0,
12
+ "learning_rate": 0.0001984126984126984,
13
+ "loss": 10.961,
14
+ "theoretical_loss": 20.81281780154715,
15
+ "tokens_seen": 65536
16
+ },
17
+ {
18
+ "epoch": 0.0,
19
+ "learning_rate": 0.0003968253968253968,
20
+ "loss": 10.9749,
21
+ "theoretical_loss": 17.566201104328645,
22
+ "tokens_seen": 131072
23
+ },
24
+ {
25
+ "epoch": 0.0,
26
+ "learning_rate": 0.0005952380952380953,
27
+ "loss": 9.683,
28
+ "theoretical_loss": 15.939477092836569,
29
+ "tokens_seen": 196608
30
+ },
31
+ {
32
+ "epoch": 0.0,
33
+ "learning_rate": 0.0007936507936507937,
34
+ "loss": 11.0117,
35
+ "theoretical_loss": 14.89231675598857,
36
+ "tokens_seen": 262144
37
+ },
38
+ {
39
+ "epoch": 0.0,
40
+ "learning_rate": 0.000992063492063492,
41
+ "loss": 9.1322,
42
+ "theoretical_loss": 14.136216937762974,
43
+ "tokens_seen": 327680
44
+ },
45
+ {
46
+ "epoch": 0.0,
47
+ "learning_rate": 0.0011904761904761906,
48
+ "loss": 9.58,
49
+ "theoretical_loss": 13.552561472550224,
50
+ "tokens_seen": 393216
51
+ },
52
+ {
53
+ "epoch": 0.0,
54
+ "learning_rate": 0.001388888888888889,
55
+ "loss": 8.4208,
56
+ "theoretical_loss": 13.08180900140119,
57
+ "tokens_seen": 458752
58
+ },
59
+ {
60
+ "epoch": 0.0,
61
+ "learning_rate": 0.0015873015873015873,
62
+ "loss": 8.0254,
63
+ "theoretical_loss": 12.690129625483323,
64
+ "tokens_seen": 524288
65
+ },
66
+ {
67
+ "epoch": 0.0,
68
+ "learning_rate": 0.0017857142857142857,
69
+ "loss": 7.547,
70
+ "theoretical_loss": 12.356592463873625,
71
+ "tokens_seen": 589824
72
+ },
73
+ {
74
+ "epoch": 0.0,
75
+ "learning_rate": 0.001984126984126984,
76
+ "loss": 7.1093,
77
+ "theoretical_loss": 12.067412607035077,
78
+ "tokens_seen": 655360
79
+ },
80
+ {
81
+ "epoch": 0.0,
82
+ "learning_rate": 0.0021825396825396826,
83
+ "loss": 7.2169,
84
+ "theoretical_loss": 11.813066231101676,
85
+ "tokens_seen": 720896
86
+ },
87
+ {
88
+ "epoch": 0.0,
89
+ "learning_rate": 0.002380952380952381,
90
+ "loss": 7.4624,
91
+ "theoretical_loss": 11.586719208706729,
92
+ "tokens_seen": 786432
93
+ },
94
+ {
95
+ "epoch": 0.0,
96
+ "objective/train/docs_used": 12399,
97
+ "objective/train/instantaneous_batch_size": 8,
98
+ "objective/train/instantaneous_microbatch_size": 8192,
99
+ "objective/train/original_loss": 8.1644926071167,
100
+ "objective/train/theoretical_loss": 11.482412519286804,
101
+ "objective/train/tokens_used": 21279200,
102
+ "theoretical_loss": 11.482412519286804,
103
+ "tokens_seen": 819200
104
+ },
105
+ {
106
+ "epoch": 0.0,
107
+ "learning_rate": 0.0025793650793650793,
108
+ "loss": 7.5937,
109
+ "theoretical_loss": 11.383314140186787,
110
+ "tokens_seen": 851968
111
+ },
112
+ {
113
+ "epoch": 0.0,
114
+ "learning_rate": 0.002777777777777778,
115
+ "loss": 7.1562,
116
+ "theoretical_loss": 11.199011702111871,
117
+ "tokens_seen": 917504
118
+ },
119
+ {
120
+ "epoch": 0.0,
121
+ "learning_rate": 0.002976190476190476,
122
+ "loss": 7.492,
123
+ "theoretical_loss": 11.030833917977912,
124
+ "tokens_seen": 983040
125
+ },
126
+ {
127
+ "epoch": 0.0,
128
+ "learning_rate": 0.0031746031746031746,
129
+ "loss": 7.5084,
130
+ "theoretical_loss": 10.87642808645695,
131
+ "tokens_seen": 1048576
132
+ },
133
+ {
134
+ "epoch": 0.0,
135
+ "learning_rate": 0.003373015873015873,
136
+ "loss": 7.4717,
137
+ "theoretical_loss": 10.733905740062724,
138
+ "tokens_seen": 1114112
139
+ },
140
+ {
141
+ "epoch": 0.0,
142
+ "learning_rate": 0.0035714285714285713,
143
+ "loss": 7.1015,
144
+ "theoretical_loss": 10.60172987623028,
145
+ "tokens_seen": 1179648
146
+ },
147
+ {
148
+ "epoch": 0.0,
149
+ "learning_rate": 0.00376984126984127,
150
+ "loss": 6.8498,
151
+ "theoretical_loss": 10.478634172356642,
152
+ "tokens_seen": 1245184
153
+ },
154
+ {
155
+ "epoch": 0.0,
156
+ "learning_rate": 0.003968253968253968,
157
+ "loss": 7.3766,
158
+ "theoretical_loss": 10.36356394376333,
159
+ "tokens_seen": 1310720
160
+ },
161
+ {
162
+ "epoch": 0.0,
163
+ "learning_rate": 0.004166666666666667,
164
+ "loss": 7.5166,
165
+ "theoretical_loss": 10.255632220896747,
166
+ "tokens_seen": 1376256
167
+ },
168
+ {
169
+ "epoch": 0.0,
170
+ "learning_rate": 0.004365079365079365,
171
+ "loss": 7.6046,
172
+ "theoretical_loss": 10.15408655327002,
173
+ "tokens_seen": 1441792
174
+ },
175
+ {
176
+ "epoch": 0.0,
177
+ "learning_rate": 0.004563492063492064,
178
+ "loss": 7.584,
179
+ "theoretical_loss": 10.058283561732598,
180
+ "tokens_seen": 1507328
181
+ },
182
+ {
183
+ "epoch": 0.0,
184
+ "learning_rate": 0.004761904761904762,
185
+ "loss": 7.8262,
186
+ "theoretical_loss": 9.967669178840278,
187
+ "tokens_seen": 1572864
188
+ },
189
+ {
190
+ "epoch": 0.0,
191
+ "objective/train/docs_used": 13112,
192
+ "objective/train/instantaneous_batch_size": 8,
193
+ "objective/train/instantaneous_microbatch_size": 8192,
194
+ "objective/train/original_loss": 7.06024694442749,
195
+ "objective/train/theoretical_loss": 9.881763126393109,
196
+ "objective/train/tokens_used": 22098400,
197
+ "theoretical_loss": 9.881763126393109,
198
+ "tokens_seen": 1638400
199
+ },
200
+ {
201
+ "epoch": 0.0,
202
+ "learning_rate": 0.00496031746031746,
203
+ "loss": 7.4158,
204
+ "theoretical_loss": 9.881763126393109,
205
+ "tokens_seen": 1638400
206
+ },
207
+ {
208
+ "epoch": 0.0,
209
+ "learning_rate": 0.005158730158730159,
210
+ "loss": 7.393,
211
+ "theoretical_loss": 9.80014659154056,
212
+ "tokens_seen": 1703936
213
+ },
214
+ {
215
+ "epoch": 0.0,
216
+ "learning_rate": 0.005357142857142857,
217
+ "loss": 7.1589,
218
+ "theoretical_loss": 9.722452346907446,
219
+ "tokens_seen": 1769472
220
+ },
221
+ {
222
+ "epoch": 0.0,
223
+ "learning_rate": 0.005555555555555556,
224
+ "loss": 7.4701,
225
+ "theoretical_loss": 9.648356759081546,
226
+ "tokens_seen": 1835008
227
+ },
228
+ {
229
+ "epoch": 0.0,
230
+ "learning_rate": 0.005753968253968254,
231
+ "loss": 7.3914,
232
+ "theoretical_loss": 9.577573271145639,
233
+ "tokens_seen": 1900544
234
+ },
235
+ {
236
+ "epoch": 0.0,
237
+ "learning_rate": 0.005952380952380952,
238
+ "loss": 7.1136,
239
+ "theoretical_loss": 9.509847046764852,
240
+ "tokens_seen": 1966080
241
+ },
242
+ {
243
+ "epoch": 0.0,
244
+ "learning_rate": 0.006150793650793651,
245
+ "loss": 7.5276,
246
+ "theoretical_loss": 9.444950537631936,
247
+ "tokens_seen": 2031616
248
+ },
249
+ {
250
+ "epoch": 0.0,
251
+ "learning_rate": 0.006349206349206349,
252
+ "loss": 7.1837,
253
+ "theoretical_loss": 9.382679790910457,
254
+ "tokens_seen": 2097152
255
+ },
256
+ {
257
+ "epoch": 0.0,
258
+ "learning_rate": 0.006547619047619049,
259
+ "loss": 7.1127,
260
+ "theoretical_loss": 9.32285135423398,
261
+ "tokens_seen": 2162688
262
+ },
263
+ {
264
+ "epoch": 0.0,
265
+ "learning_rate": 0.006746031746031746,
266
+ "loss": 7.4547,
267
+ "theoretical_loss": 9.265299666660276,
268
+ "tokens_seen": 2228224
269
+ }
270
+ ],
271
+ "max_steps": 50354,
272
+ "num_train_epochs": 9223372036854775807,
273
+ "total_flos": 1137142923264000.0,
274
+ "trial_name": null,
275
+ "trial_params": null
276
+ }
checkpoint-34/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:861274747fa77141802f480df74fe77f6d6c3bcdf6fc289e9c550ff0ab8c621e
3
+ size 3311
checkpoint-34/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a074760d19e87afc693b5649d5a3a8b25185805c99159e64728785a7382c5a73
3
  size 510396521
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:058810adf5194c07e6b081a2ed1c35dba558237a8460bbafa66ff1005b9dd61a
3
  size 510396521