kejian commited on
Commit
ccbeb82
·
1 Parent(s): f16ee88

Training in progress, step 40

Browse files
checkpoint-40/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "codeparrot/codeparrot-small",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMAndValueHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "reorder_and_upcast_attn": true,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": true,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "text-generation": {
31
+ "do_sample": true,
32
+ "max_length": 50
33
+ }
34
+ },
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.17.0",
37
+ "use_cache": true,
38
+ "vocab_size": 32768
39
+ }
checkpoint-40/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-40/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17144a718e61c709f368d738c19122133ddf57070ed55e923a57098bb6ee6b93
3
+ size 888151409
checkpoint-40/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a251f33e7271e8dc2f8313d2aa9ac00ff941ded6b800cc767cab402590dd7153
3
+ size 456678185
checkpoint-40/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdfbeb3c5f2501db29343c81561f26ca3ed78bf7a00d1c0461fa6c324973fbde
3
+ size 14503
checkpoint-40/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abdff73def3c7c2f549a7aacbb830158b59d50cdbee371e7008e4a60f1604615
3
+ size 559
checkpoint-40/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:062151bbcefd351734ad20a48aaa00700c74fb129103564116e9890c8a19da8d
3
+ size 623
checkpoint-40/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>", "pad_token": "<|endoftext|>"}
checkpoint-40/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-40/tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "codeparrot/codeparrot-small", "tokenizer_class": "GPT2Tokenizer"}
checkpoint-40/trainer_state.json ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.001588751638400127,
5
+ "global_step": 40,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.0,
12
+ "learning_rate": 1.984126984126984e-06,
13
+ "loss": 10.5689,
14
+ "theoretical_loss": 17.59466794495971,
15
+ "tokens_seen": 131072
16
+ },
17
+ {
18
+ "epoch": 0.0,
19
+ "learning_rate": 3.968253968253968e-06,
20
+ "loss": 10.5816,
21
+ "theoretical_loss": 14.920783596619636,
22
+ "tokens_seen": 262144
23
+ },
24
+ {
25
+ "epoch": 0.0,
26
+ "learning_rate": 5.9523809523809525e-06,
27
+ "loss": 10.4187,
28
+ "theoretical_loss": 13.581028313181289,
29
+ "tokens_seen": 393216
30
+ },
31
+ {
32
+ "epoch": 0.0,
33
+ "learning_rate": 7.936507936507936e-06,
34
+ "loss": 10.1647,
35
+ "theoretical_loss": 12.71859646611439,
36
+ "tokens_seen": 524288
37
+ },
38
+ {
39
+ "epoch": 0.0,
40
+ "learning_rate": 9.92063492063492e-06,
41
+ "loss": 9.8414,
42
+ "theoretical_loss": 12.095879447666144,
43
+ "tokens_seen": 655360
44
+ },
45
+ {
46
+ "epoch": 0.0,
47
+ "learning_rate": 1.1904761904761905e-05,
48
+ "loss": 9.5154,
49
+ "theoretical_loss": 11.615186049337796,
50
+ "tokens_seen": 786432
51
+ },
52
+ {
53
+ "epoch": 0.0,
54
+ "learning_rate": 1.3888888888888888e-05,
55
+ "loss": 9.4061,
56
+ "theoretical_loss": 11.227478542742938,
57
+ "tokens_seen": 917504
58
+ },
59
+ {
60
+ "epoch": 0.0,
61
+ "learning_rate": 1.5873015873015872e-05,
62
+ "loss": 9.3243,
63
+ "theoretical_loss": 10.904894927088016,
64
+ "tokens_seen": 1048576
65
+ },
66
+ {
67
+ "epoch": 0.0,
68
+ "learning_rate": 1.7857142857142855e-05,
69
+ "loss": 9.0303,
70
+ "theoretical_loss": 10.630196716861345,
71
+ "tokens_seen": 1179648
72
+ },
73
+ {
74
+ "epoch": 0.0,
75
+ "learning_rate": 1.984126984126984e-05,
76
+ "loss": 8.9144,
77
+ "theoretical_loss": 10.392030784394397,
78
+ "tokens_seen": 1310720
79
+ },
80
+ {
81
+ "epoch": 0.0,
82
+ "learning_rate": 2.1825396825396824e-05,
83
+ "loss": 8.7927,
84
+ "theoretical_loss": 10.182553393901085,
85
+ "tokens_seen": 1441792
86
+ },
87
+ {
88
+ "epoch": 0.0,
89
+ "learning_rate": 2.380952380952381e-05,
90
+ "loss": 8.7473,
91
+ "theoretical_loss": 9.996136019471344,
92
+ "tokens_seen": 1572864
93
+ },
94
+ {
95
+ "epoch": 0.0,
96
+ "objective/train/docs_used": 831,
97
+ "objective/train/instantaneous_batch_size": 16,
98
+ "objective/train/instantaneous_microbatch_size": 16384,
99
+ "objective/train/original_loss": 8.822301864624023,
100
+ "objective/train/theoretical_loss": 9.910229967024176,
101
+ "objective/train/tokens_used": -18841600,
102
+ "theoretical_loss": 9.910229967024176,
103
+ "tokens_seen": 1638400
104
+ },
105
+ {
106
+ "epoch": 0.0,
107
+ "learning_rate": 2.5793650793650793e-05,
108
+ "loss": 8.8575,
109
+ "theoretical_loss": 9.828613432171625,
110
+ "tokens_seen": 1703936
111
+ },
112
+ {
113
+ "epoch": 0.0,
114
+ "learning_rate": 2.7777777777777776e-05,
115
+ "loss": 8.6865,
116
+ "theoretical_loss": 9.676823599712613,
117
+ "tokens_seen": 1835008
118
+ },
119
+ {
120
+ "epoch": 0.0,
121
+ "learning_rate": 2.9761904761904762e-05,
122
+ "loss": 8.6022,
123
+ "theoretical_loss": 9.538313887395919,
124
+ "tokens_seen": 1966080
125
+ },
126
+ {
127
+ "epoch": 0.0,
128
+ "learning_rate": 3.1746031746031745e-05,
129
+ "loss": 8.4823,
130
+ "theoretical_loss": 9.411146631541524,
131
+ "tokens_seen": 2097152
132
+ },
133
+ {
134
+ "epoch": 0.0,
135
+ "learning_rate": 3.373015873015873e-05,
136
+ "loss": 8.4221,
137
+ "theoretical_loss": 9.293766507291341,
138
+ "tokens_seen": 2228224
139
+ },
140
+ {
141
+ "epoch": 0.0,
142
+ "learning_rate": 3.571428571428571e-05,
143
+ "loss": 8.3834,
144
+ "theoretical_loss": 9.184907653139359,
145
+ "tokens_seen": 2359296
146
+ },
147
+ {
148
+ "epoch": 0.0,
149
+ "learning_rate": 3.76984126984127e-05,
150
+ "loss": 8.2756,
151
+ "theoretical_loss": 9.0835271371648,
152
+ "tokens_seen": 2490368
153
+ },
154
+ {
155
+ "epoch": 0.0,
156
+ "learning_rate": 3.968253968253968e-05,
157
+ "loss": 8.3847,
158
+ "theoretical_loss": 8.988756330540422,
159
+ "tokens_seen": 2621440
160
+ },
161
+ {
162
+ "epoch": 0.0,
163
+ "learning_rate": 4.1666666666666665e-05,
164
+ "loss": 8.251,
165
+ "theoretical_loss": 8.89986473310929,
166
+ "tokens_seen": 2752512
167
+ },
168
+ {
169
+ "epoch": 0.0,
170
+ "learning_rate": 4.365079365079365e-05,
171
+ "loss": 8.1076,
172
+ "theoretical_loss": 8.816232633409479,
173
+ "tokens_seen": 2883584
174
+ },
175
+ {
176
+ "epoch": 0.0,
177
+ "learning_rate": 4.563492063492063e-05,
178
+ "loss": 8.1488,
179
+ "theoretical_loss": 8.737330150151898,
180
+ "tokens_seen": 3014656
181
+ },
182
+ {
183
+ "epoch": 0.0,
184
+ "learning_rate": 4.761904761904762e-05,
185
+ "loss": 8.0482,
186
+ "theoretical_loss": 8.662700958366539,
187
+ "tokens_seen": 3145728
188
+ },
189
+ {
190
+ "epoch": 0.0,
191
+ "objective/train/docs_used": 1233,
192
+ "objective/train/instantaneous_batch_size": 16,
193
+ "objective/train/instantaneous_microbatch_size": 16384,
194
+ "objective/train/original_loss": 8.187471389770508,
195
+ "objective/train/theoretical_loss": 8.591949505242134,
196
+ "objective/train/tokens_used": -17203200,
197
+ "theoretical_loss": 8.591949505242134,
198
+ "tokens_seen": 3276800
199
+ },
200
+ {
201
+ "epoch": 0.0,
202
+ "learning_rate": 4.96031746031746e-05,
203
+ "loss": 7.9834,
204
+ "theoretical_loss": 8.591949505242134,
205
+ "tokens_seen": 3276800
206
+ },
207
+ {
208
+ "epoch": 0.0,
209
+ "learning_rate": 5.1587301587301586e-05,
210
+ "loss": 8.1198,
211
+ "theoretical_loss": 8.524730860277067,
212
+ "tokens_seen": 3407872
213
+ },
214
+ {
215
+ "epoch": 0.0,
216
+ "learning_rate": 5.357142857142857e-05,
217
+ "loss": 7.8858,
218
+ "theoretical_loss": 8.460742578303845,
219
+ "tokens_seen": 3538944
220
+ },
221
+ {
222
+ "epoch": 0.0,
223
+ "learning_rate": 5.555555555555555e-05,
224
+ "loss": 8.0049,
225
+ "theoretical_loss": 8.399718117751275,
226
+ "tokens_seen": 3670016
227
+ },
228
+ {
229
+ "epoch": 0.0,
230
+ "learning_rate": 5.753968253968254e-05,
231
+ "loss": 7.81,
232
+ "theoretical_loss": 8.341421472916394,
233
+ "tokens_seen": 3801088
234
+ },
235
+ {
236
+ "epoch": 0.0,
237
+ "learning_rate": 5.9523809523809524e-05,
238
+ "loss": 7.8978,
239
+ "theoretical_loss": 8.28564276288293,
240
+ "tokens_seen": 3932160
241
+ },
242
+ {
243
+ "epoch": 0.0,
244
+ "learning_rate": 6.15079365079365e-05,
245
+ "loss": 7.5006,
246
+ "theoretical_loss": 8.232194580909036,
247
+ "tokens_seen": 4063232
248
+ },
249
+ {
250
+ "epoch": 0.0,
251
+ "learning_rate": 6.349206349206349e-05,
252
+ "loss": 7.6305,
253
+ "theoretical_loss": 8.180908953270682,
254
+ "tokens_seen": 4194304
255
+ },
256
+ {
257
+ "epoch": 0.0,
258
+ "learning_rate": 6.547619047619048e-05,
259
+ "loss": 7.662,
260
+ "theoretical_loss": 8.131634790246775,
261
+ "tokens_seen": 4325376
262
+ },
263
+ {
264
+ "epoch": 0.0,
265
+ "learning_rate": 6.746031746031745e-05,
266
+ "loss": 7.7852,
267
+ "theoretical_loss": 8.084235737332481,
268
+ "tokens_seen": 4456448
269
+ },
270
+ {
271
+ "epoch": 0.0,
272
+ "learning_rate": 6.944444444444444e-05,
273
+ "loss": 7.5064,
274
+ "theoretical_loss": 8.038588354092902,
275
+ "tokens_seen": 4587520
276
+ },
277
+ {
278
+ "epoch": 0.0,
279
+ "learning_rate": 7.142857142857142e-05,
280
+ "loss": 7.5163,
281
+ "theoretical_loss": 7.994580562902867,
282
+ "tokens_seen": 4718592
283
+ },
284
+ {
285
+ "epoch": 0.0,
286
+ "learning_rate": 7.341269841269842e-05,
287
+ "loss": 7.4521,
288
+ "theoretical_loss": 7.952110321298584,
289
+ "tokens_seen": 4849664
290
+ },
291
+ {
292
+ "epoch": 0.0,
293
+ "objective/train/docs_used": 2007,
294
+ "objective/train/instantaneous_batch_size": 16,
295
+ "objective/train/instantaneous_microbatch_size": 16384,
296
+ "objective/train/original_loss": 7.20822286605835,
297
+ "objective/train/theoretical_loss": 7.931422353115133,
298
+ "objective/train/tokens_used": -15564800,
299
+ "theoretical_loss": 7.931422353115133,
300
+ "tokens_seen": 4915200
301
+ },
302
+ {
303
+ "epoch": 0.0,
304
+ "learning_rate": 7.53968253968254e-05,
305
+ "loss": 7.3365,
306
+ "theoretical_loss": 7.911084480620269,
307
+ "tokens_seen": 4980736
308
+ },
309
+ {
310
+ "epoch": 0.0,
311
+ "learning_rate": 7.738095238095239e-05,
312
+ "loss": 7.2213,
313
+ "theoretical_loss": 7.871417800659003,
314
+ "tokens_seen": 5111808
315
+ },
316
+ {
317
+ "epoch": 0.0,
318
+ "learning_rate": 7.936507936507937e-05,
319
+ "loss": 7.2532,
320
+ "theoretical_loss": 7.833032095585231,
321
+ "tokens_seen": 5242880
322
+ }
323
+ ],
324
+ "max_steps": 25177,
325
+ "num_train_epochs": 9223372036854775807,
326
+ "total_flos": 2675630407680000.0,
327
+ "trial_name": null,
328
+ "trial_params": null
329
+ }
checkpoint-40/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86d1e83c3ca51308f5ebb0f10ae5ffaf864dd4254cf5bd9d3ff7bdd866102c0d
3
+ size 3055
checkpoint-40/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "codeparrot/codeparrot-small",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMAndValueHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "reorder_and_upcast_attn": true,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": true,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "text-generation": {
31
+ "do_sample": true,
32
+ "max_length": 50
33
+ }
34
+ },
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.17.0",
37
+ "use_cache": true,
38
+ "vocab_size": 32768
39
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a251f33e7271e8dc2f8313d2aa9ac00ff941ded6b800cc767cab402590dd7153
3
+ size 456678185
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>", "pad_token": "<|endoftext|>"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "codeparrot/codeparrot-small", "tokenizer_class": "GPT2Tokenizer"}
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86d1e83c3ca51308f5ebb0f10ae5ffaf864dd4254cf5bd9d3ff7bdd866102c0d
3
+ size 3055
vocab.json ADDED
The diff for this file is too large to render. See raw diff