Wilsonwin committed on
Commit
b1cc27a
·
verified ·
1 Parent(s): 01d78c0

Training in progress, step 500, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -5,10 +5,10 @@
5
  "GPT2LMHeadModel"
6
  ],
7
  "attn_pdrop": 0.0,
8
- "bos_token_id": 1,
9
  "dtype": "float32",
10
  "embd_pdrop": 0.0,
11
- "eos_token_id": 2,
12
  "initializer_range": 0.02,
13
  "layer_norm_epsilon": 1e-05,
14
  "model_type": "gpt2",
@@ -18,7 +18,7 @@
18
  "n_inner": 3072,
19
  "n_layer": 8,
20
  "n_positions": 1024,
21
- "pad_token_id": 3,
22
  "reorder_and_upcast_attn": false,
23
  "resid_pdrop": 0.0,
24
  "scale_attn_by_inverse_layer_idx": false,
@@ -31,5 +31,5 @@
31
  "tie_word_embeddings": true,
32
  "transformers_version": "5.0.0",
33
  "use_cache": false,
34
- "vocab_size": 4
35
  }
 
5
  "GPT2LMHeadModel"
6
  ],
7
  "attn_pdrop": 0.0,
8
+ "bos_token_id": 2,
9
  "dtype": "float32",
10
  "embd_pdrop": 0.0,
11
+ "eos_token_id": 3,
12
  "initializer_range": 0.02,
13
  "layer_norm_epsilon": 1e-05,
14
  "model_type": "gpt2",
 
18
  "n_inner": 3072,
19
  "n_layer": 8,
20
  "n_positions": 1024,
21
+ "pad_token_id": 0,
22
  "reorder_and_upcast_attn": false,
23
  "resid_pdrop": 0.0,
24
  "scale_attn_by_inverse_layer_idx": false,
 
31
  "tie_word_embeddings": true,
32
  "transformers_version": "5.0.0",
33
  "use_cache": false,
34
+ "vocab_size": 32000
35
  }
last-checkpoint/generation_config.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
  "_from_model_config": true,
3
- "bos_token_id": 1,
4
  "eos_token_id": [
5
- 2
6
  ],
7
  "output_attentions": false,
8
  "output_hidden_states": false,
9
- "pad_token_id": 3,
10
  "transformers_version": "5.0.0",
11
  "use_cache": true
12
  }
 
1
  {
2
  "_from_model_config": true,
3
+ "bos_token_id": 2,
4
  "eos_token_id": [
5
+ 3
6
  ],
7
  "output_attentions": false,
8
  "output_hidden_states": false,
9
+ "pad_token_id": 0,
10
  "transformers_version": "5.0.0",
11
  "use_cache": true
12
  }
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e7506a1280aec2dd18c1213ed1020dae2e07f4f943a5bd4dd04540876174160
3
- size 229986128
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87ff9a10f70d200b2c856dee103d85c0b4769807377164ae058876c9b2f8cdf4
3
+ size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8570213bcbaacf39c48f95d9aef6027fc1317ef916ca936ef3f3e3b83eb71b21
3
- size 122063307
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3e3d47782f2acc46ebaf4d2a3bfa3f0989afdba6a7eed3d7d950f2b764c9119
3
+ size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61c19bab1174704a4a4441475683bf1270277af15d2e2c95e964789128e482c4
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a9464afe392ee166598218bbc5440e93bb5f6227028ba0c26fc8e4fdcf73ae8
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2458121b795ce86cf99025460230b02abd4a71e9c5777618f7febb360b86c2e1
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d6f69db244377919464a655cd8dc38a3ecd36738e8ab7fd789261995d39f57a
3
  size 1465
last-checkpoint/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/tokenizer_config.json CHANGED
@@ -1,13 +1,10 @@
1
  {
2
- "add_prefix_space": null,
3
  "backend": "tokenizers",
4
  "bos_token": "<s>",
5
- "clean_up_tokenization_spaces": false,
6
  "eos_token": "</s>",
7
  "is_local": true,
8
  "model_max_length": 1000000000000000019884624838656,
9
  "pad_token": "<pad>",
10
  "tokenizer_class": "TokenizersBackend",
11
- "unk_token": "<unk>",
12
- "use_default_system_prompt": false
13
  }
 
1
  {
 
2
  "backend": "tokenizers",
3
  "bos_token": "<s>",
 
4
  "eos_token": "</s>",
5
  "is_local": true,
6
  "model_max_length": 1000000000000000019884624838656,
7
  "pad_token": "<pad>",
8
  "tokenizer_class": "TokenizersBackend",
9
+ "unk_token": "<unk>"
 
10
  }
last-checkpoint/trainer_state.json CHANGED
@@ -2,30 +2,381 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.0,
6
  "eval_steps": 500,
7
- "global_step": 14,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.14285714285714285,
14
- "grad_norm": 78.03356170654297,
15
  "learning_rate": 0.0,
16
- "loss": 1.7032254934310913,
17
  "step": 1
18
  },
19
  {
20
- "epoch": 1.4285714285714286,
21
- "grad_norm": 45.858306884765625,
22
  "learning_rate": 1.3499999999999998e-06,
23
- "loss": 1.292690912882487,
24
  "step": 10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  }
26
  ],
27
  "logging_steps": 10,
28
- "max_steps": 14,
29
  "num_input_tokens_seen": 0,
30
  "num_train_epochs": 2,
31
  "save_steps": 500,
@@ -36,12 +387,12 @@
36
  "should_evaluate": false,
37
  "should_log": false,
38
  "should_save": true,
39
- "should_training_stop": true
40
  },
41
  "attributes": {}
42
  }
43
  },
44
- "total_flos": 440368147464192.0,
45
  "train_batch_size": 48,
46
  "trial_name": null,
47
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.08447372867038351,
6
  "eval_steps": 500,
7
+ "global_step": 500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.00016894745734076703,
14
+ "grad_norm": 6.621576309204102,
15
  "learning_rate": 0.0,
16
+ "loss": 10.540443420410156,
17
  "step": 1
18
  },
19
  {
20
+ "epoch": 0.0016894745734076701,
21
+ "grad_norm": 6.755827903747559,
22
  "learning_rate": 1.3499999999999998e-06,
23
+ "loss": 10.498292711046007,
24
  "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.0033789491468153403,
28
+ "grad_norm": 5.475645542144775,
29
+ "learning_rate": 2.85e-06,
30
+ "loss": 10.21649169921875,
31
+ "step": 20
32
+ },
33
+ {
34
+ "epoch": 0.00506842372022301,
35
+ "grad_norm": 2.392394781112671,
36
+ "learning_rate": 4.35e-06,
37
+ "loss": 9.751205444335938,
38
+ "step": 30
39
+ },
40
+ {
41
+ "epoch": 0.0067578982936306806,
42
+ "grad_norm": 2.019033670425415,
43
+ "learning_rate": 5.85e-06,
44
+ "loss": 9.445993041992187,
45
+ "step": 40
46
+ },
47
+ {
48
+ "epoch": 0.00844737286703835,
49
+ "grad_norm": 1.1508617401123047,
50
+ "learning_rate": 7.35e-06,
51
+ "loss": 9.269256591796875,
52
+ "step": 50
53
+ },
54
+ {
55
+ "epoch": 0.01013684744044602,
56
+ "grad_norm": 0.9643108248710632,
57
+ "learning_rate": 8.849999999999998e-06,
58
+ "loss": 9.148248291015625,
59
+ "step": 60
60
+ },
61
+ {
62
+ "epoch": 0.011826322013853691,
63
+ "grad_norm": 0.9441804885864258,
64
+ "learning_rate": 1.035e-05,
65
+ "loss": 9.042961120605469,
66
+ "step": 70
67
+ },
68
+ {
69
+ "epoch": 0.013515796587261361,
70
+ "grad_norm": 0.8737426996231079,
71
+ "learning_rate": 1.1849999999999998e-05,
72
+ "loss": 8.958677673339844,
73
+ "step": 80
74
+ },
75
+ {
76
+ "epoch": 0.015205271160669031,
77
+ "grad_norm": 0.7889962792396545,
78
+ "learning_rate": 1.3349999999999998e-05,
79
+ "loss": 8.854808044433593,
80
+ "step": 90
81
+ },
82
+ {
83
+ "epoch": 0.0168947457340767,
84
+ "grad_norm": 0.8155699968338013,
85
+ "learning_rate": 1.485e-05,
86
+ "loss": 8.736964416503906,
87
+ "step": 100
88
+ },
89
+ {
90
+ "epoch": 0.018584220307484373,
91
+ "grad_norm": 0.686811625957489,
92
+ "learning_rate": 1.6349999999999998e-05,
93
+ "loss": 8.659473419189453,
94
+ "step": 110
95
+ },
96
+ {
97
+ "epoch": 0.02027369488089204,
98
+ "grad_norm": 0.6954357624053955,
99
+ "learning_rate": 1.7849999999999997e-05,
100
+ "loss": 8.550489807128907,
101
+ "step": 120
102
+ },
103
+ {
104
+ "epoch": 0.021963169454299714,
105
+ "grad_norm": 0.6710366010665894,
106
+ "learning_rate": 1.935e-05,
107
+ "loss": 8.454399108886719,
108
+ "step": 130
109
+ },
110
+ {
111
+ "epoch": 0.023652644027707382,
112
+ "grad_norm": 0.5743041634559631,
113
+ "learning_rate": 2.085e-05,
114
+ "loss": 8.361704254150391,
115
+ "step": 140
116
+ },
117
+ {
118
+ "epoch": 0.025342118601115054,
119
+ "grad_norm": 0.5449837446212769,
120
+ "learning_rate": 2.2349999999999998e-05,
121
+ "loss": 8.286068725585938,
122
+ "step": 150
123
+ },
124
+ {
125
+ "epoch": 0.027031593174522722,
126
+ "grad_norm": 0.6015759706497192,
127
+ "learning_rate": 2.3849999999999997e-05,
128
+ "loss": 8.23370819091797,
129
+ "step": 160
130
+ },
131
+ {
132
+ "epoch": 0.028721067747930394,
133
+ "grad_norm": 0.5669003129005432,
134
+ "learning_rate": 2.535e-05,
135
+ "loss": 8.169088745117188,
136
+ "step": 170
137
+ },
138
+ {
139
+ "epoch": 0.030410542321338063,
140
+ "grad_norm": 0.42918503284454346,
141
+ "learning_rate": 2.6849999999999995e-05,
142
+ "loss": 8.121391296386719,
143
+ "step": 180
144
+ },
145
+ {
146
+ "epoch": 0.03210001689474573,
147
+ "grad_norm": 0.5449455976486206,
148
+ "learning_rate": 2.8349999999999998e-05,
149
+ "loss": 8.079780578613281,
150
+ "step": 190
151
+ },
152
+ {
153
+ "epoch": 0.0337894914681534,
154
+ "grad_norm": 0.47680142521858215,
155
+ "learning_rate": 2.985e-05,
156
+ "loss": 8.047093200683594,
157
+ "step": 200
158
+ },
159
+ {
160
+ "epoch": 0.035478966041561075,
161
+ "grad_norm": 0.46382999420166016,
162
+ "learning_rate": 3.1349999999999996e-05,
163
+ "loss": 7.9976959228515625,
164
+ "step": 210
165
+ },
166
+ {
167
+ "epoch": 0.03716844061496875,
168
+ "grad_norm": 0.4039631485939026,
169
+ "learning_rate": 3.285e-05,
170
+ "loss": 7.981047821044922,
171
+ "step": 220
172
+ },
173
+ {
174
+ "epoch": 0.03885791518837641,
175
+ "grad_norm": 0.3981921374797821,
176
+ "learning_rate": 3.435e-05,
177
+ "loss": 7.976937103271484,
178
+ "step": 230
179
+ },
180
+ {
181
+ "epoch": 0.04054738976178408,
182
+ "grad_norm": 0.6785397529602051,
183
+ "learning_rate": 3.585e-05,
184
+ "loss": 7.935275268554688,
185
+ "step": 240
186
+ },
187
+ {
188
+ "epoch": 0.042236864335191755,
189
+ "grad_norm": 0.47772353887557983,
190
+ "learning_rate": 3.735e-05,
191
+ "loss": 7.9202117919921875,
192
+ "step": 250
193
+ },
194
+ {
195
+ "epoch": 0.04392633890859943,
196
+ "grad_norm": 0.4412820041179657,
197
+ "learning_rate": 3.8849999999999996e-05,
198
+ "loss": 7.917314910888672,
199
+ "step": 260
200
+ },
201
+ {
202
+ "epoch": 0.0456158134820071,
203
+ "grad_norm": 0.49126797914505005,
204
+ "learning_rate": 4.035e-05,
205
+ "loss": 7.892892456054687,
206
+ "step": 270
207
+ },
208
+ {
209
+ "epoch": 0.047305288055414764,
210
+ "grad_norm": 0.41135749220848083,
211
+ "learning_rate": 4.185e-05,
212
+ "loss": 7.873385620117188,
213
+ "step": 280
214
+ },
215
+ {
216
+ "epoch": 0.048994762628822436,
217
+ "grad_norm": 0.3579074442386627,
218
+ "learning_rate": 4.334999999999999e-05,
219
+ "loss": 7.852500915527344,
220
+ "step": 290
221
+ },
222
+ {
223
+ "epoch": 0.05068423720223011,
224
+ "grad_norm": 0.47334495186805725,
225
+ "learning_rate": 4.484999999999999e-05,
226
+ "loss": 7.833164215087891,
227
+ "step": 300
228
+ },
229
+ {
230
+ "epoch": 0.05237371177563778,
231
+ "grad_norm": 0.5246961712837219,
232
+ "learning_rate": 4.6349999999999995e-05,
233
+ "loss": 7.8142242431640625,
234
+ "step": 310
235
+ },
236
+ {
237
+ "epoch": 0.054063186349045445,
238
+ "grad_norm": 0.4747469127178192,
239
+ "learning_rate": 4.785e-05,
240
+ "loss": 7.786991119384766,
241
+ "step": 320
242
+ },
243
+ {
244
+ "epoch": 0.055752660922453116,
245
+ "grad_norm": 0.5847325921058655,
246
+ "learning_rate": 4.935e-05,
247
+ "loss": 7.7430572509765625,
248
+ "step": 330
249
+ },
250
+ {
251
+ "epoch": 0.05744213549586079,
252
+ "grad_norm": 0.5468484163284302,
253
+ "learning_rate": 5.0849999999999996e-05,
254
+ "loss": 7.7266845703125,
255
+ "step": 340
256
+ },
257
+ {
258
+ "epoch": 0.05913161006926846,
259
+ "grad_norm": 0.5309090614318848,
260
+ "learning_rate": 5.234999999999999e-05,
261
+ "loss": 7.715785217285156,
262
+ "step": 350
263
+ },
264
+ {
265
+ "epoch": 0.060821084642676125,
266
+ "grad_norm": 0.45586320757865906,
267
+ "learning_rate": 5.3849999999999994e-05,
268
+ "loss": 7.676018524169922,
269
+ "step": 360
270
+ },
271
+ {
272
+ "epoch": 0.0625105592160838,
273
+ "grad_norm": 0.6824954748153687,
274
+ "learning_rate": 5.535e-05,
275
+ "loss": 7.658743286132813,
276
+ "step": 370
277
+ },
278
+ {
279
+ "epoch": 0.06420003378949146,
280
+ "grad_norm": 0.4770432710647583,
281
+ "learning_rate": 5.684999999999999e-05,
282
+ "loss": 7.6480712890625,
283
+ "step": 380
284
+ },
285
+ {
286
+ "epoch": 0.06588950836289914,
287
+ "grad_norm": 0.4217078387737274,
288
+ "learning_rate": 5.8349999999999995e-05,
289
+ "loss": 7.612128448486328,
290
+ "step": 390
291
+ },
292
+ {
293
+ "epoch": 0.0675789829363068,
294
+ "grad_norm": 0.6638941168785095,
295
+ "learning_rate": 5.985e-05,
296
+ "loss": 7.605058288574218,
297
+ "step": 400
298
+ },
299
+ {
300
+ "epoch": 0.06926845750971448,
301
+ "grad_norm": 0.5383668541908264,
302
+ "learning_rate": 6.134999999999999e-05,
303
+ "loss": 7.576911926269531,
304
+ "step": 410
305
+ },
306
+ {
307
+ "epoch": 0.07095793208312215,
308
+ "grad_norm": 0.6174895763397217,
309
+ "learning_rate": 6.285e-05,
310
+ "loss": 7.5539703369140625,
311
+ "step": 420
312
+ },
313
+ {
314
+ "epoch": 0.07264740665652981,
315
+ "grad_norm": 0.5723831653594971,
316
+ "learning_rate": 6.434999999999999e-05,
317
+ "loss": 7.541629028320313,
318
+ "step": 430
319
+ },
320
+ {
321
+ "epoch": 0.0743368812299375,
322
+ "grad_norm": 0.6536343693733215,
323
+ "learning_rate": 6.584999999999999e-05,
324
+ "loss": 7.522487640380859,
325
+ "step": 440
326
+ },
327
+ {
328
+ "epoch": 0.07602635580334516,
329
+ "grad_norm": 0.654776394367218,
330
+ "learning_rate": 6.735e-05,
331
+ "loss": 7.4989463806152346,
332
+ "step": 450
333
+ },
334
+ {
335
+ "epoch": 0.07771583037675282,
336
+ "grad_norm": 0.865844190120697,
337
+ "learning_rate": 6.884999999999999e-05,
338
+ "loss": 7.476695251464844,
339
+ "step": 460
340
+ },
341
+ {
342
+ "epoch": 0.0794053049501605,
343
+ "grad_norm": 0.5125293731689453,
344
+ "learning_rate": 7.034999999999999e-05,
345
+ "loss": 7.4570671081542965,
346
+ "step": 470
347
+ },
348
+ {
349
+ "epoch": 0.08109477952356817,
350
+ "grad_norm": 0.5413561463356018,
351
+ "learning_rate": 7.184999999999998e-05,
352
+ "loss": 7.443260192871094,
353
+ "step": 480
354
+ },
355
+ {
356
+ "epoch": 0.08278425409697585,
357
+ "grad_norm": 0.5976828932762146,
358
+ "learning_rate": 7.335e-05,
359
+ "loss": 7.403485107421875,
360
+ "step": 490
361
+ },
362
+ {
363
+ "epoch": 0.08447372867038351,
364
+ "grad_norm": 0.6476218700408936,
365
+ "learning_rate": 7.484999999999999e-05,
366
+ "loss": 7.4032142639160154,
367
+ "step": 500
368
+ },
369
+ {
370
+ "epoch": 0.08447372867038351,
371
+ "eval_loss": 7.389806747436523,
372
+ "eval_runtime": 8.5684,
373
+ "eval_samples_per_second": 116.708,
374
+ "eval_steps_per_second": 2.451,
375
+ "step": 500
376
  }
377
  ],
378
  "logging_steps": 10,
379
+ "max_steps": 11838,
380
  "num_input_tokens_seen": 0,
381
  "num_train_epochs": 2,
382
  "save_steps": 500,
 
387
  "should_evaluate": false,
388
  "should_log": false,
389
  "should_save": true,
390
+ "should_training_stop": false
391
  },
392
  "attributes": {}
393
  }
394
  },
395
+ "total_flos": 1.6722841042944e+16,
396
  "train_batch_size": 48,
397
  "trial_name": null,
398
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b2c5bf738d3dfdf4bb29b32f549e85540e511a9724f3a1b5d213bbd748c534c
3
  size 5201
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fce02471cb82f83442a696c0b3bab66a52bc0f41341df1b015701dd7299e8f59
3
  size 5201