iko-01 committed on
Commit
790344c
·
verified ·
1 Parent(s): 5de9b7c

Upload fine-tuned GPT-2 dialogue model v2

Browse files
checkpoint-2548/config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "text-generation": {
31
+ "do_sample": true,
32
+ "max_length": 50
33
+ }
34
+ },
35
+ "transformers_version": "4.57.3",
36
+ "use_cache": true,
37
+ "vocab_size": 50257
38
+ }
checkpoint-2548/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.57.3"
6
+ }
checkpoint-2548/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2548/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6010c3e489ee87366458c79d44d85b7c8291fa7ab7dc3a7c523bfd5cbfa3cff
3
+ size 497774208
checkpoint-2548/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c12bd5de9aec0d1034fdc7524f14f8d4e16987d837b754ae94b2463269e0342d
3
+ size 995642763
checkpoint-2548/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30ac1131b50f7f9816b465f789b0e3d305dc942d7c01a377761dcb0113513b23
3
+ size 14645
checkpoint-2548/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a047abd089beec2b2259a7432932b821cc10a3dbec32a3fe72ab716224d327c
3
+ size 1383
checkpoint-2548/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b2b89fbc63e042c1595d51cc922e188a59fbb6b796370e3413bae020033edc9
3
+ size 1465
checkpoint-2548/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<|endoftext|>",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-2548/tokenizer_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "50256": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ }
13
+ },
14
+ "bos_token": "<|endoftext|>",
15
+ "clean_up_tokenization_spaces": false,
16
+ "eos_token": "<|endoftext|>",
17
+ "errors": "replace",
18
+ "extra_special_tokens": {},
19
+ "model_max_length": 1024,
20
+ "pad_token": "<|endoftext|>",
21
+ "tokenizer_class": "GPT2Tokenizer",
22
+ "unk_token": "<|endoftext|>"
23
+ }
checkpoint-2548/trainer_state.json ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 2548,
3
+ "best_metric": 0.47001567482948303,
4
+ "best_model_checkpoint": "/content/gpt2-finetuned-dialogue-v2/checkpoint-2548",
5
+ "epoch": 14.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2548,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.797799174690509,
14
+ "grad_norm": 6.382191181182861,
15
+ "learning_rate": 4.329268292682927e-05,
16
+ "loss": 5.6544,
17
+ "step": 145
18
+ },
19
+ {
20
+ "epoch": 1.0,
21
+ "eval_loss": 2.7988412380218506,
22
+ "eval_runtime": 6.9158,
23
+ "eval_samples_per_second": 46.705,
24
+ "eval_steps_per_second": 11.712,
25
+ "step": 182
26
+ },
27
+ {
28
+ "epoch": 1.5942228335625859,
29
+ "grad_norm": 5.028477191925049,
30
+ "learning_rate": 4.760327357755262e-05,
31
+ "loss": 2.9373,
32
+ "step": 290
33
+ },
34
+ {
35
+ "epoch": 2.0,
36
+ "eval_loss": 2.292213201522827,
37
+ "eval_runtime": 6.9846,
38
+ "eval_samples_per_second": 46.245,
39
+ "eval_steps_per_second": 11.597,
40
+ "step": 364
41
+ },
42
+ {
43
+ "epoch": 2.390646492434663,
44
+ "grad_norm": 5.537014484405518,
45
+ "learning_rate": 4.477786438035854e-05,
46
+ "loss": 2.4918,
47
+ "step": 435
48
+ },
49
+ {
50
+ "epoch": 3.0,
51
+ "eval_loss": 1.920114278793335,
52
+ "eval_runtime": 6.9658,
53
+ "eval_samples_per_second": 46.369,
54
+ "eval_steps_per_second": 11.628,
55
+ "step": 546
56
+ },
57
+ {
58
+ "epoch": 3.18707015130674,
59
+ "grad_norm": 4.482412338256836,
60
+ "learning_rate": 4.195245518316446e-05,
61
+ "loss": 2.1431,
62
+ "step": 580
63
+ },
64
+ {
65
+ "epoch": 3.984869325997249,
66
+ "grad_norm": 3.806051731109619,
67
+ "learning_rate": 3.912704598597038e-05,
68
+ "loss": 1.8536,
69
+ "step": 725
70
+ },
71
+ {
72
+ "epoch": 4.0,
73
+ "eval_loss": 1.5900342464447021,
74
+ "eval_runtime": 6.9658,
75
+ "eval_samples_per_second": 46.369,
76
+ "eval_steps_per_second": 11.628,
77
+ "step": 728
78
+ },
79
+ {
80
+ "epoch": 4.781292984869326,
81
+ "grad_norm": 4.049495220184326,
82
+ "learning_rate": 3.630163678877631e-05,
83
+ "loss": 1.5906,
84
+ "step": 870
85
+ },
86
+ {
87
+ "epoch": 5.0,
88
+ "eval_loss": 1.3074204921722412,
89
+ "eval_runtime": 6.9705,
90
+ "eval_samples_per_second": 46.338,
91
+ "eval_steps_per_second": 11.62,
92
+ "step": 910
93
+ },
94
+ {
95
+ "epoch": 5.577716643741403,
96
+ "grad_norm": 4.306301593780518,
97
+ "learning_rate": 3.3476227591582234e-05,
98
+ "loss": 1.3729,
99
+ "step": 1015
100
+ },
101
+ {
102
+ "epoch": 6.0,
103
+ "eval_loss": 1.0866929292678833,
104
+ "eval_runtime": 6.9232,
105
+ "eval_samples_per_second": 46.655,
106
+ "eval_steps_per_second": 11.7,
107
+ "step": 1092
108
+ },
109
+ {
110
+ "epoch": 6.37414030261348,
111
+ "grad_norm": 3.724321126937866,
112
+ "learning_rate": 3.065081839438815e-05,
113
+ "loss": 1.2009,
114
+ "step": 1160
115
+ },
116
+ {
117
+ "epoch": 7.0,
118
+ "eval_loss": 0.9100379347801208,
119
+ "eval_runtime": 6.9831,
120
+ "eval_samples_per_second": 46.255,
121
+ "eval_steps_per_second": 11.599,
122
+ "step": 1274
123
+ },
124
+ {
125
+ "epoch": 7.170563961485557,
126
+ "grad_norm": 3.3211708068847656,
127
+ "learning_rate": 2.782540919719408e-05,
128
+ "loss": 1.0504,
129
+ "step": 1305
130
+ },
131
+ {
132
+ "epoch": 7.968363136176066,
133
+ "grad_norm": 2.471958875656128,
134
+ "learning_rate": 2.5e-05,
135
+ "loss": 0.9314,
136
+ "step": 1450
137
+ },
138
+ {
139
+ "epoch": 8.0,
140
+ "eval_loss": 0.7771766781806946,
141
+ "eval_runtime": 7.0432,
142
+ "eval_samples_per_second": 45.86,
143
+ "eval_steps_per_second": 11.5,
144
+ "step": 1456
145
+ },
146
+ {
147
+ "epoch": 8.764786795048144,
148
+ "grad_norm": 2.442481517791748,
149
+ "learning_rate": 2.2174590802805924e-05,
150
+ "loss": 0.8322,
151
+ "step": 1595
152
+ },
153
+ {
154
+ "epoch": 9.0,
155
+ "eval_loss": 0.6769014000892639,
156
+ "eval_runtime": 6.9508,
157
+ "eval_samples_per_second": 46.47,
158
+ "eval_steps_per_second": 11.653,
159
+ "step": 1638
160
+ },
161
+ {
162
+ "epoch": 9.56121045392022,
163
+ "grad_norm": 2.086437940597534,
164
+ "learning_rate": 1.934918160561185e-05,
165
+ "loss": 0.7578,
166
+ "step": 1740
167
+ },
168
+ {
169
+ "epoch": 10.0,
170
+ "eval_loss": 0.601751446723938,
171
+ "eval_runtime": 6.9024,
172
+ "eval_samples_per_second": 46.795,
173
+ "eval_steps_per_second": 11.735,
174
+ "step": 1820
175
+ },
176
+ {
177
+ "epoch": 10.357634112792297,
178
+ "grad_norm": 2.158360481262207,
179
+ "learning_rate": 1.6523772408417772e-05,
180
+ "loss": 0.6874,
181
+ "step": 1885
182
+ },
183
+ {
184
+ "epoch": 11.0,
185
+ "eval_loss": 0.5506384372711182,
186
+ "eval_runtime": 7.0206,
187
+ "eval_samples_per_second": 46.008,
188
+ "eval_steps_per_second": 11.537,
189
+ "step": 2002
190
+ },
191
+ {
192
+ "epoch": 11.154057771664375,
193
+ "grad_norm": 2.359311819076538,
194
+ "learning_rate": 1.3698363211223694e-05,
195
+ "loss": 0.6506,
196
+ "step": 2030
197
+ },
198
+ {
199
+ "epoch": 11.951856946354884,
200
+ "grad_norm": 2.4090545177459717,
201
+ "learning_rate": 1.0872954014029618e-05,
202
+ "loss": 0.6091,
203
+ "step": 2175
204
+ },
205
+ {
206
+ "epoch": 12.0,
207
+ "eval_loss": 0.5107725858688354,
208
+ "eval_runtime": 6.9921,
209
+ "eval_samples_per_second": 46.195,
210
+ "eval_steps_per_second": 11.584,
211
+ "step": 2184
212
+ },
213
+ {
214
+ "epoch": 12.74828060522696,
215
+ "grad_norm": 2.641254425048828,
216
+ "learning_rate": 8.047544816835542e-06,
217
+ "loss": 0.581,
218
+ "step": 2320
219
+ },
220
+ {
221
+ "epoch": 13.0,
222
+ "eval_loss": 0.48598870635032654,
223
+ "eval_runtime": 7.0084,
224
+ "eval_samples_per_second": 46.087,
225
+ "eval_steps_per_second": 11.557,
226
+ "step": 2366
227
+ },
228
+ {
229
+ "epoch": 13.544704264099037,
230
+ "grad_norm": 2.234863042831421,
231
+ "learning_rate": 5.222135619641466e-06,
232
+ "loss": 0.5598,
233
+ "step": 2465
234
+ },
235
+ {
236
+ "epoch": 14.0,
237
+ "eval_loss": 0.47001567482948303,
238
+ "eval_runtime": 6.9078,
239
+ "eval_samples_per_second": 46.759,
240
+ "eval_steps_per_second": 11.726,
241
+ "step": 2548
242
+ }
243
+ ],
244
+ "logging_steps": 145,
245
+ "max_steps": 2730,
246
+ "num_input_tokens_seen": 0,
247
+ "num_train_epochs": 15,
248
+ "save_steps": 500,
249
+ "stateful_callbacks": {
250
+ "TrainerControl": {
251
+ "args": {
252
+ "should_epoch_stop": false,
253
+ "should_evaluate": false,
254
+ "should_log": false,
255
+ "should_save": true,
256
+ "should_training_stop": false
257
+ },
258
+ "attributes": {}
259
+ }
260
+ },
261
+ "total_flos": 1.0630405029888e+16,
262
+ "train_batch_size": 4,
263
+ "trial_name": null,
264
+ "trial_params": null
265
+ }
checkpoint-2548/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7db1c3b396ab4647640791be8eb5e64ed0e1caa1b2e2d95e1461f28d187f1f6d
3
+ size 5841
checkpoint-2548/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2730/config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "text-generation": {
31
+ "do_sample": true,
32
+ "max_length": 50
33
+ }
34
+ },
35
+ "transformers_version": "4.57.3",
36
+ "use_cache": true,
37
+ "vocab_size": 50257
38
+ }
checkpoint-2730/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.57.3"
6
+ }
checkpoint-2730/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2730/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4746335bd649570cce06cb14d3637b472ecd72c73fbada2481eb85a4a61b28ed
3
+ size 497774208
checkpoint-2730/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a39fcf238f79cc2934d0d85ab8c9a1e68f57b4cb70c4cc4183a7813d9c499bd
3
+ size 995642763
checkpoint-2730/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7104f369380c0b85fbaa81296ed76192ad576305fe06eaa696125d472db42b7a
3
+ size 14645
checkpoint-2730/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d2ac9124a3f9d71861ec987ee52c91d65f4aae60c8c559f2d0bdb8f6b576e57
3
+ size 1383
checkpoint-2730/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96fd98ecacf62de2e67018478f3ead54588efc958f4279fb68a061b5d6e7ed41
3
+ size 1465
checkpoint-2730/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<|endoftext|>",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
checkpoint-2730/tokenizer_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "50256": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ }
13
+ },
14
+ "bos_token": "<|endoftext|>",
15
+ "clean_up_tokenization_spaces": false,
16
+ "eos_token": "<|endoftext|>",
17
+ "errors": "replace",
18
+ "extra_special_tokens": {},
19
+ "model_max_length": 1024,
20
+ "pad_token": "<|endoftext|>",
21
+ "tokenizer_class": "GPT2Tokenizer",
22
+ "unk_token": "<|endoftext|>"
23
+ }
checkpoint-2730/trainer_state.json ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 2730,
3
+ "best_metric": 0.4640800952911377,
4
+ "best_model_checkpoint": "/content/gpt2-finetuned-dialogue-v2/checkpoint-2730",
5
+ "epoch": 15.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2730,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.797799174690509,
14
+ "grad_norm": 6.382191181182861,
15
+ "learning_rate": 4.329268292682927e-05,
16
+ "loss": 5.6544,
17
+ "step": 145
18
+ },
19
+ {
20
+ "epoch": 1.0,
21
+ "eval_loss": 2.7988412380218506,
22
+ "eval_runtime": 6.9158,
23
+ "eval_samples_per_second": 46.705,
24
+ "eval_steps_per_second": 11.712,
25
+ "step": 182
26
+ },
27
+ {
28
+ "epoch": 1.5942228335625859,
29
+ "grad_norm": 5.028477191925049,
30
+ "learning_rate": 4.760327357755262e-05,
31
+ "loss": 2.9373,
32
+ "step": 290
33
+ },
34
+ {
35
+ "epoch": 2.0,
36
+ "eval_loss": 2.292213201522827,
37
+ "eval_runtime": 6.9846,
38
+ "eval_samples_per_second": 46.245,
39
+ "eval_steps_per_second": 11.597,
40
+ "step": 364
41
+ },
42
+ {
43
+ "epoch": 2.390646492434663,
44
+ "grad_norm": 5.537014484405518,
45
+ "learning_rate": 4.477786438035854e-05,
46
+ "loss": 2.4918,
47
+ "step": 435
48
+ },
49
+ {
50
+ "epoch": 3.0,
51
+ "eval_loss": 1.920114278793335,
52
+ "eval_runtime": 6.9658,
53
+ "eval_samples_per_second": 46.369,
54
+ "eval_steps_per_second": 11.628,
55
+ "step": 546
56
+ },
57
+ {
58
+ "epoch": 3.18707015130674,
59
+ "grad_norm": 4.482412338256836,
60
+ "learning_rate": 4.195245518316446e-05,
61
+ "loss": 2.1431,
62
+ "step": 580
63
+ },
64
+ {
65
+ "epoch": 3.984869325997249,
66
+ "grad_norm": 3.806051731109619,
67
+ "learning_rate": 3.912704598597038e-05,
68
+ "loss": 1.8536,
69
+ "step": 725
70
+ },
71
+ {
72
+ "epoch": 4.0,
73
+ "eval_loss": 1.5900342464447021,
74
+ "eval_runtime": 6.9658,
75
+ "eval_samples_per_second": 46.369,
76
+ "eval_steps_per_second": 11.628,
77
+ "step": 728
78
+ },
79
+ {
80
+ "epoch": 4.781292984869326,
81
+ "grad_norm": 4.049495220184326,
82
+ "learning_rate": 3.630163678877631e-05,
83
+ "loss": 1.5906,
84
+ "step": 870
85
+ },
86
+ {
87
+ "epoch": 5.0,
88
+ "eval_loss": 1.3074204921722412,
89
+ "eval_runtime": 6.9705,
90
+ "eval_samples_per_second": 46.338,
91
+ "eval_steps_per_second": 11.62,
92
+ "step": 910
93
+ },
94
+ {
95
+ "epoch": 5.577716643741403,
96
+ "grad_norm": 4.306301593780518,
97
+ "learning_rate": 3.3476227591582234e-05,
98
+ "loss": 1.3729,
99
+ "step": 1015
100
+ },
101
+ {
102
+ "epoch": 6.0,
103
+ "eval_loss": 1.0866929292678833,
104
+ "eval_runtime": 6.9232,
105
+ "eval_samples_per_second": 46.655,
106
+ "eval_steps_per_second": 11.7,
107
+ "step": 1092
108
+ },
109
+ {
110
+ "epoch": 6.37414030261348,
111
+ "grad_norm": 3.724321126937866,
112
+ "learning_rate": 3.065081839438815e-05,
113
+ "loss": 1.2009,
114
+ "step": 1160
115
+ },
116
+ {
117
+ "epoch": 7.0,
118
+ "eval_loss": 0.9100379347801208,
119
+ "eval_runtime": 6.9831,
120
+ "eval_samples_per_second": 46.255,
121
+ "eval_steps_per_second": 11.599,
122
+ "step": 1274
123
+ },
124
+ {
125
+ "epoch": 7.170563961485557,
126
+ "grad_norm": 3.3211708068847656,
127
+ "learning_rate": 2.782540919719408e-05,
128
+ "loss": 1.0504,
129
+ "step": 1305
130
+ },
131
+ {
132
+ "epoch": 7.968363136176066,
133
+ "grad_norm": 2.471958875656128,
134
+ "learning_rate": 2.5e-05,
135
+ "loss": 0.9314,
136
+ "step": 1450
137
+ },
138
+ {
139
+ "epoch": 8.0,
140
+ "eval_loss": 0.7771766781806946,
141
+ "eval_runtime": 7.0432,
142
+ "eval_samples_per_second": 45.86,
143
+ "eval_steps_per_second": 11.5,
144
+ "step": 1456
145
+ },
146
+ {
147
+ "epoch": 8.764786795048144,
148
+ "grad_norm": 2.442481517791748,
149
+ "learning_rate": 2.2174590802805924e-05,
150
+ "loss": 0.8322,
151
+ "step": 1595
152
+ },
153
+ {
154
+ "epoch": 9.0,
155
+ "eval_loss": 0.6769014000892639,
156
+ "eval_runtime": 6.9508,
157
+ "eval_samples_per_second": 46.47,
158
+ "eval_steps_per_second": 11.653,
159
+ "step": 1638
160
+ },
161
+ {
162
+ "epoch": 9.56121045392022,
163
+ "grad_norm": 2.086437940597534,
164
+ "learning_rate": 1.934918160561185e-05,
165
+ "loss": 0.7578,
166
+ "step": 1740
167
+ },
168
+ {
169
+ "epoch": 10.0,
170
+ "eval_loss": 0.601751446723938,
171
+ "eval_runtime": 6.9024,
172
+ "eval_samples_per_second": 46.795,
173
+ "eval_steps_per_second": 11.735,
174
+ "step": 1820
175
+ },
176
+ {
177
+ "epoch": 10.357634112792297,
178
+ "grad_norm": 2.158360481262207,
179
+ "learning_rate": 1.6523772408417772e-05,
180
+ "loss": 0.6874,
181
+ "step": 1885
182
+ },
183
+ {
184
+ "epoch": 11.0,
185
+ "eval_loss": 0.5506384372711182,
186
+ "eval_runtime": 7.0206,
187
+ "eval_samples_per_second": 46.008,
188
+ "eval_steps_per_second": 11.537,
189
+ "step": 2002
190
+ },
191
+ {
192
+ "epoch": 11.154057771664375,
193
+ "grad_norm": 2.359311819076538,
194
+ "learning_rate": 1.3698363211223694e-05,
195
+ "loss": 0.6506,
196
+ "step": 2030
197
+ },
198
+ {
199
+ "epoch": 11.951856946354884,
200
+ "grad_norm": 2.4090545177459717,
201
+ "learning_rate": 1.0872954014029618e-05,
202
+ "loss": 0.6091,
203
+ "step": 2175
204
+ },
205
+ {
206
+ "epoch": 12.0,
207
+ "eval_loss": 0.5107725858688354,
208
+ "eval_runtime": 6.9921,
209
+ "eval_samples_per_second": 46.195,
210
+ "eval_steps_per_second": 11.584,
211
+ "step": 2184
212
+ },
213
+ {
214
+ "epoch": 12.74828060522696,
215
+ "grad_norm": 2.641254425048828,
216
+ "learning_rate": 8.047544816835542e-06,
217
+ "loss": 0.581,
218
+ "step": 2320
219
+ },
220
+ {
221
+ "epoch": 13.0,
222
+ "eval_loss": 0.48598870635032654,
223
+ "eval_runtime": 7.0084,
224
+ "eval_samples_per_second": 46.087,
225
+ "eval_steps_per_second": 11.557,
226
+ "step": 2366
227
+ },
228
+ {
229
+ "epoch": 13.544704264099037,
230
+ "grad_norm": 2.234863042831421,
231
+ "learning_rate": 5.222135619641466e-06,
232
+ "loss": 0.5598,
233
+ "step": 2465
234
+ },
235
+ {
236
+ "epoch": 14.0,
237
+ "eval_loss": 0.47001567482948303,
238
+ "eval_runtime": 6.9078,
239
+ "eval_samples_per_second": 46.759,
240
+ "eval_steps_per_second": 11.726,
241
+ "step": 2548
242
+ },
243
+ {
244
+ "epoch": 14.341127922971115,
245
+ "grad_norm": 2.0931003093719482,
246
+ "learning_rate": 2.396726422447389e-06,
247
+ "loss": 0.542,
248
+ "step": 2610
249
+ },
250
+ {
251
+ "epoch": 15.0,
252
+ "eval_loss": 0.4640800952911377,
253
+ "eval_runtime": 6.9334,
254
+ "eval_samples_per_second": 46.586,
255
+ "eval_steps_per_second": 11.683,
256
+ "step": 2730
257
+ }
258
+ ],
259
+ "logging_steps": 145,
260
+ "max_steps": 2730,
261
+ "num_input_tokens_seen": 0,
262
+ "num_train_epochs": 15,
263
+ "save_steps": 500,
264
+ "stateful_callbacks": {
265
+ "TrainerControl": {
266
+ "args": {
267
+ "should_epoch_stop": false,
268
+ "should_evaluate": false,
269
+ "should_log": false,
270
+ "should_save": true,
271
+ "should_training_stop": true
272
+ },
273
+ "attributes": {}
274
+ }
275
+ },
276
+ "total_flos": 1.138971967488e+16,
277
+ "train_batch_size": 4,
278
+ "trial_name": null,
279
+ "trial_params": null
280
+ }
checkpoint-2730/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7db1c3b396ab4647640791be8eb5e64ed0e1caa1b2e2d95e1461f28d187f1f6d
3
+ size 5841
checkpoint-2730/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "text-generation": {
31
+ "do_sample": true,
32
+ "max_length": 50
33
+ }
34
+ },
35
+ "transformers_version": "4.57.3",
36
+ "use_cache": true,
37
+ "vocab_size": 50257
38
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.57.3"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4746335bd649570cce06cb14d3637b472ecd72c73fbada2481eb85a4a61b28ed
3
+ size 497774208
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<|endoftext|>",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "50256": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ }
13
+ },
14
+ "bos_token": "<|endoftext|>",
15
+ "clean_up_tokenization_spaces": false,
16
+ "eos_token": "<|endoftext|>",
17
+ "errors": "replace",
18
+ "extra_special_tokens": {},
19
+ "model_max_length": 1024,
20
+ "pad_token": "<|endoftext|>",
21
+ "tokenizer_class": "GPT2Tokenizer",
22
+ "unk_token": "<|endoftext|>"
23
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff