Dening committed on
Commit
b5f4470
·
1 Parent(s): ac5f914

First model version

Browse files
checkpoint-109/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[PAD]": 50257
3
+ }
checkpoint-109/config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./rlaif/gpt2/summarization/tldr/exps/sft/2025-02-14 11:12:57.316850/checkpoint-1824",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2ForSequenceClassification"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "id2label": {
12
+ "0": "LABEL_0"
13
+ },
14
+ "initializer_range": 0.02,
15
+ "label2id": {
16
+ "LABEL_0": 0
17
+ },
18
+ "layer_norm_epsilon": 1e-05,
19
+ "model_type": "gpt2",
20
+ "n_ctx": 1024,
21
+ "n_embd": 1280,
22
+ "n_head": 20,
23
+ "n_inner": null,
24
+ "n_layer": 36,
25
+ "n_positions": 1024,
26
+ "pad_token_id": 50257,
27
+ "reorder_and_upcast_attn": false,
28
+ "resid_pdrop": 0.1,
29
+ "scale_attn_by_inverse_layer_idx": false,
30
+ "scale_attn_weights": true,
31
+ "summary_activation": null,
32
+ "summary_first_dropout": 0.1,
33
+ "summary_proj_to_labels": true,
34
+ "summary_type": "cls_index",
35
+ "summary_use_proj": true,
36
+ "task_specific_params": {
37
+ "text-generation": {
38
+ "do_sample": true,
39
+ "max_length": 50
40
+ }
41
+ },
42
+ "torch_dtype": "float32",
43
+ "transformers_version": "4.48.3",
44
+ "use_cache": false,
45
+ "vocab_size": 50258
46
+ }
checkpoint-109/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-109/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cec61238609e61e5bdc7e0b8ba15a538dc347f0e568f0c2358b5efb24c76242d
3
+ size 3096176240
checkpoint-109/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5fb7bfd61b86c6fa2e773b805cd400f6b20515853360822538be94c14179324
3
+ size 6192635305
checkpoint-109/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08aee9b47408c88c0160058be14e5ec56f680f6382b1beba43c04fa5eb38904a
3
+ size 14244
checkpoint-109/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96b5aae5390b1962f00771adfdcb7f413d4f45e990cdf8e60681aa0bc0feee09
3
+ size 1064
checkpoint-109/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
checkpoint-109/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-109/tokenizer_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "50257": {
13
+ "content": "[PAD]",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ }
20
+ },
21
+ "bos_token": "<|endoftext|>",
22
+ "clean_up_tokenization_spaces": false,
23
+ "eos_token": "<|endoftext|>",
24
+ "extra_special_tokens": {},
25
+ "model_max_length": 1024,
26
+ "pad_token": "[PAD]",
27
+ "padding_side": "right",
28
+ "tokenizer_class": "GPT2Tokenizer",
29
+ "unk_token": "<|endoftext|>"
30
+ }
checkpoint-109/trainer_state.json ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7077131258457375,
3
+ "best_model_checkpoint": "./rlaif/gpt2/summarization/tldr/exps/rm/2025-02-20_17-35-37.568836/checkpoint-40",
4
+ "epoch": 0.9931662870159453,
5
+ "eval_steps": 10,
6
+ "global_step": 109,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04555808656036447,
13
+ "grad_norm": 0.4330473244190216,
14
+ "learning_rate": 9.541284403669725e-05,
15
+ "loss": 0.6941,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.09111617312072894,
20
+ "grad_norm": 0.43270525336265564,
21
+ "learning_rate": 9.08256880733945e-05,
22
+ "loss": 0.6853,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.09111617312072894,
27
+ "eval_accuracy": 0.6617050067658998,
28
+ "eval_loss": 0.6401128768920898,
29
+ "eval_runtime": 107.2999,
30
+ "eval_samples_per_second": 6.887,
31
+ "eval_steps_per_second": 0.69,
32
+ "step": 10
33
+ },
34
+ {
35
+ "epoch": 0.1366742596810934,
36
+ "grad_norm": 0.873494565486908,
37
+ "learning_rate": 8.623853211009176e-05,
38
+ "loss": 0.6626,
39
+ "step": 15
40
+ },
41
+ {
42
+ "epoch": 0.18223234624145787,
43
+ "grad_norm": 1.3137083053588867,
44
+ "learning_rate": 8.165137614678899e-05,
45
+ "loss": 0.6088,
46
+ "step": 20
47
+ },
48
+ {
49
+ "epoch": 0.18223234624145787,
50
+ "eval_accuracy": 0.6698240866035182,
51
+ "eval_loss": 0.5892247557640076,
52
+ "eval_runtime": 107.3781,
53
+ "eval_samples_per_second": 6.882,
54
+ "eval_steps_per_second": 0.689,
55
+ "step": 20
56
+ },
57
+ {
58
+ "epoch": 0.22779043280182232,
59
+ "grad_norm": 0.799824595451355,
60
+ "learning_rate": 7.706422018348625e-05,
61
+ "loss": 0.6396,
62
+ "step": 25
63
+ },
64
+ {
65
+ "epoch": 0.2733485193621868,
66
+ "grad_norm": 0.9413164854049683,
67
+ "learning_rate": 7.247706422018348e-05,
68
+ "loss": 0.6112,
69
+ "step": 30
70
+ },
71
+ {
72
+ "epoch": 0.2733485193621868,
73
+ "eval_accuracy": 0.7009472259810555,
74
+ "eval_loss": 0.5853174924850464,
75
+ "eval_runtime": 107.3138,
76
+ "eval_samples_per_second": 6.886,
77
+ "eval_steps_per_second": 0.69,
78
+ "step": 30
79
+ },
80
+ {
81
+ "epoch": 0.31890660592255127,
82
+ "grad_norm": 0.8462890386581421,
83
+ "learning_rate": 6.788990825688074e-05,
84
+ "loss": 0.6252,
85
+ "step": 35
86
+ },
87
+ {
88
+ "epoch": 0.36446469248291574,
89
+ "grad_norm": 0.8693629503250122,
90
+ "learning_rate": 6.330275229357799e-05,
91
+ "loss": 0.6181,
92
+ "step": 40
93
+ },
94
+ {
95
+ "epoch": 0.36446469248291574,
96
+ "eval_accuracy": 0.7077131258457375,
97
+ "eval_loss": 0.5807654857635498,
98
+ "eval_runtime": 107.3736,
99
+ "eval_samples_per_second": 6.883,
100
+ "eval_steps_per_second": 0.689,
101
+ "step": 40
102
+ },
103
+ {
104
+ "epoch": 0.41002277904328016,
105
+ "grad_norm": 1.0260130167007446,
106
+ "learning_rate": 5.8715596330275236e-05,
107
+ "loss": 0.6198,
108
+ "step": 45
109
+ },
110
+ {
111
+ "epoch": 0.45558086560364464,
112
+ "grad_norm": 0.9564087986946106,
113
+ "learning_rate": 5.4128440366972475e-05,
114
+ "loss": 0.6075,
115
+ "step": 50
116
+ },
117
+ {
118
+ "epoch": 0.45558086560364464,
119
+ "eval_accuracy": 0.6901217861975643,
120
+ "eval_loss": 0.5816063284873962,
121
+ "eval_runtime": 107.3358,
122
+ "eval_samples_per_second": 6.885,
123
+ "eval_steps_per_second": 0.689,
124
+ "step": 50
125
+ },
126
+ {
127
+ "epoch": 0.5011389521640092,
128
+ "grad_norm": 0.8393993377685547,
129
+ "learning_rate": 4.954128440366973e-05,
130
+ "loss": 0.6224,
131
+ "step": 55
132
+ },
133
+ {
134
+ "epoch": 0.5466970387243736,
135
+ "grad_norm": 0.8436751365661621,
136
+ "learning_rate": 4.4954128440366975e-05,
137
+ "loss": 0.6031,
138
+ "step": 60
139
+ },
140
+ {
141
+ "epoch": 0.5466970387243736,
142
+ "eval_accuracy": 0.6792963464140731,
143
+ "eval_loss": 0.5921587347984314,
144
+ "eval_runtime": 107.3803,
145
+ "eval_samples_per_second": 6.882,
146
+ "eval_steps_per_second": 0.689,
147
+ "step": 60
148
+ },
149
+ {
150
+ "epoch": 0.592255125284738,
151
+ "grad_norm": 1.0020941495895386,
152
+ "learning_rate": 4.036697247706422e-05,
153
+ "loss": 0.5905,
154
+ "step": 65
155
+ },
156
+ {
157
+ "epoch": 0.6378132118451025,
158
+ "grad_norm": 1.0870941877365112,
159
+ "learning_rate": 3.5779816513761474e-05,
160
+ "loss": 0.6065,
161
+ "step": 70
162
+ },
163
+ {
164
+ "epoch": 0.6378132118451025,
165
+ "eval_accuracy": 0.6941813261163735,
166
+ "eval_loss": 0.5733721256256104,
167
+ "eval_runtime": 107.4068,
168
+ "eval_samples_per_second": 6.88,
169
+ "eval_steps_per_second": 0.689,
170
+ "step": 70
171
+ },
172
+ {
173
+ "epoch": 0.683371298405467,
174
+ "grad_norm": 0.9016109108924866,
175
+ "learning_rate": 3.119266055045872e-05,
176
+ "loss": 0.5989,
177
+ "step": 75
178
+ },
179
+ {
180
+ "epoch": 0.7289293849658315,
181
+ "grad_norm": 0.984087347984314,
182
+ "learning_rate": 2.6605504587155967e-05,
183
+ "loss": 0.5793,
184
+ "step": 80
185
+ },
186
+ {
187
+ "epoch": 0.7289293849658315,
188
+ "eval_accuracy": 0.6874154262516915,
189
+ "eval_loss": 0.5740049481391907,
190
+ "eval_runtime": 107.3709,
191
+ "eval_samples_per_second": 6.883,
192
+ "eval_steps_per_second": 0.689,
193
+ "step": 80
194
+ },
195
+ {
196
+ "epoch": 0.7744874715261959,
197
+ "grad_norm": 1.2113629579544067,
198
+ "learning_rate": 2.2018348623853213e-05,
199
+ "loss": 0.6008,
200
+ "step": 85
201
+ },
202
+ {
203
+ "epoch": 0.8200455580865603,
204
+ "grad_norm": 1.0072749853134155,
205
+ "learning_rate": 1.743119266055046e-05,
206
+ "loss": 0.6114,
207
+ "step": 90
208
+ },
209
+ {
210
+ "epoch": 0.8200455580865603,
211
+ "eval_accuracy": 0.6725304465493911,
212
+ "eval_loss": 0.5750434398651123,
213
+ "eval_runtime": 107.3678,
214
+ "eval_samples_per_second": 6.883,
215
+ "eval_steps_per_second": 0.689,
216
+ "step": 90
217
+ },
218
+ {
219
+ "epoch": 0.8656036446469249,
220
+ "grad_norm": 0.865993320941925,
221
+ "learning_rate": 1.2844036697247708e-05,
222
+ "loss": 0.5915,
223
+ "step": 95
224
+ },
225
+ {
226
+ "epoch": 0.9111617312072893,
227
+ "grad_norm": 0.8444318175315857,
228
+ "learning_rate": 8.256880733944954e-06,
229
+ "loss": 0.6075,
230
+ "step": 100
231
+ },
232
+ {
233
+ "epoch": 0.9111617312072893,
234
+ "eval_accuracy": 0.6928281461434371,
235
+ "eval_loss": 0.57265305519104,
236
+ "eval_runtime": 107.4085,
237
+ "eval_samples_per_second": 6.88,
238
+ "eval_steps_per_second": 0.689,
239
+ "step": 100
240
+ },
241
+ {
242
+ "epoch": 0.9567198177676538,
243
+ "grad_norm": 0.8599985241889954,
244
+ "learning_rate": 3.6697247706422022e-06,
245
+ "loss": 0.5882,
246
+ "step": 105
247
+ }
248
+ ],
249
+ "logging_steps": 5,
250
+ "max_steps": 109,
251
+ "num_input_tokens_seen": 0,
252
+ "num_train_epochs": 1,
253
+ "save_steps": 50,
254
+ "stateful_callbacks": {
255
+ "TrainerControl": {
256
+ "args": {
257
+ "should_epoch_stop": false,
258
+ "should_evaluate": false,
259
+ "should_log": false,
260
+ "should_save": true,
261
+ "should_training_stop": true
262
+ },
263
+ "attributes": {}
264
+ }
265
+ },
266
+ "total_flos": 0.0,
267
+ "train_batch_size": 32,
268
+ "trial_name": null,
269
+ "trial_params": null
270
+ }
checkpoint-109/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab35ffacc44f83c47fadd882d00e580571cb9dc0685c52e2220bea5cec978628
3
+ size 5560
checkpoint-109/vocab.json ADDED
The diff for this file is too large to render. See raw diff