shorecode commited on
Commit
7dee52f
·
verified ·
1 Parent(s): 42230a4

Upload folder using huggingface_hub

Browse files
checkpoint-latest/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec5e19390f915af7d82e4e4d4ab59cbb91dfd4e2451e13a4fd5b2fcb2756dcd2
3
  size 62293080
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:393c31c029be3e0c558e9b43e5093997ec4c94e8cc3df4239769190abffdda69
3
  size 62293080
checkpoint-latest/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f08646ac1adb0510b1dd9481036497e5ccbd6dc461543f2ea2ce32d8d76f3f42
3
  size 124642443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e0467a1e128a83101d757df9edbc49229825b87355d89ca5c115347fdffcc60
3
  size 124642443
checkpoint-latest/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b0ec4d2188868fd24263efa2856258953fca7ad21aed2b50e22b491f1d8939f
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8442053f994922dc69efe15ac7d6938fd15f0fd7a705fec6122ab91041dc1f14
3
  size 14645
checkpoint-latest/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a902ca58dea28be10847ac21293e6d27c44fc74bd49d763b881d90cbd1e58f0a
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85ab8cc1fdedbccfbfeb8687bbad9305fda8976259d4a6ffe9a48f328a2c592d
3
  size 1465
checkpoint-latest/tokenizer.json CHANGED
@@ -1,21 +1,7 @@
1
  {
2
  "version": "1.0",
3
- "truncation": {
4
- "direction": "Right",
5
- "max_length": 512,
6
- "strategy": "LongestFirst",
7
- "stride": 0
8
- },
9
- "padding": {
10
- "strategy": {
11
- "Fixed": 512
12
- },
13
- "direction": "Right",
14
- "pad_to_multiple_of": null,
15
- "pad_id": 0,
16
- "pad_type_id": 0,
17
- "pad_token": "<pad>"
18
- },
19
  "added_tokens": [
20
  {
21
  "id": 0,
 
1
  {
2
  "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
checkpoint-latest/tokenizer_config.json CHANGED
@@ -932,16 +932,9 @@
932
  "eos_token": "</s>",
933
  "extra_ids": 100,
934
  "extra_special_tokens": {},
935
- "max_length": 512,
936
  "model_max_length": 1000000000000000019884624838656,
937
- "pad_to_multiple_of": null,
938
  "pad_token": "<pad>",
939
- "pad_token_type_id": 0,
940
- "padding_side": "right",
941
  "sp_model_kwargs": {},
942
- "stride": 0,
943
  "tokenizer_class": "T5Tokenizer",
944
- "truncation_side": "right",
945
- "truncation_strategy": "longest_first",
946
  "unk_token": "<unk>"
947
  }
 
932
  "eos_token": "</s>",
933
  "extra_ids": 100,
934
  "extra_special_tokens": {},
 
935
  "model_max_length": 1000000000000000019884624838656,
 
936
  "pad_token": "<pad>",
 
 
937
  "sp_model_kwargs": {},
 
938
  "tokenizer_class": "T5Tokenizer",
 
 
939
  "unk_token": "<unk>"
940
  }
checkpoint-latest/trainer_state.json CHANGED
@@ -2,304 +2,60 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.7038535984515221,
6
  "eval_steps": 500,
7
- "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.021995424951610065,
14
- "grad_norm": 0.7317402958869934,
15
- "learning_rate": 0.00029781805384480024,
16
- "loss": 3.1594,
17
- "step": 125
18
- },
19
- {
20
- "epoch": 0.04399084990322013,
21
- "grad_norm": 0.7342692613601685,
22
- "learning_rate": 0.00029561851134963927,
23
- "loss": 3.1518,
24
- "step": 250
25
- },
26
- {
27
- "epoch": 0.06598627485483019,
28
- "grad_norm": 0.6049332022666931,
29
- "learning_rate": 0.00029341896885447824,
30
- "loss": 3.1479,
31
- "step": 375
32
- },
33
- {
34
- "epoch": 0.08798169980644026,
35
- "grad_norm": 0.7177291512489319,
36
- "learning_rate": 0.0002912194263593172,
37
- "loss": 3.1586,
38
- "step": 500
39
- },
40
- {
41
- "epoch": 0.08798169980644026,
42
- "eval_loss": 2.6734766960144043,
43
- "eval_runtime": 59.2675,
44
- "eval_samples_per_second": 166.196,
45
- "eval_steps_per_second": 6.395,
46
- "step": 500
47
- },
48
- {
49
- "epoch": 0.10997712475805033,
50
- "grad_norm": 0.7241224050521851,
51
- "learning_rate": 0.00028901988386415624,
52
- "loss": 3.1257,
53
- "step": 625
54
- },
55
- {
56
- "epoch": 0.13197254970966038,
57
- "grad_norm": 1.1103954315185547,
58
- "learning_rate": 0.0002868203413689952,
59
- "loss": 3.1179,
60
- "step": 750
61
- },
62
- {
63
- "epoch": 0.15396797466127046,
64
- "grad_norm": 0.7277866005897522,
65
- "learning_rate": 0.00028462079887383424,
66
- "loss": 3.1451,
67
- "step": 875
68
- },
69
- {
70
- "epoch": 0.17596339961288052,
71
- "grad_norm": 0.7038848400115967,
72
- "learning_rate": 0.0002824212563786732,
73
- "loss": 3.1223,
74
- "step": 1000
75
- },
76
- {
77
- "epoch": 0.17596339961288052,
78
- "eval_loss": 2.664580821990967,
79
- "eval_runtime": 59.2093,
80
- "eval_samples_per_second": 166.359,
81
- "eval_steps_per_second": 6.401,
82
- "step": 1000
83
- },
84
- {
85
- "epoch": 0.1979588245644906,
86
- "grad_norm": 0.7378321290016174,
87
- "learning_rate": 0.0002802217138835122,
88
- "loss": 3.1179,
89
- "step": 1125
90
- },
91
- {
92
- "epoch": 0.21995424951610065,
93
- "grad_norm": 0.861381471157074,
94
- "learning_rate": 0.0002780221713883512,
95
- "loss": 3.103,
96
- "step": 1250
97
- },
98
- {
99
- "epoch": 0.2419496744677107,
100
- "grad_norm": 0.8002694249153137,
101
- "learning_rate": 0.0002758226288931902,
102
- "loss": 3.1028,
103
- "step": 1375
104
- },
105
- {
106
- "epoch": 0.26394509941932076,
107
- "grad_norm": 0.6079952120780945,
108
- "learning_rate": 0.0002736230863980292,
109
- "loss": 3.0921,
110
- "step": 1500
111
- },
112
- {
113
- "epoch": 0.26394509941932076,
114
- "eval_loss": 2.646458864212036,
115
- "eval_runtime": 59.2373,
116
- "eval_samples_per_second": 166.28,
117
- "eval_steps_per_second": 6.398,
118
- "step": 1500
119
- },
120
- {
121
- "epoch": 0.28594052437093087,
122
- "grad_norm": 0.9203604459762573,
123
- "learning_rate": 0.0002714235439028682,
124
- "loss": 3.1046,
125
- "step": 1625
126
- },
127
- {
128
- "epoch": 0.3079359493225409,
129
- "grad_norm": 1.0269505977630615,
130
- "learning_rate": 0.00026922400140770716,
131
- "loss": 3.0836,
132
- "step": 1750
133
- },
134
- {
135
- "epoch": 0.329931374274151,
136
- "grad_norm": 0.7157150506973267,
137
- "learning_rate": 0.0002670244589125462,
138
- "loss": 3.095,
139
- "step": 1875
140
- },
141
- {
142
- "epoch": 0.35192679922576103,
143
- "grad_norm": 0.6990401148796082,
144
- "learning_rate": 0.00026482491641738516,
145
- "loss": 3.083,
146
- "step": 2000
147
- },
148
- {
149
- "epoch": 0.35192679922576103,
150
- "eval_loss": 2.6299610137939453,
151
- "eval_runtime": 58.9681,
152
- "eval_samples_per_second": 167.039,
153
- "eval_steps_per_second": 6.427,
154
- "step": 2000
155
- },
156
- {
157
- "epoch": 0.3739222241773711,
158
- "grad_norm": 0.7473997473716736,
159
- "learning_rate": 0.00026262537392222413,
160
- "loss": 3.072,
161
- "step": 2125
162
- },
163
- {
164
- "epoch": 0.3959176491289812,
165
- "grad_norm": 0.8357605338096619,
166
- "learning_rate": 0.00026042583142706316,
167
- "loss": 3.073,
168
- "step": 2250
169
- },
170
- {
171
- "epoch": 0.41791307408059125,
172
- "grad_norm": 0.6772239804267883,
173
- "learning_rate": 0.00025822628893190213,
174
- "loss": 3.0631,
175
- "step": 2375
176
- },
177
- {
178
- "epoch": 0.4399084990322013,
179
- "grad_norm": 0.8163031339645386,
180
- "learning_rate": 0.00025602674643674116,
181
- "loss": 3.0659,
182
- "step": 2500
183
- },
184
- {
185
- "epoch": 0.4399084990322013,
186
- "eval_loss": 2.623256206512451,
187
- "eval_runtime": 59.1416,
188
- "eval_samples_per_second": 166.55,
189
- "eval_steps_per_second": 6.408,
190
- "step": 2500
191
- },
192
- {
193
- "epoch": 0.46190392398381136,
194
- "grad_norm": 0.7648818492889404,
195
- "learning_rate": 0.00025382720394158013,
196
- "loss": 3.0765,
197
- "step": 2625
198
- },
199
- {
200
- "epoch": 0.4838993489354214,
201
- "grad_norm": 0.8686987161636353,
202
- "learning_rate": 0.0002516276614464191,
203
- "loss": 3.075,
204
- "step": 2750
205
- },
206
- {
207
- "epoch": 0.5058947738870315,
208
- "grad_norm": 0.721097469329834,
209
- "learning_rate": 0.00024942811895125813,
210
- "loss": 3.1497,
211
- "step": 2875
212
- },
213
- {
214
- "epoch": 0.5278901988386415,
215
- "grad_norm": 0.0,
216
- "learning_rate": 0.0002472285764560971,
217
- "loss": 3.9227,
218
- "step": 3000
219
- },
220
- {
221
- "epoch": 0.5278901988386415,
222
- "eval_loss": 3.1224141120910645,
223
- "eval_runtime": 59.1848,
224
- "eval_samples_per_second": 166.428,
225
- "eval_steps_per_second": 6.404,
226
- "step": 3000
227
- },
228
- {
229
- "epoch": 0.5498856237902516,
230
- "grad_norm": 0.0,
231
- "learning_rate": 0.0002450290339609361,
232
- "loss": 4.0045,
233
- "step": 3125
234
- },
235
- {
236
- "epoch": 0.5718810487418617,
237
- "grad_norm": NaN,
238
- "learning_rate": 0.00024282949146577507,
239
- "loss": 4.0077,
240
- "step": 3250
241
- },
242
- {
243
- "epoch": 0.5938764736934717,
244
- "grad_norm": NaN,
245
- "learning_rate": 0.0002406299489706141,
246
- "loss": 3.5634,
247
- "step": 3375
248
- },
249
- {
250
- "epoch": 0.6158718986450818,
251
  "grad_norm": NaN,
252
- "learning_rate": 0.0002384304064754531,
253
  "loss": 0.0,
254
- "step": 3500
255
- },
256
- {
257
- "epoch": 0.6158718986450818,
258
- "eval_loss": NaN,
259
- "eval_runtime": 58.2581,
260
- "eval_samples_per_second": 169.075,
261
- "eval_steps_per_second": 6.506,
262
- "step": 3500
263
  },
264
  {
265
- "epoch": 0.6378673235966918,
266
  "grad_norm": NaN,
267
- "learning_rate": 0.00023623086398029207,
268
  "loss": 0.0,
269
- "step": 3625
270
  },
271
  {
272
- "epoch": 0.659862748548302,
273
- "grad_norm": NaN,
274
- "learning_rate": 0.00023403132148513107,
275
- "loss": 0.0,
276
- "step": 3750
 
277
  },
278
  {
279
- "epoch": 0.6818581734999121,
280
  "grad_norm": NaN,
281
- "learning_rate": 0.00023183177898997007,
282
  "loss": 0.0,
283
- "step": 3875
284
  },
285
  {
286
- "epoch": 0.7038535984515221,
287
  "grad_norm": NaN,
288
- "learning_rate": 0.00022963223649480905,
289
  "loss": 0.0,
290
- "step": 4000
291
  },
292
  {
293
- "epoch": 0.7038535984515221,
294
  "eval_loss": NaN,
295
- "eval_runtime": 58.3487,
296
- "eval_samples_per_second": 168.813,
297
- "eval_steps_per_second": 6.495,
298
- "step": 4000
299
  }
300
  ],
301
- "logging_steps": 125,
302
- "max_steps": 17049,
303
  "num_input_tokens_seen": 0,
304
  "num_train_epochs": 3,
305
  "save_steps": 1000,
@@ -315,8 +71,8 @@
315
  "attributes": {}
316
  }
317
  },
318
- "total_flos": 2346851500032000.0,
319
- "train_batch_size": 26,
320
  "trial_name": null,
321
  "trial_params": null
322
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.18950161076369149,
6
  "eval_steps": 500,
7
+ "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.04263786242183058,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  "grad_norm": NaN,
15
+ "learning_rate": 0.0002957551639188933,
16
  "loss": 0.0,
17
+ "step": 225
 
 
 
 
 
 
 
 
18
  },
19
  {
20
+ "epoch": 0.08527572484366117,
21
  "grad_norm": NaN,
22
+ "learning_rate": 0.0002914913776767102,
23
  "loss": 0.0,
24
+ "step": 450
25
  },
26
  {
27
+ "epoch": 0.09475080538184574,
28
+ "eval_loss": NaN,
29
+ "eval_runtime": 58.7663,
30
+ "eval_samples_per_second": 167.613,
31
+ "eval_steps_per_second": 5.99,
32
+ "step": 500
33
  },
34
  {
35
+ "epoch": 0.12791358726549176,
36
  "grad_norm": NaN,
37
+ "learning_rate": 0.0002872275914345272,
38
  "loss": 0.0,
39
+ "step": 675
40
  },
41
  {
42
+ "epoch": 0.17055144968732233,
43
  "grad_norm": NaN,
44
+ "learning_rate": 0.0002829638051923441,
45
  "loss": 0.0,
46
+ "step": 900
47
  },
48
  {
49
+ "epoch": 0.18950161076369149,
50
  "eval_loss": NaN,
51
+ "eval_runtime": 58.8198,
52
+ "eval_samples_per_second": 167.46,
53
+ "eval_steps_per_second": 5.984,
54
+ "step": 1000
55
  }
56
  ],
57
+ "logging_steps": 225,
58
+ "max_steps": 15831,
59
  "num_input_tokens_seen": 0,
60
  "num_train_epochs": 3,
61
  "save_steps": 1000,
 
71
  "attributes": {}
72
  }
73
  },
74
+ "total_flos": 631844634624000.0,
75
+ "train_batch_size": 28,
76
  "trial_name": null,
77
  "trial_params": null
78
  }
checkpoint-latest/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0bef956c560dcdc3bac6492fdc576c6dff35538e184d8d9734adbb5fe3c9b01e
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:669a48aa09ae0e5107616885177fad4ad55736c6332be20bb6f916b99a7e1ab7
3
  size 6033