Sabbir772 commited on
Commit
2bb2506
·
verified ·
1 Parent(s): 2627386

Training in progress, epoch 1, checkpoint

Browse files
last-checkpoint/added_tokens.json CHANGED
@@ -2,6 +2,7 @@
2
  "<BH>": 32102,
3
  "<BN>": 32100,
4
  "<CH>": 32101,
 
5
  "<extra_id_0>": 32099,
6
  "<extra_id_10>": 32089,
7
  "<extra_id_11>": 32088,
 
2
  "<BH>": 32102,
3
  "<BN>": 32100,
4
  "<CH>": 32101,
5
+ "<SY>": 32103,
6
  "<extra_id_0>": 32099,
7
  "<extra_id_10>": 32089,
8
  "<extra_id_11>": 32088,
last-checkpoint/config.json CHANGED
@@ -28,5 +28,5 @@
28
  "tie_word_embeddings": false,
29
  "transformers_version": "4.57.1",
30
  "use_cache": true,
31
- "vocab_size": 32103
32
  }
 
28
  "tie_word_embeddings": false,
29
  "transformers_version": "4.57.1",
30
  "use_cache": true,
31
+ "vocab_size": 32104
32
  }
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c4dbbd07ff37198e380878eb94bf5e5f99d2589a40d0e914b57f45927127f2e
3
- size 990191464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe2a15bf99c0f458b2206d6f35220f4c958e95e187565eba49da4fb8564ae369
3
+ size 990197608
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ce0547f9ddfa37097a60fda229647d24bb9e099009591e12b9f63d93c447f83
3
- size 1980557579
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c91c5b91e574bf8d32168fe23a0d8f97d2c0d7a1ad5ad3fe64951d9602e1bdc7
3
+ size 1980569867
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fed951d50f20e88aed1c5d289b7872841507ec83be8fd1c47fabf58dc50ace96
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2276a80239d2bec0f27af2510173a92b0a5242a76a5b11dff11d2bba9784d26
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eeeaf3e3e3934f426458af741488045385cfffe6a8596ba5f06df4561656e614
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa9bf41974fd17e7e1e3fb4258cf1a9ed2a23fdde64d539be88a5a088f814ff5
3
  size 1465
last-checkpoint/special_tokens_map.json CHANGED
@@ -7,12 +7,26 @@
7
  "rstrip": false,
8
  "single_word": false
9
  },
 
 
 
 
 
 
 
10
  {
11
  "content": "<BH>",
12
  "lstrip": false,
13
  "normalized": false,
14
  "rstrip": false,
15
  "single_word": false
 
 
 
 
 
 
 
16
  }
17
  ],
18
  "eos_token": {
 
7
  "rstrip": false,
8
  "single_word": false
9
  },
10
+ {
11
+ "content": "<CH>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
  {
18
  "content": "<BH>",
19
  "lstrip": false,
20
  "normalized": false,
21
  "rstrip": false,
22
  "single_word": false
23
+ },
24
+ {
25
+ "content": "<SY>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
  }
31
  ],
32
  "eos_token": {
last-checkpoint/tokenizer_config.json CHANGED
@@ -848,11 +848,21 @@
848
  "rstrip": false,
849
  "single_word": false,
850
  "special": true
 
 
 
 
 
 
 
 
851
  }
852
  },
853
  "additional_special_tokens": [
854
  "<BN>",
855
- "<BH>"
 
 
856
  ],
857
  "clean_up_tokenization_spaces": false,
858
  "eos_token": "</s>",
 
848
  "rstrip": false,
849
  "single_word": false,
850
  "special": true
851
+ },
852
+ "32103": {
853
+ "content": "<SY>",
854
+ "lstrip": false,
855
+ "normalized": false,
856
+ "rstrip": false,
857
+ "single_word": false,
858
+ "special": true
859
  }
860
  },
861
  "additional_special_tokens": [
862
  "<BN>",
863
+ "<CH>",
864
+ "<BH>",
865
+ "<SY>"
866
  ],
867
  "clean_up_tokenization_spaces": false,
868
  "eos_token": "</s>",
last-checkpoint/trainer_state.json CHANGED
@@ -4,108 +4,388 @@
4
  "best_model_checkpoint": null,
5
  "epoch": 1.0,
6
  "eval_steps": 500,
7
- "global_step": 1235,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.08097165991902834,
14
- "grad_norm": NaN,
15
- "learning_rate": 2.987975708502024e-05,
16
- "loss": 0.0,
17
  "step": 100
18
  },
19
  {
20
- "epoch": 0.16194331983805668,
21
- "grad_norm": NaN,
22
- "learning_rate": 2.9758299595141703e-05,
23
- "loss": 0.0,
24
  "step": 200
25
  },
26
  {
27
- "epoch": 0.242914979757085,
28
- "grad_norm": NaN,
29
- "learning_rate": 2.9636842105263158e-05,
30
- "loss": 0.0,
31
  "step": 300
32
  },
33
  {
34
- "epoch": 0.32388663967611336,
35
- "grad_norm": NaN,
36
- "learning_rate": 2.9515384615384617e-05,
37
- "loss": 0.0,
38
  "step": 400
39
  },
40
  {
41
- "epoch": 0.4048582995951417,
42
- "grad_norm": NaN,
43
- "learning_rate": 2.9393927125506075e-05,
44
- "loss": 0.0,
45
  "step": 500
46
  },
47
  {
48
- "epoch": 0.48582995951417,
49
- "grad_norm": NaN,
50
- "learning_rate": 2.927246963562753e-05,
51
- "loss": 0.0,
52
  "step": 600
53
  },
54
  {
55
- "epoch": 0.5668016194331984,
56
- "grad_norm": NaN,
57
- "learning_rate": 2.915101214574899e-05,
58
- "loss": 0.0,
59
  "step": 700
60
  },
61
  {
62
- "epoch": 0.6477732793522267,
63
- "grad_norm": NaN,
64
- "learning_rate": 2.9029554655870447e-05,
65
- "loss": 0.0,
66
  "step": 800
67
  },
68
  {
69
- "epoch": 0.728744939271255,
70
- "grad_norm": NaN,
71
- "learning_rate": 2.8908097165991902e-05,
72
- "loss": 0.0,
73
  "step": 900
74
  },
75
  {
76
- "epoch": 0.8097165991902834,
77
- "grad_norm": NaN,
78
- "learning_rate": 2.878663967611336e-05,
79
- "loss": 0.0,
80
  "step": 1000
81
  },
82
  {
83
- "epoch": 0.8906882591093117,
84
- "grad_norm": NaN,
85
- "learning_rate": 2.866518218623482e-05,
86
- "loss": 0.0,
87
  "step": 1100
88
  },
89
  {
90
- "epoch": 0.97165991902834,
91
- "grad_norm": NaN,
92
- "learning_rate": 2.8543724696356277e-05,
93
- "loss": 0.0,
94
  "step": 1200
95
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  {
97
  "epoch": 1.0,
98
- "eval_loss": NaN,
99
- "eval_runtime": 27.9209,
100
- "eval_samples_per_second": 48.243,
101
- "eval_steps_per_second": 6.053,
102
- "step": 1235
103
  }
104
  ],
105
  "logging_steps": 100,
106
- "max_steps": 24700,
107
  "num_input_tokens_seen": 0,
108
- "num_train_epochs": 20,
109
  "save_steps": 500,
110
  "stateful_callbacks": {
111
  "TrainerControl": {
@@ -119,7 +399,7 @@
119
  "attributes": {}
120
  }
121
  },
122
- "total_flos": 3380013503741952.0,
123
  "train_batch_size": 8,
124
  "trial_name": null,
125
  "trial_params": null
 
4
  "best_model_checkpoint": null,
5
  "epoch": 1.0,
6
  "eval_steps": 500,
7
+ "global_step": 5241,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.019080328181644724,
14
+ "grad_norm": 3213.42529296875,
15
+ "learning_rate": 4.990555237550087e-05,
16
+ "loss": 15.0258,
17
  "step": 100
18
  },
19
  {
20
+ "epoch": 0.03816065636328945,
21
+ "grad_norm": 57.823360443115234,
22
+ "learning_rate": 4.981015073459264e-05,
23
+ "loss": 10.8207,
24
  "step": 200
25
  },
26
  {
27
+ "epoch": 0.057240984544934176,
28
+ "grad_norm": 21.06822967529297,
29
+ "learning_rate": 4.971474909368441e-05,
30
+ "loss": 8.1947,
31
  "step": 300
32
  },
33
  {
34
+ "epoch": 0.0763213127265789,
35
+ "grad_norm": 25.405773162841797,
36
+ "learning_rate": 4.9619347452776186e-05,
37
+ "loss": 6.9094,
38
  "step": 400
39
  },
40
  {
41
+ "epoch": 0.09540164090822362,
42
+ "grad_norm": 10.772579193115234,
43
+ "learning_rate": 4.9523945811867966e-05,
44
+ "loss": 6.1739,
45
  "step": 500
46
  },
47
  {
48
+ "epoch": 0.11448196908986835,
49
+ "grad_norm": 7.486720085144043,
50
+ "learning_rate": 4.9428544170959746e-05,
51
+ "loss": 5.7799,
52
  "step": 600
53
  },
54
  {
55
+ "epoch": 0.13356229727151306,
56
+ "grad_norm": 17.71492576599121,
57
+ "learning_rate": 4.933314253005152e-05,
58
+ "loss": 5.4888,
59
  "step": 700
60
  },
61
  {
62
+ "epoch": 0.1526426254531578,
63
+ "grad_norm": 7.538534164428711,
64
+ "learning_rate": 4.92377408891433e-05,
65
+ "loss": 5.1287,
66
  "step": 800
67
  },
68
  {
69
+ "epoch": 0.17172295363480253,
70
+ "grad_norm": 4.852605819702148,
71
+ "learning_rate": 4.914233924823507e-05,
72
+ "loss": 5.0529,
73
  "step": 900
74
  },
75
  {
76
+ "epoch": 0.19080328181644723,
77
+ "grad_norm": 13.604716300964355,
78
+ "learning_rate": 4.9046937607326846e-05,
79
+ "loss": 4.918,
80
  "step": 1000
81
  },
82
  {
83
+ "epoch": 0.20988360999809197,
84
+ "grad_norm": 7.916113376617432,
85
+ "learning_rate": 4.8951535966418626e-05,
86
+ "loss": 4.676,
87
  "step": 1100
88
  },
89
  {
90
+ "epoch": 0.2289639381797367,
91
+ "grad_norm": 9.100526809692383,
92
+ "learning_rate": 4.88561343255104e-05,
93
+ "loss": 4.5912,
94
  "step": 1200
95
  },
96
+ {
97
+ "epoch": 0.2480442663613814,
98
+ "grad_norm": 7.34255313873291,
99
+ "learning_rate": 4.876073268460218e-05,
100
+ "loss": 4.58,
101
+ "step": 1300
102
+ },
103
+ {
104
+ "epoch": 0.2671245945430261,
105
+ "grad_norm": 11.250228881835938,
106
+ "learning_rate": 4.866533104369395e-05,
107
+ "loss": 4.3254,
108
+ "step": 1400
109
+ },
110
+ {
111
+ "epoch": 0.28620492272467085,
112
+ "grad_norm": 10.732246398925781,
113
+ "learning_rate": 4.856992940278573e-05,
114
+ "loss": 4.2089,
115
+ "step": 1500
116
+ },
117
+ {
118
+ "epoch": 0.3052852509063156,
119
+ "grad_norm": 5.343315124511719,
120
+ "learning_rate": 4.847452776187751e-05,
121
+ "loss": 4.2465,
122
+ "step": 1600
123
+ },
124
+ {
125
+ "epoch": 0.3243655790879603,
126
+ "grad_norm": 4.741443157196045,
127
+ "learning_rate": 4.837912612096928e-05,
128
+ "loss": 4.0806,
129
+ "step": 1700
130
+ },
131
+ {
132
+ "epoch": 0.34344590726960506,
133
+ "grad_norm": 5.156026363372803,
134
+ "learning_rate": 4.828372448006106e-05,
135
+ "loss": 3.9851,
136
+ "step": 1800
137
+ },
138
+ {
139
+ "epoch": 0.36252623545124973,
140
+ "grad_norm": 5.886138916015625,
141
+ "learning_rate": 4.818832283915283e-05,
142
+ "loss": 3.9438,
143
+ "step": 1900
144
+ },
145
+ {
146
+ "epoch": 0.38160656363289447,
147
+ "grad_norm": 8.471810340881348,
148
+ "learning_rate": 4.809292119824461e-05,
149
+ "loss": 3.9255,
150
+ "step": 2000
151
+ },
152
+ {
153
+ "epoch": 0.4006868918145392,
154
+ "grad_norm": 6.079995155334473,
155
+ "learning_rate": 4.799751955733639e-05,
156
+ "loss": 3.8405,
157
+ "step": 2100
158
+ },
159
+ {
160
+ "epoch": 0.41976721999618394,
161
+ "grad_norm": 8.389578819274902,
162
+ "learning_rate": 4.7902117916428164e-05,
163
+ "loss": 3.8096,
164
+ "step": 2200
165
+ },
166
+ {
167
+ "epoch": 0.4388475481778287,
168
+ "grad_norm": 5.557002544403076,
169
+ "learning_rate": 4.7806716275519944e-05,
170
+ "loss": 3.8691,
171
+ "step": 2300
172
+ },
173
+ {
174
+ "epoch": 0.4579278763594734,
175
+ "grad_norm": 22.748138427734375,
176
+ "learning_rate": 4.771131463461172e-05,
177
+ "loss": 3.8514,
178
+ "step": 2400
179
+ },
180
+ {
181
+ "epoch": 0.4770082045411181,
182
+ "grad_norm": 8.196257591247559,
183
+ "learning_rate": 4.761591299370349e-05,
184
+ "loss": 3.6942,
185
+ "step": 2500
186
+ },
187
+ {
188
+ "epoch": 0.4960885327227628,
189
+ "grad_norm": 5.773292064666748,
190
+ "learning_rate": 4.752051135279527e-05,
191
+ "loss": 3.5213,
192
+ "step": 2600
193
+ },
194
+ {
195
+ "epoch": 0.5151688609044076,
196
+ "grad_norm": 8.48924446105957,
197
+ "learning_rate": 4.7425109711887044e-05,
198
+ "loss": 3.4898,
199
+ "step": 2700
200
+ },
201
+ {
202
+ "epoch": 0.5342491890860522,
203
+ "grad_norm": 4.775367736816406,
204
+ "learning_rate": 4.7329708070978824e-05,
205
+ "loss": 3.6168,
206
+ "step": 2800
207
+ },
208
+ {
209
+ "epoch": 0.553329517267697,
210
+ "grad_norm": 8.374899864196777,
211
+ "learning_rate": 4.7234306430070604e-05,
212
+ "loss": 3.3548,
213
+ "step": 2900
214
+ },
215
+ {
216
+ "epoch": 0.5724098454493417,
217
+ "grad_norm": 8.98188591003418,
218
+ "learning_rate": 4.713890478916238e-05,
219
+ "loss": 3.4166,
220
+ "step": 3000
221
+ },
222
+ {
223
+ "epoch": 0.5914901736309864,
224
+ "grad_norm": 5.574390411376953,
225
+ "learning_rate": 4.704350314825415e-05,
226
+ "loss": 3.3954,
227
+ "step": 3100
228
+ },
229
+ {
230
+ "epoch": 0.6105705018126312,
231
+ "grad_norm": 6.005886554718018,
232
+ "learning_rate": 4.694810150734592e-05,
233
+ "loss": 3.3784,
234
+ "step": 3200
235
+ },
236
+ {
237
+ "epoch": 0.6296508299942759,
238
+ "grad_norm": 5.605027675628662,
239
+ "learning_rate": 4.68526998664377e-05,
240
+ "loss": 3.4084,
241
+ "step": 3300
242
+ },
243
+ {
244
+ "epoch": 0.6487311581759206,
245
+ "grad_norm": 6.774117946624756,
246
+ "learning_rate": 4.675729822552948e-05,
247
+ "loss": 3.2394,
248
+ "step": 3400
249
+ },
250
+ {
251
+ "epoch": 0.6678114863575654,
252
+ "grad_norm": 6.457028865814209,
253
+ "learning_rate": 4.6661896584621256e-05,
254
+ "loss": 3.1922,
255
+ "step": 3500
256
+ },
257
+ {
258
+ "epoch": 0.6868918145392101,
259
+ "grad_norm": 5.8072028160095215,
260
+ "learning_rate": 4.6566494943713036e-05,
261
+ "loss": 3.2006,
262
+ "step": 3600
263
+ },
264
+ {
265
+ "epoch": 0.7059721427208548,
266
+ "grad_norm": 5.757169723510742,
267
+ "learning_rate": 4.647109330280481e-05,
268
+ "loss": 3.2241,
269
+ "step": 3700
270
+ },
271
+ {
272
+ "epoch": 0.7250524709024995,
273
+ "grad_norm": 7.030402660369873,
274
+ "learning_rate": 4.637569166189659e-05,
275
+ "loss": 3.1935,
276
+ "step": 3800
277
+ },
278
+ {
279
+ "epoch": 0.7441327990841442,
280
+ "grad_norm": 5.983602523803711,
281
+ "learning_rate": 4.628029002098836e-05,
282
+ "loss": 3.2654,
283
+ "step": 3900
284
+ },
285
+ {
286
+ "epoch": 0.7632131272657889,
287
+ "grad_norm": 5.327564239501953,
288
+ "learning_rate": 4.6184888380080136e-05,
289
+ "loss": 3.1183,
290
+ "step": 4000
291
+ },
292
+ {
293
+ "epoch": 0.7822934554474337,
294
+ "grad_norm": 5.376145362854004,
295
+ "learning_rate": 4.6089486739171916e-05,
296
+ "loss": 2.9999,
297
+ "step": 4100
298
+ },
299
+ {
300
+ "epoch": 0.8013737836290784,
301
+ "grad_norm": 7.061012268066406,
302
+ "learning_rate": 4.5994085098263696e-05,
303
+ "loss": 2.9889,
304
+ "step": 4200
305
+ },
306
+ {
307
+ "epoch": 0.8204541118107231,
308
+ "grad_norm": 7.72469425201416,
309
+ "learning_rate": 4.589868345735547e-05,
310
+ "loss": 3.1113,
311
+ "step": 4300
312
+ },
313
+ {
314
+ "epoch": 0.8395344399923679,
315
+ "grad_norm": 7.000074863433838,
316
+ "learning_rate": 4.580328181644725e-05,
317
+ "loss": 3.0533,
318
+ "step": 4400
319
+ },
320
+ {
321
+ "epoch": 0.8586147681740126,
322
+ "grad_norm": 4.172057628631592,
323
+ "learning_rate": 4.570788017553902e-05,
324
+ "loss": 3.0605,
325
+ "step": 4500
326
+ },
327
+ {
328
+ "epoch": 0.8776950963556573,
329
+ "grad_norm": 7.515087604522705,
330
+ "learning_rate": 4.5612478534630795e-05,
331
+ "loss": 3.0591,
332
+ "step": 4600
333
+ },
334
+ {
335
+ "epoch": 0.8967754245373021,
336
+ "grad_norm": 5.259555339813232,
337
+ "learning_rate": 4.5517076893722575e-05,
338
+ "loss": 3.0083,
339
+ "step": 4700
340
+ },
341
+ {
342
+ "epoch": 0.9158557527189468,
343
+ "grad_norm": 6.854462146759033,
344
+ "learning_rate": 4.542167525281435e-05,
345
+ "loss": 2.8963,
346
+ "step": 4800
347
+ },
348
+ {
349
+ "epoch": 0.9349360809005914,
350
+ "grad_norm": 7.629165172576904,
351
+ "learning_rate": 4.532627361190613e-05,
352
+ "loss": 2.9848,
353
+ "step": 4900
354
+ },
355
+ {
356
+ "epoch": 0.9540164090822362,
357
+ "grad_norm": 7.160517692565918,
358
+ "learning_rate": 4.52308719709979e-05,
359
+ "loss": 2.989,
360
+ "step": 5000
361
+ },
362
+ {
363
+ "epoch": 0.9730967372638809,
364
+ "grad_norm": 5.499044895172119,
365
+ "learning_rate": 4.513547033008968e-05,
366
+ "loss": 2.8799,
367
+ "step": 5100
368
+ },
369
+ {
370
+ "epoch": 0.9921770654455256,
371
+ "grad_norm": 7.522376537322998,
372
+ "learning_rate": 4.504006868918146e-05,
373
+ "loss": 2.8484,
374
+ "step": 5200
375
+ },
376
  {
377
  "epoch": 1.0,
378
+ "eval_loss": 2.272996425628662,
379
+ "eval_runtime": 202.0635,
380
+ "eval_samples_per_second": 23.052,
381
+ "eval_steps_per_second": 2.885,
382
+ "step": 5241
383
  }
384
  ],
385
  "logging_steps": 100,
386
+ "max_steps": 52410,
387
  "num_input_tokens_seen": 0,
388
+ "num_train_epochs": 10,
389
  "save_steps": 500,
390
  "stateful_callbacks": {
391
  "TrainerControl": {
 
399
  "attributes": {}
400
  }
401
  },
402
+ "total_flos": 1.4352011987779584e+16,
403
  "train_batch_size": 8,
404
  "trial_name": null,
405
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a161f8db15e46e0431c594288b65f83be576afa01d8659f92dff4ee62dad08cd
3
  size 5969
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2066df18aeb0fb38be2277456a18ddd60a6e6a650077f856b0bf30798baf0ae
3
  size 5969