fguryel commited on
Commit
378a207
·
verified ·
1 Parent(s): dcb65e4

Upload folder using huggingface_hub

Browse files
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:22e61a7a8de9f77bde779dfb846c91806787ab268e54083d22ef514071bb5f82
3
  size 4991031824
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fdc69c9532d50d8cbcfc7357cd186c5f961216500c340a60e088c6248848814
3
  size 4991031824
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb2a023ee138db84cb938d952170f44e13d54562e07e0156b3aeab57d754461a
3
  size 1610725592
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5acc5280e967a03411a879cc37d8f4c9b24d1d8a7df4b74a0ab654ca2b2308b4
3
  size 1610725592
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7f94ceb272f725414956e963487ef397fff2499b3207027f08016ee0e30d1ae
3
- size 13203681623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2acac140f381a83b89f9d555cc696d4840505d66eeeaf95bf3c42656fe32c150
3
+ size 13203678103
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61c19bab1174704a4a4441475683bf1270277af15d2e2c95e964789128e482c4
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8e2011629d8bed3ef560fa11175cac55684c4e12a72634bb24abf767b6c7399
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:15db80273eb922131fee165eee31e9743cd2224399faebd1ef9e6addce818d49
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:993186ddfb6142501973771452980601dfb89818da88b2abc705869965101f78
3
  size 1465
trainer_state.json CHANGED
@@ -2,264 +2,229 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0,
6
  "eval_steps": 500,
7
- "global_step": 1784,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.028026905829596414,
14
- "grad_norm": 9.9375,
15
- "learning_rate": 4.862668161434978e-05,
16
- "loss": 5.2794,
17
  "step": 50
18
  },
19
  {
20
- "epoch": 0.05605381165919283,
21
- "grad_norm": 16.125,
22
- "learning_rate": 4.7225336322869954e-05,
23
- "loss": 5.2112,
24
  "step": 100
25
  },
26
  {
27
- "epoch": 0.08408071748878924,
28
- "grad_norm": 7.5,
29
- "learning_rate": 4.5823991031390135e-05,
30
- "loss": 5.1903,
31
  "step": 150
32
  },
33
  {
34
- "epoch": 0.11210762331838565,
35
- "grad_norm": 8.25,
36
- "learning_rate": 4.442264573991032e-05,
37
- "loss": 5.1039,
38
  "step": 200
39
  },
40
  {
41
- "epoch": 0.14013452914798205,
42
- "grad_norm": 7.6875,
43
- "learning_rate": 4.30213004484305e-05,
44
- "loss": 5.1759,
45
  "step": 250
46
  },
47
  {
48
- "epoch": 0.1681614349775785,
49
- "grad_norm": 6.875,
50
- "learning_rate": 4.161995515695067e-05,
51
- "loss": 5.1191,
52
  "step": 300
53
  },
54
  {
55
- "epoch": 0.1961883408071749,
56
- "grad_norm": 9.0625,
57
- "learning_rate": 4.0218609865470855e-05,
58
- "loss": 5.1635,
59
  "step": 350
60
  },
61
  {
62
- "epoch": 0.2242152466367713,
63
- "grad_norm": 7.34375,
64
- "learning_rate": 3.8817264573991036e-05,
65
- "loss": 5.0679,
66
  "step": 400
67
  },
68
  {
69
- "epoch": 0.2522421524663677,
70
- "grad_norm": 7.65625,
71
- "learning_rate": 3.741591928251121e-05,
72
- "loss": 4.9766,
73
  "step": 450
74
  },
75
  {
76
- "epoch": 0.2802690582959641,
77
- "grad_norm": 11.0,
78
- "learning_rate": 3.601457399103139e-05,
79
- "loss": 5.0054,
80
  "step": 500
81
  },
82
  {
83
- "epoch": 0.30829596412556054,
84
- "grad_norm": 9.0,
85
- "learning_rate": 3.461322869955157e-05,
86
- "loss": 5.0283,
87
  "step": 550
88
  },
89
  {
90
- "epoch": 0.336322869955157,
91
- "grad_norm": 9.875,
92
- "learning_rate": 3.321188340807175e-05,
93
- "loss": 5.013,
94
  "step": 600
95
  },
96
  {
97
- "epoch": 0.36434977578475336,
98
- "grad_norm": 7.75,
99
- "learning_rate": 3.181053811659193e-05,
100
- "loss": 4.9833,
101
  "step": 650
102
  },
103
  {
104
- "epoch": 0.3923766816143498,
105
- "grad_norm": 10.0,
106
- "learning_rate": 3.040919282511211e-05,
107
- "loss": 5.0368,
108
  "step": 700
109
  },
110
  {
111
- "epoch": 0.4204035874439462,
112
- "grad_norm": 9.4375,
113
- "learning_rate": 2.9007847533632287e-05,
114
- "loss": 4.9734,
115
  "step": 750
116
  },
117
  {
118
- "epoch": 0.4484304932735426,
119
- "grad_norm": 11.25,
120
- "learning_rate": 2.7606502242152465e-05,
121
- "loss": 4.9578,
122
  "step": 800
123
  },
124
  {
125
- "epoch": 0.476457399103139,
126
- "grad_norm": 8.75,
127
- "learning_rate": 2.620515695067265e-05,
128
- "loss": 4.9768,
129
  "step": 850
130
  },
131
  {
132
- "epoch": 0.5044843049327354,
133
- "grad_norm": 10.25,
134
- "learning_rate": 2.480381165919283e-05,
135
- "loss": 4.897,
136
  "step": 900
137
  },
138
  {
139
- "epoch": 0.5325112107623319,
140
- "grad_norm": 9.0625,
141
- "learning_rate": 2.3402466367713007e-05,
142
- "loss": 4.8971,
143
  "step": 950
144
  },
145
  {
146
- "epoch": 0.5605381165919282,
147
- "grad_norm": 6.875,
148
- "learning_rate": 2.2001121076233185e-05,
149
- "loss": 4.8569,
150
  "step": 1000
151
  },
152
  {
153
- "epoch": 0.5885650224215246,
154
- "grad_norm": 8.9375,
155
- "learning_rate": 2.0599775784753363e-05,
156
- "loss": 4.8683,
157
  "step": 1050
158
  },
159
  {
160
- "epoch": 0.6165919282511211,
161
- "grad_norm": 8.125,
162
- "learning_rate": 1.9198430493273544e-05,
163
- "loss": 4.9153,
164
  "step": 1100
165
  },
166
  {
167
- "epoch": 0.6446188340807175,
168
- "grad_norm": 7.4375,
169
- "learning_rate": 1.7797085201793723e-05,
170
- "loss": 4.9176,
171
  "step": 1150
172
  },
173
  {
174
- "epoch": 0.672645739910314,
175
- "grad_norm": 7.375,
176
- "learning_rate": 1.63957399103139e-05,
177
- "loss": 4.884,
178
  "step": 1200
179
  },
180
  {
181
- "epoch": 0.7006726457399103,
182
- "grad_norm": 7.6875,
183
- "learning_rate": 1.4994394618834082e-05,
184
- "loss": 4.9137,
185
  "step": 1250
186
  },
187
  {
188
- "epoch": 0.7286995515695067,
189
- "grad_norm": 7.34375,
190
- "learning_rate": 1.359304932735426e-05,
191
- "loss": 4.8802,
192
  "step": 1300
193
  },
194
  {
195
- "epoch": 0.7567264573991032,
196
- "grad_norm": 9.25,
197
- "learning_rate": 1.219170403587444e-05,
198
- "loss": 4.9338,
199
  "step": 1350
200
  },
201
  {
202
- "epoch": 0.7847533632286996,
203
- "grad_norm": 10.375,
204
- "learning_rate": 1.079035874439462e-05,
205
- "loss": 4.9095,
206
  "step": 1400
207
  },
208
  {
209
- "epoch": 0.8127802690582959,
210
- "grad_norm": 7.15625,
211
- "learning_rate": 9.389013452914798e-06,
212
- "loss": 4.9246,
213
  "step": 1450
214
  },
215
  {
216
- "epoch": 0.8408071748878924,
217
- "grad_norm": 6.34375,
218
- "learning_rate": 7.987668161434977e-06,
219
- "loss": 4.8827,
220
  "step": 1500
221
- },
222
- {
223
- "epoch": 0.8688340807174888,
224
- "grad_norm": 7.0,
225
- "learning_rate": 6.5863228699551565e-06,
226
- "loss": 4.9036,
227
- "step": 1550
228
- },
229
- {
230
- "epoch": 0.8968609865470852,
231
- "grad_norm": 10.0,
232
- "learning_rate": 5.184977578475336e-06,
233
- "loss": 4.8653,
234
- "step": 1600
235
- },
236
- {
237
- "epoch": 0.9248878923766816,
238
- "grad_norm": 7.46875,
239
- "learning_rate": 3.783632286995516e-06,
240
- "loss": 4.8676,
241
- "step": 1650
242
- },
243
- {
244
- "epoch": 0.952914798206278,
245
- "grad_norm": 8.25,
246
- "learning_rate": 2.3822869955156952e-06,
247
- "loss": 4.8952,
248
- "step": 1700
249
- },
250
- {
251
- "epoch": 0.9809417040358744,
252
- "grad_norm": 6.3125,
253
- "learning_rate": 9.809417040358745e-07,
254
- "loss": 4.9213,
255
- "step": 1750
256
  }
257
  ],
258
  "logging_steps": 50,
259
- "max_steps": 1784,
260
  "num_input_tokens_seen": 0,
261
- "num_train_epochs": 1,
262
- "save_steps": 5000,
263
  "stateful_callbacks": {
264
  "TrainerControl": {
265
  "args": {
@@ -267,12 +232,12 @@
267
  "should_evaluate": false,
268
  "should_log": false,
269
  "should_save": true,
270
- "should_training_stop": true
271
  },
272
  "attributes": {}
273
  }
274
  },
275
- "total_flos": 1.5271922632402944e+16,
276
  "train_batch_size": 1,
277
  "trial_name": null,
278
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.3632286995515694,
6
  "eval_steps": 500,
7
+ "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.11210762331838565,
14
+ "grad_norm": 4.625,
15
+ "learning_rate": 9.14179104477612e-06,
16
+ "loss": 5.492,
17
  "step": 50
18
  },
19
  {
20
+ "epoch": 0.2242152466367713,
21
+ "grad_norm": 4.90625,
22
+ "learning_rate": 1.8470149253731344e-05,
23
+ "loss": 5.439,
24
  "step": 100
25
  },
26
  {
27
+ "epoch": 0.336322869955157,
28
+ "grad_norm": 3.84375,
29
+ "learning_rate": 2.7798507462686568e-05,
30
+ "loss": 5.2502,
31
  "step": 150
32
  },
33
  {
34
+ "epoch": 0.4484304932735426,
35
+ "grad_norm": 6.59375,
36
+ "learning_rate": 3.7126865671641795e-05,
37
+ "loss": 5.161,
38
  "step": 200
39
  },
40
  {
41
+ "epoch": 0.5605381165919282,
42
+ "grad_norm": 6.25,
43
+ "learning_rate": 4.645522388059701e-05,
44
+ "loss": 5.0548,
45
  "step": 250
46
  },
47
  {
48
+ "epoch": 0.672645739910314,
49
+ "grad_norm": 6.71875,
50
+ "learning_rate": 4.999325361589072e-05,
51
+ "loss": 5.0148,
52
  "step": 300
53
  },
54
  {
55
+ "epoch": 0.7847533632286996,
56
+ "grad_norm": 5.6875,
57
+ "learning_rate": 4.9953952730494324e-05,
58
+ "loss": 5.0028,
59
  "step": 350
60
  },
61
  {
62
+ "epoch": 0.8968609865470852,
63
+ "grad_norm": 5.09375,
64
+ "learning_rate": 4.987961816680492e-05,
65
+ "loss": 4.9576,
66
  "step": 400
67
  },
68
  {
69
+ "epoch": 1.0089686098654709,
70
+ "grad_norm": 4.375,
71
+ "learning_rate": 4.977035428557125e-05,
72
+ "loss": 4.929,
73
  "step": 450
74
  },
75
  {
76
+ "epoch": 1.1210762331838564,
77
+ "grad_norm": 4.78125,
78
+ "learning_rate": 4.9626314485964385e-05,
79
+ "loss": 4.7677,
80
  "step": 500
81
  },
82
  {
83
+ "epoch": 1.2331838565022422,
84
+ "grad_norm": 4.875,
85
+ "learning_rate": 4.944770099021562e-05,
86
+ "loss": 4.7483,
87
  "step": 550
88
  },
89
  {
90
+ "epoch": 1.3452914798206277,
91
+ "grad_norm": 5.9375,
92
+ "learning_rate": 4.923476455971e-05,
93
+ "loss": 4.7362,
94
  "step": 600
95
  },
96
  {
97
+ "epoch": 1.4573991031390134,
98
+ "grad_norm": 7.0625,
99
+ "learning_rate": 4.898780414293411e-05,
100
+ "loss": 4.7181,
101
  "step": 650
102
  },
103
  {
104
+ "epoch": 1.5695067264573992,
105
+ "grad_norm": 4.1875,
106
+ "learning_rate": 4.870716645577244e-05,
107
+ "loss": 4.719,
108
  "step": 700
109
  },
110
  {
111
+ "epoch": 1.6816143497757847,
112
+ "grad_norm": 5.15625,
113
+ "learning_rate": 4.839324549474148e-05,
114
+ "loss": 4.727,
115
  "step": 750
116
  },
117
  {
118
+ "epoch": 1.7937219730941703,
119
+ "grad_norm": 4.46875,
120
+ "learning_rate": 4.804648198384507e-05,
121
+ "loss": 4.7355,
122
  "step": 800
123
  },
124
  {
125
+ "epoch": 1.905829596412556,
126
+ "grad_norm": 4.59375,
127
+ "learning_rate": 4.7667362755827306e-05,
128
+ "loss": 4.7111,
129
  "step": 850
130
  },
131
  {
132
+ "epoch": 2.0179372197309418,
133
+ "grad_norm": 4.15625,
134
+ "learning_rate": 4.725642006869207e-05,
135
+ "loss": 4.6229,
136
  "step": 900
137
  },
138
  {
139
+ "epoch": 2.1300448430493275,
140
+ "grad_norm": 5.53125,
141
+ "learning_rate": 4.68142308584484e-05,
142
+ "loss": 4.3583,
143
  "step": 950
144
  },
145
  {
146
+ "epoch": 2.242152466367713,
147
+ "grad_norm": 3.875,
148
+ "learning_rate": 4.634141592913097e-05,
149
+ "loss": 4.3665,
150
  "step": 1000
151
  },
152
  {
153
+ "epoch": 2.3542600896860986,
154
+ "grad_norm": 3.890625,
155
+ "learning_rate": 4.583863908123282e-05,
156
+ "loss": 4.4125,
157
  "step": 1050
158
  },
159
  {
160
+ "epoch": 2.4663677130044843,
161
+ "grad_norm": 4.21875,
162
+ "learning_rate": 4.530660617977393e-05,
163
+ "loss": 4.3592,
164
  "step": 1100
165
  },
166
  {
167
+ "epoch": 2.57847533632287,
168
+ "grad_norm": 4.5,
169
+ "learning_rate": 4.474606416331397e-05,
170
+ "loss": 4.371,
171
  "step": 1150
172
  },
173
  {
174
+ "epoch": 2.6905829596412554,
175
+ "grad_norm": 4.3125,
176
+ "learning_rate": 4.415779999530064e-05,
177
+ "loss": 4.4046,
178
  "step": 1200
179
  },
180
  {
181
+ "epoch": 2.802690582959641,
182
+ "grad_norm": 4.21875,
183
+ "learning_rate": 4.354263955922568e-05,
184
+ "loss": 4.3805,
185
  "step": 1250
186
  },
187
  {
188
+ "epoch": 2.914798206278027,
189
+ "grad_norm": 4.15625,
190
+ "learning_rate": 4.290144649913973e-05,
191
+ "loss": 4.3829,
192
  "step": 1300
193
  },
194
  {
195
+ "epoch": 3.0269058295964126,
196
+ "grad_norm": 4.6875,
197
+ "learning_rate": 4.2235121007153975e-05,
198
+ "loss": 4.2465,
199
  "step": 1350
200
  },
201
  {
202
+ "epoch": 3.1390134529147984,
203
+ "grad_norm": 6.0,
204
+ "learning_rate": 4.1544598559630694e-05,
205
+ "loss": 3.8933,
206
  "step": 1400
207
  },
208
  {
209
+ "epoch": 3.2511210762331837,
210
+ "grad_norm": 4.875,
211
+ "learning_rate": 4.083084860383708e-05,
212
+ "loss": 3.9146,
213
  "step": 1450
214
  },
215
  {
216
+ "epoch": 3.3632286995515694,
217
+ "grad_norm": 4.6875,
218
+ "learning_rate": 4.009487319690626e-05,
219
+ "loss": 3.9086,
220
  "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  }
222
  ],
223
  "logging_steps": 50,
224
+ "max_steps": 4460,
225
  "num_input_tokens_seen": 0,
226
+ "num_train_epochs": 10,
227
+ "save_steps": 500,
228
  "stateful_callbacks": {
229
  "TrainerControl": {
230
  "args": {
 
232
  "should_evaluate": false,
233
  "should_log": false,
234
  "should_save": true,
235
+ "should_training_stop": false
236
  },
237
  "attributes": {}
238
  }
239
  },
240
+ "total_flos": 5.135152845033677e+16,
241
  "train_batch_size": 1,
242
  "trial_name": null,
243
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:14b8e7fa23f8365e97ec329503b35572e8528d5f7cac5d82eacf16bcc34bdde3
3
  size 5777
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b07c97d6cade78ad89a0fb0e3020fbd76a7fe36e45ae68b0391dd5d87dad3625
3
  size 5777