mrplants committed on
Commit
bffbdbe
·
verified ·
1 Parent(s): fc981e8

Training in progress, epoch 3, checkpoint

Browse files
last-checkpoint/model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c94a575f649f1b53eb088a532b1755757f8761cdcae11831a2bca32fe97cd14
3
  size 4998420448
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf52f10965e393b8e0899b7abd10f5b56344c60211b12e28c940ae3935f3645f
3
  size 4998420448
last-checkpoint/model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cfb41cc90197db04ef4fde551b6a8aa2c560c91c8966fefa517d34ad486048a2
3
  size 4983891952
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:343d02b7c5678b0bf23ba6cfb7f1d78e9094c8049b00e4399dfcd9480ced5097
3
  size 4983891952
last-checkpoint/model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cca55f3b11ef3116929cf316e11ddff3ec7c9c2fa6bc791b6a3026f5ef7e4ccb
3
  size 1905111704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b0587b29a2c6056b7a55bcbd389dad6e8b4730d0f209a9485b9d9a3a642d86e
3
  size 1905111704
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0aa22d5b5a9a33a92836abaaa824bc7daca7cca430c7f5eb9930c4a97092a2e4
3
- size 15344257558
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59f68e78e982909d4e878fb37ffc83266760fb6dae40dac940b403410ce90a97
3
+ size 2536
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6648495cda64bef1e1d053d107c48e5074edca604a5b9a2f219b35e47f3cacd4
3
- size 14308
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95b6047bd8cc6f4cdf7c46dea47edb8e542435510070c6cd1e0a7d9ccf5fd7da
3
+ size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7acac30208351d1ca8c3e665090883ff7723a00418637911eff12485d0ffbf91
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4933c471fb1e4ba81de00146ddd721361901645c866fd1d76662b6837ae85d16
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,233 +1,285 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 314,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.03184713375796178,
13
- "grad_norm": 450.0,
14
- "learning_rate": 6.896551724137932e-06,
15
- "loss": 16.8123,
16
- "step": 10
17
  },
18
  {
19
- "epoch": 0.06369426751592357,
20
- "grad_norm": 288.0,
21
- "learning_rate": 1.3793103448275863e-05,
22
- "loss": 13.0617,
23
- "step": 20
24
  },
25
  {
26
- "epoch": 0.09554140127388536,
27
- "grad_norm": 110.0,
28
- "learning_rate": 1.9999940799174018e-05,
29
- "loss": 9.2073,
30
- "step": 30
31
  },
32
  {
33
- "epoch": 0.12738853503184713,
34
- "grad_norm": 31.25,
35
- "learning_rate": 1.9992837548163315e-05,
36
- "loss": 7.4672,
37
- "step": 40
38
  },
39
  {
40
- "epoch": 0.1592356687898089,
41
- "grad_norm": 67.5,
42
- "learning_rate": 1.9973903768108406e-05,
43
- "loss": 7.0625,
44
- "step": 50
45
  },
46
  {
47
- "epoch": 0.1910828025477707,
48
- "grad_norm": 57.75,
49
- "learning_rate": 1.994316187472792e-05,
50
- "loss": 6.8102,
51
- "step": 60
52
  },
53
  {
54
- "epoch": 0.2229299363057325,
55
- "grad_norm": 61.25,
56
- "learning_rate": 1.9900648263376108e-05,
57
- "loss": 6.5759,
58
- "step": 70
59
  },
60
  {
61
- "epoch": 0.25477707006369427,
62
- "grad_norm": 46.25,
63
- "learning_rate": 1.9846413265954338e-05,
64
- "loss": 6.4318,
65
- "step": 80
66
  },
67
  {
68
- "epoch": 0.28662420382165604,
69
- "grad_norm": 35.5,
70
- "learning_rate": 1.9780521091323124e-05,
71
- "loss": 6.3965,
72
- "step": 90
73
  },
74
  {
75
- "epoch": 0.3184713375796178,
76
- "grad_norm": 320.0,
77
- "learning_rate": 1.970304974928518e-05,
78
- "loss": 6.4524,
79
- "step": 100
80
  },
81
  {
82
- "epoch": 0.3503184713375796,
83
- "grad_norm": 32.75,
84
- "learning_rate": 1.961409095822957e-05,
85
- "loss": 6.3121,
86
- "step": 110
87
  },
88
  {
89
- "epoch": 0.3821656050955414,
90
- "grad_norm": 38.25,
91
- "learning_rate": 1.9513750036546222e-05,
92
- "loss": 6.1557,
93
- "step": 120
 
94
  },
95
  {
96
- "epoch": 0.4140127388535032,
97
- "grad_norm": 32.25,
98
- "learning_rate": 1.9402145777939374e-05,
99
- "loss": 5.9113,
100
- "step": 130
101
  },
102
  {
103
- "epoch": 0.445859872611465,
104
- "grad_norm": 34.5,
105
- "learning_rate": 1.9279410310787638e-05,
106
- "loss": 5.8595,
107
- "step": 140
108
  },
109
  {
110
- "epoch": 0.47770700636942676,
111
- "grad_norm": 28.375,
112
- "learning_rate": 1.9145688941717074e-05,
113
- "loss": 6.0551,
114
- "step": 150
115
  },
116
  {
117
- "epoch": 0.5095541401273885,
118
- "grad_norm": 26.5,
119
- "learning_rate": 1.9001139983572554e-05,
120
- "loss": 5.2247,
121
- "step": 160
122
  },
123
  {
124
- "epoch": 0.5414012738853503,
125
- "grad_norm": 37.75,
126
- "learning_rate": 1.8845934567991056e-05,
127
- "loss": 5.8697,
128
- "step": 170
129
  },
130
  {
131
- "epoch": 0.5732484076433121,
132
- "grad_norm": 27.875,
133
- "learning_rate": 1.8680256442798756e-05,
134
- "loss": 5.9877,
135
- "step": 180
136
  },
137
  {
138
- "epoch": 0.6050955414012739,
139
- "grad_norm": 31.75,
140
- "learning_rate": 1.8504301754471837e-05,
141
- "loss": 5.892,
142
- "step": 190
143
  },
144
  {
145
- "epoch": 0.6369426751592356,
146
- "grad_norm": 29.25,
147
- "learning_rate": 1.8318278815918483e-05,
148
- "loss": 5.8725,
149
- "step": 200
150
  },
151
  {
152
- "epoch": 0.6687898089171974,
153
- "grad_norm": 26.875,
154
- "learning_rate": 1.8122407859857064e-05,
155
- "loss": 5.8386,
156
- "step": 210
157
  },
158
  {
159
- "epoch": 0.7006369426751592,
160
- "grad_norm": 46.25,
161
- "learning_rate": 1.791692077808242e-05,
162
- "loss": 6.0679,
163
- "step": 220
164
  },
165
  {
166
- "epoch": 0.732484076433121,
167
- "grad_norm": 26.75,
168
- "learning_rate": 1.770206084692897e-05,
169
- "loss": 5.4833,
170
- "step": 230
171
  },
172
  {
173
- "epoch": 0.7643312101910829,
174
- "grad_norm": 64.0,
175
- "learning_rate": 1.747808243925565e-05,
176
- "loss": 5.6222,
177
- "step": 240
178
  },
179
  {
180
- "epoch": 0.7961783439490446,
181
- "grad_norm": 41.75,
182
- "learning_rate": 1.724525072329364e-05,
183
- "loss": 5.8434,
184
- "step": 250
 
185
  },
186
  {
187
- "epoch": 0.8280254777070064,
188
- "grad_norm": 35.5,
189
- "learning_rate": 1.700384134871351e-05,
190
- "loss": 5.8525,
191
- "step": 260
192
  },
193
  {
194
- "epoch": 0.8598726114649682,
195
- "grad_norm": 22.875,
196
- "learning_rate": 1.6754140120283295e-05,
197
- "loss": 6.089,
198
- "step": 270
199
  },
200
  {
201
- "epoch": 0.89171974522293,
202
- "grad_norm": 19.75,
203
- "learning_rate": 1.6496442659504005e-05,
204
- "loss": 5.203,
205
- "step": 280
206
  },
207
  {
208
- "epoch": 0.9235668789808917,
209
- "grad_norm": 23.375,
210
- "learning_rate": 1.6231054054623066e-05,
211
- "loss": 6.0398,
212
- "step": 290
213
  },
214
  {
215
- "epoch": 0.9554140127388535,
216
- "grad_norm": 28.75,
217
- "learning_rate": 1.5958288499440075e-05,
218
- "loss": 5.9388,
219
- "step": 300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  },
221
  {
222
- "epoch": 0.9872611464968153,
223
- "grad_norm": 27.75,
224
- "learning_rate": 1.56784689213325e-05,
225
- "loss": 5.5168,
226
- "step": 310
 
227
  }
228
  ],
229
- "logging_steps": 10,
230
- "max_steps": 942,
231
  "num_input_tokens_seen": 0,
232
  "num_train_epochs": 3,
233
  "save_steps": 500,
@@ -238,12 +290,12 @@
238
  "should_evaluate": false,
239
  "should_log": false,
240
  "should_save": true,
241
- "should_training_stop": false
242
  },
243
  "attributes": {}
244
  }
245
  },
246
- "total_flos": 1.1679210169046016e+16,
247
  "train_batch_size": 8,
248
  "trial_name": null,
249
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
  "eval_steps": 500,
6
+ "global_step": 894,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.08389261744966443,
13
+ "grad_norm": 0.0,
14
+ "learning_rate": 1.851851851851852e-05,
15
+ "loss": 4.5636,
16
+ "step": 25
17
  },
18
  {
19
+ "epoch": 0.16778523489932887,
20
+ "grad_norm": 0.0,
21
+ "learning_rate": 1.9965291500546865e-05,
22
+ "loss": 4.5267,
23
+ "step": 50
24
  },
25
  {
26
+ "epoch": 0.2516778523489933,
27
+ "grad_norm": 0.0,
28
+ "learning_rate": 1.984912443051131e-05,
29
+ "loss": 4.6574,
30
+ "step": 75
31
  },
32
  {
33
+ "epoch": 0.33557046979865773,
34
+ "grad_norm": 0.0,
35
+ "learning_rate": 1.965218883028299e-05,
36
+ "loss": 4.589,
37
+ "step": 100
38
  },
39
  {
40
+ "epoch": 0.41946308724832215,
41
+ "grad_norm": 0.0,
42
+ "learning_rate": 1.9376099685953836e-05,
43
+ "loss": 4.6256,
44
+ "step": 125
45
  },
46
  {
47
+ "epoch": 0.5033557046979866,
48
+ "grad_norm": 0.0,
49
+ "learning_rate": 1.9023121088565353e-05,
50
+ "loss": 4.4971,
51
+ "step": 150
52
  },
53
  {
54
+ "epoch": 0.587248322147651,
55
+ "grad_norm": 0.0,
56
+ "learning_rate": 1.8596147667249457e-05,
57
+ "loss": 4.7099,
58
+ "step": 175
59
  },
60
  {
61
+ "epoch": 0.6711409395973155,
62
+ "grad_norm": 0.0,
63
+ "learning_rate": 1.8098680851591538e-05,
64
+ "loss": 4.619,
65
+ "step": 200
66
  },
67
  {
68
+ "epoch": 0.7550335570469798,
69
+ "grad_norm": 0.0,
70
+ "learning_rate": 1.753480015787792e-05,
71
+ "loss": 4.6121,
72
+ "step": 225
73
  },
74
  {
75
+ "epoch": 0.8389261744966443,
76
+ "grad_norm": 0.0,
77
+ "learning_rate": 1.6909129734697306e-05,
78
+ "loss": 4.4854,
79
+ "step": 250
80
  },
81
  {
82
+ "epoch": 0.9228187919463087,
83
+ "grad_norm": 0.0,
84
+ "learning_rate": 1.6226800442241582e-05,
85
+ "loss": 4.6153,
86
+ "step": 275
87
  },
88
  {
89
+ "epoch": 1.0,
90
+ "eval_loss": 4.619876861572266,
91
+ "eval_runtime": 1.7691,
92
+ "eval_samples_per_second": 71.223,
93
+ "eval_steps_per_second": 9.044,
94
+ "step": 298
95
  },
96
  {
97
+ "epoch": 1.0067114093959733,
98
+ "grad_norm": 0.0,
99
+ "learning_rate": 1.54934077762777e-05,
100
+ "loss": 4.6071,
101
+ "step": 300
102
  },
103
  {
104
+ "epoch": 1.0906040268456376,
105
+ "grad_norm": 0.0,
106
+ "learning_rate": 1.4714965981838503e-05,
107
+ "loss": 4.5226,
108
+ "step": 325
109
  },
110
  {
111
+ "epoch": 1.174496644295302,
112
+ "grad_norm": 0.0,
113
+ "learning_rate": 1.3897858732926794e-05,
114
+ "loss": 4.4466,
115
+ "step": 350
116
  },
117
  {
118
+ "epoch": 1.2583892617449663,
119
+ "grad_norm": 0.0,
120
+ "learning_rate": 1.3048786782687706e-05,
121
+ "loss": 4.4967,
122
+ "step": 375
123
  },
124
  {
125
+ "epoch": 1.342281879194631,
126
+ "grad_norm": 0.0,
127
+ "learning_rate": 1.2174713013348227e-05,
128
+ "loss": 4.7321,
129
+ "step": 400
130
  },
131
  {
132
+ "epoch": 1.4261744966442953,
133
+ "grad_norm": 0.0,
134
+ "learning_rate": 1.128280533654637e-05,
135
+ "loss": 4.6957,
136
+ "step": 425
137
  },
138
  {
139
+ "epoch": 1.5100671140939599,
140
+ "grad_norm": 0.0,
141
+ "learning_rate": 1.0380377912300231e-05,
142
+ "loss": 4.5298,
143
+ "step": 450
144
  },
145
  {
146
+ "epoch": 1.5939597315436242,
147
+ "grad_norm": 0.0,
148
+ "learning_rate": 9.474831168655596e-06,
149
+ "loss": 4.359,
150
+ "step": 475
151
  },
152
  {
153
+ "epoch": 1.6778523489932886,
154
+ "grad_norm": 0.0,
155
+ "learning_rate": 8.573591113885695e-06,
156
+ "loss": 4.5875,
157
+ "step": 500
158
  },
159
  {
160
+ "epoch": 1.761744966442953,
161
+ "grad_norm": 0.0,
162
+ "learning_rate": 7.684048438918247e-06,
163
+ "loss": 4.6772,
164
+ "step": 525
165
  },
166
  {
167
+ "epoch": 1.8456375838926173,
168
+ "grad_norm": 0.0,
169
+ "learning_rate": 6.813497909385252e-06,
170
+ "loss": 4.5338,
171
+ "step": 550
172
  },
173
  {
174
+ "epoch": 1.929530201342282,
175
+ "grad_norm": 0.0,
176
+ "learning_rate": 5.969078544315747e-06,
177
+ "loss": 4.6363,
178
+ "step": 575
179
  },
180
  {
181
+ "epoch": 2.0,
182
+ "eval_loss": 4.619876861572266,
183
+ "eval_runtime": 2.1011,
184
+ "eval_samples_per_second": 59.968,
185
+ "eval_steps_per_second": 7.615,
186
+ "step": 596
187
  },
188
  {
189
+ "epoch": 2.0134228187919465,
190
+ "grad_norm": 0.0,
191
+ "learning_rate": 5.157715072041094e-06,
192
+ "loss": 4.6935,
193
+ "step": 600
194
  },
195
  {
196
+ "epoch": 2.097315436241611,
197
+ "grad_norm": 0.0,
198
+ "learning_rate": 4.386061143408135e-06,
199
+ "loss": 4.5611,
200
+ "step": 625
201
  },
202
  {
203
+ "epoch": 2.1812080536912752,
204
+ "grad_norm": 0.0,
205
+ "learning_rate": 3.660444767984911e-06,
206
+ "loss": 4.4651,
207
+ "step": 650
208
  },
209
  {
210
+ "epoch": 2.2651006711409396,
211
+ "grad_norm": 0.0,
212
+ "learning_rate": 2.986816420713662e-06,
213
+ "loss": 4.6059,
214
+ "step": 675
215
  },
216
  {
217
+ "epoch": 2.348993288590604,
218
+ "grad_norm": 0.0,
219
+ "learning_rate": 2.370700244566605e-06,
220
+ "loss": 4.4503,
221
+ "step": 700
222
+ },
223
+ {
224
+ "epoch": 2.4328859060402683,
225
+ "grad_norm": 0.0,
226
+ "learning_rate": 1.8171487493710337e-06,
227
+ "loss": 4.5825,
228
+ "step": 725
229
+ },
230
+ {
231
+ "epoch": 2.5167785234899327,
232
+ "grad_norm": 0.0,
233
+ "learning_rate": 1.3307013782996237e-06,
234
+ "loss": 4.6254,
235
+ "step": 750
236
+ },
237
+ {
238
+ "epoch": 2.600671140939597,
239
+ "grad_norm": 0.0,
240
+ "learning_rate": 9.153472818047627e-07,
241
+ "loss": 4.6237,
242
+ "step": 775
243
+ },
244
+ {
245
+ "epoch": 2.684563758389262,
246
+ "grad_norm": 0.0,
247
+ "learning_rate": 5.74492604272191e-07,
248
+ "loss": 4.4167,
249
+ "step": 800
250
+ },
251
+ {
252
+ "epoch": 2.7684563758389262,
253
+ "grad_norm": 0.0,
254
+ "learning_rate": 3.109325516623818e-07,
255
+ "loss": 4.6393,
256
+ "step": 825
257
+ },
258
+ {
259
+ "epoch": 2.8523489932885906,
260
+ "grad_norm": 0.0,
261
+ "learning_rate": 1.2682846920120228e-07,
262
+ "loss": 4.6915,
263
+ "step": 850
264
+ },
265
+ {
266
+ "epoch": 2.936241610738255,
267
+ "grad_norm": 0.0,
268
+ "learning_rate": 2.369011709604463e-08,
269
+ "loss": 4.7184,
270
+ "step": 875
271
  },
272
  {
273
+ "epoch": 3.0,
274
+ "eval_loss": 4.619876861572266,
275
+ "eval_runtime": 1.8875,
276
+ "eval_samples_per_second": 66.756,
277
+ "eval_steps_per_second": 8.477,
278
+ "step": 894
279
  }
280
  ],
281
+ "logging_steps": 25,
282
+ "max_steps": 894,
283
  "num_input_tokens_seen": 0,
284
  "num_train_epochs": 3,
285
  "save_steps": 500,
 
290
  "should_evaluate": false,
291
  "should_log": false,
292
  "should_save": true,
293
+ "should_training_stop": true
294
  },
295
  "attributes": {}
296
  }
297
  },
298
+ "total_flos": 3.4434735820916736e+16,
299
  "train_batch_size": 8,
300
  "trial_name": null,
301
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca6eb23889ca18fbcd4c9269548b5b828e7fb6c6f42bcc3fffd9ed4e3b37c2fb
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6697f2864903978d31dd6ca1fe39ca44ced565a75f65d198b42ce2ce420093a
3
  size 5368