error577 committed on
Commit
ed6af7d
·
verified ·
1 Parent(s): 80499bb

Training in progress, step 50, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -10,23 +10,23 @@
10
  "layers_pattern": null,
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
- "lora_alpha": 8,
14
  "lora_dropout": 0.1,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
18
  "peft_type": "LORA",
19
- "r": 4,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
  "k_proj",
24
  "v_proj",
25
  "q_proj",
26
- "o_proj",
27
  "up_proj",
28
- "gate_proj",
29
- "down_proj"
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
10
  "layers_pattern": null,
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
+ "lora_alpha": 128,
14
  "lora_dropout": 0.1,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
18
  "peft_type": "LORA",
19
+ "r": 64,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
  "k_proj",
24
  "v_proj",
25
  "q_proj",
 
26
  "up_proj",
27
+ "o_proj",
28
+ "down_proj",
29
+ "gate_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b85235b9fd06fc2b7556026c9d400735ea1cf6ceb01413094a4e13a4f220409c
3
- size 40036040
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fda53b2dbb64ee3df1572930d050a53fd43af43308677011be467155149e9da9
3
+ size 639691872
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd89e21dd25e4404711c18b21837d9f7f514a8add2433a4536b637bc67095f19
3
- size 20814996
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60f307f665d6353bf718fbea916abadf16ba6ed584c31e16d0fb310e793bcda5
3
+ size 325350676
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9cc25fc0d4a476d263c6b1dbecc0b805055d4f792509dbda83e19cdd50420ee0
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5915d9c3b1deb3d66ce6ff11178835811888398e5199a01a9136f0a008ed6a10
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a75dcc5ec09eada6641b366eac390a2a47e7ec4306b94cfdb718bc9a73ac9b0
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:183d86b4afef5d114e28423b41699eb53696ddb9b0b1e5de0b39a3f185c3455e
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,323 +1,385 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.0014274116566004406,
5
- "eval_steps": 20,
6
- "global_step": 40,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 3.568529141501102e-05,
13
- "grad_norm": 1.025141716003418,
14
  "learning_rate": 2.9999999999999997e-05,
15
- "loss": 2.7097,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 3.568529141501102e-05,
20
- "eval_loss": 3.7184572219848633,
21
- "eval_runtime": 200.2041,
22
- "eval_samples_per_second": 11.428,
23
- "eval_steps_per_second": 11.428,
24
  "step": 1
25
  },
26
  {
27
- "epoch": 7.137058283002204e-05,
28
- "grad_norm": 1.6995129585266113,
29
  "learning_rate": 5.9999999999999995e-05,
30
- "loss": 2.9932,
31
  "step": 2
32
  },
33
  {
34
- "epoch": 0.00010705587424503305,
35
- "grad_norm": 2.4651355743408203,
36
  "learning_rate": 8.999999999999999e-05,
37
- "loss": 3.8545,
38
  "step": 3
39
  },
40
  {
41
- "epoch": 0.00014274116566004408,
42
- "grad_norm": 1.8627501726150513,
43
  "learning_rate": 0.00011999999999999999,
44
- "loss": 2.4703,
45
  "step": 4
46
  },
47
  {
48
- "epoch": 0.00017842645707505508,
49
- "grad_norm": 3.136054277420044,
50
  "learning_rate": 0.00015,
51
- "loss": 3.9009,
52
  "step": 5
53
  },
54
  {
55
- "epoch": 0.0002141117484900661,
56
- "grad_norm": 1.439232349395752,
57
  "learning_rate": 0.00017999999999999998,
58
- "loss": 2.9751,
59
  "step": 6
60
  },
61
  {
62
- "epoch": 0.00024979703990507713,
63
- "grad_norm": 3.2479472160339355,
64
  "learning_rate": 0.00020999999999999998,
65
- "loss": 3.1769,
66
  "step": 7
67
  },
68
  {
69
- "epoch": 0.00028548233132008816,
70
- "grad_norm": 5.708984375,
71
  "learning_rate": 0.00023999999999999998,
72
- "loss": 4.0564,
73
  "step": 8
74
  },
75
  {
76
- "epoch": 0.00032116762273509913,
77
- "grad_norm": 3.132951259613037,
78
  "learning_rate": 0.00027,
79
- "loss": 2.5955,
80
  "step": 9
81
  },
82
  {
83
- "epoch": 0.00035685291415011016,
84
- "grad_norm": 3.4401779174804688,
85
  "learning_rate": 0.0003,
86
- "loss": 2.9243,
87
  "step": 10
88
  },
89
  {
90
- "epoch": 0.0003925382055651212,
91
- "grad_norm": 3.282693386077881,
92
- "learning_rate": 0.0002999911984174669,
93
- "loss": 2.7178,
94
  "step": 11
95
  },
96
  {
97
- "epoch": 0.0004282234969801322,
98
- "grad_norm": 3.4943759441375732,
99
- "learning_rate": 0.0002999647947027726,
100
- "loss": 2.9392,
101
  "step": 12
102
  },
103
  {
104
- "epoch": 0.00046390878839514324,
105
- "grad_norm": 3.1215672492980957,
106
- "learning_rate": 0.0002999207919545099,
107
- "loss": 3.2245,
108
  "step": 13
109
  },
110
  {
111
- "epoch": 0.0004995940798101543,
112
- "grad_norm": 3.432668924331665,
113
- "learning_rate": 0.0002998591953365965,
114
- "loss": 3.2253,
115
  "step": 14
116
  },
117
  {
118
- "epoch": 0.0005352793712251652,
119
- "grad_norm": 4.813501358032227,
120
- "learning_rate": 0.00029978001207766854,
121
- "loss": 3.104,
122
  "step": 15
123
  },
124
  {
125
- "epoch": 0.0005709646626401763,
126
- "grad_norm": 2.568082809448242,
127
- "learning_rate": 0.00029968325147023263,
128
- "loss": 2.8546,
129
  "step": 16
130
  },
131
  {
132
- "epoch": 0.0006066499540551873,
133
- "grad_norm": 4.107773780822754,
134
- "learning_rate": 0.000299568924869575,
135
- "loss": 2.7499,
136
  "step": 17
137
  },
138
  {
139
- "epoch": 0.0006423352454701983,
140
- "grad_norm": 3.392408847808838,
141
- "learning_rate": 0.00029943704569242917,
142
- "loss": 3.1998,
143
  "step": 18
144
  },
145
  {
146
- "epoch": 0.0006780205368852093,
147
- "grad_norm": 4.212235927581787,
148
- "learning_rate": 0.0002992876294154013,
149
- "loss": 2.4685,
150
  "step": 19
151
  },
152
  {
153
- "epoch": 0.0007137058283002203,
154
- "grad_norm": 2.821186065673828,
155
- "learning_rate": 0.00029912069357315393,
156
- "loss": 2.7551,
157
  "step": 20
158
  },
159
  {
160
- "epoch": 0.0007137058283002203,
161
- "eval_loss": 2.9195058345794678,
162
- "eval_runtime": 199.0718,
163
- "eval_samples_per_second": 11.493,
164
- "eval_steps_per_second": 11.493,
165
- "step": 20
166
- },
167
- {
168
- "epoch": 0.0007493911197152314,
169
- "grad_norm": 3.009274482727051,
170
- "learning_rate": 0.00029893625775634835,
171
- "loss": 3.1292,
172
  "step": 21
173
  },
174
  {
175
- "epoch": 0.0007850764111302424,
176
- "grad_norm": 4.148017406463623,
177
- "learning_rate": 0.0002987343436093454,
178
- "loss": 3.0939,
179
  "step": 22
180
  },
181
  {
182
- "epoch": 0.0008207617025452535,
183
- "grad_norm": 2.899528980255127,
184
- "learning_rate": 0.00029851497482766547,
185
- "loss": 2.8648,
186
  "step": 23
187
  },
188
  {
189
- "epoch": 0.0008564469939602644,
190
- "grad_norm": 3.6243538856506348,
191
- "learning_rate": 0.00029827817715520773,
192
- "loss": 3.1808,
193
  "step": 24
194
  },
195
  {
196
- "epoch": 0.0008921322853752754,
197
- "grad_norm": 4.4735894203186035,
198
- "learning_rate": 0.0002980239783812289,
199
- "loss": 2.9876,
200
  "step": 25
201
  },
202
  {
203
- "epoch": 0.0009278175767902865,
204
- "grad_norm": 3.8239352703094482,
205
- "learning_rate": 0.0002977524083370822,
206
- "loss": 3.6976,
207
  "step": 26
208
  },
209
  {
210
- "epoch": 0.0009635028682052975,
211
- "grad_norm": 3.177990436553955,
212
- "learning_rate": 0.00029746349889271645,
213
- "loss": 2.5572,
214
  "step": 27
215
  },
216
  {
217
- "epoch": 0.0009991881596203085,
218
- "grad_norm": 2.7317492961883545,
219
- "learning_rate": 0.0002971572839529358,
220
- "loss": 2.6316,
221
  "step": 28
222
  },
223
  {
224
- "epoch": 0.0010348734510353195,
225
- "grad_norm": 4.169151306152344,
226
- "learning_rate": 0.00029683379945342125,
227
- "loss": 3.3164,
228
  "step": 29
229
  },
230
  {
231
- "epoch": 0.0010705587424503305,
232
- "grad_norm": 4.244836807250977,
233
- "learning_rate": 0.000296493083356513,
234
- "loss": 2.8311,
235
  "step": 30
236
  },
237
  {
238
- "epoch": 0.0011062440338653415,
239
- "grad_norm": 3.5411789417266846,
240
- "learning_rate": 0.00029613517564675565,
241
- "loss": 2.5447,
242
  "step": 31
243
  },
244
  {
245
- "epoch": 0.0011419293252803526,
246
- "grad_norm": 3.9989023208618164,
247
- "learning_rate": 0.0002957601183262058,
248
- "loss": 2.6177,
249
  "step": 32
250
  },
251
  {
252
- "epoch": 0.0011776146166953636,
253
- "grad_norm": 3.9558351039886475,
254
- "learning_rate": 0.000295367955409503,
255
- "loss": 2.5028,
256
  "step": 33
257
  },
258
  {
259
- "epoch": 0.0012132999081103746,
260
- "grad_norm": 4.089743614196777,
261
- "learning_rate": 0.00029495873291870436,
262
- "loss": 3.1483,
263
  "step": 34
264
  },
265
  {
266
- "epoch": 0.0012489851995253856,
267
- "grad_norm": 4.250207901000977,
268
- "learning_rate": 0.0002945324988778834,
269
- "loss": 2.8495,
270
  "step": 35
271
  },
272
  {
273
- "epoch": 0.0012846704909403965,
274
- "grad_norm": 5.241243839263916,
275
- "learning_rate": 0.00029408930330749477,
276
- "loss": 3.318,
277
  "step": 36
278
  },
279
  {
280
- "epoch": 0.0013203557823554077,
281
- "grad_norm": 3.5582404136657715,
282
- "learning_rate": 0.0002936291982185036,
283
- "loss": 2.5436,
284
  "step": 37
285
  },
286
  {
287
- "epoch": 0.0013560410737704187,
288
- "grad_norm": 4.2042622566223145,
289
- "learning_rate": 0.00029315223760628217,
290
- "loss": 2.849,
291
  "step": 38
292
  },
293
  {
294
- "epoch": 0.0013917263651854297,
295
- "grad_norm": 3.4244472980499268,
296
- "learning_rate": 0.00029265847744427303,
297
- "loss": 2.2284,
298
  "step": 39
299
  },
300
  {
301
- "epoch": 0.0014274116566004406,
302
- "grad_norm": 5.198617458343506,
303
- "learning_rate": 0.00029214797567742035,
304
- "loss": 2.3866,
305
  "step": 40
306
  },
307
  {
308
- "epoch": 0.0014274116566004406,
309
- "eval_loss": 2.7816474437713623,
310
- "eval_runtime": 199.5889,
311
- "eval_samples_per_second": 11.464,
312
- "eval_steps_per_second": 11.464,
313
- "step": 40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  }
315
  ],
316
  "logging_steps": 1,
317
- "max_steps": 300,
318
  "num_input_tokens_seen": 0,
319
  "num_train_epochs": 1,
320
- "save_steps": 20,
321
  "stateful_callbacks": {
322
  "TrainerControl": {
323
  "args": {
@@ -330,7 +392,7 @@
330
  "attributes": {}
331
  }
332
  },
333
- "total_flos": 769818193035264.0,
334
  "train_batch_size": 1,
335
  "trial_name": null,
336
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.001757361146502412,
5
+ "eval_steps": 50,
6
+ "global_step": 50,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 3.514722293004824e-05,
13
+ "grad_norm": 3.2373061180114746,
14
  "learning_rate": 2.9999999999999997e-05,
15
+ "loss": 2.6534,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 3.514722293004824e-05,
20
+ "eval_loss": 3.5773849487304688,
21
+ "eval_runtime": 122.8556,
22
+ "eval_samples_per_second": 4.656,
23
+ "eval_steps_per_second": 4.656,
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 7.029444586009648e-05,
28
+ "grad_norm": 10.253190040588379,
29
  "learning_rate": 5.9999999999999995e-05,
30
+ "loss": 3.5291,
31
  "step": 2
32
  },
33
  {
34
+ "epoch": 0.00010544166879014472,
35
+ "grad_norm": 8.353500366210938,
36
  "learning_rate": 8.999999999999999e-05,
37
+ "loss": 2.8977,
38
  "step": 3
39
  },
40
  {
41
+ "epoch": 0.00014058889172019297,
42
+ "grad_norm": 7.733084201812744,
43
  "learning_rate": 0.00011999999999999999,
44
+ "loss": 3.5255,
45
  "step": 4
46
  },
47
  {
48
+ "epoch": 0.0001757361146502412,
49
+ "grad_norm": 9.435683250427246,
50
  "learning_rate": 0.00015,
51
+ "loss": 2.3491,
52
  "step": 5
53
  },
54
  {
55
+ "epoch": 0.00021088333758028944,
56
+ "grad_norm": 7.884566307067871,
57
  "learning_rate": 0.00017999999999999998,
58
+ "loss": 3.0472,
59
  "step": 6
60
  },
61
  {
62
+ "epoch": 0.0002460305605103377,
63
+ "grad_norm": 7.662365436553955,
64
  "learning_rate": 0.00020999999999999998,
65
+ "loss": 2.8102,
66
  "step": 7
67
  },
68
  {
69
+ "epoch": 0.00028117778344038594,
70
+ "grad_norm": 12.221363067626953,
71
  "learning_rate": 0.00023999999999999998,
72
+ "loss": 2.9842,
73
  "step": 8
74
  },
75
  {
76
+ "epoch": 0.00031632500637043413,
77
+ "grad_norm": 6.3601908683776855,
78
  "learning_rate": 0.00027,
79
+ "loss": 2.3343,
80
  "step": 9
81
  },
82
  {
83
+ "epoch": 0.0003514722293004824,
84
+ "grad_norm": 7.852142810821533,
85
  "learning_rate": 0.0003,
86
+ "loss": 2.6562,
87
  "step": 10
88
  },
89
  {
90
+ "epoch": 0.00038661945223053063,
91
+ "grad_norm": 3.860373020172119,
92
+ "learning_rate": 0.00029999691704375486,
93
+ "loss": 3.1401,
94
  "step": 11
95
  },
96
  {
97
+ "epoch": 0.0004217666751605789,
98
+ "grad_norm": 6.923058032989502,
99
+ "learning_rate": 0.00029998766830174786,
100
+ "loss": 2.9283,
101
  "step": 12
102
  },
103
  {
104
+ "epoch": 0.00045691389809062713,
105
+ "grad_norm": 7.274583339691162,
106
+ "learning_rate": 0.00029997225415415846,
107
+ "loss": 2.9534,
108
  "step": 13
109
  },
110
  {
111
+ "epoch": 0.0004920611210206754,
112
+ "grad_norm": 6.7508978843688965,
113
+ "learning_rate": 0.00029995067523460196,
114
+ "loss": 3.2048,
115
  "step": 14
116
  },
117
  {
118
+ "epoch": 0.0005272083439507236,
119
+ "grad_norm": 7.768868446350098,
120
+ "learning_rate": 0.0002999229324301032,
121
+ "loss": 3.0451,
122
  "step": 15
123
  },
124
  {
125
+ "epoch": 0.0005623555668807719,
126
+ "grad_norm": 5.01677131652832,
127
+ "learning_rate": 0.0002998890268810601,
128
+ "loss": 2.8798,
129
  "step": 16
130
  },
131
  {
132
+ "epoch": 0.0005975027898108201,
133
+ "grad_norm": 6.7470293045043945,
134
+ "learning_rate": 0.0002998489599811972,
135
+ "loss": 2.47,
136
  "step": 17
137
  },
138
  {
139
+ "epoch": 0.0006326500127408683,
140
+ "grad_norm": 8.27274227142334,
141
+ "learning_rate": 0.00029980273337750765,
142
+ "loss": 3.1441,
143
  "step": 18
144
  },
145
  {
146
+ "epoch": 0.0006677972356709165,
147
+ "grad_norm": 8.152812004089355,
148
+ "learning_rate": 0.00029975034897018613,
149
+ "loss": 3.4123,
150
  "step": 19
151
  },
152
  {
153
+ "epoch": 0.0007029444586009648,
154
+ "grad_norm": 7.479596138000488,
155
+ "learning_rate": 0.00029969180891255043,
156
+ "loss": 3.336,
157
  "step": 20
158
  },
159
  {
160
+ "epoch": 0.000738091681531013,
161
+ "grad_norm": 6.2453789710998535,
162
+ "learning_rate": 0.00029962711561095306,
163
+ "loss": 3.3127,
 
 
 
 
 
 
 
 
164
  "step": 21
165
  },
166
  {
167
+ "epoch": 0.0007732389044610613,
168
+ "grad_norm": 7.580628871917725,
169
+ "learning_rate": 0.00029955627172468223,
170
+ "loss": 3.2636,
171
  "step": 22
172
  },
173
  {
174
+ "epoch": 0.0008083861273911095,
175
+ "grad_norm": 6.434226989746094,
176
+ "learning_rate": 0.0002994792801658527,
177
+ "loss": 2.7362,
178
  "step": 23
179
  },
180
  {
181
+ "epoch": 0.0008435333503211578,
182
+ "grad_norm": 6.997501373291016,
183
+ "learning_rate": 0.00029939614409928584,
184
+ "loss": 2.872,
185
  "step": 24
186
  },
187
  {
188
+ "epoch": 0.000878680573251206,
189
+ "grad_norm": 6.878482818603516,
190
+ "learning_rate": 0.0002993068669423797,
191
+ "loss": 2.7587,
192
  "step": 25
193
  },
194
  {
195
+ "epoch": 0.0009138277961812543,
196
+ "grad_norm": 6.985559463500977,
197
+ "learning_rate": 0.0002992114523649686,
198
+ "loss": 2.891,
199
  "step": 26
200
  },
201
  {
202
+ "epoch": 0.0009489750191113025,
203
+ "grad_norm": 9.444601058959961,
204
+ "learning_rate": 0.000299109904289172,
205
+ "loss": 2.9249,
206
  "step": 27
207
  },
208
  {
209
+ "epoch": 0.0009841222420413508,
210
+ "grad_norm": 6.679138660430908,
211
+ "learning_rate": 0.0002990022268892337,
212
+ "loss": 2.752,
213
  "step": 28
214
  },
215
  {
216
+ "epoch": 0.001019269464971399,
217
+ "grad_norm": 9.364578247070312,
218
+ "learning_rate": 0.00029888842459134974,
219
+ "loss": 3.3749,
220
  "step": 29
221
  },
222
  {
223
+ "epoch": 0.0010544166879014473,
224
+ "grad_norm": 12.166234970092773,
225
+ "learning_rate": 0.0002987685020734869,
226
+ "loss": 3.7972,
227
  "step": 30
228
  },
229
  {
230
+ "epoch": 0.0010895639108314955,
231
+ "grad_norm": 7.539794921875,
232
+ "learning_rate": 0.0002986424642651902,
233
+ "loss": 2.9194,
234
  "step": 31
235
  },
236
  {
237
+ "epoch": 0.0011247111337615438,
238
+ "grad_norm": 9.334528923034668,
239
+ "learning_rate": 0.00029851031634738024,
240
+ "loss": 3.0255,
241
  "step": 32
242
  },
243
  {
244
+ "epoch": 0.001159858356691592,
245
+ "grad_norm": 9.99315357208252,
246
+ "learning_rate": 0.0002983720637521404,
247
+ "loss": 2.8137,
248
  "step": 33
249
  },
250
  {
251
+ "epoch": 0.0011950055796216402,
252
+ "grad_norm": 8.625016212463379,
253
+ "learning_rate": 0.00029822771216249334,
254
+ "loss": 3.1498,
255
  "step": 34
256
  },
257
  {
258
+ "epoch": 0.0012301528025516883,
259
+ "grad_norm": 7.1607441902160645,
260
+ "learning_rate": 0.00029807726751216753,
261
+ "loss": 2.9576,
262
  "step": 35
263
  },
264
  {
265
+ "epoch": 0.0012653000254817365,
266
+ "grad_norm": 7.234793186187744,
267
+ "learning_rate": 0.0002979207359853532,
268
+ "loss": 2.7008,
269
  "step": 36
270
  },
271
  {
272
+ "epoch": 0.0013004472484117848,
273
+ "grad_norm": 9.99887752532959,
274
+ "learning_rate": 0.0002977581240164485,
275
+ "loss": 3.6027,
276
  "step": 37
277
  },
278
  {
279
+ "epoch": 0.001335594471341833,
280
+ "grad_norm": 9.398946762084961,
281
+ "learning_rate": 0.00029758943828979444,
282
+ "loss": 3.0668,
283
  "step": 38
284
  },
285
  {
286
+ "epoch": 0.0013707416942718813,
287
+ "grad_norm": 7.37238073348999,
288
+ "learning_rate": 0.00029741468573940056,
289
+ "loss": 2.915,
290
  "step": 39
291
  },
292
  {
293
+ "epoch": 0.0014058889172019295,
294
+ "grad_norm": 9.463376998901367,
295
+ "learning_rate": 0.0002972338735486598,
296
+ "loss": 2.7669,
297
  "step": 40
298
  },
299
  {
300
+ "epoch": 0.0014410361401319778,
301
+ "grad_norm": 8.600595474243164,
302
+ "learning_rate": 0.00029704700915005305,
303
+ "loss": 2.9336,
304
+ "step": 41
305
+ },
306
+ {
307
+ "epoch": 0.001476183363062026,
308
+ "grad_norm": 9.883442878723145,
309
+ "learning_rate": 0.00029685410022484393,
310
+ "loss": 3.2071,
311
+ "step": 42
312
+ },
313
+ {
314
+ "epoch": 0.0015113305859920743,
315
+ "grad_norm": 12.098119735717773,
316
+ "learning_rate": 0.0002966551547027627,
317
+ "loss": 3.0556,
318
+ "step": 43
319
+ },
320
+ {
321
+ "epoch": 0.0015464778089221225,
322
+ "grad_norm": 17.335891723632812,
323
+ "learning_rate": 0.0002964501807616806,
324
+ "loss": 3.9033,
325
+ "step": 44
326
+ },
327
+ {
328
+ "epoch": 0.0015816250318521708,
329
+ "grad_norm": 8.842806816101074,
330
+ "learning_rate": 0.0002962391868272735,
331
+ "loss": 3.3062,
332
+ "step": 45
333
+ },
334
+ {
335
+ "epoch": 0.001616772254782219,
336
+ "grad_norm": 11.304153442382812,
337
+ "learning_rate": 0.0002960221815726757,
338
+ "loss": 2.0332,
339
+ "step": 46
340
+ },
341
+ {
342
+ "epoch": 0.0016519194777122673,
343
+ "grad_norm": 10.703750610351562,
344
+ "learning_rate": 0.00029579917391812314,
345
+ "loss": 2.9962,
346
+ "step": 47
347
+ },
348
+ {
349
+ "epoch": 0.0016870667006423155,
350
+ "grad_norm": 20.069766998291016,
351
+ "learning_rate": 0.0002955701730305872,
352
+ "loss": 3.6418,
353
+ "step": 48
354
+ },
355
+ {
356
+ "epoch": 0.0017222139235723638,
357
+ "grad_norm": 11.564349174499512,
358
+ "learning_rate": 0.00029533518832339727,
359
+ "loss": 3.2814,
360
+ "step": 49
361
+ },
362
+ {
363
+ "epoch": 0.001757361146502412,
364
+ "grad_norm": 12.544486045837402,
365
+ "learning_rate": 0.0002950942294558544,
366
+ "loss": 3.4021,
367
+ "step": 50
368
+ },
369
+ {
370
+ "epoch": 0.001757361146502412,
371
+ "eval_loss": 3.1045420169830322,
372
+ "eval_runtime": 122.4658,
373
+ "eval_samples_per_second": 4.671,
374
+ "eval_steps_per_second": 4.671,
375
+ "step": 50
376
  }
377
  ],
378
  "logging_steps": 1,
379
+ "max_steps": 500,
380
  "num_input_tokens_seen": 0,
381
  "num_train_epochs": 1,
382
+ "save_steps": 50,
383
  "stateful_callbacks": {
384
  "TrainerControl": {
385
  "args": {
 
392
  "attributes": {}
393
  }
394
  },
395
+ "total_flos": 4482169061769216.0,
396
  "train_batch_size": 1,
397
  "trial_name": null,
398
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8572afd05393422d8959a963fd5560752865fdb6ad5191112e1b0b4f625b9672
3
  size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:467127371b2d4da2f03dc3723fc2daad3034ada0f638fbfcb2df74368b95df56
3
  size 6776