starlineventures commited on
Commit
a4cc6c5
·
verified ·
1 Parent(s): c7d6b26

starlineventures/pilot-talk

Browse files
adapter_config.json CHANGED
@@ -27,13 +27,13 @@
27
  "rank_pattern": {},
28
  "revision": null,
29
  "target_modules": [
30
- "gate_proj",
31
  "v_proj",
32
- "k_proj",
33
- "q_proj",
34
  "up_proj",
 
 
35
  "down_proj",
36
- "o_proj"
37
  ],
38
  "task_type": null,
39
  "trainable_token_indices": null,
 
27
  "rank_pattern": {},
28
  "revision": null,
29
  "target_modules": [
 
30
  "v_proj",
31
+ "gate_proj",
 
32
  "up_proj",
33
+ "k_proj",
34
+ "o_proj",
35
  "down_proj",
36
+ "q_proj"
37
  ],
38
  "task_type": null,
39
  "trainable_token_indices": null,
all_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.09647794626653194,
4
- "train_runtime": 276.4027,
5
- "train_samples_per_second": 4.631,
6
- "train_steps_per_second": 1.158
7
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.07282223819444576,
4
+ "train_runtime": 407.0393,
5
+ "train_samples_per_second": 4.717,
6
+ "train_steps_per_second": 1.179
7
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:25afe2c83203d89e709118ec4867d6c1d40d0479a8296b4f90f5090578885517
3
  size 3554214752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5dcb51bcb5fd5b04df0d65cdfc09af0808fe05bcde2c3dcf56ecefa366eed2f
3
  size 3554214752
train_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.09647794626653194,
4
- "train_runtime": 276.4027,
5
- "train_samples_per_second": 4.631,
6
- "train_steps_per_second": 1.158
7
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.07282223819444576,
4
+ "train_runtime": 407.0393,
5
+ "train_samples_per_second": 4.717,
6
+ "train_steps_per_second": 1.179
7
  }
trainer_state.json CHANGED
@@ -2,98 +2,98 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.0,
6
  "eval_steps": 500,
7
- "global_step": 320,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.0625,
14
- "grad_norm": 0.6001169085502625,
15
- "learning_rate": 9.718750000000001e-05,
16
- "loss": 1.9881,
17
- "mean_token_accuracy": 0.7977039694786072,
18
  "num_tokens": 81920.0,
19
  "step": 10
20
  },
21
  {
22
  "epoch": 0.125,
23
- "grad_norm": 0.46209943294525146,
24
- "learning_rate": 9.40625e-05,
25
- "loss": 0.1785,
26
- "mean_token_accuracy": 0.9618344008922577,
27
  "num_tokens": 163840.0,
28
  "step": 20
29
  },
30
  {
31
  "epoch": 0.1875,
32
- "grad_norm": 0.25115829706192017,
33
- "learning_rate": 9.093750000000001e-05,
34
- "loss": 0.075,
35
- "mean_token_accuracy": 0.9808378159999848,
36
  "num_tokens": 245760.0,
37
  "step": 30
38
  },
39
  {
40
  "epoch": 0.25,
41
- "grad_norm": 0.18340203166007996,
42
- "learning_rate": 8.781250000000001e-05,
43
- "loss": 0.0518,
44
- "mean_token_accuracy": 0.98480703830719,
45
  "num_tokens": 327680.0,
46
  "step": 40
47
  },
48
  {
49
  "epoch": 0.3125,
50
- "grad_norm": 0.16958290338516235,
51
- "learning_rate": 8.46875e-05,
52
- "loss": 0.0439,
53
- "mean_token_accuracy": 0.9856253087520599,
54
  "num_tokens": 409600.0,
55
  "step": 50
56
  },
57
  {
58
  "epoch": 0.375,
59
- "grad_norm": 0.18090437352657318,
60
- "learning_rate": 8.156250000000001e-05,
61
- "loss": 0.0362,
62
- "mean_token_accuracy": 0.9867611169815064,
63
  "num_tokens": 491520.0,
64
  "step": 60
65
  },
66
  {
67
  "epoch": 0.4375,
68
- "grad_norm": 0.14541096985340118,
69
- "learning_rate": 7.84375e-05,
70
- "loss": 0.0324,
71
- "mean_token_accuracy": 0.9876038134098053,
72
  "num_tokens": 573440.0,
73
  "step": 70
74
  },
75
  {
76
  "epoch": 0.5,
77
- "grad_norm": 0.3248767852783203,
78
- "learning_rate": 7.531250000000001e-05,
79
- "loss": 0.0309,
80
- "mean_token_accuracy": 0.9874206185340881,
81
  "num_tokens": 655360.0,
82
  "step": 80
83
  },
84
  {
85
  "epoch": 0.5625,
86
- "grad_norm": 0.12345177680253983,
87
- "learning_rate": 7.218750000000001e-05,
88
- "loss": 0.03,
89
- "mean_token_accuracy": 0.9874084055423736,
90
  "num_tokens": 737280.0,
91
  "step": 90
92
  },
93
  {
94
  "epoch": 0.625,
95
- "grad_norm": 0.12568464875221252,
96
- "learning_rate": 6.90625e-05,
97
  "loss": 0.0288,
98
  "mean_token_accuracy": 0.9879335641860962,
99
  "num_tokens": 819200.0,
@@ -101,237 +101,388 @@
101
  },
102
  {
103
  "epoch": 0.6875,
104
- "grad_norm": 0.11818379908800125,
105
- "learning_rate": 6.593750000000001e-05,
106
- "loss": 0.0288,
107
- "mean_token_accuracy": 0.9873839795589447,
108
  "num_tokens": 901120.0,
109
  "step": 110
110
  },
111
- {
112
- "epoch": 0.6875,
113
- "eval_runtime": 11.8942,
114
- "eval_samples_per_second": 13.452,
115
- "eval_steps_per_second": 0.841,
116
- "step": 110
117
- },
118
  {
119
  "epoch": 0.75,
120
- "grad_norm": 0.12166234850883484,
121
- "learning_rate": 6.28125e-05,
122
- "loss": 0.0282,
123
- "mean_token_accuracy": 0.9881167590618134,
124
  "num_tokens": 983040.0,
125
  "step": 120
126
  },
127
  {
128
  "epoch": 0.8125,
129
- "grad_norm": 0.1256779283285141,
130
- "learning_rate": 5.968750000000001e-05,
131
  "loss": 0.0279,
132
- "mean_token_accuracy": 0.9878847122192382,
133
  "num_tokens": 1064960.0,
134
  "step": 130
135
  },
136
  {
137
  "epoch": 0.875,
138
- "grad_norm": 0.11304116994142532,
139
- "learning_rate": 5.6562500000000006e-05,
140
  "loss": 0.0276,
141
- "mean_token_accuracy": 0.987799221277237,
142
  "num_tokens": 1146880.0,
143
  "step": 140
144
  },
145
  {
146
  "epoch": 0.9375,
147
- "grad_norm": 0.10594538599252701,
148
- "learning_rate": 5.3437500000000005e-05,
149
  "loss": 0.0276,
150
- "mean_token_accuracy": 0.9880923330783844,
151
  "num_tokens": 1228800.0,
152
  "step": 150
153
  },
154
  {
155
  "epoch": 1.0,
156
- "grad_norm": 0.1052316427230835,
157
- "learning_rate": 5.031250000000001e-05,
158
- "loss": 0.0274,
159
- "mean_token_accuracy": 0.9881289720535278,
160
  "num_tokens": 1310720.0,
161
  "step": 160
162
  },
163
  {
164
  "epoch": 1.0,
165
- "eval_runtime": 10.8105,
166
- "eval_samples_per_second": 14.8,
167
- "eval_steps_per_second": 0.925,
 
 
 
 
 
 
 
168
  "step": 160
169
  },
170
  {
171
  "epoch": 1.0625,
172
- "grad_norm": 0.11138579249382019,
173
- "learning_rate": 4.71875e-05,
174
  "loss": 0.0267,
175
- "mean_token_accuracy": 0.9883243799209595,
176
  "num_tokens": 1392640.0,
177
  "step": 170
178
  },
179
  {
180
  "epoch": 1.125,
181
- "grad_norm": 0.11131029576063156,
182
- "learning_rate": 4.40625e-05,
183
- "loss": 0.027,
184
- "mean_token_accuracy": 0.9881411850452423,
185
  "num_tokens": 1474560.0,
186
  "step": 180
187
  },
188
  {
189
  "epoch": 1.1875,
190
- "grad_norm": 0.11658758670091629,
191
- "learning_rate": 4.09375e-05,
192
- "loss": 0.0269,
193
- "mean_token_accuracy": 0.9882755279541016,
194
  "num_tokens": 1556480.0,
195
  "step": 190
196
  },
197
  {
198
  "epoch": 1.25,
199
- "grad_norm": 0.11032383143901825,
200
- "learning_rate": 3.78125e-05,
201
- "loss": 0.0266,
202
- "mean_token_accuracy": 0.9884098708629608,
203
  "num_tokens": 1638400.0,
204
  "step": 200
205
  },
206
  {
207
  "epoch": 1.3125,
208
- "grad_norm": 0.10452437400817871,
209
- "learning_rate": 3.46875e-05,
210
- "loss": 0.0267,
211
- "mean_token_accuracy": 0.9882388889789582,
212
  "num_tokens": 1720320.0,
213
  "step": 210
214
  },
215
  {
216
  "epoch": 1.375,
217
- "grad_norm": 0.10454697161912918,
218
- "learning_rate": 3.15625e-05,
219
- "loss": 0.0267,
220
- "mean_token_accuracy": 0.9878969252109527,
221
  "num_tokens": 1802240.0,
222
  "step": 220
223
  },
224
  {
225
  "epoch": 1.4375,
226
- "grad_norm": 0.09663768112659454,
227
- "learning_rate": 2.84375e-05,
228
- "loss": 0.0266,
229
- "mean_token_accuracy": 0.9883976578712463,
230
  "num_tokens": 1884160.0,
231
  "step": 230
232
  },
233
  {
234
  "epoch": 1.5,
235
- "grad_norm": 0.11553214490413666,
236
- "learning_rate": 2.53125e-05,
237
- "loss": 0.0267,
238
- "mean_token_accuracy": 0.9884098708629608,
239
  "num_tokens": 1966080.0,
240
  "step": 240
241
  },
242
  {
243
  "epoch": 1.5625,
244
- "grad_norm": 0.11412502825260162,
245
- "learning_rate": 2.21875e-05,
246
- "loss": 0.0263,
247
- "mean_token_accuracy": 0.9883854448795318,
248
  "num_tokens": 2048000.0,
249
  "step": 250
250
  },
251
  {
252
  "epoch": 1.625,
253
- "grad_norm": 0.1027199923992157,
254
- "learning_rate": 1.90625e-05,
255
- "loss": 0.0264,
256
- "mean_token_accuracy": 0.9882999539375306,
257
  "num_tokens": 2129920.0,
258
  "step": 260
259
  },
260
  {
261
  "epoch": 1.6875,
262
- "grad_norm": 0.12076633423566818,
263
- "learning_rate": 1.59375e-05,
264
- "loss": 0.0264,
265
- "mean_token_accuracy": 0.9882266759872437,
266
  "num_tokens": 2211840.0,
267
  "step": 270
268
  },
269
  {
270
  "epoch": 1.75,
271
- "grad_norm": 0.1111675500869751,
272
- "learning_rate": 1.28125e-05,
273
- "loss": 0.0264,
274
- "mean_token_accuracy": 0.9879335641860962,
275
  "num_tokens": 2293760.0,
276
  "step": 280
277
  },
278
  {
279
  "epoch": 1.8125,
280
- "grad_norm": 0.1163574606180191,
281
- "learning_rate": 9.6875e-06,
282
- "loss": 0.0262,
283
- "mean_token_accuracy": 0.9885197877883911,
284
  "num_tokens": 2375680.0,
285
  "step": 290
286
  },
287
  {
288
  "epoch": 1.875,
289
- "grad_norm": 0.11461054533720016,
290
- "learning_rate": 6.5625e-06,
291
- "loss": 0.0261,
292
- "mean_token_accuracy": 0.988593065738678,
293
  "num_tokens": 2457600.0,
294
  "step": 300
295
  },
296
  {
297
  "epoch": 1.9375,
298
- "grad_norm": 0.10500755161046982,
299
- "learning_rate": 3.4375000000000005e-06,
300
- "loss": 0.0263,
301
- "mean_token_accuracy": 0.9882633149623871,
302
  "num_tokens": 2539520.0,
303
  "step": 310
304
  },
305
  {
306
  "epoch": 2.0,
307
- "grad_norm": 0.11479007452726364,
308
- "learning_rate": 3.125e-07,
309
- "loss": 0.0262,
310
- "mean_token_accuracy": 0.988336592912674,
311
  "num_tokens": 2621440.0,
312
  "step": 320
313
  },
314
  {
315
  "epoch": 2.0,
316
- "eval_runtime": 10.8162,
317
- "eval_samples_per_second": 14.793,
318
- "eval_steps_per_second": 0.925,
319
  "step": 320
320
  },
321
  {
322
- "epoch": 2.0,
323
- "step": 320,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
  "total_flos": 0.0,
325
- "train_loss": 0.09647794626653194,
326
- "train_runtime": 276.4027,
327
- "train_samples_per_second": 4.631,
328
- "train_steps_per_second": 1.158
329
  }
330
  ],
331
  "logging_steps": 10,
332
- "max_steps": 320,
333
  "num_input_tokens_seen": 0,
334
- "num_train_epochs": 2,
335
  "save_steps": 500,
336
  "stateful_callbacks": {
337
  "TrainerControl": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
  "eval_steps": 500,
7
+ "global_step": 480,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.0625,
14
+ "grad_norm": 0.5966607332229614,
15
+ "learning_rate": 9.8125e-05,
16
+ "loss": 1.9853,
17
+ "mean_token_accuracy": 0.7986810088157654,
18
  "num_tokens": 81920.0,
19
  "step": 10
20
  },
21
  {
22
  "epoch": 0.125,
23
+ "grad_norm": 0.517964780330658,
24
+ "learning_rate": 9.604166666666668e-05,
25
+ "loss": 0.1767,
26
+ "mean_token_accuracy": 0.9620053827762604,
27
  "num_tokens": 163840.0,
28
  "step": 20
29
  },
30
  {
31
  "epoch": 0.1875,
32
+ "grad_norm": 0.3082728683948517,
33
+ "learning_rate": 9.395833333333333e-05,
34
+ "loss": 0.0745,
35
+ "mean_token_accuracy": 0.9809599459171295,
36
  "num_tokens": 245760.0,
37
  "step": 30
38
  },
39
  {
40
  "epoch": 0.25,
41
+ "grad_norm": 0.1719624400138855,
42
+ "learning_rate": 9.1875e-05,
43
+ "loss": 0.0512,
44
+ "mean_token_accuracy": 0.9850024461746216,
45
  "num_tokens": 327680.0,
46
  "step": 40
47
  },
48
  {
49
  "epoch": 0.3125,
50
+ "grad_norm": 0.16888852417469025,
51
+ "learning_rate": 8.979166666666668e-05,
52
+ "loss": 0.0432,
53
+ "mean_token_accuracy": 0.9855642437934875,
54
  "num_tokens": 409600.0,
55
  "step": 50
56
  },
57
  {
58
  "epoch": 0.375,
59
+ "grad_norm": 0.18019415438175201,
60
+ "learning_rate": 8.770833333333334e-05,
61
+ "loss": 0.0358,
62
+ "mean_token_accuracy": 0.9871152937412262,
63
  "num_tokens": 491520.0,
64
  "step": 60
65
  },
66
  {
67
  "epoch": 0.4375,
68
+ "grad_norm": 0.14396421611309052,
69
+ "learning_rate": 8.5625e-05,
70
+ "loss": 0.0322,
71
+ "mean_token_accuracy": 0.9876770913600922,
72
  "num_tokens": 573440.0,
73
  "step": 70
74
  },
75
  {
76
  "epoch": 0.5,
77
+ "grad_norm": 0.1333707869052887,
78
+ "learning_rate": 8.354166666666667e-05,
79
+ "loss": 0.0307,
80
+ "mean_token_accuracy": 0.9872984886169434,
81
  "num_tokens": 655360.0,
82
  "step": 80
83
  },
84
  {
85
  "epoch": 0.5625,
86
+ "grad_norm": 0.118904247879982,
87
+ "learning_rate": 8.145833333333334e-05,
88
+ "loss": 0.0298,
89
+ "mean_token_accuracy": 0.9874206185340881,
90
  "num_tokens": 737280.0,
91
  "step": 90
92
  },
93
  {
94
  "epoch": 0.625,
95
+ "grad_norm": 0.10813665390014648,
96
+ "learning_rate": 7.9375e-05,
97
  "loss": 0.0288,
98
  "mean_token_accuracy": 0.9879335641860962,
99
  "num_tokens": 819200.0,
 
101
  },
102
  {
103
  "epoch": 0.6875,
104
+ "grad_norm": 0.11911392956972122,
105
+ "learning_rate": 7.729166666666667e-05,
106
+ "loss": 0.0287,
107
+ "mean_token_accuracy": 0.987469470500946,
108
  "num_tokens": 901120.0,
109
  "step": 110
110
  },
 
 
 
 
 
 
 
111
  {
112
  "epoch": 0.75,
113
+ "grad_norm": 0.11651206016540527,
114
+ "learning_rate": 7.520833333333334e-05,
115
+ "loss": 0.0281,
116
+ "mean_token_accuracy": 0.9881045460700989,
117
  "num_tokens": 983040.0,
118
  "step": 120
119
  },
120
  {
121
  "epoch": 0.8125,
122
+ "grad_norm": 0.12028653174638748,
123
+ "learning_rate": 7.3125e-05,
124
  "loss": 0.0279,
125
+ "mean_token_accuracy": 0.9879335641860962,
126
  "num_tokens": 1064960.0,
127
  "step": 130
128
  },
129
  {
130
  "epoch": 0.875,
131
+ "grad_norm": 0.10548015683889389,
132
+ "learning_rate": 7.104166666666667e-05,
133
  "loss": 0.0276,
134
+ "mean_token_accuracy": 0.9877259433269501,
135
  "num_tokens": 1146880.0,
136
  "step": 140
137
  },
138
  {
139
  "epoch": 0.9375,
140
+ "grad_norm": 0.10059994459152222,
141
+ "learning_rate": 6.895833333333333e-05,
142
  "loss": 0.0276,
143
+ "mean_token_accuracy": 0.9879824161529541,
144
  "num_tokens": 1228800.0,
145
  "step": 150
146
  },
147
  {
148
  "epoch": 1.0,
149
+ "grad_norm": 0.09910538047552109,
150
+ "learning_rate": 6.6875e-05,
151
+ "loss": 0.0275,
152
+ "mean_token_accuracy": 0.9881045460700989,
153
  "num_tokens": 1310720.0,
154
  "step": 160
155
  },
156
  {
157
  "epoch": 1.0,
158
+ "eval_runtime": 11.8471,
159
+ "eval_samples_per_second": 13.505,
160
+ "eval_steps_per_second": 0.844,
161
+ "step": 160
162
+ },
163
+ {
164
+ "epoch": 1.0,
165
+ "eval_runtime": 10.9136,
166
+ "eval_samples_per_second": 14.661,
167
+ "eval_steps_per_second": 0.916,
168
  "step": 160
169
  },
170
  {
171
  "epoch": 1.0625,
172
+ "grad_norm": 0.10032763332128525,
173
+ "learning_rate": 6.479166666666668e-05,
174
  "loss": 0.0267,
175
+ "mean_token_accuracy": 0.9882633149623871,
176
  "num_tokens": 1392640.0,
177
  "step": 170
178
  },
179
  {
180
  "epoch": 1.125,
181
+ "grad_norm": 0.10291949659585953,
182
+ "learning_rate": 6.270833333333333e-05,
183
+ "loss": 0.0271,
184
+ "mean_token_accuracy": 0.9882144629955292,
185
  "num_tokens": 1474560.0,
186
  "step": 180
187
  },
188
  {
189
  "epoch": 1.1875,
190
+ "grad_norm": 0.10503465682268143,
191
+ "learning_rate": 6.0624999999999996e-05,
192
+ "loss": 0.027,
193
+ "mean_token_accuracy": 0.988031268119812,
194
  "num_tokens": 1556480.0,
195
  "step": 190
196
  },
197
  {
198
  "epoch": 1.25,
199
+ "grad_norm": 0.09680064767599106,
200
+ "learning_rate": 5.8541666666666676e-05,
201
+ "loss": 0.0268,
202
+ "mean_token_accuracy": 0.9883976578712463,
203
  "num_tokens": 1638400.0,
204
  "step": 200
205
  },
206
  {
207
  "epoch": 1.3125,
208
+ "grad_norm": 0.09701237827539444,
209
+ "learning_rate": 5.6458333333333335e-05,
210
+ "loss": 0.0268,
211
+ "mean_token_accuracy": 0.9880923330783844,
212
  "num_tokens": 1720320.0,
213
  "step": 210
214
  },
215
  {
216
  "epoch": 1.375,
217
+ "grad_norm": 0.09592857956886292,
218
+ "learning_rate": 5.4375e-05,
219
+ "loss": 0.027,
220
+ "mean_token_accuracy": 0.9881533980369568,
221
  "num_tokens": 1802240.0,
222
  "step": 220
223
  },
224
  {
225
  "epoch": 1.4375,
226
+ "grad_norm": 0.09052903950214386,
227
+ "learning_rate": 5.229166666666667e-05,
228
+ "loss": 0.027,
229
+ "mean_token_accuracy": 0.9882999539375306,
230
  "num_tokens": 1884160.0,
231
  "step": 230
232
  },
233
  {
234
  "epoch": 1.5,
235
+ "grad_norm": 0.1032903790473938,
236
+ "learning_rate": 5.020833333333333e-05,
237
+ "loss": 0.0268,
238
+ "mean_token_accuracy": 0.988312166929245,
239
  "num_tokens": 1966080.0,
240
  "step": 240
241
  },
242
  {
243
  "epoch": 1.5625,
244
+ "grad_norm": 0.10449512302875519,
245
+ "learning_rate": 4.8125000000000004e-05,
246
+ "loss": 0.0266,
247
+ "mean_token_accuracy": 0.9881900370121002,
248
  "num_tokens": 2048000.0,
249
  "step": 250
250
  },
251
  {
252
  "epoch": 1.625,
253
+ "grad_norm": 0.09428944438695908,
254
+ "learning_rate": 4.604166666666666e-05,
255
+ "loss": 0.0267,
256
+ "mean_token_accuracy": 0.9882511019706726,
257
  "num_tokens": 2129920.0,
258
  "step": 260
259
  },
260
  {
261
  "epoch": 1.6875,
262
+ "grad_norm": 0.10462497174739838,
263
+ "learning_rate": 4.3958333333333336e-05,
264
+ "loss": 0.0266,
265
+ "mean_token_accuracy": 0.9879091382026672,
266
  "num_tokens": 2211840.0,
267
  "step": 270
268
  },
269
  {
270
  "epoch": 1.75,
271
+ "grad_norm": 0.09638702869415283,
272
+ "learning_rate": 4.1875e-05,
273
+ "loss": 0.0266,
274
+ "mean_token_accuracy": 0.9883488059043884,
275
  "num_tokens": 2293760.0,
276
  "step": 280
277
  },
278
  {
279
  "epoch": 1.8125,
280
+ "grad_norm": 0.10269024223089218,
281
+ "learning_rate": 3.979166666666667e-05,
282
+ "loss": 0.0265,
283
+ "mean_token_accuracy": 0.9881533980369568,
284
  "num_tokens": 2375680.0,
285
  "step": 290
286
  },
287
  {
288
  "epoch": 1.875,
289
+ "grad_norm": 0.09432139992713928,
290
+ "learning_rate": 3.770833333333333e-05,
291
+ "loss": 0.0264,
292
+ "mean_token_accuracy": 0.9882633149623871,
293
  "num_tokens": 2457600.0,
294
  "step": 300
295
  },
296
  {
297
  "epoch": 1.9375,
298
+ "grad_norm": 0.10591922700405121,
299
+ "learning_rate": 3.5625000000000005e-05,
300
+ "loss": 0.0265,
301
+ "mean_token_accuracy": 0.9878358602523803,
302
  "num_tokens": 2539520.0,
303
  "step": 310
304
  },
305
  {
306
  "epoch": 2.0,
307
+ "grad_norm": 0.0988362580537796,
308
+ "learning_rate": 3.3541666666666664e-05,
309
+ "loss": 0.0264,
310
+ "mean_token_accuracy": 0.9880923330783844,
311
  "num_tokens": 2621440.0,
312
  "step": 320
313
  },
314
  {
315
  "epoch": 2.0,
316
+ "eval_runtime": 10.7547,
317
+ "eval_samples_per_second": 14.877,
318
+ "eval_steps_per_second": 0.93,
319
  "step": 320
320
  },
321
  {
322
+ "epoch": 2.0625,
323
+ "grad_norm": 0.08879899233579636,
324
+ "learning_rate": 3.145833333333334e-05,
325
+ "loss": 0.026,
326
+ "mean_token_accuracy": 0.9881656110286713,
327
+ "num_tokens": 2703360.0,
328
+ "step": 330
329
+ },
330
+ {
331
+ "epoch": 2.125,
332
+ "grad_norm": 0.09961431473493576,
333
+ "learning_rate": 2.9375000000000003e-05,
334
+ "loss": 0.0257,
335
+ "mean_token_accuracy": 0.9886052787303925,
336
+ "num_tokens": 2785280.0,
337
+ "step": 340
338
+ },
339
+ {
340
+ "epoch": 2.1875,
341
+ "grad_norm": 0.1122380793094635,
342
+ "learning_rate": 2.7291666666666665e-05,
343
+ "loss": 0.0261,
344
+ "mean_token_accuracy": 0.9886541306972504,
345
+ "num_tokens": 2867200.0,
346
+ "step": 350
347
+ },
348
+ {
349
+ "epoch": 2.25,
350
+ "grad_norm": 0.09964418411254883,
351
+ "learning_rate": 2.5208333333333334e-05,
352
+ "loss": 0.0255,
353
+ "mean_token_accuracy": 0.9889594554901123,
354
+ "num_tokens": 2949120.0,
355
+ "step": 360
356
+ },
357
+ {
358
+ "epoch": 2.3125,
359
+ "grad_norm": 0.09933824837207794,
360
+ "learning_rate": 2.3125000000000003e-05,
361
+ "loss": 0.0259,
362
+ "mean_token_accuracy": 0.9884220838546753,
363
+ "num_tokens": 3031040.0,
364
+ "step": 370
365
+ },
366
+ {
367
+ "epoch": 2.375,
368
+ "grad_norm": 0.09340930730104446,
369
+ "learning_rate": 2.104166666666667e-05,
370
+ "loss": 0.0262,
371
+ "mean_token_accuracy": 0.9882755279541016,
372
+ "num_tokens": 3112960.0,
373
+ "step": 380
374
+ },
375
+ {
376
+ "epoch": 2.4375,
377
+ "grad_norm": 0.09159277379512787,
378
+ "learning_rate": 1.8958333333333334e-05,
379
+ "loss": 0.0259,
380
+ "mean_token_accuracy": 0.9886297047138214,
381
+ "num_tokens": 3194880.0,
382
+ "step": 390
383
+ },
384
+ {
385
+ "epoch": 2.5,
386
+ "grad_norm": 0.10940947383642197,
387
+ "learning_rate": 1.6875000000000004e-05,
388
+ "loss": 0.0258,
389
+ "mean_token_accuracy": 0.9885075747966766,
390
+ "num_tokens": 3276800.0,
391
+ "step": 400
392
+ },
393
+ {
394
+ "epoch": 2.5625,
395
+ "grad_norm": 0.09535407274961472,
396
+ "learning_rate": 1.4791666666666668e-05,
397
+ "loss": 0.0259,
398
+ "mean_token_accuracy": 0.9886174917221069,
399
+ "num_tokens": 3358720.0,
400
+ "step": 410
401
+ },
402
+ {
403
+ "epoch": 2.625,
404
+ "grad_norm": 0.08938491344451904,
405
+ "learning_rate": 1.2708333333333333e-05,
406
+ "loss": 0.0257,
407
+ "mean_token_accuracy": 0.9884831488132477,
408
+ "num_tokens": 3440640.0,
409
+ "step": 420
410
+ },
411
+ {
412
+ "epoch": 2.6875,
413
+ "grad_norm": 0.09536239504814148,
414
+ "learning_rate": 1.0625e-05,
415
+ "loss": 0.0257,
416
+ "mean_token_accuracy": 0.9886052787303925,
417
+ "num_tokens": 3522560.0,
418
+ "step": 430
419
+ },
420
+ {
421
+ "epoch": 2.75,
422
+ "grad_norm": 0.0934009775519371,
423
+ "learning_rate": 8.541666666666666e-06,
424
+ "loss": 0.0257,
425
+ "mean_token_accuracy": 0.9886907696723938,
426
+ "num_tokens": 3604480.0,
427
+ "step": 440
428
+ },
429
+ {
430
+ "epoch": 2.8125,
431
+ "grad_norm": 0.09570059180259705,
432
+ "learning_rate": 6.458333333333334e-06,
433
+ "loss": 0.0255,
434
+ "mean_token_accuracy": 0.9888617515563964,
435
+ "num_tokens": 3686400.0,
436
+ "step": 450
437
+ },
438
+ {
439
+ "epoch": 2.875,
440
+ "grad_norm": 0.09678570926189423,
441
+ "learning_rate": 4.375e-06,
442
+ "loss": 0.0255,
443
+ "mean_token_accuracy": 0.988361018896103,
444
+ "num_tokens": 3768320.0,
445
+ "step": 460
446
+ },
447
+ {
448
+ "epoch": 2.9375,
449
+ "grad_norm": 0.0881657674908638,
450
+ "learning_rate": 2.2916666666666666e-06,
451
+ "loss": 0.0255,
452
+ "mean_token_accuracy": 0.9887884736061097,
453
+ "num_tokens": 3850240.0,
454
+ "step": 470
455
+ },
456
+ {
457
+ "epoch": 3.0,
458
+ "grad_norm": 0.10601094365119934,
459
+ "learning_rate": 2.0833333333333333e-07,
460
+ "loss": 0.0257,
461
+ "mean_token_accuracy": 0.9887518346309662,
462
+ "num_tokens": 3932160.0,
463
+ "step": 480
464
+ },
465
+ {
466
+ "epoch": 3.0,
467
+ "eval_runtime": 10.7566,
468
+ "eval_samples_per_second": 14.875,
469
+ "eval_steps_per_second": 0.93,
470
+ "step": 480
471
+ },
472
+ {
473
+ "epoch": 3.0,
474
+ "step": 480,
475
  "total_flos": 0.0,
476
+ "train_loss": 0.07282223819444576,
477
+ "train_runtime": 407.0393,
478
+ "train_samples_per_second": 4.717,
479
+ "train_steps_per_second": 1.179
480
  }
481
  ],
482
  "logging_steps": 10,
483
+ "max_steps": 480,
484
  "num_input_tokens_seen": 0,
485
+ "num_train_epochs": 3,
486
  "save_steps": 500,
487
  "stateful_callbacks": {
488
  "TrainerControl": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2d1ad38d9466f8a2da26fdcc2bc0cfe7f4da2baa9299dcd78d391608ab03e27
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:073190de65ef768971fe2524ebc4e07c2ec65829248c8f666616f97e2e837807
3
  size 6033