sedrickkeh committed on
Commit
1058391
·
verified ·
1 Parent(s): 389160c

End of training

Browse files
Files changed (5) hide show
  1. README.md +2 -1
  2. all_results.json +8 -0
  3. train_results.json +8 -0
  4. trainer_state.json +924 -0
  5. training_loss.png +0 -0
README.md CHANGED
@@ -4,6 +4,7 @@ license: apache-2.0
4
  base_model: Qwen/Qwen2.5-7B-Instruct
5
  tags:
6
  - llama-factory
 
7
  - generated_from_trainer
8
  model-index:
9
  - name: multiple_samples_none_numina_aime
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # multiple_samples_none_numina_aime
17
 
18
- This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on an unknown dataset.
19
 
20
  ## Model description
21
 
 
4
  base_model: Qwen/Qwen2.5-7B-Instruct
5
  tags:
6
  - llama-factory
7
+ - full
8
  - generated_from_trainer
9
  model-index:
10
  - name: multiple_samples_none_numina_aime
 
16
 
17
  # multiple_samples_none_numina_aime
18
 
19
+ This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on the mlfoundations-dev/multiple_samples_none_numina_aime dataset.
20
 
21
  ## Model description
22
 
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.9725490196078432,
3
+ "total_flos": 121055548211200.0,
4
+ "train_loss": 0.7961960165273576,
5
+ "train_runtime": 4088.556,
6
+ "train_samples_per_second": 2.986,
7
+ "train_steps_per_second": 0.031
8
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.9725490196078432,
3
+ "total_flos": 121055548211200.0,
4
+ "train_loss": 0.7961960165273576,
5
+ "train_runtime": 4088.556,
6
+ "train_samples_per_second": 2.986,
7
+ "train_steps_per_second": 0.031
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,924 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.9725490196078432,
5
+ "eval_steps": 500,
6
+ "global_step": 126,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.023529411764705882,
13
+ "grad_norm": 6.901826858520508,
14
+ "learning_rate": 7.692307692307694e-07,
15
+ "loss": 1.1119,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.047058823529411764,
20
+ "grad_norm": 7.422886371612549,
21
+ "learning_rate": 1.5384615384615387e-06,
22
+ "loss": 1.1537,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.07058823529411765,
27
+ "grad_norm": 6.920129299163818,
28
+ "learning_rate": 2.307692307692308e-06,
29
+ "loss": 1.1205,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.09411764705882353,
34
+ "grad_norm": 6.285982608795166,
35
+ "learning_rate": 3.0769230769230774e-06,
36
+ "loss": 1.0476,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.11764705882352941,
41
+ "grad_norm": 5.431225299835205,
42
+ "learning_rate": 3.846153846153847e-06,
43
+ "loss": 1.0753,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.1411764705882353,
48
+ "grad_norm": 3.6898839473724365,
49
+ "learning_rate": 4.615384615384616e-06,
50
+ "loss": 0.9761,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.16470588235294117,
55
+ "grad_norm": 2.8562192916870117,
56
+ "learning_rate": 5.384615384615385e-06,
57
+ "loss": 0.9581,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.18823529411764706,
62
+ "grad_norm": 4.019009590148926,
63
+ "learning_rate": 6.153846153846155e-06,
64
+ "loss": 1.0193,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.21176470588235294,
69
+ "grad_norm": 4.8232526779174805,
70
+ "learning_rate": 6.923076923076923e-06,
71
+ "loss": 0.951,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 0.23529411764705882,
76
+ "grad_norm": 4.59989595413208,
77
+ "learning_rate": 7.692307692307694e-06,
78
+ "loss": 1.0139,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.25882352941176473,
83
+ "grad_norm": 3.852893829345703,
84
+ "learning_rate": 8.461538461538462e-06,
85
+ "loss": 0.9087,
86
+ "step": 11
87
+ },
88
+ {
89
+ "epoch": 0.2823529411764706,
90
+ "grad_norm": 3.7579197883605957,
91
+ "learning_rate": 9.230769230769232e-06,
92
+ "loss": 1.0375,
93
+ "step": 12
94
+ },
95
+ {
96
+ "epoch": 0.3058823529411765,
97
+ "grad_norm": 2.5401360988616943,
98
+ "learning_rate": 1e-05,
99
+ "loss": 0.9651,
100
+ "step": 13
101
+ },
102
+ {
103
+ "epoch": 0.32941176470588235,
104
+ "grad_norm": 2.095151424407959,
105
+ "learning_rate": 9.998067787472772e-06,
106
+ "loss": 0.9098,
107
+ "step": 14
108
+ },
109
+ {
110
+ "epoch": 0.35294117647058826,
111
+ "grad_norm": 2.1526248455047607,
112
+ "learning_rate": 9.992272643269181e-06,
113
+ "loss": 0.8308,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 0.3764705882352941,
118
+ "grad_norm": 1.9617197513580322,
119
+ "learning_rate": 9.982619046369321e-06,
120
+ "loss": 0.9148,
121
+ "step": 16
122
+ },
123
+ {
124
+ "epoch": 0.4,
125
+ "grad_norm": 1.5976275205612183,
126
+ "learning_rate": 9.96911445789354e-06,
127
+ "loss": 0.8948,
128
+ "step": 17
129
+ },
130
+ {
131
+ "epoch": 0.4235294117647059,
132
+ "grad_norm": 1.484428882598877,
133
+ "learning_rate": 9.951769315335843e-06,
134
+ "loss": 0.8592,
135
+ "step": 18
136
+ },
137
+ {
138
+ "epoch": 0.4470588235294118,
139
+ "grad_norm": 1.4591351747512817,
140
+ "learning_rate": 9.930597024496933e-06,
141
+ "loss": 0.8315,
142
+ "step": 19
143
+ },
144
+ {
145
+ "epoch": 0.47058823529411764,
146
+ "grad_norm": 1.1599817276000977,
147
+ "learning_rate": 9.905613949123036e-06,
148
+ "loss": 0.808,
149
+ "step": 20
150
+ },
151
+ {
152
+ "epoch": 0.49411764705882355,
153
+ "grad_norm": 1.2381192445755005,
154
+ "learning_rate": 9.87683939825864e-06,
155
+ "loss": 0.8833,
156
+ "step": 21
157
+ },
158
+ {
159
+ "epoch": 0.5176470588235295,
160
+ "grad_norm": 1.25748872756958,
161
+ "learning_rate": 9.844295611322804e-06,
162
+ "loss": 0.873,
163
+ "step": 22
164
+ },
165
+ {
166
+ "epoch": 0.5411764705882353,
167
+ "grad_norm": 1.0262051820755005,
168
+ "learning_rate": 9.808007740920647e-06,
169
+ "loss": 0.7802,
170
+ "step": 23
171
+ },
172
+ {
173
+ "epoch": 0.5647058823529412,
174
+ "grad_norm": 0.9352391362190247,
175
+ "learning_rate": 9.768003833403278e-06,
176
+ "loss": 0.8134,
177
+ "step": 24
178
+ },
179
+ {
180
+ "epoch": 0.5882352941176471,
181
+ "grad_norm": 1.0994199514389038,
182
+ "learning_rate": 9.724314807191197e-06,
183
+ "loss": 0.8358,
184
+ "step": 25
185
+ },
186
+ {
187
+ "epoch": 0.611764705882353,
188
+ "grad_norm": 0.9108858704566956,
189
+ "learning_rate": 9.6769744288779e-06,
190
+ "loss": 0.8229,
191
+ "step": 26
192
+ },
193
+ {
194
+ "epoch": 0.6352941176470588,
195
+ "grad_norm": 0.7783969044685364,
196
+ "learning_rate": 9.626019287132202e-06,
197
+ "loss": 0.7928,
198
+ "step": 27
199
+ },
200
+ {
201
+ "epoch": 0.6588235294117647,
202
+ "grad_norm": 0.8140386343002319,
203
+ "learning_rate": 9.571488764419381e-06,
204
+ "loss": 0.8129,
205
+ "step": 28
206
+ },
207
+ {
208
+ "epoch": 0.6823529411764706,
209
+ "grad_norm": 0.7752570509910583,
210
+ "learning_rate": 9.51342500656308e-06,
211
+ "loss": 0.8572,
212
+ "step": 29
213
+ },
214
+ {
215
+ "epoch": 0.7058823529411765,
216
+ "grad_norm": 0.678372323513031,
217
+ "learning_rate": 9.451872890171419e-06,
218
+ "loss": 0.8103,
219
+ "step": 30
220
+ },
221
+ {
222
+ "epoch": 0.7294117647058823,
223
+ "grad_norm": 0.6800512075424194,
224
+ "learning_rate": 9.386879987952549e-06,
225
+ "loss": 0.897,
226
+ "step": 31
227
+ },
228
+ {
229
+ "epoch": 0.7529411764705882,
230
+ "grad_norm": 0.6305904388427734,
231
+ "learning_rate": 9.318496531946411e-06,
232
+ "loss": 0.818,
233
+ "step": 32
234
+ },
235
+ {
236
+ "epoch": 0.7764705882352941,
237
+ "grad_norm": 0.7293695211410522,
238
+ "learning_rate": 9.246775374701139e-06,
239
+ "loss": 0.8332,
240
+ "step": 33
241
+ },
242
+ {
243
+ "epoch": 0.8,
244
+ "grad_norm": 0.7706238031387329,
245
+ "learning_rate": 9.171771948424138e-06,
246
+ "loss": 0.8584,
247
+ "step": 34
248
+ },
249
+ {
250
+ "epoch": 0.8235294117647058,
251
+ "grad_norm": 0.6435885429382324,
252
+ "learning_rate": 9.093544222139338e-06,
253
+ "loss": 0.8726,
254
+ "step": 35
255
+ },
256
+ {
257
+ "epoch": 0.8470588235294118,
258
+ "grad_norm": 0.7461095452308655,
259
+ "learning_rate": 9.012152656883824e-06,
260
+ "loss": 0.7851,
261
+ "step": 36
262
+ },
263
+ {
264
+ "epoch": 0.8705882352941177,
265
+ "grad_norm": 0.8690148591995239,
266
+ "learning_rate": 8.927660158978392e-06,
267
+ "loss": 0.8347,
268
+ "step": 37
269
+ },
270
+ {
271
+ "epoch": 0.8941176470588236,
272
+ "grad_norm": 0.6226567625999451,
273
+ "learning_rate": 8.84013203140821e-06,
274
+ "loss": 0.8419,
275
+ "step": 38
276
+ },
277
+ {
278
+ "epoch": 0.9176470588235294,
279
+ "grad_norm": 0.603524923324585,
280
+ "learning_rate": 8.749635923351108e-06,
281
+ "loss": 0.776,
282
+ "step": 39
283
+ },
284
+ {
285
+ "epoch": 0.9411764705882353,
286
+ "grad_norm": 0.6303524374961853,
287
+ "learning_rate": 8.656241777892544e-06,
288
+ "loss": 0.7207,
289
+ "step": 40
290
+ },
291
+ {
292
+ "epoch": 0.9647058823529412,
293
+ "grad_norm": 0.5369915962219238,
294
+ "learning_rate": 8.56002177796765e-06,
295
+ "loss": 0.7694,
296
+ "step": 41
297
+ },
298
+ {
299
+ "epoch": 0.9882352941176471,
300
+ "grad_norm": 0.5721177458763123,
301
+ "learning_rate": 8.461050290572114e-06,
302
+ "loss": 0.7732,
303
+ "step": 42
304
+ },
305
+ {
306
+ "epoch": 1.0156862745098039,
307
+ "grad_norm": 1.0953840017318726,
308
+ "learning_rate": 8.359403809285054e-06,
309
+ "loss": 1.3119,
310
+ "step": 43
311
+ },
312
+ {
313
+ "epoch": 1.0392156862745099,
314
+ "grad_norm": 0.5467516183853149,
315
+ "learning_rate": 8.255160895148263e-06,
316
+ "loss": 0.7665,
317
+ "step": 44
318
+ },
319
+ {
320
+ "epoch": 1.0627450980392157,
321
+ "grad_norm": 0.6043545007705688,
322
+ "learning_rate": 8.14840211594757e-06,
323
+ "loss": 0.7368,
324
+ "step": 45
325
+ },
326
+ {
327
+ "epoch": 1.0862745098039215,
328
+ "grad_norm": 0.6263077259063721,
329
+ "learning_rate": 8.039209983943201e-06,
330
+ "loss": 0.7976,
331
+ "step": 46
332
+ },
333
+ {
334
+ "epoch": 1.1098039215686275,
335
+ "grad_norm": 0.5699981451034546,
336
+ "learning_rate": 7.927668892097288e-06,
337
+ "loss": 0.7109,
338
+ "step": 47
339
+ },
340
+ {
341
+ "epoch": 1.1333333333333333,
342
+ "grad_norm": 0.5554935336112976,
343
+ "learning_rate": 7.81386504884782e-06,
344
+ "loss": 0.738,
345
+ "step": 48
346
+ },
347
+ {
348
+ "epoch": 1.156862745098039,
349
+ "grad_norm": 0.7168362736701965,
350
+ "learning_rate": 7.697886411479422e-06,
351
+ "loss": 0.8269,
352
+ "step": 49
353
+ },
354
+ {
355
+ "epoch": 1.1803921568627451,
356
+ "grad_norm": 0.5106287002563477,
357
+ "learning_rate": 7.579822618142505e-06,
358
+ "loss": 0.7993,
359
+ "step": 50
360
+ },
361
+ {
362
+ "epoch": 1.203921568627451,
363
+ "grad_norm": 0.7224318385124207,
364
+ "learning_rate": 7.459764918573264e-06,
365
+ "loss": 0.8325,
366
+ "step": 51
367
+ },
368
+ {
369
+ "epoch": 1.227450980392157,
370
+ "grad_norm": 0.6035781502723694,
371
+ "learning_rate": 7.3378061035681415e-06,
372
+ "loss": 0.73,
373
+ "step": 52
374
+ },
375
+ {
376
+ "epoch": 1.2509803921568627,
377
+ "grad_norm": 0.5344979166984558,
378
+ "learning_rate": 7.2140404332671986e-06,
379
+ "loss": 0.7398,
380
+ "step": 53
381
+ },
382
+ {
383
+ "epoch": 1.2745098039215685,
384
+ "grad_norm": 0.6349881291389465,
385
+ "learning_rate": 7.088563564301874e-06,
386
+ "loss": 0.8289,
387
+ "step": 54
388
+ },
389
+ {
390
+ "epoch": 1.2980392156862746,
391
+ "grad_norm": 0.5782693028450012,
392
+ "learning_rate": 6.961472475863406e-06,
393
+ "loss": 0.7454,
394
+ "step": 55
395
+ },
396
+ {
397
+ "epoch": 1.3215686274509804,
398
+ "grad_norm": 0.4461568295955658,
399
+ "learning_rate": 6.832865394749065e-06,
400
+ "loss": 0.6694,
401
+ "step": 56
402
+ },
403
+ {
404
+ "epoch": 1.3450980392156864,
405
+ "grad_norm": 0.5781851410865784,
406
+ "learning_rate": 6.702841719444141e-06,
407
+ "loss": 0.8093,
408
+ "step": 57
409
+ },
410
+ {
411
+ "epoch": 1.3686274509803922,
412
+ "grad_norm": 0.48329582810401917,
413
+ "learning_rate": 6.571501943298335e-06,
414
+ "loss": 0.7096,
415
+ "step": 58
416
+ },
417
+ {
418
+ "epoch": 1.392156862745098,
419
+ "grad_norm": 0.6234760284423828,
420
+ "learning_rate": 6.4389475768559675e-06,
421
+ "loss": 0.8138,
422
+ "step": 59
423
+ },
424
+ {
425
+ "epoch": 1.415686274509804,
426
+ "grad_norm": 0.49070653319358826,
427
+ "learning_rate": 6.305281069399989e-06,
428
+ "loss": 0.6619,
429
+ "step": 60
430
+ },
431
+ {
432
+ "epoch": 1.4392156862745098,
433
+ "grad_norm": 0.49748146533966064,
434
+ "learning_rate": 6.17060572977047e-06,
435
+ "loss": 0.693,
436
+ "step": 61
437
+ },
438
+ {
439
+ "epoch": 1.4627450980392158,
440
+ "grad_norm": 0.5554483532905579,
441
+ "learning_rate": 6.035025646518747e-06,
442
+ "loss": 0.7561,
443
+ "step": 62
444
+ },
445
+ {
446
+ "epoch": 1.4862745098039216,
447
+ "grad_norm": 0.6127786040306091,
448
+ "learning_rate": 5.898645607458941e-06,
449
+ "loss": 0.7799,
450
+ "step": 63
451
+ },
452
+ {
453
+ "epoch": 1.5098039215686274,
454
+ "grad_norm": 0.5526847839355469,
455
+ "learning_rate": 5.761571018679025e-06,
456
+ "loss": 0.7374,
457
+ "step": 64
458
+ },
459
+ {
460
+ "epoch": 1.5333333333333332,
461
+ "grad_norm": 0.5685780644416809,
462
+ "learning_rate": 5.623907823074044e-06,
463
+ "loss": 0.8134,
464
+ "step": 65
465
+ },
466
+ {
467
+ "epoch": 1.5568627450980392,
468
+ "grad_norm": 0.4792926013469696,
469
+ "learning_rate": 5.48576241846443e-06,
470
+ "loss": 0.7933,
471
+ "step": 66
472
+ },
473
+ {
474
+ "epoch": 1.5803921568627453,
475
+ "grad_norm": 0.4758462607860565,
476
+ "learning_rate": 5.347241575362729e-06,
477
+ "loss": 0.7209,
478
+ "step": 67
479
+ },
480
+ {
481
+ "epoch": 1.603921568627451,
482
+ "grad_norm": 0.5107057690620422,
483
+ "learning_rate": 5.208452354452275e-06,
484
+ "loss": 0.7746,
485
+ "step": 68
486
+ },
487
+ {
488
+ "epoch": 1.6274509803921569,
489
+ "grad_norm": 0.4799031913280487,
490
+ "learning_rate": 5.069502023841576e-06,
491
+ "loss": 0.7635,
492
+ "step": 69
493
+ },
494
+ {
495
+ "epoch": 1.6509803921568627,
496
+ "grad_norm": 0.5203085541725159,
497
+ "learning_rate": 4.9304979761584256e-06,
498
+ "loss": 0.7708,
499
+ "step": 70
500
+ },
501
+ {
502
+ "epoch": 1.6745098039215687,
503
+ "grad_norm": 0.44460946321487427,
504
+ "learning_rate": 4.791547645547727e-06,
505
+ "loss": 0.6827,
506
+ "step": 71
507
+ },
508
+ {
509
+ "epoch": 1.6980392156862745,
510
+ "grad_norm": 0.5535275340080261,
511
+ "learning_rate": 4.652758424637271e-06,
512
+ "loss": 0.794,
513
+ "step": 72
514
+ },
515
+ {
516
+ "epoch": 1.7215686274509805,
517
+ "grad_norm": 0.4878956377506256,
518
+ "learning_rate": 4.514237581535571e-06,
519
+ "loss": 0.7368,
520
+ "step": 73
521
+ },
522
+ {
523
+ "epoch": 1.7450980392156863,
524
+ "grad_norm": 0.5016121864318848,
525
+ "learning_rate": 4.3760921769259585e-06,
526
+ "loss": 0.6936,
527
+ "step": 74
528
+ },
529
+ {
530
+ "epoch": 1.768627450980392,
531
+ "grad_norm": 0.5011301040649414,
532
+ "learning_rate": 4.2384289813209754e-06,
533
+ "loss": 0.7475,
534
+ "step": 75
535
+ },
536
+ {
537
+ "epoch": 1.792156862745098,
538
+ "grad_norm": 0.4553963243961334,
539
+ "learning_rate": 4.101354392541061e-06,
540
+ "loss": 0.7358,
541
+ "step": 76
542
+ },
543
+ {
544
+ "epoch": 1.815686274509804,
545
+ "grad_norm": 0.4620165228843689,
546
+ "learning_rate": 3.964974353481254e-06,
547
+ "loss": 0.7331,
548
+ "step": 77
549
+ },
550
+ {
551
+ "epoch": 1.83921568627451,
552
+ "grad_norm": 0.4453507363796234,
553
+ "learning_rate": 3.829394270229531e-06,
554
+ "loss": 0.7295,
555
+ "step": 78
556
+ },
557
+ {
558
+ "epoch": 1.8627450980392157,
559
+ "grad_norm": 0.402537077665329,
560
+ "learning_rate": 3.694718930600012e-06,
561
+ "loss": 0.642,
562
+ "step": 79
563
+ },
564
+ {
565
+ "epoch": 1.8862745098039215,
566
+ "grad_norm": 0.4362320601940155,
567
+ "learning_rate": 3.5610524231440324e-06,
568
+ "loss": 0.7889,
569
+ "step": 80
570
+ },
571
+ {
572
+ "epoch": 1.9098039215686273,
573
+ "grad_norm": 0.43875452876091003,
574
+ "learning_rate": 3.428498056701665e-06,
575
+ "loss": 0.7499,
576
+ "step": 81
577
+ },
578
+ {
579
+ "epoch": 1.9333333333333333,
580
+ "grad_norm": 0.43475160002708435,
581
+ "learning_rate": 3.2971582805558622e-06,
582
+ "loss": 0.7663,
583
+ "step": 82
584
+ },
585
+ {
586
+ "epoch": 1.9568627450980394,
587
+ "grad_norm": 0.46115896105766296,
588
+ "learning_rate": 3.167134605250938e-06,
589
+ "loss": 0.7652,
590
+ "step": 83
591
+ },
592
+ {
593
+ "epoch": 1.9803921568627452,
594
+ "grad_norm": 0.4670518934726715,
595
+ "learning_rate": 3.0385275241365965e-06,
596
+ "loss": 0.7709,
597
+ "step": 84
598
+ },
599
+ {
600
+ "epoch": 2.007843137254902,
601
+ "grad_norm": 0.8610158562660217,
602
+ "learning_rate": 2.9114364356981274e-06,
603
+ "loss": 1.2373,
604
+ "step": 85
605
+ },
606
+ {
607
+ "epoch": 2.0313725490196077,
608
+ "grad_norm": 0.3888493478298187,
609
+ "learning_rate": 2.7859595667328027e-06,
610
+ "loss": 0.7255,
611
+ "step": 86
612
+ },
613
+ {
614
+ "epoch": 2.0549019607843135,
615
+ "grad_norm": 0.42477184534072876,
616
+ "learning_rate": 2.6621938964318593e-06,
617
+ "loss": 0.6407,
618
+ "step": 87
619
+ },
620
+ {
621
+ "epoch": 2.0784313725490198,
622
+ "grad_norm": 0.4506017863750458,
623
+ "learning_rate": 2.5402350814267364e-06,
624
+ "loss": 0.6873,
625
+ "step": 88
626
+ },
627
+ {
628
+ "epoch": 2.1019607843137256,
629
+ "grad_norm": 0.48074784874916077,
630
+ "learning_rate": 2.4201773818574956e-06,
631
+ "loss": 0.6542,
632
+ "step": 89
633
+ },
634
+ {
635
+ "epoch": 2.1254901960784314,
636
+ "grad_norm": 0.45760810375213623,
637
+ "learning_rate": 2.302113588520578e-06,
638
+ "loss": 0.6809,
639
+ "step": 90
640
+ },
641
+ {
642
+ "epoch": 2.149019607843137,
643
+ "grad_norm": 0.3958469033241272,
644
+ "learning_rate": 2.1861349511521817e-06,
645
+ "loss": 0.6087,
646
+ "step": 91
647
+ },
648
+ {
649
+ "epoch": 2.172549019607843,
650
+ "grad_norm": 0.479245662689209,
651
+ "learning_rate": 2.072331107902713e-06,
652
+ "loss": 0.9135,
653
+ "step": 92
654
+ },
655
+ {
656
+ "epoch": 2.196078431372549,
657
+ "grad_norm": 0.4283719062805176,
658
+ "learning_rate": 1.960790016056801e-06,
659
+ "loss": 0.6736,
660
+ "step": 93
661
+ },
662
+ {
663
+ "epoch": 2.219607843137255,
664
+ "grad_norm": 0.4299345314502716,
665
+ "learning_rate": 1.8515978840524302e-06,
666
+ "loss": 0.6972,
667
+ "step": 94
668
+ },
669
+ {
670
+ "epoch": 2.243137254901961,
671
+ "grad_norm": 0.45547375082969666,
672
+ "learning_rate": 1.7448391048517378e-06,
673
+ "loss": 0.7224,
674
+ "step": 95
675
+ },
676
+ {
677
+ "epoch": 2.2666666666666666,
678
+ "grad_norm": 0.40750595927238464,
679
+ "learning_rate": 1.640596190714947e-06,
680
+ "loss": 0.7225,
681
+ "step": 96
682
+ },
683
+ {
684
+ "epoch": 2.2901960784313724,
685
+ "grad_norm": 0.43789613246917725,
686
+ "learning_rate": 1.5389497094278861e-06,
687
+ "loss": 0.7208,
688
+ "step": 97
689
+ },
690
+ {
691
+ "epoch": 2.313725490196078,
692
+ "grad_norm": 0.4415332078933716,
693
+ "learning_rate": 1.4399782220323515e-06,
694
+ "loss": 0.6706,
695
+ "step": 98
696
+ },
697
+ {
698
+ "epoch": 2.3372549019607844,
699
+ "grad_norm": 0.49981608986854553,
700
+ "learning_rate": 1.3437582221074574e-06,
701
+ "loss": 0.7925,
702
+ "step": 99
703
+ },
704
+ {
705
+ "epoch": 2.3607843137254902,
706
+ "grad_norm": 0.48716047406196594,
707
+ "learning_rate": 1.250364076648894e-06,
708
+ "loss": 0.7385,
709
+ "step": 100
710
+ },
711
+ {
712
+ "epoch": 2.384313725490196,
713
+ "grad_norm": 0.3869420886039734,
714
+ "learning_rate": 1.1598679685917901e-06,
715
+ "loss": 0.6665,
716
+ "step": 101
717
+ },
718
+ {
719
+ "epoch": 2.407843137254902,
720
+ "grad_norm": 0.4081011116504669,
721
+ "learning_rate": 1.0723398410216085e-06,
722
+ "loss": 0.8291,
723
+ "step": 102
724
+ },
725
+ {
726
+ "epoch": 2.431372549019608,
727
+ "grad_norm": 0.4056829512119293,
728
+ "learning_rate": 9.878473431161767e-07,
729
+ "loss": 0.6668,
730
+ "step": 103
731
+ },
732
+ {
733
+ "epoch": 2.454901960784314,
734
+ "grad_norm": 0.36377865076065063,
735
+ "learning_rate": 9.064557778606631e-07,
736
+ "loss": 0.6017,
737
+ "step": 104
738
+ },
739
+ {
740
+ "epoch": 2.4784313725490197,
741
+ "grad_norm": 0.43392133712768555,
742
+ "learning_rate": 8.282280515758639e-07,
743
+ "loss": 0.7824,
744
+ "step": 105
745
+ },
746
+ {
747
+ "epoch": 2.5019607843137255,
748
+ "grad_norm": 0.47024935483932495,
749
+ "learning_rate": 7.532246252988617e-07,
750
+ "loss": 0.7446,
751
+ "step": 106
752
+ },
753
+ {
754
+ "epoch": 2.5254901960784313,
755
+ "grad_norm": 0.425630658864975,
756
+ "learning_rate": 6.815034680535915e-07,
757
+ "loss": 0.713,
758
+ "step": 107
759
+ },
760
+ {
761
+ "epoch": 2.549019607843137,
762
+ "grad_norm": 0.40889060497283936,
763
+ "learning_rate": 6.131200120474512e-07,
764
+ "loss": 0.7409,
765
+ "step": 108
766
+ },
767
+ {
768
+ "epoch": 2.572549019607843,
769
+ "grad_norm": 0.40218353271484375,
770
+ "learning_rate": 5.481271098285818e-07,
771
+ "loss": 0.7501,
772
+ "step": 109
773
+ },
774
+ {
775
+ "epoch": 2.596078431372549,
776
+ "grad_norm": 0.35820406675338745,
777
+ "learning_rate": 4.865749934369224e-07,
778
+ "loss": 0.6082,
779
+ "step": 110
780
+ },
781
+ {
782
+ "epoch": 2.619607843137255,
783
+ "grad_norm": 0.42070600390434265,
784
+ "learning_rate": 4.2851123558061927e-07,
785
+ "loss": 0.7516,
786
+ "step": 111
787
+ },
788
+ {
789
+ "epoch": 2.6431372549019607,
790
+ "grad_norm": 0.36287838220596313,
791
+ "learning_rate": 3.739807128677986e-07,
792
+ "loss": 0.6589,
793
+ "step": 112
794
+ },
795
+ {
796
+ "epoch": 2.6666666666666665,
797
+ "grad_norm": 0.3751106560230255,
798
+ "learning_rate": 3.230255711220992e-07,
799
+ "loss": 0.7008,
800
+ "step": 113
801
+ },
802
+ {
803
+ "epoch": 2.6901960784313728,
804
+ "grad_norm": 0.39373522996902466,
805
+ "learning_rate": 2.756851928088056e-07,
806
+ "loss": 0.7579,
807
+ "step": 114
808
+ },
809
+ {
810
+ "epoch": 2.7137254901960786,
811
+ "grad_norm": 0.3703792989253998,
812
+ "learning_rate": 2.3199616659672352e-07,
813
+ "loss": 0.8005,
814
+ "step": 115
815
+ },
816
+ {
817
+ "epoch": 2.7372549019607844,
818
+ "grad_norm": 0.3416251242160797,
819
+ "learning_rate": 1.9199225907935492e-07,
820
+ "loss": 0.6913,
821
+ "step": 116
822
+ },
823
+ {
824
+ "epoch": 2.76078431372549,
825
+ "grad_norm": 0.3666976988315582,
826
+ "learning_rate": 1.5570438867719695e-07,
827
+ "loss": 0.6749,
828
+ "step": 117
829
+ },
830
+ {
831
+ "epoch": 2.784313725490196,
832
+ "grad_norm": 0.41929343342781067,
833
+ "learning_rate": 1.2316060174136e-07,
834
+ "loss": 0.93,
835
+ "step": 118
836
+ },
837
+ {
838
+ "epoch": 2.8078431372549018,
839
+ "grad_norm": 0.3298085033893585,
840
+ "learning_rate": 9.43860508769645e-08,
841
+ "loss": 0.5853,
842
+ "step": 119
843
+ },
844
+ {
845
+ "epoch": 2.831372549019608,
846
+ "grad_norm": 0.4238205552101135,
847
+ "learning_rate": 6.940297550306895e-08,
848
+ "loss": 0.7548,
849
+ "step": 120
850
+ },
851
+ {
852
+ "epoch": 2.854901960784314,
853
+ "grad_norm": 0.37445592880249023,
854
+ "learning_rate": 4.823068466415615e-08,
855
+ "loss": 0.7453,
856
+ "step": 121
857
+ },
858
+ {
859
+ "epoch": 2.8784313725490196,
860
+ "grad_norm": 0.42801082134246826,
861
+ "learning_rate": 3.088554210646133e-08,
862
+ "loss": 0.8001,
863
+ "step": 122
864
+ },
865
+ {
866
+ "epoch": 2.9019607843137254,
867
+ "grad_norm": 0.3497636616230011,
868
+ "learning_rate": 1.7380953630678488e-08,
869
+ "loss": 0.7289,
870
+ "step": 123
871
+ },
872
+ {
873
+ "epoch": 2.9254901960784316,
874
+ "grad_norm": 0.4116727411746979,
875
+ "learning_rate": 7.727356730820035e-09,
876
+ "loss": 0.6974,
877
+ "step": 124
878
+ },
879
+ {
880
+ "epoch": 2.9490196078431374,
881
+ "grad_norm": 0.3742615282535553,
882
+ "learning_rate": 1.9322125272297488e-09,
883
+ "loss": 0.765,
884
+ "step": 125
885
+ },
886
+ {
887
+ "epoch": 2.9725490196078432,
888
+ "grad_norm": 0.36835694313049316,
889
+ "learning_rate": 0.0,
890
+ "loss": 0.6713,
891
+ "step": 126
892
+ },
893
+ {
894
+ "epoch": 2.9725490196078432,
895
+ "step": 126,
896
+ "total_flos": 121055548211200.0,
897
+ "train_loss": 0.7961960165273576,
898
+ "train_runtime": 4088.556,
899
+ "train_samples_per_second": 2.986,
900
+ "train_steps_per_second": 0.031
901
+ }
902
+ ],
903
+ "logging_steps": 1.0,
904
+ "max_steps": 126,
905
+ "num_input_tokens_seen": 0,
906
+ "num_train_epochs": 3,
907
+ "save_steps": 500,
908
+ "stateful_callbacks": {
909
+ "TrainerControl": {
910
+ "args": {
911
+ "should_epoch_stop": false,
912
+ "should_evaluate": false,
913
+ "should_log": false,
914
+ "should_save": true,
915
+ "should_training_stop": true
916
+ },
917
+ "attributes": {}
918
+ }
919
+ },
920
+ "total_flos": 121055548211200.0,
921
+ "train_batch_size": 1,
922
+ "trial_name": null,
923
+ "trial_params": null
924
+ }
training_loss.png ADDED