aniloid2 committed on
Commit
8959f36
·
verified ·
1 Parent(s): 42389b9

Model save

Browse files
Files changed (4) hide show
  1. README.md +2 -4
  2. all_results.json +5 -5
  3. train_results.json +5 -5
  4. trainer_state.json +112 -112
README.md CHANGED
@@ -1,11 +1,9 @@
1
  ---
2
  base_model: Qwen/Qwen2.5-1.5B-Instruct
3
- datasets: open-r1/OpenR1-Math-220k
4
  library_name: transformers
5
  model_name: Qwen2.5-1.5B-Open-R1-Distill
6
  tags:
7
  - generated_from_trainer
8
- - open-r1
9
  - trl
10
  - sft
11
  licence: license
@@ -13,7 +11,7 @@ licence: license
13
 
14
  # Model Card for Qwen2.5-1.5B-Open-R1-Distill
15
 
16
- This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) on the [open-r1/OpenR1-Math-220k](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k) dataset.
17
  It has been trained using [TRL](https://github.com/huggingface/trl).
18
 
19
  ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
29
 
30
  ## Training procedure
31
 
32
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/aniloid2/huggingface/runs/3196v3g2)
33
 
34
 
35
  This model was trained with SFT.
 
1
  ---
2
  base_model: Qwen/Qwen2.5-1.5B-Instruct
 
3
  library_name: transformers
4
  model_name: Qwen2.5-1.5B-Open-R1-Distill
5
  tags:
6
  - generated_from_trainer
 
7
  - trl
8
  - sft
9
  licence: license
 
11
 
12
  # Model Card for Qwen2.5-1.5B-Open-R1-Distill
13
 
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/aniloid2/huggingface/runs/qyr4byeh)
31
 
32
 
33
  This model was trained with SFT.
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 488165445992448.0,
3
- "train_loss": 0.0,
4
- "train_runtime": 1.4542,
5
  "train_samples": 93733,
6
- "train_samples_per_second": 23587.156,
7
- "train_steps_per_second": 184.291
8
  }
 
1
  {
2
+ "total_flos": 126040110268416.0,
3
+ "train_loss": 2.3174947375681865,
4
+ "train_runtime": 34132.5347,
5
  "train_samples": 93733,
6
+ "train_samples_per_second": 1.005,
7
+ "train_steps_per_second": 0.008
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 488165445992448.0,
3
- "train_loss": 0.0,
4
- "train_runtime": 1.4542,
5
  "train_samples": 93733,
6
- "train_samples_per_second": 23587.156,
7
- "train_steps_per_second": 184.291
8
  }
 
1
  {
2
+ "total_flos": 126040110268416.0,
3
+ "train_loss": 2.3174947375681865,
4
+ "train_runtime": 34132.5347,
5
  "train_samples": 93733,
6
+ "train_samples_per_second": 1.005,
7
+ "train_steps_per_second": 0.008
8
  }
trainer_state.json CHANGED
@@ -10,383 +10,383 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.018656716417910446,
13
- "grad_norm": 1.8340438042183895,
14
  "learning_rate": 1.785714285714286e-05,
15
- "loss": 0.8456,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.03731343283582089,
20
- "grad_norm": 0.7951826236027404,
21
  "learning_rate": 3.571428571428572e-05,
22
- "loss": 0.7681,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.055970149253731345,
27
- "grad_norm": 0.5154315861714606,
28
  "learning_rate": 4.999827900623038e-05,
29
- "loss": 0.7025,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.07462686567164178,
34
- "grad_norm": 0.4096644979236753,
35
  "learning_rate": 4.993807186343243e-05,
36
- "loss": 0.6745,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.09328358208955224,
41
- "grad_norm": 0.41615570016262526,
42
  "learning_rate": 4.979207812402531e-05,
43
- "loss": 0.6435,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.11194029850746269,
48
- "grad_norm": 0.3382911483266423,
49
  "learning_rate": 4.956085596012407e-05,
50
- "loss": 0.6363,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.13059701492537312,
55
- "grad_norm": 0.3343739047829077,
56
  "learning_rate": 4.924528939432311e-05,
57
- "loss": 0.62,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.14925373134328357,
62
- "grad_norm": 0.2592771324041628,
63
  "learning_rate": 4.884658491984735e-05,
64
- "loss": 0.6106,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.16791044776119404,
69
- "grad_norm": 0.23367554316276612,
70
  "learning_rate": 4.8366266887814235e-05,
71
- "loss": 0.6113,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 0.1865671641791045,
76
- "grad_norm": 0.27188191160952146,
77
  "learning_rate": 4.780617167924209e-05,
78
- "loss": 0.5939,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.20522388059701493,
83
- "grad_norm": 0.27682296737618883,
84
  "learning_rate": 4.716844068408693e-05,
85
- "loss": 0.5965,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 0.22388059701492538,
90
- "grad_norm": 0.27317012100328225,
91
  "learning_rate": 4.6455512114150546e-05,
92
- "loss": 0.5919,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.24253731343283583,
97
- "grad_norm": 0.24464311795706967,
98
  "learning_rate": 4.5670111681161296e-05,
99
- "loss": 0.5825,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 0.26119402985074625,
104
- "grad_norm": 0.34766047560756597,
105
  "learning_rate": 4.481524217566783e-05,
106
- "loss": 0.5789,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.2798507462686567,
111
- "grad_norm": 0.5405423254584241,
112
  "learning_rate": 4.3894171986588217e-05,
113
- "loss": 0.5785,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 0.29850746268656714,
118
- "grad_norm": 0.38583732913974567,
119
  "learning_rate": 4.29104226053073e-05,
120
- "loss": 0.5777,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.31716417910447764,
125
- "grad_norm": 0.3711588959211694,
126
  "learning_rate": 4.186775516209732e-05,
127
- "loss": 0.573,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 0.3358208955223881,
132
- "grad_norm": 0.36207623515411447,
133
  "learning_rate": 4.077015604633669e-05,
134
- "loss": 0.5754,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.35447761194029853,
139
- "grad_norm": 0.31192395276343104,
140
  "learning_rate": 3.962182166550441e-05,
141
- "loss": 0.5739,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 0.373134328358209,
146
- "grad_norm": 0.31751083011527065,
147
  "learning_rate": 3.8427142401220634e-05,
148
- "loss": 0.5698,
149
  "step": 100
150
  },
151
  {
152
  "epoch": 0.3917910447761194,
153
- "grad_norm": 0.2556620179749335,
154
  "learning_rate": 3.71906858236735e-05,
155
- "loss": 0.5661,
156
  "step": 105
157
  },
158
  {
159
  "epoch": 0.41044776119402987,
160
- "grad_norm": 0.28171907193629253,
161
  "learning_rate": 3.591717922860785e-05,
162
- "loss": 0.5733,
163
  "step": 110
164
  },
165
  {
166
  "epoch": 0.4291044776119403,
167
- "grad_norm": 0.25565772162050643,
168
  "learning_rate": 3.46114915636416e-05,
169
- "loss": 0.5641,
170
  "step": 115
171
  },
172
  {
173
  "epoch": 0.44776119402985076,
174
- "grad_norm": 0.2692058231168393,
175
  "learning_rate": 3.3278614813010034e-05,
176
- "loss": 0.5651,
177
  "step": 120
178
  },
179
  {
180
  "epoch": 0.4664179104477612,
181
- "grad_norm": 0.2595641295924728,
182
  "learning_rate": 3.1923644911909e-05,
183
- "loss": 0.562,
184
  "step": 125
185
  },
186
  {
187
  "epoch": 0.48507462686567165,
188
- "grad_norm": 0.23192953907425143,
189
  "learning_rate": 3.0551762263406576e-05,
190
- "loss": 0.5607,
191
  "step": 130
192
  },
193
  {
194
  "epoch": 0.503731343283582,
195
- "grad_norm": 0.2525775632798149,
196
  "learning_rate": 2.9168211932412042e-05,
197
- "loss": 0.5579,
198
  "step": 135
199
  },
200
  {
201
  "epoch": 0.5223880597014925,
202
- "grad_norm": 0.2169419638537429,
203
  "learning_rate": 2.777828359242567e-05,
204
- "loss": 0.5632,
205
  "step": 140
206
  },
207
  {
208
  "epoch": 0.5410447761194029,
209
- "grad_norm": 0.23790444425342444,
210
  "learning_rate": 2.6387291301738377e-05,
211
- "loss": 0.556,
212
  "step": 145
213
  },
214
  {
215
  "epoch": 0.5597014925373134,
216
- "grad_norm": 0.23123075574840998,
217
  "learning_rate": 2.50005531864019e-05,
218
- "loss": 0.5538,
219
  "step": 150
220
  },
221
  {
222
  "epoch": 0.5783582089552238,
223
- "grad_norm": 0.24287089032197215,
224
  "learning_rate": 2.362337110764688e-05,
225
- "loss": 0.5541,
226
  "step": 155
227
  },
228
  {
229
  "epoch": 0.5970149253731343,
230
- "grad_norm": 0.1863954428523962,
231
  "learning_rate": 2.226101039148557e-05,
232
- "loss": 0.5523,
233
  "step": 160
234
  },
235
  {
236
  "epoch": 0.6156716417910447,
237
- "grad_norm": 0.2558894409316875,
238
  "learning_rate": 2.0918679697998252e-05,
239
- "loss": 0.5512,
240
  "step": 165
241
  },
242
  {
243
  "epoch": 0.6343283582089553,
244
- "grad_norm": 0.2233289728652354,
245
  "learning_rate": 1.9601511107268255e-05,
246
- "loss": 0.5516,
247
  "step": 170
248
  },
249
  {
250
  "epoch": 0.6529850746268657,
251
- "grad_norm": 0.20246143955254822,
252
  "learning_rate": 1.8314540498102216e-05,
253
- "loss": 0.5512,
254
  "step": 175
255
  },
256
  {
257
  "epoch": 0.6716417910447762,
258
- "grad_norm": 0.18020359913875209,
259
  "learning_rate": 1.7062688294552992e-05,
260
- "loss": 0.5434,
261
  "step": 180
262
  },
263
  {
264
  "epoch": 0.6902985074626866,
265
- "grad_norm": 0.19749393449240044,
266
  "learning_rate": 1.5850740653856096e-05,
267
- "loss": 0.5467,
268
  "step": 185
269
  },
270
  {
271
  "epoch": 0.7089552238805971,
272
- "grad_norm": 0.18597460483892075,
273
  "learning_rate": 1.4683331167703218e-05,
274
- "loss": 0.5503,
275
  "step": 190
276
  },
277
  {
278
  "epoch": 0.7276119402985075,
279
- "grad_norm": 0.174856006547511,
280
  "learning_rate": 1.356492314681356e-05,
281
- "loss": 0.5532,
282
  "step": 195
283
  },
284
  {
285
  "epoch": 0.746268656716418,
286
- "grad_norm": 0.16385260185410305,
287
  "learning_rate": 1.2499792556533716e-05,
288
- "loss": 0.5475,
289
  "step": 200
290
  },
291
  {
292
  "epoch": 0.7649253731343284,
293
- "grad_norm": 0.1774467357426527,
294
  "learning_rate": 1.1492011668707753e-05,
295
- "loss": 0.5449,
296
  "step": 205
297
  },
298
  {
299
  "epoch": 0.7835820895522388,
300
- "grad_norm": 0.18006988137177274,
301
  "learning_rate": 1.0545433492320603e-05,
302
- "loss": 0.5501,
303
  "step": 210
304
  },
305
  {
306
  "epoch": 0.8022388059701493,
307
- "grad_norm": 0.16663148804007366,
308
  "learning_rate": 9.663677042440537e-06,
309
- "loss": 0.5444,
310
  "step": 215
311
  },
312
  {
313
  "epoch": 0.8208955223880597,
314
- "grad_norm": 0.16072306564901104,
315
  "learning_rate": 8.850113503781367e-06,
316
- "loss": 0.5443,
317
  "step": 220
318
  },
319
  {
320
  "epoch": 0.8395522388059702,
321
- "grad_norm": 0.1426075821782254,
322
  "learning_rate": 8.107853341784671e-06,
323
- "loss": 0.5507,
324
  "step": 225
325
  },
326
  {
327
  "epoch": 0.8582089552238806,
328
- "grad_norm": 0.1635359007036467,
329
  "learning_rate": 7.439734410499752e-06,
330
- "loss": 0.5471,
331
  "step": 230
332
  },
333
  {
334
  "epoch": 0.8768656716417911,
335
- "grad_norm": 0.18747407294942076,
336
  "learning_rate": 6.848311102728011e-06,
337
- "loss": 0.5473,
338
  "step": 235
339
  },
340
  {
341
  "epoch": 0.8955223880597015,
342
- "grad_norm": 0.17682712499368686,
343
  "learning_rate": 6.335844583913515e-06,
344
- "loss": 0.5434,
345
  "step": 240
346
  },
347
  {
348
  "epoch": 0.914179104477612,
349
- "grad_norm": 0.15507027757764974,
350
  "learning_rate": 5.904294147118193e-06,
351
- "loss": 0.5471,
352
  "step": 245
353
  },
354
  {
355
  "epoch": 0.9328358208955224,
356
- "grad_norm": 0.15176498907816743,
357
  "learning_rate": 5.555309722133842e-06,
358
- "loss": 0.5436,
359
  "step": 250
360
  },
361
  {
362
  "epoch": 0.9514925373134329,
363
- "grad_norm": 0.1468741880558366,
364
  "learning_rate": 5.290225567370509e-06,
365
- "loss": 0.5397,
366
  "step": 255
367
  },
368
  {
369
  "epoch": 0.9701492537313433,
370
- "grad_norm": 0.15706775164322176,
371
  "learning_rate": 5.110055168638854e-06,
372
- "loss": 0.5434,
373
  "step": 260
374
  },
375
  {
376
  "epoch": 0.9888059701492538,
377
- "grad_norm": 0.15191116377634428,
378
  "learning_rate": 5.0154873643297575e-06,
379
- "loss": 0.5471,
380
  "step": 265
381
  },
382
  {
383
  "epoch": 1.0,
384
  "step": 268,
385
- "total_flos": 488165445992448.0,
386
- "train_loss": 0.0,
387
- "train_runtime": 1.4542,
388
- "train_samples_per_second": 23587.156,
389
- "train_steps_per_second": 184.291
390
  }
391
  ],
392
  "logging_steps": 5,
@@ -406,7 +406,7 @@
406
  "attributes": {}
407
  }
408
  },
409
- "total_flos": 488165445992448.0,
410
  "train_batch_size": 4,
411
  "trial_name": null,
412
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.018656716417910446,
13
+ "grad_norm": 7.992456557616748,
14
  "learning_rate": 1.785714285714286e-05,
15
+ "loss": 3.3955,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.03731343283582089,
20
+ "grad_norm": 3.4968199831167857,
21
  "learning_rate": 3.571428571428572e-05,
22
+ "loss": 3.0816,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.055970149253731345,
27
+ "grad_norm": 2.968125450447822,
28
  "learning_rate": 4.999827900623038e-05,
29
+ "loss": 2.804,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.07462686567164178,
34
+ "grad_norm": 2.588497606835322,
35
  "learning_rate": 4.993807186343243e-05,
36
+ "loss": 2.6908,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.09328358208955224,
41
+ "grad_norm": 2.549871368666643,
42
  "learning_rate": 4.979207812402531e-05,
43
+ "loss": 2.5652,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.11194029850746269,
48
+ "grad_norm": 1.9738305926810327,
49
  "learning_rate": 4.956085596012407e-05,
50
+ "loss": 2.5363,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.13059701492537312,
55
+ "grad_norm": 2.3251077114713885,
56
  "learning_rate": 4.924528939432311e-05,
57
+ "loss": 2.4766,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.14925373134328357,
62
+ "grad_norm": 2.764973986868281,
63
  "learning_rate": 4.884658491984735e-05,
64
+ "loss": 2.4402,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.16791044776119404,
69
+ "grad_norm": 1.9523889288733327,
70
  "learning_rate": 4.8366266887814235e-05,
71
+ "loss": 2.4491,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 0.1865671641791045,
76
+ "grad_norm": 1.9187589431371008,
77
  "learning_rate": 4.780617167924209e-05,
78
+ "loss": 2.3786,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.20522388059701493,
83
+ "grad_norm": 1.9304839973674266,
84
  "learning_rate": 4.716844068408693e-05,
85
+ "loss": 2.3897,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 0.22388059701492538,
90
+ "grad_norm": 2.8003484835380372,
91
  "learning_rate": 4.6455512114150546e-05,
92
+ "loss": 2.3704,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.24253731343283583,
97
+ "grad_norm": 1.5013907534892832,
98
  "learning_rate": 4.5670111681161296e-05,
99
+ "loss": 2.3343,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 0.26119402985074625,
104
+ "grad_norm": 2.9773909499359377,
105
  "learning_rate": 4.481524217566783e-05,
106
+ "loss": 2.3199,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.2798507462686567,
111
+ "grad_norm": 4.2665640456752785,
112
  "learning_rate": 4.3894171986588217e-05,
113
+ "loss": 2.3193,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 0.29850746268656714,
118
+ "grad_norm": 1.9307726906007678,
119
  "learning_rate": 4.29104226053073e-05,
120
+ "loss": 2.322,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.31716417910447764,
125
+ "grad_norm": 2.237168079938654,
126
  "learning_rate": 4.186775516209732e-05,
127
+ "loss": 2.3077,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 0.3358208955223881,
132
+ "grad_norm": 2.901386689863122,
133
  "learning_rate": 4.077015604633669e-05,
134
+ "loss": 2.3101,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.35447761194029853,
139
+ "grad_norm": 2.5210971167889245,
140
  "learning_rate": 3.962182166550441e-05,
141
+ "loss": 2.3042,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 0.373134328358209,
146
+ "grad_norm": 2.6070376003561933,
147
  "learning_rate": 3.8427142401220634e-05,
148
+ "loss": 2.2873,
149
  "step": 100
150
  },
151
  {
152
  "epoch": 0.3917910447761194,
153
+ "grad_norm": 2.2004757384840152,
154
  "learning_rate": 3.71906858236735e-05,
155
+ "loss": 2.2728,
156
  "step": 105
157
  },
158
  {
159
  "epoch": 0.41044776119402987,
160
+ "grad_norm": 2.1473625420095868,
161
  "learning_rate": 3.591717922860785e-05,
162
+ "loss": 2.2999,
163
  "step": 110
164
  },
165
  {
166
  "epoch": 0.4291044776119403,
167
+ "grad_norm": 1.4790945776347835,
168
  "learning_rate": 3.46114915636416e-05,
169
+ "loss": 2.2623,
170
  "step": 115
171
  },
172
  {
173
  "epoch": 0.44776119402985076,
174
+ "grad_norm": 1.4602444685381062,
175
  "learning_rate": 3.3278614813010034e-05,
176
+ "loss": 2.2668,
177
  "step": 120
178
  },
179
  {
180
  "epoch": 0.4664179104477612,
181
+ "grad_norm": 1.3383920889270082,
182
  "learning_rate": 3.1923644911909e-05,
183
+ "loss": 2.2541,
184
  "step": 125
185
  },
186
  {
187
  "epoch": 0.48507462686567165,
188
+ "grad_norm": 1.6295501707411197,
189
  "learning_rate": 3.0551762263406576e-05,
190
+ "loss": 2.2505,
191
  "step": 130
192
  },
193
  {
194
  "epoch": 0.503731343283582,
195
+ "grad_norm": 1.5700164012256488,
196
  "learning_rate": 2.9168211932412042e-05,
197
+ "loss": 2.2392,
198
  "step": 135
199
  },
200
  {
201
  "epoch": 0.5223880597014925,
202
+ "grad_norm": 1.3183550705568836,
203
  "learning_rate": 2.777828359242567e-05,
204
+ "loss": 2.26,
205
  "step": 140
206
  },
207
  {
208
  "epoch": 0.5410447761194029,
209
+ "grad_norm": 1.270837720420853,
210
  "learning_rate": 2.6387291301738377e-05,
211
+ "loss": 2.2291,
212
  "step": 145
213
  },
214
  {
215
  "epoch": 0.5597014925373134,
216
+ "grad_norm": 1.2360031288329183,
217
  "learning_rate": 2.50005531864019e-05,
218
+ "loss": 2.2191,
219
  "step": 150
220
  },
221
  {
222
  "epoch": 0.5783582089552238,
223
+ "grad_norm": 1.4058327877557197,
224
  "learning_rate": 2.362337110764688e-05,
225
+ "loss": 2.2201,
226
  "step": 155
227
  },
228
  {
229
  "epoch": 0.5970149253731343,
230
+ "grad_norm": 1.1447309950598807,
231
  "learning_rate": 2.226101039148557e-05,
232
+ "loss": 2.212,
233
  "step": 160
234
  },
235
  {
236
  "epoch": 0.6156716417910447,
237
+ "grad_norm": 1.2920928760986912,
238
  "learning_rate": 2.0918679697998252e-05,
239
+ "loss": 2.2068,
240
  "step": 165
241
  },
242
  {
243
  "epoch": 0.6343283582089553,
244
+ "grad_norm": 1.3303686169767341,
245
  "learning_rate": 1.9601511107268255e-05,
246
+ "loss": 2.208,
247
  "step": 170
248
  },
249
  {
250
  "epoch": 0.6529850746268657,
251
+ "grad_norm": 1.0982459829577405,
252
  "learning_rate": 1.8314540498102216e-05,
253
+ "loss": 2.205,
254
  "step": 175
255
  },
256
  {
257
  "epoch": 0.6716417910447762,
258
+ "grad_norm": 1.2297852411953294,
259
  "learning_rate": 1.7062688294552992e-05,
260
+ "loss": 2.1727,
261
  "step": 180
262
  },
263
  {
264
  "epoch": 0.6902985074626866,
265
+ "grad_norm": 1.0412249297641412,
266
  "learning_rate": 1.5850740653856096e-05,
267
+ "loss": 2.1852,
268
  "step": 185
269
  },
270
  {
271
  "epoch": 0.7089552238805971,
272
+ "grad_norm": 1.2871807028686466,
273
  "learning_rate": 1.4683331167703218e-05,
274
+ "loss": 2.199,
275
  "step": 190
276
  },
277
  {
278
  "epoch": 0.7276119402985075,
279
+ "grad_norm": 1.2102977879839474,
280
  "learning_rate": 1.356492314681356e-05,
281
+ "loss": 2.2087,
282
  "step": 195
283
  },
284
  {
285
  "epoch": 0.746268656716418,
286
+ "grad_norm": 0.9891067478501752,
287
  "learning_rate": 1.2499792556533716e-05,
288
+ "loss": 2.1852,
289
  "step": 200
290
  },
291
  {
292
  "epoch": 0.7649253731343284,
293
+ "grad_norm": 0.8301326918730746,
294
  "learning_rate": 1.1492011668707753e-05,
295
+ "loss": 2.1736,
296
  "step": 205
297
  },
298
  {
299
  "epoch": 0.7835820895522388,
300
+ "grad_norm": 0.7981204348651115,
301
  "learning_rate": 1.0545433492320603e-05,
302
+ "loss": 2.1941,
303
  "step": 210
304
  },
305
  {
306
  "epoch": 0.8022388059701493,
307
+ "grad_norm": 0.9338161168132167,
308
  "learning_rate": 9.663677042440537e-06,
309
+ "loss": 2.1702,
310
  "step": 215
311
  },
312
  {
313
  "epoch": 0.8208955223880597,
314
+ "grad_norm": 0.8537076459178821,
315
  "learning_rate": 8.850113503781367e-06,
316
+ "loss": 2.169,
317
  "step": 220
318
  },
319
  {
320
  "epoch": 0.8395522388059702,
321
+ "grad_norm": 0.8531025855292982,
322
  "learning_rate": 8.107853341784671e-06,
323
+ "loss": 2.1934,
324
  "step": 225
325
  },
326
  {
327
  "epoch": 0.8582089552238806,
328
+ "grad_norm": 0.7887299648309287,
329
  "learning_rate": 7.439734410499752e-06,
330
+ "loss": 2.1789,
331
  "step": 230
332
  },
333
  {
334
  "epoch": 0.8768656716417911,
335
+ "grad_norm": 0.7325960398919809,
336
  "learning_rate": 6.848311102728011e-06,
337
+ "loss": 2.179,
338
  "step": 235
339
  },
340
  {
341
  "epoch": 0.8955223880597015,
342
+ "grad_norm": 0.8564233439601584,
343
  "learning_rate": 6.335844583913515e-06,
344
+ "loss": 2.1635,
345
  "step": 240
346
  },
347
  {
348
  "epoch": 0.914179104477612,
349
+ "grad_norm": 0.9533929441227149,
350
  "learning_rate": 5.904294147118193e-06,
351
+ "loss": 2.1775,
352
  "step": 245
353
  },
354
  {
355
  "epoch": 0.9328358208955224,
356
+ "grad_norm": 0.713300332791207,
357
  "learning_rate": 5.555309722133842e-06,
358
+ "loss": 2.1629,
359
  "step": 250
360
  },
361
  {
362
  "epoch": 0.9514925373134329,
363
+ "grad_norm": 0.7316453284294552,
364
  "learning_rate": 5.290225567370509e-06,
365
+ "loss": 2.1478,
366
  "step": 255
367
  },
368
  {
369
  "epoch": 0.9701492537313433,
370
+ "grad_norm": 0.6667774680701307,
371
  "learning_rate": 5.110055168638854e-06,
372
+ "loss": 2.1616,
373
  "step": 260
374
  },
375
  {
376
  "epoch": 0.9888059701492538,
377
+ "grad_norm": 0.7087383615740679,
378
  "learning_rate": 5.0154873643297575e-06,
379
+ "loss": 2.1761,
380
  "step": 265
381
  },
382
  {
383
  "epoch": 1.0,
384
  "step": 268,
385
+ "total_flos": 126040110268416.0,
386
+ "train_loss": 2.3174947375681865,
387
+ "train_runtime": 34132.5347,
388
+ "train_samples_per_second": 1.005,
389
+ "train_steps_per_second": 0.008
390
  }
391
  ],
392
  "logging_steps": 5,
 
406
  "attributes": {}
407
  }
408
  },
409
+ "total_flos": 126040110268416.0,
410
  "train_batch_size": 4,
411
  "trial_name": null,
412
  "trial_params": null