Jennny committed on
Commit
6049b26
·
verified ·
1 Parent(s): ae70fd3

Model save

Browse files
README.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B
3
+ library_name: transformers
4
+ model_name: qwen25_7b_sft_math
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - sft
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for qwen25_7b_sft_math
13
+
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="Jennny/qwen25_7b_sft_math", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/jenny-shen/huggingface/runs/r912sj1z)
31
+
32
+ This model was trained with SFT.
33
+
34
+ ### Framework versions
35
+
36
+ - TRL: 0.12.2
37
+ - Transformers: 4.46.3
38
+ - Pytorch: 2.5.1+cu124
39
+ - Datasets: 3.3.2
40
+ - Tokenizers: 0.20.3
41
+
42
+ ## Citations
43
+
44
+
45
+
46
+ Cite TRL as:
47
+
48
+ ```bibtex
49
+ @misc{vonwerra2022trl,
50
+ title = {{TRL: Transformer Reinforcement Learning}},
51
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
52
+ year = 2020,
53
+ journal = {GitHub repository},
54
+ publisher = {GitHub},
55
+ howpublished = {\url{https://github.com/huggingface/trl}}
56
+ }
57
+ ```
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "total_flos": 24329682780160.0,
4
+ "train_loss": 0.14914496035286876,
5
+ "train_runtime": 934.4929,
6
+ "train_samples": 5274,
7
+ "train_samples_per_second": 11.287,
8
+ "train_steps_per_second": 0.353
9
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
+ "transformers_version": "4.46.3"
6
+ }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ea4d19d19387ed04886c6e7ad825f2376fe46f2ac1f3e1a68dafb043d9a11d44
3
  size 4877660776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10147203b67be5a160709ebc57781cd8fff90dd3360a0f3eac2b7f0f2f053673
3
  size 4877660776
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b2ddb97d57b189055535bac3dfa2b59481934882968dd40f80c032e924aef95
3
  size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44a142dd671d80f4e3060f084447b5349a84bf6c15aad72df10ab3baf8fabc48
3
  size 4932751008
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c53f6423939a85f65850c6be7cee9b6b2c539e60caf44a571a46a70049cd911a
3
  size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b3a68110d39341eae2f46e62860343e24e2e09b2616ace54f1136082bfef65d
3
  size 4330865200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:83da8f4036c38e2bad7ca1a855629135dcfcae9e78b480d4e0e8ef825de3137a
3
  size 1089994880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41b22ca4336a63c3bae0867343d3ae3de8eb0414bb17aa8f4d9d731359964420
3
  size 1089994880
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.0,
3
+ "total_flos": 24329682780160.0,
4
+ "train_loss": 0.14914496035286876,
5
+ "train_runtime": 934.4929,
6
+ "train_samples": 5274,
7
+ "train_samples_per_second": 11.287,
8
+ "train_steps_per_second": 0.353
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,535 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
+ "eval_steps": 100,
6
+ "global_step": 330,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.006060606060606061,
13
+ "grad_norm": 35.87954264224515,
14
+ "learning_rate": 6.060606060606061e-07,
15
+ "loss": 0.4258,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.030303030303030304,
20
+ "grad_norm": 19.325121547130255,
21
+ "learning_rate": 3.0303030303030305e-06,
22
+ "loss": 0.3764,
23
+ "step": 5
24
+ },
25
+ {
26
+ "epoch": 0.06060606060606061,
27
+ "grad_norm": 28.11024716591639,
28
+ "learning_rate": 6.060606060606061e-06,
29
+ "loss": 0.3414,
30
+ "step": 10
31
+ },
32
+ {
33
+ "epoch": 0.09090909090909091,
34
+ "grad_norm": 4.816824647664995,
35
+ "learning_rate": 9.090909090909091e-06,
36
+ "loss": 0.2563,
37
+ "step": 15
38
+ },
39
+ {
40
+ "epoch": 0.12121212121212122,
41
+ "grad_norm": 1.6101991751262534,
42
+ "learning_rate": 1.2121212121212122e-05,
43
+ "loss": 0.2229,
44
+ "step": 20
45
+ },
46
+ {
47
+ "epoch": 0.15151515151515152,
48
+ "grad_norm": 1.4466897117313482,
49
+ "learning_rate": 1.5151515151515153e-05,
50
+ "loss": 0.2053,
51
+ "step": 25
52
+ },
53
+ {
54
+ "epoch": 0.18181818181818182,
55
+ "grad_norm": 1.411982251904895,
56
+ "learning_rate": 1.8181818181818182e-05,
57
+ "loss": 0.1941,
58
+ "step": 30
59
+ },
60
+ {
61
+ "epoch": 0.21212121212121213,
62
+ "grad_norm": 1.1694992562710236,
63
+ "learning_rate": 1.999776230627102e-05,
64
+ "loss": 0.1899,
65
+ "step": 35
66
+ },
67
+ {
68
+ "epoch": 0.24242424242424243,
69
+ "grad_norm": 1.0438333794181684,
70
+ "learning_rate": 1.9972599751485225e-05,
71
+ "loss": 0.1846,
72
+ "step": 40
73
+ },
74
+ {
75
+ "epoch": 0.2727272727272727,
76
+ "grad_norm": 0.8319491355818911,
77
+ "learning_rate": 1.9919548128307954e-05,
78
+ "loss": 0.1841,
79
+ "step": 45
80
+ },
81
+ {
82
+ "epoch": 0.30303030303030304,
83
+ "grad_norm": 0.9996099757839279,
84
+ "learning_rate": 1.9838755799290993e-05,
85
+ "loss": 0.1964,
86
+ "step": 50
87
+ },
88
+ {
89
+ "epoch": 0.3333333333333333,
90
+ "grad_norm": 0.7913395175455826,
91
+ "learning_rate": 1.973044870579824e-05,
92
+ "loss": 0.2125,
93
+ "step": 55
94
+ },
95
+ {
96
+ "epoch": 0.36363636363636365,
97
+ "grad_norm": 0.8429105469391287,
98
+ "learning_rate": 1.9594929736144978e-05,
99
+ "loss": 0.1969,
100
+ "step": 60
101
+ },
102
+ {
103
+ "epoch": 0.3939393939393939,
104
+ "grad_norm": 0.9398324090183084,
105
+ "learning_rate": 1.9432577878549635e-05,
106
+ "loss": 0.2056,
107
+ "step": 65
108
+ },
109
+ {
110
+ "epoch": 0.42424242424242425,
111
+ "grad_norm": 0.7734388791502247,
112
+ "learning_rate": 1.9243847161266924e-05,
113
+ "loss": 0.209,
114
+ "step": 70
115
+ },
116
+ {
117
+ "epoch": 0.45454545454545453,
118
+ "grad_norm": 0.8398725546299396,
119
+ "learning_rate": 1.9029265382866216e-05,
120
+ "loss": 0.2001,
121
+ "step": 75
122
+ },
123
+ {
124
+ "epoch": 0.48484848484848486,
125
+ "grad_norm": 0.811876332964586,
126
+ "learning_rate": 1.8789432636206197e-05,
127
+ "loss": 0.1909,
128
+ "step": 80
129
+ },
130
+ {
131
+ "epoch": 0.5151515151515151,
132
+ "grad_norm": 0.9379567664437414,
133
+ "learning_rate": 1.8525019630233463e-05,
134
+ "loss": 0.2014,
135
+ "step": 85
136
+ },
137
+ {
138
+ "epoch": 0.5454545454545454,
139
+ "grad_norm": 0.7797031245696635,
140
+ "learning_rate": 1.8236765814298328e-05,
141
+ "loss": 0.1954,
142
+ "step": 90
143
+ },
144
+ {
145
+ "epoch": 0.5757575757575758,
146
+ "grad_norm": 0.8027439585103229,
147
+ "learning_rate": 1.792547731023332e-05,
148
+ "loss": 0.1893,
149
+ "step": 95
150
+ },
151
+ {
152
+ "epoch": 0.6060606060606061,
153
+ "grad_norm": 0.8602477802444706,
154
+ "learning_rate": 1.7592024657977432e-05,
155
+ "loss": 0.2123,
156
+ "step": 100
157
+ },
158
+ {
159
+ "epoch": 0.6060606060606061,
160
+ "eval_loss": 0.2053576111793518,
161
+ "eval_runtime": 5.4818,
162
+ "eval_samples_per_second": 54.727,
163
+ "eval_steps_per_second": 1.824,
164
+ "step": 100
165
+ },
166
+ {
167
+ "epoch": 0.6363636363636364,
168
+ "grad_norm": 0.8118731483818742,
169
+ "learning_rate": 1.72373403810507e-05,
170
+ "loss": 0.2138,
171
+ "step": 105
172
+ },
173
+ {
174
+ "epoch": 0.6666666666666666,
175
+ "grad_norm": 0.8978812848872854,
176
+ "learning_rate": 1.686241637868734e-05,
177
+ "loss": 0.1936,
178
+ "step": 110
179
+ },
180
+ {
181
+ "epoch": 0.696969696969697,
182
+ "grad_norm": 0.6638605979568581,
183
+ "learning_rate": 1.6468301151920576e-05,
184
+ "loss": 0.2009,
185
+ "step": 115
186
+ },
187
+ {
188
+ "epoch": 0.7272727272727273,
189
+ "grad_norm": 0.8138109930342797,
190
+ "learning_rate": 1.6056096871376667e-05,
191
+ "loss": 0.1909,
192
+ "step": 120
193
+ },
194
+ {
195
+ "epoch": 0.7575757575757576,
196
+ "grad_norm": 0.8916280069276008,
197
+ "learning_rate": 1.5626956294978103e-05,
198
+ "loss": 0.1995,
199
+ "step": 125
200
+ },
201
+ {
202
+ "epoch": 0.7878787878787878,
203
+ "grad_norm": 0.929042576381489,
204
+ "learning_rate": 1.5182079544175957e-05,
205
+ "loss": 0.2032,
206
+ "step": 130
207
+ },
208
+ {
209
+ "epoch": 0.8181818181818182,
210
+ "grad_norm": 0.822937565391655,
211
+ "learning_rate": 1.472271074772683e-05,
212
+ "loss": 0.1992,
213
+ "step": 135
214
+ },
215
+ {
216
+ "epoch": 0.8484848484848485,
217
+ "grad_norm": 0.7355733490412192,
218
+ "learning_rate": 1.4250134562400301e-05,
219
+ "loss": 0.1905,
220
+ "step": 140
221
+ },
222
+ {
223
+ "epoch": 0.8787878787878788,
224
+ "grad_norm": 0.8972031528134459,
225
+ "learning_rate": 1.3765672580346986e-05,
226
+ "loss": 0.1963,
227
+ "step": 145
228
+ },
229
+ {
230
+ "epoch": 0.9090909090909091,
231
+ "grad_norm": 0.7484213489004259,
232
+ "learning_rate": 1.3270679633174219e-05,
233
+ "loss": 0.1872,
234
+ "step": 150
235
+ },
236
+ {
237
+ "epoch": 0.9393939393939394,
238
+ "grad_norm": 0.8864952745311097,
239
+ "learning_rate": 1.2766540003065272e-05,
240
+ "loss": 0.1813,
241
+ "step": 155
242
+ },
243
+ {
244
+ "epoch": 0.9696969696969697,
245
+ "grad_norm": 0.7562707141763975,
246
+ "learning_rate": 1.2254663551538047e-05,
247
+ "loss": 0.1937,
248
+ "step": 160
249
+ },
250
+ {
251
+ "epoch": 1.0,
252
+ "grad_norm": 0.7925812619946488,
253
+ "learning_rate": 1.1736481776669307e-05,
254
+ "loss": 0.1869,
255
+ "step": 165
256
+ },
257
+ {
258
+ "epoch": 1.0303030303030303,
259
+ "grad_norm": 0.7262299560547822,
260
+ "learning_rate": 1.121344380981082e-05,
261
+ "loss": 0.1035,
262
+ "step": 170
263
+ },
264
+ {
265
+ "epoch": 1.0606060606060606,
266
+ "grad_norm": 0.8601895545477497,
267
+ "learning_rate": 1.068701236299281e-05,
268
+ "loss": 0.0924,
269
+ "step": 175
270
+ },
271
+ {
272
+ "epoch": 1.0909090909090908,
273
+ "grad_norm": 0.8688643496111008,
274
+ "learning_rate": 1.015865963834808e-05,
275
+ "loss": 0.0927,
276
+ "step": 180
277
+ },
278
+ {
279
+ "epoch": 1.121212121212121,
280
+ "grad_norm": 0.6566542211184438,
281
+ "learning_rate": 9.62986321099642e-06,
282
+ "loss": 0.0929,
283
+ "step": 185
284
+ },
285
+ {
286
+ "epoch": 1.1515151515151516,
287
+ "grad_norm": 0.7231975837719832,
288
+ "learning_rate": 9.102101896903084e-06,
289
+ "loss": 0.0992,
290
+ "step": 190
291
+ },
292
+ {
293
+ "epoch": 1.1818181818181819,
294
+ "grad_norm": 0.6644028122208893,
295
+ "learning_rate": 8.576851617267151e-06,
296
+ "loss": 0.1022,
297
+ "step": 195
298
+ },
299
+ {
300
+ "epoch": 1.2121212121212122,
301
+ "grad_norm": 0.6779100739664534,
302
+ "learning_rate": 8.055581271005292e-06,
303
+ "loss": 0.0911,
304
+ "step": 200
305
+ },
306
+ {
307
+ "epoch": 1.2121212121212122,
308
+ "eval_loss": 0.20411260426044464,
309
+ "eval_runtime": 5.4814,
310
+ "eval_samples_per_second": 54.731,
311
+ "eval_steps_per_second": 1.824,
312
+ "step": 200
313
+ },
314
+ {
315
+ "epoch": 1.2424242424242424,
316
+ "grad_norm": 0.7215991029800394,
317
+ "learning_rate": 7.539748626873866e-06,
318
+ "loss": 0.097,
319
+ "step": 205
320
+ },
321
+ {
322
+ "epoch": 1.2727272727272727,
323
+ "grad_norm": 0.6420840571051777,
324
+ "learning_rate": 7.0307962467172555e-06,
325
+ "loss": 0.0828,
326
+ "step": 210
327
+ },
328
+ {
329
+ "epoch": 1.303030303030303,
330
+ "grad_norm": 0.7157636358082953,
331
+ "learning_rate": 6.530147451243377e-06,
332
+ "loss": 0.0953,
333
+ "step": 215
334
+ },
335
+ {
336
+ "epoch": 1.3333333333333333,
337
+ "grad_norm": 0.6543745308409002,
338
+ "learning_rate": 6.039202339608432e-06,
339
+ "loss": 0.0911,
340
+ "step": 220
341
+ },
342
+ {
343
+ "epoch": 1.3636363636363638,
344
+ "grad_norm": 0.6350258787861462,
345
+ "learning_rate": 5.559333873942259e-06,
346
+ "loss": 0.0873,
347
+ "step": 225
348
+ },
349
+ {
350
+ "epoch": 1.393939393939394,
351
+ "grad_norm": 0.6107660743054869,
352
+ "learning_rate": 5.091884039764321e-06,
353
+ "loss": 0.0883,
354
+ "step": 230
355
+ },
356
+ {
357
+ "epoch": 1.4242424242424243,
358
+ "grad_norm": 0.7110830925103607,
359
+ "learning_rate": 4.638160093027908e-06,
360
+ "loss": 0.0938,
361
+ "step": 235
362
+ },
363
+ {
364
+ "epoch": 1.4545454545454546,
365
+ "grad_norm": 0.7041242370967582,
366
+ "learning_rate": 4.19943090428802e-06,
367
+ "loss": 0.0938,
368
+ "step": 240
369
+ },
370
+ {
371
+ "epoch": 1.4848484848484849,
372
+ "grad_norm": 0.6257423065834941,
373
+ "learning_rate": 3.7769234102166365e-06,
374
+ "loss": 0.0869,
375
+ "step": 245
376
+ },
377
+ {
378
+ "epoch": 1.5151515151515151,
379
+ "grad_norm": 0.6573334135670441,
380
+ "learning_rate": 3.37181918238904e-06,
381
+ "loss": 0.0886,
382
+ "step": 250
383
+ },
384
+ {
385
+ "epoch": 1.5454545454545454,
386
+ "grad_norm": 0.5682090705026351,
387
+ "learning_rate": 2.9852511229367862e-06,
388
+ "loss": 0.0897,
389
+ "step": 255
390
+ },
391
+ {
392
+ "epoch": 1.5757575757575757,
393
+ "grad_norm": 0.599789982325095,
394
+ "learning_rate": 2.618300296308135e-06,
395
+ "loss": 0.0844,
396
+ "step": 260
397
+ },
398
+ {
399
+ "epoch": 1.606060606060606,
400
+ "grad_norm": 0.6976590209412902,
401
+ "learning_rate": 2.27199290599617e-06,
402
+ "loss": 0.0871,
403
+ "step": 265
404
+ },
405
+ {
406
+ "epoch": 1.6363636363636362,
407
+ "grad_norm": 0.739903490519265,
408
+ "learning_rate": 1.947297424689414e-06,
409
+ "loss": 0.0871,
410
+ "step": 270
411
+ },
412
+ {
413
+ "epoch": 1.6666666666666665,
414
+ "grad_norm": 0.6235342968784775,
415
+ "learning_rate": 1.6451218858706374e-06,
416
+ "loss": 0.0891,
417
+ "step": 275
418
+ },
419
+ {
420
+ "epoch": 1.696969696969697,
421
+ "grad_norm": 0.6326699548618793,
422
+ "learning_rate": 1.3663113444380905e-06,
423
+ "loss": 0.0836,
424
+ "step": 280
425
+ },
426
+ {
427
+ "epoch": 1.7272727272727273,
428
+ "grad_norm": 0.6057890106461301,
429
+ "learning_rate": 1.1116455134507665e-06,
430
+ "loss": 0.0835,
431
+ "step": 285
432
+ },
433
+ {
434
+ "epoch": 1.7575757575757576,
435
+ "grad_norm": 0.6815041899874066,
436
+ "learning_rate": 8.818365836066101e-07,
437
+ "loss": 0.0823,
438
+ "step": 290
439
+ },
440
+ {
441
+ "epoch": 1.7878787878787878,
442
+ "grad_norm": 0.6655841999996316,
443
+ "learning_rate": 6.775272315517423e-07,
444
+ "loss": 0.0809,
445
+ "step": 295
446
+ },
447
+ {
448
+ "epoch": 1.8181818181818183,
449
+ "grad_norm": 0.6919417565002871,
450
+ "learning_rate": 4.992888225905467e-07,
451
+ "loss": 0.0827,
452
+ "step": 300
453
+ },
454
+ {
455
+ "epoch": 1.8181818181818183,
456
+ "eval_loss": 0.19683437049388885,
457
+ "eval_runtime": 5.4854,
458
+ "eval_samples_per_second": 54.691,
459
+ "eval_steps_per_second": 1.823,
460
+ "step": 300
461
+ },
462
+ {
463
+ "epoch": 1.8484848484848486,
464
+ "grad_norm": 0.6671269900553093,
465
+ "learning_rate": 3.476198128228736e-07,
466
+ "loss": 0.0806,
467
+ "step": 305
468
+ },
469
+ {
470
+ "epoch": 1.878787878787879,
471
+ "grad_norm": 0.7226986759311911,
472
+ "learning_rate": 2.2294435517691504e-07,
473
+ "loss": 0.0849,
474
+ "step": 310
475
+ },
476
+ {
477
+ "epoch": 1.9090909090909092,
478
+ "grad_norm": 0.6203115249287291,
479
+ "learning_rate": 1.2561111323605714e-07,
480
+ "loss": 0.0796,
481
+ "step": 315
482
+ },
483
+ {
484
+ "epoch": 1.9393939393939394,
485
+ "grad_norm": 0.5457139445057179,
486
+ "learning_rate": 5.5892286176932875e-08,
487
+ "loss": 0.0844,
488
+ "step": 320
489
+ },
490
+ {
491
+ "epoch": 1.9696969696969697,
492
+ "grad_norm": 0.6283544510778528,
493
+ "learning_rate": 1.3982847545507271e-08,
494
+ "loss": 0.0868,
495
+ "step": 325
496
+ },
497
+ {
498
+ "epoch": 2.0,
499
+ "grad_norm": 0.6466043764676332,
500
+ "learning_rate": 0.0,
501
+ "loss": 0.0862,
502
+ "step": 330
503
+ },
504
+ {
505
+ "epoch": 2.0,
506
+ "step": 330,
507
+ "total_flos": 24329682780160.0,
508
+ "train_loss": 0.14914496035286876,
509
+ "train_runtime": 934.4929,
510
+ "train_samples_per_second": 11.287,
511
+ "train_steps_per_second": 0.353
512
+ }
513
+ ],
514
+ "logging_steps": 5,
515
+ "max_steps": 330,
516
+ "num_input_tokens_seen": 0,
517
+ "num_train_epochs": 2,
518
+ "save_steps": 100,
519
+ "stateful_callbacks": {
520
+ "TrainerControl": {
521
+ "args": {
522
+ "should_epoch_stop": false,
523
+ "should_evaluate": false,
524
+ "should_log": false,
525
+ "should_save": true,
526
+ "should_training_stop": true
527
+ },
528
+ "attributes": {}
529
+ }
530
+ },
531
+ "total_flos": 24329682780160.0,
532
+ "train_batch_size": 4,
533
+ "trial_name": null,
534
+ "trial_params": null
535
+ }