chunping-m commited on
Commit
a156208
·
verified ·
1 Parent(s): dc7bd13

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/mnt/d/f0_kmeans_T5/checkpoint-2000",
3
+ "architectures": [
4
+ "MT5ForConditionalGeneration"
5
+ ],
6
+ "classifier_dropout": 0.0,
7
+ "d_ff": 1024,
8
+ "d_kv": 64,
9
+ "d_model": 512,
10
+ "decoder_start_token_id": 0,
11
+ "dense_act_fn": "gelu_new",
12
+ "dropout_rate": 0.1,
13
+ "eos_token_id": 1,
14
+ "feed_forward_proj": "gated-gelu",
15
+ "initializer_factor": 1.0,
16
+ "is_encoder_decoder": true,
17
+ "is_gated_act": true,
18
+ "layer_norm_epsilon": 1e-06,
19
+ "model_type": "mt5",
20
+ "num_decoder_layers": 8,
21
+ "num_heads": 6,
22
+ "num_layers": 8,
23
+ "pad_token_id": 0,
24
+ "relative_attention_max_distance": 128,
25
+ "relative_attention_num_buckets": 32,
26
+ "tie_word_embeddings": false,
27
+ "tokenizer_class": "T5Tokenizer",
28
+ "torch_dtype": "float32",
29
+ "transformers_version": "4.40.2",
30
+ "use_cache": true,
31
+ "vocab_size": 250101
32
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "decoder_start_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.40.2"
7
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43226b3a6968b718374df385ca06a4426e699e2dd890eeafdf6f79819d782a6d
3
+ size 1200684456
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e64dfc14c3a15656d3863d5cb576e968dff0c46e35fd78558dd031a89092f9f1
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5fb67bbe6b7e3f079d8158118ffcdf8ca1fb16112ad5b1903035ffc3876c7d5
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,613 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 1.3676984310150146,
3
+ "best_model_checkpoint": "/mnt/d/f0_kmeans_T5_resume/checkpoint-8000",
4
+ "epoch": 0.8033287936888481,
5
+ "eval_steps": 2000,
6
+ "global_step": 8000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.010041609921110603,
13
+ "grad_norm": 0.6225447654724121,
14
+ "learning_rate": 2e-05,
15
+ "loss": 1.8574,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.020083219842221205,
20
+ "grad_norm": 0.58293217420578,
21
+ "learning_rate": 4e-05,
22
+ "loss": 1.8565,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.030124829763331806,
27
+ "grad_norm": 0.7323440313339233,
28
+ "learning_rate": 6e-05,
29
+ "loss": 1.8463,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.04016643968444241,
34
+ "grad_norm": 0.501658022403717,
35
+ "learning_rate": 8e-05,
36
+ "loss": 1.8284,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.05020804960555301,
41
+ "grad_norm": 0.971801221370697,
42
+ "learning_rate": 0.0001,
43
+ "loss": 1.8107,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.06024965952666361,
48
+ "grad_norm": 0.6389101147651672,
49
+ "learning_rate": 9.894269401564813e-05,
50
+ "loss": 1.7857,
51
+ "step": 600
52
+ },
53
+ {
54
+ "epoch": 0.07029126944777421,
55
+ "grad_norm": 0.6149382591247559,
56
+ "learning_rate": 9.788538803129626e-05,
57
+ "loss": 1.7686,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 0.08033287936888482,
62
+ "grad_norm": 0.9680651426315308,
63
+ "learning_rate": 9.68280820469444e-05,
64
+ "loss": 1.7481,
65
+ "step": 800
66
+ },
67
+ {
68
+ "epoch": 0.09037448928999542,
69
+ "grad_norm": 0.5966058969497681,
70
+ "learning_rate": 9.577077606259251e-05,
71
+ "loss": 1.7298,
72
+ "step": 900
73
+ },
74
+ {
75
+ "epoch": 0.10041609921110602,
76
+ "grad_norm": 0.806626558303833,
77
+ "learning_rate": 9.471347007824066e-05,
78
+ "loss": 1.7173,
79
+ "step": 1000
80
+ },
81
+ {
82
+ "epoch": 0.11045770913221663,
83
+ "grad_norm": 0.6925487518310547,
84
+ "learning_rate": 9.365616409388877e-05,
85
+ "loss": 1.6905,
86
+ "step": 1100
87
+ },
88
+ {
89
+ "epoch": 0.12049931905332723,
90
+ "grad_norm": 0.7050909399986267,
91
+ "learning_rate": 9.25988581095369e-05,
92
+ "loss": 1.679,
93
+ "step": 1200
94
+ },
95
+ {
96
+ "epoch": 0.13054092897443784,
97
+ "grad_norm": 0.560664713382721,
98
+ "learning_rate": 9.154155212518503e-05,
99
+ "loss": 1.6691,
100
+ "step": 1300
101
+ },
102
+ {
103
+ "epoch": 0.14058253889554842,
104
+ "grad_norm": 0.7096071243286133,
105
+ "learning_rate": 9.048424614083316e-05,
106
+ "loss": 1.6542,
107
+ "step": 1400
108
+ },
109
+ {
110
+ "epoch": 0.15062414881665903,
111
+ "grad_norm": 0.8381897211074829,
112
+ "learning_rate": 8.942694015648129e-05,
113
+ "loss": 1.6545,
114
+ "step": 1500
115
+ },
116
+ {
117
+ "epoch": 0.16066575873776964,
118
+ "grad_norm": 0.8832806944847107,
119
+ "learning_rate": 8.836963417212942e-05,
120
+ "loss": 1.6378,
121
+ "step": 1600
122
+ },
123
+ {
124
+ "epoch": 0.17070736865888023,
125
+ "grad_norm": 0.5661273002624512,
126
+ "learning_rate": 8.731232818777755e-05,
127
+ "loss": 1.625,
128
+ "step": 1700
129
+ },
130
+ {
131
+ "epoch": 0.18074897857999084,
132
+ "grad_norm": 0.4389723241329193,
133
+ "learning_rate": 8.625502220342567e-05,
134
+ "loss": 1.6253,
135
+ "step": 1800
136
+ },
137
+ {
138
+ "epoch": 0.19079058850110145,
139
+ "grad_norm": 0.45324140787124634,
140
+ "learning_rate": 8.519771621907381e-05,
141
+ "loss": 1.6048,
142
+ "step": 1900
143
+ },
144
+ {
145
+ "epoch": 0.20083219842221203,
146
+ "grad_norm": 0.6926701068878174,
147
+ "learning_rate": 8.414041023472193e-05,
148
+ "loss": 1.6103,
149
+ "step": 2000
150
+ },
151
+ {
152
+ "epoch": 0.20083219842221203,
153
+ "eval_loss": 1.4620566368103027,
154
+ "eval_runtime": 407.2516,
155
+ "eval_samples_per_second": 173.89,
156
+ "eval_steps_per_second": 21.738,
157
+ "step": 2000
158
+ },
159
+ {
160
+ "epoch": 0.21087380834332264,
161
+ "grad_norm": 0.5207750797271729,
162
+ "learning_rate": 8.308310425037006e-05,
163
+ "loss": 1.6101,
164
+ "step": 2100
165
+ },
166
+ {
167
+ "epoch": 0.22091541826443326,
168
+ "grad_norm": 0.564079225063324,
169
+ "learning_rate": 8.202579826601819e-05,
170
+ "loss": 1.5943,
171
+ "step": 2200
172
+ },
173
+ {
174
+ "epoch": 0.23095702818554384,
175
+ "grad_norm": 0.6724056601524353,
176
+ "learning_rate": 8.096849228166632e-05,
177
+ "loss": 1.5963,
178
+ "step": 2300
179
+ },
180
+ {
181
+ "epoch": 0.24099863810665445,
182
+ "grad_norm": 0.42919212579727173,
183
+ "learning_rate": 7.991118629731445e-05,
184
+ "loss": 1.5863,
185
+ "step": 2400
186
+ },
187
+ {
188
+ "epoch": 0.25104024802776503,
189
+ "grad_norm": 0.49685391783714294,
190
+ "learning_rate": 7.885388031296258e-05,
191
+ "loss": 1.5833,
192
+ "step": 2500
193
+ },
194
+ {
195
+ "epoch": 0.2610818579488757,
196
+ "grad_norm": 0.784831702709198,
197
+ "learning_rate": 7.779657432861071e-05,
198
+ "loss": 1.5759,
199
+ "step": 2600
200
+ },
201
+ {
202
+ "epoch": 0.27112346786998626,
203
+ "grad_norm": 0.3932148814201355,
204
+ "learning_rate": 7.673926834425883e-05,
205
+ "loss": 1.5764,
206
+ "step": 2700
207
+ },
208
+ {
209
+ "epoch": 0.28116507779109684,
210
+ "grad_norm": 0.7318882346153259,
211
+ "learning_rate": 7.568196235990697e-05,
212
+ "loss": 1.5531,
213
+ "step": 2800
214
+ },
215
+ {
216
+ "epoch": 0.2912066877122075,
217
+ "grad_norm": 0.46883219480514526,
218
+ "learning_rate": 7.462465637555509e-05,
219
+ "loss": 1.5597,
220
+ "step": 2900
221
+ },
222
+ {
223
+ "epoch": 0.30124829763331806,
224
+ "grad_norm": 0.5420709252357483,
225
+ "learning_rate": 7.356735039120322e-05,
226
+ "loss": 1.5607,
227
+ "step": 3000
228
+ },
229
+ {
230
+ "epoch": 0.31128990755442865,
231
+ "grad_norm": 0.4739568829536438,
232
+ "learning_rate": 7.251004440685135e-05,
233
+ "loss": 1.5571,
234
+ "step": 3100
235
+ },
236
+ {
237
+ "epoch": 0.3213315174755393,
238
+ "grad_norm": 0.524760365486145,
239
+ "learning_rate": 7.145273842249948e-05,
240
+ "loss": 1.5413,
241
+ "step": 3200
242
+ },
243
+ {
244
+ "epoch": 0.33137312739664987,
245
+ "grad_norm": 0.40299034118652344,
246
+ "learning_rate": 7.039543243814761e-05,
247
+ "loss": 1.544,
248
+ "step": 3300
249
+ },
250
+ {
251
+ "epoch": 0.34141473731776045,
252
+ "grad_norm": 0.6930205225944519,
253
+ "learning_rate": 6.933812645379572e-05,
254
+ "loss": 1.5407,
255
+ "step": 3400
256
+ },
257
+ {
258
+ "epoch": 0.3514563472388711,
259
+ "grad_norm": 0.4104674160480499,
260
+ "learning_rate": 6.828082046944387e-05,
261
+ "loss": 1.539,
262
+ "step": 3500
263
+ },
264
+ {
265
+ "epoch": 0.3614979571599817,
266
+ "grad_norm": 0.4443911910057068,
267
+ "learning_rate": 6.722351448509198e-05,
268
+ "loss": 1.534,
269
+ "step": 3600
270
+ },
271
+ {
272
+ "epoch": 0.37153956708109226,
273
+ "grad_norm": 0.3941650092601776,
274
+ "learning_rate": 6.616620850074011e-05,
275
+ "loss": 1.5338,
276
+ "step": 3700
277
+ },
278
+ {
279
+ "epoch": 0.3815811770022029,
280
+ "grad_norm": 0.530119001865387,
281
+ "learning_rate": 6.510890251638824e-05,
282
+ "loss": 1.5244,
283
+ "step": 3800
284
+ },
285
+ {
286
+ "epoch": 0.3916227869233135,
287
+ "grad_norm": 0.5141203999519348,
288
+ "learning_rate": 6.405159653203637e-05,
289
+ "loss": 1.5242,
290
+ "step": 3900
291
+ },
292
+ {
293
+ "epoch": 0.40166439684442407,
294
+ "grad_norm": 0.5742161870002747,
295
+ "learning_rate": 6.29942905476845e-05,
296
+ "loss": 1.5233,
297
+ "step": 4000
298
+ },
299
+ {
300
+ "epoch": 0.40166439684442407,
301
+ "eval_loss": 1.409444808959961,
302
+ "eval_runtime": 406.7484,
303
+ "eval_samples_per_second": 174.105,
304
+ "eval_steps_per_second": 21.765,
305
+ "step": 4000
306
+ },
307
+ {
308
+ "epoch": 0.4117060067655347,
309
+ "grad_norm": 0.49223774671554565,
310
+ "learning_rate": 6.193698456333263e-05,
311
+ "loss": 1.5254,
312
+ "step": 4100
313
+ },
314
+ {
315
+ "epoch": 0.4217476166866453,
316
+ "grad_norm": 0.4922013580799103,
317
+ "learning_rate": 6.087967857898076e-05,
318
+ "loss": 1.5187,
319
+ "step": 4200
320
+ },
321
+ {
322
+ "epoch": 0.43178922660775587,
323
+ "grad_norm": 0.5500440001487732,
324
+ "learning_rate": 5.982237259462888e-05,
325
+ "loss": 1.5217,
326
+ "step": 4300
327
+ },
328
+ {
329
+ "epoch": 0.4418308365288665,
330
+ "grad_norm": 0.5224810242652893,
331
+ "learning_rate": 5.876506661027702e-05,
332
+ "loss": 1.5187,
333
+ "step": 4400
334
+ },
335
+ {
336
+ "epoch": 0.4518724464499771,
337
+ "grad_norm": 0.42916256189346313,
338
+ "learning_rate": 5.770776062592515e-05,
339
+ "loss": 1.5064,
340
+ "step": 4500
341
+ },
342
+ {
343
+ "epoch": 0.4619140563710877,
344
+ "grad_norm": 0.4909166693687439,
345
+ "learning_rate": 5.665045464157327e-05,
346
+ "loss": 1.5065,
347
+ "step": 4600
348
+ },
349
+ {
350
+ "epoch": 0.4719556662921983,
351
+ "grad_norm": 0.4582947790622711,
352
+ "learning_rate": 5.559314865722141e-05,
353
+ "loss": 1.5095,
354
+ "step": 4700
355
+ },
356
+ {
357
+ "epoch": 0.4819972762133089,
358
+ "grad_norm": 0.4197562038898468,
359
+ "learning_rate": 5.453584267286953e-05,
360
+ "loss": 1.501,
361
+ "step": 4800
362
+ },
363
+ {
364
+ "epoch": 0.4920388861344195,
365
+ "grad_norm": 0.42700713872909546,
366
+ "learning_rate": 5.3478536688517655e-05,
367
+ "loss": 1.5088,
368
+ "step": 4900
369
+ },
370
+ {
371
+ "epoch": 0.5020804960555301,
372
+ "grad_norm": 0.6198418140411377,
373
+ "learning_rate": 5.242123070416579e-05,
374
+ "loss": 1.4986,
375
+ "step": 5000
376
+ },
377
+ {
378
+ "epoch": 0.5121221059766407,
379
+ "grad_norm": 0.46520060300827026,
380
+ "learning_rate": 5.1363924719813915e-05,
381
+ "loss": 1.5118,
382
+ "step": 5100
383
+ },
384
+ {
385
+ "epoch": 0.5221637158977513,
386
+ "grad_norm": 0.36571305990219116,
387
+ "learning_rate": 5.030661873546204e-05,
388
+ "loss": 1.5023,
389
+ "step": 5200
390
+ },
391
+ {
392
+ "epoch": 0.5322053258188619,
393
+ "grad_norm": 0.3751266300678253,
394
+ "learning_rate": 4.924931275111017e-05,
395
+ "loss": 1.4921,
396
+ "step": 5300
397
+ },
398
+ {
399
+ "epoch": 0.5422469357399725,
400
+ "grad_norm": 0.37080907821655273,
401
+ "learning_rate": 4.8192006766758305e-05,
402
+ "loss": 1.492,
403
+ "step": 5400
404
+ },
405
+ {
406
+ "epoch": 0.5522885456610831,
407
+ "grad_norm": 0.43024304509162903,
408
+ "learning_rate": 4.7134700782406435e-05,
409
+ "loss": 1.4859,
410
+ "step": 5500
411
+ },
412
+ {
413
+ "epoch": 0.5623301555821937,
414
+ "grad_norm": 0.40753471851348877,
415
+ "learning_rate": 4.607739479805456e-05,
416
+ "loss": 1.4853,
417
+ "step": 5600
418
+ },
419
+ {
420
+ "epoch": 0.5723717655033043,
421
+ "grad_norm": 0.40475383400917053,
422
+ "learning_rate": 4.502008881370269e-05,
423
+ "loss": 1.489,
424
+ "step": 5700
425
+ },
426
+ {
427
+ "epoch": 0.582413375424415,
428
+ "grad_norm": 0.3590254485607147,
429
+ "learning_rate": 4.396278282935082e-05,
430
+ "loss": 1.4892,
431
+ "step": 5800
432
+ },
433
+ {
434
+ "epoch": 0.5924549853455255,
435
+ "grad_norm": 0.4830130338668823,
436
+ "learning_rate": 4.290547684499894e-05,
437
+ "loss": 1.4922,
438
+ "step": 5900
439
+ },
440
+ {
441
+ "epoch": 0.6024965952666361,
442
+ "grad_norm": 0.42672935128211975,
443
+ "learning_rate": 4.184817086064707e-05,
444
+ "loss": 1.483,
445
+ "step": 6000
446
+ },
447
+ {
448
+ "epoch": 0.6024965952666361,
449
+ "eval_loss": 1.3820070028305054,
450
+ "eval_runtime": 408.5783,
451
+ "eval_samples_per_second": 173.325,
452
+ "eval_steps_per_second": 21.668,
453
+ "step": 6000
454
+ },
455
+ {
456
+ "epoch": 0.6125382051877467,
457
+ "grad_norm": 0.39759695529937744,
458
+ "learning_rate": 4.07908648762952e-05,
459
+ "loss": 1.4778,
460
+ "step": 6100
461
+ },
462
+ {
463
+ "epoch": 0.6225798151088573,
464
+ "grad_norm": 0.43023234605789185,
465
+ "learning_rate": 3.9733558891943326e-05,
466
+ "loss": 1.4774,
467
+ "step": 6200
468
+ },
469
+ {
470
+ "epoch": 0.6326214250299679,
471
+ "grad_norm": 0.4756367802619934,
472
+ "learning_rate": 3.867625290759146e-05,
473
+ "loss": 1.4879,
474
+ "step": 6300
475
+ },
476
+ {
477
+ "epoch": 0.6426630349510786,
478
+ "grad_norm": 0.46574530005455017,
479
+ "learning_rate": 3.761894692323959e-05,
480
+ "loss": 1.4765,
481
+ "step": 6400
482
+ },
483
+ {
484
+ "epoch": 0.6527046448721892,
485
+ "grad_norm": 0.6158186197280884,
486
+ "learning_rate": 3.6561640938887716e-05,
487
+ "loss": 1.4741,
488
+ "step": 6500
489
+ },
490
+ {
491
+ "epoch": 0.6627462547932997,
492
+ "grad_norm": 0.42495304346084595,
493
+ "learning_rate": 3.5504334954535846e-05,
494
+ "loss": 1.4764,
495
+ "step": 6600
496
+ },
497
+ {
498
+ "epoch": 0.6727878647144103,
499
+ "grad_norm": 0.44197046756744385,
500
+ "learning_rate": 3.4447028970183976e-05,
501
+ "loss": 1.4688,
502
+ "step": 6700
503
+ },
504
+ {
505
+ "epoch": 0.6828294746355209,
506
+ "grad_norm": 0.38747721910476685,
507
+ "learning_rate": 3.33897229858321e-05,
508
+ "loss": 1.4772,
509
+ "step": 6800
510
+ },
511
+ {
512
+ "epoch": 0.6928710845566315,
513
+ "grad_norm": 0.3641980290412903,
514
+ "learning_rate": 3.233241700148023e-05,
515
+ "loss": 1.47,
516
+ "step": 6900
517
+ },
518
+ {
519
+ "epoch": 0.7029126944777422,
520
+ "grad_norm": 0.39210596680641174,
521
+ "learning_rate": 3.127511101712835e-05,
522
+ "loss": 1.4734,
523
+ "step": 7000
524
+ },
525
+ {
526
+ "epoch": 0.7129543043988528,
527
+ "grad_norm": 0.4860726594924927,
528
+ "learning_rate": 3.0217805032776486e-05,
529
+ "loss": 1.4815,
530
+ "step": 7100
531
+ },
532
+ {
533
+ "epoch": 0.7229959143199634,
534
+ "grad_norm": 0.4656274616718292,
535
+ "learning_rate": 2.9160499048424616e-05,
536
+ "loss": 1.464,
537
+ "step": 7200
538
+ },
539
+ {
540
+ "epoch": 0.7330375242410739,
541
+ "grad_norm": 0.37134116888046265,
542
+ "learning_rate": 2.810319306407274e-05,
543
+ "loss": 1.4716,
544
+ "step": 7300
545
+ },
546
+ {
547
+ "epoch": 0.7430791341621845,
548
+ "grad_norm": 0.46362295746803284,
549
+ "learning_rate": 2.7045887079720873e-05,
550
+ "loss": 1.4725,
551
+ "step": 7400
552
+ },
553
+ {
554
+ "epoch": 0.7531207440832951,
555
+ "grad_norm": 0.3737928867340088,
556
+ "learning_rate": 2.5988581095369003e-05,
557
+ "loss": 1.4673,
558
+ "step": 7500
559
+ },
560
+ {
561
+ "epoch": 0.7631623540044058,
562
+ "grad_norm": 0.3332815170288086,
563
+ "learning_rate": 2.493127511101713e-05,
564
+ "loss": 1.4608,
565
+ "step": 7600
566
+ },
567
+ {
568
+ "epoch": 0.7732039639255164,
569
+ "grad_norm": 0.44841036200523376,
570
+ "learning_rate": 2.3873969126665257e-05,
571
+ "loss": 1.466,
572
+ "step": 7700
573
+ },
574
+ {
575
+ "epoch": 0.783245573846627,
576
+ "grad_norm": 0.5019286274909973,
577
+ "learning_rate": 2.2816663142313387e-05,
578
+ "loss": 1.4651,
579
+ "step": 7800
580
+ },
581
+ {
582
+ "epoch": 0.7932871837677375,
583
+ "grad_norm": 0.3360118567943573,
584
+ "learning_rate": 2.1759357157961517e-05,
585
+ "loss": 1.4706,
586
+ "step": 7900
587
+ },
588
+ {
589
+ "epoch": 0.8033287936888481,
590
+ "grad_norm": 0.38790610432624817,
591
+ "learning_rate": 2.0702051173609644e-05,
592
+ "loss": 1.4646,
593
+ "step": 8000
594
+ },
595
+ {
596
+ "epoch": 0.8033287936888481,
597
+ "eval_loss": 1.3676984310150146,
598
+ "eval_runtime": 408.474,
599
+ "eval_samples_per_second": 173.37,
600
+ "eval_steps_per_second": 21.673,
601
+ "step": 8000
602
+ }
603
+ ],
604
+ "logging_steps": 100,
605
+ "max_steps": 9958,
606
+ "num_input_tokens_seen": 0,
607
+ "num_train_epochs": 1,
608
+ "save_steps": 2000,
609
+ "total_flos": 1.35355491680256e+17,
610
+ "train_batch_size": 4,
611
+ "trial_name": null,
612
+ "trial_params": null
613
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:698e0e256724fb8c8e561a1465421623b9a309763586162a64221d91635c40fd
3
+ size 5112