robertou2 commited on
Commit
5145fed
·
verified ·
1 Parent(s): d3062f9

Upload folder using huggingface_hub

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:635adffaeb09dbb7a664651d5421d2072d6032c31bd3760a8d552e0fe2a86f3e
3
  size 369133600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8bcbd57b5881ae329bb1c641f134c0e7e34dc8a10c269f9bef1144855507336
3
  size 369133600
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:770091e990e6db7a8ac8842db96d2c20c9b97b4bd60932176cd2127ad3bd7c4f
3
+ size 738417355
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8ebf0c029a2d2b49dc7dbaff0f2c313f2655675d94aa9e96c6aa9ea3ce9d185
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9da6c927c97f3fdf917426312ae7a36977f7cc25e99d53c6db7c5bf832d85594
3
+ size 1465
trainer_state.json ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.686695278969957,
6
+ "eval_steps": 500,
7
+ "global_step": 40,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 2.0594023689627647,
14
+ "epoch": 0.06866952789699571,
15
+ "grad_norm": 13.235921859741211,
16
+ "learning_rate": 0.0,
17
+ "loss": 2.0981,
18
+ "mean_token_accuracy": 0.47590671852231026,
19
+ "num_tokens": 3141.0,
20
+ "step": 1
21
+ },
22
+ {
23
+ "entropy": 2.0151727944612503,
24
+ "epoch": 0.13733905579399142,
25
+ "grad_norm": 12.92041015625,
26
+ "learning_rate": 1.3333333333333333e-05,
27
+ "loss": 2.0398,
28
+ "mean_token_accuracy": 0.4710495248436928,
29
+ "num_tokens": 6354.0,
30
+ "step": 2
31
+ },
32
+ {
33
+ "entropy": 2.1832356229424477,
34
+ "epoch": 0.20600858369098712,
35
+ "grad_norm": 9.109127044677734,
36
+ "learning_rate": 2.6666666666666667e-05,
37
+ "loss": 1.9638,
38
+ "mean_token_accuracy": 0.4837193079292774,
39
+ "num_tokens": 9122.0,
40
+ "step": 3
41
+ },
42
+ {
43
+ "entropy": 2.1158930361270905,
44
+ "epoch": 0.27467811158798283,
45
+ "grad_norm": 6.479565143585205,
46
+ "learning_rate": 4e-05,
47
+ "loss": 1.791,
48
+ "mean_token_accuracy": 0.5011687465012074,
49
+ "num_tokens": 12598.0,
50
+ "step": 4
51
+ },
52
+ {
53
+ "entropy": 2.1543598622083664,
54
+ "epoch": 0.34334763948497854,
55
+ "grad_norm": 6.700535774230957,
56
+ "learning_rate": 5.333333333333333e-05,
57
+ "loss": 1.7927,
58
+ "mean_token_accuracy": 0.5191793460398912,
59
+ "num_tokens": 15894.0,
60
+ "step": 5
61
+ },
62
+ {
63
+ "entropy": 2.190734125673771,
64
+ "epoch": 0.41201716738197425,
65
+ "grad_norm": 6.992319583892822,
66
+ "learning_rate": 6.666666666666667e-05,
67
+ "loss": 1.7767,
68
+ "mean_token_accuracy": 0.5401564333587885,
69
+ "num_tokens": 18120.0,
70
+ "step": 6
71
+ },
72
+ {
73
+ "entropy": 1.8032616525888443,
74
+ "epoch": 0.48068669527896996,
75
+ "grad_norm": 4.852386951446533,
76
+ "learning_rate": 8e-05,
77
+ "loss": 1.4557,
78
+ "mean_token_accuracy": 0.5802906360477209,
79
+ "num_tokens": 21398.0,
80
+ "step": 7
81
+ },
82
+ {
83
+ "entropy": 1.911305882036686,
84
+ "epoch": 0.5493562231759657,
85
+ "grad_norm": 5.9996657371521,
86
+ "learning_rate": 9.333333333333334e-05,
87
+ "loss": 1.7139,
88
+ "mean_token_accuracy": 0.5444964710623026,
89
+ "num_tokens": 24176.0,
90
+ "step": 8
91
+ },
92
+ {
93
+ "entropy": 1.6898048743605614,
94
+ "epoch": 0.6180257510729614,
95
+ "grad_norm": 4.547244071960449,
96
+ "learning_rate": 0.00010666666666666667,
97
+ "loss": 1.534,
98
+ "mean_token_accuracy": 0.5793795734643936,
99
+ "num_tokens": 27757.0,
100
+ "step": 9
101
+ },
102
+ {
103
+ "entropy": 1.9274577349424362,
104
+ "epoch": 0.6866952789699571,
105
+ "grad_norm": 5.497706413269043,
106
+ "learning_rate": 0.00012,
107
+ "loss": 1.6664,
108
+ "mean_token_accuracy": 0.5688131004571915,
109
+ "num_tokens": 30127.0,
110
+ "step": 10
111
+ },
112
+ {
113
+ "entropy": 2.0350103303790092,
114
+ "epoch": 0.7553648068669528,
115
+ "grad_norm": 4.125185966491699,
116
+ "learning_rate": 0.00013333333333333334,
117
+ "loss": 1.4849,
118
+ "mean_token_accuracy": 0.5754233915358782,
119
+ "num_tokens": 33423.0,
120
+ "step": 11
121
+ },
122
+ {
123
+ "entropy": 2.0800106152892113,
124
+ "epoch": 0.8240343347639485,
125
+ "grad_norm": 5.943212032318115,
126
+ "learning_rate": 0.00014666666666666666,
127
+ "loss": 1.4663,
128
+ "mean_token_accuracy": 0.5716470144689083,
129
+ "num_tokens": 36704.0,
130
+ "step": 12
131
+ },
132
+ {
133
+ "entropy": 1.8619608655571938,
134
+ "epoch": 0.8927038626609443,
135
+ "grad_norm": 4.216862678527832,
136
+ "learning_rate": 0.00016,
137
+ "loss": 1.4236,
138
+ "mean_token_accuracy": 0.6065030060708523,
139
+ "num_tokens": 40036.0,
140
+ "step": 13
141
+ },
142
+ {
143
+ "entropy": 1.793141208589077,
144
+ "epoch": 0.9613733905579399,
145
+ "grad_norm": 4.655040264129639,
146
+ "learning_rate": 0.00017333333333333334,
147
+ "loss": 1.4756,
148
+ "mean_token_accuracy": 0.5741604901850224,
149
+ "num_tokens": 43346.0,
150
+ "step": 14
151
+ },
152
+ {
153
+ "entropy": 1.6789460447099473,
154
+ "epoch": 1.0,
155
+ "grad_norm": 5.187127113342285,
156
+ "learning_rate": 0.0001866666666666667,
157
+ "loss": 1.5609,
158
+ "mean_token_accuracy": 0.5648946828312345,
159
+ "num_tokens": 45318.0,
160
+ "step": 15
161
+ },
162
+ {
163
+ "entropy": 1.6579683423042297,
164
+ "epoch": 1.0686695278969958,
165
+ "grad_norm": 5.212145805358887,
166
+ "learning_rate": 0.0002,
167
+ "loss": 1.0621,
168
+ "mean_token_accuracy": 0.6966231800615788,
169
+ "num_tokens": 47725.0,
170
+ "step": 16
171
+ },
172
+ {
173
+ "entropy": 1.4356463253498077,
174
+ "epoch": 1.1373390557939915,
175
+ "grad_norm": 4.121367931365967,
176
+ "learning_rate": 0.00019945218953682734,
177
+ "loss": 0.9024,
178
+ "mean_token_accuracy": 0.7124389447271824,
179
+ "num_tokens": 51539.0,
180
+ "step": 17
181
+ },
182
+ {
183
+ "entropy": 1.2889379784464836,
184
+ "epoch": 1.206008583690987,
185
+ "grad_norm": 3.5511586666107178,
186
+ "learning_rate": 0.00019781476007338058,
187
+ "loss": 0.7942,
188
+ "mean_token_accuracy": 0.7587139792740345,
189
+ "num_tokens": 54445.0,
190
+ "step": 18
191
+ },
192
+ {
193
+ "entropy": 1.2650777027010918,
194
+ "epoch": 1.2746781115879828,
195
+ "grad_norm": 5.292742729187012,
196
+ "learning_rate": 0.00019510565162951537,
197
+ "loss": 0.9175,
198
+ "mean_token_accuracy": 0.7510803155601025,
199
+ "num_tokens": 56486.0,
200
+ "step": 19
201
+ },
202
+ {
203
+ "entropy": 1.0445934683084488,
204
+ "epoch": 1.3433476394849786,
205
+ "grad_norm": 4.494050025939941,
206
+ "learning_rate": 0.0001913545457642601,
207
+ "loss": 0.7743,
208
+ "mean_token_accuracy": 0.7553940825164318,
209
+ "num_tokens": 59598.0,
210
+ "step": 20
211
+ },
212
+ {
213
+ "entropy": 1.078294474631548,
214
+ "epoch": 1.4120171673819741,
215
+ "grad_norm": 4.316277027130127,
216
+ "learning_rate": 0.00018660254037844388,
217
+ "loss": 0.8193,
218
+ "mean_token_accuracy": 0.7530446909368038,
219
+ "num_tokens": 62856.0,
220
+ "step": 21
221
+ },
222
+ {
223
+ "entropy": 1.084236167371273,
224
+ "epoch": 1.48068669527897,
225
+ "grad_norm": 3.509615898132324,
226
+ "learning_rate": 0.00018090169943749476,
227
+ "loss": 0.7071,
228
+ "mean_token_accuracy": 0.7676379047334194,
229
+ "num_tokens": 66016.0,
230
+ "step": 22
231
+ },
232
+ {
233
+ "entropy": 1.0741066485643387,
234
+ "epoch": 1.5493562231759657,
235
+ "grad_norm": 3.072199821472168,
236
+ "learning_rate": 0.00017431448254773944,
237
+ "loss": 0.6951,
238
+ "mean_token_accuracy": 0.7804224453866482,
239
+ "num_tokens": 70035.0,
240
+ "step": 23
241
+ },
242
+ {
243
+ "entropy": 1.0479706078767776,
244
+ "epoch": 1.6180257510729614,
245
+ "grad_norm": 3.600970983505249,
246
+ "learning_rate": 0.00016691306063588583,
247
+ "loss": 0.8583,
248
+ "mean_token_accuracy": 0.7423711605370045,
249
+ "num_tokens": 73691.0,
250
+ "step": 24
251
+ },
252
+ {
253
+ "entropy": 0.9390149228274822,
254
+ "epoch": 1.6866952789699572,
255
+ "grad_norm": 3.461914539337158,
256
+ "learning_rate": 0.00015877852522924732,
257
+ "loss": 0.7094,
258
+ "mean_token_accuracy": 0.7819090783596039,
259
+ "num_tokens": 76935.0,
260
+ "step": 25
261
+ },
262
+ {
263
+ "entropy": 1.0067210085690022,
264
+ "epoch": 1.755364806866953,
265
+ "grad_norm": 3.8042051792144775,
266
+ "learning_rate": 0.00015000000000000001,
267
+ "loss": 0.7906,
268
+ "mean_token_accuracy": 0.7519370801746845,
269
+ "num_tokens": 80389.0,
270
+ "step": 26
271
+ },
272
+ {
273
+ "entropy": 1.0217487923800945,
274
+ "epoch": 1.8240343347639485,
275
+ "grad_norm": 3.956174373626709,
276
+ "learning_rate": 0.00014067366430758004,
277
+ "loss": 0.806,
278
+ "mean_token_accuracy": 0.7380720600485802,
279
+ "num_tokens": 83048.0,
280
+ "step": 27
281
+ },
282
+ {
283
+ "entropy": 1.019801527261734,
284
+ "epoch": 1.8927038626609443,
285
+ "grad_norm": 3.346088171005249,
286
+ "learning_rate": 0.00013090169943749476,
287
+ "loss": 0.6631,
288
+ "mean_token_accuracy": 0.7932046018540859,
289
+ "num_tokens": 86052.0,
290
+ "step": 28
291
+ },
292
+ {
293
+ "entropy": 1.1218348927795887,
294
+ "epoch": 1.9613733905579398,
295
+ "grad_norm": 3.5671541690826416,
296
+ "learning_rate": 0.00012079116908177593,
297
+ "loss": 0.7907,
298
+ "mean_token_accuracy": 0.7694815509021282,
299
+ "num_tokens": 88805.0,
300
+ "step": 29
301
+ },
302
+ {
303
+ "entropy": 1.0706661343574524,
304
+ "epoch": 2.0,
305
+ "grad_norm": 3.657078504562378,
306
+ "learning_rate": 0.00011045284632676536,
307
+ "loss": 0.6981,
308
+ "mean_token_accuracy": 0.7813876933521695,
309
+ "num_tokens": 90636.0,
310
+ "step": 30
311
+ },
312
+ {
313
+ "entropy": 0.8892248384654522,
314
+ "epoch": 2.0686695278969958,
315
+ "grad_norm": 2.3997206687927246,
316
+ "learning_rate": 0.0001,
317
+ "loss": 0.2915,
318
+ "mean_token_accuracy": 0.9392501749098301,
319
+ "num_tokens": 93457.0,
320
+ "step": 31
321
+ },
322
+ {
323
+ "entropy": 0.7985986340790987,
324
+ "epoch": 2.1373390557939915,
325
+ "grad_norm": 2.149660587310791,
326
+ "learning_rate": 8.954715367323468e-05,
327
+ "loss": 0.2838,
328
+ "mean_token_accuracy": 0.9058267325162888,
329
+ "num_tokens": 97113.0,
330
+ "step": 32
331
+ },
332
+ {
333
+ "entropy": 0.7246442474424839,
334
+ "epoch": 2.2060085836909873,
335
+ "grad_norm": 2.443434715270996,
336
+ "learning_rate": 7.920883091822408e-05,
337
+ "loss": 0.2394,
338
+ "mean_token_accuracy": 0.9322504326701164,
339
+ "num_tokens": 100135.0,
340
+ "step": 33
341
+ },
342
+ {
343
+ "entropy": 0.6230949554592371,
344
+ "epoch": 2.274678111587983,
345
+ "grad_norm": 2.339956283569336,
346
+ "learning_rate": 6.909830056250527e-05,
347
+ "loss": 0.2399,
348
+ "mean_token_accuracy": 0.9419549070298672,
349
+ "num_tokens": 103861.0,
350
+ "step": 34
351
+ },
352
+ {
353
+ "entropy": 0.5878649838268757,
354
+ "epoch": 2.3433476394849784,
355
+ "grad_norm": 2.7043843269348145,
356
+ "learning_rate": 5.9326335692419995e-05,
357
+ "loss": 0.2052,
358
+ "mean_token_accuracy": 0.9398090243339539,
359
+ "num_tokens": 106839.0,
360
+ "step": 35
361
+ },
362
+ {
363
+ "entropy": 0.5923287644982338,
364
+ "epoch": 2.412017167381974,
365
+ "grad_norm": 3.071157932281494,
366
+ "learning_rate": 5.000000000000002e-05,
367
+ "loss": 0.2369,
368
+ "mean_token_accuracy": 0.9214167520403862,
369
+ "num_tokens": 109733.0,
370
+ "step": 36
371
+ },
372
+ {
373
+ "entropy": 0.4931443203240633,
374
+ "epoch": 2.48068669527897,
375
+ "grad_norm": 3.6524157524108887,
376
+ "learning_rate": 4.12214747707527e-05,
377
+ "loss": 0.2197,
378
+ "mean_token_accuracy": 0.9345290660858154,
379
+ "num_tokens": 113006.0,
380
+ "step": 37
381
+ },
382
+ {
383
+ "entropy": 0.5408180318772793,
384
+ "epoch": 2.5493562231759657,
385
+ "grad_norm": 3.9559812545776367,
386
+ "learning_rate": 3.308693936411421e-05,
387
+ "loss": 0.2708,
388
+ "mean_token_accuracy": 0.9073256962001324,
389
+ "num_tokens": 115925.0,
390
+ "step": 38
391
+ },
392
+ {
393
+ "entropy": 0.47205063328146935,
394
+ "epoch": 2.6180257510729614,
395
+ "grad_norm": 3.9854965209960938,
396
+ "learning_rate": 2.5685517452260567e-05,
397
+ "loss": 0.228,
398
+ "mean_token_accuracy": 0.9301580972969532,
399
+ "num_tokens": 118763.0,
400
+ "step": 39
401
+ },
402
+ {
403
+ "entropy": 0.5000962279736996,
404
+ "epoch": 2.686695278969957,
405
+ "grad_norm": 2.9278979301452637,
406
+ "learning_rate": 1.9098300562505266e-05,
407
+ "loss": 0.252,
408
+ "mean_token_accuracy": 0.92122907564044,
409
+ "num_tokens": 122351.0,
410
+ "step": 40
411
+ }
412
+ ],
413
+ "logging_steps": 1,
414
+ "max_steps": 45,
415
+ "num_input_tokens_seen": 0,
416
+ "num_train_epochs": 3,
417
+ "save_steps": 10,
418
+ "stateful_callbacks": {
419
+ "TrainerControl": {
420
+ "args": {
421
+ "should_epoch_stop": false,
422
+ "should_evaluate": false,
423
+ "should_log": false,
424
+ "should_save": true,
425
+ "should_training_stop": false
426
+ },
427
+ "attributes": {}
428
+ }
429
+ },
430
+ "total_flos": 2432606934743040.0,
431
+ "train_batch_size": 1,
432
+ "trial_name": null,
433
+ "trial_params": null
434
+ }