belal271 commited on
Commit
044baa2
·
verified ·
1 Parent(s): 8c1a330

Upload trainer_state.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. trainer_state.json +423 -0
trainer_state.json ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.9826119126896042,
5
+ "eval_steps": 100,
6
+ "global_step": 504,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.059193488716241215,
13
+ "grad_norm": 0.19465851783752441,
14
+ "learning_rate": 2e-05,
15
+ "loss": 1.1117,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.11838697743248243,
20
+ "grad_norm": 0.21135994791984558,
21
+ "learning_rate": 4e-05,
22
+ "loss": 1.1022,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.17758046614872364,
27
+ "grad_norm": 0.2708509862422943,
28
+ "learning_rate": 6e-05,
29
+ "loss": 1.0546,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.23677395486496486,
34
+ "grad_norm": 0.21392174065113068,
35
+ "learning_rate": 8e-05,
36
+ "loss": 0.94,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.2959674435812061,
41
+ "grad_norm": 0.17460189759731293,
42
+ "learning_rate": 0.0001,
43
+ "loss": 0.8246,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.3551609322974473,
48
+ "grad_norm": 0.16554006934165955,
49
+ "learning_rate": 0.00012,
50
+ "loss": 0.7437,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.41435442101368847,
55
+ "grad_norm": 0.16454806923866272,
56
+ "learning_rate": 0.00014,
57
+ "loss": 0.6901,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.4735479097299297,
62
+ "grad_norm": 0.1758105754852295,
63
+ "learning_rate": 0.00016,
64
+ "loss": 0.6429,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.532741398446171,
69
+ "grad_norm": 0.1696159690618515,
70
+ "learning_rate": 0.00018,
71
+ "loss": 0.6319,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.5919348871624122,
76
+ "grad_norm": 0.17172178626060486,
77
+ "learning_rate": 0.0002,
78
+ "loss": 0.5995,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.5919348871624122,
83
+ "eval_loss": 0.5879706740379333,
84
+ "eval_runtime": 186.3324,
85
+ "eval_samples_per_second": 3.687,
86
+ "eval_steps_per_second": 0.462,
87
+ "step": 100
88
+ },
89
+ {
90
+ "epoch": 0.6511283758786534,
91
+ "grad_norm": 0.16957880556583405,
92
+ "learning_rate": 0.00019504950495049505,
93
+ "loss": 0.5913,
94
+ "step": 110
95
+ },
96
+ {
97
+ "epoch": 0.7103218645948945,
98
+ "grad_norm": 0.17516390979290009,
99
+ "learning_rate": 0.0001900990099009901,
100
+ "loss": 0.5803,
101
+ "step": 120
102
+ },
103
+ {
104
+ "epoch": 0.7695153533111357,
105
+ "grad_norm": 0.18869660794734955,
106
+ "learning_rate": 0.00018514851485148517,
107
+ "loss": 0.5631,
108
+ "step": 130
109
+ },
110
+ {
111
+ "epoch": 0.8287088420273769,
112
+ "grad_norm": 0.18655870854854584,
113
+ "learning_rate": 0.00018019801980198022,
114
+ "loss": 0.5538,
115
+ "step": 140
116
+ },
117
+ {
118
+ "epoch": 0.8879023307436182,
119
+ "grad_norm": 0.18268819153308868,
120
+ "learning_rate": 0.00017524752475247526,
121
+ "loss": 0.5475,
122
+ "step": 150
123
+ },
124
+ {
125
+ "epoch": 0.9470958194598594,
126
+ "grad_norm": 0.1926167607307434,
127
+ "learning_rate": 0.0001702970297029703,
128
+ "loss": 0.5462,
129
+ "step": 160
130
+ },
131
+ {
132
+ "epoch": 1.005919348871624,
133
+ "grad_norm": 0.19758427143096924,
134
+ "learning_rate": 0.00016534653465346535,
135
+ "loss": 0.5375,
136
+ "step": 170
137
+ },
138
+ {
139
+ "epoch": 1.0651128375878653,
140
+ "grad_norm": 0.19379234313964844,
141
+ "learning_rate": 0.00016039603960396042,
142
+ "loss": 0.5234,
143
+ "step": 180
144
+ },
145
+ {
146
+ "epoch": 1.1243063263041067,
147
+ "grad_norm": 0.19841700792312622,
148
+ "learning_rate": 0.00015544554455445547,
149
+ "loss": 0.5336,
150
+ "step": 190
151
+ },
152
+ {
153
+ "epoch": 1.1834998150203477,
154
+ "grad_norm": 0.19659265875816345,
155
+ "learning_rate": 0.00015049504950495051,
156
+ "loss": 0.5265,
157
+ "step": 200
158
+ },
159
+ {
160
+ "epoch": 1.1834998150203477,
161
+ "eval_loss": 0.5296523571014404,
162
+ "eval_runtime": 186.0309,
163
+ "eval_samples_per_second": 3.693,
164
+ "eval_steps_per_second": 0.462,
165
+ "step": 200
166
+ },
167
+ {
168
+ "epoch": 1.242693303736589,
169
+ "grad_norm": 0.2189786285161972,
170
+ "learning_rate": 0.00014554455445544556,
171
+ "loss": 0.5228,
172
+ "step": 210
173
+ },
174
+ {
175
+ "epoch": 1.3018867924528301,
176
+ "grad_norm": 0.2317192554473877,
177
+ "learning_rate": 0.0001405940594059406,
178
+ "loss": 0.5219,
179
+ "step": 220
180
+ },
181
+ {
182
+ "epoch": 1.3610802811690714,
183
+ "grad_norm": 0.20830461382865906,
184
+ "learning_rate": 0.00013564356435643565,
185
+ "loss": 0.5095,
186
+ "step": 230
187
+ },
188
+ {
189
+ "epoch": 1.4202737698853127,
190
+ "grad_norm": 0.2214994877576828,
191
+ "learning_rate": 0.0001306930693069307,
192
+ "loss": 0.5192,
193
+ "step": 240
194
+ },
195
+ {
196
+ "epoch": 1.4794672586015538,
197
+ "grad_norm": 0.21869614720344543,
198
+ "learning_rate": 0.00012574257425742574,
199
+ "loss": 0.5031,
200
+ "step": 250
201
+ },
202
+ {
203
+ "epoch": 1.538660747317795,
204
+ "grad_norm": 0.21614821255207062,
205
+ "learning_rate": 0.0001207920792079208,
206
+ "loss": 0.5177,
207
+ "step": 260
208
+ },
209
+ {
210
+ "epoch": 1.5978542360340362,
211
+ "grad_norm": 0.2306758016347885,
212
+ "learning_rate": 0.00011584158415841584,
213
+ "loss": 0.5124,
214
+ "step": 270
215
+ },
216
+ {
217
+ "epoch": 1.6570477247502775,
218
+ "grad_norm": 0.22710908949375153,
219
+ "learning_rate": 0.0001108910891089109,
220
+ "loss": 0.5025,
221
+ "step": 280
222
+ },
223
+ {
224
+ "epoch": 1.7162412134665188,
225
+ "grad_norm": 0.22501707077026367,
226
+ "learning_rate": 0.00010594059405940595,
227
+ "loss": 0.5048,
228
+ "step": 290
229
+ },
230
+ {
231
+ "epoch": 1.77543470218276,
232
+ "grad_norm": 0.22507990896701813,
233
+ "learning_rate": 0.00010099009900990099,
234
+ "loss": 0.5009,
235
+ "step": 300
236
+ },
237
+ {
238
+ "epoch": 1.77543470218276,
239
+ "eval_loss": 0.508837103843689,
240
+ "eval_runtime": 185.3987,
241
+ "eval_samples_per_second": 3.706,
242
+ "eval_steps_per_second": 0.464,
243
+ "step": 300
244
+ },
245
+ {
246
+ "epoch": 1.834628190899001,
247
+ "grad_norm": 0.22626681625843048,
248
+ "learning_rate": 9.603960396039604e-05,
249
+ "loss": 0.5045,
250
+ "step": 310
251
+ },
252
+ {
253
+ "epoch": 1.8938216796152423,
254
+ "grad_norm": 0.21802489459514618,
255
+ "learning_rate": 9.10891089108911e-05,
256
+ "loss": 0.5087,
257
+ "step": 320
258
+ },
259
+ {
260
+ "epoch": 1.9530151683314836,
261
+ "grad_norm": 0.2331380993127823,
262
+ "learning_rate": 8.613861386138614e-05,
263
+ "loss": 0.5038,
264
+ "step": 330
265
+ },
266
+ {
267
+ "epoch": 2.011838697743248,
268
+ "grad_norm": 0.21599650382995605,
269
+ "learning_rate": 8.11881188118812e-05,
270
+ "loss": 0.4912,
271
+ "step": 340
272
+ },
273
+ {
274
+ "epoch": 2.0710321864594894,
275
+ "grad_norm": 0.24118073284626007,
276
+ "learning_rate": 7.623762376237625e-05,
277
+ "loss": 0.4842,
278
+ "step": 350
279
+ },
280
+ {
281
+ "epoch": 2.1302256751757307,
282
+ "grad_norm": 0.232917919754982,
283
+ "learning_rate": 7.128712871287129e-05,
284
+ "loss": 0.4835,
285
+ "step": 360
286
+ },
287
+ {
288
+ "epoch": 2.189419163891972,
289
+ "grad_norm": 0.23639123141765594,
290
+ "learning_rate": 6.633663366336635e-05,
291
+ "loss": 0.4786,
292
+ "step": 370
293
+ },
294
+ {
295
+ "epoch": 2.2486126526082133,
296
+ "grad_norm": 0.2632993161678314,
297
+ "learning_rate": 6.13861386138614e-05,
298
+ "loss": 0.4824,
299
+ "step": 380
300
+ },
301
+ {
302
+ "epoch": 2.307806141324454,
303
+ "grad_norm": 0.2573336064815521,
304
+ "learning_rate": 5.643564356435643e-05,
305
+ "loss": 0.4771,
306
+ "step": 390
307
+ },
308
+ {
309
+ "epoch": 2.3669996300406955,
310
+ "grad_norm": 0.24001090228557587,
311
+ "learning_rate": 5.148514851485149e-05,
312
+ "loss": 0.4848,
313
+ "step": 400
314
+ },
315
+ {
316
+ "epoch": 2.3669996300406955,
317
+ "eval_loss": 0.4998326301574707,
318
+ "eval_runtime": 186.3844,
319
+ "eval_samples_per_second": 3.686,
320
+ "eval_steps_per_second": 0.461,
321
+ "step": 400
322
+ },
323
+ {
324
+ "epoch": 2.4261931187569368,
325
+ "grad_norm": 0.2617679834365845,
326
+ "learning_rate": 4.653465346534654e-05,
327
+ "loss": 0.4854,
328
+ "step": 410
329
+ },
330
+ {
331
+ "epoch": 2.485386607473178,
332
+ "grad_norm": 0.2378547489643097,
333
+ "learning_rate": 4.158415841584158e-05,
334
+ "loss": 0.4848,
335
+ "step": 420
336
+ },
337
+ {
338
+ "epoch": 2.544580096189419,
339
+ "grad_norm": 0.2498575747013092,
340
+ "learning_rate": 3.6633663366336634e-05,
341
+ "loss": 0.4768,
342
+ "step": 430
343
+ },
344
+ {
345
+ "epoch": 2.6037735849056602,
346
+ "grad_norm": 0.2568998336791992,
347
+ "learning_rate": 3.1683168316831686e-05,
348
+ "loss": 0.4863,
349
+ "step": 440
350
+ },
351
+ {
352
+ "epoch": 2.6629670736219015,
353
+ "grad_norm": 0.25008225440979004,
354
+ "learning_rate": 2.6732673267326734e-05,
355
+ "loss": 0.4747,
356
+ "step": 450
357
+ },
358
+ {
359
+ "epoch": 2.722160562338143,
360
+ "grad_norm": 0.24171195924282074,
361
+ "learning_rate": 2.1782178217821783e-05,
362
+ "loss": 0.4736,
363
+ "step": 460
364
+ },
365
+ {
366
+ "epoch": 2.781354051054384,
367
+ "grad_norm": 0.2468671202659607,
368
+ "learning_rate": 1.6831683168316834e-05,
369
+ "loss": 0.4894,
370
+ "step": 470
371
+ },
372
+ {
373
+ "epoch": 2.8405475397706255,
374
+ "grad_norm": 0.2496197372674942,
375
+ "learning_rate": 1.1881188118811881e-05,
376
+ "loss": 0.4798,
377
+ "step": 480
378
+ },
379
+ {
380
+ "epoch": 2.8997410284868663,
381
+ "grad_norm": 0.24788333475589752,
382
+ "learning_rate": 6.9306930693069314e-06,
383
+ "loss": 0.4739,
384
+ "step": 490
385
+ },
386
+ {
387
+ "epoch": 2.9589345172031076,
388
+ "grad_norm": 0.2498323768377304,
389
+ "learning_rate": 1.9801980198019803e-06,
390
+ "loss": 0.4718,
391
+ "step": 500
392
+ },
393
+ {
394
+ "epoch": 2.9589345172031076,
395
+ "eval_loss": 0.49539193511009216,
396
+ "eval_runtime": 185.1827,
397
+ "eval_samples_per_second": 3.71,
398
+ "eval_steps_per_second": 0.464,
399
+ "step": 500
400
+ }
401
+ ],
402
+ "logging_steps": 10,
403
+ "max_steps": 504,
404
+ "num_input_tokens_seen": 0,
405
+ "num_train_epochs": 3,
406
+ "save_steps": 500,
407
+ "stateful_callbacks": {
408
+ "TrainerControl": {
409
+ "args": {
410
+ "should_epoch_stop": false,
411
+ "should_evaluate": false,
412
+ "should_log": false,
413
+ "should_save": true,
414
+ "should_training_stop": true
415
+ },
416
+ "attributes": {}
417
+ }
418
+ },
419
+ "total_flos": 9.23509345222656e+16,
420
+ "train_batch_size": 1,
421
+ "trial_name": null,
422
+ "trial_params": null
423
+ }