FlameF0X committed on
Commit
9387d36
·
verified ·
1 Parent(s): f3ca47e

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scaler.pt +1 -1
  5. scheduler.pt +1 -1
  6. trainer_state.json +183 -43
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:adf1daa567e842d4a49a2a309c28335adcbc556ed387bd933791c5748955879c
3
  size 105084648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2038b7d186dcf5681c4d83de9768682356029a22590cad51687295c3994b772
3
  size 105084648
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:036dfbad9db93056477a452361b658411f43d32c9474a36e48ff0e618e0db69a
3
  size 210233675
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:593fb2c6e4858080c601758c9635d147ce263203bca363c3dfc105c7a402733c
3
  size 210233675
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:84be4ea5bd79d9a9d3711f20f0c9582c128f347dd9ed89e8c000bb14920e6f49
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:878ed4344f40441f51be4ac14226427439b6dc5bb61077e19d16d91c52b3678a
3
  size 14645
scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4aa03f6e0cd07cf67ce1fbe3101d545f5771ef9148b9debf02b11cf6948da5c
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0033c7745b46bdca3ecab5787678834ca68f7f7e1288869dceeb38812abc253
3
  size 1383
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0062eed7ddae26275a466fd49b27ea8db659c723946cdc1a97b53a778e040e27
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:caad6e0f583a07a0c98d029dca8047c09991d00e348a78110b876e78a7d60992
3
  size 1465
trainer_state.json CHANGED
@@ -2,152 +2,292 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.7229642395519171,
6
  "eval_steps": 500,
7
- "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.08616975441619991,
14
- "grad_norm": 1.3410744667053223,
15
  "learning_rate": 1.98e-05,
16
- "loss": 5.56121826171875,
17
  "step": 100
18
  },
19
  {
20
  "epoch": 0.17233950883239982,
21
- "grad_norm": 0.6071470379829407,
22
  "learning_rate": 3.979999999999999e-05,
23
- "loss": 3.912900390625,
24
  "step": 200
25
  },
26
  {
27
  "epoch": 0.25850926324859974,
28
- "grad_norm": 1.0832934379577637,
29
  "learning_rate": 5.98e-05,
30
- "loss": 3.399825134277344,
31
  "step": 300
32
  },
33
  {
34
  "epoch": 0.34467901766479964,
35
- "grad_norm": 0.8494643568992615,
36
  "learning_rate": 7.98e-05,
37
- "loss": 3.0080584716796874,
38
  "step": 400
39
  },
40
  {
41
  "epoch": 0.4308487720809996,
42
- "grad_norm": 0.9254368543624878,
43
  "learning_rate": 9.979999999999999e-05,
44
- "loss": 2.8078402709960937,
45
  "step": 500
46
  },
47
  {
48
  "epoch": 0.5170185264971995,
49
- "grad_norm": 2.038386821746826,
50
  "learning_rate": 0.00011979999999999998,
51
- "loss": 2.6626937866210936,
52
  "step": 600
53
  },
54
  {
55
  "epoch": 0.6031882809133994,
56
- "grad_norm": 2.6908981800079346,
57
  "learning_rate": 0.00013979999999999998,
58
- "loss": 2.5472840881347656,
59
  "step": 700
60
  },
61
  {
62
  "epoch": 0.6893580353295993,
63
- "grad_norm": 1.0315098762512207,
64
  "learning_rate": 0.00015979999999999998,
65
- "loss": 2.4246885681152346,
66
  "step": 800
67
  },
68
  {
69
  "epoch": 0.7755277897457993,
70
- "grad_norm": 0.698249101638794,
71
  "learning_rate": 0.0001798,
72
- "loss": 2.268210601806641,
73
  "step": 900
74
  },
75
  {
76
  "epoch": 0.8616975441619992,
77
- "grad_norm": 0.7050304412841797,
78
  "learning_rate": 0.0001998,
79
- "loss": 2.119554290771484,
80
  "step": 1000
81
  },
82
  {
83
  "epoch": 0.9478672985781991,
84
- "grad_norm": 0.6321768760681152,
85
  "learning_rate": 0.00021979999999999998,
86
- "loss": 2.0509904479980468,
87
  "step": 1100
88
  },
89
  {
90
  "epoch": 1.033606204222318,
91
- "grad_norm": 0.5764491558074951,
92
  "learning_rate": 0.00023979999999999997,
93
- "loss": 1.9791275024414063,
94
  "step": 1200
95
  },
96
  {
97
  "epoch": 1.1197759586385179,
98
- "grad_norm": 0.5116275548934937,
99
  "learning_rate": 0.00025979999999999997,
100
- "loss": 1.925416259765625,
101
  "step": 1300
102
  },
103
  {
104
  "epoch": 1.2059457130547178,
105
- "grad_norm": 0.5461997389793396,
106
  "learning_rate": 0.00027979999999999997,
107
- "loss": 1.8484840393066406,
108
  "step": 1400
109
  },
110
  {
111
  "epoch": 1.2921154674709177,
112
- "grad_norm": 0.4215202331542969,
113
  "learning_rate": 0.00029979999999999997,
114
- "loss": 1.8083718872070313,
115
  "step": 1500
116
  },
117
  {
118
  "epoch": 1.3782852218871176,
119
- "grad_norm": 0.4892556369304657,
120
  "learning_rate": 0.000299991068233357,
121
- "loss": 1.762831573486328,
122
  "step": 1600
123
  },
124
  {
125
  "epoch": 1.4644549763033177,
126
- "grad_norm": 0.4539775252342224,
127
  "learning_rate": 0.0002999639122316208,
128
- "loss": 1.7123506164550781,
129
  "step": 1700
130
  },
131
  {
132
  "epoch": 1.5506247307195173,
133
- "grad_norm": 0.4713730216026306,
134
  "learning_rate": 0.0002999185343831476,
135
- "loss": 1.6709410095214843,
136
  "step": 1800
137
  },
138
  {
139
  "epoch": 1.6367944851357175,
140
- "grad_norm": 0.46391186118125916,
141
  "learning_rate": 0.0002998549402017187,
142
- "loss": 1.6304544067382813,
143
  "step": 1900
144
  },
145
  {
146
  "epoch": 1.7229642395519171,
147
- "grad_norm": 0.3968624770641327,
148
  "learning_rate": 0.0002997731374145493,
149
- "loss": 1.5913111877441406,
150
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  }
152
  ],
153
  "logging_steps": 100,
@@ -167,7 +307,7 @@
167
  "attributes": {}
168
  }
169
  },
170
- "total_flos": 9914156127879168.0,
171
  "train_batch_size": 4,
172
  "trial_name": null,
173
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.4454976303317535,
6
  "eval_steps": 500,
7
+ "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.08616975441619991,
14
+ "grad_norm": 1.8284112215042114,
15
  "learning_rate": 1.98e-05,
16
+ "loss": 5.544659423828125,
17
  "step": 100
18
  },
19
  {
20
  "epoch": 0.17233950883239982,
21
+ "grad_norm": 0.5319015383720398,
22
  "learning_rate": 3.979999999999999e-05,
23
+ "loss": 3.90488037109375,
24
  "step": 200
25
  },
26
  {
27
  "epoch": 0.25850926324859974,
28
+ "grad_norm": 1.3977950811386108,
29
  "learning_rate": 5.98e-05,
30
+ "loss": 3.39756103515625,
31
  "step": 300
32
  },
33
  {
34
  "epoch": 0.34467901766479964,
35
+ "grad_norm": 1.9291573762893677,
36
  "learning_rate": 7.98e-05,
37
+ "loss": 3.019991149902344,
38
  "step": 400
39
  },
40
  {
41
  "epoch": 0.4308487720809996,
42
+ "grad_norm": 1.4095340967178345,
43
  "learning_rate": 9.979999999999999e-05,
44
+ "loss": 2.815445861816406,
45
  "step": 500
46
  },
47
  {
48
  "epoch": 0.5170185264971995,
49
+ "grad_norm": 3.2716641426086426,
50
  "learning_rate": 0.00011979999999999998,
51
+ "loss": 2.6590045166015623,
52
  "step": 600
53
  },
54
  {
55
  "epoch": 0.6031882809133994,
56
+ "grad_norm": 1.3838716745376587,
57
  "learning_rate": 0.00013979999999999998,
58
+ "loss": 2.543310089111328,
59
  "step": 700
60
  },
61
  {
62
  "epoch": 0.6893580353295993,
63
+ "grad_norm": 1.069161057472229,
64
  "learning_rate": 0.00015979999999999998,
65
+ "loss": 2.396273651123047,
66
  "step": 800
67
  },
68
  {
69
  "epoch": 0.7755277897457993,
70
+ "grad_norm": 0.8585665822029114,
71
  "learning_rate": 0.0001798,
72
+ "loss": 2.242165985107422,
73
  "step": 900
74
  },
75
  {
76
  "epoch": 0.8616975441619992,
77
+ "grad_norm": 0.7467069625854492,
78
  "learning_rate": 0.0001998,
79
+ "loss": 2.1027012634277344,
80
  "step": 1000
81
  },
82
  {
83
  "epoch": 0.9478672985781991,
84
+ "grad_norm": 0.5805935859680176,
85
  "learning_rate": 0.00021979999999999998,
86
+ "loss": 2.037454376220703,
87
  "step": 1100
88
  },
89
  {
90
  "epoch": 1.033606204222318,
91
+ "grad_norm": 0.5948718786239624,
92
  "learning_rate": 0.00023979999999999997,
93
+ "loss": 1.9681085205078126,
94
  "step": 1200
95
  },
96
  {
97
  "epoch": 1.1197759586385179,
98
+ "grad_norm": 0.5413378477096558,
99
  "learning_rate": 0.00025979999999999997,
100
+ "loss": 1.9135774230957032,
101
  "step": 1300
102
  },
103
  {
104
  "epoch": 1.2059457130547178,
105
+ "grad_norm": 0.5196030139923096,
106
  "learning_rate": 0.00027979999999999997,
107
+ "loss": 1.8392716979980468,
108
  "step": 1400
109
  },
110
  {
111
  "epoch": 1.2921154674709177,
112
+ "grad_norm": 0.49619364738464355,
113
  "learning_rate": 0.00029979999999999997,
114
+ "loss": 1.8049734497070313,
115
  "step": 1500
116
  },
117
  {
118
  "epoch": 1.3782852218871176,
119
+ "grad_norm": 0.44414839148521423,
120
  "learning_rate": 0.000299991068233357,
121
+ "loss": 1.7638165283203124,
122
  "step": 1600
123
  },
124
  {
125
  "epoch": 1.4644549763033177,
126
+ "grad_norm": 0.46444711089134216,
127
  "learning_rate": 0.0002999639122316208,
128
+ "loss": 1.7137832641601562,
129
  "step": 1700
130
  },
131
  {
132
  "epoch": 1.5506247307195173,
133
+ "grad_norm": 0.5176238417625427,
134
  "learning_rate": 0.0002999185343831476,
135
+ "loss": 1.675589599609375,
136
  "step": 1800
137
  },
138
  {
139
  "epoch": 1.6367944851357175,
140
+ "grad_norm": 0.4177858829498291,
141
  "learning_rate": 0.0002998549402017187,
142
+ "loss": 1.6349491882324219,
143
  "step": 1900
144
  },
145
  {
146
  "epoch": 1.7229642395519171,
147
+ "grad_norm": 0.42198434472084045,
148
  "learning_rate": 0.0002997731374145493,
149
+ "loss": 1.596505126953125,
150
  "step": 2000
151
+ },
152
+ {
153
+ "epoch": 1.8091339939681172,
154
+ "grad_norm": 0.4523915946483612,
155
+ "learning_rate": 0.0002996731359613498,
156
+ "loss": 1.5908058166503907,
157
+ "step": 2100
158
+ },
159
+ {
160
+ "epoch": 1.8953037483843171,
161
+ "grad_norm": 0.3901713788509369,
162
+ "learning_rate": 0.0002995549479931178,
163
+ "loss": 1.5610142517089844,
164
+ "step": 2200
165
+ },
166
+ {
167
+ "epoch": 1.981473502800517,
168
+ "grad_norm": 0.41816478967666626,
169
+ "learning_rate": 0.00029941858787066206,
170
+ "loss": 1.5319706726074218,
171
+ "step": 2300
172
+ },
173
+ {
174
+ "epoch": 2.067212408444636,
175
+ "grad_norm": 0.3872755765914917,
176
+ "learning_rate": 0.00029926407216285706,
177
+ "loss": 1.5055549621582032,
178
+ "step": 2400
179
+ },
180
+ {
181
+ "epoch": 2.1533821628608356,
182
+ "grad_norm": 0.4193103611469269,
183
+ "learning_rate": 0.0002990914196446301,
184
+ "loss": 1.4792218017578125,
185
+ "step": 2500
186
+ },
187
+ {
188
+ "epoch": 2.2395519172770357,
189
+ "grad_norm": 0.4024358093738556,
190
+ "learning_rate": 0.00029890065129467986,
191
+ "loss": 1.4786280822753906,
192
+ "step": 2600
193
+ },
194
+ {
195
+ "epoch": 2.325721671693236,
196
+ "grad_norm": 0.37588468194007874,
197
+ "learning_rate": 0.0002986917902929273,
198
+ "loss": 1.4545697021484374,
199
+ "step": 2700
200
+ },
201
+ {
202
+ "epoch": 2.4118914261094355,
203
+ "grad_norm": 0.39736974239349365,
204
+ "learning_rate": 0.0002984648620176991,
205
+ "loss": 1.4498170471191407,
206
+ "step": 2800
207
+ },
208
+ {
209
+ "epoch": 2.4980611805256356,
210
+ "grad_norm": 0.42380592226982117,
211
+ "learning_rate": 0.00029821989404264424,
212
+ "loss": 1.4262150573730468,
213
+ "step": 2900
214
+ },
215
+ {
216
+ "epoch": 2.5842309349418353,
217
+ "grad_norm": 0.411803662776947,
218
+ "learning_rate": 0.00029795691613338307,
219
+ "loss": 1.417086181640625,
220
+ "step": 3000
221
+ },
222
+ {
223
+ "epoch": 2.6704006893580354,
224
+ "grad_norm": 0.3662901818752289,
225
+ "learning_rate": 0.000297675960243891,
226
+ "loss": 1.3942941284179688,
227
+ "step": 3100
228
+ },
229
+ {
230
+ "epoch": 2.756570443774235,
231
+ "grad_norm": 0.3642771244049072,
232
+ "learning_rate": 0.00029737706051261557,
233
+ "loss": 1.38471923828125,
234
+ "step": 3200
235
+ },
236
+ {
237
+ "epoch": 2.842740198190435,
238
+ "grad_norm": 0.4138600826263428,
239
+ "learning_rate": 0.00029706025325832857,
240
+ "loss": 1.3765927124023438,
241
+ "step": 3300
242
+ },
243
+ {
244
+ "epoch": 2.9289099526066353,
245
+ "grad_norm": 0.3687536418437958,
246
+ "learning_rate": 0.0002967255769757127,
247
+ "loss": 1.3617820739746094,
248
+ "step": 3400
249
+ },
250
+ {
251
+ "epoch": 3.014648858250754,
252
+ "grad_norm": 0.3252148926258087,
253
+ "learning_rate": 0.0002963730723306845,
254
+ "loss": 1.3490205383300782,
255
+ "step": 3500
256
+ },
257
+ {
258
+ "epoch": 3.100818612666954,
259
+ "grad_norm": 0.3874260187149048,
260
+ "learning_rate": 0.0002960027821554529,
261
+ "loss": 1.3380169677734375,
262
+ "step": 3600
263
+ },
264
+ {
265
+ "epoch": 3.1869883670831536,
266
+ "grad_norm": 0.37778887152671814,
267
+ "learning_rate": 0.00029561475144331467,
268
+ "loss": 1.3190237426757812,
269
+ "step": 3700
270
+ },
271
+ {
272
+ "epoch": 3.2731581214993537,
273
+ "grad_norm": 0.37266016006469727,
274
+ "learning_rate": 0.00029520902734318766,
275
+ "loss": 1.313209991455078,
276
+ "step": 3800
277
+ },
278
+ {
279
+ "epoch": 3.359327875915554,
280
+ "grad_norm": 0.3792646527290344,
281
+ "learning_rate": 0.00029478565915388153,
282
+ "loss": 1.3055996704101562,
283
+ "step": 3900
284
+ },
285
+ {
286
+ "epoch": 3.4454976303317535,
287
+ "grad_norm": 0.3583495318889618,
288
+ "learning_rate": 0.00029434469831810764,
289
+ "loss": 1.301021728515625,
290
+ "step": 4000
291
  }
292
  ],
293
  "logging_steps": 100,
 
307
  "attributes": {}
308
  }
309
  },
310
+ "total_flos": 1.9825523114901504e+16,
311
  "train_batch_size": 4,
312
  "trial_name": null,
313
  "trial_params": null