aixk committed on
Commit
6abcfd0
·
1 Parent(s): 0c9c3fb

Upload folder using huggingface_hub

Browse files
shared/checkpoints/latest/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_dropout": 0.0,
13
+ "hidden_size": 1280,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 5120,
16
+ "max_position_embeddings": 128,
17
+ "mlp_bias": false,
18
+ "model_type": "llama",
19
+ "num_attention_heads": 20,
20
+ "num_hidden_layers": 20,
21
+ "num_key_value_heads": 5,
22
+ "pad_token_id": 0,
23
+ "pretraining_tp": 1,
24
+ "rms_norm_eps": 1e-06,
25
+ "rope_parameters": {
26
+ "rope_theta": 10000.0,
27
+ "rope_type": "default"
28
+ },
29
+ "tie_word_embeddings": false,
30
+ "transformers_version": "5.0.0",
31
+ "use_cache": false,
32
+ "vocab_size": 32000
33
+ }
shared/checkpoints/latest/generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "pad_token_id": 0,
8
+ "transformers_version": "5.0.0",
9
+ "use_cache": false
10
+ }
shared/checkpoints/latest/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4311470cef32c1db5bc50c12b71d7076de73af2c4e68bc64e1faa26a574b6be7
3
+ size 2228454760
shared/checkpoints/latest/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51eab96e89995e62448c2bab1f28c327ff9df2e722666bf96aac0b55349e22b9
3
+ size 373040459
shared/checkpoints/latest/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01f9a0f7843a37be87edd23f4e88aa93b38b95cc2c07503eeb1cf2e4632453a2
3
+ size 14645
shared/checkpoints/latest/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:850c3d909f8a0af6f9b431fac5a25833ab1658c39f899825e3b347b6af8a490b
3
+ size 1383
shared/checkpoints/latest/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:226108bf7e5c19e39ac121293561fcf99628514e7bf5811de63e81d47d460150
3
+ size 1465
shared/checkpoints/latest/trainer_state.json ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.53715507343917,
6
+ "eval_steps": 500,
7
+ "global_step": 700,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.011862835959221501,
14
+ "grad_norm": 1.0314325094223022,
15
+ "learning_rate": 5e-05,
16
+ "loss": 10.542851448059082,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.23725671918443003,
21
+ "grad_norm": 1.70680832862854,
22
+ "learning_rate": 4.9992874484134653e-05,
23
+ "loss": 9.478467439350329,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.47451343836886006,
28
+ "grad_norm": 0.7485072016716003,
29
+ "learning_rate": 4.996998267226905e-05,
30
+ "loss": 7.760343933105469,
31
+ "step": 40
32
+ },
33
+ {
34
+ "epoch": 0.7117701575532901,
35
+ "grad_norm": 0.5104997754096985,
36
+ "learning_rate": 4.993131928415602e-05,
37
+ "loss": 7.206849670410156,
38
+ "step": 60
39
+ },
40
+ {
41
+ "epoch": 0.9490268767377201,
42
+ "grad_norm": 0.46393775939941406,
43
+ "learning_rate": 4.9876908740420175e-05,
44
+ "loss": 7.096773529052735,
45
+ "step": 80
46
+ },
47
+ {
48
+ "epoch": 1.1779425393883225,
49
+ "grad_norm": 0.447471559047699,
50
+ "learning_rate": 4.980678540792715e-05,
51
+ "loss": 7.068167877197266,
52
+ "step": 100
53
+ },
54
+ {
55
+ "epoch": 1.4151992585727524,
56
+ "grad_norm": 0.5010190606117249,
57
+ "learning_rate": 4.972099357807671e-05,
58
+ "loss": 7.053585815429687,
59
+ "step": 120
60
+ },
61
+ {
62
+ "epoch": 1.6524559777571826,
63
+ "grad_norm": 0.7706360816955566,
64
+ "learning_rate": 4.961958743882742e-05,
65
+ "loss": 7.03430404663086,
66
+ "step": 140
67
+ },
68
+ {
69
+ "epoch": 1.8897126969416127,
70
+ "grad_norm": 0.5128363370895386,
71
+ "learning_rate": 4.950263104047031e-05,
72
+ "loss": 7.022041320800781,
73
+ "step": 160
74
+ },
75
+ {
76
+ "epoch": 2.118628359592215,
77
+ "grad_norm": 0.705746054649353,
78
+ "learning_rate": 4.937019825517333e-05,
79
+ "loss": 6.9862548828125,
80
+ "step": 180
81
+ },
82
+ {
83
+ "epoch": 2.355885078776645,
84
+ "grad_norm": 0.8337900042533875,
85
+ "learning_rate": 4.9222372730322176e-05,
86
+ "loss": 6.937237548828125,
87
+ "step": 200
88
+ },
89
+ {
90
+ "epoch": 1.5406769433320533,
91
+ "grad_norm": 0.5159856081008911,
92
+ "learning_rate": 4.568695539880615e-05,
93
+ "loss": 6.9568915367126465,
94
+ "step": 220
95
+ },
96
+ {
97
+ "epoch": 1.6811125130286906,
98
+ "grad_norm": 0.4523787200450897,
99
+ "learning_rate": 4.489238055764833e-05,
100
+ "loss": 6.900994873046875,
101
+ "step": 240
102
+ },
103
+ {
104
+ "epoch": 1.8215480827253279,
105
+ "grad_norm": 0.48097074031829834,
106
+ "learning_rate": 4.4038849773874356e-05,
107
+ "loss": 6.876528930664063,
108
+ "step": 260
109
+ },
110
+ {
111
+ "epoch": 1.961983652421965,
112
+ "grad_norm": 0.5671383738517761,
113
+ "learning_rate": 4.3128892695042654e-05,
114
+ "loss": 6.83782730102539,
115
+ "step": 280
116
+ },
117
+ {
118
+ "epoch": 2.0983048987876463,
119
+ "grad_norm": 0.6388454437255859,
120
+ "learning_rate": 4.2165206201859265e-05,
121
+ "loss": 6.788776397705078,
122
+ "step": 300
123
+ },
124
+ {
125
+ "epoch": 2.2387404684842833,
126
+ "grad_norm": 0.5785893201828003,
127
+ "learning_rate": 4.115064641531117e-05,
128
+ "loss": 6.7587730407714846,
129
+ "step": 320
130
+ },
131
+ {
132
+ "epoch": 2.3791760381809204,
133
+ "grad_norm": 0.4917908012866974,
134
+ "learning_rate": 4.008822023185218e-05,
135
+ "loss": 6.704537200927734,
136
+ "step": 340
137
+ },
138
+ {
139
+ "epoch": 2.519611607877558,
140
+ "grad_norm": 0.607780396938324,
141
+ "learning_rate": 3.898107641172868e-05,
142
+ "loss": 6.676227569580078,
143
+ "step": 360
144
+ },
145
+ {
146
+ "epoch": 2.660047177574195,
147
+ "grad_norm": 0.45474570989608765,
148
+ "learning_rate": 3.783249624685734e-05,
149
+ "loss": 6.64794692993164,
150
+ "step": 380
151
+ },
152
+ {
153
+ "epoch": 2.8004827472708325,
154
+ "grad_norm": 0.49765315651893616,
155
+ "learning_rate": 3.6645883835912714e-05,
156
+ "loss": 6.635832214355469,
157
+ "step": 400
158
+ },
159
+ {
160
+ "epoch": 2.9409183169674695,
161
+ "grad_norm": 0.5915655493736267,
162
+ "learning_rate": 3.542475599544699e-05,
163
+ "loss": 6.623738098144531,
164
+ "step": 420
165
+ },
166
+ {
167
+ "epoch": 3.0772395633331504,
168
+ "grad_norm": 0.6189742684364319,
169
+ "learning_rate": 3.417273183694259e-05,
170
+ "loss": 6.541598510742188,
171
+ "step": 440
172
+ },
173
+ {
174
+ "epoch": 3.217675133029788,
175
+ "grad_norm": 0.6115455627441406,
176
+ "learning_rate": 3.289352204068886e-05,
177
+ "loss": 6.4810935974121096,
178
+ "step": 460
179
+ },
180
+ {
181
+ "epoch": 3.358110702726425,
182
+ "grad_norm": 0.5563525557518005,
183
+ "learning_rate": 3.1590917858271966e-05,
184
+ "loss": 6.469013214111328,
185
+ "step": 480
186
+ },
187
+ {
188
+ "epoch": 3.498546272423062,
189
+ "grad_norm": 0.5592976212501526,
190
+ "learning_rate": 3.0268779876272162e-05,
191
+ "loss": 6.469371032714844,
192
+ "step": 500
193
+ },
194
+ {
195
+ "epoch": 3.6389818421196996,
196
+ "grad_norm": 0.6607327461242676,
197
+ "learning_rate": 2.893102657446976e-05,
198
+ "loss": 6.4586669921875,
199
+ "step": 520
200
+ },
201
+ {
202
+ "epoch": 3.7794174118163366,
203
+ "grad_norm": 0.5511732697486877,
204
+ "learning_rate": 2.7581622712470417e-05,
205
+ "loss": 6.438571929931641,
206
+ "step": 540
207
+ },
208
+ {
209
+ "epoch": 3.9198529815129737,
210
+ "grad_norm": 0.694491446018219,
211
+ "learning_rate": 2.6224567579168897e-05,
212
+ "loss": 6.430049133300781,
213
+ "step": 560
214
+ },
215
+ {
216
+ "epoch": 4.056174227878655,
217
+ "grad_norm": 0.5763441324234009,
218
+ "learning_rate": 2.4863883139876677e-05,
219
+ "loss": 6.385451889038086,
220
+ "step": 580
221
+ },
222
+ {
223
+ "epoch": 3.0304050041569344,
224
+ "grad_norm": 0.603993833065033,
225
+ "learning_rate": 2.35036021162426e-05,
226
+ "loss": 6.4556640625,
227
+ "step": 600
228
+ },
229
+ {
230
+ "epoch": 3.1317550180133815,
231
+ "grad_norm": 0.7310755252838135,
232
+ "learning_rate": 2.214775603429435e-05,
233
+ "loss": 6.486899566650391,
234
+ "step": 620
235
+ },
236
+ {
237
+ "epoch": 3.2331050318698287,
238
+ "grad_norm": 0.7162438631057739,
239
+ "learning_rate": 2.0800363276023586e-05,
240
+ "loss": 6.448640441894531,
241
+ "step": 640
242
+ },
243
+ {
244
+ "epoch": 3.334455045726276,
245
+ "grad_norm": 0.6547061800956726,
246
+ "learning_rate": 1.9465417169926507e-05,
247
+ "loss": 6.455082702636719,
248
+ "step": 660
249
+ },
250
+ {
251
+ "epoch": 3.435805059582723,
252
+ "grad_norm": 0.6867275834083557,
253
+ "learning_rate": 1.8146874155796643e-05,
254
+ "loss": 6.440997314453125,
255
+ "step": 680
256
+ },
257
+ {
258
+ "epoch": 3.53715507343917,
259
+ "grad_norm": 0.6481226682662964,
260
+ "learning_rate": 1.6848642058846426e-05,
261
+ "loss": 6.447025299072266,
262
+ "step": 700
263
+ }
264
+ ],
265
+ "logging_steps": 20,
266
+ "max_steps": 1154,
267
+ "num_input_tokens_seen": 0,
268
+ "num_train_epochs": 6,
269
+ "save_steps": 1000000000,
270
+ "stateful_callbacks": {
271
+ "TrainerControl": {
272
+ "args": {
273
+ "should_epoch_stop": false,
274
+ "should_evaluate": false,
275
+ "should_log": false,
276
+ "should_save": true,
277
+ "should_training_stop": true
278
+ },
279
+ "attributes": {}
280
+ }
281
+ },
282
+ "total_flos": 3.53570793013248e+16,
283
+ "train_batch_size": 1,
284
+ "trial_name": null,
285
+ "trial_params": null
286
+ }