ChiefTheLord commited on
Commit
6228e7c
·
verified ·
1 Parent(s): 00b6eba

Upload folder using huggingface_hub

Browse files
checkpoints-v4.1-discrete-conditional/checkpoint-1792/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:348def0448a17fc33dfd58e269365b3a7518263e219bda91f83e9de110663795
3
+ size 24416696
checkpoints-v4.1-discrete-conditional/checkpoint-1792/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fccede8ce292e8b16ad8a02eafd56aba6f2498411f1075b01c8a3df4b4701cd
3
+ size 816907
checkpoints-v4.1-discrete-conditional/checkpoint-1792/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d33ede87b7e42c95f7d6d7faccd1e35edd1370f62e10350592062c54fbbf34bf
3
+ size 14645
checkpoints-v4.1-discrete-conditional/checkpoint-1792/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a42e33465160c5ace903c63375f0694cfab8943854b6c37c46848f754e8871c0
3
+ size 1383
checkpoints-v4.1-discrete-conditional/checkpoint-1792/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:970abb983a5a0197c365bc7fdfdc8155569e58b62f56513c3b9d937587189b2d
3
+ size 1465
checkpoints-v4.1-discrete-conditional/checkpoint-1792/trainer_state.json ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.8849382716049383,
6
+ "eval_steps": 256,
7
+ "global_step": 1792,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.06320987654320988,
14
+ "grad_norm": 0.08904296904802322,
15
+ "learning_rate": 0.000248046875,
16
+ "loss": 0.4711284637451172,
17
+ "step": 128
18
+ },
19
+ {
20
+ "epoch": 0.12641975308641976,
21
+ "grad_norm": 0.15019357204437256,
22
+ "learning_rate": 0.000498046875,
23
+ "loss": 0.4766996204853058,
24
+ "step": 256
25
+ },
26
+ {
27
+ "epoch": 0.12641975308641976,
28
+ "eval_cos_loss": 0.13597975344192692,
29
+ "eval_loss": 0.4843393615106257,
30
+ "eval_mse_loss": 0.4843393615106257,
31
+ "step": 256
32
+ },
33
+ {
34
+ "epoch": 0.12641975308641976,
35
+ "eval_cos_loss": 0.13597975344192692,
36
+ "eval_loss": 0.4843393615106257,
37
+ "eval_mse_loss": 0.4843393615106257,
38
+ "eval_runtime": 5.5201,
39
+ "eval_samples_per_second": 474.263,
40
+ "eval_steps_per_second": 7.427,
41
+ "step": 256
42
+ },
43
+ {
44
+ "epoch": 0.18962962962962962,
45
+ "grad_norm": 0.10759640485048294,
46
+ "learning_rate": 0.000748046875,
47
+ "loss": 0.47947996854782104,
48
+ "step": 384
49
+ },
50
+ {
51
+ "epoch": 0.2528395061728395,
52
+ "grad_norm": 0.08275946974754333,
53
+ "learning_rate": 0.000998046875,
54
+ "loss": 0.47462955117225647,
55
+ "step": 512
56
+ },
57
+ {
58
+ "epoch": 0.2528395061728395,
59
+ "eval_cos_loss": 0.13039116016248378,
60
+ "eval_loss": 0.4657249777782254,
61
+ "eval_mse_loss": 0.4657249777782254,
62
+ "step": 512
63
+ },
64
+ {
65
+ "epoch": 0.2528395061728395,
66
+ "eval_cos_loss": 0.13039116016248378,
67
+ "eval_loss": 0.4657249777782254,
68
+ "eval_mse_loss": 0.4657249777782254,
69
+ "eval_runtime": 5.7382,
70
+ "eval_samples_per_second": 456.244,
71
+ "eval_steps_per_second": 7.145,
72
+ "step": 512
73
+ },
74
+ {
75
+ "epoch": 0.3160493827160494,
76
+ "grad_norm": 0.08074437826871872,
77
+ "learning_rate": 0.0009827157247249464,
78
+ "loss": 0.4701014459133148,
79
+ "step": 640
80
+ },
81
+ {
82
+ "epoch": 0.37925925925925924,
83
+ "grad_norm": 0.059887129813432693,
84
+ "learning_rate": 0.0009315344337660421,
85
+ "loss": 0.47220277786254883,
86
+ "step": 768
87
+ },
88
+ {
89
+ "epoch": 0.37925925925925924,
90
+ "eval_cos_loss": 0.13229864417779735,
91
+ "eval_loss": 0.4738647668826871,
92
+ "eval_mse_loss": 0.4738647668826871,
93
+ "step": 768
94
+ },
95
+ {
96
+ "epoch": 0.37925925925925924,
97
+ "eval_cos_loss": 0.13229864417779735,
98
+ "eval_loss": 0.4738647668826871,
99
+ "eval_mse_loss": 0.4738647668826871,
100
+ "eval_runtime": 5.7163,
101
+ "eval_samples_per_second": 457.991,
102
+ "eval_steps_per_second": 7.173,
103
+ "step": 768
104
+ },
105
+ {
106
+ "epoch": 0.44246913580246916,
107
+ "grad_norm": 0.04632481560111046,
108
+ "learning_rate": 0.0008500491898731988,
109
+ "loss": 0.4803292453289032,
110
+ "step": 896
111
+ },
112
+ {
113
+ "epoch": 0.505679012345679,
114
+ "grad_norm": 0.04784788191318512,
115
+ "learning_rate": 0.0007439821899385376,
116
+ "loss": 0.47202467918395996,
117
+ "step": 1024
118
+ },
119
+ {
120
+ "epoch": 0.505679012345679,
121
+ "eval_cos_loss": 0.13205302797439622,
122
+ "eval_loss": 0.4683481622033003,
123
+ "eval_mse_loss": 0.4683481622033003,
124
+ "step": 1024
125
+ },
126
+ {
127
+ "epoch": 0.505679012345679,
128
+ "eval_cos_loss": 0.13205302797439622,
129
+ "eval_loss": 0.4683481622033003,
130
+ "eval_mse_loss": 0.4683481622033003,
131
+ "eval_runtime": 5.7275,
132
+ "eval_samples_per_second": 457.091,
133
+ "eval_steps_per_second": 7.158,
134
+ "step": 1024
135
+ },
136
+ {
137
+ "epoch": 0.5688888888888889,
138
+ "grad_norm": 0.04520433768630028,
139
+ "learning_rate": 0.0006207818531897271,
140
+ "loss": 0.47000765800476074,
141
+ "step": 1152
142
+ },
143
+ {
144
+ "epoch": 0.6320987654320988,
145
+ "grad_norm": 0.052492521703243256,
146
+ "learning_rate": 0.0004890997654891032,
147
+ "loss": 0.4752293825149536,
148
+ "step": 1280
149
+ },
150
+ {
151
+ "epoch": 0.6320987654320988,
152
+ "eval_cos_loss": 0.13366234865857335,
153
+ "eval_loss": 0.4745350596381397,
154
+ "eval_mse_loss": 0.4745350596381397,
155
+ "step": 1280
156
+ },
157
+ {
158
+ "epoch": 0.6320987654320988,
159
+ "eval_cos_loss": 0.13366234865857335,
160
+ "eval_loss": 0.4745350596381397,
161
+ "eval_mse_loss": 0.4745350596381397,
162
+ "eval_runtime": 5.4531,
163
+ "eval_samples_per_second": 480.092,
164
+ "eval_steps_per_second": 7.519,
165
+ "step": 1280
166
+ },
167
+ {
168
+ "epoch": 0.6953086419753086,
169
+ "grad_norm": 0.06958144158124924,
170
+ "learning_rate": 0.00035818313279679524,
171
+ "loss": 0.468513160943985,
172
+ "step": 1408
173
+ },
174
+ {
175
+ "epoch": 0.7585185185185185,
176
+ "grad_norm": 0.07884542644023895,
177
+ "learning_rate": 0.00023722540797531234,
178
+ "loss": 0.4731239974498749,
179
+ "step": 1536
180
+ },
181
+ {
182
+ "epoch": 0.7585185185185185,
183
+ "eval_cos_loss": 0.13044148719892268,
184
+ "eval_loss": 0.46598181055813304,
185
+ "eval_mse_loss": 0.46598181055813304,
186
+ "step": 1536
187
+ },
188
+ {
189
+ "epoch": 0.7585185185185185,
190
+ "eval_cos_loss": 0.13044148719892268,
191
+ "eval_loss": 0.46598181055813304,
192
+ "eval_mse_loss": 0.46598181055813304,
193
+ "eval_runtime": 5.2545,
194
+ "eval_samples_per_second": 498.242,
195
+ "eval_steps_per_second": 7.803,
196
+ "step": 1536
197
+ },
198
+ {
199
+ "epoch": 0.8217283950617283,
200
+ "grad_norm": 0.08384841680526733,
201
+ "learning_rate": 0.00013472069233656453,
202
+ "loss": 0.47170335054397583,
203
+ "step": 1664
204
+ },
205
+ {
206
+ "epoch": 0.8849382716049383,
207
+ "grad_norm": 0.08261716365814209,
208
+ "learning_rate": 5.786724825584927e-05,
209
+ "loss": 0.4725135862827301,
210
+ "step": 1792
211
+ },
212
+ {
213
+ "epoch": 0.8849382716049383,
214
+ "eval_cos_loss": 0.13125932561915096,
215
+ "eval_loss": 0.47124562975836964,
216
+ "eval_mse_loss": 0.47124562975836964,
217
+ "step": 1792
218
+ },
219
+ {
220
+ "epoch": 0.8849382716049383,
221
+ "eval_cos_loss": 0.13125932561915096,
222
+ "eval_loss": 0.47124562975836964,
223
+ "eval_mse_loss": 0.47124562975836964,
224
+ "eval_runtime": 5.1931,
225
+ "eval_samples_per_second": 504.135,
226
+ "eval_steps_per_second": 7.895,
227
+ "step": 1792
228
+ }
229
+ ],
230
+ "logging_steps": 128,
231
+ "max_steps": 2025,
232
+ "num_input_tokens_seen": 0,
233
+ "num_train_epochs": 1,
234
+ "save_steps": 256,
235
+ "stateful_callbacks": {
236
+ "TrainerControl": {
237
+ "args": {
238
+ "should_epoch_stop": false,
239
+ "should_evaluate": false,
240
+ "should_log": false,
241
+ "should_save": true,
242
+ "should_training_stop": false
243
+ },
244
+ "attributes": {}
245
+ }
246
+ },
247
+ "total_flos": 0.0,
248
+ "train_batch_size": 64,
249
+ "trial_name": null,
250
+ "trial_params": null
251
+ }
checkpoints-v4.1-discrete-conditional/checkpoint-1792/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0839bffbc58eb6068cc228e4d756dbb22a9adf723766e40a7bc2a03aca92630
3
+ size 5137