ljcamargo commited on
Commit
44ac430
·
verified ·
1 Parent(s): 143349f

Training in progress, step 300, checkpoint

Browse files
last-checkpoint/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d40c404ef2c6591a63d62d374d2ae723dbb012f99f314f1f0721032e50b86c4
3
+ size 2558403928
last-checkpoint/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ffd4ecbcd1f5cdd5bd52f54030b72efa2c358b8e75c6c4731b1e15ea43bd19c
3
+ size 1313044361
last-checkpoint/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fb5c3c2c6a04f8bf56e98b3d5a045f8c1ab465d43652320e01114dda9b0cb0d
3
+ size 14645
last-checkpoint/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30af866df24edce708e1eb20700878b402fa05707fa9bc5f332496baf440dbbb
3
+ size 1383
last-checkpoint/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e2eb54ad71aa36e8a3c519325614d3113e01de2bc05cb8cce62c849b7fd068c
3
+ size 1465
last-checkpoint/trainer_state.json ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.24,
6
+ "eval_steps": 500,
7
+ "global_step": 300,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0008,
14
+ "grad_norm": 10.566986083984375,
15
+ "learning_rate": 0.0,
16
+ "loss": 14.1421,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.008,
21
+ "grad_norm": 12.296218872070312,
22
+ "learning_rate": 1.730769230769231e-05,
23
+ "loss": 13.35,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.016,
28
+ "grad_norm": 6.457699775695801,
29
+ "learning_rate": 3.653846153846154e-05,
30
+ "loss": 11.8957,
31
+ "step": 20
32
+ },
33
+ {
34
+ "epoch": 0.024,
35
+ "grad_norm": 6.461245059967041,
36
+ "learning_rate": 5.576923076923077e-05,
37
+ "loss": 11.2465,
38
+ "step": 30
39
+ },
40
+ {
41
+ "epoch": 0.032,
42
+ "grad_norm": 6.351202011108398,
43
+ "learning_rate": 7.500000000000001e-05,
44
+ "loss": 10.7197,
45
+ "step": 40
46
+ },
47
+ {
48
+ "epoch": 0.04,
49
+ "grad_norm": 5.675596714019775,
50
+ "learning_rate": 9.423076923076924e-05,
51
+ "loss": 10.4108,
52
+ "step": 50
53
+ },
54
+ {
55
+ "epoch": 0.048,
56
+ "grad_norm": 6.45210599899292,
57
+ "learning_rate": 0.00011346153846153846,
58
+ "loss": 9.499,
59
+ "step": 60
60
+ },
61
+ {
62
+ "epoch": 0.056,
63
+ "grad_norm": 3.97434663772583,
64
+ "learning_rate": 0.0001326923076923077,
65
+ "loss": 9.2464,
66
+ "step": 70
67
+ },
68
+ {
69
+ "epoch": 0.064,
70
+ "grad_norm": 4.443643093109131,
71
+ "learning_rate": 0.00015192307692307692,
72
+ "loss": 9.0007,
73
+ "step": 80
74
+ },
75
+ {
76
+ "epoch": 0.072,
77
+ "grad_norm": 4.448770046234131,
78
+ "learning_rate": 0.00017115384615384616,
79
+ "loss": 8.8057,
80
+ "step": 90
81
+ },
82
+ {
83
+ "epoch": 0.08,
84
+ "grad_norm": 5.425487041473389,
85
+ "learning_rate": 0.00019038461538461538,
86
+ "loss": 8.9744,
87
+ "step": 100
88
+ },
89
+ {
90
+ "epoch": 0.088,
91
+ "grad_norm": 4.242831230163574,
92
+ "learning_rate": 0.00019999785100910492,
93
+ "loss": 8.9241,
94
+ "step": 110
95
+ },
96
+ {
97
+ "epoch": 0.096,
98
+ "grad_norm": 3.6791751384735107,
99
+ "learning_rate": 0.00019998065963611962,
100
+ "loss": 8.8742,
101
+ "step": 120
102
+ },
103
+ {
104
+ "epoch": 0.104,
105
+ "grad_norm": 5.0801777839660645,
106
+ "learning_rate": 0.00019994627984564557,
107
+ "loss": 8.8388,
108
+ "step": 130
109
+ },
110
+ {
111
+ "epoch": 0.112,
112
+ "grad_norm": 5.117883205413818,
113
+ "learning_rate": 0.00019989471754816785,
114
+ "loss": 8.412,
115
+ "step": 140
116
+ },
117
+ {
118
+ "epoch": 0.12,
119
+ "grad_norm": 5.7691802978515625,
120
+ "learning_rate": 0.00019982598160814377,
121
+ "loss": 8.7482,
122
+ "step": 150
123
+ },
124
+ {
125
+ "epoch": 0.128,
126
+ "grad_norm": 4.111888885498047,
127
+ "learning_rate": 0.00019974008384247908,
128
+ "loss": 8.6456,
129
+ "step": 160
130
+ },
131
+ {
132
+ "epoch": 0.136,
133
+ "grad_norm": 3.717806816101074,
134
+ "learning_rate": 0.0001996370390184965,
135
+ "loss": 8.3429,
136
+ "step": 170
137
+ },
138
+ {
139
+ "epoch": 0.144,
140
+ "grad_norm": 5.897804260253906,
141
+ "learning_rate": 0.00019951686485139672,
142
+ "loss": 8.5481,
143
+ "step": 180
144
+ },
145
+ {
146
+ "epoch": 0.152,
147
+ "grad_norm": 4.452871799468994,
148
+ "learning_rate": 0.00019937958200121303,
149
+ "loss": 8.6357,
150
+ "step": 190
151
+ },
152
+ {
153
+ "epoch": 0.16,
154
+ "grad_norm": 5.103796482086182,
155
+ "learning_rate": 0.0001992252140692594,
156
+ "loss": 8.5245,
157
+ "step": 200
158
+ },
159
+ {
160
+ "epoch": 0.168,
161
+ "grad_norm": 5.806966304779053,
162
+ "learning_rate": 0.00019905378759407314,
163
+ "loss": 8.4875,
164
+ "step": 210
165
+ },
166
+ {
167
+ "epoch": 0.176,
168
+ "grad_norm": 4.293936729431152,
169
+ "learning_rate": 0.00019886533204685228,
170
+ "loss": 8.3073,
171
+ "step": 220
172
+ },
173
+ {
174
+ "epoch": 0.184,
175
+ "grad_norm": 3.6153390407562256,
176
+ "learning_rate": 0.00019865987982638914,
177
+ "loss": 8.5256,
178
+ "step": 230
179
+ },
180
+ {
181
+ "epoch": 0.192,
182
+ "grad_norm": 5.031829357147217,
183
+ "learning_rate": 0.00019843746625350028,
184
+ "loss": 8.3936,
185
+ "step": 240
186
+ },
187
+ {
188
+ "epoch": 0.2,
189
+ "grad_norm": 4.666059970855713,
190
+ "learning_rate": 0.0001981981295649543,
191
+ "loss": 7.9453,
192
+ "step": 250
193
+ },
194
+ {
195
+ "epoch": 0.208,
196
+ "grad_norm": 4.338928699493408,
197
+ "learning_rate": 0.0001979419109068982,
198
+ "loss": 8.5403,
199
+ "step": 260
200
+ },
201
+ {
202
+ "epoch": 0.216,
203
+ "grad_norm": 5.491336345672607,
204
+ "learning_rate": 0.0001976688543277838,
205
+ "loss": 8.5499,
206
+ "step": 270
207
+ },
208
+ {
209
+ "epoch": 0.224,
210
+ "grad_norm": 4.206221580505371,
211
+ "learning_rate": 0.00019737900677079483,
212
+ "loss": 8.202,
213
+ "step": 280
214
+ },
215
+ {
216
+ "epoch": 0.232,
217
+ "grad_norm": 4.248091220855713,
218
+ "learning_rate": 0.0001970724180657768,
219
+ "loss": 8.1605,
220
+ "step": 290
221
+ },
222
+ {
223
+ "epoch": 0.24,
224
+ "grad_norm": 4.153928279876709,
225
+ "learning_rate": 0.00019674914092067015,
226
+ "loss": 8.2001,
227
+ "step": 300
228
+ }
229
+ ],
230
+ "logging_steps": 10,
231
+ "max_steps": 2500,
232
+ "num_input_tokens_seen": 0,
233
+ "num_train_epochs": 2,
234
+ "save_steps": 300,
235
+ "stateful_callbacks": {
236
+ "TrainerControl": {
237
+ "args": {
238
+ "should_epoch_stop": false,
239
+ "should_evaluate": false,
240
+ "should_log": false,
241
+ "should_save": true,
242
+ "should_training_stop": false
243
+ },
244
+ "attributes": {}
245
+ }
246
+ },
247
+ "total_flos": 0.0,
248
+ "train_batch_size": 8,
249
+ "trial_name": null,
250
+ "trial_params": null
251
+ }
last-checkpoint/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2cbad370ceca105eb29fd83703abdf3f11645c66605ea050dcf46365bfd8be8
3
+ size 5905