SystemAdmin123 committed on
Commit
4fde597
·
verified ·
1 Parent(s): d5b2941

Training in progress, step 200, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:57931ccc5ede5e9b408087ee39e0c8acfea478feb0db465454ea28f70f45fce8
3
  size 3086634632
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:166fb3c7f81e1941b29aa434d5a62e1083fa93f3516c2edd871c14c0ee8ee1e7
3
  size 3086634632
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a670ff016a28acc74aa8c9382b99fae9c30939c44c51045fb6b5702d079ab4d8
3
  size 3136004290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35e6e5f44d44cd693465a1cf0aa7d39d1a46b29cfeba6b932788ffd8248551c9
3
  size 3136004290
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9d2eb4230608a2e50c5e2e997035fa89b37c3a1f78cd2d086d678da431507f0
3
- size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f2762d792d815ec78151cdfb1183ff8fe6b1a4c5fcc050ac87b15dc66050802
3
+ size 15024
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc50388692076d892e9a6a3cc11b337ccbfc6d0655c35c5d50cb2003fe50a00f
3
- size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbc1f875219f8ac3c776408d86b647fbc8462ad7ecb23ac468a078c2c50ec46e
3
+ size 15024
last-checkpoint/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbcdba7bc63bee2e58552ca3df8d3d9521da29d066231bf49913479cc60a05a4
3
+ size 15024
last-checkpoint/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac162cf8f57eb46f3e3f12f50328cd5574b5384b6fe27493b80f0979cc8d636b
3
+ size 15024
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:15ba0115447c74c421f2515daef64bb4b3b3b78267a7971a106df2ae18c0ebef
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abcd078b1186023c3032f4652f953246e76b5f62233ca3f894e881ea9feb17b7
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,242 +1,175 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 6.756756756756757,
5
- "eval_steps": 50,
6
- "global_step": 250,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.02702702702702703,
13
- "eval_loss": 3.023751735687256,
14
- "eval_runtime": 36.6493,
15
- "eval_samples_per_second": 40.956,
16
- "eval_steps_per_second": 2.074,
17
  "step": 1
18
  },
19
  {
20
- "epoch": 0.2702702702702703,
21
- "grad_norm": 9.0625,
22
- "learning_rate": 3.6363636363636364e-05,
23
- "loss": 2.3049,
24
  "step": 10
25
  },
26
  {
27
- "epoch": 0.5405405405405406,
28
- "grad_norm": 2.375,
29
- "learning_rate": 7.272727272727273e-05,
30
- "loss": 2.0487,
31
  "step": 20
32
  },
33
  {
34
- "epoch": 0.8108108108108109,
35
- "grad_norm": 1.59375,
36
- "learning_rate": 0.00010909090909090909,
37
- "loss": 1.9946,
38
  "step": 30
39
  },
40
  {
41
- "epoch": 1.0810810810810811,
42
- "grad_norm": 2.8125,
43
- "learning_rate": 0.00014545454545454546,
44
- "loss": 1.9062,
45
  "step": 40
46
  },
47
  {
48
- "epoch": 1.3513513513513513,
49
- "grad_norm": 1.59375,
50
- "learning_rate": 0.00018181818181818183,
51
- "loss": 1.6039,
52
- "step": 50
53
- },
54
- {
55
- "epoch": 1.3513513513513513,
56
- "eval_loss": 2.009673595428467,
57
- "eval_runtime": 36.2182,
58
- "eval_samples_per_second": 41.443,
59
- "eval_steps_per_second": 2.098,
60
  "step": 50
61
  },
62
  {
63
- "epoch": 1.6216216216216215,
64
- "grad_norm": 1.6640625,
65
- "learning_rate": 0.00019998870284726968,
66
- "loss": 1.6751,
67
  "step": 60
68
  },
69
  {
70
- "epoch": 1.8918918918918919,
71
- "grad_norm": 1.4296875,
72
- "learning_rate": 0.00019989834093992945,
73
- "loss": 1.7299,
74
  "step": 70
75
  },
76
  {
77
- "epoch": 2.1621621621621623,
78
- "grad_norm": 1.59375,
79
- "learning_rate": 0.00019971769878721743,
80
- "loss": 1.272,
81
  "step": 80
82
  },
83
  {
84
- "epoch": 2.4324324324324325,
85
- "grad_norm": 1.453125,
86
- "learning_rate": 0.00019944693963927092,
87
- "loss": 0.9261,
88
  "step": 90
89
  },
90
  {
91
- "epoch": 2.7027027027027026,
92
- "grad_norm": 1.4921875,
93
- "learning_rate": 0.00019908630818686338,
94
- "loss": 0.9744,
95
  "step": 100
96
  },
97
  {
98
- "epoch": 2.7027027027027026,
99
- "eval_loss": 2.4561517238616943,
100
- "eval_runtime": 36.7479,
101
- "eval_samples_per_second": 40.846,
102
- "eval_steps_per_second": 2.068,
103
- "step": 100
104
- },
105
- {
106
- "epoch": 2.972972972972973,
107
- "grad_norm": 1.53125,
108
- "learning_rate": 0.00019863613034027224,
109
- "loss": 0.9624,
110
  "step": 110
111
  },
112
  {
113
- "epoch": 3.2432432432432434,
114
- "grad_norm": 1.1796875,
115
- "learning_rate": 0.00019809681293474693,
116
- "loss": 0.4856,
117
  "step": 120
118
  },
119
  {
120
- "epoch": 3.5135135135135136,
121
- "grad_norm": 2.359375,
122
- "learning_rate": 0.00019746884336284317,
123
- "loss": 0.4526,
124
  "step": 130
125
  },
126
  {
127
- "epoch": 3.7837837837837838,
128
- "grad_norm": 1.3203125,
129
- "learning_rate": 0.00019675278913395606,
130
- "loss": 0.4579,
131
  "step": 140
132
  },
133
  {
134
- "epoch": 4.054054054054054,
135
- "grad_norm": 0.87890625,
136
- "learning_rate": 0.00019594929736144976,
137
- "loss": 0.4416,
138
- "step": 150
139
- },
140
- {
141
- "epoch": 4.054054054054054,
142
- "eval_loss": 3.037287473678589,
143
- "eval_runtime": 36.5611,
144
- "eval_samples_per_second": 41.055,
145
- "eval_steps_per_second": 2.079,
146
  "step": 150
147
  },
148
  {
149
- "epoch": 4.324324324324325,
150
- "grad_norm": 1.5859375,
151
- "learning_rate": 0.00019505909417784754,
152
- "loss": 0.2196,
153
  "step": 160
154
  },
155
  {
156
- "epoch": 4.594594594594595,
157
- "grad_norm": 0.9921875,
158
- "learning_rate": 0.00019408298407861042,
159
- "loss": 0.2407,
160
  "step": 170
161
  },
162
  {
163
- "epoch": 4.864864864864865,
164
- "grad_norm": 1.03125,
165
- "learning_rate": 0.00019302184919509755,
166
- "loss": 0.2556,
167
  "step": 180
168
  },
169
  {
170
- "epoch": 5.135135135135135,
171
- "grad_norm": 0.734375,
172
- "learning_rate": 0.0001918766484973654,
173
- "loss": 0.1991,
174
  "step": 190
175
  },
176
  {
177
- "epoch": 5.405405405405405,
178
- "grad_norm": 0.828125,
179
- "learning_rate": 0.0001906484169275263,
180
- "loss": 0.1492,
181
  "step": 200
182
  },
183
  {
184
- "epoch": 5.405405405405405,
185
- "eval_loss": 3.3331992626190186,
186
- "eval_runtime": 34.9671,
187
- "eval_samples_per_second": 42.926,
188
- "eval_steps_per_second": 2.173,
189
  "step": 200
190
- },
191
- {
192
- "epoch": 5.675675675675675,
193
- "grad_norm": 0.796875,
194
- "learning_rate": 0.00018933826446444933,
195
- "loss": 0.1548,
196
- "step": 210
197
- },
198
- {
199
- "epoch": 5.945945945945946,
200
- "grad_norm": 1.1953125,
201
- "learning_rate": 0.0001879473751206489,
202
- "loss": 0.1538,
203
- "step": 220
204
- },
205
- {
206
- "epoch": 6.216216216216216,
207
- "grad_norm": 0.546875,
208
- "learning_rate": 0.0001864770058722676,
209
- "loss": 0.0797,
210
- "step": 230
211
- },
212
- {
213
- "epoch": 6.486486486486487,
214
- "grad_norm": 0.53515625,
215
- "learning_rate": 0.00018492848552312014,
216
- "loss": 0.0722,
217
- "step": 240
218
- },
219
- {
220
- "epoch": 6.756756756756757,
221
- "grad_norm": 0.52734375,
222
- "learning_rate": 0.00018330321350382544,
223
- "loss": 0.0669,
224
- "step": 250
225
- },
226
- {
227
- "epoch": 6.756756756756757,
228
- "eval_loss": 3.573293685913086,
229
- "eval_runtime": 35.0908,
230
- "eval_samples_per_second": 42.775,
231
- "eval_steps_per_second": 2.166,
232
- "step": 250
233
  }
234
  ],
235
  "logging_steps": 10,
236
- "max_steps": 1100,
237
  "num_input_tokens_seen": 0,
238
- "num_train_epochs": 30,
239
- "save_steps": 50,
240
  "stateful_callbacks": {
241
  "TrainerControl": {
242
  "args": {
@@ -249,7 +182,7 @@
249
  "attributes": {}
250
  }
251
  },
252
- "total_flos": 8.05073298915328e+16,
253
  "train_batch_size": 10,
254
  "trial_name": null,
255
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 11.11111111111111,
5
+ "eval_steps": 200,
6
+ "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.05555555555555555,
13
+ "eval_loss": 3.0254549980163574,
14
+ "eval_runtime": 19.3939,
15
+ "eval_samples_per_second": 77.395,
16
+ "eval_steps_per_second": 1.959,
17
  "step": 1
18
  },
19
  {
20
+ "epoch": 0.5555555555555556,
21
+ "grad_norm": 1.7109375,
22
+ "learning_rate": 0.0001,
23
+ "loss": 2.2097,
24
  "step": 10
25
  },
26
  {
27
+ "epoch": 1.1111111111111112,
28
+ "grad_norm": 1.71875,
29
+ "learning_rate": 0.0002,
30
+ "loss": 1.9869,
31
  "step": 20
32
  },
33
  {
34
+ "epoch": 1.6666666666666665,
35
+ "grad_norm": 1.3671875,
36
+ "learning_rate": 0.000199658449300667,
37
+ "loss": 1.6439,
38
  "step": 30
39
  },
40
  {
41
+ "epoch": 2.2222222222222223,
42
+ "grad_norm": 1.40625,
43
+ "learning_rate": 0.00019863613034027224,
44
+ "loss": 1.3908,
45
  "step": 40
46
  },
47
  {
48
+ "epoch": 2.7777777777777777,
49
+ "grad_norm": 0.98828125,
50
+ "learning_rate": 0.00019694002659393305,
51
+ "loss": 0.9319,
 
 
 
 
 
 
 
 
52
  "step": 50
53
  },
54
  {
55
+ "epoch": 3.3333333333333335,
56
+ "grad_norm": 0.99609375,
57
+ "learning_rate": 0.00019458172417006347,
58
+ "loss": 0.6271,
59
  "step": 60
60
  },
61
  {
62
+ "epoch": 3.888888888888889,
63
+ "grad_norm": 0.89453125,
64
+ "learning_rate": 0.00019157733266550575,
65
+ "loss": 0.4473,
66
  "step": 70
67
  },
68
  {
69
+ "epoch": 4.444444444444445,
70
+ "grad_norm": 0.76953125,
71
+ "learning_rate": 0.0001879473751206489,
72
+ "loss": 0.247,
73
  "step": 80
74
  },
75
  {
76
+ "epoch": 5.0,
77
+ "grad_norm": 0.74609375,
78
+ "learning_rate": 0.00018371664782625287,
79
+ "loss": 0.2054,
80
  "step": 90
81
  },
82
  {
83
+ "epoch": 5.555555555555555,
84
+ "grad_norm": 0.49609375,
85
+ "learning_rate": 0.00017891405093963938,
86
+ "loss": 0.096,
87
  "step": 100
88
  },
89
  {
90
+ "epoch": 6.111111111111111,
91
+ "grad_norm": 0.244140625,
92
+ "learning_rate": 0.00017357239106731317,
93
+ "loss": 0.0806,
 
 
 
 
 
 
 
 
94
  "step": 110
95
  },
96
  {
97
+ "epoch": 6.666666666666667,
98
+ "grad_norm": 0.41796875,
99
+ "learning_rate": 0.00016772815716257412,
100
+ "loss": 0.0332,
101
  "step": 120
102
  },
103
  {
104
+ "epoch": 7.222222222222222,
105
+ "grad_norm": 0.130859375,
106
+ "learning_rate": 0.0001614212712689668,
107
+ "loss": 0.0287,
108
  "step": 130
109
  },
110
  {
111
+ "epoch": 7.777777777777778,
112
+ "grad_norm": 0.283203125,
113
+ "learning_rate": 0.00015469481581224272,
114
+ "loss": 0.0125,
115
  "step": 140
116
  },
117
  {
118
+ "epoch": 8.333333333333334,
119
+ "grad_norm": 0.08740234375,
120
+ "learning_rate": 0.00014759473930370736,
121
+ "loss": 0.0085,
 
 
 
 
 
 
 
 
122
  "step": 150
123
  },
124
  {
125
+ "epoch": 8.88888888888889,
126
+ "grad_norm": 0.06591796875,
127
+ "learning_rate": 0.00014016954246529696,
128
+ "loss": 0.0051,
129
  "step": 160
130
  },
131
  {
132
+ "epoch": 9.444444444444445,
133
+ "grad_norm": 0.057861328125,
134
+ "learning_rate": 0.00013246994692046836,
135
+ "loss": 0.0036,
136
  "step": 170
137
  },
138
  {
139
+ "epoch": 10.0,
140
+ "grad_norm": 0.0322265625,
141
+ "learning_rate": 0.00012454854871407994,
142
+ "loss": 0.003,
143
  "step": 180
144
  },
145
  {
146
+ "epoch": 10.555555555555555,
147
+ "grad_norm": 0.05078125,
148
+ "learning_rate": 0.00011645945902807341,
149
+ "loss": 0.0026,
150
  "step": 190
151
  },
152
  {
153
+ "epoch": 11.11111111111111,
154
+ "grad_norm": 0.036865234375,
155
+ "learning_rate": 0.00010825793454723325,
156
+ "loss": 0.0026,
157
  "step": 200
158
  },
159
  {
160
+ "epoch": 11.11111111111111,
161
+ "eval_loss": 4.190612316131592,
162
+ "eval_runtime": 18.3153,
163
+ "eval_samples_per_second": 81.953,
164
+ "eval_steps_per_second": 2.075,
165
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  }
167
  ],
168
  "logging_steps": 10,
169
+ "max_steps": 400,
170
  "num_input_tokens_seen": 0,
171
+ "num_train_epochs": 23,
172
+ "save_steps": 200,
173
  "stateful_callbacks": {
174
  "TrainerControl": {
175
  "args": {
 
182
  "attributes": {}
183
  }
184
  },
185
+ "total_flos": 1.2881172782645248e+17,
186
  "train_batch_size": 10,
187
  "trial_name": null,
188
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34e809989457dd9bfde4bf2b0182d58f9d3a5925b9457bf37508ace824c39b8b
3
  size 9144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c8cd09cbe4299841bcebbf82901e4817a1a9d47a163f032efe3abc4fb66766c
3
  size 9144