3N3G commited on
Commit
89d5f5b
·
verified ·
1 Parent(s): f6b30a6

Training in progress, step 16, checkpoint

Browse files
last-checkpoint/model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:943608aa5c6d1a33cd3de6c65e5e3dc4364cc2718c2e96c2431f1f8af7ed45a8
3
  size 4969539560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e7da2c9aef8f35f6786cbf06af70258ed692543ecd8515c205ebddc810fd910
3
  size 4969539560
last-checkpoint/model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b0f8c786480fb81dbef237ecccb6d214b6a308947b049039952c10df566011d7
3
  size 1912795688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16af573dbb77b92352dc7d86e3ffabc1ab8f05af70d970bb7737f8c187b8c429
3
  size 1912795688
last-checkpoint/trainer_state.json CHANGED
@@ -2,242 +2,138 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 8.0,
6
- "eval_steps": 100,
7
- "global_step": 32,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.2909090909090909,
14
- "grad_norm": 10.45223617553711,
15
  "learning_rate": 0.0,
16
  "loss": 0.7859,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.5818181818181818,
21
- "grad_norm": 9.886490821838379,
22
- "learning_rate": 2.5e-08,
23
  "loss": 0.7965,
24
  "step": 2
25
  },
26
  {
27
  "epoch": 0.8727272727272727,
28
- "grad_norm": 10.403158187866211,
29
- "learning_rate": 5e-08,
30
  "loss": 0.7893,
31
  "step": 3
32
  },
33
  {
34
  "epoch": 1.0,
35
- "grad_norm": 12.796398162841797,
36
- "learning_rate": 7.5e-08,
37
- "loss": 0.8701,
38
  "step": 4
39
  },
40
  {
41
  "epoch": 1.290909090909091,
42
- "grad_norm": 10.29249095916748,
43
- "learning_rate": 1e-07,
44
- "loss": 0.7784,
45
  "step": 5
46
  },
47
  {
48
  "epoch": 1.5818181818181818,
49
- "grad_norm": 9.718952178955078,
50
- "learning_rate": 9.971704944519592e-08,
51
- "loss": 0.7589,
52
  "step": 6
53
  },
54
  {
55
  "epoch": 1.8727272727272726,
56
- "grad_norm": 10.934309959411621,
57
- "learning_rate": 9.887175604818205e-08,
58
- "loss": 0.8307,
59
  "step": 7
60
  },
61
  {
62
  "epoch": 2.0,
63
- "grad_norm": 11.188789367675781,
64
- "learning_rate": 9.747474986387654e-08,
65
- "loss": 0.8821,
66
  "step": 8
67
  },
68
  {
69
  "epoch": 2.290909090909091,
70
- "grad_norm": 9.692912101745605,
71
- "learning_rate": 9.554359905560885e-08,
72
- "loss": 0.7998,
73
  "step": 9
74
  },
75
  {
76
  "epoch": 2.581818181818182,
77
- "grad_norm": 11.390420913696289,
78
- "learning_rate": 9.310258896527278e-08,
79
- "loss": 0.8196,
80
  "step": 10
81
  },
82
  {
83
  "epoch": 2.8727272727272726,
84
- "grad_norm": 10.247771263122559,
85
- "learning_rate": 9.018241671106134e-08,
86
- "loss": 0.7766,
87
  "step": 11
88
  },
89
  {
90
  "epoch": 3.0,
91
- "grad_norm": 11.571775436401367,
92
- "learning_rate": 8.681980515339464e-08,
93
- "loss": 0.8289,
94
  "step": 12
95
  },
96
  {
97
  "epoch": 3.290909090909091,
98
- "grad_norm": 10.701568603515625,
99
- "learning_rate": 8.305704108364301e-08,
100
- "loss": 0.8375,
101
  "step": 13
102
  },
103
  {
104
  "epoch": 3.581818181818182,
105
- "grad_norm": 10.49411678314209,
106
- "learning_rate": 7.894144344319013e-08,
107
- "loss": 0.8383,
108
  "step": 14
109
  },
110
  {
111
  "epoch": 3.8727272727272726,
112
- "grad_norm": 9.895997047424316,
113
- "learning_rate": 7.452476826029011e-08,
114
- "loss": 0.772,
115
  "step": 15
116
  },
117
  {
118
  "epoch": 4.0,
119
- "grad_norm": 10.467330932617188,
120
- "learning_rate": 6.986255778798252e-08,
121
- "loss": 0.7012,
122
  "step": 16
123
  },
124
  {
125
- "epoch": 4.290909090909091,
126
- "grad_norm": 9.836618423461914,
127
- "learning_rate": 6.501344202803413e-08,
128
- "loss": 0.777,
129
- "step": 17
130
- },
131
- {
132
- "epoch": 4.581818181818182,
133
- "grad_norm": 11.242887496948242,
134
- "learning_rate": 6.003840142464886e-08,
135
- "loss": 0.8631,
136
- "step": 18
137
- },
138
- {
139
- "epoch": 4.872727272727273,
140
- "grad_norm": 10.001364707946777,
141
- "learning_rate": 5.5e-08,
142
- "loss": 0.7819,
143
- "step": 19
144
- },
145
- {
146
- "epoch": 5.0,
147
- "grad_norm": 10.092758178710938,
148
- "learning_rate": 4.996159857535115e-08,
149
- "loss": 0.7722,
150
- "step": 20
151
- },
152
- {
153
- "epoch": 5.290909090909091,
154
- "grad_norm": 9.45466423034668,
155
- "learning_rate": 4.498655797196585e-08,
156
- "loss": 0.7416,
157
- "step": 21
158
- },
159
- {
160
- "epoch": 5.581818181818182,
161
- "grad_norm": 10.496912956237793,
162
- "learning_rate": 4.0137442212017494e-08,
163
- "loss": 0.8161,
164
- "step": 22
165
- },
166
- {
167
- "epoch": 5.872727272727273,
168
- "grad_norm": 10.202836036682129,
169
- "learning_rate": 3.5475231739709885e-08,
170
- "loss": 0.823,
171
- "step": 23
172
- },
173
- {
174
- "epoch": 6.0,
175
- "grad_norm": 12.920607566833496,
176
- "learning_rate": 3.105855655680986e-08,
177
- "loss": 0.8315,
178
- "step": 24
179
- },
180
- {
181
- "epoch": 6.290909090909091,
182
- "grad_norm": 10.253811836242676,
183
- "learning_rate": 2.6942958916356994e-08,
184
- "loss": 0.8316,
185
- "step": 25
186
- },
187
- {
188
- "epoch": 6.581818181818182,
189
- "grad_norm": 9.783924102783203,
190
- "learning_rate": 2.3180194846605363e-08,
191
- "loss": 0.7542,
192
- "step": 26
193
- },
194
- {
195
- "epoch": 6.872727272727273,
196
- "grad_norm": 10.855210304260254,
197
- "learning_rate": 1.981758328893866e-08,
198
- "loss": 0.8357,
199
- "step": 27
200
- },
201
- {
202
- "epoch": 7.0,
203
- "grad_norm": 10.147912979125977,
204
- "learning_rate": 1.6897411034727217e-08,
205
- "loss": 0.7426,
206
- "step": 28
207
- },
208
- {
209
- "epoch": 7.290909090909091,
210
- "grad_norm": 10.078908920288086,
211
- "learning_rate": 1.4456400944391145e-08,
212
- "loss": 0.7832,
213
- "step": 29
214
- },
215
- {
216
- "epoch": 7.581818181818182,
217
- "grad_norm": 10.833037376403809,
218
- "learning_rate": 1.2525250136123459e-08,
219
- "loss": 0.7954,
220
- "step": 30
221
- },
222
- {
223
- "epoch": 7.872727272727273,
224
- "grad_norm": 9.931336402893066,
225
- "learning_rate": 1.1128243951817936e-08,
226
- "loss": 0.7893,
227
- "step": 31
228
- },
229
- {
230
- "epoch": 8.0,
231
- "grad_norm": 10.87130355834961,
232
- "learning_rate": 1.0282950554804083e-08,
233
- "loss": 0.9104,
234
- "step": 32
235
  }
236
  ],
237
  "logging_steps": 1,
238
- "max_steps": 32,
239
  "num_input_tokens_seen": 0,
240
- "num_train_epochs": 8,
241
  "save_steps": 16,
242
  "stateful_callbacks": {
243
  "TrainerControl": {
@@ -246,12 +142,12 @@
246
  "should_evaluate": false,
247
  "should_log": false,
248
  "should_save": true,
249
- "should_training_stop": true
250
  },
251
  "attributes": {}
252
  }
253
  },
254
- "total_flos": 1.073226288070656e+16,
255
  "train_batch_size": 1,
256
  "trial_name": null,
257
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 4.0,
6
+ "eval_steps": 16,
7
+ "global_step": 16,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.2909090909090909,
14
+ "grad_norm": 10.451888084411621,
15
  "learning_rate": 0.0,
16
  "loss": 0.7859,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.5818181818181818,
21
+ "grad_norm": 9.886292457580566,
22
+ "learning_rate": 2.5e-09,
23
  "loss": 0.7965,
24
  "step": 2
25
  },
26
  {
27
  "epoch": 0.8727272727272727,
28
+ "grad_norm": 10.406240463256836,
29
+ "learning_rate": 5e-09,
30
  "loss": 0.7893,
31
  "step": 3
32
  },
33
  {
34
  "epoch": 1.0,
35
+ "grad_norm": 12.746437072753906,
36
+ "learning_rate": 7.5e-09,
37
+ "loss": 0.8702,
38
  "step": 4
39
  },
40
  {
41
  "epoch": 1.290909090909091,
42
+ "grad_norm": 10.291970252990723,
43
+ "learning_rate": 1e-08,
44
+ "loss": 0.7785,
45
  "step": 5
46
  },
47
  {
48
  "epoch": 1.5818181818181818,
49
+ "grad_norm": 9.746875762939453,
50
+ "learning_rate": 1.25e-08,
51
+ "loss": 0.759,
52
  "step": 6
53
  },
54
  {
55
  "epoch": 1.8727272727272726,
56
+ "grad_norm": 10.920265197753906,
57
+ "learning_rate": 1.5e-08,
58
+ "loss": 0.8308,
59
  "step": 7
60
  },
61
  {
62
  "epoch": 2.0,
63
+ "grad_norm": 11.265154838562012,
64
+ "learning_rate": 1.7499999999999998e-08,
65
+ "loss": 0.8828,
66
  "step": 8
67
  },
68
  {
69
  "epoch": 2.290909090909091,
70
+ "grad_norm": 9.750505447387695,
71
+ "learning_rate": 2e-08,
72
+ "loss": 0.8004,
73
  "step": 9
74
  },
75
  {
76
  "epoch": 2.581818181818182,
77
+ "grad_norm": 11.47065544128418,
78
+ "learning_rate": 2.25e-08,
79
+ "loss": 0.8204,
80
  "step": 10
81
  },
82
  {
83
  "epoch": 2.8727272727272726,
84
+ "grad_norm": 10.275605201721191,
85
+ "learning_rate": 2.5e-08,
86
+ "loss": 0.7771,
87
  "step": 11
88
  },
89
  {
90
  "epoch": 3.0,
91
+ "grad_norm": 11.604477882385254,
92
+ "learning_rate": 2.75e-08,
93
+ "loss": 0.8295,
94
  "step": 12
95
  },
96
  {
97
  "epoch": 3.290909090909091,
98
+ "grad_norm": 10.712018966674805,
99
+ "learning_rate": 3e-08,
100
+ "loss": 0.8378,
101
  "step": 13
102
  },
103
  {
104
  "epoch": 3.581818181818182,
105
+ "grad_norm": 10.54987907409668,
106
+ "learning_rate": 3.25e-08,
107
+ "loss": 0.8398,
108
  "step": 14
109
  },
110
  {
111
  "epoch": 3.8727272727272726,
112
+ "grad_norm": 9.999624252319336,
113
+ "learning_rate": 3.4999999999999996e-08,
114
+ "loss": 0.773,
115
  "step": 15
116
  },
117
  {
118
  "epoch": 4.0,
119
+ "grad_norm": 10.562870979309082,
120
+ "learning_rate": 3.75e-08,
121
+ "loss": 0.7025,
122
  "step": 16
123
  },
124
  {
125
+ "epoch": 4.0,
126
+ "eval_loss": 0.760595977306366,
127
+ "eval_runtime": 0.722,
128
+ "eval_samples_per_second": 18.005,
129
+ "eval_steps_per_second": 18.005,
130
+ "step": 16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  }
132
  ],
133
  "logging_steps": 1,
134
+ "max_steps": 400,
135
  "num_input_tokens_seen": 0,
136
+ "num_train_epochs": 100,
137
  "save_steps": 16,
138
  "stateful_callbacks": {
139
  "TrainerControl": {
 
142
  "should_evaluate": false,
143
  "should_log": false,
144
  "should_save": true,
145
+ "should_training_stop": false
146
  },
147
  "attributes": {}
148
  }
149
  },
150
+ "total_flos": 5366131440353280.0,
151
  "train_batch_size": 1,
152
  "trial_name": null,
153
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d840eebf56f8b6e42656fd7c380bd348e6804b913efec9e3bbfeabc52c3a4df6
3
  size 5816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b66b3dd2c00440ec978f1804a2b69b05b8711272903c469b51af4f9859ceb8e9
3
  size 5816