Lolol857 committed on
Commit
a6776ad
·
verified ·
1 Parent(s): 2a31f34

Training in progress, step 200, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -34,8 +34,8 @@
34
  "revision": null,
35
  "target_modules": [
36
  "k_proj",
37
- "v_proj",
38
  "gate_proj",
 
39
  "down_proj",
40
  "o_proj",
41
  "q_proj",
 
34
  "revision": null,
35
  "target_modules": [
36
  "k_proj",
 
37
  "gate_proj",
38
+ "v_proj",
39
  "down_proj",
40
  "o_proj",
41
  "q_proj",
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f6ffbbc3ef1f6459a42262c20daa57644415f98821543c7715481de3e049f96
3
- size 71320216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:004a710a01739ae3c15a50d64e44652fec98d5ecb61417891e3231a60dc3e913
3
+ size 69157536
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d772851b906c9cbdcaa2be3b9e1ae1aed7be5fd89bb71ea2c128fa1c81e39880
3
- size 142716747
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:692fc2df0713e15ddbeca9b9efc0cc0b8de2692777ad27c718868912049276ef
3
+ size 1657
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:053a4a2984d013d4d0873c07ec5f7e11687c065a6bb1930ad375f212dd298451
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa733b20b2588180e4f01040300a6b66e5d3ccc8c1888e13968f16f4605a02f5
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e25765e01d3a9886ffd3a8cf684f09035523b1ed36102a40cec344e367336da1
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a81eebbf90e05919b75a3aef49bb021bcba6fe6e53cbcca4e18d781822f621e
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,229 +2,152 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.0,
6
  "eval_steps": 500,
7
- "global_step": 314,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.064,
14
- "grad_norm": 4.04085636138916,
15
  "learning_rate": 9.713375796178345e-06,
16
- "loss": 3.491233062744141,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.128,
21
- "grad_norm": 2.1230499744415283,
22
  "learning_rate": 9.394904458598726e-06,
23
- "loss": 3.2013439178466796,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.192,
28
- "grad_norm": 2.1368634700775146,
29
  "learning_rate": 9.07643312101911e-06,
30
- "loss": 3.034148406982422,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.256,
35
- "grad_norm": 2.1545660495758057,
36
  "learning_rate": 8.757961783439492e-06,
37
- "loss": 2.9740901947021485,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.32,
42
- "grad_norm": 1.511207938194275,
43
  "learning_rate": 8.439490445859873e-06,
44
- "loss": 2.9105926513671876,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.384,
49
- "grad_norm": 1.613718867301941,
50
  "learning_rate": 8.121019108280256e-06,
51
- "loss": 2.92062931060791,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.448,
56
- "grad_norm": 1.6831860542297363,
57
  "learning_rate": 7.802547770700637e-06,
58
- "loss": 2.9122573852539064,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.512,
63
- "grad_norm": 1.7110016345977783,
64
  "learning_rate": 7.484076433121019e-06,
65
- "loss": 2.856836128234863,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.576,
70
- "grad_norm": 2.022252321243286,
71
  "learning_rate": 7.1656050955414014e-06,
72
- "loss": 2.9169572830200194,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.64,
77
- "grad_norm": 1.6567405462265015,
78
  "learning_rate": 6.8471337579617835e-06,
79
- "loss": 2.8249130249023438,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.704,
84
- "grad_norm": 1.503312587738037,
85
  "learning_rate": 6.5286624203821655e-06,
86
- "loss": 2.827799987792969,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 0.768,
91
- "grad_norm": 1.7029547691345215,
92
  "learning_rate": 6.210191082802548e-06,
93
- "loss": 2.833590507507324,
94
  "step": 120
95
  },
96
  {
97
  "epoch": 0.832,
98
- "grad_norm": 1.9305304288864136,
99
  "learning_rate": 5.89171974522293e-06,
100
- "loss": 2.7830537796020507,
101
  "step": 130
102
  },
103
  {
104
  "epoch": 0.896,
105
- "grad_norm": 1.4598714113235474,
106
  "learning_rate": 5.573248407643312e-06,
107
- "loss": 2.8037981033325194,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 0.96,
112
- "grad_norm": 1.537855625152588,
113
  "learning_rate": 5.2547770700636944e-06,
114
- "loss": 2.820992088317871,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 1.0192,
119
- "grad_norm": 1.706451654434204,
120
  "learning_rate": 4.9363057324840765e-06,
121
- "loss": 2.7822286605834963,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 1.0832,
126
- "grad_norm": 1.5079314708709717,
127
  "learning_rate": 4.617834394904459e-06,
128
- "loss": 2.770013427734375,
129
  "step": 170
130
  },
131
  {
132
  "epoch": 1.1472,
133
- "grad_norm": 1.8642104864120483,
134
  "learning_rate": 4.299363057324841e-06,
135
- "loss": 2.804817962646484,
136
  "step": 180
137
  },
138
  {
139
  "epoch": 1.2112,
140
- "grad_norm": 1.5836384296417236,
141
  "learning_rate": 3.980891719745223e-06,
142
- "loss": 2.7627302169799806,
143
  "step": 190
144
  },
145
  {
146
  "epoch": 1.2752,
147
- "grad_norm": 1.5155103206634521,
148
  "learning_rate": 3.662420382165605e-06,
149
- "loss": 2.7734092712402343,
150
  "step": 200
151
- },
152
- {
153
- "epoch": 1.3392,
154
- "grad_norm": 1.5116642713546753,
155
- "learning_rate": 3.3439490445859875e-06,
156
- "loss": 2.756932830810547,
157
- "step": 210
158
- },
159
- {
160
- "epoch": 1.4032,
161
- "grad_norm": 1.5571694374084473,
162
- "learning_rate": 3.0254777070063695e-06,
163
- "loss": 2.738229751586914,
164
- "step": 220
165
- },
166
- {
167
- "epoch": 1.4672,
168
- "grad_norm": 1.8248878717422485,
169
- "learning_rate": 2.707006369426752e-06,
170
- "loss": 2.718592643737793,
171
- "step": 230
172
- },
173
- {
174
- "epoch": 1.5312000000000001,
175
- "grad_norm": 1.8225314617156982,
176
- "learning_rate": 2.388535031847134e-06,
177
- "loss": 2.7289609909057617,
178
- "step": 240
179
- },
180
- {
181
- "epoch": 1.5952,
182
- "grad_norm": 1.5926852226257324,
183
- "learning_rate": 2.070063694267516e-06,
184
- "loss": 2.7409887313842773,
185
- "step": 250
186
- },
187
- {
188
- "epoch": 1.6592,
189
- "grad_norm": 1.8249088525772095,
190
- "learning_rate": 1.7515923566878982e-06,
191
- "loss": 2.784321975708008,
192
- "step": 260
193
- },
194
- {
195
- "epoch": 1.7231999999999998,
196
- "grad_norm": 1.720214605331421,
197
- "learning_rate": 1.4331210191082802e-06,
198
- "loss": 2.704195976257324,
199
- "step": 270
200
- },
201
- {
202
- "epoch": 1.7872,
203
- "grad_norm": 2.020470380783081,
204
- "learning_rate": 1.1146496815286625e-06,
205
- "loss": 2.7614681243896486,
206
- "step": 280
207
- },
208
- {
209
- "epoch": 1.8512,
210
- "grad_norm": 1.5761101245880127,
211
- "learning_rate": 7.961783439490446e-07,
212
- "loss": 2.762880325317383,
213
- "step": 290
214
- },
215
- {
216
- "epoch": 1.9152,
217
- "grad_norm": 1.4878287315368652,
218
- "learning_rate": 4.777070063694269e-07,
219
- "loss": 2.7690963745117188,
220
- "step": 300
221
- },
222
- {
223
- "epoch": 1.9792,
224
- "grad_norm": 1.6394439935684204,
225
- "learning_rate": 1.5923566878980893e-07,
226
- "loss": 2.7822860717773437,
227
- "step": 310
228
  }
229
  ],
230
  "logging_steps": 10,
@@ -239,12 +162,12 @@
239
  "should_evaluate": false,
240
  "should_log": false,
241
  "should_save": true,
242
- "should_training_stop": true
243
  },
244
  "attributes": {}
245
  }
246
  },
247
- "total_flos": 8715036223782912.0,
248
  "train_batch_size": 4,
249
  "trial_name": null,
250
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.2752,
6
  "eval_steps": 500,
7
+ "global_step": 200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.064,
14
+ "grad_norm": 0.0,
15
  "learning_rate": 9.713375796178345e-06,
16
+ "loss": 2.5330434799194337,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.128,
21
+ "grad_norm": 0.0,
22
  "learning_rate": 9.394904458598726e-06,
23
+ "loss": 2.5563262939453124,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.192,
28
+ "grad_norm": 0.0,
29
  "learning_rate": 9.07643312101911e-06,
30
+ "loss": 2.4857761383056642,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.256,
35
+ "grad_norm": 0.0,
36
  "learning_rate": 8.757961783439492e-06,
37
+ "loss": 2.5507957458496096,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.32,
42
+ "grad_norm": 0.0,
43
  "learning_rate": 8.439490445859873e-06,
44
+ "loss": 2.5324316024780273,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.384,
49
+ "grad_norm": 0.0,
50
  "learning_rate": 8.121019108280256e-06,
51
+ "loss": 2.533696174621582,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.448,
56
+ "grad_norm": 0.0,
57
  "learning_rate": 7.802547770700637e-06,
58
+ "loss": 2.688054656982422,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.512,
63
+ "grad_norm": 0.0,
64
  "learning_rate": 7.484076433121019e-06,
65
+ "loss": 2.598434829711914,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.576,
70
+ "grad_norm": 0.0,
71
  "learning_rate": 7.1656050955414014e-06,
72
+ "loss": 2.514725685119629,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.64,
77
+ "grad_norm": 0.0,
78
  "learning_rate": 6.8471337579617835e-06,
79
+ "loss": 2.4956493377685547,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.704,
84
+ "grad_norm": 0.0,
85
  "learning_rate": 6.5286624203821655e-06,
86
+ "loss": 2.6257051467895507,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 0.768,
91
+ "grad_norm": 0.0,
92
  "learning_rate": 6.210191082802548e-06,
93
+ "loss": 2.5550819396972657,
94
  "step": 120
95
  },
96
  {
97
  "epoch": 0.832,
98
+ "grad_norm": 0.0,
99
  "learning_rate": 5.89171974522293e-06,
100
+ "loss": 2.5322742462158203,
101
  "step": 130
102
  },
103
  {
104
  "epoch": 0.896,
105
+ "grad_norm": 0.0,
106
  "learning_rate": 5.573248407643312e-06,
107
+ "loss": 2.61910343170166,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 0.96,
112
+ "grad_norm": 0.0,
113
  "learning_rate": 5.2547770700636944e-06,
114
+ "loss": 2.584405708312988,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 1.0192,
119
+ "grad_norm": 0.0,
120
  "learning_rate": 4.9363057324840765e-06,
121
+ "loss": 2.620287322998047,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 1.0832,
126
+ "grad_norm": 0.0,
127
  "learning_rate": 4.617834394904459e-06,
128
+ "loss": 2.5575042724609376,
129
  "step": 170
130
  },
131
  {
132
  "epoch": 1.1472,
133
+ "grad_norm": 0.0,
134
  "learning_rate": 4.299363057324841e-06,
135
+ "loss": 2.554376411437988,
136
  "step": 180
137
  },
138
  {
139
  "epoch": 1.2112,
140
+ "grad_norm": 0.0,
141
  "learning_rate": 3.980891719745223e-06,
142
+ "loss": 2.51497745513916,
143
  "step": 190
144
  },
145
  {
146
  "epoch": 1.2752,
147
+ "grad_norm": 0.0,
148
  "learning_rate": 3.662420382165605e-06,
149
+ "loss": 2.5773569107055665,
150
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  }
152
  ],
153
  "logging_steps": 10,
 
162
  "should_evaluate": false,
163
  "should_log": false,
164
  "should_save": true,
165
+ "should_training_stop": false
166
  },
167
  "attributes": {}
168
  }
169
  },
170
+ "total_flos": 3909179867234304.0,
171
  "train_batch_size": 4,
172
  "trial_name": null,
173
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1e88c25a1d4de80853a53173b8cfe311be9e1d5e07531379233d09d62bc4c4a2
3
  size 5649
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4aa2e3325989e8434fe3c0f7564866f88b52c2009f394bc964c603eedb7a1a38
3
  size 5649