besimray commited on
Commit
074543a
·
verified ·
1 Parent(s): 6e98e29

Training in progress, step 5, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "v_proj",
24
- "gate_proj",
25
- "q_proj",
26
  "o_proj",
27
- "up_proj",
28
  "k_proj",
29
- "down_proj"
 
 
 
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
 
 
23
  "o_proj",
 
24
  "k_proj",
25
+ "down_proj",
26
+ "q_proj",
27
+ "v_proj",
28
+ "gate_proj",
29
+ "up_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:31cbaf9f678fb5252161490e7de95855e09d9f5a4bee67c60dc0b199da4a6b53
3
  size 45118424
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:618d4dffa48aa304c89e57556f5a754f77703341912518007c0973875435650d
3
  size 45118424
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4fe342709cbad6397e41ae1e5c634ef6e4b377eb7e04ad4b2e12e258c1aeb717
3
  size 23159290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a7263f6678616c895fcb41276c3a31a171dd3cc7d8b9b556e243780ec5aee8f
3
  size 23159290
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e90410ed8d75deee232d46a71672a78439ef812c0e8c37ade4c255c49bee23b
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e378f86598b07f914582636a0353a13b8e8c042e0abc3b7613d98f9ca9baf0b
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cc8ae5b9632b883900417a4b328f111a055e2a3387d176daa619ce2ea248142d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cda2d7da3ce8a95be6df505b84eace6ccd5aa18ffc6d1bcc9a79572045c8d78f
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,265 +1,93 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.3333333333333335,
5
- "eval_steps": 3,
6
- "global_step": 25,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.09523809523809523,
13
- "grad_norm": 0.5386500358581543,
14
  "learning_rate": 2e-05,
15
- "loss": 1.3356,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.09523809523809523,
20
- "eval_loss": 1.2671657800674438,
21
- "eval_runtime": 6.3713,
22
- "eval_samples_per_second": 15.695,
23
- "eval_steps_per_second": 7.848,
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.19047619047619047,
28
- "grad_norm": 0.5549973845481873,
29
  "learning_rate": 4e-05,
30
- "loss": 1.3576,
31
  "step": 2
32
  },
33
  {
34
- "epoch": 0.2857142857142857,
35
- "grad_norm": 0.4884030818939209,
 
 
 
 
 
 
 
 
36
  "learning_rate": 6e-05,
37
- "loss": 1.2158,
38
  "step": 3
39
  },
40
  {
41
- "epoch": 0.2857142857142857,
42
- "eval_loss": 1.2561376094818115,
43
- "eval_runtime": 6.1155,
44
- "eval_samples_per_second": 16.352,
45
- "eval_steps_per_second": 8.176,
46
  "step": 3
47
  },
48
  {
49
- "epoch": 0.38095238095238093,
50
- "grad_norm": 0.45884644985198975,
51
  "learning_rate": 8e-05,
52
- "loss": 1.2199,
53
  "step": 4
54
  },
55
  {
56
- "epoch": 0.47619047619047616,
57
- "grad_norm": 0.3866402208805084,
58
- "learning_rate": 0.0001,
59
- "loss": 1.2785,
60
- "step": 5
61
- },
62
- {
63
- "epoch": 0.5714285714285714,
64
- "grad_norm": 0.26828739047050476,
65
- "learning_rate": 0.00012,
66
- "loss": 1.2706,
67
- "step": 6
68
- },
69
- {
70
- "epoch": 0.5714285714285714,
71
- "eval_loss": 1.2105836868286133,
72
- "eval_runtime": 6.3215,
73
- "eval_samples_per_second": 15.819,
74
- "eval_steps_per_second": 7.91,
75
- "step": 6
76
- },
77
- {
78
- "epoch": 0.6666666666666666,
79
- "grad_norm": 0.3657456636428833,
80
- "learning_rate": 0.00014,
81
- "loss": 1.2182,
82
- "step": 7
83
- },
84
- {
85
- "epoch": 0.7619047619047619,
86
- "grad_norm": 0.49670976400375366,
87
- "learning_rate": 0.00016,
88
- "loss": 1.2607,
89
- "step": 8
90
- },
91
- {
92
- "epoch": 0.8571428571428571,
93
- "grad_norm": 0.42949965596199036,
94
- "learning_rate": 0.00018,
95
- "loss": 1.2699,
96
- "step": 9
97
- },
98
- {
99
- "epoch": 0.8571428571428571,
100
- "eval_loss": 1.1932201385498047,
101
- "eval_runtime": 6.3734,
102
- "eval_samples_per_second": 15.69,
103
- "eval_steps_per_second": 7.845,
104
- "step": 9
105
- },
106
- {
107
- "epoch": 0.9523809523809523,
108
- "grad_norm": 0.281423419713974,
109
- "learning_rate": 0.0002,
110
- "loss": 1.2143,
111
- "step": 10
112
- },
113
- {
114
- "epoch": 1.0238095238095237,
115
- "grad_norm": 0.24105204641819,
116
- "learning_rate": 0.00019876883405951377,
117
- "loss": 1.2345,
118
- "step": 11
119
- },
120
- {
121
- "epoch": 1.119047619047619,
122
- "grad_norm": 0.20358140766620636,
123
- "learning_rate": 0.00019510565162951537,
124
- "loss": 1.2237,
125
- "step": 12
126
- },
127
- {
128
- "epoch": 1.119047619047619,
129
- "eval_loss": 1.175634741783142,
130
- "eval_runtime": 6.3529,
131
- "eval_samples_per_second": 15.741,
132
- "eval_steps_per_second": 7.87,
133
- "step": 12
134
- },
135
- {
136
- "epoch": 1.2142857142857142,
137
- "grad_norm": 0.2333052158355713,
138
- "learning_rate": 0.0001891006524188368,
139
- "loss": 1.1478,
140
- "step": 13
141
- },
142
- {
143
- "epoch": 1.3095238095238095,
144
- "grad_norm": 0.2163972407579422,
145
- "learning_rate": 0.00018090169943749476,
146
- "loss": 1.14,
147
- "step": 14
148
- },
149
- {
150
- "epoch": 1.4047619047619047,
151
- "grad_norm": 0.19463765621185303,
152
- "learning_rate": 0.00017071067811865476,
153
- "loss": 1.197,
154
- "step": 15
155
- },
156
- {
157
- "epoch": 1.4047619047619047,
158
- "eval_loss": 1.1602592468261719,
159
- "eval_runtime": 6.3444,
160
- "eval_samples_per_second": 15.762,
161
- "eval_steps_per_second": 7.881,
162
- "step": 15
163
- },
164
- {
165
- "epoch": 1.5,
166
- "grad_norm": 0.1736566573381424,
167
- "learning_rate": 0.00015877852522924732,
168
- "loss": 1.1874,
169
- "step": 16
170
- },
171
- {
172
- "epoch": 1.5952380952380953,
173
- "grad_norm": 0.16423116624355316,
174
- "learning_rate": 0.00014539904997395468,
175
- "loss": 1.1152,
176
- "step": 17
177
- },
178
- {
179
- "epoch": 1.6904761904761905,
180
- "grad_norm": 0.19566665589809418,
181
- "learning_rate": 0.00013090169943749476,
182
- "loss": 1.1665,
183
- "step": 18
184
- },
185
- {
186
- "epoch": 1.6904761904761905,
187
- "eval_loss": 1.148561954498291,
188
- "eval_runtime": 6.4854,
189
- "eval_samples_per_second": 15.419,
190
- "eval_steps_per_second": 7.71,
191
- "step": 18
192
- },
193
- {
194
- "epoch": 1.7857142857142856,
195
- "grad_norm": 0.21023060381412506,
196
- "learning_rate": 0.0001156434465040231,
197
- "loss": 1.2285,
198
- "step": 19
199
  },
200
  {
201
- "epoch": 1.880952380952381,
202
- "grad_norm": 0.21021750569343567,
203
  "learning_rate": 0.0001,
204
- "loss": 1.2653,
205
- "step": 20
206
- },
207
- {
208
- "epoch": 1.9761904761904763,
209
- "grad_norm": 0.17035318911075592,
210
- "learning_rate": 8.435655349597689e-05,
211
- "loss": 1.1873,
212
- "step": 21
213
- },
214
- {
215
- "epoch": 1.9761904761904763,
216
- "eval_loss": 1.1423017978668213,
217
- "eval_runtime": 6.2847,
218
- "eval_samples_per_second": 15.912,
219
- "eval_steps_per_second": 7.956,
220
- "step": 21
221
- },
222
- {
223
- "epoch": 2.0476190476190474,
224
- "grad_norm": 0.20932504534721375,
225
- "learning_rate": 6.909830056250527e-05,
226
- "loss": 1.1359,
227
- "step": 22
228
- },
229
- {
230
- "epoch": 2.142857142857143,
231
- "grad_norm": 0.1693231463432312,
232
- "learning_rate": 5.4600950026045326e-05,
233
- "loss": 1.1202,
234
- "step": 23
235
- },
236
- {
237
- "epoch": 2.238095238095238,
238
- "grad_norm": 0.16167840361595154,
239
- "learning_rate": 4.12214747707527e-05,
240
- "loss": 1.1978,
241
- "step": 24
242
- },
243
- {
244
- "epoch": 2.238095238095238,
245
- "eval_loss": 1.1400079727172852,
246
- "eval_runtime": 6.3594,
247
- "eval_samples_per_second": 15.725,
248
- "eval_steps_per_second": 7.862,
249
- "step": 24
250
  },
251
  {
252
- "epoch": 2.3333333333333335,
253
- "grad_norm": 0.16464297473430634,
254
- "learning_rate": 2.9289321881345254e-05,
255
- "loss": 1.1135,
256
- "step": 25
 
257
  }
258
  ],
259
  "logging_steps": 1,
260
- "max_steps": 30,
261
  "num_input_tokens_seen": 0,
262
- "num_train_epochs": 3,
263
  "save_steps": 5,
264
  "stateful_callbacks": {
265
  "TrainerControl": {
@@ -273,8 +101,8 @@
273
  "attributes": {}
274
  }
275
  },
276
- "total_flos": 4707063061020672.0,
277
- "train_batch_size": 2,
278
  "trial_name": null,
279
  "trial_params": null
280
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.25,
5
+ "eval_steps": 1,
6
+ "global_step": 5,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.5,
13
+ "grad_norm": 0.5036605000495911,
14
  "learning_rate": 2e-05,
15
+ "loss": 1.2878,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.5,
20
+ "eval_loss": 1.2575896978378296,
21
+ "eval_runtime": 3.7164,
22
+ "eval_samples_per_second": 26.908,
23
+ "eval_steps_per_second": 2.691,
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 1.0,
28
+ "grad_norm": 0.5406743288040161,
29
  "learning_rate": 4e-05,
30
+ "loss": 1.294,
31
  "step": 2
32
  },
33
  {
34
+ "epoch": 1.0,
35
+ "eval_loss": 1.2571214437484741,
36
+ "eval_runtime": 3.1758,
37
+ "eval_samples_per_second": 31.489,
38
+ "eval_steps_per_second": 3.149,
39
+ "step": 2
40
+ },
41
+ {
42
+ "epoch": 1.375,
43
+ "grad_norm": 0.513503909111023,
44
  "learning_rate": 6e-05,
45
+ "loss": 1.2719,
46
  "step": 3
47
  },
48
  {
49
+ "epoch": 1.375,
50
+ "eval_loss": 1.2467983961105347,
51
+ "eval_runtime": 3.1914,
52
+ "eval_samples_per_second": 31.334,
53
+ "eval_steps_per_second": 3.133,
54
  "step": 3
55
  },
56
  {
57
+ "epoch": 1.875,
58
+ "grad_norm": 0.46774598956108093,
59
  "learning_rate": 8e-05,
60
+ "loss": 1.2869,
61
  "step": 4
62
  },
63
  {
64
+ "epoch": 1.875,
65
+ "eval_loss": 1.2302355766296387,
66
+ "eval_runtime": 2.6979,
67
+ "eval_samples_per_second": 37.066,
68
+ "eval_steps_per_second": 3.707,
69
+ "step": 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  },
71
  {
72
+ "epoch": 2.25,
73
+ "grad_norm": 0.3603326678276062,
74
  "learning_rate": 0.0001,
75
+ "loss": 1.2828,
76
+ "step": 5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  },
78
  {
79
+ "epoch": 2.25,
80
+ "eval_loss": 1.2147082090377808,
81
+ "eval_runtime": 3.2232,
82
+ "eval_samples_per_second": 31.025,
83
+ "eval_steps_per_second": 3.103,
84
+ "step": 5
85
  }
86
  ],
87
  "logging_steps": 1,
88
+ "max_steps": 10,
89
  "num_input_tokens_seen": 0,
90
+ "num_train_epochs": 5,
91
  "save_steps": 5,
92
  "stateful_callbacks": {
93
  "TrainerControl": {
 
101
  "attributes": {}
102
  }
103
  },
104
+ "total_flos": 4510872430706688.0,
105
+ "train_batch_size": 10,
106
  "trial_name": null,
107
  "trial_params": null
108
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:992cb611dfba558f000caff8c8336327eaecf852390523fd335dc016a4bf9fd7
3
  size 6648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd62ee7ea8812c424f5e6128f1308129a16dbbf40f504753e5dc76062e826242
3
  size 6648