CharlesLi committed on
Commit
3674b21
·
verified ·
1 Parent(s): ce90b2f

Model save

Browse files
Files changed (4) hide show
  1. README.md +9 -21
  2. all_results.json +6 -11
  3. train_results.json +6 -6
  4. trainer_state.json +56 -163
README.md CHANGED
@@ -1,11 +1,8 @@
1
  ---
2
  base_model: mistralai/Mistral-7B-Instruct-v0.1
3
- datasets:
4
- - generator
5
  library_name: peft
6
  license: apache-2.0
7
  tags:
8
- - alignment-handbook
9
  - trl
10
  - sft
11
  - generated_from_trainer
@@ -19,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  # mistral_cot_true_simple_lora
21
 
22
- This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) on the generator dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 0.9395
25
 
26
  ## Model description
27
 
@@ -50,25 +47,16 @@ The following hyperparameters were used during training:
50
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
51
  - lr_scheduler_type: cosine
52
  - lr_scheduler_warmup_ratio: 0.1
53
- - num_epochs: 15
54
 
55
  ### Training results
56
 
57
- | Training Loss | Epoch | Step | Validation Loss |
58
- |:-------------:|:-------:|:----:|:---------------:|
59
- | 1.0104 | 0.8571 | 3 | 0.8812 |
60
- | 0.8795 | 2.0 | 7 | 0.7078 |
61
- | 0.6536 | 2.8571 | 10 | 0.6557 |
62
- | 0.6536 | 4.0 | 14 | 0.6372 |
63
- | 0.472 | 4.8571 | 17 | 0.6561 |
64
- | 0.3263 | 6.0 | 21 | 0.6875 |
65
- | 0.3263 | 6.8571 | 24 | 0.7510 |
66
- | 0.1683 | 8.0 | 28 | 0.8222 |
67
- | 0.0699 | 8.8571 | 31 | 0.8685 |
68
- | 0.0369 | 10.0 | 35 | 0.9062 |
69
- | 0.0369 | 10.8571 | 38 | 0.9268 |
70
- | 0.0248 | 12.0 | 42 | 0.9377 |
71
- | 0.0212 | 12.8571 | 45 | 0.9395 |
72
 
73
 
74
  ### Framework versions
 
1
  ---
2
  base_model: mistralai/Mistral-7B-Instruct-v0.1
 
 
3
  library_name: peft
4
  license: apache-2.0
5
  tags:
 
6
  - trl
7
  - sft
8
  - generated_from_trainer
 
16
 
17
  # mistral_cot_true_simple_lora
18
 
19
+ This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) on the None dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.6070
22
 
23
  ## Model description
24
 
 
47
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
48
  - lr_scheduler_type: cosine
49
  - lr_scheduler_warmup_ratio: 0.1
50
+ - training_steps: 20
51
 
52
  ### Training results
53
 
54
+ | Training Loss | Epoch | Step | Validation Loss |
55
+ |:-------------:|:-----:|:----:|:---------------:|
56
+ | 0.9142 | 0.4 | 5 | 0.7228 |
57
+ | 0.6834 | 0.8 | 10 | 0.6278 |
58
+ | 0.5454 | 1.2 | 15 | 0.6100 |
59
+ | 0.5243 | 1.6 | 20 | 0.6070 |
 
 
 
 
 
 
 
 
 
60
 
61
 
62
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
- "epoch": 12.857142857142858,
3
- "eval_loss": 0.939548134803772,
4
- "eval_runtime": 1.1735,
5
- "eval_samples": 20,
6
- "eval_samples_per_second": 4.261,
7
- "eval_steps_per_second": 1.704,
8
- "total_flos": 10453480636416.0,
9
- "train_loss": 0.2976339568694433,
10
- "train_runtime": 339.6314,
11
  "train_samples": 100,
12
- "train_samples_per_second": 1.148,
13
- "train_steps_per_second": 0.132
14
  }
 
1
  {
2
+ "epoch": 1.6,
3
+ "total_flos": 1591906369536.0,
4
+ "train_loss": 0.6768446683883667,
5
+ "train_runtime": 82.5503,
 
 
 
 
 
6
  "train_samples": 100,
7
+ "train_samples_per_second": 1.938,
8
+ "train_steps_per_second": 0.242
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 12.857142857142858,
3
- "total_flos": 10453480636416.0,
4
- "train_loss": 0.2976339568694433,
5
- "train_runtime": 339.6314,
6
  "train_samples": 100,
7
- "train_samples_per_second": 1.148,
8
- "train_steps_per_second": 0.132
9
  }
 
1
  {
2
+ "epoch": 1.6,
3
+ "total_flos": 1591906369536.0,
4
+ "train_loss": 0.6768446683883667,
5
+ "train_runtime": 82.5503,
6
  "train_samples": 100,
7
+ "train_samples_per_second": 1.938,
8
+ "train_steps_per_second": 0.242
9
  }
trainer_state.json CHANGED
@@ -1,201 +1,94 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 12.857142857142858,
5
- "eval_steps": 500,
6
- "global_step": 45,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.2857142857142857,
13
- "grad_norm": 0.7675084033789445,
14
- "learning_rate": 4e-05,
15
- "loss": 1.0104,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.8571428571428571,
20
- "eval_loss": 0.8811686635017395,
21
- "eval_runtime": 2.3203,
22
- "eval_samples_per_second": 2.155,
23
- "eval_steps_per_second": 0.862,
24
- "step": 3
25
- },
26
- {
27
- "epoch": 1.4285714285714286,
28
- "grad_norm": 0.3794777121113468,
29
- "learning_rate": 0.0002,
30
- "loss": 0.8795,
31
  "step": 5
32
  },
33
  {
34
- "epoch": 2.0,
35
- "eval_loss": 0.7077744007110596,
36
- "eval_runtime": 1.1217,
37
- "eval_samples_per_second": 4.458,
38
- "eval_steps_per_second": 1.783,
39
- "step": 7
40
  },
41
  {
42
- "epoch": 2.857142857142857,
43
- "grad_norm": 0.3530496345436479,
44
- "learning_rate": 0.0001923879532511287,
45
- "loss": 0.6536,
46
  "step": 10
47
  },
48
  {
49
- "epoch": 2.857142857142857,
50
- "eval_loss": 0.6557201743125916,
51
- "eval_runtime": 1.1188,
52
- "eval_samples_per_second": 4.469,
53
- "eval_steps_per_second": 1.788,
54
  "step": 10
55
  },
56
  {
57
- "epoch": 4.0,
58
- "eval_loss": 0.6372172236442566,
59
- "eval_runtime": 1.1233,
60
- "eval_samples_per_second": 4.451,
61
- "eval_steps_per_second": 1.781,
62
- "step": 14
63
- },
64
- {
65
- "epoch": 4.285714285714286,
66
- "grad_norm": 0.26413927917508156,
67
- "learning_rate": 0.00017071067811865476,
68
- "loss": 0.472,
69
  "step": 15
70
  },
71
  {
72
- "epoch": 4.857142857142857,
73
- "eval_loss": 0.6561022400856018,
74
- "eval_runtime": 1.1268,
75
- "eval_samples_per_second": 4.437,
76
- "eval_steps_per_second": 1.775,
77
- "step": 17
78
- },
79
- {
80
- "epoch": 5.714285714285714,
81
- "grad_norm": 0.3618434995437738,
82
- "learning_rate": 0.000138268343236509,
83
- "loss": 0.3263,
84
- "step": 20
85
- },
86
- {
87
- "epoch": 6.0,
88
- "eval_loss": 0.687515914440155,
89
- "eval_runtime": 1.1222,
90
- "eval_samples_per_second": 4.456,
91
- "eval_steps_per_second": 1.782,
92
- "step": 21
93
- },
94
- {
95
- "epoch": 6.857142857142857,
96
- "eval_loss": 0.7509881854057312,
97
- "eval_runtime": 1.1228,
98
- "eval_samples_per_second": 4.453,
99
- "eval_steps_per_second": 1.781,
100
- "step": 24
101
- },
102
- {
103
- "epoch": 7.142857142857143,
104
- "grad_norm": 0.3781011248380884,
105
- "learning_rate": 0.0001,
106
- "loss": 0.1683,
107
- "step": 25
108
- },
109
- {
110
- "epoch": 8.0,
111
- "eval_loss": 0.8222058415412903,
112
- "eval_runtime": 1.1218,
113
- "eval_samples_per_second": 4.457,
114
- "eval_steps_per_second": 1.783,
115
- "step": 28
116
- },
117
- {
118
- "epoch": 8.571428571428571,
119
- "grad_norm": 0.264128562506645,
120
- "learning_rate": 6.173165676349103e-05,
121
- "loss": 0.0699,
122
- "step": 30
123
- },
124
- {
125
- "epoch": 8.857142857142858,
126
- "eval_loss": 0.8684995770454407,
127
- "eval_runtime": 1.1229,
128
- "eval_samples_per_second": 4.453,
129
- "eval_steps_per_second": 1.781,
130
- "step": 31
131
- },
132
- {
133
- "epoch": 10.0,
134
- "grad_norm": 0.24436285012704104,
135
- "learning_rate": 2.9289321881345254e-05,
136
- "loss": 0.0369,
137
- "step": 35
138
- },
139
- {
140
- "epoch": 10.0,
141
- "eval_loss": 0.906228244304657,
142
- "eval_runtime": 1.122,
143
- "eval_samples_per_second": 4.456,
144
- "eval_steps_per_second": 1.782,
145
- "step": 35
146
- },
147
- {
148
- "epoch": 10.857142857142858,
149
- "eval_loss": 0.9267812967300415,
150
- "eval_runtime": 1.1231,
151
- "eval_samples_per_second": 4.452,
152
- "eval_steps_per_second": 1.781,
153
- "step": 38
154
- },
155
- {
156
- "epoch": 11.428571428571429,
157
- "grad_norm": 0.13496685015110904,
158
- "learning_rate": 7.612046748871327e-06,
159
- "loss": 0.0248,
160
- "step": 40
161
- },
162
- {
163
- "epoch": 12.0,
164
- "eval_loss": 0.9376745223999023,
165
- "eval_runtime": 1.1206,
166
- "eval_samples_per_second": 4.462,
167
- "eval_steps_per_second": 1.785,
168
- "step": 42
169
  },
170
  {
171
- "epoch": 12.857142857142858,
172
- "grad_norm": 0.13014521278503757,
173
  "learning_rate": 0.0,
174
- "loss": 0.0212,
175
- "step": 45
176
  },
177
  {
178
- "epoch": 12.857142857142858,
179
- "eval_loss": 0.939548134803772,
180
- "eval_runtime": 1.1886,
181
- "eval_samples_per_second": 4.207,
182
- "eval_steps_per_second": 1.683,
183
- "step": 45
184
  },
185
  {
186
- "epoch": 12.857142857142858,
187
- "step": 45,
188
- "total_flos": 10453480636416.0,
189
- "train_loss": 0.2976339568694433,
190
- "train_runtime": 339.6314,
191
- "train_samples_per_second": 1.148,
192
- "train_steps_per_second": 0.132
193
  }
194
  ],
195
  "logging_steps": 5,
196
- "max_steps": 45,
197
  "num_input_tokens_seen": 0,
198
- "num_train_epochs": 15,
199
  "save_steps": 1000,
200
  "stateful_callbacks": {
201
  "TrainerControl": {
@@ -209,7 +102,7 @@
209
  "attributes": {}
210
  }
211
  },
212
- "total_flos": 10453480636416.0,
213
  "train_batch_size": 4,
214
  "trial_name": null,
215
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.6,
5
+ "eval_steps": 5,
6
+ "global_step": 20,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.08,
13
+ "grad_norm": 0.7334153552614141,
14
+ "learning_rate": 0.0001,
15
+ "loss": 1.115,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.4,
20
+ "grad_norm": 0.5975179564042915,
21
+ "learning_rate": 0.00018660254037844388,
22
+ "loss": 0.9142,
 
 
 
 
 
 
 
 
23
  "step": 5
24
  },
25
  {
26
+ "epoch": 0.4,
27
+ "eval_loss": 0.722756564617157,
28
+ "eval_runtime": 3.0808,
29
+ "eval_samples_per_second": 6.492,
30
+ "eval_steps_per_second": 1.623,
31
+ "step": 5
32
  },
33
  {
34
+ "epoch": 0.8,
35
+ "grad_norm": 0.4159151923493577,
36
+ "learning_rate": 0.00011736481776669306,
37
+ "loss": 0.6834,
38
  "step": 10
39
  },
40
  {
41
+ "epoch": 0.8,
42
+ "eval_loss": 0.6278184056282043,
43
+ "eval_runtime": 1.8619,
44
+ "eval_samples_per_second": 10.742,
45
+ "eval_steps_per_second": 2.685,
46
  "step": 10
47
  },
48
  {
49
+ "epoch": 1.2,
50
+ "grad_norm": 0.40842948448715766,
51
+ "learning_rate": 3.5721239031346066e-05,
52
+ "loss": 0.5454,
 
 
 
 
 
 
 
 
53
  "step": 15
54
  },
55
  {
56
+ "epoch": 1.2,
57
+ "eval_loss": 0.6100303530693054,
58
+ "eval_runtime": 1.8606,
59
+ "eval_samples_per_second": 10.749,
60
+ "eval_steps_per_second": 2.687,
61
+ "step": 15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  },
63
  {
64
+ "epoch": 1.6,
65
+ "grad_norm": 0.3737806540766406,
66
  "learning_rate": 0.0,
67
+ "loss": 0.5243,
68
+ "step": 20
69
  },
70
  {
71
+ "epoch": 1.6,
72
+ "eval_loss": 0.6069896221160889,
73
+ "eval_runtime": 1.8906,
74
+ "eval_samples_per_second": 10.579,
75
+ "eval_steps_per_second": 2.645,
76
+ "step": 20
77
  },
78
  {
79
+ "epoch": 1.6,
80
+ "step": 20,
81
+ "total_flos": 1591906369536.0,
82
+ "train_loss": 0.6768446683883667,
83
+ "train_runtime": 82.5503,
84
+ "train_samples_per_second": 1.938,
85
+ "train_steps_per_second": 0.242
86
  }
87
  ],
88
  "logging_steps": 5,
89
+ "max_steps": 20,
90
  "num_input_tokens_seen": 0,
91
+ "num_train_epochs": 2,
92
  "save_steps": 1000,
93
  "stateful_callbacks": {
94
  "TrainerControl": {
 
102
  "attributes": {}
103
  }
104
  },
105
+ "total_flos": 1591906369536.0,
106
  "train_batch_size": 4,
107
  "trial_name": null,
108
  "trial_params": null