j05hr3d committed on
Commit
45dc712
·
verified ·
1 Parent(s): 056e004

Model save

Browse files
Files changed (3) hide show
  1. README.md +11 -12
  2. adapter_model.safetensors +1 -1
  3. trainer_state.json +107 -122
README.md CHANGED
@@ -19,7 +19,7 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  This model is a fine-tuned version of [Qwen/Qwen2.5-Coder-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct) on the None dataset.
21
  It achieves the following results on the evaluation set:
22
- - Loss: 0.9411
23
 
24
  ## Model description
25
 
@@ -53,17 +53,16 @@ The following hyperparameters were used during training:
53
 
54
  | Training Loss | Epoch | Step | Validation Loss |
55
  |:-------------:|:------:|:----:|:---------------:|
56
- | 1.194 | 0.2712 | 20 | 1.1335 |
57
- | 0.8781 | 0.5424 | 40 | 1.0606 |
58
- | 0.971 | 0.8136 | 60 | 1.0339 |
59
- | 0.8568 | 1.0814 | 80 | 1.0087 |
60
- | 0.8531 | 1.3525 | 100 | 0.9888 |
61
- | 0.8971 | 1.6237 | 120 | 0.9694 |
62
- | 0.8475 | 1.8949 | 140 | 0.9589 |
63
- | 0.8708 | 2.1627 | 160 | 0.9529 |
64
- | 0.7806 | 2.4339 | 180 | 0.9453 |
65
- | 0.6945 | 2.7051 | 200 | 0.9455 |
66
- | 0.7384 | 2.9763 | 220 | 0.9411 |
67
 
68
 
69
  ### Framework versions
 
19
 
20
  This model is a fine-tuned version of [Qwen/Qwen2.5-Coder-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct) on the None dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.9464
23
 
24
  ## Model description
25
 
 
53
 
54
  | Training Loss | Epoch | Step | Validation Loss |
55
  |:-------------:|:------:|:----:|:---------------:|
56
+ | 1.1739 | 0.2759 | 20 | 1.1330 |
57
+ | 0.8905 | 0.5517 | 40 | 1.0622 |
58
+ | 0.9773 | 0.8276 | 60 | 1.0305 |
59
+ | 0.8092 | 1.0966 | 80 | 1.0049 |
60
+ | 0.7883 | 1.3724 | 100 | 0.9816 |
61
+ | 0.7641 | 1.6483 | 120 | 0.9676 |
62
+ | 0.79 | 1.9241 | 140 | 0.9591 |
63
+ | 0.8975 | 2.1931 | 160 | 0.9538 |
64
+ | 0.7537 | 2.4690 | 180 | 0.9526 |
65
+ | 0.6484 | 2.7448 | 200 | 0.9464 |
 
66
 
67
 
68
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d74899206b7b1f73a46cb08c7a9011b0708626bdeac88d08b309e62c96ec31d
3
  size 147770496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a5f1d6b328858de3e31aa52666e6e68c00333a702d37f457a3e538c7f8024ed
3
  size 147770496
trainer_state.json CHANGED
@@ -1,199 +1,184 @@
1
  {
2
- "best_global_step": 220,
3
- "best_metric": 0.9410861134529114,
4
- "best_model_checkpoint": "j05hr3d/SFT-Qwen2.5-Coder-1.5B_v1.1/checkpoint-220",
5
  "epoch": 3.0,
6
  "eval_steps": 20,
7
- "global_step": 222,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.2711864406779661,
14
- "grad_norm": 0.7084594964981079,
15
- "learning_rate": 9.441860465116279e-05,
16
- "loss": 1.194,
17
  "step": 20
18
  },
19
  {
20
- "epoch": 0.2711864406779661,
21
- "eval_loss": 1.133489727973938,
22
- "eval_runtime": 8.8736,
23
- "eval_samples_per_second": 7.325,
24
- "eval_steps_per_second": 3.719,
25
  "step": 20
26
  },
27
  {
28
- "epoch": 0.5423728813559322,
29
- "grad_norm": 0.6419398784637451,
30
- "learning_rate": 8.511627906976745e-05,
31
- "loss": 0.8781,
32
  "step": 40
33
  },
34
  {
35
- "epoch": 0.5423728813559322,
36
- "eval_loss": 1.0605597496032715,
37
- "eval_runtime": 8.8573,
38
- "eval_samples_per_second": 7.339,
39
- "eval_steps_per_second": 3.726,
40
  "step": 40
41
  },
42
  {
43
- "epoch": 0.8135593220338984,
44
- "grad_norm": 0.3213561773300171,
45
- "learning_rate": 7.58139534883721e-05,
46
- "loss": 0.971,
47
  "step": 60
48
  },
49
  {
50
- "epoch": 0.8135593220338984,
51
- "eval_loss": 1.0338528156280518,
52
- "eval_runtime": 8.7742,
53
- "eval_samples_per_second": 7.408,
54
- "eval_steps_per_second": 3.761,
55
  "step": 60
56
  },
57
  {
58
- "epoch": 1.0813559322033899,
59
- "grad_norm": 0.424533873796463,
60
- "learning_rate": 6.651162790697675e-05,
61
- "loss": 0.8568,
62
  "step": 80
63
  },
64
  {
65
- "epoch": 1.0813559322033899,
66
- "eval_loss": 1.0086784362792969,
67
- "eval_runtime": 8.7994,
68
- "eval_samples_per_second": 7.387,
69
- "eval_steps_per_second": 3.75,
70
  "step": 80
71
  },
72
  {
73
- "epoch": 1.352542372881356,
74
- "grad_norm": 0.8564242124557495,
75
- "learning_rate": 5.720930232558139e-05,
76
- "loss": 0.8531,
77
  "step": 100
78
  },
79
  {
80
- "epoch": 1.352542372881356,
81
- "eval_loss": 0.9888104796409607,
82
- "eval_runtime": 8.8125,
83
- "eval_samples_per_second": 7.376,
84
- "eval_steps_per_second": 3.745,
85
  "step": 100
86
  },
87
  {
88
- "epoch": 1.623728813559322,
89
- "grad_norm": 1.0968314409255981,
90
- "learning_rate": 4.790697674418605e-05,
91
- "loss": 0.8971,
92
  "step": 120
93
  },
94
  {
95
- "epoch": 1.623728813559322,
96
- "eval_loss": 0.9693613648414612,
97
- "eval_runtime": 8.8377,
98
- "eval_samples_per_second": 7.355,
99
- "eval_steps_per_second": 3.734,
100
  "step": 120
101
  },
102
  {
103
- "epoch": 1.8949152542372882,
104
- "grad_norm": 0.4130585491657257,
105
- "learning_rate": 3.86046511627907e-05,
106
- "loss": 0.8475,
107
  "step": 140
108
  },
109
  {
110
- "epoch": 1.8949152542372882,
111
- "eval_loss": 0.9588562846183777,
112
- "eval_runtime": 8.8593,
113
- "eval_samples_per_second": 7.337,
114
- "eval_steps_per_second": 3.725,
115
  "step": 140
116
  },
117
  {
118
- "epoch": 2.1627118644067798,
119
- "grad_norm": 0.5262497663497925,
120
- "learning_rate": 2.9302325581395352e-05,
121
- "loss": 0.8708,
122
  "step": 160
123
  },
124
  {
125
- "epoch": 2.1627118644067798,
126
- "eval_loss": 0.952869176864624,
127
- "eval_runtime": 8.8336,
128
- "eval_samples_per_second": 7.358,
129
- "eval_steps_per_second": 3.736,
130
  "step": 160
131
  },
132
  {
133
- "epoch": 2.4338983050847456,
134
- "grad_norm": 0.44273582100868225,
135
- "learning_rate": 2e-05,
136
- "loss": 0.7806,
137
  "step": 180
138
  },
139
  {
140
- "epoch": 2.4338983050847456,
141
- "eval_loss": 0.945322573184967,
142
- "eval_runtime": 8.7983,
143
- "eval_samples_per_second": 7.388,
144
- "eval_steps_per_second": 3.751,
145
  "step": 180
146
  },
147
  {
148
- "epoch": 2.705084745762712,
149
- "grad_norm": 0.43839001655578613,
150
- "learning_rate": 1.0697674418604651e-05,
151
- "loss": 0.6945,
152
  "step": 200
153
  },
154
  {
155
- "epoch": 2.705084745762712,
156
- "eval_loss": 0.9455349445343018,
157
- "eval_runtime": 8.8108,
158
- "eval_samples_per_second": 7.377,
159
- "eval_steps_per_second": 3.745,
160
  "step": 200
161
  },
162
- {
163
- "epoch": 2.976271186440678,
164
- "grad_norm": 0.8535068035125732,
165
- "learning_rate": 1.3953488372093023e-06,
166
- "loss": 0.7384,
167
- "step": 220
168
- },
169
- {
170
- "epoch": 2.976271186440678,
171
- "eval_loss": 0.9410861134529114,
172
- "eval_runtime": 8.8196,
173
- "eval_samples_per_second": 7.37,
174
- "eval_steps_per_second": 3.742,
175
- "step": 220
176
- },
177
  {
178
  "epoch": 3.0,
179
- "step": 222,
180
- "total_flos": 2.5054811021408256e+16,
181
- "train_loss": 0.8678630219923483,
182
- "train_runtime": 981.2953,
183
- "train_samples_per_second": 1.804,
184
- "train_steps_per_second": 0.226
185
  },
186
  {
187
  "epoch": 3.0,
188
- "eval_loss": 0.9410861134529114,
189
- "eval_runtime": 8.8886,
190
- "eval_samples_per_second": 7.313,
191
- "eval_steps_per_second": 3.713,
192
- "step": 222
193
  }
194
  ],
195
  "logging_steps": 20,
196
- "max_steps": 222,
197
  "num_input_tokens_seen": 0,
198
  "num_train_epochs": 3,
199
  "save_steps": 20,
@@ -218,7 +203,7 @@
218
  "attributes": {}
219
  }
220
  },
221
- "total_flos": 2.5054811021408256e+16,
222
  "train_batch_size": 2,
223
  "trial_name": null,
224
  "trial_params": null
 
1
  {
2
+ "best_global_step": 200,
3
+ "best_metric": 0.946416974067688,
4
+ "best_model_checkpoint": "j05hr3d/SFT-Qwen2.5-Coder-1.5B_v1.1/checkpoint-200",
5
  "epoch": 3.0,
6
  "eval_steps": 20,
7
+ "global_step": 219,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.27586206896551724,
14
+ "grad_norm": 0.7840086817741394,
15
+ "learning_rate": 9.433962264150944e-05,
16
+ "loss": 1.1739,
17
  "step": 20
18
  },
19
  {
20
+ "epoch": 0.27586206896551724,
21
+ "eval_loss": 1.1330275535583496,
22
+ "eval_runtime": 8.7827,
23
+ "eval_samples_per_second": 7.401,
24
+ "eval_steps_per_second": 3.757,
25
  "step": 20
26
  },
27
  {
28
+ "epoch": 0.5517241379310345,
29
+ "grad_norm": 0.5770975947380066,
30
+ "learning_rate": 8.49056603773585e-05,
31
+ "loss": 0.8905,
32
  "step": 40
33
  },
34
  {
35
+ "epoch": 0.5517241379310345,
36
+ "eval_loss": 1.0621660947799683,
37
+ "eval_runtime": 8.7027,
38
+ "eval_samples_per_second": 7.469,
39
+ "eval_steps_per_second": 3.792,
40
  "step": 40
41
  },
42
  {
43
+ "epoch": 0.8275862068965517,
44
+ "grad_norm": 0.3944004774093628,
45
+ "learning_rate": 7.547169811320755e-05,
46
+ "loss": 0.9773,
47
  "step": 60
48
  },
49
  {
50
+ "epoch": 0.8275862068965517,
51
+ "eval_loss": 1.030529260635376,
52
+ "eval_runtime": 8.7329,
53
+ "eval_samples_per_second": 7.443,
54
+ "eval_steps_per_second": 3.779,
55
  "step": 60
56
  },
57
  {
58
+ "epoch": 1.096551724137931,
59
+ "grad_norm": 0.6047775149345398,
60
+ "learning_rate": 6.60377358490566e-05,
61
+ "loss": 0.8092,
62
  "step": 80
63
  },
64
  {
65
+ "epoch": 1.096551724137931,
66
+ "eval_loss": 1.0048853158950806,
67
+ "eval_runtime": 8.7488,
68
+ "eval_samples_per_second": 7.43,
69
+ "eval_steps_per_second": 3.772,
70
  "step": 80
71
  },
72
  {
73
+ "epoch": 1.3724137931034484,
74
+ "grad_norm": 0.9171755313873291,
75
+ "learning_rate": 5.660377358490566e-05,
76
+ "loss": 0.7883,
77
  "step": 100
78
  },
79
  {
80
+ "epoch": 1.3724137931034484,
81
+ "eval_loss": 0.9816080927848816,
82
+ "eval_runtime": 8.7226,
83
+ "eval_samples_per_second": 7.452,
84
+ "eval_steps_per_second": 3.783,
85
  "step": 100
86
  },
87
  {
88
+ "epoch": 1.6482758620689655,
89
+ "grad_norm": 0.5142741799354553,
90
+ "learning_rate": 4.716981132075472e-05,
91
+ "loss": 0.7641,
92
  "step": 120
93
  },
94
  {
95
+ "epoch": 1.6482758620689655,
96
+ "eval_loss": 0.9676293730735779,
97
+ "eval_runtime": 8.7198,
98
+ "eval_samples_per_second": 7.454,
99
+ "eval_steps_per_second": 3.785,
100
  "step": 120
101
  },
102
  {
103
+ "epoch": 1.9241379310344828,
104
+ "grad_norm": 0.4295555353164673,
105
+ "learning_rate": 3.7735849056603776e-05,
106
+ "loss": 0.79,
107
  "step": 140
108
  },
109
  {
110
+ "epoch": 1.9241379310344828,
111
+ "eval_loss": 0.9590840935707092,
112
+ "eval_runtime": 8.6822,
113
+ "eval_samples_per_second": 7.487,
114
+ "eval_steps_per_second": 3.801,
115
  "step": 140
116
  },
117
  {
118
+ "epoch": 2.193103448275862,
119
+ "grad_norm": 0.7362737059593201,
120
+ "learning_rate": 2.830188679245283e-05,
121
+ "loss": 0.8975,
122
  "step": 160
123
  },
124
  {
125
+ "epoch": 2.193103448275862,
126
+ "eval_loss": 0.9538200497627258,
127
+ "eval_runtime": 8.7607,
128
+ "eval_samples_per_second": 7.42,
129
+ "eval_steps_per_second": 3.767,
130
  "step": 160
131
  },
132
  {
133
+ "epoch": 2.4689655172413794,
134
+ "grad_norm": 0.6581431031227112,
135
+ "learning_rate": 1.8867924528301888e-05,
136
+ "loss": 0.7537,
137
  "step": 180
138
  },
139
  {
140
+ "epoch": 2.4689655172413794,
141
+ "eval_loss": 0.9525668025016785,
142
+ "eval_runtime": 8.7426,
143
+ "eval_samples_per_second": 7.435,
144
+ "eval_steps_per_second": 3.775,
145
  "step": 180
146
  },
147
  {
148
+ "epoch": 2.7448275862068967,
149
+ "grad_norm": 0.583696722984314,
150
+ "learning_rate": 9.433962264150944e-06,
151
+ "loss": 0.6484,
152
  "step": 200
153
  },
154
  {
155
+ "epoch": 2.7448275862068967,
156
+ "eval_loss": 0.946416974067688,
157
+ "eval_runtime": 8.7773,
158
+ "eval_samples_per_second": 7.405,
159
+ "eval_steps_per_second": 3.76,
160
  "step": 200
161
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  {
163
  "epoch": 3.0,
164
+ "step": 219,
165
+ "total_flos": 2.0274076806230016e+16,
166
+ "train_loss": 0.8416161210569617,
167
+ "train_runtime": 794.3439,
168
+ "train_samples_per_second": 2.19,
169
+ "train_steps_per_second": 0.276
170
  },
171
  {
172
  "epoch": 3.0,
173
+ "eval_loss": 0.946416974067688,
174
+ "eval_runtime": 8.7789,
175
+ "eval_samples_per_second": 7.404,
176
+ "eval_steps_per_second": 3.759,
177
+ "step": 219
178
  }
179
  ],
180
  "logging_steps": 20,
181
+ "max_steps": 219,
182
  "num_input_tokens_seen": 0,
183
  "num_train_epochs": 3,
184
  "save_steps": 20,
 
203
  "attributes": {}
204
  }
205
  },
206
+ "total_flos": 2.0274076806230016e+16,
207
  "train_batch_size": 2,
208
  "trial_name": null,
209
  "trial_params": null