IeBoytsov committed on
Commit
98ad7f4
·
verified ·
1 Parent(s): 6062372

Model save

Browse files
Files changed (4) hide show
  1. README.md +5 -0
  2. all_results.json +7 -7
  3. train_results.json +7 -7
  4. trainer_state.json +80 -177
README.md CHANGED
@@ -19,6 +19,8 @@ should probably proofread and complete it, then remove this comment. -->
19
  # llama-3-1-sft-qlora-test
20
 
21
  This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the generator dataset.
 
 
22
 
23
  ## Model description
24
 
@@ -51,6 +53,9 @@ The following hyperparameters were used during training:
51
 
52
  ### Training results
53
 
 
 
 
54
 
55
 
56
  ### Framework versions
 
19
  # llama-3-1-sft-qlora-test
20
 
21
  This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the generator dataset.
22
+ It achieves the following results on the evaluation set:
23
+ - Loss: 1.1645
24
 
25
  ## Model description
26
 
 
53
 
54
  ### Training results
55
 
56
+ | Training Loss | Epoch | Step | Validation Loss |
57
+ |:-------------:|:------:|:----:|:---------------:|
58
+ | 1.2385 | 0.9934 | 75 | 1.1645 |
59
 
60
 
61
  ### Framework versions
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 1.0,
3
- "total_flos": 1.1183951188905165e+17,
4
- "train_loss": 0.0,
5
- "train_runtime": 0.0082,
6
- "train_samples": 2078,
7
- "train_samples_per_second": 147602.294,
8
- "train_steps_per_second": 18480.884
9
  }
 
1
  {
2
+ "epoch": 0.9933774834437086,
3
+ "total_flos": 5.564154814608179e+16,
4
+ "train_loss": 1.2146572240193685,
5
+ "train_runtime": 3027.7749,
6
+ "train_samples": 1039,
7
+ "train_samples_per_second": 0.199,
8
+ "train_steps_per_second": 0.025
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 1.0,
3
- "total_flos": 1.1183951188905165e+17,
4
- "train_loss": 0.0,
5
- "train_runtime": 0.0082,
6
- "train_samples": 2078,
7
- "train_samples_per_second": 147602.294,
8
- "train_steps_per_second": 18480.884
9
  }
 
1
  {
2
+ "epoch": 0.9933774834437086,
3
+ "total_flos": 5.564154814608179e+16,
4
+ "train_loss": 1.2146572240193685,
5
+ "train_runtime": 3027.7749,
6
+ "train_samples": 1039,
7
+ "train_samples_per_second": 0.199,
8
+ "train_steps_per_second": 0.025
9
  }
trainer_state.json CHANGED
@@ -1,242 +1,145 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 151,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.006622516556291391,
13
- "grad_norm": 0.39227479696273804,
14
- "learning_rate": 1.25e-05,
15
- "loss": 1.5025,
16
  "step": 1
17
  },
18
- {
19
- "epoch": 0.033112582781456956,
20
- "grad_norm": 0.6300222873687744,
21
- "learning_rate": 6.25e-05,
22
- "loss": 1.4454,
23
- "step": 5
24
- },
25
  {
26
  "epoch": 0.06622516556291391,
27
- "grad_norm": 0.2265906035900116,
28
  "learning_rate": 0.000125,
29
- "loss": 1.3232,
30
- "step": 10
31
- },
32
- {
33
- "epoch": 0.09933774834437085,
34
- "grad_norm": 0.15402744710445404,
35
- "learning_rate": 0.0001875,
36
- "loss": 1.2574,
37
- "step": 15
38
  },
39
  {
40
  "epoch": 0.13245033112582782,
41
- "grad_norm": 0.2852768301963806,
42
- "learning_rate": 0.00019956707906498044,
43
- "loss": 1.2792,
44
- "step": 20
45
- },
46
- {
47
- "epoch": 0.16556291390728478,
48
- "grad_norm": 0.15974751114845276,
49
- "learning_rate": 0.00019781476007338058,
50
- "loss": 1.2323,
51
- "step": 25
52
  },
53
  {
54
  "epoch": 0.1986754966887417,
55
- "grad_norm": 0.14968091249465942,
56
- "learning_rate": 0.00019473966425143292,
57
- "loss": 1.1704,
58
- "step": 30
59
- },
60
- {
61
- "epoch": 0.23178807947019867,
62
- "grad_norm": 0.14632880687713623,
63
- "learning_rate": 0.00019038337699485208,
64
- "loss": 1.1832,
65
- "step": 35
66
  },
67
  {
68
  "epoch": 0.26490066225165565,
69
- "grad_norm": 0.14008674025535583,
70
- "learning_rate": 0.0001848048096156426,
71
- "loss": 1.1302,
72
- "step": 40
73
- },
74
- {
75
- "epoch": 0.2980132450331126,
76
- "grad_norm": 0.13251474499702454,
77
- "learning_rate": 0.00017807940266766593,
78
- "loss": 1.1606,
79
- "step": 45
80
  },
81
  {
82
  "epoch": 0.33112582781456956,
83
- "grad_norm": 0.1554378867149353,
84
- "learning_rate": 0.0001702981057425662,
85
- "loss": 1.1845,
86
- "step": 50
87
- },
88
- {
89
- "epoch": 0.36423841059602646,
90
- "grad_norm": 0.13782458007335663,
91
- "learning_rate": 0.0001615661475325658,
92
- "loss": 1.1304,
93
- "step": 55
94
  },
95
  {
96
  "epoch": 0.3973509933774834,
97
- "grad_norm": 0.1376216560602188,
98
- "learning_rate": 0.00015200161279292155,
99
- "loss": 1.1386,
100
- "step": 60
101
- },
102
- {
103
- "epoch": 0.4304635761589404,
104
- "grad_norm": 0.13750730454921722,
105
- "learning_rate": 0.0001417338454481818,
106
- "loss": 1.1886,
107
- "step": 65
108
  },
109
  {
110
  "epoch": 0.46357615894039733,
111
- "grad_norm": 0.14349249005317688,
112
- "learning_rate": 0.00013090169943749476,
113
- "loss": 1.2558,
114
- "step": 70
115
- },
116
- {
117
- "epoch": 0.4966887417218543,
118
- "grad_norm": 0.1315854787826538,
119
- "learning_rate": 0.00011965166095328301,
120
- "loss": 1.2015,
121
- "step": 75
122
  },
123
  {
124
  "epoch": 0.5298013245033113,
125
- "grad_norm": 0.13306115567684174,
126
- "learning_rate": 0.00010813586746678583,
127
- "loss": 1.1666,
128
- "step": 80
129
- },
130
- {
131
- "epoch": 0.5629139072847682,
132
- "grad_norm": 0.1344432234764099,
133
- "learning_rate": 9.651005032974994e-05,
134
- "loss": 1.1596,
135
- "step": 85
136
  },
137
  {
138
  "epoch": 0.5960264900662252,
139
- "grad_norm": 0.14689402282238007,
140
- "learning_rate": 8.49314287750517e-05,
141
- "loss": 1.1657,
142
- "step": 90
143
- },
144
- {
145
- "epoch": 0.6291390728476821,
146
- "grad_norm": 0.13607865571975708,
147
- "learning_rate": 7.35565837962798e-05,
148
- "loss": 1.1268,
149
- "step": 95
150
  },
151
  {
152
  "epoch": 0.6622516556291391,
153
- "grad_norm": 0.14480474591255188,
154
- "learning_rate": 6.25393406584088e-05,
155
- "loss": 1.1523,
156
- "step": 100
157
- },
158
- {
159
- "epoch": 0.695364238410596,
160
- "grad_norm": 0.13964812457561493,
161
- "learning_rate": 5.2028688674975415e-05,
162
- "loss": 1.1185,
163
- "step": 105
164
  },
165
  {
166
  "epoch": 0.7284768211920529,
167
- "grad_norm": 0.13802023231983185,
168
- "learning_rate": 4.216676638320135e-05,
169
- "loss": 1.1517,
170
- "step": 110
171
- },
172
- {
173
- "epoch": 0.7615894039735099,
174
- "grad_norm": 0.13757328689098358,
175
- "learning_rate": 3.308693936411421e-05,
176
- "loss": 1.1655,
177
- "step": 115
178
  },
179
  {
180
  "epoch": 0.7947019867549668,
181
- "grad_norm": 0.14344969391822815,
182
- "learning_rate": 2.491199670185008e-05,
183
- "loss": 1.1571,
184
- "step": 120
185
- },
186
- {
187
- "epoch": 0.8278145695364238,
188
- "grad_norm": 0.15166440606117249,
189
- "learning_rate": 1.775249047193377e-05,
190
- "loss": 1.2056,
191
- "step": 125
192
  },
193
  {
194
  "epoch": 0.8609271523178808,
195
- "grad_norm": 0.13406524062156677,
196
- "learning_rate": 1.1705240714107302e-05,
197
- "loss": 1.1083,
198
- "step": 130
199
- },
200
- {
201
- "epoch": 0.8940397350993378,
202
- "grad_norm": 0.1382841318845749,
203
- "learning_rate": 6.852026107385756e-06,
204
- "loss": 1.1221,
205
- "step": 135
206
  },
207
  {
208
  "epoch": 0.9271523178807947,
209
- "grad_norm": 0.17914775013923645,
210
- "learning_rate": 3.2584780537136207e-06,
211
- "loss": 1.2247,
212
- "step": 140
213
  },
214
  {
215
- "epoch": 0.9602649006622517,
216
- "grad_norm": 0.14941184222698212,
217
- "learning_rate": 9.731931258429638e-07,
218
- "loss": 1.1813,
219
- "step": 145
220
  },
221
  {
222
  "epoch": 0.9933774834437086,
223
- "grad_norm": 0.13595885038375854,
224
- "learning_rate": 2.7075882053828605e-08,
225
- "loss": 1.1418,
226
- "step": 150
 
227
  },
228
  {
229
- "epoch": 1.0,
230
- "step": 151,
231
- "total_flos": 1.1183951188905165e+17,
232
- "train_loss": 0.0,
233
- "train_runtime": 0.0082,
234
- "train_samples_per_second": 147602.294,
235
- "train_steps_per_second": 18480.884
236
  }
237
  ],
238
  "logging_steps": 5,
239
- "max_steps": 151,
240
  "num_input_tokens_seen": 0,
241
  "num_train_epochs": 1,
242
  "save_steps": 100,
@@ -252,7 +155,7 @@
252
  "attributes": {}
253
  }
254
  },
255
- "total_flos": 1.1183951188905165e+17,
256
  "train_batch_size": 4,
257
  "trial_name": null,
258
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9933774834437086,
5
  "eval_steps": 500,
6
+ "global_step": 75,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.013245033112582781,
13
+ "grad_norm": 0.4943664073944092,
14
+ "learning_rate": 2.5e-05,
15
+ "loss": 1.4272,
16
  "step": 1
17
  },
 
 
 
 
 
 
 
18
  {
19
  "epoch": 0.06622516556291391,
20
+ "grad_norm": 0.20984387397766113,
21
  "learning_rate": 0.000125,
22
+ "loss": 1.3101,
23
+ "step": 5
 
 
 
 
 
 
 
24
  },
25
  {
26
  "epoch": 0.13245033112582782,
27
+ "grad_norm": 0.2290477156639099,
28
+ "learning_rate": 0.00019956059820218982,
29
+ "loss": 1.2917,
30
+ "step": 10
 
 
 
 
 
 
 
31
  },
32
  {
33
  "epoch": 0.1986754966887417,
34
+ "grad_norm": 0.15163910388946533,
35
+ "learning_rate": 0.00019466156752904343,
36
+ "loss": 1.2823,
37
+ "step": 15
 
 
 
 
 
 
 
38
  },
39
  {
40
  "epoch": 0.26490066225165565,
41
+ "grad_norm": 0.1627238243818283,
42
+ "learning_rate": 0.00018458320592590975,
43
+ "loss": 1.1889,
44
+ "step": 20
 
 
 
 
 
 
 
45
  },
46
  {
47
  "epoch": 0.33112582781456956,
48
+ "grad_norm": 0.15383219718933105,
49
+ "learning_rate": 0.00016987694277788417,
50
+ "loss": 1.198,
51
+ "step": 25
 
 
 
 
 
 
 
52
  },
53
  {
54
  "epoch": 0.3973509933774834,
55
+ "grad_norm": 0.1501755714416504,
56
+ "learning_rate": 0.0001513474193514842,
57
+ "loss": 1.1762,
58
+ "step": 30
 
 
 
 
 
 
 
59
  },
60
  {
61
  "epoch": 0.46357615894039733,
62
+ "grad_norm": 0.14539840817451477,
63
+ "learning_rate": 0.0001300084635000341,
64
+ "loss": 1.2176,
65
+ "step": 35
 
 
 
 
 
 
 
66
  },
67
  {
68
  "epoch": 0.5298013245033113,
69
+ "grad_norm": 0.12844280898571014,
70
+ "learning_rate": 0.0001070276188945293,
71
+ "loss": 1.1942,
72
+ "step": 40
 
 
 
 
 
 
 
73
  },
74
  {
75
  "epoch": 0.5960264900662252,
76
+ "grad_norm": 0.13806107640266418,
77
+ "learning_rate": 8.366226381814697e-05,
78
+ "loss": 1.2928,
79
+ "step": 45
 
 
 
 
 
 
 
80
  },
81
  {
82
  "epoch": 0.6622516556291391,
83
+ "grad_norm": 0.13188520073890686,
84
+ "learning_rate": 6.119081473277501e-05,
85
+ "loss": 1.1959,
86
+ "step": 50
 
 
 
 
 
 
 
87
  },
88
  {
89
  "epoch": 0.7284768211920529,
90
+ "grad_norm": 0.12824179232120514,
91
+ "learning_rate": 4.084277875864776e-05,
92
+ "loss": 1.1188,
93
+ "step": 55
 
 
 
 
 
 
 
94
  },
95
  {
96
  "epoch": 0.7947019867549668,
97
+ "grad_norm": 0.14250224828720093,
98
+ "learning_rate": 2.3731482188961818e-05,
99
+ "loss": 1.2076,
100
+ "step": 60
 
 
 
 
 
 
 
101
  },
102
  {
103
  "epoch": 0.8609271523178808,
104
+ "grad_norm": 0.14001749455928802,
105
+ "learning_rate": 1.0793155744261351e-05,
106
+ "loss": 1.1352,
107
+ "step": 65
 
 
 
 
 
 
 
108
  },
109
  {
110
  "epoch": 0.9271523178807947,
111
+ "grad_norm": 0.15154731273651123,
112
+ "learning_rate": 2.735709467518699e-06,
113
+ "loss": 1.1486,
114
+ "step": 70
115
  },
116
  {
117
+ "epoch": 0.9933774834437086,
118
+ "grad_norm": 0.14987458288669586,
119
+ "learning_rate": 0.0,
120
+ "loss": 1.2385,
121
+ "step": 75
122
  },
123
  {
124
  "epoch": 0.9933774834437086,
125
+ "eval_loss": 1.1645171642303467,
126
+ "eval_runtime": 2610.6612,
127
+ "eval_samples_per_second": 5.155,
128
+ "eval_steps_per_second": 0.645,
129
+ "step": 75
130
  },
131
  {
132
+ "epoch": 0.9933774834437086,
133
+ "step": 75,
134
+ "total_flos": 5.564154814608179e+16,
135
+ "train_loss": 1.2146572240193685,
136
+ "train_runtime": 3027.7749,
137
+ "train_samples_per_second": 0.199,
138
+ "train_steps_per_second": 0.025
139
  }
140
  ],
141
  "logging_steps": 5,
142
+ "max_steps": 75,
143
  "num_input_tokens_seen": 0,
144
  "num_train_epochs": 1,
145
  "save_steps": 100,
 
155
  "attributes": {}
156
  }
157
  },
158
+ "total_flos": 5.564154814608179e+16,
159
  "train_batch_size": 4,
160
  "trial_name": null,
161
  "trial_params": null