YuanZ77 committed on
Commit
a63e6e5
·
verified ·
1 Parent(s): c5fc239

Model save

Browse files
Files changed (4) hide show
  1. README.md +5 -5
  2. all_results.json +4 -4
  3. train_results.json +4 -4
  4. trainer_state.json +76 -76
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  This model is a fine-tuned version of [google/gemma-2-2b-it](https://huggingface.co/google/gemma-2-2b-it) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 0.9842
24
 
25
  ## Model description
26
 
@@ -39,7 +39,7 @@ More information needed
39
  ### Training hyperparameters
40
 
41
  The following hyperparameters were used during training:
42
- - learning_rate: 2e-05
43
  - train_batch_size: 4
44
  - eval_batch_size: 4
45
  - seed: 42
@@ -57,9 +57,9 @@ The following hyperparameters were used during training:
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:------:|:----:|:---------------:|
60
- | 1.3036 | 0.9180 | 7 | 1.2136 |
61
- | 0.9604 | 1.9672 | 15 | 0.9997 |
62
- | 0.8292 | 2.7541 | 21 | 0.9842 |
63
 
64
 
65
  ### Framework versions
 
20
 
21
  This model is a fine-tuned version of [google/gemma-2-2b-it](https://huggingface.co/google/gemma-2-2b-it) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.0122
24
 
25
  ## Model description
26
 
 
39
  ### Training hyperparameters
40
 
41
  The following hyperparameters were used during training:
42
+ - learning_rate: 5e-05
43
  - train_batch_size: 4
44
  - eval_batch_size: 4
45
  - seed: 42
 
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:------:|:----:|:---------------:|
60
+ | 1.39 | 0.9180 | 7 | 1.3376 |
61
+ | 0.9541 | 1.9672 | 15 | 1.0215 |
62
+ | 0.745 | 2.7541 | 21 | 1.0122 |
63
 
64
 
65
  ### Framework versions
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 2.7540983606557377,
3
  "total_flos": 3971544514560.0,
4
- "train_loss": 1.268857215132032,
5
- "train_runtime": 396.9611,
6
  "train_samples": 726,
7
- "train_samples_per_second": 3.673,
8
- "train_steps_per_second": 0.053
9
  }
 
1
  {
2
  "epoch": 2.7540983606557377,
3
  "total_flos": 3971544514560.0,
4
+ "train_loss": 1.29271438008263,
5
+ "train_runtime": 605.3238,
6
  "train_samples": 726,
7
+ "train_samples_per_second": 2.409,
8
+ "train_steps_per_second": 0.035
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 2.7540983606557377,
3
  "total_flos": 3971544514560.0,
4
- "train_loss": 1.268857215132032,
5
- "train_runtime": 396.9611,
6
  "train_samples": 726,
7
- "train_samples_per_second": 3.673,
8
- "train_steps_per_second": 0.053
9
  }
 
1
  {
2
  "epoch": 2.7540983606557377,
3
  "total_flos": 3971544514560.0,
4
+ "train_loss": 1.29271438008263,
5
+ "train_runtime": 605.3238,
6
  "train_samples": 726,
7
+ "train_samples_per_second": 2.409,
8
+ "train_steps_per_second": 0.035
9
  }
trainer_state.json CHANGED
@@ -10,183 +10,183 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.13114754098360656,
13
- "grad_norm": 39.05130772271081,
14
- "learning_rate": 6.666666666666667e-06,
15
  "loss": 2.4376,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.26229508196721313,
20
- "grad_norm": 39.015973031999486,
21
- "learning_rate": 1.3333333333333333e-05,
22
  "loss": 2.4339,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.39344262295081966,
27
- "grad_norm": 15.152312064538828,
28
- "learning_rate": 2e-05,
29
- "loss": 1.8888,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.5245901639344263,
34
- "grad_norm": 29.273723674546083,
35
- "learning_rate": 1.9848077530122083e-05,
36
- "loss": 1.8725,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.6557377049180327,
41
- "grad_norm": 11.667536919946288,
42
- "learning_rate": 1.9396926207859085e-05,
43
- "loss": 1.7797,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.7868852459016393,
48
- "grad_norm": 5.696635279505377,
49
- "learning_rate": 1.866025403784439e-05,
50
- "loss": 1.4526,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.9180327868852459,
55
- "grad_norm": 6.1474254223603655,
56
- "learning_rate": 1.766044443118978e-05,
57
- "loss": 1.3036,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.9180327868852459,
62
- "eval_loss": 1.2136186361312866,
63
- "eval_runtime": 9.285,
64
- "eval_samples_per_second": 20.678,
65
- "eval_steps_per_second": 2.585,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 1.0491803278688525,
70
- "grad_norm": 3.8441152970886963,
71
- "learning_rate": 1.6427876096865394e-05,
72
- "loss": 1.193,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 1.180327868852459,
77
- "grad_norm": 3.216661811188678,
78
- "learning_rate": 1.5000000000000002e-05,
79
- "loss": 1.1161,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 1.3114754098360657,
84
- "grad_norm": 2.783161962423741,
85
- "learning_rate": 1.342020143325669e-05,
86
- "loss": 1.0656,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 1.4426229508196722,
91
- "grad_norm": 2.6684579204452517,
92
- "learning_rate": 1.1736481776669307e-05,
93
- "loss": 1.0333,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 1.5737704918032787,
98
- "grad_norm": 2.1293181551342286,
99
- "learning_rate": 1e-05,
100
- "loss": 1.0045,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 1.7049180327868854,
105
- "grad_norm": 1.7349053966220533,
106
- "learning_rate": 8.263518223330698e-06,
107
- "loss": 0.971,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 1.8360655737704918,
112
- "grad_norm": 1.708605250889302,
113
- "learning_rate": 6.579798566743314e-06,
114
- "loss": 0.9746,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 1.9672131147540983,
119
- "grad_norm": 1.565922544143404,
120
- "learning_rate": 5.000000000000003e-06,
121
- "loss": 0.9604,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 1.9672131147540983,
126
- "eval_loss": 0.9996973872184753,
127
- "eval_runtime": 9.0907,
128
- "eval_samples_per_second": 21.121,
129
- "eval_steps_per_second": 2.64,
130
  "step": 15
131
  },
132
  {
133
  "epoch": 2.098360655737705,
134
- "grad_norm": 1.4703758003535075,
135
- "learning_rate": 3.5721239031346067e-06,
136
- "loss": 0.9006,
137
  "step": 16
138
  },
139
  {
140
  "epoch": 2.2295081967213113,
141
- "grad_norm": 1.4546503498400736,
142
- "learning_rate": 2.339555568810221e-06,
143
- "loss": 0.8672,
144
  "step": 17
145
  },
146
  {
147
  "epoch": 2.360655737704918,
148
- "grad_norm": 1.333908386274604,
149
- "learning_rate": 1.339745962155613e-06,
150
- "loss": 0.8603,
151
  "step": 18
152
  },
153
  {
154
  "epoch": 2.4918032786885247,
155
- "grad_norm": 1.2309918853415422,
156
- "learning_rate": 6.030737921409169e-07,
157
- "loss": 0.8533,
158
  "step": 19
159
  },
160
  {
161
  "epoch": 2.6229508196721314,
162
- "grad_norm": 1.187570177680188,
163
- "learning_rate": 1.519224698779198e-07,
164
- "loss": 0.8482,
165
  "step": 20
166
  },
167
  {
168
  "epoch": 2.7540983606557377,
169
- "grad_norm": 1.1796688649183953,
170
  "learning_rate": 0.0,
171
- "loss": 0.8292,
172
  "step": 21
173
  },
174
  {
175
  "epoch": 2.7540983606557377,
176
- "eval_loss": 0.9841778874397278,
177
- "eval_runtime": 9.0575,
178
- "eval_samples_per_second": 21.198,
179
- "eval_steps_per_second": 2.65,
180
  "step": 21
181
  },
182
  {
183
  "epoch": 2.7540983606557377,
184
  "step": 21,
185
  "total_flos": 3971544514560.0,
186
- "train_loss": 1.268857215132032,
187
- "train_runtime": 396.9611,
188
- "train_samples_per_second": 3.673,
189
- "train_steps_per_second": 0.053
190
  }
191
  ],
192
  "logging_steps": 1,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.13114754098360656,
13
+ "grad_norm": 39.05072126696723,
14
+ "learning_rate": 1.6666666666666667e-05,
15
  "loss": 2.4376,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.26229508196721313,
20
+ "grad_norm": 39.016375245765836,
21
+ "learning_rate": 3.3333333333333335e-05,
22
  "loss": 2.4339,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.39344262295081966,
27
+ "grad_norm": 32.661267867985714,
28
+ "learning_rate": 5e-05,
29
+ "loss": 1.9547,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.5245901639344263,
34
+ "grad_norm": 13.966533792864706,
35
+ "learning_rate": 4.962019382530521e-05,
36
+ "loss": 2.0498,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.6557377049180327,
41
+ "grad_norm": 16.547294305437767,
42
+ "learning_rate": 4.849231551964771e-05,
43
+ "loss": 1.9131,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.7868852459016393,
48
+ "grad_norm": 7.978971381469852,
49
+ "learning_rate": 4.665063509461097e-05,
50
+ "loss": 1.5865,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.9180327868852459,
55
+ "grad_norm": 4.596921803156147,
56
+ "learning_rate": 4.415111107797445e-05,
57
+ "loss": 1.39,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.9180327868852459,
62
+ "eval_loss": 1.3376487493515015,
63
+ "eval_runtime": 17.0154,
64
+ "eval_samples_per_second": 11.284,
65
+ "eval_steps_per_second": 1.41,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 1.0491803278688525,
70
+ "grad_norm": 4.90093363719236,
71
+ "learning_rate": 4.1069690242163484e-05,
72
+ "loss": 1.2949,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 1.180327868852459,
77
+ "grad_norm": 4.892431290254693,
78
+ "learning_rate": 3.7500000000000003e-05,
79
+ "loss": 1.2567,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 1.3114754098360657,
84
+ "grad_norm": 2.793814810608094,
85
+ "learning_rate": 3.355050358314172e-05,
86
+ "loss": 1.1411,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 1.4426229508196722,
91
+ "grad_norm": 2.132448746823284,
92
+ "learning_rate": 2.9341204441673266e-05,
93
+ "loss": 1.0593,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 1.5737704918032787,
98
+ "grad_norm": 1.6858973805764268,
99
+ "learning_rate": 2.5e-05,
100
+ "loss": 1.0091,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 1.7049180327868854,
105
+ "grad_norm": 2.079280289342001,
106
+ "learning_rate": 2.0658795558326743e-05,
107
+ "loss": 0.977,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 1.8360655737704918,
112
+ "grad_norm": 1.5764028928084828,
113
+ "learning_rate": 1.6449496416858284e-05,
114
+ "loss": 0.9782,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 1.9672131147540983,
119
+ "grad_norm": 1.3883568841727698,
120
+ "learning_rate": 1.2500000000000006e-05,
121
+ "loss": 0.9541,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 1.9672131147540983,
126
+ "eval_loss": 1.0214608907699585,
127
+ "eval_runtime": 16.9378,
128
+ "eval_samples_per_second": 11.336,
129
+ "eval_steps_per_second": 1.417,
130
  "step": 15
131
  },
132
  {
133
  "epoch": 2.098360655737705,
134
+ "grad_norm": 1.4882737473631815,
135
+ "learning_rate": 8.930309757836517e-06,
136
+ "loss": 0.8517,
137
  "step": 16
138
  },
139
  {
140
  "epoch": 2.2295081967213113,
141
+ "grad_norm": 1.3413297585539747,
142
+ "learning_rate": 5.848888922025553e-06,
143
+ "loss": 0.7971,
144
  "step": 17
145
  },
146
  {
147
  "epoch": 2.360655737704918,
148
+ "grad_norm": 1.1809008492312185,
149
+ "learning_rate": 3.3493649053890326e-06,
150
+ "loss": 0.7859,
151
  "step": 18
152
  },
153
  {
154
  "epoch": 2.4918032786885247,
155
+ "grad_norm": 1.0943970674897627,
156
+ "learning_rate": 1.5076844803522922e-06,
157
+ "loss": 0.7711,
158
  "step": 19
159
  },
160
  {
161
  "epoch": 2.6229508196721314,
162
+ "grad_norm": 1.1052118316853639,
163
+ "learning_rate": 3.7980617469479953e-07,
164
+ "loss": 0.7603,
165
  "step": 20
166
  },
167
  {
168
  "epoch": 2.7540983606557377,
169
+ "grad_norm": 1.0269995716510345,
170
  "learning_rate": 0.0,
171
+ "loss": 0.745,
172
  "step": 21
173
  },
174
  {
175
  "epoch": 2.7540983606557377,
176
+ "eval_loss": 1.0121965408325195,
177
+ "eval_runtime": 9.0436,
178
+ "eval_samples_per_second": 21.231,
179
+ "eval_steps_per_second": 2.654,
180
  "step": 21
181
  },
182
  {
183
  "epoch": 2.7540983606557377,
184
  "step": 21,
185
  "total_flos": 3971544514560.0,
186
+ "train_loss": 1.29271438008263,
187
+ "train_runtime": 605.3238,
188
+ "train_samples_per_second": 2.409,
189
+ "train_steps_per_second": 0.035
190
  }
191
  ],
192
  "logging_steps": 1,