YuanZ77 committed on
Commit
bbfcc0f
·
verified ·
1 Parent(s): 671cbe0

Model save

Browse files
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  This model is a fine-tuned version of [google/gemma-2-2b-it](https://huggingface.co/google/gemma-2-2b-it) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 0.9877
24
 
25
  ## Model description
26
 
@@ -57,9 +57,9 @@ The following hyperparameters were used during training:
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:------:|:----:|:---------------:|
60
- | 1.2963 | 0.9180 | 7 | 1.2142 |
61
- | 0.954 | 1.9672 | 15 | 1.0018 |
62
- | 0.8356 | 2.7541 | 21 | 0.9877 |
63
 
64
 
65
  ### Framework versions
 
20
 
21
  This model is a fine-tuned version of [google/gemma-2-2b-it](https://huggingface.co/google/gemma-2-2b-it) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.0156
24
 
25
  ## Model description
26
 
 
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:------:|:----:|:---------------:|
60
+ | 1.3675 | 0.9180 | 7 | 1.2845 |
61
+ | 0.9289 | 1.9672 | 15 | 1.0186 |
62
+ | 0.7336 | 2.7541 | 21 | 1.0156 |
63
 
64
 
65
  ### Framework versions
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 2.7540983606557377,
3
  "total_flos": 3971544514560.0,
4
- "train_loss": 0.0,
5
- "train_runtime": 11.242,
6
  "train_samples": 725,
7
- "train_samples_per_second": 128.892,
8
- "train_steps_per_second": 1.868
9
  }
 
1
  {
2
  "epoch": 2.7540983606557377,
3
  "total_flos": 3971544514560.0,
4
+ "train_loss": 1.26855130422683,
5
+ "train_runtime": 385.6521,
6
  "train_samples": 725,
7
+ "train_samples_per_second": 3.757,
8
+ "train_steps_per_second": 0.054
9
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:679815afad561b9f08e9a30b4d955907c2185180cc65560f7cec06e72080f34f
3
  size 4988025760
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7dcff2d834517f116d11bb21e10303ce38f4453fd72f581734f5f3d7aecb1336
3
  size 4988025760
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e413fd577635aea4005f4a402d7e827ad3e8fdb78d080b5b6188909acb151f8
3
  size 240691728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:135f2ee801e93f5389fcc1f19c6c7bc6f1dcf4344f079ff6dc0e5b3a5ac3f5d7
3
  size 240691728
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 2.7540983606557377,
3
  "total_flos": 3971544514560.0,
4
- "train_loss": 0.0,
5
- "train_runtime": 11.242,
6
  "train_samples": 725,
7
- "train_samples_per_second": 128.892,
8
- "train_steps_per_second": 1.868
9
  }
 
1
  {
2
  "epoch": 2.7540983606557377,
3
  "total_flos": 3971544514560.0,
4
+ "train_loss": 1.26855130422683,
5
+ "train_runtime": 385.6521,
6
  "train_samples": 725,
7
+ "train_samples_per_second": 3.757,
8
+ "train_steps_per_second": 0.054
9
  }
trainer_state.json CHANGED
@@ -10,183 +10,183 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.13114754098360656,
13
- "grad_norm": 38.4906907414524,
14
- "learning_rate": 6.666666666666667e-06,
15
  "loss": 2.4278,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.26229508196721313,
20
- "grad_norm": 39.35649873095453,
21
- "learning_rate": 1.3333333333333333e-05,
22
  "loss": 2.4386,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.39344262295081966,
27
- "grad_norm": 15.02300885231628,
28
- "learning_rate": 2e-05,
29
- "loss": 1.8851,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.5245901639344263,
34
- "grad_norm": 30.015021153424833,
35
- "learning_rate": 1.9848077530122083e-05,
36
- "loss": 1.8762,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.6557377049180327,
41
- "grad_norm": 11.906470741405201,
42
- "learning_rate": 1.9396926207859085e-05,
43
- "loss": 1.7922,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.7868852459016393,
48
- "grad_norm": 6.013424294278915,
49
- "learning_rate": 1.866025403784439e-05,
50
- "loss": 1.4363,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.9180327868852459,
55
- "grad_norm": 5.488571980550277,
56
- "learning_rate": 1.766044443118978e-05,
57
- "loss": 1.2963,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.9180327868852459,
62
- "eval_loss": 1.2142459154129028,
63
- "eval_runtime": 9.3088,
64
- "eval_samples_per_second": 20.626,
65
- "eval_steps_per_second": 2.578,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 1.0491803278688525,
70
- "grad_norm": 4.278302382312637,
71
- "learning_rate": 1.6427876096865394e-05,
72
- "loss": 1.1874,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 1.180327868852459,
77
- "grad_norm": 3.204774458338129,
78
- "learning_rate": 1.5000000000000002e-05,
79
- "loss": 1.1064,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 1.3114754098360657,
84
- "grad_norm": 2.8914531523767013,
85
- "learning_rate": 1.342020143325669e-05,
86
- "loss": 1.0671,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 1.4426229508196722,
91
- "grad_norm": 2.2472150773134363,
92
- "learning_rate": 1.1736481776669307e-05,
93
- "loss": 1.0103,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 1.5737704918032787,
98
- "grad_norm": 2.093742944888527,
99
- "learning_rate": 1e-05,
100
- "loss": 1.0028,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 1.7049180327868854,
105
- "grad_norm": 1.8675088356857141,
106
- "learning_rate": 8.263518223330698e-06,
107
- "loss": 0.9762,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 1.8360655737704918,
112
- "grad_norm": 1.6366608455568707,
113
- "learning_rate": 6.579798566743314e-06,
114
- "loss": 0.9678,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 1.9672131147540983,
119
- "grad_norm": 1.5464740113993372,
120
- "learning_rate": 5.000000000000003e-06,
121
- "loss": 0.954,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 1.9672131147540983,
126
- "eval_loss": 1.001825213432312,
127
- "eval_runtime": 9.0799,
128
- "eval_samples_per_second": 21.145,
129
- "eval_steps_per_second": 2.643,
130
  "step": 15
131
  },
132
  {
133
  "epoch": 2.098360655737705,
134
- "grad_norm": 1.4407616891998447,
135
- "learning_rate": 3.5721239031346067e-06,
136
- "loss": 0.8805,
137
  "step": 16
138
  },
139
  {
140
  "epoch": 2.2295081967213113,
141
- "grad_norm": 1.4358137476258483,
142
- "learning_rate": 2.339555568810221e-06,
143
- "loss": 0.8681,
144
  "step": 17
145
  },
146
  {
147
  "epoch": 2.360655737704918,
148
- "grad_norm": 1.332633367234562,
149
- "learning_rate": 1.339745962155613e-06,
150
- "loss": 0.8592,
151
  "step": 18
152
  },
153
  {
154
  "epoch": 2.4918032786885247,
155
- "grad_norm": 1.2284953779578738,
156
- "learning_rate": 6.030737921409169e-07,
157
- "loss": 0.8545,
158
  "step": 19
159
  },
160
  {
161
  "epoch": 2.6229508196721314,
162
- "grad_norm": 1.2207700124089214,
163
- "learning_rate": 1.519224698779198e-07,
164
- "loss": 0.8337,
165
  "step": 20
166
  },
167
  {
168
  "epoch": 2.7540983606557377,
169
- "grad_norm": 1.1315984846861744,
170
  "learning_rate": 0.0,
171
- "loss": 0.8356,
172
  "step": 21
173
  },
174
  {
175
  "epoch": 2.7540983606557377,
176
- "eval_loss": 0.9876740574836731,
177
- "eval_runtime": 8.9893,
178
- "eval_samples_per_second": 21.359,
179
- "eval_steps_per_second": 2.67,
180
  "step": 21
181
  },
182
  {
183
  "epoch": 2.7540983606557377,
184
  "step": 21,
185
  "total_flos": 3971544514560.0,
186
- "train_loss": 0.0,
187
- "train_runtime": 11.242,
188
- "train_samples_per_second": 128.892,
189
- "train_steps_per_second": 1.868
190
  }
191
  ],
192
  "logging_steps": 1,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.13114754098360656,
13
+ "grad_norm": 38.48949567054442,
14
+ "learning_rate": 1.6666666666666667e-05,
15
  "loss": 2.4278,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.26229508196721313,
20
+ "grad_norm": 39.356777754899475,
21
+ "learning_rate": 3.3333333333333335e-05,
22
  "loss": 2.4386,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.39344262295081966,
27
+ "grad_norm": 33.30743701183079,
28
+ "learning_rate": 5e-05,
29
+ "loss": 1.9706,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.5245901639344263,
34
+ "grad_norm": 14.11009284862285,
35
+ "learning_rate": 4.962019382530521e-05,
36
+ "loss": 2.046,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.6557377049180327,
41
+ "grad_norm": 15.728498285095311,
42
+ "learning_rate": 4.849231551964771e-05,
43
+ "loss": 1.8774,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.7868852459016393,
48
+ "grad_norm": 7.811945380246204,
49
+ "learning_rate": 4.665063509461097e-05,
50
+ "loss": 1.555,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.9180327868852459,
55
+ "grad_norm": 4.212013153204252,
56
+ "learning_rate": 4.415111107797445e-05,
57
+ "loss": 1.3675,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.9180327868852459,
62
+ "eval_loss": 1.2844842672348022,
63
+ "eval_runtime": 9.2887,
64
+ "eval_samples_per_second": 20.67,
65
+ "eval_steps_per_second": 2.584,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 1.0491803278688525,
70
+ "grad_norm": 4.5268328646450735,
71
+ "learning_rate": 4.1069690242163484e-05,
72
+ "loss": 1.2474,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 1.180327868852459,
77
+ "grad_norm": 3.2715775686151596,
78
+ "learning_rate": 3.7500000000000003e-05,
79
+ "loss": 1.1467,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 1.3114754098360657,
84
+ "grad_norm": 2.0643659092546915,
85
+ "learning_rate": 3.355050358314172e-05,
86
+ "loss": 1.0749,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 1.4426229508196722,
91
+ "grad_norm": 2.304739353698177,
92
+ "learning_rate": 2.9341204441673266e-05,
93
+ "loss": 1.0221,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 1.5737704918032787,
98
+ "grad_norm": 2.127617323697091,
99
+ "learning_rate": 2.5e-05,
100
+ "loss": 1.0107,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 1.7049180327868854,
105
+ "grad_norm": 2.0707864059238807,
106
+ "learning_rate": 2.0658795558326743e-05,
107
+ "loss": 0.9786,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 1.8360655737704918,
112
+ "grad_norm": 1.3495555593879531,
113
+ "learning_rate": 1.6449496416858284e-05,
114
+ "loss": 0.956,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 1.9672131147540983,
119
+ "grad_norm": 1.420785537317645,
120
+ "learning_rate": 1.2500000000000006e-05,
121
+ "loss": 0.9289,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 1.9672131147540983,
126
+ "eval_loss": 1.018620491027832,
127
+ "eval_runtime": 9.0608,
128
+ "eval_samples_per_second": 21.19,
129
+ "eval_steps_per_second": 2.649,
130
  "step": 15
131
  },
132
  {
133
  "epoch": 2.098360655737705,
134
+ "grad_norm": 1.2601137070878647,
135
+ "learning_rate": 8.930309757836517e-06,
136
+ "loss": 0.8122,
137
  "step": 16
138
  },
139
  {
140
  "epoch": 2.2295081967213113,
141
+ "grad_norm": 1.3786806913074527,
142
+ "learning_rate": 5.848888922025553e-06,
143
+ "loss": 0.7862,
144
  "step": 17
145
  },
146
  {
147
  "epoch": 2.360655737704918,
148
+ "grad_norm": 1.1372265617537292,
149
+ "learning_rate": 3.3493649053890326e-06,
150
+ "loss": 0.7698,
151
  "step": 18
152
  },
153
  {
154
  "epoch": 2.4918032786885247,
155
+ "grad_norm": 1.085080138353371,
156
+ "learning_rate": 1.5076844803522922e-06,
157
+ "loss": 0.7537,
158
  "step": 19
159
  },
160
  {
161
  "epoch": 2.6229508196721314,
162
+ "grad_norm": 1.067963510724275,
163
+ "learning_rate": 3.7980617469479953e-07,
164
+ "loss": 0.7358,
165
  "step": 20
166
  },
167
  {
168
  "epoch": 2.7540983606557377,
169
+ "grad_norm": 1.0345534581342344,
170
  "learning_rate": 0.0,
171
+ "loss": 0.7336,
172
  "step": 21
173
  },
174
  {
175
  "epoch": 2.7540983606557377,
176
+ "eval_loss": 1.0155988931655884,
177
+ "eval_runtime": 8.9999,
178
+ "eval_samples_per_second": 21.334,
179
+ "eval_steps_per_second": 2.667,
180
  "step": 21
181
  },
182
  {
183
  "epoch": 2.7540983606557377,
184
  "step": 21,
185
  "total_flos": 3971544514560.0,
186
+ "train_loss": 1.26855130422683,
187
+ "train_runtime": 385.6521,
188
+ "train_samples_per_second": 3.757,
189
+ "train_steps_per_second": 0.054
190
  }
191
  ],
192
  "logging_steps": 1,