rovdetection committed on
Commit
4153006
·
verified ·
1 Parent(s): b1a59e8

Training in progress, step 1000, checkpoint

Browse files
last-checkpoint/README.md CHANGED
@@ -206,4 +206,4 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
206
  [More Information Needed]
207
  ### Framework versions
208
 
209
- - PEFT 0.19.1
 
206
  [More Information Needed]
207
  ### Framework versions
208
 
209
+ - PEFT 0.18.1
last-checkpoint/adapter_config.json CHANGED
@@ -19,29 +19,27 @@
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
22
- "lora_ga_config": null,
23
  "megatron_config": null,
24
  "megatron_core": "megatron.core",
25
  "modules_to_save": null,
26
  "peft_type": "LORA",
27
- "peft_version": "0.19.1",
28
  "qalora_group_size": 16,
29
  "r": 128,
30
  "rank_pattern": {},
31
  "revision": null,
32
  "target_modules": [
 
 
33
  "v_proj",
34
- "gate_proj",
35
  "up_proj",
36
- "down_proj",
37
- "k_proj",
38
- "q_proj",
39
- "o_proj"
40
  ],
41
  "target_parameters": null,
42
  "task_type": "CAUSAL_LM",
43
  "trainable_token_indices": null,
44
- "use_bdlora": null,
45
  "use_dora": false,
46
  "use_qalora": false,
47
  "use_rslora": false
 
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
  "lora_dropout": 0.05,
 
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
25
  "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
  "qalora_group_size": 16,
28
  "r": 128,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "q_proj",
33
+ "down_proj",
34
  "v_proj",
35
+ "o_proj",
36
  "up_proj",
37
+ "gate_proj",
38
+ "k_proj"
 
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
42
  "trainable_token_indices": null,
 
43
  "use_dora": false,
44
  "use_qalora": false,
45
  "use_rslora": false
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bbc31def6fc1183573b4ca53bf82830b3e675c174bc8741079dee8f82ace3c3d
3
  size 363365712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ece9918ba6ae7ff5c6354d8843a6ff5e36f4fd5dcd0c2fc4f171e781fa6c7b95
3
  size 363365712
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:618454c138b7364f8dc01f3db6c85573dc0fe52fe5fefec5e12a9d9ba2f00f2f
3
  size 184804245
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c790bf95a1b16a87fcf911b7884e6a62ad5b0af02cfb8b9640661805d06a06a
3
  size 184804245
last-checkpoint/trainer_state.json CHANGED
@@ -10,202 +10,202 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 2.937763540148735,
14
  "epoch": 0.02127659574468085,
15
- "grad_norm": 0.4686773121356964,
16
  "learning_rate": 9.951e-05,
17
- "loss": 4.138067016601562,
18
- "mean_token_accuracy": 0.4264019676297903,
19
  "num_tokens": 988301.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 2.962523487210274,
24
  "epoch": 0.0425531914893617,
25
- "grad_norm": 0.6529152989387512,
26
  "learning_rate": 9.901e-05,
27
- "loss": 3.1858258056640625,
28
- "mean_token_accuracy": 0.491901636198163,
29
  "num_tokens": 1981516.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 2.8207009065151216,
34
  "epoch": 0.06382978723404255,
35
- "grad_norm": 0.6847605109214783,
36
  "learning_rate": 9.851e-05,
37
- "loss": 2.97326904296875,
38
- "mean_token_accuracy": 0.5133598280698061,
39
  "num_tokens": 2962772.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 2.6926355504989625,
44
  "epoch": 0.0851063829787234,
45
- "grad_norm": 0.7111132740974426,
46
  "learning_rate": 9.801e-05,
47
- "loss": 2.8132635498046876,
48
- "mean_token_accuracy": 0.5304626803100109,
49
  "num_tokens": 3952403.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 2.6115103197097778,
54
  "epoch": 0.10638297872340426,
55
- "grad_norm": 0.6989225149154663,
56
  "learning_rate": 9.751e-05,
57
- "loss": 2.7149212646484373,
58
- "mean_token_accuracy": 0.5415211926400662,
59
  "num_tokens": 4939231.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 2.552058919072151,
64
  "epoch": 0.1276595744680851,
65
- "grad_norm": 0.6323665976524353,
66
  "learning_rate": 9.701e-05,
67
- "loss": 2.6398919677734374,
68
- "mean_token_accuracy": 0.5516271162033081,
69
  "num_tokens": 5931036.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 2.4714413863420486,
74
  "epoch": 0.14893617021276595,
75
- "grad_norm": 0.6492835283279419,
76
  "learning_rate": 9.651e-05,
77
- "loss": 2.546295471191406,
78
- "mean_token_accuracy": 0.5627167555689812,
79
  "num_tokens": 6919236.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 2.452097176015377,
84
  "epoch": 0.1702127659574468,
85
- "grad_norm": 0.7032881379127502,
86
  "learning_rate": 9.601e-05,
87
- "loss": 2.5202024841308592,
88
- "mean_token_accuracy": 0.5672398428618908,
89
  "num_tokens": 7901385.0,
90
  "step": 400
91
  },
92
  {
93
- "entropy": 2.38740619212389,
94
  "epoch": 0.19148936170212766,
95
- "grad_norm": 0.715317964553833,
96
  "learning_rate": 9.551e-05,
97
- "loss": 2.449654541015625,
98
- "mean_token_accuracy": 0.5774810115993023,
99
  "num_tokens": 8891410.0,
100
  "step": 450
101
  },
102
  {
103
- "entropy": 2.339343198239803,
104
  "epoch": 0.2127659574468085,
105
- "grad_norm": 0.7024112939834595,
106
  "learning_rate": 9.501e-05,
107
- "loss": 2.399354248046875,
108
- "mean_token_accuracy": 0.584610120356083,
109
  "num_tokens": 9872429.0,
110
  "step": 500
111
  },
112
  {
113
- "entropy": 2.3019780376553536,
114
  "epoch": 0.23404255319148937,
115
- "grad_norm": 0.7314584851264954,
116
  "learning_rate": 9.451000000000002e-05,
117
- "loss": 2.3609829711914063,
118
- "mean_token_accuracy": 0.588694809526205,
119
  "num_tokens": 10852406.0,
120
  "step": 550
121
  },
122
  {
123
- "entropy": 2.2801691934466364,
124
  "epoch": 0.2553191489361702,
125
- "grad_norm": 0.7630459070205688,
126
  "learning_rate": 9.401e-05,
127
- "loss": 2.322369384765625,
128
- "mean_token_accuracy": 0.5943487723916769,
129
  "num_tokens": 11847983.0,
130
  "step": 600
131
  },
132
  {
133
- "entropy": 2.231162509918213,
134
  "epoch": 0.2765957446808511,
135
- "grad_norm": 0.7081300616264343,
136
  "learning_rate": 9.351e-05,
137
- "loss": 2.2724359130859373,
138
- "mean_token_accuracy": 0.6005339217931032,
139
  "num_tokens": 12839631.0,
140
  "step": 650
141
  },
142
  {
143
- "entropy": 2.195046606659889,
144
  "epoch": 0.2978723404255319,
145
- "grad_norm": 0.8117260336875916,
146
  "learning_rate": 9.301e-05,
147
- "loss": 2.2360572814941406,
148
- "mean_token_accuracy": 0.6055160685628652,
149
  "num_tokens": 13825421.0,
150
  "step": 700
151
  },
152
  {
153
- "entropy": 2.17775638371706,
154
  "epoch": 0.3191489361702128,
155
- "grad_norm": 0.7083834409713745,
156
  "learning_rate": 9.251000000000001e-05,
157
- "loss": 2.2109527587890625,
158
- "mean_token_accuracy": 0.610633347928524,
159
  "num_tokens": 14811783.0,
160
  "step": 750
161
  },
162
  {
163
- "entropy": 2.1384602162241935,
164
  "epoch": 0.3404255319148936,
165
- "grad_norm": 0.6845762729644775,
166
  "learning_rate": 9.201000000000001e-05,
167
- "loss": 2.1738687133789063,
168
- "mean_token_accuracy": 0.6158557101339102,
169
  "num_tokens": 15799402.0,
170
  "step": 800
171
  },
172
  {
173
- "entropy": 2.1188551610708237,
174
  "epoch": 0.3617021276595745,
175
- "grad_norm": 0.7421966791152954,
176
  "learning_rate": 9.151000000000001e-05,
177
- "loss": 2.146589660644531,
178
- "mean_token_accuracy": 0.6187865848094225,
179
  "num_tokens": 16787125.0,
180
  "step": 850
181
  },
182
  {
183
- "entropy": 2.0914336186647415,
184
  "epoch": 0.3829787234042553,
185
- "grad_norm": 0.706660807132721,
186
  "learning_rate": 9.101000000000001e-05,
187
- "loss": 2.1187652587890624,
188
- "mean_token_accuracy": 0.6228364047408104,
189
  "num_tokens": 17783798.0,
190
  "step": 900
191
  },
192
  {
193
- "entropy": 2.0830547219514846,
194
  "epoch": 0.40425531914893614,
195
- "grad_norm": 0.7128080725669861,
196
  "learning_rate": 9.051000000000001e-05,
197
- "loss": 2.10831298828125,
198
- "mean_token_accuracy": 0.6251034809648991,
199
  "num_tokens": 18760192.0,
200
  "step": 950
201
  },
202
  {
203
- "entropy": 2.042051522433758,
204
  "epoch": 0.425531914893617,
205
- "grad_norm": 0.7421649098396301,
206
  "learning_rate": 9.001e-05,
207
- "loss": 2.059340362548828,
208
- "mean_token_accuracy": 0.6305920536071062,
209
  "num_tokens": 19742108.0,
210
  "step": 1000
211
  }
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 2.933625968694687,
14
  "epoch": 0.02127659574468085,
15
+ "grad_norm": 0.4740275740623474,
16
  "learning_rate": 9.951e-05,
17
+ "loss": 4.140991821289062,
18
+ "mean_token_accuracy": 0.4264977302402258,
19
  "num_tokens": 988301.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 2.962103115916252,
24
  "epoch": 0.0425531914893617,
25
+ "grad_norm": 0.654786229133606,
26
  "learning_rate": 9.901e-05,
27
+ "loss": 3.1859881591796877,
28
+ "mean_token_accuracy": 0.4919606515020132,
29
  "num_tokens": 1981516.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 2.8202617847919464,
34
  "epoch": 0.06382978723404255,
35
+ "grad_norm": 0.6856227517127991,
36
  "learning_rate": 9.851e-05,
37
+ "loss": 2.973466796875,
38
+ "mean_token_accuracy": 0.5133317831903697,
39
  "num_tokens": 2962772.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 2.692441967725754,
44
  "epoch": 0.0851063829787234,
45
+ "grad_norm": 0.7118680477142334,
46
  "learning_rate": 9.801e-05,
47
+ "loss": 2.8135525512695314,
48
+ "mean_token_accuracy": 0.5304926330596209,
49
  "num_tokens": 3952403.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 2.6121814751625063,
54
  "epoch": 0.10638297872340426,
55
+ "grad_norm": 0.7034955620765686,
56
  "learning_rate": 9.751e-05,
57
+ "loss": 2.715215148925781,
58
+ "mean_token_accuracy": 0.5413537514209747,
59
  "num_tokens": 4939231.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 2.551229443252087,
64
  "epoch": 0.1276595744680851,
65
+ "grad_norm": 0.6356984972953796,
66
  "learning_rate": 9.701e-05,
67
+ "loss": 2.6399871826171877,
68
+ "mean_token_accuracy": 0.55161267362535,
69
  "num_tokens": 5931036.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 2.471001845598221,
74
  "epoch": 0.14893617021276595,
75
+ "grad_norm": 0.653047502040863,
76
  "learning_rate": 9.651e-05,
77
+ "loss": 2.5464208984375,
78
+ "mean_token_accuracy": 0.5625544948130846,
79
  "num_tokens": 6919236.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 2.451621271967888,
84
  "epoch": 0.1702127659574468,
85
+ "grad_norm": 0.7060537934303284,
86
  "learning_rate": 9.601e-05,
87
+ "loss": 2.520094299316406,
88
+ "mean_token_accuracy": 0.5672792405635119,
89
  "num_tokens": 7901385.0,
90
  "step": 400
91
  },
92
  {
93
+ "entropy": 2.386715810596943,
94
  "epoch": 0.19148936170212766,
95
+ "grad_norm": 0.7115684747695923,
96
  "learning_rate": 9.551e-05,
97
+ "loss": 2.449301452636719,
98
+ "mean_token_accuracy": 0.5778872921317816,
99
  "num_tokens": 8891410.0,
100
  "step": 450
101
  },
102
  {
103
+ "entropy": 2.3399247616529464,
104
  "epoch": 0.2127659574468085,
105
+ "grad_norm": 0.7020632028579712,
106
  "learning_rate": 9.501e-05,
107
+ "loss": 2.3999610900878907,
108
+ "mean_token_accuracy": 0.5845850779861211,
109
  "num_tokens": 9872429.0,
110
  "step": 500
111
  },
112
  {
113
+ "entropy": 2.301297716200352,
114
  "epoch": 0.23404255319148937,
115
+ "grad_norm": 0.7258976101875305,
116
  "learning_rate": 9.451000000000002e-05,
117
+ "loss": 2.3609393310546873,
118
+ "mean_token_accuracy": 0.5889028573036194,
119
  "num_tokens": 10852406.0,
120
  "step": 550
121
  },
122
  {
123
+ "entropy": 2.28171086281538,
124
  "epoch": 0.2553191489361702,
125
+ "grad_norm": 0.7619220018386841,
126
  "learning_rate": 9.401e-05,
127
+ "loss": 2.32339111328125,
128
+ "mean_token_accuracy": 0.5941300053894519,
129
  "num_tokens": 11847983.0,
130
  "step": 600
131
  },
132
  {
133
+ "entropy": 2.2323903796076774,
134
  "epoch": 0.2765957446808511,
135
+ "grad_norm": 0.7111139893531799,
136
  "learning_rate": 9.351e-05,
137
+ "loss": 2.2743270874023436,
138
+ "mean_token_accuracy": 0.6003150211274624,
139
  "num_tokens": 12839631.0,
140
  "step": 650
141
  },
142
  {
143
+ "entropy": 2.195645292699337,
144
  "epoch": 0.2978723404255319,
145
+ "grad_norm": 0.8029466867446899,
146
  "learning_rate": 9.301e-05,
147
+ "loss": 2.2373281860351564,
148
+ "mean_token_accuracy": 0.605772587954998,
149
  "num_tokens": 13825421.0,
150
  "step": 700
151
  },
152
  {
153
+ "entropy": 2.1788447910547255,
154
  "epoch": 0.3191489361702128,
155
+ "grad_norm": 0.7066243290901184,
156
  "learning_rate": 9.251000000000001e-05,
157
+ "loss": 2.2124517822265624,
158
+ "mean_token_accuracy": 0.6105387426912785,
159
  "num_tokens": 14811783.0,
160
  "step": 750
161
  },
162
  {
163
+ "entropy": 2.140341859459877,
164
  "epoch": 0.3404255319148936,
165
+ "grad_norm": 0.6823806166648865,
166
  "learning_rate": 9.201000000000001e-05,
167
+ "loss": 2.175841064453125,
168
+ "mean_token_accuracy": 0.615707865729928,
169
  "num_tokens": 15799402.0,
170
  "step": 800
171
  },
172
  {
173
+ "entropy": 2.121108899116516,
174
  "epoch": 0.3617021276595745,
175
+ "grad_norm": 0.7394977807998657,
176
  "learning_rate": 9.151000000000001e-05,
177
+ "loss": 2.1485189819335937,
178
+ "mean_token_accuracy": 0.6186434020847082,
179
  "num_tokens": 16787125.0,
180
  "step": 850
181
  },
182
  {
183
+ "entropy": 2.092526486814022,
184
  "epoch": 0.3829787234042553,
185
+ "grad_norm": 0.7025715112686157,
186
  "learning_rate": 9.101000000000001e-05,
187
+ "loss": 2.1199916076660155,
188
+ "mean_token_accuracy": 0.6228454371541738,
189
  "num_tokens": 17783798.0,
190
  "step": 900
191
  },
192
  {
193
+ "entropy": 2.0856992295384407,
194
  "epoch": 0.40425531914893614,
195
+ "grad_norm": 0.726789653301239,
196
  "learning_rate": 9.051000000000001e-05,
197
+ "loss": 2.1102699279785155,
198
+ "mean_token_accuracy": 0.6247553788125515,
199
  "num_tokens": 18760192.0,
200
  "step": 950
201
  },
202
  {
203
+ "entropy": 2.0423195973038673,
204
  "epoch": 0.425531914893617,
205
+ "grad_norm": 0.7402950525283813,
206
  "learning_rate": 9.001e-05,
207
+ "loss": 2.059694519042969,
208
+ "mean_token_accuracy": 0.6303426054865122,
209
  "num_tokens": 19742108.0,
210
  "step": 1000
211
  }