Sela223 commited on
Commit
e5cef58
·
verified ·
1 Parent(s): 52d07d0

Upload checkpoint checkpoint-350

Browse files
checkpoint-350/README.md CHANGED
@@ -1,6 +1,14 @@
1
  ---
2
  base_model: unsloth/gemma-3n-e4b-it-unsloth-bnb-4bit
3
  library_name: peft
 
 
 
 
 
 
 
 
4
  ---
5
 
6
  # Model Card for Model ID
@@ -199,4 +207,4 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
199
  [More Information Needed]
200
  ### Framework versions
201
 
202
- - PEFT 0.15.2
 
1
  ---
2
  base_model: unsloth/gemma-3n-e4b-it-unsloth-bnb-4bit
3
  library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:unsloth/gemma-3n-e4b-it-unsloth-bnb-4bit
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ - unsloth
12
  ---
13
 
14
  # Model Card for Model ID
 
207
  [More Information Needed]
208
  ### Framework versions
209
 
210
+ - PEFT 0.16.0
checkpoint-350/adapter_config.json CHANGED
@@ -20,6 +20,7 @@
20
  "megatron_core": "megatron.core",
21
  "modules_to_save": null,
22
  "peft_type": "LORA",
 
23
  "r": 64,
24
  "rank_pattern": {},
25
  "revision": null,
@@ -27,5 +28,6 @@
27
  "task_type": "CAUSAL_LM",
28
  "trainable_token_indices": null,
29
  "use_dora": false,
 
30
  "use_rslora": false
31
  }
 
20
  "megatron_core": "megatron.core",
21
  "modules_to_save": null,
22
  "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
  "r": 64,
25
  "rank_pattern": {},
26
  "revision": null,
 
28
  "task_type": "CAUSAL_LM",
29
  "trainable_token_indices": null,
30
  "use_dora": false,
31
+ "use_qalora": false,
32
  "use_rslora": false
33
  }
checkpoint-350/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed34c742a062137185cf9a341f5dabb889c5d3a55d9f0d4b5418093363b2835b
3
  size 614801160
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2be9000d0113c7bb8c3fb993672ae03bd817348a83f13a0314406645048165c7
3
  size 614801160
checkpoint-350/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:94f5861ace1d92e7b4cec384cff6340cd128c1983cd2e2561be9bde03b902f37
3
  size 314017998
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:081ca991d80a3e709679f51738d09c3c9d38c3eeb5dc78925a2887a16b522362
3
  size 314017998
checkpoint-350/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d7ecf15e83ac4d18e0d90f8a44821af2f304313a6ae05eeb21767226a79c463
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4ae31f3bd6abd5e088309ad57fa2e995bc6dd61c02221bc158a3d63e6ad1f06
3
  size 14244
checkpoint-350/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b6c30ef093fa2dd3d81a7e4ac493646dc32b312671f5091f4043a25d83c434a
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a80ac1076ee6a590a8afc8f7c2e173aeb236819a68cd950eb3e2d5227eadb56
3
  size 988
checkpoint-350/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f54f61cd21e7715c2d1c98c82263f221a334d1b450170bfc3ffdd4457e178d3d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c165b4db31643b30ea8c9aef92480c1c2793ff9eb23d3b7bd85759601ce62c1c
3
  size 1064
checkpoint-350/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.1404580152671757,
6
  "eval_steps": 500,
7
  "global_step": 350,
8
  "is_hyper_param_search": false,
@@ -10,255 +10,255 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.061068702290076333,
14
  "grad_norm": NaN,
15
  "learning_rate": 0.0,
16
- "loss": 9.249,
17
  "step": 10
18
  },
19
  {
20
- "epoch": 0.12213740458015267,
21
- "grad_norm": 4165304.25,
22
- "learning_rate": 7.317073170731707e-09,
23
- "loss": 9.3268,
24
  "step": 20
25
  },
26
  {
27
- "epoch": 0.183206106870229,
28
- "grad_norm": 52283.171875,
29
- "learning_rate": 3.170731707317073e-08,
30
- "loss": 5.2487,
31
  "step": 30
32
  },
33
  {
34
- "epoch": 0.24427480916030533,
35
- "grad_norm": 34552.171875,
36
- "learning_rate": 5.609756097560976e-08,
37
- "loss": 3.5598,
38
  "step": 40
39
  },
40
  {
41
- "epoch": 0.3053435114503817,
42
- "grad_norm": 26069.314453125,
43
- "learning_rate": 8.048780487804878e-08,
44
- "loss": 3.2428,
45
  "step": 50
46
  },
47
  {
48
- "epoch": 0.366412213740458,
49
- "grad_norm": 17445.56640625,
50
- "learning_rate": 1.048780487804878e-07,
51
- "loss": 3.2248,
52
  "step": 60
53
  },
54
  {
55
- "epoch": 0.42748091603053434,
56
- "grad_norm": 23311.046875,
57
- "learning_rate": 1.2926829268292682e-07,
58
- "loss": 3.2595,
59
  "step": 70
60
  },
61
  {
62
- "epoch": 0.48854961832061067,
63
- "grad_norm": 13832.998046875,
64
- "learning_rate": 1.5365853658536586e-07,
65
- "loss": 3.0941,
66
  "step": 80
67
  },
68
  {
69
- "epoch": 0.549618320610687,
70
- "grad_norm": 11702.7421875,
71
- "learning_rate": 1.7804878048780488e-07,
72
- "loss": 3.0741,
73
  "step": 90
74
  },
75
  {
76
- "epoch": 0.6106870229007634,
77
- "grad_norm": 11529.9228515625,
78
- "learning_rate": 1.997289972899729e-07,
79
- "loss": 3.163,
80
  "step": 100
81
  },
82
  {
83
- "epoch": 0.6717557251908397,
84
- "grad_norm": 9445.1396484375,
85
- "learning_rate": 1.970189701897019e-07,
86
- "loss": 3.0271,
87
  "step": 110
88
  },
89
  {
90
- "epoch": 0.732824427480916,
91
- "grad_norm": 9955.53515625,
92
- "learning_rate": 1.9430894308943088e-07,
93
- "loss": 2.9452,
94
  "step": 120
95
  },
96
  {
97
- "epoch": 0.7938931297709924,
98
- "grad_norm": 8468.791015625,
99
- "learning_rate": 1.915989159891599e-07,
100
- "loss": 2.9369,
101
  "step": 130
102
  },
103
  {
104
- "epoch": 0.8549618320610687,
105
- "grad_norm": 9535.78125,
106
- "learning_rate": 1.8888888888888888e-07,
107
- "loss": 2.8495,
108
  "step": 140
109
  },
110
  {
111
- "epoch": 0.916030534351145,
112
- "grad_norm": 6875.52783203125,
113
- "learning_rate": 1.861788617886179e-07,
114
- "loss": 2.9174,
115
  "step": 150
116
  },
117
  {
118
- "epoch": 0.9770992366412213,
119
- "grad_norm": 7618.35107421875,
120
- "learning_rate": 1.8346883468834688e-07,
121
- "loss": 2.8059,
122
  "step": 160
123
  },
124
  {
125
- "epoch": 1.036641221374046,
126
- "grad_norm": 7838.251953125,
127
- "learning_rate": 1.8075880758807586e-07,
128
- "loss": 2.7811,
129
  "step": 170
130
  },
131
  {
132
- "epoch": 1.0977099236641221,
133
- "grad_norm": 7657.27880859375,
134
- "learning_rate": 1.7804878048780488e-07,
135
- "loss": 2.7413,
136
  "step": 180
137
  },
138
  {
139
- "epoch": 1.1587786259541986,
140
- "grad_norm": 7777.9345703125,
141
- "learning_rate": 1.753387533875339e-07,
142
- "loss": 2.7186,
143
  "step": 190
144
  },
145
  {
146
- "epoch": 1.2198473282442748,
147
- "grad_norm": 8100.41259765625,
148
- "learning_rate": 1.7262872628726285e-07,
149
- "loss": 2.6789,
150
  "step": 200
151
  },
152
  {
153
- "epoch": 1.2809160305343512,
154
- "grad_norm": 6984.42822265625,
155
- "learning_rate": 1.6991869918699186e-07,
156
- "loss": 2.608,
157
  "step": 210
158
  },
159
  {
160
- "epoch": 1.3419847328244274,
161
- "grad_norm": 7442.26806640625,
162
- "learning_rate": 1.6720867208672087e-07,
163
- "loss": 2.5542,
164
  "step": 220
165
  },
166
  {
167
- "epoch": 1.4030534351145039,
168
- "grad_norm": 7562.8271484375,
169
- "learning_rate": 1.6449864498644986e-07,
170
- "loss": 2.7031,
171
  "step": 230
172
  },
173
  {
174
- "epoch": 1.46412213740458,
175
- "grad_norm": 7235.27392578125,
176
- "learning_rate": 1.6178861788617885e-07,
177
- "loss": 2.6075,
178
  "step": 240
179
  },
180
  {
181
- "epoch": 1.5251908396946565,
182
- "grad_norm": 7698.4599609375,
183
- "learning_rate": 1.5907859078590786e-07,
184
- "loss": 2.5737,
185
  "step": 250
186
  },
187
  {
188
- "epoch": 1.5862595419847327,
189
- "grad_norm": 6776.5927734375,
190
- "learning_rate": 1.5636856368563685e-07,
191
- "loss": 2.6098,
192
  "step": 260
193
  },
194
  {
195
- "epoch": 1.6473282442748092,
196
- "grad_norm": 6810.216796875,
197
- "learning_rate": 1.5365853658536586e-07,
198
- "loss": 2.6596,
199
  "step": 270
200
  },
201
  {
202
- "epoch": 1.7083969465648856,
203
- "grad_norm": 8227.4892578125,
204
- "learning_rate": 1.5094850948509485e-07,
205
- "loss": 2.5461,
206
  "step": 280
207
  },
208
  {
209
- "epoch": 1.7694656488549618,
210
- "grad_norm": 6727.93212890625,
211
- "learning_rate": 1.4823848238482383e-07,
212
- "loss": 2.5579,
213
  "step": 290
214
  },
215
  {
216
- "epoch": 1.830534351145038,
217
- "grad_norm": 7243.64111328125,
218
- "learning_rate": 1.4552845528455284e-07,
219
- "loss": 2.5538,
220
  "step": 300
221
  },
222
  {
223
- "epoch": 1.8916030534351145,
224
- "grad_norm": 6177.71240234375,
225
- "learning_rate": 1.4281842818428186e-07,
226
- "loss": 2.4577,
227
  "step": 310
228
  },
229
  {
230
- "epoch": 1.952671755725191,
231
- "grad_norm": 7574.3271484375,
232
- "learning_rate": 1.4010840108401082e-07,
233
- "loss": 2.4778,
234
  "step": 320
235
  },
236
  {
237
- "epoch": 2.018320610687023,
238
- "grad_norm": 6852.73095703125,
239
- "learning_rate": 1.3739837398373983e-07,
240
- "loss": 2.7076,
241
  "step": 330
242
  },
243
  {
244
- "epoch": 2.0793893129770993,
245
- "grad_norm": 6963.75927734375,
246
- "learning_rate": 1.3468834688346884e-07,
247
- "loss": 2.3829,
248
  "step": 340
249
  },
250
  {
251
- "epoch": 2.1404580152671757,
252
- "grad_norm": 7274.21630859375,
253
- "learning_rate": 1.3197831978319783e-07,
254
- "loss": 2.4186,
255
  "step": 350
256
  }
257
  ],
258
  "logging_steps": 10,
259
- "max_steps": 820,
260
  "num_input_tokens_seen": 0,
261
- "num_train_epochs": 5,
262
  "save_steps": 50,
263
  "stateful_callbacks": {
264
  "TrainerControl": {
@@ -272,7 +272,7 @@
272
  "attributes": {}
273
  }
274
  },
275
- "total_flos": 2.585680433386243e+16,
276
  "train_batch_size": 5,
277
  "trial_name": null,
278
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.8324607329842932,
6
  "eval_steps": 500,
7
  "global_step": 350,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.05235602094240838,
14
  "grad_norm": NaN,
15
  "learning_rate": 0.0,
16
+ "loss": 9.7408,
17
  "step": 10
18
  },
19
  {
20
+ "epoch": 0.10471204188481675,
21
+ "grad_norm": 202209.765625,
22
+ "learning_rate": 3.1413612565445024e-08,
23
+ "loss": 9.1548,
24
  "step": 20
25
  },
26
  {
27
+ "epoch": 0.15706806282722513,
28
+ "grad_norm": 27189.787109375,
29
+ "learning_rate": 1.3612565445026178e-07,
30
+ "loss": 3.8451,
31
  "step": 30
32
  },
33
  {
34
+ "epoch": 0.2094240837696335,
35
+ "grad_norm": 10457.34765625,
36
+ "learning_rate": 2.4083769633507854e-07,
37
+ "loss": 3.267,
38
  "step": 40
39
  },
40
  {
41
+ "epoch": 0.2617801047120419,
42
+ "grad_norm": 8087.2939453125,
43
+ "learning_rate": 3.4554973821989523e-07,
44
+ "loss": 3.0939,
45
  "step": 50
46
  },
47
  {
48
+ "epoch": 0.31413612565445026,
49
+ "grad_norm": 7214.744140625,
50
+ "learning_rate": 4.50261780104712e-07,
51
+ "loss": 3.0211,
52
  "step": 60
53
  },
54
  {
55
+ "epoch": 0.36649214659685864,
56
+ "grad_norm": 6162.826171875,
57
+ "learning_rate": 5.549738219895288e-07,
58
+ "loss": 2.846,
59
  "step": 70
60
  },
61
  {
62
+ "epoch": 0.418848167539267,
63
+ "grad_norm": 4688.05615234375,
64
+ "learning_rate": 6.596858638743455e-07,
65
+ "loss": 2.8104,
66
  "step": 80
67
  },
68
  {
69
+ "epoch": 0.4712041884816754,
70
+ "grad_norm": 3856.7578125,
71
+ "learning_rate": 7.643979057591623e-07,
72
+ "loss": 2.8735,
73
  "step": 90
74
  },
75
  {
76
+ "epoch": 0.5235602094240838,
77
+ "grad_norm": 3529.413330078125,
78
+ "learning_rate": 8.691099476439791e-07,
79
+ "loss": 2.8117,
80
  "step": 100
81
  },
82
  {
83
+ "epoch": 0.5759162303664922,
84
+ "grad_norm": 2830.52734375,
85
+ "learning_rate": 9.738219895287958e-07,
86
+ "loss": 2.7099,
87
  "step": 110
88
  },
89
  {
90
+ "epoch": 0.6282722513089005,
91
+ "grad_norm": 2316.537353515625,
92
+ "learning_rate": 1.0785340314136124e-06,
93
+ "loss": 2.6387,
94
  "step": 120
95
  },
96
  {
97
+ "epoch": 0.680628272251309,
98
+ "grad_norm": 2685.246826171875,
99
+ "learning_rate": 1.1832460732984293e-06,
100
+ "loss": 2.6667,
101
  "step": 130
102
  },
103
  {
104
+ "epoch": 0.7329842931937173,
105
+ "grad_norm": 2066.593017578125,
106
+ "learning_rate": 1.2879581151832458e-06,
107
+ "loss": 2.5786,
108
  "step": 140
109
  },
110
  {
111
+ "epoch": 0.7853403141361257,
112
+ "grad_norm": 2110.41748046875,
113
+ "learning_rate": 1.3926701570680628e-06,
114
+ "loss": 2.4927,
115
  "step": 150
116
  },
117
  {
118
+ "epoch": 0.837696335078534,
119
+ "grad_norm": 1557.745849609375,
120
+ "learning_rate": 1.4973821989528795e-06,
121
+ "loss": 2.6125,
122
  "step": 160
123
  },
124
  {
125
+ "epoch": 0.8900523560209425,
126
+ "grad_norm": 1510.9991455078125,
127
+ "learning_rate": 1.6020942408376963e-06,
128
+ "loss": 2.5048,
129
  "step": 170
130
  },
131
  {
132
+ "epoch": 0.9424083769633508,
133
+ "grad_norm": 1395.5841064453125,
134
+ "learning_rate": 1.706806282722513e-06,
135
+ "loss": 2.5049,
136
  "step": 180
137
  },
138
  {
139
+ "epoch": 0.9947643979057592,
140
+ "grad_norm": 1400.4466552734375,
141
+ "learning_rate": 1.8115183246073297e-06,
142
+ "loss": 2.4902,
143
  "step": 190
144
  },
145
  {
146
+ "epoch": 1.0471204188481675,
147
+ "grad_norm": 1328.171142578125,
148
+ "learning_rate": 1.9162303664921463e-06,
149
+ "loss": 2.3063,
150
  "step": 200
151
  },
152
  {
153
+ "epoch": 1.0994764397905759,
154
+ "grad_norm": 1169.1490478515625,
155
+ "learning_rate": 1.997673065735893e-06,
156
+ "loss": 2.3826,
157
  "step": 210
158
  },
159
  {
160
+ "epoch": 1.1518324607329844,
161
+ "grad_norm": 1007.3028564453125,
162
+ "learning_rate": 1.9860383944153577e-06,
163
+ "loss": 2.2646,
164
  "step": 220
165
  },
166
  {
167
+ "epoch": 1.2041884816753927,
168
+ "grad_norm": 905.8086547851562,
169
+ "learning_rate": 1.9744037230948225e-06,
170
+ "loss": 2.3065,
171
  "step": 230
172
  },
173
  {
174
+ "epoch": 1.256544502617801,
175
+ "grad_norm": 904.2677001953125,
176
+ "learning_rate": 1.9627690517742874e-06,
177
+ "loss": 2.369,
178
  "step": 240
179
  },
180
  {
181
+ "epoch": 1.3089005235602094,
182
+ "grad_norm": 878.70751953125,
183
+ "learning_rate": 1.951134380453752e-06,
184
+ "loss": 2.2916,
185
  "step": 250
186
  },
187
  {
188
+ "epoch": 1.3612565445026177,
189
+ "grad_norm": 785.525146484375,
190
+ "learning_rate": 1.9394997091332166e-06,
191
+ "loss": 2.2916,
192
  "step": 260
193
  },
194
  {
195
+ "epoch": 1.4136125654450262,
196
+ "grad_norm": 715.8485107421875,
197
+ "learning_rate": 1.927865037812682e-06,
198
+ "loss": 2.247,
199
  "step": 270
200
  },
201
  {
202
+ "epoch": 1.4659685863874345,
203
+ "grad_norm": 742.1319580078125,
204
+ "learning_rate": 1.9162303664921463e-06,
205
+ "loss": 2.2293,
206
  "step": 280
207
  },
208
  {
209
+ "epoch": 1.518324607329843,
210
+ "grad_norm": 777.41259765625,
211
+ "learning_rate": 1.9045956951716113e-06,
212
+ "loss": 2.1447,
213
  "step": 290
214
  },
215
  {
216
+ "epoch": 1.5706806282722514,
217
+ "grad_norm": 693.8157348632812,
218
+ "learning_rate": 1.8929610238510761e-06,
219
+ "loss": 2.1851,
220
  "step": 300
221
  },
222
  {
223
+ "epoch": 1.6230366492146597,
224
+ "grad_norm": 707.2672119140625,
225
+ "learning_rate": 1.881326352530541e-06,
226
+ "loss": 2.1879,
227
  "step": 310
228
  },
229
  {
230
+ "epoch": 1.675392670157068,
231
+ "grad_norm": 727.61767578125,
232
+ "learning_rate": 1.8696916812100056e-06,
233
+ "loss": 2.1962,
234
  "step": 320
235
  },
236
  {
237
+ "epoch": 1.7277486910994764,
238
+ "grad_norm": 695.4833984375,
239
+ "learning_rate": 1.8580570098894706e-06,
240
+ "loss": 2.2057,
241
  "step": 330
242
  },
243
  {
244
+ "epoch": 1.7801047120418847,
245
+ "grad_norm": 614.199462890625,
246
+ "learning_rate": 1.8464223385689352e-06,
247
+ "loss": 2.0654,
248
  "step": 340
249
  },
250
  {
251
+ "epoch": 1.8324607329842932,
252
+ "grad_norm": 724.0316162109375,
253
+ "learning_rate": 1.8347876672484e-06,
254
+ "loss": 2.0803,
255
  "step": 350
256
  }
257
  ],
258
  "logging_steps": 10,
259
+ "max_steps": 1910,
260
  "num_input_tokens_seen": 0,
261
+ "num_train_epochs": 10,
262
  "save_steps": 50,
263
  "stateful_callbacks": {
264
  "TrainerControl": {
 
272
  "attributes": {}
273
  }
274
  },
275
+ "total_flos": 2.664823984891824e+16,
276
  "train_batch_size": 5,
277
  "trial_name": null,
278
  "trial_params": null
checkpoint-350/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:757d9ed6a271cc7dd663b202a023b2731e235bf47955a4f40ff4c18331f20ba4
3
  size 5816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a98a6f231f28700315fa8e9cb612a94ae1e99d1ff2b7795e1d31ff2c428a5d2
3
  size 5816