rgb255 committed on
Commit
a5835d3
·
verified ·
1 Parent(s): d702d83

Upload LoRA adapter (README written by author)

Browse files
README.md CHANGED
@@ -13,7 +13,7 @@ tags:
13
  - structured-output
14
  ---
15
 
16
- qwen3-4b-structured-output-lora-sample
17
 
18
  This repository provides a **LoRA adapter** fine-tuned from
19
  **Qwen/Qwen3-4B-Instruct-2507** using **QLoRA (4-bit, Unsloth)**.
@@ -34,9 +34,9 @@ while intermediate reasoning (Chain-of-Thought) is masked.
34
  - Base model: Qwen/Qwen3-4B-Instruct-2507
35
  - Method: QLoRA (4-bit)
36
  - Max sequence length: 512
37
- - Epochs: 1
38
- - Learning rate: 1e-06
39
- - LoRA: r=64, alpha=128
40
 
41
  ## Usage
42
 
 
13
  - structured-output
14
  ---
15
 
16
+ # Qwen3-4B RSLoRA (R=256) for Structured Output
17
 
18
  This repository provides a **LoRA adapter** fine-tuned from
19
  **Qwen/Qwen3-4B-Instruct-2507** using **QLoRA (4-bit, Unsloth)**.
 
34
  - Base model: Qwen/Qwen3-4B-Instruct-2507
35
  - Method: QLoRA (4-bit)
36
  - Max sequence length: 512
37
+ - Epochs: 2
38
+ - Learning rate: 2e-04
39
+ - LoRA: r=256, alpha=32
40
 
41
  ## Usage
42
 
adapter_config.json CHANGED
@@ -20,7 +20,7 @@
20
  "layers_pattern": null,
21
  "layers_to_transform": null,
22
  "loftq_config": {},
23
- "lora_alpha": 128,
24
  "lora_bias": false,
25
  "lora_dropout": 0.0,
26
  "megatron_config": null,
@@ -29,22 +29,22 @@
29
  "peft_type": "LORA",
30
  "peft_version": "0.18.1",
31
  "qalora_group_size": 16,
32
- "r": 64,
33
  "rank_pattern": {},
34
  "revision": null,
35
  "target_modules": [
36
- "k_proj",
37
- "v_proj",
38
- "q_proj",
39
  "gate_proj",
40
  "down_proj",
 
41
  "up_proj",
42
- "o_proj"
 
 
43
  ],
44
  "target_parameters": null,
45
  "task_type": "CAUSAL_LM",
46
  "trainable_token_indices": null,
47
  "use_dora": false,
48
  "use_qalora": false,
49
- "use_rslora": false
50
  }
 
20
  "layers_pattern": null,
21
  "layers_to_transform": null,
22
  "loftq_config": {},
23
+ "lora_alpha": 32,
24
  "lora_bias": false,
25
  "lora_dropout": 0.0,
26
  "megatron_config": null,
 
29
  "peft_type": "LORA",
30
  "peft_version": "0.18.1",
31
  "qalora_group_size": 16,
32
+ "r": 256,
33
  "rank_pattern": {},
34
  "revision": null,
35
  "target_modules": [
 
 
 
36
  "gate_proj",
37
  "down_proj",
38
+ "k_proj",
39
  "up_proj",
40
+ "o_proj",
41
+ "v_proj",
42
+ "q_proj"
43
  ],
44
  "target_parameters": null,
45
  "task_type": "CAUSAL_LM",
46
  "trainable_token_indices": null,
47
  "use_dora": false,
48
  "use_qalora": false,
49
+ "use_rslora": true
50
  }
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4153ae1cdb39b66371216d654822939f812dfffe97bec65d35b8eaaeda011817
3
- size 528550256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f7e32a8d79b5e697d84e7807625629d764aff33c2a8a303e7ff31d5cb7fe96d
3
+ size 2113998360
all_experiments_details.json ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Experiment_1": {
3
+ "config": {
4
+ "BASE_MODEL_ID": "Qwen/Qwen3-4B-Instruct-2507",
5
+ "DATASET_ID": "u-10bei/structured_data_with_cot_dataset_512_v2",
6
+ "BASE_OUT_DIR": "./lora_experiments",
7
+ "SEED": 3407,
8
+ "VAL_RATIO": 0.05,
9
+ "MAX_SEQ_LEN": 512,
10
+ "LORA_R": 256,
11
+ "LORA_ALPHA": 32,
12
+ "LORA_DROPOUT": 0.0,
13
+ "LORA_TARGET_MODULES": [
14
+ "q_proj",
15
+ "k_proj",
16
+ "v_proj",
17
+ "o_proj",
18
+ "gate_proj",
19
+ "up_proj",
20
+ "down_proj"
21
+ ],
22
+ "EPOCHS": 2,
23
+ "PER_DEVICE_TRAIN_BS": 2,
24
+ "PER_DEVICE_EVAL_BS": 2,
25
+ "GRAD_ACCUM": 8,
26
+ "LR": 0.0002,
27
+ "WARMUP_RATIO": 0.1,
28
+ "WEIGHT_DECAY": 0.05,
29
+ "MAX_STEPS": -1,
30
+ "LOGGING_STEPS": 10,
31
+ "EVAL_STEPS": 50,
32
+ "SAVE_STEPS": 100,
33
+ "SAVE_TOTAL_LIMIT": 2,
34
+ "MASK_COT": true,
35
+ "OUTPUT_MARKERS": [
36
+ "Output:",
37
+ "OUTPUT:",
38
+ "Final:",
39
+ "Answer:",
40
+ "Result:",
41
+ "Response:"
42
+ ],
43
+ "OUTPUT_LEARN_MODE": "after_marker",
44
+ "UPSAMPLE_ENABLE": false,
45
+ "UPSAMPLE_RULES_JSON": "{\"xml_to_yaml\": 2.0}",
46
+ "Experiment_Name": "Method_RSLoRA_R256",
47
+ "USE_RSLORA": true
48
+ },
49
+ "history": [
50
+ {
51
+ "loss": 1.3966,
52
+ "grad_norm": 1.0872466564178467,
53
+ "learning_rate": 4e-05,
54
+ "epoch": 0.04501969611705121,
55
+ "step": 10
56
+ },
57
+ {
58
+ "loss": 0.9325,
59
+ "grad_norm": 0.9557391405105591,
60
+ "learning_rate": 8.444444444444444e-05,
61
+ "epoch": 0.09003939223410241,
62
+ "step": 20
63
+ },
64
+ {
65
+ "loss": 0.8011,
66
+ "grad_norm": 0.513979971408844,
67
+ "learning_rate": 0.00012888888888888892,
68
+ "epoch": 0.13505908835115363,
69
+ "step": 30
70
+ },
71
+ {
72
+ "loss": 0.8125,
73
+ "grad_norm": 0.5614296197891235,
74
+ "learning_rate": 0.00017333333333333334,
75
+ "epoch": 0.18007878446820483,
76
+ "step": 40
77
+ },
78
+ {
79
+ "loss": 0.7486,
80
+ "grad_norm": 0.32493776082992554,
81
+ "learning_rate": 0.0001999509018141497,
82
+ "epoch": 0.22509848058525606,
83
+ "step": 50
84
+ },
85
+ {
86
+ "eval_loss": 0.8527934551239014,
87
+ "eval_runtime": 25.8751,
88
+ "eval_samples_per_second": 7.227,
89
+ "eval_steps_per_second": 3.633,
90
+ "epoch": 0.22509848058525606,
91
+ "step": 50
92
+ },
93
+ {
94
+ "loss": 0.7415,
95
+ "grad_norm": 0.2533496022224426,
96
+ "learning_rate": 0.00019939910076582706,
97
+ "epoch": 0.27011817670230726,
98
+ "step": 60
99
+ },
100
+ {
101
+ "loss": 0.8629,
102
+ "grad_norm": 0.3107249438762665,
103
+ "learning_rate": 0.00019823752233636866,
104
+ "epoch": 0.31513787281935846,
105
+ "step": 70
106
+ },
107
+ {
108
+ "loss": 0.7151,
109
+ "grad_norm": 0.32290422916412354,
110
+ "learning_rate": 0.00019647329238755036,
111
+ "epoch": 0.36015756893640966,
112
+ "step": 80
113
+ },
114
+ {
115
+ "loss": 0.7843,
116
+ "grad_norm": 0.3160030245780945,
117
+ "learning_rate": 0.0001941172338293343,
118
+ "epoch": 0.4051772650534609,
119
+ "step": 90
120
+ },
121
+ {
122
+ "loss": 0.7931,
123
+ "grad_norm": 0.40372225642204285,
124
+ "learning_rate": 0.00019118380022524738,
125
+ "epoch": 0.4501969611705121,
126
+ "step": 100
127
+ },
128
+ {
129
+ "eval_loss": 0.7929844260215759,
130
+ "eval_runtime": 25.6248,
131
+ "eval_samples_per_second": 7.298,
132
+ "eval_steps_per_second": 3.668,
133
+ "epoch": 0.4501969611705121,
134
+ "step": 100
135
+ },
136
+ {
137
+ "loss": 0.812,
138
+ "grad_norm": 0.4371040165424347,
139
+ "learning_rate": 0.0001876909871250184,
140
+ "epoch": 0.4952166572875633,
141
+ "step": 110
142
+ },
143
+ {
144
+ "loss": 0.7258,
145
+ "grad_norm": 0.28696581721305847,
146
+ "learning_rate": 0.00018366022166841676,
147
+ "epoch": 0.5402363534046145,
148
+ "step": 120
149
+ },
150
+ {
151
+ "loss": 0.7824,
152
+ "grad_norm": 0.9101247787475586,
153
+ "learning_rate": 0.0001791162311375321,
154
+ "epoch": 0.5852560495216658,
155
+ "step": 130
156
+ },
157
+ {
158
+ "loss": 0.7643,
159
+ "grad_norm": 0.2658868730068207,
160
+ "learning_rate": 0.00017408689126387995,
161
+ "epoch": 0.6302757456387169,
162
+ "step": 140
163
+ },
164
+ {
165
+ "loss": 0.711,
166
+ "grad_norm": 0.43262964487075806,
167
+ "learning_rate": 0.0001686030552209133,
168
+ "epoch": 0.6752954417557682,
169
+ "step": 150
170
+ },
171
+ {
172
+ "eval_loss": 0.7660654783248901,
173
+ "eval_runtime": 24.682,
174
+ "eval_samples_per_second": 7.576,
175
+ "eval_steps_per_second": 3.808,
176
+ "epoch": 0.6752954417557682,
177
+ "step": 150
178
+ },
179
+ {
180
+ "loss": 0.6915,
181
+ "grad_norm": 0.36354902386665344,
182
+ "learning_rate": 0.00016269836435100934,
183
+ "epoch": 0.7203151378728193,
184
+ "step": 160
185
+ },
186
+ {
187
+ "loss": 0.7233,
188
+ "grad_norm": 0.3520168662071228,
189
+ "learning_rate": 0.0001564090417880529,
190
+ "epoch": 0.7653348339898706,
191
+ "step": 170
192
+ },
193
+ {
194
+ "loss": 0.6626,
195
+ "grad_norm": 0.33897820115089417,
196
+ "learning_rate": 0.0001497736702416662,
197
+ "epoch": 0.8103545301069218,
198
+ "step": 180
199
+ },
200
+ {
201
+ "loss": 0.6801,
202
+ "grad_norm": 0.4690793752670288,
203
+ "learning_rate": 0.00014283295530629877,
204
+ "epoch": 0.855374226223973,
205
+ "step": 190
206
+ },
207
+ {
208
+ "loss": 0.6281,
209
+ "grad_norm": 0.42146065831184387,
210
+ "learning_rate": 0.00013562947574718976,
211
+ "epoch": 0.9003939223410242,
212
+ "step": 200
213
+ },
214
+ {
215
+ "eval_loss": 0.6899478435516357,
216
+ "eval_runtime": 24.6215,
217
+ "eval_samples_per_second": 7.595,
218
+ "eval_steps_per_second": 3.818,
219
+ "epoch": 0.9003939223410242,
220
+ "step": 200
221
+ },
222
+ {
223
+ "loss": 0.6237,
224
+ "grad_norm": 0.38645192980766296,
225
+ "learning_rate": 0.00012820742229510817,
226
+ "epoch": 0.9454136184580754,
227
+ "step": 210
228
+ },
229
+ {
230
+ "loss": 0.5856,
231
+ "grad_norm": 0.4482150375843048,
232
+ "learning_rate": 0.00012061232655226964,
233
+ "epoch": 0.9904333145751266,
234
+ "step": 220
235
+ },
236
+ {
237
+ "loss": 0.4553,
238
+ "grad_norm": 0.5086686015129089,
239
+ "learning_rate": 0.00011289078167249402,
240
+ "epoch": 1.0315137872819358,
241
+ "step": 230
242
+ },
243
+ {
244
+ "loss": 0.5029,
245
+ "grad_norm": 0.8159873485565186,
246
+ "learning_rate": 0.00010509015652912966,
247
+ "epoch": 1.076533483398987,
248
+ "step": 240
249
+ },
250
+ {
251
+ "loss": 0.5179,
252
+ "grad_norm": 0.5635101199150085,
253
+ "learning_rate": 9.72583051242198e-05,
254
+ "epoch": 1.1215531795160383,
255
+ "step": 250
256
+ },
257
+ {
258
+ "eval_loss": 0.5836588144302368,
259
+ "eval_runtime": 24.7575,
260
+ "eval_samples_per_second": 7.553,
261
+ "eval_steps_per_second": 3.797,
262
+ "epoch": 1.1215531795160383,
263
+ "step": 250
264
+ },
265
+ {
266
+ "loss": 0.514,
267
+ "grad_norm": 0.661852240562439,
268
+ "learning_rate": 8.944327302158073e-05,
269
+ "epoch": 1.1665728756330895,
270
+ "step": 260
271
+ },
272
+ {
273
+ "loss": 0.4472,
274
+ "grad_norm": 0.4757942259311676,
275
+ "learning_rate": 8.169300260471818e-05,
276
+ "epoch": 1.2115925717501406,
277
+ "step": 270
278
+ },
279
+ {
280
+ "loss": 0.4286,
281
+ "grad_norm": 0.6126232743263245,
282
+ "learning_rate": 7.405503896771729e-05,
283
+ "epoch": 1.2566122678671918,
284
+ "step": 280
285
+ },
286
+ {
287
+ "loss": 0.4862,
288
+ "grad_norm": 0.5707330703735352,
289
+ "learning_rate": 6.65762382433589e-05,
290
+ "epoch": 1.301631963984243,
291
+ "step": 290
292
+ },
293
+ {
294
+ "loss": 0.4352,
295
+ "grad_norm": 0.3476680815219879,
296
+ "learning_rate": 5.930248015776325e-05,
297
+ "epoch": 1.3466516601012943,
298
+ "step": 300
299
+ },
300
+ {
301
+ "eval_loss": 0.4677433371543884,
302
+ "eval_runtime": 24.7074,
303
+ "eval_samples_per_second": 7.569,
304
+ "eval_steps_per_second": 3.805,
305
+ "epoch": 1.3466516601012943,
306
+ "step": 300
307
+ },
308
+ {
309
+ "loss": 0.3118,
310
+ "grad_norm": 0.6370311379432678,
311
+ "learning_rate": 5.227838657493396e-05,
312
+ "epoch": 1.3916713562183456,
313
+ "step": 310
314
+ },
315
+ {
316
+ "loss": 0.3847,
317
+ "grad_norm": 0.5159108638763428,
318
+ "learning_rate": 4.5547047757828985e-05,
319
+ "epoch": 1.4366910523353966,
320
+ "step": 320
321
+ },
322
+ {
323
+ "loss": 0.3931,
324
+ "grad_norm": 0.6911277174949646,
325
+ "learning_rate": 3.914975802524806e-05,
326
+ "epoch": 1.4817107484524479,
327
+ "step": 330
328
+ },
329
+ {
330
+ "loss": 0.5302,
331
+ "grad_norm": 0.5555063486099243,
332
+ "learning_rate": 3.312576242618511e-05,
333
+ "epoch": 1.5267304445694991,
334
+ "step": 340
335
+ },
336
+ {
337
+ "loss": 0.3183,
338
+ "grad_norm": 0.8983607292175293,
339
+ "learning_rate": 2.7512015985706418e-05,
340
+ "epoch": 1.5717501406865504,
341
+ "step": 350
342
+ },
343
+ {
344
+ "eval_loss": 0.3523830473423004,
345
+ "eval_runtime": 24.7895,
346
+ "eval_samples_per_second": 7.544,
347
+ "eval_steps_per_second": 3.792,
348
+ "epoch": 1.5717501406865504,
349
+ "step": 350
350
+ },
351
+ {
352
+ "loss": 0.3694,
353
+ "grad_norm": 0.5024566054344177,
354
+ "learning_rate": 2.234295699929413e-05,
355
+ "epoch": 1.6167698368036016,
356
+ "step": 360
357
+ },
358
+ {
359
+ "loss": 0.3502,
360
+ "grad_norm": 0.5496794581413269,
361
+ "learning_rate": 1.7650295766411605e-05,
362
+ "epoch": 1.6617895329206527,
363
+ "step": 370
364
+ },
365
+ {
366
+ "loss": 0.3006,
367
+ "grad_norm": 0.4716707766056061,
368
+ "learning_rate": 1.3462820059333403e-05,
369
+ "epoch": 1.7068092290377042,
370
+ "step": 380
371
+ },
372
+ {
373
+ "loss": 0.3471,
374
+ "grad_norm": 0.46408089995384216,
375
+ "learning_rate": 9.80621852061826e-06,
376
+ "epoch": 1.7518289251547552,
377
+ "step": 390
378
+ },
379
+ {
380
+ "loss": 0.2301,
381
+ "grad_norm": 0.41809141635894775,
382
+ "learning_rate": 6.702923072617129e-06,
383
+ "epoch": 1.7968486212718064,
384
+ "step": 400
385
+ },
386
+ {
387
+ "eval_loss": 0.3102871775627136,
388
+ "eval_runtime": 24.3436,
389
+ "eval_samples_per_second": 7.682,
390
+ "eval_steps_per_second": 3.861,
391
+ "epoch": 1.7968486212718064,
392
+ "step": 400
393
+ },
394
+ {
395
+ "loss": 0.2704,
396
+ "grad_norm": 0.4462619423866272,
397
+ "learning_rate": 4.171971305776945e-06,
398
+ "epoch": 1.8418683173888577,
399
+ "step": 410
400
+ },
401
+ {
402
+ "loss": 0.2168,
403
+ "grad_norm": 0.4414360523223877,
404
+ "learning_rate": 2.2288896899377186e-06,
405
+ "epoch": 1.8868880135059087,
406
+ "step": 420
407
+ },
408
+ {
409
+ "loss": 0.2626,
410
+ "grad_norm": 0.3277634084224701,
411
+ "learning_rate": 8.855983250793288e-07,
412
+ "epoch": 1.93190770962296,
413
+ "step": 430
414
+ },
415
+ {
416
+ "loss": 0.342,
417
+ "grad_norm": 0.4310093820095062,
418
+ "learning_rate": 1.5033781583758678e-07,
419
+ "epoch": 1.9769274057400112,
420
+ "step": 440
421
+ },
422
+ {
423
+ "train_runtime": 3776.627,
424
+ "train_samples_per_second": 1.882,
425
+ "train_steps_per_second": 0.118,
426
+ "total_flos": 7.57271106173184e+16,
427
+ "train_loss": 0.5718902958882763,
428
+ "epoch": 2.0,
429
+ "step": 446,
430
+ "total_runtime_sec": 3777.7196531295776
431
+ }
432
+ ]
433
+ }
434
+ }