---
license: cc-by-nc-4.0
base_model: mlabonne/NeuralMonarch-7B
tags:
- generated_from_trainer
- axolotl
- mistral
- instruct
- finetune
- chatml
- gpt4
- synthetic data
- distillation
model-index:
- name: AlphaMonarch-laser
  results: []
datasets:
- mlabonne/chatml-OpenHermes2.5-dpo-binarized-alpha
language:
- en
library_name: transformers
pipeline_tag: text-generation
---

# AlphaMonarch-laser

![image/jpeg](https://cdn-uploads.huggingface.co/production/uploads/64e380b2e12618b261fa6ba0/62S_ExHO6NKCM3NhPDrds.jpeg)

AlphaMonarch-laser is a new DPO fine-tune built with laserQLoRA. It retains the reasoning abilities of the very best merges while significantly improving their conversational abilities: something of a best of both worlds in a 7B model. It uses [mlabonne/NeuralMonarch-7B](https://huggingface.co/mlabonne/NeuralMonarch-7B) as its base model and is fine-tuned on only half of the layers using laserQLoRA. The preference dataset used for DPO is [mlabonne/chatml-OpenHermes2.5-dpo-binarized-alpha](https://huggingface.co/datasets/mlabonne/chatml-OpenHermes2.5-dpo-binarized-alpha).

* Base model: [mlabonne/NeuralMonarch-7B](https://huggingface.co/mlabonne/NeuralMonarch-7B)
* DPO dataset: [mlabonne/chatml-OpenHermes2.5-dpo-binarized-alpha](https://huggingface.co/datasets/mlabonne/chatml-OpenHermes2.5-dpo-binarized-alpha)
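
## 💻 Usage

The model is trained with the ChatML template (see `chat_template: chatml` in the configuration below). The snippet that follows is a minimal generation sketch rather than an official example: it assumes the tokenizer ships with a ChatML chat template, and the repo id is a placeholder to replace with this model's full Hub id.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "AlphaMonarch-laser"  # placeholder: replace with this repo's full Hub id

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

# Render the conversation with the tokenizer's ChatML chat template.
messages = [{"role": "user", "content": "Explain DPO in two sentences."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output = model.generate(input_ids, max_new_tokens=256, do_sample=True, temperature=0.7)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```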

## 🏆 Evaluation

![image/png](https://cdn-uploads.huggingface.co/production/uploads/64e380b2e12618b261fa6ba0/MIkOaXVGJ0T5UVYIEhtYA.png)

| Task          | Version | Metric   | Value |   | Stderr |
|---------------|--------:|----------|------:|---|-------:|
| arc_challenge |       0 | acc      | 70.30 | ± |   1.33 |
|               |         | acc_norm | 73.12 | ± |   1.29 |
| hellaswag     |       0 | acc      | 71.80 | ± |   0.44 |
|               |         | acc_norm | 89.20 | ± |   0.30 |
| gsm8k         |       0 | acc      | 66.71 | ± |   1.29 |
| winogrande    |       0 | acc      | 84.60 | ± |   1.01 |
| mmlu          |       0 | acc      | 64.69 | ± |   1.00 |

Average: 75.9% (excluding MMLU)

### TruthfulQA

| Task          | Version | Metric | Value |   | Stderr |
|---------------|--------:|--------|------:|---|-------:|
| truthfulqa_mc |       1 | mc1    | 62.79 | ± |   1.69 |
|               |         | mc2    | 77.90 | ± |   1.37 |
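
These tables follow the EleutherAI lm-evaluation-harness output format. Below is a minimal reproduction sketch, assuming lm-eval >= 0.4 (whose task names differ slightly from the v0.3 names above, e.g. `truthfulqa_mc2`) and a placeholder repo id:

```python
# Minimal sketch: re-running the benchmarks above with lm-eval >= 0.4.
# The repo id is a placeholder; few-shot settings are the harness defaults.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=AlphaMonarch-laser,dtype=bfloat16",
    tasks=["arc_challenge", "hellaswag", "gsm8k", "winogrande", "mmlu", "truthfulqa_mc2"],
    batch_size=8,
)
for task, metrics in results["results"].items():
    print(task, metrics)
```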

### Training hyperparameters

The following hyperparameters were used during training:
- learning_rate: 5e-07
- train_batch_size: 1
- eval_batch_size: 8
- seed: 42
- gradient_accumulation_steps: 8
- total_train_batch_size: 8 (train_batch_size × gradient_accumulation_steps)
- optimizer: paged AdamW (32-bit) with betas=(0.9, 0.999) and epsilon=1e-08
- lr_scheduler_type: cosine
- lr_scheduler_warmup_steps: 100
- training_steps: 1080
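
The cosine schedule with warmup maps directly onto the standard `transformers` helper; here is a minimal sketch (the optimizer construction is illustrative, not the axolotl internals):

```python
# Minimal sketch of the schedule above: 100 linear warmup steps, then
# cosine decay over the remainder of the 1080 training steps.
import torch
from transformers import get_cosine_schedule_with_warmup

params = [torch.nn.Parameter(torch.zeros(1))]  # stand-in for model parameters
optimizer = torch.optim.AdamW(params, lr=5e-7, betas=(0.9, 0.999), eps=1e-8)
scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=1080
)
for _ in range(1080):
    optimizer.step()
    scheduler.step()
```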

### 📝 Axolotl Configuration

```yaml
base_model: mlabonne/NeuralMonarch-7B
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
is_mistral_derived_model: true
load_in_8bit: false
load_in_4bit: true
strict: false
rl: dpo
chat_template: chatml
datasets:
  - path: mlabonne/chatml-OpenHermes2.5-dpo-binarized-alpha
    split: train
    type: chatml.intel
dataset_prepared_path:
val_set_size: 0.01
output_dir: ./out
adapter: qlora
lora_model_dir:
sequence_len: 1800
sample_packing: false
pad_to_sequence_len: false
lora_r: 16
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
  - layers.1.self_attn.q_proj
  - layers.0.self_attn.q_proj
  - layers.15.self_attn.q_proj
  - layers.12.self_attn.q_proj
  - layers.11.self_attn.q_proj
  - layers.14.self_attn.q_proj
  - layers.9.self_attn.q_proj
  - layers.16.self_attn.q_proj
  - layers.30.self_attn.q_proj
  - layers.18.self_attn.q_proj
  - layers.13.self_attn.q_proj
  - layers.10.self_attn.q_proj
  - layers.7.self_attn.q_proj
  - layers.8.self_attn.q_proj
  - layers.4.self_attn.q_proj
  - layers.19.self_attn.q_proj
  - layers.27.self_attn.k_proj
  - layers.24.self_attn.k_proj
  - layers.25.self_attn.k_proj
  - layers.22.self_attn.k_proj
  - layers.26.self_attn.k_proj
  - layers.29.self_attn.k_proj
  - layers.23.self_attn.k_proj
  - layers.28.self_attn.k_proj
  - layers.21.self_attn.k_proj
  - layers.31.self_attn.k_proj
  - layers.30.self_attn.k_proj
  - layers.20.self_attn.k_proj
  - layers.5.self_attn.k_proj
  - layers.19.self_attn.k_proj
  - layers.17.self_attn.k_proj
  - layers.18.self_attn.k_proj
  - layers.19.self_attn.v_proj
  - layers.24.self_attn.v_proj
  - layers.18.self_attn.v_proj
  - layers.5.self_attn.v_proj
  - layers.3.self_attn.v_proj
  - layers.16.self_attn.v_proj
  - layers.23.self_attn.v_proj
  - layers.27.self_attn.v_proj
  - layers.25.self_attn.v_proj
  - layers.26.self_attn.v_proj
  - layers.20.self_attn.v_proj
  - layers.6.self_attn.v_proj
  - layers.15.self_attn.v_proj
  - layers.17.self_attn.v_proj
  - layers.29.self_attn.v_proj
  - layers.22.self_attn.v_proj
  - layers.12.self_attn.o_proj
  - layers.9.self_attn.o_proj
  - layers.14.self_attn.o_proj
  - layers.0.self_attn.o_proj
  - layers.6.self_attn.o_proj
  - layers.8.self_attn.o_proj
  - layers.10.self_attn.o_proj
  - layers.11.self_attn.o_proj
  - layers.13.self_attn.o_proj
  - layers.24.self_attn.o_proj
  - layers.7.self_attn.o_proj
  - layers.15.self_attn.o_proj
  - layers.5.self_attn.o_proj
  - layers.17.self_attn.o_proj
  - layers.25.self_attn.o_proj
  - layers.4.self_attn.o_proj
  - layers.31.mlp.gate_proj
  - layers.30.mlp.gate_proj
  - layers.4.mlp.gate_proj
  - layers.3.mlp.gate_proj
  - layers.29.mlp.gate_proj
  - layers.28.mlp.gate_proj
  - layers.6.mlp.gate_proj
  - layers.27.mlp.gate_proj
  - layers.5.mlp.gate_proj
  - layers.26.mlp.gate_proj
  - layers.25.mlp.gate_proj
  - layers.7.mlp.gate_proj
  - layers.2.mlp.gate_proj
  - layers.24.mlp.gate_proj
  - layers.23.mlp.gate_proj
  - layers.10.mlp.gate_proj
  - layers.6.mlp.up_proj
  - layers.4.mlp.up_proj
  - layers.5.mlp.up_proj
  - layers.27.mlp.up_proj
  - layers.25.mlp.up_proj
  - layers.26.mlp.up_proj
  - layers.17.mlp.up_proj
  - layers.24.mlp.up_proj
  - layers.7.mlp.up_proj
  - layers.10.mlp.up_proj
  - layers.3.mlp.up_proj
  - layers.11.mlp.up_proj
  - layers.23.mlp.up_proj
  - layers.9.mlp.up_proj
  - layers.14.mlp.up_proj
  - layers.18.mlp.up_proj
  - layers.19.mlp.down_proj
  - layers.20.mlp.down_proj
  - layers.18.mlp.down_proj
  - layers.21.mlp.down_proj
  - layers.29.mlp.down_proj
  - layers.1.mlp.down_proj
  - layers.22.mlp.down_proj
  - layers.28.mlp.down_proj
  - layers.23.mlp.down_proj
  - layers.30.mlp.down_proj
  - layers.17.mlp.down_proj
  - layers.4.mlp.down_proj
  - layers.2.mlp.down_proj
  - layers.15.mlp.down_proj
  - layers.5.mlp.down_proj
wandb_project: axolotl
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 8
micro_batch_size: 1
num_epochs: 1
optimizer: paged_adamw_32bit
lr_scheduler: cosine
learning_rate: 5e-7
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
tf32: true
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 100
evals_per_epoch: 1
eval_table_size:
eval_table_max_new_tokens: 128
save_steps: 1080
max_steps: 1080
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
```
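
The explicit `lora_target_modules` list pins the QLoRA adapter to specific per-layer projections (the "laser" selection covering roughly half the layers). Here is a minimal, hypothetical PEFT sketch of the same idea, not the axolotl internals, with the module list abbreviated:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

# Load the base model in 4-bit, as load_in_4bit: true does above.
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained(
    "mlabonne/NeuralMonarch-7B", quantization_config=bnb_config
)

# PEFT matches list-style target_modules as name suffixes, so full per-layer
# paths attach adapters to individual layers rather than to every q_proj.
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=[
        "layers.1.self_attn.q_proj",
        "layers.0.self_attn.q_proj",
        # ... remaining entries from the list above
    ],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```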

### Framework versions

- Transformers 4.38.0.dev0
- PyTorch 2.1.2+cu118
- Datasets 2.17.0
- Tokenizers 0.15.0
- Axolotl 0.4.0

[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)