nbeerbower committed on
Commit
60de5f3
·
verified ·
1 Parent(s): 2834600

Add model card with training configuration

Browse files
Files changed (1) hide show
  1. README.md +225 -0
README.md ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ pipeline_tag: image-text-to-text
4
+ tags:
5
+ - merlina
6
+ - grimoire
7
+ - image-text-to-text
8
+ - vision-language-model
9
+ - sft
10
+ datasets:
11
+ - hemlang/Hemlock2-DPO
12
+ - hemlang/hemlock-formulary-SFT
13
+ - hemlang/hemlock-codex-SFT
14
+ base_model:
15
+ - Jackrong/Qwopus3.5-9B-Coder
16
+ ---
17
+
18
+ # Hemlock-Qwopus3.5-9B-Coder
19
+
20
+ ## Training Configuration
21
+
22
+ | Parameter | Value |
23
+ |-----------|-------|
24
+ | Training Mode | SFT |
25
+ | Base Model | `Jackrong/Qwopus3.5-9B-Coder` |
26
+ | Learning Rate | 0.0002 |
27
+ | Epochs | 2 |
28
+ | Batch Size | 2 |
29
+ | Gradient Accumulation | 8 |
30
+ | Effective Batch Size | 16 |
31
+ | Max Sequence Length | 4096 |
32
+ | Optimizer | paged_adamw_8bit |
33
+ | LR Scheduler | cosine |
34
+ | Warmup Ratio | 0.05 |
35
+ | Weight Decay | 0.01 |
36
+ | Max Grad Norm | 1.0 |
37
+ | Seed | 42 |
38
+ | LoRA Rank (r) | 256 |
39
+ | LoRA Alpha | 256 |
40
+ | LoRA Dropout | 0.05 |
41
+ | Target Modules | k_proj, o_proj, q_proj, v_proj, down_proj, gate_proj, up_proj |
42
+ | Quantization | 4-bit (NF4) |
43
+ | GPU | NVIDIA RTX A6000 |
44
+
45
+ ## Datasets
46
+
47
+ Trained on 3 concatenated datasets:
48
+
49
+ 1. [`hemlang/Hemlock2-DPO`](https://huggingface.co/datasets/hemlang/Hemlock2-DPO) (split: `train`)
50
+ 2. [`hemlang/hemlock-formulary-SFT`](https://huggingface.co/datasets/hemlang/hemlock-formulary-SFT) (split: `train`)
51
+ 3. [`hemlang/hemlock-codex-SFT`](https://huggingface.co/datasets/hemlang/hemlock-codex-SFT) (split: `train`)
52
+
53
+ ## Reproduce this training run
54
+
55
+ This model was trained with [Merlina](https://github.com/Schneewolf-Labs/Merlina). To reproduce the exact training setup, save the JSON below to `data/configs/<name>.json` or import it via the *Load Configuration* dialog. Credentials are not included — Merlina will use your own `HF_TOKEN` and `WANDB_API_KEY`, read from `.env` or entered in the form.
56
+
57
+ ```json
58
+ {
59
+ "_metadata": {
60
+ "name": "Hemlock-Qwopus3.5-9B-Coder",
61
+ "description": "Training configuration shared from a Merlina-trained model.",
62
+ "tags": [],
63
+ "schema": "merlina/training-config",
64
+ "schema_version": 1,
65
+ "merlina_version": "2.0.1"
66
+ },
67
+ "base_model": "Jackrong/Qwopus3.5-9B-Coder",
68
+ "output_name": "Hemlock-Qwopus3.5-9B-Coder",
69
+ "use_lora": true,
70
+ "lora_r": 256,
71
+ "lora_alpha": 256,
72
+ "lora_dropout": 0.05,
73
+ "target_modules": [
74
+ "k_proj",
75
+ "o_proj",
76
+ "q_proj",
77
+ "v_proj",
78
+ "down_proj",
79
+ "gate_proj",
80
+ "up_proj"
81
+ ],
82
+ "modules_to_save": [],
83
+ "lora_task_type": "CAUSAL_LM",
84
+ "learning_rate": 0.0002,
85
+ "num_epochs": 2,
86
+ "batch_size": 2,
87
+ "gradient_accumulation_steps": 8,
88
+ "max_length": 4096,
89
+ "max_prompt_length": 1024,
90
+ "model_type": "auto",
91
+ "training_mode": "sft",
92
+ "beta": 0.1,
93
+ "label_smoothing": 0.0,
94
+ "gamma": 0.5,
95
+ "vision_model_id": null,
96
+ "stage": null,
97
+ "unfreeze_vision_top_n": null,
98
+ "image_token_id": null,
99
+ "min_pixels": null,
100
+ "max_pixels": null,
101
+ "image_column": null,
102
+ "caption_column": null,
103
+ "instruction": null,
104
+ "streaming": null,
105
+ "model_name": null,
106
+ "image_resolution": 1024,
107
+ "lora_rank": 32,
108
+ "lora_target_modules": null,
109
+ "lora_use_dora": false,
110
+ "mid_training_samples": true,
111
+ "dataset_jsonl_path": null,
112
+ "dataset_name": null,
113
+ "dataset_split": null,
114
+ "sample_prompts": null,
115
+ "sample_num_steps": null,
116
+ "dataset": {
117
+ "source": {
118
+ "source_type": "huggingface",
119
+ "repo_id": "hemlang/Hemlock2-DPO",
120
+ "split": "train",
121
+ "file_path": null,
122
+ "file_format": null,
123
+ "dataset_id": null,
124
+ "streaming": false,
125
+ "streaming_batch_size": 10000,
126
+ "column_mapping": null
127
+ },
128
+ "additional_sources": [
129
+ {
130
+ "source_type": "huggingface",
131
+ "repo_id": "hemlang/hemlock-formulary-SFT",
132
+ "split": "train",
133
+ "file_path": null,
134
+ "file_format": null,
135
+ "dataset_id": null,
136
+ "streaming": false,
137
+ "streaming_batch_size": 10000,
138
+ "column_mapping": {
139
+ "instruction": "prompt",
140
+ "output": "chosen"
141
+ }
142
+ },
143
+ {
144
+ "source_type": "huggingface",
145
+ "repo_id": "hemlang/hemlock-codex-SFT",
146
+ "split": "train",
147
+ "file_path": null,
148
+ "file_format": null,
149
+ "dataset_id": null,
150
+ "streaming": false,
151
+ "streaming_batch_size": 10000,
152
+ "column_mapping": {
153
+ "instruction": "prompt",
154
+ "output": "chosen"
155
+ }
156
+ }
157
+ ],
158
+ "format": {
159
+ "format_type": "tokenizer",
160
+ "custom_templates": null,
161
+ "enable_thinking": true
162
+ },
163
+ "model_name": "Jackrong/Qwopus3.5-9B-Coder",
164
+ "column_mapping": {
165
+ "prompt": "prompt",
166
+ "chosen": "chosen",
167
+ "rejected": "rejected"
168
+ },
169
+ "convert_messages_format": true,
170
+ "deduplicate": false,
171
+ "dedupe_strategy": "prompt_chosen",
172
+ "test_size": 0.01,
173
+ "max_samples": null,
174
+ "system_prompt": null,
175
+ "system_prompt_mode": "fill_empty",
176
+ "training_mode": "sft"
177
+ },
178
+ "seed": 42,
179
+ "max_grad_norm": 1.0,
180
+ "warmup_ratio": 0.05,
181
+ "eval_steps": 0.2,
182
+ "use_4bit": true,
183
+ "use_wandb": true,
184
+ "push_to_hub": true,
185
+ "merge_lora_before_upload": true,
186
+ "hf_hub_private": true,
187
+ "export_gguf": false,
188
+ "gguf_quant_types": [
189
+ "Q4_K_M"
190
+ ],
191
+ "keep_gguf_fp16": false,
192
+ "shuffle_dataset": true,
193
+ "weight_decay": 0.01,
194
+ "lr_scheduler_type": "cosine",
195
+ "gradient_checkpointing": true,
196
+ "logging_steps": 1,
197
+ "optimizer_type": "paged_adamw_8bit",
198
+ "adam_beta1": 0.9,
199
+ "adam_beta2": 0.999,
200
+ "adam_epsilon": 1e-08,
201
+ "adafactor_relative_step": false,
202
+ "adafactor_scale_parameter": false,
203
+ "adafactor_warmup_init": false,
204
+ "adafactor_decay_rate": -0.8,
205
+ "adafactor_beta1": null,
206
+ "adafactor_clip_threshold": 1.0,
207
+ "attn_implementation": "sdpa",
208
+ "use_liger": true,
209
+ "torch_compile": false,
210
+ "neftune_alpha": null,
211
+ "eval_on_start": false,
212
+ "gpu_ids": null,
213
+ "multi_gpu_strategy": "auto",
214
+ "wandb_project": null,
215
+ "wandb_run_name": null,
216
+ "wandb_tags": null,
217
+ "wandb_notes": null
218
+ }
219
+ ```
220
+
221
+ ---
222
+
223
+ ![Trained with Merlina](https://raw.githubusercontent.com/Schneewolf-Labs/Merlina/refs/heads/main/frontend/madewithmerlina_smol.png)
224
+
225
+ [Merlina on GitHub](https://github.com/Schneewolf-Labs/Merlina)