darwinkernelpanic committed on
Commit
7e70d93
·
verified ·
1 Parent(s): 298dd72

End of training

Browse files
README.md CHANGED
@@ -1,3 +1,153 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: other
4
+ base_model: deepseek-ai/deepseek-coder-6.7b-instruct
5
+ tags:
6
+ - axolotl
7
+ - base_model:adapter:deepseek-ai/deepseek-coder-6.7b-instruct
8
+ - lora
9
+ - transformers
10
+ datasets:
11
+ - darwinkernelpanic/luau_corpus_axolotl
12
+ pipeline_tag: text-generation
13
+ model-index:
14
+ - name: deepseek-coder-6.7b-instruct-luau
15
+ results: []
16
+ ---
17
+
18
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
19
+ should probably proofread and complete it, then remove this comment. -->
20
+
21
+ [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
22
+ <details><summary>See axolotl config</summary>
23
+
24
+ axolotl version: `0.13.0.dev0`
25
+ ```yaml
26
+ base_model: deepseek-ai/deepseek-coder-6.7b-instruct
27
+ hub_model_id: darwinkernelpanic/deepseek-coder-6.7b-instruct-luau
28
+ hub_strategy: end
29
+ trust_remote_code: true
30
+
31
+ load_in_8bit: false
32
+ load_in_4bit: true
33
+
34
+ datasets:
35
+ - path: darwinkernelpanic/luau_corpus_axolotl
36
+ type: completion
37
+ field_instruction: prompt
38
+ field_output: completion
39
+
40
+ dataset_prepared_path:
41
+ val_set_size: 0.05
42
+ output_dir: ./outputs/deepseek-luau-finetune
43
+
44
+ sequence_len: 3072
45
+ sample_packing: true
46
+ eval_sample_packing: true
47
+
48
+ adapter: qlora
49
+ lora_model_dir:
50
+ lora_r: 32
51
+ lora_alpha: 32
52
+ lora_dropout: 0.05
53
+ lora_target_linear: true
54
+
55
+ wandb_project: deepseek-luau-finetune
56
+ wandb_entity:
57
+ wandb_watch:
58
+ wandb_name: deepseek-coder-6.7b-luau
59
+ wandb_log_model:
60
+
61
+ gradient_accumulation_steps: 2
62
+ micro_batch_size: 6
63
+ num_epochs: 3
64
+ optimizer: adamw_torch_fused
65
+ lr_scheduler: cosine
66
+ learning_rate: 0.0002
67
+ bf16: auto
68
+ tf32: true
69
+
70
+ gradient_checkpointing: true
71
+ gradient_checkpointing_kwargs:
72
+ use_reentrant: false
73
+
74
+ resume_from_checkpoint:
75
+ logging_steps: 10
76
+ flash_attention: true
77
+ warmup_ratio: 0.1
78
+ evals_per_epoch: 4
79
+ saves_per_epoch: 1
80
+ weight_decay: 0.01
81
+
82
+ fsdp: []
83
+ fsdp_config: {}
84
+
85
+ special_tokens:
86
+ pad_token: "<|EOT|>"
87
+ ```
88
+
89
+ </details><br>
90
+
91
+ # deepseek-coder-6.7b-instruct-luau
92
+
93
+ This model is a fine-tuned version of [deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct) on the darwinkernelpanic/luau_corpus_axolotl dataset.
94
+ It achieves the following results on the evaluation set:
95
+ - Loss: 1.6346
96
+ - Ppl: 5.1272
97
+ - Memory/max Active (gib): 10.65
98
+ - Memory/max Allocated (gib): 10.65
99
+ - Memory/device Reserved (gib): 11.93
100
+
101
+ ## Model description
102
+
103
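+ A QLoRA adapter (4-bit base weights; LoRA rank 32, alpha 32, dropout 0.05, applied to all linear projection layers) for [deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct), trained with Axolotl on the darwinkernelpanic/luau_corpus_axolotl dataset. Only the adapter weights are stored in this repository; the base model must be loaded separately.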
104
+
105
+ ## Intended uses & limitations
106
+
107
+ More information needed
108
+
109
+ ## Training and evaluation data
110
+
111
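+ The model was trained on the darwinkernelpanic/luau_corpus_axolotl dataset, with 5% of it held out as the evaluation set (`val_set_size: 0.05`). Sample packing was enabled for both training and evaluation, with a sequence length of 3072 tokens.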
112
+
113
+ ## Training procedure
114
+
115
+ ### Training hyperparameters
116
+
117
+ The following hyperparameters were used during training:
118
+ - learning_rate: 0.0002
119
+ - train_batch_size: 6
120
+ - eval_batch_size: 6
121
+ - seed: 42
122
+ - gradient_accumulation_steps: 2
123
+ - total_train_batch_size: 12
124
+ - optimizer: OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999), epsilon=1e-08, and no additional optimizer arguments
125
+ - lr_scheduler_type: cosine
126
+ - lr_scheduler_warmup_steps: 16
127
+ - training_steps: 162
128
+
129
+ ### Training results
130
+
131
+ | Training Loss | Epoch | Step | Validation Loss | Ppl | Active (gib) | Allocated (gib) | Reserved (gib) |
132
+ |:-------------:|:------:|:----:|:---------------:|:-------:|:------------:|:---------------:|:--------------:|
133
+ | No log | 0 | 0 | 3.8515 | 47.0637 | 7.0 | 7.0 | 7.26 |
134
+ | 3.2644 | 0.2593 | 14 | 2.8645 | 17.5407 | 10.65 | 10.65 | 12.22 |
135
+ | 2.6242 | 0.5185 | 28 | 2.2633 | 9.6147 | 12.27 | 12.27 | 14.58 |
136
+ | 2.0431 | 0.7778 | 42 | 2.0479 | 7.7515 | 10.65 | 10.65 | 13.92 |
137
+ | 1.9054        | 1.0370 | 56   | 1.9163          | 6.7960  | 10.65        | 10.65           | 14.72          |
138
+ | 1.7318 | 1.2963 | 70 | 1.8184 | 6.1622 | 7.61 | 7.61 | 13.92 |
139
+ | 1.6119 | 1.5556 | 84 | 1.7550 | 5.7836 | 12.27 | 12.27 | 14.54 |
140
+ | 1.6022 | 1.8148 | 98 | 1.7048 | 5.5006 | 10.65 | 10.65 | 14.23 |
141
+ | 1.6249 | 2.0741 | 112 | 1.6723 | 5.3242 | 10.65 | 10.65 | 13.99 |
142
+ | 1.4995 | 2.3333 | 126 | 1.6503 | 5.2088 | 10.65 | 10.65 | 11.93 |
143
+ | 1.4803 | 2.5926 | 140 | 1.6381 | 5.1452 | 7.61 | 7.61 | 14.58 |
144
+ | 1.4872 | 2.8519 | 154 | 1.6346 | 5.1272 | 10.65 | 10.65 | 11.93 |
145
+
146
+
147
+ ### Framework versions
148
+
149
+ - PEFT 0.18.0
150
+ - Transformers 4.57.1
151
+ - Pytorch 2.8.0+cu128
152
+ - Datasets 4.4.1
153
+ - Tokenizers 0.22.1
adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "deepseek-ai/deepseek-coder-6.7b-instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": null,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 32,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "down_proj",
33
+ "v_proj",
34
+ "gate_proj",
35
+ "o_proj",
36
+ "k_proj",
37
+ "up_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": [],
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d72763e0fec0fd3f48f68a77f8f9afed9f9e61d6de1dc393f5765939f8fd9710
3
+ size 319876032
chat_template.jinja ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% if not add_generation_prompt is defined %}
2
+ {% set add_generation_prompt = false %}
3
+ {% endif %}
4
+ {%- set ns = namespace(found=false) -%}
5
+ {%- for message in messages -%}
6
+ {%- if message['role'] == 'system' -%}
7
+ {%- set ns.found = true -%}
8
+ {%- endif -%}
9
+ {%- endfor -%}
10
+ {{bos_token}}{%- if not ns.found -%}
11
+ {{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n'}}
12
+ {%- endif %}
13
+ {%- for message in messages %}
14
+ {%- if message['role'] == 'system' %}
15
+ {{ message['content'] }}
16
+ {%- else %}
17
+ {%- if message['role'] == 'user' %}
18
+ {{'### Instruction:\n' + message['content'] + '\n'}}
19
+ {%- else %}
20
+ {{'### Response:\n' + message['content'] + '\n<|EOT|>\n'}}
21
+ {%- endif %}
22
+ {%- endif %}
23
+ {%- endfor %}
24
+ {% if add_generation_prompt %}
25
+ {{'### Response:'}}
26
+ {% endif %}
config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 32013,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 32021,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 4096,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 11008,
15
+ "max_position_embeddings": 16384,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 32,
19
+ "num_hidden_layers": 32,
20
+ "num_key_value_heads": 32,
21
+ "pretraining_tp": 1,
22
+ "quantization_config": {
23
+ "_load_in_4bit": true,
24
+ "_load_in_8bit": false,
25
+ "bnb_4bit_compute_dtype": "bfloat16",
26
+ "bnb_4bit_quant_storage": "bfloat16",
27
+ "bnb_4bit_quant_type": "nf4",
28
+ "bnb_4bit_use_double_quant": true,
29
+ "llm_int8_enable_fp32_cpu_offload": false,
30
+ "llm_int8_has_fp16_weight": false,
31
+ "llm_int8_skip_modules": null,
32
+ "llm_int8_threshold": 6.0,
33
+ "load_in_4bit": true,
34
+ "load_in_8bit": false,
35
+ "quant_method": "bitsandbytes"
36
+ },
37
+ "rms_norm_eps": 1e-06,
38
+ "rope_scaling": {
39
+ "factor": 4.0,
40
+ "rope_type": "linear",
41
+ "type": "linear"
42
+ },
43
+ "rope_theta": 100000,
44
+ "tie_word_embeddings": false,
45
+ "transformers_version": "4.57.1",
46
+ "use_cache": false,
47
+ "vocab_size": 32256
48
+ }
debug.log ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin▁of▁sentence|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|EOT|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|EOT|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "32000": {
7
+ "content": "õ",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": false
13
+ },
14
+ "32001": {
15
+ "content": "÷",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": false
21
+ },
22
+ "32002": {
23
+ "content": "Á",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": false
29
+ },
30
+ "32003": {
31
+ "content": "ý",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": false
37
+ },
38
+ "32004": {
39
+ "content": "À",
40
+ "lstrip": false,
41
+ "normalized": true,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": false
45
+ },
46
+ "32005": {
47
+ "content": "ÿ",
48
+ "lstrip": false,
49
+ "normalized": true,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": false
53
+ },
54
+ "32006": {
55
+ "content": "ø",
56
+ "lstrip": false,
57
+ "normalized": true,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": false
61
+ },
62
+ "32007": {
63
+ "content": "ú",
64
+ "lstrip": false,
65
+ "normalized": true,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": false
69
+ },
70
+ "32008": {
71
+ "content": "þ",
72
+ "lstrip": false,
73
+ "normalized": true,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": false
77
+ },
78
+ "32009": {
79
+ "content": "ü",
80
+ "lstrip": false,
81
+ "normalized": true,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": false
85
+ },
86
+ "32010": {
87
+ "content": "ù",
88
+ "lstrip": false,
89
+ "normalized": true,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": false
93
+ },
94
+ "32011": {
95
+ "content": "ö",
96
+ "lstrip": false,
97
+ "normalized": true,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": false
101
+ },
102
+ "32012": {
103
+ "content": "û",
104
+ "lstrip": false,
105
+ "normalized": true,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": false
109
+ },
110
+ "32013": {
111
+ "content": "<|begin▁of▁sentence|>",
112
+ "lstrip": false,
113
+ "normalized": true,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "32014": {
119
+ "content": "<|end▁of▁sentence|>",
120
+ "lstrip": false,
121
+ "normalized": true,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": true
125
+ },
126
+ "32015": {
127
+ "content": "<|fim▁hole|>",
128
+ "lstrip": false,
129
+ "normalized": true,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "32016": {
135
+ "content": "<|fim▁begin|>",
136
+ "lstrip": false,
137
+ "normalized": true,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "32017": {
143
+ "content": "<|fim▁end|>",
144
+ "lstrip": false,
145
+ "normalized": true,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "32018": {
151
+ "content": "<pad>",
152
+ "lstrip": false,
153
+ "normalized": true,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "32019": {
159
+ "content": "<|User|>",
160
+ "lstrip": false,
161
+ "normalized": true,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "32020": {
167
+ "content": "<|Assistant|>",
168
+ "lstrip": false,
169
+ "normalized": true,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "32021": {
175
+ "content": "<|EOT|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": true
181
+ }
182
+ },
183
+ "bos_token": "<|begin▁of▁sentence|>",
184
+ "clean_up_tokenization_spaces": false,
185
+ "eos_token": "<|EOT|>",
186
+ "extra_special_tokens": {},
187
+ "legacy": true,
188
+ "model_max_length": 16384,
189
+ "pad_token": "<|EOT|>",
190
+ "sp_model_kwargs": {},
191
+ "tokenizer_class": "LlamaTokenizerFast",
192
+ "unk_token": null,
193
+ "use_default_system_prompt": false
194
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c27fbd9e94f449281717b151184d446a615170e5efaa42d4022dcd869cf6ea9a
3
+ size 7441