intervitens committed on
Commit 1015cb1 · verified · 1 parent: 997b8fa

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,197 @@
+ ---
+ library_name: peft
+ tags:
+ - generated_from_trainer
+ base_model: mistralai_Mistral-Nemo-Instruct-2407
+ model-index:
+ - name: 12b-out-r3
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+ <details><summary>See axolotl config</summary>
+
+ axolotl version: `0.5.0`
+ ```yaml
+ base_model: mistralai_Mistral-Nemo-Instruct-2407
+ model_type: AutoModelForCausalLM
+ tokenizer_type: AutoTokenizer
+
+ plugins:
+   - axolotl.integrations.liger.LigerPlugin
+ liger_rope: true
+ liger_rms_norm: true
+ liger_swiglu: true
+ liger_fused_linear_cross_entropy: true
+
+ load_in_8bit: false
+ load_in_4bit: false
+ strict: false
+
+ datasets:
+   - path: NewEden/OpenCAI-ShareGPT
+     type: chat_template
+     # chat_template: mistralv3tekken
+     roles_to_train: ["gpt"]
+     field_messages: conversations
+     message_field_role: from
+     message_field_content: value
+     train_on_eos: turn
+   - path: NewEden/vanilla-backrooms-claude-sharegpt
+     type: chat_template
+     # chat_template: mistralv3tekken
+     roles_to_train: ["gpt"]
+     field_messages: conversations
+     message_field_role: from
+     message_field_content: value
+     train_on_eos: turn
+   - path: anthracite-org/kalo_opus_misc_240827
+     type: chat_template
+     # chat_template: mistralv3tekken
+     roles_to_train: ["gpt"]
+     field_messages: conversations
+     message_field_role: from
+     message_field_content: value
+     train_on_eos: turn
+   - path: anthracite-org/kalo_misc_part2
+     type: chat_template
+     # chat_template: mistralv3tekken
+     roles_to_train: ["gpt"]
+     field_messages: conversations
+     message_field_role: from
+     message_field_content: value
+     train_on_eos: turn
+   - path: NewEden/Roleplay-Logs-V2
+     type: chat_template
+     # chat_template: mistralv3tekken
+     roles_to_train: ["gpt"]
+     field_messages: conversations
+     message_field_role: from
+     message_field_content: value
+     train_on_eos: turn
+ dataset_prepared_path: dataset_prepared
+ val_set_size: 0.0
+ output_dir: 12b-out-r3
+
+ sequence_len: 16384
+ sample_packing: true
+ pad_to_sequence_len: true
+
+ adapter: lora
+ lora_model_dir:
+ lora_r: 32
+ lora_alpha: 16
+ lora_dropout: 0.05
+ #lora_target_linear:
+ #lora_fan_in_fan_out: true
+ peft_use_rslora: true
+ lora_target_modules:
+   - gate_proj
+   - down_proj
+   - up_proj
+   - q_proj
+   - v_proj
+   - k_proj
+   - o_proj
+
+ lora_modules_to_save:
+   - embed_tokens
+   - lm_head
+
+
+ wandb_project: 12b-control
+ wandb_entity:
+ wandb_watch:
+ wandb_name: 12b-control-r3
+ wandb_log_model:
+
+ gradient_accumulation_steps: 2
+ micro_batch_size: 1
+ num_epochs: 4
+ optimizer: paged_adamw_8bit
+ lr_scheduler: cosine
+ learning_rate: 0.00001
+
+ train_on_inputs: false
+ group_by_length: false
+ bf16: auto
+ fp16:
+ tf32: false
+
+ gradient_checkpointing: unsloth
+ #gradient_checkpointing_kwargs:
+ #   use_reentrant: false
+ early_stopping_patience:
+ resume_from_checkpoint:
+ local_rank:
+ logging_steps: 1
+ xformers_attention:
+ flash_attention: true
+
+ warmup_steps: 40
+ evals_per_epoch:
+ eval_table_size:
+ eval_max_new_tokens:
+ saves_per_epoch: 1
+ debug:
+ deepspeed: /workspace/axolotl/deepspeed_configs/zero3_bf16.json
+ weight_decay: 0.03
+ fsdp:
+ fsdp_config:
+ special_tokens:
+   pad_token: <pad>
+
+
+ ```
+
+ </details><br>
+
+ # 12b-out-r3
+
+ This model is a LoRA fine-tune of mistralai_Mistral-Nemo-Instruct-2407, trained with the axolotl configuration above on the datasets listed there.
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 1e-05
+ - train_batch_size: 1
+ - eval_batch_size: 1
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 4
+ - gradient_accumulation_steps: 2
+ - total_train_batch_size: 8 (micro_batch_size × gradient_accumulation_steps × num_devices)
+ - total_eval_batch_size: 4
+ - optimizer: paged_adamw_8bit with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_steps: 40
+ - num_epochs: 4
+
+ ### Training results
+
+
+
+ ### Framework versions
+
+ - PEFT 0.13.2
+ - Transformers 4.46.1
+ - PyTorch 2.3.1+cu121
+ - Datasets 3.0.1
+ - Tokenizers 0.20.3
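The card above does not show how to run the adapter. As a minimal sketch (not part of this repo), it can be loaded for inference with transformers and peft roughly as follows; the Hub id `mistralai/Mistral-Nemo-Instruct-2407` and the local adapter path are assumptions, since the config only references a local copy named `mistralai_Mistral-Nemo-Instruct-2407`.

```python
# Sketch only: load the base model, then apply this LoRA adapter with peft.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "mistralai/Mistral-Nemo-Instruct-2407"  # assumed Hub id of the base model
adapter_dir = "./12b-out-r3"                      # assumed local path to this adapter repo

# Load the tokenizer from the adapter repo so the <pad> token defined in
# special_tokens_map.json is picked up.
tokenizer = AutoTokenizer.from_pretrained(adapter_dir)

base = AutoModelForCausalLM.from_pretrained(
    base_id, torch_dtype=torch.bfloat16, device_map="auto"
)
# Applies the LoRA weights plus the saved embed_tokens / lm_head (modules_to_save).
model = PeftModel.from_pretrained(base, adapter_dir)
model.eval()

messages = [{"role": "user", "content": "Hello there!"}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(base.device)

with torch.no_grad():
    output = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```

Because `modules_to_save` includes `embed_tokens` and `lm_head`, the adapter stores the full embedding and output-head weights alongside the LoRA matrices, which is consistent with the ~2.9 GB `adapter_model.safetensors` below.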
adapter_config.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "mistralai_Mistral-Nemo-Instruct-2407",
+   "bias": "none",
+   "fan_in_fan_out": null,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 16,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": [
+     "embed_tokens",
+     "lm_head"
+   ],
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "up_proj",
+     "o_proj",
+     "k_proj",
+     "down_proj",
+     "v_proj",
+     "q_proj",
+     "gate_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "use_dora": false,
+   "use_rslora": true
+ }
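For reference, the adapter_config.json above maps roughly onto the following peft `LoraConfig`; this is a sketch for reproducing the setup outside axolotl (field names per PEFT 0.13.x, everything not listed left at its default), not code shipped with this repo.

```python
# Sketch only: approximate peft LoraConfig equivalent of adapter_config.json.
from peft import LoraConfig, TaskType

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # "task_type": "CAUSAL_LM"
    r=32,                          # "r": 32
    lora_alpha=16,                 # "lora_alpha": 16
    lora_dropout=0.05,             # "lora_dropout": 0.05
    bias="none",                   # "bias": "none"
    use_rslora=True,               # "use_rslora": true
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    modules_to_save=["embed_tokens", "lm_head"],  # trained in full, stored with the adapter
)
```

With rank-stabilized LoRA enabled, the effective scaling is lora_alpha / √r = 16 / √32 ≈ 2.83, versus lora_alpha / r = 0.5 for plain LoRA.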
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:325447b46ec5a949650d6e31661f38563fcc13795bec1befe394d897e9d1c726
+ size 2912497608
config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "_attn_implementation_autoset": true,
+   "_name_or_path": "mistralai_Mistral-Nemo-Instruct-2407",
+   "architectures": [
+     "MistralForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 5120,
+   "initializer_range": 0.02,
+   "intermediate_size": 14336,
+   "max_position_embeddings": 131072,
+   "model_type": "mistral",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 40,
+   "num_key_value_heads": 8,
+   "rms_norm_eps": 1e-05,
+   "rope_theta": 1000000.0,
+   "sliding_window": null,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.46.1",
+   "use_cache": false,
+   "vocab_size": 131072
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b0240ce510f08e6c2041724e9043e33be9d251d1e4a4d94eb68cd47b954b61d2
+ size 17078292
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7a34494574e9f6fb5caa0e48edb41cb98af38d171ff118bd32731ba085811af8
+ size 7992