PocketDoc committed on
Commit 2748234 · verified · 1 Parent(s): 32b0e65

Model save

Files changed (1): README.md ADDED (+212 -0)
---
library_name: transformers
base_model: Dans-DiscountModels/mistral-7b-v0.3-DanChat
tags:
- axolotl
- generated_from_trainer
datasets:
- Dans-DiscountModels/dpe-130l-m-7b-32k
model-index:
- name: 7b-m-dans-personalityengine-v1.3.0L-TestArticle-1
  results: []
---

[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
<details><summary>See axolotl config</summary>

axolotl version: `0.10.0.dev0`
```yaml
base_model: Dans-DiscountModels/mistral-7b-v0.3-DanChat
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

trust_remote_code:

# wandb configuration
wandb_project: 7b-m-dans-personalityengine
wandb_watch:

wandb_run_id: V1.3.0L-1-8 # V{Version}-{Run Number}-{Attempt Number}
wandb_log_model:

# push checkpoints to hub
hub_model_id: Dans-DiscountModels/7b-m-dans-personalityengine-v1.3.0L-TestArticle-1
# how to push checkpoints to hub
# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
hub_strategy: "every_save"
# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
# Required to be true when used in combination with `push_dataset_to_hub`
hf_use_auth_token: true

# where to save the finished model to
output_dir: ./7b-m-dans-personalityengine

# where to save the dataset to
dataset_prepared_path: ./7b-m-dans-personalityengine-data

save_safetensors: true

# dataset settings (local or huggingface repo)
datasets:
  - path: Dans-DiscountModels/dpe-130l-m-7b-32k
    split: train
    ds_type: parquet
    type:

test_datasets:
  - path: Dans-DiscountModels/dpe-130l-m-7b-32k
    split: validation
    ds_type: parquet
    type:

plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true

load_in_8bit: false
load_in_4bit: false
strict: false

sequence_len: 32768

sample_packing: true
eval_sample_packing: true

pad_to_sequence_len: true

gradient_checkpointing: true
# gradient_checkpointing_kwargs:
#   use_reentrant: false

gradient_accumulation_steps: 1
micro_batch_size: 4

num_epochs: 2

optimizer: ademamix_8bit
optim_args: "beta1=0.9,beta2=0.999,beta3=0.999,alpha=5"

lr_scheduler: rex
learning_rate: 0.000000012
cosine_min_lr_ratio: 0.1

# weight_decay: 0.03
max_grad_norm: 0.001

train_on_inputs: false
group_by_length: false

bf16: true
fp16: false
tf32: false

early_stopping_patience:

resume_from_checkpoint:
auto_resume_from_checkpoints: false

local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_ratio: 0.05

evals_per_epoch: 10
eval_table_size:
eval_max_new_tokens:

saves_per_epoch: 2
save_total_limit: 1

debug: false

deepspeed: deepspeed_configs/zero3_bf16.json

fsdp:
fsdp_config:

special_tokens:
```

</details><br>
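
With axolotl `0.10.0.dev0` installed, a run like this one should be reproducible from the YAML above, assuming it is saved locally (e.g. as `config.yaml`, a hypothetical filename) and a matching 8-GPU DeepSpeed ZeRO-3 environment is available, via `axolotl train config.yaml` (or the older entry point `accelerate launch -m axolotl.cli.train config.yaml`).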

# 7b-m-dans-personalityengine-v1.3.0L-TestArticle-1

This model is a fine-tuned version of [Dans-DiscountModels/mistral-7b-v0.3-DanChat](https://huggingface.co/Dans-DiscountModels/mistral-7b-v0.3-DanChat) on the Dans-DiscountModels/dpe-130l-m-7b-32k dataset.
It achieves the following results on the evaluation set:
- Loss: 1.5911
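
A minimal inference sketch using `transformers` is shown below. It assumes the repository ships a chat template inherited from the DanChat-tuned base model; the message content and generation settings are illustrative, not prescribed by this card.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Dans-DiscountModels/7b-m-dans-personalityengine-v1.3.0L-TestArticle-1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)  # bf16 matches the training precision listed below

messages = [{"role": "user", "content": "Introduce yourself."}]
# apply_chat_template assumes a chat template is present in the tokenizer config
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output = model.generate(input_ids, max_new_tokens=256)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```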

## Model description

More information needed

## Intended uses & limitations

More information needed

## Training and evaluation data

The model was trained on the `train` split and evaluated on the `validation` split of Dans-DiscountModels/dpe-130l-m-7b-32k, with samples packed to a sequence length of 32,768 tokens (per the config above).

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training (the effective batch size is derived in the sketch below):
- learning_rate: 1.2e-08
- train_batch_size: 4
- eval_batch_size: 4
- seed: 42
- distributed_type: multi-GPU
- num_devices: 8
- total_train_batch_size: 32
- total_eval_batch_size: 32
- optimizer: ademamix_8bit with args `beta1=0.9,beta2=0.999,beta3=0.999,alpha=5`
- lr_scheduler_type: cosine
- lr_scheduler_warmup_steps: 47
- num_epochs: 2.0
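
The effective batch size follows directly from the parallelism settings: 4 (micro batch) × 8 (devices) × 1 (gradient accumulation) = 32 sequences per optimizer step. A minimal sketch of that bookkeeping, where the steps-per-epoch figure is inferred from the results table below rather than reported directly:

```python
# Effective batch size from the settings above.
micro_batch_size = 4       # per-device train batch
num_devices = 8            # multi-GPU
grad_accum_steps = 1
total_train_batch_size = micro_batch_size * num_devices * grad_accum_steps
assert total_train_batch_size == 32

# Warmup steps from warmup_ratio = 0.05 in the config. Steps per epoch is
# inferred from the results table (step 912 at epoch 1.9281) - an assumption,
# since the card does not report it directly.
steps_per_epoch = round(912 / 1.9281)     # ~473
total_steps = steps_per_epoch * 2         # num_epochs = 2 -> ~946
warmup_steps = round(0.05 * total_steps)  # ~47, matching the value above
print(total_train_batch_size, total_steps, warmup_steps)
```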

### Training results

| Training Loss | Epoch  | Step | Validation Loss |
|:-------------:|:------:|:----:|:---------------:|
| 1.4427        | 0.0021 | 1    | 1.5639          |
| 1.5781        | 0.1015 | 48   | 1.5631          |
| 1.462         | 0.2030 | 96   | 1.5590          |
| 1.6565        | 0.3044 | 144  | 1.5540          |
| 1.454         | 0.4059 | 192  | 1.5498          |
| 1.5414        | 0.5074 | 240  | 1.5471          |
| 1.6084        | 0.6089 | 288  | 1.5459          |
| 1.5315        | 0.7104 | 336  | 1.5457          |
| 1.4646        | 0.8118 | 384  | 1.5465          |
| 1.5506        | 0.9133 | 432  | 1.5482          |
| 1.5083        | 1.0148 | 480  | 1.5506          |
| 1.4986        | 1.1163 | 528  | 1.5538          |
| 1.4976        | 1.2178 | 576  | 1.5576          |
| 1.6139        | 1.3192 | 624  | 1.5618          |
| 1.6305        | 1.4207 | 672  | 1.5666          |
| 1.5522        | 1.5222 | 720  | 1.5717          |
| 1.5846        | 1.6237 | 768  | 1.5771          |
| 1.6093        | 1.7252 | 816  | 1.5824          |
| 1.6282        | 1.8266 | 864  | 1.5873          |
| 1.5984        | 1.9281 | 912  | 1.5911          |
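
Note that validation loss bottoms out at 1.5457 around step 336 (epoch 0.71) and climbs steadily through the second epoch, so the final checkpoint (loss 1.5911) is not the best one by this metric. A small sketch that recovers the best step from the table:

```python
# Validation losses copied from the table above (step -> loss).
val_loss = {
    1: 1.5639, 48: 1.5631, 96: 1.5590, 144: 1.5540, 192: 1.5498,
    240: 1.5471, 288: 1.5459, 336: 1.5457, 384: 1.5465, 432: 1.5482,
    480: 1.5506, 528: 1.5538, 576: 1.5576, 624: 1.5618, 672: 1.5666,
    720: 1.5717, 768: 1.5771, 816: 1.5824, 864: 1.5873, 912: 1.5911,
}
best_step = min(val_loss, key=val_loss.get)
print(best_step, val_loss[best_step])  # -> 336 1.5457
```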

### Framework versions

- Transformers 4.51.3
- PyTorch 2.7.0+cu126
- Datasets 3.5.1
- Tokenizers 0.21.1
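
To match the training environment, these pins can be checked at runtime; a minimal sketch:

```python
# Print installed versions next to the ones this card was generated with.
import datasets
import tokenizers
import torch
import transformers

expected = {
    "transformers": "4.51.3",
    "torch": "2.7.0+cu126",
    "datasets": "3.5.1",
    "tokenizers": "0.21.1",
}
for name, module in [("transformers", transformers), ("torch", torch),
                     ("datasets", datasets), ("tokenizers", tokenizers)]:
    print(f"{name}: installed {module.__version__}, card lists {expected[name]}")
```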