AiForgeMaster committed on
Commit
3f3fa97
·
verified ·
1 Parent(s): d625300

Model save

Browse files
README.md ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: AiForgeMaster/Qwen3-4B-P3-TC-RSSFT-1
5
+ tags:
6
+ - axolotl
7
+ - generated_from_trainer
8
+ model-index:
9
+ - name: Qwen3-4B-P3-RSSFT-KE-1
10
+ results: []
11
+ ---
12
+
13
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
14
+ should probably proofread and complete it, then remove this comment. -->
15
+
16
+ [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
17
+ <details><summary>See axolotl config</summary>
18
+
19
+ axolotl version: `0.13.0.dev0`
20
+ ```yaml
21
+ # axolotl train config.yaml
22
+
23
+ # Prevent NCCL timeout
24
+ ddp_timeout: 7200 # 2 hours timeout instead of 10 minutes
25
+
26
+ # Load model from local models directory first, fallback to HuggingFace if not found
27
+ base_model: AiForgeMaster/Qwen3-4B-P3-TC-RSSFT-1 # Local path - will fallback to Qwen/Qwen3-4B if not found locally
28
+ # Automatically upload checkpoint and final model to HF
29
+ hub_model_id: AiForgeMaster/Qwen3-4B-P3-RSSFT-KE-1
30
+
31
+ load_in_8bit: false
32
+ load_in_4bit: false
33
+ strict: false
34
+
35
+ # SFT dataset configuration - using HuggingFace datasets
36
+ datasets:
37
+ - path: AiForgeMaster/KE-2017-2025 # Private HF dataset - requires API key
38
+ type: chat_template
39
+ split: train
40
+ field_messages: messages
41
+ trust_remote_code: false
42
+ # skip: 0 # number of rows of data to skip over from the beginning
43
+
44
+ # Local paths relative to working directory
45
+ dataset_prepared_path: ./data/prepared
46
+ val_set_size: 0.0 # Set to 0 for SFT (no validation split)
47
+ output_dir: ./outputs
48
+
49
+ # Cache directories for HuggingFace downloads (relative to working dir)
50
+ # This ensures models and datasets are downloaded to local directories
51
+ hf_use_auth_token: true # Use HF token for private repos if needed
52
+
53
+ sequence_len: 8192
54
+ sample_packing: false # Standard for SFT
55
+ eval_sample_packing: false # Disable for SFT
56
+
57
+ # WandB configuration - fill in your details
58
+ wandb_project: ngpt-cpt
59
+ wandb_entity: null
60
+ wandb_watch: gradients
61
+ wandb_name: qwen3_4b_p3_rssft_ke_1
62
+ wandb_log_model: end
63
+
64
+ # Batch size configuration (total effective batch size = micro_batch_size * gradient_accumulation_steps * num_gpus)
65
+ # For batch size 8-16: micro_batch_size=2, gradient_accumulation_steps=4 gives effective batch size of 8 per GPU
66
+ gradient_accumulation_steps: 2
67
+ micro_batch_size: 2 # Adjust based on your GPU memory
68
+ optimizer: adamw_torch_fused
69
+ lr_scheduler: cosine
70
+ learning_rate: 2e-5 # Good learning rate for SFT
71
+
72
+ bf16: auto
73
+ tf32: true
74
+
75
+ max_grad_norm: 1.0
76
+
77
+ gradient_checkpointing: true
78
+ gradient_checkpointing_kwargs:
79
+ use_reentrant: false
80
+ logging_steps: 10 # Log every 10 steps
81
+ flash_attention: true
82
+
83
+ warmup_steps: 50 # Good warmup for SFT
84
+ # Checkpoint saving configuration - save every 50 steps
85
+ save_steps: 50
86
+ save_strategy: steps
87
+ save_total_limit: 5 # Keep only 5 most recent checkpoints
88
+ save_only_model: false # Save full checkpoint including optimizer state
89
+
90
+ # Evaluation configuration removed for pure SFT (val_set_size: 0.0)
91
+ # eval_steps: 2000 # Not supported when val_set_size == 0
92
+ # eval_strategy: steps # Not supported when val_set_size == 0
93
+ weight_decay: 0.01 # Good weight decay for SFT
94
+
95
+ # Liger optimizations for memory efficiency and speed
96
+ plugins:
97
+ - axolotl.integrations.liger.LigerPlugin
98
+
99
+ liger_rope: true
100
+ liger_rms_norm: true
101
+ liger_glu_activation: true
102
+ liger_layer_norm: true
103
+ liger_fused_linear_cross_entropy: true
104
+
105
+ # Additional SFT optimizations
106
+ # Enable for first run to validate checkpoint saving works
107
+ save_first_step: true
108
+
109
+ # Memory optimizations
110
+ dataloader_pin_memory: true
111
+ dataloader_num_workers: 4
112
+ remove_unused_columns: true
113
+
114
+ # Advanced training settings for SFT
115
+ # Calculate max_steps for full epoch: dataset_size / (micro_batch_size * gradient_accumulation_steps * num_gpus)
116
+ # max_steps: 175 # Set for one full epoch with your dataset size
117
+ num_epochs: 1
118
+ group_by_length: true # Good for SFT efficiency
119
+ train_on_inputs: true # train on user inputs in SFT
120
+
121
+ # Loss monitoring
122
+ loss_watchdog_threshold: 10.0 # Stop if loss exceeds this value
123
+ loss_watchdog_patience: 3
124
+
125
+ # Garbage collection to manage memory
126
+ gc_steps: 100 # Run garbage collection every 100 steps
127
+ ```
128
+
129
+ </details><br>
130
+
131
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/uskfoundation/ngpt-cpt/runs/oy7n1t61)
132
+ # Qwen3-4B-P3-RSSFT-KE-1
133
+
134
+ This model is a fine-tuned version of [AiForgeMaster/Qwen3-4B-P3-TC-RSSFT-1](https://huggingface.co/AiForgeMaster/Qwen3-4B-P3-TC-RSSFT-1) on the AiForgeMaster/KE-2017-2025 dataset.
135
+
136
+ ## Model description
137
+
138
+ More information needed
139
+
140
+ ## Intended uses & limitations
141
+
142
+ More information needed
143
+
144
+ ## Training and evaluation data
145
+
146
+ More information needed
147
+
148
+ ## Training procedure
149
+
150
+ ### Training hyperparameters
151
+
152
+ The following hyperparameters were used during training:
153
+ - learning_rate: 2e-05
154
+ - train_batch_size: 2
155
+ - eval_batch_size: 2
156
+ - seed: 42
157
+ - gradient_accumulation_steps: 2
158
+ - total_train_batch_size: 4
159
+ - optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
160
+ - lr_scheduler_type: cosine
161
+ - lr_scheduler_warmup_steps: 50
162
+ - training_steps: 416
163
+
164
+ ### Framework versions
165
+
166
+ - Transformers 4.56.1
167
+ - Pytorch 2.7.1+cu126
168
+ - Datasets 4.0.0
169
+ - Tokenizers 0.22.0
generation_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645
6
+ ],
7
+ "max_length": 40960,
8
+ "pad_token_id": 151643,
9
+ "transformers_version": "4.56.1",
10
+ "use_cache": false
11
+ }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e557a73d5abf8c91c0c7b7de3f4fec9cae62d69c6b8af012193cf2fa40345cca
3
  size 4967215360
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32bc206666f45d987d4ab262fffec62797b79db877078b09eb4cc40e408210a5
3
  size 4967215360
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2451a2580c19125d8fb1a5b809d097f621143f40e12d080d741cd93b25162241
3
  size 3077766632
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b81685543c894f0548252f24f171b519d5785ac7d453269aeb0ea08128d4d81
3
  size 3077766632