Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes. See raw diff
- Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-18-49/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-18-49/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-18-49/training.log +24 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-26-27/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-26-27/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-26-27/training.log +24 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-22-09-56-09/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-22-09-56-09/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-22-09-56-09/training.log +24 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/pytorch_model.bin +3 -0
- Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-05-14/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-05-14/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-05-14/training.log +24 -0
- Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-08-10/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-08-10/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-08-10/training.log +5 -0
- Qwen2.5-3B-Instruct-bl-0.215-c4book/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.215-c4book/pytorch_model.bin +3 -0
- Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-31-38/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-31-38/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-31-38/training.log +8 -0
- Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-41-33/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-41-33/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-41-33/training.log +24 -0
- Qwen2.5-3B-Instruct-bl-0.225-c4book/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.225-c4book/pytorch_model.bin +3 -0
- Qwen2.5-3B-Instruct-bl-0.25-c4book/2025-10-10-07-39-18/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.25-c4book/2025-10-10-07-39-18/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.25-c4book/2025-10-10-07-39-18/training.log +24 -0
- Qwen2.5-3B-Instruct-bl-0.25-c4book/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.25-c4book/pytorch_model.bin +3 -0
- Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/2025-06-16-04-57-53/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/2025-06-16-04-57-53/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/2025-06-16-04-57-53/training.log +7 -0
- Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-40-52/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-40-52/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-40-52/training.log +7 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-42-19/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-42-19/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-42-19/training.log +12 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-34-09/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-34-09/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-34-09/training.log +5 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-38-22/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-38-22/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-38-22/training.log +24 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/pytorch_model.bin +3 -0
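The commit title above indicates this folder was pushed with huggingface_hub. As a rough sketch of that kind of upload — the local folder path and repo_id below are placeholders, not taken from this commit — the usual call is HfApi.upload_folder:

from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` by default
api.upload_folder(
    folder_path="qwen_checkpoints",                   # hypothetical local folder
    repo_id="your-username/pruned-qwen-checkpoints",  # hypothetical repo id
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)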
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-18-49/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.2-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 14
+- block_attention_layer_end: 22
+- block_mlp_layer_start: 14
+- block_mlp_layer_end: 22
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
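Judging by the training.log further below, the block_*_layer_start/end pairs select a half-open range of decoder layers. A minimal sketch of that mapping (mine, not from qwen.py):

# start=14, end=22 from the description above; the log below reports
# exactly these layers as pruned.
start, end = 14, 22
print(list(range(start, end)))  # [14, 15, 16, 17, 18, 19, 20, 21]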
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-18-49/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.2-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 14 --block_mlp_layer_end 22 --block_attention_layer_start 14 --block_attention_layer_end 22 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-18-49/training.log
ADDED
@@ -0,0 +1,24 @@
+2025-10-07 07:18:52 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-07 07:18:54 - INFO : Use taylor pruner...
+2025-10-07 07:18:54 - INFO : Pruning Attention Layer = [14, 15, 16, 17, 18, 19, 20, 21]
+2025-10-07 07:18:54 - INFO : Pruning MLP Layer = [14, 15, 16, 17, 18, 19, 20, 21]
+2025-10-07 07:18:54 - INFO : Start Pruning
+2025-10-07 07:19:45 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+2025-10-07 07:19:45 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+2025-10-07 07:19:56 - INFO : Start Backwarding in iterative steps = 0...
+2025-10-07 07:19:56 - INFO : Loss = 3.6055665016174316
+2025-10-07 07:19:57 - INFO : After Iter 1/1, #parameters: 2777647104
+2025-10-07 07:19:57 - INFO : #Param before: 3085938688, #Param after: 2777647104, Ratio = 90.0098%
+2025-10-07 07:20:57 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-07 07:20:58 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-07 07:20:58 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-07 07:20:58 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-07 07:20:58 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-07 07:20:58 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-07 07:21:03 - INFO : PPL after pruning: {'c4': 6.662430613808609, 'wikitext2': 11.784638661080912, 'ptb': 22.494734284035275}
+2025-10-07 07:21:03 - INFO : Memory Requirement: 7670.80419921875 MiB
+
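The Ratio field in these logs is simply remaining parameters over original parameters; recomputing it from the numbers printed above:

# Numbers copied from the log; prints 90.0098%, matching the reported ratio.
before, after = 3085938688, 2777647104
print(f"Ratio = {100 * after / before:.4f}%")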
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-26-27/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.2-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 10
+- block_attention_layer_end: 26
+- block_mlp_layer_start: 10
+- block_mlp_layer_end: 26
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-26-27/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.2-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 10 --block_mlp_layer_end 26 --block_attention_layer_start 10 --block_attention_layer_end 26 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-26-27/training.log
ADDED
@@ -0,0 +1,24 @@
+2025-10-07 07:26:30 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-07 07:26:32 - INFO : Use taylor pruner...
+2025-10-07 07:26:32 - INFO : Pruning Attention Layer = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]
+2025-10-07 07:26:32 - INFO : Pruning MLP Layer = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]
+2025-10-07 07:26:33 - INFO : Start Pruning
+2025-10-07 07:27:13 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+2025-10-07 07:27:13 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+2025-10-07 07:27:23 - INFO : Start Backwarding in iterative steps = 0...
+2025-10-07 07:27:23 - INFO : Loss = 3.6055665016174316
+2025-10-07 07:27:25 - INFO : After Iter 1/1, #parameters: 2469355520
+2025-10-07 07:27:25 - INFO : #Param before: 3085938688, #Param after: 2469355520, Ratio = 80.0196%
+2025-10-07 07:28:36 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-07 07:28:37 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-07 07:28:37 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-07 07:28:37 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-07 07:28:37 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-07 07:28:37 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-07 07:28:41 - INFO : PPL after pruning: {'c4': 8.791876103118177, 'wikitext2': 22.059646737841184, 'ptb': 41.2128458403547}
+2025-10-07 07:28:41 - INFO : Memory Requirement: 9438.86669921875 MiB
+
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-22-09-56-09/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.2-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 9
+- block_attention_layer_end: 27
+- block_mlp_layer_start: 9
+- block_mlp_layer_end: 27
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-22-09-56-09/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.2-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 9 --block_mlp_layer_end 27 --block_attention_layer_start 9 --block_attention_layer_end 27 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-22-09-56-09/training.log
ADDED
@@ -0,0 +1,24 @@
+2025-10-22 09:56:11 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-22 09:56:14 - INFO : Use taylor pruner...
+2025-10-22 09:56:14 - INFO : Pruning Attention Layer = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
+2025-10-22 09:56:14 - INFO : Pruning MLP Layer = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
+2025-10-22 09:56:14 - INFO : Start Pruning
+2025-10-22 09:57:08 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+2025-10-22 09:57:08 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+2025-10-22 09:57:18 - INFO : Start Backwarding in iterative steps = 0...
+2025-10-22 09:57:19 - INFO : Loss = 3.6055665016174316
+2025-10-22 09:57:20 - INFO : After Iter 1/1, #parameters: 2392282624
+2025-10-22 09:57:20 - INFO : #Param before: 3085938688, #Param after: 2392282624, Ratio = 77.5220%
+2025-10-22 09:58:10 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-22 09:58:11 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-22 09:58:11 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-22 09:58:11 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-22 09:58:11 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-22 09:58:11 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-22 09:58:15 - INFO : PPL after pruning: {'c4': 9.827221607438576, 'wikitext2': 26.453556883236853, 'ptb': 47.90120329104557}
+2025-10-22 09:58:15 - INFO : Memory Requirement: 9880.88232421875 MiB
+
Qwen2.5-3B-Instruct-bl-0.2-c4book/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.2-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 9
+- block_attention_layer_end: 27
+- block_mlp_layer_start: 9
+- block_mlp_layer_end: 27
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.2-c4book/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bab2adb5206c3f6dd0895845a69a647bda2be5ed4ba1b68dbecf7b8a5b46a486
+size 4791757567
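The pytorch_model.bin entry above is a Git LFS pointer; the actual ~4.8 GB checkpoint lives in LFS storage. A sketch of fetching and loading it — the repo_id below is a placeholder, and note that structured-pruning scripts like this one may pickle the whole model object rather than a plain state dict, so weights_only loading may not apply:

import torch
from huggingface_hub import hf_hub_download

# repo_id is hypothetical; substitute the Hub repo this diff belongs to.
path = hf_hub_download(
    repo_id="your-username/pruned-qwen-checkpoints",
    filename="Qwen2.5-3B-Instruct-bl-0.2-c4book/pytorch_model.bin",
)
ckpt = torch.load(path, map_location="cpu", weights_only=False)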
Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-05-14/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.215-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 11
+- block_attention_layer_end: 25
+- block_mlp_layer_start: 11
+- block_mlp_layer_end: 25
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-05-14/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.215-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 11 --block_mlp_layer_end 25 --block_attention_layer_start 11 --block_attention_layer_end 25 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-05-14/training.log
ADDED
@@ -0,0 +1,24 @@
+2025-10-10 14:05:17 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-10 14:05:19 - INFO : Use taylor pruner...
+2025-10-10 14:05:19 - INFO : Pruning Attention Layer = [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
+2025-10-10 14:05:19 - INFO : Pruning MLP Layer = [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
+2025-10-10 14:05:19 - INFO : Start Pruning
+2025-10-10 14:06:15 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+2025-10-10 14:06:15 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+2025-10-10 14:06:25 - INFO : Start Backwarding in iterative steps = 0...
+2025-10-10 14:06:26 - INFO : Loss = 3.6055665016174316
+2025-10-10 14:06:27 - INFO : After Iter 1/1, #parameters: 2546428416
+2025-10-10 14:06:27 - INFO : #Param before: 3085938688, #Param after: 2546428416, Ratio = 82.5171%
+2025-10-10 14:07:38 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-10 14:07:38 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-10 14:07:38 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-10 14:07:38 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-10 14:07:39 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-10 14:07:39 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-10 14:07:43 - INFO : PPL after pruning: {'c4': 8.115290514356445, 'wikitext2': 18.075026786359576, 'ptb': 34.97688798216538}
+2025-10-10 14:07:43 - INFO : Memory Requirement: 8996.85107421875 MiB
+
Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-08-10/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.215-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 10
+- block_attention_layer_end: 25
+- block_mlp_layer_start: 10
+- block_mlp_layer_end: 25
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-08-10/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.215-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 10 --block_mlp_layer_end 25 --block_attention_layer_start 10 --block_attention_layer_end 25 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-08-10/training.log
ADDED
@@ -0,0 +1,5 @@
+2025-10-10 14:08:13 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-10 14:08:16 - INFO : Use taylor pruner...
+2025-10-10 14:08:16 - INFO : Pruning Attention Layer = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
+2025-10-10 14:08:16 - INFO : Pruning MLP Layer = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
+2025-10-10 14:08:17 - INFO : Start Pruning
Qwen2.5-3B-Instruct-bl-0.215-c4book/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.215-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 10
+- block_attention_layer_end: 25
+- block_mlp_layer_start: 10
+- block_mlp_layer_end: 25
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.215-c4book/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0b198b559663b1573ff340663d4d3c23896bb3459f94b998bddac1d33b8f126
+size 5100049439
Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-31-38/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.225-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 9
+- block_attention_layer_end: 27
+- block_mlp_layer_start: 9
+- block_mlp_layer_end: 27
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-31-38/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.225-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 9 --block_mlp_layer_end 27 --block_attention_layer_start 9 --block_attention_layer_end 27 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-31-38/training.log
ADDED
@@ -0,0 +1,8 @@
+2025-10-10 17:31:41 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-10 17:31:43 - INFO : Use taylor pruner...
+2025-10-10 17:31:43 - INFO : Pruning Attention Layer = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
+2025-10-10 17:31:43 - INFO : Pruning MLP Layer = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
+2025-10-10 17:31:44 - INFO : Start Pruning
+2025-10-10 17:32:38 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+2025-10-10 17:32:38 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+2025-10-10 17:32:49 - INFO : Start Backwarding in iterative steps = 0...
Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-41-33/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.225-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 9
+- block_attention_layer_end: 27
+- block_mlp_layer_start: 9
+- block_mlp_layer_end: 27
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-41-33/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.225-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 9 --block_mlp_layer_end 27 --block_attention_layer_start 9 --block_attention_layer_end 27 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-41-33/training.log
ADDED
@@ -0,0 +1,24 @@
+2025-10-10 17:41:36 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-10 17:41:38 - INFO : Use taylor pruner...
+2025-10-10 17:41:38 - INFO : Pruning Attention Layer = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
+2025-10-10 17:41:38 - INFO : Pruning MLP Layer = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
+2025-10-10 17:41:39 - INFO : Start Pruning
+2025-10-10 17:42:37 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+2025-10-10 17:42:37 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+2025-10-10 17:42:47 - INFO : Start Backwarding in iterative steps = 0...
+2025-10-10 17:42:47 - INFO : Loss = 3.6055665016174316
+2025-10-10 17:42:49 - INFO : After Iter 1/1, #parameters: 2392282624
+2025-10-10 17:42:49 - INFO : #Param before: 3085938688, #Param after: 2392282624, Ratio = 77.5220%
+2025-10-10 17:43:52 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-10 17:43:52 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-10 17:43:52 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-10 17:43:52 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-10 17:43:53 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-10 17:43:53 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-10 17:43:57 - INFO : PPL after pruning: {'c4': 9.827221607438576, 'wikitext2': 26.453556883236853, 'ptb': 47.90120329104557}
+2025-10-10 17:43:57 - INFO : Memory Requirement: 9880.88232421875 MiB
+
Qwen2.5-3B-Instruct-bl-0.225-c4book/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.225-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 9
+- block_attention_layer_end: 27
+- block_mlp_layer_start: 9
+- block_mlp_layer_end: 27
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.225-c4book/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ff600fc12f5785ca0ea11fb91a233cf7e43a2ec36b7b41098082e44345cfe26
+size 4791757567
Qwen2.5-3B-Instruct-bl-0.25-c4book/2025-10-10-07-39-18/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.25-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 8
+- block_attention_layer_end: 28
+- block_mlp_layer_start: 8
+- block_mlp_layer_end: 28
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.25-c4book/2025-10-10-07-39-18/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.25-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 8 --block_mlp_layer_end 28 --block_attention_layer_start 8 --block_attention_layer_end 28 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.25-c4book/2025-10-10-07-39-18/training.log
ADDED
@@ -0,0 +1,24 @@
+2025-10-10 07:39:21 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-10 07:39:23 - INFO : Use taylor pruner...
+2025-10-10 07:39:23 - INFO : Pruning Attention Layer = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]
+2025-10-10 07:39:23 - INFO : Pruning MLP Layer = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]
+2025-10-10 07:39:23 - INFO : Start Pruning
+2025-10-10 07:40:10 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+2025-10-10 07:40:10 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+2025-10-10 07:40:20 - INFO : Start Backwarding in iterative steps = 0...
+2025-10-10 07:40:21 - INFO : Loss = 3.6055665016174316
+2025-10-10 07:40:22 - INFO : After Iter 1/1, #parameters: 2315209728
+2025-10-10 07:40:22 - INFO : #Param before: 3085938688, #Param after: 2315209728, Ratio = 75.0245%
+2025-10-10 07:41:09 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-10 07:41:10 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-10 07:41:10 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-10 07:41:10 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-10 07:41:10 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-10 07:41:10 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-10 07:41:14 - INFO : PPL after pruning: {'c4': 10.963057921054194, 'wikitext2': 31.846818916590372, 'ptb': 54.06755996279899}
+2025-10-10 07:41:14 - INFO : Memory Requirement: 10322.89794921875 MiB
+
Qwen2.5-3B-Instruct-bl-0.25-c4book/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.25-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 8
+- block_attention_layer_end: 28
+- block_mlp_layer_start: 8
+- block_mlp_layer_end: 28
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.25-c4book/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44981d1747d90431ddbf55f4c8725d3719a777bf0ee3dcbcc51d58d859ee6bf6
+size 4637611631
Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/2025-06-16-04-57-53/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: qwen_checkpoints/Qwen2_5_3B_Instruct_bl_0_3
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 3
+- block_attention_layer_end: 6
+- block_mlp_layer_start: 3
+- block_mlp_layer_end: 6
+- iterative_steps: 5
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 30
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.6
Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/2025-06-16-04-57-53/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model qwen_checkpoints/Qwen2_5_3B_Instruct_bl_0_3 --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 3 --block_mlp_layer_end 6 --block_attention_layer_start 3 --block_attention_layer_end 6 --max_seq_len 2048 --num_examples 30
Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/2025-06-16-04-57-53/training.log
ADDED
@@ -0,0 +1,7 @@
+2025-06-16 04:57:53 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-06-16 04:57:55 - INFO : Use taylor pruner...
+2025-06-16 04:57:55 - INFO : Pruning Attention Layer = [3, 4, 5]
+2025-06-16 04:57:55 - INFO : Pruning MLP Layer = [3, 4, 5]
+2025-06-16 04:57:55 - INFO : Start Pruning
+2025-06-16 04:59:26 - INFO : Start Backwarding in iterative steps = 0...
+2025-06-16 04:59:26 - INFO : Loss = 4.31801176071167
Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: qwen_checkpoints/Qwen2_5_3B_Instruct_bl_0_3
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 3
+- block_attention_layer_end: 6
+- block_mlp_layer_start: 3
+- block_mlp_layer_end: 6
+- iterative_steps: 5
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 30
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.6
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-40-52/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-c4book
+- pruning_ratio: 0.3
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 6
+- block_attention_layer_end: 30
+- block_mlp_layer_start: 6
+- block_mlp_layer_end: 30
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 30
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.6
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-40-52/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.3 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.3-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 6 --block_mlp_layer_end 30 --block_attention_layer_start 6 --block_attention_layer_end 30 --max_seq_len 2048 --num_examples 30
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-40-52/training.log
ADDED
@@ -0,0 +1,7 @@
+2025-06-15 07:40:53 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-06-15 07:40:55 - INFO : Use taylor pruner...
+2025-06-15 07:40:55 - INFO : Pruning Attention Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
+2025-06-15 07:40:55 - INFO : Pruning MLP Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
+2025-06-15 07:40:56 - INFO : Start Pruning
+2025-06-15 07:41:54 - INFO : Start Backwarding in iterative steps = 0...
+2025-06-15 07:41:55 - INFO : Loss = 3.5623340606689453
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-42-19/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 6
+- block_attention_layer_end: 30
+- block_mlp_layer_start: 6
+- block_mlp_layer_end: 30
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 30
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.6
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-42-19/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.3-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 6 --block_mlp_layer_end 30 --block_attention_layer_start 6 --block_attention_layer_end 30 --max_seq_len 2048 --num_examples 30
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-42-19/training.log
ADDED
@@ -0,0 +1,12 @@
+2025-06-15 07:42:20 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-06-15 07:42:22 - INFO : Use taylor pruner...
+2025-06-15 07:42:22 - INFO : Pruning Attention Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
+2025-06-15 07:42:22 - INFO : Pruning MLP Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
+2025-06-15 07:42:22 - INFO : Start Pruning
+2025-06-15 07:43:21 - INFO : Start Backwarding in iterative steps = 0...
+2025-06-15 07:43:21 - INFO : Loss = 3.5623340606689453
+2025-06-15 07:43:23 - INFO : After Iter 1/1, #parameters: 2161063936
+2025-06-15 07:43:23 - INFO : #Param before: 3085938688, #Param after: 2161063936, Ratio = 70.0294%
+2025-06-15 07:44:27 - INFO : PPL after pruning: {'c4': 19.429584428039178, 'wikitext2': 49.037147591523336, 'ptb': 71.48811770889668}
+2025-06-15 07:44:27 - INFO : Memory Requirement: 11301.94384765625 MiB
+
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-34-09/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 6
+- block_attention_layer_end: 32
+- block_mlp_layer_start: 6
+- block_mlp_layer_end: 32
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-34-09/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.3-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 6 --block_mlp_layer_end 32 --block_attention_layer_start 6 --block_attention_layer_end 32 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-34-09/training.log
ADDED
@@ -0,0 +1,5 @@
+2025-10-07 07:34:11 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-07 07:34:13 - INFO : Use taylor pruner...
+2025-10-07 07:34:13 - INFO : Pruning Attention Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
+2025-10-07 07:34:13 - INFO : Pruning MLP Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
+2025-10-07 07:34:14 - INFO : Start Pruning
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-38-22/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 6
+- block_attention_layer_end: 32
+- block_mlp_layer_start: 6
+- block_mlp_layer_end: 32
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-38-22/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.3-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 6 --block_mlp_layer_end 32 --block_attention_layer_start 6 --block_attention_layer_end 32 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-38-22/training.log
ADDED
@@ -0,0 +1,24 @@
+2025-10-07 07:38:26 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-07 07:38:28 - INFO : Use taylor pruner...
+2025-10-07 07:38:28 - INFO : Pruning Attention Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
+2025-10-07 07:38:28 - INFO : Pruning MLP Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
+2025-10-07 07:38:28 - INFO : Start Pruning
+2025-10-07 07:39:11 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+2025-10-07 07:39:11 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+2025-10-07 07:39:21 - INFO : Start Backwarding in iterative steps = 0...
+2025-10-07 07:39:22 - INFO : Loss = 3.6055665016174316
+2025-10-07 07:39:23 - INFO : After Iter 1/1, #parameters: 2083991040
+2025-10-07 07:39:23 - INFO : #Param before: 3085938688, #Param after: 2083991040, Ratio = 67.5318%
+2025-10-07 07:40:19 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-07 07:40:20 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-07 07:40:20 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-07 07:40:20 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-07 07:40:20 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-07 07:40:20 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-07 07:40:24 - INFO : PPL after pruning: {'c4': 27.83159468972004, 'wikitext2': 72.04880596519355, 'ptb': 104.4216991379013}
+2025-10-07 07:40:24 - INFO : Memory Requirement: 11648.94482421875 MiB
+
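Compared with the completed 2025-06-15 run above, this run widens the pruned range from layers 6-29 to 6-31 and uses 20 calibration examples instead of 30, so the two runs differ in more than the layer range; still, it ends with fewer parameters kept and noticeably worse perplexity. A small tabulation of the logged numbers (PPL rounded to two decimals):

    # Figures taken from the two completed training.log files above.
    runs = {
        "layers 6-29, 30 examples": dict(kept="70.0294%", c4=19.43, wikitext2=49.04, ptb=71.49),
        "layers 6-31, 20 examples": dict(kept="67.5318%", c4=27.83, wikitext2=72.05, ptb=104.42),
    }
    for name, r in runs.items():
        print(f"{name}: {r['kept']} params kept, "
              f"c4/wikitext2/ptb PPL = {r['c4']}/{r['wikitext2']}/{r['ptb']}")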
Qwen2.5-3B-Instruct-bl-0.3-c4book/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 6
+- block_attention_layer_end: 32
+- block_mlp_layer_start: 6
+- block_mlp_layer_end: 32
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
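Every run writes the same flat `- key: value` description, so these files are easy to load programmatically. A minimal sketch, assuming only the layout shown above (the helper name is ours, not part of the repo):

    import ast

    def parse_description(path):
        # Parse "- key: value" lines into a dict; literal_eval turns "0.5",
        # "True", "32" into Python values and falls back to the raw string
        # for names like "Qwen/Qwen2.5-3B-Instruct" or "taylor".
        params = {}
        with open(path) as f:
            for line in f:
                line = line.strip().lstrip("- ").strip()
                if not line or line.startswith("Training Parameters"):
                    continue
                key, _, value = line.partition(":")
                try:
                    params[key.strip()] = ast.literal_eval(value.strip())
                except (ValueError, SyntaxError):
                    params[key.strip()] = value.strip()
        return params

    params = parse_description("Qwen2.5-3B-Instruct-bl-0.3-c4book/description.txt")
    assert params["pruning_ratio"] == 0.5 and params["block_wise"] is True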
Qwen2.5-3B-Instruct-bl-0.3-c4book/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3bd01c38add15eb14321d0db1ce4b4da767a38acb8e763e0511ec4a3fbfb03ad
+size 4175173811
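This entry is a Git LFS pointer rather than the weights themselves: per the LFS spec it records only the sha256 oid and byte size of the real ~4.2 GB file. After downloading the actual pytorch_model.bin, it can be checked against the pointer. A minimal sketch:

    # Verify a downloaded LFS object against the pointer's sha256 oid and size.
    import hashlib, os

    path = "Qwen2.5-3B-Instruct-bl-0.3-c4book/pytorch_model.bin"
    expected_oid = "3bd01c38add15eb14321d0db1ce4b4da767a38acb8e763e0511ec4a3fbfb03ad"
    expected_size = 4175173811

    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)

    assert os.path.getsize(path) == expected_size, "size mismatch"
    assert h.hexdigest() == expected_oid, "sha256 mismatch"
    print("downloaded file matches the LFS pointer")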