HuangRT committed
Commit f9e95a8 · verified · 1 Parent(s): d7441cc

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-18-49/description.txt +28 -0
  2. Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-18-49/train.sh +1 -0
  3. Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-18-49/training.log +24 -0
  4. Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-26-27/description.txt +28 -0
  5. Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-26-27/train.sh +1 -0
  6. Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-26-27/training.log +24 -0
  7. Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-22-09-56-09/description.txt +28 -0
  8. Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-22-09-56-09/train.sh +1 -0
  9. Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-22-09-56-09/training.log +24 -0
  10. Qwen2.5-3B-Instruct-bl-0.2-c4book/description.txt +28 -0
  11. Qwen2.5-3B-Instruct-bl-0.2-c4book/pytorch_model.bin +3 -0
  12. Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-05-14/description.txt +28 -0
  13. Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-05-14/train.sh +1 -0
  14. Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-05-14/training.log +24 -0
  15. Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-08-10/description.txt +28 -0
  16. Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-08-10/train.sh +1 -0
  17. Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-08-10/training.log +5 -0
  18. Qwen2.5-3B-Instruct-bl-0.215-c4book/description.txt +28 -0
  19. Qwen2.5-3B-Instruct-bl-0.215-c4book/pytorch_model.bin +3 -0
  20. Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-31-38/description.txt +28 -0
  21. Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-31-38/train.sh +1 -0
  22. Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-31-38/training.log +8 -0
  23. Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-41-33/description.txt +28 -0
  24. Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-41-33/train.sh +1 -0
  25. Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-41-33/training.log +24 -0
  26. Qwen2.5-3B-Instruct-bl-0.225-c4book/description.txt +28 -0
  27. Qwen2.5-3B-Instruct-bl-0.225-c4book/pytorch_model.bin +3 -0
  28. Qwen2.5-3B-Instruct-bl-0.25-c4book/2025-10-10-07-39-18/description.txt +28 -0
  29. Qwen2.5-3B-Instruct-bl-0.25-c4book/2025-10-10-07-39-18/train.sh +1 -0
  30. Qwen2.5-3B-Instruct-bl-0.25-c4book/2025-10-10-07-39-18/training.log +24 -0
  31. Qwen2.5-3B-Instruct-bl-0.25-c4book/description.txt +28 -0
  32. Qwen2.5-3B-Instruct-bl-0.25-c4book/pytorch_model.bin +3 -0
  33. Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/2025-06-16-04-57-53/description.txt +28 -0
  34. Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/2025-06-16-04-57-53/train.sh +1 -0
  35. Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/2025-06-16-04-57-53/training.log +7 -0
  36. Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/description.txt +28 -0
  37. Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-40-52/description.txt +28 -0
  38. Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-40-52/train.sh +1 -0
  39. Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-40-52/training.log +7 -0
  40. Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-42-19/description.txt +28 -0
  41. Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-42-19/train.sh +1 -0
  42. Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-42-19/training.log +12 -0
  43. Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-34-09/description.txt +28 -0
  44. Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-34-09/train.sh +1 -0
  45. Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-34-09/training.log +5 -0
  46. Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-38-22/description.txt +28 -0
  47. Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-38-22/train.sh +1 -0
  48. Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-38-22/training.log +24 -0
  49. Qwen2.5-3B-Instruct-bl-0.3-c4book/description.txt +28 -0
  50. Qwen2.5-3B-Instruct-bl-0.3-c4book/pytorch_model.bin +3 -0
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-18-49/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: Qwen/Qwen2.5-3B-Instruct
+ - save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.2-c4book
+ - pruning_ratio: 0.5
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 14
+ - block_attention_layer_end: 22
+ - block_mlp_layer_start: 14
+ - block_mlp_layer_end: 22
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: vectorize
+ - num_examples: 20
+ - device: cuda
+ - test_before_train: False
+ - eval_device: cuda
+ - test_after_train: False
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-18-49/train.sh ADDED
@@ -0,0 +1 @@
+ python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.2-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 14 --block_mlp_layer_end 22 --block_attention_layer_start 14 --block_attention_layer_end 22 --max_seq_len 2048 --num_examples 20
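A note on the layer flags in this command: comparing them with the log below, `--block_attention_layer_start/--block_attention_layer_end` (and the MLP pair) appear to select a half-open range, so start 14 and end 22 prune layers 14 through 21. A minimal sketch of that selection, assuming the half-open convention (the helper is hypothetical, not part of qwen.py):

# Hypothetical helper mirroring the selection the log reports:
# start=14, end=22 yields "Pruning Attention Layer = [14, ..., 21]",
# i.e. the end index is exclusive.
def layers_to_prune(start: int, end: int) -> list[int]:
    return list(range(start, end))

assert layers_to_prune(14, 22) == [14, 15, 16, 17, 18, 19, 20, 21]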
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-18-49/training.log ADDED
@@ -0,0 +1,24 @@
+ 2025-10-07 07:18:52 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+ 2025-10-07 07:18:54 - INFO : Use taylor pruner...
+ 2025-10-07 07:18:54 - INFO : Pruning Attention Layer = [14, 15, 16, 17, 18, 19, 20, 21]
+ 2025-10-07 07:18:54 - INFO : Pruning MLP Layer = [14, 15, 16, 17, 18, 19, 20, 21]
+ 2025-10-07 07:18:54 - INFO : Start Pruning
+ 2025-10-07 07:19:45 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+ 2025-10-07 07:19:45 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+ 2025-10-07 07:19:56 - INFO : Start Backwarding in iterative steps = 0...
+ 2025-10-07 07:19:56 - INFO : Loss = 3.6055665016174316
+ 2025-10-07 07:19:57 - INFO : After Iter 1/1, #parameters: 2777647104
+ 2025-10-07 07:19:57 - INFO : #Param before: 3085938688, #Param after: 2777647104, Ratio = 90.0098%
+ 2025-10-07 07:20:57 - ERROR : `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ 2025-10-07 07:20:58 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+ 2025-10-07 07:20:58 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+ 2025-10-07 07:20:58 - ERROR : `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ 2025-10-07 07:20:58 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+ 2025-10-07 07:20:58 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+ 2025-10-07 07:21:03 - INFO : PPL after pruning: {'c4': 6.662430613808609, 'wikitext2': 11.784638661080912, 'ptb': 22.494734284035275}
+ 2025-10-07 07:21:03 - INFO : Memory Requirement: 7670.80419921875 MiB
+
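Note that the Ratio this log reports is retained parameters, not parameters removed: 2777647104 / 3085938688 ≈ 0.900098, matching the 90.0098% above. A one-line check in Python:

# Verify the retention ratio the log reports above.
before, after = 3085938688, 2777647104
print(f"Ratio = {100 * after / before:.4f}%")  # -> Ratio = 90.0098%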
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-26-27/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: Qwen/Qwen2.5-3B-Instruct
+ - save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.2-c4book
+ - pruning_ratio: 0.5
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 10
+ - block_attention_layer_end: 26
+ - block_mlp_layer_start: 10
+ - block_mlp_layer_end: 26
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: vectorize
+ - num_examples: 20
+ - device: cuda
+ - test_before_train: False
+ - eval_device: cuda
+ - test_after_train: False
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-26-27/train.sh ADDED
@@ -0,0 +1 @@
+ python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.2-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 10 --block_mlp_layer_end 26 --block_attention_layer_start 10 --block_attention_layer_end 26 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-26-27/training.log ADDED
@@ -0,0 +1,24 @@
+ 2025-10-07 07:26:30 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+ 2025-10-07 07:26:32 - INFO : Use taylor pruner...
+ 2025-10-07 07:26:32 - INFO : Pruning Attention Layer = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]
+ 2025-10-07 07:26:32 - INFO : Pruning MLP Layer = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]
+ 2025-10-07 07:26:33 - INFO : Start Pruning
+ 2025-10-07 07:27:13 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+ 2025-10-07 07:27:13 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+ 2025-10-07 07:27:23 - INFO : Start Backwarding in iterative steps = 0...
+ 2025-10-07 07:27:23 - INFO : Loss = 3.6055665016174316
+ 2025-10-07 07:27:25 - INFO : After Iter 1/1, #parameters: 2469355520
+ 2025-10-07 07:27:25 - INFO : #Param before: 3085938688, #Param after: 2469355520, Ratio = 80.0196%
+ 2025-10-07 07:28:36 - ERROR : `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ 2025-10-07 07:28:37 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+ 2025-10-07 07:28:37 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+ 2025-10-07 07:28:37 - ERROR : `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ 2025-10-07 07:28:37 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+ 2025-10-07 07:28:37 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+ 2025-10-07 07:28:41 - INFO : PPL after pruning: {'c4': 8.791876103118177, 'wikitext2': 22.059646737841184, 'ptb': 41.2128458403547}
+ 2025-10-07 07:28:41 - INFO : Memory Requirement: 9438.86669921875 MiB
+
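The `trust_remote_code` errors in these logs come from the evaluation step: recent `datasets` releases no longer execute dataset loading scripts, so script-based sets like ptb_text_only fail over to the local cache (as the WARNING lines show). A hedged sketch of the plain call, assuming a cached or Parquet-converted copy of the dataset is reachable:

from datasets import load_dataset

# No trust_remote_code: script execution was removed from `datasets`,
# so this resolves from a converted Hub copy or the local cache.
ptb = load_dataset("ptb_text_only", "penn_treebank", split="test")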
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-22-09-56-09/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: Qwen/Qwen2.5-3B-Instruct
+ - save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.2-c4book
+ - pruning_ratio: 0.5
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 9
+ - block_attention_layer_end: 27
+ - block_mlp_layer_start: 9
+ - block_mlp_layer_end: 27
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: vectorize
+ - num_examples: 20
+ - device: cuda
+ - test_before_train: False
+ - eval_device: cuda
+ - test_after_train: False
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-22-09-56-09/train.sh ADDED
@@ -0,0 +1 @@
+ python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.2-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 9 --block_mlp_layer_end 27 --block_attention_layer_start 9 --block_attention_layer_end 27 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-22-09-56-09/training.log ADDED
@@ -0,0 +1,24 @@
+ 2025-10-22 09:56:11 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+ 2025-10-22 09:56:14 - INFO : Use taylor pruner...
+ 2025-10-22 09:56:14 - INFO : Pruning Attention Layer = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
+ 2025-10-22 09:56:14 - INFO : Pruning MLP Layer = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
+ 2025-10-22 09:56:14 - INFO : Start Pruning
+ 2025-10-22 09:57:08 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+ 2025-10-22 09:57:08 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+ 2025-10-22 09:57:18 - INFO : Start Backwarding in iterative steps = 0...
+ 2025-10-22 09:57:19 - INFO : Loss = 3.6055665016174316
+ 2025-10-22 09:57:20 - INFO : After Iter 1/1, #parameters: 2392282624
+ 2025-10-22 09:57:20 - INFO : #Param before: 3085938688, #Param after: 2392282624, Ratio = 77.5220%
+ 2025-10-22 09:58:10 - ERROR : `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ 2025-10-22 09:58:11 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+ 2025-10-22 09:58:11 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+ 2025-10-22 09:58:11 - ERROR : `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ 2025-10-22 09:58:11 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+ 2025-10-22 09:58:11 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+ 2025-10-22 09:58:15 - INFO : PPL after pruning: {'c4': 9.827221607438576, 'wikitext2': 26.453556883236853, 'ptb': 47.90120329104557}
+ 2025-10-22 09:58:15 - INFO : Memory Requirement: 9880.88232421875 MiB
+
Qwen2.5-3B-Instruct-bl-0.2-c4book/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: Qwen/Qwen2.5-3B-Instruct
+ - save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.2-c4book
+ - pruning_ratio: 0.5
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 9
+ - block_attention_layer_end: 27
+ - block_mlp_layer_start: 9
+ - block_mlp_layer_end: 27
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: vectorize
+ - num_examples: 20
+ - device: cuda
+ - test_before_train: False
+ - eval_device: cuda
+ - test_after_train: False
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.2-c4book/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bab2adb5206c3f6dd0895845a69a647bda2be5ed4ba1b68dbecf7b8a5b46a486
+ size 4791757567
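pytorch_model.bin is committed as a Git LFS pointer: the three lines above record only the LFS spec version, the SHA-256 of the actual blob, and its size in bytes (~4.8 GB). A small sketch for verifying a downloaded copy against the pointer (the local file name is an assumption):

import hashlib, os

path = "pytorch_model.bin"  # assumed download location
expected_oid = "bab2adb5206c3f6dd0895845a69a647bda2be5ed4ba1b68dbecf7b8a5b46a486"
expected_size = 4791757567

# Size check first (cheap), then stream the SHA-256 of the blob.
assert os.path.getsize(path) == expected_size
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
assert h.hexdigest() == expected_oid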
Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-05-14/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: Qwen/Qwen2.5-3B-Instruct
+ - save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.215-c4book
+ - pruning_ratio: 0.5
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 11
+ - block_attention_layer_end: 25
+ - block_mlp_layer_start: 11
+ - block_mlp_layer_end: 25
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: vectorize
+ - num_examples: 20
+ - device: cuda
+ - test_before_train: False
+ - eval_device: cuda
+ - test_after_train: False
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-05-14/train.sh ADDED
@@ -0,0 +1 @@
+ python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.215-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 11 --block_mlp_layer_end 25 --block_attention_layer_start 11 --block_attention_layer_end 25 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-05-14/training.log ADDED
@@ -0,0 +1,24 @@
+ 2025-10-10 14:05:17 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+ 2025-10-10 14:05:19 - INFO : Use taylor pruner...
+ 2025-10-10 14:05:19 - INFO : Pruning Attention Layer = [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
+ 2025-10-10 14:05:19 - INFO : Pruning MLP Layer = [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
+ 2025-10-10 14:05:19 - INFO : Start Pruning
+ 2025-10-10 14:06:15 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+ 2025-10-10 14:06:15 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+ 2025-10-10 14:06:25 - INFO : Start Backwarding in iterative steps = 0...
+ 2025-10-10 14:06:26 - INFO : Loss = 3.6055665016174316
+ 2025-10-10 14:06:27 - INFO : After Iter 1/1, #parameters: 2546428416
+ 2025-10-10 14:06:27 - INFO : #Param before: 3085938688, #Param after: 2546428416, Ratio = 82.5171%
+ 2025-10-10 14:07:38 - ERROR : `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ 2025-10-10 14:07:38 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+ 2025-10-10 14:07:38 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+ 2025-10-10 14:07:38 - ERROR : `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ 2025-10-10 14:07:39 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+ 2025-10-10 14:07:39 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+ 2025-10-10 14:07:43 - INFO : PPL after pruning: {'c4': 8.115290514356445, 'wikitext2': 18.075026786359576, 'ptb': 34.97688798216538}
+ 2025-10-10 14:07:43 - INFO : Memory Requirement: 8996.85107421875 MiB
+
Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-08-10/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: Qwen/Qwen2.5-3B-Instruct
+ - save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.215-c4book
+ - pruning_ratio: 0.5
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 10
+ - block_attention_layer_end: 25
+ - block_mlp_layer_start: 10
+ - block_mlp_layer_end: 25
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: vectorize
+ - num_examples: 20
+ - device: cuda
+ - test_before_train: False
+ - eval_device: cuda
+ - test_after_train: False
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-08-10/train.sh ADDED
@@ -0,0 +1 @@
+ python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.215-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 10 --block_mlp_layer_end 25 --block_attention_layer_start 10 --block_attention_layer_end 25 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-08-10/training.log ADDED
@@ -0,0 +1,5 @@
+ 2025-10-10 14:08:13 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+ 2025-10-10 14:08:16 - INFO : Use taylor pruner...
+ 2025-10-10 14:08:16 - INFO : Pruning Attention Layer = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
+ 2025-10-10 14:08:16 - INFO : Pruning MLP Layer = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
+ 2025-10-10 14:08:17 - INFO : Start Pruning
Qwen2.5-3B-Instruct-bl-0.215-c4book/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: Qwen/Qwen2.5-3B-Instruct
+ - save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.215-c4book
+ - pruning_ratio: 0.5
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 10
+ - block_attention_layer_end: 25
+ - block_mlp_layer_start: 10
+ - block_mlp_layer_end: 25
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: vectorize
+ - num_examples: 20
+ - device: cuda
+ - test_before_train: False
+ - eval_device: cuda
+ - test_after_train: False
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.215-c4book/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c0b198b559663b1573ff340663d4d3c23896bb3459f94b998bddac1d33b8f126
+ size 5100049439
Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-31-38/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: Qwen/Qwen2.5-3B-Instruct
+ - save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.225-c4book
+ - pruning_ratio: 0.5
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 9
+ - block_attention_layer_end: 27
+ - block_mlp_layer_start: 9
+ - block_mlp_layer_end: 27
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: vectorize
+ - num_examples: 20
+ - device: cuda
+ - test_before_train: False
+ - eval_device: cuda
+ - test_after_train: False
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-31-38/train.sh ADDED
@@ -0,0 +1 @@
+ python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.225-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 9 --block_mlp_layer_end 27 --block_attention_layer_start 9 --block_attention_layer_end 27 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-31-38/training.log ADDED
@@ -0,0 +1,8 @@
+ 2025-10-10 17:31:41 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+ 2025-10-10 17:31:43 - INFO : Use taylor pruner...
+ 2025-10-10 17:31:43 - INFO : Pruning Attention Layer = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
+ 2025-10-10 17:31:43 - INFO : Pruning MLP Layer = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
+ 2025-10-10 17:31:44 - INFO : Start Pruning
+ 2025-10-10 17:32:38 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+ 2025-10-10 17:32:38 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+ 2025-10-10 17:32:49 - INFO : Start Backwarding in iterative steps = 0...
Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-41-33/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: Qwen/Qwen2.5-3B-Instruct
+ - save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.225-c4book
+ - pruning_ratio: 0.5
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 9
+ - block_attention_layer_end: 27
+ - block_mlp_layer_start: 9
+ - block_mlp_layer_end: 27
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: vectorize
+ - num_examples: 20
+ - device: cuda
+ - test_before_train: False
+ - eval_device: cuda
+ - test_after_train: False
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-41-33/train.sh ADDED
@@ -0,0 +1 @@
+ python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.225-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 9 --block_mlp_layer_end 27 --block_attention_layer_start 9 --block_attention_layer_end 27 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-41-33/training.log ADDED
@@ -0,0 +1,24 @@
+ 2025-10-10 17:41:36 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+ 2025-10-10 17:41:38 - INFO : Use taylor pruner...
+ 2025-10-10 17:41:38 - INFO : Pruning Attention Layer = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
+ 2025-10-10 17:41:38 - INFO : Pruning MLP Layer = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
+ 2025-10-10 17:41:39 - INFO : Start Pruning
+ 2025-10-10 17:42:37 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+ 2025-10-10 17:42:37 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+ 2025-10-10 17:42:47 - INFO : Start Backwarding in iterative steps = 0...
+ 2025-10-10 17:42:47 - INFO : Loss = 3.6055665016174316
+ 2025-10-10 17:42:49 - INFO : After Iter 1/1, #parameters: 2392282624
+ 2025-10-10 17:42:49 - INFO : #Param before: 3085938688, #Param after: 2392282624, Ratio = 77.5220%
+ 2025-10-10 17:43:52 - ERROR : `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ 2025-10-10 17:43:52 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+ 2025-10-10 17:43:52 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+ 2025-10-10 17:43:52 - ERROR : `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ 2025-10-10 17:43:53 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+ 2025-10-10 17:43:53 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+ 2025-10-10 17:43:57 - INFO : PPL after pruning: {'c4': 9.827221607438576, 'wikitext2': 26.453556883236853, 'ptb': 47.90120329104557}
+ 2025-10-10 17:43:57 - INFO : Memory Requirement: 9880.88232421875 MiB
+
Qwen2.5-3B-Instruct-bl-0.225-c4book/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: Qwen/Qwen2.5-3B-Instruct
+ - save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.225-c4book
+ - pruning_ratio: 0.5
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 9
+ - block_attention_layer_end: 27
+ - block_mlp_layer_start: 9
+ - block_mlp_layer_end: 27
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: vectorize
+ - num_examples: 20
+ - device: cuda
+ - test_before_train: False
+ - eval_device: cuda
+ - test_after_train: False
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.225-c4book/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5ff600fc12f5785ca0ea11fb91a233cf7e43a2ec36b7b41098082e44345cfe26
+ size 4791757567
Qwen2.5-3B-Instruct-bl-0.25-c4book/2025-10-10-07-39-18/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: Qwen/Qwen2.5-3B-Instruct
+ - save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.25-c4book
+ - pruning_ratio: 0.5
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 8
+ - block_attention_layer_end: 28
+ - block_mlp_layer_start: 8
+ - block_mlp_layer_end: 28
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: vectorize
+ - num_examples: 20
+ - device: cuda
+ - test_before_train: False
+ - eval_device: cuda
+ - test_after_train: False
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.25-c4book/2025-10-10-07-39-18/train.sh ADDED
@@ -0,0 +1 @@
+ python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.25-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 8 --block_mlp_layer_end 28 --block_attention_layer_start 8 --block_attention_layer_end 28 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.25-c4book/2025-10-10-07-39-18/training.log ADDED
@@ -0,0 +1,24 @@
+ 2025-10-10 07:39:21 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+ 2025-10-10 07:39:23 - INFO : Use taylor pruner...
+ 2025-10-10 07:39:23 - INFO : Pruning Attention Layer = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]
+ 2025-10-10 07:39:23 - INFO : Pruning MLP Layer = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]
+ 2025-10-10 07:39:23 - INFO : Start Pruning
+ 2025-10-10 07:40:10 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+ 2025-10-10 07:40:10 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+ 2025-10-10 07:40:20 - INFO : Start Backwarding in iterative steps = 0...
+ 2025-10-10 07:40:21 - INFO : Loss = 3.6055665016174316
+ 2025-10-10 07:40:22 - INFO : After Iter 1/1, #parameters: 2315209728
+ 2025-10-10 07:40:22 - INFO : #Param before: 3085938688, #Param after: 2315209728, Ratio = 75.0245%
+ 2025-10-10 07:41:09 - ERROR : `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ 2025-10-10 07:41:10 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+ 2025-10-10 07:41:10 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+ 2025-10-10 07:41:10 - ERROR : `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ 2025-10-10 07:41:10 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+ 2025-10-10 07:41:10 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+ 2025-10-10 07:41:14 - INFO : PPL after pruning: {'c4': 10.963057921054194, 'wikitext2': 31.846818916590372, 'ptb': 54.06755996279899}
+ 2025-10-10 07:41:14 - INFO : Memory Requirement: 10322.89794921875 MiB
+
Qwen2.5-3B-Instruct-bl-0.25-c4book/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: Qwen/Qwen2.5-3B-Instruct
+ - save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.25-c4book
+ - pruning_ratio: 0.5
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 8
+ - block_attention_layer_end: 28
+ - block_mlp_layer_start: 8
+ - block_mlp_layer_end: 28
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: vectorize
+ - num_examples: 20
+ - device: cuda
+ - test_before_train: False
+ - eval_device: cuda
+ - test_after_train: False
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.25-c4book/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:44981d1747d90431ddbf55f4c8725d3719a777bf0ee3dcbcc51d58d859ee6bf6
+ size 4637611631
Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/2025-06-16-04-57-53/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: qwen_checkpoints/Qwen2_5_3B_Instruct_bl_0_3
+ - save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book
+ - pruning_ratio: 0.5
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 3
+ - block_attention_layer_end: 6
+ - block_mlp_layer_start: 3
+ - block_mlp_layer_end: 6
+ - iterative_steps: 5
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: vectorize
+ - num_examples: 30
+ - device: cuda
+ - test_before_train: False
+ - eval_device: cuda
+ - test_after_train: False
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.6
Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/2025-06-16-04-57-53/train.sh ADDED
@@ -0,0 +1 @@
+ python qwen.py --base_model qwen_checkpoints/Qwen2_5_3B_Instruct_bl_0_3 --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 3 --block_mlp_layer_end 6 --block_attention_layer_start 3 --block_attention_layer_end 6 --max_seq_len 2048 --num_examples 30
Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/2025-06-16-04-57-53/training.log ADDED
@@ -0,0 +1,7 @@
+ 2025-06-16 04:57:53 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+ 2025-06-16 04:57:55 - INFO : Use taylor pruner...
+ 2025-06-16 04:57:55 - INFO : Pruning Attention Layer = [3, 4, 5]
+ 2025-06-16 04:57:55 - INFO : Pruning MLP Layer = [3, 4, 5]
+ 2025-06-16 04:57:55 - INFO : Start Pruning
+ 2025-06-16 04:59:26 - INFO : Start Backwarding in iterative steps = 0...
+ 2025-06-16 04:59:26 - INFO : Loss = 4.31801176071167
Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: qwen_checkpoints/Qwen2_5_3B_Instruct_bl_0_3
+ - save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book
+ - pruning_ratio: 0.5
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 3
+ - block_attention_layer_end: 6
+ - block_mlp_layer_start: 3
+ - block_mlp_layer_end: 6
+ - iterative_steps: 5
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: vectorize
+ - num_examples: 30
+ - device: cuda
+ - test_before_train: False
+ - eval_device: cuda
+ - test_after_train: False
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.6
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-40-52/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: Qwen/Qwen2.5-3B-Instruct
+ - save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-c4book
+ - pruning_ratio: 0.3
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 6
+ - block_attention_layer_end: 30
+ - block_mlp_layer_start: 6
+ - block_mlp_layer_end: 30
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: vectorize
+ - num_examples: 30
+ - device: cuda
+ - test_before_train: False
+ - eval_device: cuda
+ - test_after_train: False
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.6
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-40-52/train.sh ADDED
@@ -0,0 +1 @@
+ python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.3 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.3-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 6 --block_mlp_layer_end 30 --block_attention_layer_start 6 --block_attention_layer_end 30 --max_seq_len 2048 --num_examples 30
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-40-52/training.log ADDED
@@ -0,0 +1,7 @@
+ 2025-06-15 07:40:53 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+ 2025-06-15 07:40:55 - INFO : Use taylor pruner...
+ 2025-06-15 07:40:55 - INFO : Pruning Attention Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
+ 2025-06-15 07:40:55 - INFO : Pruning MLP Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
+ 2025-06-15 07:40:56 - INFO : Start Pruning
+ 2025-06-15 07:41:54 - INFO : Start Backwarding in iterative steps = 0...
+ 2025-06-15 07:41:55 - INFO : Loss = 3.5623340606689453
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-42-19/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: Qwen/Qwen2.5-3B-Instruct
+ - save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-c4book
+ - pruning_ratio: 0.5
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 6
+ - block_attention_layer_end: 30
+ - block_mlp_layer_start: 6
+ - block_mlp_layer_end: 30
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: vectorize
+ - num_examples: 30
+ - device: cuda
+ - test_before_train: False
+ - eval_device: cuda
+ - test_after_train: False
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.6
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-42-19/train.sh ADDED
@@ -0,0 +1 @@
+ python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.3-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 6 --block_mlp_layer_end 30 --block_attention_layer_start 6 --block_attention_layer_end 30 --max_seq_len 2048 --num_examples 30
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-42-19/training.log ADDED
@@ -0,0 +1,12 @@
+ 2025-06-15 07:42:20 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+ 2025-06-15 07:42:22 - INFO : Use taylor pruner...
+ 2025-06-15 07:42:22 - INFO : Pruning Attention Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
+ 2025-06-15 07:42:22 - INFO : Pruning MLP Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
+ 2025-06-15 07:42:22 - INFO : Start Pruning
+ 2025-06-15 07:43:21 - INFO : Start Backwarding in iterative steps = 0...
+ 2025-06-15 07:43:21 - INFO : Loss = 3.5623340606689453
+ 2025-06-15 07:43:23 - INFO : After Iter 1/1, #parameters: 2161063936
+ 2025-06-15 07:43:23 - INFO : #Param before: 3085938688, #Param after: 2161063936, Ratio = 70.0294%
+ 2025-06-15 07:44:27 - INFO : PPL after pruning: {'c4': 19.429584428039178, 'wikitext2': 49.037147591523336, 'ptb': 71.48811770889668}
+ 2025-06-15 07:44:27 - INFO : Memory Requirement: 11301.94384765625 MiB
+
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-34-09/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: Qwen/Qwen2.5-3B-Instruct
+ - save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-c4book
+ - pruning_ratio: 0.5
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 6
+ - block_attention_layer_end: 32
+ - block_mlp_layer_start: 6
+ - block_mlp_layer_end: 32
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: vectorize
+ - num_examples: 20
+ - device: cuda
+ - test_before_train: False
+ - eval_device: cuda
+ - test_after_train: False
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-34-09/train.sh ADDED
@@ -0,0 +1 @@
+ python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.3-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 6 --block_mlp_layer_end 32 --block_attention_layer_start 6 --block_attention_layer_end 32 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-34-09/training.log ADDED
@@ -0,0 +1,5 @@
+ 2025-10-07 07:34:11 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+ 2025-10-07 07:34:13 - INFO : Use taylor pruner...
+ 2025-10-07 07:34:13 - INFO : Pruning Attention Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
+ 2025-10-07 07:34:13 - INFO : Pruning MLP Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
+ 2025-10-07 07:34:14 - INFO : Start Pruning
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-38-22/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: Qwen/Qwen2.5-3B-Instruct
+ - save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-c4book
+ - pruning_ratio: 0.5
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 6
+ - block_attention_layer_end: 32
+ - block_mlp_layer_start: 6
+ - block_mlp_layer_end: 32
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: vectorize
+ - num_examples: 20
+ - device: cuda
+ - test_before_train: False
+ - eval_device: cuda
+ - test_after_train: False
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-38-22/train.sh ADDED
@@ -0,0 +1 @@
+ python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.3-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 6 --block_mlp_layer_end 32 --block_attention_layer_start 6 --block_attention_layer_end 32 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-38-22/training.log ADDED
@@ -0,0 +1,24 @@
+ 2025-10-07 07:38:26 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+ 2025-10-07 07:38:28 - INFO : Use taylor pruner...
+ 2025-10-07 07:38:28 - INFO : Pruning Attention Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
+ 2025-10-07 07:38:28 - INFO : Pruning MLP Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
+ 2025-10-07 07:38:28 - INFO : Start Pruning
+ 2025-10-07 07:39:11 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+ 2025-10-07 07:39:11 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+ 2025-10-07 07:39:21 - INFO : Start Backwarding in iterative steps = 0...
+ 2025-10-07 07:39:22 - INFO : Loss = 3.6055665016174316
+ 2025-10-07 07:39:23 - INFO : After Iter 1/1, #parameters: 2083991040
+ 2025-10-07 07:39:23 - INFO : #Param before: 3085938688, #Param after: 2083991040, Ratio = 67.5318%
+ 2025-10-07 07:40:19 - ERROR : `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ 2025-10-07 07:40:20 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+ 2025-10-07 07:40:20 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+ 2025-10-07 07:40:20 - ERROR : `trust_remote_code` is not supported anymore.
+ Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+ 2025-10-07 07:40:20 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+ 2025-10-07 07:40:20 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+ 2025-10-07 07:40:24 - INFO : PPL after pruning: {'c4': 27.83159468972004, 'wikitext2': 72.04880596519355, 'ptb': 104.4216991379013}
+ 2025-10-07 07:40:24 - INFO : Memory Requirement: 11648.94482421875 MiB
+
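Taken together, the logs show the expected trade-off: widening the pruned window from 8 layers (bl-0.2, 14-22) to 26 layers (bl-0.3, 6-32) drops retained parameters from 90.0% to 67.5% and raises c4 perplexity from 6.66 to 27.83. The PPL figures are the usual exp of the mean token-level cross-entropy; a minimal sketch of that computation with a Hugging Face causal LM (`model` and `input_ids` are assumed to exist, e.g. the pruned Qwen checkpoint and a tokenized evaluation text):

import math
import torch

def perplexity(model, input_ids: torch.Tensor) -> float:
    # HF causal LMs return the mean next-token cross-entropy as .loss
    # when labels are the inputs (the shift happens inside the model).
    with torch.no_grad():
        loss = model(input_ids, labels=input_ids).loss
    return math.exp(loss.item())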
Qwen2.5-3B-Instruct-bl-0.3-c4book/description.txt ADDED
@@ -0,0 +1,28 @@
+ - Training Parameters:
+ - base_model: Qwen/Qwen2.5-3B-Instruct
+ - save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-c4book
+ - pruning_ratio: 0.5
+ - pruner_type: taylor
+ - temperature: 1.0
+ - top_p: 0.95
+ - max_seq_len: 2048
+ - channel_wise: False
+ - block_wise: True
+ - layer_wise: False
+ - layer: 12
+ - block_attention_layer_start: 6
+ - block_attention_layer_end: 32
+ - block_mlp_layer_start: 6
+ - block_mlp_layer_end: 32
+ - iterative_steps: 1
+ - grouping_strategy: sum
+ - global_pruning: False
+ - taylor: vectorize
+ - num_examples: 20
+ - device: cuda
+ - test_before_train: False
+ - eval_device: cuda
+ - test_after_train: False
+ - seed: 42
+ - save_model: True
+ - torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.3-c4book/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3bd01c38add15eb14321d0db1ce4b4da767a38acb8e763e0511ec4a3fbfb03ad
+ size 4175173811