Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes. See raw diff
- Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-18-49/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-18-49/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-18-49/training.log +24 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-26-27/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-26-27/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-26-27/training.log +24 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-22-09-56-09/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-22-09-56-09/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-22-09-56-09/training.log +24 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.2-c4book/pytorch_model.bin +3 -0
- Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-05-14/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-05-14/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-05-14/training.log +24 -0
- Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-08-10/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-08-10/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-08-10/training.log +5 -0
- Qwen2.5-3B-Instruct-bl-0.215-c4book/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.215-c4book/pytorch_model.bin +3 -0
- Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-31-38/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-31-38/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-31-38/training.log +8 -0
- Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-41-33/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-41-33/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-41-33/training.log +24 -0
- Qwen2.5-3B-Instruct-bl-0.225-c4book/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.225-c4book/pytorch_model.bin +3 -0
- Qwen2.5-3B-Instruct-bl-0.25-c4book/2025-10-10-07-39-18/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.25-c4book/2025-10-10-07-39-18/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.25-c4book/2025-10-10-07-39-18/training.log +24 -0
- Qwen2.5-3B-Instruct-bl-0.25-c4book/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.25-c4book/pytorch_model.bin +3 -0
- Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/2025-06-16-04-57-53/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/2025-06-16-04-57-53/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/2025-06-16-04-57-53/training.log +7 -0
- Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-40-52/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-40-52/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-40-52/training.log +7 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-42-19/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-42-19/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-42-19/training.log +12 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-34-09/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-34-09/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-34-09/training.log +5 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-38-22/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-38-22/train.sh +1 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-38-22/training.log +24 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/description.txt +28 -0
- Qwen2.5-3B-Instruct-bl-0.3-c4book/pytorch_model.bin +3 -0
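The commit title above indicates this folder was pushed with huggingface_hub. As a rough sketch of that kind of upload — the local folder path and repo_id below are placeholders, not taken from this commit — the usual call is HfApi.upload_folder:

from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` by default
api.upload_folder(
    folder_path="qwen_checkpoints",                   # hypothetical local folder
    repo_id="your-username/pruned-qwen-checkpoints",  # hypothetical repo id
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)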
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-18-49/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.2-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 14
+- block_attention_layer_end: 22
+- block_mlp_layer_start: 14
+- block_mlp_layer_end: 22
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
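Judging by the training.log further below, the block_*_layer_start/end pairs select a half-open range of decoder layers. A minimal sketch of that mapping (mine, not from qwen.py):

# start=14, end=22 from the description above; the log below reports
# exactly these layers as pruned.
start, end = 14, 22
print(list(range(start, end)))  # [14, 15, 16, 17, 18, 19, 20, 21]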
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-18-49/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.2-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 14 --block_mlp_layer_end 22 --block_attention_layer_start 14 --block_attention_layer_end 22 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-18-49/training.log
ADDED
@@ -0,0 +1,24 @@
+2025-10-07 07:18:52 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-07 07:18:54 - INFO : Use taylor pruner...
+2025-10-07 07:18:54 - INFO : Pruning Attention Layer = [14, 15, 16, 17, 18, 19, 20, 21]
+2025-10-07 07:18:54 - INFO : Pruning MLP Layer = [14, 15, 16, 17, 18, 19, 20, 21]
+2025-10-07 07:18:54 - INFO : Start Pruning
+2025-10-07 07:19:45 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+2025-10-07 07:19:45 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+2025-10-07 07:19:56 - INFO : Start Backwarding in iterative steps = 0...
+2025-10-07 07:19:56 - INFO : Loss = 3.6055665016174316
+2025-10-07 07:19:57 - INFO : After Iter 1/1, #parameters: 2777647104
+2025-10-07 07:19:57 - INFO : #Param before: 3085938688, #Param after: 2777647104, Ratio = 90.0098%
+2025-10-07 07:20:57 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-07 07:20:58 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-07 07:20:58 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-07 07:20:58 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-07 07:20:58 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-07 07:20:58 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-07 07:21:03 - INFO : PPL after pruning: {'c4': 6.662430613808609, 'wikitext2': 11.784638661080912, 'ptb': 22.494734284035275}
+2025-10-07 07:21:03 - INFO : Memory Requirement: 7670.80419921875 MiB
+
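The Ratio field in these logs is simply remaining parameters over original parameters; recomputing it from the numbers printed above:

# Numbers copied from the log; prints 90.0098%, matching the reported ratio.
before, after = 3085938688, 2777647104
print(f"Ratio = {100 * after / before:.4f}%")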
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-26-27/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.2-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 10
+- block_attention_layer_end: 26
+- block_mlp_layer_start: 10
+- block_mlp_layer_end: 26
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-26-27/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.2-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 10 --block_mlp_layer_end 26 --block_attention_layer_start 10 --block_attention_layer_end 26 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-07-07-26-27/training.log
ADDED
@@ -0,0 +1,24 @@
+2025-10-07 07:26:30 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-07 07:26:32 - INFO : Use taylor pruner...
+2025-10-07 07:26:32 - INFO : Pruning Attention Layer = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]
+2025-10-07 07:26:32 - INFO : Pruning MLP Layer = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]
+2025-10-07 07:26:33 - INFO : Start Pruning
+2025-10-07 07:27:13 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+2025-10-07 07:27:13 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+2025-10-07 07:27:23 - INFO : Start Backwarding in iterative steps = 0...
+2025-10-07 07:27:23 - INFO : Loss = 3.6055665016174316
+2025-10-07 07:27:25 - INFO : After Iter 1/1, #parameters: 2469355520
+2025-10-07 07:27:25 - INFO : #Param before: 3085938688, #Param after: 2469355520, Ratio = 80.0196%
+2025-10-07 07:28:36 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-07 07:28:37 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-07 07:28:37 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-07 07:28:37 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-07 07:28:37 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-07 07:28:37 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-07 07:28:41 - INFO : PPL after pruning: {'c4': 8.791876103118177, 'wikitext2': 22.059646737841184, 'ptb': 41.2128458403547}
+2025-10-07 07:28:41 - INFO : Memory Requirement: 9438.86669921875 MiB
+
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-22-09-56-09/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.2-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 9
+- block_attention_layer_end: 27
+- block_mlp_layer_start: 9
+- block_mlp_layer_end: 27
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-22-09-56-09/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.2-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 9 --block_mlp_layer_end 27 --block_attention_layer_start 9 --block_attention_layer_end 27 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.2-c4book/2025-10-22-09-56-09/training.log
ADDED
@@ -0,0 +1,24 @@
+2025-10-22 09:56:11 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-22 09:56:14 - INFO : Use taylor pruner...
+2025-10-22 09:56:14 - INFO : Pruning Attention Layer = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
+2025-10-22 09:56:14 - INFO : Pruning MLP Layer = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
+2025-10-22 09:56:14 - INFO : Start Pruning
+2025-10-22 09:57:08 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+2025-10-22 09:57:08 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+2025-10-22 09:57:18 - INFO : Start Backwarding in iterative steps = 0...
+2025-10-22 09:57:19 - INFO : Loss = 3.6055665016174316
+2025-10-22 09:57:20 - INFO : After Iter 1/1, #parameters: 2392282624
+2025-10-22 09:57:20 - INFO : #Param before: 3085938688, #Param after: 2392282624, Ratio = 77.5220%
+2025-10-22 09:58:10 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-22 09:58:11 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-22 09:58:11 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-22 09:58:11 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-22 09:58:11 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-22 09:58:11 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-22 09:58:15 - INFO : PPL after pruning: {'c4': 9.827221607438576, 'wikitext2': 26.453556883236853, 'ptb': 47.90120329104557}
+2025-10-22 09:58:15 - INFO : Memory Requirement: 9880.88232421875 MiB
+
Qwen2.5-3B-Instruct-bl-0.2-c4book/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.2-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 9
+- block_attention_layer_end: 27
+- block_mlp_layer_start: 9
+- block_mlp_layer_end: 27
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.2-c4book/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bab2adb5206c3f6dd0895845a69a647bda2be5ed4ba1b68dbecf7b8a5b46a486
+size 4791757567
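The pytorch_model.bin entry above is a Git LFS pointer; the actual ~4.8 GB checkpoint lives in LFS storage. A sketch of fetching and loading it — the repo_id below is a placeholder, and note that structured-pruning scripts like this one may pickle the whole model object rather than a plain state dict, so weights_only loading may not apply:

import torch
from huggingface_hub import hf_hub_download

# repo_id is hypothetical; substitute the Hub repo this diff belongs to.
path = hf_hub_download(
    repo_id="your-username/pruned-qwen-checkpoints",
    filename="Qwen2.5-3B-Instruct-bl-0.2-c4book/pytorch_model.bin",
)
ckpt = torch.load(path, map_location="cpu", weights_only=False)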
Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-05-14/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.215-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 11
+- block_attention_layer_end: 25
+- block_mlp_layer_start: 11
+- block_mlp_layer_end: 25
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-05-14/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.215-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 11 --block_mlp_layer_end 25 --block_attention_layer_start 11 --block_attention_layer_end 25 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-05-14/training.log
ADDED
@@ -0,0 +1,24 @@
+2025-10-10 14:05:17 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-10 14:05:19 - INFO : Use taylor pruner...
+2025-10-10 14:05:19 - INFO : Pruning Attention Layer = [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
+2025-10-10 14:05:19 - INFO : Pruning MLP Layer = [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
+2025-10-10 14:05:19 - INFO : Start Pruning
+2025-10-10 14:06:15 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+2025-10-10 14:06:15 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+2025-10-10 14:06:25 - INFO : Start Backwarding in iterative steps = 0...
+2025-10-10 14:06:26 - INFO : Loss = 3.6055665016174316
+2025-10-10 14:06:27 - INFO : After Iter 1/1, #parameters: 2546428416
+2025-10-10 14:06:27 - INFO : #Param before: 3085938688, #Param after: 2546428416, Ratio = 82.5171%
+2025-10-10 14:07:38 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-10 14:07:38 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-10 14:07:38 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-10 14:07:38 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-10 14:07:39 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-10 14:07:39 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-10 14:07:43 - INFO : PPL after pruning: {'c4': 8.115290514356445, 'wikitext2': 18.075026786359576, 'ptb': 34.97688798216538}
+2025-10-10 14:07:43 - INFO : Memory Requirement: 8996.85107421875 MiB
+
Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-08-10/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.215-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 10
+- block_attention_layer_end: 25
+- block_mlp_layer_start: 10
+- block_mlp_layer_end: 25
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-08-10/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.215-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 10 --block_mlp_layer_end 25 --block_attention_layer_start 10 --block_attention_layer_end 25 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.215-c4book/2025-10-10-14-08-10/training.log
ADDED
@@ -0,0 +1,5 @@
+2025-10-10 14:08:13 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-10 14:08:16 - INFO : Use taylor pruner...
+2025-10-10 14:08:16 - INFO : Pruning Attention Layer = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
+2025-10-10 14:08:16 - INFO : Pruning MLP Layer = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
+2025-10-10 14:08:17 - INFO : Start Pruning
Qwen2.5-3B-Instruct-bl-0.215-c4book/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.215-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 10
+- block_attention_layer_end: 25
+- block_mlp_layer_start: 10
+- block_mlp_layer_end: 25
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.215-c4book/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0b198b559663b1573ff340663d4d3c23896bb3459f94b998bddac1d33b8f126
+size 5100049439
Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-31-38/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.225-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 9
+- block_attention_layer_end: 27
+- block_mlp_layer_start: 9
+- block_mlp_layer_end: 27
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-31-38/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.225-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 9 --block_mlp_layer_end 27 --block_attention_layer_start 9 --block_attention_layer_end 27 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-31-38/training.log
ADDED
@@ -0,0 +1,8 @@
+2025-10-10 17:31:41 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-10 17:31:43 - INFO : Use taylor pruner...
+2025-10-10 17:31:43 - INFO : Pruning Attention Layer = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
+2025-10-10 17:31:43 - INFO : Pruning MLP Layer = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
+2025-10-10 17:31:44 - INFO : Start Pruning
+2025-10-10 17:32:38 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+2025-10-10 17:32:38 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+2025-10-10 17:32:49 - INFO : Start Backwarding in iterative steps = 0...
Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-41-33/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.225-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 9
+- block_attention_layer_end: 27
+- block_mlp_layer_start: 9
+- block_mlp_layer_end: 27
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-41-33/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.225-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 9 --block_mlp_layer_end 27 --block_attention_layer_start 9 --block_attention_layer_end 27 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.225-c4book/2025-10-10-17-41-33/training.log
ADDED
@@ -0,0 +1,24 @@
+2025-10-10 17:41:36 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-10 17:41:38 - INFO : Use taylor pruner...
+2025-10-10 17:41:38 - INFO : Pruning Attention Layer = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
+2025-10-10 17:41:38 - INFO : Pruning MLP Layer = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
+2025-10-10 17:41:39 - INFO : Start Pruning
+2025-10-10 17:42:37 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+2025-10-10 17:42:37 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+2025-10-10 17:42:47 - INFO : Start Backwarding in iterative steps = 0...
+2025-10-10 17:42:47 - INFO : Loss = 3.6055665016174316
+2025-10-10 17:42:49 - INFO : After Iter 1/1, #parameters: 2392282624
+2025-10-10 17:42:49 - INFO : #Param before: 3085938688, #Param after: 2392282624, Ratio = 77.5220%
+2025-10-10 17:43:52 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-10 17:43:52 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-10 17:43:52 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-10 17:43:52 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-10 17:43:53 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-10 17:43:53 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-10 17:43:57 - INFO : PPL after pruning: {'c4': 9.827221607438576, 'wikitext2': 26.453556883236853, 'ptb': 47.90120329104557}
+2025-10-10 17:43:57 - INFO : Memory Requirement: 9880.88232421875 MiB
+
Qwen2.5-3B-Instruct-bl-0.225-c4book/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.225-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 9
+- block_attention_layer_end: 27
+- block_mlp_layer_start: 9
+- block_mlp_layer_end: 27
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.225-c4book/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ff600fc12f5785ca0ea11fb91a233cf7e43a2ec36b7b41098082e44345cfe26
+size 4791757567
Qwen2.5-3B-Instruct-bl-0.25-c4book/2025-10-10-07-39-18/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.25-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 8
+- block_attention_layer_end: 28
+- block_mlp_layer_start: 8
+- block_mlp_layer_end: 28
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.25-c4book/2025-10-10-07-39-18/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.25-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 8 --block_mlp_layer_end 28 --block_attention_layer_start 8 --block_attention_layer_end 28 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.25-c4book/2025-10-10-07-39-18/training.log
ADDED
@@ -0,0 +1,24 @@
+2025-10-10 07:39:21 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-10 07:39:23 - INFO : Use taylor pruner...
+2025-10-10 07:39:23 - INFO : Pruning Attention Layer = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]
+2025-10-10 07:39:23 - INFO : Pruning MLP Layer = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]
+2025-10-10 07:39:23 - INFO : Start Pruning
+2025-10-10 07:40:10 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+2025-10-10 07:40:10 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+2025-10-10 07:40:20 - INFO : Start Backwarding in iterative steps = 0...
+2025-10-10 07:40:21 - INFO : Loss = 3.6055665016174316
+2025-10-10 07:40:22 - INFO : After Iter 1/1, #parameters: 2315209728
+2025-10-10 07:40:22 - INFO : #Param before: 3085938688, #Param after: 2315209728, Ratio = 75.0245%
+2025-10-10 07:41:09 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-10 07:41:10 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-10 07:41:10 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-10 07:41:10 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-10 07:41:10 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-10 07:41:10 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-10 07:41:14 - INFO : PPL after pruning: {'c4': 10.963057921054194, 'wikitext2': 31.846818916590372, 'ptb': 54.06755996279899}
+2025-10-10 07:41:14 - INFO : Memory Requirement: 10322.89794921875 MiB
+
Qwen2.5-3B-Instruct-bl-0.25-c4book/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.25-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 8
+- block_attention_layer_end: 28
+- block_mlp_layer_start: 8
+- block_mlp_layer_end: 28
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.25-c4book/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44981d1747d90431ddbf55f4c8725d3719a777bf0ee3dcbcc51d58d859ee6bf6
+size 4637611631
Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/2025-06-16-04-57-53/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: qwen_checkpoints/Qwen2_5_3B_Instruct_bl_0_3
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 3
+- block_attention_layer_end: 6
+- block_mlp_layer_start: 3
+- block_mlp_layer_end: 6
+- iterative_steps: 5
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 30
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.6
Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/2025-06-16-04-57-53/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model qwen_checkpoints/Qwen2_5_3B_Instruct_bl_0_3 --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 3 --block_mlp_layer_end 6 --block_attention_layer_start 3 --block_attention_layer_end 6 --max_seq_len 2048 --num_examples 30
Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/2025-06-16-04-57-53/training.log
ADDED
@@ -0,0 +1,7 @@
+2025-06-16 04:57:53 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-06-16 04:57:55 - INFO : Use taylor pruner...
+2025-06-16 04:57:55 - INFO : Pruning Attention Layer = [3, 4, 5]
+2025-06-16 04:57:55 - INFO : Pruning MLP Layer = [3, 4, 5]
+2025-06-16 04:57:55 - INFO : Start Pruning
+2025-06-16 04:59:26 - INFO : Start Backwarding in iterative steps = 0...
+2025-06-16 04:59:26 - INFO : Loss = 4.31801176071167
Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: qwen_checkpoints/Qwen2_5_3B_Instruct_bl_0_3
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-bl-0.15-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 3
+- block_attention_layer_end: 6
+- block_mlp_layer_start: 3
+- block_mlp_layer_end: 6
+- iterative_steps: 5
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 30
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.6
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-40-52/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-c4book
+- pruning_ratio: 0.3
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 6
+- block_attention_layer_end: 30
+- block_mlp_layer_start: 6
+- block_mlp_layer_end: 30
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 30
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.6
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-40-52/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.3 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.3-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 6 --block_mlp_layer_end 30 --block_attention_layer_start 6 --block_attention_layer_end 30 --max_seq_len 2048 --num_examples 30
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-40-52/training.log
ADDED
@@ -0,0 +1,7 @@
+2025-06-15 07:40:53 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-06-15 07:40:55 - INFO : Use taylor pruner...
+2025-06-15 07:40:55 - INFO : Pruning Attention Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
+2025-06-15 07:40:55 - INFO : Pruning MLP Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
+2025-06-15 07:40:56 - INFO : Start Pruning
+2025-06-15 07:41:54 - INFO : Start Backwarding in iterative steps = 0...
+2025-06-15 07:41:55 - INFO : Loss = 3.5623340606689453
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-42-19/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 6
+- block_attention_layer_end: 30
+- block_mlp_layer_start: 6
+- block_mlp_layer_end: 30
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 30
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.6
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-42-19/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.3-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 6 --block_mlp_layer_end 30 --block_attention_layer_start 6 --block_attention_layer_end 30 --max_seq_len 2048 --num_examples 30
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-06-15-07-42-19/training.log
ADDED
@@ -0,0 +1,12 @@
+2025-06-15 07:42:20 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-06-15 07:42:22 - INFO : Use taylor pruner...
+2025-06-15 07:42:22 - INFO : Pruning Attention Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
+2025-06-15 07:42:22 - INFO : Pruning MLP Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
+2025-06-15 07:42:22 - INFO : Start Pruning
+2025-06-15 07:43:21 - INFO : Start Backwarding in iterative steps = 0...
+2025-06-15 07:43:21 - INFO : Loss = 3.5623340606689453
+2025-06-15 07:43:23 - INFO : After Iter 1/1, #parameters: 2161063936
+2025-06-15 07:43:23 - INFO : #Param before: 3085938688, #Param after: 2161063936, Ratio = 70.0294%
+2025-06-15 07:44:27 - INFO : PPL after pruning: {'c4': 19.429584428039178, 'wikitext2': 49.037147591523336, 'ptb': 71.48811770889668}
+2025-06-15 07:44:27 - INFO : Memory Requirement: 11301.94384765625 MiB
+
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-34-09/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 6
+- block_attention_layer_end: 32
+- block_mlp_layer_start: 6
+- block_mlp_layer_end: 32
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-34-09/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.3-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 6 --block_mlp_layer_end 32 --block_attention_layer_start 6 --block_attention_layer_end 32 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-34-09/training.log
ADDED
@@ -0,0 +1,5 @@
+2025-10-07 07:34:11 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-07 07:34:13 - INFO : Use taylor pruner...
+2025-10-07 07:34:13 - INFO : Pruning Attention Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
+2025-10-07 07:34:13 - INFO : Pruning MLP Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
+2025-10-07 07:34:14 - INFO : Start Pruning
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-38-22/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 6
+- block_attention_layer_end: 32
+- block_mlp_layer_start: 6
+- block_mlp_layer_end: 32
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-38-22/train.sh
ADDED
@@ -0,0 +1 @@
+python qwen.py --base_model Qwen/Qwen2.5-3B-Instruct --pruning_ratio 0.5 --save_ckpt_log_name Qwen2.5-3B-Instruct-bl-0.3-c4book --pruner_type taylor --taylor vectorize --save_model --block_wise --block_mlp_layer_start 6 --block_mlp_layer_end 32 --block_attention_layer_start 6 --block_attention_layer_end 32 --max_seq_len 2048 --num_examples 20
Qwen2.5-3B-Instruct-bl-0.3-c4book/2025-10-07-07-38-22/training.log
ADDED
@@ -0,0 +1,24 @@
+2025-10-07 07:38:26 - INFO : We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
+2025-10-07 07:38:28 - INFO : Use taylor pruner...
+2025-10-07 07:38:28 - INFO : Pruning Attention Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
+2025-10-07 07:38:28 - INFO : Pruning MLP Layer = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
+2025-10-07 07:38:28 - INFO : Start Pruning
+2025-10-07 07:39:11 - WARNING : Using the latest cached version of the dataset since bookcorpus couldn't be found on the Hugging Face Hub
+2025-10-07 07:39:11 - WARNING : Found the latest cached dataset configuration 'plain_text' at /home/kaixin/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f (last modified on Thu Mar 20 09:18:10 2025).
+2025-10-07 07:39:21 - INFO : Start Backwarding in iterative steps = 0...
+2025-10-07 07:39:22 - INFO : Loss = 3.6055665016174316
+2025-10-07 07:39:23 - INFO : After Iter 1/1, #parameters: 2083991040
+2025-10-07 07:39:23 - INFO : #Param before: 3085938688, #Param after: 2083991040, Ratio = 67.5318%
+2025-10-07 07:40:19 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-07 07:40:20 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-07 07:40:20 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-07 07:40:20 - ERROR : `trust_remote_code` is not supported anymore.
+Please check that the Hugging Face dataset 'ptb_text_only' isn't based on a loading script and remove `trust_remote_code`.
+If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
+2025-10-07 07:40:20 - WARNING : Using the latest cached version of the dataset since ptb_text_only couldn't be found on the Hugging Face Hub
+2025-10-07 07:40:20 - WARNING : Found the latest cached dataset configuration 'penn_treebank' at /home/kaixin/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f (last modified on Tue Mar 18 15:50:44 2025).
+2025-10-07 07:40:24 - INFO : PPL after pruning: {'c4': 27.83159468972004, 'wikitext2': 72.04880596519355, 'ptb': 104.4216991379013}
+2025-10-07 07:40:24 - INFO : Memory Requirement: 11648.94482421875 MiB
+
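Compared with the completed 2025-06-15 run above, this run widens the pruned range from layers 6-29 to 6-31 and uses 20 calibration examples instead of 30, so the two runs differ in more than the layer range; still, it ends with fewer parameters kept and noticeably worse perplexity. A small tabulation of the logged numbers (PPL rounded to two decimals):

    # Figures taken from the two completed training.log files above.
    runs = {
        "layers 6-29, 30 examples": dict(kept="70.0294%", c4=19.43, wikitext2=49.04, ptb=71.49),
        "layers 6-31, 20 examples": dict(kept="67.5318%", c4=27.83, wikitext2=72.05, ptb=104.42),
    }
    for name, r in runs.items():
        print(f"{name}: {r['kept']} params kept, "
              f"c4/wikitext2/ptb PPL = {r['c4']}/{r['wikitext2']}/{r['ptb']}")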
Qwen2.5-3B-Instruct-bl-0.3-c4book/description.txt
ADDED
@@ -0,0 +1,28 @@
+- Training Parameters:
+- base_model: Qwen/Qwen2.5-3B-Instruct
+- save_ckpt_log_name: Qwen2.5-3B-Instruct-bl-0.3-c4book
+- pruning_ratio: 0.5
+- pruner_type: taylor
+- temperature: 1.0
+- top_p: 0.95
+- max_seq_len: 2048
+- channel_wise: False
+- block_wise: True
+- layer_wise: False
+- layer: 12
+- block_attention_layer_start: 6
+- block_attention_layer_end: 32
+- block_mlp_layer_start: 6
+- block_mlp_layer_end: 32
+- iterative_steps: 1
+- grouping_strategy: sum
+- global_pruning: False
+- taylor: vectorize
+- num_examples: 20
+- device: cuda
+- test_before_train: False
+- eval_device: cuda
+- test_after_train: False
+- seed: 42
+- save_model: True
+- torch_version: 2.8
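Every run writes the same flat `- key: value` description, so these files are easy to load programmatically. A minimal sketch, assuming only the layout shown above (the helper name is ours, not part of the repo):

    import ast

    def parse_description(path):
        # Parse "- key: value" lines into a dict; literal_eval turns "0.5",
        # "True", "32" into Python values and falls back to the raw string
        # for names like "Qwen/Qwen2.5-3B-Instruct" or "taylor".
        params = {}
        with open(path) as f:
            for line in f:
                line = line.strip().lstrip("- ").strip()
                if not line or line.startswith("Training Parameters"):
                    continue
                key, _, value = line.partition(":")
                try:
                    params[key.strip()] = ast.literal_eval(value.strip())
                except (ValueError, SyntaxError):
                    params[key.strip()] = value.strip()
        return params

    params = parse_description("Qwen2.5-3B-Instruct-bl-0.3-c4book/description.txt")
    assert params["pruning_ratio"] == 0.5 and params["block_wise"] is True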
Qwen2.5-3B-Instruct-bl-0.3-c4book/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3bd01c38add15eb14321d0db1ce4b4da767a38acb8e763e0511ec4a3fbfb03ad
+size 4175173811
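This entry is a Git LFS pointer rather than the weights themselves: per the LFS spec it records only the sha256 oid and byte size of the real ~4.2 GB file. After downloading the actual pytorch_model.bin, it can be checked against the pointer. A minimal sketch:

    # Verify a downloaded LFS object against the pointer's sha256 oid and size.
    import hashlib, os

    path = "Qwen2.5-3B-Instruct-bl-0.3-c4book/pytorch_model.bin"
    expected_oid = "3bd01c38add15eb14321d0db1ce4b4da767a38acb8e763e0511ec4a3fbfb03ad"
    expected_size = 4175173811

    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)

    assert os.path.getsize(path) == expected_size, "size mismatch"
    assert h.hexdigest() == expected_oid, "sha256 mismatch"
    print("downloaded file matches the LFS pointer")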