# Launch the "prune" stage of src/compress.py through `accelerate launch`,
# using the `nbl_linearize` prune method on a Llama-3-8B checkpoint.
#
# The script path (src/compress.py) and the dataset directory
# (./src/llmtuner/data) are relative, so run this from the directory that
# contains `src/`. Results are written under ../results_prune/.

# --- Launcher settings -------------------------------------------------------
port="21305"      # forwarded to `accelerate launch --main_process_port`
GPUs="4,5,6,7"    # exported to the launched process as CUDA_VISIBLE_DEVICES

# --- Calibration data --------------------------------------------------------
dataset="c4_val"            # forwarded as --dataset
prune_data_type="pt"        # forwarded as --prune_data_type
n_calibration_samples=64    # forwarded as --n_calibration_samples
seq_len=2048                # forwarded as --cutoff_len

# --- Pruning configuration ---------------------------------------------------
prune_method="nbl_linearize"
num_layers_to_linearize=32  # forwarded as --num_layers_to_linearize

# --- Model selection (swap the commented pair in to use Llama-3.1-8B) --------
model_name="llama3-8b-base"
model_name_or_path="/workspace/1016_qif/Meta-Llama-3-8B"
# model_name=llama3.1-8b-base
# model_name_or_path=/workspace/1016_qif/Llama-3.1-8B

# --- Derived paths -----------------------------------------------------------
# The run folder is keyed on model, method and number of linearized layers.
folder_name="${model_name}-${prune_method}-linearize${num_layers_to_linearize}"
# The metric cache is keyed on model, method, dataset and sample count; it does
# not include num_layers_to_linearize.
nbl_metric_cache_file="../results_prune/cache/${model_name}-${prune_method}-${dataset}-${n_calibration_samples}samples.pt"
printf '%s\n' "${folder_name}"

output_dir="../results_prune/${folder_name}"
prune_model_save_path="${output_dir}/checkpoint"

# Every expansion is quoted so that values containing spaces or glob
# characters reach compress.py as single arguments.
CUDA_VISIBLE_DEVICES="${GPUs}" accelerate launch --main_process_port "${port}" \
  src/compress.py \
  --stage prune \
  --model_name_or_path "${model_name_or_path}" \
  --dataset "${dataset}" \
  --dataset_dir ./src/llmtuner/data \
  --split "train" \
  --prune_data_type "${prune_data_type}" \
  --cutoff_len "${seq_len}" \
  --output_dir "${output_dir}" \
  --logging_steps 10 \
  --bf16 \
  --n_calibration_samples "${n_calibration_samples}" \
  --prune_method "${prune_method}" \
  --num_layers_to_linearize "${num_layers_to_linearize}" \
  --nbl_metric_cache_file "${nbl_metric_cache_file}" \
  --prune_model_save_path "${prune_model_save_path}"

# After running this script, the pruned model with linearized layers will be
# saved in the specified `prune_model_save_path`.
# The model can be loaded as usual, and the specified attention layers will be
# replaced by the learned linear layers.