#!/usr/bin/bash
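# scripts/dropping/nbl_linearize.sh
# Prune a model with the "nbl_linearize" method: selected attention layers are
# replaced by learned linear layers (see the notes at the end of this script).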
port="21305"
GPUs="4,5,6,7"
dataset="c4_val"
prune_data_type="pt"
n_calibration_samples=64
seq_len=2048
prune_method="nbl_linearize"
num_layers_to_linearize=32
# Select the model to prune (uncomment the Llama-3.1 pair to switch models).
model_name=llama3-8b-base
model_name_or_path=/workspace/1016_qif/Meta-Llama-3-8B
# model_name=llama3.1-8b-base
# model_name_or_path=/workspace/1016_qif/Llama-3.1-8B
folder_name="${model_name}-${prune_method}-linearize${num_layers_to_linearize}"
nbl_metric_cache_file="../results_prune/cache/${model_name}-${prune_method}-${dataset}-${n_calibration_samples}samples.pt"
echo ${folder_name}
output_dir=../results_prune/${folder_name}
prune_model_save_path=${output_dir}/checkpoint
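
# Not in the original script: create the output and metric-cache directories up
# front. This is a defensive assumption; compress.py may well create them itself.
mkdir -p "${output_dir}" "$(dirname "${nbl_metric_cache_file}")"

# Launch the pruning run on the selected GPUs via Hugging Face accelerate.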
CUDA_VISIBLE_DEVICES=${GPUs} accelerate launch --main_process_port "${port}" \
    src/compress.py \
    --stage prune \
    --model_name_or_path "${model_name_or_path}" \
    --dataset "${dataset}" \
    --dataset_dir ./src/llmtuner/data \
    --split "train" \
    --prune_data_type "${prune_data_type}" \
    --cutoff_len "${seq_len}" \
    --output_dir "${output_dir}" \
    --logging_steps 10 \
    --bf16 \
    --n_calibration_samples "${n_calibration_samples}" \
    --prune_method "${prune_method}" \
    --num_layers_to_linearize "${num_layers_to_linearize}" \
    --nbl_metric_cache_file "${nbl_metric_cache_file}" \
    --prune_model_save_path "${prune_model_save_path}"
# After running this script, the pruned model with linearized layers will be saved
# in the specified `prune_model_save_path`.
# The model can be loaded as usual, and the specified attention layers will be
# replaced by the learned linear layers.
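
# Hypothetical follow-up, not part of the original script: a quick sanity check
# that the saved checkpoint loads. This assumes compress.py writes a standard
# Hugging Face checkpoint directory; if it saves custom module classes, a custom
# loader may be needed instead. Left commented out so the script's behavior is
# unchanged — uncomment to run it after pruning finishes.
#
# python - "${prune_model_save_path}" <<'EOF'
# import sys
# import torch
# from transformers import AutoModelForCausalLM
# model = AutoModelForCausalLM.from_pretrained(sys.argv[1], torch_dtype=torch.bfloat16)
# print(model)
# EOF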