upload script
Browse files
- script/loratune.sh +38 -0
- script/run_abprune.sh +171 -0
- script/run_abprune_inst.sh +197 -0
- script/run_abprune_small.sh +162 -0
- script/run_abprune_smoke.sh +134 -0
- script/run_eval_ppl.sh +53 -0
- script/run_eval_zeroshot.sh +91 -0
- script/run_laco_llama.sh +69 -0
- script/run_laco_qwen.sh +69 -0
- script/run_llmpruner_llama.sh +66 -0
- script/run_llmpruner_qwen.sh +61 -0
- script/run_llmpruner_whole.sh +62 -0
- script/run_llmstreamline_llama.sh +58 -0
- script/run_llmstreamline_qwen.sh +53 -0
- script/run_replaceme_llama.sh +57 -0
- script/run_replaceme_qwen.sh +26 -0
- script/run_uidl_llama.sh +12 -0
- script/run_uidl_qwen.sh +12 -0
script/loratune.sh
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# LoRA fine-tuning launcher: forwards a base model, an output directory and a
# set of environment-tunable defaults to src/loratune.py. Any arguments after
# the first two are appended verbatim to the Python command line.
set -euo pipefail

# Pin the job to GPU 2 unless the caller already chose a device.
: "${CUDA_VISIBLE_DEVICES:=2}"
export CUDA_VISIBLE_DEVICES

# Repository root = parent of the directory holding this script.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
ROOT="$(cd "$script_dir/.." && pwd)"

# Both positional arguments are mandatory.
if (( $# < 2 )); then
  cat <<'USAGE'
Usage:
script/loratune.sh <base_model> <output_dir> [extra lora args...]

Example:
script/loratune.sh /path/to/base_model /path/to/output_dir --epochs 2 --batch_size 32
USAGE
  exit 1
fi

BASE_MODEL="$1"
OUTPUT_DIR="$2"
shift 2

# Defaults below can be overridden through the upper-case environment
# variables; caller-supplied extra arguments come last on the command line.
lora_args=(
  --base_model "$BASE_MODEL"
  --output_dir "$OUTPUT_DIR"
  --device cuda
  --dtype "${DTYPE:-bfloat16}"
  --instruction_dataset "${INSTRUCTION_DATASET:-tatsu-lab/alpaca}"
  --instruction_split "${INSTRUCTION_SPLIT:-train}"
  --max_samples "${MAX_SAMPLES:-0}"
  --seq_len "${SEQ_LEN:-1024}"
  --batch_size "${BATCH_SIZE:-64}"
  --micro_batch_size "${MICRO_BATCH_SIZE:-8}"
  --epochs "${EPOCHS:-1.0}"
  --learning_rate "${LEARNING_RATE:-1e-4}"
  --log_steps "${LOG_STEPS:-100}"
  --lora_rank "${LORA_RANK:-8}"
)

python "$ROOT/src/loratune.py" "${lora_args[@]}" "$@"
|
script/run_abprune.sh
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Preamble of the run_abprune launcher: locates the repository root, validates
# the command line, and reads every tunable from an upper-case environment
# variable, falling back to the defaults listed here.
set -euo pipefail

# Use GPU 0 unless the caller already selected a device.
: "${CUDA_VISIBLE_DEVICES:=0}"
export CUDA_VISIBLE_DEVICES

# Repository root = parent of the directory holding this script.
repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# The model argument is mandatory.
if (( $# < 1 )); then
  cat <<'USAGE'
Usage:
script/run_abprune.sh <model> [output_dir] [extra fuse_layers args...]

Examples:
script/run_abprune.sh Qwen/Qwen3-1.7B
script/run_abprune.sh /path/to/model /path/to/output --num_progressive 8
USAGE
  exit 1
fi

# Author's note on models: all meta-llama/Llama-2-7b-hf, meta-llama/Llama-3.1-8B
model="$1"
shift

# --- general ---
dataset="${DATASET:-slimpajama}"
dataset_config="${DATASET_CONFIG:-none}"
num_progressive="${NUM_PROGRESSIVE:-16}"
dtype="${DTYPE:-bfloat16}"
batch_size="${BATCH_SIZE:-2}"
save_full_model_cycles="${SAVE_FULL_MODEL_CYCLES:-6,11}"
use_pertensor_fisher="${USE_PERTENSOR_FISHER:-0}"
comm_skip_post_reselect="${COMM_SKIP_POST_RESELECT:-1}"

# --- calibration / evaluation ---
seq_len="${SEQ_LEN:-1024}"
calib_sequences="${CALIB_SEQUENCES:-128}"
eval_batch_size="${EVAL_BATCH_SIZE:-1}"
eval_num_samples="${EVAL_NUM_SAMPLES:-200}"

# --- distillation ---
target_tokens="${TARGET_TOKENS:-500000}"
distill_batch_size="${DISTILL_BATCH_SIZE:-1}"
distill_seq_len="${DISTILL_SEQ_LEN:-1024}"
distill_epochs="${DISTILL_EPOCHS:-1.0}"
distill_kl_weight="${DISTILL_KL_WEIGHT:-0.02}"
distill_kl_temp="${DISTILL_KL_TEMP:-4.0}"
distill_hidden_mse_weight="${DISTILL_HIDDEN_MSE_WEIGHT:-1.0}"
distill_attn_mse_weight="${DISTILL_ATTN_MSE_WEIGHT:-0.25}"
distill_mlp_mse_weight="${DISTILL_MLP_MSE_WEIGHT:-1.0}"
lora_epochs="${LORA_EPOCHS:-0}"

# --- reparameterisation ---
reparam_eta="${REPARAM_ETA:-0}"
reparam_gamma="${REPARAM_GAMMA:-0}"
reparam_attn_reg_scale="${REPARAM_ATTN_REG_SCALE:-1.0}"
reparam_mlp_reg_scale="${REPARAM_MLP_REG_SCALE:-1.0}"
reparam_param_subset="${REPARAM_PARAM_SUBSET:-mlp}"

# --- head permutation: HEAD_PERMUTE is the shared default for both switches ---
head_permute="${HEAD_PERMUTE:-0}"
head_permute_select="${HEAD_PERMUTE_SELECT:-$head_permute}"
head_permute_merge="${HEAD_PERMUTE_MERGE:-$head_permute}"

# The Fisher mode and the default output-directory suffix are chosen together.
if [[ "$use_pertensor_fisher" == "1" ]]; then
  fisher_args=(--fisher_mode tensor)
  output_dir_suffix="progressive_common_${num_progressive}_nopost_only_last_pertensor"
else
  fisher_args=(--fisher_mode param)
  output_dir_suffix="progressive_common_${num_progressive}_nopost_only_last"
fi
|
| 63 |
+
|
| 64 |
+
# Turn a model id or path into a directory-name-safe slug:
#   * '/', ':' and '@' become '_';
#   * every other character outside '[:alnum:]_.-' becomes '_', and runs of '_'
#     are squeezed to one;
#   * leading and trailing '_' are stripped.
# printf is used instead of echo so no trailing newline is fed to tr (which
# would turn it into a trailing '_'). sed -E with a plain '+' replaces the
# previous '\\+' pattern, which inside single quotes matched a literal
# backslash and therefore never stripped anything.
# Arguments: $1 - model id or path
# Outputs:   the slug on stdout
slugify_model() {
  printf '%s' "$1" \
    | tr '/:@' '___' \
    | tr -cs '[:alnum:]_.-' '_' \
    | sed -E 's/^_+//; s/_+$//'
}
model_slug="$(slugify_model "$model")"
|
| 65 |
+
# ---------------------------------------------------------------------------
# Output directory. Precedence: positional argument (anything not starting
# with "--") > $OUTDIR > default under results/. $RUN_NAME, when set, is
# appended as a suffix.
# ---------------------------------------------------------------------------
output_dir_default="$repo_root/results/${model_slug}_${output_dir_suffix}"
output_dir="${OUTDIR:-$output_dir_default}"
if (( $# > 0 )) && [[ "$1" != --* ]]; then
  output_dir="$1"
  shift
fi
if [[ -n "${RUN_NAME:-}" ]]; then
  output_dir+="_${RUN_NAME}"
fi

# ---------------------------------------------------------------------------
# Command line for src/fuse_layers.py. Argument order is kept stable; the
# caller's extra arguments are appended last.
# ---------------------------------------------------------------------------
python_args=(
  --model "$model"
  --dataset "$dataset"
  --dataset_config "$dataset_config"
  --target_tokens "$target_tokens"
  --num_samples "$calib_sequences"
  --seq_len "$seq_len"
  --batch_size "$batch_size"
  --distill_batch_size "$distill_batch_size"
  --distill_seq_len "$distill_seq_len"
  --distill_epochs "$distill_epochs"
  --eval_batch_size "$eval_batch_size"
  --eval_seq_len "$seq_len"
  --eval_num_samples "$eval_num_samples"
  --distill_kl_weight "$distill_kl_weight"
  --distill_kl_temp "$distill_kl_temp"
  --distill_hidden_mse_weight "$distill_hidden_mse_weight"
  --distill_attn_mse_weight "$distill_attn_mse_weight"
  --distill_mlp_mse_weight "$distill_mlp_mse_weight"
  --reparam_eta "$reparam_eta"
  --reparam_gamma "$reparam_gamma"
  --reparam_attn_reg_scale "$reparam_attn_reg_scale"
  --reparam_mlp_reg_scale "$reparam_mlp_reg_scale"
  --reparam_param_subset "$reparam_param_subset"
  --distill_weight_decay 0.0
  --distill_max_grad_norm 1.0
  --distill_grad_accum_steps 1
  --distill_eval_every 2000
  --lora_eval_every 2000
  --lora_epochs "$lora_epochs"
  "${fisher_args[@]}"
)

# An empty SAVE_FULL_MODEL_CYCLES omits the flag entirely.
if [[ -n "$save_full_model_cycles" ]]; then
  python_args+=(--save_full_model_cycles "$save_full_model_cycles")
fi

python_args+=(
  --distill_method reparam
  --redistrib_teacher_source previous_cycle
  --comm_enabled
  --comm_mu_auto
  --layer auto
  --exclude_pairs 0,1,-1
  --num_progressive "$num_progressive"
  --output_dir "$output_dir"
  --dtype "$dtype"
)

# Boolean switches: "1" adds the skip flag, "0" adds the matching --no_* flag.
if [[ "$comm_skip_post_reselect" == "1" ]]; then
  python_args+=(--comm_skip_post_reselect)
fi
if [[ "$head_permute_select" == "0" ]]; then
  python_args+=(--no_head_permute_select)
fi
if [[ "$head_permute_merge" == "0" ]]; then
  python_args+=(--no_head_permute_merge)
fi
python_args+=("$@")

# ---------------------------------------------------------------------------
# Run record: run_args.txt gets the commit, start time, head-permute settings
# and the shell-quoted command; the EXIT trap appends the outcome.
# ---------------------------------------------------------------------------
mkdir -p "$output_dir"
run_args_file="$output_dir/run_args.txt"

git_commit="unknown"
if git -C "$repo_root" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  git_commit=$(git -C "$repo_root" rev-parse HEAD)
fi

start_epoch=$(date +%s)
start_time=$(date --iso-8601=seconds)
{
  printf '%s\n' \
    "git_commit=$git_commit" \
    "start_time=$start_time" \
    "HEAD_PERMUTE=$head_permute" \
    "HEAD_PERMUTE_SELECT=$head_permute_select" \
    "HEAD_PERMUTE_MERGE=$head_permute_merge" \
    "command:"
  printf '%q ' python "$repo_root/src/fuse_layers.py" "${python_args[@]}"
  printf '\n'
} > "$run_args_file"

# EXIT-trap handler: appends end time, wall-clock duration and the script's
# exit status to run_args.txt.
# Globals: start_epoch (read), run_args_file (appended to)
write_run_summary() {
  # Must be the first statement so $? is still the status being exited with.
  local status=$?
  local finished_epoch finished_at
  finished_epoch=$(date +%s)
  finished_at=$(date --iso-8601=seconds)
  printf '%s\n' \
    "end_time=$finished_at" \
    "elapsed_seconds=$((finished_epoch - start_epoch))" \
    "exit_code=$status" >> "$run_args_file"
}
trap write_run_summary EXIT

python "$repo_root/src/fuse_layers.py" "${python_args[@]}"
|
script/run_abprune_inst.sh
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Preamble of the run_abprune_inst launcher: locates the repository root,
# validates the command line, and reads every tunable (including the
# instruction-dataset options) from an upper-case environment variable,
# falling back to the defaults listed here.
set -euo pipefail

# Use GPU 0 unless the caller already selected a device.
: "${CUDA_VISIBLE_DEVICES:=0}"
export CUDA_VISIBLE_DEVICES

# Repository root = parent of the directory holding this script.
repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# The model argument is mandatory.
if (( $# < 1 )); then
  cat <<'USAGE'
Usage:
script/run_abprune_inst.sh <model> [output_dir] [extra fuse_layers args...]

Examples:
script/run_abprune_inst.sh Qwen/Qwen3-1.7B
script/run_abprune_inst.sh /path/to/model /path/to/output --num_progressive 8
USAGE
  exit 1
fi

# Author's note on models: all meta-llama/Llama-2-7b-hf, meta-llama/Llama-3.1-8B, facebook/opt-6.7b
model="$1"
shift

# --- general ---
dataset="${DATASET:-slimpajama}"
dataset_config="${DATASET_CONFIG:-none}"
num_progressive="${NUM_PROGRESSIVE:-16}"
dtype="${DTYPE:-bfloat16}"
batch_size="${BATCH_SIZE:-2}"
save_full_model_cycles="${SAVE_FULL_MODEL_CYCLES:-6,11}"
use_pertensor_fisher="${USE_PERTENSOR_FISHER:-0}"
comm_skip_post_reselect="${COMM_SKIP_POST_RESELECT:-1}"

# --- calibration / evaluation ---
seq_len="${SEQ_LEN:-1024}"
calib_sequences="${CALIB_SEQUENCES:-128}"
eval_batch_size="${EVAL_BATCH_SIZE:-1}"
eval_num_samples="${EVAL_NUM_SAMPLES:-200}"
calibration_source="${CALIBRATION_SOURCE:-lm}"

# --- instruction dataset (flags are only passed on when a dataset is named) ---
instruction_dataset="${INSTRUCTION_DATASET:-}"
instruction_config="${INSTRUCTION_CONFIG:-none}"
instruction_split="${INSTRUCTION_SPLIT:-train}"
instruction_format="${INSTRUCTION_FORMAT:-auto}"
instruction_field_instruction="${INSTRUCTION_FIELD_INSTRUCTION:-instruction}"
instruction_field_input="${INSTRUCTION_FIELD_INPUT:-input}"
instruction_field_output="${INSTRUCTION_FIELD_OUTPUT:-output}"

# --- distillation (source defaults to the calibration source) ---
distillation_source="${DISTILLATION_SOURCE:-$calibration_source}"
distill_inst_samples="${DISTILL_INST_SAMPLES:-500}"
target_tokens="${TARGET_TOKENS:-500000}"
distill_batch_size="${DISTILL_BATCH_SIZE:-1}"
distill_seq_len="${DISTILL_SEQ_LEN:-1024}"
distill_epochs="${DISTILL_EPOCHS:-1.0}"
distill_kl_weight="${DISTILL_KL_WEIGHT:-0.02}"
distill_kl_temp="${DISTILL_KL_TEMP:-4.0}"
distill_hidden_mse_weight="${DISTILL_HIDDEN_MSE_WEIGHT:-1.0}"
distill_attn_mse_weight="${DISTILL_ATTN_MSE_WEIGHT:-0.25}"
distill_mlp_mse_weight="${DISTILL_MLP_MSE_WEIGHT:-1.0}"
lora_epochs="${LORA_EPOCHS:-0}"

# --- reparameterisation ---
reparam_eta="${REPARAM_ETA:-0}"
reparam_gamma="${REPARAM_GAMMA:-0}"
reparam_attn_reg_scale="${REPARAM_ATTN_REG_SCALE:-1.0}"
reparam_mlp_reg_scale="${REPARAM_MLP_REG_SCALE:-1.0}"
reparam_param_subset="${REPARAM_PARAM_SUBSET:-mlp}"

# --- head permutation: HEAD_PERMUTE is the shared default for both switches ---
head_permute="${HEAD_PERMUTE:-1}"
head_permute_select="${HEAD_PERMUTE_SELECT:-$head_permute}"
head_permute_merge="${HEAD_PERMUTE_MERGE:-$head_permute}"

# The Fisher mode and the default output-directory suffix are chosen together.
if [[ "$use_pertensor_fisher" == "1" ]]; then
  fisher_args=(--fisher_mode tensor)
  output_dir_suffix="progressive_common_${num_progressive}_nopost_only_last_pertensor"
else
  fisher_args=(--fisher_mode param)
  output_dir_suffix="progressive_common_${num_progressive}_nopost_only_last"
fi
|
| 73 |
+
|
| 74 |
+
# Turn a model id or path into a directory-name-safe slug:
#   * '/', ':' and '@' become '_';
#   * every other character outside '[:alnum:]_.-' becomes '_', and runs of '_'
#     are squeezed to one;
#   * leading and trailing '_' are stripped.
# printf is used instead of echo so no trailing newline is fed to tr (which
# would turn it into a trailing '_'). sed -E with a plain '+' replaces the
# previous '\\+' pattern, which inside single quotes matched a literal
# backslash and therefore never stripped anything.
# Arguments: $1 - model id or path
# Outputs:   the slug on stdout
slugify_model() {
  printf '%s' "$1" \
    | tr '/:@' '___' \
    | tr -cs '[:alnum:]_.-' '_' \
    | sed -E 's/^_+//; s/_+$//'
}
model_slug="$(slugify_model "$model")"
|
| 75 |
+
# ---------------------------------------------------------------------------
# Output directory. Precedence: positional argument (anything not starting
# with "--") > $OUTDIR > default under results/. $RUN_NAME, when set, is
# appended as a suffix.
# ---------------------------------------------------------------------------
output_dir_default="$repo_root/results/${model_slug}_${output_dir_suffix}"
output_dir="${OUTDIR:-$output_dir_default}"
if (( $# > 0 )) && [[ "$1" != --* ]]; then
  output_dir="$1"
  shift
fi
if [[ -n "${RUN_NAME:-}" ]]; then
  output_dir+="_${RUN_NAME}"
fi

# ---------------------------------------------------------------------------
# Command line for src_inst/fuse_layers.py. Argument order is kept stable;
# the caller's extra arguments are appended last.
# ---------------------------------------------------------------------------
python_args=(
  --model "$model"
  --dataset "$dataset"
  --dataset_config "$dataset_config"
  --target_tokens "$target_tokens"
  --num_samples "$calib_sequences"
  --seq_len "$seq_len"
  --batch_size "$batch_size"
  --calibration_source "$calibration_source"
  --distillation_source "$distillation_source"
  --distill_batch_size "$distill_batch_size"
  --distill_inst_samples "$distill_inst_samples"
  --distill_seq_len "$distill_seq_len"
  --distill_epochs "$distill_epochs"
  --eval_batch_size "$eval_batch_size"
  --eval_seq_len "$seq_len"
  --eval_num_samples "$eval_num_samples"
  --distill_kl_weight "$distill_kl_weight"
  --distill_kl_temp "$distill_kl_temp"
  --distill_hidden_mse_weight "$distill_hidden_mse_weight"
  --distill_attn_mse_weight "$distill_attn_mse_weight"
  --distill_mlp_mse_weight "$distill_mlp_mse_weight"
  --reparam_eta "$reparam_eta"
  --reparam_gamma "$reparam_gamma"
  --reparam_attn_reg_scale "$reparam_attn_reg_scale"
  --reparam_mlp_reg_scale "$reparam_mlp_reg_scale"
  --reparam_param_subset "$reparam_param_subset"
  --distill_weight_decay 0.0
  --distill_max_grad_norm 1.0
  --distill_grad_accum_steps 1
  --distill_eval_every 2000
  --lora_eval_every 2000
  --lora_epochs "$lora_epochs"
)

# The instruction-dataset flags travel as a group and only when a dataset
# was named through INSTRUCTION_DATASET.
if [[ -n "$instruction_dataset" ]]; then
  python_args+=(
    --instruction_dataset "$instruction_dataset"
    --instruction_config "$instruction_config"
    --instruction_split "$instruction_split"
    --instruction_format "$instruction_format"
    --instruction_field_instruction "$instruction_field_instruction"
    --instruction_field_input "$instruction_field_input"
    --instruction_field_output "$instruction_field_output"
  )
fi

python_args+=("${fisher_args[@]}")

# An empty SAVE_FULL_MODEL_CYCLES omits the flag entirely.
if [[ -n "$save_full_model_cycles" ]]; then
  python_args+=(--save_full_model_cycles "$save_full_model_cycles")
fi

python_args+=(
  --distill_method reparam
  --redistrib_teacher_source previous_cycle
  --comm_enabled
  --comm_mu_auto
  --layer auto
  --exclude_pairs 0,1,-1
  --num_progressive "$num_progressive"
  --output_dir "$output_dir"
  --dtype "$dtype"
)

# Boolean switches: "1" adds the skip flag, "0" adds the matching --no_* flag.
if [[ "$comm_skip_post_reselect" == "1" ]]; then
  python_args+=(--comm_skip_post_reselect)
fi
if [[ "$head_permute_select" == "0" ]]; then
  python_args+=(--no_head_permute_select)
fi
if [[ "$head_permute_merge" == "0" ]]; then
  python_args+=(--no_head_permute_merge)
fi
python_args+=("$@")

# ---------------------------------------------------------------------------
# Run record: run_args.txt gets the commit, start time, head-permute settings
# and the shell-quoted command; the EXIT trap appends the outcome.
# ---------------------------------------------------------------------------
mkdir -p "$output_dir"
run_args_file="$output_dir/run_args.txt"

git_commit="unknown"
if git -C "$repo_root" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  git_commit=$(git -C "$repo_root" rev-parse HEAD)
fi

start_epoch=$(date +%s)
start_time=$(date --iso-8601=seconds)
{
  printf '%s\n' \
    "git_commit=$git_commit" \
    "start_time=$start_time" \
    "HEAD_PERMUTE=$head_permute" \
    "HEAD_PERMUTE_SELECT=$head_permute_select" \
    "HEAD_PERMUTE_MERGE=$head_permute_merge" \
    "command:"
  printf '%q ' python "$repo_root/src_inst/fuse_layers.py" "${python_args[@]}"
  printf '\n'
} > "$run_args_file"

# EXIT-trap handler: appends end time, wall-clock duration and the script's
# exit status to run_args.txt.
# Globals: start_epoch (read), run_args_file (appended to)
write_run_summary() {
  # Must be the first statement so $? is still the status being exited with.
  local status=$?
  local finished_epoch finished_at
  finished_epoch=$(date +%s)
  finished_at=$(date --iso-8601=seconds)
  printf '%s\n' \
    "end_time=$finished_at" \
    "elapsed_seconds=$((finished_epoch - start_epoch))" \
    "exit_code=$status" >> "$run_args_file"
}
trap write_run_summary EXIT

# src_inst and the repository root are put ahead of any existing PYTHONPATH
# for this one command only.
PYTHONPATH="$repo_root/src_inst:$repo_root${PYTHONPATH:+:$PYTHONPATH}" \
  python "$repo_root/src_inst/fuse_layers.py" "${python_args[@]}"
|
script/run_abprune_small.sh
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Preamble of the run_abprune_small launcher: locates the repository root,
# validates the command line, and reads every tunable from an upper-case
# environment variable, falling back to the defaults listed here.
set -euo pipefail

# Use GPU 3 unless the caller already selected a device.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-3}"

# Repository root = parent of the directory holding this script.
repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# The model argument is mandatory. The usage text names this script
# (it previously printed the name of run_abprune.sh, a copy-paste slip).
if [[ $# -lt 1 ]]; then
  cat <<'USAGE'
Usage:
script/run_abprune_small.sh <model> [output_dir] [extra fuse_layers args...]

Examples:
script/run_abprune_small.sh Qwen/Qwen3-1.7B
script/run_abprune_small.sh /path/to/model /path/to/output --num_progressive 8
USAGE
  exit 1
fi

model="$1"
shift

# whole settings
dataset="${DATASET:-slimpajama}"
dataset_config="${DATASET_CONFIG:-none}"
num_progressive="${NUM_PROGRESSIVE:-14}"
dtype="${DTYPE:-bfloat16}"
use_pertensor_fisher="${USE_PERTENSOR_FISHER:-0}"
save_full_model_cycles="${SAVE_FULL_MODEL_CYCLES:-6,11}"
# HEAD_PERMUTE is the shared default for the select/merge switches.
head_permute="${HEAD_PERMUTE:-1}"
head_permute_select="${HEAD_PERMUTE_SELECT:-$head_permute}"
head_permute_merge="${HEAD_PERMUTE_MERGE:-$head_permute}"

# calibration dataset
calib_sequences="${CALIB_SEQUENCES:-128}"
seq_len="${SEQ_LEN:-512}"

# distillation dataset
distill_seq_len="${DISTILL_SEQ_LEN:-512}"
target_tokens="${TARGET_TOKENS:-500000}"
distill_batch_size="${DISTILL_BATCH_SIZE:-1}"

# distillation evaluation
batch_size="${BATCH_SIZE:-1}"
eval_batch_size="${EVAL_BATCH_SIZE:-1}"
eval_num_samples="${EVAL_NUM_SAMPLES:-200}"
lora_epochs="${LORA_EPOCHS:-0}"
distill_epochs="${DISTILL_EPOCHS:-1.0}"
distill_kl_weight="${DISTILL_KL_WEIGHT:-0.01}"
distill_kl_temp="${DISTILL_KL_TEMP:-4.0}"

# Fisher mode: per-parameter by default, per-tensor when USE_PERTENSOR_FISHER=1.
fisher_args=(--fisher_mode param)
if [[ "$use_pertensor_fisher" == "1" ]]; then
  fisher_args=(--fisher_mode tensor)
fi

# Suffix of the default output directory; per-tensor runs get their own name.
output_dir_suffix="progressive_common_${num_progressive}_nopost_only_last"
if [[ "$use_pertensor_fisher" == "1" ]]; then
  output_dir_suffix="${output_dir_suffix}_pertensor"
fi
|
| 62 |
+
|
| 63 |
+
# Turn a model id or path into a directory-name-safe slug:
#   * '/', ':' and '@' become '_';
#   * every other character outside '[:alnum:]_.-' becomes '_', and runs of '_'
#     are squeezed to one;
#   * leading and trailing '_' are stripped.
# printf is used instead of echo so no trailing newline is fed to tr (which
# would turn it into a trailing '_'). sed -E with a plain '+' replaces the
# previous '\\+' pattern, which inside single quotes matched a literal
# backslash and therefore never stripped anything.
# Arguments: $1 - model id or path
# Outputs:   the slug on stdout
slugify_model() {
  printf '%s' "$1" \
    | tr '/:@' '___' \
    | tr -cs '[:alnum:]_.-' '_' \
    | sed -E 's/^_+//; s/_+$//'
}
model_slug="$(slugify_model "$model")"
|
| 64 |
+
# ---------------------------------------------------------------------------
# Output directory. Precedence: positional argument (anything not starting
# with "--") > $OUTDIR > default under results/. $RUN_NAME, when set, is
# appended as a suffix.
# ---------------------------------------------------------------------------
output_dir_default="$repo_root/results/${model_slug}_${output_dir_suffix}"
output_dir="${OUTDIR:-$output_dir_default}"
if (( $# > 0 )) && [[ "$1" != --* ]]; then
  output_dir="$1"
  shift
fi
if [[ -n "${RUN_NAME:-}" ]]; then
  output_dir+="_${RUN_NAME}"
fi

# ---------------------------------------------------------------------------
# Command line for src/fuse_layers.py. Argument order is kept stable; the
# caller's extra arguments are appended last.
# ---------------------------------------------------------------------------
python_args=(
  --model "$model"
  --dataset "$dataset"
  --dataset_config "$dataset_config"
  --target_tokens "$target_tokens"
  --num_samples "$calib_sequences"
  --seq_len "$seq_len"
  --batch_size "$batch_size"
  --distill_batch_size "$distill_batch_size"
  --distill_seq_len "$distill_seq_len"
  --distill_epochs "$distill_epochs"
  --eval_batch_size "$eval_batch_size"
  --eval_seq_len "$seq_len"
  --eval_num_samples "$eval_num_samples"
  --distill_kl_weight "$distill_kl_weight"
  --distill_kl_temp "$distill_kl_temp"
  --distill_weight_decay 0.0
  --distill_max_grad_norm 1.0
  --distill_grad_accum_steps 1
  --distill_eval_every 2000
  --lora_eval_every 2000
  --lora_epochs "$lora_epochs"
  --auto_metric dwce
  "${fisher_args[@]}"
)

# Left disabled by the original author: --auto_cosine_topk 5

# An empty SAVE_FULL_MODEL_CYCLES omits the flag entirely.
if [[ -n "$save_full_model_cycles" ]]; then
  python_args+=(--save_full_model_cycles "$save_full_model_cycles")
fi

python_args+=(
  --distill_method reparam
  --redistrib_teacher_source previous_cycle
  --comm_enabled
  --comm_mu_auto
  --layer auto
  --exclude_pairs 0,1,-1
  --num_progressive "$num_progressive"
  --output_dir "$output_dir"
  --dtype "$dtype"
)

# A value of "0" adds the matching --no_* flag.
if [[ "$head_permute_select" == "0" ]]; then
  python_args+=(--no_head_permute_select)
fi
if [[ "$head_permute_merge" == "0" ]]; then
  python_args+=(--no_head_permute_merge)
fi
python_args+=("$@")

# ---------------------------------------------------------------------------
# Run record: run_args.txt gets the commit, start time, head-permute settings
# and the shell-quoted command; the EXIT trap appends the outcome.
# ---------------------------------------------------------------------------
mkdir -p "$output_dir"
run_args_file="$output_dir/run_args.txt"

git_commit="unknown"
if git -C "$repo_root" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  git_commit=$(git -C "$repo_root" rev-parse HEAD)
fi

start_epoch=$(date +%s)
start_time=$(date --iso-8601=seconds)
{
  printf '%s\n' \
    "git_commit=$git_commit" \
    "start_time=$start_time" \
    "HEAD_PERMUTE=$head_permute" \
    "HEAD_PERMUTE_SELECT=$head_permute_select" \
    "HEAD_PERMUTE_MERGE=$head_permute_merge" \
    "command:"
  printf '%q ' python "$repo_root/src/fuse_layers.py" "${python_args[@]}"
  printf '\n'
} > "$run_args_file"

# EXIT-trap handler: appends end time, wall-clock duration and the script's
# exit status to run_args.txt.
# Globals: start_epoch (read), run_args_file (appended to)
write_run_summary() {
  # Must be the first statement so $? is still the status being exited with.
  local status=$?
  local finished_epoch finished_at
  finished_epoch=$(date +%s)
  finished_at=$(date --iso-8601=seconds)
  printf '%s\n' \
    "end_time=$finished_at" \
    "elapsed_seconds=$((finished_epoch - start_epoch))" \
    "exit_code=$status" >> "$run_args_file"
}
trap write_run_summary EXIT

python "$repo_root/src/fuse_layers.py" "${python_args[@]}"
|
script/run_abprune_smoke.sh
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Preamble of the run_abprune_smoke launcher: locates the repository root,
# validates the command line, and reads every tunable from an upper-case
# environment variable. The defaults here are small (short sequences, few
# samples, a fraction of an epoch).
set -euo pipefail

# Use GPU 1 unless the caller already selected a device.
: "${CUDA_VISIBLE_DEVICES:=1}"
export CUDA_VISIBLE_DEVICES

# Repository root = parent of the directory holding this script.
repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# The model argument is mandatory.
if (( $# < 1 )); then
  cat <<'USAGE'
Usage:
script/run_abprune_smoke.sh <model> [output_dir] [extra fuse_layers args...]

Examples:
script/run_abprune_smoke.sh Qwen/Qwen3-1.7B
script/run_abprune_smoke.sh /path/to/model /path/to/output --num_progressive 3
USAGE
  exit 1
fi

model="$1"
shift

# --- general ---
dataset="${DATASET:-slimpajama}"
dataset_config="${DATASET_CONFIG:-none}"
num_progressive="${NUM_PROGRESSIVE:-4}"
dtype="${DTYPE:-bfloat16}"
batch_size="${BATCH_SIZE:-1}"
save_full_model_cycles="${SAVE_FULL_MODEL_CYCLES:-3}"
use_pertensor_fisher="${USE_PERTENSOR_FISHER:-0}"

# --- calibration / evaluation ---
seq_len="${SEQ_LEN:-128}"
calib_sequences="${CALIB_SEQUENCES:-8}"
eval_batch_size="${EVAL_BATCH_SIZE:-1}"
eval_num_samples="${EVAL_NUM_SAMPLES:-8}"

# --- distillation ---
target_tokens="${TARGET_TOKENS:-8192}"
distill_batch_size="${DISTILL_BATCH_SIZE:-1}"
distill_seq_len="${DISTILL_SEQ_LEN:-128}"
distill_epochs="${DISTILL_EPOCHS:-0.1}"
distill_kl_weight="${DISTILL_KL_WEIGHT:-0.01}"
distill_kl_temp="${DISTILL_KL_TEMP:-4.0}"
lora_epochs="${LORA_EPOCHS:-0}"

# The Fisher mode and the default output-directory suffix are chosen together.
if [[ "$use_pertensor_fisher" == "1" ]]; then
  fisher_args=(--fisher_mode tensor)
  output_dir_suffix="progressive_common_smoke_pertensor"
else
  fisher_args=(--fisher_mode param)
  output_dir_suffix="progressive_common_smoke"
fi
|
| 51 |
+
|
| 52 |
+
# Turn a model id or path into a directory-name-safe slug:
#   * '/', ':' and '@' become '_';
#   * every other character outside '[:alnum:]_.-' becomes '_', and runs of '_'
#     are squeezed to one;
#   * leading and trailing '_' are stripped.
# printf is used instead of echo so no trailing newline is fed to tr (which
# would turn it into a trailing '_'). sed -E with a plain '+' replaces the
# previous '\\+' pattern, which inside single quotes matched a literal
# backslash and therefore never stripped anything.
# Arguments: $1 - model id or path
# Outputs:   the slug on stdout
slugify_model() {
  printf '%s' "$1" \
    | tr '/:@' '___' \
    | tr -cs '[:alnum:]_.-' '_' \
    | sed -E 's/^_+//; s/_+$//'
}
model_slug="$(slugify_model "$model")"
|
| 53 |
+
# ---------------------------------------------------------------------------
# Output directory. Precedence: positional argument (anything not starting
# with "--") > $OUTDIR > default under results/. $RUN_NAME, when set, is
# appended as a suffix.
# ---------------------------------------------------------------------------
output_dir_default="$repo_root/results/${model_slug}_${output_dir_suffix}"
output_dir="${OUTDIR:-$output_dir_default}"
if (( $# > 0 )) && [[ "$1" != --* ]]; then
  output_dir="$1"
  shift
fi
if [[ -n "${RUN_NAME:-}" ]]; then
  output_dir+="_${RUN_NAME}"
fi

# ---------------------------------------------------------------------------
# Command line for src/fuse_layers.py. Argument order is kept stable; the
# caller's extra arguments are appended last.
# ---------------------------------------------------------------------------
python_args=(
  --model "$model"
  --dataset "$dataset"
  --dataset_config "$dataset_config"
  --target_tokens "$target_tokens"
  --num_samples "$calib_sequences"
  --seq_len "$seq_len"
  --batch_size "$batch_size"
  --distill_batch_size "$distill_batch_size"
  --distill_seq_len "$distill_seq_len"
  --distill_epochs "$distill_epochs"
  --eval_batch_size "$eval_batch_size"
  --eval_seq_len "$seq_len"
  --eval_num_samples "$eval_num_samples"
  --distill_kl_weight "$distill_kl_weight"
  --distill_kl_temp "$distill_kl_temp"
  --distill_weight_decay 0.0
  --distill_max_grad_norm 1.0
  --distill_grad_accum_steps 1
  --distill_eval_every 0
  --lora_eval_every 0
  --lora_epochs "$lora_epochs"
  "${fisher_args[@]}"
)

# An empty SAVE_FULL_MODEL_CYCLES omits the flag entirely.
if [[ -n "$save_full_model_cycles" ]]; then
  python_args+=(--save_full_model_cycles "$save_full_model_cycles")
fi

python_args+=(
  --distill_method reparam
  --redistrib_teacher_source previous_cycle
  --comm_enabled
  --comm_mu_auto
  --layer auto
  --exclude_pairs -1
  --num_progressive "$num_progressive"
  --output_dir "$output_dir"
  --dtype "$dtype"
  "$@"
)

# ---------------------------------------------------------------------------
# Run record: run_args.txt gets the start time and the shell-quoted command;
# the EXIT trap appends the outcome.
# ---------------------------------------------------------------------------
mkdir -p "$output_dir"
run_args_file="$output_dir/run_args.txt"

start_epoch=$(date +%s)
start_time=$(date --iso-8601=seconds)
{
  printf '%s\n' "start_time=$start_time" "command:"
  printf '%q ' python "$repo_root/src/fuse_layers.py" "${python_args[@]}"
  printf '\n'
} > "$run_args_file"

# EXIT-trap handler: appends end time, wall-clock duration and the script's
# exit status to run_args.txt.
# Globals: start_epoch (read), run_args_file (appended to)
write_run_summary() {
  # Must be the first statement so $? is still the status being exited with.
  local status=$?
  local finished_epoch finished_at
  finished_epoch=$(date +%s)
  finished_at=$(date --iso-8601=seconds)
  printf '%s\n' \
    "end_time=$finished_at" \
    "elapsed_seconds=$((finished_epoch - start_epoch))" \
    "exit_code=$status" >> "$run_args_file"
}
trap write_run_summary EXIT

python "$repo_root/src/fuse_layers.py" "${python_args[@]}"
|
script/run_eval_ppl.sh
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Launcher for src/eval_ppl.py.
#
# Usage: script/run_eval_ppl.sh <model_path> [output_dir] [extra eval_ppl args...]
# Env:   CUDA_VISIBLE_DEVICES  GPU selection, honoured if already set (default: 0)
#
# The second positional argument is taken as the output directory only when it
# does not start with "--"; everything after it is forwarded to eval_ppl.py.
set -euo pipefail

# Keep a GPU selection made by the caller instead of overwriting it; the other
# launchers in script/ follow the same "${VAR:-default}" convention.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

if [[ $# -lt 1 ]]; then
  cat <<'USAGE'
Usage:
  script/run_eval_ppl.sh <model_path> [output_dir] [extra eval_ppl args...]

Examples:
  script/run_eval_ppl.sh /path/to/model
  script/run_eval_ppl.sh /path/to/model /tmp/ppl_out --dataset wikitext2 --max_seq_len 1024 --batch_size 8
USAGE
  exit 1
fi

MODEL_PATH="$1"
shift

OUTPUT_DIR=""
if [[ $# -gt 0 && "${1:0:2}" != "--" ]]; then
  OUTPUT_DIR="$1"
  shift
fi

CMD=(
  python "$ROOT_DIR/src/eval_ppl.py"
  --base_model "$MODEL_PATH"
)

if [[ -n "$OUTPUT_DIR" ]]; then
  CMD+=(--output_dir "$OUTPUT_DIR")
fi

# Default to WikiText-2 only unless the user explicitly selects a dataset,
# either as "--dataset NAME" or as "--dataset=NAME".
HAS_DATASET_FLAG=0
for arg in "$@"; do
  case "$arg" in
    --dataset | --dataset=*)
      HAS_DATASET_FLAG=1
      break
      ;;
  esac
done
if [[ "$HAS_DATASET_FLAG" -eq 0 ]]; then
  CMD+=(--dataset wikitext2)
fi

CMD+=("$@")

echo "Running: ${CMD[*]}"
exec "${CMD[@]}"
|
script/run_eval_zeroshot.sh
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Zero-shot evaluation launcher around the `lm_eval` CLI.
#
# Usage: script/run_eval_zeroshot.sh <model_path> <output_dir> [--mmlu] [extra lm_eval args...]
# Env:   CUDA_VISIBLE_DEVICES, TASKS, DEVICE, BATCH_SIZE, NUM_FEWSHOT, OUTPUT_FILE
#
# Writes <output_dir>/run_zeroshot_args.txt containing the git commit, the exact
# command, and (appended when the script exits) end time, duration and exit code.
set -euo pipefail

export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-2}"

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

if [[ $# -lt 2 ]]; then
  cat <<'USAGE'
Usage:
  script/run_eval_zeroshot.sh <model_path> <output_dir> [--mmlu] [extra lm_eval args...]

Examples:
  script/run_eval_zeroshot.sh /path/to/model /path/to/output
  script/run_eval_zeroshot.sh /path/to/model /path/to/output --mmlu
  script/run_eval_zeroshot.sh /path/to/model /path/to/output --tasks arc_easy,arc_challenge,hellaswag
USAGE
  exit 1
fi

MODEL_PATH="$1"
OUTPUT_DIR="$2"
shift 2

TASKS="${TASKS:-arc_easy,arc_challenge,hellaswag,piqa,winogrande,openbookqa,boolq}"
DEVICE="${DEVICE:-cuda}"
# BATCH_SIZE is now actually forwarded to lm_eval. The default is 32, the value
# that was previously always passed regardless of this variable.
BATCH_SIZE="${BATCH_SIZE:-32}"
NUM_FEWSHOT="${NUM_FEWSHOT:-0}"
OUTPUT_FILE="${OUTPUT_FILE:-zeroshot_results.json}"

# "--mmlu" is consumed here (it appends mmlu to TASKS); all other arguments are
# forwarded to lm_eval unchanged.
INCLUDE_MMLU=0
PASSTHROUGH_ARGS=()
for arg in "$@"; do
  if [[ "$arg" == "--mmlu" ]]; then
    INCLUDE_MMLU=1
    continue
  fi
  PASSTHROUGH_ARGS+=("$arg")
done
if [[ "$INCLUDE_MMLU" -eq 1 && ",$TASKS," != *",mmlu,"* ]]; then
  TASKS="${TASKS},mmlu"
fi

mkdir -p "$OUTPUT_DIR"
RUN_ARGS_FILE="$OUTPUT_DIR/run_zeroshot_args.txt"
RESOLVED_MODEL_PATH="$MODEL_PATH"

git_commit="unknown"
if git -C "$ROOT_DIR" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  git_commit=$(git -C "$ROOT_DIR" rev-parse HEAD)
fi
start_epoch=$(date +%s)
start_time=$(date --iso-8601=seconds)

LM_EVAL_CMD=(
  lm_eval
  --model hf
  --model_args "pretrained=$RESOLVED_MODEL_PATH"
  --tasks "$TASKS"
  --num_fewshot "$NUM_FEWSHOT"
  --device "$DEVICE"
  --batch_size "$BATCH_SIZE"
  --output_path "$OUTPUT_DIR/$OUTPUT_FILE"
)
# Guarded: expanding an empty array trips `set -u` on bash older than 4.4.
if [[ ${#PASSTHROUGH_ARGS[@]} -gt 0 ]]; then
  LM_EVAL_CMD+=("${PASSTHROUGH_ARGS[@]}")
fi

{
  echo "git_commit=$git_commit"
  echo "start_time=$start_time"
  echo "resolved_model_path=$RESOLVED_MODEL_PATH"
  echo "command:"
  printf '%q ' "${LM_EVAL_CMD[@]}"
  echo
} > "$RUN_ARGS_FILE"

# EXIT-trap handler: appends end time, elapsed seconds and exit status to
# "$RUN_ARGS_FILE".
write_run_summary() {
  # Capture $? first; any later command would overwrite it.
  local exit_code=$?
  local end_epoch end_time elapsed_seconds
  end_epoch=$(date +%s)
  end_time=$(date --iso-8601=seconds)
  elapsed_seconds=$((end_epoch - start_epoch))
  {
    echo "end_time=$end_time"
    echo "elapsed_seconds=$elapsed_seconds"
    echo "exit_code=$exit_code"
  } >> "$RUN_ARGS_FILE"
}
trap write_run_summary EXIT

echo "Running: ${LM_EVAL_CMD[*]}"
# Run as a child process rather than via `exec`: exec replaces this shell, so
# the EXIT trap above would never fire and no summary would be written. Under
# `set -e` the script still exits with lm_eval's status.
"${LM_EVAL_CMD[@]}"
|
script/run_laco_llama.sh
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Runs compare_model/LaCo/laco_llama.py with settings taken from environment
# variables; any command-line arguments are appended to the python call.
#
# Env: CUDA_VISIBLE_DEVICES, MODEL_PATH, TARGET_LAYERS, MERGE_LAYERS, INTERVAL,
#      LOWEST_LAYER, THRESHOLD, DTYPE, DEVICE, MAX_PROMPT_LENGTH, OUTDIR,
#      TRUST_REMOTE_CODE (1 = on), FORCE_TARGET (1 = on, default),
#      PROMPT_FILE, SAVE_LAYERS (whitespace-separated list)
set -euo pipefail

export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"

repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
laco_dir="$repo_root/compare_model/LaCo"

# target_layers is needed twice: as an argument and in the default output path.
target_layers="${TARGET_LAYERS:-16}"
# NOTE(review): python runs after `cd` into the LaCo directory, so a relative
# OUTDIR presumably resolves differently for run_args.txt and for python.
output_dir="${OUTDIR:-$repo_root/results/laco_llama_target_${target_layers}}"

python_args=(
  --model_path "${MODEL_PATH:-meta-llama/Llama-2-7b-hf}"
  --output_dir "$output_dir"
  --target_layers "$target_layers"
  --merge_layers "${MERGE_LAYERS:-2}"
  --interval "${INTERVAL:-1}"
  --lowest_layer "${LOWEST_LAYER:-0}"
  --threshold "${THRESHOLD:-0.45}"
  --dtype "${DTYPE:-bfloat16}"
  --device "${DEVICE:-cuda}"
  --max_prompt_length "${MAX_PROMPT_LENGTH:-128}"
)

# Optional switches.
case "${TRUST_REMOTE_CODE:-0}" in
  1) python_args+=(--trust_remote_code) ;;
esac

case "${FORCE_TARGET:-1}" in
  1) python_args+=(--force_target) ;;
  *) python_args+=(--no_force_target) ;;
esac

if [[ -n "${PROMPT_FILE:-}" ]]; then
  python_args+=(--prompt_file "$PROMPT_FILE")
fi

if [[ -n "${SAVE_LAYERS:-}" ]]; then
  # Deliberately unquoted so the list is split into separate arguments.
  # shellcheck disable=SC2206
  save_layers=(${SAVE_LAYERS})
  python_args+=(--save_layers "${save_layers[@]}")
fi

python_args+=("$@")

mkdir -p "$output_dir"

# Record the commit and the exact command next to the results.
git_commit="unknown"
if git -C "$repo_root" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  git_commit=$(git -C "$repo_root" rev-parse HEAD)
fi
{
  printf '%s\n' "git_commit=$git_commit" "command:"
  printf '%q ' python "$laco_dir/laco_llama.py" "${python_args[@]}"
  printf '\n'
} > "$output_dir/run_args.txt"

cd "$laco_dir"
export PYTHONPATH="$repo_root${PYTHONPATH:+:$PYTHONPATH}"
python laco_llama.py "${python_args[@]}"
|
script/run_laco_qwen.sh
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Runs compare_model/LaCo/laco_qwen.py with settings taken from environment
# variables; any command-line arguments are appended to the python call.
#
# Env: CUDA_VISIBLE_DEVICES, MODEL_PATH, TARGET_LAYERS, MERGE_LAYERS, INTERVAL,
#      LOWEST_LAYER, THRESHOLD, DTYPE, DEVICE, MAX_PROMPT_LENGTH, OUTDIR,
#      TRUST_REMOTE_CODE (1 = on, default), FORCE_TARGET (1 = on, default),
#      PROMPT_FILE, SAVE_LAYERS (whitespace-separated list)
set -euo pipefail

export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"

repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
laco_dir="$repo_root/compare_model/LaCo"

# target_layers is needed twice: as an argument and in the default output path.
target_layers="${TARGET_LAYERS:-14}"
# NOTE(review): python runs after `cd` into the LaCo directory, so a relative
# OUTDIR presumably resolves differently for run_args.txt and for python.
output_dir="${OUTDIR:-$repo_root/results/laco_qwen_target_${target_layers}}"

python_args=(
  --model_path "${MODEL_PATH:-Qwen/Qwen3-1.7B}"
  --output_dir "$output_dir"
  --target_layers "$target_layers"
  --merge_layers "${MERGE_LAYERS:-2}"
  --interval "${INTERVAL:-1}"
  --lowest_layer "${LOWEST_LAYER:-0}"
  --threshold "${THRESHOLD:-0.45}"
  --dtype "${DTYPE:-bfloat16}"
  --device "${DEVICE:-cuda}"
  --max_prompt_length "${MAX_PROMPT_LENGTH:-128}"
)

# Optional switches.
case "${TRUST_REMOTE_CODE:-1}" in
  1) python_args+=(--trust_remote_code) ;;
esac

case "${FORCE_TARGET:-1}" in
  1) python_args+=(--force_target) ;;
  *) python_args+=(--no_force_target) ;;
esac

if [[ -n "${PROMPT_FILE:-}" ]]; then
  python_args+=(--prompt_file "$PROMPT_FILE")
fi

if [[ -n "${SAVE_LAYERS:-}" ]]; then
  # Deliberately unquoted so the list is split into separate arguments.
  # shellcheck disable=SC2206
  save_layers=(${SAVE_LAYERS})
  python_args+=(--save_layers "${save_layers[@]}")
fi

python_args+=("$@")

mkdir -p "$output_dir"

# Record the commit and the exact command next to the results.
git_commit="unknown"
if git -C "$repo_root" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  git_commit=$(git -C "$repo_root" rev-parse HEAD)
fi
{
  printf '%s\n' "git_commit=$git_commit" "command:"
  printf '%q ' python "$laco_dir/laco_qwen.py" "${python_args[@]}"
  printf '\n'
} > "$output_dir/run_args.txt"

cd "$laco_dir"
export PYTHONPATH="$repo_root${PYTHONPATH:+:$PYTHONPATH}"
python laco_qwen.py "${python_args[@]}"
|
script/run_llmpruner_llama.sh
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Prune-only runner for LLM-Pruner on Llama-family checkpoints
# (e.g. meta-llama/Llama-2-7b-hf, meta-llama/Llama-3.1-8B).
#
# Settings come from environment variables; command-line arguments are appended
# to the python call. The command is logged to <output_dir>/run_args.txt.
set -euo pipefail

export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-2}"

repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
pruner_dir="$repo_root/compare_model/LLM-Pruner"

base_model="${BASE_MODEL:-meta-llama/Llama-2-7b-hf}"
prune_ckpt_path="${PRUNE_CKPT_PATH:-llama2_7b_prune}"

# Checkpoints whose name contains "Llama-3" / "llama-3" use llama3.py, which
# takes a differently named skip-evaluation flag; everything else uses
# hf_prune.py. PRUNE_SCRIPT overrides the script only, not the flag.
case "$base_model" in
  *Llama-3* | *llama-3*)
    default_script="llama3.py"
    skip_eval_flag="--skip_eval_after_prune"
    ;;
  *)
    default_script="hf_prune.py"
    skip_eval_flag="--skip_post_eval"
    ;;
esac
script_name="${PRUNE_SCRIPT:-$default_script}"

output_dir="${OUTDIR:-$pruner_dir/prune_log/$prune_ckpt_path}"

python_args=(
  --base_model "$base_model"
  --pruning_ratio "${PRUNING_RATIO:-0.25}"
  --block_wise
  --block_mlp_layer_start "${BLOCK_MLP_LAYER_START:-4}"
  --block_mlp_layer_end "${BLOCK_MLP_LAYER_END:-30}"
  --block_attention_layer_start "${BLOCK_ATTENTION_LAYER_START:-4}"
  --block_attention_layer_end "${BLOCK_ATTENTION_LAYER_END:-30}"
  --pruner_type "${PRUNER_TYPE:-taylor}"
  --taylor "${TAYLOR_MODE:-param_first}"
  --device "${DEVICE:-cpu}"
  --eval_device "${EVAL_DEVICE:-cuda}"
  --save_ckpt_log_name "$prune_ckpt_path"
  --save_model
  "$skip_eval_flag"
  "$@"
)

mkdir -p "$output_dir"

# Record the commit and the exact command for reproducibility.
git_commit="unknown"
if git -C "$repo_root" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  git_commit=$(git -C "$repo_root" rev-parse HEAD)
fi
{
  printf '%s\n' "git_commit=$git_commit" "command:"
  printf '%q ' python "$pruner_dir/$script_name" "${python_args[@]}"
  printf '\n'
} > "$output_dir/run_args.txt"

cd "$pruner_dir"
export PYTHONPATH="$pruner_dir:$repo_root${PYTHONPATH:+:$PYTHONPATH}"
python "$script_name" "${python_args[@]}"
|
script/run_llmpruner_qwen.sh
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Runner for LLM-Pruner on Qwen-family checkpoints.
#
# Settings come from environment variables; command-line arguments are appended
# to the python call. Unlike the Llama runner, this one passes
# --test_after_train and no skip-evaluation flag.
# The command is logged to <output_dir>/run_args.txt.
set -euo pipefail

export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"

repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
pruner_dir="$repo_root/compare_model/LLM-Pruner"

prune_ckpt_path="${PRUNE_CKPT_PATH:-qwen3_1_7b_prune}"
script_name="${PRUNE_SCRIPT:-llama3.py}"
output_dir="${OUTDIR:-$pruner_dir/prune_log/$prune_ckpt_path}"

python_args=(
  --base_model "${BASE_MODEL:-Qwen/Qwen3-1.7B}"
  --pruning_ratio "${PRUNING_RATIO:-0.25}"
  --block_wise
  --block_attention_roots "${BLOCK_ATTENTION_ROOTS:-q_proj,k_proj}"
  --block_mlp_roots "${BLOCK_MLP_ROOTS:-gate_proj,up_proj}"
  --block_mlp_layer_start "${BLOCK_MLP_LAYER_START:-4}"
  --block_mlp_layer_end "${BLOCK_MLP_LAYER_END:-24}"
  --block_attention_layer_start "${BLOCK_ATTENTION_LAYER_START:-4}"
  --block_attention_layer_end "${BLOCK_ATTENTION_LAYER_END:-24}"
  --pruner_type "${PRUNER_TYPE:-taylor}"
  --taylor "${TAYLOR_MODE:-param_first}"
  --test_after_train
  --device "${DEVICE:-cuda}"
  --eval_device "${EVAL_DEVICE:-cuda}"
  --save_ckpt_log_name "$prune_ckpt_path"
  --save_model
  "$@"
)

mkdir -p "$output_dir"

# Record the commit and the exact command for reproducibility.
git_commit="unknown"
if git -C "$repo_root" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  git_commit=$(git -C "$repo_root" rev-parse HEAD)
fi
{
  printf '%s\n' "git_commit=$git_commit" "command:"
  printf '%q ' python "$pruner_dir/$script_name" "${python_args[@]}"
  printf '\n'
} > "$output_dir/run_args.txt"

cd "$pruner_dir"
export PYTHONPATH="$pruner_dir:$repo_root${PYTHONPATH:+:$PYTHONPATH}"
python "$script_name" "${python_args[@]}"
|
script/run_llmpruner_whole.sh
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Runs script/run_llmpruner_llama.sh once per pruning ratio, producing one
# checkpoint per case named "<model_tag>_<label>".
#
# Env: BASE_MODEL, DEVICE, EVAL_DEVICE, NUM_EXAMPLES
set -euo pipefail

repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
runner="$repo_root/script/run_llmpruner_llama.sh"

base_model="${BASE_MODEL:-meta-llama/Llama-3.1-8B}"
device="${DEVICE:-cpu}"
eval_device="${EVAL_DEVICE:-cuda}"
num_examples="${NUM_EXAMPLES:-10}"
# Lower-case the model's basename and map every other character to "_".
# "\n" is excluded from the mapped set: otherwise the newline printed by
# basename is turned into "_" as well, leaving a trailing underscore on the tag
# and a double underscore in every checkpoint name.
model_tag="$(basename "$base_model" | tr '[:upper:]' '[:lower:]' | tr -c 'a-z0-9\n' '_')"

# Runs one pruning case through the Llama runner.
# Arguments: $1 - label used in the checkpoint name
#            $2 - pruning ratio
#            remaining arguments are forwarded to the runner
run_case() {
  local label="$1"
  local ratio="$2"
  shift 2

  echo "[LLM-Pruner] ${label}: PRUNING_RATIO=${ratio}"
  BASE_MODEL="$base_model" \
  PRUNE_CKPT_PATH="${model_tag}_${label}" \
  PRUNING_RATIO="$ratio" \
  DEVICE="$device" \
  EVAL_DEVICE="$eval_device" \
    bash "$runner" --num_examples "$num_examples" "$@"
}

# Equivalent block-only pruning scales for dropping layers from a 32-layer
# model. The active ratios are the "llama3 8b" set listed below, matching the
# default BASE_MODEL; switch to the "llama2 7b" set when pruning Llama-2 7B.
run_case "drop6eq" "0.23"
run_case "drop11eq" "0.45"
run_case "drop16eq" "0.70" # Ratio = 54.3965%

# for llama2 7b
# run_case "drop6eq" "0.23"
# run_case "drop11eq" "0.42"
# run_case "drop16eq" "0.62"

# for llama3 8b
# run_case "drop6eq" "0.23"
# run_case "drop11eq" "0.45"
# run_case "drop16eq" "0.70"

# "0.23" 86.1997%
# "0.48" 68.1928%
# "0.51" # 68.1928%
# "0.70" # 56.6762%

# llama 7b depth
# 18.02%
# 33.04%
# 48.05%

# llama 8b depth
# 16.30%, 86.1997%
# 29.88%, 72.2934%
# 43.46%, 56.6762%
|
script/run_llmstreamline_llama.sh
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Runs compare_model/LLM-Streamline/mseloss_entry.py on a Llama checkpoint
# (e.g. meta-llama/Llama-3.2-3B, meta-llama/Llama-2-7b-hf).
#
# Settings come from environment variables; command-line arguments are appended
# to the python call. The command is logged to <output_dir>/run_args.txt.
set -euo pipefail

export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"

repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
workdir="$repo_root/compare_model/LLM-Streamline"

pruned_blocks="${COMMON_PRUNED_BLOCKS:-14}"
layer_intervals="${LAYER_INTERVALS:-$pruned_blocks}"
model_name="${MODEL_NAME:-meta-llama/Llama-3.2-3B}"
# The default output directory is labelled after the model actually being run.
# It used to be hard-coded to "llama2_7b_…" even though the default model is
# Llama-3.2-3B. "\n" is kept out of the mapped set so basename's trailing
# newline does not become an extra "_". Set OUTDIR to choose the path yourself.
model_tag="$(basename "$model_name" | tr '[:upper:]' '[:lower:]' | tr -c 'a-z0-9\n' '_')"
output_dir="${OUTDIR:-$repo_root/results/${model_tag}_streamline_mse_common_${layer_intervals}}"
cosine_num_data="${COSINE_NUM_DATA:-300}"
train_num_data="${TRAIN_NUM_DATA:-5000}"
epochs="${EPOCHS:-15}"
batch_size="${BATCH_SIZE:-8}"
train_batch_size="${TRAIN_BATCH_SIZE:-$batch_size}"
grad_accum="${GRAD_ACCUM:-16}"
lr="${LR:-1e-5}"
# NOTE(review): the default min_lr (5e-5) is larger than the default lr (1e-5);
# confirm this is intended by the training entry point.
min_lr="${MIN_LR:-5e-5}"
wd="${WD:-1e-3}"
dtype="${DTYPE:-bfloat16}"

python_args=(
  --model_name "$model_name"
  --output_dir "$output_dir"
  --layer_intervals "$layer_intervals"
  --cosine_num_data "$cosine_num_data"
  --train_num_data "$train_num_data"
  --epoches "$epochs"
  --batch_size "$batch_size"
  --train_batch_size "$train_batch_size"
  --dtype "$dtype"
  --gradient_accumulation_step "$grad_accum"
  --lr "$lr"
  --min_lr "$min_lr"
  --wd "$wd"
)
python_args+=("$@")

mkdir -p "$output_dir"
git_commit="unknown"
if git -C "$repo_root" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  git_commit=$(git -C "$repo_root" rev-parse HEAD)
fi
{
  echo "git_commit=$git_commit"
  echo "command:"
  printf '%q ' python "$repo_root/compare_model/LLM-Streamline/mseloss_entry.py" "${python_args[@]}"
  echo
} > "$output_dir/run_args.txt"

cd "$workdir"
PYTHONPATH="$workdir:$repo_root${PYTHONPATH:+:$PYTHONPATH}" \
  python mseloss_entry.py "${python_args[@]}"
|
script/run_llmstreamline_qwen.sh
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Runs compare_model/LLM-Streamline/qwen_mseloss_entry.py on a Qwen checkpoint.
#
# Settings come from environment variables; command-line arguments are appended
# to the python call. The command is logged to <output_dir>/run_args.txt.
set -euo pipefail

export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-3}"

repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
streamline_dir="$repo_root/compare_model/LLM-Streamline"

# LAYER_INTERVALS falls back to COMMON_PRUNED_BLOCKS, which falls back to 14.
layer_intervals="${LAYER_INTERVALS:-${COMMON_PRUNED_BLOCKS:-14}}"
output_dir="${OUTDIR:-$repo_root/results/qwen3_1_7b_streamline_mse_common_${layer_intervals}}"
# TRAIN_BATCH_SIZE defaults to whatever BATCH_SIZE resolves to.
batch_size="${BATCH_SIZE:-8}"

python_args=(
  --model_name "${MODEL_NAME:-Qwen/Qwen3-1.7B}"
  --output_dir "$output_dir"
  --layer_intervals "$layer_intervals"
  --cosine_num_data "${COSINE_NUM_DATA:-300}"
  --train_num_data "${TRAIN_NUM_DATA:-5000}"
  --epoches "${EPOCHS:-15}"
  --batch_size "$batch_size"
  --train_batch_size "${TRAIN_BATCH_SIZE:-$batch_size}"
  --gradient_accumulation_step "${GRAD_ACCUM:-16}"
  --lr "${LR:-1e-5}"
  --min_lr "${MIN_LR:-5e-5}"
  --wd "${WD:-1e-3}"
  "$@"
)

mkdir -p "$output_dir"

# Record the commit and the exact command next to the results.
git_commit="unknown"
if git -C "$repo_root" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  git_commit=$(git -C "$repo_root" rev-parse HEAD)
fi
{
  printf '%s\n' "git_commit=$git_commit" "command:"
  printf '%q ' python "$streamline_dir/qwen_mseloss_entry.py" "${python_args[@]}"
  printf '\n'
} > "$output_dir/run_args.txt"

cd "$streamline_dir"
export PYTHONPATH="$streamline_dir:$repo_root${PYTHONPATH:+:$PYTHONPATH}"
python qwen_mseloss_entry.py "${python_args[@]}"
|
script/run_replaceme_llama.sh
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Runs compare_model/ReplaceMe/run_replaceme.py with a YAML config.
#
# Usage: script/run_replaceme_llama.sh [--target_tokens N] [extra args...]
# Env:   CUDA_VISIBLE_DEVICES, CONFIG_PATH, COMMON_TARGET_TOKENS_OVERRIDE,
#        OUTDIR (where run_args.txt is written)
#
# --target_tokens is consumed here and handed to python through the
# COMMON_TARGET_TOKENS_OVERRIDE environment variable; all other arguments are
# forwarded to run_replaceme.py.
set -euo pipefail

export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-1}"

repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
workdir="$repo_root/compare_model/ReplaceMe"
default_config="$workdir/examples/llama2_7b_replaceme_lstsq_skip16_common.yaml"
config_path="${CONFIG_PATH:-$default_config}"
# Log directory; overridable like in the sibling launchers, same default as before.
output_dir="${OUTDIR:-$repo_root/results/llama_7b_replaceme_common_16}"
target_tokens="${COMMON_TARGET_TOKENS_OVERRIDE:-}"
passthrough_args=()

while [[ $# -gt 0 ]]; do
  case "$1" in
    --target_tokens)
      if [[ $# -lt 2 ]]; then
        echo "error: --target_tokens requires a value" >&2
        exit 1
      fi
      target_tokens="$2"
      shift 2
      ;;
    --target_tokens=*)
      target_tokens="${1#*=}"
      shift
      ;;
    *)
      passthrough_args+=("$1")
      shift
      ;;
  esac
done

mkdir -p "$output_dir"
git_commit="unknown"
if git -C "$repo_root" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  git_commit=$(git -C "$repo_root" rev-parse HEAD)
fi
{
  echo "git_commit=$git_commit"
  echo "config_path=$config_path"
  # NOTE(review): 4500000 is logged when no override is given; presumably the
  # config's own default — confirm against the YAML.
  echo "COMMON_TARGET_TOKENS_OVERRIDE=${target_tokens:-4500000}"
  echo "command:"
  # Log the override as an environment prefix so the recorded line can be
  # pasted back into a shell. It was previously emitted as an escaped "# …"
  # token in the middle of the argument list.
  if [[ -n "$target_tokens" ]]; then
    printf 'COMMON_TARGET_TOKENS_OVERRIDE=%q ' "$target_tokens"
  fi
  printf '%q ' python "$repo_root/compare_model/ReplaceMe/run_replaceme.py" --config "$config_path"
  if [[ ${#passthrough_args[@]} -gt 0 ]]; then
    printf '%q ' "${passthrough_args[@]}"
  fi
  echo
} > "$output_dir/run_args.txt"

python_cmd=(python run_replaceme.py --config "$config_path")
# Guarded: expanding an empty array trips `set -u` on bash older than 4.4.
if [[ ${#passthrough_args[@]} -gt 0 ]]; then
  python_cmd+=("${passthrough_args[@]}")
fi

cd "$workdir"
COMMON_TARGET_TOKENS_OVERRIDE="${target_tokens:-${COMMON_TARGET_TOKENS_OVERRIDE:-}}" \
PYTHONPATH="$workdir:$repo_root${PYTHONPATH:+:$PYTHONPATH}" \
  "${python_cmd[@]}"
|
script/run_replaceme_qwen.sh
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Runs compare_model/ReplaceMe/run_replaceme_qwen.py with a YAML config.
#
# Usage: script/run_replaceme_qwen.sh [extra args...]
# Env:   CUDA_VISIBLE_DEVICES, CONFIG_PATH, OUTDIR (where run_args.txt is written)
set -euo pipefail

export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-3}"

repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
workdir="$repo_root/compare_model/ReplaceMe"
default_config="$workdir/examples/qwen3_1.7b_replaceme_lstsq_skip14_common.yaml"
config_path="${CONFIG_PATH:-$default_config}"
# Log directory, overridable via OUTDIR like in the sibling launchers.
# NOTE(review): the default is named "llama_7b_…" although this script runs the
# Qwen config. It is kept unchanged for compatibility — confirm where the YAML
# writes its results and rename if they differ.
output_dir="${OUTDIR:-$repo_root/results/llama_7b_replaceme_common_14}"

mkdir -p "$output_dir"
git_commit="unknown"
if git -C "$repo_root" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  git_commit=$(git -C "$repo_root" rev-parse HEAD)
fi
{
  echo "git_commit=$git_commit"
  echo "config_path=$config_path"
  echo "command:"
  printf '%q ' python "$repo_root/compare_model/ReplaceMe/run_replaceme_qwen.py" --config "$config_path"
  echo
} > "$output_dir/run_args.txt"

cd "$workdir"
PYTHONPATH="$workdir:$repo_root${PYTHONPATH:+:$PYTHONPATH}" \
  python run_replaceme_qwen.py --config "$config_path" "$@"
|
script/run_uidl_llama.sh
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Wrapper that fills in Llama defaults and then calls
# compare_model/UIDL/run_uidl_prune.sh with the given arguments.
# Variables already set (non-empty) in the environment are left untouched.
set -euo pipefail

repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# Assign defaults only where the variable is unset or empty, then export all.
: "${CUDA_VISIBLE_DEVICES:=0}"
: "${MODEL:=meta-llama/Llama-2-7b-hf}"
: "${OUTPUT_ROOT:=$repo_root/results/uidl_llama}"
: "${SIM_DATASET:=slimpajama}"
: "${SIM_DATASET_CONFIG:=none}"
export CUDA_VISIBLE_DEVICES MODEL OUTPUT_ROOT SIM_DATASET SIM_DATASET_CONFIG

"$repo_root/compare_model/UIDL/run_uidl_prune.sh" "$@"
|
script/run_uidl_qwen.sh
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Wrapper that fills in Qwen defaults and then calls
# compare_model/UIDL/run_uidl_prune.sh with the given arguments.
# Variables already set (non-empty) in the environment are left untouched.
set -euo pipefail

repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# Assign defaults only where the variable is unset or empty, then export all.
: "${CUDA_VISIBLE_DEVICES:=0}"
: "${MODEL:=Qwen/Qwen3-1.7B}"
: "${OUTPUT_ROOT:=$repo_root/results/uidl_qwen}"
: "${SIM_DATASET:=slimpajama}"
: "${SIM_DATASET_CONFIG:=none}"
export CUDA_VISIBLE_DEVICES MODEL OUTPUT_ROOT SIM_DATASET SIM_DATASET_CONFIG

"$repo_root/compare_model/UIDL/run_uidl_prune.sh" "$@"
|