GLM-4.7-REAP-50 / scripts /run_reap.py
0xSero's picture
Upload folder using huggingface_hub
8a26b34 verified
#!/usr/bin/env python3
"""
REAP (Router-Experts Activation Pruning) for GLM-4.7 MoE
This script prunes MoE experts from GLM-4.7 using the REAP methodology from Cerebras.
Requires: https://github.com/Cerebras/reap (or fork with GLM support)
Usage:
python run_reap.py --compression-ratio 0.50 --model-path /path/to/GLM-4.7
For observation reuse (instant pruning at different ratios):
python run_reap.py --compression-ratio 0.35 --reuse-observations observations_1360_angular-seed_42.pt
"""
import argparse
import os
import subprocess
import sys
from pathlib import Path
def main():
    """Parse CLI arguments and drive the REAP prune.py script on GLM-4.7.

    Validates the model path and the REAP repository layout, derives an
    output directory from the compression ratio when none is given,
    assembles the prune.py command line, and runs it with a multi-GPU
    environment. Exits with status 1 on any validation failure or a
    non-zero return code from REAP.
    """
    parser = argparse.ArgumentParser(description="REAP pruning for GLM-4.7")
    parser.add_argument("--model-path", type=str, required=True,
                        help="Path to GLM-4.7 model")
    # NOTE: '%' must be escaped as '%%' in argparse help strings, otherwise
    # rendering --help raises "unsupported format character".
    parser.add_argument("--compression-ratio", type=float, required=True,
                        help="Compression ratio (0.30 = keep 70%%, 0.50 = keep 50%%)")
    parser.add_argument("--output-dir", type=str, default=None,
                        help="Output directory (default: auto-generated)")
    parser.add_argument("--dataset", type=str,
                        default="0xSero/glm47-reap-calibration-v2",
                        help="Calibration dataset")
    parser.add_argument("--samples", type=int, default=1360,
                        help="Number of calibration samples")
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed")
    parser.add_argument("--distance", type=str, default="angular",
                        choices=["angular", "cosine", "euclidean"],
                        help="Distance measure for expert clustering")
    parser.add_argument("--reuse-observations", type=str, default=None,
                        help="Path to pre-computed observations file for instant pruning")
    parser.add_argument("--reap-repo", type=str, default="./reap",
                        help="Path to REAP repository")
    args = parser.parse_args()

    # Validate inputs before doing any work.
    if not Path(args.model_path).exists():
        print(f"ERROR: Model path not found: {args.model_path}")
        sys.exit(1)

    reap_script = Path(args.reap_repo) / "src" / "reap" / "prune.py"
    if not reap_script.exists():
        print(f"ERROR: REAP prune.py not found at: {reap_script}")
        print("Clone the REAP repo: git clone https://github.com/Cerebras/reap")
        sys.exit(1)

    # Build output directory name.
    if args.output_dir is None:
        # round(), not int(): int() truncates, so e.g. 0.35 * 100 ->
        # 34.999... -> 34 would mislabel the output directory.
        ratio_pct = round(args.compression_ratio * 100)
        args.output_dir = f"./GLM-4.7-REAP-{ratio_pct}"
    Path(args.output_dir).mkdir(parents=True, exist_ok=True)

    # Build the prune.py command line. Flag spellings (mixed '-'/'_') match
    # what the REAP script itself expects.
    cmd = [
        sys.executable, str(reap_script),
        "--model-name", args.model_path,
        "--dataset-name", args.dataset,
        "--compression-ratio", str(args.compression_ratio),
        "--prune-method", "reap",
        "--seed", str(args.seed),
        "--do-eval", "false",
        "--profile", "false",
        "--samples_per_category", str(args.samples),
        "--model_max_length", "2048",
        "--distance_measure", args.distance,
        "--record_pruning_metrics_only", "true",
        "--output_file_name", f"observations_{args.samples}_{args.distance}-seed_{args.seed}.pt",
    ]
    if args.reuse_observations:
        cmd.extend(["--load_observations", args.reuse_observations])
        print(f"Reusing observations from: {args.reuse_observations}")
        print("This enables instant pruning without re-running calibration!")

    print("=" * 60)
    print(f"REAP Pruning: GLM-4.7 @ {args.compression_ratio*100:.0f}% compression")
    print("=" * 60)
    print(f"Model: {args.model_path}")
    print(f"Output: {args.output_dir}")
    print(f"Dataset: {args.dataset} ({args.samples} samples)")
    print(f"Distance: {args.distance}")
    print("=" * 60)

    # Run REAP with all 8 GPUs visible and expandable CUDA allocator
    # segments (reduces fragmentation on long runs).
    env = {
        **os.environ,
        "CUDA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
    result = subprocess.run(cmd, env=env)

    if result.returncode == 0:
        print("\n" + "=" * 60)
        print("REAP pruning complete!")
        print(f"Pruned model saved to: {args.output_dir}")
        print("=" * 60)
    else:
        print(f"\nERROR: REAP failed with code {result.returncode}")
        sys.exit(1)
# Standard script entry-point guard: run main() only when executed directly.
if __name__ == "__main__":
    main()