"""
REAP (Router-weighted Expert Activation Pruning) for GLM-4.7 MoE

This script prunes MoE experts from GLM-4.7 using the REAP methodology from Cerebras.
Requires: https://github.com/Cerebras/reap (or a fork with GLM support)

Usage:
    python run_reap.py --model-path /path/to/GLM-4.7 --compression-ratio 0.50

For observation reuse (instant pruning at different ratios):
    python run_reap.py --model-path /path/to/GLM-4.7 --compression-ratio 0.35 \
        --reuse-observations observations_1360_angular-seed_42.pt
"""
|
|
import argparse
import os
import subprocess
import sys
from pathlib import Path
|
|
|
|
def _build_parser():
    """Create the command-line parser for the REAP pruning wrapper."""
    parser = argparse.ArgumentParser(description="REAP pruning for GLM-4.7")
    parser.add_argument("--model-path", type=str, required=True,
                        help="Path to GLM-4.7 model")
    # argparse %-formats help strings, so literal percent signs must be
    # doubled; a bare "%" makes `--help` fail with ValueError.
    parser.add_argument("--compression-ratio", type=float, required=True,
                        help="Compression ratio (0.30 = keep 70%%, 0.50 = keep 50%%)")
    parser.add_argument("--output-dir", type=str, default=None,
                        help="Output directory (default: auto-generated)")
    parser.add_argument("--dataset", type=str,
                        default="0xSero/glm47-reap-calibration-v2",
                        help="Calibration dataset")
    parser.add_argument("--samples", type=int, default=1360,
                        help="Number of calibration samples")
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed")
    parser.add_argument("--distance", type=str, default="angular",
                        choices=["angular", "cosine", "euclidean"],
                        help="Distance measure for expert clustering")
    parser.add_argument("--reuse-observations", type=str, default=None,
                        help="Path to pre-computed observations file for instant pruning")
    parser.add_argument("--reap-repo", type=str, default="./reap",
                        help="Path to REAP repository")
    return parser


def _build_command(args, reap_script):
    """Return the argv list that launches REAP's prune.py for *args*."""
    observations_name = (
        f"observations_{args.samples}_{args.distance}-seed_{args.seed}.pt"
    )
    cmd = [
        sys.executable, str(reap_script),
        "--model-name", args.model_path,
        "--dataset-name", args.dataset,
        "--compression-ratio", str(args.compression_ratio),
        "--prune-method", "reap",
        "--seed", str(args.seed),
        "--do-eval", "false",
        "--profile", "false",
        "--samples_per_category", str(args.samples),
        "--model_max_length", "2048",
        "--distance_measure", args.distance,
        "--record_pruning_metrics_only", "true",
        "--output_file_name", observations_name,
    ]
    if args.reuse_observations:
        cmd.extend(["--load_observations", args.reuse_observations])
    return cmd


def _build_env():
    """Return a copy of the current environment with GPU defaults filled in."""
    env = dict(os.environ)
    # Only supply defaults: a GPU selection or allocator configuration that
    # the caller exported explicitly must not be overwritten.
    env.setdefault("CUDA_VISIBLE_DEVICES", "0,1,2,3,4,5,6,7")
    env.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
    return env


def main():
    """Parse CLI arguments and run REAP's prune.py as a subprocess.

    Verifies that the model path and ``<reap-repo>/src/reap/prune.py`` exist,
    creates the output directory (auto-named from the compression ratio when
    ``--output-dir`` is not given), prints a summary banner and launches
    prune.py with the current Python interpreter.

    Exits with status 1 when the model path or prune.py is missing, or when
    the subprocess returns a non-zero exit code.
    """
    args = _build_parser().parse_args()

    if not Path(args.model_path).exists():
        print(f"ERROR: Model path not found: {args.model_path}", file=sys.stderr)
        sys.exit(1)

    reap_script = Path(args.reap_repo) / "src" / "reap" / "prune.py"
    if not reap_script.exists():
        print(f"ERROR: REAP prune.py not found at: {reap_script}", file=sys.stderr)
        print("Clone the REAP repo: git clone https://github.com/Cerebras/reap",
              file=sys.stderr)
        sys.exit(1)

    if args.output_dir is None:
        # round(), not int(): 0.29 * 100 evaluates to 28.999999999999996 and
        # truncation would name the directory "-28" while the banner shows 29.
        ratio_pct = round(args.compression_ratio * 100)
        args.output_dir = f"./GLM-4.7-REAP-{ratio_pct}"

    # NOTE(review): output_dir is created and reported here but is not passed
    # to prune.py in the command below -- confirm where prune.py actually
    # writes the pruned model.
    Path(args.output_dir).mkdir(parents=True, exist_ok=True)

    cmd = _build_command(args, reap_script)

    if args.reuse_observations:
        print(f"Reusing observations from: {args.reuse_observations}")
        print("This enables instant pruning without re-running calibration!")

    separator = "=" * 60
    print(separator)
    print(f"REAP Pruning: GLM-4.7 @ {args.compression_ratio*100:.0f}% compression")
    print(separator)
    print(f"Model: {args.model_path}")
    print(f"Output: {args.output_dir}")
    print(f"Dataset: {args.dataset} ({args.samples} samples)")
    print(f"Distance: {args.distance}")
    print(separator)

    result = subprocess.run(cmd, env=_build_env())

    if result.returncode != 0:
        print(f"\nERROR: REAP failed with code {result.returncode}", file=sys.stderr)
        sys.exit(1)

    print("\n" + separator)
    print("REAP pruning complete!")
    print(f"Pruned model saved to: {args.output_dir}")
    print(separator)
|
|
|
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|